Newer
Older
AMI-Aptio-BIOS-Reversed / tools / recover_original_layout.py
@Ajax Dong Ajax Dong 2 days ago 11 KB Init
#!/usr/bin/env python3
"""Recover original source-layout hints from decompiled BIOS artifacts.

The BIOS tree contains many debug-build paths in ASSERT strings, PDB records,
README files, and decompiler sidecars. This tool normalizes those paths and
builds an evidence map from the current flat extraction back toward the
original EDK2/AMI package layout.
"""

from __future__ import annotations

import argparse
from collections import Counter, defaultdict
from dataclasses import dataclass, field
import json
from pathlib import Path
import re
from typing import Iterable


# File suffixes (compared lower-cased) that iter_scan_files is willing to read.
SCAN_SUFFIXES = {".c", ".h", ".md", ".json", ".txt"}
# A file is skipped when any component of its root-relative path is one of
# these directory names.
SKIP_DIRS = {".git", ".codex", ".agents", "__pycache__"}
# Root-relative files that the scanner always skips.
# NOTE(review): presumably the usual destinations of --json-out/--markdown-out,
# excluded so a previous report is not re-ingested as evidence — confirm.
GENERATED_OUTPUTS = {
    Path("docs/original_layout_evidence.json"),
    Path("docs/original_layout_recovery.md"),
}

# Absolute drive-letter path ("d:\hs\..." or "d:/hs/..."), running up to the
# next quote, backtick, angle bracket or line break.
#
# A drive letter is accepted when it is not glued to a preceding letter, or
# when the separator after the colon is not doubled. Only the combination
# "letter before AND doubled separator after" is rejected, which is exactly
# the shape of a URL scheme tail ("https://host" would otherwise match as the
# drive path "s://host"). Junk-prefixed paths such as "RSDSd:\hs\x.pdb" and
# doubly-escaped paths such as "d://hs//x" are still matched.
DRIVE_PATH_RE = re.compile(
    r"(?:(?<![A-Za-z])[A-Za-z]:[\\/]|[A-Za-z]:[\\/](?![\\/]))[^\"'`<>\r\n]+"
)
# Labelled fields ("Source:", "Build path:", "Build:", "PDB:"); the capture is
# everything after the label up to the next pipe or line break.
SOURCE_FIELD_RE = re.compile(r"\b(?:Source|Build path|Build|PDB):\s*([^|\n\r]+)", re.IGNORECASE)
# "Build/<target>/<profile>/<arch>/<rest>" anywhere in a forward-slash path.
BUILD_SEGMENT_RE = re.compile(
    r"(?:^|/)Build/(?P<target>[^/]+)/(?P<profile>[^/]+)/(?P<arch>[^/]+)/(?P<rest>.+)",
    re.IGNORECASE,
)
# "<module_path>/DEBUG/<leaf>", where <leaf> is a single path component.
DEBUG_TAIL_RE = re.compile(r"^(?P<module_path>.+)/DEBUG/(?P<leaf>[^/]+)$", re.IGNORECASE)


@dataclass
class Evidence:
    """One recovered path occurrence together with what was inferred from it."""

    # Module bucket the scanned file was assigned to (see module_name_for).
    module: str
    # Root-relative POSIX path of the file the path was found in.
    file: str
    # How the path was extracted: "absolute-path" or "source-field".
    kind: str
    # Classification returned by infer_package_path: "module-build",
    # "build-reference", "source-reference" or "unknown".
    category: str
    # The recovered path, normalized to forward slashes.
    path: str
    # Package-relative path inferred from `path`, or None when nothing matched.
    inferred_package_path: str | None


@dataclass
class ModuleEvidence:
    """Evidence records and occurrence counters accumulated for one module."""

    # Module bucket name (first component of the scanned file's relative path).
    module: str
    # Occurrences of every recovered path.
    paths: Counter[str] = field(default_factory=Counter)
    # Occurrences of every inferred package path, regardless of category.
    package_paths: Counter[str] = field(default_factory=Counter)
    # Inferred package paths whose category is "module-build".
    build_paths: Counter[str] = field(default_factory=Counter)
    # Inferred package paths of every other category.
    source_paths: Counter[str] = field(default_factory=Counter)
    # Individual records, in the order they were found.
    evidence: list[Evidence] = field(default_factory=list)


def iter_scan_files(root: Path) -> Iterable[Path]:
    """Yield every scannable file below *root*, in sorted path order.

    A file is yielded when its lower-cased suffix is in ``SCAN_SUFFIXES``, its
    root-relative path is not one of ``GENERATED_OUTPUTS``, and none of its
    relative path components is listed in ``SKIP_DIRS``.
    """
    for candidate in sorted(root.rglob("*")):
        if not candidate.is_file():
            continue
        if candidate.suffix.lower() not in SCAN_SUFFIXES:
            continue

        relative = candidate.relative_to(root)
        if relative in GENERATED_OUTPUTS:
            continue
        # Any overlap between the path components and the skip list excludes it.
        if not SKIP_DIRS.isdisjoint(relative.parts):
            continue

        yield candidate


def clean_candidate_path(raw: str) -> str:
    """Tidy a raw path candidate lifted from source text.

    Steps, in order:

    1. strip surrounding whitespace, then surrounding quotes/backticks;
    2. turn escaped (``\\\\``) and plain (``\\``) backslashes into ``/``;
    3. drop ``" "`` joins left by adjacent C string literals;
    4. collapse every whitespace run to a single space;
    5. strip trailing noise: punctuation, quotes, backticks and spaces.

    The trailing strip treats all noise characters as one set, so mixed tails
    such as a closing backtick followed by a period, or ``");``, are removed
    completely instead of leaving a stray quote or space behind.

    Args:
        raw: Candidate text as matched in a scanned file.

    Returns:
        The cleaned candidate using forward slashes; may be empty.
    """
    value = raw.strip().strip("`'\"")
    value = value.replace("\\\\", "/").replace("\\", "/")
    value = re.sub(r'"\s*"', "", value)
    value = re.sub(r"\s+", " ", value)
    # One combined character set: punctuation, quotes and spaces can appear in
    # any order at the end of a sentence or macro call.
    return value.rstrip(" `'\").,;:|")


def normalize_path(raw: str) -> str | None:
    """Return *raw* as a drive-anchored forward-slash path, or None.

    The candidate is cleaned with ``clean_candidate_path``, anything before the
    first ``<letter>:/`` is discarded, and every run of consecutive slashes is
    collapsed to one. Collapsing whole runs (rather than a single ``"//"``
    replacement pass) keeps paths that went through several escaping layers
    from retaining a leftover ``//``.

    Args:
        raw: Candidate text as matched in a scanned file.

    Returns:
        The normalized path, or None when no drive prefix is present.
    """
    value = clean_candidate_path(raw)
    drive = re.search(r"[A-Za-z]:/", value)
    if drive is None:
        return None
    return re.sub(r"/{2,}", "/", value[drive.start():])


def infer_package_path(path: str) -> tuple[str | None, str]:
    """Infer a package-relative path and an evidence category from *path*.

    Three shapes are tried in order, after converting backslashes to slashes:

    * ``Build/<target>/<profile>/<arch>/<rest>``
    * ``DEBUG_<x>/<arch>/<rest>`` or ``RELEASE_<x>/<arch>/<rest>``
    * ``<drive>:/hs/<rest>``

    For the first two, a ``<rest>`` of the form ``<dir>/DEBUG/AutoGen.c`` or
    ``<dir>/DEBUG/<name>.pdb`` yields ``(<dir>, "module-build")``; otherwise
    ``(<rest>, "build-reference")``. The third yields
    ``(<rest>, "source-reference")``. Anything else gives ``(None, "unknown")``.

    NOTE(review): only the profile shape also treats a ``<rest>`` ending in a
    bare, case-sensitive ``/DEBUG`` as a module build; the ``Build/`` shape
    does not. Confirm whether that asymmetry is intended.
    """
    slashed = path.replace("\\", "/")

    def module_dir(tail: str) -> str | None:
        # Directory owning a DEBUG/AutoGen.c or DEBUG/*.pdb leaf, else None.
        hit = DEBUG_TAIL_RE.match(tail)
        if hit is None:
            return None
        leaf = hit.group("leaf").lower()
        if leaf == "autogen.c" or leaf.endswith(".pdb"):
            return hit.group("module_path")
        return None

    build_hit = BUILD_SEGMENT_RE.search(slashed)
    if build_hit is not None:
        remainder = build_hit.group("rest")
        owner = module_dir(remainder)
        if owner is not None:
            return owner, "module-build"
        return remainder, "build-reference"

    profile_hit = re.search(
        r"(?:^|/)(?:DEBUG|RELEASE)_[^/]+/(?P<arch>[^/]+)/(?P<rest>.+)",
        slashed,
        re.IGNORECASE,
    )
    if profile_hit is not None:
        remainder = profile_hit.group("rest")
        owner = module_dir(remainder)
        if owner is not None:
            return owner, "module-build"
        if remainder.endswith("/DEBUG"):
            return remainder.removesuffix("/DEBUG"), "module-build"
        return remainder, "build-reference"

    hs_hit = re.search(r"[A-Za-z]:/hs/(?P<rest>.+)", slashed, re.IGNORECASE)
    if hs_hit is None:
        return None, "unknown"
    return hs_hit.group("rest"), "source-reference"


def module_name_for(root: Path, path: Path) -> str:
    """Return the module bucket for *path*: its first component below *root*.

    Paths with at most one component relative to *root* (files sitting
    directly in the root, or the root itself) are bucketed under ``"."``.
    """
    components = path.relative_to(root).parts
    if len(components) <= 1:
        return "."
    return components[0]


def extract_paths_from_text(text: str) -> Iterable[tuple[str, str]]:
    """Yield ``(kind, path)`` pairs for every path-like string in *text*.

    Adjacent string-literal joins (``" "``) are removed first. Two passes
    follow:

    * ``"absolute-path"``: ``DRIVE_PATH_RE`` matches, searched after escaped
      backslashes have been turned into ``/``, kept only when
      ``normalize_path`` accepts them.
    * ``"source-field"``: ``SOURCE_FIELD_RE`` captures that contain a path
      separator, with surrounding spaces, dashes and asterisks trimmed.
    """
    joined = re.sub(r'"\s*"', "", text)

    for hit in DRIVE_PATH_RE.finditer(joined.replace("\\\\", "/")):
        resolved = normalize_path(hit.group(0))
        if resolved:
            yield "absolute-path", resolved

    for hit in SOURCE_FIELD_RE.finditer(joined):
        candidate = clean_candidate_path(hit.group(1))
        if "/" not in candidate and "\\" not in candidate:
            # No separator at all: a plain word, not a path.
            continue
        yield "source-field", candidate.replace("\\", "/").strip(" -*")


def scan(root: Path) -> dict[str, ModuleEvidence]:
    """Scan the tree below *root* and collect path evidence per module.

    Each file from ``iter_scan_files`` is read as UTF-8 with undecodable bytes
    replaced; files that raise ``OSError`` on read are skipped. Every extracted
    path becomes an ``Evidence`` record in the bucket named by
    ``module_name_for``, and that bucket's counters are updated.

    Returns:
        Mapping of module name to its ``ModuleEvidence``. A module appears as
        soon as one of its files is read, even if no path was found in it.
    """
    by_module: dict[str, ModuleEvidence] = {}

    for source_file in iter_scan_files(root):
        try:
            contents = source_file.read_text(encoding="utf-8", errors="replace")
        except OSError:
            continue

        name = module_name_for(root, source_file)
        if name not in by_module:
            by_module[name] = ModuleEvidence(module=name)
        bucket = by_module[name]
        relative_name = source_file.relative_to(root).as_posix()

        for kind, found in extract_paths_from_text(contents):
            inferred, category = infer_package_path(found)
            bucket.evidence.append(
                Evidence(
                    module=name,
                    file=relative_name,
                    kind=kind,
                    category=category,
                    path=found,
                    inferred_package_path=inferred,
                )
            )
            bucket.paths[found] += 1

            if not inferred:
                continue
            bucket.package_paths[inferred] += 1
            # Module-build hits are tallied apart from all other references.
            tally = bucket.build_paths if category == "module-build" else bucket.source_paths
            tally[inferred] += 1

    return by_module


def write_json(root: Path, modules: dict[str, ModuleEvidence], output: Path) -> None:
    """Write the detailed evidence for all modules to *output* as JSON.

    Modules without evidence records are omitted; the rest are ordered by
    case-insensitive module name. Parent directories of *output* are created
    when missing. The document is indented by two spaces with sorted keys.
    """
    entries = []
    for record in sorted(modules.values(), key=lambda item: item.module.lower()):
        if not record.evidence:
            continue
        entries.append(
            {
                "module": record.module,
                "top_paths": record.paths.most_common(),
                "top_inferred_package_paths": record.package_paths.most_common(),
                "top_module_build_paths": record.build_paths.most_common(),
                "top_source_reference_paths": record.source_paths.most_common(),
                "evidence": [vars(item) for item in record.evidence],
            }
        )

    document = {"root": str(root), "modules": entries}
    serialized = json.dumps(document, indent=2, sort_keys=True)

    output.parent.mkdir(parents=True, exist_ok=True)
    output.write_text(serialized, encoding="utf-8")


def write_markdown(root: Path, modules: dict[str, ModuleEvidence], output: Path, limit: int) -> None:
    """Write a Markdown summary of the recovered layout evidence to *output*.

    The report has a summary, two frequency tables capped at *limit* rows
    (module build paths, then the remaining source-reference counts), a
    per-module map, and usage notes. Only modules with at least one evidence
    record contribute. Parent directories of *output* are created when missing.
    """
    populated = [record for record in modules.values() if record.evidence]

    package_counter: Counter[str] = Counter()
    build_counter: Counter[str] = Counter()
    for record in populated:
        package_counter.update(record.package_paths)
        build_counter.update(record.build_paths)

    module_count = len(populated)
    evidence_count = sum(len(record.evidence) for record in populated)

    def table_rows(counter: Counter[str]) -> list[str]:
        # One Markdown row per path, most frequent first, capped at `limit`.
        return [
            f"| {count} | `{package_path}` |"
            for package_path, count in counter.most_common(limit)
        ]

    lines: list[str] = [
        "# Original Source Layout Recovery",
        "",
        "Generated from debug/assert/PDB/source-path evidence in the current tree.",
        "Paths are evidence, not proof that every decompiled type or function name is original.",
        "",
        "## Summary",
        "",
        f"- Root: `{root}`",
        f"- Modules with path evidence: {module_count}",
        f"- Path evidence records: {evidence_count}",
        f"- Unique inferred package paths: {len(package_counter)}",
        f"- Unique module build paths: {len(build_counter)}",
        "",
        "## Most Common Module Build Paths",
        "",
        "| Count | Package path |",
        "|---:|---|",
    ]
    lines += table_rows(build_counter)

    lines += [
        "",
        "## Most Common Source Reference Paths",
        "",
        "| Count | Package path |",
        "|---:|---|",
    ]
    # Counter subtraction keeps only what is left after removing build hits.
    lines += table_rows(package_counter - build_counter)

    lines += [
        "",
        "## Module Map",
        "",
        "| Module | Best module build path | Evidence records |",
        "|---|---|---:|",
    ]
    for record in sorted(populated, key=lambda item: item.module.lower()):
        if record.build_paths:
            best_path = record.build_paths.most_common(1)[0][0]
        else:
            best_path = ""
        lines.append(f"| `{record.module}` | `{best_path}` | {len(record.evidence)} |")

    lines += [
        "",
        "## Usage Notes",
        "",
        "- Prefer exact `Build/<target>/<profile>/<arch>/<package>/.../DEBUG/AutoGen.c`",
        "  evidence when restoring module paths.",
        "- PDB paths identify the built module directory; source-file paths can identify",
        "  linked libraries used inside that module.",
        "- Do not move files solely from this report. Use it to prioritize module-level",
        "  cleanup, README corrections, and later controlled layout reconstruction.",
        "",
    ]

    output.parent.mkdir(parents=True, exist_ok=True)
    output.write_text("\n".join(lines), encoding="utf-8")


def print_summary(modules: dict[str, ModuleEvidence], limit: int) -> None:
    """Print evidence totals and the *limit* most common module build paths.

    Only modules with at least one evidence record are counted.
    """
    populated = [record for record in modules.values() if record.evidence]

    package_counter: Counter[str] = Counter()
    build_counter: Counter[str] = Counter()
    for record in populated:
        package_counter.update(record.package_paths)
        build_counter.update(record.build_paths)

    evidence_records = sum(len(record.evidence) for record in populated)

    report = [
        f"Modules with path evidence: {len(populated)}",
        f"Path evidence records: {evidence_records}",
        f"Unique inferred package paths: {len(package_counter)}",
        f"Unique module build paths: {len(build_counter)}",
        "",
        "Top module build paths",
        "----------------------",
    ]
    report.extend(
        f"{count:5d}  {package_path}"
        for package_path, count in build_counter.most_common(limit)
    )
    for row in report:
        print(row)


def main() -> int:
    """Parse the command line, scan the tree, and emit the requested reports.

    The console summary is always printed. The JSON and Markdown reports are
    written only when their ``--json-out`` / ``--markdown-out`` options are
    given.

    Returns:
        Process exit status; always 0.
    """
    cli = argparse.ArgumentParser(
        description="Recover original AMI/EDK2 package-layout evidence from decompiled BIOS files."
    )
    cli.add_argument("root", nargs="?", default=".", help="tree root to scan")
    cli.add_argument("--limit", type=int, default=30, help="rows to print in summaries")
    cli.add_argument("--json-out", type=Path, help="write detailed JSON evidence")
    cli.add_argument("--markdown-out", type=Path, help="write Markdown summary")
    options = cli.parse_args()

    tree_root = Path(options.root).resolve()
    collected = scan(tree_root)
    print_summary(collected, options.limit)

    json_target = options.json_out
    if json_target:
        write_json(tree_root, collected, json_target)
        print(f"\nWrote JSON evidence: {json_target}")

    markdown_target = options.markdown_out
    if markdown_target:
        write_markdown(tree_root, collected, markdown_target, options.limit)
        print(f"Wrote Markdown summary: {markdown_target}")

    return 0


# Run the CLI only when executed as a script; main()'s return value becomes
# the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())