#!/usr/bin/env python3
"""Recover original source-layout hints from decompiled BIOS artifacts.
The BIOS tree contains many debug-build paths in ASSERT strings, PDB records,
README files, and decompiler sidecars. This tool normalizes those paths and
builds an evidence map from the current flat extraction back toward the
original EDK2/AMI package layout.
"""
from __future__ import annotations
import argparse
from collections import Counter, defaultdict
from dataclasses import dataclass, field
import json
from pathlib import Path
import re
from typing import Iterable
# File suffixes (compared lower-cased) that iter_scan_files is willing to yield.
SCAN_SUFFIXES = {".c", ".h", ".md", ".json", ".txt"}
# A file is skipped when any component of its root-relative path equals one of
# these names.
SKIP_DIRS = {".git", ".codex", ".agents", "__pycache__"}
# Root-relative files that iter_scan_files never yields. Judging by the names
# these are this tool's own reports; NOTE(review): confirm they match the
# --json-out / --markdown-out locations actually used.
GENERATED_OUTPUTS = {
Path("docs/original_layout_evidence.json"),
Path("docs/original_layout_recovery.md"),
}
# A drive letter, a colon and one slash or backslash, followed by everything up
# to the next quote, backtick, angle bracket or line break.
DRIVE_PATH_RE = re.compile(r"[A-Za-z]:[\\/][^\"'`<>\r\n]+")
# Captures the value after a "Source:", "Build path:", "Build:" or "PDB:" label
# (case-insensitive), stopping at the next "|" or line break.
SOURCE_FIELD_RE = re.compile(r"\b(?:Source|Build path|Build|PDB):\s*([^|\n\r]+)", re.IGNORECASE)
# Splits a forward-slash path at "Build/<target>/<profile>/<arch>/"; the "rest"
# group is everything after the <arch> component.
BUILD_SEGMENT_RE = re.compile(
r"(?:^|/)Build/(?P<target>[^/]+)/(?P<profile>[^/]+)/(?P<arch>[^/]+)/(?P<rest>.+)",
re.IGNORECASE,
)
# Matches "<module_path>/DEBUG/<leaf>" where <leaf> is a single path component.
DEBUG_TAIL_RE = re.compile(r"^(?P<module_path>.+)/DEBUG/(?P<leaf>[^/]+)$", re.IGNORECASE)
@dataclass
class Evidence:
    """A single recovered path occurrence and what was inferred from it."""

    # Module bucket the scanned file was assigned to (see module_name_for).
    module: str
    # POSIX-style path of the scanned file, relative to the scan root.
    file: str
    # How the path was found: "absolute-path" or "source-field".
    kind: str
    # Classification returned by infer_package_path: "module-build",
    # "build-reference", "source-reference" or "unknown".
    category: str
    # The recovered path text itself.
    path: str
    # Package-relative path inferred from `path`, or None when nothing matched.
    inferred_package_path: str | None
@dataclass
class ModuleEvidence:
    """Per-module accumulator for recovered paths and their tallies.

    Every container is created per instance through ``default_factory``, so
    records never share counters or evidence lists.
    """

    # Module bucket name this record aggregates.
    module: str
    # Occurrence count of every recovered path.
    paths: Counter[str] = field(default_factory=Counter)
    # Occurrence count of every inferred package path, whatever its category.
    package_paths: Counter[str] = field(default_factory=Counter)
    # Inferred package paths whose category was "module-build".
    build_paths: Counter[str] = field(default_factory=Counter)
    # Inferred package paths of every other category.
    source_paths: Counter[str] = field(default_factory=Counter)
    # Individual evidence records in discovery order.
    evidence: list[Evidence] = field(default_factory=list)
def iter_scan_files(root: Path) -> Iterable[Path]:
    """Yield every scannable file below *root*, in sorted path order.

    A file is yielded when it is a regular file whose lower-cased suffix is in
    ``SCAN_SUFFIXES``, its root-relative path is not listed in
    ``GENERATED_OUTPUTS``, and none of its root-relative path components
    appears in ``SKIP_DIRS``.
    """
    for candidate in sorted(root.rglob("*")):
        if not candidate.is_file():
            continue
        if candidate.suffix.lower() not in SCAN_SUFFIXES:
            continue
        relative = candidate.relative_to(root)
        if relative in GENERATED_OUTPUTS:
            continue
        # Any path component matching a skip name excludes the file.
        if not SKIP_DIRS.isdisjoint(relative.parts):
            continue
        yield candidate
def clean_candidate_path(raw: str) -> str:
    """Return *raw* tidied into a forward-slash path candidate.

    Steps, in order:

    1. Strip surrounding whitespace, then surrounding quotes/backticks.
    2. Turn escaped (``\\\\``) and plain (``\\``) backslashes into ``/``.
    3. Remove ``" "`` seams left by adjacent string literals.
    4. Collapse every whitespace run into one space.
    5. Strip trailing punctuation, closing quotes and spaces together, then
       any leftover leading whitespace.

    Step 5 strips quotes and punctuation as one set because they can
    interleave at the end of a candidate (``"path".`` or ``"path",``).
    Stripping them in separate passes left a dangling quote or space, which
    produced distinct keys for what is really the same path.
    """
    value = raw.strip().strip("`'\"")
    value = value.replace("\\\\", "/").replace("\\", "/")
    value = re.sub(r'"\s*"', "", value)
    value = re.sub(r"\s+", " ", value)
    return value.rstrip(").,;:|`'\" ").lstrip()
def normalize_path(raw: str) -> str | None:
    """Return *raw* as a cleaned ``<drive>:/...`` path, or None without a drive.

    The candidate is first passed through ``clean_candidate_path``. Anything
    before the first ``<letter>:/`` prefix is discarded. When no such prefix
    exists the candidate is not an absolute drive path and None is returned.
    """
    value = clean_candidate_path(raw)
    drive = re.search(r"[A-Za-z]:/", value)
    if drive is None:
        return None
    # Collapse slash runs of any length: multiply-escaped backslashes can
    # leave three or more slashes, which a single "//" replacement would only
    # shorten, not remove.
    return re.sub(r"/{2,}", "/", value[drive.start():])
# "<DEBUG|RELEASE>_<toolchain>/<arch>/<rest>" segment of a build tree whose
# leading "Build/<target>/" part is absent.
_PROFILE_SEGMENT_RE = re.compile(
    r"(?:^|/)(?:DEBUG|RELEASE)_[^/]+/(?P<arch>[^/]+)/(?P<rest>.+)",
    re.IGNORECASE,
)
# Everything below a "<drive>:/hs/" prefix.
# NOTE(review): "hs" looks like the original workspace directory - confirm.
_SOURCE_ROOT_RE = re.compile(r"[A-Za-z]:/hs/(?P<rest>.+)", re.IGNORECASE)


def _classify_build_rest(rest: str) -> tuple[str, str]:
    """Classify the part of a build-tree path that follows ``<arch>/``."""
    debug_tail = DEBUG_TAIL_RE.match(rest)
    if debug_tail:
        leaf = debug_tail.group("leaf").lower()
        # AutoGen.c and the PDB sit directly in the module's DEBUG directory,
        # so the directory above DEBUG is the module path.
        if leaf == "autogen.c" or leaf.endswith(".pdb"):
            return debug_tail.group("module_path"), "module-build"
    # A path that stops at the DEBUG directory itself names the module too.
    # Compared case-insensitively, matching DEBUG_TAIL_RE.
    if rest.lower().endswith("/debug"):
        return rest[: -len("/DEBUG")], "module-build"
    return rest, "build-reference"


def infer_package_path(path: str) -> tuple[str | None, str]:
    """Infer a package-relative path and a category from a recovered path.

    Returns ``(package_path, category)``. The rules are tried in order:

    * a ``Build/<target>/<profile>/<arch>/`` segment, or failing that a bare
      ``<DEBUG|RELEASE>_<toolchain>/<arch>/`` segment: the remainder is
      classified as ``"module-build"`` (``.../DEBUG/AutoGen.c``,
      ``.../DEBUG/*.pdb`` or a path ending in ``/DEBUG``; the module directory
      is returned) or ``"build-reference"`` (remainder returned unchanged);
    * a ``<drive>:/hs/`` prefix: the remainder is a ``"source-reference"``;
    * otherwise ``(None, "unknown")``.

    Both build-tree forms share one classifier, so a path ending in ``/DEBUG``
    is treated the same whether or not its ``Build/`` prefix survived.
    """
    normalized = path.replace("\\", "/")
    for pattern in (BUILD_SEGMENT_RE, _PROFILE_SEGMENT_RE):
        segment = pattern.search(normalized)
        if segment:
            return _classify_build_rest(segment.group("rest"))
    marker = _SOURCE_ROOT_RE.search(normalized)
    if marker:
        return marker.group("rest"), "source-reference"
    return None, "unknown"
def module_name_for(root: Path, path: Path) -> str:
    """Return the module bucket for *path*: its first component below *root*.

    Files that sit directly in *root* (no parent directory below it) are
    bucketed under ``"."``.
    """
    parts = path.relative_to(root).parts
    if len(parts) <= 1:
        return "."
    return parts[0]
def extract_paths_from_text(text: str) -> Iterable[tuple[str, str]]:
    """Yield ``(kind, path)`` pairs for every path candidate found in *text*.

    Two passes are made over the text after removing ``" "`` seams between
    adjacent string literals:

    * ``"absolute-path"``: ``DRIVE_PATH_RE`` matches (with escaped backslashes
      already turned into ``/``) that ``normalize_path`` accepts;
    * ``"source-field"``: values of ``Source:`` / ``Build:`` / ``PDB:`` style
      labels that contain a path separator.

    URLs are skipped in both passes. Without that, ``https://host/...`` is
    picked up as the drive path ``s:/host/...``, and a ``Source: https://...``
    field is reported as a source path.
    """
    collapsed = re.sub(r'"\s*"', "", text)
    escaped_normalized = collapsed.replace("\\\\", "/")
    for match in DRIVE_PATH_RE.finditer(escaped_normalized):
        start = match.start()
        # A "drive letter" glued to a preceding letter or digit is the tail of
        # a longer token such as a URL scheme, not a drive specifier.
        if start and escaped_normalized[start - 1].isalnum():
            continue
        path = normalize_path(match.group(0))
        if path:
            yield "absolute-path", path
    for match in SOURCE_FIELD_RE.finditer(collapsed):
        candidate = clean_candidate_path(match.group(1))
        # Schemes need at least two characters, so "e://..." is still treated
        # as a drive path rather than a URL.
        if re.match(r"[A-Za-z][A-Za-z0-9+.-]+://", candidate):
            continue
        if "/" in candidate or "\\" in candidate:
            yield "source-field", candidate.replace("\\", "/").strip(" -*")
def scan(root: Path) -> dict[str, ModuleEvidence]:
    """Scan the tree below *root* and collect path evidence per module.

    Every file produced by ``iter_scan_files`` gets a ``ModuleEvidence`` entry
    for its module (even when the file contributes no evidence). Files that
    cannot be read are skipped. Returns the mapping of module name to record.
    """
    modules: dict[str, ModuleEvidence] = {}
    for source_file in iter_scan_files(root):
        try:
            contents = source_file.read_text(encoding="utf-8", errors="replace")
        except OSError:
            # Best effort: an unreadable file contributes nothing.
            continue
        module_name = module_name_for(root, source_file)
        if module_name not in modules:
            modules[module_name] = ModuleEvidence(module=module_name)
        bucket = modules[module_name]
        relative_name = source_file.relative_to(root).as_posix()
        for kind, recovered in extract_paths_from_text(contents):
            inferred, category = infer_package_path(recovered)
            bucket.evidence.append(
                Evidence(
                    module=module_name,
                    file=relative_name,
                    kind=kind,
                    category=category,
                    path=recovered,
                    inferred_package_path=inferred,
                )
            )
            bucket.paths[recovered] += 1
            if not inferred:
                continue
            bucket.package_paths[inferred] += 1
            # Module-build hits are tallied apart from every other category.
            tally = bucket.build_paths if category == "module-build" else bucket.source_paths
            tally[inferred] += 1
    return modules
def write_json(root: Path, modules: dict[str, ModuleEvidence], output: Path) -> None:
    """Write the detailed evidence for all modules to *output* as JSON.

    Only modules with at least one evidence record are included, ordered by
    case-insensitive module name. Parent directories of *output* are created
    as needed. The document is UTF-8, indented by two spaces, with sorted keys.
    """

    def describe(record: ModuleEvidence) -> dict:
        # Counters are emitted as [value, count] pairs, most common first.
        return {
            "module": record.module,
            "top_paths": record.paths.most_common(),
            "top_inferred_package_paths": record.package_paths.most_common(),
            "top_module_build_paths": record.build_paths.most_common(),
            "top_source_reference_paths": record.source_paths.most_common(),
            "evidence": [vars(item) for item in record.evidence],
        }

    ordered = sorted(modules.values(), key=lambda item: item.module.lower())
    payload = {
        "root": str(root),
        "modules": [describe(record) for record in ordered if record.evidence],
    }
    output.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(payload, indent=2, sort_keys=True)
    output.write_text(serialized, encoding="utf-8")
def write_markdown(root: Path, modules: dict[str, ModuleEvidence], output: Path, limit: int) -> None:
    """Write a Markdown summary of the recovered layout evidence to *output*.

    The report contains overall totals, the *limit* most common module build
    paths, the *limit* most common remaining (non module-build) package paths,
    a per-module table, and fixed usage notes. Modules without evidence are
    ignored. Parent directories of *output* are created as needed.
    """
    populated = [record for record in modules.values() if record.evidence]
    package_counter: Counter[str] = Counter()
    build_counter: Counter[str] = Counter()
    for record in populated:
        package_counter.update(record.package_paths)
        build_counter.update(record.build_paths)
    module_count = len(populated)
    evidence_count = sum(len(record.evidence) for record in populated)

    def table_rows(counter: Counter[str]) -> list[str]:
        # One "| count | `path` |" row per entry, most common first.
        return [
            f"| {count} | `{package_path}` |"
            for package_path, count in counter.most_common(limit)
        ]

    lines: list[str] = [
        "# Original Source Layout Recovery",
        "",
        "Generated from debug/assert/PDB/source-path evidence in the current tree.",
        "Paths are evidence, not proof that every decompiled type or function name is original.",
        "",
        "## Summary",
        "",
        f"- Root: `{root}`",
        f"- Modules with path evidence: {module_count}",
        f"- Path evidence records: {evidence_count}",
        f"- Unique inferred package paths: {len(package_counter)}",
        f"- Unique module build paths: {len(build_counter)}",
        "",
        "## Most Common Module Build Paths",
        "",
        "| Count | Package path |",
        "|---:|---|",
    ]
    lines += table_rows(build_counter)

    lines += [
        "",
        "## Most Common Source Reference Paths",
        "",
        "| Count | Package path |",
        "|---:|---|",
    ]
    # Counter subtraction keeps only the positive remainder, i.e. the
    # occurrences that were not classified as module builds.
    lines += table_rows(package_counter - build_counter)

    lines += [
        "",
        "## Module Map",
        "",
        "| Module | Best module build path | Evidence records |",
        "|---|---|---:|",
    ]
    for record in sorted(populated, key=lambda item: item.module.lower()):
        if record.build_paths:
            best_path = record.build_paths.most_common(1)[0][0]
        else:
            best_path = ""
        lines.append(f"| `{record.module}` | `{best_path}` | {len(record.evidence)} |")

    lines += [
        "",
        "## Usage Notes",
        "",
        "- Prefer exact `Build/<target>/<profile>/<arch>/<package>/.../DEBUG/AutoGen.c`",
        " evidence when restoring module paths.",
        "- PDB paths identify the built module directory; source-file paths can identify",
        " linked libraries used inside that module.",
        "- Do not move files solely from this report. Use it to prioritize module-level",
        " cleanup, README corrections, and later controlled layout reconstruction.",
        "",
    ]
    output.parent.mkdir(parents=True, exist_ok=True)
    output.write_text("\n".join(lines), encoding="utf-8")
def print_summary(modules: dict[str, ModuleEvidence], limit: int) -> None:
    """Print overall totals and the *limit* most common module build paths.

    Only modules that hold at least one evidence record contribute to the
    totals and to the ranking.
    """
    populated = [record for record in modules.values() if record.evidence]
    package_counter: Counter[str] = Counter()
    build_counter: Counter[str] = Counter()
    for record in populated:
        package_counter.update(record.package_paths)
        build_counter.update(record.build_paths)
    evidence_records = sum(len(record.evidence) for record in populated)

    report = [
        f"Modules with path evidence: {len(populated)}",
        f"Path evidence records: {evidence_records}",
        f"Unique inferred package paths: {len(package_counter)}",
        f"Unique module build paths: {len(build_counter)}",
        "",
        "Top module build paths",
        "----------------------",
    ]
    report.extend(
        f"{count:5d} {package_path}"
        for package_path, count in build_counter.most_common(limit)
    )
    for line in report:
        print(line)
def main() -> int:
    """Parse the command line, scan the tree and emit the requested reports.

    The console summary is always printed. The JSON and Markdown reports are
    written only when their ``--json-out`` / ``--markdown-out`` options are
    given. Returns 0.
    """
    parser = argparse.ArgumentParser(
        description="Recover original AMI/EDK2 package-layout evidence from decompiled BIOS files."
    )
    parser.add_argument("root", nargs="?", default=".", help="tree root to scan")
    parser.add_argument("--limit", type=int, default=30, help="rows to print in summaries")
    parser.add_argument("--json-out", type=Path, help="write detailed JSON evidence")
    parser.add_argument("--markdown-out", type=Path, help="write Markdown summary")
    options = parser.parse_args()

    tree_root = Path(options.root).resolve()
    modules = scan(tree_root)
    print_summary(modules, options.limit)

    json_target = options.json_out
    if json_target is not None:
        write_json(tree_root, modules, json_target)
        print(f"\nWrote JSON evidence: {json_target}")

    markdown_target = options.markdown_out
    if markdown_target is not None:
        write_markdown(tree_root, modules, markdown_target, options.limit)
        print(f"Wrote Markdown summary: {markdown_target}")
    return 0
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    exit_status = main()
    raise SystemExit(exit_status)