From 824c621641c5b7d699f2cde1859e0139394c374c Mon Sep 17 00:00:00 2001 From: Renko6626 <2300011496@stu.pku.edu.cn> Date: Thu, 11 Jun 2026 15:35:36 +0800 Subject: [PATCH] Add scripts/import_th_re_data.py (Ghidra importer), move .py into scripts/ --- README.md | 27 ++++++- diffable.py => scripts/diffable.py | 0 scripts/import_th_re_data.py | 110 +++++++++++++++++++++++++++++ stats.py => scripts/stats.py | 0 4 files changed, 136 insertions(+), 1 deletion(-) rename diffable.py => scripts/diffable.py (100%) create mode 100644 scripts/import_th_re_data.py rename stats.py => scripts/stats.py (100%) diff --git a/README.md b/README.md index 86fb1f1..7676dcf 100644 --- a/README.md +++ b/README.md @@ -104,4 +104,29 @@ for struct_name, struct in structs.items(): print() ``` -If you have written scripts for importing into a tool like ghidra or IDA, please share them in an [issue](https://github.com/exphp-share/th-re-data/issues) or [on Discord](https://discord.gg/fvPJvHJ) and I can add them or link to them here. +## Importing into Ghidra + +[`scripts/import_th_re_data.py`](scripts/import_th_re_data.py) imports `funcs.json` and `statics.json` +into a Ghidra project (function/static names + comments). + +It is **safe by default**: it only fills in Ghidra's own placeholder names (`FUN_`/`DAT_`/no symbol/no +comment) and never clobbers a name you've already set. Pass `--overwrite` to replace existing names, or +`--dry-run` to just print the counts without touching the database. + +Two ways to run it (point it at a game's data folder, e.g. 
`data/th16.v1.00a/`): + +```bash +# A) inside Ghidra: Script Manager, or +# analyzeHeadless <PROJECT_DIR> <PROJECT_NAME> -process <PROGRAM> -postScript import_th_re_data.py data/th16.v1.00a +# +# B) standalone PyGhidra driver (Ghidra 12 dropped Jython, so headless = CPython + pyghidra): +python scripts/import_th_re_data.py data/th16.v1.00a \ + --project-dir DIR --project NAME --program /prog +``` + +(Data-symbol edits only persist via the driver's `proj.save()`; `analyzeHeadless -postScript` does not +save them, though function renames persist either way. See the docstring at the top of the script.) + +If you have written import scripts for other tools (IDA, etc.), please share them in an +[issue](https://github.com/exphp-share/th-re-data/issues) or [on Discord](https://discord.gg/fvPJvHJ) +and I can add them or link to them here. diff --git a/diffable.py b/scripts/diffable.py similarity index 100% rename from diffable.py rename to scripts/diffable.py diff --git a/scripts/import_th_re_data.py b/scripts/import_th_re_data.py new file mode 100644 index 0000000..67b4495 --- /dev/null +++ b/scripts/import_th_re_data.py @@ -0,0 +1,110 @@ +#!/usr/bin/env python3 +"""Import exphp-share/th-re-data labels (funcs.json + statics.json) into Ghidra. + +Safe by default: only fills Ghidra defaults (FUN_/DAT_/no symbol/no comment); never clobbers +names you already set. --overwrite replaces them; --dry-run previews counts. + +Run inside Ghidra (Script Manager or `analyzeHeadless <PROJECT_DIR> <PROJECT_NAME> -process <PROGRAM> -postScript import_th_re_data.py <DATA_DIR>`), or as a +standalone PyGhidra driver (Ghidra 12 dropped Jython, so headless = CPython + pyghidra): + + python import_th_re_data.py <DATA_DIR> --project-dir DIR --project NAME --program /prog + +where DATA_DIR is a th-re-data game folder, e.g. data/th16.v1.00a/. Names use `::` for the +class; this flattens to `__`. NOTE: data-symbol edits only persist via the driver's proj.save() +(analyzeHeadless -postScript does not save them); function renames persist either way. 
"""#"""  # closes the module docstring that opens on the preceding (patch-collapsed) line

import json
import os
import re

# Name prefixes treated as placeholders rather than real labels. Records whose
# name matches are skipped unless incl=True (--include-placeholders).
_PLACEHOLDER = re.compile(r'^(sub_|nullsub_?|j_|thunk_?|FID_|\?|loc_|unknown_?)', re.I)


def _keep(n, incl):
    """Return a truthy value if name *n* should be imported.

    Empty or missing names are always rejected; placeholder-looking names
    (see ``_PLACEHOLDER``) are rejected unless *incl* is truthy.
    """
    return bool(n) and (incl or not _PLACEHOLDER.match(n))


def _san(n):
    """Flatten *n* into ``[A-Za-z0-9_]`` only: ``Foo::bar`` -> ``Foo__bar``."""
    return re.sub(r'[^A-Za-z0-9_]', '_', n.replace("::", "__"))


def _load_records(path):
    """Return the parsed JSON content of *path*, or ``[]`` if the file is absent."""
    if not os.path.exists(path):
        return []
    # Context manager so the handle is closed even if the JSON is malformed.
    with open(path, encoding="utf-8") as fh:
        return json.load(fh)


def apply(prog, data_dir, dry=False, overwrite=False, incl=False):
    """Apply funcs.json / statics.json names and comments from *data_dir* to *prog*.

    Args:
        prog: the Ghidra program to modify. This function does not start a
            transaction or save; the caller is responsible for both.
        data_dir: folder containing ``funcs.json`` and/or ``statics.json``.
        dry: when true, only count what would change; nothing is written.
        overwrite: when true, replace names whose source is not DEFAULT and
            replace existing comments; otherwise those are skipped.
        incl: when true, also import names that look like placeholders.

    Returns:
        dict with integer counters ``applied``, ``overwritten``, ``skipped``,
        ``missing`` (no function at the address) and ``comments``.

    Raises:
        FileNotFoundError: if *data_dir* holds neither ``funcs.json`` nor
            ``statics.json`` (almost certainly a wrong path; previously this
            silently reported all-zero counts).
    """
    funcs_path = os.path.join(data_dir, "funcs.json")
    statics_path = os.path.join(data_dir, "statics.json")
    # Checked before touching Ghidra so a bad path fails fast and clearly.
    if not (os.path.exists(funcs_path) or os.path.exists(statics_path)):
        raise FileNotFoundError(
            "no funcs.json or statics.json found in %r" % (data_dir,))

    # Imported lazily: these only resolve inside a running Ghidra/PyGhidra JVM.
    from ghidra.program.model.symbol import SourceType
    from ghidra.program.model.listing import CodeUnit

    US, DEF = SourceType.USER_DEFINED, SourceType.DEFAULT
    addr = prog.getAddressFactory().getDefaultAddressSpace().getAddress
    fm = prog.getFunctionManager()
    st = prog.getSymbolTable()
    lst = prog.getListing()
    n = dict(applied=0, overwritten=0, skipped=0, missing=0, comments=0)

    def comment(a, ctype, text):
        """Set a tagged comment at *a* unless one exists and overwrite is off."""
        if not text:
            return
        if lst.getComment(ctype, a) is not None and not overwrite:
            return
        if not dry:
            lst.setComment(a, ctype, "[th-re-data] " + text)
        n["comments"] += 1

    for rec in _load_records(funcs_path):
        if not _keep(rec.get("name"), incl):
            continue
        a = addr(int(rec["addr"], 16))
        func = fm.getFunctionAt(a)
        if func is None:
            n["missing"] += 1
            continue
        sym = func.getSymbol()
        named = sym is not None and sym.getSource() != DEF
        if named and not overwrite:
            n["skipped"] += 1
        else:
            if not dry:
                func.setName(_san(rec["name"]), US)
            n["overwritten" if named else "applied"] += 1
        comment(a, CodeUnit.PLATE_COMMENT, rec.get("comment"))

    for rec in _load_records(statics_path):
        if not _keep(rec.get("name"), incl):
            continue
        a = addr(int(rec["addr"], 16))
        primary = st.getPrimarySymbol(a)
        named = (primary is not None and not primary.isDynamic()
                 and primary.getSource() != DEF)
        if named and not overwrite:
            n["skipped"] += 1
        else:
            if not dry:
                if named:
                    primary.setName(_san(rec["name"]), US)
                else:
                    st.createLabel(a, _san(rec["name"]), US)
            n["overwritten" if named else "applied"] += 1
        comment(a, CodeUnit.EOL_COMMENT, rec.get("comment"))
    # statics' "type" field is intentionally not applied (could clobber existing data layout).
    return n


def _summary(n, dry, overwrite):
    """Print the counters in *n* on one line, tagged with the run mode."""
    tag = ("[dry-run] " if dry else "") + ("[overwrite] " if overwrite else "[safe] ")
    print(tag + "applied=%(applied)d overwritten=%(overwritten)d skipped=%(skipped)d "
          "missing=%(missing)d comments=%(comments)d" % n)


if __name__ == "__main__":
    cp = globals().get("currentProgram")  # injected only in Ghidra script context
    if cp is not None:
        # Mode A: inside Ghidra (the tool owns the transaction and the save).
        args = list(getScriptArgs())  # noqa: F821
        dry = "--dry-run" in args
        ov = "--overwrite" in args
        incl = "--include-placeholders" in args
        # First non-flag argument is the data folder; otherwise ask the user.
        dd = next((arg for arg in args if not arg.startswith("-")), None)
        if not dd:
            dd = askDirectory("th-re-data dir", "Select").getPath()  # noqa: F821
        _summary(apply(cp, dd, dry, ov, incl), dry, ov)
    else:
        # Mode B: standalone PyGhidra driver.
        import argparse
        import pyghidra

        ap = argparse.ArgumentParser(description=__doc__.splitlines()[0])
        ap.add_argument("data_dir")
        ap.add_argument("--project-dir", required=True)
        ap.add_argument("--project", required=True)
        ap.add_argument("--program", required=True)
        ap.add_argument("--dry-run", action="store_true")
        ap.add_argument("--overwrite", action="store_true")
        ap.add_argument("--include-placeholders", action="store_true")
        a = ap.parse_args()

        pyghidra.start()
        from ghidra.base.project import GhidraProject

        # "--program /folder/name" -> project folder path + program name.
        folder, _, name = a.program.rpartition("/")
        proj = GhidraProject.openProject(os.path.abspath(a.project_dir), a.project, False)
        prog = proj.openProgram(folder or "/", name, False)
        try:
            tx = prog.startTransaction("import th-re-data")
            ok = False
            try:
                n = apply(prog, a.data_dir, a.dry_run, a.overwrite, a.include_placeholders)
                ok = True
            finally:
                # Commit only a successful, non-dry run; a failed import is rolled back
                # instead of leaving a half-applied set of renames in the program.
                prog.endTransaction(tx, ok and not a.dry_run)
            if not a.dry_run:
                proj.save(prog)  # required for data-symbol changes to persist
        finally:
            proj.close()
        _summary(n, a.dry_run, a.overwrite)

# -- remaining patch record from the original line (not part of the script) --
# diff --git a/stats.py b/scripts/stats.py
# similarity index 100%
# rename from stats.py
# rename to scripts/stats.py