You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
133 lines
4.0 KiB
Python
133 lines
4.0 KiB
Python
"""Unpack Office files (DOCX, PPTX, XLSX) for editing.
|
|
|
|
Extracts the ZIP archive, pretty-prints XML files, and optionally:
|
|
- Merges adjacent runs with identical formatting (DOCX only)
|
|
- Simplifies adjacent tracked changes from same author (DOCX only)
|
|
|
|
Usage:
|
|
python unpack.py <office_file> <output_dir> [options]
|
|
|
|
Examples:
|
|
python unpack.py document.docx unpacked/
|
|
python unpack.py presentation.pptx unpacked/
|
|
python unpack.py document.docx unpacked/ --merge-runs false
|
|
"""
|
|
|
|
import argparse
|
|
import sys
|
|
import zipfile
|
|
from pathlib import Path
|
|
|
|
import defusedxml.minidom
|
|
|
|
from helpers.merge_runs import merge_runs as do_merge_runs
|
|
from helpers.simplify_redlines import simplify_redlines as do_simplify_redlines
|
|
|
|
SMART_QUOTE_REPLACEMENTS = {
|
|
"\u201c": "“",
|
|
"\u201d": "”",
|
|
"\u2018": "‘",
|
|
"\u2019": "’",
|
|
}
|
|
|
|
|
|
def unpack(
|
|
input_file: str,
|
|
output_directory: str,
|
|
merge_runs: bool = True,
|
|
simplify_redlines: bool = True,
|
|
) -> tuple[None, str]:
|
|
input_path = Path(input_file)
|
|
output_path = Path(output_directory)
|
|
suffix = input_path.suffix.lower()
|
|
|
|
if not input_path.exists():
|
|
return None, f"Error: {input_file} does not exist"
|
|
|
|
if suffix not in {".docx", ".pptx", ".xlsx"}:
|
|
return None, f"Error: {input_file} must be a .docx, .pptx, or .xlsx file"
|
|
|
|
try:
|
|
output_path.mkdir(parents=True, exist_ok=True)
|
|
|
|
with zipfile.ZipFile(input_path, "r") as zf:
|
|
zf.extractall(output_path)
|
|
|
|
xml_files = list(output_path.rglob("*.xml")) + list(output_path.rglob("*.rels"))
|
|
for xml_file in xml_files:
|
|
_pretty_print_xml(xml_file)
|
|
|
|
message = f"Unpacked {input_file} ({len(xml_files)} XML files)"
|
|
|
|
if suffix == ".docx":
|
|
if simplify_redlines:
|
|
simplify_count, _ = do_simplify_redlines(str(output_path))
|
|
message += f", simplified {simplify_count} tracked changes"
|
|
|
|
if merge_runs:
|
|
merge_count, _ = do_merge_runs(str(output_path))
|
|
message += f", merged {merge_count} runs"
|
|
|
|
for xml_file in xml_files:
|
|
_escape_smart_quotes(xml_file)
|
|
|
|
return None, message
|
|
|
|
except zipfile.BadZipFile:
|
|
return None, f"Error: {input_file} is not a valid Office file"
|
|
except Exception as e:
|
|
return None, f"Error unpacking: {e}"
|
|
|
|
|
|
def _pretty_print_xml(xml_file: Path) -> None:
|
|
try:
|
|
content = xml_file.read_text(encoding="utf-8")
|
|
dom = defusedxml.minidom.parseString(content)
|
|
xml_file.write_bytes(dom.toprettyxml(indent=" ", encoding="utf-8"))
|
|
except Exception:
|
|
pass
|
|
|
|
|
|
def _escape_smart_quotes(xml_file: Path) -> None:
|
|
try:
|
|
content = xml_file.read_text(encoding="utf-8")
|
|
for char, entity in SMART_QUOTE_REPLACEMENTS.items():
|
|
content = content.replace(char, entity)
|
|
xml_file.write_text(content, encoding="utf-8")
|
|
except Exception:
|
|
pass
|
|
|
|
|
|
if __name__ == "__main__":
|
|
parser = argparse.ArgumentParser(
|
|
description="Unpack an Office file (DOCX, PPTX, XLSX) for editing"
|
|
)
|
|
parser.add_argument("input_file", help="Office file to unpack")
|
|
parser.add_argument("output_directory", help="Output directory")
|
|
parser.add_argument(
|
|
"--merge-runs",
|
|
type=lambda x: x.lower() == "true",
|
|
default=True,
|
|
metavar="true|false",
|
|
help="Merge adjacent runs with identical formatting (DOCX only, default: true)",
|
|
)
|
|
parser.add_argument(
|
|
"--simplify-redlines",
|
|
type=lambda x: x.lower() == "true",
|
|
default=True,
|
|
metavar="true|false",
|
|
help="Merge adjacent tracked changes from same author (DOCX only, default: true)",
|
|
)
|
|
args = parser.parse_args()
|
|
|
|
_, message = unpack(
|
|
args.input_file,
|
|
args.output_directory,
|
|
merge_runs=args.merge_runs,
|
|
simplify_redlines=args.simplify_redlines,
|
|
)
|
|
print(message)
|
|
|
|
if "Error" in message:
|
|
sys.exit(1)
|