"""
Validator for tracked changes in Word documents.
"""

import subprocess
import tempfile
import xml.etree.ElementTree as ET
import zipfile
from pathlib import Path


class RedliningValidator:
    """Check that one author's edits to a .docx are all tracked changes.

    The validator compares ``word/document.xml`` from an unpacked (edited)
    document against the same part inside the original ``.docx``. In both
    trees it undoes the author's tracked changes (drops their ``w:ins``
    elements, unwraps their ``w:del`` elements) and then compares the
    paragraph text. If the texts differ, some edit was made outside of the
    author's tracked changes.

    Note: when the edited document contains no ``w:ins``/``w:del`` element
    attributed to the author, ``validate`` passes immediately without
    comparing any text.
    """

    def __init__(self, unpacked_dir, original_docx, verbose=False, author="Claude"):
        """Store the paths and options used by ``validate``.

        Args:
            unpacked_dir: Directory holding the unpacked, edited document
                (must contain ``word/document.xml``).
            original_docx: Path to the original ``.docx`` archive.
            verbose: When true, also print a message on success.
            author: Value of the ``w:author`` attribute whose tracked
                changes are being validated.
        """
        self.unpacked_dir = Path(unpacked_dir)
        self.original_docx = Path(original_docx)
        self.verbose = verbose
        self.author = author
        self.namespaces = {
            "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
        }

    def repair(self) -> int:
        """Perform no repairs; always returns 0."""
        return 0

    def validate(self):
        """Return True when all of the author's edits are tracked changes.

        Prints a ``FAILED - ...`` message and returns False when a required
        file is missing, the original archive cannot be unpacked, either
        XML file cannot be parsed, or the texts differ once the author's
        tracked changes are undone. Returns True (printing ``PASSED - ...``
        only in verbose mode) otherwise, including the early exit when the
        author has no tracked changes in the edited document.
        """
        modified_file = self.unpacked_dir / "word" / "document.xml"
        if not modified_file.exists():
            print(f"FAILED - Modified document.xml not found at {modified_file}")
            return False

        try:
            has_changes = self._has_author_changes(ET.parse(modified_file).getroot())
        except (ET.ParseError, OSError):
            # Deliberate fall-through: the full comparison below parses the
            # file again and is the place where the failure gets reported.
            has_changes = True

        if not has_changes:
            if self.verbose:
                print(f"PASSED - No tracked changes by {self.author} found.")
            return True

        with tempfile.TemporaryDirectory() as temp_dir:
            temp_path = Path(temp_dir)

            try:
                with zipfile.ZipFile(self.original_docx, "r") as zip_ref:
                    zip_ref.extractall(temp_path)
            except Exception as e:
                # Broad on purpose: any unpacking problem (missing file,
                # corrupt archive, unsupported compression, ...) is reported
                # as a validation failure rather than raised.
                print(f"FAILED - Error unpacking original docx: {e}")
                return False

            original_file = temp_path / "word" / "document.xml"
            if not original_file.exists():
                print(
                    f"FAILED - Original document.xml not found in {self.original_docx}"
                )
                return False

            try:
                modified_root = ET.parse(modified_file).getroot()
                original_root = ET.parse(original_file).getroot()
            except ET.ParseError as e:
                print(f"FAILED - Error parsing XML files: {e}")
                return False

        # Both trees are fully in memory now; the temp directory is gone.
        self._remove_author_tracked_changes(original_root)
        self._remove_author_tracked_changes(modified_root)

        modified_text = self._extract_text_content(modified_root)
        original_text = self._extract_text_content(original_root)

        if modified_text != original_text:
            print(self._generate_detailed_diff(original_text, modified_text))
            return False

        if self.verbose:
            print(f"PASSED - All changes by {self.author} are properly tracked")
        return True

    def _has_author_changes(self, root):
        """Return True if any descendant w:ins/w:del is attributed to the author."""
        author_attr = f"{{{self.namespaces['w']}}}author"
        return any(
            elem.get(author_attr) == self.author
            for path in (".//w:del", ".//w:ins")
            for elem in root.findall(path, self.namespaces)
        )

    def _generate_detailed_diff(self, original_text, modified_text):
        """Build the multi-line failure message, including a git word diff.

        When no diff can be produced the message ends with a note that git
        is not available instead of the "Differences" section.
        """
        error_parts = [
            f"FAILED - Document text doesn't match after removing {self.author}'s tracked changes",
            "",
            "Likely causes:",
            " 1. Modified text inside another author's <w:ins> or <w:del> tags",
            " 2. Made edits without proper tracked changes",
            " 3. Didn't nest <w:del> inside <w:ins> when deleting another's insertion",
            "",
            "For pre-redlined documents, use correct patterns:",
            " - To reject another's INSERTION: Nest <w:del> inside their <w:ins>",
            " - To restore another's DELETION: Add new <w:ins> AFTER their <w:del>",
            "",
        ]

        git_diff = self._get_git_word_diff(original_text, modified_text)
        if git_diff:
            error_parts.extend(["Differences:", "============", git_diff])
        else:
            error_parts.append("Unable to generate word diff (git not available)")

        return "\n".join(error_parts)

    def _get_git_word_diff(self, original_text, modified_text):
        """Return a ``git diff --word-diff`` of the two texts, or None.

        A character-level diff (``--word-diff-regex=.``) is tried first; if
        it yields no hunk content, a plain word-level diff is used. This is
        best-effort: None is returned when git cannot be run or produces no
        usable output.
        """
        try:
            with tempfile.TemporaryDirectory() as temp_dir:
                temp_path = Path(temp_dir)
                original_file = temp_path / "original.txt"
                modified_file = temp_path / "modified.txt"
                original_file.write_text(original_text, encoding="utf-8")
                modified_file.write_text(modified_text, encoding="utf-8")

                for extra_args in (["--word-diff-regex=."], []):
                    diff = self._run_git_word_diff(
                        original_file, modified_file, extra_args
                    )
                    if diff:
                        return diff
        except (OSError, subprocess.SubprocessError, UnicodeError):
            # git missing, temp files unwritable, or text not encodable:
            # the diff is only a diagnostic aid, so give up quietly.
            pass

        return None

    @staticmethod
    def _run_git_word_diff(original_file, modified_file, extra_args):
        """Run one ``git diff --no-index`` and return only the hunk content.

        Everything up to and including each ``@@`` hunk header is dropped,
        as are blank lines. Returns an empty string when there is no hunk
        content.
        """
        result = subprocess.run(
            [
                "git",
                "diff",
                "--word-diff=plain",
                *extra_args,
                "-U0",
                "--no-index",
                str(original_file),
                str(modified_file),
            ],
            capture_output=True,
            text=True,
            # The temp files are written as UTF-8, so decode git's output
            # the same way instead of relying on the locale encoding.
            encoding="utf-8",
            errors="replace",
        )

        content_lines = []
        in_content = False
        for line in result.stdout.split("\n"):
            if line.startswith("@@"):
                in_content = True
            elif in_content and line.strip():
                content_lines.append(line)
        return "\n".join(content_lines)

    def _remove_author_tracked_changes(self, root):
        """Undo the author's tracked changes in ``root`` (modified in place).

        The author's ``w:ins`` elements are removed together with their
        content. The author's ``w:del`` elements are replaced by their own
        children, with every ``w:delText`` inside renamed to ``w:t`` so the
        deleted text counts as ordinary text again.
        """
        w_ns = self.namespaces["w"]
        ins_tag = f"{{{w_ns}}}ins"
        del_tag = f"{{{w_ns}}}del"
        deltext_tag = f"{{{w_ns}}}delText"
        t_tag = f"{{{w_ns}}}t"
        author_attr = f"{{{w_ns}}}author"

        def by_author(elem, tag):
            return elem.tag == tag and elem.get(author_attr) == self.author

        for parent in root.iter():
            # Collect first, then remove, so the child list is not mutated
            # while it is being scanned.
            for inserted in [c for c in parent if by_author(c, ins_tag)]:
                parent.remove(inserted)

        for parent in root.iter():
            deletions = [
                (index, child)
                for index, child in enumerate(parent)
                if by_author(child, del_tag)
            ]
            # Work right-to-left so earlier indices stay valid as each
            # deletion is expanded into (possibly several) children.
            for index, del_elem in reversed(deletions):
                for elem in list(del_elem.iter(deltext_tag)):
                    elem.tag = t_tag
                parent[index : index + 1] = list(del_elem)

    def _extract_text_content(self, root):
        """Return the document text, one line per non-empty paragraph.

        Each ``w:p`` contributes the concatenation of the text of all
        ``w:t`` elements beneath it; paragraphs with no text are skipped.
        """
        p_tag = f"{{{self.namespaces['w']}}}p"
        t_tag = f"{{{self.namespaces['w']}}}t"

        paragraphs = (
            "".join(t.text for t in p.findall(f".//{t_tag}") if t.text)
            for p in root.findall(f".//{p_tag}")
        )
        return "\n".join(text for text in paragraphs if text)


if __name__ == "__main__":
    raise RuntimeError("This module should not be run directly.")