#!/usr/bin/env python3 """Fix text artifacts in existing raw scraped card data. This script applies the text cleaning rules to existing JSON files without re-scraping them from the web. Usage: python scripts/fix_raw_text.py """ import json import re import sys from pathlib import Path # Same patterns as in the scraper TEXT_ARTIFACT_FIXES = [ # Energy-related artifacts (r"\baEnergy\b", "an Energy"), (r"\bofEnergy\b", "of Energy"), (r"\bextraEnergy\b", "extra Energy"), (r"\battachedEnergy\b", "attached Energy"), (r"\banyEnergy\b", "any Energy"), (r"(\d+)Energy\b", r"\1 Energy"), (r"(\d+)-HP\b", r"\1 HP"), # Pokemon-related artifacts (r"\bBasicPokémon\b", "Basic Pokémon"), (r"\bBenchedPokémon\b", "Benched Pokémon"), (r"\bthePokémon\b", "the Pokémon"), (r"\bthisPokémon\b", "this Pokémon"), (r"\byourPokémon\b", "your Pokémon"), (r"\bActivePokémon\b", "Active Pokémon"), (r"\bDefendingPokémon\b", "Defending Pokémon"), (r"\bopponent'sPokémon\b", "opponent's Pokémon"), (r"\bOpponent'sPokémon\b", "Opponent's Pokémon"), (r"\bthatPokémon\b", "that Pokémon"), (r"\beachPokémon\b", "each Pokémon"), (r"\baPokémon\b", "a Pokémon"), ] def clean_text(text: str | None) -> str | None: """Apply text cleaning rules.""" if not text: return text result = text for pattern, replacement in TEXT_ARTIFACT_FIXES: result = re.sub(pattern, replacement, result) return result def fix_card_file(file_path: Path) -> bool: """Fix text artifacts in a single card file. Returns True if the file was modified. """ with open(file_path) as f: data = json.load(f) modified = False # Fix attack effect texts for attack in data.get("attacks", []): if attack.get("effect_text"): cleaned = clean_text(attack["effect_text"]) if cleaned != attack["effect_text"]: attack["effect_text"] = cleaned modified = True # Fix ability effect texts for ability in data.get("abilities", []): if ability.get("effect_text"): cleaned = clean_text(ability["effect_text"]) if cleaned != ability["effect_text"]: ability["effect_text"] = cleaned modified = True # Fix trainer effect text (stored in flavor_text for raw data) if data.get("flavor_text"): cleaned = clean_text(data["flavor_text"]) if cleaned != data["flavor_text"]: data["flavor_text"] = cleaned modified = True # Fix effect_text field (for trainer cards) if data.get("effect_text"): cleaned = clean_text(data["effect_text"]) if cleaned != data["effect_text"]: data["effect_text"] = cleaned modified = True if modified: with open(file_path, "w", encoding="utf-8") as f: json.dump(data, f, indent=2, ensure_ascii=False) return modified def main(): raw_dir = Path(__file__).parent.parent / "data" / "raw" if not raw_dir.exists(): print(f"Error: Raw data directory not found: {raw_dir}") return 1 total_files = 0 modified_files = 0 for set_dir in raw_dir.iterdir(): if not set_dir.is_dir() or set_dir.name.startswith("_"): continue for card_file in set_dir.glob("*.json"): total_files += 1 if fix_card_file(card_file): modified_files += 1 print(f" Fixed: {card_file.name}") print(f"\nProcessed {total_files} files, modified {modified_files}") return 0 if __name__ == "__main__": sys.exit(main())