mantimon-tcg/backend/scripts/fix_raw_text.py
Cal Corum c6e3695760 Fix card data pipeline: fossil cards and text artifacts
Scraper fixes:
- Detect fossil cards (Helix/Dome Fossil, Old Amber) as Trainer/Item cards
- Add text artifact cleaning for stripped energy icons:
  - 'aEnergy' -> 'an Energy'
  - 'extraEnergy' -> 'extra Energy'
  - 'BenchedPokémon' -> 'Benched Pokémon'
  - And 20+ other common patterns

Converter improvements:
- Add evolution chain validation to detect broken evolves_from references
- Track conversion errors and validation warnings in _index.json
- Return errors from convert_set() for better debugging

Data fixes:
- Fixed 4 fossil cards (now correctly typed as trainer/item)
- Fixed text artifacts in 46 raw card files
- Regenerated all 382 card definitions
- All evolution chains now valid

Added fix_raw_text.py utility script for batch text cleanup.
2026-01-27 14:37:03 -06:00

126 lines
3.6 KiB
Python

#!/usr/bin/env python3
"""Fix text artifacts in existing raw scraped card data.
This script applies the text cleaning rules to existing JSON files
without re-scraping them from the web.
Usage:
python scripts/fix_raw_text.py
"""
import json
import re
import sys
from pathlib import Path
# Same patterns as in the scraper.
# Every entry is a (regex pattern, replacement) pair; the replacement may use
# backreferences such as \1.

# Words or numbers fused onto "Energy", plus the hyphenated "<n>-HP" form.
_ENERGY_ARTIFACT_FIXES: list[tuple[str, str]] = [
    (r"\baEnergy\b", "an Energy"),
    (r"\bofEnergy\b", "of Energy"),
    (r"\bextraEnergy\b", "extra Energy"),
    (r"\battachedEnergy\b", "attached Energy"),
    (r"\banyEnergy\b", "any Energy"),
    (r"(\d+)Energy\b", r"\1 Energy"),
    (r"(\d+)-HP\b", r"\1 HP"),
]

# Words fused onto "Pokémon".
_POKEMON_ARTIFACT_FIXES: list[tuple[str, str]] = [
    (r"\bBasicPokémon\b", "Basic Pokémon"),
    (r"\bBenchedPokémon\b", "Benched Pokémon"),
    (r"\bthePokémon\b", "the Pokémon"),
    (r"\bthisPokémon\b", "this Pokémon"),
    (r"\byourPokémon\b", "your Pokémon"),
    (r"\bActivePokémon\b", "Active Pokémon"),
    (r"\bDefendingPokémon\b", "Defending Pokémon"),
    (r"\bopponent'sPokémon\b", "opponent's Pokémon"),
    (r"\bOpponent'sPokémon\b", "Opponent's Pokémon"),
    (r"\bthatPokémon\b", "that Pokémon"),
    (r"\beachPokémon\b", "each Pokémon"),
    (r"\baPokémon\b", "a Pokémon"),
]

# Public table: Energy-related fixes first, then Pokémon-related fixes.
TEXT_ARTIFACT_FIXES = _ENERGY_ARTIFACT_FIXES + _POKEMON_ARTIFACT_FIXES
def clean_text(text: str | None) -> str | None:
"""Apply text cleaning rules."""
if not text:
return text
result = text
for pattern, replacement in TEXT_ARTIFACT_FIXES:
result = re.sub(pattern, replacement, result)
return result
def _clean_field(record: object, key: str) -> bool:
    """Clean the string stored at ``record[key]`` in place.

    Returns True only when *record* is a dict, the value is a non-empty string,
    and ``clean_text`` produced a different string (which is then stored back).
    """
    if not isinstance(record, dict):
        return False
    original = record.get(key)
    if not original or not isinstance(original, str):
        return False
    cleaned = clean_text(original)
    if cleaned == original:
        return False
    record[key] = cleaned
    return True


def fix_card_file(file_path: Path) -> bool:
    """Fix text artifacts in a single card file.

    Loads the JSON document at *file_path* and runs ``clean_text`` over the
    ``effect_text`` of each entry in ``attacks`` and ``abilities`` and over the
    top-level ``flavor_text`` and ``effect_text`` fields. The file is rewritten
    (2-space indent) only when at least one of those strings changed.

    Missing or ``null`` lists, non-dict list entries, non-string field values
    and a non-object top-level document are skipped rather than raising.

    Args:
        file_path: Path to a raw card JSON file.

    Returns:
        True if the file was modified, False otherwise.

    Raises:
        OSError: The file cannot be read or written.
        UnicodeDecodeError: The file is not valid UTF-8.
        json.JSONDecodeError: The file is not valid JSON.
    """
    # Card text contains non-ASCII characters ("Pokémon"), so decode as UTF-8
    # explicitly instead of relying on the platform's locale encoding.
    with open(file_path, encoding="utf-8") as f:
        data = json.load(f)

    if not isinstance(data, dict):
        return False

    modified = False

    # Attack and ability effect texts. `or []` also covers an explicit null.
    for list_key in ("attacks", "abilities"):
        for entry in data.get(list_key) or []:
            if _clean_field(entry, "effect_text"):
                modified = True

    # Trainer effect text is stored in flavor_text for raw data; effect_text is
    # the field used for trainer cards.
    for field in ("flavor_text", "effect_text"):
        if _clean_field(data, field):
            modified = True

    if modified:
        with open(file_path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2)
    return modified
def main() -> int:
    """Clean every raw card file and print a summary.

    Looks for ``data/raw`` one level above the directory containing this
    script, visits each sub-directory whose name does not start with ``_``,
    and runs ``fix_card_file`` on every ``*.json`` file directly inside it.

    A file that cannot be read, decoded or parsed is reported on stderr and
    skipped so that one bad file does not abort the rest of the batch.

    Returns:
        Process exit code: 0 on success, 1 if the raw data directory is
        missing or any file could not be processed.
    """
    raw_dir = Path(__file__).parent.parent / "data" / "raw"
    # is_dir() rather than exists(): a stray file at this path would otherwise
    # blow up in iterdir() below.
    if not raw_dir.is_dir():
        print(f"Error: Raw data directory not found: {raw_dir}", file=sys.stderr)
        return 1

    total_files = 0
    modified_files = 0
    failed_files = 0

    # Sorted so the report order is stable across runs and filesystems.
    for set_dir in sorted(raw_dir.iterdir()):
        if not set_dir.is_dir() or set_dir.name.startswith("_"):
            continue
        for card_file in sorted(set_dir.glob("*.json")):
            total_files += 1
            try:
                was_modified = fix_card_file(card_file)
            except (OSError, ValueError) as exc:
                # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
                failed_files += 1
                print(f" Failed: {card_file.name}: {exc}", file=sys.stderr)
                continue
            if was_modified:
                modified_files += 1
                print(f" Fixed: {card_file.name}")

    print(f"\nProcessed {total_files} files, modified {modified_files}")
    if failed_files:
        print(f"{failed_files} files could not be processed", file=sys.stderr)
        return 1
    return 0


if __name__ == "__main__":
    sys.exit(main())