mantimon-tcg/backend/scripts/fix_raw_text.py
Cal Corum adb55dec12 Fix scraper to preserve energy types in effect text
The source website uses <span class='energy-text energy-text--type-fire'>
to render inline energy icons. BeautifulSoup's get_text() was stripping
these spans, losing the energy type information and causing merged text
like 'Discard aEnergy' instead of 'Discard a Fire Energy'.

Changes:
- Add ENERGY_TEXT_TYPES mapping for inline energy references
- Add replace_energy_text_spans() to convert spans to text before extraction
- Add extract_effect_text() helper with proper text joining (separator=' ')
- Update parse_attack(), parse_ability(), _parse_trainer_details() to use it
- Fix JSON encoding in convert_cards.py to use UTF-8 (ensure_ascii=False)

Before: 'Discard an Energy from this Pokémon'
After:  'Discard a Fire Energy from this Pokémon'

Re-scraped all 372 cards and regenerated 382 definitions.
2026-01-27 15:10:02 -06:00

126 lines
3.6 KiB
Python

#!/usr/bin/env python3
"""Fix text artifacts in existing raw scraped card data.
This script applies the text cleaning rules to existing JSON files
without re-scraping them from the web.
Usage:
python scripts/fix_raw_text.py
"""
import json
import re
import sys
from pathlib import Path
# Same patterns as in the scraper.
#
# Words that show up fused directly onto "Energy" / "Pokémon" in the raw text.
# For each of them the fix simply re-inserts the missing space.
_ENERGY_PREFIXES = ("of", "extra", "attached", "any")
_POKEMON_PREFIXES = (
    "Basic",
    "Benched",
    "the",
    "this",
    "your",
    "Active",
    "Defending",
    "opponent's",
    "Opponent's",
    "that",
    "each",
    "a",
)

# Ordered (regex pattern, replacement) pairs; each pair is fed to ``re.sub``.
TEXT_ARTIFACT_FIXES = [
    # Energy-related artifacts
    # "aEnergy" is the one case where the article changes too ("a" -> "an").
    (r"\baEnergy\b", "an Energy"),
    *((rf"\b{word}Energy\b", f"{word} Energy") for word in _ENERGY_PREFIXES),
    # A count fused onto "Energy", and a hyphenated HP value.
    (r"(\d+)Energy\b", r"\1 Energy"),
    (r"(\d+)-HP\b", r"\1 HP"),
    # Pokemon-related artifacts
    *((rf"\b{word}Pokémon\b", f"{word} Pokémon") for word in _POKEMON_PREFIXES),
]
def clean_text(text: str | None) -> str | None:
"""Apply text cleaning rules."""
if not text:
return text
result = text
for pattern, replacement in TEXT_ARTIFACT_FIXES:
result = re.sub(pattern, replacement, result)
return result
def _clean_field(record: dict, key: str) -> bool:
    """Run ``clean_text`` over ``record[key]`` in place.

    Returns True only when the key held a non-empty value and cleaning
    actually changed it; missing, ``None`` and empty values are left alone.
    """
    original = record.get(key)
    if not original:
        return False
    cleaned = clean_text(original)
    if cleaned == original:
        return False
    record[key] = cleaned
    return True


def fix_card_file(file_path: Path) -> bool:
    """Fix text artifacts in a single card file.

    The cleaned fields are the ``effect_text`` of every entry under
    ``attacks`` and ``abilities``, plus the card-level ``flavor_text``
    (where raw data stores trainer effect text) and ``effect_text``.
    The file is rewritten only when at least one field changed.

    Args:
        file_path: Path to a raw card JSON file.

    Returns:
        True if the file was modified, False otherwise.

    Raises:
        OSError: If the file cannot be read or written.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Read as UTF-8 explicitly: the file is written back as UTF-8 below, and
    # the locale default encoding would mis-decode "Pokémon" on some systems,
    # which silently stops the patterns from matching.
    with open(file_path, encoding="utf-8") as f:
        data = json.load(f)

    modified = False

    # Attack and ability effect texts. ``or []`` also covers an explicit
    # JSON ``null`` stored under the key, not just a missing key.
    for section in ("attacks", "abilities"):
        for entry in data.get(section) or []:
            if _clean_field(entry, "effect_text"):
                modified = True

    # Card-level texts: flavor_text (trainer effect text in raw data) and
    # effect_text (trainer cards).
    for key in ("flavor_text", "effect_text"):
        if _clean_field(data, key):
            modified = True

    if modified:
        with open(file_path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
    return modified
def main():
    """Clean every raw card JSON file and print a summary.

    Looks for ``data/raw`` two levels above this script. Each subdirectory
    whose name does not start with ``_`` is scanned for ``*.json`` files,
    and every one of them is passed to ``fix_card_file``.

    Returns 1 when the raw data directory does not exist, otherwise 0.
    """
    raw_dir = Path(__file__).parent.parent / "data" / "raw"
    if not raw_dir.exists():
        print(f"Error: Raw data directory not found: {raw_dir}")
        return 1

    seen = 0
    changed = 0
    for entry in raw_dir.iterdir():
        # Only real directories count, and "_"-prefixed ones are skipped.
        if entry.is_dir() and not entry.name.startswith("_"):
            for json_path in entry.glob("*.json"):
                seen += 1
                was_fixed = fix_card_file(json_path)
                if was_fixed:
                    changed += 1
                    print(f" Fixed: {json_path.name}")

    print(f"\nProcessed {seen} files, modified {changed}")
    return 0
# Script entry point: the value returned by main() becomes the exit status.
if __name__ == "__main__":
    exit_code = main()
    sys.exit(exit_code)