The source website uses <span class='energy-text energy-text--type-fire'> to render inline energy icons. BeautifulSoup's get_text() was stripping these spans, losing the energy type information and causing merged text like 'Discard aEnergy' instead of 'Discard a Fire Energy'. Changes: - Add ENERGY_TEXT_TYPES mapping for inline energy references - Add replace_energy_text_spans() to convert spans to text before extraction - Add extract_effect_text() helper with proper text joining (separator=' ') - Update parse_attack(), parse_ability(), _parse_trainer_details() to use it - Fix JSON encoding in convert_cards.py to use UTF-8 (ensure_ascii=False) Before: 'Discard an Energy from this Pokémon' After: 'Discard a Fire Energy from this Pokémon' Re-scraped all 372 cards and regenerated 382 definitions.
126 lines
3.6 KiB
Python
126 lines
3.6 KiB
Python
#!/usr/bin/env python3
|
|
"""Fix text artifacts in existing raw scraped card data.
|
|
|
|
This script applies the text cleaning rules to existing JSON files
|
|
without re-scraping them from the web.
|
|
|
|
Usage:
|
|
python scripts/fix_raw_text.py
|
|
"""
|
|
|
|
import json
|
|
import re
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
# Cleaning rules mirroring the patterns used by the scraper.  Each entry is a
# (regex pattern, replacement) pair; clean_text() applies them in list order.
TEXT_ARTIFACT_FIXES = [
    # Energy-related artifacts
    # "aEnergy" is the one rule that also rewrites the article ("a" -> "an").
    (r"\baEnergy\b", "an Energy"),
    # Remaining "<word>Energy" merges only need the missing space restored.
    *(
        (rf"\b{word}Energy\b", f"{word} Energy")
        for word in ("of", "extra", "attached", "any")
    ),
    (r"(\d+)Energy\b", r"\1 Energy"),
    (r"(\d+)-HP\b", r"\1 HP"),
    # Pokemon-related artifacts
    # Every "<word>Pokémon" merge is repaired by re-inserting the space.
    *(
        (rf"\b{word}Pokémon\b", f"{word} Pokémon")
        for word in (
            "Basic",
            "Benched",
            "the",
            "this",
            "your",
            "Active",
            "Defending",
            "opponent's",
            "Opponent's",
            "that",
            "each",
            "a",
        )
    ),
]
|
|
|
|
|
|
def clean_text(text: str | None) -> str | None:
    """Return *text* with every TEXT_ARTIFACT_FIXES rule applied in order.

    Falsy input (``None`` or an empty string) is handed back unchanged.
    """
    if text:
        # Each rule operates on the output of the previous one.
        for regex, substitute in TEXT_ARTIFACT_FIXES:
            text = re.sub(regex, substitute, text)
    return text
|
|
|
|
|
|
def fix_card_file(file_path: Path) -> bool:
    """Fix text artifacts in a single card file.

    The JSON document at *file_path* is loaded and ``clean_text`` is applied
    to the following fields when they hold a non-empty value:

    * ``effect_text`` of every entry in ``attacks``
    * ``effect_text`` of every entry in ``abilities``
    * top-level ``flavor_text``
    * top-level ``effect_text``

    The file is rewritten (UTF-8, 2-space indent, non-ASCII characters kept
    as-is) only when at least one field actually changed.

    Args:
        file_path: Path to the card JSON file to fix in place.

    Returns:
        True if the file was modified, False otherwise.
    """
    # Read explicitly as UTF-8: the file is written back as UTF-8 below, and
    # relying on the platform default encoding can mis-decode characters such
    # as the "é" in "Pokémon" (or fail outright) on non-UTF-8 locales.
    with open(file_path, encoding="utf-8") as f:
        data = json.load(f)

    # Collect every (container, key) pair whose text should be cleaned.
    targets = [
        (entry, "effect_text")
        for section in ("attacks", "abilities")
        # "or []" also covers a section that is present but null in the JSON.
        for entry in (data.get(section) or [])
    ]
    # Trainer effect text (stored in flavor_text for raw data) and the
    # top-level effect_text field (for trainer cards).
    targets += [(data, "flavor_text"), (data, "effect_text")]

    modified = False
    for container, key in targets:
        original = container.get(key)
        if not original:
            continue
        cleaned = clean_text(original)
        if cleaned != original:
            container[key] = cleaned
            modified = True

    # Leave untouched files alone so their bytes and mtimes are preserved.
    if modified:
        with open(file_path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2, ensure_ascii=False)

    return modified
|
|
|
|
|
|
def main() -> int:
    """Clean every raw card JSON file under ``<repo>/data/raw``.

    Each sub-directory of the raw data directory is scanned for ``*.json``
    files, skipping entries that are not directories or whose name starts
    with ``_``.  Every file is passed to ``fix_card_file``; modified files are
    listed and a summary is printed at the end.

    Returns:
        Process exit code: 1 if the raw data directory does not exist,
        0 otherwise.
    """
    raw_dir = Path(__file__).parent.parent / "data" / "raw"

    if not raw_dir.exists():
        # Diagnostics belong on stderr so stdout carries only the report.
        print(f"Error: Raw data directory not found: {raw_dir}", file=sys.stderr)
        return 1

    total_files = 0
    modified_files = 0

    # iterdir()/glob() yield entries in arbitrary order; sort so the report is
    # deterministic from run to run.
    for set_dir in sorted(raw_dir.iterdir()):
        if not set_dir.is_dir() or set_dir.name.startswith("_"):
            continue

        for card_file in sorted(set_dir.glob("*.json")):
            total_files += 1
            if fix_card_file(card_file):
                modified_files += 1
                print(f"  Fixed: {card_file.name}")

    print(f"\nProcessed {total_files} files, modified {modified_files}")
    return 0
|
|
|
|
|
|
# Script entry point: propagate main()'s return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|