diff --git a/CLAUDE.md b/CLAUDE.md index 59952be..45a8267 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -61,6 +61,21 @@ python analyze_cardset_rarity.py # Analyze players by franchise and rarity (batt python rank_pitching_staffs.py # Rank teams 1-30 by pitching staff quality ``` +### Position Validation +```bash +# Verify position assignments after card generation (recommended after every run) +./scripts/check_positions.sh CARDSET_ID [api_url] + +# Examples: +./scripts/check_positions.sh 27 # Check production +./scripts/check_positions.sh 27 https://pddev.manticorum.com/api # Check dev + +# The script flags: +# - Anomalous DH counts (should be <5 for full-season cards) +# - Missing outfield positions (indicates defensive calculation failures) +# - Mismatches between player positions and cardpositions table +``` + ## Data Input Requirements ### FanGraphs Data (place in data-input/[YEAR] [TYPE] Cardset/) @@ -213,4 +228,31 @@ Before running retrosheet_data.py, verify these configuration settings: ```python ob_vl = float(108 * (df_data['BB_vL'] + df_data['HBP_vL']) / df_data['TBF_vL']) result = min(ob_vl, 0.8) # Now works correctly -``` \ No newline at end of file +``` + +### Outfielders Assigned as DH (Defense Column Mismatch) +**Problem**: All outfielders show `pos_1 = "DH"` instead of LF/CF/RF; cardpositions table has 0 outfield positions + +**Root Cause**: Code checks for `bis_runs_outfield` or `tz_runs_outfield` columns in defense CSV files, but Baseball Reference only provides `tz_runs_total` + +**Symptoms**: +- 50+ players with DH as pos_1 (should be <5 for full season) +- No LF/CF/RF positions in player records +- Log errors: "Outfield position failed: 'tz_runs_outfield'" + +**Solution** (retrosheet_data.py lines 889, 926, 947): +```python +# Wrong - checks batter stats row instead of defense dataframe columns +if 'tz_runs_total' in row: # ❌ + +# Correct - checks defense dataframe for actual column +if 'bis_runs_total' in pos_df.columns: # ✅ + +# Wrong - column doesn't exist 
in CSV +of_run_rating = 'bis_runs_outfield' if 'bis_runs_outfield' in pos_df else 'tz_runs_outfield' # ❌ + +# Correct - fallback to column that exists +of_run_rating = 'bis_runs_outfield' if 'bis_runs_outfield' in pos_df.columns else 'tz_runs_total' # ✅ +``` + +**Verification**: Run `./scripts/check_positions.sh CARDSET_ID` after card generation to catch this issue \ No newline at end of file diff --git a/retrosheet_data.py b/retrosheet_data.py index 80f12b8..1770e5a 100644 --- a/retrosheet_data.py +++ b/retrosheet_data.py @@ -58,9 +58,9 @@ MIN_TBF_VR = MIN_PA_VR CARDSET_ID = 27 if 'live' in PLAYER_DESCRIPTION.lower() else 28 # 27: 2005 Live, 28: 2005 Promos # Per-Update Parameters -SEASON_PCT = 162 / 162 # Full season +SEASON_PCT = 28 / 162 # Partial season (28 of 162 games) START_DATE = 20050301 # YYYYMMDD format - 2005 Opening Day -END_DATE = 20050430 # YYYYMMDD format - 2005 Regular Season End +END_DATE = 20050430 # YYYYMMDD format - Month 1 of play POST_DATA = True LAST_WEEK_RATIO = 0.0 if PLAYER_DESCRIPTION == 'Live' else 0.0 LAST_TWOWEEKS_RATIO = 0.0 @@ -886,7 +886,7 @@ def calc_positions(bs: pd.DataFrame) -> pd.DataFrame: if row['key_bbref'] in pos_df.index: logger.info(f'Running {position} stats for {row["use_name"]} {row["last_name"]}') try: - if 'tz_runs_total' in row: + if 'bis_runs_total' in pos_df.columns: average_range = (int(pos_df.at[row["key_bbref"], 'tz_runs_total']) + int(pos_df.at[row["key_bbref"], 'bis_runs_total']) + min( @@ -923,7 +923,7 @@ def calc_positions(bs: pd.DataFrame) -> pd.DataFrame: for pos_df, position in [(df_lf, 'lf'), (df_cf, 'cf'), (df_rf, 'rf')]: if row["key_bbref"] in pos_df.index: try: - if 'tz_runs_total' in row: + if 'bis_runs_total' in pos_df.columns: average_range = (int(pos_df.at[row["key_bbref"], 'tz_runs_total']) + int(pos_df.at[row["key_bbref"], 'bis_runs_total']) + min( @@ -944,7 +944,7 @@ def calc_positions(bs: pd.DataFrame) -> pd.DataFrame: season_pct=season_pct ) }) - of_run_rating = 'bis_runs_outfield' if 'bis_runs_outfield' in pos_df else 
'tz_runs_outfield' + of_run_rating = 'bis_runs_outfield' if 'bis_runs_outfield' in pos_df.columns else 'tz_runs_total' of_arms.append(int(pos_df.at[row["key_bbref"], of_run_rating])) no_data = False except Exception as e: diff --git a/scripts/check_positions.sh b/scripts/check_positions.sh new file mode 100755 index 0000000..ed0b781 --- /dev/null +++ b/scripts/check_positions.sh @@ -0,0 +1,106 @@ +#!/bin/bash + +# Usage: ./scripts/check_positions.sh CARDSET_ID [api_url] +# Example: ./scripts/check_positions.sh 27 +# Example: ./scripts/check_positions.sh 27 https://pddev.manticorum.com/api + +CARDSET_ID=$1 +API_URL=${2:-"https://pd.manticorum.com/api"} + +if [ -z "$CARDSET_ID" ]; then + echo "Error: Cardset ID required" + echo "Usage: $0 [api_url]" + echo "Example: $0 27" + echo "Example: $0 27 https://pddev.manticorum.com/api" + exit 1 +fi + +echo "======================================" +echo "Position Analysis for Cardset $CARDSET_ID" +echo "API: $API_URL" +echo "======================================" +echo "" + +# Fetch players and save to temp file +TEMP_FILE=$(mktemp) +curl -s "$API_URL/v2/players?cardset_id=$CARDSET_ID" | jq -r '.players[] | "\(.p_name),\(.pos_1),\(.pos_2),\(.pos_3)"' > "$TEMP_FILE" + +TOTAL_PLAYERS=$(wc -l < "$TEMP_FILE") +echo "Total players: $TOTAL_PLAYERS" +echo "" + +# Position distribution +echo "=== Position 1 Distribution ===" +cat "$TEMP_FILE" | cut -d',' -f2 | sort | uniq -c | sort -rn +echo "" + +# Count DHs +DH_COUNT=$(cat "$TEMP_FILE" | cut -d',' -f2 | grep -c "^DH$") +echo "=== DH Analysis ===" +echo "Total DH players: $DH_COUNT" + +# Flag if DH count is anomalous (>5 for full season, >10% of total) +DH_PERCENT=$((DH_COUNT * 100 / TOTAL_PLAYERS)) +if [ $DH_COUNT -gt 5 ] && [ $DH_PERCENT -gt 10 ]; then + echo "⚠️ WARNING: Unusually high number of DH players ($DH_COUNT = ${DH_PERCENT}%)" + echo " Expected: <5 for full season cards" +elif [ $DH_COUNT -gt 5 ]; then + echo "⚠️ NOTICE: Above-average DH count ($DH_COUNT)" +else + echo "✅ DH 
count is normal ($DH_COUNT)" +fi +echo "" + +# Show DH players if count is suspicious +if [ $DH_COUNT -gt 5 ]; then + echo "=== DH Players (should mostly be full-time DHs) ===" + cat "$TEMP_FILE" | grep ",DH," | cut -d',' -f1 | head -20 + if [ $DH_COUNT -gt 20 ]; then + echo "... and $((DH_COUNT - 20)) more" + fi + echo "" +fi + +# Count outfielders +OF_COUNT=$(cat "$TEMP_FILE" | cut -d',' -f2 | grep -cE "^(LF|CF|RF)$") +echo "=== Outfield Analysis ===" +echo "Total outfielders: $OF_COUNT" +echo " LF: $(cat "$TEMP_FILE" | cut -d',' -f2 | grep -c "^LF$")" +echo " CF: $(cat "$TEMP_FILE" | cut -d',' -f2 | grep -c "^CF$")" +echo " RF: $(cat "$TEMP_FILE" | cut -d',' -f2 | grep -c "^RF$")" + +# Flag if no outfielders (major bug) +if [ $OF_COUNT -eq 0 ]; then + echo "🚨 CRITICAL: No outfielders found! Defensive positions likely failed." +elif [ $OF_COUNT -lt 20 ]; then + echo "⚠️ WARNING: Very few outfielders ($OF_COUNT). Check defensive position calculations." +else + echo "✅ Outfield count looks normal ($OF_COUNT)" +fi +echo "" + +# Check cardpositions table for outfield positions +echo "=== CardPositions Table Check ===" +CARDPOS_OF_COUNT=$(curl -s "$API_URL/v2/cardpositions?cardset_id=$CARDSET_ID" | jq '[.positions[] | select(.position | test("LF|CF|RF"))] | length') +echo "Outfield positions in cardpositions table: $CARDPOS_OF_COUNT" + +if [ "$CARDPOS_OF_COUNT" = "0" ]; then + echo "🚨 CRITICAL: No outfield positions in database! Defensive calculations failed." +elif [ $CARDPOS_OF_COUNT -lt $((OF_COUNT - 5)) ]; then + echo "⚠️ WARNING: Fewer cardpositions than players with OF pos_1" +else + echo "✅ CardPositions table looks good" +fi +echo "" + +# Sample outfielders to verify +echo "=== Sample Outfielders (for manual verification) ===" +cat "$TEMP_FILE" | grep -E ",LF,|,CF,|,RF," | head -5 | column -t -s',' +echo "" + +# Clean up +rm "$TEMP_FILE" + +echo "======================================" +echo "Analysis complete!" +echo "======================================"