major-domo-database/.claude/sqlite-to-postgres/botched-sbaplayer-matching/update_csv_files.py
Cal Corum 7130a1fd43 Postgres Migration
Migration documentation and scripts
2025-08-25 07:18:31 -05:00

174 lines
7.3 KiB
Python

#!/usr/bin/env python3
"""
Add resolution columns to the CSV files for manual review
"""
import csv
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('UpdateCSV')
def add_resolution_column_to_unmatched():
"""Add resolution column to unmatched players CSV"""
input_file = '/mnt/NV2/Development/major-domo/database/unmatched_players_for_review.csv'
output_file = '/mnt/NV2/Development/major-domo/database/unmatched_players_for_review_updated.csv'
with open(input_file, 'r') as infile:
reader = csv.DictReader(infile)
# Read all rows
rows = list(reader)
# Add resolution column to each row
for row in rows:
row['resolution'] = '' # Empty for manual filling
# Add some helpful comments based on the suggested match
if row['suggested_sbaplayer_id'] and not row['suggested_sbaplayer_id'].startswith('PARTIAL:'):
row['resolution'] = 'ACCEPT' # Pre-fill exact matches as ACCEPT
elif row['suggested_sbaplayer_id'].startswith('PARTIAL:'):
row['resolution'] = 'REVIEW' # Mark partial matches for review
else:
row['resolution'] = 'CREATE_NEW' # Default for no suggestions
# Write updated CSV
with open(output_file, 'w', newline='') as outfile:
fieldnames = list(reader.fieldnames) + ['resolution']
writer = csv.DictWriter(outfile, fieldnames=fieldnames)
writer.writeheader()
writer.writerows(rows)
logger.info(f"Updated unmatched players CSV: {output_file}")
return output_file
def add_resolution_column_to_high_risk():
"""Add resolution column to high risk matches CSV"""
input_file = '/mnt/NV2/Development/major-domo/database/high_risk_player_matches.csv'
output_file = '/mnt/NV2/Development/major-domo/database/high_risk_player_matches_updated.csv'
with open(input_file, 'r') as infile:
reader = csv.DictReader(infile)
# Read all rows
rows = list(reader)
# Add resolution column to each row
for row in rows:
row['resolution'] = '' # Empty for manual filling
# Add helpful pre-fills based on risk type and bbref_id presence
if row['risk_type'] == 'sbaplayer_conflict':
# Check if both have bbref_ids - if so, they are definitely different people
sba1_bbref = row['sba1_bbref'].strip()
sba2_bbref = row['sba2_bbref'].strip()
if sba1_bbref and sba2_bbref and sba1_bbref != sba2_bbref:
# Different bbref_ids = definitely different people
row['resolution'] = 'DIFFERENT_PEOPLE'
elif not sba1_bbref and not sba2_bbref:
# Neither has bbref_id - could be duplicate
if row['similarity_score'] == '1.000':
row['resolution'] = f"MERGE_{row['sba2_id']}_INTO_{row['sba1_id']}"
else:
row['resolution'] = 'DIFFERENT_PEOPLE'
elif sba1_bbref and not sba2_bbref:
# One has bbref_id, one doesn't - could be duplicate needing bbref_id
row['resolution'] = f"MERGE_{row['sba2_id']}_INTO_{row['sba1_id']}"
elif sba2_bbref and not sba1_bbref:
# One has bbref_id, one doesn't - could be duplicate needing bbref_id
row['resolution'] = f"MERGE_{row['sba1_id']}_INTO_{row['sba2_id']}"
else:
# Same bbref_id - definitely duplicate
row['resolution'] = f"MERGE_{row['sba2_id']}_INTO_{row['sba1_id']}"
elif row['risk_type'] == 'player_ambiguous_match':
row['resolution'] = f"USE_SBA_{row['sba1_id']}" # Pre-select first option
elif row['risk_type'] == 'middle_initial_conflict':
row['resolution'] = 'DIFFERENT_PEOPLE'
# Write updated CSV
with open(output_file, 'w', newline='') as outfile:
fieldnames = list(reader.fieldnames) + ['resolution']
writer = csv.DictWriter(outfile, fieldnames=fieldnames)
writer.writeheader()
writer.writerows(rows)
logger.info(f"Updated high risk matches CSV: {output_file}")
return output_file
def create_instructions_file():
"""Create instructions file for manual review"""
instructions = """
# CSV REVIEW INSTRUCTIONS
## File 1: unmatched_players_for_review_updated.csv
**Resolution Column Values:**
- `ACCEPT` - Use the suggested_sbaplayer_id (pre-filled for exact matches)
- `USE_SBA_123` - Use specific SbaPlayer ID 123 instead of suggestion
- `CREATE_NEW` - Create new SbaPlayer record for this player
- `SKIP` - Skip this player for now (won't be processed)
- `REVIEW` - Needs manual review (pre-filled for partial matches)
**Pre-filled Values:**
- Exact name matches are pre-filled as `ACCEPT`
- Partial matches are marked as `REVIEW`
- No suggestions are marked as `CREATE_NEW`
## File 2: high_risk_player_matches_updated.csv
**Resolution Column Values:**
- `MERGE_123_INTO_456` - Merge SbaPlayer 123 into SbaPlayer 456
- `DIFFERENT_PEOPLE` - These are actually different people, keep separate
- `USE_SBA_123` - For player matches, use this specific SbaPlayer ID
- `CREATE_NEW` - Create new SbaPlayer record
- `SKIP` - Skip this for now
**Pre-filled Logic for SbaPlayer Conflicts:**
- Different bbref_ids = `DIFFERENT_PEOPLE` (bbref_ids are globally unique)
- Same bbref_id = `MERGE` (definitely duplicates)
- One has bbref_id, one doesn't = `MERGE` suggestion (review needed)
- Neither has bbref_id + identical names = `MERGE` suggestion
- Player ambiguous matches pre-select the first suggested SbaPlayer
- Middle initial conflicts are marked as `DIFFERENT_PEOPLE`
## Important Notes:
- **bbref_ids are globally unique** - trust them completely
- If two SbaPlayers have different bbref_ids, they are different people
- If one has bbref_id and one doesn't, they might be the same person
## Next Steps:
1. Review and edit the resolution columns in both files
2. Save the files when done
3. Let Claude know you're ready to process the changes
## Common Patterns:
- bbref_id mismatches (like "HALP") should usually be `CREATE_NEW`
- Different bbref_ids = always different people
- Common names like "Carlos Martinez" need careful review
- Middle initials usually indicate different people
"""
with open('/mnt/NV2/Development/major-domo/database/CSV_REVIEW_INSTRUCTIONS.txt', 'w') as f:
f.write(instructions)
logger.info("Created instructions file: CSV_REVIEW_INSTRUCTIONS.txt")
def main():
"""Update both CSV files with resolution columns"""
logger.info("Adding resolution columns to CSV files...")
unmatched_file = add_resolution_column_to_unmatched()
high_risk_file = add_resolution_column_to_high_risk()
create_instructions_file()
logger.info(f"\n=== CSV FILES UPDATED ===")
logger.info(f"Unmatched players: {unmatched_file}")
logger.info(f"High risk matches: {high_risk_file}")
logger.info(f"Instructions: CSV_REVIEW_INSTRUCTIONS.txt")
logger.info(f"\nPlease review and edit the 'resolution' column in both files.")
if __name__ == "__main__":
main()