major-domo-database/.claude/sqlite-to-postgres/botched-sbaplayer-matching/update_csv_files.py

#!/usr/bin/env python3
"""
Add resolution columns to the CSV files for manual review
"""
import csv
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('UpdateCSV')

def add_resolution_column_to_unmatched():
    """Add resolution column to unmatched players CSV"""
    input_file = '/mnt/NV2/Development/major-domo/database/unmatched_players_for_review.csv'
    output_file = '/mnt/NV2/Development/major-domo/database/unmatched_players_for_review_updated.csv'

    with open(input_file, 'r') as infile:
        reader = csv.DictReader(infile)

        # Read all rows
        rows = list(reader)

        # Add resolution column to each row
        for row in rows:
            row['resolution'] = ''  # Empty for manual filling

            # Add some helpful comments based on the suggested match
            if row['suggested_sbaplayer_id'] and not row['suggested_sbaplayer_id'].startswith('PARTIAL:'):
                row['resolution'] = 'ACCEPT'  # Pre-fill exact matches as ACCEPT
            elif row['suggested_sbaplayer_id'].startswith('PARTIAL:'):
                row['resolution'] = 'REVIEW'  # Mark partial matches for review
            else:
                row['resolution'] = 'CREATE_NEW'  # Default for no suggestions

    # Write updated CSV
    with open(output_file, 'w', newline='') as outfile:
        fieldnames = list(reader.fieldnames) + ['resolution']
        writer = csv.DictWriter(outfile, fieldnames=fieldnames)

        writer.writeheader()
        writer.writerows(rows)

    logger.info(f"Updated unmatched players CSV: {output_file}")
    return output_file

def add_resolution_column_to_high_risk():
    """Add resolution column to high risk matches CSV"""
    input_file = '/mnt/NV2/Development/major-domo/database/high_risk_player_matches.csv'
    output_file = '/mnt/NV2/Development/major-domo/database/high_risk_player_matches_updated.csv'

    with open(input_file, 'r') as infile:
        reader = csv.DictReader(infile)

        # Read all rows
        rows = list(reader)

        # Add resolution column to each row
        for row in rows:
            row['resolution'] = ''  # Empty for manual filling

            # Add helpful pre-fills based on risk type and bbref_id presence
            if row['risk_type'] == 'sbaplayer_conflict':
                # Check if both have bbref_ids - if so, they are definitely different people
                sba1_bbref = row['sba1_bbref'].strip()
                sba2_bbref = row['sba2_bbref'].strip()

                if sba1_bbref and sba2_bbref and sba1_bbref != sba2_bbref:
                    # Different bbref_ids = definitely different people
                    row['resolution'] = 'DIFFERENT_PEOPLE'
                elif not sba1_bbref and not sba2_bbref:
                    # Neither has bbref_id - could be duplicate
                    if row['similarity_score'] == '1.000':
                        row['resolution'] = f"MERGE_{row['sba2_id']}_INTO_{row['sba1_id']}"
                    else:
                        row['resolution'] = 'DIFFERENT_PEOPLE'
                elif sba1_bbref and not sba2_bbref:
                    # One has bbref_id, one doesn't - could be duplicate needing bbref_id
                    row['resolution'] = f"MERGE_{row['sba2_id']}_INTO_{row['sba1_id']}"
                elif sba2_bbref and not sba1_bbref:
                    # One has bbref_id, one doesn't - could be duplicate needing bbref_id
                    row['resolution'] = f"MERGE_{row['sba1_id']}_INTO_{row['sba2_id']}"
                else:
                    # Same bbref_id - definitely duplicate
                    row['resolution'] = f"MERGE_{row['sba2_id']}_INTO_{row['sba1_id']}"

            elif row['risk_type'] == 'player_ambiguous_match':
                row['resolution'] = f"USE_SBA_{row['sba1_id']}"  # Pre-select first option
            elif row['risk_type'] == 'middle_initial_conflict':
                row['resolution'] = 'DIFFERENT_PEOPLE'

    # Write updated CSV
    with open(output_file, 'w', newline='') as outfile:
        fieldnames = list(reader.fieldnames) + ['resolution']
        writer = csv.DictWriter(outfile, fieldnames=fieldnames)

        writer.writeheader()
        writer.writerows(rows)

    logger.info(f"Updated high risk matches CSV: {output_file}")
    return output_file

def create_instructions_file():
    """Create instructions file for manual review"""
    instructions = """
# CSV REVIEW INSTRUCTIONS

## File 1: unmatched_players_for_review_updated.csv

**Resolution Column Values:**
- `ACCEPT` - Use the suggested_sbaplayer_id (pre-filled for exact matches)
- `USE_SBA_123` - Use specific SbaPlayer ID 123 instead of suggestion
- `CREATE_NEW` - Create new SbaPlayer record for this player
- `SKIP` - Skip this player for now (won't be processed)
- `REVIEW` - Needs manual review (pre-filled for partial matches)

**Pre-filled Values:**
- Exact name matches are pre-filled as `ACCEPT`
- Partial matches are marked as `REVIEW`
- No suggestions are marked as `CREATE_NEW`

## File 2: high_risk_player_matches_updated.csv

**Resolution Column Values:**
- `MERGE_123_INTO_456` - Merge SbaPlayer 123 into SbaPlayer 456
- `DIFFERENT_PEOPLE` - These are actually different people, keep separate
- `USE_SBA_123` - For player matches, use this specific SbaPlayer ID
- `CREATE_NEW` - Create new SbaPlayer record
- `SKIP` - Skip this for now

**Pre-filled Logic for SbaPlayer Conflicts:**
- Different bbref_ids = `DIFFERENT_PEOPLE` (bbref_ids are globally unique)
- Same bbref_id = `MERGE` (definitely duplicates)
- One has bbref_id, one doesn't = `MERGE` suggestion (review needed)
- Neither has bbref_id + identical names = `MERGE` suggestion
- Player ambiguous matches pre-select the first suggested SbaPlayer
- Middle initial conflicts are marked as `DIFFERENT_PEOPLE`

## Important Notes:
- **bbref_ids are globally unique** - trust them completely
- If two SbaPlayers have different bbref_ids, they are different people
- If one has bbref_id and one doesn't, they might be the same person

## Next Steps:
1. Review and edit the resolution columns in both files
2. Save the files when done
3. Let Claude know you're ready to process the changes

## Common Patterns:
- bbref_id mismatches (like "HALP") should usually be `CREATE_NEW`
- Different bbref_ids = always different people
- Common names like "Carlos Martinez" need careful review
- Middle initials usually indicate different people
"""

    with open('/mnt/NV2/Development/major-domo/database/CSV_REVIEW_INSTRUCTIONS.txt', 'w') as f:
        f.write(instructions)

    logger.info("Created instructions file: CSV_REVIEW_INSTRUCTIONS.txt")

def main():
    """Update both CSV files with resolution columns"""
    logger.info("Adding resolution columns to CSV files...")

    unmatched_file = add_resolution_column_to_unmatched()
    high_risk_file = add_resolution_column_to_high_risk()
    create_instructions_file()

    logger.info(f"\n=== CSV FILES UPDATED ===")
    logger.info(f"Unmatched players: {unmatched_file}")
    logger.info(f"High risk matches: {high_risk_file}")
    logger.info(f"Instructions: CSV_REVIEW_INSTRUCTIONS.txt")
    logger.info(f"\nPlease review and edit the 'resolution' column in both files.")

if __name__ == "__main__":
    main()