# paper-dynasty-database/tests/test_refractor_evaluator.py

"""Tests for the refractor evaluator service (WP-08).

Unit tests verify tier assignment, advancement, partial progress, idempotency,
full refractor tier, and no-regression behaviour without touching the real
application database; stub Peewee models are bound to an in-memory SQLite
database instead.

The formula engine (WP-09) and Peewee models (WP-05/WP-07) are not imported
from db_engine/formula_engine; instead the tests supply minimal stubs and
inject them via the _stats_model, _state_model, _compute_value_fn, and
_tier_from_value_fn overrides on evaluate_card().

Stub track thresholds (batter):
  T1: 37   T2: 149   T3: 448   T4: 896

Useful reference values:
  value=30    → T0 (below T1=37)
  value=50    → T1 (37 <= 50 < 149)
  value=100   → T1 (stays T1; T2 threshold is 149)
  value=160   → T2 (149 <= 160 < 448)
  value=900   → T4 (>= 896) → fully_evolved
"""

import pytest
from datetime import datetime
from peewee import (
    BooleanField,
    CharField,
    DateTimeField,
    FloatField,
    ForeignKeyField,
    IntegerField,
    Model,
    SqliteDatabase,
)

from app.services.refractor_evaluator import evaluate_card

# ---------------------------------------------------------------------------
# Stub models — mirror WP-01/WP-04/WP-07 schema without importing db_engine
# ---------------------------------------------------------------------------

# Shared in-memory SQLite database; every stub model below binds to it through
# its Meta.database attribute.
_test_db = SqliteDatabase(":memory:")


class TrackStub(Model):
    """Minimal RefractorTrack stub for evaluator tests.

    Each row carries the four tier thresholds for one card type.  The stub
    ``_tier_from_value`` reads these four ``tN_threshold`` attributes when it
    maps a computed value to a tier.
    """

    # Track identifier such as "batter" or "sp"; unique, so one row per type.
    card_type = CharField(unique=True)
    # Minimum value needed to reach tier 1, 2, 3 and 4 respectively.
    t1_threshold = IntegerField()
    t2_threshold = IntegerField()
    t3_threshold = IntegerField()
    t4_threshold = IntegerField()

    class Meta:
        # Bind to the shared in-memory test database.
        database = _test_db
        table_name = "refractor_track"


class CardStateStub(Model):
    """Minimal RefractorCardState stub for evaluator tests.

    One row records the refractor progress of a (player_id, team_id) pair on
    a given track.  The pair is enforced unique by the composite index below.
    """

    # Plain integer identifiers (not foreign keys) for the player and team.
    player_id = IntegerField()
    team_id = IntegerField()
    # Track whose thresholds apply to this card.
    track = ForeignKeyField(TrackStub)
    # Progress fields; a freshly created row starts at tier 0 / value 0.0.
    current_tier = IntegerField(default=0)
    current_value = FloatField(default=0.0)
    fully_evolved = BooleanField(default=False)
    # Nullable so a row can exist before any evaluation has been recorded.
    last_evaluated_at = DateTimeField(null=True)

    class Meta:
        # Bind to the shared in-memory test database.
        database = _test_db
        table_name = "refractor_card_state"
        # Composite index on (player_id, team_id); the trailing True makes it unique.
        indexes = ((("player_id", "team_id"), True),)


class StatsStub(Model):
    """Minimal PlayerSeasonStats stub for evaluator tests.

    One row holds a player's counting stats for a single team and season.
    No uniqueness constraint is declared, so tests may insert several rows
    for the same player/team pair (one per season).
    """

    # Plain integer identifiers (not foreign keys).
    player_id = IntegerField()
    team_id = IntegerField()
    season = IntegerField()
    # Counting stats read by the batter branch of the stub ``_compute_value``.
    pa = IntegerField(default=0)
    hits = IntegerField(default=0)
    doubles = IntegerField(default=0)
    triples = IntegerField(default=0)
    hr = IntegerField(default=0)
    # Counting stats read by the non-batter branch of ``_compute_value``.
    outs = IntegerField(default=0)
    strikeouts = IntegerField(default=0)

    class Meta:
        # Bind to the shared in-memory test database.
        database = _test_db
        table_name = "player_season_stats"


# ---------------------------------------------------------------------------
# Formula stubs — avoid importing app.services.formula_engine before WP-09
# ---------------------------------------------------------------------------


def _compute_value(card_type: str, stats) -> float:
    """Stub compute_value_for_track: returns pa for batter, outs/3+k for pitchers."""
    if card_type == "batter":
        singles = stats.hits - stats.doubles - stats.triples - stats.hr
        tb = singles + 2 * stats.doubles + 3 * stats.triples + 4 * stats.hr
        return float(stats.pa + tb * 2)
    return stats.outs / 3 + stats.strikeouts


def _tier_from_value(value: float, track) -> int:
    """Stub tier_from_value using TrackStub fields t1_threshold/t2_threshold/etc."""
    if isinstance(track, dict):
        t1, t2, t3, t4 = (
            track["t1_threshold"],
            track["t2_threshold"],
            track["t3_threshold"],
            track["t4_threshold"],
        )
    else:
        t1, t2, t3, t4 = (
            track.t1_threshold,
            track.t2_threshold,
            track.t3_threshold,
            track.t4_threshold,
        )
    if value >= t4:
        return 4
    if value >= t3:
        return 3
    if value >= t2:
        return 2
    if value >= t1:
        return 1
    return 0


# ---------------------------------------------------------------------------
# Fixtures
# ---------------------------------------------------------------------------


@pytest.fixture(autouse=True)
def _db():
    """Give every test a fresh set of stub tables.

    The tables are created before the test body runs and dropped once it has
    finished, so rows never leak from one test into the next.
    """
    stub_models = [TrackStub, CardStateStub, StatsStub]
    _test_db.connect(reuse_if_open=True)
    _test_db.create_tables(stub_models)
    yield
    # Drop in the reverse of creation order (stats, card state, track).
    _test_db.drop_tables(stub_models[::-1])


@pytest.fixture()
def batter_track():
    """Insert and return the "batter" track row (T1=37, T2=149, T3=448, T4=896)."""
    thresholds = {
        "t1_threshold": 37,
        "t2_threshold": 149,
        "t3_threshold": 448,
        "t4_threshold": 896,
    }
    return TrackStub.create(card_type="batter", **thresholds)


@pytest.fixture()
def sp_track():
    """Insert and return the "sp" track row (T1=10, T2=40, T3=120, T4=240)."""
    thresholds = {
        "t1_threshold": 10,
        "t2_threshold": 40,
        "t3_threshold": 120,
        "t4_threshold": 240,
    }
    return TrackStub.create(card_type="sp", **thresholds)


def _make_state(
    player_id,
    team_id,
    track,
    current_tier=0,
    current_value=0.0,
    fully_evolved=False,
):
    """Insert and return a CardStateStub row that has not been evaluated yet.

    Args:
        player_id: Integer player identifier.
        team_id: Integer team identifier.
        track: TrackStub row the state is attached to.
        current_tier: Tier to seed the row with (default 0).
        current_value: Value to seed the row with (default 0.0).
        fully_evolved: Stored fully-evolved flag to seed the row with.
            Defaults to False, matching the previous hard-coded behaviour;
            pass True to seed an already fully evolved (or deliberately
            inconsistent) state without bypassing this helper.

    Returns:
        The newly created CardStateStub instance, with ``last_evaluated_at``
        left as None.
    """
    return CardStateStub.create(
        player_id=player_id,
        team_id=team_id,
        track=track,
        current_tier=current_tier,
        current_value=current_value,
        fully_evolved=fully_evolved,
        last_evaluated_at=None,
    )


def _make_stats(player_id, team_id, season, **kwargs):
    """Insert and return one StatsStub row for a player/team/season.

    Extra keyword arguments (e.g. ``pa=50``) are forwarded as column values;
    columns that are not supplied keep their model default of 0.
    """
    row_fields = {
        "player_id": player_id,
        "team_id": team_id,
        "season": season,
        **kwargs,
    }
    return StatsStub.create(**row_fields)


def _eval(player_id, team_id):
    """Call evaluate_card for the pair with all four test stubs injected.

    The stub models and stub formula functions are passed through the
    underscore-prefixed override keywords of evaluate_card.
    """
    stub_overrides = {
        "_stats_model": StatsStub,
        "_state_model": CardStateStub,
        "_compute_value_fn": _compute_value,
        "_tier_from_value_fn": _tier_from_value,
    }
    return evaluate_card(player_id, team_id, **stub_overrides)


# ---------------------------------------------------------------------------
# Unit tests
# ---------------------------------------------------------------------------


class TestTierAssignment:
    """The computed value is mapped to a tier using the track thresholds.

    All tests here supply only ``pa``; with no hits the stub batter formula
    yields a value equal to ``pa``.
    """

    def test_value_below_t1_stays_t0(self, batter_track):
        """A value of 30 is short of T1 (37), so the card remains at tier 0."""
        _make_state(1, 1, batter_track)
        _make_stats(1, 1, 1, pa=30)  # value == 30 < 37
        assert _eval(1, 1)["current_tier"] == 0

    def test_value_at_t1_threshold_assigns_tier_1(self, batter_track):
        """A value of 50 lies in the T1 band (37 <= 50 < 149) and yields tier 1.

        Note that, despite the name, the value used is inside the band rather
        than exactly on the 37 boundary.
        """
        _make_state(1, 1, batter_track)
        _make_stats(1, 1, 1, pa=50)  # value == 50
        assert _eval(1, 1)["current_tier"] == 1

    def test_tier_advancement_to_t2(self, batter_track):
        """A value of 160 lies in the T2 band (149 <= 160 < 448) and yields tier 2."""
        _make_state(1, 1, batter_track)
        _make_stats(1, 1, 1, pa=160)  # value == 160
        assert _eval(1, 1)["current_tier"] == 2

    def test_partial_progress_stays_t1(self, batter_track):
        """A value of 100 is past T1 but short of T2 (149), so the tier is 1."""
        _make_state(1, 1, batter_track)
        _make_stats(1, 1, 1, pa=100)  # value == 100, below T2 == 149
        outcome = _eval(1, 1)
        assert outcome["current_tier"] == 1
        assert outcome["fully_evolved"] is False

    def test_fully_evolved_at_t4(self, batter_track):
        """A value at or above T4 (896) yields tier 4 with fully_evolved set."""
        _make_state(1, 1, batter_track)
        _make_stats(1, 1, 1, pa=900)  # value == 900 >= 896
        outcome = _eval(1, 1)
        assert outcome["current_tier"] == 4
        assert outcome["fully_evolved"] is True


class TestNoRegression:
    """A stored tier may go up after evaluation but must never go down."""

    def test_tier_never_decreases(self, batter_track):
        """A card stored at tier 2 keeps tier 2 when its stats only justify tier 1."""
        # Start from an already-achieved tier 2.
        _make_state(1, 1, batter_track, current_tier=2, current_value=160.0)
        # value == 50 maps to tier 1, which is lower than the stored tier.
        _make_stats(1, 1, 1, pa=50)
        assert _eval(1, 1)["current_tier"] == 2

    def test_tier_advances_when_value_improves(self, batter_track):
        """A card stored at tier 1 moves to tier 3 when its stats justify tier 3."""
        _make_state(1, 1, batter_track, current_tier=1, current_value=50.0)
        # value == 500 meets T3 (448) but not T4 (896).
        _make_stats(1, 1, 1, pa=500)
        assert _eval(1, 1)["current_tier"] == 3


class TestIdempotency:
    """Re-running evaluate_card on unchanged stats gives an unchanged result."""

    def test_idempotent_same_result(self, batter_track):
        """Back-to-back evaluations agree on tier, value and fully_evolved."""
        _make_state(1, 1, batter_track)
        _make_stats(1, 1, 1, pa=160)
        first = _eval(1, 1)
        second = _eval(1, 1)
        # last_evaluated_at is intentionally excluded from the comparison.
        for key in ("current_tier", "current_value", "fully_evolved"):
            assert first[key] == second[key]

    def test_idempotent_at_fully_evolved(self, batter_track):
        """A second evaluation at tier 4 still reports tier 4 and fully_evolved."""
        _make_state(1, 1, batter_track)
        _make_stats(1, 1, 1, pa=900)
        _eval(1, 1)  # first pass reaches tier 4
        second = _eval(1, 1)
        assert second["current_tier"] == 4
        assert second["fully_evolved"] is True


class TestCareerTotals:
    """Stats are totalled over every season for one player/team pair."""

    def test_multi_season_stats_summed(self, batter_track):
        """Two seasons of stats are combined before the value is computed."""
        _make_state(1, 1, batter_track)
        # 80 + 90 == 170 plate appearances, so value == 170, which is tier 2.
        for season, plate_appearances in ((1, 80), (2, 90)):
            _make_stats(1, 1, season, pa=plate_appearances)
        outcome = _eval(1, 1)
        assert outcome["current_tier"] == 2
        assert outcome["current_value"] == 170.0

    def test_zero_stats_stays_t0(self, batter_track):
        """With no stats rows at all the value is 0 and the tier is 0."""
        _make_state(1, 1, batter_track)
        outcome = _eval(1, 1)
        assert outcome["current_tier"] == 0
        assert outcome["current_value"] == 0.0

    def test_other_team_stats_not_included(self, batter_track):
        """Rows belonging to another team are ignored for this team's card."""
        _make_state(1, 1, batter_track)
        _make_stats(1, 1, 1, pa=50)
        # Same player on team 2: must not contribute to team 1's value.
        _make_stats(1, 2, 1, pa=200)
        outcome = _eval(1, 1)
        # Only team 1's pa == 50 counts, so value == 50 and tier == 1.
        assert outcome["current_tier"] == 1
        assert outcome["current_value"] == 50.0


class TestFullyEvolvedPersistence:
    """T2-1: fully_evolved=True is preserved even when stats drop or are absent."""

    @staticmethod
    def _seed_fully_evolved_state(track):
        """Insert a tier-4 state whose stored fully_evolved flag is really True.

        The row is created directly through CardStateStub because the tests in
        this class need the stored flag to be True before evaluation; seeding
        it as False would only exercise re-derivation of the flag, not its
        persistence.
        """
        return CardStateStub.create(
            player_id=1,
            team_id=1,
            track=track,
            current_tier=4,
            current_value=900.0,
            fully_evolved=True,
            last_evaluated_at=None,
        )

    def test_fully_evolved_persists_when_stats_zeroed(self, batter_track):
        """Card at T4/fully_evolved=True stays fully_evolved after stats are removed.

        What: Set up a RefractorCardState at tier=4 with fully_evolved=True.
        Then call evaluate_card with no season stats rows (zero career totals).
        The stub formula gives value=0 -> computed tier 0, but current_tier
        must stay at 4 (no regression) and fully_evolved must remain True,
        both in the returned dict and in the stored row.

        Why: fully_evolved is a permanent achievement flag — it must not be
        revoked if a team's stats are rolled back, corrected, or simply not
        yet imported.  This test confirms that fully_evolved gets the same
        protection as the tier itself.
        """
        # Seed state at T4 with the stored flag genuinely set to True.
        state = self._seed_fully_evolved_state(batter_track)
        # No stats rows — career totals will be all zeros
        # (no _make_stats call)

        result = _eval(1, 1)

        # The no-regression rule keeps tier at 4
        assert result["current_tier"] == 4, (
            f"Expected tier=4 (no regression), got {result['current_tier']}"
        )
        # fully_evolved must still be True since tier >= 4
        assert result["fully_evolved"] is True, (
            "fully_evolved was reset to False after re-evaluation with zero stats"
        )

        # The stored row must agree with the returned dict.
        persisted = CardStateStub.get_by_id(state.id)
        assert persisted.current_tier == 4, (
            f"Stored tier regressed to {persisted.current_tier} after re-evaluation"
        )
        assert persisted.fully_evolved is True, (
            "Stored fully_evolved flag was cleared after re-evaluation with zero stats"
        )

    def test_fully_evolved_persists_with_partial_stats(self, batter_track):
        """Card at T4 stays fully_evolved even with stats below T1.

        What: Same setup as above but with a season stats row giving value=30
        (below T1=37).  The computed tier would be 0, but current_tier must
        not regress from 4.

        Why: Validates that no-regression applies regardless of whether stats
        are zero or merely insufficient for the achieved tier.
        """
        self._seed_fully_evolved_state(batter_track)
        # pa=30 -> value=30, which is below T1=37 -> computed tier=0
        _make_stats(1, 1, 1, pa=30)

        result = _eval(1, 1)

        assert result["current_tier"] == 4
        assert result["fully_evolved"] is True


class TestMissingState:
    """Evaluating a pair that has no card state row is an error."""

    def test_missing_state_raises(self, batter_track):
        """A ValueError mentioning the missing state is raised for an unknown pair."""
        # Deliberately skip _make_state so the lookup for (99, 99) finds nothing.
        expected_error = pytest.raises(ValueError, match="No refractor_card_state")
        with expected_error:
            _eval(99, 99)


class TestReturnShape:
    """Shape of the dict returned by evaluate_card."""

    def test_return_keys(self, batter_track):
        """The result exposes exactly the six documented keys, no more, no fewer."""
        _make_state(1, 1, batter_track)
        expected_keys = {
            "player_id",
            "team_id",
            "current_tier",
            "current_value",
            "fully_evolved",
            "last_evaluated_at",
        }
        outcome = _eval(1, 1)
        assert set(outcome.keys()) == expected_keys

    def test_last_evaluated_at_is_iso_string(self, batter_track):
        """last_evaluated_at comes back as a non-empty, ISO-parseable string."""
        _make_state(1, 1, batter_track)
        stamp = _eval(1, 1)["last_evaluated_at"]
        assert isinstance(stamp, str)
        assert len(stamp) > 0
        # fromisoformat raises ValueError if the string is not a valid timestamp.
        datetime.fromisoformat(stamp)


class TestFullyEvolvedFlagCorrection:
    """T3-7: a stored fully_evolved flag that disagrees with the tier is re-derived.

    Both tests seed an inconsistent row (fully_evolved=True while
    current_tier=3) directly through CardStateStub, as might be left behind by
    a faulty migration or external script, and then check what evaluate_card
    reports afterwards.  The expectation is that the flag follows the final
    tier (after no-regression is applied) instead of the stored value.
    """

    def test_fully_evolved_flag_corrected_when_tier_below_4(self, batter_track):
        """A True flag stored alongside tier 3 is reset to False by evaluation.

        Setup: stored tier 3, stored flag True, stats giving value 500, which
        meets T3 (448) but not T4 (896).

        Expected after evaluate_card:
          - computed tier is 3, so the tier stays at 3
          - fully_evolved is reported as False
          - the stored row also has fully_evolved False

        Rationale: repairing the flag during normal evaluation means a
        corrupted row needs no dedicated migration.
        """
        corrupted_row = {
            "player_id": 1,
            "team_id": 1,
            "track": batter_track,
            "current_tier": 3,
            "current_value": 500.0,
            "fully_evolved": True,  # deliberately inconsistent with tier 3
            "last_evaluated_at": None,
        }
        state = CardStateStub.create(**corrupted_row)
        # No hits, so value == pa == 500: tier 3, not tier 4.
        _make_stats(1, 1, 1, pa=500)

        result = _eval(1, 1)

        assert result["current_tier"] == 3, (
            f"Expected tier=3 after evaluation with value=500, got {result['current_tier']}"
        )
        assert result["fully_evolved"] is False, (
            "fully_evolved should have been corrected to False for tier=3, "
            f"got {result['fully_evolved']}"
        )

        # The correction must reach the database, not only the returned dict.
        persisted = CardStateStub.get_by_id(state.id)
        assert persisted.fully_evolved is False, (
            "fully_evolved was not persisted as False after correction"
        )

    def test_fully_evolved_flag_preserved_when_tier_reaches_4(self, batter_track):
        """A True flag stored alongside tier 3 stays True once stats reach T4.

        Setup: the same inconsistent row as the previous test, but with stats
        giving value 900, which meets T4 (896).

        Expected after evaluate_card:
          - computed tier is 4, so the tier advances from 3 to 4
          - fully_evolved is reported as True

        Rationale: shows the flag ends up True because tier 4 was genuinely
        reached, whatever the stored flag was beforehand.
        """
        inconsistent_row = {
            "player_id": 1,
            "team_id": 1,
            "track": batter_track,
            "current_tier": 3,
            "current_value": 500.0,
            "fully_evolved": True,  # stored flag; expected to be re-derived
            "last_evaluated_at": None,
        }
        CardStateStub.create(**inconsistent_row)
        # No hits, so value == pa == 900, which meets T4 == 896.
        _make_stats(1, 1, 1, pa=900)

        result = _eval(1, 1)

        assert result["current_tier"] == 4, (
            f"Expected tier=4 for value=900, got {result['current_tier']}"
        )
        assert result["fully_evolved"] is True, (
            f"Expected fully_evolved=True for tier=4, got {result['fully_evolved']}"
        )


class TestMultiTeamStatIsolation:
    """T3-8: a card's value is built from one team's stats only.

    The evaluator is expected to select stats rows by both player_id and
    team_id.  When the same player has rows for two teams, each team's card
    state must reflect that team's rows alone, never the combined total.
    """

    def test_multi_team_same_season_stats_isolated(self, batter_track):
        """Two teams, one season: each card sees only its own team's stats.

        Setup: player 1 has pa=80 for team 1 and pa=120 for team 2, both in
        season 11, and a card state exists for each team.

        Expected:
          - team 1: value 80  -> tier 1 (37 <= 80 < 149)
          - team 2: value 120 -> tier 1 (37 <= 120 < 149)

        Rationale: if the rows were combined the value would be 200, which
        crosses T2 (149), so both the value and the tier assertions would
        fail.  A player who appears for two teams should progress separately
        on each.
        """
        # Stats first, then states, one of each per team.
        for team_id, plate_appearances in ((1, 80), (2, 120)):
            _make_stats(player_id=1, team_id=team_id, season=11, pa=plate_appearances)
        for team_id in (1, 2):
            _make_state(player_id=1, team_id=team_id, track=batter_track)

        result_team1 = _eval(player_id=1, team_id=1)
        result_team2 = _eval(player_id=1, team_id=2)

        # Team 1 must be built from pa == 80 only.
        assert result_team1["current_value"] == 80.0, (
            f"Team 1 value should be 80.0 (its own stats only), "
            f"got {result_team1['current_value']}"
        )
        assert result_team1["current_tier"] == 1, (
            f"Team 1 tier should be T1 for value=80, got {result_team1['current_tier']}"
        )

        # Team 2 must be built from pa == 120 only.
        assert result_team2["current_value"] == 120.0, (
            f"Team 2 value should be 120.0 (its own stats only), "
            f"got {result_team2['current_value']}"
        )
        assert result_team2["current_tier"] == 1, (
            f"Team 2 tier should be T1 for value=120, got {result_team2['current_tier']}"
        )

        # Explicit guard against the combined-total outcome (200 -> tier 2).
        assert not (
            result_team1["current_tier"] == 2 or result_team2["current_tier"] == 2
        ), (
            "At least one team was incorrectly assigned T2 — stats may have been combined"
        )

    def test_multi_team_different_seasons_isolated(self, batter_track):
        """Two teams, two seasons: totals are summed per team, not per player.

        Setup for player 1:
          - team 1: season 10 pa=90,  season 11 pa=70 -> total 160
          - team 2: season 10 pa=100, season 11 pa=80 -> total 180

        Expected:
          - team 1: value 160 -> tier 2 (149 <= 160 < 448)
          - team 2: value 180 -> tier 2 (149 <= 180 < 448)

        Rationale: multi-season aggregation and per-team isolation have to
        work together.  Summing every row for the player regardless of team
        would give 340 for both cards.  That is still inside the T2 band, so
        the tier assertions alone would not notice; the exact-value
        assertions (160.0 and 180.0) are what catch cross-team contamination.
        """
        season_rows = (
            (1, 10, 90),
            (1, 11, 70),
            (2, 10, 100),
            (2, 11, 80),
        )
        for team_id, season, plate_appearances in season_rows:
            _make_stats(player_id=1, team_id=team_id, season=season, pa=plate_appearances)
        for team_id in (1, 2):
            _make_state(player_id=1, team_id=team_id, track=batter_track)

        result_team1 = _eval(player_id=1, team_id=1)
        result_team2 = _eval(player_id=1, team_id=2)

        assert result_team1["current_value"] == 160.0, (
            f"Team 1 multi-season value should be 160.0, got {result_team1['current_value']}"
        )
        assert result_team1["current_tier"] == 2, (
            f"Team 1 tier should be T2 for value=160, got {result_team1['current_tier']}"
        )

        assert result_team2["current_value"] == 180.0, (
            f"Team 2 multi-season value should be 180.0, got {result_team2['current_value']}"
        )
        assert result_team2["current_tier"] == 2, (
            f"Team 2 tier should be T2 for value=180, got {result_team2['current_tier']}"
        )