mantimon-tcg/backend/app/services/profanity_service.py
Cal Corum cd3efcb528 Implement ProfilePage and profanity filter for display names (F1-006)
ProfilePage implementation:
- Full profile page with avatar, editable display name, session count
- LinkedAccountCard and DisplayNameEditor components
- useProfile composable wrapping user store operations
- Support for linking/unlinking OAuth providers
- Logout and logout-all-devices functionality

Profanity service with bypass detection:
- Uses better-profanity library for base detection
- Enhanced to catch common bypass attempts:
  - Number suffixes/prefixes (shit123, 69fuck)
  - Leet-speak substitutions (sh1t, f@ck, $hit)
  - Separator characters (s.h.i.t, f-u-c-k)
- Integrated into PATCH /api/users/me endpoint (see the sketch after the commit details)
- 17 unit tests covering all normalization strategies

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-30 16:06:42 -06:00
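
The PATCH /api/users/me wiring lives outside this file. Below is a minimal sketch of how that handler might call validate_display_name, assuming FastAPI with Pydantic; the route path comes from the commit message, while the schema, auth dependency, and persistence details are hypothetical.

from fastapi import APIRouter, Depends, HTTPException
from pydantic import BaseModel
from types import SimpleNamespace

from app.services.profanity_service import validate_display_name

router = APIRouter()


class ProfileUpdate(BaseModel):
    display_name: str | None = None


async def get_current_user() -> SimpleNamespace:
    # Hypothetical stand-in for the project's real auth dependency.
    return SimpleNamespace(display_name=None)


@router.patch("/api/users/me")
async def update_profile(payload: ProfileUpdate, user=Depends(get_current_user)):
    if payload.display_name is not None:
        is_valid, error = validate_display_name(payload.display_name)
        if not is_valid:
            # e.g. "Display name contains inappropriate language"
            raise HTTPException(status_code=400, detail=error)
        user.display_name = payload.display_name
        # ... persist the change via the user store/repository (omitted)
    return {"display_name": user.display_name}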


"""Profanity filtering service for user-generated content.
This module provides validation for display names and other user-provided
text to filter inappropriate language.
Uses the better-profanity library for detection with customizable word lists.
Includes preprocessing to catch common bypass attempts like:
- Numbers attached to words (shit123)
- Leet-speak substitutions (sh1t, f@ck)
- Special characters embedded in words (s.h.i.t)
Example:
from app.services.profanity_service import validate_display_name
# In an API endpoint
is_valid, error = validate_display_name("PlayerName")
if not is_valid:
raise HTTPException(400, error)
"""
import re

from better_profanity import profanity

# Initialize profanity filter with default word list
# Can be customized with profanity.add_censor_words([...])
profanity.load_censor_words()

# Leet-speak character mappings for normalization
LEET_SUBSTITUTIONS: dict[str, str] = {
    "0": "o",
    "1": "i",
    "3": "e",
    "4": "a",
    "5": "s",
    "7": "t",
    "8": "b",
    "@": "a",
    "$": "s",
    "!": "i",
    "+": "t",
}


class ProfanityValidationError(Exception):
    """Error raised when content contains profanity."""

    pass


def _separate_letters_numbers(text: str) -> str:
    """Separate letter sequences from number sequences with spaces.

    Catches bypass attempts like "shit123" -> "shit 123".

    Args:
        text: The text to process.

    Returns:
        Text with spaces between letter and number sequences.
    """
    result = re.sub(r"([a-zA-Z])(\d)", r"\1 \2", text)
    result = re.sub(r"(\d)([a-zA-Z])", r"\1 \2", result)
    return result


def _apply_leet_substitutions(text: str) -> str:
    """Convert leet-speak characters to their letter equivalents.

    Catches bypass attempts like "sh1t", "f@ck", "$hit".

    Args:
        text: The text to process.

    Returns:
        Text with leet-speak characters replaced.
    """
    result = text.lower()
    for leet, letter in LEET_SUBSTITUTIONS.items():
        result = result.replace(leet, letter)
    return result


def _remove_separators(text: str) -> str:
    """Remove separator characters used to break up words.

    Catches bypass attempts like "s.h.i.t", "f-u-c-k".

    Args:
        text: The text to process.

    Returns:
        Text with separator characters removed.
    """
    return re.sub(r"[.\-_]", "", text)


def contains_profanity(text: str) -> bool:
    """Check if text contains profanity.

    Applies multiple normalization strategies to catch bypass attempts:

    1. Direct check on original text
    2. Separate letters from numbers (shit123 -> shit 123)
    3. Leet-speak substitution (sh1t -> shit)
    4. Separator removal (s.h.i.t -> shit)

    Args:
        text: The text to check.

    Returns:
        True if profanity is detected, False otherwise.

    Example:
        if contains_profanity(username):
            reject_username()
    """
    # Check original text first
    if profanity.contains_profanity(text):
        return True

    # Check with letters separated from numbers (shit123 -> shit 123)
    separated = _separate_letters_numbers(text)
    if profanity.contains_profanity(separated):
        return True

    # Check with leet-speak substitutions
    leet_normalized = _apply_leet_substitutions(text)
    if profanity.contains_profanity(leet_normalized):
        return True

    # Check with separators removed (s.h.i.t -> shit)
    no_separators = _remove_separators(text)
    if profanity.contains_profanity(no_separators):
        return True

    # Check combined: leet + separators removed
    combined = _apply_leet_substitutions(_remove_separators(text))
    return bool(profanity.contains_profanity(combined))
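
# Note: the combined pass above also resolves mixed bypass attempts, e.g.
# "$h.1.t" -> _remove_separators -> "$h1t" -> _apply_leet_substitutions -> "shit".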


def validate_display_name(name: str) -> tuple[bool, str | None]:
    """Validate a display name for profanity.

    Uses enhanced profanity detection that catches bypass attempts
    like leet-speak (sh1t) and number suffixes (shit123).

    Args:
        name: The display name to validate.

    Returns:
        Tuple of (is_valid, error_message).
        If valid, returns (True, None).
        If invalid, returns (False, "error message").

    Example:
        is_valid, error = validate_display_name("BadWord123")
        if not is_valid:
            raise HTTPException(400, error)
    """
    if contains_profanity(name):
        return False, "Display name contains inappropriate language"
    return True, None


def validate_text(text: str, field_name: str = "text") -> tuple[bool, str | None]:
    """Validate arbitrary text for profanity.

    Generic validation function for any user-provided text field.
    Uses enhanced profanity detection that catches bypass attempts.

    Args:
        text: The text to validate.
        field_name: Name of the field for error messages.

    Returns:
        Tuple of (is_valid, error_message).

    Example:
        is_valid, error = validate_text(bio, "bio")
    """
    if contains_profanity(text):
        return False, f"{field_name.title()} contains inappropriate language"
    return True, None


def censor_text(text: str, censor_char: str = "*") -> str:
    """Censor profanity in text by replacing with censor characters.

    Useful for displaying user content that may contain profanity
    rather than rejecting it entirely.

    Args:
        text: The text to censor.
        censor_char: Character to use for censoring (default: *).

    Returns:
        Text with profanity replaced by censor characters.

    Example:
        safe_text = censor_text("some bad words")
        # Returns: "some *** words"
    """
    return profanity.censor(text, censor_char)


def add_custom_words(words: list[str]) -> None:
    """Add custom words to the profanity filter.

    Use this to add game-specific or community-specific terms
    that should be blocked.

    Args:
        words: List of words to add to the filter.

    Example:
        add_custom_words(["customterm", "anotherterm"])
    """
    profanity.add_censor_words(words)
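
The "17 unit tests covering all normalization strategies" mentioned in the commit message live in a separate test module not shown here. As a rough sketch, the documented bypass cases could be exercised with pytest like this (test names, file layout, and case selection are illustrative assumptions):

import pytest

from app.services.profanity_service import contains_profanity, validate_display_name


@pytest.mark.parametrize(
    "name",
    [
        "shit123",   # number suffix
        "69fuck",    # number prefix
        "sh1t",      # leet digit substitution
        "$hit",      # leet symbol substitution
        "s.h.i.t",   # dot separators
        "f-u-c-k",   # dash separators
    ],
)
def test_bypass_attempts_are_rejected(name):
    assert contains_profanity(name)
    is_valid, error = validate_display_name(name)
    assert not is_valid
    assert error == "Display name contains inappropriate language"


def test_clean_display_name_is_accepted():
    assert validate_display_name("PlayerName") == (True, None)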