# claude-home/media-tools/scripts/pokeflix_scraper.py
# Cal Corum ceb4dd36a0 Add docker scripts, media-tools, VM management, and n8n workflow docs
# Add CONTEXT.md for docker and VM management script directories.
# Add media-tools documentation with Playwright scraping patterns.
# Add Tdarr GPU monitor n8n workflow definition.
#
# Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
# 2026-02-07 22:26:10 -06:00
#
# (file-view metadata: 778 lines, 27 KiB, Python, executable)

#!/usr/bin/env python3
"""
Pokeflix Scraper - Download Pokemon episodes from pokeflix.tv
Pokeflix hosts videos directly on their CDN (v1.pkflx.com). This scraper:
1. Extracts the episode list from a season browse page
2. Visits each episode page to detect its CDN episode number
3. Downloads videos directly from the CDN via yt-dlp
Usage:
# Download entire season
python pokeflix_scraper.py --url "https://www.pokeflix.tv/browse/pokemon-indigo-league" --output ~/Pokemon/
# Download specific episode range
python pokeflix_scraper.py --url "..." --start 1 --end 10 --output ~/Pokemon/
# Resume interrupted download
python pokeflix_scraper.py --url "..." --output ~/Pokemon/ --resume
# Dry run (extract URLs only)
python pokeflix_scraper.py --url "..." --dry-run
# Choose quality
python pokeflix_scraper.py --url "..." --quality 720p --output ~/Pokemon/
Dependencies:
pip install playwright
playwright install chromium
# yt-dlp must be installed: pip install yt-dlp
Author: Cal Corum (with Jarvis assistance)
"""
import argparse
import asyncio
import json
import logging
import random
import re
import subprocess
import sys
from dataclasses import dataclass, field, asdict
from datetime import datetime
from pathlib import Path
from typing import Optional
try:
from playwright.async_api import async_playwright, Page, Browser
except ImportError:
print("ERROR: playwright not installed. Run: pip install playwright && playwright install chromium")
sys.exit(1)
# ============================================================================
# Data Classes
# ============================================================================
@dataclass
class Episode:
    """A single episode: CDN metadata plus download bookkeeping."""
    cdn_number: int  # episode number as used in CDN URLs
    title: str
    page_url: str
    slug: str  # URL slug e.g., "01-pokemon-i-choose-you"
    video_url: Optional[str] = None
    downloaded: bool = False
    error: Optional[str] = None

    @property
    def filename(self) -> str:
        """Filesystem-safe output filename, e.g. 'E01 - Title.mp4'."""
        cleaned = re.sub(r'[<>:"/\\|?*]', '', self.title).strip()
        return f"E{self.cdn_number:02d} - {cleaned}.mp4"
@dataclass
class Season:
    """A season/series together with its episode list."""
    name: str
    url: str
    cdn_slug: str  # e.g., "01-indigo-league" - used for CDN URLs
    episodes: list[Episode] = field(default_factory=list)

    @property
    def safe_name(self) -> str:
        """Directory-safe version of the season name."""
        return re.sub(r'[<>:"/\\|?*]', '', self.name).strip()
@dataclass
class DownloadState:
    """Persistent state for resumable downloads, serialized as JSON.

    `save()` stamps `last_updated` and writes the whole dataclass;
    `load()` is the inverse and returns None when no usable state exists.
    """
    season_url: str
    season_name: str
    cdn_slug: str
    # Keyed by the CDN episode number *as a string*: callers store entries
    # via str(cdn_num), and JSON round-trips object keys as strings anyway.
    episodes: dict[str, dict] = field(default_factory=dict)
    # All episode page URLs discovered on the browse page.
    episode_urls: list[str] = field(default_factory=list)
    last_updated: str = ""

    def save(self, path: Path) -> None:
        """Save state to JSON file, refreshing the last_updated timestamp."""
        self.last_updated = datetime.now().isoformat()
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(asdict(self), f, indent=2)

    @classmethod
    def load(cls, path: Path) -> Optional['DownloadState']:
        """Load state from JSON file.

        Returns:
            The saved state, or None if the file is missing, truncated,
            or does not match the expected field layout — a corrupt state
            file should not abort the run; it is treated as "no state".
        """
        if not path.exists():
            return None
        try:
            with open(path, encoding='utf-8') as f:
                data = json.load(f)
            return cls(**data)
        except (json.JSONDecodeError, TypeError):
            return None
# ============================================================================
# Logging Setup
# ============================================================================
def setup_logging(verbose: bool = False) -> logging.Logger:
    """Return the scraper's logger, attaching a console handler exactly once."""
    level = logging.DEBUG if verbose else logging.INFO
    logger = logging.getLogger('pokeflix_scraper')
    logger.setLevel(level)
    # Guard against duplicate handlers when called more than once.
    if not logger.handlers:
        handler = logging.StreamHandler()
        handler.setLevel(level)
        fmt = logging.Formatter(
            '%(asctime)s [%(levelname)s] %(message)s',
            datefmt='%H:%M:%S'
        )
        handler.setFormatter(fmt)
        logger.addHandler(handler)
    return logger
# ============================================================================
# Scraper Class
# ============================================================================
class PokeflixScraper:
    """
    Scrapes pokeflix.tv for video URLs and downloads them.

    Pokeflix hosts videos on their CDN with URLs like:
        https://v1.pkflx.com/hls/{season-slug}/{ep-num}/{ep-num}_{quality}.mp4

    The episode number must be detected by visiting each episode page,
    as the browse page URL slugs don't contain episode numbers.

    Use as an async context manager: the browser is launched in
    ``__aenter__`` and torn down in ``__aexit__``.
    """

    BASE_URL = "https://www.pokeflix.tv"
    CDN_URL = "https://v1.pkflx.com/hls"

    # Map browse page URL slugs to CDN slugs
    SEASON_SLUG_MAP = {
        'pokemon-indigo-league': '01-indigo-league',
        'pokemon-adventures-in-the-orange-islands': '02-orange-islands',
        'pokemon-the-johto-journeys': '03-johto-journeys',
        'pokemon-johto-league-champions': '04-johto-league-champions',
        'pokemon-master-quest': '05-master-quest',
        'pokemon-advanced': '06-advanced',
        'pokemon-advanced-challenge': '07-advanced-challenge',
        'pokemon-advanced-battle': '08-advanced-battle',
        'pokemon-battle-frontier': '09-battle-frontier',
        'pokemon-diamond-and-pearl': '10-diamond-and-pearl',
        'pokemon-dp-battle-dimension': '11-battle-dimension',
        'pokemon-dp-galactic-battles': '12-galactic-battles',
        'pokemon-dp-sinnoh-league-victors': '13-sinnoh-league-victors',
        'pokemon-black-white': '14-black-and-white',
        'pokemon-bw-rival-destinies': '15-rival-destinies',
        'pokemon-bw-adventures-in-unova': '16-adventures-in-unova',
        'pokemon-xy': '17-xy',
        'pokemon-xy-kalos-quest': '18-kalos-quest',
        'pokemon-xyz': '19-xyz',
        'pokemon-sun-moon': '20-sun-and-moon',
        'pokemon-sun-moon-ultra-adventures': '21-ultra-adventures',
        'pokemon-sun-moon-ultra-legends': '22-ultra-legends',
        'pokemon-journeys': '23-journeys',
        'pokemon-master-journeys': '24-master-journeys',
        'pokemon-ultimate-journeys': '25-ultimate-journeys',
        'pokemon-horizons': '26-horizons',
    }

    def __init__(
        self,
        output_dir: Path,
        headless: bool = False,
        dry_run: bool = False,
        verbose: bool = False,
        quality: str = "1080p"
    ):
        """Store configuration; the browser is not launched until __aenter__.

        Args:
            output_dir: Root directory for downloaded files.
            headless: Run Chromium without a window (may trigger anti-bot).
            dry_run: Log URLs instead of downloading.
            verbose: Enable DEBUG-level logging.
            quality: Quality suffix used in CDN URLs (e.g. "1080p").
        """
        self.output_dir = output_dir
        self.headless = headless
        self.dry_run = dry_run
        self.quality = quality
        self.logger = setup_logging(verbose)
        self.browser: Optional[Browser] = None
        self._context = None  # shared BrowserContext, created in __aenter__

    async def __aenter__(self):
        """Async context manager entry - launch browser."""
        playwright = await async_playwright().start()
        self.browser = await playwright.chromium.launch(
            headless=self.headless,
            # Hides the navigator.webdriver automation flag.
            args=['--disable-blink-features=AutomationControlled']
        )
        self._playwright = playwright
        # Create a persistent context to reuse across all pages (shares
        # cookies, so a solved Cloudflare challenge carries over).
        self._context = await self.browser.new_context(
            viewport={'width': 1920, 'height': 1080},
            user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
        )
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Async context manager exit - close context, browser, then Playwright."""
        if self._context:
            await self._context.close()
        if self.browser:
            await self.browser.close()
        await self._playwright.stop()

    async def _new_page(self) -> Page:
        """Create a new page using the shared context."""
        return await self._context.new_page()

    async def _random_delay(self, min_sec: float = 1.0, max_sec: float = 3.0) -> None:
        """Sleep a random interval between min_sec and max_sec to avoid detection."""
        delay = random.uniform(min_sec, max_sec)
        await asyncio.sleep(delay)

    async def _wait_for_cloudflare(self, page: Page, timeout: int = 60) -> bool:
        """Wait for a Cloudflare challenge to be solved by the user.

        Polls once per second for up to `timeout` seconds. Returns True when
        no challenge is present (or it was solved), False on timeout.
        NOTE(review): the log message hardcodes "60 seconds" regardless of
        the `timeout` argument.
        """
        try:
            # Check if we're on a Cloudflare challenge page
            is_cf = await page.query_selector('#challenge-running, .cf-browser-verification, [id*="challenge"]')
            if is_cf:
                self.logger.warning("Cloudflare challenge detected - please solve it in the browser window")
                self.logger.info("Waiting up to 60 seconds for challenge completion...")
                # Wait for the challenge to be solved (URL changes or challenge element disappears)
                for _ in range(timeout):
                    await asyncio.sleep(1)
                    is_cf = await page.query_selector('#challenge-running, .cf-browser-verification, [id*="challenge"]')
                    if not is_cf:
                        self.logger.info("Cloudflare challenge completed!")
                        await asyncio.sleep(2)  # Wait for page to fully load
                        return True
                self.logger.error("Cloudflare challenge timeout - please try again")
                return False
        except Exception:
            # Best-effort: selector errors (e.g. page navigated away) are
            # treated as "no challenge present".
            pass
        return True

    def _get_cdn_slug(self, browse_url: str) -> Optional[str]:
        """Extract CDN slug from a browse page URL via SEASON_SLUG_MAP.

        Returns None for unknown seasons; callers fall back to detection
        from an episode page's network traffic.
        """
        match = re.search(r'/browse/([^/]+)', browse_url)
        if match:
            page_slug = match.group(1)
            if page_slug in self.SEASON_SLUG_MAP:
                return self.SEASON_SLUG_MAP[page_slug]
            self.logger.warning(f"Unknown season slug: {page_slug}, will try to detect from page")
        return None

    def _construct_video_url(self, cdn_slug: str, ep_num: int) -> str:
        """Construct direct CDN video URL for an episode at self.quality."""
        return f"{self.CDN_URL}/{cdn_slug}/{ep_num:02d}/{ep_num:02d}_{self.quality}.mp4"

    def _slug_to_title(self, slug: str) -> str:
        """Convert a URL slug to a human-readable title."""
        # Remove season prefix like "01-"
        title_slug = re.sub(r'^\d+-', '', slug)
        # Convert to title case
        title = title_slug.replace('-', ' ').title()
        # Clean up common words
        title = re.sub(r'\bPokemon\b', 'Pokémon', title)
        return title

    async def get_episode_list(self, season_url: str) -> tuple[str, str, list[tuple[str, str]]]:
        """
        Get the list of episode URLs from a season browse page.

        Returns:
            Tuple of (season_name, cdn_slug, list of (page_url, slug) tuples).
            cdn_slug is "unknown" and the list empty when detection fails.
        """
        self.logger.info(f"Fetching season page: {season_url}")
        cdn_slug = self._get_cdn_slug(season_url)
        page = await self._new_page()
        try:
            await page.goto(season_url, wait_until='networkidle', timeout=60000)
            await self._wait_for_cloudflare(page)
            await self._random_delay(2, 4)
            # Extract season title; fall back to the <title> tag.
            title_elem = await page.query_selector('h1, .season-title, .series-title')
            if not title_elem:
                title_elem = await page.query_selector('title')
            season_name = await title_elem.inner_text() if title_elem else "Unknown Season"
            # Strip the site's <title> boilerplate if present.
            season_name = season_name.replace('Pokéflix - Watch ', '').replace(' for free online!', '').strip()
            self.logger.info(f"Season: {season_name}")
            # Find all episode links with /v/ pattern
            links = await page.query_selector_all('a[href^="/v/"]')
            self.logger.info(f"Found {len(links)} episode links")
            # If we don't have CDN slug, detect it from first episode
            if not cdn_slug and links:
                first_href = await links[0].get_attribute('href')
                cdn_slug = await self._detect_cdn_slug(first_href)
            if not cdn_slug:
                self.logger.error("Could not determine CDN slug for this season")
                return season_name, "unknown", []
            self.logger.info(f"CDN slug: {cdn_slug}")
            # Collect all episode URLs, deduplicating by href.
            episode_data = []
            seen_urls = set()
            for link in links:
                href = await link.get_attribute('href')
                if not href or href in seen_urls:
                    continue
                seen_urls.add(href)
                # Extract slug from URL
                slug_match = re.search(r'/v/(.+)', href)
                if slug_match:
                    slug = slug_match.group(1)
                    full_url = self.BASE_URL + href
                    episode_data.append((full_url, slug))
            return season_name, cdn_slug, episode_data
        finally:
            await page.close()

    async def _detect_cdn_slug(self, episode_href: str) -> Optional[str]:
        """Visit an episode page to detect the CDN slug from network requests.

        Listens for requests to v1.pkflx.com/hls/ and captures the first
        path segment after /hls/. Returns None if nothing was observed
        within the fixed 5-second wait.
        """
        self.logger.info("Detecting CDN slug from episode page...")
        detected_slug = None

        async def capture_request(request):
            nonlocal detected_slug
            if 'v1.pkflx.com/hls/' in request.url:
                match = re.search(r'hls/([^/]+)/', request.url)
                if match:
                    detected_slug = match.group(1)

        page = await self._new_page()
        page.on('request', capture_request)
        try:
            await page.goto(self.BASE_URL + episode_href, timeout=60000)
            await self._wait_for_cloudflare(page)
            await asyncio.sleep(5)  # give the player time to request media
            return detected_slug
        finally:
            await page.close()

    async def get_episode_cdn_number(self, page_url: str) -> Optional[int]:
        """
        Visit an episode page and detect its CDN episode number.

        Strategy: listen for CDN media requests, try clicking likely play
        controls to trigger playback, then fall back to scanning the page
        source for a CDN URL.

        Returns:
            The episode number used in CDN URLs, or None if not detected.
        """
        detected_num = None

        async def capture_request(request):
            nonlocal detected_num
            if 'v1.pkflx.com/hls/' in request.url:
                # CDN URLs look like .../hls/{slug}/{NN}/{NN}_{quality}.mp4
                match = re.search(r'/(\d+)/\d+_', request.url)
                if match:
                    detected_num = int(match.group(1))

        page = await self._new_page()
        page.on('request', capture_request)
        try:
            await page.goto(page_url, timeout=60000)
            await self._wait_for_cloudflare(page)
            # Wait for initial load
            await asyncio.sleep(2)
            # Try to trigger video playback by clicking play button or video area
            play_selectors = [
                'button[aria-label*="play" i]',
                '.play-button',
                '[class*="play"]',
                'video',
                '.video-player',
                '.player',
                '#player',
            ]
            for selector in play_selectors:
                try:
                    elem = await page.query_selector(selector)
                    if elem:
                        await elem.click()
                        await asyncio.sleep(0.5)
                        if detected_num:
                            break
                except Exception:
                    # Click can fail on hidden/detached elements; just try
                    # the next selector.
                    pass
            # Wait for video requests after click attempts
            for _ in range(10):  # Wait up to 5 seconds
                if detected_num:
                    break
                await asyncio.sleep(0.5)
            # If still not detected, try looking in page source
            if not detected_num:
                content = await page.content()
                match = re.search(r'v1\.pkflx\.com/hls/[^/]+/(\d+)/', content)
                if match:
                    detected_num = int(match.group(1))
            return detected_num
        finally:
            await page.close()

    def download_video(self, video_url: str, output_path: Path) -> bool:
        """
        Download video using yt-dlp.

        NOTE(review): this is a blocking subprocess call made from async
        code paths; the event loop stalls for the duration of the download.

        Args:
            video_url: Direct CDN URL to the video
            output_path: Full path for output file

        Returns:
            True if download succeeded (or in dry-run mode).
        """
        if self.dry_run:
            self.logger.info(f" [DRY RUN] Would download: {video_url}")
            self.logger.info(f" To: {output_path}")
            return True
        self.logger.info(f" Downloading: {output_path.name}")
        cmd = [
            'yt-dlp',
            '--no-warnings',
            '-o', str(output_path),
            '--no-part',  # write directly, no .part temp file
            video_url
        ]
        try:
            result = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                timeout=1800  # 30-minute ceiling per episode
            )
            if result.returncode == 0:
                self.logger.info(f" Download complete!")
                return True
            else:
                self.logger.error(f" yt-dlp error: {result.stderr}")
                return False
        except subprocess.TimeoutExpired:
            self.logger.error(" Download timed out after 30 minutes")
            return False
        except FileNotFoundError:
            self.logger.error(" yt-dlp not found. Install with: pip install yt-dlp")
            return False

    async def download_direct(
        self,
        season_url: str,
        start_ep: int,
        end_ep: int,
        resume: bool = False
    ) -> None:
        """
        Direct download mode - download episodes by number without visiting pages.

        This is faster and more reliable when you know the episode range,
        but only works for seasons present in SEASON_SLUG_MAP.
        """
        # Get CDN slug from URL
        cdn_slug = self._get_cdn_slug(season_url)
        if not cdn_slug:
            self.logger.error("Unknown season - direct mode requires a known season URL")
            self.logger.info("Known seasons: " + ", ".join(self.SEASON_SLUG_MAP.keys()))
            return
        # In direct mode, output_dir is the final destination (no subfolder created)
        season_dir = self.output_dir
        season_dir.mkdir(parents=True, exist_ok=True)
        self.logger.info(f"Direct download mode: Episodes {start_ep}-{end_ep}")
        self.logger.info(f"CDN slug: {cdn_slug}")
        self.logger.info(f"Quality: {self.quality}")
        self.logger.info(f"Output: {season_dir}")
        downloaded = 0
        skipped = 0
        failed = 0
        for ep_num in range(start_ep, end_ep + 1):
            output_path = season_dir / f"E{ep_num:02d}.mp4"
            # Check if already exists (only honored with --resume)
            if output_path.exists() and resume:
                self.logger.info(f"E{ep_num:02d}: Skipping (file exists)")
                skipped += 1
                continue
            video_url = self._construct_video_url(cdn_slug, ep_num)
            self.logger.info(f"E{ep_num:02d}: Downloading...")
            success = self.download_video(video_url, output_path)
            if success:
                downloaded += 1
            else:
                failed += 1
            # Small delay between episodes (skipped in dry-run).
            if not self.dry_run:
                await self._random_delay(0.5, 1.5)
        self.logger.info(f"\nComplete! Downloaded: {downloaded}, Skipped: {skipped}, Failed: {failed}")

    async def download_season(
        self,
        season_url: str,
        start_ep: Optional[int] = None,
        end_ep: Optional[int] = None,
        resume: bool = False,
        direct: bool = False
    ) -> None:
        """
        Download all episodes from a season.

        Args:
            season_url: URL to the season browse page
            start_ep: First episode number to download (inclusive)
            end_ep: Last episode number to download (inclusive)
            resume: Whether to resume from previous state
            direct: If True, skip page visits and download by episode number
        """
        # Direct mode - just download by episode number
        if direct:
            if start_ep is None or end_ep is None:
                self.logger.error("Direct mode requires --start and --end episode numbers")
                return
            await self.download_direct(season_url, start_ep, end_ep, resume)
            return
        # Get episode list
        season_name, cdn_slug, episode_data = await self.get_episode_list(season_url)
        if not episode_data:
            self.logger.error("No episodes found!")
            return
        # Create output directory (season name sanitized for the filesystem)
        season_dir = self.output_dir / re.sub(r'[<>:"/\\|?*]', '', season_name).strip()
        season_dir.mkdir(parents=True, exist_ok=True)
        # State file for resume
        state_path = season_dir / 'download_state.json'
        state = None
        if resume:
            state = DownloadState.load(state_path)
            if state:
                self.logger.info(f"Resuming from previous state ({state.last_updated})")
        if not state:
            state = DownloadState(
                season_url=season_url,
                season_name=season_name,
                cdn_slug=cdn_slug,
                episode_urls=[url for url, _ in episode_data]
            )
        self.logger.info(f"Processing {len(episode_data)} episodes (quality: {self.quality})")
        # Process each episode
        downloaded_count = 0
        skipped_count = 0
        failed_count = 0
        for page_url, slug in episode_data:
            title = self._slug_to_title(slug)
            # Check if we already have this episode in state (by URL)
            existing_ep = None
            for ep_data in state.episodes.values():
                if ep_data.get('page_url') == page_url:
                    existing_ep = ep_data
                    break
            if existing_ep and existing_ep.get('downloaded') and resume:
                self.logger.info(f"Skipping: {title} (already downloaded)")
                skipped_count += 1
                continue
            # Get the CDN episode number by visiting the page
            self.logger.info(f"Checking: {title}")
            cdn_num = await self.get_episode_cdn_number(page_url)
            if cdn_num is None:
                self.logger.error(f" Could not detect episode number, skipping")
                failed_count += 1
                continue
            # Check if within requested range
            if start_ep is not None and cdn_num < start_ep:
                self.logger.info(f" Episode {cdn_num} before start range, skipping")
                continue
            if end_ep is not None and cdn_num > end_ep:
                self.logger.info(f" Episode {cdn_num} after end range, skipping")
                continue
            # Check if file already exists on disk; record it in state so a
            # later resume can skip without re-visiting the page.
            output_path = season_dir / f"E{cdn_num:02d} - {title}.mp4"
            if output_path.exists() and resume:
                self.logger.info(f" File exists, skipping")
                state.episodes[str(cdn_num)] = {
                    'cdn_number': cdn_num,
                    'title': title,
                    'page_url': page_url,
                    'slug': slug,
                    'video_url': self._construct_video_url(cdn_slug, cdn_num),
                    'downloaded': True,
                    'error': None
                }
                state.save(state_path)
                skipped_count += 1
                continue
            # Construct video URL and download
            video_url = self._construct_video_url(cdn_slug, cdn_num)
            self.logger.info(f" Episode {cdn_num}: {title}")
            success = self.download_video(video_url, output_path)
            # Save state after every episode so interrupts lose nothing.
            state.episodes[str(cdn_num)] = {
                'cdn_number': cdn_num,
                'title': title,
                'page_url': page_url,
                'slug': slug,
                'video_url': video_url,
                'downloaded': success,
                'error': None if success else "Download failed"
            }
            state.save(state_path)
            if success:
                downloaded_count += 1
            else:
                failed_count += 1
            # Delay between episodes
            if not self.dry_run:
                await self._random_delay(1, 2)
        # Summary
        self.logger.info(f"\nComplete!")
        self.logger.info(f" Downloaded: {downloaded_count}")
        self.logger.info(f" Skipped: {skipped_count}")
        self.logger.info(f" Failed: {failed_count}")
        self.logger.info(f" Output: {season_dir}")
# ============================================================================
# CLI
# ============================================================================
def _build_parser() -> argparse.ArgumentParser:
    """Construct the command-line interface for the scraper."""
    parser = argparse.ArgumentParser(
        description='Download Pokemon episodes from pokeflix.tv',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  %(prog)s --url "https://www.pokeflix.tv/browse/pokemon-indigo-league" --output ~/Pokemon/
  %(prog)s --url "..." --start 1 --end 10 --output ~/Pokemon/
  %(prog)s --url "..." --output ~/Pokemon/ --resume
  %(prog)s --url "..." --quality 720p --output ~/Pokemon/
  %(prog)s --url "..." --dry-run
"""
    )
    parser.add_argument('--url', '-u', required=True,
                        help='URL of the season/series page')
    parser.add_argument('--output', '-o', type=Path,
                        default=Path.home() / 'Downloads' / 'Pokemon',
                        help='Output directory (default: ~/Downloads/Pokemon)')
    parser.add_argument('--start', '-s', type=int,
                        help='Start episode number (CDN number)')
    parser.add_argument('--end', '-e', type=int,
                        help='End episode number (CDN number)')
    parser.add_argument('--quality', '-q', choices=['1080p', '720p', '360p'],
                        default='1080p',
                        help='Video quality (default: 1080p)')
    parser.add_argument('--resume', '-r', action='store_true',
                        help='Resume from previous download state')
    parser.add_argument('--dry-run', '-n', action='store_true',
                        help='Extract URLs only, do not download')
    parser.add_argument('--headless', action='store_true',
                        help='Run browser in headless mode (may trigger anti-bot)')
    parser.add_argument('--verbose', '-v', action='store_true',
                        help='Enable verbose logging')
    parser.add_argument('--direct', action='store_true',
                        help='Direct download mode - skip page visits, just download episode range by number')
    return parser


def main():
    """CLI entry point: parse arguments and run the scraper to completion."""
    args = _build_parser().parse_args()

    async def run():
        async with PokeflixScraper(
            output_dir=args.output,
            headless=args.headless,
            dry_run=args.dry_run,
            verbose=args.verbose,
            quality=args.quality
        ) as scraper:
            await scraper.download_season(
                season_url=args.url,
                start_ep=args.start,
                end_ep=args.end,
                resume=args.resume,
                direct=args.direct
            )

    asyncio.run(run())


if __name__ == '__main__':
    main()