#!/usr/bin/env python3
"""
Pokeflix Scraper - Download Pokemon episodes from pokeflix.tv

Pokeflix hosts videos directly on their CDN (v1.pkflx.com). This scraper:
1. Extracts the episode list from a season browse page
2. Visits each episode page to detect its CDN episode number
3. Downloads videos directly from the CDN via yt-dlp

Usage:
    # Download entire season
    python pokeflix_scraper.py --url "https://www.pokeflix.tv/browse/pokemon-indigo-league" --output ~/Pokemon/

    # Download specific episode range
    python pokeflix_scraper.py --url "..." --start 1 --end 10 --output ~/Pokemon/

    # Resume interrupted download
    python pokeflix_scraper.py --url "..." --output ~/Pokemon/ --resume

    # Dry run (extract URLs only)
    python pokeflix_scraper.py --url "..." --dry-run

    # Choose quality
    python pokeflix_scraper.py --url "..." --quality 720p --output ~/Pokemon/

Dependencies:
    pip install playwright
    playwright install chromium
    # yt-dlp must be installed: pip install yt-dlp

Author: Cal Corum (with Jarvis assistance)
"""

import argparse
import asyncio
import json
import logging
import random
import re
import subprocess
import sys
from dataclasses import dataclass, field, asdict
from datetime import datetime
from pathlib import Path
from typing import Optional

try:
    from playwright.async_api import async_playwright, Page, Browser
except ImportError:
    print("ERROR: playwright not installed. Run: pip install playwright && playwright install chromium")
    sys.exit(1)


# ============================================================================
# Data Classes
# ============================================================================

@dataclass
class Episode:
    """Represents a single episode with its metadata and download status."""
    cdn_number: int  # The actual episode number on the CDN
    title: str
    page_url: str
    slug: str  # URL slug e.g., "01-pokemon-i-choose-you"
    video_url: Optional[str] = None
    downloaded: bool = False
    error: Optional[str] = None

    @property
    def filename(self) -> str:
        """Generate a filesystem-safe filename for the episode."""
        # Strip characters that are invalid on Windows/NTFS (superset of POSIX).
        safe_title = re.sub(r'[<>:"/\\|?*]', '', self.title)
        safe_title = safe_title.strip()
        return f"E{self.cdn_number:02d} - {safe_title}.mp4"


@dataclass
class Season:
    """Represents a season/series with all its episodes."""
    name: str
    url: str
    cdn_slug: str  # e.g., "01-indigo-league" - used for CDN URLs
    episodes: list[Episode] = field(default_factory=list)

    @property
    def safe_name(self) -> str:
        """Generate a filesystem-safe directory name for the season."""
        safe = re.sub(r'[<>:"/\\|?*]', '', self.name)
        return safe.strip()


@dataclass
class DownloadState:
    """Persistent state for resumable downloads (saved as JSON next to the videos)."""
    season_url: str
    season_name: str
    cdn_slug: str
    # FIX: keys are str(cdn_number), not int — writers use state.episodes[str(n)]
    # and JSON object keys are always strings after a load() round-trip.
    episodes: dict[str, dict] = field(default_factory=dict)
    episode_urls: list[str] = field(default_factory=list)  # All episode page URLs
    last_updated: str = ""

    def save(self, path: Path) -> None:
        """Save state to a JSON file, stamping the current time."""
        self.last_updated = datetime.now().isoformat()
        with open(path, 'w') as f:
            json.dump(asdict(self), f, indent=2)

    @classmethod
    def load(cls, path: Path) -> Optional['DownloadState']:
        """Load state from a JSON file; returns None if the file does not exist."""
        if not path.exists():
            return None
        with open(path) as f:
            data = json.load(f)
        return cls(**data)


# ============================================================================
# Logging Setup
# ============================================================================
def setup_logging(verbose: bool = False) -> logging.Logger:
    """Return the shared module logger, attaching a console handler on first use.

    Args:
        verbose: When True, log at DEBUG level instead of INFO.
    """
    level = logging.DEBUG if verbose else logging.INFO
    logger = logging.getLogger('pokeflix_scraper')
    logger.setLevel(level)
    if logger.handlers:
        # Handler already attached by a previous call - avoid duplicates.
        return logger
    console = logging.StreamHandler()
    console.setLevel(level)
    fmt = logging.Formatter('%(asctime)s [%(levelname)s] %(message)s', datefmt='%H:%M:%S')
    console.setFormatter(fmt)
    logger.addHandler(console)
    return logger


# ============================================================================
# Scraper Class
# ============================================================================

class PokeflixScraper:
    """
    Scrapes pokeflix.tv for video URLs and downloads them.

    Pokeflix hosts videos on their CDN with URLs like:
        https://v1.pkflx.com/hls/{season-slug}/{ep-num}/{ep-num}_{quality}.mp4

    The episode number must be detected by visiting each episode page, as the
    browse page URL slugs don't contain episode numbers.
    """
""" BASE_URL = "https://www.pokeflix.tv" CDN_URL = "https://v1.pkflx.com/hls" # Map browse page URL slugs to CDN slugs SEASON_SLUG_MAP = { 'pokemon-indigo-league': '01-indigo-league', 'pokemon-adventures-in-the-orange-islands': '02-orange-islands', 'pokemon-the-johto-journeys': '03-johto-journeys', 'pokemon-johto-league-champions': '04-johto-league-champions', 'pokemon-master-quest': '05-master-quest', 'pokemon-advanced': '06-advanced', 'pokemon-advanced-challenge': '07-advanced-challenge', 'pokemon-advanced-battle': '08-advanced-battle', 'pokemon-battle-frontier': '09-battle-frontier', 'pokemon-diamond-and-pearl': '10-diamond-and-pearl', 'pokemon-dp-battle-dimension': '11-battle-dimension', 'pokemon-dp-galactic-battles': '12-galactic-battles', 'pokemon-dp-sinnoh-league-victors': '13-sinnoh-league-victors', 'pokemon-black-white': '14-black-and-white', 'pokemon-bw-rival-destinies': '15-rival-destinies', 'pokemon-bw-adventures-in-unova': '16-adventures-in-unova', 'pokemon-xy': '17-xy', 'pokemon-xy-kalos-quest': '18-kalos-quest', 'pokemon-xyz': '19-xyz', 'pokemon-sun-moon': '20-sun-and-moon', 'pokemon-sun-moon-ultra-adventures': '21-ultra-adventures', 'pokemon-sun-moon-ultra-legends': '22-ultra-legends', 'pokemon-journeys': '23-journeys', 'pokemon-master-journeys': '24-master-journeys', 'pokemon-ultimate-journeys': '25-ultimate-journeys', 'pokemon-horizons': '26-horizons', } def __init__( self, output_dir: Path, headless: bool = False, dry_run: bool = False, verbose: bool = False, quality: str = "1080p" ): self.output_dir = output_dir self.headless = headless self.dry_run = dry_run self.quality = quality self.logger = setup_logging(verbose) self.browser: Optional[Browser] = None self._context = None async def __aenter__(self): """Async context manager entry - launch browser.""" playwright = await async_playwright().start() self.browser = await playwright.chromium.launch( headless=self.headless, args=['--disable-blink-features=AutomationControlled'] ) self._playwright 
= playwright # Create a persistent context to reuse self._context = await self.browser.new_context( viewport={'width': 1920, 'height': 1080}, user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36' ) return self async def __aexit__(self, exc_type, exc_val, exc_tb): """Async context manager exit - close browser.""" if self._context: await self._context.close() if self.browser: await self.browser.close() await self._playwright.stop() async def _new_page(self) -> Page: """Create a new page using the shared context.""" return await self._context.new_page() async def _random_delay(self, min_sec: float = 1.0, max_sec: float = 3.0): """Random delay to avoid detection.""" delay = random.uniform(min_sec, max_sec) await asyncio.sleep(delay) async def _wait_for_cloudflare(self, page: Page, timeout: int = 60): """Wait for Cloudflare challenge to be solved by user.""" try: # Check if we're on a Cloudflare challenge page is_cf = await page.query_selector('#challenge-running, .cf-browser-verification, [id*="challenge"]') if is_cf: self.logger.warning("Cloudflare challenge detected - please solve it in the browser window") self.logger.info("Waiting up to 60 seconds for challenge completion...") # Wait for the challenge to be solved (URL changes or challenge element disappears) for _ in range(timeout): await asyncio.sleep(1) is_cf = await page.query_selector('#challenge-running, .cf-browser-verification, [id*="challenge"]') if not is_cf: self.logger.info("Cloudflare challenge completed!") await asyncio.sleep(2) # Wait for page to fully load return True self.logger.error("Cloudflare challenge timeout - please try again") return False except Exception: pass return True def _get_cdn_slug(self, browse_url: str) -> Optional[str]: """Extract CDN slug from browse page URL.""" match = re.search(r'/browse/([^/]+)', browse_url) if match: page_slug = match.group(1) if page_slug in self.SEASON_SLUG_MAP: return 
self.SEASON_SLUG_MAP[page_slug] self.logger.warning(f"Unknown season slug: {page_slug}, will try to detect from page") return None def _construct_video_url(self, cdn_slug: str, ep_num: int) -> str: """Construct direct CDN video URL.""" return f"{self.CDN_URL}/{cdn_slug}/{ep_num:02d}/{ep_num:02d}_{self.quality}.mp4" def _slug_to_title(self, slug: str) -> str: """Convert URL slug to human-readable title.""" # Remove season prefix like "01-" title_slug = re.sub(r'^\d+-', '', slug) # Convert to title case title = title_slug.replace('-', ' ').title() # Clean up common words title = re.sub(r'\bPokemon\b', 'Pokémon', title) return title async def get_episode_list(self, season_url: str) -> tuple[str, str, list[tuple[str, str]]]: """ Get the list of episode URLs from a season browse page. Returns: Tuple of (season_name, cdn_slug, list of (page_url, slug) tuples) """ self.logger.info(f"Fetching season page: {season_url}") cdn_slug = self._get_cdn_slug(season_url) page = await self._new_page() try: await page.goto(season_url, wait_until='networkidle', timeout=60000) await self._wait_for_cloudflare(page) await self._random_delay(2, 4) # Extract season title title_elem = await page.query_selector('h1, .season-title, .series-title') if not title_elem: title_elem = await page.query_selector('title') season_name = await title_elem.inner_text() if title_elem else "Unknown Season" season_name = season_name.replace('Pokéflix - Watch ', '').replace(' for free online!', '').strip() self.logger.info(f"Season: {season_name}") # Find all episode links with /v/ pattern links = await page.query_selector_all('a[href^="/v/"]') self.logger.info(f"Found {len(links)} episode links") # If we don't have CDN slug, detect it from first episode if not cdn_slug and links: first_href = await links[0].get_attribute('href') cdn_slug = await self._detect_cdn_slug(first_href) if not cdn_slug: self.logger.error("Could not determine CDN slug for this season") return season_name, "unknown", [] 
self.logger.info(f"CDN slug: {cdn_slug}") # Collect all episode URLs episode_data = [] seen_urls = set() for link in links: href = await link.get_attribute('href') if not href or href in seen_urls: continue seen_urls.add(href) # Extract slug from URL slug_match = re.search(r'/v/(.+)', href) if slug_match: slug = slug_match.group(1) full_url = self.BASE_URL + href episode_data.append((full_url, slug)) return season_name, cdn_slug, episode_data finally: await page.close() async def _detect_cdn_slug(self, episode_href: str) -> Optional[str]: """Visit an episode page to detect the CDN slug from network requests.""" self.logger.info("Detecting CDN slug from episode page...") detected_slug = None async def capture_request(request): nonlocal detected_slug if 'v1.pkflx.com/hls/' in request.url: match = re.search(r'hls/([^/]+)/', request.url) if match: detected_slug = match.group(1) page = await self._new_page() page.on('request', capture_request) try: await page.goto(self.BASE_URL + episode_href, timeout=60000) await self._wait_for_cloudflare(page) await asyncio.sleep(5) return detected_slug finally: await page.close() async def get_episode_cdn_number(self, page_url: str) -> Optional[int]: """ Visit an episode page and detect its CDN episode number. 
Returns: The episode number used in CDN URLs, or None if not detected """ detected_num = None async def capture_request(request): nonlocal detected_num if 'v1.pkflx.com/hls/' in request.url: match = re.search(r'/(\d+)/\d+_', request.url) if match: detected_num = int(match.group(1)) page = await self._new_page() page.on('request', capture_request) try: await page.goto(page_url, timeout=60000) await self._wait_for_cloudflare(page) # Wait for initial load await asyncio.sleep(2) # Try to trigger video playback by clicking play button or video area play_selectors = [ 'button[aria-label*="play" i]', '.play-button', '[class*="play"]', 'video', '.video-player', '.player', '#player', ] for selector in play_selectors: try: elem = await page.query_selector(selector) if elem: await elem.click() await asyncio.sleep(0.5) if detected_num: break except Exception: pass # Wait for video requests after click attempts for _ in range(10): # Wait up to 5 seconds if detected_num: break await asyncio.sleep(0.5) # If still not detected, try looking in page source if not detected_num: content = await page.content() match = re.search(r'v1\.pkflx\.com/hls/[^/]+/(\d+)/', content) if match: detected_num = int(match.group(1)) return detected_num finally: await page.close() def download_video(self, video_url: str, output_path: Path) -> bool: """ Download video using yt-dlp. 
Args: video_url: Direct CDN URL to the video output_path: Full path for output file Returns: True if download succeeded """ if self.dry_run: self.logger.info(f" [DRY RUN] Would download: {video_url}") self.logger.info(f" To: {output_path}") return True self.logger.info(f" Downloading: {output_path.name}") cmd = [ 'yt-dlp', '--no-warnings', '-o', str(output_path), '--no-part', video_url ] try: result = subprocess.run( cmd, capture_output=True, text=True, timeout=1800 ) if result.returncode == 0: self.logger.info(f" Download complete!") return True else: self.logger.error(f" yt-dlp error: {result.stderr}") return False except subprocess.TimeoutExpired: self.logger.error(" Download timed out after 30 minutes") return False except FileNotFoundError: self.logger.error(" yt-dlp not found. Install with: pip install yt-dlp") return False async def download_direct( self, season_url: str, start_ep: int, end_ep: int, resume: bool = False ) -> None: """ Direct download mode - download episodes by number without visiting pages. This is faster and more reliable when you know the episode range. 
""" # Get CDN slug from URL cdn_slug = self._get_cdn_slug(season_url) if not cdn_slug: self.logger.error("Unknown season - direct mode requires a known season URL") self.logger.info("Known seasons: " + ", ".join(self.SEASON_SLUG_MAP.keys())) return # In direct mode, output_dir is the final destination (no subfolder created) season_dir = self.output_dir season_dir.mkdir(parents=True, exist_ok=True) self.logger.info(f"Direct download mode: Episodes {start_ep}-{end_ep}") self.logger.info(f"CDN slug: {cdn_slug}") self.logger.info(f"Quality: {self.quality}") self.logger.info(f"Output: {season_dir}") downloaded = 0 skipped = 0 failed = 0 for ep_num in range(start_ep, end_ep + 1): output_path = season_dir / f"E{ep_num:02d}.mp4" # Check if already exists if output_path.exists() and resume: self.logger.info(f"E{ep_num:02d}: Skipping (file exists)") skipped += 1 continue video_url = self._construct_video_url(cdn_slug, ep_num) self.logger.info(f"E{ep_num:02d}: Downloading...") success = self.download_video(video_url, output_path) if success: downloaded += 1 else: failed += 1 if not self.dry_run: await self._random_delay(0.5, 1.5) self.logger.info(f"\nComplete! Downloaded: {downloaded}, Skipped: {skipped}, Failed: {failed}") async def download_season( self, season_url: str, start_ep: Optional[int] = None, end_ep: Optional[int] = None, resume: bool = False, direct: bool = False ) -> None: """ Download all episodes from a season. 
Args: season_url: URL to the season browse page start_ep: First episode number to download (inclusive) end_ep: Last episode number to download (inclusive) resume: Whether to resume from previous state direct: If True, skip page visits and download by episode number """ # Direct mode - just download by episode number if direct: if start_ep is None or end_ep is None: self.logger.error("Direct mode requires --start and --end episode numbers") return await self.download_direct(season_url, start_ep, end_ep, resume) return # Get episode list season_name, cdn_slug, episode_data = await self.get_episode_list(season_url) if not episode_data: self.logger.error("No episodes found!") return # Create output directory season_dir = self.output_dir / re.sub(r'[<>:"/\\|?*]', '', season_name).strip() season_dir.mkdir(parents=True, exist_ok=True) # State file for resume state_path = season_dir / 'download_state.json' state = None if resume: state = DownloadState.load(state_path) if state: self.logger.info(f"Resuming from previous state ({state.last_updated})") if not state: state = DownloadState( season_url=season_url, season_name=season_name, cdn_slug=cdn_slug, episode_urls=[url for url, _ in episode_data] ) self.logger.info(f"Processing {len(episode_data)} episodes (quality: {self.quality})") # Process each episode downloaded_count = 0 skipped_count = 0 failed_count = 0 for page_url, slug in episode_data: title = self._slug_to_title(slug) # Check if we already have this episode in state (by URL) existing_ep = None for ep_data in state.episodes.values(): if ep_data.get('page_url') == page_url: existing_ep = ep_data break if existing_ep and existing_ep.get('downloaded') and resume: self.logger.info(f"Skipping: {title} (already downloaded)") skipped_count += 1 continue # Get the CDN episode number by visiting the page self.logger.info(f"Checking: {title}") cdn_num = await self.get_episode_cdn_number(page_url) if cdn_num is None: self.logger.error(f" Could not detect episode number, 
skipping") failed_count += 1 continue # Check if within requested range if start_ep is not None and cdn_num < start_ep: self.logger.info(f" Episode {cdn_num} before start range, skipping") continue if end_ep is not None and cdn_num > end_ep: self.logger.info(f" Episode {cdn_num} after end range, skipping") continue # Check if file already exists output_path = season_dir / f"E{cdn_num:02d} - {title}.mp4" if output_path.exists() and resume: self.logger.info(f" File exists, skipping") state.episodes[str(cdn_num)] = { 'cdn_number': cdn_num, 'title': title, 'page_url': page_url, 'slug': slug, 'video_url': self._construct_video_url(cdn_slug, cdn_num), 'downloaded': True, 'error': None } state.save(state_path) skipped_count += 1 continue # Construct video URL and download video_url = self._construct_video_url(cdn_slug, cdn_num) self.logger.info(f" Episode {cdn_num}: {title}") success = self.download_video(video_url, output_path) # Save state state.episodes[str(cdn_num)] = { 'cdn_number': cdn_num, 'title': title, 'page_url': page_url, 'slug': slug, 'video_url': video_url, 'downloaded': success, 'error': None if success else "Download failed" } state.save(state_path) if success: downloaded_count += 1 else: failed_count += 1 # Delay between episodes if not self.dry_run: await self._random_delay(1, 2) # Summary self.logger.info(f"\nComplete!") self.logger.info(f" Downloaded: {downloaded_count}") self.logger.info(f" Skipped: {skipped_count}") self.logger.info(f" Failed: {failed_count}") self.logger.info(f" Output: {season_dir}") # ============================================================================ # CLI # ============================================================================ def main(): parser = argparse.ArgumentParser( description='Download Pokemon episodes from pokeflix.tv', formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Examples: %(prog)s --url "https://www.pokeflix.tv/browse/pokemon-indigo-league" --output ~/Pokemon/ %(prog)s --url "..." 
--start 1 --end 10 --output ~/Pokemon/ %(prog)s --url "..." --output ~/Pokemon/ --resume %(prog)s --url "..." --quality 720p --output ~/Pokemon/ %(prog)s --url "..." --dry-run """ ) parser.add_argument( '--url', '-u', required=True, help='URL of the season/series page' ) parser.add_argument( '--output', '-o', type=Path, default=Path.home() / 'Downloads' / 'Pokemon', help='Output directory (default: ~/Downloads/Pokemon)' ) parser.add_argument( '--start', '-s', type=int, help='Start episode number (CDN number)' ) parser.add_argument( '--end', '-e', type=int, help='End episode number (CDN number)' ) parser.add_argument( '--quality', '-q', choices=['1080p', '720p', '360p'], default='1080p', help='Video quality (default: 1080p)' ) parser.add_argument( '--resume', '-r', action='store_true', help='Resume from previous download state' ) parser.add_argument( '--dry-run', '-n', action='store_true', help='Extract URLs only, do not download' ) parser.add_argument( '--headless', action='store_true', help='Run browser in headless mode (may trigger anti-bot)' ) parser.add_argument( '--verbose', '-v', action='store_true', help='Enable verbose logging' ) parser.add_argument( '--direct', action='store_true', help='Direct download mode - skip page visits, just download episode range by number' ) args = parser.parse_args() async def run(): async with PokeflixScraper( output_dir=args.output, headless=args.headless, dry_run=args.dry_run, verbose=args.verbose, quality=args.quality ) as scraper: await scraper.download_season( season_url=args.url, start_ep=args.start, end_ep=args.end, resume=args.resume, direct=args.direct ) asyncio.run(run()) if __name__ == '__main__': main()