#!/usr/bin/env python3
"""PyPI Firehose: Continuously monitor PyPI for new package releases.

Uses PyPI's RSS feed and JSON changelog API to detect new releases in
near-real-time. When a new release is detected, it's queued for scanning.

Also maintains a local index of all PyPI packages for the UI to browse.

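Architecture:
  - Poll PyPI RSS feed every 60s for new releases (period set by --interval)
  - Poll PyPI changelog API for bulk updates every 5 minutes
    (not implemented in this module; only the RSS feed is polled)
  - Maintain SQLite index of all packages for the browse UI
  - Auto-queue subscribed packages immediately
  - Queue new releases of popular packages (top 5000) automatically
    (not implemented in this module; --seed-top only flags packages as popular)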

Usage:
    python3 pypi_firehose.py                    # run continuously
    python3 pypi_firehose.py --seed             # seed the package index first
    python3 pypi_firehose.py --seed-top 5000    # seed top N packages
"""

import argparse
import json
import os
import sqlite3
import sys
import time
import urllib.request
import xml.etree.ElementTree as ET
from datetime import datetime, timezone
from pathlib import Path

# Base URL of the viewer service; override with the VIEWER_API environment
# variable. Used for the /api/scan and /api/subscriptions endpoints.
VIEWER_API = os.environ.get("VIEWER_API", "http://localhost:8085")
# RSS feed polled by check_rss_for_new_releases().
PYPI_RSS = "https://pypi.org/rss/updates.xml"
# NOTE(review): PYPI_CHANGELOG and PYPI_JSON are not referenced by any
# function in this file — confirm whether other modules import them.
PYPI_CHANGELOG = "https://pypi.org/pypi?%3Aaction=changelog_rss"
# Simple index root; requested with a JSON Accept header by
# seed_package_index().
PYPI_SIMPLE_JSON = "https://pypi.org/simple/"
# Template with one "{}" placeholder (presumably the package name).
PYPI_JSON = "https://pypi.org/pypi/{}/json"
# JSON document read by seed_top_packages(); it reads the "rows" list.
TOP_PACKAGES_URL = "https://hugovk.github.io/top-pypi-packages/top-pypi-packages-30-days.min.json"

# SQLite database file: <parent of this file's directory>/viewer/pip_witness.db
DB_PATH = Path(__file__).parent.parent / "viewer" / "pip_witness.db"


def get_db():
    """Open a fresh SQLite connection to the database at ``DB_PATH``.

    Rows are returned as ``sqlite3.Row`` objects, so columns can be read by
    name as well as by position. The caller owns the connection and is
    responsible for closing it.
    """
    connection = sqlite3.connect(str(DB_PATH))
    connection.row_factory = sqlite3.Row
    return connection


def init_package_index():
    """Create the ``pypi_packages`` table and its indexes if they are missing.

    Safe to call repeatedly: every statement uses ``IF NOT EXISTS``. The
    connection is closed even when schema creation fails, so an error here
    does not leak a handle on the database file.
    """
    db = get_db()
    try:
        db.executescript("""
            CREATE TABLE IF NOT EXISTS pypi_packages (
                name TEXT PRIMARY KEY,
                latest_version TEXT,
                summary TEXT,
                author TEXT,
                home_page TEXT,
                download_count INTEGER DEFAULT 0,
                last_release TEXT,
                last_checked TEXT,
                is_popular INTEGER DEFAULT 0,
                is_scanned INTEGER DEFAULT 0,
                scan_requested INTEGER DEFAULT 0
            );
            CREATE INDEX IF NOT EXISTS idx_pypi_popular ON pypi_packages(is_popular);
            CREATE INDEX IF NOT EXISTS idx_pypi_scanned ON pypi_packages(is_scanned);
        """)
        db.commit()
    finally:
        db.close()


def seed_package_index():
    """Seed the package index with every project name from PyPI's simple API.

    Requests the simple index as JSON and inserts each project name into
    ``pypi_packages``; names that already exist are left untouched
    (``INSERT OR IGNORE``). This is best-effort: any failure is printed and
    swallowed so the caller can carry on.
    """
    print("Seeding package index from PyPI...")
    try:
        req = urllib.request.Request(
            PYPI_SIMPLE_JSON,
            headers={"Accept": "application/vnd.pypi.simple.v1+json"},
        )
        # Context manager guarantees the HTTP response is closed.
        with urllib.request.urlopen(req, timeout=60) as resp:
            data = json.loads(resp.read())
        packages = [p["name"] for p in data.get("projects", [])]
        print(f"  Found {len(packages)} packages")

        db = get_db()
        try:
            # One bulk call instead of a Python-level execute() per name.
            db.executemany(
                "INSERT OR IGNORE INTO pypi_packages (name) VALUES (?)",
                ((name,) for name in packages),
            )
            db.commit()
        finally:
            db.close()
        print(f"  Indexed {len(packages)} packages")
    except Exception as e:
        print(f"  Error: {e}")


def seed_top_packages(n: int):
    """Mark the top ``n`` packages as popular and store their download counts.

    Reads the ``rows`` list from ``TOP_PACKAGES_URL``, takes the first ``n``
    entries and upserts each into ``pypi_packages`` with ``is_popular = 1``.
    Rows with neither a ``project`` nor a ``name`` value are skipped. This is
    best-effort: any failure is printed and swallowed.

    Args:
        n: Number of leading rows to use. Values below zero are treated as
            zero rather than slicing from the end of the list.
    """
    print(f"Fetching top {n} packages by downloads...")
    try:
        # Context manager guarantees the HTTP response is closed.
        with urllib.request.urlopen(TOP_PACKAGES_URL, timeout=30) as resp:
            data = json.loads(resp.read())
        rows = data.get("rows", [])[:max(n, 0)]

        params = []
        for row in rows:
            name = row.get("project", row.get("name", ""))
            if name:
                params.append((name, row.get("download_count", 0)))

        db = get_db()
        try:
            db.executemany("""
                INSERT INTO pypi_packages (name, download_count, is_popular)
                VALUES (?, ?, 1)
                ON CONFLICT(name) DO UPDATE SET
                    download_count = excluded.download_count,
                    is_popular = 1
            """, params)
            db.commit()
        finally:
            db.close()
        # Report what was actually written, not how many rows were fetched.
        print(f"  Marked {len(params)} packages as popular")
    except Exception as e:
        print(f"  Error: {e}")


def check_rss_for_new_releases():
    """Fetch the PyPI updates RSS feed and return the releases it lists.

    Each ``<item>`` whose title has the form ``"package 1.2.3"`` yields a
    dict with the keys ``package``, ``version`` and ``link``; ``link`` is an
    empty string when the item has no usable ``<link>``. Items whose title
    cannot be split into two parts are skipped.

    Returns:
        A list of release dicts, or an empty list if the feed could not be
        fetched or parsed. The error is printed instead of silently dropped.
    """
    try:
        # Context manager guarantees the HTTP response is closed.
        with urllib.request.urlopen(PYPI_RSS, timeout=15) as resp:
            root = ET.parse(resp).getroot()
    except Exception as e:
        print(f"  RSS error: {e}")
        return []

    releases = []
    for item in root.findall(".//item"):
        title = item.find("title")
        if title is None:
            continue
        # Strip first so stray whitespace around the title cannot end up in
        # the package name or version; the version is the last token.
        parts = (title.text or "").strip().rsplit(None, 1)
        if len(parts) != 2:
            continue
        link = item.find("link")
        # An empty <link/> element has text None; normalise that to "".
        link_text = (link.text or "") if link is not None else ""
        releases.append({"package": parts[0], "version": parts[1],
                         "link": link_text})
    return releases


def submit_scan(package: str, version: str = None, priority: int = 50):
    """Submit a scan request to the viewer.

    POSTs a JSON body with ``package``, ``version`` and ``priority`` to
    ``{VIEWER_API}/api/scan``.

    Args:
        package: Package name to scan.
        version: Version to scan; ``None`` is sent as JSON ``null``.
        priority: Priority value passed through to the viewer.

    Returns:
        True if the request completed without raising, False otherwise
        (including HTTP error statuses, which ``urlopen`` raises for).
    """
    payload = json.dumps(
        {"package": package, "version": version, "priority": priority}
    ).encode()
    req = urllib.request.Request(
        f"{VIEWER_API}/api/scan", data=payload,
        headers={"Content-Type": "application/json"}, method="POST"
    )
    try:
        # Context manager closes the response instead of leaving it to GC.
        with urllib.request.urlopen(req, timeout=5):
            return True
    except Exception:
        return False


def get_subscribed_packages():
    """Fetch the current subscriptions from the viewer.

    Expects ``{VIEWER_API}/api/subscriptions`` to return a JSON list of
    objects that each carry a ``package`` key.

    Returns:
        A dict mapping package name to its subscription object, or an empty
        dict if the request or decoding fails. Callers cannot distinguish
        "no subscriptions" from "viewer unreachable".
    """
    try:
        # Context manager closes the response instead of leaving it to GC.
        with urllib.request.urlopen(f"{VIEWER_API}/api/subscriptions", timeout=5) as resp:
            subscriptions = json.loads(resp.read())
        return {s["package"]: s for s in subscriptions}
    except Exception:
        return {}


def _record_new_releases(releases, seen_releases, subscriptions):
    """Index unseen releases and queue scans for subscribed packages.

    ``seen_releases`` is a dict used as an insertion-ordered set and is
    mutated in place. A release is marked as seen only after its index row
    has been committed, so a database failure leaves it eligible for the
    next poll instead of dropping it. Returns the number of new releases.
    """
    new_count = 0
    db = None
    try:
        for rel in releases:
            key = f"{rel['package']}=={rel['version']}"
            if key in seen_releases:
                continue

            # Open the connection lazily and share it for the whole batch.
            if db is None:
                db = get_db()
            db.execute("""
                INSERT INTO pypi_packages (name, latest_version, last_release)
                VALUES (?, ?, datetime('now'))
                ON CONFLICT(name) DO UPDATE SET
                    latest_version = excluded.latest_version,
                    last_release = excluded.last_release
            """, (rel["package"], rel["version"]))
            db.commit()

            seen_releases[key] = None
            new_count += 1

            # Auto-scan if subscribed
            if rel["package"] in subscriptions:
                print(f"  [SUB] New release: {key} — queuing scan")
                if not submit_scan(rel["package"], rel["version"], priority=1):
                    print(f"  [SUB] Failed to queue scan for {key}")
    finally:
        if db is not None:
            db.close()
    return new_count


def monitor_loop(interval: int = 60):
    """Poll PyPI for new releases forever.

    On each pass: refresh the subscription list if it is older than five
    minutes, read the RSS feed, record releases not seen before in the
    package index, and queue a priority-1 scan for those whose package is
    subscribed. Errors inside a pass are printed and the loop continues.

    Args:
        interval: Seconds to sleep between passes.
    """
    # A dict keeps insertion order (guaranteed from Python 3.7), so trimming
    # below drops the oldest keys. A plain set has no order, which made the
    # kept half arbitrary and let recent releases be processed twice.
    seen_releases = {}
    subscriptions = {}
    sub_refresh = 0

    print(f"Monitoring PyPI for new releases (every {interval}s)...")

    while True:
        try:
            # Refresh subscriptions every 5 minutes
            if time.time() - sub_refresh > 300:
                subscriptions = get_subscribed_packages()
                sub_refresh = time.time()

            releases = check_rss_for_new_releases()
            new_count = _record_new_releases(releases, seen_releases, subscriptions)

            if new_count > 0:
                print(f"  {new_count} new releases detected")

            # Keep the seen collection bounded: retain the newest 5000 keys.
            if len(seen_releases) > 10000:
                seen_releases = dict.fromkeys(list(seen_releases)[-5000:])

        except Exception as e:
            print(f"  Monitor error: {e}")

        time.sleep(interval)


def main():
    """Parse the command line, optionally seed the index, then monitor.

    ``--seed`` and ``--seed-top N`` run their seeding step before monitoring
    starts. Combined with ``--interval 0`` they mean "seed and exit".

    The interval is validated before the database is touched: a negative
    value would make ``time.sleep`` raise after the first poll, and zero
    without a seed option would poll PyPI in a tight loop. Both are rejected
    with a usage error (exit status 2).
    """
    parser = argparse.ArgumentParser(description="PyPI Firehose Monitor")
    parser.add_argument("--seed", action="store_true", help="Seed package index from PyPI")
    parser.add_argument("--seed-top", type=int, default=0, help="Seed top N packages")
    parser.add_argument("--interval", type=int, default=60, help="Poll interval in seconds")
    args = parser.parse_args()

    seeding = args.seed or args.seed_top > 0
    if args.interval < 0:
        parser.error("--interval must not be negative")
    if args.interval == 0 and not seeding:
        parser.error("--interval 0 is only valid with --seed or --seed-top (seed, then exit)")

    init_package_index()

    if args.seed:
        seed_package_index()
    if args.seed_top > 0:
        seed_top_packages(args.seed_top)

    # Seed-only run: a zero interval means "do not start the monitor".
    if seeding and not args.interval:
        return

    monitor_loop(interval=args.interval)


# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
