#!/usr/bin/env python3
"""Continuous scanner: monitors PyPI for new releases and backfills unscanned packages.

Priority order:
  1. Subscribed packages with new releases (priority 1)
  2. New releases of top-5000 packages (priority 5)
  3. New releases of any package (priority 15)
  4. Unscanned popular packages (priority 30)
  5. Unscanned packages from the backlog (priority 50)

Runs forever. Polls PyPI RSS every 30s for new releases,
and drip-feeds unscanned packages from the DB into the scan queue.
"""

import json
import os
import sqlite3
import sys
import time
import urllib.request
import xml.etree.ElementTree as ET
from pathlib import Path

# Base URL of the viewer API; the VIEWER_API environment variable overrides
# the localhost default.
VIEWER_API = os.getenv("VIEWER_API", "http://localhost:8085")
# PyPI feed that is polled for new releases.
PYPI_RSS = "https://pypi.org/rss/updates.xml"
# SQLite database: viewer/pip_witness.db under the parent of this script's directory.
DB_PATH = Path(__file__).parent.parent.joinpath("viewer", "pip_witness.db")

# Seconds between RSS polls
RSS_INTERVAL = 30
# Seconds between backfill batches
BACKFILL_INTERVAL = 60
# Number of backfill scans submitted per batch
BACKFILL_BATCH = 3


def get_db():
    """Open a new SQLite connection to ``DB_PATH``.

    The connection's row factory is ``sqlite3.Row``, so result columns can be
    read by name. The caller is responsible for closing the connection.
    """
    conn = sqlite3.connect(os.fspath(DB_PATH))
    conn.row_factory = sqlite3.Row
    return conn


def submit_scan(package: str, version: str = None, priority: int = 50):
    """Ask the viewer API to queue a scan.

    POSTs a JSON object with the keys ``package``, ``version`` and
    ``priority`` to ``{VIEWER_API}/api/scan``.

    Args:
        package: Name of the package to scan.
        version: Version to scan; ``None`` is serialized as JSON ``null``.
        priority: Priority value forwarded unchanged to the API.

    Returns:
        True if the request completed without raising, False on any error
        (including HTTP error statuses, which ``urlopen`` raises).
    """
    data = json.dumps({"package": package, "version": version, "priority": priority}).encode()
    req = urllib.request.Request(
        f"{VIEWER_API}/api/scan", data=data,
        headers={"Content-Type": "application/json"}, method="POST"
    )
    try:
        # The context manager closes the HTTP response right away instead of
        # leaving the socket for the garbage collector.
        with urllib.request.urlopen(req, timeout=5):
            return True
    except Exception:
        # Best-effort: the caller only needs to know the submission failed.
        return False


def get_subscribed():
    """Fetch the current subscriptions from the viewer API.

    Reads ``{VIEWER_API}/api/subscriptions`` and indexes each returned entry
    by its ``"package"`` value.

    Returns:
        A dict mapping package name to the subscription entry as returned by
        the API, or an empty dict if the request or decoding fails.
    """
    try:
        # Close the response deterministically once the body has been read.
        with urllib.request.urlopen(f"{VIEWER_API}/api/subscriptions", timeout=5) as resp:
            entries = json.loads(resp.read())
        return {s["package"]: s for s in entries}
    except Exception:
        # Best-effort: an unreachable API is treated as "no subscriptions".
        return {}


def get_queue_size():
    """Return the number of queued scans reported by the viewer API.

    Reads the ``"queued_scans"`` field of ``{VIEWER_API}/api/stats``; a
    missing field counts as 0.

    Returns:
        The reported queue size, or 999 if the API cannot be reached or its
        response cannot be decoded, so that callers treat the queue as busy.
    """
    try:
        # Close the response deterministically once the body has been read.
        with urllib.request.urlopen(f"{VIEWER_API}/api/stats", timeout=5) as resp:
            stats = json.loads(resp.read())
        return stats.get("queued_scans", 0)
    except Exception:
        return 999  # assume busy if can't reach


def get_already_scanned():
    """Get set of (package, version) already completed.

    Reads every row of the ``scans`` table whose status is ``'completed'``.

    Returns:
        A set of ``(package, version)`` tuples.

    Raises:
        sqlite3.Error: if the query fails; the error is not swallowed here.
    """
    db = get_db()
    try:
        rows = db.execute("SELECT package, version FROM scans WHERE status='completed'").fetchall()
    finally:
        # Release the connection even when the query raises.
        db.close()
    return {(r["package"], r["version"]) for r in rows}


def get_popular_unscanned(limit: int):
    """Return popular packages that have no completed scan.

    Selects rows of ``pypi_packages`` with ``is_popular = 1`` for which no
    ``scans`` row with status ``'completed'`` matches the package name,
    ordered by ``download_count`` descending.

    Args:
        limit: Maximum number of rows to return.

    Returns:
        A list of ``(name, latest_version)`` tuples; empty if the query raises.
    """
    conn = get_db()
    try:
        cursor = conn.execute("""
            SELECT p.name, p.latest_version FROM pypi_packages p
            LEFT JOIN scans s ON s.package = p.name AND s.status = 'completed'
            WHERE p.is_popular = 1 AND s.id IS NULL
            ORDER BY p.download_count DESC
            LIMIT ?
        """, (limit,))
        candidates = cursor.fetchall()
    except Exception:
        # Any query failure is treated as "nothing to backfill".
        candidates = []
    conn.close()

    pairs = []
    for record in candidates:
        pairs.append((record["name"], record["latest_version"]))
    return pairs


def get_backlog_unscanned(limit: int):
    """Return packages from the index that have no completed scan.

    Selects rows of ``pypi_packages`` for which no ``scans`` row with status
    ``'completed'`` matches the package name, ordered by ``download_count``
    descending. Unlike the popular variant there is no ``is_popular`` filter.

    Args:
        limit: Maximum number of rows to return.

    Returns:
        A list of ``(name, latest_version)`` tuples; empty if the query raises.
    """
    conn = get_db()
    try:
        cursor = conn.execute("""
            SELECT p.name, p.latest_version FROM pypi_packages p
            LEFT JOIN scans s ON s.package = p.name AND s.status = 'completed'
            WHERE s.id IS NULL
            ORDER BY p.download_count DESC
            LIMIT ?
        """, (limit,))
        candidates = cursor.fetchall()
    except Exception:
        # Any query failure is treated as "nothing to backfill".
        candidates = []
    conn.close()

    pairs = []
    for record in candidates:
        pairs.append((record["name"], record["latest_version"]))
    return pairs


def poll_rss():
    """Poll PyPI RSS for recent releases. Returns list of (package, version).

    Each ``<item>`` title is split at its last space: the left part is taken
    as the package name and the right part as the version. Items with a
    missing or empty title, or a title without a space, are skipped.

    Returns:
        A list of ``(package, version)`` tuples in feed order, or an empty
        list if the feed cannot be fetched or parsed.
    """
    try:
        # Close the HTTP response as soon as the document has been parsed.
        with urllib.request.urlopen(PYPI_RSS, timeout=15) as resp:
            root = ET.parse(resp).getroot()
        releases = []
        for item in root.findall(".//item"):
            # findtext yields None for a missing <title> and "" for an empty one.
            title = item.findtext("title")
            if not title:
                continue
            parts = title.rsplit(" ", 1)
            if len(parts) == 2:
                releases.append((parts[0], parts[1]))
        return releases
    except Exception:
        # Best-effort: a failed poll is simply retried on the next cycle.
        return []


def _is_popular(pkg):
    """Return True if ``pypi_packages`` flags ``pkg`` as popular, else False."""
    db = get_db()
    try:
        row = db.execute("SELECT is_popular FROM pypi_packages WHERE name=?", (pkg,)).fetchone()
        return bool(row and row["is_popular"])
    except Exception:
        # Best-effort lookup: a failed query means "not popular".
        return False
    finally:
        db.close()


def _release_priority(pkg, subscriptions):
    """Return ``(priority, log label)`` for a new release of ``pkg``."""
    if pkg in subscriptions:
        return 1, "SUBSCRIBED — priority 1"
    if _is_popular(pkg):
        return 5, "popular — priority 5"
    return 15, "priority 15"


def _fresh_candidates(fetch, want, scanned):
    """Return up to ``want`` rows of ``fetch(limit)`` that are not in ``scanned``.

    Rows already submitted by this process keep matching the "unscanned"
    queries until their scan completes. The limit is therefore doubled until
    enough new rows are found or the query runs out of rows, so a few
    never-completing scans cannot block the backfill.
    """
    if want <= 0:
        return []
    limit = want
    while True:
        rows = fetch(limit)
        fresh = [key for key in rows if key not in scanned]
        # Fewer rows than requested means the query is exhausted.
        if len(fresh) >= want or len(rows) < limit:
            return fresh[:want]
        limit *= 2


def _backfill(scanned):
    """Submit up to ``BACKFILL_BATCH`` unscanned packages, popular ones first."""
    tiers = (
        (get_popular_unscanned, 30, "popular — priority 30"),
        (get_backlog_unscanned, 50, "priority 50"),
    )
    remaining = BACKFILL_BATCH
    for fetch, priority, label in tiers:
        if remaining <= 0:
            break
        picked = _fresh_candidates(fetch, remaining, scanned)
        remaining -= len(picked)
        for pkg, ver in picked:
            if (pkg, ver) in scanned:
                continue
            print(f"  [BACKFILL] {pkg} ({label})")
            # Only remember the package once the API accepted it, so a failed
            # submission is retried on a later backfill cycle.
            if submit_scan(pkg, ver, priority=priority):
                scanned.add((pkg, ver))


def main():
    """Run the scanner loop forever.

    Each cycle:
      1. Refreshes the subscription list if it is older than 5 minutes.
      2. Sleeps and skips the cycle if the scan queue holds more than 50 items.
      3. Polls the RSS feed and submits every release not submitted before,
         at priority 1 (subscribed), 5 (popular) or 15 (anything else).
      4. If ``BACKFILL_INTERVAL`` seconds have passed and the queue holds
         fewer than 20 items, submits a backfill batch.
      5. Sleeps ``RSS_INTERVAL`` seconds.

    A release or backfill package is recorded as handled only after
    ``submit_scan`` succeeded; failed submissions are retried later.
    """
    print("pip-witness continuous scanner")
    print(f"  API: {VIEWER_API}")
    print(f"  RSS poll: every {RSS_INTERVAL}s")
    print(f"  Backfill: {BACKFILL_BATCH} packages every {BACKFILL_INTERVAL}s")
    print()

    # Insertion-ordered dict used as an ordered set, so trimming really
    # discards the oldest entries.
    seen_releases = {}
    scanned = get_already_scanned()
    subscriptions = get_subscribed()
    last_sub_refresh = time.time()
    last_backfill = 0
    cycle = 0

    while True:
        cycle += 1
        now = time.time()

        # Refresh subscriptions every 5 minutes
        if now - last_sub_refresh > 300:
            subscriptions = get_subscribed()
            last_sub_refresh = now

        # Don't submit if queue is already big
        queue_size = get_queue_size()
        if queue_size > 50:
            print(f"  [{cycle}] Queue has {queue_size} items, waiting...")
            time.sleep(RSS_INTERVAL)
            continue

        # --- Priority 1-15: New releases from RSS ---
        new_count = 0
        for pkg, ver in poll_rss():
            key = (pkg, ver)
            if key in seen_releases or key in scanned:
                continue

            priority, label = _release_priority(pkg, subscriptions)
            print(f"  [NEW] {pkg}=={ver} ({label})")
            if not submit_scan(pkg, ver, priority=priority):
                # Leave the release unmarked so the next poll tries again.
                print(f"  [RETRY] {pkg}=={ver} submission failed, will retry next poll")
                continue

            seen_releases[key] = None
            scanned.add(key)
            new_count += 1

        if new_count > 0:
            print(f"  [{cycle}] {new_count} new releases queued")

        # --- Priority 30-50: Backfill unscanned packages ---
        if now - last_backfill > BACKFILL_INTERVAL and queue_size < 20:
            last_backfill = now
            _backfill(scanned)

        # Log status periodically
        if cycle % 10 == 0:
            print(f"  [{cycle}] queue={queue_size} seen={len(seen_releases)} scanned={len(scanned)}")

        # Keep seen set bounded: retain the 5000 most recently added entries.
        # Dropped keys stay in `scanned`, so they are not resubmitted.
        if len(seen_releases) > 10000:
            seen_releases = dict.fromkeys(list(seen_releases)[-5000:])

        time.sleep(RSS_INTERVAL)


# Script entry point: start the scanner loop only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
