#!/bin/bash
set -euo pipefail

# pip-witness entrypoint: Multi-step attestation workflow
#
# Produces 4 signed DSSE attestation collections:
# Step 1: pip-install       — Install with full ptrace tracing + all attestors
# Step 2: pip-install-sbom  — CycloneDX SBOM generation (signed)
# Step 3: pip-install-sarif — SARIF security findings (signed)
# Step 4: pip-install-import — Import with ptrace to catch import-time attacks
#
# Each step is its own DSSE envelope with subjects, stored in Archivista.
# Policy layout links them via AttestationsFrom.

PACKAGE="${1:?Usage: pip-witness <package-spec>}"
STEP_NAME="${STEP_NAME:-pip-install}"
TS="$(date +%s)"
INSTALL_ATT="/attestations/${STEP_NAME}-${TS}.json"
IMPORT_ATT="/attestations/${STEP_NAME}-import-${TS}.json"
SIGNING_KEY="/etc/pip-witness/signing-key.pem"

# Derive the importable module name from the pip spec using pure
# parameter expansion (no subshells): drop the version constraint,
# drop any extras bracket, map '-' to '_', then lowercase (bash 4+).
PKG_NAME="${PACKAGE%%[<>=!]*}"   # "pkg>=1.0"  -> "pkg"
PKG_NAME="${PKG_NAME%%\[*}"      # "pkg[all]"  -> "pkg"
PKG_NAME="${PKG_NAME//-/_}"      # dashes become underscores for import
PKG_NAME="${PKG_NAME,,}"         # lowercase

echo "=========================================="
echo "  pip-witness: multi-step attestation"
echo "=========================================="
echo "Package:     ${PACKAGE}"
echo "Import as:   ${PKG_NAME}"
# Fix: the banner previously claimed "install → import → analyze",
# which did not match the actual step order below.
echo "Steps:       install → sbom → sarif → import"
echo ""

# Archivista upload is opt-in via ARCHIVISTA_SERVER. The flags live in
# a single string that is deliberately expanded UNQUOTED at each cilock
# call site so it word-splits into separate arguments (and an empty
# string expands to nothing at all).
ARCHIVISTA_FLAGS=""
if [ -n "${ARCHIVISTA_SERVER:-}" ]; then
    ARCHIVISTA_FLAGS="--enable-archivista --archivista-server ${ARCHIVISTA_SERVER}"
fi

# Strip sensitive env vars before scanning to avoid secretscan false positives (Issue #6)
unset ANTHROPIC_API_KEY 2>/dev/null || true

# Attestors for install step
# omnitrail:  file tree with permissions/ownership/hashes
# secretscan: detect hardcoded secrets in installed files
# slsa:       SLSA v1.0 provenance document
INSTALL_ATTESTORS="environment,omnitrail,secretscan,slsa"
# Attestors for import step (lighter weight)
IMPORT_ATTESTORS="environment"

# Note: cilock doesn't support --material/--product flags (those are witness-only)
# Material/product attestors are populated by omnitrail instead

# =====================================================================
# STEP 1: Install with full tracing
# =====================================================================
# The install runs under ptrace (--trace) so cilock records every
# process, file open, network connection, and syscall event that pip
# (and any setup.py / build backend it spawns) performs.
# Fix: step label said "[Step 1/3]" while every other step uses "/5".
echo "[Step 1/5] Installing with full ptrace instrumentation..."
echo "  Attestors: command-run (trace), pip-install, ${INSTALL_ATTESTORS}"

# ARCHIVISTA_FLAGS is intentionally unquoted: it holds zero or more
# whitespace-separated flags that must word-split into separate args.
# The trailing '|| true' keeps the workflow going even if the install
# (or cilock itself) fails — later steps still produce attestations.
# shellcheck disable=SC2086
cilock run \
    --step "${STEP_NAME}" \
    --trace \
    --signer-file-key-path "${SIGNING_KEY}" \
    --outfile "${INSTALL_ATT}" \
    --attestations "pip-install,${INSTALL_ATTESTORS}" \
    --hashes sha256 \
    ${ARCHIVISTA_FLAGS} \
    -d /workspace \
    -- pip install --verbose --no-cache-dir "${PACKAGE}" 2>&1 || true

echo "  Install attestation: ${INSTALL_ATT}"
echo ""

# =====================================================================
# STEP 2: SBOM Generation (wrapped in cilock attestation collection)
# =====================================================================
SBOM_RAW="/tmp/sbom-${TS}.cdx.json"
SBOM_ATT="/attestations/${STEP_NAME}-sbom-${TS}.json"
echo "[Step 2/5] Generating CycloneDX SBOM as signed attestation..."

if ! command -v cyclonedx-py &>/dev/null; then
    echo "  cyclonedx-py not available, skipping SBOM"
else
    # Run the SBOM generator inside a signed cilock step so the result
    # is wrapped in its own DSSE attestation collection.
    # shellcheck disable=SC2086
    cilock run \
        --step "${STEP_NAME}-sbom" \
        --signer-file-key-path "${SIGNING_KEY}" \
        --outfile "${SBOM_ATT}" \
        --attestations "environment" \
        --hashes sha256 \
        ${ARCHIVISTA_FLAGS} \
        -d /workspace \
        -- cyclonedx-py environment --of json -o "${SBOM_RAW}" 2>&1 || true

    if [ ! -f "${SBOM_RAW}" ]; then
        echo "  SBOM generation failed"
    else
        # Keep an unsigned copy of the raw SBOM next to the attestations
        cp "${SBOM_RAW}" "/attestations/${STEP_NAME}-sbom-${TS}.cdx.json" 2>/dev/null || true
        COMPONENTS=$(python3 -c "import json; d=json.load(open('${SBOM_RAW}')); print(len(d.get('components',[])))" 2>/dev/null || echo "?")
        echo "  SBOM: ${SBOM_ATT} (${COMPONENTS} components, signed DSSE)"
    fi
fi
echo ""

# =====================================================================
# STEP 3: SARIF Report (wrapped in cilock attestation collection)
# =====================================================================
# Combines pip-audit (known-vulnerability scan) and bandit (static
# analysis of the installed source tree) into one SARIF document via
# /pip-witness/generate_sarif.py, run inside a signed cilock step.
SARIF_RAW="/tmp/sarif-${TS}.sarif"
SARIF_ATT="/attestations/${STEP_NAME}-sarif-${TS}.json"
echo "[Step 3/5] Generating SARIF report as signed attestation..."

# Build the SARIF generation script that runs inside cilock.
# NOTE: the heredoc delimiter is deliberately UNQUOTED, so ${PKG_NAME},
# ${PACKAGE}, ${SARIF_RAW}, ${INSTALL_ATT}, ${SBOM_RAW}, ${STEP_NAME}
# and ${TS} are expanded NOW (baked into the generated script), while
# the backslash-escaped \$SITE_PACKAGES is evaluated only when the
# script RUNS inside the cilock step.
cat > /tmp/generate_sarif_step.sh << SARIF_SCRIPT
#!/bin/bash
set -e

# pip-audit: scan for known vulnerabilities
if command -v pip-audit &>/dev/null; then
    pip-audit --format=json --output=/tmp/pip-audit.json 2>/dev/null || true
fi

# bandit: static security analysis on installed code
SITE_PACKAGES=\$(python3 -c "import site; print(site.getsitepackages()[0])" 2>/dev/null)
if command -v bandit &>/dev/null && [ -n "\${SITE_PACKAGES}" ] && [ -d "\${SITE_PACKAGES}/${PKG_NAME}/" ]; then
    bandit -r "\${SITE_PACKAGES}/${PKG_NAME}/" -f sarif -o /tmp/bandit.sarif 2>/dev/null || true
fi

# Generate combined SARIF
SARIF_OUTPUT="${SARIF_RAW}" python3 /pip-witness/generate_sarif.py \
    "${PKG_NAME}" "${PACKAGE}" "${INSTALL_ATT}" "${SBOM_RAW}" 2>/dev/null || true

# Copy raw SARIF to attestations dir for convenience
if [ -f "${SARIF_RAW}" ]; then
    cp "${SARIF_RAW}" "/attestations/${STEP_NAME}-sarif-${TS}.sarif" 2>/dev/null || true
fi
SARIF_SCRIPT
chmod +x /tmp/generate_sarif_step.sh

# Run the generator inside a signed cilock step so the SARIF output is
# covered by its own DSSE collection. The trailing '|| true' keeps the
# multi-step workflow going even if this step fails.
cilock run \
    --step "${STEP_NAME}-sarif" \
    --signer-file-key-path "${SIGNING_KEY}" \
    --outfile "${SARIF_ATT}" \
    --attestations "environment" \
    --hashes sha256 \
    ${ARCHIVISTA_FLAGS} \
    -d /workspace \
    -- bash /tmp/generate_sarif_step.sh 2>&1 || true

# Report the finding count from the raw SARIF copy (the signed envelope
# lives at ${SARIF_ATT}); '?' if the file is missing or unparseable.
if [ -f "${SARIF_RAW}" ]; then
    TOTAL=$(python3 -c "import json; d=json.load(open('${SARIF_RAW}')); print(sum(len(r.get('results',[])) for r in d.get('runs',[])))" 2>/dev/null || echo "?")
    echo "  SARIF: ${SARIF_ATT} (${TOTAL} findings, signed DSSE)"
else
    echo "  SARIF generation failed"
fi
echo ""

# =====================================================================
# STEP 4: Import with tracing (catches import-time attacks)
# =====================================================================
echo "[Step 4/5] Importing package with ptrace tracing..."
echo "  This catches malware that only activates on import (e.g., sys.meta_path hooks,"
echo "  atexit handlers, background threads, codec attacks)"

# Build the import test script.
# NOTE: the heredoc delimiter is QUOTED ('IMPORT_SCRIPT'), so the
# Python source below is written out verbatim with no shell expansion;
# the module name is passed to the script as argv[1] at run time.
cat > /tmp/import_test.py << 'IMPORT_SCRIPT'
"""pip-witness import-time tracer.

Imports the package and records Python-level changes that ptrace can't see:
- sys.meta_path modifications (import hooks)
- atexit handler registration
- threading.Thread creation at module scope
- codecs.register calls
- sys.modules changes
"""
import sys
import json
import importlib

package = sys.argv[1] if len(sys.argv) > 1 else ""
results = {
    "package": package,
    "import_success": False,
    "import_error": None,
    "meta_path_before": len(sys.meta_path),
    "meta_path_after": 0,
    "meta_path_added": 0,
    "modules_before": len(sys.modules),
    "modules_after": 0,
    "modules_added": 0,
    "atexit_registered": False,
    "threads_started": False,
}

# Snapshot before import
original_meta_path = list(sys.meta_path)
original_modules = set(sys.modules.keys())

# Hook atexit to detect registration
import atexit
original_register = atexit.register
atexit_calls = []
def hooked_register(func, *args, **kwargs):
    atexit_calls.append({"func": str(func), "module": getattr(func, "__module__", "?")})
    return original_register(func, *args, **kwargs)
atexit.register = hooked_register

# Hook threading to detect thread creation
import threading
original_thread_init = threading.Thread.__init__
thread_creations = []
def hooked_thread_init(self, *args, **kwargs):
    thread_creations.append({"target": str(kwargs.get("target", args[0] if args else "?"))})
    original_thread_init(self, *args, **kwargs)
threading.Thread.__init__ = hooked_thread_init

# Do the import
try:
    mod = importlib.import_module(package)
    results["import_success"] = True
except Exception as e:
    results["import_error"] = str(e)

# Record changes
results["meta_path_after"] = len(sys.meta_path)
results["meta_path_added"] = len(sys.meta_path) - len(original_meta_path)
results["modules_after"] = len(sys.modules)
results["modules_added"] = len(sys.modules) - len(original_modules)
results["new_modules"] = sorted(list(set(sys.modules.keys()) - original_modules))[:50]
results["atexit_registered"] = len(atexit_calls) > 0
results["atexit_calls"] = atexit_calls
results["threads_started"] = len(thread_creations) > 0
results["thread_creations"] = thread_creations

# New meta_path entries
new_finders = []
for finder in sys.meta_path:
    if finder not in original_meta_path:
        new_finders.append({"type": type(finder).__name__, "module": type(finder).__module__})
results["new_meta_path_finders"] = new_finders

# Output
print(json.dumps(results, indent=2, default=str))

# Restore hooks
atexit.register = original_register
threading.Thread.__init__ = original_thread_init
IMPORT_SCRIPT

# Run the import inside ptrace tracing
# Note: subjects come from the -d workdir (cilock hashes files in workdir)
# The tracer's JSON report goes to stdout and is captured in the
# command-run attestation; '|| true' keeps the workflow going even if
# the import itself crashes.
cilock run \
    --step "${STEP_NAME}-import" \
    --trace \
    --signer-file-key-path "${SIGNING_KEY}" \
    --outfile "${IMPORT_ATT}" \
    --attestations "${IMPORT_ATTESTORS}" \
    --hashes sha256 \
    ${ARCHIVISTA_FLAGS} \
    -d /workspace \
    -- python3 /tmp/import_test.py "${PKG_NAME}" 2>&1 || true

echo "  Import attestation: ${IMPORT_ATT}"
echo ""

# =====================================================================
# STEP 5: Summary
# =====================================================================
echo "[Step 5/5] Attestation summary"
echo "=========================================="

# Decode each DSSE envelope's base64 payload and print a one-line
# rollup of the traced process/file/network counters.
# Fixes vs. previous version:
#   - The install summary was gated on the IMPORT attestation existing,
#     so a failed import step silently hid it. Each file is now checked
#     independently.
#   - Paths are passed via argv with a quoted heredoc instead of being
#     interpolated into the Python source, so unusual characters in
#     STEP_NAME cannot break (or inject into) the script.
if [ -f "${INSTALL_ATT}" ] || [ -f "${IMPORT_ATT}" ]; then
    python3 - "${INSTALL_ATT}" "${IMPORT_ATT}" <<'PY' 2>/dev/null || true
import base64
import json
import os
import sys

def payload_of(path):
    """Decode the base64 in-toto payload from a DSSE envelope on disk."""
    with open(path) as f:
        envelope = json.load(f)
    return json.loads(base64.b64decode(envelope['payload']))

def summarize(path, label, with_files):
    """Print a one-line rollup of the process-trace attestation, if present."""
    if not os.path.isfile(path):
        return
    try:
        pred = payload_of(path).get('predicate', {})
        for att in pred.get('attestations', []):
            d = att.get('attestation', {})
            if 'processes' not in d:
                continue
            procs = d['processes']
            total_net = sum(len(p.get('network', {}).get('connections', [])) for p in procs if p.get('network'))
            total_events = sum(len(p.get('syscallEvents', [])) for p in procs)
            if with_files:
                total_files = sum(len(p.get('openedfiles', {})) for p in procs)
                total_writes = sum(len(p.get('fileOps', {}).get('writes', [])) for p in procs if p.get('fileOps'))
                print(f'{label}: {len(procs)} processes | {total_files} files | {total_net} connections | {total_writes} writes | {total_events} syscall events')
            else:
                print(f'{label}: {len(procs)} processes | {total_net} connections | {total_events} syscall events')
    except Exception as e:
        print(f'{label} summary error: {e}')

summarize(sys.argv[1], 'Install', with_files=True)
summarize(sys.argv[2], 'Import', with_files=False)
PY
fi

echo ""
echo "Attestations (4 signed DSSE collections):"
echo "  1. Install: ${INSTALL_ATT}"
echo "  2. SBOM:    ${SBOM_ATT}"
echo "  3. SARIF:   ${SARIF_ATT}"
echo "  4. Import:  ${IMPORT_ATT}"
echo "=========================================="
