#!/bin/bash
# ANAF datornici — LIVE scraper wrapper (Cloudflare Turnstile via 2captcha).
#
# Mirrors scrape-cnsc.sh / scrape-anaf-datornici.sh pattern but runs a Python
# script (not TSX) because the live scraper uses requests + psycopg2 and shares
# nothing with the data.gov.ro one-shot TS importer.
#
# Infisical Machine Identity → env-file (DATABASE_URL + TWOCAPTCHA_KEY) →
# docker run --env-file (NEVER -e $VAR), file deleted post-launch.
#
# Idempotent (UPSERT on cui+publication_date). Designed to be triggered
# quarterly by vreaudigital-anaf-datornici.timer.
#
# ⚠️ COST: each run spends real money via 2captcha (~$0.50-3 per quarterly
# tick, ~$60-100 one-time for 10-year backfill). Do NOT enable the systemd
# timer until TWOCAPTCHA_KEY is funded — see HANDOFF-anaf-datornici-2captcha.md.
#
# Env knobs:
#   DRY_RUN=1               — parse-only, zero spend, zero DB writes.
#   BACKFILL_FROM=2016-Q1   — iterate from quarter X through current.
#   CATEGORIES=mari,mijlocii — subset of {mari,mijlocii,mici,institutii_publice,persoane_fizice}.
#   INCLUDE_LISTA_ALBA=1    — also scrape anaf.lista_alba (separate endpoint).
#
# Exit codes:
#   0      — container finished successfully, or a previous run is still active.
#   3      — TWOCAPTCHA_KEY not found in Infisical (live mode only).
#   1      — container exit code could not be determined.
#   other  — exit code of the scraper container (or of a failed setup command).
#
# Run:
#   sudo /opt/vreaudigital/services/seap-scraper/cron/scrape-anaf-datornici-live.sh
#   sudo DRY_RUN=1 /opt/vreaudigital/services/seap-scraper/cron/scrape-anaf-datornici-live.sh
#   sudo BACKFILL_FROM=2016-Q1 INCLUDE_LISTA_ALBA=1 /opt/.../scrape-anaf-datornici-live.sh

set -euo pipefail

DRY_RUN="${DRY_RUN:-0}"
BACKFILL_FROM="${BACKFILL_FROM:-}"
CATEGORIES="${CATEGORIES:-}"
INCLUDE_LISTA_ALBA="${INCLUDE_LISTA_ALBA:-0}"

readonly LOG=/var/log/vreaudigital-anaf-datornici.log
readonly CONTAINER=vreaudigital-anaf-datornici-live

# Path of the temporary env-file holding secrets; empty until mktemp runs.
ENVF=""

# log MESSAGE — print a timestamped line to stdout and append it to $LOG.
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1" | tee -a "$LOG"
}

# Remove the secrets env-file if it exists. Registered as an EXIT trap so a
# failure between mktemp and the explicit post-launch removal (a failed secret
# fetch, cd, mkdir or docker run under `set -e`) does not leave credentials
# behind in /tmp.
cleanup_envfile() {
  if [ -n "$ENVF" ]; then
    rm -f -- "$ENVF"
  fi
}
trap cleanup_envfile EXIT

log "=== ANAF datornici LIVE scrape started (dry_run=$DRY_RUN backfill=$BACKFILL_FROM lista_alba=$INCLUDE_LISTA_ALBA) ==="

# Skip this tick if a previous run is still active. The docker name filter is
# a substring match, hence the anchored grep for the exact container name.
if docker ps --filter "name=$CONTAINER" --format '{{.Names}}' \
  | grep -q "^${CONTAINER}\$"; then
  log "WARN: vreaudigital-anaf-datornici-live already running, skipping this tick"
  exit 0
fi
# Drop any leftover (stopped) container with the same name; absence is fine.
docker rm -f "$CONTAINER" 2>/dev/null || true

# ── Fetch DATABASE_URL + TWOCAPTCHA_KEY via Infisical Machine Identity ──
# The sourced file is expected to define the INFISICAL_* variables used below;
# with `set -u` a missing one aborts the script at first use.
# shellcheck source=/dev/null
source /opt/vreaudigital/.infisical-mi
TOKEN=$(infisical login --method=universal-auth \
  --domain="$INFISICAL_API_URL" \
  --client-id="$INFISICAL_CLIENT_ID" \
  --client-secret="$INFISICAL_CLIENT_SECRET" \
  --silent --plain)

# Restrictive umask before creating any file that will hold secrets.
umask 077
ENVF=$(mktemp /tmp/.vreaudigital-anaf-datornici-live-env.XXXXXX)

DBURL=$(infisical secrets get DATABASE_URL \
  --domain="$INFISICAL_API_URL" \
  --projectId="$INFISICAL_PROJECT_ID" \
  --env="$INFISICAL_ENV" --path="$INFISICAL_PATH" \
  --token="$TOKEN" --plain --silent)
printf '%s\n' "DATABASE_URL=$DBURL" > "$ENVF"
unset DBURL

# TWOCAPTCHA_KEY: required unless DRY_RUN=1. If missing, abort with a clear
# pointer to the handoff doc — DO NOT silently run (would still hit ANAF page).
if [ "$DRY_RUN" != "1" ]; then
  # Try primary path first ($INFISICAL_PATH = /vreaudigital), fall back to root.
  # Some users add TWOCAPTCHA_KEY at root path / (less project-namespaced).
  TWOCAPTCHA_KEY=""
  for try_path in "$INFISICAL_PATH" "/"; do
    # A lookup failure on one path is expected and must not abort the script,
    # hence the deliberate `|| true` and discarded stderr.
    TWOCAPTCHA_KEY=$(infisical secrets get TWOCAPTCHA_KEY \
      --domain="$INFISICAL_API_URL" \
      --projectId="$INFISICAL_PROJECT_ID" \
      --env="$INFISICAL_ENV" --path="$try_path" \
      --token="$TOKEN" --plain --silent 2>/dev/null || true)
    if [ -n "$TWOCAPTCHA_KEY" ]; then
      break
    fi
  done
  if [ -z "$TWOCAPTCHA_KEY" ]; then
    log "ERROR: TWOCAPTCHA_KEY missing in Infisical (checked $INFISICAL_PATH + /) — see HANDOFF-anaf-datornici-2captcha.md"
    log "       Add via: NEW SECRET PROTOCOL (Infisical, either path /vreaudigital or /)"
    # The env-file is removed by the EXIT trap.
    exit 3
  fi
  printf '%s\n' "TWOCAPTCHA_KEY=$TWOCAPTCHA_KEY" >> "$ENVF"
  unset TWOCAPTCHA_KEY
fi
unset TOKEN

# Pass-through env knobs (optional ones are written only when set).
printf '%s\n' "DRY_RUN=$DRY_RUN" >> "$ENVF"
if [ -n "$BACKFILL_FROM" ]; then
  printf '%s\n' "BACKFILL_FROM=$BACKFILL_FROM" >> "$ENVF"
fi
if [ -n "$CATEGORIES" ]; then
  printf '%s\n' "CATEGORIES=$CATEGORIES" >> "$ENVF"
fi
if [ "$INCLUDE_LISTA_ALBA" = "1" ]; then
  printf '%s\n' "INCLUDE_LISTA_ALBA=1" >> "$ENVF"
fi
printf '%s\n' "ANAF_DATORNICI_LOG=/work/.log/anaf-datornici.log" >> "$ENVF"

cd /opt/vreaudigital/services/seap-scraper

# Ensure /work/.log is writable inside container (host bind-mount); the
# Python process also tees to stdout → docker logs → journald.
mkdir -p .log

CID=$(docker run -d \
  --name "$CONTAINER" \
  --network host \
  --env-file "$ENVF" \
  -v "$(pwd):/work" \
  -w /work \
  --user "$(id -u):$(id -g)" \
  --restart no \
  python:3.12-slim \
  bash -c "pip install --quiet --no-cache-dir psycopg2-binary requests && python3 scrapers/anaf_datornici/scraper.py")
log "container started: $CID"

# NOTE(review): `docker run -d` has already returned here; the sleep looks like
# a safety margin before deleting the env-file — confirm it is still needed.
sleep 3
rm -f -- "$ENVF"
log "envfile cleaned"

# Block until the container stops, then report its exit status and log tail.
docker wait "$CONTAINER" >/dev/null
EXIT_CODE=$(docker inspect -f '{{.State.ExitCode}}' "$CONTAINER" 2>/dev/null || echo "?")
docker logs "$CONTAINER" 2>&1 | tail -30 | tee -a "$LOG"
log "=== ANAF datornici LIVE scrape done (exit=$EXIT_CODE) ==="

# `exit` requires a numeric argument; if inspect failed we only have "?", so
# report a generic failure instead of tripping a bash usage error.
if [[ ! "$EXIT_CODE" =~ ^[0-9]+$ ]]; then
  exit 1
fi
exit "$EXIT_CODE"