#!/bin/bash
# MFP Budget Transparency ("Transparență Bugetară") scraper — Phase 1:
# enumerate the universe of reporting public entities + fuzzy-match
# name → CUI.
#
# Phase 2 (downloading the XML reports) is not implemented: the MFP
# application requires a CAPTCHA on every search, which needs an external
# captcha solver (2captcha / anti-captcha) and a budget for ~1.6M requests
# (4-8K USD for a complete 2020-2025 historical ingest). See BUGETAR-PLAN.md
# for details.
#
# Modes:
#   MODE=enumerate (default) → enumerates (sector × county) → bugetar.entitate
#   MODE=match-cui           → fuzzy-matches name → firms.entities.cui_normalized
#   MODE=full                → enumerate + match-cui in a single run
#
# Optional environment: JUDET, SECTOR (forwarded as --judet / --sector when
# non-empty), DELAY_MS (forwarded as --delay-ms in enumerate mode, default 500).
#
# Idempotent. Safe to run repeatedly (UPSERT).

set -euo pipefail

MODE="${MODE:-enumerate}"
JUDET="${JUDET:-}"
SECTOR="${SECTOR:-}"
DELAY_MS="${DELAY_MS:-500}"
LOG=/var/log/vreaudigital-bugetar.log

# Path of the temporary env file holding DATABASE_URL; empty until created.
ENVF=""

# Writes a timestamped message to stdout and appends it to $LOG.
# Arguments: $1 - message text
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1" | tee -a "$LOG"
}

# Removes the secret env file if it exists. Safe to call more than once;
# registered as an EXIT trap so the file does not outlive an aborted run.
cleanup() {
  if [[ -n "$ENVF" ]]; then
    rm -f -- "$ENVF"
    ENVF=""
    log "envfile cleaned" || true
  fi
}
trap cleanup EXIT

log "=== bugetar scraper started (mode=$MODE judet=${JUDET:-ALL} sector=${SECTOR:-ALL}) ==="

# Reject an unknown mode before fetching secrets or touching Docker.
case "$MODE" in
  enumerate | match-cui | full) ;;
  *)
    log "ERROR: unknown MODE=$MODE (use enumerate|match-cui|full)"
    log "=== bugetar scraper done (exit=2) ==="
    exit 2
    ;;
esac

# Guard: previous run still going? Containers are named
# vreaudigital-bugetar-<mode>, so match the prefix (and the bare legacy name)
# rather than only the exact string "vreaudigital-bugetar".
if ! running=$(docker ps --filter name=vreaudigital-bugetar --format '{{.Names}}'); then
  log "ERROR: docker ps failed; is the Docker daemon reachable?"
  exit 1
fi
if grep -Eq '^vreaudigital-bugetar(-|$)' <<<"$running"; then
  log "WARN: vreaudigital-bugetar already running, skipping"
  exit 0
fi
# Best effort: drop a stopped container left under the bare legacy name.
docker rm -f vreaudigital-bugetar >/dev/null 2>&1 || true

# ── Fetch DATABASE_URL via Infisical Machine Identity ──
# NOTE(review): the client secret and token are passed on the command line and
# may be visible in the process list; consider an env-var based alternative if
# the CLI offers one — confirm against the Infisical CLI docs.
# shellcheck source=/dev/null
source /opt/vreaudigital/.infisical-mi
TOKEN=$(infisical login --method=universal-auth \
  --domain="$INFISICAL_API_URL" \
  --client-id="$INFISICAL_CLIENT_ID" \
  --client-secret="$INFISICAL_CLIENT_SECRET" \
  --silent --plain)

umask 077
ENVF=$(mktemp /tmp/.vreaudigital-bugetar-env.XXXXXX)
DBURL=$(infisical secrets get DATABASE_URL \
  --domain="$INFISICAL_API_URL" \
  --projectId="$INFISICAL_PROJECT_ID" \
  --env="$INFISICAL_ENV" --path="$INFISICAL_PATH" \
  --token="$TOKEN" --plain --silent)
if [[ -z "$DBURL" ]]; then
  log "ERROR: Infisical returned an empty DATABASE_URL"
  exit 1
fi
printf 'DATABASE_URL=%s\n' "$DBURL" >"$ENVF"
unset DBURL TOKEN

cd /opt/vreaudigital/services/seap-scraper

# Make sure node_modules exists.
if [[ ! -d node_modules/tsx ]]; then
  log "Installing seap-scraper deps..."
  docker run --rm -v "$PWD:/work" -w /work --user "$(id -u):$(id -g)" \
    node:22-alpine npm install --omit=optional 2>&1 | tee -a "$LOG" >/dev/null
fi

#######################################
# Runs src/scrape-bugetar.ts in a detached node:22-alpine container, waits for
# it to finish, appends the last 10 lines of its output to the log and removes
# the container.
# Globals:   JUDET, SECTOR, DELAY_MS, ENVF, LOG (read)
# Arguments: $1 - scraper mode (enumerate | match-cui)
# Returns:   the container's exit code; 1 if the container could not be
#            started or its exit code could not be determined
# Note: callers invoke this as `run_scraper_mode ... || EXIT_CODE=$?`, which
# suspends `set -e` inside the function, so failures are checked explicitly.
#######################################
run_scraper_mode() {
  local mode="$1"
  local name="vreaudigital-bugetar-$mode"
  local -a scraper_args=("--mode=$mode")
  local shown="" cid rc

  if [[ -n "$JUDET" ]]; then
    scraper_args+=("--judet=$JUDET")
  fi
  if [[ -n "$SECTOR" ]]; then
    scraper_args+=("--sector=$SECTOR")
  fi
  if [[ "$mode" == "enumerate" ]]; then
    scraper_args+=("--delay-ms=$DELAY_MS")
  fi

  # Log only the optional arguments, in the same " --a=b --c=d" shape as before.
  if ((${#scraper_args[@]} > 1)); then
    shown=" ${scraper_args[*]:1}"
  fi
  log "running mode=$mode args=$shown"

  # A stopped container left by a crashed run would make --name conflict.
  docker rm -f "$name" >/dev/null 2>&1 || true

  if ! cid=$(docker run -d \
    --name "$name" \
    --network host \
    --env-file "$ENVF" \
    -v "$PWD:/work" \
    -w /work \
    --user "$(id -u):$(id -g)" \
    --restart no \
    node:22-alpine \
    npx tsx src/scrape-bugetar.ts "${scraper_args[@]}"); then
    log "ERROR: failed to start container $name"
    return 1
  fi
  log "  container: $cid"

  # Kept from the original: short pause so the env file has been read before
  # anything else happens; docker wait below blocks until the container stops.
  sleep 3
  # A failed wait is caught by the exit-code check that follows.
  docker wait "$name" >/dev/null || true

  rc=$(docker inspect -f '{{.State.ExitCode}}' "$name" 2>/dev/null) || rc=""
  # `return` needs a number; treat an unknown exit code as a failure.
  if [[ ! "$rc" =~ ^[0-9]+$ ]]; then
    rc=1
  fi

  docker logs "$name" 2>&1 | tail -10 | tee -a "$LOG"
  docker rm -f "$name" >/dev/null 2>&1 || true
  return "$rc"
}

EXIT_CODE=0
case "$MODE" in
  enumerate | match-cui)
    run_scraper_mode "$MODE" || EXIT_CODE=$?
    ;;
  full)
    run_scraper_mode enumerate || EXIT_CODE=$?
    # Only match CUIs when enumeration succeeded.
    if [[ "$EXIT_CODE" -eq 0 ]]; then
      run_scraper_mode match-cui || EXIT_CODE=$?
    fi
    ;;
esac

# Explicit call keeps "envfile cleaned" ahead of the final log line; the EXIT
# trap covers every other exit path.
cleanup
log "=== bugetar scraper done (exit=$EXIT_CODE) ==="
exit "$EXIT_CODE"