#!/bin/bash
# CNAS — Casa Națională de Asigurări de Sănătate.
# Scrapes the central WP media library at cnas.ro/wp-content/uploads/ for
# furnizori-de-servicii-medicale PDFs (~70-90 active docs as of 2026-05).
# Per-county Angular SPA at cas.cnas.ro/casXX is currently empty (handoff
# documented in CNAS-PLAN.md).
#
# Mirrors scrape-anre.sh / scrape-regas.sh pattern: Infisical Machine Identity
# → env-file → docker run --env-file (NEVER -e $VAR), file deleted post-launch.
# Container has poppler-utils installed for pdftotext.
#
# Idempotent. Safe to run from cron weekly (CNAS uploads ~5-15 files/month).
#
# Env knobs:
#   LIMIT=0     (default: 0 = all matched files)
#   MODE=full   (full | metadata-only | parse-only)
#
# Run:
#   sudo /opt/vreaudigital/services/seap-scraper/cron/scrape-cnas.sh                      # full
#   sudo LIMIT=5 /opt/vreaudigital/services/seap-scraper/cron/scrape-cnas.sh              # smoke test
#   sudo MODE=metadata-only /opt/vreaudigital/services/seap-scraper/cron/scrape-cnas.sh   # list-only
set -euo pipefail

LIMIT="${LIMIT:-0}"
MODE="${MODE:-full}"
LOG=/var/log/vreaudigital-cnas.log
CONTAINER=vreaudigital-cnas
ENVF=""   # path of the temporary env file; empty until mktemp has run

# Write a timestamped message to stdout and append it to $LOG.
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1" | tee -a "$LOG"
}

# Remove the secret-bearing env file on EVERY exit path (error, signal-driven
# exit, or normal completion) so DATABASE_URL never lingers in /tmp.
cleanup() {
  if [[ -n "$ENVF" ]]; then
    rm -f -- "$ENVF"
  fi
}
trap cleanup EXIT

log "=== CNAS scrape started (limit=$LIMIT mode=$MODE) ==="

# ── Skip this tick if a previous run is still in progress ──
# Output is captured first instead of piping into `grep -q`, so an early grep
# exit cannot fail the pipeline under `pipefail`.
if ! running=$(docker ps --filter "name=$CONTAINER" --format '{{.Names}}'); then
  log "ERROR: docker ps failed"
  exit 1
fi
if grep -Fqx -- "$CONTAINER" <<<"$running"; then
  log "WARN: vreaudigital-cnas already running, skipping this tick"
  exit 0
fi
# Best-effort removal of a stopped leftover container with the same name.
docker rm -f "$CONTAINER" 2>/dev/null || true

# ── Validate knobs BEFORE any secret is fetched or written to disk ──
if ! [[ "$LIMIT" =~ ^-?[0-9]+$ ]]; then
  log "ERROR: LIMIT must be an integer (got '$LIMIT')"
  exit 1
fi

# EXTRA_ARGS is interpolated into the container's `sh -c` string below; it only
# ever contains the validated integer LIMIT and fixed literal flags.
EXTRA_ARGS=""
if [ "$LIMIT" -gt 0 ]; then
  EXTRA_ARGS="$EXTRA_ARGS --limit=$LIMIT"
fi
case "$MODE" in
  metadata-only) EXTRA_ARGS="$EXTRA_ARGS --metadata-only" ;;
  parse-only)    EXTRA_ARGS="$EXTRA_ARGS --parse-only" ;;
  full) ;;
  *)
    log "ERROR: unknown MODE=$MODE (full|metadata-only|parse-only)"
    exit 1
    ;;
esac

# ── Fetch DATABASE_URL via Infisical Machine Identity ──
# The sourced file is expected to define the INFISICAL_* variables used below.
# shellcheck source=/dev/null
source /opt/vreaudigital/.infisical-mi

if ! TOKEN=$(infisical login --method=universal-auth \
  --domain="$INFISICAL_API_URL" \
  --client-id="$INFISICAL_CLIENT_ID" \
  --client-secret="$INFISICAL_CLIENT_SECRET" \
  --silent --plain); then
  log "ERROR: infisical login failed"
  exit 1
fi
if [[ -z "$TOKEN" ]]; then
  log "ERROR: infisical login returned an empty token"
  exit 1
fi

umask 077
ENVF=$(mktemp /tmp/.vreaudigital-cnas-env.XXXXXX)

if ! DBURL=$(infisical secrets get DATABASE_URL \
  --domain="$INFISICAL_API_URL" \
  --projectId="$INFISICAL_PROJECT_ID" \
  --env="$INFISICAL_ENV" --path="$INFISICAL_PATH" \
  --token="$TOKEN" --plain --silent); then
  log "ERROR: could not fetch DATABASE_URL from Infisical"
  exit 1
fi
if [[ -z "$DBURL" ]]; then
  log "ERROR: Infisical returned an empty DATABASE_URL"
  exit 1
fi
printf 'DATABASE_URL=%s\n' "$DBURL" > "$ENVF"
unset DBURL TOKEN

cd /opt/vreaudigital/services/seap-scraper

# ── One-time dependency install (skipped when tsx is already present) ──
if [ ! -d node_modules/tsx ]; then
  log "Installing seap-scraper deps..."
  docker run --rm -v "$PWD:/work" -w /work --user "$(id -u):$(id -g)" \
    node:22-alpine npm install --omit=optional 2>&1 | tee -a "$LOG" >/dev/null
fi

# Note: poppler-utils is installed at container start for pdftotext + pdfinfo.
# Using sh -c so we can chain apk add + npx tsx in a single command.
CID=$(docker run -d \
  --name "$CONTAINER" \
  --network host \
  --env-file "$ENVF" \
  -v "$PWD:/work" \
  -w /work \
  --user 0:0 \
  --restart no \
  node:22-alpine \
  sh -c "apk add --no-cache poppler-utils >/dev/null && npx tsx src/scrape-cnas.ts $EXTRA_ARGS")
log "container started: $CID"

# Grace period kept from the original flow before deleting the env file
# (NOTE(review): presumably precautionary — confirm whether it is still needed).
sleep 3
rm -f -- "$ENVF"
log "envfile cleaned"

# ── Wait for completion, then report ──
# Failures here are tolerated so the log tail and the final "done" line are
# still written.
docker wait "$CONTAINER" >/dev/null || log "WARN: docker wait failed"
EXIT_CODE=$(docker inspect -f '{{.State.ExitCode}}' "$CONTAINER" 2>/dev/null || echo "?")
docker logs "$CONTAINER" 2>&1 | tail -50 | tee -a "$LOG" || true
log "=== CNAS scrape done (exit=$EXIT_CODE) ==="

# `exit` needs a numeric argument; report a generic failure when the
# container's exit code could not be determined.
if [[ "$EXIT_CODE" =~ ^[0-9]+$ ]]; then
  exit "$EXIT_CODE"
fi
exit 1