#!/bin/bash
# GNM — Garda Națională de Mediu.
# Scrapes the gnm.ro WordPress RSS feed (~36 pages × 10 items) for environmental
# enforcement press releases. Persists every release to gnm.comunicate, flags
# is_enforcement, and runs a regex pass to surface (firm, fine_lei) tuples into
# gnm.amenzi_extrase.
#
# Mirrors scrape-ancom.sh / scrape-anre.sh pattern: Infisical Machine Identity
# → env-file → docker run --env-file (NEVER -e $VAR), file deleted post-launch.
# An EXIT trap additionally removes the env file on every failure path, so the
# DATABASE_URL never lingers in /tmp when a step aborts under `set -e`.
#
# Idempotent (UPSERT on guid; skip on raw_hash unchanged). Safe to run from cron.
#
# Env knobs:
#   MAX_PAGES=0   (default: 0 = walk until empty, max 50)
#   SINCE_DAYS=0  (default: 0 = no cutoff; >0 = stop at first item older than N days)
#
# Run:
#   sudo MAX_PAGES=2 /opt/vreaudigital/services/seap-scraper/cron/scrape-gnm.sh    # smoke (20 articles)
#   sudo SINCE_DAYS=30 /opt/vreaudigital/services/seap-scraper/cron/scrape-gnm.sh  # incremental
#   sudo /opt/vreaudigital/services/seap-scraper/cron/scrape-gnm.sh                # full (~360 articles)
#
# Exit status: the scraper container's exit code; 1 when that code cannot be
# determined; non-zero when any setup step fails.

set -euo pipefail

MAX_PAGES="${MAX_PAGES:-0}"
SINCE_DAYS="${SINCE_DAYS:-0}"
LOG=/var/log/vreaudigital-gnm.log
CONTAINER=vreaudigital-gnm
ENVF=""   # path of the temporary env file; empty while none exists

# Print a timestamped message to stdout and append it to $LOG.
# Arguments: $1 - message text
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1" | tee -a "$LOG"
}

# Remove the secret-bearing env file, if one exists. Installed as the EXIT
# trap so it runs on normal completion and on `set -e` aborts alike.
cleanup() {
  if [[ -n "$ENVF" ]]; then
    rm -f -- "$ENVF"
  fi
}
trap cleanup EXIT

log "=== GNM scrape started (max_pages=$MAX_PAGES since_days=$SINCE_DAYS) ==="

# ── Overlap guard ──
# Capture the listing first instead of piping into `grep -q`: under pipefail an
# early grep exit can SIGPIPE the producer and make the pipeline look like
# "no match", which would let the `docker rm -f` below kill a live scrape.
# A failing `docker ps` now aborts here, before any secret is fetched.
running_names=$(docker ps --filter "name=$CONTAINER" --format '{{.Names}}')
if grep -qxF -- "$CONTAINER" <<<"$running_names"; then
  log "WARN: vreaudigital-gnm already running, skipping this tick"
  exit 0
fi
# Best-effort removal of a stopped container left by a previous run; failure
# (typically "no such container") is expected and intentionally ignored.
docker rm -f "$CONTAINER" 2>/dev/null || true

# ── Fetch DATABASE_URL via Infisical Machine Identity ──
# The sourced file is expected to define the INFISICAL_* variables used below.
# NOTE(review): client secret and token are passed on argv and may be visible
# in `ps` while the CLI runs — consider the CLI's env-var inputs if supported.
# shellcheck disable=SC1091
source /opt/vreaudigital/.infisical-mi
TOKEN=$(infisical login --method=universal-auth \
  --domain="$INFISICAL_API_URL" \
  --client-id="$INFISICAL_CLIENT_ID" \
  --client-secret="$INFISICAL_CLIENT_SECRET" \
  --silent --plain)

umask 077
ENVF=$(mktemp /tmp/.vreaudigital-gnm-env.XXXXXX)
DBURL=$(infisical secrets get DATABASE_URL \
  --domain="$INFISICAL_API_URL" \
  --projectId="$INFISICAL_PROJECT_ID" \
  --env="$INFISICAL_ENV" --path="$INFISICAL_PATH" \
  --token="$TOKEN" --plain --silent)
if [[ -z "$DBURL" ]]; then
  log "ERROR: DATABASE_URL from Infisical is empty, aborting"
  exit 1
fi
# printf rather than echo: the value is arbitrary data.
printf 'DATABASE_URL=%s\n' "$DBURL" > "$ENVF"
unset DBURL TOKEN

cd /opt/vreaudigital/services/seap-scraper

if [[ ! -d node_modules/tsx ]]; then
  log "Installing seap-scraper deps..."
  docker run --rm -v "$(pwd):/work" -w /work --user "$(id -u):$(id -g)" \
    node:22-alpine npm install --omit=optional 2>&1 | tee -a "$LOG" >/dev/null
fi

# Build the in-container command as an array so every argument stays a single
# word. A knob that is zero, negative or non-numeric is silently ignored,
# exactly as before (the numeric test's error output is discarded).
scraper_cmd=(npx tsx src/scrape-gnm.ts)
if [ "$MAX_PAGES" -gt 0 ] 2>/dev/null; then
  scraper_cmd+=("--max-pages=$MAX_PAGES")
fi
if [ "$SINCE_DAYS" -gt 0 ] 2>/dev/null; then
  scraper_cmd+=("--since-days=$SINCE_DAYS")
fi

CID=$(docker run -d \
  --name "$CONTAINER" \
  --network host \
  --env-file "$ENVF" \
  -v "$(pwd):/work" \
  -w /work \
  --user "$(id -u):$(id -g)" \
  --restart no \
  node:22-alpine \
  "${scraper_cmd[@]}")
log "container started: $CID"

# Grace period kept from the original flow before the env file is removed.
sleep 3
rm -f -- "$ENVF"
ENVF=""
log "envfile cleaned"

docker wait "$CONTAINER" >/dev/null
EXIT_CODE=$(docker inspect -f '{{.State.ExitCode}}' "$CONTAINER" 2>/dev/null) || EXIT_CODE="?"

# Log tailing is best-effort: a failure here must not replace the scraper's
# own exit status as the result of this script.
docker logs "$CONTAINER" 2>&1 | tail -30 | tee -a "$LOG" || true

log "=== GNM scrape done (exit=$EXIT_CODE) ==="

# `exit` requires a numeric argument; report a generic failure when the
# container's exit code could not be read.
if [[ "$EXIT_CODE" =~ ^[0-9]+$ ]]; then
  exit "$EXIT_CODE"
fi
exit 1