Files
vreau-digital/services/seap-scraper/cron/scrape-gnm.sh
T
Claude VM a6c03a091e initial: split from gov-agreg — vreau.digital standalone platform
Moved from gov-agreg/src/pages/achizitii/* to root (drop prefix).
- 22 pages migrated, 127 files total
- All internal links: /achizitii/X → /X (176 occurrences fixed)
- AchizitiiLayout subnav rewritten: /X paths, top-right link to vreaudigital.ro hub
- BaseLayout new (vreau.digital branding, OG tags, site URL)
- astro.config.mjs: site https://vreau.digital, server output (was static)
- docker-compose: port 5096 (vreaudigital is 5095), container vreau-digital
- deploy.sh: paths /opt/vreau-digital, log /var/log/vreau-digital-deploy.log

Backend shared with gov-agreg:
- PostgreSQL satra (same schemas: seap, firms, anaf, anre, ...)
- Photon, Martin tiles
- Infisical /vreaudigital path (DATABASE_URL etc. shared)

build: PASS (npx astro check 0 errors, npm run build 5s vite + 10s server)
2026-05-13 00:10:32 +03:00

89 lines
3.1 KiB
Bash
Executable File
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/bin/bash
# GNM — Garda Națională de Mediu.
# Scrapes the gnm.ro WordPress RSS feed (~36 pages × 10 items) for environmental
# enforcement press releases. Persists every release to gnm.comunicate, flags
# is_enforcement, and runs a regex pass to surface (firm, fine_lei) tuples into
# gnm.amenzi_extrase.
#
# Mirrors scrape-ancom.sh / scrape-anre.sh pattern: Infisical Machine Identity
# → env-file → docker run --env-file (NEVER -e $VAR), file deleted post-launch.
#
# Idempotent (UPSERT on guid; items whose raw_hash is unchanged are skipped). Safe to run from cron.
#
# Env knobs:
# MAX_PAGES=0 (default: 0 = walk until empty, max 50)
# SINCE_DAYS=0 (default: 0 = no cutoff; >0 = stop at first item older than N days)
#
# Run:
# sudo MAX_PAGES=2 /opt/vreaudigital/services/seap-scraper/cron/scrape-gnm.sh # smoke (20 articles)
# sudo SINCE_DAYS=30 /opt/vreaudigital/services/seap-scraper/cron/scrape-gnm.sh # incremental
# sudo /opt/vreaudigital/services/seap-scraper/cron/scrape-gnm.sh # full (~360 articles)
# Strict mode: abort on command errors, on unset variables, and on a failure in any
# stage of a pipeline.
set -o errexit -o nounset -o pipefail

# Tunables read from the environment; each falls back to 0 when unset or empty.
: "${MAX_PAGES:=0}"
: "${SINCE_DAYS:=0}"

# Destination file for this script's log lines.
LOG='/var/log/vreaudigital-gnm.log'
# Emit one timestamped line ("[YYYY-MM-DD HH:MM:SS] <message>") on stdout and append
# the same line to the file named by $LOG.
#   $1 - message text
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1" \
    | tee -a "$LOG"
}
log "=== GNM scrape started (max_pages=$MAX_PAGES since_days=$SINCE_DAYS) ==="

# Overlap guard: skip this tick when a container with this exact name is still running.
# The listing is captured first instead of being piped straight into `grep -q`: with
# pipefail enabled, grep exiting on its first match can make the producer die of
# SIGPIPE, which would turn a positive match into a "not running" result. Capturing
# also lets a failing `docker ps` be reported instead of being read as "not running".
if ! running_names=$(docker ps --filter name=vreaudigital-gnm --format '{{.Names}}'); then
  log "ERROR: docker ps failed, cannot check for a running vreaudigital-gnm"
  exit 1
fi

# The name filter can also match longer names containing this string, so require an
# exact whole-line match.
if grep -Fqx 'vreaudigital-gnm' <<<"$running_names"; then
  log "WARN: vreaudigital-gnm already running, skipping this tick"
  exit 0
fi

# Remove a leftover (stopped) container of the same name so the name can be reused.
# A missing container is the normal case, hence the silenced, non-fatal call.
docker rm -f vreaudigital-gnm 2>/dev/null || true
# ── Fetch DATABASE_URL via Infisical Machine Identity ──
# The sourced file must provide the INFISICAL_* variables referenced below.
source /opt/vreaudigital/.infisical-mi

# Exchange the machine-identity credentials for a short-lived access token.
TOKEN=$(infisical login --method=universal-auth \
  --domain="$INFISICAL_API_URL" \
  --client-id="$INFISICAL_CLIENT_ID" \
  --client-secret="$INFISICAL_CLIENT_SECRET" \
  --silent --plain)
if [[ -z "$TOKEN" ]]; then
  log "ERROR: Infisical login returned an empty token"
  exit 1
fi

# Private temp file that will be handed to `docker run --env-file`. The EXIT trap
# removes it on every exit path, so an abort between here and the regular cleanup
# cannot leave the secret behind in /tmp.
umask 077
ENVF=$(mktemp /tmp/.vreaudigital-gnm-env.XXXXXX)
trap 'rm -f -- "$ENVF"' EXIT

DBURL=$(infisical secrets get DATABASE_URL \
  --domain="$INFISICAL_API_URL" \
  --projectId="$INFISICAL_PROJECT_ID" \
  --env="$INFISICAL_ENV" --path="$INFISICAL_PATH" \
  --token="$TOKEN" --plain --silent)
if [[ -z "$DBURL" ]]; then
  log "ERROR: Infisical returned an empty DATABASE_URL"
  exit 1
fi

# printf writes the value verbatim, whatever characters it contains.
printf 'DATABASE_URL=%s\n' "$DBURL" > "$ENVF"

# Drop the in-memory copies; from here on the secret lives only in the env-file.
unset DBURL TOKEN
# Work from the scraper directory; it is bind-mounted into the container as /work.
cd /opt/vreaudigital/services/seap-scraper

# Bootstrap: when node_modules/tsx is absent, run `npm install` in a throwaway node
# container as the invoking uid:gid. Its output is appended to $LOG and not echoed.
if [[ ! -d node_modules/tsx ]]; then
  log "Installing seap-scraper deps..."
  docker run --rm \
    --volume "$PWD:/work" \
    --workdir /work \
    --user "$(id -u):$(id -g)" \
    node:22-alpine \
    npm install --omit=optional 2>&1 \
    | tee -a "$LOG" >/dev/null
fi
# Optional scraper flags, collected in an array so that each flag reaches the command
# as exactly one argument. A knob contributes a flag only when it is a positive
# integer; 0 (the default) and any non-numeric value add nothing.
EXTRA_ARGS=()
if [[ "$MAX_PAGES" =~ ^0*[1-9][0-9]*$ ]]; then
  EXTRA_ARGS+=("--max-pages=$MAX_PAGES")
fi
if [[ "$SINCE_DAYS" =~ ^0*[1-9][0-9]*$ ]]; then
  EXTRA_ARGS+=("--since-days=$SINCE_DAYS")
fi

# Start the scraper detached and keep its container id for the log. The environment
# is supplied through --env-file only. The ${arr[@]+...} form expands to nothing for
# an empty array without tripping `set -u` on older bash releases.
CID=$(docker run -d \
  --name vreaudigital-gnm \
  --network host \
  --env-file "$ENVF" \
  -v "$(pwd):/work" \
  -w /work \
  --user "$(id -u):$(id -g)" \
  --restart no \
  node:22-alpine \
  npx tsx src/scrape-gnm.ts ${EXTRA_ARGS[@]+"${EXTRA_ARGS[@]}"})
log "container started: $CID"

# Short grace period, then delete the env-file so the secret does not stay on disk
# for the duration of the scrape.
sleep 3
rm -f -- "$ENVF"
log "envfile cleaned"

# Block until the container exits. A failing `docker wait` is logged instead of being
# left to `set -e`, so the run still ends with a "done" line and a numeric status.
if ! docker wait vreaudigital-gnm >/dev/null; then
  log "WARN: docker wait failed for vreaudigital-gnm"
fi

# Exit status recorded for the container. Falls back to 1 when it cannot be read or
# is not a number, so the final `exit` always receives a valid argument.
EXIT_CODE=$(docker inspect -f '{{.State.ExitCode}}' vreaudigital-gnm 2>/dev/null) || EXIT_CODE=1
[[ "$EXIT_CODE" =~ ^[0-9]+$ ]] || EXIT_CODE=1

# Copy the last 30 lines of container output to stdout and $LOG. Best-effort: a
# failure here must not replace the scraper's own exit status.
docker logs vreaudigital-gnm 2>&1 | tail -30 | tee -a "$LOG" \
  || log "WARN: could not collect container logs"

log "=== GNM scrape done (exit=$EXIT_CODE) ==="
exit "$EXIT_CODE"