initial: split from gov-agreg — vreau.digital standalone platform

Moved from gov-agreg/src/pages/achizitii/* to root (drop prefix).
- 22 pages migrated, 127 files total
- All internal links: /achizitii/X → /X (176 occurrences fixed)
- AchizitiiLayout subnav rewritten: /X paths, top-right link to vreaudigital.ro hub
- BaseLayout new (vreau.digital branding, OG tags, site URL)
- astro.config.mjs: site https://vreau.digital, server output (was static)
- docker-compose: port 5096 (vreaudigital is 5095), container vreau-digital
- deploy.sh: paths /opt/vreau-digital, log /var/log/vreau-digital-deploy.log

Backend shared with gov-agreg:
- PostgreSQL satra (same schemas: seap, firms, anaf, anre, ...)
- Photon, Martin tiles
- Infisical /vreaudigital path (DATABASE_URL etc. shared)

build: PASS (npx astro check 0 errors, npm run build 5s vite + 10s server)
This commit is contained in:
Claude VM
2026-05-13 00:10:32 +03:00
commit a6c03a091e
352 changed files with 75295 additions and 0 deletions
+82
View File
@@ -0,0 +1,82 @@
#!/bin/bash
# Daily delta enrichment from ANAF webservicesp v9.
# Runs the tsx script inside a node:22-alpine container so satra doesn't
# need node installed at host level. DATABASE_URL is fetched fresh from
# Infisical and passed via --env-file (mode 600, deleted right after the
# container starts) — never on the docker run command line.
#
# Tier selection: pass TIER=daily|full|bulk as env (default: daily).
# Concurrency:    pass ANAF_CONCURRENCY=N (default: 2).
#
# Idempotent. Safe to run from cron.
#
# Exit status: the container's exit code; 0 when a previous run is still
# active; 1 when configuration is missing or the exit code cannot be read.
set -euo pipefail

TIER="${TIER:-daily}"
ANAF_CONCURRENCY="${ANAF_CONCURRENCY:-2}"
LOG=/var/log/vreaudigital-anaf.log
CONTAINER=vreaudigital-anaf
MI_FILE=/opt/vreaudigital/.infisical-mi
ENVF=""

# Timestamped line to stdout and appended to $LOG.
log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG"; }

# The env file holds DATABASE_URL in clear text. Remove it on EVERY exit
# path, not only the happy one: under `set -e` a failing `docker run` would
# otherwise leave the secret behind in /tmp.
cleanup_envfile() {
  if [ -n "$ENVF" ]; then
    rm -f -- "$ENVF"
  fi
}
trap cleanup_envfile EXIT

log "=== ANAF enrichment started (tier=$TIER, concurrency=$ANAF_CONCURRENCY) ==="

# Bail if a previous run is still going — daily/full tier should always
# finish well under 24h, so a still-running container means trouble.
# The name list is captured first (instead of `docker ps | grep -q`) so that
# a docker failure aborts the script rather than being read as "not running",
# which would let the `docker rm -f` below kill a live run.
running=$(docker ps --filter "name=$CONTAINER" --format '{{.Names}}')
if grep -qx "$CONTAINER" <<<"$running"; then
  log "WARN: vreaudigital-anaf already running, skipping this tick"
  exit 0
fi
# Drop a stopped leftover with the same name; absence is not an error.
docker rm -f "$CONTAINER" 2>/dev/null || true

# ── Fetch DATABASE_URL via Infisical Machine Identity ──
if [ ! -f "$MI_FILE" ]; then
  log "FATAL: $MI_FILE missing"
  exit 1
fi
# shellcheck disable=SC1090,SC1091
source "$MI_FILE"
TOKEN=$(infisical login --method=universal-auth \
  --domain="$INFISICAL_API_URL" \
  --client-id="$INFISICAL_CLIENT_ID" \
  --client-secret="$INFISICAL_CLIENT_SECRET" \
  --silent --plain)

umask 077
ENVF=$(mktemp /tmp/.vreaudigital-env.XXXXXX)
DBURL=$(infisical secrets get DATABASE_URL \
  --domain="$INFISICAL_API_URL" \
  --projectId="$INFISICAL_PROJECT_ID" \
  --env="$INFISICAL_ENV" --path="$INFISICAL_PATH" \
  --token="$TOKEN" --plain --silent)
echo "DATABASE_URL=$DBURL" > "$ENVF"
unset DBURL TOKEN

# ── Launch detached docker container ──
cd /opt/vreaudigital/services/seap-scraper

# Make sure node_modules exists (first run on a fresh host).
if [ ! -d node_modules/tsx ]; then
  log "Installing seap-scraper deps..."
  docker run --rm -v "$(pwd):/work" -w /work --user "$(id -u):$(id -g)" \
    node:22-alpine npm install --omit=optional 2>&1 | tee -a "$LOG" >/dev/null
fi

CID=$(docker run -d \
  --name "$CONTAINER" \
  --network host \
  --env-file "$ENVF" \
  -v "$(pwd):/work" \
  -w /work \
  --user "$(id -u):$(id -g)" \
  --restart no \
  node:22-alpine \
  npx tsx src/enrich-anaf.ts --concurrency="$ANAF_CONCURRENCY" --tier="$TIER")
log "container started: $CID"

# Daemon has read --env-file by the time `docker run -d` returns.
sleep 3
rm -f -- "$ENVF"
ENVF=""
log "envfile cleaned"

# Wait synchronously so systemd Type=oneshot accurately captures runtime.
docker wait "$CONTAINER" >/dev/null
EXIT_CODE=$(docker inspect -f '{{.State.ExitCode}}' "$CONTAINER" 2>/dev/null || echo "?")
docker logs "$CONTAINER" 2>&1 | tail -5 | tee -a "$LOG"
log "=== ANAF enrichment done (exit=$EXIT_CODE) ==="

# `exit "?"` is a bash error ("numeric argument required"); map an unreadable
# exit code to a plain failure instead.
if ! [[ "$EXIT_CODE" =~ ^[0-9]+$ ]]; then
  exit 1
fi
exit "$EXIT_CODE"
+343
View File
@@ -0,0 +1,343 @@
#!/bin/bash
# Full geocoding fallback chain for firms.entities (WHERE lat IS NULL).
#
# Re-runnable / idempotent. Filters every stage on `lat IS NULL` so re-runs
# are no-ops once coverage is full. Safe to call after any ONRC fresh import
# (import-onrc-fresh.sh) which by itself does NOT geocode new rows.
#
# Stage chain (highest accuracy first):
#   1. geonames_postal — exact 6-digit RO postal match against firms.postal_codes_best
#   2. uat_centroid    — by siruta → public."GisUat" polygon centroid
#   3. photon          — Komoot Photon OSM geocoder (local 127.0.0.1:2322), street-level
#   3b/3c/3d. uat_centroid by postal_codes (locality+county median) — for rows w/o
#                        adr_strada (Photon's filter requires it). Tries locality token,
#                        then Comuna parent, then â/î normalization.
#   4. judet_centroid  — last resort, county median from firms.postal_codes
#
# Two rows in the entire dataset have literally zero address fields and stay NULL.
#
# Usage:
#   sudo /opt/vreaudigital/services/seap-scraper/cron/geocode-firms.sh
#   sudo SKIP_PHOTON=1 /opt/vreaudigital/services/seap-scraper/cron/geocode-firms.sh
#
# Env:
#   SKIP_PHOTON=1          — skip stage 3 (photon docker) — useful when Photon down
#   PHOTON_CONCURRENCY=40
#   PHOTON_BATCH=200
set -euo pipefail

LOG=/var/log/vreaudigital-geocode-firms.log
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
SEAP_DIR="$(dirname "$SCRIPT_DIR")"
SKIP_PHOTON="${SKIP_PHOTON:-0}"
PHOTON_CONCURRENCY="${PHOTON_CONCURRENCY:-40}"
PHOTON_BATCH="${PHOTON_BATCH:-200}"

# Timestamped line to stdout and appended to $LOG.
log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG"; }

log "=== Geocode-firms fallback chain started ==="

if [ ! -f /opt/vreaudigital/.infisical-mi ]; then
  log "FATAL: /opt/vreaudigital/.infisical-mi missing"
  exit 1
fi
# shellcheck disable=SC1091
source /opt/vreaudigital/.infisical-mi

TOKEN=$(infisical login --method=universal-auth \
  --domain="$INFISICAL_API_URL" \
  --client-id="$INFISICAL_CLIENT_ID" \
  --client-secret="$INFISICAL_CLIENT_SECRET" \
  --silent --plain)
DATABASE_URL=$(infisical run --domain="$INFISICAL_API_URL" \
  --projectId="$INFISICAL_PROJECT_ID" \
  --env="$INFISICAL_ENV" --path="$INFISICAL_PATH" \
  --silent --token="$TOKEN" \
  -- sh -c 'echo "$DATABASE_URL"')

# Split postgresql://user:pass@host[:port]/db[?params] into libpq env vars.
# One anchored match replaces five independent `sed` calls: with sed, a
# pattern that does not match (e.g. a URL without an explicit port) echoes
# the WHOLE URL — password included — into the variable, from where psql
# can print it in an error message. Here a non-matching URL is fatal and
# nothing derived from it is exported. Query parameters (e.g. ?schema=) are
# ignored because the database group stops at '?'.
pg_url_re='^postgresql://([^:]+):([^@]+)@([^:/]+)(:([0-9]+))?/([^?]+)'
if [[ ! "$DATABASE_URL" =~ $pg_url_re ]]; then
  log "FATAL: DATABASE_URL is not of the form postgresql://user:pass@host[:port]/db"
  exit 1
fi
export PGUSER="${BASH_REMATCH[1]}"
export PGPASSWORD="${BASH_REMATCH[2]}"
export PGHOST="${BASH_REMATCH[3]}"
export PGPORT="${BASH_REMATCH[5]:-5432}"   # libpq default when the URL omits it
export PGDATABASE="${BASH_REMATCH[6]}"

initial_null=$(psql -At -c "SELECT count(*) FROM firms.entities WHERE lat IS NULL;")
log "Initial WHERE lat IS NULL count: $initial_null"
if [ "$initial_null" = "0" ]; then
  log "Nothing to do — no firms with NULL lat."
  unset DATABASE_URL TOKEN PGPASSWORD
  exit 0
fi
# ── Stage 1: geonames_postal ────────────────────────────────────────────────
# Rows with a well-formed 6-digit postal code that exists in
# firms.postal_codes_best take that code's coordinates (score 0.6).
#
# -q matters for the count: without it psql prints the "UPDATE n" command
# tag after the RETURNING rows even in -At mode, so `wc -l` would report one
# more row than was actually updated.
log "[stage 1] geonames_postal (exact 6-digit postal match)..."
n=$(psql -q -v ON_ERROR_STOP=1 -At -c "
  WITH cand AS (
    SELECT e.cui FROM firms.entities e
    WHERE e.lat IS NULL
      AND e.adr_cod_postal ~ '^[0-9]{6}\$'
      AND EXISTS (SELECT 1 FROM firms.postal_codes_best pc WHERE pc.postal_code = e.adr_cod_postal)
  )
  UPDATE firms.entities e
  SET
    lat = pc.lat::double precision,
    lng = pc.lng::double precision,
    geom = ST_SetSRID(ST_MakePoint(pc.lng::double precision, pc.lat::double precision), 4326)::geography,
    geocode_source = 'geonames_postal',
    geocode_score = 0.6,
    geocoded_at = now(),
    updated_at = now()
  FROM firms.postal_codes_best pc, cand
  WHERE e.cui = cand.cui
    AND e.adr_cod_postal = pc.postal_code
    AND e.lat IS NULL
  RETURNING 1
" | wc -l)
log "[stage 1] updated $n rows"
# ── Stage 2: uat_centroid by siruta ─────────────────────────────────────────
# Rows still without coordinates whose siruta matches a public."GisUat"
# polygon take that polygon's centroid, reprojected to EPSG:4326 (score 0.3).
#
# -q suppresses the "UPDATE n" command tag that psql otherwise prints after
# the RETURNING rows, which would inflate the `wc -l` count by one.
log "[stage 2] uat_centroid (via siruta → GisUat polygon centroid)..."
n=$(psql -q -v ON_ERROR_STOP=1 -At -c "
  WITH cand AS (
    SELECT e.cui FROM firms.entities e
    WHERE e.lat IS NULL
      AND e.siruta IS NOT NULL
      AND EXISTS (SELECT 1 FROM public.\"GisUat\" gu WHERE gu.siruta = e.siruta)
  )
  UPDATE firms.entities e
  SET
    lat = ST_Y(ST_Transform(ST_Centroid(gu.geom), 4326))::double precision,
    lng = ST_X(ST_Transform(ST_Centroid(gu.geom), 4326))::double precision,
    geom = ST_Transform(ST_Centroid(gu.geom), 4326)::geography,
    geocode_source = 'uat_centroid',
    geocode_score = 0.3,
    geocoded_at = now(),
    updated_at = now()
  FROM public.\"GisUat\" gu, cand
  WHERE e.cui = cand.cui
    AND e.siruta = gu.siruta
    AND e.lat IS NULL
  RETURNING 1
" | wc -l)
log "[stage 2] updated $n rows"
# ── Stage 3: photon (docker) ────────────────────────────────────────────────
# Runs src/geocode-photon.ts in a throwaway node:22-alpine container against
# the local Photon instance. Skipped when SKIP_PHOTON=1, when there are no
# candidates, or when a previous geocode container is still running.
# The container's exit code is logged but does not abort the chain — the
# later stages still run as fallbacks.
if [ "$SKIP_PHOTON" = "1" ]; then
  log "[stage 3] SKIP_PHOTON=1 — skipping photon stage"
else
  # NOTE(review): candidates are selected on geocode_source IS NULL (not
  # lat IS NULL), so rows already placed by stages 1/2 are not sent to
  # Photon — confirm this matches the filter inside geocode-photon.ts.
  remaining_photon=$(psql -At -c "
    SELECT count(*) FROM firms.entities
    WHERE geocode_source IS NULL
      AND adr_strada IS NOT NULL
      AND adr_judet IS NOT NULL
  ")
  if [ "$remaining_photon" = "0" ]; then
    log "[stage 3] no photon-eligible rows — skipping"
  else
    log "[stage 3] photon — $remaining_photon candidates..."
    if docker ps --filter name=vreaudigital-geocode --format '{{.Names}}' | grep -q '^vreaudigital-geocode$'; then
      log "WARN: vreaudigital-geocode already running — skipping stage 3"
    else
      docker rm -f vreaudigital-geocode 2>/dev/null || true
      umask 077
      ENVF=$(mktemp /tmp/.vreaudigital-geocode-env.XXXXXX)
      # The env file carries DATABASE_URL in clear text. Under `set -e` a
      # failing `docker run` aborts the script before the explicit rm below,
      # so guarantee removal on every exit path.
      trap 'rm -f -- "${ENVF:-}"' EXIT
      printf 'DATABASE_URL=%s\nPHOTON_URL=http://127.0.0.1:2322\n' \
        "$DATABASE_URL" > "$ENVF"
      cd "$SEAP_DIR"
      CID=$(docker run -d \
        --name vreaudigital-geocode \
        --network host \
        --env-file "$ENVF" \
        -v "$(pwd):/work" -w /work \
        --user "$(id -u):$(id -g)" \
        --restart no \
        node:22-alpine \
        sh -c "npx tsx src/geocode-photon.ts --concurrency=$PHOTON_CONCURRENCY --batch=$PHOTON_BATCH")
      log "container started: $CID"
      # docker has consumed --env-file by the time `docker run -d` returns.
      sleep 3
      rm -f -- "$ENVF"
      trap - EXIT
      docker wait vreaudigital-geocode >/dev/null
      EXIT_CODE=$(docker inspect -f '{{.State.ExitCode}}' vreaudigital-geocode 2>/dev/null || echo "?")
      docker logs vreaudigital-geocode 2>&1 | tail -10 | tee -a "$LOG"
      log "[stage 3] photon container exit=$EXIT_CODE"
    fi
  fi
fi
# The remaining stages talk to Postgres through the exported PG* variables
# only; the raw URL and the Infisical token are no longer needed.
unset DATABASE_URL TOKEN DB
# ── Stage 3b/3c/3d: uat_centroid by name (no siruta, no postal) ─────────────
# For rows w/o adr_strada (skipped by photon) match postal_codes locality+county
# median. Three normalization variants try locality token, comuna parent, and
# Romanian â/î diacritic normalization.
#
# 3b: the locality key is adr_localitate cut at the first comma, with a
# leading Sat/Oraş/Mun./Loc./Cartier/Comuna prefix removed, then
# unaccented and upper-cased; it is joined to the per-(county, place_name)
# median coordinates from firms.postal_codes.
#
# -q suppresses the "UPDATE n" command tag that psql otherwise prints after
# the RETURNING rows, which would inflate the `wc -l` count by one.
log "[stage 3b] uat_centroid by postal_codes locality+county median (locality token)..."
n=$(psql -q -v ON_ERROR_STOP=1 -At -c "
  WITH cand AS (
    SELECT e.cui, e.adr_judet, e.adr_localitate FROM firms.entities e
    WHERE e.lat IS NULL AND e.adr_judet IS NOT NULL AND e.adr_localitate IS NOT NULL
  ),
  loc_clean AS (
    SELECT
      cui,
      upper(unaccent(regexp_replace(adr_judet,'^MUNICIPIUL ',''))) AS judet_key,
      upper(unaccent(trim(regexp_replace(
        regexp_replace(adr_localitate, ',.*\$', ''),
        '^(Sat|Or[şs]\\.?|Mun\\.?|Loc\\.?|Cartier|Comuna)\\s+', '', 'i'
      )))) AS loc_key
    FROM cand
  ),
  pc_agg AS (
    SELECT
      upper(unaccent(coalesce(county,''))) AS judet_key,
      upper(unaccent(place_name)) AS loc_key,
      percentile_cont(0.5) WITHIN GROUP (ORDER BY lat::double precision) AS lat,
      percentile_cont(0.5) WITHIN GROUP (ORDER BY lng::double precision) AS lng
    FROM firms.postal_codes
    WHERE place_name IS NOT NULL
    GROUP BY 1, 2
  )
  UPDATE firms.entities e
  SET
    lat = pc.lat,
    lng = pc.lng,
    geom = ST_SetSRID(ST_MakePoint(pc.lng, pc.lat), 4326)::geography,
    geocode_source = 'uat_centroid',
    geocode_score = 0.3,
    geocoded_at = now(),
    updated_at = now()
  FROM loc_clean lc
  JOIN pc_agg pc ON pc.judet_key = lc.judet_key AND pc.loc_key = lc.loc_key
  WHERE e.cui = lc.cui AND e.lat IS NULL
  RETURNING 1
" | wc -l)
log "[stage 3b] updated $n rows"
# 3c: for addresses that mention "Comuna <name>", use the comuna name (text
# after "Comuna" up to the next comma) as the locality key. Rows without a
# "Comuna" token produce a NULL key and are excluded by the final WHERE.
#
# -q suppresses the "UPDATE n" command tag that psql otherwise prints after
# the RETURNING rows, which would inflate the `wc -l` count by one.
log "[stage 3c] uat_centroid by Comuna parent..."
n=$(psql -q -v ON_ERROR_STOP=1 -At -c "
  WITH cand AS (
    SELECT e.cui, e.adr_judet, e.adr_localitate FROM firms.entities e
    WHERE e.lat IS NULL AND e.adr_judet IS NOT NULL AND e.adr_localitate IS NOT NULL
  ),
  loc_clean AS (
    SELECT
      cui,
      upper(unaccent(regexp_replace(adr_judet,'^MUNICIPIUL ',''))) AS judet_key,
      upper(unaccent(trim((regexp_match(adr_localitate, 'Comuna\\s+([^,]+)', 'i'))[1]))) AS loc_key
    FROM cand
  ),
  pc_agg AS (
    SELECT
      upper(unaccent(coalesce(county,''))) AS judet_key,
      upper(unaccent(place_name)) AS loc_key,
      percentile_cont(0.5) WITHIN GROUP (ORDER BY lat::double precision) AS lat,
      percentile_cont(0.5) WITHIN GROUP (ORDER BY lng::double precision) AS lng
    FROM firms.postal_codes
    WHERE place_name IS NOT NULL
    GROUP BY 1, 2
  )
  UPDATE firms.entities e
  SET
    lat = pc.lat,
    lng = pc.lng,
    geom = ST_SetSRID(ST_MakePoint(pc.lng, pc.lat), 4326)::geography,
    geocode_source = 'uat_centroid',
    geocode_score = 0.3,
    geocoded_at = now(),
    updated_at = now()
  FROM loc_clean lc
  JOIN pc_agg pc ON pc.judet_key = lc.judet_key AND pc.loc_key = lc.loc_key
  WHERE e.cui = lc.cui AND e.lat IS NULL AND lc.loc_key IS NOT NULL
  RETURNING 1
" | wc -l)
log "[stage 3c] updated $n rows"
# 3d: same join as 3b/3c, but î/Î is mapped to â/Â on BOTH sides before
# unaccenting. The locality key is the first non-NULL of: the name after
# "Oraş", the name after "Comuna", or adr_localitate cut at the first comma
# with a leading Sat/Loc. prefix removed.
#
# -q suppresses the "UPDATE n" command tag that psql otherwise prints after
# the RETURNING rows, which would inflate the `wc -l` count by one.
log "[stage 3d] uat_centroid with â/î normalization (Oraş/Comuna/locality)..."
n=$(psql -q -v ON_ERROR_STOP=1 -At -c "
  WITH cand AS (
    SELECT e.cui, e.adr_judet, e.adr_localitate FROM firms.entities e
    WHERE e.lat IS NULL AND e.adr_judet IS NOT NULL AND e.adr_localitate IS NOT NULL
  ),
  loc_norm AS (
    SELECT
      cui,
      upper(unaccent(regexp_replace(adr_judet,'^MUNICIPIUL ',''))) AS judet_key,
      upper(unaccent(translate(trim(coalesce(
        (regexp_match(adr_localitate, 'Or[şs]\\.?\\s+([^,]+)', 'i'))[1],
        (regexp_match(adr_localitate, 'Comuna\\s+([^,]+)', 'i'))[1],
        regexp_replace(regexp_replace(adr_localitate, ',.*\$',''), '^(Sat|Loc\\.?)\\s+','','i')
      )), 'îÎ', 'âÂ'))) AS loc_key
    FROM cand
  ),
  pc_agg AS (
    SELECT
      upper(unaccent(coalesce(county,''))) AS judet_key,
      upper(unaccent(translate(place_name, 'îÎ','âÂ'))) AS loc_key,
      percentile_cont(0.5) WITHIN GROUP (ORDER BY lat::double precision) AS lat,
      percentile_cont(0.5) WITHIN GROUP (ORDER BY lng::double precision) AS lng
    FROM firms.postal_codes
    WHERE place_name IS NOT NULL
    GROUP BY 1, 2
  )
  UPDATE firms.entities e
  SET
    lat = pc.lat,
    lng = pc.lng,
    geom = ST_SetSRID(ST_MakePoint(pc.lng, pc.lat), 4326)::geography,
    geocode_source = 'uat_centroid',
    geocode_score = 0.3,
    geocoded_at = now(),
    updated_at = now()
  FROM loc_norm ln
  JOIN pc_agg pc ON pc.judet_key = ln.judet_key AND pc.loc_key = ln.loc_key
  WHERE e.cui = ln.cui AND e.lat IS NULL AND ln.loc_key IS NOT NULL
  RETURNING 1
" | wc -l)
log "[stage 3d] updated $n rows"
# ── Stage 4: judet_centroid fallback ────────────────────────────────────────
# Last resort: every row still without coordinates takes the median lat/lng
# of all firms.postal_codes entries in its county (score 0.1). The county key
# is adr_judet without a leading "MUNICIPIUL ", unaccented and upper-cased.
#
# -q suppresses the "UPDATE n" command tag that psql otherwise prints after
# the RETURNING rows, which would inflate the `wc -l` count by one.
log "[stage 4] judet_centroid (county median, last resort)..."
n=$(psql -q -v ON_ERROR_STOP=1 -At -c "
  WITH judet_agg AS (
    SELECT
      upper(unaccent(coalesce(county,''))) AS judet_key,
      percentile_cont(0.5) WITHIN GROUP (ORDER BY lat::double precision) AS lat,
      percentile_cont(0.5) WITHIN GROUP (ORDER BY lng::double precision) AS lng
    FROM firms.postal_codes
    WHERE county IS NOT NULL
    GROUP BY 1
  )
  UPDATE firms.entities e
  SET
    lat = ja.lat,
    lng = ja.lng,
    geom = ST_SetSRID(ST_MakePoint(ja.lng, ja.lat), 4326)::geography,
    geocode_source = 'judet_centroid',
    geocode_score = 0.1,
    geocoded_at = now(),
    updated_at = now()
  FROM judet_agg ja
  WHERE upper(unaccent(regexp_replace(e.adr_judet,'^MUNICIPIUL ',''))) = ja.judet_key
    AND e.lat IS NULL
  RETURNING 1
" | wc -l)
log "[stage 4] updated $n rows"
# ── Final stats ─────────────────────────────────────────────────────────────
# Per-source row counts (pipe-separated, with header and footer) go to both
# the terminal and the log, followed by the number of rows still ungeocoded.
stats_sql="
SELECT
geocode_source,
count(*) AS rows
FROM firms.entities
GROUP BY geocode_source
ORDER BY rows DESC;
"
residual_sql="SELECT count(*) FROM firms.entities WHERE lat IS NULL;"

log "Final stats:"
psql --no-align --field-separator="|" --command="$stats_sql" 2>&1 \
  | tee -a "$LOG"

residual=$(psql --no-align --tuples-only --command="$residual_sql")
log "Residual WHERE lat IS NULL: $residual (out of reach — no address fields)"

log "=== Geocode-firms fallback chain done ==="
# Nothing below needs database access any more.
unset PGPASSWORD
+144
View File
@@ -0,0 +1,144 @@
#!/bin/bash
# Daily data-freshness heartbeat for vreaudigital.ro
#  - Queries max(fetched_at) per primary table across 17 schemas
#  - Compares against per-source expected cadence (days)
#  - Posts a webhook payload if any source is stale beyond threshold
#  - Always exits 0 (alerts are signal, not error — cron noise budget = 1 alert/day)
#
# Run from satra cron at 07:00 daily.
# Designed to be paranoid-safe: never echoes the DB password, never fails
# loud on transient DB blips (only fails when the heartbeat itself can't run).
#
# NB: deliberately no `set -e` — failures are handled explicitly below.
set -uo pipefail

LOG=/var/log/vreaudigital-heartbeat.log
# Timestamped line to stdout and appended to $LOG.
log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*" | tee -a "$LOG"; }

WEBHOOK_URL="https://n8n.beletage.ro/webhook/satra-backup-alert"
HOSTNAME_TAG="vreaudigital"

log "=== Heartbeat started ==="

if [ ! -f /opt/vreaudigital/.infisical-mi ]; then
  log "FATAL: /opt/vreaudigital/.infisical-mi missing"
  exit 1
fi
# shellcheck disable=SC1091
source /opt/vreaudigital/.infisical-mi

# Without `set -e` a failed login/fetch would silently continue with empty
# credentials and end in "psql failed … exit 0", hiding a broken secret
# store behind a success status. Treat it as "heartbeat can't run" instead.
TOKEN=$(infisical login \
  --method=universal-auth \
  --domain="$INFISICAL_API_URL" \
  --client-id="$INFISICAL_CLIENT_ID" \
  --client-secret="$INFISICAL_CLIENT_SECRET" \
  --silent --plain)
if [ -z "$TOKEN" ]; then
  log "FATAL: Infisical login failed (empty token)"
  exit 1
fi

DATABASE_URL=$(infisical run \
  --domain="$INFISICAL_API_URL" \
  --projectId="$INFISICAL_PROJECT_ID" \
  --env="$INFISICAL_ENV" \
  --path="$INFISICAL_PATH" \
  --silent --token="$TOKEN" \
  -- sh -c 'echo "$DATABASE_URL"')

# Split postgresql://user:pass@host[:port]/db[?params] into libpq env vars
# with ONE anchored match. With independent `sed` substitutions, a pattern
# that does not match (e.g. no explicit port) echoes the whole URL —
# password included — into the variable, and psql may then print it in an
# error message that ends up in the log.
pg_url_re='^postgresql://([^:]+):([^@]+)@([^:/]+)(:([0-9]+))?/([^?]+)'
if [[ ! "$DATABASE_URL" =~ $pg_url_re ]]; then
  log "FATAL: DATABASE_URL missing or not of the form postgresql://user:pass@host[:port]/db"
  exit 1
fi
export PGUSER="${BASH_REMATCH[1]}"
export PGPASSWORD="${BASH_REMATCH[2]}"
export PGHOST="${BASH_REMATCH[3]}"
export PGPORT="${BASH_REMATCH[5]:-5432}"   # libpq default when the URL omits it
export PGDATABASE="${BASH_REMATCH[6]}"
unset DATABASE_URL TOKEN
# Per-source cadence query, stored verbatim in $QUERY (quoted heredoc: the
# shell performs no expansion inside it).
#
# The `probes` CTE has one SELECT per source, each yielding
#   (label, expected_days, last_seen)
# where last_seen is the max of that table's freshness column cast to date.
# The outer SELECT emits five columns, in this order:
#   1 label          source name
#   2 expected_days  allowed gap in days
#   3 gap_days       today - last_seen (future dates clamped to today;
#                    9999 when the table has no rows)
#   4 last_seen      date as text, or 'NEVER'
#   5 status         'STALE' when last_seen is NULL or gap_days > expected_days,
#                    otherwise 'OK'
# Rows are ordered by gap_days, largest first.
#
# Sources stuck at known long staleness (anaf datornici Q1 2016) are
# excluded — heartbeat noise budget is for fixable freshness, not known constants.
QUERY=$(cat <<'SQL'
WITH probes AS (
SELECT 'seap.announcements' AS label, 2 AS expected_days, max(publication_date)::date AS last_seen FROM seap.announcements
UNION ALL
SELECT 'seap.wsp_sync_state', 1, max(last_run_at)::date FROM seap.wsp_sync_state
UNION ALL
SELECT 'seap.sync_state(da)', 30, max(updated_at)::date FROM seap.sync_state WHERE source='da'
UNION ALL
SELECT 'firms.entities', 100, max(updated_at)::date FROM firms.entities
UNION ALL
SELECT 'firms.financials', 400, max(fetched_at)::date FROM firms.financials
UNION ALL
SELECT 'fonduri.beneficiar_anunt', 7, max(data_publicare)::date FROM fonduri.beneficiar_anunt
UNION ALL
SELECT 'fonduri.afir_plati', 365, max(fetched_at)::date FROM fonduri.afir_plati
UNION ALL
SELECT 'regas.ajutoare', 45, max(fetched_at)::date FROM regas.ajutoare
UNION ALL
SELECT 'aep.donatii_pj', 60, max(fetched_at)::date FROM aep.donatii_pj
UNION ALL
SELECT 'ani.declaratii', 400, max(fetched_at)::date FROM ani.declaratii
UNION ALL
SELECT 'bugetar.entitate', 60, max(updated_at)::date FROM bugetar.entitate
UNION ALL
SELECT 'anre.licente', 14, max(fetched_at)::date FROM anre.licente
UNION ALL
SELECT 'ancom.operatori', 14, max(fetched_at)::date FROM ancom.operatori
UNION ALL
SELECT 'cnsc.decizii', 14, max(fetched_at)::date FROM cnsc.decizii
UNION ALL
SELECT 'cnas.furnizori', 60, max(fetched_at)::date FROM cnas.furnizori
UNION ALL
SELECT 'asf.entitati', 14, max(fetched_at)::date FROM asf.entitati
UNION ALL
SELECT 'aaas.firme', 30, max(fetched_at)::date FROM aaas.firme
UNION ALL
SELECT 'curteacont.rapoarte', 14, max(fetched_at)::date FROM curteacont.rapoarte
UNION ALL
SELECT 'apia.fermieri', 60, max(fetched_at)::date FROM apia.fermieri
UNION ALL
SELECT 'gnm.comunicate', 14, max(fetched_at)::date FROM gnm.comunicate
)
SELECT label, expected_days,
-- clamp future dates (TED publication-date can be in the future) and
-- treat NULL last_seen as ancient (empty table → alert).
-- NB: LEAST(NULL, x) = x in PG (returns NULL only if all args NULL),
-- so explicit CASE for NULL handling.
CASE WHEN last_seen IS NULL THEN 9999
ELSE (now()::date - LEAST(last_seen, now()::date)) END AS gap_days,
COALESCE(last_seen::text, 'NEVER') AS last_seen,
CASE WHEN last_seen IS NULL THEN 'STALE'
WHEN (now()::date - LEAST(last_seen, now()::date)) > expected_days THEN 'STALE'
ELSE 'OK' END AS status
FROM probes
ORDER BY CASE WHEN last_seen IS NULL THEN 9999
ELSE (now()::date - LEAST(last_seen, now()::date)) END DESC;
SQL
)
# Run the probe query. Output is tab-separated, tuples only:
#   label <TAB> expected_days <TAB> gap_days <TAB> last_seen <TAB> status
# A psql failure is logged and swallowed (exit 0) on purpose: a transient DB
# blip must not turn the heartbeat itself into cron noise.
OUT=$(psql -v ON_ERROR_STOP=1 -A -F$'\t' -t -c "$QUERY" 2>&1) || {
  log "ERROR: psql failed — heartbeat skipped this run"
  log "$OUT"
  exit 0
}
unset PGPASSWORD

# One human-readable line per stale source (column 5 == STALE).
STALE_LIST=$(echo "$OUT" | awk -F'\t' '$5=="STALE" { printf "%s (gap=%sd, expected≤%sd, last=%s)\n", $1, $3, $2, $4 }')
# `grep -c .` counts non-empty lines; it exits 1 on a count of 0, hence `|| true`.
STALE_COUNT=$(echo -n "$STALE_LIST" | grep -c . || true)
TOTAL=$(echo -n "$OUT" | grep -c . || true)

log "Probed $TOTAL sources, $STALE_COUNT stale"
echo "$OUT" | awk -F'\t' '{ printf "  %-30s %s gap=%sd last=%s\n", $1, $5, $3, $4 }' | tee -a "$LOG"

if [ "$STALE_COUNT" -gt 0 ]; then
  log "ALERT — posting to webhook"
  PAYLOAD=$(jq -nc \
    --arg s "STALE" \
    --arg h "$HOSTNAME_TAG" \
    --argjson c "$STALE_COUNT" \
    --argjson t "$TOTAL" \
    --arg d "$STALE_LIST" \
    '{status:$s, host:$h, service:"data-heartbeat", stale_count:$c, total:$t, details:$d}')
  # --fail: without it curl exits 0 on HTTP 4xx/5xx, so a webhook that
  # rejects the alert would be recorded as delivered and the failure line
  # below would never be logged.
  curl -sS --fail -X POST -H "Content-Type: application/json" --max-time 30 \
    -d "$PAYLOAD" "$WEBHOOK_URL" >/dev/null 2>&1 || log "webhook POST failed (non-fatal)"
fi

log "=== Done ==="
exit 0
+132
View File
@@ -0,0 +1,132 @@
#!/bin/bash
# AFIR historical XLSX importer wrapper.
#
# Downloads a yearly AFIR FEADR/FEGA XLSX, normalizes to pipe-TSV, ships to
# satra, COPYs into fonduri.staging_afir, then INSERTs into fonduri.afir_plati
# with source_year tagging.
#
# Idempotent: rows with the matching source_year are deleted before insert
# (XLSX dumps are stateless reflections of AFIR DB at publication time).
# The delete and the insert run in ONE transaction, so a failed insert can
# no longer leave the year empty.
#
# Usage:
#   ./import-afir-historical.sh URL YEAR FUND [LIMIT]
#     URL:    AFIR XLSX direct download URL
#     YEAR:   4-digit source year, e.g. 2023
#     FUND:   'feadr' or 'fega' (informational; schema is identical)
#     LIMIT:  optional integer — only insert first N rows (smoke test)
#
# Example:
#   ./import-afir-historical.sh \
#     'https://www.afir.ro/media/35cm3jdr/listaplati_2023_feadr_actualizata.xlsx' \
#     2023 feadr
#
# Smoke test (1000 rows):
#   ./import-afir-historical.sh '<url>' 2023 feadr 1000
#
# Exit status: 2 on invalid arguments; otherwise the status of the first
# failing remote step (set -e), 0 on success.
set -euo pipefail

URL="${1:?URL required}"
YEAR="${2:?YEAR required}"
FUND="${3:?FUND required (feadr|fega)}"
LIMIT="${4:-}"

if ! [[ "$YEAR" =~ ^20[0-9]{2}$ ]]; then
  echo "[afir-historical] ERROR: YEAR must be 4-digit (got: $YEAR)" >&2
  exit 2
fi
if [[ "$FUND" != "feadr" && "$FUND" != "fega" ]]; then
  echo "[afir-historical] ERROR: FUND must be 'feadr' or 'fega' (got: $FUND)" >&2
  exit 2
fi
# LIMIT ends up inside a remote shell command line (`head -n $LIMIT`), so it
# must be a plain non-negative integer.
if [[ -n "$LIMIT" && ! "$LIMIT" =~ ^[0-9]+$ ]]; then
  echo "[afir-historical] ERROR: LIMIT must be an integer (got: $LIMIT)" >&2
  exit 2
fi

# Built only from the validated YEAR/FUND, hence safe to use unquoted remotely.
WORK_REMOTE="/tmp/afir-historical-$YEAR-$FUND"
# Shell-escape the URL for the remote command line so a quote or other
# metacharacter in it cannot break out of the curl invocation.
URL_Q=$(printf '%q' "$URL")

echo "[afir-historical] === ${YEAR} ${FUND} ==="

# 1. Download (resume-friendly, large file safe). Run on satra to skip the
#    upload-back-to-server hop — the XLSX is 30 MB.
# NOTE(review): -k disables TLS certificate verification for the download;
# kept as-is, but confirm it is still required for the source host.
echo "[afir-historical] downloading on satra..."
ssh satra "mkdir -p $WORK_REMOTE && curl -sLkf --max-time 600 -o $WORK_REMOTE/listaplati.xlsx $URL_Q && ls -lh $WORK_REMOTE/listaplati.xlsx"

# 2. Normalize to pipe-delimited TSV using existing python3-openpyxl on satra.
SCRIPT_DIR="$(cd "$(dirname "$0")/.." && pwd)/scripts"
echo "[afir-historical] uploading normalizer..."
scp -q "$SCRIPT_DIR/import-afir-historical.py" "satra:$WORK_REMOTE/normalize.py"

echo "[afir-historical] normalizing XLSX → TSV (this takes ~2-5 min for 500K rows)..."
ssh satra "python3 $WORK_REMOTE/normalize.py $WORK_REMOTE/listaplati.xlsx $WORK_REMOTE/data.tsv 2>&1 | tail -20"

# 3. Optional smoke-test truncation
TSV_REMOTE="$WORK_REMOTE/data.tsv"
if [ -n "$LIMIT" ]; then
  echo "[afir-historical] LIMIT=$LIMIT — truncating TSV for smoke test..."
  ssh satra "head -n $LIMIT $WORK_REMOTE/data.tsv > $WORK_REMOTE/data.smoke.tsv && wc -l $WORK_REMOTE/data.smoke.tsv"
  TSV_REMOTE="$WORK_REMOTE/data.smoke.tsv"
fi

# 4. Stage + INSERT on Postgres via /tmp/baseline.sh (Infisical-aware psql wrapper).
#    The SQL is a heredoc evaluated by the REMOTE shell; `\\set` / `\\copy`
#    reach psql as `\set` / `\copy`.
echo "[afir-historical] staging + insert..."
ssh satra "/tmp/baseline.sh <<SQL
\\set ON_ERROR_STOP on
TRUNCATE TABLE fonduri.staging_afir;
\\copy fonduri.staging_afir (beneficiar_name, last_name, mama_cui, localitate, cod_masura, obiectiv, data_start, data_end, fega_op, fega_total, feadr_op, feadr_total, op_amount, cofinantare, ue_total) FROM '$TSV_REMOTE' WITH (FORMAT text, DELIMITER '|', NULL '')
SELECT 'staging_loaded' AS step, COUNT(*) AS rows FROM fonduri.staging_afir;
SQL"

# Idempotency: on a full run (LIMIT empty) the prior rows for the year are
# dropped before the reinsert; a LIMIT smoke test does not delete anything.
# NOTE(review): the DELETE is keyed on source_year only, so importing
# `YEAR fega` after `YEAR feadr` removes the feadr rows as well (and a smoke
# test appends rows that a later full run for the same year will remove).
# A per-fund discriminator would be needed to make the two funds independent.
DELETE_SQL=""
if [ -z "$LIMIT" ]; then
  echo "[afir-historical] full run — deleting prior rows for source_year=$YEAR..."
  DELETE_SQL="DELETE FROM fonduri.afir_plati WHERE source_year = $YEAR;"
fi

# DELETE + INSERT share one transaction: with ON_ERROR_STOP a failing INSERT
# ends the session before COMMIT, which rolls the DELETE back.
ssh satra "/tmp/baseline.sh <<SQL
\\set ON_ERROR_STOP on
BEGIN;
$DELETE_SQL
INSERT INTO fonduri.afir_plati (
  source_year, beneficiar_name, last_name, mama_cui, localitate,
  cod_masura, obiectiv, data_start, data_end,
  fega_op, fega_total, feadr_op, feadr_total,
  op_amount, cofinantare, ue_total
)
SELECT
  $YEAR,
  beneficiar_name, NULLIF(last_name, ''), NULLIF(mama_cui, ''), NULLIF(localitate, ''),
  NULLIF(cod_masura, ''), NULLIF(obiectiv, ''), NULLIF(data_start, ''), NULLIF(data_end, ''),
  NULLIF(fega_op, '')::numeric,
  NULLIF(fega_total, '')::numeric,
  NULLIF(feadr_op, '')::numeric,
  NULLIF(feadr_total, '')::numeric,
  NULLIF(op_amount, '')::numeric,
  NULLIF(cofinantare, '')::numeric,
  NULLIF(ue_total, '')::numeric
FROM fonduri.staging_afir;
COMMIT;
SELECT '$YEAR-$FUND' AS run,
  COUNT(*) AS rows_inserted,
  COUNT(DISTINCT beneficiar_name) AS distinct_beneficiars,
  SUM(CASE WHEN feadr_total > 0 THEN 1 END) AS with_feadr,
  SUM(CASE WHEN fega_total > 0 THEN 1 END) AS with_fega,
  SUM(ue_total)::bigint AS sum_ue_eur
FROM fonduri.afir_plati WHERE source_year = $YEAR;
SQL"

# The remote workdir is kept after a smoke test so the files can be inspected.
if [ -z "$LIMIT" ]; then
  echo "[afir-historical] cleaning up remote workdir..."
  ssh satra "rm -rf $WORK_REMOTE"
fi

echo "[afir-historical] === done ($YEAR $FUND) ==="
+210
View File
@@ -0,0 +1,210 @@
#!/bin/bash
# APIA "Lista fermieri" importer wrapper.
#
# Discovers CKAN package "lista-fermierilor-campania-apia-{YEAR}" on
# data.gov.ro and ingests each XLSX resource into apia.fermieri. The
# package can grow over time as more UATs publish their lists; the importer
# is resource-id keyed so re-runs are idempotent (DELETE WHERE
# source_resource_id = X before re-INSERT).
#
# Pattern follows cron/import-afir-historical.sh but simpler — APIA XLSX is
# tiny (KB-MB, not 30 MB), so we don't need streaming COPY tricks; we
# stage on satra and load directly.
#
# Usage:
#   ./import-apia-fermieri.sh             # all years (currently 2024)
#   ./import-apia-fermieri.sh 2024        # only the given year
#   ./import-apia-fermieri.sh 2024 1      # smoke test: only first resource
#
# Requires `jq` and `python3-openpyxl` on satra (already installed).
set -euo pipefail

YEAR_FILTER="${1:-}"        # empty = all years discoverable
RESOURCE_LIMIT="${2:-0}"    # 0 = all resources within selected year(s)

# Private, unpredictable scratch dir; the single-quoted trap expands
# $WORK_LOCAL at exit time and quotes it.
WORK_LOCAL=$(mktemp -d /tmp/apia-import.XXXXXX)
trap 'rm -rf -- "$WORK_LOCAL"' EXIT

SCRIPT_DIR="$(cd "$(dirname "$0")/.." && pwd)/scripts"
NORMALIZER="$SCRIPT_DIR/import-apia-fermieri.py"

# 1. Discover candidate datasets via CKAN search.
#    -f: an HTTP error must fail here, not later as a JSON parse traceback.
echo "[apia-import] discovering CKAN datasets..."
curl -sSLf --max-time 60 \
  "https://data.gov.ro/api/3/action/package_search?q=lista+fermieri+APIA&rows=50" \
  > "$WORK_LOCAL/search.json"

# Emit one TAB-separated line per usable resource:
#   dataset_name  year  resource_id  resource_url  resource_name
# Filter to xlsx resources whose dataset name matches lista-fermier*-apia-*.
# The CKAN response is external data that is later interpolated into remote
# shell command lines and SQL literals, so anything that could break out of
# that quoting is rejected here rather than escaped downstream.
python3 - "$WORK_LOCAL/search.json" "$YEAR_FILTER" > "$WORK_LOCAL/resources.tsv" <<'PY'
import json, re, sys

path, year_filter = sys.argv[1], sys.argv[2]
with open(path) as f:
    d = json.load(f)

SAFE_TOKEN = re.compile(r"[A-Za-z0-9._-]+")   # dataset names / resource ids
UNSAFE_URL = re.compile(r"""[\s'"\\`$]""")    # shell/SQL quoting breakers

results = d.get("result", {}).get("results", [])
out_lines = []
for pkg in results:
    name = pkg.get("name", "")
    if not re.search(r"lista[-_]ferm", name, re.I):
        continue
    # Year extraction from package name (e.g. "lista-fermierilor-campania-apia-2024")
    m = re.search(r"(20\d{2})", name)
    pkg_year = m.group(1) if m else ""
    if year_filter and pkg_year != year_filter:
        continue
    for rs in pkg.get("resources", []):
        fmt = (rs.get("format") or "").upper()
        if fmt not in ("XLSX", "XLS"):
            continue
        rid = rs.get("id") or ""
        rurl = rs.get("url") or ""
        # The name is display-only; keep it on one TSV field / one line.
        rname = re.sub(r"[\t\r\n]", " ", rs.get("name") or "")
        if not (rid and rurl and pkg_year):
            continue
        if (not SAFE_TOKEN.fullmatch(name)
                or not SAFE_TOKEN.fullmatch(rid)
                or UNSAFE_URL.search(rurl)):
            print(f"[apia-import] skipping resource with unsafe id/url: {rid!r}",
                  file=sys.stderr)
            continue
        out_lines.append(f"{name}\t{pkg_year}\t{rid}\t{rurl}\t{rname}")

if out_lines:
    print("\n".join(out_lines))
else:
    # Print NOTHING to stdout: a bare newline would be counted as one
    # resource and the import would run with empty fields.
    print("[apia-import] no matching xlsx resources found", file=sys.stderr)
PY

# Count non-empty lines only. `grep -c` exits 1 on a count of 0, hence `|| true`.
N_RESOURCES=$(grep -c . "$WORK_LOCAL/resources.tsv" || true)
echo "[apia-import] found $N_RESOURCES candidate XLSX resource(s)"
if [ "$N_RESOURCES" -eq 0 ]; then
  exit 0
fi

# Optional smoke truncation (head N). A non-numeric limit makes the test
# fail quietly and is treated as "no limit".
if [ "$RESOURCE_LIMIT" -gt 0 ] 2>/dev/null; then
  head -n "$RESOURCE_LIMIT" "$WORK_LOCAL/resources.tsv" > "$WORK_LOCAL/resources.smoke.tsv"
  mv "$WORK_LOCAL/resources.smoke.tsv" "$WORK_LOCAL/resources.tsv"
  echo "[apia-import] smoke mode — truncated to first $RESOURCE_LIMIT resource(s)"
fi
# 2. Upload normalizer to satra (once).
echo "[apia-import] uploading normalizer..."
ssh satra "mkdir -p /tmp/apia-import"
scp -q "$NORMALIZER" satra:/tmp/apia-import/normalize.py

# 3. For each resource: download → normalize → stage → INSERT.
#    The resource list is read on file descriptor 3, NOT stdin: every `ssh`
#    in the loop body inherits stdin and would otherwise swallow the rest of
#    the list, ending the loop after the first resource.
TOTAL_ROWS=0
TOTAL_INSERTED=0
TOTAL_RESOURCES=0

while IFS=$'\t' read -r DATASET_ID YEAR RESOURCE_ID SOURCE_URL RESOURCE_NAME <&3; do
  # RESOURCE_ID becomes part of a remote path that is `rm -rf`ed below;
  # skip blank or malformed lines instead of operating on /tmp/apia-import/.
  if ! [[ "$RESOURCE_ID" =~ ^[A-Za-z0-9._-]+$ ]]; then
    echo "[apia-import] skipping malformed resource line (id='$RESOURCE_ID')" >&2
    continue
  fi

  TOTAL_RESOURCES=$((TOTAL_RESOURCES + 1))
  WORK_REMOTE="/tmp/apia-import/$RESOURCE_ID"
  echo "[apia-import] === $DATASET_ID / $RESOURCE_ID ($RESOURCE_NAME) ==="

  STARTED_AT=$(date -u +%Y-%m-%dT%H:%M:%S.%3NZ)
  T0=$(date +%s%3N)

  ssh satra "mkdir -p $WORK_REMOTE && curl -sLkf --max-time 120 -o $WORK_REMOTE/listaferm.xlsx '$SOURCE_URL' && ls -lh $WORK_REMOTE/listaferm.xlsx"

  ssh satra "python3 /tmp/apia-import/normalize.py \
    $WORK_REMOTE/listaferm.xlsx $WORK_REMOTE/data.tsv \
    '$YEAR' '$DATASET_ID' '$RESOURCE_ID' '$SOURCE_URL' 2>&1 | tail -5"

  N_TSV=$(ssh satra "wc -l < $WORK_REMOTE/data.tsv")
  # Keep digits only and default to 0 so the arithmetic and the scrape_log
  # INSERT below never see an empty or padded value.
  N_TSV=${N_TSV//[^0-9]/}
  N_TSV=${N_TSV:-0}
  echo "[apia-import] normalized rows: $N_TSV"

  # Idempotent: drop existing rows for this resource_id, then re-INSERT.
  # DELETE + INSERT share one transaction, so with ON_ERROR_STOP a failing
  # INSERT rolls the DELETE back instead of leaving the resource empty.
  ssh satra "/tmp/baseline.sh <<SQL
\\set ON_ERROR_STOP on
TRUNCATE TABLE apia.staging_fermieri;
\\copy apia.staging_fermieri FROM '$WORK_REMOTE/data.tsv' WITH (FORMAT text, DELIMITER '|', NULL '')
SELECT 'staged' AS step, COUNT(*) AS rows FROM apia.staging_fermieri;

BEGIN;
DELETE FROM apia.fermieri WHERE source_resource_id = '$RESOURCE_ID';

-- Dedupe within the staging set on the natural key (UAT XLSXes occasionally
-- list the same farmer twice for separate parcel categories). Pick the row
-- with max suprafata_ha so we don't lose the larger declaration.
INSERT INTO apia.fermieri (
  campaign_year, name, comuna_oras, sat, centru_apia,
  responsabil_uat, suprafata_ha,
  source_dataset_id, source_resource_id, source_url
)
SELECT DISTINCT ON (campaign_year::smallint, name, NULLIF(comuna_oras,''), NULLIF(sat,''))
  campaign_year::smallint,
  name,
  NULLIF(comuna_oras, ''),
  NULLIF(sat, ''),
  NULLIF(centru_apia, ''),
  NULLIF(responsabil_uat, ''),
  NULLIF(suprafata_ha, '')::numeric,
  source_dataset_id,
  source_resource_id,
  source_url
FROM apia.staging_fermieri
ORDER BY campaign_year::smallint, name, NULLIF(comuna_oras,''), NULLIF(sat,''),
  NULLIF(suprafata_ha,'')::numeric DESC NULLS LAST
ON CONFLICT (campaign_year, name, comuna_oras, sat) DO UPDATE
  SET centru_apia = EXCLUDED.centru_apia,
      responsabil_uat = EXCLUDED.responsabil_uat,
      suprafata_ha = EXCLUDED.suprafata_ha,
      source_dataset_id = EXCLUDED.source_dataset_id,
      source_resource_id = EXCLUDED.source_resource_id,
      source_url = EXCLUDED.source_url,
      fetched_at = now();
COMMIT;

SELECT 'inserted' AS step,
  COUNT(*) AS rows_now
FROM apia.fermieri WHERE source_resource_id = '$RESOURCE_ID';
SQL"

  N_NOW=$(ssh satra "/tmp/baseline.sh -t -A -c \"SELECT COUNT(*) FROM apia.fermieri WHERE source_resource_id = '$RESOURCE_ID';\" 2>/dev/null | tail -1")
  N_NOW=${N_NOW//[^0-9]/}
  N_NOW=${N_NOW:-0}
  echo "[apia-import] inserted rows for $RESOURCE_ID: $N_NOW"

  T1=$(date +%s%3N)
  DURATION=$((T1 - T0))

  # Log the run
  ssh satra "/tmp/baseline.sh -c \"
    INSERT INTO apia.scrape_log (
      source_dataset_id, source_resource_id, source_url, campaign_year,
      rows_seen, rows_inserted, duration_ms, started_at
    ) VALUES (
      '$DATASET_ID', '$RESOURCE_ID', '$SOURCE_URL', $YEAR,
      $N_TSV, $N_NOW, $DURATION, '$STARTED_AT'
    );\" 2>&1 | tail -2"

  TOTAL_ROWS=$((TOTAL_ROWS + N_TSV))
  TOTAL_INSERTED=$((TOTAL_INSERTED + N_NOW))

  ssh satra "rm -rf $WORK_REMOTE"
done 3< "$WORK_LOCAL/resources.tsv"

# 4. CUI matcher
echo "[apia-import] matching CUI..."
ssh satra "/tmp/baseline.sh -c 'SELECT * FROM apia.match_cui();' 2>&1 | tail -10"

# 5. Refresh MV
echo "[apia-import] refreshing materialized view..."
ssh satra "/tmp/baseline.sh -c 'REFRESH MATERIALIZED VIEW apia.mv_per_cui;' 2>&1 | tail -5"

# 6. Final summary
echo "[apia-import] === SUMMARY ==="
ssh satra "/tmp/baseline.sh <<'SQL'
SELECT
  'totals' AS metric,
  COUNT(*) AS rows_total,
  COUNT(DISTINCT source_resource_id) AS resources,
  COUNT(DISTINCT comuna_oras) AS comune,
  COUNT(DISTINCT centru_apia) AS centre_apia,
  ROUND(SUM(suprafata_ha)::numeric, 2) AS total_ha,
  COUNT(*) FILTER (WHERE cui IS NOT NULL) AS rows_with_cui,
  COUNT(*) FILTER (WHERE is_legal_person) AS rows_pj
FROM apia.fermieri;
SQL"

echo "[apia-import] === done ($TOTAL_RESOURCES resource(s), $TOTAL_INSERTED rows) ==="
@@ -0,0 +1,526 @@
#!/bin/bash
# Historical financial backfill 2015-2019 from data.gov.ro / MFP.
#
# Why a separate script: 2015 and pre-2020 files have slightly different
# schemas (WEB_UU 2015 has 21 cols vs 22 for 2016+; WEB_BL_BS_SL 2015 has 23
# cols vs 22 for 2016+; WEB_INST_DE_CREDIT 2016/2017/2019 has 23 cols vs 25
# for 2024). The daily importer (import-financials.sh +
# import-financials-ong-banks.sh) assumes the 2020+ schema and silently fails
# or rejects older years. This wrapper:
# 1) Downloads the right files from data.gov.ro for the requested years.
# 2) Loads them via a session-local TEMP TABLE matched to that year's column
# count, then INSERTs into the canonical firms.financials* tables.
#
# Usage on satra:
# /opt/vreaudigital/services/seap-scraper/cron/import-financials-historical.sh
# YEARS="2017 2018" /opt/...../import-financials-historical.sh # subset
#
# Idempotent — PK (cui, year) + ON CONFLICT DO UPDATE.
#
# Banks: 2015 and 2018 have no Inst_de_credit file at data.gov.ro. Banks for
# 2016/2017/2019 use the pre-IFRS schema (21 indicators), so this script also
# loads pre-2020 bank files into firms.financials_banks with the JSONB
# `indicators` column carrying everything; the typed columns are mapped
# best-effort (i21 instead of i23 → cifra_afaceri).
set -uo pipefail
DATA_DIR=/opt/vreaudigital/data/mfinante
LOG=/var/log/vreaudigital-fin-historical.log
# log MESSAGE — print a timestamped line to stdout and append it to $LOG.
log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG"; }
mkdir -p "$DATA_DIR"

# ── DB credentials ──
# The sourced file provides the INFISICAL_* variables used below.
source /opt/vreaudigital/.infisical-mi
TOKEN=$(infisical login --method=universal-auth --domain="$INFISICAL_API_URL" \
  --client-id="$INFISICAL_CLIENT_ID" --client-secret="$INFISICAL_CLIENT_SECRET" \
  --silent --plain)
DBURL=$(infisical run --domain="$INFISICAL_API_URL" \
  --projectId="$INFISICAL_PROJECT_ID" --env="$INFISICAL_ENV" \
  --path="$INFISICAL_PATH" --silent --token="$TOKEN" \
  -- sh -c 'echo "$DATABASE_URL"')
# Strip the "schema=" query parameter and a dangling "?" from the URL.
DB=$(printf '%s\n' "$DBURL" | sed -E 's/[?&]schema=[^&]*//; s/\?$//')

# Split the URL into libpq environment variables. `sed -n … p` prints only
# when the pattern matches; a plain `s|…|…|` would echo the WHOLE URL
# (password included) into the variable on a non-match — e.g. PGPORT for a
# URL without an explicit port.
PGUSER=$(printf '%s\n' "$DB" | sed -nE 's|^postgresql://([^:]+):.*|\1|p')
PGPASSWORD=$(printf '%s\n' "$DB" | sed -nE 's|^postgresql://[^:]+:([^@]+)@.*|\1|p')
PGHOST=$(printf '%s\n' "$DB" | sed -nE 's|^postgresql://[^@]+@([^:/]+).*|\1|p')
PGPORT=$(printf '%s\n' "$DB" | sed -nE 's|^postgresql://[^@]+@[^:]+:([0-9]+)/.*|\1|p')
PGDATABASE=$(printf '%s\n' "$DB" | sed -nE 's|^postgresql://[^@]+@[^/]+/([^?&]+).*|\1|p')
PGPORT="${PGPORT:-5432}"   # URL without a port → PostgreSQL default
unset DBURL TOKEN DB

# Without a host and database every psql call below would fall back to libpq
# defaults; stop early instead. The URL itself is never logged (it carries
# the password).
if [ -z "$PGHOST" ] || [ -z "$PGDATABASE" ]; then
  log "[ERR] DATABASE_URL missing or unparsable (Infisical fetch failed?) — aborting"
  exit 1
fi
export PGUSER PGPASSWORD PGHOST PGPORT PGDATABASE

YEARS="${YEARS:-2015 2016 2017 2018 2019}"
log "=== Historical financial import started (YEARS=$YEARS) ==="
# Discover a download URL in a data.gov.ro (CKAN) dataset by resource name.
# Args:    $1 slug     dataset id passed to package_show
#          $2 pattern  Python regex, matched case-insensitively with
#                      re.search against each resource's "name"
# Outputs: URL of the first matching resource on stdout; nothing if no
#          resource matches.
# Returns: non-zero when the API request fails or the response is not JSON.
# The pattern is handed to Python as an argument (sys.argv) rather than
# spliced into the program text, so quote characters in it cannot alter
# the code.
discover() {
  local slug="$1"
  local pattern="$2"
  curl -fsSL --max-time 30 "https://data.gov.ro/api/3/action/package_show?id=$slug" 2>/dev/null \
    | python3 -c '
import json, re, sys
try:
    d = json.load(sys.stdin)
except ValueError:
    # empty or non-JSON body (e.g. curl failed upstream)
    sys.exit(1)
pat = re.compile(sys.argv[1], re.I)
for r in (d.get("result") or {}).get("resources") or []:
    # "name" may be missing or null in CKAN metadata
    if pat.search(r.get("name") or ""):
        print(r.get("url") or "")
        break
' "$pattern"
}
# Download a file from data.gov.ro if not already present.
# Args:    $1 local_path  destination file
#          $2 url         source URL (may be empty when discovery failed)
# Returns: 0 when the file already exists non-empty or was downloaded,
#          1 on empty URL, failed download or empty response.
# The download goes to "<local_path>.part" and is renamed only after curl
# succeeded: an interrupted run can then never leave a truncated file at
# local_path, which the "already exists" check would otherwise accept
# forever.
fetch() {
  local file="$1"
  local url="$2"
  local part="${file}.part"
  if [ -s "$file" ]; then
    log " [SKIP] $file already exists ($(stat -c%s "$file") bytes)"
    return 0
  fi
  if [ -z "$url" ]; then
    log " [ERR] No URL for $file"
    return 1
  fi
  log " Downloading $url -> $file"
  if ! curl -fsL --max-time 300 -o "$part" "$url"; then
    log " [ERR] download failed"
    rm -f "$part"
    return 1
  fi
  if [ ! -s "$part" ]; then
    log " [ERR] download produced an empty file"
    rm -f "$part"
    return 1
  fi
  if ! mv -f "$part" "$file"; then
    log " [ERR] could not move download into place"
    rm -f "$part"
    return 1
  fi
  log " OK $(stat -c%s "$file") bytes"
}
# ─── WEB_UU (companies, prescurtat) ──────────────────────────────────────
# import_uu YEAR
# Loads web_uu_YEAR.txt (downloading it first when absent) into
# firms.staging_financials and upserts it into firms.financials with
# source 'mfinante:WEB_UU'.
# Returns: non-zero when the download, TRUNCATE, COPY or UPSERT fails.
#
# Every psql step is checked: the script runs without `set -e`, so an
# unchecked TRUNCATE/COPY failure would let the UPSERT run on stale or empty
# staging data and the function would still report success.
import_uu() {
  local year="$1"
  local file="$DATA_DIR/web_uu_${year}.txt"
  local slug="situatii_financiare_${year}"
  local pattern="^web_uu.*${year}\\.txt$"
  local url ncols copy_cols select_cols upsert_note

  if [ "$year" = "2015" ]; then
    # 2015 schema (21 cols: CUI,CAEN,I1..I19). It omits the modern I12
    # (patrimoniul_regiei) and shifts everything from cifra_afaceri onward
    # one position left:
    #   2015 I12..I18 ↔ modern I13..I19 (cifra_afaceri..pierdere_neta)
    #   2015 I19      ↔ modern I20      (numar_salariati)
    ncols=21
    copy_cols="cui, caen, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15, i16, i17, i18, i19"
    select_cols="i1, i2, i3, i4, i5, i6, i7, i8, i9,
i10, i11,
NULL::numeric(20,2),
i12, i13, i14, i15, i16, i17, i18,
CASE WHEN i19 BETWEEN 0 AND 100000000 THEN i19::bigint ELSE NULL END"
    upsert_note=" (2015 left-shift remap)"
  else
    # Standard schema (2016+): CUI,CAEN,I1..I20. I20 = salariati.
    ncols=22
    copy_cols="cui, caen, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15, i16, i17, i18, i19, i20"
    select_cols="i1, i2, i3, i4, i5, i6, i7, i8, i9,
i10, i11, i12, i13, i14, i15, i16, i17, i18, i19,
CASE WHEN i20 BETWEEN 0 AND 100000000 THEN i20::bigint ELSE NULL END"
    upsert_note=""
  fi

  if [ ! -s "$file" ]; then
    url=$(discover "$slug" "$pattern")
    fetch "$file" "$url" || return 1
  fi

  log "[$year/WEB_UU] COPY $file ($(stat -c%s "$file") bytes, $ncols cols)..."
  psql -v ON_ERROR_STOP=1 -c "TRUNCATE TABLE firms.staging_financials;" \
    || { log "[$year/WEB_UU] [ERR] TRUNCATE staging failed"; return 1; }
  psql -v ON_ERROR_STOP=1 <<COPYEOF || { log "[$year/WEB_UU] [ERR] COPY failed"; return 1; }
\\copy firms.staging_financials ($copy_cols) FROM '$file' WITH (FORMAT csv, DELIMITER ',', HEADER true, NULL '');
COPYEOF

  log "[$year/WEB_UU] UPSERT${upsert_note}..."
  # NOTE(review): on conflict only `source` and `caen` are touched — the
  # financial values of an existing (cui, year) row are never refreshed, and
  # a row previously loaded from another source gets relabelled WEB_UU while
  # keeping its old values. Confirm this is intended.
  psql -v ON_ERROR_STOP=1 <<SQL
INSERT INTO firms.financials (
cui, year, caen,
active_imobilizate, active_circulante, stocuri, creante, casa_banci,
cheltuieli_avans, datorii, venituri_avans, provizioane,
capitaluri_total, capital_subscris, patrimoniul_regiei,
cifra_afaceri, venituri_total, cheltuieli_total,
profit_brut, pierdere_bruta, profit_net, pierdere_neta,
numar_salariati, source
)
SELECT DISTINCT ON (cui)
cui, $year, caen,
$select_cols,
'mfinante:WEB_UU'
FROM firms.staging_financials
WHERE cui IS NOT NULL AND cui != '' AND cui != '0'
ORDER BY cui
ON CONFLICT (cui, year) DO UPDATE SET
source = CASE
WHEN firms.financials.source = 'mfinante:WEB_UU' THEN firms.financials.source
ELSE EXCLUDED.source
END,
caen = CASE
WHEN firms.financials.source = 'mfinante:WEB_UU' THEN firms.financials.caen
ELSE EXCLUDED.caen
END;
SQL
}
# ─── WEB_BL_BS_SL ────────────────────────────────────────────────────────
# import_bl YEAR
# Loads web_bl_bs_sl_YEAR.txt (downloading it first when absent) and upserts
# it into firms.financials with source 'mfinante:WEB_BL_BS_SL'. Rows already
# owned by 'mfinante:WEB_UU' keep their source and caen.
# Returns: non-zero when the download or any psql step fails. The steps are
# checked explicitly because the script runs without `set -e`.
# NOTE(review): on conflict only `source` and `caen` are updated, never the
# financial values — confirm this is intended.
import_bl() {
  local year="$1"
  local file="$DATA_DIR/web_bl_bs_sl_${year}.txt"
  local slug="situatii_financiare_${year}"
  local pattern="^web_bl_bs_sl.*${year}\\.txt$"
  local url ncols
  case "$year" in
    2015) ncols=23 ;; # has extra I21
    *) ncols=22 ;;
  esac
  if [ ! -s "$file" ]; then
    url=$(discover "$slug" "$pattern")
    fetch "$file" "$url" || return 1
  fi
  log "[$year/WEB_BL_BS_SL] COPY $file ($(stat -c%s "$file") bytes, $ncols cols)..."

  if [ "$ncols" -eq 22 ]; then
    psql -v ON_ERROR_STOP=1 -c "TRUNCATE TABLE firms.staging_financials;" \
      || { log "[$year/WEB_BL_BS_SL] [ERR] TRUNCATE staging failed"; return 1; }
    psql -v ON_ERROR_STOP=1 <<COPYEOF || { log "[$year/WEB_BL_BS_SL] [ERR] COPY failed"; return 1; }
\\copy firms.staging_financials (cui, caen, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15, i16, i17, i18, i19, i20) FROM '$file' WITH (FORMAT csv, DELIMITER ',', HEADER true, NULL '');
COPYEOF
    log "[$year/WEB_BL_BS_SL] UPSERT..."
    psql -v ON_ERROR_STOP=1 <<SQL
INSERT INTO firms.financials (
cui, year, caen,
active_imobilizate, active_circulante, stocuri, creante, casa_banci,
cheltuieli_avans, datorii, venituri_avans, provizioane,
capitaluri_total, capital_subscris, patrimoniul_regiei,
cifra_afaceri, venituri_total, cheltuieli_total,
profit_brut, pierdere_bruta, profit_net, pierdere_neta,
numar_salariati, source
)
SELECT DISTINCT ON (cui)
cui, $year, caen,
i1, i2, i3, i4, i5, i6, i7, i8, i9,
i10, i11, i12, i13, i14, i15, i16, i17, i18, i19,
CASE WHEN i20 BETWEEN 0 AND 100000000 THEN i20::bigint ELSE NULL END,
'mfinante:WEB_BL_BS_SL'
FROM firms.staging_financials
WHERE cui IS NOT NULL AND cui != '' AND cui != '0'
ORDER BY cui
ON CONFLICT (cui, year) DO UPDATE SET
source = CASE
WHEN firms.financials.source = 'mfinante:WEB_UU' THEN firms.financials.source
ELSE EXCLUDED.source
END,
caen = CASE
WHEN firms.financials.source = 'mfinante:WEB_UU' THEN firms.financials.caen
ELSE EXCLUDED.caen
END;
SQL
    return
  fi

  # 2015 BL_BS_SL schema (23 cols, CUI,CAEN,I1..I21). The pre-2016 BL
  # reporting has an extra (unknown) field between capital_subscris (I11)
  # and cifra_afaceri. Empirically (cross-checked CUI 538310 against the
  # 2016-2024 series) cifra_afaceri lives at I14 and salariati at I21. Map:
  #   2015 BL I1..I11   = modern I1..I11
  #   2015 BL I12       → patrimoniul_regiei (modern I12)
  #   2015 BL I13       → dropped (unknown)
  #   2015 BL I14       → cifra_afaceri (modern I13)
  #   2015 BL I15..I20  → modern I14..I19
  #   2015 BL I21       → numar_salariati (modern I20)
  # Table creation, COPY and INSERT share one psql session because the TEMP
  # table only lives for that session; ON_ERROR_STOP aborts at the first
  # failing statement.
  psql -v ON_ERROR_STOP=1 <<COPYEOF || { log "[$year/WEB_BL_BS_SL] [ERR] 2015 load failed"; return 1; }
CREATE TEMP TABLE tmp_bl23 (
cui text, caen text,
i1 numeric(20,2), i2 numeric(20,2), i3 numeric(20,2), i4 numeric(20,2),
i5 numeric(20,2), i6 numeric(20,2), i7 numeric(20,2), i8 numeric(20,2),
i9 numeric(20,2), i10 numeric(20,2), i11 numeric(20,2), i12 numeric(20,2),
i13 numeric(20,2), i14 numeric(20,2), i15 numeric(20,2), i16 numeric(20,2),
i17 numeric(20,2), i18 numeric(20,2), i19 numeric(20,2), i20 numeric(20,2),
i21 numeric(20,2)
); -- session-scoped; dropped when psql exits
\\copy tmp_bl23 FROM '$file' WITH (FORMAT csv, DELIMITER ',', HEADER true, NULL '');
INSERT INTO firms.financials (
cui, year, caen,
active_imobilizate, active_circulante, stocuri, creante, casa_banci,
cheltuieli_avans, datorii, venituri_avans, provizioane,
capitaluri_total, capital_subscris, patrimoniul_regiei,
cifra_afaceri, venituri_total, cheltuieli_total,
profit_brut, pierdere_bruta, profit_net, pierdere_neta,
numar_salariati, source
)
SELECT DISTINCT ON (cui)
cui, $year, caen,
i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11,
i12, -- patrimoniul_regiei
i14, i15, i16, i17, i18, i19, i20, -- cifra_afaceri..pierdere_neta
CASE WHEN i21 BETWEEN 0 AND 100000000 THEN i21::bigint ELSE NULL END,
'mfinante:WEB_BL_BS_SL'
FROM tmp_bl23
WHERE cui IS NOT NULL AND cui != '' AND cui != '0'
ORDER BY cui
ON CONFLICT (cui, year) DO UPDATE SET
source = CASE
WHEN firms.financials.source = 'mfinante:WEB_UU' THEN firms.financials.source
ELSE EXCLUDED.source
END,
caen = CASE
WHEN firms.financials.source = 'mfinante:WEB_UU' THEN firms.financials.caen
ELSE EXCLUDED.caen
END;
COPYEOF
}
# ─── WEB_ONG (49 cols; the 2018 file has 51 and is reduced to 49 first) ──
# import_ong YEAR
# Loads web_ong_YEAR.txt (downloading it first when absent) into
# firms.staging_ong and upserts it into firms.financials_ong.
# Accepted layouts, detected from the header line:
#   49 columns — loaded as-is (CUI,CAEN,CAENO,I1..I46);
#   51 columns — the 2018 layout with two extra free-text columns, reduced
#                to 49 columns by the embedded Python step first.
# Any other column count is logged and skipped (returns 0).
# Returns: non-zero when the download, preprocessing or a psql step fails.
# Steps are checked explicitly because the script runs without `set -e`.
import_ong() {
  local year="$1"
  local file="$DATA_DIR/web_ong_${year}.txt"
  local slug="situatii_financiare_${year}"
  local url header_cols copy_from cleaned="" rc k
  local ong_cols="cui, caen, caeno" ind_pairs=""

  if [ ! -s "$file" ]; then
    url=$(discover "$slug" "^web_ong.*${year}\\.txt$")
    fetch "$file" "$url" || return 1
  fi

  # Column list for \copy and the key/value list for the JSONB blob are
  # generated instead of being written out 46 times.
  for ((k = 1; k <= 46; k++)); do
    ong_cols="$ong_cols, i${k}"
    ind_pairs="$ind_pairs 'i${k}', NULLIF(i${k}, ''),"
  done
  ind_pairs="${ind_pairs%,}"

  header_cols=$(head -1 "$file" | tr ',' '\n' | wc -l)
  log "[$year/WEB_ONG] COPY $file ($(stat -c%s "$file") bytes, $header_cols cols)..."
  psql -v ON_ERROR_STOP=1 -c "TRUNCATE TABLE firms.staging_ong;" \
    || { log "[$year/WEB_ONG] [ERR] TRUNCATE staging failed"; return 1; }

  if [ "$header_cols" -eq 49 ]; then
    copy_from="$file"
  elif [ "$header_cols" -eq 51 ]; then
    # 2018 schema: ...,I44,DEN_CAENO,I45,DEN_CAEN,I46 — extra UNQUOTED text
    # columns whose contents contain commas, which breaks naive CSV parsing.
    cleaned="${file}.cleaned49"
    copy_from="$cleaned"
    log "[$year/WEB_ONG] Preprocessing 51→49 cols (stripping DEN_CAEN/DEN_CAENO)..."
    python3 - "$file" "$cleaned" <<'PYEOF' || { log "[$year/WEB_ONG] [ERR] preprocessing failed"; rm -f "$cleaned"; return 1; }
import re
import sys

src, dst = sys.argv[1], sys.argv[2]
# A field counts as "numeric" when it is a plain number or empty.
NUM_RE = re.compile(r'^-?\d+(\.\d+)?$|^$')
skipped = 0
with open(src) as fh, open(dst, 'w') as out:
    header = fh.readline().rstrip('\n').split(',')
    # Reduced header: drop the two text columns by name.
    keep = [i for i, h in enumerate(header) if h.upper() not in ('DEN_CAEN', 'DEN_CAENO')]
    out.write(','.join(header[i] for i in keep) + '\n')
    for line in fh:
        parts = line.rstrip('\n').split(',')
        n = len(parts)
        # The last field is i46. Walking backwards, the first numeric/empty
        # field is i45 (everything between it and i46 is DEN_CAEN text); the
        # next numeric/empty field before that is i44 (the text in between
        # is DEN_CAENO).
        i46_idx = n - 1
        j = n - 2
        while j >= 0 and not NUM_RE.match(parts[j]):
            j -= 1
        i45_idx = j
        j -= 1
        while j >= 0 and not NUM_RE.match(parts[j]):
            j -= 1
        i44_idx = j
        if i44_idx < 0 or i45_idx < 0:
            skipped += 1  # malformed row
            continue
        new_parts = parts[:i44_idx + 1] + [parts[i45_idx]] + [parts[i46_idx]]
        if len(new_parts) != 49:
            skipped += 1  # does not fit the 49-column output
            continue
        out.write(','.join(new_parts) + '\n')
if skipped:
    sys.stderr.write('skipped %d row(s) that could not be reduced to 49 columns\n' % skipped)
PYEOF
    log "[$year/WEB_ONG] Cleaned $(wc -l < "$cleaned") lines (incl. header)"
  else
    log "[$year/WEB_ONG] unexpected col count $header_cols, skipping"
    return 0
  fi

  psql -v ON_ERROR_STOP=1 <<COPYEOF
\\copy firms.staging_ong ($ong_cols) FROM '$copy_from' WITH (FORMAT csv, DELIMITER ',', HEADER true, NULL '');
COPYEOF
  rc=$?
  # The reduced copy is temporary; remove it whether or not COPY worked.
  if [ -n "$cleaned" ]; then rm -f "$cleaned"; fi
  if [ "$rc" -ne 0 ]; then
    log "[$year/WEB_ONG] [ERR] COPY failed"
    return 1
  fi

  log "[$year/WEB_ONG] UPSERT..."
  psql -v ON_ERROR_STOP=1 <<SQL
INSERT INTO firms.financials_ong (
cui, year, caen, caeno,
capitaluri_proprii, venituri_total, cheltuieli_total, excedent,
personal_neeconomic, personal_economic, indicators
)
SELECT DISTINCT ON (cui)
cui, $year, caen, caeno,
NULLIF(i12, '')::numeric(20,2),
NULLIF(i38, '')::numeric(20,2),
NULLIF(i40, '')::numeric(20,2),
NULLIF(i42, '')::numeric(20,2),
CASE WHEN NULLIF(i45, '') ~ '^[0-9]+\$' AND NULLIF(i45, '')::bigint BETWEEN 0 AND 100000000 THEN i45::bigint ELSE NULL END,
CASE WHEN NULLIF(i46, '') ~ '^[0-9]+\$' AND NULLIF(i46, '')::bigint BETWEEN 0 AND 100000000 THEN i46::bigint ELSE NULL END,
jsonb_strip_nulls(jsonb_build_object($ind_pairs))
FROM firms.staging_ong
WHERE cui IS NOT NULL AND cui != '' AND cui != '0'
ORDER BY cui
ON CONFLICT (cui, year) DO UPDATE SET
caen = EXCLUDED.caen,
caeno = EXCLUDED.caeno,
capitaluri_proprii = EXCLUDED.capitaluri_proprii,
venituri_total = EXCLUDED.venituri_total,
cheltuieli_total = EXCLUDED.cheltuieli_total,
excedent = EXCLUDED.excedent,
personal_neeconomic = EXCLUDED.personal_neeconomic,
personal_economic = EXCLUDED.personal_economic,
indicators = EXCLUDED.indicators,
fetched_at = now();
SQL
}
# ─── WEB_INST_DE_CREDIT (banks) — pre-IFRS schemas vary by year ─────────
# 2015: not published. 2016/2017/2019: 23 cols (I1..I21). 2018: not published.
# 2020/2021/2022: 23 cols (I21). 2023: 24 cols (I22). 2024: 25 cols (I23).
#
# import_bank YEAR
# Loads web_inst_de_credit_YEAR.txt (downloading it first when absent) through
# a session-local TEMP table sized to the file's header and upserts it into
# firms.financials_banks.
# Returns: 0 on success, and also when the dataset has no bank file or the
#          header has an unsupported column count (both are logged);
#          non-zero when the download or the psql session fails.
import_bank() {
  local year="$1"
  local file="$DATA_DIR/web_inst_de_credit_${year}.txt"
  local slug="situatii_financiare_${year}"
  # Datasets whose slug does not follow situatii_financiare_<year>.
  case "$year" in
    2020) slug="situatii_financiare_2021" ;;
    2023) slug="situatii_financiare2023" ;;
  esac
  local url
  if [ ! -s "$file" ]; then
    url=$(discover "$slug" "^web_(inst|instit)_de_credit.*${year}\\.txt$")
    if [ -z "$url" ]; then log "[$year/BANK] no file in dataset, skip"; return 0; fi
    fetch "$file" "$url" || return 1
  fi
  # Detect column count from header line.
  local header_cols
  header_cols=$(head -1 "$file" | tr ',' '\n' | wc -l)
  log "[$year/BANK] $file ($(stat -c%s "$file") bytes, $header_cols cols)"

  # Typed-column mapping per schema: cifra_afaceri is the last indicator
  # (i23 in the 25-col file, i22 in the 24-col file, i21 in the 23-col file)
  # and the two profit columns shift with it.
  local ind_n cifra_col profit_inainte_col profit_exerc_col capital_col activ_col
  local cols_def cols_list ind_pairs
  local i   # loop counter; kept local so it cannot leak into the caller
  ind_n=$(( header_cols - 2 )) # i1..iN
  case "$ind_n" in
    21) cifra_col=i21; profit_inainte_col=i17; profit_exerc_col=i20; capital_col=i14; activ_col=i6 ;;
    22) cifra_col=i22; profit_inainte_col=i18; profit_exerc_col=i21; capital_col=i14; activ_col=i6 ;;
    23) cifra_col=i23; profit_inainte_col=i19; profit_exerc_col=i22; capital_col=i14; activ_col=i6 ;;
    *) log "[$year/BANK] unexpected indicator count $ind_n, skipping"; return 0 ;;
  esac

  # Build the TEMP table definition, the \copy column list and the JSONB
  # key/value list for i1..iN.
  cols_def="cui text, caen text"
  cols_list="cui, caen"
  ind_pairs=""
  for ((i = 1; i <= ind_n; i++)); do
    cols_def="$cols_def, i${i} text"
    cols_list="$cols_list, i${i}"
    ind_pairs="$ind_pairs 'i${i}', NULLIF(i${i}, ''),"
  done
  ind_pairs="${ind_pairs%,}"

  # One psql session for CREATE/COPY/INSERT: the TEMP table exists only
  # within it, and ON_ERROR_STOP aborts at the first failing statement.
  psql -v ON_ERROR_STOP=1 <<COPYEOF || { log "[$year/BANK] [ERR] load failed"; return 1; }
CREATE TEMP TABLE tmp_bank (
$cols_def
); -- session-scoped; dropped when psql exits
\\copy tmp_bank ($cols_list) FROM '$file' WITH (FORMAT csv, DELIMITER ',', HEADER true, NULL '');
INSERT INTO firms.financials_banks (
cui, year, caen,
active_financiare_amortiz, capital_social, profit_exercitiu,
profit_inainte_impozit, cifra_afaceri, indicators, source
)
SELECT DISTINCT ON (cui)
cui, $year, caen,
NULLIF($activ_col, '')::numeric(20,2),
NULLIF($capital_col, '')::numeric(20,2),
NULLIF($profit_exerc_col, '')::numeric(20,2),
NULLIF($profit_inainte_col, '')::numeric(20,2),
NULLIF($cifra_col, '')::numeric(20,2),
jsonb_strip_nulls(jsonb_build_object($ind_pairs)),
'mfinante:WEB_Inst_de_credit'
FROM tmp_bank
WHERE cui IS NOT NULL AND cui != '' AND cui != '0'
ORDER BY cui
ON CONFLICT (cui, year) DO UPDATE SET
caen = EXCLUDED.caen,
active_financiare_amortiz = EXCLUDED.active_financiare_amortiz,
capital_social = EXCLUDED.capital_social,
profit_exercitiu = EXCLUDED.profit_exercitiu,
profit_inainte_impozit = EXCLUDED.profit_inainte_impozit,
cifra_afaceri = EXCLUDED.cifra_afaceri,
indicators = EXCLUDED.indicators,
source = EXCLUDED.source,
fetched_at = now();
COPYEOF
}
# CATEGORIES env var filters which sub-imports run. Default = all.
# Useful: CATEGORIES="bank" to skip companies and only redo banks.
# Valid entries: uu bl ong bank. Anything else is reported instead of being
# silently ignored, so a typo cannot turn the run into a no-op.
CATEGORIES="${CATEGORIES:-uu bl ong bank}"
for YEAR in $YEARS; do
  log "── Year $YEAR ──────────────────────────────"
  for CAT in $CATEGORIES; do
    # A failing sub-import is logged and the run continues with the next one.
    case "$CAT" in
      uu) import_uu "$YEAR" || log "[$YEAR/WEB_UU] failed" ;;
      bl) import_bl "$YEAR" || log "[$YEAR/WEB_BL_BS_SL] failed" ;;
      ong) import_ong "$YEAR" || log "[$YEAR/WEB_ONG] failed" ;;
      bank) import_bank "$YEAR" || log "[$YEAR/BANK] failed" ;;
      *) log "[$YEAR] unknown category '$CAT' (expected: uu bl ong bank), skipping" ;;
    esac
  done
done

log "=== Refreshing latest-year MV ==="
# Best-effort: a failed refresh must not prevent the coverage report, but it
# is recorded in the log rather than discarded.
psql -v ON_ERROR_STOP=1 -c "REFRESH MATERIALIZED VIEW firms.mv_financials_latest;" \
  || log "[WARN] refresh of firms.mv_financials_latest failed"

log "=== Final coverage ==="
# Row counts per table and year, echoed and appended to the log file.
psql -c "
SELECT 'fin' AS tbl, year, COUNT(*) AS n FROM firms.financials GROUP BY year
UNION ALL
SELECT 'ong' AS tbl, year, COUNT(*) AS n FROM firms.financials_ong GROUP BY year
UNION ALL
SELECT 'bank' AS tbl, year, COUNT(*) AS n FROM firms.financials_banks GROUP BY year
ORDER BY tbl, year;
" 2>&1 | tee -a "$LOG"
log "=== Historical import done ==="
+194
View File
@@ -0,0 +1,194 @@
#!/bin/bash
# Imports MFP non-WEB_UU/BL_BS_SL financial categories into separate tables.
# Currently handles WEB_ONG (46 indicators, NGO-specific) and WEB_Inst_de_credit
# (23 IFRS indicators for banks). Other small categories (IFN, ASIG, BROK, SIF,
# PENSII, VS, VM, IP_IEME, IR, FOND_GARANTARE) can follow the same pattern with
# their own tables; for now we treat them as future work since each is <1MB
# and < a few hundred records.
#
# Discovers download URLs via data.gov.ro CKAN API per data year.
#
# Idempotent. ON CONFLICT (cui, year) DO UPDATE so re-runs refresh latest values.
set -uo pipefail
DATA_DIR=/opt/vreaudigital/data/mfinante
LOG=/var/log/vreaudigital-fin-import.log
# log MESSAGE — print a timestamped line to stdout and append it to $LOG.
log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG"; }
mkdir -p "$DATA_DIR"

# ── DB env ──
# The sourced file provides the INFISICAL_* variables used below.
source /opt/vreaudigital/.infisical-mi
TOKEN=$(infisical login --method=universal-auth --domain="$INFISICAL_API_URL" \
  --client-id="$INFISICAL_CLIENT_ID" --client-secret="$INFISICAL_CLIENT_SECRET" \
  --silent --plain)
DBURL=$(infisical run --domain="$INFISICAL_API_URL" \
  --projectId="$INFISICAL_PROJECT_ID" --env="$INFISICAL_ENV" \
  --path="$INFISICAL_PATH" --silent --token="$TOKEN" \
  -- sh -c 'echo "$DATABASE_URL"')
# Strip the "schema=" query parameter and a dangling "?" from the URL.
DB=$(printf '%s\n' "$DBURL" | sed -E 's/[?&]schema=[^&]*//; s/\?$//')

# Split the URL into libpq environment variables. `sed -n … p` prints only
# when the pattern matches; a plain `s|…|…|` would echo the WHOLE URL
# (password included) into the variable on a non-match — e.g. PGPORT for a
# URL without an explicit port.
PGUSER=$(printf '%s\n' "$DB" | sed -nE 's|^postgresql://([^:]+):.*|\1|p')
PGPASSWORD=$(printf '%s\n' "$DB" | sed -nE 's|^postgresql://[^:]+:([^@]+)@.*|\1|p')
PGHOST=$(printf '%s\n' "$DB" | sed -nE 's|^postgresql://[^@]+@([^:/]+).*|\1|p')
PGPORT=$(printf '%s\n' "$DB" | sed -nE 's|^postgresql://[^@]+@[^:]+:([0-9]+)/.*|\1|p')
PGDATABASE=$(printf '%s\n' "$DB" | sed -nE 's|^postgresql://[^@]+@[^/]+/([^?&]+).*|\1|p')
PGPORT="${PGPORT:-5432}"   # URL without a port → PostgreSQL default
unset DBURL TOKEN DB

# Stop early when no usable connection info was obtained. The URL itself is
# never logged (it carries the password).
if [ -z "$PGHOST" ] || [ -z "$PGDATABASE" ]; then
  log "[ERR] DATABASE_URL missing or unparsable (Infisical fetch failed?) — aborting"
  exit 1
fi
export PGUSER PGPASSWORD PGHOST PGPORT PGDATABASE

log "=== ONG + Banks import started ==="
# Apply schema if not present. Without it the staging/target tables used
# below may not exist, so a failure here ends the run.
if ! psql -v ON_ERROR_STOP=1 -f /opt/vreaudigital/services/seap-scraper/sql/016_firms_financials_categories.sql >/dev/null; then
  log "[ERR] applying 016_firms_financials_categories.sql failed — aborting"
  exit 1
fi
# Helper: discover CSV URL via CKAN. Slug per data year, file pattern per category.
# Args:    $1 year     data year (2015-2024)
#          $2 pattern  regex prefix of the resource name, e.g. "web_ong_an";
#                      the full regex is <pattern><year>\.txt$, searched
#                      case-insensitively
# Outputs: URL of the first matching resource; an empty line for an
#          unsupported year; nothing when no resource matches.
# Returns: 1 for an unsupported year; non-zero when the API request fails or
#          the response is not JSON.
# Pattern and year are passed to Python as arguments (sys.argv) instead of
# being spliced into the program text.
discover_url() {
  local year="$1"
  local pattern="$2"
  local slug
  case "$year" in
    2020) slug="situatii_financiare_2021" ;; # 2020 data lives in 2021 megadump
    2023) slug="situatii_financiare2023" ;;  # slug has no underscore before the year
    2015|2016|2017|2018|2019|2021|2022|2024) slug="situatii_financiare_${year}" ;;
    *) echo ""; return 1 ;;
  esac
  curl -fsSL --max-time 30 "https://data.gov.ro/api/3/action/package_show?id=$slug" 2>/dev/null \
    | python3 -c '
import json, re, sys
try:
    d = json.load(sys.stdin)
except ValueError:
    # empty or non-JSON body (e.g. curl failed upstream)
    sys.exit(1)
pat = re.compile(sys.argv[1] + sys.argv[2] + r"\.txt$", re.I)
for r in (d.get("result") or {}).get("resources") or []:
    # "name" may be missing or null in CKAN metadata
    if pat.search(r.get("name") or ""):
        print(r.get("url") or "")
        break
' "$pattern" "$year"
}
# ─── ONG ──────────────────────────────────────────────────────────────────
# For each year: download web_ong_<year>.txt when absent, COPY it into
# firms.staging_ong and upsert into firms.financials_ong.
# The script runs without `set -e`, so every step is checked here: a failed
# download, TRUNCATE or COPY skips the year instead of letting the UPSERT
# run against stale or empty staging data.

# \copy column list and JSONB key/value list for i1..i46, generated once.
ONG_COLS="cui, caen, caeno"
ONG_IND_PAIRS=""
for ((ONG_I = 1; ONG_I <= 46; ONG_I++)); do
  ONG_COLS="$ONG_COLS, i${ONG_I}"
  ONG_IND_PAIRS="$ONG_IND_PAIRS 'i${ONG_I}', NULLIF(i${ONG_I}, ''),"
done
ONG_IND_PAIRS="${ONG_IND_PAIRS%,}"

for YEAR in ${YEARS:-2020 2021 2022 2023 2024}; do
  FILE="$DATA_DIR/web_ong_${YEAR}.txt"
  if [ ! -s "$FILE" ]; then
    URL=$(discover_url "$YEAR" "web_ong_an")
    if [ -z "$URL" ]; then log "[$YEAR/ONG] URL not found, skipping"; continue; fi
    log "[$YEAR/ONG] Downloading from $URL ..."
    if ! curl -fsL --max-time 120 -o "$FILE" "$URL" || [ ! -s "$FILE" ]; then
      log "[$YEAR/ONG] [ERR] download failed, skipping"
      rm -f "$FILE"   # do not leave a partial file for the next run to trust
      continue
    fi
  fi
  log "[$YEAR/ONG] COPY $FILE ($(stat -c%s "$FILE") bytes)..."
  psql -v ON_ERROR_STOP=1 -c "TRUNCATE TABLE firms.staging_ong;" \
    || { log "[$YEAR/ONG] [ERR] TRUNCATE staging failed, skipping"; continue; }
  psql -v ON_ERROR_STOP=1 <<COPYEOF || { log "[$YEAR/ONG] [ERR] COPY failed, skipping"; continue; }
\\copy firms.staging_ong ($ONG_COLS) FROM '$FILE' WITH (FORMAT csv, DELIMITER ',', HEADER true, NULL '');
COPYEOF
  log "[$YEAR/ONG] UPSERT into firms.financials_ong..."
  psql -v ON_ERROR_STOP=1 <<SQL || log "[$YEAR/ONG] [ERR] UPSERT failed"
INSERT INTO firms.financials_ong (
cui, year, caen, caeno,
capitaluri_proprii, venituri_total, cheltuieli_total, excedent,
personal_neeconomic, personal_economic, indicators
)
SELECT DISTINCT ON (cui)
cui, $YEAR, caen, caeno,
NULLIF(i12, '')::numeric(20,2),
NULLIF(i38, '')::numeric(20,2),
NULLIF(i40, '')::numeric(20,2),
NULLIF(i42, '')::numeric(20,2),
CASE WHEN NULLIF(i45, '') ~ '^[0-9]+\$' AND NULLIF(i45, '')::bigint BETWEEN 0 AND 100000000 THEN i45::bigint ELSE NULL END,
CASE WHEN NULLIF(i46, '') ~ '^[0-9]+\$' AND NULLIF(i46, '')::bigint BETWEEN 0 AND 100000000 THEN i46::bigint ELSE NULL END,
jsonb_strip_nulls(jsonb_build_object($ONG_IND_PAIRS))
FROM firms.staging_ong
WHERE cui IS NOT NULL AND cui != '' AND cui != '0'
ORDER BY cui
ON CONFLICT (cui, year) DO UPDATE SET
caen = EXCLUDED.caen,
caeno = EXCLUDED.caeno,
capitaluri_proprii = EXCLUDED.capitaluri_proprii,
venituri_total = EXCLUDED.venituri_total,
cheltuieli_total = EXCLUDED.cheltuieli_total,
excedent = EXCLUDED.excedent,
personal_neeconomic = EXCLUDED.personal_neeconomic,
personal_economic = EXCLUDED.personal_economic,
indicators = EXCLUDED.indicators,
fetched_at = now();
SQL
done
# ─── Banks / credit institutions ──────────────────────────────────────────
# For each year: download web_inst_de_credit_<year>.txt when absent, COPY it
# into firms.staging_banks and upsert into firms.financials_banks.
#
# The number of indicator columns differs per year (the historical backfill
# script documents 23-column files for 2020-2022, 24 for 2023 and 25 for
# 2024), so the \copy column list is sized from the file's header; a fixed
# 25-column list makes COPY reject the narrower files. Staging columns beyond
# the file's last indicator stay NULL and are dropped from the JSONB blob by
# jsonb_strip_nulls. The typed-column mapping per layout mirrors the one
# used by the historical backfill script.
# NOTE(review): assumes the unused firms.staging_banks columns are nullable
# — confirm against 016_firms_financials_categories.sql.
#
# The script runs without `set -e`, so each step is checked and a failure
# skips the year instead of upserting from stale or empty staging data.

# JSONB key/value list over all 23 staging indicator columns, built once.
BANK_IND_PAIRS=""
for ((BANK_I = 1; BANK_I <= 23; BANK_I++)); do
  BANK_IND_PAIRS="$BANK_IND_PAIRS 'i${BANK_I}', NULLIF(i${BANK_I}, ''),"
done
BANK_IND_PAIRS="${BANK_IND_PAIRS%,}"

for YEAR in ${YEARS:-2020 2021 2022 2023 2024}; do
  FILE="$DATA_DIR/web_inst_de_credit_${YEAR}.txt"
  if [ ! -s "$FILE" ]; then
    # Filename differs per year — sometimes web_instit_de_credit_an, sometimes web_inst_de_credit_
    URL=$(discover_url "$YEAR" "web_(inst|instit)_de_credit_(an)?")
    if [ -z "$URL" ]; then log "[$YEAR/BANK] URL not found, skipping"; continue; fi
    log "[$YEAR/BANK] Downloading from $URL ..."
    if ! curl -fsL --max-time 60 -o "$FILE" "$URL" || [ ! -s "$FILE" ]; then
      log "[$YEAR/BANK] [ERR] download failed, skipping"
      rm -f "$FILE"   # do not leave a partial file for the next run to trust
      continue
    fi
  fi

  # Header has CUI, CAEN and then the indicators.
  BANK_HEADER_COLS=$(head -1 "$FILE" | tr ',' '\n' | wc -l)
  BANK_IND_N=$(( BANK_HEADER_COLS - 2 ))
  case "$BANK_IND_N" in
    21) BANK_CIFRA=i21; BANK_PROFIT_INAINTE=i17; BANK_PROFIT_EXERC=i20 ;;
    22) BANK_CIFRA=i22; BANK_PROFIT_INAINTE=i18; BANK_PROFIT_EXERC=i21 ;;
    23) BANK_CIFRA=i23; BANK_PROFIT_INAINTE=i19; BANK_PROFIT_EXERC=i22 ;;
    *) log "[$YEAR/BANK] unexpected indicator count $BANK_IND_N, skipping"; continue ;;
  esac
  BANK_COLS="cui, caen"
  for ((BANK_I = 1; BANK_I <= BANK_IND_N; BANK_I++)); do
    BANK_COLS="$BANK_COLS, i${BANK_I}"
  done

  log "[$YEAR/BANK] COPY $FILE ($(stat -c%s "$FILE") bytes)..."
  psql -v ON_ERROR_STOP=1 -c "TRUNCATE TABLE firms.staging_banks;" \
    || { log "[$YEAR/BANK] [ERR] TRUNCATE staging failed, skipping"; continue; }
  psql -v ON_ERROR_STOP=1 <<COPYEOF || { log "[$YEAR/BANK] [ERR] COPY failed, skipping"; continue; }
\\copy firms.staging_banks ($BANK_COLS) FROM '$FILE' WITH (FORMAT csv, DELIMITER ',', HEADER true, NULL '');
COPYEOF
  log "[$YEAR/BANK] UPSERT into firms.financials_banks..."
  psql -v ON_ERROR_STOP=1 <<SQL || log "[$YEAR/BANK] [ERR] UPSERT failed"
INSERT INTO firms.financials_banks (
cui, year, caen,
active_financiare_amortiz, capital_social, profit_exercitiu,
profit_inainte_impozit, cifra_afaceri, indicators
)
SELECT DISTINCT ON (cui)
cui, $YEAR, caen,
NULLIF(i6, '')::numeric(20,2),
NULLIF(i14, '')::numeric(20,2),
NULLIF($BANK_PROFIT_EXERC, '')::numeric(20,2),
NULLIF($BANK_PROFIT_INAINTE, '')::numeric(20,2),
NULLIF($BANK_CIFRA, '')::numeric(20,2),
jsonb_strip_nulls(jsonb_build_object($BANK_IND_PAIRS))
FROM firms.staging_banks
WHERE cui IS NOT NULL AND cui != '' AND cui != '0'
ORDER BY cui
ON CONFLICT (cui, year) DO UPDATE SET
caen = EXCLUDED.caen,
active_financiare_amortiz = EXCLUDED.active_financiare_amortiz,
capital_social = EXCLUDED.capital_social,
profit_exercitiu = EXCLUDED.profit_exercitiu,
profit_inainte_impozit = EXCLUDED.profit_inainte_impozit,
cifra_afaceri = EXCLUDED.cifra_afaceri,
indicators = EXCLUDED.indicators,
fetched_at = now();
SQL
done
log "=== ONG + Banks final stats ==="
# One "<label>:<year>|<count>" line per year and table, echoed and appended
# to the log file. Each entry is "<label>=<table>".
for STAT_SPEC in "ong=financials_ong" "bank=financials_banks"; do
  STAT_LABEL="${STAT_SPEC%%=*}"
  STAT_TABLE="${STAT_SPEC#*=}"
  psql -At -F"|" -c "
SELECT '${STAT_LABEL}:' || year, COUNT(*) FROM firms.${STAT_TABLE} GROUP BY year ORDER BY year;" 2>&1 | tee -a "$LOG"
done
log "=== ONG + Banks import done ==="
+108
View File
@@ -0,0 +1,108 @@
#!/bin/bash
# Import financial indicators (Situații financiare) from data.gov.ro per year.
# Runs COPY from web_uu_YYYY.txt → staging_financials → firms.financials (PK cui+year).
set -euo pipefail
DATA_DIR=/opt/vreaudigital/data/mfinante
LOG=/var/log/vreaudigital-fin-import.log
# log MESSAGE — print a timestamped line to stdout and append it to $LOG.
log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG"; }

# ── DB credentials ──
# The sourced file provides the INFISICAL_* variables used below.
source /opt/vreaudigital/.infisical-mi
TOKEN=$(infisical login --method=universal-auth --domain="$INFISICAL_API_URL" --client-id="$INFISICAL_CLIENT_ID" --client-secret="$INFISICAL_CLIENT_SECRET" --silent --plain)
DATABASE_URL=$(infisical run --domain="$INFISICAL_API_URL" --projectId="$INFISICAL_PROJECT_ID" --env="$INFISICAL_ENV" --path="$INFISICAL_PATH" --silent --token="$TOKEN" -- sh -c 'echo "$DATABASE_URL"')
# Strip the "schema=" query parameter and a dangling "?" from the URL.
DB=$(printf '%s\n' "$DATABASE_URL" | sed -E 's/[?&]schema=[^&]*//; s/\?$//')

# Split the URL into libpq environment variables. `sed -n … p` prints only
# when the pattern matches; a plain `s|…|…|` would echo the WHOLE URL
# (password included) into the variable on a non-match.
# Assignment and export are separate so a failing command substitution is
# not masked by `export` under `set -e`.
PGUSER=$(printf '%s\n' "$DB" | sed -nE 's|^postgresql://([^:]+):.*|\1|p')
PGPASSWORD=$(printf '%s\n' "$DB" | sed -nE 's|^postgresql://[^:]+:([^@]+)@.*|\1|p')
PGHOST=$(printf '%s\n' "$DB" | sed -nE 's|^postgresql://[^@]+@([^:/]+).*|\1|p')
# The port was previously not exported at all, so a port given in
# DATABASE_URL was ignored; the sibling ONG/banks and historical importers
# both honour it.
PGPORT=$(printf '%s\n' "$DB" | sed -nE 's|^postgresql://[^@]+@[^:]+:([0-9]+)/.*|\1|p')
PGDATABASE=$(printf '%s\n' "$DB" | sed -nE 's|^postgresql://[^@]+@[^/]+/([^?&]+).*|\1|p')
PGPORT="${PGPORT:-5432}"   # URL without a port → PostgreSQL default
unset DATABASE_URL TOKEN DB

# Stop early when no usable connection info was obtained. The URL itself is
# never logged (it carries the password).
if [ -z "$PGHOST" ] || [ -z "$PGDATABASE" ]; then
  log "[ERR] DATABASE_URL missing or unparsable (Infisical fetch failed?) — aborting"
  exit 1
fi
export PGUSER PGPASSWORD PGHOST PGPORT PGDATABASE

log "=== Financial import started ==="
# WEB_UU and WEB_BL_BS_SL share the same 22-column schema (CUI, CAEN, I1..I20)
# so we can use the same staging table + INSERT for both. The `source` column
# tracks which raw category the row came from. WEB_BL_BS_SL covers special-
# regime entities (bilanț scurt, lichidare) that aren't in WEB_UU — e.g.
# Alliance Healthcare, in-liquidation companies. Together they fill most of
# the financial-data gap.
# import_year_category YEAR CATEGORY FILE
#   Loads one MFinante CSV into firms.staging_financials and upserts it into
#   firms.financials for YEAR, tagging rows with source "mfinante:CATEGORY".
# Globals:   log (function), libpq PG* environment used by psql
# Arguments: $1 - four-digit year
#            $2 - raw category (WEB_UU | WEB_BL_BS_SL)
#            $3 - path to the CSV file
# Returns:   0 on success or when FILE is missing/empty (logged as SKIP),
#            1 on an invalid YEAR; a psql failure returns non-zero, which
#            aborts the script when the caller runs under "set -e".
import_year_category() {
  local YEAR="$1"
  local CATEGORY="$2" # WEB_UU | WEB_BL_BS_SL
  local FILE="$3"
  local SRC_LABEL="mfinante:${CATEGORY}"

  # YEAR is interpolated into the SQL text below, so accept digits only.
  if [[ ! "$YEAR" =~ ^[0-9]{4}$ ]]; then
    log "[$YEAR/$CATEGORY] [ERROR] invalid year '$YEAR'"
    return 1
  fi

  if [ ! -s "$FILE" ]; then
    log "[$YEAR/$CATEGORY] [SKIP] $FILE missing"
    return 0
  fi

  log "[$YEAR/$CATEGORY] Truncating staging..."
  psql -v ON_ERROR_STOP=1 -c "TRUNCATE TABLE firms.staging_financials;"

  log "[$YEAR/$CATEGORY] COPY $FILE..."
  # Unquoted heredoc: the shell turns "\\copy" into psql's "\copy" and expands
  # $FILE before psql sees the text.
  psql -v ON_ERROR_STOP=1 <<COPYEOF
\\copy firms.staging_financials (cui, caen, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15, i16, i17, i18, i19, i20) FROM '$FILE' WITH (FORMAT csv, DELIMITER ',', HEADER true, NULL '');
COPYEOF

  log "[$YEAR/$CATEGORY] UPSERT into financials (source=$SRC_LABEL)..."
  psql -v ON_ERROR_STOP=1 <<SQL
INSERT INTO firms.financials (
cui, year, caen,
active_imobilizate, active_circulante, stocuri, creante, casa_banci,
cheltuieli_avans, datorii, venituri_avans, provizioane,
capitaluri_total, capital_subscris, patrimoniul_regiei,
cifra_afaceri, venituri_total, cheltuieli_total,
profit_brut, pierdere_bruta, profit_net, pierdere_neta,
numar_salariati, source
)
SELECT DISTINCT ON (cui)
cui, $YEAR, caen,
i1, i2, i3, i4, i5,
i6, i7, i8, i9,
i10, i11, i12,
i13, i14, i15,
i16, i17, i18, i19,
-- Sanitize salariati: drop absurd values (data anomalies up to 7.7e14 observed)
CASE WHEN i20 BETWEEN 0 AND 100000000 THEN i20::bigint ELSE NULL END,
'$SRC_LABEL'
FROM firms.staging_financials
WHERE cui IS NOT NULL AND cui != '' AND cui != '0'
ORDER BY cui
ON CONFLICT (cui, year) DO UPDATE SET
-- Refresh every imported column together with the source label, so the label
-- always describes the figures actually stored in the row.
caen = EXCLUDED.caen,
active_imobilizate = EXCLUDED.active_imobilizate,
active_circulante = EXCLUDED.active_circulante,
stocuri = EXCLUDED.stocuri,
creante = EXCLUDED.creante,
casa_banci = EXCLUDED.casa_banci,
cheltuieli_avans = EXCLUDED.cheltuieli_avans,
datorii = EXCLUDED.datorii,
venituri_avans = EXCLUDED.venituri_avans,
provizioane = EXCLUDED.provizioane,
capitaluri_total = EXCLUDED.capitaluri_total,
capital_subscris = EXCLUDED.capital_subscris,
patrimoniul_regiei = EXCLUDED.patrimoniul_regiei,
cifra_afaceri = EXCLUDED.cifra_afaceri,
venituri_total = EXCLUDED.venituri_total,
cheltuieli_total = EXCLUDED.cheltuieli_total,
profit_brut = EXCLUDED.profit_brut,
pierdere_bruta = EXCLUDED.pierdere_bruta,
profit_net = EXCLUDED.profit_net,
pierdere_neta = EXCLUDED.pierdere_neta,
numar_salariati = EXCLUDED.numar_salariati,
source = EXCLUDED.source
-- For (cui, year) duplicates across categories, prefer WEB_UU (more complete
-- schema for normal companies): an existing WEB_UU row is only ever replaced
-- by another WEB_UU row, never by a BL_BS_SL row.
WHERE firms.financials.source IS DISTINCT FROM 'mfinante:WEB_UU'
OR EXCLUDED.source = 'mfinante:WEB_UU';
SQL
}
# The list of years defaults to the daily-run set; callers such as the
# historical backfill wrapper (import-financials-historical.sh) may override it
# through the YEARS environment variable.
: "${YEARS:=2020 2021 2022 2023 2024}"

# Both categories are imported for every year, WEB_UU first.
for YEAR in $YEARS; do
  uu_file="$DATA_DIR/web_uu_${YEAR}.txt"
  bl_file="$DATA_DIR/web_bl_bs_sl_${YEAR}.txt"
  import_year_category "$YEAR" "WEB_UU" "$uu_file"
  import_year_category "$YEAR" "WEB_BL_BS_SL" "$bl_file"
done

log "=== Refreshing latest-year MV ==="
refresh_sql="REFRESH MATERIALIZED VIEW firms.mv_financials_latest;"
psql -v ON_ERROR_STOP=1 -c "$refresh_sql"

# Per-year summary, shown on stdout and appended to the log file.
log "=== Final stats ==="
stats_sql="
SELECT year, COUNT(*) AS firms_with_data,
ROUND(AVG(NULLIF(cifra_afaceri, 0))::numeric, 0) AS avg_ca,
COUNT(*) FILTER (WHERE cifra_afaceri > 0) AS cu_ca,
COUNT(*) FILTER (WHERE numar_salariati > 0) AS cu_salariati
FROM firms.financials
GROUP BY year ORDER BY year;
"
psql -c "$stats_sql" 2>&1 | tee -a "$LOG"

log "=== Import done ==="
+85
View File
@@ -0,0 +1,85 @@
#!/bin/bash
# Discovers the latest ONRC bulk dataset on data.gov.ro, downloads any newer
# CSVs, and runs import-onrc.sh — but only if the dataset is fresher than
# what's already on disk. Idempotent: re-running on the same day is a no-op.
#
# Dataset on data.gov.ro is published ~monthly with slug pattern
# `firme-DD-MM-YYYY`. Resource UUIDs change each release, so we can't
# hardcode URLs — query CKAN to discover the current ones.
set -euo pipefail

DATA_DIR=/opt/vreaudigital/data/onrc
LOG=/var/log/vreaudigital-onrc-import.log
STAMP_FILE="$DATA_DIR/.dataset-name"
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"

# Timestamped logger: writes to stdout and appends to $LOG.
log() {
  echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG"
}

mkdir -p "$DATA_DIR"
log "=== ONRC fresh-check started ==="

# Ask CKAN for the ten most recently modified packages matching "firme" and
# keep the first one whose name has the exact firme-DD-MM-YYYY shape.
search_url="https://data.gov.ro/api/3/action/package_search?q=firme&sort=metadata_modified+desc&rows=10"
name_filter='[.result.results[] | select(.name | test("^firme-[0-9]{2}-[0-9]{2}-[0-9]{4}$"))][0].name // empty'
LATEST_NAME=$(curl -fsS --max-time 30 "$search_url" | jq -r "$name_filter")

[[ -n "$LATEST_NAME" ]] || {
  log "ERROR: could not find a firme-DD-MM-YYYY dataset on data.gov.ro"
  exit 1
}
log "Latest dataset on data.gov.ro: $LATEST_NAME"

# The stamp file holds the name of the last snapshot that was imported; if it
# equals the latest published name there is nothing left to do.
if [[ -f "$STAMP_FILE" && "$(cat "$STAMP_FILE")" == "$LATEST_NAME" ]]; then
  log "Already imported $LATEST_NAME — nothing to do."
  exit 0
fi
# Look up the download URL of each of the four CSVs this pipeline uses; any
# other resource in the dataset is ignored.
log "Fetching resource URLs for $LATEST_NAME..."
show_url="https://data.gov.ro/api/3/action/package_show?id=$LATEST_NAME"
RESOURCES_JSON=$(curl -fsS --max-time 30 "$show_url")

# Key = expected lower-case file name, value = URL (empty until found).
declare -A NEEDED=(
  [od_firme.csv]=""
  [od_caen_autorizat.csv]=""
  [od_stare_firma.csv]=""
  [od_reprezentanti_legali.csv]=""
)

# Match resources by the lower-cased last path component of their URL.
while IFS=$'\t' read -r url; do
  fname=$(basename "$url" | tr 'A-Z' 'a-z')
  [ -z "${NEEDED[$fname]+x}" ] || NEEDED[$fname]="$url"
done < <(jq -r '.result.resources[] | "\(.url)"' <<<"$RESOURCES_JSON")

# Every required file must have been found in the dataset.
for f in "${!NEEDED[@]}"; do
  [ -n "${NEEDED[$f]}" ] && continue
  log "ERROR: resource $f not found in dataset $LATEST_NAME"
  exit 1
done
# Download every CSV to "<name>.tmp" first and move the files into place only
# after all four downloads have succeeded. A failure part-way through (curl
# error, or an empty file) therefore never leaves $DATA_DIR holding a mix of
# old and new snapshot files for import-onrc.sh to pick up.
for f in od_firme.csv od_caen_autorizat.csv od_stare_firma.csv od_reprezentanti_legali.csv; do
  url="${NEEDED[$f]}"
  log "Downloading $f..."
  # With pipefail active, a curl failure fails the pipeline and (set -e) aborts.
  curl -fL --max-time 600 -o "$DATA_DIR/$f.tmp" "$url" 2>&1 | tail -3 | tee -a "$LOG"
  if [ ! -s "$DATA_DIR/$f.tmp" ]; then
    log "ERROR: download of $f produced an empty file"
    exit 1
  fi
done

for f in od_firme.csv od_caen_autorizat.csv od_stare_firma.csv od_reprezentanti_legali.csv; do
  mv -f "$DATA_DIR/$f.tmp" "$DATA_DIR/$f"
done
# Import the freshly downloaded CSVs; a failure here aborts the run (set -e),
# so the stamp file below is not updated.
log "Running import-onrc.sh..."
"$SCRIPT_DIR/import-onrc.sh"

# Freshly imported firms have no lat/lng yet. The geocoding fallback chain
# (geonames_postal → uat_centroid → photon → judet_centroid) fills them in so
# /harta and the UI map clustering have coordinates. Geocoding is best-effort:
# a failure is logged as a warning and does not stop the run.
log "Running geocode-firms.sh fallback chain..."
if ! "$SCRIPT_DIR/geocode-firms.sh"; then
  log "WARN: geocode-firms.sh exited non-zero; continuing"
fi

# Remember which snapshot was imported so the next run can skip it.
echo "$LATEST_NAME" >"$STAMP_FILE"
log "=== ONRC fresh-import done (snapshot=$LATEST_NAME) ==="
+272
View File
@@ -0,0 +1,272 @@
#!/bin/bash
# Import ONRC bulk CSV files into firms.entities.
# Source: data.gov.ro (CC-BY 4.0), updated weekly.
#
# Pipeline:
# 1. TRUNCATE staging tables
# 2. COPY each CSV ($DATA_DIR/*.csv, i.e. /opt/vreaudigital/data/onrc) into corresponding staging table
# 3. UPSERT into firms.entities, joining on cod_inmatriculare
# 4. Resolve siruta UAT for each firm by matching normalized county + localitate names
#
# Idempotent. Run nightly via cron.
set -euo pipefail

DATA_DIR=/opt/vreaudigital/data/onrc
LOG=/var/log/vreaudigital-onrc-import.log

# Timestamped logger: echoes the message to stdout and appends it to $LOG.
log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG"; }

log "=== ONRC import started ==="

# ── Resolve DATABASE_URL via Infisical Machine Identity ──
# The sourced file is expected to define the INFISICAL_* variables used below.
# NOTE(review): the client secret and the access token are passed as command
# line arguments and may be visible in `ps` while infisical runs — confirm
# whether the installed CLI can read them from the environment instead.
source /opt/vreaudigital/.infisical-mi
TOKEN=$(infisical login --method=universal-auth \
  --domain="$INFISICAL_API_URL" \
  --client-id="$INFISICAL_CLIENT_ID" \
  --client-secret="$INFISICAL_CLIENT_SECRET" \
  --silent --plain)
DATABASE_URL=$(infisical run --domain="$INFISICAL_API_URL" \
  --projectId="$INFISICAL_PROJECT_ID" \
  --env="$INFISICAL_ENV" --path="$INFISICAL_PATH" \
  --silent --token="$TOKEN" \
  -- sh -c 'echo "$DATABASE_URL"')
# Drop a "schema=" query parameter (and a "?" left dangling by its removal).
DB=$(echo "$DATABASE_URL" | sed -E 's/[?&]schema=[^&]*//; s/\?$//')

# psql is given its connection settings through libpq environment variables
# instead of a URL argument, so the password never appears in `ps aux`.
# URL shape: postgresql://USER:PASS@HOST[:PORT]/DBNAME
DB_USER=$(echo "$DB" | sed -E 's|^postgresql://([^:]+):.*|\1|')
DB_PASS=$(echo "$DB" | sed -E 's|^postgresql://[^:]+:([^@]+)@.*|\1|')
DB_HOST=$(echo "$DB" | sed -E 's|^postgresql://[^@]+@([^:/]+).*|\1|')
# The port is optional. "sed -n ... p" prints only when the pattern matches;
# a plain substitution would pass the whole URL (password included) through
# unchanged when no port is present, and that would end up in PGPORT.
DB_PORT=$(echo "$DB" | sed -nE 's|^postgresql://[^@]+@[^:/]+:([0-9]+)/.*|\1|p')
DB_NAME=$(echo "$DB" | sed -E 's|^postgresql://[^@]+@[^/]+/([^?]+).*|\1|')
export PGUSER="$DB_USER" PGPASSWORD="$DB_PASS" PGHOST="$DB_HOST" PGDATABASE="$DB_NAME"
if [ -n "$DB_PORT" ]; then
  export PGPORT="$DB_PORT"
fi
unset DATABASE_URL TOKEN DB DB_USER DB_PASS DB_HOST DB_PORT DB_NAME
# ── Sanity check files ──
# All four input CSVs must exist and be non-empty before anything is touched.
required_csvs=(od_firme.csv od_caen_autorizat.csv od_stare_firma.csv od_reprezentanti_legali.csv)
for f in "${required_csvs[@]}"; do
  [ -s "$DATA_DIR/$f" ] && continue
  log "FATAL: $DATA_DIR/$f missing or empty"
  exit 1
done

# Dataset label = name of the directory that really contains od_firme.csv
# (symlinks resolved), cut to 40 bytes. It is only a best guess.
firme_real_path=$(readlink -f "$DATA_DIR/od_firme.csv")
firme_parent_dir=$(dirname "$firme_real_path")
DATASET_NAME=$(basename "$firme_parent_dir" | head -c 40)
log "Dataset name (best guess): $DATASET_NAME"
# ── Stage CSVs ──
# Each CSV is loaded into its own staging table with psql's client-side \copy.
# The heredocs are unquoted on purpose: the shell expands $DATA_DIR and turns
# "\\copy" into "\copy" and "E'\\b'" into "E'\b'" before psql reads the text.
# Files are '^'-delimited with a header row; QUOTE is set to the backspace
# character, presumably so that ordinary double quotes in the data are not
# treated as CSV quoting — TODO confirm against the published files.
log "Truncating staging tables..."
psql -v ON_ERROR_STOP=1 -c "
TRUNCATE TABLE firms.staging_onrc_firme, firms.staging_onrc_caen,
firms.staging_onrc_stare, firms.staging_onrc_reprezentanti;
"
# The largest file; `time` reports how long the load took.
log "COPY od_firme.csv (683MB)..."
time psql -v ON_ERROR_STOP=1 <<COPYEOF
\\copy firms.staging_onrc_firme (denumire, cui, cod_inmatriculare, data_inmatriculare, euid, forma_juridica, adr_tara, adr_judet, adr_localitate, adr_strada, adr_numar, adr_bloc, adr_scara, adr_etaj, adr_apartament, adr_cod_postal, adr_sector, adr_completare, web, tara_firma_mama) FROM '$DATA_DIR/od_firme.csv' WITH (FORMAT csv, DELIMITER '^', HEADER true, NULL '', QUOTE E'\\b');
COPYEOF
log "COPY od_caen_autorizat.csv..."
psql -v ON_ERROR_STOP=1 <<COPYEOF
\\copy firms.staging_onrc_caen (cod_inmatriculare, cod_caen, ver_caen) FROM '$DATA_DIR/od_caen_autorizat.csv' WITH (FORMAT csv, DELIMITER '^', HEADER true, NULL '', QUOTE E'\\b');
COPYEOF
log "COPY od_stare_firma.csv..."
psql -v ON_ERROR_STOP=1 <<COPYEOF
\\copy firms.staging_onrc_stare (cod_inmatriculare, cod_stare) FROM '$DATA_DIR/od_stare_firma.csv' WITH (FORMAT csv, DELIMITER '^', HEADER true, NULL '', QUOTE E'\\b');
COPYEOF
log "COPY od_reprezentanti_legali.csv..."
psql -v ON_ERROR_STOP=1 <<COPYEOF
\\copy firms.staging_onrc_reprezentanti (cod_inmatriculare, persoana, calitate, data_nastere, localitate_nastere, judet_nastere, tara_nastere, localitate, judet, tara) FROM '$DATA_DIR/od_reprezentanti_legali.csv' WITH (FORMAT csv, DELIMITER '^', HEADER true, NULL '', QUOTE E'\\b');
COPYEOF
# Optional extras from the same dataset: od_reprezentanti_if.csv and
# od_sucursale_alte_state_membre.csv (branches in other member states, going
# by the file name). Unlike the staging tables above, these are loaded straight
# into their final tables. Idempotent — TRUNCATE-and-reload each run; a missing
# or empty file is skipped and the existing table content is left untouched.
if [ -s "$DATA_DIR/od_reprezentanti_if.csv" ]; then
log "COPY od_reprezentanti_if.csv (~13MB)..."
psql -v ON_ERROR_STOP=1 -c "TRUNCATE TABLE firms.reprezentanti_if;"
psql -v ON_ERROR_STOP=1 <<COPYEOF
\\copy firms.reprezentanti_if (cod_inmatriculare, nume, data_nastere, localitate_nastere, judet_nastere, tara_nastere, calitate) FROM '$DATA_DIR/od_reprezentanti_if.csv' WITH (FORMAT csv, DELIMITER '^', HEADER true, NULL '', QUOTE E'\\b');
COPYEOF
else
log "[SKIP] od_reprezentanti_if.csv missing"
fi
if [ -s "$DATA_DIR/od_sucursale_alte_state_membre.csv" ]; then
log "COPY od_sucursale_alte_state_membre.csv (small)..."
psql -v ON_ERROR_STOP=1 -c "TRUNCATE TABLE firms.sucursale_ue;"
psql -v ON_ERROR_STOP=1 <<COPYEOF
\\copy firms.sucursale_ue (cod_inmatriculare, tip_unitate, denumire_sucursala, euid, cod_fiscal_strain, tara) FROM '$DATA_DIR/od_sucursale_alte_state_membre.csv' WITH (FORMAT csv, DELIMITER '^', HEADER true, NULL '', QUOTE E'\\b');
COPYEOF
else
log "[SKIP] od_sucursale_alte_state_membre.csv missing"
fi
# ── Aggregate into firms.entities ──
# One psql session: build three per-cod_inmatriculare temp aggregates (state,
# CAEN codes, legal representatives), upsert firms.entities keyed by CUI, then
# fill in siruta for rows that do not have one yet.
# The heredoc is unquoted: the shell expands $DATASET_NAME and turns "\$" into
# "$"; other backslash sequences such as "\d" reach psql unchanged.
log "Building firms.entities from staging..."
time psql -v ON_ERROR_STOP=1 <<SQL
-- Pre-aggregate stare per cod_inmatriculare. Several states per firm are
-- possible; the row with the highest cod_stare is kept.
DROP TABLE IF EXISTS tmp_stare_agg;
CREATE TEMP TABLE tmp_stare_agg AS
SELECT DISTINCT ON (cod_inmatriculare) cod_inmatriculare, cod_stare
FROM firms.staging_onrc_stare
WHERE cod_inmatriculare IS NOT NULL
ORDER BY cod_inmatriculare, cod_stare DESC;

-- Aggregate CAEN per cod_inmatriculare
DROP TABLE IF EXISTS tmp_caen_agg;
CREATE TEMP TABLE tmp_caen_agg AS
SELECT
cod_inmatriculare,
array_agg(DISTINCT cod_caen ORDER BY cod_caen) FILTER (WHERE cod_caen IS NOT NULL) AS caens
FROM firms.staging_onrc_caen
WHERE cod_inmatriculare IS NOT NULL
GROUP BY cod_inmatriculare;

-- Aggregate reprezentanti per cod_inmatriculare
DROP TABLE IF EXISTS tmp_rep_agg;
CREATE TEMP TABLE tmp_rep_agg AS
SELECT
cod_inmatriculare,
jsonb_agg(jsonb_build_object(
'persoana', persoana,
'calitate', calitate,
'localitate', localitate,
'judet', judet,
'tara', tara
)) AS rep_legali
FROM firms.staging_onrc_reprezentanti
WHERE cod_inmatriculare IS NOT NULL AND persoana IS NOT NULL
GROUP BY cod_inmatriculare;

-- UPSERT firms.entities. CUI as PK.
-- Skip rows where CUI is empty/0. DISTINCT ON (cui) — if multiple ONRC rows share the
-- same CUI (rare but happens with reorganization), pick the most recently registered.
INSERT INTO firms.entities (
cui, cod_inmatriculare, euid, name, forma_juridica,
adr_tara, adr_judet, adr_localitate, adr_strada, adr_numar,
adr_bloc, adr_scara, adr_etaj, adr_apartament, adr_cod_postal,
adr_sector, adr_completare,
adr_full,
data_inmatriculare,
registration_year,
web,
tara_firma_mama,
caen_autorizate,
rep_legali,
status_text,
is_radiated_onrc,
source_onrc_dataset,
onrc_fetched_at,
updated_at
)
SELECT DISTINCT ON (f.cui)
f.cui,
f.cod_inmatriculare,
f.euid,
f.denumire,
f.forma_juridica,
f.adr_tara, f.adr_judet, f.adr_localitate, f.adr_strada, f.adr_numar,
f.adr_bloc, f.adr_scara, f.adr_etaj, f.adr_apartament, f.adr_cod_postal,
f.adr_sector, f.adr_completare,
-- Build adr_full for geocoding
COALESCE(
NULLIF(trim(concat_ws(', ',
NULLIF(trim(concat_ws(' ', f.adr_strada,
CASE WHEN f.adr_numar IS NOT NULL THEN 'nr.' || f.adr_numar END
)), ''),
f.adr_localitate,
f.adr_judet,
'Romania'
)), ''),
NULL
) AS adr_full,
-- ONRC format: DD.MM.YYYY
CASE WHEN f.data_inmatriculare ~ '^\d{2}\.\d{2}\.\d{4}'
THEN to_date(f.data_inmatriculare, 'DD.MM.YYYY')
ELSE NULL END AS data_inmatriculare,
CASE WHEN f.data_inmatriculare ~ '\d{4}\$'
THEN right(f.data_inmatriculare, 4)::int
WHEN f.data_inmatriculare ~ '^\d{2}\.\d{2}\.\d{4}'
THEN right(f.data_inmatriculare, 4)::int
ELSE NULL END AS registration_year,
f.web,
f.tara_firma_mama,
ca.caens,
ra.rep_legali,
-- Status: the raw stare code is stored; decoding it through the ONRC
-- nomenclator is still TODO.
COALESCE(ss.cod_stare, 'unknown') AS status_text,
false AS is_radiated_onrc, -- TODO: import ONRC stare nomenclator and detect
'$DATASET_NAME' AS source_onrc_dataset,
now() AS onrc_fetched_at,
now() AS updated_at
FROM firms.staging_onrc_firme f
LEFT JOIN tmp_caen_agg ca ON ca.cod_inmatriculare = f.cod_inmatriculare
LEFT JOIN tmp_rep_agg ra ON ra.cod_inmatriculare = f.cod_inmatriculare
LEFT JOIN tmp_stare_agg ss ON ss.cod_inmatriculare = f.cod_inmatriculare
-- scl is joined but none of its columns are selected yet (see the TODOs above).
LEFT JOIN firms.stare_codelist scl ON scl.cod = ss.cod_stare
WHERE f.cui IS NOT NULL
AND f.cui != ''
AND f.cui != '0'
AND f.denumire IS NOT NULL
-- The staging column is DD.MM.YYYY text, so it is parsed to a date before
-- sorting; sorting the raw text would rank rows by day-of-month, not by age.
-- Unparseable values sort last.
ORDER BY f.cui,
CASE WHEN f.data_inmatriculare ~ '^\d{2}\.\d{2}\.\d{4}'
THEN to_date(f.data_inmatriculare, 'DD.MM.YYYY')
ELSE NULL END DESC NULLS LAST
ON CONFLICT (cui) DO UPDATE SET
cod_inmatriculare = EXCLUDED.cod_inmatriculare,
euid = EXCLUDED.euid,
name = EXCLUDED.name,
forma_juridica = EXCLUDED.forma_juridica,
adr_tara = EXCLUDED.adr_tara,
adr_judet = EXCLUDED.adr_judet,
adr_localitate = EXCLUDED.adr_localitate,
adr_strada = EXCLUDED.adr_strada,
adr_numar = EXCLUDED.adr_numar,
adr_bloc = EXCLUDED.adr_bloc,
adr_scara = EXCLUDED.adr_scara,
adr_etaj = EXCLUDED.adr_etaj,
adr_apartament = EXCLUDED.adr_apartament,
adr_cod_postal = EXCLUDED.adr_cod_postal,
adr_sector = EXCLUDED.adr_sector,
adr_completare = EXCLUDED.adr_completare,
adr_full = EXCLUDED.adr_full,
data_inmatriculare = EXCLUDED.data_inmatriculare,
registration_year = EXCLUDED.registration_year,
web = EXCLUDED.web,
tara_firma_mama = EXCLUDED.tara_firma_mama,
caen_autorizate = EXCLUDED.caen_autorizate,
rep_legali = EXCLUDED.rep_legali,
status_text = EXCLUDED.status_text,
is_radiated_onrc = EXCLUDED.is_radiated_onrc,
source_onrc_dataset = EXCLUDED.source_onrc_dataset,
onrc_fetched_at = EXCLUDED.onrc_fetched_at,
updated_at = now();

-- Match siruta UAT for each firm via norm_uat_name (county + locality names
-- must both be equal after normalization). Only rows without a siruta are
-- touched; if several UATs match, the lowest siruta wins.
UPDATE firms.entities f
SET siruta = sub.siruta
FROM (
SELECT DISTINCT ON (e.cui) e.cui, gu.siruta
FROM firms.entities e
JOIN public."GisUat" gu
ON seap.norm_uat_name(gu.county) = seap.norm_uat_name(e.adr_judet)
AND seap.norm_uat_name(gu.name) = seap.norm_uat_name(e.adr_localitate)
WHERE e.siruta IS NULL
AND e.adr_judet IS NOT NULL
AND e.adr_localitate IS NOT NULL
ORDER BY e.cui, gu.siruta
) sub
WHERE f.cui = sub.cui;
SQL
# ── Stats ──
# One-row summary of firms.entities, shown on stdout and appended to the log.
log "Final stats:"
stats_sql="
SELECT
COUNT(*) AS total_firms,
COUNT(*) FILTER (WHERE siruta IS NOT NULL) AS cu_siruta,
COUNT(*) FILTER (WHERE rep_legali IS NOT NULL) AS cu_admins,
COUNT(*) FILTER (WHERE caen_autorizate IS NOT NULL) AS cu_caen,
COUNT(*) FILTER (WHERE is_radiated_onrc = true) AS radiate
FROM firms.entities;
"
psql -c "$stats_sql" 2>&1 | tee -a "$LOG"

log "=== ONRC import complete ==="
+199
View File
@@ -0,0 +1,199 @@
#!/bin/bash
# Download GeoNames RO postal codes and rebuild firms.postal_codes.
# Then geocode firms.entities by postal_code lookup, falling back to UAT
# centroid for firms without a valid postal code but with a siruta UAT.
#
# Coverage estimates (snapshot 2026-05-08):
# - postal-precision: ~2.07M / 3.97M firms (52%) — accuracy ~100m-2km
# - UAT-centroid fallback: +1.7M firms (44%) — accuracy 5-30km
# - combined: ~96% of all firms get lat/lng
#
# Run before geocode-photon.ts (which targets the remaining ~4% / refines the
# postal-level pins to housenumber level when available).
#
# Idempotent: safe to re-run weekly. Only rewrites firms.entities rows where
# the existing pin is null OR was set by an older/lower-precision source.
set -euo pipefail

DATA_DIR=/opt/vreaudigital/data/postal
LOG=/var/log/vreaudigital-postal-import.log
GEONAMES_URL=https://download.geonames.org/export/zip/RO.zip

# Timestamped logger: echoes the message to stdout and appends it to $LOG.
log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG"; }

mkdir -p "$DATA_DIR"
log "=== Postal-codes import started ==="

# ── Resolve DATABASE_URL via Infisical Machine Identity ──
# The sourced file is expected to define the INFISICAL_* variables used below.
source /opt/vreaudigital/.infisical-mi
TOKEN=$(infisical login --method=universal-auth \
  --domain="$INFISICAL_API_URL" \
  --client-id="$INFISICAL_CLIENT_ID" \
  --client-secret="$INFISICAL_CLIENT_SECRET" \
  --silent --plain)
DATABASE_URL=$(infisical run --domain="$INFISICAL_API_URL" \
  --projectId="$INFISICAL_PROJECT_ID" \
  --env="$INFISICAL_ENV" --path="$INFISICAL_PATH" \
  --silent --token="$TOKEN" \
  -- sh -c 'echo "$DATABASE_URL"')
# Drop a "schema=" query parameter (and a "?" left dangling by its removal).
DB=$(echo "$DATABASE_URL" | sed -E 's/[?&]schema=[^&]*//; s/\?$//')

# psql reads its connection settings from libpq environment variables.
# URL shape: postgresql://USER:PASS@HOST[:PORT]/DBNAME
PGUSER=$(echo "$DB" | sed -E 's|^postgresql://([^:]+):.*|\1|')
PGPASSWORD=$(echo "$DB" | sed -E 's|^postgresql://[^:]+:([^@]+)@.*|\1|')
PGHOST=$(echo "$DB" | sed -E 's|^postgresql://[^@]+@([^:/]+).*|\1|')
PGDATABASE=$(echo "$DB" | sed -E 's|^postgresql://[^@]+@[^/]+/([^?]+).*|\1|')
# The port is optional. "sed -n ... p" prints only when the pattern matches;
# a plain substitution would pass the whole URL (password included) through
# unchanged when no port is present, and that would end up in PGPORT.
DB_PORT=$(echo "$DB" | sed -nE 's|^postgresql://[^@]+@[^:/]+:([0-9]+)/.*|\1|p')
export PGUSER PGPASSWORD PGHOST PGDATABASE
if [ -n "$DB_PORT" ]; then
  export PGPORT="$DB_PORT"
fi
unset DATABASE_URL TOKEN DB DB_PORT
# ── Download + unzip ──
# Fetch the GeoNames archive and extract it in place (-o overwrites files from
# a previous run). Note that the `cd` also changes the working directory for
# the remainder of the script.
log "Downloading $GEONAMES_URL..."
curl -fsSL --max-time 120 -o "$DATA_DIR/RO.zip" "$GEONAMES_URL"
log "Unzipping..."
cd "$DATA_DIR" && unzip -o RO.zip -d "$DATA_DIR" >/dev/null
[ -s "$DATA_DIR/RO.txt" ] || { log "FATAL: RO.txt missing or empty"; exit 1; }
# ── Apply schema (idempotent) ──
# NOTE(review): idempotence is a property of the SQL file, which is not
# visible here — confirm before relying on it.
psql -v ON_ERROR_STOP=1 -f /opt/vreaudigital/services/seap-scraper/sql/014_firms_postal_codes.sql >/dev/null
# ── Stage + UPSERT into firms.postal_codes ──
log "TRUNCATE staging + COPY..."
psql -v ON_ERROR_STOP=1 -c "TRUNCATE TABLE firms.staging_postal_codes;"
# GeoNames RO.txt is loaded as tab-separated text without a header row. QUOTE
# is set to the backspace character so that no character occurring in the data
# acts as a CSV quote. The heredoc is unquoted: the shell expands $DATA_DIR and
# turns "\\copy" into "\copy"; "\t" and "\b" reach psql unchanged.
psql -v ON_ERROR_STOP=1 <<COPYEOF
\\copy firms.staging_postal_codes (country_code, postal_code, place_name, admin1_name, admin1_code, admin2_name, admin2_code, admin3_name, admin3_code, lat, lng, accuracy) FROM '$DATA_DIR/RO.txt' WITH (FORMAT csv, DELIMITER E'\t', NULL '', QUOTE E'\b', HEADER false);
COPYEOF
log "Rebuilding firms.postal_codes from staging..."
# TRUNCATE and INSERT run inside one explicit transaction. If the INSERT fails,
# ON_ERROR_STOP makes psql exit before COMMIT and the server rolls back, so the
# previous contents of firms.postal_codes survive instead of leaving the lookup
# table empty. The quoted heredoc delimiter keeps "$" in the regexes literal.
psql -v ON_ERROR_STOP=1 <<'SQL'
BEGIN;
TRUNCATE TABLE firms.postal_codes;
INSERT INTO firms.postal_codes (postal_code, place_name, county, county_code, admin2_code, admin3_code, admin3_name, lat, lng, accuracy)
SELECT
s.postal_code,
s.place_name,
NULLIF(s.admin1_name, ''),
NULLIF(s.admin1_code, ''),
NULLIF(s.admin2_code, ''),
NULLIF(s.admin3_code, ''),
NULLIF(s.admin3_name, ''),
s.lat::numeric(9,6),
s.lng::numeric(9,6),
NULLIF(s.accuracy, '')::int
FROM firms.staging_postal_codes s
-- Keep only six-digit postal codes with numeric-looking coordinates.
WHERE s.postal_code ~ '^[0-9]{6}$'
AND s.lat ~ '^-?[0-9.]+$'
AND s.lng ~ '^-?[0-9.]+$'
ON CONFLICT (postal_code, place_name) DO UPDATE
SET lat = EXCLUDED.lat, lng = EXCLUDED.lng, accuracy = EXCLUDED.accuracy;
COMMIT;
SQL

# Row count and distinct postal-code count, to stdout and the log file.
log "Stats:"
psql -At -F"|" -c "
SELECT 'postal_codes_loaded', COUNT(*) FROM firms.postal_codes UNION ALL
SELECT 'distinct_postal_codes', COUNT(DISTINCT postal_code) FROM firms.postal_codes;
" 2>&1 | tee -a "$LOG"
# ── Geocode firms.entities (chunked, deadlock-retry) ──
# Two-pass: postal first (more precise), then UAT centroid as fallback.
# Each chunk is its own psql transaction so a deadlock against the
# concurrent ANAF enrichment script aborts only the current chunk
# (caught + retried), not the entire batch's progress.
# run_chunked_update LABEL SQL
#   Runs SQL (expected to be a LIMIT-ed UPDATE) over and over, each time in its
#   own psql invocation, until psql reports "UPDATE 0".
# Globals:   LOG (appended to), log (function)
# Arguments: $1 - short label used in log lines
#            $2 - SQL text fed to psql on stdin
# Returns:   0 once a chunk updates zero rows;
#            1 after more than 8 deadlock retries in total, or on any other
#              psql failure (the tail of psql's output is logged).
run_chunked_update() {
  local label="$1"
  local sql="$2"
  local chunk_total=0 chunk_n=0 retries=0
  local out rows rc

  while :; do
    # -X skips ~/.psqlrc. psql prints the "UPDATE <n>" command tag by default;
    # it is parsed below. The exit status is captured with "|| rc=$?" so that,
    # under "set -e", a failing psql (e.g. a deadlock with ON_ERROR_STOP) does
    # not abort the whole script before the retry logic can run.
    rc=0
    out=$(psql -v ON_ERROR_STOP=1 -X 2>&1 <<<"$sql") || rc=$?

    if grep -q "deadlock detected" <<<"$out"; then
      retries=$((retries + 1))
      if [ "$retries" -gt 8 ]; then
        log "[$label] giving up after 8 deadlock retries"
        echo "$out" | tail -5 | tee -a "$LOG"
        return 1
      fi
      log "[$label] deadlock — retry #$retries in 2s"
      sleep 2
      continue
    fi

    # Any other failure: rely on psql's exit status first; the text match also
    # covers the "psql:<stdin>:N: ERROR:" form psql uses for input from stdin.
    if [ "$rc" -ne 0 ] || grep -qE '(^|: )ERROR:' <<<"$out"; then
      echo "$out" | tail -10 | tee -a "$LOG"
      return 1
    fi

    # "|| true": grep exits 1 when no command tag is present, which must not
    # trip pipefail/set -e; a missing tag is treated as zero rows.
    rows=$(grep -oE '^UPDATE [0-9]+' <<<"$out" | tail -1 | awk '{print $2}') || true
    rows=${rows:-0}
    chunk_n=$((chunk_n + 1))
    chunk_total=$((chunk_total + rows))

    if [ "$rows" = "0" ]; then
      log "[$label] done — $chunk_n chunks, $chunk_total rows"
      return 0
    fi
    log "[$label] chunk #$chunk_n: $rows rows (running total $chunk_total)"
  done
}
# Pass 1: postal-code precision. Each chunk picks up to 50000 firms that have a
# six-digit postal code present in firms.postal_codes_best and whose pin is
# either missing or came from the coarser 'uat_centroid' source, then copies
# that postal code's coordinates onto them. Updated rows get
# geocode_source='geonames_postal', so they drop out of the candidate set and
# the loop ends when a chunk updates nothing.
# The SQL is a double-quoted shell string, hence the "\$" in the regex.
log "Geocoding firms.entities by postal_code..."
run_chunked_update "postal" "
WITH cand AS (
SELECT e.cui FROM firms.entities e
WHERE e.adr_cod_postal ~ '^[0-9]{6}\$'
AND (e.geocode_source IS NULL OR e.geocode_source = 'uat_centroid')
AND EXISTS (SELECT 1 FROM firms.postal_codes_best pc WHERE pc.postal_code = e.adr_cod_postal)
ORDER BY e.cui
LIMIT 50000
)
UPDATE firms.entities e
SET
lat = pc.lat::double precision,
lng = pc.lng::double precision,
geom = ST_SetSRID(ST_MakePoint(pc.lng, pc.lat), 4326)::geography,
geocode_source = 'geonames_postal',
geocode_score = 0.6,
geocoded_at = now(),
updated_at = now()
FROM firms.postal_codes_best pc, cand
WHERE e.cui = cand.cui
AND e.adr_cod_postal = pc.postal_code;
"
# Pass 2: UAT-centroid fallback for firms that still have no geocode_source but
# do have a siruta that exists in public."GisUat".
log "Geocoding firms.entities fallback to UAT centroid..."
# public."GisUat".geom is in SRID 3844 (RO STEREO70 projected). Geography
# requires WGS84 lon/lat (4326), so ST_Transform before ::geography.
# (Inside the double-quoted SQL string the table name's quotes are escaped.)
run_chunked_update "uat" "
WITH cand AS (
SELECT e.cui FROM firms.entities e
WHERE e.siruta IS NOT NULL
AND e.geocode_source IS NULL
AND EXISTS (SELECT 1 FROM public.\"GisUat\" gu WHERE gu.siruta = e.siruta)
ORDER BY e.cui
LIMIT 50000
)
UPDATE firms.entities e
SET
lat = ST_Y(ST_Transform(ST_Centroid(gu.geom), 4326))::double precision,
lng = ST_X(ST_Transform(ST_Centroid(gu.geom), 4326))::double precision,
geom = ST_Transform(ST_Centroid(gu.geom), 4326)::geography,
geocode_source = 'uat_centroid',
geocode_score = 0.3,
geocoded_at = now(),
updated_at = now()
FROM public.\"GisUat\" gu, cand
WHERE e.cui = cand.cui
AND e.siruta = gu.siruta;
"
# Summary: totals per geocode source, pipe-separated, to stdout and the log.
log "Final stats:"
psql -At -F"|" -c "
SELECT
COUNT(*) AS total,
COUNT(*) FILTER (WHERE lat IS NOT NULL) AS cu_lat_lng,
COUNT(*) FILTER (WHERE geocode_source = 'geonames_postal') AS via_postal,
COUNT(*) FILTER (WHERE geocode_source = 'uat_centroid') AS via_uat,
COUNT(*) FILTER (WHERE geocode_source = 'photon') AS via_photon
FROM firms.entities;
" 2>&1 | tee -a "$LOG"
log "=== Postal-codes import done ==="
+51
View File
@@ -0,0 +1,51 @@
#!/bin/bash
# One-shot install of Photon 0.5.0 (last Elasticsearch-backed release) on satra.
# Photon 0.6+ uses OpenSearch and is incompatible with the country-level extracts
# graphhopper still publishes (which are ES format). Verified working 2026-05-08.
#
# After install, start as a service: see vreaudigital-photon.service in this dir.
#
# Prerequisite: the RO ES extract is already at /opt/photon/photon_data
# (downloaded by setup-photon.sh from photon-db-ro-DDMMYY.tar.bz2).
set -euo pipefail

PHOTON_DIR=/opt/photon
PHOTON_VERSION=0.5.0
JAR_URL=https://github.com/komoot/photon/releases/download/${PHOTON_VERSION}/photon-${PHOTON_VERSION}.jar
JAR_PATH="$PHOTON_DIR/photon-${PHOTON_VERSION}.jar"

# Timestamped logger (stdout only; this installer keeps no log file of its own).
log() { echo "[$(date '+%H:%M:%S')] $1"; }

log "=== Photon ${PHOTON_VERSION} install ==="

# 1. Java runtime. Installed only when no `java` is on PATH at all; an existing
#    Java of any version is left alone (the header notes 0.5 needs JDK 11+).
if ! command -v java >/dev/null 2>&1; then
  log "Installing openjdk-21-jre-headless..."
  sudo apt-get install -y openjdk-21-jre-headless
fi
java --version

# 2. Photon JAR. Downloaded to a ".part" file and renamed only after curl
#    succeeds, so an interrupted download can never leave a truncated JAR that
#    a later run would accept as "already on disk".
if [ ! -s "$JAR_PATH" ]; then
  log "Downloading photon-${PHOTON_VERSION}.jar (~38MB)..."
  sudo curl -fL -o "$JAR_PATH.part" "$JAR_URL"
  sudo mv -f "$JAR_PATH.part" "$JAR_PATH"
  sudo chown bulibasa:bulibasa "$JAR_PATH"
else
  log "JAR already on disk."
fi

# 3. Sanity-check the extract directory
if [ ! -d "$PHOTON_DIR/photon_data/elasticsearch" ]; then
  log "FATAL: $PHOTON_DIR/photon_data/elasticsearch missing — run setup-photon.sh first."
  exit 1
fi
sudo chown -R bulibasa:bulibasa "$PHOTON_DIR/photon_data"

# 4. Pre-create the log file with the ownership the service expects.
sudo touch /var/log/vreaudigital-photon.log
sudo chown bulibasa:bulibasa /var/log/vreaudigital-photon.log

# Print the follow-up commands; nothing is started by this script.
log "=== Install done. Start with: ==="
log " cd $PHOTON_DIR && nohup java -Xmx8G -jar photon-${PHOTON_VERSION}.jar -data-dir $PHOTON_DIR -listen-port 2322 </dev/null >>/var/log/vreaudigital-photon.log 2>&1 &"
log "Or install systemd unit: sudo ln -sf $PHOTON_DIR/../vreaudigital/services/seap-scraper/cron/vreaudigital-photon.service /etc/systemd/system/ && sudo systemctl enable --now vreaudigital-photon"
log "Smoke test: curl 'http://localhost:2322/api?q=Bucuresti&limit=1'"
+204
View File
@@ -0,0 +1,204 @@
#!/bin/bash
# Fuzzy-match ancom.operatori.titular_name → firms.entities.cui via the
# same Stage A (exact normalized) + Stage B (pg_trgm unique-pick) + Stage C
# (judet disambiguation) pipeline as cron/match-cui-anre.sh.
#
# Most ANCOM rows have CUI directly from the detail page (cui_match_method='direct'),
# so this is a fallback for whatever subset has titular_cui IS NULL.
#
# Idempotent — only touches rows where titular_cui IS NULL.
# No "-e" here: a failing step is logged and the script carries on with the
# remaining stages.
set -uo pipefail

LOG=/var/log/vreaudigital-cui-match-ancom.log

# Timestamped logger: echoes the message to stdout and appends it to $LOG.
log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG"; }

# Resolve DATABASE_URL via Infisical Machine Identity.
# The sourced file is expected to define the INFISICAL_* variables used below.
source /opt/vreaudigital/.infisical-mi
TOKEN=$(infisical login --method=universal-auth --domain="$INFISICAL_API_URL" \
  --client-id="$INFISICAL_CLIENT_ID" --client-secret="$INFISICAL_CLIENT_SECRET" --silent --plain)
DBURL=$(infisical run --domain="$INFISICAL_API_URL" --projectId="$INFISICAL_PROJECT_ID" \
  --env="$INFISICAL_ENV" --path="$INFISICAL_PATH" --silent --token="$TOKEN" \
  -- sh -c 'echo "$DATABASE_URL"')
# Drop a "schema=" query parameter (and a "?" left dangling by its removal).
DB=$(echo "$DBURL" | sed -E 's/[?&]schema=[^&]*//; s/\?$//')

# psql reads its connection settings from libpq environment variables.
# URL shape: postgresql://USER:PASS@HOST[:PORT]/DBNAME
PGUSER=$(echo "$DB" | sed -E 's|^postgresql://([^:]+):.*|\1|')
PGPASSWORD=$(echo "$DB" | sed -E 's|^postgresql://[^:]+:([^@]+)@.*|\1|')
PGHOST=$(echo "$DB" | sed -E 's|^postgresql://[^@]+@([^:/]+).*|\1|')
PGDATABASE=$(echo "$DB" | sed -E 's|^postgresql://[^@]+@[^/]+/([^?]+).*|\1|')
# The port is optional. "sed -n ... p" prints only when the pattern matches;
# a plain substitution would pass the whole URL (password included) through
# unchanged when no port is present, and that would end up in PGPORT.
DB_PORT=$(echo "$DB" | sed -nE 's|^postgresql://[^@]+@[^:/]+:([0-9]+)/.*|\1|p')
export PGUSER PGPASSWORD PGHOST PGDATABASE
if [ -n "$DB_PORT" ]; then
  export PGPORT="$DB_PORT"
fi
unset DBURL TOKEN DB DB_PORT
log "=== ANCOM CUI matcher started ==="
# Snapshot "<rows without CUI>/<total rows>" before matching, for the log.
BEFORE=$(psql -At -c "SELECT COUNT(*) FILTER (WHERE titular_cui IS NULL) || '/' || COUNT(*) FROM ancom.operatori;")
log "before: $BEFORE"
# Pre-step: populate titular_name_norm for all rows where it's NULL.
# Rows that already have a normalized name are not recomputed.
log "pre-step: populating titular_name_norm..."
psql -v ON_ERROR_STOP=1 <<'SQL' 2>&1 | tee -a "$LOG"
UPDATE ancom.operatori
SET titular_name_norm = firms.normalize_company_name(titular_name)
WHERE titular_name_norm IS NULL
AND titular_name IS NOT NULL;
SQL
# Stage A: exact normalized match (unique only).
# An operator row gets a CUI only when exactly one firms.entities row has the
# same normalized name (n = 1); ambiguous names are left for later stages.
# Matches are recorded with score 1.0 and method 'exact_norm'.
log "Stage A: exact normalized match..."
psql -v ON_ERROR_STOP=1 <<'SQL' 2>&1 | tee -a "$LOG"
WITH cand AS (
SELECT t.ancom_id AS row_id, t.titular_name_norm AS norm
FROM ancom.operatori t
WHERE t.titular_cui IS NULL
AND t.titular_name_norm IS NOT NULL
),
matched AS (
SELECT c.row_id, MIN(e.cui) AS cui, COUNT(*) AS n
FROM cand c
JOIN firms.entities e ON e.name_normalized = c.norm
GROUP BY c.row_id
)
UPDATE ancom.operatori t
SET titular_cui = m.cui,
cui_match_score = 1.0,
cui_match_method = 'exact_norm',
matched_at = now()
FROM matched m
WHERE t.ancom_id = m.row_id
AND t.titular_cui IS NULL
AND m.n = 1;
SQL
log "Stage A done"
# Stage B: pg_trgm fuzzy. Same SET threshold 0.7 + 0.85/0.10 accept rule
# as match-cui-external.sh.
# How it works:
#   - only still-unmatched rows with a normalized name of at least 5 characters
#     are considered; distinct names are resolved once and joined back to rows;
#   - the "%" operator (similarity threshold 0.7) collects candidate firms, and
#     the two best candidates per name are compared;
#   - a name is accepted when the best similarity is >= 0.85 and either there
#     is no runner-up or the best beats it by at least 0.10.
# Matches are recorded with the similarity as score and method 'trgm_unique'.
log "Stage B: pg_trgm fuzzy (score >= 0.85, gap >= 0.10)..."
psql -v ON_ERROR_STOP=1 <<'SQL' 2>&1 | tee -a "$LOG"
SET pg_trgm.similarity_threshold = 0.7;
CREATE TEMP TABLE _sb_rows AS
SELECT t.ancom_id AS rowid, t.titular_name_norm AS norm
FROM ancom.operatori t
WHERE t.titular_cui IS NULL
AND t.titular_name_norm IS NOT NULL
AND length(t.titular_name_norm) >= 5;
CREATE INDEX ON _sb_rows (norm);
ANALYZE _sb_rows;
CREATE TEMP TABLE _sb_norms AS SELECT DISTINCT norm FROM _sb_rows;
ANALYZE _sb_norms;
CREATE TEMP TABLE _sb_resolved AS
WITH ranked AS (
SELECT c.norm,
e.cui,
similarity(e.name_normalized, c.norm) AS sim,
ROW_NUMBER() OVER (
PARTITION BY c.norm
ORDER BY similarity(e.name_normalized, c.norm) DESC, e.cui
) AS rn
FROM _sb_norms c
JOIN firms.entities e ON e.name_normalized % c.norm
),
top2 AS (
SELECT norm,
MAX(sim) FILTER (WHERE rn = 1) AS s1,
MAX(sim) FILTER (WHERE rn = 2) AS s2,
MAX(cui) FILTER (WHERE rn = 1) AS cui1
FROM ranked WHERE rn <= 2
GROUP BY norm
)
SELECT norm, cui1, s1
FROM top2
WHERE s1 >= 0.85
AND (s2 IS NULL OR (s1 - s2) >= 0.10);
CREATE INDEX ON _sb_resolved (norm);
ANALYZE _sb_resolved;
UPDATE ancom.operatori t
SET titular_cui = r.cui1,
cui_match_score = r.s1,
cui_match_method = 'trgm_unique',
matched_at = now()
FROM _sb_rows rw
JOIN _sb_resolved r ON rw.norm = r.norm
WHERE t.ancom_id = rw.rowid
AND t.titular_cui IS NULL;
DROP TABLE _sb_rows, _sb_norms, _sb_resolved;
SQL
log "Stage B done"
# Stage C: judet disambiguation.
# For rows still unmatched that have a judet and a normalized name of at least
# 5 characters, trigram candidates ("%" with threshold 0.7) are kept only when
# the firm's normalized judet equals the row's. Per (name, judet) the candidate
# with the highest similarity wins (ties broken by lowest cui), provided the
# similarity is >= 0.7. Matches are recorded with method 'trgm_judet'.
log "Stage C: judet disambiguation..."
psql -v ON_ERROR_STOP=1 <<'SQL' 2>&1 | tee -a "$LOG"
SET pg_trgm.similarity_threshold = 0.7;
CREATE TEMP TABLE _sc_rows AS
SELECT t.ancom_id AS rowid,
t.titular_name_norm AS norm,
firms.normalize_judet(t.judet) AS judet_norm
FROM ancom.operatori t
WHERE t.titular_cui IS NULL
AND t.titular_name_norm IS NOT NULL
AND t.judet IS NOT NULL
AND length(t.titular_name_norm) >= 5;
CREATE INDEX ON _sc_rows (norm, judet_norm);
ANALYZE _sc_rows;
CREATE TEMP TABLE _sc_keys AS
SELECT DISTINCT norm, judet_norm FROM _sc_rows;
ANALYZE _sc_keys;
CREATE TEMP TABLE _sc_resolved AS
WITH ranked AS (
SELECT c.norm, c.judet_norm, e.cui,
similarity(e.name_normalized, c.norm) AS sim,
(firms.normalize_judet(e.adr_judet) = c.judet_norm) AS judet_match
FROM _sc_keys c
JOIN firms.entities e ON e.name_normalized % c.norm
),
pick AS (
SELECT DISTINCT ON (norm, judet_norm)
norm, judet_norm, cui, sim
FROM ranked
WHERE judet_match
ORDER BY norm, judet_norm, sim DESC, cui
)
SELECT * FROM pick WHERE sim >= 0.7;
CREATE INDEX ON _sc_resolved (norm, judet_norm);
ANALYZE _sc_resolved;
UPDATE ancom.operatori t
SET titular_cui = r.cui,
cui_match_score = r.sim,
cui_match_method = 'trgm_judet',
matched_at = now()
FROM _sc_rows rw
JOIN _sc_resolved r ON rw.norm = r.norm AND rw.judet_norm = r.judet_norm
WHERE t.ancom_id = rw.rowid
AND t.titular_cui IS NULL;
DROP TABLE _sc_rows, _sc_keys, _sc_resolved;
SQL
log "Stage C done"
# Snapshot "<rows without CUI>/<total> (matched N%)" after matching.
# NULLIF guards the percentage against an empty table (division by zero); in
# that case 'n/a' is shown instead of a number.
AFTER=$(psql -At -c "
SELECT COUNT(*) FILTER (WHERE titular_cui IS NULL) || '/' ||
COUNT(*) || ' (matched ' ||
COALESCE(ROUND(100.0*COUNT(*) FILTER (WHERE titular_cui IS NOT NULL) / NULLIF(COUNT(*), 0), 1)::text, 'n/a') || '%)'
FROM ancom.operatori;")
log "after: $AFTER"

# Row counts per match method, pipe-separated, largest first.
log "by method:"
psql -At -F'|' -c "
SELECT cui_match_method, COUNT(*)
FROM ancom.operatori
GROUP BY 1 ORDER BY 2 DESC NULLS LAST;" 2>&1 | tee -a "$LOG"

# Refresh the per-CUI MV now that titular_cui is populated. A CONCURRENTLY
# refresh is tried first; if it fails, a plain refresh is run instead.
log "refreshing ancom.mv_operatori_per_cui..."
psql -v ON_ERROR_STOP=1 -c "REFRESH MATERIALIZED VIEW CONCURRENTLY ancom.mv_operatori_per_cui;" \
  2>>"$LOG" \
  || psql -v ON_ERROR_STOP=1 -c "REFRESH MATERIALIZED VIEW ancom.mv_operatori_per_cui;" 2>&1 | tee -a "$LOG"

log "=== ANCOM CUI matcher done ==="
+204
View File
@@ -0,0 +1,204 @@
#!/bin/bash
# Fuzzy-match anre.licente.titular_name → firms.entities.cui via the
# same Stage A (exact normalized) + Stage B (pg_trgm unique-pick) + Stage C
# (judet disambiguation) pipeline as cron/match-cui-external.sh.
#
# Idempotent — only touches rows where titular_cui IS NULL.
#
# anre.licente has its own column names (titular_cui not cui), so we have
# a dedicated wrapper here. Same SQL approach, different column names.
set -uo pipefail

LOG=/var/log/vreaudigital-cui-match-anre.log

# log MESSAGE
# Prefix MESSAGE with a "[YYYY-MM-DD HH:MM:SS]" timestamp, print it on stdout
# and append the same line to $LOG.
log() {
  local stamp
  stamp=$(date '+%Y-%m-%d %H:%M:%S')
  printf '[%s] %s\n' "$stamp" "$1" | tee -a "$LOG"
}
# Resolve DATABASE_URL via Infisical Machine Identity.
#
# This script runs without `set -e`, so every step is checked explicitly:
# without a usable URL all later psql calls would run with no valid connection
# settings, so stop here with a clear log line instead.
#
# NOTE(review): --client-secret / --token are passed on the command line and
# are therefore visible in the process list while infisical runs — confirm
# whether the CLI version in use can take them from the environment instead.
INFISICAL_MI=/opt/vreaudigital/.infisical-mi
if [ ! -f "$INFISICAL_MI" ]; then
  log "FATAL: $INFISICAL_MI missing"
  exit 1
fi
# shellcheck disable=SC1090
source "$INFISICAL_MI"

if ! TOKEN=$(infisical login --method=universal-auth --domain="$INFISICAL_API_URL" \
  --client-id="$INFISICAL_CLIENT_ID" --client-secret="$INFISICAL_CLIENT_SECRET" --silent --plain); then
  log "FATAL: Infisical login failed"
  exit 1
fi

# `infisical run` injects the secrets into a child shell that just echoes the
# one value we need.
if ! DBURL=$(infisical run --domain="$INFISICAL_API_URL" --projectId="$INFISICAL_PROJECT_ID" \
  --env="$INFISICAL_ENV" --path="$INFISICAL_PATH" --silent --token="$TOKEN" \
  -- sh -c 'echo "$DATABASE_URL"') || [ -z "$DBURL" ]; then
  log "FATAL: could not read DATABASE_URL from Infisical"
  exit 1
fi
# pg_env_from_url URL
#
# Split a postgres connection URL into the PGUSER / PGPASSWORD / PGHOST /
# PGPORT / PGDATABASE environment variables that psql reads, so the URL (and
# its password) never has to appear on a command line.
#
# Accepted shape: postgres[ql]://user[:password]@host[:port]/dbname[?params]
#   - a missing port defaults to 5432;
#   - query parameters (e.g. ?schema=...) are ignored;
#   - values are exported verbatim (no percent-decoding is performed).
#
# Returns 0 on success. On a URL that does not match, all five variables are
# left unset, a message (without the URL) goes to stderr and 1 is returned.
# The previous sed-based parsing echoed the whole URL — password included —
# into every variable whose pattern did not match (e.g. PGPORT when the URL
# had no explicit port).
pg_env_from_url() {
  local url=${1:-}
  local -r re='^postgres(ql)?://([^:@/]+)(:([^@]*))?@([^:/?]+)(:([0-9]+))?/([^?]+)'

  unset PGUSER PGPASSWORD PGHOST PGPORT PGDATABASE
  if [[ ! "$url" =~ $re ]]; then
    printf 'pg_env_from_url: DATABASE_URL is not a usable postgresql:// URL\n' >&2
    return 1
  fi

  export PGUSER="${BASH_REMATCH[2]}"
  export PGHOST="${BASH_REMATCH[5]}"
  export PGPORT="${BASH_REMATCH[7]:-5432}"
  export PGDATABASE="${BASH_REMATCH[8]}"
  # Only export a password when the URL actually carried one.
  if [[ -n "${BASH_REMATCH[3]}" ]]; then
    export PGPASSWORD="${BASH_REMATCH[4]}"
  fi
}

pg_env_from_url "${DBURL:-}"
# Drop the raw secrets from the shell as soon as they have been split up.
unset DBURL TOKEN DB
log "=== ANRE CUI matcher started ==="

# Baseline for the run: "<rows without titular_cui>/<all rows>".
log "before: $(psql -At -c "SELECT COUNT(*) FILTER (WHERE titular_cui IS NULL) || '/' || COUNT(*) FROM anre.licente;")"

# Pre-step: fill titular_name_norm wherever it is still NULL, so the stages
# below can compare against the stored normalized name.
log "pre-step: populating titular_name_norm..."
psql -v ON_ERROR_STOP=1 <<'SQL' 2>&1 | tee -a "$LOG"
UPDATE anre.licente
SET titular_name_norm = firms.normalize_company_name(titular_name)
WHERE titular_name_norm IS NULL
AND titular_name IS NOT NULL;
SQL
# Stage A: exact normalized match (unique only).
# A row is updated only when exactly one firms.entities row has the same
# normalized name (m.n = 1); homonyms are left for the later stages.
log "Stage A: exact normalized match..."
psql -v ON_ERROR_STOP=1 <<'SQL' 2>&1 | tee -a "$LOG"
WITH cand AS (
SELECT t.id AS row_id, t.titular_name_norm AS norm
FROM anre.licente t
WHERE t.titular_cui IS NULL
AND t.titular_name_norm IS NOT NULL
),
matched AS (
SELECT c.row_id, MIN(e.cui) AS cui, COUNT(*) AS n
FROM cand c
JOIN firms.entities e ON e.name_normalized = c.norm
GROUP BY c.row_id
)
UPDATE anre.licente t
SET titular_cui = m.cui,
cui_match_score = 1.0,
cui_match_method = 'exact_norm',
matched_at = now()
FROM matched m
WHERE t.id = m.row_id
AND t.titular_cui IS NULL
AND m.n = 1;
SQL
# The script has no `set -e`, so report psql's own exit status instead of
# unconditionally claiming success.
stage_rc=${PIPESTATUS[0]}
if [ "$stage_rc" -eq 0 ]; then
  log "Stage A done"
else
  log "Stage A FAILED (psql exit $stage_rc)"
fi
# Stage B: pg_trgm fuzzy. Same SET threshold 0.7 + 0.85/0.10 accept rule
# as match-cui-external.sh.
# Pipeline: unmatched rows -> distinct norms -> top-2 trigram candidates per
# norm -> accept the best one only if its score is >= 0.85 and it beats the
# runner-up by >= 0.10 (or there is no runner-up).
log "Stage B: pg_trgm fuzzy (score >= 0.85, gap >= 0.10)..."
psql -v ON_ERROR_STOP=1 <<'SQL' 2>&1 | tee -a "$LOG"
SET pg_trgm.similarity_threshold = 0.7;
CREATE TEMP TABLE _sb_rows AS
SELECT t.id AS rowid, t.titular_name_norm AS norm
FROM anre.licente t
WHERE t.titular_cui IS NULL
AND t.titular_name_norm IS NOT NULL
AND length(t.titular_name_norm) >= 5;
CREATE INDEX ON _sb_rows (norm);
ANALYZE _sb_rows;
CREATE TEMP TABLE _sb_norms AS SELECT DISTINCT norm FROM _sb_rows;
ANALYZE _sb_norms;
CREATE TEMP TABLE _sb_resolved AS
WITH ranked AS (
SELECT c.norm,
e.cui,
similarity(e.name_normalized, c.norm) AS sim,
ROW_NUMBER() OVER (
PARTITION BY c.norm
ORDER BY similarity(e.name_normalized, c.norm) DESC, e.cui
) AS rn
FROM _sb_norms c
JOIN firms.entities e ON e.name_normalized % c.norm
),
top2 AS (
SELECT norm,
MAX(sim) FILTER (WHERE rn = 1) AS s1,
MAX(sim) FILTER (WHERE rn = 2) AS s2,
MAX(cui) FILTER (WHERE rn = 1) AS cui1
FROM ranked WHERE rn <= 2
GROUP BY norm
)
SELECT norm, cui1, s1
FROM top2
WHERE s1 >= 0.85
AND (s2 IS NULL OR (s1 - s2) >= 0.10);
CREATE INDEX ON _sb_resolved (norm);
ANALYZE _sb_resolved;
UPDATE anre.licente t
SET titular_cui = r.cui1,
cui_match_score = r.s1,
cui_match_method = 'trgm_unique',
matched_at = now()
FROM _sb_rows rw
JOIN _sb_resolved r ON rw.norm = r.norm
WHERE t.id = rw.rowid
AND t.titular_cui IS NULL;
DROP TABLE _sb_rows, _sb_norms, _sb_resolved;
SQL
# Report psql's real exit status (the script does not use `set -e`).
stage_rc=${PIPESTATUS[0]}
if [ "$stage_rc" -eq 0 ]; then
  log "Stage B done"
else
  log "Stage B FAILED (psql exit $stage_rc)"
fi
# Stage C: judet disambiguation when there are multiple trgm candidates.
# For every distinct (norm, judet) pair still unmatched, keep only trigram
# candidates whose normalized adr_judet equals the row's judet and take the
# most similar one (ties broken by lowest cui); accepted when sim >= 0.7.
log "Stage C: judet disambiguation..."
psql -v ON_ERROR_STOP=1 <<'SQL' 2>&1 | tee -a "$LOG"
SET pg_trgm.similarity_threshold = 0.7;
CREATE TEMP TABLE _sc_rows AS
SELECT t.id AS rowid,
t.titular_name_norm AS norm,
firms.normalize_judet(t.judet) AS judet_norm
FROM anre.licente t
WHERE t.titular_cui IS NULL
AND t.titular_name_norm IS NOT NULL
AND t.judet IS NOT NULL
AND length(t.titular_name_norm) >= 5;
CREATE INDEX ON _sc_rows (norm, judet_norm);
ANALYZE _sc_rows;
CREATE TEMP TABLE _sc_keys AS
SELECT DISTINCT norm, judet_norm FROM _sc_rows;
ANALYZE _sc_keys;
CREATE TEMP TABLE _sc_resolved AS
WITH ranked AS (
SELECT c.norm, c.judet_norm, e.cui,
similarity(e.name_normalized, c.norm) AS sim,
(firms.normalize_judet(e.adr_judet) = c.judet_norm) AS judet_match
FROM _sc_keys c
JOIN firms.entities e ON e.name_normalized % c.norm
),
pick AS (
SELECT DISTINCT ON (norm, judet_norm)
norm, judet_norm, cui, sim
FROM ranked
WHERE judet_match
ORDER BY norm, judet_norm, sim DESC, cui
)
SELECT * FROM pick WHERE sim >= 0.7;
CREATE INDEX ON _sc_resolved (norm, judet_norm);
ANALYZE _sc_resolved;
UPDATE anre.licente t
SET titular_cui = r.cui,
cui_match_score = r.sim,
cui_match_method = 'trgm_judet',
matched_at = now()
FROM _sc_rows rw
JOIN _sc_resolved r ON rw.norm = r.norm AND rw.judet_norm = r.judet_norm
WHERE t.id = rw.rowid
AND t.titular_cui IS NULL;
DROP TABLE _sc_rows, _sc_keys, _sc_resolved;
SQL
# Report psql's real exit status (the script does not use `set -e`).
stage_rc=${PIPESTATUS[0]}
if [ "$stage_rc" -eq 0 ]; then
  log "Stage C done"
else
  log "Stage C FAILED (psql exit $stage_rc)"
fi
# Closing summary: "<unmatched>/<total> (matched <pct>%)".
# NOTE(review): the percentage divides by COUNT(*), so the query errors on an
# empty anre.licente and the logged value is then empty.
log "after: $(psql -At -c "
SELECT COUNT(*) FILTER (WHERE titular_cui IS NULL) || '/' ||
COUNT(*) || ' (matched ' ||
ROUND(100.0*COUNT(*) FILTER (WHERE titular_cui IS NOT NULL) / COUNT(*), 1) || '%)'
FROM anre.licente;")"

# Breakdown of rows per match method, echoed and appended to the log.
log "by method:"
psql -At -F'|' -c "
SELECT cui_match_method, COUNT(*)
FROM anre.licente
GROUP BY 1 ORDER BY 2 DESC NULLS LAST;" 2>&1 | tee -a "$LOG"

# The per-CUI materialized view depends on titular_cui, so rebuild it now.
# Try the CONCURRENTLY variant first; if it fails, do a plain refresh.
log "refreshing anre.mv_licente_per_cui..."
if ! psql -v ON_ERROR_STOP=1 -c "REFRESH MATERIALIZED VIEW CONCURRENTLY anre.mv_licente_per_cui;" 2>>"$LOG"; then
  psql -v ON_ERROR_STOP=1 -c "REFRESH MATERIALIZED VIEW anre.mv_licente_per_cui;" 2>&1 | tee -a "$LOG"
fi
log "=== ANRE CUI matcher done ==="
+237
View File
@@ -0,0 +1,237 @@
#!/bin/bash
# Run CUI-matching pass over external tables that have company names
# but no CUI yet. Idempotent — only touches rows where cui IS NULL.
#
# Currently matches:
# - fonduri.beneficiar_anunt (~41K names)
# - fonduri.afir_plati (~316K distinct names)
#
# Future: ANI shareholdings, license registries, etc. — all use the same
# firms.normalize_company_name() helper from sql/019_cui_matcher.sql.
set -uo pipefail

LOG=/var/log/vreaudigital-cui-match.log

# log MESSAGE
# Emit MESSAGE with a "[YYYY-MM-DD HH:MM:SS]" prefix on stdout and append the
# same line to $LOG.
log() {
  local stamp
  stamp=$(date '+%Y-%m-%d %H:%M:%S')
  printf '[%s] %s\n' "$stamp" "$1" | tee -a "$LOG"
}
# Resolve DATABASE_URL via Infisical Machine Identity.
#
# The script has no `set -e`, so each step is verified here; continuing
# without a URL would only make every later psql call run without valid
# connection settings.
#
# NOTE(review): --client-secret / --token travel on the command line and are
# visible in the process list while infisical runs — confirm whether the CLI
# in use can read them from the environment instead.
INFISICAL_MI=/opt/vreaudigital/.infisical-mi
if [ ! -f "$INFISICAL_MI" ]; then
  log "FATAL: $INFISICAL_MI missing"
  exit 1
fi
# shellcheck disable=SC1090
source "$INFISICAL_MI"

if ! TOKEN=$(infisical login --method=universal-auth --domain="$INFISICAL_API_URL" \
  --client-id="$INFISICAL_CLIENT_ID" --client-secret="$INFISICAL_CLIENT_SECRET" --silent --plain); then
  log "FATAL: Infisical login failed"
  exit 1
fi

# `infisical run` injects the secrets into a child shell that echoes the one
# value needed here.
if ! DBURL=$(infisical run --domain="$INFISICAL_API_URL" --projectId="$INFISICAL_PROJECT_ID" \
  --env="$INFISICAL_ENV" --path="$INFISICAL_PATH" --silent --token="$TOKEN" \
  -- sh -c 'echo "$DATABASE_URL"') || [ -z "$DBURL" ]; then
  log "FATAL: could not read DATABASE_URL from Infisical"
  exit 1
fi
# pg_env_from_url URL
#
# Split a postgres connection URL into the PGUSER / PGPASSWORD / PGHOST /
# PGPORT / PGDATABASE environment variables that psql reads, so the URL (and
# its password) never has to appear on a command line.
#
# Accepted shape: postgres[ql]://user[:password]@host[:port]/dbname[?params]
#   - a missing port defaults to 5432;
#   - query parameters (e.g. ?schema=...) are ignored;
#   - values are exported verbatim (no percent-decoding is performed).
#
# Returns 0 on success. On a URL that does not match, all five variables are
# left unset, a message (without the URL) goes to stderr and 1 is returned.
# The previous sed-based parsing echoed the whole URL — password included —
# into every variable whose pattern did not match (e.g. PGPORT when the URL
# had no explicit port).
pg_env_from_url() {
  local url=${1:-}
  local -r re='^postgres(ql)?://([^:@/]+)(:([^@]*))?@([^:/?]+)(:([0-9]+))?/([^?]+)'

  unset PGUSER PGPASSWORD PGHOST PGPORT PGDATABASE
  if [[ ! "$url" =~ $re ]]; then
    printf 'pg_env_from_url: DATABASE_URL is not a usable postgresql:// URL\n' >&2
    return 1
  fi

  export PGUSER="${BASH_REMATCH[2]}"
  export PGHOST="${BASH_REMATCH[5]}"
  export PGPORT="${BASH_REMATCH[7]:-5432}"
  export PGDATABASE="${BASH_REMATCH[8]}"
  # Only export a password when the URL actually carried one.
  if [[ -n "${BASH_REMATCH[3]}" ]]; then
    export PGPASSWORD="${BASH_REMATCH[4]}"
  fi
}

pg_env_from_url "${DBURL:-}"
# Drop the raw secrets from the shell as soon as they have been split up.
unset DBURL TOKEN DB
log "=== CUI matcher started ==="

# Re-apply the matcher schema before every run. The SQL file is meant to be
# idempotent (it provides the name_normalized column and its indexes); its
# regular output is discarded, errors still reach stderr.
schema_sql=/opt/vreaudigital/services/seap-scraper/sql/019_cui_matcher.sql
psql -v ON_ERROR_STOP=1 -f "$schema_sql" >/dev/null
# _matcher_stage_result PRINTABLE STAGE RC
# Log "<STAGE> done" when RC is 0, "<STAGE> FAILED (psql exit RC)" otherwise.
_matcher_stage_result() {
  if [ "$3" -eq 0 ]; then
    log "[$1] $2 done"
  else
    log "[$1] $2 FAILED (psql exit $3)"
  fi
}

# _matcher_after_summary TABLE
# Print "<unmatched>/<total>/<matched pct>%" for TABLE on stdout.
# NOTE(review): the percentage divides by COUNT(*), so the query errors (and
# prints nothing) when TABLE is empty.
_matcher_after_summary() {
  psql -At -c "
SELECT COUNT(*) FILTER (WHERE cui IS NULL),
COUNT(*),
ROUND(100.0*COUNT(*) FILTER (WHERE cui IS NOT NULL) / COUNT(*), 1) || '%'
FROM $1;" | tr '|' '/'
}

# run_matcher TABLE NAME_COL JUDET_COL PRINTABLE [RUN_TRGM]
#
# Fill the `cui` column of TABLE by matching NAME_COL against
# firms.entities.name_normalized. Only rows with cui IS NULL are touched.
#
#   TABLE      schema-qualified table; interpolated into SQL, so callers must
#              pass trusted identifiers only
#   NAME_COL   column holding the raw company name
#   JUDET_COL  column holding the county; empty string skips Stage C
#   PRINTABLE  short label used as "[LABEL]" prefix in log lines
#   RUN_TRGM   "true" (default) runs Stages B and C; anything else stops
#              after Stage A
#
# Stages:
#   A  exact normalized-name match, accepted only when it is unique
#   B  pg_trgm: best candidate with score >= 0.85 and gap >= 0.10 to runner-up
#   C  pg_trgm restricted to candidates in the same judet, score >= 0.7
#
# Each stage logs "done" or "FAILED" according to psql's exit status; a failed
# stage does not stop the following ones. Always returns 0 on the skip path,
# otherwise the status of the final "by method" pipeline.
#
# NOTE(review): rows are addressed by ctid across several statements of one
# psql session; presumably no concurrent writer rewrites these tables while
# the matcher runs — confirm.
run_matcher() {
  local TABLE="$1"
  local NAME_COL="$2"
  local JUDET_COL="$3" # may be empty string if source has no judet
  local PRINTABLE="$4"
  local RUN_TRGM="${5:-true}" # set to "false" to skip Stages B+C
                              # (e.g. AFIR direct payments where unmatched
                              # rows are individual farmers, not companies)
  local rc

  log "[$PRINTABLE] before: $(psql -At -c "SELECT COUNT(*) FILTER (WHERE cui IS NULL), COUNT(*) FROM $TABLE;" | tr '|' '/')"

  # Stage A: exact normalized match (unique). When multiple firms share the
  # same normalized name (homonyms), we skip — Stage B + judet handles them.
  log "[$PRINTABLE] Stage A: exact normalized match..."
  psql -v ON_ERROR_STOP=1 <<SQL 2>&1 | tee -a "$LOG"
WITH cand AS (
SELECT t.ctid AS row_ctid,
firms.normalize_company_name(t.$NAME_COL) AS norm
FROM $TABLE t
WHERE t.cui IS NULL
AND t.$NAME_COL IS NOT NULL
),
matched AS (
SELECT c.row_ctid,
MIN(e.cui) AS cui,
COUNT(*) AS n
FROM cand c
JOIN firms.entities e ON e.name_normalized = c.norm
GROUP BY c.row_ctid
)
UPDATE $TABLE t
SET cui = m.cui,
cui_match_score = 1.0,
cui_match_method = 'exact_norm',
matched_at = now()
FROM matched m
WHERE t.ctid = m.row_ctid
AND t.cui IS NULL
AND m.n = 1;
SQL
  rc=${PIPESTATUS[0]}
  _matcher_stage_result "$PRINTABLE" "Stage A" "$rc"

  # Stage B: pg_trgm similarity. Picks top candidate if score ≥ 0.85 AND
  # gap to second-best ≥ 0.10 (so we know it's unambiguously the best match).
  #
  # Performance: previously O(unmatched_rows × candidate_pool) at default
  # threshold 0.3 — 30+ min on AFIR (493K rows). Three-step pipeline now:
  #   1. Materialize unmatched rows (rowid + norm) into a temp table
  #   2. DISTINCT norms → much smaller trgm input set (BEN 13K→2K, AFIR 493K→274K)
  #   3. SET pg_trgm.similarity_threshold = 0.7 so the gin `%` operator returns
  #      only candidates above the post-filter floor (drops fan-out by ~10×)
  # The 0.85/0.10 accept rule is unchanged and produces identical matches.
  if [ "$RUN_TRGM" != "true" ]; then
    log "[$PRINTABLE] Stage B/C skipped (RUN_TRGM=false) — unmatched rows in this source are individuals, not registered companies"
    log "[$PRINTABLE] after: $(_matcher_after_summary "$TABLE")"
    return 0
  fi

  log "[$PRINTABLE] Stage B: pg_trgm fuzzy (score ≥ 0.85, gap ≥ 0.10)..."
  psql -v ON_ERROR_STOP=1 <<SQL 2>&1 | tee -a "$LOG"
SET pg_trgm.similarity_threshold = 0.7;
CREATE TEMP TABLE _sb_rows AS
SELECT t.ctid AS rowid,
firms.normalize_company_name(t.$NAME_COL) AS norm
FROM $TABLE t
WHERE t.cui IS NULL
AND t.$NAME_COL IS NOT NULL
AND length(firms.normalize_company_name(t.$NAME_COL)) >= 5;
CREATE INDEX ON _sb_rows (norm);
ANALYZE _sb_rows;
CREATE TEMP TABLE _sb_norms AS SELECT DISTINCT norm FROM _sb_rows;
ANALYZE _sb_norms;
CREATE TEMP TABLE _sb_resolved AS
WITH ranked AS (
SELECT c.norm,
e.cui,
similarity(e.name_normalized, c.norm) AS sim,
ROW_NUMBER() OVER (
PARTITION BY c.norm
ORDER BY similarity(e.name_normalized, c.norm) DESC, e.cui
) AS rn
FROM _sb_norms c
JOIN firms.entities e ON e.name_normalized % c.norm
),
top2 AS (
SELECT norm,
MAX(sim) FILTER (WHERE rn = 1) AS s1,
MAX(sim) FILTER (WHERE rn = 2) AS s2,
MAX(cui) FILTER (WHERE rn = 1) AS cui1
FROM ranked WHERE rn <= 2
GROUP BY norm
)
SELECT norm, cui1, s1
FROM top2
WHERE s1 >= 0.85
AND (s2 IS NULL OR (s1 - s2) >= 0.10);
CREATE INDEX ON _sb_resolved (norm);
ANALYZE _sb_resolved;
UPDATE $TABLE t
SET cui = r.cui1,
cui_match_score = r.s1,
cui_match_method = 'trgm_unique',
matched_at = now()
FROM _sb_rows rw
JOIN _sb_resolved r ON rw.norm = r.norm
WHERE t.ctid = rw.rowid
AND t.cui IS NULL;
DROP TABLE _sb_rows, _sb_norms, _sb_resolved;
SQL
  rc=${PIPESTATUS[0]}
  _matcher_stage_result "$PRINTABLE" "Stage B" "$rc"

  # Stage C: judet disambiguation when source has a judet column.
  # Multiple candidates above 0.7 → prefer the one whose adr_judet matches.
  # Same dedup-by-(norm,judet) + SET threshold pipeline as Stage B.
  if [ -n "$JUDET_COL" ]; then
    log "[$PRINTABLE] Stage C: judet disambiguation..."
    psql -v ON_ERROR_STOP=1 <<SQL 2>&1 | tee -a "$LOG"
SET pg_trgm.similarity_threshold = 0.7;
CREATE TEMP TABLE _sc_rows AS
SELECT t.ctid AS rowid,
firms.normalize_company_name(t.$NAME_COL) AS norm,
firms.normalize_judet(t.$JUDET_COL) AS judet_norm
FROM $TABLE t
WHERE t.cui IS NULL
AND t.$NAME_COL IS NOT NULL
AND t.$JUDET_COL IS NOT NULL
AND length(firms.normalize_company_name(t.$NAME_COL)) >= 5;
CREATE INDEX ON _sc_rows (norm, judet_norm);
ANALYZE _sc_rows;
CREATE TEMP TABLE _sc_keys AS
SELECT DISTINCT norm, judet_norm FROM _sc_rows;
ANALYZE _sc_keys;
CREATE TEMP TABLE _sc_resolved AS
WITH ranked AS (
SELECT c.norm,
c.judet_norm,
e.cui,
similarity(e.name_normalized, c.norm) AS sim,
(firms.normalize_judet(e.adr_judet) = c.judet_norm) AS judet_match
FROM _sc_keys c
JOIN firms.entities e ON e.name_normalized % c.norm
),
pick AS (
SELECT DISTINCT ON (norm, judet_norm)
norm, judet_norm, cui, sim
FROM ranked
WHERE judet_match
ORDER BY norm, judet_norm, sim DESC, cui
)
SELECT * FROM pick WHERE sim >= 0.7;
CREATE INDEX ON _sc_resolved (norm, judet_norm);
ANALYZE _sc_resolved;
UPDATE $TABLE t
SET cui = r.cui,
cui_match_score = r.sim,
cui_match_method = 'trgm_judet',
matched_at = now()
FROM _sc_rows rw
JOIN _sc_resolved r
ON rw.norm = r.norm AND rw.judet_norm = r.judet_norm
WHERE t.ctid = rw.rowid
AND t.cui IS NULL;
DROP TABLE _sc_rows, _sc_keys, _sc_resolved;
SQL
    rc=${PIPESTATUS[0]}
    _matcher_stage_result "$PRINTABLE" "Stage C" "$rc"
  fi

  log "[$PRINTABLE] after: $(_matcher_after_summary "$TABLE")"
  log "[$PRINTABLE] by method:"
  psql -At -F'|' -c "
SELECT cui_match_method, COUNT(*)
FROM $TABLE
GROUP BY 1 ORDER BY 2 DESC NULLS LAST;" 2>&1 | tee -a "$LOG"
}
# Private beneficiaries: all three stages, with beneficiar_judet used for the
# Stage C disambiguation.
run_matcher \
  "fonduri.beneficiar_anunt" \
  "beneficiar_name" \
  "beneficiar_judet" \
  "BEN_PRIVAT" \
  true

# AFIR: Stage A only. What remains unmatched are individual farmers (popa
# gheorghe, radu vasile, …) receiving FEADR direct payments; they have no CUI
# and never appear in firms.entities (private company registry). Trigram
# matching 274K distinct names against 4M entities would take 30+ hours for
# ~0 gain. The "localitate" argument is never read on this path, because
# run_matcher returns before Stage C when RUN_TRGM is not "true".
run_matcher \
  "fonduri.afir_plati" \
  "beneficiar_name" \
  "localitate" \
  "AFIR" \
  false

log "=== CUI matcher done ==="
+79
View File
@@ -0,0 +1,79 @@
#!/bin/bash
# Nightly refresh of seap materialized views.
# Run from satra cron at 04:00 — peak DB idle window.
#
# Sources DATABASE_URL via Infisical Machine Identity (same as the
# vreaudigital container). Never echoes the value.
set -euo pipefail

LOG=/var/log/vreaudigital-mvs.log

# log MESSAGE
# Write MESSAGE, prefixed with "[YYYY-MM-DD HH:MM:SS]", to stdout and append
# the same line to $LOG.
log() {
  local stamp
  stamp=$(date '+%Y-%m-%d %H:%M:%S')
  printf '[%s] %s\n' "$stamp" "$1" | tee -a "$LOG"
}
log "=== Materialized view refresh started ==="

# The Machine Identity file supplies the INFISICAL_* variables used below.
[[ -f /opt/vreaudigital/.infisical-mi ]] || {
  log "FATAL: /opt/vreaudigital/.infisical-mi missing"
  exit 1
}
# shellcheck disable=SC1091
. /opt/vreaudigital/.infisical-mi

# Step 1: exchange the client id/secret for a short-lived access token.
login_args=(
  --method=universal-auth
  --domain="$INFISICAL_API_URL"
  --client-id="$INFISICAL_CLIENT_ID"
  --client-secret="$INFISICAL_CLIENT_SECRET"
  --silent --plain
)
TOKEN=$(infisical login "${login_args[@]}")

# Step 2: let `infisical run` inject the secrets into a child shell that
# echoes just DATABASE_URL back to us.
run_args=(
  --domain="$INFISICAL_API_URL"
  --projectId="$INFISICAL_PROJECT_ID"
  --env="$INFISICAL_ENV"
  --path="$INFISICAL_PATH"
  --silent --token="$TOKEN"
)
DATABASE_URL=$(infisical run "${run_args[@]}" -- sh -c 'echo "$DATABASE_URL"')
# Parse URL into PG* env vars and discard URL — psql with the URL on the command
# line leaks the password to anyone running `ps aux` (incident 2026-05-07).

# pg_env_from_url URL
#
# Split a postgres connection URL into PGUSER / PGPASSWORD / PGHOST / PGPORT /
# PGDATABASE and export them for psql.
#
# Accepted shape: postgres[ql]://user[:password]@host[:port]/dbname[?params]
#   - a missing port defaults to 5432;
#   - query parameters (e.g. ?schema=...) are ignored;
#   - values are exported verbatim (no percent-decoding is performed).
#
# Returns 0 on success. On a URL that does not match, all five variables are
# left unset, a message (without the URL) goes to stderr and 1 is returned —
# which aborts this script, since it runs under `set -e`.
# The previous sed-based parsing echoed the whole URL — password included —
# into every variable whose pattern did not match (e.g. PGPORT when the URL
# had no explicit port).
pg_env_from_url() {
  local url=${1:-}
  local -r re='^postgres(ql)?://([^:@/]+)(:([^@]*))?@([^:/?]+)(:([0-9]+))?/([^?]+)'

  unset PGUSER PGPASSWORD PGHOST PGPORT PGDATABASE
  if [[ ! "$url" =~ $re ]]; then
    printf 'pg_env_from_url: DATABASE_URL is not a usable postgresql:// URL\n' >&2
    return 1
  fi

  export PGUSER="${BASH_REMATCH[2]}"
  export PGHOST="${BASH_REMATCH[5]}"
  export PGPORT="${BASH_REMATCH[7]:-5432}"
  export PGDATABASE="${BASH_REMATCH[8]}"
  # Only export a password when the URL actually carried one.
  if [[ -n "${BASH_REMATCH[3]}" ]]; then
    export PGPASSWORD="${BASH_REMATCH[4]}"
  fi
}

pg_env_from_url "${DATABASE_URL:-}"
# Drop the raw secrets from the shell as soon as they have been split up.
unset DATABASE_URL TOKEN DB
# Run every refresh in one psql session. ON_ERROR_STOP=1 makes the first
# failing statement end the session, and under `set -e` + pipefail that also
# ends this script before the closing log line.
started_at=$(date +%s)
psql -v ON_ERROR_STOP=1 <<'SQL' 2>&1 | tee -a "$LOG"
\timing on
REFRESH MATERIALIZED VIEW CONCURRENTLY seap.uat_procurement_stats;
REFRESH MATERIALIZED VIEW CONCURRENTLY seap.uat_kpi;
REFRESH MATERIALIZED VIEW CONCURRENTLY seap.mv_authority_concentration;
REFRESH MATERIALIZED VIEW CONCURRENTLY seap.mv_cpv_median_value;
REFRESH MATERIALIZED VIEW CONCURRENTLY seap.mv_top_cpv_divisions;
REFRESH MATERIALIZED VIEW CONCURRENTLY seap.mv_top_suppliers;
REFRESH MATERIALIZED VIEW CONCURRENTLY seap.mv_top_authorities;
REFRESH MATERIALIZED VIEW CONCURRENTLY seap.mv_recurrent_pairs;
REFRESH MATERIALIZED VIEW CONCURRENTLY seap.mv_supplier_cpv_share;
-- Cross-source MVs (added 2026-05-11 after backfills)
REFRESH MATERIALIZED VIEW CONCURRENTLY cnsc.mv_per_authority_cui;
REFRESH MATERIALIZED VIEW CONCURRENTLY cnsc.mv_per_contestator_cui;
REFRESH MATERIALIZED VIEW CONCURRENTLY anre.mv_licente_per_cui;
REFRESH MATERIALIZED VIEW CONCURRENTLY ancom.mv_operatori_per_cui;
REFRESH MATERIALIZED VIEW CONCURRENTLY asf.mv_entitati_per_cui;
REFRESH MATERIALIZED VIEW CONCURRENTLY aaas.mv_per_cui;
-- Red-flags KPI snapshot (043_red_flags_kpi_snapshot.sql)
SELECT public_kpi.refresh_red_flags_counts();
-- Red-flags previews snapshot (044_red_flags_previews_snapshot.sql) — top-5
-- rows per recipe; landing reads as a single SELECT instead of awaiting 14
-- live cross-source queries (~17s → ~5ms).
SELECT public_kpi.refresh_red_flags_previews();
-- Cauta default-browse facets+totals snapshot (046) — short-circuits the 6
-- parallel facet aggregates when no filter is set (~1.9s → ~50ms).
SELECT public_kpi.refresh_cauta_defaults();
SQL
# Wall-clock duration of the whole refresh session, in whole seconds.
elapsed=$(( $(date +%s) - started_at ))
log "=== Done in ${elapsed}s ==="
+87
View File
@@ -0,0 +1,87 @@
#!/bin/bash
# AAAS — Autoritatea pentru Administrarea Activelor Statului.
# Scrapes the AAAS portfolio of state-owned companies from
# https://www.aaas.gov.ro/.../1-9-3-companii-sub-autoritatea-aaas/.
#
# Mirrors scrape-anre.sh / scrape-bugetar.sh pattern: Infisical Machine
# Identity → env-file → docker run --env-file (NEVER -e $VAR), file deleted
# post-launch.
#
# Idempotent (UPSERT on cui PK). Safe to run from cron.
#
# AAAS publishes ~12 active-portfolio companies as of 2026-05-10. The
# "vânzări acțiuni" + "valorificare creanțe" sections are under construction;
# the scraper logs their state but produces no rows from them yet.
#
# Env knobs:
# LIMIT=0 (default: 0 = full = all 12)
#
# Run:
# sudo /opt/vreaudigital/services/seap-scraper/cron/scrape-aaas.sh
# sudo LIMIT=3 /opt/vreaudigital/services/seap-scraper/cron/scrape-aaas.sh # smoke
set -euo pipefail

# LIMIT: optional row cap taken from the environment; defaults to 0.
: "${LIMIT:=0}"
LOG=/var/log/vreaudigital-aaas.log

# log MESSAGE
# Print MESSAGE with a "[YYYY-MM-DD HH:MM:SS]" prefix and append it to $LOG.
log() {
  local stamp
  stamp=$(date '+%Y-%m-%d %H:%M:%S')
  printf '[%s] %s\n' "$stamp" "$1" | tee -a "$LOG"
}
log "=== AAAS scrape started (limit=$LIMIT) ==="

# Skip this tick while a previous container is still running. The name list is
# captured first and matched afterwards: with `pipefail`, piping `docker ps`
# straight into `grep -q` can report failure when grep exits early.
running=$(docker ps --filter name=vreaudigital-aaas --format '{{.Names}}') || {
  log "FATAL: docker ps failed"
  exit 1
}
if grep -qx 'vreaudigital-aaas' <<<"$running"; then
  log "WARN: vreaudigital-aaas already running, skipping this tick"
  exit 0
fi
# Remove a stopped leftover container of the same name, if any.
docker rm -f vreaudigital-aaas 2>/dev/null || true

# ── Fetch DATABASE_URL via Infisical Machine Identity ──
source /opt/vreaudigital/.infisical-mi
TOKEN=$(infisical login --method=universal-auth \
  --domain="$INFISICAL_API_URL" \
  --client-id="$INFISICAL_CLIENT_ID" \
  --client-secret="$INFISICAL_CLIENT_SECRET" \
  --silent --plain)

umask 077
ENVF=$(mktemp /tmp/.vreaudigital-aaas-env.XXXXXX)
# The env file holds the database credentials: remove it on every exit path,
# including a `set -e` abort before the container is launched.
trap 'rm -f -- "$ENVF"' EXIT

DBURL=$(infisical secrets get DATABASE_URL \
  --domain="$INFISICAL_API_URL" \
  --projectId="$INFISICAL_PROJECT_ID" \
  --env="$INFISICAL_ENV" --path="$INFISICAL_PATH" \
  --token="$TOKEN" --plain --silent)
if [ -z "$DBURL" ]; then
  log "FATAL: DATABASE_URL came back empty from Infisical"
  exit 1
fi
printf 'DATABASE_URL=%s\n' "$DBURL" > "$ENVF"
unset DBURL TOKEN
cd /opt/vreaudigital/services/seap-scraper

# First run on a fresh checkout: install the node deps inside a throw-away
# container, as the invoking user so file ownership stays correct.
if [ ! -d node_modules/tsx ]; then
  log "Installing seap-scraper deps..."
  docker run --rm -v "$(pwd):/work" -w /work --user "$(id -u):$(id -g)" \
    node:22-alpine npm install --omit=optional 2>&1 | tee -a "$LOG" >/dev/null
fi

# Optional scraper arguments, kept in an array so nothing is word-split.
# A non-numeric LIMIT makes the test fail quietly and is ignored.
EXTRA_ARGS=()
if [ "$LIMIT" -gt 0 ] 2>/dev/null; then
  EXTRA_ARGS+=("--limit=$LIMIT")
fi

# Launch detached; secrets reach the container only through --env-file.
CID=$(docker run -d \
  --name vreaudigital-aaas \
  --network host \
  --env-file "$ENVF" \
  -v "$(pwd):/work" \
  -w /work \
  --user "$(id -u):$(id -g)" \
  --restart no \
  node:22-alpine \
  npx tsx src/scrape-aaas.ts "${EXTRA_ARGS[@]}")
log "container started: $CID"

# Give the container a moment, then delete the credentials file.
sleep 3
rm -f "$ENVF"
log "envfile cleaned"

# Block until the scraper finishes, then record its exit code and log tail.
docker wait vreaudigital-aaas >/dev/null
EXIT_CODE=$(docker inspect -f '{{.State.ExitCode}}' vreaudigital-aaas 2>/dev/null || echo "?")
docker logs vreaudigital-aaas 2>&1 | tail -25 | tee -a "$LOG"
log "=== AAAS scrape done (exit=$EXIT_CODE) ==="

# `exit "?"` is a bash error ("numeric argument required"); map an unknown
# container status to a plain failure instead.
case "$EXIT_CODE" in
  '' | *[!0-9]*) exit 1 ;;
  *) exit "$EXIT_CODE" ;;
esac
+82
View File
@@ -0,0 +1,82 @@
#!/bin/bash
# AEP donatii scraper — runs scrape-aep-donatii.ts in a node:22-alpine container.
# Mirrors enrich-anaf.sh / scrape-regas.sh: Infisical Machine Identity → env-file
# → docker run --env-file (NEVER -e $VAR), file deleted post-launch.
#
# Idempotent (uses ON CONFLICT (source_hash) DO UPDATE). Safe to run from cron.
#
# Args via env:
# TABLE=pj|pf|rvc|all (default: all — fetches all 3 datasets sequentially)
# LIMIT=<int> (default: 0 = no limit)
set -euo pipefail

# Run-time knobs from the environment, with their defaults.
: "${TABLE:=all}"
: "${LIMIT:=0}"
LOG=/var/log/vreaudigital-aep.log

# log MESSAGE
# Print MESSAGE with a "[YYYY-MM-DD HH:MM:SS]" prefix and append it to $LOG.
log() {
  local stamp
  stamp=$(date '+%Y-%m-%d %H:%M:%S')
  printf '[%s] %s\n' "$stamp" "$1" | tee -a "$LOG"
}
log "=== AEP donatii scrape started (table=$TABLE limit=$LIMIT) ==="

# Skip this tick while a previous container is still running. The name list is
# captured first and matched afterwards: with `pipefail`, piping `docker ps`
# straight into `grep -q` can report failure when grep exits early.
running=$(docker ps --filter name=vreaudigital-aep --format '{{.Names}}') || {
  log "FATAL: docker ps failed"
  exit 1
}
if grep -qx 'vreaudigital-aep' <<<"$running"; then
  log "WARN: vreaudigital-aep already running, skipping this tick"
  exit 0
fi
# Remove a stopped leftover container of the same name, if any.
docker rm -f vreaudigital-aep 2>/dev/null || true

# ── Fetch DATABASE_URL via Infisical Machine Identity ──
source /opt/vreaudigital/.infisical-mi
TOKEN=$(infisical login --method=universal-auth \
  --domain="$INFISICAL_API_URL" \
  --client-id="$INFISICAL_CLIENT_ID" \
  --client-secret="$INFISICAL_CLIENT_SECRET" \
  --silent --plain)

umask 077
ENVF=$(mktemp /tmp/.vreaudigital-env.XXXXXX)
# The env file holds the database credentials: remove it on every exit path,
# including a `set -e` abort before the container is launched.
trap 'rm -f -- "$ENVF"' EXIT

DBURL=$(infisical secrets get DATABASE_URL \
  --domain="$INFISICAL_API_URL" \
  --projectId="$INFISICAL_PROJECT_ID" \
  --env="$INFISICAL_ENV" --path="$INFISICAL_PATH" \
  --token="$TOKEN" --plain --silent)
if [ -z "$DBURL" ]; then
  log "FATAL: DATABASE_URL came back empty from Infisical"
  exit 1
fi
printf 'DATABASE_URL=%s\n' "$DBURL" > "$ENVF"
unset DBURL TOKEN
# ── Launch detached docker container ──
cd /opt/vreaudigital/services/seap-scraper

# First run on a fresh checkout: install the node deps inside a throw-away
# container, as the invoking user so file ownership stays correct.
if [ ! -d node_modules/tsx ]; then
  log "Installing seap-scraper deps..."
  docker run --rm -v "$(pwd):/work" -w /work --user "$(id -u):$(id -g)" \
    node:22-alpine npm install --omit=optional 2>&1 | tee -a "$LOG" >/dev/null
fi

# --limit is only passed when LIMIT differs from the "0" default.
EXTRA_ARGS=()
if [ "$LIMIT" != "0" ]; then
  EXTRA_ARGS+=("--limit=$LIMIT")
fi

# Secrets reach the container only through --env-file.
CID=$(docker run -d \
  --name vreaudigital-aep \
  --network host \
  --env-file "$ENVF" \
  -v "$(pwd):/work" \
  -w /work \
  --user "$(id -u):$(id -g)" \
  --restart no \
  node:22-alpine \
  npx tsx src/scrape-aep-donatii.ts \
  --table="$TABLE" \
  "${EXTRA_ARGS[@]}")
log "container started: $CID"

# Give the container a moment, then delete the credentials file.
sleep 3
rm -f "$ENVF"
log "envfile cleaned"

# Block until the scraper finishes, record its exit code and log tail, then
# remove the finished container.
docker wait vreaudigital-aep >/dev/null
EXIT_CODE=$(docker inspect -f '{{.State.ExitCode}}' vreaudigital-aep 2>/dev/null || echo "?")
docker logs vreaudigital-aep 2>&1 | tail -20 | tee -a "$LOG"
docker rm -f vreaudigital-aep 2>/dev/null || true
log "=== AEP donatii scrape done (exit=$EXIT_CODE) ==="

# `exit "?"` is a bash error ("numeric argument required"); map an unknown
# container status to a plain failure instead.
case "$EXIT_CODE" in
  '' | *[!0-9]*) exit 1 ;;
  *) exit "$EXIT_CODE" ;;
esac
+125
View File
@@ -0,0 +1,125 @@
#!/bin/bash
# ANAF datornici — LIVE scraper wrapper (Cloudflare Turnstile via 2captcha).
#
# Mirrors scrape-cnsc.sh / scrape-anaf-datornici.sh pattern but runs a Python
# script (not TSX) because the live scraper uses requests + psycopg2 and shares
# nothing with the data.gov.ro one-shot TS importer.
#
# Infisical Machine Identity → env-file (DATABASE_URL + TWOCAPTCHA_KEY) →
# docker run --env-file (NEVER -e $VAR), file deleted post-launch.
#
# Idempotent (UPSERT on cui+publication_date). Designed to be triggered
# quarterly by vreaudigital-anaf-datornici.timer.
#
# ⚠️ COST: each run spends real money via 2captcha (~$0.50-3 per quarterly
# tick, ~$60-100 one-time for 10-year backfill). Do NOT enable the systemd
# timer until TWOCAPTCHA_KEY is funded — see HANDOFF-anaf-datornici-2captcha.md.
#
# Env knobs:
# DRY_RUN=1 — parse-only, zero spend, zero DB writes.
# BACKFILL_FROM=2016-Q1 — iterate from quarter X through current.
# CATEGORIES=mari,mijlocii — subset of {mari,mijlocii,mici,institutii_publice,persoane_fizice}.
# INCLUDE_LISTA_ALBA=1 — also scrape anaf.lista_alba (separate endpoint).
#
# Run:
# sudo /opt/vreaudigital/services/seap-scraper/cron/scrape-anaf-datornici-live.sh
# sudo DRY_RUN=1 /opt/vreaudigital/services/seap-scraper/cron/scrape-anaf-datornici-live.sh
# sudo BACKFILL_FROM=2016-Q1 INCLUDE_LISTA_ALBA=1 /opt/.../scrape-anaf-datornici-live.sh
set -euo pipefail

# Run-time knobs from the environment, with their defaults (empty = not set).
: "${DRY_RUN:=0}"
: "${BACKFILL_FROM:=}"
: "${CATEGORIES:=}"
: "${INCLUDE_LISTA_ALBA:=0}"
LOG=/var/log/vreaudigital-anaf-datornici.log

# log MESSAGE
# Print MESSAGE with a "[YYYY-MM-DD HH:MM:SS]" prefix and append it to $LOG.
log() {
  local stamp
  stamp=$(date '+%Y-%m-%d %H:%M:%S')
  printf '[%s] %s\n' "$stamp" "$1" | tee -a "$LOG"
}
log "=== ANAF datornici LIVE scrape started (dry_run=$DRY_RUN backfill=$BACKFILL_FROM lista_alba=$INCLUDE_LISTA_ALBA) ==="

# Skip this tick while a previous container is still running. The name list is
# captured first and matched afterwards: with `pipefail`, piping `docker ps`
# straight into `grep -q` can report failure when grep exits early.
running=$(docker ps --filter name=vreaudigital-anaf-datornici-live --format '{{.Names}}') || {
  log "FATAL: docker ps failed"
  exit 1
}
if grep -qx 'vreaudigital-anaf-datornici-live' <<<"$running"; then
  log "WARN: vreaudigital-anaf-datornici-live already running, skipping this tick"
  exit 0
fi
# Remove a stopped leftover container of the same name, if any.
docker rm -f vreaudigital-anaf-datornici-live 2>/dev/null || true

# ── Fetch DATABASE_URL + TWOCAPTCHA_KEY via Infisical Machine Identity ──
source /opt/vreaudigital/.infisical-mi
TOKEN=$(infisical login --method=universal-auth \
  --domain="$INFISICAL_API_URL" \
  --client-id="$INFISICAL_CLIENT_ID" \
  --client-secret="$INFISICAL_CLIENT_SECRET" \
  --silent --plain)

umask 077
ENVF=$(mktemp /tmp/.vreaudigital-anaf-datornici-live-env.XXXXXX)
# The env file holds credentials: remove it on every exit path, including a
# `set -e` abort before the container is launched.
trap 'rm -f -- "$ENVF"' EXIT

DBURL=$(infisical secrets get DATABASE_URL \
  --domain="$INFISICAL_API_URL" \
  --projectId="$INFISICAL_PROJECT_ID" \
  --env="$INFISICAL_ENV" --path="$INFISICAL_PATH" \
  --token="$TOKEN" --plain --silent)
if [ -z "$DBURL" ]; then
  log "FATAL: DATABASE_URL came back empty from Infisical"
  exit 1
fi
printf 'DATABASE_URL=%s\n' "$DBURL" > "$ENVF"
unset DBURL

# TWOCAPTCHA_KEY: required unless DRY_RUN=1. If missing, abort with a clear
# pointer to the handoff doc — DO NOT silently run (would still hit ANAF page).
if [ "$DRY_RUN" != "1" ]; then
  # Try primary path first ($INFISICAL_PATH = /vreaudigital), fall back to root.
  # Some users add TWOCAPTCHA_KEY at root path / (less project-namespaced).
  for try_path in "$INFISICAL_PATH" "/"; do
    # A lookup failure is tolerated here; the emptiness check below decides.
    TWOCAPTCHA_KEY=$(infisical secrets get TWOCAPTCHA_KEY \
      --domain="$INFISICAL_API_URL" \
      --projectId="$INFISICAL_PROJECT_ID" \
      --env="$INFISICAL_ENV" --path="$try_path" \
      --token="$TOKEN" --plain --silent 2>/dev/null || true)
    [ -n "${TWOCAPTCHA_KEY:-}" ] && break
  done
  if [ -z "${TWOCAPTCHA_KEY:-}" ]; then
    log "ERROR: TWOCAPTCHA_KEY missing in Infisical (checked $INFISICAL_PATH + /) — see HANDOFF-anaf-datornici-2captcha.md"
    log " Add via: NEW SECRET PROTOCOL (Infisical, either path /vreaudigital or /)"
    rm -f "$ENVF"
    exit 3
  fi
  printf 'TWOCAPTCHA_KEY=%s\n' "$TWOCAPTCHA_KEY" >> "$ENVF"
  unset TWOCAPTCHA_KEY
fi
unset TOKEN
# Pass-through env knobs: appended to the env file so the Python scraper sees
# them. Optional knobs are written only when set.
{
  printf 'DRY_RUN=%s\n' "$DRY_RUN"
  if [ -n "$BACKFILL_FROM" ]; then
    printf 'BACKFILL_FROM=%s\n' "$BACKFILL_FROM"
  fi
  if [ -n "$CATEGORIES" ]; then
    printf 'CATEGORIES=%s\n' "$CATEGORIES"
  fi
  if [ "$INCLUDE_LISTA_ALBA" = "1" ]; then
    printf 'INCLUDE_LISTA_ALBA=1\n'
  fi
  printf 'ANAF_DATORNICI_LOG=/work/.log/anaf-datornici.log\n'
} >> "$ENVF"

cd /opt/vreaudigital/services/seap-scraper
# Ensure /work/.log is writable inside container (host bind-mount); the
# Python process also tees to stdout → docker logs → journald.
mkdir -p .log

# Launch detached; the Python deps are installed inside the container on each
# run, then the scraper starts. Secrets arrive only through --env-file.
CID=$(docker run -d \
  --name vreaudigital-anaf-datornici-live \
  --network host \
  --env-file "$ENVF" \
  -v "$(pwd):/work" \
  -w /work \
  --user "$(id -u):$(id -g)" \
  --restart no \
  python:3.12-slim \
  bash -c "pip install --quiet --no-cache-dir psycopg2-binary requests && python3 scrapers/anaf_datornici/scraper.py")
log "container started: $CID"

# Give the container a moment, then delete the credentials file.
sleep 3
rm -f "$ENVF"
log "envfile cleaned"

# Block until the scraper finishes, then record its exit code and log tail.
docker wait vreaudigital-anaf-datornici-live >/dev/null
EXIT_CODE=$(docker inspect -f '{{.State.ExitCode}}' vreaudigital-anaf-datornici-live 2>/dev/null || echo "?")
docker logs vreaudigital-anaf-datornici-live 2>&1 | tail -30 | tee -a "$LOG"
log "=== ANAF datornici LIVE scrape done (exit=$EXIT_CODE) ==="

# `exit "?"` is a bash error ("numeric argument required"); map an unknown
# container status to a plain failure instead.
case "$EXIT_CODE" in
  '' | *[!0-9]*) exit 1 ;;
  *) exit "$EXIT_CODE" ;;
esac
+84
View File
@@ -0,0 +1,84 @@
#!/bin/bash
# ANAF datornici scraper — runs scrape-anaf-datornici.ts in node:22-alpine.
# Mirrors enrich-anaf.sh / scrape-regas.sh pattern: Infisical Machine Identity
# → env-file → docker run --env-file (NEVER -e $VAR), file deleted post-launch.
#
# Default source: data.gov.ro Q1-2016 snapshot (only public bulk source available;
# anaf.ro/restante/ live is CAPTCHA-blocked — see ANAF-DATORNICI-RECIPES.md).
#
# Idempotent (uses ON CONFLICT (cui, publication_date) DO UPDATE). Safe to run
# from cron, but in practice this is a one-shot until live scraping unlocks.
set -euo pipefail

# Run-time knobs from the environment, with their defaults.
: "${SOURCE:=datagov2016}"
: "${DRY_RUN:=0}"
LOG=/var/log/vreaudigital-anaf-datornici.log

# log MESSAGE
# Print MESSAGE with a "[YYYY-MM-DD HH:MM:SS]" prefix and append it to $LOG.
log() {
  local stamp
  stamp=$(date '+%Y-%m-%d %H:%M:%S')
  printf '[%s] %s\n' "$stamp" "$1" | tee -a "$LOG"
}
log "=== ANAF datornici scrape started (source=$SOURCE dry-run=$DRY_RUN) ==="

# Skip this tick while a previous container is still running. `--filter name=`
# may also list other containers, so the name is matched as a whole line. The
# list is captured first: with `pipefail`, piping `docker ps` straight into
# `grep -q` can report failure when grep exits early.
running=$(docker ps --filter name=vreaudigital-anaf-datornici --format '{{.Names}}') || {
  log "FATAL: docker ps failed"
  exit 1
}
if grep -qx 'vreaudigital-anaf-datornici' <<<"$running"; then
  log "WARN: vreaudigital-anaf-datornici already running, skipping this tick"
  exit 0
fi
# Remove a stopped leftover container of the same name, if any.
docker rm -f vreaudigital-anaf-datornici 2>/dev/null || true

# ── Fetch DATABASE_URL via Infisical Machine Identity ──
source /opt/vreaudigital/.infisical-mi
TOKEN=$(infisical login --method=universal-auth \
  --domain="$INFISICAL_API_URL" \
  --client-id="$INFISICAL_CLIENT_ID" \
  --client-secret="$INFISICAL_CLIENT_SECRET" \
  --silent --plain)

umask 077
ENVF=$(mktemp /tmp/.vreaudigital-env.XXXXXX)
# The env file holds the database credentials: remove it on every exit path,
# including a `set -e` abort before the container is launched.
trap 'rm -f -- "$ENVF"' EXIT

DBURL=$(infisical secrets get DATABASE_URL \
  --domain="$INFISICAL_API_URL" \
  --projectId="$INFISICAL_PROJECT_ID" \
  --env="$INFISICAL_ENV" --path="$INFISICAL_PATH" \
  --token="$TOKEN" --plain --silent)
if [ -z "$DBURL" ]; then
  log "FATAL: DATABASE_URL came back empty from Infisical"
  exit 1
fi
printf 'DATABASE_URL=%s\n' "$DBURL" > "$ENVF"
unset DBURL TOKEN
# ── Launch detached docker container ──
cd /opt/vreaudigital/services/seap-scraper

# First run on a fresh checkout: install the node deps inside a throw-away
# container, as the invoking user so file ownership stays correct.
if [ ! -d node_modules/tsx ]; then
  log "Installing seap-scraper deps..."
  docker run --rm -v "$(pwd):/work" -w /work --user "$(id -u):$(id -g)" \
    node:22-alpine npm install --omit=optional 2>&1 | tee -a "$LOG" >/dev/null
fi

# Optional flags live in an array so an unset flag contributes no argument and
# nothing is word-split.
DRY_FLAG=()
if [ "$DRY_RUN" = "1" ]; then
  DRY_FLAG+=("--dry-run")
fi

# Secrets reach the container only through --env-file.
CID=$(docker run -d \
  --name vreaudigital-anaf-datornici \
  --network host \
  --env-file "$ENVF" \
  -v "$(pwd):/work" \
  -w /work \
  --user "$(id -u):$(id -g)" \
  --restart no \
  node:22-alpine \
  npx tsx src/scrape-anaf-datornici.ts \
  --source="$SOURCE" \
  "${DRY_FLAG[@]}")
log "container started: $CID"

# Give the container a moment, then delete the credentials file.
sleep 3
rm -f "$ENVF"
log "envfile cleaned"

# Block until the scraper finishes, then record its exit code and log tail.
docker wait vreaudigital-anaf-datornici >/dev/null
EXIT_CODE=$(docker inspect -f '{{.State.ExitCode}}' vreaudigital-anaf-datornici 2>/dev/null || echo "?")
docker logs vreaudigital-anaf-datornici 2>&1 | tail -15 | tee -a "$LOG"
log "=== ANAF datornici scrape done (exit=$EXIT_CODE) ==="

# `exit "?"` is a bash error ("numeric argument required"); map an unknown
# container status to a plain failure instead.
case "$EXIT_CODE" in
  '' | *[!0-9]*) exit 1 ;;
  *) exit "$EXIT_CODE" ;;
esac
+102
View File
@@ -0,0 +1,102 @@
#!/bin/bash
# ANAF lista albă — LIVE scraper wrapper (JCaptcha via 2captcha).
#
# Mirrors scrape-anaf-datornici-live.sh exactly. Difference is endpoint
# (/restante/listaalba.xhtml) and target table (anaf.lista_alba — 3 cols/row).
#
# Infisical Machine Identity → env-file (DATABASE_URL + TWOCAPTCHA_KEY) →
# docker run --env-file (NEVER -e $VAR), file deleted post-launch.
#
# Idempotent (UPSERT on cui+publication_date). Designed to be triggered
# quarterly by vreaudigital-anaf-lista-alba.timer (offset +1h vs datornici).
#
# Env knobs:
# DRY_RUN=1 — parse-only, zero spend, zero DB writes.
#
# Run:
# sudo /opt/vreaudigital/services/seap-scraper/cron/scrape-anaf-lista-alba.sh
# sudo DRY_RUN=1 /opt/vreaudigital/services/seap-scraper/cron/scrape-anaf-lista-alba.sh
set -euo pipefail

DRY_RUN="${DRY_RUN:-0}"
LOG=/var/log/vreaudigital-anaf-lista-alba.log

# Print a timestamped message to stdout and append it to $LOG.
log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG"; }

log "=== ANAF lista_alba LIVE scrape started (dry_run=$DRY_RUN) ==="

# Overlap guard: docker's name filter matches substrings, so grep pins the
# exact container name.
if docker ps --filter name=vreaudigital-anaf-lista-alba-live --format '{{.Names}}' \
  | grep -q '^vreaudigital-anaf-lista-alba-live$'; then
  log "WARN: vreaudigital-anaf-lista-alba-live already running, skipping this tick"
  exit 0
fi
# Remove a stopped leftover so --name below cannot collide.
docker rm -f vreaudigital-anaf-lista-alba-live 2>/dev/null || true

# ── Fetch DATABASE_URL + TWOCAPTCHA_KEY via Infisical Machine Identity ──
source /opt/vreaudigital/.infisical-mi
TOKEN=$(infisical login --method=universal-auth \
  --domain="$INFISICAL_API_URL" \
  --client-id="$INFISICAL_CLIENT_ID" \
  --client-secret="$INFISICAL_CLIENT_SECRET" \
  --silent --plain)

umask 077 # files created from here on are owner-only
ENVF=$(mktemp /tmp/.vreaudigital-anaf-lista-alba-live-env.XXXXXX)
# Remove the secret-bearing env-file on every exit path. Under `set -e` a
# failed mkdir or docker run would otherwise leave the secrets in /tmp.
trap 'rm -f -- "$ENVF"' EXIT
DBURL=$(infisical secrets get DATABASE_URL \
  --domain="$INFISICAL_API_URL" \
  --projectId="$INFISICAL_PROJECT_ID" \
  --env="$INFISICAL_ENV" --path="$INFISICAL_PATH" \
  --token="$TOKEN" --plain --silent)
echo "DATABASE_URL=$DBURL" > "$ENVF"
unset DBURL

# The captcha key is only fetched for real runs; a dry run never needs it.
if [ "$DRY_RUN" != "1" ]; then
  # Look in the service path first, then fall back to the project root.
  for try_path in "$INFISICAL_PATH" "/"; do
    TWOCAPTCHA_KEY=$(infisical secrets get TWOCAPTCHA_KEY \
      --domain="$INFISICAL_API_URL" \
      --projectId="$INFISICAL_PROJECT_ID" \
      --env="$INFISICAL_ENV" --path="$try_path" \
      --token="$TOKEN" --plain --silent 2>/dev/null || true)
    if [ -n "${TWOCAPTCHA_KEY:-}" ]; then
      break
    fi
  done
  if [ -z "${TWOCAPTCHA_KEY:-}" ]; then
    log "ERROR: TWOCAPTCHA_KEY missing in Infisical (checked $INFISICAL_PATH + /)"
    rm -f "$ENVF"
    exit 3
  fi
  echo "TWOCAPTCHA_KEY=$TWOCAPTCHA_KEY" >> "$ENVF"
  unset TWOCAPTCHA_KEY
fi
unset TOKEN

# Non-secret settings are passed to the container through the same env-file.
echo "DRY_RUN=$DRY_RUN" >> "$ENVF"
echo "ANAF_LISTA_ALBA_LOG=/work/.log/anaf-lista-alba.log" >> "$ENVF"

cd /opt/vreaudigital/services/seap-scraper
mkdir -p .log

# Python deps are installed at container start, then the scraper runs.
CID=$(docker run -d \
  --name vreaudigital-anaf-lista-alba-live \
  --network host \
  --env-file "$ENVF" \
  -v "$(pwd):/work" \
  -w /work \
  --user "$(id -u):$(id -g)" \
  --restart no \
  python:3.12-slim \
  bash -c "pip install --quiet --no-cache-dir psycopg2-binary requests && python3 scrapers/anaf_lista_alba/scraper.py")
log "container started: $CID"

# Early explicit removal keeps the secrets on disk as briefly as possible;
# the EXIT trap is only the safety net.
sleep 3
rm -f "$ENVF"
log "envfile cleaned"

docker wait vreaudigital-anaf-lista-alba-live >/dev/null
EXIT_CODE=$(docker inspect -f '{{.State.ExitCode}}' vreaudigital-anaf-lista-alba-live 2>/dev/null || echo "?")
docker logs vreaudigital-anaf-lista-alba-live 2>&1 | tail -30 | tee -a "$LOG"
log "=== ANAF lista_alba LIVE scrape done (exit=$EXIT_CODE) ==="

# If docker inspect failed, EXIT_CODE holds the "?" placeholder; `exit` needs
# a number, so report a generic failure instead.
case "$EXIT_CODE" in
  '' | *[!0-9]*) exit 1 ;;
esac
exit "$EXIT_CODE"
+86
View File
@@ -0,0 +1,86 @@
#!/bin/bash
# ANCOM — Autoritatea Națională pentru Administrare și Reglementare în
# Comunicații. Scrapes the public registry of authorized communications
# providers from ancom.ro.
#
# Mirrors scrape-anre.sh / scrape-bugetar.sh pattern: Infisical Machine
# Identity → env-file → docker run --env-file (NEVER -e $VAR), file deleted
# post-launch.
#
# Idempotent (UPSERT on ancom_id). Safe to run from cron.
#
# Env knobs:
# LIMIT=0 (default: 0 = full ~570 operators)
# MAX_PAGES=0 (default: 0 = all list pages)
#
# Run:
# sudo MAX_PAGES=2 /opt/vreaudigital/services/seap-scraper/cron/scrape-ancom.sh # smoke test (2 pages = 20 ids)
# sudo /opt/vreaudigital/services/seap-scraper/cron/scrape-ancom.sh # full
set -euo pipefail

# Tunables; a value is only forwarded to the scraper when it is > 0.
LIMIT="${LIMIT:-0}"
MAX_PAGES="${MAX_PAGES:-0}"
LOG=/var/log/vreaudigital-ancom.log

# Print a timestamped message to stdout and append it to $LOG.
log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG"; }

log "=== ANCOM scrape started (limit=$LIMIT max_pages=$MAX_PAGES) ==="

# Overlap guard: docker's name filter matches substrings, so grep pins the
# exact container name.
if docker ps --filter name=vreaudigital-ancom --format '{{.Names}}' | grep -q '^vreaudigital-ancom$'; then
  log "WARN: vreaudigital-ancom already running, skipping this tick"
  exit 0
fi
# Remove a stopped leftover so --name below cannot collide.
docker rm -f vreaudigital-ancom 2>/dev/null || true

# ── Fetch DATABASE_URL via Infisical Machine Identity ──
source /opt/vreaudigital/.infisical-mi
TOKEN=$(infisical login --method=universal-auth \
  --domain="$INFISICAL_API_URL" \
  --client-id="$INFISICAL_CLIENT_ID" \
  --client-secret="$INFISICAL_CLIENT_SECRET" \
  --silent --plain)

umask 077 # files created from here on are owner-only
ENVF=$(mktemp /tmp/.vreaudigital-ancom-env.XXXXXX)
# Remove the secret-bearing env-file on every exit path. Under `set -e` a
# failed npm install or docker run would otherwise leave DATABASE_URL in /tmp.
trap 'rm -f -- "$ENVF"' EXIT
DBURL=$(infisical secrets get DATABASE_URL \
  --domain="$INFISICAL_API_URL" \
  --projectId="$INFISICAL_PROJECT_ID" \
  --env="$INFISICAL_ENV" --path="$INFISICAL_PATH" \
  --token="$TOKEN" --plain --silent)
echo "DATABASE_URL=$DBURL" > "$ENVF"
unset DBURL TOKEN

cd /opt/vreaudigital/services/seap-scraper
# Install deps in a throwaway container when tsx is not present yet.
if [ ! -d node_modules/tsx ]; then
  log "Installing seap-scraper deps..."
  docker run --rm -v "$(pwd):/work" -w /work --user "$(id -u):$(id -g)" \
    node:22-alpine npm install --omit=optional 2>&1 | tee -a "$LOG" >/dev/null
fi

# Build the container command as an array so each option stays one argv word.
# A non-numeric knob makes `[ -gt ]` fail (stderr silenced) and is ignored.
SCRAPER_CMD=(npx tsx src/scrape-ancom.ts)
if [ "$LIMIT" -gt 0 ] 2>/dev/null; then
  SCRAPER_CMD+=("--limit=$LIMIT")
fi
if [ "$MAX_PAGES" -gt 0 ] 2>/dev/null; then
  SCRAPER_CMD+=("--max-pages=$MAX_PAGES")
fi

CID=$(docker run -d \
  --name vreaudigital-ancom \
  --network host \
  --env-file "$ENVF" \
  -v "$(pwd):/work" \
  -w /work \
  --user "$(id -u):$(id -g)" \
  --restart no \
  node:22-alpine \
  "${SCRAPER_CMD[@]}")
log "container started: $CID"

# Early explicit removal keeps the secret on disk as briefly as possible;
# the EXIT trap is only the safety net.
sleep 3
rm -f "$ENVF"
log "envfile cleaned"

docker wait vreaudigital-ancom >/dev/null
EXIT_CODE=$(docker inspect -f '{{.State.ExitCode}}' vreaudigital-ancom 2>/dev/null || echo "?")
docker logs vreaudigital-ancom 2>&1 | tail -30 | tee -a "$LOG"
log "=== ANCOM scrape done (exit=$EXIT_CODE) ==="

# If docker inspect failed, EXIT_CODE holds the "?" placeholder; `exit` needs
# a number, so report a generic failure instead.
case "$EXIT_CODE" in
  '' | *[!0-9]*) exit 1 ;;
esac
exit "$EXIT_CODE"
+89
View File
@@ -0,0 +1,89 @@
#!/bin/bash
# ANRE — Autoritatea Națională de Reglementare în domeniul Energiei.
# Scrapes 4 public registries from portal.anre.ro/PublicLists:
# electricitate (~5K), gaze (~350), atestat (~10K), electricieni (~100K).
#
# Mirrors scrape-regas.sh / scrape-bugetar.sh pattern: Infisical Machine
# Identity → env-file → docker run --env-file (NEVER -e $VAR), file deleted
# post-launch.
#
# Idempotent (UPSERT on sha1 PK / UNIQUE(nr_autorizare,nume_prenume)).
# Safe to run from cron.
#
# Env knobs:
# SOURCE=all|electricitate|gaze|atestat|electricieni (default: all)
# LIMIT=0 (default: 0 = full)
#
# Run:
# sudo SOURCE=electricitate LIMIT=100 /opt/vreaudigital/services/seap-scraper/cron/scrape-anre.sh
# sudo /opt/vreaudigital/services/seap-scraper/cron/scrape-anre.sh # full all sources
set -euo pipefail

SOURCE="${SOURCE:-all}"
# Only forwarded to the scraper when > 0.
LIMIT="${LIMIT:-0}"
LOG=/var/log/vreaudigital-anre.log

# Print a timestamped message to stdout and append it to $LOG.
log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG"; }

log "=== ANRE scrape started (source=$SOURCE limit=$LIMIT) ==="

# Overlap guard: docker's name filter matches substrings, so grep pins the
# exact container name.
if docker ps --filter name=vreaudigital-anre --format '{{.Names}}' | grep -q '^vreaudigital-anre$'; then
  log "WARN: vreaudigital-anre already running, skipping this tick"
  exit 0
fi
# Remove a stopped leftover so --name below cannot collide.
docker rm -f vreaudigital-anre 2>/dev/null || true

# ── Fetch DATABASE_URL via Infisical Machine Identity ──
source /opt/vreaudigital/.infisical-mi
TOKEN=$(infisical login --method=universal-auth \
  --domain="$INFISICAL_API_URL" \
  --client-id="$INFISICAL_CLIENT_ID" \
  --client-secret="$INFISICAL_CLIENT_SECRET" \
  --silent --plain)

umask 077 # files created from here on are owner-only
ENVF=$(mktemp /tmp/.vreaudigital-anre-env.XXXXXX)
# Remove the secret-bearing env-file on every exit path. Under `set -e` a
# failed npm install or docker run would otherwise leave DATABASE_URL in /tmp.
trap 'rm -f -- "$ENVF"' EXIT
DBURL=$(infisical secrets get DATABASE_URL \
  --domain="$INFISICAL_API_URL" \
  --projectId="$INFISICAL_PROJECT_ID" \
  --env="$INFISICAL_ENV" --path="$INFISICAL_PATH" \
  --token="$TOKEN" --plain --silent)
echo "DATABASE_URL=$DBURL" > "$ENVF"
# ANRE portal uses an intermediate CA cert chain that node's bundle doesn't trust.
# Cert is valid (verified OOB via Microsoft-IIS handshake), bypass for this scraper.
# NOTE(review): this switch disables certificate verification for every TLS
# connection the node process makes, not only the ANRE portal. Supplying the
# missing intermediate via NODE_EXTRA_CA_CERTS would keep verification on.
echo "NODE_TLS_REJECT_UNAUTHORIZED=0" >> "$ENVF"
unset DBURL TOKEN

cd /opt/vreaudigital/services/seap-scraper
# Install deps in a throwaway container when tsx is not present yet.
if [ ! -d node_modules/tsx ]; then
  log "Installing seap-scraper deps..."
  docker run --rm -v "$(pwd):/work" -w /work --user "$(id -u):$(id -g)" \
    node:22-alpine npm install --omit=optional 2>&1 | tee -a "$LOG" >/dev/null
fi

# Build the container command as an array so each option stays one argv word
# (the previous unquoted string expansion word-split --source=$SOURCE).
# A non-numeric LIMIT makes `[ -gt ]` fail (stderr silenced) and is ignored.
SCRAPER_CMD=(npx tsx src/scrape-anre.ts "--source=$SOURCE")
if [ "$LIMIT" -gt 0 ] 2>/dev/null; then
  SCRAPER_CMD+=("--limit=$LIMIT")
fi

CID=$(docker run -d \
  --name vreaudigital-anre \
  --network host \
  --env-file "$ENVF" \
  -v "$(pwd):/work" \
  -w /work \
  --user "$(id -u):$(id -g)" \
  --restart no \
  node:22-alpine \
  "${SCRAPER_CMD[@]}")
log "container started: $CID"

# Early explicit removal keeps the secret on disk as briefly as possible;
# the EXIT trap is only the safety net.
sleep 3
rm -f "$ENVF"
log "envfile cleaned"

docker wait vreaudigital-anre >/dev/null
EXIT_CODE=$(docker inspect -f '{{.State.ExitCode}}' vreaudigital-anre 2>/dev/null || echo "?")
docker logs vreaudigital-anre 2>&1 | tail -25 | tee -a "$LOG"
log "=== ANRE scrape done (exit=$EXIT_CODE) ==="

# If docker inspect failed, EXIT_CODE holds the "?" placeholder; `exit` needs
# a number, so report a generic failure instead.
case "$EXIT_CODE" in
  '' | *[!0-9]*) exit 1 ;;
esac
exit "$EXIT_CODE"
+86
View File
@@ -0,0 +1,86 @@
#!/bin/bash
# ASF — Autoritatea de Supraveghere Financiară.
# Scrapes the public registry of authorized financial entities (insurers,
# brokers, etc.) from data.asfromania.ro/scr/ra. ~860 entities.
#
# Mirrors scrape-anre.sh pattern: Infisical Machine Identity → env-file →
# docker run --env-file (NEVER -e $VAR), file deleted post-launch.
#
# Idempotent (UPSERT on UNIQUE(register_type, register_no)).
# Safe to run from cron.
#
# Env knobs:
# LIMIT=0 (default: 0 = full)
# NO_GAPFILL=0 (default: 0 = run gapfill; set 1 to skip)
#
# Run:
# sudo LIMIT=20 /opt/vreaudigital/services/seap-scraper/cron/scrape-asf.sh # smoke
# sudo /opt/vreaudigital/services/seap-scraper/cron/scrape-asf.sh # full
set -euo pipefail

# LIMIT is only forwarded when > 0; NO_GAPFILL=1 adds --no-gapfill.
LIMIT="${LIMIT:-0}"
NO_GAPFILL="${NO_GAPFILL:-0}"
LOG=/var/log/vreaudigital-asf.log

# Print a timestamped message to stdout and append it to $LOG.
log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG"; }

log "=== ASF scrape started (limit=$LIMIT no_gapfill=$NO_GAPFILL) ==="

# Overlap guard: docker's name filter matches substrings, so grep pins the
# exact container name.
if docker ps --filter name=vreaudigital-asf --format '{{.Names}}' | grep -q '^vreaudigital-asf$'; then
  log "WARN: vreaudigital-asf already running, skipping this tick"
  exit 0
fi
# Remove a stopped leftover so --name below cannot collide.
docker rm -f vreaudigital-asf 2>/dev/null || true

# ── Fetch DATABASE_URL via Infisical Machine Identity ──
source /opt/vreaudigital/.infisical-mi
TOKEN=$(infisical login --method=universal-auth \
  --domain="$INFISICAL_API_URL" \
  --client-id="$INFISICAL_CLIENT_ID" \
  --client-secret="$INFISICAL_CLIENT_SECRET" \
  --silent --plain)

umask 077 # files created from here on are owner-only
ENVF=$(mktemp /tmp/.vreaudigital-asf-env.XXXXXX)
# Remove the secret-bearing env-file on every exit path. Under `set -e` a
# failed npm install or docker run would otherwise leave DATABASE_URL in /tmp.
trap 'rm -f -- "$ENVF"' EXIT
DBURL=$(infisical secrets get DATABASE_URL \
  --domain="$INFISICAL_API_URL" \
  --projectId="$INFISICAL_PROJECT_ID" \
  --env="$INFISICAL_ENV" --path="$INFISICAL_PATH" \
  --token="$TOKEN" --plain --silent)
echo "DATABASE_URL=$DBURL" > "$ENVF"
unset DBURL TOKEN

cd /opt/vreaudigital/services/seap-scraper
# Install deps in a throwaway container when tsx is not present yet.
if [ ! -d node_modules/tsx ]; then
  log "Installing seap-scraper deps..."
  docker run --rm -v "$(pwd):/work" -w /work --user "$(id -u):$(id -g)" \
    node:22-alpine npm install --omit=optional 2>&1 | tee -a "$LOG" >/dev/null
fi

# Build the container command as an array so each option stays one argv word.
# A non-numeric LIMIT makes `[ -gt ]` fail (stderr silenced) and is ignored.
SCRAPER_CMD=(npx tsx src/scrape-asf.ts)
if [ "$LIMIT" -gt 0 ] 2>/dev/null; then
  SCRAPER_CMD+=("--limit=$LIMIT")
fi
if [ "$NO_GAPFILL" = "1" ]; then
  SCRAPER_CMD+=(--no-gapfill)
fi

CID=$(docker run -d \
  --name vreaudigital-asf \
  --network host \
  --env-file "$ENVF" \
  -v "$(pwd):/work" \
  -w /work \
  --user "$(id -u):$(id -g)" \
  --restart no \
  node:22-alpine \
  "${SCRAPER_CMD[@]}")
log "container started: $CID"

# Early explicit removal keeps the secret on disk as briefly as possible;
# the EXIT trap is only the safety net.
sleep 3
rm -f "$ENVF"
log "envfile cleaned"

docker wait vreaudigital-asf >/dev/null
EXIT_CODE=$(docker inspect -f '{{.State.ExitCode}}' vreaudigital-asf 2>/dev/null || echo "?")
docker logs vreaudigital-asf 2>&1 | tail -40 | tee -a "$LOG"
log "=== ASF scrape done (exit=$EXIT_CODE) ==="

# If docker inspect failed, EXIT_CODE holds the "?" placeholder; `exit` needs
# a number, so report a generic failure instead.
case "$EXIT_CODE" in
  '' | *[!0-9]*) exit 1 ;;
esac
exit "$EXIT_CODE"
+115
View File
@@ -0,0 +1,115 @@
#!/bin/bash
# MFP Budget Transparency scraper — Phase 1: enumerate the universe of
# reporting public entities + fuzzy-match name → CUI.
#
# Phase 2 (downloading the XML reports) is not implemented: the MFP application
# requires a CAPTCHA on every search, which needs an external captcha solver
# (2captcha / anti-captcha) and a budget for ~1.6M requests (4-8K USD for a
# complete 2020-2025 historical ingest). See BUGETAR-PLAN.md for details.
#
# Modes:
#   MODE=enumerate (default) → enumerate (sector × county) → bugetar.entitate
#   MODE=match-cui           → fuzzy match name → firms.entities.cui_normalized
#   MODE=full                → enumerate + match-cui in a single run
#
# Idempotent. Safe to run repeatedly (UPSERT).
set -euo pipefail

MODE="${MODE:-enumerate}"
JUDET="${JUDET:-}"
SECTOR="${SECTOR:-}"
DELAY_MS="${DELAY_MS:-500}"
LOG=/var/log/vreaudigital-bugetar.log

# Print a timestamped message to stdout and append it to $LOG.
log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG"; }

log "=== bugetar scraper started (mode=${MODE} judet=${JUDET:-ALL} sector=${SECTOR:-ALL}) ==="

# Reject a bad MODE before contacting Infisical or touching docker.
case "$MODE" in
  enumerate | match-cui | full) ;;
  *)
    log "ERROR: unknown MODE=$MODE (use enumerate|match-cui|full)"
    exit 2
    ;;
esac

# Guard: previous run still going? Containers are named
# vreaudigital-bugetar-<mode>, so the guard must match those names (matching
# the bare "vreaudigital-bugetar" never fired).
if docker ps --filter name=vreaudigital-bugetar --format '{{.Names}}' \
  | grep -Eq '^vreaudigital-bugetar-(enumerate|match-cui)$'; then
  log "WARN: vreaudigital-bugetar already running, skipping"
  exit 0
fi
# Remove stopped leftovers of either mode so --name below cannot collide.
docker rm -f vreaudigital-bugetar-enumerate vreaudigital-bugetar-match-cui 2>/dev/null || true

# ── Fetch DATABASE_URL via Infisical Machine Identity ──
source /opt/vreaudigital/.infisical-mi
TOKEN=$(infisical login --method=universal-auth \
  --domain="$INFISICAL_API_URL" \
  --client-id="$INFISICAL_CLIENT_ID" \
  --client-secret="$INFISICAL_CLIENT_SECRET" \
  --silent --plain)

umask 077 # files created from here on are owner-only
ENVF=$(mktemp /tmp/.vreaudigital-bugetar-env.XXXXXX)
# Remove the secret-bearing env-file on every exit path, including `set -e`
# aborts before the explicit cleanup at the end of the script.
trap 'rm -f -- "$ENVF"' EXIT
DBURL=$(infisical secrets get DATABASE_URL \
  --domain="$INFISICAL_API_URL" \
  --projectId="$INFISICAL_PROJECT_ID" \
  --env="$INFISICAL_ENV" --path="$INFISICAL_PATH" \
  --token="$TOKEN" --plain --silent)
echo "DATABASE_URL=$DBURL" > "$ENVF"
unset DBURL TOKEN

cd /opt/vreaudigital/services/seap-scraper

# Make sure node_modules exists.
if [ ! -d node_modules/tsx ]; then
  log "Installing seap-scraper deps..."
  docker run --rm -v "$(pwd):/work" -w /work --user "$(id -u):$(id -g)" \
    node:22-alpine npm install --omit=optional 2>&1 | tee -a "$LOG" >/dev/null
fi

#######################################
# Run one scraper mode in a detached container and wait for it to finish.
# Globals:   JUDET, SECTOR, DELAY_MS, ENVF, LOG (read)
# Arguments: $1 - mode passed to scrape-bugetar.ts (enumerate | match-cui)
# Outputs:   progress lines and the last 10 container log lines via log/tee
# Returns:   the container's exit code; 1 when the container could not be
#            started or its exit code could not be read
#######################################
run_scraper_mode() {
  local mode="$1"
  local name="vreaudigital-bugetar-$mode"
  local cid rc
  # Array keeps each option a single argv word even if JUDET/SECTOR has spaces.
  local -a args=("--mode=$mode")

  if [ -n "$JUDET" ]; then args+=("--judet=$JUDET"); fi
  if [ -n "$SECTOR" ]; then args+=("--sector=$SECTOR"); fi
  if [ "$mode" = "enumerate" ]; then args+=("--delay-ms=$DELAY_MS"); fi

  log "running mode=$mode args=${args[*]}"

  # Callers invoke this function on the left of `||`, which suspends `set -e`
  # inside it, so a failed start has to be handled explicitly.
  if ! cid=$(docker run -d \
    --name "$name" \
    --network host \
    --env-file "$ENVF" \
    -v "$(pwd):/work" \
    -w /work \
    --user "$(id -u):$(id -g)" \
    --restart no \
    node:22-alpine \
    npx tsx src/scrape-bugetar.ts "${args[@]}"); then
    log "ERROR: failed to start $name"
    return 1
  fi
  log " container: $cid"

  sleep 3 # give the daemon time to have read the env-file
  docker wait "$name" >/dev/null
  rc=$(docker inspect -f '{{.State.ExitCode}}' "$name" 2>/dev/null || echo "?")
  docker logs "$name" 2>&1 | tail -10 | tee -a "$LOG"
  docker rm -f "$name" >/dev/null 2>&1 || true

  # `return` needs a number; the "?" placeholder means inspect failed.
  case "$rc" in
    '' | *[!0-9]*) rc=1 ;;
  esac
  return "$rc"
}

EXIT_CODE=0
case "$MODE" in
  enumerate)
    run_scraper_mode enumerate || EXIT_CODE=$?
    ;;
  match-cui)
    run_scraper_mode match-cui || EXIT_CODE=$?
    ;;
  full)
    # match-cui only runs when enumeration succeeded.
    run_scraper_mode enumerate || EXIT_CODE=$?
    if [ "$EXIT_CODE" -eq 0 ]; then
      run_scraper_mode match-cui || EXIT_CODE=$?
    fi
    ;;
esac

rm -f "$ENVF"
log "envfile cleaned"
log "=== bugetar scraper done (exit=$EXIT_CODE) ==="
exit "$EXIT_CODE"
+96
View File
@@ -0,0 +1,96 @@
#!/bin/bash
# CNAS — Casa Națională de Asigurări de Sănătate.
# Scrapes the central WP media library at cnas.ro/wp-content/uploads/ for
# furnizori-de-servicii-medicale PDFs (~70-90 active docs as of 2026-05).
# Per-county Angular SPA at cas.cnas.ro/casXX is currently empty (handoff
# documented in CNAS-PLAN.md).
#
# Mirrors scrape-anre.sh / scrape-regas.sh pattern: Infisical Machine Identity
# → env-file → docker run --env-file (NEVER -e $VAR), file deleted post-launch.
# Container has poppler-utils installed for pdftotext.
#
# Idempotent. Safe to run from cron weekly (CNAS uploads ~5-15 files/month).
#
# Env knobs:
# LIMIT=0 (default: 0 = all matched files)
# MODE=full (full | metadata-only | parse-only)
#
# Run:
# sudo /opt/vreaudigital/services/seap-scraper/cron/scrape-cnas.sh # full
# sudo LIMIT=5 /opt/vreaudigital/services/seap-scraper/cron/scrape-cnas.sh # smoke test
# sudo MODE=metadata-only /opt/vreaudigital/services/seap-scraper/cron/scrape-cnas.sh # list-only
set -euo pipefail

LIMIT="${LIMIT:-0}"
MODE="${MODE:-full}"
LOG=/var/log/vreaudigital-cnas.log

# Print a timestamped message to stdout and append it to $LOG.
log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG"; }

log "=== CNAS scrape started (limit=$LIMIT mode=$MODE) ==="

# Overlap guard: docker's name filter matches substrings, so grep pins the
# exact container name.
if docker ps --filter name=vreaudigital-cnas --format '{{.Names}}' | grep -q '^vreaudigital-cnas$'; then
  log "WARN: vreaudigital-cnas already running, skipping this tick"
  exit 0
fi
# Remove a stopped leftover so --name below cannot collide.
docker rm -f vreaudigital-cnas 2>/dev/null || true

# Validate the knobs BEFORE any secret is fetched: previously an unknown MODE
# exited after the env-file had been written, leaving DATABASE_URL in /tmp.
# EXTRA_ARGS is spliced into an `sh -c` string below; that is safe because
# LIMIT is only added after passing the numeric test and MODE is whitelisted.
EXTRA_ARGS=""
if [ "$LIMIT" -gt 0 ] 2>/dev/null; then
  EXTRA_ARGS="$EXTRA_ARGS --limit=$LIMIT"
fi
case "$MODE" in
  metadata-only) EXTRA_ARGS="$EXTRA_ARGS --metadata-only" ;;
  parse-only) EXTRA_ARGS="$EXTRA_ARGS --parse-only" ;;
  full) ;;
  *)
    log "ERROR: unknown MODE=$MODE (full|metadata-only|parse-only)"
    exit 1
    ;;
esac

# ── Fetch DATABASE_URL via Infisical Machine Identity ──
source /opt/vreaudigital/.infisical-mi
TOKEN=$(infisical login --method=universal-auth \
  --domain="$INFISICAL_API_URL" \
  --client-id="$INFISICAL_CLIENT_ID" \
  --client-secret="$INFISICAL_CLIENT_SECRET" \
  --silent --plain)

umask 077 # files created from here on are owner-only
ENVF=$(mktemp /tmp/.vreaudigital-cnas-env.XXXXXX)
# Remove the secret-bearing env-file on every exit path. Under `set -e` a
# failed npm install or docker run would otherwise leave DATABASE_URL in /tmp.
trap 'rm -f -- "$ENVF"' EXIT
DBURL=$(infisical secrets get DATABASE_URL \
  --domain="$INFISICAL_API_URL" \
  --projectId="$INFISICAL_PROJECT_ID" \
  --env="$INFISICAL_ENV" --path="$INFISICAL_PATH" \
  --token="$TOKEN" --plain --silent)
echo "DATABASE_URL=$DBURL" > "$ENVF"
unset DBURL TOKEN

cd /opt/vreaudigital/services/seap-scraper
# Install deps in a throwaway container when tsx is not present yet.
if [ ! -d node_modules/tsx ]; then
  log "Installing seap-scraper deps..."
  docker run --rm -v "$(pwd):/work" -w /work --user "$(id -u):$(id -g)" \
    node:22-alpine npm install --omit=optional 2>&1 | tee -a "$LOG" >/dev/null
fi

# Note: poppler-utils is installed at container start for pdftotext + pdfinfo.
# Using sh -c so we can chain apk add + npx tsx in a single command.
# This container runs as 0:0, unlike the sibling scrapers — presumably because
# `apk add` needs root (confirm before changing).
CID=$(docker run -d \
  --name vreaudigital-cnas \
  --network host \
  --env-file "$ENVF" \
  -v "$(pwd):/work" \
  -w /work \
  --user 0:0 \
  --restart no \
  node:22-alpine \
  sh -c "apk add --no-cache poppler-utils >/dev/null && npx tsx src/scrape-cnas.ts $EXTRA_ARGS")
log "container started: $CID"

# Early explicit removal keeps the secret on disk as briefly as possible;
# the EXIT trap is only the safety net.
sleep 3
rm -f "$ENVF"
log "envfile cleaned"

docker wait vreaudigital-cnas >/dev/null
EXIT_CODE=$(docker inspect -f '{{.State.ExitCode}}' vreaudigital-cnas 2>/dev/null || echo "?")
docker logs vreaudigital-cnas 2>&1 | tail -50 | tee -a "$LOG"
log "=== CNAS scrape done (exit=$EXIT_CODE) ==="

# If docker inspect failed, EXIT_CODE holds the "?" placeholder; `exit` needs
# a number, so report a generic failure instead.
case "$EXIT_CODE" in
  '' | *[!0-9]*) exit 1 ;;
esac
exit "$EXIT_CODE"
+85
View File
@@ -0,0 +1,85 @@
#!/bin/bash
# CNSC — Consiliul Național de Soluționare a Contestațiilor.
# Walks portal.cnsc.ro/decizii.html (~30K decisions across ~617 pages of 50).
#
# Mirrors scrape-anre.sh / scrape-aaas.sh pattern: Infisical Machine Identity
# → env-file → docker run --env-file (NEVER -e $VAR), file deleted post-launch.
#
# Idempotent: ON CONFLICT (decision_no, decision_year) DO UPDATE.
# Safe to run from cron daily — only newly-published decisions are inserted,
# the rest are no-op updates of fetched_at.
#
# Env knobs:
# START_PAGE=1 (default 1; set higher to resume after partial run)
# MAX_PAGES=0 (default 0 = until totalPages; smaller for smoke test)
#
# Run:
# sudo MAX_PAGES=2 /opt/vreaudigital/services/seap-scraper/cron/scrape-cnsc.sh
# sudo /opt/vreaudigital/services/seap-scraper/cron/scrape-cnsc.sh
set -euo pipefail

START_PAGE="${START_PAGE:-1}"
# Only forwarded to the scraper when > 0.
MAX_PAGES="${MAX_PAGES:-0}"
LOG=/var/log/vreaudigital-cnsc.log

# Print a timestamped message to stdout and append it to $LOG.
log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG"; }

log "=== CNSC scrape started (start_page=$START_PAGE max_pages=$MAX_PAGES) ==="

# Overlap guard: docker's name filter matches substrings, so grep pins the
# exact container name.
if docker ps --filter name=vreaudigital-cnsc --format '{{.Names}}' | grep -q '^vreaudigital-cnsc$'; then
  log "WARN: vreaudigital-cnsc already running, skipping this tick"
  exit 0
fi
# Remove a stopped leftover so --name below cannot collide.
docker rm -f vreaudigital-cnsc 2>/dev/null || true

# ── Fetch DATABASE_URL via Infisical Machine Identity ──
source /opt/vreaudigital/.infisical-mi
TOKEN=$(infisical login --method=universal-auth \
  --domain="$INFISICAL_API_URL" \
  --client-id="$INFISICAL_CLIENT_ID" \
  --client-secret="$INFISICAL_CLIENT_SECRET" \
  --silent --plain)

umask 077 # files created from here on are owner-only
ENVF=$(mktemp /tmp/.vreaudigital-cnsc-env.XXXXXX)
# Remove the secret-bearing env-file on every exit path. Under `set -e` a
# failed npm install or docker run would otherwise leave DATABASE_URL in /tmp.
trap 'rm -f -- "$ENVF"' EXIT
DBURL=$(infisical secrets get DATABASE_URL \
  --domain="$INFISICAL_API_URL" \
  --projectId="$INFISICAL_PROJECT_ID" \
  --env="$INFISICAL_ENV" --path="$INFISICAL_PATH" \
  --token="$TOKEN" --plain --silent)
echo "DATABASE_URL=$DBURL" > "$ENVF"
unset DBURL TOKEN

cd /opt/vreaudigital/services/seap-scraper
# Install deps in a throwaway container when tsx is not present yet.
if [ ! -d node_modules/tsx ]; then
  log "Installing seap-scraper deps..."
  docker run --rm -v "$(pwd):/work" -w /work --user "$(id -u):$(id -g)" \
    node:22-alpine npm install --omit=optional 2>&1 | tee -a "$LOG" >/dev/null
fi

# Build the container command as an array so each option stays one argv word.
# A non-numeric MAX_PAGES makes `[ -gt ]` fail (stderr silenced) and is ignored.
SCRAPER_CMD=(npx tsx src/scrape-cnsc.ts "--start-page=$START_PAGE")
if [ "$MAX_PAGES" -gt 0 ] 2>/dev/null; then
  SCRAPER_CMD+=("--max-pages=$MAX_PAGES")
fi

CID=$(docker run -d \
  --name vreaudigital-cnsc \
  --network host \
  --env-file "$ENVF" \
  -v "$(pwd):/work" \
  -w /work \
  --user "$(id -u):$(id -g)" \
  --restart no \
  node:22-alpine \
  "${SCRAPER_CMD[@]}")
log "container started: $CID"

# Early explicit removal keeps the secret on disk as briefly as possible;
# the EXIT trap is only the safety net.
sleep 3
rm -f "$ENVF"
log "envfile cleaned"

docker wait vreaudigital-cnsc >/dev/null
EXIT_CODE=$(docker inspect -f '{{.State.ExitCode}}' vreaudigital-cnsc 2>/dev/null || echo "?")
docker logs vreaudigital-cnsc 2>&1 | tail -25 | tee -a "$LOG"
log "=== CNSC scrape done (exit=$EXIT_CODE) ==="

# If docker inspect failed, EXIT_CODE holds the "?" placeholder; `exit` needs
# a number, so report a generic failure instead.
case "$EXIT_CODE" in
  '' | *[!0-9]*) exit 1 ;;
esac
exit "$EXIT_CODE"
+93
View File
@@ -0,0 +1,93 @@
#!/bin/bash
# Curtea de Conturi — Stage 1: listing-page metadata harvest.
#
# Mirrors scrape-anre.sh / scrape-bugetar.sh pattern: Infisical Machine
# Identity → env-file → docker run --env-file (NEVER -e $VAR), file deleted
# post-launch.
#
# Idempotent (UPSERT on slug_id PK = sha1(category|slug)).
# Safe to run from cron — recommend weekly (new audits drip in slowly).
#
# Stage 2 (PDF parse + CUI fuzzy match) is a separate scraper, see
# services/seap-scraper/CURTEACONT-PLAN.md.
#
# Env knobs:
# SOURCE=all|financiar|conformitate|performanta (default: all)
# LIMIT=0 (default: 0 = full)
# START_PAGE=1 (default: 1)
#
# Run:
# sudo SOURCE=financiar LIMIT=500 /opt/vreaudigital/services/seap-scraper/cron/scrape-curteacont.sh
# sudo /opt/vreaudigital/services/seap-scraper/cron/scrape-curteacont.sh # full all sources
set -euo pipefail

SOURCE="${SOURCE:-all}"
# Only forwarded to the scraper when > 0.
LIMIT="${LIMIT:-0}"
START_PAGE="${START_PAGE:-1}"
LOG=/var/log/vreaudigital-curteacont.log

# Print a timestamped message to stdout and append it to $LOG.
log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG"; }

log "=== curteacont scrape started (source=$SOURCE limit=$LIMIT start=$START_PAGE) ==="

# Overlap guard: docker's name filter matches substrings, so grep pins the
# exact container name.
if docker ps --filter name=vreaudigital-curteacont --format '{{.Names}}' | grep -q '^vreaudigital-curteacont$'; then
  log "WARN: vreaudigital-curteacont already running, skipping this tick"
  exit 0
fi
# Remove a stopped leftover so --name below cannot collide.
docker rm -f vreaudigital-curteacont 2>/dev/null || true

# ── Fetch DATABASE_URL via Infisical Machine Identity ──
source /opt/vreaudigital/.infisical-mi
TOKEN=$(infisical login --method=universal-auth \
  --domain="$INFISICAL_API_URL" \
  --client-id="$INFISICAL_CLIENT_ID" \
  --client-secret="$INFISICAL_CLIENT_SECRET" \
  --silent --plain)

umask 077 # files created from here on are owner-only
ENVF=$(mktemp /tmp/.vreaudigital-curteacont-env.XXXXXX)
# Remove the secret-bearing env-file on every exit path. Under `set -e` a
# failed npm install or docker run would otherwise leave DATABASE_URL in /tmp.
trap 'rm -f -- "$ENVF"' EXIT
DBURL=$(infisical secrets get DATABASE_URL \
  --domain="$INFISICAL_API_URL" \
  --projectId="$INFISICAL_PROJECT_ID" \
  --env="$INFISICAL_ENV" --path="$INFISICAL_PATH" \
  --token="$TOKEN" --plain --silent)
echo "DATABASE_URL=$DBURL" > "$ENVF"
# curteadeconturi.ro serves an intermediate CA chain that node's bundle doesn't
# trust by default. Cert is valid OOB; bypass for this scraper. (Same workaround
# we use for ANRE.)
# NOTE(review): this switch disables certificate verification for every TLS
# connection the node process makes, not only curteadeconturi.ro. Supplying
# the missing intermediate via NODE_EXTRA_CA_CERTS would keep verification on.
echo "NODE_TLS_REJECT_UNAUTHORIZED=0" >> "$ENVF"
unset DBURL TOKEN

cd /opt/vreaudigital/services/seap-scraper
# Install deps in a throwaway container when tsx is not present yet.
if [ ! -d node_modules/tsx ]; then
  log "Installing seap-scraper deps..."
  docker run --rm -v "$(pwd):/work" -w /work --user "$(id -u):$(id -g)" \
    node:22-alpine npm install --omit=optional 2>&1 | tee -a "$LOG" >/dev/null
fi

# Build the container command as an array so each option stays one argv word
# (the previous unquoted string expansion word-split the option values).
# A non-numeric LIMIT makes `[ -gt ]` fail (stderr silenced) and is ignored.
SCRAPER_CMD=(npx tsx src/scrape-curteacont.ts "--source=$SOURCE" "--start-page=$START_PAGE")
if [ "$LIMIT" -gt 0 ] 2>/dev/null; then
  SCRAPER_CMD+=("--limit=$LIMIT")
fi

CID=$(docker run -d \
  --name vreaudigital-curteacont \
  --network host \
  --env-file "$ENVF" \
  -v "$(pwd):/work" \
  -w /work \
  --user "$(id -u):$(id -g)" \
  --restart no \
  node:22-alpine \
  "${SCRAPER_CMD[@]}")
log "container started: $CID"

# Early explicit removal keeps the secret on disk as briefly as possible;
# the EXIT trap is only the safety net.
sleep 3
rm -f "$ENVF"
log "envfile cleaned"

docker wait vreaudigital-curteacont >/dev/null
EXIT_CODE=$(docker inspect -f '{{.State.ExitCode}}' vreaudigital-curteacont 2>/dev/null || echo "?")
docker logs vreaudigital-curteacont 2>&1 | tail -50 | tee -a "$LOG"
log "=== curteacont scrape done (exit=$EXIT_CODE) ==="

# If docker inspect failed, EXIT_CODE holds the "?" placeholder; `exit` needs
# a number, so report a generic failure instead.
case "$EXIT_CODE" in
  '' | *[!0-9]*) exit 1 ;;
esac
exit "$EXIT_CODE"
+81
View File
@@ -0,0 +1,81 @@
#!/bin/bash
# SEAP Achiziții Directe (DA) — daily/weekly backfill of e-licitatie.ro DA notices.
#
# The DA endpoint is rate-limited and large (~500K rows already + ~8M historical
# 2017-2024 pending). The scraper itself is idempotent and resumable via
# `seap.sync_state[source='da']`:
# - reads last_date, requests notices > last_date
# - upserts on natural key, updates sync_state to latest fetched
#
# Mirrors scrape-anre.sh / scrape-bugetar.sh pattern. Reads DATABASE_URL via
# Infisical MI, writes envfile, docker-run with --env-file, deletes file.
#
# Env knobs:
# MODE=da | backfill (default: da; backfill = last 6 months ignoring sync_state)
#
# Run:
# sudo /opt/vreaudigital/services/seap-scraper/cron/scrape-da.sh
# sudo MODE=backfill /opt/vreaudigital/services/seap-scraper/cron/scrape-da.sh
set -euo pipefail

# Forwarded verbatim to src/index.ts as --mode=<value>.
MODE="${MODE:-da}"
LOG=/var/log/vreaudigital-da.log

# Print a timestamped message to stdout and append it to $LOG.
log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG"; }

log "=== SEAP DA scrape started (mode=$MODE) ==="

# Overlap guard: docker's name filter matches substrings, so grep pins the
# exact container name.
if docker ps --filter name=vreaudigital-da --format '{{.Names}}' | grep -q '^vreaudigital-da$'; then
  log "WARN: vreaudigital-da already running, skipping this tick"
  exit 0
fi
# Remove a stopped leftover so --name below cannot collide.
docker rm -f vreaudigital-da 2>/dev/null || true

# ── Fetch DATABASE_URL via Infisical Machine Identity ──
source /opt/vreaudigital/.infisical-mi
TOKEN=$(infisical login --method=universal-auth \
  --domain="$INFISICAL_API_URL" \
  --client-id="$INFISICAL_CLIENT_ID" \
  --client-secret="$INFISICAL_CLIENT_SECRET" \
  --silent --plain)

umask 077 # files created from here on are owner-only
ENVF=$(mktemp /tmp/.vreaudigital-da-env.XXXXXX)
# Remove the secret-bearing env-file on every exit path. Under `set -e` a
# failed npm install or docker run would otherwise leave DATABASE_URL in /tmp.
trap 'rm -f -- "$ENVF"' EXIT
DBURL=$(infisical secrets get DATABASE_URL \
  --domain="$INFISICAL_API_URL" \
  --projectId="$INFISICAL_PROJECT_ID" \
  --env="$INFISICAL_ENV" --path="$INFISICAL_PATH" \
  --token="$TOKEN" --plain --silent)
echo "DATABASE_URL=$DBURL" > "$ENVF"
unset DBURL TOKEN

cd /opt/vreaudigital/services/seap-scraper
# Install deps in a throwaway container when tsx is not present yet.
if [ ! -d node_modules/tsx ]; then
  log "Installing seap-scraper deps..."
  docker run --rm -v "$(pwd):/work" -w /work --user "$(id -u):$(id -g)" \
    node:22-alpine npm install --omit=optional 2>&1 | tee -a "$LOG" >/dev/null
fi

# --mode is quoted so an unexpected MODE value cannot be word-split or
# glob-expanded into extra docker/npx arguments.
CID=$(docker run -d \
  --name vreaudigital-da \
  --network host \
  --env-file "$ENVF" \
  -v "$(pwd):/work" \
  -w /work \
  --user "$(id -u):$(id -g)" \
  --restart no \
  node:22-alpine \
  npx tsx src/index.ts "--mode=$MODE")
log "container started: $CID"

# Early explicit removal keeps the secret on disk as briefly as possible;
# the EXIT trap is only the safety net.
sleep 3
rm -f "$ENVF"
log "envfile cleaned"

docker wait vreaudigital-da >/dev/null
EXIT_CODE=$(docker inspect -f '{{.State.ExitCode}}' vreaudigital-da 2>/dev/null || echo "?")
docker logs vreaudigital-da 2>&1 | tail -40 | tee -a "$LOG"
log "=== SEAP DA scrape done (exit=$EXIT_CODE) ==="

# If docker inspect failed, EXIT_CODE holds the "?" placeholder; `exit` needs
# a number, so report a generic failure instead.
case "$EXIT_CODE" in
  '' | *[!0-9]*) exit 1 ;;
esac
exit "$EXIT_CODE"
+88
View File
@@ -0,0 +1,88 @@
#!/bin/bash
# GNM — Garda Națională de Mediu.
# Scrapes the gnm.ro WordPress RSS feed (~36 pages × 10 items) for environmental
# enforcement press releases. Persists every release to gnm.comunicate, flags
# is_enforcement, and runs a regex pass to surface (firm, fine_lei) tuples into
# gnm.amenzi_extrase.
#
# Mirrors scrape-ancom.sh / scrape-anre.sh pattern: Infisical Machine Identity
# → env-file → docker run --env-file (NEVER -e $VAR), file deleted post-launch.
#
# Idempotent (UPSERT on guid; skip on raw_hash unchanged). Safe to run from cron.
#
# Env knobs:
# MAX_PAGES=0 (default: 0 = walk until empty, max 50)
# SINCE_DAYS=0 (default: 0 = no cutoff; >0 = stop at first item older than N days)
#
# Run:
# sudo MAX_PAGES=2 /opt/vreaudigital/services/seap-scraper/cron/scrape-gnm.sh # smoke (20 articles)
# sudo SINCE_DAYS=30 /opt/vreaudigital/services/seap-scraper/cron/scrape-gnm.sh # incremental
# sudo /opt/vreaudigital/services/seap-scraper/cron/scrape-gnm.sh # full (~360 articles)
set -euo pipefail

# Tunables; a value is only forwarded to the scraper when it is > 0.
MAX_PAGES="${MAX_PAGES:-0}"
SINCE_DAYS="${SINCE_DAYS:-0}"
LOG=/var/log/vreaudigital-gnm.log

# Print a timestamped message to stdout and append it to $LOG.
log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG"; }

log "=== GNM scrape started (max_pages=$MAX_PAGES since_days=$SINCE_DAYS) ==="

# Overlap guard: docker's name filter matches substrings, so grep pins the
# exact container name.
if docker ps --filter name=vreaudigital-gnm --format '{{.Names}}' | grep -q '^vreaudigital-gnm$'; then
  log "WARN: vreaudigital-gnm already running, skipping this tick"
  exit 0
fi
# Remove a stopped leftover so --name below cannot collide.
docker rm -f vreaudigital-gnm 2>/dev/null || true

# ── Fetch DATABASE_URL via Infisical Machine Identity ──
source /opt/vreaudigital/.infisical-mi
TOKEN=$(infisical login --method=universal-auth \
  --domain="$INFISICAL_API_URL" \
  --client-id="$INFISICAL_CLIENT_ID" \
  --client-secret="$INFISICAL_CLIENT_SECRET" \
  --silent --plain)

umask 077 # files created from here on are owner-only
ENVF=$(mktemp /tmp/.vreaudigital-gnm-env.XXXXXX)
# Remove the secret-bearing env-file on every exit path. Under `set -e` a
# failed npm install or docker run would otherwise leave DATABASE_URL in /tmp.
trap 'rm -f -- "$ENVF"' EXIT
DBURL=$(infisical secrets get DATABASE_URL \
  --domain="$INFISICAL_API_URL" \
  --projectId="$INFISICAL_PROJECT_ID" \
  --env="$INFISICAL_ENV" --path="$INFISICAL_PATH" \
  --token="$TOKEN" --plain --silent)
echo "DATABASE_URL=$DBURL" > "$ENVF"
unset DBURL TOKEN

cd /opt/vreaudigital/services/seap-scraper
# Install deps in a throwaway container when tsx is not present yet.
if [ ! -d node_modules/tsx ]; then
  log "Installing seap-scraper deps..."
  docker run --rm -v "$(pwd):/work" -w /work --user "$(id -u):$(id -g)" \
    node:22-alpine npm install --omit=optional 2>&1 | tee -a "$LOG" >/dev/null
fi

# Build the container command as an array so each option stays one argv word.
# A non-numeric knob makes `[ -gt ]` fail (stderr silenced) and is ignored.
SCRAPER_CMD=(npx tsx src/scrape-gnm.ts)
if [ "$MAX_PAGES" -gt 0 ] 2>/dev/null; then
  SCRAPER_CMD+=("--max-pages=$MAX_PAGES")
fi
if [ "$SINCE_DAYS" -gt 0 ] 2>/dev/null; then
  SCRAPER_CMD+=("--since-days=$SINCE_DAYS")
fi

CID=$(docker run -d \
  --name vreaudigital-gnm \
  --network host \
  --env-file "$ENVF" \
  -v "$(pwd):/work" \
  -w /work \
  --user "$(id -u):$(id -g)" \
  --restart no \
  node:22-alpine \
  "${SCRAPER_CMD[@]}")
log "container started: $CID"

# Early explicit removal keeps the secret on disk as briefly as possible;
# the EXIT trap is only the safety net.
sleep 3
rm -f "$ENVF"
log "envfile cleaned"

docker wait vreaudigital-gnm >/dev/null
EXIT_CODE=$(docker inspect -f '{{.State.ExitCode}}' vreaudigital-gnm 2>/dev/null || echo "?")
docker logs vreaudigital-gnm 2>&1 | tail -30 | tee -a "$LOG"
log "=== GNM scrape done (exit=$EXIT_CODE) ==="

# If docker inspect failed, EXIT_CODE holds the "?" placeholder; `exit` needs
# a number, so report a generic failure instead.
case "$EXIT_CODE" in
  '' | *[!0-9]*) exit 1 ;;
esac
exit "$EXIT_CODE"
+79
View File
@@ -0,0 +1,79 @@
#!/bin/bash
# RegAS scraper — runs scrape-regas.ts in a node:22-alpine container.
# Mirrors the enrich-anaf.sh pattern: Infisical Machine Identity → env-file
# → docker run --env-file (NEVER -e $VAR), file deleted post-launch.
#
# Tunables (environment variables, forwarded to the scraper):
#   PAGE_SIZE   default 5000 → --page-size
#   START_PAGE  default 0    → --start-page
#   MAX_PAGES   default 0    → --max-pages
#
# Exit status: the container's exit code, or 1 if it cannot be determined.
#
# Idempotent (uses ON CONFLICT (id) DO UPDATE). Safe to run from cron.
set -euo pipefail
PAGE_SIZE="${PAGE_SIZE:-5000}"
START_PAGE="${START_PAGE:-0}"
MAX_PAGES="${MAX_PAGES:-0}"
LOG=/var/log/vreaudigital-regas.log
# log MSG — prefix MSG with a timestamp, print it, and append it to $LOG.
log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG"; }
# The env-file holds DATABASE_URL in clear text. Remove it on EVERY exit path
# (infisical failure, npm install failure, docker run failure, signals), not
# only after a successful launch.
ENVF=""
cleanup_envfile() {
  if [[ -n "$ENVF" ]]; then
    rm -f -- "$ENVF"
  fi
}
trap cleanup_envfile EXIT
log "=== RegAS scrape started (page-size=$PAGE_SIZE start-page=$START_PAGE max-pages=$MAX_PAGES) ==="
# Skip this tick if a previous run is still alive. The name list is captured
# first so that a SIGPIPE on `docker ps` (grep -q exits at the first match)
# cannot be misread, under pipefail, as "not running" — which would let the
# `docker rm -f` below kill a live scrape.
RUNNING=$(docker ps --filter name=vreaudigital-regas --format '{{.Names}}')
if grep -qx 'vreaudigital-regas' <<<"$RUNNING"; then
  log "WARN: vreaudigital-regas already running, skipping this tick"
  exit 0
fi
# Drop a stopped leftover container so the name is free; "no such container"
# is the normal case and deliberately ignored.
docker rm -f vreaudigital-regas 2>/dev/null || true
# ── Fetch DATABASE_URL via Infisical Machine Identity ──
# The sourced file is expected to define the INFISICAL_* variables used below.
# NOTE(review): --client-secret and --token travel on infisical's argv and are
# visible in `ps` while it runs; consider the CLI's environment-variable
# equivalents — confirm against the installed CLI version.
# shellcheck source=/dev/null
source /opt/vreaudigital/.infisical-mi
TOKEN=$(infisical login --method=universal-auth \
  --domain="$INFISICAL_API_URL" \
  --client-id="$INFISICAL_CLIENT_ID" \
  --client-secret="$INFISICAL_CLIENT_SECRET" \
  --silent --plain)
umask 077
ENVF=$(mktemp /tmp/.vreaudigital-env.XXXXXX)
DBURL=$(infisical secrets get DATABASE_URL \
  --domain="$INFISICAL_API_URL" \
  --projectId="$INFISICAL_PROJECT_ID" \
  --env="$INFISICAL_ENV" --path="$INFISICAL_PATH" \
  --token="$TOKEN" --plain --silent)
# Refuse to launch a scraper that would only fail later on a blank URL.
if [[ -z "$DBURL" ]]; then
  log "FATAL: Infisical returned an empty DATABASE_URL"
  exit 1
fi
printf 'DATABASE_URL=%s\n' "$DBURL" > "$ENVF"
# RegAS uses an intermediate CA cert chain that node's bundle doesn't trust.
# Cert is valid (verified OOB), bypass for this scraper only.
# NOTE(review): this switches off certificate verification for every TLS
# connection the Node process opens, not just the ones to RegAS. Supplying
# the intermediate CA through NODE_EXTRA_CA_CERTS would be narrower — confirm.
printf 'NODE_TLS_REJECT_UNAUTHORIZED=0\n' >> "$ENVF"
unset DBURL TOKEN
# ── Launch detached docker container ──
cd /opt/vreaudigital/services/seap-scraper
# First run only: install dependencies inside the same image the scraper uses.
if [ ! -d node_modules/tsx ]; then
  log "Installing seap-scraper deps..."
  docker run --rm -v "$(pwd):/work" -w /work --user "$(id -u):$(id -g)" \
    node:22-alpine npm install --omit=optional 2>&1 | tee -a "$LOG" >/dev/null
fi
CID=$(docker run -d \
  --name vreaudigital-regas \
  --network host \
  --env-file "$ENVF" \
  -v "$(pwd):/work" \
  -w /work \
  --user "$(id -u):$(id -g)" \
  --restart no \
  node:22-alpine \
  npx tsx src/scrape-regas.ts \
  --page-size="$PAGE_SIZE" \
  --start-page="$START_PAGE" \
  --max-pages="$MAX_PAGES")
log "container started: $CID"
# The docker CLI has already read the env-file by the time `run -d` returns,
# so it can go immediately; the EXIT trap covers the failure paths.
rm -f -- "$ENVF"
ENVF=""
log "envfile cleaned"
docker wait vreaudigital-regas >/dev/null
EXIT_CODE=$(docker inspect -f '{{.State.ExitCode}}' vreaudigital-regas 2>/dev/null || echo "?")
docker logs vreaudigital-regas 2>&1 | tail -10 | tee -a "$LOG"
log "=== RegAS scrape done (exit=$EXIT_CODE) ==="
# "?" (inspect failed) is not a valid argument to exit; report plain failure.
[[ "$EXIT_CODE" =~ ^[0-9]+$ ]] || exit 1
exit "$EXIT_CODE"
+70
View File
@@ -0,0 +1,70 @@
#!/bin/bash
# Setup Photon (Komoot) geocoder docker container with pre-built RO extract.
# Photon = Java service with embedded OpenSearch index over OSM admin polygons + addresses.
#
# Source: https://download1.graphhopper.com/public/extracts/by-country-code/ro/
# Size: ~332MB tar.bz2 → ~3GB extracted
# API: HTTP on :2322, ?q=Strada+X+Bucuresti returns GeoJSON with coords + admin matches.
#
# Optional environment:
#   PHOTON_WAIT_TRIES  readiness probes, 2s apart (default 30 → ~60s)
#
# Exits non-zero if the extract cannot be found, Photon never becomes ready,
# or a smoke test request fails.
set -euo pipefail
PHOTON_DIR=/opt/photon
EXTRACT_BASE=https://download1.graphhopper.com/public/extracts/by-country-code/ro
PHOTON_WAIT_TRIES="${PHOTON_WAIT_TRIES:-30}"
# log MSG — print MSG with a wall-clock prefix.
log() { echo "[$(date '+%H:%M:%S')] $1"; }
# smoke LABEL QUERY — log LABEL, GET /api?QUERY and print at most the first
# 400 characters of the reply. The body is captured before truncation so an
# early-closing `head` can never turn a good response into a pipefail error.
smoke() {
  local label=$1 query=$2 resp
  log "$label"
  if ! resp=$(curl -fs "http://localhost:2322/api?$query"); then
    log "FATAL: smoke test request failed (/api?$query)"
    exit 1
  fi
  printf '%s\n' "${resp:0:400}"
}
log "=== Photon setup ==="
# 1. Download extract — graphhopper publishes dated snapshots (photon-db-ro-YYMMDD.tar.bz2);
#    the "-latest" alias is unreliable, so we auto-pick the newest dated file from the index.
sudo mkdir -p "$PHOTON_DIR"
cd "$PHOTON_DIR"
if [ ! -d "$PHOTON_DIR/photon_data" ]; then
  # `|| LATEST=""`: under pipefail a failed curl or a no-match grep would abort
  # the script right here, before the diagnostic below could ever be printed.
  LATEST=$(curl -fsSL "$EXTRACT_BASE/" \
    | grep -oE 'photon-db-ro-[0-9]{6}\.tar\.bz2' \
    | LC_ALL=C sort -u | tail -n 1) || LATEST=""
  if [ -z "$LATEST" ]; then
    log "FATAL: could not discover latest Photon RO extract from $EXTRACT_BASE/"
    exit 1
  fi
  log "Downloading $LATEST (~332MB)..."
  sudo curl -fL "$EXTRACT_BASE/$LATEST" -o photon-ro.tar.bz2
  log "Extracting (creates ~3GB photon_data/)..."
  # Unpack into a staging directory and rename at the end: an interrupted run
  # must not leave a half-written photon_data/ that later runs take for a
  # finished download.
  STAGE="${PHOTON_DIR:?}/.extract.partial"
  sudo rm -rf -- "$STAGE"
  sudo mkdir -p "$STAGE"
  sudo tar -xjf photon-ro.tar.bz2 -C "$STAGE"
  if ! sudo test -d "$STAGE/photon_data"; then
    log "FATAL: $LATEST did not contain a photon_data/ directory"
    exit 1
  fi
  sudo mv "$STAGE/photon_data" "$PHOTON_DIR/photon_data"
  sudo rm -rf -- "$STAGE"
  sudo rm photon-ro.tar.bz2
  sudo chown -R 1000:1000 "$PHOTON_DIR"
else
  log "photon_data/ already exists; skipping download"
fi
# 2. Run docker container
# Exact-name match: docker's name filter is a substring match, so an unanchored
# grep would also accept e.g. "photon-ro-old".
RUNNING=$(docker ps --filter name=photon-ro --format '{{.Names}}')
if grep -qx 'photon-ro' <<<"$RUNNING"; then
  log "photon-ro already running"
else
  log "Starting photon-ro container..."
  # A stopped container with the same name would block `docker run`; its
  # absence is the normal case and deliberately ignored.
  docker rm -f photon-ro 2>/dev/null || true
  docker run -d --name photon-ro --restart unless-stopped \
    -p 127.0.0.1:2322:2322 \
    -v "$PHOTON_DIR/photon_data:/photon/photon_data" \
    rtuszik/photon-docker:latest
fi
# 3. Wait for startup, smoke test
log "Waiting for Photon to initialize..."
READY=0
for ((i = 1; i <= PHOTON_WAIT_TRIES; i++)); do
  if curl -fs "http://localhost:2322/api?q=Bucuresti" >/dev/null 2>&1; then
    log "Photon ready."
    READY=1
    break
  fi
  sleep 2
done
# Fail loudly instead of falling through to smoke tests that cannot succeed.
if [ "$READY" -ne 1 ]; then
  log "FATAL: Photon did not answer on :2322 after $PHOTON_WAIT_TRIES probes (see: docker logs photon-ro)"
  exit 1
fi
# 4. Smoke tests
smoke "Smoke test 1 — Bucuresti:" "q=Bucuresti&limit=2"
smoke "Smoke test 2 — Cluj-Napoca Strada Memorandumului:" "q=Strada+Memorandumului+Cluj-Napoca&limit=1"
log "=== Photon setup complete (HTTP API on 127.0.0.1:2322) ==="
@@ -0,0 +1,14 @@
# One-shot job: runs enrich-anaf.sh once per activation with the daily tier.
[Unit]
Description=vreaudigital — daily ANAF delta enrichment (tier=daily, concurrency=2)
# network-online.target rather than network.target: network.target does not
# mean the network is usable, and a Persistent=true timer can fire this job
# during boot after a missed run.
Wants=network-online.target docker.service
After=network-online.target docker.service
[Service]
Type=oneshot
User=bulibasa
# Read by enrich-anaf.sh as TIER / ANAF_CONCURRENCY.
Environment=TIER=daily
Environment=ANAF_CONCURRENCY=2
ExecStart=/opt/vreaudigital/services/seap-scraper/cron/enrich-anaf.sh
StandardOutput=journal
StandardError=journal
# For Type=oneshot this bounds how long ExecStart may run.
TimeoutStartSec=2h
@@ -0,0 +1,11 @@
[Unit]
Description=vreaudigital — ANAF delta enrichment daily at 02:00
[Timer]
# The service is bound with Unit= instead of a [Unit] Requires=: a Requires=
# dependency would start the job every time the timer unit itself is started
# (boot, `systemctl enable --now`), not only when the schedule elapses.
Unit=vreaudigital-anaf-daily.service
OnCalendar=*-*-* 02:00:00
# Run once at next start-up if the machine was off at the scheduled time.
Persistent=true
# Spread the start over up to 5 minutes after 02:00.
RandomizedDelaySec=300
[Install]
WantedBy=timers.target
@@ -0,0 +1,11 @@
# One-shot job: runs refresh-mvs.sh once per activation as user bulibasa.
[Unit]
Description=vreaudigital — refresh seap materialized views
# NOTE(review): network.target only orders against network setup, it does not
# wait for connectivity. Confirm whether refresh-mvs.sh needs remote access
# (e.g. to fetch secrets); if so, network-online.target is the usual choice.
Wants=network.target
After=network.target
[Service]
Type=oneshot
User=bulibasa
ExecStart=/opt/vreaudigital/services/seap-scraper/cron/refresh-mvs.sh
# Script output goes to the journal (journalctl -u <this unit>).
StandardOutput=journal
StandardError=journal
@@ -0,0 +1,11 @@
[Unit]
Description=vreaudigital — refresh materialized views nightly at 04:00
[Timer]
# The service is bound with Unit= instead of a [Unit] Requires=: a Requires=
# dependency would start the refresh every time the timer unit itself is
# started (boot, `systemctl enable --now`), not only when the schedule elapses.
Unit=vreaudigital-mvs.service
OnCalendar=*-*-* 04:00:00
# Run once at next start-up if the machine was off at the scheduled time.
Persistent=true
# Spread the start over up to 10 minutes after 04:00.
RandomizedDelaySec=600
[Install]
WantedBy=timers.target
@@ -0,0 +1,12 @@
# One-shot job: runs import-onrc-fresh.sh once per activation.
[Unit]
Description=vreaudigital — fetch latest ONRC bulk and import (weekly check, monthly real change)
# network-online.target rather than network.target: the job fetches a bulk
# download, network.target does not mean the network is usable, and a
# Persistent=true timer can fire this job during boot after a missed run.
Wants=network-online.target
After=network-online.target
[Service]
Type=oneshot
User=bulibasa
ExecStart=/opt/vreaudigital/services/seap-scraper/cron/import-onrc-fresh.sh
StandardOutput=journal
StandardError=journal
# For Type=oneshot this bounds how long ExecStart may run.
TimeoutStartSec=2h
@@ -0,0 +1,11 @@
[Unit]
Description=vreaudigital — weekly ONRC fresh-check Tuesday 03:00
[Timer]
# The service is bound with Unit= instead of a [Unit] Requires=: a Requires=
# dependency would start the import every time the timer unit itself is
# started (boot, `systemctl enable --now`), not only when the schedule elapses.
Unit=vreaudigital-onrc-weekly.service
OnCalendar=Tue *-*-* 03:00:00
# Run once at next start-up if the machine was off at the scheduled time.
Persistent=true
# Spread the start over up to 15 minutes after 03:00.
RandomizedDelaySec=900
[Install]
WantedBy=timers.target
@@ -0,0 +1,18 @@
# Long-running Photon geocoder started directly from its jar (no container).
# NOTE(review): the Photon setup script in this repository starts a docker
# container (photon-ro) published on 127.0.0.1:2322 — the same port this unit
# listens on. Confirm that only one of the two is meant to run on a host.
[Unit]
Description=vreaudigital — Photon 0.5.0 geocoder (Elasticsearch backend) for RO firms
After=network.target
[Service]
Type=simple
User=bulibasa
WorkingDirectory=/opt/photon
# -Xmx8G caps the JVM heap at 8 GiB; data is read from /opt/photon and the
# HTTP API listens on port 2322.
ExecStart=/usr/bin/java -Xmx8G -jar /opt/photon/photon-0.5.0.jar -data-dir /opt/photon -listen-port 2322
# Restart only after an unclean exit, waiting 15 s between attempts.
Restart=on-failure
RestartSec=15
# stdout and stderr are both appended to the same log file.
StandardOutput=append:/var/log/vreaudigital-photon.log
StandardError=append:/var/log/vreaudigital-photon.log
# Raised resource limits: open file descriptors and locked memory
# (presumably for the embedded Elasticsearch — confirm).
LimitNOFILE=65536
LimitMEMLOCK=infinity
[Install]
WantedBy=multi-user.target