initial: split from gov-agreg — vreau.digital standalone platform
Moved from gov-agreg/src/pages/achizitii/* to root (drop prefix).
- 22 pages migrated, 127 files total
- All internal links: /achizitii/X → /X (176 occurrences fixed)
- AchizitiiLayout subnav rewritten: /X paths, top-right link to vreaudigital.ro hub
- BaseLayout new (vreau.digital branding, OG tags, site URL)
- astro.config.mjs: site https://vreau.digital, server output (was static)
- docker-compose: port 5096 (vreaudigital is 5095), container vreau-digital
- deploy.sh: paths /opt/vreau-digital, log /var/log/vreau-digital-deploy.log

Backend shared with gov-agreg:
- PostgreSQL satra (same schemas: seap, firms, anaf, anre, ...)
- Photon, Martin tiles
- Infisical /vreaudigital path (DATABASE_URL etc. shared)

build: PASS (npx astro check 0 errors, npm run build 5s vite + 10s server)
This commit is contained in:
Executable
+82
@@ -0,0 +1,82 @@
|
||||
#!/bin/bash
# Daily delta enrichment from ANAF webservicesp v9.
#
# Runs the tsx script inside a node:22-alpine container so satra doesn't
# need node installed at host level. DATABASE_URL is fetched fresh from
# Infisical and passed via --env-file (created under umask 077 and removed
# right after the container starts) — never on the docker run command line.
#
# Env:
#   TIER=daily|full|bulk   tier selection (default: daily)
#   ANAF_CONCURRENCY=N     concurrency passed to the enricher (default: 2)
#
# Exit status: the container's exit code; 0 when a previous run is still
# active and this tick is skipped; non-zero on any setup failure.
#
# Idempotent. Safe to run from cron.

set -euo pipefail

TIER="${TIER:-daily}"
ANAF_CONCURRENCY="${ANAF_CONCURRENCY:-2}"
LOG=/var/log/vreaudigital-anaf.log
CONTAINER=vreaudigital-anaf
MI_FILE=/opt/vreaudigital/.infisical-mi
ENVF=""

# Timestamped line to stdout and appended to $LOG.
log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG"; }

# Remove the secret-bearing env file on every exit path (set -e aborts,
# signals turned into exits, normal completion).
cleanup() { [ -z "$ENVF" ] || rm -f -- "$ENVF"; }
trap cleanup EXIT

log "=== ANAF enrichment started (tier=$TIER, concurrency=$ANAF_CONCURRENCY) ==="

# Bail if a previous run is still going — daily/full tier should always
# finish well under 24h, so a still-running container means trouble.
if docker ps --filter "name=$CONTAINER" --format '{{.Names}}' | grep -qx -- "$CONTAINER"; then
  log "WARN: vreaudigital-anaf already running, skipping this tick"
  exit 0
fi
# Drop a stopped leftover with the same name, if any (absence is not an error).
docker rm -f "$CONTAINER" 2>/dev/null || true

# ── Fetch DATABASE_URL via Infisical Machine Identity ──
if [ ! -f "$MI_FILE" ]; then
  log "FATAL: $MI_FILE missing"
  exit 1
fi
# shellcheck disable=SC1090,SC1091
source "$MI_FILE"
TOKEN=$(infisical login --method=universal-auth \
  --domain="$INFISICAL_API_URL" \
  --client-id="$INFISICAL_CLIENT_ID" \
  --client-secret="$INFISICAL_CLIENT_SECRET" \
  --silent --plain)

umask 077
ENVF=$(mktemp /tmp/.vreaudigital-env.XXXXXX)
DBURL=$(infisical secrets get DATABASE_URL \
  --domain="$INFISICAL_API_URL" \
  --projectId="$INFISICAL_PROJECT_ID" \
  --env="$INFISICAL_ENV" --path="$INFISICAL_PATH" \
  --token="$TOKEN" --plain --silent)
if [ -z "$DBURL" ]; then
  log "FATAL: DATABASE_URL came back empty from Infisical"
  exit 1
fi
printf 'DATABASE_URL=%s\n' "$DBURL" > "$ENVF"
unset DBURL TOKEN

# ── Launch detached docker container ──
cd /opt/vreaudigital/services/seap-scraper

# Make sure node_modules exists (first run on a fresh host).
if [ ! -d node_modules/tsx ]; then
  log "Installing seap-scraper deps..."
  docker run --rm -v "$(pwd):/work" -w /work --user "$(id -u):$(id -g)" \
    node:22-alpine npm install --omit=optional 2>&1 | tee -a "$LOG" >/dev/null
fi

CID=$(docker run -d \
  --name "$CONTAINER" \
  --network host \
  --env-file "$ENVF" \
  -v "$(pwd):/work" \
  -w /work \
  --user "$(id -u):$(id -g)" \
  --restart no \
  node:22-alpine \
  npx tsx src/enrich-anaf.ts --concurrency="$ANAF_CONCURRENCY" --tier="$TIER")
log "container started: $CID"

# The env file has been consumed by the time `docker run -d` returns; the
# short grace period is kept from the original before deleting it.
sleep 3
rm -f -- "$ENVF"
ENVF=""
log "envfile cleaned"

# Wait synchronously so systemd Type=oneshot accurately captures runtime.
docker wait "$CONTAINER" >/dev/null
EXIT_CODE=$(docker inspect -f '{{.State.ExitCode}}' "$CONTAINER" 2>/dev/null) || EXIT_CODE="?"
docker logs "$CONTAINER" 2>&1 | tail -5 | tee -a "$LOG"
log "=== ANAF enrichment done (exit=$EXIT_CODE) ==="

# `exit "?"` is itself an error; map an unknown status to a generic failure.
case "$EXIT_CODE" in
  '' | *[!0-9]*) exit 1 ;;
  *) exit "$EXIT_CODE" ;;
esac
|
||||
Executable
+343
@@ -0,0 +1,343 @@
|
||||
#!/bin/bash
# Full geocoding fallback chain for firms.entities (WHERE lat IS NULL).
#
# Re-runnable / idempotent. Filters every stage on `lat IS NULL` so re-runs
# are no-ops once coverage is full. Safe to call after any ONRC fresh import
# (import-onrc-fresh.sh) which by itself does NOT geocode new rows.
#
# Stage chain (highest accuracy first):
#   1. geonames_postal — exact 6-digit RO postal match against firms.postal_codes_best
#   2. uat_centroid    — by siruta → public."GisUat" polygon centroid
#   3. photon          — Komoot Photon OSM geocoder (local 127.0.0.1:2322), street-level
#   3b/3c/3d. uat_centroid by postal_codes (locality+county median) — for rows w/o
#                        adr_strada (Photon's filter requires it). Tries locality token,
#                        then Comuna parent, then â/î normalization.
#   4. judet_centroid  — last resort, county median from firms.postal_codes
#
# Two rows in the entire dataset have literally zero address fields and stay NULL.
#
# Usage:
#   sudo /opt/vreaudigital/services/seap-scraper/cron/geocode-firms.sh
#   sudo SKIP_PHOTON=1 /opt/vreaudigital/services/seap-scraper/cron/geocode-firms.sh
#
# Env:
#   SKIP_PHOTON=1          — skip stage 3 (photon docker) — useful when Photon down
#   PHOTON_CONCURRENCY=40
#   PHOTON_BATCH=200

set -euo pipefail

LOG=/var/log/vreaudigital-geocode-firms.log
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
SEAP_DIR="$(dirname "$SCRIPT_DIR")"

SKIP_PHOTON="${SKIP_PHOTON:-0}"
PHOTON_CONCURRENCY="${PHOTON_CONCURRENCY:-40}"
PHOTON_BATCH="${PHOTON_BATCH:-200}"

# Timestamped line to stdout and appended to $LOG.
log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG"; }

log "=== Geocode-firms fallback chain started ==="

if [ ! -f /opt/vreaudigital/.infisical-mi ]; then
  log "FATAL: /opt/vreaudigital/.infisical-mi missing"
  exit 1
fi

# shellcheck disable=SC1091
source /opt/vreaudigital/.infisical-mi
TOKEN=$(infisical login --method=universal-auth \
  --domain="$INFISICAL_API_URL" \
  --client-id="$INFISICAL_CLIENT_ID" \
  --client-secret="$INFISICAL_CLIENT_SECRET" \
  --silent --plain)
DATABASE_URL=$(infisical run --domain="$INFISICAL_API_URL" \
  --projectId="$INFISICAL_PROJECT_ID" \
  --env="$INFISICAL_ENV" --path="$INFISICAL_PATH" \
  --silent --token="$TOKEN" \
  -- sh -c 'echo "$DATABASE_URL"')

# An empty URL would make every sed below emit "" and psql would silently
# fall back to libpq defaults — refuse to continue instead.
if [ -z "$DATABASE_URL" ]; then
  log "FATAL: DATABASE_URL came back empty from Infisical"
  exit 1
fi

# Strip the ?schema=… parameter (psql does not accept it), then split the URL
# into libpq environment variables so the password never appears on argv.
# Assignment and export are separate so a failing substitution is not masked.
DB=$(printf '%s\n' "$DATABASE_URL" | sed -E 's/[?&]schema=[^&]*//; s/\?$//')
PGUSER=$(printf '%s\n' "$DB" | sed -E 's|^postgresql://([^:]+):.*|\1|')
PGPASSWORD=$(printf '%s\n' "$DB" | sed -E 's|^postgresql://[^:]+:([^@]+)@.*|\1|')
PGHOST=$(printf '%s\n' "$DB" | sed -E 's|^postgresql://[^@]+@([^:/]+).*|\1|')
# -n … p prints only on a match; without it a URL lacking an explicit port
# would put the whole URL (password included) into PGPORT.
PGPORT=$(printf '%s\n' "$DB" | sed -nE 's|^postgresql://[^@]+@[^:/]+:([0-9]+)/.*|\1|p')
PGPORT="${PGPORT:-5432}"
PGDATABASE=$(printf '%s\n' "$DB" | sed -E 's|^postgresql://[^@]+@[^/]+/([^?]+).*|\1|')
export PGUSER PGPASSWORD PGHOST PGPORT PGDATABASE

initial_null=$(psql -At -c "SELECT count(*) FROM firms.entities WHERE lat IS NULL;")
log "Initial WHERE lat IS NULL count: $initial_null"

if [ "$initial_null" = "0" ]; then
  log "Nothing to do — no firms with NULL lat."
  unset DATABASE_URL TOKEN DB PGPASSWORD
  exit 0
fi
|
||||
|
||||
# ── Stage 1: geonames_postal ────────────────────────────────────────────────
# Rows with a 6-digit postal code present in firms.postal_codes_best take that
# row's lat/lng (geocode_score 0.6).
log "[stage 1] geonames_postal (exact 6-digit postal match)..."
# -q suppresses the "UPDATE <n>" command tag psql prints after the RETURNING
# rows; without it `wc -l` reports one more than the rows actually updated.
n=$(psql -q -v ON_ERROR_STOP=1 -At -c "
  WITH cand AS (
    SELECT e.cui FROM firms.entities e
    WHERE e.lat IS NULL
      AND e.adr_cod_postal ~ '^[0-9]{6}\$'
      AND EXISTS (SELECT 1 FROM firms.postal_codes_best pc WHERE pc.postal_code = e.adr_cod_postal)
  )
  UPDATE firms.entities e
  SET
    lat = pc.lat::double precision,
    lng = pc.lng::double precision,
    geom = ST_SetSRID(ST_MakePoint(pc.lng::double precision, pc.lat::double precision), 4326)::geography,
    geocode_source = 'geonames_postal',
    geocode_score = 0.6,
    geocoded_at = now(),
    updated_at = now()
  FROM firms.postal_codes_best pc, cand
  WHERE e.cui = cand.cui
    AND e.adr_cod_postal = pc.postal_code
    AND e.lat IS NULL
  RETURNING 1
" | wc -l)
log "[stage 1] updated $n rows"
|
||||
|
||||
# ── Stage 2: uat_centroid by siruta ─────────────────────────────────────────
# Rows whose siruta exists in public."GisUat" take the centroid of that
# polygon, transformed to SRID 4326 (geocode_score 0.3).
log "[stage 2] uat_centroid (via siruta → GisUat polygon centroid)..."
# -q suppresses the "UPDATE <n>" command tag psql prints after the RETURNING
# rows; without it `wc -l` reports one more than the rows actually updated.
n=$(psql -q -v ON_ERROR_STOP=1 -At -c "
  WITH cand AS (
    SELECT e.cui FROM firms.entities e
    WHERE e.lat IS NULL
      AND e.siruta IS NOT NULL
      AND EXISTS (SELECT 1 FROM public.\"GisUat\" gu WHERE gu.siruta = e.siruta)
  )
  UPDATE firms.entities e
  SET
    lat = ST_Y(ST_Transform(ST_Centroid(gu.geom), 4326))::double precision,
    lng = ST_X(ST_Transform(ST_Centroid(gu.geom), 4326))::double precision,
    geom = ST_Transform(ST_Centroid(gu.geom), 4326)::geography,
    geocode_source = 'uat_centroid',
    geocode_score = 0.3,
    geocoded_at = now(),
    updated_at = now()
  FROM public.\"GisUat\" gu, cand
  WHERE e.cui = cand.cui
    AND e.siruta = gu.siruta
    AND e.lat IS NULL
  RETURNING 1
" | wc -l)
log "[stage 2] updated $n rows"
|
||||
|
||||
# ── Stage 3: photon (docker) ────────────────────────────────────────────────
# Runs src/geocode-photon.ts in a node:22-alpine container against the local
# Photon instance. A non-zero container exit is logged but does not abort the
# chain; the later stages still run.
if [ "$SKIP_PHOTON" = "1" ]; then
  log "[stage 3] SKIP_PHOTON=1 — skipping photon stage"
else
  # NOTE(review): this count filters on geocode_source IS NULL while the other
  # stages filter on lat IS NULL — presumably mirrors the photon script's own
  # candidate filter; confirm.
  remaining_photon=$(psql -At -c "
    SELECT count(*) FROM firms.entities
    WHERE geocode_source IS NULL
      AND adr_strada IS NOT NULL
      AND adr_judet IS NOT NULL
  ")
  if [ "$remaining_photon" = "0" ]; then
    log "[stage 3] no photon-eligible rows — skipping"
  else
    log "[stage 3] photon — $remaining_photon candidates..."
    if docker ps --filter name=vreaudigital-geocode --format '{{.Names}}' | grep -q '^vreaudigital-geocode$'; then
      log "WARN: vreaudigital-geocode already running — skipping stage 3"
    else
      # Drop a stopped leftover with the same name, if any.
      docker rm -f vreaudigital-geocode 2>/dev/null || true
      umask 077
      ENVF=$(mktemp /tmp/.vreaudigital-geocode-env.XXXXXX)
      # The env file carries DATABASE_URL: make sure it is removed even when
      # `docker run` fails and set -e aborts the script.
      trap '[ -z "${ENVF:-}" ] || rm -f -- "$ENVF"' EXIT
      printf 'DATABASE_URL=%s\nPHOTON_URL=http://127.0.0.1:2322\n' \
        "$DATABASE_URL" > "$ENVF"
      cd "$SEAP_DIR"
      # Arguments are passed as argv (no `sh -c` string), so the values of
      # PHOTON_CONCURRENCY / PHOTON_BATCH are never re-parsed by a shell.
      CID=$(docker run -d \
        --name vreaudigital-geocode \
        --network host \
        --env-file "$ENVF" \
        -v "$(pwd):/work" -w /work \
        --user "$(id -u):$(id -g)" \
        --restart no \
        node:22-alpine \
        npx tsx src/geocode-photon.ts \
          --concurrency="$PHOTON_CONCURRENCY" --batch="$PHOTON_BATCH")
      log "container started: $CID"
      sleep 3
      rm -f -- "$ENVF"
      ENVF=""
      trap - EXIT
      docker wait vreaudigital-geocode >/dev/null
      EXIT_CODE=$(docker inspect -f '{{.State.ExitCode}}' vreaudigital-geocode 2>/dev/null || echo "?")
      docker logs vreaudigital-geocode 2>&1 | tail -10 | tee -a "$LOG"
      log "[stage 3] photon container exit=$EXIT_CODE"
    fi
  fi
fi

# The URL and token are no longer needed; the remaining stages use PG* env vars.
unset DATABASE_URL TOKEN DB
|
||||
|
||||
# ── Stage 3b/3c/3d: uat_centroid by name (no siruta, no postal) ─────────────
# For rows w/o adr_strada (skipped by photon) match postal_codes locality+county
# median. Three normalization variants try locality token, comuna parent, and
# Romanian â/î diacritic normalization.
#
# 3b: loc_key is adr_localitate with everything after the first comma removed
# and a leading Sat/Oraş/Mun./Loc./Cartier/Comuna prefix stripped.
log "[stage 3b] uat_centroid by postal_codes locality+county median (locality token)..."
# -q suppresses the "UPDATE <n>" command tag psql prints after the RETURNING
# rows; without it `wc -l` reports one more than the rows actually updated.
n=$(psql -q -v ON_ERROR_STOP=1 -At -c "
  WITH cand AS (
    SELECT e.cui, e.adr_judet, e.adr_localitate FROM firms.entities e
    WHERE e.lat IS NULL AND e.adr_judet IS NOT NULL AND e.adr_localitate IS NOT NULL
  ),
  loc_clean AS (
    SELECT
      cui,
      upper(unaccent(regexp_replace(adr_judet,'^MUNICIPIUL ',''))) AS judet_key,
      upper(unaccent(trim(regexp_replace(
        regexp_replace(adr_localitate, ',.*\$', ''),
        '^(Sat|Or[şs]\\.?|Mun\\.?|Loc\\.?|Cartier|Comuna)\\s+', '', 'i'
      )))) AS loc_key
    FROM cand
  ),
  pc_agg AS (
    SELECT
      upper(unaccent(coalesce(county,''))) AS judet_key,
      upper(unaccent(place_name)) AS loc_key,
      percentile_cont(0.5) WITHIN GROUP (ORDER BY lat::double precision) AS lat,
      percentile_cont(0.5) WITHIN GROUP (ORDER BY lng::double precision) AS lng
    FROM firms.postal_codes
    WHERE place_name IS NOT NULL
    GROUP BY 1, 2
  )
  UPDATE firms.entities e
  SET
    lat = pc.lat,
    lng = pc.lng,
    geom = ST_SetSRID(ST_MakePoint(pc.lng, pc.lat), 4326)::geography,
    geocode_source = 'uat_centroid',
    geocode_score = 0.3,
    geocoded_at = now(),
    updated_at = now()
  FROM loc_clean lc
  JOIN pc_agg pc ON pc.judet_key = lc.judet_key AND pc.loc_key = lc.loc_key
  WHERE e.cui = lc.cui AND e.lat IS NULL
  RETURNING 1
" | wc -l)
log "[stage 3b] updated $n rows"
|
||||
|
||||
# Stage 3c: loc_key is the name captured after "Comuna " in adr_localitate;
# rows without such a token get a NULL key and are excluded.
log "[stage 3c] uat_centroid by Comuna parent..."
# -q suppresses the "UPDATE <n>" command tag psql prints after the RETURNING
# rows; without it `wc -l` reports one more than the rows actually updated.
n=$(psql -q -v ON_ERROR_STOP=1 -At -c "
  WITH cand AS (
    SELECT e.cui, e.adr_judet, e.adr_localitate FROM firms.entities e
    WHERE e.lat IS NULL AND e.adr_judet IS NOT NULL AND e.adr_localitate IS NOT NULL
  ),
  loc_clean AS (
    SELECT
      cui,
      upper(unaccent(regexp_replace(adr_judet,'^MUNICIPIUL ',''))) AS judet_key,
      upper(unaccent(trim((regexp_match(adr_localitate, 'Comuna\\s+([^,]+)', 'i'))[1]))) AS loc_key
    FROM cand
  ),
  pc_agg AS (
    SELECT
      upper(unaccent(coalesce(county,''))) AS judet_key,
      upper(unaccent(place_name)) AS loc_key,
      percentile_cont(0.5) WITHIN GROUP (ORDER BY lat::double precision) AS lat,
      percentile_cont(0.5) WITHIN GROUP (ORDER BY lng::double precision) AS lng
    FROM firms.postal_codes
    WHERE place_name IS NOT NULL
    GROUP BY 1, 2
  )
  UPDATE firms.entities e
  SET
    lat = pc.lat,
    lng = pc.lng,
    geom = ST_SetSRID(ST_MakePoint(pc.lng, pc.lat), 4326)::geography,
    geocode_source = 'uat_centroid',
    geocode_score = 0.3,
    geocoded_at = now(),
    updated_at = now()
  FROM loc_clean lc
  JOIN pc_agg pc ON pc.judet_key = lc.judet_key AND pc.loc_key = lc.loc_key
  WHERE e.cui = lc.cui AND e.lat IS NULL AND lc.loc_key IS NOT NULL
  RETURNING 1
" | wc -l)
log "[stage 3c] updated $n rows"
|
||||
|
||||
# Stage 3d: loc_key is the first non-NULL of the Oraş name, the Comuna name,
# or the comma-trimmed locality with a Sat/Loc. prefix removed; î/Î is mapped
# to â/Â on both sides of the join before unaccent/upper.
log "[stage 3d] uat_centroid with â/î normalization (Oraş/Comuna/locality)..."
# -q suppresses the "UPDATE <n>" command tag psql prints after the RETURNING
# rows; without it `wc -l` reports one more than the rows actually updated.
n=$(psql -q -v ON_ERROR_STOP=1 -At -c "
  WITH cand AS (
    SELECT e.cui, e.adr_judet, e.adr_localitate FROM firms.entities e
    WHERE e.lat IS NULL AND e.adr_judet IS NOT NULL AND e.adr_localitate IS NOT NULL
  ),
  loc_norm AS (
    SELECT
      cui,
      upper(unaccent(regexp_replace(adr_judet,'^MUNICIPIUL ',''))) AS judet_key,
      upper(unaccent(translate(trim(coalesce(
        (regexp_match(adr_localitate, 'Or[şs]\\.?\\s+([^,]+)', 'i'))[1],
        (regexp_match(adr_localitate, 'Comuna\\s+([^,]+)', 'i'))[1],
        regexp_replace(regexp_replace(adr_localitate, ',.*\$',''), '^(Sat|Loc\\.?)\\s+','','i')
      )), 'îÎ', 'âÂ'))) AS loc_key
    FROM cand
  ),
  pc_agg AS (
    SELECT
      upper(unaccent(coalesce(county,''))) AS judet_key,
      upper(unaccent(translate(place_name, 'îÎ','âÂ'))) AS loc_key,
      percentile_cont(0.5) WITHIN GROUP (ORDER BY lat::double precision) AS lat,
      percentile_cont(0.5) WITHIN GROUP (ORDER BY lng::double precision) AS lng
    FROM firms.postal_codes
    WHERE place_name IS NOT NULL
    GROUP BY 1, 2
  )
  UPDATE firms.entities e
  SET
    lat = pc.lat,
    lng = pc.lng,
    geom = ST_SetSRID(ST_MakePoint(pc.lng, pc.lat), 4326)::geography,
    geocode_source = 'uat_centroid',
    geocode_score = 0.3,
    geocoded_at = now(),
    updated_at = now()
  FROM loc_norm ln
  JOIN pc_agg pc ON pc.judet_key = ln.judet_key AND pc.loc_key = ln.loc_key
  WHERE e.cui = ln.cui AND e.lat IS NULL AND ln.loc_key IS NOT NULL
  RETURNING 1
" | wc -l)
log "[stage 3d] updated $n rows"
|
||||
|
||||
# ── Stage 4: judet_centroid fallback ────────────────────────────────────────
# Remaining rows take the per-county median lat/lng computed from
# firms.postal_codes (geocode_score 0.1).
log "[stage 4] judet_centroid (county median, last resort)..."
# -q suppresses the "UPDATE <n>" command tag psql prints after the RETURNING
# rows; without it `wc -l` reports one more than the rows actually updated.
n=$(psql -q -v ON_ERROR_STOP=1 -At -c "
  WITH judet_agg AS (
    SELECT
      upper(unaccent(coalesce(county,''))) AS judet_key,
      percentile_cont(0.5) WITHIN GROUP (ORDER BY lat::double precision) AS lat,
      percentile_cont(0.5) WITHIN GROUP (ORDER BY lng::double precision) AS lng
    FROM firms.postal_codes
    WHERE county IS NOT NULL
    GROUP BY 1
  )
  UPDATE firms.entities e
  SET
    lat = ja.lat,
    lng = ja.lng,
    geom = ST_SetSRID(ST_MakePoint(ja.lng, ja.lat), 4326)::geography,
    geocode_source = 'judet_centroid',
    geocode_score = 0.1,
    geocoded_at = now(),
    updated_at = now()
  FROM judet_agg ja
  WHERE upper(unaccent(regexp_replace(e.adr_judet,'^MUNICIPIUL ',''))) = ja.judet_key
    AND e.lat IS NULL
  RETURNING 1
" | wc -l)
log "[stage 4] updated $n rows"

# ── Final stats ─────────────────────────────────────────────────────────────
# Row count per geocode_source, written to stdout and the log.
log "Final stats:"
psql -A -F"|" -c "
  SELECT
    geocode_source,
    count(*) AS rows
  FROM firms.entities
  GROUP BY geocode_source
  ORDER BY rows DESC;
" 2>&1 | tee -a "$LOG"

residual=$(psql -At -c "SELECT count(*) FROM firms.entities WHERE lat IS NULL;")
log "Residual WHERE lat IS NULL: $residual (out of reach — no address fields)"
log "=== Geocode-firms fallback chain done ==="

unset PGPASSWORD
|
||||
Executable
+144
@@ -0,0 +1,144 @@
|
||||
#!/bin/bash
# Daily data-freshness heartbeat for vreaudigital.ro
#   - Queries the newest timestamp of one primary table per source
#     (20 probes across 16 schemas; the timestamp column varies per table)
#   - Compares against per-source expected cadence (days)
#   - Posts a webhook payload if any source is stale beyond threshold
#   - Exits 0 whenever the probe could run or psql failed (alerts are signal,
#     not error — cron noise budget = 1 alert/day); exits 1 only when the
#     heartbeat itself can't run (no Infisical credentials / no DATABASE_URL)
#
# Run from satra cron at 07:00 daily.
# Designed to be paranoid-safe: never echoes the DB password, never fails
# loud on transient DB blips (only fails when the heartbeat itself can't run).

# No -e: failures are handled explicitly at each step below.
set -uo pipefail

LOG=/var/log/vreaudigital-heartbeat.log
# Timestamped line to stdout and appended to $LOG.
log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*" | tee -a "$LOG"; }

WEBHOOK_URL="https://n8n.beletage.ro/webhook/satra-backup-alert"
HOSTNAME_TAG="vreaudigital"

log "=== Heartbeat started ==="

if [ ! -f /opt/vreaudigital/.infisical-mi ]; then
  log "FATAL: /opt/vreaudigital/.infisical-mi missing"
  exit 1
fi

# shellcheck disable=SC1091
source /opt/vreaudigital/.infisical-mi

TOKEN=$(infisical login \
  --method=universal-auth \
  --domain="$INFISICAL_API_URL" \
  --client-id="$INFISICAL_CLIENT_ID" \
  --client-secret="$INFISICAL_CLIENT_SECRET" \
  --silent --plain) || {
  log "FATAL: infisical login failed"
  exit 1
}

DATABASE_URL=$(infisical run \
  --domain="$INFISICAL_API_URL" \
  --projectId="$INFISICAL_PROJECT_ID" \
  --env="$INFISICAL_ENV" \
  --path="$INFISICAL_PATH" \
  --silent --token="$TOKEN" \
  -- sh -c 'echo "$DATABASE_URL"') || {
  log "FATAL: could not read DATABASE_URL from Infisical"
  exit 1
}

# With an empty URL every PG* variable below would be empty and psql would
# fall back to libpq defaults, i.e. possibly probe some other database.
if [ -z "$DATABASE_URL" ]; then
  log "FATAL: DATABASE_URL came back empty from Infisical"
  exit 1
fi

# Strip the ?schema=… parameter, then split the URL into libpq environment
# variables so the password never appears on a command line. Assignment and
# export are separate so a failing substitution is not masked.
DB=$(printf '%s\n' "$DATABASE_URL" | sed -E 's/[?&]schema=[^&]*//; s/\?$//')
PGUSER=$(printf '%s\n' "$DB" | sed -E 's|^postgresql://([^:]+):.*|\1|')
PGPASSWORD=$(printf '%s\n' "$DB" | sed -E 's|^postgresql://[^:]+:([^@]+)@.*|\1|')
PGHOST=$(printf '%s\n' "$DB" | sed -E 's|^postgresql://[^@]+@([^:/]+).*|\1|')
# -n … p prints only on a match; without it a URL lacking an explicit port
# would put the whole URL (password included) into PGPORT.
PGPORT=$(printf '%s\n' "$DB" | sed -nE 's|^postgresql://[^@]+@[^:/]+:([0-9]+)/.*|\1|p')
PGPORT="${PGPORT:-5432}"
PGDATABASE=$(printf '%s\n' "$DB" | sed -E 's|^postgresql://[^@]+@[^/]+/([^?]+).*|\1|')
export PGUSER PGPASSWORD PGHOST PGPORT PGDATABASE
unset DATABASE_URL TOKEN DB
|
||||
|
||||
# Per-source cadence query. Output columns: label, expected_days, gap_days,
# last_seen, status. Sources stuck at known long staleness (anaf datornici
# Q1 2016) are excluded — heartbeat noise budget is for fixable freshness,
# not known constants.
QUERY=$(cat <<'SQL'
WITH probes AS (
  SELECT 'seap.announcements' AS label, 2 AS expected_days, max(publication_date)::date AS last_seen FROM seap.announcements
  UNION ALL
  SELECT 'seap.wsp_sync_state', 1, max(last_run_at)::date FROM seap.wsp_sync_state
  UNION ALL
  SELECT 'seap.sync_state(da)', 30, max(updated_at)::date FROM seap.sync_state WHERE source='da'
  UNION ALL
  SELECT 'firms.entities', 100, max(updated_at)::date FROM firms.entities
  UNION ALL
  SELECT 'firms.financials', 400, max(fetched_at)::date FROM firms.financials
  UNION ALL
  SELECT 'fonduri.beneficiar_anunt', 7, max(data_publicare)::date FROM fonduri.beneficiar_anunt
  UNION ALL
  SELECT 'fonduri.afir_plati', 365, max(fetched_at)::date FROM fonduri.afir_plati
  UNION ALL
  SELECT 'regas.ajutoare', 45, max(fetched_at)::date FROM regas.ajutoare
  UNION ALL
  SELECT 'aep.donatii_pj', 60, max(fetched_at)::date FROM aep.donatii_pj
  UNION ALL
  SELECT 'ani.declaratii', 400, max(fetched_at)::date FROM ani.declaratii
  UNION ALL
  SELECT 'bugetar.entitate', 60, max(updated_at)::date FROM bugetar.entitate
  UNION ALL
  SELECT 'anre.licente', 14, max(fetched_at)::date FROM anre.licente
  UNION ALL
  SELECT 'ancom.operatori', 14, max(fetched_at)::date FROM ancom.operatori
  UNION ALL
  SELECT 'cnsc.decizii', 14, max(fetched_at)::date FROM cnsc.decizii
  UNION ALL
  SELECT 'cnas.furnizori', 60, max(fetched_at)::date FROM cnas.furnizori
  UNION ALL
  SELECT 'asf.entitati', 14, max(fetched_at)::date FROM asf.entitati
  UNION ALL
  SELECT 'aaas.firme', 30, max(fetched_at)::date FROM aaas.firme
  UNION ALL
  SELECT 'curteacont.rapoarte', 14, max(fetched_at)::date FROM curteacont.rapoarte
  UNION ALL
  SELECT 'apia.fermieri', 60, max(fetched_at)::date FROM apia.fermieri
  UNION ALL
  SELECT 'gnm.comunicate', 14, max(fetched_at)::date FROM gnm.comunicate
)
SELECT label, expected_days,
       -- clamp future dates (TED publication-date can be in the future) and
       -- treat NULL last_seen as ancient (empty table → alert).
       -- NB: LEAST(NULL, x) = x in PG (returns NULL only if all args NULL),
       -- so explicit CASE for NULL handling.
       CASE WHEN last_seen IS NULL THEN 9999
            ELSE (now()::date - LEAST(last_seen, now()::date)) END AS gap_days,
       COALESCE(last_seen::text, 'NEVER') AS last_seen,
       CASE WHEN last_seen IS NULL THEN 'STALE'
            WHEN (now()::date - LEAST(last_seen, now()::date)) > expected_days THEN 'STALE'
            ELSE 'OK' END AS status
FROM probes
ORDER BY CASE WHEN last_seen IS NULL THEN 9999
              ELSE (now()::date - LEAST(last_seen, now()::date)) END DESC;
SQL
)

# Tab-separated, tuples only. Any psql failure (including a missing table,
# because of ON_ERROR_STOP) is logged and the run is skipped with exit 0.
OUT=$(psql -v ON_ERROR_STOP=1 -A -F$'\t' -t -c "$QUERY" 2>&1) || {
  log "ERROR: psql failed — heartbeat skipped this run"
  log "$OUT"
  exit 0
}

unset PGPASSWORD

# Column 5 is the status; build one human-readable line per stale source.
STALE_LIST=$(echo "$OUT" | awk -F'\t' '$5=="STALE" { printf "%s (gap=%sd, expected≤%sd, last=%s)\n", $1, $3, $2, $4 }')
# grep -c exits 1 when the count is 0; `|| true` keeps that from mattering.
STALE_COUNT=$(echo -n "$STALE_LIST" | grep -c . || true)
TOTAL=$(echo -n "$OUT" | grep -c . || true)

log "Probed $TOTAL sources, $STALE_COUNT stale"
echo "$OUT" | awk -F'\t' '{ printf "  %-30s %s gap=%sd last=%s\n", $1, $5, $3, $4 }' | tee -a "$LOG"

if [ "$STALE_COUNT" -gt 0 ]; then
  log "ALERT — posting to webhook"
  PAYLOAD=$(jq -nc \
    --arg s "STALE" \
    --arg h "$HOSTNAME_TAG" \
    --argjson c "$STALE_COUNT" \
    --argjson t "$TOTAL" \
    --arg d "$STALE_LIST" \
    '{status:$s, host:$h, service:"data-heartbeat", stale_count:$c, total:$t, details:$d}')
  # -f makes curl return non-zero on HTTP 4xx/5xx, so a rejected alert is
  # logged instead of being mistaken for a delivered one.
  curl -fsS -X POST -H "Content-Type: application/json" --max-time 30 \
    -d "$PAYLOAD" "$WEBHOOK_URL" >/dev/null 2>&1 || log "webhook POST failed (non-fatal)"
fi

log "=== Done ==="
exit 0
|
||||
+132
@@ -0,0 +1,132 @@
|
||||
#!/bin/bash
# AFIR historical XLSX importer wrapper.
#
# Downloads a yearly AFIR FEADR/FEGA XLSX on satra, normalizes it to pipe-TSV
# there, COPYs it into fonduri.staging_afir, then INSERTs into
# fonduri.afir_plati with source_year tagging. All heavy steps run on satra
# over ssh.
#
# Idempotent on a full run: rows with the matching source_year are deleted
# before insert, in the same transaction as the insert (XLSX dumps are
# stateless reflections of AFIR DB at publication time).
#
# Usage:
#   ./import-afir-historical.sh URL YEAR FUND [LIMIT]
#     URL:   AFIR XLSX direct download URL
#     YEAR:  4-digit source year 20xx, e.g. 2023
#     FUND:  'feadr' or 'fega' (informational; schema is identical)
#     LIMIT: optional integer — only insert first N rows (smoke test)
#
# Example:
#   ./import-afir-historical.sh \
#     'https://www.afir.ro/media/35cm3jdr/listaplati_2023_feadr_actualizata.xlsx' \
#     2023 feadr
#
# Smoke test (1000 rows):
#   ./import-afir-historical.sh '<url>' 2023 feadr 1000

set -euo pipefail

URL="${1:?URL required}"
YEAR="${2:?YEAR required}"
FUND="${3:?FUND required (feadr|fega)}"
LIMIT="${4:-}"

# YEAR, FUND and LIMIT are interpolated into remote shell commands and SQL
# below, so each is validated against a strict pattern first.
if ! [[ "$YEAR" =~ ^20[0-9]{2}$ ]]; then
  echo "[afir-historical] ERROR: YEAR must be a 4-digit year 20xx (got: $YEAR)" >&2
  exit 2
fi
if [[ "$FUND" != "feadr" && "$FUND" != "fega" ]]; then
  echo "[afir-historical] ERROR: FUND must be 'feadr' or 'fega' (got: $FUND)" >&2
  exit 2
fi
if [[ -n "$LIMIT" ]] && ! [[ "$LIMIT" =~ ^[0-9]+$ ]]; then
  echo "[afir-historical] ERROR: LIMIT must be a non-negative integer (got: $LIMIT)" >&2
  exit 2
fi

WORK_REMOTE="/tmp/afir-historical-$YEAR-$FUND"
# Shell-quote the URL for the remote command line; plain '...' wrapping
# breaks (and allows command injection) when the URL contains a quote.
URL_Q=$(printf '%q' "$URL")

echo "[afir-historical] === ${YEAR} ${FUND} ==="

# 1. Download directly on satra to skip the upload-back-to-server hop —
#    the XLSX is 30 MB.
# NOTE(review): -k disables TLS certificate verification — confirm it is
# still required for this host.
echo "[afir-historical] downloading on satra..."
ssh satra "mkdir -p $WORK_REMOTE && curl -sLkf --max-time 600 -o $WORK_REMOTE/listaplati.xlsx $URL_Q && ls -lh $WORK_REMOTE/listaplati.xlsx"

# 2. Normalize to pipe-delimited TSV using existing python3-openpyxl on satra.
SCRIPT_DIR="$(cd "$(dirname "$0")/.." && pwd)/scripts"
echo "[afir-historical] uploading normalizer..."
scp -q "$SCRIPT_DIR/import-afir-historical.py" "satra:$WORK_REMOTE/normalize.py"

echo "[afir-historical] normalizing XLSX → TSV (this takes ~2-5 min for 500K rows)..."
ssh satra "python3 $WORK_REMOTE/normalize.py $WORK_REMOTE/listaplati.xlsx $WORK_REMOTE/data.tsv 2>&1 | tail -20"

# 3. Optional smoke-test truncation
TSV_REMOTE="$WORK_REMOTE/data.tsv"
if [ -n "$LIMIT" ]; then
  echo "[afir-historical] LIMIT=$LIMIT — truncating TSV for smoke test..."
  ssh satra "head -n $LIMIT $WORK_REMOTE/data.tsv > $WORK_REMOTE/data.smoke.tsv && wc -l $WORK_REMOTE/data.smoke.tsv"
  TSV_REMOTE="$WORK_REMOTE/data.smoke.tsv"
fi

# 4. Stage on Postgres via /tmp/baseline.sh (Infisical-aware psql wrapper).
#    The SQL is fed over ssh's stdin from a local heredoc, so it needs no
#    second layer of shell quoting; $TSV_REMOTE is expanded locally.
echo "[afir-historical] staging + insert..."
ssh satra /tmp/baseline.sh <<SQL
\set ON_ERROR_STOP on

TRUNCATE TABLE fonduri.staging_afir;

\copy fonduri.staging_afir (beneficiar_name, last_name, mama_cui, localitate, cod_masura, obiectiv, data_start, data_end, fega_op, fega_total, feadr_op, feadr_total, op_amount, cofinantare, ue_total) FROM '$TSV_REMOTE' WITH (FORMAT text, DELIMITER '|', NULL '')

SELECT 'staging_loaded' AS step, COUNT(*) AS rows FROM fonduri.staging_afir;
SQL

# 5. Replace the year's rows. Prior rows are deleted only on a full run
#    (LIMIT empty); a smoke test just appends its sample rows.
# NOTE(review): the DELETE is keyed on source_year only, so importing the
# second fund of a year removes the first fund's rows — confirm whether a
# fund discriminator is needed.
DELETE_SQL=""
if [ -z "$LIMIT" ]; then
  echo "[afir-historical] full run — deleting prior rows for source_year=$YEAR..."
  DELETE_SQL="DELETE FROM fonduri.afir_plati WHERE source_year = $YEAR;"
fi

# DELETE and INSERT share one transaction: if the INSERT fails (e.g. a bad
# numeric cast), ON_ERROR_STOP ends the session and the DELETE is rolled
# back, so the previously imported rows survive.
ssh satra /tmp/baseline.sh <<SQL
\set ON_ERROR_STOP on

BEGIN;

$DELETE_SQL

INSERT INTO fonduri.afir_plati (
  source_year, beneficiar_name, last_name, mama_cui, localitate,
  cod_masura, obiectiv, data_start, data_end,
  fega_op, fega_total, feadr_op, feadr_total,
  op_amount, cofinantare, ue_total
)
SELECT
  $YEAR,
  beneficiar_name, NULLIF(last_name, ''), NULLIF(mama_cui, ''), NULLIF(localitate, ''),
  NULLIF(cod_masura, ''), NULLIF(obiectiv, ''), NULLIF(data_start, ''), NULLIF(data_end, ''),
  NULLIF(fega_op, '')::numeric,
  NULLIF(fega_total, '')::numeric,
  NULLIF(feadr_op, '')::numeric,
  NULLIF(feadr_total, '')::numeric,
  NULLIF(op_amount, '')::numeric,
  NULLIF(cofinantare, '')::numeric,
  NULLIF(ue_total, '')::numeric
FROM fonduri.staging_afir;

COMMIT;

SELECT '$YEAR-$FUND' AS run,
       COUNT(*) AS rows_inserted,
       COUNT(DISTINCT beneficiar_name) AS distinct_beneficiars,
       SUM(CASE WHEN feadr_total > 0 THEN 1 END) AS with_feadr,
       SUM(CASE WHEN fega_total > 0 THEN 1 END) AS with_fega,
       SUM(ue_total)::bigint AS sum_ue_eur
FROM fonduri.afir_plati WHERE source_year = $YEAR;
SQL

# The remote workdir is kept after a smoke test and removed after a full run.
if [ -z "$LIMIT" ]; then
  echo "[afir-historical] cleaning up remote workdir..."
  ssh satra "rm -rf $WORK_REMOTE"
fi

echo "[afir-historical] === done ($YEAR $FUND) ==="
|
||||
+210
@@ -0,0 +1,210 @@
|
||||
#!/bin/bash
# APIA "Lista fermieri" importer wrapper.
#
# Discovers CKAN package "lista-fermierilor-campania-apia-{YEAR}" on
# data.gov.ro and ingests each XLSX resource into apia.fermieri. The
# package can grow over time as more UATs publish their lists; the importer
# is resource-id keyed so re-runs are idempotent (DELETE WHERE
# source_resource_id = X before re-INSERT).
#
# Pattern follows cron/import-afir-historical.sh but simpler — APIA XLSX is
# tiny (KB-MB, not 30 MB), so we don't need streaming COPY tricks; we
# stage on satra and load directly.
#
# Usage:
#   ./import-apia-fermieri.sh             # all years (currently 2024)
#   ./import-apia-fermieri.sh 2024        # only the given year
#   ./import-apia-fermieri.sh 2024 1      # smoke test: only first resource
#
# Requires `jq` and `python3-openpyxl` on satra (already installed).

set -euo pipefail

YEAR_FILTER="${1:-}"       # empty = all years discoverable
RESOURCE_LIMIT="${2:-0}"   # 0 = all resources within selected year(s)

# Private, unpredictable scratch directory (a fixed /tmp/name-$$ path can be
# pre-created by another local user). The trap is single-quoted so the path
# is expanded, and properly quoted, when the trap fires.
WORK_LOCAL="$(mktemp -d "${TMPDIR:-/tmp}/apia-import.XXXXXX")"
trap 'rm -rf -- "$WORK_LOCAL"' EXIT

SCRIPT_DIR="$(cd "$(dirname "$0")/.." && pwd)/scripts"
NORMALIZER="$SCRIPT_DIR/import-apia-fermieri.py"
|
||||
|
||||
# 1. Discover candidate datasets via CKAN search.
echo "[apia-import] discovering CKAN datasets..."
# -f: fail on HTTP errors instead of saving an error page as search.json.
curl -fsSL --max-time 60 \
  "https://data.gov.ro/api/3/action/package_search?q=lista+fermieri+APIA&rows=50" \
  > "$WORK_LOCAL/search.json"

# Emit one tab-separated row per resource:
#   dataset_name, year, resource_id, resource_url, resource_name
# Only XLSX/XLS resources of packages whose name matches lista[-_]ferm* and
# carries a 20xx year are kept; YEAR_FILTER, when set, must equal that year.
python3 - "$WORK_LOCAL/search.json" "$YEAR_FILTER" > "$WORK_LOCAL/resources.tsv" <<'PY'
import json, sys, re

path, year_filter = sys.argv[1], sys.argv[2]
with open(path) as f:
    d = json.load(f)

results = d.get("result", {}).get("results", [])
out_lines = []
for pkg in results:
    name = pkg.get("name", "")
    if not re.search(r"lista[-_]ferm", name, re.I):
        continue
    # Year extraction from package name (e.g. "lista-fermierilor-campania-apia-2024")
    m = re.search(r"(20\d{2})", name)
    pkg_year = m.group(1) if m else ""
    if year_filter and pkg_year != year_filter:
        continue
    for rs in pkg.get("resources", []):
        fmt = (rs.get("format") or "").upper()
        if fmt not in ("XLSX", "XLS"):
            continue
        rid = rs.get("id") or ""
        rurl = rs.get("url") or ""
        # Tabs and line breaks inside the display name would corrupt the TSV.
        rname = rs.get("name") or ""
        for ch in ("\t", "\r", "\n"):
            rname = rname.replace(ch, " ")
        if not (rid and rurl and pkg_year):
            continue
        out_lines.append(f"{name}\t{pkg_year}\t{rid}\t{rurl}\t{rname}")

if out_lines:
    print("\n".join(out_lines))
else:
    # Print nothing on stdout: an unconditional print() would write a lone
    # blank line that a line count then reports as one resource.
    print("[apia-import] no matching xlsx resources found", file=sys.stderr)
PY
|
||||
|
||||
# Count non-blank lines only: a resources.tsv holding just an empty line
# must count as zero resources, otherwise the early exit below is skipped
# and the import loop runs with empty fields. grep -c exits 1 on zero
# matches (while still printing 0), hence the "|| true".
N_RESOURCES=$(grep -c . "$WORK_LOCAL/resources.tsv" || true)
N_RESOURCES=${N_RESOURCES:-0}
echo "[apia-import] found $N_RESOURCES candidate XLSX resource(s)"

if [ "$N_RESOURCES" -eq 0 ]; then
  exit 0
fi

# Optional smoke truncation (head N). A non-numeric limit makes the test
# error out (silenced) and is treated as "no limit".
if [ "$RESOURCE_LIMIT" -gt 0 ] 2>/dev/null; then
  head -n "$RESOURCE_LIMIT" "$WORK_LOCAL/resources.tsv" > "$WORK_LOCAL/resources.smoke.tsv"
  mv "$WORK_LOCAL/resources.smoke.tsv" "$WORK_LOCAL/resources.tsv"
  echo "[apia-import] smoke mode — truncated to first $RESOURCE_LIMIT resource(s)"
fi

# 2. Upload normalizer to satra (once).
echo "[apia-import] uploading normalizer..."
ssh satra "mkdir -p /tmp/apia-import"
scp -q "$NORMALIZER" satra:/tmp/apia-import/normalize.py
|
||||
|
||||
# 3. For each resource: download → normalize → stage → INSERT.
TOTAL_ROWS=0
TOTAL_INSERTED=0
TOTAL_RESOURCES=0

# The resource list is read on fd 3, NOT on stdin: every ssh call in the loop
# body inherits the loop's stdin and would otherwise swallow the remaining
# lines of resources.tsv, ending the loop after the first resource.
while IFS=$'\t' read -r -u 3 DATASET_ID YEAR RESOURCE_ID SOURCE_URL RESOURCE_NAME; do
  # Ignore blank or incomplete lines.
  [ -n "$RESOURCE_ID" ] || continue

  TOTAL_RESOURCES=$((TOTAL_RESOURCES + 1))
  WORK_REMOTE="/tmp/apia-import/$RESOURCE_ID"
  echo "[apia-import] === $DATASET_ID / $RESOURCE_ID ($RESOURCE_NAME) ==="

  STARTED_AT=$(date -u +%Y-%m-%dT%H:%M:%S.%3NZ)
  T0=$(date +%s%3N)

  # NOTE(review): curl -k disables TLS certificate verification for the
  # download; kept as in the original, confirm it is still required.
  ssh satra "mkdir -p $WORK_REMOTE && curl -sLkf --max-time 120 -o $WORK_REMOTE/listaferm.xlsx '$SOURCE_URL' && ls -lh $WORK_REMOTE/listaferm.xlsx"

  ssh satra "python3 /tmp/apia-import/normalize.py \
    $WORK_REMOTE/listaferm.xlsx $WORK_REMOTE/data.tsv \
    '$YEAR' '$DATASET_ID' '$RESOURCE_ID' '$SOURCE_URL' 2>&1 | tail -5"

  N_TSV=$(ssh satra "wc -l < $WORK_REMOTE/data.tsv")
  N_TSV=${N_TSV//[[:space:]]/}
  [[ "$N_TSV" =~ ^[0-9]+$ ]] || N_TSV=0
  echo "[apia-import] normalized rows: $N_TSV"

  # Idempotent: drop existing rows for this resource_id, then re-INSERT.
  # Everything runs in ONE transaction, so a failed COPY/INSERT rolls the
  # DELETE back instead of leaving the resource without any rows.
  ssh satra "/tmp/baseline.sh <<SQL
\\set ON_ERROR_STOP on
BEGIN;

TRUNCATE TABLE apia.staging_fermieri;

\\copy apia.staging_fermieri FROM '$WORK_REMOTE/data.tsv' WITH (FORMAT text, DELIMITER '|', NULL '')

SELECT 'staged' AS step, COUNT(*) AS rows FROM apia.staging_fermieri;

DELETE FROM apia.fermieri WHERE source_resource_id = '$RESOURCE_ID';

-- Dedupe within the staging set on the natural key (UAT XLSXes occasionally
-- list the same farmer twice for separate parcel categories). Pick the row
-- with max suprafata_ha so the larger declaration is not lost.
INSERT INTO apia.fermieri (
  campaign_year, name, comuna_oras, sat, centru_apia,
  responsabil_uat, suprafata_ha,
  source_dataset_id, source_resource_id, source_url
)
SELECT DISTINCT ON (campaign_year::smallint, name, NULLIF(comuna_oras,''), NULLIF(sat,''))
  campaign_year::smallint,
  name,
  NULLIF(comuna_oras, ''),
  NULLIF(sat, ''),
  NULLIF(centru_apia, ''),
  NULLIF(responsabil_uat, ''),
  NULLIF(suprafata_ha, '')::numeric,
  source_dataset_id,
  source_resource_id,
  source_url
FROM apia.staging_fermieri
ORDER BY campaign_year::smallint, name, NULLIF(comuna_oras,''), NULLIF(sat,''),
  NULLIF(suprafata_ha,'')::numeric DESC NULLS LAST
ON CONFLICT (campaign_year, name, comuna_oras, sat) DO UPDATE
SET centru_apia = EXCLUDED.centru_apia,
  responsabil_uat = EXCLUDED.responsabil_uat,
  suprafata_ha = EXCLUDED.suprafata_ha,
  source_dataset_id = EXCLUDED.source_dataset_id,
  source_resource_id = EXCLUDED.source_resource_id,
  source_url = EXCLUDED.source_url,
  fetched_at = now();

COMMIT;

SELECT 'inserted' AS step,
  COUNT(*) AS rows_now
FROM apia.fermieri WHERE source_resource_id = '$RESOURCE_ID';
SQL"

  N_NOW=$(ssh satra "/tmp/baseline.sh -t -A -c \"SELECT COUNT(*) FROM apia.fermieri WHERE source_resource_id = '$RESOURCE_ID';\" 2>/dev/null | tail -1")
  N_NOW=${N_NOW//[[:space:]]/}
  # An empty/non-numeric answer would break the SQL and arithmetic below.
  [[ "$N_NOW" =~ ^[0-9]+$ ]] || N_NOW=0
  echo "[apia-import] inserted rows for $RESOURCE_ID: $N_NOW"

  T1=$(date +%s%3N)
  DURATION=$((T1 - T0))

  # Log the run
  ssh satra "/tmp/baseline.sh -c \"
    INSERT INTO apia.scrape_log (
      source_dataset_id, source_resource_id, source_url, campaign_year,
      rows_seen, rows_inserted, duration_ms, started_at
    ) VALUES (
      '$DATASET_ID', '$RESOURCE_ID', '$SOURCE_URL', $YEAR,
      $N_TSV, $N_NOW, $DURATION, '$STARTED_AT'
    );\" 2>&1 | tail -2"

  TOTAL_ROWS=$((TOTAL_ROWS + N_TSV))
  TOTAL_INSERTED=$((TOTAL_INSERTED + N_NOW))

  ssh satra "rm -rf $WORK_REMOTE"
done 3< "$WORK_LOCAL/resources.tsv"
|
||||
|
||||
# Steps 4-6: post-processing on satra once every resource has been loaded.

# Prefix a progress message with the script tag.
say() { printf '[apia-import] %s\n' "$*"; }

# 4. CUI matcher — show only the last lines of its output.
say "matching CUI..."
ssh satra "/tmp/baseline.sh -c 'SELECT * FROM apia.match_cui();' 2>&1 | tail -10"

# 5. Refresh the materialized view.
say "refreshing materialized view..."
ssh satra "/tmp/baseline.sh -c 'REFRESH MATERIALIZED VIEW apia.mv_per_cui;' 2>&1 | tail -5"

# 6. Final summary: table-wide totals over apia.fermieri.
SUMMARY_SQL="SELECT
'totals' AS metric,
COUNT(*) AS rows_total,
COUNT(DISTINCT source_resource_id) AS resources,
COUNT(DISTINCT comuna_oras) AS comune,
COUNT(DISTINCT centru_apia) AS centre_apia,
ROUND(SUM(suprafata_ha)::numeric, 2) AS total_ha,
COUNT(*) FILTER (WHERE cui IS NOT NULL) AS rows_with_cui,
COUNT(*) FILTER (WHERE is_legal_person) AS rows_pj
FROM apia.fermieri;"

say "=== SUMMARY ==="
ssh satra "/tmp/baseline.sh <<'SQL'
$SUMMARY_SQL
SQL"

say "=== done ($TOTAL_RESOURCES resource(s), $TOTAL_INSERTED rows) ==="
|
||||
@@ -0,0 +1,526 @@
|
||||
#!/bin/bash
|
||||
# Historical financial backfill 2015-2019 from data.gov.ro / MFP.
|
||||
#
|
||||
# Why a separate script: 2015 and pre-2020 files have slightly different
|
||||
# schemas (WEB_UU 2015 has 21 cols vs 22 for 2016+; WEB_BL_BS_SL 2015 has 23
|
||||
# cols vs 22 for 2016+; WEB_INST_DE_CREDIT 2016/2017/2019 has 23 cols vs 25
|
||||
# for 2024). The daily importer (import-financials.sh +
|
||||
# import-financials-ong-banks.sh) assumes the 2020+ schema and silently fails
|
||||
# or rejects older years. This wrapper:
|
||||
# 1) Downloads the right files from data.gov.ro for the requested years.
|
||||
# 2) Loads them via a session-local TEMP TABLE matched to that year's column
|
||||
# count, then INSERTs into the canonical firms.financials* tables.
|
||||
#
|
||||
# Usage on satra:
|
||||
# /opt/vreaudigital/services/seap-scraper/cron/import-financials-historical.sh
|
||||
# YEARS="2017 2018" /opt/...../import-financials-historical.sh # subset
|
||||
#
|
||||
# Idempotent — PK (cui, year) + ON CONFLICT DO UPDATE.
|
||||
#
|
||||
# Banks: 2015 and 2018 have no Inst_de_credit file at data.gov.ro. Banks for
|
||||
# 2016/2017/2019 use the pre-IFRS schema (21 indicators), so this script also
|
||||
# loads pre-2020 bank files into firms.financials_banks with the JSONB
|
||||
# `indicators` column carrying everything; the typed columns are mapped
|
||||
# best-effort (i21 instead of i23 → cifra_afaceri).
|
||||
|
||||
# -e is not enabled: the per-category calls at the bottom of the script log
# a failure and carry on with the next category/year.
set -uo pipefail

DATA_DIR=/opt/vreaudigital/data/mfinante
LOG=/var/log/vreaudigital-fin-historical.log

# Timestamped message to stdout and appended to $LOG.
log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG"; }

mkdir -p "$DATA_DIR"

# ── DB env: fetch DATABASE_URL from Infisical and split it into PG* vars ──
source /opt/vreaudigital/.infisical-mi || { log "[ERR] cannot read /opt/vreaudigital/.infisical-mi"; exit 1; }
TOKEN=$(infisical login --method=universal-auth --domain="$INFISICAL_API_URL" \
  --client-id="$INFISICAL_CLIENT_ID" --client-secret="$INFISICAL_CLIENT_SECRET" \
  --silent --plain)
DBURL=$(infisical run --domain="$INFISICAL_API_URL" \
  --projectId="$INFISICAL_PROJECT_ID" --env="$INFISICAL_ENV" \
  --path="$INFISICAL_PATH" --silent --token="$TOKEN" \
  -- sh -c 'echo "$DATABASE_URL"')

# Without a URL every sed below would yield an empty value and psql would
# silently fall back to its defaults, so stop here instead.
if [ -z "$DBURL" ]; then
  log "[ERR] DATABASE_URL could not be fetched from Infisical, aborting"
  exit 1
fi

# Drop the "schema=" query parameter before splitting the URL.
DB=$(printf '%s\n' "$DBURL" | sed -E 's/[?&]schema=[^&]*//; s/\?$//')
PGUSER=$(printf '%s\n' "$DB" | sed -E 's|^postgresql://([^:]+):.*|\1|')
PGPASSWORD=$(printf '%s\n' "$DB" | sed -E 's|^postgresql://[^:]+:([^@]+)@.*|\1|')
PGHOST=$(printf '%s\n' "$DB" | sed -E 's|^postgresql://[^@]+@([^:/]+).*|\1|')
PGPORT=$(printf '%s\n' "$DB" | sed -E 's|^postgresql://[^@]+@[^:]+:([0-9]+)/.*|\1|')
PGDATABASE=$(printf '%s\n' "$DB" | sed -E 's|^postgresql://[^@]+@[^/]+/([^?]+).*|\1|')
export PGUSER PGPASSWORD PGHOST PGPORT PGDATABASE
# Keep the raw secrets out of the rest of the script's environment.
unset DBURL TOKEN DB

YEARS="${YEARS:-2015 2016 2017 2018 2019}"

log "=== Historical financial import started (YEARS=$YEARS) ==="
|
||||
|
||||
# Discover a download URL from a data.gov.ro slug by filename regex.
# Args:    $1 slug    - CKAN package id
#          $2 pattern - Python regex, matched case-insensitively against each
#                       resource name
# Outputs: URL of the first matching resource on stdout; nothing when there
#          is no match or the API response cannot be fetched/parsed.
# The pattern is handed to Python as an argument (sys.argv) instead of being
# pasted into the program text, so quotes in it cannot change the code.
discover() {
  local slug="$1"
  local pattern="$2"
  curl -fsSL --max-time 30 "https://data.gov.ro/api/3/action/package_show?id=$slug" 2>/dev/null \
    | python3 -c '
import json, re, sys

try:
    d = json.load(sys.stdin)
except ValueError:
    # Empty or non-JSON response (e.g. curl failed): report "not found".
    sys.exit(1)
pat = re.compile(sys.argv[1], re.I)
for r in (d.get("result") or {}).get("resources") or []:
    # "name"/"url" may be null in the API response.
    if pat.search(r.get("name") or ""):
        print(r.get("url") or "")
        break
' "$pattern"
}
|
||||
|
||||
# Download a file from data.gov.ro if not already present.
# Args:    $1 local_path - destination file
#          $2 url        - source URL (may be empty when discovery failed)
# Returns: 0 when the file already exists (non-empty) or was downloaded,
#          1 when there is no URL or the download failed / came back empty.
# The transfer goes to "<local_path>.part" and is renamed only once it has
# completed, so an interrupted run cannot leave a truncated file that a later
# run would accept as "already exists".
fetch() {
  local file="$1"
  local url="$2"
  local partial="${file}.part"
  if [ -s "$file" ]; then
    log " [SKIP] $file already exists ($(stat -c%s "$file") bytes)"
    return 0
  fi
  if [ -z "$url" ]; then
    log " [ERR] No URL for $file"
    return 1
  fi
  log " Downloading $url → $file"
  if ! curl -fsL --max-time 300 -o "$partial" "$url" || [ ! -s "$partial" ]; then
    log " [ERR] download failed"
    rm -f -- "$partial" "$file"
    return 1
  fi
  if ! mv -f -- "$partial" "$file"; then
    log " [ERR] download failed"
    rm -f -- "$partial"
    return 1
  fi
  log " OK $(stat -c%s "$file") bytes"
}
|
||||
|
||||
# ─── WEB_UU (companies, prescurtat) ──────────────────────────────────────
# Load one year of WEB_UU into firms.financials through the shared
# firms.staging_financials table.
# Args:    $1 year
# Returns: non-zero when the file cannot be fetched or any psql step fails.
#          TRUNCATE and COPY failures abort the import: continuing would
#          UPSERT whatever an earlier import left in the staging table under
#          this year.
import_uu() {
  local year="$1"
  local file="$DATA_DIR/web_uu_${year}.txt"
  local slug="situatii_financiare_${year}"
  local pattern="^web_uu.*${year}\\.txt$"
  local url ncols copy_cols value_cols upsert_note

  case "$year" in
    2015)
      # 2015 schema (21 cols, CUI,CAEN,I1..I19). The pre-2016 reporting
      # ordering omits the modern I12 (patrimoniul_regiei) column entirely
      # and shifts everything from cifra_afaceri onward one position left:
      #   2015 I12 ↔ modern I13 (cifra_afaceri)
      #   2015 I13 ↔ modern I14 (venituri_total)
      #   ...
      #   2015 I18 ↔ modern I19 (pierdere_neta)
      #   2015 I19 ↔ modern I20 (numar_salariati)
      # Verified by matching cifra_afaceri / salariati to a stable CUI's
      # 2016-2024 series. Without this remap, salariati was being ingested
      # as pierdere_neta and cifra_afaceri was off by one column.
      ncols=21
      copy_cols="cui, caen, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15, i16, i17, i18, i19"
      # NULL fills patrimoniul_regiei, which the 2015 schema does not have.
      value_cols="i1, i2, i3, i4, i5, i6, i7, i8, i9,
  i10, i11,
  NULL::numeric(20,2),
  i12, i13, i14, i15, i16, i17, i18,
  CASE WHEN i19 BETWEEN 0 AND 100000000 THEN i19::bigint ELSE NULL END"
      upsert_note=" (2015 left-shift remap)"
      ;;
    *)
      # Standard schema (2016+): CUI,CAEN,I1..I20. I20 = salariati.
      ncols=22
      copy_cols="cui, caen, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15, i16, i17, i18, i19, i20"
      value_cols="i1, i2, i3, i4, i5, i6, i7, i8, i9,
  i10, i11, i12, i13, i14, i15, i16, i17, i18, i19,
  CASE WHEN i20 BETWEEN 0 AND 100000000 THEN i20::bigint ELSE NULL END"
      upsert_note=""
      ;;
  esac

  if [ ! -s "$file" ]; then
    url=$(discover "$slug" "$pattern")
    fetch "$file" "$url" || return 1
  fi

  log "[$year/WEB_UU] COPY $file ($(stat -c%s "$file") bytes, $ncols cols)..."
  psql -v ON_ERROR_STOP=1 -c "TRUNCATE TABLE firms.staging_financials;" || return 1
  psql -v ON_ERROR_STOP=1 <<COPYEOF || return 1
\\copy firms.staging_financials ($copy_cols) FROM '$file' WITH (FORMAT csv, DELIMITER ',', HEADER true, NULL '');
COPYEOF

  log "[$year/WEB_UU] UPSERT${upsert_note}..."
  # On conflict only source/caen are touched, and only when the existing row
  # did not already come from WEB_UU.
  psql -v ON_ERROR_STOP=1 <<SQL
INSERT INTO firms.financials (
  cui, year, caen,
  active_imobilizate, active_circulante, stocuri, creante, casa_banci,
  cheltuieli_avans, datorii, venituri_avans, provizioane,
  capitaluri_total, capital_subscris, patrimoniul_regiei,
  cifra_afaceri, venituri_total, cheltuieli_total,
  profit_brut, pierdere_bruta, profit_net, pierdere_neta,
  numar_salariati, source
)
SELECT DISTINCT ON (cui)
  cui, $year, caen,
  $value_cols,
  'mfinante:WEB_UU'
FROM firms.staging_financials
WHERE cui IS NOT NULL AND cui != '' AND cui != '0'
ORDER BY cui
ON CONFLICT (cui, year) DO UPDATE SET
  source = CASE
    WHEN firms.financials.source = 'mfinante:WEB_UU' THEN firms.financials.source
    ELSE EXCLUDED.source
  END,
  caen = CASE
    WHEN firms.financials.source = 'mfinante:WEB_UU' THEN firms.financials.caen
    ELSE EXCLUDED.caen
  END;
SQL
}
|
||||
|
||||
# ─── WEB_BL_BS_SL ────────────────────────────────────────────────────────
# Load one year of WEB_BL_BS_SL into firms.financials.
# Args:    $1 year
# Returns: non-zero when the file cannot be fetched or a psql step fails.
# 2016+ files (22 cols) go through firms.staging_financials; a failed
# TRUNCATE or COPY aborts before the UPSERT so stale staging rows are never
# loaded under this year. The 2015 file (23 cols) is loaded through a TEMP
# table inside a single psql session.
import_bl() {
  local year="$1"
  local file="$DATA_DIR/web_bl_bs_sl_${year}.txt"
  local slug="situatii_financiare_${year}"
  local pattern="^web_bl_bs_sl.*${year}\\.txt$"
  local url ncols src_table value_cols upsert_sql

  case "$year" in
    2015)
      # 2015 BL_BS_SL schema (23 cols, CUI,CAEN,I1..I21). The pre-2016 BL
      # reporting has an extra (unknown) field somewhere between
      # capital_subscris (I11) and cifra_afaceri. Empirically (cross-checked
      # CUI 538310 against 2016-2024 series): cifra_afaceri lives at I14
      # (not I13), salariati at I21. Treat I12,I13 as patrimoniul_regiei +
      # an unmapped field (likely related to regii autonome / provizioane
      # detail); both empty for typical SRLs. Map:
      #   2015 BL I1..I11  = modern I1..I11
      #   2015 BL I12      → patrimoniul_regiei (modern I12)
      #   2015 BL I13      → dropped (unknown)
      #   2015 BL I14      → cifra_afaceri (modern I13)
      #   2015 BL I15..I20 → modern I14..I19
      #   2015 BL I21      → numar_salariati (modern I20)
      ncols=23 # has extra I21
      src_table="tmp_bl23"
      value_cols="i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11,
  i12,
  i14, i15, i16, i17, i18, i19, i20,
  CASE WHEN i21 BETWEEN 0 AND 100000000 THEN i21::bigint ELSE NULL END"
      ;;
    *)
      ncols=22
      src_table="firms.staging_financials"
      value_cols="i1, i2, i3, i4, i5, i6, i7, i8, i9,
  i10, i11, i12, i13, i14, i15, i16, i17, i18, i19,
  CASE WHEN i20 BETWEEN 0 AND 100000000 THEN i20::bigint ELSE NULL END"
      ;;
  esac

  if [ ! -s "$file" ]; then
    url=$(discover "$slug" "$pattern")
    fetch "$file" "$url" || return 1
  fi
  log "[$year/WEB_BL_BS_SL] COPY $file ($(stat -c%s "$file") bytes, $ncols cols)..."

  # Same statement for both schemas; only the source table and the column
  # mapping differ. On conflict, rows that already came from WEB_UU keep
  # their source/caen.
  upsert_sql="INSERT INTO firms.financials (
  cui, year, caen,
  active_imobilizate, active_circulante, stocuri, creante, casa_banci,
  cheltuieli_avans, datorii, venituri_avans, provizioane,
  capitaluri_total, capital_subscris, patrimoniul_regiei,
  cifra_afaceri, venituri_total, cheltuieli_total,
  profit_brut, pierdere_bruta, profit_net, pierdere_neta,
  numar_salariati, source
)
SELECT DISTINCT ON (cui)
  cui, $year, caen,
  $value_cols,
  'mfinante:WEB_BL_BS_SL'
FROM $src_table
WHERE cui IS NOT NULL AND cui != '' AND cui != '0'
ORDER BY cui
ON CONFLICT (cui, year) DO UPDATE SET
  source = CASE
    WHEN firms.financials.source = 'mfinante:WEB_UU' THEN firms.financials.source
    ELSE EXCLUDED.source
  END,
  caen = CASE
    WHEN firms.financials.source = 'mfinante:WEB_UU' THEN firms.financials.caen
    ELSE EXCLUDED.caen
  END;"

  if [ "$ncols" -eq 22 ]; then
    psql -v ON_ERROR_STOP=1 -c "TRUNCATE TABLE firms.staging_financials;" || return 1
    psql -v ON_ERROR_STOP=1 <<COPYEOF || return 1
\\copy firms.staging_financials (cui, caen, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15, i16, i17, i18, i19, i20) FROM '$file' WITH (FORMAT csv, DELIMITER ',', HEADER true, NULL '');
COPYEOF
    log "[$year/WEB_BL_BS_SL] UPSERT..."
    psql -v ON_ERROR_STOP=1 <<SQL
$upsert_sql
SQL
  else
    psql -v ON_ERROR_STOP=1 <<COPYEOF
CREATE TEMP TABLE tmp_bl23 (
  cui text, caen text,
  i1 numeric(20,2), i2 numeric(20,2), i3 numeric(20,2), i4 numeric(20,2),
  i5 numeric(20,2), i6 numeric(20,2), i7 numeric(20,2), i8 numeric(20,2),
  i9 numeric(20,2), i10 numeric(20,2), i11 numeric(20,2), i12 numeric(20,2),
  i13 numeric(20,2), i14 numeric(20,2), i15 numeric(20,2), i16 numeric(20,2),
  i17 numeric(20,2), i18 numeric(20,2), i19 numeric(20,2), i20 numeric(20,2),
  i21 numeric(20,2)
); -- session-scoped; dropped when psql exits
\\copy tmp_bl23 FROM '$file' WITH (FORMAT csv, DELIMITER ',', HEADER true, NULL '');
$upsert_sql
COPYEOF
  fi
}
|
||||
|
||||
# ─── WEB_ONG (49 cols, schema consistent across 2015-2024) ───────────────
# Load one year of WEB_ONG into firms.financials_ong via firms.staging_ong.
# Args:    $1 year
# Returns: 0 on success or when the header has an unexpected column count
#          (logged and skipped); non-zero when the file cannot be fetched or
#          the TRUNCATE / preprocessing / COPY / UPSERT step fails. A failed
#          TRUNCATE or COPY aborts before the UPSERT so rows left in the
#          staging table by another year are never loaded under this one.
import_ong() {
  local year="$1"
  local file="$DATA_DIR/web_ong_${year}.txt"
  local slug="situatii_financiare_${year}"
  local url header_cols k
  local load_file="$file" cleaned=""
  local copy_cols="cui, caen, caeno" ind_pairs=""

  if [ ! -s "$file" ]; then
    url=$(discover "$slug" "^web_ong.*${year}\\.txt$")
    fetch "$file" "$url" || return 1
  fi

  # i1..i46: COPY column list and the key/value pairs of the JSONB object.
  for ((k = 1; k <= 46; k++)); do
    copy_cols+=", i${k}"
    ind_pairs+="'i${k}', NULLIF(i${k}, ''), "
  done
  ind_pairs="${ind_pairs%, }"

  header_cols=$(head -1 "$file" | tr ',' '\n' | wc -l)
  log "[$year/WEB_ONG] COPY $file ($(stat -c%s "$file") bytes, $header_cols cols)..."
  psql -v ON_ERROR_STOP=1 -c "TRUNCATE TABLE firms.staging_ong;" || return 1

  if [ "$header_cols" -eq 49 ]; then
    : # canonical layout, load the file as-is
  elif [ "$header_cols" -eq 51 ]; then
    # 2018 schema: ...,I44,DEN_CAENO,I45,DEN_CAEN,I46 (extra UNQUOTED text
    # columns whose contents contain commas — breaks naive CSV parsing).
    # Preprocess into a 49-col file by walking backwards from end to identify
    # the two text columns (variable comma count).
    cleaned="${file}.cleaned49"
    load_file="$cleaned"
    log "[$year/WEB_ONG] Preprocessing 51→49 cols (stripping DEN_CAEN/DEN_CAENO)..."
    python3 - "$file" "$cleaned" <<'PYEOF' || { log "[$year/WEB_ONG] preprocessing failed"; rm -f -- "$cleaned"; return 1; }
import re
import sys

src, dst = sys.argv[1], sys.argv[2]
NUM_RE = re.compile(r'^-?\d+(\.\d+)?$|^$')  # numeric or empty field


def prev_numeric(parts, start):
    """Index of the nearest numeric/empty field at or before `start` (< 0 if none)."""
    j = start
    while j >= 0 and not NUM_RE.match(parts[j]):
        j -= 1
    return j


skipped = 0
# latin-1 maps every byte to one character and back, so the kept columns are
# written byte-for-byte and odd bytes in the dropped text columns can never
# abort the run with a decode error.
with open(src, encoding='latin-1') as fh, open(dst, 'w', encoding='latin-1') as out:
    header = fh.readline().rstrip('\n').split(',')
    # Reduced header: drop DEN_CAEN / DEN_CAENO (positions 47 and 49, zero-indexed).
    keep = [i for i, h in enumerate(header) if h.upper() not in ('DEN_CAEN', 'DEN_CAENO')]
    out.write(','.join(header[i] for i in keep) + '\n')
    for line in fh:
        parts = line.rstrip('\n').split(',')
        n = len(parts)
        # Walk from the end: parts[n-1] = i46, then the DEN_CAEN text (one or
        # more comma-split pieces), then i45, then the DEN_CAENO text, then i44.
        i46_idx = n - 1
        i45_idx = prev_numeric(parts, n - 2)
        i44_idx = prev_numeric(parts, i45_idx - 1)
        if i44_idx < 0 or i45_idx < 0:
            skipped += 1  # malformed row
            continue
        new_parts = parts[:i44_idx + 1] + [parts[i45_idx]] + [parts[i46_idx]]
        if len(new_parts) != 49:
            skipped += 1  # does not fit the 49-col output
            continue
        out.write(','.join(new_parts) + '\n')

if skipped:
    print(f"[web_ong] skipped {skipped} row(s) that could not be reduced to 49 cols",
          file=sys.stderr)
PYEOF
    log "[$year/WEB_ONG] Cleaned $(wc -l < "$cleaned") lines (incl. header)"
  else
    log "[$year/WEB_ONG] unexpected col count $header_cols, skipping"
    return 0
  fi

  if ! psql -v ON_ERROR_STOP=1 <<COPYEOF
\\copy firms.staging_ong ($copy_cols) FROM '$load_file' WITH (FORMAT csv, DELIMITER ',', HEADER true, NULL '');
COPYEOF
  then
    [ -z "$cleaned" ] || rm -f -- "$cleaned"
    return 1
  fi
  [ -z "$cleaned" ] || rm -f -- "$cleaned"

  log "[$year/WEB_ONG] UPSERT..."
  psql -v ON_ERROR_STOP=1 <<SQL
INSERT INTO firms.financials_ong (
  cui, year, caen, caeno,
  capitaluri_proprii, venituri_total, cheltuieli_total, excedent,
  personal_neeconomic, personal_economic, indicators
)
SELECT DISTINCT ON (cui)
  cui, $year, caen, caeno,
  NULLIF(i12, '')::numeric(20,2),
  NULLIF(i38, '')::numeric(20,2),
  NULLIF(i40, '')::numeric(20,2),
  NULLIF(i42, '')::numeric(20,2),
  CASE WHEN NULLIF(i45, '') ~ '^[0-9]+\$' AND NULLIF(i45, '')::bigint BETWEEN 0 AND 100000000 THEN i45::bigint ELSE NULL END,
  CASE WHEN NULLIF(i46, '') ~ '^[0-9]+\$' AND NULLIF(i46, '')::bigint BETWEEN 0 AND 100000000 THEN i46::bigint ELSE NULL END,
  jsonb_strip_nulls(jsonb_build_object($ind_pairs))
FROM firms.staging_ong
WHERE cui IS NOT NULL AND cui != '' AND cui != '0'
ORDER BY cui
ON CONFLICT (cui, year) DO UPDATE SET
  caen = EXCLUDED.caen,
  caeno = EXCLUDED.caeno,
  capitaluri_proprii = EXCLUDED.capitaluri_proprii,
  venituri_total = EXCLUDED.venituri_total,
  cheltuieli_total = EXCLUDED.cheltuieli_total,
  excedent = EXCLUDED.excedent,
  personal_neeconomic = EXCLUDED.personal_neeconomic,
  personal_economic = EXCLUDED.personal_economic,
  indicators = EXCLUDED.indicators,
  fetched_at = now();
SQL
}
|
||||
|
||||
# ─── WEB_INST_DE_CREDIT (banks) — pre-IFRS schemas vary by year ─────────
# 2015: not published. 2016/2017/2019: 23 cols (I1..I21). 2018: not published.
# 2020/2021/2022: 23 cols (I21). 2023: 24 cols (I22). 2024: 25 cols (I23).
#
# Load one year of bank financials into firms.financials_banks.
# Args:    $1 year
# Returns: 0 on success, and also when the dataset has no bank file or the
#          header has an unexpected indicator count (both logged and
#          skipped); non-zero when the download or the psql session fails.
import_bank() {
  local year="$1"
  local file="$DATA_DIR/web_inst_de_credit_${year}.txt"
  local slug="situatii_financiare_${year}"
  local url header_cols
  local ind_n cifra_col profit_inainte_col profit_exerc_col capital_col activ_col
  local cols_def cols_list ind_pairs
  local k # loop index; kept local so the caller's variables are untouched

  # Two years are published under a differently named CKAN package.
  case "$year" in
    2020) slug="situatii_financiare_2021" ;;
    2023) slug="situatii_financiare2023" ;;
  esac

  if [ ! -s "$file" ]; then
    url=$(discover "$slug" "^web_(inst|instit)_de_credit.*${year}\\.txt$")
    if [ -z "$url" ]; then
      log "[$year/BANK] no file in dataset, skip"
      return 0
    fi
    fetch "$file" "$url" || return 1
  fi

  # Detect column count from header line.
  header_cols=$(head -1 "$file" | tr ',' '\n' | wc -l)
  log "[$year/BANK] $file ($(stat -c%s "$file") bytes, $header_cols cols)"

  # Build a TEMP table sized to the file, then map to firms.financials_banks.
  # The "cifra_afaceri" mapping: in IFRS 2024 schema (25 cols) it's i23. In
  # older 23-col schema it's i21. In 24-col schema (2023) it's i22.
  ind_n=$((header_cols - 2)) # i1..iN
  capital_col=i14
  activ_col=i6
  case "$ind_n" in
    21) cifra_col=i21; profit_inainte_col=i17; profit_exerc_col=i20 ;;
    22) cifra_col=i22; profit_inainte_col=i18; profit_exerc_col=i21 ;;
    23) cifra_col=i23; profit_inainte_col=i19; profit_exerc_col=i22 ;;
    *)
      log "[$year/BANK] unexpected indicator count $ind_n, skipping"
      return 0
      ;;
  esac

  # Build dynamic column list for TEMP table and \copy.
  cols_def="cui text, caen text"
  cols_list="cui, caen"
  ind_pairs=""
  for ((k = 1; k <= ind_n; k++)); do
    cols_def="$cols_def, i${k} text"
    cols_list="$cols_list, i${k}"
    ind_pairs="$ind_pairs 'i${k}', NULLIF(i${k}, ''),"
  done
  ind_pairs="${ind_pairs%,}"

  # Single psql session: the TEMP table only lives for this session, and
  # ON_ERROR_STOP aborts before the INSERT when the COPY fails.
  psql -v ON_ERROR_STOP=1 <<COPYEOF
CREATE TEMP TABLE tmp_bank (
  $cols_def
); -- session-scoped; dropped when psql exits
\\copy tmp_bank ($cols_list) FROM '$file' WITH (FORMAT csv, DELIMITER ',', HEADER true, NULL '');
INSERT INTO firms.financials_banks (
  cui, year, caen,
  active_financiare_amortiz, capital_social, profit_exercitiu,
  profit_inainte_impozit, cifra_afaceri, indicators, source
)
SELECT DISTINCT ON (cui)
  cui, $year, caen,
  NULLIF($activ_col, '')::numeric(20,2),
  NULLIF($capital_col, '')::numeric(20,2),
  NULLIF($profit_exerc_col, '')::numeric(20,2),
  NULLIF($profit_inainte_col, '')::numeric(20,2),
  NULLIF($cifra_col, '')::numeric(20,2),
  jsonb_strip_nulls(jsonb_build_object($ind_pairs)),
  'mfinante:WEB_Inst_de_credit'
FROM tmp_bank
WHERE cui IS NOT NULL AND cui != '' AND cui != '0'
ORDER BY cui
ON CONFLICT (cui, year) DO UPDATE SET
  caen = EXCLUDED.caen,
  active_financiare_amortiz = EXCLUDED.active_financiare_amortiz,
  capital_social = EXCLUDED.capital_social,
  profit_exercitiu = EXCLUDED.profit_exercitiu,
  profit_inainte_impozit = EXCLUDED.profit_inainte_impozit,
  cifra_afaceri = EXCLUDED.cifra_afaceri,
  indicators = EXCLUDED.indicators,
  source = EXCLUDED.source,
  fetched_at = now();
COPYEOF
}
|
||||
|
||||
# CATEGORIES env var filters which sub-imports run. Default = all.
# Useful: CATEGORIES="bank" to skip companies and only redo banks.
CATEGORIES="${CATEGORIES:-uu bl ong bank}"

# YEARS and CATEGORIES are space-separated lists, hence the unquoted
# expansions. A failing import is logged and the loop moves on.
for YEAR in $YEARS; do
  log "── Year $YEAR ──────────────────────────────"
  for CAT in $CATEGORIES; do
    case "$CAT" in
      uu) import_uu "$YEAR" || log "[$YEAR/WEB_UU] failed" ;;
      bl) import_bl "$YEAR" || log "[$YEAR/WEB_BL_BS_SL] failed" ;;
      ong) import_ong "$YEAR" || log "[$YEAR/WEB_ONG] failed" ;;
      bank) import_bank "$YEAR" || log "[$YEAR/BANK] failed" ;;
      # A typo in CATEGORIES used to be skipped without any trace.
      *) log "[$YEAR] unknown category '$CAT' (expected: uu bl ong bank), skipping" ;;
    esac
  done
done

log "=== Refreshing latest-year MV ==="
# Best-effort: a failed refresh must not stop the coverage report, but it is
# recorded in the log instead of being silently discarded.
psql -v ON_ERROR_STOP=1 -c "REFRESH MATERIALIZED VIEW firms.mv_financials_latest;" \
  || log "[WARN] REFRESH MATERIALIZED VIEW firms.mv_financials_latest failed"

log "=== Final coverage ==="
# Row counts per table and year, shown on stdout and appended to the log.
psql -c "
SELECT 'fin' AS tbl, year, COUNT(*) AS n FROM firms.financials GROUP BY year
UNION ALL
SELECT 'ong' AS tbl, year, COUNT(*) AS n FROM firms.financials_ong GROUP BY year
UNION ALL
SELECT 'bank' AS tbl, year, COUNT(*) AS n FROM firms.financials_banks GROUP BY year
ORDER BY tbl, year;
" 2>&1 | tee -a "$LOG"

log "=== Historical import done ==="
|
||||
+194
@@ -0,0 +1,194 @@
|
||||
#!/bin/bash
|
||||
# Imports MFP non-WEB_UU/BL_BS_SL financial categories into separate tables.
|
||||
# Currently handles WEB_ONG (46 indicators, NGO-specific) and WEB_Inst_de_credit
|
||||
# (23 IFRS indicators for banks). Other small categories (IFN, ASIG, BROK, SIF,
|
||||
# PENSII, VS, VM, IP_IEME, IR, FOND_GARANTARE) can follow the same pattern with
|
||||
# their own tables; for now we treat them as future work since each is <1MB
|
||||
# and < a few hundred records.
|
||||
#
|
||||
# Discovers download URLs via data.gov.ro CKAN API per data year.
|
||||
#
|
||||
# Idempotent. ON CONFLICT (cui, year) DO UPDATE so re-runs refresh latest values.
|
||||
|
||||
# No `-e` on purpose: the per-year loops below skip a broken year and carry
# on. The setup steps in this section are therefore checked explicitly.
set -uo pipefail

DATA_DIR=/opt/vreaudigital/data/mfinante
LOG=/var/log/vreaudigital-fin-import.log

# log MESSAGE: print a timestamped line to stdout and append it to $LOG.
log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG"; }

mkdir -p "$DATA_DIR"

# ── DB env (unchanged from import-financials.sh pattern) ──
# Fetch DATABASE_URL from Infisical and hand the connection to psql through
# libpq PG* environment variables rather than on the command line.
source /opt/vreaudigital/.infisical-mi
TOKEN=$(infisical login --method=universal-auth --domain="$INFISICAL_API_URL" \
  --client-id="$INFISICAL_CLIENT_ID" --client-secret="$INFISICAL_CLIENT_SECRET" \
  --silent --plain)
if [ -z "$TOKEN" ]; then
  log "FATAL: Infisical login failed (empty token)"
  exit 1
fi
DBURL=$(infisical run --domain="$INFISICAL_API_URL" \
  --projectId="$INFISICAL_PROJECT_ID" --env="$INFISICAL_ENV" \
  --path="$INFISICAL_PATH" --silent --token="$TOKEN" \
  -- sh -c 'echo "$DATABASE_URL"')
# Without this guard an empty URL yields empty PG* variables and every psql
# call below would run against whatever libpq picks by default.
if [ -z "$DBURL" ]; then
  log "FATAL: DATABASE_URL could not be read from Infisical"
  exit 1
fi
# Drop the `schema=` query parameter before splitting the URL into parts.
DB=$(echo "$DBURL" | sed -E 's/[?&]schema=[^&]*//; s/\?$//')
export PGUSER=$(echo "$DB" | sed -E 's|^postgresql://([^:]+):.*|\1|')
export PGPASSWORD=$(echo "$DB" | sed -E 's|^postgresql://[^:]+:([^@]+)@.*|\1|')
export PGHOST=$(echo "$DB" | sed -E 's|^postgresql://[^@]+@([^:/]+).*|\1|')
export PGPORT=$(echo "$DB" | sed -E 's|^postgresql://[^@]+@[^:]+:([0-9]+)/.*|\1|')
export PGDATABASE=$(echo "$DB" | sed -E 's|^postgresql://[^@]+@[^/]+/([^?]+).*|\1|')
unset DBURL TOKEN DB

log "=== ONG + Banks import started ==="

# Apply schema if not present. Nothing below can work without these tables,
# so stop here instead of failing once per year.
if ! psql -v ON_ERROR_STOP=1 -f /opt/vreaudigital/services/seap-scraper/sql/016_firms_financials_categories.sql >/dev/null; then
  log "FATAL: applying 016_firms_financials_categories.sql failed"
  exit 1
fi
|
||||
|
||||
# Helper: discover CSV URL via CKAN. Slug per data year, file pattern per category.
#
# Arguments:
#   $1 - data year (2015..2024)
#   $2 - regex prefix of the resource name, e.g. "web_ong_an" or
#        "web_(inst|instit)_de_credit_(an)?"; the year and ".txt" are appended
#        and the match is case-insensitive.
# Outputs:
#   URL of the first matching resource on stdout; nothing when no resource
#   matches or the CKAN reply cannot be parsed.
# Returns:
#   1 for a year with no known dataset slug; otherwise the status of the
#   curl | python3 pipeline.
discover_url() {
  local year="$1"
  local pattern="$2"
  local slug
  case "$year" in
    2015|2016|2017|2018|2019|2021|2022|2024) slug="situatii_financiare_${year}" ;;
    2020) slug="situatii_financiare_2021" ;;  # 2020 data lives in 2021 megadump
    2023) slug="situatii_financiare2023" ;;   # this slug has no underscore
    *) echo ""; return 1 ;;
  esac
  # year/pattern are passed as argv instead of being spliced into the Python
  # source, so quotes or backslashes in the pattern cannot break the program.
  curl -fsSL --max-time 30 "https://data.gov.ro/api/3/action/package_show?id=$slug" 2>/dev/null \
    | python3 -c '
import json, re, sys

year, pattern = sys.argv[1], sys.argv[2]
try:
    d = json.load(sys.stdin)
except ValueError:
    # Empty or non-JSON reply (e.g. curl failed): print nothing, no traceback.
    sys.exit(1)
pat = re.compile(pattern + year + r"\.txt$", re.I)
for r in (d.get("result") or {}).get("resources") or []:
    # "name" may be JSON null; treat it as an empty string.
    if pat.search(r.get("name") or ""):
        print(r.get("url") or "")
        break
' "$year" "$pattern"
}
|
||||
|
||||
# ─── ONG ──────────────────────────────────────────────────────────────────
# For each data year: make sure the WEB_ONG CSV is on disk (downloading it via
# discover_url when missing), load it into firms.staging_ong, then upsert into
# firms.financials_ong keyed on (cui, year).
#
# Every step is checked explicitly; a year whose download or staging load
# fails is skipped, so the upsert never runs against rows left in staging by
# a previous year.
for YEAR in ${YEARS:-2020 2021 2022 2023 2024}; do
  FILE="$DATA_DIR/web_ong_${YEAR}.txt"
  if [ ! -s "$FILE" ]; then
    URL=$(discover_url "$YEAR" "web_ong_an")
    if [ -z "$URL" ]; then log "[$YEAR/ONG] URL not found, skipping"; continue; fi
    log "[$YEAR/ONG] Downloading from $URL ..."
    # Download to a side file and rename only on success: a transfer cut off
    # mid-way must not leave a non-empty $FILE that the next run would treat
    # as a complete download.
    if ! curl -fsL --max-time 120 -o "$FILE.part" "$URL" || [ ! -s "$FILE.part" ]; then
      rm -f -- "$FILE.part"
      log "[$YEAR/ONG] download failed, skipping"
      continue
    fi
    mv -f -- "$FILE.part" "$FILE"
  fi
  log "[$YEAR/ONG] COPY $FILE ($(stat -c%s "$FILE") bytes)..."
  if ! psql -v ON_ERROR_STOP=1 -c "TRUNCATE TABLE firms.staging_ong;"; then
    log "[$YEAR/ONG] TRUNCATE of firms.staging_ong failed, skipping"
    continue
  fi
  if ! psql -v ON_ERROR_STOP=1 <<COPYEOF
\\copy firms.staging_ong (cui, caen, caeno, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15, i16, i17, i18, i19, i20, i21, i22, i23, i24, i25, i26, i27, i28, i29, i30, i31, i32, i33, i34, i35, i36, i37, i38, i39, i40, i41, i42, i43, i44, i45, i46) FROM '$FILE' WITH (FORMAT csv, DELIMITER ',', HEADER true, NULL '');
COPYEOF
  then
    log "[$YEAR/ONG] COPY into firms.staging_ong failed, skipping"
    continue
  fi

  log "[$YEAR/ONG] UPSERT into firms.financials_ong..."
  # Staging columns are text: '' becomes NULL before the numeric casts, and
  # the two head-count columns (i45, i46) are kept only when they are plain
  # digits within 0..100000000. All 46 raw indicators are also stored as JSON.
  if ! psql -v ON_ERROR_STOP=1 <<SQL
INSERT INTO firms.financials_ong (
  cui, year, caen, caeno,
  capitaluri_proprii, venituri_total, cheltuieli_total, excedent,
  personal_neeconomic, personal_economic, indicators
)
SELECT DISTINCT ON (cui)
  cui, $YEAR, caen, caeno,
  NULLIF(i12, '')::numeric(20,2),
  NULLIF(i38, '')::numeric(20,2),
  NULLIF(i40, '')::numeric(20,2),
  NULLIF(i42, '')::numeric(20,2),
  CASE WHEN NULLIF(i45, '') ~ '^[0-9]+\$' AND NULLIF(i45, '')::bigint BETWEEN 0 AND 100000000 THEN i45::bigint ELSE NULL END,
  CASE WHEN NULLIF(i46, '') ~ '^[0-9]+\$' AND NULLIF(i46, '')::bigint BETWEEN 0 AND 100000000 THEN i46::bigint ELSE NULL END,
  jsonb_strip_nulls(jsonb_build_object(
    'i1', NULLIF(i1, ''), 'i2', NULLIF(i2, ''), 'i3', NULLIF(i3, ''), 'i4', NULLIF(i4, ''),
    'i5', NULLIF(i5, ''), 'i6', NULLIF(i6, ''), 'i7', NULLIF(i7, ''), 'i8', NULLIF(i8, ''),
    'i9', NULLIF(i9, ''), 'i10', NULLIF(i10, ''), 'i11', NULLIF(i11, ''), 'i12', NULLIF(i12, ''),
    'i13', NULLIF(i13, ''), 'i14', NULLIF(i14, ''), 'i15', NULLIF(i15, ''), 'i16', NULLIF(i16, ''),
    'i17', NULLIF(i17, ''), 'i18', NULLIF(i18, ''), 'i19', NULLIF(i19, ''), 'i20', NULLIF(i20, ''),
    'i21', NULLIF(i21, ''), 'i22', NULLIF(i22, ''), 'i23', NULLIF(i23, ''), 'i24', NULLIF(i24, ''),
    'i25', NULLIF(i25, ''), 'i26', NULLIF(i26, ''), 'i27', NULLIF(i27, ''), 'i28', NULLIF(i28, ''),
    'i29', NULLIF(i29, ''), 'i30', NULLIF(i30, ''), 'i31', NULLIF(i31, ''), 'i32', NULLIF(i32, ''),
    'i33', NULLIF(i33, ''), 'i34', NULLIF(i34, ''), 'i35', NULLIF(i35, ''), 'i36', NULLIF(i36, ''),
    'i37', NULLIF(i37, ''), 'i38', NULLIF(i38, ''), 'i39', NULLIF(i39, ''), 'i40', NULLIF(i40, ''),
    'i41', NULLIF(i41, ''), 'i42', NULLIF(i42, ''), 'i43', NULLIF(i43, ''), 'i44', NULLIF(i44, ''),
    'i45', NULLIF(i45, ''), 'i46', NULLIF(i46, '')
  ))
FROM firms.staging_ong
WHERE cui IS NOT NULL AND cui != '' AND cui != '0'
ORDER BY cui
ON CONFLICT (cui, year) DO UPDATE SET
  caen = EXCLUDED.caen,
  caeno = EXCLUDED.caeno,
  capitaluri_proprii = EXCLUDED.capitaluri_proprii,
  venituri_total = EXCLUDED.venituri_total,
  cheltuieli_total = EXCLUDED.cheltuieli_total,
  excedent = EXCLUDED.excedent,
  personal_neeconomic = EXCLUDED.personal_neeconomic,
  personal_economic = EXCLUDED.personal_economic,
  indicators = EXCLUDED.indicators,
  fetched_at = now();
SQL
  then
    log "[$YEAR/ONG] UPSERT into firms.financials_ong failed"
  fi
done
|
||||
|
||||
# ─── BĂNCI / Instituții de Credit ─────────────────────────────────────────
# Same flow as the ONG import, for the credit-institution files: ensure the
# CSV is on disk, load it into firms.staging_banks, upsert into
# firms.financials_banks keyed on (cui, year).
#
# Every step is checked explicitly; a year whose download or staging load
# fails is skipped, so the upsert never runs against rows left in staging by
# a previous year.
for YEAR in ${YEARS:-2020 2021 2022 2023 2024}; do
  FILE="$DATA_DIR/web_inst_de_credit_${YEAR}.txt"
  if [ ! -s "$FILE" ]; then
    # Filename differs per year — sometimes web_instit_de_credit_an, sometimes web_inst_de_credit_
    URL=$(discover_url "$YEAR" "web_(inst|instit)_de_credit_(an)?")
    if [ -z "$URL" ]; then log "[$YEAR/BANK] URL not found, skipping"; continue; fi
    log "[$YEAR/BANK] Downloading from $URL ..."
    # Download to a side file and rename only on success: a transfer cut off
    # mid-way must not leave a non-empty $FILE that the next run would treat
    # as a complete download.
    if ! curl -fsL --max-time 60 -o "$FILE.part" "$URL" || [ ! -s "$FILE.part" ]; then
      rm -f -- "$FILE.part"
      log "[$YEAR/BANK] download failed, skipping"
      continue
    fi
    mv -f -- "$FILE.part" "$FILE"
  fi
  log "[$YEAR/BANK] COPY $FILE ($(stat -c%s "$FILE") bytes)..."
  if ! psql -v ON_ERROR_STOP=1 -c "TRUNCATE TABLE firms.staging_banks;"; then
    log "[$YEAR/BANK] TRUNCATE of firms.staging_banks failed, skipping"
    continue
  fi
  if ! psql -v ON_ERROR_STOP=1 <<COPYEOF
\\copy firms.staging_banks (cui, caen, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15, i16, i17, i18, i19, i20, i21, i22, i23) FROM '$FILE' WITH (FORMAT csv, DELIMITER ',', HEADER true, NULL '');
COPYEOF
  then
    log "[$YEAR/BANK] COPY into firms.staging_banks failed, skipping"
    continue
  fi

  log "[$YEAR/BANK] UPSERT into firms.financials_banks..."
  # Staging columns are text: '' becomes NULL before the numeric casts. Five
  # indicators are promoted to typed columns; all 23 are also stored as JSON.
  if ! psql -v ON_ERROR_STOP=1 <<SQL
INSERT INTO firms.financials_banks (
  cui, year, caen,
  active_financiare_amortiz, capital_social, profit_exercitiu,
  profit_inainte_impozit, cifra_afaceri, indicators
)
SELECT DISTINCT ON (cui)
  cui, $YEAR, caen,
  NULLIF(i6, '')::numeric(20,2),
  NULLIF(i14, '')::numeric(20,2),
  NULLIF(i22, '')::numeric(20,2),
  NULLIF(i19, '')::numeric(20,2),
  NULLIF(i23, '')::numeric(20,2),
  jsonb_strip_nulls(jsonb_build_object(
    'i1', NULLIF(i1, ''), 'i2', NULLIF(i2, ''), 'i3', NULLIF(i3, ''), 'i4', NULLIF(i4, ''),
    'i5', NULLIF(i5, ''), 'i6', NULLIF(i6, ''), 'i7', NULLIF(i7, ''), 'i8', NULLIF(i8, ''),
    'i9', NULLIF(i9, ''), 'i10', NULLIF(i10, ''), 'i11', NULLIF(i11, ''), 'i12', NULLIF(i12, ''),
    'i13', NULLIF(i13, ''), 'i14', NULLIF(i14, ''), 'i15', NULLIF(i15, ''), 'i16', NULLIF(i16, ''),
    'i17', NULLIF(i17, ''), 'i18', NULLIF(i18, ''), 'i19', NULLIF(i19, ''), 'i20', NULLIF(i20, ''),
    'i21', NULLIF(i21, ''), 'i22', NULLIF(i22, ''), 'i23', NULLIF(i23, '')
  ))
FROM firms.staging_banks
WHERE cui IS NOT NULL AND cui != '' AND cui != '0'
ORDER BY cui
ON CONFLICT (cui, year) DO UPDATE SET
  caen = EXCLUDED.caen,
  active_financiare_amortiz = EXCLUDED.active_financiare_amortiz,
  capital_social = EXCLUDED.capital_social,
  profit_exercitiu = EXCLUDED.profit_exercitiu,
  profit_inainte_impozit = EXCLUDED.profit_inainte_impozit,
  cifra_afaceri = EXCLUDED.cifra_afaceri,
  indicators = EXCLUDED.indicators,
  fetched_at = now();
SQL
  then
    log "[$YEAR/BANK] UPSERT into firms.financials_banks failed"
  fi
done
|
||||
|
||||
log "=== ONG + Banks final stats ==="
# One "label:year|count" line per year for each category table, echoed to the
# terminal and appended to $LOG. Each entry is "<label>:<table suffix>".
for stats_pair in ong:ong bank:banks; do
  printf -v stats_sql "\nSELECT '%s:' || year, COUNT(*) FROM firms.financials_%s GROUP BY year ORDER BY year;" \
    "${stats_pair%%:*}" "${stats_pair#*:}"
  psql -At -F"|" -c "$stats_sql" 2>&1 | tee -a "$LOG"
done

log "=== ONG + Banks import done ==="
|
||||
+108
@@ -0,0 +1,108 @@
|
||||
#!/bin/bash
|
||||
# Import financial indicators (Situații financiare) from data.gov.ro per year.
|
||||
# Runs COPY from web_uu_YYYY.txt → staging_financials → firms.financials (PK cui+year).
|
||||
|
||||
set -euo pipefail

DATA_DIR=/opt/vreaudigital/data/mfinante
LOG=/var/log/vreaudigital-fin-import.log

# log MESSAGE: print a timestamped line to stdout and append it to $LOG.
log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG"; }

# Fetch DATABASE_URL from Infisical and hand the connection to psql through
# libpq PG* environment variables rather than on the command line.
source /opt/vreaudigital/.infisical-mi
TOKEN=$(infisical login --method=universal-auth --domain="$INFISICAL_API_URL" \
  --client-id="$INFISICAL_CLIENT_ID" --client-secret="$INFISICAL_CLIENT_SECRET" \
  --silent --plain)
DATABASE_URL=$(infisical run --domain="$INFISICAL_API_URL" \
  --projectId="$INFISICAL_PROJECT_ID" --env="$INFISICAL_ENV" \
  --path="$INFISICAL_PATH" --silent --token="$TOKEN" \
  -- sh -c 'echo "$DATABASE_URL"')
# Drop the `schema=` query parameter before splitting the URL into parts.
DB=$(echo "$DATABASE_URL" | sed -E 's/[?&]schema=[^&]*//; s/\?$//')
export PGUSER=$(echo "$DB" | sed -E 's|^postgresql://([^:]+):.*|\1|')
export PGPASSWORD=$(echo "$DB" | sed -E 's|^postgresql://[^:]+:([^@]+)@.*|\1|')
export PGHOST=$(echo "$DB" | sed -E 's|^postgresql://[^@]+@([^:/]+).*|\1|')
# The port was previously never exported, so psql silently used its default
# port whatever DATABASE_URL said. `sed -n ... p` prints nothing when the URL
# carries no port; PGPORT is then left untouched.
DB_PORT=$(echo "$DB" | sed -nE 's|^postgresql://[^@]+@[^:/]+:([0-9]+)/.*|\1|p')
if [ -n "$DB_PORT" ]; then
  export PGPORT="$DB_PORT"
fi
export PGDATABASE=$(echo "$DB" | sed -E 's|^postgresql://[^@]+@[^/]+/([^?]+).*|\1|')
unset DATABASE_URL TOKEN DB DB_PORT

log "=== Financial import started ==="
|
||||
|
||||
# WEB_UU and WEB_BL_BS_SL share the same 22-column schema (CUI, CAEN, I1..I20)
# so we can use the same staging table + INSERT for both. The `source` column
# tracks which raw category the row came from. WEB_BL_BS_SL covers special-
# regime entities (bilanț scurt, lichidare) that aren't in WEB_UU — e.g.
# Alliance Healthcare, in-liquidation companies. Together they fill most of
# the financial-data gap.
#
# import_year_category YEAR CATEGORY FILE
#   YEAR     - data year stored in firms.financials.year
#   CATEGORY - WEB_UU | WEB_BL_BS_SL; stored as source "mfinante:<CATEGORY>"
#   FILE     - path of the CSV to load
# Returns 0 (after logging a SKIP line) when FILE is missing or empty;
# otherwise truncates firms.staging_financials, loads FILE into it and
# upserts into firms.financials on (cui, year).
import_year_category() {
  local YEAR="$1"
  local CATEGORY="$2" # WEB_UU | WEB_BL_BS_SL
  local FILE="$3"
  local SRC_LABEL="mfinante:${CATEGORY}"

  if [ ! -s "$FILE" ]; then
    log "[$YEAR/$CATEGORY] [SKIP] $FILE missing"
    return 0
  fi
  log "[$YEAR/$CATEGORY] Truncating staging..."
  psql -v ON_ERROR_STOP=1 -c "TRUNCATE TABLE firms.staging_financials;"

  log "[$YEAR/$CATEGORY] COPY $FILE..."
  psql -v ON_ERROR_STOP=1 <<COPYEOF
\\copy firms.staging_financials (cui, caen, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15, i16, i17, i18, i19, i20) FROM '$FILE' WITH (FORMAT csv, DELIMITER ',', HEADER true, NULL '');
COPYEOF

  log "[$YEAR/$CATEGORY] UPSERT into financials (source=$SRC_LABEL)..."
  psql -v ON_ERROR_STOP=1 <<SQL
INSERT INTO firms.financials (
  cui, year, caen,
  active_imobilizate, active_circulante, stocuri, creante, casa_banci,
  cheltuieli_avans, datorii, venituri_avans, provizioane,
  capitaluri_total, capital_subscris, patrimoniul_regiei,
  cifra_afaceri, venituri_total, cheltuieli_total,
  profit_brut, pierdere_bruta, profit_net, pierdere_neta,
  numar_salariati, source
)
SELECT DISTINCT ON (cui)
  cui, $YEAR, caen,
  i1, i2, i3, i4, i5,
  i6, i7, i8, i9,
  i10, i11, i12,
  i13, i14, i15,
  i16, i17, i18, i19,
  -- Sanitize salariati: drop absurd values (data anomalies up to 7.7e14 observed)
  CASE WHEN i20 BETWEEN 0 AND 100000000 THEN i20::bigint ELSE NULL END,
  '$SRC_LABEL'
FROM firms.staging_financials
WHERE cui IS NOT NULL AND cui != '' AND cui != '0'
ORDER BY cui
ON CONFLICT (cui, year) DO UPDATE SET
  -- Refresh the whole row, not just source/caen: previously a WEB_BL_BS_SL
  -- row hit by a later WEB_UU import was relabelled WEB_UU while keeping its
  -- old figures, and re-runs never refreshed any value.
  caen = EXCLUDED.caen,
  active_imobilizate = EXCLUDED.active_imobilizate,
  active_circulante = EXCLUDED.active_circulante,
  stocuri = EXCLUDED.stocuri,
  creante = EXCLUDED.creante,
  casa_banci = EXCLUDED.casa_banci,
  cheltuieli_avans = EXCLUDED.cheltuieli_avans,
  datorii = EXCLUDED.datorii,
  venituri_avans = EXCLUDED.venituri_avans,
  provizioane = EXCLUDED.provizioane,
  capitaluri_total = EXCLUDED.capitaluri_total,
  capital_subscris = EXCLUDED.capital_subscris,
  patrimoniul_regiei = EXCLUDED.patrimoniul_regiei,
  cifra_afaceri = EXCLUDED.cifra_afaceri,
  venituri_total = EXCLUDED.venituri_total,
  cheltuieli_total = EXCLUDED.cheltuieli_total,
  profit_brut = EXCLUDED.profit_brut,
  pierdere_bruta = EXCLUDED.pierdere_bruta,
  profit_net = EXCLUDED.profit_net,
  pierdere_neta = EXCLUDED.pierdere_neta,
  numar_salariati = EXCLUDED.numar_salariati,
  source = EXCLUDED.source
-- For (cui, year) duplicates across categories, prefer WEB_UU (more complete
-- schema for normal companies). Don't overwrite a WEB_UU row with a BL_BS_SL
-- row: an existing WEB_UU row is only replaced by another WEB_UU row.
WHERE firms.financials.source IS DISTINCT FROM 'mfinante:WEB_UU'
   OR EXCLUDED.source = 'mfinante:WEB_UU';
SQL
}
|
||||
|
||||
# YEARS may be preset in the environment (the historical backfill wrapper,
# import-financials-historical.sh, does so); when unset or empty the default
# list below applies, so the cron job behaves as before.
: "${YEARS:=2020 2021 2022 2023 2024}"

# Per year, load WEB_UU first and WEB_BL_BS_SL second. The input file name is
# the lower-cased category followed by the year.
for YEAR in $YEARS; do
  for fin_category in WEB_UU WEB_BL_BS_SL; do
    import_year_category "$YEAR" "$fin_category" "$DATA_DIR/${fin_category,,}_${YEAR}.txt"
  done
done

log "=== Refreshing latest-year MV ==="
psql -v ON_ERROR_STOP=1 -c "REFRESH MATERIALIZED VIEW firms.mv_financials_latest;"

log "=== Final stats ==="
# Per-year summary of firms.financials, echoed and appended to $LOG.
fin_stats_sql="
SELECT year, COUNT(*) AS firms_with_data,
ROUND(AVG(NULLIF(cifra_afaceri, 0))::numeric, 0) AS avg_ca,
COUNT(*) FILTER (WHERE cifra_afaceri > 0) AS cu_ca,
COUNT(*) FILTER (WHERE numar_salariati > 0) AS cu_salariati
FROM firms.financials
GROUP BY year ORDER BY year;
"
psql -c "$fin_stats_sql" 2>&1 | tee -a "$LOG"

log "=== Import done ==="
|
||||
+85
@@ -0,0 +1,85 @@
|
||||
#!/bin/bash
|
||||
# Discovers the latest ONRC bulk dataset on data.gov.ro, downloads any newer
|
||||
# CSVs, and runs import-onrc.sh — but only if the dataset is fresher than
|
||||
# what's already on disk. Idempotent: re-running on the same day is a no-op.
|
||||
#
|
||||
# Dataset on data.gov.ro is published ~monthly with slug pattern
|
||||
# `firme-DD-MM-YYYY`. Resource UUIDs change each release, so we can't
|
||||
# hardcode URLs — query CKAN to discover the current ones.
|
||||
|
||||
set -euo pipefail

DATA_DIR=/opt/vreaudigital/data/onrc
LOG=/var/log/vreaudigital-onrc-import.log
STAMP_FILE="$DATA_DIR/.dataset-name"
# Directory holding this script (symlinks resolved); sibling scripts are
# invoked from it further down.
script_real_path="$(readlink -f "$0")"
SCRIPT_DIR="$(dirname "$script_real_path")"

# log MESSAGE: print a timestamped line to stdout and append it to $LOG.
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1" | tee -a "$LOG"
}

mkdir -p "$DATA_DIR"

log "=== ONRC fresh-check started ==="

# Ask CKAN for the ten most recently modified packages matching "firme" and
# keep the first whose name has the exact firme-DD-MM-YYYY shape.
ckan_search_url="https://data.gov.ro/api/3/action/package_search?q=firme&sort=metadata_modified+desc&rows=10"
latest_name_filter='[.result.results[] | select(.name | test("^firme-[0-9]{2}-[0-9]{2}-[0-9]{4}$"))][0].name // empty'
LATEST_NAME=$(curl -fsS --max-time 30 "$ckan_search_url" | jq -r "$latest_name_filter")

if [[ -z "$LATEST_NAME" ]]; then
  log "ERROR: could not find a firme-DD-MM-YYYY dataset on data.gov.ro"
  exit 1
fi
log "Latest dataset on data.gov.ro: $LATEST_NAME"

# The stamp file holds the name of the last snapshot imported successfully;
# when it matches there is nothing to do.
if [[ -f "$STAMP_FILE" && "$(<"$STAMP_FILE")" == "$LATEST_NAME" ]]; then
  log "Already imported $LATEST_NAME — nothing to do."
  exit 0
fi
|
||||
|
||||
# Fetch resource URLs for the dataset. We need 4 of them (the rest are unused).
log "Fetching resource URLs for $LATEST_NAME..."
RESOURCES_JSON=$(curl -fsS --max-time 30 \
  "https://data.gov.ro/api/3/action/package_show?id=$LATEST_NAME")

# Required file name -> download URL (filled in below).
declare -A NEEDED=(
  [od_firme.csv]=""
  [od_caen_autorizat.csv]=""
  [od_stare_firma.csv]=""
  [od_reprezentanti_legali.csv]=""
)

# Match resources by the lower-cased last path component of their URL.
# Resources without a URL are dropped by jq, and an empty file name is
# skipped: an empty string is not a valid associative-array subscript.
while IFS= read -r url; do
  fname=${url##*/}
  fname=${fname%%[?#]*}   # ignore any query string / fragment
  fname=${fname,,}
  [ -n "$fname" ] || continue
  if [ -n "${NEEDED[$fname]+x}" ]; then
    NEEDED[$fname]="$url"
  fi
done < <(jq -r '.result.resources[] | .url // empty' <<<"$RESOURCES_JSON")

for f in "${!NEEDED[@]}"; do
  if [ -z "${NEEDED[$f]}" ]; then
    log "ERROR: resource $f not found in dataset $LATEST_NAME"
    exit 1
  fi
done

# Download each CSV unconditionally (the stamp-file check above is what
# prevents re-downloading an already imported snapshot). Each file goes to a
# .tmp side file and replaces the previous copy only after a complete,
# non-empty transfer.
for f in od_firme.csv od_caen_autorizat.csv od_stare_firma.csv od_reprezentanti_legali.csv; do
  url="${NEEDED[$f]}"
  log "Downloading $f..."
  # pipefail makes the pipeline status reflect a curl failure.
  if ! curl -fL --max-time 600 -o "$DATA_DIR/$f.tmp" "$url" 2>&1 | tail -3 | tee -a "$LOG"; then
    rm -f -- "$DATA_DIR/$f.tmp"
    log "ERROR: download of $f failed"
    exit 1
  fi
  if [ ! -s "$DATA_DIR/$f.tmp" ]; then
    rm -f -- "$DATA_DIR/$f.tmp"
    log "ERROR: download of $f produced an empty file"
    exit 1
  fi
  mv -f -- "$DATA_DIR/$f.tmp" "$DATA_DIR/$f"
done

log "Running import-onrc.sh..."
"$SCRIPT_DIR/import-onrc.sh"

# ONRC import inserts new firms without lat/lng. Run the full geocoding
# fallback chain (geonames_postal → uat_centroid → photon → judet_centroid)
# so /harta + UI map clustering have coordinates for every fresh-import row.
log "Running geocode-firms.sh fallback chain..."
"$SCRIPT_DIR/geocode-firms.sh" || log "WARN: geocode-firms.sh exited non-zero; continuing"

# Record the snapshot we just successfully imported.
echo "$LATEST_NAME" > "$STAMP_FILE"
log "=== ONRC fresh-import done (snapshot=$LATEST_NAME) ==="
|
||||
Executable
+272
@@ -0,0 +1,272 @@
|
||||
#!/bin/bash
|
||||
# Import ONRC bulk CSV files into firms.entities.
|
||||
# Source: data.gov.ro (CC-BY 4.0), updated weekly.
|
||||
#
|
||||
# Pipeline:
|
||||
# 1. TRUNCATE staging tables
|
||||
# 2. COPY each CSV ($DATA_DIR/*.csv, i.e. /opt/vreaudigital/data/onrc) into the corresponding staging table
|
||||
# 3. UPSERT into firms.entities, joining on cod_inmatriculare
|
||||
# 4. Resolve siruta UAT for each firm via county+localitate fuzzy match
|
||||
#
|
||||
# Idempotent. Run nightly via cron.
|
||||
|
||||
set -euo pipefail

DATA_DIR=/opt/vreaudigital/data/onrc
LOG=/var/log/vreaudigital-onrc-import.log

# log MESSAGE: print a timestamped line to stdout and append it to $LOG.
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1" | tee -a "$LOG"
}

log "=== ONRC import started ==="

# ── Resolve DATABASE_URL via Infisical Machine Identity ──
source /opt/vreaudigital/.infisical-mi
TOKEN=$(infisical login --method=universal-auth \
  --domain="$INFISICAL_API_URL" \
  --client-id="$INFISICAL_CLIENT_ID" \
  --client-secret="$INFISICAL_CLIENT_SECRET" \
  --silent --plain)
DATABASE_URL=$(infisical run --domain="$INFISICAL_API_URL" \
  --projectId="$INFISICAL_PROJECT_ID" \
  --env="$INFISICAL_ENV" --path="$INFISICAL_PATH" \
  --silent --token="$TOKEN" \
  -- sh -c 'echo "$DATABASE_URL"')
# Drop the `schema=` query parameter before splitting the URL into parts.
DB=$(sed -E 's/[?&]schema=[^&]*//; s/\?$//' <<<"$DATABASE_URL")

# The connection is handed to psql through libpq PG* environment variables,
# so the credentials never show up on a command line (`ps aux`).
# URL shape: postgresql://USER:PASS@HOST:PORT/DBNAME
# db_url_field SED_EXPR: apply one sed -E expression to $DB.
db_url_field() { sed -E "$1" <<<"$DB"; }
DB_USER=$(db_url_field 's|^postgresql://([^:]+):.*|\1|')
DB_PASS=$(db_url_field 's|^postgresql://[^:]+:([^@]+)@.*|\1|')
DB_HOST=$(db_url_field 's|^postgresql://[^@]+@([^:/]+).*|\1|')
DB_PORT=$(db_url_field 's|^postgresql://[^@]+@[^:]+:([0-9]+)/.*|\1|')
DB_NAME=$(db_url_field 's|^postgresql://[^@]+@[^/]+/([^?]+).*|\1|')
export PGUSER="$DB_USER"
export PGPASSWORD="$DB_PASS"
export PGHOST="$DB_HOST"
export PGPORT="$DB_PORT"
export PGDATABASE="$DB_NAME"
unset DATABASE_URL TOKEN DB DB_USER DB_PASS DB_HOST DB_PORT DB_NAME
unset -f db_url_field

# ── Sanity check files ──
# All four mandatory CSVs must exist and be non-empty before anything is
# truncated.
for f in od_firme.csv od_caen_autorizat.csv od_stare_firma.csv od_reprezentanti_legali.csv; do
  [ -s "$DATA_DIR/$f" ] || { log "FATAL: $DATA_DIR/$f missing or empty"; exit 1; }
done

# Name (first 40 bytes) of the directory that holds the symlink-resolved
# od_firme.csv; stored with every imported row as source_onrc_dataset.
DATASET_NAME=$(
  basename "$(dirname "$(readlink -f "$DATA_DIR/od_firme.csv")")" | head -c 40
)
log "Dataset name (best guess): $DATASET_NAME"
|
||||
|
||||
# ── Stage CSVs ──
# stage_csv TABLE COLUMNS FILE
#   \copy $DATA_DIR/FILE into TABLE (COLUMNS). The files are '^'-delimited
#   with a header row; QUOTE is set to the backspace character.
stage_csv() {
  local table="$1" columns="$2" file="$3"
  psql -v ON_ERROR_STOP=1 <<COPYEOF
\\copy $table ($columns) FROM '$DATA_DIR/$file' WITH (FORMAT csv, DELIMITER '^', HEADER true, NULL '', QUOTE E'\\b');
COPYEOF
}

# stage_optional_csv FILE SIZE_NOTE TABLE COLUMNS
#   When $DATA_DIR/FILE is present and non-empty: truncate TABLE and reload
#   it from the file. Otherwise log a SKIP line and leave TABLE untouched.
stage_optional_csv() {
  local file="$1" size_note="$2" table="$3" columns="$4"
  if [ ! -s "$DATA_DIR/$file" ]; then
    log "[SKIP] $file missing"
    return 0
  fi
  log "COPY $file ($size_note)..."
  psql -v ON_ERROR_STOP=1 -c "TRUNCATE TABLE $table;"
  stage_csv "$table" "$columns" "$file"
}

log "Truncating staging tables..."
psql -v ON_ERROR_STOP=1 -c "
TRUNCATE TABLE firms.staging_onrc_firme, firms.staging_onrc_caen,
firms.staging_onrc_stare, firms.staging_onrc_reprezentanti;
"

log "COPY od_firme.csv (683MB)..."
time stage_csv firms.staging_onrc_firme \
  "denumire, cui, cod_inmatriculare, data_inmatriculare, euid, forma_juridica, adr_tara, adr_judet, adr_localitate, adr_strada, adr_numar, adr_bloc, adr_scara, adr_etaj, adr_apartament, adr_cod_postal, adr_sector, adr_completare, web, tara_firma_mama" \
  od_firme.csv

log "COPY od_caen_autorizat.csv..."
stage_csv firms.staging_onrc_caen \
  "cod_inmatriculare, cod_caen, ver_caen" \
  od_caen_autorizat.csv

log "COPY od_stare_firma.csv..."
stage_csv firms.staging_onrc_stare \
  "cod_inmatriculare, cod_stare" \
  od_stare_firma.csv

log "COPY od_reprezentanti_legali.csv..."
stage_csv firms.staging_onrc_reprezentanti \
  "cod_inmatriculare, persoana, calitate, data_nastere, localitate_nastere, judet_nastere, tara_nastere, localitate, judet, tara" \
  od_reprezentanti_legali.csv

# Optional extras from the same dataset (od_reprezentanti_if.csv and
# od_sucursale_alte_state_membre.csv). Idempotent — TRUNCATE-and-reload each run.
stage_optional_csv od_reprezentanti_if.csv "~13MB" firms.reprezentanti_if \
  "cod_inmatriculare, nume, data_nastere, localitate_nastere, judet_nastere, tara_nastere, calitate"

stage_optional_csv od_sucursale_alte_state_membre.csv "small" firms.sucursale_ue \
  "cod_inmatriculare, tip_unitate, denumire_sucursala, euid, cod_fiscal_strain, tara"
|
||||
|
||||
# ── Aggregate into firms.entities ──
# One psql session: build three per-cod_inmatriculare temp aggregates from the
# staging tables, upsert firms.entities keyed on cui, then fill in siruta for
# rows that do not have one yet.
log "Building firms.entities from staging..."
time psql -v ON_ERROR_STOP=1 <<SQL
-- One stare row per cod_inmatriculare. Staging carries no date for a state,
-- so this keeps the highest cod_stare; it is not necessarily the latest one.
DROP TABLE IF EXISTS tmp_stare_agg;
CREATE TEMP TABLE tmp_stare_agg AS
SELECT DISTINCT ON (cod_inmatriculare) cod_inmatriculare, cod_stare
FROM firms.staging_onrc_stare
WHERE cod_inmatriculare IS NOT NULL
ORDER BY cod_inmatriculare, cod_stare DESC;

-- Aggregate CAEN per cod_inmatriculare
DROP TABLE IF EXISTS tmp_caen_agg;
CREATE TEMP TABLE tmp_caen_agg AS
SELECT
  cod_inmatriculare,
  array_agg(DISTINCT cod_caen ORDER BY cod_caen) FILTER (WHERE cod_caen IS NOT NULL) AS caens
FROM firms.staging_onrc_caen
WHERE cod_inmatriculare IS NOT NULL
GROUP BY cod_inmatriculare;

-- Aggregate reprezentanti per cod_inmatriculare
DROP TABLE IF EXISTS tmp_rep_agg;
CREATE TEMP TABLE tmp_rep_agg AS
SELECT
  cod_inmatriculare,
  jsonb_agg(jsonb_build_object(
    'persoana', persoana,
    'calitate', calitate,
    'localitate', localitate,
    'judet', judet,
    'tara', tara
  )) AS rep_legali
FROM firms.staging_onrc_reprezentanti
WHERE cod_inmatriculare IS NOT NULL AND persoana IS NOT NULL
GROUP BY cod_inmatriculare;

-- UPSERT firms.entities. CUI as PK.
-- Skip rows where CUI is empty/0. DISTINCT ON (cui) — if multiple ONRC rows share the
-- same CUI (rare but happens with reorganization), pick the most recently registered.
INSERT INTO firms.entities (
  cui, cod_inmatriculare, euid, name, forma_juridica,
  adr_tara, adr_judet, adr_localitate, adr_strada, adr_numar,
  adr_bloc, adr_scara, adr_etaj, adr_apartament, adr_cod_postal,
  adr_sector, adr_completare,
  adr_full,
  data_inmatriculare,
  registration_year,
  web,
  tara_firma_mama,
  caen_autorizate,
  rep_legali,
  status_text,
  is_radiated_onrc,
  source_onrc_dataset,
  onrc_fetched_at,
  updated_at
)
SELECT DISTINCT ON (f.cui)
  f.cui,
  f.cod_inmatriculare,
  f.euid,
  f.denumire,
  f.forma_juridica,
  f.adr_tara, f.adr_judet, f.adr_localitate, f.adr_strada, f.adr_numar,
  f.adr_bloc, f.adr_scara, f.adr_etaj, f.adr_apartament, f.adr_cod_postal,
  f.adr_sector, f.adr_completare,
  -- Build adr_full for geocoding
  COALESCE(
    NULLIF(trim(concat_ws(', ',
      NULLIF(trim(concat_ws(' ', f.adr_strada,
        CASE WHEN f.adr_numar IS NOT NULL THEN 'nr.' || f.adr_numar END
      )), ''),
      f.adr_localitate,
      f.adr_judet,
      'Romania'
    )), ''),
    NULL
  ) AS adr_full,
  -- ONRC format: DD.MM.YYYY
  CASE WHEN f.data_inmatriculare ~ '^\d{2}\.\d{2}\.\d{4}'
    THEN to_date(f.data_inmatriculare, 'DD.MM.YYYY')
    ELSE NULL END AS data_inmatriculare,
  -- Year: the last four characters when the value ends in four digits,
  -- otherwise characters 7-10 of a value that starts with DD.MM.YYYY. The old
  -- second branch cast the last four characters even when they were not the
  -- year, which fails on a value with trailing text.
  CASE WHEN f.data_inmatriculare ~ '\d{4}\$'
    THEN right(f.data_inmatriculare, 4)::int
    WHEN f.data_inmatriculare ~ '^\d{2}\.\d{2}\.\d{4}'
    THEN substr(f.data_inmatriculare, 7, 4)::int
    ELSE NULL END AS registration_year,
  f.web,
  f.tara_firma_mama,
  ca.caens,
  ra.rep_legali,
  -- Status: store raw stare code (decoding via ONRC nomenclator is TODO)
  COALESCE(ss.cod_stare, 'unknown') AS status_text,
  false AS is_radiated_onrc, -- TODO: import ONRC stare nomenclator and detect
  '$DATASET_NAME' AS source_onrc_dataset,
  now() AS onrc_fetched_at,
  now() AS updated_at
FROM firms.staging_onrc_firme f
LEFT JOIN tmp_caen_agg ca ON ca.cod_inmatriculare = f.cod_inmatriculare
LEFT JOIN tmp_rep_agg ra ON ra.cod_inmatriculare = f.cod_inmatriculare
LEFT JOIN tmp_stare_agg ss ON ss.cod_inmatriculare = f.cod_inmatriculare
-- The former LEFT JOIN on firms.stare_codelist was dropped: none of its
-- columns were selected, so it only added work.
WHERE f.cui IS NOT NULL
  AND f.cui != ''
  AND f.cui != '0'
  AND f.denumire IS NOT NULL
-- Order by the parsed date. The staging column is DD.MM.YYYY text, and
-- sorting that text descending ranks rows by day of month, not by date.
ORDER BY f.cui,
  CASE WHEN f.data_inmatriculare ~ '^\d{2}\.\d{2}\.\d{4}'
    THEN to_date(f.data_inmatriculare, 'DD.MM.YYYY') END DESC NULLS LAST
ON CONFLICT (cui) DO UPDATE SET
  cod_inmatriculare = EXCLUDED.cod_inmatriculare,
  euid = EXCLUDED.euid,
  name = EXCLUDED.name,
  forma_juridica = EXCLUDED.forma_juridica,
  adr_tara = EXCLUDED.adr_tara,
  adr_judet = EXCLUDED.adr_judet,
  adr_localitate = EXCLUDED.adr_localitate,
  adr_strada = EXCLUDED.adr_strada,
  adr_numar = EXCLUDED.adr_numar,
  adr_bloc = EXCLUDED.adr_bloc,
  adr_scara = EXCLUDED.adr_scara,
  adr_etaj = EXCLUDED.adr_etaj,
  adr_apartament = EXCLUDED.adr_apartament,
  adr_cod_postal = EXCLUDED.adr_cod_postal,
  adr_sector = EXCLUDED.adr_sector,
  adr_completare = EXCLUDED.adr_completare,
  adr_full = EXCLUDED.adr_full,
  data_inmatriculare = EXCLUDED.data_inmatriculare,
  registration_year = EXCLUDED.registration_year,
  web = EXCLUDED.web,
  tara_firma_mama = EXCLUDED.tara_firma_mama,
  caen_autorizate = EXCLUDED.caen_autorizate,
  rep_legali = EXCLUDED.rep_legali,
  status_text = EXCLUDED.status_text,
  is_radiated_onrc = EXCLUDED.is_radiated_onrc,
  source_onrc_dataset = EXCLUDED.source_onrc_dataset,
  onrc_fetched_at = EXCLUDED.onrc_fetched_at,
  updated_at = now();

-- Match siruta UAT for each firm via norm_uat_name. Only rows without a
-- siruta are touched; when several UATs match, the lowest siruta wins.
UPDATE firms.entities f
SET siruta = sub.siruta
FROM (
  SELECT DISTINCT ON (e.cui) e.cui, gu.siruta
  FROM firms.entities e
  JOIN public."GisUat" gu
    ON seap.norm_uat_name(gu.county) = seap.norm_uat_name(e.adr_judet)
    AND seap.norm_uat_name(gu.name) = seap.norm_uat_name(e.adr_localitate)
  WHERE e.siruta IS NULL
    AND e.adr_judet IS NOT NULL
    AND e.adr_localitate IS NOT NULL
  ORDER BY e.cui, gu.siruta
) sub
WHERE f.cui = sub.cui;
SQL
|
||||
|
||||
# ── Stats ──
# Coverage counters over firms.entities, echoed to the terminal and appended
# to $LOG.
log "Final stats:"
entities_stats_sql="
SELECT
COUNT(*) AS total_firms,
COUNT(*) FILTER (WHERE siruta IS NOT NULL) AS cu_siruta,
COUNT(*) FILTER (WHERE rep_legali IS NOT NULL) AS cu_admins,
COUNT(*) FILTER (WHERE caen_autorizate IS NOT NULL) AS cu_caen,
COUNT(*) FILTER (WHERE is_radiated_onrc = true) AS radiate
FROM firms.entities;
"
psql -c "$entities_stats_sql" 2>&1 | tee -a "$LOG"

log "=== ONRC import complete ==="
|
||||
+199
@@ -0,0 +1,199 @@
|
||||
#!/bin/bash
|
||||
# Download GeoNames RO postal codes and rebuild firms.postal_codes.
|
||||
# Then geocode firms.entities by postal_code lookup, falling back to UAT
|
||||
# centroid for firms without a valid postal code but with a siruta UAT.
|
||||
#
|
||||
# Coverage estimates (snapshot 2026-05-08):
|
||||
# - postal-precision: ~2.07M / 3.97M firms (52%) — accuracy ~100m-2km
|
||||
# - UAT-centroid fallback: +1.7M firms (44%) — accuracy 5-30km
|
||||
# - combined: ~96% of all firms get lat/lng
|
||||
#
|
||||
# Run before geocode-photon.ts (which targets the remaining ~4% / refines the
|
||||
# postal-level pins to housenumber level when available).
|
||||
#
|
||||
# Idempotent: safe to re-run weekly. Only rewrites firms.entities rows where
|
||||
# the existing pin is null OR was set by an older/lower-precision source.
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
DATA_DIR=/opt/vreaudigital/data/postal
|
||||
LOG=/var/log/vreaudigital-postal-import.log
|
||||
GEONAMES_URL=https://download.geonames.org/export/zip/RO.zip
|
||||
|
||||
# Timestamped logger: writes "[YYYY-MM-DD HH:MM:SS] <message>" to stdout and
# appends the same line to $LOG.
#   $1 - message text
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1" | tee -a "$LOG"
}
|
||||
mkdir -p "$DATA_DIR"
|
||||
|
||||
log "=== Postal-codes import started ==="
|
||||
|
||||
# ── Resolve DATABASE_URL via Infisical Machine Identity ──
# Fetches DATABASE_URL from Infisical and exports PGUSER / PGPASSWORD / PGHOST /
# PGPORT / PGDATABASE for the psql calls that follow. Every step is checked
# explicitly so the script never continues with empty or garbage credentials.
source /opt/vreaudigital/.infisical-mi \
  || { log "FATAL: cannot source /opt/vreaudigital/.infisical-mi"; exit 1; }
TOKEN=$(infisical login --method=universal-auth \
  --domain="$INFISICAL_API_URL" \
  --client-id="$INFISICAL_CLIENT_ID" \
  --client-secret="$INFISICAL_CLIENT_SECRET" \
  --silent --plain) \
  || { log "FATAL: infisical login failed"; exit 1; }
DATABASE_URL=$(infisical run --domain="$INFISICAL_API_URL" \
  --projectId="$INFISICAL_PROJECT_ID" \
  --env="$INFISICAL_ENV" --path="$INFISICAL_PATH" \
  --silent --token="$TOKEN" \
  -- sh -c 'echo "$DATABASE_URL"') \
  || { log "FATAL: could not read DATABASE_URL from Infisical"; exit 1; }

# Parse the URL once. The database name stops at the first '?', so any query
# string (schema=…, sslmode=…) is dropped; a missing port falls back to 5432.
# Groups: 2=user 3=password 4=host 6=port 7=database.
_db_url_re='^postgres(ql)?://([^:@/]+):([^@]+)@([^:/?]+)(:([0-9]+))?/([^/?]+)'
if [[ "$DATABASE_URL" =~ $_db_url_re ]]; then
  export PGUSER="${BASH_REMATCH[2]}"
  export PGPASSWORD="${BASH_REMATCH[3]}"
  export PGHOST="${BASH_REMATCH[4]}"
  export PGPORT="${BASH_REMATCH[6]:-5432}"
  export PGDATABASE="${BASH_REMATCH[7]}"
else
  # Never print the URL itself: it contains the password.
  log "FATAL: DATABASE_URL is empty or not a postgresql://user:pass@host[:port]/db URL"
  exit 1
fi
unset DATABASE_URL TOKEN DB _db_url_re
|
||||
|
||||
# ── Download + unzip ──
log "Downloading $GEONAMES_URL..."
curl -fsSL --max-time 120 -o "$DATA_DIR/RO.zip" "$GEONAMES_URL"
log "Unzipping..."
# Remove the previous run's RO.txt first, otherwise the sanity check below
# could be satisfied by a stale file when the new archive lacks RO.txt.
rm -f -- "$DATA_DIR/RO.txt"
# Absolute paths instead of `cd … && unzip`: a failing cd on the left of &&
# does not trip `set -e`, and nothing here needs the working directory changed.
unzip -o "$DATA_DIR/RO.zip" -d "$DATA_DIR" >/dev/null
[ -s "$DATA_DIR/RO.txt" ] || { log "FATAL: RO.txt missing or empty"; exit 1; }
|
||||
|
||||
# ── Apply schema (idempotent) ──
psql -v ON_ERROR_STOP=1 -f /opt/vreaudigital/services/seap-scraper/sql/014_firms_postal_codes.sql >/dev/null

# ── Stage + UPSERT into firms.postal_codes ──
log "TRUNCATE staging + COPY..."
psql -v ON_ERROR_STOP=1 -c "TRUNCATE TABLE firms.staging_postal_codes;"

# GeoNames RO.txt is tab-separated, no header, US-ASCII safe (no quote escapes).
# The heredoc is unquoted so $DATA_DIR expands; "\\copy" reaches psql as "\copy".
psql -v ON_ERROR_STOP=1 <<COPYEOF
\\copy firms.staging_postal_codes (country_code, postal_code, place_name, admin1_name, admin1_code, admin2_name, admin2_code, admin3_name, admin3_code, lat, lng, accuracy) FROM '$DATA_DIR/RO.txt' WITH (FORMAT csv, DELIMITER E'\t', NULL '', QUOTE E'\b', HEADER false);
COPYEOF

log "Rebuilding firms.postal_codes from staging..."
# TRUNCATE + INSERT run in ONE transaction: if the INSERT fails, ON_ERROR_STOP
# ends the session and the TRUNCATE is rolled back, so the table is never left
# empty. Readers block on the table lock until COMMIT.
#
# DISTINCT ON de-duplicates staging rows per (postal_code, place_name), keeping
# the one with the highest accuracy value. Without it, a duplicate pair inside
# this single INSERT would hit ON CONFLICT DO UPDATE twice for the same row,
# which PostgreSQL rejects with a cardinality-violation error.
psql -v ON_ERROR_STOP=1 <<'SQL'
BEGIN;
TRUNCATE TABLE firms.postal_codes;
INSERT INTO firms.postal_codes (postal_code, place_name, county, county_code, admin2_code, admin3_code, admin3_name, lat, lng, accuracy)
SELECT DISTINCT ON (s.postal_code, s.place_name)
  s.postal_code,
  s.place_name,
  NULLIF(s.admin1_name, ''),
  NULLIF(s.admin1_code, ''),
  NULLIF(s.admin2_code, ''),
  NULLIF(s.admin3_code, ''),
  NULLIF(s.admin3_name, ''),
  s.lat::numeric(9,6),
  s.lng::numeric(9,6),
  NULLIF(s.accuracy, '')::int
FROM firms.staging_postal_codes s
WHERE s.postal_code ~ '^[0-9]{6}$'
  AND s.lat ~ '^-?[0-9.]+$'
  AND s.lng ~ '^-?[0-9.]+$'
ORDER BY s.postal_code, s.place_name, NULLIF(s.accuracy, '')::int DESC NULLS LAST
ON CONFLICT (postal_code, place_name) DO UPDATE
  SET lat = EXCLUDED.lat, lng = EXCLUDED.lng, accuracy = EXCLUDED.accuracy;
COMMIT;
SQL

log "Stats:"
psql -At -F"|" -c "
  SELECT 'postal_codes_loaded', COUNT(*) FROM firms.postal_codes UNION ALL
  SELECT 'distinct_postal_codes', COUNT(DISTINCT postal_code) FROM firms.postal_codes;
" 2>&1 | tee -a "$LOG"
|
||||
|
||||
# ── Geocode firms.entities (chunked, deadlock-retry) ──
|
||||
# Two-pass: postal first (more precise), then UAT centroid as fallback.
|
||||
# Each chunk is its own psql transaction so a deadlock against the
|
||||
# concurrent ANAF enrichment script aborts only the current chunk
|
||||
# (caught + retried), not the entire batch's progress.
|
||||
#######################################
# Run one chunk-sized UPDATE statement repeatedly until it reports "UPDATE 0".
# Each iteration is a separate psql invocation, so a failure loses only the
# current chunk.
# Globals:   LOG (read; failure output is appended to it)
# Arguments: $1 - short label used in log lines
#            $2 - SQL text; its last "UPDATE N" command tag is the chunk size
# Returns:   0 when a chunk updates zero rows (nothing left to do),
#            1 after more than 8 deadlock retries or on any other psql failure
#######################################
run_chunked_update() {
  local label="$1"
  local sql="$2"
  local chunk_total=0 chunk_n=0 retries=0
  local out rc rows
  while :; do
    # -X skips ~/.psqlrc. psql prints the "UPDATE N" command tag on stdout;
    # stderr is merged in so error text can be inspected below.
    # The exit status is captured with `|| rc=$?`: under `set -e` a bare
    # out=$(psql …) would abort the whole script on the first deadlock,
    # before the retry logic below ever ran.
    rc=0
    out=$(psql -v ON_ERROR_STOP=1 -X 2>&1 <<<"$sql") || rc=$?

    # Here-strings instead of `echo | grep -q`: with pipefail, grep -q exiting
    # early can make the echo side fail and turn a match into a false negative.
    if grep -q "deadlock detected" <<<"$out"; then
      retries=$((retries + 1))
      if [ "$retries" -gt 8 ]; then
        log "[$label] giving up after 8 deadlock retries"
        printf '%s\n' "$out" | tail -5 | tee -a "$LOG"
        return 1
      fi
      log "[$label] deadlock — retry #$retries in 2s"
      sleep 2
      continue
    fi

    # Any other failure. A non-zero exit also covers errors that carry no
    # "ERROR:" line (e.g. connection refused), which would otherwise be
    # counted as a zero-row chunk and reported as "done".
    if [ "$rc" -ne 0 ] || grep -qE "^ERROR:" <<<"$out"; then
      printf '%s\n' "$out" | tail -10 | tee -a "$LOG"
      return 1
    fi

    # `|| true`: grep exits 1 when no UPDATE tag is present; treat that as 0 rows.
    rows=$(grep -oE '^UPDATE [0-9]+' <<<"$out" | tail -1 | awk '{print $2}') || true
    rows=${rows:-0}
    chunk_n=$((chunk_n + 1))
    chunk_total=$((chunk_total + rows))
    if [ "$rows" = "0" ]; then
      log "[$label] done — $chunk_n chunks, $chunk_total rows"
      return 0
    fi
    log "[$label] chunk #$chunk_n: $rows rows (running total $chunk_total)"
  done
}
|
||||
|
||||
# ── Pass 1: pin firms by postal code ──
# Candidates: 6-digit postal code present in firms.postal_codes_best and no pin
# yet (or only a UAT-centroid pin). Processed 50000 firms per chunk.
log "Geocoding firms.entities by postal_code..."
postal_sql=$(cat <<'SQL'
WITH cand AS (
  SELECT e.cui FROM firms.entities e
  WHERE e.adr_cod_postal ~ '^[0-9]{6}$'
    AND (e.geocode_source IS NULL OR e.geocode_source = 'uat_centroid')
    AND EXISTS (SELECT 1 FROM firms.postal_codes_best pc WHERE pc.postal_code = e.adr_cod_postal)
  ORDER BY e.cui
  LIMIT 50000
)
UPDATE firms.entities e
SET
  lat = pc.lat::double precision,
  lng = pc.lng::double precision,
  geom = ST_SetSRID(ST_MakePoint(pc.lng, pc.lat), 4326)::geography,
  geocode_source = 'geonames_postal',
  geocode_score = 0.6,
  geocoded_at = now(),
  updated_at = now()
FROM firms.postal_codes_best pc, cand
WHERE e.cui = cand.cui
  AND e.adr_cod_postal = pc.postal_code;
SQL
)
run_chunked_update "postal" "$postal_sql"

# ── Pass 2: fall back to the UAT centroid ──
# public."GisUat".geom is in SRID 3844 (RO STEREO70 projected). Geography
# requires WGS84 lon/lat (4326), so ST_Transform before ::geography.
log "Geocoding firms.entities fallback to UAT centroid..."
uat_sql=$(cat <<'SQL'
WITH cand AS (
  SELECT e.cui FROM firms.entities e
  WHERE e.siruta IS NOT NULL
    AND e.geocode_source IS NULL
    AND EXISTS (SELECT 1 FROM public."GisUat" gu WHERE gu.siruta = e.siruta)
  ORDER BY e.cui
  LIMIT 50000
)
UPDATE firms.entities e
SET
  lat = ST_Y(ST_Transform(ST_Centroid(gu.geom), 4326))::double precision,
  lng = ST_X(ST_Transform(ST_Centroid(gu.geom), 4326))::double precision,
  geom = ST_Transform(ST_Centroid(gu.geom), 4326)::geography,
  geocode_source = 'uat_centroid',
  geocode_score = 0.3,
  geocoded_at = now(),
  updated_at = now()
FROM public."GisUat" gu, cand
WHERE e.cui = cand.cui
  AND e.siruta = gu.siruta;
SQL
)
run_chunked_update "uat" "$uat_sql"

# ── Summary: pin counts per geocode source ──
log "Final stats:"
stats_sql=$(cat <<'SQL'
SELECT
  COUNT(*) AS total,
  COUNT(*) FILTER (WHERE lat IS NOT NULL) AS cu_lat_lng,
  COUNT(*) FILTER (WHERE geocode_source = 'geonames_postal') AS via_postal,
  COUNT(*) FILTER (WHERE geocode_source = 'uat_centroid') AS via_uat,
  COUNT(*) FILTER (WHERE geocode_source = 'photon') AS via_photon
FROM firms.entities;
SQL
)
psql -At -F"|" -c "$stats_sql" 2>&1 | tee -a "$LOG"

log "=== Postal-codes import done ==="
|
||||
Executable
+51
@@ -0,0 +1,51 @@
|
||||
#!/bin/bash
|
||||
# One-shot install of Photon 0.5.0 (last Elasticsearch-backed release) on satra.
|
||||
# Photon 0.6+ uses OpenSearch and is incompatible with the country-level extracts
|
||||
# graphhopper still publishes (which are ES format). Verified working 2026-05-08.
|
||||
#
|
||||
# After install, start as a service: see vreaudigital-photon.service in this dir.
|
||||
#
|
||||
# Prerequisite: the RO ES extract is already at /opt/photon/photon_data
|
||||
# (downloaded by setup-photon.sh from photon-db-ro-DDMMYY.tar.bz2).
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
PHOTON_DIR=/opt/photon
|
||||
PHOTON_VERSION=0.5.0
|
||||
JAR_URL=https://github.com/komoot/photon/releases/download/${PHOTON_VERSION}/photon-${PHOTON_VERSION}.jar
|
||||
|
||||
# Console logger: prints "[HH:MM:SS] <message>" to stdout.
#   $1 - message text
log() {
  printf '[%s] %s\n' "$(date '+%H:%M:%S')" "$1"
}
|
||||
|
||||
log "=== Photon ${PHOTON_VERSION} install ==="

# 1. JDK 21 (works with Photon 0.5.0; 0.5 requires JDK 11+).
# Installed only when no `java` is on PATH.
if ! command -v java >/dev/null 2>&1; then
  log "Installing openjdk-21-jre-headless..."
  sudo apt-get install -y openjdk-21-jre-headless
fi
java --version

# 2. Photon JAR
# Download to a ".part" file and rename only after curl succeeded. Writing
# straight to the final name would let an interrupted transfer leave a
# truncated, non-empty JAR that the -s check below accepts on the next run.
JAR_PATH="$PHOTON_DIR/photon-${PHOTON_VERSION}.jar"
if [ ! -s "$JAR_PATH" ]; then
  log "Downloading photon-${PHOTON_VERSION}.jar (~38MB)..."
  sudo curl -fL -o "$JAR_PATH.part" "$JAR_URL"
  sudo chown bulibasa:bulibasa "$JAR_PATH.part"
  sudo mv -f -- "$JAR_PATH.part" "$JAR_PATH"
else
  log "JAR already on disk."
fi

# 3. Sanity-check the extract directory
if [ ! -d "$PHOTON_DIR/photon_data/elasticsearch" ]; then
  log "FATAL: $PHOTON_DIR/photon_data/elasticsearch missing — run setup-photon.sh first."
  exit 1
fi
sudo chown -R bulibasa:bulibasa "$PHOTON_DIR/photon_data"

# 4. Pre-create log + service file expectations
sudo touch /var/log/vreaudigital-photon.log
sudo chown bulibasa:bulibasa /var/log/vreaudigital-photon.log

# Print next-step hints only; nothing below starts Photon.
log "=== Install done. Start with: ==="
log " cd $PHOTON_DIR && nohup java -Xmx8G -jar photon-${PHOTON_VERSION}.jar -data-dir $PHOTON_DIR -listen-port 2322 </dev/null >>/var/log/vreaudigital-photon.log 2>&1 &"
log "Or install systemd unit: sudo ln -sf $PHOTON_DIR/../vreaudigital/services/seap-scraper/cron/vreaudigital-photon.service /etc/systemd/system/ && sudo systemctl enable --now vreaudigital-photon"
log "Smoke test: curl 'http://localhost:2322/api?q=Bucuresti&limit=1'"
|
||||
+204
@@ -0,0 +1,204 @@
|
||||
#!/bin/bash
|
||||
# Fuzzy-match ancom.operatori.titular_name → firms.entities.cui via the
|
||||
# same Stage A (exact normalized) + Stage B (pg_trgm unique-pick) + Stage C
|
||||
# (judet disambiguation) pipeline as cron/match-cui-anre.sh.
|
||||
#
|
||||
# Most ANCOM rows have CUI directly from the detail page (cui_match_method='direct'),
|
||||
# so this is a fallback for whatever subset has titular_cui IS NULL.
|
||||
#
|
||||
# Idempotent — only touches rows where titular_cui IS NULL.
|
||||
|
||||
set -uo pipefail
|
||||
|
||||
LOG=/var/log/vreaudigital-cui-match-ancom.log
|
||||
# Timestamped logger: writes "[YYYY-MM-DD HH:MM:SS] <message>" to stdout and
# appends the same line to $LOG.
#   $1 - message text
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1" | tee -a "$LOG"
}
|
||||
|
||||
# Resolve DATABASE_URL via Infisical Machine Identity
# Exports PGUSER / PGPASSWORD / PGHOST / PGPORT / PGDATABASE for the psql calls
# below. This script runs without `set -e`, so each step is checked explicitly:
# continuing with empty credentials would point psql at its default connection.
source /opt/vreaudigital/.infisical-mi \
  || { log "FATAL: cannot source /opt/vreaudigital/.infisical-mi"; exit 1; }
TOKEN=$(infisical login --method=universal-auth --domain="$INFISICAL_API_URL" \
  --client-id="$INFISICAL_CLIENT_ID" --client-secret="$INFISICAL_CLIENT_SECRET" --silent --plain) \
  || { log "FATAL: infisical login failed"; exit 1; }
DBURL=$(infisical run --domain="$INFISICAL_API_URL" --projectId="$INFISICAL_PROJECT_ID" \
  --env="$INFISICAL_ENV" --path="$INFISICAL_PATH" --silent --token="$TOKEN" \
  -- sh -c 'echo "$DATABASE_URL"') \
  || { log "FATAL: could not read DATABASE_URL from Infisical"; exit 1; }

# Parse the URL once. The database name stops at the first '?', so any query
# string (schema=…, sslmode=…) is dropped; a missing port falls back to 5432.
# Groups: 2=user 3=password 4=host 6=port 7=database.
_db_url_re='^postgres(ql)?://([^:@/]+):([^@]+)@([^:/?]+)(:([0-9]+))?/([^/?]+)'
if [[ "$DBURL" =~ $_db_url_re ]]; then
  export PGUSER="${BASH_REMATCH[2]}"
  export PGPASSWORD="${BASH_REMATCH[3]}"
  export PGHOST="${BASH_REMATCH[4]}"
  export PGPORT="${BASH_REMATCH[6]:-5432}"
  export PGDATABASE="${BASH_REMATCH[7]}"
else
  # Never print the URL itself: it contains the password.
  log "FATAL: DATABASE_URL is empty or not a postgresql://user:pass@host[:port]/db URL"
  exit 1
fi
unset DBURL TOKEN DB _db_url_re
|
||||
|
||||
log "=== ANCOM CUI matcher started ==="

# Snapshot "<rows without CUI>/<total rows>" before matching.
BEFORE=$(psql -At -c "SELECT COUNT(*) FILTER (WHERE titular_cui IS NULL) || '/' || COUNT(*) FROM ancom.operatori;")
log "before: $BEFORE"

# Pre-step: populate titular_name_norm for all rows where it's NULL.
# pipefail is on, so the pipeline status reflects psql; a failure is logged
# explicitly and the run continues (later stages only use rows that have a
# normalized name).
log "pre-step: populating titular_name_norm..."
if ! psql -v ON_ERROR_STOP=1 <<'SQL' 2>&1 | tee -a "$LOG"
UPDATE ancom.operatori
SET titular_name_norm = firms.normalize_company_name(titular_name)
WHERE titular_name_norm IS NULL
  AND titular_name IS NOT NULL;
SQL
then
  log "pre-step FAILED (see psql output above) — continuing"
fi
|
||||
|
||||
# Stage A: exact normalized match (unique only).
# A row is matched only when exactly one firms.entities row has the same
# normalized name (n = 1). The completion line is logged only when psql
# succeeded (pipefail is on); a failure is reported and the run continues.
log "Stage A: exact normalized match..."
if psql -v ON_ERROR_STOP=1 <<'SQL' 2>&1 | tee -a "$LOG"
WITH cand AS (
  SELECT t.ancom_id AS row_id, t.titular_name_norm AS norm
  FROM ancom.operatori t
  WHERE t.titular_cui IS NULL
    AND t.titular_name_norm IS NOT NULL
),
matched AS (
  SELECT c.row_id, MIN(e.cui) AS cui, COUNT(*) AS n
  FROM cand c
  JOIN firms.entities e ON e.name_normalized = c.norm
  GROUP BY c.row_id
)
UPDATE ancom.operatori t
SET titular_cui = m.cui,
    cui_match_score = 1.0,
    cui_match_method = 'exact_norm',
    matched_at = now()
FROM matched m
WHERE t.ancom_id = m.row_id
  AND t.titular_cui IS NULL
  AND m.n = 1;
SQL
then
  log "Stage A done"
else
  log "Stage A FAILED (see psql output above) — continuing"
fi
|
||||
|
||||
# Stage B: pg_trgm fuzzy. Same SET threshold 0.7 + 0.85/0.10 accept rule
# as match-cui-external.sh.
# Per distinct normalized name, the best candidate is accepted when its
# similarity is >= 0.85 and it beats the runner-up by >= 0.10 (or there is no
# runner-up). The completion line is logged only when psql succeeded.
log "Stage B: pg_trgm fuzzy (score >= 0.85, gap >= 0.10)..."
if psql -v ON_ERROR_STOP=1 <<'SQL' 2>&1 | tee -a "$LOG"
SET pg_trgm.similarity_threshold = 0.7;

CREATE TEMP TABLE _sb_rows AS
SELECT t.ancom_id AS rowid, t.titular_name_norm AS norm
FROM ancom.operatori t
WHERE t.titular_cui IS NULL
  AND t.titular_name_norm IS NOT NULL
  AND length(t.titular_name_norm) >= 5;
CREATE INDEX ON _sb_rows (norm);
ANALYZE _sb_rows;

CREATE TEMP TABLE _sb_norms AS SELECT DISTINCT norm FROM _sb_rows;
ANALYZE _sb_norms;

CREATE TEMP TABLE _sb_resolved AS
WITH ranked AS (
  SELECT c.norm,
         e.cui,
         similarity(e.name_normalized, c.norm) AS sim,
         ROW_NUMBER() OVER (
           PARTITION BY c.norm
           ORDER BY similarity(e.name_normalized, c.norm) DESC, e.cui
         ) AS rn
  FROM _sb_norms c
  JOIN firms.entities e ON e.name_normalized % c.norm
),
top2 AS (
  SELECT norm,
         MAX(sim) FILTER (WHERE rn = 1) AS s1,
         MAX(sim) FILTER (WHERE rn = 2) AS s2,
         MAX(cui) FILTER (WHERE rn = 1) AS cui1
  FROM ranked WHERE rn <= 2
  GROUP BY norm
)
SELECT norm, cui1, s1
FROM top2
WHERE s1 >= 0.85
  AND (s2 IS NULL OR (s1 - s2) >= 0.10);
CREATE INDEX ON _sb_resolved (norm);
ANALYZE _sb_resolved;

UPDATE ancom.operatori t
SET titular_cui = r.cui1,
    cui_match_score = r.s1,
    cui_match_method = 'trgm_unique',
    matched_at = now()
FROM _sb_rows rw
JOIN _sb_resolved r ON rw.norm = r.norm
WHERE t.ancom_id = rw.rowid
  AND t.titular_cui IS NULL;

DROP TABLE _sb_rows, _sb_norms, _sb_resolved;
SQL
then
  log "Stage B done"
else
  log "Stage B FAILED (see psql output above) — continuing"
fi
|
||||
|
||||
# Stage C: judet disambiguation when there are multiple trgm candidates.
# For each (normalized name, normalized judet) key, picks the highest-similarity
# firm whose normalized adr_judet equals the row's judet (ties broken by lowest
# cui). The completion line is logged only when psql succeeded.
log "Stage C: judet disambiguation..."
if psql -v ON_ERROR_STOP=1 <<'SQL' 2>&1 | tee -a "$LOG"
SET pg_trgm.similarity_threshold = 0.7;

CREATE TEMP TABLE _sc_rows AS
SELECT t.ancom_id AS rowid,
       t.titular_name_norm AS norm,
       firms.normalize_judet(t.judet) AS judet_norm
FROM ancom.operatori t
WHERE t.titular_cui IS NULL
  AND t.titular_name_norm IS NOT NULL
  AND t.judet IS NOT NULL
  AND length(t.titular_name_norm) >= 5;
CREATE INDEX ON _sc_rows (norm, judet_norm);
ANALYZE _sc_rows;

CREATE TEMP TABLE _sc_keys AS
SELECT DISTINCT norm, judet_norm FROM _sc_rows;
ANALYZE _sc_keys;

CREATE TEMP TABLE _sc_resolved AS
WITH ranked AS (
  SELECT c.norm, c.judet_norm, e.cui,
         similarity(e.name_normalized, c.norm) AS sim,
         (firms.normalize_judet(e.adr_judet) = c.judet_norm) AS judet_match
  FROM _sc_keys c
  JOIN firms.entities e ON e.name_normalized % c.norm
),
pick AS (
  SELECT DISTINCT ON (norm, judet_norm)
         norm, judet_norm, cui, sim
  FROM ranked
  WHERE judet_match
  ORDER BY norm, judet_norm, sim DESC, cui
)
SELECT * FROM pick WHERE sim >= 0.7;
CREATE INDEX ON _sc_resolved (norm, judet_norm);
ANALYZE _sc_resolved;

UPDATE ancom.operatori t
SET titular_cui = r.cui,
    cui_match_score = r.sim,
    cui_match_method = 'trgm_judet',
    matched_at = now()
FROM _sc_rows rw
JOIN _sc_resolved r ON rw.norm = r.norm AND rw.judet_norm = r.judet_norm
WHERE t.ancom_id = rw.rowid
  AND t.titular_cui IS NULL;

DROP TABLE _sc_rows, _sc_keys, _sc_resolved;
SQL
then
  log "Stage C done"
else
  log "Stage C FAILED (see psql output above) — continuing"
fi
|
||||
|
||||
# Post-run summary: "<unmatched>/<total> (matched P%)".
# NULLIF + COALESCE make the percentage 0 on an empty table instead of raising
# a division-by-zero error (which would leave $AFTER empty).
AFTER=$(psql -At -c "
  SELECT COUNT(*) FILTER (WHERE titular_cui IS NULL) || '/' ||
         COUNT(*) || ' (matched ' ||
         COALESCE(ROUND(100.0*COUNT(*) FILTER (WHERE titular_cui IS NOT NULL) / NULLIF(COUNT(*), 0), 1), 0) || '%)'
  FROM ancom.operatori;")
log "after: $AFTER"

log "by method:"
psql -At -F'|' -c "
  SELECT cui_match_method, COUNT(*)
  FROM ancom.operatori
  GROUP BY 1 ORDER BY 2 DESC NULLS LAST;" 2>&1 | tee -a "$LOG"

# Refresh the per-CUI MV now that titular_cui is populated.
# Try the non-blocking CONCURRENTLY form first; if it fails, fall back to a
# plain (locking) refresh.
log "refreshing ancom.mv_operatori_per_cui..."
if ! psql -v ON_ERROR_STOP=1 -c "REFRESH MATERIALIZED VIEW CONCURRENTLY ancom.mv_operatori_per_cui;" \
    2>>"$LOG"; then
  psql -v ON_ERROR_STOP=1 -c "REFRESH MATERIALIZED VIEW ancom.mv_operatori_per_cui;" 2>&1 | tee -a "$LOG"
fi

log "=== ANCOM CUI matcher done ==="
|
||||
Executable
+204
@@ -0,0 +1,204 @@
|
||||
#!/bin/bash
|
||||
# Fuzzy-match anre.licente.titular_name → firms.entities.cui via the
|
||||
# same Stage A (exact normalized) + Stage B (pg_trgm unique-pick) + Stage C
|
||||
# (judet disambiguation) pipeline as cron/match-cui-external.sh.
|
||||
#
|
||||
# Idempotent — only touches rows where titular_cui IS NULL.
|
||||
#
|
||||
# anre.licente has its own column names (titular_cui not cui), so we have
|
||||
# a dedicated wrapper here. Same SQL approach, different column names.
|
||||
|
||||
set -uo pipefail
|
||||
|
||||
LOG=/var/log/vreaudigital-cui-match-anre.log
|
||||
# Timestamped logger: writes "[YYYY-MM-DD HH:MM:SS] <message>" to stdout and
# appends the same line to $LOG.
#   $1 - message text
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1" | tee -a "$LOG"
}
|
||||
|
||||
# Resolve DATABASE_URL via Infisical Machine Identity
# Exports PGUSER / PGPASSWORD / PGHOST / PGPORT / PGDATABASE for the psql calls
# below. This script runs without `set -e`, so each step is checked explicitly:
# continuing with empty credentials would point psql at its default connection.
source /opt/vreaudigital/.infisical-mi \
  || { log "FATAL: cannot source /opt/vreaudigital/.infisical-mi"; exit 1; }
TOKEN=$(infisical login --method=universal-auth --domain="$INFISICAL_API_URL" \
  --client-id="$INFISICAL_CLIENT_ID" --client-secret="$INFISICAL_CLIENT_SECRET" --silent --plain) \
  || { log "FATAL: infisical login failed"; exit 1; }
DBURL=$(infisical run --domain="$INFISICAL_API_URL" --projectId="$INFISICAL_PROJECT_ID" \
  --env="$INFISICAL_ENV" --path="$INFISICAL_PATH" --silent --token="$TOKEN" \
  -- sh -c 'echo "$DATABASE_URL"') \
  || { log "FATAL: could not read DATABASE_URL from Infisical"; exit 1; }

# Parse the URL once. The database name stops at the first '?', so any query
# string (schema=…, sslmode=…) is dropped; a missing port falls back to 5432.
# Groups: 2=user 3=password 4=host 6=port 7=database.
_db_url_re='^postgres(ql)?://([^:@/]+):([^@]+)@([^:/?]+)(:([0-9]+))?/([^/?]+)'
if [[ "$DBURL" =~ $_db_url_re ]]; then
  export PGUSER="${BASH_REMATCH[2]}"
  export PGPASSWORD="${BASH_REMATCH[3]}"
  export PGHOST="${BASH_REMATCH[4]}"
  export PGPORT="${BASH_REMATCH[6]:-5432}"
  export PGDATABASE="${BASH_REMATCH[7]}"
else
  # Never print the URL itself: it contains the password.
  log "FATAL: DATABASE_URL is empty or not a postgresql://user:pass@host[:port]/db URL"
  exit 1
fi
unset DBURL TOKEN DB _db_url_re
|
||||
|
||||
log "=== ANRE CUI matcher started ==="

# Snapshot "<rows without CUI>/<total rows>" before matching.
BEFORE=$(psql -At -c "SELECT COUNT(*) FILTER (WHERE titular_cui IS NULL) || '/' || COUNT(*) FROM anre.licente;")
log "before: $BEFORE"

# Pre-step: populate titular_name_norm for all rows where it's NULL.
# pipefail is on, so the pipeline status reflects psql; a failure is logged
# explicitly and the run continues (later stages only use rows that have a
# normalized name).
log "pre-step: populating titular_name_norm..."
if ! psql -v ON_ERROR_STOP=1 <<'SQL' 2>&1 | tee -a "$LOG"
UPDATE anre.licente
SET titular_name_norm = firms.normalize_company_name(titular_name)
WHERE titular_name_norm IS NULL
  AND titular_name IS NOT NULL;
SQL
then
  log "pre-step FAILED (see psql output above) — continuing"
fi
|
||||
|
||||
# Stage A: exact normalized match (unique only).
# A row is matched only when exactly one firms.entities row has the same
# normalized name (n = 1). The completion line is logged only when psql
# succeeded (pipefail is on); a failure is reported and the run continues.
log "Stage A: exact normalized match..."
if psql -v ON_ERROR_STOP=1 <<'SQL' 2>&1 | tee -a "$LOG"
WITH cand AS (
  SELECT t.id AS row_id, t.titular_name_norm AS norm
  FROM anre.licente t
  WHERE t.titular_cui IS NULL
    AND t.titular_name_norm IS NOT NULL
),
matched AS (
  SELECT c.row_id, MIN(e.cui) AS cui, COUNT(*) AS n
  FROM cand c
  JOIN firms.entities e ON e.name_normalized = c.norm
  GROUP BY c.row_id
)
UPDATE anre.licente t
SET titular_cui = m.cui,
    cui_match_score = 1.0,
    cui_match_method = 'exact_norm',
    matched_at = now()
FROM matched m
WHERE t.id = m.row_id
  AND t.titular_cui IS NULL
  AND m.n = 1;
SQL
then
  log "Stage A done"
else
  log "Stage A FAILED (see psql output above) — continuing"
fi
|
||||
|
||||
# Stage B: pg_trgm fuzzy. Same SET threshold 0.7 + 0.85/0.10 accept rule
# as match-cui-external.sh.
# Per distinct normalized name, the best candidate is accepted when its
# similarity is >= 0.85 and it beats the runner-up by >= 0.10 (or there is no
# runner-up). The completion line is logged only when psql succeeded.
log "Stage B: pg_trgm fuzzy (score >= 0.85, gap >= 0.10)..."
if psql -v ON_ERROR_STOP=1 <<'SQL' 2>&1 | tee -a "$LOG"
SET pg_trgm.similarity_threshold = 0.7;

CREATE TEMP TABLE _sb_rows AS
SELECT t.id AS rowid, t.titular_name_norm AS norm
FROM anre.licente t
WHERE t.titular_cui IS NULL
  AND t.titular_name_norm IS NOT NULL
  AND length(t.titular_name_norm) >= 5;
CREATE INDEX ON _sb_rows (norm);
ANALYZE _sb_rows;

CREATE TEMP TABLE _sb_norms AS SELECT DISTINCT norm FROM _sb_rows;
ANALYZE _sb_norms;

CREATE TEMP TABLE _sb_resolved AS
WITH ranked AS (
  SELECT c.norm,
         e.cui,
         similarity(e.name_normalized, c.norm) AS sim,
         ROW_NUMBER() OVER (
           PARTITION BY c.norm
           ORDER BY similarity(e.name_normalized, c.norm) DESC, e.cui
         ) AS rn
  FROM _sb_norms c
  JOIN firms.entities e ON e.name_normalized % c.norm
),
top2 AS (
  SELECT norm,
         MAX(sim) FILTER (WHERE rn = 1) AS s1,
         MAX(sim) FILTER (WHERE rn = 2) AS s2,
         MAX(cui) FILTER (WHERE rn = 1) AS cui1
  FROM ranked WHERE rn <= 2
  GROUP BY norm
)
SELECT norm, cui1, s1
FROM top2
WHERE s1 >= 0.85
  AND (s2 IS NULL OR (s1 - s2) >= 0.10);
CREATE INDEX ON _sb_resolved (norm);
ANALYZE _sb_resolved;

UPDATE anre.licente t
SET titular_cui = r.cui1,
    cui_match_score = r.s1,
    cui_match_method = 'trgm_unique',
    matched_at = now()
FROM _sb_rows rw
JOIN _sb_resolved r ON rw.norm = r.norm
WHERE t.id = rw.rowid
  AND t.titular_cui IS NULL;

DROP TABLE _sb_rows, _sb_norms, _sb_resolved;
SQL
then
  log "Stage B done"
else
  log "Stage B FAILED (see psql output above) — continuing"
fi
|
||||
|
||||
# Stage C: judet disambiguation when there are multiple trgm candidates.
# For each (normalized name, normalized judet) key, picks the highest-similarity
# firm whose normalized adr_judet equals the row's judet (ties broken by lowest
# cui). The completion line is logged only when psql succeeded.
log "Stage C: judet disambiguation..."
if psql -v ON_ERROR_STOP=1 <<'SQL' 2>&1 | tee -a "$LOG"
SET pg_trgm.similarity_threshold = 0.7;

CREATE TEMP TABLE _sc_rows AS
SELECT t.id AS rowid,
       t.titular_name_norm AS norm,
       firms.normalize_judet(t.judet) AS judet_norm
FROM anre.licente t
WHERE t.titular_cui IS NULL
  AND t.titular_name_norm IS NOT NULL
  AND t.judet IS NOT NULL
  AND length(t.titular_name_norm) >= 5;
CREATE INDEX ON _sc_rows (norm, judet_norm);
ANALYZE _sc_rows;

CREATE TEMP TABLE _sc_keys AS
SELECT DISTINCT norm, judet_norm FROM _sc_rows;
ANALYZE _sc_keys;

CREATE TEMP TABLE _sc_resolved AS
WITH ranked AS (
  SELECT c.norm, c.judet_norm, e.cui,
         similarity(e.name_normalized, c.norm) AS sim,
         (firms.normalize_judet(e.adr_judet) = c.judet_norm) AS judet_match
  FROM _sc_keys c
  JOIN firms.entities e ON e.name_normalized % c.norm
),
pick AS (
  SELECT DISTINCT ON (norm, judet_norm)
         norm, judet_norm, cui, sim
  FROM ranked
  WHERE judet_match
  ORDER BY norm, judet_norm, sim DESC, cui
)
SELECT * FROM pick WHERE sim >= 0.7;
CREATE INDEX ON _sc_resolved (norm, judet_norm);
ANALYZE _sc_resolved;

UPDATE anre.licente t
SET titular_cui = r.cui,
    cui_match_score = r.sim,
    cui_match_method = 'trgm_judet',
    matched_at = now()
FROM _sc_rows rw
JOIN _sc_resolved r ON rw.norm = r.norm AND rw.judet_norm = r.judet_norm
WHERE t.id = rw.rowid
  AND t.titular_cui IS NULL;

DROP TABLE _sc_rows, _sc_keys, _sc_resolved;
SQL
then
  log "Stage C done"
else
  log "Stage C FAILED (see psql output above) — continuing"
fi
|
||||
|
||||
# Post-run summary: "<unmatched>/<total> (matched P%)".
# NULLIF + COALESCE make the percentage 0 on an empty table instead of raising
# a division-by-zero error (which would leave $AFTER empty).
AFTER=$(psql -At -c "
  SELECT COUNT(*) FILTER (WHERE titular_cui IS NULL) || '/' ||
         COUNT(*) || ' (matched ' ||
         COALESCE(ROUND(100.0*COUNT(*) FILTER (WHERE titular_cui IS NOT NULL) / NULLIF(COUNT(*), 0), 1), 0) || '%)'
  FROM anre.licente;")
log "after: $AFTER"

log "by method:"
psql -At -F'|' -c "
  SELECT cui_match_method, COUNT(*)
  FROM anre.licente
  GROUP BY 1 ORDER BY 2 DESC NULLS LAST;" 2>&1 | tee -a "$LOG"

# Refresh the per-CUI MV now that titular_cui is populated.
# Try the non-blocking CONCURRENTLY form first; if it fails, fall back to a
# plain (locking) refresh.
log "refreshing anre.mv_licente_per_cui..."
if ! psql -v ON_ERROR_STOP=1 -c "REFRESH MATERIALIZED VIEW CONCURRENTLY anre.mv_licente_per_cui;" \
    2>>"$LOG"; then
  psql -v ON_ERROR_STOP=1 -c "REFRESH MATERIALIZED VIEW anre.mv_licente_per_cui;" 2>&1 | tee -a "$LOG"
fi

log "=== ANRE CUI matcher done ==="
|
||||
+237
@@ -0,0 +1,237 @@
|
||||
#!/bin/bash
|
||||
# Run CUI-matching pass over external tables that have company names
|
||||
# but no CUI yet. Idempotent — only touches rows where cui IS NULL.
|
||||
#
|
||||
# Currently matches:
|
||||
# - fonduri.beneficiar_anunt (~41K names)
|
||||
# - fonduri.afir_plati (~316K distinct names)
|
||||
#
|
||||
# Future: ANI shareholdings, license registries, etc. — all use the same
|
||||
# firms.normalize_company_name() helper from sql/019_cui_matcher.sql.
|
||||
|
||||
set -uo pipefail
|
||||
|
||||
LOG=/var/log/vreaudigital-cui-match.log
|
||||
# Timestamped logger: writes "[YYYY-MM-DD HH:MM:SS] <message>" to stdout and
# appends the same line to $LOG.
#   $1 - message text
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1" | tee -a "$LOG"
}
|
||||
|
||||
# Resolve DATABASE_URL via Infisical Machine Identity
# Exports PGUSER / PGPASSWORD / PGHOST / PGPORT / PGDATABASE for the psql calls
# below. This script runs without `set -e`, so each step is checked explicitly:
# continuing with empty credentials would point psql at its default connection.
source /opt/vreaudigital/.infisical-mi \
  || { log "FATAL: cannot source /opt/vreaudigital/.infisical-mi"; exit 1; }
TOKEN=$(infisical login --method=universal-auth --domain="$INFISICAL_API_URL" \
  --client-id="$INFISICAL_CLIENT_ID" --client-secret="$INFISICAL_CLIENT_SECRET" --silent --plain) \
  || { log "FATAL: infisical login failed"; exit 1; }
DBURL=$(infisical run --domain="$INFISICAL_API_URL" --projectId="$INFISICAL_PROJECT_ID" \
  --env="$INFISICAL_ENV" --path="$INFISICAL_PATH" --silent --token="$TOKEN" \
  -- sh -c 'echo "$DATABASE_URL"') \
  || { log "FATAL: could not read DATABASE_URL from Infisical"; exit 1; }

# Parse the URL once. The database name stops at the first '?', so any query
# string (schema=…, sslmode=…) is dropped; a missing port falls back to 5432.
# Groups: 2=user 3=password 4=host 6=port 7=database.
_db_url_re='^postgres(ql)?://([^:@/]+):([^@]+)@([^:/?]+)(:([0-9]+))?/([^/?]+)'
if [[ "$DBURL" =~ $_db_url_re ]]; then
  export PGUSER="${BASH_REMATCH[2]}"
  export PGPASSWORD="${BASH_REMATCH[3]}"
  export PGHOST="${BASH_REMATCH[4]}"
  export PGPORT="${BASH_REMATCH[6]:-5432}"
  export PGDATABASE="${BASH_REMATCH[7]}"
else
  # Never print the URL itself: it contains the password.
  log "FATAL: DATABASE_URL is empty or not a postgresql://user:pass@host[:port]/db URL"
  exit 1
fi
unset DBURL TOKEN DB _db_url_re
|
||||
|
||||
log "=== CUI matcher started ==="
|
||||
|
||||
# Apply schema (idempotent — generates name_normalized column + indexes)
|
||||
psql -v ON_ERROR_STOP=1 -f /opt/vreaudigital/services/seap-scraper/sql/019_cui_matcher.sql >/dev/null
|
||||
|
||||
#######################################
# Fill in missing CUIs on one table by matching its company-name column
# against firms.entities, in up to three stages (A exact, B trigram, C trigram
# restricted to the same judet). Only rows with cui IS NULL are touched.
#
# Globals:
#   LOG   - log file; psql output is appended to it
#   PG*   - connection settings read by psql from the environment
# Arguments:
#   $1 TABLE      schema-qualified target table (interpolated into SQL — must
#                 be a trusted constant, never user input)
#   $2 NAME_COL   column holding the company name (trusted constant)
#   $3 JUDET_COL  column holding the judet; empty string disables Stage C
#   $4 PRINTABLE  short label used as log prefix
#   $5 RUN_TRGM   "true" (default) to run Stages B+C, anything else to skip
# Returns:
#   0 when every executed psql step succeeded, non-zero otherwise. A failed
#   stage is logged and the remaining stages are still attempted.
#######################################
run_matcher() {
  local TABLE="$1"
  local NAME_COL="$2"
  local JUDET_COL="$3"   # may be empty string if source has no judet
  local PRINTABLE="$4"
  local RUN_TRGM="${5:-true}"   # set to "false" to skip Stages B+C
                                # (e.g. AFIR direct payments where unmatched
                                # rows are individual farmers, not companies)
  local rc=0        # sticky failure flag for the whole run
  local stage_rc    # psql exit status of the stage that just ran

  log "[$PRINTABLE] before: $(psql -At -c "SELECT COUNT(*) FILTER (WHERE cui IS NULL), COUNT(*) FROM $TABLE;" | tr '|' '/')"

  # Stage A: exact normalized match (unique). When multiple firms share the
  # same normalized name (homonyms), we skip — Stage B + judet handles them.
  log "[$PRINTABLE] Stage A: exact normalized match..."
  psql -v ON_ERROR_STOP=1 <<SQL 2>&1 | tee -a "$LOG"
WITH cand AS (
  SELECT t.ctid AS row_ctid,
         firms.normalize_company_name(t.$NAME_COL) AS norm
  FROM $TABLE t
  WHERE t.cui IS NULL
    AND t.$NAME_COL IS NOT NULL
),
matched AS (
  SELECT c.row_ctid,
         MIN(e.cui) AS cui,
         COUNT(*) AS n
  FROM cand c
  JOIN firms.entities e ON e.name_normalized = c.norm
  GROUP BY c.row_ctid
)
UPDATE $TABLE t
SET cui = m.cui,
    cui_match_score = 1.0,
    cui_match_method = 'exact_norm',
    matched_at = now()
FROM matched m
WHERE t.ctid = m.row_ctid
  AND t.cui IS NULL
  AND m.n = 1;
SQL
  # PIPESTATUS[0] is psql's own status; the pipeline status alone would be
  # tee's unless pipefail happens to be enabled by the caller.
  stage_rc=${PIPESTATUS[0]}
  if [ "$stage_rc" -eq 0 ]; then
    log "[$PRINTABLE] Stage A done"
  else
    log "[$PRINTABLE] ERROR: Stage A failed (psql exit=$stage_rc)"
    rc=1
  fi

  # Stage B: pg_trgm similarity. Picks top candidate if score ≥ 0.85 AND
  # gap to second-best ≥ 0.10 (so we know it's unambiguously the best match).
  #
  # Performance: previously O(unmatched_rows × candidate_pool) at default
  # threshold 0.3 — 30+ min on AFIR (493K rows). Three-step pipeline now:
  #   1. Materialize unmatched rows (rowid + norm) into a temp table
  #   2. DISTINCT norms → much smaller trgm input set (BEN 13K→2K, AFIR 493K→274K)
  #   3. SET pg_trgm.similarity_threshold = 0.7 so the gin `%` operator returns
  #      only candidates above the post-filter floor (drops fan-out by ~10×)
  # The 0.85/0.10 accept rule is unchanged and produces identical matches.
  if [ "$RUN_TRGM" != "true" ]; then
    log "[$PRINTABLE] Stage B/C skipped (RUN_TRGM=false) — unmatched rows in this source are individuals, not registered companies"
    log "[$PRINTABLE] after: $(psql -At -c "
      SELECT COUNT(*) FILTER (WHERE cui IS NULL),
             COUNT(*),
             ROUND(100.0*COUNT(*) FILTER (WHERE cui IS NOT NULL) / COUNT(*), 1) || '%'
      FROM $TABLE;" | tr '|' '/')"
    return "$rc"
  fi

  log "[$PRINTABLE] Stage B: pg_trgm fuzzy (score ≥ 0.85, gap ≥ 0.10)..."
  psql -v ON_ERROR_STOP=1 <<SQL 2>&1 | tee -a "$LOG"
SET pg_trgm.similarity_threshold = 0.7;

CREATE TEMP TABLE _sb_rows AS
SELECT t.ctid AS rowid,
       firms.normalize_company_name(t.$NAME_COL) AS norm
FROM $TABLE t
WHERE t.cui IS NULL
  AND t.$NAME_COL IS NOT NULL
  AND length(firms.normalize_company_name(t.$NAME_COL)) >= 5;
CREATE INDEX ON _sb_rows (norm);
ANALYZE _sb_rows;

CREATE TEMP TABLE _sb_norms AS SELECT DISTINCT norm FROM _sb_rows;
ANALYZE _sb_norms;

CREATE TEMP TABLE _sb_resolved AS
WITH ranked AS (
  SELECT c.norm,
         e.cui,
         similarity(e.name_normalized, c.norm) AS sim,
         ROW_NUMBER() OVER (
           PARTITION BY c.norm
           ORDER BY similarity(e.name_normalized, c.norm) DESC, e.cui
         ) AS rn
  FROM _sb_norms c
  JOIN firms.entities e ON e.name_normalized % c.norm
),
top2 AS (
  SELECT norm,
         MAX(sim) FILTER (WHERE rn = 1) AS s1,
         MAX(sim) FILTER (WHERE rn = 2) AS s2,
         MAX(cui) FILTER (WHERE rn = 1) AS cui1
  FROM ranked WHERE rn <= 2
  GROUP BY norm
)
SELECT norm, cui1, s1
FROM top2
WHERE s1 >= 0.85
  AND (s2 IS NULL OR (s1 - s2) >= 0.10);
CREATE INDEX ON _sb_resolved (norm);
ANALYZE _sb_resolved;

UPDATE $TABLE t
SET cui = r.cui1,
    cui_match_score = r.s1,
    cui_match_method = 'trgm_unique',
    matched_at = now()
FROM _sb_rows rw
JOIN _sb_resolved r ON rw.norm = r.norm
WHERE t.ctid = rw.rowid
  AND t.cui IS NULL;

DROP TABLE _sb_rows, _sb_norms, _sb_resolved;
SQL
  stage_rc=${PIPESTATUS[0]}
  if [ "$stage_rc" -eq 0 ]; then
    log "[$PRINTABLE] Stage B done"
  else
    log "[$PRINTABLE] ERROR: Stage B failed (psql exit=$stage_rc)"
    rc=1
  fi

  # Stage C: judet disambiguation when source has a judet column.
  # Multiple candidates above 0.7 → prefer the one whose adr_judet matches.
  # Same dedup-by-(norm,judet) + SET threshold pipeline as Stage B.
  if [ -n "$JUDET_COL" ]; then
    log "[$PRINTABLE] Stage C: judet disambiguation..."
    psql -v ON_ERROR_STOP=1 <<SQL 2>&1 | tee -a "$LOG"
SET pg_trgm.similarity_threshold = 0.7;

CREATE TEMP TABLE _sc_rows AS
SELECT t.ctid AS rowid,
       firms.normalize_company_name(t.$NAME_COL) AS norm,
       firms.normalize_judet(t.$JUDET_COL) AS judet_norm
FROM $TABLE t
WHERE t.cui IS NULL
  AND t.$NAME_COL IS NOT NULL
  AND t.$JUDET_COL IS NOT NULL
  AND length(firms.normalize_company_name(t.$NAME_COL)) >= 5;
CREATE INDEX ON _sc_rows (norm, judet_norm);
ANALYZE _sc_rows;

CREATE TEMP TABLE _sc_keys AS
SELECT DISTINCT norm, judet_norm FROM _sc_rows;
ANALYZE _sc_keys;

CREATE TEMP TABLE _sc_resolved AS
WITH ranked AS (
  SELECT c.norm,
         c.judet_norm,
         e.cui,
         similarity(e.name_normalized, c.norm) AS sim,
         (firms.normalize_judet(e.adr_judet) = c.judet_norm) AS judet_match
  FROM _sc_keys c
  JOIN firms.entities e ON e.name_normalized % c.norm
),
pick AS (
  SELECT DISTINCT ON (norm, judet_norm)
         norm, judet_norm, cui, sim
  FROM ranked
  WHERE judet_match
  ORDER BY norm, judet_norm, sim DESC, cui
)
SELECT * FROM pick WHERE sim >= 0.7;
CREATE INDEX ON _sc_resolved (norm, judet_norm);
ANALYZE _sc_resolved;

UPDATE $TABLE t
SET cui = r.cui,
    cui_match_score = r.sim,
    cui_match_method = 'trgm_judet',
    matched_at = now()
FROM _sc_rows rw
JOIN _sc_resolved r
  ON rw.norm = r.norm AND rw.judet_norm = r.judet_norm
WHERE t.ctid = rw.rowid
  AND t.cui IS NULL;

DROP TABLE _sc_rows, _sc_keys, _sc_resolved;
SQL
    stage_rc=${PIPESTATUS[0]}
    if [ "$stage_rc" -eq 0 ]; then
      log "[$PRINTABLE] Stage C done"
    else
      log "[$PRINTABLE] ERROR: Stage C failed (psql exit=$stage_rc)"
      rc=1
    fi
  fi

  log "[$PRINTABLE] after: $(psql -At -c "
    SELECT COUNT(*) FILTER (WHERE cui IS NULL),
           COUNT(*),
           ROUND(100.0*COUNT(*) FILTER (WHERE cui IS NOT NULL) / COUNT(*), 1) || '%'
    FROM $TABLE;" | tr '|' '/')"
  log "[$PRINTABLE] by method:"
  psql -At -F'|' -c "
    SELECT cui_match_method, COUNT(*)
    FROM $TABLE
    GROUP BY 1 ORDER BY 2 DESC NULLS LAST;" 2>&1 | tee -a "$LOG"
  stage_rc=${PIPESTATUS[0]}
  if [ "$stage_rc" -ne 0 ]; then
    rc=1
  fi
  return "$rc"
}
|
||||
|
||||
# Run every source even if an earlier one fails, but remember the failure so
# the script's exit status tells cron/systemd that something went wrong
# (previously the status was always that of the final log call).
MATCH_RC=0

run_matcher "fonduri.beneficiar_anunt" "beneficiar_name" "beneficiar_judet" "BEN_PRIVAT" true || MATCH_RC=1

# AFIR: skip trgm — unmatched rows are individual farmers (popa gheorghe,
# radu vasile, …) receiving FEADR direct payments. They have no CUI and
# never appear in firms.entities (private company registry). Running trgm
# on 274K distinct names against 4M entities would take 30+ hours for ~0 gain.
# NOTE(review): "localitate" is passed in the judet slot; it is only read by
# Stage C, which is skipped here because RUN_TRGM=false.
run_matcher "fonduri.afir_plati" "beneficiar_name" "localitate" "AFIR" false || MATCH_RC=1

if [ "$MATCH_RC" -ne 0 ]; then
  log "ERROR: at least one matcher run reported a failure"
fi
log "=== CUI matcher done ==="
exit "$MATCH_RC"
|
||||
Executable
+79
@@ -0,0 +1,79 @@
|
||||
#!/bin/bash
|
||||
# Nightly refresh of seap materialized views.
|
||||
# Run from satra cron at 04:00 — peak DB idle window.
|
||||
#
|
||||
# Sources DATABASE_URL via Infisical Machine Identity (same as the
|
||||
# vreaudigital container). Never echoes the value.
|
||||
|
||||
set -euo pipefail

LOG=/var/log/vreaudigital-mvs.log

# Timestamped logger: prints the line to stdout and appends it to $LOG.
log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG"; }

log "=== Materialized view refresh started ==="

if [ ! -f /opt/vreaudigital/.infisical-mi ]; then
  log "FATAL: /opt/vreaudigital/.infisical-mi missing"
  exit 1
fi

# shellcheck disable=SC1091
source /opt/vreaudigital/.infisical-mi

TOKEN=$(infisical login \
  --method=universal-auth \
  --domain="$INFISICAL_API_URL" \
  --client-id="$INFISICAL_CLIENT_ID" \
  --client-secret="$INFISICAL_CLIENT_SECRET" \
  --silent --plain)

DATABASE_URL=$(infisical run \
  --domain="$INFISICAL_API_URL" \
  --projectId="$INFISICAL_PROJECT_ID" \
  --env="$INFISICAL_ENV" \
  --path="$INFISICAL_PATH" \
  --silent --token="$TOKEN" \
  -- sh -c 'echo "$DATABASE_URL"')

# Parse URL into PG* env vars and discard URL — psql with the URL on the command
# line leaks the password to anyone running `ps aux` (incident 2026-05-07).
#
# One anchored regex replaces the former per-field sed calls. sed echoes its
# input unchanged when a pattern does not match, so a URL without an explicit
# port used to end up — password and all — in PGPORT, and `export X=$(...)`
# hid any failure from `set -e`. Query parameters (e.g. ?schema=...) are
# ignored. NOTE: percent-encoded characters are not decoded (same as before).
URL_RE='^postgres(ql)?://([^:@/]+):([^@]+)@([^:/?]+)(:([0-9]+))?/([^?]+)'
if [[ "$DATABASE_URL" =~ $URL_RE ]]; then
  export PGUSER="${BASH_REMATCH[2]}"
  export PGPASSWORD="${BASH_REMATCH[3]}"
  export PGHOST="${BASH_REMATCH[4]}"
  export PGPORT="${BASH_REMATCH[6]:-5432}"   # default port when the URL omits it
  export PGDATABASE="${BASH_REMATCH[7]}"
else
  unset DATABASE_URL TOKEN
  log "FATAL: DATABASE_URL is not of the form postgresql://user:pass@host[:port]/db"
  exit 1
fi
unset DATABASE_URL TOKEN URL_RE

START=$(date +%s)
# ON_ERROR_STOP + pipefail + set -e: the first failing statement aborts the
# whole script, so "Done" is only logged after a fully successful refresh.
psql -v ON_ERROR_STOP=1 <<'SQL' 2>&1 | tee -a "$LOG"
\timing on
REFRESH MATERIALIZED VIEW CONCURRENTLY seap.uat_procurement_stats;
REFRESH MATERIALIZED VIEW CONCURRENTLY seap.uat_kpi;
REFRESH MATERIALIZED VIEW CONCURRENTLY seap.mv_authority_concentration;
REFRESH MATERIALIZED VIEW CONCURRENTLY seap.mv_cpv_median_value;
REFRESH MATERIALIZED VIEW CONCURRENTLY seap.mv_top_cpv_divisions;
REFRESH MATERIALIZED VIEW CONCURRENTLY seap.mv_top_suppliers;
REFRESH MATERIALIZED VIEW CONCURRENTLY seap.mv_top_authorities;
REFRESH MATERIALIZED VIEW CONCURRENTLY seap.mv_recurrent_pairs;
REFRESH MATERIALIZED VIEW CONCURRENTLY seap.mv_supplier_cpv_share;
-- Cross-source MVs (added 2026-05-11 after backfills)
REFRESH MATERIALIZED VIEW CONCURRENTLY cnsc.mv_per_authority_cui;
REFRESH MATERIALIZED VIEW CONCURRENTLY cnsc.mv_per_contestator_cui;
REFRESH MATERIALIZED VIEW CONCURRENTLY anre.mv_licente_per_cui;
REFRESH MATERIALIZED VIEW CONCURRENTLY ancom.mv_operatori_per_cui;
REFRESH MATERIALIZED VIEW CONCURRENTLY asf.mv_entitati_per_cui;
REFRESH MATERIALIZED VIEW CONCURRENTLY aaas.mv_per_cui;
-- Red-flags KPI snapshot (043_red_flags_kpi_snapshot.sql)
SELECT public_kpi.refresh_red_flags_counts();
-- Red-flags previews snapshot (044_red_flags_previews_snapshot.sql) — top-5
-- rows per recipe; landing reads as a single SELECT instead of awaiting 14
-- live cross-source queries (~17s → ~5ms).
SELECT public_kpi.refresh_red_flags_previews();
-- Cauta default-browse facets+totals snapshot (046) — short-circuits the 6
-- parallel facet aggregates when no filter is set (~1.9s → ~50ms).
SELECT public_kpi.refresh_cauta_defaults();
SQL
END=$(date +%s)

log "=== Done in $((END-START))s ==="
|
||||
Executable
+87
@@ -0,0 +1,87 @@
|
||||
#!/bin/bash
|
||||
# AAAS — Autoritatea pentru Administrarea Activelor Statului.
|
||||
# Scrapes the AAAS portfolio of state-owned companies from
|
||||
# https://www.aaas.gov.ro/.../1-9-3-companii-sub-autoritatea-aaas/.
|
||||
#
|
||||
# Mirrors scrape-anre.sh / scrape-bugetar.sh pattern: Infisical Machine
|
||||
# Identity → env-file → docker run --env-file (NEVER -e $VAR), file deleted
|
||||
# post-launch.
|
||||
#
|
||||
# Idempotent (UPSERT on cui PK). Safe to run from cron.
|
||||
#
|
||||
# AAAS publishes ~12 active-portfolio companies as of 2026-05-10. The
|
||||
# "vânzări acțiuni" + "valorificare creanțe" sections are under construction;
|
||||
# the scraper logs their state but produces no rows from them yet.
|
||||
#
|
||||
# Env knobs:
|
||||
# LIMIT=0 (default: 0 = full = all 12)
|
||||
#
|
||||
# Run:
|
||||
# sudo /opt/vreaudigital/services/seap-scraper/cron/scrape-aaas.sh
|
||||
# sudo LIMIT=3 /opt/vreaudigital/services/seap-scraper/cron/scrape-aaas.sh # smoke
|
||||
set -euo pipefail

LIMIT="${LIMIT:-0}"
LOG=/var/log/vreaudigital-aaas.log
ENVF=""   # path of the temporary docker --env-file; set once mktemp has run

# Timestamped logger: prints the line to stdout and appends it to $LOG.
log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG"; }

# Safety net for the secret-bearing env file: under `set -e` any failing step
# between mktemp and the explicit rm below (Infisical fetch, cd, npm install,
# docker run) would otherwise leave DATABASE_URL sitting in /tmp.
cleanup() {
  if [ -n "$ENVF" ]; then
    rm -f -- "$ENVF"
  fi
}
trap cleanup EXIT

log "=== AAAS scrape started (limit=$LIMIT) ==="

# Skip this tick if the previous run is still alive; otherwise clear any
# stopped container left behind under the same name.
if docker ps --filter name=vreaudigital-aaas --format '{{.Names}}' | grep -q '^vreaudigital-aaas$'; then
  log "WARN: vreaudigital-aaas already running, skipping this tick"
  exit 0
fi
docker rm -f vreaudigital-aaas 2>/dev/null || true

# ── Fetch DATABASE_URL via Infisical Machine Identity ──
source /opt/vreaudigital/.infisical-mi
TOKEN=$(infisical login --method=universal-auth \
  --domain="$INFISICAL_API_URL" \
  --client-id="$INFISICAL_CLIENT_ID" \
  --client-secret="$INFISICAL_CLIENT_SECRET" \
  --silent --plain)

umask 077
ENVF=$(mktemp /tmp/.vreaudigital-aaas-env.XXXXXX)
DBURL=$(infisical secrets get DATABASE_URL \
  --domain="$INFISICAL_API_URL" \
  --projectId="$INFISICAL_PROJECT_ID" \
  --env="$INFISICAL_ENV" --path="$INFISICAL_PATH" \
  --token="$TOKEN" --plain --silent)
echo "DATABASE_URL=$DBURL" > "$ENVF"
unset DBURL TOKEN

cd /opt/vreaudigital/services/seap-scraper

if [ ! -d node_modules/tsx ]; then
  log "Installing seap-scraper deps..."
  docker run --rm -v "$(pwd):/work" -w /work --user "$(id -u):$(id -g)" \
    node:22-alpine npm install --omit=optional 2>&1 | tee -a "$LOG" >/dev/null
fi

# Optional CLI arguments as an array (no reliance on word splitting). A
# non-numeric LIMIT makes the test fail quietly and is treated as "no limit".
EXTRA_ARGS=()
if [ "$LIMIT" -gt 0 ] 2>/dev/null; then
  EXTRA_ARGS+=("--limit=$LIMIT")
fi

# ${arr[@]+...} keeps an empty array from tripping `set -u` on bash < 4.4.
CID=$(docker run -d \
  --name vreaudigital-aaas \
  --network host \
  --env-file "$ENVF" \
  -v "$(pwd):/work" \
  -w /work \
  --user "$(id -u):$(id -g)" \
  --restart no \
  node:22-alpine \
  npx tsx src/scrape-aaas.ts ${EXTRA_ARGS[@]+"${EXTRA_ARGS[@]}"})
log "container started: $CID"

sleep 3
rm -f "$ENVF"
log "envfile cleaned"

docker wait vreaudigital-aaas >/dev/null
EXIT_CODE=$(docker inspect -f '{{.State.ExitCode}}' vreaudigital-aaas 2>/dev/null || echo "?")
docker logs vreaudigital-aaas 2>&1 | tail -25 | tee -a "$LOG"
log "=== AAAS scrape done (exit=$EXIT_CODE) ==="

# "?" (inspect failed) is not a valid argument for `exit`; report it as a
# plain failure instead of letting the exit builtin error out.
case "$EXIT_CODE" in
  '' | *[!0-9]*) exit 1 ;;
esac
exit "$EXIT_CODE"
|
||||
+82
@@ -0,0 +1,82 @@
|
||||
#!/bin/bash
|
||||
# AEP donatii scraper — runs scrape-aep-donatii.ts in a node:22-alpine container.
|
||||
# Mirrors enrich-anaf.sh / scrape-regas.sh: Infisical Machine Identity → env-file
|
||||
# → docker run --env-file (NEVER -e $VAR), file deleted post-launch.
|
||||
#
|
||||
# Idempotent (uses ON CONFLICT (source_hash) DO UPDATE). Safe to run from cron.
|
||||
#
|
||||
# Args via env:
|
||||
# TABLE=pj|pf|rvc|all (default: all — fetches all 3 datasets sequentially)
|
||||
# LIMIT=<int> (default: 0 = no limit)
|
||||
|
||||
set -euo pipefail

TABLE="${TABLE:-all}"
LIMIT="${LIMIT:-0}"
LOG=/var/log/vreaudigital-aep.log
ENVF=""   # path of the temporary docker --env-file; set once mktemp has run

# Timestamped logger: prints the line to stdout and appends it to $LOG.
log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG"; }

# Safety net for the secret-bearing env file: under `set -e` any failing step
# between mktemp and the explicit rm below (Infisical fetch, cd, npm install,
# docker run) would otherwise leave DATABASE_URL sitting in /tmp.
cleanup() {
  if [ -n "$ENVF" ]; then
    rm -f -- "$ENVF"
  fi
}
trap cleanup EXIT

log "=== AEP donatii scrape started (table=$TABLE limit=$LIMIT) ==="

# Skip this tick if the previous run is still alive; otherwise clear any
# stopped container left behind under the same name.
if docker ps --filter name=vreaudigital-aep --format '{{.Names}}' | grep -q '^vreaudigital-aep$'; then
  log "WARN: vreaudigital-aep already running, skipping this tick"
  exit 0
fi
docker rm -f vreaudigital-aep 2>/dev/null || true

# ── Fetch DATABASE_URL via Infisical Machine Identity ──
source /opt/vreaudigital/.infisical-mi
TOKEN=$(infisical login --method=universal-auth \
  --domain="$INFISICAL_API_URL" \
  --client-id="$INFISICAL_CLIENT_ID" \
  --client-secret="$INFISICAL_CLIENT_SECRET" \
  --silent --plain)

umask 077
ENVF=$(mktemp /tmp/.vreaudigital-env.XXXXXX)
DBURL=$(infisical secrets get DATABASE_URL \
  --domain="$INFISICAL_API_URL" \
  --projectId="$INFISICAL_PROJECT_ID" \
  --env="$INFISICAL_ENV" --path="$INFISICAL_PATH" \
  --token="$TOKEN" --plain --silent)
echo "DATABASE_URL=$DBURL" > "$ENVF"
unset DBURL TOKEN

# ── Launch detached docker container ──
cd /opt/vreaudigital/services/seap-scraper

if [ ! -d node_modules/tsx ]; then
  log "Installing seap-scraper deps..."
  docker run --rm -v "$(pwd):/work" -w /work --user "$(id -u):$(id -g)" \
    node:22-alpine npm install --omit=optional 2>&1 | tee -a "$LOG" >/dev/null
fi

EXTRA_ARGS=()
if [ "$LIMIT" != "0" ]; then
  EXTRA_ARGS+=("--limit=$LIMIT")
fi

# With the default LIMIT=0 the array is empty; a bare "${EXTRA_ARGS[@]}" then
# aborts with "unbound variable" under `set -u` on bash < 4.4. The
# ${arr[@]+...} form expands to nothing in that case.
CID=$(docker run -d \
  --name vreaudigital-aep \
  --network host \
  --env-file "$ENVF" \
  -v "$(pwd):/work" \
  -w /work \
  --user "$(id -u):$(id -g)" \
  --restart no \
  node:22-alpine \
  npx tsx src/scrape-aep-donatii.ts \
  --table="$TABLE" \
  ${EXTRA_ARGS[@]+"${EXTRA_ARGS[@]}"})
log "container started: $CID"

sleep 3
rm -f "$ENVF"
log "envfile cleaned"

docker wait vreaudigital-aep >/dev/null
EXIT_CODE=$(docker inspect -f '{{.State.ExitCode}}' vreaudigital-aep 2>/dev/null || echo "?")
docker logs vreaudigital-aep 2>&1 | tail -20 | tee -a "$LOG"
docker rm -f vreaudigital-aep 2>/dev/null || true
log "=== AEP donatii scrape done (exit=$EXIT_CODE) ==="

# "?" (inspect failed) is not a valid argument for `exit`; report it as a
# plain failure instead of letting the exit builtin error out.
case "$EXIT_CODE" in
  '' | *[!0-9]*) exit 1 ;;
esac
exit "$EXIT_CODE"
|
||||
+125
@@ -0,0 +1,125 @@
|
||||
#!/bin/bash
|
||||
# ANAF datornici — LIVE scraper wrapper (Cloudflare Turnstile via 2captcha).
|
||||
#
|
||||
# Mirrors scrape-cnsc.sh / scrape-anaf-datornici.sh pattern but runs a Python
|
||||
# script (not TSX) because the live scraper uses requests + psycopg2 and shares
|
||||
# nothing with the data.gov.ro one-shot TS importer.
|
||||
#
|
||||
# Infisical Machine Identity → env-file (DATABASE_URL + TWOCAPTCHA_KEY) →
|
||||
# docker run --env-file (NEVER -e $VAR), file deleted post-launch.
|
||||
#
|
||||
# Idempotent (UPSERT on cui+publication_date). Designed to be triggered
|
||||
# quarterly by vreaudigital-anaf-datornici.timer.
|
||||
#
|
||||
# ⚠️ COST: each run spends real money via 2captcha (~$0.50-3 per quarterly
|
||||
# tick, ~$60-100 one-time for 10-year backfill). Do NOT enable the systemd
|
||||
# timer until TWOCAPTCHA_KEY is funded — see HANDOFF-anaf-datornici-2captcha.md.
|
||||
#
|
||||
# Env knobs:
|
||||
# DRY_RUN=1 — parse-only, zero spend, zero DB writes.
|
||||
# BACKFILL_FROM=2016-Q1 — iterate from quarter X through current.
|
||||
# CATEGORIES=mari,mijlocii — subset of {mari,mijlocii,mici,institutii_publice,persoane_fizice}.
|
||||
# INCLUDE_LISTA_ALBA=1 — also scrape anaf.lista_alba (separate endpoint).
|
||||
#
|
||||
# Run:
|
||||
# sudo /opt/vreaudigital/services/seap-scraper/cron/scrape-anaf-datornici-live.sh
|
||||
# sudo DRY_RUN=1 /opt/vreaudigital/services/seap-scraper/cron/scrape-anaf-datornici-live.sh
|
||||
# sudo BACKFILL_FROM=2016-Q1 INCLUDE_LISTA_ALBA=1 /opt/.../scrape-anaf-datornici-live.sh
|
||||
|
||||
set -euo pipefail

DRY_RUN="${DRY_RUN:-0}"
BACKFILL_FROM="${BACKFILL_FROM:-}"
CATEGORIES="${CATEGORIES:-}"
INCLUDE_LISTA_ALBA="${INCLUDE_LISTA_ALBA:-0}"
LOG=/var/log/vreaudigital-anaf-datornici.log
ENVF=""   # path of the temporary docker --env-file; set once mktemp has run

# Timestamped logger: prints the line to stdout and appends it to $LOG.
log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG"; }

# Safety net for the env file, which holds DATABASE_URL and TWOCAPTCHA_KEY:
# under `set -e` any failing step between mktemp and the explicit rm below
# would otherwise leave those secrets sitting in /tmp.
cleanup() {
  if [ -n "$ENVF" ]; then
    rm -f -- "$ENVF"
  fi
}
trap cleanup EXIT

log "=== ANAF datornici LIVE scrape started (dry_run=$DRY_RUN backfill=$BACKFILL_FROM lista_alba=$INCLUDE_LISTA_ALBA) ==="

# Skip this tick if the previous run is still alive; otherwise clear any
# stopped container left behind under the same name.
if docker ps --filter name=vreaudigital-anaf-datornici-live --format '{{.Names}}' \
  | grep -q '^vreaudigital-anaf-datornici-live$'; then
  log "WARN: vreaudigital-anaf-datornici-live already running, skipping this tick"
  exit 0
fi
docker rm -f vreaudigital-anaf-datornici-live 2>/dev/null || true

# ── Fetch DATABASE_URL + TWOCAPTCHA_KEY via Infisical Machine Identity ──
source /opt/vreaudigital/.infisical-mi
TOKEN=$(infisical login --method=universal-auth \
  --domain="$INFISICAL_API_URL" \
  --client-id="$INFISICAL_CLIENT_ID" \
  --client-secret="$INFISICAL_CLIENT_SECRET" \
  --silent --plain)

umask 077
ENVF=$(mktemp /tmp/.vreaudigital-anaf-datornici-live-env.XXXXXX)

DBURL=$(infisical secrets get DATABASE_URL \
  --domain="$INFISICAL_API_URL" \
  --projectId="$INFISICAL_PROJECT_ID" \
  --env="$INFISICAL_ENV" --path="$INFISICAL_PATH" \
  --token="$TOKEN" --plain --silent)
echo "DATABASE_URL=$DBURL" > "$ENVF"
unset DBURL

# TWOCAPTCHA_KEY: required unless DRY_RUN=1. If missing, abort with a clear
# pointer to the handoff doc — DO NOT silently run (would still hit ANAF page).
if [ "$DRY_RUN" != "1" ]; then
  # Try primary path first ($INFISICAL_PATH = /vreaudigital), fall back to root.
  # Some users add TWOCAPTCHA_KEY at root path / (less project-namespaced).
  for try_path in "$INFISICAL_PATH" "/"; do
    # `|| true`: a miss on one path must not abort under set -e.
    TWOCAPTCHA_KEY=$(infisical secrets get TWOCAPTCHA_KEY \
      --domain="$INFISICAL_API_URL" \
      --projectId="$INFISICAL_PROJECT_ID" \
      --env="$INFISICAL_ENV" --path="$try_path" \
      --token="$TOKEN" --plain --silent 2>/dev/null || true)
    if [ -n "${TWOCAPTCHA_KEY:-}" ]; then
      break
    fi
  done
  if [ -z "${TWOCAPTCHA_KEY:-}" ]; then
    log "ERROR: TWOCAPTCHA_KEY missing in Infisical (checked $INFISICAL_PATH + /) — see HANDOFF-anaf-datornici-2captcha.md"
    log "       Add via: NEW SECRET PROTOCOL (Infisical, either path /vreaudigital or /)"
    rm -f "$ENVF"
    exit 3
  fi
  echo "TWOCAPTCHA_KEY=$TWOCAPTCHA_KEY" >> "$ENVF"
  unset TWOCAPTCHA_KEY
fi
unset TOKEN

# Pass-through env knobs
echo "DRY_RUN=$DRY_RUN" >> "$ENVF"
if [ -n "$BACKFILL_FROM" ]; then
  echo "BACKFILL_FROM=$BACKFILL_FROM" >> "$ENVF"
fi
if [ -n "$CATEGORIES" ]; then
  echo "CATEGORIES=$CATEGORIES" >> "$ENVF"
fi
if [ "$INCLUDE_LISTA_ALBA" = "1" ]; then
  echo "INCLUDE_LISTA_ALBA=1" >> "$ENVF"
fi
echo "ANAF_DATORNICI_LOG=/work/.log/anaf-datornici.log" >> "$ENVF"

cd /opt/vreaudigital/services/seap-scraper

# Ensure /work/.log is writable inside container (host bind-mount); the
# Python process also tees to stdout → docker logs → journald.
mkdir -p .log

CID=$(docker run -d \
  --name vreaudigital-anaf-datornici-live \
  --network host \
  --env-file "$ENVF" \
  -v "$(pwd):/work" \
  -w /work \
  --user "$(id -u):$(id -g)" \
  --restart no \
  python:3.12-slim \
  bash -c "pip install --quiet --no-cache-dir psycopg2-binary requests && python3 scrapers/anaf_datornici/scraper.py")
log "container started: $CID"

sleep 3
rm -f "$ENVF"
log "envfile cleaned"

docker wait vreaudigital-anaf-datornici-live >/dev/null
EXIT_CODE=$(docker inspect -f '{{.State.ExitCode}}' vreaudigital-anaf-datornici-live 2>/dev/null || echo "?")
docker logs vreaudigital-anaf-datornici-live 2>&1 | tail -30 | tee -a "$LOG"
log "=== ANAF datornici LIVE scrape done (exit=$EXIT_CODE) ==="

# "?" (inspect failed) is not a valid argument for `exit`; report it as a
# plain failure instead of letting the exit builtin error out.
case "$EXIT_CODE" in
  '' | *[!0-9]*) exit 1 ;;
esac
exit "$EXIT_CODE"
|
||||
+84
@@ -0,0 +1,84 @@
|
||||
#!/bin/bash
|
||||
# ANAF datornici scraper — runs scrape-anaf-datornici.ts in node:22-alpine.
|
||||
# Mirrors enrich-anaf.sh / scrape-regas.sh pattern: Infisical Machine Identity
|
||||
# → env-file → docker run --env-file (NEVER -e $VAR), file deleted post-launch.
|
||||
#
|
||||
# Default source: data.gov.ro Q1-2016 snapshot (only public bulk source available;
|
||||
# anaf.ro/restante/ live is CAPTCHA-blocked — see ANAF-DATORNICI-RECIPES.md).
|
||||
#
|
||||
# Idempotent (uses ON CONFLICT (cui, publication_date) DO UPDATE). Safe to run
|
||||
# from cron, but in practice this is a one-shot until live scraping unlocks.
|
||||
|
||||
set -euo pipefail

SOURCE="${SOURCE:-datagov2016}"
DRY_RUN="${DRY_RUN:-0}"
LOG=/var/log/vreaudigital-anaf-datornici.log
ENVF=""   # path of the temporary docker --env-file; set once mktemp has run

# Timestamped logger: prints the line to stdout and appends it to $LOG.
log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG"; }

# Safety net for the secret-bearing env file: under `set -e` any failing step
# between mktemp and the explicit rm below (Infisical fetch, cd, npm install,
# docker run) would otherwise leave DATABASE_URL sitting in /tmp.
cleanup() {
  if [ -n "$ENVF" ]; then
    rm -f -- "$ENVF"
  fi
}
trap cleanup EXIT

log "=== ANAF datornici scrape started (source=$SOURCE dry-run=$DRY_RUN) ==="

# Skip this tick if the previous run is still alive; otherwise clear any
# stopped container left behind under the same name. The anchored grep keeps
# the "-live" sibling container from matching.
if docker ps --filter name=vreaudigital-anaf-datornici --format '{{.Names}}' \
  | grep -q '^vreaudigital-anaf-datornici$'; then
  log "WARN: vreaudigital-anaf-datornici already running, skipping this tick"
  exit 0
fi
docker rm -f vreaudigital-anaf-datornici 2>/dev/null || true

# ── Fetch DATABASE_URL via Infisical Machine Identity ──
source /opt/vreaudigital/.infisical-mi
TOKEN=$(infisical login --method=universal-auth \
  --domain="$INFISICAL_API_URL" \
  --client-id="$INFISICAL_CLIENT_ID" \
  --client-secret="$INFISICAL_CLIENT_SECRET" \
  --silent --plain)

umask 077
ENVF=$(mktemp /tmp/.vreaudigital-env.XXXXXX)
DBURL=$(infisical secrets get DATABASE_URL \
  --domain="$INFISICAL_API_URL" \
  --projectId="$INFISICAL_PROJECT_ID" \
  --env="$INFISICAL_ENV" --path="$INFISICAL_PATH" \
  --token="$TOKEN" --plain --silent)
echo "DATABASE_URL=$DBURL" > "$ENVF"
unset DBURL TOKEN

# ── Launch detached docker container ──
cd /opt/vreaudigital/services/seap-scraper

if [ ! -d node_modules/tsx ]; then
  log "Installing seap-scraper deps..."
  docker run --rm -v "$(pwd):/work" -w /work --user "$(id -u):$(id -g)" \
    node:22-alpine npm install --omit=optional 2>&1 | tee -a "$LOG" >/dev/null
fi

# Optional flag kept in an array so no unquoted expansion is needed.
DRY_ARGS=()
if [ "$DRY_RUN" = "1" ]; then
  DRY_ARGS+=("--dry-run")
fi

# ${arr[@]+...} keeps an empty array from tripping `set -u` on bash < 4.4.
CID=$(docker run -d \
  --name vreaudigital-anaf-datornici \
  --network host \
  --env-file "$ENVF" \
  -v "$(pwd):/work" \
  -w /work \
  --user "$(id -u):$(id -g)" \
  --restart no \
  node:22-alpine \
  npx tsx src/scrape-anaf-datornici.ts \
  --source="$SOURCE" \
  ${DRY_ARGS[@]+"${DRY_ARGS[@]}"})
log "container started: $CID"

sleep 3
rm -f "$ENVF"
log "envfile cleaned"

docker wait vreaudigital-anaf-datornici >/dev/null
EXIT_CODE=$(docker inspect -f '{{.State.ExitCode}}' vreaudigital-anaf-datornici 2>/dev/null || echo "?")
docker logs vreaudigital-anaf-datornici 2>&1 | tail -15 | tee -a "$LOG"
log "=== ANAF datornici scrape done (exit=$EXIT_CODE) ==="

# "?" (inspect failed) is not a valid argument for `exit`; report it as a
# plain failure instead of letting the exit builtin error out.
case "$EXIT_CODE" in
  '' | *[!0-9]*) exit 1 ;;
esac
exit "$EXIT_CODE"
|
||||
+102
@@ -0,0 +1,102 @@
|
||||
#!/bin/bash
|
||||
# ANAF lista albă — LIVE scraper wrapper (JCaptcha via 2captcha).
|
||||
#
|
||||
# Mirrors scrape-anaf-datornici-live.sh exactly. Difference is endpoint
|
||||
# (/restante/listaalba.xhtml) and target table (anaf.lista_alba — 3 cols/row).
|
||||
#
|
||||
# Infisical Machine Identity → env-file (DATABASE_URL + TWOCAPTCHA_KEY) →
|
||||
# docker run --env-file (NEVER -e $VAR), file deleted post-launch.
|
||||
#
|
||||
# Idempotent (UPSERT on cui+publication_date). Designed to be triggered
|
||||
# quarterly by vreaudigital-anaf-lista-alba.timer (offset +1h vs datornici).
|
||||
#
|
||||
# Env knobs:
|
||||
# DRY_RUN=1 — parse-only, zero spend, zero DB writes.
|
||||
#
|
||||
# Run:
|
||||
# sudo /opt/vreaudigital/services/seap-scraper/cron/scrape-anaf-lista-alba.sh
|
||||
# sudo DRY_RUN=1 /opt/vreaudigital/services/seap-scraper/cron/scrape-anaf-lista-alba.sh
|
||||
|
||||
set -euo pipefail

DRY_RUN="${DRY_RUN:-0}"
LOG=/var/log/vreaudigital-anaf-lista-alba.log
ENVF=""   # path of the temporary docker --env-file; set once mktemp has run

# Timestamped logger: prints the line to stdout and appends it to $LOG.
log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG"; }

# Safety net for the env file, which holds DATABASE_URL and TWOCAPTCHA_KEY:
# under `set -e` any failing step between mktemp and the explicit rm below
# would otherwise leave those secrets sitting in /tmp.
cleanup() {
  if [ -n "$ENVF" ]; then
    rm -f -- "$ENVF"
  fi
}
trap cleanup EXIT

log "=== ANAF lista_alba LIVE scrape started (dry_run=$DRY_RUN) ==="

# Skip this tick if the previous run is still alive; otherwise clear any
# stopped container left behind under the same name.
if docker ps --filter name=vreaudigital-anaf-lista-alba-live --format '{{.Names}}' \
  | grep -q '^vreaudigital-anaf-lista-alba-live$'; then
  log "WARN: vreaudigital-anaf-lista-alba-live already running, skipping this tick"
  exit 0
fi
docker rm -f vreaudigital-anaf-lista-alba-live 2>/dev/null || true

# ── Fetch DATABASE_URL + TWOCAPTCHA_KEY via Infisical Machine Identity ──
source /opt/vreaudigital/.infisical-mi
TOKEN=$(infisical login --method=universal-auth \
  --domain="$INFISICAL_API_URL" \
  --client-id="$INFISICAL_CLIENT_ID" \
  --client-secret="$INFISICAL_CLIENT_SECRET" \
  --silent --plain)

umask 077
ENVF=$(mktemp /tmp/.vreaudigital-anaf-lista-alba-live-env.XXXXXX)

DBURL=$(infisical secrets get DATABASE_URL \
  --domain="$INFISICAL_API_URL" \
  --projectId="$INFISICAL_PROJECT_ID" \
  --env="$INFISICAL_ENV" --path="$INFISICAL_PATH" \
  --token="$TOKEN" --plain --silent)
echo "DATABASE_URL=$DBURL" > "$ENVF"
unset DBURL

# TWOCAPTCHA_KEY is only needed for a real (non-dry) run. Look under the
# project path first, then under the root path.
if [ "$DRY_RUN" != "1" ]; then
  for try_path in "$INFISICAL_PATH" "/"; do
    # `|| true`: a miss on one path must not abort under set -e.
    TWOCAPTCHA_KEY=$(infisical secrets get TWOCAPTCHA_KEY \
      --domain="$INFISICAL_API_URL" \
      --projectId="$INFISICAL_PROJECT_ID" \
      --env="$INFISICAL_ENV" --path="$try_path" \
      --token="$TOKEN" --plain --silent 2>/dev/null || true)
    if [ -n "${TWOCAPTCHA_KEY:-}" ]; then
      break
    fi
  done
  if [ -z "${TWOCAPTCHA_KEY:-}" ]; then
    log "ERROR: TWOCAPTCHA_KEY missing in Infisical (checked $INFISICAL_PATH + /)"
    rm -f "$ENVF"
    exit 3
  fi
  echo "TWOCAPTCHA_KEY=$TWOCAPTCHA_KEY" >> "$ENVF"
  unset TWOCAPTCHA_KEY
fi
unset TOKEN

echo "DRY_RUN=$DRY_RUN" >> "$ENVF"
echo "ANAF_LISTA_ALBA_LOG=/work/.log/anaf-lista-alba.log" >> "$ENVF"

cd /opt/vreaudigital/services/seap-scraper

# The container writes its log under /work/.log (host bind-mount).
mkdir -p .log

CID=$(docker run -d \
  --name vreaudigital-anaf-lista-alba-live \
  --network host \
  --env-file "$ENVF" \
  -v "$(pwd):/work" \
  -w /work \
  --user "$(id -u):$(id -g)" \
  --restart no \
  python:3.12-slim \
  bash -c "pip install --quiet --no-cache-dir psycopg2-binary requests && python3 scrapers/anaf_lista_alba/scraper.py")
log "container started: $CID"

sleep 3
rm -f "$ENVF"
log "envfile cleaned"

docker wait vreaudigital-anaf-lista-alba-live >/dev/null
EXIT_CODE=$(docker inspect -f '{{.State.ExitCode}}' vreaudigital-anaf-lista-alba-live 2>/dev/null || echo "?")
docker logs vreaudigital-anaf-lista-alba-live 2>&1 | tail -30 | tee -a "$LOG"
log "=== ANAF lista_alba LIVE scrape done (exit=$EXIT_CODE) ==="

# "?" (inspect failed) is not a valid argument for `exit`; report it as a
# plain failure instead of letting the exit builtin error out.
case "$EXIT_CODE" in
  '' | *[!0-9]*) exit 1 ;;
esac
exit "$EXIT_CODE"
|
||||
Executable
+86
@@ -0,0 +1,86 @@
|
||||
#!/bin/bash
# ANCOM — Autoritatea Națională pentru Administrare și Reglementare în
# Comunicații. Scrapes the public registry of authorized communications
# providers from ancom.ro.
#
# Mirrors scrape-anre.sh / scrape-bugetar.sh pattern: Infisical Machine
# Identity → env-file → docker run --env-file (NEVER -e $VAR), file deleted
# post-launch. An EXIT trap also removes the env-file if the script aborts
# before the regular cleanup step.
#
# Idempotent (UPSERT on ancom_id). Safe to run from cron.
#
# Env knobs:
#   LIMIT=0       (default: 0 = full ~570 operators)
#   MAX_PAGES=0   (default: 0 = all list pages)
#
# Run:
#   sudo MAX_PAGES=2 /opt/vreaudigital/services/seap-scraper/cron/scrape-ancom.sh   # smoke test (2 pages = 20 ids)
#   sudo /opt/vreaudigital/services/seap-scraper/cron/scrape-ancom.sh               # full
#
# Exit status: the container's exit code, or 1 if it cannot be determined.
set -euo pipefail

LIMIT="${LIMIT:-0}"
MAX_PAGES="${MAX_PAGES:-0}"
LOG=/var/log/vreaudigital-ancom.log

# Timestamped message to stdout and appended to $LOG.
log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG"; }

log "=== ANCOM scrape started (limit=$LIMIT max_pages=$MAX_PAGES) ==="

# Skip this tick if the previous run's container is still running; otherwise
# remove any stopped leftover so the name can be reused.
if docker ps --filter name=vreaudigital-ancom --format '{{.Names}}' | grep -q '^vreaudigital-ancom$'; then
  log "WARN: vreaudigital-ancom already running, skipping this tick"
  exit 0
fi
docker rm -f vreaudigital-ancom 2>/dev/null || true

# ── Fetch DATABASE_URL via Infisical Machine Identity ──
source /opt/vreaudigital/.infisical-mi
TOKEN=$(infisical login --method=universal-auth \
  --domain="$INFISICAL_API_URL" \
  --client-id="$INFISICAL_CLIENT_ID" \
  --client-secret="$INFISICAL_CLIENT_SECRET" \
  --silent --plain)

umask 077
ENVF=$(mktemp /tmp/.vreaudigital-ancom-env.XXXXXX)
# Safety net: under `set -e` any failure below would otherwise leave the
# secret-bearing env-file behind in /tmp.
trap 'rm -f -- "$ENVF"' EXIT
DBURL=$(infisical secrets get DATABASE_URL \
  --domain="$INFISICAL_API_URL" \
  --projectId="$INFISICAL_PROJECT_ID" \
  --env="$INFISICAL_ENV" --path="$INFISICAL_PATH" \
  --token="$TOKEN" --plain --silent)
printf 'DATABASE_URL=%s\n' "$DBURL" > "$ENVF"
unset DBURL TOKEN

cd /opt/vreaudigital/services/seap-scraper

# First run on a fresh checkout: install node deps inside a throwaway container.
if [ ! -d node_modules/tsx ]; then
  log "Installing seap-scraper deps..."
  docker run --rm -v "$(pwd):/work" -w /work --user "$(id -u):$(id -g)" \
    node:22-alpine npm install --omit=optional 2>&1 | tee -a "$LOG" >/dev/null
fi

# Build the container command as an array so each flag stays one argument.
# Non-numeric knob values fail the -gt test and are ignored (stderr silenced).
SCRAPER_CMD=(npx tsx src/scrape-ancom.ts)
if [ "$LIMIT" -gt 0 ] 2>/dev/null; then
  SCRAPER_CMD+=("--limit=$LIMIT")
fi
if [ "$MAX_PAGES" -gt 0 ] 2>/dev/null; then
  SCRAPER_CMD+=("--max-pages=$MAX_PAGES")
fi

CID=$(docker run -d \
  --name vreaudigital-ancom \
  --network host \
  --env-file "$ENVF" \
  -v "$(pwd):/work" \
  -w /work \
  --user "$(id -u):$(id -g)" \
  --restart no \
  node:22-alpine \
  "${SCRAPER_CMD[@]}")
log "container started: $CID"

sleep 3
rm -f "$ENVF"
log "envfile cleaned"

docker wait vreaudigital-ancom >/dev/null
EXIT_CODE=$(docker inspect -f '{{.State.ExitCode}}' vreaudigital-ancom 2>/dev/null || echo "?")
docker logs vreaudigital-ancom 2>&1 | tail -30 | tee -a "$LOG"
log "=== ANCOM scrape done (exit=$EXIT_CODE) ==="

# `exit` requires an integer; report a generic failure when the container's
# exit code could not be read ("?").
case "$EXIT_CODE" in
  '' | *[!0-9]*) exit 1 ;;
esac
exit "$EXIT_CODE"
|
||||
Executable
+89
@@ -0,0 +1,89 @@
|
||||
#!/bin/bash
# ANRE — Autoritatea Națională de Reglementare în domeniul Energiei.
# Scrapes 4 public registries from portal.anre.ro/PublicLists:
# electricitate (~5K), gaze (~350), atestat (~10K), electricieni (~100K).
#
# Mirrors scrape-regas.sh / scrape-bugetar.sh pattern: Infisical Machine
# Identity → env-file → docker run --env-file (NEVER -e $VAR), file deleted
# post-launch. An EXIT trap also removes the env-file if the script aborts
# before the regular cleanup step.
#
# Idempotent (UPSERT on sha1 PK / UNIQUE(nr_autorizare,nume_prenume)).
# Safe to run from cron.
#
# Env knobs:
#   SOURCE=all|electricitate|gaze|atestat|electricieni   (default: all)
#   LIMIT=0                                               (default: 0 = full)
#
# Run:
#   sudo SOURCE=electricitate LIMIT=100 /opt/vreaudigital/services/seap-scraper/cron/scrape-anre.sh
#   sudo /opt/vreaudigital/services/seap-scraper/cron/scrape-anre.sh   # full all sources
#
# Exit status: the container's exit code, or 1 if it cannot be determined.
set -euo pipefail

SOURCE="${SOURCE:-all}"
LIMIT="${LIMIT:-0}"
LOG=/var/log/vreaudigital-anre.log

# Timestamped message to stdout and appended to $LOG.
log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG"; }

log "=== ANRE scrape started (source=$SOURCE limit=$LIMIT) ==="

# Skip this tick if the previous run's container is still running; otherwise
# remove any stopped leftover so the name can be reused.
if docker ps --filter name=vreaudigital-anre --format '{{.Names}}' | grep -q '^vreaudigital-anre$'; then
  log "WARN: vreaudigital-anre already running, skipping this tick"
  exit 0
fi
docker rm -f vreaudigital-anre 2>/dev/null || true

# ── Fetch DATABASE_URL via Infisical Machine Identity ──
source /opt/vreaudigital/.infisical-mi
TOKEN=$(infisical login --method=universal-auth \
  --domain="$INFISICAL_API_URL" \
  --client-id="$INFISICAL_CLIENT_ID" \
  --client-secret="$INFISICAL_CLIENT_SECRET" \
  --silent --plain)

umask 077
ENVF=$(mktemp /tmp/.vreaudigital-anre-env.XXXXXX)
# Safety net: under `set -e` any failure below would otherwise leave the
# secret-bearing env-file behind in /tmp.
trap 'rm -f -- "$ENVF"' EXIT
DBURL=$(infisical secrets get DATABASE_URL \
  --domain="$INFISICAL_API_URL" \
  --projectId="$INFISICAL_PROJECT_ID" \
  --env="$INFISICAL_ENV" --path="$INFISICAL_PATH" \
  --token="$TOKEN" --plain --silent)
printf 'DATABASE_URL=%s\n' "$DBURL" > "$ENVF"
# ANRE portal uses an intermediate CA cert chain that node's bundle doesn't trust.
# Cert is valid (verified OOB via Microsoft-IIS handshake), bypass for this scraper.
# NOTE(review): this disables TLS verification for every request the scraper makes.
echo "NODE_TLS_REJECT_UNAUTHORIZED=0" >> "$ENVF"
unset DBURL TOKEN

cd /opt/vreaudigital/services/seap-scraper

# First run on a fresh checkout: install node deps inside a throwaway container.
if [ ! -d node_modules/tsx ]; then
  log "Installing seap-scraper deps..."
  docker run --rm -v "$(pwd):/work" -w /work --user "$(id -u):$(id -g)" \
    node:22-alpine npm install --omit=optional 2>&1 | tee -a "$LOG" >/dev/null
fi

# Build the container command as an array so each flag stays one argument.
# A non-numeric LIMIT fails the -gt test and is ignored (stderr silenced).
SCRAPER_CMD=(npx tsx src/scrape-anre.ts "--source=$SOURCE")
if [ "$LIMIT" -gt 0 ] 2>/dev/null; then
  SCRAPER_CMD+=("--limit=$LIMIT")
fi

CID=$(docker run -d \
  --name vreaudigital-anre \
  --network host \
  --env-file "$ENVF" \
  -v "$(pwd):/work" \
  -w /work \
  --user "$(id -u):$(id -g)" \
  --restart no \
  node:22-alpine \
  "${SCRAPER_CMD[@]}")
log "container started: $CID"

sleep 3
rm -f "$ENVF"
log "envfile cleaned"

docker wait vreaudigital-anre >/dev/null
EXIT_CODE=$(docker inspect -f '{{.State.ExitCode}}' vreaudigital-anre 2>/dev/null || echo "?")
docker logs vreaudigital-anre 2>&1 | tail -25 | tee -a "$LOG"
log "=== ANRE scrape done (exit=$EXIT_CODE) ==="

# `exit` requires an integer; report a generic failure when the container's
# exit code could not be read ("?").
case "$EXIT_CODE" in
  '' | *[!0-9]*) exit 1 ;;
esac
exit "$EXIT_CODE"
|
||||
Executable
+86
@@ -0,0 +1,86 @@
|
||||
#!/bin/bash
# ASF — Autoritatea de Supraveghere Financiară.
# Scrapes the public registry of authorized financial entities (insurers,
# brokers, etc.) from data.asfromania.ro/scr/ra. ~860 entities.
#
# Mirrors scrape-anre.sh pattern: Infisical Machine Identity → env-file →
# docker run --env-file (NEVER -e $VAR), file deleted post-launch. An EXIT
# trap also removes the env-file if the script aborts before the regular
# cleanup step.
#
# Idempotent (UPSERT on UNIQUE(register_type, register_no)).
# Safe to run from cron.
#
# Env knobs:
#   LIMIT=0        (default: 0 = full)
#   NO_GAPFILL=0   (default: 0 = run gapfill; set 1 to skip)
#
# Run:
#   sudo LIMIT=20 /opt/vreaudigital/services/seap-scraper/cron/scrape-asf.sh   # smoke
#   sudo /opt/vreaudigital/services/seap-scraper/cron/scrape-asf.sh            # full
#
# Exit status: the container's exit code, or 1 if it cannot be determined.
set -euo pipefail

LIMIT="${LIMIT:-0}"
NO_GAPFILL="${NO_GAPFILL:-0}"
LOG=/var/log/vreaudigital-asf.log

# Timestamped message to stdout and appended to $LOG.
log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG"; }

log "=== ASF scrape started (limit=$LIMIT no_gapfill=$NO_GAPFILL) ==="

# Skip this tick if the previous run's container is still running; otherwise
# remove any stopped leftover so the name can be reused.
if docker ps --filter name=vreaudigital-asf --format '{{.Names}}' | grep -q '^vreaudigital-asf$'; then
  log "WARN: vreaudigital-asf already running, skipping this tick"
  exit 0
fi
docker rm -f vreaudigital-asf 2>/dev/null || true

# ── Fetch DATABASE_URL via Infisical Machine Identity ──
source /opt/vreaudigital/.infisical-mi
TOKEN=$(infisical login --method=universal-auth \
  --domain="$INFISICAL_API_URL" \
  --client-id="$INFISICAL_CLIENT_ID" \
  --client-secret="$INFISICAL_CLIENT_SECRET" \
  --silent --plain)

umask 077
ENVF=$(mktemp /tmp/.vreaudigital-asf-env.XXXXXX)
# Safety net: under `set -e` any failure below would otherwise leave the
# secret-bearing env-file behind in /tmp.
trap 'rm -f -- "$ENVF"' EXIT
DBURL=$(infisical secrets get DATABASE_URL \
  --domain="$INFISICAL_API_URL" \
  --projectId="$INFISICAL_PROJECT_ID" \
  --env="$INFISICAL_ENV" --path="$INFISICAL_PATH" \
  --token="$TOKEN" --plain --silent)
printf 'DATABASE_URL=%s\n' "$DBURL" > "$ENVF"
unset DBURL TOKEN

cd /opt/vreaudigital/services/seap-scraper

# First run on a fresh checkout: install node deps inside a throwaway container.
if [ ! -d node_modules/tsx ]; then
  log "Installing seap-scraper deps..."
  docker run --rm -v "$(pwd):/work" -w /work --user "$(id -u):$(id -g)" \
    node:22-alpine npm install --omit=optional 2>&1 | tee -a "$LOG" >/dev/null
fi

# Build the container command as an array so each flag stays one argument.
# A non-numeric LIMIT fails the -gt test and is ignored (stderr silenced).
SCRAPER_CMD=(npx tsx src/scrape-asf.ts)
if [ "$LIMIT" -gt 0 ] 2>/dev/null; then
  SCRAPER_CMD+=("--limit=$LIMIT")
fi
if [ "$NO_GAPFILL" = "1" ]; then
  SCRAPER_CMD+=(--no-gapfill)
fi

CID=$(docker run -d \
  --name vreaudigital-asf \
  --network host \
  --env-file "$ENVF" \
  -v "$(pwd):/work" \
  -w /work \
  --user "$(id -u):$(id -g)" \
  --restart no \
  node:22-alpine \
  "${SCRAPER_CMD[@]}")
log "container started: $CID"

sleep 3
rm -f "$ENVF"
log "envfile cleaned"

docker wait vreaudigital-asf >/dev/null
EXIT_CODE=$(docker inspect -f '{{.State.ExitCode}}' vreaudigital-asf 2>/dev/null || echo "?")
docker logs vreaudigital-asf 2>&1 | tail -40 | tee -a "$LOG"
log "=== ASF scrape done (exit=$EXIT_CODE) ==="

# `exit` requires an integer; report a generic failure when the container's
# exit code could not be read ("?").
case "$EXIT_CODE" in
  '' | *[!0-9]*) exit 1 ;;
esac
exit "$EXIT_CODE"
|
||||
Executable
+115
@@ -0,0 +1,115 @@
|
||||
#!/bin/bash
# MFP Budget Transparency scraper — Phase 1: enumerate the universe of
# reporting public entities + fuzzy match name → CUI.
#
# Phase 2 (downloading the XML reports) is not implemented: the MFP app asks
# for a CAPTCHA on every search, which needs an external captcha solver
# (2captcha / anti-captcha) and a budget for ~1.6M requests (4-8K USD for a
# complete 2020-2025 historical ingest). See BUGETAR-PLAN.md for details.
#
# Modes:
#   MODE=enumerate (default) → enumerate (sector × county) → bugetar.entitate
#   MODE=match-cui           → fuzzy match name → firms.entities.cui_normalized
#   MODE=full                → enumerate + match-cui in a single run
#
# Other env knobs read here: JUDET, SECTOR (optional filters, default: all),
# DELAY_MS (default: 500).
#
# Idempotent. Safe to run repeatedly (UPSERT).

set -euo pipefail

MODE="${MODE:-enumerate}"
JUDET="${JUDET:-}"
SECTOR="${SECTOR:-}"
DELAY_MS="${DELAY_MS:-500}"
LOG=/var/log/vreaudigital-bugetar.log

# Timestamped message to stdout and appended to $LOG.
log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG"; }

log "=== bugetar scraper started (mode=$MODE judet=${JUDET:-ALL} sector=${SECTOR:-ALL}) ==="

# Guard: previous run still going? The scraper containers are named
# vreaudigital-bugetar-<mode>, so match those names as well as the bare one;
# matching only the bare name would never detect a running scraper.
if docker ps --filter name=vreaudigital-bugetar --format '{{.Names}}' \
  | grep -Eq '^vreaudigital-bugetar(-(enumerate|match-cui))?$'; then
  log "WARN: vreaudigital-bugetar already running, skipping"
  exit 0
fi
# Remove stopped leftovers so the container names can be reused.
docker rm -f vreaudigital-bugetar vreaudigital-bugetar-enumerate \
  vreaudigital-bugetar-match-cui 2>/dev/null || true

# ── Fetch DATABASE_URL via Infisical Machine Identity ──
source /opt/vreaudigital/.infisical-mi
TOKEN=$(infisical login --method=universal-auth \
  --domain="$INFISICAL_API_URL" \
  --client-id="$INFISICAL_CLIENT_ID" \
  --client-secret="$INFISICAL_CLIENT_SECRET" \
  --silent --plain)

umask 077
ENVF=$(mktemp /tmp/.vreaudigital-bugetar-env.XXXXXX)
# Safety net: under `set -e` any failure below would otherwise leave the
# secret-bearing env-file behind in /tmp.
trap 'rm -f -- "$ENVF"' EXIT
DBURL=$(infisical secrets get DATABASE_URL \
  --domain="$INFISICAL_API_URL" \
  --projectId="$INFISICAL_PROJECT_ID" \
  --env="$INFISICAL_ENV" --path="$INFISICAL_PATH" \
  --token="$TOKEN" --plain --silent)
printf 'DATABASE_URL=%s\n' "$DBURL" > "$ENVF"
unset DBURL TOKEN

cd /opt/vreaudigital/services/seap-scraper

# Make sure node_modules exists.
if [ ! -d node_modules/tsx ]; then
  log "Installing seap-scraper deps..."
  docker run --rm -v "$(pwd):/work" -w /work --user "$(id -u):$(id -g)" \
    node:22-alpine npm install --omit=optional 2>&1 | tee -a "$LOG" >/dev/null
fi
|
||||
|
||||
#######################################
# Run one scrape-bugetar.ts pass in a detached node:22-alpine container named
# vreaudigital-bugetar-<mode>, wait for it to finish, append the last 10 lines
# of its output to $LOG, then remove the container.
# Globals:   JUDET, SECTOR, DELAY_MS, ENVF, LOG (read)
# Arguments: $1 - mode, passed to the scraper as --mode=<mode>
# Outputs:   log lines and the container log tail on stdout / in $LOG
# Returns:   the container's exit code; 1 if `docker run` fails or the exit
#            code cannot be read
#######################################
run_scraper_mode() {
  local mode="$1"
  local container="vreaudigital-bugetar-$mode"
  # Command as an array so a value containing spaces stays one argument.
  local -a cmd=(npx tsx src/scrape-bugetar.ts "--mode=$mode")
  local shown="" # human-readable copy of the optional flags, for the log line
  local cid status

  if [ -n "$JUDET" ]; then
    cmd+=("--judet=$JUDET")
    shown+=" --judet=$JUDET"
  fi
  if [ -n "$SECTOR" ]; then
    cmd+=("--sector=$SECTOR")
    shown+=" --sector=$SECTOR"
  fi
  # The request delay is only passed to the enumerate pass.
  if [ "$mode" = "enumerate" ]; then
    cmd+=("--delay-ms=$DELAY_MS")
    shown+=" --delay-ms=$DELAY_MS"
  fi

  log "running mode=$mode args=$shown"
  # Callers invoke this function as `run_scraper_mode … || …`, which disables
  # errexit inside it, so a failed launch has to be checked explicitly.
  if ! cid=$(docker run -d \
    --name "$container" \
    --network host \
    --env-file "$ENVF" \
    -v "$(pwd):/work" \
    -w /work \
    --user "$(id -u):$(id -g)" \
    --restart no \
    node:22-alpine \
    "${cmd[@]}"); then
    log "ERROR: docker run failed (mode=$mode)"
    return 1
  fi
  log " container: $cid"

  sleep 3 # give the daemon time to read the envfile
  docker wait "$container" >/dev/null
  status=$(docker inspect -f '{{.State.ExitCode}}' "$container" 2>/dev/null || echo "?")
  docker logs "$container" 2>&1 | tail -10 | tee -a "$LOG"
  docker rm -f "$container" >/dev/null 2>&1 || true

  # `return` requires an integer; treat an unreadable exit code as failure.
  case "$status" in
    '' | *[!0-9]*) return 1 ;;
  esac
  return "$status"
}
|
||||
|
||||
# Dispatch on MODE. A failing pass records its status in EXIT_CODE instead of
# aborting the script, so the env-file cleanup and final log line below
# always run.
EXIT_CODE=0
if [ "$MODE" = "enumerate" ] || [ "$MODE" = "match-cui" ]; then
  # Single-pass modes map one-to-one onto a scraper mode.
  run_scraper_mode "$MODE" || EXIT_CODE=$?
elif [ "$MODE" = "full" ]; then
  # match-cui only runs when the enumerate pass succeeded.
  run_scraper_mode enumerate || EXIT_CODE=$?
  if [ "$EXIT_CODE" -eq 0 ]; then
    run_scraper_mode match-cui || EXIT_CODE=$?
  fi
else
  log "ERROR: unknown MODE=$MODE (use enumerate|match-cui|full)"
  EXIT_CODE=2
fi

# The env-file is shared by every pass, so it is removed only at the end.
rm -f "$ENVF"
log "envfile cleaned"

log "=== bugetar scraper done (exit=$EXIT_CODE) ==="
exit "$EXIT_CODE"
|
||||
Executable
+96
@@ -0,0 +1,96 @@
|
||||
#!/bin/bash
# CNAS — Casa Națională de Asigurări de Sănătate.
# Scrapes the central WP media library at cnas.ro/wp-content/uploads/ for
# furnizori-de-servicii-medicale PDFs (~70-90 active docs as of 2026-05).
# Per-county Angular SPA at cas.cnas.ro/casXX is currently empty (handoff
# documented in CNAS-PLAN.md).
#
# Mirrors scrape-anre.sh / scrape-regas.sh pattern: Infisical Machine Identity
# → env-file → docker run --env-file (NEVER -e $VAR), file deleted post-launch.
# An EXIT trap also removes the env-file on every early exit (including the
# unknown-MODE error path). Container has poppler-utils installed for pdftotext.
#
# Idempotent. Safe to run from cron weekly (CNAS uploads ~5-15 files/month).
#
# Env knobs:
#   LIMIT=0     (default: 0 = all matched files)
#   MODE=full   (full | metadata-only | parse-only)
#
# Run:
#   sudo /opt/vreaudigital/services/seap-scraper/cron/scrape-cnas.sh                      # full
#   sudo LIMIT=5 /opt/vreaudigital/services/seap-scraper/cron/scrape-cnas.sh              # smoke test
#   sudo MODE=metadata-only /opt/vreaudigital/services/seap-scraper/cron/scrape-cnas.sh   # list-only
#
# Exit status: the container's exit code, 1 for an unknown MODE or when the
# container's exit code cannot be determined.
set -euo pipefail

LIMIT="${LIMIT:-0}"
MODE="${MODE:-full}"
LOG=/var/log/vreaudigital-cnas.log

# Timestamped message to stdout and appended to $LOG.
log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG"; }

log "=== CNAS scrape started (limit=$LIMIT mode=$MODE) ==="

# Skip this tick if the previous run's container is still running; otherwise
# remove any stopped leftover so the name can be reused.
if docker ps --filter name=vreaudigital-cnas --format '{{.Names}}' | grep -q '^vreaudigital-cnas$'; then
  log "WARN: vreaudigital-cnas already running, skipping this tick"
  exit 0
fi
docker rm -f vreaudigital-cnas 2>/dev/null || true

# ── Fetch DATABASE_URL via Infisical Machine Identity ──
source /opt/vreaudigital/.infisical-mi
TOKEN=$(infisical login --method=universal-auth \
  --domain="$INFISICAL_API_URL" \
  --client-id="$INFISICAL_CLIENT_ID" \
  --client-secret="$INFISICAL_CLIENT_SECRET" \
  --silent --plain)

umask 077
ENVF=$(mktemp /tmp/.vreaudigital-cnas-env.XXXXXX)
# Safety net: any exit below (a `set -e` abort or the unknown-MODE error)
# would otherwise leave the secret-bearing env-file behind in /tmp.
trap 'rm -f -- "$ENVF"' EXIT
DBURL=$(infisical secrets get DATABASE_URL \
  --domain="$INFISICAL_API_URL" \
  --projectId="$INFISICAL_PROJECT_ID" \
  --env="$INFISICAL_ENV" --path="$INFISICAL_PATH" \
  --token="$TOKEN" --plain --silent)
printf 'DATABASE_URL=%s\n' "$DBURL" > "$ENVF"
unset DBURL TOKEN

cd /opt/vreaudigital/services/seap-scraper

# First run on a fresh checkout: install node deps inside a throwaway container.
if [ ! -d node_modules/tsx ]; then
  log "Installing seap-scraper deps..."
  docker run --rm -v "$(pwd):/work" -w /work --user "$(id -u):$(id -g)" \
    node:22-alpine npm install --omit=optional 2>&1 | tee -a "$LOG" >/dev/null
fi

# EXTRA_ARGS is interpolated into the `sh -c` string below, so it must only
# ever contain validated text: LIMIT passes an integer test and MODE is matched
# against a fixed list.
EXTRA_ARGS=""
if [ "$LIMIT" -gt 0 ] 2>/dev/null; then
  EXTRA_ARGS="$EXTRA_ARGS --limit=$LIMIT"
fi
case "$MODE" in
  metadata-only) EXTRA_ARGS="$EXTRA_ARGS --metadata-only" ;;
  parse-only) EXTRA_ARGS="$EXTRA_ARGS --parse-only" ;;
  full) ;;
  *)
    log "ERROR: unknown MODE=$MODE (full|metadata-only|parse-only)"
    exit 1
    ;;
esac

# Note: poppler-utils is installed at container start for pdftotext + pdfinfo.
# Using sh -c so we can chain apk add + npx tsx in a single command.
# The container runs as 0:0 (unlike the sibling scrapers) because `apk add`
# needs root.
CID=$(docker run -d \
  --name vreaudigital-cnas \
  --network host \
  --env-file "$ENVF" \
  -v "$(pwd):/work" \
  -w /work \
  --user 0:0 \
  --restart no \
  node:22-alpine \
  sh -c "apk add --no-cache poppler-utils >/dev/null && npx tsx src/scrape-cnas.ts $EXTRA_ARGS")
log "container started: $CID"

sleep 3
rm -f "$ENVF"
log "envfile cleaned"

docker wait vreaudigital-cnas >/dev/null
EXIT_CODE=$(docker inspect -f '{{.State.ExitCode}}' vreaudigital-cnas 2>/dev/null || echo "?")
docker logs vreaudigital-cnas 2>&1 | tail -50 | tee -a "$LOG"
log "=== CNAS scrape done (exit=$EXIT_CODE) ==="

# `exit` requires an integer; report a generic failure when the container's
# exit code could not be read ("?").
case "$EXIT_CODE" in
  '' | *[!0-9]*) exit 1 ;;
esac
exit "$EXIT_CODE"
|
||||
Executable
+85
@@ -0,0 +1,85 @@
|
||||
#!/bin/bash
# CNSC — Consiliul Național de Soluționare a Contestațiilor.
# Walks portal.cnsc.ro/decizii.html (~30K decisions across ~617 pages of 50).
#
# Mirrors scrape-anre.sh / scrape-aaas.sh pattern: Infisical Machine Identity
# → env-file → docker run --env-file (NEVER -e $VAR), file deleted post-launch.
# An EXIT trap also removes the env-file if the script aborts before the
# regular cleanup step.
#
# Idempotent: ON CONFLICT (decision_no, decision_year) DO UPDATE.
# Safe to run from cron daily — only newly-published decisions are inserted,
# the rest are no-op updates of fetched_at.
#
# Env knobs:
#   START_PAGE=1   (default 1; set higher to resume after partial run)
#   MAX_PAGES=0    (default 0 = until totalPages; smaller for smoke test)
#
# Run:
#   sudo MAX_PAGES=2 /opt/vreaudigital/services/seap-scraper/cron/scrape-cnsc.sh
#   sudo /opt/vreaudigital/services/seap-scraper/cron/scrape-cnsc.sh
#
# Exit status: the container's exit code, or 1 if it cannot be determined.
set -euo pipefail

START_PAGE="${START_PAGE:-1}"
MAX_PAGES="${MAX_PAGES:-0}"
LOG=/var/log/vreaudigital-cnsc.log

# Timestamped message to stdout and appended to $LOG.
log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG"; }

log "=== CNSC scrape started (start_page=$START_PAGE max_pages=$MAX_PAGES) ==="

# Skip this tick if the previous run's container is still running; otherwise
# remove any stopped leftover so the name can be reused.
if docker ps --filter name=vreaudigital-cnsc --format '{{.Names}}' | grep -q '^vreaudigital-cnsc$'; then
  log "WARN: vreaudigital-cnsc already running, skipping this tick"
  exit 0
fi
docker rm -f vreaudigital-cnsc 2>/dev/null || true

# ── Fetch DATABASE_URL via Infisical Machine Identity ──
source /opt/vreaudigital/.infisical-mi
TOKEN=$(infisical login --method=universal-auth \
  --domain="$INFISICAL_API_URL" \
  --client-id="$INFISICAL_CLIENT_ID" \
  --client-secret="$INFISICAL_CLIENT_SECRET" \
  --silent --plain)

umask 077
ENVF=$(mktemp /tmp/.vreaudigital-cnsc-env.XXXXXX)
# Safety net: under `set -e` any failure below would otherwise leave the
# secret-bearing env-file behind in /tmp.
trap 'rm -f -- "$ENVF"' EXIT
DBURL=$(infisical secrets get DATABASE_URL \
  --domain="$INFISICAL_API_URL" \
  --projectId="$INFISICAL_PROJECT_ID" \
  --env="$INFISICAL_ENV" --path="$INFISICAL_PATH" \
  --token="$TOKEN" --plain --silent)
printf 'DATABASE_URL=%s\n' "$DBURL" > "$ENVF"
unset DBURL TOKEN

cd /opt/vreaudigital/services/seap-scraper

# First run on a fresh checkout: install node deps inside a throwaway container.
if [ ! -d node_modules/tsx ]; then
  log "Installing seap-scraper deps..."
  docker run --rm -v "$(pwd):/work" -w /work --user "$(id -u):$(id -g)" \
    node:22-alpine npm install --omit=optional 2>&1 | tee -a "$LOG" >/dev/null
fi

# Build the container command as an array so each flag stays one argument.
# A non-numeric MAX_PAGES fails the -gt test and is ignored (stderr silenced).
SCRAPER_CMD=(npx tsx src/scrape-cnsc.ts "--start-page=$START_PAGE")
if [ "$MAX_PAGES" -gt 0 ] 2>/dev/null; then
  SCRAPER_CMD+=("--max-pages=$MAX_PAGES")
fi

CID=$(docker run -d \
  --name vreaudigital-cnsc \
  --network host \
  --env-file "$ENVF" \
  -v "$(pwd):/work" \
  -w /work \
  --user "$(id -u):$(id -g)" \
  --restart no \
  node:22-alpine \
  "${SCRAPER_CMD[@]}")
log "container started: $CID"

sleep 3
rm -f "$ENVF"
log "envfile cleaned"

docker wait vreaudigital-cnsc >/dev/null
EXIT_CODE=$(docker inspect -f '{{.State.ExitCode}}' vreaudigital-cnsc 2>/dev/null || echo "?")
docker logs vreaudigital-cnsc 2>&1 | tail -25 | tee -a "$LOG"
log "=== CNSC scrape done (exit=$EXIT_CODE) ==="

# `exit` requires an integer; report a generic failure when the container's
# exit code could not be read ("?").
case "$EXIT_CODE" in
  '' | *[!0-9]*) exit 1 ;;
esac
exit "$EXIT_CODE"
|
||||
+93
@@ -0,0 +1,93 @@
|
||||
#!/bin/bash
# Curtea de Conturi — Stage 1: listing-page metadata harvest.
#
# Mirrors scrape-anre.sh / scrape-bugetar.sh pattern: Infisical Machine
# Identity → env-file → docker run --env-file (NEVER -e $VAR), file deleted
# post-launch. An EXIT trap also removes the env-file if the script aborts
# before the regular cleanup step.
#
# Idempotent (UPSERT on slug_id PK = sha1(category|slug)).
# Safe to run from cron — recommend weekly (new audits drip in slowly).
#
# Stage 2 (PDF parse + CUI fuzzy match) is a separate scraper, see
# services/seap-scraper/CURTEACONT-PLAN.md.
#
# Env knobs:
#   SOURCE=all|financiar|conformitate|performanta   (default: all)
#   LIMIT=0                                          (default: 0 = full)
#   START_PAGE=1                                     (default: 1)
#
# Run:
#   sudo SOURCE=financiar LIMIT=500 /opt/vreaudigital/services/seap-scraper/cron/scrape-curteacont.sh
#   sudo /opt/vreaudigital/services/seap-scraper/cron/scrape-curteacont.sh   # full all sources
#
# Exit status: the container's exit code, or 1 if it cannot be determined.
set -euo pipefail

SOURCE="${SOURCE:-all}"
LIMIT="${LIMIT:-0}"
START_PAGE="${START_PAGE:-1}"
LOG=/var/log/vreaudigital-curteacont.log

# Timestamped message to stdout and appended to $LOG.
log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG"; }

log "=== curteacont scrape started (source=$SOURCE limit=$LIMIT start=$START_PAGE) ==="

# Skip this tick if the previous run's container is still running; otherwise
# remove any stopped leftover so the name can be reused.
if docker ps --filter name=vreaudigital-curteacont --format '{{.Names}}' | grep -q '^vreaudigital-curteacont$'; then
  log "WARN: vreaudigital-curteacont already running, skipping this tick"
  exit 0
fi
docker rm -f vreaudigital-curteacont 2>/dev/null || true

# ── Fetch DATABASE_URL via Infisical Machine Identity ──
source /opt/vreaudigital/.infisical-mi
TOKEN=$(infisical login --method=universal-auth \
  --domain="$INFISICAL_API_URL" \
  --client-id="$INFISICAL_CLIENT_ID" \
  --client-secret="$INFISICAL_CLIENT_SECRET" \
  --silent --plain)

umask 077
ENVF=$(mktemp /tmp/.vreaudigital-curteacont-env.XXXXXX)
# Safety net: under `set -e` any failure below would otherwise leave the
# secret-bearing env-file behind in /tmp.
trap 'rm -f -- "$ENVF"' EXIT
DBURL=$(infisical secrets get DATABASE_URL \
  --domain="$INFISICAL_API_URL" \
  --projectId="$INFISICAL_PROJECT_ID" \
  --env="$INFISICAL_ENV" --path="$INFISICAL_PATH" \
  --token="$TOKEN" --plain --silent)
printf 'DATABASE_URL=%s\n' "$DBURL" > "$ENVF"
# curteadeconturi.ro serves an intermediate CA chain that node's bundle doesn't
# trust by default. Cert is valid OOB; bypass for this scraper. (Same workaround
# we use for ANRE.)
# NOTE(review): this disables TLS verification for every request the scraper makes.
echo "NODE_TLS_REJECT_UNAUTHORIZED=0" >> "$ENVF"
unset DBURL TOKEN

cd /opt/vreaudigital/services/seap-scraper

# First run on a fresh checkout: install node deps inside a throwaway container.
if [ ! -d node_modules/tsx ]; then
  log "Installing seap-scraper deps..."
  docker run --rm -v "$(pwd):/work" -w /work --user "$(id -u):$(id -g)" \
    node:22-alpine npm install --omit=optional 2>&1 | tee -a "$LOG" >/dev/null
fi

# Build the container command as an array so each flag stays one argument.
# A non-numeric LIMIT fails the -gt test and is ignored (stderr silenced).
SCRAPER_CMD=(npx tsx src/scrape-curteacont.ts "--source=$SOURCE" "--start-page=$START_PAGE")
if [ "$LIMIT" -gt 0 ] 2>/dev/null; then
  SCRAPER_CMD+=("--limit=$LIMIT")
fi

CID=$(docker run -d \
  --name vreaudigital-curteacont \
  --network host \
  --env-file "$ENVF" \
  -v "$(pwd):/work" \
  -w /work \
  --user "$(id -u):$(id -g)" \
  --restart no \
  node:22-alpine \
  "${SCRAPER_CMD[@]}")
log "container started: $CID"

sleep 3
rm -f "$ENVF"
log "envfile cleaned"

docker wait vreaudigital-curteacont >/dev/null
EXIT_CODE=$(docker inspect -f '{{.State.ExitCode}}' vreaudigital-curteacont 2>/dev/null || echo "?")
docker logs vreaudigital-curteacont 2>&1 | tail -50 | tee -a "$LOG"
log "=== curteacont scrape done (exit=$EXIT_CODE) ==="

# `exit` requires an integer; report a generic failure when the container's
# exit code could not be read ("?").
case "$EXIT_CODE" in
  '' | *[!0-9]*) exit 1 ;;
esac
exit "$EXIT_CODE"
|
||||
Executable
+81
@@ -0,0 +1,81 @@
|
||||
#!/bin/bash
# SEAP Achiziții Directe (DA) — daily/weekly backfill of e-licitatie.ro DA notices.
#
# The DA endpoint is rate-limited and large (~500K rows already + ~8M historical
# 2017-2024 pending). The scraper itself is idempotent and resumable via
# `seap.sync_state[source='da']`:
#   - reads last_date, requests notices > last_date
#   - upserts on natural key, updates sync_state to latest fetched
#
# Mirrors scrape-anre.sh / scrape-bugetar.sh pattern. Reads DATABASE_URL via
# Infisical MI, writes envfile, docker-run with --env-file, deletes file.
# An EXIT trap also removes the env-file if the script aborts before the
# regular cleanup step.
#
# Env knobs:
#   MODE=da | backfill   (default: da; backfill = last 6 months ignoring sync_state)
#
# Run:
#   sudo /opt/vreaudigital/services/seap-scraper/cron/scrape-da.sh
#   sudo MODE=backfill /opt/vreaudigital/services/seap-scraper/cron/scrape-da.sh
#
# Exit status: the container's exit code, or 1 if it cannot be determined.

set -euo pipefail

MODE="${MODE:-da}"
LOG=/var/log/vreaudigital-da.log

# Timestamped message to stdout and appended to $LOG.
log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG"; }

log "=== SEAP DA scrape started (mode=$MODE) ==="

# Skip this tick if the previous run's container is still running; otherwise
# remove any stopped leftover so the name can be reused.
if docker ps --filter name=vreaudigital-da --format '{{.Names}}' | grep -q '^vreaudigital-da$'; then
  log "WARN: vreaudigital-da already running, skipping this tick"
  exit 0
fi
docker rm -f vreaudigital-da 2>/dev/null || true

# ── Fetch DATABASE_URL via Infisical Machine Identity ──
source /opt/vreaudigital/.infisical-mi
TOKEN=$(infisical login --method=universal-auth \
  --domain="$INFISICAL_API_URL" \
  --client-id="$INFISICAL_CLIENT_ID" \
  --client-secret="$INFISICAL_CLIENT_SECRET" \
  --silent --plain)

umask 077
ENVF=$(mktemp /tmp/.vreaudigital-da-env.XXXXXX)
# Safety net: under `set -e` any failure below would otherwise leave the
# secret-bearing env-file behind in /tmp.
trap 'rm -f -- "$ENVF"' EXIT
DBURL=$(infisical secrets get DATABASE_URL \
  --domain="$INFISICAL_API_URL" \
  --projectId="$INFISICAL_PROJECT_ID" \
  --env="$INFISICAL_ENV" --path="$INFISICAL_PATH" \
  --token="$TOKEN" --plain --silent)
printf 'DATABASE_URL=%s\n' "$DBURL" > "$ENVF"
unset DBURL TOKEN

cd /opt/vreaudigital/services/seap-scraper

# First run on a fresh checkout: install node deps inside a throwaway container.
if [ ! -d node_modules/tsx ]; then
  log "Installing seap-scraper deps..."
  docker run --rm -v "$(pwd):/work" -w /work --user "$(id -u):$(id -g)" \
    node:22-alpine npm install --omit=optional 2>&1 | tee -a "$LOG" >/dev/null
fi

# --mode is quoted so the MODE value is always passed as a single argument.
CID=$(docker run -d \
  --name vreaudigital-da \
  --network host \
  --env-file "$ENVF" \
  -v "$(pwd):/work" \
  -w /work \
  --user "$(id -u):$(id -g)" \
  --restart no \
  node:22-alpine \
  npx tsx src/index.ts "--mode=$MODE")
log "container started: $CID"

sleep 3
rm -f "$ENVF"
log "envfile cleaned"

docker wait vreaudigital-da >/dev/null
EXIT_CODE=$(docker inspect -f '{{.State.ExitCode}}' vreaudigital-da 2>/dev/null || echo "?")
docker logs vreaudigital-da 2>&1 | tail -40 | tee -a "$LOG"
log "=== SEAP DA scrape done (exit=$EXIT_CODE) ==="

# `exit` requires an integer; report a generic failure when the container's
# exit code could not be read ("?").
case "$EXIT_CODE" in
  '' | *[!0-9]*) exit 1 ;;
esac
exit "$EXIT_CODE"
|
||||
Executable
+88
@@ -0,0 +1,88 @@
|
||||
#!/bin/bash
# GNM — Garda Națională de Mediu.
# Scrapes the gnm.ro WordPress RSS feed (~36 pages × 10 items) for environmental
# enforcement press releases. Persists every release to gnm.comunicate, flags
# is_enforcement, and runs a regex pass to surface (firm, fine_lei) tuples into
# gnm.amenzi_extrase.
#
# Mirrors scrape-ancom.sh / scrape-anre.sh pattern: Infisical Machine Identity
# → env-file → docker run --env-file (NEVER -e $VAR), file deleted post-launch.
# An EXIT trap also removes the env-file if the script aborts before the
# regular cleanup step.
#
# Idempotent (UPSERT on guid; skip on raw_hash unchanged). Safe to run from cron.
#
# Env knobs:
#   MAX_PAGES=0    (default: 0 = walk until empty, max 50)
#   SINCE_DAYS=0   (default: 0 = no cutoff; >0 = stop at first item older than N days)
#
# Run:
#   sudo MAX_PAGES=2 /opt/vreaudigital/services/seap-scraper/cron/scrape-gnm.sh     # smoke (20 articles)
#   sudo SINCE_DAYS=30 /opt/vreaudigital/services/seap-scraper/cron/scrape-gnm.sh   # incremental
#   sudo /opt/vreaudigital/services/seap-scraper/cron/scrape-gnm.sh                 # full (~360 articles)
#
# Exit status: the container's exit code, or 1 if it cannot be determined.
set -euo pipefail

MAX_PAGES="${MAX_PAGES:-0}"
SINCE_DAYS="${SINCE_DAYS:-0}"
LOG=/var/log/vreaudigital-gnm.log

# Timestamped message to stdout and appended to $LOG.
log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG"; }

log "=== GNM scrape started (max_pages=$MAX_PAGES since_days=$SINCE_DAYS) ==="

# Skip this tick if the previous run's container is still running; otherwise
# remove any stopped leftover so the name can be reused.
if docker ps --filter name=vreaudigital-gnm --format '{{.Names}}' | grep -q '^vreaudigital-gnm$'; then
  log "WARN: vreaudigital-gnm already running, skipping this tick"
  exit 0
fi
docker rm -f vreaudigital-gnm 2>/dev/null || true

# ── Fetch DATABASE_URL via Infisical Machine Identity ──
source /opt/vreaudigital/.infisical-mi
TOKEN=$(infisical login --method=universal-auth \
  --domain="$INFISICAL_API_URL" \
  --client-id="$INFISICAL_CLIENT_ID" \
  --client-secret="$INFISICAL_CLIENT_SECRET" \
  --silent --plain)

umask 077
ENVF=$(mktemp /tmp/.vreaudigital-gnm-env.XXXXXX)
# Safety net: under `set -e` any failure below would otherwise leave the
# secret-bearing env-file behind in /tmp.
trap 'rm -f -- "$ENVF"' EXIT
DBURL=$(infisical secrets get DATABASE_URL \
  --domain="$INFISICAL_API_URL" \
  --projectId="$INFISICAL_PROJECT_ID" \
  --env="$INFISICAL_ENV" --path="$INFISICAL_PATH" \
  --token="$TOKEN" --plain --silent)
printf 'DATABASE_URL=%s\n' "$DBURL" > "$ENVF"
unset DBURL TOKEN

cd /opt/vreaudigital/services/seap-scraper

# First run on a fresh checkout: install node deps inside a throwaway container.
if [ ! -d node_modules/tsx ]; then
  log "Installing seap-scraper deps..."
  docker run --rm -v "$(pwd):/work" -w /work --user "$(id -u):$(id -g)" \
    node:22-alpine npm install --omit=optional 2>&1 | tee -a "$LOG" >/dev/null
fi

# Build the container command as an array so each flag stays one argument.
# Non-numeric knob values fail the -gt test and are ignored (stderr silenced).
SCRAPER_CMD=(npx tsx src/scrape-gnm.ts)
if [ "$MAX_PAGES" -gt 0 ] 2>/dev/null; then
  SCRAPER_CMD+=("--max-pages=$MAX_PAGES")
fi
if [ "$SINCE_DAYS" -gt 0 ] 2>/dev/null; then
  SCRAPER_CMD+=("--since-days=$SINCE_DAYS")
fi

CID=$(docker run -d \
  --name vreaudigital-gnm \
  --network host \
  --env-file "$ENVF" \
  -v "$(pwd):/work" \
  -w /work \
  --user "$(id -u):$(id -g)" \
  --restart no \
  node:22-alpine \
  "${SCRAPER_CMD[@]}")
log "container started: $CID"

sleep 3
rm -f "$ENVF"
log "envfile cleaned"

docker wait vreaudigital-gnm >/dev/null
EXIT_CODE=$(docker inspect -f '{{.State.ExitCode}}' vreaudigital-gnm 2>/dev/null || echo "?")
docker logs vreaudigital-gnm 2>&1 | tail -30 | tee -a "$LOG"
log "=== GNM scrape done (exit=$EXIT_CODE) ==="

# `exit` requires an integer; report a generic failure when the container's
# exit code could not be read ("?").
case "$EXIT_CODE" in
  '' | *[!0-9]*) exit 1 ;;
esac
exit "$EXIT_CODE"
|
||||
Executable
+79
@@ -0,0 +1,79 @@
|
||||
#!/bin/bash
# RegAS scraper — runs scrape-regas.ts in a node:22-alpine container.
# Mirrors the enrich-anaf.sh pattern: Infisical Machine Identity → env-file
# → docker run --env-file (NEVER -e $VAR), file deleted post-launch.
#
# Environment (all optional):
#   PAGE_SIZE   forwarded as --page-size   (default 5000)
#   START_PAGE  forwarded as --start-page  (default 0)
#   MAX_PAGES   forwarded as --max-pages   (default 0)
#
# Exit status: the scraper container's exit code; non-zero when a setup step
# fails or the container's exit code cannot be determined.
#
# Idempotent (uses ON CONFLICT (id) DO UPDATE). Safe to run from cron.

set -euo pipefail

PAGE_SIZE="${PAGE_SIZE:-5000}"
START_PAGE="${START_PAGE:-0}"
MAX_PAGES="${MAX_PAGES:-0}"
LOG=/var/log/vreaudigital-regas.log
ENVF=""

log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG"; }

# The env-file holds DATABASE_URL in clear text. Remove it on every exit
# path, including `set -e` aborts between mktemp and the explicit rm below.
cleanup() {
  if [ -n "$ENVF" ]; then
    rm -f -- "$ENVF"
  fi
}
trap cleanup EXIT

log "=== RegAS scrape started (page-size=$PAGE_SIZE start-page=$START_PAGE max-pages=$MAX_PAGES) ==="

# Skip this tick when a previous run is still going.
if docker ps --filter name=vreaudigital-regas --format '{{.Names}}' | grep -q '^vreaudigital-regas$'; then
  log "WARN: vreaudigital-regas already running, skipping this tick"
  exit 0
fi
# Clear a stopped leftover with the same name; "no such container" is fine.
docker rm -f vreaudigital-regas 2>/dev/null || true

# ── Fetch DATABASE_URL via Infisical Machine Identity ──
# shellcheck source=/dev/null
source /opt/vreaudigital/.infisical-mi
TOKEN=$(infisical login --method=universal-auth \
  --domain="$INFISICAL_API_URL" \
  --client-id="$INFISICAL_CLIENT_ID" \
  --client-secret="$INFISICAL_CLIENT_SECRET" \
  --silent --plain)

umask 077
ENVF=$(mktemp /tmp/.vreaudigital-env.XXXXXX)
DBURL=$(infisical secrets get DATABASE_URL \
  --domain="$INFISICAL_API_URL" \
  --projectId="$INFISICAL_PROJECT_ID" \
  --env="$INFISICAL_ENV" --path="$INFISICAL_PATH" \
  --token="$TOKEN" --plain --silent)
# An empty secret would start the scraper with DATABASE_URL= — fail here
# with a clear message instead.
if [ -z "$DBURL" ]; then
  log "FATAL: Infisical returned an empty DATABASE_URL"
  exit 1
fi
printf 'DATABASE_URL=%s\n' "$DBURL" > "$ENVF"
# RegAS uses an intermediate CA cert chain that node's bundle doesn't trust.
# Cert is valid (verified OOB), bypass for this scraper only.
# NOTE(review): this variable disables certificate verification for every
# TLS connection the node process makes, not only the RegAS host — consider
# shipping the intermediate via NODE_EXTRA_CA_CERTS instead.
echo "NODE_TLS_REJECT_UNAUTHORIZED=0" >> "$ENVF"
unset DBURL TOKEN

# ── Launch detached docker container ──
cd /opt/vreaudigital/services/seap-scraper

# First run only: install dependencies inside the same image the scraper
# uses. Output goes to $LOG and is kept off the terminal.
if [ ! -d node_modules/tsx ]; then
  log "Installing seap-scraper deps..."
  docker run --rm -v "$(pwd):/work" -w /work --user "$(id -u):$(id -g)" \
    node:22-alpine npm install --omit=optional 2>&1 | tee -a "$LOG" >/dev/null
fi

CID=$(docker run -d \
  --name vreaudigital-regas \
  --network host \
  --env-file "$ENVF" \
  -v "$(pwd):/work" \
  -w /work \
  --user "$(id -u):$(id -g)" \
  --restart no \
  node:22-alpine \
  npx tsx src/scrape-regas.ts \
  --page-size="$PAGE_SIZE" \
  --start-page="$START_PAGE" \
  --max-pages="$MAX_PAGES")
log "container started: $CID"

# Short grace period, then drop the env-file early rather than waiting for
# the EXIT trap at the end of a long scrape.
sleep 3
rm -f "$ENVF"
log "envfile cleaned"

docker wait vreaudigital-regas >/dev/null
EXIT_CODE=$(docker inspect -f '{{.State.ExitCode}}' vreaudigital-regas 2>/dev/null || echo "?")
docker logs vreaudigital-regas 2>&1 | tail -10 | tee -a "$LOG"
log "=== RegAS scrape done (exit=$EXIT_CODE) ==="

# `exit` needs a number; the "?" placeholder (inspect failed) becomes a
# plain failure instead of a bash "numeric argument required" error.
case "$EXIT_CODE" in
  '' | *[!0-9]*) exit 1 ;;
esac
exit "$EXIT_CODE"
|
||||
Executable
+70
@@ -0,0 +1,70 @@
|
||||
#!/bin/bash
# Setup Photon (Komoot) geocoder docker container with pre-built RO extract.
# Photon = Java service with embedded OpenSearch index over OSM admin polygons + addresses.
#
# Source: https://download1.graphhopper.com/public/extracts/by-country-code/ro/
# Size:   ~332MB tar.bz2 → ~3GB extracted
# API:    HTTP on :2322, ?q=Strada+X+Bucuresti returns GeoJSON with coords + admin matches.
#
# Re-runnable: the download is skipped when photon_data/ already exists and
# a running photon-ro container is left untouched. Exits non-zero when the
# extract cannot be discovered, Photon never answers, or a smoke test fails.

set -euo pipefail

PHOTON_DIR=/opt/photon
EXTRACT_BASE=https://download1.graphhopper.com/public/extracts/by-country-code/ro

log() { echo "[$(date '+%H:%M:%S')] $1"; }

# Query the local Photon API with query string $1 and print the first 400
# characters of the response. The body is captured first: piping curl into
# `head -c` can kill curl with SIGPIPE, which `pipefail` turns into a
# spurious script failure.
smoke_test() {
  local body
  body=$(curl -fs "http://localhost:2322/api?$1")
  printf '%s\n' "${body:0:400}"
}

log "=== Photon setup ==="

# 1. Download extract — graphhopper publishes dated snapshots (photon-db-ro-YYMMDD.tar.bz2);
#    the "-latest" alias is unreliable, so we auto-pick the newest dated file from the index.
sudo mkdir -p "$PHOTON_DIR"
cd "$PHOTON_DIR"

if [ ! -d "$PHOTON_DIR/photon_data" ]; then
  # `|| LATEST=""`: with pipefail a failed curl or a grep without matches
  # would abort the script right here and the FATAL message below would
  # never be printed.
  LATEST=$(curl -fsSL "$EXTRACT_BASE/" \
    | grep -oE 'photon-db-ro-[0-9]{6}\.tar\.bz2' \
    | sort -u | tail -1) || LATEST=""
  if [ -z "$LATEST" ]; then
    log "FATAL: could not discover latest Photon RO extract from $EXTRACT_BASE/"
    exit 1
  fi
  log "Downloading $LATEST (~332MB)..."
  sudo curl -fL "$EXTRACT_BASE/$LATEST" -o photon-ro.tar.bz2
  log "Extracting (creates ~3GB photon_data/)..."
  sudo tar -xjf photon-ro.tar.bz2
  sudo rm photon-ro.tar.bz2
  sudo chown -R 1000:1000 "$PHOTON_DIR"
else
  log "photon_data/ already exists; skipping download"
fi

# 2. Run docker container
# `--filter name=` is a substring match, so compare the whole line (-x) to
# avoid mistaking e.g. "photon-ro-test" for the real container.
if docker ps --filter name=photon-ro --format '{{.Names}}' | grep -qx photon-ro; then
  log "photon-ro already running"
else
  log "Starting photon-ro container..."
  docker rm -f photon-ro 2>/dev/null || true
  docker run -d --name photon-ro --restart unless-stopped \
    -p 127.0.0.1:2322:2322 \
    -v "$PHOTON_DIR/photon_data:/photon/photon_data" \
    rtuszik/photon-docker:latest
fi

# 3. Wait for startup: up to 30 probes, 2s apart.
log "Waiting for Photon to initialize..."
ready=0
for ((attempt = 1; attempt <= 30; attempt++)); do
  if curl -fs "http://localhost:2322/api?q=Bucuresti" >/dev/null 2>&1; then
    ready=1
    log "Photon ready."
    break
  fi
  sleep 2
done
if [ "$ready" -ne 1 ]; then
  log "FATAL: Photon did not answer on http://localhost:2322 after 30 attempts"
  exit 1
fi

# 4. Smoke tests
log "Smoke test 1 — Bucuresti:"
smoke_test "q=Bucuresti&limit=2"
log "Smoke test 2 — Cluj-Napoca Strada Memorandumului:"
smoke_test "q=Strada+Memorandumului+Cluj-Napoca&limit=1"
log "=== Photon setup complete (HTTP API on 127.0.0.1:2322) ==="
|
||||
@@ -0,0 +1,14 @@
|
||||
# One-shot job: runs enrich-anaf.sh as user bulibasa with TIER=daily and
# ANAF_CONCURRENCY=2. Output goes to the journal; the start job may run for
# up to 2h before systemd times it out.
[Unit]
Description=vreaudigital — daily ANAF delta enrichment (tier=daily, concurrency=2)
# network-online.target instead of the passive network.target: the script
# talks to Infisical and the ANAF web service, so it needs a configured
# network, which matters when a missed run is started right after boot.
Wants=network-online.target docker.service
After=network-online.target docker.service

[Service]
Type=oneshot
User=bulibasa
Environment=TIER=daily
Environment=ANAF_CONCURRENCY=2
ExecStart=/opt/vreaudigital/services/seap-scraper/cron/enrich-anaf.sh
StandardOutput=journal
StandardError=journal
TimeoutStartSec=2h
|
||||
@@ -0,0 +1,11 @@
|
||||
# Fires vreaudigital-anaf-daily.service every day at 02:00 (plus up to 300s
# of random delay). Persistent=true runs a missed tick once after boot.
[Unit]
Description=vreaudigital — ANAF delta enrichment daily at 02:00

[Timer]
OnCalendar=*-*-* 02:00:00
Persistent=true
RandomizedDelaySec=300
# Name the target explicitly instead of Requires= in [Unit]: a Requires=
# dependency starts the service every time the timer unit itself is
# activated (boot, `systemctl enable --now`), not only on schedule.
Unit=vreaudigital-anaf-daily.service

[Install]
WantedBy=timers.target
|
||||
@@ -0,0 +1,11 @@
|
||||
# One-shot job: runs refresh-mvs.sh as user bulibasa; output goes to the
# journal.
[Unit]
Description=vreaudigital — refresh seap materialized views
# network.target is a passive target that does not guarantee a usable
# network. NOTE(review): presumably the script fetches DATABASE_URL from
# Infisical like the sibling jobs — confirm; if it only talks to a local
# database this ordering is merely harmless.
Wants=network-online.target
After=network-online.target

[Service]
Type=oneshot
User=bulibasa
ExecStart=/opt/vreaudigital/services/seap-scraper/cron/refresh-mvs.sh
StandardOutput=journal
StandardError=journal
|
||||
@@ -0,0 +1,11 @@
|
||||
# Fires vreaudigital-mvs.service every night at 04:00 (plus up to 600s of
# random delay). Persistent=true runs a missed tick once after boot.
[Unit]
Description=vreaudigital — refresh materialized views nightly at 04:00

[Timer]
OnCalendar=*-*-* 04:00:00
Persistent=true
RandomizedDelaySec=600
# Name the target explicitly instead of Requires= in [Unit]: a Requires=
# dependency starts the service every time the timer unit itself is
# activated (boot, `systemctl enable --now`), not only on schedule.
Unit=vreaudigital-mvs.service

[Install]
WantedBy=timers.target
|
||||
@@ -0,0 +1,12 @@
|
||||
# One-shot job: runs import-onrc-fresh.sh as user bulibasa. Output goes to
# the journal; the start job may run for up to 2h before systemd times it out.
[Unit]
Description=vreaudigital — fetch latest ONRC bulk and import (weekly check, monthly real change)
# network-online.target instead of the passive network.target: the job
# fetches a bulk file, so it needs a configured network, which matters when
# a missed run is started right after boot.
Wants=network-online.target
After=network-online.target

[Service]
Type=oneshot
User=bulibasa
ExecStart=/opt/vreaudigital/services/seap-scraper/cron/import-onrc-fresh.sh
StandardOutput=journal
StandardError=journal
TimeoutStartSec=2h
|
||||
@@ -0,0 +1,11 @@
|
||||
# Fires vreaudigital-onrc-weekly.service every Tuesday at 03:00 (plus up to
# 900s of random delay). Persistent=true runs a missed tick once after boot.
[Unit]
Description=vreaudigital — weekly ONRC fresh-check Tuesday 03:00

[Timer]
OnCalendar=Tue *-*-* 03:00:00
Persistent=true
RandomizedDelaySec=900
# Name the target explicitly instead of Requires= in [Unit]: a Requires=
# dependency starts the service every time the timer unit itself is
# activated (boot, `systemctl enable --now`), not only on schedule.
Unit=vreaudigital-onrc-weekly.service

[Install]
WantedBy=timers.target
|
||||
@@ -0,0 +1,18 @@
|
||||
# Long-running Photon geocoder: starts the Photon 0.5.0 jar under Java with
# an 8G max heap as user bulibasa, data directory /opt/photon, listening on
# port 2322. Restarted 15s after a failure; stdout and stderr are appended
# to /var/log/vreaudigital-photon.log.
# NOTE(review): the Photon docker setup script in this same commit publishes
# a container on port 2322 as well — confirm only one of the two is enabled.
[Unit]
|
||||
Description=vreaudigital — Photon 0.5.0 geocoder (Elasticsearch backend) for RO firms
|
||||
After=network.target
|
||||
|
||||
[Service]
|
||||
Type=simple
|
||||
User=bulibasa
|
||||
WorkingDirectory=/opt/photon
|
||||
ExecStart=/usr/bin/java -Xmx8G -jar /opt/photon/photon-0.5.0.jar -data-dir /opt/photon -listen-port 2322
|
||||
Restart=on-failure
|
||||
RestartSec=15
|
||||
StandardOutput=append:/var/log/vreaudigital-photon.log
|
||||
StandardError=append:/var/log/vreaudigital-photon.log
|
||||
LimitNOFILE=65536
|
||||
LimitMEMLOCK=infinity
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
Reference in New Issue
Block a user