Files
vreau-digital/services/seap-scraper/cron/import-onrc.sh
T
Claude VM a6c03a091e initial: split from gov-agreg — vreau.digital standalone platform
Moved from gov-agreg/src/pages/achizitii/* to root (drop prefix).
- 22 pages migrated, 127 files total
- All internal links: /achizitii/X → /X (176 occurrences fixed)
- AchizitiiLayout subnav rewritten: /X paths, top-right link to vreaudigital.ro hub
- BaseLayout new (vreau.digital branding, OG tags, site URL)
- astro.config.mjs: site https://vreau.digital, server output (was static)
- docker-compose: port 5096 (vreaudigital is 5095), container vreau-digital
- deploy.sh: paths /opt/vreau-digital, log /var/log/vreau-digital-deploy.log

Backend shared with gov-agreg:
- PostgreSQL satra (same schemas: seap, firms, anaf, anre, ...)
- Photon, Martin tiles
- Infisical /vreaudigital path (DATABASE_URL etc. shared)

build: PASS (npx astro check 0 errors, npm run build 5s vite + 10s server)
2026-05-13 00:10:32 +03:00

273 lines
11 KiB
Bash
Executable File

#!/bin/bash
# Import ONRC bulk CSV files into firms.entities.
# Source: data.gov.ro (CC-BY 4.0), updated weekly.
#
# Pipeline:
# 1. TRUNCATE staging tables
# 2. COPY each CSV ($DATA_DIR/*.csv) into the corresponding staging table
# 3. UPSERT into firms.entities (conflict key: cui), joining the per-firm
#    aggregates on cod_inmatriculare
# 4. Resolve siruta UAT for each firm by matching normalized county + localitate names
#
# Idempotent. Run nightly via cron.
# Strict mode: stop on command failure, on use of an unset variable, and when
# any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# Log file that log() appends to.
LOG='/var/log/vreaudigital-onrc-import.log'
# Directory holding the ONRC CSV files that get imported.
DATA_DIR='/opt/vreaudigital/data/onrc'
# Print a timestamped message to stdout and append the same line to $LOG.
# Arguments: $1 - message text
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1" \
    | tee -a "$LOG"
}
log "=== ONRC import started ==="

# ── Resolve DATABASE_URL via Infisical Machine Identity ──
# The sourced file must define the INFISICAL_* variables read below
# (API_URL, CLIENT_ID, CLIENT_SECRET, PROJECT_ID, ENV, PATH).
source /opt/vreaudigital/.infisical-mi

# NOTE(review): --client-secret and --token are still passed as argv to the
# infisical CLI and are therefore visible in `ps` while those commands run.
TOKEN=$(infisical login --method=universal-auth \
  --domain="$INFISICAL_API_URL" \
  --client-id="$INFISICAL_CLIENT_ID" \
  --client-secret="$INFISICAL_CLIENT_SECRET" \
  --silent --plain)
DATABASE_URL=$(infisical run --domain="$INFISICAL_API_URL" \
  --projectId="$INFISICAL_PROJECT_ID" \
  --env="$INFISICAL_ENV" --path="$INFISICAL_PATH" \
  --silent --token="$TOKEN" \
  -- sh -c 'echo "$DATABASE_URL"')

if [[ -z "$DATABASE_URL" ]]; then
  log "FATAL: Infisical returned an empty DATABASE_URL"
  exit 1
fi

# psql is given its credentials through libpq environment variables instead of
# a connection URL on its command line, which would show up in `ps aux`.
#
# Expected shape: postgres[ql]://USER:PASS@HOST[:PORT]/DBNAME[?params]
# The URL is matched as a whole so that a non-conforming value is rejected.
# (Per-field sed extraction would echo the entire URL, password included, into
# whichever field failed to match, e.g. PGPORT when the port is omitted.)
# Query parameters such as ?schema=... are simply not captured.
# NOTE(review): percent-encoded characters in the password are passed through
# undecoded; IPv6 literal hosts are not supported.
db_url_re='^postgres(ql)?://([^:@/]+):([^@]*)@([^:/?]+)(:([0-9]+))?/([^?]+)'
if [[ "$DATABASE_URL" =~ $db_url_re ]]; then
  export PGUSER="${BASH_REMATCH[2]}" \
         PGPASSWORD="${BASH_REMATCH[3]}" \
         PGHOST="${BASH_REMATCH[4]}" \
         PGPORT="${BASH_REMATCH[6]:-5432}" \
         PGDATABASE="${BASH_REMATCH[7]}"
else
  # Deliberately does not print the URL: it contains the password.
  log "FATAL: DATABASE_URL is not of the form postgresql://USER:PASS@HOST[:PORT]/DBNAME"
  exit 1
fi
unset DATABASE_URL TOKEN db_url_re
# ── Sanity check files ──
# The four mandatory CSVs must exist and be non-empty before anything is loaded.
required_csvs=(
  od_firme.csv
  od_caen_autorizat.csv
  od_stare_firma.csv
  od_reprezentanti_legali.csv
)
for csv_name in "${required_csvs[@]}"; do
  [[ -s "$DATA_DIR/$csv_name" ]] || {
    log "FATAL: $DATA_DIR/$csv_name missing or empty"
    exit 1
  }
done

# Dataset label = name of the directory that od_firme.csv really lives in
# (symlinks resolved), capped at 40 bytes.
DATASET_NAME=$(
  basename "$(dirname "$(readlink -f "$DATA_DIR/od_firme.csv")")" | head -c 40
)
log "Dataset name (best guess): $DATASET_NAME"
# ── Stage CSVs ──

# Load one CSV from $DATA_DIR into a staging table with psql's client-side \copy.
# Arguments: $1 - target table
#            $2 - comma-separated column list
#            $3 - CSV file name inside $DATA_DIR
# File format: '^'-delimited CSV with a header row, empty fields read as NULL,
# and the backspace character as QUOTE (presumably so ordinary quote
# characters in the data are taken literally).
stage_csv() {
  psql -v ON_ERROR_STOP=1 <<COPYEOF
\\copy $1 ($2) FROM '$DATA_DIR/$3' WITH (FORMAT csv, DELIMITER '^', HEADER true, NULL '', QUOTE E'\\b');
COPYEOF
}

log "Truncating staging tables..."
psql -v ON_ERROR_STOP=1 -c "
TRUNCATE TABLE firms.staging_onrc_firme, firms.staging_onrc_caen,
firms.staging_onrc_stare, firms.staging_onrc_reprezentanti;
"

log "COPY od_firme.csv (683MB)..."
time stage_csv firms.staging_onrc_firme \
  'denumire, cui, cod_inmatriculare, data_inmatriculare, euid, forma_juridica, adr_tara, adr_judet, adr_localitate, adr_strada, adr_numar, adr_bloc, adr_scara, adr_etaj, adr_apartament, adr_cod_postal, adr_sector, adr_completare, web, tara_firma_mama' \
  od_firme.csv

log "COPY od_caen_autorizat.csv..."
stage_csv firms.staging_onrc_caen \
  'cod_inmatriculare, cod_caen, ver_caen' \
  od_caen_autorizat.csv

log "COPY od_stare_firma.csv..."
stage_csv firms.staging_onrc_stare \
  'cod_inmatriculare, cod_stare' \
  od_stare_firma.csv

log "COPY od_reprezentanti_legali.csv..."
stage_csv firms.staging_onrc_reprezentanti \
  'cod_inmatriculare, persoana, calitate, data_nastere, localitate_nastere, judet_nastere, tara_nastere, localitate, judet, tara' \
  od_reprezentanti_legali.csv
# Optional extras from the same dataset: od_reprezentanti_if.csv
# (representatives of "IF" entities) and od_sucursale_alte_state_membre.csv
# (branches in other EU member states). A missing or empty file is skipped.
#
# Each target table is fully reloaded on every run. TRUNCATE and \copy are sent
# in ONE psql session inside an explicit transaction: with ON_ERROR_STOP=1 a
# failed \copy makes psql exit before COMMIT, the TRUNCATE is rolled back, and
# the previous contents remain in place instead of an empty table.
# Trade-off: TRUNCATE holds its table lock until COMMIT, i.e. for the duration
# of the load.
if [ -s "$DATA_DIR/od_reprezentanti_if.csv" ]; then
  log "COPY od_reprezentanti_if.csv (~13MB)..."
  psql -v ON_ERROR_STOP=1 <<COPYEOF
BEGIN;
TRUNCATE TABLE firms.reprezentanti_if;
\\copy firms.reprezentanti_if (cod_inmatriculare, nume, data_nastere, localitate_nastere, judet_nastere, tara_nastere, calitate) FROM '$DATA_DIR/od_reprezentanti_if.csv' WITH (FORMAT csv, DELIMITER '^', HEADER true, NULL '', QUOTE E'\\b');
COMMIT;
COPYEOF
else
  log "[SKIP] od_reprezentanti_if.csv missing"
fi

if [ -s "$DATA_DIR/od_sucursale_alte_state_membre.csv" ]; then
  log "COPY od_sucursale_alte_state_membre.csv (small)..."
  psql -v ON_ERROR_STOP=1 <<COPYEOF
BEGIN;
TRUNCATE TABLE firms.sucursale_ue;
\\copy firms.sucursale_ue (cod_inmatriculare, tip_unitate, denumire_sucursala, euid, cod_fiscal_strain, tara) FROM '$DATA_DIR/od_sucursale_alte_state_membre.csv' WITH (FORMAT csv, DELIMITER '^', HEADER true, NULL '', QUOTE E'\\b');
COMMIT;
COPYEOF
else
  log "[SKIP] od_sucursale_alte_state_membre.csv missing"
fi
# ── Aggregate into firms.entities ──
# One psql session: build three per-firm temp aggregates from the staging
# tables, upsert firms.entities keyed on cui, then fill in missing siruta codes.
#
# The heredoc delimiter is quoted, so the shell passes the SQL through verbatim
# (regex backslashes and '$' need no escaping). The dataset name is handed to
# psql as a variable and inserted with :'dataset_name', which psql quotes as a
# SQL literal; a quote character in the directory name cannot break the query.
log "Building firms.entities from staging..."
time psql -v ON_ERROR_STOP=1 -v dataset_name="$DATASET_NAME" <<'SQL'
-- One stare row per cod_inmatriculare. A firm may have several rows; the one
-- with the highest cod_stare is kept.
-- NOTE(review): only (cod_inmatriculare, cod_stare) are staged, so there is no
-- date to order by; "highest code" stands in for "latest". Confirm intent.
DROP TABLE IF EXISTS tmp_stare_agg;
CREATE TEMP TABLE tmp_stare_agg AS
SELECT DISTINCT ON (cod_inmatriculare) cod_inmatriculare, cod_stare
FROM firms.staging_onrc_stare
WHERE cod_inmatriculare IS NOT NULL
ORDER BY cod_inmatriculare, cod_stare DESC;

-- Distinct, sorted CAEN codes per cod_inmatriculare.
DROP TABLE IF EXISTS tmp_caen_agg;
CREATE TEMP TABLE tmp_caen_agg AS
SELECT
    cod_inmatriculare,
    array_agg(DISTINCT cod_caen ORDER BY cod_caen) FILTER (WHERE cod_caen IS NOT NULL) AS caens
FROM firms.staging_onrc_caen
WHERE cod_inmatriculare IS NOT NULL
GROUP BY cod_inmatriculare;

-- Legal representatives per cod_inmatriculare as a JSON array of objects.
DROP TABLE IF EXISTS tmp_rep_agg;
CREATE TEMP TABLE tmp_rep_agg AS
SELECT
    cod_inmatriculare,
    jsonb_agg(jsonb_build_object(
        'persoana', persoana,
        'calitate', calitate,
        'localitate', localitate,
        'judet', judet,
        'tara', tara
    )) AS rep_legali
FROM firms.staging_onrc_reprezentanti
WHERE cod_inmatriculare IS NOT NULL AND persoana IS NOT NULL
GROUP BY cod_inmatriculare;

-- UPSERT firms.entities, conflict key: cui.
-- Rows with an empty/'0' CUI or no name are skipped. DISTINCT ON (f.cui): when
-- several ONRC rows share a CUI (rare, happens with reorganization) the most
-- recently registered one wins (see ORDER BY below).
INSERT INTO firms.entities (
    cui, cod_inmatriculare, euid, name, forma_juridica,
    adr_tara, adr_judet, adr_localitate, adr_strada, adr_numar,
    adr_bloc, adr_scara, adr_etaj, adr_apartament, adr_cod_postal,
    adr_sector, adr_completare,
    adr_full,
    data_inmatriculare,
    registration_year,
    web,
    tara_firma_mama,
    caen_autorizate,
    rep_legali,
    status_text,
    is_radiated_onrc,
    source_onrc_dataset,
    onrc_fetched_at,
    updated_at
)
SELECT DISTINCT ON (f.cui)
    f.cui,
    f.cod_inmatriculare,
    f.euid,
    f.denumire,
    f.forma_juridica,
    f.adr_tara, f.adr_judet, f.adr_localitate, f.adr_strada, f.adr_numar,
    f.adr_bloc, f.adr_scara, f.adr_etaj, f.adr_apartament, f.adr_cod_postal,
    f.adr_sector, f.adr_completare,
    -- adr_full for geocoding: "<strada> nr.<numar>, <localitate>, <judet>, Romania";
    -- concat_ws drops the NULL parts.
    NULLIF(trim(concat_ws(', ',
        NULLIF(trim(concat_ws(' ', f.adr_strada,
            CASE WHEN f.adr_numar IS NOT NULL THEN 'nr.' || f.adr_numar END
        )), ''),
        f.adr_localitate,
        f.adr_judet,
        'Romania'
    )), '') AS adr_full,
    -- ONRC format: DD.MM.YYYY (staged as text)
    CASE WHEN f.data_inmatriculare ~ '^\d{2}\.\d{2}\.\d{4}'
         THEN to_date(f.data_inmatriculare, 'DD.MM.YYYY')
         ELSE NULL END AS data_inmatriculare,
    -- Year: trailing four digits when present; otherwise, for values that start
    -- with DD.MM.YYYY but carry a suffix, characters 7-10. (Taking right(…, 4)
    -- in that second case would cast non-digits and abort the whole import.)
    CASE WHEN f.data_inmatriculare ~ '\d{4}$'
         THEN right(f.data_inmatriculare, 4)::int
         WHEN f.data_inmatriculare ~ '^\d{2}\.\d{2}\.\d{4}'
         THEN substring(f.data_inmatriculare from 7 for 4)::int
         ELSE NULL END AS registration_year,
    f.web,
    f.tara_firma_mama,
    ca.caens,
    ra.rep_legali,
    -- Status: the raw stare code is stored as-is.
    -- TODO: decode via the ONRC stare nomenclator and derive is_radiated_onrc
    -- from it; until then is_radiated_onrc is always false.
    COALESCE(ss.cod_stare, 'unknown') AS status_text,
    false AS is_radiated_onrc,
    :'dataset_name' AS source_onrc_dataset,
    now() AS onrc_fetched_at,
    now() AS updated_at
FROM firms.staging_onrc_firme f
LEFT JOIN tmp_caen_agg ca ON ca.cod_inmatriculare = f.cod_inmatriculare
LEFT JOIN tmp_rep_agg ra ON ra.cod_inmatriculare = f.cod_inmatriculare
LEFT JOIN tmp_stare_agg ss ON ss.cod_inmatriculare = f.cod_inmatriculare
WHERE f.cui IS NOT NULL
  AND f.cui != ''
  AND f.cui != '0'
  AND f.denumire IS NOT NULL
-- Sort on the parsed date, not the DD.MM.YYYY text (which would order by day
-- of month first). cod_inmatriculare makes the pick deterministic on ties.
ORDER BY f.cui,
         CASE WHEN f.data_inmatriculare ~ '^\d{2}\.\d{2}\.\d{4}'
              THEN to_date(f.data_inmatriculare, 'DD.MM.YYYY') END DESC NULLS LAST,
         f.cod_inmatriculare
ON CONFLICT (cui) DO UPDATE SET
    cod_inmatriculare = EXCLUDED.cod_inmatriculare,
    euid = EXCLUDED.euid,
    name = EXCLUDED.name,
    forma_juridica = EXCLUDED.forma_juridica,
    adr_tara = EXCLUDED.adr_tara,
    adr_judet = EXCLUDED.adr_judet,
    adr_localitate = EXCLUDED.adr_localitate,
    adr_strada = EXCLUDED.adr_strada,
    adr_numar = EXCLUDED.adr_numar,
    adr_bloc = EXCLUDED.adr_bloc,
    adr_scara = EXCLUDED.adr_scara,
    adr_etaj = EXCLUDED.adr_etaj,
    adr_apartament = EXCLUDED.adr_apartament,
    adr_cod_postal = EXCLUDED.adr_cod_postal,
    adr_sector = EXCLUDED.adr_sector,
    adr_completare = EXCLUDED.adr_completare,
    adr_full = EXCLUDED.adr_full,
    data_inmatriculare = EXCLUDED.data_inmatriculare,
    registration_year = EXCLUDED.registration_year,
    web = EXCLUDED.web,
    tara_firma_mama = EXCLUDED.tara_firma_mama,
    caen_autorizate = EXCLUDED.caen_autorizate,
    rep_legali = EXCLUDED.rep_legali,
    status_text = EXCLUDED.status_text,
    is_radiated_onrc = EXCLUDED.is_radiated_onrc,
    source_onrc_dataset = EXCLUDED.source_onrc_dataset,
    onrc_fetched_at = EXCLUDED.onrc_fetched_at,
    updated_at = now();

-- Fill siruta for firms that do not have one yet: county and locality must
-- both be equal after seap.norm_uat_name(); when several UATs match, the
-- lowest siruta is taken. Firms that already have a siruta are not revisited.
UPDATE firms.entities f
SET siruta = sub.siruta
FROM (
    SELECT DISTINCT ON (e.cui) e.cui, gu.siruta
    FROM firms.entities e
    JOIN public."GisUat" gu
      ON seap.norm_uat_name(gu.county) = seap.norm_uat_name(e.adr_judet)
     AND seap.norm_uat_name(gu.name) = seap.norm_uat_name(e.adr_localitate)
    WHERE e.siruta IS NULL
      AND e.adr_judet IS NOT NULL
      AND e.adr_localitate IS NOT NULL
    ORDER BY e.cui, gu.siruta
) sub
WHERE f.cui = sub.cui;
SQL
# ── Stats ──
# Summary counts over firms.entities; psql's stdout and stderr both go to the
# terminal and are appended to $LOG.
stats_query="
SELECT
COUNT(*) AS total_firms,
COUNT(*) FILTER (WHERE siruta IS NOT NULL) AS cu_siruta,
COUNT(*) FILTER (WHERE rep_legali IS NOT NULL) AS cu_admins,
COUNT(*) FILTER (WHERE caen_autorizate IS NOT NULL) AS cu_caen,
COUNT(*) FILTER (WHERE is_radiated_onrc = true) AS radiate
FROM firms.entities;
"
log "Final stats:"
psql -c "$stats_query" 2>&1 \
  | tee -a "$LOG"
log "=== ONRC import complete ==="