a6c03a091e
Moved from gov-agreg/src/pages/achizitii/* to root (drop prefix). - 22 pages migrated, 127 files total - All internal links: /achizitii/X → /X (176 occurrences fixed) - AchizitiiLayout subnav rewritten: /X paths, top-right link to vreaudigital.ro hub - BaseLayout new (vreau.digital branding, OG tags, site URL) - astro.config.mjs: site https://vreau.digital, server output (was static) - docker-compose: port 5096 (vreaudigital is 5095), container vreau-digital - deploy.sh: paths /opt/vreau-digital, log /var/log/vreau-digital-deploy.log Backend shared with gov-agreg: - PostgreSQL satra (same schemas: seap, firms, anaf, anre, ...) - Photon, Martin tiles - Infisical /vreaudigital path (DATABASE_URL etc. shared) build: PASS (npx astro check 0 errors, npm run build 5s vite + 10s server)
273 lines
11 KiB
Bash
Executable File
273 lines
11 KiB
Bash
Executable File
#!/bin/bash
# Import ONRC bulk CSV files into firms.entities.
# Source: data.gov.ro (CC-BY 4.0), updated weekly.
#
# Pipeline:
#   1. TRUNCATE staging tables
#   2. COPY each CSV (/opt/vreaudigital/data/onrc/*.csv) into the corresponding staging table
#   3. UPSERT into firms.entities (keyed on CUI), joining the staging tables on cod_inmatriculare
#   4. Resolve siruta UAT for each firm by matching normalized county + localitate names
#
# Idempotent. Run nightly via cron.

# Strict mode: abort on a failed command, an unset variable, or a failed
# pipeline stage.
set -euo pipefail

# Directory holding the downloaded ONRC CSV files.
DATA_DIR=/opt/vreaudigital/data/onrc
# File that log() appends every message to.
LOG=/var/log/vreaudigital-onrc-import.log
# Print a timestamped message to stdout and append the same line to $LOG.
#   $1 - message text
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1" | tee -a "$LOG"
}

log "=== ONRC import started ==="

# ── Resolve DATABASE_URL via Infisical Machine Identity ──
# The sourced file presumably defines INFISICAL_API_URL, INFISICAL_CLIENT_ID,
# INFISICAL_CLIENT_SECRET, INFISICAL_PROJECT_ID, INFISICAL_ENV and
# INFISICAL_PATH — those are the variables read below (set -u aborts if one
# is missing).
# shellcheck source=/dev/null
source /opt/vreaudigital/.infisical-mi

# The client secret and the access token are handed to the CLI through the
# environment of the single command that needs them instead of argv, so they
# do not show up in `ps aux`.
TOKEN=$(
  INFISICAL_UNIVERSAL_AUTH_CLIENT_ID="$INFISICAL_CLIENT_ID" \
  INFISICAL_UNIVERSAL_AUTH_CLIENT_SECRET="$INFISICAL_CLIENT_SECRET" \
    infisical login --method=universal-auth \
      --domain="$INFISICAL_API_URL" \
      --silent --plain
)
if [[ -z "$TOKEN" ]]; then
  log "FATAL: Infisical login returned an empty token"; exit 1
fi

# `infisical run` injects the secrets into the child's environment; the child
# only prints DATABASE_URL back to us. printf is used because some sh `echo`
# implementations interpret backslashes in the value.
DATABASE_URL=$(
  INFISICAL_TOKEN="$TOKEN" \
    infisical run --domain="$INFISICAL_API_URL" \
      --projectId="$INFISICAL_PROJECT_ID" \
      --env="$INFISICAL_ENV" --path="$INFISICAL_PATH" \
      --silent \
      -- sh -c 'printf "%s\n" "$DATABASE_URL"'
)
if [[ -z "$DATABASE_URL" ]]; then
  log "FATAL: DATABASE_URL is empty (secret missing at the configured Infisical path?)"; exit 1
fi
# ── Export libpq connection settings ──
# psql cannot read a connection URL from stdin, and a URL on its command line
# would be visible in `ps aux`, so the URL is split into libpq environment
# variables instead.
# Accepted shape: postgres[ql]://USER[:PASS]@HOST[:PORT]/DBNAME[?params]
# Query parameters (e.g. ?schema=...) are ignored.

# Percent-decode one URL component, e.g. "p%40ss" -> "p@ss".
#   $1 - encoded component; the decoded text is written to stdout.
urldecode() {
  local s=${1//\\/\\\\}   # protect literal backslashes from printf %b
  printf '%b' "${s//%/\\x}"
}

pg_url_re='^postgres(ql)?://([^:@/]+)(:([^@]*))?@([^:/?]+)(:([0-9]+))?/([^?]+)'
if [[ ! "$DATABASE_URL" =~ $pg_url_re ]]; then
  # Deliberately do not print the URL: it contains the password.
  log "FATAL: DATABASE_URL is not of the form postgresql://USER:PASS@HOST[:PORT]/DBNAME"
  exit 1
fi

PGUSER=$(urldecode "${BASH_REMATCH[2]}")
PGPASSWORD=$(urldecode "${BASH_REMATCH[4]}")
PGHOST=${BASH_REMATCH[5]}
PGPORT=${BASH_REMATCH[7]:-5432}   # libpq default when the URL has no port
PGDATABASE=$(urldecode "${BASH_REMATCH[8]}")
export PGUSER PGPASSWORD PGHOST PGPORT PGDATABASE

# Drop the raw secrets from the shell once they live in the PG* variables.
unset DATABASE_URL TOKEN pg_url_re

# ── Sanity check files ──
# The four mandatory CSVs must exist and be non-empty; this runs before any
# staging table is truncated.
required_csvs=(
  od_firme.csv
  od_caen_autorizat.csv
  od_stare_firma.csv
  od_reprezentanti_legali.csv
)
for f in "${required_csvs[@]}"; do
  if [[ ! -s "$DATA_DIR/$f" ]]; then
    log "FATAL: $DATA_DIR/$f missing or empty"; exit 1
  fi
done

# Label the import with the name of the directory that actually holds
# od_firme.csv (symlinks resolved), capped at 40 bytes.
DATASET_NAME=$(basename "$(dirname "$(readlink -f "$DATA_DIR/od_firme.csv")")" | head -c 40)
log "Dataset name (best guess): $DATASET_NAME"

# ── Stage CSVs ──
# Empty all four staging tables in one statement before reloading them.
log "Truncating staging tables..."
psql -v ON_ERROR_STOP=1 -c "
  TRUNCATE TABLE firms.staging_onrc_firme, firms.staging_onrc_caen,
                 firms.staging_onrc_stare, firms.staging_onrc_reprezentanti;
"

# Load one '^'-delimited CSV from $DATA_DIR into a table with psql's
# client-side \copy (first line is a header, empty fields become NULL).
#   $1 - target table
#   $2 - comma-separated column list, in CSV column order
#   $3 - CSV file name inside $DATA_DIR
# The heredoc is unquoted so the shell expands the variables; that is why the
# backslashes of \copy and E'\b' are doubled.
# NOTE(review): QUOTE E'\b' (backspace) presumably exists to switch CSV quoting
# off for data that contains raw double quotes — confirm.
copy_csv() {
  local table=$1 columns=$2 file=$3
  psql -v ON_ERROR_STOP=1 <<COPYEOF
\\copy $table ($columns) FROM '$DATA_DIR/$file' WITH (FORMAT csv, DELIMITER '^', HEADER true, NULL '', QUOTE E'\\b');
COPYEOF
}

log "COPY od_firme.csv (683MB)..."
time copy_csv firms.staging_onrc_firme \
  "denumire, cui, cod_inmatriculare, data_inmatriculare, euid, forma_juridica, adr_tara, adr_judet, adr_localitate, adr_strada, adr_numar, adr_bloc, adr_scara, adr_etaj, adr_apartament, adr_cod_postal, adr_sector, adr_completare, web, tara_firma_mama" \
  od_firme.csv

log "COPY od_caen_autorizat.csv..."
copy_csv firms.staging_onrc_caen \
  "cod_inmatriculare, cod_caen, ver_caen" \
  od_caen_autorizat.csv

log "COPY od_stare_firma.csv..."
copy_csv firms.staging_onrc_stare \
  "cod_inmatriculare, cod_stare" \
  od_stare_firma.csv

log "COPY od_reprezentanti_legali.csv..."
copy_csv firms.staging_onrc_reprezentanti \
  "cod_inmatriculare, persoana, calitate, data_nastere, localitate_nastere, judet_nastere, tara_nastere, localitate, judet, tara" \
  od_reprezentanti_legali.csv

# Optional: extras from the same dataset (representatives of individual/family
# enterprises and branches registered in other EU member states).
# Idempotent — each run replaces the whole table.

# Replace the contents of a table with a '^'-delimited CSV from $DATA_DIR.
# TRUNCATE and \copy run in one transaction: if the load fails, psql exits
# (ON_ERROR_STOP), the transaction is rolled back and the previous rows stay
# in place instead of leaving the table empty until the next run.
#   $1 - target table
#   $2 - comma-separated column list, in CSV column order
#   $3 - CSV file name inside $DATA_DIR
reload_from_csv() {
  local table=$1 columns=$2 file=$3
  psql -v ON_ERROR_STOP=1 <<COPYEOF
BEGIN;
TRUNCATE TABLE $table;
\\copy $table ($columns) FROM '$DATA_DIR/$file' WITH (FORMAT csv, DELIMITER '^', HEADER true, NULL '', QUOTE E'\\b');
COMMIT;
COPYEOF
}

if [[ -s "$DATA_DIR/od_reprezentanti_if.csv" ]]; then
  log "COPY od_reprezentanti_if.csv (~13MB)..."
  reload_from_csv firms.reprezentanti_if \
    "cod_inmatriculare, nume, data_nastere, localitate_nastere, judet_nastere, tara_nastere, calitate" \
    od_reprezentanti_if.csv
else
  log "[SKIP] od_reprezentanti_if.csv missing"
fi

if [[ -s "$DATA_DIR/od_sucursale_alte_state_membre.csv" ]]; then
  log "COPY od_sucursale_alte_state_membre.csv (small)..."
  reload_from_csv firms.sucursale_ue \
    "cod_inmatriculare, tip_unitate, denumire_sucursala, euid, cod_fiscal_strain, tara" \
    od_sucursale_alte_state_membre.csv
else
  log "[SKIP] od_sucursale_alte_state_membre.csv missing"
fi

# ── Aggregate into firms.entities ──
log "Building firms.entities from staging..."
# The heredoc delimiter is quoted, so the shell expands nothing inside the SQL.
# The dataset label is passed as a psql variable and quoted by psql itself
# (:'dataset_name'), so a quote character in the directory name cannot break
# or alter the statement.
time psql -v ON_ERROR_STOP=1 -v dataset_name="$DATASET_NAME" <<'SQL'
-- One state code per cod_inmatriculare. Only cod_inmatriculare and cod_stare
-- are loaded into staging, so there is nothing to order "latest" by; the
-- highest cod_stare is kept to make the pick deterministic.
DROP TABLE IF EXISTS tmp_stare_agg;
CREATE TEMP TABLE tmp_stare_agg AS
SELECT DISTINCT ON (cod_inmatriculare) cod_inmatriculare, cod_stare
FROM firms.staging_onrc_stare
WHERE cod_inmatriculare IS NOT NULL
ORDER BY cod_inmatriculare, cod_stare DESC;

-- Sorted, de-duplicated array of CAEN codes per cod_inmatriculare
DROP TABLE IF EXISTS tmp_caen_agg;
CREATE TEMP TABLE tmp_caen_agg AS
SELECT
  cod_inmatriculare,
  array_agg(DISTINCT cod_caen ORDER BY cod_caen) FILTER (WHERE cod_caen IS NOT NULL) AS caens
FROM firms.staging_onrc_caen
WHERE cod_inmatriculare IS NOT NULL
GROUP BY cod_inmatriculare;

-- JSON array of legal representatives per cod_inmatriculare
DROP TABLE IF EXISTS tmp_rep_agg;
CREATE TEMP TABLE tmp_rep_agg AS
SELECT
  cod_inmatriculare,
  jsonb_agg(jsonb_build_object(
    'persoana', persoana,
    'calitate', calitate,
    'localitate', localitate,
    'judet', judet,
    'tara', tara
  )) AS rep_legali
FROM firms.staging_onrc_reprezentanti
WHERE cod_inmatriculare IS NOT NULL AND persoana IS NOT NULL
GROUP BY cod_inmatriculare;

-- UPSERT firms.entities; cui is the conflict key.
-- Rows with an empty/0 CUI or no name are skipped. DISTINCT ON (cui): if several
-- ONRC rows share a CUI (rare, happens with reorganizations), keep the most
-- recently registered one.
INSERT INTO firms.entities (
  cui, cod_inmatriculare, euid, name, forma_juridica,
  adr_tara, adr_judet, adr_localitate, adr_strada, adr_numar,
  adr_bloc, adr_scara, adr_etaj, adr_apartament, adr_cod_postal,
  adr_sector, adr_completare,
  adr_full,
  data_inmatriculare,
  registration_year,
  web,
  tara_firma_mama,
  caen_autorizate,
  rep_legali,
  status_text,
  is_radiated_onrc,
  source_onrc_dataset,
  onrc_fetched_at,
  updated_at
)
SELECT DISTINCT ON (f.cui)
  f.cui,
  f.cod_inmatriculare,
  f.euid,
  f.denumire,
  f.forma_juridica,
  f.adr_tara, f.adr_judet, f.adr_localitate, f.adr_strada, f.adr_numar,
  f.adr_bloc, f.adr_scara, f.adr_etaj, f.adr_apartament, f.adr_cod_postal,
  f.adr_sector, f.adr_completare,
  -- Build adr_full for geocoding: "<street> nr.<no>, <locality>, <county>, Romania".
  -- NULL parts are skipped by concat_ws; the literal 'Romania' is always appended.
  NULLIF(trim(concat_ws(', ',
    NULLIF(trim(concat_ws(' ', f.adr_strada,
      CASE WHEN f.adr_numar IS NOT NULL THEN 'nr.' || f.adr_numar END
    )), ''),
    f.adr_localitate,
    f.adr_judet,
    'Romania'
  )), '') AS adr_full,
  -- ONRC format: DD.MM.YYYY
  CASE WHEN f.data_inmatriculare ~ '^\d{2}\.\d{2}\.\d{4}'
       THEN to_date(f.data_inmatriculare, 'DD.MM.YYYY')
       ELSE NULL END AS data_inmatriculare,
  -- Year: the last four characters when the value ends in four digits;
  -- otherwise the YYYY part of a leading DD.MM.YYYY (positions 7-10), so
  -- trailing text after the date can never reach the ::int cast.
  CASE WHEN f.data_inmatriculare ~ '\d{4}$'
       THEN right(f.data_inmatriculare, 4)::int
       WHEN f.data_inmatriculare ~ '^\d{2}\.\d{2}\.\d{4}'
       THEN substr(f.data_inmatriculare, 7, 4)::int
       ELSE NULL END AS registration_year,
  f.web,
  f.tara_firma_mama,
  ca.caens,
  ra.rep_legali,
  -- Status: the raw stare code is stored as-is; decoding it through the ONRC
  -- nomenclator is still TODO.
  COALESCE(ss.cod_stare, 'unknown') AS status_text,
  false AS is_radiated_onrc,  -- TODO: import ONRC stare nomenclator and detect
  :'dataset_name' AS source_onrc_dataset,
  now() AS onrc_fetched_at,
  now() AS updated_at
FROM firms.staging_onrc_firme f
LEFT JOIN tmp_caen_agg ca ON ca.cod_inmatriculare = f.cod_inmatriculare
LEFT JOIN tmp_rep_agg ra ON ra.cod_inmatriculare = f.cod_inmatriculare
LEFT JOIN tmp_stare_agg ss ON ss.cod_inmatriculare = f.cod_inmatriculare
WHERE f.cui IS NOT NULL
  AND f.cui != ''
  AND f.cui != '0'
  AND f.denumire IS NOT NULL
-- Tiebreak on the parsed date: the staging column is DD.MM.YYYY text, and
-- sorting that text would rank rows by day-of-month, not by recency.
ORDER BY f.cui,
  CASE WHEN f.data_inmatriculare ~ '^\d{2}\.\d{2}\.\d{4}'
       THEN to_date(f.data_inmatriculare, 'DD.MM.YYYY') END DESC NULLS LAST
ON CONFLICT (cui) DO UPDATE SET
  cod_inmatriculare = EXCLUDED.cod_inmatriculare,
  euid = EXCLUDED.euid,
  name = EXCLUDED.name,
  forma_juridica = EXCLUDED.forma_juridica,
  adr_tara = EXCLUDED.adr_tara,
  adr_judet = EXCLUDED.adr_judet,
  adr_localitate = EXCLUDED.adr_localitate,
  adr_strada = EXCLUDED.adr_strada,
  adr_numar = EXCLUDED.adr_numar,
  adr_bloc = EXCLUDED.adr_bloc,
  adr_scara = EXCLUDED.adr_scara,
  adr_etaj = EXCLUDED.adr_etaj,
  adr_apartament = EXCLUDED.adr_apartament,
  adr_cod_postal = EXCLUDED.adr_cod_postal,
  adr_sector = EXCLUDED.adr_sector,
  adr_completare = EXCLUDED.adr_completare,
  adr_full = EXCLUDED.adr_full,
  data_inmatriculare = EXCLUDED.data_inmatriculare,
  registration_year = EXCLUDED.registration_year,
  web = EXCLUDED.web,
  tara_firma_mama = EXCLUDED.tara_firma_mama,
  caen_autorizate = EXCLUDED.caen_autorizate,
  rep_legali = EXCLUDED.rep_legali,
  status_text = EXCLUDED.status_text,
  is_radiated_onrc = EXCLUDED.is_radiated_onrc,
  source_onrc_dataset = EXCLUDED.source_onrc_dataset,
  onrc_fetched_at = EXCLUDED.onrc_fetched_at,
  updated_at = now();

-- Match siruta UAT for each firm via norm_uat_name: county and locality must
-- both be equal after normalization. Only firms without a siruta are touched;
-- when several UATs match, the lowest siruta is taken.
UPDATE firms.entities f
SET siruta = sub.siruta
FROM (
  SELECT DISTINCT ON (e.cui) e.cui, gu.siruta
  FROM firms.entities e
  JOIN public."GisUat" gu
    ON seap.norm_uat_name(gu.county) = seap.norm_uat_name(e.adr_judet)
   AND seap.norm_uat_name(gu.name) = seap.norm_uat_name(e.adr_localitate)
  WHERE e.siruta IS NULL
    AND e.adr_judet IS NOT NULL
    AND e.adr_localitate IS NOT NULL
  ORDER BY e.cui, gu.siruta
) sub
WHERE f.cui = sub.cui;
SQL

# ── Stats ──
# Print summary counts over firms.entities; psql's stdout and stderr both go
# to the terminal and are appended to $LOG.
log "Final stats:"
psql -c "
  SELECT
    COUNT(*) AS total_firms,
    COUNT(*) FILTER (WHERE siruta IS NOT NULL) AS cu_siruta,
    COUNT(*) FILTER (WHERE rep_legali IS NOT NULL) AS cu_admins,
    COUNT(*) FILTER (WHERE caen_autorizate IS NOT NULL) AS cu_caen,
    COUNT(*) FILTER (WHERE is_radiated_onrc = true) AS radiate
  FROM firms.entities;
" 2>&1 | tee -a "$LOG"

log "=== ONRC import complete ==="