#!/bin/bash
# Import ONRC bulk CSV files into firms.entities.
# Source: data.gov.ro (CC-BY 4.0), updated weekly.
#
# Pipeline:
#   1. TRUNCATE staging tables
#   2. COPY each CSV ($DATA_DIR/*.csv) into corresponding staging table
#   3. UPSERT into firms.entities, joining on cod_inmatriculare
#   4. Resolve siruta UAT for each firm via county+localitate fuzzy match
#
# Idempotent. Run nightly via cron.

set -euo pipefail

DATA_DIR=/opt/vreaudigital/data/onrc
LOG=/var/log/vreaudigital-onrc-import.log

# Print a timestamped message to stdout and append it to $LOG.
#   $1 - message text
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1" | tee -a "$LOG"
}

log "=== ONRC import started ==="

# ── Resolve DATABASE_URL via Infisical Machine Identity ──
# The sourced file must define INFISICAL_API_URL, INFISICAL_CLIENT_ID,
# INFISICAL_CLIENT_SECRET, INFISICAL_PROJECT_ID, INFISICAL_ENV and
# INFISICAL_PATH (all are read below; `set -u` aborts if one is missing).
# shellcheck source=/dev/null
source /opt/vreaudigital/.infisical-mi

# Credentials are handed to the CLI through its environment variables rather
# than --client-id/--client-secret flags: argv is readable by every local user
# via `ps aux`, the environment is not.
TOKEN=$(
  INFISICAL_UNIVERSAL_AUTH_CLIENT_ID="$INFISICAL_CLIENT_ID" \
  INFISICAL_UNIVERSAL_AUTH_CLIENT_SECRET="$INFISICAL_CLIENT_SECRET" \
    infisical login --method=universal-auth \
      --domain="$INFISICAL_API_URL" \
      --silent --plain
)
if [[ -z "$TOKEN" ]]; then
  log "FATAL: Infisical login returned an empty token"
  exit 1
fi

# Same reasoning for the access token: INFISICAL_TOKEN instead of --token.
# printf (not echo) inside the child shell so backslashes in the URL survive
# regardless of which /bin/sh is installed.
DATABASE_URL=$(
  INFISICAL_TOKEN="$TOKEN" \
    infisical run --domain="$INFISICAL_API_URL" \
      --projectId="$INFISICAL_PROJECT_ID" \
      --env="$INFISICAL_ENV" --path="$INFISICAL_PATH" \
      --silent \
      -- sh -c 'printf "%s\n" "$DATABASE_URL"'
)

# `infisical run` succeeds even when the secret does not exist, which would
# leave us with an empty URL and psql silently falling back to libpq defaults.
if [[ -z "$DATABASE_URL" ]]; then
  log "FATAL: DATABASE_URL is empty (secret missing in Infisical?)"
  exit 1
fi

# Drop the `schema=` query parameter while keeping the query string
# well-formed:
#   ?schema=x&a=b  -> ?a=b      (parameter first or in the middle)
#   ?a=b&schema=x  -> ?a=b      (parameter last)
#   ?schema=x      -> (nothing)
DB=$(sed -E 's/([?&])schema=[^&]*&/\1/; s/[?&]schema=[^&]*$//; s/\?$//' <<<"$DATABASE_URL")

# The URL is never put on psql's command line (it would be visible in
# `ps aux`). psql cannot read a connection URL from stdin, so the URL is
# split into libpq environment variables instead.
# Parse URL: postgresql://USER[:PASS]@HOST[:PORT]/DBNAME[?params]
# DB must already hold the connection URL; abort instead of exporting empty
# PG* variables (psql would then connect using libpq defaults).
: "${DB:?DB must hold the PostgreSQL connection URL}"

# Decode %HH escapes in $1 and print the result to stdout.
# URL components are percent-encoded, but libpq takes PGUSER/PGPASSWORD/
# PGDATABASE literally, so they have to be decoded before being exported.
# A '%' that is not followed by two hex digits is copied through unchanged.
pct_decode() {
  local rest=$1 out='' ch
  local hex_re='^%([0-9A-Fa-f]{2})'
  while [[ -n "$rest" ]]; do
    if [[ "$rest" =~ $hex_re ]]; then
      printf -v ch '%b' "\\x${BASH_REMATCH[1]}"
      out+=$ch
      rest=${rest:3}
    else
      out+=${rest:0:1}
      rest=${rest:1}
    fi
  done
  printf '%s' "$out"
}

# One anchored match for the whole URL. Password and port are optional.
# Groups: 2=user 4=password 5=host 7=port 8=dbname.
# The dbname stops at '?' or '&' so a stray query fragment never ends up in
# PGDATABASE.
DB_URL_RE='^postgres(ql)?://([^:@/]+)(:([^@]*))?@([^:/?]+)(:([0-9]+))?/([^?&]+)'
if [[ ! "$DB" =~ $DB_URL_RE ]]; then
  # Deliberately does not print the URL: it contains the password.
  log "FATAL: DATABASE_URL is not of the form postgresql://USER:PASS@HOST[:PORT]/DBNAME"
  exit 1
fi
DB_USER=$(pct_decode "${BASH_REMATCH[2]}")
DB_PASS=$(pct_decode "${BASH_REMATCH[4]-}")
DB_HOST=${BASH_REMATCH[5]}
DB_PORT=${BASH_REMATCH[7]:-5432}   # PostgreSQL default when the URL has no port
DB_NAME=$(pct_decode "${BASH_REMATCH[8]}")

export PGUSER="$DB_USER" PGPASSWORD="$DB_PASS" PGHOST="$DB_HOST" PGPORT="$DB_PORT" PGDATABASE="$DB_NAME"

# Only the exported PG* variables are needed from here on; drop every other
# copy of the credentials from the shell.
unset DATABASE_URL TOKEN DB DB_USER DB_PASS DB_HOST DB_PORT DB_NAME DB_URL_RE

# ── Sanity check files ──
# Every input must exist and be non-empty before any staging table is touched.
for f in od_firme.csv od_caen_autorizat.csv od_stare_firma.csv od_reprezentanti_legali.csv; do
  if [[ ! -s "$DATA_DIR/$f" ]]; then
    log "FATAL: $DATA_DIR/$f missing or empty"
    exit 1
  fi
done

# Name of the directory that really contains od_firme.csv (symlinks resolved),
# capped at 40 bytes.
DATASET_NAME=$(basename "$(dirname "$(readlink -f "$DATA_DIR/od_firme.csv")")" | head -c 40)
log "Dataset name (best guess): $DATASET_NAME"

# ── Stage CSVs ──
log "Truncating staging tables..."
psql -v ON_ERROR_STOP=1 -c "
  TRUNCATE TABLE
    firms.staging_onrc_firme,
    firms.staging_onrc_caen,
    firms.staging_onrc_stare,
    firms.staging_onrc_reprezentanti;
"

log "COPY od_firme.csv (683MB)..."
# NOTE(review): the command below is reproduced exactly as found. `<&1` looks
# like the residue of a here-document redirect followed by `2>&1`; the SQL fed
# to psql (and presumably the remaining COPY / UPSERT steps listed in the file
# header) is not present here. Restore it from version control before relying
# on this script - TODO confirm.
time psql -v ON_ERROR_STOP=1 <&1 | tee -a "$LOG"

log "=== ONRC import complete ==="