#!/bin/bash
# Historical financial backfill 2015-2019 from data.gov.ro / MFP.
#
# Why a separate script: 2015 and pre-2020 files have slightly different
# schemas (WEB_UU 2015 has 21 cols vs 22 for 2016+; WEB_BL_BS_SL 2015 has 23
# cols vs 22 for 2016+; WEB_INST_DE_CREDIT 2016/2017/2019 has 23 cols vs 25
# for 2024). The daily importer (import-financials.sh +
# import-financials-ong-banks.sh) assumes the 2020+ schema and silently fails
# or rejects older years. This wrapper:
#   1) Downloads the right files from data.gov.ro for the requested years.
#   2) Loads them via a session-local TEMP TABLE matched to that year's column
#      count, then INSERTs into the canonical firms.financials* tables.
#
# Usage on satra:
#   /opt/vreaudigital/services/seap-scraper/cron/import-financials-historical.sh
#   YEARS="2017 2018" /opt/...../import-financials-historical.sh   # subset
#
# Idempotent — PK (cui, year) + ON CONFLICT DO UPDATE.
#
# Banks: 2015 and 2018 have no Inst_de_credit file at data.gov.ro. Banks for
# 2016/2017/2019 use the pre-IFRS schema (21 indicators), so this script also
# loads pre-2020 bank files into firms.financials_banks with the JSONB
# `indicators` column carrying everything; the typed columns are mapped
# best-effort (i21 instead of i23 → cifra_afaceri).
# No `-e` here. NOTE(review): presumably deliberate so that one failing
# year/file does not abort the remaining imports — confirm before adding it.
set -uo pipefail

DATA_DIR=/opt/vreaudigital/data/mfinante
LOG=/var/log/vreaudigital-fin-historical.log

# Print a timestamped message to stdout and append it to $LOG.
# Args: message
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1" | tee -a "$LOG"
}

mkdir -p "$DATA_DIR" || { log "[ERR] cannot create $DATA_DIR"; exit 1; }

# ─── Database credentials (fetched from Infisical) ───────────────────────
# The sourced file is expected to define the INFISICAL_* variables used
# below; with `set -u` a missing one aborts the script at first use.
infisical_env_file=/opt/vreaudigital/.infisical-mi
if [[ ! -r "$infisical_env_file" ]]; then
  log "[ERR] cannot read $infisical_env_file"
  exit 1
fi
# shellcheck source=/dev/null
source "$infisical_env_file"
unset infisical_env_file

TOKEN=$(infisical login --method=universal-auth --domain="$INFISICAL_API_URL" \
  --client-id="$INFISICAL_CLIENT_ID" --client-secret="$INFISICAL_CLIENT_SECRET" \
  --silent --plain)
if [[ -z "$TOKEN" ]]; then
  log "[ERR] infisical login returned no token"
  exit 1
fi

DBURL=$(infisical run --domain="$INFISICAL_API_URL" \
  --projectId="$INFISICAL_PROJECT_ID" --env="$INFISICAL_ENV" \
  --path="$INFISICAL_PATH" --silent --token="$TOKEN" \
  -- sh -c 'echo "$DATABASE_URL"')

# Split the connection URL into libpq environment variables so every psql
# call below picks them up without credentials appearing in argv.
#
# Parsed in one pass with a single regex instead of one sed call per field:
# sed prints its input unchanged when the pattern does not match, so a URL
# without an explicit port used to put the whole connection string (password
# included) into PGPORT. Anything from the first `?` on (e.g. `?schema=...`)
# is ignored, so extra query parameters can no longer leak into the
# database name. The URL itself is never logged because it holds the password.
db_url_re='^postgres(ql)?://([^:@/]+):([^@]+)@([^:/@]+)(:([0-9]+))?/([^?]+)'
if [[ "$DBURL" =~ $db_url_re ]]; then
  export PGUSER="${BASH_REMATCH[2]}"
  export PGPASSWORD="${BASH_REMATCH[3]}"
  export PGHOST="${BASH_REMATCH[4]}"
  export PGPORT="${BASH_REMATCH[6]:-5432}"   # port is optional in the URL
  export PGDATABASE="${BASH_REMATCH[7]}"
else
  unset DBURL TOKEN
  log "[ERR] DATABASE_URL is empty or not a postgresql:// URL"
  exit 1
fi
unset DBURL TOKEN db_url_re

YEARS="${YEARS:-2015 2016 2017 2018 2019}"
log "=== Historical financial import started (YEARS=$YEARS) ==="

# Discover a download URL from a data.gov.ro slug by filename regex.
# Args: slug pattern (pattern is a Python regex matched on resource name,
#       case-insensitively)
# Outputs: the URL of the first matching resource on stdout; nothing when no
#          resource matches.
# Returns: non-zero when the API call fails or the response is not JSON
#          (pipefail propagates either side of the pipe).
discover() {
  local slug="$1"
  local pattern="$2"
  # The pattern is handed to Python as argv[1] rather than spliced into the
  # program text, so quotes or backslashes in it cannot break the script.
  # `or` fallbacks cover JSON nulls (CKAN resources may have a null name).
  curl -fsSL --max-time 30 \
      "https://data.gov.ro/api/3/action/package_show?id=$slug" 2>/dev/null \
    | python3 -c '
import json, re, sys

try:
    doc = json.load(sys.stdin)
except ValueError:
    sys.exit(1)  # empty or non-JSON response, e.g. the curl call failed
name_re = re.compile(sys.argv[1], re.I)
for res in (doc.get("result") or {}).get("resources") or []:
    if name_re.search(res.get("name") or ""):
        print(res.get("url") or "")
        break
' "$pattern"
}

# Download a file from data.gov.ro if not already present.
# Args: local_path url fetch() { local file="$1" local url="$2" if [ -s "$file" ]; then log " [SKIP] $file already exists ($(stat -c%s "$file") bytes)" return 0 fi if [ -z "$url" ]; then log " [ERR] No URL for $file" return 1 fi log " Downloading $url → $file" curl -fsL --max-time 300 -o "$file" "$url" || { log " [ERR] download failed"; rm -f "$file"; return 1; } log " OK $(stat -c%s "$file") bytes" } # ─── WEB_UU (companies, prescurtat) ────────────────────────────────────── import_uu() { local year="$1" local file="$DATA_DIR/web_uu_${year}.txt" local slug="situatii_financiare_${year}" local pattern url ncols case "$year" in 2015) pattern="^web_uu.*${year}\\.txt$"; ncols=21 ;; *) pattern="^web_uu.*${year}\\.txt$"; ncols=22 ;; esac if [ ! -s "$file" ]; then url=$(discover "$slug" "$pattern") fetch "$file" "$url" || return 1 fi log "[$year/WEB_UU] COPY $file ($(stat -c%s "$file") bytes, $ncols cols)..." if [ "$ncols" -eq 22 ]; then # Standard schema (2016+): CUI,CAEN,I1..I20. I20 = salariati. psql -v ON_ERROR_STOP=1 -c "TRUNCATE TABLE firms.staging_financials;" psql -v ON_ERROR_STOP=1 < that's i45 j = n - 2 while j >= 0 and not NUM_RE.match(parts[j]): j -= 1 i45_idx = j # den_caen spans (i45_idx+1 .. i46_idx-1) → join those # continue back to find i44 j -= 1 while j >= 0 and not NUM_RE.match(parts[j]): j -= 1 i44_idx = j if i44_idx < 0 or i45_idx < 0: # malformed row — skip continue # Reassemble: parts[0..i44_idx] + parts[i45_idx] + parts[i46_idx] new_parts = parts[:i44_idx+1] + [parts[i45_idx]] + [parts[i46_idx]] if len(new_parts) != 49: # row doesn't fit expected 49-col output → skip continue out.write(','.join(new_parts) + '\n') out.close() PYEOF log "[$year/WEB_ONG] Cleaned $(wc -l < "$cleaned") lines (incl. header)" psql -v ON_ERROR_STOP=1 <&1 | tee -a "$LOG" log "=== Historical import done ==="