vreau-digital/services/seap-scraper/cron/import-financials-historical.sh

#!/bin/bash
# Historical financial backfill 2015-2019 from data.gov.ro / MFP.
#
# Why a separate script: 2015 and pre-2020 files have slightly different
# schemas (WEB_UU 2015 has 21 cols vs 22 for 2016+; WEB_BL_BS_SL 2015 has 23
# cols vs 22 for 2016+; WEB_INST_DE_CREDIT 2016/2017/2019 has 23 cols vs 25
# for 2024). The daily importer (import-financials.sh +
# import-financials-ong-banks.sh) assumes the 2020+ schema and silently fails
# or rejects older years. This wrapper:
#   1) Downloads the right files from data.gov.ro for the requested years.
#   2) Loads them via the firms.staging_* tables or, where the column count
#      differs (2015 WEB_BL_BS_SL, bank files), a session-local TEMP TABLE
#      matched to that file, then INSERTs into the canonical
#      firms.financials* tables.
#
# Usage on satra:
#   /opt/vreaudigital/services/seap-scraper/cron/import-financials-historical.sh
#   YEARS="2017 2018" /opt/...../import-financials-historical.sh   # subset
#
# Idempotent — PK (cui, year) + ON CONFLICT DO UPDATE.
#
# Banks: 2015 and 2018 have no Inst_de_credit file at data.gov.ro. Banks for
# 2016/2017/2019 use the pre-IFRS schema (21 indicators), so this script also
# loads pre-2020 bank files into firms.financials_banks with the JSONB
# `indicators` column carrying everything; the typed columns are mapped
# best-effort (i21 instead of i23 → cifra_afaceri).

# `set -e` is not enabled: the import_* callers further down log per-year
# failures themselves, and the hard prerequisites in this section exit
# explicitly when they cannot be satisfied.
set -uo pipefail

DATA_DIR=/opt/vreaudigital/data/mfinante
LOG=/var/log/vreaudigital-fin-historical.log

# log MESSAGE — print a timestamped line to stdout and append it to $LOG.
log() { printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1" | tee -a "$LOG"; }

mkdir -p "$DATA_DIR" || { log "[ERR] cannot create $DATA_DIR"; exit 1; }

# Provides the INFISICAL_* variables referenced below.
# shellcheck source=/dev/null
source /opt/vreaudigital/.infisical-mi \
  || { log "[ERR] cannot source /opt/vreaudigital/.infisical-mi"; exit 1; }

# Assignment and command substitution are kept separate from `export` so a
# failing infisical call is actually detected.
TOKEN=$(infisical login --method=universal-auth --domain="$INFISICAL_API_URL" \
  --client-id="$INFISICAL_CLIENT_ID" --client-secret="$INFISICAL_CLIENT_SECRET" \
  --silent --plain) \
  || { log "[ERR] infisical login failed"; exit 1; }
DBURL=$(infisical run --domain="$INFISICAL_API_URL" \
  --projectId="$INFISICAL_PROJECT_ID" --env="$INFISICAL_ENV" \
  --path="$INFISICAL_PATH" --silent --token="$TOKEN" \
  -- sh -c 'echo "$DATABASE_URL"') \
  || { log "[ERR] could not read DATABASE_URL from infisical"; exit 1; }

# Split postgresql://user:password@host[:port]/database[?params] into the
# libpq environment variables psql reads. Everything after '?' (for example
# a schema= parameter) is ignored. A URL without an explicit port falls back
# to 5432 instead of leaking the whole URL into PGPORT.
db_url_re='^postgres(ql)?://([^:@/]+):([^@]+)@([^:/?]+)(:([0-9]+))?/([^?]+)'
if [[ "$DBURL" =~ $db_url_re ]]; then
  export PGUSER="${BASH_REMATCH[2]}"
  export PGPASSWORD="${BASH_REMATCH[3]}"
  export PGHOST="${BASH_REMATCH[4]}"
  export PGPORT="${BASH_REMATCH[6]:-5432}"
  export PGDATABASE="${BASH_REMATCH[7]}"
else
  # Never echo the URL itself: it carries the password.
  log "[ERR] DATABASE_URL is empty or not a postgresql:// URL"
  exit 1
fi
unset DBURL TOKEN db_url_re

YEARS="${YEARS:-2015 2016 2017 2018 2019}"

log "=== Historical financial import started (YEARS=$YEARS) ==="

# discover SLUG PATTERN
#   Queries the data.gov.ro package_show API for dataset SLUG and prints the
#   URL of the first resource whose name matches PATTERN (a Python regex,
#   matched case-insensitively with re.search). Prints nothing when no
#   resource matches.
#   Returns: non-zero when the reply is empty or not JSON (for example
#   because curl failed) or when PATTERN is not a valid regex.
discover() {
  local slug="$1"
  local pattern="$2"
  # PATTERN is handed to Python as argv[1] rather than spliced into the
  # program text, so quotes or backslashes in it cannot change the code.
  curl -fsSL --max-time 30 "https://data.gov.ro/api/3/action/package_show?id=$slug" 2>/dev/null \
    | python3 -c '
import json, re, sys
pat = re.compile(sys.argv[1], re.I)
try:
    d = json.load(sys.stdin)
except ValueError:
    sys.exit(1)
# "or" fallbacks tolerate JSON nulls for result/resources/name/url.
for r in (d.get("result") or {}).get("resources") or []:
    if pat.search(r.get("name") or ""):
        print(r.get("url") or "")
        break
' "$pattern"
}

# fetch LOCAL_PATH URL
#   Downloads URL to LOCAL_PATH unless LOCAL_PATH already exists and is
#   non-empty. The transfer goes to LOCAL_PATH.part and is renamed only after
#   it completed with a non-empty result, so an interrupted run cannot leave
#   a truncated file that a later run would skip as "already exists".
#   Returns: 0 when the file is present afterwards; 1 when URL is empty, the
#   download fails, the download is empty, or the rename fails.
fetch() {
  local file="$1"
  local url="$2"
  local part="${file}.part"
  if [ -s "$file" ]; then
    log "  [SKIP] $file already exists ($(stat -c%s "$file") bytes)"
    return 0
  fi
  if [ -z "$url" ]; then
    log "  [ERR] No URL for $file"
    return 1
  fi
  log "  Downloading $url → $file"
  if ! curl -fsL --max-time 300 -o "$part" "$url"; then
    log "  [ERR] download failed"
    rm -f -- "$part"
    return 1
  fi
  if [ ! -s "$part" ]; then
    log "  [ERR] download produced an empty file"
    rm -f -- "$part"
    return 1
  fi
  if ! mv -f -- "$part" "$file"; then
    log "  [ERR] cannot move $part to $file"
    rm -f -- "$part"
    return 1
  fi
  log "  OK $(stat -c%s "$file") bytes"
  return 0
}

# ─── WEB_UU (companies, prescurtat) ──────────────────────────────────────
# import_uu YEAR
#   Loads $DATA_DIR/web_uu_YEAR.txt (downloading it first when missing or
#   empty) into firms.staging_financials, then upserts firms.financials.
#   Globals: DATA_DIR (read). Calls: discover, fetch, log, psql.
#   Returns: 0 on success; non-zero when the download, TRUNCATE, COPY or
#   UPSERT step fails. Each step is checked, so the UPSERT never runs against
#   rows left in staging by an earlier year or category.
import_uu() {
  local year="$1"
  local file="$DATA_DIR/web_uu_${year}.txt"
  local slug="situatii_financiare_${year}"
  local pattern="^web_uu.*${year}\\.txt$"
  local url ncols copy_cols select_vals upsert_note

  case "$year" in
    2015) ncols=21 ;;
    *)    ncols=22 ;;
  esac

  if [ ! -s "$file" ]; then
    url=$(discover "$slug" "$pattern")
    fetch "$file" "$url" || return 1
  fi

  copy_cols="cui, caen, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15, i16, i17, i18, i19"
  if [ "$ncols" -eq 22 ]; then
    # Standard schema (2016+): CUI,CAEN,I1..I20. I20 = salariati.
    copy_cols="$copy_cols, i20"
    upsert_note=""
    select_vals="i1, i2, i3, i4, i5, i6, i7, i8, i9,
  i10, i11, i12, i13, i14, i15, i16, i17, i18, i19,
  CASE WHEN i20 BETWEEN 0 AND 100000000 THEN i20::bigint ELSE NULL END"
  else
    # 2015 schema (21 cols, CUI,CAEN,I1..I19). The pre-2016 reporting
    # ordering omits the modern I12 (patrimoniul_regiei) column entirely
    # and shifts everything from cifra_afaceri onward one position left:
    #   2015 I12 ↔ modern I13 (cifra_afaceri)
    #   2015 I13 ↔ modern I14 (venituri_total)
    #   ...
    #   2015 I18 ↔ modern I19 (pierdere_neta)
    #   2015 I19 ↔ modern I20 (numar_salariati)
    # Verified by matching cifra_afaceri / salariati to a stable CUI's
    # 2016-2024 series. Without this remap, salariati was being ingested
    # as pierdere_neta and cifra_afaceri was off by one column.
    upsert_note=" (2015 left-shift remap)"
    select_vals="i1, i2, i3, i4, i5, i6, i7, i8, i9,
  i10, i11,
  NULL::numeric(20,2),
  i12, i13, i14, i15, i16, i17, i18,
  CASE WHEN i19 BETWEEN 0 AND 100000000 THEN i19::bigint ELSE NULL END"
  fi

  log "[$year/WEB_UU] COPY $file ($(stat -c%s "$file") bytes, $ncols cols)..."
  psql -v ON_ERROR_STOP=1 -c "TRUNCATE TABLE firms.staging_financials;" || return 1
  psql -v ON_ERROR_STOP=1 <<COPYEOF || return 1
\\copy firms.staging_financials ($copy_cols) FROM '$file' WITH (FORMAT csv, DELIMITER ',', HEADER true, NULL '');
COPYEOF

  log "[$year/WEB_UU] UPSERT${upsert_note}..."
  # NOTE(review): on conflict only source and caen are touched; the indicator
  # columns of an existing (cui, year) row are left as they are, so a re-run
  # does not correct previously loaded values. Confirm this is intended.
  psql -v ON_ERROR_STOP=1 <<SQL
INSERT INTO firms.financials (
  cui, year, caen,
  active_imobilizate, active_circulante, stocuri, creante, casa_banci,
  cheltuieli_avans, datorii, venituri_avans, provizioane,
  capitaluri_total, capital_subscris, patrimoniul_regiei,
  cifra_afaceri, venituri_total, cheltuieli_total,
  profit_brut, pierdere_bruta, profit_net, pierdere_neta,
  numar_salariati, source
)
SELECT DISTINCT ON (cui)
  cui, $year, caen,
  $select_vals,
  'mfinante:WEB_UU'
FROM firms.staging_financials
WHERE cui IS NOT NULL AND cui != '' AND cui != '0'
ORDER BY cui
ON CONFLICT (cui, year) DO UPDATE SET
  source = CASE
    WHEN firms.financials.source = 'mfinante:WEB_UU' THEN firms.financials.source
    ELSE EXCLUDED.source
  END,
  caen = CASE
    WHEN firms.financials.source = 'mfinante:WEB_UU' THEN firms.financials.caen
    ELSE EXCLUDED.caen
  END;
SQL
}

# ─── WEB_BL_BS_SL ────────────────────────────────────────────────────────
# import_bl YEAR
#   Loads $DATA_DIR/web_bl_bs_sl_YEAR.txt (downloading it first when missing
#   or empty) and upserts firms.financials. 2016+ files (22 cols) go through
#   firms.staging_financials; the 2015 file (23 cols) goes through a
#   session-local TEMP TABLE inside a single psql session.
#   Globals: DATA_DIR (read). Calls: discover, fetch, log, psql.
#   Returns: 0 on success; non-zero when the download or any SQL step fails.
#   Each step is checked, so the UPSERT never runs against stale staging rows.
import_bl() {
  local year="$1"
  local file="$DATA_DIR/web_bl_bs_sl_${year}.txt"
  local slug="situatii_financiare_${year}"
  local pattern url ncols insert_head on_conflict
  pattern="^web_bl_bs_sl.*${year}\\.txt$"
  case "$year" in
    2015) ncols=23 ;;  # has extra I21
    *)    ncols=22 ;;
  esac
  if [ ! -s "$file" ]; then
    url=$(discover "$slug" "$pattern")
    fetch "$file" "$url" || return 1
  fi

  # Shared by both schema branches.
  insert_head="INSERT INTO firms.financials (
  cui, year, caen,
  active_imobilizate, active_circulante, stocuri, creante, casa_banci,
  cheltuieli_avans, datorii, venituri_avans, provizioane,
  capitaluri_total, capital_subscris, patrimoniul_regiei,
  cifra_afaceri, venituri_total, cheltuieli_total,
  profit_brut, pierdere_bruta, profit_net, pierdere_neta,
  numar_salariati, source
)"
  # NOTE(review): on conflict only source and caen are touched (and kept
  # as-is when the existing row came from WEB_UU); indicator columns of an
  # existing (cui, year) row are never rewritten. Confirm this is intended.
  on_conflict="ON CONFLICT (cui, year) DO UPDATE SET
  source = CASE
    WHEN firms.financials.source = 'mfinante:WEB_UU' THEN firms.financials.source
    ELSE EXCLUDED.source
  END,
  caen = CASE
    WHEN firms.financials.source = 'mfinante:WEB_UU' THEN firms.financials.caen
    ELSE EXCLUDED.caen
  END"

  log "[$year/WEB_BL_BS_SL] COPY $file ($(stat -c%s "$file") bytes, $ncols cols)..."
  if [ "$ncols" -eq 22 ]; then
    psql -v ON_ERROR_STOP=1 -c "TRUNCATE TABLE firms.staging_financials;" || return 1
    psql -v ON_ERROR_STOP=1 <<COPYEOF || return 1
\\copy firms.staging_financials (cui, caen, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15, i16, i17, i18, i19, i20) FROM '$file' WITH (FORMAT csv, DELIMITER ',', HEADER true, NULL '');
COPYEOF
    log "[$year/WEB_BL_BS_SL] UPSERT..."
    psql -v ON_ERROR_STOP=1 <<SQL
$insert_head
SELECT DISTINCT ON (cui)
  cui, $year, caen,
  i1, i2, i3, i4, i5, i6, i7, i8, i9,
  i10, i11, i12, i13, i14, i15, i16, i17, i18, i19,
  CASE WHEN i20 BETWEEN 0 AND 100000000 THEN i20::bigint ELSE NULL END,
  'mfinante:WEB_BL_BS_SL'
FROM firms.staging_financials
WHERE cui IS NOT NULL AND cui != '' AND cui != '0'
ORDER BY cui
$on_conflict;
SQL
  else
    # 2015 BL_BS_SL schema (23 cols, CUI,CAEN,I1..I21). The pre-2016 BL
    # reporting has an extra (unknown) field somewhere between
    # capital_subscris (I11) and cifra_afaceri. Empirically (cross-checked
    # CUI 538310 against 2016-2024 series): cifra_afaceri lives at I14
    # (not I13), salariati at I21. Treat I12,I13 as patrimoniul_regiei +
    # an unmapped field (likely related to regii autonome / provizioane
    # detail); both empty for typical SRLs. Map:
    #   2015 BL I1..I11 = modern I1..I11
    #   2015 BL I12 → patrimoniul_regiei (modern I12)
    #   2015 BL I13 → dropped (unknown)
    #   2015 BL I14 → cifra_afaceri (modern I13)
    #   2015 BL I15..I20 → modern I14..I19
    #   2015 BL I21 → numar_salariati (modern I20)
    # One psql session: ON_ERROR_STOP aborts at the first failing statement
    # and the function returns psql's exit status.
    psql -v ON_ERROR_STOP=1 <<COPYEOF
CREATE TEMP TABLE tmp_bl23 (
  cui text, caen text,
  i1 numeric(20,2), i2 numeric(20,2), i3 numeric(20,2), i4 numeric(20,2),
  i5 numeric(20,2), i6 numeric(20,2), i7 numeric(20,2), i8 numeric(20,2),
  i9 numeric(20,2), i10 numeric(20,2), i11 numeric(20,2), i12 numeric(20,2),
  i13 numeric(20,2), i14 numeric(20,2), i15 numeric(20,2), i16 numeric(20,2),
  i17 numeric(20,2), i18 numeric(20,2), i19 numeric(20,2), i20 numeric(20,2),
  i21 numeric(20,2)
);   -- session-scoped; dropped when psql exits
\\copy tmp_bl23 FROM '$file' WITH (FORMAT csv, DELIMITER ',', HEADER true, NULL '');
$insert_head
SELECT DISTINCT ON (cui)
  cui, $year, caen,
  i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11,
  i12,                                  -- patrimoniul_regiei
  i14, i15, i16, i17, i18, i19, i20,    -- cifra_afaceri..pierdere_neta
  CASE WHEN i21 BETWEEN 0 AND 100000000 THEN i21::bigint ELSE NULL END,
  'mfinante:WEB_BL_BS_SL'
FROM tmp_bl23
WHERE cui IS NOT NULL AND cui != '' AND cui != '0'
ORDER BY cui
$on_conflict;
COPYEOF
  fi
}

# ─── WEB_ONG (49 cols; the 2018 file has 51) ─────────────────────────────
# import_ong YEAR
#   Loads $DATA_DIR/web_ong_YEAR.txt (downloading it first when missing or
#   empty) into firms.staging_ong, then upserts firms.financials_ong.
#   A 51-column header is first reduced to 49 columns by an embedded Python
#   filter written to FILE.cleaned49, which is removed again afterwards.
#   Globals: DATA_DIR (read). Calls: discover, fetch, log, psql, python3.
#   Returns: 0 on success or when the header has an unexpected column count
#   (logged and skipped); non-zero when the download, preprocessing,
#   TRUNCATE, COPY or UPSERT step fails.
import_ong() {
  local year="$1"
  local file="$DATA_DIR/web_ong_${year}.txt"
  local slug="situatii_financiare_${year}"
  local url header_cols copy_src rc n
  local cleaned=""
  local ong_cols="cui, caen, caeno"
  local ind_pairs=""

  if [ ! -s "$file" ]; then
    url=$(discover "$slug" "^web_ong.*${year}\\.txt$")
    fetch "$file" "$url" || return 1
  fi

  # Column list i1..i46 for \copy and the matching key/value pairs for the
  # JSONB indicators object.
  for ((n = 1; n <= 46; n++)); do
    ong_cols+=", i${n}"
    ind_pairs+="${ind_pairs:+, }'i${n}', NULLIF(i${n}, '')"
  done

  header_cols=$(head -n 1 "$file" | tr ',' '\n' | wc -l)
  log "[$year/WEB_ONG] COPY $file ($(stat -c%s "$file") bytes, $header_cols cols)..."

  if [ "$header_cols" -eq 49 ]; then
    copy_src="$file"
  elif [ "$header_cols" -eq 51 ]; then
    # 2018 schema: ...,I44,DEN_CAENO,I45,DEN_CAEN,I46  (extra UNQUOTED text
    # columns whose contents contain commas — breaks naive CSV parsing).
    # Preprocess into a 49-col file by walking backwards from end to identify
    # the two text columns (variable comma count).
    cleaned="${file}.cleaned49"
    copy_src="$cleaned"
    log "[$year/WEB_ONG] Preprocessing 51→49 cols (stripping DEN_CAEN/DEN_CAENO)..."
    if ! python3 - "$file" "$cleaned" <<'PYEOF'
import re
import sys

src, dst = sys.argv[1], sys.argv[2]
# A field counts as an indicator when it is a plain number or empty.
NUM_RE = re.compile(r'^-?\d+(\.\d+)?$|^$')


def prev_numeric(parts, start):
    """Index of the nearest numeric-or-empty field at or before start, or < 0."""
    j = start
    while j >= 0 and not NUM_RE.match(parts[j]):
        j -= 1
    return j


skipped = 0
with open(src) as fh, open(dst, 'w') as out:
    header = fh.readline().rstrip('\n').split(',')
    kept = [h for h in header if h.upper() not in ('DEN_CAEN', 'DEN_CAENO')]
    out.write(','.join(kept) + '\n')
    for line in fh:
        parts = line.rstrip('\n').split(',')
        # Layout from the right: I46, DEN_CAEN (text, any number of commas),
        # I45, DEN_CAENO (text, any number of commas), I44, ...
        i46_idx = len(parts) - 1
        i45_idx = prev_numeric(parts, i46_idx - 1)
        i44_idx = prev_numeric(parts, i45_idx - 1)
        if i44_idx < 0 or i45_idx < 0:
            skipped += 1          # malformed row
            continue
        new_parts = parts[:i44_idx + 1] + [parts[i45_idx], parts[i46_idx]]
        if len(new_parts) != 49:
            skipped += 1          # does not reduce to the 49-col layout
            continue
        out.write(','.join(new_parts) + '\n')
if skipped:
    sys.stderr.write('WEB_ONG preprocessing: skipped %d unparseable row(s)\n' % skipped)
PYEOF
    then
      log "[$year/WEB_ONG] preprocessing failed"
      rm -f -- "$cleaned"
      return 1
    fi
    log "[$year/WEB_ONG] Cleaned $(wc -l < "$cleaned") lines (incl. header)"
  else
    log "[$year/WEB_ONG] unexpected col count $header_cols, skipping"
    return 0
  fi

  if ! psql -v ON_ERROR_STOP=1 -c "TRUNCATE TABLE firms.staging_ong;"; then
    [ -n "$cleaned" ] && rm -f -- "$cleaned"
    return 1
  fi
  psql -v ON_ERROR_STOP=1 <<COPYEOF
\\copy firms.staging_ong ($ong_cols) FROM '$copy_src' WITH (FORMAT csv, DELIMITER ',', HEADER true, NULL '');
COPYEOF
  rc=$?
  # The cleaned copy is only an intermediate; drop it whether COPY worked or not.
  [ -n "$cleaned" ] && rm -f -- "$cleaned"
  [ "$rc" -eq 0 ] || return 1

  log "[$year/WEB_ONG] UPSERT..."
  psql -v ON_ERROR_STOP=1 <<SQL
INSERT INTO firms.financials_ong (
  cui, year, caen, caeno,
  capitaluri_proprii, venituri_total, cheltuieli_total, excedent,
  personal_neeconomic, personal_economic, indicators
)
SELECT DISTINCT ON (cui)
  cui, $year, caen, caeno,
  NULLIF(i12, '')::numeric(20,2),
  NULLIF(i38, '')::numeric(20,2),
  NULLIF(i40, '')::numeric(20,2),
  NULLIF(i42, '')::numeric(20,2),
  CASE WHEN NULLIF(i45, '') ~ '^[0-9]+\$' AND NULLIF(i45, '')::bigint BETWEEN 0 AND 100000000 THEN i45::bigint ELSE NULL END,
  CASE WHEN NULLIF(i46, '') ~ '^[0-9]+\$' AND NULLIF(i46, '')::bigint BETWEEN 0 AND 100000000 THEN i46::bigint ELSE NULL END,
  jsonb_strip_nulls(jsonb_build_object(
    $ind_pairs
  ))
FROM firms.staging_ong
WHERE cui IS NOT NULL AND cui != '' AND cui != '0'
ORDER BY cui
ON CONFLICT (cui, year) DO UPDATE SET
  caen = EXCLUDED.caen,
  caeno = EXCLUDED.caeno,
  capitaluri_proprii = EXCLUDED.capitaluri_proprii,
  venituri_total = EXCLUDED.venituri_total,
  cheltuieli_total = EXCLUDED.cheltuieli_total,
  excedent = EXCLUDED.excedent,
  personal_neeconomic = EXCLUDED.personal_neeconomic,
  personal_economic = EXCLUDED.personal_economic,
  indicators = EXCLUDED.indicators,
  fetched_at = now();
SQL
}

# ─── WEB_INST_DE_CREDIT (banks) — pre-IFRS schemas vary by year ─────────
# 2015: not published. 2016/2017/2019: 23 cols (I1..I21). 2018: not published.
# 2020/2021/2022: 23 cols (I21). 2023: 24 cols (I22). 2024: 25 cols (I23).
#
# import_bank YEAR
#   Loads $DATA_DIR/web_inst_de_credit_YEAR.txt (downloading it first when
#   missing or empty) through a TEMP TABLE sized from the file's header and
#   upserts firms.financials_banks, all in one psql session.
#   Globals: DATA_DIR (read). Calls: discover, fetch, log, psql.
#   Returns: 0 on success, when the dataset lists no matching file, or when
#   the header has an unsupported column count (both logged and skipped);
#   non-zero when the dataset lookup, download or psql session fails.
import_bank() {
  local year="$1"
  local file="$DATA_DIR/web_inst_de_credit_${year}.txt"
  local slug="situatii_financiare_${year}"
  case "$year" in
    2020) slug="situatii_financiare_2021" ;;
    2023) slug="situatii_financiare2023" ;;
  esac
  local url
  if [ ! -s "$file" ]; then
    # A failed lookup is an error; only a successful lookup with no matching
    # resource means "not published".
    if ! url=$(discover "$slug" "^web_(inst|instit)_de_credit.*${year}\\.txt$"); then
      log "[$year/BANK] dataset lookup failed"
      return 1
    fi
    if [ -z "$url" ]; then log "[$year/BANK] no file in dataset, skip"; return 0; fi
    fetch "$file" "$url" || return 1
  fi
  # Detect column count from header line.
  local header_cols
  header_cols=$(head -n 1 "$file" | tr ',' '\n' | wc -l)
  log "[$year/BANK] $file ($(stat -c%s "$file") bytes, $header_cols cols)"
  # Build a TEMP table sized to the file, then map to firms.financials_banks.
  # The "cifra_afaceri" mapping: in IFRS 2024 schema (25 cols) it's i23. In
  # older 23-col schema it's i21. In 24-col schema (2023) it's i22.
  local ind_n cifra_col profit_inainte_col profit_exerc_col capital_col activ_col
  local cols_def cols_list ind_pairs
  local i   # keep the loop counter out of the caller's scope
  ind_n=$(( header_cols - 2 ))   # i1..iN
  case "$ind_n" in
    21) cifra_col=i21; profit_inainte_col=i17; profit_exerc_col=i20; capital_col=i14; activ_col=i6 ;;
    22) cifra_col=i22; profit_inainte_col=i18; profit_exerc_col=i21; capital_col=i14; activ_col=i6 ;;
    23) cifra_col=i23; profit_inainte_col=i19; profit_exerc_col=i22; capital_col=i14; activ_col=i6 ;;
    *)  log "[$year/BANK] unexpected indicator count $ind_n, skipping"; return 0 ;;
  esac
  # Build dynamic column list for TEMP table and \copy, plus the key/value
  # pairs for the JSONB indicators object.
  cols_def="cui text, caen text"
  cols_list="cui, caen"
  ind_pairs=""
  for ((i = 1; i <= ind_n; i++)); do
    cols_def+=", i${i} text"
    cols_list+=", i${i}"
    ind_pairs+="${ind_pairs:+, }'i${i}', NULLIF(i${i}, '')"
  done
  psql -v ON_ERROR_STOP=1 <<COPYEOF
CREATE TEMP TABLE tmp_bank (
  $cols_def
);   -- session-scoped; dropped when psql exits
\\copy tmp_bank ($cols_list) FROM '$file' WITH (FORMAT csv, DELIMITER ',', HEADER true, NULL '');
INSERT INTO firms.financials_banks (
  cui, year, caen,
  active_financiare_amortiz, capital_social, profit_exercitiu,
  profit_inainte_impozit, cifra_afaceri, indicators, source
)
SELECT DISTINCT ON (cui)
  cui, $year, caen,
  NULLIF($activ_col, '')::numeric(20,2),
  NULLIF($capital_col, '')::numeric(20,2),
  NULLIF($profit_exerc_col, '')::numeric(20,2),
  NULLIF($profit_inainte_col, '')::numeric(20,2),
  NULLIF($cifra_col, '')::numeric(20,2),
  jsonb_strip_nulls(jsonb_build_object($ind_pairs)),
  'mfinante:WEB_Inst_de_credit'
FROM tmp_bank
WHERE cui IS NOT NULL AND cui != '' AND cui != '0'
ORDER BY cui
ON CONFLICT (cui, year) DO UPDATE SET
  caen = EXCLUDED.caen,
  active_financiare_amortiz = EXCLUDED.active_financiare_amortiz,
  capital_social = EXCLUDED.capital_social,
  profit_exercitiu = EXCLUDED.profit_exercitiu,
  profit_inainte_impozit = EXCLUDED.profit_inainte_impozit,
  cifra_afaceri = EXCLUDED.cifra_afaceri,
  indicators = EXCLUDED.indicators,
  source = EXCLUDED.source,
  fetched_at = now();
COPYEOF
}

# CATEGORIES env var filters which sub-imports run. Default = all.
# Useful: CATEGORIES="bank" to skip companies and only redo banks.
CATEGORIES="${CATEGORIES:-uu bl ong bank}"

# Number of problems logged below; reported at the end. The script's exit
# status is not derived from it.
failures=0

# YEARS and CATEGORIES are whitespace-separated lists, hence the unquoted
# expansions.
# shellcheck disable=SC2086
for YEAR in $YEARS; do
  # YEAR is interpolated into SQL, file names and regexes by the import_*
  # functions, so anything that is not a plain 4-digit year is rejected here.
  if [[ ! "$YEAR" =~ ^[0-9]{4}$ ]]; then
    log "[ERR] ignoring invalid year '$YEAR' (expected 4 digits)"
    failures=$((failures + 1))
    continue
  fi
  log "── Year $YEAR ──────────────────────────────"
  for CAT in $CATEGORIES; do
    case "$CAT" in
      uu)   import_uu   "$YEAR" || { log "[$YEAR/WEB_UU] failed"; failures=$((failures + 1)); } ;;
      bl)   import_bl   "$YEAR" || { log "[$YEAR/WEB_BL_BS_SL] failed"; failures=$((failures + 1)); } ;;
      ong)  import_ong  "$YEAR" || { log "[$YEAR/WEB_ONG] failed"; failures=$((failures + 1)); } ;;
      bank) import_bank "$YEAR" || { log "[$YEAR/BANK] failed"; failures=$((failures + 1)); } ;;
      *)    log "[$YEAR] unknown category '$CAT' (expected: uu bl ong bank), skipping"
            failures=$((failures + 1)) ;;
    esac
  done
done

log "=== Refreshing latest-year MV ==="
# Best-effort: a failed refresh is logged but does not stop the coverage report.
psql -v ON_ERROR_STOP=1 -c "REFRESH MATERIALIZED VIEW firms.mv_financials_latest;" \
  || { log "[WARN] REFRESH MATERIALIZED VIEW firms.mv_financials_latest failed"; failures=$((failures + 1)); }

log "=== Final coverage ==="
psql -c "
SELECT 'fin'  AS tbl, year, COUNT(*) AS n FROM firms.financials       GROUP BY year
UNION ALL
SELECT 'ong'  AS tbl, year, COUNT(*) AS n FROM firms.financials_ong   GROUP BY year
UNION ALL
SELECT 'bank' AS tbl, year, COUNT(*) AS n FROM firms.financials_banks GROUP BY year
ORDER BY tbl, year;
" 2>&1 | tee -a "$LOG"

if [ "$failures" -gt 0 ]; then
  log "[WARN] $failures step(s) failed or were skipped as invalid; see messages above"
fi
log "=== Historical import done ==="