Files
vreau-digital/services/seap-scraper/cron/import-financials-historical.sh
T
Claude VM a6c03a091e initial: split from gov-agreg — vreau.digital standalone platform
Moved from gov-agreg/src/pages/achizitii/* to root (drop prefix).
- 22 pages migrated, 127 files total
- All internal links: /achizitii/X → /X (176 occurrences fixed)
- AchizitiiLayout subnav rewritten: /X paths, top-right link to vreaudigital.ro hub
- BaseLayout new (vreau.digital branding, OG tags, site URL)
- astro.config.mjs: site https://vreau.digital, server output (was static)
- docker-compose: port 5096 (vreaudigital is 5095), container vreau-digital
- deploy.sh: paths /opt/vreau-digital, log /var/log/vreau-digital-deploy.log

Backend shared with gov-agreg:
- PostgreSQL satra (same schemas: seap, firms, anaf, anre, ...)
- Photon, Martin tiles
- Infisical /vreaudigital path (DATABASE_URL etc. shared)

build: PASS (npx astro check 0 errors, npm run build 5s vite + 10s server)
2026-05-13 00:10:32 +03:00

527 lines
22 KiB
Bash

#!/bin/bash
# Historical financial backfill 2015-2019 from data.gov.ro / MFP.
#
# Why a separate script: 2015 and pre-2020 files have slightly different
# schemas (WEB_UU 2015 has 21 cols vs 22 for 2016+; WEB_BL_BS_SL 2015 has 23
# cols vs 22 for 2016+; WEB_INST_DE_CREDIT 2016/2017/2019 has 23 cols vs 25
# for 2024). The daily importer (import-financials.sh +
# import-financials-ong-banks.sh) assumes the 2020+ schema and silently fails
# or rejects older years. This wrapper:
# 1) Downloads the right files from data.gov.ro for the requested years.
# 2) Loads them via a session-local TEMP TABLE matched to that year's column
# count, then INSERTs into the canonical firms.financials* tables.
#
# Usage on satra:
# /opt/vreaudigital/services/seap-scraper/cron/import-financials-historical.sh
# YEARS="2017 2018" /opt/...../import-financials-historical.sh # subset
#
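# Safe to re-run — PK (cui, year) + ON CONFLICT DO UPDATE. Note that for
# firms.financials the conflict branch only touches `source`/`caen`; the
# numeric columns of an already-loaded row are left as they are. ONG and bank
# rows are fully overwritten on conflict.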
#
# Banks: 2015 and 2018 have no Inst_de_credit file at data.gov.ro. Banks for
# 2016/2017/2019 use the pre-IFRS schema (21 indicators), so this script also
# loads pre-2020 bank files into firms.financials_banks with the JSONB
# `indicators` column carrying everything; the typed columns are mapped
# best-effort (i21 instead of i23 → cifra_afaceri).
# Shell options: referencing an unset variable is an error and a pipeline
# fails when any of its stages fails. `-e` is not enabled.
set -uo pipefail

# Local cache directory for downloaded source files, and the run log.
DATA_DIR=/opt/vreaudigital/data/mfinante
LOG=/var/log/vreaudigital-fin-historical.log

# Print a timestamped message on stdout and append the same line to $LOG.
# Args: $1 - message text
log() {
  local stamp
  stamp=$(date '+%Y-%m-%d %H:%M:%S')
  printf '[%s] %s\n' "$stamp" "$1" | tee -a "$LOG"
}

mkdir -p "$DATA_DIR"
# Load the Infisical machine-identity settings (INFISICAL_* variables), log in
# with universal auth, and read DATABASE_URL from the configured secret path.
# shellcheck source=/dev/null
source /opt/vreaudigital/.infisical-mi
TOKEN=$(infisical login --method=universal-auth --domain="$INFISICAL_API_URL" \
  --client-id="$INFISICAL_CLIENT_ID" --client-secret="$INFISICAL_CLIENT_SECRET" \
  --silent --plain)
DBURL=$(infisical run --domain="$INFISICAL_API_URL" \
  --projectId="$INFISICAL_PROJECT_ID" --env="$INFISICAL_ENV" \
  --path="$INFISICAL_PATH" --silent --token="$TOKEN" \
  -- sh -c 'echo "$DATABASE_URL"')
if [ -z "$DBURL" ]; then
  # Without a connection string every psql call below would fail anyway.
  log "[FATAL] could not obtain DATABASE_URL from Infisical"
  exit 1
fi

# Drop the `schema=` query parameter, which is removed before the URL is
# turned into libpq environment variables.
DB=$(printf '%s\n' "$DBURL" | sed -E 's/[?&]schema=[^&]*//; s/\?$//')

# Split the URL into libpq environment variables in one match. The previous
# per-field sed calls returned the WHOLE url (password included) for any field
# whose pattern did not match, e.g. PGPORT when the URL carried no ":port".
# Groups: 2=user 3=password 4=host 6=port (optional) 7=database.
db_re='^postgres(ql)?://([^:@/]+):([^@]+)@([^:/?]+)(:([0-9]+))?/([^?&]+)'
if [[ "$DB" =~ $db_re ]]; then
  export PGUSER="${BASH_REMATCH[2]}"
  export PGPASSWORD="${BASH_REMATCH[3]}"
  export PGHOST="${BASH_REMATCH[4]}"
  export PGPORT="${BASH_REMATCH[6]:-5432}"
  export PGDATABASE="${BASH_REMATCH[7]}"
else
  # Deliberately does not echo the URL: it contains the password.
  log "[FATAL] DATABASE_URL is not of the form postgresql://user:pass@host[:port]/db"
  exit 1
fi
# Do not keep the token or the full URL around in the shell environment.
unset DBURL TOKEN DB db_re
# Years to process: a caller-supplied, space-separated YEARS wins; otherwise
# the full 2015-2019 range is used.
: "${YEARS:=2015 2016 2017 2018 2019}"
log "=== Historical financial import started (YEARS=$YEARS) ==="
# Discover a download URL in a data.gov.ro dataset by resource-name regex.
# Args:
#   $1 - dataset slug (passed as `id` to the package_show API action)
#   $2 - Python regex, matched case-insensitively against each resource name
# Outputs: URL of the first matching resource on stdout; nothing when no
#   resource matches or the API response cannot be fetched or parsed.
# Returns: non-zero when the request, the JSON parsing or the regex fails.
discover() {
  local slug="$1"
  local pattern="$2"
  # The pattern travels as argv[1] instead of being spliced into the Python
  # program text, so quotes inside it cannot change the program.
  curl -fsSL --max-time 30 "https://data.gov.ro/api/3/action/package_show?id=$slug" 2>/dev/null \
    | python3 -c '
import json, re, sys

try:
    doc = json.load(sys.stdin)
    pat = re.compile(sys.argv[1], re.I)
except (ValueError, re.error):
    # Empty or invalid API response, or an invalid pattern: print nothing.
    sys.exit(1)
for res in (doc.get("result") or {}).get("resources") or []:
    # A resource may carry a null name; treat it as an empty string.
    if pat.search(res.get("name") or ""):
        print(res.get("url") or "")
        break
' "$pattern"
}
# Download a file from data.gov.ro unless a non-empty copy is already cached.
# Args:
#   $1 - local destination path
#   $2 - source URL (may be empty when discovery found nothing)
# Returns: 0 when a non-empty $1 exists afterwards, 1 otherwise.
fetch() {
  local file="$1"
  local url="$2"
  local partial="${file}.part"
  if [ -s "$file" ]; then
    log " [SKIP] $file already exists ($(stat -c%s "$file") bytes)"
    return 0
  fi
  if [ -z "$url" ]; then
    log " [ERR] No URL for $file"
    return 1
  fi
  log " Downloading $url -> $file"
  # Download into a side file and rename only on success: an interrupted run
  # must never leave a truncated $file that the cache check above would later
  # accept as complete.
  if ! curl -fsL --max-time 300 -o "$partial" "$url"; then
    log " [ERR] download failed"
    rm -f "$partial" "$file"
    return 1
  fi
  if [ ! -s "$partial" ]; then
    log " [ERR] download produced an empty file"
    rm -f "$partial"
    return 1
  fi
  if ! mv -f "$partial" "$file"; then
    log " [ERR] could not move $partial into place"
    rm -f "$partial"
    return 1
  fi
  log " OK $(stat -c%s "$file") bytes"
  # Explicit status: success must not depend on whether log could write.
  return 0
}
# ─── WEB_UU (companies, prescurtat) ──────────────────────────────────────
# Load one year's WEB_UU file into firms.financials through the
# firms.staging_financials staging table.
# Args: $1 - reporting year
# Globals: DATA_DIR (read)
# Returns: 0 on success; 1 when the download or any database step fails.
#   Every psql step is checked, so a failed TRUNCATE or COPY no longer lets
#   the UPSERT run against stale or empty staging data and report success.
#
# Column layouts:
#   2016+ (22 cols): CUI,CAEN,I1..I20; I20 = salariati.
#   2015  (21 cols): CUI,CAEN,I1..I19. The pre-2016 reporting ordering omits
#     the modern I12 (patrimoniul_regiei) column entirely and shifts
#     everything from cifra_afaceri onward one position left:
#       2015 I12 <-> modern I13 (cifra_afaceri)
#       2015 I13 <-> modern I14 (venituri_total)
#       ...
#       2015 I18 <-> modern I19 (pierdere_neta)
#       2015 I19 <-> modern I20 (numar_salariati)
#     Verified by matching cifra_afaceri / salariati to a stable CUI's
#     2016-2024 series. Without this remap, salariati was being ingested
#     as pierdere_neta and cifra_afaceri was off by one column.
#
# NOTE(review): on conflict only `source` and `caen` are updated; the numeric
# columns of an existing row are never rewritten, so re-running after a
# mapping fix does not correct rows that are already loaded. Confirm intent.
import_uu() {
  local year="$1"
  local file="$DATA_DIR/web_uu_${year}.txt"
  local slug="situatii_financiare_${year}"
  local pattern="^web_uu.*${year}\\.txt$"
  local url ncols k copy_cols select_cols upsert_note

  case "$year" in
    2015)
      ncols=21
      upsert_note="UPSERT (2015 left-shift remap)..."
      # patrimoniul_regiei is absent in 2015, hence the NULL placeholder.
      select_cols="i1, i2, i3, i4, i5, i6, i7, i8, i9,
i10, i11,
NULL::numeric(20,2),
i12, i13, i14, i15, i16, i17, i18,
CASE WHEN i19 BETWEEN 0 AND 100000000 THEN i19::bigint ELSE NULL END"
      ;;
    *)
      ncols=22
      upsert_note="UPSERT..."
      select_cols="i1, i2, i3, i4, i5, i6, i7, i8, i9,
i10, i11, i12, i13, i14, i15, i16, i17, i18, i19,
CASE WHEN i20 BETWEEN 0 AND 100000000 THEN i20::bigint ELSE NULL END"
      ;;
  esac

  # Staging column list: cui, caen, then one iN per indicator in the file.
  # It must stay on one line because \copy ends at the end of the line.
  copy_cols="cui, caen"
  for ((k = 1; k <= ncols - 2; k++)); do
    copy_cols+=", i${k}"
  done

  if [ ! -s "$file" ]; then
    url=$(discover "$slug" "$pattern")
    fetch "$file" "$url" || return 1
  fi

  log "[$year/WEB_UU] COPY $file ($(stat -c%s "$file") bytes, $ncols cols)..."
  psql -v ON_ERROR_STOP=1 -c "TRUNCATE TABLE firms.staging_financials;" || return 1
  psql -v ON_ERROR_STOP=1 <<COPYEOF || return 1
\\copy firms.staging_financials ($copy_cols) FROM '$file' WITH (FORMAT csv, DELIMITER ',', HEADER true, NULL '');
COPYEOF

  log "[$year/WEB_UU] $upsert_note"
  psql -v ON_ERROR_STOP=1 <<SQL || return 1
INSERT INTO firms.financials (
  cui, year, caen,
  active_imobilizate, active_circulante, stocuri, creante, casa_banci,
  cheltuieli_avans, datorii, venituri_avans, provizioane,
  capitaluri_total, capital_subscris, patrimoniul_regiei,
  cifra_afaceri, venituri_total, cheltuieli_total,
  profit_brut, pierdere_bruta, profit_net, pierdere_neta,
  numar_salariati, source
)
SELECT DISTINCT ON (cui)
  cui, $year, caen,
  $select_cols,
  'mfinante:WEB_UU'
FROM firms.staging_financials
WHERE cui IS NOT NULL AND cui != '' AND cui != '0'
ORDER BY cui
ON CONFLICT (cui, year) DO UPDATE SET
  source = CASE
    WHEN firms.financials.source = 'mfinante:WEB_UU' THEN firms.financials.source
    ELSE EXCLUDED.source
  END,
  caen = CASE
    WHEN firms.financials.source = 'mfinante:WEB_UU' THEN firms.financials.caen
    ELSE EXCLUDED.caen
  END;
SQL
}
# ─── WEB_BL_BS_SL ────────────────────────────────────────────────────────
# Load one year's WEB_BL_BS_SL file into firms.financials.
# Args: $1 - reporting year
# Globals: DATA_DIR (read)
# Returns: 0 on success; 1 when the download or any database step fails.
#   In the 22-column path each psql step is checked, so a failed TRUNCATE or
#   COPY no longer lets the UPSERT run and report success.
#
# NOTE(review): on conflict only `source` and `caen` are updated, and only
# when the existing row is not from WEB_UU; numeric columns of an existing
# row are never rewritten. Confirm this is the intended precedence rule.
import_bl() {
  local year="$1"
  local file="$DATA_DIR/web_bl_bs_sl_${year}.txt"
  local slug="situatii_financiare_${year}"
  local pattern="^web_bl_bs_sl.*${year}\\.txt$"
  local url ncols k copy_cols

  case "$year" in
    2015) ncols=23 ;; # has extra I21
    *) ncols=22 ;;
  esac

  if [ ! -s "$file" ]; then
    url=$(discover "$slug" "$pattern")
    fetch "$file" "$url" || return 1
  fi

  log "[$year/WEB_BL_BS_SL] COPY $file ($(stat -c%s "$file") bytes, $ncols cols)..."
  if [ "$ncols" -eq 22 ]; then
    # Standard layout: CUI,CAEN,I1..I20, loaded through the shared staging
    # table. The column list stays on one line because \copy is line-bound.
    copy_cols="cui, caen"
    for ((k = 1; k <= 20; k++)); do
      copy_cols+=", i${k}"
    done
    psql -v ON_ERROR_STOP=1 -c "TRUNCATE TABLE firms.staging_financials;" || return 1
    psql -v ON_ERROR_STOP=1 <<COPYEOF || return 1
\\copy firms.staging_financials ($copy_cols) FROM '$file' WITH (FORMAT csv, DELIMITER ',', HEADER true, NULL '');
COPYEOF
    log "[$year/WEB_BL_BS_SL] UPSERT..."
    psql -v ON_ERROR_STOP=1 <<SQL || return 1
INSERT INTO firms.financials (
  cui, year, caen,
  active_imobilizate, active_circulante, stocuri, creante, casa_banci,
  cheltuieli_avans, datorii, venituri_avans, provizioane,
  capitaluri_total, capital_subscris, patrimoniul_regiei,
  cifra_afaceri, venituri_total, cheltuieli_total,
  profit_brut, pierdere_bruta, profit_net, pierdere_neta,
  numar_salariati, source
)
SELECT DISTINCT ON (cui)
  cui, $year, caen,
  i1, i2, i3, i4, i5, i6, i7, i8, i9,
  i10, i11, i12, i13, i14, i15, i16, i17, i18, i19,
  CASE WHEN i20 BETWEEN 0 AND 100000000 THEN i20::bigint ELSE NULL END,
  'mfinante:WEB_BL_BS_SL'
FROM firms.staging_financials
WHERE cui IS NOT NULL AND cui != '' AND cui != '0'
ORDER BY cui
ON CONFLICT (cui, year) DO UPDATE SET
  source = CASE
    WHEN firms.financials.source = 'mfinante:WEB_UU' THEN firms.financials.source
    ELSE EXCLUDED.source
  END,
  caen = CASE
    WHEN firms.financials.source = 'mfinante:WEB_UU' THEN firms.financials.caen
    ELSE EXCLUDED.caen
  END;
SQL
  else
    # 2015 BL_BS_SL schema (23 cols, CUI,CAEN,I1..I21). The pre-2016 BL
    # reporting has an extra (unknown) field somewhere between
    # capital_subscris (I11) and cifra_afaceri. Empirically (cross-checked
    # CUI 538310 against 2016-2024 series): cifra_afaceri lives at I14
    # (not I13), salariati at I21. Treat I12,I13 as patrimoniul_regiei +
    # an unmapped field (likely related to regii autonome / provizioane
    # detail); both empty for typical SRLs. Map:
    #   2015 BL I1..I11  = modern I1..I11
    #   2015 BL I12      -> patrimoniul_regiei (modern I12)
    #   2015 BL I13      -> dropped (unknown)
    #   2015 BL I14      -> cifra_afaceri (modern I13)
    #   2015 BL I15..I20 -> modern I14..I19
    #   2015 BL I21      -> numar_salariati (modern I20)
    # Everything runs in ONE psql session because the TEMP table only lives
    # for the duration of that session.
    psql -v ON_ERROR_STOP=1 <<COPYEOF || return 1
CREATE TEMP TABLE tmp_bl23 (
  cui text, caen text,
  i1 numeric(20,2), i2 numeric(20,2), i3 numeric(20,2), i4 numeric(20,2),
  i5 numeric(20,2), i6 numeric(20,2), i7 numeric(20,2), i8 numeric(20,2),
  i9 numeric(20,2), i10 numeric(20,2), i11 numeric(20,2), i12 numeric(20,2),
  i13 numeric(20,2), i14 numeric(20,2), i15 numeric(20,2), i16 numeric(20,2),
  i17 numeric(20,2), i18 numeric(20,2), i19 numeric(20,2), i20 numeric(20,2),
  i21 numeric(20,2)
); -- session-scoped; dropped when psql exits
\\copy tmp_bl23 FROM '$file' WITH (FORMAT csv, DELIMITER ',', HEADER true, NULL '');
INSERT INTO firms.financials (
  cui, year, caen,
  active_imobilizate, active_circulante, stocuri, creante, casa_banci,
  cheltuieli_avans, datorii, venituri_avans, provizioane,
  capitaluri_total, capital_subscris, patrimoniul_regiei,
  cifra_afaceri, venituri_total, cheltuieli_total,
  profit_brut, pierdere_bruta, profit_net, pierdere_neta,
  numar_salariati, source
)
SELECT DISTINCT ON (cui)
  cui, $year, caen,
  i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11,
  i12, -- patrimoniul_regiei
  i14, i15, i16, i17, i18, i19, i20, -- cifra_afaceri..pierdere_neta
  CASE WHEN i21 BETWEEN 0 AND 100000000 THEN i21::bigint ELSE NULL END,
  'mfinante:WEB_BL_BS_SL'
FROM tmp_bl23
WHERE cui IS NOT NULL AND cui != '' AND cui != '0'
ORDER BY cui
ON CONFLICT (cui, year) DO UPDATE SET
  source = CASE
    WHEN firms.financials.source = 'mfinante:WEB_UU' THEN firms.financials.source
    ELSE EXCLUDED.source
  END,
  caen = CASE
    WHEN firms.financials.source = 'mfinante:WEB_UU' THEN firms.financials.caen
    ELSE EXCLUDED.caen
  END;
COPYEOF
  fi
}
# ─── WEB_ONG (49 cols; a 51-col variant with two extra text columns is ────
# ─── reduced to 49 before loading) ───────────────────────────────────────
# Load one year's WEB_ONG file into firms.financials_ong through the
# firms.staging_ong staging table.
# Args: $1 - reporting year
# Globals: DATA_DIR (read)
# Returns: 0 on success or when the file has an unexpected column count
#   (logged and skipped); 1 when the download, the preprocessing or any
#   database step fails. Every step is checked, so a failed TRUNCATE/COPY no
#   longer lets the UPSERT run and report success.
import_ong() {
  local year="$1"
  local file="$DATA_DIR/web_ong_${year}.txt"
  local slug="situatii_financiare_${year}"
  local url header_cols k copy_cols ind_pairs cleaned copy_rc

  if [ ! -s "$file" ]; then
    url=$(discover "$slug" "^web_ong.*${year}\\.txt$")
    fetch "$file" "$url" || return 1
  fi

  # Staging column list (cui, caen, caeno, i1..i46) and the matching
  # key/value pairs for the JSONB `indicators` column.
  copy_cols="cui, caen, caeno"
  ind_pairs=""
  for ((k = 1; k <= 46; k++)); do
    copy_cols+=", i${k}"
    ind_pairs+="${ind_pairs:+, }'i${k}', NULLIF(i${k}, '')"
  done

  # Column count is taken from the header line only.
  header_cols=$(head -1 "$file" | tr ',' '\n' | wc -l)
  log "[$year/WEB_ONG] COPY $file ($(stat -c%s "$file") bytes, $header_cols cols)..."
  psql -v ON_ERROR_STOP=1 -c "TRUNCATE TABLE firms.staging_ong;" || return 1

  if [ "$header_cols" -eq 49 ]; then
    psql -v ON_ERROR_STOP=1 <<COPYEOF || return 1
\\copy firms.staging_ong ($copy_cols) FROM '$file' WITH (FORMAT csv, DELIMITER ',', HEADER true, NULL '');
COPYEOF
  elif [ "$header_cols" -eq 51 ]; then
    # 2018 schema: ...,I44,DEN_CAENO,I45,DEN_CAEN,I46 (extra UNQUOTED text
    # columns whose contents contain commas — breaks naive CSV parsing).
    # Preprocess into a 49-col file by walking backwards from end to identify
    # the two text columns (variable comma count).
    cleaned="${file}.cleaned49"
    log "[$year/WEB_ONG] Preprocessing 51→49 cols (stripping DEN_CAEN/DEN_CAENO)..."
    if ! python3 - "$file" "$cleaned" <<'PYEOF'
import sys
src, dst = sys.argv[1], sys.argv[2]
NUM_RE = __import__('re').compile(r'^-?\d+(\.\d+)?$|^$')
out = open(dst, 'w')
with open(src) as fh:
    header = fh.readline().rstrip('\n').split(',')
    # write reduced header (drop DEN_CAEN, DEN_CAENO positions 47 and 49, zero-indexed)
    keep = [i for i, h in enumerate(header) if h.upper() not in ('DEN_CAEN', 'DEN_CAENO')]
    out.write(','.join(header[i] for i in keep) + '\n')
    for line in fh:
        line = line.rstrip('\n')
        parts = line.split(',')
        # Walk from end: parts[-1] = i46 (numeric), then DEN_CAEN spans
        # multiple parts (text). parts[-X] = i45 (numeric/empty), then
        # DEN_CAENO spans, then parts[-Y] = i44 (numeric/empty).
        n = len(parts)
        # i46 = parts[n-1]; find i45 = first numeric/empty going back from n-2.
        i46_idx = n - 1
        # walk backwards skipping non-numeric until we hit numeric -> that's i45
        j = n - 2
        while j >= 0 and not NUM_RE.match(parts[j]):
            j -= 1
        i45_idx = j
        # continue back past the DEN_CAENO text to find i44
        j -= 1
        while j >= 0 and not NUM_RE.match(parts[j]):
            j -= 1
        i44_idx = j
        if i44_idx < 0 or i45_idx < 0:
            # malformed row — skip
            continue
        # Reassemble: parts[0..i44_idx] + parts[i45_idx] + parts[i46_idx]
        new_parts = parts[:i44_idx+1] + [parts[i45_idx]] + [parts[i46_idx]]
        if len(new_parts) != 49:
            # row doesn't fit expected 49-col output → skip
            continue
        out.write(','.join(new_parts) + '\n')
out.close()
PYEOF
    then
      log "[$year/WEB_ONG] preprocessing failed"
      rm -f "$cleaned"
      return 1
    fi
    log "[$year/WEB_ONG] Cleaned $(wc -l < "$cleaned") lines (incl. header)"
    copy_rc=0
    psql -v ON_ERROR_STOP=1 <<COPYEOF || copy_rc=$?
\\copy firms.staging_ong ($copy_cols) FROM '$cleaned' WITH (FORMAT csv, DELIMITER ',', HEADER true, NULL '');
COPYEOF
    # Remove the intermediate file whether or not the COPY worked.
    rm -f "$cleaned"
    [ "$copy_rc" -eq 0 ] || return 1
  else
    log "[$year/WEB_ONG] unexpected col count $header_cols, skipping"
    return 0
  fi

  log "[$year/WEB_ONG] UPSERT..."
  # Staging values are compared against '' and cast, i.e. handled as text.
  # Typed columns come from i12/i38/i40/i42/i45/i46; every non-empty
  # indicator is also kept in the JSONB `indicators` column.
  psql -v ON_ERROR_STOP=1 <<SQL || return 1
INSERT INTO firms.financials_ong (
  cui, year, caen, caeno,
  capitaluri_proprii, venituri_total, cheltuieli_total, excedent,
  personal_neeconomic, personal_economic, indicators
)
SELECT DISTINCT ON (cui)
  cui, $year, caen, caeno,
  NULLIF(i12, '')::numeric(20,2),
  NULLIF(i38, '')::numeric(20,2),
  NULLIF(i40, '')::numeric(20,2),
  NULLIF(i42, '')::numeric(20,2),
  CASE WHEN NULLIF(i45, '') ~ '^[0-9]+\$' AND NULLIF(i45, '')::bigint BETWEEN 0 AND 100000000 THEN i45::bigint ELSE NULL END,
  CASE WHEN NULLIF(i46, '') ~ '^[0-9]+\$' AND NULLIF(i46, '')::bigint BETWEEN 0 AND 100000000 THEN i46::bigint ELSE NULL END,
  jsonb_strip_nulls(jsonb_build_object($ind_pairs))
FROM firms.staging_ong
WHERE cui IS NOT NULL AND cui != '' AND cui != '0'
ORDER BY cui
ON CONFLICT (cui, year) DO UPDATE SET
  caen = EXCLUDED.caen,
  caeno = EXCLUDED.caeno,
  capitaluri_proprii = EXCLUDED.capitaluri_proprii,
  venituri_total = EXCLUDED.venituri_total,
  cheltuieli_total = EXCLUDED.cheltuieli_total,
  excedent = EXCLUDED.excedent,
  personal_neeconomic = EXCLUDED.personal_neeconomic,
  personal_economic = EXCLUDED.personal_economic,
  indicators = EXCLUDED.indicators,
  fetched_at = now();
SQL
}
# ─── WEB_INST_DE_CREDIT (banks) — pre-IFRS schemas vary by year ─────────
# 2015: not published. 2016/2017/2019: 23 cols (I1..I21). 2018: not published.
# 2020/2021/2022: 23 cols (I21). 2023: 24 cols (I22). 2024: 25 cols (I23).
#
# Load one year's bank file into firms.financials_banks. The column count is
# read from the file header and a TEMP table of that width is created in the
# same psql session that performs the COPY and the UPSERT.
# Args: $1 - reporting year
# Globals: DATA_DIR (read)
# Returns: 0 on success, when the dataset has no bank file, or when the
#   indicator count is not 21/22/23 (logged and skipped); non-zero when the
#   download or the psql session fails.
import_bank() {
  local year="$1"
  local file="$DATA_DIR/web_inst_de_credit_${year}.txt"
  local slug="situatii_financiare_${year}"
  # Two years are looked up under a differently named dataset slug.
  case "$year" in
    2020) slug="situatii_financiare_2021" ;;
    2023) slug="situatii_financiare2023" ;;
  esac

  local url
  if [ ! -s "$file" ]; then
    url=$(discover "$slug" "^web_(inst|instit)_de_credit.*${year}\\.txt$")
    if [ -z "$url" ]; then
      log "[$year/BANK] no file in dataset, skip"
      return 0
    fi
    fetch "$file" "$url" || return 1
  fi

  # Detect column count from header line.
  local header_cols
  header_cols=$(head -1 "$file" | tr ',' '\n' | wc -l)
  log "[$year/BANK] $file ($(stat -c%s "$file") bytes, $header_cols cols)"

  # Build a TEMP table sized to the file, then map to firms.financials_banks.
  # The "cifra_afaceri" mapping: in IFRS 2024 schema (25 cols) it's i23. In
  # older 23-col schema it's i21. In 24-col schema (2023) it's i22.
  local ind_n cifra_col profit_inainte_col profit_exerc_col
  local capital_col=i14
  local activ_col=i6
  ind_n=$((header_cols - 2)) # i1..iN
  case "$ind_n" in
    21) cifra_col=i21; profit_inainte_col=i17; profit_exerc_col=i20 ;;
    22) cifra_col=i22; profit_inainte_col=i18; profit_exerc_col=i21 ;;
    23) cifra_col=i23; profit_inainte_col=i19; profit_exerc_col=i22 ;;
    *)
      log "[$year/BANK] unexpected indicator count $ind_n, skipping"
      return 0
      ;;
  esac

  # Build dynamic column list for TEMP table and \copy. The loop counter is
  # local so the function does not overwrite a caller's variable.
  local k
  local cols_def="cui text, caen text"
  local cols_list="cui, caen"
  local ind_pairs=""
  for ((k = 1; k <= ind_n; k++)); do
    cols_def+=", i${k} text"
    cols_list+=", i${k}"
    ind_pairs+="${ind_pairs:+, }'i${k}', NULLIF(i${k}, '')"
  done

  psql -v ON_ERROR_STOP=1 <<COPYEOF
CREATE TEMP TABLE tmp_bank (
  $cols_def
); -- session-scoped; dropped when psql exits
\\copy tmp_bank ($cols_list) FROM '$file' WITH (FORMAT csv, DELIMITER ',', HEADER true, NULL '');
INSERT INTO firms.financials_banks (
  cui, year, caen,
  active_financiare_amortiz, capital_social, profit_exercitiu,
  profit_inainte_impozit, cifra_afaceri, indicators, source
)
SELECT DISTINCT ON (cui)
  cui, $year, caen,
  NULLIF($activ_col, '')::numeric(20,2),
  NULLIF($capital_col, '')::numeric(20,2),
  NULLIF($profit_exerc_col, '')::numeric(20,2),
  NULLIF($profit_inainte_col, '')::numeric(20,2),
  NULLIF($cifra_col, '')::numeric(20,2),
  jsonb_strip_nulls(jsonb_build_object($ind_pairs)),
  'mfinante:WEB_Inst_de_credit'
FROM tmp_bank
WHERE cui IS NOT NULL AND cui != '' AND cui != '0'
ORDER BY cui
ON CONFLICT (cui, year) DO UPDATE SET
  caen = EXCLUDED.caen,
  active_financiare_amortiz = EXCLUDED.active_financiare_amortiz,
  capital_social = EXCLUDED.capital_social,
  profit_exercitiu = EXCLUDED.profit_exercitiu,
  profit_inainte_impozit = EXCLUDED.profit_inainte_impozit,
  cifra_afaceri = EXCLUDED.cifra_afaceri,
  indicators = EXCLUDED.indicators,
  source = EXCLUDED.source,
  fetched_at = now();
COPYEOF
}
# CATEGORIES env var filters which sub-imports run. Default = all.
# Useful: CATEGORIES="bank" to skip companies and only redo banks.
CATEGORIES="${CATEGORIES:-uu bl ong bank}"

# Run every requested category for every requested year.
# Globals: YEARS, CATEGORIES (read; both space-separated lists)
# A failing import is logged and the loop moves on to the next one.
run_imports() {
  local year category
  # Word splitting of the two lists is intentional.
  for year in ${YEARS:-}; do
    # The year is interpolated into SQL text and file names by the import
    # functions, so anything that is not exactly four digits is refused.
    if [[ ! "$year" =~ ^[0-9]{4}$ ]]; then
      log "[WARN] ignoring invalid year '$year' (expected 4 digits)"
      continue
    fi
    log "── Year $year ──────────────────────────────"
    for category in $CATEGORIES; do
      case "$category" in
        uu) import_uu "$year" || log "[$year/WEB_UU] failed" ;;
        bl) import_bl "$year" || log "[$year/WEB_BL_BS_SL] failed" ;;
        ong) import_ong "$year" || log "[$year/WEB_ONG] failed" ;;
        bank) import_bank "$year" || log "[$year/BANK] failed" ;;
        # A typo in CATEGORIES used to be skipped without any trace.
        *) log "[WARN] unknown category '$category' (expected: uu bl ong bank)" ;;
      esac
    done
  done
}
run_imports
log "=== Refreshing latest-year MV ==="
# Best-effort: a failed refresh must not prevent the coverage report below,
# but it is recorded in the log instead of being silently discarded.
if ! psql -v ON_ERROR_STOP=1 -c "REFRESH MATERIALIZED VIEW firms.mv_financials_latest;"; then
  log "[WARN] REFRESH MATERIALIZED VIEW firms.mv_financials_latest failed"
fi

# Row counts per table and year, printed and appended to the run log.
log "=== Final coverage ==="
psql -c "
SELECT 'fin' AS tbl, year, COUNT(*) AS n FROM firms.financials GROUP BY year
UNION ALL
SELECT 'ong' AS tbl, year, COUNT(*) AS n FROM firms.financials_ong GROUP BY year
UNION ALL
SELECT 'bank' AS tbl, year, COUNT(*) AS n FROM firms.financials_banks GROUP BY year
ORDER BY tbl, year;
" 2>&1 | tee -a "$LOG"
log "=== Historical import done ==="