a6c03a091e
Moved from gov-agreg/src/pages/achizitii/* to root (drop prefix). - 22 pages migrated, 127 files total - All internal links: /achizitii/X → /X (176 occurrences fixed) - AchizitiiLayout subnav rewritten: /X paths, top-right link to vreaudigital.ro hub - BaseLayout new (vreau.digital branding, OG tags, site URL) - astro.config.mjs: site https://vreau.digital, server output (was static) - docker-compose: port 5096 (vreaudigital is 5095), container vreau-digital - deploy.sh: paths /opt/vreau-digital, log /var/log/vreau-digital-deploy.log Backend shared with gov-agreg: - PostgreSQL satra (same schemas: seap, firms, anaf, anre, ...) - Photon, Martin tiles - Infisical /vreaudigital path (DATABASE_URL etc. shared) build: PASS (npx astro check 0 errors, npm run build 5s vite + 10s server)
527 lines
22 KiB
Bash
527 lines
22 KiB
Bash
#!/bin/bash
# Historical financial backfill 2015-2019 from data.gov.ro / MFP.
#
# Why a separate script: 2015 and pre-2020 files have slightly different
# schemas (WEB_UU 2015 has 21 cols vs 22 for 2016+; WEB_BL_BS_SL 2015 has 23
# cols vs 22 for 2016+; WEB_INST_DE_CREDIT 2016/2017/2019 has 23 cols vs 25
# for 2024). The daily importer (import-financials.sh +
# import-financials-ong-banks.sh) assumes the 2020+ schema and silently fails
# or rejects older years. This wrapper:
#   1) Downloads the right files from data.gov.ro for the requested years.
#   2) Loads them via a staging table or a session-local TEMP TABLE matched
#      to that year's column count, then INSERTs into the canonical
#      firms.financials* tables.
#
# Usage on satra:
#   /opt/vreaudigital/services/seap-scraper/cron/import-financials-historical.sh
#   YEARS="2017 2018" /opt/...../import-financials-historical.sh   # subset
#   CATEGORIES="bank" /opt/...../import-financials-historical.sh   # only banks
#
# Idempotent — PK (cui, year) + ON CONFLICT DO UPDATE.
#
# Banks: 2015 and 2018 have no Inst_de_credit file at data.gov.ro. Banks for
# 2016/2017/2019 use the pre-IFRS schema (21 indicators), so this script also
# loads pre-2020 bank files into firms.financials_banks with the JSONB
# `indicators` column carrying everything; the typed columns are mapped
# best-effort (i21 instead of i23 → cifra_afaceri).

# Abort on unset variables and let a pipeline fail when any stage fails.
# `-e` is not enabled: a failing command does not stop the script, so every
# step that matters has to check its own status.
set -uo pipefail

DATA_DIR=/opt/vreaudigital/data/mfinante
LOG=/var/log/vreaudigital-fin-historical.log

# Print a timestamped message on stdout and append the same line to $LOG.
# Globals:   LOG (read)
# Arguments: $* - message text (all arguments are joined with spaces)
# Returns:   status of the printf | tee pipeline (non-zero if $LOG is not
#            writable); callers must not treat that as an import failure.
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" | tee -a "$LOG"
}

mkdir -p "$DATA_DIR" || { log "[ERR] cannot create $DATA_DIR"; exit 1; }

# ─── Database credentials ────────────────────────────────────────────────
# The INFISICAL_* variables used below are expected to come from this file.
# shellcheck disable=SC1091
source /opt/vreaudigital/.infisical-mi \
  || { log "[ERR] cannot read /opt/vreaudigital/.infisical-mi"; exit 1; }

# Every step is checked: with empty PG* variables psql would fall back to its
# default connection and the TRUNCATE/INSERT statements would run there.
TOKEN=$(infisical login --method=universal-auth --domain="$INFISICAL_API_URL" \
  --client-id="$INFISICAL_CLIENT_ID" --client-secret="$INFISICAL_CLIENT_SECRET" \
  --silent --plain) || TOKEN=""
if [ -z "$TOKEN" ]; then
  log "[ERR] Infisical login failed"
  exit 1
fi

DBURL=$(infisical run --domain="$INFISICAL_API_URL" \
  --projectId="$INFISICAL_PROJECT_ID" --env="$INFISICAL_ENV" \
  --path="$INFISICAL_PATH" --silent --token="$TOKEN" \
  -- sh -c 'echo "$DATABASE_URL"') || DBURL=""
if [ -z "$DBURL" ]; then
  log "[ERR] DATABASE_URL could not be read from Infisical"
  exit 1
fi

# Drop the schema=... query parameter before splitting the URL.
DB=$(printf '%s\n' "$DBURL" | sed -E 's/[?&]schema=[^&]*//; s/\?$//')

# Split postgres[ql]://user:password@host[:port]/database[?params].
# A single anchored match is used instead of one sed per field: sed prints its
# input unchanged when the pattern does not match, which would put the whole
# URL (password included) into e.g. PGPORT for a URL without a port.
# NOTE(review): percent-encoded characters in user/password are not decoded.
db_url_re='^postgres(ql)?://([^:@/]+):([^@]*)@([^:/?]+)(:([0-9]+))?/([^?&]+)'
if [[ "$DB" =~ $db_url_re ]]; then
  export PGUSER="${BASH_REMATCH[2]}"
  export PGPASSWORD="${BASH_REMATCH[3]}"
  export PGHOST="${BASH_REMATCH[4]}"
  export PGPORT="${BASH_REMATCH[6]:-5432}"
  export PGDATABASE="${BASH_REMATCH[7]}"
else
  log "[ERR] DATABASE_URL is not of the form postgresql://user:pass@host[:port]/db"
  exit 1
fi
unset DBURL TOKEN DB db_url_re

# Space-separated list of years to import; override via the environment.
YEARS="${YEARS:-2015 2016 2017 2018 2019}"

log "=== Historical financial import started (YEARS=$YEARS) ==="

# Discover a download URL from a data.gov.ro dataset by resource name.
# Arguments:
#   $1 - dataset slug, passed as `id` to the package_show API
#   $2 - Python regex, matched case-insensitively against each resource name
# Outputs:   the `url` of the first matching resource on stdout; nothing when
#            the request fails, the body is not JSON, or no resource matches.
# Returns:   non-zero when curl or the JSON parsing fails; 0 otherwise, even
#            if nothing matched — callers test the output for emptiness.
discover() {
  local slug="$1"
  local pattern="$2"
  # The pattern travels as argv[1] rather than being spliced into the Python
  # source, so quotes or backslashes in it can neither break nor inject code.
  curl -fsSL --max-time 30 \
      "https://data.gov.ro/api/3/action/package_show?id=$slug" 2>/dev/null \
    | python3 -c '
import json
import re
import sys

pat = re.compile(sys.argv[1], re.I)
try:
    doc = json.load(sys.stdin)
except ValueError:
    # Empty or non-JSON body (e.g. the download itself failed).
    sys.exit(1)

# "result", "resources", "name" and "url" may be missing or null.
for res in (doc.get("result") or {}).get("resources") or []:
    if pat.search(res.get("name") or ""):
        print(res.get("url") or "")
        break
' "$pattern"
}

# Download a file from data.gov.ro unless a non-empty copy already exists.
# Arguments:
#   $1 - local destination path
#   $2 - source URL (may be empty when discovery found nothing)
# Returns:   0 when a non-empty file is in place afterwards, 1 otherwise.
fetch() {
  local file="$1"
  local url="$2"
  local part="${file}.part"

  if [ -s "$file" ]; then
    log "  [SKIP] $file already exists ($(stat -c%s "$file") bytes)"
    return 0
  fi
  if [ -z "$url" ]; then
    log "  [ERR] No URL for $file"
    return 1
  fi

  log "  Downloading $url → $file"
  # Download next to the target and rename only on success, so an interrupted
  # run cannot leave a truncated file that the [SKIP] check would later trust.
  if ! curl -fsL --max-time 300 -o "$part" "$url"; then
    log "  [ERR] download failed"
    rm -f -- "$part" "$file"
    return 1
  fi
  if [ ! -s "$part" ]; then
    log "  [ERR] download produced an empty file"
    rm -f -- "$part"
    return 1
  fi
  if ! mv -f -- "$part" "$file"; then
    log "  [ERR] cannot move $part into place"
    rm -f -- "$part"
    return 1
  fi

  log "  OK $(stat -c%s "$file") bytes"
  # Explicit status: the result must not depend on whether logging succeeded.
  return 0
}

# ─── WEB_UU (companies, prescurtat) ──────────────────────────────────────
# Import one year of WEB_UU into firms.financials.
#
# The file is copied into firms.staging_financials (truncated first) and then
# upserted. Two layouts are handled:
#   * 2016+ (22 cols): CUI,CAEN,I1..I20. I20 = salariati.
#   * 2015  (21 cols): CUI,CAEN,I1..I19. The pre-2016 reporting ordering omits
#     the modern I12 (patrimoniul_regiei) column entirely and shifts
#     everything from cifra_afaceri onward one position left:
#       2015 I12 ↔ modern I13 (cifra_afaceri)
#       2015 I13 ↔ modern I14 (venituri_total)
#       ...
#       2015 I18 ↔ modern I19 (pierdere_neta)
#       2015 I19 ↔ modern I20 (numar_salariati)
#     Verified by matching cifra_afaceri / salariati to a stable CUI's
#     2016-2024 series. Without this remap, salariati was being ingested as
#     pierdere_neta and cifra_afaceri was off by one column.
#
# NOTE(review): on conflict only `source` and `caen` are refreshed, and not
# even those when the stored row already comes from WEB_UU — indicator values
# of existing rows are never rewritten. Confirm that this is intended.
#
# Globals:   DATA_DIR (read)
# Arguments: $1 - reporting year (four digits)
# Returns:   0 on success; 1 on an invalid year or when the download,
#            TRUNCATE, COPY or UPSERT step fails.
import_uu() {
  local year="$1"
  local file="$DATA_DIR/web_uu_${year}.txt"
  local slug="situatii_financiare_${year}"
  local pattern="^web_uu.*${year}\\.txt$"
  local url ncols n_ind copy_cols select_vals upsert_note i

  # $year is interpolated into SQL below, so accept plain 4-digit years only.
  if [[ ! "$year" =~ ^[0-9]{4}$ ]]; then
    log "[$year/WEB_UU] invalid year, skipping"
    return 1
  fi

  # Per-layout pieces: column count, and the SELECT expressions that feed
  # active_imobilizate .. numar_salariati (20 target columns).
  if [ "$year" = "2015" ]; then
    ncols=21
    upsert_note=" (2015 left-shift remap)"
    # NULL stands in for patrimoniul_regiei, which the 2015 file does not have.
    select_vals="i1, i2, i3, i4, i5, i6, i7, i8, i9,
  i10, i11,
  NULL::numeric(20,2),
  i12, i13, i14, i15, i16, i17, i18,
  CASE WHEN i19 BETWEEN 0 AND 100000000 THEN i19::bigint ELSE NULL END"
  else
    ncols=22
    upsert_note=""
    select_vals="i1, i2, i3, i4, i5, i6, i7, i8, i9,
  i10, i11, i12, i13, i14, i15, i16, i17, i18, i19,
  CASE WHEN i20 BETWEEN 0 AND 100000000 THEN i20::bigint ELSE NULL END"
  fi

  # COPY column list: cui, caen, i1..iN with N = ncols - 2.
  n_ind=$(( ncols - 2 ))
  copy_cols="cui, caen"
  for (( i = 1; i <= n_ind; i++ )); do
    copy_cols+=", i${i}"
  done

  if [ ! -s "$file" ]; then
    url=$(discover "$slug" "$pattern")
    fetch "$file" "$url" || return 1
  fi

  log "[$year/WEB_UU] COPY $file ($(stat -c%s "$file") bytes, $ncols cols)..."
  # Each step is checked: without `set -e` a failed COPY would otherwise be
  # followed by an UPSERT from an empty staging table and a "success" status.
  psql -v ON_ERROR_STOP=1 -c "TRUNCATE TABLE firms.staging_financials;" || return 1
  psql -v ON_ERROR_STOP=1 <<COPYEOF || return 1
\\copy firms.staging_financials ($copy_cols) FROM '$file' WITH (FORMAT csv, DELIMITER ',', HEADER true, NULL '');
COPYEOF

  log "[$year/WEB_UU] UPSERT${upsert_note}..."
  psql -v ON_ERROR_STOP=1 <<SQL || return 1
INSERT INTO firms.financials (
  cui, year, caen,
  active_imobilizate, active_circulante, stocuri, creante, casa_banci,
  cheltuieli_avans, datorii, venituri_avans, provizioane,
  capitaluri_total, capital_subscris, patrimoniul_regiei,
  cifra_afaceri, venituri_total, cheltuieli_total,
  profit_brut, pierdere_bruta, profit_net, pierdere_neta,
  numar_salariati, source
)
SELECT DISTINCT ON (cui)
  cui, $year, caen,
  $select_vals,
  'mfinante:WEB_UU'
FROM firms.staging_financials
WHERE cui IS NOT NULL AND cui != '' AND cui != '0'
ORDER BY cui
ON CONFLICT (cui, year) DO UPDATE SET
  source = CASE
    WHEN firms.financials.source = 'mfinante:WEB_UU' THEN firms.financials.source
    ELSE EXCLUDED.source
  END,
  caen = CASE
    WHEN firms.financials.source = 'mfinante:WEB_UU' THEN firms.financials.caen
    ELSE EXCLUDED.caen
  END;
SQL
}

# ─── WEB_BL_BS_SL ────────────────────────────────────────────────────────
# Import one year of WEB_BL_BS_SL into firms.financials.
#
#   * 2016+ (22 cols, CUI,CAEN,I1..I20): staged in firms.staging_financials
#     (truncated first), then upserted.
#   * 2015 (23 cols, CUI,CAEN,I1..I21): loaded through a TEMP table inside a
#     single psql session (the table disappears when psql exits). The pre-2016
#     BL reporting has an extra (unknown) field somewhere between
#     capital_subscris (I11) and cifra_afaceri. Empirically (cross-checked
#     CUI 538310 against 2016-2024 series): cifra_afaceri lives at I14 (not
#     I13), salariati at I21. Treat I12,I13 as patrimoniul_regiei + an
#     unmapped field (likely related to regii autonome / provizioane detail);
#     both empty for typical SRLs. Map:
#       2015 BL I1..I11  = modern I1..I11
#       2015 BL I12      → patrimoniul_regiei (modern I12)
#       2015 BL I13      → dropped (unknown)
#       2015 BL I14      → cifra_afaceri (modern I13)
#       2015 BL I15..I20 → modern I14..I19
#       2015 BL I21      → numar_salariati (modern I20)
#
# NOTE(review): on conflict only `source` and `caen` are refreshed (and kept
# as-is when the stored row comes from WEB_UU); indicator values of existing
# rows are never rewritten. Confirm that this is intended.
#
# Globals:   DATA_DIR (read)
# Arguments: $1 - reporting year (four digits)
# Returns:   0 on success; 1 on an invalid year or when the download or any
#            SQL step fails.
import_bl() {
  local year="$1"
  local file="$DATA_DIR/web_bl_bs_sl_${year}.txt"
  local slug="situatii_financiare_${year}"
  local pattern="^web_bl_bs_sl.*${year}\\.txt$"
  local url ncols src_table select_vals upsert_sql copy_cols tmp_cols i

  # $year is interpolated into SQL below, so accept plain 4-digit years only.
  if [[ ! "$year" =~ ^[0-9]{4}$ ]]; then
    log "[$year/WEB_BL_BS_SL] invalid year, skipping"
    return 1
  fi

  if [ "$year" = "2015" ]; then
    ncols=23 # has extra I21
    src_table="tmp_bl23"
    # i12 = patrimoniul_regiei; i13 is skipped; i14..i20 feed
    # cifra_afaceri..pierdere_neta.
    select_vals="i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11,
  i12,
  i14, i15, i16, i17, i18, i19, i20,
  CASE WHEN i21 BETWEEN 0 AND 100000000 THEN i21::bigint ELSE NULL END"
  else
    ncols=22
    src_table="firms.staging_financials"
    select_vals="i1, i2, i3, i4, i5, i6, i7, i8, i9,
  i10, i11, i12, i13, i14, i15, i16, i17, i18, i19,
  CASE WHEN i20 BETWEEN 0 AND 100000000 THEN i20::bigint ELSE NULL END"
  fi

  if [ ! -s "$file" ]; then
    url=$(discover "$slug" "$pattern")
    fetch "$file" "$url" || return 1
  fi

  # The INSERT is identical for both layouts apart from the source table and
  # the SELECT expressions, so it is built once.
  upsert_sql="INSERT INTO firms.financials (
  cui, year, caen,
  active_imobilizate, active_circulante, stocuri, creante, casa_banci,
  cheltuieli_avans, datorii, venituri_avans, provizioane,
  capitaluri_total, capital_subscris, patrimoniul_regiei,
  cifra_afaceri, venituri_total, cheltuieli_total,
  profit_brut, pierdere_bruta, profit_net, pierdere_neta,
  numar_salariati, source
)
SELECT DISTINCT ON (cui)
  cui, $year, caen,
  $select_vals,
  'mfinante:WEB_BL_BS_SL'
FROM $src_table
WHERE cui IS NOT NULL AND cui != '' AND cui != '0'
ORDER BY cui
ON CONFLICT (cui, year) DO UPDATE SET
  source = CASE
    WHEN firms.financials.source = 'mfinante:WEB_UU' THEN firms.financials.source
    ELSE EXCLUDED.source
  END,
  caen = CASE
    WHEN firms.financials.source = 'mfinante:WEB_UU' THEN firms.financials.caen
    ELSE EXCLUDED.caen
  END;"

  log "[$year/WEB_BL_BS_SL] COPY $file ($(stat -c%s "$file") bytes, $ncols cols)..."
  if [ "$ncols" -eq 22 ]; then
    copy_cols="cui, caen"
    for (( i = 1; i <= 20; i++ )); do
      copy_cols+=", i${i}"
    done
    # Each step is checked: without `set -e` a failed COPY would otherwise be
    # followed by an UPSERT from an empty staging table and a success status.
    psql -v ON_ERROR_STOP=1 -c "TRUNCATE TABLE firms.staging_financials;" || return 1
    psql -v ON_ERROR_STOP=1 <<COPYEOF || return 1
\\copy firms.staging_financials ($copy_cols) FROM '$file' WITH (FORMAT csv, DELIMITER ',', HEADER true, NULL '');
COPYEOF
    log "[$year/WEB_BL_BS_SL] UPSERT..."
    psql -v ON_ERROR_STOP=1 <<SQL || return 1
$upsert_sql
SQL
  else
    tmp_cols="cui text, caen text"
    for (( i = 1; i <= 21; i++ )); do
      tmp_cols+=", i${i} numeric(20,2)"
    done
    # CREATE, COPY and INSERT share one session because of the TEMP table.
    psql -v ON_ERROR_STOP=1 <<COPYEOF || return 1
CREATE TEMP TABLE tmp_bl23 (
  $tmp_cols
);
\\copy tmp_bl23 FROM '$file' WITH (FORMAT csv, DELIMITER ',', HEADER true, NULL '');
$upsert_sql
COPYEOF
  fi
}

# ─── WEB_ONG ─────────────────────────────────────────────────────────────
# Import one year of WEB_ONG into firms.financials_ong via firms.staging_ong.
#
# Two header widths are accepted:
#   * 49 cols (CUI,CAEN,CAENO,I1..I46): copied as-is.
#   * 51 cols (2018 schema: ...,I44,DEN_CAENO,I45,DEN_CAEN,I46): the two extra
#     text columns are UNQUOTED and may contain commas, which breaks naive CSV
#     parsing. The file is first rewritten to 49 columns without them.
# Any other width is logged and skipped (status 0).
#
# Globals:   DATA_DIR (read)
# Arguments: $1 - reporting year (four digits)
# Returns:   0 on success or skip; 1 on an invalid year or when the download,
#            preprocessing or any SQL step fails.
import_ong() {
  local year="$1"
  local file="$DATA_DIR/web_ong_${year}.txt"
  local slug="situatii_financiare_${year}"
  local url header_cols cleaned copy_from copy_cols ind_pairs rc i

  # $year is interpolated into SQL below, so accept plain 4-digit years only.
  if [[ ! "$year" =~ ^[0-9]{4}$ ]]; then
    log "[$year/WEB_ONG] invalid year, skipping"
    return 1
  fi

  if [ ! -s "$file" ]; then
    url=$(discover "$slug" "^web_ong.*${year}\\.txt$")
    fetch "$file" "$url" || return 1
  fi

  header_cols=$(head -1 "$file" | tr ',' '\n' | wc -l)
  log "[$year/WEB_ONG] COPY $file ($(stat -c%s "$file") bytes, $header_cols cols)..."
  psql -v ON_ERROR_STOP=1 -c "TRUNCATE TABLE firms.staging_ong;" || return 1

  # cui, caen, caeno, i1..i46 for COPY, and 'iN', NULLIF(iN, '') pairs for the
  # JSONB `indicators` column.
  copy_cols="cui, caen, caeno"
  ind_pairs=""
  for (( i = 1; i <= 46; i++ )); do
    copy_cols+=", i${i}"
    ind_pairs+="${ind_pairs:+, }'i${i}', NULLIF(i${i}, '')"
  done

  cleaned=""
  if [ "$header_cols" -eq 49 ]; then
    copy_from="$file"
  elif [ "$header_cols" -eq 51 ]; then
    cleaned="${file}.cleaned49"
    log "[$year/WEB_ONG] Preprocessing 51→49 cols (stripping DEN_CAEN/DEN_CAENO)..."
    if ! python3 - "$file" "$cleaned" <<'PYEOF'
import re
import sys

src, dst = sys.argv[1], sys.argv[2]
NUM_RE = re.compile(r'^-?\d+(\.\d+)?$|^$')
TEXT_COLS = ('DEN_CAEN', 'DEN_CAENO')
# surrogateescape lets bytes that are not valid UTF-8 pass through unchanged
# instead of aborting the run; the text columns are dropped anyway.
io_opts = dict(encoding='utf-8', errors='surrogateescape')
skipped = 0

with open(src, **io_opts) as fh, open(dst, 'w', **io_opts) as out:
    header = fh.readline().rstrip('\n').split(',')
    # Positions of every header column except the two text columns.
    keep = [i for i, h in enumerate(header) if h.upper() not in TEXT_COLS]
    out.write(','.join(header[i] for i in keep) + '\n')
    for line in fh:
        line = line.rstrip('\n')
        if not line:
            continue
        parts = line.split(',')
        n = len(parts)
        if n == len(header) and len(keep) == 49:
            # No embedded commas: the fields line up with the header, so the
            # text columns can be dropped by position (this also covers rows
            # whose text columns are empty or look numeric).
            new_parts = [parts[i] for i in keep]
        else:
            # Embedded commas in the text columns. Walk back from the end:
            # parts[-1] is i46, the nearest numeric-or-empty field before it
            # is i45, and the next one further back is i44.
            j = n - 2
            while j >= 0 and not NUM_RE.match(parts[j]):
                j -= 1
            i45_idx = j
            j -= 1
            while j >= 0 and not NUM_RE.match(parts[j]):
                j -= 1
            i44_idx = j
            if i44_idx < 0 or i45_idx < 0:
                skipped += 1
                continue
            new_parts = parts[:i44_idx + 1] + [parts[i45_idx], parts[n - 1]]
        if len(new_parts) != 49:
            # Row does not fit the expected 49-column output.
            skipped += 1
            continue
        out.write(','.join(new_parts) + '\n')

if skipped:
    print('skipped %d malformed row(s)' % skipped, file=sys.stderr)
PYEOF
    then
      log "[$year/WEB_ONG] preprocessing failed"
      rm -f -- "$cleaned"
      return 1
    fi
    log "[$year/WEB_ONG] Cleaned $(wc -l < "$cleaned") lines (incl. header)"
    copy_from="$cleaned"
  else
    log "[$year/WEB_ONG] unexpected col count $header_cols, skipping"
    return 0
  fi

  psql -v ON_ERROR_STOP=1 <<COPYEOF
\\copy firms.staging_ong ($copy_cols) FROM '$copy_from' WITH (FORMAT csv, DELIMITER ',', HEADER true, NULL '');
COPYEOF
  rc=$?
  # Remove the intermediate file whether or not the COPY worked.
  if [ -n "$cleaned" ]; then
    rm -f -- "$cleaned"
  fi
  [ "$rc" -eq 0 ] || return 1

  log "[$year/WEB_ONG] UPSERT..."
  psql -v ON_ERROR_STOP=1 <<SQL || return 1
INSERT INTO firms.financials_ong (
  cui, year, caen, caeno,
  capitaluri_proprii, venituri_total, cheltuieli_total, excedent,
  personal_neeconomic, personal_economic, indicators
)
SELECT DISTINCT ON (cui)
  cui, $year, caen, caeno,
  NULLIF(i12, '')::numeric(20,2),
  NULLIF(i38, '')::numeric(20,2),
  NULLIF(i40, '')::numeric(20,2),
  NULLIF(i42, '')::numeric(20,2),
  CASE WHEN NULLIF(i45, '') ~ '^[0-9]+\$' AND NULLIF(i45, '')::bigint BETWEEN 0 AND 100000000 THEN i45::bigint ELSE NULL END,
  CASE WHEN NULLIF(i46, '') ~ '^[0-9]+\$' AND NULLIF(i46, '')::bigint BETWEEN 0 AND 100000000 THEN i46::bigint ELSE NULL END,
  jsonb_strip_nulls(jsonb_build_object($ind_pairs))
FROM firms.staging_ong
WHERE cui IS NOT NULL AND cui != '' AND cui != '0'
ORDER BY cui
ON CONFLICT (cui, year) DO UPDATE SET
  caen = EXCLUDED.caen,
  caeno = EXCLUDED.caeno,
  capitaluri_proprii = EXCLUDED.capitaluri_proprii,
  venituri_total = EXCLUDED.venituri_total,
  cheltuieli_total = EXCLUDED.cheltuieli_total,
  excedent = EXCLUDED.excedent,
  personal_neeconomic = EXCLUDED.personal_neeconomic,
  personal_economic = EXCLUDED.personal_economic,
  indicators = EXCLUDED.indicators,
  fetched_at = now();
SQL
}

# ─── WEB_INST_DE_CREDIT (banks) — pre-IFRS schemas vary by year ─────────
# 2015: not published. 2016/2017/2019: 23 cols (I1..I21). 2018: not published.
# 2020/2021/2022: 23 cols (I21). 2023: 24 cols (I22). 2024: 25 cols (I23).
#
# Import one year of bank financials into firms.financials_banks. The column
# count is read from the header line, a TEMP table of that width is created
# and filled inside a single psql session, and the typed columns are picked
# according to the indicator count. Every indicator is also stored in the
# JSONB `indicators` column.
#
# Globals:   DATA_DIR (read)
# Arguments: $1 - reporting year (four digits)
# Returns:   0 on success, when the dataset has no bank file, or when the
#            column count is not recognised; 1 on an invalid year or when the
#            download or the SQL session fails.
import_bank() {
  local year="$1"
  local file="$DATA_DIR/web_inst_de_credit_${year}.txt"
  local slug="situatii_financiare_${year}"
  local url header_cols ind_n i
  local cifra_col profit_inainte_col profit_exerc_col capital_col activ_col
  local cols_def cols_list ind_pairs

  # $year is interpolated into SQL below, so accept plain 4-digit years only.
  if [[ ! "$year" =~ ^[0-9]{4}$ ]]; then
    log "[$year/BANK] invalid year, skipping"
    return 1
  fi

  # Two years are looked up under a dataset slug that does not follow the
  # situatii_financiare_<year> naming.
  case "$year" in
    2020) slug="situatii_financiare_2021" ;;
    2023) slug="situatii_financiare2023" ;;
  esac

  if [ ! -s "$file" ]; then
    url=$(discover "$slug" "^web_(inst|instit)_de_credit.*${year}\\.txt$")
    if [ -z "$url" ]; then
      log "[$year/BANK] no file in dataset, skip"
      return 0
    fi
    fetch "$file" "$url" || return 1
  fi

  # Detect column count from header line.
  header_cols=$(head -1 "$file" | tr ',' '\n' | wc -l)
  log "[$year/BANK] $file ($(stat -c%s "$file") bytes, $header_cols cols)"

  # The "cifra_afaceri" mapping: in IFRS 2024 schema (25 cols) it's i23. In
  # older 23-col schema it's i21. In 24-col schema (2023) it's i22.
  ind_n=$(( header_cols - 2 )) # i1..iN
  capital_col=i14
  activ_col=i6
  case "$ind_n" in
    21) cifra_col=i21; profit_inainte_col=i17; profit_exerc_col=i20 ;;
    22) cifra_col=i22; profit_inainte_col=i18; profit_exerc_col=i21 ;;
    23) cifra_col=i23; profit_inainte_col=i19; profit_exerc_col=i22 ;;
    *)
      log "[$year/BANK] unexpected indicator count $ind_n, skipping"
      return 0
      ;;
  esac

  # Build the TEMP table definition, the \copy column list and the
  # 'iN', NULLIF(iN, '') pairs for jsonb_build_object. `i` is local so the
  # loop does not overwrite a caller's variable.
  cols_def="cui text, caen text"
  cols_list="cui, caen"
  ind_pairs=""
  for (( i = 1; i <= ind_n; i++ )); do
    cols_def+=", i${i} text"
    cols_list+=", i${i}"
    ind_pairs+="${ind_pairs:+, }'i${i}', NULLIF(i${i}, '')"
  done

  # One session for CREATE, COPY and INSERT: the TEMP table disappears when
  # psql exits.
  psql -v ON_ERROR_STOP=1 <<COPYEOF || return 1
CREATE TEMP TABLE tmp_bank (
  $cols_def
);
\\copy tmp_bank ($cols_list) FROM '$file' WITH (FORMAT csv, DELIMITER ',', HEADER true, NULL '');
INSERT INTO firms.financials_banks (
  cui, year, caen,
  active_financiare_amortiz, capital_social, profit_exercitiu,
  profit_inainte_impozit, cifra_afaceri, indicators, source
)
SELECT DISTINCT ON (cui)
  cui, $year, caen,
  NULLIF($activ_col, '')::numeric(20,2),
  NULLIF($capital_col, '')::numeric(20,2),
  NULLIF($profit_exerc_col, '')::numeric(20,2),
  NULLIF($profit_inainte_col, '')::numeric(20,2),
  NULLIF($cifra_col, '')::numeric(20,2),
  jsonb_strip_nulls(jsonb_build_object($ind_pairs)),
  'mfinante:WEB_Inst_de_credit'
FROM tmp_bank
WHERE cui IS NOT NULL AND cui != '' AND cui != '0'
ORDER BY cui
ON CONFLICT (cui, year) DO UPDATE SET
  caen = EXCLUDED.caen,
  active_financiare_amortiz = EXCLUDED.active_financiare_amortiz,
  capital_social = EXCLUDED.capital_social,
  profit_exercitiu = EXCLUDED.profit_exercitiu,
  profit_inainte_impozit = EXCLUDED.profit_inainte_impozit,
  cifra_afaceri = EXCLUDED.cifra_afaceri,
  indicators = EXCLUDED.indicators,
  source = EXCLUDED.source,
  fetched_at = now();
COPYEOF
}

# CATEGORIES env var filters which sub-imports run. Default = all.
# Useful: CATEGORIES="bank" to skip companies and only redo banks.
CATEGORIES="${CATEGORIES:-uu bl ong bank}"

# Run every requested category for every requested year. A failing import is
# logged and counted, and the loop carries on with the next one.
failures=0
# YEARS and CATEGORIES are space-separated lists; word splitting is intended.
# shellcheck disable=SC2086
for YEAR in $YEARS; do
  log "── Year $YEAR ──────────────────────────────"
  for CAT in $CATEGORIES; do
    case "$CAT" in
      uu)
        import_uu "$YEAR" || { log "[$YEAR/WEB_UU] failed"; failures=$((failures + 1)); }
        ;;
      bl)
        import_bl "$YEAR" || { log "[$YEAR/WEB_BL_BS_SL] failed"; failures=$((failures + 1)); }
        ;;
      ong)
        import_ong "$YEAR" || { log "[$YEAR/WEB_ONG] failed"; failures=$((failures + 1)); }
        ;;
      bank)
        import_bank "$YEAR" || { log "[$YEAR/BANK] failed"; failures=$((failures + 1)); }
        ;;
      *)
        # A typo in CATEGORIES used to be skipped without any trace.
        log "[WARN] unknown category '$CAT' ignored (expected: uu bl ong bank)"
        ;;
    esac
  done
done
if [ "$failures" -gt 0 ]; then
  log "=== $failures import step(s) failed — see messages above ==="
fi

log "=== Refreshing latest-year MV ==="
# Best-effort: a failed refresh is reported but does not stop the run.
psql -v ON_ERROR_STOP=1 -c "REFRESH MATERIALIZED VIEW firms.mv_financials_latest;" \
  || log "[WARN] refresh of firms.mv_financials_latest failed"

log "=== Final coverage ==="
# Row counts per table and year, shown on the terminal and appended to $LOG.
psql -c "
SELECT 'fin' AS tbl, year, COUNT(*) AS n FROM firms.financials GROUP BY year
UNION ALL
SELECT 'ong' AS tbl, year, COUNT(*) AS n FROM firms.financials_ong GROUP BY year
UNION ALL
SELECT 'bank' AS tbl, year, COUNT(*) AS n FROM firms.financials_banks GROUP BY year
ORDER BY tbl, year;
" 2>&1 | tee -a "$LOG"

# NOTE(review): the exit status is still that of this final log call, so it
# does not reflect $failures — change that only if the cron wrapper expects it.
log "=== Historical import done ==="