initial: split from gov-agreg — vreau.digital standalone platform

Moved from gov-agreg/src/pages/achizitii/* to root (drop prefix).
- 22 pages migrated, 127 files total
- All internal links: /achizitii/X → /X (176 occurrences fixed)
- AchizitiiLayout subnav rewritten: /X paths, top-right link to vreaudigital.ro hub
- BaseLayout new (vreau.digital branding, OG tags, site URL)
- astro.config.mjs: site https://vreau.digital, server output (was static)
- docker-compose: port 5096 (vreaudigital is 5095), container vreau-digital
- deploy.sh: paths /opt/vreau-digital, log /var/log/vreau-digital-deploy.log

Backend shared with gov-agreg:
- PostgreSQL satra (same schemas: seap, firms, anaf, anre, ...)
- Photon, Martin tiles
- Infisical /vreaudigital path (DATABASE_URL etc. shared)

build: PASS (npx astro check 0 errors, npm run build 5s vite + 10s server)
This commit is contained in:
Claude VM
2026-05-13 00:10:32 +03:00
commit a6c03a091e
352 changed files with 75295 additions and 0 deletions
@@ -0,0 +1,103 @@
#!/usr/bin/env python3
"""
AFIR FEGA CSV importer — produces pipe-TSV ingestible by the same SQL
loader as the FEADR XLSX path. Schema is identical to FEADR (15 columns):
beneficiar, last_name, mama_cui, localitate, cod_masura, obiectiv,
data_start, data_end, fega_op, fega_total, feadr_op, feadr_total,
op_amount, cofinantare, ue_total.
FEGA CSV from AFIR portal uses:
- comma-separated columns (English decimal, e.g. "4802.43")
- CSV header row: DenumireBeneficiar,NumeFamilie,Cui,Localicate,Masura,
ObiectivSpecific,DataIncepere,DataSfarsit,CuantumOperationeFEGA,
CuantumTotalFega,CuantumOperatiuneFEADR,CuantumtotalFEADR,
CuantumAferentOperatiune,CuantumTotalCofinantareBeneficiar,
CuantumtotalUEBenefeciar
"""
from __future__ import annotations
import csv
import re
import sys
def norm_num(v):
    """Normalize one FEGA amount cell to a dot-decimal string.

    Args:
        v: Raw cell value (any type; converted with ``str``), or ``None``.

    Returns:
        ``""`` for ``None``/blank input, otherwise the value with ``.`` as the
        decimal mark and any embedded ``|`` replaced by ``/`` so it cannot
        collide with the output delimiter.
    """
    if v is None:
        return ""
    s = str(v).strip()
    if not s:
        return ""
    last_comma = s.rfind(",")
    last_dot = s.rfind(".")
    if last_comma != -1 and last_dot != -1:
        # Both separators present: whichever comes LAST is the decimal mark.
        # Blindly assuming Romanian format would turn English "1,234.56"
        # into "1.23456".
        if last_comma > last_dot:
            # Romanian style: "1.234,56" -> "1234.56"
            s = s.replace(".", "").replace(",", ".")
        else:
            # English style with thousands commas: "1,234.56" -> "1234.56"
            s = s.replace(",", "")
    elif last_comma != -1:
        # Comma only: treat it as a Romanian decimal comma ("12,5" -> "12.5").
        s = s.replace(",", ".")
    return s.replace("|", "/")
def norm_text(v):
    """Return *v* as a single-line string safe for the pipe-delimited output.

    ``None`` and blank values become ``""``. Otherwise pipes turn into ``/``,
    every run of whitespace (tabs and line breaks included) collapses to one
    space, and backslashes are doubled.
    """
    text = "" if v is None else str(v).strip()
    if not text:
        return ""
    # One pass for the characters that would clash with the row/column layout.
    swaps = str.maketrans({"|": "/", "\t": " ", "\r": " ", "\n": " "})
    text = re.sub(r"\s+", " ", text.translate(swaps))
    return text.replace("\\", "\\\\")
def main():
    """Convert the FEGA CSV named in argv[1] into the pipe file named in argv[2].

    Exits with status 2 when the argument count is wrong. Empty CSV rows and
    rows whose first cell starts with "DenumireBeneficiar" (header rows) are
    ignored; rows with an empty beneficiary are counted as skipped. Progress
    and the final tally go to stderr.
    """
    if len(sys.argv) != 3:
        print("usage: import-afir-fega-csv.py INPUT.csv OUTPUT.tsv", file=sys.stderr)
        sys.exit(2)
    _, src_path, dst_path = sys.argv
    written = 0
    dropped = 0
    with open(src_path, "r", encoding="utf-8-sig", errors="replace", newline="") as src, \
            open(dst_path, "w", encoding="utf-8") as dst:
        for record in csv.reader(src):
            if not record:
                continue
            # Header rows may appear more than once; drop each one.
            if record[0].strip().lower().startswith("denumirebeneficiar"):
                continue
            # Short rows are right-padded so all 15 positions exist.
            padded = record + [""] * (15 - len(record))
            name = norm_text(padded[0])
            if not name:
                dropped += 1
                continue
            # Columns 1-7 are text (last_name .. data_end),
            # columns 8-14 are amounts (fega_op .. ue_total).
            text_cols = [norm_text(cell) for cell in padded[1:8]]
            amount_cols = [norm_num(cell) for cell in padded[8:15]]
            dst.write("|".join([name, *text_cols, *amount_cols]) + "\n")
            written += 1
            if written % 100000 == 0:
                print(f"[afir-fega-import] wrote {written} rows", file=sys.stderr)
    print(f"[afir-fega-import] done: {written} rows · {dropped} skipped", file=sys.stderr)


if __name__ == "__main__":
    main()
+139
View File
@@ -0,0 +1,139 @@
#!/usr/bin/env python3
"""AFIR XLSX → pipe-delimited TSV normalizer.
Source: AFIR yearly listaplati XLSX (FEADR or FEGA), as published at
https://www.afir.ro/rapoarte/beneficiari-de-fonduri-europene/date-deschise/
The XLSX has 9 banner rows, then a 15-column header at row 10 (1-indexed),
then ~470K-560K data rows. Schema (since 2023, identical for 2024):
Numele beneficiarului
Numele de familie al beneficiarului
Denumirea societatii-mama si codul de inregistrare fiscala
Localitate
Codul masurii/tipului de interventie
Obiectiv
Data inceperii
Data incheierii
Cuantum Operatiune FEGA
Cuantum Total FEGA
Cuantum Operatiune FEADR
Cuantum Total FEADR
Cuantum aferent operatiunii
Cuantum total cofinantare beneficiari
Cuantum total UE Beneficiar
Output: pipe-delimited TSV (no quoting), in the same column order, suitable
for `\\copy fonduri.staging_afir FROM ... WITH (FORMAT text, DELIMITER '|')`.
Usage:
python3 import-afir-historical.py INPUT.xlsx OUTPUT.tsv
Numeric columns are normalized: Romanian decimal "12.345,67" → "12345.67".
Empty strings stay empty (NULL in COPY with NULL '').
"""
import sys
import re
import openpyxl
# Text looked for (as a substring) in the first cell of each leading row;
# main() treats the first row that contains it as the header row.
EXPECTED_HEADER = "Numele beneficiarului"
def norm_num(v):
    """Render an AFIR amount cell as a dot-decimal string.

    ``None`` and blank strings give ``""``. Native ints/floats are formatted
    with two decimals (a rounded negative zero is shown as ``0.00``). Strings
    containing a comma are read as Romanian format: dots are dropped and the
    comma becomes the decimal point. Embedded pipes become ``/``.
    """
    if v is None:
        return ""
    if isinstance(v, (int, float)):
        fixed = format(v, ".2f")
        return fixed.replace("-0.00", "0.00")
    raw = str(v).strip()
    if raw and "," in raw:
        # "12.345,67" / "12345,67" / "0,00" -> "12345.67" / "12345.67" / "0.00"
        raw = raw.replace(".", "").replace(",", ".")
    return raw.replace("|", "/")
def norm_text(v):
    """Flatten a cell value into one line usable in the pipe-delimited file.

    Returns ``""`` for ``None`` or blank input. Pipes (the output delimiter)
    become ``/``; tabs and line breaks become spaces; whitespace runs collapse
    to a single space; backslashes are doubled for the COPY text format.
    """
    if v is None:
        return ""
    cleaned = str(v).strip()
    if cleaned:
        for bad, good in (("|", "/"), ("\t", " "), ("\r", " "), ("\n", " ")):
            cleaned = cleaned.replace(bad, good)
        cleaned = re.sub(r"\s+", " ", cleaned)
        # Escape last so the backslashes added here are not touched again.
        cleaned = cleaned.replace("\\", "\\\\")
    return cleaned
def main():
    """Convert the AFIR XLSX named in argv[1] into the pipe file named in argv[2].

    Scans the leading rows of the active sheet for one whose first cell
    contains EXPECTED_HEADER, then writes every following row with a non-empty
    first cell as 15 pipe-separated fields (8 text, 7 numeric).

    Exits with status 2 on wrong argument count and 1 when no header row is
    found. Progress and the final tally go to stderr.
    """
    if len(sys.argv) != 3:
        print("usage: import-afir-historical.py INPUT.xlsx OUTPUT.tsv", file=sys.stderr)
        sys.exit(2)
    in_path, out_path = sys.argv[1], sys.argv[2]
    wb = openpyxl.load_workbook(in_path, read_only=True, data_only=True)
    try:
        ws = wb.active
        # Single shared iterator: the header scan consumes the banner rows and
        # the data loop below continues from the row after the header.
        rows = ws.iter_rows(values_only=True)
        header_idx = None
        for i, r in enumerate(rows):
            if r and r[0] and EXPECTED_HEADER in str(r[0]):
                header_idx = i
                break
            if i > 50:
                break
        if header_idx is None:
            print("[afir-import] ERROR: header row not found in first 50 rows", file=sys.stderr)
            sys.exit(1)
        n_data = 0
        n_skipped = 0
        with open(out_path, "w", encoding="utf-8") as f:
            for r in rows:
                # 16 columns observed (last is None padding)
                if r is None:
                    continue
                cells = list(r) + [None] * (16 - len(r))
                beneficiar = norm_text(cells[0])
                if not beneficiar:
                    # Trailing empty rows
                    n_skipped += 1
                    continue
                out = [
                    beneficiar,
                    norm_text(cells[1]),   # last_name
                    norm_text(cells[2]),   # mama_cui
                    norm_text(cells[3]),   # localitate
                    norm_text(cells[4]),   # cod_masura
                    norm_text(cells[5]),   # obiectiv
                    norm_text(cells[6]),   # data_start
                    norm_text(cells[7]),   # data_end
                    norm_num(cells[8]),    # fega_op
                    norm_num(cells[9]),    # fega_total
                    norm_num(cells[10]),   # feadr_op
                    norm_num(cells[11]),   # feadr_total
                    norm_num(cells[12]),   # op_amount
                    norm_num(cells[13]),   # cofinantare
                    norm_num(cells[14]),   # ue_total
                ]
                f.write("|".join(out) + "\n")
                n_data += 1
                if n_data % 50000 == 0:
                    print(f"[afir-import] wrote {n_data} rows", file=sys.stderr)
    finally:
        # Read-only workbooks keep the underlying file open until closed
        # explicitly; release it on every exit path, including sys.exit().
        wb.close()
    print(f"[afir-import] done — {n_data} rows, {n_skipped} skipped", file=sys.stderr)


if __name__ == "__main__":
    main()
+167
View File
@@ -0,0 +1,167 @@
#!/usr/bin/env python3
"""APIA "Lista fermieri" XLSX → pipe-delimited TSV normalizer.
Source: data.gov.ro CKAN package "lista-fermierilor-campania-apia-2024".
Currently a single resource (comuna Găgești, Vaslui, ~192 farmers), but the
package is supposed to grow as more UATs publish their lists. The XLSX
schema is set by APIA and identical across UATs:
Row 0 (header): NR.CRT | NUME PRENUME | RESPONSABIL UAT 2024
| COMUNA/ORAS | SAT | DATE CONTACT | CENTRUL APIA
| SUPRAFATA 2023 | (~17 None columns)
Rows 1..N (data): one row per farmer, NR.CRT 1-indexed.
Output: pipe-delimited TSV (no quoting), columns in this order:
campaign_year | name | comuna_oras | sat | centru_apia
| responsabil_uat | suprafata_ha
| source_dataset_id | source_resource_id | source_url
Empty strings stay empty (NULL in COPY with NULL '').
Usage:
python3 import-apia-fermieri.py INPUT.xlsx OUTPUT.tsv \\
CAMPAIGN_YEAR DATASET_ID RESOURCE_ID SOURCE_URL
"""
import re
import sys
import openpyxl
# Substring that main() looks for in the upper-cased first cell to recognise
# the header row.
EXPECTED_HEADER_COL0 = "NR.CRT"
# NOTE(review): not referenced by any code visible in this file chunk; main()
# matches the literal "NUME" directly — confirm whether this is still needed.
EXPECTED_HEADER_COL1 = "NUME"  # "NUME PRENUME" or "NUME SI PRENUME"
def norm_text(v):
    """Turn a cell value into a one-line, delimiter-safe string.

    ``None``/blank -> ``""``. Pipes are replaced with ``/`` (pipe is the output
    delimiter), tabs and newlines with spaces, whitespace runs are squeezed to
    one space and backslashes are doubled.
    """
    value = str(v).strip() if v is not None else ""
    if value == "":
        return value
    value = (
        value.replace("|", "/")
        .replace("\t", " ")
        .replace("\r", " ")
        .replace("\n", " ")
    )
    squeezed = re.sub(r"\s+", " ", value)
    return squeezed.replace("\\", "\\\\")
def norm_num(v):
    """Normalize an APIA numeric cell (e.g. SUPRAFATA) to a dot-decimal string.

    Args:
        v: Cell value: ``None``, a native int/float, or anything convertible
            with ``str``.

    Returns:
        ``""`` for ``None``/blank. Native numbers are rendered with at most
        four decimals and trailing zeros removed (``12.0`` -> ``"12"``);
        values that round to zero are always ``"0"``, never ``"-0"``. Strings
        containing a comma are read as Romanian format (dots dropped, comma
        becomes the decimal point). Embedded pipes become ``/``.
    """
    if v is None:
        return ""
    if isinstance(v, (int, float)):
        s = f"{v:.4f}".rstrip("0").rstrip(".")
        # A tiny negative float formats as "-0.0000" and trims to "-0";
        # emit a plain zero instead.
        return "0" if s in ("", "-0") else s
    s = str(v).strip()
    if not s:
        return ""
    if "," in s:
        s = s.replace(".", "").replace(",", ".")
    return s.replace("|", "/")
def main():
    """Convert an APIA "Lista fermieri" XLSX into the pipe-delimited output file.

    argv: INPUT.xlsx OUTPUT.tsv CAMPAIGN_YEAR DATASET_ID RESOURCE_ID SOURCE_URL.

    The header row is located by EXPECTED_HEADER_COL0 in its first cell and a
    column map is built from the header texts, so the column order in the
    sheet may vary. Each following row with a non-empty name is written as 10
    fields; the four provenance values from argv are repeated on every row.

    Exits with status 2 on wrong argument count and 1 when no usable header
    row (one that includes a name column) is found.
    """
    if len(sys.argv) != 7:
        print(
            "usage: import-apia-fermieri.py INPUT.xlsx OUTPUT.tsv "
            "CAMPAIGN_YEAR DATASET_ID RESOURCE_ID SOURCE_URL",
            file=sys.stderr,
        )
        sys.exit(2)
    in_path = sys.argv[1]
    out_path = sys.argv[2]
    campaign_year = sys.argv[3]
    dataset_id = sys.argv[4]
    resource_id = sys.argv[5]
    source_url = sys.argv[6]
    wb = openpyxl.load_workbook(in_path, read_only=True, data_only=True)
    try:
        ws = wb.active
        # Shared iterator: the data loop resumes right after the header row.
        rows = ws.iter_rows(values_only=True)
        header_idx = None
        col_map = None
        for i, r in enumerate(rows):
            if not r:
                continue
            if r[0] and EXPECTED_HEADER_COL0 in str(r[0]).upper():
                # Build column index map from header for resilience.
                header = [str(c).strip().upper() if c is not None else "" for c in r]
                col_map = {}
                for idx, h in enumerate(header):
                    if "NR.CRT" in h or "NRCRT" in h:
                        col_map["nr"] = idx
                    elif "NUME" in h:  # "NUME PRENUME" / "NUME SI PRENUME"
                        # First matching column wins.
                        col_map.setdefault("name", idx)
                    elif "RESPONSABIL" in h:
                        col_map["responsabil"] = idx
                    elif "COMUNA" in h or "ORAS" in h:
                        col_map["comuna"] = idx
                    elif h == "SAT" or h.startswith("SAT "):
                        col_map["sat"] = idx
                    elif "CENTRUL" in h or "CENTRU" in h:
                        col_map["centru"] = idx
                    elif "SUPRAFATA" in h or "SUPRAFAȚA" in h:
                        col_map["suprafata"] = idx
                header_idx = i
                break
            if i > 50:
                break
        if header_idx is None or not col_map or "name" not in col_map:
            print(
                "[apia-import] ERROR: header row not found in first 50 rows",
                file=sys.stderr,
            )
            sys.exit(1)
        print(f"[apia-import] header at row {header_idx}, col_map={col_map}", file=sys.stderr)
        # Highest mapped column index; computed once, it is the same for every row.
        max_idx = max(col_map.values())
        n_data = 0
        n_skipped = 0
        with open(out_path, "w", encoding="utf-8") as f:
            for r in rows:
                if r is None:
                    continue
                cells = list(r)
                # Pad short rows so every mapped index is addressable.
                if len(cells) <= max_idx:
                    cells.extend([None] * (max_idx + 1 - len(cells)))
                name = norm_text(cells[col_map["name"]])
                if not name:
                    n_skipped += 1
                    continue
                comuna = norm_text(cells[col_map["comuna"]]) if "comuna" in col_map else ""
                sat = norm_text(cells[col_map["sat"]]) if "sat" in col_map else ""
                centru = norm_text(cells[col_map["centru"]]) if "centru" in col_map else ""
                responsabil = norm_text(cells[col_map["responsabil"]]) if "responsabil" in col_map else ""
                suprafata = norm_num(cells[col_map["suprafata"]]) if "suprafata" in col_map else ""
                out = [
                    campaign_year,
                    name,
                    comuna,
                    sat,
                    centru,
                    responsabil,
                    suprafata,
                    dataset_id,
                    resource_id,
                    source_url,
                ]
                f.write("|".join(out) + "\n")
                n_data += 1
    finally:
        # Read-only workbooks hold the file open until closed explicitly;
        # release it on every exit path, including sys.exit().
        wb.close()
    print(f"[apia-import] done — {n_data} rows, {n_skipped} skipped", file=sys.stderr)


if __name__ == "__main__":
    main()
@@ -0,0 +1,483 @@
#!/usr/bin/env python3
"""
SEAP historical CSV importer for data.gov.ro yearly dumps.
Reads a SEAP CSV (any year/quarter/type) and emits a clean TSV that
PostgreSQL COPY can ingest into seap.announcements. Handles:
- BOM stripping
- Romanian decimal commas → dots
- "MM/DD/YYYY HH:MM:SS" date parsing (with second column variants)
- Column dedupe by (type, ref_number) — first-row-wins for multi-lot CANs
- CUI normalization (strip "RO " prefix)
Usage:
python3 import-seap-historical.py CSV_PATH OUTPUT_TSV TYPE SOURCE
TYPE: 'contract' | 'da' | 'initiere' | 'atribuire_fara' | 'modificare'
SOURCE: e.g. 'datagov_2024_t1_contracte'
The output TSV columns are FIXED (15 columns matching the import SQL):
type, ref_number, authority_name, authority_cui, cpv_code, cpv_name,
contract_type, publication_date, contract_date, awarded_value,
supplier_name, supplier_cui, procedure_type, legislation, source
Column mapping is inferred from CSV headers (case+diacritic-insensitive).
Falls back gracefully when columns are missing (older years had fewer cols).
"""
from __future__ import annotations
import csv
import re
import sys
import unicodedata
from datetime import datetime
from pathlib import Path
def normalize_header(s: str) -> str:
    """Canonicalize a CSV header cell for lookup in HEADER_MAP.

    Removes any byte-order mark, trims, lowercases, strips diacritics
    (combining marks after NFD decomposition), collapses whitespace runs to a
    single space and drops ``?`` characters.

    Args:
        s: Raw header text.

    Returns:
        The normalized header string.
    """
    # Spell the BOM as an escape: a literal U+FEFF is invisible in source and
    # easily lost, which silently turns this replace into a no-op.
    s = s.replace("\ufeff", "").strip().lower()
    s = "".join(c for c in unicodedata.normalize("NFD", s) if unicodedata.category(c) != "Mn")
    s = re.sub(r"\s+", " ", s)
    s = s.replace("?", "")
    return s.strip()
def detect_dialect(first_line: str) -> tuple[str, str | None]:
    """Detect delimiter and quote char from the first line of the file.

    SEAP historical CSVs vary wildly:
      - 2017/2018: ^ delim, no quote
      - 2022: , delim, | quote (header looks like |FIELD|,|FIELD|)
      - 2023 T3: | delim, " quote (header: FIELD|FIELD with row "txt"|"txt")
      - 2023 T4: , delim, " quote (standard CSV with title-case headers)
      - 2024+: , delim, " quote (standard CSV)

    Args:
        first_line: First physical line of the file.

    Returns:
        ``(delimiter, quotechar)``; ``quotechar`` is ``None`` when the file
        uses no quoting. Falls back to ``(",", '"')`` when no candidate
        delimiter occurs at all.
    """
    s = first_line
    # Strip a leading BOM only when one is really there. The escape is
    # explicit because a literal U+FEFF is invisible; if it is lost the test
    # becomes startswith("") — always true — and eats the first real character.
    if s.startswith("\ufeff"):
        s = s[1:]
    s_strip = s.lstrip()
    # 2022 wire format: header LINE starts with `|` and uses `|FIELD|,|FIELD|`
    # → delim=',' quote='|'
    if s_strip.startswith("|") and "|," in s_strip:
        return (",", "|")
    counts = {c: s.count(c) for c in [",", "|", "^", ";", "\t"]}
    # Pick the most frequent candidate (ties go to the earliest listed).
    delim = max(counts, key=counts.get)
    if counts[delim] == 0:
        delim = ","
    if delim == "^":
        # Legacy ^-delimited dumps carry no quoting at all.
        return ("^", None)
    # Every other candidate uses standard double-quote quoting. This includes
    # tab: it is counted as a candidate, so honour it when it wins instead of
    # silently parsing the file as comma-separated.
    return (delim, '"')
# Maps normalized header → output column name.
# Keys must already be in normalize_header() form (lowercase, no diacritics),
# because lookups are done with the normalized header text.
# Multiple headers may map to the same output (e.g. two "data publicare" cols).
# Schema variants seen across data.gov.ro yearly dumps:
#   - 2024 (CSV, comma): "Autoritate contractanta", "Numar anunt", "Cod CPV"
#   - 2022/2023 (CSV/pipe, |QUOTE|): "DENUMIRE_AC", "NUMAR_ANUNT_ATRIBUIRE", "COD_CPV"
#   - 2017/2018 (^-delim): "AutoritateContractanta", "NumarAnuntAtribuire", "CPVCode"
# Each key is listed once; a key shared by several schemas lives in the first
# section where it occurs.
HEADER_MAP = {
    # 2024 standard CSV
    "autoritate contractanta": "authority_name",
    "cui": "authority_cui",
    "cui autoritate contractanta": "authority_cui",
    "cod cpv": "cpv_code",
    "denumire cpv": "cpv_name",
    "tip contract": "contract_type",
    "tip procedura": "procedure_type",
    "tip legislatie": "legislation",
    "tip incheiere contract": "award_type",
    "tip inchiere contract": "award_type",  # typo seen in 2023 T1 XLS
    "tip criteriu de atribuire": "criterion",
    "numar anunt atribuire": "ref_number",
    "numar anunt initiere": "ref_initiere",
    "numar anunt": "ref_number",
    "numar contract": "contract_number",
    "numar lot": "lot_number",
    "data contract": "contract_date",
    "data publicare": "publication_date",
    "data publicare anunt atribuire": "publication_date",  # 2023 T4 standard CSV
    "data anunt atribuire": "publication_date",  # 2023 T1 XLS, 2017 ^-delim
    "data anunt initiere": "ref_initiere_date",
    "data publicare anunt initiere": "ref_initiere_date",
    "data publicare anunt": "publication_date",  # 2023 T4 atribuire-fara
    "valoare atribuita (ron)": "awarded_value",
    "valoare estimata procedura": "estimated_value",
    "moneda valoare estimata procedura": "estimated_currency",
    "denumire procedura": "procedure_name",
    "tip activitate autoritate": "authority_activity",
    "criteriu de atribuire": "criterion",
    "denumire contract": "contract_title",
    "oras ofertant castigator": "supplier_city",
    "tara ofertant castigator": "supplier_country",
    "data publicare contract": "contract_date",
    "tip activitate": "authority_activity",
    "tip autoritate": "authority_type",
    "tip anunt": "announcement_type",
    "criterii de atribuire": "criterion",
    "licitatie electronica": "electronic_auction",
    "ofertant castigator": "supplier_name",
    "cui ofertant castigator": "supplier_cui",
    "oras ofertant": "supplier_city",
    "tara ofertant": "supplier_country",
    "incheiat prin": "award_type",
    "valoare contract (ron)": "awarded_value",
    "valoare contract": "awarded_value",
    "valoare estimata (ron)": "estimated_value",
    "valoare estimata": "estimated_value",
    "ofertant": "supplier_name",  # also used by the 2022/2023 schema
    "cui ofertant": "supplier_cui",
    "cui castigator": "supplier_cui",
    "castigator": "supplier_name",  # also used by the 2017-2019 schemas
    "oras": "supplier_city",
    "tara": "supplier_country",
    "modalitate de desfasurare": "modality",
    # 2022/2023 UPPER_SNAKE_CASE pipe-delim schema
    "denumire_ac": "authority_name",
    "cui_ac": "authority_cui",
    "cui_autoritate": "authority_cui",
    "autoritate_contractanta": "authority_name",
    "numar_anunt_atribuire": "ref_number",
    "numar_anunt": "ref_number",
    "data_anunt_atribuire": "publication_date",
    "data_publicare": "publication_date",
    "data_publicare_ai": "ref_initiere_date",
    "data_contract": "contract_date",
    "numar_contract": "contract_number",
    "denumire_contract": "contract_title",
    "cod_cpv": "cpv_code",
    "cod_cpv_procedura": "cpv_code",
    "cpv_code": "cpv_code",  # 2023 schema variant; also 2018-2019 XLS
    "denumire_cpv": "cpv_name",
    "denumire_cpv_procedura": "cpv_name",
    "tip_contract": "contract_type",
    "tip_procedura": "procedure_type",
    "tip_legislatie": "legislation",
    "tip_lesiglatie": "legislation",  # SEAP typo present in many 2023 files
    "tip_anunt": "announcement_type",
    "tip_incheiere_contract": "award_type",  # also 2018-2019 XLS
    "incheiat_prin": "award_type",
    "valoare_contract_ron": "awarded_value",
    "valoare_atribuita": "awarded_value",
    "valoare_estimata_procedura": "estimated_value",
    "cui_of": "supplier_cui",
    "nume_castigator": "supplier_name",
    "cui_castigator": "supplier_cui",
    "oras_castigator": "supplier_city",
    "tara_castigator": "supplier_country",
    "modalitate_desfasurare": "modality",
    "modalitate_atribuire": "modality",
    "tip_criterii_atribuire": "criterion",  # also 2018-2019 XLS
    "criteriu_de_atribuire": "criterion",
    "numar_anunt_ai": "ref_initiere",
    "numar_anunt_initiere": "ref_initiere",
    "data_anunt_initiere": "ref_initiere_date",
    "denumire_procedura": "procedure_name",
    # 2017/2018 ^-delim CamelCase legacy schema
    "castigatorcui": "supplier_cui",
    "castigatortara": "supplier_country",
    "castigatorlocalitate": "supplier_city",
    "castigatoradresa": "supplier_address",
    "tipcontract": "contract_type",
    "tipprocedura": "procedure_type",
    "autoritatecontractanta": "authority_name",
    "autoritatecontractantacui": "authority_cui",
    "tipac": "authority_type",
    "tipactivitateac": "authority_activity",
    "denumireac": "authority_name",
    "numaranuntatribuire": "ref_number",
    "numaranuntparticipare": "ref_initiere",
    "numaranunt": "ref_number",
    "dataanuntatribuire": "publication_date",
    "dataanuntparticipare": "ref_initiere_date",
    "datapublicare": "publication_date",
    "tipincheierecontract": "award_type",
    "tipcriteriiatribuire": "criterion",
    "culicitatieelectronica": "electronic_auction",
    "numarofertepre primite": "n_offers",
    # Lowercase on purpose: headers are lowercased before lookup, so a
    # mixed-case key ("numarofertePrimite") could never match.
    "numaroferteprimite": "n_offers",
    "subcontractat": "subcontracted",
    "numarcontract": "contract_number",
    "datacontract": "contract_date",
    "titlucontract": "contract_title",
    "valoare": "awarded_value_orig",  # may be in non-RON currency for 2017
    "moneda": "currency",
    "valoareron": "awarded_value",
    "valoareeur": "awarded_value_eur",
    "cpvcodeid": "cpv_code_id",  # internal SEAP id, not CPV
    "cpvcode": "cpv_code",  # actual CPV like 85150000-5
    "valoareestimataparticipare": "estimated_value",
    "monedavaloareestimataparticipare": "estimated_currency",
    "fonduricomunitare": "eu_funded",
    "tipfinantare": "funding_type",
    "tiplegislatieid": "legislation",
    "fondeuropean": "eu_fund",
    "contractperiodic": "periodic",
    "depozitegarantii": "deposits",
    "modalitatifinantare": "funding_modes",
    "tip": "announcement_subtype",  # 2017 contracte has bare "Tip"
    # 2018-2019 XLS schema (UPPER_SNAKE with explicit underscores)
    "castigator_cui": "supplier_cui",
    "castigator_tara": "supplier_country",
    "castigator_localitate": "supplier_city",
    "castigaor_localitate": "supplier_city",  # SEAP typo seen in 2018 T2 XLS
    "castigator_adresa": "supplier_address",
    "tip_ac": "authority_type",
    "tip_activitate_ac": "authority_activity",
    "autoritate_contractanta_cui": "authority_cui",
    "numar_anunt_participare": "ref_initiere",
    "data_anunt_participare": "ref_initiere_date",
    "cu_licitatie_electronica": "electronic_auction",
    "numar_oferte_primite": "n_offers",
    "titlu_contract": "contract_title",
    "valoare_ron": "awarded_value",
    "valoare_eur": "awarded_value_eur",
    "valoare_estimata_participare": "estimated_value",
    "moneda_valoare_estimata_participare": "estimated_currency",
    "fonduri_comunitare": "eu_funded",
    "tip_finantare": "funding_type",
    "tip_legislatie_id": "legislation",
    "fond_european": "eu_fund",
    "contract_periodic": "periodic",
    "depozite_garantii": "deposits",
    "modalitati_finantare": "funding_modes",
    "cpv_code_id": "cpv_code_id",
}
def parse_date(s: str | None) -> str | None:
    """Parse a SEAP date cell into ISO ``YYYY-MM-DD``.

    Accepted layouts, matched at the start of the string (any trailing time
    part is ignored):
      - ``MM/DD/YYYY``
      - ``DD.MM.YYYY``
      - ``YYYY-MM-DD``

    Args:
        s: Raw cell text, or ``None``.

    Returns:
        The ISO date string, or ``None`` when the input is empty, matches no
        layout, or is not a real calendar date. All three layouts are
        validated, so an impossible ISO date such as ``2023-13-45`` is
        rejected rather than passed through.
    """
    if not s:
        return None
    s = s.strip()
    if not s:
        return None
    # (pattern, regex group numbers holding year / month / day)
    layouts = (
        (r"^(\d{1,2})/(\d{1,2})/(\d{4})", (3, 1, 2)),   # MM/DD/YYYY 01:35:39
        (r"^(\d{1,2})\.(\d{1,2})\.(\d{4})", (3, 2, 1)),  # DD.MM.YYYY
        (r"^(\d{4})-(\d{2})-(\d{2})", (1, 2, 3)),        # YYYY-MM-DD[...]
    )
    for pattern, (gy, gm, gd) in layouts:
        m = re.match(pattern, s)
        if not m:
            continue
        yy, mm, dd = int(m[gy]), int(m[gm]), int(m[gd])
        try:
            datetime(yy, mm, dd)  # validate the calendar date
        except ValueError:
            # The first layout that matches decides; no fallback to others.
            return None
        return f"{yy:04d}-{mm:02d}-{dd:02d}"
    return None
def _format_2dp(text: str) -> str | None:
    """Return *text* parsed as a float and formatted with two decimals, or None."""
    try:
        return f"{float(text):.2f}"
    except ValueError:
        return None


def parse_number(s: str | None) -> str | None:
    """Parse a SEAP amount into a dot-decimal string with two decimals.

    SEAP CSVs mix conventions:
      - "1.234.567,89" → period=thousand, comma=decimal → 1234567.89
      - "1,234,567.89" → comma=thousand, period=decimal → 1234567.89
      - "123,126"      → comma=THOUSAND (3 digits after) → 123126.00
      - "12345,67"     → comma=decimal (1-2 digits after) → 12345.67
      - "1,234,567"    → all commas=thousand             → 1234567.00
      - "1.234.567"    → all periods=thousand            → 1234567.00

    When both separators occur, the one that appears LAST is the decimal mark.
    For a single comma: exactly three digits after it → thousands separator,
    otherwise → decimal.

    Args:
        s: Raw cell text, or ``None``.

    Returns:
        The normalized number as a string, or ``None`` for empty input, a
        lone ``-``, or anything that cannot be parsed.
    """
    if not s:
        return None
    s = s.strip().strip('"').replace("\xa0", "").replace(" ", "")
    if not s or s == "-":
        return None
    last_comma = s.rfind(",")
    last_dot = s.rfind(".")
    # Both separators present: the last one is the decimal mark. Assuming
    # Romanian order unconditionally would turn "1,234.56" into 1.23.
    if last_comma != -1 and last_dot != -1:
        if last_comma > last_dot:
            s = s.replace(".", "").replace(",", ".")
        else:
            s = s.replace(",", "")
        return _format_2dp(s)
    # Multi-comma → all thousand separators
    if s.count(",") >= 2:
        try:
            return f"{int(s.replace(',', '')):d}.00"
        except ValueError:
            return None
    # Single comma → check digits after
    if last_comma != -1:
        whole, frac = s.split(",")
        if whole.isdigit() and frac.isdigit():
            try:
                if len(frac) == 3:
                    # Thousand separator (most common SEAP case)
                    return f"{int(whole + frac)}.00"
                # 1-2 (or 4+) digits after → decimal separator
                return f"{float(whole + '.' + frac):.2f}"
            except ValueError:
                return None
        # Anything else with a comma (e.g. signed values) is not parseable
        # by float() and ends up as None below.
    # Dotted thousands with no decimals ("1.234.567"): float() would reject it.
    if re.fullmatch(r"-?\d{1,3}(?:\.\d{3}){2,}", s):
        return f"{int(s.replace('.', '')):d}.00"
    return _format_2dp(s)
def normalize_cui(s: str | None) -> str | None:
    """Reduce a CUI cell to its bare digits.

    Surrounding whitespace and double quotes are trimmed and a leading
    ``RO`` prefix (any case, optionally followed by whitespace) is removed.
    Returns ``None`` when the input is empty or what remains is not purely
    numeric.
    """
    if not s:
        return None
    unquoted = s.strip().strip('"')
    bare = re.sub(r"^RO\s*", "", unquoted, flags=re.IGNORECASE).strip()
    return bare if bare.isdigit() else None
def main() -> None:
    """Convert one SEAP CSV dump into the fixed 15-column, tab-separated file.

    argv: CSV_PATH OUTPUT_TSV TYPE SOURCE.

    The delimiter and quote character are sniffed from the first line, header
    cells are mapped to output columns through HEADER_MAP, and rows are
    de-duplicated on (TYPE, ref_number) with the first occurrence winning.
    The output starts with a header line; missing values are written as the
    two-character sequence backslash + N.

    Exits with status 2 on wrong argument count and 1 when the input file is
    missing or contains no header row. A summary is printed to stdout.
    """
    if len(sys.argv) != 5:
        print(__doc__)
        sys.exit(2)
    csv_path = Path(sys.argv[1])
    out_path = Path(sys.argv[2])
    record_type = sys.argv[3]
    source = sys.argv[4]
    if not csv_path.exists():
        print(f"ERROR: {csv_path} does not exist", file=sys.stderr)
        sys.exit(1)
    seen: set[tuple[str, str]] = set()
    out_cols = [
        "type", "ref_number", "authority_name", "authority_cui",
        "cpv_code", "cpv_name", "contract_type", "publication_date",
        "contract_date", "awarded_value", "supplier_name", "supplier_cui",
        "procedure_type", "legislation", "source",
    ]
    written = 0
    skipped_dup = 0
    skipped_no_ref = 0
    total = 0
    # Sniff first line to detect delimiter/quotechar
    with csv_path.open("r", encoding="utf-8-sig", errors="replace") as f:
        first_line = f.readline()
    delim, quotechar = detect_dialect(first_line)
    print(f"[import] delim={delim!r} quote={quotechar!r}", file=sys.stderr)
    # newline="" lets the csv module do its own line-ending handling, as the
    # csv documentation requires for files passed to csv.reader.
    with csv_path.open("r", encoding="utf-8-sig", errors="replace", newline="") as f, \
            out_path.open("w", encoding="utf-8") as out:
        if quotechar:
            reader = csv.reader(f, delimiter=delim, quotechar=quotechar)
        else:
            reader = csv.reader(f, delimiter=delim, quoting=csv.QUOTE_NONE)
        # Skip "title" rows — some XLS exports begin with a single-cell
        # title (rest empty), then the real header row follows.
        # next(..., None) so an empty or title-only file is reported cleanly
        # instead of dying with StopIteration.
        header_raw = next(reader, None)
        if header_raw is not None:
            non_empty = sum(1 for h in header_raw if h.strip().strip("|").strip())
            if non_empty <= 1:
                print("[import] skipping title row, advancing to next", file=sys.stderr)
                header_raw = next(reader, None)
        if header_raw is None:
            print(f"ERROR: {csv_path} has no header row", file=sys.stderr)
            sys.exit(1)
        # Strip pipe-quote artifacts: 2022 fields look like |"FIELD"| with literal | bookends
        header_raw = [h.strip().strip("|").strip() for h in header_raw]
        header = [normalize_header(h) for h in header_raw]
        # Build column index map. For dup headers (2× "data publicare"), LAST wins.
        col_idx: dict[str, int] = {}
        for i, h in enumerate(header):
            mapped = HEADER_MAP.get(h)
            if mapped:
                col_idx[mapped] = i

        def cell(row: list[str], col: str) -> str | None:
            """Return the trimmed value of output column *col* in *row*, or None."""
            idx = col_idx.get(col)
            if idx is None or idx >= len(row):
                return None
            v = row[idx].strip().strip("|").strip()
            return v if v else None

        # Header line naming the output columns.
        out.write("\t".join(out_cols) + "\n")
        for row in reader:
            total += 1
            if len(row) < len(header):
                row = row + [""] * (len(header) - len(row))
            ref = cell(row, "ref_number")
            # For initiere imports, files name the ref column "Numar anunt initiere"
            # which we map to ref_initiere. Fall through to that field.
            if not ref and record_type in ("initiere",):
                ref = cell(row, "ref_initiere")
            if not ref:
                skipped_no_ref += 1
                continue
            key = (record_type, ref)
            if key in seen:
                skipped_dup += 1
                continue
            seen.add(key)
            fields = {
                "type": record_type,
                "ref_number": ref,
                "authority_name": cell(row, "authority_name"),
                "authority_cui": normalize_cui(cell(row, "authority_cui")),
                "cpv_code": cell(row, "cpv_code"),
                "cpv_name": cell(row, "cpv_name"),
                "contract_type": cell(row, "contract_type"),
                "publication_date": parse_date(cell(row, "publication_date")),
                "contract_date": parse_date(cell(row, "contract_date")),
                "awarded_value": parse_number(cell(row, "awarded_value")),
                "supplier_name": cell(row, "supplier_name"),
                "supplier_cui": normalize_cui(cell(row, "supplier_cui")),
                "procedure_type": cell(row, "procedure_type"),
                "legislation": cell(row, "legislation"),
                "source": source,
            }
            line_parts = []
            for c in out_cols:
                v = fields.get(c)
                if v is None:
                    # Null marker for the text-format loader.
                    line_parts.append("\\N")
                else:
                    # Escape tabs, newlines, backslashes for COPY format
                    v = str(v).replace("\\", "\\\\").replace("\t", " ").replace("\n", " ").replace("\r", "")
                    line_parts.append(v)
            out.write("\t".join(line_parts) + "\n")
            written += 1
    print(f"[import] CSV={csv_path.name}")
    print(f"[import] total rows: {total}")
    print(f"[import] written: {written}")
    print(f"[import] dup-skip: {skipped_dup}")
    print(f"[import] no-ref: {skipped_no_ref}")
    print(f"[import] output: {out_path}")


if __name__ == "__main__":
    main()
+81
View File
@@ -0,0 +1,81 @@
#!/bin/bash
# SEAP historical CSV importer wrapper.
# Downloads a yearly+quarterly resource from data.gov.ro CKAN and imports
# it into seap.announcements via the Python normalizer + psql COPY.
#
# Usage:
#   ./import-seap-historical.sh URL TYPE SOURCE [DELETE_FIRST]
#     URL:          full data.gov.ro CKAN download URL
#     TYPE:         'contract' | 'da' | 'initiere' | 'atribuire_fara' | 'modificare'
#     SOURCE:       tag e.g. 'datagov_2024_t1_contracte' (letters, digits, _ . - only)
#     DELETE_FIRST: 'yes' to wipe rows tagged with this source before insert
#
# Example:
#   bash import-seap-historical.sh \
#     'https://data.gov.ro/dataset/ed.../resource/24a.../download/...t-i-2024.csv' \
#     contract datagov_2024_t1_contracte yes
set -euo pipefail

if [ "$#" -lt 3 ]; then
  echo "usage: $0 URL TYPE SOURCE [DELETE_FIRST]" >&2
  exit 2
fi
URL="$1"
TYPE="$2"
SOURCE="$3"
DELETE_FIRST="${4:-no}"

# SOURCE is pasted into SQL string literals and travels through a remote,
# unquoted heredoc (where the remote shell still expands \$ and backticks).
# Restrict it to a safe alphabet instead of trying to escape it twice.
if [[ ! "$SOURCE" =~ ^[A-Za-z0-9_.-]+$ ]]; then
  echo "[import] ERROR: SOURCE may only contain letters, digits, '_', '.', '-'" >&2
  exit 2
fi

# Private, unpredictable work dir; removed on any exit.
WORK="$(mktemp -d /tmp/seap-historical-XXXXXX)"
trap 'rm -rf "$WORK"' EXIT
CSV="$WORK/data.csv"
TSV="$WORK/data.tsv"

echo "[import] downloading: $URL"
# --fail: abort on HTTP errors instead of saving the error page as "CSV".
# NOTE(review): -k disables TLS certificate verification — confirm it is
# still required for data.gov.ro and drop it if not.
curl -sSk --fail --max-time 600 -L "$URL" -o "$CSV"
echo "[import] downloaded: $(stat -c %s "$CSV") bytes"

echo "[import] normalizing CSV → TSV..."
python3 "$(dirname "$0")/import-seap-historical.py" "$CSV" "$TSV" "$TYPE" "$SOURCE"

# Stage on the DB host
echo "[import] copying TSV to satra..."
scp -q "$TSV" "satra:/tmp/seap-historical.tsv"

DELETE_SQL=""
if [ "$DELETE_FIRST" = "yes" ]; then
  DELETE_SQL="DELETE FROM seap.announcements WHERE source = '$SOURCE';"
fi

echo "[import] running insert on satra..."
ssh satra "/tmp/baseline.sh <<SQL
$DELETE_SQL
CREATE TEMP TABLE _stage_seap_hist (
type text, ref_number text, authority_name text, authority_cui text,
cpv_code text, cpv_name text, contract_type text, publication_date text,
contract_date text, awarded_value text, supplier_name text, supplier_cui text,
procedure_type text, legislation text, source text
);
\\COPY _stage_seap_hist FROM '/tmp/seap-historical.tsv' WITH (FORMAT text, DELIMITER E'\\t', HEADER true);
INSERT INTO seap.announcements (
type, ref_number, authority_name, authority_cui, cpv_code, cpv_name,
contract_type, publication_date, contract_date, awarded_value,
supplier_name, supplier_cui, procedure_type, legislation, source
)
SELECT type, ref_number, authority_name, authority_cui, cpv_code, cpv_name,
contract_type,
NULLIF(publication_date, '')::timestamptz,
NULLIF(contract_date, '')::date,
NULLIF(awarded_value, '')::numeric,
supplier_name, supplier_cui, procedure_type, legislation, source
FROM _stage_seap_hist
ON CONFLICT (type, ref_number) DO NOTHING;
SELECT '$SOURCE' AS source, COUNT(*) AS rows,
MIN(publication_date)::date AS oldest,
MAX(publication_date)::date AS newest,
SUM(awarded_value)::bigint AS total_lei
FROM seap.announcements WHERE source = '$SOURCE';
SQL"
ssh satra "rm -f /tmp/seap-historical.tsv"
echo "[import] done."
+73
View File
@@ -0,0 +1,73 @@
#!/bin/bash
# SEAP historical XLSX importer.
# Downloads an xlsx from data.gov.ro, converts to CSV via openpyxl,
# then hands it to import-seap-historical.py + the same TSV+psql flow.
#
# Usage: ./import-seap-xlsx.sh URL TYPE SOURCE [DELETE_FIRST]
#   SOURCE may only contain letters, digits, '_', '.', '-'.
#   DELETE_FIRST: 'yes' to wipe rows tagged with this source before insert.
set -euo pipefail

if [ "$#" -lt 3 ]; then
  echo "usage: $0 URL TYPE SOURCE [DELETE_FIRST]" >&2
  exit 2
fi
URL="$1"
TYPE="$2"
SOURCE="$3"
DELETE_FIRST="${4:-no}"

# SOURCE is pasted into SQL string literals and travels through a remote,
# unquoted heredoc (where the remote shell still expands \$ and backticks).
# Restrict it to a safe alphabet instead of trying to escape it twice.
if [[ ! "$SOURCE" =~ ^[A-Za-z0-9_.-]+$ ]]; then
  echo "[xlsx-import] ERROR: SOURCE may only contain letters, digits, '_', '.', '-'" >&2
  exit 2
fi

# Private, unpredictable work dir; removed on any exit.
WORK="$(mktemp -d /tmp/seap-xlsx-XXXXXX)"
trap 'rm -rf "$WORK"' EXIT
XLSX="$WORK/data.xlsx"
CSV="$WORK/data.csv"
TSV="$WORK/data.tsv"
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"

echo "[xlsx-import] downloading: $URL"
# --fail: abort on HTTP errors instead of saving the error page as "XLSX".
# NOTE(review): -k disables TLS certificate verification — confirm it is
# still required for data.gov.ro and drop it if not.
curl -sSk --fail --max-time 600 -L "$URL" -o "$XLSX"
echo "[xlsx-import] downloaded: $(stat -c %s "$XLSX") bytes"

echo "[xlsx-import] xlsx → csv..."
python3 "$SCRIPT_DIR/xlsx-to-csv.py" "$XLSX" "$CSV"
echo "[xlsx-import] csv: $(stat -c %s "$CSV") bytes"

echo "[xlsx-import] normalizing CSV → TSV..."
python3 "$SCRIPT_DIR/import-seap-historical.py" "$CSV" "$TSV" "$TYPE" "$SOURCE"

echo "[xlsx-import] copying TSV to satra..."
scp -q "$TSV" "satra:/tmp/seap-historical.tsv"

DELETE_SQL=""
if [ "$DELETE_FIRST" = "yes" ]; then
  DELETE_SQL="DELETE FROM seap.announcements WHERE source = '$SOURCE';"
fi

ssh satra "/tmp/baseline.sh <<SQL
$DELETE_SQL
CREATE TEMP TABLE _stage_seap_hist (
type text, ref_number text, authority_name text, authority_cui text,
cpv_code text, cpv_name text, contract_type text, publication_date text,
contract_date text, awarded_value text, supplier_name text, supplier_cui text,
procedure_type text, legislation text, source text
);
\\COPY _stage_seap_hist FROM '/tmp/seap-historical.tsv' WITH (FORMAT text, DELIMITER E'\\t', HEADER true);
INSERT INTO seap.announcements (
type, ref_number, authority_name, authority_cui, cpv_code, cpv_name,
contract_type, publication_date, contract_date, awarded_value,
supplier_name, supplier_cui, procedure_type, legislation, source
)
SELECT type, ref_number, authority_name, authority_cui, cpv_code, cpv_name,
contract_type,
NULLIF(publication_date, '')::timestamptz,
NULLIF(contract_date, '')::date,
NULLIF(awarded_value, '')::numeric,
supplier_name, supplier_cui, procedure_type, legislation, source
FROM _stage_seap_hist
ON CONFLICT (type, ref_number) DO NOTHING;
SELECT '$SOURCE' AS source, COUNT(*) AS rows,
MIN(publication_date)::date AS oldest,
MAX(publication_date)::date AS newest,
SUM(awarded_value)::bigint AS total_lei
FROM seap.announcements WHERE source = '$SOURCE';
SQL"
ssh satra "rm -f /tmp/seap-historical.tsv"
echo "[xlsx-import] done."
@@ -0,0 +1,56 @@
/**
* Standalone test for CNAS Layout-B parser.
*
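* Reads pdftotext -layout output for the 8 known Layout-B PDFs (the 9th is
* an empty form template), picks a parser from the file name
* (parseProviderTextRadio, parseProviderTextSingleCAS,
* parseProviderTextNumberedDot, otherwise parseProviderTextJudetGrouped), and
* prints the first TEST_LIMIT rows (default 20) per file for manual inspection.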
*
* Usage:
* npx tsx scripts/test-cnas-layout-b.ts /tmp/cnas-pdfs/Lista-furnizori-testare-genetica-2024-2025_all.pdf
* npx tsx scripts/test-cnas-layout-b.ts /tmp/cnas-pdfs/*.pdf
*/
import { execFile } from 'child_process';
import { promisify } from 'util';
import { basename } from 'path';
import { parseProviderTextJudetGrouped, parseProviderTextRadio, parseProviderTextSingleCAS, parseProviderTextNumberedDot } from '../src/cnas-layout-b.js';
const execFileAsync = promisify(execFile);

/**
 * Run `pdftotext -layout -enc UTF-8` on a PDF and return the text it writes
 * to stdout ("-" as the output argument).
 *
 * @param pdfPath path of the PDF to convert
 * @returns the captured stdout of pdftotext
 */
async function pdftotextLayout(pdfPath: string): Promise<string> {
  const args = ['-layout', '-enc', 'UTF-8', pdfPath, '-'];
  // Allow up to 64 MiB of captured output.
  const result = await execFileAsync('pdftotext', args, {
    maxBuffer: 64 * 1024 * 1024,
  });
  return result.stdout;
}
/**
 * Parse the TEST_LIMIT setting into a non-negative row count.
 * Falls back to `fallback` when the value is unset, empty, not a number,
 * or negative, so a typo cannot silently suppress all output.
 */
function resolveRowLimit(raw: string | undefined, fallback = 20): number {
  const parsed = Number.parseInt(raw || '', 10);
  return Number.isFinite(parsed) && parsed >= 0 ? parsed : fallback;
}

/**
 * Convert each PDF given on the command line to text, parse it with the
 * parser selected from the file name, and print up to TEST_LIMIT rows
 * (default 20) per file. Exits with status 1 when no file is given.
 */
async function main() {
  const files = process.argv.slice(2);
  if (files.length === 0) {
    console.error('Usage: tsx test-cnas-layout-b.ts <pdf>...');
    process.exit(1);
  }
  // The limit does not depend on the file, so resolve it once.
  const limit = resolveRowLimit(process.env.TEST_LIMIT);
  for (const f of files) {
    const fn = basename(f);
    console.log(`\n=== ${fn} ===`);
    const text = await pdftotextLayout(f);
    // Parser choice is driven purely by the PDF's file name.
    let rows;
    if (/radioterapie/i.test(fn)) {
      rows = parseProviderTextRadio(text, { tip: 'radioterapie' });
    } else if (/CAS-GORJ.*PNS/i.test(fn) || /Valori-de-contract-furnizori-PNS/i.test(fn)) {
      rows = parseProviderTextSingleCAS(text, { tip: 'pns', judet: 'GORJ' });
    } else if (/ASISTENTA-MEDICALA-PRIMARA/i.test(fn)) {
      rows = parseProviderTextNumberedDot(text, { tip: 'medicina_familie', judet: 'SIBIU' });
    } else {
      rows = parseProviderTextJudetGrouped(text, { tip: 'oncologie' });
    }
    console.log(`Parsed ${rows.length} rows`);
    const shown = Math.min(rows.length, limit);
    for (let i = 0; i < shown; i++) {
      const r = rows[i];
      console.log(` [${i + 1}] judet=${r.judet || '-'} name="${r.name}" sediu="${r.sediu || '-'}" tel=${r.telefon || '-'} email=${r.email || '-'} flags=${r.specialitate || '-'}`);
    }
    if (rows.length > limit) console.log(` ... and ${rows.length - limit} more`);
  }
}

main().catch((e) => { console.error(e); process.exit(1); });
@@ -0,0 +1,95 @@
#!/usr/bin/env python3
"""XLSX/XLS → CSV converter for SEAP data.gov.ro yearly dumps.
Reads the active sheet (XLSX) or all sheets concatenated (XLS) and writes a
UTF-8 CSV (comma + double-quote) so the existing SEAP normalizer
(import-seap-historical.py) can ingest it.
Auto-detects file format:
- XLSX (zip archive) → openpyxl
- XLS (BIFF8 OLE) → xlrd 1.x
Usage: python3 xlsx-to-csv.py INPUT.{xlsx|xls} OUTPUT.csv
"""
import csv
import sys
from datetime import datetime
from pathlib import Path
def is_xlsx(path: Path) -> bool:
    """Return True when the file starts with the two-byte ZIP signature "PK".

    XLSX workbooks are ZIP archives, so this prefix is what separates them
    from legacy XLS input.
    """
    zip_magic = b"PK"
    with path.open("rb") as handle:
        signature = handle.read(len(zip_magic))
    return signature == zip_magic
# Timestamp layout written for every date cell, in both input formats.
_DATE_FORMAT = "%m/%d/%Y %H:%M:%S"


def _number_to_text(value: object) -> str:
    """Stringify a cell value, writing whole-number floats without ".0"."""
    # float.is_integer() is False for inf/NaN, so those fall through to str()
    # instead of raising the way int(value) would.
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value)


def _xlsx_value_to_text(value: object) -> str:
    """Convert one openpyxl cell value to its CSV text form."""
    if value is None:
        return ""
    if isinstance(value, datetime):
        return value.strftime(_DATE_FORMAT)
    return _number_to_text(value)


def _convert_xlsx(src: Path, dst: Path) -> int:
    """Write the active sheet of an XLSX workbook to *dst*; return rows written."""
    import openpyxl

    wb = openpyxl.load_workbook(src, read_only=True, data_only=True)
    try:
        ws = wb.active
        written = 0
        with dst.open("w", encoding="utf-8", newline="") as f:
            w = csv.writer(f, quoting=csv.QUOTE_MINIMAL)
            for row in ws.iter_rows(values_only=True):
                w.writerow([_xlsx_value_to_text(v) for v in row])
                written += 1
    finally:
        # Read-only workbooks keep the source file open until closed.
        wb.close()
    return written


def _convert_xls(src: Path, dst: Path) -> int:
    """Write all sheets of a legacy XLS workbook to *dst*; return rows written.

    Sheets are concatenated because some big SEAP files are split across
    several sheets (65k row limit of the old XLS format). The first row of
    every non-empty sheet after the first is skipped as a repeated header.
    """
    import xlrd

    book = xlrd.open_workbook(str(src))
    blank_types = (xlrd.XL_CELL_EMPTY, xlrd.XL_CELL_BLANK)

    def cell_text(cell) -> str:
        if cell.ctype in blank_types:
            return ""
        if cell.ctype == xlrd.XL_CELL_DATE:
            try:
                parts = xlrd.xldate_as_tuple(cell.value, book.datemode)
                return datetime(*parts).strftime(_DATE_FORMAT)
            except (ValueError, OverflowError):
                # Unconvertible serial (e.g. a time-only value): keep the raw
                # number rather than aborting the whole conversion.
                return str(cell.value)
        if cell.ctype == xlrd.XL_CELL_NUMBER:
            return _number_to_text(cell.value)
        return str(cell.value)

    written = 0
    header_written = False
    with dst.open("w", encoding="utf-8", newline="") as f:
        w = csv.writer(f, quoting=csv.QUOTE_MINIMAL)
        for sidx in range(book.nsheets):
            sh = book.sheet_by_index(sidx)
            if sh.nrows == 0:
                continue
            first_row = 1 if header_written else 0
            header_written = True
            for ridx in range(first_row, sh.nrows):
                w.writerow([cell_text(cell) for cell in sh.row(ridx)])
                written += 1
    return written


def main() -> None:
    """Convert the workbook named in argv[1] to the CSV named in argv[2].

    Prints the module docstring and exits with status 2 unless exactly two
    arguments are given. The input format is chosen by is_xlsx(); a one-line
    row-count summary is written to stderr.
    """
    if len(sys.argv) != 3:
        print(__doc__)
        sys.exit(2)
    src = Path(sys.argv[1])
    dst = Path(sys.argv[2])
    convert = _convert_xlsx if is_xlsx(src) else _convert_xls
    written = convert(src, dst)
    print(f"[xlsx2csv] {src.name} -> {dst.name}: {written} rows", file=sys.stderr)


if __name__ == "__main__":
    main()