Files
Claude VM a6c03a091e initial: split from gov-agreg — vreau.digital standalone platform
Moved from gov-agreg/src/pages/achizitii/* to root (drop prefix).
- 22 pages migrated, 127 files total
- All internal links: /achizitii/X → /X (176 occurrences fixed)
- AchizitiiLayout subnav rewritten: /X paths, top-right link to vreaudigital.ro hub
- BaseLayout new (vreau.digital branding, OG tags, site URL)
- astro.config.mjs: site https://vreau.digital, server output (was static)
- docker-compose: port 5096 (vreaudigital is 5095), container vreau-digital
- deploy.sh: paths /opt/vreau-digital, log /var/log/vreau-digital-deploy.log

Backend shared with gov-agreg:
- PostgreSQL satra (same schemas: seap, firms, anaf, anre, ...)
- Photon, Martin tiles
- Infisical /vreaudigital path (DATABASE_URL etc. shared)

build: PASS (npx astro check 0 errors, npm run build 5s vite + 10s server)
2026-05-13 00:10:32 +03:00

484 lines
18 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
SEAP historical CSV importer for data.gov.ro yearly dumps.
Reads a SEAP CSV (any year/quarter/type) and emits a clean TSV that
PostgreSQL COPY can ingest into seap.announcements. Handles:
- BOM stripping
- Romanian decimal commas → dots
- "MM/DD/YYYY HH:MM:SS" date parsing (with second column variants)
- Column dedupe by (type, ref_number) — first-row-wins for multi-lot CANs
- CUI normalization (strip "RO " prefix)
Usage:
python3 import-seap-historical.py CSV_PATH OUTPUT_TSV TYPE SOURCE
TYPE: 'contract' | 'da' | 'initiere' | 'atribuire_fara' | 'modificare'
SOURCE: e.g. 'datagov_2024_t1_contracte'
The output TSV columns are FIXED (15 columns matching the import SQL):
type, ref_number, authority_name, authority_cui, cpv_code, cpv_name,
contract_type, publication_date, contract_date, awarded_value,
supplier_name, supplier_cui, procedure_type, legislation, source
Column mapping is inferred from CSV headers (case+diacritic-insensitive).
Falls back gracefully when columns are missing (older years had fewer cols).
"""
from __future__ import annotations
import csv
import re
import sys
import unicodedata
from datetime import datetime
from pathlib import Path
def normalize_header(s: str) -> str:
"""Strip BOM, lowercase, strip diacritics, collapse whitespace."""
s = s.replace("", "").strip().lower()
s = "".join(c for c in unicodedata.normalize("NFD", s) if unicodedata.category(c) != "Mn")
s = re.sub(r"\s+", " ", s)
s = s.replace("?", "")
return s.strip()
def detect_dialect(first_line: str) -> tuple[str, str | None]:
"""Detect delimiter and quote char from first line.
SEAP historical CSVs vary wildly:
- 2017/2018: ^ delim, no quote
- 2022: , delim, | quote (header looks like |FIELD|,|FIELD|)
- 2023 T3: | delim, " quote (header: FIELD|FIELD with row "txt"|"txt")
- 2023 T4: , delim, " quote (standard CSV with title-case headers)
- 2024+: , delim, " quote (standard CSV)
Returns (delim, quotechar_or_None).
"""
# Strip BOM (efbb bf) and lstrip whitespace before sniffing
s = first_line
if s.startswith(""):
s = s[1:]
s_strip = s.lstrip()
# 2022 wire format: header LINE starts with `|` and uses `|FIELD|,|FIELD|`
# → delim=',' quote='|'
if s_strip.startswith("|") and "|," in s_strip:
return (",", "|")
counts = {c: s.count(c) for c in [",", "|", "^", ";", "\t"]}
# Pick highest-count delimiter
delim = max(counts, key=counts.get)
if counts[delim] == 0:
delim = ","
if delim == "|":
return ("|", '"')
if delim == "^":
return ("^", None)
if delim == ";":
return (";", '"')
return (",", '"')
# Maps normalized header → output column name.
# Multiple headers may map to the same output (e.g. two "data publicare" cols).
# Schema variants seen across data.gov.ro yearly dumps:
# - 2024 (CSV, comma): "Autoritate contractanta", "Numar anunt", "Cod CPV"
# - 2022/2023 (CSV/pipe, |QUOTE|): "DENUMIRE_AC", "NUMAR_ANUNT_ATRIBUIRE", "COD_CPV"
# - 2017/2018 (^-delim): "AutoritateContractanta", "NumarAnuntAtribuire", "CPVCode"
HEADER_MAP = {
# 2024 standard CSV
"autoritate contractanta": "authority_name",
"cui": "authority_cui",
"cui autoritate contractanta": "authority_cui",
"cod cpv": "cpv_code",
"denumire cpv": "cpv_name",
"tip contract": "contract_type",
"tip procedura": "procedure_type",
"tip legislatie": "legislation",
"tip incheiere contract": "award_type",
"tip inchiere contract": "award_type", # typo seen in 2023 T1 XLS
"tip criteriu de atribuire": "criterion",
"numar anunt atribuire": "ref_number",
"numar anunt initiere": "ref_initiere",
"numar anunt": "ref_number",
"numar contract": "contract_number",
"numar lot": "lot_number",
"data contract": "contract_date",
"data publicare": "publication_date",
"data publicare anunt atribuire": "publication_date", # 2023 T4 standard CSV
"data anunt atribuire": "publication_date", # 2023 T1 XLS, 2017 ^-delim
"data anunt initiere": "ref_initiere_date",
"data publicare anunt initiere": "ref_initiere_date",
"data publicare anunt": "publication_date", # 2023 T4 atribuire-fara
"valoare atribuita (ron)": "awarded_value",
"valoare estimata procedura": "estimated_value",
"moneda valoare estimata procedura": "estimated_currency",
"denumire procedura": "procedure_name",
"tip activitate autoritate": "authority_activity",
"criteriu de atribuire": "criterion",
"denumire contract": "contract_title",
"oras ofertant castigator": "supplier_city",
"tara ofertant castigator": "supplier_country",
"data publicare contract": "contract_date",
"tip activitate": "authority_activity",
"tip autoritate": "authority_type",
"tip anunt": "announcement_type",
"criterii de atribuire": "criterion",
"licitatie electronica": "electronic_auction",
"ofertant castigator": "supplier_name",
"cui ofertant castigator": "supplier_cui",
"oras ofertant": "supplier_city",
"tara ofertant": "supplier_country",
"incheiat prin": "award_type",
"valoare contract (ron)": "awarded_value",
"valoare contract": "awarded_value",
"valoare estimata (ron)": "estimated_value",
"valoare estimata": "estimated_value",
"ofertant": "supplier_name",
"cui ofertant": "supplier_cui",
"cui castigator": "supplier_cui",
"castigator": "supplier_name",
"oras": "supplier_city",
"tara": "supplier_country",
"modalitate de desfasurare": "modality",
# 2022/2023 UPPER_SNAKE_CASE pipe-delim schema
"denumire_ac": "authority_name",
"cui_ac": "authority_cui",
"cui_autoritate": "authority_cui",
"autoritate_contractanta": "authority_name",
"numar_anunt_atribuire": "ref_number",
"numar_anunt": "ref_number",
"data_anunt_atribuire": "publication_date",
"data_publicare": "publication_date",
"data_publicare_ai": "ref_initiere_date",
"data_contract": "contract_date",
"numar_contract": "contract_number",
"denumire_contract": "contract_title",
"cod_cpv": "cpv_code",
"cod_cpv_procedura": "cpv_code",
"cpv_code": "cpv_code", # 2023 schema variant
"denumire_cpv": "cpv_name",
"denumire_cpv_procedura": "cpv_name",
"tip_contract": "contract_type",
"tip_procedura": "procedure_type",
"tip_legislatie": "legislation",
"tip_lesiglatie": "legislation", # SEAP typo present in many 2023 files
"tip_anunt": "announcement_type",
"tip_incheiere_contract": "award_type",
"incheiat_prin": "award_type",
"valoare_contract_ron": "awarded_value",
"valoare_atribuita": "awarded_value",
"valoare_estimata_procedura": "estimated_value",
"ofertant": "supplier_name",
"cui_of": "supplier_cui",
"nume_castigator": "supplier_name",
"cui_castigator": "supplier_cui",
"oras_castigator": "supplier_city",
"tara_castigator": "supplier_country",
"modalitate_desfasurare": "modality",
"modalitate_atribuire": "modality",
"tip_criterii_atribuire": "criterion",
"criteriu_de_atribuire": "criterion",
"numar_anunt_ai": "ref_initiere",
"numar_anunt_initiere": "ref_initiere",
"data_anunt_initiere": "ref_initiere_date",
"denumire_procedura": "procedure_name",
# 2017/2018 ^-delim CamelCase legacy schema
"castigator": "supplier_name", # already exists for 2024 but also legacy
"castigatorcui": "supplier_cui",
"castigatortara": "supplier_country",
"castigatorlocalitate": "supplier_city",
"castigatoradresa": "supplier_address",
"tipcontract": "contract_type",
"tipprocedura": "procedure_type",
"autoritatecontractanta": "authority_name",
"autoritatecontractantacui": "authority_cui",
"tipac": "authority_type",
"tipactivitateac": "authority_activity",
"denumireac": "authority_name",
"numaranuntatribuire": "ref_number",
"numaranuntparticipare": "ref_initiere",
"numaranunt": "ref_number",
"dataanuntatribuire": "publication_date",
"dataanuntparticipare": "ref_initiere_date",
"datapublicare": "publication_date",
"tipincheierecontract": "award_type",
"tipcriteriiatribuire": "criterion",
"culicitatieelectronica": "electronic_auction",
"numarofertepre primite": "n_offers",
"numarofertePrimite": "n_offers",
"subcontractat": "subcontracted",
"numarcontract": "contract_number",
"datacontract": "contract_date",
"titlucontract": "contract_title",
"valoare": "awarded_value_orig", # may be in non-RON currency for 2017
"moneda": "currency",
"valoareron": "awarded_value",
"valoareeur": "awarded_value_eur",
"cpvcodeid": "cpv_code_id", # internal SEAP id, not CPV
"cpvcode": "cpv_code", # actual CPV like 85150000-5
"valoareestimataparticipare": "estimated_value",
"monedavaloareestimataparticipare": "estimated_currency",
"fonduricomunitare": "eu_funded",
"tipfinantare": "funding_type",
"tiplegislatieid": "legislation",
"fondeuropean": "eu_fund",
"contractperiodic": "periodic",
"depozitegarantii": "deposits",
"modalitatifinantare": "funding_modes",
"tip": "announcement_subtype", # 2017 contracte has bare "Tip"
# 2018-2019 XLS schema (UPPER_SNAKE with explicit underscores)
"castigator": "supplier_name",
"castigator_cui": "supplier_cui",
"castigator_tara": "supplier_country",
"castigator_localitate": "supplier_city",
"castigaor_localitate": "supplier_city", # SEAP typo seen in 2018 T2 XLS
"castigator_adresa": "supplier_address",
"tip_ac": "authority_type",
"tip_activitate_ac": "authority_activity",
"autoritate_contractanta_cui": "authority_cui",
"numar_anunt_participare": "ref_initiere",
"data_anunt_participare": "ref_initiere_date",
"tip_incheiere_contract": "award_type",
"tip_criterii_atribuire": "criterion",
"cu_licitatie_electronica": "electronic_auction",
"numar_oferte_primite": "n_offers",
"titlu_contract": "contract_title",
"valoare_ron": "awarded_value",
"valoare_eur": "awarded_value_eur",
"valoare_estimata_participare": "estimated_value",
"moneda_valoare_estimata_participare": "estimated_currency",
"fonduri_comunitare": "eu_funded",
"tip_finantare": "funding_type",
"tip_legislatie_id": "legislation",
"fond_european": "eu_fund",
"contract_periodic": "periodic",
"depozite_garantii": "deposits",
"modalitati_finantare": "funding_modes",
"cpv_code_id": "cpv_code_id",
"cpv_code": "cpv_code",
}
def parse_date(s: str | None) -> str | None:
"""Parse MM/DD/YYYY [HH:MM:SS] or DD.MM.YYYY → ISO YYYY-MM-DD."""
if not s:
return None
s = s.strip()
if not s:
return None
# MM/DD/YYYY 01:35:39
m = re.match(r"^(\d{1,2})/(\d{1,2})/(\d{4})", s)
if m:
try:
mm, dd, yy = int(m[1]), int(m[2]), int(m[3])
datetime(yy, mm, dd) # validate
return f"{yy:04d}-{mm:02d}-{dd:02d}"
except ValueError:
return None
# DD.MM.YYYY
m = re.match(r"^(\d{1,2})\.(\d{1,2})\.(\d{4})", s)
if m:
try:
dd, mm, yy = int(m[1]), int(m[2]), int(m[3])
datetime(yy, mm, dd)
return f"{yy:04d}-{mm:02d}-{dd:02d}"
except ValueError:
return None
# YYYY-MM-DD passthrough
if re.match(r"^\d{4}-\d{2}-\d{2}", s):
return s[:10]
return None
def parse_number(s: str | None) -> str | None:
"""Parse Romanian number → ISO float string.
SEAP CSV uses MIXED conventions:
- "1.234.567,89" → period=thousand, comma=decimal → 1234567.89
- "123,126" → comma=THOUSAND (3 digits after) → 123126
- "12345,67" → comma=decimal (2 digits after) → 12345.67
- "1,234,567" → all commas=thousand → 1234567
Heuristic: digits-after-final-comma == 3 → thousand separator,
otherwise → decimal. Robust to most real RO data.
"""
if not s:
return None
s = s.strip().strip('"').replace("\xa0", "").replace(" ", "")
if not s or s == "-":
return None
# Mixed period+comma → assume RO format (period thousand, comma decimal)
if "," in s and "." in s:
s = s.replace(".", "").replace(",", ".")
try:
return f"{float(s):.2f}"
except ValueError:
return None
# Multi-comma → all thousand separators
if s.count(",") >= 2:
try:
return f"{int(s.replace(',', '')):d}.00"
except ValueError:
return None
# Single comma → check digits after
if "," in s:
parts = s.split(",")
if len(parts) == 2 and parts[0].isdigit() and parts[1].isdigit():
digits_after = len(parts[1])
if digits_after == 3:
# Thousand separator (most common SEAP case)
try:
return f"{int(parts[0] + parts[1])}.00"
except ValueError:
return None
# 1-2 digits after → decimal separator
try:
return f"{float(parts[0] + '.' + parts[1]):.2f}"
except ValueError:
return None
try:
return f"{float(s):.2f}"
except ValueError:
return None
def normalize_cui(s: str | None) -> str | None:
if not s:
return None
s = s.strip().strip('"')
s = re.sub(r"^RO\s*", "", s, flags=re.IGNORECASE)
s = s.strip()
if not s or not s.isdigit():
return None
return s
def main() -> None:
if len(sys.argv) != 5:
print(__doc__)
sys.exit(2)
csv_path = Path(sys.argv[1])
out_path = Path(sys.argv[2])
record_type = sys.argv[3]
source = sys.argv[4]
if not csv_path.exists():
print(f"ERROR: {csv_path} does not exist", file=sys.stderr)
sys.exit(1)
seen: set[tuple[str, str]] = set()
out_cols = [
"type", "ref_number", "authority_name", "authority_cui",
"cpv_code", "cpv_name", "contract_type", "publication_date",
"contract_date", "awarded_value", "supplier_name", "supplier_cui",
"procedure_type", "legislation", "source",
]
written = 0
skipped_dup = 0
skipped_no_ref = 0
total = 0
# Sniff first line to detect delimiter/quotechar
with csv_path.open("r", encoding="utf-8-sig", errors="replace") as f:
first_line = f.readline()
delim, quotechar = detect_dialect(first_line)
print(f"[import] delim={delim!r} quote={quotechar!r}", file=sys.stderr)
with csv_path.open("r", encoding="utf-8-sig", errors="replace") as f, \
out_path.open("w", encoding="utf-8") as out:
if quotechar:
reader = csv.reader(f, delimiter=delim, quotechar=quotechar)
else:
reader = csv.reader(f, delimiter=delim, quoting=csv.QUOTE_NONE)
# Skip "title" rows — some XLS exports begin with a single-cell
# title (rest empty), then the real header row follows.
header_raw = next(reader)
non_empty = sum(1 for h in header_raw if h.strip().strip("|").strip())
if non_empty <= 1:
print("[import] skipping title row, advancing to next", file=sys.stderr)
header_raw = next(reader)
# Strip pipe-quote artifacts: 2022 fields look like |"FIELD"| with literal | bookends
header_raw = [h.strip().strip("|").strip() for h in header_raw]
header = [normalize_header(h) for h in header_raw]
# Build column index map. For dup headers (2× "data publicare"), LAST wins.
col_idx: dict[str, int] = {}
for i, h in enumerate(header):
mapped = HEADER_MAP.get(h)
if mapped:
col_idx[mapped] = i
# Write header line for COPY (\\\\N markers for nulls)
out.write("\t".join(out_cols) + "\n")
for row in reader:
total += 1
if len(row) < len(header):
row = row + [""] * (len(header) - len(row))
def get(col: str) -> str | None:
idx = col_idx.get(col)
if idx is None or idx >= len(row):
return None
v = row[idx].strip().strip("|").strip()
return v if v else None
ref = get("ref_number")
# For initiere imports, files name the ref column "Numar anunt initiere"
# which we map to ref_initiere. Fall through to that field.
if not ref and record_type in ("initiere",):
ref = get("ref_initiere")
if not ref:
skipped_no_ref += 1
continue
key = (record_type, ref)
if key in seen:
skipped_dup += 1
continue
seen.add(key)
fields = {
"type": record_type,
"ref_number": ref,
"authority_name": get("authority_name"),
"authority_cui": normalize_cui(get("authority_cui")),
"cpv_code": get("cpv_code"),
"cpv_name": get("cpv_name"),
"contract_type": get("contract_type"),
"publication_date": parse_date(get("publication_date")),
"contract_date": parse_date(get("contract_date")),
"awarded_value": parse_number(get("awarded_value")),
"supplier_name": get("supplier_name"),
"supplier_cui": normalize_cui(get("supplier_cui")),
"procedure_type": get("procedure_type"),
"legislation": get("legislation"),
"source": source,
}
line_parts = []
for c in out_cols:
v = fields.get(c)
if v is None:
line_parts.append("\\N")
else:
# Escape tabs, newlines, backslashes for COPY format
v = str(v).replace("\\", "\\\\").replace("\t", " ").replace("\n", " ").replace("\r", "")
line_parts.append(v)
out.write("\t".join(line_parts) + "\n")
written += 1
print(f"[import] CSV={csv_path.name}")
print(f"[import] total rows: {total}")
print(f"[import] written: {written}")
print(f"[import] dup-skip: {skipped_dup}")
print(f"[import] no-ref: {skipped_no_ref}")
print(f"[import] output: {out_path}")
if __name__ == "__main__":
main()