initial: split from gov-agreg — vreau.digital standalone platform

Moved from gov-agreg/src/pages/achizitii/* to root (drop prefix).
- 22 pages migrated, 127 files total
- All internal links: /achizitii/X → /X (176 occurrences fixed)
- AchizitiiLayout subnav rewritten: /X paths, top-right link to vreaudigital.ro hub
- BaseLayout new (vreau.digital branding, OG tags, site URL)
- astro.config.mjs: site https://vreau.digital, server output (was static)
- docker-compose: port 5096 (vreaudigital is 5095), container vreau-digital
- deploy.sh: paths /opt/vreau-digital, log /var/log/vreau-digital-deploy.log

Backend shared with gov-agreg:
- PostgreSQL satra (same schemas: seap, firms, anaf, anre, ...)
- Photon, Martin tiles
- Infisical /vreaudigital path (DATABASE_URL etc. shared)

build: PASS (npx astro check 0 errors, npm run build 5s vite + 10s server)
This commit is contained in:
Claude VM
2026-05-13 00:10:32 +03:00
commit a6c03a091e
352 changed files with 75295 additions and 0 deletions
@@ -0,0 +1,483 @@
#!/usr/bin/env python3
"""
SEAP historical CSV importer for data.gov.ro yearly dumps.
Reads a SEAP CSV (any year/quarter/type) and emits a clean TSV that
PostgreSQL COPY can ingest into seap.announcements. Handles:
- BOM stripping
- Romanian decimal commas → dots
- "MM/DD/YYYY HH:MM:SS" date parsing (with second column variants)
- Row dedupe by (type, ref_number) — first-row-wins for multi-lot CANs
- CUI normalization (strip "RO " prefix)
Usage:
python3 import-seap-historical.py CSV_PATH OUTPUT_TSV TYPE SOURCE
TYPE: 'contract' | 'da' | 'initiere' | 'atribuire_fara' | 'modificare'
SOURCE: e.g. 'datagov_2024_t1_contracte'
The output TSV columns are FIXED (15 columns matching the import SQL):
type, ref_number, authority_name, authority_cui, cpv_code, cpv_name,
contract_type, publication_date, contract_date, awarded_value,
supplier_name, supplier_cui, procedure_type, legislation, source
Column mapping is inferred from CSV headers (case+diacritic-insensitive).
Falls back gracefully when columns are missing (older years had fewer cols).
"""
from __future__ import annotations
import csv
import re
import sys
import unicodedata
from datetime import datetime
from pathlib import Path
def normalize_header(s: str) -> str:
    """Return the canonical lookup key for a raw CSV header cell.

    The result is what ``HEADER_MAP`` is keyed by. Steps, in order:

    1. remove every U+FEFF byte-order mark, trim, lowercase;
    2. NFD-decompose and drop combining marks (category ``Mn``), which
       removes diacritics;
    3. collapse each whitespace run to a single space;
    4. delete ``?`` characters and trim again.

    Args:
        s: Header text exactly as read from the file.

    Returns:
        The normalized header; empty string for blank input.
    """
    # The BOM is written as an escape on purpose: a literal U+FEFF between
    # the quotes is invisible and, once lost, leaves a no-op replace("", "").
    s = s.replace("\ufeff", "").strip().lower()
    decomposed = unicodedata.normalize("NFD", s)
    s = "".join(c for c in decomposed if unicodedata.category(c) != "Mn")
    s = re.sub(r"\s+", " ", s)
    # NOTE(review): "?" is presumably residue of a lossy re-encoding of
    # diacritics in some dumps — confirm against the source files.
    s = s.replace("?", "")
    return s.strip()
def detect_dialect(first_line: str) -> tuple[str, str | None]:
    """Guess the CSV delimiter and quote character from the first line.

    SEAP historical CSVs vary wildly:
      - 2017/2018: ``^`` delimiter, no quoting
      - 2022:      ``,`` delimiter, ``|`` quote (header looks like |FIELD|,|FIELD|)
      - 2023 T3:   ``|`` delimiter, ``"`` quote
      - 2023 T4+:  ``,`` delimiter, ``"`` quote (standard CSV)

    Args:
        first_line: The first physical line of the file, BOM allowed.

    Returns:
        ``(delimiter, quotechar)``; ``quotechar`` is ``None`` when the
        file must be read without quote handling.
    """
    s = first_line
    # Escape form of the BOM: an invisible literal is easily lost, and
    # startswith("") is always true, which would drop the first real char.
    if s.startswith("\ufeff"):
        s = s[1:]
    stripped = s.lstrip()

    # 2022 wire format: the header line starts with `|` and uses
    # `|FIELD|,|FIELD|`, so `|` is the quote and `,` the delimiter.
    if stripped.startswith("|") and "|," in stripped:
        return (",", "|")

    # Most frequent candidate wins; ties go to the earliest candidate.
    candidates = (",", "|", "^", ";", "\t")
    counts = {c: s.count(c) for c in candidates}
    delim = max(counts, key=counts.get)
    if counts[delim] == 0:
        # Single-column or empty line: fall back to standard CSV.
        return (",", '"')
    if delim == "^":
        return ("^", None)
    # Covers ",", "|", ";" and tab. Tab used to be counted but then
    # reported as "," — it is now returned as detected.
    return (delim, '"')
# Maps a normalized header (see normalize_header) to an output column name.
# Several headers may map to the same output (e.g. two "data publicare" cols).
# Keys MUST be lowercase: normalize_header lowercases every header, so a key
# containing an uppercase letter can never match.
# Schema variants seen across data.gov.ro yearly dumps:
# - 2024 (CSV, comma): "Autoritate contractanta", "Numar anunt", "Cod CPV"
# - 2022/2023 (CSV/pipe, |QUOTE|): "DENUMIRE_AC", "NUMAR_ANUNT_ATRIBUIRE", "COD_CPV"
# - 2017/2018 (^-delim): "AutoritateContractanta", "NumarAnuntAtribuire", "CPVCode"
HEADER_MAP = {
    # 2024 standard CSV
    "autoritate contractanta": "authority_name",
    "cui": "authority_cui",
    "cui autoritate contractanta": "authority_cui",
    "cod cpv": "cpv_code",
    "denumire cpv": "cpv_name",
    "tip contract": "contract_type",
    "tip procedura": "procedure_type",
    "tip legislatie": "legislation",
    "tip incheiere contract": "award_type",
    "tip inchiere contract": "award_type",  # typo seen in 2023 T1 XLS
    "tip criteriu de atribuire": "criterion",
    "numar anunt atribuire": "ref_number",
    "numar anunt initiere": "ref_initiere",
    "numar anunt": "ref_number",
    "numar contract": "contract_number",
    "numar lot": "lot_number",
    "data contract": "contract_date",
    "data publicare": "publication_date",
    "data publicare anunt atribuire": "publication_date",  # 2023 T4 standard CSV
    "data anunt atribuire": "publication_date",  # 2023 T1 XLS, 2017 ^-delim
    "data anunt initiere": "ref_initiere_date",
    "data publicare anunt initiere": "ref_initiere_date",
    "data publicare anunt": "publication_date",  # 2023 T4 atribuire-fara
    "valoare atribuita (ron)": "awarded_value",
    "valoare estimata procedura": "estimated_value",
    "moneda valoare estimata procedura": "estimated_currency",
    "denumire procedura": "procedure_name",
    "tip activitate autoritate": "authority_activity",
    "criteriu de atribuire": "criterion",
    "denumire contract": "contract_title",
    "oras ofertant castigator": "supplier_city",
    "tara ofertant castigator": "supplier_country",
    "data publicare contract": "contract_date",
    "tip activitate": "authority_activity",
    "tip autoritate": "authority_type",
    "tip anunt": "announcement_type",
    "criterii de atribuire": "criterion",
    "licitatie electronica": "electronic_auction",
    "ofertant castigator": "supplier_name",
    "cui ofertant castigator": "supplier_cui",
    "oras ofertant": "supplier_city",
    "tara ofertant": "supplier_country",
    "incheiat prin": "award_type",
    "valoare contract (ron)": "awarded_value",
    "valoare contract": "awarded_value",
    "valoare estimata (ron)": "estimated_value",
    "valoare estimata": "estimated_value",
    "ofertant": "supplier_name",  # also appears in the 2022/2023 schema
    "cui ofertant": "supplier_cui",
    "cui castigator": "supplier_cui",
    "castigator": "supplier_name",  # also 2017/2018 legacy and 2018-2019 XLS
    "oras": "supplier_city",
    "tara": "supplier_country",
    "modalitate de desfasurare": "modality",
    # 2022/2023 UPPER_SNAKE_CASE pipe-delim schema
    "denumire_ac": "authority_name",
    "cui_ac": "authority_cui",
    "cui_autoritate": "authority_cui",
    "autoritate_contractanta": "authority_name",
    "numar_anunt_atribuire": "ref_number",
    "numar_anunt": "ref_number",
    "data_anunt_atribuire": "publication_date",
    "data_publicare": "publication_date",
    "data_publicare_ai": "ref_initiere_date",
    "data_contract": "contract_date",
    "numar_contract": "contract_number",
    "denumire_contract": "contract_title",
    "cod_cpv": "cpv_code",
    "cod_cpv_procedura": "cpv_code",
    "cpv_code": "cpv_code",  # 2023 schema variant, also 2018-2019 XLS
    "denumire_cpv": "cpv_name",
    "denumire_cpv_procedura": "cpv_name",
    "tip_contract": "contract_type",
    "tip_procedura": "procedure_type",
    "tip_legislatie": "legislation",
    "tip_lesiglatie": "legislation",  # SEAP typo present in many 2023 files
    "tip_anunt": "announcement_type",
    "tip_incheiere_contract": "award_type",  # also 2018-2019 XLS
    "incheiat_prin": "award_type",
    "valoare_contract_ron": "awarded_value",
    "valoare_atribuita": "awarded_value",
    "valoare_estimata_procedura": "estimated_value",
    "cui_of": "supplier_cui",
    "nume_castigator": "supplier_name",
    "cui_castigator": "supplier_cui",
    "oras_castigator": "supplier_city",
    "tara_castigator": "supplier_country",
    "modalitate_desfasurare": "modality",
    "modalitate_atribuire": "modality",
    "tip_criterii_atribuire": "criterion",  # also 2018-2019 XLS
    "criteriu_de_atribuire": "criterion",
    "numar_anunt_ai": "ref_initiere",
    "numar_anunt_initiere": "ref_initiere",
    "data_anunt_initiere": "ref_initiere_date",
    "denumire_procedura": "procedure_name",
    # 2017/2018 ^-delim CamelCase legacy schema
    "castigatorcui": "supplier_cui",
    "castigatortara": "supplier_country",
    "castigatorlocalitate": "supplier_city",
    "castigatoradresa": "supplier_address",
    "tipcontract": "contract_type",
    "tipprocedura": "procedure_type",
    "autoritatecontractanta": "authority_name",
    "autoritatecontractantacui": "authority_cui",
    "tipac": "authority_type",
    "tipactivitateac": "authority_activity",
    "denumireac": "authority_name",
    "numaranuntatribuire": "ref_number",
    "numaranuntparticipare": "ref_initiere",
    "numaranunt": "ref_number",
    "dataanuntatribuire": "publication_date",
    "dataanuntparticipare": "ref_initiere_date",
    "datapublicare": "publication_date",
    "tipincheierecontract": "award_type",
    "tipcriteriiatribuire": "criterion",
    "culicitatieelectronica": "electronic_auction",
    "numarofertepre primite": "n_offers",
    # Was "numarofertePrimite": the capital P made the key unreachable,
    # because headers are lowercased before lookup.
    "numaroferteprimite": "n_offers",
    "subcontractat": "subcontracted",
    "numarcontract": "contract_number",
    "datacontract": "contract_date",
    "titlucontract": "contract_title",
    "valoare": "awarded_value_orig",  # may be in non-RON currency for 2017
    "moneda": "currency",
    "valoareron": "awarded_value",
    "valoareeur": "awarded_value_eur",
    "cpvcodeid": "cpv_code_id",  # internal SEAP id, not CPV
    "cpvcode": "cpv_code",  # actual CPV like 85150000-5
    "valoareestimataparticipare": "estimated_value",
    "monedavaloareestimataparticipare": "estimated_currency",
    "fonduricomunitare": "eu_funded",
    "tipfinantare": "funding_type",
    "tiplegislatieid": "legislation",
    "fondeuropean": "eu_fund",
    "contractperiodic": "periodic",
    "depozitegarantii": "deposits",
    "modalitatifinantare": "funding_modes",
    "tip": "announcement_subtype",  # 2017 contracte has bare "Tip"
    # 2018-2019 XLS schema (UPPER_SNAKE with explicit underscores)
    "castigator_cui": "supplier_cui",
    "castigator_tara": "supplier_country",
    "castigator_localitate": "supplier_city",
    "castigaor_localitate": "supplier_city",  # SEAP typo seen in 2018 T2 XLS
    "castigator_adresa": "supplier_address",
    "tip_ac": "authority_type",
    "tip_activitate_ac": "authority_activity",
    "autoritate_contractanta_cui": "authority_cui",
    "numar_anunt_participare": "ref_initiere",
    "data_anunt_participare": "ref_initiere_date",
    "cu_licitatie_electronica": "electronic_auction",
    "numar_oferte_primite": "n_offers",
    "titlu_contract": "contract_title",
    "valoare_ron": "awarded_value",
    "valoare_eur": "awarded_value_eur",
    "valoare_estimata_participare": "estimated_value",
    "moneda_valoare_estimata_participare": "estimated_currency",
    "fonduri_comunitare": "eu_funded",
    "tip_finantare": "funding_type",
    "tip_legislatie_id": "legislation",
    "fond_european": "eu_fund",
    "contract_periodic": "periodic",
    "depozite_garantii": "deposits",
    "modalitati_finantare": "funding_modes",
    "cpv_code_id": "cpv_code_id",
}
def _valid_iso_date(year: int, month: int, day: int) -> str | None:
    """Return ``YYYY-MM-DD`` when the triple is a real calendar date, else None."""
    try:
        datetime(year, month, day)  # raises ValueError for impossible dates
    except ValueError:
        return None
    return f"{year:04d}-{month:02d}-{day:02d}"


def parse_date(s: str | None) -> str | None:
    """Convert a SEAP date cell to an ISO ``YYYY-MM-DD`` string.

    Accepted layouts, tried in this order (anything after the date part,
    such as a time, is ignored):

    - ``MM/DD/YYYY``  (slash dates are read month-first)
    - ``DD.MM.YYYY``
    - ``YYYY-MM-DD``

    Every layout is calendar-validated, including the ISO one, so an
    impossible value such as ``2024-13-45`` yields ``None`` instead of
    reaching the database.

    Args:
        s: Raw cell text, or ``None``.

    Returns:
        The ISO date, or ``None`` for empty, unrecognised or invalid input.
    """
    if not s:
        return None
    s = s.strip()
    if not s:
        return None

    # MM/DD/YYYY 01:35:39
    m = re.match(r"^(\d{1,2})/(\d{1,2})/(\d{4})", s)
    if m:
        return _valid_iso_date(int(m[3]), int(m[1]), int(m[2]))

    # DD.MM.YYYY
    m = re.match(r"^(\d{1,2})\.(\d{1,2})\.(\d{4})", s)
    if m:
        return _valid_iso_date(int(m[3]), int(m[2]), int(m[1]))

    # YYYY-MM-DD[...] — previously passed through unchecked.
    m = re.match(r"^(\d{4})-(\d{2})-(\d{2})", s)
    if m:
        return _valid_iso_date(int(m[1]), int(m[2]), int(m[3]))

    return None
def _format_amount(text: str) -> str | None:
    """Format a plain ``1234.5``-style numeral with two decimals, else None."""
    try:
        return f"{float(text):.2f}"
    except ValueError:
        return None


def parse_number(s: str | None) -> str | None:
    """Parse a SEAP amount into a two-decimal, dot-decimal string.

    SEAP CSVs use MIXED conventions:

    - ``1.234.567,89`` → period=thousand, comma=decimal → ``1234567.89``
    - ``1,234,567.89`` → comma=thousand, period=decimal → ``1234567.89``
    - ``123,126``      → comma=thousand (3 digits after) → ``123126.00``
    - ``12345,67``     → comma=decimal (1-2 digits after) → ``12345.67``
    - ``1,234,567``    → all commas=thousand → ``1234567.00``

    When both separators are present, the right-most one is the decimal
    mark. Assuming RO format unconditionally turned ``1,234.56`` into
    ``1.23``. With a lone comma the heuristic is: exactly three digits
    after it means a thousands separator, otherwise a decimal mark.

    Args:
        s: Raw cell text, or ``None``.

    Returns:
        The amount as a string with two decimals, or ``None`` when the
        cell is empty, is ``-``, or cannot be parsed.
    """
    if not s:
        return None
    s = s.strip().strip('"').replace("\xa0", "").replace(" ", "")
    if not s or s == "-":
        return None

    has_comma = "," in s
    if has_comma and "." in s:
        if s.rfind(",") > s.rfind("."):
            # RO style: 1.234.567,89
            plain = s.replace(".", "").replace(",", ".")
        else:
            # US style: 1,234,567.89
            plain = s.replace(",", "")
        return _format_amount(plain)

    # Two or more commas and no period: all are thousands separators.
    if s.count(",") >= 2:
        try:
            return f"{int(s.replace(',', '')):d}.00"
        except ValueError:
            return None

    # Exactly one comma: decide by the number of digits after it.
    if has_comma:
        whole, _, frac = s.partition(",")
        if whole.isdigit() and frac.isdigit():
            if len(frac) == 3:
                # Thousand separator (most common SEAP case)
                try:
                    return f"{int(whole + frac)}.00"
                except ValueError:
                    return None
            return _format_amount(whole + "." + frac)

    return _format_amount(s)
def normalize_cui(s: str | None) -> str | None:
    """Reduce a CUI cell to its bare digits.

    Surrounding whitespace and double quotes are trimmed and a leading
    ``RO`` (any case, optionally followed by whitespace) is removed.

    Args:
        s: Raw cell text, or ``None``.

    Returns:
        The digit string, or ``None`` if the input is empty or what
        remains is not made up solely of digits.
    """
    if not s:
        return None
    unquoted = s.strip().strip('"')
    bare = re.sub(r"^RO\s*", "", unquoted, flags=re.IGNORECASE).strip()
    # str.isdigit() is False for "", so the empty case needs no extra test.
    return bare if bare.isdigit() else None
def _cell(row: list[str], idx: int | None) -> str | None:
    """Return the cleaned cell at *idx*, or None if absent, out of range or empty."""
    if idx is None or idx >= len(row):
        return None
    # Some dumps leave literal `|` bookends around values.
    value = row[idx].strip().strip("|").strip()
    return value or None


def _tsv_field(value: str | None) -> str:
    """Render one value for the TSV output; None becomes the COPY null marker."""
    if value is None:
        return "\\N"
    # Backslashes are doubled; tabs and newlines would break the row
    # structure, so they are flattened to spaces (CR is dropped).
    return (
        str(value)
        .replace("\\", "\\\\")
        .replace("\t", " ")
        .replace("\n", " ")
        .replace("\r", "")
    )


def main() -> None:
    """Command-line entry point: convert one SEAP CSV into a clean TSV.

    Reads ``CSV_PATH OUTPUT_TSV TYPE SOURCE`` from ``sys.argv``. The
    dialect is sniffed from the first line, headers are mapped through
    ``HEADER_MAP``, and one TSV row is written per distinct
    ``(TYPE, ref_number)`` — the first occurrence wins. Progress goes to
    stderr, the final summary to stdout.

    Exits with status 2 on a wrong argument count, and status 1 when the
    input file is missing or contains no header row.
    """
    if len(sys.argv) != 5:
        print(__doc__)
        sys.exit(2)
    csv_path = Path(sys.argv[1])
    out_path = Path(sys.argv[2])
    record_type = sys.argv[3]
    source = sys.argv[4]
    if not csv_path.exists():
        print(f"ERROR: {csv_path} does not exist", file=sys.stderr)
        sys.exit(1)

    seen: set[tuple[str, str]] = set()
    out_cols = [
        "type", "ref_number", "authority_name", "authority_cui",
        "cpv_code", "cpv_name", "contract_type", "publication_date",
        "contract_date", "awarded_value", "supplier_name", "supplier_cui",
        "procedure_type", "legislation", "source",
    ]
    written = 0
    skipped_dup = 0
    skipped_no_ref = 0
    total = 0

    # Sniff first line to detect delimiter/quotechar
    with csv_path.open("r", encoding="utf-8-sig", errors="replace") as f:
        first_line = f.readline()
    delim, quotechar = detect_dialect(first_line)
    print(f"[import] delim={delim!r} quote={quotechar!r}", file=sys.stderr)

    # newline="" lets the csv module do its own line-ending handling, as
    # its documentation requires for files passed to csv.reader.
    with csv_path.open("r", encoding="utf-8-sig", errors="replace", newline="") as f, \
            out_path.open("w", encoding="utf-8") as out:
        if quotechar:
            reader = csv.reader(f, delimiter=delim, quotechar=quotechar)
        else:
            reader = csv.reader(f, delimiter=delim, quoting=csv.QUOTE_NONE)

        # Skip "title" rows — some XLS exports begin with a single-cell
        # title (rest empty), then the real header row follows.
        header_raw = next(reader, None)
        if header_raw is not None:
            non_empty = sum(1 for h in header_raw if h.strip().strip("|").strip())
            if non_empty <= 1:
                print("[import] skipping title row, advancing to next", file=sys.stderr)
                header_raw = next(reader, None)
        if header_raw is None:
            # Empty or title-only file: fail cleanly instead of StopIteration.
            print(f"ERROR: {csv_path} has no header row", file=sys.stderr)
            sys.exit(1)

        # Strip pipe-quote artifacts: 2022 fields look like |"FIELD"| with literal | bookends
        header = [normalize_header(h.strip().strip("|").strip()) for h in header_raw]

        # Build column index map. For dup headers (2× "data publicare"), LAST wins.
        col_idx: dict[str, int] = {}
        for i, h in enumerate(header):
            mapped = HEADER_MAP.get(h)
            if mapped:
                col_idx[mapped] = i

        has_ref = "ref_number" in col_idx or (
            record_type == "initiere" and "ref_initiere" in col_idx
        )
        if not has_ref:
            print(
                "[import] WARNING: no reference-number column recognised; "
                "every row will be skipped",
                file=sys.stderr,
            )

        # Header line naming the TSV columns; NULLs in data rows are written as \N.
        out.write("\t".join(out_cols) + "\n")

        for row in reader:
            total += 1
            # Short rows need no padding: _cell treats a missing index as empty.
            rec = {name: _cell(row, idx) for name, idx in col_idx.items()}

            ref = rec.get("ref_number")
            # For initiere imports, files name the ref column "Numar anunt initiere"
            # which we map to ref_initiere. Fall through to that field.
            if not ref and record_type in ("initiere",):
                ref = rec.get("ref_initiere")
            if not ref:
                skipped_no_ref += 1
                continue

            key = (record_type, ref)
            if key in seen:
                skipped_dup += 1
                continue
            seen.add(key)

            fields = {
                "type": record_type,
                "ref_number": ref,
                "authority_name": rec.get("authority_name"),
                "authority_cui": normalize_cui(rec.get("authority_cui")),
                "cpv_code": rec.get("cpv_code"),
                "cpv_name": rec.get("cpv_name"),
                "contract_type": rec.get("contract_type"),
                "publication_date": parse_date(rec.get("publication_date")),
                "contract_date": parse_date(rec.get("contract_date")),
                "awarded_value": parse_number(rec.get("awarded_value")),
                "supplier_name": rec.get("supplier_name"),
                "supplier_cui": normalize_cui(rec.get("supplier_cui")),
                "procedure_type": rec.get("procedure_type"),
                "legislation": rec.get("legislation"),
                "source": source,
            }
            out.write("\t".join(_tsv_field(fields.get(c)) for c in out_cols) + "\n")
            written += 1

    print(f"[import] CSV={csv_path.name}")
    print(f"[import] total rows: {total}")
    print(f"[import] written: {written}")
    print(f"[import] dup-skip: {skipped_dup}")
    print(f"[import] no-ref: {skipped_no_ref}")
    print(f"[import] output: {out_path}")
# Run the importer only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()