initial: split from gov-agreg — vreau.digital standalone platform

Moved from gov-agreg/src/pages/achizitii/* to root (drop prefix). - 22 pages migrated, 127 files total - All internal links: /achizitii/X → /X (176 occurrences fixed) - AchizitiiLayout subnav rewritten: /X paths, top-right link to vreaudigital.ro hub - BaseLayout new (vreau.digital branding, OG tags, site URL) - astro.config.mjs: site https://vreau.digital, server output (was static) - docker-compose: port 5096 (vreaudigital is 5095), container vreau-digital - deploy.sh: paths /opt/vreau-digital, log /var/log/vreau-digital-deploy.log Backend shared with gov-agreg: - PostgreSQL satra (same schemas: seap, firms, anaf, anre, ...) - Photon, Martin tiles - Infisical /vreaudigital path (DATABASE_URL etc. shared) build: PASS (npx astro check 0 errors, npm run build 5s vite + 10s server)
2026-05-13 00:10:32 +03:00
commit a6c03a091e
352 changed files with 75295 additions and 0 deletions
@@ -0,0 +1,483 @@
+#!/usr/bin/env python3
+"""
+SEAP historical CSV importer for data.gov.ro yearly dumps.
+
+Reads a SEAP CSV (any year/quarter/type) and emits a clean TSV that
+PostgreSQL COPY can ingest into seap.announcements. Handles:
+- BOM stripping
+- Romanian decimal commas → dots
+- "MM/DD/YYYY HH:MM:SS" date parsing (with second column variants)
+- Column dedupe by (type, ref_number) — first-row-wins for multi-lot CANs
+- CUI normalization (strip "RO " prefix)
+
+Usage:
+    python3 import-seap-historical.py CSV_PATH OUTPUT_TSV TYPE SOURCE
+        TYPE: 'contract' | 'da' | 'initiere' | 'atribuire_fara' | 'modificare'
+        SOURCE: e.g. 'datagov_2024_t1_contracte'
+
+The output TSV columns are FIXED (15 columns matching the import SQL):
+    type, ref_number, authority_name, authority_cui, cpv_code, cpv_name,
+    contract_type, publication_date, contract_date, awarded_value,
+    supplier_name, supplier_cui, procedure_type, legislation, source
+
+Column mapping is inferred from CSV headers (case+diacritic-insensitive).
+Falls back gracefully when columns are missing (older years had fewer cols).
+"""
+from __future__ import annotations
+
+import csv
+import re
+import sys
+import unicodedata
+from datetime import datetime
+from pathlib import Path
+
+
+def normalize_header(s: str) -> str:
+    """Strip BOM, lowercase, strip diacritics, collapse whitespace."""
+    s = s.replace("", "").strip().lower()
+    s = "".join(c for c in unicodedata.normalize("NFD", s) if unicodedata.category(c) != "Mn")
+    s = re.sub(r"\s+", " ", s)
+    s = s.replace("?", "")
+    return s.strip()
+
+
+def detect_dialect(first_line: str) -> tuple[str, str | None]:
+    """Detect delimiter and quote char from first line.
+
+    SEAP historical CSVs vary wildly:
+    - 2017/2018: ^ delim, no quote
+    - 2022:      , delim, | quote (header looks like |FIELD|,|FIELD|)
+    - 2023 T3:   | delim, " quote (header: FIELD|FIELD with row "txt"|"txt")
+    - 2023 T4:   , delim, " quote (standard CSV with title-case headers)
+    - 2024+:     , delim, " quote (standard CSV)
+    Returns (delim, quotechar_or_None).
+    """
+    # Strip BOM (efbb bf) and lstrip whitespace before sniffing
+    s = first_line
+    if s.startswith(""):
+        s = s[1:]
+    s_strip = s.lstrip()
+    # 2022 wire format: header LINE starts with `|` and uses `|FIELD|,|FIELD|`
+    # → delim=',' quote='|'
+    if s_strip.startswith("|") and "|," in s_strip:
+        return (",", "|")
+    counts = {c: s.count(c) for c in [",", "|", "^", ";", "\t"]}
+    # Pick highest-count delimiter
+    delim = max(counts, key=counts.get)
+    if counts[delim] == 0:
+        delim = ","
+    if delim == "|":
+        return ("|", '"')
+    if delim == "^":
+        return ("^", None)
+    if delim == ";":
+        return (";", '"')
+    return (",", '"')
+
+
+# Maps normalized header → output column name.
+# Multiple headers may map to the same output (e.g. two "data publicare" cols).
+# Schema variants seen across data.gov.ro yearly dumps:
+# - 2024 (CSV, comma): "Autoritate contractanta", "Numar anunt", "Cod CPV"
+# - 2022/2023 (CSV/pipe, |QUOTE|): "DENUMIRE_AC", "NUMAR_ANUNT_ATRIBUIRE", "COD_CPV"
+# - 2017/2018 (^-delim): "AutoritateContractanta", "NumarAnuntAtribuire", "CPVCode"
+HEADER_MAP = {
+    # 2024 standard CSV
+    "autoritate contractanta": "authority_name",
+    "cui": "authority_cui",
+    "cui autoritate contractanta": "authority_cui",
+    "cod cpv": "cpv_code",
+    "denumire cpv": "cpv_name",
+    "tip contract": "contract_type",
+    "tip procedura": "procedure_type",
+    "tip legislatie": "legislation",
+    "tip incheiere contract": "award_type",
+    "tip inchiere contract": "award_type",  # typo seen in 2023 T1 XLS
+    "tip criteriu de atribuire": "criterion",
+    "numar anunt atribuire": "ref_number",
+    "numar anunt initiere": "ref_initiere",
+    "numar anunt": "ref_number",
+    "numar contract": "contract_number",
+    "numar lot": "lot_number",
+    "data contract": "contract_date",
+    "data publicare": "publication_date",
+    "data publicare anunt atribuire": "publication_date",  # 2023 T4 standard CSV
+    "data anunt atribuire": "publication_date",  # 2023 T1 XLS, 2017 ^-delim
+    "data anunt initiere": "ref_initiere_date",
+    "data publicare anunt initiere": "ref_initiere_date",
+    "data publicare anunt": "publication_date",  # 2023 T4 atribuire-fara
+    "valoare atribuita (ron)": "awarded_value",
+    "valoare estimata procedura": "estimated_value",
+    "moneda valoare estimata procedura": "estimated_currency",
+    "denumire procedura": "procedure_name",
+    "tip activitate autoritate": "authority_activity",
+    "criteriu de atribuire": "criterion",
+    "denumire contract": "contract_title",
+    "oras ofertant castigator": "supplier_city",
+    "tara ofertant castigator": "supplier_country",
+    "data publicare contract": "contract_date",
+    "tip activitate": "authority_activity",
+    "tip autoritate": "authority_type",
+    "tip anunt": "announcement_type",
+    "criterii de atribuire": "criterion",
+    "licitatie electronica": "electronic_auction",
+    "ofertant castigator": "supplier_name",
+    "cui ofertant castigator": "supplier_cui",
+    "oras ofertant": "supplier_city",
+    "tara ofertant": "supplier_country",
+    "incheiat prin": "award_type",
+    "valoare contract (ron)": "awarded_value",
+    "valoare contract": "awarded_value",
+    "valoare estimata (ron)": "estimated_value",
+    "valoare estimata": "estimated_value",
+    "ofertant": "supplier_name",
+    "cui ofertant": "supplier_cui",
+    "cui castigator": "supplier_cui",
+    "castigator": "supplier_name",
+    "oras": "supplier_city",
+    "tara": "supplier_country",
+    "modalitate de desfasurare": "modality",
+    # 2022/2023 UPPER_SNAKE_CASE pipe-delim schema
+    "denumire_ac": "authority_name",
+    "cui_ac": "authority_cui",
+    "cui_autoritate": "authority_cui",
+    "autoritate_contractanta": "authority_name",
+    "numar_anunt_atribuire": "ref_number",
+    "numar_anunt": "ref_number",
+    "data_anunt_atribuire": "publication_date",
+    "data_publicare": "publication_date",
+    "data_publicare_ai": "ref_initiere_date",
+    "data_contract": "contract_date",
+    "numar_contract": "contract_number",
+    "denumire_contract": "contract_title",
+    "cod_cpv": "cpv_code",
+    "cod_cpv_procedura": "cpv_code",
+    "cpv_code": "cpv_code",  # 2023 schema variant
+    "denumire_cpv": "cpv_name",
+    "denumire_cpv_procedura": "cpv_name",
+    "tip_contract": "contract_type",
+    "tip_procedura": "procedure_type",
+    "tip_legislatie": "legislation",
+    "tip_lesiglatie": "legislation",  # SEAP typo present in many 2023 files
+    "tip_anunt": "announcement_type",
+    "tip_incheiere_contract": "award_type",
+    "incheiat_prin": "award_type",
+    "valoare_contract_ron": "awarded_value",
+    "valoare_atribuita": "awarded_value",
+    "valoare_estimata_procedura": "estimated_value",
+    "ofertant": "supplier_name",
+    "cui_of": "supplier_cui",
+    "nume_castigator": "supplier_name",
+    "cui_castigator": "supplier_cui",
+    "oras_castigator": "supplier_city",
+    "tara_castigator": "supplier_country",
+    "modalitate_desfasurare": "modality",
+    "modalitate_atribuire": "modality",
+    "tip_criterii_atribuire": "criterion",
+    "criteriu_de_atribuire": "criterion",
+    "numar_anunt_ai": "ref_initiere",
+    "numar_anunt_initiere": "ref_initiere",
+    "data_anunt_initiere": "ref_initiere_date",
+    "denumire_procedura": "procedure_name",
+    # 2017/2018 ^-delim CamelCase legacy schema
+    "castigator": "supplier_name",  # already exists for 2024 but also legacy
+    "castigatorcui": "supplier_cui",
+    "castigatortara": "supplier_country",
+    "castigatorlocalitate": "supplier_city",
+    "castigatoradresa": "supplier_address",
+    "tipcontract": "contract_type",
+    "tipprocedura": "procedure_type",
+    "autoritatecontractanta": "authority_name",
+    "autoritatecontractantacui": "authority_cui",
+    "tipac": "authority_type",
+    "tipactivitateac": "authority_activity",
+    "denumireac": "authority_name",
+    "numaranuntatribuire": "ref_number",
+    "numaranuntparticipare": "ref_initiere",
+    "numaranunt": "ref_number",
+    "dataanuntatribuire": "publication_date",
+    "dataanuntparticipare": "ref_initiere_date",
+    "datapublicare": "publication_date",
+    "tipincheierecontract": "award_type",
+    "tipcriteriiatribuire": "criterion",
+    "culicitatieelectronica": "electronic_auction",
+    "numarofertepre primite": "n_offers",
+    "numarofertePrimite": "n_offers",
+    "subcontractat": "subcontracted",
+    "numarcontract": "contract_number",
+    "datacontract": "contract_date",
+    "titlucontract": "contract_title",
+    "valoare": "awarded_value_orig",  # may be in non-RON currency for 2017
+    "moneda": "currency",
+    "valoareron": "awarded_value",
+    "valoareeur": "awarded_value_eur",
+    "cpvcodeid": "cpv_code_id",  # internal SEAP id, not CPV
+    "cpvcode": "cpv_code",  # actual CPV like 85150000-5
+    "valoareestimataparticipare": "estimated_value",
+    "monedavaloareestimataparticipare": "estimated_currency",
+    "fonduricomunitare": "eu_funded",
+    "tipfinantare": "funding_type",
+    "tiplegislatieid": "legislation",
+    "fondeuropean": "eu_fund",
+    "contractperiodic": "periodic",
+    "depozitegarantii": "deposits",
+    "modalitatifinantare": "funding_modes",
+    "tip": "announcement_subtype",  # 2017 contracte has bare "Tip"
+    # 2018-2019 XLS schema (UPPER_SNAKE with explicit underscores)
+    "castigator": "supplier_name",
+    "castigator_cui": "supplier_cui",
+    "castigator_tara": "supplier_country",
+    "castigator_localitate": "supplier_city",
+    "castigaor_localitate": "supplier_city",  # SEAP typo seen in 2018 T2 XLS
+    "castigator_adresa": "supplier_address",
+    "tip_ac": "authority_type",
+    "tip_activitate_ac": "authority_activity",
+    "autoritate_contractanta_cui": "authority_cui",
+    "numar_anunt_participare": "ref_initiere",
+    "data_anunt_participare": "ref_initiere_date",
+    "tip_incheiere_contract": "award_type",
+    "tip_criterii_atribuire": "criterion",
+    "cu_licitatie_electronica": "electronic_auction",
+    "numar_oferte_primite": "n_offers",
+    "titlu_contract": "contract_title",
+    "valoare_ron": "awarded_value",
+    "valoare_eur": "awarded_value_eur",
+    "valoare_estimata_participare": "estimated_value",
+    "moneda_valoare_estimata_participare": "estimated_currency",
+    "fonduri_comunitare": "eu_funded",
+    "tip_finantare": "funding_type",
+    "tip_legislatie_id": "legislation",
+    "fond_european": "eu_fund",
+    "contract_periodic": "periodic",
+    "depozite_garantii": "deposits",
+    "modalitati_finantare": "funding_modes",
+    "cpv_code_id": "cpv_code_id",
+    "cpv_code": "cpv_code",
+}
+
+
+def parse_date(s: str | None) -> str | None:
+    """Parse MM/DD/YYYY [HH:MM:SS] or DD.MM.YYYY → ISO YYYY-MM-DD."""
+    if not s:
+        return None
+    s = s.strip()
+    if not s:
+        return None
+    # MM/DD/YYYY 01:35:39
+    m = re.match(r"^(\d{1,2})/(\d{1,2})/(\d{4})", s)
+    if m:
+        try:
+            mm, dd, yy = int(m[1]), int(m[2]), int(m[3])
+            datetime(yy, mm, dd)  # validate
+            return f"{yy:04d}-{mm:02d}-{dd:02d}"
+        except ValueError:
+            return None
+    # DD.MM.YYYY
+    m = re.match(r"^(\d{1,2})\.(\d{1,2})\.(\d{4})", s)
+    if m:
+        try:
+            dd, mm, yy = int(m[1]), int(m[2]), int(m[3])
+            datetime(yy, mm, dd)
+            return f"{yy:04d}-{mm:02d}-{dd:02d}"
+        except ValueError:
+            return None
+    # YYYY-MM-DD passthrough
+    if re.match(r"^\d{4}-\d{2}-\d{2}", s):
+        return s[:10]
+    return None
+
+
+def parse_number(s: str | None) -> str | None:
+    """Parse Romanian number → ISO float string.
+
+    SEAP CSV uses MIXED conventions:
+    - "1.234.567,89" → period=thousand, comma=decimal → 1234567.89
+    - "123,126"      → comma=THOUSAND (3 digits after) → 123126
+    - "12345,67"     → comma=decimal (2 digits after) → 12345.67
+    - "1,234,567"    → all commas=thousand → 1234567
+    Heuristic: digits-after-final-comma == 3 → thousand separator,
+    otherwise → decimal. Robust to most real RO data.
+    """
+    if not s:
+        return None
+    s = s.strip().strip('"').replace("\xa0", "").replace(" ", "")
+    if not s or s == "-":
+        return None
+
+    # Mixed period+comma → assume RO format (period thousand, comma decimal)
+    if "," in s and "." in s:
+        s = s.replace(".", "").replace(",", ".")
+        try:
+            return f"{float(s):.2f}"
+        except ValueError:
+            return None
+
+    # Multi-comma → all thousand separators
+    if s.count(",") >= 2:
+        try:
+            return f"{int(s.replace(',', '')):d}.00"
+        except ValueError:
+            return None
+
+    # Single comma → check digits after
+    if "," in s:
+        parts = s.split(",")
+        if len(parts) == 2 and parts[0].isdigit() and parts[1].isdigit():
+            digits_after = len(parts[1])
+            if digits_after == 3:
+                # Thousand separator (most common SEAP case)
+                try:
+                    return f"{int(parts[0] + parts[1])}.00"
+                except ValueError:
+                    return None
+            # 1-2 digits after → decimal separator
+            try:
+                return f"{float(parts[0] + '.' + parts[1]):.2f}"
+            except ValueError:
+                return None
+
+    try:
+        return f"{float(s):.2f}"
+    except ValueError:
+        return None
+
+
+def normalize_cui(s: str | None) -> str | None:
+    if not s:
+        return None
+    s = s.strip().strip('"')
+    s = re.sub(r"^RO\s*", "", s, flags=re.IGNORECASE)
+    s = s.strip()
+    if not s or not s.isdigit():
+        return None
+    return s
+
+
+def main() -> None:
+    if len(sys.argv) != 5:
+        print(__doc__)
+        sys.exit(2)
+
+    csv_path = Path(sys.argv[1])
+    out_path = Path(sys.argv[2])
+    record_type = sys.argv[3]
+    source = sys.argv[4]
+
+    if not csv_path.exists():
+        print(f"ERROR: {csv_path} does not exist", file=sys.stderr)
+        sys.exit(1)
+
+    seen: set[tuple[str, str]] = set()
+    out_cols = [
+        "type", "ref_number", "authority_name", "authority_cui",
+        "cpv_code", "cpv_name", "contract_type", "publication_date",
+        "contract_date", "awarded_value", "supplier_name", "supplier_cui",
+        "procedure_type", "legislation", "source",
+    ]
+
+    written = 0
+    skipped_dup = 0
+    skipped_no_ref = 0
+    total = 0
+
+    # Sniff first line to detect delimiter/quotechar
+    with csv_path.open("r", encoding="utf-8-sig", errors="replace") as f:
+        first_line = f.readline()
+    delim, quotechar = detect_dialect(first_line)
+    print(f"[import]   delim={delim!r} quote={quotechar!r}", file=sys.stderr)
+
+    with csv_path.open("r", encoding="utf-8-sig", errors="replace") as f, \
+         out_path.open("w", encoding="utf-8") as out:
+        if quotechar:
+            reader = csv.reader(f, delimiter=delim, quotechar=quotechar)
+        else:
+            reader = csv.reader(f, delimiter=delim, quoting=csv.QUOTE_NONE)
+        # Skip "title" rows — some XLS exports begin with a single-cell
+        # title (rest empty), then the real header row follows.
+        header_raw = next(reader)
+        non_empty = sum(1 for h in header_raw if h.strip().strip("|").strip())
+        if non_empty <= 1:
+            print("[import]   skipping title row, advancing to next", file=sys.stderr)
+            header_raw = next(reader)
+        # Strip pipe-quote artifacts: 2022 fields look like |"FIELD"| with literal | bookends
+        header_raw = [h.strip().strip("|").strip() for h in header_raw]
+        header = [normalize_header(h) for h in header_raw]
+
+        # Build column index map. For dup headers (2× "data publicare"), LAST wins.
+        col_idx: dict[str, int] = {}
+        for i, h in enumerate(header):
+            mapped = HEADER_MAP.get(h)
+            if mapped:
+                col_idx[mapped] = i
+
+        # Write header line for COPY (\\\\N markers for nulls)
+        out.write("\t".join(out_cols) + "\n")
+
+        for row in reader:
+            total += 1
+            if len(row) < len(header):
+                row = row + [""] * (len(header) - len(row))
+
+            def get(col: str) -> str | None:
+                idx = col_idx.get(col)
+                if idx is None or idx >= len(row):
+                    return None
+                v = row[idx].strip().strip("|").strip()
+                return v if v else None
+
+            ref = get("ref_number")
+            # For initiere imports, files name the ref column "Numar anunt initiere"
+            # which we map to ref_initiere. Fall through to that field.
+            if not ref and record_type in ("initiere",):
+                ref = get("ref_initiere")
+            if not ref:
+                skipped_no_ref += 1
+                continue
+
+            key = (record_type, ref)
+            if key in seen:
+                skipped_dup += 1
+                continue
+            seen.add(key)
+
+            fields = {
+                "type": record_type,
+                "ref_number": ref,
+                "authority_name": get("authority_name"),
+                "authority_cui": normalize_cui(get("authority_cui")),
+                "cpv_code": get("cpv_code"),
+                "cpv_name": get("cpv_name"),
+                "contract_type": get("contract_type"),
+                "publication_date": parse_date(get("publication_date")),
+                "contract_date": parse_date(get("contract_date")),
+                "awarded_value": parse_number(get("awarded_value")),
+                "supplier_name": get("supplier_name"),
+                "supplier_cui": normalize_cui(get("supplier_cui")),
+                "procedure_type": get("procedure_type"),
+                "legislation": get("legislation"),
+                "source": source,
+            }
+
+            line_parts = []
+            for c in out_cols:
+                v = fields.get(c)
+                if v is None:
+                    line_parts.append("\\N")
+                else:
+                    # Escape tabs, newlines, backslashes for COPY format
+                    v = str(v).replace("\\", "\\\\").replace("\t", " ").replace("\n", " ").replace("\r", "")
+                    line_parts.append(v)
+            out.write("\t".join(line_parts) + "\n")
+            written += 1
+
+    print(f"[import] CSV={csv_path.name}")
+    print(f"[import]   total rows: {total}")
+    print(f"[import]   written:    {written}")
+    print(f"[import]   dup-skip:   {skipped_dup}")
+    print(f"[import]   no-ref:     {skipped_no_ref}")
+    print(f"[import]   output:     {out_path}")
+
+
+if __name__ == "__main__":
+    main()