a6c03a091e
Moved from gov-agreg/src/pages/achizitii/* to root (drop prefix). - 22 pages migrated, 127 files total - All internal links: /achizitii/X → /X (176 occurrences fixed) - AchizitiiLayout subnav rewritten: /X paths, top-right link to vreaudigital.ro hub - BaseLayout new (vreau.digital branding, OG tags, site URL) - astro.config.mjs: site https://vreau.digital, server output (was static) - docker-compose: port 5096 (vreaudigital is 5095), container vreau-digital - deploy.sh: paths /opt/vreau-digital, log /var/log/vreau-digital-deploy.log Backend shared with gov-agreg: - PostgreSQL satra (same schemas: seap, firms, anaf, anre, ...) - Photon, Martin tiles - Infisical /vreaudigital path (DATABASE_URL etc. shared) build: PASS (npx astro check 0 errors, npm run build 5s vite + 10s server)
484 lines
18 KiB
Python
484 lines
18 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
SEAP historical CSV importer for data.gov.ro yearly dumps.
|
||
|
||
Reads a SEAP CSV (any year/quarter/type) and emits a clean TSV that
|
||
PostgreSQL COPY can ingest into seap.announcements. Handles:
|
||
- BOM stripping
|
||
- Romanian decimal commas → dots
|
||
- "MM/DD/YYYY HH:MM:SS" date parsing (with second column variants)
|
||
- Row dedupe by (type, ref_number) — first-row-wins for multi-lot CANs
|
||
- CUI normalization (strip "RO " prefix)
|
||
|
||
Usage:
|
||
python3 import-seap-historical.py CSV_PATH OUTPUT_TSV TYPE SOURCE
|
||
TYPE: 'contract' | 'da' | 'initiere' | 'atribuire_fara' | 'modificare'
|
||
SOURCE: e.g. 'datagov_2024_t1_contracte'
|
||
|
||
The output TSV columns are FIXED (15 columns matching the import SQL):
|
||
type, ref_number, authority_name, authority_cui, cpv_code, cpv_name,
|
||
contract_type, publication_date, contract_date, awarded_value,
|
||
supplier_name, supplier_cui, procedure_type, legislation, source
|
||
|
||
Column mapping is inferred from CSV headers (case+diacritic-insensitive).
|
||
Falls back gracefully when columns are missing (older years had fewer cols).
|
||
"""
|
||
from __future__ import annotations
|
||
|
||
import csv
|
||
import re
|
||
import sys
|
||
import unicodedata
|
||
from datetime import datetime
|
||
from pathlib import Path
|
||
|
||
|
||
def normalize_header(s: str) -> str:
    """Return the canonical lookup form of a raw CSV header cell.

    The result is what ``HEADER_MAP`` keys are compared against.

    Steps, in order:
      1. remove byte-order-mark characters (U+FEFF) and trim whitespace;
      2. lowercase;
      3. strip diacritics (NFD-decompose, drop combining marks);
      4. collapse every whitespace run to a single space;
      5. drop ``?`` characters and trim again.

    Args:
        s: Header text exactly as read from the file.

    Returns:
        The normalized header (possibly an empty string).
    """
    # The BOM is written as an explicit escape: a literal U+FEFF is invisible
    # in editors and is easily lost, which silently turns this into a no-op.
    # str.strip() does not treat U+FEFF as whitespace, so it must be removed
    # explicitly.
    s = s.replace("\ufeff", "").strip().lower()
    s = "".join(
        ch
        for ch in unicodedata.normalize("NFD", s)
        if unicodedata.category(ch) != "Mn"
    )
    s = re.sub(r"\s+", " ", s)
    # presumably "?" is what mis-encoded diacritics degrade to in some dumps
    # — TODO confirm against real files
    s = s.replace("?", "")
    return s.strip()
|
||
|
||
|
||
def detect_dialect(first_line: str) -> tuple[str, str | None]:
    """Guess the CSV delimiter and quote character from the first line.

    SEAP historical CSVs vary wildly:
      - 2017/2018: ^ delim, no quote
      - 2022:      , delim, | quote (header looks like |FIELD|,|FIELD|)
      - 2023 T3:   | delim, " quote (header: FIELD|FIELD with row "txt"|"txt")
      - 2023 T4:   , delim, " quote (standard CSV with title-case headers)
      - 2024+:     , delim, " quote (standard CSV)

    Args:
        first_line: The first physical line of the file (a leading BOM is
            tolerated).

    Returns:
        ``(delimiter, quotechar)``; ``quotechar`` is None when the format
        uses no quoting (the ``^``-delimited legacy files).
    """
    # Strip a leading BOM only when one is really there. The BOM is spelled
    # as an escape because a literal one is invisible and, if lost, turns the
    # test into startswith("") — always true — which eats the first character.
    s = first_line
    if s.startswith("\ufeff"):
        s = s[1:]
    s_strip = s.lstrip()

    # 2022 wire format: header line starts with `|` and uses `|FIELD|,|FIELD|`
    # -> delim=',' quote='|'
    if s_strip.startswith("|") and "|," in s_strip:
        return (",", "|")

    candidates = (",", "|", "^", ";", "\t")
    counts = {c: s.count(c) for c in candidates}
    # Highest count wins; on a tie max() keeps the earliest candidate.
    delim = max(counts, key=counts.get)
    if counts[delim] == 0:
        # No candidate present at all (single-column line): assume plain CSV.
        delim = ","

    if delim == "^":
        return ("^", None)
    # Every other delimiter — including tab, which previously fell through
    # and was reported as a comma — is paired with the standard double quote.
    return (delim, '"')
|
||
|
||
|
||
# Maps normalized header → output column name.
|
||
# Multiple headers may map to the same output (e.g. two "data publicare" cols).
|
||
# Schema variants seen across data.gov.ro yearly dumps:
|
||
# - 2024 (CSV, comma): "Autoritate contractanta", "Numar anunt", "Cod CPV"
|
||
# - 2022/2023 (CSV/pipe, |QUOTE|): "DENUMIRE_AC", "NUMAR_ANUNT_ATRIBUIRE", "COD_CPV"
|
||
# - 2017/2018 (^-delim): "AutoritateContractanta", "NumarAnuntAtribuire", "CPVCode"
|
||
# Every key must already be in normalize_header() form: lowercase, no
# diacritics, single spaces. A key containing an uppercase letter can never
# match. Each key appears exactly once; headers shared by several schema
# generations are listed in the first section that uses them.
HEADER_MAP: dict[str, str] = {
    # 2024 standard CSV
    "autoritate contractanta": "authority_name",
    "cui": "authority_cui",
    "cui autoritate contractanta": "authority_cui",
    "cod cpv": "cpv_code",
    "denumire cpv": "cpv_name",
    "tip contract": "contract_type",
    "tip procedura": "procedure_type",
    "tip legislatie": "legislation",
    "tip incheiere contract": "award_type",
    "tip inchiere contract": "award_type",  # typo seen in 2023 T1 XLS
    "tip criteriu de atribuire": "criterion",
    "numar anunt atribuire": "ref_number",
    "numar anunt initiere": "ref_initiere",
    "numar anunt": "ref_number",
    "numar contract": "contract_number",
    "numar lot": "lot_number",
    "data contract": "contract_date",
    "data publicare": "publication_date",
    "data publicare anunt atribuire": "publication_date",  # 2023 T4 standard CSV
    "data anunt atribuire": "publication_date",  # 2023 T1 XLS, 2017 ^-delim
    "data anunt initiere": "ref_initiere_date",
    "data publicare anunt initiere": "ref_initiere_date",
    "data publicare anunt": "publication_date",  # 2023 T4 atribuire-fara
    "valoare atribuita (ron)": "awarded_value",
    "valoare estimata procedura": "estimated_value",
    "moneda valoare estimata procedura": "estimated_currency",
    "denumire procedura": "procedure_name",
    "tip activitate autoritate": "authority_activity",
    "criteriu de atribuire": "criterion",
    "denumire contract": "contract_title",
    "oras ofertant castigator": "supplier_city",
    "tara ofertant castigator": "supplier_country",
    "data publicare contract": "contract_date",
    "tip activitate": "authority_activity",
    "tip autoritate": "authority_type",
    "tip anunt": "announcement_type",
    "criterii de atribuire": "criterion",
    "licitatie electronica": "electronic_auction",
    "ofertant castigator": "supplier_name",
    "cui ofertant castigator": "supplier_cui",
    "oras ofertant": "supplier_city",
    "tara ofertant": "supplier_country",
    "incheiat prin": "award_type",
    "valoare contract (ron)": "awarded_value",
    "valoare contract": "awarded_value",
    "valoare estimata (ron)": "estimated_value",
    "valoare estimata": "estimated_value",
    "ofertant": "supplier_name",  # also used by the 2022/2023 schema
    "cui ofertant": "supplier_cui",
    "cui castigator": "supplier_cui",
    "castigator": "supplier_name",  # also 2017/2018 legacy and 2018-2019 XLS
    "oras": "supplier_city",
    "tara": "supplier_country",
    "modalitate de desfasurare": "modality",
    # 2022/2023 UPPER_SNAKE_CASE pipe-delim schema
    "denumire_ac": "authority_name",
    "cui_ac": "authority_cui",
    "cui_autoritate": "authority_cui",
    "autoritate_contractanta": "authority_name",
    "numar_anunt_atribuire": "ref_number",
    "numar_anunt": "ref_number",
    "data_anunt_atribuire": "publication_date",
    "data_publicare": "publication_date",
    "data_publicare_ai": "ref_initiere_date",
    "data_contract": "contract_date",
    "numar_contract": "contract_number",
    "denumire_contract": "contract_title",
    "cod_cpv": "cpv_code",
    "cod_cpv_procedura": "cpv_code",
    "cpv_code": "cpv_code",  # 2023 schema variant; also 2018-2019 XLS
    "denumire_cpv": "cpv_name",
    "denumire_cpv_procedura": "cpv_name",
    "tip_contract": "contract_type",
    "tip_procedura": "procedure_type",
    "tip_legislatie": "legislation",
    "tip_lesiglatie": "legislation",  # SEAP typo present in many 2023 files
    "tip_anunt": "announcement_type",
    "tip_incheiere_contract": "award_type",  # also 2018-2019 XLS
    "incheiat_prin": "award_type",
    "valoare_contract_ron": "awarded_value",
    "valoare_atribuita": "awarded_value",
    "valoare_estimata_procedura": "estimated_value",
    "cui_of": "supplier_cui",
    "nume_castigator": "supplier_name",
    "cui_castigator": "supplier_cui",
    "oras_castigator": "supplier_city",
    "tara_castigator": "supplier_country",
    "modalitate_desfasurare": "modality",
    "modalitate_atribuire": "modality",
    "tip_criterii_atribuire": "criterion",  # also 2018-2019 XLS
    "criteriu_de_atribuire": "criterion",
    "numar_anunt_ai": "ref_initiere",
    "numar_anunt_initiere": "ref_initiere",
    "data_anunt_initiere": "ref_initiere_date",
    "denumire_procedura": "procedure_name",
    # 2017/2018 ^-delim CamelCase legacy schema (keys are the lowercased form)
    "castigatorcui": "supplier_cui",
    "castigatortara": "supplier_country",
    "castigatorlocalitate": "supplier_city",
    "castigatoradresa": "supplier_address",
    "tipcontract": "contract_type",
    "tipprocedura": "procedure_type",
    "autoritatecontractanta": "authority_name",
    "autoritatecontractantacui": "authority_cui",
    "tipac": "authority_type",
    "tipactivitateac": "authority_activity",
    "denumireac": "authority_name",
    "numaranuntatribuire": "ref_number",
    "numaranuntparticipare": "ref_initiere",
    "numaranunt": "ref_number",
    "dataanuntatribuire": "publication_date",
    "dataanuntparticipare": "ref_initiere_date",
    "datapublicare": "publication_date",
    "tipincheierecontract": "award_type",
    "tipcriteriiatribuire": "criterion",
    "culicitatieelectronica": "electronic_auction",
    # NOTE(review): the next key looks like a mangled variant of
    # "numaroferteprimite" — kept as-is; confirm against real headers.
    "numarofertepre primite": "n_offers",
    # Was "numarofertePrimite": the capital P made the entry unreachable
    # because headers are lowercased before lookup.
    "numaroferteprimite": "n_offers",
    "subcontractat": "subcontracted",
    "numarcontract": "contract_number",
    "datacontract": "contract_date",
    "titlucontract": "contract_title",
    "valoare": "awarded_value_orig",  # may be in non-RON currency for 2017
    "moneda": "currency",
    "valoareron": "awarded_value",
    "valoareeur": "awarded_value_eur",
    "cpvcodeid": "cpv_code_id",  # internal SEAP id, not CPV
    "cpvcode": "cpv_code",  # actual CPV like 85150000-5
    "valoareestimataparticipare": "estimated_value",
    "monedavaloareestimataparticipare": "estimated_currency",
    "fonduricomunitare": "eu_funded",
    "tipfinantare": "funding_type",
    "tiplegislatieid": "legislation",
    "fondeuropean": "eu_fund",
    "contractperiodic": "periodic",
    "depozitegarantii": "deposits",
    "modalitatifinantare": "funding_modes",
    "tip": "announcement_subtype",  # 2017 contracte has bare "Tip"
    # 2018-2019 XLS schema (UPPER_SNAKE with explicit underscores)
    "castigator_cui": "supplier_cui",
    "castigator_tara": "supplier_country",
    "castigator_localitate": "supplier_city",
    "castigaor_localitate": "supplier_city",  # SEAP typo seen in 2018 T2 XLS
    "castigator_adresa": "supplier_address",
    "tip_ac": "authority_type",
    "tip_activitate_ac": "authority_activity",
    "autoritate_contractanta_cui": "authority_cui",
    "numar_anunt_participare": "ref_initiere",
    "data_anunt_participare": "ref_initiere_date",
    "cu_licitatie_electronica": "electronic_auction",
    "numar_oferte_primite": "n_offers",
    "titlu_contract": "contract_title",
    "valoare_ron": "awarded_value",
    "valoare_eur": "awarded_value_eur",
    "valoare_estimata_participare": "estimated_value",
    "moneda_valoare_estimata_participare": "estimated_currency",
    "fonduri_comunitare": "eu_funded",
    "tip_finantare": "funding_type",
    "tip_legislatie_id": "legislation",
    "fond_european": "eu_fund",
    "contract_periodic": "periodic",
    "depozite_garantii": "deposits",
    "modalitati_finantare": "funding_modes",
    "cpv_code_id": "cpv_code_id",
}
|
||
|
||
|
||
def parse_date(s: str | None) -> str | None:
    """Convert a date cell to an ISO ``YYYY-MM-DD`` string.

    Recognised layouts, matched at the start of the trimmed value (whatever
    follows the date part, e.g. a time of day, is ignored):
      - ``MM/DD/YYYY``
      - ``DD.MM.YYYY``
      - ``YYYY-MM-DD``

    Args:
        s: Raw cell text, or None.

    Returns:
        The ISO date, or None when the value is empty, matches no layout,
        or names an impossible calendar date. The ISO layout is validated
        like the others, so a value such as ``2024-13-45`` is rejected
        rather than passed through.
    """
    if not s:
        return None
    s = s.strip()
    if not s:
        return None

    # (pattern, regex group numbers holding year / month / day)
    layouts = (
        (r"^(\d{1,2})/(\d{1,2})/(\d{4})", (3, 1, 2)),   # MM/DD/YYYY 01:35:39
        (r"^(\d{1,2})\.(\d{1,2})\.(\d{4})", (3, 2, 1)),  # DD.MM.YYYY
        (r"^(\d{4})-(\d{2})-(\d{2})", (1, 2, 3)),        # YYYY-MM-DD
    )
    for pattern, (year_grp, month_grp, day_grp) in layouts:
        m = re.match(pattern, s)
        if m is None:
            continue
        yy, mm, dd = int(m[year_grp]), int(m[month_grp]), int(m[day_grp])
        try:
            datetime(yy, mm, dd)  # raises ValueError for impossible dates
        except ValueError:
            return None
        return f"{yy:04d}-{mm:02d}-{dd:02d}"
    return None
|
||
|
||
|
||
def _format_decimal(text: str) -> str | None:
    """Format *text* as a two-decimal string, or None if it is not a finite number."""
    try:
        value = float(text)
    except ValueError:
        return None
    # float() also accepts "nan" / "inf"; those are not usable amounts.
    if value != value or abs(value) == float("inf"):
        return None
    return f"{value:.2f}"


def _format_integer(text: str) -> str | None:
    """Format *text* as an exact whole amount ("<int>.00"), or None if not an integer."""
    try:
        return f"{int(text)}.00"
    except ValueError:
        return None


def parse_number(s: str | None) -> str | None:
    """Parse a SEAP amount into a plain ``"1234.56"`` string (two decimals).

    SEAP CSVs mix separator conventions, so the separators are interpreted
    heuristically:
      - both "," and "." present: the LAST separator is the decimal mark
          "1.234.567,89" -> 1234567.89   (RO style)
          "1,234,567.89" -> 1234567.89   (US style)
      - two or more commas, no period: all commas are thousand separators
          "1,234,567"    -> 1234567.00
      - exactly one comma, no period: exactly 3 digits after it means a
        thousand separator, anything else a decimal mark
          "123,126"      -> 123126.00
          "12345,67"     -> 12345.67
      - two or more periods, no comma: thousand separators, accepted only
        when the digits are grouped 1-3 / 3 / 3 ...
          "1.234.567"    -> 1234567.00
      - otherwise the value is read as a plain decimal number.

    Surrounding whitespace and double quotes, non-breaking spaces and inner
    spaces are removed first. A leading sign is accepted.

    Args:
        s: Raw cell text, or None.

    Returns:
        The formatted amount, or None for empty input, a lone "-", and
        anything that cannot be read as a finite number.
    """
    if not s:
        return None
    s = s.strip().strip('"').replace("\xa0", "").replace(" ", "")
    if not s or s == "-":
        return None

    has_comma = "," in s
    has_period = "." in s

    if has_comma and has_period:
        if s.rfind(",") > s.rfind("."):
            # RO style: period = thousand, comma = decimal.
            candidate = s.replace(".", "").replace(",", ".")
        else:
            # US style: comma = thousand, period = decimal.
            candidate = s.replace(",", "")
        return _format_decimal(candidate)

    if s.count(",") >= 2:
        # Multi-comma -> all thousand separators; int() keeps big values exact.
        return _format_integer(s.replace(",", ""))

    if has_comma:
        whole, frac = s.split(",")
        # Allow one leading sign so "-123,45" is not thrown away.
        unsigned = whole[1:] if whole[:1] in ("+", "-") else whole
        if not (unsigned.isdigit() and frac.isdigit()):
            return None
        if len(frac) == 3:
            # Thousand separator (most common SEAP case).
            return _format_integer(whole + frac)
        return _format_decimal(whole + "." + frac)

    if s.count(".") >= 2:
        head, *groups = s.split(".")
        unsigned = head[1:] if head[:1] in ("+", "-") else head
        well_grouped = (
            unsigned.isdigit()
            and len(unsigned) <= 3
            and all(len(g) == 3 and g.isdigit() for g in groups)
        )
        if not well_grouped:
            return None
        return _format_integer(head + "".join(groups))

    return _format_decimal(s)
|
||
|
||
|
||
def normalize_cui(s: str | None) -> str | None:
    """Reduce a CUI cell to its bare digits.

    Surrounding whitespace and double quotes are trimmed, then a leading
    ``RO`` prefix (any letter case, optionally followed by whitespace) is
    removed.

    Args:
        s: Raw cell text, or None.

    Returns:
        The digit string, or None when the input is empty or anything other
        than digits remains after cleaning.
    """
    if not s:
        return None
    unquoted = s.strip().strip('"')
    bare = re.sub(r"^RO\s*", "", unquoted, flags=re.IGNORECASE).strip()
    return bare if bare and bare.isdigit() else None
|
||
|
||
|
||
def main() -> None:
    """Command-line entry point: convert one SEAP CSV into a TSV file.

    Reads four positional arguments from ``sys.argv``: CSV_PATH, OUTPUT_TSV,
    TYPE and SOURCE. The first line of the CSV is used to detect the
    delimiter and quote character; the header row is then mapped to output
    columns through ``HEADER_MAP``. One TSV line is written per input row
    that has a reference number and whose ``(TYPE, reference)`` pair has not
    been seen before (first row wins). Missing values are written as ``\\N``.

    Progress and dialect diagnostics go to stderr; the final counters go to
    stdout.

    Exits with status 2 on a wrong argument count, and with status 1 when the
    input file does not exist or contains no header row.
    """
    if len(sys.argv) != 5:
        print(__doc__)
        sys.exit(2)

    csv_path = Path(sys.argv[1])
    out_path = Path(sys.argv[2])
    record_type = sys.argv[3]
    source = sys.argv[4]

    if not csv_path.exists():
        print(f"ERROR: {csv_path} does not exist", file=sys.stderr)
        sys.exit(1)

    seen: set[tuple[str, str]] = set()
    out_cols = [
        "type", "ref_number", "authority_name", "authority_cui",
        "cpv_code", "cpv_name", "contract_type", "publication_date",
        "contract_date", "awarded_value", "supplier_name", "supplier_cui",
        "procedure_type", "legislation", "source",
    ]

    written = 0
    skipped_dup = 0
    skipped_no_ref = 0
    total = 0

    # Sniff first line to detect delimiter/quotechar
    with csv_path.open("r", encoding="utf-8-sig", errors="replace") as f:
        first_line = f.readline()
    delim, quotechar = detect_dialect(first_line)
    print(f"[import] delim={delim!r} quote={quotechar!r}", file=sys.stderr)

    # newline="" is what the csv module requires of its input file so that
    # line breaks embedded in quoted fields reach the parser untranslated.
    with csv_path.open("r", encoding="utf-8-sig", errors="replace", newline="") as f, \
            out_path.open("w", encoding="utf-8") as out:
        if quotechar:
            reader = csv.reader(f, delimiter=delim, quotechar=quotechar)
        else:
            reader = csv.reader(f, delimiter=delim, quoting=csv.QUOTE_NONE)

        header_raw = next(reader, None)
        if header_raw is None:
            # Empty file: report it instead of dying on a bare StopIteration.
            print(f"ERROR: {csv_path} is empty", file=sys.stderr)
            sys.exit(1)

        # Skip "title" rows — some XLS exports begin with a single-cell
        # title (rest empty), then the real header row follows.
        non_empty = sum(1 for h in header_raw if h.strip().strip("|").strip())
        if non_empty <= 1:
            print("[import] skipping title row, advancing to next", file=sys.stderr)
            header_raw = next(reader, None)
            if header_raw is None:
                print(f"ERROR: {csv_path} has no header row", file=sys.stderr)
                sys.exit(1)

        # Strip pipe-quote artifacts: 2022 fields look like |"FIELD"| with
        # literal | bookends.
        header_raw = [h.strip().strip("|").strip() for h in header_raw]
        header = [normalize_header(h) for h in header_raw]

        # Build column index map. For dup headers (2x "data publicare"),
        # LAST wins.
        col_idx: dict[str, int] = {}
        for i, h in enumerate(header):
            mapped = HEADER_MAP.get(h)
            if mapped:
                col_idx[mapped] = i

        has_ref_column = "ref_number" in col_idx or (
            record_type == "initiere" and "ref_initiere" in col_idx
        )
        if not has_ref_column:
            # Without a reference column every row lands in "no-ref"; say so
            # up front rather than leaving only a counter to explain it.
            print(
                "[import] WARNING: no reference-number column recognised in "
                "header; every row will be skipped",
                file=sys.stderr,
            )

        def cell(row: list[str], col: str) -> str | None:
            """Return the cleaned value of output column *col* in *row*, or None."""
            idx = col_idx.get(col)
            if idx is None or idx >= len(row):
                return None
            v = row[idx].strip().strip("|").strip()
            return v if v else None

        # Header line naming the output columns (data rows use \N for nulls).
        out.write("\t".join(out_cols) + "\n")

        for row in reader:
            total += 1
            if len(row) < len(header):
                # Short row: pad so every mapped index is addressable.
                row = row + [""] * (len(header) - len(row))

            ref = cell(row, "ref_number")
            # For initiere imports, files name the ref column
            # "Numar anunt initiere", which we map to ref_initiere.
            # Fall through to that field.
            if not ref and record_type in ("initiere",):
                ref = cell(row, "ref_initiere")
            if not ref:
                skipped_no_ref += 1
                continue

            key = (record_type, ref)
            if key in seen:
                skipped_dup += 1
                continue
            seen.add(key)

            fields = {
                "type": record_type,
                "ref_number": ref,
                "authority_name": cell(row, "authority_name"),
                "authority_cui": normalize_cui(cell(row, "authority_cui")),
                "cpv_code": cell(row, "cpv_code"),
                "cpv_name": cell(row, "cpv_name"),
                "contract_type": cell(row, "contract_type"),
                "publication_date": parse_date(cell(row, "publication_date")),
                "contract_date": parse_date(cell(row, "contract_date")),
                "awarded_value": parse_number(cell(row, "awarded_value")),
                "supplier_name": cell(row, "supplier_name"),
                "supplier_cui": normalize_cui(cell(row, "supplier_cui")),
                "procedure_type": cell(row, "procedure_type"),
                "legislation": cell(row, "legislation"),
                "source": source,
            }

            line_parts = []
            for c in out_cols:
                v = fields.get(c)
                if v is None:
                    line_parts.append("\\N")
                else:
                    # Escape backslashes and flatten tabs/newlines so the
                    # value stays on one line and in one column.
                    v = str(v).replace("\\", "\\\\").replace("\t", " ").replace("\n", " ").replace("\r", "")
                    line_parts.append(v)
            out.write("\t".join(line_parts) + "\n")
            written += 1

    print(f"[import] CSV={csv_path.name}")
    print(f"[import] total rows: {total}")
    print(f"[import] written: {written}")
    print(f"[import] dup-skip: {skipped_dup}")
    print(f"[import] no-ref: {skipped_no_ref}")
    print(f"[import] output: {out_path}")


if __name__ == "__main__":
    main()
|