initial: split from gov-agreg — vreau.digital standalone platform
Moved from gov-agreg/src/pages/achizitii/* to root (drop prefix).
- 22 pages migrated, 127 files total
- All internal links: /achizitii/X → /X (176 occurrences fixed)
- AchizitiiLayout subnav rewritten: /X paths, top-right link to vreaudigital.ro hub
- BaseLayout new (vreau.digital branding, OG tags, site URL)
- astro.config.mjs: site https://vreau.digital, server output (was static)
- docker-compose: port 5096 (vreaudigital is 5095), container vreau-digital
- deploy.sh: paths /opt/vreau-digital, log /var/log/vreau-digital-deploy.log

Backend shared with gov-agreg:
- PostgreSQL satra (same schemas: seap, firms, anaf, anre, ...)
- Photon, Martin tiles
- Infisical /vreaudigital path (DATABASE_URL etc. shared)

build: PASS (npx astro check 0 errors, npm run build 5s vite + 10s server)
This commit is contained in:
@@ -0,0 +1,103 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
AFIR FEGA CSV importer — produces pipe-TSV ingestible by the same SQL
|
||||
loader as the FEADR XLSX path. Schema is identical to FEADR (15 columns):
|
||||
beneficiar, last_name, mama_cui, localitate, cod_masura, obiectiv,
|
||||
data_start, data_end, fega_op, fega_total, feadr_op, feadr_total,
|
||||
op_amount, cofinantare, ue_total.
|
||||
|
||||
FEGA CSV from AFIR portal uses:
|
||||
- comma-separated columns (English decimal, e.g. "4802.43")
|
||||
- CSV header row: DenumireBeneficiar,NumeFamilie,Cui,Localicate,Masura,
|
||||
ObiectivSpecific,DataIncepere,DataSfarsit,CuantumOperationeFEGA,
|
||||
CuantumTotalFega,CuantumOperatiuneFEADR,CuantumtotalFEADR,
|
||||
CuantumAferentOperatiune,CuantumTotalCofinantareBeneficiar,
|
||||
CuantumtotalUEBenefeciar
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
import csv
|
||||
import re
|
||||
import sys
|
||||
|
||||
|
||||
def norm_num(v):
    """Normalize a numeric CSV cell to a dot-decimal string.

    Args:
        v: Raw cell value; anything accepted by ``str()``, or ``None``.

    Returns:
        ``""`` for ``None`` or a blank cell, otherwise the cell text with
        grouping separators removed and ``.`` as the decimal mark. Any ``|``
        left in the value is replaced by ``/`` so it cannot collide with the
        pipe column delimiter.
    """
    if v is None:
        return ""
    s = str(v).strip()
    if not s:
        return ""
    last_comma = s.rfind(",")
    last_dot = s.rfind(".")
    if last_comma != -1 and last_dot != -1:
        # Both separators present: the right-most one is the decimal mark.
        # "12.345,67" (Romanian) -> "12345.67"; "1,234.56" (English) -> "1234.56".
        if last_comma > last_dot:
            s = s.replace(".", "").replace(",", ".")
        else:
            s = s.replace(",", "")
    elif last_comma != -1:
        if s.count(",") > 1:
            # Repeated commas can only be grouping: "1,234,567" -> "1234567".
            s = s.replace(",", "")
        else:
            # A lone comma is tolerated as a Romanian-style decimal mark.
            s = s.replace(",", ".")
    return s.replace("|", "/")
|
||||
|
||||
|
||||
def norm_text(v):
    """Sanitize a text cell for one pipe-delimited output line.

    ``None`` and blank values become ``""``. Otherwise the value is stripped,
    embedded ``|`` characters become ``/`` (pipe is the column delimiter),
    every whitespace run (tabs and newlines included) collapses to a single
    space, and backslashes are doubled.
    """
    text = "" if v is None else str(v).strip()
    if not text:
        return ""
    # \s+ already covers tab, CR and LF, so one substitution handles them all.
    text = re.sub(r"\s+", " ", text.replace("|", "/"))
    return text.replace("\\", "\\\\")
|
||||
|
||||
|
||||
def main():
    """Convert the FEGA CSV at ``argv[1]`` into a pipe-delimited file at ``argv[2]``.

    Exits with status 2 and a usage line on stderr when the argument count is
    wrong. Header rows (first cell starting with "DenumireBeneficiar",
    case-insensitively) and empty records are dropped silently; rows whose
    beneficiary cell is blank are dropped and counted as skipped. Progress and
    the final totals are reported on stderr.
    """
    if len(sys.argv) != 3:
        print("usage: import-afir-fega-csv.py INPUT.csv OUTPUT.tsv", file=sys.stderr)
        sys.exit(2)
    _, in_path, out_path = sys.argv

    written = 0
    skipped = 0

    with open(in_path, "r", encoding="utf-8-sig", errors="replace", newline="") as fin, \
            open(out_path, "w", encoding="utf-8") as fout:
        for record in csv.reader(fin):
            # Empty records and (possibly repeated) header rows carry no data.
            if not record or record[0].strip().lower().startswith("denumirebeneficiar"):
                continue

            # Right-pad short records so all 15 positions can be indexed.
            cells = record + [""] * (15 - len(record))

            beneficiar = norm_text(cells[0])
            if not beneficiar:
                skipped += 1
                continue

            # Columns 1-7 are text (last_name, mama_cui, localitate, cod_masura,
            # obiectiv, data_start, data_end); columns 8-14 are amounts (fega_op,
            # fega_total, feadr_op, feadr_total, op_amount, cofinantare, ue_total).
            text_cols = [norm_text(cell) for cell in cells[1:8]]
            amount_cols = [norm_num(cell) for cell in cells[8:15]]

            fout.write("|".join([beneficiar, *text_cols, *amount_cols]) + "\n")
            written += 1
            if written % 100000 == 0:
                print(f"[afir-fega-import] wrote {written} rows", file=sys.stderr)

    print(f"[afir-fega-import] done: {written} rows · {skipped} skipped", file=sys.stderr)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
+139
@@ -0,0 +1,139 @@
|
||||
#!/usr/bin/env python3
|
||||
"""AFIR XLSX → pipe-delimited TSV normalizer.
|
||||
|
||||
Source: AFIR yearly listaplati XLSX (FEADR or FEGA), as published at
|
||||
https://www.afir.ro/rapoarte/beneficiari-de-fonduri-europene/date-deschise/
|
||||
|
||||
The XLSX has 9 banner rows, then a 15-column header at row 10 (1-indexed),
|
||||
then ~470K-560K data rows. Schema (since 2023, identical for 2024):
|
||||
|
||||
Numele beneficiarului
|
||||
Numele de familie al beneficiarului
|
||||
Denumirea societatii-mama si codul de inregistrare fiscala
|
||||
Localitate
|
||||
Codul masurii/tipului de interventie
|
||||
Obiectiv
|
||||
Data inceperii
|
||||
Data incheierii
|
||||
Cuantum Operatiune FEGA
|
||||
Cuantum Total FEGA
|
||||
Cuantum Operatiune FEADR
|
||||
Cuantum Total FEADR
|
||||
Cuantum aferent operatiunii
|
||||
Cuantum total cofinantare beneficiari
|
||||
Cuantum total UE Beneficiar
|
||||
|
||||
Output: pipe-delimited TSV (no quoting), in the same column order, suitable
|
||||
for `\\copy fonduri.staging_afir FROM ... WITH (FORMAT text, DELIMITER '|')`.
|
||||
|
||||
Usage:
|
||||
python3 import-afir-historical.py INPUT.xlsx OUTPUT.tsv
|
||||
|
||||
Numeric columns are normalized: Romanian decimal "12.345,67" → "12345.67".
|
||||
Empty strings stay empty (NULL in COPY with NULL '').
|
||||
"""
|
||||
|
||||
import sys
|
||||
import re
|
||||
|
||||
import openpyxl
|
||||
|
||||
EXPECTED_HEADER = "Numele beneficiarului"
|
||||
|
||||
|
||||
def norm_num(v):
    """Render an AFIR amount cell as a dot-decimal string.

    ``None`` and blank strings give ``""``. Real numbers (``int``/``float``)
    are formatted with two decimals, with a negative-zero result rewritten to
    ``0.00``. Strings containing a comma are treated as Romanian format: dots
    are dropped as thousands separators and the comma becomes the decimal
    point. Any ``|`` is replaced by ``/``.
    """
    if v is None:
        return ""
    if isinstance(v, (int, float)):
        return "{:.2f}".format(v).replace("-0.00", "0.00")
    raw = str(v).strip()
    if "," in raw:
        # "12.345,67" -> "12345.67"; "0,00" -> "0.00"
        raw = raw.replace(".", "").replace(",", ".")
    # An empty string falls through unchanged and is returned as "".
    return raw.replace("|", "/")
|
||||
|
||||
|
||||
def norm_text(v):
    """Make a text cell safe for a pipe-delimited COPY text line.

    Returns ``""`` for ``None`` or blank input. Otherwise: pipes become ``/``
    (pipe is the chosen delimiter), all whitespace runs including tabs and
    newlines collapse to one space, and backslashes are doubled as the COPY
    text format requires.
    """
    if v is None:
        return ""
    cleaned = str(v).strip()
    if cleaned:
        cleaned = cleaned.replace("|", "/")
        # One regex pass handles tab/CR/LF together with ordinary spaces.
        cleaned = re.sub(r"\s+", " ", cleaned)
        cleaned = cleaned.replace("\\", "\\\\")
    return cleaned
|
||||
|
||||
|
||||
def main():
    """Convert an AFIR XLSX (``argv[1]``) into a pipe-delimited file (``argv[2]``).

    Scans the active sheet for the header row (first cell containing
    ``EXPECTED_HEADER``), then writes one normalized 15-column line per data
    row. Rows with a blank beneficiary cell are counted as skipped.

    Exits with status 2 on bad usage and status 1 when no header row is found.
    The workbook is opened in read-only mode and is always closed on the way
    out, including on the error exit.
    """
    if len(sys.argv) != 3:
        print("usage: import-afir-historical.py INPUT.xlsx OUTPUT.tsv", file=sys.stderr)
        sys.exit(2)
    in_path, out_path = sys.argv[1], sys.argv[2]

    wb = openpyxl.load_workbook(in_path, read_only=True, data_only=True)
    try:
        ws = wb.active

        # A single shared iterator: the header scan consumes the banner rows and
        # the header itself, so the data loop below resumes right after it.
        rows = ws.iter_rows(values_only=True)
        header_idx = None
        for i, r in enumerate(rows):
            if r and r[0] and EXPECTED_HEADER in str(r[0]):
                header_idx = i
                break
            if i > 50:
                break
        if header_idx is None:
            print("[afir-import] ERROR: header row not found in first 50 rows", file=sys.stderr)
            sys.exit(1)

        n_data = 0
        n_skipped = 0

        with open(out_path, "w", encoding="utf-8") as f:
            for r in rows:
                # 16 columns observed (last is None padding)
                if r is None:
                    continue
                cells = list(r) + [None] * (16 - len(r))
                beneficiar = norm_text(cells[0])
                if not beneficiar:
                    # Trailing empty rows
                    n_skipped += 1
                    continue

                out = [
                    beneficiar,
                    norm_text(cells[1]),   # last_name
                    norm_text(cells[2]),   # mama_cui
                    norm_text(cells[3]),   # localitate
                    norm_text(cells[4]),   # cod_masura
                    norm_text(cells[5]),   # obiectiv
                    norm_text(cells[6]),   # data_start
                    norm_text(cells[7]),   # data_end
                    norm_num(cells[8]),    # fega_op
                    norm_num(cells[9]),    # fega_total
                    norm_num(cells[10]),   # feadr_op
                    norm_num(cells[11]),   # feadr_total
                    norm_num(cells[12]),   # op_amount
                    norm_num(cells[13]),   # cofinantare
                    norm_num(cells[14]),   # ue_total
                ]
                f.write("|".join(out) + "\n")
                n_data += 1
                if n_data % 50000 == 0:
                    print(f"[afir-import] wrote {n_data} rows", file=sys.stderr)
    finally:
        # Read-only workbooks keep the source file open until closed explicitly.
        wb.close()

    print(f"[afir-import] done — {n_data} rows, {n_skipped} skipped", file=sys.stderr)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
+167
@@ -0,0 +1,167 @@
|
||||
#!/usr/bin/env python3
|
||||
"""APIA "Lista fermieri" XLSX → pipe-delimited TSV normalizer.
|
||||
|
||||
Source: data.gov.ro CKAN package "lista-fermierilor-campania-apia-2024".
|
||||
Currently a single resource (comuna Găgești, Vaslui, ~192 farmers), but the
|
||||
package is supposed to grow as more UATs publish their lists. The XLSX
|
||||
schema is set by APIA and identical across UATs:
|
||||
|
||||
Row 0 (header): NR.CRT | NUME PRENUME | RESPONSABIL UAT 2024
|
||||
| COMUNA/ORAS | SAT | DATE CONTACT | CENTRUL APIA
|
||||
| SUPRAFATA 2023 | (~17 None columns)
|
||||
Rows 1..N (data): one row per farmer, NR.CRT 1-indexed.
|
||||
|
||||
Output: pipe-delimited TSV (no quoting), columns in this order:
|
||||
|
||||
campaign_year | name | comuna_oras | sat | centru_apia
|
||||
| responsabil_uat | suprafata_ha
|
||||
| source_dataset_id | source_resource_id | source_url
|
||||
|
||||
Empty strings stay empty (NULL in COPY with NULL '').
|
||||
|
||||
Usage:
|
||||
python3 import-apia-fermieri.py INPUT.xlsx OUTPUT.tsv \\
|
||||
CAMPAIGN_YEAR DATASET_ID RESOURCE_ID SOURCE_URL
|
||||
"""
|
||||
|
||||
import re
|
||||
import sys
|
||||
|
||||
import openpyxl
|
||||
|
||||
EXPECTED_HEADER_COL0 = "NR.CRT"
|
||||
EXPECTED_HEADER_COL1 = "NUME" # "NUME PRENUME" or "NUME SI PRENUME"
|
||||
|
||||
|
||||
def norm_text(v):
    """Clean a text cell for the pipe-delimited output.

    ``None`` and blank values yield ``""``. Embedded pipes become ``/``
    because pipe is the delimiter, whitespace runs (newlines and tabs
    included) collapse to a single space, and backslashes are doubled.
    """
    raw = str(v).strip() if v is not None else ""
    without_pipes = raw.replace("|", "/")
    # An empty string passes through both steps below unchanged.
    single_spaced = re.sub(r"\s+", " ", without_pipes)
    return single_spaced.replace("\\", "\\\\")
|
||||
|
||||
|
||||
def norm_num(v):
    """Render an area cell as a compact dot-decimal string.

    Args:
        v: Raw cell value (number, string or ``None``).

    Returns:
        ``""`` for ``None`` or blank strings. Numbers are formatted with at
        most four decimals and trailing zeros trimmed; a value that rounds to
        zero is always ``"0"``, never ``"-0"``. Strings containing a comma are
        treated as Romanian format (dots dropped, comma becomes the decimal
        point). Any ``|`` is replaced by ``/``.
    """
    if v is None:
        return ""
    if isinstance(v, (int, float)):
        # "1.0400" -> "1.04", "100.0000" -> "100"
        s = f"{v:.4f}".rstrip("0").rstrip(".")
        # -0.0 and tiny negatives round to "-0"; emit a plain zero instead.
        return "0" if s in ("", "-0") else s
    s = str(v).strip()
    if not s:
        return ""
    if "," in s:
        s = s.replace(".", "").replace(",", ".")
    return s.replace("|", "/")
|
||||
|
||||
|
||||
def main():
    """Convert an APIA farmers XLSX into a pipe-delimited file.

    Command-line arguments (all required): INPUT.xlsx OUTPUT.tsv
    CAMPAIGN_YEAR DATASET_ID RESOURCE_ID SOURCE_URL. The last four are copied
    verbatim into every output line.

    The header row is located by its first cell containing
    ``EXPECTED_HEADER_COL0``; a column map is built from the header text so
    column order in the sheet does not matter. Rows with a blank name are
    counted as skipped.

    Exits with status 2 on bad usage and status 1 when no usable header is
    found. The read-only workbook is always closed on the way out.
    """
    if len(sys.argv) != 7:
        print(
            "usage: import-apia-fermieri.py INPUT.xlsx OUTPUT.tsv "
            "CAMPAIGN_YEAR DATASET_ID RESOURCE_ID SOURCE_URL",
            file=sys.stderr,
        )
        sys.exit(2)

    in_path = sys.argv[1]
    out_path = sys.argv[2]
    campaign_year = sys.argv[3]
    dataset_id = sys.argv[4]
    resource_id = sys.argv[5]
    source_url = sys.argv[6]

    wb = openpyxl.load_workbook(in_path, read_only=True, data_only=True)
    try:
        ws = wb.active

        # Shared iterator: the data loop below resumes after the header row.
        rows = ws.iter_rows(values_only=True)
        header_idx = None
        col_map = None
        for i, r in enumerate(rows):
            if not r:
                continue
            if r[0] and EXPECTED_HEADER_COL0 in str(r[0]).upper():
                # Build column index map from header for resilience.
                header = [str(c).strip().upper() if c is not None else "" for c in r]
                col_map = {}
                for idx, h in enumerate(header):
                    if "NR.CRT" in h or "NRCRT" in h:
                        col_map["nr"] = idx
                    elif "NUME" in h:  # "NUME PRENUME" / "NUME SI PRENUME"
                        # First matching column wins.
                        col_map.setdefault("name", idx)
                    elif "RESPONSABIL" in h:
                        col_map["responsabil"] = idx
                    elif "COMUNA" in h or "ORAS" in h:
                        col_map["comuna"] = idx
                    elif h == "SAT" or h.startswith("SAT "):
                        col_map["sat"] = idx
                    elif "CENTRUL" in h or "CENTRU" in h:
                        col_map["centru"] = idx
                    elif "SUPRAFATA" in h or "SUPRAFAȚA" in h:
                        col_map["suprafata"] = idx
                header_idx = i
                break
            if i > 50:
                break

        if header_idx is None or not col_map or "name" not in col_map:
            print(
                "[apia-import] ERROR: header row not found in first 50 rows",
                file=sys.stderr,
            )
            sys.exit(1)

        print(f"[apia-import] header at row {header_idx}, col_map={col_map}", file=sys.stderr)

        n_data = 0
        n_skipped = 0

        # The widest mapped index is fixed once the header is parsed, so it is
        # computed once here instead of once per data row.
        max_idx = max(col_map.values())

        with open(out_path, "w", encoding="utf-8") as f:
            for r in rows:
                if r is None:
                    continue
                cells = list(r)
                # Pad if short so every mapped index is addressable.
                if len(cells) <= max_idx:
                    cells.extend([None] * (max_idx + 1 - len(cells)))

                name = norm_text(cells[col_map["name"]])
                if not name:
                    n_skipped += 1
                    continue

                comuna = norm_text(cells[col_map["comuna"]]) if "comuna" in col_map else ""
                sat = norm_text(cells[col_map["sat"]]) if "sat" in col_map else ""
                centru = norm_text(cells[col_map["centru"]]) if "centru" in col_map else ""
                responsabil = norm_text(cells[col_map["responsabil"]]) if "responsabil" in col_map else ""
                suprafata = norm_num(cells[col_map["suprafata"]]) if "suprafata" in col_map else ""

                out = [
                    campaign_year,
                    name,
                    comuna,
                    sat,
                    centru,
                    responsabil,
                    suprafata,
                    dataset_id,
                    resource_id,
                    source_url,
                ]
                f.write("|".join(out) + "\n")
                n_data += 1
    finally:
        # Read-only workbooks keep the source file open until closed explicitly.
        wb.close()

    print(f"[apia-import] done — {n_data} rows, {n_skipped} skipped", file=sys.stderr)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,483 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
SEAP historical CSV importer for data.gov.ro yearly dumps.
|
||||
|
||||
Reads a SEAP CSV (any year/quarter/type) and emits a clean TSV that
|
||||
PostgreSQL COPY can ingest into seap.announcements. Handles:
|
||||
- BOM stripping
|
||||
- Romanian decimal commas → dots
|
||||
- "MM/DD/YYYY HH:MM:SS" date parsing (with second column variants)
|
||||
- Row dedupe by (type, ref_number) — first-row-wins for multi-lot CANs
|
||||
- CUI normalization (strip "RO " prefix)
|
||||
|
||||
Usage:
|
||||
python3 import-seap-historical.py CSV_PATH OUTPUT_TSV TYPE SOURCE
|
||||
TYPE: 'contract' | 'da' | 'initiere' | 'atribuire_fara' | 'modificare'
|
||||
SOURCE: e.g. 'datagov_2024_t1_contracte'
|
||||
|
||||
The output TSV columns are FIXED (15 columns matching the import SQL):
|
||||
type, ref_number, authority_name, authority_cui, cpv_code, cpv_name,
|
||||
contract_type, publication_date, contract_date, awarded_value,
|
||||
supplier_name, supplier_cui, procedure_type, legislation, source
|
||||
|
||||
Column mapping is inferred from CSV headers (case+diacritic-insensitive).
|
||||
Falls back gracefully when columns are missing (older years had fewer cols).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import csv
|
||||
import re
|
||||
import sys
|
||||
import unicodedata
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def normalize_header(s: str) -> str:
    """Reduce a raw CSV header cell to a lookup key.

    Removes a byte-order mark, lowercases, strips diacritics (combining marks
    after NFD decomposition), deletes ``?`` characters and collapses
    whitespace runs to single spaces.

    Args:
        s: Header cell text as read from the CSV.

    Returns:
        The normalized key, with no leading or trailing whitespace.
    """
    # The BOM is written as an escape so it stays visible and cannot be lost by
    # an editor or a copy/paste of this file.
    s = s.replace("\ufeff", "").strip().lower()
    s = "".join(c for c in unicodedata.normalize("NFD", s) if unicodedata.category(c) != "Mn")
    # Drop "?" before collapsing whitespace so "a ? b" does not keep two spaces.
    s = s.replace("?", "")
    s = re.sub(r"\s+", " ", s)
    return s.strip()
|
||||
|
||||
|
||||
def detect_dialect(first_line: str) -> tuple[str, str | None]:
    """Detect delimiter and quote char from the first line of a SEAP CSV.

    SEAP historical CSVs vary wildly:
      - 2017/2018: ^ delim, no quote
      - 2022:      , delim, | quote  (header looks like |FIELD|,|FIELD|)
      - 2023 T3:   | delim, " quote  (header: FIELD|FIELD with row "txt"|"txt")
      - 2023 T4:   , delim, " quote  (standard CSV with title-case headers)
      - 2024+:     , delim, " quote  (standard CSV)

    Args:
        first_line: The first line of the file, with or without a leading BOM.

    Returns:
        ``(delimiter, quotechar)``; ``quotechar`` is ``None`` when the file
        uses no quoting. Falls back to ``(",", '"')`` when no candidate
        delimiter occurs in the line.
    """
    s = first_line
    # Strip a leading BOM only when one is actually there. The escape keeps the
    # character visible; an empty prefix would match every string and drop the
    # first real character.
    if s.startswith("\ufeff"):
        s = s[1:]
    s_strip = s.lstrip()
    # 2022 wire format: header line starts with `|` and uses `|FIELD|,|FIELD|`
    # -> delim=',' quote='|'
    if s_strip.startswith("|") and "|," in s_strip:
        return (",", "|")

    # Otherwise the most frequent candidate wins; ties go to the earliest
    # candidate in this list.
    counts = {c: s.count(c) for c in [",", "|", "^", ";", "\t"]}
    delim = max(counts, key=counts.get)
    if counts[delim] == 0:
        delim = ","

    if delim == "^":
        return ("^", None)
    # Comma, pipe, semicolon and tab all use the double quote. Tab is returned
    # as detected instead of being reported as comma.
    return (delim, '"')
|
||||
|
||||
|
||||
# Maps normalized header → output column name.
|
||||
# Multiple headers may map to the same output (e.g. two "data publicare" cols).
|
||||
# Schema variants seen across data.gov.ro yearly dumps:
|
||||
# - 2024 (CSV, comma): "Autoritate contractanta", "Numar anunt", "Cod CPV"
|
||||
# - 2022/2023 (CSV/pipe, |QUOTE|): "DENUMIRE_AC", "NUMAR_ANUNT_ATRIBUIRE", "COD_CPV"
|
||||
# - 2017/2018 (^-delim): "AutoritateContractanta", "NumarAnuntAtribuire", "CPVCode"
|
||||
# Keys must be all-lowercase: headers are lowercased before lookup, so a key
# containing an uppercase letter can never match. Each key appears exactly once;
# a header shared by several schema generations is listed in the first section
# where it occurs.
HEADER_MAP = {
    # 2024 standard CSV
    "autoritate contractanta": "authority_name",
    "cui": "authority_cui",
    "cui autoritate contractanta": "authority_cui",
    "cod cpv": "cpv_code",
    "denumire cpv": "cpv_name",
    "tip contract": "contract_type",
    "tip procedura": "procedure_type",
    "tip legislatie": "legislation",
    "tip incheiere contract": "award_type",
    "tip inchiere contract": "award_type",  # typo seen in 2023 T1 XLS
    "tip criteriu de atribuire": "criterion",
    "numar anunt atribuire": "ref_number",
    "numar anunt initiere": "ref_initiere",
    "numar anunt": "ref_number",
    "numar contract": "contract_number",
    "numar lot": "lot_number",
    "data contract": "contract_date",
    "data publicare": "publication_date",
    "data publicare anunt atribuire": "publication_date",  # 2023 T4 standard CSV
    "data anunt atribuire": "publication_date",  # 2023 T1 XLS, 2017 ^-delim
    "data anunt initiere": "ref_initiere_date",
    "data publicare anunt initiere": "ref_initiere_date",
    "data publicare anunt": "publication_date",  # 2023 T4 atribuire-fara
    "valoare atribuita (ron)": "awarded_value",
    "valoare estimata procedura": "estimated_value",
    "moneda valoare estimata procedura": "estimated_currency",
    "denumire procedura": "procedure_name",
    "tip activitate autoritate": "authority_activity",
    "criteriu de atribuire": "criterion",
    "denumire contract": "contract_title",
    "oras ofertant castigator": "supplier_city",
    "tara ofertant castigator": "supplier_country",
    "data publicare contract": "contract_date",
    "tip activitate": "authority_activity",
    "tip autoritate": "authority_type",
    "tip anunt": "announcement_type",
    "criterii de atribuire": "criterion",
    "licitatie electronica": "electronic_auction",
    "ofertant castigator": "supplier_name",
    "cui ofertant castigator": "supplier_cui",
    "oras ofertant": "supplier_city",
    "tara ofertant": "supplier_country",
    "incheiat prin": "award_type",
    "valoare contract (ron)": "awarded_value",
    "valoare contract": "awarded_value",
    "valoare estimata (ron)": "estimated_value",
    "valoare estimata": "estimated_value",
    "ofertant": "supplier_name",  # also used by the 2022/2023 schema
    "cui ofertant": "supplier_cui",
    "cui castigator": "supplier_cui",
    "castigator": "supplier_name",  # also used by the 2017-2019 schemas
    "oras": "supplier_city",
    "tara": "supplier_country",
    "modalitate de desfasurare": "modality",
    # 2022/2023 UPPER_SNAKE_CASE pipe-delim schema
    "denumire_ac": "authority_name",
    "cui_ac": "authority_cui",
    "cui_autoritate": "authority_cui",
    "autoritate_contractanta": "authority_name",
    "numar_anunt_atribuire": "ref_number",
    "numar_anunt": "ref_number",
    "data_anunt_atribuire": "publication_date",
    "data_publicare": "publication_date",
    "data_publicare_ai": "ref_initiere_date",
    "data_contract": "contract_date",
    "numar_contract": "contract_number",
    "denumire_contract": "contract_title",
    "cod_cpv": "cpv_code",
    "cod_cpv_procedura": "cpv_code",
    "cpv_code": "cpv_code",  # 2023 schema variant; also 2018-2019 XLS
    "denumire_cpv": "cpv_name",
    "denumire_cpv_procedura": "cpv_name",
    "tip_contract": "contract_type",
    "tip_procedura": "procedure_type",
    "tip_legislatie": "legislation",
    "tip_lesiglatie": "legislation",  # SEAP typo present in many 2023 files
    "tip_anunt": "announcement_type",
    "tip_incheiere_contract": "award_type",  # also 2018-2019 XLS
    "incheiat_prin": "award_type",
    "valoare_contract_ron": "awarded_value",
    "valoare_atribuita": "awarded_value",
    "valoare_estimata_procedura": "estimated_value",
    "cui_of": "supplier_cui",
    "nume_castigator": "supplier_name",
    "cui_castigator": "supplier_cui",
    "oras_castigator": "supplier_city",
    "tara_castigator": "supplier_country",
    "modalitate_desfasurare": "modality",
    "modalitate_atribuire": "modality",
    "tip_criterii_atribuire": "criterion",  # also 2018-2019 XLS
    "criteriu_de_atribuire": "criterion",
    "numar_anunt_ai": "ref_initiere",
    "numar_anunt_initiere": "ref_initiere",
    "data_anunt_initiere": "ref_initiere_date",
    "denumire_procedura": "procedure_name",
    # 2017/2018 ^-delim CamelCase legacy schema (keys are the lowercased form)
    "castigatorcui": "supplier_cui",
    "castigatortara": "supplier_country",
    "castigatorlocalitate": "supplier_city",
    "castigatoradresa": "supplier_address",
    "tipcontract": "contract_type",
    "tipprocedura": "procedure_type",
    "autoritatecontractanta": "authority_name",
    "autoritatecontractantacui": "authority_cui",
    "tipac": "authority_type",
    "tipactivitateac": "authority_activity",
    "denumireac": "authority_name",
    "numaranuntatribuire": "ref_number",
    "numaranuntparticipare": "ref_initiere",
    "numaranunt": "ref_number",
    "dataanuntatribuire": "publication_date",
    "dataanuntparticipare": "ref_initiere_date",
    "datapublicare": "publication_date",
    "tipincheierecontract": "award_type",
    "tipcriteriiatribuire": "criterion",
    "culicitatieelectronica": "electronic_auction",
    "numarofertepre primite": "n_offers",
    "numaroferteprimite": "n_offers",  # was spelled with an uppercase "P", which never matched
    "subcontractat": "subcontracted",
    "numarcontract": "contract_number",
    "datacontract": "contract_date",
    "titlucontract": "contract_title",
    "valoare": "awarded_value_orig",  # may be in non-RON currency for 2017
    "moneda": "currency",
    "valoareron": "awarded_value",
    "valoareeur": "awarded_value_eur",
    "cpvcodeid": "cpv_code_id",  # internal SEAP id, not CPV
    "cpvcode": "cpv_code",  # actual CPV like 85150000-5
    "valoareestimataparticipare": "estimated_value",
    "monedavaloareestimataparticipare": "estimated_currency",
    "fonduricomunitare": "eu_funded",
    "tipfinantare": "funding_type",
    "tiplegislatieid": "legislation",
    "fondeuropean": "eu_fund",
    "contractperiodic": "periodic",
    "depozitegarantii": "deposits",
    "modalitatifinantare": "funding_modes",
    "tip": "announcement_subtype",  # 2017 contracte has bare "Tip"
    # 2018-2019 XLS schema (UPPER_SNAKE with explicit underscores)
    "castigator_cui": "supplier_cui",
    "castigator_tara": "supplier_country",
    "castigator_localitate": "supplier_city",
    "castigaor_localitate": "supplier_city",  # SEAP typo seen in 2018 T2 XLS
    "castigator_adresa": "supplier_address",
    "tip_ac": "authority_type",
    "tip_activitate_ac": "authority_activity",
    "autoritate_contractanta_cui": "authority_cui",
    "numar_anunt_participare": "ref_initiere",
    "data_anunt_participare": "ref_initiere_date",
    "cu_licitatie_electronica": "electronic_auction",
    "numar_oferte_primite": "n_offers",
    "titlu_contract": "contract_title",
    "valoare_ron": "awarded_value",
    "valoare_eur": "awarded_value_eur",
    "valoare_estimata_participare": "estimated_value",
    "moneda_valoare_estimata_participare": "estimated_currency",
    "fonduri_comunitare": "eu_funded",
    "tip_finantare": "funding_type",
    "tip_legislatie_id": "legislation",
    "fond_european": "eu_fund",
    "contract_periodic": "periodic",
    "depozite_garantii": "deposits",
    "modalitati_finantare": "funding_modes",
    "cpv_code_id": "cpv_code_id",
}
|
||||
|
||||
|
||||
def parse_date(s: str | None) -> str | None:
    """Parse a date cell into ISO ``YYYY-MM-DD``.

    Accepted layouts, each matched at the start of the string so a trailing
    time part is ignored:
      - ``MM/DD/YYYY``  (e.g. "01/31/2024 01:35:39")
      - ``DD.MM.YYYY``
      - ``YYYY-MM-DD``

    Args:
        s: Raw cell text, or ``None``.

    Returns:
        The ISO date string, or ``None`` when the input is empty, matches no
        layout, or is not a real calendar date. ISO input is validated like
        the other layouts, so "2024-13-45" is rejected.
    """
    if not s:
        return None
    s = s.strip()
    if not s:
        return None

    m = re.match(r"(\d{1,2})/(\d{1,2})/(\d{4})", s)
    if m:
        month, day, year = (int(g) for g in m.groups())
    else:
        m = re.match(r"(\d{1,2})\.(\d{1,2})\.(\d{4})", s)
        if m:
            day, month, year = (int(g) for g in m.groups())
        else:
            m = re.match(r"(\d{4})-(\d{2})-(\d{2})", s)
            if not m:
                return None
            year, month, day = (int(g) for g in m.groups())

    try:
        datetime(year, month, day)  # raises ValueError for impossible dates
    except ValueError:
        return None
    return f"{year:04d}-{month:02d}-{day:02d}"
|
||||
|
||||
|
||||
def parse_number(s: str | None) -> str | None:
    """Parse a SEAP amount cell into a two-decimal dot-decimal string.

    SEAP CSV uses MIXED conventions:
      - "1.234.567,89" → period=thousand, comma=decimal → 1234567.89
      - "1,234,567.89" → comma=thousand, period=decimal → 1234567.89
      - "123,126"      → comma=THOUSAND (3 digits after) → 123126
      - "12345,67"     → comma=decimal (1-2 digits after) → 12345.67
      - "1,234,567"    → all commas=thousand → 1234567
      - "1.234.567"    → all periods=thousand → 1234567

    Heuristics: when both separators occur, the right-most one is the decimal
    mark. A single comma followed by exactly three digits is a thousand
    separator, otherwise a decimal mark. A leading sign is accepted in every
    layout.

    Args:
        s: Raw cell text, or ``None``.

    Returns:
        The value formatted with two decimals, or ``None`` when the cell is
        empty, is a bare "-", or cannot be parsed.
    """
    if not s:
        return None
    s = s.strip().strip('"').replace("\xa0", "").replace(" ", "")
    if not s or s == "-":
        return None

    has_comma = "," in s
    has_dot = "." in s

    if has_comma and has_dot:
        if s.rfind(",") > s.rfind("."):
            # RO layout: 1.234.567,89
            s = s.replace(".", "").replace(",", ".")
        else:
            # EN layout: 1,234,567.89
            s = s.replace(",", "")
    elif s.count(",") >= 2:
        # Multi-comma → all thousand separators
        try:
            return f"{int(s.replace(',', '')):d}.00"
        except ValueError:
            return None
    elif has_comma:
        head, tail = s.split(",")
        # Both sides must be plain digits; the integer part may carry a sign.
        if not (head.lstrip("+-").isdigit() and tail.isdigit()):
            return None
        if len(tail) == 3:
            # Thousand separator (most common SEAP case)
            s = head + tail
        else:
            s = head + "." + tail
    elif s.count(".") >= 2:
        # Several periods and no comma: only accept strict 3-digit grouping.
        if not re.fullmatch(r"[+-]?\d{1,3}(\.\d{3})+", s):
            return None
        s = s.replace(".", "")

    try:
        return f"{float(s):.2f}"
    except ValueError:
        return None
|
||||
|
||||
|
||||
def normalize_cui(s: str | None) -> str | None:
    """Return the bare digits of a CUI (fiscal code), or ``None``.

    Surrounding whitespace and double quotes are removed, then a leading
    case-insensitive "RO" prefix (with any whitespace after it). The result
    is returned only if what remains consists solely of digits.
    """
    if not s:
        return None
    candidate = s.strip().strip('"')
    prefix = re.match(r"RO\s*", candidate, flags=re.IGNORECASE)
    if prefix:
        candidate = candidate[prefix.end():]
    candidate = candidate.strip()
    # "".isdigit() is False, so an empty remainder is rejected here as well.
    return candidate if candidate.isdigit() else None
|
||||
|
||||
|
||||
def main() -> None:
|
||||
if len(sys.argv) != 5:
|
||||
print(__doc__)
|
||||
sys.exit(2)
|
||||
|
||||
csv_path = Path(sys.argv[1])
|
||||
out_path = Path(sys.argv[2])
|
||||
record_type = sys.argv[3]
|
||||
source = sys.argv[4]
|
||||
|
||||
if not csv_path.exists():
|
||||
print(f"ERROR: {csv_path} does not exist", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
seen: set[tuple[str, str]] = set()
|
||||
out_cols = [
|
||||
"type", "ref_number", "authority_name", "authority_cui",
|
||||
"cpv_code", "cpv_name", "contract_type", "publication_date",
|
||||
"contract_date", "awarded_value", "supplier_name", "supplier_cui",
|
||||
"procedure_type", "legislation", "source",
|
||||
]
|
||||
|
||||
written = 0
|
||||
skipped_dup = 0
|
||||
skipped_no_ref = 0
|
||||
total = 0
|
||||
|
||||
# Sniff first line to detect delimiter/quotechar
|
||||
with csv_path.open("r", encoding="utf-8-sig", errors="replace") as f:
|
||||
first_line = f.readline()
|
||||
delim, quotechar = detect_dialect(first_line)
|
||||
print(f"[import] delim={delim!r} quote={quotechar!r}", file=sys.stderr)
|
||||
|
||||
with csv_path.open("r", encoding="utf-8-sig", errors="replace") as f, \
|
||||
out_path.open("w", encoding="utf-8") as out:
|
||||
if quotechar:
|
||||
reader = csv.reader(f, delimiter=delim, quotechar=quotechar)
|
||||
else:
|
||||
reader = csv.reader(f, delimiter=delim, quoting=csv.QUOTE_NONE)
|
||||
# Skip "title" rows — some XLS exports begin with a single-cell
|
||||
# title (rest empty), then the real header row follows.
|
||||
header_raw = next(reader)
|
||||
non_empty = sum(1 for h in header_raw if h.strip().strip("|").strip())
|
||||
if non_empty <= 1:
|
||||
print("[import] skipping title row, advancing to next", file=sys.stderr)
|
||||
header_raw = next(reader)
|
||||
# Strip pipe-quote artifacts: 2022 fields look like |"FIELD"| with literal | bookends
|
||||
header_raw = [h.strip().strip("|").strip() for h in header_raw]
|
||||
header = [normalize_header(h) for h in header_raw]
|
||||
|
||||
# Build column index map. For dup headers (2× "data publicare"), LAST wins.
|
||||
col_idx: dict[str, int] = {}
|
||||
for i, h in enumerate(header):
|
||||
mapped = HEADER_MAP.get(h)
|
||||
if mapped:
|
||||
col_idx[mapped] = i
|
||||
|
||||
# Write header line for COPY (\\\\N markers for nulls)
|
||||
out.write("\t".join(out_cols) + "\n")
|
||||
|
||||
for row in reader:
|
||||
total += 1
|
||||
if len(row) < len(header):
|
||||
row = row + [""] * (len(header) - len(row))
|
||||
|
||||
def get(col: str) -> str | None:
|
||||
idx = col_idx.get(col)
|
||||
if idx is None or idx >= len(row):
|
||||
return None
|
||||
v = row[idx].strip().strip("|").strip()
|
||||
return v if v else None
|
||||
|
||||
ref = get("ref_number")
|
||||
# For initiere imports, files name the ref column "Numar anunt initiere"
|
||||
# which we map to ref_initiere. Fall through to that field.
|
||||
if not ref and record_type in ("initiere",):
|
||||
ref = get("ref_initiere")
|
||||
if not ref:
|
||||
skipped_no_ref += 1
|
||||
continue
|
||||
|
||||
key = (record_type, ref)
|
||||
if key in seen:
|
||||
skipped_dup += 1
|
||||
continue
|
||||
seen.add(key)
|
||||
|
||||
fields = {
|
||||
"type": record_type,
|
||||
"ref_number": ref,
|
||||
"authority_name": get("authority_name"),
|
||||
"authority_cui": normalize_cui(get("authority_cui")),
|
||||
"cpv_code": get("cpv_code"),
|
||||
"cpv_name": get("cpv_name"),
|
||||
"contract_type": get("contract_type"),
|
||||
"publication_date": parse_date(get("publication_date")),
|
||||
"contract_date": parse_date(get("contract_date")),
|
||||
"awarded_value": parse_number(get("awarded_value")),
|
||||
"supplier_name": get("supplier_name"),
|
||||
"supplier_cui": normalize_cui(get("supplier_cui")),
|
||||
"procedure_type": get("procedure_type"),
|
||||
"legislation": get("legislation"),
|
||||
"source": source,
|
||||
}
|
||||
|
||||
line_parts = []
|
||||
for c in out_cols:
|
||||
v = fields.get(c)
|
||||
if v is None:
|
||||
line_parts.append("\\N")
|
||||
else:
|
||||
# Escape tabs, newlines, backslashes for COPY format
|
||||
v = str(v).replace("\\", "\\\\").replace("\t", " ").replace("\n", " ").replace("\r", "")
|
||||
line_parts.append(v)
|
||||
out.write("\t".join(line_parts) + "\n")
|
||||
written += 1
|
||||
|
||||
print(f"[import] CSV={csv_path.name}")
|
||||
print(f"[import] total rows: {total}")
|
||||
print(f"[import] written: {written}")
|
||||
print(f"[import] dup-skip: {skipped_dup}")
|
||||
print(f"[import] no-ref: {skipped_no_ref}")
|
||||
print(f"[import] output: {out_path}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
+81
@@ -0,0 +1,81 @@
|
||||
#!/bin/bash
|
||||
# SEAP historical CSV importer wrapper.
|
||||
# Downloads a yearly+quarterly resource from data.gov.ro CKAN and imports
|
||||
# it into seap.announcements via the Python normalizer + psql COPY.
|
||||
#
|
||||
# Usage:
|
||||
# ./import-seap-historical.sh URL TYPE SOURCE [DELETE_FIRST]
|
||||
# URL: full data.gov.ro CKAN download URL
|
||||
# TYPE: 'contract' | 'da' | 'initiere' | 'atribuire_fara' | 'modificare'
|
||||
# SOURCE: tag e.g. 'datagov_2024_t1_contracte'
|
||||
# DELETE_FIRST: 'yes' to wipe rows tagged with this source before insert
|
||||
#
|
||||
# Example:
|
||||
# bash import-seap-historical.sh \
|
||||
# 'https://data.gov.ro/dataset/ed.../resource/24a.../download/...t-i-2024.csv' \
|
||||
# contract datagov_2024_t1_contracte yes
|
||||
|
||||
set -euo pipefail

# Fail with a usage hint instead of bash's bare "unbound variable" error.
if [ "$#" -lt 3 ]; then
  echo "Usage: $0 URL TYPE SOURCE [DELETE_FIRST]" >&2
  exit 2
fi

URL="$1"
TYPE="$2"
SOURCE="$3"
DELETE_FIRST="${4:-no}"

# SOURCE is spliced into SQL string literals and into a command string that
# the remote shell re-parses (unquoted heredoc), so restrict it to characters
# that are inert in both contexts.
if [[ ! "$SOURCE" =~ ^[A-Za-z0-9_.-]+$ ]]; then
  echo "[import] invalid SOURCE tag '$SOURCE' (allowed: letters, digits, '_', '.', '-')" >&2
  exit 2
fi

# Private, unpredictable scratch directory.
WORK="$(mktemp -d "${TMPDIR:-/tmp}/seap-historical.XXXXXX")"
# Per-run remote staging file: the CSV and XLSX wrappers used to share one
# fixed path, so two concurrent imports could overwrite each other's data.
REMOTE_TSV="/tmp/seap-historical-$$-$(date +%s).tsv"
STAGED=0

# Remove local scratch files and, once staging has started, the remote TSV.
# Runs on every exit path, including failures under `set -e`.
cleanup() {
  rm -rf "$WORK"
  if [ "$STAGED" = 1 ]; then
    ssh satra "rm -f '$REMOTE_TSV'" || true
  fi
}
trap cleanup EXIT

CSV="$WORK/data.csv"
TSV="$WORK/data.tsv"

echo "[import] downloading: $URL"
# -f: treat HTTP errors as failures so an error page is never imported as CSV.
# -S: still print the error while -s hides the progress meter.
# NOTE(review): -k disables TLS certificate verification; kept from the
# original invocation — confirm it is still required for this host.
curl -fsSk --max-time 600 -L "$URL" -o "$CSV"
echo "[import] downloaded: $(stat -c %s "$CSV") bytes"

echo "[import] normalizing CSV → TSV..."
python3 "$(dirname "$0")/import-seap-historical.py" "$CSV" "$TSV" "$TYPE" "$SOURCE"

# Stage on the DB host
echo "[import] copying TSV to satra..."
STAGED=1
scp -q "$TSV" "satra:$REMOTE_TSV"

DELETE_SQL=""
if [ "$DELETE_FIRST" = "yes" ]; then
  DELETE_SQL="DELETE FROM seap.announcements WHERE source = '$SOURCE';"
fi

# The SQL below is expanded locally (inside the double-quoted string) and then
# handed to the remote shell as a heredoc for /tmp/baseline.sh.
# Assumes baseline.sh feeds its stdin to psql, as the \COPY meta-command
# already requires. ON_ERROR_STOP plus BEGIN/COMMIT make the optional DELETE
# and the INSERT atomic: a failed load no longer leaves the source wiped.
echo "[import] running insert on satra..."
ssh satra "/tmp/baseline.sh <<SQL
\\set ON_ERROR_STOP on
BEGIN;
$DELETE_SQL
CREATE TEMP TABLE _stage_seap_hist (
  type text, ref_number text, authority_name text, authority_cui text,
  cpv_code text, cpv_name text, contract_type text, publication_date text,
  contract_date text, awarded_value text, supplier_name text, supplier_cui text,
  procedure_type text, legislation text, source text
);
\\COPY _stage_seap_hist FROM '$REMOTE_TSV' WITH (FORMAT text, DELIMITER E'\\t', HEADER true);

INSERT INTO seap.announcements (
  type, ref_number, authority_name, authority_cui, cpv_code, cpv_name,
  contract_type, publication_date, contract_date, awarded_value,
  supplier_name, supplier_cui, procedure_type, legislation, source
)
SELECT type, ref_number, authority_name, authority_cui, cpv_code, cpv_name,
       contract_type,
       NULLIF(publication_date, '')::timestamptz,
       NULLIF(contract_date, '')::date,
       NULLIF(awarded_value, '')::numeric,
       supplier_name, supplier_cui, procedure_type, legislation, source
FROM _stage_seap_hist
ON CONFLICT (type, ref_number) DO NOTHING;
COMMIT;

SELECT '$SOURCE' AS source, COUNT(*) AS rows,
       MIN(publication_date)::date AS oldest,
       MAX(publication_date)::date AS newest,
       SUM(awarded_value)::bigint AS total_lei
FROM seap.announcements WHERE source = '$SOURCE';
SQL"

# The remote TSV is removed by the EXIT trap.
echo "[import] done."
|
||||
+73
@@ -0,0 +1,73 @@
|
||||
#!/bin/bash
|
||||
# SEAP historical XLSX importer.
|
||||
# Downloads an xlsx from data.gov.ro, converts to CSV via openpyxl,
|
||||
# then hands it to import-seap-historical.py + the same TSV+psql flow.
|
||||
#
|
||||
# Usage: ./import-seap-xlsx.sh URL TYPE SOURCE [DELETE_FIRST]
|
||||
|
||||
set -euo pipefail

# Fail with a usage hint instead of bash's bare "unbound variable" error.
if [ "$#" -lt 3 ]; then
  echo "Usage: $0 URL TYPE SOURCE [DELETE_FIRST]" >&2
  exit 2
fi

URL="$1"
TYPE="$2"
SOURCE="$3"
DELETE_FIRST="${4:-no}"

# SOURCE is spliced into SQL string literals and into a command string that
# the remote shell re-parses (unquoted heredoc), so restrict it to characters
# that are inert in both contexts.
if [[ ! "$SOURCE" =~ ^[A-Za-z0-9_.-]+$ ]]; then
  echo "[xlsx-import] invalid SOURCE tag '$SOURCE' (allowed: letters, digits, '_', '.', '-')" >&2
  exit 2
fi

# Private, unpredictable scratch directory.
WORK="$(mktemp -d "${TMPDIR:-/tmp}/seap-xlsx.XXXXXX")"
# Per-run remote staging file: the CSV and XLSX wrappers used to share one
# fixed path, so two concurrent imports could overwrite each other's data.
REMOTE_TSV="/tmp/seap-historical-$$-$(date +%s).tsv"
STAGED=0

# Remove local scratch files and, once staging has started, the remote TSV.
# Runs on every exit path, including failures under `set -e`.
cleanup() {
  rm -rf "$WORK"
  if [ "$STAGED" = 1 ]; then
    ssh satra "rm -f '$REMOTE_TSV'" || true
  fi
}
trap cleanup EXIT

XLSX="$WORK/data.xlsx"
CSV="$WORK/data.csv"
TSV="$WORK/data.tsv"
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"

echo "[xlsx-import] downloading: $URL"
# -f: treat HTTP errors as failures so an error page is never parsed as a
# workbook. -S: still print the error while -s hides the progress meter.
# NOTE(review): -k disables TLS certificate verification; kept from the
# original invocation — confirm it is still required for this host.
curl -fsSk --max-time 600 -L "$URL" -o "$XLSX"
echo "[xlsx-import] downloaded: $(stat -c %s "$XLSX") bytes"

echo "[xlsx-import] xlsx → csv..."
python3 "$SCRIPT_DIR/xlsx-to-csv.py" "$XLSX" "$CSV"
echo "[xlsx-import] csv: $(stat -c %s "$CSV") bytes"

echo "[xlsx-import] normalizing CSV → TSV..."
python3 "$SCRIPT_DIR/import-seap-historical.py" "$CSV" "$TSV" "$TYPE" "$SOURCE"

echo "[xlsx-import] copying TSV to satra..."
STAGED=1
scp -q "$TSV" "satra:$REMOTE_TSV"

DELETE_SQL=""
if [ "$DELETE_FIRST" = "yes" ]; then
  DELETE_SQL="DELETE FROM seap.announcements WHERE source = '$SOURCE';"
fi

# The SQL below is expanded locally (inside the double-quoted string) and then
# handed to the remote shell as a heredoc for /tmp/baseline.sh.
# Assumes baseline.sh feeds its stdin to psql, as the \COPY meta-command
# already requires. ON_ERROR_STOP plus BEGIN/COMMIT make the optional DELETE
# and the INSERT atomic: a failed load no longer leaves the source wiped.
ssh satra "/tmp/baseline.sh <<SQL
\\set ON_ERROR_STOP on
BEGIN;
$DELETE_SQL
CREATE TEMP TABLE _stage_seap_hist (
  type text, ref_number text, authority_name text, authority_cui text,
  cpv_code text, cpv_name text, contract_type text, publication_date text,
  contract_date text, awarded_value text, supplier_name text, supplier_cui text,
  procedure_type text, legislation text, source text
);
\\COPY _stage_seap_hist FROM '$REMOTE_TSV' WITH (FORMAT text, DELIMITER E'\\t', HEADER true);
INSERT INTO seap.announcements (
  type, ref_number, authority_name, authority_cui, cpv_code, cpv_name,
  contract_type, publication_date, contract_date, awarded_value,
  supplier_name, supplier_cui, procedure_type, legislation, source
)
SELECT type, ref_number, authority_name, authority_cui, cpv_code, cpv_name,
       contract_type,
       NULLIF(publication_date, '')::timestamptz,
       NULLIF(contract_date, '')::date,
       NULLIF(awarded_value, '')::numeric,
       supplier_name, supplier_cui, procedure_type, legislation, source
FROM _stage_seap_hist
ON CONFLICT (type, ref_number) DO NOTHING;
COMMIT;
SELECT '$SOURCE' AS source, COUNT(*) AS rows,
       MIN(publication_date)::date AS oldest,
       MAX(publication_date)::date AS newest,
       SUM(awarded_value)::bigint AS total_lei
FROM seap.announcements WHERE source = '$SOURCE';
SQL"

# The remote TSV is removed by the EXIT trap.
echo "[xlsx-import] done."
|
||||
@@ -0,0 +1,56 @@
|
||||
/**
|
||||
* Standalone test for CNAS Layout-B parser.
|
||||
*
|
||||
* Reads pdftotext -layout output for the 8 known Layout-B PDFs (the 9th is
* an empty form template), picks a parser from the file name (radio,
* single-CAS, numbered-dot, or judet-grouped as the default), and prints the
* parsed rows for manual inspection.
|
||||
*
|
||||
* Usage:
|
||||
* npx tsx scripts/test-cnas-layout-b.ts /tmp/cnas-pdfs/Lista-furnizori-testare-genetica-2024-2025_all.pdf
|
||||
* npx tsx scripts/test-cnas-layout-b.ts /tmp/cnas-pdfs/*.pdf
|
||||
*/
|
||||
import { execFile } from 'child_process';
|
||||
import { promisify } from 'util';
|
||||
import { basename } from 'path';
|
||||
import { parseProviderTextJudetGrouped, parseProviderTextRadio, parseProviderTextSingleCAS, parseProviderTextNumberedDot } from '../src/cnas-layout-b.js';
|
||||
|
||||
const execFileAsync = promisify(execFile);
|
||||
|
||||
/**
 * Run `pdftotext -layout -enc UTF-8 <pdfPath> -` and resolve with the text it
 * writes to stdout. Output larger than 64 MiB makes the call reject.
 */
async function pdftotextLayout(pdfPath: string): Promise<string> {
  const cliArgs = ['-layout', '-enc', 'UTF-8', pdfPath, '-'];
  const execOptions = { maxBuffer: 64 * 1024 * 1024 };
  const result = await execFileAsync('pdftotext', cliArgs, execOptions);
  return result.stdout;
}
|
||||
|
||||
/**
 * Parse every PDF given on the command line and print the first rows of each.
 *
 * The parser is chosen from the file name; anything that matches none of the
 * special cases goes through the judet-grouped parser. The number of rows
 * printed per file comes from the TEST_LIMIT environment variable
 * (default 20).
 */
async function main() {
  const files = process.argv.slice(2);
  if (files.length === 0) {
    console.error('Usage: tsx test-cnas-layout-b.ts <pdf>...');
    process.exit(1);
  }

  // Parse TEST_LIMIT once, with an explicit radix. A non-numeric value used
  // to yield NaN, which silently printed no rows at all, and a negative value
  // produced a wrong "... and N more" count; both now fall back to 20.
  const parsedLimit = Number.parseInt(process.env.TEST_LIMIT || '', 10);
  const limit = Number.isFinite(parsedLimit) && parsedLimit >= 0 ? parsedLimit : 20;

  for (const f of files) {
    const fn = basename(f);
    console.log(`\n=== ${fn} ===`);
    const text = await pdftotextLayout(f);

    // Dispatch on the file name; order matters because the last branch is the
    // catch-all default.
    let rows;
    if (/radioterapie/i.test(fn)) {
      rows = parseProviderTextRadio(text, { tip: 'radioterapie' });
    } else if (/CAS-GORJ.*PNS/i.test(fn) || /Valori-de-contract-furnizori-PNS/i.test(fn)) {
      rows = parseProviderTextSingleCAS(text, { tip: 'pns', judet: 'GORJ' });
    } else if (/ASISTENTA-MEDICALA-PRIMARA/i.test(fn)) {
      rows = parseProviderTextNumberedDot(text, { tip: 'medicina_familie', judet: 'SIBIU' });
    } else {
      rows = parseProviderTextJudetGrouped(text, { tip: 'oncologie' });
    }

    console.log(`Parsed ${rows.length} rows`);
    const shown = Math.min(rows.length, limit);
    for (let i = 0; i < shown; i++) {
      const r = rows[i];
      console.log(`  [${i + 1}] judet=${r.judet || '-'} name="${r.name}" sediu="${r.sediu || '-'}" tel=${r.telefon || '-'} email=${r.email || '-'} flags=${r.specialitate || '-'}`);
    }
    if (rows.length > limit) console.log(`  ... and ${rows.length - limit} more`);
  }
}
|
||||
|
||||
main().catch((e) => { console.error(e); process.exit(1); });
|
||||
@@ -0,0 +1,95 @@
|
||||
#!/usr/bin/env python3
|
||||
"""XLSX/XLS → CSV converter for SEAP data.gov.ro yearly dumps.
|
||||
|
||||
Reads the active sheet (XLSX) or every sheet concatenated (XLS) and writes a UTF-8 CSV (comma + double-quote) so the
|
||||
existing SEAP normalizer (import-seap-historical.py) can ingest it.
|
||||
|
||||
Auto-detects file format:
|
||||
- XLSX (zip archive) → openpyxl
|
||||
- XLS (BIFF8 OLE) → xlrd 1.x
|
||||
|
||||
Usage: python3 xlsx-to-csv.py INPUT.{xlsx|xls} OUTPUT.csv
|
||||
"""
|
||||
import csv
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def is_xlsx(path: Path) -> bool:
    """Return True when the file at *path* begins with the ZIP magic ``PK``.

    XLSX workbooks are ZIP archives, so the first two bytes are enough to
    tell them apart from the other format this converter handles.
    """
    with path.open("rb") as handle:
        magic = handle.read(2)
    return magic == b"PK"
|
||||
|
||||
|
||||
# Timestamp layout written for every date cell, in both input formats.
_DATE_FORMAT = "%m/%d/%Y %H:%M:%S"


def _number_text(value) -> str:
    """Render a cell value, dropping the ``.0`` from whole-number floats.

    ``float.is_integer()`` is False for inf/NaN, so those are written via
    ``str()`` instead of raising the way ``int(value)`` would.
    """
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value)


def main() -> None:
    """Convert the workbook named in ``sys.argv[1]`` to the CSV in ``sys.argv[2]``.

    XLSX input (detected by ``is_xlsx``) is read from the active sheet via
    openpyxl; anything else is treated as legacy XLS and read via xlrd, with
    all non-empty sheets concatenated. Prints the module docstring and exits
    with status 2 when the argument count is wrong. A one-line row count is
    written to stderr on success.
    """
    if len(sys.argv) != 3:
        print(__doc__)
        sys.exit(2)
    src = Path(sys.argv[1])
    dst = Path(sys.argv[2])
    written = 0

    if is_xlsx(src):
        import openpyxl

        wb = openpyxl.load_workbook(src, read_only=True, data_only=True)
        try:
            ws = wb.active
            with dst.open("w", encoding="utf-8", newline="") as f:
                w = csv.writer(f, quoting=csv.QUOTE_MINIMAL)
                for row in ws.iter_rows(values_only=True):
                    out = []
                    for v in row:
                        if v is None:
                            out.append("")
                        elif isinstance(v, datetime):
                            out.append(v.strftime(_DATE_FORMAT))
                        else:
                            out.append(_number_text(v))
                    w.writerow(out)
                    written += 1
        finally:
            # Read-only workbooks keep the source file open until closed.
            wb.close()
    else:
        # Legacy XLS via xlrd 1.x — concat ALL sheets (some big SEAP files use
        # multiple sheets due to the 65k row limit in old XLS format).
        import xlrd

        b = xlrd.open_workbook(str(src))
        wrote_header = False
        with dst.open("w", encoding="utf-8", newline="") as f:
            w = csv.writer(f, quoting=csv.QUOTE_MINIMAL)
            for sidx, _sname in enumerate(b.sheet_names()):
                sh = b.sheet_by_index(sidx)
                if sh.nrows == 0:
                    continue
                # Only the first non-empty sheet contributes its first row;
                # later sheets are assumed to repeat the header there.
                start = 1 if wrote_header else 0
                wrote_header = True
                for ridx in range(start, sh.nrows):
                    out = []
                    for cell in sh.row(ridx):
                        if cell.ctype in (xlrd.XL_CELL_EMPTY, xlrd.XL_CELL_BLANK):
                            out.append("")
                        elif cell.ctype == xlrd.XL_CELL_DATE:
                            try:
                                tup = xlrd.xldate_as_tuple(cell.value, b.datemode)
                                out.append(datetime(*tup).strftime(_DATE_FORMAT))
                            except Exception:
                                # Deliberate best-effort: keep the raw value
                                # when the serial date cannot be converted.
                                out.append(str(cell.value))
                        elif cell.ctype == xlrd.XL_CELL_NUMBER:
                            out.append(_number_text(cell.value))
                        else:
                            out.append(str(cell.value))
                    w.writerow(out)
                    written += 1
    print(f"[xlsx2csv] {src.name} → {dst.name}: {written} rows", file=sys.stderr)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user