#!/usr/bin/env python3
"""
Fast CUI → location resolver using ANAF dateidentificare bulk CSV.

Reads 726MB CSV, matches against our 14K+ CUI list, updates DB.

Pipeline (see ``main``):
  1. collect the CUIs referenced by the SEAP tables,
  2. make sure ``seap.cui_location`` exists,
  3. stream the ANAF CSV and upsert name/city/county for the needed CUIs,
  4. resolve each location to a SIRUTA code (exact, then trigram-fuzzy),
  5. copy the SIRUTA onto acquisitions and notices,
  6. rebuild the ``seap.uat_procurement_stats`` materialized view.
"""
import csv
import os
import sys

import psycopg2
from psycopg2.extras import execute_values

# NOTE(security): the fallback DSN embeds real credentials in source control.
# It is kept only so existing invocations without DATABASE_URL keep working;
# the password should be rotated and supplied via the environment instead.
DB_URL = os.environ.get('DATABASE_URL', 'postgresql://architools_user:stictMyFon34!_gonY@10.10.10.166:5432/architools_db')
ANAF_CSV = os.path.join(os.path.dirname(__file__), 'data', 'dateidentificare2025.csv')

# Number of matched rows accumulated before they are upserted and committed.
BATCH_SIZE = 5000
# Emit a progress line every this many CSV rows.
PROGRESS_EVERY = 500000


def main():
    """Run the full CUI → location → SIRUTA resolution pipeline.

    Connects using ``DB_URL``, commits after each step, and always closes the
    connection, including when a step raises.
    """
    conn = psycopg2.connect(DB_URL)
    try:
        with conn.cursor() as cur:
            # Step 1: Get all unique CUIs we need to resolve
            print("Loading CUI list from DB...")
            needed_cuis = _load_needed_cuis(cur)
            print(f" Need location for {len(needed_cuis)} CUIs")

            # Step 2: Ensure cui_location table exists
            _ensure_location_table(cur)
            conn.commit()

            # Step 3: Read ANAF CSV and match
            print(f"Reading ANAF CSV: {ANAF_CSV}...")
            line_count, matched = _import_anaf_locations(conn, cur, needed_cuis)
            print(f"\n Total lines: {line_count}")
            print(f" Matched CUIs: {matched} / {len(needed_cuis)}")

            # Step 4: Match cui_location → SIRUTA
            print("\nMatching locations to SIRUTA...")
            exact, fuzzy = _match_siruta(cur)
            print(f" Exact match: {exact}")
            print(f" Fuzzy match: {fuzzy}")
            conn.commit()

            # Step 5: Propagate SIRUTA to DA records
            print("\nUpdating DA records with SIRUTA...")
            da_updated, pn_updated = _propagate_siruta(cur)
            print(f" DA records updated: {da_updated}")
            print(f" Notice records updated: {pn_updated}")
            conn.commit()

            # Step 6: Refresh materialized view
            print("\nRefreshing materialized view...")
            _rebuild_stats_view(cur)
            conn.commit()

            # Final stats
            _print_summary(cur)
    finally:
        # Close even on failure; uncommitted work is discarded with the session.
        conn.close()


def _normalize_cui(raw):
    """Return the digits-only form of a fiscal code, or None if unusable.

    Surrounding whitespace and a leading ``RO`` prefix (any letter case) are
    removed. Only a *leading* prefix is stripped, so values with letters
    elsewhere are rejected instead of being silently turned into digits.
    """
    if raw is None:
        return None
    text = str(raw).strip()
    if text[:2].upper() == 'RO':
        text = text[2:].strip()
    return text if text.isdigit() else None


def _load_needed_cuis(cur):
    """Return the set of normalized CUIs referenced by the SEAP tables."""
    cur.execute("""
        SELECT DISTINCT authority_cui FROM seap.direct_acquisitions
        WHERE authority_cui IS NOT NULL
        UNION
        SELECT DISTINCT supplier_cui FROM seap.direct_acquisitions
        WHERE supplier_cui IS NOT NULL
        UNION
        SELECT DISTINCT authority_cui FROM seap.public_notices
        WHERE authority_cui IS NOT NULL
    """)
    candidates = (_normalize_cui(row[0]) for row in cur.fetchall())
    return {cui for cui in candidates if cui is not None}


def _ensure_location_table(cur):
    """Create ``seap.cui_location`` if needed and add the ``siruta`` column."""
    cur.execute("""
        CREATE TABLE IF NOT EXISTS seap.cui_location (
            cui TEXT PRIMARY KEY,
            name TEXT,
            city TEXT,
            county TEXT,
            updated_at TIMESTAMPTZ DEFAULT now()
        )
    """)
    # Add siruta column if missing
    cur.execute("ALTER TABLE seap.cui_location ADD COLUMN IF NOT EXISTS siruta TEXT")


def _import_anaf_locations(conn, cur, needed_cuis):
    """Stream the ANAF CSV and upsert locations for the CUIs we need.

    Rows are committed in batches of ``BATCH_SIZE``. Rows that are too short
    to hold every column of interest, or that have no city, are skipped.

    Returns:
        ``(line_count, matched)``: the number of data rows read and the
        number of distinct CUIs for which a row with a city was found.

    Raises:
        ValueError: if the CSV has no header row.
    """
    matched_cuis = set()
    batch = []
    line_count = 0

    # newline='' lets the csv module do its own line-ending handling.
    with open(ANAF_CSV, 'r', encoding='iso-8859-16', errors='replace', newline='') as f:
        reader = csv.reader(f, delimiter='^')
        headers = next(reader, None)
        if headers is None:
            raise ValueError(f"ANAF CSV has no header row: {ANAF_CSV}")

        # Find column indices; the positional defaults are used when a header
        # name is not present in the file.
        h_map = {h.strip().upper(): i for i, h in enumerate(headers)}
        cui_idx = h_map.get('COD_FISCAL', 0)
        name_idx = h_map.get('DENUMIRE', 1)
        city_idx = h_map.get('LOCALITATE', 5)
        county_idx = h_map.get('JUDET', 22)  # JUDET is col 22 (not JUDET_COMERT which is 13)
        print(f" Columns: CUI={cui_idx}, Name={name_idx}, City={city_idx}, County={county_idx}")
        print(f" Headers sample: {headers[:8]}")

        # Hoisted: the shortest row length that still contains every column.
        min_len = max(cui_idx, name_idx, city_idx, county_idx) + 1

        for row in reader:
            line_count += 1
            if line_count % PROGRESS_EVERY == 0:
                print(f" Processed {line_count} lines, matched {len(matched_cuis)}...")

            if len(row) < min_len:
                continue

            cui = _normalize_cui(row[cui_idx])
            if cui not in needed_cuis:
                continue

            # Blank / whitespace-only fields become NULL so the upsert's
            # COALESCE keeps any previously stored value.
            name = row[name_idx].strip() or None
            city = row[city_idx].strip() or None
            county = row[county_idx].strip() or None
            if not city:
                continue

            batch.append((cui, name, city, county))
            matched_cuis.add(cui)

            if len(batch) >= BATCH_SIZE:
                _insert_batch(cur, batch)
                conn.commit()
                batch = []

    if batch:
        _insert_batch(cur, batch)
        conn.commit()

    return line_count, len(matched_cuis)


def _match_siruta(cur):
    """Fill ``cui_location.siruta`` from ``public."GisUat"``.

    Runs an exact match on normalized city + county first, then a fuzzy match
    (``similarity`` > 0.3 within the same county, best score wins) for rows
    still unresolved. Relies on ``seap.normalize_locality`` and ``similarity``
    being available in the database.

    Returns:
        ``(exact, fuzzy)`` row counts reported by the two UPDATE statements.
    """
    # Exact match
    cur.execute("""
        UPDATE seap.cui_location cl
        SET siruta = u.siruta
        FROM public."GisUat" u
        WHERE cl.siruta IS NULL
          AND cl.city IS NOT NULL
          AND cl.county IS NOT NULL
          AND seap.normalize_locality(u.name) = seap.normalize_locality(cl.city)
          AND seap.normalize_locality(u.county) = seap.normalize_locality(cl.county)
    """)
    exact = cur.rowcount

    # Fuzzy match
    cur.execute("""
        UPDATE seap.cui_location cl
        SET siruta = sub.siruta
        FROM (
            SELECT DISTINCT ON (cl2.cui)
                   cl2.cui,
                   u.siruta,
                   similarity(seap.normalize_locality(u.name), seap.normalize_locality(cl2.city)) AS score
            FROM seap.cui_location cl2
            JOIN public."GisUat" u
              ON seap.normalize_locality(u.county) = seap.normalize_locality(cl2.county)
            WHERE cl2.siruta IS NULL
              AND cl2.city IS NOT NULL
              AND cl2.county IS NOT NULL
              AND similarity(seap.normalize_locality(u.name), seap.normalize_locality(cl2.city)) > 0.3
            ORDER BY cl2.cui, score DESC
        ) sub
        WHERE cl.cui = sub.cui
    """)
    fuzzy = cur.rowcount
    return exact, fuzzy


def _propagate_siruta(cur):
    """Copy resolved SIRUTA codes onto direct acquisitions and public notices.

    Returns:
        ``(da_updated, pn_updated)`` row counts of the two UPDATE statements.
    """
    # NOTE(review): these joins compare the raw authority_cui with the
    # normalized cui_location.cui, so rows whose authority_cui carries an
    # "RO" prefix or padding will not be updated — confirm the column format.
    cur.execute("""
        UPDATE seap.direct_acquisitions da
        SET authority_siruta = cl.siruta
        FROM seap.cui_location cl
        WHERE da.authority_cui = cl.cui
          AND cl.siruta IS NOT NULL
          AND (da.authority_siruta IS NULL OR da.authority_siruta != cl.siruta)
    """)
    da_updated = cur.rowcount

    cur.execute("""
        UPDATE seap.public_notices pn
        SET authority_siruta = cl.siruta
        FROM seap.cui_location cl
        WHERE pn.authority_cui = cl.cui
          AND cl.siruta IS NOT NULL
          AND (pn.authority_siruta IS NULL OR pn.authority_siruta != cl.siruta)
    """)
    pn_updated = cur.rowcount
    return da_updated, pn_updated


def _rebuild_stats_view(cur):
    """Drop and recreate ``seap.uat_procurement_stats`` with its unique index."""
    cur.execute("DROP MATERIALIZED VIEW IF EXISTS seap.uat_procurement_stats")
    cur.execute("""
        CREATE MATERIALIZED VIEW seap.uat_procurement_stats AS
        SELECT
            u.siruta,
            u.name AS uat_name,
            u.county,
            COALESCE(da_s.da_count, 0)::bigint AS da_count,
            COALESCE(da_s.da_total_value, 0)::numeric AS da_total_value,
            COALESCE(pn_s.notice_count, 0)::bigint AS notice_count,
            COALESCE(pn_s.notice_total_value, 0)::numeric AS notice_total_value,
            (COALESCE(da_s.da_count, 0) + COALESCE(pn_s.notice_count, 0))::bigint AS total_contracts,
            (COALESCE(da_s.da_total_value, 0) + COALESCE(pn_s.notice_total_value, 0))::numeric AS total_value
        FROM public."GisUat" u
        LEFT JOIN (
            SELECT authority_siruta AS siruta,
                   COUNT(*) AS da_count,
                   SUM(closing_value) AS da_total_value
            FROM seap.direct_acquisitions
            WHERE authority_siruta IS NOT NULL
            GROUP BY authority_siruta
        ) da_s ON da_s.siruta = u.siruta
        LEFT JOIN (
            SELECT authority_siruta AS siruta,
                   COUNT(*) AS notice_count,
                   SUM(contract_value) AS notice_total_value
            FROM seap.public_notices
            WHERE authority_siruta IS NOT NULL
            GROUP BY authority_siruta
        ) pn_s ON pn_s.siruta = u.siruta
    """)
    cur.execute("CREATE UNIQUE INDEX idx_ups_siruta ON seap.uat_procurement_stats(siruta)")


def _print_summary(cur):
    """Print how many CUIs were located and how many UATs have contracts."""
    cur.execute("SELECT COUNT(*) FROM seap.uat_procurement_stats WHERE total_contracts > 0")
    uats = cur.fetchone()[0]
    cur.execute("SELECT COUNT(*) FROM seap.cui_location WHERE siruta IS NOT NULL")
    located = cur.fetchone()[0]
    cur.execute("SELECT COUNT(*) FROM seap.cui_location")
    total_cui = cur.fetchone()[0]

    print("\n=== Done ===")
    print(f" CUI located: {located} / {total_cui}")
    print(f" UATs with data: {uats}")


def _insert_batch(cur, batch):
    """Upsert ``(cui, name, city, county)`` tuples into ``seap.cui_location``.

    The batch is de-duplicated on ``cui`` first (the last occurrence wins):
    PostgreSQL refuses an ``INSERT ... ON CONFLICT DO UPDATE`` statement that
    would touch the same row twice, which would otherwise abort the import
    when the CSV lists a CUI more than once within one statement's rows.
    Existing column values are kept when the incoming value is NULL.
    """
    if not batch:
        return
    unique_rows = list({row[0]: row for row in batch}.values())
    execute_values(cur, """
        INSERT INTO seap.cui_location (cui, name, city, county)
        VALUES %s
        ON CONFLICT (cui) DO UPDATE SET
            name = COALESCE(EXCLUDED.name, seap.cui_location.name),
            city = COALESCE(EXCLUDED.city, seap.cui_location.city),
            county = COALESCE(EXCLUDED.county, seap.cui_location.county),
            updated_at = now()
    """, unique_rows)


if __name__ == '__main__':
    main()