#!/usr/bin/env python3 """Load EU CPV nomenclature (RO names) into seap.cpv_codes + backfill announcements.""" import json import os import sys import urllib.request from pathlib import Path import psycopg2 from psycopg2.extras import execute_values CPV_URL = 'https://raw.githubusercontent.com/samhallskod/cpv-eu/main/data/cpv.json' def main(): db = os.environ.get('DATABASE_URL') if not db: print('DATABASE_URL not set', file=sys.stderr) sys.exit(1) cache = Path('/tmp/cpv.json') if not cache.exists(): print(f'Downloading {CPV_URL}...') urllib.request.urlretrieve(CPV_URL, cache) data = json.loads(cache.read_text()) print(f'Loaded {len(data)} CPV codes from JSON') rows = [] for entry in data: code = entry['code'] # 8-digit (no dash) labels = entry.get('labels', {}) name_ro = labels.get('ro') or labels.get('en') or code name_en = labels.get('en') level = entry.get('level', 0) emoji = entry.get('emoji') # Division = first 2 digits + 6 zeros division = code[:2] + '000000' # Parent = level - 1 — easiest: trim trailing zero pairs based on level # Level mapping: 1=XX000000, 2=XXX00000, 3=XXXX0000, 4=XXXXX000, 5=XXXXXX00, 6=XXXXXXX0, 7=XXXXXXXX (full) # For our purposes, parent is one level up — replace last non-zero digit with 0 parent = None if level > 1: # find rightmost non-zero, zero it out digits = list(code) # Find position of last meaningful digit for i in range(len(digits) - 1, -1, -1): if digits[i] != '0': digits[i] = '0' break parent_code = ''.join(digits) if parent_code != code: parent = parent_code # Build full code with check digit (synthesize a placeholder dash + 0) # The real check digit is per-code — we don't compute it; use the 8-digit form everywhere. rows.append((code, code, name_ro, name_en, level, division, parent, emoji)) print(f'Inserting {len(rows)} rows into seap.cpv_codes...') conn = psycopg2.connect(db) cur = conn.cursor() cur.execute('TRUNCATE seap.cpv_codes') execute_values(cur, """ INSERT INTO seap.cpv_codes (code, code_full, name_ro, name_en, level, division_code, parent_code, emoji) VALUES %s """, rows, page_size=500) conn.commit() cur.execute('SELECT count(*), count(*) FILTER (WHERE level=1) FROM seap.cpv_codes') total, divisions = cur.fetchone() print(f' → {total} codes loaded, {divisions} top-level divisions') # Backfill cpv_division + cpv_name_ro on announcements print('Backfilling cpv_division + cpv_name_ro on announcements...') cur.execute(""" UPDATE seap.announcements SET cpv_division = seap.cpv_division(cpv_code), cpv_name_ro = seap.cpv_name(cpv_code) WHERE cpv_code IS NOT NULL AND (cpv_division IS NULL OR cpv_name_ro IS NULL) """) affected = cur.rowcount conn.commit() print(f' → {affected:,} announcement rows enriched') # Show top divisions cur.execute(""" SELECT a.cpv_division, c.name_ro, c.emoji, count(*) as n, sum(awarded_value)::numeric(15,0) as total_value FROM seap.announcements a LEFT JOIN seap.cpv_codes c ON c.code = a.cpv_division WHERE a.cpv_division IS NOT NULL AND a.source LIKE 'wsp_%' GROUP BY 1, 2, 3 ORDER BY 4 DESC LIMIT 15 """) print('\nTop 15 CPV divisions in WSP data:') for r in cur.fetchall(): print(f' {r[2] or " "} {r[0]:<10} {(r[1] or "?")[:45]:<45} {r[3]:>5} contracte {r[4] or 0:>13,} RON') conn.close() if __name__ == '__main__': main()