Files
vreau-digital/services/seap-scraper/sql/039_bugetar_uat_pattern_match.sql
Claude VM a6c03a091e initial: split from gov-agreg — vreau.digital standalone platform
Moved from gov-agreg/src/pages/achizitii/* to root (drop prefix).
- 22 pages migrated, 127 files total
- All internal links: /achizitii/X → /X (176 occurrences fixed)
- AchizitiiLayout subnav rewritten: /X paths, top-right link to vreaudigital.ro hub
- BaseLayout new (vreau.digital branding, OG tags, site URL)
- astro.config.mjs: site https://vreau.digital, server output (was static)
- docker-compose: port 5096 (vreaudigital is 5095), container vreau-digital
- deploy.sh: paths /opt/vreau-digital, log /var/log/vreau-digital-deploy.log

Backend shared with gov-agreg:
- PostgreSQL satra (same schemas: seap, firms, anaf, anre, ...)
- Photon, Martin tiles
- Infisical /vreaudigital path (DATABASE_URL etc. shared)

build: PASS (npx astro check 0 errors, npm run build 5s vite + 10s server)
2026-05-13 00:10:32 +03:00

63 lines
2.7 KiB
SQL

-- 039_bugetar_uat_pattern_match.sql
-- High-precision CUI match for bugetar.entitate UAT entries by stripping
-- ONRC parenthetical suffix "X (PRIMARIA Y)" and comparing normalized names.
--
-- Replaces the trgm-based 038 attempt which was too slow (90+ min, low yield)
-- and hit false positives on the COMUNA-PRIMARIA naming gap.
--
-- Insight: ONRC stores comune/orașe with parenthetical suffix:
-- "Comuna Surduc (Primaria Comunei Surduc)"
-- "COMUNA CIZER (PRIMARIA)"
-- "Comuna Mesesenii de Jos (Primaria Mesesenii de Jos Jud. Salaj)"
-- Bugetar entries are clean: "COMUNA SURDUC". Stripping ' (...)' from ONRC
-- name and comparing normalized → exact match → high-confidence resolve.
--
-- Idempotent: UPDATEs only WHERE cui IS NULL. Threshold-agnostic.
-- Test runtime: ~1.7s per judet, 42 judete → ~70s total.
\timing on
SET pg_trgm.similarity_threshold = 0.78; -- not used here but reset for safety
WITH judet_map AS (
SELECT * FROM (VALUES
('AB','ALBA'),('AG','ARGES'),('AR','ARAD'),('B','MUNICIPIUL BUCURESTI'),
('BC','BACAU'),('BH','BIHOR'),('BN','BISTRITA-NASAUD'),('BR','BRAILA'),
('BT','BOTOSANI'),('BV','BRASOV'),('BZ','BUZAU'),('CJ','CLUJ'),
('CL','CALARASI'),('CS','CARAS-SEVERIN'),('CT','CONSTANTA'),('CV','COVASNA'),
('DB','DAMBOVITA'),('DJ','DOLJ'),('GJ','GORJ'),('GL','GALATI'),
('GR','GIURGIU'),('HD','HUNEDOARA'),('HR','HARGHITA'),('IF','ILFOV'),
('IL','IALOMITA'),('IS','IASI'),('MH','MEHEDINTI'),('MM','MARAMURES'),
('MS','MURES'),('NT','NEAMT'),('OT','OLT'),('PH','PRAHOVA'),
('SB','SIBIU'),('SJ','SALAJ'),('SM','SATU MARE'),('SV','SUCEAVA'),
('TL','TULCEA'),('TM','TIMIS'),('TR','TELEORMAN'),('VL','VALCEA'),
('VN','VRANCEA'),('VS','VASLUI')
) AS m(code, name_ascii)
),
candidates AS (
SELECT DISTINCT ON (b.id) b.id, e.cui, e.name AS firm_name
FROM bugetar.entitate b
JOIN judet_map jm ON jm.code = b.judet
JOIN firms.entities e
ON firms.normalize_company_name(regexp_replace(e.name, '\s*\(.*$', '')) = firms.normalize_company_name(b.entity_name)
AND UPPER(translate(COALESCE(e.adr_judet,''), 'ŞȘŢȚăâîĂÂÎ', 'SSTTAAIAAI')) = jm.name_ascii
WHERE b.cui IS NULL
ORDER BY b.id, e.cui -- deterministic when multiple firms share normalized stripped name
)
UPDATE bugetar.entitate b
SET cui = c.cui,
cui_match_score = 0.95::numeric(5,2), -- high-confidence stripped-exact match
cui_match_method = 'uat_pattern',
updated_at = now()
FROM candidates c
WHERE b.id = c.id AND b.cui IS NULL;
-- Final stats
SELECT cui_match_method, count(*),
round(avg(cui_match_score)::numeric, 2) AS avg_score
FROM bugetar.entitate
WHERE cui IS NOT NULL
GROUP BY 1
ORDER BY 2 DESC;
SELECT 'unmatched' AS t, count(*) FROM bugetar.entitate WHERE cui IS NULL;