a6c03a091e
Moved from gov-agreg/src/pages/achizitii/* to root (drop prefix). - 22 pages migrated, 127 files total - All internal links: /achizitii/X → /X (176 occurrences fixed) - AchizitiiLayout subnav rewritten: /X paths, top-right link to vreaudigital.ro hub - BaseLayout new (vreau.digital branding, OG tags, site URL) - astro.config.mjs: site https://vreau.digital, server output (was static) - docker-compose: port 5096 (vreaudigital is 5095), container vreau-digital - deploy.sh: paths /opt/vreau-digital, log /var/log/vreau-digital-deploy.log Backend shared with gov-agreg: - PostgreSQL satra (same schemas: seap, firms, anaf, anre, ...) - Photon, Martin tiles - Infisical /vreaudigital path (DATABASE_URL etc. shared) build: PASS (npx astro check 0 errors, npm run build 5s vite + 10s server)
106 lines
4.0 KiB
SQL
106 lines
4.0 KiB
SQL
-- 040_curteacont_uat_pattern_match.sql
-- High-precision CUI match for curteacont.rapoarte using UAT-pattern + strip-parens.
--
-- Curtea de Conturi uses specific abbreviations for territorial units:
--   UATC X                        → COMUNA X
--   UATJ X                        → JUDETUL X
--   UATO X                        → ORASUL X / ORAS X
--   UATM X                        → MUNICIPIUL X
--   UAT SECTOR N (... BUCURESTI)  → SECTOR N
--
-- ONRC stores these with a parenthetical suffix indicating the operating body:
--   "JUDETUL MARAMURES (CONSILIUL JUDETEAN MARAMURES)"
--   "Comuna Surduc (Primaria Comunei Surduc)"
--   "SECTOR 3 (PRIMARIA SECTOR 3 BUCURESTI)"
--
-- Strip the ONRC " (...)" suffix and compare normalized names → exact match.
--
-- Idempotent: the UPDATEs only touch rows WHERE audited_entity_cui IS NULL.

-- psql meta-command: report the elapsed time of each statement that follows.
\timing on

-- Session-level pg_trgm similarity threshold (the original author labels this
-- a "safety reset").
-- NOTE(review): none of the statements visible in this script use a trigram
-- operator, so this presumably only matters for other work in the same
-- session — confirm it is still needed.
SET pg_trgm.similarity_threshold TO 0.78;

-- Build a small prefiltered firms table once (UATs only, ~10K rows per the
-- original estimate) so the UAT match joins against an indexed temp table.
--
-- norm_stripped is the join key. It is derived from the ONRC name by:
--   1. dropping everything from the first "(" onward (operating-body suffix);
--   2. folding the articulated prefixes ORASUL / SECTORUL to ORAS / SECTOR.
--      The Curtea de Conturi side only ever generates 'ORAS X' and 'SECTOR N',
--      so without this step rows admitted by the filter below as 'ORASUL X' or
--      'SECTORUL N' could never match (unless firms.normalize_company_name
--      already folds them, in which case this step is a harmless no-op);
--   3. passing the result through firms.normalize_company_name.
DROP TABLE IF EXISTS tmp_firms_uat;

CREATE TEMP TABLE tmp_firms_uat AS
SELECT
    cui,
    name,
    firms.normalize_company_name(
        regexp_replace(
            regexp_replace(name, '\s*\(.*$', ''),
            '^(ORAS|SECTOR)UL\s+',
            '\1 ',
            'i'
        )
    ) AS norm_stripped
FROM firms.entities
WHERE name ~* '^(COMUNA |JUDETUL |ORAS |ORASUL |MUNICIPIUL |SECTOR(UL)? [1-6])';

-- Equality lookups on the normalized key drive the UAT match.
CREATE INDEX ON tmp_firms_uat (norm_stripped);

-- Fresh statistics so the planner sees the temp table's real size.
ANALYZE tmp_firms_uat;

-- Compute the expected ONRC-form name for each still-unmapped audited entity
-- whose name starts with a Curtea de Conturi UAT abbreviation.
--
-- expected_norm is the normalized ONRC-style name:
--   UATC X       → COMUNA X
--   UATJ X       → JUDETUL X
--   UATO X       → ORAS X
--   UATM X       → MUNICIPIUL X
--   UAT SECTOR N → SECTOR N   (only the sector digit is kept)
-- A name that passes the WHERE filter but fits no CASE branch (e.g.
-- 'UAT SECTOR 9') feeds NULL into firms.normalize_company_name; the stats
-- query that follows counts only rows with a non-NULL expected_norm.
DROP TABLE IF EXISTS tmp_cdc_uat;

CREATE TEMP TABLE tmp_cdc_uat AS
SELECT
    slug_id,
    audited_entity_name,
    firms.normalize_company_name(
        CASE
            WHEN audited_entity_name ~* '^UATC '
                THEN 'COMUNA ' || regexp_replace(audited_entity_name, '^UATC\s+', '', 'i')
            WHEN audited_entity_name ~* '^UATJ '
                THEN 'JUDETUL ' || regexp_replace(audited_entity_name, '^UATJ\s+', '', 'i')
            WHEN audited_entity_name ~* '^UATO '
                THEN 'ORAS ' || regexp_replace(audited_entity_name, '^UATO\s+', '', 'i')
            WHEN audited_entity_name ~* '^UATM '
                THEN 'MUNICIPIUL ' || regexp_replace(audited_entity_name, '^UATM\s+', '', 'i')
            WHEN audited_entity_name ~* '^UAT SECTOR(UL)? [1-6]'
                -- (?i) makes the extraction case-insensitive, matching the ~*
                -- test above. Without it a mixed-case name such as
                -- 'UAT Sector 3' enters this branch but substring() returns
                -- NULL, silently discarding the row.
                THEN 'SECTOR ' || substring(audited_entity_name FROM '(?i)^UAT SECTOR(?:UL)? ([1-6])')
            ELSE NULL
        END
    ) AS expected_norm
FROM curteacont.rapoarte
WHERE audited_entity_cui IS NULL
  AND audited_entity_name IS NOT NULL
  AND audited_entity_name ~* '^(UATC |UATJ |UATO |UATM |UAT SECTOR)';

-- Pre-update baseline: how many unmapped UAT rows produced a usable
-- expected name. count(col) skips NULLs, so no WHERE clause is required.
SELECT count(expected_norm) AS unmapped_uat_rows
FROM tmp_cdc_uat;

-- Apply the UAT match: copy the CUI of the ONRC entity whose stripped,
-- normalized name equals the expected name.
--
-- Only unambiguous matches are written. Several entities can share one
-- normalized name (for instance same-named territorial units), and picking
-- the lowest CUI among them would assign an arbitrary entity. Because this
-- script only ever touches rows whose audited_entity_cui IS NULL, such a wrong
-- assignment would never be corrected by a later run. Ambiguous rows are left
-- NULL for a more specific matcher.
WITH candidates AS (
    SELECT
        c.slug_id,
        min(f.cui) AS cui   -- exactly one distinct value, enforced by HAVING
    FROM tmp_cdc_uat AS c
    INNER JOIN tmp_firms_uat AS f
        ON f.norm_stripped = c.expected_norm
    GROUP BY c.slug_id
    HAVING count(DISTINCT f.cui) = 1
)
UPDATE curteacont.rapoarte AS r
SET audited_entity_cui = c.cui,
    -- keep an existing parse timestamp; stamp now() only when it was unset
    parsed_at = COALESCE(r.parsed_at, now())
FROM candidates AS c
WHERE r.slug_id = c.slug_id
  AND r.audited_entity_cui IS NULL;   -- never overwrite an existing mapping

-- Fallback exact-match path for non-UAT names (ministries etc.):
-- compare the normalized audited_entity_name directly with the normalized
-- firms.entities.name after stripping its " (...)" suffix.
--
-- As with the UAT path, only unambiguous matches are written: generic names
-- can normalize to the same key for many entities, and an arbitrary pick would
-- be permanent because only NULL rows are ever updated.
-- NOTE(review): the join evaluates firms.normalize_company_name over every
-- row of firms.entities unless a matching expression index exists — confirm.
WITH cdc_non_uat AS (
    SELECT
        slug_id,
        firms.normalize_company_name(audited_entity_name) AS norm
    FROM curteacont.rapoarte
    WHERE audited_entity_cui IS NULL
      AND audited_entity_name IS NOT NULL
      AND audited_entity_name !~* '^(UATC |UATJ |UATO |UATM |UAT SECTOR)'
),
candidates2 AS (
    SELECT
        c.slug_id,
        min(e.cui) AS cui   -- exactly one distinct value, enforced by HAVING
    FROM cdc_non_uat AS c
    INNER JOIN firms.entities AS e
        ON firms.normalize_company_name(regexp_replace(e.name, '\s*\(.*$', '')) = c.norm
    -- An empty key carries no identifying information: a name consisting only
    -- of a parenthetical strips to '' before normalization, and must not be
    -- matched against an audited name that also normalizes to ''.
    WHERE c.norm <> ''
    GROUP BY c.slug_id
    HAVING count(DISTINCT e.cui) = 1
)
UPDATE curteacont.rapoarte AS r
SET audited_entity_cui = c.cui,
    -- keep an existing parse timestamp; stamp now() only when it was unset
    parsed_at = COALESCE(r.parsed_at, now())
FROM candidates2 AS c
WHERE r.slug_id = c.slug_id
  AND r.audited_entity_cui IS NULL;   -- never overwrite an existing mapping

-- Final stats: total reports, reports with a CUI, and coverage percentage
-- rounded to one decimal.
-- NULLIF turns an empty table into a NULL percentage instead of a
-- division-by-zero error that would abort the script before cleanup.
SELECT
    count(*) AS total,
    count(audited_entity_cui) AS with_cui,
    round(100.0 * count(audited_entity_cui) / NULLIF(count(*), 0), 1) AS pct
FROM curteacont.rapoarte;

-- No rollup refresh is needed here: per the original note, no materialized
-- view is defined yet for curteacont and the autoritate profile reads the
-- table live.

-- Drop both scratch tables in a single statement (RESTRICT is the default
-- behaviour, spelled out for clarity).
DROP TABLE tmp_firms_uat, tmp_cdc_uat RESTRICT;