initial: split from gov-agreg — vreau.digital standalone platform
Moved from gov-agreg/src/pages/achizitii/* to root (drop prefix). - 22 pages migrated, 127 files total - All internal links: /achizitii/X → /X (176 occurrences fixed) - AchizitiiLayout subnav rewritten: /X paths, top-right link to vreaudigital.ro hub - BaseLayout new (vreau.digital branding, OG tags, site URL) - astro.config.mjs: site https://vreau.digital, server output (was static) - docker-compose: port 5096 (vreaudigital is 5095), container vreau-digital - deploy.sh: paths /opt/vreau-digital, log /var/log/vreau-digital-deploy.log Backend shared with gov-agreg: - PostgreSQL satra (same schemas: seap, firms, anaf, anre, ...) - Photon, Martin tiles - Infisical /vreaudigital path (DATABASE_URL etc. shared) build: PASS (npx astro check 0 errors, npm run build 5s vite + 10s server)
This commit is contained in:
@@ -0,0 +1,87 @@
|
||||
-- 041_curteacont_cleaned_name_match.sql
|
||||
-- Follow-up to 040 — handles the residual 131 rows whose audited_entity_name
|
||||
-- contains the prefix " ) privind raportul de audit al performantei nr.X, ENTITY"
|
||||
-- (scraper bug: parser kept the prefix instead of just the entity).
|
||||
--
|
||||
-- Extract the entity name via split-on-last-comma, then retry both
|
||||
-- UAT-pattern and strip-parens match.
|
||||
--
|
||||
-- Source bug should also be fixed in services/seap-scraper/src/scrape-curteacont.ts
|
||||
-- but that's a separate task; SQL repair lands the data improvement immediately.
|
||||
|
||||
\timing on
|
||||
|
||||
DROP TABLE IF EXISTS tmp_cdc_residue;
|
||||
CREATE TEMP TABLE tmp_cdc_residue AS
|
||||
SELECT slug_id, audited_entity_name,
|
||||
trim(reverse(split_part(reverse(audited_entity_name), ',', 1))) AS clean_name
|
||||
FROM curteacont.rapoarte
|
||||
WHERE audited_entity_cui IS NULL
|
||||
AND audited_entity_name IS NOT NULL
|
||||
AND audited_entity_name ~ '\) privind raportul';
|
||||
|
||||
-- Pass 1: UAT-pattern on cleaned names
|
||||
WITH cleaned AS (
|
||||
SELECT slug_id, clean_name,
|
||||
firms.normalize_company_name(
|
||||
CASE
|
||||
WHEN clean_name ~* '^UATC ' THEN 'COMUNA ' || regexp_replace(clean_name, '^UATC\s+', '', 'i')
|
||||
WHEN clean_name ~* '^UATJ ' THEN 'JUDETUL ' || regexp_replace(clean_name, '^UATJ\s+', '', 'i')
|
||||
WHEN clean_name ~* '^UATO ' THEN 'ORAS ' || regexp_replace(clean_name, '^UATO\s+', '', 'i')
|
||||
WHEN clean_name ~* '^UATM ' THEN 'MUNICIPIUL ' || regexp_replace(clean_name, '^UATM\s+', '', 'i')
|
||||
WHEN clean_name ~* '^UAT SECTOR(UL)? [1-6]'
|
||||
THEN 'SECTOR ' || substring(clean_name FROM '^UAT SECTOR(?:UL)? ([1-6])')
|
||||
ELSE NULL
|
||||
END
|
||||
) AS expected_norm
|
||||
FROM tmp_cdc_residue
|
||||
),
|
||||
firms_uat AS (
|
||||
SELECT cui, firms.normalize_company_name(regexp_replace(name, '\s*\(.*$', '')) AS norm_stripped
|
||||
FROM firms.entities
|
||||
WHERE name ~* '^(COMUNA |JUDETUL |ORAS |ORASUL |MUNICIPIUL |SECTOR(UL)? [1-6])'
|
||||
),
|
||||
candidates AS (
|
||||
SELECT DISTINCT ON (c.slug_id) c.slug_id, f.cui
|
||||
FROM cleaned c
|
||||
JOIN firms_uat f ON f.norm_stripped = c.expected_norm
|
||||
WHERE c.expected_norm IS NOT NULL
|
||||
ORDER BY c.slug_id, f.cui
|
||||
)
|
||||
UPDATE curteacont.rapoarte r
|
||||
SET audited_entity_cui = c.cui,
|
||||
audited_entity_name = trim(reverse(split_part(reverse(r.audited_entity_name), ',', 1))), -- also fix the name field
|
||||
parsed_at = COALESCE(r.parsed_at, now())
|
||||
FROM candidates c
|
||||
WHERE r.slug_id = c.slug_id AND r.audited_entity_cui IS NULL;
|
||||
|
||||
-- Pass 2: strip-parens exact on cleaned name + ONRC stripped name
|
||||
WITH cleaned AS (
|
||||
SELECT slug_id,
|
||||
trim(reverse(split_part(reverse(audited_entity_name), ',', 1))) AS clean_name
|
||||
FROM curteacont.rapoarte
|
||||
WHERE audited_entity_cui IS NULL
|
||||
AND audited_entity_name ~ '\) privind raportul'
|
||||
),
|
||||
candidates2 AS (
|
||||
SELECT DISTINCT ON (c.slug_id) c.slug_id, e.cui
|
||||
FROM cleaned c
|
||||
JOIN firms.entities e
|
||||
ON firms.normalize_company_name(regexp_replace(e.name, '\s*\(.*$', ''))
|
||||
= firms.normalize_company_name(regexp_replace(c.clean_name, '\s*\(.*$', ''))
|
||||
ORDER BY c.slug_id, e.cui
|
||||
)
|
||||
UPDATE curteacont.rapoarte r
|
||||
SET audited_entity_cui = c.cui,
|
||||
audited_entity_name = trim(reverse(split_part(reverse(r.audited_entity_name), ',', 1))),
|
||||
parsed_at = COALESCE(r.parsed_at, now())
|
||||
FROM candidates2 c
|
||||
WHERE r.slug_id = c.slug_id AND r.audited_entity_cui IS NULL;
|
||||
|
||||
-- Final stats
|
||||
SELECT count(*) AS total,
|
||||
count(audited_entity_cui) AS with_cui,
|
||||
round(100.0 * count(audited_entity_cui) / count(*), 1) AS pct
|
||||
FROM curteacont.rapoarte;
|
||||
|
||||
DROP TABLE tmp_cdc_residue;
|
||||
Reference in New Issue
Block a user