Files
vreau-digital/services/seap-scraper/sql/036_apia.sql
T
Claude VM a6c03a091e initial: split from gov-agreg — vreau.digital standalone platform
Moved from gov-agreg/src/pages/achizitii/* to root (drop prefix).
- 22 pages migrated, 127 files total
- All internal links: /achizitii/X → /X (176 occurrences fixed)
- AchizitiiLayout subnav rewritten: /X paths, top-right link to vreaudigital.ro hub
- BaseLayout new (vreau.digital branding, OG tags, site URL)
- astro.config.mjs: site https://vreau.digital, server output (was static)
- docker-compose: port 5096 (vreaudigital is 5095), container vreau-digital
- deploy.sh: paths /opt/vreau-digital, log /var/log/vreau-digital-deploy.log

Backend shared with gov-agreg:
- PostgreSQL satra (same schemas: seap, firms, anaf, anre, ...)
- Photon, Martin tiles
- Infisical /vreaudigital path (DATABASE_URL etc. shared)

build: PASS (npx astro check 0 errors, npm run build 5s vite + 10s server)
2026-05-13 00:10:32 +03:00

236 lines
11 KiB
PL/PgSQL
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
-- 036_apia.sql
-- APIA — Agenția de Plăți și Intervenție pentru Agricultură.
-- "Lista fermierilor" — annual list of farmers receiving direct payments
-- (subvenții) per UAT campaign. Covers ~each commune that publishes a list
-- via data.gov.ro (currently 2024 only; one comuna live, more on the way).
--
-- Sources investigated 2026-05-10:
-- 1. data.gov.ro CKAN — only ONE published "Lista fermieri APIA" XLSX
-- lives at /api/3/action/package_show?id=lista-fermierilor-campania-apia-2024
-- (single resource: comuna Găgești, jud. Vaslui, 192 farmers).
-- The schema is per-comuna so future ingests over the same package
-- will multiply rows linearly.
-- 2. https://www.apia.org.ro/ — bot-blocked (HTTP 403 from non-browser
-- User-Agents). National-level lists exist on APIA's site but require
-- JS / browser session to retrieve. Out of scope for this pass.
-- 3. AFIR FEGA dump (fonduri.afir_plati WHERE tip_fond='FEGA', 4.29M rows
-- for 2023+2024) is the *closest* national equivalent — it contains
-- payment amounts but no SUPRAFATA (hectares) and no
-- RESPONSABIL UAT / CENTRUL APIA fields.
--
-- This schema is therefore intentionally narrow but extensible:
-- - one row per (campaign_year, name, comuna, sat) — natural composite key
-- - source_dataset_id + source_resource_id on every row → idempotent re-ingest
-- - cui populated only when the row is a legal person (SC ... SRL / PFA);
-- for natural persons (CNP-keyed in source) cui stays NULL
--
-- Cross-source value:
-- apia.fermieri.cui × fonduri.afir_plati(tip_fond='FEGA').cui
-- = "Fermier in lista APIA care apare ȘI in plățile FEGA AFIR" — sanity
-- check duplicate-receipt audit. APIA list shows hectares declared,
-- FEGA shows EUR plătiți; ratio EUR/ha → outliers.
-- apia.fermieri.cui × anaf.datornici.cui
-- = "Fermier (PFA/SRL) cu datorii la stat care primește subvenții APIA"
-- — direct red flag.
-- apia.fermieri.name (PF, no CUI) × ani.declaratii.persoana_name
-- = persoane cu funcții publice care primesc subvenții agricole.
-- apia.fermieri.cui × seap.announcements.supplier_cui
-- = ferme care iau și subvenții și contracte publice.
CREATE SCHEMA IF NOT EXISTS apia;
-- ── 1. Lista fermieri ─────────────────────────────────────────────────────
-- One row per farmer × campaign × comuna × sat. Composite uniqueness chosen
-- because data.gov.ro source has no national ID column (no CNP/CUI per row).
-- 'name' is raw "NUME PRENUME" string from the published XLSX.
CREATE TABLE IF NOT EXISTS apia.fermieri (
id bigserial PRIMARY KEY,
campaign_year smallint NOT NULL, -- e.g. 2024 (also 2023 SUPRAFATA exists in same row but campaign year is publication year)
name text NOT NULL, -- raw "NUME PRENUME" or "SC ... SRL"
name_normalized text, -- firms.normalize_company_name(name) — only when looks like PJ
cui text, -- only if matched to firms.entities (PJ rows like "SC X SRL")
cui_match_method text, -- 'exact_norm' | 'trgm_unique' | NULL
cui_match_score numeric(4,3),
is_legal_person boolean, -- guessed from name shape (SC, SRL, PFA, II, IF, SA prefixes/suffixes)
judet text, -- enriched via centru_apia mapping (Găgești → VS Vaslui)
comuna_oras text, -- raw "COMUNA/ORAS" cell
sat text,
centru_apia text, -- "CENTRUL APIA" (e.g. MURGENI)
responsabil_uat text, -- "RESPONSABIL UAT 2024" (the UAT employee, not the farmer)
suprafata_ha numeric(12,4), -- "SUPRAFATA 2023" hectares, decimal allowed (e.g. 1.04, 12.45)
source_dataset_id text NOT NULL, -- CKAN package_id, e.g. 'lista-fermierilor-campania-apia-2024'
source_resource_id text NOT NULL, -- CKAN resource_id (UUID)
source_url text NOT NULL, -- direct XLSX download URL
fetched_at timestamptz NOT NULL DEFAULT now(),
-- NULLS NOT DISTINCT: treat NULL sat as a single value so we don't get
-- duplicate rows when source omits sat for some farmers.
UNIQUE NULLS NOT DISTINCT (campaign_year, name, comuna_oras, sat)
);
CREATE INDEX IF NOT EXISTS idx_apia_fermieri_cui ON apia.fermieri(cui) WHERE cui IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_apia_fermieri_year ON apia.fermieri(campaign_year);
CREATE INDEX IF NOT EXISTS idx_apia_fermieri_judet ON apia.fermieri(judet);
CREATE INDEX IF NOT EXISTS idx_apia_fermieri_comuna ON apia.fermieri(comuna_oras);
CREATE INDEX IF NOT EXISTS idx_apia_fermieri_centru ON apia.fermieri(centru_apia);
CREATE INDEX IF NOT EXISTS idx_apia_fermieri_resource ON apia.fermieri(source_resource_id);
CREATE INDEX IF NOT EXISTS idx_apia_fermieri_suprafata ON apia.fermieri(suprafata_ha DESC NULLS LAST);
CREATE INDEX IF NOT EXISTS idx_apia_fermieri_name_trgm ON apia.fermieri USING gin (name_normalized gin_trgm_ops);
COMMENT ON TABLE apia.fermieri IS
'Lista fermierilor publicată de APIA prin UAT-uri pe data.gov.ro. '
'Sursă primară: CKAN package "lista-fermierilor-campania-apia-2024". '
'Cross-source cu fonduri.afir_plati (FEGA) și anaf.datornici.';
COMMENT ON COLUMN apia.fermieri.suprafata_ha IS
'Hectare declarate — coloana "SUPRAFATA 2023" (precedent campaign).';
COMMENT ON COLUMN apia.fermieri.is_legal_person IS
'true = nume conține markeri "SC/SRL/SA/PFA/II/IF/CABINET" → potențial match firms.entities.';
COMMENT ON COLUMN apia.fermieri.cui_match_method IS
'exact_norm = match exact pe firms.normalize_company_name; '
'trgm_unique = match trigram unic peste 0.85; NULL = nepotrivit (probabil PF).';
-- ── 1b. Staging table (used by importer for COPY → INSERT pipeline) ─────
CREATE TABLE IF NOT EXISTS apia.staging_fermieri (
campaign_year text,
name text,
comuna_oras text,
sat text,
centru_apia text,
responsabil_uat text,
suprafata_ha text,
source_dataset_id text,
source_resource_id text,
source_url text
);
COMMENT ON TABLE apia.staging_fermieri IS
'Tabel de staging pentru importul XLSX→COPY. TRUNCATE între import-uri.';
-- ── 2. Scrape log ─────────────────────────────────────────────────────────
-- One row per CKAN-resource ingest. Useful for "ce am importat când" history.
CREATE TABLE IF NOT EXISTS apia.scrape_log (
id bigserial PRIMARY KEY,
source_dataset_id text NOT NULL,
source_resource_id text NOT NULL,
source_url text NOT NULL,
campaign_year smallint NOT NULL,
rows_seen integer NOT NULL DEFAULT 0,
rows_inserted integer NOT NULL DEFAULT 0,
rows_updated integer NOT NULL DEFAULT 0,
rows_matched_cui integer NOT NULL DEFAULT 0,
duration_ms integer NOT NULL DEFAULT 0,
started_at timestamptz NOT NULL,
finished_at timestamptz NOT NULL DEFAULT now(),
error text
);
CREATE INDEX IF NOT EXISTS idx_apia_scrape_log_started ON apia.scrape_log(started_at DESC);
CREATE INDEX IF NOT EXISTS idx_apia_scrape_log_resource ON apia.scrape_log(source_resource_id);
-- ── 3. Materialized view: per-CUI rollup ─────────────────────────────────
-- Refresh: REFRESH MATERIALIZED VIEW CONCURRENTLY apia.mv_per_cui;
CREATE MATERIALIZED VIEW IF NOT EXISTS apia.mv_per_cui AS
SELECT
cui,
array_agg(DISTINCT campaign_year ORDER BY campaign_year) AS campaign_years,
array_agg(DISTINCT judet) FILTER (WHERE judet IS NOT NULL) AS judete,
SUM(suprafata_ha) AS total_suprafata_ha,
COUNT(*) AS rows_count,
MAX(fetched_at) AS last_seen_at
FROM apia.fermieri
WHERE cui IS NOT NULL
GROUP BY cui;
CREATE UNIQUE INDEX IF NOT EXISTS idx_apia_mv_per_cui ON apia.mv_per_cui(cui);
COMMENT ON MATERIALIZED VIEW apia.mv_per_cui IS
'Rollup APIA per CUI (doar PJ-uri cu match). '
'Refresh: REFRESH MATERIALIZED VIEW CONCURRENTLY apia.mv_per_cui.';
-- ── 4. CUI matcher (call after each ingest) ──────────────────────────────
-- Populates apia.fermieri.cui by joining name_normalized against
-- firms.entities. Conservative: requires *unique* match in firms (no
-- ambiguous trgm collisions).
CREATE OR REPLACE FUNCTION apia.match_cui()
RETURNS TABLE(matched_count bigint, ambiguous_count bigint) AS $$
DECLARE
v_matched bigint := 0;
v_ambiguous bigint := 0;
BEGIN
-- Heuristic: row is candidate PJ if name has SRL/SA/PFA/II/IF/SC/CABINET.
UPDATE apia.fermieri f
SET is_legal_person = TRUE,
name_normalized = firms.normalize_company_name(f.name)
WHERE f.is_legal_person IS NULL
AND f.name ~* '\m(SRL|S\.R\.L\.|S\.A\.|SA|PFA|P\.F\.A\.|II|I\.I\.|IF|I\.F\.|SC|S\.C\.|CABINET|COOPERATIVA|COOP)\M';
-- Exact-norm match
WITH cands AS (
SELECT f.id, e.cui
FROM apia.fermieri f
JOIN firms.entities e
ON e.name_normalized = f.name_normalized
WHERE f.cui IS NULL
AND f.is_legal_person = TRUE
AND f.name_normalized IS NOT NULL
),
uniq AS (
SELECT id, MIN(cui) AS cui
FROM cands
GROUP BY id
HAVING COUNT(DISTINCT cui) = 1
),
upd AS (
UPDATE apia.fermieri f
SET cui = u.cui,
cui_match_method = 'exact_norm',
cui_match_score = 1.0
FROM uniq u
WHERE f.id = u.id
RETURNING f.id
)
SELECT COUNT(*) INTO v_matched FROM upd;
-- Trigram fallback for unmatched PJs (threshold 0.85, must be unique)
WITH cands AS (
SELECT f.id, e.cui,
similarity(e.name_normalized, f.name_normalized) AS sim
FROM apia.fermieri f
JOIN firms.entities e
ON e.name_normalized % f.name_normalized
WHERE f.cui IS NULL
AND f.is_legal_person = TRUE
AND f.name_normalized IS NOT NULL
AND similarity(e.name_normalized, f.name_normalized) >= 0.85
),
ranked AS (
SELECT id, cui, sim,
COUNT(*) OVER (PARTITION BY id) AS n_cands
FROM cands
),
uniq AS (
SELECT DISTINCT ON (id) id, cui, sim
FROM ranked
WHERE n_cands = 1
),
upd AS (
UPDATE apia.fermieri f
SET cui = u.cui,
cui_match_method = 'trgm_unique',
cui_match_score = u.sim
FROM uniq u
WHERE f.id = u.id
RETURNING f.id
)
SELECT COUNT(*) INTO v_ambiguous FROM upd;
matched_count := v_matched;
ambiguous_count := v_ambiguous;
RETURN NEXT;
END;
$$ LANGUAGE plpgsql;
COMMENT ON FUNCTION apia.match_cui() IS
'Matchează apia.fermieri.cui prin firms.entities. Întâi exact_norm, apoi '
'trgm fallback >=0.85 cu unicitate. Returnează (matched_exact, matched_trgm). '
'Doar rândurile marcate is_legal_person.';