-- 036_apia.sql -- APIA — Agenția de Plăți și Intervenție pentru Agricultură. -- "Lista fermierilor" — annual list of farmers receiving direct payments -- (subvenții) per UAT campaign. Covers ~each commune that publishes a list -- via data.gov.ro (currently 2024 only; one comuna live, more on the way). -- -- Sources investigated 2026-05-10: -- 1. data.gov.ro CKAN — only ONE published "Lista fermieri APIA" XLSX -- lives at /api/3/action/package_show?id=lista-fermierilor-campania-apia-2024 -- (single resource: comuna Găgești, jud. Vaslui, 192 farmers). -- The schema is per-comuna so future ingests over the same package -- will multiply rows linearly. -- 2. https://www.apia.org.ro/ — bot-blocked (HTTP 403 from non-browser -- User-Agents). National-level lists exist on APIA's site but require -- JS / browser session to retrieve. Out of scope for this pass. -- 3. AFIR FEGA dump (fonduri.afir_plati WHERE tip_fond='FEGA', 4.29M rows -- for 2023+2024) is the *closest* national equivalent — it contains -- payment amounts but no SUPRAFATA (hectares) and no -- RESPONSABIL UAT / CENTRUL APIA fields. -- -- This schema is therefore intentionally narrow but extensible: -- - one row per (campaign_year, name, comuna, sat) — natural composite key -- - source_dataset_id + source_resource_id on every row → idempotent re-ingest -- - cui populated only when the row is a legal person (SC ... SRL / PFA); -- for natural persons (CNP-keyed in source) cui stays NULL -- -- Cross-source value: -- apia.fermieri.cui × fonduri.afir_plati(tip_fond='FEGA').cui -- = "Fermier in lista APIA care apare ȘI in plățile FEGA AFIR" — sanity -- check duplicate-receipt audit. APIA list shows hectares declared, -- FEGA shows EUR plătiți; ratio EUR/ha → outliers. -- apia.fermieri.cui × anaf.datornici.cui -- = "Fermier (PFA/SRL) cu datorii la stat care primește subvenții APIA" -- — direct red flag. -- apia.fermieri.name (PF, no CUI) × ani.declaratii.persoana_name -- = persoane cu funcții publice care primesc subvenții agricole. -- apia.fermieri.cui × seap.announcements.supplier_cui -- = ferme care iau și subvenții și contracte publice. CREATE SCHEMA IF NOT EXISTS apia; -- ── 1. Lista fermieri ───────────────────────────────────────────────────── -- One row per farmer × campaign × comuna × sat. Composite uniqueness chosen -- because data.gov.ro source has no national ID column (no CNP/CUI per row). -- 'name' is raw "NUME PRENUME" string from the published XLSX. CREATE TABLE IF NOT EXISTS apia.fermieri ( id bigserial PRIMARY KEY, campaign_year smallint NOT NULL, -- e.g. 2024 (also 2023 SUPRAFATA exists in same row but campaign year is publication year) name text NOT NULL, -- raw "NUME PRENUME" or "SC ... SRL" name_normalized text, -- firms.normalize_company_name(name) — only when looks like PJ cui text, -- only if matched to firms.entities (PJ rows like "SC X SRL") cui_match_method text, -- 'exact_norm' | 'trgm_unique' | NULL cui_match_score numeric(4,3), is_legal_person boolean, -- guessed from name shape (SC, SRL, PFA, II, IF, SA prefixes/suffixes) judet text, -- enriched via centru_apia mapping (Găgești → VS Vaslui) comuna_oras text, -- raw "COMUNA/ORAS" cell sat text, centru_apia text, -- "CENTRUL APIA" (e.g. MURGENI) responsabil_uat text, -- "RESPONSABIL UAT 2024" (the UAT employee, not the farmer) suprafata_ha numeric(12,4), -- "SUPRAFATA 2023" hectares, decimal allowed (e.g. 1.04, 12.45) source_dataset_id text NOT NULL, -- CKAN package_id, e.g. 'lista-fermierilor-campania-apia-2024' source_resource_id text NOT NULL, -- CKAN resource_id (UUID) source_url text NOT NULL, -- direct XLSX download URL fetched_at timestamptz NOT NULL DEFAULT now(), -- NULLS NOT DISTINCT: treat NULL sat as a single value so we don't get -- duplicate rows when source omits sat for some farmers. UNIQUE NULLS NOT DISTINCT (campaign_year, name, comuna_oras, sat) ); CREATE INDEX IF NOT EXISTS idx_apia_fermieri_cui ON apia.fermieri(cui) WHERE cui IS NOT NULL; CREATE INDEX IF NOT EXISTS idx_apia_fermieri_year ON apia.fermieri(campaign_year); CREATE INDEX IF NOT EXISTS idx_apia_fermieri_judet ON apia.fermieri(judet); CREATE INDEX IF NOT EXISTS idx_apia_fermieri_comuna ON apia.fermieri(comuna_oras); CREATE INDEX IF NOT EXISTS idx_apia_fermieri_centru ON apia.fermieri(centru_apia); CREATE INDEX IF NOT EXISTS idx_apia_fermieri_resource ON apia.fermieri(source_resource_id); CREATE INDEX IF NOT EXISTS idx_apia_fermieri_suprafata ON apia.fermieri(suprafata_ha DESC NULLS LAST); CREATE INDEX IF NOT EXISTS idx_apia_fermieri_name_trgm ON apia.fermieri USING gin (name_normalized gin_trgm_ops); COMMENT ON TABLE apia.fermieri IS 'Lista fermierilor publicată de APIA prin UAT-uri pe data.gov.ro. ' 'Sursă primară: CKAN package "lista-fermierilor-campania-apia-2024". ' 'Cross-source cu fonduri.afir_plati (FEGA) și anaf.datornici.'; COMMENT ON COLUMN apia.fermieri.suprafata_ha IS 'Hectare declarate — coloana "SUPRAFATA 2023" (precedent campaign).'; COMMENT ON COLUMN apia.fermieri.is_legal_person IS 'true = nume conține markeri "SC/SRL/SA/PFA/II/IF/CABINET" → potențial match firms.entities.'; COMMENT ON COLUMN apia.fermieri.cui_match_method IS 'exact_norm = match exact pe firms.normalize_company_name; ' 'trgm_unique = match trigram unic peste 0.85; NULL = nepotrivit (probabil PF).'; -- ── 1b. Staging table (used by importer for COPY → INSERT pipeline) ───── CREATE TABLE IF NOT EXISTS apia.staging_fermieri ( campaign_year text, name text, comuna_oras text, sat text, centru_apia text, responsabil_uat text, suprafata_ha text, source_dataset_id text, source_resource_id text, source_url text ); COMMENT ON TABLE apia.staging_fermieri IS 'Tabel de staging pentru importul XLSX→COPY. TRUNCATE între import-uri.'; -- ── 2. Scrape log ───────────────────────────────────────────────────────── -- One row per CKAN-resource ingest. Useful for "ce am importat când" history. CREATE TABLE IF NOT EXISTS apia.scrape_log ( id bigserial PRIMARY KEY, source_dataset_id text NOT NULL, source_resource_id text NOT NULL, source_url text NOT NULL, campaign_year smallint NOT NULL, rows_seen integer NOT NULL DEFAULT 0, rows_inserted integer NOT NULL DEFAULT 0, rows_updated integer NOT NULL DEFAULT 0, rows_matched_cui integer NOT NULL DEFAULT 0, duration_ms integer NOT NULL DEFAULT 0, started_at timestamptz NOT NULL, finished_at timestamptz NOT NULL DEFAULT now(), error text ); CREATE INDEX IF NOT EXISTS idx_apia_scrape_log_started ON apia.scrape_log(started_at DESC); CREATE INDEX IF NOT EXISTS idx_apia_scrape_log_resource ON apia.scrape_log(source_resource_id); -- ── 3. Materialized view: per-CUI rollup ───────────────────────────────── -- Refresh: REFRESH MATERIALIZED VIEW CONCURRENTLY apia.mv_per_cui; CREATE MATERIALIZED VIEW IF NOT EXISTS apia.mv_per_cui AS SELECT cui, array_agg(DISTINCT campaign_year ORDER BY campaign_year) AS campaign_years, array_agg(DISTINCT judet) FILTER (WHERE judet IS NOT NULL) AS judete, SUM(suprafata_ha) AS total_suprafata_ha, COUNT(*) AS rows_count, MAX(fetched_at) AS last_seen_at FROM apia.fermieri WHERE cui IS NOT NULL GROUP BY cui; CREATE UNIQUE INDEX IF NOT EXISTS idx_apia_mv_per_cui ON apia.mv_per_cui(cui); COMMENT ON MATERIALIZED VIEW apia.mv_per_cui IS 'Rollup APIA per CUI (doar PJ-uri cu match). ' 'Refresh: REFRESH MATERIALIZED VIEW CONCURRENTLY apia.mv_per_cui.'; -- ── 4. CUI matcher (call after each ingest) ────────────────────────────── -- Populates apia.fermieri.cui by joining name_normalized against -- firms.entities. Conservative: requires *unique* match in firms (no -- ambiguous trgm collisions). CREATE OR REPLACE FUNCTION apia.match_cui() RETURNS TABLE(matched_count bigint, ambiguous_count bigint) AS $$ DECLARE v_matched bigint := 0; v_ambiguous bigint := 0; BEGIN -- Heuristic: row is candidate PJ if name has SRL/SA/PFA/II/IF/SC/CABINET. UPDATE apia.fermieri f SET is_legal_person = TRUE, name_normalized = firms.normalize_company_name(f.name) WHERE f.is_legal_person IS NULL AND f.name ~* '\m(SRL|S\.R\.L\.|S\.A\.|SA|PFA|P\.F\.A\.|II|I\.I\.|IF|I\.F\.|SC|S\.C\.|CABINET|COOPERATIVA|COOP)\M'; -- Exact-norm match WITH cands AS ( SELECT f.id, e.cui FROM apia.fermieri f JOIN firms.entities e ON e.name_normalized = f.name_normalized WHERE f.cui IS NULL AND f.is_legal_person = TRUE AND f.name_normalized IS NOT NULL ), uniq AS ( SELECT id, MIN(cui) AS cui FROM cands GROUP BY id HAVING COUNT(DISTINCT cui) = 1 ), upd AS ( UPDATE apia.fermieri f SET cui = u.cui, cui_match_method = 'exact_norm', cui_match_score = 1.0 FROM uniq u WHERE f.id = u.id RETURNING f.id ) SELECT COUNT(*) INTO v_matched FROM upd; -- Trigram fallback for unmatched PJs (threshold 0.85, must be unique) WITH cands AS ( SELECT f.id, e.cui, similarity(e.name_normalized, f.name_normalized) AS sim FROM apia.fermieri f JOIN firms.entities e ON e.name_normalized % f.name_normalized WHERE f.cui IS NULL AND f.is_legal_person = TRUE AND f.name_normalized IS NOT NULL AND similarity(e.name_normalized, f.name_normalized) >= 0.85 ), ranked AS ( SELECT id, cui, sim, COUNT(*) OVER (PARTITION BY id) AS n_cands FROM cands ), uniq AS ( SELECT DISTINCT ON (id) id, cui, sim FROM ranked WHERE n_cands = 1 ), upd AS ( UPDATE apia.fermieri f SET cui = u.cui, cui_match_method = 'trgm_unique', cui_match_score = u.sim FROM uniq u WHERE f.id = u.id RETURNING f.id ) SELECT COUNT(*) INTO v_ambiguous FROM upd; matched_count := v_matched; ambiguous_count := v_ambiguous; RETURN NEXT; END; $$ LANGUAGE plpgsql; COMMENT ON FUNCTION apia.match_cui() IS 'Matchează apia.fermieri.cui prin firms.entities. Întâi exact_norm, apoi ' 'trgm fallback >=0.85 cu unicitate. Returnează (matched_exact, matched_trgm). ' 'Doar rândurile marcate is_legal_person.';