-- vreau-digital/services/seap-scraper/sql/037_gnm.sql

-- 037_gnm.sql
-- GNM — Garda Națională de Mediu.
-- Public press releases of environmental enforcement actions scraped from
-- gnm.ro/feed/ (WordPress RSS, ~358 items / 36 pages).
--
-- Investigation summary (2026-05-10):
--   • The institution publishes only AGGREGATE statistics (per-month / per-judet
--     totals) in their monthly synthesis PDFs (sinteza_<luna>_<an>.pdf) and the
--     annual activity report (raport_activitate_<an>.pdf). NO per-firm registry
--     is published with CUIs and individual fine amounts.
--   • The only place where individual violators are named is in press releases
--     ("comunicate de presă"). Even there:
--       – Most releases reference "doi operatori", "șapte operatori în patru
--         județe" without naming firms.
--       – When firms are named (e.g. Petrobrazi, Vega, Lukoil refineries), the
--         individual amount is rarely broken down — they receive a collective
--         "€340,000 în ultimul an" figure.
--       – CUIs are NEVER published; we must fuzzy-match on company name +
--         judet via cui_matcher (Stage B of the pipeline).
--   • data.gov.ro has 0 GNM datasets; ANPM publishes IPPC/SEVESO inventories
--     (which we ingest separately) but no fines.
--
-- Conclusion: this is a partial / sample-quality dataset. We capture every
-- press release as gnm.comunicate, then run a regex extractor to surface
-- candidate (company, fine_lei, fapta) tuples into gnm.amenzi_extrase. The
-- coverage will be ~5-15% of total GNM enforcement activity (estimated 5K
-- fines/year, of which only ~50-200 firms are named publicly per year).
--
-- The cross-source value remains: any firm publicly shamed by GNM that ALSO
-- wins SEAP construction/industrial contracts is a 1st-page scandal pattern.
-- We accept that we miss the long tail; we capture the headlines.

-- Namespace holding every GNM object defined in this file. IF NOT EXISTS makes
-- the statement a no-op when the schema is already present.
CREATE SCHEMA IF NOT EXISTS gnm;

-- ── 1. Press releases (one row per gnm.ro article) ─────────────────────────
-- Captures the full enforcement-related comunicate (press releases) published
-- by GNM. Used both as raw archive (in case interpretation rules change) and
-- as parent for extracted violator rows.
-- gnm.comunicate: one row per article from the gnm.ro feed.
-- Column order is part of the table definition and must not be rearranged.
CREATE TABLE IF NOT EXISTS gnm.comunicate (
  -- Surrogate key; gnm.amenzi_extrase.comunicat_id points here.
  id bigserial PRIMARY KEY,

  -- WordPress GUID (stable post id). UNIQUE, so it can serve as the
  -- natural key when the same article is seen again.
  guid text NOT NULL UNIQUE,

  url text NOT NULL,
  titlu text NOT NULL,

  -- pubDate from RSS; nullable.
  publicat_la timestamptz,

  -- dc:creator
  autor text,

  -- e.g. {COMUNICATE DE PRESĂ, NOUTĂȚI}
  categorii text[],

  -- Raw content:encoded payload.
  continut_html text,

  -- HTML-stripped, line-collapsed version of the content.
  continut_text text,

  -- true if title/body matches /amenz|sancțiun|sistare|confiscat/i.
  -- NOTE(review): this pattern used to list `sancțiun` twice — possibly meant
  -- to cover both the comma-below and cedilla spellings of ț; confirm against
  -- the scraper's actual regex.
  is_enforcement boolean NOT NULL DEFAULT false,

  -- Sum mentioned in the article (best-effort); NULL when none was found.
  total_amenzi_lei numeric,

  -- sha1(continut_text), used for change detection.
  -- NOTE(review): continut_text is nullable while raw_hash is NOT NULL, so the
  -- writer must still produce a hash when there is no text — TODO confirm.
  raw_hash text NOT NULL,

  -- Row insertion time unless the writer supplies a value.
  fetched_at timestamptz NOT NULL DEFAULT now()
);

-- Indexes on gnm.comunicate.

-- Publication time, newest first.
CREATE INDEX IF NOT EXISTS idx_gnm_com_publicat        ON gnm.comunicate(publicat_la DESC);

-- Partial index covering only rows with is_enforcement = true. It is keyed on
-- publicat_la rather than on is_enforcement itself: every row admitted by the
-- predicate has the same is_enforcement value, so that column is useless as an
-- index key. Keying on publicat_la still lets the planner use the index for a
-- plain `WHERE is_enforcement` filter and additionally returns enforcement
-- articles already ordered newest-first.
-- NOTE: because of IF NOT EXISTS, a database that already has an index with
-- this name keeps its existing definition; drop gnm.idx_gnm_com_enforcement
-- there before re-running this file to pick up the new key.
CREATE INDEX IF NOT EXISTS idx_gnm_com_enforcement     ON gnm.comunicate(publicat_la DESC) WHERE is_enforcement;

-- Article-level fine total, largest first; rows without a total sort last.
CREATE INDEX IF NOT EXISTS idx_gnm_com_total_amenzi    ON gnm.comunicate(total_amenzi_lei DESC NULLS LAST);

-- Stores a human-readable description of gnm.comunicate in the database
-- catalog. The text below is data, not a code comment.
COMMENT ON TABLE gnm.comunicate IS
  'GNM press releases (gnm.ro/feed/). Source-of-truth raw archive. Articles flagged is_enforcement when text mentions fines/sanctions; gnm.amenzi_extrase populated by NLP-light extractor.';

-- ── 2. Extracted violator records ──────────────────────────────────────────
-- One row per (article × candidate firm) tuple identified by the regex/NER
-- pass. Most enforcement articles have 0-3 firms named; some have none
-- (collective references like "operatori industriali din Prahova").
--
-- contravenient_cui is filled by Stage B fuzzy match against firms.cui_lookup
-- using contravenient_name + judet hint. Score ≥ 0.85 is acceptable.
-- gnm.amenzi_extrase: one row per (article, candidate firm) pair produced by
-- the extractor. Rows are removed automatically when their parent article is
-- deleted (ON DELETE CASCADE).
--
-- The cui_* columns and matched_at stay NULL until the Stage B matcher has
-- processed the row. cui_match_score is constrained to the documented 0..1
-- range; NULL remains allowed because a CHECK passes on NULL.
-- NOTE: because of IF NOT EXISTS, a table that already exists does not gain the
-- CHECK constraint from this statement; add it there with ALTER TABLE.
CREATE TABLE IF NOT EXISTS gnm.amenzi_extrase (
  id                  bigserial PRIMARY KEY,
  comunicat_id        bigint NOT NULL REFERENCES gnm.comunicate(id) ON DELETE CASCADE,
                                                -- parent press release
  contravenient_name  text NOT NULL,            -- raw mention (e.g. "Rafinăria Petrobrazi")
  contravenient_name_norm text,                 -- firms.normalize_company_name(); NULL until Stage B
  contravenient_cui   text,                     -- fuzzy-matched, NULL when unmatched
  cui_match_method    text,                     -- 'direct' | 'fuzzy_name' | 'fuzzy_name_judet' | NULL
                                                -- (documented values; not enforced by a constraint)
  cui_match_score     numeric                   -- 0..1, enforced below
    CONSTRAINT amenzi_extrase_cui_match_score_check
      CHECK (cui_match_score BETWEEN 0 AND 1),
  matched_at          timestamptz,              -- nullable; NOTE(review): presumably set by the Stage B matcher — confirm
  judet               text,                     -- inferred from article title/body
  fapta               text,                     -- short violation description (extracted snippet)
  suma_lei            numeric,                  -- per-firm amount when present, NULL when only aggregate
  suma_eur            numeric,                  -- when source quotes EUR (rare)
  suma_aggregate      boolean NOT NULL DEFAULT false,
                                                -- true when amount applies to >1 firm collectively
  context_snippet     text NOT NULL,            -- the sentence(s) that triggered extraction
  fetched_at          timestamptz NOT NULL DEFAULT now()
                                                -- row insertion time unless the writer supplies a value
);

-- Indexes on gnm.amenzi_extrase.

-- Matched CUI; partial, so rows without a CUI are not indexed.
CREATE INDEX IF NOT EXISTS idx_gnm_amenzi_cui
  ON gnm.amenzi_extrase (contravenient_cui)
  WHERE contravenient_cui IS NOT NULL;

-- Normalized firm name.
CREATE INDEX IF NOT EXISTS idx_gnm_amenzi_name_norm
  ON gnm.amenzi_extrase (contravenient_name_norm);

-- County inferred from the article.
CREATE INDEX IF NOT EXISTS idx_gnm_amenzi_judet
  ON gnm.amenzi_extrase (judet);

-- Per-firm amount, largest first; rows without an amount sort last.
CREATE INDEX IF NOT EXISTS idx_gnm_amenzi_suma
  ON gnm.amenzi_extrase (suma_lei DESC NULLS LAST);

-- Foreign-key column pointing at the parent article in gnm.comunicate.
CREATE INDEX IF NOT EXISTS idx_gnm_amenzi_comunicat
  ON gnm.amenzi_extrase (comunicat_id);

-- Stores a human-readable description of gnm.amenzi_extrase in the database
-- catalog. The text below is data, not a code comment.
COMMENT ON TABLE gnm.amenzi_extrase IS
  'Best-effort extracted violator tuples from gnm.comunicate. Coverage is partial — only firms named in press releases. Use gnm.comunicate.is_enforcement for full enforcement-article archive.';

-- ── 3. Scrape log (mirrors anre/ancom convention) ──────────────────────────
-- gnm.scrape_log: one row per scraper/extractor run, with row counters and
-- timing. All counters are NOT NULL and default to 0.
CREATE TABLE IF NOT EXISTS gnm.scrape_log (
  id bigserial PRIMARY KEY,

  -- Which component produced the entry: 'rss_feed' | 'extractor'.
  scraper text NOT NULL,

  source_url text NOT NULL,

  -- Row counters for the run.
  rows_seen integer NOT NULL DEFAULT 0,
  rows_inserted integer NOT NULL DEFAULT 0,
  rows_updated integer NOT NULL DEFAULT 0,
  rows_skipped integer NOT NULL DEFAULT 0,

  -- Run duration in milliseconds.
  duration_ms integer NOT NULL DEFAULT 0,

  -- started_at must be supplied by the writer; finished_at falls back to the
  -- insert time when omitted.
  started_at timestamptz NOT NULL,
  finished_at timestamptz NOT NULL DEFAULT now(),

  -- Nullable error text. NOTE(review): presumably NULL means the run
  -- succeeded — confirm against the writer.
  error text
);

-- Run start time, newest first.
CREATE INDEX IF NOT EXISTS idx_gnm_scrape_log_started
  ON gnm.scrape_log (started_at DESC);