-- 037_gnm.sql
-- GNM — Garda Națională de Mediu (Romania's National Environmental Guard).
-- Public press releases of environmental enforcement actions scraped from
-- gnm.ro/feed/ (WordPress RSS, ~358 items / 36 pages).
--
-- Investigation summary (2026-05-10):
--   • The institution publishes only AGGREGATE statistics (per-month / per-judet
--     totals) in their monthly synthesis PDFs (sinteza__.pdf) and the
--     annual activity report (raport_activitate_.pdf). NO per-firm registry
--     is published with CUIs and individual fine amounts.
--   • The only place where individual violators are named is in press releases
--     ("comunicate de presă"). Even there:
--       – Most releases reference "doi operatori", "șapte operatori în patru
--         județe" (i.e. "two operators", "seven operators in four counties")
--         without naming firms.
--       – When firms are named (e.g. Petrobrazi, Vega, Lukoil refineries), the
--         individual amount is rarely broken down — they receive a collective
--         "€340,000 în ultimul an" ("in the last year") figure.
--       – CUIs are NEVER published; we must fuzzy-match on company name +
--         judet via cui_matcher (Stage B of the pipeline).
--   • data.gov.ro has 0 GNM datasets; ANPM publishes IPPC/SEVESO inventories
--     (which we ingest separately) but no fines.
--
-- Conclusion: this is a partial / sample-quality dataset. We capture every
-- press release as gnm.comunicate, then run a regex extractor to surface
-- candidate (company, fine_lei, fapta) tuples into gnm.amenzi_extrase. The
-- coverage will be ~5-15% of total GNM enforcement activity (estimated 5K
-- fines/year, of which only ~50-200 firms are named publicly per year).
--
-- The cross-source value remains: any firm publicly shamed by GNM that ALSO
-- wins SEAP construction/industrial contracts is a front-page scandal pattern.
-- We accept that we miss the long tail; we capture the headlines.
--
-- Idempotency note: every object is created with IF NOT EXISTS, so re-running
-- this file is safe. The flip side is that table-level changes made here
-- (e.g. the CHECK constraints) only take effect where the table does not
-- exist yet; an existing table is left untouched by CREATE TABLE IF NOT EXISTS.

CREATE SCHEMA IF NOT EXISTS gnm;

-- ── 1. Press releases (one row per gnm.ro article) ─────────────────────────
-- Captures the full content of the comunicate (press releases) published by
-- GNM. Used both as raw archive (in case interpretation rules change) and as
-- parent for extracted violator rows.
CREATE TABLE IF NOT EXISTS gnm.comunicate (
    id               bigserial   PRIMARY KEY,
    guid             text        NOT NULL UNIQUE,        -- WordPress GUID (stable post id)
    url              text        NOT NULL,
    titlu            text        NOT NULL,
    publicat_la      timestamptz,                        -- pubDate from RSS
    autor            text,                               -- dc:creator
    categorii        text[],                             -- e.g. {COMUNICATE DE PRESĂ, NOUTĂȚI}
    continut_html    text,                               -- raw content:encoded
    continut_text    text,                               -- HTML-stripped, line-collapsed
    is_enforcement   boolean     NOT NULL DEFAULT false, -- true if title/body matches
                                                         -- /amenz|sancțiun|sancțiun|sistare|confiscat/i
                                                         -- NOTE(review): the two "sancțiun" alternatives
                                                         -- read identically here; presumably one was meant
                                                         -- to be a spelling variant — confirm against the
                                                         -- scraper's actual pattern.
    total_amenzi_lei numeric,                            -- sum mentioned in article, in lei (best-effort)
    raw_hash         text        NOT NULL,               -- sha1(continut_text) for change detection
    fetched_at       timestamptz NOT NULL DEFAULT now(),
    -- A fine total cannot be negative; NULL (no amount found) still passes.
    CONSTRAINT comunicate_total_amenzi_lei_check CHECK (total_amenzi_lei >= 0)
);

CREATE INDEX IF NOT EXISTS idx_gnm_com_publicat
    ON gnm.comunicate (publicat_la DESC);

-- NOTE(review): this partial index is keyed on the same boolean that its
-- predicate pins to true, so the key itself carries no information. Keying it
-- on e.g. publicat_la would serve "latest enforcement articles" lookups;
-- left unchanged here so the index name keeps a single definition.
CREATE INDEX IF NOT EXISTS idx_gnm_com_enforcement
    ON gnm.comunicate (is_enforcement)
    WHERE is_enforcement;

CREATE INDEX IF NOT EXISTS idx_gnm_com_total_amenzi
    ON gnm.comunicate (total_amenzi_lei DESC NULLS LAST);

COMMENT ON TABLE gnm.comunicate IS
    'GNM press releases (gnm.ro/feed/). Source-of-truth raw archive. Articles flagged is_enforcement when text mentions fines/sanctions; gnm.amenzi_extrase populated by NLP-light extractor.';

-- ── 2. Extracted violator records ──────────────────────────────────────────
-- One row per (article × candidate firm) tuple identified by the regex/NER
-- pass. Most enforcement articles have 0-3 firms named; some have none
-- (collective references like "operatori industriali din Prahova").
--
-- contravenient_cui is filled by Stage B fuzzy match against firms.cui_lookup
-- using contravenient_name + judet hint. Score ≥ 0.85 is acceptable.
CREATE TABLE IF NOT EXISTS gnm.amenzi_extrase (
    id                      bigserial   PRIMARY KEY,
    comunicat_id            bigint      NOT NULL
                                        REFERENCES gnm.comunicate (id) ON DELETE CASCADE,
    contravenient_name      text        NOT NULL,               -- raw mention (e.g. "Rafinăria Petrobrazi")
    contravenient_name_norm text,                               -- firms.normalize_company_name(); NULL until Stage B
    contravenient_cui       text,                               -- fuzzy-matched, NULL when unmatched
    cui_match_method        text,                               -- 'direct' | 'fuzzy_name' | 'fuzzy_name_judet' | NULL
    cui_match_score         numeric,                            -- 0..1
    matched_at              timestamptz,
    judet                   text,                               -- county, inferred from article title/body
    fapta                   text,                               -- short violation description (extracted snippet)
    suma_lei                numeric,                            -- per-firm amount when present, NULL when only aggregate
    suma_eur                numeric,                            -- when source quotes EUR (rare)
    suma_aggregate          boolean     NOT NULL DEFAULT false, -- true when amount applies to >1 firm collectively
    context_snippet         text        NOT NULL,               -- the sentence(s) that triggered extraction
    fetched_at              timestamptz NOT NULL DEFAULT now(),
    -- Enforce the documented 0..1 score range; NULL (not yet matched) passes.
    CONSTRAINT amenzi_extrase_cui_match_score_check CHECK (cui_match_score BETWEEN 0 AND 1),
    -- Fine amounts cannot be negative; NULL (amount absent) passes.
    CONSTRAINT amenzi_extrase_suma_lei_check CHECK (suma_lei >= 0),
    CONSTRAINT amenzi_extrase_suma_eur_check CHECK (suma_eur >= 0)
);

CREATE INDEX IF NOT EXISTS idx_gnm_amenzi_cui
    ON gnm.amenzi_extrase (contravenient_cui)
    WHERE contravenient_cui IS NOT NULL;

CREATE INDEX IF NOT EXISTS idx_gnm_amenzi_name_norm
    ON gnm.amenzi_extrase (contravenient_name_norm);

CREATE INDEX IF NOT EXISTS idx_gnm_amenzi_judet
    ON gnm.amenzi_extrase (judet);

CREATE INDEX IF NOT EXISTS idx_gnm_amenzi_suma
    ON gnm.amenzi_extrase (suma_lei DESC NULLS LAST);

-- Supports lookups of extracted rows by their parent article.
CREATE INDEX IF NOT EXISTS idx_gnm_amenzi_comunicat
    ON gnm.amenzi_extrase (comunicat_id);

COMMENT ON TABLE gnm.amenzi_extrase IS
    'Best-effort extracted violator tuples from gnm.comunicate. Coverage is partial — only firms named in press releases. Use gnm.comunicate.is_enforcement for full enforcement-article archive.';

-- ── 3. Scrape log (mirrors anre/ancom convention) ──────────────────────────
CREATE TABLE IF NOT EXISTS gnm.scrape_log (
    id            bigserial   PRIMARY KEY,
    scraper       text        NOT NULL,            -- 'rss_feed' | 'extractor'
    source_url    text        NOT NULL,
    rows_seen     integer     NOT NULL DEFAULT 0,
    rows_inserted integer     NOT NULL DEFAULT 0,
    rows_updated  integer     NOT NULL DEFAULT 0,
    rows_skipped  integer     NOT NULL DEFAULT 0,
    duration_ms   integer     NOT NULL DEFAULT 0,
    started_at    timestamptz NOT NULL,
    finished_at   timestamptz NOT NULL DEFAULT now(),
    error         text
);

CREATE INDEX IF NOT EXISTS idx_gnm_scrape_log_started
    ON gnm.scrape_log (started_at DESC);

COMMENT ON TABLE gnm.scrape_log IS
    'Scrape log for the GNM scrapers (rss_feed, extractor); mirrors the anre/ancom convention.';