Files
Claude VM a6c03a091e initial: split from gov-agreg — vreau.digital standalone platform
Moved from gov-agreg/src/pages/achizitii/* to root (drop prefix).
- 22 pages migrated, 127 files total
- All internal links: /achizitii/X → /X (176 occurrences fixed)
- AchizitiiLayout subnav rewritten: /X paths, top-right link to vreaudigital.ro hub
- BaseLayout new (vreau.digital branding, OG tags, site URL)
- astro.config.mjs: site https://vreau.digital, server output (was static)
- docker-compose: port 5096 (vreaudigital is 5095), container vreau-digital
- deploy.sh: paths /opt/vreau-digital, log /var/log/vreau-digital-deploy.log

Backend shared with gov-agreg:
- PostgreSQL satra (same schemas: seap, firms, anaf, anre, ...)
- Photon, Martin tiles
- Infisical /vreaudigital path (DATABASE_URL etc. shared)

build: PASS (npx astro check 0 errors, npm run build 5s vite + 10s server)
2026-05-13 00:10:32 +03:00

116 lines
7.0 KiB
SQL
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
-- 037_gnm.sql
-- GNM — Garda Națională de Mediu.
-- Public press releases of environmental enforcement actions scraped from
-- gnm.ro/feed/ (WordPress RSS, ~358 items / 36 pages).
--
-- Investigation summary (2026-05-10):
-- • The institution publishes only AGGREGATE statistics (per-month / per-judet
-- totals) in their monthly synthesis PDFs (sinteza_<luna>_<an>.pdf) and the
-- annual activity report (raport_activitate_<an>.pdf). NO per-firm registry
-- is published with CUIs and individual fine amounts.
-- • The only place where individual violators are named is in press releases
-- ("comunicate de presă"). Even there:
-- Most releases reference "doi operatori", "șapte operatori în patru
-- județe" without naming firms.
-- When firms are named (e.g. Petrobrazi, Vega, Lukoil refineries), the
-- individual amount is rarely broken down — they receive a collective
-- "€340,000 în ultimul an" figure.
-- CUIs are NEVER published; we must fuzzy-match on company name +
-- judet via cui_matcher (Stage B of the pipeline).
-- • data.gov.ro has 0 GNM datasets; ANPM publishes IPPC/SEVESO inventories
-- (which we ingest separately) but no fines.
--
-- Conclusion: this is a partial / sample-quality dataset. We capture every
-- press release as gnm.communicate, then run a regex extractor to surface
-- candidate (company, fine_lei, fapta) tuples into gnm.amenzi_extrase. The
-- coverage will be ~5-15% of total GNM enforcement activity (estimated 5K
-- fines/year, of which only ~50-200 firms are named publicly per year).
--
-- The cross-source value remains: any firm publicly shamed by GNM that ALSO
-- wins SEAP construction/industrial contracts is a 1st-page scandal pattern.
-- We accept that we miss the long tail; we capture the headlines.
CREATE SCHEMA IF NOT EXISTS gnm;
-- ── 1. Press releases (one row per gnm.ro article) ─────────────────────────
-- Captures the full enforcement-related communicate published by GNM. Used
-- both as raw archive (in case interpretation rules change) and as parent
-- for extracted violator rows.
CREATE TABLE IF NOT EXISTS gnm.comunicate (
id bigserial PRIMARY KEY,
guid text NOT NULL UNIQUE, -- WordPress GUID (stable post id)
url text NOT NULL,
titlu text NOT NULL,
publicat_la timestamptz, -- pubDate from RSS
autor text, -- dc:creator
categorii text[], -- e.g. {COMUNICATE DE PRESĂ, NOUTĂȚI}
continut_html text, -- raw content:encoded
continut_text text, -- HTML-stripped, line-collapsed
is_enforcement boolean NOT NULL DEFAULT false,
-- true if title/body matches
-- /amenz|sancțiun|sancțiun|sistare|confiscat/i
total_amenzi_lei numeric, -- sum mentioned in article (best-effort)
raw_hash text NOT NULL, -- sha1(continut_text) for change detection
fetched_at timestamptz NOT NULL DEFAULT now()
);
CREATE INDEX IF NOT EXISTS idx_gnm_com_publicat ON gnm.comunicate(publicat_la DESC);
CREATE INDEX IF NOT EXISTS idx_gnm_com_enforcement ON gnm.comunicate(is_enforcement) WHERE is_enforcement;
CREATE INDEX IF NOT EXISTS idx_gnm_com_total_amenzi ON gnm.comunicate(total_amenzi_lei DESC NULLS LAST);
COMMENT ON TABLE gnm.comunicate IS
'GNM press releases (gnm.ro/feed/). Source-of-truth raw archive. Articles flagged is_enforcement when text mentions fines/sanctions; gnm.amenzi_extrase populated by NLP-light extractor.';
-- ── 2. Extracted violator records ──────────────────────────────────────────
-- One row per (article × candidate firm) tuple identified by the regex/NER
-- pass. Most enforcement articles have 0-3 firms named; some have none
-- (collective references like "operatori industriali din Prahova").
--
-- contravenient_cui is filled by Stage B fuzzy match against firms.cui_lookup
-- using contravenient_name + judet hint. Score ≥ 0.85 is acceptable.
CREATE TABLE IF NOT EXISTS gnm.amenzi_extrase (
id bigserial PRIMARY KEY,
comunicat_id bigint NOT NULL REFERENCES gnm.comunicate(id) ON DELETE CASCADE,
contravenient_name text NOT NULL, -- raw mention (e.g. "Rafinăria Petrobrazi")
contravenient_name_norm text, -- firms.normalize_company_name(); NULL until Stage B
contravenient_cui text, -- fuzzy-matched, NULL when unmatched
cui_match_method text, -- 'direct' | 'fuzzy_name' | 'fuzzy_name_judet' | NULL
cui_match_score numeric, -- 0..1
matched_at timestamptz,
judet text, -- inferred from article title/body
fapta text, -- short violation description (extracted snippet)
suma_lei numeric, -- per-firm amount when present, NULL when only aggregate
suma_eur numeric, -- when source quotes EUR (rare)
suma_aggregate boolean NOT NULL DEFAULT false,
-- true when amount applies to >1 firm collectively
context_snippet text NOT NULL, -- the sentence(s) that triggered extraction
fetched_at timestamptz NOT NULL DEFAULT now()
);
CREATE INDEX IF NOT EXISTS idx_gnm_amenzi_cui ON gnm.amenzi_extrase(contravenient_cui)
WHERE contravenient_cui IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_gnm_amenzi_name_norm ON gnm.amenzi_extrase(contravenient_name_norm);
CREATE INDEX IF NOT EXISTS idx_gnm_amenzi_judet ON gnm.amenzi_extrase(judet);
CREATE INDEX IF NOT EXISTS idx_gnm_amenzi_suma ON gnm.amenzi_extrase(suma_lei DESC NULLS LAST);
CREATE INDEX IF NOT EXISTS idx_gnm_amenzi_comunicat ON gnm.amenzi_extrase(comunicat_id);
COMMENT ON TABLE gnm.amenzi_extrase IS
'Best-effort extracted violator tuples from gnm.comunicate. Coverage is partial — only firms named in press releases. Use gnm.comunicate.is_enforcement for full enforcement-article archive.';
-- ── 3. Scrape log (mirrors anre/ancom convention) ──────────────────────────
CREATE TABLE IF NOT EXISTS gnm.scrape_log (
id bigserial PRIMARY KEY,
scraper text NOT NULL, -- 'rss_feed' | 'extractor'
source_url text NOT NULL,
rows_seen integer NOT NULL DEFAULT 0,
rows_inserted integer NOT NULL DEFAULT 0,
rows_updated integer NOT NULL DEFAULT 0,
rows_skipped integer NOT NULL DEFAULT 0,
duration_ms integer NOT NULL DEFAULT 0,
started_at timestamptz NOT NULL,
finished_at timestamptz NOT NULL DEFAULT now(),
error text
);
CREATE INDEX IF NOT EXISTS idx_gnm_scrape_log_started ON gnm.scrape_log(started_at DESC);