initial: split from gov-agreg — vreau.digital standalone platform
Moved from gov-agreg/src/pages/achizitii/* to root (drop prefix). - 22 pages migrated, 127 files total - All internal links: /achizitii/X → /X (176 occurrences fixed) - AchizitiiLayout subnav rewritten: /X paths, top-right link to vreaudigital.ro hub - BaseLayout new (vreau.digital branding, OG tags, site URL) - astro.config.mjs: site https://vreau.digital, server output (was static) - docker-compose: port 5096 (vreaudigital is 5095), container vreau-digital - deploy.sh: paths /opt/vreau-digital, log /var/log/vreau-digital-deploy.log Backend shared with gov-agreg: - PostgreSQL satra (same schemas: seap, firms, anaf, anre, ...) - Photon, Martin tiles - Infisical /vreaudigital path (DATABASE_URL etc. shared) build: PASS (npx astro check 0 errors, npm run build 5s vite + 10s server)
This commit is contained in:
@@ -0,0 +1,115 @@
|
||||
-- 037_gnm.sql
|
||||
-- GNM — Garda Națională de Mediu.
|
||||
-- Public press releases of environmental enforcement actions scraped from
|
||||
-- gnm.ro/feed/ (WordPress RSS, ~358 items / 36 pages).
|
||||
--
|
||||
-- Investigation summary (2026-05-10):
|
||||
-- • The institution publishes only AGGREGATE statistics (per-month / per-judet
|
||||
-- totals) in their monthly synthesis PDFs (sinteza_<luna>_<an>.pdf) and the
|
||||
-- annual activity report (raport_activitate_<an>.pdf). NO per-firm registry
|
||||
-- is published with CUIs and individual fine amounts.
|
||||
-- • The only place where individual violators are named is in press releases
|
||||
-- ("comunicate de presă"). Even there:
|
||||
-- – Most releases reference "doi operatori", "șapte operatori în patru
|
||||
-- județe" without naming firms.
|
||||
-- – When firms are named (e.g. Petrobrazi, Vega, Lukoil refineries), the
|
||||
-- individual amount is rarely broken down — they receive a collective
|
||||
-- "€340,000 în ultimul an" figure.
|
||||
-- – CUIs are NEVER published; we must fuzzy-match on company name +
|
||||
-- judet via cui_matcher (Stage B of the pipeline).
|
||||
-- • data.gov.ro has 0 GNM datasets; ANPM publishes IPPC/SEVESO inventories
|
||||
-- (which we ingest separately) but no fines.
|
||||
--
|
||||
-- Conclusion: this is a partial / sample-quality dataset. We capture every
|
||||
-- press release as gnm.communicate, then run a regex extractor to surface
|
||||
-- candidate (company, fine_lei, fapta) tuples into gnm.amenzi_extrase. The
|
||||
-- coverage will be ~5-15% of total GNM enforcement activity (estimated 5K
|
||||
-- fines/year, of which only ~50-200 firms are named publicly per year).
|
||||
--
|
||||
-- The cross-source value remains: any firm publicly shamed by GNM that ALSO
|
||||
-- wins SEAP construction/industrial contracts is a 1st-page scandal pattern.
|
||||
-- We accept that we miss the long tail; we capture the headlines.
|
||||
|
||||
CREATE SCHEMA IF NOT EXISTS gnm;
|
||||
|
||||
-- ── 1. Press releases (one row per gnm.ro article) ─────────────────────────
|
||||
-- Captures the full enforcement-related communicate published by GNM. Used
|
||||
-- both as raw archive (in case interpretation rules change) and as parent
|
||||
-- for extracted violator rows.
|
||||
CREATE TABLE IF NOT EXISTS gnm.comunicate (
|
||||
id bigserial PRIMARY KEY,
|
||||
guid text NOT NULL UNIQUE, -- WordPress GUID (stable post id)
|
||||
url text NOT NULL,
|
||||
titlu text NOT NULL,
|
||||
publicat_la timestamptz, -- pubDate from RSS
|
||||
autor text, -- dc:creator
|
||||
categorii text[], -- e.g. {COMUNICATE DE PRESĂ, NOUTĂȚI}
|
||||
continut_html text, -- raw content:encoded
|
||||
continut_text text, -- HTML-stripped, line-collapsed
|
||||
is_enforcement boolean NOT NULL DEFAULT false,
|
||||
-- true if title/body matches
|
||||
-- /amenz|sancțiun|sancțiun|sistare|confiscat/i
|
||||
total_amenzi_lei numeric, -- sum mentioned in article (best-effort)
|
||||
raw_hash text NOT NULL, -- sha1(continut_text) for change detection
|
||||
fetched_at timestamptz NOT NULL DEFAULT now()
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_gnm_com_publicat ON gnm.comunicate(publicat_la DESC);
|
||||
CREATE INDEX IF NOT EXISTS idx_gnm_com_enforcement ON gnm.comunicate(is_enforcement) WHERE is_enforcement;
|
||||
CREATE INDEX IF NOT EXISTS idx_gnm_com_total_amenzi ON gnm.comunicate(total_amenzi_lei DESC NULLS LAST);
|
||||
|
||||
COMMENT ON TABLE gnm.comunicate IS
|
||||
'GNM press releases (gnm.ro/feed/). Source-of-truth raw archive. Articles flagged is_enforcement when text mentions fines/sanctions; gnm.amenzi_extrase populated by NLP-light extractor.';
|
||||
|
||||
-- ── 2. Extracted violator records ──────────────────────────────────────────
|
||||
-- One row per (article × candidate firm) tuple identified by the regex/NER
|
||||
-- pass. Most enforcement articles have 0-3 firms named; some have none
|
||||
-- (collective references like "operatori industriali din Prahova").
|
||||
--
|
||||
-- contravenient_cui is filled by Stage B fuzzy match against firms.cui_lookup
|
||||
-- using contravenient_name + judet hint. Score ≥ 0.85 is acceptable.
|
||||
CREATE TABLE IF NOT EXISTS gnm.amenzi_extrase (
|
||||
id bigserial PRIMARY KEY,
|
||||
comunicat_id bigint NOT NULL REFERENCES gnm.comunicate(id) ON DELETE CASCADE,
|
||||
contravenient_name text NOT NULL, -- raw mention (e.g. "Rafinăria Petrobrazi")
|
||||
contravenient_name_norm text, -- firms.normalize_company_name(); NULL until Stage B
|
||||
contravenient_cui text, -- fuzzy-matched, NULL when unmatched
|
||||
cui_match_method text, -- 'direct' | 'fuzzy_name' | 'fuzzy_name_judet' | NULL
|
||||
cui_match_score numeric, -- 0..1
|
||||
matched_at timestamptz,
|
||||
judet text, -- inferred from article title/body
|
||||
fapta text, -- short violation description (extracted snippet)
|
||||
suma_lei numeric, -- per-firm amount when present, NULL when only aggregate
|
||||
suma_eur numeric, -- when source quotes EUR (rare)
|
||||
suma_aggregate boolean NOT NULL DEFAULT false,
|
||||
-- true when amount applies to >1 firm collectively
|
||||
context_snippet text NOT NULL, -- the sentence(s) that triggered extraction
|
||||
fetched_at timestamptz NOT NULL DEFAULT now()
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_gnm_amenzi_cui ON gnm.amenzi_extrase(contravenient_cui)
|
||||
WHERE contravenient_cui IS NOT NULL;
|
||||
CREATE INDEX IF NOT EXISTS idx_gnm_amenzi_name_norm ON gnm.amenzi_extrase(contravenient_name_norm);
|
||||
CREATE INDEX IF NOT EXISTS idx_gnm_amenzi_judet ON gnm.amenzi_extrase(judet);
|
||||
CREATE INDEX IF NOT EXISTS idx_gnm_amenzi_suma ON gnm.amenzi_extrase(suma_lei DESC NULLS LAST);
|
||||
CREATE INDEX IF NOT EXISTS idx_gnm_amenzi_comunicat ON gnm.amenzi_extrase(comunicat_id);
|
||||
|
||||
COMMENT ON TABLE gnm.amenzi_extrase IS
|
||||
'Best-effort extracted violator tuples from gnm.comunicate. Coverage is partial — only firms named in press releases. Use gnm.comunicate.is_enforcement for full enforcement-article archive.';
|
||||
|
||||
-- ── 3. Scrape log (mirrors anre/ancom convention) ──────────────────────────
|
||||
CREATE TABLE IF NOT EXISTS gnm.scrape_log (
|
||||
id bigserial PRIMARY KEY,
|
||||
scraper text NOT NULL, -- 'rss_feed' | 'extractor'
|
||||
source_url text NOT NULL,
|
||||
rows_seen integer NOT NULL DEFAULT 0,
|
||||
rows_inserted integer NOT NULL DEFAULT 0,
|
||||
rows_updated integer NOT NULL DEFAULT 0,
|
||||
rows_skipped integer NOT NULL DEFAULT 0,
|
||||
duration_ms integer NOT NULL DEFAULT 0,
|
||||
started_at timestamptz NOT NULL,
|
||||
finished_at timestamptz NOT NULL DEFAULT now(),
|
||||
error text
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_gnm_scrape_log_started ON gnm.scrape_log(started_at DESC);
|
||||
Reference in New Issue
Block a user