Files
vreau-digital/services/seap-scraper/sql/030_ani_schema.sql
T
Claude VM a6c03a091e initial: split from gov-agreg — vreau.digital standalone platform
Moved from gov-agreg/src/pages/achizitii/* to root (drop prefix).
- 22 pages migrated, 127 files total
- All internal links: /achizitii/X → /X (176 occurrences fixed)
- AchizitiiLayout subnav rewritten: /X paths, top-right link to vreaudigital.ro hub
- BaseLayout new (vreau.digital branding, OG tags, site URL)
- astro.config.mjs: site https://vreau.digital, server output (was static)
- docker-compose: port 5096 (vreaudigital is 5095), container vreau-digital
- deploy.sh: paths /opt/vreau-digital, log /var/log/vreau-digital-deploy.log

Backend shared with gov-agreg:
- PostgreSQL satra (same schemas: seap, firms, anaf, anre, ...)
- Photon, Martin tiles
- Infisical /vreaudigital path (DATABASE_URL etc. shared)

build: PASS (npx astro check 0 errors, npm run build 5s vite + 10s server)
2026-05-13 00:10:32 +03:00

212 lines
11 KiB
SQL
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
-- 030_ani_schema.sql
-- ANI declarații de avere și interese — flagship transparency feature.
--
-- Source: declaratii.integritate.eu (e-DAI 2022→), old-declaratii.integritate.eu
-- (archive 2008-2022). Public by Law 176/2010, GDPR-safe (no CNP stored).
--
-- ~1.3M PDF declarations of Romanian public officials. Cross-references
-- politicians × firms-they-own × procurement-contracts (firms.entities, seap.*).
--
-- See ANI-PLAN.md for full architecture, volume estimates, and rollout plan.
-- This file = Stage 0 (schema only, no data).
-- The trigram indexes created further down in this file use the
-- gin_trgm_ops operator class, which is provided by the pg_trgm extension.
-- Ensure it exists up front so the migration does not abort on a fresh
-- database. IF NOT EXISTS makes this a no-op (NOTICE only) when an earlier
-- migration has already installed it.
CREATE EXTENSION IF NOT EXISTS pg_trgm;

-- Namespace for all ANI declaration tables. Idempotent.
CREATE SCHEMA IF NOT EXISTS ani;

-- USAGE only lets roles resolve object names inside the schema; it does not
-- by itself grant SELECT on any table.
GRANT USAGE ON SCHEMA ani TO PUBLIC;
-- ── ani.officials ──────────────────────────────────────────────────────────
-- Distinct demnitar/funcționar public. Filled by Stage 4 (entity resolution),
-- not by the listing scraper. ani.declaratii.official_id is nullable until
-- dedup runs.
-- One row per distinct public official. ani.declaratii.official_id points
-- here once a declaration has been attributed to a person.
CREATE TABLE IF NOT EXISTS ani.officials (
    id                  bigserial PRIMARY KEY,

    -- Matching key: lower(unaccent(name)), collapsed.
    normalized_name     text NOT NULL,
    -- Name as shown to users, e.g. "Popescu Ioan-Vasile".
    display_name        text NOT NULL,

    -- SHA-256 of the CNP, only when it can be extracted (rare).
    -- NOTE(review): a CNP is a 13-digit structured number, so a plain
    -- unsalted SHA-256 can be enumerated; confirm the writer applies a
    -- secret key/pepper before hashing.
    cnp_hash            character(64),

    -- Earliest and latest declaration year seen for this person.
    first_seen_year     smallint,
    last_seen_year      smallint,

    -- URL identifier, e.g. "popescu-ioan-vasile" plus a suffix.
    -- UNIQUE still allows any number of NULL slugs.
    slug                text UNIQUE,

    -- Most frequently declared function and county (judet).
    primary_function    text,
    primary_judet       text,

    -- Denormalized declaration count kept for the UI; nothing in this file
    -- maintains it automatically.
    declaration_count   integer DEFAULT 0,

    created_at          timestamp with time zone DEFAULT now()
);
-- B-tree index for equality lookups on the normalized name.
CREATE INDEX IF NOT EXISTS idx_officials_norm_name
    ON ani.officials USING btree (normalized_name);

-- Trigram GIN index for similarity / substring search on the normalized
-- name. Requires the pg_trgm extension (gin_trgm_ops operator class).
CREATE INDEX IF NOT EXISTS idx_officials_norm_name_trgm
    ON ani.officials USING gin (normalized_name gin_trgm_ops);
-- ── ani.declaratii ─────────────────────────────────────────────────────────
-- One row per PDF declaration. Listing scraper fills the metadata; PDF
-- downloader fills pdf_path + pdf_sha256; parser fills parse_status.
-- One row per PDF declaration. Column groups below follow the order in which
-- the pipeline fills them: listing metadata, PDF storage, parser state.
CREATE TABLE IF NOT EXISTS ani.declaratii (
    id                  bigserial PRIMARY KEY,

    -- Resolved person; stays NULL until dedup runs. Deleting the official
    -- keeps the declaration and resets this column to NULL.
    official_id         bigint REFERENCES ani.officials (id) ON DELETE SET NULL,

    -- Raw fields exactly as they appear in the portal listing
    -- (before entity resolution).
    raw_official_name   text NOT NULL,
    raw_institution     text,
    raw_function        text,
    raw_localitate      text,
    raw_judet           text,

    -- Declaration details.
    year                smallint NOT NULL,
    declaration_type    text NOT NULL
        CHECK (
            declaration_type IN ('avere', 'interese', 'avere+interese')
        ),
    -- Optional; when present it must be one of the listed kinds.
    submission_kind     text
        CHECK (
            submission_kind IS NULL
            OR submission_kind IN (
                'anuala', 'numire-functie', 'incetare-functie',
                'rectificativa', 'periodica', 'altele'
            )
        ),
    data_completare     date,

    -- Source tracking: which portal the row came from and its ID there.
    source_portal       text NOT NULL
        CHECK (
            source_portal IN ('old', 'new', 'depozitar')
        ),
    source_url          text NOT NULL,
    -- uniqueIdentifier on the old portal / _id on the new one.
    source_id           text,

    -- PDF storage. pdf_path is relative to /opt/vreaudigital-data/ani.
    pdf_path            text,
    pdf_sha256          character(64),
    pdf_size_bytes      integer,
    fetched_at          timestamp with time zone,

    -- Parser state. New rows start as 'pending'.
    parsed_at           timestamp with time zone,
    parse_status        text DEFAULT 'pending'
        CHECK (
            parse_status IN (
                'pending', 'ok', 'ocr_required', 'parse_failed',
                'template_unknown', 'download_failed'
            )
        ),
    -- Template label: '2008-2010' | '2011-2016' | '2017+' | 'edai'.
    -- Not enforced by a constraint.
    parse_template      text,
    parse_error         text,

    inserted_at         timestamp with time zone DEFAULT now()
);
-- Primary dedup key: at most one declaration per (portal, source_id).
-- Rows without a source_id are not covered by the constraint.
CREATE UNIQUE INDEX IF NOT EXISTS idx_declaratii_source
    ON ani.declaratii USING btree (source_portal, source_id)
    WHERE source_id IS NOT NULL;

-- Content-hash dedup: the same PDF re-uploaded under a different ID.
-- NOTE(review): because this index is UNIQUE, writing an already-known
-- pdf_sha256 onto a second row raises a unique violation; confirm the
-- downloader handles that case.
CREATE UNIQUE INDEX IF NOT EXISTS idx_declaratii_sha
    ON ani.declaratii USING btree (pdf_sha256)
    WHERE pdf_sha256 IS NOT NULL;

-- Declarations of one official, newest year first (resolved rows only).
CREATE INDEX IF NOT EXISTS idx_declaratii_official
    ON ani.declaratii USING btree (official_id, year DESC)
    WHERE official_id IS NOT NULL;

-- Browse by year (newest first), then by declaration type.
CREATE INDEX IF NOT EXISTS idx_declaratii_year
    ON ani.declaratii USING btree (year DESC, declaration_type);

-- Small partial index covering only rows still waiting for parsing or OCR.
CREATE INDEX IF NOT EXISTS idx_declaratii_pending
    ON ani.declaratii USING btree (parse_status)
    WHERE parse_status IN ('pending','ocr_required');

-- Trigram GIN indexes for fuzzy search on the raw listing text
-- (require the pg_trgm extension).
CREATE INDEX IF NOT EXISTS idx_declaratii_raw_name_trgm
    ON ani.declaratii USING gin (raw_official_name gin_trgm_ops);

CREATE INDEX IF NOT EXISTS idx_declaratii_raw_inst_trgm
    ON ani.declaratii USING gin (raw_institution gin_trgm_ops);
-- ── ani.bunuri ─────────────────────────────────────────────────────────────
-- Sections I (imobile) + II (mobile). raw_row_text always preserved for
-- audit / debug.
-- Declared assets, one row per asset line. Rows are removed together with
-- their parent declaration (ON DELETE CASCADE).
CREATE TABLE IF NOT EXISTS ani.bunuri (
    id                  bigserial PRIMARY KEY,
    declaration_id      bigint NOT NULL
        REFERENCES ani.declaratii (id) ON DELETE CASCADE,

    -- Asset class; the prefix distinguishes immovable ("imobil-") from
    -- movable ("mobil-") property.
    category            text NOT NULL
        CHECK (
            category IN (
                'imobil-teren', 'imobil-cladire', 'mobil-vehicul',
                'mobil-bijuterii', 'mobil-tablouri-arta', 'mobil-altele'
            )
        ),
    -- Free-text refinement, e.g. "agricol" / "intravilan" / "apartament".
    subcategory         text,

    -- Location of the asset.
    localitate          text,
    judet               text,
    tara                text DEFAULT 'România',

    -- How and when it was acquired,
    -- e.g. "cumparare" / "mostenire" / "donatie".
    year_acquired       smallint,
    mode_acquired       text,

    area_sqm            numeric,
    -- Ownership share as a fraction: 1.0 = full ownership (despite the
    -- "_pct" suffix in the column name).
    share_pct           numeric,
    co_owner            text,

    -- Declared value and its currency code.
    -- NOTE(review): the column is named value_lei but a separate currency
    -- column exists; confirm whether values are converted to RON.
    value_lei           numeric,
    value_currency      text DEFAULT 'RON',

    -- Original row text, kept for audit.
    raw_row_text        text
);
-- Fetch all assets belonging to one declaration.
CREATE INDEX IF NOT EXISTS idx_bunuri_decl
    ON ani.bunuri USING btree (declaration_id);

-- Per-county filtering; rows with no county are left out of the index.
CREATE INDEX IF NOT EXISTS idx_bunuri_judet
    ON ani.bunuri USING btree (judet)
    WHERE judet IS NOT NULL;

-- Filtering by asset class.
CREATE INDEX IF NOT EXISTS idx_bunuri_category
    ON ani.bunuri USING btree (category);
-- ── ani.shareholdings ──────────────────────────────────────────────────────
-- Section IX (firme deținute / asociate). THE flagship table — joins to
-- firms.entities via firm_cui (resolved in Stage 4) and to seap.announcements
-- via that CUI for "politician-with-firm-supplier-to-state" recipes.
-- Declared holdings in firms and other organisations, one row per line.
-- Rows are removed together with their parent declaration
-- (ON DELETE CASCADE).
CREATE TABLE IF NOT EXISTS ani.shareholdings (
    id                  bigserial PRIMARY KEY,
    declaration_id      bigint NOT NULL
        REFERENCES ani.declaratii (id) ON DELETE CASCADE,

    -- Firm name exactly as extracted from the PDF.
    firm_name_raw       text NOT NULL,

    -- Firm resolution, filled in later. firm_cui stays NULL until a match
    -- is made.
    firm_cui            text,
    -- pg_trgm similarity of the match.
    firm_match_score    real,
    -- How the match was obtained; NULL means resolution has not run yet.
    firm_match_method   text
        CHECK (
            firm_match_method IS NULL
            OR firm_match_method IN (
                'exact_name', 'trgm', 'manual', 'unmatched'
            )
        ),
    matched_at          timestamp with time zone,

    -- Declared role,
    -- e.g. "actionar" / "asociat" / "administrator" / "membru CA".
    role                text,
    -- NOTE(review): scale is not stated here; ani.bunuri.share_pct uses
    -- 1.0 = full ownership — confirm the same convention applies.
    share_pct           numeric,
    value_lei           numeric,

    -- Kind of organisation. All allowed values are lowercase ASCII; the
    -- previous spelling 'oNG' was a mixed-case typo that would have
    -- rejected the natural value 'ong'.
    category            text
        CHECK (
            category IS NULL
            OR category IN (
                'societate', 'asociatie', 'fundatie', 'cooperativa',
                'ong', 'altele'
            )
        ),

    -- Original row text, kept for audit.
    raw_row_text        text
);
-- Fetch all holdings belonging to one declaration.
CREATE INDEX IF NOT EXISTS idx_share_decl
    ON ani.shareholdings USING btree (declaration_id);

-- Join path by resolved firm identifier; unresolved rows are excluded.
CREATE INDEX IF NOT EXISTS idx_share_cui
    ON ani.shareholdings USING btree (firm_cui)
    WHERE firm_cui IS NOT NULL;

-- Trigram GIN index for fuzzy matching on the raw firm name
-- (requires the pg_trgm extension).
CREATE INDEX IF NOT EXISTS idx_share_name_trgm
    ON ani.shareholdings USING gin (firm_name_raw gin_trgm_ops);

-- Work queue: rows that were never matched or were explicitly marked
-- 'unmatched'.
CREATE INDEX IF NOT EXISTS idx_share_unmatched
    ON ani.shareholdings USING btree (firm_match_method)
    WHERE firm_match_method IS NULL OR firm_match_method = 'unmatched';
-- ── ani.functii ────────────────────────────────────────────────────────────
-- Section VIII — funcții publice și private. Joinable to seap.cui_authority
-- (when is_public + institution_cui matches an authority) and firms.entities
-- (when is_public = false).
-- Declared public and private functions, one row per function line.
-- Rows are removed together with their parent declaration
-- (ON DELETE CASCADE).
CREATE TABLE IF NOT EXISTS ani.functii (
    id                  bigserial PRIMARY KEY,
    declaration_id      bigint NOT NULL
        REFERENCES ani.declaratii (id) ON DELETE CASCADE,

    -- TRUE for a public function, FALSE for a private one; nullable, so it
    -- may be left unset.
    is_public           boolean,

    function_name       text NOT NULL,
    institution_name    text NOT NULL,
    -- Identifier of the institution, resolved later.
    institution_cui     text,

    start_year          smallint,
    -- NULL while the function is still active.
    end_year            smallint,

    -- Annual income from this function.
    salary_lei          numeric,

    -- Original row text.
    raw_row_text        text
);
-- Fetch all functions belonging to one declaration.
CREATE INDEX IF NOT EXISTS idx_functii_decl
    ON ani.functii USING btree (declaration_id);

-- Join path by resolved institution identifier; unresolved rows excluded.
CREATE INDEX IF NOT EXISTS idx_functii_inst_cui
    ON ani.functii USING btree (institution_cui)
    WHERE institution_cui IS NOT NULL;

-- Trigram GIN index for fuzzy search on the institution name
-- (requires the pg_trgm extension).
CREATE INDEX IF NOT EXISTS idx_functii_inst_name_trgm
    ON ani.functii USING gin (institution_name gin_trgm_ops);
-- ── ani.donatii ────────────────────────────────────────────────────────────
-- Section V (donații primite).
-- Donations received, one row per donation line. Rows are removed together
-- with their parent declaration (ON DELETE CASCADE).
CREATE TABLE IF NOT EXISTS ani.donatii (
    id                  bigserial PRIMARY KEY,
    declaration_id      bigint NOT NULL
        REFERENCES ani.declaratii (id) ON DELETE CASCADE,

    donor_name          text,
    -- Optional; when present it must be one of the listed kinds.
    donation_type       text
        CHECK (
            donation_type IS NULL
            OR donation_type IN (
                'bani', 'imobil', 'mobil', 'servicii', 'altele'
            )
        ),

    -- Declared value and its currency code (defaults to RON).
    value_lei           numeric,
    currency            text DEFAULT 'RON',

    year_received       smallint,

    -- Original row text.
    raw_row_text        text
);
-- Fetch all donations belonging to one declaration.
CREATE INDEX IF NOT EXISTS idx_donatii_decl
    ON ani.donatii USING btree (declaration_id);
-- ── Comments ──────────────────────────────────────────────────────────────
-- Catalog descriptions, visible through psql's \d+ / \dn+ and the
-- obj_description() / col_description() functions. Re-running a COMMENT ON
-- statement simply overwrites the stored text.

-- Schema-level description: data sources and legal basis.
COMMENT ON SCHEMA ani
    IS 'ANI declarații de avere și interese. Sources: declaratii.integritate.eu + old-declaratii.integritate.eu. Public by Law 176/2010.';

-- Table-level descriptions.
COMMENT ON TABLE ani.declaratii
    IS 'One row per PDF declaration. official_id resolved in Stage 4 dedup.';

COMMENT ON TABLE ani.shareholdings
    IS 'Section IX firme deținute. THE flagship cross-reference: firm_cui joins to firms.entities → seap.announcements.';

-- Column-level description: how to turn pdf_path into an absolute path.
COMMENT ON COLUMN ani.declaratii.pdf_path
    IS 'Relative path under /opt/vreaudigital-data/ani/. Full path: /opt/vreaudigital-data/ani/$pdf_path';