initial: split from gov-agreg — vreau.digital standalone platform

Moved from gov-agreg/src/pages/achizitii/* to root (drop prefix).
- 22 pages migrated, 127 files total
- All internal links: /achizitii/X → /X (176 occurrences fixed)
- AchizitiiLayout subnav rewritten: /X paths, top-right link to vreaudigital.ro hub
- BaseLayout new (vreau.digital branding, OG tags, site URL)
- astro.config.mjs: site https://vreau.digital, server output (was static)
- docker-compose: port 5096 (vreaudigital is 5095), container vreau-digital
- deploy.sh: paths /opt/vreau-digital, log /var/log/vreau-digital-deploy.log

Backend shared with gov-agreg:
- PostgreSQL satra (same schemas: seap, firms, anaf, anre, ...)
- Photon, Martin tiles
- Infisical /vreaudigital path (DATABASE_URL etc. shared)

build: PASS (npx astro check 0 errors, npm run build 5s vite + 10s server)
This commit is contained in:
Claude VM
2026-05-13 00:10:32 +03:00
commit a6c03a091e
352 changed files with 75295 additions and 0 deletions
@@ -0,0 +1,525 @@
"""
ANAF datornici (persoane juridice) — live scraper.
Source: https://www.anaf.ro/restante/ (JSF/PrimeFaces, JCaptcha image).
NOT Cloudflare Turnstile (initial assumption was wrong, confirmed via probe).
Mechanism (per probe 2026-05-12):
1. GET /restante/ → extract `javax.faces.ViewState` + session cookies
2. GET /restante/kaptcha.jpg (same session)
3. POST kaptcha image to 2captcha (~$0.0005) → get 5-char text token
4. POST /restante/index.xhtml with captcha + form fields → first page of data
5. AJAX PrimeFaces pagination POSTs for subsequent pages (no new captcha)
6. Parse <tr data-ri=N> rows, extract 24 cells per row, UPSERT to anaf.datornici
Site shows CURRENT QUARTER ONLY (no historical access). Each quarterly run
captures one snapshot. Historical pre-2026-Q1 is permanently lost — we keep
the 2016-Q1 data.gov.ro snapshot already in DB.
Env vars:
TWOCAPTCHA_KEY — required (image solver)
DATABASE_URL — postgres conn string (Prisma-style ?schema= stripped)
DRY_RUN=1 — bootstrap GET only (checks that the page and its ViewState are reachable); no captcha solve, no DB writes
ROWS_PER_PAGE=1000 — pagination chunk size (default 1000; reduce if PrimeFaces times out)
ANAF_DATORNICI_LOG — log path (default stderr)
"""
from __future__ import annotations
import base64
import io
import logging
import os
import re
import sys
import time
from dataclasses import dataclass, field
from datetime import date
from typing import Any
import psycopg2
import psycopg2.extras
import requests
# ─────────────────────────────────────────────────────────────────────────────
# Logging
# Optional log file path; an empty string means "stderr only".
LOG_FILE = os.environ.get("ANAF_DATORNICI_LOG", "")
_handlers: list[logging.Handler] = [logging.StreamHandler(sys.stderr)]
# Remember a failure to open the log file so it can be reported once logging
# is configured, instead of being dropped silently.
_log_file_error = None
if LOG_FILE:
    try:
        _handlers.append(logging.FileHandler(LOG_FILE))
    except OSError as _exc:
        # Best effort: keep running with stderr logging only.
        _log_file_error = _exc
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=_handlers,
)
log = logging.getLogger("anaf_datornici")
if _log_file_error is not None:
    log.warning(
        "cannot open log file %r (%s); logging to stderr only",
        LOG_FILE,
        _log_file_error,
    )
# ─────────────────────────────────────────────────────────────────────────────
# Constants
# Root of the ANAF /restante/ application; the three URLs below derive from it.
BASE = "https://www.anaf.ro/restante"
# Landing page: GET target for the session bootstrap, also sent as Referer.
INDEX_PAGE = f"{BASE}/"
# POST target for both the captcha form submit and the AJAX pagination calls.
INDEX_FORM = f"{BASE}/index.xhtml"
# Captcha image endpoint (fetched within the same session as the landing page).
KAPTCHA_URL = f"{BASE}/kaptcha.jpg"
# Browser-like User-Agent sent with the scraper's requests to ANAF.
USER_AGENT = (
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
"(KHTML, like Gecko) Chrome/120.0 Safari/537.36"
)
# Passed as the `timeout=` argument of the HTTP calls below (seconds).
TIMEOUT = 60
# 2captcha endpoints: in.php accepts the image, res.php is polled for the
# answer and also receives the "reportbad" action.
TWOCAPTCHA_IN = "https://2captcha.com/in.php"
TWOCAPTCHA_RES = "https://2captcha.com/res.php"
TWOCAPTCHA_POLL_INTERVAL = 5 # seconds
TWOCAPTCHA_MAX_POLL = 36 # 36 * 5s = 180s
TWOCAPTCHA_MAX_ATTEMPTS = 3 # captcha solve retries on wrong-text
TWOCAPTCHA_REPORT = True # report bad solves for credit refund
# Pagination chunk size, read once at import time from ROWS_PER_PAGE; a
# non-integer value raises ValueError here.
DEFAULT_ROWS_PER_PAGE = int(os.environ.get("ROWS_PER_PAGE", "1000"))
# ─────────────────────────────────────────────────────────────────────────────
# Quarter math
def parse_publication_date(html: str) -> tuple[date, str]:
    """Extract the publication date and its quarter label from page text.

    Looks for the phrase "... data de DD.MM.YYYY" (as in "Obligații fiscale
    restante la data de DD.MM.YYYY") and derives the quarter from the month.

    Args:
        html: Page HTML (or any text) expected to contain the phrase.

    Returns:
        ``(publication_date, period_label)`` where ``period_label`` has the
        form ``"T<quarter> <year>"``, e.g. ``"T1 2026"``.

    Raises:
        RuntimeError: If the phrase is missing, or the matched digits do not
            form a valid calendar date.
    """
    m = re.search(r"data\s+de\s+(\d{2})\.(\d{2})\.(\d{4})", html, re.IGNORECASE)
    if not m:
        raise RuntimeError("Cannot parse publication_date from page HTML")
    day, month, year = (int(g) for g in m.groups())
    try:
        pub_date = date(year, month, day)
    except ValueError as exc:
        # Surface impossible dates (e.g. 31.02) as the same error type as a
        # missing phrase, so callers that handle RuntimeError also cover this.
        raise RuntimeError(
            f"Cannot parse publication_date from page HTML: invalid date {m.group(0)!r}"
        ) from exc
    # Map publication_date → quarter label.
    # Convention: pub at end-of-quarter (31 Mar = T1, 30 Jun = T2, 30 Sep = T3, 31 Dec = T4).
    quarter = (month - 1) // 3 + 1
    return pub_date, f"T{quarter} {year}"
# ─────────────────────────────────────────────────────────────────────────────
# 2captcha image solver
def solve_kaptcha(api_key: str, image: bytes, *, attempt: int = 1) -> tuple[str, str]:
    """Submit a captcha image to 2captcha and poll until it is solved.

    Args:
        api_key: 2captcha API key.
        image: Raw image bytes; sent base64-encoded.
        attempt: Attempt number, used only in the log line.

    Returns:
        ``(token, captcha_id)`` — the solved text and the 2captcha request id
        (needed later to report a wrong solve).

    Raises:
        requests.HTTPError: If in.php or res.php answers with an HTTP error.
        RuntimeError: If 2captcha reports an error, or no answer arrives
            within ``TWOCAPTCHA_MAX_POLL * TWOCAPTCHA_POLL_INTERVAL`` seconds.
    """
    b64 = base64.b64encode(image).decode()
    r = requests.post(
        TWOCAPTCHA_IN,
        data={
            "key": api_key,
            "method": "base64",
            "body": b64,
            "json": "1",
            "numeric": "0",  # any chars
            "min_len": "4",
            "max_len": "8",
            # NOTE(review): original comment said "any language"; 2captcha's
            # docs appear to list 2 as "Latin alphabet" — confirm.
            "language": "2",
            "regsense": "1",  # case-sensitive
        },
        timeout=TIMEOUT,
    )
    r.raise_for_status()
    j = r.json()
    if j.get("status") != 1:
        raise RuntimeError(f"2captcha in.php error: {j}")
    cid = j["request"]
    log.info(f"2captcha attempt {attempt}: id={cid}, polling…")
    for _ in range(TWOCAPTCHA_MAX_POLL):
        time.sleep(TWOCAPTCHA_POLL_INTERVAL)
        rr = requests.get(
            TWOCAPTCHA_RES,
            params={"key": api_key, "action": "get", "id": cid, "json": "1"},
            timeout=TIMEOUT,
        )
        # Check the HTTP status before decoding, as for the submit call above;
        # otherwise an error page surfaces as an unrelated JSON decode error.
        rr.raise_for_status()
        jj = rr.json()
        if jj.get("status") == 1:
            return jj["request"], cid
        # Both spellings are accepted for the "not ready yet" marker.
        if jj.get("request") in ("CAPCHA_NOT_READY", "CAPTCHA_NOT_READY"):
            continue
        raise RuntimeError(f"2captcha res.php error: {jj}")
    raise RuntimeError(f"2captcha timeout after {TWOCAPTCHA_MAX_POLL*TWOCAPTCHA_POLL_INTERVAL}s")
def report_bad_solve(api_key: str, cid: str) -> None:
    """Report a wrong solve to 2captcha for credit refund (best effort).

    Args:
        api_key: 2captcha API key.
        cid: 2captcha request id of the solve being reported.

    A failed report never aborts the scrape: request errors are logged as a
    warning and otherwise ignored.
    """
    try:
        requests.get(
            TWOCAPTCHA_RES,
            params={"key": api_key, "action": "reportbad", "id": cid},
            timeout=TIMEOUT,
        )
    except requests.RequestException as exc:
        # Losing a refund is not worth failing the run, but leave a trace.
        log.warning(f"2captcha reportbad failed for id={cid}: {exc}")
# ─────────────────────────────────────────────────────────────────────────────
# Session / pagination
@dataclass
class AnafSession:
    """One scraping session against the ANAF /restante/ application.

    Holds the HTTP session (cookies), the current JSF ViewState and the
    metadata extracted from the first result page.
    """

    api_key: str
    s: requests.Session = field(default_factory=requests.Session)
    viewstate: str = ""
    publication_date: date | None = None
    period_label: str = ""
    total_records: int = 0

    def __post_init__(self):
        # Every request of this session identifies itself as a browser.
        self.s.headers.update({"User-Agent": USER_AGENT})

    @staticmethod
    def _find_viewstate(page: str):
        """Return the regex match for the ViewState hidden input, or None."""
        return re.search(r'name="javax\.faces\.ViewState"[^>]*value="([^"]+)"', page)

    def bootstrap(self) -> None:
        """Fetch the landing page and store its ViewState (cookies stay in ``s``).

        Raises:
            RuntimeError: If the page carries no ViewState input.
        """
        log.info(f"GET {INDEX_PAGE}")
        response = self.s.get(INDEX_PAGE, timeout=TIMEOUT)
        response.raise_for_status()
        found = self._find_viewstate(response.text)
        if found is None:
            raise RuntimeError("No ViewState in initial page")
        self.viewstate = found.group(1)
        log.info(f"viewstate fetched ({len(self.viewstate)} chars)")

    def get_kaptcha(self) -> bytes:
        """Download the captcha image for this session and return its bytes.

        Raises:
            RuntimeError: If the payload does not start with the JPEG magic.
        """
        response = self.s.get(
            KAPTCHA_URL,
            timeout=TIMEOUT,
            headers={"Referer": INDEX_PAGE},
        )
        response.raise_for_status()
        payload = response.content
        if payload[:3] != b"\xff\xd8\xff":
            raise RuntimeError("kaptcha response not JPEG")
        return payload

    def submit_initial(self, captcha_text: str, rows_per_page: int) -> str:
        """Submit the captcha form and return the HTML of the first data page.

        Side effects: refreshes ``viewstate`` and fills ``publication_date``,
        ``period_label`` and (when a paginator is present) ``total_records``.
        ``rows_per_page`` is only logged here; it is not sent with the form.

        Raises:
            CaptchaWrong: If the response looks like a captcha rejection.
        """
        log.info(f"POST {INDEX_FORM} (captcha={captcha_text!r}, rows={rows_per_page})")
        form_fields = {
            "form": "form",
            "form:inputc": captcha_text,
            "form:searchdata": "",
            "form:submit": "",
            "form_SUBMIT": "1",
            "javax.faces.ViewState": self.viewstate,
        }
        request_headers = {
            "Referer": INDEX_PAGE,
            "Origin": "https://www.anaf.ro",
            "User-Agent": USER_AGENT,
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "ro,en;q=0.9",
            "Content-Type": "application/x-www-form-urlencoded",
        }
        response = self.s.post(
            INDEX_FORM,
            data=form_fields,
            headers=request_headers,
            timeout=TIMEOUT,
        )
        response.raise_for_status()
        page = response.text

        # JSF rotates the ViewState on each interaction; keep the newest one.
        refreshed = self._find_viewstate(page)
        if refreshed:
            self.viewstate = refreshed.group(1)

        # A rejected captcha is signalled in the page text, not by HTTP status.
        if "Cod de validare gresit" in page or "incorect" in page.lower()[:5000]:
            raise CaptchaWrong(page)

        # Publication date, with today's quarter as the fallback.
        try:
            self.publication_date, self.period_label = parse_publication_date(page)
        except RuntimeError:
            log.warning("could not parse publication_date — using today's quarter")
            today = date.today()
            self.publication_date = today
            self.period_label = f"T{(today.month - 1) // 3 + 1} {today.year}"
        else:
            log.info(f"publication_date={self.publication_date} period={self.period_label}")

        # Paginator text "(X of Y)": Y pages at the default 16 rows per page.
        paginator = re.search(r"\((\d+)\s+of\s+(\d+)\)", page)
        if paginator:
            self.total_records = int(paginator.group(2)) * 16
            log.info(f"total_records estimate (from paginator): ~{self.total_records}")
        return page

    def fetch_page(self, first: int, rows_per_page: int) -> str:
        """Request one more table page via the PrimeFaces AJAX pagination POST.

        Args:
            first: Zero-based offset of the first row requested.
            rows_per_page: Number of rows requested.

        Returns:
            The partial-response body; ``viewstate`` is refreshed when the
            response carries a new one.
        """
        ajax_fields = {
            "javax.faces.partial.ajax": "true",
            "javax.faces.source": "form:dataTable",
            "javax.faces.partial.execute": "form:dataTable",
            "javax.faces.partial.render": "form:dataTable",
            "form:dataTable": "form:dataTable",
            "form:dataTable_pagination": "true",
            "form:dataTable_first": str(first),
            "form:dataTable_rows": str(rows_per_page),
            "form:dataTable_encodeFeature": "true",
            "form": "form",
            "form:inputc": "",
            "form:searchdata": "",
            "javax.faces.ViewState": self.viewstate,
        }
        ajax_headers = {
            "Referer": INDEX_FORM,
            "Origin": "https://www.anaf.ro",
            "User-Agent": USER_AGENT,
            "Accept": "application/xml,text/xml,*/*;q=0.01",
            "Accept-Language": "ro,en;q=0.9",
            "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
            "X-Requested-With": "XMLHttpRequest",
            "Faces-Request": "partial/ajax",
        }
        response = self.s.post(
            INDEX_FORM,
            data=ajax_fields,
            headers=ajax_headers,
            timeout=TIMEOUT,
        )
        response.raise_for_status()
        body = response.text
        # The new ViewState travels in its own <update> CDATA section.
        rotated = re.search(
            r'<update id="[^"]*javax\.faces\.ViewState[^"]*"><!\[CDATA\[([^\]]+)\]\]>',
            body,
        )
        if rotated:
            self.viewstate = rotated.group(1)
        return body
class CaptchaWrong(Exception):
    """Raised when ANAF rejects the submitted captcha text."""
# ─────────────────────────────────────────────────────────────────────────────
# Row parsing
def parse_rows(html_or_partial: str) -> list[dict[str, Any]]:
    """Extract debtor rows from initial HTML or an AJAX partial response.

    Row layout (24 cells observed via probe 2026-05-12):
      0: nr_crt
      1: name (denumire debitor)
      2: CIF (cui)
      3: total bugetul de stat
      4: total asigurări sociale
      5: total șomaj
      6: total sănătate
      7-10: state {principal, accesorii, necontestate, contestate}
      11-14: social {principal, accesorii, necontestate, contestate}
      15-18: unemployment {principal, accesorii, necontestate, contestate}
      19-22: health {principal, accesorii, necontestate, contestate}
      23: observation/status (e.g. "Faliment")

    Args:
        html_or_partial: Markup containing ``<tr data-ri="N">`` rows.

    Returns:
        One dict per row that has at least 24 cells. Text fields are
        tag-stripped, entity-decoded and whitespace-normalised; amounts are
        floats parsed from the "1.234,56" format (unparseable → 0.0).
    """
    from html import unescape

    # Keys for cells 3..22, in cell order.
    numeric_fields = (
        "budget_state_total",
        "budget_social_total",
        "budget_unemployment_total",
        "budget_health_total",
        "state_principal",
        "state_penalty",
        "state_necontestate",
        "state_contestate",
        "social_principal",
        "social_penalty",
        "social_necontestate",
        "social_contestate",
        "unemp_principal",
        "unemp_penalty",
        "unemp_necontestate",
        "unemp_contestate",
        "health_principal",
        "health_penalty",
        "health_necontestate",
        "health_contestate",
    )

    def _txt(s: str) -> str:
        # Strip tags first, then decode entities, so an escaped "&lt;b&gt;"
        # in the data is kept as text rather than removed as a tag.
        t = unescape(re.sub(r"<[^>]+>", "", s))
        return re.sub(r"\s+", " ", t).strip()

    def _num(s: str) -> float:
        # "." is the thousands separator and "," the decimal mark.
        t = _txt(s).replace(".", "").replace(",", ".")
        try:
            return float(t)
        except ValueError:
            return 0.0

    rows: list[dict[str, Any]] = []
    # Match each <tr ... data-ri="N">…</tr>
    for tr_m in re.finditer(
        r'<tr\b[^>]*data-ri="(\d+)"[^>]*>(.*?)</tr>',
        html_or_partial,
        re.DOTALL,
    ):
        cells = re.findall(r"<td\b[^>]*>(.*?)</td>", tr_m.group(2), re.DOTALL)
        if len(cells) < 24:
            # Not a data row (or a layout we do not understand): skip it.
            continue
        row: dict[str, Any] = {
            "nr_crt": _txt(cells[0]),
            "name": _txt(cells[1]),
            "cui": _txt(cells[2]),
        }
        row.update(
            (key, _num(cell)) for key, cell in zip(numeric_fields, cells[3:23])
        )
        row["observation"] = _txt(cells[23])
        rows.append(row)
    return rows
# ─────────────────────────────────────────────────────────────────────────────
# DB UPSERT
def _normalize_cui(raw: str) -> str:
    """Return *raw* without spaces, upper-cased, minus one leading "RO"."""
    cui = raw.replace(" ", "").upper()
    # Remove the country prefix as a prefix; str.lstrip("RO") would strip any
    # run of leading "R"/"O" characters instead.
    return cui[2:] if cui.startswith("RO") else cui


def upsert_rows(
    conn,
    rows: list[dict[str, Any]],
    publication_date: date,
    period_label: str,
    debtor_category: str = "persoane_juridice",
) -> int:
    """Upsert parsed debtor rows into ``anaf.datornici``.

    Args:
        conn: Open psycopg2 connection; committed on success, rolled back on
            failure.
        rows: Row dicts as produced by ``parse_rows``.
        publication_date: Snapshot date stored with every row.
        period_label: Quarter label stored with every row.
        debtor_category: Category stored with every row.

    Returns:
        Number of distinct CUIs written. Rows whose CUI is empty after
        normalisation are skipped; when a CUI occurs more than once, the last
        occurrence wins.

    Raises:
        Whatever the database driver raises; the transaction is rolled back
        before the exception propagates.
    """
    if not rows:
        return 0
    # Keyed by normalised CUI: publication_date is constant within one call,
    # so a repeated CUI would hit the (cui, publication_date) conflict target
    # twice in one INSERT, which PostgreSQL rejects for ON CONFLICT DO UPDATE.
    by_cui: dict[str, tuple] = {}
    for r in rows:
        cui = _normalize_cui(r["cui"])
        if not cui:
            continue
        by_cui[cui] = (
            cui,
            r["name"],
            None,  # judet not provided by ANAF /restante/
            publication_date,
            period_label,
            debtor_category,
            # debt_total = sum of 4 category totals
            r["budget_state_total"] + r["budget_social_total"]
            + r["budget_unemployment_total"] + r["budget_health_total"],
            # principal across categories
            r["state_principal"] + r["social_principal"]
            + r["unemp_principal"] + r["health_principal"],
            # penalty across categories
            r["state_penalty"] + r["social_penalty"]
            + r["unemp_penalty"] + r["health_penalty"],
            # contestate across categories
            r["state_contestate"] + r["social_contestate"]
            + r["unemp_contestate"] + r["health_contestate"],
            # per-budget detail (12 columns)
            r["state_principal"], r["state_penalty"], r["state_contestate"],
            r["social_principal"], r["social_penalty"], r["social_contestate"],
            r["unemp_principal"], r["unemp_penalty"], r["unemp_contestate"],
            r["health_principal"], r["health_penalty"], r["health_contestate"],
            INDEX_PAGE,  # source_url
        )
    if not by_cui:
        return 0
    payload = list(by_cui.values())
    sql = """
        INSERT INTO anaf.datornici (
            cui, name, judet, publication_date, period_label, debtor_category,
            debt_total, debt_principal, debt_penalty, debt_contested,
            budget_state_principal, budget_state_penalty, budget_state_contested,
            budget_social_principal, budget_social_penalty, budget_social_contested,
            budget_unemployment_principal, budget_unemployment_penalty, budget_unemployment_contested,
            budget_health_principal, budget_health_penalty, budget_health_contested,
            source_url
        ) VALUES %s
        ON CONFLICT (cui, publication_date)
        DO UPDATE SET
            name = EXCLUDED.name,
            debt_total = EXCLUDED.debt_total,
            debt_principal = EXCLUDED.debt_principal,
            debt_penalty = EXCLUDED.debt_penalty,
            debt_contested = EXCLUDED.debt_contested,
            budget_state_principal = EXCLUDED.budget_state_principal,
            budget_state_penalty = EXCLUDED.budget_state_penalty,
            budget_state_contested = EXCLUDED.budget_state_contested,
            budget_social_principal = EXCLUDED.budget_social_principal,
            budget_social_penalty = EXCLUDED.budget_social_penalty,
            budget_social_contested = EXCLUDED.budget_social_contested,
            budget_unemployment_principal = EXCLUDED.budget_unemployment_principal,
            budget_unemployment_penalty = EXCLUDED.budget_unemployment_penalty,
            budget_unemployment_contested = EXCLUDED.budget_unemployment_contested,
            budget_health_principal = EXCLUDED.budget_health_principal,
            budget_health_penalty = EXCLUDED.budget_health_penalty,
            budget_health_contested = EXCLUDED.budget_health_contested,
            fetched_at = now()
    """
    try:
        with conn.cursor() as cur:
            psycopg2.extras.execute_values(cur, sql, payload, page_size=500)
        conn.commit()
    except Exception:
        # Do not leave the connection inside an aborted transaction.
        conn.rollback()
        raise
    return len(payload)
# ─────────────────────────────────────────────────────────────────────────────
# Orchestration
def _strip_schema_param(db_url: str) -> str:
    """Return *db_url* without the Prisma-style ``schema=`` query parameter."""
    base, sep, query = db_url.partition("?")
    if not sep:
        return db_url
    # Rebuild the query from the remaining parameters so that removing a
    # leading "schema=" never leaves a dangling "&" in place of "?".
    kept = [p for p in query.split("&") if p and not p.startswith("schema=")]
    return f"{base}?{'&'.join(kept)}" if kept else base


def _submit_with_captcha(sess, api_key: str, rows_per_page: int) -> str:
    """Solve and submit the captcha, retrying on rejection; return page-1 HTML."""
    for attempt in range(1, TWOCAPTCHA_MAX_ATTEMPTS + 1):
        # Each attempt starts from a fresh page (ViewState) and a new image.
        sess.bootstrap()
        image = sess.get_kaptcha()
        token, cid = solve_kaptcha(api_key, image, attempt=attempt)
        try:
            initial_html = sess.submit_initial(token, rows_per_page)
        except CaptchaWrong:
            log.warning(f"captcha rejected by ANAF on attempt {attempt}, retrying")
            if TWOCAPTCHA_REPORT and cid:
                report_bad_solve(api_key, cid)
        else:
            log.info(f"captcha accepted on attempt {attempt}")
            return initial_html
    # Raised outside the except block on purpose: CaptchaWrong carries the
    # whole rejected page and would otherwise be chained into the traceback.
    raise RuntimeError("captcha solve failed after retries")


def _collect_remaining_pages(sess, all_rows: list, rows_per_page: int) -> int:
    """Append pages 2..N to *all_rows* in place; return the number of errors."""
    first = len(all_rows)
    page_num = 2
    # The paginator-based total assumes 16 rows/page and may be wrong for
    # other page sizes, so iterate until a page parses to no rows.
    # NOTE(review): this assumes the server returns an empty page past the
    # end; if it repeats the last page instead, the loop would not terminate.
    while True:
        try:
            partial = sess.fetch_page(first=first, rows_per_page=rows_per_page)
            new_rows = parse_rows(partial)
        except Exception as e:
            # Best effort: keep what was collected, but report the failure.
            log.error(f"pagination error at page {page_num}: {e}")
            return 1
        if not new_rows:
            log.info(f"pagination exhausted at first={first}")
            return 0
        all_rows.extend(new_rows)
        log.info(f"page {page_num}: {len(new_rows)} rows (running total: {len(all_rows)})")
        first += len(new_rows)
        page_num += 1


def run(*, dry_run: bool, rows_per_page: int) -> dict[str, int]:
    """Run one scrape: solve the captcha, paginate, upsert into the database.

    Args:
        dry_run: When true, only fetch the landing page to validate
            connectivity; no captcha is solved and no database is touched.
        rows_per_page: Pagination chunk size for the AJAX requests.

    Returns:
        ``{"datornici_inserted": <rows written>, "errors": <count>}``.
        ``errors`` is 1 when pagination stopped on an exception (the rows
        collected up to that point are still written), otherwise 0.

    Raises:
        RuntimeError: If ``TWOCAPTCHA_KEY`` (non-dry runs) or ``DATABASE_URL``
            is missing, the captcha is rejected on every attempt, or no
            publication date was captured.
    """
    api_key = os.environ.get("TWOCAPTCHA_KEY", "")
    if not api_key and not dry_run:
        raise RuntimeError("Missing TWOCAPTCHA_KEY env var — see HANDOFF doc")
    if dry_run:
        log.info("DRY_RUN=1 — connecting only to validate config, no captcha solve")
        sess = AnafSession(api_key="")
        sess.bootstrap()
        log.info(f"bootstrap OK, viewstate captured ({len(sess.viewstate)} chars)")
        log.info(f"would solve 1 captcha (~$0.001 worst case) then paginate {rows_per_page} rows/page")
        return {"datornici_inserted": 0, "errors": 0}
    db_url = os.environ.get("DATABASE_URL", "")
    if not db_url:
        raise RuntimeError("Missing DATABASE_URL env var")
    conn = psycopg2.connect(_strip_schema_param(db_url))
    try:
        conn.autocommit = False
        sess = AnafSession(api_key=api_key)
        initial_html = _submit_with_captcha(sess, api_key, rows_per_page)
        # Initial page rows
        all_rows = parse_rows(initial_html)
        log.info(f"page 1: {len(all_rows)} rows")
        errors = _collect_remaining_pages(sess, all_rows, rows_per_page)
        log.info(f"total rows collected: {len(all_rows)}")
        if not sess.publication_date:
            raise RuntimeError("No publication_date captured")
        inserted = upsert_rows(
            conn,
            all_rows,
            publication_date=sess.publication_date,
            period_label=sess.period_label,
        )
        log.info(f"upserted {inserted} rows into anaf.datornici for {sess.period_label}")
    finally:
        # Release the connection on every exit path, not only on success.
        conn.close()
    return {"datornici_inserted": inserted, "errors": errors}
def main():
    """Script entry point: read settings from the environment and run once.

    Reads ``DRY_RUN`` and ``ROWS_PER_PAGE``, logs the outcome, and exits with
    status 1 if the run raises.
    """
    env = os.environ
    is_dry = env.get("DRY_RUN", "0") == "1"
    page_size = int(env.get("ROWS_PER_PAGE", str(DEFAULT_ROWS_PER_PAGE)))
    log.info(f"=== ANAF datornici scrape: dry_run={is_dry} rows_per_page={page_size} ===")
    try:
        outcome = run(dry_run=is_dry, rows_per_page=page_size)
    except Exception as err:
        # ERROR-level record including the traceback, then a non-zero exit.
        log.exception(f"FATAL: {err}")
        sys.exit(1)
    else:
        log.info(f"DONE {outcome}")


# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()