initial: split from gov-agreg — vreau.digital standalone platform

Moved from gov-agreg/src/pages/achizitii/* to root (drop prefix).
- 22 pages migrated, 127 files total
- All internal links: /achizitii/X → /X (176 occurrences fixed)
- AchizitiiLayout subnav rewritten: /X paths, top-right link to vreaudigital.ro hub
- BaseLayout new (vreau.digital branding, OG tags, site URL)
- astro.config.mjs: site https://vreau.digital, server output (was static)
- docker-compose: port 5096 (vreaudigital is 5095), container vreau-digital
- deploy.sh: paths /opt/vreau-digital, log /var/log/vreau-digital-deploy.log

Backend shared with gov-agreg:
- PostgreSQL satra (same schemas: seap, firms, anaf, anre, ...)
- Photon, Martin tiles
- Infisical /vreaudigital path (DATABASE_URL etc. shared)

build: PASS (npx astro check 0 errors, npm run build 5s vite + 10s server)
This commit is contained in:
Claude VM
2026-05-13 00:10:32 +03:00
commit a6c03a091e
352 changed files with 75295 additions and 0 deletions
@@ -0,0 +1,212 @@
"""
ANAF /restante/ probe — discovers actual mechanism.
Steps:
1. GET /restante/ → extract javax.faces.ViewState, session cookie
2. GET kaptcha.jpg (same session)
3. POST kaptcha image to 2captcha → get text solution
4. POST /restante/index.xhtml with captcha + form fields → get response
5. Print: response HTML structure, table shape, pagination markers, quarter
selector evidence
Used ONCE to understand the page before committing to a full scraper rewrite.
Spends ~$0.001 of 2captcha credit.
"""
import base64
import os
import re
import sys
import time
import requests
# Endpoints of the ANAF "restante" site used by the probe.
BASE = "https://www.anaf.ro/restante"
INDEX_URL = BASE + "/index.xhtml"
KAPTCHA_URL = BASE + "/kaptcha.jpg"

# Browser-like User-Agent sent with the probe's requests.
USER_AGENT = " ".join((
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36",
    "(KHTML, like Gecko) Chrome/120.0 Safari/537.36",
))

# Timeout value handed to every `requests` call in this script.
TIMEOUT = 30

# 2captcha endpoints: in.php receives the job, res.php is polled for the answer.
TWOCAPTCHA_IN, TWOCAPTCHA_RES = (
    "https://2captcha.com/in.php",
    "https://2captcha.com/res.php",
)
def log(msg: str) -> None:
    """Emit one probe diagnostic line on stderr, flushed immediately."""
    line = f"[probe] {msg}"
    print(line, file=sys.stderr, flush=True)
def get_initial(session: requests.Session) -> tuple[str, str]:
    """Load the /restante/ landing page and pull out its JSF ViewState.

    Returns the page HTML together with the ViewState token. Raises
    RuntimeError when the page carries no ViewState field, and whatever
    `raise_for_status()` raises on an HTTP error status.
    """
    landing_url = f"{BASE}/"
    log(f"GET {landing_url}")
    response = session.get(landing_url, timeout=TIMEOUT)
    response.raise_for_status()
    page = response.text

    found = re.search(r'name="javax\.faces\.ViewState"[^>]*value="([^"]+)"', page)
    if found is None:
        raise RuntimeError("No ViewState found")
    token = found.group(1)

    # Only a prefix of the token is logged.
    log(f"viewstate={token[:24]}")
    log(f"cookies after GET: {list(session.cookies.keys())}")
    return page, token
def get_kaptcha(session: requests.Session) -> bytes:
    """Download the captcha image through the given session and return its bytes.

    A payload that does not start with the JPEG magic bytes is only logged as
    a warning; the bytes are returned regardless.
    """
    log(f"GET {KAPTCHA_URL}")
    response = session.get(
        KAPTCHA_URL,
        headers={"Referer": f"{BASE}/"},
        timeout=TIMEOUT,
    )
    response.raise_for_status()
    payload = response.content

    looks_like_jpeg = payload.startswith(b"\xff\xd8\xff")
    if not looks_like_jpeg:
        log(f"WARN: kaptcha response not JPEG (first bytes: {payload[:10]!r})")
    log(f"kaptcha bytes: {len(payload)} (jpg)")
    return payload
def solve_kaptcha(api_key: str, image: bytes) -> str:
    """Send the captcha image to 2captcha and wait for the recognised text.

    The job is submitted once to in.php; res.php is then polled every 5
    seconds, at most 30 times. Raises RuntimeError when 2captcha rejects the
    job, answers a poll with anything other than "not ready", or no answer
    arrives within the polling budget.
    """
    submission = {
        "key": api_key,
        "method": "base64",
        "body": base64.b64encode(image).decode(),
        "json": "1",
        # Hints for the 2captcha workers: short alphanumeric text (kaptcha
        # default is 5-6 chars, mixed letter+digit, anti-aliased).
        "numeric": "0",  # 0 = any chars allowed
        "min_len": "4",
        "max_len": "8",
        "language": "2",  # 2 = any language (alphanumeric)
        "regsense": "1",  # case-sensitive ON
    }
    log("POST 2captcha in.php with image…")
    created = requests.post(TWOCAPTCHA_IN, data=submission, timeout=TIMEOUT)
    created.raise_for_status()
    reply = created.json()
    if reply.get("status") != 1:
        raise RuntimeError(f"2captcha in.php error: {reply}")
    job_id = reply["request"]
    log(f"2captcha job id={job_id}, polling…")

    poll_query = {"key": api_key, "action": "get", "id": job_id, "json": "1"}
    # Both spellings of the "not ready" marker are accepted.
    not_ready = ("CAPCHA_NOT_READY", "CAPTCHA_NOT_READY")
    poll_no = 0
    while poll_no < 30:  # 30 * 5s = 150s cap
        poll_no += 1
        time.sleep(5)
        reply = requests.get(TWOCAPTCHA_RES, params=poll_query, timeout=TIMEOUT).json()
        if reply.get("status") == 1:
            answer = reply["request"]
            log(f"2captcha solved: {answer!r}")
            return answer
        if reply.get("request") not in not_ready:
            raise RuntimeError(f"2captcha res.php error: {reply}")
        log(f" poll {poll_no}: not ready")
    raise RuntimeError("2captcha timeout 150s")
def post_search(session: requests.Session, viewstate: str, captcha: str, search: str = "") -> requests.Response:
    """Submit the search form and return the raw response.

    An empty `search` is sent in the hope that the site lists everything.
    The HTTP status is logged but not checked here.
    """
    log(f"POST {INDEX_URL} captcha={captcha!r} search={search!r}")

    form_fields = {
        "form": "form",
        "form:inputc": captcha,
        "form:searchdata": search,
        "form:submit": "",  # button submit
        "form_SUBMIT": "1",
        "javax.faces.ViewState": viewstate,
    }
    browser_headers = {
        "Referer": f"{BASE}/",
        "Origin": "https://www.anaf.ro",
        "User-Agent": USER_AGENT,
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "ro,en;q=0.9",
        "Content-Type": "application/x-www-form-urlencoded",
    }
    response = session.post(
        INDEX_URL,
        data=form_fields,
        headers=browser_headers,
        timeout=TIMEOUT,
    )
    log(f"POST status={response.status_code} bytes={len(response.text)} content-type={response.headers.get('content-type')}")
    return response
def analyze_response(html: str) -> None:
    """Log heuristic signals found in the search response.

    Nothing is returned; every finding goes through `log`. The checks are
    regex/substring scans: error wording, table and row counts, PrimeFaces
    markers, 6-10 digit numbers, total/date/export phrases and a preview of
    the visible body text.
    """
    lowered = html.lower()
    log("=" * 70)
    log("RESPONSE ANALYSIS:")

    # Captcha / validation error wording, printed with surrounding context.
    if any(word in lowered for word in ("incorect", "invalid", "gresit")):
        for hit in re.finditer(r".{40}(?:incorect|invalid|gresit).{80}", html, re.IGNORECASE):
            log(f" ERR phrase: {hit.group(0)!r}")

    # Raw counts of <table> and <tr> opening tags.
    table_tags = re.findall(r"<table[^>]*>", html, re.IGNORECASE)
    log(f" <table> count: {len(table_tags)}")
    row_tags = re.findall(r"<tr[^>]*>", html, re.IGNORECASE)
    log(f" <tr> count: {len(row_tags)}")

    # PrimeFaces DataTable marker, plus a rows-per-page hint when present.
    if "ui-datatable" in html:
        log(" PrimeFaces DataTable detected")
        rows_attr = re.search(r'rows="?(\d+)"?', html)
        if rows_attr:
            log(f" rows attr: {rows_attr.group(1)}")

    # Any paginator markup?
    if "ui-paginator" in html or "paginator" in lowered:
        log(" Pagination control present")

    # Digit runs that could be CUI/CIF values.
    candidates = re.findall(r"\b\d{6,10}\b", html)
    log(f" numeric strings 6-10 digits: {len(candidates)} (possible CUIs)")
    if candidates:
        log(f" samples: {candidates[:10]}")

    # Phrase scans: totals, quarter/publication dates, export controls.
    phrase_scans = (
        ("total", r"(?:total|înregistrări|inregistrari|rezultate)[^<>]{0,60}"),
        ("date", r"(?:trim|trimestru|publicat)[^<>]{0,80}"),
        ("export", r"(?:export|descarc|csv|xls)[^<>]{0,40}"),
    )
    for label, pattern in phrase_scans:
        for hit in re.finditer(pattern, html, re.IGNORECASE):
            log(f" {label} phrase: {hit.group(0)!r}")

    # First 500 characters of the tag-stripped, whitespace-collapsed body.
    body_match = re.search(r"<body[^>]*>(.*?)</body>", html, re.DOTALL | re.IGNORECASE)
    if body_match:
        visible = re.sub(r"<[^>]+>", " ", body_match.group(1))
        visible = re.sub(r"\s+", " ", visible).strip()
        log(f" body text preview: {visible[:500]!r}")
def main():
    """Run the one-off probe: fetch page, solve captcha, submit, analyse.

    Requires TWOCAPTCHA_KEY in the environment (exits with status 1 when it
    is missing). The captcha image and the response HTML are saved under
    /tmp for manual inspection.
    """
    api_key = os.environ.get("TWOCAPTCHA_KEY")
    if not api_key:
        print("Missing TWOCAPTCHA_KEY env var", file=sys.stderr)
        sys.exit(1)

    s = requests.Session()
    s.headers.update({"User-Agent": USER_AGENT})

    # Only the ViewState is needed from the landing page.
    _, viewstate = get_initial(s)
    image = get_kaptcha(s)

    # Save image locally for debugging
    with open("/tmp/probe_kaptcha.jpg", "wb") as f:
        f.write(image)
    log("kaptcha image saved /tmp/probe_kaptcha.jpg")

    captcha_text = solve_kaptcha(api_key, image)
    r = post_search(s, viewstate, captcha_text, search="")

    # Explicit UTF-8: the page contains Romanian diacritics, and the
    # locale-default encoding (e.g. ASCII under LANG=C) would raise
    # UnicodeEncodeError on write.
    with open("/tmp/probe_response.html", "w", encoding="utf-8") as f:
        f.write(r.text)
    log("response saved /tmp/probe_response.html")

    analyze_response(r.text)


if __name__ == "__main__":
    main()
@@ -0,0 +1,525 @@
"""
ANAF datornici (persoane juridice) — live scraper.
Source: https://www.anaf.ro/restante/ (JSF/PrimeFaces, JCaptcha image).
NOT Cloudflare Turnstile (initial assumption was wrong, confirmed via probe).
Mechanism (per probe 2026-05-12):
1. GET /restante/ → extract `javax.faces.ViewState` + session cookies
2. GET /restante/kaptcha.jpg (same session)
3. POST kaptcha image to 2captcha (~$0.0005) → get 5-char text token
4. POST /restante/index.xhtml with captcha + form fields → first page of data
5. AJAX PrimeFaces pagination POSTs for subsequent pages (no new captcha)
6. Parse <tr data-ri=N> rows, extract 24 cells per row, UPSERT to anaf.datornici
Site shows CURRENT QUARTER ONLY (no historical access). Each quarterly run
captures one snapshot. Historical pre-2026-Q1 is permanently lost — we keep
the 2016-Q1 data.gov.ro snapshot already in DB.
Env vars:
TWOCAPTCHA_KEY — required (image solver)
DATABASE_URL — postgres conn string (Prisma-style ?schema= stripped)
DRY_RUN=1 — fetch the landing page only (checks connectivity + ViewState); no captcha solve, no DB writes
ROWS_PER_PAGE=1000 — pagination chunk size (default 1000; reduce if PrimeFaces times out)
ANAF_DATORNICI_LOG — log path (default stderr)
"""
from __future__ import annotations
import base64
import io
import logging
import os
import re
import sys
import time
from dataclasses import dataclass, field
from datetime import date
from typing import Any
import psycopg2
import psycopg2.extras
import requests
# ─────────────────────────────────────────────────────────────────────────────
# Logging
# Optional log file; stderr logging is always enabled.
LOG_FILE = os.environ.get("ANAF_DATORNICI_LOG", "")
_handlers: list[logging.Handler] = [logging.StreamHandler(sys.stderr)]
if LOG_FILE:
    try:
        _handlers.append(logging.FileHandler(LOG_FILE))
    except OSError as exc:
        # Stay best-effort (stderr logging still works), but do not hide the
        # problem: a silently missing log file is hard to diagnose later.
        print(
            f"WARNING: cannot open log file {LOG_FILE!r}: {exc}; logging to stderr only",
            file=sys.stderr,
        )
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=_handlers,
)
log = logging.getLogger("anaf_datornici")
# ─────────────────────────────────────────────────────────────────────────────
# Constants
BASE = "https://www.anaf.ro/restante"
INDEX_PAGE = f"{BASE}/"                # landing page (GET)
INDEX_FORM = f"{BASE}/index.xhtml"     # form target (POST)
KAPTCHA_URL = f"{BASE}/kaptcha.jpg"    # captcha image
USER_AGENT = (
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/120.0 Safari/537.36"
)
TIMEOUT = 60  # timeout value passed to every `requests` call
TWOCAPTCHA_IN = "https://2captcha.com/in.php"
TWOCAPTCHA_RES = "https://2captcha.com/res.php"
TWOCAPTCHA_POLL_INTERVAL = 5  # seconds
TWOCAPTCHA_MAX_POLL = 36  # 36 * 5s = 180s
TWOCAPTCHA_MAX_ATTEMPTS = 3  # captcha solve retries on wrong-text
TWOCAPTCHA_REPORT = True  # report bad solves for credit refund
# Read once at import time; a non-integer ROWS_PER_PAGE raises ValueError here.
DEFAULT_ROWS_PER_PAGE = int(os.environ.get("ROWS_PER_PAGE", "1000"))
# ─────────────────────────────────────────────────────────────────────────────
# Quarter math
def parse_publication_date(html: str) -> tuple[date, str]:
    """Extract 'Obligații fiscale restante la data de DD.MM.YYYY' from page.

    Returns ``(publication_date, period_label)`` where the label is
    ``"T<quarter> <year>"`` derived from the month of the date.

    Raises:
        RuntimeError: no "data de DD.MM.YYYY" text with a valid calendar date
            was found. Impossible dates (e.g. 31.02.2026) are skipped rather
            than surfacing as ValueError, so callers that catch RuntimeError
            keep working.
    """
    for m in re.finditer(r"data\s+de\s+(\d{2})\.(\d{2})\.(\d{4})", html, re.IGNORECASE):
        d, mo, y = (int(g) for g in m.groups())
        try:
            pub_date = date(y, mo, d)
        except ValueError:
            # Digits matched but do not form a real date; try the next match.
            continue
        # Map publication_date → quarter label.
        # Convention: pub at end-of-quarter (31 Mar = T1, 30 Jun = T2, 30 Sep = T3, 31 Dec = T4).
        q = (mo - 1) // 3 + 1
        return pub_date, f"T{q} {y}"
    raise RuntimeError("Cannot parse publication_date from page HTML")
# ─────────────────────────────────────────────────────────────────────────────
# 2captcha image solver
def solve_kaptcha(api_key: str, image: bytes, *, attempt: int = 1) -> tuple[str, str]:
    """Submit JPEG to 2captcha, poll for text. Returns (token, captcha_id).

    Args:
        api_key: 2captcha API key.
        image: raw captcha image bytes (sent base64-encoded).
        attempt: attempt number, used only in the log line.

    Raises:
        RuntimeError: 2captcha rejected the job, answered a poll with an
            error other than "not ready", or produced no answer within
            TWOCAPTCHA_MAX_POLL polls.
    """
    b64 = base64.b64encode(image).decode()
    r = requests.post(
        TWOCAPTCHA_IN,
        data={
            "key": api_key,
            "method": "base64",
            "body": b64,
            "json": "1",
            "numeric": "0",  # any chars
            "min_len": "4",
            "max_len": "8",
            "language": "2",  # any language
            "regsense": "1",  # case-sensitive
        },
        timeout=TIMEOUT,
    )
    r.raise_for_status()
    j = r.json()
    if j.get("status") != 1:
        raise RuntimeError(f"2captcha in.php error: {j}")
    cid = j["request"]
    log.info(f"2captcha attempt {attempt}: id={cid}, polling…")

    poll_params = {"key": api_key, "action": "get", "id": cid, "json": "1"}
    for _ in range(TWOCAPTCHA_MAX_POLL):
        time.sleep(TWOCAPTCHA_POLL_INTERVAL)
        try:
            jj = requests.get(TWOCAPTCHA_RES, params=poll_params, timeout=TIMEOUT).json()
        except (requests.RequestException, ValueError) as exc:
            # The job is already paid for: a transient network error or a
            # non-JSON body must not throw it away. Use up one poll slot and
            # try again.
            log.warning(f"2captcha poll failed ({exc}); retrying")
            continue
        if jj.get("status") == 1:
            return jj["request"], cid
        if jj.get("request") in ("CAPCHA_NOT_READY", "CAPTCHA_NOT_READY"):
            continue
        raise RuntimeError(f"2captcha res.php error: {jj}")
    raise RuntimeError(f"2captcha timeout after {TWOCAPTCHA_MAX_POLL*TWOCAPTCHA_POLL_INTERVAL}s")
def report_bad_solve(api_key: str, cid: str) -> None:
    """Report wrong solve to 2captcha for credit refund.

    Best-effort: a failure to deliver the report never interrupts the scrape,
    but it is logged so missing refunds can be traced.
    """
    try:
        requests.get(
            TWOCAPTCHA_RES,
            params={"key": api_key, "action": "reportbad", "id": cid},
            timeout=TIMEOUT,
        )
    except Exception as exc:  # deliberate catch-all: reporting is optional
        log.warning(f"2captcha reportbad failed for id={cid}: {exc}")
# ─────────────────────────────────────────────────────────────────────────────
# Session / pagination
@dataclass
class AnafSession:
    """One scraping session against the ANAF /restante/ debtor list.

    Holds the HTTP session (and therefore its cookies), the current JSF
    ViewState token, and the metadata extracted from the first results page.
    """

    api_key: str  # kept on the session; not read by the methods below
    s: requests.Session = field(default_factory=requests.Session)
    viewstate: str = ""
    publication_date: date | None = None
    period_label: str = ""
    total_records: int = 0  # rough estimate taken from the paginator text

    # ViewState as a hidden <input> (full pages) and as an <update> element
    # (AJAX partial responses). Un-annotated, so these are not dataclass fields.
    _VIEWSTATE_INPUT = r'name="javax\.faces\.ViewState"[^>]*value="([^"]+)"'
    _VIEWSTATE_PARTIAL = r'<update id="[^"]*javax\.faces\.ViewState[^"]*"><!\[CDATA\[([^\]]+)\]\]>'

    def __post_init__(self):
        self.s.headers.update({"User-Agent": USER_AGENT})

    def _adopt_viewstate(self, pattern: str, text: str) -> bool:
        """Store the ViewState found by `pattern` in `text`; report success."""
        m = re.search(pattern, text)
        if m:
            self.viewstate = m.group(1)
        return m is not None

    def bootstrap(self) -> None:
        """GET initial page, extract ViewState + session cookies.

        Raises RuntimeError when the page has no ViewState field.
        """
        log.info(f"GET {INDEX_PAGE}")
        r = self.s.get(INDEX_PAGE, timeout=TIMEOUT)
        r.raise_for_status()
        if not self._adopt_viewstate(self._VIEWSTATE_INPUT, r.text):
            raise RuntimeError("No ViewState in initial page")
        log.info(f"viewstate fetched ({len(self.viewstate)} chars)")

    def get_kaptcha(self) -> bytes:
        """Download the captcha image in this session; must be a JPEG."""
        r = self.s.get(
            KAPTCHA_URL, timeout=TIMEOUT, headers={"Referer": INDEX_PAGE}
        )
        r.raise_for_status()
        if not r.content.startswith(b"\xff\xd8\xff"):
            raise RuntimeError("kaptcha response not JPEG")
        return r.content

    def submit_initial(self, captcha_text: str, rows_per_page: int) -> str:
        """POST form with captcha → first page of data (HTML).

        `rows_per_page` is only logged here; it is not sent with the form.

        Raises:
            CaptchaWrong: the response contains a captcha-rejection phrase.
        """
        log.info(f"POST {INDEX_FORM} (captcha={captcha_text!r}, rows={rows_per_page})")
        r = self.s.post(
            INDEX_FORM,
            data={
                "form": "form",
                "form:inputc": captcha_text,
                "form:searchdata": "",
                "form:submit": "",
                "form_SUBMIT": "1",
                "javax.faces.ViewState": self.viewstate,
            },
            headers={
                "Referer": INDEX_PAGE,
                "Origin": "https://www.anaf.ro",
                "User-Agent": USER_AGENT,
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                "Accept-Language": "ro,en;q=0.9",
                "Content-Type": "application/x-www-form-urlencoded",
            },
            timeout=TIMEOUT,
        )
        r.raise_for_status()
        page = r.text

        # Refresh ViewState (JSF rotates it on each interaction)
        self._adopt_viewstate(self._VIEWSTATE_INPUT, page)

        # Detect captcha-error case. Slice before lowercasing so only the
        # page head is copied, not the whole (potentially large) response.
        if "Cod de validare gresit" in page or "incorect" in page[:5000].lower():
            raise CaptchaWrong(page)

        # Extract publication date + total
        try:
            self.publication_date, self.period_label = parse_publication_date(page)
            log.info(f"publication_date={self.publication_date} period={self.period_label}")
        except (RuntimeError, ValueError):
            # ValueError covers a matched-but-impossible date (e.g. 31.02.2026),
            # which must take the same fallback as "no date found".
            log.warning("could not parse publication_date — using today's quarter")
            today = date.today()
            self.publication_date = today
            self.period_label = f"T{(today.month - 1) // 3 + 1} {today.year}"

        m = re.search(r"\((\d+)\s+of\s+(\d+)\)", page)
        if m:
            self.total_records = int(m.group(2)) * 16  # pages * rows-per-page-default
            log.info(f"total_records estimate (from paginator): ~{self.total_records}")
        return page

    def fetch_page(self, first: int, rows_per_page: int) -> str:
        """AJAX PrimeFaces pagination POST. Returns partial response XML.

        `first` is the row offset and `rows_per_page` the requested chunk
        size. The stored ViewState is refreshed when the partial response
        carries a new one.
        """
        r = self.s.post(
            INDEX_FORM,
            data={
                "javax.faces.partial.ajax": "true",
                "javax.faces.source": "form:dataTable",
                "javax.faces.partial.execute": "form:dataTable",
                "javax.faces.partial.render": "form:dataTable",
                "form:dataTable": "form:dataTable",
                "form:dataTable_pagination": "true",
                "form:dataTable_first": str(first),
                "form:dataTable_rows": str(rows_per_page),
                "form:dataTable_encodeFeature": "true",
                "form": "form",
                "form:inputc": "",
                "form:searchdata": "",
                "javax.faces.ViewState": self.viewstate,
            },
            headers={
                "Referer": INDEX_FORM,
                "Origin": "https://www.anaf.ro",
                "User-Agent": USER_AGENT,
                "Accept": "application/xml,text/xml,*/*;q=0.01",
                "Accept-Language": "ro,en;q=0.9",
                "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
                "X-Requested-With": "XMLHttpRequest",
                "Faces-Request": "partial/ajax",
            },
            timeout=TIMEOUT,
        )
        r.raise_for_status()
        # Update ViewState from partial response
        self._adopt_viewstate(self._VIEWSTATE_PARTIAL, r.text)
        return r.text
class CaptchaWrong(Exception):
    """Signals that the site rejected the submitted captcha text."""
# ─────────────────────────────────────────────────────────────────────────────
# Row parsing
def parse_rows(html_or_partial: str) -> list[dict[str, Any]]:
    """Extract debtor rows from initial HTML or AJAX partial response.

    Row layout (24 cells observed via probe 2026-05-12):
      0: nr_crt
      1: name (denumire debitor)
      2: CIF (cui)
      3: total bugetul de stat
      4: total asigurări sociale
      5: total șomaj
      6: total sănătate
      7-10: state {principal, accesorii, necontestate, contestate}
      11-14: social {principal, accesorii, necontestate, contestate}
      15-18: unemployment {principal, accesorii, necontestate, contestate}
      19-22: health {principal, accesorii, necontestate, contestate}
      23: observation/status (e.g. "Faliment")

    Rows (``<tr data-ri="N">``) with fewer than 24 cells are skipped. Text
    cells have tags stripped, HTML entities decoded and whitespace collapsed.
    Amount cells use Romanian formatting ("1.234,56"); a cell that cannot be
    parsed as a number yields 0.0.
    """
    from html import unescape  # local import: decode &amp;, &nbsp;, ...

    row_re = re.compile(r'<tr\b[^>]*data-ri="(\d+)"[^>]*>(.*?)</tr>', re.DOTALL)
    cell_re = re.compile(r"<td\b[^>]*>(.*?)</td>", re.DOTALL)
    tag_re = re.compile(r"<[^>]+>")
    ws_re = re.compile(r"\s+")

    def _txt(s: str) -> str:
        # Strip tags first, then decode entities, so an escaped "&lt;b&gt;"
        # in a company name survives as literal text.
        return ws_re.sub(" ", unescape(tag_re.sub("", s))).strip()

    def _num(s: str) -> float:
        # Drop all whitespace (incl. non-breaking spaces used as grouping),
        # remove "." thousands separators, turn the decimal comma into ".".
        t = ws_re.sub("", _txt(s)).replace(".", "").replace(",", ".")
        try:
            return float(t)
        except ValueError:
            return 0.0

    # Keys for cells 3..22, in column order.
    amount_fields = (
        "budget_state_total", "budget_social_total",
        "budget_unemployment_total", "budget_health_total",
        "state_principal", "state_penalty", "state_necontestate", "state_contestate",
        "social_principal", "social_penalty", "social_necontestate", "social_contestate",
        "unemp_principal", "unemp_penalty", "unemp_necontestate", "unemp_contestate",
        "health_principal", "health_penalty", "health_necontestate", "health_contestate",
    )

    rows: list[dict[str, Any]] = []
    for tr_m in row_re.finditer(html_or_partial):
        cells = cell_re.findall(tr_m.group(2))
        if len(cells) < 24:
            continue
        row: dict[str, Any] = {
            "nr_crt": _txt(cells[0]),
            "name": _txt(cells[1]),
            "cui": _txt(cells[2]),
        }
        row.update(zip(amount_fields, map(_num, cells[3:23])))
        row["observation"] = _txt(cells[23])
        rows.append(row)
    return rows
# ─────────────────────────────────────────────────────────────────────────────
# DB UPSERT
def upsert_rows(
    conn,
    rows: list[dict[str, Any]],
    publication_date: date,
    period_label: str,
    debtor_category: str = "persoane_juridice",
) -> int:
    """Upsert parsed debtor rows into anaf.datornici and commit.

    Args:
        conn: open database connection (used via psycopg2 `execute_values`).
        rows: dicts as produced by `parse_rows`.
        publication_date: snapshot date stored with every row.
        period_label: quarter label stored with every row.
        debtor_category: value for the debtor_category column.

    Returns:
        Number of rows sent to the database (after dropping rows without a
        CUI and collapsing duplicate CUIs).
    """
    if not rows:
        return 0

    # Keyed by normalised CUI: a CUI that appears twice in one batch would
    # make PostgreSQL abort with "ON CONFLICT DO UPDATE command cannot affect
    # row a second time", so only the last occurrence is kept.
    by_cui: dict[str, tuple] = {}
    for r in rows:
        cui = (r["cui"] or "").replace(" ", "").upper()
        # Strip the literal "RO" prefix. str.lstrip("RO") would strip any
        # run of leading 'R'/'O' characters, not the prefix.
        if cui.startswith("RO"):
            cui = cui[2:]
        if not cui:
            continue
        by_cui[cui] = (
            cui,
            r["name"],
            None,  # judet not provided by ANAF /restante/
            publication_date,
            period_label,
            debtor_category,
            # debt_total = sum of 4 category totals
            r["budget_state_total"] + r["budget_social_total"]
            + r["budget_unemployment_total"] + r["budget_health_total"],
            # principal across categories
            r["state_principal"] + r["social_principal"]
            + r["unemp_principal"] + r["health_principal"],
            # penalty across categories
            r["state_penalty"] + r["social_penalty"]
            + r["unemp_penalty"] + r["health_penalty"],
            # contestate across categories
            r["state_contestate"] + r["social_contestate"]
            + r["unemp_contestate"] + r["health_contestate"],
            # per-budget detail (12 columns)
            r["state_principal"], r["state_penalty"], r["state_contestate"],
            r["social_principal"], r["social_penalty"], r["social_contestate"],
            r["unemp_principal"], r["unemp_penalty"], r["unemp_contestate"],
            r["health_principal"], r["health_penalty"], r["health_contestate"],
            INDEX_PAGE,  # source_url
        )
    payload = list(by_cui.values())
    if not payload:
        return 0

    sql = """
        INSERT INTO anaf.datornici (
            cui, name, judet, publication_date, period_label, debtor_category,
            debt_total, debt_principal, debt_penalty, debt_contested,
            budget_state_principal, budget_state_penalty, budget_state_contested,
            budget_social_principal, budget_social_penalty, budget_social_contested,
            budget_unemployment_principal, budget_unemployment_penalty, budget_unemployment_contested,
            budget_health_principal, budget_health_penalty, budget_health_contested,
            source_url
        ) VALUES %s
        ON CONFLICT (cui, publication_date)
        DO UPDATE SET
            name = EXCLUDED.name,
            debt_total = EXCLUDED.debt_total,
            debt_principal = EXCLUDED.debt_principal,
            debt_penalty = EXCLUDED.debt_penalty,
            debt_contested = EXCLUDED.debt_contested,
            budget_state_principal = EXCLUDED.budget_state_principal,
            budget_state_penalty = EXCLUDED.budget_state_penalty,
            budget_state_contested = EXCLUDED.budget_state_contested,
            budget_social_principal = EXCLUDED.budget_social_principal,
            budget_social_penalty = EXCLUDED.budget_social_penalty,
            budget_social_contested = EXCLUDED.budget_social_contested,
            budget_unemployment_principal = EXCLUDED.budget_unemployment_principal,
            budget_unemployment_penalty = EXCLUDED.budget_unemployment_penalty,
            budget_unemployment_contested = EXCLUDED.budget_unemployment_contested,
            budget_health_principal = EXCLUDED.budget_health_principal,
            budget_health_penalty = EXCLUDED.budget_health_penalty,
            budget_health_contested = EXCLUDED.budget_health_contested,
            fetched_at = now()
    """
    with conn.cursor() as cur:
        psycopg2.extras.execute_values(cur, sql, payload, page_size=500)
    conn.commit()
    return len(payload)
# ─────────────────────────────────────────────────────────────────────────────
# Orchestration
def _strip_schema_param(db_url: str) -> str:
    """Remove a Prisma-style ``schema=...`` query parameter from a DB URL.

    Other query parameters are kept verbatim and the query string stays
    well-formed wherever ``schema`` appears in it.
    """
    base, sep, query = db_url.partition("?")
    if not sep:
        return db_url
    kept = [p for p in query.split("&") if p and not p.startswith("schema=")]
    return f"{base}?{'&'.join(kept)}" if kept else base


def run(*, dry_run: bool, rows_per_page: int) -> dict[str, int]:
    """Scrape the current debtor list and upsert it into anaf.datornici.

    Args:
        dry_run: only fetch the landing page and extract the ViewState; no
            captcha solve, no database access.
        rows_per_page: chunk size requested for the AJAX pagination calls.

    Returns:
        ``{"datornici_inserted": <rows upserted>, "errors": <pagination
        problems>}``. A non-zero ``errors`` means the snapshot may be partial.

    Raises:
        RuntimeError: missing configuration, captcha rejected on every
            attempt, or no publication date captured.
    """
    api_key = os.environ.get("TWOCAPTCHA_KEY", "")
    if not api_key and not dry_run:
        raise RuntimeError("Missing TWOCAPTCHA_KEY env var — see HANDOFF doc")

    if dry_run:
        log.info("DRY_RUN=1 — connecting only to validate config, no captcha solve")
        sess = AnafSession(api_key="")
        sess.bootstrap()
        log.info(f"bootstrap OK, viewstate captured ({len(sess.viewstate)} chars)")
        log.info(f"would solve 1 captcha (~$0.001 worst case) then paginate {rows_per_page} rows/page")
        return {"datornici_inserted": 0, "errors": 0}

    db_url = os.environ.get("DATABASE_URL", "")
    if not db_url:
        raise RuntimeError("Missing DATABASE_URL env var")
    # Connect before spending captcha credit so a bad DATABASE_URL fails fast.
    conn = psycopg2.connect(_strip_schema_param(db_url))
    conn.autocommit = False
    try:
        sess = AnafSession(api_key=api_key)

        # Captcha solve with retries (wrong-text bounce)
        initial_html = ""
        for attempt in range(1, TWOCAPTCHA_MAX_ATTEMPTS + 1):
            sess.bootstrap()
            image = sess.get_kaptcha()
            token, cid = solve_kaptcha(api_key, image, attempt=attempt)
            try:
                initial_html = sess.submit_initial(token, rows_per_page)
            except CaptchaWrong:
                log.warning(f"captcha rejected by ANAF on attempt {attempt}, retrying")
                if TWOCAPTCHA_REPORT:
                    report_bad_solve(api_key, cid)
                if attempt == TWOCAPTCHA_MAX_ATTEMPTS:
                    raise RuntimeError("captcha solve failed after retries")
                continue
            log.info(f"captcha accepted on attempt {attempt}")
            break

        # Initial page rows
        all_rows = parse_rows(initial_html)
        log.info(f"page 1: {len(all_rows)} rows")

        # Paginate until a page parses to zero rows. The paginator-based
        # total is only an estimate (it assumes 16 rows/page), so it is not
        # used as the stop condition.
        errors = 0
        first = len(all_rows)
        page_num = 2
        prev_head = all_rows[0] if all_rows else None
        while True:
            try:
                partial = sess.fetch_page(first=first, rows_per_page=rows_per_page)
                new_rows = parse_rows(partial)
            except Exception as e:
                # Keep what was collected, but report the failure to the
                # caller instead of claiming a clean run.
                log.error(f"pagination error at page {page_num}: {e}")
                errors += 1
                break
            if not new_rows:
                log.info(f"pagination exhausted at first={first}")
                break
            if new_rows[0] == prev_head:
                # Same first row as the previous page: the offset is being
                # ignored, and continuing would loop forever.
                log.error(f"pagination made no progress at page {page_num} (first={first})")
                errors += 1
                break
            prev_head = new_rows[0]
            all_rows.extend(new_rows)
            log.info(f"page {page_num}: {len(new_rows)} rows (running total: {len(all_rows)})")
            first += len(new_rows)
            page_num += 1
        log.info(f"total rows collected: {len(all_rows)}")

        if not sess.publication_date:
            raise RuntimeError("No publication_date captured")
        inserted = upsert_rows(
            conn, all_rows,
            publication_date=sess.publication_date,
            period_label=sess.period_label,
        )
        log.info(f"upserted {inserted} rows into anaf.datornici for {sess.period_label}")
        return {"datornici_inserted": inserted, "errors": errors}
    finally:
        # Release the connection on success and on every failure path.
        conn.close()
def main():
    """Script entry point.

    Reads DRY_RUN and ROWS_PER_PAGE from the environment, runs the scrape and
    logs the result; any exception is logged with its traceback and turns
    into exit status 1.
    """
    env = os.environ
    dry_run = env.get("DRY_RUN", "0") == "1"
    rows_per_page = int(env.get("ROWS_PER_PAGE", str(DEFAULT_ROWS_PER_PAGE)))
    log.info(f"=== ANAF datornici scrape: dry_run={dry_run} rows_per_page={rows_per_page} ===")
    try:
        result = run(dry_run=dry_run, rows_per_page=rows_per_page)
    except Exception as e:
        log.error(f"FATAL: {e}", exc_info=True)
        sys.exit(1)
    else:
        log.info(f"DONE {result}")


if __name__ == "__main__":
    main()
@@ -0,0 +1,425 @@
"""
ANAF lista albă (contribuabili FĂRĂ obligații restante) — live scraper.
Source: https://www.anaf.ro/restante/listaalba.xhtml (JSF/PrimeFaces, JCaptcha image).
SAME mechanism as anaf_datornici scraper, but different endpoint and 3-column row layout.
Mechanism (per probe 2026-05-12):
1. GET /restante/listaalba.xhtml → extract `javax.faces.ViewState` + session cookies
2. GET /restante/kaptcha.jpg (same session)
3. POST kaptcha image to 2captcha (~$0.0005) → get 5-char text token
4. POST /restante/listaalba.xhtml with captcha + form fields → first page of data
5. AJAX PrimeFaces pagination POSTs for subsequent pages (no new captcha)
6. Parse <tr data-ri=N> rows, extract 3 cells per row (nr_crt, name, cui),
UPSERT to anaf.lista_alba
Site shows CURRENT QUARTER ONLY (no historical access). Each quarterly run
captures one snapshot.
Env vars:
TWOCAPTCHA_KEY — required (image solver)
DATABASE_URL — postgres conn string (Prisma-style ?schema= stripped)
DRY_RUN=1 — fetch the landing page only (checks connectivity + ViewState); no captcha solve, no DB writes
ROWS_PER_PAGE=1000 — pagination chunk size
ANAF_LISTA_ALBA_LOG — log path (default stderr)
"""
from __future__ import annotations
import base64
import logging
import os
import re
import sys
import time
from dataclasses import dataclass, field
from datetime import date
from typing import Any
import psycopg2
import psycopg2.extras
import requests
# ─────────────────────────────────────────────────────────────────────────────
# Logging
# Optional log file; stderr logging is always enabled.
LOG_FILE = os.environ.get("ANAF_LISTA_ALBA_LOG", "")
_handlers: list[logging.Handler] = [logging.StreamHandler(sys.stderr)]
if LOG_FILE:
    try:
        _handlers.append(logging.FileHandler(LOG_FILE))
    except OSError as exc:
        # Stay best-effort (stderr logging still works), but do not hide the
        # problem: a silently missing log file is hard to diagnose later.
        print(
            f"WARNING: cannot open log file {LOG_FILE!r}: {exc}; logging to stderr only",
            file=sys.stderr,
        )
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=_handlers,
)
log = logging.getLogger("anaf_lista_alba")
# ─────────────────────────────────────────────────────────────────────────────
# Constants
BASE = "https://www.anaf.ro/restante"
INDEX_PAGE = f"{BASE}/listaalba.xhtml"
INDEX_FORM = INDEX_PAGE  # form action POSTs to the same URL
KAPTCHA_URL = f"{BASE}/kaptcha.jpg"
USER_AGENT = (
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/120.0 Safari/537.36"
)
TIMEOUT = 60  # timeout value passed to every `requests` call
TWOCAPTCHA_IN = "https://2captcha.com/in.php"
TWOCAPTCHA_RES = "https://2captcha.com/res.php"
TWOCAPTCHA_POLL_INTERVAL = 5  # seconds between polls (passed to time.sleep)
TWOCAPTCHA_MAX_POLL = 36  # 36 * 5s = 180s
TWOCAPTCHA_MAX_ATTEMPTS = 3
TWOCAPTCHA_REPORT = True
# Read once at import time; a non-integer ROWS_PER_PAGE raises ValueError here.
DEFAULT_ROWS_PER_PAGE = int(os.environ.get("ROWS_PER_PAGE", "1000"))
# ─────────────────────────────────────────────────────────────────────────────
# Quarter math
def parse_publication_date(html: str) -> tuple[date, str]:
    """Extract 'la data de DD.MM.YYYY' from page.

    Returns ``(publication_date, period_label)`` where the label is
    ``"T<quarter> <year>"`` derived from the month of the date.

    Raises:
        RuntimeError: no "data de DD.MM.YYYY" text with a valid calendar date
            was found. Impossible dates (e.g. 31.02.2026) are skipped rather
            than surfacing as ValueError, so callers that catch RuntimeError
            keep working.
    """
    for m in re.finditer(r"data\s+de\s+(\d{2})\.(\d{2})\.(\d{4})", html, re.IGNORECASE):
        d, mo, y = (int(g) for g in m.groups())
        try:
            pub_date = date(y, mo, d)
        except ValueError:
            # Digits matched but do not form a real date; try the next match.
            continue
        q = (mo - 1) // 3 + 1
        return pub_date, f"T{q} {y}"
    raise RuntimeError("Cannot parse publication_date from page HTML")
# ─────────────────────────────────────────────────────────────────────────────
# 2captcha image solver
def solve_kaptcha(api_key: str, image: bytes, *, attempt: int = 1) -> tuple[str, str]:
    """Submit the captcha image to 2captcha and poll for its text.

    Args:
        api_key: 2captcha API key.
        image: raw captcha image bytes (sent base64-encoded).
        attempt: attempt number, used only in the log line.

    Returns:
        ``(token, captcha_id)`` — the recognised text and the 2captcha job id.

    Raises:
        RuntimeError: 2captcha rejected the job, answered a poll with an
            error other than "not ready", or produced no answer within
            TWOCAPTCHA_MAX_POLL polls.
    """
    b64 = base64.b64encode(image).decode()
    r = requests.post(
        TWOCAPTCHA_IN,
        data={
            "key": api_key,
            "method": "base64",
            "body": b64,
            "json": "1",
            "numeric": "0",
            "min_len": "4",
            "max_len": "8",
            "language": "2",
            "regsense": "1",
        },
        timeout=TIMEOUT,
    )
    r.raise_for_status()
    j = r.json()
    if j.get("status") != 1:
        raise RuntimeError(f"2captcha in.php error: {j}")
    cid = j["request"]
    log.info(f"2captcha attempt {attempt}: id={cid}, polling…")

    poll_params = {"key": api_key, "action": "get", "id": cid, "json": "1"}
    for _ in range(TWOCAPTCHA_MAX_POLL):
        time.sleep(TWOCAPTCHA_POLL_INTERVAL)
        try:
            jj = requests.get(TWOCAPTCHA_RES, params=poll_params, timeout=TIMEOUT).json()
        except (requests.RequestException, ValueError) as exc:
            # The job is already paid for: a transient network error or a
            # non-JSON body must not throw it away. Use up one poll slot and
            # try again.
            log.warning(f"2captcha poll failed ({exc}); retrying")
            continue
        if jj.get("status") == 1:
            return jj["request"], cid
        if jj.get("request") in ("CAPCHA_NOT_READY", "CAPTCHA_NOT_READY"):
            continue
        raise RuntimeError(f"2captcha res.php error: {jj}")
    raise RuntimeError(f"2captcha timeout after {TWOCAPTCHA_MAX_POLL*TWOCAPTCHA_POLL_INTERVAL}s")
def report_bad_solve(api_key: str, cid: str) -> None:
    """Send a `reportbad` request for a wrong solve to 2captcha.

    Best-effort: a failure to deliver the report never interrupts the scrape,
    but it is logged so missing refunds can be traced.
    """
    try:
        requests.get(
            TWOCAPTCHA_RES,
            params={"key": api_key, "action": "reportbad", "id": cid},
            timeout=TIMEOUT,
        )
    except Exception as exc:  # deliberate catch-all: reporting is optional
        log.warning(f"2captcha reportbad failed for id={cid}: {exc}")
# ─────────────────────────────────────────────────────────────────────────────
# Session / pagination
@dataclass
class AnafSession:
    """One scraping session against the ANAF "lista albă" page.

    Holds the HTTP session (and therefore its cookies), the current JSF
    ViewState token, and the publication metadata taken from the first
    results page.
    """

    api_key: str  # kept on the session; not read by the methods below
    s: requests.Session = field(default_factory=requests.Session)
    viewstate: str = ""
    publication_date: date | None = None
    period_label: str = ""

    # ViewState as a hidden <input> (full pages) and as an <update> element
    # (AJAX partial responses). Un-annotated, so these are not dataclass fields.
    _VIEWSTATE_INPUT = r'name="javax\.faces\.ViewState"[^>]*value="([^"]+)"'
    _VIEWSTATE_PARTIAL = r'<update id="[^"]*javax\.faces\.ViewState[^"]*"><!\[CDATA\[([^\]]+)\]\]>'

    def __post_init__(self):
        self.s.headers.update({"User-Agent": USER_AGENT})

    def _adopt_viewstate(self, pattern: str, text: str) -> bool:
        """Store the ViewState found by `pattern` in `text`; report success."""
        m = re.search(pattern, text)
        if m:
            self.viewstate = m.group(1)
        return m is not None

    def bootstrap(self) -> None:
        """GET the list page and capture its ViewState (cookies stay in `s`).

        Raises RuntimeError when the page has no ViewState field.
        """
        log.info(f"GET {INDEX_PAGE}")
        r = self.s.get(INDEX_PAGE, timeout=TIMEOUT)
        r.raise_for_status()
        if not self._adopt_viewstate(self._VIEWSTATE_INPUT, r.text):
            raise RuntimeError("No ViewState in initial page")
        log.info(f"viewstate fetched ({len(self.viewstate)} chars)")

    def get_kaptcha(self) -> bytes:
        """Download the captcha image in this session; must be a JPEG."""
        r = self.s.get(
            KAPTCHA_URL, timeout=TIMEOUT, headers={"Referer": INDEX_PAGE}
        )
        r.raise_for_status()
        if not r.content.startswith(b"\xff\xd8\xff"):
            raise RuntimeError("kaptcha response not JPEG")
        return r.content

    def submit_initial(self, captcha_text: str, rows_per_page: int) -> str:
        """POST the form with the captcha text; return the first results page.

        `rows_per_page` is only logged here; it is not sent with the form.

        Raises:
            CaptchaWrong: the response contains a captcha-rejection phrase.
        """
        log.info(f"POST {INDEX_FORM} (captcha={captcha_text!r}, rows={rows_per_page})")
        r = self.s.post(
            INDEX_FORM,
            data={
                "form": "form",
                "form:inputc": captcha_text,
                "form:searchdata": "",
                "form:submit": "",
                "form_SUBMIT": "1",
                "javax.faces.ViewState": self.viewstate,
            },
            headers={
                "Referer": INDEX_PAGE,
                "Origin": "https://www.anaf.ro",
                "User-Agent": USER_AGENT,
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                "Accept-Language": "ro,en;q=0.9",
                "Content-Type": "application/x-www-form-urlencoded",
            },
            timeout=TIMEOUT,
        )
        r.raise_for_status()
        page = r.text

        # Pick up the new ViewState when the response carries one.
        self._adopt_viewstate(self._VIEWSTATE_INPUT, page)

        # Slice before lowercasing so only the page head is copied, not the
        # whole (potentially large) response.
        if "Cod de validare gresit" in page or "incorect" in page[:5000].lower():
            raise CaptchaWrong(page)

        try:
            self.publication_date, self.period_label = parse_publication_date(page)
            log.info(f"publication_date={self.publication_date} period={self.period_label}")
        except (RuntimeError, ValueError):
            # ValueError covers a matched-but-impossible date (e.g. 31.02.2026),
            # which must take the same fallback as "no date found".
            log.warning("could not parse publication_date — using today's quarter")
            today = date.today()
            self.publication_date = today
            self.period_label = f"T{(today.month - 1) // 3 + 1} {today.year}"
        return page

    def fetch_page(self, first: int, rows_per_page: int) -> str:
        """AJAX pagination POST; returns the partial-response body.

        `first` is the row offset and `rows_per_page` the requested chunk
        size. The stored ViewState is refreshed when the partial response
        carries a new one.
        """
        r = self.s.post(
            INDEX_FORM,
            data={
                "javax.faces.partial.ajax": "true",
                "javax.faces.source": "form:dataTable",
                "javax.faces.partial.execute": "form:dataTable",
                "javax.faces.partial.render": "form:dataTable",
                "form:dataTable": "form:dataTable",
                "form:dataTable_pagination": "true",
                "form:dataTable_first": str(first),
                "form:dataTable_rows": str(rows_per_page),
                "form:dataTable_encodeFeature": "true",
                "form": "form",
                "form:inputc": "",
                "form:searchdata": "",
                "javax.faces.ViewState": self.viewstate,
            },
            headers={
                "Referer": INDEX_FORM,
                "Origin": "https://www.anaf.ro",
                "User-Agent": USER_AGENT,
                "Accept": "application/xml,text/xml,*/*;q=0.01",
                "Accept-Language": "ro,en;q=0.9",
                "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
                "X-Requested-With": "XMLHttpRequest",
                "Faces-Request": "partial/ajax",
            },
            timeout=TIMEOUT,
        )
        r.raise_for_status()
        self._adopt_viewstate(self._VIEWSTATE_PARTIAL, r.text)
        return r.text
class CaptchaWrong(Exception):
    """Signals that the site rejected the submitted captcha text."""
# ─────────────────────────────────────────────────────────────────────────────
# Row parsing — only 3 columns in lista_alba
def _cell_text(fragment: str) -> str:
    """Return a table cell's visible text: tags removed, entities decoded, whitespace collapsed."""
    from html import unescape  # function-scope import keeps this block self-contained

    without_tags = re.sub(r"<[^>]+>", "", fragment)
    # Decode entities only AFTER stripping tags, so an escaped "&lt;b&gt;" in
    # the data is kept as literal text instead of being removed as markup.
    return re.sub(r"\s+", " ", unescape(without_tags)).strip()


def parse_rows(html_or_partial: str) -> list[dict[str, Any]]:
    """Extract rows from initial HTML or AJAX partial.

    Only ``<tr>`` elements carrying a ``data-ri`` attribute are considered;
    rows with fewer than three ``<td>`` cells are skipped.

    Row layout (3 cells per probe 2026-05-12):
      0: nr_crt
      1: name (denumirea contribuabilului)
      2: CIF (cui)

    Args:
        html_or_partial: Full page HTML or a partial-response body.

    Returns:
        One dict per row with the keys ``nr_crt``, ``name`` and ``cui``.
        Values are plain text with HTML entities decoded (``&amp;`` -> ``&``).
    """
    rows: list[dict[str, Any]] = []
    for tr_m in re.finditer(
        r'<tr\b[^>]*data-ri="(\d+)"[^>]*>(.*?)</tr>',
        html_or_partial, re.DOTALL,
    ):
        cells = re.findall(r"<td\b[^>]*>(.*?)</td>", tr_m.group(2), re.DOTALL)
        if len(cells) < 3:
            continue
        rows.append({
            "nr_crt": _cell_text(cells[0]),
            "name": _cell_text(cells[1]),
            "cui": _cell_text(cells[2]),
        })
    return rows
# ─────────────────────────────────────────────────────────────────────────────
# DB UPSERT
def upsert_rows(
    conn,
    rows: list[dict[str, Any]],
    publication_date: date,
    period_label: str,
) -> int:
    """Insert or update scraped rows in ``anaf.lista_alba`` and commit.

    Each row's ``cui`` is normalized (spaces removed, upper-cased, one leading
    ``RO`` prefix dropped). Rows whose CUI is empty after normalization are
    skipped. Rows are de-duplicated by normalized CUI (the last occurrence
    wins), because a single ``INSERT ... ON CONFLICT DO UPDATE`` statement may
    not touch the same target row twice.

    Args:
        conn: Open database connection (used via ``cursor()`` and ``commit()``).
        rows: Parsed rows; each must provide the keys ``cui`` and ``name``.
        publication_date: Stored with every row; part of the conflict key.
        period_label: Stored with every row.

    Returns:
        Number of distinct rows sent to the database (0 when nothing usable).
    """
    if not rows:
        return 0

    name_by_cui: dict[str, Any] = {}
    for r in rows:
        raw_cui = r["cui"]
        if not raw_cui:
            continue
        cui = raw_cui.replace(" ", "").upper()
        # Drop exactly one "RO" prefix; str.lstrip("RO") would strip any run
        # of leading 'R'/'O' characters instead.
        if cui.startswith("RO"):
            cui = cui[2:]
        if cui:
            name_by_cui[cui] = r["name"]

    if not name_by_cui:
        return 0

    source_url = INDEX_PAGE
    payload = [
        (cui, name, publication_date, period_label, source_url)
        for cui, name in name_by_cui.items()
    ]
    sql = """
        INSERT INTO anaf.lista_alba (
            cui, name, publication_date, period_label, source_url
        ) VALUES %s
        ON CONFLICT (cui, publication_date)
        DO UPDATE SET
            name = EXCLUDED.name,
            period_label = EXCLUDED.period_label,
            source_url = EXCLUDED.source_url,
            fetched_at = now()
    """
    with conn.cursor() as cur:
        psycopg2.extras.execute_values(cur, sql, payload, page_size=1000)
    conn.commit()
    return len(payload)
# ─────────────────────────────────────────────────────────────────────────────
# Orchestration
def _strip_schema_param(db_url: str) -> str:
    """Remove ``schema=...`` query parameters from a database URL, keeping all others."""
    base, sep, query = db_url.partition("?")
    if not sep:
        return db_url
    # Filter parameter-by-parameter so the leading "?" survives when other
    # parameters follow a removed "schema=" entry.
    kept = [part for part in query.split("&") if part and not part.startswith("schema=")]
    return f"{base}?{'&'.join(kept)}" if kept else base


def run(*, dry_run: bool, rows_per_page: int) -> dict[str, int]:
    """Scrape the full list and upsert it into ``anaf.lista_alba``.

    Reads ``TWOCAPTCHA_KEY`` and ``DATABASE_URL`` from the environment. In
    dry-run mode only the session bootstrap is performed (no captcha solve,
    no database access). Otherwise the captcha is solved (retried up to
    ``TWOCAPTCHA_MAX_ATTEMPTS`` times), every page is fetched until an empty
    page is returned, and the collected rows are upserted.

    A pagination failure stops the paging loop; the rows collected so far are
    still upserted and the failure is reported in the ``errors`` count.

    Args:
        dry_run: When true, validate connectivity only.
        rows_per_page: Page size requested from the site.

    Returns:
        ``{"lista_alba_inserted": <rows upserted>, "errors": <pagination failures>}``.

    Raises:
        RuntimeError: Missing environment variable, captcha not accepted after
            all attempts, or no publication date captured.
    """
    api_key = os.environ.get("TWOCAPTCHA_KEY", "")
    if not api_key and not dry_run:
        raise RuntimeError("Missing TWOCAPTCHA_KEY env var")

    if dry_run:
        log.info("DRY_RUN=1 — connecting only to validate config, no captcha solve")
        sess = AnafSession(api_key="")
        sess.bootstrap()
        log.info(f"bootstrap OK, viewstate captured ({len(sess.viewstate)} chars)")
        log.info(f"would solve 1 captcha (~$0.001 worst case) then paginate {rows_per_page} rows/page")
        return {"lista_alba_inserted": 0, "errors": 0}

    db_url = os.environ.get("DATABASE_URL", "")
    if not db_url:
        raise RuntimeError("Missing DATABASE_URL env var")

    conn = psycopg2.connect(_strip_schema_param(db_url))
    try:
        conn.autocommit = False
        sess = AnafSession(api_key=api_key)

        initial_html = ""
        for attempt in range(1, TWOCAPTCHA_MAX_ATTEMPTS + 1):
            # Every attempt starts from a fresh page load and captcha image.
            sess.bootstrap()
            image = sess.get_kaptcha()
            token, cid = solve_kaptcha(api_key, image, attempt=attempt)
            try:
                initial_html = sess.submit_initial(token, rows_per_page)
            except CaptchaWrong:
                log.warning(f"captcha rejected by ANAF on attempt {attempt}, retrying")
                if TWOCAPTCHA_REPORT and cid:
                    report_bad_solve(api_key, cid)
            else:
                log.info(f"captcha accepted on attempt {attempt}")
                break
        else:
            # Loop ended without a break: no attempt was accepted.
            raise RuntimeError("captcha solve failed after retries")

        all_rows = parse_rows(initial_html)
        log.info(f"page 1: {len(all_rows)} rows")

        errors = 0
        first = len(all_rows)
        page_num = 2
        while True:
            try:
                partial = sess.fetch_page(first=first, rows_per_page=rows_per_page)
                new_rows = parse_rows(partial)
            except Exception as e:
                # Best effort: keep what was collected, but report the failure.
                log.error(f"pagination error at page {page_num}: {e}")
                errors += 1
                break
            if not new_rows:
                log.info(f"pagination exhausted at first={first}")
                break
            all_rows.extend(new_rows)
            log.info(f"page {page_num}: {len(new_rows)} rows (running total: {len(all_rows)})")
            first += len(new_rows)
            page_num += 1

        log.info(f"total rows collected: {len(all_rows)}")
        if not sess.publication_date:
            raise RuntimeError("No publication_date captured")

        inserted = upsert_rows(
            conn, all_rows,
            publication_date=sess.publication_date,
            period_label=sess.period_label,
        )
        log.info(f"upserted {inserted} rows into anaf.lista_alba for {sess.period_label}")
    finally:
        # Release the connection on every exit path, including failures.
        conn.close()
    return {"lista_alba_inserted": inserted, "errors": errors}
def main():
    """Script entry point: read settings from the environment and run the scrape.

    ``DRY_RUN`` enables dry-run mode when set to ``1``; ``ROWS_PER_PAGE``
    overrides ``DEFAULT_ROWS_PER_PAGE``. Any exception raised by ``run`` is
    logged with its traceback and the process exits with status 1.
    """
    dry_run = os.getenv("DRY_RUN", "0") == "1"
    rows_per_page = int(os.getenv("ROWS_PER_PAGE", str(DEFAULT_ROWS_PER_PAGE)))
    log.info(f"=== ANAF lista_alba scrape: dry_run={dry_run} rows_per_page={rows_per_page} ===")
    try:
        outcome = run(dry_run=dry_run, rows_per_page=rows_per_page)
    except Exception as exc:
        log.error(f"FATAL: {exc}", exc_info=True)
        sys.exit(1)
    else:
        log.info(f"DONE {outcome}")
# Run the scrape only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()