""" ANAF datornici (persoane juridice) — live scraper. Source: https://www.anaf.ro/restante/ (JSF/PrimeFaces, JCaptcha image). NOT Cloudflare Turnstile (initial assumption was wrong, confirmed via probe). Mechanism (per probe 2026-05-12): 1. GET /restante/ → extract `javax.faces.ViewState` + session cookies 2. GET /restante/kaptcha.jpg (same session) 3. POST kaptcha image to 2captcha (~$0.0005) → get 5-char text token 4. POST /restante/index.xhtml with captcha + form fields → first page of data 5. AJAX PrimeFaces pagination POSTs for subsequent pages (no new captcha) 6. Parse rows, extract 24 cells per row, UPSERT to anaf.datornici Site shows CURRENT QUARTER ONLY (no historical access). Each quarterly run captures one snapshot. Historical pre-2026-Q1 is permanently lost — we keep the 2016-Q1 data.gov.ro snapshot already in DB. Env vars: TWOCAPTCHA_KEY — required (image solver) DATABASE_URL — postgres conn string (Prisma-style ?schema= stripped) DRY_RUN=1 — parse plan, no captcha, no DB writes ROWS_PER_PAGE=1000 — pagination chunk size (default 1000; reduce if PrimeFaces times out) ANAF_DATORNICI_LOG — log path (default stderr) """ from __future__ import annotations import base64 import io import logging import os import re import sys import time from dataclasses import dataclass, field from datetime import date from typing import Any import psycopg2 import psycopg2.extras import requests # ───────────────────────────────────────────────────────────────────────────── # Logging LOG_FILE = os.environ.get("ANAF_DATORNICI_LOG", "") _handlers: list[logging.Handler] = [logging.StreamHandler(sys.stderr)] if LOG_FILE: try: _handlers.append(logging.FileHandler(LOG_FILE)) except OSError: pass logging.basicConfig( level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s", handlers=_handlers, ) log = logging.getLogger("anaf_datornici") # ───────────────────────────────────────────────────────────────────────────── # Constants BASE = "https://www.anaf.ro/restante" 
INDEX_PAGE = f"{BASE}/"
INDEX_FORM = f"{BASE}/index.xhtml"
KAPTCHA_URL = f"{BASE}/kaptcha.jpg"

USER_AGENT = (
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/120.0 Safari/537.36"
)
TIMEOUT = 60

TWOCAPTCHA_IN = "https://2captcha.com/in.php"
TWOCAPTCHA_RES = "https://2captcha.com/res.php"
TWOCAPTCHA_POLL_INTERVAL = 5  # seconds
TWOCAPTCHA_MAX_POLL = 36  # 36 * 5s = 180s
TWOCAPTCHA_MAX_ATTEMPTS = 3  # captcha solve retries on wrong-text
TWOCAPTCHA_REPORT = True  # report bad solves for credit refund

DEFAULT_ROWS_PER_PAGE = int(os.environ.get("ROWS_PER_PAGE", "1000"))

# ViewState as a hidden <input> in a full HTML page.
_VIEWSTATE_INPUT_RE = re.compile(
    r'name="javax\.faces\.ViewState"[^>]*value="([^"]+)"'
)
# ViewState as an <update> element in a JSF partial (AJAX) response. The id is
# matched loosely so both a bare "javax.faces.ViewState" id and a
# container-prefixed id are accepted.
_VIEWSTATE_UPDATE_RE = re.compile(
    r'<update\s+id="[^"]*javax\.faces\.ViewState[^"]*">\s*'
    r"<!\[CDATA\[(.*?)\]\]>\s*</update>",
    re.DOTALL,
)
# One data row of the table: <tr ... data-ri="N" ...> … </tr>
_ROW_RE = re.compile(r'<tr[^>]*data-ri="(\d+)"[^>]*>(.*?)</tr>', re.DOTALL)
# One cell inside a row: <td ...> … </td>
_CELL_RE = re.compile(r"<td[^>]*>(.*?)</td>", re.DOTALL)
_TAG_RE = re.compile(r"<[^>]+>")
_WS_RE = re.compile(r"\s+")

# Keys for cells 3..22 of a table row, in column order (see parse_rows).
_NUMERIC_FIELDS = (
    "budget_state_total",
    "budget_social_total",
    "budget_unemployment_total",
    "budget_health_total",
    "state_principal",
    "state_penalty",
    "state_necontestate",
    "state_contestate",
    "social_principal",
    "social_penalty",
    "social_necontestate",
    "social_contestate",
    "unemp_principal",
    "unemp_penalty",
    "unemp_necontestate",
    "unemp_contestate",
    "health_principal",
    "health_penalty",
    "health_necontestate",
    "health_contestate",
)


# ─────────────────────────────────────────────────────────────────────────────
# Quarter math
def parse_publication_date(html: str) -> tuple[date, str]:
    """Extract 'Obligații fiscale restante la data de DD.MM.YYYY' from page.

    Returns:
        ``(publication_date, period_label)`` where the label is ``"T<q> <year>"``.

    Raises:
        RuntimeError: if no date is found, or the digits found do not form a
            valid calendar date. (A single exception type lets callers fall
            back with one ``except RuntimeError``.)
    """
    m = re.search(r"data\s+de\s+(\d{2})\.(\d{2})\.(\d{4})", html, re.IGNORECASE)
    if not m:
        raise RuntimeError("Cannot parse publication_date from page HTML")
    d, mo, y = (int(g) for g in m.groups())
    try:
        pub_date = date(y, mo, d)
    except ValueError as exc:
        raise RuntimeError(
            f"Invalid publication_date on page: {m.group(0)!r}"
        ) from exc
    # Map publication_date → quarter label.
    # Convention: pub at end-of-quarter
    # (31 Mar = T1, 30 Jun = T2, 30 Sep = T3, 31 Dec = T4).
    q = (mo - 1) // 3 + 1
    period_label = f"T{q} {y}"
    return pub_date, period_label


# ─────────────────────────────────────────────────────────────────────────────
# 2captcha image solver
def solve_kaptcha(api_key: str, image: bytes, *, attempt: int = 1) -> tuple[str, str]:
    """Submit JPEG to 2captcha, poll for text. Returns (token, captcha_id).

    Args:
        api_key: 2captcha API key.
        image: raw captcha image bytes (sent base64-encoded).
        attempt: attempt number, used only in the log line.

    Raises:
        RuntimeError: if 2captcha rejects the upload, reports a solve error,
            or no answer arrives within the polling budget.
        requests.RequestException: if the initial upload request fails.
    """
    b64 = base64.b64encode(image).decode()
    r = requests.post(
        TWOCAPTCHA_IN,
        data={
            "key": api_key,
            "method": "base64",
            "body": b64,
            "json": "1",
            "numeric": "0",  # any chars
            "min_len": "4",
            "max_len": "8",
            "language": "2",  # any language
            "regsense": "1",  # case-sensitive
        },
        timeout=TIMEOUT,
    )
    r.raise_for_status()
    j = r.json()
    if j.get("status") != 1:
        raise RuntimeError(f"2captcha in.php error: {j}")
    cid = j["request"]
    log.info(f"2captcha attempt {attempt}: id={cid}, polling…")

    for _ in range(TWOCAPTCHA_MAX_POLL):
        time.sleep(TWOCAPTCHA_POLL_INTERVAL)
        try:
            rr = requests.get(
                TWOCAPTCHA_RES,
                params={"key": api_key, "action": "get", "id": cid, "json": "1"},
                timeout=TIMEOUT,
            )
            rr.raise_for_status()
            jj = rr.json()
        except (requests.RequestException, ValueError) as exc:
            # The solve is already submitted; a single failed poll should not
            # abandon it. Only the exception type is logged because the
            # request URL (and thus the message) contains the API key.
            log.warning(
                f"2captcha poll failed for id={cid} ({type(exc).__name__}), will retry"
            )
            continue
        if jj.get("status") == 1:
            return jj["request"], cid
        # Both spellings are accepted as the "not ready yet" marker.
        if jj.get("request") in ("CAPCHA_NOT_READY", "CAPTCHA_NOT_READY"):
            continue
        raise RuntimeError(f"2captcha res.php error: {jj}")
    raise RuntimeError(f"2captcha timeout after {TWOCAPTCHA_MAX_POLL*TWOCAPTCHA_POLL_INTERVAL}s")


def report_bad_solve(api_key: str, cid: str) -> None:
    """Report wrong solve to 2captcha for credit refund.

    Best effort: a network failure is logged and otherwise ignored.
    """
    try:
        requests.get(
            TWOCAPTCHA_RES,
            params={"key": api_key, "action": "reportbad", "id": cid},
            timeout=TIMEOUT,
        )
    except requests.RequestException as exc:
        # Exception text is not logged: it may embed the URL with the API key.
        log.warning(f"2captcha reportbad failed for id={cid} ({type(exc).__name__})")


# ─────────────────────────────────────────────────────────────────────────────
# Session / pagination
class CaptchaWrong(Exception):
    """Raised by AnafSession.submit_initial when the captcha is rejected."""


@dataclass
class AnafSession:
    """HTTP session against the ANAF page, tracking the current ViewState."""

    api_key: str
    s: requests.Session = field(default_factory=requests.Session)
    viewstate: str = ""
    publication_date: date | None = None
    period_label: str = ""
    total_records: int = 0

    def __post_init__(self) -> None:
        self.s.headers.update({"User-Agent": USER_AGENT})

    def bootstrap(self) -> None:
        """GET initial page, extract ViewState + session cookies.

        Raises:
            RuntimeError: if the page contains no ViewState field.
        """
        log.info(f"GET {INDEX_PAGE}")
        r = self.s.get(INDEX_PAGE, timeout=TIMEOUT)
        r.raise_for_status()
        m = _VIEWSTATE_INPUT_RE.search(r.text)
        if not m:
            raise RuntimeError("No ViewState in initial page")
        self.viewstate = m.group(1)
        log.info(f"viewstate fetched ({len(self.viewstate)} chars)")

    def get_kaptcha(self) -> bytes:
        """Download the captcha image on this session and return its bytes.

        Raises:
            RuntimeError: if the response does not start with a JPEG signature.
        """
        r = self.s.get(
            KAPTCHA_URL, timeout=TIMEOUT, headers={"Referer": INDEX_PAGE}
        )
        r.raise_for_status()
        if not r.content.startswith(b"\xff\xd8\xff"):
            raise RuntimeError("kaptcha response not JPEG")
        return r.content

    def submit_initial(self, captcha_text: str, rows_per_page: int) -> str:
        """POST form with captcha → first page of data (HTML).

        Also updates ``viewstate``, ``publication_date``, ``period_label`` and
        (when the paginator text is present) ``total_records``.
        ``rows_per_page`` is only logged here; it is not sent in this request.

        Raises:
            CaptchaWrong: if the response carries a captcha-error marker.
        """
        log.info(f"POST {INDEX_FORM} (captcha={captcha_text!r}, rows={rows_per_page})")
        r = self.s.post(
            INDEX_FORM,
            data={
                "form": "form",
                "form:inputc": captcha_text,
                "form:searchdata": "",
                "form:submit": "",
                "form_SUBMIT": "1",
                "javax.faces.ViewState": self.viewstate,
            },
            headers={
                "Referer": INDEX_PAGE,
                "Origin": "https://www.anaf.ro",
                "User-Agent": USER_AGENT,
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                "Accept-Language": "ro,en;q=0.9",
                "Content-Type": "application/x-www-form-urlencoded",
            },
            timeout=TIMEOUT,
        )
        r.raise_for_status()

        # Refresh ViewState (JSF rotates it on each interaction)
        m = _VIEWSTATE_INPUT_RE.search(r.text)
        if m:
            self.viewstate = m.group(1)

        # Detect captcha-error case
        if "Cod de validare gresit" in r.text or "incorect" in r.text.lower()[:5000]:
            raise CaptchaWrong(r.text)

        # Extract publication date + total
        try:
            self.publication_date, self.period_label = parse_publication_date(r.text)
            log.info(f"publication_date={self.publication_date} period={self.period_label}")
        except RuntimeError:
            log.warning("could not parse publication_date — using today's quarter")
            today = date.today()
            self.publication_date = today
            self.period_label = f"T{(today.month - 1) // 3 + 1} {today.year}"

        m = re.search(r"\((\d+)\s+of\s+(\d+)\)", r.text)
        if m:
            self.total_records = int(m.group(2)) * 16  # pages * rows-per-page-default
            log.info(f"total_records estimate (from paginator): ~{self.total_records}")
        return r.text

    def fetch_page(self, first: int, rows_per_page: int) -> str:
        """AJAX PrimeFaces pagination POST. Returns partial response XML.

        Args:
            first: zero-based offset of the first row to fetch.
            rows_per_page: number of rows requested.
        """
        r = self.s.post(
            INDEX_FORM,
            data={
                "javax.faces.partial.ajax": "true",
                "javax.faces.source": "form:dataTable",
                "javax.faces.partial.execute": "form:dataTable",
                "javax.faces.partial.render": "form:dataTable",
                "form:dataTable": "form:dataTable",
                "form:dataTable_pagination": "true",
                "form:dataTable_first": str(first),
                "form:dataTable_rows": str(rows_per_page),
                "form:dataTable_encodeFeature": "true",
                "form": "form",
                "form:inputc": "",
                "form:searchdata": "",
                "javax.faces.ViewState": self.viewstate,
            },
            headers={
                "Referer": INDEX_FORM,
                "Origin": "https://www.anaf.ro",
                "User-Agent": USER_AGENT,
                "Accept": "application/xml,text/xml,*/*;q=0.01",
                "Accept-Language": "ro,en;q=0.9",
                "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
                "X-Requested-With": "XMLHttpRequest",
                "Faces-Request": "partial/ajax",
            },
            timeout=TIMEOUT,
        )
        r.raise_for_status()
        # Update ViewState from partial response; keep the current one if the
        # response carries no ViewState update.
        m = _VIEWSTATE_UPDATE_RE.search(r.text)
        if m:
            self.viewstate = m.group(1)
        return r.text


# ─────────────────────────────────────────────────────────────────────────────
# Row parsing
def parse_rows(html_or_partial: str) -> list[dict[str, Any]]:
    """Extract debtor rows from initial HTML or AJAX partial response.

    Row layout (24 cells observed via probe 2026-05-12):
      0: nr_crt
      1: name (denumire debitor)
      2: CIF (cui)
      3: total bugetul de stat
      4: total asigurări sociale
      5: total șomaj
      6: total sănătate
      7-10: state {principal, accesorii, necontestate, contestate}
      11-14: social {principal, accesorii, necontestate, contestate}
      15-18: unemployment {principal, accesorii, necontestate, contestate}
      19-22: health {principal, accesorii, necontestate, contestate}
      23: observation/status (e.g. "Faliment")

    Rows with fewer than 24 cells are skipped. Numeric cells that cannot be
    parsed (including empty cells) become ``0.0``.
    """
    # Local import: keeps this block independent of the file-level imports.
    from html import unescape

    def _txt(s: str) -> str:
        # Drop tags first, then decode entities (e.g. "&amp;" in a company
        # name), then collapse whitespace.
        return _WS_RE.sub(" ", unescape(_TAG_RE.sub("", s))).strip()

    def _num(s: str) -> float:
        # Romanian number format: "." groups thousands, "," is the decimal
        # mark. Spaces are removed too so a space-grouped amount is not
        # silently turned into 0.0.
        t = _txt(s).replace(" ", "").replace(".", "").replace(",", ".")
        try:
            return float(t)
        except ValueError:
            return 0.0

    rows: list[dict[str, Any]] = []
    # Match each <tr data-ri="N">…</tr>
    for tr_m in _ROW_RE.finditer(html_or_partial):
        cells = _CELL_RE.findall(tr_m.group(2))
        if len(cells) < 24:
            continue
        row: dict[str, Any] = {
            "nr_crt": _txt(cells[0]),
            "name": _txt(cells[1]),
            "cui": _txt(cells[2]),
        }
        # Cells 3..22 are the twenty numeric columns, in _NUMERIC_FIELDS order.
        for key, cell in zip(_NUMERIC_FIELDS, cells[3:23]):
            row[key] = _num(cell)
        row["observation"] = _txt(cells[23])
        rows.append(row)
    return rows


# ─────────────────────────────────────────────────────────────────────────────
# DB UPSERT
def _normalize_cui(raw: str) -> str:
    """Return the CUI upper-cased, without spaces and without one leading 'RO'."""
    cui = raw.replace(" ", "").upper()
    # Strip the literal prefix only; str.lstrip("RO") would strip any run of
    # leading 'R'/'O' characters.
    if cui.startswith("RO"):
        cui = cui[2:]
    return cui


def upsert_rows(
    conn,
    rows: list[dict[str, Any]],
    publication_date: date,
    period_label: str,
    debtor_category: str = "persoane_juridice",
) -> int:
    """Insert or update one snapshot of debtor rows in ``anaf.datornici``.

    Rows whose normalized CUI is empty are skipped. If the same CUI occurs
    more than once, the last occurrence wins: a single
    ``INSERT … ON CONFLICT DO UPDATE`` statement may not touch the same
    target row twice.

    Args:
        conn: open psycopg2 connection; committed on success, rolled back
            (and the exception re-raised) on failure.
        rows: dicts as produced by ``parse_rows``.
        publication_date: snapshot date; part of the conflict key.
        period_label: quarter label stored with each row.
        debtor_category: category stored with each row.

    Returns:
        Number of rows sent to the database.
    """
    if not rows:
        return 0
    source_url = INDEX_PAGE

    by_cui: dict[str, tuple] = {}
    for r in rows:
        cui = _normalize_cui(r["cui"])
        if not cui:
            continue
        by_cui[cui] = (
            cui,
            r["name"],
            None,  # judet not provided by ANAF /restante/
            publication_date,
            period_label,
            debtor_category,
            # debt_total = sum of 4 category totals
            r["budget_state_total"] + r["budget_social_total"]
            + r["budget_unemployment_total"] + r["budget_health_total"],
            # principal across categories
            r["state_principal"] + r["social_principal"]
            + r["unemp_principal"] + r["health_principal"],
            # penalty across categories
            r["state_penalty"] + r["social_penalty"]
            + r["unemp_penalty"] + r["health_penalty"],
            # contestate across categories
            r["state_contestate"] + r["social_contestate"]
            + r["unemp_contestate"] + r["health_contestate"],
            # per-budget detail (12 columns)
            r["state_principal"], r["state_penalty"], r["state_contestate"],
            r["social_principal"], r["social_penalty"], r["social_contestate"],
            r["unemp_principal"], r["unemp_penalty"], r["unemp_contestate"],
            r["health_principal"], r["health_penalty"], r["health_contestate"],
            source_url,
        )
    payload = list(by_cui.values())
    if not payload:
        return 0

    sql = """
        INSERT INTO anaf.datornici (
            cui, name, judet, publication_date, period_label, debtor_category,
            debt_total, debt_principal, debt_penalty, debt_contested,
            budget_state_principal, budget_state_penalty, budget_state_contested,
            budget_social_principal, budget_social_penalty, budget_social_contested,
            budget_unemployment_principal, budget_unemployment_penalty, budget_unemployment_contested,
            budget_health_principal, budget_health_penalty, budget_health_contested,
            source_url
        ) VALUES %s
        ON CONFLICT (cui, publication_date) DO UPDATE SET
            name = EXCLUDED.name,
            debt_total = EXCLUDED.debt_total,
            debt_principal = EXCLUDED.debt_principal,
            debt_penalty = EXCLUDED.debt_penalty,
            debt_contested = EXCLUDED.debt_contested,
            budget_state_principal = EXCLUDED.budget_state_principal,
            budget_state_penalty = EXCLUDED.budget_state_penalty,
            budget_state_contested = EXCLUDED.budget_state_contested,
            budget_social_principal = EXCLUDED.budget_social_principal,
            budget_social_penalty = EXCLUDED.budget_social_penalty,
            budget_social_contested = EXCLUDED.budget_social_contested,
            budget_unemployment_principal = EXCLUDED.budget_unemployment_principal,
            budget_unemployment_penalty = EXCLUDED.budget_unemployment_penalty,
            budget_unemployment_contested = EXCLUDED.budget_unemployment_contested,
            budget_health_principal = EXCLUDED.budget_health_principal,
            budget_health_penalty = EXCLUDED.budget_health_penalty,
            budget_health_contested = EXCLUDED.budget_health_contested,
            fetched_at = now()
    """
    try:
        with conn.cursor() as cur:
            psycopg2.extras.execute_values(cur, sql, payload, page_size=500)
        conn.commit()
    except Exception:
        # Leave the connection in a clean state for the caller.
        conn.rollback()
        raise
    return len(payload)


# ─────────────────────────────────────────────────────────────────────────────
# Orchestration
def _solve_and_submit(sess: AnafSession, api_key: str, rows_per_page: int) -> str:
    """Solve the captcha (with retries on rejection) and return the first page HTML."""
    for attempt in range(1, TWOCAPTCHA_MAX_ATTEMPTS + 1):
        # Each attempt starts from a fresh page + captcha image.
        sess.bootstrap()
        image = sess.get_kaptcha()
        token, cid = solve_kaptcha(api_key, image, attempt=attempt)
        try:
            initial_html = sess.submit_initial(token, rows_per_page)
        except CaptchaWrong:
            log.warning(f"captcha rejected by ANAF on attempt {attempt}, retrying")
            if TWOCAPTCHA_REPORT and cid:
                report_bad_solve(api_key, cid)
            continue
        log.info(f"captcha accepted on attempt {attempt}")
        return initial_html
    raise RuntimeError("captcha solve failed after retries")


def _collect_rows(
    sess: AnafSession, initial_html: str, rows_per_page: int
) -> tuple[list[dict[str, Any]], int]:
    """Parse the first page, then paginate until a page yields no rows.

    Returns ``(rows, errors)``; ``errors`` is 1 if pagination stopped on an
    exception (rows collected up to that point are still returned), else 0.
    """
    all_rows = parse_rows(initial_html)
    log.info(f"page 1: {len(all_rows)} rows")

    # Discover total via paginator markup. Default page count is 16/page;
    # if we set rows_per_page>16, total_records estimate may be wrong.
    # Just iterate until parse_rows returns empty.
    first = len(all_rows)
    page_num = 2
    errors = 0
    while True:
        try:
            partial = sess.fetch_page(first=first, rows_per_page=rows_per_page)
            new_rows = parse_rows(partial)
        except Exception as e:
            # Keep what was collected so far; the caller reports the error.
            log.error(f"pagination error at page {page_num}: {e}")
            errors += 1
            break
        if not new_rows:
            log.info(f"pagination exhausted at first={first}")
            break
        all_rows.extend(new_rows)
        log.info(f"page {page_num}: {len(new_rows)} rows (running total: {len(all_rows)})")
        first += len(new_rows)
        page_num += 1
    return all_rows, errors


def run(*, dry_run: bool, rows_per_page: int) -> dict[str, int]:
    """Run one scrape: solve captcha, collect all pages, upsert into the DB.

    Args:
        dry_run: if true, only fetch the initial page to validate access;
            no captcha is solved and the database is not touched.
        rows_per_page: page size requested for AJAX pagination.

    Returns:
        ``{"datornici_inserted": <rows upserted>, "errors": <pagination errors>}``.

    Raises:
        RuntimeError: on missing configuration, repeated captcha rejection, or
            a missing publication date.
    """
    api_key = os.environ.get("TWOCAPTCHA_KEY", "")
    if not api_key and not dry_run:
        raise RuntimeError("Missing TWOCAPTCHA_KEY env var — see HANDOFF doc")

    if dry_run:
        log.info("DRY_RUN=1 — connecting only to validate config, no captcha solve")
        sess = AnafSession(api_key="")
        sess.bootstrap()
        log.info(f"bootstrap OK, viewstate captured ({len(sess.viewstate)} chars)")
        log.info(f"would solve 1 captcha (~$0.001 worst case) then paginate {rows_per_page} rows/page")
        return {"datornici_inserted": 0, "errors": 0}

    db_url = os.environ.get("DATABASE_URL", "")
    if not db_url:
        raise RuntimeError("Missing DATABASE_URL env var")
    # Strip the Prisma-style schema parameter before connecting.
    db_url = re.sub(r"[?&]schema=[^&]*", "", db_url)
    db_url = re.sub(r"\?$", "", db_url)

    # Connect before solving the captcha so a bad DATABASE_URL fails first.
    conn = psycopg2.connect(db_url)
    try:
        conn.autocommit = False
        sess = AnafSession(api_key=api_key)

        # Captcha solve with retries (wrong-text bounce)
        initial_html = _solve_and_submit(sess, api_key, rows_per_page)

        all_rows, errors = _collect_rows(sess, initial_html, rows_per_page)
        log.info(f"total rows collected: {len(all_rows)}")

        if not sess.publication_date:
            raise RuntimeError("No publication_date captured")

        inserted = upsert_rows(
            conn,
            all_rows,
            publication_date=sess.publication_date,
            period_label=sess.period_label,
        )
        log.info(f"upserted {inserted} rows into anaf.datornici for {sess.period_label}")
    finally:
        # Close on every path, including failures above.
        conn.close()
    return {"datornici_inserted": inserted, "errors": errors}


def main():
    """Entry point: read DRY_RUN / ROWS_PER_PAGE, run the scrape, exit 1 on failure."""
    dry_run = os.environ.get("DRY_RUN", "0") == "1"
    rows_per_page = int(os.environ.get("ROWS_PER_PAGE", str(DEFAULT_ROWS_PER_PAGE)))
    log.info(f"=== ANAF datornici scrape: dry_run={dry_run} rows_per_page={rows_per_page} ===")
    try:
        result = run(dry_run=dry_run, rows_per_page=rows_per_page)
    except Exception as e:
        log.error(f"FATAL: {e}", exc_info=True)
        sys.exit(1)
    log.info(f"DONE {result}")


if __name__ == "__main__":
    main()