initial: split from gov-agreg — vreau.digital standalone platform
Moved from gov-agreg/src/pages/achizitii/* to root (drop prefix). - 22 pages migrated, 127 files total - All internal links: /achizitii/X → /X (176 occurrences fixed) - AchizitiiLayout subnav rewritten: /X paths, top-right link to vreaudigital.ro hub - BaseLayout new (vreau.digital branding, OG tags, site URL) - astro.config.mjs: site https://vreau.digital, server output (was static) - docker-compose: port 5096 (vreaudigital is 5095), container vreau-digital - deploy.sh: paths /opt/vreau-digital, log /var/log/vreau-digital-deploy.log Backend shared with gov-agreg: - PostgreSQL satra (same schemas: seap, firms, anaf, anre, ...) - Photon, Martin tiles - Infisical /vreaudigital path (DATABASE_URL etc. shared) build: PASS (npx astro check 0 errors, npm run build 5s vite + 10s server)
This commit is contained in:
@@ -0,0 +1,212 @@
|
||||
"""
|
||||
ANAF /restante/ probe — discovers the actual mechanism behind the page.

Steps:
  1. GET /restante/ → extract javax.faces.ViewState, session cookie
  2. GET kaptcha.jpg (same session)
  3. POST kaptcha image to 2captcha → get text solution
  4. POST /restante/index.xhtml with captcha + form fields → get response
  5. Print: response HTML structure, table shape, pagination markers,
     quarter selector evidence

Used ONCE to understand the page before committing to a full scraper rewrite.
Spends ~$0.001 of 2captcha credit.
|
||||
"""
|
||||
|
||||
import base64
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
|
||||
import requests
|
||||
|
||||
# Root URL of the ANAF /restante/ application probed by this script; the form
# endpoint and the captcha image both live directly underneath it.
BASE = "https://www.anaf.ro/restante"
INDEX_URL = BASE + "/index.xhtml"
KAPTCHA_URL = BASE + "/kaptcha.jpg"

# Browser-like User-Agent string sent with the probe's requests.
USER_AGENT = " ".join(
    (
        "Mozilla/5.0 (X11; Linux x86_64)",
        "AppleWebKit/537.36",
        "(KHTML, like Gecko)",
        "Chrome/120.0",
        "Safari/537.36",
    )
)

# Value passed as ``timeout=`` to every HTTP call in this script.
TIMEOUT = 30

# 2captcha endpoints: in.php receives the image, res.php is polled for the
# solved text.
TWOCAPTCHA_IN = "https://2captcha.com/in.php"
TWOCAPTCHA_RES = "https://2captcha.com/res.php"
|
||||
|
||||
|
||||
def log(msg: str) -> None:
    """Write one progress line to stderr, tagged with the ``[probe]`` prefix.

    The stream is flushed after every line so output shows up immediately.
    """
    line = f"[probe] {msg}"
    print(line, file=sys.stderr, flush=True)
|
||||
|
||||
|
||||
def get_initial(session: requests.Session) -> tuple[str, str]:
    """Fetch the /restante/ page and extract its JSF ViewState.

    Args:
        session: Session used for the GET; any cookies the server sets are
            kept on it for the follow-up requests.

    Returns:
        ``(html, viewstate)`` — the raw page text and the HTML-unescaped
        value of the hidden ``javax.faces.ViewState`` input.

    Raises:
        requests.HTTPError: If the response carries an error status.
        RuntimeError: If the page contains no ViewState input.
    """
    # Imported locally so the module-level import block stays untouched.
    from html import unescape

    url = f"{BASE}/"
    log(f"GET {url}")
    r = session.get(url, timeout=TIMEOUT)
    r.raise_for_status()
    page = r.text
    m = re.search(r'name="javax\.faces\.ViewState"[^>]*value="([^"]+)"', page)
    if not m:
        raise RuntimeError("No ViewState found")
    # Attribute values are entity-escaped in markup. The form has to be posted
    # back with the decoded value, otherwise entities such as &#43; or &amp;
    # would be sent literally and the server would see a corrupted state.
    viewstate = unescape(m.group(1))
    log(f"viewstate={viewstate[:24]}…")
    log(f"cookies after GET: {list(session.cookies.keys())}")
    return page, viewstate
|
||||
|
||||
|
||||
def get_kaptcha(session: requests.Session) -> bytes:
    """Download the captcha image through the given session.

    Returns the raw response body. When the payload does not begin with the
    JPEG magic bytes a warning is logged, but the bytes are returned anyway.

    Raises:
        requests.HTTPError: If the response carries an error status.
    """
    log(f"GET {KAPTCHA_URL}")
    response = session.get(
        KAPTCHA_URL,
        timeout=TIMEOUT,
        headers={"Referer": f"{BASE}/"},
    )
    response.raise_for_status()

    payload = response.content
    looks_like_jpeg = payload[:3] == b"\xff\xd8\xff"
    if not looks_like_jpeg:
        log(f"WARN: kaptcha response not JPEG (first bytes: {payload[:10]!r})")
    log(f"kaptcha bytes: {len(payload)} (jpg)")
    return payload
|
||||
|
||||
|
||||
def solve_kaptcha(
    api_key: str,
    image: bytes,
    *,
    poll_interval: float = 5.0,
    max_polls: int = 30,
) -> str:
    """Submit a captcha image to 2captcha and poll until it is solved.

    Args:
        api_key: 2captcha API key.
        image: Raw image bytes; sent base64-encoded.
        poll_interval: Seconds to sleep before each poll of res.php.
        max_polls: Maximum number of polls before giving up. The defaults
            give 30 * 5s = 150s of waiting (plus request time).

    Returns:
        The solved captcha text as reported by 2captcha.

    Raises:
        requests.HTTPError: If either endpoint answers with an error status.
        RuntimeError: If 2captcha reports an error, or no solution arrives
            within ``max_polls`` polls.
    """
    b64 = base64.b64encode(image).decode()
    log("POST 2captcha in.php with image…")
    r = requests.post(
        TWOCAPTCHA_IN,
        data={
            "key": api_key,
            "method": "base64",
            "body": b64,
            "json": "1",
            # Hint to 2captcha workers: this is short alphanumeric (kaptcha
            # default is 5-6 chars, mixed letter+digit, anti-aliased).
            "numeric": "0",  # 0 = no restriction on character class
            "min_len": "4",
            "max_len": "8",
            "language": "2",  # 2 = Latin alphabet (per the 2captcha API docs)
            "regsense": "1",  # case-sensitive ON
        },
        timeout=TIMEOUT,
    )
    r.raise_for_status()
    j = r.json()
    if j.get("status") != 1:
        raise RuntimeError(f"2captcha in.php error: {j}")
    cid = j["request"]
    log(f"2captcha job id={cid}, polling…")

    for attempt in range(1, max_polls + 1):
        time.sleep(poll_interval)
        r = requests.get(
            TWOCAPTCHA_RES,
            params={"key": api_key, "action": "get", "id": cid, "json": "1"},
            timeout=TIMEOUT,
        )
        # Same status check as the submit call, so an HTTP-level failure is
        # reported as such instead of as a JSON decoding error.
        r.raise_for_status()
        j = r.json()
        if j.get("status") == 1:
            token = j["request"]
            log(f"2captcha solved: {token!r}")
            return token
        # Both spellings are accepted for the "not ready yet" status.
        if j.get("request") in ("CAPCHA_NOT_READY", "CAPTCHA_NOT_READY"):
            log(f" poll {attempt}: not ready")
            continue
        raise RuntimeError(f"2captcha res.php error: {j}")
    raise RuntimeError(f"2captcha timeout {max_polls * poll_interval:g}s")
|
||||
|
||||
|
||||
def post_search(session: requests.Session, viewstate: str, captcha: str, search: str = "") -> requests.Response:
    """Submit the search form and return the raw response.

    An empty ``search`` is sent in the hope that the server lists everything
    (best case: a bulk result). No status check is done here; the caller gets
    the response whatever its status code.
    """
    log(f"POST {INDEX_URL} captcha={captcha!r} search={search!r}")

    form_fields = {
        "form": "form",
        "form:inputc": captcha,
        "form:searchdata": search,
        "form:submit": "",  # the submit button itself
        "form_SUBMIT": "1",
        "javax.faces.ViewState": viewstate,
    }
    request_headers = {
        "Referer": f"{BASE}/",
        "Origin": "https://www.anaf.ro",
        "User-Agent": USER_AGENT,
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "ro,en;q=0.9",
        "Content-Type": "application/x-www-form-urlencoded",
    }

    response = session.post(
        INDEX_URL,
        data=form_fields,
        headers=request_headers,
        timeout=TIMEOUT,
    )
    log(f"POST status={response.status_code} bytes={len(response.text)} content-type={response.headers.get('content-type')}")
    return response
|
||||
|
||||
|
||||
def analyze_response(html: str) -> None:
    """Log structural signals found in the search response markup.

    Nothing is returned; every finding goes through ``log``. The checks are
    plain regex/substring heuristics over the raw text: error wording, table
    and row counts, datatable/paginator markers, CUI-like numbers, and
    phrases hinting at totals, quarters/publication dates and export
    options, followed by a preview of the visible body text.

    Args:
        html: Response body to inspect.
    """
    log("=" * 70)
    log("RESPONSE ANALYSIS:")

    # Captcha / validation error wording, shown with up to 40 characters of
    # leading and 80 of trailing context. The bounds are "up to" so that a
    # phrase close to the start or end of a line is still reported.
    for m in re.finditer(
        r".{0,40}(?:incorect|invalid|gre[sșş]it).{0,80}", html, re.IGNORECASE
    ):
        log(f" ERR phrase: {m.group(0)!r}")

    # Table presence?
    tables = re.findall(r"<table[^>]*>", html, re.IGNORECASE)
    log(f" <table> count: {len(tables)}")

    # Row count in datatable?
    rows = re.findall(r"<tr[^>]*>", html, re.IGNORECASE)
    log(f" <tr> count: {len(rows)}")

    # PrimeFaces datatable markers?
    if "ui-datatable" in html:
        log(" PrimeFaces DataTable detected")
        # Rows-per-page hint?
        m = re.search(r'rows="?(\d+)"?', html)
        if m:
            log(f" rows attr: {m.group(1)}")

    # Pagination evidence? (the lowercase test also covers "ui-paginator")
    if "paginator" in html.lower():
        log(" Pagination control present")

    # CUI/CIF column?
    cuis = re.findall(r"\b\d{6,10}\b", html)
    log(f" numeric strings 6-10 digits: {len(cuis)} (possible CUIs)")
    if cuis:
        log(f" samples: {cuis[:10]}")

    # Total count somewhere?
    for m in re.finditer(
        r"(?:total|înregistrări|inregistrari|rezultate)[^<>]{0,60}", html, re.IGNORECASE
    ):
        log(f" total phrase: {m.group(0)!r}")

    # Quarter / publication date references?
    for m in re.finditer(r"(?:trim|trimestru|publicat)[^<>]{0,80}", html, re.IGNORECASE):
        log(f" date phrase: {m.group(0)!r}")

    # Export buttons (CSV/XLSX)?
    for m in re.finditer(r"(?:export|descarc|csv|xls)[^<>]{0,40}", html, re.IGNORECASE):
        log(f" export phrase: {m.group(0)!r}")

    # First 500 chars of the body, with tags stripped and whitespace collapsed.
    body = re.search(r"<body[^>]*>(.*?)</body>", html, re.DOTALL | re.IGNORECASE)
    if body:
        text = re.sub(r"<[^>]+>", " ", body.group(1))
        text = re.sub(r"\s+", " ", text).strip()
        log(f" body text preview: {text[:500]!r}")
|
||||
|
||||
|
||||
def main() -> None:
    """Run the probe end to end.

    Reads the 2captcha key from the ``TWOCAPTCHA_KEY`` environment variable
    (exits with status 1 when it is missing), fetches the page and captcha,
    has the captcha solved, posts the form with an empty search, and logs an
    analysis of the response. The captcha image and the response HTML are
    also written under /tmp for manual inspection.
    """
    api_key = os.environ.get("TWOCAPTCHA_KEY")
    if not api_key:
        print("Missing TWOCAPTCHA_KEY env var", file=sys.stderr)
        sys.exit(1)

    # The context manager closes the session's connections on the way out.
    with requests.Session() as s:
        s.headers.update({"User-Agent": USER_AGENT})

        # Only the ViewState is needed here; the page text is discarded.
        _, viewstate = get_initial(s)
        image = get_kaptcha(s)

        # Save image locally for debugging
        with open("/tmp/probe_kaptcha.jpg", "wb") as f:
            f.write(image)
        log("kaptcha image saved /tmp/probe_kaptcha.jpg")

        captcha_text = solve_kaptcha(api_key, image)

        r = post_search(s, viewstate, captcha_text, search="")
        # Explicit UTF-8: the page can contain Romanian diacritics, which the
        # locale's default encoding may not be able to represent.
        with open("/tmp/probe_response.html", "w", encoding="utf-8") as f:
            f.write(r.text)
        log("response saved /tmp/probe_response.html")

        analyze_response(r.text)
|
||||
|
||||
|
||||
# Run the probe only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user