initial: split from gov-agreg — vreau.digital standalone platform

Moved from gov-agreg/src/pages/achizitii/* to root (drop prefix).
- 22 pages migrated, 127 files total
- All internal links: /achizitii/X → /X (176 occurrences fixed)
- AchizitiiLayout subnav rewritten: /X paths, top-right link to vreaudigital.ro hub
- BaseLayout new (vreau.digital branding, OG tags, site URL)
- astro.config.mjs: site https://vreau.digital, server output (was static)
- docker-compose: port 5096 (vreaudigital is 5095), container vreau-digital
- deploy.sh: paths /opt/vreau-digital, log /var/log/vreau-digital-deploy.log

Backend shared with gov-agreg:
- PostgreSQL satra (same schemas: seap, firms, anaf, anre, ...)
- Photon, Martin tiles
- Infisical /vreaudigital path (DATABASE_URL etc. shared)

build: PASS (npx astro check 0 errors, npm run build 5s vite + 10s server)
This commit is contained in:
Claude VM
2026-05-13 00:10:32 +03:00
commit a6c03a091e
352 changed files with 75295 additions and 0 deletions
+199
View File
@@ -0,0 +1,199 @@
"""
Window-aware pagination + auto-split.
Constraints discovered empirically:
- PageSize is server-fixed at 100 items per response.
- Server caps results at PageTotal == 1000 — when reached, the window
is truncated and we lose data. Auto-split window in halves until PageTotal < 1000
(or until the window is down to one hour, below which it is not split further).
- Server silently returns PageTotal=0 for windows that are too wide
(e.g. >7 days for SU_CaNotices) — also triggers auto-split.
Parsed item dicts are passed to the caller's `on_item` callback as they arrive;
the caller is responsible for batching DB writes. Cursor advancement (last
successful publication_date) is tracked externally in seap.wsp_sync_state.
"""
from __future__ import annotations

import importlib
import logging
from collections.abc import Callable
from dataclasses import dataclass, field
from datetime import datetime, timedelta
from typing import Any, Iterator

from lxml import etree

from .client import WspClient
from .operations import WspOp
# Module-level logger, named after this module via __name__.
log = logging.getLogger(__name__)
@dataclass
class WindowResult:
    """Statistics collected while fetching one (start, end) window.

    When a window is split, the halves are stored in ``sub_windows`` and
    ``all_items_imported`` reports the total over the whole tree.
    """

    op_name: str
    window_start: datetime
    window_end: datetime
    # Items counted directly on this window (sub-windows keep their own count).
    items_imported: int = 0
    # Number of page requests issued for this window.
    pages_fetched: int = 0
    # PageTotal returned on page 1.
    page_total_first: int = 0
    # Child results, populated only if the window was split.
    sub_windows: list['WindowResult'] = field(default_factory=list)
    # Human-readable error strings accumulated while fetching/parsing.
    errors: list[str] = field(default_factory=list)
    # Set if window was outside cursor range.
    skipped: bool = False

    @property
    def all_items_imported(self) -> int:
        """Return this window's item count plus that of every nested sub-window."""
        total = self.items_imported
        for child in self.sub_windows:
            total += child.all_items_imported
        return total
def _load_parser(op: WspOp):
    """Import and return the parser module configured on ``op``.

    Returns ``None`` when ``op.parser_module`` is empty, or when importing it
    raises ``ModuleNotFoundError`` (a warning is logged in that case).
    """
    module_path = op.parser_module
    if module_path:
        try:
            return importlib.import_module(module_path)
        except ModuleNotFoundError:
            log.warning('Parser module %s not found — items will be skipped', module_path)
    return None
def _iter_items_xml(items_xml: bytes, op: WspOp) -> Iterator[Any]:
    """Yield the item elements found in a serialized ``<a:Items>`` payload.

    Args:
        items_xml: Raw XML of the items container; empty/``None`` yields nothing.
        op: Operation descriptor. When ``op.item_xpath`` is set, every element
            in the tree whose local name equals the part of ``item_xpath``
            after the last ``}`` is yielded. Otherwise the direct element
            children of the root are yielded.

    Yields:
        lxml elements in document order.

    Malformed XML is logged as an error and yields nothing.
    """
    if not items_xml:
        return
    try:
        items_el = etree.fromstring(items_xml)
    except etree.XMLSyntaxError as e:
        log.error('Cannot parse items XML for %s: %s', op.name, e)
        return

    if not op.item_xpath:
        # Fallback: every direct child element of the container. Comments and
        # processing instructions (whose .tag is not a string) are not items.
        for child in items_el:
            if isinstance(child.tag, str):
                yield child
        return

    # Loop-invariant: the local name we are looking for.
    wanted = op.item_xpath.rsplit('}', 1)[-1]
    for el in items_el.iter():
        # iter() also visits comments / processing instructions; their .tag is
        # a factory function, which etree.QName() rejects — skip them instead
        # of letting the generator blow up mid-window.
        if not isinstance(el.tag, str):
            continue
        if etree.QName(el.tag).localname == wanted:
            yield el
def _import_items(items_xml, op: WspOp, parser, on_item, result: WindowResult,
                  page: int) -> None:
    """Parse every item of one page and hand it to ``on_item``, updating ``result``.

    Does nothing when there is no parser or no payload. A failure on a single
    item is logged and recorded in ``result.errors``; the remaining items of
    the page are still processed.
    """
    if parser is None or not items_xml:
        return
    # Page 1 messages carry no page label; later pages do.
    where = '' if page == 1 else f' page {page}'
    err_tag = 'parse' if page == 1 else f'parse p{page}'
    for el in _iter_items_xml(items_xml, op):
        try:
            parsed = parser.parse(el)
            if parsed and on_item:
                on_item(parsed, el)
                result.items_imported += 1
        except Exception as e:  # deliberate best-effort: one bad item must not stop the page
            log.exception('Parse error in %s%s: %s', op.name, where, e)
            result.errors.append(f'{err_tag}: {e}')


def fetch_window(
    client: WspClient,
    op: WspOp,
    window_start: datetime,
    window_end: datetime,
    extra_fields: dict | None = None,
    on_item: Callable[[dict, Any], object] | None = None,
) -> WindowResult:
    """Fetch all pages for a single (start, end) window.

    Page 1 is requested first. If its ``page_total`` reaches ``op.items_cap``
    and the window is wider than one hour, the window is handed to
    ``_split_window`` and that (parent) result is returned. If the cap is hit
    on a window of one hour or less, the window is processed as-is and a
    warning is logged because the result may be truncated.

    Args:
        client: Client used to issue ``client.call(op, fields)`` requests.
        op: Operation descriptor (name, date field names, items cap, parser).
        window_start: Start of the window; sent under ``op.date_start_field``
            when that field name is set.
        window_end: End of the window; sent under ``op.date_end_field`` when
            that field name is set.
        extra_fields: Additional request fields merged into every page request.
        on_item: Called as ``on_item(parsed, raw_xml_element)`` for each item
            the parser returns a truthy value for. Items are only counted in
            ``items_imported`` when this callback is provided.

    Returns:
        WindowResult with stats; ``sub_windows`` is populated if a split
        occurred. Request failures (status other than ``'Success'``) and
        per-item parse failures are reported through ``errors``.
    """
    extra_fields = extra_fields or {}
    parser = _load_parser(op)
    result = WindowResult(
        op_name=op.name,
        window_start=window_start,
        window_end=window_end,
    )

    # Fields shared by every page request of this window.
    fields_base = dict(extra_fields)
    if op.date_start_field:
        fields_base[op.date_start_field] = window_start.strftime('%Y-%m-%dT%H:%M:%S')
    if op.date_end_field:
        fields_base[op.date_end_field] = window_end.strftime('%Y-%m-%dT%H:%M:%S')

    # Page 1 also tells us the total and whether the server cap was hit.
    first = client.call(op, {**fields_base, 'PageIndex': 1})
    result.pages_fetched = 1
    result.page_total_first = first.page_total
    if first.status != 'Success':
        result.errors.append(f'page1: {first.status}: {first.description}')
        return result

    if first.page_total >= op.items_cap:
        if (window_end - window_start) > timedelta(hours=1):
            log.info(' %s: PageTotal=%d hit cap (>= %d) on %s .. %s — splitting',
                     op.name, first.page_total, op.items_cap, window_start, window_end)
            split = _split_window(client, op, window_start, window_end, extra_fields, on_item)
            # Keep the probing request visible in the stats of the returned parent.
            split.pages_fetched = result.pages_fetched
            split.page_total_first = result.page_total_first
            return split
        # Too narrow to split: carry on, but make the probable data loss visible.
        log.warning(' %s: PageTotal=%d hit cap (>= %d) on %s .. %s but window is <= 1h'
                    ' — results may be truncated',
                    op.name, first.page_total, op.items_cap, window_start, window_end)

    _import_items(first.items_xml, op, parser, on_item, result, 1)

    # Remaining pages; stop at the first failed request.
    for page in range(2, first.num_pages + 1):
        resp = client.call(op, {**fields_base, 'PageIndex': page})
        result.pages_fetched += 1
        if resp.status != 'Success':
            result.errors.append(f'page{page}: {resp.status}: {resp.description}')
            break
        _import_items(resp.items_xml, op, parser, on_item, result, page)
    return result
def _split_window(client: WspClient, op: WspOp,
                  start: datetime, end: datetime,
                  extra_fields: dict, on_item) -> WindowResult:
    """Fetch ``[start, end]`` as two halves and collect them under one parent.

    Each half goes through ``fetch_window``, which may split it again. A
    window of one hour or less is not halved: it is fetched as a single
    sub-window after logging a warning.

    Returns:
        A parent WindowResult whose ``sub_windows`` hold the fetched pieces.
    """
    parent = WindowResult(op_name=op.name, window_start=start, window_end=end)
    width = end - start

    if width > timedelta(hours=1):
        midpoint = start + width / 2
        pieces = [(start, midpoint), (midpoint, end)]
    else:
        # Cannot split further — fetch the window whole and tolerate the cap.
        log.warning('Cannot split below 1h for %s [%s, %s] — accepting cap',
                    op.name, start, end)
        pieces = [(start, end)]

    for piece_start, piece_end in pieces:
        parent.sub_windows.append(
            fetch_window(client, op, piece_start, piece_end, extra_fields, on_item)
        )
    return parent
def split_into_windows(start: datetime, end: datetime,
                       max_window_days: int) -> list[tuple[datetime, datetime]]:
    """Slice [start, end) into consecutive chunks of at most max_window_days each.

    Args:
        start: Beginning of the range.
        end: End of the range; the last chunk is clipped to it.
        max_window_days: Maximum chunk width in days; must be positive.

    Returns:
        List of (window_start, window_end) tuples in chronological order, each
        chunk starting where the previous one ended. Empty when start >= end.

    Raises:
        ValueError: If the range is non-empty and max_window_days <= 0 (a
            non-positive step would never reach ``end``).
    """
    if start >= end:
        return []
    if max_window_days <= 0:
        raise ValueError(f'max_window_days must be positive, got {max_window_days!r}')

    step = timedelta(days=max_window_days)
    windows: list[tuple[datetime, datetime]] = []
    cur = start
    while cur < end:
        nxt = min(cur + step, end)
        windows.append((cur, nxt))
        cur = nxt
    return windows