Files
vreau-digital/services/seap-scraper/wsp/parsers/_base.py
T
Claude VM a6c03a091e initial: split from gov-agreg — vreau.digital standalone platform
Moved from gov-agreg/src/pages/achizitii/* to root (drop prefix).
- 22 pages migrated, 127 files total
- All internal links: /achizitii/X → /X (176 occurrences fixed)
- AchizitiiLayout subnav rewritten: /X paths, top-right link to vreaudigital.ro hub
- BaseLayout new (vreau.digital branding, OG tags, site URL)
- astro.config.mjs: site https://vreau.digital, server output (was static)
- docker-compose: port 5096 (vreaudigital is 5095), container vreau-digital
- deploy.sh: paths /opt/vreau-digital, log /var/log/vreau-digital-deploy.log

Backend shared with gov-agreg:
- PostgreSQL satra (same schemas: seap, firms, anaf, anre, ...)
- Photon, Martin tiles
- Infisical /vreaudigital path (DATABASE_URL etc. shared)

build: PASS (npx astro check 0 errors, npm run build 5s vite + 10s server)
2026-05-13 00:10:32 +03:00

215 lines
9.1 KiB
Python

"""
Base notice parser — shared logic for CN, PI, RFQ, DC, PC, Rdc, EN notices.
CA notice has more complex structure (lots + winners) so it has its own parser.
"""
from __future__ import annotations
from ..xml_utils import (
find_child_local, find_local, find_path,
text_under, text_direct, int_under, decimal_under, bool_under,
datetime_under, sysitem_name, sysitem_id,
)
# Element names that may carry the public notice number, in lookup order.
_NOTICE_NUMBER_TAGS = (
    'NoticeNo',
    'CNoticeNumber',
    'PiNoticeNumber',
    'NoticeNumber',
    'RFQInvitationNumber',
    'RFQNoticeNumber',
    'DCNoticeNumber',
    'PCNoticeNumber',
    'RDCNoticeNumber',
    'ENoticeNumber',
    'EAProcedureNumber',
    'DfNoticeNo',  # last-resort fallback
)

# Element names that may carry the internal numeric notice ID, in lookup order.
_NOTICE_ID_TAGS = (
    'CNoticeId', 'PiNoticeId', 'NoticeId',
    'CaNoticeId', 'DCNoticeId', 'PCNoticeId',
    'RFQNoticeId', 'RdcNoticeId', 'ENoticeId',
    'EAProcedureId', 'RFQInvitationId',
)


def _first_text(el, tags):
    """Return the first truthy ``text_under(el, tag)`` over *tags*, else None."""
    for tag in tags:
        value = text_under(el, tag)
        if value:
            return value
    return None


def _first_int(el, tags):
    """Return the first non-None ``int_under(el, tag)`` over *tags*, else None.

    Falsy entries in *tags* (e.g. an unset caller-supplied name) are skipped.
    """
    for tag in tags:
        if not tag:
            continue
        value = int_under(el, tag)
        if value is not None:
            return value
    return None


def parse_basic_notice(el, *, type_tag: str, source_tag: str,
                       notice_id_field: str | None = None) -> dict | None:
    """Parse one notice element into a flat row dict.

    Generic notice parser — works for CN, PI, DC, PC, Rdc, EN, RFQ.

    Args:
        el: Notice element to parse.
        type_tag: Short type identifier stored under ``'type'``
            (e.g. ``'c_notice'``, ``'pi_notice'``).
        source_tag: Value stored under ``'source'`` (e.g. ``'wsp_cnotice'``).
        notice_id_field: Local name of the ID field (``CNoticeId``,
            ``PiNoticeId``, etc.). When given it is tried first; the common
            names in ``_NOTICE_ID_TAGS`` are tried afterwards, so ``None``
            means pure auto-detection.

    Returns:
        The row dict, or ``None`` when no notice number is found under any
        of the names in ``_NOTICE_NUMBER_TAGS``.
    """
    notice_no = _first_text(el, _NOTICE_NUMBER_TAGS)
    if not notice_no:
        return None

    notice_id = _first_int(el, (notice_id_field, *_NOTICE_ID_TAGS))

    general = find_child_local(el, 'General')
    section1 = find_child_local(el, 'Section1')
    section2 = find_child_local(el, 'Section2')
    section4 = find_child_local(el, 'Section4')

    # Authority: prefer Section1_1/CaAddresses, fall back to Section1_1 itself.
    auth_addresses = find_path(section1, 'Section1_1', 'CaAddresses')
    if auth_addresses is None:
        auth_addresses = find_path(section1, 'Section1_1')
    auth_info = (find_local(auth_addresses, 'EntityInformation')
                 if auth_addresses is not None else None)

    if auth_info is not None:
        authority_name = text_direct(auth_info, 'Name')
        authority_cui = text_direct(auth_info, 'Cif')
        authority_address = text_direct(auth_info, 'Address')
        authority_email = text_direct(auth_info, 'Email')
        authority_phone = text_direct(auth_info, 'Phone')
        authority_url = text_direct(auth_info, 'Url')
        county_code = sysitem_name(auth_info, 'NutsCode')
    else:
        authority_name = authority_cui = authority_address = None
        authority_email = authority_phone = authority_url = None
        county_code = None

    entity_id = int_under(general, 'EntityId') if general is not None else None
    s1_4 = find_child_local(section1, 'Section1_4') if section1 is not None else None
    authority_type = sysitem_name(s1_4, 'ContractingAuthorityType')
    s1_5 = find_child_local(section1, 'Section1_5') if section1 is not None else None
    main_activity = sysitem_name(s1_5, 'MainActivity')

    # Section 2 — contract
    s2_1 = find_child_local(section2, 'Section2_1') if section2 is not None else None
    contract_title = (text_under(general, 'ContractTitle') or
                      text_under(s2_1, 'ContractName') or
                      text_under(s2_1, 'Title'))
    short_desc = text_under(s2_1, 'ShortContractDescription')
    main_cpv_code = sysitem_name(s2_1, 'MainCPV') or sysitem_name(s2_1, 'MainCPVCode')
    main_cpv_id = sysitem_id(s2_1, 'MainCPV') or sysitem_id(s2_1, 'MainCPVCode')
    contract_type = sysitem_name(s2_1, 'SysAcquisitionContractType')
    currency = sysitem_name(s2_1, 'Currency')
    # NOTE(review): `or` also skips a falsy (e.g. zero) EstimatedValue in
    # favour of TotalValue — confirm that is the intended fallback rule.
    estimated_value = decimal_under(s2_1, 'EstimatedValue') or decimal_under(s2_1, 'TotalValue')
    has_lots = bool_under(s2_1, 'ContractHasLots')
    reference_number = text_under(s2_1, 'ReferenceNumber')

    # Tri-state label: 'da' / 'nu' / None when the flag is absent.
    if has_lots:
        has_lots_label = 'da'
    elif has_lots is False:
        has_lots_label = 'nu'
    else:
        has_lots_label = None

    # Lots — for CN/RFQ/etc., lots in Section2_2
    lots = _extract_lots_simple(section2)
    lots_count = len(lots) if lots else None

    # Procedure
    s4_1 = find_child_local(section4, 'Section4_1') if section4 is not None else None
    procedure_type = sysitem_name(s4_1, 'SysProcedureType')
    framework_agreement = bool_under(s4_1, 'FrameworkAgreement')

    # Section 4_2 — deadlines
    s4_2 = find_child_local(section4, 'Section4_2') if section4 is not None else None
    deadline_submission = (datetime_under(s4_2, 'TenderAvailabilityDeadline') or
                           datetime_under(s4_2, 'ReceiptTimeLimit') or
                           datetime_under(s4_2, 'ReceiptDeadline'))
    opening_date = datetime_under(s4_2, 'TenderOpeningDate')

    # Dates + state
    publication_date = datetime_under(general, 'PublishDate')
    legislation = (sysitem_name(general, 'SysLegislationType') or
                   sysitem_name(general, 'LegislationType'))
    notice_state = sysitem_name(general, 'SysNoticeState')
    notice_state_id = sysitem_id(general, 'SysNoticeState')
    is_utility = bool_under(general, 'IsUtility')
    notice_no_joue = (text_under(general, 'NoticeNoJoue') or
                      text_under(general, 'JOUEPublicationNumber'))

    # Documents
    documents = _extract_documents(general)

    return {
        'type': type_tag,
        'ref_number': f'WSP-{notice_no}',
        'authority_name': authority_name,
        'authority_cui': authority_cui,
        'authority_address': authority_address,
        'authority_email': authority_email,
        'authority_phone': authority_phone,
        'authority_url': authority_url,
        'authority_type': authority_type,
        'authority_main_activity': main_activity,
        'authority_entity_id': entity_id,
        # Titles are capped at 1000 characters.
        'title': contract_title[:1000] if contract_title else None,
        'cpv_code': main_cpv_code,
        'contract_type': contract_type,
        'publication_date': publication_date,
        'estimated_value': estimated_value,
        # Award/supplier fields are never filled by this parser.
        'awarded_value': None,
        'currency': currency,
        'supplier_name': None,
        'supplier_cui': None,
        'procedure_type': procedure_type,
        'procedure_state': notice_state,
        'legislation': legislation,
        'has_lots': has_lots_label,
        'contract_has_lots': has_lots,
        'lots_count': lots_count,
        'joue': notice_no_joue,
        'county_code': county_code,
        'notice_state': notice_state,
        'notice_state_id': notice_state_id,
        'framework_agreement': framework_agreement,
        'notice_id_internal': notice_id,
        'deadline_submission': deadline_submission,
        'opening_date': opening_date,
        # Empty lists are normalised to None.
        'documents': documents or None,
        'lots': lots or None,
        'details': {
            'short_description': short_desc,
            'reference_number': reference_number,
            'main_cpv_id': main_cpv_id,
            'is_utility': is_utility,
        },
        'source': source_tag,
    }
def _extract_lots_simple(section2) -> list[dict]:
"""Extract Lots list from Section2_2 → Lots → LotInfo."""
if section2 is None:
return []
lots_list = find_local(section2, 'Lots')
if lots_list is None:
return []
out = []
for lot in lots_list:
if etree.QName(lot.tag).localname != 'LotInfo':
continue
lot_data = {
'lot_id': int_under(lot, 'LotID'),
'lot_no': int_under(lot, 'LotNo'),
'title': text_under(lot, 'Title'),
'description': text_under(lot, 'DescriptionOfProcurement'),
'cpv_code': sysitem_name(lot, 'MainCPVCode'),
'estimated_value': _str_decimal(decimal_under(lot, 'EstimatedValue')),
'duration_months': int_under(lot, 'DurationInMonths'),
'duration_days': int_under(lot, 'DurationInDays'),
'currency': sysitem_name(lot, 'Currency'),
'place_of_performance': text_under(lot, 'MainSiteOrPlaceOfPerformance'),
'is_community_financed': bool_under(lot, 'IsCommunityFinanced'),
}
out.append({k: v for k, v in lot_data.items() if v is not None})
return out
def _extract_documents(general) -> list[dict]:
if general is None:
return []
out = []
for fld in ('NoticeFiles', 'CompanyFiles', 'DfNoticeFiles'):
container = find_local(general, fld)
if container is None:
continue
for kvp in container:
key = find_local(kvp, 'key')
if key is None:
key = find_local(kvp, 'Key')
val = find_local(kvp, 'value')
if val is None:
val = find_local(kvp, 'Value')
if key is not None and val is not None:
out.append({'type': fld, 'name': key.text, 'id': val.text})
return out
def _str_decimal(d):
return str(d) if d is not None else None
from lxml import etree # noqa: E402