#!/bin/bash
# Discovers the latest ONRC bulk dataset on data.gov.ro, downloads any newer
# CSVs, and runs import-onrc.sh — but only if the dataset differs from the
# snapshot name recorded in $STAMP_FILE. Idempotent: re-running after a
# successful import of the same snapshot is a no-op.
#
# Dataset on data.gov.ro is published ~monthly with slug pattern
# `firme-DD-MM-YYYY`. Resource UUIDs change each release, so we can't
# hardcode URLs — query CKAN to discover the current ones.
set -euo pipefail

DATA_DIR=/opt/vreaudigital/data/onrc
LOG=/var/log/vreaudigital-onrc-import.log
# Holds the slug of the last snapshot that was imported successfully.
STAMP_FILE="$DATA_DIR/.dataset-name"
# Directory of this script (symlinks resolved); sibling scripts live here.
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"

# log MESSAGE — print a timestamped line to stdout and append it to $LOG.
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1" | tee -a "$LOG"
}

mkdir -p "$DATA_DIR"
log "=== ONRC fresh-check started ==="

# Query CKAN for the most recently modified `firme-...` dataset.
# NOTE: ordering is by metadata_modified, not by the date in the slug.
SEARCH_URL="https://data.gov.ro/api/3/action/package_search?q=firme&sort=metadata_modified+desc&rows=10"

# curl and jq are run as separate steps so that a failure of either is
# written to $LOG instead of silently aborting under `set -e`/`pipefail`.
# The original exit status is preserved for callers.
SEARCH_JSON=$(curl -fsS --max-time 30 "$SEARCH_URL") || {
  rc=$?
  log "ERROR: CKAN package_search request failed (curl exit $rc)"
  exit "$rc"
}

# First result whose name is exactly firme-DD-MM-YYYY; empty if none match.
LATEST_NAME=$(jq -r '[.result.results[] | select(.name | test("^firme-[0-9]{2}-[0-9]{2}-[0-9]{4}$"))][0].name // empty' \
  <<<"$SEARCH_JSON") || {
  rc=$?
  log "ERROR: could not parse CKAN package_search response (jq exit $rc)"
  exit "$rc"
}

if [[ -z "$LATEST_NAME" ]]; then
  log "ERROR: could not find a firme-DD-MM-YYYY dataset on data.gov.ro"
  exit 1
fi
log "Latest dataset on data.gov.ro: $LATEST_NAME"

# Skip if we've already imported this snapshot.
if [[ -f "$STAMP_FILE" && "$(<"$STAMP_FILE")" == "$LATEST_NAME" ]]; then
  log "Already imported $LATEST_NAME — nothing to do."
  exit 0
fi

# Fetch resource URLs for the dataset. We need 4 of them (the rest are unused).
log "Fetching resource URLs for $LATEST_NAME..."
RESOURCES_JSON=$(curl -fsS --max-time 30 \
  "https://data.gov.ro/api/3/action/package_show?id=$LATEST_NAME") || {
  rc=$?
  log "ERROR: CKAN package_show request failed for $LATEST_NAME (curl exit $rc)"
  exit "$rc"
}

# The CSVs the import needs, in download order.
CSV_FILES=(
  od_firme.csv
  od_caen_autorizat.csv
  od_stare_firma.csv
  od_reprezentanti_legali.csv
)

# Lower-cased file name -> download URL; filled in from the CKAN resource list.
declare -A NEEDED=(
  [od_firme.csv]=""
  [od_caen_autorizat.csv]=""
  [od_stare_firma.csv]=""
  [od_reprezentanti_legali.csv]=""
)

# Match each resource URL to a needed file by its lower-cased basename.
while IFS= read -r url; do
  # Skip blank URLs: they yield an empty basename, and bash rejects an empty
  # associative-array subscript.
  [[ -n "$url" ]] || continue
  fname=$(basename -- "$url" | tr 'A-Z' 'a-z')
  [[ -n "$fname" ]] || continue
  if [[ -n "${NEEDED[$fname]+x}" ]]; then
    NEEDED[$fname]="$url"
  fi
done < <(jq -r '.result.resources[] | "\(.url)"' <<<"$RESOURCES_JSON")

# Every needed file must have been found (checked in a fixed order so the
# reported file is deterministic).
for f in "${CSV_FILES[@]}"; do
  if [[ -z "${NEEDED[$f]}" ]]; then
    log "ERROR: resource $f not found in dataset $LATEST_NAME"
    exit 1
  fi
done

# Remove leftover partial downloads on any exit path.
cleanup_tmp() {
  local name
  for name in "${CSV_FILES[@]}"; do
    # Best-effort: a cleanup failure must not change the script's exit status.
    rm -f -- "$DATA_DIR/$name.tmp" || true
  done
}
trap cleanup_tmp EXIT

# Download every CSV to a .tmp file first. No conditional fetch (curl -z) is
# used: each file is re-downloaded in full whenever a new snapshot is seen.
for f in "${CSV_FILES[@]}"; do
  url="${NEEDED[$f]}"
  log "Downloading $f..."
  # With pipefail, a curl failure surfaces as the pipeline's status.
  curl -fL --max-time 600 -o "$DATA_DIR/$f.tmp" "$url" 2>&1 | tail -3 | tee -a "$LOG" || {
    rc=$?
    log "ERROR: download of $f failed (exit $rc)"
    exit "$rc"
  }
  if [[ ! -s "$DATA_DIR/$f.tmp" ]]; then
    log "ERROR: download of $f produced an empty or missing file"
    exit 1
  fi
done

# Promote the files only after ALL downloads succeeded, so a failure part-way
# through never leaves a mix of old and new snapshot files on disk.
for f in "${CSV_FILES[@]}"; do
  mv -f -- "$DATA_DIR/$f.tmp" "$DATA_DIR/$f"
done

log "Running import-onrc.sh..."
"$SCRIPT_DIR/import-onrc.sh" || {
  rc=$?
  log "ERROR: import-onrc.sh failed (exit $rc); snapshot not recorded"
  exit "$rc"
}

# ONRC import inserts new firms without lat/lng. Run the full geocoding
# fallback chain (geonames_postal → uat_centroid → photon → judet_centroid)
# so /harta + UI map clustering have coordinates for every fresh-import row.
# A geocoding failure is deliberately non-fatal.
log "Running geocode-firms.sh fallback chain..."
"$SCRIPT_DIR/geocode-firms.sh" || log "WARN: geocode-firms.sh exited non-zero; continuing"

# Record the snapshot we just successfully imported.
printf '%s\n' "$LATEST_NAME" > "$STAMP_FILE"
log "=== ONRC fresh-import done (snapshot=$LATEST_NAME) ==="