# Shadowbroker/backend/analyze_xlsx.py

import zipfile
import xml.etree.ElementTree as ET
import re
import csv
import os

# Input workbook; the script below opens it as a ZIP archive and reads its
# worksheet XML parts directly.
# NOTE(review): both paths are absolute and machine-specific.
xlsx_path = r"f:\Codebase\Oracle\live-risk-dashboard\TheAirTraffic Database.xlsx"
# Destination of the plain-text analysis report (opened for writing, so any
# existing file at this path is overwritten).
output_path = r"f:\Codebase\Oracle\live-risk-dashboard\backend\xlsx_analysis.txt"

def parse_xlsx_sheet(z, shared_strings, sheet_num):
    """Read one worksheet of an open XLSX archive as a list of row dicts.

    Args:
        z: Open ``zipfile.ZipFile`` for the workbook.
        shared_strings: Shared-string table; cells of type ``s`` store an
            index into this list.
        sheet_num: Number N of the part ``xl/worksheets/sheetN.xml`` to read.

    Returns:
        One dict per ``<row>`` element, mapping the column letters of each
        cell reference (``'A'``, ``'AB'``, ...) to the cell text. A cell
        without a value maps to ``''``. Returns ``[]`` when the worksheet
        part is not present in the archive.

    Raises:
        IndexError: A shared-string cell points outside ``shared_strings``.
        ValueError: A shared-string cell holds a non-integer index.
    """
    ns = {'s': 'http://schemas.openxmlformats.org/spreadsheetml/2006/main'}
    try:
        ws_xml = z.read(f'xl/worksheets/sheet{sheet_num}.xml')
    except KeyError:
        # ZipFile.read raises KeyError for a member that is not in the archive.
        return []

    ws_root = ET.fromstring(ws_xml)
    rows = []
    for row in ws_root.findall('.//s:sheetData/s:row', ns):
        cells = {}
        for cell in row.findall('s:c', ns):
            cell_ref = cell.get('r', '')
            cell_type = cell.get('t', '')

            if cell_type == 'inlineStr':
                # Inline strings keep their text under <is> (possibly split
                # into several rich-text runs), not in <v>.
                is_elem = cell.find('s:is', ns)
                if is_elem is None:
                    val = ''
                else:
                    val = ''.join(
                        t.text or '' for t in is_elem.findall('.//s:t', ns)
                    )
            else:
                val_elem = cell.find('s:v', ns)
                # An empty <v/> has text None; normalise to '' so callers can
                # always treat cell values as strings.
                val = (val_elem.text or '') if val_elem is not None else ''
                if cell_type == 's' and val:
                    val = shared_strings[int(val)]

            # Keep only the column letters of a reference such as "AB12".
            col_match = re.match(r'[A-Z]+', cell_ref)
            col = col_match.group(0) if col_match else ''
            cells[col] = val
        rows.append(cells)
    return rows

# ---------------------------------------------------------------------------
# Analysis script
#   1. Load the workbook's shared strings and scan sheets 1-4 for
#      registration-like tokens.
#   2. Load the registrations already present in plane-alert-db.csv.
#   3. Write a report listing which scanned registrations are already in the
#      CSV and which are missing.
# ---------------------------------------------------------------------------

# "N", 1-5 digits, then up to two uppercase letters.
# NOTE(review): the pattern is unanchored, so it also matches inside longer
# tokens (e.g. the "N235" in "CN235") — confirm whether a leading \b is wanted.
_N_REG_RE = re.compile(r'N\d{1,5}[A-Z]{0,2}')

with open(output_path, 'w', encoding='utf-8') as out:
    with zipfile.ZipFile(xlsx_path, 'r') as z:
        # Shared-string table: one entry per <si>, concatenating all of its
        # <t> text runs.
        shared_strings = []
        if 'xl/sharedStrings.xml' in z.namelist():
            ss_xml = z.read('xl/sharedStrings.xml')
            root = ET.fromstring(ss_xml)
            ns = {'s': 'http://schemas.openxmlformats.org/spreadsheetml/2006/main'}
            for si in root.findall('.//s:si', ns):
                texts = si.findall('.//s:t', ns)
                shared_strings.append(''.join(t.text or '' for t in texts))

        all_entries = []
        for sheet_idx in range(1, 5):
            rows = parse_xlsx_sheet(z, shared_strings, sheet_idx)
            if not rows:
                continue

            out.write(f"\n=== SHEET {sheet_idx}: {len(rows)} rows ===\n")
            # Dump the first 5 rows, columns ordered A..Z, AA.. (shorter
            # column names first, then alphabetical).
            for i, sample in enumerate(rows[:5]):
                for col in sorted(sample, key=lambda x: (len(x), x)):
                    val = sample[col]
                    if val:
                        out.write(f"  Row{i} {col}: '{val[:80]}'\n")
                out.write("\n")

            # Skip the first row and scan every cell of the remaining rows.
            for r in rows[1:]:
                # Owner and type are per-row values, so compute them once per
                # row; `or ''` guards against a None cell value.
                owner = (r.get('B', r.get('A', '')) or '').strip()
                aircraft_type = (r.get('C', r.get('D', '')) or '').strip()
                for val in r.values():
                    for reg in _N_REG_RE.findall(str(val).strip()):
                        all_entries.append({
                            'registration': reg.upper(),
                            'owner': owner,
                            'type': aircraft_type,
                            'sheet': sheet_idx,
                        })

    unique_regs = set(e['registration'] for e in all_entries)
    out.write(f"\nTOTAL ENTRIES: {len(all_entries)}\n")
    out.write(f"UNIQUE REGISTRATIONS: {len(unique_regs)}\n")

    csv_path = r"f:\Codebase\Oracle\live-risk-dashboard\PLANEALERTLIST\plane-alert-db-main\plane-alert-db.csv"
    existing = {}
    # utf-8-sig tolerates an optional BOM (which would otherwise become part
    # of the first header name); newline='' is what the csv module expects.
    with open(csv_path, 'r', encoding='utf-8-sig', newline='') as f:
        for row in csv.DictReader(f):
            # DictReader fills missing trailing fields with None, hence the
            # `or ''` before any string method is applied.
            reg = (row.get('$Registration') or '').strip().upper()
            if not reg:
                continue
            existing[reg] = {
                'icao': (row.get('$ICAO') or '').strip().upper(),
                'category': row.get('Category') or '',
                'operator': row.get('$Operator') or '',
            }

    existing_regs = set(existing)
    already_in = unique_regs & existing_regs
    missing = unique_regs - existing_regs
    out.write(f"\nplane-alert-db: {len(existing)} registrations\n")
    out.write(f"Already covered: {len(already_in)}\n")
    out.write(f"MISSING: {len(missing)}\n")

    # Report each registration once, using its first occurrence in the
    # workbook; dict insertion order keeps the original scan order.
    first_seen = {}
    for e in all_entries:
        first_seen.setdefault(e['registration'], e)

    out.write("\n--- ALREADY TRACKED ---\n")
    for reg, e in first_seen.items():
        if reg in already_in:
            info = existing[reg]
            out.write(f"  {e['owner'][:40]:40s} {reg:10s} DB_CAT: {info['category'][:25]:25s} DB_OP: {info['operator'][:40]}\n")

    out.write("\n--- MISSING (NEED TO ADD) ---\n")
    for reg, e in first_seen.items():
        if reg in missing:
            out.write(f"  {e['owner'][:40]:40s} {reg:10s} TYPE: {e['type'][:30]}\n")

print(f"Analysis written to {output_path}")