#!/usr/bin/env python3
"""
TMED-II Phase III — Package List vs Master Schedule Cross-Reference

Parses master_schedule.tsv (our plan) and 6 packing list xlsx files (Daniel Lee),
cross-references them, and outputs results to Google Sheets.

Audit fixes applied: F-01 through F-09.
"""

import os
import re
import math
import json
from collections import defaultdict
from pathlib import Path

import openpyxl

# ── Paths ──────────────────────────────────────────────────────────────────────
BASE = Path("/home/borbolla/clawd/projects/HTS/TMED-II-Phase-III")
TSV_PATH = BASE / "bidding/our-quote/breakdowns/master_schedule.tsv"
PKG_DIR = BASE / "project/package_lists"
OUTPUT_SHEET_ID = "1Tvt3fAZdwR7DAhBmnJ3AbzWdUJi2wagcG2P4xLQvgUQ"

# ── Korean→English mapping ────────────────────────────────────────────────────
# Each Korean substring → list of English regex patterns to search in facility name
# Items with numbered duplicates (#1, #2) share the same base key — the matching
# logic distributes package items across numbered Korean items evenly.
KR_EN_MAP = {
    # DKTIND items (winding line)
    "보빈공급기": ["Bobbin Lifter", "Bobbin Carrier", "BOBBIN LIFTER", "BOBBIN CARRIER",
                  "BOBIN LIFTER", "Bobin Lifter",
                  "Palletizer", "PALLETTIZER", "ROLLER CONVEYOR", "SCARA",
                  "SKID CONVEYOR"],
    "권선기": ["WINDING MACHINE(?!.*CONVEYOR)"],  # Exclude 4400MM CONVEYOR items
    "탈피기": ["Decoating Machine", "DECOATING MACHINE"],
    "리턴 컨베이어": ["\\d+\\..*RETURN CONVEYOR", "DECOATING CONVEYOR"],
    "대차류": ["CART", "TROLLEY", "대차"],  # Push carts, likely packed loose
    # HWIA post-process items
    "보빈 정렬": ["Bobbin alignment", "Bobbin Alignment"],
    "서포트링 열간압입": ["Hot Pressing", "Hot pressing", "Supporting.*Hot",
                        "Supporting \\(P"],
    "보빈 에어블로우": ["Air Blow", "Airblow", "Air blow"],
    "보빈 단차 측정": ["Terminal measurement", "단차"],
    "팔레트 에어블로우": ["Pallet Air", "Pallet.*blow"],
    "권선라인 이재로더": ["Process connection reloade", "reloader"],
    "레이져마킹": ["Marking machine", "Laser Marking", "Marking"],
    "바니시 예열": ["Preheating the varnish", "Preheating"],
    "바니쉬 예열": ["Preheating the varnish", "Preheating"],
    "바니시 함침": ["Varnish application"],
    "바니쉬 도포": ["Varnish application"],
    "1차 고주파 경화": ["Varnish curing", "High frequency"],
    "겐트리로더": ["Kentry", "Gantry(?!.*robot)"],  # Only gantry loaders, NOT autoloaders
    "바니쉬 경화 & 냉각": ["Varnish drying oven", "Varnish cooling furnace",
                          "Varnish.*cooling", "Varnish.*drying",
                          "Varnish.*tank cover", "Varnish.*duct",
                          "Chiller.*Varnish"],
    "포밍기": ["Wire Forming(?!.*Vision)", "Wire Forming #"],
    "포밍 비젼검사": ["Wireforming Vision", "Forming Vision", "Wire.*Vision"],
    "포밍 에어블로우": ["Forming.*Air", "Air.*blow.*form", "Wire.*Air"],
    "터미널 조립기": ["Terminal Assembly"],
    "코킹 & 컷팅": ["Caulking.*Cutting", "Caulking(?!.*Vision)"],
    "코킹 & 컷팅 비젼검사": ["Caulking.*Cutting.*Vision", "Caulking.*Vision",
                            "Cutting.*Vision", "Caulking.*Inspection"],
    "퓨징 & 검사": ["Fusing(?!.*Air)"],
    "퓨징 에어블로우": ["Fusing.*Air", "Fusing.*blow"],
    "트래버스": ["Traverse"],
    "온도센서 조립": ["Temperature sensor(?!.*vision)", "Temperature sensor assembly"],
    "온도센서 조립 비젼검사": ["Temperature sensor vision", "Temperature.*vision"],
    "에폭시 몰딩기": ["Epoxy Molding(?!.*Vision)", "Epoxy Molding #",
                    "Epoxy Tank"],
    "에폭시 몰딩 비젼검사": ["Epoxy.*Vision"],
    "에폭시 경화 & 냉각": ["Epoxy drying furnace", "Epoxy cooling furnace",
                          "Epoxy.*cooling", "Epoxy.*drying"],
    "지그 탈거 장치": ["Jig removal", "Jig.*robot"],
    "전기 특성 검사": ["Electrical.*test", "Electric.*test", "Elect.*Inspec"],
    "완성검사": ["Completion inspection", "Finished.*inspection",
               "Final inspection"],
    "완성이재": ["multi-joint robot", "completed.*elevator",
               "Robot controller"],
    "NG대차": ["NG Conveyor", "Master Car", "NG Cart", "NG replacement"],
    "제어반": ["Control Panel(?!.*stand)", "RTB Panel", "Electric Control",
             "Electrical supplies", "ELECTRIC CONTROL UNIT",
             "CONTROL PANEL", "PANEL", "PC RACK"],
    # Jun 15 assembly items
    "레졸버 체결": ["Resolver"],
    "리크테스트": ["Leak.*test", "Leak Tester"],
    "성능 특성 검사": ["Performance.*test"],
}

# Catch-all buckets (F-07): absorb remaining unmatched conveyors/fences/pallets/ancillary
CATCHALL_ITEMS = {
    "공정연결장치": [
        "Conveyor(?!.*Finished)", "conveyor(?!.*finished)", "Profile Fence",
        "Fence(?!.*Autoloader)", "Stair", "Lift.*table", "Stopper", "Input NG",
        "Air conditioner.*hose", "Process.*Pallet",
        "Autoloader(?!.*Fence)", "Autoloader #",
    ],
    "컨베어 外 (열간압입": [
        "Supporting \\(P", "Hot Pressing",
    ],
    "컨베어 外 (메인)": [
        "Conveyor", "conveyor", "Fence", "Profile",
        "Finished Pallet", "Control panel stand", "Vision PC",
        "Air conditioner", "dust collector", "RTB Panel",
        "Autoloader Fence", "Chiller", "Bobbin.*[Pp]allet",
        "Main.*[Pp]allet", "Main.*palette", "Bobbin.*pallet",
        "Air.*blow", "Vision PC Spare",
    ],
}

# ── Step 1a: Parse master_schedule.tsv ─────────────────────────────────────────

def parse_master_schedule(path):
    """Extract INCLUDED Motor items from TSV."""
    items = []
    excluded_names = set()
    with open(path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    in_excluded = False
    for lineno, raw in enumerate(lines, 1):
        if lineno == 1:
            continue  # header
        line = raw.rstrip("\n")
        if not line.strip():
            continue
        if "EXCLUDED" in line:
            in_excluded = True
            if line.startswith("---"):
                continue

        cols = line.split("\t")
        area = cols[0].strip() if cols else ""

        if in_excluded:
            if len(cols) >= 5 and "EXCLUDED" in area:
                name = cols[3].strip() if len(cols) > 3 else ""
                if name:
                    excluded_names.add(name)
            continue

        if area not in ("Motor",):
            continue
        if len(cols) < 5:
            continue
        status = cols[7].strip() if len(cols) > 7 else ""
        if status != "INCLUDED":
            continue

        name = cols[3].strip()
        try:
            weight = float(cols[4].strip())
        except (ValueError, IndexError):
            weight = 0.0

        # Determine arrival group
        arrival_raw = cols[5].strip() if len(cols) > 5 else ""
        if "06-15" in arrival_raw or "6/15" in arrival_raw:
            arrival = "jun15"
        elif lineno >= 59:
            arrival = "jun15"
        else:
            arrival = "may20"

        section = cols[1].strip() if len(cols) > 1 else ""

        items.append({
            "lineno": lineno,
            "section": section,
            "item_no_raw": cols[2].strip() if len(cols) > 2 else "",
            "korean_name": name,
            "weight_t": weight,
            "arrival": arrival,
            "matched_pkgs": [],
            "flag": None,
            "notes": "",
        })

    return items, excluded_names


# ── Step 1b: Parse xlsx packing detail sheets ──────────────────────────────────

def detect_columns(header_rows):
    """Auto-detect column positions by scanning header rows for keywords (F-01)."""
    cols = {
        "container": None, "facility": None,
        "net_wt": None, "gross_wt": None,
        "length": None, "width": None, "height": None,
        "container_specs": None, "container_id": None,
        "description_en": None,
    }

    for row in header_rows:
        for col_idx, cell in enumerate(row):
            val = str(cell.value).replace("\n", " ").strip().lower() if cell.value else ""
            if not val:
                continue

            if col_idx == 0 and ("container" in val or "c/t" in val):
                cols["container"] = col_idx
            if col_idx <= 4 and "facility" in val:
                cols["facility"] = col_idx

            # Net Weight — must have both "net" and "weight" (or "w" after newline)
            if "net" in val and ("weight" in val or val.endswith("w")):
                cols["net_wt"] = col_idx
            # Gross Weight
            if "gross" in val and ("weight" in val or val.endswith("w")):
                cols["gross_wt"] = col_idx

            if "description in english" in val:
                cols["description_en"] = col_idx

            if val == "l" and cols["length"] is None:
                cols["length"] = col_idx
            if val == "w" and cols["width"] is None and col_idx > (cols.get("length") or 0):
                cols["width"] = col_idx
            if val == "h" and cols["height"] is None and col_idx > (cols.get("width") or 0):
                cols["height"] = col_idx

            if "container" in val and "spec" in val:
                cols["container_specs"] = col_idx
            if "container" in val and ("no" in val) and col_idx > 20:
                cols["container_id"] = col_idx

    return cols


def safe_float(val):
    if val is None:
        return 0.0
    try:
        return float(val)
    except (ValueError, TypeError):
        return 0.0


def parse_packing_detail(wb, sheet_name, supplier_prefix):
    """Parse a PACKING DETAIL sheet, auto-detecting column positions."""
    ws = wb[sheet_name]
    all_rows = list(ws.iter_rows(values_only=False))

    if len(all_rows) < 3:
        return []

    col_map = detect_columns(all_rows[:3])

    # Defaults
    if col_map["container"] is None:
        col_map["container"] = 0
    if col_map["facility"] is None:
        col_map["facility"] = 3
    if col_map["net_wt"] is None:
        for try_col in [16, 15, 13]:
            if try_col < len(all_rows[1]):
                hdr = str(all_rows[1][try_col].value or "").lower()
                if "net" in hdr:
                    col_map["net_wt"] = try_col
                    break
        if col_map["net_wt"] is None:
            col_map["net_wt"] = 13
    if col_map["gross_wt"] is None:
        col_map["gross_wt"] = col_map["net_wt"] + 1

    items = []
    current_container = None
    current_container_specs = None

    for row_idx, row in enumerate(all_rows):
        if row_idx < 3:
            continue

        cells = [c.value for c in row]
        max_col = len(cells)

        if cells[0] and str(cells[0]).strip().upper() == "END":
            break

        fac_col = col_map["facility"]
        facility = str(cells[fac_col]).strip() if fac_col < max_col and cells[fac_col] else ""
        if not facility or facility == "None":
            continue

        # Container number
        ct_col = col_map["container"]
        ct_val = cells[ct_col] if ct_col < max_col else None
        if ct_val is not None and str(ct_val).strip() and str(ct_val).strip() != "None":
            try:
                current_container = str(int(float(str(ct_val).strip())))
            except (ValueError, TypeError):
                current_container = str(ct_val).strip()

        # Container specs
        if col_map["container_specs"] and col_map["container_specs"] < max_col:
            specs_val = cells[col_map["container_specs"]]
            if specs_val and str(specs_val).strip() and str(specs_val).strip() != "None":
                raw_specs = str(specs_val).strip().replace("\n", " ").strip()
                if raw_specs:
                    current_container_specs = raw_specs

        # Weights
        nw_col = col_map["net_wt"]
        gw_col = col_map["gross_wt"]
        net_wt = safe_float(cells[nw_col]) if nw_col < max_col else 0.0
        gross_wt = safe_float(cells[gw_col]) if gw_col < max_col else 0.0

        if net_wt == 0 and gross_wt == 0:
            continue

        # Dimensions
        l_col = col_map.get("length")
        w_col = col_map.get("width")
        h_col = col_map.get("height")
        length = safe_float(cells[l_col]) if l_col and l_col < max_col else 0.0
        width = safe_float(cells[w_col]) if w_col and w_col < max_col else 0.0
        height = safe_float(cells[h_col]) if h_col and h_col < max_col else 0.0

        # Description in English (DKTIND col I)
        desc_en_col = col_map.get("description_en")
        desc_en = ""
        if desc_en_col and desc_en_col < max_col and cells[desc_en_col]:
            desc_en = str(cells[desc_en_col]).strip()

        # Prefixed container number (F-04)
        prefixed_ct = f"{supplier_prefix}-{current_container}" if current_container else f"{supplier_prefix}-?"

        items.append({
            "supplier": supplier_prefix,
            "container": prefixed_ct,
            "container_raw": current_container,
            "container_specs": current_container_specs or "",
            "facility_name": facility,
            "description_en": desc_en,
            "net_wt_kg": net_wt,
            "gross_wt_kg": gross_wt,
            "dims_mm": (length, width, height),
            "matched_to": None,
        })

    return items


def parse_all_packages(pkg_dir):
    """Parse all 6 xlsx files."""
    all_items = []
    # prefix, vessel, sailing date, arrival sort key, estimated arrival range
    files_info = {
        "DKTIND-2604-01.xlsx": ("DK", "BUENAVENTURA EXPRESS 2614E", "Apr 14", 1, "May 14-19"),
        "DKTIND-2604-02.xlsx": ("DK2", "HYUNDAI COURAGE 0123E", "Apr 20", 3, "May 20-25"),
        "HWIA-EM260408-01.xlsx": ("HWIA", "CMA CGM THAMES 1WU0SE1MA", "Apr 14", 1, "May 14-19"),
        "HWIA-EM260408-02.xlsx": ("HWIA", "MSC CALIDRIS III QM615A", "Apr 16", 2, "May 16-21"),
        "HWIA-EM260413-03.xlsx": ("HWIA", "HYUNDAI COURAGE 0123E", "Apr 20", 3, "May 20-25"),
        "HWIA-EM260413-04.xlsx": ("HWIA", "LIMA EXPRESS 2615E", "Apr 21", 4, "May 21-26"),
    }

    for fname, (prefix, vessel, sailing, arrival_order, est_arrival) in files_info.items():
        fpath = pkg_dir / fname
        if not fpath.exists():
            print(f"WARNING: {fname} not found")
            continue

        wb = openpyxl.load_workbook(str(fpath), read_only=True, data_only=True)

        detail_sheet = None
        for sn in wb.sheetnames:
            if sn == "PACKING DETAIL":
                detail_sheet = sn
                break

        if detail_sheet:
            items = parse_packing_detail(wb, detail_sheet, prefix)
            for item in items:
                item["source_file"] = fname
                item["vessel"] = vessel
                item["sailing"] = sailing
                item["arrival_order"] = arrival_order
                item["est_arrival"] = est_arrival
            all_items.extend(items)
            print(f"  {fname}: {len(items)} items from {detail_sheet}")

        wb.close()

    return all_items


# ── Step 1c: Auto-build mapping from Machine PHOTO sheets (F-06) ──────────────

def scrape_bilingual_captions(pkg_dir):
    """Extract English(Korean) pairs from Machine PHOTO sheets."""
    pairs = {}
    hwia_files = [
        "HWIA-EM260408-01.xlsx", "HWIA-EM260408-02.xlsx",
        "HWIA-EM260413-03.xlsx", "HWIA-EM260413-04.xlsx",
    ]

    for fname in hwia_files:
        fpath = pkg_dir / fname
        if not fpath.exists():
            continue
        wb = openpyxl.load_workbook(str(fpath), read_only=True, data_only=True)
        if "Machine PHOTO" not in wb.sheetnames:
            wb.close()
            continue
        ws = wb["Machine PHOTO"]
        for row in ws.iter_rows(values_only=True):
            for cell in row:
                if cell is None:
                    continue
                val = str(cell).strip()
                m = re.match(r'^(.+?)\s*[\(（]([가-힣\s&]+)[\)）]', val)
                if m:
                    en = m.group(1).strip()
                    kr = m.group(2).strip()
                    if kr and en:
                        pairs[kr] = en
        wb.close()

    return pairs


# ── Step 2: Matching logic ─────────────────────────────────────────────────────

def get_base_name(korean_name):
    """Strip trailing number suffix: '권선기 #4' → '권선기'."""
    return re.sub(r'\s*#\d+$', '', korean_name).strip()


def find_matching_packages(patterns, pkg_items, exclude_claimed=True):
    """Find all unclaimed package items matching any pattern."""
    matches = []
    for pkg in pkg_items:
        if exclude_claimed and pkg["matched_to"] is not None:
            continue
        search_text = f"{pkg['facility_name']} {pkg['description_en']}"
        for pat in patterns:
            try:
                if re.search(pat, search_text, re.IGNORECASE):
                    matches.append(pkg)
                    break
            except re.error:
                if pat.lower() in search_text.lower():
                    matches.append(pkg)
                    break
    return matches


def get_patterns_for(korean_name, kr_en_map, auto_captions):
    """Get all English search patterns for a Korean equipment name.

    Uses longest-match-wins: if 'A' and 'A B' both match, only 'A B' patterns are used.
    This prevents '코킹 & 컷팅 비젼검사' from inheriting '코킹 & 컷팅' patterns.
    """
    base = get_base_name(korean_name)
    patterns = []

    # Collect all matching kr_keys
    matching_keys = []
    for kr_key in kr_en_map:
        if kr_key in korean_name or kr_key in base:
            matching_keys.append(kr_key)

    # Filter: remove any key that is a proper substring of another matching key
    filtered_keys = []
    for k in matching_keys:
        is_substring_of_longer = any(
            k != other and k in other
            for other in matching_keys
        )
        if not is_substring_of_longer:
            filtered_keys.append(k)

    for kr_key in filtered_keys:
        patterns.extend(kr_en_map[kr_key])

    # Check auto-scraped captions
    matching_captions = []
    for kr_caption, en_caption in auto_captions.items():
        if kr_caption in korean_name or korean_name in kr_caption:
            matching_captions.append((kr_caption, en_caption))
        elif kr_caption in base or base in kr_caption:
            matching_captions.append((kr_caption, en_caption))

    # Same longest-match filter for captions
    caption_keys = [c[0] for c in matching_captions]
    for kr_caption, en_caption in matching_captions:
        is_sub = any(kr_caption != other and kr_caption in other for other in caption_keys)
        if not is_sub:
            patterns.append(re.escape(en_caption))

    return patterns


def run_matching(master_items, pkg_items, auto_captions):
    """Two-pass matching with group-aware distribution for numbered items."""
    flags = []

    # Separate catch-all items
    catchall_kr_keys = set(CATCHALL_ITEMS.keys())

    def is_catchall(name):
        return any(ck in name for ck in catchall_kr_keys)

    specific_items = [m for m in master_items if not is_catchall(m["korean_name"])]
    catchall_items = [m for m in master_items if is_catchall(m["korean_name"])]

    # ── Pass 1: Group numbered items, match as a group, then distribute ──
    # Group specific items by base name
    groups = defaultdict(list)
    for m in specific_items:
        if m["arrival"] == "jun15":
            m["flag"] = "NOT_IN_PACKAGES_EXPECTED"
            m["notes"] = "Jun 15 arrival (separate shipment)"
            continue
        base = get_base_name(m["korean_name"])
        groups[base].append(m)

    # Sort groups: longer/more specific names first (F-07 ordering)
    # e.g. "코킹 & 컷팅 비젼검사" before "코킹 & 컷팅"
    sorted_group_keys = sorted(groups.keys(), key=lambda k: -len(k))

    for base_name in sorted_group_keys:
        group = groups[base_name]
        patterns = get_patterns_for(base_name, KR_EN_MAP, auto_captions)
        # Also try patterns from the full name of the first item
        for m in group:
            extra = get_patterns_for(m["korean_name"], KR_EN_MAP, auto_captions)
            for p in extra:
                if p not in patterns:
                    patterns.append(p)

        if not patterns:
            for m in group:
                m["flag"] = "NOT_FOUND"
                m["notes"] = "No mapping found for this Korean name"
                flags.append({
                    "type": "NOT_FOUND",
                    "item": m["korean_name"],
                    "detail": f"Weight {m['weight_t']}T, no English pattern mapped",
                    "impact": "Cannot verify against package list",
                    "action": "Add Korean→English mapping manually",
                })
            continue

        # Find ALL matching packages for this group
        matched_pkgs = find_matching_packages(patterns, pkg_items)

        if not matched_pkgs:
            for m in group:
                m["flag"] = "NOT_FOUND"
                m["notes"] = f"Patterns {patterns[:3]} found no matches"
                flags.append({
                    "type": "NOT_FOUND",
                    "item": m["korean_name"],
                    "detail": f"Weight {m['weight_t']}T, patterns tried but no match",
                    "impact": "May be packed under different name or missing",
                    "action": "Verify with Daniel Lee",
                })
            continue

        # Distribute packages across group members
        n_masters = len(group)
        n_pkgs = len(matched_pkgs)

        if n_masters == 1:
            # Single item: claim all
            for pkg in matched_pkgs:
                pkg["matched_to"] = group[0]["korean_name"]
            group[0]["matched_pkgs"] = matched_pkgs
        else:
            # Multiple items: sort packages by weight descending, distribute round-robin
            sorted_pkgs = sorted(matched_pkgs, key=lambda p: p["net_wt_kg"], reverse=True)
            buckets = [[] for _ in range(n_masters)]

            # Greedy assignment: assign each package to the bucket with lowest total weight
            bucket_weights = [0.0] * n_masters
            for pkg in sorted_pkgs:
                min_idx = bucket_weights.index(min(bucket_weights))
                buckets[min_idx].append(pkg)
                bucket_weights[min_idx] += pkg["net_wt_kg"]

            for i, m in enumerate(group):
                for pkg in buckets[i]:
                    pkg["matched_to"] = m["korean_name"]
                m["matched_pkgs"] = buckets[i]

        # Evaluate each group member
        for m in group:
            pkgs = m["matched_pkgs"]
            if not pkgs:
                m["flag"] = "NOT_FOUND"
                m["notes"] = "Group had matches but none distributed to this item"
                continue

            pkg_net_t = sum(p["net_wt_kg"] for p in pkgs) / 1000.0
            planned_t = m["weight_t"]
            if planned_t > 0:
                delta_pct = abs(pkg_net_t - planned_t) / planned_t * 100
            else:
                delta_pct = 0

            if delta_pct <= 10:
                m["flag"] = "MATCH"
            else:
                m["flag"] = "WEIGHT_DIFF"
                m["notes"] = f"Delta {delta_pct:.0f}% (plan {planned_t}T vs pkg {pkg_net_t:.2f}T net)"

    # ── Pass 2: Catch-all items absorb remaining unmatched ──
    for master in catchall_items:
        if master["arrival"] == "jun15":
            master["flag"] = "NOT_IN_PACKAGES_EXPECTED"
            master["notes"] = "Jun 15 arrival (separate shipment)"
            continue

        kr_name = master["korean_name"]
        patterns = []

        # Get catch-all patterns
        for catchall_kr, catchall_pats in CATCHALL_ITEMS.items():
            if catchall_kr in kr_name:
                patterns.extend(catchall_pats)

        # Also add regular mapping patterns
        for kr_key, en_pats in KR_EN_MAP.items():
            if kr_key in kr_name:
                patterns.extend(en_pats)

        if not patterns:
            master["flag"] = "NOT_FOUND"
            continue

        matched = find_matching_packages(patterns, pkg_items)
        for pkg in matched:
            pkg["matched_to"] = master["korean_name"]
        master["matched_pkgs"] = matched

        if not matched:
            master["flag"] = "NOT_FOUND"
            master["notes"] = "Catch-all found no remaining unmatched items"
            continue

        pkg_net_t = sum(p["net_wt_kg"] for p in matched) / 1000.0
        planned_t = master["weight_t"]
        if planned_t > 0:
            delta_pct = abs(pkg_net_t - planned_t) / planned_t * 100
        else:
            delta_pct = 0

        if delta_pct <= 10:
            master["flag"] = "MATCH"
        else:
            master["flag"] = "WEIGHT_DIFF"
            master["notes"] = f"Catch-all delta {delta_pct:.0f}% (plan {planned_t}T vs pkg {pkg_net_t:.2f}T)"
            if planned_t >= 30:
                master["notes"] += " — expected large delta for catch-all bucket"

    # ── EXTRA items in packages ──
    for pkg in pkg_items:
        if pkg["matched_to"] is None:
            flags.append({
                "type": "EXTRA_IN_PACKAGES",
                "item": pkg["facility_name"],
                "detail": f"Container {pkg['container']}, Net {pkg['net_wt_kg']}kg, Source: {pkg['source_file']}",
                "impact": "Item in packages but not in our master schedule",
                "action": "Determine if scope addition or ancillary equipment",
            })

    # NOTE: EXCLUDED winding machine check removed — package sequence numbers
    # (e.g., "09. WINDING MACHINE 1") are NOT machine numbers. All 8 winding
    # machines in packages are the 8 INCLUDED ones.

    # Add WEIGHT_DIFF entries to flags
    for m in master_items:
        if m["flag"] == "WEIGHT_DIFF":
            flags.append({
                "type": "WEIGHT_DIFF",
                "item": m["korean_name"],
                "detail": m["notes"],
                "impact": "Weight discrepancy — may affect rigging plan",
                "action": "Verify weight with supplier documentation",
            })

    return flags


# ── Step 3: Container summary ─────────────────────────────────────────────────

def build_container_summary(pkg_items):
    """Group items by container, compute forklift class and unload days."""
    containers = defaultdict(lambda: {
        "supplier": "", "container_type": "", "items": [],
        "net_wt": 0.0, "gross_wt": 0.0,
        "heaviest_kg": 0.0, "heaviest_name": "",
    })

    for pkg in pkg_items:
        ct = pkg["container"]
        c = containers[ct]
        c["supplier"] = pkg["supplier"]
        c["items"].append(pkg)
        c["net_wt"] += pkg["net_wt_kg"]
        c["gross_wt"] += pkg["gross_wt_kg"]

        specs = pkg.get("container_specs", "")
        if specs and not c["container_type"]:
            c["container_type"] = specs

        if pkg["gross_wt_kg"] > c["heaviest_kg"]:
            c["heaviest_kg"] = pkg["gross_wt_kg"]
            c["heaviest_name"] = pkg["facility_name"]

        # Vessel info from source file
        if not c.get("vessel"):
            c["vessel"] = pkg.get("vessel", "")
            c["sailing"] = pkg.get("sailing", "")
            c["arrival_order"] = pkg.get("arrival_order", 99)
            c["est_arrival"] = pkg.get("est_arrival", "")

    # Sort by arrival order (earliest vessel first), then container number
    sorted_containers = sorted(
        containers.keys(),
        key=lambda k: (containers[k].get("arrival_order", 99), k)
    )
    summary = []
    day = 1
    count_today = 0

    for ct_num in sorted_containers:
        c = containers[ct_num]
        ct_type = c["container_type"].upper()
        heaviest_t = c["heaviest_kg"] / 1000.0

        # Flat racks are forklift-accessible from the sides — no crane needed.
        # Assign forklift class by heaviest item weight regardless of container type.
        if heaviest_t <= 3:
            forklift = "3.5T"
        elif heaviest_t <= 5:
            forklift = "5T"
        elif heaviest_t <= 7:
            forklift = "7.5T"
        elif heaviest_t <= 15:
            forklift = "17T"
        else:
            forklift = "Crane"

        count_today += 1
        if count_today > 10:
            day += 1
            count_today = 1

        items_list = "; ".join(p["facility_name"][:30] for p in c["items"][:5])
        if len(c["items"]) > 5:
            items_list += f" (+{len(c['items'])-5} more)"

        supplier_name = "DKTEC" if c["supplier"] in ("DK", "DK2") else "Hyundai WIA"

        summary.append({
            "container": ct_num,
            "supplier": supplier_name,
            "line": "Motor Line",  # All containers in this shipment are Motor Line
            "container_type": c["container_type"] or "40HQ",
            "vessel": c.get("vessel", ""),
            "sailing": c.get("sailing", ""),
            "est_arrival": c.get("est_arrival", ""),
            "num_items": len(c["items"]),
            "net_wt_kg": round(c["net_wt"]),
            "gross_wt_kg": round(c["gross_wt"]),
            "heaviest_kg": round(c["heaviest_kg"]),
            "heaviest_name": c["heaviest_name"],
            "forklift_class": forklift,
            "unload_day": day,
            "items_list": items_list,
        })

    return summary


# ── Step 4: Output builders ───────────────────────────────────────────────────

def build_crossref_sheet(master_items):
    """Build Sheet 1: Cross-Reference data."""
    headers = [
        "Item No", "Line", "Korean Name", "English Name", "Our Weight NET (T)",
        "Arrival", "Found?", "Container #s", "Package Net (T)",
        "Package Gross (T)", "Delta %", "Max Piece (T)",
        "Max Dims LxWxH (mm)", "Flag", "Notes",
    ]
    rows = [headers]

    for i, m in enumerate(master_items, 1):
        pkgs = m.get("matched_pkgs", [])

        en_names = list(set(p["facility_name"] for p in pkgs))
        en_name = "; ".join(en_names[:3]) if en_names else "(not matched)"
        if len(en_names) > 3:
            en_name += f" (+{len(en_names)-3})"

        ct_nums = list(set(p["container"] for p in pkgs))
        ct_str = ", ".join(sorted(ct_nums)) if ct_nums else ""

        pkg_net = sum(p["net_wt_kg"] for p in pkgs) / 1000.0
        pkg_gross = sum(p["gross_wt_kg"] for p in pkgs) / 1000.0

        if m["weight_t"] > 0 and pkgs:
            delta = (pkg_net - m["weight_t"]) / m["weight_t"] * 100
            delta_str = f"{delta:+.0f}%"
        else:
            delta_str = ""

        max_piece = max((p["gross_wt_kg"] for p in pkgs), default=0) / 1000.0

        max_dims = ""
        if pkgs:
            dims_list = [p["dims_mm"] for p in pkgs if any(d > 0 for d in p["dims_mm"])]
            if dims_list:
                max_l = max(d[0] for d in dims_list)
                max_w = max(d[1] for d in dims_list)
                max_h = max(d[2] for d in dims_list)
                max_dims = f"{max_l:.0f}x{max_w:.0f}x{max_h:.0f}"

        rows.append([
            str(i), "Motor Line", m["korean_name"], en_name,
            f"{m['weight_t']:.2f}", m["arrival"],
            m["flag"] or "", ct_str,
            f"{pkg_net:.2f}" if pkgs else "",
            f"{pkg_gross:.2f}" if pkgs else "",
            delta_str,
            f"{max_piece:.2f}" if max_piece > 0 else "",
            max_dims, m["flag"] or "", m["notes"],
        ])

    return rows


def build_container_sheet(summary):
    headers = [
        "Supplier", "Container #", "Line", "Container Type",
        "Vessel", "Sailing", "Est. Arrival",
        "Items", "Net Wt (kg)", "Gross Wt (kg)", "Heaviest Item (kg)",
        "Heaviest Name", "Forklift Class", "Unload Day", "Items List",
    ]
    rows = [headers]
    for c in summary:
        rows.append([
            c["supplier"], c["container"], c["line"], c["container_type"],
            c["vessel"], c["sailing"], c["est_arrival"],
            str(c["num_items"]), str(c["net_wt_kg"]), str(c["gross_wt_kg"]),
            str(c["heaviest_kg"]), c["heaviest_name"],
            c["forklift_class"], str(c["unload_day"]), c["items_list"],
        ])
    return rows


def build_flags_sheet(flags):
    headers = ["Flag Type", "Item", "Detail", "Impact", "Action"]
    rows = [headers]
    for f in flags:
        rows.append([f["type"], f["item"], f["detail"], f["impact"], f["action"]])
    return rows


# ── Main ───────────────────────────────────────────────────────────────────────

def main():
    print("=" * 70)
    print("TMED-II Phase III — Package List vs Master Schedule Cross-Reference")
    print("=" * 70)

    # Step 1a
    print("\n[1a] Parsing master_schedule.tsv...")
    master_items, excluded_names = parse_master_schedule(TSV_PATH)
    may20 = [m for m in master_items if m["arrival"] == "may20"]
    jun15 = [m for m in master_items if m["arrival"] == "jun15"]
    total_planned = sum(m["weight_t"] for m in master_items)
    print(f"  Motor INCLUDED: {len(master_items)} items, {total_planned:.2f}T total")
    print(f"  May 20 arrival: {len(may20)} items, {sum(m['weight_t'] for m in may20):.2f}T")
    print(f"  Jun 15 arrival: {len(jun15)} items, {sum(m['weight_t'] for m in jun15):.2f}T")
    print(f"  Excluded names tracked: {len(excluded_names)}")

    # Step 1b
    print("\n[1b] Parsing package list xlsx files...")
    pkg_items = parse_all_packages(PKG_DIR)
    total_net = sum(p["net_wt_kg"] for p in pkg_items) / 1000.0
    total_gross = sum(p["gross_wt_kg"] for p in pkg_items) / 1000.0
    print(f"  Total package items: {len(pkg_items)}")
    print(f"  Total net weight: {total_net:.2f}T")
    print(f"  Total gross weight: {total_gross:.2f}T")

    # Step 1c
    print("\n[1c] Scraping bilingual captions from Machine PHOTO sheets...")
    auto_captions = scrape_bilingual_captions(PKG_DIR)
    print(f"  Found {len(auto_captions)} Korean→English pairs")

    # Step 2
    print("\n[2] Running two-pass matching...")
    flags = run_matching(master_items, pkg_items, auto_captions)

    by_flag = defaultdict(int)
    for m in master_items:
        by_flag[m["flag"] or "NONE"] += 1
    print(f"  Results: {dict(by_flag)}")

    unmatched_pkgs = [p for p in pkg_items if p["matched_to"] is None]
    print(f"  Unmatched package items: {len(unmatched_pkgs)}")
    if unmatched_pkgs:
        for p in unmatched_pkgs[:10]:
            print(f"    - {p['facility_name'][:50]} ({p['net_wt_kg']}kg) [{p['source_file']}]")
        if len(unmatched_pkgs) > 10:
            print(f"    ... and {len(unmatched_pkgs)-10} more")

    # Step 3
    print("\n[3] Building container summary...")
    container_summary = build_container_summary(pkg_items)
    num_containers = len(container_summary)
    unload_days = max((c["unload_day"] for c in container_summary), default=0)
    crane_needed = sum(1 for c in container_summary if c["forklift_class"] == "Crane")
    print(f"  Containers: {num_containers}")
    print(f"  Unload days (10/day): {unload_days}")
    print(f"  Crane required: {crane_needed} containers")

    # Step 4
    print("\n[4] Building sheet data...")
    crossref_rows = build_crossref_sheet(master_items)
    container_rows = build_container_sheet(container_summary)
    flags_rows = build_flags_sheet(flags)

    print(f"  Cross-Reference: {len(crossref_rows)-1} rows")
    print(f"  Container Summary: {len(container_rows)-1} rows")
    print(f"  Flags: {len(flags_rows)-1} rows")

    # Save JSON
    output = {
        "crossref": crossref_rows,
        "containers": container_rows,
        "flags": flags_rows,
        "stats": {
            "master_items": len(master_items),
            "pkg_items": len(pkg_items),
            "total_planned_t": round(total_planned, 2),
            "total_pkg_net_t": round(total_net, 2),
            "total_pkg_gross_t": round(total_gross, 2),
            "num_containers": num_containers,
            "unload_days": unload_days,
            "crane_containers": crane_needed,
            "flags_count": len(flags),
            "unmatched_pkg_items": len(unmatched_pkgs),
        },
    }

    json_path = BASE / "project/cross_reference_output.json"
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(output, f, ensure_ascii=False, indent=2)
    print(f"\n  Output saved to: {json_path}")

    # Verification
    print("\n" + "=" * 70)
    print("VERIFICATION")
    print("=" * 70)
    print(f"  1. Total package gross weight: {total_gross:.1f}T (expected ~324T)")
    print(f"  2. Non-assembly planned: {sum(m['weight_t'] for m in may20):.2f}T vs packages {total_gross:.1f}T gross")
    print(f"  3. Jun 15 items NOT_IN_PACKAGES_EXPECTED: {by_flag.get('NOT_IN_PACKAGES_EXPECTED', 0)} (expected 7)")
    print(f"  4. Unmatched package items: {len(unmatched_pkgs)} (target: 0)")
    print(f"  5. Containers: {num_containers} / 10/day = {unload_days} working days")

    return output


if __name__ == "__main__":
    main()