#!/usr/bin/env python3
"""
Extract packing photos from xlsx files and upload to MinIO tmed2-photos bucket.

xlsx files are ZIP archives with embedded images in xl/media/.
Machine PHOTO sheets have bilingual captions linking images to machines.
"""

import os
import re
import shutil
import tempfile
import zipfile
from pathlib import Path

# Directory that holds the source packing-list workbooks listed in FILES.
PKG_DIR = Path("/home/borbolla/clawd/projects/HTS/TMED-II-Phase-III/project/package_lists")
# Directory that receives the extracted image files.
PHOTO_DIR = Path("/home/borbolla/clawd/projects/HTS/TMED-II-Phase-III/project/photos_extracted")
# Target bucket name. NOTE(review): not referenced by any code visible in this
# file — presumably consumed by a separate upload step; confirm.
BUCKET = "tmed2-photos"

# Workbook file names (relative to PKG_DIR) processed by main(), in this order.
FILES = [
    "DKTIND-2604-01.xlsx",
    "DKTIND-2604-02.xlsx",
    "HWIA-EM260408-01.xlsx",
    "HWIA-EM260408-02.xlsx",
    "HWIA-EM260413-03.xlsx",
    "HWIA-EM260413-04.xlsx",
]


# File suffixes (lower-case) under xl/media/ that are treated as images.
_IMAGE_SUFFIXES = frozenset(
    {'.png', '.jpg', '.jpeg', '.gif', '.bmp', '.emf', '.wmf'}
)


def extract_images_from_xlsx(xlsx_path, output_dir):
    """Extract all images from xl/media/ inside the xlsx zip.

    Each image is written to ``output_dir`` as ``<xlsx stem>_<media name>``,
    so files coming from different workbooks do not overwrite each other.

    Args:
        xlsx_path: Location of the xlsx file (``str`` or ``Path``).
        output_dir: Directory that receives the images (``str`` or ``Path``).
            It is created, including missing parents, if it does not exist.

    Returns:
        A list with one dict per extracted image, in archive order, with the
        keys ``source_file``, ``media_path``, ``local_path``, ``filename``
        and ``minio_key``.

    Raises:
        zipfile.BadZipFile: If ``xlsx_path`` is not a valid ZIP archive.
        OSError: If the archive cannot be read or an image cannot be written.
    """
    # Accept plain strings as well as Path objects.
    xlsx_path = Path(xlsx_path)
    output_dir = Path(output_dir)
    prefix = xlsx_path.stem  # e.g., DKTIND-2604-01
    images = []

    with zipfile.ZipFile(xlsx_path, 'r') as archive:
        # Only create the directory once the archive has opened successfully.
        output_dir.mkdir(parents=True, exist_ok=True)

        for member in archive.namelist():
            if not member.startswith('xl/media/'):
                continue
            member_name = Path(member)
            if member_name.suffix.lower() not in _IMAGE_SUFFIXES:
                continue

            fname = f"{prefix}_{member_name.name}"
            out_path = output_dir / fname
            # Stream in chunks instead of loading the whole image into memory.
            with archive.open(member) as src, open(out_path, 'wb') as dst:
                shutil.copyfileobj(src, dst)

            images.append({
                "source_file": xlsx_path.name,
                "media_path": member,
                "local_path": str(out_path),
                "filename": fname,
                "minio_key": f"packing/{fname}",
            })

    return images


def main():
    """Extract images from every workbook in FILES and print a summary.

    Workbooks that are missing from PKG_DIR, or that are not valid ZIP
    archives, are reported and skipped so one bad file does not abort the
    whole batch.

    Returns:
        The combined list of image records returned by
        extract_images_from_xlsx() for all processed workbooks.
    """
    # parents=True so the run also works when the parent directory is absent.
    PHOTO_DIR.mkdir(parents=True, exist_ok=True)

    all_images = []
    for fname in FILES:
        fpath = PKG_DIR / fname
        if not fpath.exists():
            print(f"  SKIP: {fname} not found")
            continue

        try:
            images = extract_images_from_xlsx(fpath, PHOTO_DIR)
        except zipfile.BadZipFile as exc:
            # A corrupt workbook is skipped like a missing one.
            print(f"  SKIP: {fname} is not a valid xlsx archive ({exc})")
            continue
        print(f"  {fname}: {len(images)} images extracted")
        all_images.extend(images)

    print(f"\nTotal images extracted: {len(all_images)}")
    print(f"Output directory: {PHOTO_DIR}")

    # Count images per file extension, in first-seen order.
    by_ext = {}
    for img in all_images:
        ext = Path(img["filename"]).suffix.lower()
        by_ext[ext] = by_ext.get(ext, 0) + 1
    print(f"By type: {by_ext}")

    # Total on-disk size of the extracted files.
    total_size = sum(os.path.getsize(img["local_path"]) for img in all_images)
    print(f"Total size: {total_size / 1024 / 1024:.1f} MB")

    return all_images


# Run the extraction only when this file is executed as a script, not on import.
if __name__ == "__main__":
    # The returned records are bound to a module-level name; nothing in the
    # visible code reads it afterwards.
    images = main()
