# pod_extractor_service.py
# Single service for:
#   - /pod/extract   (baseline + optional template boxes)
#   - /pod/ocr_box   (single-box OCR for template tuning)
#   - /pod/split     (split multi-page PDF into single-page PDFs)
#
# Notes:
# - Keeps ONE FastAPI app for both single and batch flows (no separate service needed).
# - Cleans up duplicate imports and groups models/endpoints neatly.

from fastapi import FastAPI
from pydantic import BaseModel
from typing import Optional, Dict, Any, List
from pathlib import Path
import fitz  # PyMuPDF
import re
from datetime import datetime

from PIL import Image, ImageOps, ImageFilter
import pytesseract
import io
import os

# Location of the Tesseract binary. Set the TESSERACT_CMD environment variable
# to point at a different install; when it is unset or empty the default
# Windows install path below is used.
pytesseract.pytesseract.tesseract_cmd = (
    os.environ.get("TESSERACT_CMD")
    or r"C:\Program Files\Tesseract-OCR\tesseract.exe"
)

# The single FastAPI application the /pod/* endpoints below are registered on.
app = FastAPI()


# ---------- request models ----------
class TemplateModel(BaseModel):
    # OCR template: a set of named boxes on one PDF page plus the resolution
    # used to render them.

    # 1-based page number; the box OCR helpers clamp it into the document's
    # page range.
    page_no: int = 1
    # Render resolution; the helpers scale the page by dpi / 72.
    dpi: int = 300
    # {field: {x, y, w, h, psm}} with coordinates in PDF points (72 per inch).
    # psm is optional; the helpers default it to 6.
    boxes: Dict[str, Any]


class PodReq(BaseModel):
    # Request body for /pod/extract.

    # File to process. A ".pdf" extension is handled as a PDF; any other
    # extension is opened as an image (e.g. jpg/png).
    path: str
    # Not read by the endpoint code visible in this file.
    hint: Optional[str] = None
    # Optional template boxes; only applied when `path` is a PDF.
    template: Optional[TemplateModel] = None


# Single-box OCR request models
class BoxModel(BaseModel):
    # A single OCR box. The OCR helper turns it into
    # fitz.Rect(x, y, x + w, y + h), i.e. (x, y) is the box origin and (w, h)
    # its size, in the same PDF-point units as TemplateModel.boxes.
    x: float
    y: float
    w: float
    h: float
    # Passed to Tesseract as "--psm <value>".
    psm: int = 6


class PodBoxReq(BaseModel):
    # Request body for /pod/ocr_box (OCR of exactly one box in a PDF).

    # PDF to read; the endpoint rejects any other extension.
    path: str
    # 1-based page number and render resolution for the OCR pass.
    page_no: int = 1
    dpi: int = 300

    # Provide EITHER `box` (used whenever it is present) OR `field`:
    field: Optional[str] = None          # key to look up in template.boxes
    box: Optional[BoxModel] = None       # direct box coordinates

    # Required when `field` is used: the template whose `boxes` holds that key.
    template: Optional[TemplateModel] = None

    # Not read by the endpoint code visible in this file.
    hint: Optional[str] = None


# Batch split request model
class SplitReq(BaseModel):
    # Request body for /pod/split.

    # Source PDF to split.
    path: str
    # Directory that receives the per-page PDFs; created if it does not exist.
    out_dir: str
    # When True, a page file that already exists is not overwritten; the
    # endpoint writes the page under an alternative name instead.
    avoid_overwrite: bool = True


# ---------- helpers ----------
def preprocess(img: Image.Image) -> Image.Image:
    """Prepare an image for OCR.

    Applies, in order: grayscale conversion, auto-contrast, and a sharpen
    filter. Returns the processed image.
    """
    contrasted = ImageOps.autocontrast(ImageOps.grayscale(img))
    return contrasted.filter(ImageFilter.SHARPEN)


def ocr_text_from_pdf(doc: fitz.Document, max_pages: int = 1, dpi: int = 300) -> str:
    """OCR the leading pages of an open PDF and return their text.

    Renders at most ``max_pages`` pages (fewer when the document is shorter)
    at ``dpi``, runs each rendering through ``preprocess`` and Tesseract
    (English, ``--psm 6``), joins the per-page text with newlines and strips
    the result.
    """
    scale = dpi / 72  # box/page coordinates are in PDF points (72 per inch)
    zoom = fitz.Matrix(scale, scale)
    page_count = min(len(doc), max_pages)

    def _page_text(index: int) -> str:
        # Render one page to PNG bytes and hand it to Tesseract via Pillow.
        rendered = doc[index].get_pixmap(matrix=zoom, alpha=False)
        picture = Image.open(io.BytesIO(rendered.tobytes("png")))
        return pytesseract.image_to_string(preprocess(picture), lang="eng", config="--psm 6")

    page_texts = [_page_text(index) for index in range(page_count)]
    return "\n".join(page_texts).strip()


def ocr_text_from_image(path: str) -> str:
    """OCR a whole image file and return its stripped text.

    The image is run through ``preprocess`` and then Tesseract (English,
    ``--psm 6``). The file is opened in a ``with`` block so its handle is
    released as soon as preprocessing has produced its own copy of the pixels,
    rather than whenever the image object happens to be garbage-collected.
    """
    with Image.open(path) as img:
        prepared = preprocess(img)
    return pytesseract.image_to_string(prepared, lang="eng", config="--psm 6").strip()


def ocr_boxes_from_pdf(doc: fitz.Document, page_no_1based: int, boxes: dict, dpi: int = 300) -> Dict[str, str]:
    """OCR every template box on one page of an open PDF.

    Args:
        doc: Open PyMuPDF document.
        page_no_1based: 1-based page number; clamped into the document's
            page range.
        boxes: Mapping of field name -> ``{x, y, w, h, psm}`` in PDF points.
        dpi: Render resolution for each clipped box.

    Returns:
        Mapping of field name -> stripped OCR text. A box whose coordinates
        are missing or not numeric is skipped (best effort). A missing or
        unusable ``psm`` falls back to 6 instead of aborting the whole run.
    """
    page_index = max(0, page_no_1based - 1)
    page_index = min(page_index, len(doc) - 1)
    page = doc[page_index]

    mat = fitz.Matrix(dpi / 72, dpi / 72)
    out: Dict[str, str] = {}

    for key, b in boxes.items():
        # Template boxes are untyped client input: skip the ones we cannot
        # read, but only for the errors bad input can actually cause.
        try:
            x, y, w, h = float(b["x"]), float(b["y"]), float(b["w"]), float(b["h"])
        except (KeyError, TypeError, ValueError, OverflowError):
            continue

        try:
            psm = int(b.get("psm", 6))
        except (TypeError, ValueError):
            psm = 6

        rect = fitz.Rect(x, y, x + w, y + h)

        pix = page.get_pixmap(matrix=mat, clip=rect, alpha=False)
        img = Image.open(io.BytesIO(pix.tobytes("png")))
        img2 = preprocess(img)

        txt = pytesseract.image_to_string(img2, lang="eng", config=f"--psm {psm}")
        out[key] = (txt or "").strip()

    return out


def ocr_one_box_from_pdf(doc: fitz.Document, page_no_1based: int, box: dict, dpi: int = 300) -> str:
    """OCR a single box on one page of an open PDF and return stripped text.

    ``page_no_1based`` is clamped into the document's page range. ``box`` must
    provide ``x``, ``y``, ``w`` and ``h`` (PDF points); ``psm`` is optional and
    defaults to 6. Unlike the multi-box helper, a malformed box raises.
    """
    # Clamp the 1-based page number to a valid 0-based index.
    last_index = len(doc) - 1
    target_page = doc[min(max(0, page_no_1based - 1), last_index)]

    zoom = fitz.Matrix(dpi / 72, dpi / 72)

    left, top, width, height = (float(box[name]) for name in ("x", "y", "w", "h"))
    mode = int(box.get("psm", 6))

    region = fitz.Rect(left, top, left + width, top + height)
    rendered = target_page.get_pixmap(matrix=zoom, clip=region, alpha=False)

    picture = preprocess(Image.open(io.BytesIO(rendered.tobytes("png"))))
    recognised = pytesseract.image_to_string(picture, lang="eng", config=f"--psm {mode}")
    return (recognised or "").strip()


def clean(s: str) -> str:
    """Normalise whitespace in a piece of OCR text.

    Non-breaking spaces become ordinary spaces, runs of spaces/tabs collapse
    to one space, and both ends are trimmed. Newlines inside the text are
    kept. Falsy input (``None`` or ``""``) yields ``""``.
    """
    if s:
        without_nbsp = s.replace("\u00a0", " ")
        return re.sub(r"[ \t]+", " ", without_nbsp).strip()
    return ""


def find_pod_number(text: str) -> Optional[str]:
    """Find the POD / delivery-note number in OCR text.

    Labelled patterns are tried in priority order ("Delivery Note", "POD",
    "D/N"). Every occurrence of a pattern is examined, so a hit that only
    captured a label word (e.g. "Delivery Note Date") no longer hides a real
    number further down the page. "No." with a trailing full stop is accepted
    as a label. If no labelled number is found, the first standalone 5-8 digit
    number in the first 40 lines is returned.

    Returns:
        The matched identifier, or ``None`` when nothing plausible is found.
    """
    label_patterns = (
        r"\bDelivery\s*Note\s*(?:No\.?|Number|#)?\s*[:\-]?\s*([A-Z0-9\-\/]{4,20})\b",
        r"\bPOD\s*(?:No\.?|Number|#)?\s*[:\-]?\s*([A-Z0-9\-\/]{4,20})\b",
        r"\bD\/N\s*[:\-]?\s*([A-Z0-9\-\/]{4,20})\b",
    )
    # Words the capture group can pick up from the label itself.
    label_words = {"DATE", "NUMBER", "NOTE"}

    for pattern in label_patterns:
        for hit in re.finditer(pattern, text, re.IGNORECASE):
            # The capture class admits no whitespace, so the value needs no
            # further normalisation.
            candidate = hit.group(1)
            if candidate.upper() not in label_words:
                return candidate

    # Fallback: an unlabelled 5-8 digit number near the top of the page.
    top = "\n".join(text.splitlines()[:40])
    fallback = re.search(r"\b(\d{5,8})\b", top)
    if fallback:
        return fallback.group(1)
    return None


def find_type_guess(text: str) -> Optional[str]:
    """Guess the job type from keywords in the text (case-insensitive).

    Keywords are checked in a fixed priority order: DELIVERY, COLLECTION,
    EXCHANGE, then SITE together with MOVE (anywhere in the text). Returns
    the matching label, or ``None`` when no keyword is present.
    """
    haystack = text.upper()

    single_keywords = (
        ("DELIVERY", "Delivery"),
        ("COLLECTION", "Collection"),
        ("EXCHANGE", "Exchange"),
    )
    for keyword, label in single_keywords:
        if keyword in haystack:
            return label

    # "SITE MOVE" as a phrase is already covered by both words being present.
    if "SITE" in haystack and "MOVE" in haystack:
        return "Site Move"
    return None


def find_fleet(text: str) -> Optional[str]:
    """Find the fleet number in OCR text.

    Recognises "Fleet", "Fleet No", "Fleet No.", "Fleet Number" and "Fleet #"
    labels followed by a 2-12 character alphanumeric value. Every occurrence
    is examined, and a hit that only captured the label word itself ("No",
    "Number") is skipped instead of being returned as the fleet number.

    Returns:
        The fleet identifier, or ``None`` when none is found.
    """
    patterns = (
        r"\bFleet\s*(?:No\.?|Number)?\s*[:\-]?\s*([A-Z0-9]{2,12})\b",
        r"\bFleet\s*#\s*([A-Z0-9]{2,12})\b",
    )
    # The optional label group can be skipped by the regex engine, in which
    # case the capture group swallows the label word instead of the value.
    label_words = {"NO", "NUMBER"}

    for pattern in patterns:
        for hit in re.finditer(pattern, text, re.IGNORECASE):
            # The capture class admits no whitespace, so the value needs no
            # further normalisation.
            candidate = hit.group(1)
            if candidate.upper() not in label_words:
                return candidate
    return None


def find_hire_ref(text: str) -> Optional[str]:
    """Find the hire / contract reference in OCR text.

    Looks for a "Hire ..." label first, then a "Contract ..." label, and
    returns the first captured value that contains at least one digit.

    Two matching details matter here:
    - "Reference" is listed before "Ref" in the label alternation; with the
      shorter word first the engine matched "Ref" and captured "erence" as
      the value, so "Hire Reference: H123" was never recognised.
    - Every occurrence of a pattern is examined, so an earlier digit-less hit
      (e.g. "Plant Hire Ltd") does not hide a real reference further down.

    Returns:
        The reference, or ``None`` when no candidate contains a digit.
    """
    patterns = (
        r"\bHire\s*(?:Reference|Ref\.?|No\.?|#)?\s*[:\-]?\s*([A-Z0-9\-\/]{3,30})\b",
        r"\bContract\s*(?:No\.?|#)?\s*[:\-]?\s*([A-Z0-9\-\/]{3,30})\b",
    )
    for pattern in patterns:
        for hit in re.finditer(pattern, text, re.IGNORECASE):
            # The capture class admits no whitespace, so the value needs no
            # further normalisation.
            candidate = hit.group(1)
            if re.search(r"\d", candidate):
                return candidate
    return None


def find_customer_site_freeform(text: str) -> Dict[str, Optional[str]]:
    """Pull a customer name and a site name out of free-form OCR text.

    Only the first 120 non-empty (cleaned) lines are scanned. For a line that
    carries a "Customer"/"Account" label (or a "Site" label), the value is the
    rest of that line after the label; if that is empty, the following line is
    used when it has at least 3 characters. Customer values are cut to 120
    characters and site values to 160. A line that supplies one value is not
    examined for the other.

    Returns:
        ``{"customer_text": ..., "site_text": ...}`` with ``None`` for any
        value that was not found.
    """
    rows = [clean(raw) for raw in text.splitlines() if clean(raw)]

    def _labelled_value(idx: int, row: str, label: str, limit: int) -> Optional[str]:
        # Value for `label` taken from this row, else from the next row.
        if not re.search(rf"\b{label}\b", row, re.IGNORECASE):
            return None
        same_line = re.search(rf"\b{label}\b\s*[:\-]?\s*(.+)$", row, re.IGNORECASE)
        if same_line:
            inline = clean(same_line.group(1))
            if inline:
                return inline[:limit]
        if idx + 1 < len(rows) and len(rows[idx + 1]) >= 3:
            return rows[idx + 1][:limit]
        return None

    customer: Optional[str] = None
    site: Optional[str] = None

    for idx, row in enumerate(rows[:120]):
        if customer is None:
            found = _labelled_value(idx, row, "(?:Customer|Account)", 120)
            if found is not None:
                customer = found
                continue

        if site is None:
            found = _labelled_value(idx, row, "Site", 160)
            if found is not None:
                site = found
                continue

        if customer and site:
            break

    return {"customer_text": customer, "site_text": site}


def find_fuel_fraction(text: str) -> Optional[str]:
    """Read the fuel gauge level from OCR text as a decimal string.

    Recognises ``FUEL`` followed by an optional ``:``/``-`` and one of FULL,
    EMPTY or an eighths fraction (1/8 ... 7/8). Matching is case-insensitive.

    Two passes are made:
    1. The original match on the text with all spaces removed (tolerates OCR
       output such as ``F U E L : 1 / 2``). Anything this pass finds is
       returned unchanged.
    2. If that finds nothing, a match on the unmodified text. Removing spaces
       also removes word boundaries, so the first pass cannot see
       ``Diesel Fuel: 3/4`` or ``Fuel: 1/2 AdBlue: Full``; this pass can.

    Returns:
        ``"0"`` ... ``"1"`` (e.g. ``"0.5"`` for 1/2), or ``None`` if no level
        is found.
    """
    levels = {
        "EMPTY": "0",
        "1/8": "0.125",
        "1/4": "0.25",
        "3/8": "0.375",
        "1/2": "0.5",
        "5/8": "0.625",
        "3/4": "0.75",
        "7/8": "0.875",
        "FULL": "1",
    }
    upper = text.upper()

    # Pass 1: space-stripped text (original behaviour).
    hit = re.search(
        r"\bFUEL[:\-]?(FULL|EMPTY|1/8|1/4|3/8|1/2|5/8|3/4|7/8)\b",
        upper.replace(" ", ""),
    )

    if hit is None:
        # Pass 2: original spacing, blanks allowed between tokens on one line.
        gap = r"[ \t]*"
        hit = re.search(
            rf"\bFUEL{gap}[:\-]?{gap}"
            rf"(FULL|EMPTY|[1357]{gap}/{gap}8|[13]{gap}/{gap}4|1{gap}/{gap}2)\b",
            upper,
        )

    if hit is None:
        return None
    # Drop any blanks captured inside a fraction ("1 / 2" -> "1/2").
    return levels.get(re.sub(r"[ \t]", "", hit.group(1)))


def find_adblue_fraction(text: str) -> Optional[str]:
    """Read the AdBlue level from OCR text.

    Recognises ``ADBLUE`` / ``AD-BLUE`` followed by an optional ``:``/``-``
    and one of N/A, NA, FULL, EMPTY, 1/4, 1/2 or 3/4. Matching is
    case-insensitive.

    Two passes are made:
    1. The original matches on the text with all spaces removed (N/A is
       checked before the levels). Anything this pass finds is returned
       unchanged.
    2. If that finds nothing, a match on the unmodified text. Removing spaces
       also removes word boundaries, so the first pass cannot see
       ``Fuel: 1/2 AdBlue: Full``; this pass can.

    Returns:
        ``"NA"``, or a decimal string ``"0"`` ... ``"1"``, or ``None`` if
        nothing is found.
    """
    levels = {
        "EMPTY": "0",
        "1/4": "0.25",
        "1/2": "0.5",
        "3/4": "0.75",
        "FULL": "1",
    }
    upper = text.upper()
    squeezed = upper.replace(" ", "")

    # Pass 1: space-stripped text (original behaviour).
    if re.search(r"\bAD[\-]?BLUE[:\-]?(N/?A)\b", squeezed):
        return "NA"
    hit = re.search(r"\bAD[\-]?BLUE[:\-]?(FULL|EMPTY|1/4|1/2|3/4)\b", squeezed)
    if hit:
        return levels.get(hit.group(1))

    # Pass 2: original spacing, blanks allowed between tokens on one line.
    gap = r"[ \t]*"
    hit = re.search(
        rf"\bAD{gap}-?{gap}BLUE{gap}[:\-]?{gap}"
        rf"(N{gap}/?{gap}A|FULL|EMPTY|[13]{gap}/{gap}4|1{gap}/{gap}2)\b",
        upper,
    )
    if hit is None:
        return None

    value = re.sub(r"[ \t]", "", hit.group(1))
    if value in {"NA", "N/A"}:
        return "NA"
    return levels.get(value)


def extract_buckets(text: str) -> List[str]:
    """List the bucket sizes/types mentioned in the text.

    Each known token is searched for as a whole word, case-insensitively.
    Results come back in the fixed catalogue order below (not in order of
    appearance); numeric sizes are returned as-is and the DIG / DITCH types in
    lower case.
    """
    catalogue = ("300", "450", "600", "750", "900", "1050", "1200", "1500", "DIG", "DITCH")
    named_types = {"DIG", "DITCH"}
    haystack = text.upper()

    return [
        token.lower() if token in named_types else token
        for token in catalogue
        if re.search(rf"\b{re.escape(token)}\b", haystack)
    ]


def extract_machine_checks(text: str) -> Dict[str, str]:
    """Placeholder for machine-check parsing.

    No parsing is implemented yet: ``text`` is ignored and a new empty dict is
    returned on every call.
    """
    no_checks: Dict[str, str] = {}
    return no_checks


# ---------- endpoints ----------
@app.post("/pod/ocr_box")
def pod_ocr_box(req: PodBoxReq):
    """OCR a single box of a PDF page (used for template tuning).

    The box is taken from ``req.box`` when present, otherwise from
    ``req.template.boxes[req.field]``.

    Page number and DPI are resolved per setting as: a non-zero value sent
    explicitly in the request, else the template's value when a template was
    sent, else the default (page 1, 300 dpi). The request model defaults these
    to 1 and 300, so a plain ``req.page_no or template.page_no`` could never
    reach the template; the set of explicitly supplied fields is checked
    instead.

    Returns:
        ``{"field", "text", "box"}`` on success, or an ``{"error": ...}`` dict
        for a missing file, a missing/unknown field, a malformed template box,
        or a non-PDF input.
    """
    path = req.path
    if not path or not os.path.exists(path):
        return {"error": "file_not_found", "path": path}

    ext = os.path.splitext(path)[1].lower().strip(".")

    # Choose box: direct "box" OR "field"+"template"
    if req.box is not None:
        chosen_box = req.box.model_dump()
        chosen_key = "direct_box"
    else:
        if not req.field:
            return {"error": "missing_field_or_box", "message": "Provide either 'box' or 'field' + 'template'."}
        if not req.template or not req.template.boxes:
            return {"error": "missing_template", "message": "Provide 'template.boxes' when using 'field'."}
        if req.field not in req.template.boxes:
            return {"error": "field_not_found", "field": req.field, "available": list(req.template.boxes.keys())[:50]}
        chosen_key = req.field
        chosen_box = req.template.boxes[req.field]

        # Template boxes are untyped client input; report a malformed one the
        # same way as every other request problem instead of raising.
        try:
            for coord in ("x", "y", "w", "h"):
                float(chosen_box[coord])
            int(chosen_box.get("psm", 6))
        except (AttributeError, KeyError, TypeError, ValueError):
            return {
                "error": "invalid_box",
                "field": req.field,
                "message": "Template box must provide numeric x, y, w, h (and an integer psm if given).",
            }

    # PDF only for now; template boxes are in PDF points.
    if ext != "pdf":
        return {"error": "not_pdf", "message": "ocr_box supports PDF only (template boxes are in PDF points)."}

    supplied = req.model_fields_set

    def _setting(name: str, default: int) -> int:
        # Explicit non-zero request value > template value > default.
        if name in supplied and getattr(req, name):
            return getattr(req, name)
        if req.template is not None:
            return getattr(req.template, name)
        return default

    use_page = _setting("page_no", 1)
    use_dpi = _setting("dpi", 300)

    doc = fitz.open(path)
    try:
        text = ocr_one_box_from_pdf(doc, page_no_1based=use_page, box=chosen_box, dpi=use_dpi)
    finally:
        doc.close()

    return {"field": chosen_key, "text": clean(text), "box": chosen_box}


@app.post("/pod/extract")
def pod_extract(req: PodReq):
    """Extract POD fields from a PDF or image via OCR.

    Baseline: the first PDF page (or the whole image) is OCR'd and the text is
    run through the regex finders. Template: when the input is a PDF and the
    request carries template boxes, each box is OCR'd as well and any
    non-empty box text replaces the corresponding baseline value.

    The response always carries every field (empty string when not found), a
    weighted confidence capped at 0.999, and ``needs_review`` = 1 when the
    confidence is below 0.75. A missing file yields an error dict instead.
    """
    path = req.path
    if not (path and os.path.exists(path)):
        return {"error": "file_not_found", "path": path, "needs_review": 1, "confidence": 0.0}

    is_pdf = os.path.splitext(path)[1].lower().strip(".") == "pdf"

    template_used = False
    box_results: Dict[str, str] = {}

    if is_pdf:
        doc = fitz.open(path)
        try:
            raw_text = ocr_text_from_pdf(doc, max_pages=1, dpi=300)

            template = req.template
            if template and template.boxes:
                box_results = ocr_boxes_from_pdf(
                    doc=doc,
                    page_no_1based=template.page_no,
                    boxes=template.boxes,
                    dpi=template.dpi,
                )
                template_used = True
        finally:
            doc.close()
    else:
        raw_text = ocr_text_from_image(path)

    # Baseline values from the full-page text.
    values = {
        "pod_number": find_pod_number(raw_text),
        "type_guess": find_type_guess(raw_text),
        "fleet_text": find_fleet(raw_text),
        "hire_ref_text": find_hire_ref(raw_text),
    }
    customer_site = find_customer_site_freeform(raw_text)
    values["customer_text"] = customer_site.get("customer_text")
    values["site_text"] = customer_site.get("site_text")
    values["fuel"] = find_fuel_fraction(raw_text)
    values["adblue"] = find_adblue_fraction(raw_text)

    buckets = extract_buckets(raw_text)
    machine = extract_machine_checks(raw_text)

    # Template override: a non-empty box reading wins over the baseline value.
    if template_used and box_results:
        overridable = (
            "pod_number",
            "customer_text",
            "site_text",
            "fleet_text",
            "hire_ref_text",
            "fuel",
            "adblue",
        )
        for key in overridable:
            from_box = clean(box_results.get(key, ""))
            if from_box:
                values[key] = from_box

    # Confidence: fixed weight per field that ended up with a value.
    weights = (
        ("pod_number", 0.40),
        ("customer_text", 0.15),
        ("site_text", 0.15),
        ("fleet_text", 0.15),
        ("type_guess", 0.10),
        ("hire_ref_text", 0.05),
    )
    conf = 0.0
    for key, weight in weights:
        if values[key]:
            conf += weight
    conf = min(conf, 0.999)

    response = {
        "extraction_method": "ocr_template" if template_used else "ocr_baseline",
        "raw_text": raw_text,
    }
    for key in ("pod_number", "customer_text", "site_text", "fleet_text", "type_guess", "hire_ref_text", "fuel", "adblue"):
        response[key] = values[key] or ""
    response["buckets"] = buckets
    response["machine"] = machine
    response["template_used"] = template_used
    response["box_results"] = box_results
    response["confidence"] = round(conf, 3)
    response["needs_review"] = 1 if conf < 0.75 else 0
    return response


@app.post("/pod/split")
def pod_split(req: SplitReq):
    """
    Split a multi-page PDF into one PDF per page.

    Pages are written to ``req.out_dir`` (created if missing) as
    ``page_001.pdf``, ``page_002.pdf``, ... When ``req.avoid_overwrite`` is
    true and a target already exists, the page is written as
    ``page_NNN_<HHMMSS>.pdf``; if that name is taken as well, a numeric suffix
    is appended until an unused name is found, so an existing file is never
    replaced.

    Returns absolute file paths of the generated single-page PDFs, or an
    error dict when the source file does not exist.
    """
    if not req.path or not os.path.exists(req.path):
        return {"error": "file_not_found", "path": req.path}

    # Anchor a relative out_dir to the current working directory so the
    # returned paths are absolute, as documented.
    out_dir = Path(req.out_dir).absolute()
    out_dir.mkdir(parents=True, exist_ok=True)

    doc = fitz.open(req.path)
    out_paths: List[str] = []
    try:
        for i in range(len(doc)):
            stem = f"page_{i+1:03d}"
            out_path = out_dir / f"{stem}.pdf"

            # Avoid overwrite if requested
            if req.avoid_overwrite and out_path.exists():
                stamp = datetime.now().strftime("%H%M%S")
                out_path = out_dir / f"{stem}_{stamp}.pdf"
                # The timestamp only has one-second resolution, so the
                # stamped name can be taken too; keep going until it is free.
                serial = 2
                while out_path.exists():
                    out_path = out_dir / f"{stem}_{stamp}_{serial}.pdf"
                    serial += 1

            new_doc = fitz.open()
            try:
                new_doc.insert_pdf(doc, from_page=i, to_page=i)
                new_doc.save(str(out_path))
            finally:
                # Release the per-page document even when the save fails.
                new_doc.close()

            out_paths.append(str(out_path))
    finally:
        doc.close()

    return {"ok": True, "count": len(out_paths), "paths": out_paths}