"""
docTR OCR + light text cleaning.
"""

import asyncio
import logging
import re
import tempfile
from pathlib import Path

from app.utils.helpers import (
    normalize_ocr_units,
    normalize_range_string,
    remove_symbol_garbage_only,
)

# Logger named "lab_analyzer"; the OCR and cleaning functions below log through it.
logger = logging.getLogger("lab_analyzer")

# Cached docTR predictor. Stays None until _get_predictor() builds it on first use.
_predictor = None


def _get_predictor():
    """Return the shared docTR OCR predictor, creating it on the first call.

    The ``doctr`` import is deferred to the first call, so merely importing
    this module does not import ``doctr.models``. Once built, the predictor
    is stored in the module-level ``_predictor`` and returned as-is on every
    later call.
    """
    global _predictor
    if _predictor is not None:
        return _predictor

    from doctr.models import ocr_predictor as build_predictor

    logger.info("[OCR] Loading docTR model...")
    _predictor = build_predictor(pretrained=True)
    logger.info("[OCR] docTR model ready")
    return _predictor


def _run_ocr_on_path(file_path: str) -> str:
    """Run docTR OCR on a file on disk and return the recognised text.

    A path whose extension is ``.pdf`` (case-insensitive) is loaded with
    ``DocumentFile.from_pdf``; any other path is handed to
    ``DocumentFile.from_images``. The words of each OCR line are joined with
    single spaces; words with an empty value and lines that end up empty are
    dropped.

    Args:
        file_path: Path of the PDF or image file to read.

    Returns:
        The recognised lines joined with newline characters (an empty string
        when nothing was recognised).
    """
    from doctr.io import DocumentFile

    is_pdf = Path(file_path).suffix.lower() == ".pdf"
    load_document = DocumentFile.from_pdf if is_pdf else DocumentFile.from_images
    document = load_document(file_path)

    ocr_result = _get_predictor()(document)
    pages = ocr_result.pages
    page_count = len(pages)
    logger.info("[OCR] Processing %d page(s)", page_count)

    # Flatten pages -> blocks -> lines into one stream of joined line texts.
    joined_lines = (
        " ".join(word.value for word in ocr_line.words if word.value).strip()
        for page in pages
        for block in page.blocks
        for ocr_line in block.lines
    )
    lines = [text for text in joined_lines if text]

    raw_text = "\n".join(lines)
    logger.info(
        "[OCR] Extracted %d lines from %d pages (%d chars)",
        len(lines),
        page_count,
        len(raw_text),
    )
    return raw_text


def light_clean_ocr_text(raw_text: str) -> str:
    """Apply a light, line-by-line cleanup to raw OCR text.

    Line endings are unified to newline characters first. Each line is then
    passed through ``remove_symbol_garbage_only``, has runs of spaces/tabs
    collapsed to a single space and is stripped; lines that are empty at that
    point are discarded. Surviving lines go through ``normalize_ocr_units``
    and then ``normalize_range_string`` (helpers from ``app.utils.helpers``).

    Args:
        raw_text: Text as produced by OCR; may be empty or ``None``.

    Returns:
        The cleaned lines joined with newline characters, or ``""`` when
        ``raw_text`` is falsy.
    """
    if not raw_text:
        return ""

    unified = raw_text.replace("\r\n", "\n").replace("\r", "\n")

    kept: list[str] = []
    for raw_line in unified.split("\n"):
        squeezed = re.sub(
            r"[ \t]+", " ", remove_symbol_garbage_only(raw_line)
        ).strip()
        if squeezed:
            kept.append(normalize_range_string(normalize_ocr_units(squeezed)))

    cleaned = "\n".join(kept)
    # The "before" figure counts newline characters in the untouched input.
    source_line_count = raw_text.count("\n") + 1
    logger.info(
        "[CLEANING] %d -> %d lines (%d chars)",
        source_line_count,
        len(kept),
        len(cleaned),
    )
    return cleaned


async def extract_ocr_from_file(file_bytes: bytes, filename: str) -> str:
    """Run OCR on an uploaded file and return the raw recognised text.

    The bytes are written to a temporary file carrying the upload's extension,
    and ``_run_ocr_on_path`` is run on that file in the event loop's default
    executor so the OCR work does not block the loop. The temporary file is
    always removed afterwards, including when writing it fails.

    Args:
        file_bytes: Raw content of the uploaded PDF or image.
        filename: Original file name; only its extension is used. A name
            without an extension (or a missing name) is treated as ``.png``.

    Returns:
        The text returned by ``_run_ocr_on_path``.

    Raises:
        ValueError: If the extension is not one of the supported types.
    """
    # `filename or ""` keeps a missing name from crashing Path(); it then
    # falls back to ".png" exactly like an extension-less name does.
    suffix = Path(filename or "").suffix.lower() or ".png"
    allowed = {".pdf", ".png", ".jpg", ".jpeg", ".webp", ".tif", ".tiff"}
    if suffix not in allowed:
        raise ValueError(f"Unsupported file type: {suffix}. Use PDF or image.")

    # delete=False: the file must outlive this handle so the worker thread can
    # reopen it by path; removal is done explicitly in the `finally` below.
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
    tmp_path = tmp.name
    try:
        # Writing inside the try block ensures a failed write (bad payload,
        # full disk) does not leave the temporary file behind.
        with tmp:
            tmp.write(file_bytes)

        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, _run_ocr_on_path, tmp_path)
    finally:
        Path(tmp_path).unlink(missing_ok=True)
