from paddleocr import PaddleOCR

# Shared OCR engine, built once at import time and reused by ocr_blocks().
# NOTE(review): use_angle_cls=True presumably enables PaddleOCR's text-angle
# classifier and lang="en" selects the English models — confirm against the
# installed PaddleOCR version.
ocr = PaddleOCR(use_angle_cls=True, lang="en")


def group_lines(words, y_threshold=12):
    """
    Group OCR words into text lines using vertical proximity.

    Words are visited top to bottom (by the Y coordinate of the first bbox
    point). A word joins the current line when its Y differs from the Y of
    the word most recently added to that line by less than ``y_threshold``;
    otherwise it starts a new line.

    Args:
        words: iterable of dicts with a "bbox" key holding a sequence of
            [x, y] points; only the first point is read here.
        y_threshold: exclusive upper bound on the vertical distance between
            consecutive words of the same line, in bbox units.

    Returns:
        A list of lines ordered top to bottom. Each line is a list of the
        input word dicts ordered left to right.
    """
    lines = []

    for word in sorted(words, key=lambda w: w["bbox"][0][1]):
        y = word["bbox"][0][1]

        # The input is Y-sorted, so every line except the last one already
        # ended at least y_threshold above this word; only the last line
        # can still accept it.
        if lines and abs(lines[-1][-1]["bbox"][0][1] - y) < y_threshold:
            lines[-1].append(word)
        else:
            lines.append([word])

    # Sorting by Y alone scrambles the reading order inside a line whenever
    # words on that line have slightly different Y values. Restore
    # left-to-right order only after grouping so the grouping itself is
    # unaffected.
    for line in lines:
        line.sort(key=lambda w: w["bbox"][0][0])

    return lines


def merge_line(line_words):
    """
    Merge the words of one line into a single text entry.

    The texts are joined with single spaces in the order given, and the
    combined bbox is the axis-aligned rectangle enclosing every corner
    point of every word.

    Args:
        line_words: non-empty list of dicts with "text" (str) and "bbox"
            (sequence of [x, y] points).

    Returns:
        A dict with "text" and "bbox", where bbox is
        [[x1, y1], [x2, y1], [x2, y2], [x1, y2]].

    Raises:
        ValueError: if ``line_words`` is empty.
    """
    if not line_words:
        raise ValueError("merge_line() requires at least one word")

    text = " ".join(w["text"] for w in line_words)

    # Use every corner, not just corners 0 and 2: for skewed or rotated
    # quads the extreme coordinates may sit on any of the points.
    xs = [point[0] for w in line_words for point in w["bbox"]]
    ys = [point[1] for w in line_words for point in w["bbox"]]

    x1, x2 = min(xs), max(xs)
    y1, y2 = min(ys), max(ys)

    return {
        "text": text,
        "bbox": [[x1, y1], [x2, y1], [x2, y2], [x1, y2]],
    }


def ocr_blocks(blocks):
    """
    Run OCR on every block image and return the recognized text lines.

    For each block, the module-level ``ocr`` engine is called on
    ``block["image"]``, the detected words are flattened, grouped into
    lines with ``group_lines`` and merged with ``merge_line``.

    Args:
        blocks: iterable of dicts with an "image" key that is passed
            straight to ``ocr.ocr``.

    Returns:
        A flat list of dicts, one per merged line:
        [
            {
                "text": "full line",
                "bbox": [...],
                "page": 0
            }
        ]
        where "page" is the position of the block within ``blocks``.
    """
    results = []

    for page_index, block in enumerate(blocks):
        ocr_result = ocr.ocr(block["image"], cls=True)

        if not ocr_result:
            continue

        # Flatten the detections into plain word dicts.
        words = []
        for line in ocr_result:
            # The per-image entry can be None or empty when no text was
            # detected (PaddleOCR returns [None] in that case, which the
            # guard above does not catch); iterating it would raise.
            if not line:
                continue

            for word in line:
                text = word[1][0]

                # Ignore detections whose text is blank.
                if not text.strip():
                    continue

                words.append({
                    "text": text,
                    "bbox": word[0]
                })

        # Group the words into lines, then merge each line into one entry.
        for line_words in group_lines(words):
            merged = merge_line(line_words)
            merged["page"] = page_index
            results.append(merged)

    return results