# rag-from-scratch/conversione/_pipeline/report.py

import json
import re
from datetime import datetime
from pathlib import Path

from .structure import _parse_sections_with_body
from ._constants import _MATH_SYMBOLS_RE, _EXERCISE_TRIGGER_RE, _MATH_HDR_RE


def build_report(
    stem: str,
    out_dir: Path,
    clean_text: str,
    t_stats: dict,
    profile: dict,
    reduction: float,
) -> Path:
    """Write a JSON quality report for a cleaned document and return its path.

    The report is written to ``out_dir / "report.json"`` (UTF-8, indented,
    non-ASCII kept verbatim) and contains:

    * ``stem`` and a local-time ``timestamp`` (``YYYY-MM-DD HH:MM``);
    * ``transforms``: ``t_stats`` plus ``riduzione_pct`` (``reduction``
      rounded to an integer);
    * ``structure``: ``profile`` passed through unchanged;
    * ``distribution``: min / p25 / median / p75 / max of the section body
      lengths (in characters);
    * ``anomalie``: bare numbered headers, short sections (< 150 chars) and
      long sections (> 1500 chars), each with a count and a detail list;
    * ``residui``: leftover artifacts found by line-level regex scans, each
      with a count and a list of example lines.

    Note that every residue scan stops after 10 hits, so the residue counts
    are capped at 10 rather than being document-wide totals.

    Args:
        stem: Identifier of the document, stored as-is in the report.
        out_dir: Directory that receives ``report.json``; it is not created
            here, so it must already exist.
        clean_text: The cleaned text to analyse.
        t_stats: Transformation statistics merged into ``transforms``.
        profile: Structure profile stored under ``structure``.
        reduction: Size reduction value, rounded before being stored.

    Returns:
        The path of the written ``report.json`` file.
    """
    text_lines = clean_text.split("\n")

    # NOTE(review): assumes _parse_sections_with_body returns a re-iterable
    # sequence of (header, body) pairs — it is iterated several times below.
    sections = _parse_sections_with_body(clean_text, 3)
    lengths = [len(body) for _, body in sections]
    # Sort once; every percentile lookup reuses the same ordered list.
    sorted_lengths = sorted(lengths)

    def _pct(p: float) -> int:
        """Return the element at rank ``int(n * p)`` (clamped) or 0 if empty."""
        if not sorted_lengths:
            return 0
        idx = int(len(sorted_lengths) * p)
        return sorted_lengths[max(0, min(len(sorted_lengths) - 1, idx))]

    distribution = {
        "min":     sorted_lengths[0] if sorted_lengths else 0,
        "p25":     _pct(0.25),
        "mediana": _pct(0.50),
        "p75":     _pct(0.75),
        "max":     sorted_lengths[-1] if sorted_lengths else 0,
    }

    # Headers that are just a number ("### 12.") followed by almost no body.
    bare_hdr_re = re.compile(r"^### \d+\.\s*$")
    bare_hdrs = [
        {"header": hdr, "corpo_inizio": body[:120].replace("\n", " ")}
        for hdr, body in sections
        if bare_hdr_re.match(hdr) and len(body.strip()) < 30
    ]
    short_secs = [
        {"header": hdr, "chars": length, "testo": body[:80].replace("\n", " ")}
        for (hdr, body), length in zip(sections, lengths)
        if 0 < length < 150
    ]
    long_secs = [
        {"header": hdr, "chars": length}
        for (hdr, _), length in zip(sections, lengths)
        if length > 1500
    ]

    md_header_re = re.compile(r"^#+ ")

    def _scan(pattern: str, max_n: int = 10) -> list[dict]:
        """Collect up to ``max_n`` non-header lines that match ``pattern``."""
        rx = re.compile(pattern)  # compiled once per scan, not once per line
        hits = []
        for lineno, line in enumerate(text_lines, start=1):
            if rx.search(line) and not md_header_re.match(line):
                hits.append({"riga": lineno, "testo": line.strip()[:120]})
                if len(hits) >= max_n:
                    break
        return hits

    def _scan_formula_headers(max_n: int = 10) -> list[dict]:
        """Collect up to ``max_n`` lines matching ``_MATH_HDR_RE`` whose second
        group is longer than 100 chars and contains at least three math-symbol
        matches or an exercise trigger."""
        hits = []
        for lineno, line in enumerate(text_lines, start=1):
            m = _MATH_HDR_RE.match(line)
            if not m:
                continue
            body = m.group(2)
            if len(body) <= 100:
                continue
            has_math = len(_MATH_SYMBOLS_RE.findall(body)) >= 3
            has_ex = bool(_EXERCISE_TRIGGER_RE.search(body))
            if has_math or has_ex:
                hits.append({"riga": lineno, "testo": line.strip()[:120]})
                if len(hits) >= max_n:
                    break
        return hits

    # Insertion order here defines the key order in the written report.
    scans = {
        "backtick":         _scan(r"`"),
        "dotleader":        _scan(r"(?:\. ){5,}"),
        "url":              _scan(r"^(https?://|www\.)\S+"),
        "immagini":         _scan(r"!\[[^\]]*\]\([^)]*\)"),
        "br_inline":        _scan(r"<br>"),
        "simboli_encoding": _scan(r'(?<=[0-9A-Za-z])[!"](?=[0-9A-Za-z])'),
        "formule_inline":   _scan(r"\[\d+\.\d+\]"),
        "footnote_markers": _scan(r'[¹²³⁰⁴-⁹]'),
        # Private Use Area code points, written as explicit escapes: literal
        # PUA characters are invisible and easily lost, which would leave a
        # bare "[-]" class that matches every ASCII hyphen.
        "pua_markers":      _scan(r"[\uE000-\uF8FF]"),
    }
    formula_headers = _scan_formula_headers()

    # Layout of the "residui" section: all counts first, then all example
    # lists, then the formula-header count and examples.
    residui = {key: len(hits) for key, hits in scans.items()}
    residui.update({f"{key}_esempi": hits for key, hits in scans.items()})
    residui["formula_headers"] = len(formula_headers)
    residui["formula_headers_esempi"] = formula_headers

    report = {
        "stem":      stem,
        "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M"),
        "transforms": {
            **t_stats,
            "riduzione_pct": round(reduction),
        },
        "structure":    profile,
        "distribution": distribution,
        "anomalie": {
            "bare_headers":        len(bare_hdrs),
            "short_sections":      len(short_secs),
            "long_sections":       len(long_secs),
            "bare_headers_list":   bare_hdrs,
            "short_sections_list": short_secs,
            "long_sections_list":  long_secs,
        },
        "residui": residui,
    }

    report_path = out_dir / "report.json"
    report_path.write_text(
        json.dumps(report, ensure_ascii=False, indent=2), encoding="utf-8"
    )
    return report_path