conversione/_pipeline/report.py

import json
import re
from datetime import datetime
from pathlib import Path

from .structure import _parse_sections_with_body
from ._constants import _MATH_SYMBOLS_RE, _EXERCISE_TRIGGER_RE, _MATH_HDR_RE


def build_report(
    stem: str,
    out_dir: Path,
    clean_text: str,
    t_stats: dict,
    profile: dict,
    reduction: float,
) -> Path:
    """Build a quality report for a converted document and write it as JSON.

    The report collects:

    * ``transforms``: the entries of ``t_stats`` plus ``riduzione_pct``
      (``reduction`` rounded to an integer);
    * ``structure``: ``profile``, passed through unchanged;
    * ``distribution``: min / p25 / median / p75 / max of the section body
      lengths (in characters);
    * ``anomalie``: bare numbered headers, short sections (< 150 chars) and
      long sections (> 1500 chars), each as a count plus a list;
    * ``residui``: per-line scans for leftover artifacts (backticks, dot
      leaders, URLs, images, ``<br>``, ...), each as a count plus examples.

    Note that every residue scan stops after 10 hits, so the residue counts
    are capped at 10; they are not totals for the whole document.

    Args:
        stem: Document identifier stored verbatim under ``"stem"``.
        out_dir: Directory that receives ``report.json``. It is not created
            here, so it must already exist.
        clean_text: The cleaned Markdown text to analyse.
        t_stats: Transformation statistics merged into ``"transforms"``.
        profile: Structure profile stored under ``"structure"``.
        reduction: Size reduction; rounded and stored as ``riduzione_pct``.

    Returns:
        The path of the written ``report.json`` file.
    """
    text_lines = clean_text.split("\n")
    # Markdown heading lines are excluded from the generic residue scans.
    heading_re = re.compile(r"^#+ ")

    # Iterated as (header, body) pairs; the second argument is presumably the
    # heading level to split on -- confirm against `structure`.
    sections = _parse_sections_with_body(clean_text, 3)
    lengths = [len(body) for _, body in sections]
    ordered = sorted(lengths)  # sorted once, shared by every percentile

    def _pct(p: float) -> int:
        """Return the element at fraction ``p`` of the sorted lengths (0 if empty)."""
        if not ordered:
            return 0
        # Clamp the index so p == 1.0 (or slightly above) stays in range.
        idx = max(0, min(len(ordered) - 1, int(len(ordered) * p)))
        return ordered[idx]

    distribution = {
        "min":     ordered[0] if ordered else 0,
        "p25":     _pct(0.25),
        "mediana": _pct(0.50),
        "p75":     _pct(0.75),
        "max":     ordered[-1] if ordered else 0,
    }

    # Headers that are only "### <number>." followed by almost no body text.
    bare_hdrs = [
        {"header": hdr, "corpo_inizio": body[:120].replace("\n", " ")}
        for hdr, body in sections
        if re.match(r"^### \d+\.\s*$", hdr) and len(body.strip()) < 30
    ]
    # Non-empty sections with a suspiciously short body.
    short_secs = [
        {"header": hdr, "chars": length, "testo": body[:80].replace("\n", " ")}
        for (hdr, body), length in zip(sections, lengths)
        if 0 < length < 150
    ]
    long_secs = [
        {"header": hdr, "chars": length}
        for (hdr, _), length in zip(sections, lengths)
        if length > 1500
    ]

    def _scan(pattern: str, max_n: int = 10) -> list[dict]:
        """Collect up to ``max_n`` non-heading lines matching ``pattern``.

        Each hit records the 1-based line number and the stripped line,
        truncated to 120 characters.
        """
        compiled = re.compile(pattern)  # compile once, not once per line
        hits: list[dict] = []
        for lineno, line in enumerate(text_lines, start=1):
            if compiled.search(line) and not heading_re.match(line):
                hits.append({"riga": lineno, "testo": line.strip()[:120]})
                if len(hits) >= max_n:
                    break
        return hits

    def _scan_formula_headers(max_n: int = 10) -> list[dict]:
        """Collect up to ``max_n`` long header lines that look like formulas or exercises.

        A line qualifies when ``_MATH_HDR_RE`` matches it, its second capture
        group is longer than 100 characters, and that group either contains
        at least three ``_MATH_SYMBOLS_RE`` matches or triggers
        ``_EXERCISE_TRIGGER_RE``.
        """
        hits: list[dict] = []
        for lineno, line in enumerate(text_lines, start=1):
            m = _MATH_HDR_RE.match(line)
            if not m:
                continue
            body = m.group(2)
            if len(body) <= 100:
                continue
            has_math = len(_MATH_SYMBOLS_RE.findall(body)) >= 3
            has_ex = bool(_EXERCISE_TRIGGER_RE.search(body))
            if has_math or has_ex:
                hits.append({"riga": lineno, "testo": line.strip()[:120]})
                if len(hits) >= max_n:
                    break
        return hits

    # Residue name -> per-line regex. Insertion order defines the key order
    # of the "residui" section in the JSON output.
    line_patterns = {
        "backtick":         r"`",
        "dotleader":        r"(?:\. ){5,}",
        "url":              r"^(https?://|www\.)\S+",
        "immagini":         r"!\[[^\]]*\]\([^)]*\)",
        "br_inline":        r"<br>",
        "simboli_encoding": r'(?<=[0-9A-Za-z])[!"](?=[0-9A-Za-z])',
        "formule_inline":   r"\[\d+\.\d+\]",
        "footnote_markers": r'[¹²³⁰⁴-⁹]',
        # BMP Private Use Area, written with explicit escapes: literal PUA
        # characters are invisible and easily lost, which degrades the class
        # to "[-]" and makes it match every plain hyphen.
        "pua_markers":      r"[\uE000-\uF8FF]",
    }
    residui = {name: _scan(pattern) for name, pattern in line_patterns.items()}
    residui["formula_headers"] = _scan_formula_headers()

    # Counts first, then the example lists, then the formula-header pair.
    residui_report: dict = {name: len(residui[name]) for name in line_patterns}
    residui_report.update(
        {f"{name}_esempi": residui[name] for name in line_patterns}
    )
    residui_report["formula_headers"] = len(residui["formula_headers"])
    residui_report["formula_headers_esempi"] = residui["formula_headers"]

    report = {
        "stem":      stem,
        "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M"),
        "transforms": {
            **t_stats,
            "riduzione_pct": round(reduction),
        },
        "structure":    profile,
        "distribution": distribution,
        "anomalie": {
            "bare_headers":        len(bare_hdrs),
            "short_sections":      len(short_secs),
            "long_sections":       len(long_secs),
            "bare_headers_list":   bare_hdrs,
            "short_sections_list": short_secs,
            "long_sections_list":  long_secs,
        },
        "residui": residui_report,
    }

    report_path = out_dir / "report.json"
    report_path.write_text(
        json.dumps(report, ensure_ascii=False, indent=2), encoding="utf-8"
    )
    return report_path
feat: integra pipeline PDF→Markdown a 9 stadi e test suite 2026-05-11 14:46:16 +02:00			`import json`
			`import re`
			`from datetime import datetime`
			`from pathlib import Path`

			`from .structure import _parse_sections_with_body`
			`from ._constants import _MATH_SYMBOLS_RE, _EXERCISE_TRIGGER_RE, _MATH_HDR_RE`


			`def build_report(`
			`stem: str,`
			`out_dir: Path,`
			`clean_text: str,`
			`t_stats: dict,`
			`profile: dict,`
			`reduction: float,`
			`) -> Path:`
			`text_lines = clean_text.split("\n")`

			`sections = _parse_sections_with_body(clean_text, 3)`
			`lengths = [len(body) for _, body in sections]`

			`def _pct(data: list[int], p: float) -> int:`
			`if not data:`
			`return 0`
			`s = sorted(data)`
			`return s[max(0, min(len(s) - 1, int(len(s) * p)))]`

			`distribution = {`
			`"min": min(lengths) if lengths else 0,`
			`"p25": _pct(lengths, 0.25),`
			`"mediana": _pct(lengths, 0.50),`
			`"p75": _pct(lengths, 0.75),`
			`"max": max(lengths) if lengths else 0,`
			`}`

			`bare_hdrs = [`
			`{"header": hdr, "corpo_inizio": body[:120].replace("\n", " ")}`
			`for hdr, body in sections`
			`if re.match(r"^### \d+\.\s*$", hdr) and len(body.strip()) < 30`
			`]`
			`short_secs = [`
			`{"header": hdr, "chars": length, "testo": body[:80].replace("\n", " ")}`
			`for (hdr, body), length in zip(sections, lengths)`
			`if 0 < length < 150`
			`]`
			`long_secs = [`
			`{"header": hdr, "chars": length}`
			`for (hdr, _), length in zip(sections, lengths)`
			`if length > 1500`
			`]`

			`def _scan(pattern: str, max_n: int = 10) -> list[dict]:`
			`hits = []`
			`for i, line in enumerate(text_lines):`
			`if re.search(pattern, line) and not re.match(r"^#+ ", line):`
			`hits.append({"riga": i + 1, "testo": line.strip()[:120]})`
			`if len(hits) >= max_n:`
			`break`
			`return hits`

			`def _scan_formula_headers(max_n: int = 10) -> list[dict]:`
			`hits = []`
			`for i, line in enumerate(text_lines):`
			`m = _MATH_HDR_RE.match(line)`
			`if not m:`
			`continue`
			`body = m.group(2)`
			`if len(body) <= 100:`
			`continue`
			`has_math = len(_MATH_SYMBOLS_RE.findall(body)) >= 3`
			`has_ex = bool(_EXERCISE_TRIGGER_RE.search(body))`
			`if has_math or has_ex:`
			`hits.append({"riga": i + 1, "testo": line.strip()[:120]})`
			`if len(hits) >= max_n:`
			`break`
			`return hits`

			`residui = {`
			"backtick": _scan(r"`"),
			`"dotleader": _scan(r"(?:\. ){5,}"),`
			`"url": _scan(r"^(https?://\|www\.)\S+"),`
			`"immagini": _scan(r"!\[[^\]]\]\([^)]\)"),`
			`"br_inline": _scan(r"<br>"),`
			`"simboli_encoding": _scan(r'(?<=[0-9A-Za-z])[!"](?=[0-9A-Za-z])'),`
			`"formule_inline": _scan(r"\[\d+\.\d+\]"),`
			`"footnote_markers": _scan(r'[¹²³⁰⁴-⁹]'),`
			`"pua_markers": _scan(r'[-]'),`
			`"formula_headers": _scan_formula_headers(),`
			`}`

			`report = {`
			`"stem": stem,`
			`"timestamp": datetime.now().strftime("%Y-%m-%d %H:%M"),`
			`"transforms": {`
			`**t_stats,`
			`"riduzione_pct": round(reduction),`
			`},`
			`"structure": profile,`
			`"distribution": distribution,`
			`"anomalie": {`
			`"bare_headers": len(bare_hdrs),`
			`"short_sections": len(short_secs),`
			`"long_sections": len(long_secs),`
			`"bare_headers_list": bare_hdrs,`
			`"short_sections_list": short_secs,`
			`"long_sections_list": long_secs,`
			`},`
			`"residui": {`
			`"backtick": len(residui["backtick"]),`
			`"dotleader": len(residui["dotleader"]),`
			`"url": len(residui["url"]),`
			`"immagini": len(residui["immagini"]),`
			`"br_inline": len(residui["br_inline"]),`
			`"simboli_encoding": len(residui["simboli_encoding"]),`
			`"formule_inline": len(residui["formule_inline"]),`
			`"footnote_markers": len(residui["footnote_markers"]),`
			`"pua_markers": len(residui["pua_markers"]),`
			`"backtick_esempi": residui["backtick"],`
			`"dotleader_esempi": residui["dotleader"],`
			`"url_esempi": residui["url"],`
			`"immagini_esempi": residui["immagini"],`
			`"br_inline_esempi": residui["br_inline"],`
			`"simboli_encoding_esempi": residui["simboli_encoding"],`
			`"formule_inline_esempi": residui["formule_inline"],`
			`"footnote_markers_esempi": residui["footnote_markers"],`
			`"pua_markers_esempi": residui["pua_markers"],`
			`"formula_headers": len(residui["formula_headers"]),`
			`"formula_headers_esempi": residui["formula_headers"],`
			`},`
			`}`

			`report_path = out_dir / "report.json"`
			`report_path.write_text(json.dumps(report, ensure_ascii=False, indent=2), encoding="utf-8")`
			`return report_path`