import json
import re
from datetime import datetime
from pathlib import Path

from .structure import _parse_sections_with_body
from ._constants import _MATH_SYMBOLS_RE, _EXERCISE_TRIGGER_RE, _MATH_HDR_RE


def build_report(
    stem: str,
    out_dir: Path,
    clean_text: str,
    t_stats: dict,
    profile: dict,
    reduction: float,
) -> Path:
    """Write a JSON quality report for a cleaned document and return its path.

    The report combines section-length statistics, structural anomalies
    (bare, short and long sections) and a scan for leftover artefacts
    ("residui") in the cleaned text.

    Args:
        stem: Identifier stored verbatim under the ``"stem"`` key.
        out_dir: Directory in which ``report.json`` is written. It is not
            created here, so it must already exist.
        clean_text: The cleaned document text to analyse.
        t_stats: Mapping merged into the ``"transforms"`` entry.
        profile: Object stored verbatim under the ``"structure"`` key.
        reduction: Value stored, rounded to an integer, as
            ``"riduzione_pct"``.

    Returns:
        The path of the written ``report.json`` file.

    Note:
        Every residue scan stops after ``max_n`` (default 10) hits, so the
        per-residue counts in the report are capped at that value.
    """
    text_lines = clean_text.split("\n")
    # Sections are consumed as (header, body) pairs.
    # NOTE(review): the meaning of the literal 3 is defined by
    # _parse_sections_with_body (presumably a heading level) - confirm.
    sections = _parse_sections_with_body(clean_text, 3)
    lengths = [len(body) for _, body in sections]

    # Compiled once instead of being re-resolved for every scanned line.
    header_line_re = re.compile(r"^#+ ")
    bare_header_re = re.compile(r"^### \d+\.\s*$")

    def _pct(data: list[int], p: float) -> int:
        """Return the element at fraction *p* of sorted *data* (0 if empty)."""
        if not data:
            return 0
        ordered = sorted(data)
        # Clamp the index so p == 1.0 still addresses the last element.
        return ordered[max(0, min(len(ordered) - 1, int(len(ordered) * p)))]

    distribution = {
        "min": min(lengths) if lengths else 0,
        "p25": _pct(lengths, 0.25),
        "mediana": _pct(lengths, 0.50),
        "p75": _pct(lengths, 0.75),
        "max": max(lengths) if lengths else 0,
    }

    # Numbered level-3 headers ("### 12.") followed by almost no body text.
    bare_hdrs = [
        {"header": hdr, "corpo_inizio": body[:120].replace("\n", " ")}
        for hdr, body in sections
        if bare_header_re.match(hdr) and len(body.strip()) < 30
    ]
    short_secs = [
        {"header": hdr, "chars": length, "testo": body[:80].replace("\n", " ")}
        for (hdr, body), length in zip(sections, lengths)
        if 0 < length < 150
    ]
    long_secs = [
        {"header": hdr, "chars": length}
        for (hdr, _), length in zip(sections, lengths)
        if length > 1500
    ]

    def _scan(pattern: str, max_n: int = 10) -> list[dict]:
        """Collect up to *max_n* non-header lines that match *pattern*."""
        rx = re.compile(pattern)
        hits: list[dict] = []
        for lineno, line in enumerate(text_lines, start=1):
            if rx.search(line) and not header_line_re.match(line):
                hits.append({"riga": lineno, "testo": line.strip()[:120]})
                if len(hits) >= max_n:
                    break
        return hits

    def _scan_formula_headers(max_n: int = 10) -> list[dict]:
        """Collect up to *max_n* long headers that look like formulas/exercises."""
        hits: list[dict] = []
        for lineno, line in enumerate(text_lines, start=1):
            m = _MATH_HDR_RE.match(line)
            if not m:
                continue
            body = m.group(2)
            if len(body) <= 100:
                continue
            has_math = len(_MATH_SYMBOLS_RE.findall(body)) >= 3
            has_ex = bool(_EXERCISE_TRIGGER_RE.search(body))
            if has_math or has_ex:
                hits.append({"riga": lineno, "testo": line.strip()[:120]})
                if len(hits) >= max_n:
                    break
        return hits

    # Residue name -> regex. Insertion order drives the key order of the report.
    residue_patterns = {
        "backtick": r"`",
        # NOTE(review): this literal was split by a raw line break in the
        # reviewed text (most likely a "<br>" tag that got rendered). The
        # pattern accepts dot-space leaders with an optional <br> after each
        # dot - confirm against the intended pattern.
        "dotleader": r"(?:\. (?:<br\s*/?>)?){5,}",
        "url": r"^(https?://|www\.)\S+",
        "immagini": r"!\[[^\]]*\]\([^)]*\)",
        # NOTE(review): the literal contained only a raw line break, which can
        # never match because the text is scanned line by line; reconstructed
        # as an inline <br> tag matcher from the key name - confirm.
        "br_inline": r"<br\s*/?>",
        "simboli_encoding": r'(?<=[0-9A-Za-z])[!"](?=[0-9A-Za-z])',
        "formule_inline": r"\[\d+\.\d+\]",
        "footnote_markers": r"[¹²³⁰⁴-⁹]",
        # NOTE(review): the class appeared as "[-]" (a literal hyphen only).
        # Written with explicit escapes for the BMP Private Use Area so the
        # range survives editors/encodings - confirm the intended range.
        "pua_markers": r"[\uE000-\uF8FF]",
    }
    residui = {name: _scan(rx) for name, rx in residue_patterns.items()}
    formula_hits = _scan_formula_headers()

    # Counts first, then the example lists, then the formula-header pair.
    residui_report: dict = {name: len(hits) for name, hits in residui.items()}
    residui_report.update(
        {f"{name}_esempi": hits for name, hits in residui.items()}
    )
    residui_report["formula_headers"] = len(formula_hits)
    residui_report["formula_headers_esempi"] = formula_hits

    report = {
        "stem": stem,
        "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M"),
        "transforms": {
            **t_stats,
            "riduzione_pct": round(reduction),
        },
        "structure": profile,
        "distribution": distribution,
        "anomalie": {
            "bare_headers": len(bare_hdrs),
            "short_sections": len(short_secs),
            "long_sections": len(long_secs),
            "bare_headers_list": bare_hdrs,
            "short_sections_list": short_secs,
            "long_sections_list": long_secs,
        },
        "residui": residui_report,
    }

    report_path = out_dir / "report.json"
    report_path.write_text(
        json.dumps(report, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )
    return report_path