136 lines
5.1 KiB
Python
136 lines
5.1 KiB
Python
|
|
import json
|
||
|
|
import re
|
||
|
|
from datetime import datetime
|
||
|
|
from pathlib import Path
|
||
|
|
|
||
|
|
from .structure import _parse_sections_with_body
|
||
|
|
from ._constants import _MATH_SYMBOLS_RE, _EXERCISE_TRIGGER_RE, _MATH_HDR_RE
|
||
|
|
|
||
|
|
|
||
|
|
def build_report(
    stem: str,
    out_dir: Path,
    clean_text: str,
    t_stats: dict,
    profile: dict,
    reduction: float,
) -> Path:
    """Write ``report.json`` describing a cleaned document and return its path.

    The report combines the caller-supplied transform statistics and structure
    profile with three things computed here from ``clean_text``: a distribution
    of section body lengths, lists of anomalous sections (bare numbered
    headers, very short and very long sections), and samples of leftover
    artefacts found by line-by-line regex scans.

    Args:
        stem: Stored verbatim under the ``"stem"`` key.
        out_dir: Directory that receives ``report.json``. It is not created
            here, so it must already exist.
        clean_text: Text to analyse. It is split on newlines for the residue
            scans and handed to ``_parse_sections_with_body`` for sectioning.
        t_stats: Mapping merged into the ``"transforms"`` entry.
        profile: Stored verbatim under the ``"structure"`` key.
        reduction: Rounded to the nearest integer and stored as
            ``"riduzione_pct"`` inside ``"transforms"``.

    Returns:
        Path of the JSON file that was written (UTF-8, 2-space indent).
    """
    text_lines = clean_text.split("\n")

    # NOTE(review): the literal 3 is presumably the heading level to split on;
    # confirm against _parse_sections_with_body. The result is consumed as a
    # re-iterable sequence of (header, body) pairs.
    sections = _parse_sections_with_body(clean_text, 3)
    lengths = [len(body) for _, body in sections]

    # Compiled once here rather than once per scanned line / section.
    heading_re = re.compile(r"^#+ ")
    bare_header_re = re.compile(r"^### \d+\.\s*$")

    def _pct(data: list[int], p: float) -> int:
        """Return the element at fractional rank ``p`` of ``data`` (0 if empty)."""
        if not data:
            return 0
        ordered = sorted(data)
        # Clamp so that p == 1.0 (or any out-of-range p) stays inside the list.
        return ordered[max(0, min(len(ordered) - 1, int(len(ordered) * p)))]

    distribution = {
        "min": min(lengths) if lengths else 0,
        "p25": _pct(lengths, 0.25),
        "mediana": _pct(lengths, 0.50),
        "p75": _pct(lengths, 0.75),
        "max": max(lengths) if lengths else 0,
    }

    # Numbered level-3 headers ("### 12.") whose body is practically empty.
    bare_hdrs = [
        {"header": hdr, "corpo_inizio": body[:120].replace("\n", " ")}
        for hdr, body in sections
        if bare_header_re.match(hdr) and len(body.strip()) < 30
    ]
    # Non-empty sections shorter than 150 characters.
    short_secs = [
        {"header": hdr, "chars": length, "testo": body[:80].replace("\n", " ")}
        for (hdr, body), length in zip(sections, lengths)
        if 0 < length < 150
    ]
    # Sections longer than 1500 characters.
    long_secs = [
        {"header": hdr, "chars": length}
        for (hdr, _), length in zip(sections, lengths)
        if length > 1500
    ]

    def _scan(pattern: str, max_n: int = 10) -> list[dict]:
        """Collect up to ``max_n`` non-heading lines that match ``pattern``.

        Each hit records the 1-based line number (``"riga"``) and the first
        120 characters of the stripped line (``"testo"``).
        """
        rx = re.compile(pattern)
        hits: list[dict] = []
        for lineno, line in enumerate(text_lines, start=1):
            if rx.search(line) and not heading_re.match(line):
                hits.append({"riga": lineno, "testo": line.strip()[:120]})
                if len(hits) >= max_n:
                    break
        return hits

    def _scan_formula_headers(max_n: int = 10) -> list[dict]:
        """Collect up to ``max_n`` header lines that look like formulas or exercises.

        A line qualifies when ``_MATH_HDR_RE`` matches it, its second capture
        group is longer than 100 characters, and that group either contains at
        least three ``_MATH_SYMBOLS_RE`` matches or triggers
        ``_EXERCISE_TRIGGER_RE``.
        """
        hits: list[dict] = []
        for lineno, line in enumerate(text_lines, start=1):
            m = _MATH_HDR_RE.match(line)
            if not m:
                continue
            body = m.group(2)
            if len(body) <= 100:
                continue
            has_math = len(_MATH_SYMBOLS_RE.findall(body)) >= 3
            has_ex = bool(_EXERCISE_TRIGGER_RE.search(body))
            if has_math or has_ex:
                hits.append({"riga": lineno, "testo": line.strip()[:120]})
                if len(hits) >= max_n:
                    break
        return hits

    residui = {
        "backtick": _scan(r"`"),
        "dotleader": _scan(r"(?:\. ){5,}"),
        "url": _scan(r"^(https?://|www\.)\S+"),
        "immagini": _scan(r"!\[[^\]]*\]\([^)]*\)"),
        "br_inline": _scan(r"<br>"),
        "simboli_encoding": _scan(r'(?<=[0-9A-Za-z])[!"](?=[0-9A-Za-z])'),
        "formule_inline": _scan(r"\[\d+\.\d+\]"),
        "footnote_markers": _scan(r'[¹²³⁰⁴-⁹]'),
        # BMP Private Use Area, spelled with escapes so the range does not
        # depend on invisible literal characters surviving editors/transport.
        # A class reduced to "[-]" would flag every line containing a hyphen.
        "pua_markers": _scan(r"[\ue000-\uf8ff]"),
        "formula_headers": _scan_formula_headers(),
    }

    # Output key order: all counts, then all example lists, then the two
    # formula_headers entries last.
    # NOTE: every count is the number of collected samples, so it is capped by
    # the scanners' max_n (10 by default) rather than being a full tally.
    scanned = [
        "backtick",
        "dotleader",
        "url",
        "immagini",
        "br_inline",
        "simboli_encoding",
        "formule_inline",
        "footnote_markers",
        "pua_markers",
    ]
    residui_report: dict = {name: len(residui[name]) for name in scanned}
    residui_report.update({f"{name}_esempi": residui[name] for name in scanned})
    residui_report["formula_headers"] = len(residui["formula_headers"])
    residui_report["formula_headers_esempi"] = residui["formula_headers"]

    report = {
        "stem": stem,
        "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M"),
        "transforms": {
            **t_stats,
            "riduzione_pct": round(reduction),
        },
        "structure": profile,
        "distribution": distribution,
        "anomalie": {
            "bare_headers": len(bare_hdrs),
            "short_sections": len(short_secs),
            "long_sections": len(long_secs),
            "bare_headers_list": bare_hdrs,
            "short_sections_list": short_secs,
            "long_sections_list": long_secs,
        },
        "residui": residui_report,
    }

    report_path = out_dir / "report.json"
    report_path.write_text(
        json.dumps(report, ensure_ascii=False, indent=2), encoding="utf-8"
    )
    return report_path
|