diagnostic_snapshot.py 8.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207
  1. """Aggregate connection, virtual-printer, and log-health diagnostics into a
  2. single snapshot for the support bundle and bug-report submission paths.
  3. Each user-triggered support artifact (the System-page support ZIP and the
  4. bug-report bubble) already exposed these three checks inline in the UI but
  5. omitted them from what landed in the maintainer's hands. This module is the
  6. single entry point both flows call to capture all three at once.
  7. Designed around three constraints:
  8. - **Fail-soft per probe.** A crash inside one printer's check must not nuke the
  9. whole snapshot — that's the whole point of including diagnostics in the
  10. bundle: a partial result is more useful than a 500.
  11. - **Bounded total runtime.** Each probe runs concurrently and is guarded by an
  12. outer wall-clock cap; timeouts emit a marker entry rather than blocking.
  13. - **No mutation.** Connection / VP diagnostics only probe TCP ports and read
  14. state; log-health is a passive scanner. Safe to run on every bundle.
  15. """
  16. from __future__ import annotations
  17. import asyncio
  18. import logging
  19. import re
  20. from typing import Any
  21. from sqlalchemy import select
  22. from sqlalchemy.ext.asyncio import AsyncSession
  23. logger = logging.getLogger(__name__)
  24. # Mirrors the IPv4 pattern in services.log_reader.sanitize_log_content. Kept as
  25. # a literal here (not imported) so a refactor of that module's internals can't
  26. # silently change snapshot sanitization. Skips firmware-version-shaped strings
  27. # (leading-zero octets like "01.09.01.00") via the [1-9]\d|\d alternations.
  28. _IPV4_RE = re.compile(r"\b(?:(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]\d|\d)\.){3}(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]\d|\d)\b")
  29. # Per-diagnostic wall-clock cap. Each underlying probe carries its own (smaller)
  30. # TCP / HTTP timeouts; this is the outer guard so a hung interface or a wedged
  31. # subprocess can't stall bundle generation past about this many seconds per
  32. # printer/VP. Snapshot total runtime is bounded by max(per-cap) thanks to the
  33. # concurrent gather, not the sum.
  34. _PER_DIAGNOSTIC_TIMEOUT_SECONDS = 15.0
  35. def _serialize(result: Any) -> Any:
  36. """Convert a Pydantic model to a dict; pass through plain dicts/lists."""
  37. if hasattr(result, "model_dump"):
  38. return result.model_dump()
  39. return result
  40. async def _run_connection_for(printer) -> dict:
  41. from backend.app.services.printer_diagnostic import run_connection_diagnostic
  42. base = {"printer_id": printer.id, "printer_name": printer.name}
  43. try:
  44. result = await asyncio.wait_for(
  45. run_connection_diagnostic(
  46. printer.ip_address,
  47. printer=printer,
  48. serial_number=printer.serial_number,
  49. access_code=printer.access_code,
  50. ),
  51. timeout=_PER_DIAGNOSTIC_TIMEOUT_SECONDS,
  52. )
  53. return {**base, "result": _serialize(result)}
  54. except asyncio.TimeoutError:
  55. return {**base, "error": "timed_out"}
  56. except Exception as e:
  57. # Log with traceback so the bundle generation isn't silent about
  58. # a broken probe, but never propagate.
  59. logger.warning("Connection diagnostic failed for printer %s: %s", printer.id, e, exc_info=True)
  60. return {**base, "error": str(e)}
  61. async def _run_vp_for(vp) -> dict:
  62. from backend.app.services.virtual_printer import virtual_printer_manager
  63. from backend.app.services.virtual_printer.diagnostic import run_vp_diagnostic
  64. base = {"vp_id": vp.id, "name": vp.name}
  65. try:
  66. instance = virtual_printer_manager.get_instance(vp.id)
  67. result = await asyncio.wait_for(
  68. run_vp_diagnostic(vp, instance),
  69. timeout=_PER_DIAGNOSTIC_TIMEOUT_SECONDS,
  70. )
  71. return {**base, "result": _serialize(result)}
  72. except asyncio.TimeoutError:
  73. return {**base, "error": "timed_out"}
  74. except Exception as e:
  75. logger.warning("VP diagnostic failed for VP %s: %s", vp.id, e, exc_info=True)
  76. return {**base, "error": str(e)}
  77. async def _run_log_health() -> Any:
  78. from backend.app.services.log_health import scan_logs
  79. try:
  80. # scan_logs is sync I/O-bound (file read + regex); push off the loop.
  81. result = await asyncio.wait_for(
  82. asyncio.to_thread(scan_logs),
  83. timeout=_PER_DIAGNOSTIC_TIMEOUT_SECONDS,
  84. )
  85. return _serialize(result)
  86. except asyncio.TimeoutError:
  87. return {"error": "timed_out"}
  88. except Exception as e:
  89. logger.warning("Log-health scan failed: %s", e, exc_info=True)
  90. return {"error": str(e)}
  91. async def collect_diagnostic_snapshot(db: AsyncSession) -> dict[str, Any]:
  92. """Return the three-section diagnostic snapshot.
  93. Always returns a dict with keys ``connection_diagnostics`` (list, one entry
  94. per active printer), ``vp_diagnostics`` (list, one entry per enabled VP —
  95. empty if none), and ``log_health`` (the ``scan_logs`` result or an error
  96. marker). Each list entry carries either ``result`` (success) or ``error``
  97. (timeout / exception) so the maintainer can tell at a glance whether a
  98. given probe ran.
  99. """
  100. from backend.app.models.printer import Printer
  101. from backend.app.models.virtual_printer import VirtualPrinter
  102. printers_result = await db.execute(select(Printer).where(Printer.is_active.is_(True)))
  103. printers = list(printers_result.scalars().all())
  104. vps_result = await db.execute(select(VirtualPrinter).where(VirtualPrinter.enabled.is_(True)))
  105. vps = list(vps_result.scalars().all())
  106. # Concurrent: total wall-clock ≈ max(per-cap), not sum.
  107. results = await asyncio.gather(
  108. asyncio.gather(*(_run_connection_for(p) for p in printers)) if printers else _noop_list(),
  109. asyncio.gather(*(_run_vp_for(vp) for vp in vps)) if vps else _noop_list(),
  110. _run_log_health(),
  111. return_exceptions=True,
  112. )
  113. connection_results, vp_results, log_health = results
  114. def _coerce_list(r) -> list:
  115. if isinstance(r, BaseException):
  116. logger.warning("Diagnostic snapshot batch failed: %s", r)
  117. return []
  118. return list(r) if r is not None else []
  119. snapshot = {
  120. "connection_diagnostics": _coerce_list(connection_results),
  121. "vp_diagnostics": _coerce_list(vp_results),
  122. "log_health": log_health if not isinstance(log_health, BaseException) else {"error": str(log_health)},
  123. }
  124. # Sanitize before returning. The diagnostic schemas embed printer/host IPs
  125. # (`PrinterDiagnosticResult.ip_address`, network-mode check params, VP
  126. # `bind_ip`) and the snapshot adds printer names — none of which should
  127. # leak into a submitted GitHub issue or a shared support ZIP. Use the
  128. # same `collect_sensitive_strings` table the log sanitizer already
  129. # consults so the replacement labels stay consistent ([PRINTER], [SERIAL],
  130. # [IP], [ACCESS_CODE]); the IPv4 regex fallback in `_mask_string` then
  131. # catches host / bind IPs that aren't in the DB.
  132. try:
  133. from backend.app.services.log_reader import collect_sensitive_strings
  134. sensitive_strings = await collect_sensitive_strings(db)
  135. except Exception:
  136. logger.warning("Could not collect sensitive strings for snapshot sanitization", exc_info=True)
  137. sensitive_strings = {}
  138. return _sanitize_recursive(snapshot, sensitive_strings)
  139. async def _noop_list() -> list:
  140. return []
  141. def _mask_string(value: str, sensitive_strings: dict[str, str]) -> str:
  142. """Apply known-value replacement + IPv4 regex masking to a single string.
  143. Known values are matched first (longest first so "My Printer 1" beats
  144. "My Printer"); the regex pass then catches any IPs the sensitive_strings
  145. table didn't already cover — most importantly the Bambuddy host's own
  146. IP (returned by ``_get_host_ip`` inside the diagnostic, not in the DB)
  147. and any virtual-printer ``bind_ip`` the user picked at setup.
  148. """
  149. if not value:
  150. return value
  151. for raw, label in sorted(sensitive_strings.items(), key=lambda x: len(x[0]), reverse=True):
  152. if len(raw) < 3:
  153. continue
  154. if raw in value:
  155. value = value.replace(raw, label)
  156. value = _IPV4_RE.sub("[IP]", value)
  157. return value
  158. def _sanitize_recursive(node: Any, sensitive_strings: dict[str, str]) -> Any:
  159. """Walk the snapshot and redact strings in place — dicts, lists, scalars.
  160. Non-string scalars (ints, bools, None) pass through; we only need to
  161. mask user-visible values. Keys are NOT renamed (those are structural).
  162. """
  163. if isinstance(node, str):
  164. return _mask_string(node, sensitive_strings)
  165. if isinstance(node, dict):
  166. return {k: _sanitize_recursive(v, sensitive_strings) for k, v in node.items()}
  167. if isinstance(node, list):
  168. return [_sanitize_recursive(item, sensitive_strings) for item in node]
  169. return node