log_health.py 8.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256
  1. """Log-health scanner.
  2. Matches the recent Bambuddy app log against a curated catalog of known failure
  3. signatures, so users can self-diagnose setup ("layer 8") issues before filing a
  4. bug report.
  5. The catalog is a deliberate *allowlist*: only known-bad, actionable signatures
  6. are matched — a healthy install produces an empty finding list. Human-readable
  7. cause and fix text is intentionally NOT stored here; the frontend renders it
  8. from i18n keys ``systemHealth.signature.<id>.{name,cause,fix}`` so it stays
  9. translatable across all locales. This module only carries the machine-facing
  10. fields (pattern, severity, category, wiki anchor).
  11. """
  12. import logging
  13. import re
  14. from dataclasses import dataclass
  15. from pydantic import BaseModel
  16. from backend.app.core.config import settings
  17. from backend.app.services.log_reader import LogEntry, read_log_entries, sanitize_log_content
logger = logging.getLogger(__name__)

# How many recent log entries to scan by default (the default ``limit`` of
# ``scan_logs``).
DEFAULT_SCAN_LIMIT = 4000

# Log levels ranked so a signature can require "at least WARNING" etc.
# The numbers match the stdlib ``logging`` numeric levels. A level name that
# is not listed here is ranked 0 by ``_level_ok`` and so never meets a
# threshold.
_LEVEL_RANK = {"DEBUG": 10, "INFO": 20, "WARNING": 30, "ERROR": 40, "CRITICAL": 50}

# Findings are ordered layer8 first (the user can act on these), then
# environment, then bug (please report). Within a group: errors before warnings.
# Categories / severities missing from these tables sort after all known ones.
_CATEGORY_ORDER = {"layer8": 0, "environment": 1, "bug": 2}
_SEVERITY_ORDER = {"error": 0, "warning": 1}

# Cap the sample line length so a finding can never carry a huge folded traceback.
# Measured in characters; an ellipsis is appended when a line is cut.
_SAMPLE_MAX_LEN = 400
  29. @dataclass(frozen=True)
  30. class LogSignature:
  31. """One curated known-issue signature.
  32. ``patterns`` are matched (``re.search``, case-insensitive) against the log
  33. entry message. A signature only becomes a reported finding once it has
  34. matched ``min_count`` times within the scan window — this gates noisy,
  35. individually-benign symptoms (e.g. an occasional MQTT reconnect after a
  36. Wi-Fi blip) from being surfaced as a problem.
  37. """
  38. id: str
  39. patterns: tuple[re.Pattern[str], ...]
  40. severity: str # "error" | "warning"
  41. category: str # "layer8" | "environment" | "bug"
  42. wiki_anchor: str # slug appended to the troubleshooting wiki page URL
  43. min_level: str = "WARNING"
  44. logger_prefix: str | None = None # only match entries from this logger tree
  45. min_count: int = 1
  46. def _compile(*patterns: str) -> tuple[re.Pattern[str], ...]:
  47. return tuple(re.compile(p, re.IGNORECASE) for p in patterns)
# --- The catalog -----------------------------------------------------------
# Seeded from the ranked "layer 8" root causes found in the closed-issue triage
# review. Each id MUST have matching i18n keys: systemHealth.signature.<id>.*
#
# Entries that omit ``min_level`` / ``logger_prefix`` / ``min_count`` use the
# ``LogSignature`` defaults (WARNING and above, any logger, report on first hit).
# Findings are emitted per signature and then re-sorted by ``scan_logs``, so
# the order here only breaks ties between otherwise equal findings.
SIGNATURES: tuple[LogSignature, ...] = (
    LogSignature(
        # Wrong/mistyped access code — FTPS login is rejected (530).
        # Reported on the first occurrence (default min_count).
        id="ftp-auth-rejected",
        patterns=_compile(r"FTP connection permission error"),
        severity="error",
        category="layer8",
        wiki_anchor="wrong-access-code",
        logger_prefix="backend.app.services.bambu_ftp",
    ),
    LogSignature(
        # FTPS :990 unreachable — port blocked by a firewall, or the printer is
        # off / on a different subnet.
        # Needs three hits in the scan window before it is reported.
        id="ftp-connection-timeout",
        patterns=_compile(r"FTP connection timed out"),
        severity="warning",
        category="layer8",
        wiki_anchor="ftps-port-990-blocked",
        logger_prefix="backend.app.services.bambu_ftp",
        min_count=3,
    ),
    LogSignature(
        # TLS negotiation to the printer's FTPS server failed.
        id="ftp-ssl-error",
        patterns=_compile(r"FTP SSL error connecting"),
        severity="warning",
        category="layer8",
        wiki_anchor="ftps-tls-failure",
        logger_prefix="backend.app.services.bambu_ftp",
        min_count=3,
    ),
    LogSignature(
        # MQTT connection keeps dropping — typically MQTT :8883 partially
        # blocked, LAN mode unstable, or a flaky network path to the printer.
        # Either message counts as a hit; five hits are required.
        id="mqtt-connection-flapping",
        patterns=_compile(r"Forcing MQTT reconnect", r"Hard reset reconnect failed"),
        severity="warning",
        category="layer8",
        wiki_anchor="mqtt-connection-unstable",
        logger_prefix="backend.app.services.bambu_mqtt",
        min_count=5,
    ),
    LogSignature(
        # Camera stream unreachable — RTSPS :322 blocked, or the printer
        # camera / LAN liveview is disabled.
        id="camera-connection-refused",
        patterns=_compile(
            r"Chamber image: connection refused",
            r"Chamber image: connection timeout",
            r"Camera connection test failed",
        ),
        severity="warning",
        category="layer8",
        wiki_anchor="camera-rtsps-port-322",
        logger_prefix="backend.app.services.camera",
        min_count=3,
    ),
    LogSignature(
        # SQLite write contention. Surfaces inside exception tracebacks; folded
        # continuation lines are part of the entry message, so this still
        # matches. The fix is switching to PostgreSQL under multi-printer load.
        # No logger_prefix: the message is matched regardless of which logger
        # emitted it.
        id="database-locked",
        patterns=_compile(r"database is locked"),
        severity="error",
        category="environment",
        wiki_anchor="database-is-locked",
    ),
)
class LogFinding(BaseModel):
    """An aggregated, sanitized match of one signature against the log.

    One instance is produced per signature whose hit count reached the
    signature's ``min_count`` during a scan.
    """

    # Copied from the matching LogSignature (its ``id``).
    signature_id: str
    # Copied from the matching LogSignature.
    severity: str
    category: str
    wiki_anchor: str
    # Number of scanned log entries that matched the signature.
    count: int
    # Timestamp of the oldest matching entry in the scan window.
    first_seen: str
    # Timestamp of the newest matching entry in the scan window.
    last_seen: str
    # First line of the newest matching entry, redacted and length-capped.
    sample: str
class ScanResult(BaseModel):
    """Result of a log-health scan."""

    # Reported findings, sorted by category, then severity, then hit count
    # (highest first). Empty when nothing in the catalog matched.
    findings: list[LogFinding]
    # Number of log entries that were actually examined.
    scanned_entries: int
    # Whether the application log file existed when the scan ran.
    log_available: bool
    # Finding counts keyed by "total", "layer8", "environment" and "bug".
    summary: dict[str, int]
  135. def _level_ok(entry: LogEntry, min_level: str) -> bool:
  136. return _LEVEL_RANK.get(entry.level.upper(), 0) >= _LEVEL_RANK.get(min_level, 30)
  137. def _matches(sig: LogSignature, entry: LogEntry) -> bool:
  138. if not _level_ok(entry, sig.min_level):
  139. return False
  140. if sig.logger_prefix and not entry.logger_name.startswith(sig.logger_prefix):
  141. return False
  142. return any(p.search(entry.message) for p in sig.patterns)
  143. def _sample_line(message: str) -> str:
  144. """Take the first line of a (possibly multi-line) entry, length-capped."""
  145. first_line = message.splitlines()[0] if message else ""
  146. if len(first_line) > _SAMPLE_MAX_LEN:
  147. return first_line[:_SAMPLE_MAX_LEN] + "…"
  148. return first_line
  149. def scan_logs(
  150. limit: int = DEFAULT_SCAN_LIMIT,
  151. sensitive_strings: dict[str, str] | None = None,
  152. ) -> ScanResult:
  153. """Scan the recent app log against the signature catalog.
  154. ``sensitive_strings`` (from :func:`log_reader.collect_sensitive_strings`) is
  155. applied to every sample line so printer names, serials, IPs, and access
  156. codes never leave the process. Even when it is ``None`` the regex-based
  157. redaction passes still run.
  158. """
  159. log_file = settings.log_dir / "bambuddy.log"
  160. log_available = log_file.exists()
  161. entries, _total = read_log_entries(limit=limit)
  162. # entry_id -> accumulator. entries arrive newest-first.
  163. agg: dict[str, dict] = {}
  164. for entry in entries:
  165. for sig in SIGNATURES:
  166. if not _matches(sig, entry):
  167. continue
  168. acc = agg.get(sig.id)
  169. if acc is None:
  170. # First (== newest) occurrence encountered.
  171. agg[sig.id] = {
  172. "count": 1,
  173. "sample": entry.message,
  174. "last_seen": entry.timestamp,
  175. "first_seen": entry.timestamp,
  176. }
  177. else:
  178. acc["count"] += 1
  179. # Iterating newest-first, so each later hit is older.
  180. acc["first_seen"] = entry.timestamp
  181. findings: list[LogFinding] = []
  182. for sig in SIGNATURES:
  183. acc = agg.get(sig.id)
  184. if acc is None or acc["count"] < sig.min_count:
  185. continue
  186. sample = sanitize_log_content(_sample_line(acc["sample"]), sensitive_strings)
  187. findings.append(
  188. LogFinding(
  189. signature_id=sig.id,
  190. severity=sig.severity,
  191. category=sig.category,
  192. wiki_anchor=sig.wiki_anchor,
  193. count=acc["count"],
  194. first_seen=acc["first_seen"],
  195. last_seen=acc["last_seen"],
  196. sample=sample,
  197. )
  198. )
  199. findings.sort(
  200. key=lambda f: (
  201. _CATEGORY_ORDER.get(f.category, 9),
  202. _SEVERITY_ORDER.get(f.severity, 9),
  203. -f.count,
  204. )
  205. )
  206. summary = {
  207. "total": len(findings),
  208. "layer8": sum(1 for f in findings if f.category == "layer8"),
  209. "environment": sum(1 for f in findings if f.category == "environment"),
  210. "bug": sum(1 for f in findings if f.category == "bug"),
  211. }
  212. return ScanResult(
  213. findings=findings,
  214. scanned_entries=len(entries),
  215. log_available=log_available,
  216. summary=summary,
  217. )