camera_diagnose.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288
  1. """End-to-end camera diagnostic, surfaced via ``POST /printers/{id}/camera/diagnose``.
  2. Cuts off the "camera broken" support-ticket loop at the user's screen by
  3. running the printer-side camera path through staged checks (TCP, end-
  4. to-end frame capture) and reporting WHICH stage failed plus a
  5. remediation key the frontend can render translated.
  6. The goal isn't to be a perfect protocol analyser — it's to be the diff
  7. between "user opens a ticket with 'connection lost'" and "user sees
  8. 'Printer not reachable; check IP and LAN-only mode'" before they ever
  9. write a message.
  10. Stages
  11. ------
  12. 1. **tcp_reachable** — open a TCP socket to the camera port (322 for
  13. RTSPS models, 6000 for the chamber-image-protocol A1 / P1 family).
  14. Distinguishes "printer down" / "firewall" / "LAN-only off" from
  15. stream-content problems.
  16. 2. **first_frame** — call the existing ``capture_camera_frame_bytes``
  17. pipeline (same code that powers /camera/snapshot) and verify at
  18. least one JPEG comes back within the model's profile-derived
  19. timeout. Combines auth + protocol handshake + first keyframe into
  20. one stage because splitting RTSP's ``ffmpeg`` invocation is heavy
  21. and the user-facing answer is the same either way: "the camera
  22. itself isn't producing frames".
  23. Shortcut
  24. --------
  25. Most Bambu firmwares allow exactly one concurrent camera connection.
  26. Opening a fresh socket while a viewer is attached would kick them off
  27. (and trigger the same #1348 reconnect-storm pattern we built the fan-
  28. out broadcaster to prevent). When ``is_stream_active`` reports True
  29. AND a buffered frame is fresh (last 10 s), we short-circuit the test
  30. with ``live_stream_active`` and report success — the user is
  31. literally watching the camera right now, no test needed.
  32. """
  33. from __future__ import annotations
  34. import asyncio
  35. import logging
  36. import time
  37. from dataclasses import dataclass, field
  38. from backend.app.services.camera import (
  39. capture_camera_frame_bytes,
  40. get_camera_port,
  41. is_chamber_image_model,
  42. )
  43. from backend.app.services.camera_profiles import DEFAULT_PROFILE, get_camera_profile
logger = logging.getLogger(__name__)

# Upper bound (exclusive), in seconds, on the age of a live-stream buffered
# frame for it to still count as proof that the camera works. Tuned
# conservatively: if the active stream has not produced a frame inside this
# window, the real test runs instead of trusting a possibly-stale buffer.
_LIVE_FRAME_FRESHNESS_SECONDS = 10.0
  50. @dataclass
  51. class CameraDiagnoseStage:
  52. """One step of the diagnostic. Status drives the green/red icon
  53. the frontend renders next to the stage name."""
  54. name: str # "tcp_reachable" | "first_frame" | "live_stream_active"
  55. status: str # "ok" | "failed" | "skipped"
  56. duration_ms: int = 0
  57. # Optional machine-readable code for failures so the frontend can
  58. # render a stage-specific hint without parsing free-text errors.
  59. code: str | None = None
  60. @dataclass
  61. class CameraDiagnoseResult:
  62. printer_id: int
  63. protocol: str # "rtsp" | "chamber_image"
  64. port: int
  65. # Whether this model's camera path uses the default profile or has
  66. # an override entry in ``camera_profiles._PROFILES``. Useful for
  67. # triage: tells us instantly whether the user is on a tuned model.
  68. profile: str
  69. overall_status: str # "ok" | "failed"
  70. stages: list[CameraDiagnoseStage] = field(default_factory=list)
  71. # i18n key. Frontend maps to a translated remediation hint.
  72. summary_code: str = ""
  73. def to_dict(self) -> dict:
  74. return {
  75. "printer_id": self.printer_id,
  76. "protocol": self.protocol,
  77. "port": self.port,
  78. "profile": self.profile,
  79. "overall_status": self.overall_status,
  80. "stages": [
  81. {"name": s.name, "status": s.status, "duration_ms": s.duration_ms, "code": s.code} for s in self.stages
  82. ],
  83. "summary_code": self.summary_code,
  84. }
  85. def _profile_label(model: str | None) -> str:
  86. """Return ``"default"`` or the resolved model name when this model
  87. has an override entry in :data:`camera_profiles._PROFILES`."""
  88. profile = get_camera_profile(model)
  89. if profile is DEFAULT_PROFILE:
  90. return "default"
  91. # Normalise via the same alias map the lookup uses. If the model
  92. # resolves to a profile but the lookup is by alias (e.g. N7 → P2S),
  93. # report the canonical display name.
  94. from backend.app.services.camera_profiles import _MODEL_ALIASES, _PROFILES
  95. key = (model or "").upper().strip()
  96. key = _MODEL_ALIASES.get(key, key)
  97. return key if key in _PROFILES else "default"
  98. async def _check_tcp_reachable(ip_address: str, port: int, timeout: float) -> CameraDiagnoseStage:
  99. """Stage 1 — open a TCP socket to the camera port."""
  100. started = time.monotonic()
  101. try:
  102. _, writer = await asyncio.wait_for(
  103. asyncio.open_connection(ip_address, port),
  104. timeout=timeout,
  105. )
  106. try:
  107. writer.close()
  108. await writer.wait_closed()
  109. except OSError:
  110. pass
  111. return CameraDiagnoseStage(
  112. name="tcp_reachable",
  113. status="ok",
  114. duration_ms=int((time.monotonic() - started) * 1000),
  115. )
  116. except asyncio.TimeoutError:
  117. return CameraDiagnoseStage(
  118. name="tcp_reachable",
  119. status="failed",
  120. duration_ms=int((time.monotonic() - started) * 1000),
  121. code="tcp_timeout",
  122. )
  123. except (ConnectionRefusedError, OSError) as exc:
  124. # ConnectionRefusedError = printer up, camera port closed (likely
  125. # LAN-only off or developer mode off). Other OSError = host
  126. # unreachable. We keep these separate codes so the frontend can
  127. # surface a precise remediation hint.
  128. is_refused = isinstance(exc, ConnectionRefusedError)
  129. return CameraDiagnoseStage(
  130. name="tcp_reachable",
  131. status="failed",
  132. duration_ms=int((time.monotonic() - started) * 1000),
  133. code="tcp_refused" if is_refused else "tcp_unreachable",
  134. )
  135. async def _check_first_frame(
  136. ip_address: str,
  137. access_code: str,
  138. model: str | None,
  139. timeout: int,
  140. ) -> CameraDiagnoseStage:
  141. """Stage 2 — capture one frame end-to-end. Combines auth + protocol
  142. handshake + first keyframe; either it works or it doesn't."""
  143. started = time.monotonic()
  144. try:
  145. jpeg = await capture_camera_frame_bytes(
  146. ip_address=ip_address,
  147. access_code=access_code,
  148. model=model,
  149. timeout=timeout,
  150. )
  151. except Exception as exc: # noqa: BLE001 — see camera_profiles.py rationale
  152. # capture_camera_frame_bytes can raise from many layers (ffmpeg
  153. # spawn, TLS proxy startup, asyncio.open_connection). For the
  154. # user-facing answer, any exception during the capture path is
  155. # "first frame failed" — drilling down is for the support log.
  156. logger.warning("Camera diagnose first-frame capture raised: %s", exc)
  157. return CameraDiagnoseStage(
  158. name="first_frame",
  159. status="failed",
  160. duration_ms=int((time.monotonic() - started) * 1000),
  161. code="capture_exception",
  162. )
  163. if jpeg:
  164. return CameraDiagnoseStage(
  165. name="first_frame",
  166. status="ok",
  167. duration_ms=int((time.monotonic() - started) * 1000),
  168. )
  169. return CameraDiagnoseStage(
  170. name="first_frame",
  171. status="failed",
  172. duration_ms=int((time.monotonic() - started) * 1000),
  173. code="no_frame",
  174. )
  175. def _summary_for_stages(stages: list[CameraDiagnoseStage]) -> str:
  176. """Pick the remediation key from the first failing stage's ``code``,
  177. or ``all_ok`` when every stage passed."""
  178. for stage in stages:
  179. if stage.status != "failed":
  180. continue
  181. if stage.code == "tcp_timeout":
  182. return "printer_unreachable"
  183. if stage.code == "tcp_refused":
  184. return "camera_port_closed"
  185. if stage.code == "tcp_unreachable":
  186. return "printer_unreachable"
  187. if stage.code in ("no_frame", "capture_exception"):
  188. return "no_frame"
  189. return "unknown_failure"
  190. return "all_ok"
  191. async def diagnose_camera(
  192. ip_address: str,
  193. access_code: str,
  194. model: str | None,
  195. printer_id: int,
  196. *,
  197. has_live_stream: bool = False,
  198. live_frame_age_seconds: float | None = None,
  199. tcp_timeout: float = 3.0,
  200. capture_timeout: int = 15,
  201. ) -> CameraDiagnoseResult:
  202. """Run the camera diagnostic and return a structured result.
  203. ``has_live_stream`` and ``live_frame_age_seconds`` are looked up
  204. by the route handler from the active-stream registry (see the
  205. docstring at the top of this file for why). When they indicate a
  206. fresh frame is already buffered, the diagnostic short-circuits with
  207. a ``live_stream_active`` stage and ``all_ok`` summary — real-world
  208. proof of a working camera beats any synthetic test.
  209. """
  210. is_chamber = is_chamber_image_model(model)
  211. protocol = "chamber_image" if is_chamber else "rtsp"
  212. port = get_camera_port(model)
  213. result = CameraDiagnoseResult(
  214. printer_id=printer_id,
  215. protocol=protocol,
  216. port=port,
  217. profile=_profile_label(model),
  218. overall_status="ok",
  219. stages=[],
  220. )
  221. # Shortcut: the camera is currently streaming with a fresh frame.
  222. # Running the real diagnostic here would either kick the live
  223. # viewer off (single-camera-connection printers) or block on the
  224. # second-socket-refused timeout (#1348). Trust the live evidence.
  225. if (
  226. has_live_stream
  227. and live_frame_age_seconds is not None
  228. and 0 <= live_frame_age_seconds < _LIVE_FRAME_FRESHNESS_SECONDS
  229. ):
  230. result.stages.append(
  231. CameraDiagnoseStage(
  232. name="live_stream_active",
  233. status="ok",
  234. duration_ms=0,
  235. )
  236. )
  237. result.summary_code = "live_stream_active_healthy"
  238. return result
  239. # Stage 1
  240. tcp_stage = await _check_tcp_reachable(ip_address, port, tcp_timeout)
  241. result.stages.append(tcp_stage)
  242. if tcp_stage.status != "ok":
  243. result.overall_status = "failed"
  244. # Skip first_frame — without TCP there's no point spawning ffmpeg.
  245. result.stages.append(CameraDiagnoseStage(name="first_frame", status="skipped", duration_ms=0))
  246. result.summary_code = _summary_for_stages(result.stages)
  247. return result
  248. # Stage 2
  249. frame_stage = await _check_first_frame(ip_address, access_code, model, capture_timeout)
  250. result.stages.append(frame_stage)
  251. if frame_stage.status != "ok":
  252. result.overall_status = "failed"
  253. result.summary_code = _summary_for_stages(result.stages)
  254. return result