test_scheduler_watchdog.py 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417
  1. """Regression tests for ``_watchdog_print_start``.
  2. The watchdog reverts queue items to ``pending`` when a dispatched print never
  3. lands on the printer (half-broken MQTT session — #887/#936/#967). H2D firmware
  4. can sit at ``FINISH`` for 50+ seconds after accepting a ``project_file``
  5. command before flipping ``gcode_state`` to ``PREPARE``, which used to trip the
  6. state-only watchdog and cause the scheduler to revert the item; the subsequent
  7. successful dispatch then looked like a reprint of the just-finished job (#1078).
  8. The fix: treat ``subtask_id`` advancing past the pre-dispatch value as an
  9. equivalent "command landed" signal, and raise the timeout from 45 s to 90 s as
  10. belt-and-braces for slow transitions that also don't emit an early subtask_id
  11. tick.
  12. """
  13. from types import SimpleNamespace
  14. from unittest.mock import MagicMock, patch
  15. import pytest
  16. from backend.app.models.print_queue import PrintQueueItem
  17. from backend.app.services.print_scheduler import PrintScheduler
  18. @pytest.fixture
  19. async def db_session():
  20. """In-memory SQLite with one ``printing`` queue item at id=1."""
  21. from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine
  22. import backend.app.models # noqa: F401 — populate Base.metadata
  23. from backend.app.core.database import Base
  24. engine = create_async_engine("sqlite+aiosqlite:///:memory:", echo=False)
  25. async with engine.begin() as conn:
  26. await conn.run_sync(Base.metadata.create_all)
  27. session_maker = async_sessionmaker(engine, expire_on_commit=False)
  28. async with session_maker() as db:
  29. db.add(PrintQueueItem(id=1, printer_id=42, archive_id=99, status="printing"))
  30. await db.commit()
  31. try:
  32. yield session_maker
  33. finally:
  34. await engine.dispose()
  35. def _status(state: str, subtask_id: str | None = None, gcode_file: str | None = None):
  36. """Minimal stand-in for PrinterState — only the fields the watchdog reads."""
  37. return SimpleNamespace(state=state, subtask_id=subtask_id, gcode_file=gcode_file)
  38. class TestWatchdogExitsEarlyOnPickup:
  39. """The watchdog must NOT revert when the printer has clearly picked up the job."""
  40. @pytest.mark.asyncio
  41. async def test_exits_on_state_change(self, db_session):
  42. """State transitioning away from pre_state is the primary "accepted" signal."""
  43. get_status = MagicMock(return_value=_status("RUNNING", "OLD_SUBTASK"))
  44. with (
  45. patch("backend.app.services.print_scheduler.printer_manager.get_status", get_status),
  46. patch("backend.app.services.print_scheduler.async_session", db_session),
  47. ):
  48. await PrintScheduler._watchdog_print_start(
  49. queue_item_id=1,
  50. printer_id=42,
  51. pre_state="FINISH",
  52. pre_subtask_id="OLD_SUBTASK",
  53. timeout=0.3,
  54. poll_interval=0.05,
  55. )
  56. # Item should remain "printing" — watchdog recognised the pickup.
  57. async with db_session() as db:
  58. item = await db.get(PrintQueueItem, 1)
  59. assert item.status == "printing"
  60. @pytest.mark.asyncio
  61. async def test_exits_on_subtask_id_change_even_if_state_still_finish(self, db_session):
  62. """Regression for #1078: H2D keeps state=FINISH for ~50 s after accepting
  63. project_file, but subtask_id flips to our new submission_id almost
  64. immediately. That must short-circuit the revert."""
  65. get_status = MagicMock(return_value=_status("FINISH", "NEW_SUBTASK_12345"))
  66. with (
  67. patch("backend.app.services.print_scheduler.printer_manager.get_status", get_status),
  68. patch("backend.app.services.print_scheduler.async_session", db_session),
  69. ):
  70. await PrintScheduler._watchdog_print_start(
  71. queue_item_id=1,
  72. printer_id=42,
  73. pre_state="FINISH",
  74. pre_subtask_id="OLD_SUBTASK_99999",
  75. timeout=0.3,
  76. poll_interval=0.05,
  77. )
  78. async with db_session() as db:
  79. item = await db.get(PrintQueueItem, 1)
  80. assert item.status == "printing", (
  81. "subtask_id advanced past pre_subtask_id — the printer accepted our "
  82. "project_file and the watchdog must not revert the queue item even "
  83. "though state is still FINISH (#1078)"
  84. )
  85. class TestWatchdogRevertsWhenStuck:
  86. """Genuine half-broken sessions still need the revert + reconnect recovery."""
  87. @pytest.mark.asyncio
  88. async def test_reverts_when_neither_state_nor_subtask_id_changes(self, db_session):
  89. """Both signals unchanged across the full timeout → revert to pending
  90. and force MQTT reconnect (the #967 recovery path)."""
  91. get_status = MagicMock(return_value=_status("FINISH", "OLD_SUBTASK"))
  92. client = MagicMock()
  93. get_client = MagicMock(return_value=client)
  94. with (
  95. patch("backend.app.services.print_scheduler.printer_manager.get_status", get_status),
  96. patch("backend.app.services.print_scheduler.printer_manager.get_client", get_client),
  97. patch("backend.app.services.print_scheduler.async_session", db_session),
  98. patch("backend.app.core.database.async_session", db_session),
  99. ):
  100. await PrintScheduler._watchdog_print_start(
  101. queue_item_id=1,
  102. printer_id=42,
  103. pre_state="FINISH",
  104. pre_subtask_id="OLD_SUBTASK",
  105. timeout=0.2,
  106. poll_interval=0.05,
  107. )
  108. async with db_session() as db:
  109. item = await db.get(PrintQueueItem, 1)
  110. assert item.status == "pending"
  111. assert item.started_at is None
  112. client.force_reconnect_stale_session.assert_called_once()
  113. @pytest.mark.asyncio
  114. async def test_reverts_on_finish_to_idle_user_dismissed_prompt(self, db_session):
  115. """Regression for #1370: when pre_state is FINISH and the printer
  116. transitions to IDLE during the watchdog window, that's the user
  117. dismissing a post-print prompt — NOT acceptance of our project_file.
  118. The bundle in #1370 showed exactly this: queue item dispatched while
  119. printer was in FINISH (residual from a previous print), command sent
  120. but silently rejected by firmware, then the user manually cleared
  121. the screen prompt so the printer moved to IDLE. The original
  122. ``state != pre_state`` check returned early on this transition and
  123. the queue row was left stuck in 'printing' indefinitely, blocking
  124. all future dispatches to that printer.
  125. The watchdog now only treats transitions into the active-print
  126. state set (PREPARE / SLICING / RUNNING / PAUSE) as a valid "command
  127. landed" signal.
  128. """
  129. get_status = MagicMock(return_value=_status("IDLE", "OLD_SUBTASK"))
  130. client = MagicMock()
  131. get_client = MagicMock(return_value=client)
  132. with (
  133. patch("backend.app.services.print_scheduler.printer_manager.get_status", get_status),
  134. patch("backend.app.services.print_scheduler.printer_manager.get_client", get_client),
  135. patch("backend.app.services.print_scheduler.async_session", db_session),
  136. patch("backend.app.core.database.async_session", db_session),
  137. ):
  138. await PrintScheduler._watchdog_print_start(
  139. queue_item_id=1,
  140. printer_id=42,
  141. pre_state="FINISH",
  142. pre_subtask_id="OLD_SUBTASK",
  143. timeout=0.2,
  144. poll_interval=0.05,
  145. )
  146. async with db_session() as db:
  147. item = await db.get(PrintQueueItem, 1)
  148. assert item.status == "pending", (
  149. "FINISH -> IDLE is the user dismissing a screen prompt, not "
  150. "the printer accepting project_file — item must be reverted "
  151. "to 'pending' so the scheduler can retry (#1370)"
  152. )
  153. assert item.started_at is None
  154. @pytest.mark.asyncio
  155. async def test_does_not_revert_on_pickup_via_active_state(self, db_session):
  156. """Counterpart to the #1370 fix: transitions into the active-print
  157. state set ARE a valid "command landed" signal. PREPARE / SLICING /
  158. RUNNING / PAUSE all keep the item in 'printing'.
  159. """
  160. for active_state in ("PREPARE", "SLICING", "RUNNING", "PAUSE"):
  161. async with db_session() as db:
  162. item = await db.get(PrintQueueItem, 1)
  163. item.status = "printing"
  164. item.started_at = None
  165. await db.commit()
  166. get_status = MagicMock(return_value=_status(active_state, "OLD_SUBTASK"))
  167. with (
  168. patch("backend.app.services.print_scheduler.printer_manager.get_status", get_status),
  169. patch("backend.app.services.print_scheduler.async_session", db_session),
  170. patch("backend.app.core.database.async_session", db_session),
  171. ):
  172. await PrintScheduler._watchdog_print_start(
  173. queue_item_id=1,
  174. printer_id=42,
  175. pre_state="IDLE",
  176. pre_subtask_id="OLD_SUBTASK",
  177. timeout=0.2,
  178. poll_interval=0.05,
  179. )
  180. async with db_session() as db:
  181. item = await db.get(PrintQueueItem, 1)
  182. assert item.status == "printing", (
  183. f"transition IDLE -> {active_state} must be treated as a "
  184. f"valid 'command landed' signal — watchdog must not revert"
  185. )
  186. @pytest.mark.asyncio
  187. async def test_default_timeout_is_90_seconds(self):
  188. """The default timeout must cover slow H2D FINISH→PREPARE transitions
  189. (~50 s observed). A 45 s default would trip on the exact scenario the
  190. subtask_id check is guarding against, leaving no fallback for printers
  191. that don't echo subtask_id."""
  192. import inspect
  193. sig = inspect.signature(PrintScheduler._watchdog_print_start)
  194. assert sig.parameters["timeout"].default == 90.0
  195. class TestWatchdogFallbackBehaviour:
  196. """Backwards-compat and defensive behaviour around missing data."""
  197. @pytest.mark.asyncio
  198. async def test_pre_subtask_id_none_falls_back_to_state_only(self, db_session):
  199. """When we never captured a pre-dispatch subtask_id (e.g. printer just
  200. connected), the watchdog must still work on the state signal alone —
  201. and still revert when state stays unchanged, so half-broken sessions
  202. are still recovered."""
  203. get_status = MagicMock(return_value=_status("FINISH", "SOMETHING"))
  204. get_client = MagicMock(return_value=None)
  205. with (
  206. patch("backend.app.services.print_scheduler.printer_manager.get_status", get_status),
  207. patch("backend.app.services.print_scheduler.printer_manager.get_client", get_client),
  208. patch("backend.app.services.print_scheduler.async_session", db_session),
  209. patch("backend.app.core.database.async_session", db_session),
  210. ):
  211. await PrintScheduler._watchdog_print_start(
  212. queue_item_id=1,
  213. printer_id=42,
  214. pre_state="FINISH",
  215. pre_subtask_id=None,
  216. timeout=0.2,
  217. poll_interval=0.05,
  218. )
  219. async with db_session() as db:
  220. item = await db.get(PrintQueueItem, 1)
  221. assert item.status == "pending"
  222. @pytest.mark.asyncio
  223. async def test_current_subtask_id_none_does_not_trigger_early_exit(self, db_session):
  224. """If the printer transiently reports subtask_id=None (e.g. during
  225. reconnect), that must not be treated as "changed" — otherwise the
  226. watchdog would exit early without a real pickup signal and leave the
  227. item stuck in "printing" after a genuinely broken session."""
  228. get_status = MagicMock(return_value=_status("FINISH", None))
  229. get_client = MagicMock(return_value=None)
  230. with (
  231. patch("backend.app.services.print_scheduler.printer_manager.get_status", get_status),
  232. patch("backend.app.services.print_scheduler.printer_manager.get_client", get_client),
  233. patch("backend.app.services.print_scheduler.async_session", db_session),
  234. patch("backend.app.core.database.async_session", db_session),
  235. ):
  236. await PrintScheduler._watchdog_print_start(
  237. queue_item_id=1,
  238. printer_id=42,
  239. pre_state="FINISH",
  240. pre_subtask_id="OLD_SUBTASK",
  241. timeout=0.2,
  242. poll_interval=0.05,
  243. )
  244. async with db_session() as db:
  245. item = await db.get(PrintQueueItem, 1)
  246. assert item.status == "pending"
  247. @pytest.mark.asyncio
  248. async def test_printer_disconnected_returns_without_reverting(self, db_session):
  249. """If the printer drops during the watchdog window, don't touch the DB —
  250. the reconnect path will sort the queue state out."""
  251. get_status = MagicMock(return_value=None)
  252. with (
  253. patch("backend.app.services.print_scheduler.printer_manager.get_status", get_status),
  254. patch("backend.app.services.print_scheduler.async_session", db_session),
  255. ):
  256. await PrintScheduler._watchdog_print_start(
  257. queue_item_id=1,
  258. printer_id=42,
  259. pre_state="FINISH",
  260. pre_subtask_id="OLD_SUBTASK",
  261. timeout=0.2,
  262. poll_interval=0.05,
  263. )
  264. async with db_session() as db:
  265. item = await db.get(PrintQueueItem, 1)
  266. assert item.status == "printing"
  267. @pytest.mark.asyncio
  268. async def test_no_revert_if_item_already_completed(self, db_session):
  269. """If the print completed between watchdog arm-time and timeout (item is
  270. no longer "printing"), the watchdog must not clobber whatever status it
  271. ended up in — #967 race guard. Additionally it must NOT run the MQTT
  272. session-recovery path (forced reconnect): when on_print_complete has
  273. already moved the row, the print clearly landed on the printer and a
  274. forced reconnect on a healthy session would break ongoing prints on
  275. the same printer.
  276. """
  277. # Move item on to "completed" before the watchdog fires.
  278. async with db_session() as db:
  279. item = await db.get(PrintQueueItem, 1)
  280. item.status = "completed"
  281. await db.commit()
  282. get_status = MagicMock(return_value=_status("FINISH", "OLD_SUBTASK"))
  283. client = MagicMock() # NOT None — must verify reconnect isn't called
  284. get_client = MagicMock(return_value=client)
  285. with (
  286. patch("backend.app.services.print_scheduler.printer_manager.get_status", get_status),
  287. patch("backend.app.services.print_scheduler.printer_manager.get_client", get_client),
  288. patch("backend.app.services.print_scheduler.async_session", db_session),
  289. patch("backend.app.core.database.async_session", db_session),
  290. ):
  291. await PrintScheduler._watchdog_print_start(
  292. queue_item_id=1,
  293. printer_id=42,
  294. pre_state="FINISH",
  295. pre_subtask_id="OLD_SUBTASK",
  296. timeout=0.2,
  297. poll_interval=0.05,
  298. )
  299. async with db_session() as db:
  300. item = await db.get(PrintQueueItem, 1)
  301. assert item.status == "completed" # untouched
  302. client.force_reconnect_stale_session.assert_not_called()
  303. class TestGcodeFileDiscriminator:
  304. """#1150 vs #887/#936: skip the forced reconnect when gcode_file changed
  305. (project_file landed, slow parse — reconnecting causes 0500_4003).
  306. Reconnect when gcode_file is unchanged (publish dropped — half-broken
  307. session needs the original recovery)."""
  308. @pytest.mark.asyncio
  309. async def test_skips_reconnect_when_gcode_file_changed(self, db_session):
  310. get_status = MagicMock(
  311. return_value=_status("FINISH", "OLD_SUBTASK", gcode_file="/new.3mf"),
  312. )
  313. client = MagicMock()
  314. get_client = MagicMock(return_value=client)
  315. with (
  316. patch("backend.app.services.print_scheduler.printer_manager.get_status", get_status),
  317. patch("backend.app.services.print_scheduler.printer_manager.get_client", get_client),
  318. patch("backend.app.services.print_scheduler.async_session", db_session),
  319. patch("backend.app.core.database.async_session", db_session),
  320. ):
  321. await PrintScheduler._watchdog_print_start(
  322. queue_item_id=1,
  323. printer_id=42,
  324. pre_state="FINISH",
  325. pre_subtask_id="OLD_SUBTASK",
  326. pre_gcode_file="/old.3mf",
  327. timeout=0.2,
  328. poll_interval=0.05,
  329. )
  330. # Item still reverts (the user-facing failure stays correct), but the
  331. # MQTT session is left intact so the slow printer can finish parsing.
  332. async with db_session() as db:
  333. item = await db.get(PrintQueueItem, 1)
  334. assert item.status == "pending"
  335. client.force_reconnect_stale_session.assert_not_called()
  336. @pytest.mark.asyncio
  337. async def test_reconnects_when_gcode_file_unchanged(self, db_session):
  338. get_status = MagicMock(
  339. return_value=_status("FINISH", "OLD_SUBTASK", gcode_file="/old.3mf"),
  340. )
  341. client = MagicMock()
  342. get_client = MagicMock(return_value=client)
  343. with (
  344. patch("backend.app.services.print_scheduler.printer_manager.get_status", get_status),
  345. patch("backend.app.services.print_scheduler.printer_manager.get_client", get_client),
  346. patch("backend.app.services.print_scheduler.async_session", db_session),
  347. patch("backend.app.core.database.async_session", db_session),
  348. ):
  349. await PrintScheduler._watchdog_print_start(
  350. queue_item_id=1,
  351. printer_id=42,
  352. pre_state="FINISH",
  353. pre_subtask_id="OLD_SUBTASK",
  354. pre_gcode_file="/old.3mf",
  355. timeout=0.2,
  356. poll_interval=0.05,
  357. )
  358. client.force_reconnect_stale_session.assert_called_once()