Browse Source

fix(scheduler): watchdogs no longer falsely treat FINISH->IDLE as "print landed" (#1370)

  Both the queue-side _watchdog_print_start and the direct-dispatch
  _verify_print_response used `status.state != pre_state` to decide
  whether a project_file command had been accepted. When a printer was
  in FINISH at dispatch time (un-dismissed post-print prompt from a
  prior job), the firmware silently rejected the new command; if the
  user then dismissed the screen prompt, the printer moved FINISH ->
  IDLE and the watchdog returned early as "command landed" — leaving
  the queue row stuck at status='printing' indefinitely and the
  scheduler permanently marking the printer as busy.

  Narrow the "command landed" check in both verifiers to an allow-list
  of active-print states (PREPARE / SLICING / RUNNING / PAUSE).
  Inactive transitions (FINISH -> IDLE, etc.) no longer short-circuit
  the revert. The subtask_id-advance signal stays in place for H2D's
  slow FINISH -> PREPARE transition (#1078).

  Also wrap _watchdog_print_start's revert commit and
  printer_manager._persist_awaiting_plate_clear in run_with_retry so
  SQLite single-writer contention can't silently drop these writes.
  The revert path returns a tristate sentinel so the post-revert MQTT
  session-recovery logic only runs when we actually reverted (or the
  commit failed) — not when on_print_complete had already cleared the
  row, where a forced reconnect could break a healthy concurrent print.
maziggy 1 week ago
parent
commit
5680f5d34b

File diff suppressed because it is too large
+ 0 - 0
CHANGELOG.md


+ 21 - 1
backend/app/services/background_dispatch.py

@@ -35,6 +35,16 @@ from backend.app.services.printer_manager import printer_manager
 
 logger = logging.getLogger(__name__)
 
+# Bambu firmware states that mean the project_file has actually been accepted
+# and the printer is now processing / running / paused mid-print. Used by the
+# direct-dispatch verifier (#1370): a transition into one of these states means
+# the print landed, anything else (e.g. FINISH -> IDLE after the user dismisses
+# a post-print prompt) is NOT a valid "command landed" signal even though the
+# state value did change. Mirrors the same constant in print_scheduler.py —
+# kept duplicated rather than imported to avoid coupling the two services and
+# to keep the value at the point of use.
+_ACTIVE_PRINT_STATES: frozenset[str] = frozenset({"PREPARE", "SLICING", "RUNNING", "PAUSE"})
+
 
 class DispatchJobCancelled(Exception):
     """Raised when a dispatch job is cancelled by the user."""
@@ -990,9 +1000,19 @@ class BackgroundDispatchService:
                 # within the remaining timeout and still surface a transition.
                 continue
             last_status = state
-            if state.state != pre_state:
+            if state.state in _ACTIVE_PRINT_STATES:
+                # Printer is actively processing the job. We do NOT accept
+                # arbitrary state transitions: a printer going FINISH -> IDLE
+                # (user dismissed the post-print prompt without accepting our
+                # project_file) would otherwise look like "command landed"
+                # and the dispatch job would be marked successful even though
+                # no print is running (#1370).
                 return True
             if pre_subtask_id is not None and state.subtask_id is not None and state.subtask_id != pre_subtask_id:
+                # Printer picked up the job (subtask_id advanced). H2D can
+                # sit at FINISH for ~50 s after accepting project_file before
+                # transitioning to PREPARE, but the subtask_id flips to our
+                # submission_id almost immediately (#1078).
                 return True
         logger.warning(
             "Printer %s (%d) did not respond to print command within %.0fs "

+ 55 - 7
backend/app/services/print_scheduler.py

@@ -11,7 +11,7 @@ from sqlalchemy import func, select
 from sqlalchemy.ext.asyncio import AsyncSession
 
 from backend.app.core.config import settings
-from backend.app.core.database import async_session
+from backend.app.core.database import async_session, run_with_retry
 from backend.app.models.archive import PrintArchive
 from backend.app.models.library import LibraryFile
 from backend.app.models.print_queue import PrintQueueItem
@@ -32,6 +32,15 @@ from backend.app.utils.printer_models import normalize_printer_model
 
 logger = logging.getLogger(__name__)
 
+# Bambu firmware states that mean the project_file has actually been accepted
+# and the printer is now processing / running / paused mid-print. Used by the
+# dispatch watchdog (#1370): a transition into one of these states means the
+# print landed, anything else (e.g. FINISH -> IDLE after the user dismisses
+# a post-print prompt) is NOT a valid "command landed" signal even though the
+# state value did change. SLICING is included because some firmwares park
+# briefly in SLICING between PREPARE and RUNNING while parsing the g-code.
+_ACTIVE_PRINT_STATES: frozenset[str] = frozenset({"PREPARE", "SLICING", "RUNNING", "PAUSE"})
+
 # Filament type equivalence groups — types within the same group are
 # interchangeable on the printer side (Bambu Lab firmware treats them as compatible).
 _FILAMENT_TYPE_GROUPS: list[list[str]] = [
@@ -2029,27 +2038,66 @@ class PrintScheduler:
                 scheduler._release_dispatch_hold(printer_id)
                 return
             last_status = status
-            if status.state != pre_state:
-                # Printer picked up the job (state transition) — release the
+            if status.state in _ACTIVE_PRINT_STATES:
+                # Printer is actively processing the job — release the
                 # post-dispatch hold so the next pending item for this printer
-                # can be evaluated normally.
+                # can be evaluated normally. We do NOT accept arbitrary state
+                # transitions: a printer going FINISH -> IDLE (user dismissed
+                # the post-print prompt without accepting our project_file)
+                # would otherwise look like "command landed" and leave the
+                # queue item stuck in 'printing' forever (#1370).
                 scheduler._release_dispatch_hold(printer_id)
                 return
             if pre_subtask_id is not None and status.subtask_id is not None and status.subtask_id != pre_subtask_id:
-                # Printer picked up the job (subtask_id advanced)
+                # Printer picked up the job (subtask_id advanced). H2D can
+                # sit at FINISH for ~50 s after accepting project_file
+                # before transitioning to PREPARE, but the subtask_id flips
+                # to our submission_id almost immediately (#1078).
                 scheduler._release_dispatch_hold(printer_id)
                 return
 
         # No transition. Revert the item so the scheduler can retry.
         # Drop the in-memory hold so the retry isn't blocked by it.
         scheduler._release_dispatch_hold(printer_id)
-        async with async_session() as db:
+
+        # Three outcomes from the revert attempt, each routed differently:
+        #   "reverted":          row flipped from printing -> pending, run recovery
+        #   "already_moved_on":  item.status != 'printing' (completed/cancelled by
+        #                        on_print_complete or user). Skip recovery entirely
+        #                        — the print clearly landed somewhere even if the
+        #                        watchdog didn't see the active-state transition.
+        #   "revert_failed":     SQLite contention exhausted retries. Still run
+        #                        recovery so the MQTT session gets a fresh client_id
+        #                        on the half-broken-session path.
+        async def _do_revert(db):
             item = await db.get(PrintQueueItem, queue_item_id)
             if not item or item.status != "printing":
-                return  # Already moved on (completed/cancelled/etc.)
+                return "already_moved_on"
             item.status = "pending"
             item.started_at = None
             await db.commit()
+            return "reverted"
+
+        try:
+            revert_outcome = await run_with_retry(_do_revert, label=f"watchdog revert item={queue_item_id}")
+        except Exception as e:
+            logger.warning(
+                "Queue item %s: failed to revert to 'pending' (printer %d): %s — "
+                "scheduler may keep treating this item as in-flight",
+                queue_item_id,
+                printer_id,
+                e,
+            )
+            revert_outcome = "revert_failed"
+
+        if revert_outcome == "already_moved_on":
+            # Preserves the pre-#1370 early-return: if on_print_complete (or any
+            # other path) already moved the item past 'printing', don't run the
+            # MQTT session-recovery logic below — a forced reconnect on a healthy
+            # session breaks ongoing prints on the same printer.
+            return
+
+        if revert_outcome == "reverted":
             logger.warning(
                 "Queue item %s: printer %d did not respond to print command within "
                 "%.0fs (state still %s, subtask_id still %s) — reverted to 'pending' "

+ 8 - 6
backend/app/services/printer_manager.py

@@ -269,14 +269,16 @@ class PrinterManager:
             )
 
     async def _persist_awaiting_plate_clear(self, printer_id: int, awaiting: bool):
-        from backend.app.core.database import async_session
+        from backend.app.core.database import run_with_retry
+
+        async def _do(db):
+            printer = await db.get(Printer, printer_id)
+            if printer is not None:
+                printer.awaiting_plate_clear = awaiting
+                await db.commit()
 
         try:
-            async with async_session() as db:
-                printer = await db.get(Printer, printer_id)
-                if printer is not None:
-                    printer.awaiting_plate_clear = awaiting
-                    await db.commit()
+            await run_with_retry(_do, label=f"persist awaiting_plate_clear printer={printer_id}")
         except Exception as e:
             logger.warning("Failed to persist awaiting_plate_clear for printer %d: %s", printer_id, e)
 

+ 62 - 0
backend/tests/unit/services/test_background_dispatch_watchdog.py

@@ -99,6 +99,68 @@ class TestReturnsFalseOnTimeout:
         assert result is False
         client.force_reconnect_stale_session.assert_called_once()
 
+    @pytest.mark.asyncio
+    async def test_returns_false_on_finish_to_idle_user_dismissed_prompt(self):
+        """Regression for #1370 in the direct-dispatch path: when pre_state is
+        FINISH and the printer transitions to IDLE during the verifier window,
+        that's the user dismissing a post-print prompt — NOT acceptance of our
+        project_file. The original ``state != pre_state`` check incorrectly
+        returned True on this transition, so the dispatch job was marked
+        successful even though no print was running. Must now report failure
+        so the caller raises RuntimeError and the user sees the actual error.
+        """
+        get_status = MagicMock(return_value=_status("IDLE", "OLD_SUBTASK"))
+        client = MagicMock()
+        get_client = MagicMock(return_value=client)
+
+        with (
+            patch(
+                "backend.app.services.background_dispatch.printer_manager.get_status",
+                get_status,
+            ),
+            patch(
+                "backend.app.services.background_dispatch.printer_manager.get_client",
+                get_client,
+            ),
+        ):
+            result = await BackgroundDispatchService._verify_print_response(
+                printer_id=42,
+                printer_name="P1S",
+                pre_state="FINISH",
+                pre_subtask_id="OLD_SUBTASK",
+                timeout=0.2,
+                poll_interval=0.05,
+            )
+
+        assert result is False, (
+            "FINISH -> IDLE is the user dismissing a screen prompt, not the "
+            "printer accepting project_file — verifier must report failure (#1370)"
+        )
+
+    @pytest.mark.asyncio
+    async def test_returns_true_on_each_active_print_state(self):
+        """Counterpart to the #1370 fix: transitions into the active-print
+        state set ARE valid "command landed" signals. PREPARE / SLICING /
+        RUNNING / PAUSE all return True.
+        """
+        for active_state in ("PREPARE", "SLICING", "RUNNING", "PAUSE"):
+            get_status = MagicMock(return_value=_status(active_state, "OLD_SUBTASK"))
+            with patch(
+                "backend.app.services.background_dispatch.printer_manager.get_status",
+                get_status,
+            ):
+                result = await BackgroundDispatchService._verify_print_response(
+                    printer_id=42,
+                    printer_name="P1S",
+                    pre_state="IDLE",
+                    pre_subtask_id="OLD_SUBTASK",
+                    timeout=0.2,
+                    poll_interval=0.05,
+                )
+            assert result is True, (
+                f"transition IDLE -> {active_state} must be treated as a valid 'command landed' signal"
+            )
+
     @pytest.mark.asyncio
     async def test_returns_false_when_pre_subtask_id_none_and_state_unchanged(self):
         """Backward-compat: callers without a captured pre_subtask_id (e.g. the

+ 97 - 2
backend/tests/unit/test_scheduler_watchdog.py

@@ -118,6 +118,7 @@ class TestWatchdogRevertsWhenStuck:
             patch("backend.app.services.print_scheduler.printer_manager.get_status", get_status),
             patch("backend.app.services.print_scheduler.printer_manager.get_client", get_client),
             patch("backend.app.services.print_scheduler.async_session", db_session),
+            patch("backend.app.core.database.async_session", db_session),
         ):
             await PrintScheduler._watchdog_print_start(
                 queue_item_id=1,
@@ -135,6 +136,87 @@ class TestWatchdogRevertsWhenStuck:
 
         client.force_reconnect_stale_session.assert_called_once()
 
+    @pytest.mark.asyncio
+    async def test_reverts_on_finish_to_idle_user_dismissed_prompt(self, db_session):
+        """Regression for #1370: when pre_state is FINISH and the printer
+        transitions to IDLE during the watchdog window, that's the user
+        dismissing a post-print prompt — NOT acceptance of our project_file.
+
+        The bundle in #1370 showed exactly this: queue item dispatched while
+        printer was in FINISH (residual from a previous print), command sent
+        but silently rejected by firmware, then the user manually cleared
+        the screen prompt so the printer moved to IDLE. The original
+        ``state != pre_state`` check returned early on this transition and
+        the queue row was left stuck in 'printing' indefinitely, blocking
+        all future dispatches to that printer.
+
+        The watchdog now only treats transitions into the active-print
+        state set (PREPARE / SLICING / RUNNING / PAUSE) as a valid "command
+        landed" signal.
+        """
+        get_status = MagicMock(return_value=_status("IDLE", "OLD_SUBTASK"))
+        client = MagicMock()
+        get_client = MagicMock(return_value=client)
+
+        with (
+            patch("backend.app.services.print_scheduler.printer_manager.get_status", get_status),
+            patch("backend.app.services.print_scheduler.printer_manager.get_client", get_client),
+            patch("backend.app.services.print_scheduler.async_session", db_session),
+            patch("backend.app.core.database.async_session", db_session),
+        ):
+            await PrintScheduler._watchdog_print_start(
+                queue_item_id=1,
+                printer_id=42,
+                pre_state="FINISH",
+                pre_subtask_id="OLD_SUBTASK",
+                timeout=0.2,
+                poll_interval=0.05,
+            )
+
+        async with db_session() as db:
+            item = await db.get(PrintQueueItem, 1)
+            assert item.status == "pending", (
+                "FINISH -> IDLE is the user dismissing a screen prompt, not "
+                "the printer accepting project_file — item must be reverted "
+                "to 'pending' so the scheduler can retry (#1370)"
+            )
+            assert item.started_at is None
+
+    @pytest.mark.asyncio
+    async def test_does_not_revert_on_pickup_via_active_state(self, db_session):
+        """Counterpart to the #1370 fix: transitions into the active-print
+        state set ARE a valid "command landed" signal. PREPARE / SLICING /
+        RUNNING / PAUSE all keep the item in 'printing'.
+        """
+        for active_state in ("PREPARE", "SLICING", "RUNNING", "PAUSE"):
+            async with db_session() as db:
+                item = await db.get(PrintQueueItem, 1)
+                item.status = "printing"
+                item.started_at = None
+                await db.commit()
+
+            get_status = MagicMock(return_value=_status(active_state, "OLD_SUBTASK"))
+            with (
+                patch("backend.app.services.print_scheduler.printer_manager.get_status", get_status),
+                patch("backend.app.services.print_scheduler.async_session", db_session),
+                patch("backend.app.core.database.async_session", db_session),
+            ):
+                await PrintScheduler._watchdog_print_start(
+                    queue_item_id=1,
+                    printer_id=42,
+                    pre_state="IDLE",
+                    pre_subtask_id="OLD_SUBTASK",
+                    timeout=0.2,
+                    poll_interval=0.05,
+                )
+
+            async with db_session() as db:
+                item = await db.get(PrintQueueItem, 1)
+                assert item.status == "printing", (
+                    f"transition IDLE -> {active_state} must be treated as a "
+                    f"valid 'command landed' signal — watchdog must not revert"
+                )
+
     @pytest.mark.asyncio
     async def test_default_timeout_is_90_seconds(self):
         """The default timeout must cover slow H2D FINISH→PREPARE transitions
@@ -163,6 +245,7 @@ class TestWatchdogFallbackBehaviour:
             patch("backend.app.services.print_scheduler.printer_manager.get_status", get_status),
             patch("backend.app.services.print_scheduler.printer_manager.get_client", get_client),
             patch("backend.app.services.print_scheduler.async_session", db_session),
+            patch("backend.app.core.database.async_session", db_session),
         ):
             await PrintScheduler._watchdog_print_start(
                 queue_item_id=1,
@@ -190,6 +273,7 @@ class TestWatchdogFallbackBehaviour:
             patch("backend.app.services.print_scheduler.printer_manager.get_status", get_status),
             patch("backend.app.services.print_scheduler.printer_manager.get_client", get_client),
             patch("backend.app.services.print_scheduler.async_session", db_session),
+            patch("backend.app.core.database.async_session", db_session),
         ):
             await PrintScheduler._watchdog_print_start(
                 queue_item_id=1,
@@ -231,7 +315,12 @@ class TestWatchdogFallbackBehaviour:
     async def test_no_revert_if_item_already_completed(self, db_session):
         """If the print completed between watchdog arm-time and timeout (item is
         no longer "printing"), the watchdog must not clobber whatever status it
-        ended up in — #967 race guard."""
+        ended up in — #967 race guard. Additionally it must NOT run the MQTT
+        session-recovery path (forced reconnect): when on_print_complete has
+        already moved the row, the print clearly landed on the printer and a
+        forced reconnect on a healthy session would break ongoing prints on
+        the same printer.
+        """
         # Move item on to "completed" before the watchdog fires.
         async with db_session() as db:
             item = await db.get(PrintQueueItem, 1)
@@ -239,12 +328,14 @@ class TestWatchdogFallbackBehaviour:
             await db.commit()
 
         get_status = MagicMock(return_value=_status("FINISH", "OLD_SUBTASK"))
-        get_client = MagicMock(return_value=None)
+        client = MagicMock()  # NOT None — must verify reconnect isn't called
+        get_client = MagicMock(return_value=client)
 
         with (
             patch("backend.app.services.print_scheduler.printer_manager.get_status", get_status),
             patch("backend.app.services.print_scheduler.printer_manager.get_client", get_client),
             patch("backend.app.services.print_scheduler.async_session", db_session),
+            patch("backend.app.core.database.async_session", db_session),
         ):
             await PrintScheduler._watchdog_print_start(
                 queue_item_id=1,
@@ -259,6 +350,8 @@ class TestWatchdogFallbackBehaviour:
             item = await db.get(PrintQueueItem, 1)
             assert item.status == "completed"  # untouched
 
+        client.force_reconnect_stale_session.assert_not_called()
+
 
 class TestGcodeFileDiscriminator:
     """#1150 vs #887/#936: skip the forced reconnect when gcode_file changed
@@ -278,6 +371,7 @@ class TestGcodeFileDiscriminator:
             patch("backend.app.services.print_scheduler.printer_manager.get_status", get_status),
             patch("backend.app.services.print_scheduler.printer_manager.get_client", get_client),
             patch("backend.app.services.print_scheduler.async_session", db_session),
+            patch("backend.app.core.database.async_session", db_session),
         ):
             await PrintScheduler._watchdog_print_start(
                 queue_item_id=1,
@@ -308,6 +402,7 @@ class TestGcodeFileDiscriminator:
             patch("backend.app.services.print_scheduler.printer_manager.get_status", get_status),
             patch("backend.app.services.print_scheduler.printer_manager.get_client", get_client),
             patch("backend.app.services.print_scheduler.async_session", db_session),
+            patch("backend.app.core.database.async_session", db_session),
         ):
             await PrintScheduler._watchdog_print_start(
                 queue_item_id=1,

Some files were not shown because too many files changed in this diff