Browse Source

fix(#1150): skip MQTT reconnect on watchdog timeout when project_file landed

  Background: P1P firmware can take ~135 s after a project_file MQTT publish
  to actually start parsing the uploaded .3mf — gcode_state stays IDLE and
  subtask_id doesn't advance until parse completes. The dispatch watchdogs
  treated the missed transition as a #887/#936 half-broken session and called
  force_reconnect_stale_session, which interrupts the printer's in-progress
  parse and triggers 0500_4003 ("can't parse print file") on the printer side.

  Both #1150 (slow parse) and #887/#936 (zombie session) look identical from
  state and subtask_id alone — both have stale state and stale subtask_id with
  fresh telemetry. The distinguishing signal is the printer's gcode_file
  field: it updates in push_status when the project_file command actually
  lands on the printer, but stays unchanged when the publish was silently
  swallowed.

  Both watchdogs (_verify_print_response in background_dispatch and
  _watchdog_print_start in print_scheduler) now capture pre_gcode_file from
  printer_manager.get_status() before sending the publish, then on timeout
  compare it against the last good status seen during the poll loop. If the
  file changed, the command landed → log a #1150 warning, skip the forced
  reconnect to avoid 0500_4003 mid-parse. If unchanged, fall through to the
  original force_reconnect_stale_session call so the half-broken-session
  recovery is preserved exactly.

  Caveat documented in code: in a retry-same-file slow-parse scenario the
  gcode_file looks identical pre/post-publish, so the watchdog falls through
  to the reconnect path and the user still hits 0500_4003 on that retry.
  Accepted to avoid breaking the half-broken-session recovery, which is the
  more impactful regression of the two.

  The new pre_gcode_file kwarg has a default of None on both watchdog
  functions, so any caller that doesn't pass it keeps the original
  reconnect-on-timeout behavior verbatim.

  4 new unit tests cover both watchdogs: skip on gcode_file change (#1150
  fix), reconnect when unchanged (#936 protection preserved), skip when
  pre=None and current is non-None (printer just connected), reconnect when
  pre_gcode_file arg is omitted (backward-compat). All 439 existing
  dispatch / scheduler / mqtt tests pass unchanged.
maziggy 1 month ago
parent
commit
69b6b5a334

File diff suppressed because it is too large
+ 0 - 0
CHANGELOG.md


+ 33 - 5
backend/app/services/background_dispatch.py

@@ -698,6 +698,7 @@ class BackgroundDispatchService:
                 pre_status = printer_manager.get_status(job.printer_id)
                 pre_state = getattr(pre_status, "state", None) if pre_status else None
                 pre_subtask_id = getattr(pre_status, "subtask_id", None) if pre_status else None
+                pre_gcode_file = getattr(pre_status, "gcode_file", None) if pre_status else None
                 if pre_state:
                     await self._set_active_message(job, f"Waiting for {printer_name} to acknowledge print...")
                     transitioned = await self._verify_print_response(
@@ -705,6 +706,7 @@ class BackgroundDispatchService:
                         printer_name,
                         pre_state,
                         pre_subtask_id=pre_subtask_id,
+                        pre_gcode_file=pre_gcode_file,
                     )
                     if not transitioned:
                         raise RuntimeError(
@@ -889,6 +891,7 @@ class BackgroundDispatchService:
                 pre_status = printer_manager.get_status(job.printer_id)
                 pre_state = getattr(pre_status, "state", None) if pre_status else None
                 pre_subtask_id = getattr(pre_status, "subtask_id", None) if pre_status else None
+                pre_gcode_file = getattr(pre_status, "gcode_file", None) if pre_status else None
                 if pre_state:
                     await self._set_active_message(job, f"Waiting for {printer_name} to acknowledge print...")
                     transitioned = await self._verify_print_response(
@@ -896,6 +899,7 @@ class BackgroundDispatchService:
                         printer_name,
                         pre_state,
                         pre_subtask_id=pre_subtask_id,
+                        pre_gcode_file=pre_gcode_file,
                     )
                     if not transitioned:
                         await db.rollback()
@@ -944,6 +948,7 @@ class BackgroundDispatchService:
         printer_name: str,
         pre_state: str,
         pre_subtask_id: str | None = None,
+        pre_gcode_file: str | None = None,
         timeout: float = 90.0,
         poll_interval: float = 3.0,
     ) -> bool:
@@ -963,6 +968,7 @@ class BackgroundDispatchService:
         landed" signal even while state is still FINISH (#1078).
         """
         deadline = time.monotonic() + timeout
+        last_status = None
         while time.monotonic() < deadline:
             await asyncio.sleep(poll_interval)
             state = printer_manager.get_status(printer_id)
@@ -972,6 +978,7 @@ class BackgroundDispatchService:
                 # failure on the first missed tick; the printer may reconnect
                 # within the remaining timeout and still surface a transition.
                 continue
+            last_status = state
             if state.state != pre_state:
                 return True
             if pre_subtask_id is not None and state.subtask_id is not None and state.subtask_id != pre_subtask_id:
@@ -985,13 +992,34 @@ class BackgroundDispatchService:
             pre_state,
             pre_subtask_id,
         )
-        # Strong signal the MQTT session is half-broken (#887, #936): telemetry
-        # still arrives but our publishes don't reach the printer. Force a fresh
-        # session so the next dispatch can land without a power cycle.
+        # Distinguish #1150 (slow parse) from #887/#936 (half-broken session)
+        # via gcode_file: if the printer is now showing a different file than
+        # before dispatch, the project_file command landed and the printer is
+        # parsing — a forced reconnect mid-parse causes 0500_4003. If
+        # gcode_file is unchanged, the publish was silently swallowed and the
+        # original #936 recovery (force_reconnect → fresh client_id) is what
+        # we want. Caveat: in the rare retry-same-file-after-timeout case the
+        # printer's gcode_file looks identical before and after the publish
+        # lands, so a slow parse on retry-same-file still falls through to the
+        # reconnect (and the original 0500_4003) — accepted to avoid breaking
+        # the half-broken-session recovery path.
         client = printer_manager.get_client(printer_id)
-        if client and hasattr(client, "force_reconnect_stale_session"):
+        current_gcode_file = getattr(last_status, "gcode_file", None) if last_status else None
+        publish_landed = current_gcode_file is not None and current_gcode_file != pre_gcode_file
+        if publish_landed:
+            logger.warning(
+                "Printer %s (%d) gcode_file changed to %r (was %r) — printer "
+                "received the command and is parsing slowly. Skipping forced "
+                "MQTT reconnect to avoid 0500_4003 mid-parse (#1150).",
+                printer_name,
+                printer_id,
+                current_gcode_file,
+                pre_gcode_file,
+            )
+        elif client and hasattr(client, "force_reconnect_stale_session"):
             client.force_reconnect_stale_session(
-                f"print command unacknowledged after {timeout:.0f}s (state still {pre_state})"
+                f"print command unacknowledged after {timeout:.0f}s "
+                f"(state still {pre_state}, gcode_file {current_gcode_file!r})"
             )
         return False
 

+ 25 - 4
backend/app/services/print_scheduler.py

@@ -1851,6 +1851,7 @@ class PrintScheduler:
         pre_status = printer_manager.get_status(item.printer_id)
         pre_state = getattr(pre_status, "state", None) if pre_status else None
         pre_subtask_id = getattr(pre_status, "subtask_id", None) if pre_status else None
+        pre_gcode_file = getattr(pre_status, "gcode_file", None) if pre_status else None
 
         # Start the print with AMS mapping, plate_id and print options
         started = printer_manager.start_print(
@@ -1884,6 +1885,7 @@ class PrintScheduler:
                         item.printer_id,
                         pre_state,
                         pre_subtask_id,
+                        pre_gcode_file,
                     )
                 )
 
@@ -1957,6 +1959,7 @@ class PrintScheduler:
         printer_id: int,
         pre_state: str,
         pre_subtask_id: str | None = None,
+        pre_gcode_file: str | None = None,
         timeout: float = 90.0,
         poll_interval: float = 3.0,
     ) -> None:
@@ -1982,11 +1985,13 @@ class PrintScheduler:
         that also don't emit an early subtask_id tick.
         """
         deadline = time.monotonic() + timeout
+        last_status = None
         while time.monotonic() < deadline:
             await asyncio.sleep(poll_interval)
             status = printer_manager.get_status(printer_id)
             if not status:
                 return  # Printer disconnected — don't mess with the DB
+            last_status = status
             if status.state != pre_state:
                 return  # Printer picked up the job (state transition)
             if pre_subtask_id is not None and status.subtask_id is not None and status.subtask_id != pre_subtask_id:
@@ -2011,12 +2016,28 @@ class PrintScheduler:
                 pre_subtask_id,
             )
 
-        # Same half-broken-session recovery as background_dispatch: force the
-        # MQTT client to reconnect so the next dispatch lands without a power cycle.
+        # Same #1150 / #887/#936 discriminator as background_dispatch: if the
+        # printer's gcode_file changed since pre-dispatch, the project_file
+        # command landed and the printer is parsing — a forced reconnect
+        # mid-parse triggers 0500_4003. If gcode_file is unchanged, the
+        # publish was silently swallowed (#887/#936) and the original
+        # force_reconnect recovery is what we want.
         client = printer_manager.get_client(printer_id)
-        if client and hasattr(client, "force_reconnect_stale_session"):
+        current_gcode_file = getattr(last_status, "gcode_file", None) if last_status else None
+        publish_landed = current_gcode_file is not None and current_gcode_file != pre_gcode_file
+        if publish_landed:
+            logger.warning(
+                "Queue item %s: gcode_file changed to %r (was %r) — printer "
+                "received the command and is parsing slowly. Skipping forced "
+                "MQTT reconnect to avoid 0500_4003 mid-parse (#1150).",
+                queue_item_id,
+                current_gcode_file,
+                pre_gcode_file,
+            )
+        elif client and hasattr(client, "force_reconnect_stale_session"):
             client.force_reconnect_stale_session(
-                f"queue print command unacknowledged after {timeout:.0f}s (state still {pre_state})"
+                f"queue print command unacknowledged after {timeout:.0f}s "
+                f"(state still {pre_state}, gcode_file {current_gcode_file!r})"
             )
 
 

+ 141 - 3
backend/tests/unit/services/test_background_dispatch_watchdog.py

@@ -22,9 +22,9 @@ import pytest
 from backend.app.services.background_dispatch import BackgroundDispatchService
 
 
-def _status(state: str, subtask_id: str | None = None):
-    """Minimal stand-in for PrinterState — only the two fields the watchdog reads."""
-    return SimpleNamespace(state=state, subtask_id=subtask_id)
+def _status(state: str, subtask_id: str | None = None, gcode_file: str | None = None):
+    """Minimal stand-in for PrinterState — only the fields the watchdog reads."""
+    return SimpleNamespace(state=state, subtask_id=subtask_id, gcode_file=gcode_file)
 
 
 class TestReturnsTrueOnPickup:
@@ -224,6 +224,144 @@ class TestDefaults:
         assert sig.parameters["timeout"].default == 90.0
 
 
+class TestGcodeFileDiscriminator:
+    """#1150 vs #887/#936 discriminator: skip the forced reconnect when the
+    printer's gcode_file changed since pre-dispatch (project_file landed,
+    printer is parsing slowly — reconnecting mid-parse causes 0500_4003).
+    Reconnect when gcode_file is unchanged (publish was silently swallowed —
+    half-broken session needs the original recovery)."""
+
+    @pytest.mark.asyncio
+    async def test_skips_reconnect_when_gcode_file_changed(self):
+        get_status = MagicMock(
+            return_value=_status("FINISH", "OLD_SUBTASK", gcode_file="/new.3mf"),
+        )
+        client = MagicMock()
+        get_client = MagicMock(return_value=client)
+
+        with (
+            patch(
+                "backend.app.services.background_dispatch.printer_manager.get_status",
+                get_status,
+            ),
+            patch(
+                "backend.app.services.background_dispatch.printer_manager.get_client",
+                get_client,
+            ),
+        ):
+            result = await BackgroundDispatchService._verify_print_response(
+                printer_id=42,
+                printer_name="P1P",
+                pre_state="FINISH",
+                pre_subtask_id="OLD_SUBTASK",
+                pre_gcode_file="/old.3mf",
+                timeout=0.2,
+                poll_interval=0.05,
+            )
+
+        assert result is False
+        client.force_reconnect_stale_session.assert_not_called()
+
+    @pytest.mark.asyncio
+    async def test_reconnects_when_gcode_file_unchanged(self):
+        # The half-broken-session case (#887/#936): publish was dropped, so
+        # the printer is still showing the previous file. Reconnect to clear
+        # the broken paho QoS-1 queue.
+        get_status = MagicMock(
+            return_value=_status("FINISH", "OLD_SUBTASK", gcode_file="/old.3mf"),
+        )
+        client = MagicMock()
+        get_client = MagicMock(return_value=client)
+
+        with (
+            patch(
+                "backend.app.services.background_dispatch.printer_manager.get_status",
+                get_status,
+            ),
+            patch(
+                "backend.app.services.background_dispatch.printer_manager.get_client",
+                get_client,
+            ),
+        ):
+            await BackgroundDispatchService._verify_print_response(
+                printer_id=42,
+                printer_name="P1P",
+                pre_state="FINISH",
+                pre_subtask_id="OLD_SUBTASK",
+                pre_gcode_file="/old.3mf",
+                timeout=0.2,
+                poll_interval=0.05,
+            )
+
+        client.force_reconnect_stale_session.assert_called_once()
+
+    @pytest.mark.asyncio
+    async def test_skips_reconnect_when_pre_gcode_file_was_none(self):
+        # Printer just connected (pre_gcode_file=None) and now reports a
+        # file — that's a clear "command landed" signal too.
+        get_status = MagicMock(
+            return_value=_status("FINISH", "OLD_SUBTASK", gcode_file="/new.3mf"),
+        )
+        client = MagicMock()
+        get_client = MagicMock(return_value=client)
+
+        with (
+            patch(
+                "backend.app.services.background_dispatch.printer_manager.get_status",
+                get_status,
+            ),
+            patch(
+                "backend.app.services.background_dispatch.printer_manager.get_client",
+                get_client,
+            ),
+        ):
+            await BackgroundDispatchService._verify_print_response(
+                printer_id=42,
+                printer_name="P1P",
+                pre_state="FINISH",
+                pre_subtask_id="OLD_SUBTASK",
+                pre_gcode_file=None,
+                timeout=0.2,
+                poll_interval=0.05,
+            )
+
+        client.force_reconnect_stale_session.assert_not_called()
+
+    @pytest.mark.asyncio
+    async def test_reconnects_when_no_pre_gcode_file_arg_supplied(self):
+        # Backward-compat: callers that don't pass pre_gcode_file at all
+        # (everything but our updated dispatch sites) must still get the
+        # original reconnect-on-timeout behaviour. Here pre_gcode_file
+        # defaults to None and the printer's current gcode_file is also
+        # None → publish_landed=False → reconnect.
+        get_status = MagicMock(
+            return_value=_status("FINISH", "OLD_SUBTASK", gcode_file=None),
+        )
+        client = MagicMock()
+        get_client = MagicMock(return_value=client)
+
+        with (
+            patch(
+                "backend.app.services.background_dispatch.printer_manager.get_status",
+                get_status,
+            ),
+            patch(
+                "backend.app.services.background_dispatch.printer_manager.get_client",
+                get_client,
+            ),
+        ):
+            await BackgroundDispatchService._verify_print_response(
+                printer_id=42,
+                printer_name="P1P",
+                pre_state="FINISH",
+                pre_subtask_id="OLD_SUBTASK",
+                timeout=0.2,
+                poll_interval=0.05,
+            )
+
+        client.force_reconnect_stale_session.assert_called_once()
+
+
 # ---------------------------------------------------------------------------
 # Integration tests: the call sites in _run_reprint_archive and
 # _run_print_library_file must (a) await the watchdog instead of fire-and-

+ 65 - 3
backend/tests/unit/test_scheduler_watchdog.py

@@ -45,9 +45,9 @@ async def db_session():
         await engine.dispose()
 
 
-def _status(state: str, subtask_id: str | None = None):
-    """Minimal stand-in for PrinterState — only the two fields the watchdog reads."""
-    return SimpleNamespace(state=state, subtask_id=subtask_id)
+def _status(state: str, subtask_id: str | None = None, gcode_file: str | None = None):
+    """Minimal stand-in for PrinterState — only the fields the watchdog reads."""
+    return SimpleNamespace(state=state, subtask_id=subtask_id, gcode_file=gcode_file)
 
 
 class TestWatchdogExitsEarlyOnPickup:
@@ -258,3 +258,65 @@ class TestWatchdogFallbackBehaviour:
         async with db_session() as db:
             item = await db.get(PrintQueueItem, 1)
             assert item.status == "completed"  # untouched
+
+
+class TestGcodeFileDiscriminator:
+    """#1150 vs #887/#936: skip the forced reconnect when gcode_file changed
+    (project_file landed, slow parse — reconnecting causes 0500_4003).
+    Reconnect when gcode_file is unchanged (publish dropped — half-broken
+    session needs the original recovery)."""
+
+    @pytest.mark.asyncio
+    async def test_skips_reconnect_when_gcode_file_changed(self, db_session):
+        get_status = MagicMock(
+            return_value=_status("FINISH", "OLD_SUBTASK", gcode_file="/new.3mf"),
+        )
+        client = MagicMock()
+        get_client = MagicMock(return_value=client)
+
+        with (
+            patch("backend.app.services.print_scheduler.printer_manager.get_status", get_status),
+            patch("backend.app.services.print_scheduler.printer_manager.get_client", get_client),
+            patch("backend.app.services.print_scheduler.async_session", db_session),
+        ):
+            await PrintScheduler._watchdog_print_start(
+                queue_item_id=1,
+                printer_id=42,
+                pre_state="FINISH",
+                pre_subtask_id="OLD_SUBTASK",
+                pre_gcode_file="/old.3mf",
+                timeout=0.2,
+                poll_interval=0.05,
+            )
+
+        # Item still reverts (the user-facing failure stays correct), but the
+        # MQTT session is left intact so the slow printer can finish parsing.
+        async with db_session() as db:
+            item = await db.get(PrintQueueItem, 1)
+            assert item.status == "pending"
+        client.force_reconnect_stale_session.assert_not_called()
+
+    @pytest.mark.asyncio
+    async def test_reconnects_when_gcode_file_unchanged(self, db_session):
+        get_status = MagicMock(
+            return_value=_status("FINISH", "OLD_SUBTASK", gcode_file="/old.3mf"),
+        )
+        client = MagicMock()
+        get_client = MagicMock(return_value=client)
+
+        with (
+            patch("backend.app.services.print_scheduler.printer_manager.get_status", get_status),
+            patch("backend.app.services.print_scheduler.printer_manager.get_client", get_client),
+            patch("backend.app.services.print_scheduler.async_session", db_session),
+        ):
+            await PrintScheduler._watchdog_print_start(
+                queue_item_id=1,
+                printer_id=42,
+                pre_state="FINISH",
+                pre_subtask_id="OLD_SUBTASK",
+                pre_gcode_file="/old.3mf",
+                timeout=0.2,
+                poll_interval=0.05,
+            )
+
+        client.force_reconnect_stale_session.assert_called_once()

Some files were not shown because too many files changed in this diff