Przeglądaj źródła

fix(mqtt): reset unanswered counter on any ams_filament_setting response (#1164)

  Configuring AMS slots ~6 times in a row would silently stop reaching
  the printer, with filament colours jumping around briefly ~1 min
  later. Root cause was the zombie-session watchdog from #887.

  When an ams_filament_setting response took >10 s (normal under load)
  the watchdog set `_ams_cmd_unanswered=1` and zeroed
  `_last_ams_cmd_time` so it wouldn't re-fire on every status push.
  The response handler that resets the counter required
  `_last_ams_cmd_time > 0` — so when the late response arrived, the
  reset path skipped it, leaving the counter armed at 1. The next
  slow response on a fresh command (possibly minutes or hours later)
  would take the counter to 2 and force-reconnect mid-publish — the
  in-flight command got dropped, surfacing as "Cannot set AMS
  filament setting: not connected" if the user retried during the
  ~1 min reconnect window.

  Fix: drop the `_last_ams_cmd_time > 0` guard. Any
  ams_filament_setting response proves the channel is alive, so the
  counter must reset unconditionally. Real zombie sessions (no
  responses at all for two consecutive >10 s windows) still trip the
  watchdog correctly.

  Regression test in test_bambu_mqtt.py drives the exact reporter
  sequence: watchdog fires (clears timer, increments counter) →
  late response arrives (must reset counter) → next slow response
  (must only count as 1, not 2). Other 10 zombie-detection tests
  still pass.
maziggy 4 tygodni temu
rodzic
commit
dac6cbfe4e

Plik diff jest za duży
+ 1 - 0
CHANGELOG.md


+ 14 - 2
backend/app/services/bambu_mqtt.py

@@ -857,8 +857,20 @@ class BambuMQTTClient:
                     and print_data.get("sequence_id") == self._dev_mode_probe_seq
                     and print_data.get("sequence_id") == self._dev_mode_probe_seq
                 ):
                 ):
                     self._handle_dev_mode_probe_response(print_data)
                     self._handle_dev_mode_probe_response(print_data)
-                # Track user-initiated ams_filament_setting responses (#887 zombie detection)
-                elif cmd == "ams_filament_setting" and self._last_ams_cmd_time > 0:
+                # Track user-initiated ams_filament_setting responses (#887
+                # zombie detection). Reset both the timer AND the unanswered
+                # counter on ANY response — the response proves the channel is
+                # alive, so the counter must not stay armed even when the
+                # watchdog already zeroed `_last_ams_cmd_time` on a previous
+                # tick. The original `and self._last_ams_cmd_time > 0` guard
+                # caused #1164: one sluggish response (>10s) would set the
+                # counter to 1 and zero the timer; the late response arrived
+                # but was ignored by this branch (timer is 0); the counter
+                # stayed at 1 indefinitely; the very next slow response —
+                # possibly hours later, on a totally unrelated command — would
+                # take it to 2 and force-reconnect, surfacing as "filament
+                # config doesn't reach the printer ~6 changes in".
+                elif cmd == "ams_filament_setting":
                     self._last_ams_cmd_time = 0.0
                     self._last_ams_cmd_time = 0.0
                     self._ams_cmd_unanswered = 0
                     self._ams_cmd_unanswered = 0
             if "command" in print_data and print_data.get("command") == "extrusion_cali_get":
             if "command" in print_data and print_data.get("command") == "extrusion_cali_get":

+ 48 - 0
backend/tests/unit/services/test_bambu_mqtt.py

@@ -4140,6 +4140,54 @@ class TestZombieSessionDetection:
         assert mqtt_client._ams_cmd_unanswered == 1
         assert mqtt_client._ams_cmd_unanswered == 1
         assert mqtt_client.state.connected is True  # no reconnect
         assert mqtt_client.state.connected is True  # no reconnect
 
 
+    def test_late_response_after_watchdog_clears_counter_issue_1164(self, mqtt_client):
+        """Regression for #1164: a late ams_filament_setting response — one
+        that arrives AFTER the watchdog has already zeroed
+        `_last_ams_cmd_time` and incremented the unanswered counter — must
+        still reset the counter. Without this, a single sluggish response
+        leaves the counter armed at 1 indefinitely; the next slow response
+        on a totally unrelated command (possibly minutes or hours later)
+        takes it to 2 and force-reconnects, surfacing as 'AMS slot config
+        doesn't reach the printer ~6 changes in'."""
+        import time
+
+        # First command publishes, then doesn't get a response for >10s.
+        # Watchdog fires: counter=1, _last_ams_cmd_time zeroed.
+        mqtt_client._last_ams_cmd_time = time.monotonic() - 11.0
+        mqtt_client._update_state({"gcode_state": "IDLE"})
+        assert mqtt_client._ams_cmd_unanswered == 1
+        assert mqtt_client._last_ams_cmd_time == 0.0  # watchdog cleared it
+
+        # Late response arrives — the `_process_message` path used to require
+        # `_last_ams_cmd_time > 0` before resetting the counter, so this would
+        # have silently been ignored.
+        mqtt_client._process_message(
+            {
+                "print": {
+                    "command": "ams_filament_setting",
+                    "sequence_id": "0",
+                    "result": "success",
+                    "reason": "success",
+                }
+            }
+        )
+
+        # Counter MUST be reset — the response proves the channel is alive.
+        assert mqtt_client._ams_cmd_unanswered == 0, (
+            "Late ams_filament_setting response must reset the unanswered "
+            "counter even when the watchdog already zeroed _last_ams_cmd_time. "
+            "If this assertion fails the #1164 regression is back: a single "
+            "sluggish response will leave the counter armed and cause a "
+            "spurious force_reconnect on the next slow response."
+        )
+
+        # Now even if a future command times out, the counter starts fresh
+        # and a single timeout doesn't trip the 2x reconnect threshold.
+        mqtt_client._last_ams_cmd_time = time.monotonic() - 11.0
+        mqtt_client._update_state({"gcode_state": "IDLE"})
+        assert mqtt_client._ams_cmd_unanswered == 1
+        assert mqtt_client.state.connected is True  # no force reconnect
+
     def test_on_connect_resets_tracking(self, mqtt_client):
     def test_on_connect_resets_tracking(self, mqtt_client):
         """_on_connect resets zombie tracking fields."""
         """_on_connect resets zombie tracking fields."""
         import time
         import time

Niektóre pliki nie zostały wyświetlone z powodu dużej ilości zmienionych plików