Prechádzať zdrojové kódy

fix(archive): stop duplicating the job on a backend restart mid-print (#1485)

  A restart during an active print duplicated the running job in the
  archive, and every further restart spawned another. on_print_start
  re-attaches by subtask_id, falling back to a name match plus a 4-hour
  staleness cutoff that cancelled + recreated any name-matched 'printing'
  archive older than 4h - destroying the live archive of every long print.

  Two fixes:
  - start_print records the minted subtask_id (last_dispatch_subtask_id);
    on_print_start falls back to it when the printer hasn't echoed one
    yet, so queue/scheduled archives persist a restart-stable id.
  - Replace the 4h cutoff with a progress-aware check: a name-matched
    'printing' archive resumes whenever the printer reports real (or
    unknown) progress; it is stale only when the printer shows a
    freshly-started print (<1%) on an archive over 2h old.
maziggy 6 dní pred
rodič
commit
745ed847e6

Rozdielové dáta súboru neboli zobrazené, pretože súbor je príliš veľký
+ 0 - 0
CHANGELOG.md


+ 35 - 7
backend/app/main.py

@@ -2039,8 +2039,23 @@ async def on_print_start(printer_id: int, data: dict):
                 # Update archive status to printing
                 archive.status = "printing"
                 archive.started_at = datetime.now(timezone.utc)
-                if subtask_id and not archive.subtask_id:
-                    archive.subtask_id = subtask_id
+                # Persist a restart-stable id so a later restart resumes this
+                # archive by subtask_id instead of name-matching + duplicating
+                # it (#1485). The printer often hasn't echoed subtask_id back
+                # this soon after dispatch, so fall back to the id Bambuddy
+                # minted when it sent the print command. Scoped to this
+                # expected-print branch on purpose: an expected match means
+                # Bambuddy dispatched this exact print in this process, so the
+                # client's last-dispatch id genuinely belongs to it — using it
+                # for an externally-started print could mis-tag the archive.
+                effective_subtask_id = subtask_id
+                if not effective_subtask_id:
+                    _client = printer_manager.get_client(printer_id)
+                    _dispatched = getattr(_client, "last_dispatch_subtask_id", None) if _client else None
+                    if _dispatched:
+                        effective_subtask_id = str(_dispatched).strip() or None
+                if effective_subtask_id and not archive.subtask_id:
+                    archive.subtask_id = effective_subtask_id
                 # #1403 follow-up: VP-queue archives are created with
                 # printer_id=None at queue-add time (we don't know which
                 # printer will run the job yet). When the print actually
@@ -2208,18 +2223,31 @@ async def on_print_start(printer_id: int, data: dict):
                 _load_objects_from_archive(existing_archive, printer_id, logger)
                 return
 
-            # Name-match only: fall back to the legacy 4h staleness heuristic.
+            # Name-match only (no subtask_id to anchor on): decide resume vs.
+            # stale from the printer's *current* progress, not wall-clock age.
+            # A genuinely long print used to trip a blind 4h cutoff and have its
+            # live archive cancelled + duplicated on every backend restart
+            # (#1485). If the printer reports real progress, this name-matched
+            # 'printing' archive IS that ongoing print — resume it whatever its
+            # age. Only treat it as a stale leftover when the printer clearly
+            # shows a different, freshly-started print: near-0% progress on an
+            # archive far too old to still be at 0%. Unknown progress (printer
+            # not connected) never cancels — resuming is the safe default.
             archive_age = datetime.now(timezone.utc) - existing_archive.created_at.replace(tzinfo=timezone.utc)
-            if archive_age.total_seconds() > 4 * 60 * 60:  # 4 hours
+            live_status = printer_manager.get_status(printer_id)
+            live_progress = getattr(live_status, "progress", None) if live_status else None
+            looks_stale = (
+                live_progress is not None and live_progress < 1.0 and archive_age.total_seconds() > 2 * 60 * 60
+            )
+            if looks_stale:
                 logger.warning(
-                    f"Found stale 'printing' archive {existing_archive.id} (age: {archive_age}), "
-                    f"marking as cancelled and creating new archive"
+                    f"Found stale 'printing' archive {existing_archive.id} (age: {archive_age}, "
+                    f"printer progress {live_progress:.0f}%) — marking cancelled and creating new archive"
                 )
                 existing_archive.status = "cancelled"
                 existing_archive.failure_reason = "Stale - print likely cancelled or failed without status update"
                 await db.commit()
                 # Fall through to create new archive (don't return)
-                _existing_archive = None  # Clear so we don't use stale archive
             else:
                 logger.info(
                     f"Skipping duplicate - already have printing archive {existing_archive.id} for {check_name}"

+ 9 - 0
backend/app/services/bambu_mqtt.py

@@ -362,6 +362,12 @@ class BambuMQTTClient:
         self._timelapse_during_print: bool = False  # Track if timelapse was active during this print
         self._last_valid_progress: float = 0.0  # Last non-zero progress (firmware resets on cancel)
         self._last_valid_layer_num: int = 0  # Last non-zero layer (firmware resets on cancel)
+        # The subtask_id minted for the most recent start_print() command. The
+        # printer echoes it back in status, but often not within the first few
+        # seconds — so on_print_start uses this as the id source when the
+        # printer hasn't reported it yet, letting queue/scheduled archives
+        # persist a restart-stable id from the moment they dispatch (#1485).
+        self.last_dispatch_subtask_id: str | None = None
         self._is_dual_nozzle: bool = False  # Set when device.extruder.info has >= 2 entries
         self._message_log: deque[MQTTLogEntry] = deque(maxlen=100)
         self._logging_enabled: bool = False
@@ -3358,6 +3364,9 @@ class BambuMQTTClient:
             # Modulo keeps uniqueness within a ~24-day wrap window; `or 1` guards
             # the (astronomically unlikely) zero case since task_id=0 is rejected.
             submission_id = str(int(time.time() * 1000) % 2_147_483_647 or 1)
+            # Remember it so on_print_start can persist a restart-stable id on
+            # the archive even before the printer echoes subtask_id back (#1485).
+            self.last_dispatch_subtask_id = submission_id
 
             command = {
                 "print": {

+ 18 - 0
backend/tests/unit/services/test_bambu_mqtt.py

@@ -3906,6 +3906,24 @@ class TestStartPrintUniqueIdentityFields:
         assert int(cmd["task_id"]) > 0
         assert len(cmd["task_id"]) <= 64
 
+    def test_last_dispatch_subtask_id_records_the_minted_id(self, mqtt_client):
+        """#1485: start_print records the minted id on the client so
+        on_print_start can persist it on the archive before the printer
+        echoes subtask_id back — letting a later restart resume by id."""
+        assert mqtt_client.last_dispatch_subtask_id is None
+        mqtt_client.start_print("test.3mf")
+        cmd = self._get_published_command(mqtt_client)
+        assert mqtt_client.last_dispatch_subtask_id == cmd["subtask_id"]
+
+    def test_last_dispatch_subtask_id_updates_per_submission(self, mqtt_client):
+        """Each dispatch overwrites the recorded id with the new submission's."""
+        mqtt_client.start_print("test.3mf")
+        first = mqtt_client.last_dispatch_subtask_id
+        time.sleep(0.002)
+        mqtt_client.start_print("test.3mf")
+        assert mqtt_client.last_dispatch_subtask_id != first
+        assert mqtt_client.last_dispatch_subtask_id == self._get_published_command(mqtt_client)["subtask_id"]
+
     def test_submission_id_fits_signed_int32(self, mqtt_client):
         """Regression for #1042: P1S firmware clamps oversized task identity
         fields to signed int32 max (2**31-1 = 2147483647). If we send raw

+ 51 - 1
backend/tests/unit/test_subtask_archive_resume.py

@@ -10,7 +10,12 @@ for a print that actually ran 13h08m.
 The fix stores `subtask_id` (MQTT-provided job identifier) on the archive row.
 On print-start detection, the handler first tries to match an existing
 archive by subtask_id regardless of age — same id ⇒ same print ⇒ resume.
-Only unmatched prints fall through to the legacy 4h staleness heuristic.
+Only unmatched prints fall through to the name-based fallback.
+
+#1485 follow-up: the name-based fallback no longer cancels on a blind 4h
+age cutoff (which duplicated the archive of any genuinely long print on
+every restart). It now decides resume-vs-stale from the printer's current
+progress — see TestStaleVsResume.
 """
 
 from datetime import datetime, timedelta, timezone
@@ -183,3 +188,48 @@ class TestSubtaskIdResume:
         )
         found = result.scalar_one_or_none()
         assert found is None
+
+
+def _looks_stale(live_progress: float | None, archive_age_seconds: float) -> bool:
+    """Mirrors the name-fallback stale decision in main.on_print_start (#1485).
+
+    A name-matched 'printing' archive is treated as a stale leftover ONLY when
+    the printer clearly shows a different, freshly-started print: near-0%
+    progress on an archive far too old to still be at 0%. Real progress, or
+    unknown progress (printer not connected), always resumes — the old blind
+    4h age cutoff cancelled the live archive of every long print on restart.
+    """
+    return live_progress is not None and live_progress < 1.0 and archive_age_seconds > 2 * 60 * 60
+
+
+class TestStaleVsResume:
+    """The progress-aware replacement for the 4h staleness heuristic (#1485)."""
+
+    def test_long_print_in_progress_resumes_not_stale(self):
+        """The reporter's case: a ~10h print, backend restarts, printer is
+        mid-print at 60%. The old 4h cutoff cancelled + duplicated it; it
+        must now resume regardless of age."""
+        assert _looks_stale(60.0, archive_age_seconds=10 * 3600) is False
+
+    def test_barely_started_long_print_resumes(self):
+        """A genuine print a few percent in is still the same print."""
+        assert _looks_stale(3.0, archive_age_seconds=5 * 3600) is False
+
+    def test_fresh_print_with_old_archive_is_stale(self):
+        """Printer reports a just-started print (~0%) but the matched archive
+        is hours old — that archive is a dead leftover from a previous run."""
+        assert _looks_stale(0.0, archive_age_seconds=9 * 3600) is True
+
+    def test_fresh_print_with_young_archive_resumes(self):
+        """~0% progress on a young archive is just the same print still
+        heating / leveling — not stale."""
+        assert _looks_stale(0.0, archive_age_seconds=20 * 60) is False
+
+    def test_unknown_progress_never_cancels(self):
+        """Printer not connected / progress unknown: resuming is the safe
+        default — never cancel + duplicate when we can't tell."""
+        assert _looks_stale(None, archive_age_seconds=10 * 3600) is False
+
+    def test_sub_one_percent_old_archive_is_stale(self):
+        """The boundary: just under 1% past the 2h mark counts as stale."""
+        assert _looks_stale(0.5, archive_age_seconds=3 * 3600) is True

Niektoré súbory nie sú zobrazené, pretože je v týchto rozdielových dátach zmenené mnoho súborov