Procházet zdrojové kódy

fix(timelapse): capture baseline on restart-recovery so post-reboot timelapses attach (follow up issue #1485)

  When Bambuddy is restarted mid-print, the first MQTT push from the
  printer carries `_previous_gcode_state = None`. The #1304 guard
  deliberately suppresses on_print_start on that first push to prevent
  duplicate archive creation — but on_print_start is also where
  _capture_timelapse_baseline_at_start runs, so the in-memory
  _timelapse_baselines dict stays empty for the resumed session.

  At PRINT COMPLETE, _scan_for_timelapse_with_retries finds no baseline
  and falls into its "take baseline now" fallback. By that point the
  printer has already uploaded the in-flight MP4, so the snapshot
  includes the new file. Every "Found N files / no new files since
  baseline" retry then fails to detect a diff, and the archive ends up
  with no timelapse attached — pwostran's report (#1485 follow-up): card
  shows the finish snapshot but no video.

  Add a sibling callback on_print_running_observed that bambu_mqtt fires
  in the "Now tracking RUNNING state" branch when on_print_start was
  suppressed. main.py wires it to a thin handler that looks up the
  printer row and calls the existing _capture_timelapse_baseline_at_start.
  Idempotent — skips if a baseline already exists (handles the rare
  same-session race where on_print_start also fires for some reason).

  The printer doesn't upload the timelapse until after PRINT COMPLETE,
  so a baseline captured any time during the print is still pre-upload —
  no narrow window to hit.

  Verified against the in-the-field logs in #1485 (pwostran's 2026-05-23
  support bundle):
    pre-reboot: baseline = 7 files
    reboot
    post-reboot completion: fallback baseline = 8 files (includes new MP4)
    -> all 4 retry attempts report "no new files since baseline"

  10 new tests cover both the MQTT-side fire decision (fires when
  suppressed, doesn't fire when on_print_start handles it, once per
  session, payload shape mirrors on_print_start) and the main.py
  handler (snapshot capture, double-capture guard, missing-printer-row
  guard).
maziggy před 4 dny
rodič
revize
ed08ed3787

Rozdílová data souboru nebyla zobrazena, protože soubor je příliš velký
+ 0 - 0
CHANGELOG.md


+ 47 - 0
backend/app/main.py

@@ -3024,6 +3024,52 @@ async def _scan_for_timelapse_with_retries(archive_id: int, baseline_names: set[
     logger.warning("[TIMELAPSE] All attempts exhausted for archive %s, giving up", archive_id)
 
 
+async def on_print_running_observed(printer_id: int, data: dict):
+    """Restart-recovery: capture a fresh timelapse baseline for a print that
+    started before Bambuddy came up.
+
+    bambu_mqtt.py suppresses ``on_print_start`` on the first RUNNING push
+    after Bambuddy startup (#1304 guard, prevents duplicate archive
+    creation). Without that path, ``_capture_timelapse_baseline_at_start``
+    never runs and ``_scan_for_timelapse_with_retries`` falls into its
+    "take baseline now" fallback at completion time — but by then the
+    printer has already uploaded the in-flight MP4, so the baseline
+    includes it and no diff ever matches (#1485 follow-up).
+
+    Fires once per session, in lieu of on_print_start when restart-recovery
+    kicks in. The printer doesn't upload the timelapse until after PRINT
+    COMPLETE, so a baseline captured any time during the print is still
+    pre-upload.
+    """
+    logger = logging.getLogger(__name__)
+
+    # Avoid double-capture: on_print_start may have run earlier in this
+    # Bambuddy process if the print started AFTER startup and we crashed
+    # later in the same session. (Realistically this can't happen — the
+    # MQTT client object would have been recreated — but the cheap guard
+    # is correct regardless.)
+    if printer_id in _timelapse_baselines:
+        logger.debug(
+            "[TIMELAPSE] on_print_running_observed: baseline already present for printer %s, skipping",
+            printer_id,
+        )
+        return
+
+    async with async_session() as db:
+        from backend.app.models.printer import Printer
+
+        result = await db.execute(select(Printer).where(Printer.id == printer_id))
+        printer = result.scalar_one_or_none()
+        if not printer:
+            logger.warning(
+                "[TIMELAPSE] on_print_running_observed: printer %s not found in DB, skipping baseline",
+                printer_id,
+            )
+            return
+
+    await _capture_timelapse_baseline_at_start(printer, printer_id, logger)
+
+
 async def on_print_complete(printer_id: int, data: dict):
     """Handle print completion - update the archive status."""
     import time
@@ -4721,6 +4767,7 @@ async def lifespan(app: FastAPI):
     printer_manager.set_status_change_callback(on_printer_status_change)
     printer_manager.set_print_start_callback(on_print_start)
     printer_manager.set_print_complete_callback(on_print_complete)
+    printer_manager.set_print_running_observed_callback(on_print_running_observed)
     printer_manager.set_ams_change_callback(on_ams_change)
 
     # Rehydrate persisted awaiting-plate-clear gate (#961) so prompts survive restarts

+ 37 - 0
backend/app/services/bambu_mqtt.py

@@ -333,6 +333,7 @@ class BambuMQTTClient:
         on_layer_change: Callable[[int], None] | None = None,
         on_bed_temp_update: Callable[[float], None] | None = None,
         on_drying_complete: Callable[[int], None] | None = None,
+        on_print_running_observed: Callable[[dict], None] | None = None,
     ):
         self.ip_address = ip_address
         self.serial_number = serial_number
@@ -348,6 +349,14 @@ class BambuMQTTClient:
         # the drying cycle just finished (auto- or manually-triggered).
         # Receives the AMS id of the unit that finished drying.
         self.on_drying_complete = on_drying_complete
+        # #1485 follow-up: fired the first time we see RUNNING state in a
+        # session WHEN on_print_start was suppressed (Bambuddy started mid-
+        # print, the #1304 first-push guard skipped the start event). Lets
+        # main.py capture a fresh timelapse baseline at restart-recovery
+        # time so the completion-time snapshot-diff still works. Receives
+        # the same shape as on_print_start (filename / subtask_name /
+        # remaining_time / raw_data / ams_mapping).
+        self.on_print_running_observed = on_print_running_observed
         # Per-AMS previous dry_time, used to detect the falling edge above.
         # Seeded lazily as we observe each AMS unit.
         self._previous_dry_times: dict[int, int] = {}
@@ -2909,6 +2918,7 @@ class BambuMQTTClient:
         )
 
         # Track RUNNING state for more robust completion detection
+        running_first_observed = False
         if self.state.state == "RUNNING" and current_file:
             if not self._was_running:
                 logger.debug("[%s] Now tracking RUNNING state for %s", self.serial_number, current_file)
@@ -2916,6 +2926,14 @@ class BambuMQTTClient:
                 if self.state.timelapse:
                     self._timelapse_during_print = True
                     logger.debug("[%s] Timelapse detected when entering RUNNING state", self.serial_number)
+                # Mark this as the first RUNNING observation of the session.
+                # If is_new_print also fires below, on_print_start handles
+                # baseline capture and we suppress on_print_running_observed
+                # to avoid double-capture. If is_new_print does NOT fire
+                # (Bambuddy started mid-print — the #1304 guard suppressed
+                # it), main.py needs this hook to catch the restart-recovery
+                # case (#1485 follow-up).
+                running_first_observed = True
             self._was_running = True
             self._completion_triggered = False
 
@@ -2961,6 +2979,25 @@ class BambuMQTTClient:
                     "ams_mapping": self._captured_ams_mapping,
                 }
             )
+        elif running_first_observed and self.on_print_running_observed:
+            # Restart-recovery hook (#1485 follow-up): Bambuddy started mid-
+            # print, so the #1304 first-push guard suppressed on_print_start,
+            # but we still need main.py to capture a fresh timelapse baseline
+            # before the printer uploads the in-flight MP4. Same payload
+            # shape as on_print_start so the consumer can reuse fields.
+            logger.info(
+                f"[{self.serial_number}] RUNNING observed without PRINT START "
+                f"(restart-recovery) - file: {current_file}, subtask: {self.state.subtask_name}"
+            )
+            self.on_print_running_observed(
+                {
+                    "filename": current_file,
+                    "subtask_name": self.state.subtask_name,
+                    "remaining_time": self.state.remaining_time * 60 if self.state.remaining_time > 0 else None,
+                    "raw_data": data,
+                    "ams_mapping": self._captured_ams_mapping,
+                }
+            )
 
         # Detect print completion (FINISH = success, FAILED = error, IDLE = aborted)
         # Use _was_running flag in addition to _previous_gcode_state for more robust detection

+ 15 - 0
backend/app/services/printer_manager.py

@@ -169,6 +169,7 @@ class PrinterManager:
         self._printer_info: dict[int, PrinterInfo] = {}  # Cache printer name/serial for callbacks
         self._on_print_start: Callable[[int, dict], None] | None = None
         self._on_print_complete: Callable[[int, dict], None] | None = None
+        self._on_print_running_observed: Callable[[int, dict], None] | None = None
         self._on_status_change: Callable[[int, PrinterState], None] | None = None
         self._on_ams_change: Callable[[int, list], None] | None = None
         self._on_layer_change: Callable[[int, int], None] | None = None
@@ -309,6 +310,15 @@ class PrinterManager:
         """Set callback for print completion events."""
         self._on_print_complete = callback
 
+    def set_print_running_observed_callback(self, callback: Callable[[int, dict], None]):
+        """Set callback for restart-recovery RUNNING-state observations (#1485
+        follow-up). Fires the first time we see ``state == RUNNING`` for a
+        printer that started its print before Bambuddy came up — the #1304
+        guard suppresses ``on_print_start`` for these, so anything that
+        normally hangs off it (e.g. timelapse baseline capture) needs this
+        hook to recover."""
+        self._on_print_running_observed = callback
+
     def set_status_change_callback(self, callback: Callable[[int, PrinterState], None]):
         """Set callback for status change events."""
         self._on_status_change = callback
@@ -372,6 +382,10 @@ class PrinterManager:
             if self._on_print_complete:
                 self._schedule_async(self._on_print_complete(printer_id, data))
 
+        def on_print_running_observed(data: dict):
+            if self._on_print_running_observed:
+                self._schedule_async(self._on_print_running_observed(printer_id, data))
+
         def on_ams_change(ams_data: list):
             if self._on_ams_change:
                 self._schedule_async(self._on_ams_change(printer_id, ams_data))
@@ -400,6 +414,7 @@ class PrinterManager:
             on_layer_change=on_layer_change,
             on_bed_temp_update=on_bed_temp_update,
             on_drying_complete=on_drying_complete,
+            on_print_running_observed=on_print_running_observed,
         )
 
         client.connect()

+ 191 - 0
backend/tests/unit/services/test_bambu_mqtt.py

@@ -5217,3 +5217,194 @@ class TestDryingCompleteCallback:
         # Drying genuinely finishes → the real edge still fires exactly once.
         mqtt_client._handle_ams_data({"ams": [{"id": "0", "dry_time": 0, "tray": []}]})
         assert mqtt_client._drying_events == [0]
+
+
+class TestPrintRunningObservedCallback:
+    """#1485 follow-up: on_print_running_observed fires the FIRST time we
+    see ``state == RUNNING`` for a printer whose print started before
+    Bambuddy came up. It lets main.py capture a timelapse baseline at
+    restart-recovery time — when on_print_start was suppressed by the
+    #1304 first-push guard. Must NOT fire when on_print_start handles the
+    transition (avoids double-capture), and must NOT fire again after
+    the first observation in the same session.
+    """
+
+    @pytest.fixture
+    def mqtt_client(self):
+        from backend.app.services.bambu_mqtt import BambuMQTTClient
+
+        return BambuMQTTClient(
+            ip_address="192.168.1.100",
+            serial_number="TEST123",
+            access_code="12345678",
+        )
+
+    def test_fires_on_first_running_push_after_startup(self, mqtt_client):
+        """First push the client sees has _previous_gcode_state=None, so the
+        #1304 guard suppresses on_print_start. on_print_running_observed
+        must fire instead so the consumer can recover."""
+        start_calls: list[dict] = []
+        running_observed_calls: list[dict] = []
+        mqtt_client.on_print_start = lambda data: start_calls.append(data)
+        mqtt_client.on_print_running_observed = lambda data: running_observed_calls.append(data)
+
+        # Pristine state — exactly what we have right after BambuMQTTClient
+        # construction following a Bambuddy restart.
+        mqtt_client._was_running = False
+        mqtt_client._previous_gcode_state = None
+
+        mqtt_client._process_message(
+            {
+                "print": {
+                    "gcode_state": "RUNNING",
+                    "gcode_file": "/data/Metadata/test_print.gcode",
+                    "subtask_name": "Test_Print",
+                }
+            }
+        )
+
+        assert start_calls == [], "on_print_start must be suppressed by the #1304 guard"
+        assert len(running_observed_calls) == 1
+        assert running_observed_calls[0]["filename"] == "/data/Metadata/test_print.gcode"
+        assert running_observed_calls[0]["subtask_name"] == "Test_Print"
+
+    def test_does_not_fire_when_print_start_fires(self, mqtt_client):
+        """Normal print start (a real state transition from non-RUNNING to
+        RUNNING) goes through on_print_start; on_print_running_observed
+        must stay quiet so the consumer doesn't capture the baseline twice."""
+        start_calls: list[dict] = []
+        running_observed_calls: list[dict] = []
+        mqtt_client.on_print_start = lambda data: start_calls.append(data)
+        mqtt_client.on_print_running_observed = lambda data: running_observed_calls.append(data)
+
+        mqtt_client._was_running = False
+        mqtt_client._previous_gcode_state = "IDLE"  # Not None — past the #1304 guard
+
+        mqtt_client._process_message(
+            {
+                "print": {
+                    "gcode_state": "RUNNING",
+                    "gcode_file": "/data/Metadata/test_print.gcode",
+                    "subtask_name": "Test_Print",
+                }
+            }
+        )
+
+        assert len(start_calls) == 1, "on_print_start should fire on a real start transition"
+        assert running_observed_calls == [], "on_print_running_observed must not double up with on_print_start"
+
+    def test_fires_only_once_per_session(self, mqtt_client):
+        """Subsequent RUNNING pushes in the same session must not re-fire the
+        callback — the baseline only needs to be captured once, the consumer
+        treats repeat calls as a hint to skip via the in-memory dict guard."""
+        running_observed_calls: list[dict] = []
+        mqtt_client.on_print_running_observed = lambda data: running_observed_calls.append(data)
+
+        mqtt_client._was_running = False
+        mqtt_client._previous_gcode_state = None
+
+        msg = {
+            "print": {
+                "gcode_state": "RUNNING",
+                "gcode_file": "/data/Metadata/test_print.gcode",
+                "subtask_name": "Test_Print",
+            }
+        }
+        mqtt_client._process_message(msg)
+        mqtt_client._process_message(msg)
+        mqtt_client._process_message(msg)
+
+        assert len(running_observed_calls) == 1
+
+    def test_does_not_fire_when_not_running(self, mqtt_client):
+        """An IDLE / PREPARE / FINISH first-push must not trigger the
+        restart-recovery path — there's no print to baseline."""
+        running_observed_calls: list[dict] = []
+        mqtt_client.on_print_running_observed = lambda data: running_observed_calls.append(data)
+
+        mqtt_client._was_running = False
+        mqtt_client._previous_gcode_state = None
+
+        mqtt_client._process_message(
+            {
+                "print": {
+                    "gcode_state": "IDLE",
+                    "gcode_file": "",
+                    "subtask_name": "",
+                }
+            }
+        )
+
+        assert running_observed_calls == []
+
+    def test_does_not_fire_without_current_file(self, mqtt_client):
+        """RUNNING with no file is ill-formed (firmware glitch / transient).
+        We need ``current_file`` to find the right archive, so skip the
+        callback rather than fire it with a meaningless payload."""
+        running_observed_calls: list[dict] = []
+        mqtt_client.on_print_running_observed = lambda data: running_observed_calls.append(data)
+
+        mqtt_client._was_running = False
+        mqtt_client._previous_gcode_state = None
+
+        mqtt_client._process_message(
+            {
+                "print": {
+                    "gcode_state": "RUNNING",
+                    "gcode_file": "",
+                    "subtask_name": "",
+                }
+            }
+        )
+
+        assert running_observed_calls == []
+
+    def test_safe_when_callback_not_set(self, mqtt_client):
+        """No callback configured → silently skip; no AttributeError on the
+        firing branch."""
+        mqtt_client.on_print_running_observed = None
+        mqtt_client._was_running = False
+        mqtt_client._previous_gcode_state = None
+
+        # Should not raise.
+        mqtt_client._process_message(
+            {
+                "print": {
+                    "gcode_state": "RUNNING",
+                    "gcode_file": "/data/Metadata/test_print.gcode",
+                    "subtask_name": "Test_Print",
+                }
+            }
+        )
+
+        assert mqtt_client._was_running is True
+
+    def test_payload_shape_matches_print_start(self, mqtt_client):
+        """The payload shape must mirror on_print_start so main.py's
+        consumer can reuse the same dict fields (filename / subtask_name /
+        remaining_time / raw_data / ams_mapping). Test pins the keys."""
+        running_observed_calls: list[dict] = []
+        mqtt_client.on_print_running_observed = lambda data: running_observed_calls.append(data)
+        mqtt_client._was_running = False
+        mqtt_client._previous_gcode_state = None
+
+        mqtt_client._process_message(
+            {
+                "print": {
+                    "gcode_state": "RUNNING",
+                    "gcode_file": "/data/Metadata/test_print.gcode",
+                    "subtask_name": "Test_Print",
+                    "mc_remaining_time": 42,
+                }
+            }
+        )
+
+        assert len(running_observed_calls) == 1
+        payload = running_observed_calls[0]
+        assert set(payload.keys()) == {
+            "filename",
+            "subtask_name",
+            "remaining_time",
+            "raw_data",
+            "ams_mapping",
+        }

+ 152 - 0
backend/tests/unit/test_timelapse_baseline_restart_recovery.py

@@ -0,0 +1,152 @@
+"""Regression for #1485 follow-up: timelapse baseline on restart-recovery.
+
+When Bambuddy restarts mid-print, the first MQTT push has
+``_previous_gcode_state = None`` which the #1304 guard treats as "first push
+after Bambuddy startup, don't fire on_print_start" — avoiding duplicate
+archive creation. But that path is also where ``_capture_timelapse_baseline_at_start``
+lives, so without a separate hook the baseline is never captured. The
+completion-time scan then falls into its "take baseline now" fallback
+that snapshots the SD card AFTER the in-flight MP4 has landed, the new
+file ends up in the baseline set, and no diff ever matches.
+
+bambu_mqtt.py:_process_message now fires a sibling ``on_print_running_observed``
+callback in this case. main.py wires it to ``on_print_running_observed``
+which captures the baseline. These tests verify that handler.
+"""
+
+from unittest.mock import AsyncMock, MagicMock, patch
+
+import pytest
+
+from backend.app.main import _timelapse_baselines
+
+
+@pytest.fixture(autouse=True)
+def _clear_baselines():
+    _timelapse_baselines.clear()
+    yield
+    _timelapse_baselines.clear()
+
+
+@pytest.mark.asyncio
+async def test_running_observed_captures_baseline_on_restart_recovery():
+    """The handler must capture the printer's existing-videos snapshot so
+    the completion-time scan has something to set-diff against. This is
+    the case the in-the-field pwostran report (#1485) hits: pre-reboot
+    baseline of 7 files lost on restart, post-reboot fallback baseline
+    sees the 8 files (including the just-uploaded one) → no new file."""
+    mock_printer = MagicMock()
+    mock_printer.id = 1
+    mock_printer.name = "TestP1S"
+    mock_printer.ip_address = "192.168.1.100"
+    mock_printer.access_code = "12345678"
+    mock_printer.model = "P1S"
+
+    existing_videos = [
+        {"name": "earlier_a.mp4", "is_directory": False, "path": "/timelapse/earlier_a.mp4"},
+        {"name": "earlier_b.mp4", "is_directory": False, "path": "/timelapse/earlier_b.mp4"},
+        {"name": "earlier_c.mp4", "is_directory": False, "path": "/timelapse/earlier_c.mp4"},
+    ]
+
+    def execute_router(stmt, *args, **kwargs):
+        return MagicMock(scalar_one_or_none=MagicMock(return_value=mock_printer))
+
+    mock_session = AsyncMock()
+    mock_session.__aenter__ = AsyncMock(return_value=mock_session)
+    mock_session.__aexit__ = AsyncMock()
+    mock_session.execute = AsyncMock(side_effect=execute_router)
+
+    with (
+        patch("backend.app.main.async_session") as mock_session_maker,
+        patch(
+            "backend.app.main._list_timelapse_videos",
+            new=AsyncMock(return_value=(existing_videos, "/timelapse")),
+        ),
+    ):
+        mock_session_maker.return_value = mock_session
+
+        from backend.app.main import on_print_running_observed
+
+        await on_print_running_observed(
+            1,
+            {
+                "filename": "/data/Metadata/test_print.gcode",
+                "subtask_name": "Test_Print",
+                "remaining_time": 3600,
+                "raw_data": {},
+                "ams_mapping": None,
+            },
+        )
+
+    assert _timelapse_baselines.get(1) == {"earlier_a.mp4", "earlier_b.mp4", "earlier_c.mp4"}, (
+        "restart-recovery handler must capture the printer's existing-videos "
+        "baseline so the completion-time scan can set-diff to find the new file"
+    )
+
+
+@pytest.mark.asyncio
+async def test_running_observed_skips_when_baseline_already_present():
+    """If on_print_start already ran in this Bambuddy process for the same
+    printer (the realistic same-session race), a second capture would
+    overwrite the correct pre-print baseline with one taken later — which
+    could include the in-flight MP4. Skip when a baseline exists."""
+    _timelapse_baselines[1] = {"pre_existing_a.mp4", "pre_existing_b.mp4"}
+
+    with (
+        patch("backend.app.main.async_session") as mock_session_maker,
+        patch("backend.app.main._list_timelapse_videos", new=AsyncMock()) as mock_list,
+    ):
+        from backend.app.main import on_print_running_observed
+
+        await on_print_running_observed(
+            1,
+            {
+                "filename": "/data/Metadata/test_print.gcode",
+                "subtask_name": "Test_Print",
+                "remaining_time": 3600,
+                "raw_data": {},
+                "ams_mapping": None,
+            },
+        )
+
+        # Neither the DB lookup nor the FTP scan should have run.
+        mock_session_maker.assert_not_called()
+        mock_list.assert_not_called()
+
+    # Original baseline preserved.
+    assert _timelapse_baselines[1] == {"pre_existing_a.mp4", "pre_existing_b.mp4"}
+
+
+@pytest.mark.asyncio
+async def test_running_observed_skips_when_printer_row_missing():
+    """If the printer was deleted between the MQTT push and this handler
+    running, we can't capture anything — log and return without raising."""
+    mock_session = AsyncMock()
+    mock_session.__aenter__ = AsyncMock(return_value=mock_session)
+    mock_session.__aexit__ = AsyncMock()
+    mock_session.execute = AsyncMock(return_value=MagicMock(scalar_one_or_none=MagicMock(return_value=None)))
+
+    with (
+        patch("backend.app.main.async_session") as mock_session_maker,
+        patch("backend.app.main._list_timelapse_videos", new=AsyncMock()) as mock_list,
+    ):
+        mock_session_maker.return_value = mock_session
+
+        from backend.app.main import on_print_running_observed
+
+        # Should not raise.
+        await on_print_running_observed(
+            999,
+            {
+                "filename": "/data/Metadata/test_print.gcode",
+                "subtask_name": "Test_Print",
+                "remaining_time": 3600,
+                "raw_data": {},
+                "ams_mapping": None,
+            },
+        )
+
+        # FTP scan must not run if the printer row didn't resolve.
+        mock_list.assert_not_called()
+
+    assert 999 not in _timelapse_baselines

Některé soubory nejsou zobrazeny, neboť je v těchto rozdílových datech změněno mnoho souborů