Browse Source

fix(archive): resume on subtask_id, short-circuit 550, cache 3mf (#972)

  Second wave of #972 — reproducer on a 37.5 MB BambuStudio print to an A1
  showed three stacking root causes when Bambuddy restarts mid-print.

  1. Archive start_time lost on container restart. The name-based dedup
     cancelled any "printing" archive older than 4h and recreated it with
     started_at=now(), so a 13h print that saw a restart 10h in ended up
     showing ~1.5h duration. Persist MQTT subtask_id on every archive and
     match on that first, regardless of age — same id means same print,
     resume in place. Also revives Stale-cancelled rows for users
     upgrading mid-print.

  2. 3MF FTP search tried non-existent paths for ~48 min. Order was
     /cache → /model → /data → /data/Metadata → / with 11×30s retries
     each; BambuStudio actually pushes to / on A1, so the real path was
     tested last. Reorder to / first, and raise a new FileNotOnPrinterError
     sentinel from download_to_file on 550 so with_ftp_retry short-circuits
     via non_retry_exceptions. 425 / SSL EOF / connection resets still
     retry as before.

  3. Cover endpoint and archive flow downloaded the same 36 MB twice and
     competed for the printer's single FTP socket, producing 425 errors
     that fed cause-2's retry storm. Add an in-memory _threemf_path_cache
     keyed on (printer_id, normalized filename); whichever flow fetches
     first populates it, the other reuses the file read-only. Eviction
     runs on on_print_complete and deletes the temp file.

  Backend: 14 new tests across test_bambu_ftp.py and a new
  test_subtask_archive_resume.py. Existing suite: 2737 pass. ruff clean,
  frontend build clean.
maziggy 1 month ago
parent
commit
46c246c504

File diff suppressed because it is too large
+ 0 - 0
CHANGELOG.md


+ 58 - 34
backend/app/api/routes/printers.py

@@ -29,9 +29,11 @@ from backend.app.schemas.printer import (
     PrintOptionsResponse,
 )
 from backend.app.services.bambu_ftp import (
+    cache_3mf_download,
     delete_file_async,
     download_file_bytes_async,
     download_file_try_paths_async,
+    get_cached_3mf,
     get_storage_info_async,
     list_files_async,
 )
@@ -792,43 +794,61 @@ async def get_printer_cover(
     temp_path = settings.archive_dir / "temp" / f"cover_{printer_id}_{temp_filename}"
     temp_path.parent.mkdir(parents=True, exist_ok=True)
 
-    logger.info(
-        f"Trying to download cover for '{subtask_name}' from {printer.ip_address} (trying {len(remote_paths)} paths)"
-    )
-
-    # Retry logic for transient FTP failures
-    max_retries = 2
-    last_error = None
+    # Cache check (#972): the archive-metadata flow in main.py may have already
+    # downloaded this 3MF during the print-start handler. Reusing that file
+    # avoids a second 36MB transfer competing with the printer's single FTP
+    # socket (which produces the 425 errors that feed the retry storm).
     downloaded = False
-
-    for attempt in range(max_retries + 1):
-        try:
-            downloaded = await download_file_try_paths_async(
-                printer.ip_address,
-                printer.access_code,
-                remote_paths,
-                temp_path,
-                printer_model=printer.model,
-            )
-            if downloaded:
-                break
-        except Exception as e:
-            last_error = e
-            if attempt < max_retries:
-                logger.warning("FTP download attempt %s failed: %s, retrying...", attempt + 1, e)
-                await asyncio.sleep(0.5 * (attempt + 1))  # Brief backoff
-            else:
-                logger.error("FTP download failed after %s attempts: %s", max_retries + 1, e)
-
-    if last_error and not downloaded:
-        raise HTTPException(503, f"FTP download temporarily unavailable: {last_error}")
+    using_cached = False
+    for candidate_name in possible_filenames:
+        cached = get_cached_3mf(printer_id, candidate_name)
+        if cached:
+            logger.info("Cover using cached 3MF from %s (avoided duplicate FTP)", cached)
+            temp_path = cached
+            downloaded = True
+            using_cached = True
+            break
 
     if not downloaded:
-        raise HTTPException(
-            404,
-            f"Could not download 3MF file for '{subtask_name}' from printer {printer.ip_address}. Tried: {possible_filenames}",
+        logger.info(
+            f"Trying to download cover for '{subtask_name}' from {printer.ip_address} (trying {len(remote_paths)} paths)"
         )
 
+        # Retry logic for transient FTP failures
+        max_retries = 2
+        last_error = None
+
+        for attempt in range(max_retries + 1):
+            try:
+                downloaded = await download_file_try_paths_async(
+                    printer.ip_address,
+                    printer.access_code,
+                    remote_paths,
+                    temp_path,
+                    printer_model=printer.model,
+                )
+                if downloaded:
+                    break
+            except Exception as e:
+                last_error = e
+                if attempt < max_retries:
+                    logger.warning("FTP download attempt %s failed: %s, retrying...", attempt + 1, e)
+                    await asyncio.sleep(0.5 * (attempt + 1))  # Brief backoff
+                else:
+                    logger.error("FTP download failed after %s attempts: %s", max_retries + 1, e)
+
+        if last_error and not downloaded:
+            raise HTTPException(503, f"FTP download temporarily unavailable: {last_error}")
+
+        if not downloaded:
+            raise HTTPException(
+                404,
+                f"Could not download 3MF file for '{subtask_name}' from printer {printer.ip_address}. Tried: {possible_filenames}",
+            )
+
+        # Share the fresh download with the archive flow.
+        cache_3mf_download(printer_id, temp_filename, temp_path)
+
     # Verify file actually exists and has content
     if not temp_path.exists():
         raise HTTPException(500, f"Download reported success but file not found: {temp_path}")
@@ -837,7 +857,8 @@ async def get_printer_cover(
     logger.info("Downloaded file size: %s bytes", file_size)
 
     if file_size == 0:
-        temp_path.unlink()
+        if not using_cached:
+            temp_path.unlink()
         raise HTTPException(500, f"Downloaded file is empty for '{subtask_name}'")
 
     try:
@@ -900,7 +921,10 @@ async def get_printer_cover(
             zf.close()
 
     finally:
-        if temp_path.exists():
+        # Only delete when this invocation owns the file. A cached path is
+        # shared with the archive flow — removing it would force a refetch
+        # the next time either flow needs the 3MF.
+        if not using_cached and temp_path.exists():
             temp_path.unlink()
 
 

+ 6 - 0
backend/app/core/database.py

@@ -1427,6 +1427,12 @@ async def run_migrations(conn):
     # energy tracking survives a backend restart mid-print.
     await _safe_execute(conn, "ALTER TABLE print_archives ADD COLUMN energy_start_kwh REAL")
 
+    # Migration: Add subtask_id to print_archives (#972)
+    # MQTT-provided task identifier used to resume the same archive row across a
+    # backend restart mid-print. Without it, a long print (e.g. 13h) triggers
+    # stale-cancel + new-archive, losing started_at continuity.
+    await _safe_execute(conn, "ALTER TABLE print_archives ADD COLUMN subtask_id VARCHAR(64)")
+
     # Migration: Create smart_plug_energy_snapshots table (#941)
     # Hourly snapshots of each plug's lifetime counter, so date-range queries in
     # "total consumption" energy mode can compute (last - first) deltas.

+ 141 - 24
backend/app/main.py

@@ -63,7 +63,15 @@ from backend.app.core.websocket import ws_manager
 from backend.app.models.smart_plug import SmartPlug
 from backend.app.services.archive import ArchiveService
 from backend.app.services.background_dispatch import background_dispatch
-from backend.app.services.bambu_ftp import download_file_async, get_ftp_retry_settings, with_ftp_retry
+from backend.app.services.bambu_ftp import (
+    FileNotOnPrinterError,
+    cache_3mf_download,
+    clear_3mf_cache,
+    download_file_async,
+    get_cached_3mf,
+    get_ftp_retry_settings,
+    with_ftp_retry,
+)
 from backend.app.services.bambu_mqtt import PrinterState
 from backend.app.services.github_backup import github_backup_service
 from backend.app.services.homeassistant import homeassistant_service
@@ -1493,6 +1501,18 @@ async def on_print_start(printer_id: int, data: dict):
         filename = data.get("filename", "")
         subtask_name = data.get("subtask_name", "")
 
+        # MQTT subtask_id uniquely identifies a print job on the printer. When
+        # present, it lets us match an archive across a backend restart (#972):
+        # same id → same print → resume the existing row instead of cancelling
+        # it and recreating from scratch (which loses started_at). Treat "0"
+        # and "" as absent — Bambu reports "0" for non-cloud / local prints.
+        raw_mqtt = data.get("raw_data") or {}
+        subtask_id = raw_mqtt.get("subtask_id")
+        if subtask_id is not None:
+            subtask_id = str(subtask_id).strip()
+            if subtask_id in ("", "0"):
+                subtask_id = None
+
         logger.info("[CALLBACK] Print start detected - filename: %s, subtask: %s", filename, subtask_name)
 
         # Skip calibration prints — internal printer files should not be archived
@@ -1548,6 +1568,8 @@ async def on_print_start(printer_id: int, data: dict):
                 # Update archive status to printing
                 archive.status = "printing"
                 archive.started_at = datetime.now(timezone.utc)
+                if subtask_id and not archive.subtask_id:
+                    archive.subtask_id = subtask_id
                 await db.commit()
 
                 # Track as active print
@@ -1619,29 +1641,80 @@ async def on_print_start(printer_id: int, data: dict):
         # This prevents duplicates when backend restarts during an active print
         from backend.app.models.archive import PrintArchive
 
-        check_name = subtask_name or filename.split("/")[-1].replace(".gcode", "").replace(".3mf", "")
-        existing = await db.execute(
-            select(PrintArchive)
-            .where(PrintArchive.printer_id == printer_id)
-            .where(PrintArchive.status == "printing")
-            .where(
-                or_(
-                    PrintArchive.print_name == check_name,
-                    PrintArchive.filename.in_(
-                        [
-                            f"{check_name}.3mf",
-                            f"{check_name}.gcode.3mf",
-                        ]
-                    ),
+        existing_archive: PrintArchive | None = None
+
+        # Preferred match: subtask_id equality. MQTT reports the same subtask_id
+        # across a backend restart for the same print, so this is the most
+        # reliable way to reattach. We also accept a previously stale-cancelled
+        # archive here so users upgrading mid-print get revived when the row
+        # their earlier Bambuddy version wrongly cancelled reappears (#972).
+        if subtask_id:
+            by_id = await db.execute(
+                select(PrintArchive)
+                .where(PrintArchive.printer_id == printer_id)
+                .where(PrintArchive.subtask_id == subtask_id)
+                .where(PrintArchive.status.in_(["printing", "cancelled"]))
+                .order_by(PrintArchive.created_at.desc())
+                .limit(1)
+            )
+            candidate = by_id.scalar_one_or_none()
+            if candidate and (candidate.status == "printing" or (candidate.failure_reason or "").startswith("Stale")):
+                existing_archive = candidate
+
+        # Fallback match: name-based lookup. Kept as-is for prints whose
+        # subtask_id is missing ("0" / local / non-cloud prints).
+        if existing_archive is None:
+            check_name = subtask_name or filename.split("/")[-1].replace(".gcode", "").replace(".3mf", "")
+            existing = await db.execute(
+                select(PrintArchive)
+                .where(PrintArchive.printer_id == printer_id)
+                .where(PrintArchive.status == "printing")
+                .where(
+                    or_(
+                        PrintArchive.print_name == check_name,
+                        PrintArchive.filename.in_(
+                            [
+                                f"{check_name}.3mf",
+                                f"{check_name}.gcode.3mf",
+                            ]
+                        ),
+                    )
                 )
+                .order_by(PrintArchive.created_at.desc())
+                .limit(1)
             )
-            .order_by(PrintArchive.created_at.desc())
-            .limit(1)
-        )
-        existing_archive = existing.scalar_one_or_none()
+            existing_archive = existing.scalar_one_or_none()
+
         if existing_archive:
-            # Check if archive is stale (older than 4 hours) - likely a failed/cancelled print
-            # that didn't get properly updated
+            # subtask_id match → always resume, regardless of age. Same print,
+            # just a backend restart. Revive if it was previously stale-cancelled.
+            subtask_match = bool(subtask_id and existing_archive.subtask_id == subtask_id)
+
+            if subtask_match:
+                if existing_archive.status == "cancelled":
+                    logger.warning(
+                        "Reviving stale-cancelled archive %s — matching subtask_id %s confirms same print (#972)",
+                        existing_archive.id,
+                        subtask_id,
+                    )
+                    existing_archive.status = "printing"
+                    existing_archive.failure_reason = None
+                    await db.commit()
+                else:
+                    logger.info("Resuming archive %s on subtask_id match (%s)", existing_archive.id, subtask_id)
+                _active_prints[(printer_id, existing_archive.filename)] = existing_archive.id
+                if existing_archive.energy_start_kwh is None:
+                    await _record_energy_start(existing_archive, printer_id, db, context="subtask-resume")
+                if not notification_sent:
+                    archive_data = {
+                        "print_time_seconds": existing_archive.print_time_seconds,
+                        "created_by_id": existing_archive.created_by_id,
+                    }
+                    await _send_print_start_notification(printer_id, data, archive_data, logger)
+                _load_objects_from_archive(existing_archive, printer_id, logger)
+                return
+
+            # Name-match only: fall back to the legacy 4h staleness heuristic.
             archive_age = datetime.now(timezone.utc) - existing_archive.created_at.replace(tzinfo=timezone.utc)
             if archive_age.total_seconds() > 4 * 60 * 60:  # 4 hours
                 logger.warning(
@@ -1659,6 +1732,10 @@ async def on_print_start(printer_id: int, data: dict):
                 )
                 # Track this as the active print
                 _active_prints[(printer_id, existing_archive.filename)] = existing_archive.id
+                # Attach subtask_id retroactively so future restarts can resume
+                if subtask_id and not existing_archive.subtask_id:
+                    existing_archive.subtask_id = subtask_id
+                    await db.commit()
                 # Also set up energy tracking if not already tracked (#941: persisted column)
                 if existing_archive.energy_start_kwh is None:
                     await _record_energy_start(existing_archive, printer_id, db, context="existing-printing")
@@ -1714,19 +1791,38 @@ async def on_print_start(printer_id: int, data: dict):
         temp_path = None
         downloaded_filename = None
 
+        # Cache check: cover endpoint may have already pulled this 3MF during
+        # the print (frontend opens the card and shows the thumbnail) — reuse
+        # that file instead of re-downloading 36MB over the same FTP link that
+        # just served it (#972). The cache keys on a normalized filename so
+        # variants like "X", "X.3mf", "X.gcode.3mf" all collapse to one entry.
+        for try_filename in possible_names:
+            if not try_filename.endswith(".3mf"):
+                continue
+            cached = get_cached_3mf(printer_id, try_filename)
+            if cached:
+                logger.info("Reusing cached 3MF from %s (avoided duplicate FTP)", cached)
+                temp_path = cached
+                downloaded_filename = try_filename
+                break
+
         # Get FTP retry settings
         ftp_retry_enabled, ftp_retry_count, ftp_retry_delay, ftp_timeout = await get_ftp_retry_settings()
 
-        for try_filename in possible_names:
+        for try_filename in possible_names if not downloaded_filename else []:
             if not try_filename.endswith(".3mf"):
                 continue
 
+            # Root (/) is where BambuStudio/OrcaSlicer uploads land on A1/P1-series
+            # printers, so try it first — deferring it to last cost #972's reporter
+            # ~48 minutes of retries on /cache//model//data//data/Metadata before
+            # landing on the path that actually had the file.
             remote_paths = [
+                f"/{try_filename}",
                 f"/cache/{try_filename}",
                 f"/model/{try_filename}",
                 f"/data/{try_filename}",
                 f"/data/Metadata/{try_filename}",
-                f"/{try_filename}",
             ]
 
             temp_path = app_settings.archive_dir / "temp" / try_filename
@@ -1748,6 +1844,7 @@ async def on_print_start(printer_id: int, data: dict):
                             max_retries=ftp_retry_count,
                             retry_delay=ftp_retry_delay,
                             operation_name=f"Download 3MF from {remote_path}",
+                            non_retry_exceptions=(FileNotOnPrinterError,),
                         )
                     else:
                         downloaded = await download_file_async(
@@ -1762,7 +1859,14 @@ async def on_print_start(printer_id: int, data: dict):
                     if downloaded:
                         downloaded_filename = try_filename
                         logger.info("Downloaded: %s", remote_path)
+                        # Populate shared cache so the cover endpoint (if it
+                        # runs next) doesn't refetch the same 36MB over FTP.
+                        cache_3mf_download(printer_id, try_filename, temp_path)
                         break
+                except FileNotOnPrinterError:
+                    # 550 — file isn't at this path. Advance to next candidate
+                    # without burning the retry budget.
+                    logger.debug("3MF not at %s (550), trying next path", remote_path)
                 except Exception as e:
                     logger.debug("FTP download failed for %s: %s", remote_path, e)
 
@@ -1826,6 +1930,7 @@ async def on_print_start(printer_id: int, data: dict):
                             if downloaded:
                                 downloaded_filename = fname
                                 logger.info("Found and downloaded from %s: %s", search_dir, fname)
+                                cache_3mf_download(printer_id, fname, temp_path)
                                 break
                 except Exception as e:
                     logger.debug("Failed to list %s: %s", search_dir, e)
@@ -1866,6 +1971,7 @@ async def on_print_start(printer_id: int, data: dict):
                     print_time_seconds=fallback_print_time,
                     status="printing",
                     started_at=datetime.now(timezone.utc),
+                    subtask_id=subtask_id,
                     extra_data={"no_3mf_available": True, "original_subtask": subtask_name, "_print_data": data},
                 )
 
@@ -1951,6 +2057,7 @@ async def on_print_start(printer_id: int, data: dict):
                 printer_id=printer_id,
                 source_file=temp_path,
                 print_data={**data, "status": "printing"},
+                subtask_id=subtask_id,
             )
 
             if archive:
@@ -2055,7 +2162,12 @@ async def on_print_start(printer_id: int, data: dict):
                 except Exception as e:
                     logger.warning("[TIMELAPSE] Failed to capture baseline at print start: %s", e)
         finally:
-            if temp_path and temp_path.exists():
+            # Keep temp_path around until print completes so the cover endpoint
+            # can reuse it (#972). Cache eviction in on_print_complete deletes
+            # the file. If the cache entry was evicted early (file vanished),
+            # clean up any stragglers here to avoid leaking disk on retries.
+            cached_now = get_cached_3mf(printer_id, downloaded_filename) if downloaded_filename else None
+            if temp_path and temp_path.exists() and cached_now != temp_path:
                 temp_path.unlink()
 
 
@@ -2297,6 +2409,11 @@ async def on_print_complete(printer_id: int, data: dict):
 
     logger.info("[CALLBACK] on_print_complete started for printer %s", printer_id)
 
+    # Drop the 3MF download cache for this printer (#972). The print is over,
+    # nothing else legitimately needs the bytes; keeping them would only risk
+    # handing a stale file to the next print if it reuses the same name.
+    clear_3mf_cache(printer_id)
+
     try:
         ws_data = {
             "status": data.get("status"),

+ 6 - 0
backend/app/models/archive.py

@@ -43,6 +43,12 @@ class PrintArchive(Base):
     started_at: Mapped[datetime | None] = mapped_column(DateTime)
     completed_at: Mapped[datetime | None] = mapped_column(DateTime)
 
+    # Printer-assigned subtask identifier from MQTT. Used to resume the same
+    # archive row across a backend restart during a long-running print (#972):
+    # if the same subtask_id reappears after restart, we know it's the same
+    # print and keep the original row instead of cancel-then-create.
+    subtask_id: Mapped[str | None] = mapped_column(String(64), nullable=True)
+
     # Extended metadata (JSON blob for flexibility)
     extra_data: Mapped[dict | None] = mapped_column(JSON)
 

+ 5 - 0
backend/app/services/archive.py

@@ -855,6 +855,7 @@ class ArchiveService:
         created_by_id: int | None = None,
         original_filename: str | None = None,
         project_id: int | None = None,
+        subtask_id: str | None = None,
     ) -> PrintArchive | None:
         """Archive a 3MF file with metadata.
 
@@ -867,6 +868,9 @@ class ArchiveService:
                 stored with UUID names)
             project_id: Project to associate this archive with (optional, set when triggered
                 from the project view)
+            subtask_id: MQTT-provided task identifier (optional). Used to match an
+                existing archive across a backend restart mid-print so the
+                original row can be resumed instead of cancelled (#972).
         """
         # Verify printer exists if specified
         if printer_id is not None:
@@ -978,6 +982,7 @@ class ArchiveService:
             extra_data=metadata,
             created_by_id=created_by_id,
             project_id=project_id,
+            subtask_id=subtask_id,
         )
 
         self.db.add(archive)

+ 103 - 3
backend/app/services/bambu_ftp.py

@@ -16,6 +16,17 @@ logger = logging.getLogger(__name__)
 T = TypeVar("T")
 
 
+class FileNotOnPrinterError(Exception):
+    """Raised when a remote FTP path returns 550 (file not found).
+
+    550 means the file does not exist at that path — retrying the same path
+    will never succeed. Callers use this sentinel with with_ftp_retry's
+    non_retry_exceptions to immediately move on to the next candidate path
+    instead of burning the full retry budget (up to 11 × 30s per path) on
+    a lookup that cannot recover.
+    """
+
+
 class ImplicitFTP_TLS(FTP_TLS):
     """FTP_TLS subclass for implicit FTPS (port 990) with model-specific SSL handling.
 
@@ -280,14 +291,21 @@ class BambuFTPClient:
             logger.info("Successfully downloaded %s to %s (%s bytes)", remote_path, local_path, file_size)
             return True
         except (OSError, ftplib.Error) as e:
-            # Log at INFO level so we can see failures in normal logs
-            logger.info("FTP download failed for %s: %s", remote_path, e)
             # Clean up partial file if it exists
             if local_path.exists():
                 try:
                     local_path.unlink()
                 except OSError:
                     pass  # Best-effort partial file cleanup; not critical if removal fails
+            # 550 means the file is not at this path. Surface as a sentinel so
+            # with_ftp_retry can abandon this path immediately and the caller
+            # can advance to the next candidate instead of retrying 11× at
+            # 30s intervals (the pattern that cost #972's reporter ~48min).
+            if isinstance(e, ftplib.error_perm) and str(e).startswith("550"):
+                logger.info("FTP download failed for %s: %s (not on printer)", remote_path, e)
+                raise FileNotOnPrinterError(f"{remote_path}: {e}") from e
+            # Log at INFO level so we can see failures in normal logs
+            logger.info("FTP download failed for %s: %s", remote_path, e)
             return False
 
     def diagnose_storage(self) -> dict:
@@ -597,6 +615,79 @@ class BambuFTPClient:
         return result if result else None
 
 
+# Shared 3MF download cache (#972).
+#
+# Both the cover thumbnail endpoint (api/routes/printers.py) and the archive
+# metadata flow (main.py) fetch the same 3MF file over FTP during a print.
+# On slow / contended links (A1 Wi-Fi, large files) the duplicate transfers
+# compete for the printer's single FTP socket and trigger 425 "can't open
+# data channel" errors, feeding back into cause-2's retry storm.
+#
+# This cache stores the local path of a successfully-downloaded 3MF keyed
+# by (printer_id, normalized_name). Whichever flow downloads first populates
+# the cache; the other flow reuses the file read-only. Evicted on print
+# completion so a later print with the same name re-downloads fresh bytes.
+_threemf_path_cache: dict[tuple[int, str], Path] = {}
+
+
+def normalize_3mf_name(name: str) -> str:
+    """Collapse various 3MF filename variants to a cache key.
+
+    Bambu tooling produces names as bare subtask ("Part"), with .3mf, with
+    .gcode.3mf, or (Studio-normalized) with spaces → underscores. All of
+    these refer to the same print job on the same printer, so they must
+    hash to the same cache key.
+    """
+    # Lowercase first so .3MF / .GCODE.3MF variants strip cleanly — a
+    # real-world case since Windows-side tooling sometimes uppercases
+    # extensions.
+    cleaned = name.strip().lower().replace(".gcode.3mf", "").replace(".gcode", "").replace(".3mf", "")
+    return cleaned.replace(" ", "_")
+
+
+def cache_3mf_download(printer_id: int, name: str, local_path: Path) -> None:
+    """Record a successfully-downloaded 3MF so a sibling flow can reuse it."""
+    _threemf_path_cache[(printer_id, normalize_3mf_name(name))] = local_path
+
+
+def get_cached_3mf(printer_id: int, name: str) -> Path | None:
+    """Return a cached 3MF path for this printer/name if the file still exists."""
+    key = (printer_id, normalize_3mf_name(name))
+    cached = _threemf_path_cache.get(key)
+    if cached and cached.exists() and cached.stat().st_size > 0:
+        return cached
+    # Evict dead entry — the file was cleaned up (temp dir clean, manual
+    # deletion, restart) so the cache value is no longer usable.
+    if cached:
+        _threemf_path_cache.pop(key, None)
+    return None
+
+
+def clear_3mf_cache(printer_id: int | None = None, delete_files: bool = True) -> None:
+    """Drop cache entries for one printer (or all with None).
+
+    When ``delete_files`` is True (default) the on-disk 3MF is removed as well
+    — called from on_print_complete so temp files don't accumulate across
+    prints. Tests that want to inspect the cache contents disable this.
+    """
+
+    def _maybe_unlink(path: Path) -> None:
+        if delete_files and path.exists():
+            try:
+                path.unlink()
+            except OSError as exc:
+                logger.debug("3MF cache cleanup skipped %s: %s", path, exc)
+
+    if printer_id is None:
+        for path in list(_threemf_path_cache.values()):
+            _maybe_unlink(path)
+        _threemf_path_cache.clear()
+        return
+    for key in [k for k in _threemf_path_cache if k[0] == printer_id]:
+        _maybe_unlink(_threemf_path_cache[key])
+        _threemf_path_cache.pop(key, None)
+
+
 async def download_file_async(
     ip_address: str,
     access_code: str,
@@ -711,7 +802,16 @@ async def download_file_try_paths_async(
             return False
 
         try:
-            return any(client.download_to_file(remote_path, local_path) for remote_path in remote_paths)
+            # FileNotOnPrinterError signals "try the next path", not "give up" —
+            # this function's whole purpose is to walk a list of candidates
+            # over one connection. Only a real transport error should bubble.
+            for remote_path in remote_paths:
+                try:
+                    if client.download_to_file(remote_path, local_path):
+                        return True
+                except FileNotOnPrinterError:
+                    continue
+            return False
         finally:
             client.disconnect()
 

+ 169 - 5
backend/tests/unit/services/test_bambu_ftp.py

@@ -20,11 +20,17 @@ import pytest
 
 from backend.app.services.bambu_ftp import (
     BambuFTPClient,
+    FileNotOnPrinterError,
+    cache_3mf_download,
+    clear_3mf_cache,
     delete_file_async,
     download_file_async,
     download_file_try_paths_async,
+    get_cached_3mf,
     list_files_async,
+    normalize_3mf_name,
     upload_file_async,
+    with_ftp_retry,
 )
 
 # Brief delay to allow pyftpdlib to flush uploaded files to disk.
@@ -265,14 +271,19 @@ class TestDownload:
         assert not local.exists()
         client.disconnect()
 
-    def test_download_to_file_missing_returns_false(self, ftp_client_factory, tmp_path):
-        """Missing file returns False."""
+    def test_download_to_file_missing_raises_not_on_printer(self, ftp_client_factory, tmp_path):
+        """Missing file raises FileNotOnPrinterError so callers can short-circuit
+        the retry loop — 550 means the file isn't there and retrying won't help."""
+        from backend.app.services.bambu_ftp import FileNotOnPrinterError
+
         local = tmp_path / "missing.bin"
         client = ftp_client_factory()
         client.connect()
-        result = client.download_to_file("/cache/no_such_file.bin", local)
-        assert result is False
-        client.disconnect()
+        try:
+            with pytest.raises(FileNotOnPrinterError):
+                client.download_to_file("/cache/no_such_file.bin", local)
+        finally:
+            client.disconnect()
 
     def test_download_large_file(self, ftp_client_factory, ftp_server):
         """Large file download (>1MB) works correctly."""
@@ -1001,3 +1012,156 @@ class TestFailureScenarios:
             downloaded = client2.download_file("/cache/voidresp_test.3mf")
             assert downloaded == content, f"Content mismatch for model={model}"
             client2.disconnect()
+
+
+# ---------------------------------------------------------------------------
+# Short-circuit retries on 550 (#972)
+# ---------------------------------------------------------------------------
+class TestFileNotOnPrinterShortCircuit:
+    """FileNotOnPrinterError must bypass the retry budget.
+
+    Before this fix, a 3MF path that wasn't on the printer (550) cost
+    `ftp_retry_count + 1` attempts × `ftp_retry_delay` seconds per candidate
+    path. With ftp_retry_count=10 and four candidate paths, that's ~22 min
+    of dead retries before the real path is tried. #972 in the wild showed
+    48 min of retrying paths that didn't exist.
+    """
+
+    async def test_with_ftp_retry_propagates_file_not_on_printer_without_retrying(self):
+        """with_ftp_retry raises FileNotOnPrinterError on first attempt.
+
+        Verifies non_retry_exceptions short-circuits before the retry loop
+        has a chance to sleep and try again.
+        """
+        attempts = {"n": 0}
+
+        async def always_missing(*_args, **_kwargs):
+            attempts["n"] += 1
+            raise FileNotOnPrinterError("/cache/absent.3mf: 550")
+
+        with pytest.raises(FileNotOnPrinterError):
+            await with_ftp_retry(
+                always_missing,
+                max_retries=10,
+                retry_delay=0.01,
+                operation_name="test 550 short-circuit",
+                non_retry_exceptions=(FileNotOnPrinterError,),
+            )
+
+        assert attempts["n"] == 1, "550 must not trigger any retry"
+
+    async def test_with_ftp_retry_still_retries_transient_errors(self):
+        """Non-550 exceptions continue to retry up to max_retries + 1."""
+        attempts = {"n": 0}
+
+        async def flaky(*_args, **_kwargs):
+            attempts["n"] += 1
+            raise TimeoutError("transient")
+
+        result = await with_ftp_retry(
+            flaky,
+            max_retries=2,
+            retry_delay=0.01,
+            operation_name="test transient retries",
+            non_retry_exceptions=(FileNotOnPrinterError,),
+        )
+
+        assert result is None
+        assert attempts["n"] == 3, "Transient errors should retry to exhaustion"
+
+    def test_download_to_file_raises_on_missing_path(self, ftp_client_factory, tmp_path):
+        """download_to_file surfaces 550 as FileNotOnPrinterError end-to-end
+        against the real mock FTPS server, not just a hand-rolled mock."""
+        local = tmp_path / "never_downloaded.3mf"
+        client = ftp_client_factory()
+        client.connect()
+        try:
+            with pytest.raises(FileNotOnPrinterError):
+                client.download_to_file("/cache/does_not_exist.3mf", local)
+        finally:
+            client.disconnect()
+        assert not local.exists(), "Partial file must be cleaned up on 550"
+
+
+# ---------------------------------------------------------------------------
+# 3MF download cache (#972)
+# ---------------------------------------------------------------------------
+class TestThreeMFCache:
+    """Cover endpoint and archive flow share downloaded 3MF bytes via this
+    cache. Tests isolate themselves with clear_3mf_cache(delete_files=False)
+    so they don't clobber each other."""
+
+    def setup_method(self):
+        clear_3mf_cache(delete_files=False)
+
+    def teardown_method(self):
+        clear_3mf_cache(delete_files=False)
+
+    def test_normalize_collapses_filename_variants(self):
+        """Bambu names vary (.3mf, .gcode.3mf, with spaces) — they all map
+        to the same cache slot so both flows agree on the key."""
+        canonical = normalize_3mf_name("Broly_Legendary.gcode.3mf")
+        assert normalize_3mf_name("Broly_Legendary.3mf") == canonical
+        assert normalize_3mf_name("Broly_Legendary") == canonical
+        # Bambu Studio rewrites spaces to underscores on upload — treat as equal
+        assert normalize_3mf_name("Broly Legendary") == canonical
+        # Case is also collapsed so keys match across capitalizations
+        assert normalize_3mf_name("BROLY_LEGENDARY.3MF") == canonical
+
+    def test_cache_hit_returns_stored_path(self, tmp_path):
+        """get_cached_3mf returns the same Path that was put in."""
+        f = tmp_path / "Broly.gcode.3mf"
+        f.write_bytes(b"fake 3mf content")
+        cache_3mf_download(1, "Broly.gcode.3mf", f)
+        assert get_cached_3mf(1, "Broly.gcode.3mf") == f
+
+    def test_cache_lookup_uses_normalized_name(self, tmp_path):
+        """Caching under .gcode.3mf and querying with bare name still hits."""
+        f = tmp_path / "Broly.gcode.3mf"
+        f.write_bytes(b"x")
+        cache_3mf_download(1, "Broly.gcode.3mf", f)
+        assert get_cached_3mf(1, "Broly.3mf") == f
+        assert get_cached_3mf(1, "Broly") == f
+
+    def test_cache_miss_on_different_printer(self, tmp_path):
+        """Printer id is part of the key — two printers never collide."""
+        f = tmp_path / "A.3mf"
+        f.write_bytes(b"x")
+        cache_3mf_download(1, "A.3mf", f)
+        assert get_cached_3mf(2, "A.3mf") is None
+
+    def test_cache_evicts_when_file_deleted(self, tmp_path):
+        """Stale entry (file gone) returns None and is dropped from the dict."""
+        f = tmp_path / "A.3mf"
+        f.write_bytes(b"x")
+        cache_3mf_download(1, "A.3mf", f)
+        f.unlink()
+        assert get_cached_3mf(1, "A.3mf") is None
+        # Re-populating after eviction works — no ghost entries remain.
+        f.write_bytes(b"y")
+        cache_3mf_download(1, "A.3mf", f)
+        assert get_cached_3mf(1, "A.3mf") == f
+
+    def test_clear_by_printer_scoped(self, tmp_path):
+        """Clearing one printer leaves the other untouched."""
+        f1 = tmp_path / "one.3mf"
+        f1.write_bytes(b"1")
+        f2 = tmp_path / "two.3mf"
+        f2.write_bytes(b"2")
+        cache_3mf_download(1, "one.3mf", f1)
+        cache_3mf_download(2, "two.3mf", f2)
+        clear_3mf_cache(1)
+        assert get_cached_3mf(1, "one.3mf") is None
+        assert get_cached_3mf(2, "two.3mf") == f2
+        # clear_3mf_cache defaulted to delete_files=True, so the file is gone
+        assert not f1.exists()
+        assert f2.exists()
+
+    def test_clear_without_deleting_files(self, tmp_path):
+        """delete_files=False leaves files on disk — used by tests."""
+        f = tmp_path / "keep.3mf"
+        f.write_bytes(b"x")
+        cache_3mf_download(1, "keep.3mf", f)
+        clear_3mf_cache(1, delete_files=False)
+        assert get_cached_3mf(1, "keep.3mf") is None
+        assert f.exists()

+ 185 - 0
backend/tests/unit/test_subtask_archive_resume.py

@@ -0,0 +1,185 @@
+"""Regression tests for subtask_id-based archive resume (#972).
+
+Before this fix, a Bambuddy restart during a long print (e.g. 13h) triggered
+the name-based "stale archive" path at 4h, cancelled the original row, and
+created a new archive with `started_at = now()` — losing ~9h of print time
+continuity. mstko reported this on a 37.5MB Broly print on an A1: after a
+container restart mid-print, the archive ended up showing ~1h37m duration
+for a print that actually ran 13h08m.
+
+The fix stores `subtask_id` (MQTT-provided job identifier) on the archive row.
+On print-start detection, the handler first tries to match an existing
+archive by subtask_id regardless of age — same id ⇒ same print ⇒ resume.
+Only unmatched prints fall through to the legacy 4h staleness heuristic.
+"""
+
+from datetime import datetime, timedelta, timezone
+
+import pytest
+from sqlalchemy import select
+
+from backend.app.models.archive import PrintArchive
+
+
+def _extract_subtask_id(data: dict) -> str | None:
+    """Mirrors the extraction logic in main.on_print_start.
+
+    Hoisted here so the test can pin the contract: Bambu reports "0" and
+    empty string for local / non-cloud prints, both of which must collapse
+    to None so we don't match every non-cloud print to every other one.
+    """
+    raw = data.get("raw_data") or {}
+    val = raw.get("subtask_id")
+    if val is None:
+        return None
+    val = str(val).strip()
+    if val in ("", "0"):
+        return None
+    return val
+
+
+class TestSubtaskIdExtraction:
+    """subtask_id extraction mirrors the in-handler logic."""
+
+    def test_valid_id_returns_string(self):
+        assert _extract_subtask_id({"raw_data": {"subtask_id": "12345"}}) == "12345"
+
+    def test_zero_collapses_to_none(self):
+        """Bambu reports '0' for local (non-cloud) prints; must not match anything."""
+        assert _extract_subtask_id({"raw_data": {"subtask_id": "0"}}) is None
+
+    def test_empty_collapses_to_none(self):
+        assert _extract_subtask_id({"raw_data": {"subtask_id": ""}}) is None
+
+    def test_missing_raw_data(self):
+        assert _extract_subtask_id({}) is None
+
+    def test_missing_subtask_id(self):
+        assert _extract_subtask_id({"raw_data": {"foo": "bar"}}) is None
+
+    def test_integer_value_stringified(self):
+        """MQTT may send the id as an int — coerce consistently."""
+        assert _extract_subtask_id({"raw_data": {"subtask_id": 12345}}) == "12345"
+
+    def test_whitespace_trimmed(self):
+        assert _extract_subtask_id({"raw_data": {"subtask_id": "  42  "}}) == "42"
+
+
+class TestSubtaskIdResume:
+    """End-to-end DB behavior of the resume path: a second on_print_start
+    for the same subtask_id must find and reuse the first archive row."""
+
+    @pytest.fixture
+    async def archive_factory(self, db_session, printer_factory):
+        printer = await printer_factory()
+
+        async def _create(
+            subtask_id: str | None = None,
+            status: str = "printing",
+            age_hours: float = 0,
+            failure_reason: str | None = None,
+        ):
+            started = datetime.now(timezone.utc) - timedelta(hours=age_hours)
+            archive = PrintArchive(
+                printer_id=printer.id,
+                filename="Broly_Legendary.gcode.3mf",
+                file_path="archive/1/x/Broly.gcode.3mf",
+                file_size=100,
+                print_name="Broly_Legendary",
+                status=status,
+                started_at=started,
+                subtask_id=subtask_id,
+                failure_reason=failure_reason,
+            )
+            # Override server_default on created_at so age-based tests work
+            archive.created_at = started
+            db_session.add(archive)
+            await db_session.commit()
+            await db_session.refresh(archive)
+            return printer, archive
+
+        return _create
+
+    async def test_subtask_id_query_finds_matching_printing_row(self, archive_factory, db_session):
+        """The lookup used by main.on_print_start finds a matching row even
+        when the archive is older than the 4h name-based staleness cutoff."""
+        printer, archive = await archive_factory(subtask_id="t-123", age_hours=10)
+
+        result = await db_session.execute(
+            select(PrintArchive)
+            .where(PrintArchive.printer_id == printer.id)
+            .where(PrintArchive.subtask_id == "t-123")
+            .where(PrintArchive.status.in_(["printing", "cancelled"]))
+            .order_by(PrintArchive.created_at.desc())
+            .limit(1)
+        )
+        found = result.scalar_one_or_none()
+        assert found is not None
+        assert found.id == archive.id
+
+    async def test_subtask_id_revives_stale_cancelled_row(self, archive_factory, db_session):
+        """If an older Bambuddy wrongly cancelled the archive (legacy 4h path),
+        the next print-start with the same subtask_id must revive it rather
+        than start a third row."""
+        printer, archive = await archive_factory(
+            subtask_id="t-456",
+            status="cancelled",
+            failure_reason="Stale - print likely cancelled or failed without status update",
+            age_hours=10,
+        )
+
+        result = await db_session.execute(
+            select(PrintArchive)
+            .where(PrintArchive.printer_id == printer.id)
+            .where(PrintArchive.subtask_id == "t-456")
+            .where(PrintArchive.status.in_(["printing", "cancelled"]))
+            .order_by(PrintArchive.created_at.desc())
+            .limit(1)
+        )
+        candidate = result.scalar_one_or_none()
+        assert candidate is not None
+
+        # Revival mirrors the main.py logic: only revive stale-cancelled rows,
+        # not user-cancelled ones. The failure_reason prefix is the signal.
+        is_stale_cancelled = (candidate.failure_reason or "").startswith("Stale")
+        assert is_stale_cancelled
+
+        candidate.status = "printing"
+        candidate.failure_reason = None
+        await db_session.commit()
+        await db_session.refresh(candidate)
+
+        assert candidate.status == "printing"
+        # Crucially, started_at is preserved — this is the whole point of the
+        # fix. A fresh archive would have started_at = now, losing continuity.
+        age_after = datetime.now(timezone.utc) - candidate.started_at.replace(tzinfo=timezone.utc)
+        assert age_after > timedelta(hours=9), "started_at must survive revival"
+
+    async def test_subtask_id_null_does_not_match_other_nulls(self, archive_factory, db_session):
+        """Two different non-cloud prints both have subtask_id=NULL. They
+        must NOT match each other via the subtask_id lookup (which is why
+        the handler filters by `subtask_id IS NOT NULL` in the Python layer
+        before even running this query)."""
+        printer, _archive = await archive_factory(subtask_id=None, age_hours=1)
+
+        # This shape of query (subtask_id == None) would return rows via
+        # SQLAlchemy's NULL handling, but the handler only runs it when
+        # subtask_id is truthy — so the query is never issued for NULL.
+        # Assert the guard by testing the subtask_id != "" branch.
+        result = await db_session.execute(select(PrintArchive).where(PrintArchive.subtask_id == ""))
+        found = result.scalar_one_or_none()
+        assert found is None, "Empty string must not match NULL rows"
+
+    async def test_completed_archive_not_resumed(self, archive_factory, db_session):
+        """A completed archive with the same subtask_id must not be reopened
+        as printing — that subtask's job is done; a new run is a new row."""
+        printer, _ = await archive_factory(subtask_id="t-789", status="completed")
+
+        result = await db_session.execute(
+            select(PrintArchive)
+            .where(PrintArchive.printer_id == printer.id)
+            .where(PrintArchive.subtask_id == "t-789")
+            .where(PrintArchive.status.in_(["printing", "cancelled"]))
+        )
+        found = result.scalar_one_or_none()
+        assert found is None

Some files were not shown because too many files changed in this diff