Explorar el Código

fix(ftp): wait for zombie thread to complete before giving up on download (#1014)

maziggy hace 1 mes
padre
commit
d3425c7f44
Se han modificado 3 ficheros con 103 adiciones y 24 borrados
  1. 0 0
      CHANGELOG.md
  2. 47 24
      backend/app/services/bambu_ftp.py
  3. 56 0
      backend/tests/unit/services/test_bambu_ftp.py

La diferencia del archivo ha sido suprimido porque es demasiado grande
+ 0 - 0
CHANGELOG.md


+ 47 - 24
backend/app/services/bambu_ftp.py

@@ -4,6 +4,7 @@ import logging
 import os
 import os
 import socket
 import socket
 import ssl
 import ssl
+import threading
 import time
 import time
 from collections.abc import Awaitable, Callable
 from collections.abc import Awaitable, Callable
 from ftplib import FTP, FTP_TLS  # nosec B402
 from ftplib import FTP, FTP_TLS  # nosec B402
@@ -719,46 +720,68 @@ async def download_file_async(
     # the download after we stop waiting. The thread flips `success` to True
     # the download after we stop waiting. The thread flips `success` to True
     # ONLY after the file is fully written — a post-timeout check lets us
     # ONLY after the file is fully written — a post-timeout check lets us
     # salvage the download without mistaking an in-progress partial write
     # salvage the download without mistaking an in-progress partial write
-    # for a completed one. Each attempt gets its own dict so a zombie from
-    # an earlier attempt can't flip the flag for a later one.
+    # for a completed one. Each attempt gets its own dict and event so a
+    # zombie from an earlier attempt can't flip the flag for a later one.
+    # The event is set in `_download`'s finally block so the post-timeout
+    # path can wait for genuine thread completion instead of a fixed sleep.
 
 
-    def _download(force_prot_c: bool, completion: dict) -> bool:
+    def _download(force_prot_c: bool, completion: dict, done: threading.Event) -> bool:
         mode_str = "prot_c" if force_prot_c else "prot_p"
         mode_str = "prot_c" if force_prot_c else "prot_p"
-        client = BambuFTPClient(
-            ip_address, access_code, timeout=socket_timeout, printer_model=printer_model, force_prot_c=force_prot_c
-        )
-        if client.connect():
-            try:
-                result = client.download_to_file(remote_path, local_path)
-                if result:
-                    BambuFTPClient.cache_mode(ip_address, mode_str)
-                    completion["success"] = True
-                return result
-            finally:
-                client.disconnect()
-        return False
+        try:
+            client = BambuFTPClient(
+                ip_address,
+                access_code,
+                timeout=socket_timeout,
+                printer_model=printer_model,
+                force_prot_c=force_prot_c,
+            )
+            if client.connect():
+                try:
+                    result = client.download_to_file(remote_path, local_path)
+                    if result:
+                        BambuFTPClient.cache_mode(ip_address, mode_str)
+                        completion["success"] = True
+                    return result
+                finally:
+                    client.disconnect()
+            return False
+        finally:
+            done.set()
 
 
     async def _run(force_prot_c: bool) -> bool:
     async def _run(force_prot_c: bool) -> bool:
         completion = {"success": False}
         completion = {"success": False}
+        done = threading.Event()
         try:
         try:
             return await asyncio.wait_for(
             return await asyncio.wait_for(
-                loop.run_in_executor(None, lambda: _download(force_prot_c, completion)), timeout=timeout
+                loop.run_in_executor(None, _download, force_prot_c, completion, done), timeout=timeout
             )
             )
         except TimeoutError:
         except TimeoutError:
-            # Give the zombie executor thread a brief moment to finish if it
-            # was already close to done. Only salvage when the thread has
-            # signalled genuine success — checking file size alone would
-            # mistake an in-progress partial write for a completed download.
-            await asyncio.sleep(0.5)
+            # Slow WiFi links commonly overshoot ftp_timeout by 10–30 s without
+            # actually being stuck, so starting attempt 2 now would just contend
+            # with the still-progressing RETR on attempt 1 and produce the
+            # zombie-write race reported in #1014 (file landed on disk minutes
+            # after the retry loop had already given up). Wait for the worker
+            # thread to genuinely finish — capped at 30 s so a truly stuck
+            # connection can't stall a whole attempt indefinitely, with a 0.5 s
+            # floor so artificially small test timeouts still give zombies a
+            # realistic window to finish.
+            grace = max(min(timeout, 30.0), 0.5)
+            await loop.run_in_executor(None, done.wait, grace)
             if completion["success"] and local_path.exists() and local_path.stat().st_size > 0:
             if completion["success"] and local_path.exists() and local_path.stat().st_size > 0:
                 logger.info(
                 logger.info(
-                    "FTP download wait_for timed out after %ss for %s, but thread completed (%s bytes) — salvaging",
+                    "FTP download wait_for timed out after %ss for %s, but thread completed within %ss grace (%s bytes) — salvaging",
                     timeout,
                     timeout,
                     remote_path,
                     remote_path,
+                    grace,
                     local_path.stat().st_size,
                     local_path.stat().st_size,
                 )
                 )
                 return True
                 return True
-            logger.warning("FTP download timed out after %ss for %s", timeout, remote_path)
+            logger.warning(
+                "FTP download timed out after %ss (plus %ss grace) for %s",
+                timeout,
+                grace,
+                remote_path,
+            )
             return False
             return False
 
 
     # Check if we have a cached mode for this printer
     # Check if we have a cached mode for this printer

+ 56 - 0
backend/tests/unit/services/test_bambu_ftp.py

@@ -819,6 +819,62 @@ class TestAsyncWrappers:
         )
         )
         assert result is False
         assert result is False
 
 
+    @pytest.mark.asyncio
+    async def test_download_file_async_timeout_waits_for_slow_zombie(self, tmp_path, monkeypatch):
+        """A zombie that completes within the 30s grace window is salvaged.
+
+        Regression for #1014: on slow WiFi, download_to_file can overshoot the
+        user's ftp_timeout by 10–30 s without being stuck. The old fixed 0.5 s
+        post-timeout sleep was too short — it gave up and started attempt 2
+        while attempt 1's zombie thread kept running, and by the time the zombie
+        wrote the file to disk with a success flag, attempt 2 had already
+        reported failure (its own completion dict was still False). The async
+        wrapper now waits up to min(timeout, 30 s) for the worker thread to
+        finish before returning, so a slow-but-progressing download salvages.
+        """
+        from backend.app.services import bambu_ftp
+
+        bambu_ftp.BambuFTPClient._mode_cache.pop("127.0.0.1", None)
+
+        local = tmp_path / "slow_zombie.bin"
+        expected_content = b"finished during grace window"
+
+        class FakeClient:
+            """Mimics a slow FTP: wait_for gives up at 1.0 s but RETR takes
+            1.5 s total. Old 0.5 s fixed sleep would have bailed (0.5 < 0.5
+            extra); new grace = max(min(1.0, 30), 0.5) = 1.0 s covers the
+            remaining 0.5 s so salvage succeeds."""
+
+            def __init__(self, *args, **kwargs):
+                pass
+
+            def connect(self):
+                return True
+
+            def download_to_file(self, remote_path, local_path):
+                time.sleep(1.5)  # wait_for times out at 1.0 s; zombie finishes 0.5 s later
+                local_path.write_bytes(expected_content)
+                return True
+
+            def disconnect(self):
+                pass
+
+        monkeypatch.setattr(bambu_ftp, "BambuFTPClient", FakeClient)
+        monkeypatch.setattr(FakeClient, "_mode_cache", {}, raising=False)
+        monkeypatch.setattr(FakeClient, "A1_MODELS", set(), raising=False)
+        monkeypatch.setattr(FakeClient, "cache_mode", staticmethod(lambda ip, mode: None), raising=False)
+
+        result = await download_file_async(
+            "127.0.0.1",
+            "12345678",
+            "/cache/slow_zombie.bin",
+            local,
+            timeout=1.0,
+            printer_model="X1C",
+        )
+        assert result is True
+        assert local.read_bytes() == expected_content
+
     @pytest.mark.asyncio
     @pytest.mark.asyncio
     async def test_download_file_try_paths_first_succeeds(self, patch_ftp_port, tmp_path):
     async def test_download_file_try_paths_first_succeeds(self, patch_ftp_port, tmp_path):
         """download_file_try_paths_async succeeds on first path."""
         """download_file_try_paths_async succeeds on first path."""

Algunos archivos no se mostraron porque demasiados archivos cambiaron en este cambio