Browse Source

fix(#1420): negative-cache cover 404s, add GitHub rate-limit backoff

  Cover endpoint had no negative cache: when every FTP path returned
  550 for a print whose 3MF wasn't on the printer (typical SD-card
  print), each frontend refresh re-ran the full 8-path fan-out. Add
  _cover_404_cache keyed by (subtask_name, view_key) and short-circuit
  to 404 on hit; clear alongside _cover_cache on print start. Only
  populated on genuine 404 paths, not transient FTP errors, so flaky
  network doesn't lock out future retries.

  GitHub update-check had no backoff on 403 rate-limit. Add module-
  level _github_rate_limit_until plus three helpers; check before
  every api.github.com call in /updates/check and
  _discover_target_release. Read X-RateLimit-Reset from the 403 with a
  1-hour fallback when the header is absent and a 60-second floor to
  guard against container/GitHub clock skew. Route surfaces
  retry_after_seconds so the UI can display real wait time.

  The "ffmpeg didn't terminate gracefully" line the reporter quoted
  is the standard SIGTERM/SIGKILL pattern in camera.py and unrelated
  to the FTP loop; it goes away on its own once the cover endpoint
  stops hammering the printer.
maziggy 1 week ago
parent
commit
03e3f5313e

File diff suppressed because it is too large
+ 0 - 0
CHANGELOG.md


+ 20 - 4
backend/app/api/routes/printers.py

@@ -775,10 +775,17 @@ async def test_printer_connection(
 # different plates always fetch a fresh thumbnail without needing plate in the key.
 _cover_cache: dict[int, dict[tuple[str, str], bytes]] = {}
 
+# Negative cache (#1420): when a cover lookup exhausts every FTP path with 550
+# (file sliced on SD card, not on printer storage), remember the failure so the
+# next request short-circuits to 404 instead of re-hammering FTP 8 paths deep.
+# Cleared on print start alongside _cover_cache.
+_cover_404_cache: dict[int, set[tuple[str, str]]] = {}
+
 
 def clear_cover_cache(printer_id: int) -> None:
     """Clear cached cover images for a printer. Call on print start to avoid stale thumbnails."""
     _cover_cache.pop(printer_id, None)
+    _cover_404_cache.pop(printer_id, None)
 
 
 @router.get("/{printer_id}/cover")
@@ -828,10 +835,15 @@ async def get_printer_cover(
     # runs on every print start, so a re-dispatch with a different plate gets
     # a fresh image regardless. Pre-#1166 the key included plate_num, but with
     # late plate resolution the cache check would always miss.
-    if printer_id in _cover_cache:
-        cache_key = (subtask_name, view_key)
-        if cache_key in _cover_cache[printer_id]:
-            return Response(content=_cover_cache[printer_id][cache_key], media_type="image/png")
+    cache_key = (subtask_name, view_key)
+    if printer_id in _cover_cache and cache_key in _cover_cache[printer_id]:
+        return Response(content=_cover_cache[printer_id][cache_key], media_type="image/png")
+
+    # Negative-cache short-circuit (#1420): if a prior lookup for this same
+    # subtask + view already failed, don't replay 8 FTP retries on every page
+    # refresh. _cover_404_cache is cleared on print start.
+    if printer_id in _cover_404_cache and cache_key in _cover_404_cache[printer_id]:
+        raise HTTPException(404, f"No cover available for '{subtask_name}' (cached)")
 
     # Build possible 3MF filenames from subtask_name
     # Bambu printers may store files as "name.gcode.3mf" (sliced via Bambu Studio)
@@ -917,6 +929,9 @@ async def get_printer_cover(
             raise HTTPException(503, f"FTP download temporarily unavailable: {last_error}")
 
         if not downloaded:
+            # Remember this failure so subsequent requests for the same print
+            # skip the 8-path FTP fan-out (#1420).
+            _cover_404_cache.setdefault(printer_id, set()).add(cache_key)
             raise HTTPException(
                 404,
                 f"Could not download 3MF file for '{subtask_name}' from printer {printer.ip_address}. Tried: {possible_filenames}",
@@ -1006,6 +1021,7 @@ async def get_printer_cover(
                     _cover_cache[printer_id][(subtask_name, view_key)] = image_data
                     return Response(content=image_data, media_type="image/png")
 
+            _cover_404_cache.setdefault(printer_id, set()).add(cache_key)
             raise HTTPException(404, "No thumbnail found in 3MF file")
         finally:
             zf.close()

+ 96 - 0
backend/app/api/routes/updates.py

@@ -6,6 +6,7 @@ import os
 import re
 import shutil
 import sys
+import time
 
 import httpx
 from fastapi import APIRouter, BackgroundTasks, Depends
@@ -31,6 +32,61 @@ _update_status = {
     "error": None,
 }
 
+# GitHub rate-limit backoff (#1420): when api.github.com returns 403 with
+# X-RateLimit-Remaining=0, refuse to retry until X-RateLimit-Reset (epoch
+# seconds). Falls back to a 1-hour pause if the header is absent. Prevents
+# the update checker from hammering GitHub once the unauthenticated quota
+# (60 req/hr per source IP) is exhausted.
+_GITHUB_RATE_LIMIT_FALLBACK_SECONDS = 3600
+_github_rate_limit_until: float = 0.0
+
+
+def _seconds_until_github_unblocked() -> float:
+    """Return seconds remaining until GitHub backoff lifts, or 0 if unblocked."""
+    remaining = _github_rate_limit_until - time.time()
+    return remaining if remaining > 0 else 0.0
+
+
+def _record_github_rate_limit(response: httpx.Response) -> None:
+    """Set the backoff window from a GitHub 403 response's headers."""
+    global _github_rate_limit_until
+    reset_header = response.headers.get("X-RateLimit-Reset")
+    reset_at: float | None = None
+    if reset_header:
+        try:
+            reset_at = float(reset_header)
+        except ValueError:
+            reset_at = None
+    if reset_at is None:
+        reset_at = time.time() + _GITHUB_RATE_LIMIT_FALLBACK_SECONDS
+    # Floor at a 60s minimum: protects against clock skew between the container
+    # and GitHub (parsed reset epoch in the past would otherwise leave us with
+    # no real backoff and we'd hammer GitHub again immediately).
+    reset_at = max(reset_at, time.time() + 60)
+    # Only extend the window — never shorten it via an out-of-order response.
+    if reset_at > _github_rate_limit_until:
+        _github_rate_limit_until = reset_at
+    logger.warning(
+        "GitHub rate limit hit; suppressing update checks for %.0fs (reset header=%s)",
+        _seconds_until_github_unblocked(),
+        reset_header,
+    )
+
+
+def _is_github_rate_limit_response(response: httpx.Response) -> bool:
+    """Detect a rate-limit response from GitHub (403/429 with Remaining=0)."""
+    if response.status_code not in (403, 429):
+        return False
+    remaining = response.headers.get("X-RateLimit-Remaining")
+    if remaining == "0":
+        return True
+    # Some proxies strip the header; fall back to body inspection.
+    try:
+        body = response.text or ""
+    except Exception:
+        body = ""
+    return "rate limit" in body.lower() or "API rate limit exceeded" in body
+
 
 def _is_docker_environment() -> bool:
     """Detect if running inside a Docker container."""
@@ -287,6 +343,23 @@ async def check_for_updates(
     beta_setting = result.scalar_one_or_none()
     include_beta = beta_setting and beta_setting.value.lower() == "true"
 
+    # Short-circuit if we're still inside a GitHub rate-limit backoff window (#1420).
+    backoff_remaining = _seconds_until_github_unblocked()
+    if backoff_remaining > 0:
+        _update_status = {
+            "status": "error",
+            "progress": 0,
+            "message": "GitHub rate limit reached",
+            "error": "GitHub rate limit reached; retry later",
+        }
+        return {
+            "update_available": False,
+            "current_version": APP_VERSION,
+            "latest_version": None,
+            "error": "GitHub rate limit reached; retry later",
+            "retry_after_seconds": int(backoff_remaining),
+        }
+
     _update_status = {
         "status": "checking",
         "progress": 0,
@@ -302,6 +375,22 @@ async def check_for_updates(
                 timeout=10.0,
             )
 
+            if _is_github_rate_limit_response(response):
+                _record_github_rate_limit(response)
+                _update_status = {
+                    "status": "error",
+                    "progress": 0,
+                    "message": "GitHub rate limit reached",
+                    "error": "GitHub rate limit reached; retry later",
+                }
+                return {
+                    "update_available": False,
+                    "current_version": APP_VERSION,
+                    "latest_version": None,
+                    "error": "GitHub rate limit reached; retry later",
+                    "retry_after_seconds": int(_seconds_until_github_unblocked()),
+                }
+
             if response.status_code == 404:
                 # No releases yet
                 _update_status = {
@@ -419,6 +508,10 @@ async def _discover_target_release(db: AsyncSession) -> str | None:
     beta_setting = result.scalar_one_or_none()
     include_beta = beta_setting and beta_setting.value.lower() == "true"
 
+    if _seconds_until_github_unblocked() > 0:
+        logger.warning("Skipping update target discovery: GitHub rate-limit backoff still active")
+        return None
+
     try:
         async with httpx.AsyncClient() as client:
             response = await client.get(
@@ -426,6 +519,9 @@ async def _discover_target_release(db: AsyncSession) -> str | None:
                 headers={"Accept": "application/vnd.github.v3+json"},
                 timeout=10.0,
             )
+            if _is_github_rate_limit_response(response):
+                _record_github_rate_limit(response)
+                return None
             response.raise_for_status()
             releases = response.json()
     except (httpx.HTTPError, ValueError) as exc:

+ 45 - 0
backend/tests/integration/test_printers_api.py

@@ -493,6 +493,51 @@ class TestPrintersAPI:
         assert response.status_code == 200
         assert response.content == b"PLATE_3_PNG"
 
+    @pytest.mark.asyncio
+    @pytest.mark.integration
+    async def test_cover_negative_cache_skips_repeat_ftp_fanout(
+        self, async_client: AsyncClient, printer_factory, db_session
+    ):
+        """#1420: when every FTP path returns 550 for the current subtask, the
+        next request for the same subtask must short-circuit to 404 instead of
+        replaying the 8-path FTP fan-out (which starves the printer's single
+        FTP socket and flooded the user's logs)."""
+        from unittest.mock import AsyncMock, MagicMock, patch
+
+        from backend.app.api.routes.printers import _cover_404_cache, _cover_cache
+        from backend.app.services.bambu_mqtt import PrinterState
+
+        printer = await printer_factory()
+
+        _cover_cache.pop(printer.id, None)
+        _cover_404_cache.pop(printer.id, None)
+
+        state = PrinterState()
+        state.connected = True
+        state.state = "RUNNING"
+        state.subtask_name = "OrphanPrint"
+        state.gcode_file = "OrphanPrint.3mf"
+
+        ftp_mock = AsyncMock(return_value=False)
+
+        with (
+            patch("backend.app.api.routes.printers.printer_manager") as mock_pm,
+            patch("backend.app.api.routes.printers.download_file_try_paths_async", ftp_mock),
+        ):
+            mock_pm.get_status = MagicMock(return_value=state)
+            mock_pm.is_awaiting_plate_clear = MagicMock(return_value=False)
+
+            r1 = await async_client.get(f"/api/v1/printers/{printer.id}/cover")
+            r2 = await async_client.get(f"/api/v1/printers/{printer.id}/cover")
+
+        assert r1.status_code == 404
+        assert r2.status_code == 404
+        # First call retries internally; second call must short-circuit before FTP.
+        first_call_count = ftp_mock.await_count
+        assert first_call_count >= 1
+        # Second request didn't add to the count: the negative cache held.
+        assert ftp_mock.await_count == first_call_count
+
     @pytest.mark.asyncio
     @pytest.mark.integration
     async def test_get_printer_status_omits_fila_switch_when_not_installed(

+ 62 - 0
backend/tests/integration/test_updates_api.py

@@ -181,6 +181,68 @@ class TestUpdatesAPI:
         assert body["is_docker"] is True
         assert body["update_method"] == "docker"
 
+    @pytest.mark.asyncio
+    async def test_check_backs_off_after_github_rate_limit(self, async_client: AsyncClient):
+        """#1420: once GitHub returns 403 with X-RateLimit-Remaining=0, the
+        next call must short-circuit on the backoff window instead of hitting
+        api.github.com again. Otherwise the user's logs flood with rate-limit
+        errors and Bambuddy keeps adding to whatever throttle GitHub applies."""
+        import time
+
+        import httpx as _httpx
+
+        import backend.app.api.routes.updates as updates_module
+
+        # Reset module-level backoff state between tests.
+        updates_module._github_rate_limit_until = 0.0
+
+        # Future reset time, ~10 minutes ahead — the backoff window we expect.
+        future_reset = time.time() + 600
+
+        class _RateLimitedResp:
+            status_code = 403
+            headers = {
+                "X-RateLimit-Remaining": "0",
+                "X-RateLimit-Reset": str(int(future_reset)),
+            }
+            text = "API rate limit exceeded"
+
+            def raise_for_status(self):
+                raise _httpx.HTTPStatusError("403", request=None, response=self)
+
+            def json(self):
+                return {"message": "API rate limit exceeded"}
+
+        call_counter = {"n": 0}
+
+        class _FakeClient:
+            async def __aenter__(self):
+                return self
+
+            async def __aexit__(self, *_):
+                return None
+
+            async def get(self, *_, **__):
+                call_counter["n"] += 1
+                return _RateLimitedResp()
+
+        try:
+            with patch.object(_httpx, "AsyncClient", _FakeClient):
+                first = await async_client.get("/api/v1/updates/check")
+                second = await async_client.get("/api/v1/updates/check")
+        finally:
+            updates_module._github_rate_limit_until = 0.0
+
+        # First request reached httpx; second short-circuited on the backoff.
+        assert call_counter["n"] == 1
+
+        first_body = first.json()
+        second_body = second.json()
+        assert "rate limit" in (first_body.get("error") or "").lower()
+        assert "rate limit" in (second_body.get("error") or "").lower()
+        # Backoff window roughly matches the X-RateLimit-Reset header.
+        assert second_body.get("retry_after_seconds", 0) > 0
+
     def test_parse_version(self):
         from backend.app.api.routes.updates import parse_version
 

Some files were not shown because too many files changed in this diff