|
|
@@ -0,0 +1,77 @@
|
|
|
+"""Event-loop stall watchdog (#1486).
|
|
|
+
|
|
|
+A frozen asyncio event loop is invisible: it produces no log line and no
|
|
|
+traceback — the HTTP server just goes silent, ``/health`` hangs, and the
|
|
|
+process can stop responding to SIGTERM. Several "container hangs after adding
|
|
|
+a printer" reports had exactly this shape, with nothing in the logs to act on.
|
|
|
+
|
|
|
+This watchdog makes such a freeze diagnosable. An async heartbeat re-arms
|
|
|
+``faulthandler.dump_traceback_later()`` every ``HEARTBEAT_INTERVAL`` seconds,
|
|
|
+always ``STALL_THRESHOLD`` seconds ahead. While the loop keeps ticking the
|
|
|
+timer is cancelled and re-armed before it can fire. If the loop stalls, the
|
|
|
+heartbeat can't re-arm — and faulthandler's timer runs in a dedicated C-level
|
|
|
+thread that fires regardless of the frozen loop, dumping *every* thread's
|
|
|
+stack to stderr. The blocked frame then shows up in ``docker compose logs``.
|
|
|
+"""
|
|
|
+
|
|
|
+import asyncio
|
|
|
+import faulthandler
|
|
|
+import logging
|
|
|
+
|
|
|
# Module-level logger used for watchdog start/stop messages and re-arm failures.
logger = logging.getLogger(__name__)

# How often (in seconds) the heartbeat cancels + re-arms the faulthandler
# timer. Must be comfortably below STALL_THRESHOLD so a healthy loop always
# re-arms in time.
HEARTBEAT_INTERVAL = 10.0

# The loop must be unresponsive for at least this long (in seconds) before
# thread stacks are dumped. Generous on purpose: no legitimate on-loop
# operation should block for 30s, so anything that does is itself a bug worth
# a stack dump.
STALL_THRESHOLD = 30.0

# Handle to the heartbeat task. Set by start_loop_watchdog() and reset to None
# by stop_loop_watchdog(); None means no watchdog has been started (or it has
# been stopped).
_watchdog_task: asyncio.Task | None = None
|
|
|
+
|
|
|
+
|
|
|
async def _heartbeat_loop() -> None:
    """Keep pushing the faulthandler stall timer into the future.

    Each pass drops the pending timer, arms a new one ``STALL_THRESHOLD``
    seconds out, and then sleeps for ``HEARTBEAT_INTERVAL`` seconds. The loop
    runs until the task is cancelled; on cancellation the coroutine returns
    normally rather than re-raising ``CancelledError``.
    """
    cancelled = False
    while not cancelled:
        try:
            faulthandler.cancel_dump_traceback_later()
            # One-shot timer (repeat=False): a single dump is enough to locate
            # a hard freeze, and a loop that recovers re-arms on its next pass.
            faulthandler.dump_traceback_later(STALL_THRESHOLD, repeat=False)
        except Exception as exc:  # a failing watchdog must not take the app down
            logger.warning("Loop watchdog re-arm failed: %s", exc)

        try:
            await asyncio.sleep(HEARTBEAT_INTERVAL)
        except asyncio.CancelledError:
            # Cancellation is the stop signal: leave the loop on the next check.
            cancelled = True
|
|
|
+
|
|
|
+
|
|
|
def start_loop_watchdog() -> None:
    """Start the event-loop stall watchdog. Idempotent.

    Must be called from code running on the event loop: the heartbeat is
    scheduled with ``asyncio.create_task()``, which raises ``RuntimeError``
    when no loop is running.

    Calling this while the heartbeat task is still alive is a no-op. If the
    previously started task has already finished, a fresh heartbeat is started
    instead of leaving the watchdog silently dead.
    """
    global _watchdog_task
    if _watchdog_task is not None and not _watchdog_task.done():
        return
    if not faulthandler.is_enabled():
        # Also installs handlers for fatal signals (SIGSEGV etc.) — harmless
        # and useful; the dump_traceback_later timer works either way.
        try:
            faulthandler.enable()
        except Exception as e:  # never let the watchdog itself crash the app
            # enable() needs a real file descriptor behind sys.stderr; it
            # raises when stderr is None or lacks a usable fileno().
            logger.warning("Loop watchdog could not enable faulthandler: %s", e)
    _watchdog_task = asyncio.create_task(_heartbeat_loop())
    logger.info(
        "Event-loop stall watchdog started — dumps all thread stacks to stderr if the loop stalls for more than %.0fs",
        STALL_THRESHOLD,
    )
|
|
|
+
|
|
|
+
|
|
|
def stop_loop_watchdog() -> None:
    """Stop the watchdog and disarm the pending stall timer.

    Cancels the heartbeat task if one was started and clears the module-level
    handle so ``start_loop_watchdog()`` can start a new one. The timer disarm
    is attempted even when no task was running, and is best-effort: a failure
    is logged, never raised.
    """
    global _watchdog_task
    if _watchdog_task is not None:
        _watchdog_task.cancel()
        _watchdog_task = None
    try:
        faulthandler.cancel_dump_traceback_later()
    except Exception as e:  # best-effort: stopping must not fail on the disarm
        logger.warning("Loop watchdog disarm failed: %s", e)
    logger.info("Event-loop stall watchdog stopped")
|