loop_watchdog.py 3.0 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677
  1. """Event-loop stall watchdog (#1486).
  2. A frozen asyncio event loop is invisible: it produces no log line and no
  3. traceback — the HTTP server just goes silent, ``/health`` hangs, and the
  4. process can stop responding to SIGTERM. Several "container hangs after adding
  5. a printer" reports had exactly this shape, with nothing in the logs to act on.
  6. This watchdog makes such a freeze diagnosable. An async heartbeat re-arms
  7. ``faulthandler.dump_traceback_later()`` every ``HEARTBEAT_INTERVAL`` seconds,
  8. always ``STALL_THRESHOLD`` seconds ahead. While the loop keeps ticking the
  9. timer is cancelled and re-armed before it can fire. If the loop stalls, the
  10. heartbeat can't re-arm — and faulthandler's timer runs in a dedicated C-level
  11. thread that fires regardless of the frozen loop, dumping *every* thread's
  12. stack to stderr. The blocked frame then shows up in ``docker compose logs``.
  13. """
  14. import asyncio
  15. import faulthandler
  16. import logging
  17. logger = logging.getLogger(__name__)
  18. # How often the heartbeat cancels + re-arms the faulthandler timer. Must be
  19. # comfortably below STALL_THRESHOLD so a healthy loop always re-arms in time.
  20. HEARTBEAT_INTERVAL = 10.0
  21. # The loop must be unresponsive for at least this long before thread stacks
  22. # are dumped. Generous on purpose: no legitimate on-loop operation should
  23. # block for 30s, so anything that does is itself a bug worth a stack dump.
  24. STALL_THRESHOLD = 30.0
  25. _watchdog_task: asyncio.Task | None = None
  26. async def _heartbeat_loop() -> None:
  27. """Re-arm the faulthandler stall timer on every tick."""
  28. while True:
  29. try:
  30. faulthandler.cancel_dump_traceback_later()
  31. # repeat=False: one dump pinpoints a hard freeze. If the loop
  32. # recovers and stalls again, the next heartbeat re-arms anyway.
  33. faulthandler.dump_traceback_later(STALL_THRESHOLD, repeat=False)
  34. except Exception as e: # never let the watchdog itself crash the app
  35. logger.warning("Loop watchdog re-arm failed: %s", e)
  36. try:
  37. await asyncio.sleep(HEARTBEAT_INTERVAL)
  38. except asyncio.CancelledError:
  39. break
  40. def start_loop_watchdog() -> None:
  41. """Start the event-loop stall watchdog. Idempotent."""
  42. global _watchdog_task
  43. if _watchdog_task is not None:
  44. return
  45. if not faulthandler.is_enabled():
  46. # Also installs handlers for fatal signals (SIGSEGV etc.) — harmless
  47. # and useful; the dump_traceback_later timer works either way.
  48. faulthandler.enable()
  49. _watchdog_task = asyncio.create_task(_heartbeat_loop())
  50. logger.info(
  51. "Event-loop stall watchdog started — dumps all thread stacks to stderr if the loop stalls for more than %.0fs",
  52. STALL_THRESHOLD,
  53. )
  54. def stop_loop_watchdog() -> None:
  55. """Stop the watchdog and disarm the pending stall timer."""
  56. global _watchdog_task
  57. if _watchdog_task is not None:
  58. _watchdog_task.cancel()
  59. _watchdog_task = None
  60. try:
  61. faulthandler.cancel_dump_traceback_later()
  62. except Exception:
  63. pass
  64. logger.info("Event-loop stall watchdog stopped")