# Source: yunoadmin/bambuddy — sync mirror of https://github.com/maziggy/bambuddy


			
"""Event-loop stall watchdog (#1486).

A frozen asyncio event loop is invisible: it produces no log line and no
traceback — the HTTP server just goes silent, ``/health`` hangs, and the
process can stop responding to SIGTERM. Several "container hangs after adding
a printer" reports had exactly this shape, with nothing in the logs to act on.

This watchdog makes such a freeze diagnosable. An async heartbeat re-arms
``faulthandler.dump_traceback_later()`` every ``HEARTBEAT_INTERVAL`` seconds,
always ``STALL_THRESHOLD`` seconds ahead. While the loop keeps ticking the
timer is cancelled and re-armed before it can fire. If the loop stalls, the
heartbeat can't re-arm — and faulthandler's timer runs in a dedicated C-level
thread that fires regardless of the frozen loop, dumping *every* thread's
stack to stderr. The blocked frame then shows up in ``docker compose logs``.
"""

import asyncio
import faulthandler
import logging

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Seconds the heartbeat sleeps between re-arms of the faulthandler timer
# (each re-arm cancels the pending timer and schedules a new one). Must be
# comfortably below STALL_THRESHOLD so a healthy loop always re-arms in time.
HEARTBEAT_INTERVAL = 10.0

# Timeout, in seconds, passed to faulthandler.dump_traceback_later() on each
# heartbeat. Because the most recent re-arm may already be up to
# HEARTBEAT_INTERVAL seconds old when a stall begins, the dump fires once the
# loop has been unresponsive for somewhere between
# STALL_THRESHOLD - HEARTBEAT_INTERVAL and STALL_THRESHOLD seconds.
# Generous on purpose: no legitimate on-loop operation should block for that
# long, so anything that does is itself a bug worth a stack dump.
STALL_THRESHOLD = 30.0

# Handle to the heartbeat task created by start_loop_watchdog(); None while
# the watchdog has not been started or after stop_loop_watchdog() cleared it.
_watchdog_task: asyncio.Task | None = None


async def _heartbeat_loop() -> None:
    """Keep pushing the faulthandler stall deadline forward until cancelled.

    Every pass drops the pending dump timer and schedules a new one
    ``STALL_THRESHOLD`` seconds out, then sleeps for ``HEARTBEAT_INTERVAL``
    seconds. A cancellation delivered during the sleep ends the coroutine
    normally; the ``CancelledError`` is absorbed rather than propagated.
    """
    cancelled = False
    while not cancelled:
        try:
            # Drop the old deadline before setting the new one. A single,
            # non-repeating dump is enough to locate a hard freeze; a loop
            # that recovers simply re-arms on its next pass.
            faulthandler.cancel_dump_traceback_later()
            faulthandler.dump_traceback_later(STALL_THRESHOLD, repeat=False)
        except Exception as rearm_error:  # the watchdog must never take the app down
            logger.warning("Loop watchdog re-arm failed: %s", rearm_error)

        try:
            await asyncio.sleep(HEARTBEAT_INTERVAL)
        except asyncio.CancelledError:
            cancelled = True


def start_loop_watchdog() -> None:
    """Start the event-loop stall watchdog. Idempotent while it is running.

    Schedules ``_heartbeat_loop()`` with ``asyncio.create_task()``, so this
    must be called from code running inside an event loop. If a heartbeat
    task exists and has not finished, the call is a no-op. A task that has
    already finished (for example because it was cancelled) no longer re-arms
    the timer, so it is replaced by a fresh one.

    Raises:
        RuntimeError: propagated from ``asyncio.create_task()`` when there is
            no running event loop.
    """
    global _watchdog_task
    if _watchdog_task is not None and not _watchdog_task.done():
        return
    try:
        if not faulthandler.is_enabled():
            # Also installs handlers for fatal signals (SIGSEGV etc.) — harmless
            # and useful; the dump_traceback_later timer works either way.
            faulthandler.enable()
    except Exception as e:  # never let the watchdog itself crash the app
        # enable() can fail when stderr is not usable as a dump target; the
        # heartbeat logs its own warning if re-arming then fails as well.
        logger.warning("Loop watchdog could not enable faulthandler: %s", e)
    _watchdog_task = asyncio.create_task(_heartbeat_loop())
    logger.info(
        "Event-loop stall watchdog started — dumps all thread stacks to stderr if the loop stalls for more than %.0fs",
        STALL_THRESHOLD,
    )


def stop_loop_watchdog() -> None:
    """Stop the watchdog and disarm the pending stall timer.

    Safe to call when the watchdog was never started. Failures while
    cancelling the heartbeat task or disarming the timer are logged as
    warnings instead of being raised or silently dropped, so shutdown is
    never interrupted by the watchdog.
    """
    global _watchdog_task
    # Clear the module-level handle first so the state is consistent even if
    # cancelling the task fails below.
    task, _watchdog_task = _watchdog_task, None
    if task is not None:
        try:
            task.cancel()
        except Exception as e:
            # Best-effort: e.g. cancelling can fail if the task's event loop
            # has already been closed.
            logger.warning("Loop watchdog task cancel failed: %s", e)
    try:
        # Always disarm, whether or not a task existed, so no stale timer can
        # fire after the watchdog has been stopped.
        faulthandler.cancel_dump_traceback_later()
    except Exception as e:
        logger.warning("Loop watchdog disarm failed: %s", e)
    logger.info("Event-loop stall watchdog stopped")