# test_loop_watchdog.py (2.5 KB)
  1. """Tests for the event-loop stall watchdog (#1486)."""
  2. import asyncio
  3. import faulthandler
  4. from unittest.mock import patch
  5. import pytest
  6. from backend.app.services import loop_watchdog
  7. @pytest.fixture(autouse=True)
  8. def _mock_faulthandler():
  9. """Patch faulthandler so tests never arm a real 30s stall timer that
  10. could fire mid-suite. Yields (arm_mock, cancel_mock)."""
  11. with (
  12. patch.object(faulthandler, "dump_traceback_later") as arm,
  13. patch.object(faulthandler, "cancel_dump_traceback_later") as cancel,
  14. ):
  15. yield arm, cancel
  16. # Safety net: make sure no test leaves the watchdog task running.
  17. loop_watchdog.stop_loop_watchdog()
  18. async def test_start_arms_the_stall_timer(_mock_faulthandler):
  19. arm, cancel = _mock_faulthandler
  20. loop_watchdog.start_loop_watchdog()
  21. await asyncio.sleep(0) # let the heartbeat run its first iteration
  22. assert cancel.called, "previous timer must be cancelled before re-arming"
  23. assert arm.called
  24. # Armed STALL_THRESHOLD seconds ahead, single-shot.
  25. args, kwargs = arm.call_args
  26. assert args[0] == loop_watchdog.STALL_THRESHOLD
  27. assert kwargs.get("repeat") is False
  28. async def test_start_is_idempotent(_mock_faulthandler):
  29. loop_watchdog.start_loop_watchdog()
  30. first = loop_watchdog._watchdog_task
  31. loop_watchdog.start_loop_watchdog()
  32. assert loop_watchdog._watchdog_task is first, "second start must not spawn a task"
  33. async def test_stop_cancels_the_task_and_disarms(_mock_faulthandler):
  34. _arm, cancel = _mock_faulthandler
  35. loop_watchdog.start_loop_watchdog()
  36. task = loop_watchdog._watchdog_task
  37. assert task is not None
  38. cancel.reset_mock()
  39. loop_watchdog.stop_loop_watchdog()
  40. assert loop_watchdog._watchdog_task is None
  41. assert cancel.called, "stop must disarm the pending faulthandler timer"
  42. await asyncio.sleep(0)
  43. assert task.cancelled() or task.done()
  44. async def test_heartbeat_interval_is_below_stall_threshold():
  45. """A healthy loop must always re-arm before the timer can fire."""
  46. assert loop_watchdog.HEARTBEAT_INTERVAL < loop_watchdog.STALL_THRESHOLD
  47. async def test_rearm_failure_does_not_crash_the_watchdog(_mock_faulthandler):
  48. """A faulthandler hiccup must not take down the heartbeat task."""
  49. arm, _cancel = _mock_faulthandler
  50. arm.side_effect = RuntimeError("boom")
  51. loop_watchdog.start_loop_watchdog()
  52. await asyncio.sleep(0)
  53. task = loop_watchdog._watchdog_task
  54. assert task is not None and not task.done(), "watchdog must survive a re-arm error"