# Source: yunoadmin/bambuddy (mirror of https://github.com/maziggy/bambuddy)


			
"""Log-health scanner.

Matches the recent Bambuddy app log against a curated catalog of known failure
signatures, so users can self-diagnose setup ("layer 8") issues before filing a
bug report.

The catalog is a deliberate *allowlist*: only known-bad, actionable signatures
are matched — a healthy install produces an empty finding list. Human-readable
cause and fix text is intentionally NOT stored here; the frontend renders it
from i18n keys ``systemHealth.signature.<id>.{name,cause,fix}`` so it stays
translatable across all locales. This module only carries the machine-facing
fields (pattern, severity, category, wiki anchor).
"""

import logging
import re
from dataclasses import dataclass

from pydantic import BaseModel

from backend.app.core.config import settings
from backend.app.services.log_reader import LogEntry, read_log_entries, sanitize_log_content

logger = logging.getLogger(__name__)

# Default number of most-recent log entries inspected by a scan.
DEFAULT_SCAN_LIMIT = 4000

# Numeric rank per log level, so a signature can demand "WARNING or worse".
_LEVEL_RANK = {
    "DEBUG": 10,
    "INFO": 20,
    "WARNING": 30,
    "ERROR": 40,
    "CRITICAL": 50,
}

# Sort order for findings: layer8 (user-fixable) first, then environment, then
# bug (please report). Inside a category, errors sort ahead of warnings.
_CATEGORY_ORDER = {
    category: position
    for position, category in enumerate(("layer8", "environment", "bug"))
}
_SEVERITY_ORDER = {
    severity: position for position, severity in enumerate(("error", "warning"))
}

# Upper bound on the sample line length, so a finding never ships a huge
# folded traceback.
_SAMPLE_MAX_LEN = 400


@dataclass(frozen=True)
class LogSignature:
    """One curated known-issue signature.

    ``patterns`` are matched (``re.search``, case-insensitive when built with
    ``_compile``) against the log entry message; a single matching pattern is
    enough for the entry to count as a hit. A signature only becomes a
    reported finding once it has matched ``min_count`` times within the scan
    window — this gates noisy, individually-benign symptoms (e.g. an
    occasional MQTT reconnect after a Wi-Fi blip) from being surfaced as a
    problem.

    Instances are frozen, so the catalog cannot be mutated at runtime. Field
    order is part of the positional constructor signature.
    """

    # Stable identifier; also the key segment of the frontend i18n entries
    # (systemHealth.signature.<id>.*).
    id: str
    # Compiled regexes searched against the entry message (any one may match).
    patterns: tuple[re.Pattern[str], ...]
    severity: str  # "error" | "warning"
    category: str  # "layer8" | "environment" | "bug"
    wiki_anchor: str  # slug appended to the troubleshooting wiki page URL
    # Lowest log level an entry must have to be considered for this signature.
    min_level: str = "WARNING"
    logger_prefix: str | None = None  # only match entries from this logger tree
    # Number of matching entries required before a finding is reported.
    min_count: int = 1


def _compile(*patterns: str) -> tuple[re.Pattern[str], ...]:
    """Compile each regex source case-insensitively and return them as a tuple.

    The result keeps the order of the arguments; calling with no arguments
    yields an empty tuple.
    """
    compiled = [re.compile(source, re.IGNORECASE) for source in patterns]
    return tuple(compiled)


# --- The catalog -----------------------------------------------------------
# Derived from the ranked "layer 8" root causes identified in the closed-issue
# triage review. Each id MUST have matching i18n keys:
# systemHealth.signature.<id>.*
# Catalog order is preserved as the tie-break when findings sort equal.
SIGNATURES: tuple[LogSignature, ...] = (
    # Wrong or mistyped access code: the FTPS login is rejected (530).
    LogSignature(
        id="ftp-auth-rejected",
        category="layer8",
        severity="error",
        wiki_anchor="wrong-access-code",
        logger_prefix="backend.app.services.bambu_ftp",
        patterns=_compile(r"FTP connection permission error"),
    ),
    # FTPS :990 unreachable: the port is blocked by a firewall, or the printer
    # is off / on a different subnet.
    LogSignature(
        id="ftp-connection-timeout",
        category="layer8",
        severity="warning",
        wiki_anchor="ftps-port-990-blocked",
        logger_prefix="backend.app.services.bambu_ftp",
        min_count=3,
        patterns=_compile(r"FTP connection timed out"),
    ),
    # TLS negotiation with the printer's FTPS server failed.
    LogSignature(
        id="ftp-ssl-error",
        category="layer8",
        severity="warning",
        wiki_anchor="ftps-tls-failure",
        logger_prefix="backend.app.services.bambu_ftp",
        min_count=3,
        patterns=_compile(r"FTP SSL error connecting"),
    ),
    # MQTT connection keeps dropping: typically MQTT :8883 partially blocked,
    # LAN mode unstable, or a flaky network path to the printer.
    LogSignature(
        id="mqtt-connection-flapping",
        category="layer8",
        severity="warning",
        wiki_anchor="mqtt-connection-unstable",
        logger_prefix="backend.app.services.bambu_mqtt",
        min_count=5,
        patterns=_compile(
            r"Forcing MQTT reconnect",
            r"Hard reset reconnect failed",
        ),
    ),
    # Camera stream unreachable: RTSPS :322 blocked, or the printer camera /
    # LAN liveview is disabled.
    LogSignature(
        id="camera-connection-refused",
        category="layer8",
        severity="warning",
        wiki_anchor="camera-rtsps-port-322",
        logger_prefix="backend.app.services.camera",
        min_count=3,
        patterns=_compile(
            r"Chamber image: connection refused",
            r"Chamber image: connection timeout",
            r"Camera connection test failed",
        ),
    ),
    # SQLite write contention. It surfaces inside exception tracebacks; folded
    # continuation lines are part of the entry message, so the pattern still
    # matches. The fix is switching to PostgreSQL under multi-printer load.
    # No logger_prefix: entries from any logger are considered.
    LogSignature(
        id="database-locked",
        category="environment",
        severity="error",
        wiki_anchor="database-is-locked",
        patterns=_compile(r"database is locked"),
    ),
)


class LogFinding(BaseModel):
    """An aggregated, sanitized match of one signature against the log.

    ``scan_logs`` produces at most one finding per signature, and only when
    the number of matching entries reached that signature's ``min_count``.
    The human-readable name/cause/fix text is not carried here; the frontend
    resolves it from ``signature_id``.
    """

    signature_id: str  # ``LogSignature.id`` of the signature that matched
    severity: str  # copied from the signature: "error" | "warning"
    category: str  # copied from the signature: "layer8" | "environment" | "bug"
    wiki_anchor: str  # copied from the signature
    count: int  # number of scanned entries that matched the signature
    # Timestamps of the oldest / newest matching entry in the scan window
    # (the scan assumes entries are supplied newest-first).
    first_seen: str
    last_seen: str
    # First line of the newest matching message, redacted and length-capped.
    sample: str


class ScanResult(BaseModel):
    """Result of a log-health scan.

    Returned by ``scan_logs``; an empty ``findings`` list means no catalog
    signature reached its reporting threshold.
    """

    # Sorted by category (layer8, environment, bug), then severity (errors
    # first), then descending match count.
    findings: list[LogFinding]
    scanned_entries: int  # number of log entries the scan actually examined
    log_available: bool  # whether the bambuddy.log file existed at scan time
    # Finding counts keyed by "total", "layer8", "environment" and "bug".
    summary: dict[str, int]


def _level_ok(entry: LogEntry, min_level: str) -> bool:
    """Return True when the entry's level is at or above ``min_level``.

    Both level names are compared case-insensitively. An entry whose level is
    not in ``_LEVEL_RANK`` ranks as 0 and therefore never satisfies a known
    threshold; an unrecognised ``min_level`` falls back to the WARNING rank.
    """
    entry_rank = _LEVEL_RANK.get(entry.level.upper(), 0)
    # Normalise the threshold too: previously a lower-case min_level (e.g.
    # "error") missed the lookup and silently degraded to the WARNING rank.
    required_rank = _LEVEL_RANK.get(min_level.upper(), _LEVEL_RANK["WARNING"])
    return entry_rank >= required_rank


def _matches(sig: LogSignature, entry: LogEntry) -> bool:
    """Decide whether a single log entry is a hit for the given signature.

    Three gates are applied in order: the entry's level must satisfy the
    signature's ``min_level``; if the signature names a ``logger_prefix``, the
    entry's logger name must start with it; and at least one of the
    signature's patterns must be found in the entry message.
    """
    level_passes = _level_ok(entry, sig.min_level)
    if not level_passes:
        return False

    prefix = sig.logger_prefix
    if prefix:
        if not entry.logger_name.startswith(prefix):
            return False

    for pattern in sig.patterns:
        if pattern.search(entry.message):
            return True
    return False


def _sample_line(message: str) -> str:
    """Return the first line of ``message``, capped at ``_SAMPLE_MAX_LEN``.

    A falsy message yields an empty string. When the first line exceeds the
    cap it is cut to the cap and an ellipsis character is appended.
    """
    if message:
        head = message.splitlines()[0]
    else:
        head = ""

    if len(head) <= _SAMPLE_MAX_LEN:
        return head
    return head[:_SAMPLE_MAX_LEN] + "…"


def scan_logs(
    limit: int = DEFAULT_SCAN_LIMIT,
    sensitive_strings: dict[str, str] | None = None,
) -> ScanResult:
    """Scan the recent app log against the signature catalog.

    Args:
        limit: Maximum number of recent log entries to read and scan.
        sensitive_strings: Mapping from
            :func:`log_reader.collect_sensitive_strings`, applied to every
            sample line so printer names, serials, IPs, and access codes never
            leave the process. Even when it is ``None`` the sample is still
            passed through ``sanitize_log_content``.

    Returns:
        A :class:`ScanResult` holding one finding per signature that matched
        at least ``min_count`` entries, sorted by category, then severity,
        then descending match count, plus per-category counts.
    """
    log_file = settings.log_dir / "bambuddy.log"
    log_available = log_file.exists()

    entries, _total = read_log_entries(limit=limit)

    # signature id -> accumulator. Entries are expected newest-first, so the
    # first hit for a signature is its most recent occurrence.
    agg: dict[str, dict] = {}
    for entry in entries:
        for sig in SIGNATURES:
            if not _matches(sig, entry):
                continue
            acc = agg.get(sig.id)
            if acc is None:
                # First (== newest) occurrence encountered.
                agg[sig.id] = {
                    "count": 1,
                    "sample": entry.message,
                    "last_seen": entry.timestamp,
                    "first_seen": entry.timestamp,
                }
            else:
                acc["count"] += 1
                # Iterating newest-first, so each later hit is older.
                acc["first_seen"] = entry.timestamp

    findings: list[LogFinding] = []
    for sig in SIGNATURES:
        acc = agg.get(sig.id)
        if acc is None or acc["count"] < sig.min_count:
            # Below the reporting threshold: individually-benign noise.
            continue
        raw_message = acc["sample"]
        first_line = raw_message.splitlines()[0] if raw_message else ""
        # Redact BEFORE truncating. Capping first could cut a sensitive value
        # in half at the length limit; the remaining fragment would no longer
        # match the redaction rules and would leak into the finding.
        redacted = sanitize_log_content(first_line, sensitive_strings)
        sample = _sample_line(redacted)
        findings.append(
            LogFinding(
                signature_id=sig.id,
                severity=sig.severity,
                category=sig.category,
                wiki_anchor=sig.wiki_anchor,
                count=acc["count"],
                first_seen=acc["first_seen"],
                last_seen=acc["last_seen"],
                sample=sample,
            )
        )

    # Unknown categories/severities sort last (rank 9). The sort is stable, so
    # ties keep catalog order.
    findings.sort(
        key=lambda f: (
            _CATEGORY_ORDER.get(f.category, 9),
            _SEVERITY_ORDER.get(f.severity, 9),
            -f.count,
        )
    )

    summary = {
        "total": len(findings),
        "layer8": sum(1 for f in findings if f.category == "layer8"),
        "environment": sum(1 for f in findings if f.category == "environment"),
        "bug": sum(1 for f in findings if f.category == "bug"),
    }

    return ScanResult(
        findings=findings,
        scanned_entries=len(entries),
        log_available=log_available,
        summary=summary,
    )