test_no_unsafe_path_joins.py 7.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203
  1. """Backstop: every Path-arithmetic site in the API routes that joins a
  2. variable to a directory-like parent must either use ``safe_join_under`` or
  3. carry a ``# SEC-PATH-OK: <reason>`` marker.
  4. A critical advisory traced to plain ``Path / user_string`` arithmetic in
  5. ``import_project_file`` — the join had no resolve + containment check, and an
  6. attacker-supplied absolute path collapsed the left side. This test catches the
  7. same shape in any new route added later: it AST-walks every Python file under
  8. ``backend/app/api/routes/`` and flags every ``a / b`` where ``a`` looks like a
  9. directory variable and ``b`` is a non-constant (i.e. variable / call result).
  10. False positives are intentionally cheap to silence (add a one-line
  11. ``# SEC-PATH-OK: <reason>`` justifying the existing guard) so that *future*
  12. unsafe joins are noisy by default.
  13. """
  14. from __future__ import annotations
  15. import ast
  16. from pathlib import Path
  17. import pytest
  18. # The route surface receives external input directly; the services layer is
  19. # called by the routes and routinely receives values that originated from a
  20. # request (filenames, query params) or from an untrusted external source
  21. # (printer FTP listings — the printer is part of the threat surface in the
  22. # compromised-printer model). Both layers need the strictest gate.
  23. _BACKEND_APP = Path(__file__).resolve().parents[2] / "app"
  24. SCAN_DIRS = [
  25. _BACKEND_APP / "api" / "routes",
  26. _BACKEND_APP / "services",
  27. ]
  28. # Identifier substrings that suggest the LHS is a filesystem directory. Heuristic
  29. # but tuned to Bambuddy's conventions — every actual directory variable in the
  30. # routes hits one of these.
  31. _DIR_NAME_HINTS = (
  32. "_dir",
  33. "_path",
  34. "dir_",
  35. "path_",
  36. "temp_path",
  37. "library_dir",
  38. "archive_dir",
  39. "photos_dir",
  40. "base_dir",
  41. "ext_dir",
  42. "attachments_dir",
  43. "static_dir",
  44. "log_dir",
  45. "data_dir",
  46. "folder_path",
  47. "file_disk_path",
  48. "photo_path",
  49. "dest",
  50. "output_path",
  51. )
  52. # Function calls whose return value is a Path under our control. Hits to these
  53. # don't need scrutiny — they're constructed by Bambuddy code, not by the request.
  54. _KNOWN_PATH_FACTORIES = (
  55. "Path",
  56. "get_library_dir",
  57. "get_library_files_dir",
  58. "get_archive_dir",
  59. "get_project_attachments_dir",
  60. "get_project_cover_dir",
  61. "resolve",
  62. )
  63. _MARKER = "# SEC-PATH-OK:"
  64. def _looks_path_like(node: ast.AST) -> bool:
  65. """Heuristic for whether *node* evaluates to a ``pathlib.Path``."""
  66. if isinstance(node, ast.Call):
  67. func = node.func
  68. if isinstance(func, ast.Name) and func.id in _KNOWN_PATH_FACTORIES:
  69. return True
  70. return bool(isinstance(func, ast.Attribute) and func.attr in _KNOWN_PATH_FACTORIES)
  71. if isinstance(node, ast.Name):
  72. return any(hint in node.id for hint in _DIR_NAME_HINTS)
  73. if isinstance(node, ast.Attribute):
  74. # `settings.base_dir`, `cls.archive_dir`, etc.
  75. return any(hint in node.attr for hint in _DIR_NAME_HINTS)
  76. if isinstance(node, ast.BinOp) and isinstance(node.op, ast.Div):
  77. # Chains like ``base_dir / "x" / variable`` — keep looking left.
  78. return _looks_path_like(node.left)
  79. return False
  80. def _is_constant_string(node: ast.AST) -> bool:
  81. return isinstance(node, ast.Constant) and isinstance(node.value, str)
  82. def _rhs_is_attacker_shape(node: ast.AST) -> bool:
  83. """The high-risk shape is ``path / Name`` — RHS is a bare variable that
  84. came from somewhere outside this scope (a function parameter, request
  85. body field, ZIP namelist entry).
  86. Attribute (``lib_file.file_path``), Subscript (``photos[i]``), Call
  87. (``str(vp_id)``), and JoinedStr (f-strings) all have *some* structure
  88. that the audit can reason about — those are caught by the broader audit
  89. sweep, not the regression backstop. This narrows the noise to the exact
  90. shape that produced the path-traversal class so the backstop only fires
  91. when something that *looks* like that bug appears.
  92. """
  93. return isinstance(node, ast.Name)
  94. def _line_has_marker(source_lines: list[str], lineno: int, end_lineno: int | None) -> bool:
  95. # Walk every line spanned by the BinOp — the marker can sit anywhere in the
  96. # expression (start, end, or any continuation line of a multi-line join).
  97. start = max(1, lineno)
  98. end = max(start, end_lineno or lineno)
  99. for i in range(start, end + 1):
  100. if i - 1 >= len(source_lines):
  101. continue
  102. if _MARKER in source_lines[i - 1]:
  103. return True
  104. return False
  105. def _enclosing_call_is_safe_join(stack: list[ast.AST]) -> bool:
  106. """True if the BinOp is being passed directly into ``safe_join_under(...)``.
  107. Tracking parent links keeps the test conservative — a ``base_dir / x``
  108. expression that's already inside ``safe_join_under(base_dir / x, ...)``
  109. is fine because the helper does its own containment check. This rarely
  110. happens in practice but keeps the test from yelling about an idiomatic
  111. arrangement.
  112. """
  113. for ancestor in reversed(stack):
  114. if isinstance(ancestor, ast.Call):
  115. func = ancestor.func
  116. if isinstance(func, ast.Name) and func.id == "safe_join_under":
  117. return True
  118. if isinstance(func, ast.Attribute) and func.attr == "safe_join_under":
  119. return True
  120. return False
  121. def _scan_file(py_file: Path) -> list[str]:
  122. source = py_file.read_text()
  123. source_lines = source.splitlines()
  124. tree = ast.parse(source, filename=str(py_file))
  125. findings: list[str] = []
  126. # Walk with parent stack so we can detect "inside safe_join_under" and
  127. # skip such nodes.
  128. stack: list[ast.AST] = []
  129. def visit(node: ast.AST) -> None:
  130. stack.append(node)
  131. try:
  132. if (
  133. isinstance(node, ast.BinOp)
  134. and isinstance(node.op, ast.Div)
  135. and _looks_path_like(node.left)
  136. and _rhs_is_attacker_shape(node.right)
  137. and not _is_constant_string(node.right)
  138. and not _enclosing_call_is_safe_join(stack)
  139. and not _line_has_marker(source_lines, node.lineno, node.end_lineno)
  140. ):
  141. line = source_lines[node.lineno - 1].strip() if node.lineno - 1 < len(source_lines) else "<?>"
  142. findings.append(f"{py_file.name}:{node.lineno} {line}")
  143. for child in ast.iter_child_nodes(node):
  144. visit(child)
  145. finally:
  146. stack.pop()
  147. visit(tree)
  148. return findings
  149. def test_route_path_arithmetic_is_safe_joined_or_marked():
  150. """Every ``<dir-like> / <non-constant>`` join in a route handler must
  151. either route through ``safe_join_under(...)`` or carry a
  152. ``# SEC-PATH-OK: <reason>`` marker on one of its source lines.
  153. Adding ``# SEC-PATH-OK: <reason>`` is the escape hatch for sites where
  154. the input has already been validated (e.g. a denylist + membership
  155. check, a pre-sanitised alphanumeric filter, or an explicit resolve +
  156. ``relative_to`` containment check inline). The marker MUST explain the
  157. existing guard — silent suppression defeats the backstop's purpose.
  158. """
  159. findings: list[str] = []
  160. for scan_dir in SCAN_DIRS:
  161. for py_file in sorted(scan_dir.rglob("*.py")):
  162. if py_file.name == "__init__.py":
  163. continue
  164. findings.extend(_scan_file(py_file))
  165. if findings:
  166. pytest.fail(
  167. "Found Path-arithmetic sites in api/routes/ or services/ that "
  168. "join a non-constant value to a directory-like parent without "
  169. "using safe_join_under() or carrying a # SEC-PATH-OK: marker. "
  170. "Each site must either be refactored to "
  171. "safe_join_under(parent, *parts) or tagged with the marker "
  172. "explaining why the existing guard is sufficient.\n\nFindings:\n" + "\n".join(findings)
  173. )