test_no_unsafe_path_joins.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304
  1. """Backstop: every Path-arithmetic site in the API routes that joins a
  2. variable to a directory-like parent must either use ``safe_join_under`` or
  3. carry a ``# SEC-PATH-OK: <reason>`` marker.
  4. A critical advisory traced to plain ``Path / user_string`` arithmetic in
  5. ``import_project_file`` — the join had no resolve + containment check, and an
  6. attacker-supplied absolute path collapsed the left side. This test catches the
  7. same shape in any new route added later: it AST-walks every Python file under
  8. ``backend/app/api/routes/`` and flags every ``a / b`` where ``a`` looks like a
  9. directory variable and ``b`` is a non-constant (i.e. variable / call result).
  10. False positives are intentionally cheap to silence (add a one-line
  11. ``# SEC-PATH-OK: <reason>`` justifying the existing guard) so that *future*
  12. unsafe joins are noisy by default.
  13. """
  14. from __future__ import annotations
  15. import ast
  16. from pathlib import Path
  17. import pytest
  18. # The route surface receives external input directly; the services layer is
  19. # called by the routes and routinely receives values that originated from a
  20. # request (filenames, query params) or from an untrusted external source
  21. # (printer FTP listings — the printer is part of the threat surface in the
  22. # compromised-printer model). Both layers need the strictest gate.
  23. _BACKEND_APP = Path(__file__).resolve().parents[2] / "app"
  24. SCAN_DIRS = [
  25. _BACKEND_APP / "api" / "routes",
  26. _BACKEND_APP / "services",
  27. ]
  28. # Identifier substrings that suggest the LHS is a filesystem directory. Heuristic
  29. # but tuned to Bambuddy's conventions — every actual directory variable in the
  30. # routes hits one of these.
  31. _DIR_NAME_HINTS = (
  32. "_dir",
  33. "_path",
  34. "dir_",
  35. "path_",
  36. "temp_path",
  37. "library_dir",
  38. "archive_dir",
  39. "photos_dir",
  40. "base_dir",
  41. "ext_dir",
  42. "attachments_dir",
  43. "static_dir",
  44. "log_dir",
  45. "data_dir",
  46. "folder_path",
  47. "file_disk_path",
  48. "photo_path",
  49. "dest",
  50. "output_path",
  51. )
  52. # Function calls whose return value is a Path under our control. Hits to these
  53. # don't need scrutiny — they're constructed by Bambuddy code, not by the request.
  54. _KNOWN_PATH_FACTORIES = (
  55. "Path",
  56. "get_library_dir",
  57. "get_library_files_dir",
  58. "get_archive_dir",
  59. "get_project_attachments_dir",
  60. "get_project_cover_dir",
  61. "resolve",
  62. )
  63. _MARKER = "# SEC-PATH-OK:"
  64. def _looks_path_like(node: ast.AST) -> bool:
  65. """Heuristic for whether *node* evaluates to a ``pathlib.Path``."""
  66. if isinstance(node, ast.Call):
  67. func = node.func
  68. if isinstance(func, ast.Name) and func.id in _KNOWN_PATH_FACTORIES:
  69. return True
  70. return bool(isinstance(func, ast.Attribute) and func.attr in _KNOWN_PATH_FACTORIES)
  71. if isinstance(node, ast.Name):
  72. return any(hint in node.id for hint in _DIR_NAME_HINTS)
  73. if isinstance(node, ast.Attribute):
  74. # `settings.base_dir`, `cls.archive_dir`, etc.
  75. return any(hint in node.attr for hint in _DIR_NAME_HINTS)
  76. if isinstance(node, ast.BinOp) and isinstance(node.op, ast.Div):
  77. # Chains like ``base_dir / "x" / variable`` — keep looking left.
  78. return _looks_path_like(node.left)
  79. return False
  80. def _is_constant_string(node: ast.AST) -> bool:
  81. return isinstance(node, ast.Constant) and isinstance(node.value, str)
  82. def _rhs_is_attacker_shape(node: ast.AST) -> bool:
  83. """The high-risk shape is ``path / Name`` — RHS is a bare variable that
  84. came from somewhere outside this scope (a function parameter, request
  85. body field, ZIP namelist entry).
  86. Attribute (``lib_file.file_path``), Subscript (``photos[i]``), Call
  87. (``str(vp_id)``), and JoinedStr (f-strings) all have *some* structure
  88. that the audit can reason about — those are caught by the broader audit
  89. sweep, not the regression backstop. This narrows the noise to the exact
  90. shape that produced the path-traversal class so the backstop only fires
  91. when something that *looks* like that bug appears.
  92. """
  93. return isinstance(node, ast.Name)
  94. _CONTINUATION_TOKENS = (")", "]", "}", ",")
  95. def _line_has_marker(source_lines: list[str], lineno: int, end_lineno: int | None) -> bool:
  96. """Check whether a ``# SEC-PATH-OK:`` marker covers this join.
  97. Looks at every line spanned by the BinOp itself, plus one line past
  98. ``end_lineno`` IF that line begins with a continuation token (``)``,
  99. ``]``, ``}``, ``,``). The peek captures the project's convention of
  100. wrapping a BinOp in parens and placing the marker on the closing line:
  101. file_path = (
  102. library_dir / filename
  103. ) # SEC-PATH-OK: filename validated above
  104. The BinOp itself ends on the inner line, but the marker reads more
  105. naturally on the closing line. Restricting the peek to continuation
  106. lines prevents giving a free pass to a marker that happens to sit on
  107. a wholly unrelated follow-on statement.
  108. """
  109. start = max(1, lineno)
  110. end = max(start, end_lineno or lineno)
  111. for i in range(start, end + 1):
  112. if i - 1 >= len(source_lines):
  113. continue
  114. if _MARKER in source_lines[i - 1]:
  115. return True
  116. # Project convention: marker often sits on the closing-paren line, one
  117. # line past the BinOp's `end_lineno`. Only accept it when the line is a
  118. # continuation of the wrapping expression.
  119. if end < len(source_lines):
  120. trailing = source_lines[end].lstrip()
  121. if trailing.startswith(_CONTINUATION_TOKENS) and _MARKER in source_lines[end]:
  122. return True
  123. return False
  124. def _enclosing_call_is_safe_join(stack: list[ast.AST]) -> bool:
  125. """True if the BinOp is being passed directly into ``safe_join_under(...)``.
  126. Tracking parent links keeps the test conservative — a ``base_dir / x``
  127. expression that's already inside ``safe_join_under(base_dir / x, ...)``
  128. is fine because the helper does its own containment check. This rarely
  129. happens in practice but keeps the test from yelling about an idiomatic
  130. arrangement.
  131. """
  132. for ancestor in reversed(stack):
  133. if isinstance(ancestor, ast.Call):
  134. func = ancestor.func
  135. if isinstance(func, ast.Name) and func.id == "safe_join_under":
  136. return True
  137. if isinstance(func, ast.Attribute) and func.attr == "safe_join_under":
  138. return True
  139. return False
  140. def _scan_file(py_file: Path) -> list[str]:
  141. source = py_file.read_text()
  142. source_lines = source.splitlines()
  143. tree = ast.parse(source, filename=str(py_file))
  144. findings: list[str] = []
  145. # Walk with parent stack so we can detect "inside safe_join_under" and
  146. # skip such nodes.
  147. stack: list[ast.AST] = []
  148. def visit(node: ast.AST) -> None:
  149. stack.append(node)
  150. try:
  151. if (
  152. isinstance(node, ast.BinOp)
  153. and isinstance(node.op, ast.Div)
  154. and _looks_path_like(node.left)
  155. and _rhs_is_attacker_shape(node.right)
  156. and not _is_constant_string(node.right)
  157. and not _enclosing_call_is_safe_join(stack)
  158. and not _line_has_marker(source_lines, node.lineno, node.end_lineno)
  159. ):
  160. line = source_lines[node.lineno - 1].strip() if node.lineno - 1 < len(source_lines) else "<?>"
  161. findings.append(f"{py_file.name}:{node.lineno} {line}")
  162. for child in ast.iter_child_nodes(node):
  163. visit(child)
  164. finally:
  165. stack.pop()
  166. visit(tree)
  167. return findings
  168. def _scan_source(source: str, tmp_path: Path) -> list[str]:
  169. """Drop ``source`` into a temp .py file and run ``_scan_file`` against it.
  170. Returns the findings list."""
  171. f = tmp_path / "candidate.py"
  172. f.write_text(source)
  173. return _scan_file(f)
  174. class TestMarkerDetection:
  175. """Pins the contract for where a ``# SEC-PATH-OK:`` marker is accepted.
  176. Markers must sit either (a) somewhere within the BinOp's own source
  177. lines, or (b) on the immediately-following line when that line is a
  178. continuation token (closing paren / bracket / brace / comma). The
  179. second case is the project's convention of wrapping the BinOp in
  180. parens and placing the marker on the closing-paren line.
  181. """
  182. def test_marker_on_binop_line_recognised(self, tmp_path):
  183. source = (
  184. "from pathlib import Path\nbase_dir = Path('.')\ndef f(x): return base_dir / x # SEC-PATH-OK: trusted x\n"
  185. )
  186. assert _scan_source(source, tmp_path) == []
  187. def test_marker_on_closing_paren_line_recognised(self, tmp_path):
  188. # The exact convention used across api/routes/ and services/: the
  189. # BinOp lives inside a parenthesised expression and the marker sits
  190. # on the closing-paren line, one past the BinOp's `end_lineno`.
  191. source = (
  192. "from pathlib import Path\n"
  193. "base_dir = Path('.')\n"
  194. "def f(x):\n"
  195. " return (\n"
  196. " base_dir / x\n"
  197. " ) # SEC-PATH-OK: trusted x\n"
  198. )
  199. assert _scan_source(source, tmp_path) == []
  200. def test_unrelated_marker_below_does_not_silence(self, tmp_path):
  201. # A second join below carries a marker; the first does not. Only
  202. # the first must be flagged — markers don't cross statement
  203. # boundaries.
  204. source = (
  205. "from pathlib import Path\n"
  206. "base_dir = Path('.')\n"
  207. "def f(x, y):\n"
  208. " p = base_dir / x\n"
  209. " q = base_dir / y # SEC-PATH-OK: trusted y\n"
  210. )
  211. findings = _scan_source(source, tmp_path)
  212. assert len(findings) == 1
  213. assert "base_dir / x" in findings[0]
  214. def test_marker_on_non_continuation_line_does_not_silence(self, tmp_path):
  215. # A SEC-PATH-OK comment on the line right after the BinOp counts
  216. # only when that line is a continuation of the wrapping expression.
  217. # An unrelated next statement's marker must not free-pass the join.
  218. source = (
  219. "from pathlib import Path\n"
  220. "base_dir = Path('.')\n"
  221. "def f(x):\n"
  222. " a = base_dir / x\n"
  223. " b = 1 # SEC-PATH-OK: unrelated, for a different statement\n"
  224. )
  225. findings = _scan_source(source, tmp_path)
  226. assert len(findings) == 1
  227. assert "base_dir / x" in findings[0]
  228. def test_no_marker_anywhere_is_flagged(self, tmp_path):
  229. # Pin the negative path so a future refactor can't accidentally turn
  230. # marker detection into "always returns True".
  231. source = "from pathlib import Path\nbase_dir = Path('.')\ndef f(x): return base_dir / x\n"
  232. findings = _scan_source(source, tmp_path)
  233. assert len(findings) == 1
  234. assert "base_dir / x" in findings[0]
  235. def test_route_path_arithmetic_is_safe_joined_or_marked():
  236. """Every ``<dir-like> / <non-constant>`` join in a route handler must
  237. either route through ``safe_join_under(...)`` or carry a
  238. ``# SEC-PATH-OK: <reason>`` marker on one of its source lines.
  239. Adding ``# SEC-PATH-OK: <reason>`` is the escape hatch for sites where
  240. the input has already been validated (e.g. a denylist + membership
  241. check, a pre-sanitised alphanumeric filter, or an explicit resolve +
  242. ``relative_to`` containment check inline). The marker MUST explain the
  243. existing guard — silent suppression defeats the backstop's purpose.
  244. """
  245. findings: list[str] = []
  246. for scan_dir in SCAN_DIRS:
  247. for py_file in sorted(scan_dir.rglob("*.py")):
  248. if py_file.name == "__init__.py":
  249. continue
  250. findings.extend(_scan_file(py_file))
  251. if findings:
  252. pytest.fail(
  253. "Found Path-arithmetic sites in api/routes/ or services/ that "
  254. "join a non-constant value to a directory-like parent without "
  255. "using safe_join_under() or carrying a # SEC-PATH-OK: marker. "
  256. "Each site must either be refactored to "
  257. "safe_join_under(parent, *parts) or tagged with the marker "
  258. "explaining why the existing guard is sufficient.\n\nFindings:\n" + "\n".join(findings)
  259. )