gitea.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365
  1. """Gitea backend — overrides GitHubBackend where Gitea's API diverges."""
  2. import base64
  3. import json
  4. import logging
  5. import re
  6. from datetime import datetime, timezone
  7. import httpx
  8. from backend.app.services.git_providers.github import GitHubBackend
  9. logger = logging.getLogger(__name__)
  10. class GiteaBackend(GitHubBackend):
  11. """Backend for Gitea instances.
  12. Gitea's Git Data API (/api/v1/repos/{owner}/{repo}/git/...) is *mostly*
  13. compatible with GitHub's, but diverges on three points that broke real-world
  14. backups (#1224, #1225, #1239):
  15. 1. ``GET /git/refs/heads/{branch}`` returns a *list* of matching refs even
  16. when only one matches; GitHub returns a single object. The push paths
  17. below extract the SHA via ``_ref_sha()`` instead of the GitHub-style
  18. ``["object"]["sha"]`` chain.
  19. 2. The Git Data API (blobs/trees/commits/refs) refuses writes against an
  20. empty repository — every blob POST returns 404 until the repo has at
  21. least one commit. ``_create_initial_commit()`` is overridden to use the
  22. Contents API, which seeds the branch + initial commit in a single call.
  23. 3. The Git Data API does not support atomic multi-file commits — each file
  24. requires a separate blob POST followed by a tree/commit/ref sequence.
  25. ``push_files()`` is overridden to use the Contents API
  26. (``POST /repos/.../contents`` with a ``files`` array), which commits all
  27. changed files in a single round-trip and avoids partial-commit failures.
  28. """
  29. @staticmethod
  30. def _ref_sha(ref_data) -> str:
  31. """Extract the commit SHA from Gitea's list-shaped ref response."""
  32. if isinstance(ref_data, list):
  33. if not ref_data:
  34. raise ValueError("Empty refs list returned by Gitea API")
  35. return ref_data[0]["object"]["sha"]
  36. return ref_data["object"]["sha"]
  37. @staticmethod
  38. def _commit_tree_sha(commit_data: dict) -> str | None:
  39. """Extract the tree SHA from a commit response.
  40. GitHub's ``GET /git/commits/{sha}`` returns the GitCommit schema with
  41. ``tree`` at the top level. Gitea's same-named endpoint may return the
  42. wrapped Commit schema where ``tree`` lives under ``commit``. Try the
  43. flat shape first (GitHub-compatible deployments and some Gitea/Forgejo
  44. versions) then fall back to the wrapped shape.
  45. """
  46. tree_node = commit_data.get("tree")
  47. if not isinstance(tree_node, dict):
  48. tree_node = (commit_data.get("commit") or {}).get("tree")
  49. if isinstance(tree_node, dict):
  50. return tree_node.get("sha")
  51. return None
  def parse_repo_url(self, url: str) -> tuple[str, str]:
      """Split a repository URL into ``(owner, repo)``.

      Accepted forms are ``http://`` / ``https://`` URLs (optionally with a
      port) and ``git@host:owner/repo`` addresses. A trailing ``.git`` is
      removed from the repository name.

      Raises:
          ValueError: If the URL is empty, longer than 500 characters, or
              matches neither form.
      """
      if not url or len(url) > 500:
          raise ValueError("Invalid Git URL: URL too long or empty")

      # (pattern, owner group, repo group) — the web pattern has an extra
      # capturing group for the port, which shifts its indices by one.
      url_forms = (
          (r"https?://[\w.\-]+(:\d+)?/([\w.\-]{1,100})/([\w.\-]{1,100})(?:\.git)?/?$", 2, 3),
          (r"git@[\w.\-]+:([\w.\-]{1,100})/([\w.\-]{1,100})(?:\.git)?$", 1, 2),
      )
      for pattern, owner_group, repo_group in url_forms:
          found = re.match(pattern, url)
          if found is None:
              continue
          # The repo group is greedy and may have swallowed ".git".
          return found.group(owner_group), found.group(repo_group).removesuffix(".git")

      raise ValueError(f"Cannot parse repository URL: {url}")
  def get_api_base(self, repo_url: str) -> str:
      """Return ``<scheme>://<host>[:port]/api/v1`` for a repository URL.

      Raises:
          ValueError: If ``repo_url`` does not start with an ``http://`` or
              ``https://`` origin followed by ``/``.
      """
      found = re.match(r"(https?://[\w.\-]+(:\d+)?)/", repo_url)
      if found is None:
          raise ValueError(f"Cannot derive API base from URL: {repo_url}")
      return f"{found.group(1)}/api/v1"
  def get_headers(self, token: str) -> dict:
      """Return the parent backend's headers with ``Accept`` set to JSON.

      The dict produced by the parent implementation for ``token`` is
      updated in place and returned.
      """
      request_headers = super().get_headers(token)
      request_headers.update({"Accept": "application/json"})
      return request_headers
  async def push_files(
      self,
      repo_url: str,
      token: str,
      branch: str,
      files: dict,
      client: httpx.AsyncClient,
      _allow_branch_create: bool = True,
  ) -> dict:
      """Commit the changed entries of ``files`` to ``branch`` in one request.

      The branch tip is read through the Git Data API (ref -> commit ->
      recursive tree) only to learn the blob SHA of every existing path.
      Each value in ``files`` is serialised with
      ``json.dumps(indent=2, default=str)``; entries whose blob SHA already
      matches the tree are dropped, and the rest are sent as a single
      ``POST .../contents`` call carrying a ``files`` array of ``create`` /
      ``update`` operations.

      Args:
          repo_url: Repository URL; owner, repo and API base are derived
              from it.
          token: Access token handed to ``get_headers``.
          branch: Target branch name.
          files: Mapping of repository path to the value to serialise.
          client: HTTP client used for every request.
          _allow_branch_create: Internal guard. ``_create_branch_and_push``
              re-enters with ``False`` so a branch that is still missing
              after creation is reported instead of being created again.

      Returns:
          A dict whose ``status`` is ``"success"``, ``"skipped"`` or
          ``"failed"``, with ``message`` and, depending on the outcome,
          ``commit_sha``, ``files_changed`` and ``error``. Exceptions are
          logged and turned into a ``"failed"`` result rather than raised.
      """
      try:
          owner, repo = self.parse_repo_url(repo_url)
          api_base = self.get_api_base(repo_url)
          headers = self.get_headers(token)

          ref_response = await client.get(f"{api_base}/repos/{owner}/{repo}/git/refs/heads/{branch}", headers=headers)
          branch_missing = ref_response.status_code == 404
          ref_data = None
          if not branch_missing:
              if ref_response.status_code != 200:
                  return {
                      "status": "failed",
                      "message": f"Failed to get branch ref: {ref_response.status_code}",
                      "error": self._truncated_response_text(ref_response),
                  }
              # The refs endpoint is a filtered listing, so the payload may
              # also carry other refs that share the requested prefix (for
              # example ``main-old`` next to ``main``). Narrow it to the
              # exact branch before reading a SHA from it.
              ref_data, branch_missing = self._select_branch_ref(ref_response.json(), branch)

          if branch_missing:
              if not _allow_branch_create:
                  return {
                      "status": "failed",
                      "message": (
                          f"Branch '{branch}' not found after creation — possible replication lag. "
                          "The next scheduled backup will retry."
                      ),
                  }
              return await self._create_branch_and_push(
                  client, headers, api_base, owner, repo, branch, files, repo_url, token
              )

          current_commit_sha = self._ref_sha(ref_data)
          commit_response = await client.get(
              f"{api_base}/repos/{owner}/{repo}/git/commits/{current_commit_sha}", headers=headers
          )
          if commit_response.status_code != 200:
              msg = f"Failed to get current commit (HTTP {commit_response.status_code}): {self._truncated_response_text(commit_response)}"
              logger.warning("push_files %s/%s: %s", owner, repo, msg)
              return {"status": "failed", "message": msg}

          current_tree_sha = self._commit_tree_sha(commit_response.json())
          if not current_tree_sha:
              msg = (
                  f"Failed to extract tree SHA from commit response: {self._truncated_response_text(commit_response)}"
              )
              logger.warning("push_files %s/%s: %s", owner, repo, msg)
              return {"status": "failed", "message": msg}

          tree_response = await client.get(
              f"{api_base}/repos/{owner}/{repo}/git/trees/{current_tree_sha}?recursive=1", headers=headers
          )
          if tree_response.status_code != 200:
              msg = f"Failed to list existing tree (HTTP {tree_response.status_code}): {self._truncated_response_text(tree_response)}"
              logger.warning("push_files %s/%s: %s", owner, repo, msg)
              return {"status": "failed", "message": msg, "error": self._truncated_response_text(tree_response)}

          tree_data = tree_response.json()
          # Gitea's tree API can report ``truncated: true`` for large
          # listings; if we honour the partial map, the dedup check misses
          # and every file gets re-uploaded each run.
          if tree_data.get("truncated"):
              msg = (
                  "Repository tree exceeds the Gitea API listing limit (truncated=true). "
                  "Rotate the backup repository to avoid silent file-by-file churn on every backup."
              )
              logger.warning("push_files %s/%s: %s", owner, repo, msg)
              return {"status": "failed", "message": msg}

          # path -> blob SHA for every file currently on the branch.
          existing_files: dict[str, str] = {}
          for item in tree_data.get("tree", []):
              if item.get("type") != "blob":
                  continue
              path, sha = item.get("path"), item.get("sha")
              if not path or not sha:
                  logger.warning("push_files: skipping malformed tree entry: %s", item)
                  continue
              existing_files[path] = sha

          api_files = []
          for path, content in files.items():
              content_str = json.dumps(content, indent=2, default=str)
              content_bytes = content_str.encode("utf-8")
              content_b64 = base64.b64encode(content_bytes).decode()
              existing_sha = existing_files.get(path)
              if existing_sha is None:
                  api_files.append({"operation": "create", "path": path, "content": content_b64})
              elif existing_sha != self._blob_sha(content_bytes):
                  # An update must name the blob it replaces.
                  api_files.append(
                      {"operation": "update", "path": path, "content": content_b64, "sha": existing_sha}
                  )
              # Otherwise the stored blob is already identical: nothing to send.

          if not api_files:
              return {"status": "skipped", "message": "No changes to commit", "commit_sha": None, "files_changed": 0}
          files_changed = len(api_files)

          commit_message = f"Bambuddy backup - {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S UTC')}"
          response = await client.post(
              f"{api_base}/repos/{owner}/{repo}/contents",
              headers=headers,
              json={"branch": branch, "message": commit_message, "files": api_files},
          )
          if response.status_code == 404:
              # NOTE(review): confirm the version quoted below against the
              # Gitea release that introduced the multi-file POST /contents
              # endpoint.
              return {
                  "status": "failed",
                  "message": "Contents API endpoint not found — your Gitea instance may be older than v1.18 or the API may be disabled by an administrator (POST /contents returned 404)",
              }
          if response.status_code == 409:
              return {
                  "status": "failed",
                  "message": (
                      "Conflict committing files — the branch likely advanced concurrently "
                      "(web-UI edit, another backup run, or path-vs-tree collision). "
                      "The next scheduled backup will re-read the current tree and resolve this."
                  ),
              }
          if response.status_code not in (200, 201):
              return {
                  "status": "failed",
                  "message": f"Backup commit failed: {self._truncated_response_text(response)}",
              }

          commit_sha = (response.json().get("commit") or {}).get("sha")
          message = (
              f"Backup successful - {files_changed} files updated"
              if commit_sha
              else f"Backup successful - {files_changed} files updated (commit SHA not reported by server)"
          )
          return {
              "status": "success",
              "message": message,
              "commit_sha": commit_sha,
              "files_changed": files_changed,
          }
      except Exception as e:
          logger.exception("push_files failed for %s branch=%s", repo_url, branch)
          return {"status": "failed", "message": str(e), "error": str(e)}

  @staticmethod
  def _select_branch_ref(ref_data, branch: str) -> tuple[object, bool]:
      """Narrow a refs payload to the entry for exactly ``branch``.

      Returns:
          ``(payload, branch_missing)``. When a list entry's ``ref`` equals
          ``refs/heads/<branch>``, that entry is returned. When every entry
          names some other ref, the original payload is returned with
          ``branch_missing`` set. In all other cases (non-list payload,
          empty list, entries without a ``ref`` name) the payload is
          returned unchanged and ``branch_missing`` is ``False``.
      """
      if not isinstance(ref_data, list) or not ref_data:
          return ref_data, False

      wanted = f"refs/heads/{branch}"
      names = [entry.get("ref") if isinstance(entry, dict) else None for entry in ref_data]
      for entry, name in zip(ref_data, names):
          if name == wanted:
              return entry, False

      # Only conclude "missing" when every entry positively names another ref.
      only_other_refs = all(isinstance(name, str) for name in names)
      return ref_data, only_other_refs
  async def _create_branch_and_push(
      self,
      client: httpx.AsyncClient,
      headers: dict,
      api_base: str,
      owner: str,
      repo: str,
      branch: str,
      files: dict,
      repo_url: str,
      token: str,
  ) -> dict:
      """Create ``branch`` and then push ``files`` to it.

      Steps, in order:

      1. Fetch the repository record to learn its default branch
         (``"main"`` when the key is absent).
      2. Fetch the default branch's ref. Any non-200 answer hands over to
         ``_create_initial_commit``.
      3. ``POST .../branches`` to create ``branch`` from the default branch.
      4. On 201, call ``push_files`` again with branch creation disabled.

      Returns:
          The result dict of whichever step finished the work, or a
          ``"failed"`` dict describing the request that went wrong.
          Exceptions are logged and reported as ``"failed"``.
      """
      try:
          repo_info = await client.get(f"{api_base}/repos/{owner}/{repo}", headers=headers)
          if repo_info.status_code != 200:
              msg = f"Failed to get repo info (HTTP {repo_info.status_code}): {self._truncated_response_text(repo_info)}"
              logger.warning("_create_branch_and_push %s/%s: %s", owner, repo, msg)
              return {"status": "failed", "message": msg}
          default_branch = repo_info.json().get("default_branch", "main")

          # Only the status of this lookup matters; the branch is created
          # by name below, so the ref's SHA is never read.
          default_ref = await client.get(
              f"{api_base}/repos/{owner}/{repo}/git/refs/heads/{default_branch}", headers=headers
          )
          if default_ref.status_code != 200:
              return await self._create_initial_commit(client, headers, api_base, owner, repo, branch, files)

          created = await client.post(
              f"{api_base}/repos/{owner}/{repo}/branches",
              headers=headers,
              json={"new_branch_name": branch, "old_ref_name": default_branch},
          )
          status = created.status_code
          if status == 201:
              logger.info("Re-entering push_files after branch create %s/%s -> %s", owner, repo, branch)
              return await self.push_files(repo_url, token, branch, files, client, _allow_branch_create=False)

          if status == 403:
              msg = f"Permission denied creating branch '{branch}' — token may lack write access to this repository"
              logger.warning("_create_branch_and_push %s/%s: 403 %s", owner, repo, msg)
          elif status == 409:
              msg = f"Branch '{branch}' already exists (possible race condition)"
              logger.warning("_create_branch_and_push %s/%s: 409 %s", owner, repo, msg)
          else:
              msg = f"Failed to create branch '{branch}' (HTTP {created.status_code}): {self._truncated_response_text(created)}"
              logger.warning("_create_branch_and_push %s/%s: %s", owner, repo, msg)
          return {"status": "failed", "message": msg}
      except Exception as e:
          logger.exception("_create_branch_and_push failed for %s/%s branch=%s", owner, repo, branch)
          return {"status": "failed", "message": str(e), "error": str(e)}
  async def _create_initial_commit(
      self,
      client: httpx.AsyncClient,
      headers: dict,
      api_base: str,
      owner: str,
      repo: str,
      branch: str,
      files: dict,
  ) -> dict:
      """Create the first commit of a repository through the Contents API.

      Every entry of ``files`` is serialised with
      ``json.dumps(indent=2, default=str)``, base64-encoded and sent as a
      ``create`` operation in one ``POST /repos/{owner}/{repo}/contents``
      request. ``branch`` is passed as both ``branch`` and ``new_branch``.
      The Contents API is used here because the Git Data API rejects
      blob/tree/commit writes on a repository that has no commits yet.

      NOTE(review): the minimum Gitea version that offers the multi-file
      ``POST /contents`` endpoint should be confirmed against the Gitea
      release notes.

      Returns:
          ``"skipped"`` when ``files`` is empty, ``"success"`` with
          ``commit_sha`` and ``files_changed`` on HTTP 200/201, otherwise
          ``"failed"``. Exceptions are logged and reported as ``"failed"``.
      """
      try:
          if not files:
              return {"status": "skipped", "message": "No files to commit", "commit_sha": None, "files_changed": 0}

          api_files = [
              {
                  "operation": "create",
                  "path": path,
                  "content": base64.b64encode(
                      json.dumps(content, indent=2, default=str).encode("utf-8")
                  ).decode(),
              }
              for path, content in files.items()
          ]
          commit_message = f"Initial Bambuddy backup - {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S UTC')}"

          response = await client.post(
              f"{api_base}/repos/{owner}/{repo}/contents",
              headers=headers,
              json={
                  "branch": branch,
                  "new_branch": branch,
                  "message": commit_message,
                  "files": api_files,
              },
          )
          if response.status_code in (200, 201):
              commit_info = response.json().get("commit") or {}
              commit_sha = commit_info.get("sha")
              if commit_sha:
                  message = f"Initial backup created - {len(files)} files"
              else:
                  message = f"Initial backup created - {len(files)} files (commit SHA not reported by server)"
              return {
                  "status": "success",
                  "message": message,
                  "commit_sha": commit_sha,
                  "files_changed": len(files),
              }

          return {
              "status": "failed",
              "message": f"Failed to create initial commit: {self._truncated_response_text(response)}",
          }
      except Exception as e:
          logger.exception("_create_initial_commit failed for %s/%s branch=%s", owner, repo, branch)
          return {"status": "failed", "message": str(e), "error": str(e)}