github.py 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435
  1. """GitHub backend — implements GitProviderBackend using the GitHub Git Data API."""
  2. import base64
  3. import json
  4. import logging
  5. import re
  6. from datetime import datetime, timezone
  7. import httpx
  8. from backend.app.services.git_providers.base import GitProviderBackend
  9. logger = logging.getLogger(__name__)
  10. class GitHubBackend(GitProviderBackend):
  11. """Backend for github.com using the GitHub Git Data API."""
  12. def get_api_base(self, repo_url: str) -> str:
  13. m = re.match(r"https?://([\w.\-]+(:\d+)?)/", repo_url)
  14. if m:
  15. host = m.group(1)
  16. return "https://api.github.com" if host == "github.com" else f"https://{host}/api/v3"
  17. m = re.match(r"git@([\w.\-]+):", repo_url)
  18. if m:
  19. host = m.group(1)
  20. return "https://api.github.com" if host == "github.com" else f"https://{host}/api/v3"
  21. return "https://api.github.com"
  22. def parse_repo_url(self, url: str) -> tuple[str, str]:
  23. """Return (owner, repo) from a Git HTTPS or SSH URL."""
  24. if not url or len(url) > 500:
  25. raise ValueError("Invalid Git URL: URL too long or empty")
  26. # HTTPS: https://<host>[:<port>]/<owner>/<repo>[.git][/]
  27. match = re.match(
  28. r"https://[\w.\-]+(:\d+)?/([\w.\-]{1,100})/([\w.\-]{1,100})(?:\.git)?/?$",
  29. url,
  30. )
  31. if match:
  32. return match.group(2), match.group(3).removesuffix(".git")
  33. # SSH: git@<host>:<owner>/<repo>[.git]
  34. match = re.match(
  35. r"git@[\w.\-]+:([\w.\-]{1,100})/([\w.\-]{1,100})(?:\.git)?$",
  36. url,
  37. )
  38. if match:
  39. return match.group(1), match.group(2).removesuffix(".git")
  40. raise ValueError(f"Cannot parse repository URL: {url}")
  41. async def test_connection(self, repo_url: str, token: str, client: httpx.AsyncClient) -> dict:
  42. """Test API access and push permission for the repository."""
  43. try:
  44. owner, repo = self.parse_repo_url(repo_url)
  45. api_base = self.get_api_base(repo_url)
  46. headers = self.get_headers(token)
  47. response = await client.get(f"{api_base}/repos/{owner}/{repo}", headers=headers)
  48. if response.status_code == 401:
  49. return {"success": False, "message": "Invalid access token", "repo_name": None, "permissions": None}
  50. if response.status_code == 404:
  51. return {
  52. "success": False,
  53. "message": "Repository not found. Check URL and token permissions.",
  54. "repo_name": None,
  55. "permissions": None,
  56. }
  57. if response.status_code != 200:
  58. return {
  59. "success": False,
  60. "message": f"API error: {response.status_code}",
  61. "repo_name": None,
  62. "permissions": None,
  63. }
  64. data = response.json()
  65. permissions = data.get("permissions", {})
  66. is_private = bool(data.get("private", False))
  67. if not permissions.get("push", False):
  68. return {
  69. "success": False,
  70. "message": "Token does not have push permission to this repository",
  71. "repo_name": data.get("full_name"),
  72. "permissions": permissions,
  73. "is_private": is_private,
  74. }
  75. return {
  76. "success": True,
  77. "message": "Connection successful",
  78. "repo_name": data.get("full_name"),
  79. "permissions": permissions,
  80. "is_private": is_private,
  81. }
  82. except Exception as e:
  83. logger.exception("Git connection test failed")
  84. detail = str(e)[:200]
  85. message = (
  86. f"Connection failed: {type(e).__name__}: {detail}"
  87. if detail
  88. else f"Connection failed: {type(e).__name__}"
  89. )
  90. return {
  91. "success": False,
  92. "message": message,
  93. "repo_name": None,
  94. "permissions": None,
  95. "is_private": None,
  96. }
  97. async def push_files(
  98. self,
  99. repo_url: str,
  100. token: str,
  101. branch: str,
  102. files: dict,
  103. client: httpx.AsyncClient,
  104. _allow_branch_create: bool = True,
  105. ) -> dict:
  106. """Push files to the repository using the Git Data API."""
  107. try:
  108. owner, repo = self.parse_repo_url(repo_url)
  109. api_base = self.get_api_base(repo_url)
  110. headers = self.get_headers(token)
  111. ref_response = await client.get(f"{api_base}/repos/{owner}/{repo}/git/refs/heads/{branch}", headers=headers)
  112. if ref_response.status_code == 404:
  113. if not _allow_branch_create:
  114. return {
  115. "status": "failed",
  116. "message": (
  117. f"Branch '{branch}' not found after creation — possible replication lag. "
  118. "The next scheduled backup will retry."
  119. ),
  120. }
  121. return await self._create_branch_and_push(
  122. client, headers, api_base, owner, repo, branch, files, repo_url, token
  123. )
  124. if ref_response.status_code != 200:
  125. msg = f"Failed to get branch ref (HTTP {ref_response.status_code}): {self._truncated_response_text(ref_response)}"
  126. logger.warning("push_files %s/%s: %s", owner, repo, msg)
  127. return {"status": "failed", "message": msg, "error": self._truncated_response_text(ref_response)}
  128. current_commit_sha, err = self._read_sha(ref_response, "object", "sha")
  129. if err:
  130. msg = f"Malformed ref response ({err}): {self._truncated_response_text(ref_response)}"
  131. logger.warning("push_files %s/%s: %s", owner, repo, msg)
  132. return {"status": "failed", "message": msg}
  133. commit_response = await client.get(
  134. f"{api_base}/repos/{owner}/{repo}/git/commits/{current_commit_sha}", headers=headers
  135. )
  136. if commit_response.status_code != 200:
  137. msg = f"Failed to get current commit (HTTP {commit_response.status_code}): {self._truncated_response_text(commit_response)}"
  138. logger.warning("push_files %s/%s: %s", owner, repo, msg)
  139. return {"status": "failed", "message": msg}
  140. current_tree_sha, err = self._read_sha(commit_response, "tree", "sha")
  141. if err:
  142. msg = f"Malformed commit response ({err}): {self._truncated_response_text(commit_response)}"
  143. logger.warning("push_files %s/%s: %s", owner, repo, msg)
  144. return {"status": "failed", "message": msg}
  145. tree_response = await client.get(
  146. f"{api_base}/repos/{owner}/{repo}/git/trees/{current_tree_sha}?recursive=1", headers=headers
  147. )
  148. if tree_response.status_code != 200:
  149. msg = f"Failed to list existing tree (HTTP {tree_response.status_code}): {self._truncated_response_text(tree_response)}"
  150. logger.warning("push_files %s/%s: %s", owner, repo, msg)
  151. return {"status": "failed", "message": msg, "error": self._truncated_response_text(tree_response)}
  152. tree_data = tree_response.json()
  153. # GitHub's tree API truncates >7MB / >100k entries. A truncated tree
  154. # listing makes the SHA-equality dedup miss and every file gets
  155. # re-uploaded as a new blob each run — silent churn until someone
  156. # notices the bloated history. Fail loudly so the user rotates the
  157. # backup repo.
  158. if tree_data.get("truncated"):
  159. msg = (
  160. "Repository tree exceeds the GitHub API listing limit (truncated=true). "
  161. "Rotate the backup repository to avoid silent file-by-file churn on every backup."
  162. )
  163. logger.warning("push_files %s/%s: %s", owner, repo, msg)
  164. return {"status": "failed", "message": msg}
  165. existing_files: dict[str, str] = {}
  166. for item in tree_data.get("tree", []):
  167. if item.get("type") != "blob":
  168. continue
  169. path, sha = item.get("path"), item.get("sha")
  170. if not path or not sha:
  171. logger.warning("push_files: skipping malformed tree entry: %s", item)
  172. continue
  173. existing_files[path] = sha
  174. tree_items = []
  175. files_changed = 0
  176. for path, content in files.items():
  177. content_str = json.dumps(content, indent=2, default=str)
  178. content_bytes = content_str.encode("utf-8")
  179. content_sha = self._blob_sha(content_bytes)
  180. if path in existing_files and existing_files[path] == content_sha:
  181. continue
  182. blob_response = await client.post(
  183. f"{api_base}/repos/{owner}/{repo}/git/blobs",
  184. headers=headers,
  185. json={"content": base64.b64encode(content_bytes).decode(), "encoding": "base64"},
  186. )
  187. if blob_response.status_code == 404:
  188. msg = "GitHub API returned 404 for POST /git/blobs — check repository visibility and token scope"
  189. logger.warning("push_files %s/%s: %s", owner, repo, msg)
  190. return {"status": "failed", "message": msg}
  191. if blob_response.status_code != 201:
  192. msg = f"Failed to create blob for {path} (HTTP {blob_response.status_code}): {self._truncated_response_text(blob_response)}"
  193. logger.warning("push_files %s/%s: %s", owner, repo, msg)
  194. return {"status": "failed", "message": msg}
  195. blob_sha, err = self._read_sha(blob_response, "sha")
  196. if err:
  197. msg = f"Malformed blob response for {path} ({err}): {self._truncated_response_text(blob_response)}"
  198. logger.warning("push_files %s/%s: %s", owner, repo, msg)
  199. return {"status": "failed", "message": msg}
  200. tree_items.append({"path": path, "mode": "100644", "type": "blob", "sha": blob_sha})
  201. files_changed += 1
  202. if not tree_items:
  203. return {"status": "skipped", "message": "No changes to commit", "commit_sha": None, "files_changed": 0}
  204. tree_response = await client.post(
  205. f"{api_base}/repos/{owner}/{repo}/git/trees",
  206. headers=headers,
  207. json={"base_tree": current_tree_sha, "tree": tree_items},
  208. )
  209. if tree_response.status_code != 201:
  210. msg = f"Failed to create tree (HTTP {tree_response.status_code}): {self._truncated_response_text(tree_response)}"
  211. logger.warning("push_files %s/%s: %s", owner, repo, msg)
  212. return {"status": "failed", "message": msg}
  213. new_tree_sha, err = self._read_sha(tree_response, "sha")
  214. if err:
  215. msg = f"Malformed tree-create response ({err}): {self._truncated_response_text(tree_response)}"
  216. logger.warning("push_files %s/%s: %s", owner, repo, msg)
  217. return {"status": "failed", "message": msg}
  218. commit_message = f"Bambuddy backup - {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S UTC')}"
  219. commit_response = await client.post(
  220. f"{api_base}/repos/{owner}/{repo}/git/commits",
  221. headers=headers,
  222. json={"message": commit_message, "tree": new_tree_sha, "parents": [current_commit_sha]},
  223. )
  224. if commit_response.status_code != 201:
  225. msg = f"Failed to create commit (HTTP {commit_response.status_code}): {self._truncated_response_text(commit_response)}"
  226. logger.warning("push_files %s/%s: %s", owner, repo, msg)
  227. return {"status": "failed", "message": msg}
  228. new_commit_sha, err = self._read_sha(commit_response, "sha")
  229. if err:
  230. msg = f"Malformed commit-create response ({err}): {self._truncated_response_text(commit_response)}"
  231. logger.warning("push_files %s/%s: %s", owner, repo, msg)
  232. return {"status": "failed", "message": msg}
  233. ref_update = await client.patch(
  234. f"{api_base}/repos/{owner}/{repo}/git/refs/heads/{branch}",
  235. headers=headers,
  236. json={"sha": new_commit_sha},
  237. )
  238. if ref_update.status_code != 200:
  239. msg = f"Failed to update branch (HTTP {ref_update.status_code}): {self._truncated_response_text(ref_update)}"
  240. logger.warning("push_files %s/%s: %s", owner, repo, msg)
  241. return {"status": "failed", "message": msg}
  242. return {
  243. "status": "success",
  244. "message": f"Backup successful - {files_changed} files updated",
  245. "commit_sha": new_commit_sha,
  246. "files_changed": files_changed,
  247. }
  248. except Exception as e:
  249. logger.exception("push_files failed for %s branch=%s", repo_url, branch)
  250. return {"status": "failed", "message": str(e), "error": str(e)}
  251. async def _create_branch_and_push(
  252. self,
  253. client: httpx.AsyncClient,
  254. headers: dict,
  255. api_base: str,
  256. owner: str,
  257. repo: str,
  258. branch: str,
  259. files: dict,
  260. repo_url: str,
  261. token: str,
  262. ) -> dict:
  263. """Create branch (from default branch or as initial commit) then push."""
  264. try:
  265. repo_response = await client.get(f"{api_base}/repos/{owner}/{repo}", headers=headers)
  266. if repo_response.status_code != 200:
  267. msg = f"Failed to get repo info (HTTP {repo_response.status_code}): {self._truncated_response_text(repo_response)}"
  268. logger.warning("_create_branch_and_push %s/%s: %s", owner, repo, msg)
  269. return {"status": "failed", "message": msg}
  270. try:
  271. default_branch = repo_response.json().get("default_branch", "main")
  272. except ValueError:
  273. msg = f"Malformed repo-info response (non-JSON body): {self._truncated_response_text(repo_response)}"
  274. logger.warning("_create_branch_and_push %s/%s: %s", owner, repo, msg)
  275. return {"status": "failed", "message": msg}
  276. ref_response = await client.get(
  277. f"{api_base}/repos/{owner}/{repo}/git/refs/heads/{default_branch}", headers=headers
  278. )
  279. if ref_response.status_code != 200:
  280. return await self._create_initial_commit(client, headers, api_base, owner, repo, branch, files)
  281. base_sha, err = self._read_sha(ref_response, "object", "sha")
  282. if err:
  283. msg = f"Malformed default-branch ref response ({err}): {self._truncated_response_text(ref_response)}"
  284. logger.warning("_create_branch_and_push %s/%s: %s", owner, repo, msg)
  285. return {"status": "failed", "message": msg}
  286. create_ref = await client.post(
  287. f"{api_base}/repos/{owner}/{repo}/git/refs",
  288. headers=headers,
  289. json={"ref": f"refs/heads/{branch}", "sha": base_sha},
  290. )
  291. if create_ref.status_code != 201:
  292. msg = f"Failed to create branch '{branch}' (HTTP {create_ref.status_code}): {self._truncated_response_text(create_ref)}"
  293. logger.warning("_create_branch_and_push %s/%s: %s", owner, repo, msg)
  294. return {"status": "failed", "message": msg}
  295. logger.info("Re-entering push_files after branch create %s/%s -> %s", owner, repo, branch)
  296. return await self.push_files(repo_url, token, branch, files, client, _allow_branch_create=False)
  297. except Exception as e:
  298. logger.exception("_create_branch_and_push failed for %s/%s branch=%s", owner, repo, branch)
  299. return {"status": "failed", "message": str(e), "error": str(e)}
  300. async def _create_initial_commit(
  301. self,
  302. client: httpx.AsyncClient,
  303. headers: dict,
  304. api_base: str,
  305. owner: str,
  306. repo: str,
  307. branch: str,
  308. files: dict,
  309. ) -> dict:
  310. """Create the first commit in an empty repository."""
  311. try:
  312. tree_items = []
  313. for path, content in files.items():
  314. content_str = json.dumps(content, indent=2, default=str)
  315. blob_response = await client.post(
  316. f"{api_base}/repos/{owner}/{repo}/git/blobs",
  317. headers=headers,
  318. json={"content": base64.b64encode(content_str.encode()).decode(), "encoding": "base64"},
  319. )
  320. if blob_response.status_code == 404:
  321. msg = "GitHub API returned 404 for POST /git/blobs — check repository visibility and token scope"
  322. logger.warning("_create_initial_commit %s/%s: %s", owner, repo, msg)
  323. return {"status": "failed", "message": msg}
  324. if blob_response.status_code != 201:
  325. msg = f"Failed to create blob for {path} (HTTP {blob_response.status_code}): {self._truncated_response_text(blob_response)}"
  326. logger.warning("_create_initial_commit %s/%s: %s", owner, repo, msg)
  327. return {"status": "failed", "message": msg}
  328. blob_sha, err = self._read_sha(blob_response, "sha")
  329. if err:
  330. msg = f"Malformed blob response for {path} ({err}): {self._truncated_response_text(blob_response)}"
  331. logger.warning("_create_initial_commit %s/%s: %s", owner, repo, msg)
  332. return {"status": "failed", "message": msg}
  333. tree_items.append({"path": path, "mode": "100644", "type": "blob", "sha": blob_sha})
  334. tree_response = await client.post(
  335. f"{api_base}/repos/{owner}/{repo}/git/trees",
  336. headers=headers,
  337. json={"tree": tree_items},
  338. )
  339. if tree_response.status_code != 201:
  340. msg = f"Failed to create tree (HTTP {tree_response.status_code}): {self._truncated_response_text(tree_response)}"
  341. logger.warning("_create_initial_commit %s/%s: %s", owner, repo, msg)
  342. return {"status": "failed", "message": msg}
  343. tree_sha, err = self._read_sha(tree_response, "sha")
  344. if err:
  345. msg = f"Malformed tree-create response ({err}): {self._truncated_response_text(tree_response)}"
  346. logger.warning("_create_initial_commit %s/%s: %s", owner, repo, msg)
  347. return {"status": "failed", "message": msg}
  348. commit_response = await client.post(
  349. f"{api_base}/repos/{owner}/{repo}/git/commits",
  350. headers=headers,
  351. json={
  352. "message": f"Initial Bambuddy backup - {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S UTC')}",
  353. "tree": tree_sha,
  354. },
  355. )
  356. if commit_response.status_code != 201:
  357. msg = f"Failed to create commit (HTTP {commit_response.status_code}): {self._truncated_response_text(commit_response)}"
  358. logger.warning("_create_initial_commit %s/%s: %s", owner, repo, msg)
  359. return {"status": "failed", "message": msg}
  360. commit_sha, err = self._read_sha(commit_response, "sha")
  361. if err:
  362. msg = f"Malformed commit-create response ({err}): {self._truncated_response_text(commit_response)}"
  363. logger.warning("_create_initial_commit %s/%s: %s", owner, repo, msg)
  364. return {"status": "failed", "message": msg}
  365. ref_response = await client.post(
  366. f"{api_base}/repos/{owner}/{repo}/git/refs",
  367. headers=headers,
  368. json={"ref": f"refs/heads/{branch}", "sha": commit_sha},
  369. )
  370. if ref_response.status_code != 201:
  371. msg = f"Failed to create branch ref (HTTP {ref_response.status_code}): {self._truncated_response_text(ref_response)}"
  372. logger.warning("_create_initial_commit %s/%s: %s", owner, repo, msg)
  373. return {"status": "failed", "message": msg}
  374. return {
  375. "status": "success",
  376. "message": f"Initial backup created - {len(files)} files",
  377. "commit_sha": commit_sha,
  378. "files_changed": len(files),
  379. }
  380. except Exception as e:
  381. logger.exception("_create_initial_commit failed for %s/%s branch=%s", owner, repo, branch)
  382. return {"status": "failed", "message": str(e), "error": str(e)}