ci: add claude code code review

Adds a CI step to the Cirrus CI which will run claude code on the diff
of a Pull Request and fail if it finds critical security vulnerabilities
or serious code issues. Optinally it can be given a GitHub api key to
create a comment in the pull request.
This commit is contained in:
f321x
2026-03-26 19:37:19 +01:00
parent 35b44a1e64
commit 88f9c49a60
3 changed files with 375 additions and 0 deletions
+260
View File
@@ -0,0 +1,260 @@
#!/usr/bin/env python3
"""
Cirrus CI task: Claude Code security review for Electrum pull requests.
Runs Claude Code against the PR diff to detect critical security
vulnerabilities. Optionally posts findings as a GitHub PR comment.
Exit codes:
0 -- PASS (no critical/high issues)
1 -- FAIL (critical/high issues found)
2 -- review could not run (infra error, logged as warning)
Environment variables:
Required:
CLAUDE_CODE_OAUTH_TOKEN -- OAuth token from `claude setup-token` (MAX subscription)
Optional:
GITHUB_TOKEN -- GitHub token for posting PR comments
Set by Cirrus CI:
CIRRUS_PR -- PR number (empty if not a PR build)
CIRRUS_BASE_BRANCH -- target branch of the PR
CIRRUS_REPO_FULL_NAME -- e.g. "spesmilo/electrum"
CIRRUS_TASK_ID -- current Cirrus task ID
"""
import json
import os
import re
import subprocess
import sys
import urllib.error
import urllib.request
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
PROMPT_FILE = os.path.join(SCRIPT_DIR, "security_review_prompt.md")
MAX_DIFF_CHARS = 800_000
CLAUDE_TIMEOUT_SECONDS = 20 * 60
CLAUDE_MODEL = "claude-opus-4-6"
CLAUDE_EFFORT = "max"
VERDICT_PASS = "PASS"
VERDICT_FAIL = "FAIL"
def git(*args: str) -> str:
result = subprocess.run(
["git"] + list(args),
capture_output=True, text=True, check=True,
)
return result.stdout
def fetch_base_branch(base: str) -> None:
try:
git("fetch", "origin", base, "--depth=1")
except subprocess.CalledProcessError:
git("fetch", "origin", base)
# Shallow CI clones may lack the history needed for three-dot diff
# (merge-base computation). Unshallow if the merge-base is unreachable.
try:
git("merge-base", f"origin/{base}", "HEAD")
except subprocess.CalledProcessError:
try:
git("fetch", "--unshallow")
except subprocess.CalledProcessError:
pass # already a full clone
def get_pr_diff(base: str) -> str:
return git("diff", f"origin/{base}...HEAD")
def changed_files_from_diff(diff: str) -> str:
return "\n".join(
m.group(1) for m in re.finditer(r"^diff --git a/.+ b/(.+)$", diff, re.MULTILINE)
)
def build_prompt(diff: str, changed_files: str) -> str:
with open(PROMPT_FILE) as f:
instructions = f.read()
return (
f"{instructions}\n\n"
f"---\n\n"
f"## Changed files\n\n```\n{changed_files}\n```\n\n"
f"## Diff\n\n```diff\n{diff}\n```"
)
def run_claude(prompt: str) -> str | None:
"""Invoke Claude Code CLI in print mode. Returns review text or None on failure.
Passes the prompt via stdin to avoid OS argument length limits (MAX_ARG_STRLEN).
"""
cmd = [
"claude",
"-p",
"--model", CLAUDE_MODEL,
"--effort", CLAUDE_EFFORT,
"--output-format", "text",
]
try:
result = subprocess.run(
cmd,
input=prompt,
capture_output=True,
text=True,
timeout=CLAUDE_TIMEOUT_SECONDS,
)
except FileNotFoundError:
print("ERROR: 'claude' CLI not found. Is @anthropic-ai/claude-code installed?")
return None
except subprocess.TimeoutExpired:
print(f"ERROR: Claude Code timed out after {CLAUDE_TIMEOUT_SECONDS}s.")
return None
if result.returncode != 0:
print(f"ERROR: Claude Code exited with code {result.returncode}")
if result.stderr:
print(result.stderr)
return None
return result.stdout
def parse_verdict(review: str) -> str | None:
for line in reversed(review.strip().splitlines()):
stripped = line.strip()
if stripped.startswith("VERDICT:"):
verdict = stripped.split(":", 1)[1].strip().upper()
if verdict in (VERDICT_PASS, VERDICT_FAIL):
return verdict
return None
def post_github_comment(body: str, *, repo: str, pr: str) -> None:
"""Post a comment on the PR. Silently skips if credentials are missing."""
token = os.environ.get("GITHUB_TOKEN", "").strip()
if not token:
print("GITHUB_TOKEN not set -- skipping PR comment.")
return
task_id = os.environ.get("CIRRUS_TASK_ID", "")
log_url = f"https://cirrus-ci.com/task/{task_id}" if task_id else ""
comment = (
f"## Security Review -- Issues Found\n\n"
f"{body}\n\n"
f"---\n"
f"*Reviewed by Claude Code ({CLAUDE_MODEL})*"
)
if log_url:
comment += f" | [Full CI log]({log_url})"
url = f"https://api.github.com/repos/{repo}/issues/{pr}/comments"
data = json.dumps({"body": comment}).encode()
req = urllib.request.Request(
url,
data=data,
headers={
"Authorization": f"token {token}",
"Accept": "application/vnd.github.v3+json",
"Content-Type": "application/json",
},
method="POST",
)
try:
with urllib.request.urlopen(req) as resp:
if resp.status == 201:
print(f"Posted review comment on PR #{pr}.")
else:
print(f"GitHub API responded with status {resp.status}.")
except urllib.error.HTTPError as exc:
print(f"Failed to post PR comment: HTTP {exc.code} {exc.reason}")
except urllib.error.URLError as exc:
print(f"Failed to post PR comment: {exc.reason}")
def main() -> int:
separator = "=" * 60
print(separator)
print("Claude Code Security Review")
print(separator)
pr = os.environ.get("CIRRUS_PR", "").strip()
if not pr:
print("Not a PR build (CIRRUS_PR is empty). Skipping.")
return 0
if not os.environ.get("CLAUDE_CODE_OAUTH_TOKEN", "").strip():
print("ERROR: CLAUDE_CODE_OAUTH_TOKEN is not set.")
return 2
repo = os.environ.get("CIRRUS_REPO_FULL_NAME", "").strip()
base_branch = os.environ.get("CIRRUS_BASE_BRANCH", "master").strip()
print(f"PR #{pr} -> base branch: {base_branch}")
print("\nFetching base branch...")
try:
fetch_base_branch(base_branch)
except subprocess.CalledProcessError as exc:
print(f"ERROR: git fetch failed: {exc}")
return 2
print("Computing diff...")
try:
diff = get_pr_diff(base_branch)
except subprocess.CalledProcessError as exc:
print(f"ERROR: git diff failed: {exc}")
return 2
if not diff or diff.isspace():
print("Empty diff -- nothing to review.")
return 0
changed_files = changed_files_from_diff(diff)
file_count = len(changed_files.splitlines())
print(f"Reviewing changes across {file_count} file(s)...")
if len(diff) > MAX_DIFF_CHARS:
print(f"ERROR: diff is {len(diff)} chars, exceeds maximum of {MAX_DIFF_CHARS}. Skipping review.")
return 2
prompt = build_prompt(diff, changed_files)
print(f"\nRunning Claude Code review (model: {CLAUDE_MODEL})...\n")
review = run_claude(prompt)
if review is None:
print("Review failed to produce output.")
return 2
print(separator)
print("REVIEW OUTPUT")
print(separator)
print(review)
print(separator)
verdict = parse_verdict(review)
if verdict == VERDICT_FAIL:
print("\nVERDICT: FAIL -- Critical or high severity issues found.")
post_github_comment(review, repo=repo, pr=pr)
return 1
if verdict == VERDICT_PASS:
print("\nVERDICT: PASS -- No critical or high severity issues.")
return 0
print("\nWARNING: Could not parse verdict from review output.")
print("Review logged above for manual inspection.")
return 2
if __name__ == "__main__":
sys.exit(main())
+92
View File
@@ -0,0 +1,92 @@
# Electrum Security Review
You are a security auditor reviewing a pull request diff for **Electrum**, a Bitcoin wallet
that handles real funds on mainnet and Lightning Network. Your review must be thorough and
precise -- but equally, it must not cry wolf. Only flag issues you are confident are real
and exploitable in the context shown. A false positive that blocks a legitimate PR wastes
developer time and erodes trust in this review.
## Scope
Focus your findings on the diff provided below -- only flag issues introduced or worsened by
changes in this PR. You have access to the full Electrum codebase; use it freely to read
surrounding code, trace call chains, and understand what the diff actually does. But do not
audit code outside the diff -- the codebase is context, not the review target.
Focus on changes that introduce, worsen, or fail to mitigate security vulnerabilities.
Only flag issues introduced or worsened by the diff. Do not flag
pre-existing issues visible in context lines unless the change makes them newly exploitable.
If the diff is truncated, review only what is provided and note the truncation in your summary.
For each potential issue, consider whether it is actually exploitable given the context
visible in the diff. Do not flag purely theoretical vulnerabilities that require
preconditions impossible within Electrum's architecture. However, do account for
sophisticated real-world attackers -- Electrum is a high-value target where supply-chain
compromise, malicious Electrum servers, and rogue Lightning peers are realistic threat
vectors.
## Severity Definitions
### CRITICAL
Issues that could directly cause loss of funds, exposure of private keys, remote code execution, denial of service, or phishing:
- Private key, seed phrase, or wallet password leaked (to logs, error messages, network, disk in cleartext)
- Cryptographic flaws: weak/predictable randomness, broken key derivation, nonce reuse, custom crypto primitives
- Authentication or authorization bypass in JSON-RPC, wallet password checks, or plugin system
- Transaction integrity: amount/fee manipulation, signature bypass, double-spend vectors
- Lightning channel state corruption that could cause force-close fund loss
- Denial of service: unbounded allocations, algorithmic complexity attacks, resource exhaustion from malicious server responses or peer messages, unbound loops or reads driven by untrusted input
- Phishing vectors: untrusted strings from servers/peers displayed to users in error messages, dialogs, transaction descriptions, or notifications without sanitization -- an attacker-controlled server could craft messages that trick users into sending funds, revealing credentials, or taking dangerous actions
- Obvious regressions: changes that clearly break existing functionality -- e.g. uncaught exceptions propagating to the user, broken control flow that makes a feature non-functional, or incorrect argument handling that would reliably crash at runtime
### HIGH
Issues that could be exploited with moderate effort or lead to significant damage:
- Command injection, path traversal, or injection attacks (SQL, LDAP, XML)
- Unsafe deserialization of data from network peers, Electrum servers, or untrusted files
- Race conditions in wallet state, Lightning channel state machine, HTLC handling, or concurrent RPC
- Integer overflow/underflow in financial calculations (amounts, fees, change outputs)
- Insufficient validation of network protocol messages (Electrum protocol, Lightning BOLT messages, Nostr)
- Hardcoded secrets, credentials, API keys, or debug backdoors
- TOCTOU (time-of-check-time-of-use) vulnerabilities in file or wallet operations
- Privacy leaks: unnecessary exposure of addresses, balances, transaction history, or wallet fingerprints to servers, peers, or third parties -- includes address reuse, unneeded network requests that correlate addresses, and identifiable user fingerprints.
## Output Format
Structure your review as follows:
### If findings exist:
For each finding, use this exact format:
```
### [SEVERITY] Short title
- **File:** `filename.py` L123-L145 (or "multiple files" if applicable)
- **Issue:** Clear description of the vulnerability
- **Impact:** What an attacker could achieve by exploiting this
- **Recommendation:** Specific fix suggestion
```
### Summary
After all findings, provide a one-paragraph summary.
### Verdict
End your review with exactly one of these lines (no extra text on the same line):
```
VERDICT: FAIL
```
or
```
VERDICT: PASS
```
Rules:
- `VERDICT: FAIL` if ANY **Critical** or **High** severity issues were found
- `VERDICT: PASS` if no Critical or High severity issues were found
- If the diff contains no security-relevant changes (documentation, comments, tests, locale files only), output:
```
No security-relevant changes detected in this diff.
VERDICT: PASS
```