ci: add claude code code review
Adds a CI step to the Cirrus CI which will run claude code on the diff of a Pull Request and fail if it finds critical security vulnerabilities or serious code issues. Optinally it can be given a GitHub api key to create a comment in the pull request.
This commit is contained in:
+23
@@ -243,6 +243,29 @@ task:
|
|||||||
main_script:
|
main_script:
|
||||||
- contrib/ban_unicode.py
|
- contrib/ban_unicode.py
|
||||||
|
|
||||||
|
task:
|
||||||
|
name: "security review: Claude Code"
|
||||||
|
# NOTE: claude has access to all API keys available in the Cirrus CI environment.
|
||||||
|
# If we would add some critical api keys in here we should consider this.
|
||||||
|
matrix:
|
||||||
|
- trigger_type: automatic
|
||||||
|
only_if: $CIRRUS_PR != '' && ($CIRRUS_USER_PERMISSION == 'write' || $CIRRUS_USER_PERMISSION == 'admin')
|
||||||
|
- trigger_type: manual
|
||||||
|
only_if: $CIRRUS_PR != ''
|
||||||
|
container:
|
||||||
|
image: node:20
|
||||||
|
cpu: 1
|
||||||
|
memory: 2G
|
||||||
|
# CLAUDE_CODE_OAUTH_TOKEN is set as an encrypted "override" in https://cirrus-ci.com/settings/...
|
||||||
|
# It must be stored encrypted (ENCRYPTED[...]) so Cirrus CI refuses to decrypt it for
|
||||||
|
# fork PRs from users without write permission.
|
||||||
|
# Generate with: claude setup-token
|
||||||
|
# Optional: set GITHUB_TOKEN to enable PR comments on failure
|
||||||
|
install_script:
|
||||||
|
- npm install -g @anthropic-ai/claude-code
|
||||||
|
review_script:
|
||||||
|
- python3 contrib/ci/claude_security_review.py
|
||||||
|
|
||||||
# Cron jobs configured in https://cirrus-ci.com/settings/...
|
# Cron jobs configured in https://cirrus-ci.com/settings/...
|
||||||
# - job "nightly" on branch "master" at "0 30 2 * * ?" (every day at 02:30Z)
|
# - job "nightly" on branch "master" at "0 30 2 * * ?" (every day at 02:30Z)
|
||||||
task:
|
task:
|
||||||
|
|||||||
Executable
+260
@@ -0,0 +1,260 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Cirrus CI task: Claude Code security review for Electrum pull requests.
|
||||||
|
|
||||||
|
Runs Claude Code against the PR diff to detect critical security
|
||||||
|
vulnerabilities. Optionally posts findings as a GitHub PR comment.
|
||||||
|
|
||||||
|
Exit codes:
|
||||||
|
0 -- PASS (no critical/high issues)
|
||||||
|
1 -- FAIL (critical/high issues found)
|
||||||
|
2 -- review could not run (infra error, logged as warning)
|
||||||
|
|
||||||
|
Environment variables:
|
||||||
|
Required:
|
||||||
|
CLAUDE_CODE_OAUTH_TOKEN -- OAuth token from `claude setup-token` (MAX subscription)
|
||||||
|
Optional:
|
||||||
|
GITHUB_TOKEN -- GitHub token for posting PR comments
|
||||||
|
Set by Cirrus CI:
|
||||||
|
CIRRUS_PR -- PR number (empty if not a PR build)
|
||||||
|
CIRRUS_BASE_BRANCH -- target branch of the PR
|
||||||
|
CIRRUS_REPO_FULL_NAME -- e.g. "spesmilo/electrum"
|
||||||
|
CIRRUS_TASK_ID -- current Cirrus task ID
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
import urllib.error
|
||||||
|
import urllib.request
|
||||||
|
|
||||||
|
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
|
||||||
|
PROMPT_FILE = os.path.join(SCRIPT_DIR, "security_review_prompt.md")
|
||||||
|
|
||||||
|
MAX_DIFF_CHARS = 800_000
|
||||||
|
CLAUDE_TIMEOUT_SECONDS = 20 * 60
|
||||||
|
CLAUDE_MODEL = "claude-opus-4-6"
|
||||||
|
CLAUDE_EFFORT = "max"
|
||||||
|
|
||||||
|
VERDICT_PASS = "PASS"
|
||||||
|
VERDICT_FAIL = "FAIL"
|
||||||
|
|
||||||
|
|
||||||
|
def git(*args: str) -> str:
|
||||||
|
result = subprocess.run(
|
||||||
|
["git"] + list(args),
|
||||||
|
capture_output=True, text=True, check=True,
|
||||||
|
)
|
||||||
|
return result.stdout
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_base_branch(base: str) -> None:
|
||||||
|
try:
|
||||||
|
git("fetch", "origin", base, "--depth=1")
|
||||||
|
except subprocess.CalledProcessError:
|
||||||
|
git("fetch", "origin", base)
|
||||||
|
# Shallow CI clones may lack the history needed for three-dot diff
|
||||||
|
# (merge-base computation). Unshallow if the merge-base is unreachable.
|
||||||
|
try:
|
||||||
|
git("merge-base", f"origin/{base}", "HEAD")
|
||||||
|
except subprocess.CalledProcessError:
|
||||||
|
try:
|
||||||
|
git("fetch", "--unshallow")
|
||||||
|
except subprocess.CalledProcessError:
|
||||||
|
pass # already a full clone
|
||||||
|
|
||||||
|
|
||||||
|
def get_pr_diff(base: str) -> str:
|
||||||
|
return git("diff", f"origin/{base}...HEAD")
|
||||||
|
|
||||||
|
|
||||||
|
def changed_files_from_diff(diff: str) -> str:
|
||||||
|
return "\n".join(
|
||||||
|
m.group(1) for m in re.finditer(r"^diff --git a/.+ b/(.+)$", diff, re.MULTILINE)
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def build_prompt(diff: str, changed_files: str) -> str:
|
||||||
|
with open(PROMPT_FILE) as f:
|
||||||
|
instructions = f.read()
|
||||||
|
|
||||||
|
return (
|
||||||
|
f"{instructions}\n\n"
|
||||||
|
f"---\n\n"
|
||||||
|
f"## Changed files\n\n```\n{changed_files}\n```\n\n"
|
||||||
|
f"## Diff\n\n```diff\n{diff}\n```"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def run_claude(prompt: str) -> str | None:
|
||||||
|
"""Invoke Claude Code CLI in print mode. Returns review text or None on failure.
|
||||||
|
|
||||||
|
Passes the prompt via stdin to avoid OS argument length limits (MAX_ARG_STRLEN).
|
||||||
|
"""
|
||||||
|
cmd = [
|
||||||
|
"claude",
|
||||||
|
"-p",
|
||||||
|
"--model", CLAUDE_MODEL,
|
||||||
|
"--effort", CLAUDE_EFFORT,
|
||||||
|
"--output-format", "text",
|
||||||
|
]
|
||||||
|
|
||||||
|
try:
|
||||||
|
result = subprocess.run(
|
||||||
|
cmd,
|
||||||
|
input=prompt,
|
||||||
|
capture_output=True,
|
||||||
|
text=True,
|
||||||
|
timeout=CLAUDE_TIMEOUT_SECONDS,
|
||||||
|
)
|
||||||
|
except FileNotFoundError:
|
||||||
|
print("ERROR: 'claude' CLI not found. Is @anthropic-ai/claude-code installed?")
|
||||||
|
return None
|
||||||
|
except subprocess.TimeoutExpired:
|
||||||
|
print(f"ERROR: Claude Code timed out after {CLAUDE_TIMEOUT_SECONDS}s.")
|
||||||
|
return None
|
||||||
|
|
||||||
|
if result.returncode != 0:
|
||||||
|
print(f"ERROR: Claude Code exited with code {result.returncode}")
|
||||||
|
if result.stderr:
|
||||||
|
print(result.stderr)
|
||||||
|
return None
|
||||||
|
|
||||||
|
return result.stdout
|
||||||
|
|
||||||
|
|
||||||
|
def parse_verdict(review: str) -> str | None:
|
||||||
|
for line in reversed(review.strip().splitlines()):
|
||||||
|
stripped = line.strip()
|
||||||
|
if stripped.startswith("VERDICT:"):
|
||||||
|
verdict = stripped.split(":", 1)[1].strip().upper()
|
||||||
|
if verdict in (VERDICT_PASS, VERDICT_FAIL):
|
||||||
|
return verdict
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def post_github_comment(body: str, *, repo: str, pr: str) -> None:
|
||||||
|
"""Post a comment on the PR. Silently skips if credentials are missing."""
|
||||||
|
token = os.environ.get("GITHUB_TOKEN", "").strip()
|
||||||
|
if not token:
|
||||||
|
print("GITHUB_TOKEN not set -- skipping PR comment.")
|
||||||
|
return
|
||||||
|
|
||||||
|
task_id = os.environ.get("CIRRUS_TASK_ID", "")
|
||||||
|
log_url = f"https://cirrus-ci.com/task/{task_id}" if task_id else ""
|
||||||
|
|
||||||
|
comment = (
|
||||||
|
f"## Security Review -- Issues Found\n\n"
|
||||||
|
f"{body}\n\n"
|
||||||
|
f"---\n"
|
||||||
|
f"*Reviewed by Claude Code ({CLAUDE_MODEL})*"
|
||||||
|
)
|
||||||
|
if log_url:
|
||||||
|
comment += f" | [Full CI log]({log_url})"
|
||||||
|
|
||||||
|
url = f"https://api.github.com/repos/{repo}/issues/{pr}/comments"
|
||||||
|
data = json.dumps({"body": comment}).encode()
|
||||||
|
req = urllib.request.Request(
|
||||||
|
url,
|
||||||
|
data=data,
|
||||||
|
headers={
|
||||||
|
"Authorization": f"token {token}",
|
||||||
|
"Accept": "application/vnd.github.v3+json",
|
||||||
|
"Content-Type": "application/json",
|
||||||
|
},
|
||||||
|
method="POST",
|
||||||
|
)
|
||||||
|
|
||||||
|
try:
|
||||||
|
with urllib.request.urlopen(req) as resp:
|
||||||
|
if resp.status == 201:
|
||||||
|
print(f"Posted review comment on PR #{pr}.")
|
||||||
|
else:
|
||||||
|
print(f"GitHub API responded with status {resp.status}.")
|
||||||
|
except urllib.error.HTTPError as exc:
|
||||||
|
print(f"Failed to post PR comment: HTTP {exc.code} {exc.reason}")
|
||||||
|
except urllib.error.URLError as exc:
|
||||||
|
print(f"Failed to post PR comment: {exc.reason}")
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> int:
|
||||||
|
separator = "=" * 60
|
||||||
|
|
||||||
|
print(separator)
|
||||||
|
print("Claude Code Security Review")
|
||||||
|
print(separator)
|
||||||
|
|
||||||
|
pr = os.environ.get("CIRRUS_PR", "").strip()
|
||||||
|
if not pr:
|
||||||
|
print("Not a PR build (CIRRUS_PR is empty). Skipping.")
|
||||||
|
return 0
|
||||||
|
|
||||||
|
if not os.environ.get("CLAUDE_CODE_OAUTH_TOKEN", "").strip():
|
||||||
|
print("ERROR: CLAUDE_CODE_OAUTH_TOKEN is not set.")
|
||||||
|
return 2
|
||||||
|
|
||||||
|
repo = os.environ.get("CIRRUS_REPO_FULL_NAME", "").strip()
|
||||||
|
base_branch = os.environ.get("CIRRUS_BASE_BRANCH", "master").strip()
|
||||||
|
print(f"PR #{pr} -> base branch: {base_branch}")
|
||||||
|
|
||||||
|
print("\nFetching base branch...")
|
||||||
|
try:
|
||||||
|
fetch_base_branch(base_branch)
|
||||||
|
except subprocess.CalledProcessError as exc:
|
||||||
|
print(f"ERROR: git fetch failed: {exc}")
|
||||||
|
return 2
|
||||||
|
|
||||||
|
print("Computing diff...")
|
||||||
|
try:
|
||||||
|
diff = get_pr_diff(base_branch)
|
||||||
|
except subprocess.CalledProcessError as exc:
|
||||||
|
print(f"ERROR: git diff failed: {exc}")
|
||||||
|
return 2
|
||||||
|
|
||||||
|
if not diff or diff.isspace():
|
||||||
|
print("Empty diff -- nothing to review.")
|
||||||
|
return 0
|
||||||
|
|
||||||
|
changed_files = changed_files_from_diff(diff)
|
||||||
|
file_count = len(changed_files.splitlines())
|
||||||
|
print(f"Reviewing changes across {file_count} file(s)...")
|
||||||
|
|
||||||
|
if len(diff) > MAX_DIFF_CHARS:
|
||||||
|
print(f"ERROR: diff is {len(diff)} chars, exceeds maximum of {MAX_DIFF_CHARS}. Skipping review.")
|
||||||
|
return 2
|
||||||
|
|
||||||
|
prompt = build_prompt(diff, changed_files)
|
||||||
|
|
||||||
|
print(f"\nRunning Claude Code review (model: {CLAUDE_MODEL})...\n")
|
||||||
|
review = run_claude(prompt)
|
||||||
|
|
||||||
|
if review is None:
|
||||||
|
print("Review failed to produce output.")
|
||||||
|
return 2
|
||||||
|
|
||||||
|
print(separator)
|
||||||
|
print("REVIEW OUTPUT")
|
||||||
|
print(separator)
|
||||||
|
print(review)
|
||||||
|
print(separator)
|
||||||
|
|
||||||
|
verdict = parse_verdict(review)
|
||||||
|
|
||||||
|
if verdict == VERDICT_FAIL:
|
||||||
|
print("\nVERDICT: FAIL -- Critical or high severity issues found.")
|
||||||
|
post_github_comment(review, repo=repo, pr=pr)
|
||||||
|
return 1
|
||||||
|
|
||||||
|
if verdict == VERDICT_PASS:
|
||||||
|
print("\nVERDICT: PASS -- No critical or high severity issues.")
|
||||||
|
return 0
|
||||||
|
|
||||||
|
print("\nWARNING: Could not parse verdict from review output.")
|
||||||
|
print("Review logged above for manual inspection.")
|
||||||
|
return 2
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
sys.exit(main())
|
||||||
@@ -0,0 +1,92 @@
|
|||||||
|
# Electrum Security Review
|
||||||
|
|
||||||
|
You are a security auditor reviewing a pull request diff for **Electrum**, a Bitcoin wallet
|
||||||
|
that handles real funds on mainnet and Lightning Network. Your review must be thorough and
|
||||||
|
precise -- but equally, it must not cry wolf. Only flag issues you are confident are real
|
||||||
|
and exploitable in the context shown. A false positive that blocks a legitimate PR wastes
|
||||||
|
developer time and erodes trust in this review.
|
||||||
|
|
||||||
|
## Scope
|
||||||
|
|
||||||
|
Focus your findings on the diff provided below -- only flag issues introduced or worsened by
|
||||||
|
changes in this PR. You have access to the full Electrum codebase; use it freely to read
|
||||||
|
surrounding code, trace call chains, and understand what the diff actually does. But do not
|
||||||
|
audit code outside the diff -- the codebase is context, not the review target.
|
||||||
|
Focus on changes that introduce, worsen, or fail to mitigate security vulnerabilities.
|
||||||
|
Only flag issues introduced or worsened by the diff. Do not flag
|
||||||
|
pre-existing issues visible in context lines unless the change makes them newly exploitable.
|
||||||
|
If the diff is truncated, review only what is provided and note the truncation in your summary.
|
||||||
|
|
||||||
|
For each potential issue, consider whether it is actually exploitable given the context
|
||||||
|
visible in the diff. Do not flag purely theoretical vulnerabilities that require
|
||||||
|
preconditions impossible within Electrum's architecture. However, do account for
|
||||||
|
sophisticated real-world attackers -- Electrum is a high-value target where supply-chain
|
||||||
|
compromise, malicious Electrum servers, and rogue Lightning peers are realistic threat
|
||||||
|
vectors.
|
||||||
|
|
||||||
|
## Severity Definitions
|
||||||
|
|
||||||
|
### CRITICAL
|
||||||
|
Issues that could directly cause loss of funds, exposure of private keys, remote code execution, denial of service, or phishing:
|
||||||
|
- Private key, seed phrase, or wallet password leaked (to logs, error messages, network, disk in cleartext)
|
||||||
|
- Cryptographic flaws: weak/predictable randomness, broken key derivation, nonce reuse, custom crypto primitives
|
||||||
|
- Authentication or authorization bypass in JSON-RPC, wallet password checks, or plugin system
|
||||||
|
- Transaction integrity: amount/fee manipulation, signature bypass, double-spend vectors
|
||||||
|
- Lightning channel state corruption that could cause force-close fund loss
|
||||||
|
- Denial of service: unbounded allocations, algorithmic complexity attacks, resource exhaustion from malicious server responses or peer messages, unbound loops or reads driven by untrusted input
|
||||||
|
- Phishing vectors: untrusted strings from servers/peers displayed to users in error messages, dialogs, transaction descriptions, or notifications without sanitization -- an attacker-controlled server could craft messages that trick users into sending funds, revealing credentials, or taking dangerous actions
|
||||||
|
- Obvious regressions: changes that clearly break existing functionality -- e.g. uncaught exceptions propagating to the user, broken control flow that makes a feature non-functional, or incorrect argument handling that would reliably crash at runtime
|
||||||
|
|
||||||
|
### HIGH
|
||||||
|
Issues that could be exploited with moderate effort or lead to significant damage:
|
||||||
|
- Command injection, path traversal, or injection attacks (SQL, LDAP, XML)
|
||||||
|
- Unsafe deserialization of data from network peers, Electrum servers, or untrusted files
|
||||||
|
- Race conditions in wallet state, Lightning channel state machine, HTLC handling, or concurrent RPC
|
||||||
|
- Integer overflow/underflow in financial calculations (amounts, fees, change outputs)
|
||||||
|
- Insufficient validation of network protocol messages (Electrum protocol, Lightning BOLT messages, Nostr)
|
||||||
|
- Hardcoded secrets, credentials, API keys, or debug backdoors
|
||||||
|
- TOCTOU (time-of-check-time-of-use) vulnerabilities in file or wallet operations
|
||||||
|
- Privacy leaks: unnecessary exposure of addresses, balances, transaction history, or wallet fingerprints to servers, peers, or third parties -- includes address reuse, unneeded network requests that correlate addresses, and identifiable user fingerprints.
|
||||||
|
|
||||||
|
## Output Format
|
||||||
|
|
||||||
|
Structure your review as follows:
|
||||||
|
|
||||||
|
### If findings exist:
|
||||||
|
|
||||||
|
For each finding, use this exact format:
|
||||||
|
|
||||||
|
```
|
||||||
|
### [SEVERITY] Short title
|
||||||
|
- **File:** `filename.py` L123-L145 (or "multiple files" if applicable)
|
||||||
|
- **Issue:** Clear description of the vulnerability
|
||||||
|
- **Impact:** What an attacker could achieve by exploiting this
|
||||||
|
- **Recommendation:** Specific fix suggestion
|
||||||
|
```
|
||||||
|
|
||||||
|
### Summary
|
||||||
|
|
||||||
|
After all findings, provide a one-paragraph summary.
|
||||||
|
|
||||||
|
### Verdict
|
||||||
|
|
||||||
|
End your review with exactly one of these lines (no extra text on the same line):
|
||||||
|
|
||||||
|
```
|
||||||
|
VERDICT: FAIL
|
||||||
|
```
|
||||||
|
or
|
||||||
|
```
|
||||||
|
VERDICT: PASS
|
||||||
|
```
|
||||||
|
|
||||||
|
Rules:
|
||||||
|
- `VERDICT: FAIL` if ANY **Critical** or **High** severity issues were found
|
||||||
|
- `VERDICT: PASS` if no Critical or High severity issues were found
|
||||||
|
- If the diff contains no security-relevant changes (documentation, comments, tests, locale files only), output:
|
||||||
|
|
||||||
|
```
|
||||||
|
No security-relevant changes detected in this diff.
|
||||||
|
|
||||||
|
VERDICT: PASS
|
||||||
|
```
|
||||||
Reference in New Issue
Block a user