From 88f9c49a603d9a167ad9341264a2ecfdaf91db2c Mon Sep 17 00:00:00 2001 From: f321x Date: Thu, 26 Mar 2026 19:37:19 +0100 Subject: [PATCH] ci: add claude code code review Adds a CI step to the Cirrus CI which will run claude code on the diff of a Pull Request and fail if it finds critical security vulnerabilities or serious code issues. Optinally it can be given a GitHub api key to create a comment in the pull request. --- .cirrus.yml | 23 +++ contrib/ci/claude_security_review.py | 260 +++++++++++++++++++++++++++ contrib/ci/security_review_prompt.md | 92 ++++++++++ 3 files changed, 375 insertions(+) create mode 100755 contrib/ci/claude_security_review.py create mode 100644 contrib/ci/security_review_prompt.md diff --git a/.cirrus.yml b/.cirrus.yml index af541292b..9b7ddcc64 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -243,6 +243,29 @@ task: main_script: - contrib/ban_unicode.py +task: + name: "security review: Claude Code" + # NOTE: claude has access to all API keys available in the Cirrus CI environment. + # If we would add some critical api keys in here we should consider this. + matrix: + - trigger_type: automatic + only_if: $CIRRUS_PR != '' && ($CIRRUS_USER_PERMISSION == 'write' || $CIRRUS_USER_PERMISSION == 'admin') + - trigger_type: manual + only_if: $CIRRUS_PR != '' + container: + image: node:20 + cpu: 1 + memory: 2G + # CLAUDE_CODE_OAUTH_TOKEN is set as an encrypted "override" in https://cirrus-ci.com/settings/... + # It must be stored encrypted (ENCRYPTED[...]) so Cirrus CI refuses to decrypt it for + # fork PRs from users without write permission. + # Generate with: claude setup-token + # Optional: set GITHUB_TOKEN to enable PR comments on failure + install_script: + - npm install -g @anthropic-ai/claude-code + review_script: + - python3 contrib/ci/claude_security_review.py + # Cron jobs configured in https://cirrus-ci.com/settings/... # - job "nightly" on branch "master" at "0 30 2 * * ?" (every day at 02:30Z) task: diff --git a/contrib/ci/claude_security_review.py b/contrib/ci/claude_security_review.py new file mode 100755 index 000000000..0db8877ca --- /dev/null +++ b/contrib/ci/claude_security_review.py @@ -0,0 +1,260 @@ +#!/usr/bin/env python3 +""" +Cirrus CI task: Claude Code security review for Electrum pull requests. + +Runs Claude Code against the PR diff to detect critical security +vulnerabilities. Optionally posts findings as a GitHub PR comment. + +Exit codes: + 0 -- PASS (no critical/high issues) + 1 -- FAIL (critical/high issues found) + 2 -- review could not run (infra error, logged as warning) + +Environment variables: + Required: + CLAUDE_CODE_OAUTH_TOKEN -- OAuth token from `claude setup-token` (MAX subscription) + Optional: + GITHUB_TOKEN -- GitHub token for posting PR comments + Set by Cirrus CI: + CIRRUS_PR -- PR number (empty if not a PR build) + CIRRUS_BASE_BRANCH -- target branch of the PR + CIRRUS_REPO_FULL_NAME -- e.g. "spesmilo/electrum" + CIRRUS_TASK_ID -- current Cirrus task ID +""" + +import json +import os +import re +import subprocess +import sys +import urllib.error +import urllib.request + +SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +PROMPT_FILE = os.path.join(SCRIPT_DIR, "security_review_prompt.md") + +MAX_DIFF_CHARS = 800_000 +CLAUDE_TIMEOUT_SECONDS = 20 * 60 +CLAUDE_MODEL = "claude-opus-4-6" +CLAUDE_EFFORT = "max" + +VERDICT_PASS = "PASS" +VERDICT_FAIL = "FAIL" + + +def git(*args: str) -> str: + result = subprocess.run( + ["git"] + list(args), + capture_output=True, text=True, check=True, + ) + return result.stdout + + +def fetch_base_branch(base: str) -> None: + try: + git("fetch", "origin", base, "--depth=1") + except subprocess.CalledProcessError: + git("fetch", "origin", base) + # Shallow CI clones may lack the history needed for three-dot diff + # (merge-base computation). Unshallow if the merge-base is unreachable. + try: + git("merge-base", f"origin/{base}", "HEAD") + except subprocess.CalledProcessError: + try: + git("fetch", "--unshallow") + except subprocess.CalledProcessError: + pass # already a full clone + + +def get_pr_diff(base: str) -> str: + return git("diff", f"origin/{base}...HEAD") + + +def changed_files_from_diff(diff: str) -> str: + return "\n".join( + m.group(1) for m in re.finditer(r"^diff --git a/.+ b/(.+)$", diff, re.MULTILINE) + ) + + +def build_prompt(diff: str, changed_files: str) -> str: + with open(PROMPT_FILE) as f: + instructions = f.read() + + return ( + f"{instructions}\n\n" + f"---\n\n" + f"## Changed files\n\n```\n{changed_files}\n```\n\n" + f"## Diff\n\n```diff\n{diff}\n```" + ) + + +def run_claude(prompt: str) -> str | None: + """Invoke Claude Code CLI in print mode. Returns review text or None on failure. + + Passes the prompt via stdin to avoid OS argument length limits (MAX_ARG_STRLEN). + """ + cmd = [ + "claude", + "-p", + "--model", CLAUDE_MODEL, + "--effort", CLAUDE_EFFORT, + "--output-format", "text", + ] + + try: + result = subprocess.run( + cmd, + input=prompt, + capture_output=True, + text=True, + timeout=CLAUDE_TIMEOUT_SECONDS, + ) + except FileNotFoundError: + print("ERROR: 'claude' CLI not found. Is @anthropic-ai/claude-code installed?") + return None + except subprocess.TimeoutExpired: + print(f"ERROR: Claude Code timed out after {CLAUDE_TIMEOUT_SECONDS}s.") + return None + + if result.returncode != 0: + print(f"ERROR: Claude Code exited with code {result.returncode}") + if result.stderr: + print(result.stderr) + return None + + return result.stdout + + +def parse_verdict(review: str) -> str | None: + for line in reversed(review.strip().splitlines()): + stripped = line.strip() + if stripped.startswith("VERDICT:"): + verdict = stripped.split(":", 1)[1].strip().upper() + if verdict in (VERDICT_PASS, VERDICT_FAIL): + return verdict + return None + + +def post_github_comment(body: str, *, repo: str, pr: str) -> None: + """Post a comment on the PR. Silently skips if credentials are missing.""" + token = os.environ.get("GITHUB_TOKEN", "").strip() + if not token: + print("GITHUB_TOKEN not set -- skipping PR comment.") + return + + task_id = os.environ.get("CIRRUS_TASK_ID", "") + log_url = f"https://cirrus-ci.com/task/{task_id}" if task_id else "" + + comment = ( + f"## Security Review -- Issues Found\n\n" + f"{body}\n\n" + f"---\n" + f"*Reviewed by Claude Code ({CLAUDE_MODEL})*" + ) + if log_url: + comment += f" | [Full CI log]({log_url})" + + url = f"https://api.github.com/repos/{repo}/issues/{pr}/comments" + data = json.dumps({"body": comment}).encode() + req = urllib.request.Request( + url, + data=data, + headers={ + "Authorization": f"token {token}", + "Accept": "application/vnd.github.v3+json", + "Content-Type": "application/json", + }, + method="POST", + ) + + try: + with urllib.request.urlopen(req) as resp: + if resp.status == 201: + print(f"Posted review comment on PR #{pr}.") + else: + print(f"GitHub API responded with status {resp.status}.") + except urllib.error.HTTPError as exc: + print(f"Failed to post PR comment: HTTP {exc.code} {exc.reason}") + except urllib.error.URLError as exc: + print(f"Failed to post PR comment: {exc.reason}") + + +def main() -> int: + separator = "=" * 60 + + print(separator) + print("Claude Code Security Review") + print(separator) + + pr = os.environ.get("CIRRUS_PR", "").strip() + if not pr: + print("Not a PR build (CIRRUS_PR is empty). Skipping.") + return 0 + + if not os.environ.get("CLAUDE_CODE_OAUTH_TOKEN", "").strip(): + print("ERROR: CLAUDE_CODE_OAUTH_TOKEN is not set.") + return 2 + + repo = os.environ.get("CIRRUS_REPO_FULL_NAME", "").strip() + base_branch = os.environ.get("CIRRUS_BASE_BRANCH", "master").strip() + print(f"PR #{pr} -> base branch: {base_branch}") + + print("\nFetching base branch...") + try: + fetch_base_branch(base_branch) + except subprocess.CalledProcessError as exc: + print(f"ERROR: git fetch failed: {exc}") + return 2 + + print("Computing diff...") + try: + diff = get_pr_diff(base_branch) + except subprocess.CalledProcessError as exc: + print(f"ERROR: git diff failed: {exc}") + return 2 + + if not diff or diff.isspace(): + print("Empty diff -- nothing to review.") + return 0 + + changed_files = changed_files_from_diff(diff) + file_count = len(changed_files.splitlines()) + print(f"Reviewing changes across {file_count} file(s)...") + + if len(diff) > MAX_DIFF_CHARS: + print(f"ERROR: diff is {len(diff)} chars, exceeds maximum of {MAX_DIFF_CHARS}. Skipping review.") + return 2 + + prompt = build_prompt(diff, changed_files) + + print(f"\nRunning Claude Code review (model: {CLAUDE_MODEL})...\n") + review = run_claude(prompt) + + if review is None: + print("Review failed to produce output.") + return 2 + + print(separator) + print("REVIEW OUTPUT") + print(separator) + print(review) + print(separator) + + verdict = parse_verdict(review) + + if verdict == VERDICT_FAIL: + print("\nVERDICT: FAIL -- Critical or high severity issues found.") + post_github_comment(review, repo=repo, pr=pr) + return 1 + + if verdict == VERDICT_PASS: + print("\nVERDICT: PASS -- No critical or high severity issues.") + return 0 + + print("\nWARNING: Could not parse verdict from review output.") + print("Review logged above for manual inspection.") + return 2 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/contrib/ci/security_review_prompt.md b/contrib/ci/security_review_prompt.md new file mode 100644 index 000000000..4741ecf1c --- /dev/null +++ b/contrib/ci/security_review_prompt.md @@ -0,0 +1,92 @@ +# Electrum Security Review + +You are a security auditor reviewing a pull request diff for **Electrum**, a Bitcoin wallet +that handles real funds on mainnet and Lightning Network. Your review must be thorough and +precise -- but equally, it must not cry wolf. Only flag issues you are confident are real +and exploitable in the context shown. A false positive that blocks a legitimate PR wastes +developer time and erodes trust in this review. + +## Scope + +Focus your findings on the diff provided below -- only flag issues introduced or worsened by +changes in this PR. You have access to the full Electrum codebase; use it freely to read +surrounding code, trace call chains, and understand what the diff actually does. But do not +audit code outside the diff -- the codebase is context, not the review target. +Focus on changes that introduce, worsen, or fail to mitigate security vulnerabilities. +Only flag issues introduced or worsened by the diff. Do not flag +pre-existing issues visible in context lines unless the change makes them newly exploitable. +If the diff is truncated, review only what is provided and note the truncation in your summary. + +For each potential issue, consider whether it is actually exploitable given the context +visible in the diff. Do not flag purely theoretical vulnerabilities that require +preconditions impossible within Electrum's architecture. However, do account for +sophisticated real-world attackers -- Electrum is a high-value target where supply-chain +compromise, malicious Electrum servers, and rogue Lightning peers are realistic threat +vectors. + +## Severity Definitions + +### CRITICAL +Issues that could directly cause loss of funds, exposure of private keys, remote code execution, denial of service, or phishing: +- Private key, seed phrase, or wallet password leaked (to logs, error messages, network, disk in cleartext) +- Cryptographic flaws: weak/predictable randomness, broken key derivation, nonce reuse, custom crypto primitives +- Authentication or authorization bypass in JSON-RPC, wallet password checks, or plugin system +- Transaction integrity: amount/fee manipulation, signature bypass, double-spend vectors +- Lightning channel state corruption that could cause force-close fund loss +- Denial of service: unbounded allocations, algorithmic complexity attacks, resource exhaustion from malicious server responses or peer messages, unbound loops or reads driven by untrusted input +- Phishing vectors: untrusted strings from servers/peers displayed to users in error messages, dialogs, transaction descriptions, or notifications without sanitization -- an attacker-controlled server could craft messages that trick users into sending funds, revealing credentials, or taking dangerous actions +- Obvious regressions: changes that clearly break existing functionality -- e.g. uncaught exceptions propagating to the user, broken control flow that makes a feature non-functional, or incorrect argument handling that would reliably crash at runtime + +### HIGH +Issues that could be exploited with moderate effort or lead to significant damage: +- Command injection, path traversal, or injection attacks (SQL, LDAP, XML) +- Unsafe deserialization of data from network peers, Electrum servers, or untrusted files +- Race conditions in wallet state, Lightning channel state machine, HTLC handling, or concurrent RPC +- Integer overflow/underflow in financial calculations (amounts, fees, change outputs) +- Insufficient validation of network protocol messages (Electrum protocol, Lightning BOLT messages, Nostr) +- Hardcoded secrets, credentials, API keys, or debug backdoors +- TOCTOU (time-of-check-time-of-use) vulnerabilities in file or wallet operations +- Privacy leaks: unnecessary exposure of addresses, balances, transaction history, or wallet fingerprints to servers, peers, or third parties -- includes address reuse, unneeded network requests that correlate addresses, and identifiable user fingerprints. + +## Output Format + +Structure your review as follows: + +### If findings exist: + +For each finding, use this exact format: + +``` +### [SEVERITY] Short title +- **File:** `filename.py` L123-L145 (or "multiple files" if applicable) +- **Issue:** Clear description of the vulnerability +- **Impact:** What an attacker could achieve by exploiting this +- **Recommendation:** Specific fix suggestion +``` + +### Summary + +After all findings, provide a one-paragraph summary. + +### Verdict + +End your review with exactly one of these lines (no extra text on the same line): + +``` +VERDICT: FAIL +``` +or +``` +VERDICT: PASS +``` + +Rules: +- `VERDICT: FAIL` if ANY **Critical** or **High** severity issues were found +- `VERDICT: PASS` if no Critical or High severity issues were found +- If the diff contains no security-relevant changes (documentation, comments, tests, locale files only), output: + +``` +No security-relevant changes detected in this diff. + +VERDICT: PASS +```