ci: add claude code code review

Adds a Cirrus CI step that runs Claude Code on the diff of a pull request and fails if it finds critical security vulnerabilities or serious code issues. Optionally, it can be given a GitHub API token to post a comment on the pull request.
2026-03-26 19:37:19 +01:00
parent 35b44a1e64
commit 88f9c49a60
3 changed files with 375 additions and 0 deletions
@@ -0,0 +1,260 @@
+#!/usr/bin/env python3
+"""
+Cirrus CI task: Claude Code security review for Electrum pull requests.
+
+Runs Claude Code against the PR diff to detect critical and high severity
+security issues. When the verdict is FAIL and GITHUB_TOKEN is set, posts the
+findings as a GitHub PR comment.
+
+Exit codes:
+    0 -- PASS (no critical/high issues)
+    1 -- FAIL (critical/high issues found)
+    2 -- review could not run (infra error, logged as warning)
+
+Environment variables:
+    Required:
+        CLAUDE_CODE_OAUTH_TOKEN  -- OAuth token from `claude setup-token` (MAX subscription)
+    Optional:
+        GITHUB_TOKEN             -- GitHub token for posting PR comments
+    Set by Cirrus CI:
+        CIRRUS_PR                -- PR number (empty if not a PR build)
+        CIRRUS_BASE_BRANCH       -- target branch of the PR
+        CIRRUS_REPO_FULL_NAME    -- e.g. "spesmilo/electrum"
+        CIRRUS_TASK_ID           -- current Cirrus task ID
+"""
+
+import json
+import os
+import re
+import subprocess
+import sys
+import urllib.error
+import urllib.request
+
+# Directory containing this script; the prompt file is looked up next to it.
+SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+# Review instructions that build_prompt() places ahead of the diff.
+PROMPT_FILE = os.path.join(SCRIPT_DIR, "security_review_prompt.md")
+
+# Diffs longer than this many characters are not reviewed; main() returns 2.
+MAX_DIFF_CHARS = 800_000
+# Limit, in seconds, passed as the subprocess timeout for the Claude Code run.
+CLAUDE_TIMEOUT_SECONDS = 20 * 60
+# Values passed to the CLI's --model and --effort options.
+CLAUDE_MODEL = "claude-opus-4-6"
+CLAUDE_EFFORT = "max"
+
+# Verdict tokens accepted after "VERDICT:" in the review output.
+VERDICT_PASS = "PASS"
+VERDICT_FAIL = "FAIL"
+
+
+def git(*args: str) -> str:
+    """Run ``git`` with the given arguments and return its standard output.
+
+    Output is decoded as UTF-8 with undecodable bytes replaced, so a diff
+    that touches non-UTF-8 files cannot abort the run with an uncaught
+    UnicodeDecodeError (an uncaught exception exits with status 1, which
+    this script reserves for a FAIL verdict).
+
+    Raises:
+        subprocess.CalledProcessError: if git exits with a non-zero status.
+    """
+    result = subprocess.run(
+        ["git", *args],
+        capture_output=True,
+        encoding="utf-8",
+        errors="replace",
+        check=True,
+    )
+    return result.stdout
+
+
+def fetch_base_branch(base: str) -> None:
+    """Fetch the PR's target branch so ``origin/<base>`` can be diffed against.
+
+    Args:
+        base: name of the target branch on the ``origin`` remote.
+
+    Raises:
+        subprocess.CalledProcessError: if the base branch cannot be fetched.
+    """
+    # Use an explicit refspec: a bare `git fetch origin <base>` only updates
+    # refs/remotes/origin/<base> when the remote's configured fetch refspec
+    # maps that branch, which single-branch CI clones may not do.
+    refspec = f"+refs/heads/{base}:refs/remotes/origin/{base}"
+    try:
+        git("fetch", "origin", refspec, "--depth=1")
+    except subprocess.CalledProcessError:
+        git("fetch", "origin", refspec)
+    # Shallow CI clones may lack the history needed for three-dot diff
+    # (merge-base computation). Unshallow if the merge-base is unreachable.
+    try:
+        git("merge-base", f"origin/{base}", "HEAD")
+    except subprocess.CalledProcessError:
+        try:
+            git("fetch", "--unshallow")
+        except subprocess.CalledProcessError:
+            # Deliberate best effort: git refuses --unshallow on a clone that
+            # is already complete. A real problem surfaces when the diff runs.
+            pass
+
+
+def get_pr_diff(base: str) -> str:
+    """Return the three-dot diff between ``origin/<base>`` and ``HEAD``."""
+    revision_range = f"origin/{base}...HEAD"
+    return git("diff", revision_range)
+
+
+def changed_files_from_diff(diff: str) -> str:
+    """Return the paths named in the diff's ``diff --git`` headers.
+
+    Args:
+        diff: unified diff text as produced by ``git diff``.
+
+    Returns:
+        The new-side path of every header, one per line, in diff order
+        (an empty string when the diff contains no headers).
+    """
+    paths = []
+    for header in re.finditer(r"^diff --git a/.+ b/(.+)$", diff, re.MULTILINE):
+        # For anything but a rename both sides carry the same path, so a
+        # backreference pins the split point even when the path itself
+        # contains " b/" (the greedy pattern alone would cut it short).
+        same_path = re.fullmatch(r"diff --git a/(.+) b/\1", header.group(0))
+        paths.append(same_path.group(1) if same_path else header.group(1))
+    return "\n".join(paths)
+
+
+def _fenced(text: str, info: str = "") -> str:
+    """Wrap *text* in a Markdown code fence that *text* cannot close early."""
+    longest_run = max((len(run) for run in re.findall(r"`+", text)), default=0)
+    # A closing fence must be at least as long as the opening one, so a fence
+    # longer than any backtick run inside the text cannot be terminated by it.
+    fence = "`" * max(3, longest_run + 1)
+    return f"{fence}{info}\n{text}\n{fence}"
+
+
+def build_prompt(diff: str, changed_files: str) -> str:
+    """Assemble the full prompt: instructions, changed-file list, then diff.
+
+    Args:
+        diff: the PR diff text.
+        changed_files: newline-separated list of changed paths.
+
+    Returns:
+        The contents of PROMPT_FILE followed by the fenced file list and diff.
+
+    Raises:
+        OSError: if PROMPT_FILE cannot be read.
+    """
+    # Explicit encoding: the default depends on the CI machine's locale.
+    with open(PROMPT_FILE, encoding="utf-8") as f:
+        instructions = f.read()
+
+    return (
+        f"{instructions}\n\n"
+        f"---\n\n"
+        f"## Changed files\n\n{_fenced(changed_files)}\n\n"
+        f"## Diff\n\n{_fenced(diff, 'diff')}"
+    )
+
+
+def run_claude(prompt: str) -> str | None:
+    """Invoke Claude Code CLI in print mode. Returns review text or None on failure.
+
+    Passes the prompt via stdin to avoid OS argument length limits (MAX_ARG_STRLEN).
+
+    Input and output are handled as UTF-8 with undecodable bytes replaced,
+    so an unexpected byte cannot raise UnicodeDecodeError here.
+
+    Args:
+        prompt: full prompt text, written to the CLI's standard input.
+
+    Returns:
+        The CLI's standard output, or None if the CLI could not be started,
+        timed out, or exited with a non-zero status.
+    """
+    cmd = [
+        "claude",
+        "-p",
+        "--model", CLAUDE_MODEL,
+        "--effort", CLAUDE_EFFORT,
+        "--output-format", "text",
+    ]
+
+    try:
+        result = subprocess.run(
+            cmd,
+            input=prompt,
+            capture_output=True,
+            encoding="utf-8",
+            errors="replace",
+            timeout=CLAUDE_TIMEOUT_SECONDS,
+        )
+    except FileNotFoundError:
+        print("ERROR: 'claude' CLI not found. Is @anthropic-ai/claude-code installed?")
+        return None
+    except subprocess.TimeoutExpired:
+        print(f"ERROR: Claude Code timed out after {CLAUDE_TIMEOUT_SECONDS}s.")
+        return None
+    except OSError as exc:
+        # e.g. PermissionError when the binary exists but is not executable.
+        print(f"ERROR: could not start Claude Code: {exc}")
+        return None
+
+    if result.returncode != 0:
+        print(f"ERROR: Claude Code exited with code {result.returncode}")
+        if result.stderr:
+            print(result.stderr)
+        return None
+
+    return result.stdout
+
+
+def parse_verdict(review: str) -> str | None:
+    """Extract the final verdict from the review text.
+
+    The last line that carries a ``VERDICT:`` marker decides the result.
+    Markdown decoration around the marker or the value (bold, backticks,
+    block-quote or heading prefixes) is tolerated, so ``**VERDICT: FAIL**``
+    is still recognised. If that last verdict line is malformed the result
+    is None; earlier lines are not consulted, because they may merely quote
+    verdict text and must not override the real one.
+
+    Args:
+        review: full review output.
+
+    Returns:
+        VERDICT_PASS, VERDICT_FAIL, or None when no usable verdict is found.
+    """
+    for line in reversed(review.strip().splitlines()):
+        if not re.match(r"[\s>*_`#]*VERDICT:", line):
+            continue
+        # Value must be a single word followed only by decoration/punctuation.
+        match = re.match(r"[\s>*_`#]*VERDICT:[\s*_`]*([A-Za-z]+)[\s*_`.]*$", line)
+        verdict = match.group(1).upper() if match else ""
+        return verdict if verdict in (VERDICT_PASS, VERDICT_FAIL) else None
+    return None
+
+
+def post_github_comment(body: str, *, repo: str, pr: str) -> None:
+    """Post the review as a comment on the PR (best effort, never raises on
+    network or API errors).
+
+    Skips with a log line when GITHUB_TOKEN is missing or the repository or
+    PR number is unknown. Failures to post are printed, not raised, so they
+    cannot change the script's exit status.
+
+    Args:
+        body: review text to include in the comment.
+        repo: repository in "owner/name" form.
+        pr: pull request number.
+    """
+    token = os.environ.get("GITHUB_TOKEN", "").strip()
+    if not token:
+        print("GITHUB_TOKEN not set -- skipping PR comment.")
+        return
+    if not repo or not pr:
+        # Without both values the API URL below would be malformed.
+        print("Repository or PR number unknown -- skipping PR comment.")
+        return
+
+    task_id = os.environ.get("CIRRUS_TASK_ID", "")
+    log_url = f"https://cirrus-ci.com/task/{task_id}" if task_id else ""
+
+    header = "## Security Review -- Issues Found\n\n"
+    footer = f"\n\n---\n*Reviewed by Claude Code ({CLAUDE_MODEL})*"
+    if log_url:
+        footer += f" | [Full CI log]({log_url})"
+
+    # GitHub rejects over-long comment bodies (the documented limit is 65536
+    # characters); stay well below it so a long review is still posted.
+    max_comment_chars = 60_000
+    room = max_comment_chars - len(header) - len(footer)
+    if len(body) > room:
+        notice = "\n\n*[Review truncated -- see the CI log for the full text.]*"
+        body = body[: room - len(notice)] + notice
+
+    comment = header + body + footer
+
+    url = f"https://api.github.com/repos/{repo}/issues/{pr}/comments"
+    data = json.dumps({"body": comment}).encode()
+    req = urllib.request.Request(
+        url,
+        data=data,
+        headers={
+            "Authorization": f"token {token}",
+            "Accept": "application/vnd.github.v3+json",
+            "Content-Type": "application/json",
+        },
+        method="POST",
+    )
+
+    try:
+        # Bounded wait: a stalled connection must not hang the CI task.
+        with urllib.request.urlopen(req, timeout=30) as resp:
+            if resp.status == 201:
+                print(f"Posted review comment on PR #{pr}.")
+            else:
+                print(f"GitHub API responded with status {resp.status}.")
+    except urllib.error.HTTPError as exc:
+        print(f"Failed to post PR comment: HTTP {exc.code} {exc.reason}")
+    except urllib.error.URLError as exc:
+        print(f"Failed to post PR comment: {exc.reason}")
+    except OSError as exc:
+        # Read timeouts and connection resets can surface outside URLError.
+        print(f"Failed to post PR comment: {exc}")
+
+
+def main() -> int:
+    """Run the review for the current Cirrus CI build and return the exit code.
+
+    Returns:
+        0 when the build is not a PR, the diff is empty, or the verdict is PASS;
+        1 when the verdict is FAIL;
+        2 when the review could not be run or its verdict could not be parsed.
+    """
+    separator = "=" * 60
+
+    print(separator)
+    print("Claude Code Security Review")
+    print(separator)
+
+    pr = os.environ.get("CIRRUS_PR", "").strip()
+    if not pr:
+        print("Not a PR build (CIRRUS_PR is empty). Skipping.")
+        return 0
+
+    if not os.environ.get("CLAUDE_CODE_OAUTH_TOKEN", "").strip():
+        print("ERROR: CLAUDE_CODE_OAUTH_TOKEN is not set.")
+        return 2
+
+    repo = os.environ.get("CIRRUS_REPO_FULL_NAME", "").strip()
+    # Fall back to "master" when the variable is set but empty as well as
+    # when it is unset; an empty name would produce the ref "origin/".
+    base_branch = os.environ.get("CIRRUS_BASE_BRANCH", "").strip() or "master"
+    print(f"PR #{pr} -> base branch: {base_branch}")
+
+    # OSError is caught alongside CalledProcessError below so that a missing
+    # git binary is reported as exit code 2 rather than an uncaught traceback
+    # (which would exit 1, the status reserved for a FAIL verdict).
+    print("\nFetching base branch...")
+    try:
+        fetch_base_branch(base_branch)
+    except (subprocess.CalledProcessError, OSError) as exc:
+        print(f"ERROR: git fetch failed: {exc}")
+        git_stderr = getattr(exc, "stderr", None)
+        if git_stderr:
+            print(git_stderr.strip())
+        return 2
+
+    print("Computing diff...")
+    try:
+        diff = get_pr_diff(base_branch)
+    except (subprocess.CalledProcessError, OSError) as exc:
+        print(f"ERROR: git diff failed: {exc}")
+        git_stderr = getattr(exc, "stderr", None)
+        if git_stderr:
+            print(git_stderr.strip())
+        return 2
+
+    if not diff or diff.isspace():
+        print("Empty diff -- nothing to review.")
+        return 0
+
+    changed_files = changed_files_from_diff(diff)
+    file_count = len(changed_files.splitlines())
+    print(f"Reviewing changes across {file_count} file(s)...")
+
+    if len(diff) > MAX_DIFF_CHARS:
+        print(f"ERROR: diff is {len(diff)} chars, exceeds maximum of {MAX_DIFF_CHARS}. Skipping review.")
+        return 2
+
+    try:
+        prompt = build_prompt(diff, changed_files)
+    except OSError as exc:
+        # An unreadable prompt file is an infrastructure problem, not a verdict.
+        print(f"ERROR: could not read prompt file {PROMPT_FILE}: {exc}")
+        return 2
+
+    print(f"\nRunning Claude Code review (model: {CLAUDE_MODEL})...\n")
+    review = run_claude(prompt)
+
+    if review is None:
+        print("Review failed to produce output.")
+        return 2
+
+    print(separator)
+    print("REVIEW OUTPUT")
+    print(separator)
+    print(review)
+    print(separator)
+
+    verdict = parse_verdict(review)
+
+    if verdict == VERDICT_FAIL:
+        print("\nVERDICT: FAIL -- Critical or high severity issues found.")
+        post_github_comment(review, repo=repo, pr=pr)
+        return 1
+
+    if verdict == VERDICT_PASS:
+        print("\nVERDICT: PASS -- No critical or high severity issues.")
+        return 0
+
+    print("\nWARNING: Could not parse verdict from review output.")
+    print("Review logged above for manual inspection.")
+    return 2
+
+
+if __name__ == "__main__":
+    sys.exit(main())
@@ -0,0 +1,92 @@
+# Electrum Security Review
+
+You are a security auditor reviewing a pull request diff for **Electrum**, a Bitcoin wallet
+that handles real funds on mainnet and Lightning Network. Your review must be thorough and
+precise -- but equally, it must not cry wolf. Only flag issues you are confident are real
+and exploitable in the context shown. A false positive that blocks a legitimate PR wastes
+developer time and erodes trust in this review.
+
+## Scope
+
+Focus your findings on the diff provided below: flag only issues that the changes in this
+PR introduce, worsen, or fail to mitigate. You have access to the full Electrum codebase;
+use it freely to read surrounding code, trace call chains, and understand what the diff
+actually does. But do not audit code outside the diff -- the codebase is context, not the
+review target. Do not flag pre-existing issues visible in context lines unless the change
+makes them newly exploitable.
+If the diff is truncated, review only what is provided and note the truncation in your summary.
+
+For each potential issue, consider whether it is actually exploitable given the context
+visible in the diff. Do not flag purely theoretical vulnerabilities that require
+preconditions impossible within Electrum's architecture. However, do account for
+sophisticated real-world attackers -- Electrum is a high-value target where supply-chain
+compromise, malicious Electrum servers, and rogue Lightning peers are realistic threat
+vectors.
+
+## Severity Definitions
+
+### CRITICAL
+Issues that could directly cause loss of funds, exposure of private keys, remote code execution, denial of service, or phishing, as well as obvious functional regressions:
+- Private key, seed phrase, or wallet password leaked (to logs, error messages, network, disk in cleartext)
+- Cryptographic flaws: weak/predictable randomness, broken key derivation, nonce reuse, custom crypto primitives
+- Authentication or authorization bypass in JSON-RPC, wallet password checks, or plugin system
+- Transaction integrity: amount/fee manipulation, signature bypass, double-spend vectors
+- Lightning channel state corruption that could cause force-close fund loss
+- Denial of service: unbounded allocations, algorithmic complexity attacks, resource exhaustion from malicious server responses or peer messages, unbounded loops or reads driven by untrusted input
+- Phishing vectors: untrusted strings from servers/peers displayed to users in error messages, dialogs, transaction descriptions, or notifications without sanitization -- an attacker-controlled server could craft messages that trick users into sending funds, revealing credentials, or taking dangerous actions
+- Obvious regressions: changes that clearly break existing functionality -- e.g. uncaught exceptions propagating to the user, broken control flow that makes a feature non-functional, or incorrect argument handling that would reliably crash at runtime
+
+### HIGH
+Issues that could be exploited with moderate effort or lead to significant damage:
+- Command injection, path traversal, or injection attacks (SQL, LDAP, XML)
+- Unsafe deserialization of data from network peers, Electrum servers, or untrusted files
+- Race conditions in wallet state, Lightning channel state machine, HTLC handling, or concurrent RPC
+- Integer overflow/underflow in financial calculations (amounts, fees, change outputs)
+- Insufficient validation of network protocol messages (Electrum protocol, Lightning BOLT messages, Nostr)
+- Hardcoded secrets, credentials, API keys, or debug backdoors
+- TOCTOU (time-of-check-time-of-use) vulnerabilities in file or wallet operations
+- Privacy leaks: unnecessary exposure of addresses, balances, transaction history, or wallet fingerprints to servers, peers, or third parties -- includes address reuse, unneeded network requests that correlate addresses, and identifiable user fingerprints.
+
+## Output Format
+
+Structure your review as follows:
+
+### If findings exist:
+
+For each finding, use this exact format:
+
+```
+### [SEVERITY] Short title
+- **File:** `filename.py` L123-L145 (or "multiple files" if applicable)
+- **Issue:** Clear description of the vulnerability
+- **Impact:** What an attacker could achieve by exploiting this
+- **Recommendation:** Specific fix suggestion
+```
+
+### Summary
+
+After all findings, provide a one-paragraph summary.
+
+### Verdict
+
+End your review with exactly one of these lines, written as plain text on its own line (no extra text on the same line, and no bold, backticks, or code fence around it):
+
+```
+VERDICT: FAIL
+```
+or
+```
+VERDICT: PASS
+```
+
+Rules:
+- `VERDICT: FAIL` if ANY **Critical** or **High** severity issues were found
+- `VERDICT: PASS` if no Critical or High severity issues were found
+- If the diff contains no security-relevant changes (documentation, comments, tests, locale files only), output:
+
+```
+No security-relevant changes detected in this diff.
+
+VERDICT: PASS
+```