/* cpu-miner/sha256/sha256_backend.h */

#ifndef SHA256_BACKEND_H
#define SHA256_BACKEND_H

#include <stdint.h>

/*
 * Compact SHA256 state: only the 8 chaining words (8 x 32 bits = 32 bytes).
 *
 * Used in place of OpenSSL's SHA256_CTX (~112 bytes) so that copying a state
 * in the hot loop moves as little data as possible.
 */
typedef struct {
    uint32_t h[8];  /* the eight 32-bit chaining words */
} sha256_state_t;

/*
 * Precomputed data for SHA256d over an 80-byte header.
 *
 * Filled in by sha256d80_midstate_init() from header bytes 0..75 and passed
 * read-only to the sha256d80_*_4way() functions.
 *
 * NOTE(review): the field meanings below are inferred from the field names
 * only; confirm against the implementation.
 */
typedef struct {
    sha256_state_t init_state;         /* presumably the SHA256 IV */
    sha256_state_t first_chunk_state;  /* presumably the state after the first 64 header bytes */
    uint8_t block1_template[64];       /* presumably the pre-built block for the header tail (first hash) */
    uint8_t block2_template[64];       /* presumably the pre-built block for the second hash */
} sha256d80_midstate_t;

/*
 * Set state to the SHA256 initial values (IV).
 *
 * state: state to initialize.
 */
void sha256_state_init(sha256_state_t *state);

/*
 * Serialize the 8 state words into a 32-byte big-endian digest.
 *
 * state: state to serialize; taken as const.
 * out:   destination buffer for the 32 digest bytes.
 */
void sha256_state_to_digest(const sha256_state_t *state, uint8_t out[32]);

/*
 * Single SHA256 block compression (one 64-byte block).
 *
 * state: chaining state; passed non-const, so presumably updated in place
 *        with the result (confirm in the implementation).
 * block: the 64-byte input block.
 */
void sha256_transform_fast(sha256_state_t *state, const uint8_t block[64]);

/*
 * 2-way interleaved SHA256 block compression.
 *
 * Processes two independent (state, block) pairs so the CPU can overlap both
 * instruction chains. On non-ARM builds it falls back to two sequential calls.
 *
 * stA, blkA: state and 64-byte block of the first pair.
 * stB, blkB: state and 64-byte block of the second pair.
 *
 * NOTE(review): the states are passed non-const, so each is presumably
 * updated in place as with sha256_transform_fast() -- confirm.
 */
void sha256_transform_fast_2way(
    sha256_state_t *stA, const uint8_t blkA[64],
    sha256_state_t *stB, const uint8_t blkB[64]
);

/*
 * Prepare the SHA256d(80-byte header) midstate and constant blocks from
 * header[0..75].
 *
 * mid:       structure to fill in.
 * header_76: the first 76 bytes of the 80-byte header. The remaining 4 bytes
 *            are not passed here; presumably they are the nonce supplied
 *            later to the sha256d80_*_4way() functions (confirm).
 */
void sha256d80_midstate_init(sha256d80_midstate_t *mid, const uint8_t header_76[76]);

/*
 * Hash 4 consecutive nonces with SHA256d(header80).
 *
 * mid:         midstate data for the header (see sha256d80_midstate_init()).
 * start_nonce: nonce n of lane 0; lanes 0..3 use [n, n+1, n+2, n+3].
 * out_states:  array of 4 states, one per lane, in the lane order above.
 *              NOTE(review): presumably receives each lane's final hash
 *              state words -- confirm.
 *
 * NOTE(review): behavior when start_nonce + 3 exceeds UINT32_MAX is not
 * stated here -- confirm.
 */
void sha256d80_hash_4way(
    const sha256d80_midstate_t *mid,
    uint32_t start_nonce,
    sha256_state_t out_states[4]
);

/*
 * Hash 4 consecutive nonces and return a hit mask against the target.
 *
 * mid:          midstate data for the header (see sha256d80_midstate_init()).
 * start_nonce:  first of the 4 consecutive nonces.
 * target_words: the target as 8 big-endian words, target[0..7].
 * out_states:   array of 4 states, one per lane.
 *               NOTE(review): presumably receives each lane's hash state;
 *               whether it is written for non-hitting lanes is not stated
 *               here -- confirm.
 *
 * Returns a bit mask: bit i set => lane i meets the target.
 */
uint32_t sha256d80_scan_4way(
    const sha256d80_midstate_t *mid,
    uint32_t start_nonce,
    const uint32_t target_words[8],
    sha256_state_t out_states[4]
);

/*
 * Ensure the SHA256 backend is initialized.
 *
 * Call once before using sha256d80_scan_4way_direct(), which skips the
 * pthread_once check to avoid paying that overhead on every call.
 */
void sha256_backend_ensure_init(void);

/*
 * Like sha256d80_scan_4way(), but skips the pthread_once check.
 *
 * Precondition: the caller MUST have called sha256_backend_ensure_init()
 * (or any other backend function) before calling this.
 *
 * The parameters have the same types and names as those of
 * sha256d80_scan_4way().
 */
uint32_t sha256d80_scan_4way_direct(
    const sha256d80_midstate_t *mid,
    uint32_t start_nonce,
    const uint32_t target_words[8],
    sha256_state_t out_states[4]
);

#endif