Files
cpu-miner/sha256/sha256_backend.h
Davide Grilli 7d4096749a perf(sha256): add ARMv8 2-way interleaved transform and scan_4way_direct
Process two independent SHA256 chains simultaneously to hide the 2-cycle
latency of vsha256hq_u32 on Cortex-A76, approaching full throughput.
Also reduces memcpy from 512 to ~192 bytes per 4-nonce group by reusing
block buffers, and adds scan_4way_direct to bypass pthread_once (LDAR
barrier) on every inner-loop call.
2026-03-30 10:42:17 +02:00

84 lines
2.4 KiB
C

#ifndef SHA256_BACKEND_H
#define SHA256_BACKEND_H
#include <stdint.h>
/*
* Compact SHA256 state: only the 8 chaining words (32 bytes).
* Avoids copying the bloated OpenSSL SHA256_CTX (~112 bytes) in the hot loop.
*
* This is the state type used by the transform and 4-way hash functions
* declared in this header. Use sha256_state_to_digest() to obtain the
* 32-byte big-endian digest form of a state.
*/
typedef struct {
/* Chaining words h[0]..h[7]: 8 x 32 bits = 32 bytes. */
uint32_t h[8];
} sha256_state_t;
/*
* Precomputed per-header data for SHA256d over an 80-byte header.
* Filled in by sha256d80_midstate_init() from header bytes 0..75 and then
* passed as a const pointer to the 4-way hash/scan functions.
*
* NOTE(review): the per-field descriptions below are inferred from the field
* names only; the implementation is not visible from this header. Confirm
* them against the backend source before relying on them.
*/
typedef struct {
/* Presumably the SHA256 initial values (IV) -- TODO confirm. */
sha256_state_t init_state;
/* Presumably the state after compressing the first 64-byte chunk of the header -- TODO confirm. */
sha256_state_t first_chunk_state;
/* 64-byte block template; presumably the remaining header bytes plus padding for the first hash -- TODO confirm. */
uint8_t block1_template[64];
/* 64-byte block template; presumably the padded block for the second hash of SHA256d -- TODO confirm. */
uint8_t block2_template[64];
} sha256d80_midstate_t;
/*
* Set state to SHA256 initial values (IV).
*
* state: output; its 8 chaining words are set to the IV.
*/
void sha256_state_init(sha256_state_t *state);
/*
* Serialize the 8 state words into a 32-byte big-endian digest.
*
* state: input state; taken as a const pointer.
* out:   destination buffer for the 32 digest bytes.
*/
void sha256_state_to_digest(const sha256_state_t *state, uint8_t out[32]);
/*
* Single SHA256 block compression (64-byte block).
*
* state: chaining state; passed as a non-const pointer and returns void, so
*        the result of the compression is delivered through this argument.
* block: the 64-byte message block to compress; taken as a const pointer.
*
* NOTE(review): no padding or length handling is mentioned here; presumably
* the caller supplies already-padded blocks -- confirm.
*/
void sha256_transform_fast(sha256_state_t *state, const uint8_t block[64]);
/*
* 2-way interleaved SHA256 block compression.
* Processes two independent (state, block) pairs so the CPU can overlap both
* instruction chains. On non-ARM builds falls back to two sequential calls.
*
* stA, blkA: first chaining state and the 64-byte block compressed into it.
* stB, blkB: second chaining state and the 64-byte block compressed into it.
*
* NOTE(review): the pairs are described as independent; presumably stA and
* stB must refer to distinct objects -- confirm against the implementation.
*/
void sha256_transform_fast_2way(
sha256_state_t *stA, const uint8_t blkA[64],
sha256_state_t *stB, const uint8_t blkB[64]
);
/*
* Prepare SHA256d(80-byte header) midstate and constant blocks from header[0..75].
*
* mid:       output; filled in for later use by the sha256d80_* 4-way functions,
*            which take it as a const pointer.
* header_76: the first 76 bytes of the 80-byte header. The remaining 4 bytes
*            are not passed here; the 4-way functions receive the nonce
*            separately as start_nonce (presumably header bytes 76..79 --
*            TODO confirm).
*/
void sha256d80_midstate_init(sha256d80_midstate_t *mid, const uint8_t header_76[76]);
/*
* Hash 4 consecutive nonces with SHA256d(header80).
* start_nonce lane order: [n, n+1, n+2, n+3].
*
* mid:         midstate prepared by sha256d80_midstate_init(); const pointer.
* start_nonce: nonce n used for lane 0; lanes 1..3 use n+1, n+2, n+3.
* out_states:  array of 4 states, one per lane (presumably out_states[i]
*              receives the final SHA256d state words for lane i -- TODO confirm).
*
* NOTE(review): behavior when start_nonce + 3 exceeds the uint32_t range is
* not specified in this header -- confirm with the implementation.
*/
void sha256d80_hash_4way(
const sha256d80_midstate_t *mid,
uint32_t start_nonce,
sha256_state_t out_states[4]
);
/*
* Hash 4 consecutive nonces and return hit mask against target words.
* target_words are big-endian words target[0..7].
* bit i set => lane i meets target.
*
* mid:          midstate prepared by sha256d80_midstate_init(); const pointer.
* start_nonce:  nonce for lane 0; the 4 lanes cover 4 consecutive nonces.
* target_words: the 8 target words to compare against; const pointer.
* out_states:   array of 4 states, one per lane (presumably the per-lane hash
*               results -- TODO confirm).
*
* Returns a bit mask with one bit per lane, so only bits 0..3 carry meaning.
* Unlike sha256d80_scan_4way_direct(), this variant performs the backend's
* pthread_once check on every call.
*/
uint32_t sha256d80_scan_4way(
const sha256d80_midstate_t *mid,
uint32_t start_nonce,
const uint32_t target_words[8],
sha256_state_t out_states[4]
);
/*
* Ensure the SHA256 backend is initialized. Call once before using
* sha256d80_scan_4way_direct() to avoid per-call pthread_once overhead.
*
* sha256d80_scan_4way_direct() skips the initialization check, so this call
* (or a call to any other backend function) must come first.
*/
void sha256_backend_ensure_init(void);
/*
* Like sha256d80_scan_4way() but skips the pthread_once check.
* Caller MUST have called sha256_backend_ensure_init() (or any other
* backend function) before calling this.
*
* Parameters and return value have the same types as sha256d80_scan_4way():
* mid:          midstate prepared by sha256d80_midstate_init(); const pointer.
* start_nonce:  nonce for lane 0; the 4 lanes cover 4 consecutive nonces.
* target_words: the 8 target words to compare against; const pointer.
* out_states:   array of 4 states, one per lane.
*
* Returns the per-lane hit mask (bit i set => lane i meets target).
*
* NOTE(review): what happens if the backend has not been initialized is not
* stated in this header; treat it as a precondition violation.
*/
uint32_t sha256d80_scan_4way_direct(
const sha256d80_midstate_t *mid,
uint32_t start_nonce,
const uint32_t target_words[8],
sha256_state_t out_states[4]
);
#endif