perf(sha256): add ARMv8 2-way interleaved transform and scan_4way_direct
Process two independent SHA256 chains simultaneously to hide the 2-cycle latency of vsha256hq_u32 on Cortex-A76, so that the transform approaches full throughput. Also reduce the data copied with memcpy from 512 to ~192 bytes per 4-nonce group by reusing block buffers, and add sha256d80_scan_4way_direct to bypass the pthread_once check (an LDAR barrier) on every inner-loop call.
This commit is contained in:
@@ -62,4 +62,22 @@ uint32_t sha256d80_scan_4way(
|
||||
sha256_state_t out_states[4]
|
||||
);
|
||||
|
||||
/*
|
||||
* Ensure the SHA256 backend is initialized. Call once before using
|
||||
* sha256d80_scan_4way_direct() to avoid per-call pthread_once overhead.
|
||||
*/
|
||||
void sha256_backend_ensure_init(void);
|
||||
|
||||
/*
|
||||
* Like sha256d80_scan_4way() but skips the pthread_once check.
|
||||
* Caller MUST have called sha256_backend_ensure_init() (or any other
|
||||
* backend function) before calling this.
|
||||
*/
|
||||
uint32_t sha256d80_scan_4way_direct(
|
||||
const sha256d80_midstate_t *mid,
|
||||
uint32_t start_nonce,
|
||||
const uint32_t target_words[8],
|
||||
sha256_state_t out_states[4]
|
||||
);
|
||||
|
||||
#endif
|
||||
|
||||
Reference in New Issue
Block a user