perf(sha256): add ARMv8 2-way interleaved transform and scan_4way_direct

Process two independent SHA256 chains simultaneously to hide the 2-cycle
latency of vsha256hq_u32 on Cortex-A76, approaching full throughput.
Also reduces the bytes copied via memcpy from 512 to ~192 per 4-nonce group
by reusing block buffers, and adds scan_4way_direct to bypass the
pthread_once check (an LDAR barrier) on every inner-loop call.
This commit is contained in:
2026-03-30 10:41:59 +02:00
parent b2f0090236
commit 7d4096749a
3 changed files with 403 additions and 3 deletions

View File

@@ -181,6 +181,41 @@ static int test_sha256d80_4way_100k_nonces(void) {
return 1;
}
/*
 * Checks that sha256d80_scan_4way_direct() produces the same hit mask and the
 * same four output states as sha256d80_scan_4way() when both are given the
 * same midstate, the same second argument (presumably the base nonce --
 * confirm against the backend header) and the same target words.
 *
 * Returns 1 when both results agree. On any mismatch a diagnostic is written
 * to stderr and 0 is returned.
 */
static int test_sha256d80_scan_4way_direct(void) {
    uint8_t header76[76];
    sha256d80_midstate_t mid;
    sha256_state_t st_ref[4];
    sha256_state_t st_direct[4];
    uint32_t all_max[8];
    uint32_t mask_ref;
    uint32_t mask_direct;
    int i;

    /* Deterministic, non-constant header bytes. */
    for (i = 0; i < 76; i++) {
        header76[i] = (uint8_t)((i * 13 + 7) & 0xFF);
    }
    /* Every target word set to the maximum 32-bit value. */
    for (i = 0; i < 8; i++) {
        all_max[i] = 0xFFFFFFFFU;
    }

    /*
     * Zero both output arrays up front. The memcmp() below covers all four
     * entries, so if either scan function leaves an entry unwritten the
     * comparison would otherwise read indeterminate stack bytes and the test
     * outcome would not be reproducible.
     */
    memset(st_ref, 0, sizeof(st_ref));
    memset(st_direct, 0, sizeof(st_direct));

    sha256d80_midstate_init(&mid, header76);

    /*
     * Initialise the backend explicitly before calling the direct variant.
     * NOTE(review): the direct entry point presumably skips the lazy
     * initialisation done by the regular one -- confirm in the backend.
     */
    sha256_backend_ensure_init();

    mask_ref = sha256d80_scan_4way(&mid, 0x11223344U, all_max, st_ref);
    mask_direct = sha256d80_scan_4way_direct(&mid, 0x11223344U, all_max, st_direct);

    if (mask_ref != mask_direct) {
        /* Casts keep the arguments matched to %u regardless of how uint32_t is defined. */
        fprintf(stderr, "[test_sha256_backend] scan_4way_direct mask mismatch ref=%u direct=%u\n",
                (unsigned)mask_ref, (unsigned)mask_direct);
        return 0;
    }

    if (memcmp(st_ref, st_direct, sizeof(st_ref)) != 0) {
        fprintf(stderr, "[test_sha256_backend] scan_4way_direct state mismatch\n");
        return 0;
    }

    return 1;
}
static int test_sha256d80_scan_hitmask_basic(void) {
uint8_t header76[76];
sha256d80_midstate_t mid;
@@ -219,6 +254,9 @@ int main(void) {
if (!test_sha256d80_4way_100k_nonces()) {
return 1;
}
if (!test_sha256d80_scan_4way_direct()) {
return 1;
}
if (!test_sha256d80_scan_hitmask_basic()) {
return 1;
}