perf(sha256): add ARMv8 2-way interleaved transform and scan_4way_direct

Process two independent SHA256 chains simultaneously to hide the 2-cycle latency of vsha256hq_u32 on Cortex-A76, approaching full throughput. Also reduces memcpy from 512 to ~192 bytes per 4-nonce group by reusing block buffers, and adds scan_4way_direct to bypass pthread_once (LDAR barrier) on every inner-loop call.
2026-03-30 10:41:59 +02:00
parent b2f0090236
commit 7d4096749a
3 changed files with 403 additions and 3 deletions
--- a/tests/test_sha256_backend.c
+++ b/tests/test_sha256_backend.c
@@ -181,6 +181,41 @@ static int test_sha256d80_4way_100k_nonces(void) {
    return 1;
 }

+static int test_sha256d80_scan_4way_direct(void) {
+    uint8_t header76[76];
+    sha256d80_midstate_t mid;
+    sha256_state_t st_ref[4];
+    sha256_state_t st_direct[4];
+    uint32_t all_max[8];
+    uint32_t mask_ref;
+    uint32_t mask_direct;
+    int i;
+
+    for (i = 0; i < 76; i++) {
+        header76[i] = (uint8_t)((i * 13 + 7) & 0xFF);
+    }
+    for (i = 0; i < 8; i++) {
+        all_max[i] = 0xFFFFFFFFU;
+    }
+
+    sha256d80_midstate_init(&mid, header76);
+    sha256_backend_ensure_init();
+
+    mask_ref    = sha256d80_scan_4way(&mid, 0x11223344U, all_max, st_ref);
+    mask_direct = sha256d80_scan_4way_direct(&mid, 0x11223344U, all_max, st_direct);
+
+    if (mask_ref != mask_direct) {
+        fprintf(stderr, "[test_sha256_backend] scan_4way_direct mask mismatch ref=%u direct=%u\n",
+                mask_ref, mask_direct);
+        return 0;
+    }
+    if (memcmp(st_ref, st_direct, sizeof(st_ref)) != 0) {
+        fprintf(stderr, "[test_sha256_backend] scan_4way_direct state mismatch\n");
+        return 0;
+    }
+    return 1;
+}
+
 static int test_sha256d80_scan_hitmask_basic(void) {
    uint8_t header76[76];
    sha256d80_midstate_t mid;
@@ -219,6 +254,9 @@ int main(void) {
    if (!test_sha256d80_4way_100k_nonces()) {
        return 1;
    }
+    if (!test_sha256d80_scan_4way_direct()) {
+        return 1;
+    }
    if (!test_sha256d80_scan_hitmask_basic()) {
        return 1;
    }