From 7d4096749a9d9a458f2e50f3c580a70834149e6f Mon Sep 17 00:00:00 2001
From: Davide Grilli
Date: Mon, 30 Mar 2026 10:41:59 +0200
Subject: [PATCH] perf(sha256): add ARMv8 2-way interleaved transform and
 scan_4way_direct

Process two independent SHA256 chains simultaneously to hide the 2-cycle
latency of vsha256hq_u32 on Cortex-A76, approaching full throughput.

Also reduces memcpy from 512 to ~192 bytes per 4-nonce group by reusing
block buffers, and adds scan_4way_direct to bypass pthread_once (LDAR
barrier) on every inner-loop call.
---
 sha256/sha256_backend.c     | 350 +++++++++++++++++++++++++++++++++++-
 sha256/sha256_backend.h     |  18 ++
 tests/test_sha256_backend.c |  38 ++++
 3 files changed, 403 insertions(+), 3 deletions(-)

diff --git a/sha256/sha256_backend.c b/sha256/sha256_backend.c
index bb5fbb0..9c7eba9 100644
--- a/sha256/sha256_backend.c
+++ b/sha256/sha256_backend.c
@@ -231,6 +231,314 @@ static void sha256_transform_armv8(sha256_state_t *state, const uint8_t block[64
 void sha256d80_4way_aarch64_impl(const sha256d80_midstate_t *mid, uint32_t start_nonce, sha256_state_t out_states[4]);
 extern void sha256d80_4way_aarch64_kernel(const sha256d80_midstate_t *mid, uint32_t start_nonce, sha256_state_t out_states[4]);
 
+
+/*
+ * 2-way interleaved SHA256 block compression.
+ * Processes two independent (state, block) pairs simultaneously to hide the
+ * 2-cycle latency of vsha256hq_u32 on Cortex-A76 (latency=2, throughput=1).
+ * While chain A is waiting for its result, chain B's instruction issues,
+ * filling the pipeline bubble and approaching full throughput utilization.
+ */
+static void sha256_transform_armv8_2way(
+    sha256_state_t *stA, const uint8_t blkA[64],
+    sha256_state_t *stB, const uint8_t blkB[64]
+) {
+    /* Chain A state */
+    uint32x4_t SA0, SA1, ABEF_A, CDGH_A;
+    uint32x4_t MA0, MA1, MA2, MA3;
+    uint32x4_t TA0, TA1, TA2;
+    /* Chain B state */
+    uint32x4_t SB0, SB1, ABEF_B, CDGH_B;
+    uint32x4_t MB0, MB1, MB2, MB3;
+    uint32x4_t TB0, TB1, TB2;
+
+    /* Load and byte-swap A */
+    SA0 = vld1q_u32(&stA->h[0]);
+    SA1 = vld1q_u32(&stA->h[4]);
+    ABEF_A = SA0; CDGH_A = SA1;
+    MA0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(vld1q_u32((const uint32_t *)(blkA + 0)))));
+    MA1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(vld1q_u32((const uint32_t *)(blkA + 16)))));
+    MA2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(vld1q_u32((const uint32_t *)(blkA + 32)))));
+    MA3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(vld1q_u32((const uint32_t *)(blkA + 48)))));
+
+    /* Load and byte-swap B */
+    SB0 = vld1q_u32(&stB->h[0]);
+    SB1 = vld1q_u32(&stB->h[4]);
+    ABEF_B = SB0; CDGH_B = SB1;
+    MB0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(vld1q_u32((const uint32_t *)(blkB + 0)))));
+    MB1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(vld1q_u32((const uint32_t *)(blkB + 16)))));
+    MB2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(vld1q_u32((const uint32_t *)(blkB + 32)))));
+    MB3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(vld1q_u32((const uint32_t *)(blkB + 48)))));
+
+    /* ---- Rounds 0-3 ---- */
+    TA0 = vaddq_u32(MA0, vld1q_u32(&k_sha256[0x00]));
+    MA0 = vsha256su0q_u32(MA0, MA1);
+    TA2 = SA0;
+    TB0 = vaddq_u32(MB0, vld1q_u32(&k_sha256[0x00]));
+    MB0 = vsha256su0q_u32(MB0, MB1);
+    TB2 = SB0;
+    SA0 = vsha256hq_u32(SA0, SA1, TA0);
+    SA1 = vsha256h2q_u32(SA1, TA2, TA0);
+    MA0 = vsha256su1q_u32(MA0, MA2, MA3);
+    SB0 = vsha256hq_u32(SB0, SB1, TB0);
+    SB1 = vsha256h2q_u32(SB1, TB2, TB0);
+    MB0 = vsha256su1q_u32(MB0, MB2, MB3);
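+
+    /*
+     * Rounds 4-47 repeat the pattern above, rotating through the message
+     * vectors and round-constant offsets. The key point is the interleave:
+     * each chain's vsha256hq/vsha256h2q pair is followed immediately by the
+     * other chain's, so chain B issues in the slots where chain A is still
+     * waiting on its 2-cycle result.
+     */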
+
+    /* ---- Rounds 4-7 ---- */
+    TA1 = vaddq_u32(MA1, vld1q_u32(&k_sha256[0x04]));
+    MA1 = vsha256su0q_u32(MA1, MA2);
+    TA2 = SA0;
+    TB1 = vaddq_u32(MB1, vld1q_u32(&k_sha256[0x04]));
+    MB1 = vsha256su0q_u32(MB1, MB2);
+    TB2 = SB0;
+    SA0 = vsha256hq_u32(SA0, SA1, TA1);
+    SA1 = vsha256h2q_u32(SA1, TA2, TA1);
+    MA1 = vsha256su1q_u32(MA1, MA3, MA0);
+    SB0 = vsha256hq_u32(SB0, SB1, TB1);
+    SB1 = vsha256h2q_u32(SB1, TB2, TB1);
+    MB1 = vsha256su1q_u32(MB1, MB3, MB0);
+
+    /* ---- Rounds 8-11 ---- */
+    TA0 = vaddq_u32(MA2, vld1q_u32(&k_sha256[0x08]));
+    MA2 = vsha256su0q_u32(MA2, MA3);
+    TA2 = SA0;
+    TB0 = vaddq_u32(MB2, vld1q_u32(&k_sha256[0x08]));
+    MB2 = vsha256su0q_u32(MB2, MB3);
+    TB2 = SB0;
+    SA0 = vsha256hq_u32(SA0, SA1, TA0);
+    SA1 = vsha256h2q_u32(SA1, TA2, TA0);
+    MA2 = vsha256su1q_u32(MA2, MA0, MA1);
+    SB0 = vsha256hq_u32(SB0, SB1, TB0);
+    SB1 = vsha256h2q_u32(SB1, TB2, TB0);
+    MB2 = vsha256su1q_u32(MB2, MB0, MB1);
+
+    /* ---- Rounds 12-15 ---- */
+    TA1 = vaddq_u32(MA3, vld1q_u32(&k_sha256[0x0c]));
+    MA3 = vsha256su0q_u32(MA3, MA0);
+    TA2 = SA0;
+    TB1 = vaddq_u32(MB3, vld1q_u32(&k_sha256[0x0c]));
+    MB3 = vsha256su0q_u32(MB3, MB0);
+    TB2 = SB0;
+    SA0 = vsha256hq_u32(SA0, SA1, TA1);
+    SA1 = vsha256h2q_u32(SA1, TA2, TA1);
+    MA3 = vsha256su1q_u32(MA3, MA1, MA2);
+    SB0 = vsha256hq_u32(SB0, SB1, TB1);
+    SB1 = vsha256h2q_u32(SB1, TB2, TB1);
+    MB3 = vsha256su1q_u32(MB3, MB1, MB2);
+
+    /* ---- Rounds 16-19 ---- */
+    TA0 = vaddq_u32(MA0, vld1q_u32(&k_sha256[0x10]));
+    MA0 = vsha256su0q_u32(MA0, MA1);
+    TA2 = SA0;
+    TB0 = vaddq_u32(MB0, vld1q_u32(&k_sha256[0x10]));
+    MB0 = vsha256su0q_u32(MB0, MB1);
+    TB2 = SB0;
+    SA0 = vsha256hq_u32(SA0, SA1, TA0);
+    SA1 = vsha256h2q_u32(SA1, TA2, TA0);
+    MA0 = vsha256su1q_u32(MA0, MA2, MA3);
+    SB0 = vsha256hq_u32(SB0, SB1, TB0);
+    SB1 = vsha256h2q_u32(SB1, TB2, TB0);
+    MB0 = vsha256su1q_u32(MB0, MB2, MB3);
+
+    /* ---- Rounds 20-23 ---- */
+    TA1 = vaddq_u32(MA1, vld1q_u32(&k_sha256[0x14]));
+    MA1 = vsha256su0q_u32(MA1, MA2);
+    TA2 = SA0;
+    TB1 = vaddq_u32(MB1, vld1q_u32(&k_sha256[0x14]));
+    MB1 = vsha256su0q_u32(MB1, MB2);
+    TB2 = SB0;
+    SA0 = vsha256hq_u32(SA0, SA1, TA1);
+    SA1 = vsha256h2q_u32(SA1, TA2, TA1);
+    MA1 = vsha256su1q_u32(MA1, MA3, MA0);
+    SB0 = vsha256hq_u32(SB0, SB1, TB1);
+    SB1 = vsha256h2q_u32(SB1, TB2, TB1);
+    MB1 = vsha256su1q_u32(MB1, MB3, MB0);
+
+    /* ---- Rounds 24-27 ---- */
+    TA0 = vaddq_u32(MA2, vld1q_u32(&k_sha256[0x18]));
+    MA2 = vsha256su0q_u32(MA2, MA3);
+    TA2 = SA0;
+    TB0 = vaddq_u32(MB2, vld1q_u32(&k_sha256[0x18]));
+    MB2 = vsha256su0q_u32(MB2, MB3);
+    TB2 = SB0;
+    SA0 = vsha256hq_u32(SA0, SA1, TA0);
+    SA1 = vsha256h2q_u32(SA1, TA2, TA0);
+    MA2 = vsha256su1q_u32(MA2, MA0, MA1);
+    SB0 = vsha256hq_u32(SB0, SB1, TB0);
+    SB1 = vsha256h2q_u32(SB1, TB2, TB0);
+    MB2 = vsha256su1q_u32(MB2, MB0, MB1);
+
+    /* ---- Rounds 28-31 ---- */
+    TA1 = vaddq_u32(MA3, vld1q_u32(&k_sha256[0x1c]));
+    MA3 = vsha256su0q_u32(MA3, MA0);
+    TA2 = SA0;
+    TB1 = vaddq_u32(MB3, vld1q_u32(&k_sha256[0x1c]));
+    MB3 = vsha256su0q_u32(MB3, MB0);
+    TB2 = SB0;
+    SA0 = vsha256hq_u32(SA0, SA1, TA1);
+    SA1 = vsha256h2q_u32(SA1, TA2, TA1);
+    MA3 = vsha256su1q_u32(MA3, MA1, MA2);
+    SB0 = vsha256hq_u32(SB0, SB1, TB1);
+    SB1 = vsha256h2q_u32(SB1, TB2, TB1);
+    MB3 = vsha256su1q_u32(MB3, MB1, MB2);
+
+    /* ---- Rounds 32-35 ---- */
+    TA0 = vaddq_u32(MA0, vld1q_u32(&k_sha256[0x20]));
+    MA0 = vsha256su0q_u32(MA0, MA1);
+    TA2 = SA0;
+    TB0 = vaddq_u32(MB0, vld1q_u32(&k_sha256[0x20]));
+    MB0 = vsha256su0q_u32(MB0, MB1);
+    TB2 = SB0;
+    SA0 = vsha256hq_u32(SA0, SA1, TA0);
+    SA1 = vsha256h2q_u32(SA1, TA2, TA0);
+    MA0 = vsha256su1q_u32(MA0, MA2, MA3);
+    SB0 = vsha256hq_u32(SB0, SB1, TB0);
+    SB1 = vsha256h2q_u32(SB1, TB2, TB0);
+    MB0 = vsha256su1q_u32(MB0, MB2, MB3);
+
+    /* ---- Rounds 36-39 ---- */
+    TA1 = vaddq_u32(MA1, vld1q_u32(&k_sha256[0x24]));
+    MA1 = vsha256su0q_u32(MA1, MA2);
+    TA2 = SA0;
+    TB1 = vaddq_u32(MB1, vld1q_u32(&k_sha256[0x24]));
+    MB1 = vsha256su0q_u32(MB1, MB2);
+    TB2 = SB0;
+    SA0 = vsha256hq_u32(SA0, SA1, TA1);
+    SA1 = vsha256h2q_u32(SA1, TA2, TA1);
+    MA1 = vsha256su1q_u32(MA1, MA3, MA0);
+    SB0 = vsha256hq_u32(SB0, SB1, TB1);
+    SB1 = vsha256h2q_u32(SB1, TB2, TB1);
+    MB1 = vsha256su1q_u32(MB1, MB3, MB0);
+
+    /* ---- Rounds 40-43 ---- */
+    TA0 = vaddq_u32(MA2, vld1q_u32(&k_sha256[0x28]));
+    MA2 = vsha256su0q_u32(MA2, MA3);
+    TA2 = SA0;
+    TB0 = vaddq_u32(MB2, vld1q_u32(&k_sha256[0x28]));
+    MB2 = vsha256su0q_u32(MB2, MB3);
+    TB2 = SB0;
+    SA0 = vsha256hq_u32(SA0, SA1, TA0);
+    SA1 = vsha256h2q_u32(SA1, TA2, TA0);
+    MA2 = vsha256su1q_u32(MA2, MA0, MA1);
+    SB0 = vsha256hq_u32(SB0, SB1, TB0);
+    SB1 = vsha256h2q_u32(SB1, TB2, TB0);
+    MB2 = vsha256su1q_u32(MB2, MB0, MB1);
+
+    /* ---- Rounds 44-47 ---- */
+    TA1 = vaddq_u32(MA3, vld1q_u32(&k_sha256[0x2c]));
+    MA3 = vsha256su0q_u32(MA3, MA0);
+    TA2 = SA0;
+    TB1 = vaddq_u32(MB3, vld1q_u32(&k_sha256[0x2c]));
+    MB3 = vsha256su0q_u32(MB3, MB0);
+    TB2 = SB0;
+    SA0 = vsha256hq_u32(SA0, SA1, TA1);
+    SA1 = vsha256h2q_u32(SA1, TA2, TA1);
+    MA3 = vsha256su1q_u32(MA3, MA1, MA2);
+    SB0 = vsha256hq_u32(SB0, SB1, TB1);
+    SB1 = vsha256h2q_u32(SB1, TB2, TB1);
+    MB3 = vsha256su1q_u32(MB3, MB1, MB2);
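+
+    /*
+     * By round 48 the message schedule is fully expanded (W[48..63] are
+     * already live in MA0-MA3 / MB0-MB3), so the su0/su1 updates drop out
+     * and only the compression instructions remain.
+     */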
+
+    /* ---- Rounds 48-51 ---- */
+    TA0 = vaddq_u32(MA0, vld1q_u32(&k_sha256[0x30]));
+    TA2 = SA0;
+    TB0 = vaddq_u32(MB0, vld1q_u32(&k_sha256[0x30]));
+    TB2 = SB0;
+    SA0 = vsha256hq_u32(SA0, SA1, TA0);
+    SA1 = vsha256h2q_u32(SA1, TA2, TA0);
+    SB0 = vsha256hq_u32(SB0, SB1, TB0);
+    SB1 = vsha256h2q_u32(SB1, TB2, TB0);
+
+    /* ---- Rounds 52-55 ---- */
+    TA1 = vaddq_u32(MA1, vld1q_u32(&k_sha256[0x34]));
+    TA2 = SA0;
+    TB1 = vaddq_u32(MB1, vld1q_u32(&k_sha256[0x34]));
+    TB2 = SB0;
+    SA0 = vsha256hq_u32(SA0, SA1, TA1);
+    SA1 = vsha256h2q_u32(SA1, TA2, TA1);
+    SB0 = vsha256hq_u32(SB0, SB1, TB1);
+    SB1 = vsha256h2q_u32(SB1, TB2, TB1);
+
+    /* ---- Rounds 56-59 ---- */
+    TA0 = vaddq_u32(MA2, vld1q_u32(&k_sha256[0x38]));
+    TA2 = SA0;
+    TB0 = vaddq_u32(MB2, vld1q_u32(&k_sha256[0x38]));
+    TB2 = SB0;
+    SA0 = vsha256hq_u32(SA0, SA1, TA0);
+    SA1 = vsha256h2q_u32(SA1, TA2, TA0);
+    SB0 = vsha256hq_u32(SB0, SB1, TB0);
+    SB1 = vsha256h2q_u32(SB1, TB2, TB0);
+
+    /* ---- Rounds 60-63 ---- */
+    TA1 = vaddq_u32(MA3, vld1q_u32(&k_sha256[0x3c]));
+    TA2 = SA0;
+    TB1 = vaddq_u32(MB3, vld1q_u32(&k_sha256[0x3c]));
+    TB2 = SB0;
+    SA0 = vsha256hq_u32(SA0, SA1, TA1);
+    SA1 = vsha256h2q_u32(SA1, TA2, TA1);
+    SB0 = vsha256hq_u32(SB0, SB1, TB1);
+    SB1 = vsha256h2q_u32(SB1, TB2, TB1);
+
+    SA0 = vaddq_u32(SA0, ABEF_A);
+    SA1 = vaddq_u32(SA1, CDGH_A);
+    SB0 = vaddq_u32(SB0, ABEF_B);
+    SB1 = vaddq_u32(SB1, CDGH_B);
+
+    vst1q_u32(&stA->h[0], SA0);
+    vst1q_u32(&stA->h[4], SA1);
+    vst1q_u32(&stB->h[0], SB0);
+    vst1q_u32(&stB->h[4], SB1);
+}
+
+/*
+ * 4-way hash using 2-way interleaved transforms.
+ * Pairs (0,1) and (2,3) are processed simultaneously, hiding SHA2
+ * instruction latency. Avoids redundant memcpy by reusing block buffers
+ * across the 4 nonces.
+ */
+static void sha256d80_hash_4way_armv8_2way(
+    const sha256d80_midstate_t *mid,
+    uint32_t start_nonce,
+    sha256_state_t out_states[4]
+) {
+    /* Two block1 buffers reused for both pairs */
+    uint8_t block1A[64], block1B[64];
+    /* Two block2 buffers: only bytes 0-31 (the digest) vary; bytes 32-63 hold constant padding */
+    uint8_t block2A[64], block2B[64];
+
+    /* Copy the block1 template once per buffer (2 copies instead of 4) */
+    memcpy(block1A, mid->block1_template, 64);
+    memcpy(block1B, mid->block1_template, 64);
+    /* Copy only the constant padding half of the block2 template (32 bytes each) */
+    memcpy(block2A + 32, mid->block2_template + 32, 32);
+    memcpy(block2B + 32, mid->block2_template + 32, 32);
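+
+    /* Net copy traffic per 4-nonce group: 2*64 + 2*32 = 192 bytes, versus
+     * 4*(64 + 64) = 512 bytes when every nonce filled private copies of
+     * both templates. */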
+
+    /* --- Pass 1, pair (0,1) --- */
+    out_states[0] = mid->first_chunk_state;
+    out_states[1] = mid->first_chunk_state;
+    set_block1_nonce(block1A, start_nonce + 0U);
+    set_block1_nonce(block1B, start_nonce + 1U);
+    sha256_transform_armv8_2way(&out_states[0], block1A, &out_states[1], block1B);
+
+    /* --- Pass 1, pair (2,3) --- */
+    out_states[2] = mid->first_chunk_state;
+    out_states[3] = mid->first_chunk_state;
+    set_block1_nonce(block1A, start_nonce + 2U);
+    set_block1_nonce(block1B, start_nonce + 3U);
+    sha256_transform_armv8_2way(&out_states[2], block1A, &out_states[3], block1B);
+
+    /* --- Pass 2, pair (0,1): write digest directly into block2 bytes 0-31 --- */
+    sha256_state_to_digest(&out_states[0], block2A);
+    sha256_state_to_digest(&out_states[1], block2B);
+    out_states[0] = mid->init_state;
+    out_states[1] = mid->init_state;
+    sha256_transform_armv8_2way(&out_states[0], block2A, &out_states[1], block2B);
+
+    /* --- Pass 2, pair (2,3) --- */
+    sha256_state_to_digest(&out_states[2], block2A);
+    sha256_state_to_digest(&out_states[3], block2B);
+    out_states[2] = mid->init_state;
+    out_states[3] = mid->init_state;
+    sha256_transform_armv8_2way(&out_states[2], block2A, &out_states[3], block2B);
+}
 #endif
 
 static sha256_transform_fn_t g_transform_fn = sha256_transform_openssl;
@@ -350,7 +658,7 @@ static int arm_hash4_selftest(void) {
 
     sha256d80_midstate_init_with_fn(&mid, header_76, sha256_transform_openssl);
     sha256d80_hash_4way_with_fn(&mid, 0x10203040U, ref, sha256_transform_openssl);
-    sha256d80_4way_aarch64_kernel(&mid, 0x10203040U, test);
+    sha256d80_hash_4way_armv8_2way(&mid, 0x10203040U, test);
 
     return memcmp(ref, test, sizeof(ref)) == 0;
 }
@@ -363,8 +671,8 @@ static void sha256_backend_init_once(void) {
 #if defined(__aarch64__) && defined(__ARM_FEATURE_CRYPTO)
     if (cpu_has_sha2() && arm_transform_selftest() && arm_hash4_selftest()) {
         g_transform_fn = sha256_transform_armv8;
-        g_hash4_fn = sha256d80_4way_aarch64_kernel;
-        fprintf(stderr, "[miner] backend SHA: ARMv8 ASM+intrinsics enabled\n");
+        g_hash4_fn = sha256d80_hash_4way_armv8_2way;
+        fprintf(stderr, "[miner] backend SHA: ARMv8 2-way interleaved enabled\n");
     } else {
         fprintf(stderr, "[miner] backend SHA: OpenSSL fallback\n");
     }
@@ -395,6 +703,12 @@ void sha256_transform_fast_2way(
     sha256_state_t *stB, const uint8_t blkB[64]
 ) {
     pthread_once(&g_backend_once, sha256_backend_init_once);
+#if defined(__aarch64__) && defined(__ARM_FEATURE_CRYPTO)
+    if (g_transform_fn == sha256_transform_armv8) {
+        sha256_transform_armv8_2way(stA, blkA, stB, blkB);
+        return;
+    }
+#endif
     g_transform_fn(stA, blkA);
     g_transform_fn(stB, blkB);
 }
@@ -439,3 +753,33 @@ uint32_t sha256d80_scan_4way(
 
     return mask;
 }
+
+void sha256_backend_ensure_init(void) {
+    pthread_once(&g_backend_once, sha256_backend_init_once);
+}
+
+uint32_t sha256d80_scan_4way_direct(
+    const sha256d80_midstate_t *mid,
+    uint32_t start_nonce,
+    const uint32_t target_words[8],
+    sha256_state_t out_states[4]
+) {
+    uint32_t mask = 0;
+
+    g_hash4_fn(mid, start_nonce, out_states);
+
+    if (state_meets_target_words(&out_states[0], target_words)) {
+        mask |= 1U;
+    }
+    if (state_meets_target_words(&out_states[1], target_words)) {
+        mask |= 2U;
+    }
+    if (state_meets_target_words(&out_states[2], target_words)) {
+        mask |= 4U;
+    }
+    if (state_meets_target_words(&out_states[3], target_words)) {
+        mask |= 8U;
+    }
+
+    return mask;
+}
diff --git a/sha256/sha256_backend.h b/sha256/sha256_backend.h
index 5d7083a..792da27 100644
--- a/sha256/sha256_backend.h
+++ b/sha256/sha256_backend.h
@@ -62,4 +62,22 @@ uint32_t sha256d80_scan_4way(
     sha256_state_t out_states[4]
 );
 
+/*
+ * Ensure the SHA256 backend is initialized. Call once before using
+ * sha256d80_scan_4way_direct() to avoid per-call pthread_once overhead.
+ */
+void sha256_backend_ensure_init(void);
+
+/*
+ * Like sha256d80_scan_4way() but skips the pthread_once check.
+ * Caller MUST have called sha256_backend_ensure_init() (or any other
+ * backend function) before calling this.
+ */
+uint32_t sha256d80_scan_4way_direct(
+    const sha256d80_midstate_t *mid,
+    uint32_t start_nonce,
+    const uint32_t target_words[8],
+    sha256_state_t out_states[4]
+);
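+
+/*
+ * Typical hot-loop usage (illustrative sketch only; the loop bounds and
+ * the hit handler below are hypothetical, not part of this API):
+ *
+ *   sha256_backend_ensure_init();
+ *   for (nonce = first; nonce != last; nonce += 4) {
+ *       uint32_t hits = sha256d80_scan_4way_direct(&mid, nonce, target, st);
+ *       if (hits) handle_hits(nonce, hits, st);
+ *   }
+ */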
+
 #endif
diff --git a/tests/test_sha256_backend.c b/tests/test_sha256_backend.c
index 1f25d6d..753d739 100644
--- a/tests/test_sha256_backend.c
+++ b/tests/test_sha256_backend.c
@@ -181,6 +181,41 @@ static int test_sha256d80_4way_100k_nonces(void) {
     return 1;
 }
 
+static int test_sha256d80_scan_4way_direct(void) {
+    uint8_t header76[76];
+    sha256d80_midstate_t mid;
+    sha256_state_t st_ref[4];
+    sha256_state_t st_direct[4];
+    uint32_t all_max[8];
+    uint32_t mask_ref;
+    uint32_t mask_direct;
+    int i;
+
+    for (i = 0; i < 76; i++) {
+        header76[i] = (uint8_t)((i * 13 + 7) & 0xFF);
+    }
+    for (i = 0; i < 8; i++) {
+        all_max[i] = 0xFFFFFFFFU;
+    }
+
+    sha256d80_midstate_init(&mid, header76);
+    sha256_backend_ensure_init();
+
+    mask_ref = sha256d80_scan_4way(&mid, 0x11223344U, all_max, st_ref);
+    mask_direct = sha256d80_scan_4way_direct(&mid, 0x11223344U, all_max, st_direct);
+
+    if (mask_ref != mask_direct) {
+        fprintf(stderr, "[test_sha256_backend] scan_4way_direct mask mismatch ref=%u direct=%u\n",
+                mask_ref, mask_direct);
+        return 0;
+    }
+    if (memcmp(st_ref, st_direct, sizeof(st_ref)) != 0) {
+        fprintf(stderr, "[test_sha256_backend] scan_4way_direct state mismatch\n");
+        return 0;
+    }
+    return 1;
+}
+
 static int test_sha256d80_scan_hitmask_basic(void) {
     uint8_t header76[76];
     sha256d80_midstate_t mid;
@@ -219,6 +254,9 @@ int main(void) {
     if (!test_sha256d80_4way_100k_nonces()) {
         return 1;
     }
+    if (!test_sha256d80_scan_4way_direct()) {
+        return 1;
+    }
     if (!test_sha256d80_scan_hitmask_basic()) {
         return 1;
     }