From b6af554b0cae1702a4afdaefb8c7bce31dc32276 Mon Sep 17 00:00:00 2001
From: Davide Grilli
Date: Mon, 30 Mar 2026 11:13:49 +0200
Subject: [PATCH] perf(sha256): eliminate double bswap between SHA256d pass1
 and pass2
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add sha256_transform_armv8_2way_pass2 which reads the pass1 output state
words directly into MSG0/MSG1 without byte serialization.

Previously:

  sha256_state_to_digest()  → native uint32 → BE bytes (8x write_u32_be)
  sha256_transform load     → BE bytes → vrev32q_u8 → native uint32 (4x)

These two conversions cancel out. The new path skips both, saving ~52
shift/store/load/vrev ops per 4-nonce group.

Also eliminates the two 64-byte block2 stack buffers from
sha256d80_hash_4way_armv8_2way.
---
 sha256/sha256_backend.c | 305 +++++++++++++++++++++++++++++++++++++---
 1 file changed, 283 insertions(+), 22 deletions(-)

diff --git a/sha256/sha256_backend.c b/sha256/sha256_backend.c
index 9c7eba9..3c065fb 100644
--- a/sha256/sha256_backend.c
+++ b/sha256/sha256_backend.c
@@ -489,27 +489,298 @@ static void sha256_transform_armv8_2way(
     vst1q_u32(&stB->h[4], SB1);
 }
 
+/*
+ * Constant message words W[8..15] for the second SHA256 pass.
+ * block2 = [digest(32 bytes) | 0x80 | zeros | length(0x100 bits)]
+ * After the byte swap (vrev32q_u8) that the generic transform applies when
+ * loading a block, these padding words are always:
+ * W[8]=0x80000000, W[9..14]=0, W[15]=0x00000100.
+ * Preloading them avoids redundant memory reads per call.
+ */
+static const uint32_t k_block2_msg2[4] = { 0x80000000U, 0U, 0U, 0U };
+static const uint32_t k_block2_msg3[4] = { 0U, 0U, 0U, 0x00000100U };
+
+/*
+ * Specialized 2-way second pass for SHA256d(80-byte header).
+ *
+ * Eliminates the double byte-swap that would occur if we used the generic
+ * transform path:
+ *   sha256_state_to_digest()      → state->h[i] (native uint32) → BE bytes
+ *   sha256_transform_armv8_2way() → loads BE bytes → vrev32q_u8 → native uint32
+ * These two conversions cancel out. This function loads the pass1 state words
+ * directly into MSG0/MSG1 without any byte reversal, saving ~52 shift/store/
+ * load/vrev operations per 4-nonce group.
+ *
+ * stA / stB: on entry, contain the pass1 output state (used as the message).
+ *            on exit, overwritten with the pass2 (final) SHA256 digest state.
+ */
+static void sha256_transform_armv8_2way_pass2(
+    sha256_state_t *stA,
+    sha256_state_t *stB
+) {
+    /* Chain A */
+    uint32x4_t SA0, SA1, ABEF_A, CDGH_A;
+    uint32x4_t MA0, MA1, MA2, MA3;
+    uint32x4_t TA0, TA1, TA2;
+    /* Chain B */
+    uint32x4_t SB0, SB1, ABEF_B, CDGH_B;
+    uint32x4_t MB0, MB1, MB2, MB3;
+    uint32x4_t TB0, TB1, TB2;
+
+    /* Load MSG0/MSG1 from pass1 state — no bswap, state words are already
+     * in the native form expected by the SHA2 compression instructions.
+     */
+    MA0 = vld1q_u32(&stA->h[0]);
+    MA1 = vld1q_u32(&stA->h[4]);
+    MB0 = vld1q_u32(&stB->h[0]);
+    MB1 = vld1q_u32(&stB->h[4]);
+
+    /* Constant padding (shared between both chains) */
+    MA2 = vld1q_u32(k_block2_msg2);
+    MA3 = vld1q_u32(k_block2_msg3);
+    MB2 = MA2;
+    MB3 = MA3;
+
+    /* Initialize compression state to SHA256 IV */
+    SA0 = vld1q_u32(&k_sha256_iv[0]);
+    SA1 = vld1q_u32(&k_sha256_iv[4]);
+    ABEF_A = SA0; CDGH_A = SA1;
+    SB0 = SA0; SB1 = SA1;
+    ABEF_B = SB0; CDGH_B = SB1;
+
+    /* ---- Rounds 0-3 ---- */
+    TA0 = vaddq_u32(MA0, vld1q_u32(&k_sha256[0x00]));
+    MA0 = vsha256su0q_u32(MA0, MA1);
+    TA2 = SA0;
+    TB0 = vaddq_u32(MB0, vld1q_u32(&k_sha256[0x00]));
+    MB0 = vsha256su0q_u32(MB0, MB1);
+    TB2 = SB0;
+    SA0 = vsha256hq_u32(SA0, SA1, TA0);
+    SA1 = vsha256h2q_u32(SA1, TA2, TA0);
+    MA0 = vsha256su1q_u32(MA0, MA2, MA3);
+    SB0 = vsha256hq_u32(SB0, SB1, TB0);
+    SB1 = vsha256h2q_u32(SB1, TB2, TB0);
+    MB0 = vsha256su1q_u32(MB0, MB2, MB3);
+
+    /* ---- Rounds 4-7 ---- */
+    TA1 = vaddq_u32(MA1, vld1q_u32(&k_sha256[0x04]));
+    MA1 = vsha256su0q_u32(MA1, MA2);
+    TA2 = SA0;
+    TB1 = vaddq_u32(MB1, vld1q_u32(&k_sha256[0x04]));
+    MB1 = vsha256su0q_u32(MB1, MB2);
+    TB2 = SB0;
+    SA0 = vsha256hq_u32(SA0, SA1, TA1);
+    SA1 = vsha256h2q_u32(SA1, TA2, TA1);
+    MA1 = vsha256su1q_u32(MA1, MA3, MA0);
+    SB0 = vsha256hq_u32(SB0, SB1, TB1);
+    SB1 = vsha256h2q_u32(SB1, TB2, TB1);
+    MB1 = vsha256su1q_u32(MB1, MB3, MB0);
+
+    /* ---- Rounds 8-11 ---- */
+    TA0 = vaddq_u32(MA2, vld1q_u32(&k_sha256[0x08]));
+    MA2 = vsha256su0q_u32(MA2, MA3);
+    TA2 = SA0;
+    TB0 = vaddq_u32(MB2, vld1q_u32(&k_sha256[0x08]));
+    MB2 = vsha256su0q_u32(MB2, MB3);
+    TB2 = SB0;
+    SA0 = vsha256hq_u32(SA0, SA1, TA0);
+    SA1 = vsha256h2q_u32(SA1, TA2, TA0);
+    MA2 = vsha256su1q_u32(MA2, MA0, MA1);
+    SB0 = vsha256hq_u32(SB0, SB1, TB0);
+    SB1 = vsha256h2q_u32(SB1, TB2, TB0);
+    MB2 = vsha256su1q_u32(MB2, MB0, MB1);
+
+    /* ---- Rounds 12-15 ---- */
+    TA1 = vaddq_u32(MA3, vld1q_u32(&k_sha256[0x0c]));
+    MA3 = vsha256su0q_u32(MA3, MA0);
+    TA2 = SA0;
+    TB1 = vaddq_u32(MB3, vld1q_u32(&k_sha256[0x0c]));
+    MB3 = vsha256su0q_u32(MB3, MB0);
+    TB2 = SB0;
+    SA0 = vsha256hq_u32(SA0, SA1, TA1);
+    SA1 = vsha256h2q_u32(SA1, TA2, TA1);
+    MA3 = vsha256su1q_u32(MA3, MA1, MA2);
+    SB0 = vsha256hq_u32(SB0, SB1, TB1);
+    SB1 = vsha256h2q_u32(SB1, TB2, TB1);
+    MB3 = vsha256su1q_u32(MB3, MB1, MB2);
+
+    /* ---- Rounds 16-19 ---- */
+    TA0 = vaddq_u32(MA0, vld1q_u32(&k_sha256[0x10]));
+    MA0 = vsha256su0q_u32(MA0, MA1);
+    TA2 = SA0;
+    TB0 = vaddq_u32(MB0, vld1q_u32(&k_sha256[0x10]));
+    MB0 = vsha256su0q_u32(MB0, MB1);
+    TB2 = SB0;
+    SA0 = vsha256hq_u32(SA0, SA1, TA0);
+    SA1 = vsha256h2q_u32(SA1, TA2, TA0);
+    MA0 = vsha256su1q_u32(MA0, MA2, MA3);
+    SB0 = vsha256hq_u32(SB0, SB1, TB0);
+    SB1 = vsha256h2q_u32(SB1, TB2, TB0);
+    MB0 = vsha256su1q_u32(MB0, MB2, MB3);
+
+    /* ---- Rounds 20-23 ---- */
+    TA1 = vaddq_u32(MA1, vld1q_u32(&k_sha256[0x14]));
+    MA1 = vsha256su0q_u32(MA1, MA2);
+    TA2 = SA0;
+    TB1 = vaddq_u32(MB1, vld1q_u32(&k_sha256[0x14]));
+    MB1 = vsha256su0q_u32(MB1, MB2);
+    TB2 = SB0;
+    SA0 = vsha256hq_u32(SA0, SA1, TA1);
+    SA1 = vsha256h2q_u32(SA1, TA2, TA1);
+    MA1 = vsha256su1q_u32(MA1, MA3, MA0);
+    SB0 = vsha256hq_u32(SB0, SB1, TB1);
+    SB1 = vsha256h2q_u32(SB1, TB2, TB1);
+    MB1 = vsha256su1q_u32(MB1, MB3, MB0);
+
+    /* ---- Rounds 24-27 ---- */
+    TA0 = vaddq_u32(MA2, vld1q_u32(&k_sha256[0x18]));
+    MA2 = vsha256su0q_u32(MA2, MA3);
+    TA2 = SA0;
+    TB0 = vaddq_u32(MB2, vld1q_u32(&k_sha256[0x18]));
+    MB2 = vsha256su0q_u32(MB2, MB3);
+    TB2 = SB0;
+    SA0 = vsha256hq_u32(SA0, SA1, TA0);
+    SA1 = vsha256h2q_u32(SA1, TA2, TA0);
+    MA2 = vsha256su1q_u32(MA2, MA0, MA1);
+    SB0 = vsha256hq_u32(SB0, SB1, TB0);
+    SB1 = vsha256h2q_u32(SB1, TB2, TB0);
+    MB2 = vsha256su1q_u32(MB2, MB0, MB1);
+
+    /* ---- Rounds 28-31 ---- */
+    TA1 = vaddq_u32(MA3, vld1q_u32(&k_sha256[0x1c]));
+    MA3 = vsha256su0q_u32(MA3, MA0);
+    TA2 = SA0;
+    TB1 = vaddq_u32(MB3, vld1q_u32(&k_sha256[0x1c]));
+    MB3 = vsha256su0q_u32(MB3, MB0);
+    TB2 = SB0;
+    SA0 = vsha256hq_u32(SA0, SA1, TA1);
+    SA1 = vsha256h2q_u32(SA1, TA2, TA1);
+    MA3 = vsha256su1q_u32(MA3, MA1, MA2);
+    SB0 = vsha256hq_u32(SB0, SB1, TB1);
+    SB1 = vsha256h2q_u32(SB1, TB2, TB1);
+    MB3 = vsha256su1q_u32(MB3, MB1, MB2);
+
+    /* ---- Rounds 32-35 ---- */
+    TA0 = vaddq_u32(MA0, vld1q_u32(&k_sha256[0x20]));
+    MA0 = vsha256su0q_u32(MA0, MA1);
+    TA2 = SA0;
+    TB0 = vaddq_u32(MB0, vld1q_u32(&k_sha256[0x20]));
+    MB0 = vsha256su0q_u32(MB0, MB1);
+    TB2 = SB0;
+    SA0 = vsha256hq_u32(SA0, SA1, TA0);
+    SA1 = vsha256h2q_u32(SA1, TA2, TA0);
+    MA0 = vsha256su1q_u32(MA0, MA2, MA3);
+    SB0 = vsha256hq_u32(SB0, SB1, TB0);
+    SB1 = vsha256h2q_u32(SB1, TB2, TB0);
+    MB0 = vsha256su1q_u32(MB0, MB2, MB3);
+
+    /* ---- Rounds 36-39 ---- */
+    TA1 = vaddq_u32(MA1, vld1q_u32(&k_sha256[0x24]));
+    MA1 = vsha256su0q_u32(MA1, MA2);
+    TA2 = SA0;
+    TB1 = vaddq_u32(MB1, vld1q_u32(&k_sha256[0x24]));
+    MB1 = vsha256su0q_u32(MB1, MB2);
+    TB2 = SB0;
+    SA0 = vsha256hq_u32(SA0, SA1, TA1);
+    SA1 = vsha256h2q_u32(SA1, TA2, TA1);
+    MA1 = vsha256su1q_u32(MA1, MA3, MA0);
+    SB0 = vsha256hq_u32(SB0, SB1, TB1);
+    SB1 = vsha256h2q_u32(SB1, TB2, TB1);
+    MB1 = vsha256su1q_u32(MB1, MB3, MB0);
+
+    /* ---- Rounds 40-43 ---- */
+    TA0 = vaddq_u32(MA2, vld1q_u32(&k_sha256[0x28]));
+    MA2 = vsha256su0q_u32(MA2, MA3);
+    TA2 = SA0;
+    TB0 = vaddq_u32(MB2, vld1q_u32(&k_sha256[0x28]));
+    MB2 = vsha256su0q_u32(MB2, MB3);
+    TB2 = SB0;
+    SA0 = vsha256hq_u32(SA0, SA1, TA0);
+    SA1 = vsha256h2q_u32(SA1, TA2, TA0);
+    MA2 = vsha256su1q_u32(MA2, MA0, MA1);
+    SB0 = vsha256hq_u32(SB0, SB1, TB0);
+    SB1 = vsha256h2q_u32(SB1, TB2, TB0);
+    MB2 = vsha256su1q_u32(MB2, MB0, MB1);
+
+    /* ---- Rounds 44-47 ---- */
+    TA1 = vaddq_u32(MA3, vld1q_u32(&k_sha256[0x2c]));
+    MA3 = vsha256su0q_u32(MA3, MA0);
+    TA2 = SA0;
+    TB1 = vaddq_u32(MB3, vld1q_u32(&k_sha256[0x2c]));
+    MB3 = vsha256su0q_u32(MB3, MB0);
+    TB2 = SB0;
+    SA0 = vsha256hq_u32(SA0, SA1, TA1);
+    SA1 = vsha256h2q_u32(SA1, TA2, TA1);
+    MA3 = vsha256su1q_u32(MA3, MA1, MA2);
+    SB0 = vsha256hq_u32(SB0, SB1, TB1);
+    SB1 = vsha256h2q_u32(SB1, TB2, TB1);
+    MB3 = vsha256su1q_u32(MB3, MB1, MB2);
+
+    /* ---- Rounds 48-51 ---- */
+    TA0 = vaddq_u32(MA0, vld1q_u32(&k_sha256[0x30]));
+    TA2 = SA0;
+    TB0 = vaddq_u32(MB0, vld1q_u32(&k_sha256[0x30]));
+    TB2 = SB0;
+    SA0 = vsha256hq_u32(SA0, SA1, TA0);
+    SA1 = vsha256h2q_u32(SA1, TA2, TA0);
+    SB0 = vsha256hq_u32(SB0, SB1, TB0);
+    SB1 = vsha256h2q_u32(SB1, TB2, TB0);
+
+    /* ---- Rounds 52-55 ---- */
+    TA1 = vaddq_u32(MA1, vld1q_u32(&k_sha256[0x34]));
+    TA2 = SA0;
+    TB1 = vaddq_u32(MB1, vld1q_u32(&k_sha256[0x34]));
+    TB2 = SB0;
+    SA0 = vsha256hq_u32(SA0, SA1, TA1);
+    SA1 = vsha256h2q_u32(SA1, TA2, TA1);
+    SB0 = vsha256hq_u32(SB0, SB1, TB1);
+    SB1 = vsha256h2q_u32(SB1, TB2, TB1);
+
+    /* ---- Rounds 56-59 ---- */
+    TA0 = vaddq_u32(MA2, vld1q_u32(&k_sha256[0x38]));
+    TA2 = SA0;
+    TB0 = vaddq_u32(MB2, vld1q_u32(&k_sha256[0x38]));
+    TB2 = SB0;
+    SA0 = vsha256hq_u32(SA0, SA1, TA0);
+    SA1 = vsha256h2q_u32(SA1, TA2, TA0);
+    SB0 = vsha256hq_u32(SB0, SB1, TB0);
+    SB1 = vsha256h2q_u32(SB1, TB2, TB0);
+
+    /* ---- Rounds 60-63 ---- */
+    TA1 = vaddq_u32(MA3, vld1q_u32(&k_sha256[0x3c]));
+    TA2 = SA0;
+    TB1 = vaddq_u32(MB3, vld1q_u32(&k_sha256[0x3c]));
+    TB2 = SB0;
+    SA0 = vsha256hq_u32(SA0, SA1, TA1);
+    SA1 = vsha256h2q_u32(SA1, TA2, TA1);
+    SB0 = vsha256hq_u32(SB0, SB1, TB1);
+    SB1 = vsha256h2q_u32(SB1, TB2, TB1);
+
+    SA0 = vaddq_u32(SA0, ABEF_A);
+    SA1 = vaddq_u32(SA1, CDGH_A);
+    SB0 = vaddq_u32(SB0, ABEF_B);
+    SB1 = vaddq_u32(SB1, CDGH_B);
+
+    vst1q_u32(&stA->h[0], SA0);
+    vst1q_u32(&stA->h[4], SA1);
+    vst1q_u32(&stB->h[0], SB0);
+    vst1q_u32(&stB->h[4], SB1);
+}
+
 /*
  * 4-way hash using 2-way interleaved transforms.
- * Pairs (0,1) and (2,3) are processed simultaneously, hiding SHA2 instruction
- * latency. Avoids redundant memcpy by reusing block buffers across the 4 nonces.
+ * Pass 1: two calls to sha256_transform_armv8_2way (pairs 0,1 and 2,3).
+ * Pass 2: two calls to sha256_transform_armv8_2way_pass2, which reads the
+ *         pass1 state words directly — no intermediate byte serialization,
+ *         no double bswap overhead.
  */
 static void sha256d80_hash_4way_armv8_2way(
     const sha256d80_midstate_t *mid,
     uint32_t start_nonce,
     sha256_state_t out_states[4]
 ) {
-    /* Two block1 buffers reused for both pairs */
     uint8_t block1A[64], block1B[64];
-    /* Two block2 buffers: only bytes 0-31 (digest) vary; bytes 32-63 are constant */
-    uint8_t block2A[64], block2B[64];
 
-    /* Copy template once each (1x instead of 4x) */
     memcpy(block1A, mid->block1_template, 64);
     memcpy(block1B, mid->block1_template, 64);
-    /* Copy only the constant padding half of block2 template (32 bytes each) */
-    memcpy(block2A + 32, mid->block2_template + 32, 32);
-    memcpy(block2B + 32, mid->block2_template + 32, 32);
 
     /* --- Pass 1, pair (0,1) --- */
     out_states[0] = mid->first_chunk_state;
@@ -525,19 +796,9 @@ static void sha256d80_hash_4way_armv8_2way(
     set_block1_nonce(block1B, start_nonce + 3U);
     sha256_transform_armv8_2way(&out_states[2], block1A, &out_states[3], block1B);
 
-    /* --- Pass 2, pair (0,1): write digest directly into block2 bytes 0-31 --- */
-    sha256_state_to_digest(&out_states[0], block2A);
-    sha256_state_to_digest(&out_states[1], block2B);
-    out_states[0] = mid->init_state;
-    out_states[1] = mid->init_state;
-    sha256_transform_armv8_2way(&out_states[0], block2A, &out_states[1], block2B);
-
-    /* --- Pass 2, pair (2,3) --- */
-    sha256_state_to_digest(&out_states[2], block2A);
-    sha256_state_to_digest(&out_states[3], block2B);
-    out_states[2] = mid->init_state;
-    out_states[3] = mid->init_state;
-    sha256_transform_armv8_2way(&out_states[2], block2A, &out_states[3], block2B);
+    /* --- Pass 2: state words fed directly, no bswap round-trip --- */
+    sha256_transform_armv8_2way_pass2(&out_states[0], &out_states[1]);
+    sha256_transform_armv8_2way_pass2(&out_states[2], &out_states[3]);
 }
 
 #endif
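
A quick scalar illustration of the "two conversions cancel out" claim from the
commit message. The demo_write_u32_be / demo_read_u32_be helpers below are
illustrative stand-ins for the big-endian serialization done by
sha256_state_to_digest() and for the byte swap that the generic transform's
vrev32q_u8 load performs on a little-endian core; they are not the project's
actual helpers, and the snippet is independent of the patch.

    #include <assert.h>
    #include <stdint.h>

    /* Stand-in for the old pass-2 setup: serialize a native state word as
     * big-endian digest bytes. */
    static void demo_write_u32_be(uint8_t out[4], uint32_t w)
    {
        out[0] = (uint8_t)(w >> 24);
        out[1] = (uint8_t)(w >> 16);
        out[2] = (uint8_t)(w >> 8);
        out[3] = (uint8_t)(w);
    }

    /* Stand-in for the generic transform's load stage: read big-endian bytes
     * back into a native word. */
    static uint32_t demo_read_u32_be(const uint8_t in[4])
    {
        return ((uint32_t)in[0] << 24) | ((uint32_t)in[1] << 16) |
               ((uint32_t)in[2] << 8)  |  (uint32_t)in[3];
    }

    int main(void)
    {
        uint32_t h = 0x6a09e667U;   /* any pass-1 state word */
        uint8_t bytes[4];

        demo_write_u32_be(bytes, h);            /* state word -> BE bytes  */
        uint32_t w = demo_read_u32_be(bytes);   /* BE bytes -> message word */

        assert(w == h);   /* the round trip is the identity */
        return 0;
    }

Because the round trip is the identity, pass 2 can consume stA->h[] and
stB->h[] directly as message words W[0..7], which is what
sha256_transform_armv8_2way_pass2 does.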