perf(sha256): eliminate double bswap between SHA256d pass1 and pass2

Add sha256_transform_armv8_2way_pass2 which reads the pass1 output state
words directly into MSG0/MSG1 without byte serialization. Previously:
  sha256_state_to_digest() → native uint32 → BE bytes (8x write_u32_be)
  sha256_transform load   → BE bytes → vrev32q_u8 → native uint32 (4x)
These two conversions cancel out. The new path skips both, saving ~52
shift/store/load/vrev ops per 4-nonce group, and eliminates the two
64-byte block2 stack buffers from sha256d80_hash_4way_armv8_2way.
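
For reference, a minimal sketch of the round-trip that cancels (the names st
and block2, and the write_u32_be argument order, are illustrative assumptions):

  /* old path: pass1 state serialized to BE bytes, then re-reversed on load */
  for (int i = 0; i < 8; i++)
      write_u32_be(block2 + 4 * i, st->h[i]);       /* native u32 -> BE bytes */
  uint32x4_t w_old = vreinterpretq_u32_u8(
      vrev32q_u8(vld1q_u8(block2)));                /* BE bytes -> native u32 */

  /* new path (this commit): state words are already native, load directly */
  uint32x4_t w_new = vld1q_u32(&st->h[0]);          /* w_new equals w_old */
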
2026-03-30 11:13:49 +02:00
parent 8709072574
commit b6af554b0c


@@ -489,27 +489,298 @@ static void sha256_transform_armv8_2way(
vst1q_u32(&stB->h[4], SB1);
}
/*
* Constant message words W[8..15] for the second SHA256 pass.
* block2 = [digest(32 bytes) | 0x80 | zeros | length(0x100 bits)]
* Expressed as native uint32 words (i.e. after the byte reversal that the
* generic transform's vrev32q_u8 load would apply), these padding words are
* always: W[8]=0x80000000, W[9..14]=0, W[15]=0x00000100.
* Preloading them avoids redundant memory reads per call.
*/
static const uint32_t k_block2_msg2[4] = { 0x80000000U, 0U, 0U, 0U };
static const uint32_t k_block2_msg3[4] = { 0U, 0U, 0U, 0x00000100U };
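/* Mapping: k_block2_msg2 holds W[8..11], k_block2_msg3 holds W[12..15]. */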
/*
* Specialized 2-way second pass for SHA256d(80-byte header).
*
* Eliminates the double byte-swap that would occur if we used the generic
* transform path:
* sha256_state_to_digest() → state->h[i] (native uint32) → BE bytes
* sha256_transform_armv8_2way() → loads BE bytes → vrev32q_u8 → native uint32
* These two conversions cancel out. This function loads the pass1 state words
* directly into MSG0/MSG1 without any byte reversal, saving ~52 shift/store/
* load/vrev operations per 4-nonce group.
*
* stA / stB: on entry, contain the pass1 output state (used as the message);
*            on exit, overwritten with the pass2 (final) SHA256 digest state.
*/
static void sha256_transform_armv8_2way_pass2(
sha256_state_t *stA,
sha256_state_t *stB
) {
/* Chain A */
uint32x4_t SA0, SA1, ABEF_A, CDGH_A;
uint32x4_t MA0, MA1, MA2, MA3;
uint32x4_t TA0, TA1, TA2;
/* Chain B */
uint32x4_t SB0, SB1, ABEF_B, CDGH_B;
uint32x4_t MB0, MB1, MB2, MB3;
uint32x4_t TB0, TB1, TB2;
/* Load MSG0/MSG1 from pass1 state — no bswap, state words are already
* in the native form expected by the SHA2 compression instructions. */
MA0 = vld1q_u32(&stA->h[0]);
MA1 = vld1q_u32(&stA->h[4]);
MB0 = vld1q_u32(&stB->h[0]);
MB1 = vld1q_u32(&stB->h[4]);
/* Constant padding (shared between both chains) */
MA2 = vld1q_u32(k_block2_msg2);
MA3 = vld1q_u32(k_block2_msg3);
MB2 = MA2;
MB3 = MA3;
/* Initialize compression state to SHA256 IV */
SA0 = vld1q_u32(&k_sha256_iv[0]);
SA1 = vld1q_u32(&k_sha256_iv[4]);
ABEF_A = SA0; CDGH_A = SA1;
SB0 = SA0; SB1 = SA1;
ABEF_B = SB0; CDGH_B = SB1;
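/* 64 rounds in 16 groups of 4; chains A and B are interleaved to hide the
 * latency of the sha256h/sha256h2 instructions. */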
/* ---- Rounds 0-3 ---- */
TA0 = vaddq_u32(MA0, vld1q_u32(&k_sha256[0x00]));
MA0 = vsha256su0q_u32(MA0, MA1);
TA2 = SA0;
TB0 = vaddq_u32(MB0, vld1q_u32(&k_sha256[0x00]));
MB0 = vsha256su0q_u32(MB0, MB1);
TB2 = SB0;
SA0 = vsha256hq_u32(SA0, SA1, TA0);
SA1 = vsha256h2q_u32(SA1, TA2, TA0);
MA0 = vsha256su1q_u32(MA0, MA2, MA3);
SB0 = vsha256hq_u32(SB0, SB1, TB0);
SB1 = vsha256h2q_u32(SB1, TB2, TB0);
MB0 = vsha256su1q_u32(MB0, MB2, MB3);
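/* Rounds 4-47 repeat this pattern, rotating through MSG0..MSG3: add the round
 * constants, run sha256h/sha256h2 on both chains, and expand the message
 * schedule (su0/su1) for use four round-groups later. */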
/* ---- Rounds 4-7 ---- */
TA1 = vaddq_u32(MA1, vld1q_u32(&k_sha256[0x04]));
MA1 = vsha256su0q_u32(MA1, MA2);
TA2 = SA0;
TB1 = vaddq_u32(MB1, vld1q_u32(&k_sha256[0x04]));
MB1 = vsha256su0q_u32(MB1, MB2);
TB2 = SB0;
SA0 = vsha256hq_u32(SA0, SA1, TA1);
SA1 = vsha256h2q_u32(SA1, TA2, TA1);
MA1 = vsha256su1q_u32(MA1, MA3, MA0);
SB0 = vsha256hq_u32(SB0, SB1, TB1);
SB1 = vsha256h2q_u32(SB1, TB2, TB1);
MB1 = vsha256su1q_u32(MB1, MB3, MB0);
/* ---- Rounds 8-11 ---- */
TA0 = vaddq_u32(MA2, vld1q_u32(&k_sha256[0x08]));
MA2 = vsha256su0q_u32(MA2, MA3);
TA2 = SA0;
TB0 = vaddq_u32(MB2, vld1q_u32(&k_sha256[0x08]));
MB2 = vsha256su0q_u32(MB2, MB3);
TB2 = SB0;
SA0 = vsha256hq_u32(SA0, SA1, TA0);
SA1 = vsha256h2q_u32(SA1, TA2, TA0);
MA2 = vsha256su1q_u32(MA2, MA0, MA1);
SB0 = vsha256hq_u32(SB0, SB1, TB0);
SB1 = vsha256h2q_u32(SB1, TB2, TB0);
MB2 = vsha256su1q_u32(MB2, MB0, MB1);
/* ---- Rounds 12-15 ---- */
TA1 = vaddq_u32(MA3, vld1q_u32(&k_sha256[0x0c]));
MA3 = vsha256su0q_u32(MA3, MA0);
TA2 = SA0;
TB1 = vaddq_u32(MB3, vld1q_u32(&k_sha256[0x0c]));
MB3 = vsha256su0q_u32(MB3, MB0);
TB2 = SB0;
SA0 = vsha256hq_u32(SA0, SA1, TA1);
SA1 = vsha256h2q_u32(SA1, TA2, TA1);
MA3 = vsha256su1q_u32(MA3, MA1, MA2);
SB0 = vsha256hq_u32(SB0, SB1, TB1);
SB1 = vsha256h2q_u32(SB1, TB2, TB1);
MB3 = vsha256su1q_u32(MB3, MB1, MB2);
/* ---- Rounds 16-19 ---- */
TA0 = vaddq_u32(MA0, vld1q_u32(&k_sha256[0x10]));
MA0 = vsha256su0q_u32(MA0, MA1);
TA2 = SA0;
TB0 = vaddq_u32(MB0, vld1q_u32(&k_sha256[0x10]));
MB0 = vsha256su0q_u32(MB0, MB1);
TB2 = SB0;
SA0 = vsha256hq_u32(SA0, SA1, TA0);
SA1 = vsha256h2q_u32(SA1, TA2, TA0);
MA0 = vsha256su1q_u32(MA0, MA2, MA3);
SB0 = vsha256hq_u32(SB0, SB1, TB0);
SB1 = vsha256h2q_u32(SB1, TB2, TB0);
MB0 = vsha256su1q_u32(MB0, MB2, MB3);
/* ---- Rounds 20-23 ---- */
TA1 = vaddq_u32(MA1, vld1q_u32(&k_sha256[0x14]));
MA1 = vsha256su0q_u32(MA1, MA2);
TA2 = SA0;
TB1 = vaddq_u32(MB1, vld1q_u32(&k_sha256[0x14]));
MB1 = vsha256su0q_u32(MB1, MB2);
TB2 = SB0;
SA0 = vsha256hq_u32(SA0, SA1, TA1);
SA1 = vsha256h2q_u32(SA1, TA2, TA1);
MA1 = vsha256su1q_u32(MA1, MA3, MA0);
SB0 = vsha256hq_u32(SB0, SB1, TB1);
SB1 = vsha256h2q_u32(SB1, TB2, TB1);
MB1 = vsha256su1q_u32(MB1, MB3, MB0);
/* ---- Rounds 24-27 ---- */
TA0 = vaddq_u32(MA2, vld1q_u32(&k_sha256[0x18]));
MA2 = vsha256su0q_u32(MA2, MA3);
TA2 = SA0;
TB0 = vaddq_u32(MB2, vld1q_u32(&k_sha256[0x18]));
MB2 = vsha256su0q_u32(MB2, MB3);
TB2 = SB0;
SA0 = vsha256hq_u32(SA0, SA1, TA0);
SA1 = vsha256h2q_u32(SA1, TA2, TA0);
MA2 = vsha256su1q_u32(MA2, MA0, MA1);
SB0 = vsha256hq_u32(SB0, SB1, TB0);
SB1 = vsha256h2q_u32(SB1, TB2, TB0);
MB2 = vsha256su1q_u32(MB2, MB0, MB1);
/* ---- Rounds 28-31 ---- */
TA1 = vaddq_u32(MA3, vld1q_u32(&k_sha256[0x1c]));
MA3 = vsha256su0q_u32(MA3, MA0);
TA2 = SA0;
TB1 = vaddq_u32(MB3, vld1q_u32(&k_sha256[0x1c]));
MB3 = vsha256su0q_u32(MB3, MB0);
TB2 = SB0;
SA0 = vsha256hq_u32(SA0, SA1, TA1);
SA1 = vsha256h2q_u32(SA1, TA2, TA1);
MA3 = vsha256su1q_u32(MA3, MA1, MA2);
SB0 = vsha256hq_u32(SB0, SB1, TB1);
SB1 = vsha256h2q_u32(SB1, TB2, TB1);
MB3 = vsha256su1q_u32(MB3, MB1, MB2);
/* ---- Rounds 32-35 ---- */
TA0 = vaddq_u32(MA0, vld1q_u32(&k_sha256[0x20]));
MA0 = vsha256su0q_u32(MA0, MA1);
TA2 = SA0;
TB0 = vaddq_u32(MB0, vld1q_u32(&k_sha256[0x20]));
MB0 = vsha256su0q_u32(MB0, MB1);
TB2 = SB0;
SA0 = vsha256hq_u32(SA0, SA1, TA0);
SA1 = vsha256h2q_u32(SA1, TA2, TA0);
MA0 = vsha256su1q_u32(MA0, MA2, MA3);
SB0 = vsha256hq_u32(SB0, SB1, TB0);
SB1 = vsha256h2q_u32(SB1, TB2, TB0);
MB0 = vsha256su1q_u32(MB0, MB2, MB3);
/* ---- Rounds 36-39 ---- */
TA1 = vaddq_u32(MA1, vld1q_u32(&k_sha256[0x24]));
MA1 = vsha256su0q_u32(MA1, MA2);
TA2 = SA0;
TB1 = vaddq_u32(MB1, vld1q_u32(&k_sha256[0x24]));
MB1 = vsha256su0q_u32(MB1, MB2);
TB2 = SB0;
SA0 = vsha256hq_u32(SA0, SA1, TA1);
SA1 = vsha256h2q_u32(SA1, TA2, TA1);
MA1 = vsha256su1q_u32(MA1, MA3, MA0);
SB0 = vsha256hq_u32(SB0, SB1, TB1);
SB1 = vsha256h2q_u32(SB1, TB2, TB1);
MB1 = vsha256su1q_u32(MB1, MB3, MB0);
/* ---- Rounds 40-43 ---- */
TA0 = vaddq_u32(MA2, vld1q_u32(&k_sha256[0x28]));
MA2 = vsha256su0q_u32(MA2, MA3);
TA2 = SA0;
TB0 = vaddq_u32(MB2, vld1q_u32(&k_sha256[0x28]));
MB2 = vsha256su0q_u32(MB2, MB3);
TB2 = SB0;
SA0 = vsha256hq_u32(SA0, SA1, TA0);
SA1 = vsha256h2q_u32(SA1, TA2, TA0);
MA2 = vsha256su1q_u32(MA2, MA0, MA1);
SB0 = vsha256hq_u32(SB0, SB1, TB0);
SB1 = vsha256h2q_u32(SB1, TB2, TB0);
MB2 = vsha256su1q_u32(MB2, MB0, MB1);
/* ---- Rounds 44-47 ---- */
TA1 = vaddq_u32(MA3, vld1q_u32(&k_sha256[0x2c]));
MA3 = vsha256su0q_u32(MA3, MA0);
TA2 = SA0;
TB1 = vaddq_u32(MB3, vld1q_u32(&k_sha256[0x2c]));
MB3 = vsha256su0q_u32(MB3, MB0);
TB2 = SB0;
SA0 = vsha256hq_u32(SA0, SA1, TA1);
SA1 = vsha256h2q_u32(SA1, TA2, TA1);
MA3 = vsha256su1q_u32(MA3, MA1, MA2);
SB0 = vsha256hq_u32(SB0, SB1, TB1);
SB1 = vsha256h2q_u32(SB1, TB2, TB1);
MB3 = vsha256su1q_u32(MB3, MB1, MB2);
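/* Rounds 48-63: all 64 message-schedule words have been computed, so the
 * remaining groups need no su0/su1 expansion. */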
/* ---- Rounds 48-51 ---- */
TA0 = vaddq_u32(MA0, vld1q_u32(&k_sha256[0x30]));
TA2 = SA0;
TB0 = vaddq_u32(MB0, vld1q_u32(&k_sha256[0x30]));
TB2 = SB0;
SA0 = vsha256hq_u32(SA0, SA1, TA0);
SA1 = vsha256h2q_u32(SA1, TA2, TA0);
SB0 = vsha256hq_u32(SB0, SB1, TB0);
SB1 = vsha256h2q_u32(SB1, TB2, TB0);
/* ---- Rounds 52-55 ---- */
TA1 = vaddq_u32(MA1, vld1q_u32(&k_sha256[0x34]));
TA2 = SA0;
TB1 = vaddq_u32(MB1, vld1q_u32(&k_sha256[0x34]));
TB2 = SB0;
SA0 = vsha256hq_u32(SA0, SA1, TA1);
SA1 = vsha256h2q_u32(SA1, TA2, TA1);
SB0 = vsha256hq_u32(SB0, SB1, TB1);
SB1 = vsha256h2q_u32(SB1, TB2, TB1);
/* ---- Rounds 56-59 ---- */
TA0 = vaddq_u32(MA2, vld1q_u32(&k_sha256[0x38]));
TA2 = SA0;
TB0 = vaddq_u32(MB2, vld1q_u32(&k_sha256[0x38]));
TB2 = SB0;
SA0 = vsha256hq_u32(SA0, SA1, TA0);
SA1 = vsha256h2q_u32(SA1, TA2, TA0);
SB0 = vsha256hq_u32(SB0, SB1, TB0);
SB1 = vsha256h2q_u32(SB1, TB2, TB0);
/* ---- Rounds 60-63 ---- */
TA1 = vaddq_u32(MA3, vld1q_u32(&k_sha256[0x3c]));
TA2 = SA0;
TB1 = vaddq_u32(MB3, vld1q_u32(&k_sha256[0x3c]));
TB2 = SB0;
SA0 = vsha256hq_u32(SA0, SA1, TA1);
SA1 = vsha256h2q_u32(SA1, TA2, TA1);
SB0 = vsha256hq_u32(SB0, SB1, TB1);
SB1 = vsha256h2q_u32(SB1, TB2, TB1);
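/* Feed-forward: add the round output to the initial state (the plain SHA256
 * IV for this single-block second pass) and store the final digest state. */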
SA0 = vaddq_u32(SA0, ABEF_A);
SA1 = vaddq_u32(SA1, CDGH_A);
SB0 = vaddq_u32(SB0, ABEF_B);
SB1 = vaddq_u32(SB1, CDGH_B);
vst1q_u32(&stA->h[0], SA0);
vst1q_u32(&stA->h[4], SA1);
vst1q_u32(&stB->h[0], SB0);
vst1q_u32(&stB->h[4], SB1);
}
/*
* 4-way hash using 2-way interleaved transforms.
* Pairs (0,1) and (2,3) are processed simultaneously, hiding SHA2 instruction
* latency. Avoids redundant memcpy by reusing block buffers across the 4 nonces.
* Pass 1: two calls to sha256_transform_armv8_2way (pairs 0,1 and 2,3).
* Pass 2: two calls to sha256_transform_armv8_2way_pass2, which reads the
* pass1 state words directly — no intermediate byte serialization,
* no double bswap overhead.
*/
static void sha256d80_hash_4way_armv8_2way(
const sha256d80_midstate_t *mid,
uint32_t start_nonce,
sha256_state_t out_states[4]
) {
/* Two block1 buffers reused for both pairs */
uint8_t block1A[64], block1B[64];
/* Two block2 buffers: only bytes 0-31 (digest) vary; bytes 32-63 are constant */
uint8_t block2A[64], block2B[64];
/* Copy template once each (1x instead of 4x) */
memcpy(block1A, mid->block1_template, 64);
memcpy(block1B, mid->block1_template, 64);
/* Copy only the constant padding half of block2 template (32 bytes each) */
memcpy(block2A + 32, mid->block2_template + 32, 32);
memcpy(block2B + 32, mid->block2_template + 32, 32);
/* --- Pass 1, pair (0,1) --- */
out_states[0] = mid->first_chunk_state;
@@ -525,19 +796,9 @@ static void sha256d80_hash_4way_armv8_2way(
set_block1_nonce(block1B, start_nonce + 3U);
sha256_transform_armv8_2way(&out_states[2], block1A, &out_states[3], block1B);
/* --- Pass 2, pair (0,1): write digest directly into block2 bytes 0-31 --- */
sha256_state_to_digest(&out_states[0], block2A);
sha256_state_to_digest(&out_states[1], block2B);
out_states[0] = mid->init_state;
out_states[1] = mid->init_state;
sha256_transform_armv8_2way(&out_states[0], block2A, &out_states[1], block2B);
/* --- Pass 2, pair (2,3) --- */
sha256_state_to_digest(&out_states[2], block2A);
sha256_state_to_digest(&out_states[3], block2B);
out_states[2] = mid->init_state;
out_states[3] = mid->init_state;
sha256_transform_armv8_2way(&out_states[2], block2A, &out_states[3], block2B);
/* --- Pass 2: state words fed directly, no bswap round-trip --- */
sha256_transform_armv8_2way_pass2(&out_states[0], &out_states[1]);
sha256_transform_armv8_2way_pass2(&out_states[2], &out_states[3]);
}
#endif