perf(sha256): add ARMv8 2-way interleaved transform and scan_4way_direct

Process two independent SHA256 chains simultaneously to hide the 2-cycle
latency of vsha256hq_u32 on Cortex-A76, approaching full throughput.
Also reduces memcpy traffic from 512 to ~192 bytes per 4-nonce group by
reusing block buffers, and adds sha256d80_scan_4way_direct to skip the
per-call pthread_once check (an LDAR acquire barrier) in the inner loop.
commit 7d4096749a
parent b2f0090236
2026-03-30 10:41:59 +02:00
3 changed files with 403 additions and 3 deletions
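
Caller-side usage, for context (a hypothetical sketch, not part of this commit): the intended pattern is to pay the pthread_once barrier once via sha256_backend_ensure_init() and then call sha256d80_scan_4way_direct() per 4-nonce group in the hot loop. The scan_range() name, loop bounds, and hit handling below are illustrative only; the declarations come from the backend header changed in this commit.

static uint32_t scan_range(const uint8_t header76[76],
                           const uint32_t target_words[8],
                           uint32_t first_nonce, uint32_t count)
{
    sha256d80_midstate_t mid;
    sha256_state_t states[4];
    uint32_t nonce;

    sha256d80_midstate_init(&mid, header76);  /* precompute midstate once per work item */
    sha256_backend_ensure_init();             /* one-time pthread_once (LDAR) outside the loop */

    for (nonce = first_nonce; nonce < first_nonce + count; nonce += 4U) {
        /* no pthread_once check inside the loop */
        uint32_t mask = sha256d80_scan_4way_direct(&mid, nonce, target_words, states);
        if (mask != 0U) {
            /* bit i set means nonce + i met the target; states[i] holds its hash state */
            return nonce + (uint32_t)__builtin_ctz(mask);  /* GCC/Clang builtin */
        }
    }
    return 0U; /* sentinel for "no hit" in this sketch */
}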

@@ -231,6 +231,314 @@ static void sha256_transform_armv8(sha256_state_t *state, const uint8_t block[64
void sha256d80_4way_aarch64_impl(const sha256d80_midstate_t *mid, uint32_t start_nonce, sha256_state_t out_states[4]);
extern void sha256d80_4way_aarch64_kernel(const sha256d80_midstate_t *mid, uint32_t start_nonce, sha256_state_t out_states[4]);
/*
* 2-way interleaved SHA256 block compression.
* Processes two independent (state, block) pairs simultaneously to hide the
* 2-cycle latency of vsha256hq_u32 on Cortex-A76 (latency=2, throughput=1).
* While chain A is waiting for its result, chain B's instruction issues,
* filling the pipeline bubble and approaching full throughput utilization.
*/
static void sha256_transform_armv8_2way(
sha256_state_t *stA, const uint8_t blkA[64],
sha256_state_t *stB, const uint8_t blkB[64]
) {
/* Chain A state */
uint32x4_t SA0, SA1, ABEF_A, CDGH_A;
uint32x4_t MA0, MA1, MA2, MA3;
uint32x4_t TA0, TA1, TA2;
/* Chain B state */
uint32x4_t SB0, SB1, ABEF_B, CDGH_B;
uint32x4_t MB0, MB1, MB2, MB3;
uint32x4_t TB0, TB1, TB2;
/* Load and byte-swap A */
SA0 = vld1q_u32(&stA->h[0]);
SA1 = vld1q_u32(&stA->h[4]);
ABEF_A = SA0; CDGH_A = SA1;
MA0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(vld1q_u32((const uint32_t *)(blkA + 0)))));
MA1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(vld1q_u32((const uint32_t *)(blkA + 16)))));
MA2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(vld1q_u32((const uint32_t *)(blkA + 32)))));
MA3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(vld1q_u32((const uint32_t *)(blkA + 48)))));
/* Load and byte-swap B */
SB0 = vld1q_u32(&stB->h[0]);
SB1 = vld1q_u32(&stB->h[4]);
ABEF_B = SB0; CDGH_B = SB1;
MB0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(vld1q_u32((const uint32_t *)(blkB + 0)))));
MB1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(vld1q_u32((const uint32_t *)(blkB + 16)))));
MB2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(vld1q_u32((const uint32_t *)(blkB + 32)))));
MB3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(vld1q_u32((const uint32_t *)(blkB + 48)))));
/* ---- Rounds 0-3 ---- */
TA0 = vaddq_u32(MA0, vld1q_u32(&k_sha256[0x00]));
MA0 = vsha256su0q_u32(MA0, MA1);
TA2 = SA0;
TB0 = vaddq_u32(MB0, vld1q_u32(&k_sha256[0x00]));
MB0 = vsha256su0q_u32(MB0, MB1);
TB2 = SB0;
SA0 = vsha256hq_u32(SA0, SA1, TA0);
SA1 = vsha256h2q_u32(SA1, TA2, TA0);
MA0 = vsha256su1q_u32(MA0, MA2, MA3);
SB0 = vsha256hq_u32(SB0, SB1, TB0);
SB1 = vsha256h2q_u32(SB1, TB2, TB0);
MB0 = vsha256su1q_u32(MB0, MB2, MB3);
/* ---- Rounds 4-7 ---- */
TA1 = vaddq_u32(MA1, vld1q_u32(&k_sha256[0x04]));
MA1 = vsha256su0q_u32(MA1, MA2);
TA2 = SA0;
TB1 = vaddq_u32(MB1, vld1q_u32(&k_sha256[0x04]));
MB1 = vsha256su0q_u32(MB1, MB2);
TB2 = SB0;
SA0 = vsha256hq_u32(SA0, SA1, TA1);
SA1 = vsha256h2q_u32(SA1, TA2, TA1);
MA1 = vsha256su1q_u32(MA1, MA3, MA0);
SB0 = vsha256hq_u32(SB0, SB1, TB1);
SB1 = vsha256h2q_u32(SB1, TB2, TB1);
MB1 = vsha256su1q_u32(MB1, MB3, MB0);
/* ---- Rounds 8-11 ---- */
TA0 = vaddq_u32(MA2, vld1q_u32(&k_sha256[0x08]));
MA2 = vsha256su0q_u32(MA2, MA3);
TA2 = SA0;
TB0 = vaddq_u32(MB2, vld1q_u32(&k_sha256[0x08]));
MB2 = vsha256su0q_u32(MB2, MB3);
TB2 = SB0;
SA0 = vsha256hq_u32(SA0, SA1, TA0);
SA1 = vsha256h2q_u32(SA1, TA2, TA0);
MA2 = vsha256su1q_u32(MA2, MA0, MA1);
SB0 = vsha256hq_u32(SB0, SB1, TB0);
SB1 = vsha256h2q_u32(SB1, TB2, TB0);
MB2 = vsha256su1q_u32(MB2, MB0, MB1);
/* ---- Rounds 12-15 ---- */
TA1 = vaddq_u32(MA3, vld1q_u32(&k_sha256[0x0c]));
MA3 = vsha256su0q_u32(MA3, MA0);
TA2 = SA0;
TB1 = vaddq_u32(MB3, vld1q_u32(&k_sha256[0x0c]));
MB3 = vsha256su0q_u32(MB3, MB0);
TB2 = SB0;
SA0 = vsha256hq_u32(SA0, SA1, TA1);
SA1 = vsha256h2q_u32(SA1, TA2, TA1);
MA3 = vsha256su1q_u32(MA3, MA1, MA2);
SB0 = vsha256hq_u32(SB0, SB1, TB1);
SB1 = vsha256h2q_u32(SB1, TB2, TB1);
MB3 = vsha256su1q_u32(MB3, MB1, MB2);
/* ---- Rounds 16-19 ---- */
TA0 = vaddq_u32(MA0, vld1q_u32(&k_sha256[0x10]));
MA0 = vsha256su0q_u32(MA0, MA1);
TA2 = SA0;
TB0 = vaddq_u32(MB0, vld1q_u32(&k_sha256[0x10]));
MB0 = vsha256su0q_u32(MB0, MB1);
TB2 = SB0;
SA0 = vsha256hq_u32(SA0, SA1, TA0);
SA1 = vsha256h2q_u32(SA1, TA2, TA0);
MA0 = vsha256su1q_u32(MA0, MA2, MA3);
SB0 = vsha256hq_u32(SB0, SB1, TB0);
SB1 = vsha256h2q_u32(SB1, TB2, TB0);
MB0 = vsha256su1q_u32(MB0, MB2, MB3);
/* ---- Rounds 20-23 ---- */
TA1 = vaddq_u32(MA1, vld1q_u32(&k_sha256[0x14]));
MA1 = vsha256su0q_u32(MA1, MA2);
TA2 = SA0;
TB1 = vaddq_u32(MB1, vld1q_u32(&k_sha256[0x14]));
MB1 = vsha256su0q_u32(MB1, MB2);
TB2 = SB0;
SA0 = vsha256hq_u32(SA0, SA1, TA1);
SA1 = vsha256h2q_u32(SA1, TA2, TA1);
MA1 = vsha256su1q_u32(MA1, MA3, MA0);
SB0 = vsha256hq_u32(SB0, SB1, TB1);
SB1 = vsha256h2q_u32(SB1, TB2, TB1);
MB1 = vsha256su1q_u32(MB1, MB3, MB0);
/* ---- Rounds 24-27 ---- */
TA0 = vaddq_u32(MA2, vld1q_u32(&k_sha256[0x18]));
MA2 = vsha256su0q_u32(MA2, MA3);
TA2 = SA0;
TB0 = vaddq_u32(MB2, vld1q_u32(&k_sha256[0x18]));
MB2 = vsha256su0q_u32(MB2, MB3);
TB2 = SB0;
SA0 = vsha256hq_u32(SA0, SA1, TA0);
SA1 = vsha256h2q_u32(SA1, TA2, TA0);
MA2 = vsha256su1q_u32(MA2, MA0, MA1);
SB0 = vsha256hq_u32(SB0, SB1, TB0);
SB1 = vsha256h2q_u32(SB1, TB2, TB0);
MB2 = vsha256su1q_u32(MB2, MB0, MB1);
/* ---- Rounds 28-31 ---- */
TA1 = vaddq_u32(MA3, vld1q_u32(&k_sha256[0x1c]));
MA3 = vsha256su0q_u32(MA3, MA0);
TA2 = SA0;
TB1 = vaddq_u32(MB3, vld1q_u32(&k_sha256[0x1c]));
MB3 = vsha256su0q_u32(MB3, MB0);
TB2 = SB0;
SA0 = vsha256hq_u32(SA0, SA1, TA1);
SA1 = vsha256h2q_u32(SA1, TA2, TA1);
MA3 = vsha256su1q_u32(MA3, MA1, MA2);
SB0 = vsha256hq_u32(SB0, SB1, TB1);
SB1 = vsha256h2q_u32(SB1, TB2, TB1);
MB3 = vsha256su1q_u32(MB3, MB1, MB2);
/* ---- Rounds 32-35 ---- */
TA0 = vaddq_u32(MA0, vld1q_u32(&k_sha256[0x20]));
MA0 = vsha256su0q_u32(MA0, MA1);
TA2 = SA0;
TB0 = vaddq_u32(MB0, vld1q_u32(&k_sha256[0x20]));
MB0 = vsha256su0q_u32(MB0, MB1);
TB2 = SB0;
SA0 = vsha256hq_u32(SA0, SA1, TA0);
SA1 = vsha256h2q_u32(SA1, TA2, TA0);
MA0 = vsha256su1q_u32(MA0, MA2, MA3);
SB0 = vsha256hq_u32(SB0, SB1, TB0);
SB1 = vsha256h2q_u32(SB1, TB2, TB0);
MB0 = vsha256su1q_u32(MB0, MB2, MB3);
/* ---- Rounds 36-39 ---- */
TA1 = vaddq_u32(MA1, vld1q_u32(&k_sha256[0x24]));
MA1 = vsha256su0q_u32(MA1, MA2);
TA2 = SA0;
TB1 = vaddq_u32(MB1, vld1q_u32(&k_sha256[0x24]));
MB1 = vsha256su0q_u32(MB1, MB2);
TB2 = SB0;
SA0 = vsha256hq_u32(SA0, SA1, TA1);
SA1 = vsha256h2q_u32(SA1, TA2, TA1);
MA1 = vsha256su1q_u32(MA1, MA3, MA0);
SB0 = vsha256hq_u32(SB0, SB1, TB1);
SB1 = vsha256h2q_u32(SB1, TB2, TB1);
MB1 = vsha256su1q_u32(MB1, MB3, MB0);
/* ---- Rounds 40-43 ---- */
TA0 = vaddq_u32(MA2, vld1q_u32(&k_sha256[0x28]));
MA2 = vsha256su0q_u32(MA2, MA3);
TA2 = SA0;
TB0 = vaddq_u32(MB2, vld1q_u32(&k_sha256[0x28]));
MB2 = vsha256su0q_u32(MB2, MB3);
TB2 = SB0;
SA0 = vsha256hq_u32(SA0, SA1, TA0);
SA1 = vsha256h2q_u32(SA1, TA2, TA0);
MA2 = vsha256su1q_u32(MA2, MA0, MA1);
SB0 = vsha256hq_u32(SB0, SB1, TB0);
SB1 = vsha256h2q_u32(SB1, TB2, TB0);
MB2 = vsha256su1q_u32(MB2, MB0, MB1);
/* ---- Rounds 44-47 ---- */
TA1 = vaddq_u32(MA3, vld1q_u32(&k_sha256[0x2c]));
MA3 = vsha256su0q_u32(MA3, MA0);
TA2 = SA0;
TB1 = vaddq_u32(MB3, vld1q_u32(&k_sha256[0x2c]));
MB3 = vsha256su0q_u32(MB3, MB0);
TB2 = SB0;
SA0 = vsha256hq_u32(SA0, SA1, TA1);
SA1 = vsha256h2q_u32(SA1, TA2, TA1);
MA3 = vsha256su1q_u32(MA3, MA1, MA2);
SB0 = vsha256hq_u32(SB0, SB1, TB1);
SB1 = vsha256h2q_u32(SB1, TB2, TB1);
MB3 = vsha256su1q_u32(MB3, MB1, MB2);
/* ---- Rounds 48-51 ---- */
TA0 = vaddq_u32(MA0, vld1q_u32(&k_sha256[0x30]));
TA2 = SA0;
TB0 = vaddq_u32(MB0, vld1q_u32(&k_sha256[0x30]));
TB2 = SB0;
SA0 = vsha256hq_u32(SA0, SA1, TA0);
SA1 = vsha256h2q_u32(SA1, TA2, TA0);
SB0 = vsha256hq_u32(SB0, SB1, TB0);
SB1 = vsha256h2q_u32(SB1, TB2, TB0);
/* ---- Rounds 52-55 ---- */
TA1 = vaddq_u32(MA1, vld1q_u32(&k_sha256[0x34]));
TA2 = SA0;
TB1 = vaddq_u32(MB1, vld1q_u32(&k_sha256[0x34]));
TB2 = SB0;
SA0 = vsha256hq_u32(SA0, SA1, TA1);
SA1 = vsha256h2q_u32(SA1, TA2, TA1);
SB0 = vsha256hq_u32(SB0, SB1, TB1);
SB1 = vsha256h2q_u32(SB1, TB2, TB1);
/* ---- Rounds 56-59 ---- */
TA0 = vaddq_u32(MA2, vld1q_u32(&k_sha256[0x38]));
TA2 = SA0;
TB0 = vaddq_u32(MB2, vld1q_u32(&k_sha256[0x38]));
TB2 = SB0;
SA0 = vsha256hq_u32(SA0, SA1, TA0);
SA1 = vsha256h2q_u32(SA1, TA2, TA0);
SB0 = vsha256hq_u32(SB0, SB1, TB0);
SB1 = vsha256h2q_u32(SB1, TB2, TB0);
/* ---- Rounds 60-63 ---- */
TA1 = vaddq_u32(MA3, vld1q_u32(&k_sha256[0x3c]));
TA2 = SA0;
TB1 = vaddq_u32(MB3, vld1q_u32(&k_sha256[0x3c]));
TB2 = SB0;
SA0 = vsha256hq_u32(SA0, SA1, TA1);
SA1 = vsha256h2q_u32(SA1, TA2, TA1);
SB0 = vsha256hq_u32(SB0, SB1, TB1);
SB1 = vsha256h2q_u32(SB1, TB2, TB1);
SA0 = vaddq_u32(SA0, ABEF_A);
SA1 = vaddq_u32(SA1, CDGH_A);
SB0 = vaddq_u32(SB0, ABEF_B);
SB1 = vaddq_u32(SB1, CDGH_B);
vst1q_u32(&stA->h[0], SA0);
vst1q_u32(&stA->h[4], SA1);
vst1q_u32(&stB->h[0], SB0);
vst1q_u32(&stB->h[4], SB1);
}
/*
* 4-way hash using 2-way interleaved transforms.
* Pairs (0,1) and (2,3) are processed simultaneously, hiding SHA2 instruction
* latency. Avoids redundant memcpy by reusing block buffers across the 4 nonces.
*/
static void sha256d80_hash_4way_armv8_2way(
const sha256d80_midstate_t *mid,
uint32_t start_nonce,
sha256_state_t out_states[4]
) {
/* Two block1 buffers reused for both pairs */
uint8_t block1A[64], block1B[64];
/* Two block2 buffers: only bytes 0-31 (the pass-1 digest) vary per nonce; bytes 32-63 are the constant SHA-256 padding (0x80, zeros, 256-bit message length) */
uint8_t block2A[64], block2B[64];
/* Copy template once each (1x instead of 4x) */
memcpy(block1A, mid->block1_template, 64);
memcpy(block1B, mid->block1_template, 64);
/* Copy only the constant padding half of block2 template (32 bytes each) */
memcpy(block2A + 32, mid->block2_template + 32, 32);
memcpy(block2B + 32, mid->block2_template + 32, 32);
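/*
 * Copy accounting per 4-nonce group: 2 x 64 B (block1) + 2 x 32 B (constant
 * half of block2) = 192 bytes, versus 4 x (64 + 64) = 512 bytes if each nonce
 * copied both full templates.
 */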
/* --- Pass 1, pair (0,1) --- */
out_states[0] = mid->first_chunk_state;
out_states[1] = mid->first_chunk_state;
set_block1_nonce(block1A, start_nonce + 0U);
set_block1_nonce(block1B, start_nonce + 1U);
sha256_transform_armv8_2way(&out_states[0], block1A, &out_states[1], block1B);
/* --- Pass 1, pair (2,3) --- */
out_states[2] = mid->first_chunk_state;
out_states[3] = mid->first_chunk_state;
set_block1_nonce(block1A, start_nonce + 2U);
set_block1_nonce(block1B, start_nonce + 3U);
sha256_transform_armv8_2way(&out_states[2], block1A, &out_states[3], block1B);
/* --- Pass 2, pair (0,1): write digest directly into block2 bytes 0-31 --- */
sha256_state_to_digest(&out_states[0], block2A);
sha256_state_to_digest(&out_states[1], block2B);
out_states[0] = mid->init_state;
out_states[1] = mid->init_state;
sha256_transform_armv8_2way(&out_states[0], block2A, &out_states[1], block2B);
/* --- Pass 2, pair (2,3) --- */
sha256_state_to_digest(&out_states[2], block2A);
sha256_state_to_digest(&out_states[3], block2B);
out_states[2] = mid->init_state;
out_states[3] = mid->init_state;
sha256_transform_armv8_2way(&out_states[2], block2A, &out_states[3], block2B);
}
#endif
static sha256_transform_fn_t g_transform_fn = sha256_transform_openssl;
@@ -350,7 +658,7 @@ static int arm_hash4_selftest(void) {
sha256d80_midstate_init_with_fn(&mid, header_76, sha256_transform_openssl);
sha256d80_hash_4way_with_fn(&mid, 0x10203040U, ref, sha256_transform_openssl);
sha256d80_4way_aarch64_kernel(&mid, 0x10203040U, test);
sha256d80_hash_4way_armv8_2way(&mid, 0x10203040U, test);
return memcmp(ref, test, sizeof(ref)) == 0;
}
@@ -363,8 +671,8 @@ static void sha256_backend_init_once(void) {
#if defined(__aarch64__) && defined(__ARM_FEATURE_CRYPTO)
if (cpu_has_sha2() && arm_transform_selftest() && arm_hash4_selftest()) {
g_transform_fn = sha256_transform_armv8;
g_hash4_fn = sha256d80_4way_aarch64_kernel;
fprintf(stderr, "[miner] backend SHA: ARMv8 ASM+intrinsics enabled\n");
g_hash4_fn = sha256d80_hash_4way_armv8_2way;
fprintf(stderr, "[miner] backend SHA: ARMv8 2-way interleaved enabled\n");
} else {
fprintf(stderr, "[miner] backend SHA: OpenSSL fallback\n");
}
@@ -395,6 +703,12 @@ void sha256_transform_fast_2way(
sha256_state_t *stB, const uint8_t blkB[64]
) {
pthread_once(&g_backend_once, sha256_backend_init_once);
#if defined(__aarch64__) && defined(__ARM_FEATURE_CRYPTO)
if (g_transform_fn == sha256_transform_armv8) {
sha256_transform_armv8_2way(stA, blkA, stB, blkB);
return;
}
#endif
g_transform_fn(stA, blkA);
g_transform_fn(stB, blkB);
}
@@ -439,3 +753,33 @@ uint32_t sha256d80_scan_4way(
return mask;
}
void sha256_backend_ensure_init(void) {
pthread_once(&g_backend_once, sha256_backend_init_once);
}
uint32_t sha256d80_scan_4way_direct(
const sha256d80_midstate_t *mid,
uint32_t start_nonce,
const uint32_t target_words[8],
sha256_state_t out_states[4]
) {
uint32_t mask = 0;
g_hash4_fn(mid, start_nonce, out_states);
if (state_meets_target_words(&out_states[0], target_words)) {
mask |= 1U;
}
if (state_meets_target_words(&out_states[1], target_words)) {
mask |= 2U;
}
if (state_meets_target_words(&out_states[2], target_words)) {
mask |= 4U;
}
if (state_meets_target_words(&out_states[3], target_words)) {
mask |= 8U;
}
return mask;
}

@@ -62,4 +62,22 @@ uint32_t sha256d80_scan_4way(
sha256_state_t out_states[4]
);
/*
* Ensure the SHA256 backend is initialized. Call once before using
* sha256d80_scan_4way_direct() to avoid per-call pthread_once overhead.
*/
void sha256_backend_ensure_init(void);
/*
* Like sha256d80_scan_4way() but skips the pthread_once check.
* Caller MUST have called sha256_backend_ensure_init() (or any other
* backend function) before calling this.
*/
uint32_t sha256d80_scan_4way_direct(
const sha256d80_midstate_t *mid,
uint32_t start_nonce,
const uint32_t target_words[8],
sha256_state_t out_states[4]
);
#endif

@@ -181,6 +181,41 @@ static int test_sha256d80_4way_100k_nonces(void) {
return 1;
}
static int test_sha256d80_scan_4way_direct(void) {
uint8_t header76[76];
sha256d80_midstate_t mid;
sha256_state_t st_ref[4];
sha256_state_t st_direct[4];
uint32_t all_max[8];
uint32_t mask_ref;
uint32_t mask_direct;
int i;
for (i = 0; i < 76; i++) {
header76[i] = (uint8_t)((i * 13 + 7) & 0xFF);
}
for (i = 0; i < 8; i++) {
all_max[i] = 0xFFFFFFFFU;
}
sha256d80_midstate_init(&mid, header76);
sha256_backend_ensure_init();
mask_ref = sha256d80_scan_4way(&mid, 0x11223344U, all_max, st_ref);
mask_direct = sha256d80_scan_4way_direct(&mid, 0x11223344U, all_max, st_direct);
if (mask_ref != mask_direct) {
fprintf(stderr, "[test_sha256_backend] scan_4way_direct mask mismatch ref=%u direct=%u\n",
mask_ref, mask_direct);
return 0;
}
if (memcmp(st_ref, st_direct, sizeof(st_ref)) != 0) {
fprintf(stderr, "[test_sha256_backend] scan_4way_direct state mismatch\n");
return 0;
}
return 1;
}
static int test_sha256d80_scan_hitmask_basic(void) {
uint8_t header76[76];
sha256d80_midstate_t mid;
@@ -219,6 +254,9 @@ int main(void) {
if (!test_sha256d80_4way_100k_nonces()) {
return 1;
}
if (!test_sha256d80_scan_4way_direct()) {
return 1;
}
if (!test_sha256d80_scan_hitmask_basic()) {
return 1;
}