perf(sha256): add ARMv8 2-way interleaved transform and scan_4way_direct
Process two independent SHA256 chains simultaneously to hide the 2-cycle latency of vsha256hq_u32 on Cortex-A76, approaching full throughput. Also reduces memcpy from 512 to ~192 bytes per 4-nonce group by reusing block buffers, and adds scan_4way_direct to bypass pthread_once (LDAR barrier) on every inner-loop call.
This commit is contained in:
@@ -231,6 +231,314 @@ static void sha256_transform_armv8(sha256_state_t *state, const uint8_t block[64
|
||||
|
||||
void sha256d80_4way_aarch64_impl(const sha256d80_midstate_t *mid, uint32_t start_nonce, sha256_state_t out_states[4]);
|
||||
extern void sha256d80_4way_aarch64_kernel(const sha256d80_midstate_t *mid, uint32_t start_nonce, sha256_state_t out_states[4]);
|
||||
|
||||
/*
 * 2-way interleaved SHA256 block compression.
 * Processes two independent (state, block) pairs simultaneously to hide the
 * 2-cycle latency of vsha256hq_u32 on Cortex-A76 (latency=2, throughput=1).
 * While chain A is waiting for its result, chain B's instruction issues,
 * filling the pipeline bubble and approaching full throughput utilization.
 *
 * stA/stB: running SHA256 states, updated in place. h[0..3] and h[4..7]
 *          map to the ABEF / CDGH vector layout used by the ARMv8 SHA2
 *          instructions.
 * blkA/blkB: 64-byte big-endian message blocks (one per chain).
 *
 * Per 4-round group the pattern for each chain is:
 *   T0/T1 = W[i..i+3] + K[i..i+3]          (vaddq_u32)
 *   schedule update part 1                  (vsha256su0q_u32, rounds < 48)
 *   save state low half into T2             (needed by vsha256h2q_u32)
 *   state update                            (vsha256hq_u32 / vsha256h2q_u32)
 *   schedule update part 2                  (vsha256su1q_u32, rounds < 48)
 * Chain B's instructions are issued between chain A's dependent pairs.
 */
static void sha256_transform_armv8_2way(
    sha256_state_t *stA, const uint8_t blkA[64],
    sha256_state_t *stB, const uint8_t blkB[64]
) {
    /* Chain A state */
    uint32x4_t SA0, SA1, ABEF_A, CDGH_A;   /* working state + saved input state */
    uint32x4_t MA0, MA1, MA2, MA3;         /* 16-word message schedule window */
    uint32x4_t TA0, TA1, TA2;              /* W+K sums and pre-round state copy */
    /* Chain B state */
    uint32x4_t SB0, SB1, ABEF_B, CDGH_B;
    uint32x4_t MB0, MB1, MB2, MB3;
    uint32x4_t TB0, TB1, TB2;

    /* Load and byte-swap A (message words are big-endian on the wire) */
    SA0 = vld1q_u32(&stA->h[0]);
    SA1 = vld1q_u32(&stA->h[4]);
    ABEF_A = SA0; CDGH_A = SA1;
    MA0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(vld1q_u32((const uint32_t *)(blkA + 0)))));
    MA1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(vld1q_u32((const uint32_t *)(blkA + 16)))));
    MA2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(vld1q_u32((const uint32_t *)(blkA + 32)))));
    MA3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(vld1q_u32((const uint32_t *)(blkA + 48)))));

    /* Load and byte-swap B */
    SB0 = vld1q_u32(&stB->h[0]);
    SB1 = vld1q_u32(&stB->h[4]);
    ABEF_B = SB0; CDGH_B = SB1;
    MB0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(vld1q_u32((const uint32_t *)(blkB + 0)))));
    MB1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(vld1q_u32((const uint32_t *)(blkB + 16)))));
    MB2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(vld1q_u32((const uint32_t *)(blkB + 32)))));
    MB3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(vld1q_u32((const uint32_t *)(blkB + 48)))));

    /* ---- Rounds 0-3 ---- */
    TA0 = vaddq_u32(MA0, vld1q_u32(&k_sha256[0x00]));
    MA0 = vsha256su0q_u32(MA0, MA1);
    TA2 = SA0;
    TB0 = vaddq_u32(MB0, vld1q_u32(&k_sha256[0x00]));
    MB0 = vsha256su0q_u32(MB0, MB1);
    TB2 = SB0;
    SA0 = vsha256hq_u32(SA0, SA1, TA0);
    SA1 = vsha256h2q_u32(SA1, TA2, TA0);
    MA0 = vsha256su1q_u32(MA0, MA2, MA3);
    SB0 = vsha256hq_u32(SB0, SB1, TB0);
    SB1 = vsha256h2q_u32(SB1, TB2, TB0);
    MB0 = vsha256su1q_u32(MB0, MB2, MB3);

    /* ---- Rounds 4-7 ---- */
    TA1 = vaddq_u32(MA1, vld1q_u32(&k_sha256[0x04]));
    MA1 = vsha256su0q_u32(MA1, MA2);
    TA2 = SA0;
    TB1 = vaddq_u32(MB1, vld1q_u32(&k_sha256[0x04]));
    MB1 = vsha256su0q_u32(MB1, MB2);
    TB2 = SB0;
    SA0 = vsha256hq_u32(SA0, SA1, TA1);
    SA1 = vsha256h2q_u32(SA1, TA2, TA1);
    MA1 = vsha256su1q_u32(MA1, MA3, MA0);
    SB0 = vsha256hq_u32(SB0, SB1, TB1);
    SB1 = vsha256h2q_u32(SB1, TB2, TB1);
    MB1 = vsha256su1q_u32(MB1, MB3, MB0);

    /* ---- Rounds 8-11 ---- */
    TA0 = vaddq_u32(MA2, vld1q_u32(&k_sha256[0x08]));
    MA2 = vsha256su0q_u32(MA2, MA3);
    TA2 = SA0;
    TB0 = vaddq_u32(MB2, vld1q_u32(&k_sha256[0x08]));
    MB2 = vsha256su0q_u32(MB2, MB3);
    TB2 = SB0;
    SA0 = vsha256hq_u32(SA0, SA1, TA0);
    SA1 = vsha256h2q_u32(SA1, TA2, TA0);
    MA2 = vsha256su1q_u32(MA2, MA0, MA1);
    SB0 = vsha256hq_u32(SB0, SB1, TB0);
    SB1 = vsha256h2q_u32(SB1, TB2, TB0);
    MB2 = vsha256su1q_u32(MB2, MB0, MB1);

    /* ---- Rounds 12-15 ---- */
    TA1 = vaddq_u32(MA3, vld1q_u32(&k_sha256[0x0c]));
    MA3 = vsha256su0q_u32(MA3, MA0);
    TA2 = SA0;
    TB1 = vaddq_u32(MB3, vld1q_u32(&k_sha256[0x0c]));
    MB3 = vsha256su0q_u32(MB3, MB0);
    TB2 = SB0;
    SA0 = vsha256hq_u32(SA0, SA1, TA1);
    SA1 = vsha256h2q_u32(SA1, TA2, TA1);
    MA3 = vsha256su1q_u32(MA3, MA1, MA2);
    SB0 = vsha256hq_u32(SB0, SB1, TB1);
    SB1 = vsha256h2q_u32(SB1, TB2, TB1);
    MB3 = vsha256su1q_u32(MB3, MB1, MB2);

    /* ---- Rounds 16-19 ---- */
    TA0 = vaddq_u32(MA0, vld1q_u32(&k_sha256[0x10]));
    MA0 = vsha256su0q_u32(MA0, MA1);
    TA2 = SA0;
    TB0 = vaddq_u32(MB0, vld1q_u32(&k_sha256[0x10]));
    MB0 = vsha256su0q_u32(MB0, MB1);
    TB2 = SB0;
    SA0 = vsha256hq_u32(SA0, SA1, TA0);
    SA1 = vsha256h2q_u32(SA1, TA2, TA0);
    MA0 = vsha256su1q_u32(MA0, MA2, MA3);
    SB0 = vsha256hq_u32(SB0, SB1, TB0);
    SB1 = vsha256h2q_u32(SB1, TB2, TB0);
    MB0 = vsha256su1q_u32(MB0, MB2, MB3);

    /* ---- Rounds 20-23 ---- */
    TA1 = vaddq_u32(MA1, vld1q_u32(&k_sha256[0x14]));
    MA1 = vsha256su0q_u32(MA1, MA2);
    TA2 = SA0;
    TB1 = vaddq_u32(MB1, vld1q_u32(&k_sha256[0x14]));
    MB1 = vsha256su0q_u32(MB1, MB2);
    TB2 = SB0;
    SA0 = vsha256hq_u32(SA0, SA1, TA1);
    SA1 = vsha256h2q_u32(SA1, TA2, TA1);
    MA1 = vsha256su1q_u32(MA1, MA3, MA0);
    SB0 = vsha256hq_u32(SB0, SB1, TB1);
    SB1 = vsha256h2q_u32(SB1, TB2, TB1);
    MB1 = vsha256su1q_u32(MB1, MB3, MB0);

    /* ---- Rounds 24-27 ---- */
    TA0 = vaddq_u32(MA2, vld1q_u32(&k_sha256[0x18]));
    MA2 = vsha256su0q_u32(MA2, MA3);
    TA2 = SA0;
    TB0 = vaddq_u32(MB2, vld1q_u32(&k_sha256[0x18]));
    MB2 = vsha256su0q_u32(MB2, MB3);
    TB2 = SB0;
    SA0 = vsha256hq_u32(SA0, SA1, TA0);
    SA1 = vsha256h2q_u32(SA1, TA2, TA0);
    MA2 = vsha256su1q_u32(MA2, MA0, MA1);
    SB0 = vsha256hq_u32(SB0, SB1, TB0);
    SB1 = vsha256h2q_u32(SB1, TB2, TB0);
    MB2 = vsha256su1q_u32(MB2, MB0, MB1);

    /* ---- Rounds 28-31 ---- */
    TA1 = vaddq_u32(MA3, vld1q_u32(&k_sha256[0x1c]));
    MA3 = vsha256su0q_u32(MA3, MA0);
    TA2 = SA0;
    TB1 = vaddq_u32(MB3, vld1q_u32(&k_sha256[0x1c]));
    MB3 = vsha256su0q_u32(MB3, MB0);
    TB2 = SB0;
    SA0 = vsha256hq_u32(SA0, SA1, TA1);
    SA1 = vsha256h2q_u32(SA1, TA2, TA1);
    MA3 = vsha256su1q_u32(MA3, MA1, MA2);
    SB0 = vsha256hq_u32(SB0, SB1, TB1);
    SB1 = vsha256h2q_u32(SB1, TB2, TB1);
    MB3 = vsha256su1q_u32(MB3, MB1, MB2);

    /* ---- Rounds 32-35 ---- */
    TA0 = vaddq_u32(MA0, vld1q_u32(&k_sha256[0x20]));
    MA0 = vsha256su0q_u32(MA0, MA1);
    TA2 = SA0;
    TB0 = vaddq_u32(MB0, vld1q_u32(&k_sha256[0x20]));
    MB0 = vsha256su0q_u32(MB0, MB1);
    TB2 = SB0;
    SA0 = vsha256hq_u32(SA0, SA1, TA0);
    SA1 = vsha256h2q_u32(SA1, TA2, TA0);
    MA0 = vsha256su1q_u32(MA0, MA2, MA3);
    SB0 = vsha256hq_u32(SB0, SB1, TB0);
    SB1 = vsha256h2q_u32(SB1, TB2, TB0);
    MB0 = vsha256su1q_u32(MB0, MB2, MB3);

    /* ---- Rounds 36-39 ---- */
    TA1 = vaddq_u32(MA1, vld1q_u32(&k_sha256[0x24]));
    MA1 = vsha256su0q_u32(MA1, MA2);
    TA2 = SA0;
    TB1 = vaddq_u32(MB1, vld1q_u32(&k_sha256[0x24]));
    MB1 = vsha256su0q_u32(MB1, MB2);
    TB2 = SB0;
    SA0 = vsha256hq_u32(SA0, SA1, TA1);
    SA1 = vsha256h2q_u32(SA1, TA2, TA1);
    MA1 = vsha256su1q_u32(MA1, MA3, MA0);
    SB0 = vsha256hq_u32(SB0, SB1, TB1);
    SB1 = vsha256h2q_u32(SB1, TB2, TB1);
    MB1 = vsha256su1q_u32(MB1, MB3, MB0);

    /* ---- Rounds 40-43 ---- */
    TA0 = vaddq_u32(MA2, vld1q_u32(&k_sha256[0x28]));
    MA2 = vsha256su0q_u32(MA2, MA3);
    TA2 = SA0;
    TB0 = vaddq_u32(MB2, vld1q_u32(&k_sha256[0x28]));
    MB2 = vsha256su0q_u32(MB2, MB3);
    TB2 = SB0;
    SA0 = vsha256hq_u32(SA0, SA1, TA0);
    SA1 = vsha256h2q_u32(SA1, TA2, TA0);
    MA2 = vsha256su1q_u32(MA2, MA0, MA1);
    SB0 = vsha256hq_u32(SB0, SB1, TB0);
    SB1 = vsha256h2q_u32(SB1, TB2, TB0);
    MB2 = vsha256su1q_u32(MB2, MB0, MB1);

    /* ---- Rounds 44-47 (last group that extends the message schedule) ---- */
    TA1 = vaddq_u32(MA3, vld1q_u32(&k_sha256[0x2c]));
    MA3 = vsha256su0q_u32(MA3, MA0);
    TA2 = SA0;
    TB1 = vaddq_u32(MB3, vld1q_u32(&k_sha256[0x2c]));
    MB3 = vsha256su0q_u32(MB3, MB0);
    TB2 = SB0;
    SA0 = vsha256hq_u32(SA0, SA1, TA1);
    SA1 = vsha256h2q_u32(SA1, TA2, TA1);
    MA3 = vsha256su1q_u32(MA3, MA1, MA2);
    SB0 = vsha256hq_u32(SB0, SB1, TB1);
    SB1 = vsha256h2q_u32(SB1, TB2, TB1);
    MB3 = vsha256su1q_u32(MB3, MB1, MB2);

    /* ---- Rounds 48-51 (no further schedule updates needed) ---- */
    TA0 = vaddq_u32(MA0, vld1q_u32(&k_sha256[0x30]));
    TA2 = SA0;
    TB0 = vaddq_u32(MB0, vld1q_u32(&k_sha256[0x30]));
    TB2 = SB0;
    SA0 = vsha256hq_u32(SA0, SA1, TA0);
    SA1 = vsha256h2q_u32(SA1, TA2, TA0);
    SB0 = vsha256hq_u32(SB0, SB1, TB0);
    SB1 = vsha256h2q_u32(SB1, TB2, TB0);

    /* ---- Rounds 52-55 ---- */
    TA1 = vaddq_u32(MA1, vld1q_u32(&k_sha256[0x34]));
    TA2 = SA0;
    TB1 = vaddq_u32(MB1, vld1q_u32(&k_sha256[0x34]));
    TB2 = SB0;
    SA0 = vsha256hq_u32(SA0, SA1, TA1);
    SA1 = vsha256h2q_u32(SA1, TA2, TA1);
    SB0 = vsha256hq_u32(SB0, SB1, TB1);
    SB1 = vsha256h2q_u32(SB1, TB2, TB1);

    /* ---- Rounds 56-59 ---- */
    TA0 = vaddq_u32(MA2, vld1q_u32(&k_sha256[0x38]));
    TA2 = SA0;
    TB0 = vaddq_u32(MB2, vld1q_u32(&k_sha256[0x38]));
    TB2 = SB0;
    SA0 = vsha256hq_u32(SA0, SA1, TA0);
    SA1 = vsha256h2q_u32(SA1, TA2, TA0);
    SB0 = vsha256hq_u32(SB0, SB1, TB0);
    SB1 = vsha256h2q_u32(SB1, TB2, TB0);

    /* ---- Rounds 60-63 ---- */
    TA1 = vaddq_u32(MA3, vld1q_u32(&k_sha256[0x3c]));
    TA2 = SA0;
    TB1 = vaddq_u32(MB3, vld1q_u32(&k_sha256[0x3c]));
    TB2 = SB0;
    SA0 = vsha256hq_u32(SA0, SA1, TA1);
    SA1 = vsha256h2q_u32(SA1, TA2, TA1);
    SB0 = vsha256hq_u32(SB0, SB1, TB1);
    SB1 = vsha256h2q_u32(SB1, TB2, TB1);

    /* Davies-Meyer feed-forward: add the saved input state back in. */
    SA0 = vaddq_u32(SA0, ABEF_A);
    SA1 = vaddq_u32(SA1, CDGH_A);
    SB0 = vaddq_u32(SB0, ABEF_B);
    SB1 = vaddq_u32(SB1, CDGH_B);

    /* Write both chains' updated states back. */
    vst1q_u32(&stA->h[0], SA0);
    vst1q_u32(&stA->h[4], SA1);
    vst1q_u32(&stB->h[0], SB0);
    vst1q_u32(&stB->h[4], SB1);
}
|
||||
|
||||
/*
|
||||
* 4-way hash using 2-way interleaved transforms.
|
||||
* Pairs (0,1) and (2,3) are processed simultaneously, hiding SHA2 instruction
|
||||
* latency. Avoids redundant memcpy by reusing block buffers across the 4 nonces.
|
||||
*/
|
||||
static void sha256d80_hash_4way_armv8_2way(
|
||||
const sha256d80_midstate_t *mid,
|
||||
uint32_t start_nonce,
|
||||
sha256_state_t out_states[4]
|
||||
) {
|
||||
/* Two block1 buffers reused for both pairs */
|
||||
uint8_t block1A[64], block1B[64];
|
||||
/* Two block2 buffers: only bytes 0-31 (digest) vary; bytes 32-63 are constant */
|
||||
uint8_t block2A[64], block2B[64];
|
||||
|
||||
/* Copy template once each (1x instead of 4x) */
|
||||
memcpy(block1A, mid->block1_template, 64);
|
||||
memcpy(block1B, mid->block1_template, 64);
|
||||
/* Copy only the constant padding half of block2 template (32 bytes each) */
|
||||
memcpy(block2A + 32, mid->block2_template + 32, 32);
|
||||
memcpy(block2B + 32, mid->block2_template + 32, 32);
|
||||
|
||||
/* --- Pass 1, pair (0,1) --- */
|
||||
out_states[0] = mid->first_chunk_state;
|
||||
out_states[1] = mid->first_chunk_state;
|
||||
set_block1_nonce(block1A, start_nonce + 0U);
|
||||
set_block1_nonce(block1B, start_nonce + 1U);
|
||||
sha256_transform_armv8_2way(&out_states[0], block1A, &out_states[1], block1B);
|
||||
|
||||
/* --- Pass 1, pair (2,3) --- */
|
||||
out_states[2] = mid->first_chunk_state;
|
||||
out_states[3] = mid->first_chunk_state;
|
||||
set_block1_nonce(block1A, start_nonce + 2U);
|
||||
set_block1_nonce(block1B, start_nonce + 3U);
|
||||
sha256_transform_armv8_2way(&out_states[2], block1A, &out_states[3], block1B);
|
||||
|
||||
/* --- Pass 2, pair (0,1): write digest directly into block2 bytes 0-31 --- */
|
||||
sha256_state_to_digest(&out_states[0], block2A);
|
||||
sha256_state_to_digest(&out_states[1], block2B);
|
||||
out_states[0] = mid->init_state;
|
||||
out_states[1] = mid->init_state;
|
||||
sha256_transform_armv8_2way(&out_states[0], block2A, &out_states[1], block2B);
|
||||
|
||||
/* --- Pass 2, pair (2,3) --- */
|
||||
sha256_state_to_digest(&out_states[2], block2A);
|
||||
sha256_state_to_digest(&out_states[3], block2B);
|
||||
out_states[2] = mid->init_state;
|
||||
out_states[3] = mid->init_state;
|
||||
sha256_transform_armv8_2way(&out_states[2], block2A, &out_states[3], block2B);
|
||||
}
|
||||
#endif
|
||||
|
||||
static sha256_transform_fn_t g_transform_fn = sha256_transform_openssl;
|
||||
@@ -350,7 +658,7 @@ static int arm_hash4_selftest(void) {
|
||||
|
||||
sha256d80_midstate_init_with_fn(&mid, header_76, sha256_transform_openssl);
|
||||
sha256d80_hash_4way_with_fn(&mid, 0x10203040U, ref, sha256_transform_openssl);
|
||||
sha256d80_4way_aarch64_kernel(&mid, 0x10203040U, test);
|
||||
sha256d80_hash_4way_armv8_2way(&mid, 0x10203040U, test);
|
||||
|
||||
return memcmp(ref, test, sizeof(ref)) == 0;
|
||||
}
|
||||
@@ -363,8 +671,8 @@ static void sha256_backend_init_once(void) {
|
||||
#if defined(__aarch64__) && defined(__ARM_FEATURE_CRYPTO)
|
||||
if (cpu_has_sha2() && arm_transform_selftest() && arm_hash4_selftest()) {
|
||||
g_transform_fn = sha256_transform_armv8;
|
||||
g_hash4_fn = sha256d80_4way_aarch64_kernel;
|
||||
fprintf(stderr, "[miner] backend SHA: ARMv8 ASM+intrinsics enabled\n");
|
||||
g_hash4_fn = sha256d80_hash_4way_armv8_2way;
|
||||
fprintf(stderr, "[miner] backend SHA: ARMv8 2-way interleaved enabled\n");
|
||||
} else {
|
||||
fprintf(stderr, "[miner] backend SHA: OpenSSL fallback\n");
|
||||
}
|
||||
@@ -395,6 +703,12 @@ void sha256_transform_fast_2way(
|
||||
sha256_state_t *stB, const uint8_t blkB[64]
|
||||
) {
|
||||
pthread_once(&g_backend_once, sha256_backend_init_once);
|
||||
#if defined(__aarch64__) && defined(__ARM_FEATURE_CRYPTO)
|
||||
if (g_transform_fn == sha256_transform_armv8) {
|
||||
sha256_transform_armv8_2way(stA, blkA, stB, blkB);
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
g_transform_fn(stA, blkA);
|
||||
g_transform_fn(stB, blkB);
|
||||
}
|
||||
@@ -439,3 +753,33 @@ uint32_t sha256d80_scan_4way(
|
||||
|
||||
return mask;
|
||||
}
|
||||
|
||||
/*
 * Ensure the SHA256 backend is initialized (best implementation selected
 * and self-tested via sha256_backend_init_once). Call this once up front
 * so hot-path entry points like sha256d80_scan_4way_direct() can skip the
 * per-call pthread_once() check and its associated acquire barrier.
 */
void sha256_backend_ensure_init(void) {
    pthread_once(&g_backend_once, sha256_backend_init_once);
}
|
||||
|
||||
uint32_t sha256d80_scan_4way_direct(
|
||||
const sha256d80_midstate_t *mid,
|
||||
uint32_t start_nonce,
|
||||
const uint32_t target_words[8],
|
||||
sha256_state_t out_states[4]
|
||||
) {
|
||||
uint32_t mask = 0;
|
||||
|
||||
g_hash4_fn(mid, start_nonce, out_states);
|
||||
|
||||
if (state_meets_target_words(&out_states[0], target_words)) {
|
||||
mask |= 1U;
|
||||
}
|
||||
if (state_meets_target_words(&out_states[1], target_words)) {
|
||||
mask |= 2U;
|
||||
}
|
||||
if (state_meets_target_words(&out_states[2], target_words)) {
|
||||
mask |= 4U;
|
||||
}
|
||||
if (state_meets_target_words(&out_states[3], target_words)) {
|
||||
mask |= 8U;
|
||||
}
|
||||
|
||||
return mask;
|
||||
}
|
||||
|
||||
@@ -62,4 +62,22 @@ uint32_t sha256d80_scan_4way(
|
||||
sha256_state_t out_states[4]
|
||||
);
|
||||
|
||||
/*
|
||||
* Ensure the SHA256 backend is initialized. Call once before using
|
||||
* sha256d80_scan_4way_direct() to avoid per-call pthread_once overhead.
|
||||
*/
|
||||
void sha256_backend_ensure_init(void);
|
||||
|
||||
/*
|
||||
* Like sha256d80_scan_4way() but skips the pthread_once check.
|
||||
* Caller MUST have called sha256_backend_ensure_init() (or any other
|
||||
* backend function) before calling this.
|
||||
*/
|
||||
uint32_t sha256d80_scan_4way_direct(
|
||||
const sha256d80_midstate_t *mid,
|
||||
uint32_t start_nonce,
|
||||
const uint32_t target_words[8],
|
||||
sha256_state_t out_states[4]
|
||||
);
|
||||
|
||||
#endif
|
||||
|
||||
@@ -181,6 +181,41 @@ static int test_sha256d80_4way_100k_nonces(void) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
static int test_sha256d80_scan_4way_direct(void) {
|
||||
uint8_t header76[76];
|
||||
sha256d80_midstate_t mid;
|
||||
sha256_state_t st_ref[4];
|
||||
sha256_state_t st_direct[4];
|
||||
uint32_t all_max[8];
|
||||
uint32_t mask_ref;
|
||||
uint32_t mask_direct;
|
||||
int i;
|
||||
|
||||
for (i = 0; i < 76; i++) {
|
||||
header76[i] = (uint8_t)((i * 13 + 7) & 0xFF);
|
||||
}
|
||||
for (i = 0; i < 8; i++) {
|
||||
all_max[i] = 0xFFFFFFFFU;
|
||||
}
|
||||
|
||||
sha256d80_midstate_init(&mid, header76);
|
||||
sha256_backend_ensure_init();
|
||||
|
||||
mask_ref = sha256d80_scan_4way(&mid, 0x11223344U, all_max, st_ref);
|
||||
mask_direct = sha256d80_scan_4way_direct(&mid, 0x11223344U, all_max, st_direct);
|
||||
|
||||
if (mask_ref != mask_direct) {
|
||||
fprintf(stderr, "[test_sha256_backend] scan_4way_direct mask mismatch ref=%u direct=%u\n",
|
||||
mask_ref, mask_direct);
|
||||
return 0;
|
||||
}
|
||||
if (memcmp(st_ref, st_direct, sizeof(st_ref)) != 0) {
|
||||
fprintf(stderr, "[test_sha256_backend] scan_4way_direct state mismatch\n");
|
||||
return 0;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
static int test_sha256d80_scan_hitmask_basic(void) {
|
||||
uint8_t header76[76];
|
||||
sha256d80_midstate_t mid;
|
||||
@@ -219,6 +254,9 @@ int main(void) {
|
||||
if (!test_sha256d80_4way_100k_nonces()) {
|
||||
return 1;
|
||||
}
|
||||
if (!test_sha256d80_scan_4way_direct()) {
|
||||
return 1;
|
||||
}
|
||||
if (!test_sha256d80_scan_hitmask_basic()) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user