cpu-miner/sha256/sha256_backend.c
commit 7d4096749a by Davide Grilli, 2026-03-30 10:42:17 +02:00
perf(sha256): add ARMv8 2-way interleaved transform and scan_4way_direct

Process two independent SHA256 chains simultaneously to hide the 2-cycle
latency of vsha256hq_u32 on Cortex-A76, approaching full throughput.
This also cuts memcpy traffic from 512 to 192 bytes per 4-nonce group by
reusing block buffers, and adds scan_4way_direct to avoid the pthread_once
(an LDAR acquire barrier on AArch64) on every inner-loop call.

#include "sha256_backend.h"
#include <openssl/sha.h>
#include <pthread.h>
#include <stdio.h>
#include <string.h>
#if defined(__aarch64__)
#include <sys/auxv.h>
#include <asm/hwcap.h>
#endif
#if defined(__aarch64__) && defined(__ARM_FEATURE_CRYPTO)
#include <arm_neon.h>
#endif
static const uint32_t k_sha256_iv[8] = {
0x6a09e667U, 0xbb67ae85U, 0x3c6ef372U, 0xa54ff53aU,
0x510e527fU, 0x9b05688cU, 0x1f83d9abU, 0x5be0cd19U
};
#if defined(__aarch64__) && defined(__ARM_FEATURE_CRYPTO)
static const uint32_t k_sha256[64] = {
0x428A2F98U, 0x71374491U, 0xB5C0FBCFU, 0xE9B5DBA5U,
0x3956C25BU, 0x59F111F1U, 0x923F82A4U, 0xAB1C5ED5U,
0xD807AA98U, 0x12835B01U, 0x243185BEU, 0x550C7DC3U,
0x72BE5D74U, 0x80DEB1FEU, 0x9BDC06A7U, 0xC19BF174U,
0xE49B69C1U, 0xEFBE4786U, 0x0FC19DC6U, 0x240CA1CCU,
0x2DE92C6FU, 0x4A7484AAU, 0x5CB0A9DCU, 0x76F988DAU,
0x983E5152U, 0xA831C66DU, 0xB00327C8U, 0xBF597FC7U,
0xC6E00BF3U, 0xD5A79147U, 0x06CA6351U, 0x14292967U,
0x27B70A85U, 0x2E1B2138U, 0x4D2C6DFCU, 0x53380D13U,
0x650A7354U, 0x766A0ABBU, 0x81C2C92EU, 0x92722C85U,
0xA2BFE8A1U, 0xA81A664BU, 0xC24B8B70U, 0xC76C51A3U,
0xD192E819U, 0xD6990624U, 0xF40E3585U, 0x106AA070U,
0x19A4C116U, 0x1E376C08U, 0x2748774CU, 0x34B0BCB5U,
0x391C0CB3U, 0x4ED8AA4AU, 0x5B9CCA4FU, 0x682E6FF3U,
0x748F82EEU, 0x78A5636FU, 0x84C87814U, 0x8CC70208U,
0x90BEFFFAU, 0xA4506CEBU, 0xBEF9A3F7U, 0xC67178F2U
};
#endif
static inline uint32_t bswap_u32(uint32_t x) {
return ((x & 0x000000FFU) << 24) |
((x & 0x0000FF00U) << 8) |
((x & 0x00FF0000U) >> 8) |
((x & 0xFF000000U) >> 24);
}
static inline void write_u32_be(uint8_t *dst, uint32_t v) {
dst[0] = (uint8_t)((v >> 24) & 0xFFU);
dst[1] = (uint8_t)((v >> 16) & 0xFFU);
dst[2] = (uint8_t)((v >> 8) & 0xFFU);
dst[3] = (uint8_t)(v & 0xFFU);
}
static inline void write_u32_le(uint8_t *dst, uint32_t v) {
dst[0] = (uint8_t)(v & 0xFFU);
dst[1] = (uint8_t)((v >> 8) & 0xFFU);
dst[2] = (uint8_t)((v >> 16) & 0xFFU);
dst[3] = (uint8_t)((v >> 24) & 0xFFU);
}
/* The nonce occupies header bytes 76..79, i.e. block1 bytes 12..15, stored
 * little-endian as in the raw 80-byte header. */
static inline void set_block1_nonce(uint8_t block1[64], uint32_t nonce) {
    write_u32_le(block1 + 12, nonce);
}
static inline int state_meets_target_words(const sha256_state_t *state, const uint32_t target_words[8]) {
int i;
for (i = 0; i < 8; i++) {
uint32_t hw = bswap_u32(state->h[7 - i]);
if (hw < target_words[i]) {
return 1;
}
if (hw > target_words[i]) {
return 0;
}
}
return 1;
}
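/*
 * Illustrative sketch (not called by the backend, compiled out by default):
 * how a caller could derive target_words for the comparison above, assuming
 * the 32-byte target is given as a 256-bit little-endian integer, the same
 * interpretation state_meets_target_words applies to the digest. The
 * comparison walks the hash from its most significant little-endian 32-bit
 * limb down, so target_words[0] must hold target bytes 28..31 read
 * little-endian, target_words[1] bytes 24..27, and so on. The helper name
 * and the SHA256_BACKEND_USAGE_EXAMPLE guard are hypothetical.
 */
#ifdef SHA256_BACKEND_USAGE_EXAMPLE
static void example_target_to_words(const uint8_t target[32], uint32_t target_words[8]) {
    int i;
    for (i = 0; i < 8; i++) {
        /* little-endian read of the i-th most significant 32-bit limb */
        const uint8_t *p = target + 28 - 4 * i;
        target_words[i] = (uint32_t)p[0] | ((uint32_t)p[1] << 8) |
                          ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24);
    }
}
#endif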
typedef void (*sha256_transform_fn_t)(sha256_state_t *state, const uint8_t block[64]);
typedef void (*sha256d80_hash4_fn_t)(const sha256d80_midstate_t *mid, uint32_t start_nonce, sha256_state_t out_states[4]);
/*
 * Portable fallback: drive OpenSSL's block function directly. This relies on
 * the legacy SHA256_Transform API and the SHA256_CTX field layout, which
 * OpenSSL 3.0 deprecates but still ships.
 */
static void sha256_transform_openssl(sha256_state_t *state, const uint8_t block[64]) {
SHA256_CTX ctx;
memcpy(ctx.h, state->h, sizeof(state->h));
ctx.Nl = 0;
ctx.Nh = 0;
ctx.num = 0;
ctx.md_len = SHA256_DIGEST_LENGTH;
SHA256_Transform(&ctx, block);
memcpy(state->h, ctx.h, sizeof(state->h));
}
#if defined(__aarch64__) && defined(__ARM_FEATURE_CRYPTO)
static void sha256_transform_armv8(sha256_state_t *state, const uint8_t block[64]) {
uint32x4_t STATE0, STATE1, ABEF_SAVE, CDGH_SAVE;
uint32x4_t MSG0, MSG1, MSG2, MSG3;
uint32x4_t TMP0, TMP1, TMP2;
STATE0 = vld1q_u32(&state->h[0]);
STATE1 = vld1q_u32(&state->h[4]);
ABEF_SAVE = STATE0;
CDGH_SAVE = STATE1;
MSG0 = vld1q_u32((const uint32_t *)(block + 0));
MSG1 = vld1q_u32((const uint32_t *)(block + 16));
MSG2 = vld1q_u32((const uint32_t *)(block + 32));
MSG3 = vld1q_u32((const uint32_t *)(block + 48));
MSG0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG0)));
MSG1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG1)));
MSG2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG2)));
MSG3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG3)));
TMP0 = vaddq_u32(MSG0, vld1q_u32(&k_sha256[0x00]));
MSG0 = vsha256su0q_u32(MSG0, MSG1);
TMP2 = STATE0;
TMP1 = vaddq_u32(MSG1, vld1q_u32(&k_sha256[0x04]));
STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);
MSG1 = vsha256su0q_u32(MSG1, MSG2);
TMP2 = STATE0;
TMP0 = vaddq_u32(MSG2, vld1q_u32(&k_sha256[0x08]));
STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);
MSG2 = vsha256su0q_u32(MSG2, MSG3);
TMP2 = STATE0;
TMP1 = vaddq_u32(MSG3, vld1q_u32(&k_sha256[0x0c]));
STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);
MSG3 = vsha256su0q_u32(MSG3, MSG0);
TMP2 = STATE0;
TMP0 = vaddq_u32(MSG0, vld1q_u32(&k_sha256[0x10]));
STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);
MSG0 = vsha256su0q_u32(MSG0, MSG1);
TMP2 = STATE0;
TMP1 = vaddq_u32(MSG1, vld1q_u32(&k_sha256[0x14]));
STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);
MSG1 = vsha256su0q_u32(MSG1, MSG2);
TMP2 = STATE0;
TMP0 = vaddq_u32(MSG2, vld1q_u32(&k_sha256[0x18]));
STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);
MSG2 = vsha256su0q_u32(MSG2, MSG3);
TMP2 = STATE0;
TMP1 = vaddq_u32(MSG3, vld1q_u32(&k_sha256[0x1c]));
STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);
MSG3 = vsha256su0q_u32(MSG3, MSG0);
TMP2 = STATE0;
TMP0 = vaddq_u32(MSG0, vld1q_u32(&k_sha256[0x20]));
STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);
MSG0 = vsha256su0q_u32(MSG0, MSG1);
TMP2 = STATE0;
TMP1 = vaddq_u32(MSG1, vld1q_u32(&k_sha256[0x24]));
STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);
MSG1 = vsha256su0q_u32(MSG1, MSG2);
TMP2 = STATE0;
TMP0 = vaddq_u32(MSG2, vld1q_u32(&k_sha256[0x28]));
STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);
MSG2 = vsha256su0q_u32(MSG2, MSG3);
TMP2 = STATE0;
TMP1 = vaddq_u32(MSG3, vld1q_u32(&k_sha256[0x2c]));
STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);
MSG3 = vsha256su0q_u32(MSG3, MSG0);
TMP2 = STATE0;
TMP0 = vaddq_u32(MSG0, vld1q_u32(&k_sha256[0x30]));
STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);
TMP2 = STATE0;
TMP1 = vaddq_u32(MSG1, vld1q_u32(&k_sha256[0x34]));
STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
TMP2 = STATE0;
TMP0 = vaddq_u32(MSG2, vld1q_u32(&k_sha256[0x38]));
STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
TMP2 = STATE0;
TMP1 = vaddq_u32(MSG3, vld1q_u32(&k_sha256[0x3c]));
STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
TMP2 = STATE0;
STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
STATE0 = vaddq_u32(STATE0, ABEF_SAVE);
STATE1 = vaddq_u32(STATE1, CDGH_SAVE);
vst1q_u32(&state->h[0], STATE0);
vst1q_u32(&state->h[4], STATE1);
}
/* Forward declaration (defined below) and an external 4-way kernel variant
 * defined elsewhere; the kernel symbol is not referenced in this file. */
void sha256d80_4way_aarch64_impl(const sha256d80_midstate_t *mid, uint32_t start_nonce, sha256_state_t out_states[4]);
extern void sha256d80_4way_aarch64_kernel(const sha256d80_midstate_t *mid, uint32_t start_nonce, sha256_state_t out_states[4]);
/*
* 2-way interleaved SHA256 block compression.
* Processes two independent (state, block) pairs simultaneously to hide the
* 2-cycle latency of vsha256hq_u32 on Cortex-A76 (latency=2, throughput=1).
* While chain A is waiting for its result, chain B's instruction issues,
* filling the pipeline bubble and approaching full throughput utilization.
*/
static void sha256_transform_armv8_2way(
sha256_state_t *stA, const uint8_t blkA[64],
sha256_state_t *stB, const uint8_t blkB[64]
) {
/* Chain A state */
uint32x4_t SA0, SA1, ABEF_A, CDGH_A;
uint32x4_t MA0, MA1, MA2, MA3;
uint32x4_t TA0, TA1, TA2;
/* Chain B state */
uint32x4_t SB0, SB1, ABEF_B, CDGH_B;
uint32x4_t MB0, MB1, MB2, MB3;
uint32x4_t TB0, TB1, TB2;
/* Load and byte-swap A */
SA0 = vld1q_u32(&stA->h[0]);
SA1 = vld1q_u32(&stA->h[4]);
ABEF_A = SA0; CDGH_A = SA1;
MA0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(vld1q_u32((const uint32_t *)(blkA + 0)))));
MA1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(vld1q_u32((const uint32_t *)(blkA + 16)))));
MA2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(vld1q_u32((const uint32_t *)(blkA + 32)))));
MA3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(vld1q_u32((const uint32_t *)(blkA + 48)))));
/* Load and byte-swap B */
SB0 = vld1q_u32(&stB->h[0]);
SB1 = vld1q_u32(&stB->h[4]);
ABEF_B = SB0; CDGH_B = SB1;
MB0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(vld1q_u32((const uint32_t *)(blkB + 0)))));
MB1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(vld1q_u32((const uint32_t *)(blkB + 16)))));
MB2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(vld1q_u32((const uint32_t *)(blkB + 32)))));
MB3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(vld1q_u32((const uint32_t *)(blkB + 48)))));
/* ---- Rounds 0-3 ---- */
TA0 = vaddq_u32(MA0, vld1q_u32(&k_sha256[0x00]));
MA0 = vsha256su0q_u32(MA0, MA1);
TA2 = SA0;
TB0 = vaddq_u32(MB0, vld1q_u32(&k_sha256[0x00]));
MB0 = vsha256su0q_u32(MB0, MB1);
TB2 = SB0;
SA0 = vsha256hq_u32(SA0, SA1, TA0);
SA1 = vsha256h2q_u32(SA1, TA2, TA0);
MA0 = vsha256su1q_u32(MA0, MA2, MA3);
SB0 = vsha256hq_u32(SB0, SB1, TB0);
SB1 = vsha256h2q_u32(SB1, TB2, TB0);
MB0 = vsha256su1q_u32(MB0, MB2, MB3);
/* ---- Rounds 4-7 ---- */
TA1 = vaddq_u32(MA1, vld1q_u32(&k_sha256[0x04]));
MA1 = vsha256su0q_u32(MA1, MA2);
TA2 = SA0;
TB1 = vaddq_u32(MB1, vld1q_u32(&k_sha256[0x04]));
MB1 = vsha256su0q_u32(MB1, MB2);
TB2 = SB0;
SA0 = vsha256hq_u32(SA0, SA1, TA1);
SA1 = vsha256h2q_u32(SA1, TA2, TA1);
MA1 = vsha256su1q_u32(MA1, MA3, MA0);
SB0 = vsha256hq_u32(SB0, SB1, TB1);
SB1 = vsha256h2q_u32(SB1, TB2, TB1);
MB1 = vsha256su1q_u32(MB1, MB3, MB0);
/* ---- Rounds 8-11 ---- */
TA0 = vaddq_u32(MA2, vld1q_u32(&k_sha256[0x08]));
MA2 = vsha256su0q_u32(MA2, MA3);
TA2 = SA0;
TB0 = vaddq_u32(MB2, vld1q_u32(&k_sha256[0x08]));
MB2 = vsha256su0q_u32(MB2, MB3);
TB2 = SB0;
SA0 = vsha256hq_u32(SA0, SA1, TA0);
SA1 = vsha256h2q_u32(SA1, TA2, TA0);
MA2 = vsha256su1q_u32(MA2, MA0, MA1);
SB0 = vsha256hq_u32(SB0, SB1, TB0);
SB1 = vsha256h2q_u32(SB1, TB2, TB0);
MB2 = vsha256su1q_u32(MB2, MB0, MB1);
/* ---- Rounds 12-15 ---- */
TA1 = vaddq_u32(MA3, vld1q_u32(&k_sha256[0x0c]));
MA3 = vsha256su0q_u32(MA3, MA0);
TA2 = SA0;
TB1 = vaddq_u32(MB3, vld1q_u32(&k_sha256[0x0c]));
MB3 = vsha256su0q_u32(MB3, MB0);
TB2 = SB0;
SA0 = vsha256hq_u32(SA0, SA1, TA1);
SA1 = vsha256h2q_u32(SA1, TA2, TA1);
MA3 = vsha256su1q_u32(MA3, MA1, MA2);
SB0 = vsha256hq_u32(SB0, SB1, TB1);
SB1 = vsha256h2q_u32(SB1, TB2, TB1);
MB3 = vsha256su1q_u32(MB3, MB1, MB2);
/* ---- Rounds 16-19 ---- */
TA0 = vaddq_u32(MA0, vld1q_u32(&k_sha256[0x10]));
MA0 = vsha256su0q_u32(MA0, MA1);
TA2 = SA0;
TB0 = vaddq_u32(MB0, vld1q_u32(&k_sha256[0x10]));
MB0 = vsha256su0q_u32(MB0, MB1);
TB2 = SB0;
SA0 = vsha256hq_u32(SA0, SA1, TA0);
SA1 = vsha256h2q_u32(SA1, TA2, TA0);
MA0 = vsha256su1q_u32(MA0, MA2, MA3);
SB0 = vsha256hq_u32(SB0, SB1, TB0);
SB1 = vsha256h2q_u32(SB1, TB2, TB0);
MB0 = vsha256su1q_u32(MB0, MB2, MB3);
/* ---- Rounds 20-23 ---- */
TA1 = vaddq_u32(MA1, vld1q_u32(&k_sha256[0x14]));
MA1 = vsha256su0q_u32(MA1, MA2);
TA2 = SA0;
TB1 = vaddq_u32(MB1, vld1q_u32(&k_sha256[0x14]));
MB1 = vsha256su0q_u32(MB1, MB2);
TB2 = SB0;
SA0 = vsha256hq_u32(SA0, SA1, TA1);
SA1 = vsha256h2q_u32(SA1, TA2, TA1);
MA1 = vsha256su1q_u32(MA1, MA3, MA0);
SB0 = vsha256hq_u32(SB0, SB1, TB1);
SB1 = vsha256h2q_u32(SB1, TB2, TB1);
MB1 = vsha256su1q_u32(MB1, MB3, MB0);
/* ---- Rounds 24-27 ---- */
TA0 = vaddq_u32(MA2, vld1q_u32(&k_sha256[0x18]));
MA2 = vsha256su0q_u32(MA2, MA3);
TA2 = SA0;
TB0 = vaddq_u32(MB2, vld1q_u32(&k_sha256[0x18]));
MB2 = vsha256su0q_u32(MB2, MB3);
TB2 = SB0;
SA0 = vsha256hq_u32(SA0, SA1, TA0);
SA1 = vsha256h2q_u32(SA1, TA2, TA0);
MA2 = vsha256su1q_u32(MA2, MA0, MA1);
SB0 = vsha256hq_u32(SB0, SB1, TB0);
SB1 = vsha256h2q_u32(SB1, TB2, TB0);
MB2 = vsha256su1q_u32(MB2, MB0, MB1);
/* ---- Rounds 28-31 ---- */
TA1 = vaddq_u32(MA3, vld1q_u32(&k_sha256[0x1c]));
MA3 = vsha256su0q_u32(MA3, MA0);
TA2 = SA0;
TB1 = vaddq_u32(MB3, vld1q_u32(&k_sha256[0x1c]));
MB3 = vsha256su0q_u32(MB3, MB0);
TB2 = SB0;
SA0 = vsha256hq_u32(SA0, SA1, TA1);
SA1 = vsha256h2q_u32(SA1, TA2, TA1);
MA3 = vsha256su1q_u32(MA3, MA1, MA2);
SB0 = vsha256hq_u32(SB0, SB1, TB1);
SB1 = vsha256h2q_u32(SB1, TB2, TB1);
MB3 = vsha256su1q_u32(MB3, MB1, MB2);
/* ---- Rounds 32-35 ---- */
TA0 = vaddq_u32(MA0, vld1q_u32(&k_sha256[0x20]));
MA0 = vsha256su0q_u32(MA0, MA1);
TA2 = SA0;
TB0 = vaddq_u32(MB0, vld1q_u32(&k_sha256[0x20]));
MB0 = vsha256su0q_u32(MB0, MB1);
TB2 = SB0;
SA0 = vsha256hq_u32(SA0, SA1, TA0);
SA1 = vsha256h2q_u32(SA1, TA2, TA0);
MA0 = vsha256su1q_u32(MA0, MA2, MA3);
SB0 = vsha256hq_u32(SB0, SB1, TB0);
SB1 = vsha256h2q_u32(SB1, TB2, TB0);
MB0 = vsha256su1q_u32(MB0, MB2, MB3);
/* ---- Rounds 36-39 ---- */
TA1 = vaddq_u32(MA1, vld1q_u32(&k_sha256[0x24]));
MA1 = vsha256su0q_u32(MA1, MA2);
TA2 = SA0;
TB1 = vaddq_u32(MB1, vld1q_u32(&k_sha256[0x24]));
MB1 = vsha256su0q_u32(MB1, MB2);
TB2 = SB0;
SA0 = vsha256hq_u32(SA0, SA1, TA1);
SA1 = vsha256h2q_u32(SA1, TA2, TA1);
MA1 = vsha256su1q_u32(MA1, MA3, MA0);
SB0 = vsha256hq_u32(SB0, SB1, TB1);
SB1 = vsha256h2q_u32(SB1, TB2, TB1);
MB1 = vsha256su1q_u32(MB1, MB3, MB0);
/* ---- Rounds 40-43 ---- */
TA0 = vaddq_u32(MA2, vld1q_u32(&k_sha256[0x28]));
MA2 = vsha256su0q_u32(MA2, MA3);
TA2 = SA0;
TB0 = vaddq_u32(MB2, vld1q_u32(&k_sha256[0x28]));
MB2 = vsha256su0q_u32(MB2, MB3);
TB2 = SB0;
SA0 = vsha256hq_u32(SA0, SA1, TA0);
SA1 = vsha256h2q_u32(SA1, TA2, TA0);
MA2 = vsha256su1q_u32(MA2, MA0, MA1);
SB0 = vsha256hq_u32(SB0, SB1, TB0);
SB1 = vsha256h2q_u32(SB1, TB2, TB0);
MB2 = vsha256su1q_u32(MB2, MB0, MB1);
/* ---- Rounds 44-47 ---- */
TA1 = vaddq_u32(MA3, vld1q_u32(&k_sha256[0x2c]));
MA3 = vsha256su0q_u32(MA3, MA0);
TA2 = SA0;
TB1 = vaddq_u32(MB3, vld1q_u32(&k_sha256[0x2c]));
MB3 = vsha256su0q_u32(MB3, MB0);
TB2 = SB0;
SA0 = vsha256hq_u32(SA0, SA1, TA1);
SA1 = vsha256h2q_u32(SA1, TA2, TA1);
MA3 = vsha256su1q_u32(MA3, MA1, MA2);
SB0 = vsha256hq_u32(SB0, SB1, TB1);
SB1 = vsha256h2q_u32(SB1, TB2, TB1);
MB3 = vsha256su1q_u32(MB3, MB1, MB2);
/* ---- Rounds 48-51 ---- */
TA0 = vaddq_u32(MA0, vld1q_u32(&k_sha256[0x30]));
TA2 = SA0;
TB0 = vaddq_u32(MB0, vld1q_u32(&k_sha256[0x30]));
TB2 = SB0;
SA0 = vsha256hq_u32(SA0, SA1, TA0);
SA1 = vsha256h2q_u32(SA1, TA2, TA0);
SB0 = vsha256hq_u32(SB0, SB1, TB0);
SB1 = vsha256h2q_u32(SB1, TB2, TB0);
/* ---- Rounds 52-55 ---- */
TA1 = vaddq_u32(MA1, vld1q_u32(&k_sha256[0x34]));
TA2 = SA0;
TB1 = vaddq_u32(MB1, vld1q_u32(&k_sha256[0x34]));
TB2 = SB0;
SA0 = vsha256hq_u32(SA0, SA1, TA1);
SA1 = vsha256h2q_u32(SA1, TA2, TA1);
SB0 = vsha256hq_u32(SB0, SB1, TB1);
SB1 = vsha256h2q_u32(SB1, TB2, TB1);
/* ---- Rounds 56-59 ---- */
TA0 = vaddq_u32(MA2, vld1q_u32(&k_sha256[0x38]));
TA2 = SA0;
TB0 = vaddq_u32(MB2, vld1q_u32(&k_sha256[0x38]));
TB2 = SB0;
SA0 = vsha256hq_u32(SA0, SA1, TA0);
SA1 = vsha256h2q_u32(SA1, TA2, TA0);
SB0 = vsha256hq_u32(SB0, SB1, TB0);
SB1 = vsha256h2q_u32(SB1, TB2, TB0);
/* ---- Rounds 60-63 ---- */
TA1 = vaddq_u32(MA3, vld1q_u32(&k_sha256[0x3c]));
TA2 = SA0;
TB1 = vaddq_u32(MB3, vld1q_u32(&k_sha256[0x3c]));
TB2 = SB0;
SA0 = vsha256hq_u32(SA0, SA1, TA1);
SA1 = vsha256h2q_u32(SA1, TA2, TA1);
SB0 = vsha256hq_u32(SB0, SB1, TB1);
SB1 = vsha256h2q_u32(SB1, TB2, TB1);
SA0 = vaddq_u32(SA0, ABEF_A);
SA1 = vaddq_u32(SA1, CDGH_A);
SB0 = vaddq_u32(SB0, ABEF_B);
SB1 = vaddq_u32(SB1, CDGH_B);
vst1q_u32(&stA->h[0], SA0);
vst1q_u32(&stA->h[4], SA1);
vst1q_u32(&stB->h[0], SB0);
vst1q_u32(&stB->h[4], SB1);
}
/*
 * 4-way double-SHA256 built on the 2-way interleaved transform.
 * Pairs (0,1) and (2,3) are processed simultaneously, hiding SHA2 instruction
 * latency. Reusing two block buffers across the 4 nonces cuts the copies to
 * 2x64 + 2x32 = 192 bytes per group, versus 4x64 + 4x64 = 512 in the
 * generic path.
 */
static void sha256d80_hash_4way_armv8_2way(
const sha256d80_midstate_t *mid,
uint32_t start_nonce,
sha256_state_t out_states[4]
) {
/* Two block1 buffers reused for both pairs */
uint8_t block1A[64], block1B[64];
/* Two block2 buffers: only bytes 0-31 (digest) vary; bytes 32-63 are constant */
uint8_t block2A[64], block2B[64];
/* Copy template once each (1x instead of 4x) */
memcpy(block1A, mid->block1_template, 64);
memcpy(block1B, mid->block1_template, 64);
/* Copy only the constant padding half of block2 template (32 bytes each) */
memcpy(block2A + 32, mid->block2_template + 32, 32);
memcpy(block2B + 32, mid->block2_template + 32, 32);
/* --- Pass 1, pair (0,1) --- */
out_states[0] = mid->first_chunk_state;
out_states[1] = mid->first_chunk_state;
set_block1_nonce(block1A, start_nonce + 0U);
set_block1_nonce(block1B, start_nonce + 1U);
sha256_transform_armv8_2way(&out_states[0], block1A, &out_states[1], block1B);
/* --- Pass 1, pair (2,3) --- */
out_states[2] = mid->first_chunk_state;
out_states[3] = mid->first_chunk_state;
set_block1_nonce(block1A, start_nonce + 2U);
set_block1_nonce(block1B, start_nonce + 3U);
sha256_transform_armv8_2way(&out_states[2], block1A, &out_states[3], block1B);
/* --- Pass 2, pair (0,1): write digest directly into block2 bytes 0-31 --- */
sha256_state_to_digest(&out_states[0], block2A);
sha256_state_to_digest(&out_states[1], block2B);
out_states[0] = mid->init_state;
out_states[1] = mid->init_state;
sha256_transform_armv8_2way(&out_states[0], block2A, &out_states[1], block2B);
/* --- Pass 2, pair (2,3) --- */
sha256_state_to_digest(&out_states[2], block2A);
sha256_state_to_digest(&out_states[3], block2B);
out_states[2] = mid->init_state;
out_states[3] = mid->init_state;
sha256_transform_armv8_2way(&out_states[2], block2A, &out_states[3], block2B);
}
#endif
static sha256_transform_fn_t g_transform_fn = sha256_transform_openssl;
static sha256d80_hash4_fn_t g_hash4_fn; /* stays NULL until sha256_backend_init_once runs */
static pthread_once_t g_backend_once = PTHREAD_ONCE_INIT;
static int cpu_has_sha2(void) {
#if defined(__aarch64__) && defined(HWCAP_SHA2)
unsigned long caps = getauxval(AT_HWCAP);
return (caps & HWCAP_SHA2) != 0UL;
#else
return 0;
#endif
}
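/*
 * SHA256d of an 80-byte header splits into:
 *   pass 1, chunk 1: header bytes 0..63 (precomputed into first_chunk_state)
 *   pass 1, chunk 2: header bytes 64..75, 4-byte nonce, 0x80 pad byte,
 *                    64-bit big-endian length 640 bits = 0x0280 (bytes 62..63)
 *   pass 2, one chunk: 32-byte digest, 0x80 pad byte,
 *                    64-bit big-endian length 256 bits = 0x0100 (bytes 62..63)
 * The templates below bake in everything except the nonce and the digest.
 */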
static void sha256d80_midstate_init_with_fn(
sha256d80_midstate_t *mid,
const uint8_t header_76[76],
sha256_transform_fn_t tf
) {
sha256_state_init(&mid->init_state);
mid->first_chunk_state = mid->init_state;
tf(&mid->first_chunk_state, header_76);
memset(mid->block1_template, 0, sizeof(mid->block1_template));
memcpy(mid->block1_template, header_76 + 64, 12);
mid->block1_template[16] = 0x80;
mid->block1_template[62] = 0x02;
mid->block1_template[63] = 0x80;
memset(mid->block2_template, 0, sizeof(mid->block2_template));
mid->block2_template[32] = 0x80;
mid->block2_template[62] = 0x01;
mid->block2_template[63] = 0x00;
}
static void sha256d80_hash_4way_with_fn(
const sha256d80_midstate_t *mid,
uint32_t start_nonce,
sha256_state_t out_states[4],
sha256_transform_fn_t tf
) {
uint8_t block1[4][64];
uint8_t block2[4][64];
int i;
for (i = 0; i < 4; i++) {
out_states[i] = mid->first_chunk_state;
memcpy(block1[i], mid->block1_template, sizeof(mid->block1_template));
memcpy(block2[i], mid->block2_template, sizeof(mid->block2_template));
}
set_block1_nonce(block1[0], start_nonce + 0U);
set_block1_nonce(block1[1], start_nonce + 1U);
set_block1_nonce(block1[2], start_nonce + 2U);
set_block1_nonce(block1[3], start_nonce + 3U);
tf(&out_states[0], block1[0]);
tf(&out_states[1], block1[1]);
tf(&out_states[2], block1[2]);
tf(&out_states[3], block1[3]);
sha256_state_to_digest(&out_states[0], block2[0]);
sha256_state_to_digest(&out_states[1], block2[1]);
sha256_state_to_digest(&out_states[2], block2[2]);
sha256_state_to_digest(&out_states[3], block2[3]);
out_states[0] = mid->init_state;
out_states[1] = mid->init_state;
out_states[2] = mid->init_state;
out_states[3] = mid->init_state;
tf(&out_states[0], block2[0]);
tf(&out_states[1], block2[1]);
tf(&out_states[2], block2[2]);
tf(&out_states[3], block2[3]);
}
static void sha256d80_hash_4way_generic(const sha256d80_midstate_t *mid, uint32_t start_nonce, sha256_state_t out_states[4]) {
sha256d80_hash_4way_with_fn(mid, start_nonce, out_states, g_transform_fn);
}
#if defined(__aarch64__) && defined(__ARM_FEATURE_CRYPTO)
void sha256d80_4way_aarch64_impl(const sha256d80_midstate_t *mid, uint32_t start_nonce, sha256_state_t out_states[4]) {
sha256d80_hash_4way_with_fn(mid, start_nonce, out_states, sha256_transform_armv8);
}
static int arm_transform_selftest(void) {
uint8_t block[64];
sha256_state_t ref;
sha256_state_t test;
int i;
sha256_state_init(&ref);
sha256_state_init(&test);
for (i = 0; i < 64; i++) {
block[i] = (uint8_t)(i * 3 + 1);
}
sha256_transform_openssl(&ref, block);
sha256_transform_armv8(&test, block);
return memcmp(ref.h, test.h, sizeof(ref.h)) == 0;
}
static int arm_hash4_selftest(void) {
uint8_t header_76[76];
sha256d80_midstate_t mid;
sha256_state_t ref[4];
sha256_state_t test[4];
int i;
for (i = 0; i < 76; i++) {
header_76[i] = (uint8_t)(i * 7 + 11);
}
sha256d80_midstate_init_with_fn(&mid, header_76, sha256_transform_openssl);
sha256d80_hash_4way_with_fn(&mid, 0x10203040U, ref, sha256_transform_openssl);
sha256d80_hash_4way_armv8_2way(&mid, 0x10203040U, test);
return memcmp(ref, test, sizeof(ref)) == 0;
}
#endif
static void sha256_backend_init_once(void) {
g_hash4_fn = sha256d80_hash_4way_generic;
g_transform_fn = sha256_transform_openssl;
#if defined(__aarch64__) && defined(__ARM_FEATURE_CRYPTO)
if (cpu_has_sha2() && arm_transform_selftest() && arm_hash4_selftest()) {
g_transform_fn = sha256_transform_armv8;
g_hash4_fn = sha256d80_hash_4way_armv8_2way;
fprintf(stderr, "[miner] backend SHA: ARMv8 2-way interleaved enabled\n");
} else {
fprintf(stderr, "[miner] backend SHA: OpenSSL fallback\n");
}
#else
fprintf(stderr, "[miner] backend SHA: OpenSSL fallback\n");
#endif
}
void sha256_state_init(sha256_state_t *state) {
memcpy(state->h, k_sha256_iv, sizeof(state->h));
}
void sha256_state_to_digest(const sha256_state_t *state, uint8_t out[32]) {
int i;
for (i = 0; i < 8; i++) {
write_u32_be(out + i * 4, state->h[i]);
}
}
void sha256_transform_fast(sha256_state_t *state, const uint8_t block[64]) {
pthread_once(&g_backend_once, sha256_backend_init_once);
g_transform_fn(state, block);
}
void sha256_transform_fast_2way(
sha256_state_t *stA, const uint8_t blkA[64],
sha256_state_t *stB, const uint8_t blkB[64]
) {
pthread_once(&g_backend_once, sha256_backend_init_once);
#if defined(__aarch64__) && defined(__ARM_FEATURE_CRYPTO)
if (g_transform_fn == sha256_transform_armv8) {
sha256_transform_armv8_2way(stA, blkA, stB, blkB);
return;
}
#endif
g_transform_fn(stA, blkA);
g_transform_fn(stB, blkB);
}
void sha256d80_midstate_init(sha256d80_midstate_t *mid, const uint8_t header_76[76]) {
pthread_once(&g_backend_once, sha256_backend_init_once);
sha256d80_midstate_init_with_fn(mid, header_76, g_transform_fn);
}
void sha256d80_hash_4way(
const sha256d80_midstate_t *mid,
uint32_t start_nonce,
sha256_state_t out_states[4]
) {
pthread_once(&g_backend_once, sha256_backend_init_once);
g_hash4_fn(mid, start_nonce, out_states);
}
/* Shared by the guarded and direct scan entry points: bit i of the mask is
 * set when lane i's hash meets the target. */
static uint32_t states_target_mask(
    const sha256_state_t out_states[4],
    const uint32_t target_words[8]
) {
    uint32_t mask = 0;
    int i;
    for (i = 0; i < 4; i++) {
        if (state_meets_target_words(&out_states[i], target_words)) {
            mask |= 1U << i;
        }
    }
    return mask;
}
uint32_t sha256d80_scan_4way(
    const sha256d80_midstate_t *mid,
    uint32_t start_nonce,
    const uint32_t target_words[8],
    sha256_state_t out_states[4]
) {
    pthread_once(&g_backend_once, sha256_backend_init_once);
    g_hash4_fn(mid, start_nonce, out_states);
    return states_target_mask(out_states, target_words);
}
void sha256_backend_ensure_init(void) {
pthread_once(&g_backend_once, sha256_backend_init_once);
}
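/*
 * Hot-path variant of sha256d80_scan_4way: skips the pthread_once guard
 * (an LDAR acquire barrier on AArch64) on every call. The caller must have
 * run sha256_backend_ensure_init() (or any other guarded entry point) first,
 * otherwise g_hash4_fn is still NULL.
 */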
uint32_t sha256d80_scan_4way_direct(
    const sha256d80_midstate_t *mid,
    uint32_t start_nonce,
    const uint32_t target_words[8],
    sha256_state_t out_states[4]
) {
    g_hash4_fn(mid, start_nonce, out_states);
    return states_target_mask(out_states, target_words);
}
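/*
 * Usage sketch (illustrative, compiled out by default): the intended hot-loop
 * pattern behind scan_4way_direct. Pay the pthread_once cost once up front
 * via sha256_backend_ensure_init(), then call the direct scanner per 4-nonce
 * group. example_find_nonce and its driver loop are hypothetical, not part
 * of the backend API.
 */
#ifdef SHA256_BACKEND_USAGE_EXAMPLE
static int example_find_nonce(
    const uint8_t header_76[76],
    const uint32_t target_words[8],
    uint32_t *found_nonce
) {
    sha256d80_midstate_t mid;
    sha256_state_t states[4];
    uint32_t nonce;
    /* One-time init: resolves g_hash4_fn so the direct path is safe to call. */
    sha256_backend_ensure_init();
    sha256d80_midstate_init(&mid, header_76);
    /* Steps of 4; the last partial group near UINT32_MAX is skipped to keep
     * the sketch free of wraparound. */
    for (nonce = 0; nonce <= 0xFFFFFFFBU; nonce += 4U) {
        uint32_t mask = sha256d80_scan_4way_direct(&mid, nonce, target_words, states);
        if (mask != 0U) {
            /* Lowest set bit picks the first matching lane (GCC/Clang builtin). */
            *found_nonce = nonce + (uint32_t)__builtin_ctz(mask);
            return 1;
        }
    }
    return 0;
}
#endif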