/*
 * SHA-256 backend with runtime CPU dispatch.
 *
 * Processes two independent SHA-256 chains simultaneously to hide the
 * 2-cycle latency of vsha256hq_u32 on Cortex-A76, approaching full
 * throughput. Also reduces memcpy from 512 to ~192 bytes per 4-nonce group
 * by reusing block buffers, and adds sha256d80_scan_4way_direct to bypass
 * pthread_once (LDAR barrier) on every inner-loop call.
 */
#include "sha256_backend.h"
|
|
|
|
#include <openssl/sha.h>
|
|
#include <pthread.h>
|
|
#include <stdio.h>
|
|
#include <string.h>
|
|
|
|
#if defined(__aarch64__)
|
|
#include <sys/auxv.h>
|
|
#include <asm/hwcap.h>
|
|
#endif
|
|
|
|
#if defined(__aarch64__) && defined(__ARM_FEATURE_CRYPTO)
|
|
#include <arm_neon.h>
|
|
#endif
|
|
|
|
/* SHA-256 initial hash values H0..H7 (FIPS 180-4, section 5.3.3). */
static const uint32_t k_sha256_iv[8] = {
    0x6a09e667U, 0xbb67ae85U, 0x3c6ef372U, 0xa54ff53aU,
    0x510e527fU, 0x9b05688cU, 0x1f83d9abU, 0x5be0cd19U
};
|
#if defined(__aarch64__) && defined(__ARM_FEATURE_CRYPTO)
/* SHA-256 round constants K0..K63 (FIPS 180-4, section 4.2.2).
 * Only the NEON paths load these; OpenSSL carries its own copy. */
static const uint32_t k_sha256[64] = {
    0x428A2F98U, 0x71374491U, 0xB5C0FBCFU, 0xE9B5DBA5U,
    0x3956C25BU, 0x59F111F1U, 0x923F82A4U, 0xAB1C5ED5U,
    0xD807AA98U, 0x12835B01U, 0x243185BEU, 0x550C7DC3U,
    0x72BE5D74U, 0x80DEB1FEU, 0x9BDC06A7U, 0xC19BF174U,
    0xE49B69C1U, 0xEFBE4786U, 0x0FC19DC6U, 0x240CA1CCU,
    0x2DE92C6FU, 0x4A7484AAU, 0x5CB0A9DCU, 0x76F988DAU,
    0x983E5152U, 0xA831C66DU, 0xB00327C8U, 0xBF597FC7U,
    0xC6E00BF3U, 0xD5A79147U, 0x06CA6351U, 0x14292967U,
    0x27B70A85U, 0x2E1B2138U, 0x4D2C6DFCU, 0x53380D13U,
    0x650A7354U, 0x766A0ABBU, 0x81C2C92EU, 0x92722C85U,
    0xA2BFE8A1U, 0xA81A664BU, 0xC24B8B70U, 0xC76C51A3U,
    0xD192E819U, 0xD6990624U, 0xF40E3585U, 0x106AA070U,
    0x19A4C116U, 0x1E376C08U, 0x2748774CU, 0x34B0BCB5U,
    0x391C0CB3U, 0x4ED8AA4AU, 0x5B9CCA4FU, 0x682E6FF3U,
    0x748F82EEU, 0x78A5636FU, 0x84C87814U, 0x8CC70208U,
    0x90BEFFFAU, 0xA4506CEBU, 0xBEF9A3F7U, 0xC67178F2U
};
#endif
|
/* Reverse the byte order of a 32-bit word (endianness swap). */
static inline uint32_t bswap_u32(uint32_t x) {
    uint32_t swapped = 0;

    swapped |= (x >> 24) & 0x000000FFU;
    swapped |= (x >> 8)  & 0x0000FF00U;
    swapped |= (x << 8)  & 0x00FF0000U;
    swapped |= (x << 24) & 0xFF000000U;
    return swapped;
}
|
/* Store a 32-bit value to dst[0..3] in big-endian byte order. */
static inline void write_u32_be(uint8_t *dst, uint32_t v) {
    int i;

    for (i = 0; i < 4; i++) {
        dst[i] = (uint8_t)(v >> (24 - 8 * i));
    }
}
|
/* Store a 32-bit value to dst[0..3] in little-endian byte order. */
static inline void write_u32_le(uint8_t *dst, uint32_t v) {
    int i;

    for (i = 0; i < 4; i++) {
        dst[i] = (uint8_t)(v & 0xFFU);
        v >>= 8;
    }
}
|
/* Patch the nonce into a prepared final header block. Header offset 76
 * maps to offset 12 within this 64-byte block; nonce is little-endian. */
static inline void set_block1_nonce(uint8_t block1[64], uint32_t nonce) {
    write_u32_le(block1 + 12, nonce);
}
|
/*
 * Compare a finished double-SHA256 state against a 256-bit target.
 * target_words[0] is the most-significant 32-bit limb. bswap_u32(h[7 - i])
 * re-reads the state words in the reversed, byte-swapped order used by
 * proof-of-work style targets, so the comparison is a plain lexicographic
 * compare from most-significant limb down.
 * Returns 1 when hash <= target, 0 otherwise.
 */
static inline int state_meets_target_words(const sha256_state_t *state, const uint32_t target_words[8]) {
    int i;

    for (i = 0; i < 8; i++) {
        uint32_t hw = bswap_u32(state->h[7 - i]);
        if (hw < target_words[i]) {
            return 1;  /* strictly below target at the first differing limb */
        }
        if (hw > target_words[i]) {
            return 0;
        }
    }

    /* All limbs equal: hash == target counts as meeting the target. */
    return 1;
}
|
/* Single-block SHA-256 compression: folds one 64-byte block into *state. */
typedef void (*sha256_transform_fn_t)(sha256_state_t *state, const uint8_t block[64]);

/* Double-SHA256 of an 80-byte header for 4 consecutive nonces starting at start_nonce. */
typedef void (*sha256d80_hash4_fn_t)(const sha256d80_midstate_t *mid, uint32_t start_nonce, sha256_state_t out_states[4]);
|
/*
 * Portable fallback: run one SHA-256 compression through OpenSSL's
 * low-level SHA256_Transform on a throwaway context seeded from *state.
 * NOTE(review): direct SHA256_CTX field access and SHA256_Transform are
 * deprecated as of OpenSSL 3.0 (still functional); confirm against the
 * project's minimum supported OpenSSL version.
 */
static void sha256_transform_openssl(sha256_state_t *state, const uint8_t block[64]) {
    SHA256_CTX ctx;

    /* Seed the chaining value; the length/buffer fields are irrelevant for a
     * raw single-block transform but are zeroed for hygiene. */
    memcpy(ctx.h, state->h, sizeof(state->h));
    ctx.Nl = 0;
    ctx.Nh = 0;
    ctx.num = 0;
    ctx.md_len = SHA256_DIGEST_LENGTH;
    SHA256_Transform(&ctx, block);
    memcpy(state->h, ctx.h, sizeof(state->h));
}
|
#if defined(__aarch64__) && defined(__ARM_FEATURE_CRYPTO)
|
|
/*
 * One SHA-256 compression using the ARMv8 Crypto Extensions.
 * STATE0 holds lanes {a,b,e,f} and STATE1 holds {c,d,g,h}, the layout
 * required by the vsha256hq_u32 / vsha256h2q_u32 pair. Each pair retires
 * four rounds; vsha256su0q_u32 / vsha256su1q_u32 extend the message
 * schedule for rounds 16..63 in flight. TMP2 saves STATE0 before the
 * "h" update because vsha256h2q_u32 needs the pre-update value.
 */
static void sha256_transform_armv8(sha256_state_t *state, const uint8_t block[64]) {
    uint32x4_t STATE0, STATE1, ABEF_SAVE, CDGH_SAVE;
    uint32x4_t MSG0, MSG1, MSG2, MSG3;
    uint32x4_t TMP0, TMP1, TMP2;

    /* Load incoming chaining value; keep a copy for the final feed-forward add. */
    STATE0 = vld1q_u32(&state->h[0]);
    STATE1 = vld1q_u32(&state->h[4]);

    ABEF_SAVE = STATE0;
    CDGH_SAVE = STATE1;

    /* Load the 64-byte block (AArch64 vld1q tolerates unaligned pointers). */
    MSG0 = vld1q_u32((const uint32_t *)(block + 0));
    MSG1 = vld1q_u32((const uint32_t *)(block + 16));
    MSG2 = vld1q_u32((const uint32_t *)(block + 32));
    MSG3 = vld1q_u32((const uint32_t *)(block + 48));

    /* Byte-swap each 32-bit word: SHA-256 consumes big-endian words. */
    MSG0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG0)));
    MSG1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG1)));
    MSG2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG2)));
    MSG3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG3)));

    /* Rounds 0-3 */
    TMP0 = vaddq_u32(MSG0, vld1q_u32(&k_sha256[0x00]));
    MSG0 = vsha256su0q_u32(MSG0, MSG1);
    TMP2 = STATE0;
    TMP1 = vaddq_u32(MSG1, vld1q_u32(&k_sha256[0x04]));
    STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
    STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
    MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);

    /* Rounds 4-7 */
    MSG1 = vsha256su0q_u32(MSG1, MSG2);
    TMP2 = STATE0;
    TMP0 = vaddq_u32(MSG2, vld1q_u32(&k_sha256[0x08]));
    STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
    STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
    MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);

    /* Rounds 8-11 */
    MSG2 = vsha256su0q_u32(MSG2, MSG3);
    TMP2 = STATE0;
    TMP1 = vaddq_u32(MSG3, vld1q_u32(&k_sha256[0x0c]));
    STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
    STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
    MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);

    /* Rounds 12-15 */
    MSG3 = vsha256su0q_u32(MSG3, MSG0);
    TMP2 = STATE0;
    TMP0 = vaddq_u32(MSG0, vld1q_u32(&k_sha256[0x10]));
    STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
    STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
    MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);

    /* Rounds 16-19 */
    MSG0 = vsha256su0q_u32(MSG0, MSG1);
    TMP2 = STATE0;
    TMP1 = vaddq_u32(MSG1, vld1q_u32(&k_sha256[0x14]));
    STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
    STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
    MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);

    /* Rounds 20-23 */
    MSG1 = vsha256su0q_u32(MSG1, MSG2);
    TMP2 = STATE0;
    TMP0 = vaddq_u32(MSG2, vld1q_u32(&k_sha256[0x18]));
    STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
    STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
    MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);

    /* Rounds 24-27 */
    MSG2 = vsha256su0q_u32(MSG2, MSG3);
    TMP2 = STATE0;
    TMP1 = vaddq_u32(MSG3, vld1q_u32(&k_sha256[0x1c]));
    STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
    STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
    MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);

    /* Rounds 28-31 */
    MSG3 = vsha256su0q_u32(MSG3, MSG0);
    TMP2 = STATE0;
    TMP0 = vaddq_u32(MSG0, vld1q_u32(&k_sha256[0x20]));
    STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
    STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
    MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);

    /* Rounds 32-35 */
    MSG0 = vsha256su0q_u32(MSG0, MSG1);
    TMP2 = STATE0;
    TMP1 = vaddq_u32(MSG1, vld1q_u32(&k_sha256[0x24]));
    STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
    STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
    MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);

    /* Rounds 36-39 */
    MSG1 = vsha256su0q_u32(MSG1, MSG2);
    TMP2 = STATE0;
    TMP0 = vaddq_u32(MSG2, vld1q_u32(&k_sha256[0x28]));
    STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
    STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
    MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);

    /* Rounds 40-43 */
    MSG2 = vsha256su0q_u32(MSG2, MSG3);
    TMP2 = STATE0;
    TMP1 = vaddq_u32(MSG3, vld1q_u32(&k_sha256[0x2c]));
    STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
    STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
    MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);

    /* Rounds 44-47: last schedule extension; MSG3 completes W60..W63. */
    MSG3 = vsha256su0q_u32(MSG3, MSG0);
    TMP2 = STATE0;
    TMP0 = vaddq_u32(MSG0, vld1q_u32(&k_sha256[0x30]));
    STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
    STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
    MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);

    /* Rounds 48-51 */
    TMP2 = STATE0;
    TMP1 = vaddq_u32(MSG1, vld1q_u32(&k_sha256[0x34]));
    STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
    STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);

    /* Rounds 52-55 */
    TMP2 = STATE0;
    TMP0 = vaddq_u32(MSG2, vld1q_u32(&k_sha256[0x38]));
    STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
    STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);

    /* Rounds 56-59 */
    TMP2 = STATE0;
    TMP1 = vaddq_u32(MSG3, vld1q_u32(&k_sha256[0x3c]));
    STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
    STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);

    /* Rounds 60-63 */
    TMP2 = STATE0;
    STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
    STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);

    /* Davies-Meyer feed-forward: add the saved input chaining value. */
    STATE0 = vaddq_u32(STATE0, ABEF_SAVE);
    STATE1 = vaddq_u32(STATE1, CDGH_SAVE);

    vst1q_u32(&state->h[0], STATE0);
    vst1q_u32(&state->h[4], STATE1);
}
|
/* Defined below: reference 4-way path routed through the scalar NEON transform. */
void sha256d80_4way_aarch64_impl(const sha256d80_midstate_t *mid, uint32_t start_nonce, sha256_state_t out_states[4]);

/* NOTE(review): external asm kernel is declared here but never referenced in
 * this file — confirm whether it is still wired up elsewhere or is dead. */
extern void sha256d80_4way_aarch64_kernel(const sha256d80_midstate_t *mid, uint32_t start_nonce, sha256_state_t out_states[4]);
|
/*
 * 2-way interleaved SHA256 block compression.
 * Processes two independent (state, block) pairs simultaneously to hide the
 * 2-cycle latency of vsha256hq_u32 on Cortex-A76 (latency=2, throughput=1).
 * While chain A is waiting for its result, chain B's instruction issues,
 * filling the pipeline bubble and approaching full throughput utilization.
 * The 24 live NEON values fit comfortably in AArch64's 32 vector registers.
 * TA2/TB2 save each chain's pre-update STATE0, which vsha256h2q_u32 needs.
 */
static void sha256_transform_armv8_2way(
    sha256_state_t *stA, const uint8_t blkA[64],
    sha256_state_t *stB, const uint8_t blkB[64]
) {
    /* Chain A state */
    uint32x4_t SA0, SA1, ABEF_A, CDGH_A;
    uint32x4_t MA0, MA1, MA2, MA3;
    uint32x4_t TA0, TA1, TA2;
    /* Chain B state */
    uint32x4_t SB0, SB1, ABEF_B, CDGH_B;
    uint32x4_t MB0, MB1, MB2, MB3;
    uint32x4_t TB0, TB1, TB2;

    /* Load and byte-swap A (SHA-256 consumes big-endian message words) */
    SA0 = vld1q_u32(&stA->h[0]);
    SA1 = vld1q_u32(&stA->h[4]);
    ABEF_A = SA0; CDGH_A = SA1;
    MA0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(vld1q_u32((const uint32_t *)(blkA + 0)))));
    MA1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(vld1q_u32((const uint32_t *)(blkA + 16)))));
    MA2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(vld1q_u32((const uint32_t *)(blkA + 32)))));
    MA3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(vld1q_u32((const uint32_t *)(blkA + 48)))));

    /* Load and byte-swap B */
    SB0 = vld1q_u32(&stB->h[0]);
    SB1 = vld1q_u32(&stB->h[4]);
    ABEF_B = SB0; CDGH_B = SB1;
    MB0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(vld1q_u32((const uint32_t *)(blkB + 0)))));
    MB1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(vld1q_u32((const uint32_t *)(blkB + 16)))));
    MB2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(vld1q_u32((const uint32_t *)(blkB + 32)))));
    MB3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(vld1q_u32((const uint32_t *)(blkB + 48)))));

    /* Each group below mirrors one 4-round group of the 1-way kernel, with
     * A's and B's independent instructions interleaved so the two vsha256hq
     * dependency chains alternate in the pipeline. */

    /* ---- Rounds 0-3 ---- */
    TA0 = vaddq_u32(MA0, vld1q_u32(&k_sha256[0x00]));
    MA0 = vsha256su0q_u32(MA0, MA1);
    TA2 = SA0;
    TB0 = vaddq_u32(MB0, vld1q_u32(&k_sha256[0x00]));
    MB0 = vsha256su0q_u32(MB0, MB1);
    TB2 = SB0;
    SA0 = vsha256hq_u32(SA0, SA1, TA0);
    SA1 = vsha256h2q_u32(SA1, TA2, TA0);
    MA0 = vsha256su1q_u32(MA0, MA2, MA3);
    SB0 = vsha256hq_u32(SB0, SB1, TB0);
    SB1 = vsha256h2q_u32(SB1, TB2, TB0);
    MB0 = vsha256su1q_u32(MB0, MB2, MB3);

    /* ---- Rounds 4-7 ---- */
    TA1 = vaddq_u32(MA1, vld1q_u32(&k_sha256[0x04]));
    MA1 = vsha256su0q_u32(MA1, MA2);
    TA2 = SA0;
    TB1 = vaddq_u32(MB1, vld1q_u32(&k_sha256[0x04]));
    MB1 = vsha256su0q_u32(MB1, MB2);
    TB2 = SB0;
    SA0 = vsha256hq_u32(SA0, SA1, TA1);
    SA1 = vsha256h2q_u32(SA1, TA2, TA1);
    MA1 = vsha256su1q_u32(MA1, MA3, MA0);
    SB0 = vsha256hq_u32(SB0, SB1, TB1);
    SB1 = vsha256h2q_u32(SB1, TB2, TB1);
    MB1 = vsha256su1q_u32(MB1, MB3, MB0);

    /* ---- Rounds 8-11 ---- */
    TA0 = vaddq_u32(MA2, vld1q_u32(&k_sha256[0x08]));
    MA2 = vsha256su0q_u32(MA2, MA3);
    TA2 = SA0;
    TB0 = vaddq_u32(MB2, vld1q_u32(&k_sha256[0x08]));
    MB2 = vsha256su0q_u32(MB2, MB3);
    TB2 = SB0;
    SA0 = vsha256hq_u32(SA0, SA1, TA0);
    SA1 = vsha256h2q_u32(SA1, TA2, TA0);
    MA2 = vsha256su1q_u32(MA2, MA0, MA1);
    SB0 = vsha256hq_u32(SB0, SB1, TB0);
    SB1 = vsha256h2q_u32(SB1, TB2, TB0);
    MB2 = vsha256su1q_u32(MB2, MB0, MB1);

    /* ---- Rounds 12-15 ---- */
    TA1 = vaddq_u32(MA3, vld1q_u32(&k_sha256[0x0c]));
    MA3 = vsha256su0q_u32(MA3, MA0);
    TA2 = SA0;
    TB1 = vaddq_u32(MB3, vld1q_u32(&k_sha256[0x0c]));
    MB3 = vsha256su0q_u32(MB3, MB0);
    TB2 = SB0;
    SA0 = vsha256hq_u32(SA0, SA1, TA1);
    SA1 = vsha256h2q_u32(SA1, TA2, TA1);
    MA3 = vsha256su1q_u32(MA3, MA1, MA2);
    SB0 = vsha256hq_u32(SB0, SB1, TB1);
    SB1 = vsha256h2q_u32(SB1, TB2, TB1);
    MB3 = vsha256su1q_u32(MB3, MB1, MB2);

    /* ---- Rounds 16-19 ---- */
    TA0 = vaddq_u32(MA0, vld1q_u32(&k_sha256[0x10]));
    MA0 = vsha256su0q_u32(MA0, MA1);
    TA2 = SA0;
    TB0 = vaddq_u32(MB0, vld1q_u32(&k_sha256[0x10]));
    MB0 = vsha256su0q_u32(MB0, MB1);
    TB2 = SB0;
    SA0 = vsha256hq_u32(SA0, SA1, TA0);
    SA1 = vsha256h2q_u32(SA1, TA2, TA0);
    MA0 = vsha256su1q_u32(MA0, MA2, MA3);
    SB0 = vsha256hq_u32(SB0, SB1, TB0);
    SB1 = vsha256h2q_u32(SB1, TB2, TB0);
    MB0 = vsha256su1q_u32(MB0, MB2, MB3);

    /* ---- Rounds 20-23 ---- */
    TA1 = vaddq_u32(MA1, vld1q_u32(&k_sha256[0x14]));
    MA1 = vsha256su0q_u32(MA1, MA2);
    TA2 = SA0;
    TB1 = vaddq_u32(MB1, vld1q_u32(&k_sha256[0x14]));
    MB1 = vsha256su0q_u32(MB1, MB2);
    TB2 = SB0;
    SA0 = vsha256hq_u32(SA0, SA1, TA1);
    SA1 = vsha256h2q_u32(SA1, TA2, TA1);
    MA1 = vsha256su1q_u32(MA1, MA3, MA0);
    SB0 = vsha256hq_u32(SB0, SB1, TB1);
    SB1 = vsha256h2q_u32(SB1, TB2, TB1);
    MB1 = vsha256su1q_u32(MB1, MB3, MB0);

    /* ---- Rounds 24-27 ---- */
    TA0 = vaddq_u32(MA2, vld1q_u32(&k_sha256[0x18]));
    MA2 = vsha256su0q_u32(MA2, MA3);
    TA2 = SA0;
    TB0 = vaddq_u32(MB2, vld1q_u32(&k_sha256[0x18]));
    MB2 = vsha256su0q_u32(MB2, MB3);
    TB2 = SB0;
    SA0 = vsha256hq_u32(SA0, SA1, TA0);
    SA1 = vsha256h2q_u32(SA1, TA2, TA0);
    MA2 = vsha256su1q_u32(MA2, MA0, MA1);
    SB0 = vsha256hq_u32(SB0, SB1, TB0);
    SB1 = vsha256h2q_u32(SB1, TB2, TB0);
    MB2 = vsha256su1q_u32(MB2, MB0, MB1);

    /* ---- Rounds 28-31 ---- */
    TA1 = vaddq_u32(MA3, vld1q_u32(&k_sha256[0x1c]));
    MA3 = vsha256su0q_u32(MA3, MA0);
    TA2 = SA0;
    TB1 = vaddq_u32(MB3, vld1q_u32(&k_sha256[0x1c]));
    MB3 = vsha256su0q_u32(MB3, MB0);
    TB2 = SB0;
    SA0 = vsha256hq_u32(SA0, SA1, TA1);
    SA1 = vsha256h2q_u32(SA1, TA2, TA1);
    MA3 = vsha256su1q_u32(MA3, MA1, MA2);
    SB0 = vsha256hq_u32(SB0, SB1, TB1);
    SB1 = vsha256h2q_u32(SB1, TB2, TB1);
    MB3 = vsha256su1q_u32(MB3, MB1, MB2);

    /* ---- Rounds 32-35 ---- */
    TA0 = vaddq_u32(MA0, vld1q_u32(&k_sha256[0x20]));
    MA0 = vsha256su0q_u32(MA0, MA1);
    TA2 = SA0;
    TB0 = vaddq_u32(MB0, vld1q_u32(&k_sha256[0x20]));
    MB0 = vsha256su0q_u32(MB0, MB1);
    TB2 = SB0;
    SA0 = vsha256hq_u32(SA0, SA1, TA0);
    SA1 = vsha256h2q_u32(SA1, TA2, TA0);
    MA0 = vsha256su1q_u32(MA0, MA2, MA3);
    SB0 = vsha256hq_u32(SB0, SB1, TB0);
    SB1 = vsha256h2q_u32(SB1, TB2, TB0);
    MB0 = vsha256su1q_u32(MB0, MB2, MB3);

    /* ---- Rounds 36-39 ---- */
    TA1 = vaddq_u32(MA1, vld1q_u32(&k_sha256[0x24]));
    MA1 = vsha256su0q_u32(MA1, MA2);
    TA2 = SA0;
    TB1 = vaddq_u32(MB1, vld1q_u32(&k_sha256[0x24]));
    MB1 = vsha256su0q_u32(MB1, MB2);
    TB2 = SB0;
    SA0 = vsha256hq_u32(SA0, SA1, TA1);
    SA1 = vsha256h2q_u32(SA1, TA2, TA1);
    MA1 = vsha256su1q_u32(MA1, MA3, MA0);
    SB0 = vsha256hq_u32(SB0, SB1, TB1);
    SB1 = vsha256h2q_u32(SB1, TB2, TB1);
    MB1 = vsha256su1q_u32(MB1, MB3, MB0);

    /* ---- Rounds 40-43 ---- */
    TA0 = vaddq_u32(MA2, vld1q_u32(&k_sha256[0x28]));
    MA2 = vsha256su0q_u32(MA2, MA3);
    TA2 = SA0;
    TB0 = vaddq_u32(MB2, vld1q_u32(&k_sha256[0x28]));
    MB2 = vsha256su0q_u32(MB2, MB3);
    TB2 = SB0;
    SA0 = vsha256hq_u32(SA0, SA1, TA0);
    SA1 = vsha256h2q_u32(SA1, TA2, TA0);
    MA2 = vsha256su1q_u32(MA2, MA0, MA1);
    SB0 = vsha256hq_u32(SB0, SB1, TB0);
    SB1 = vsha256h2q_u32(SB1, TB2, TB0);
    MB2 = vsha256su1q_u32(MB2, MB0, MB1);

    /* ---- Rounds 44-47: final schedule extension ---- */
    TA1 = vaddq_u32(MA3, vld1q_u32(&k_sha256[0x2c]));
    MA3 = vsha256su0q_u32(MA3, MA0);
    TA2 = SA0;
    TB1 = vaddq_u32(MB3, vld1q_u32(&k_sha256[0x2c]));
    MB3 = vsha256su0q_u32(MB3, MB0);
    TB2 = SB0;
    SA0 = vsha256hq_u32(SA0, SA1, TA1);
    SA1 = vsha256h2q_u32(SA1, TA2, TA1);
    MA3 = vsha256su1q_u32(MA3, MA1, MA2);
    SB0 = vsha256hq_u32(SB0, SB1, TB1);
    SB1 = vsha256h2q_u32(SB1, TB2, TB1);
    MB3 = vsha256su1q_u32(MB3, MB1, MB2);

    /* ---- Rounds 48-51 ---- */
    TA0 = vaddq_u32(MA0, vld1q_u32(&k_sha256[0x30]));
    TA2 = SA0;
    TB0 = vaddq_u32(MB0, vld1q_u32(&k_sha256[0x30]));
    TB2 = SB0;
    SA0 = vsha256hq_u32(SA0, SA1, TA0);
    SA1 = vsha256h2q_u32(SA1, TA2, TA0);
    SB0 = vsha256hq_u32(SB0, SB1, TB0);
    SB1 = vsha256h2q_u32(SB1, TB2, TB0);

    /* ---- Rounds 52-55 ---- */
    TA1 = vaddq_u32(MA1, vld1q_u32(&k_sha256[0x34]));
    TA2 = SA0;
    TB1 = vaddq_u32(MB1, vld1q_u32(&k_sha256[0x34]));
    TB2 = SB0;
    SA0 = vsha256hq_u32(SA0, SA1, TA1);
    SA1 = vsha256h2q_u32(SA1, TA2, TA1);
    SB0 = vsha256hq_u32(SB0, SB1, TB1);
    SB1 = vsha256h2q_u32(SB1, TB2, TB1);

    /* ---- Rounds 56-59 ---- */
    TA0 = vaddq_u32(MA2, vld1q_u32(&k_sha256[0x38]));
    TA2 = SA0;
    TB0 = vaddq_u32(MB2, vld1q_u32(&k_sha256[0x38]));
    TB2 = SB0;
    SA0 = vsha256hq_u32(SA0, SA1, TA0);
    SA1 = vsha256h2q_u32(SA1, TA2, TA0);
    SB0 = vsha256hq_u32(SB0, SB1, TB0);
    SB1 = vsha256h2q_u32(SB1, TB2, TB0);

    /* ---- Rounds 60-63 ---- */
    TA1 = vaddq_u32(MA3, vld1q_u32(&k_sha256[0x3c]));
    TA2 = SA0;
    TB1 = vaddq_u32(MB3, vld1q_u32(&k_sha256[0x3c]));
    TB2 = SB0;
    SA0 = vsha256hq_u32(SA0, SA1, TA1);
    SA1 = vsha256h2q_u32(SA1, TA2, TA1);
    SB0 = vsha256hq_u32(SB0, SB1, TB1);
    SB1 = vsha256h2q_u32(SB1, TB2, TB1);

    /* Davies-Meyer feed-forward for both chains. */
    SA0 = vaddq_u32(SA0, ABEF_A);
    SA1 = vaddq_u32(SA1, CDGH_A);
    SB0 = vaddq_u32(SB0, ABEF_B);
    SB1 = vaddq_u32(SB1, CDGH_B);

    vst1q_u32(&stA->h[0], SA0);
    vst1q_u32(&stA->h[4], SA1);
    vst1q_u32(&stB->h[0], SB0);
    vst1q_u32(&stB->h[4], SB1);
}
|
|
/*
 * 4-way hash using 2-way interleaved transforms.
 * Pairs (0,1) and (2,3) are processed simultaneously, hiding SHA2 instruction
 * latency. Avoids redundant memcpy by reusing block buffers across the 4 nonces.
 *
 * Ordering is correctness-critical: each pair's first-pass digests must be
 * extracted from out_states[] (via sha256_state_to_digest) BEFORE those
 * entries are reset to init_state for the second pass. States 2 and 3 are
 * untouched while pair (0,1) completes its second pass, so their first-pass
 * digests are still intact when read afterwards.
 */
static void sha256d80_hash_4way_armv8_2way(
    const sha256d80_midstate_t *mid,
    uint32_t start_nonce,
    sha256_state_t out_states[4]
) {
    /* Two block1 buffers reused for both pairs */
    uint8_t block1A[64], block1B[64];
    /* Two block2 buffers: only bytes 0-31 (digest) vary; bytes 32-63 are constant */
    uint8_t block2A[64], block2B[64];

    /* Copy template once each (1x instead of 4x) */
    memcpy(block1A, mid->block1_template, 64);
    memcpy(block1B, mid->block1_template, 64);
    /* Copy only the constant padding half of block2 template (32 bytes each);
     * bytes 0-31 are fully overwritten by sha256_state_to_digest below. */
    memcpy(block2A + 32, mid->block2_template + 32, 32);
    memcpy(block2B + 32, mid->block2_template + 32, 32);

    /* --- Pass 1, pair (0,1) --- */
    out_states[0] = mid->first_chunk_state;
    out_states[1] = mid->first_chunk_state;
    set_block1_nonce(block1A, start_nonce + 0U);
    set_block1_nonce(block1B, start_nonce + 1U);
    sha256_transform_armv8_2way(&out_states[0], block1A, &out_states[1], block1B);

    /* --- Pass 1, pair (2,3): block1A/B safely reused after the transform --- */
    out_states[2] = mid->first_chunk_state;
    out_states[3] = mid->first_chunk_state;
    set_block1_nonce(block1A, start_nonce + 2U);
    set_block1_nonce(block1B, start_nonce + 3U);
    sha256_transform_armv8_2way(&out_states[2], block1A, &out_states[3], block1B);

    /* --- Pass 2, pair (0,1): write digest directly into block2 bytes 0-31 --- */
    sha256_state_to_digest(&out_states[0], block2A);
    sha256_state_to_digest(&out_states[1], block2B);
    out_states[0] = mid->init_state;
    out_states[1] = mid->init_state;
    sha256_transform_armv8_2way(&out_states[0], block2A, &out_states[1], block2B);

    /* --- Pass 2, pair (2,3) --- */
    sha256_state_to_digest(&out_states[2], block2A);
    sha256_state_to_digest(&out_states[3], block2B);
    out_states[2] = mid->init_state;
    out_states[3] = mid->init_state;
    sha256_transform_armv8_2way(&out_states[2], block2A, &out_states[3], block2B);
}
|
#endif
|
|
|
|
static sha256_transform_fn_t g_transform_fn = sha256_transform_openssl;
|
|
static sha256d80_hash4_fn_t g_hash4_fn;
|
|
static pthread_once_t g_backend_once = PTHREAD_ONCE_INIT;
|
|
|
|
/* Runtime probe: does this CPU advertise the ARMv8 SHA-2 instructions?
 * Returns 1 when the kernel's hwcap word carries HWCAP_SHA2, else 0. */
static int cpu_has_sha2(void) {
#if defined(__aarch64__) && defined(HWCAP_SHA2)
    return (getauxval(AT_HWCAP) & HWCAP_SHA2) != 0UL;
#else
    /* Non-AArch64 build (or hwcap macro unavailable): never claim support. */
    return 0;
#endif
}
|
|
/*
 * Precompute everything nonce-independent for double-SHA256 of an 80-byte
 * header, using the supplied transform:
 *  - first_chunk_state: IV compressed with the header's first 64 bytes
 *    (the "midstate" shared by all nonces).
 *  - block1_template: final header block — header bytes 64..75, the nonce
 *    slot at offsets 12..15 (patched per nonce by set_block1_nonce), the
 *    0x80 pad byte, and the 640-bit (0x0280) message length.
 *  - block2_template: second-hash block — the 32-byte first-pass digest
 *    goes in bytes 0..31 (filled per nonce), then 0x80 pad and the 256-bit
 *    (0x0100) message length.
 */
static void sha256d80_midstate_init_with_fn(
    sha256d80_midstate_t *mid,
    const uint8_t header_76[76],
    sha256_transform_fn_t tf
) {
    sha256_state_init(&mid->init_state);
    mid->first_chunk_state = mid->init_state;
    /* Compresses only the first 64 bytes of the 76-byte header. */
    tf(&mid->first_chunk_state, header_76);

    memset(mid->block1_template, 0, sizeof(mid->block1_template));
    memcpy(mid->block1_template, header_76 + 64, 12);
    mid->block1_template[16] = 0x80;  /* SHA-256 pad marker after 80 data bytes */
    mid->block1_template[62] = 0x02;  /* big-endian bit length: 640 = 0x0280 */
    mid->block1_template[63] = 0x80;

    memset(mid->block2_template, 0, sizeof(mid->block2_template));
    mid->block2_template[32] = 0x80;  /* pad marker after the 32-byte digest */
    mid->block2_template[62] = 0x01;  /* big-endian bit length: 256 = 0x0100 */
    mid->block2_template[63] = 0x00;  /* already 0 from memset; kept explicit */
}
|
|
static void sha256d80_hash_4way_with_fn(
|
|
const sha256d80_midstate_t *mid,
|
|
uint32_t start_nonce,
|
|
sha256_state_t out_states[4],
|
|
sha256_transform_fn_t tf
|
|
) {
|
|
uint8_t block1[4][64];
|
|
uint8_t block2[4][64];
|
|
int i;
|
|
|
|
for (i = 0; i < 4; i++) {
|
|
out_states[i] = mid->first_chunk_state;
|
|
memcpy(block1[i], mid->block1_template, sizeof(mid->block1_template));
|
|
memcpy(block2[i], mid->block2_template, sizeof(mid->block2_template));
|
|
}
|
|
|
|
set_block1_nonce(block1[0], start_nonce + 0U);
|
|
set_block1_nonce(block1[1], start_nonce + 1U);
|
|
set_block1_nonce(block1[2], start_nonce + 2U);
|
|
set_block1_nonce(block1[3], start_nonce + 3U);
|
|
|
|
tf(&out_states[0], block1[0]);
|
|
tf(&out_states[1], block1[1]);
|
|
tf(&out_states[2], block1[2]);
|
|
tf(&out_states[3], block1[3]);
|
|
|
|
sha256_state_to_digest(&out_states[0], block2[0]);
|
|
sha256_state_to_digest(&out_states[1], block2[1]);
|
|
sha256_state_to_digest(&out_states[2], block2[2]);
|
|
sha256_state_to_digest(&out_states[3], block2[3]);
|
|
|
|
out_states[0] = mid->init_state;
|
|
out_states[1] = mid->init_state;
|
|
out_states[2] = mid->init_state;
|
|
out_states[3] = mid->init_state;
|
|
|
|
tf(&out_states[0], block2[0]);
|
|
tf(&out_states[1], block2[1]);
|
|
tf(&out_states[2], block2[2]);
|
|
tf(&out_states[3], block2[3]);
|
|
}
|
|
|
|
/* Portable 4-way kernel: drives the generic path with the currently selected
 * single-block transform. */
static void sha256d80_hash_4way_generic(const sha256d80_midstate_t *mid, uint32_t start_nonce, sha256_state_t out_states[4]) {
    sha256d80_hash_4way_with_fn(mid, start_nonce, out_states, g_transform_fn);
}
|
|
#if defined(__aarch64__) && defined(__ARM_FEATURE_CRYPTO)
|
|
/* Non-interleaved 4-way reference built on the scalar NEON transform
 * (kept as an externally visible entry point). */
void sha256d80_4way_aarch64_impl(const sha256d80_midstate_t *mid, uint32_t start_nonce, sha256_state_t out_states[4]) {
    sha256d80_hash_4way_with_fn(mid, start_nonce, out_states, sha256_transform_armv8);
}
|
|
static int arm_transform_selftest(void) {
|
|
uint8_t block[64];
|
|
sha256_state_t ref;
|
|
sha256_state_t test;
|
|
int i;
|
|
|
|
sha256_state_init(&ref);
|
|
sha256_state_init(&test);
|
|
|
|
for (i = 0; i < 64; i++) {
|
|
block[i] = (uint8_t)(i * 3 + 1);
|
|
}
|
|
|
|
sha256_transform_openssl(&ref, block);
|
|
sha256_transform_armv8(&test, block);
|
|
|
|
return memcmp(ref.h, test.h, sizeof(ref.h)) == 0;
|
|
}
|
|
|
|
/*
 * Cross-check the 2-way interleaved 4-way kernel against the OpenSSL-backed
 * generic path on a fixed synthetic header; returns 1 on exact match of all
 * four output states.
 */
static int arm_hash4_selftest(void) {
    uint8_t header_76[76];
    sha256d80_midstate_t mid;
    sha256_state_t ref[4];
    sha256_state_t test[4];
    int i;

    /* Deterministic non-trivial header fill. */
    for (i = 0; i < 76; i++) {
        header_76[i] = (uint8_t)(i * 7 + 11);
    }

    sha256d80_midstate_init_with_fn(&mid, header_76, sha256_transform_openssl);
    sha256d80_hash_4way_with_fn(&mid, 0x10203040U, ref, sha256_transform_openssl);
    sha256d80_hash_4way_armv8_2way(&mid, 0x10203040U, test);

    return memcmp(ref, test, sizeof(ref)) == 0;
}
|
#endif
|
|
|
|
/*
 * One-time backend selection (invoked via pthread_once).
 * Defaults to the OpenSSL fallback; on AArch64 builds with crypto intrinsics
 * it upgrades to the 2-way interleaved NEON path only if the CPU advertises
 * SHA-2 support AND both self-tests match the OpenSSL reference.
 */
static void sha256_backend_init_once(void) {
    /* Re-assert the portable defaults before probing. */
    g_hash4_fn = sha256d80_hash_4way_generic;
    g_transform_fn = sha256_transform_openssl;

#if defined(__aarch64__) && defined(__ARM_FEATURE_CRYPTO)
    if (cpu_has_sha2() && arm_transform_selftest() && arm_hash4_selftest()) {
        g_transform_fn = sha256_transform_armv8;
        g_hash4_fn = sha256d80_hash_4way_armv8_2way;
        fprintf(stderr, "[miner] backend SHA: ARMv8 2-way interleaved enabled\n");
    } else {
        fprintf(stderr, "[miner] backend SHA: OpenSSL fallback\n");
    }
#else
    fprintf(stderr, "[miner] backend SHA: OpenSSL fallback\n");
#endif
}
|
|
/* Reset a state to the SHA-256 initial hash value (FIPS 180-4 H0..H7). */
void sha256_state_init(sha256_state_t *state) {
    int w;

    for (w = 0; w < 8; w++) {
        state->h[w] = k_sha256_iv[w];
    }
}
|
|
void sha256_state_to_digest(const sha256_state_t *state, uint8_t out[32]) {
|
|
int i;
|
|
|
|
for (i = 0; i < 8; i++) {
|
|
write_u32_be(out + i * 4, state->h[i]);
|
|
}
|
|
}
|
|
|
|
/* Public single-block transform: lazily selects the backend, then dispatches. */
void sha256_transform_fast(sha256_state_t *state, const uint8_t block[64]) {
    pthread_once(&g_backend_once, sha256_backend_init_once);
    g_transform_fn(state, block);
}
|
|
/*
 * Two independent single-block transforms. On the NEON backend the pair is
 * interleaved in one call to hide SHA2 instruction latency; otherwise the
 * two chains simply run back-to-back through the selected transform.
 */
void sha256_transform_fast_2way(
    sha256_state_t *stA, const uint8_t blkA[64],
    sha256_state_t *stB, const uint8_t blkB[64]
) {
    pthread_once(&g_backend_once, sha256_backend_init_once);
#if defined(__aarch64__) && defined(__ARM_FEATURE_CRYPTO)
    /* Pointer compare identifies the NEON backend selected by init_once. */
    if (g_transform_fn == sha256_transform_armv8) {
        sha256_transform_armv8_2way(stA, blkA, stB, blkB);
        return;
    }
#endif
    g_transform_fn(stA, blkA);
    g_transform_fn(stB, blkB);
}
|
|
/* Public midstate precomputation: backend-aware wrapper around
 * sha256d80_midstate_init_with_fn. */
void sha256d80_midstate_init(sha256d80_midstate_t *mid, const uint8_t header_76[76]) {
    pthread_once(&g_backend_once, sha256_backend_init_once);
    sha256d80_midstate_init_with_fn(mid, header_76, g_transform_fn);
}
|
|
/* Public 4-way double-SHA256 for nonces start_nonce..start_nonce+3;
 * dispatches to the selected kernel after lazy backend init. */
void sha256d80_hash_4way(
    const sha256d80_midstate_t *mid,
    uint32_t start_nonce,
    sha256_state_t out_states[4]
) {
    pthread_once(&g_backend_once, sha256_backend_init_once);
    g_hash4_fn(mid, start_nonce, out_states);
}
|
|
uint32_t sha256d80_scan_4way(
|
|
const sha256d80_midstate_t *mid,
|
|
uint32_t start_nonce,
|
|
const uint32_t target_words[8],
|
|
sha256_state_t out_states[4]
|
|
) {
|
|
uint32_t mask = 0;
|
|
|
|
pthread_once(&g_backend_once, sha256_backend_init_once);
|
|
g_hash4_fn(mid, start_nonce, out_states);
|
|
|
|
if (state_meets_target_words(&out_states[0], target_words)) {
|
|
mask |= 1U;
|
|
}
|
|
if (state_meets_target_words(&out_states[1], target_words)) {
|
|
mask |= 2U;
|
|
}
|
|
if (state_meets_target_words(&out_states[2], target_words)) {
|
|
mask |= 4U;
|
|
}
|
|
if (state_meets_target_words(&out_states[3], target_words)) {
|
|
mask |= 8U;
|
|
}
|
|
|
|
return mask;
|
|
}
|
|
|
|
/*
 * Force backend selection eagerly. Call this before entering a hot loop that
 * uses sha256d80_scan_4way_direct(), which intentionally skips the
 * pthread_once check (and its per-call acquire barrier).
 */
void sha256_backend_ensure_init(void) {
    pthread_once(&g_backend_once, sha256_backend_init_once);
}
|
uint32_t sha256d80_scan_4way_direct(
|
|
const sha256d80_midstate_t *mid,
|
|
uint32_t start_nonce,
|
|
const uint32_t target_words[8],
|
|
sha256_state_t out_states[4]
|
|
) {
|
|
uint32_t mask = 0;
|
|
|
|
g_hash4_fn(mid, start_nonce, out_states);
|
|
|
|
if (state_meets_target_words(&out_states[0], target_words)) {
|
|
mask |= 1U;
|
|
}
|
|
if (state_meets_target_words(&out_states[1], target_words)) {
|
|
mask |= 2U;
|
|
}
|
|
if (state_meets_target_words(&out_states[2], target_words)) {
|
|
mask |= 4U;
|
|
}
|
|
if (state_meets_target_words(&out_states[3], target_words)) {
|
|
mask |= 8U;
|
|
}
|
|
|
|
return mask;
|
|
}
|