From 5b4c11f6f082e0e1c47ece95d45970b2a3ce84c7 Mon Sep 17 00:00:00 2001
From: Davide Grilli
Date: Mon, 30 Mar 2026 09:04:57 +0200
Subject: [PATCH] feat(sha256): add sha256d80 backend API and ARM64 kernel entry

---
 sha256/sha256_backend.c         | 441 ++++++++++++++++++++++++++++++++
 sha256/sha256_backend.h         |  65 +++++
 sha256/sha256d80_4way_aarch64.S |   9 +
 3 files changed, 515 insertions(+)
 create mode 100644 sha256/sha256_backend.c
 create mode 100644 sha256/sha256_backend.h
 create mode 100644 sha256/sha256d80_4way_aarch64.S

diff --git a/sha256/sha256_backend.c b/sha256/sha256_backend.c
new file mode 100644
index 0000000..bb5fbb0
--- /dev/null
+++ b/sha256/sha256_backend.c
@@ -0,0 +1,441 @@
+#include "sha256_backend.h"
+
+#include <openssl/sha.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <string.h>
+
+#if defined(__aarch64__)
+#include <sys/auxv.h>
+#include <asm/hwcap.h>
+#endif
+
+#if defined(__aarch64__) && defined(__ARM_FEATURE_CRYPTO)
+#include <arm_neon.h>
+#endif
+
+static const uint32_t k_sha256_iv[8] = {
+    0x6a09e667U, 0xbb67ae85U, 0x3c6ef372U, 0xa54ff53aU,
+    0x510e527fU, 0x9b05688cU, 0x1f83d9abU, 0x5be0cd19U
+};
+
+#if defined(__aarch64__) && defined(__ARM_FEATURE_CRYPTO)
+static const uint32_t k_sha256[64] = {
+    0x428A2F98U, 0x71374491U, 0xB5C0FBCFU, 0xE9B5DBA5U,
+    0x3956C25BU, 0x59F111F1U, 0x923F82A4U, 0xAB1C5ED5U,
+    0xD807AA98U, 0x12835B01U, 0x243185BEU, 0x550C7DC3U,
+    0x72BE5D74U, 0x80DEB1FEU, 0x9BDC06A7U, 0xC19BF174U,
+    0xE49B69C1U, 0xEFBE4786U, 0x0FC19DC6U, 0x240CA1CCU,
+    0x2DE92C6FU, 0x4A7484AAU, 0x5CB0A9DCU, 0x76F988DAU,
+    0x983E5152U, 0xA831C66DU, 0xB00327C8U, 0xBF597FC7U,
+    0xC6E00BF3U, 0xD5A79147U, 0x06CA6351U, 0x14292967U,
+    0x27B70A85U, 0x2E1B2138U, 0x4D2C6DFCU, 0x53380D13U,
+    0x650A7354U, 0x766A0ABBU, 0x81C2C92EU, 0x92722C85U,
+    0xA2BFE8A1U, 0xA81A664BU, 0xC24B8B70U, 0xC76C51A3U,
+    0xD192E819U, 0xD6990624U, 0xF40E3585U, 0x106AA070U,
+    0x19A4C116U, 0x1E376C08U, 0x2748774CU, 0x34B0BCB5U,
+    0x391C0CB3U, 0x4ED8AA4AU, 0x5B9CCA4FU, 0x682E6FF3U,
+    0x748F82EEU, 0x78A5636FU, 0x84C87814U, 0x8CC70208U,
+    0x90BEFFFAU, 0xA4506CEBU, 0xBEF9A3F7U, 0xC67178F2U
+};
+#endif
+
+static inline uint32_t bswap_u32(uint32_t x) {
+    return ((x & 0x000000FFU) << 24) |
+           ((x & 0x0000FF00U) << 8) |
+           ((x & 0x00FF0000U) >> 8) |
+           ((x & 0xFF000000U) >> 24);
+}
+
+static inline void write_u32_be(uint8_t *dst, uint32_t v) {
+    dst[0] = (uint8_t)((v >> 24) & 0xFFU);
+    dst[1] = (uint8_t)((v >> 16) & 0xFFU);
+    dst[2] = (uint8_t)((v >> 8) & 0xFFU);
+    dst[3] = (uint8_t)(v & 0xFFU);
+}
+
+static inline void write_u32_le(uint8_t *dst, uint32_t v) {
+    dst[0] = (uint8_t)(v & 0xFFU);
+    dst[1] = (uint8_t)((v >> 8) & 0xFFU);
+    dst[2] = (uint8_t)((v >> 16) & 0xFFU);
+    dst[3] = (uint8_t)((v >> 24) & 0xFFU);
+}
+
+static inline void set_block1_nonce(uint8_t block1[64], uint32_t nonce) {
+    write_u32_le(block1 + 12, nonce);
+}
+
+static inline int state_meets_target_words(const sha256_state_t *state, const uint32_t target_words[8]) {
+    int i;
+
+    for (i = 0; i < 8; i++) {
+        uint32_t hw = bswap_u32(state->h[7 - i]);
+        if (hw < target_words[i]) {
+            return 1;
+        }
+        if (hw > target_words[i]) {
+            return 0;
+        }
+    }
+
+    return 1;
+}
+
+typedef void (*sha256_transform_fn_t)(sha256_state_t *state, const uint8_t block[64]);
+typedef void (*sha256d80_hash4_fn_t)(const sha256d80_midstate_t *mid, uint32_t start_nonce, sha256_state_t out_states[4]);
+
+static void sha256_transform_openssl(sha256_state_t *state, const uint8_t block[64]) {
+    SHA256_CTX ctx;
+
+    memcpy(ctx.h, state->h, sizeof(state->h));
+    ctx.Nl = 0;
+    ctx.Nh = 0;
+    ctx.num = 0;
+    ctx.md_len = SHA256_DIGEST_LENGTH;
+    SHA256_Transform(&ctx, block);
+    memcpy(state->h, ctx.h, sizeof(state->h));
+}
+
+#if defined(__aarch64__) && defined(__ARM_FEATURE_CRYPTO)
+static void sha256_transform_armv8(sha256_state_t *state, const uint8_t block[64]) {
+    uint32x4_t STATE0, STATE1, ABEF_SAVE, CDGH_SAVE;
+    uint32x4_t MSG0, MSG1, MSG2, MSG3;
+    uint32x4_t TMP0, TMP1, TMP2;
+
+    STATE0 = vld1q_u32(&state->h[0]);
+    STATE1 = vld1q_u32(&state->h[4]);
+
+    ABEF_SAVE = STATE0;
+    CDGH_SAVE = STATE1;
+
+    MSG0 = vld1q_u32((const uint32_t *)(block + 0));
+    MSG1 = vld1q_u32((const uint32_t *)(block + 16));
+    MSG2 = vld1q_u32((const uint32_t *)(block + 32));
+    MSG3 = vld1q_u32((const uint32_t *)(block + 48));
+
+    MSG0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG0)));
+    MSG1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG1)));
+    MSG2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG2)));
+    MSG3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG3)));
+
+    TMP0 = vaddq_u32(MSG0, vld1q_u32(&k_sha256[0x00]));
+    MSG0 = vsha256su0q_u32(MSG0, MSG1);
+    TMP2 = STATE0;
+    TMP1 = vaddq_u32(MSG1, vld1q_u32(&k_sha256[0x04]));
+    STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
+    STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
+    MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);
+
+    MSG1 = vsha256su0q_u32(MSG1, MSG2);
+    TMP2 = STATE0;
+    TMP0 = vaddq_u32(MSG2, vld1q_u32(&k_sha256[0x08]));
+    STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
+    STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
+    MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);
+
+    MSG2 = vsha256su0q_u32(MSG2, MSG3);
+    TMP2 = STATE0;
+    TMP1 = vaddq_u32(MSG3, vld1q_u32(&k_sha256[0x0c]));
+    STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
+    STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
+    MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);
+
+    MSG3 = vsha256su0q_u32(MSG3, MSG0);
+    TMP2 = STATE0;
+    TMP0 = vaddq_u32(MSG0, vld1q_u32(&k_sha256[0x10]));
+    STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
+    STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
+    MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);
+
+    MSG0 = vsha256su0q_u32(MSG0, MSG1);
+    TMP2 = STATE0;
+    TMP1 = vaddq_u32(MSG1, vld1q_u32(&k_sha256[0x14]));
+    STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
+    STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
+    MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);
+
+    MSG1 = vsha256su0q_u32(MSG1, MSG2);
+    TMP2 = STATE0;
+    TMP0 = vaddq_u32(MSG2, vld1q_u32(&k_sha256[0x18]));
+    STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
+    STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
+    MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);
+
+    MSG2 = vsha256su0q_u32(MSG2, MSG3);
+    TMP2 = STATE0;
+    TMP1 = vaddq_u32(MSG3, vld1q_u32(&k_sha256[0x1c]));
+    STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
+    STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
+    MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);
+
+    MSG3 = vsha256su0q_u32(MSG3, MSG0);
+    TMP2 = STATE0;
+    TMP0 = vaddq_u32(MSG0, vld1q_u32(&k_sha256[0x20]));
+    STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
+    STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
+    MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);
+
+    MSG0 = vsha256su0q_u32(MSG0, MSG1);
+    TMP2 = STATE0;
+    TMP1 = vaddq_u32(MSG1, vld1q_u32(&k_sha256[0x24]));
+    STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
+    STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
+    MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);
+
+    MSG1 = vsha256su0q_u32(MSG1, MSG2);
+    TMP2 = STATE0;
+    TMP0 = vaddq_u32(MSG2, vld1q_u32(&k_sha256[0x28]));
+    STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
+    STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
+    MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);
+
+    MSG2 = vsha256su0q_u32(MSG2, MSG3);
+    TMP2 = STATE0;
+    TMP1 = vaddq_u32(MSG3, vld1q_u32(&k_sha256[0x2c]));
+    STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
+    STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
+    MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);
+
+    MSG3 = vsha256su0q_u32(MSG3, MSG0);
+    TMP2 = STATE0;
+    TMP0 = vaddq_u32(MSG0, vld1q_u32(&k_sha256[0x30]));
+    STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
+    STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
+    MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);
+
+    TMP2 = STATE0;
+    TMP1 = vaddq_u32(MSG1, vld1q_u32(&k_sha256[0x34]));
+    STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
+    STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
+
+    TMP2 = STATE0;
+    TMP0 = vaddq_u32(MSG2, vld1q_u32(&k_sha256[0x38]));
+    STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
+    STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
+
+    TMP2 = STATE0;
+    TMP1 = vaddq_u32(MSG3, vld1q_u32(&k_sha256[0x3c]));
+    STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
+    STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
+
+    TMP2 = STATE0;
+    STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
+    STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
+
+    STATE0 = vaddq_u32(STATE0, ABEF_SAVE);
+    STATE1 = vaddq_u32(STATE1, CDGH_SAVE);
+
+    vst1q_u32(&state->h[0], STATE0);
+    vst1q_u32(&state->h[4], STATE1);
+}
+
+void sha256d80_4way_aarch64_impl(const sha256d80_midstate_t *mid, uint32_t start_nonce, sha256_state_t out_states[4]);
+extern void sha256d80_4way_aarch64_kernel(const sha256d80_midstate_t *mid, uint32_t start_nonce, sha256_state_t out_states[4]);
+#endif
+
+static sha256_transform_fn_t g_transform_fn = sha256_transform_openssl;
+static sha256d80_hash4_fn_t g_hash4_fn;
+static pthread_once_t g_backend_once = PTHREAD_ONCE_INIT;
+
+static int cpu_has_sha2(void) {
+#if defined(__aarch64__) && defined(HWCAP_SHA2)
+    unsigned long caps = getauxval(AT_HWCAP);
+    return (caps & HWCAP_SHA2) != 0UL;
+#else
+    return 0;
+#endif
+}
+
+static void sha256d80_midstate_init_with_fn(
+    sha256d80_midstate_t *mid,
+    const uint8_t header_76[76],
+    sha256_transform_fn_t tf
+) {
+    sha256_state_init(&mid->init_state);
+    mid->first_chunk_state = mid->init_state;
+    tf(&mid->first_chunk_state, header_76);
+
+    memset(mid->block1_template, 0, sizeof(mid->block1_template));
+    memcpy(mid->block1_template, header_76 + 64, 12);
+    mid->block1_template[16] = 0x80;
+    mid->block1_template[62] = 0x02;
+    mid->block1_template[63] = 0x80;
+
+    memset(mid->block2_template, 0, sizeof(mid->block2_template));
+    mid->block2_template[32] = 0x80;
+    mid->block2_template[62] = 0x01;
+    mid->block2_template[63] = 0x00;
+}
+
+static void sha256d80_hash_4way_with_fn(
+    const sha256d80_midstate_t *mid,
+    uint32_t start_nonce,
+    sha256_state_t out_states[4],
+    sha256_transform_fn_t tf
+) {
+    uint8_t block1[4][64];
+    uint8_t block2[4][64];
+    int i;
+
+    for (i = 0; i < 4; i++) {
+        out_states[i] = mid->first_chunk_state;
+        memcpy(block1[i], mid->block1_template, sizeof(mid->block1_template));
+        memcpy(block2[i], mid->block2_template, sizeof(mid->block2_template));
+    }
+
+    set_block1_nonce(block1[0], start_nonce + 0U);
+    set_block1_nonce(block1[1], start_nonce + 1U);
+    set_block1_nonce(block1[2], start_nonce + 2U);
+    set_block1_nonce(block1[3], start_nonce + 3U);
+
+    tf(&out_states[0], block1[0]);
+    tf(&out_states[1], block1[1]);
+    tf(&out_states[2], block1[2]);
+    tf(&out_states[3], block1[3]);
+
+    sha256_state_to_digest(&out_states[0], block2[0]);
+    sha256_state_to_digest(&out_states[1], block2[1]);
+    sha256_state_to_digest(&out_states[2], block2[2]);
+    sha256_state_to_digest(&out_states[3], block2[3]);
+
+    out_states[0] = mid->init_state;
+    out_states[1] = mid->init_state;
+    out_states[2] = mid->init_state;
+    out_states[3] = mid->init_state;
+
+    tf(&out_states[0], block2[0]);
+    tf(&out_states[1], block2[1]);
+    tf(&out_states[2], block2[2]);
+    tf(&out_states[3], block2[3]);
+}
+
+static void sha256d80_hash_4way_generic(const sha256d80_midstate_t *mid, uint32_t start_nonce, sha256_state_t out_states[4]) {
+    sha256d80_hash_4way_with_fn(mid, start_nonce, out_states, g_transform_fn);
+}
+
+#if defined(__aarch64__) && defined(__ARM_FEATURE_CRYPTO)
+void sha256d80_4way_aarch64_impl(const sha256d80_midstate_t *mid, uint32_t start_nonce, sha256_state_t out_states[4]) {
+    sha256d80_hash_4way_with_fn(mid, start_nonce, out_states, sha256_transform_armv8);
+}
+
+static int arm_transform_selftest(void) {
+    uint8_t block[64];
+    sha256_state_t ref;
+    sha256_state_t test;
+    int i;
+
+    sha256_state_init(&ref);
+    sha256_state_init(&test);
+
+    for (i = 0; i < 64; i++) {
+        block[i] = (uint8_t)(i * 3 + 1);
+    }
+
+    sha256_transform_openssl(&ref, block);
+    sha256_transform_armv8(&test, block);
+
+    return memcmp(ref.h, test.h, sizeof(ref.h)) == 0;
+}
+
+static int arm_hash4_selftest(void) {
+    uint8_t header_76[76];
+    sha256d80_midstate_t mid;
+    sha256_state_t ref[4];
+    sha256_state_t test[4];
+    int i;
+
+    for (i = 0; i < 76; i++) {
+        header_76[i] = (uint8_t)(i * 7 + 11);
+    }
+
+    sha256d80_midstate_init_with_fn(&mid, header_76, sha256_transform_openssl);
+    sha256d80_hash_4way_with_fn(&mid, 0x10203040U, ref, sha256_transform_openssl);
+    sha256d80_4way_aarch64_kernel(&mid, 0x10203040U, test);
+
+    return memcmp(ref, test, sizeof(ref)) == 0;
+}
+#endif
+
+static void sha256_backend_init_once(void) {
+    g_hash4_fn = sha256d80_hash_4way_generic;
+    g_transform_fn = sha256_transform_openssl;
+
+#if defined(__aarch64__) && defined(__ARM_FEATURE_CRYPTO)
+    if (cpu_has_sha2() && arm_transform_selftest() && arm_hash4_selftest()) {
+        g_transform_fn = sha256_transform_armv8;
+        g_hash4_fn = sha256d80_4way_aarch64_kernel;
+        fprintf(stderr, "[miner] backend SHA: ARMv8 ASM+intrinsics enabled\n");
+    } else {
+        fprintf(stderr, "[miner] backend SHA: OpenSSL fallback\n");
+    }
+#else
+    fprintf(stderr, "[miner] backend SHA: OpenSSL fallback\n");
+#endif
+}
+
+void sha256_state_init(sha256_state_t *state) {
+    memcpy(state->h, k_sha256_iv, sizeof(state->h));
+}
+
+void sha256_state_to_digest(const sha256_state_t *state, uint8_t out[32]) {
+    int i;
+
+    for (i = 0; i < 8; i++) {
+        write_u32_be(out + i * 4, state->h[i]);
+    }
+}
+
+void sha256_transform_fast(sha256_state_t *state, const uint8_t block[64]) {
+    pthread_once(&g_backend_once, sha256_backend_init_once);
+    g_transform_fn(state, block);
+}
+
+void sha256_transform_fast_2way(
+    sha256_state_t *stA, const uint8_t blkA[64],
+    sha256_state_t *stB, const uint8_t blkB[64]
+) {
+    pthread_once(&g_backend_once, sha256_backend_init_once);
+    g_transform_fn(stA, blkA);
+    g_transform_fn(stB, blkB);
+}
+
+void sha256d80_midstate_init(sha256d80_midstate_t *mid, const uint8_t header_76[76]) {
+    pthread_once(&g_backend_once, sha256_backend_init_once);
+    sha256d80_midstate_init_with_fn(mid, header_76, g_transform_fn);
+}
+
+void sha256d80_hash_4way(
+    const sha256d80_midstate_t *mid,
+    uint32_t start_nonce,
+    sha256_state_t out_states[4]
+) {
+    pthread_once(&g_backend_once, sha256_backend_init_once);
+    g_hash4_fn(mid, start_nonce, out_states);
+}
+
+uint32_t sha256d80_scan_4way(
+    const sha256d80_midstate_t *mid,
+    uint32_t start_nonce,
+    const uint32_t target_words[8],
+    sha256_state_t out_states[4]
+) {
+    uint32_t mask = 0;
+
+    pthread_once(&g_backend_once, sha256_backend_init_once);
+    g_hash4_fn(mid, start_nonce, out_states);
+
+    if (state_meets_target_words(&out_states[0], target_words)) {
+        mask |= 1U;
+    }
+    if (state_meets_target_words(&out_states[1], target_words)) {
+        mask |= 2U;
+    }
+    if (state_meets_target_words(&out_states[2], target_words)) {
+        mask |= 4U;
+    }
+    if (state_meets_target_words(&out_states[3], target_words)) {
+        mask |= 8U;
+    }
+
+    return mask;
+}
diff --git a/sha256/sha256_backend.h b/sha256/sha256_backend.h
new file mode 100644
index 0000000..5d7083a
--- /dev/null
+++ b/sha256/sha256_backend.h
@@ -0,0 +1,65 @@
+#ifndef SHA256_BACKEND_H
+#define SHA256_BACKEND_H
+
+#include <stdint.h>
+
+/*
+ * Compact SHA256 state: only the 8 chaining words (32 bytes).
+ * Avoids copying the bloated OpenSSL SHA256_CTX (~112 bytes) in the hot loop.
+ */
+typedef struct {
+    uint32_t h[8];
+} sha256_state_t;
+
+typedef struct {
+    sha256_state_t init_state;
+    sha256_state_t first_chunk_state;
+    uint8_t block1_template[64];
+    uint8_t block2_template[64];
+} sha256d80_midstate_t;
+
+/* Set state to SHA256 initial values (IV). */
+void sha256_state_init(sha256_state_t *state);
+
+/* Serialize the 8 state words into a 32-byte big-endian digest. */
+void sha256_state_to_digest(const sha256_state_t *state, uint8_t out[32]);
+
+/* Single SHA256 block compression (64-byte block). */
+void sha256_transform_fast(sha256_state_t *state, const uint8_t block[64]);
+
+/*
+ * 2-way interleaved SHA256 block compression.
+ * Processes two independent (state, block) pairs so the CPU can overlap both
+ * instruction chains. On non-ARM builds falls back to two sequential calls.
+ */
+void sha256_transform_fast_2way(
+    sha256_state_t *stA, const uint8_t blkA[64],
+    sha256_state_t *stB, const uint8_t blkB[64]
+);
+
+/* Prepare SHA256d(80-byte header) midstate and constant blocks from header[0..75]. */
+void sha256d80_midstate_init(sha256d80_midstate_t *mid, const uint8_t header_76[76]);
+
+/*
+ * Hash 4 consecutive nonces with SHA256d(header80).
+ * start_nonce lane order: [n, n+1, n+2, n+3].
+ */
+void sha256d80_hash_4way(
+    const sha256d80_midstate_t *mid,
+    uint32_t start_nonce,
+    sha256_state_t out_states[4]
+);
+
+/*
+ * Hash 4 consecutive nonces and return hit mask against target words.
+ * target_words are big-endian words target[0..7].
+ * bit i set => lane i meets target.
+ */
+uint32_t sha256d80_scan_4way(
+    const sha256d80_midstate_t *mid,
+    uint32_t start_nonce,
+    const uint32_t target_words[8],
+    sha256_state_t out_states[4]
+);
+
+#endif
diff --git a/sha256/sha256d80_4way_aarch64.S b/sha256/sha256d80_4way_aarch64.S
new file mode 100644
index 0000000..51d4d5e
--- /dev/null
+++ b/sha256/sha256d80_4way_aarch64.S
@@ -0,0 +1,9 @@
+.text
+.align 2
+
+.global sha256d80_4way_aarch64_kernel
+.type sha256d80_4way_aarch64_kernel, %function
+sha256d80_4way_aarch64_kernel:
+    b sha256d80_4way_aarch64_impl
+.size sha256d80_4way_aarch64_kernel, .-sha256d80_4way_aarch64_kernel
+