#include "sha256_backend.h" #include #include #include #include #if defined(__aarch64__) #include #include #endif #if defined(__aarch64__) && defined(__ARM_FEATURE_CRYPTO) #include #endif static const uint32_t k_sha256_iv[8] = { 0x6a09e667U, 0xbb67ae85U, 0x3c6ef372U, 0xa54ff53aU, 0x510e527fU, 0x9b05688cU, 0x1f83d9abU, 0x5be0cd19U }; #if defined(__aarch64__) && defined(__ARM_FEATURE_CRYPTO) static const uint32_t k_sha256[64] = { 0x428A2F98U, 0x71374491U, 0xB5C0FBCFU, 0xE9B5DBA5U, 0x3956C25BU, 0x59F111F1U, 0x923F82A4U, 0xAB1C5ED5U, 0xD807AA98U, 0x12835B01U, 0x243185BEU, 0x550C7DC3U, 0x72BE5D74U, 0x80DEB1FEU, 0x9BDC06A7U, 0xC19BF174U, 0xE49B69C1U, 0xEFBE4786U, 0x0FC19DC6U, 0x240CA1CCU, 0x2DE92C6FU, 0x4A7484AAU, 0x5CB0A9DCU, 0x76F988DAU, 0x983E5152U, 0xA831C66DU, 0xB00327C8U, 0xBF597FC7U, 0xC6E00BF3U, 0xD5A79147U, 0x06CA6351U, 0x14292967U, 0x27B70A85U, 0x2E1B2138U, 0x4D2C6DFCU, 0x53380D13U, 0x650A7354U, 0x766A0ABBU, 0x81C2C92EU, 0x92722C85U, 0xA2BFE8A1U, 0xA81A664BU, 0xC24B8B70U, 0xC76C51A3U, 0xD192E819U, 0xD6990624U, 0xF40E3585U, 0x106AA070U, 0x19A4C116U, 0x1E376C08U, 0x2748774CU, 0x34B0BCB5U, 0x391C0CB3U, 0x4ED8AA4AU, 0x5B9CCA4FU, 0x682E6FF3U, 0x748F82EEU, 0x78A5636FU, 0x84C87814U, 0x8CC70208U, 0x90BEFFFAU, 0xA4506CEBU, 0xBEF9A3F7U, 0xC67178F2U }; #endif static inline uint32_t bswap_u32(uint32_t x) { return ((x & 0x000000FFU) << 24) | ((x & 0x0000FF00U) << 8) | ((x & 0x00FF0000U) >> 8) | ((x & 0xFF000000U) >> 24); } static inline void write_u32_be(uint8_t *dst, uint32_t v) { dst[0] = (uint8_t)((v >> 24) & 0xFFU); dst[1] = (uint8_t)((v >> 16) & 0xFFU); dst[2] = (uint8_t)((v >> 8) & 0xFFU); dst[3] = (uint8_t)(v & 0xFFU); } static inline void write_u32_le(uint8_t *dst, uint32_t v) { dst[0] = (uint8_t)(v & 0xFFU); dst[1] = (uint8_t)((v >> 8) & 0xFFU); dst[2] = (uint8_t)((v >> 16) & 0xFFU); dst[3] = (uint8_t)((v >> 24) & 0xFFU); } static inline void set_block1_nonce(uint8_t block1[64], uint32_t nonce) { write_u32_le(block1 + 12, nonce); } static inline int state_meets_target_words(const sha256_state_t *state, const uint32_t target_words[8]) { int i; for (i = 0; i < 8; i++) { uint32_t hw = bswap_u32(state->h[7 - i]); if (hw < target_words[i]) { return 1; } if (hw > target_words[i]) { return 0; } } return 1; } typedef void (*sha256_transform_fn_t)(sha256_state_t *state, const uint8_t block[64]); typedef void (*sha256d80_hash4_fn_t)(const sha256d80_midstate_t *mid, uint32_t start_nonce, sha256_state_t out_states[4]); static void sha256_transform_openssl(sha256_state_t *state, const uint8_t block[64]) { SHA256_CTX ctx; memcpy(ctx.h, state->h, sizeof(state->h)); ctx.Nl = 0; ctx.Nh = 0; ctx.num = 0; ctx.md_len = SHA256_DIGEST_LENGTH; SHA256_Transform(&ctx, block); memcpy(state->h, ctx.h, sizeof(state->h)); } #if defined(__aarch64__) && defined(__ARM_FEATURE_CRYPTO) static void sha256_transform_armv8(sha256_state_t *state, const uint8_t block[64]) { uint32x4_t STATE0, STATE1, ABEF_SAVE, CDGH_SAVE; uint32x4_t MSG0, MSG1, MSG2, MSG3; uint32x4_t TMP0, TMP1, TMP2; STATE0 = vld1q_u32(&state->h[0]); STATE1 = vld1q_u32(&state->h[4]); ABEF_SAVE = STATE0; CDGH_SAVE = STATE1; MSG0 = vld1q_u32((const uint32_t *)(block + 0)); MSG1 = vld1q_u32((const uint32_t *)(block + 16)); MSG2 = vld1q_u32((const uint32_t *)(block + 32)); MSG3 = vld1q_u32((const uint32_t *)(block + 48)); MSG0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG0))); MSG1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG1))); MSG2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG2))); MSG3 = 
typedef void (*sha256_transform_fn_t)(sha256_state_t *state, const uint8_t block[64]);
typedef void (*sha256d80_hash4_fn_t)(const sha256d80_midstate_t *mid, uint32_t start_nonce, sha256_state_t out_states[4]);

static void sha256_transform_openssl(sha256_state_t *state, const uint8_t block[64])
{
    SHA256_CTX ctx;

    memcpy(ctx.h, state->h, sizeof(state->h));
    ctx.Nl = 0;
    ctx.Nh = 0;
    ctx.num = 0;
    ctx.md_len = SHA256_DIGEST_LENGTH;
    SHA256_Transform(&ctx, block);
    memcpy(state->h, ctx.h, sizeof(state->h));
}

#if defined(__aarch64__) && defined(__ARM_FEATURE_CRYPTO)

static void sha256_transform_armv8(sha256_state_t *state, const uint8_t block[64])
{
    uint32x4_t STATE0, STATE1, ABEF_SAVE, CDGH_SAVE;
    uint32x4_t MSG0, MSG1, MSG2, MSG3;
    uint32x4_t TMP0, TMP1, TMP2;

    STATE0 = vld1q_u32(&state->h[0]);
    STATE1 = vld1q_u32(&state->h[4]);
    ABEF_SAVE = STATE0;
    CDGH_SAVE = STATE1;

    MSG0 = vld1q_u32((const uint32_t *)(block + 0));
    MSG1 = vld1q_u32((const uint32_t *)(block + 16));
    MSG2 = vld1q_u32((const uint32_t *)(block + 32));
    MSG3 = vld1q_u32((const uint32_t *)(block + 48));

    MSG0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG0)));
    MSG1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG1)));
    MSG2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG2)));
    MSG3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG3)));

    TMP0 = vaddq_u32(MSG0, vld1q_u32(&k_sha256[0x00]));
    MSG0 = vsha256su0q_u32(MSG0, MSG1);

    TMP2 = STATE0;
    TMP1 = vaddq_u32(MSG1, vld1q_u32(&k_sha256[0x04]));
    STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
    STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
    MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);
    MSG1 = vsha256su0q_u32(MSG1, MSG2);

    TMP2 = STATE0;
    TMP0 = vaddq_u32(MSG2, vld1q_u32(&k_sha256[0x08]));
    STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
    STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
    MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);
    MSG2 = vsha256su0q_u32(MSG2, MSG3);

    TMP2 = STATE0;
    TMP1 = vaddq_u32(MSG3, vld1q_u32(&k_sha256[0x0c]));
    STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
    STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
    MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);
    MSG3 = vsha256su0q_u32(MSG3, MSG0);

    TMP2 = STATE0;
    TMP0 = vaddq_u32(MSG0, vld1q_u32(&k_sha256[0x10]));
    STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
    STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
    MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);
    MSG0 = vsha256su0q_u32(MSG0, MSG1);

    TMP2 = STATE0;
    TMP1 = vaddq_u32(MSG1, vld1q_u32(&k_sha256[0x14]));
    STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
    STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
    MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);
    MSG1 = vsha256su0q_u32(MSG1, MSG2);

    TMP2 = STATE0;
    TMP0 = vaddq_u32(MSG2, vld1q_u32(&k_sha256[0x18]));
    STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
    STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
    MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);
    MSG2 = vsha256su0q_u32(MSG2, MSG3);

    TMP2 = STATE0;
    TMP1 = vaddq_u32(MSG3, vld1q_u32(&k_sha256[0x1c]));
    STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
    STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
    MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);
    MSG3 = vsha256su0q_u32(MSG3, MSG0);

    TMP2 = STATE0;
    TMP0 = vaddq_u32(MSG0, vld1q_u32(&k_sha256[0x20]));
    STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
    STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
    MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);
    MSG0 = vsha256su0q_u32(MSG0, MSG1);

    TMP2 = STATE0;
    TMP1 = vaddq_u32(MSG1, vld1q_u32(&k_sha256[0x24]));
    STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
    STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
    MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);
    MSG1 = vsha256su0q_u32(MSG1, MSG2);

    TMP2 = STATE0;
    TMP0 = vaddq_u32(MSG2, vld1q_u32(&k_sha256[0x28]));
    STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
    STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
    MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);
    MSG2 = vsha256su0q_u32(MSG2, MSG3);

    TMP2 = STATE0;
    TMP1 = vaddq_u32(MSG3, vld1q_u32(&k_sha256[0x2c]));
    STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
    STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
    MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);
    MSG3 = vsha256su0q_u32(MSG3, MSG0);

    TMP2 = STATE0;
    TMP0 = vaddq_u32(MSG0, vld1q_u32(&k_sha256[0x30]));
    STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
    STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
    MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);

    TMP2 = STATE0;
    TMP1 = vaddq_u32(MSG1, vld1q_u32(&k_sha256[0x34]));
    STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
    STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);

    TMP2 = STATE0;
    TMP0 = vaddq_u32(MSG2, vld1q_u32(&k_sha256[0x38]));
    STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
    STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);

    TMP2 = STATE0;
    TMP1 = vaddq_u32(MSG3, vld1q_u32(&k_sha256[0x3c]));
    STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
    STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);

    TMP2 = STATE0;
    STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
    STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);

    STATE0 = vaddq_u32(STATE0, ABEF_SAVE);
    STATE1 = vaddq_u32(STATE1, CDGH_SAVE);

    vst1q_u32(&state->h[0], STATE0);
    vst1q_u32(&state->h[4], STATE1);
}
void sha256d80_4way_aarch64_impl(const sha256d80_midstate_t *mid,
                                 uint32_t start_nonce,
                                 sha256_state_t out_states[4]);

extern void sha256d80_4way_aarch64_kernel(const sha256d80_midstate_t *mid,
                                          uint32_t start_nonce,
                                          sha256_state_t out_states[4]);

/*
 * 2-way interleaved SHA-256 block compression.
 *
 * Processes two independent (state, block) pairs simultaneously to hide the
 * latency of vsha256hq_u32 on Cortex-A76 (latency 2, throughput 1): while
 * chain A is waiting for its result, chain B's instruction issues, filling
 * the pipeline bubble and approaching full throughput utilization.
 */
static void sha256_transform_armv8_2way(
    sha256_state_t *stA, const uint8_t blkA[64],
    sha256_state_t *stB, const uint8_t blkB[64]
)
{
    /* Chain A state */
    uint32x4_t SA0, SA1, ABEF_A, CDGH_A;
    uint32x4_t MA0, MA1, MA2, MA3;
    uint32x4_t TA0, TA1, TA2;
    /* Chain B state */
    uint32x4_t SB0, SB1, ABEF_B, CDGH_B;
    uint32x4_t MB0, MB1, MB2, MB3;
    uint32x4_t TB0, TB1, TB2;

    /* Load and byte-swap A */
    SA0 = vld1q_u32(&stA->h[0]);
    SA1 = vld1q_u32(&stA->h[4]);
    ABEF_A = SA0;
    CDGH_A = SA1;
    MA0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(vld1q_u32((const uint32_t *)(blkA + 0)))));
    MA1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(vld1q_u32((const uint32_t *)(blkA + 16)))));
    MA2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(vld1q_u32((const uint32_t *)(blkA + 32)))));
    MA3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(vld1q_u32((const uint32_t *)(blkA + 48)))));

    /* Load and byte-swap B */
    SB0 = vld1q_u32(&stB->h[0]);
    SB1 = vld1q_u32(&stB->h[4]);
    ABEF_B = SB0;
    CDGH_B = SB1;
    MB0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(vld1q_u32((const uint32_t *)(blkB + 0)))));
    MB1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(vld1q_u32((const uint32_t *)(blkB + 16)))));
    MB2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(vld1q_u32((const uint32_t *)(blkB + 32)))));
    MB3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(vld1q_u32((const uint32_t *)(blkB + 48)))));

    /* ---- Rounds 0-3 ---- */
    TA0 = vaddq_u32(MA0, vld1q_u32(&k_sha256[0x00]));
    MA0 = vsha256su0q_u32(MA0, MA1);
    TA2 = SA0;
    TB0 = vaddq_u32(MB0, vld1q_u32(&k_sha256[0x00]));
    MB0 = vsha256su0q_u32(MB0, MB1);
    TB2 = SB0;
    SA0 = vsha256hq_u32(SA0, SA1, TA0);
    SA1 = vsha256h2q_u32(SA1, TA2, TA0);
    MA0 = vsha256su1q_u32(MA0, MA2, MA3);
    SB0 = vsha256hq_u32(SB0, SB1, TB0);
    SB1 = vsha256h2q_u32(SB1, TB2, TB0);
    MB0 = vsha256su1q_u32(MB0, MB2, MB3);

    /* ---- Rounds 4-7 ---- */
    TA1 = vaddq_u32(MA1, vld1q_u32(&k_sha256[0x04]));
    MA1 = vsha256su0q_u32(MA1, MA2);
    TA2 = SA0;
    TB1 = vaddq_u32(MB1, vld1q_u32(&k_sha256[0x04]));
    MB1 = vsha256su0q_u32(MB1, MB2);
    TB2 = SB0;
    SA0 = vsha256hq_u32(SA0, SA1, TA1);
    SA1 = vsha256h2q_u32(SA1, TA2, TA1);
    MA1 = vsha256su1q_u32(MA1, MA3, MA0);
    SB0 = vsha256hq_u32(SB0, SB1, TB1);
    SB1 = vsha256h2q_u32(SB1, TB2, TB1);
    MB1 = vsha256su1q_u32(MB1, MB3, MB0);

    /* ---- Rounds 8-11 ---- */
    TA0 = vaddq_u32(MA2, vld1q_u32(&k_sha256[0x08]));
    MA2 = vsha256su0q_u32(MA2, MA3);
    TA2 = SA0;
    TB0 = vaddq_u32(MB2, vld1q_u32(&k_sha256[0x08]));
    MB2 = vsha256su0q_u32(MB2, MB3);
    TB2 = SB0;
    SA0 = vsha256hq_u32(SA0, SA1, TA0);
    SA1 = vsha256h2q_u32(SA1, TA2, TA0);
    MA2 = vsha256su1q_u32(MA2, MA0, MA1);
    SB0 = vsha256hq_u32(SB0, SB1, TB0);
    SB1 = vsha256h2q_u32(SB1, TB2, TB0);
    MB2 = vsha256su1q_u32(MB2, MB0, MB1);

    /* ---- Rounds 12-15 ---- */
    TA1 = vaddq_u32(MA3, vld1q_u32(&k_sha256[0x0c]));
    MA3 = vsha256su0q_u32(MA3, MA0);
    TA2 = SA0;
    TB1 = vaddq_u32(MB3, vld1q_u32(&k_sha256[0x0c]));
    MB3 = vsha256su0q_u32(MB3, MB0);
    TB2 = SB0;
    SA0 = vsha256hq_u32(SA0, SA1, TA1);
    SA1 = vsha256h2q_u32(SA1, TA2, TA1);
    MA3 = vsha256su1q_u32(MA3, MA1, MA2);
    SB0 = vsha256hq_u32(SB0, SB1, TB1);
    SB1 = vsha256h2q_u32(SB1, TB2, TB1);
    MB3 = vsha256su1q_u32(MB3, MB1, MB2);

    /* ---- Rounds 16-19 ---- */
    TA0 = vaddq_u32(MA0, vld1q_u32(&k_sha256[0x10]));
    MA0 = vsha256su0q_u32(MA0, MA1);
    TA2 = SA0;
    TB0 = vaddq_u32(MB0, vld1q_u32(&k_sha256[0x10]));
    MB0 = vsha256su0q_u32(MB0, MB1);
    TB2 = SB0;
    SA0 = vsha256hq_u32(SA0, SA1, TA0);
    SA1 = vsha256h2q_u32(SA1, TA2, TA0);
    MA0 = vsha256su1q_u32(MA0, MA2, MA3);
    SB0 = vsha256hq_u32(SB0, SB1, TB0);
    SB1 = vsha256h2q_u32(SB1, TB2, TB0);
    MB0 = vsha256su1q_u32(MB0, MB2, MB3);

    /* ---- Rounds 20-23 ---- */
    TA1 = vaddq_u32(MA1, vld1q_u32(&k_sha256[0x14]));
    MA1 = vsha256su0q_u32(MA1, MA2);
    TA2 = SA0;
    TB1 = vaddq_u32(MB1, vld1q_u32(&k_sha256[0x14]));
    MB1 = vsha256su0q_u32(MB1, MB2);
    TB2 = SB0;
    SA0 = vsha256hq_u32(SA0, SA1, TA1);
    SA1 = vsha256h2q_u32(SA1, TA2, TA1);
    MA1 = vsha256su1q_u32(MA1, MA3, MA0);
    SB0 = vsha256hq_u32(SB0, SB1, TB1);
    SB1 = vsha256h2q_u32(SB1, TB2, TB1);
    MB1 = vsha256su1q_u32(MB1, MB3, MB0);

    /* ---- Rounds 24-27 ---- */
    TA0 = vaddq_u32(MA2, vld1q_u32(&k_sha256[0x18]));
    MA2 = vsha256su0q_u32(MA2, MA3);
    TA2 = SA0;
    TB0 = vaddq_u32(MB2, vld1q_u32(&k_sha256[0x18]));
    MB2 = vsha256su0q_u32(MB2, MB3);
    TB2 = SB0;
    SA0 = vsha256hq_u32(SA0, SA1, TA0);
    SA1 = vsha256h2q_u32(SA1, TA2, TA0);
    MA2 = vsha256su1q_u32(MA2, MA0, MA1);
    SB0 = vsha256hq_u32(SB0, SB1, TB0);
    SB1 = vsha256h2q_u32(SB1, TB2, TB0);
    MB2 = vsha256su1q_u32(MB2, MB0, MB1);

    /* ---- Rounds 28-31 ---- */
    TA1 = vaddq_u32(MA3, vld1q_u32(&k_sha256[0x1c]));
    MA3 = vsha256su0q_u32(MA3, MA0);
    TA2 = SA0;
    TB1 = vaddq_u32(MB3, vld1q_u32(&k_sha256[0x1c]));
    MB3 = vsha256su0q_u32(MB3, MB0);
    TB2 = SB0;
    SA0 = vsha256hq_u32(SA0, SA1, TA1);
    SA1 = vsha256h2q_u32(SA1, TA2, TA1);
    MA3 = vsha256su1q_u32(MA3, MA1, MA2);
    SB0 = vsha256hq_u32(SB0, SB1, TB1);
    SB1 = vsha256h2q_u32(SB1, TB2, TB1);
    MB3 = vsha256su1q_u32(MB3, MB1, MB2);

    /* ---- Rounds 32-35 ---- */
    TA0 = vaddq_u32(MA0, vld1q_u32(&k_sha256[0x20]));
    MA0 = vsha256su0q_u32(MA0, MA1);
    TA2 = SA0;
    TB0 = vaddq_u32(MB0, vld1q_u32(&k_sha256[0x20]));
    MB0 = vsha256su0q_u32(MB0, MB1);
    TB2 = SB0;
    SA0 = vsha256hq_u32(SA0, SA1, TA0);
    SA1 = vsha256h2q_u32(SA1, TA2, TA0);
    MA0 = vsha256su1q_u32(MA0, MA2, MA3);
    SB0 = vsha256hq_u32(SB0, SB1, TB0);
    SB1 = vsha256h2q_u32(SB1, TB2, TB0);
    MB0 = vsha256su1q_u32(MB0, MB2, MB3);

    /* ---- Rounds 36-39 ---- */
    TA1 = vaddq_u32(MA1, vld1q_u32(&k_sha256[0x24]));
    MA1 = vsha256su0q_u32(MA1, MA2);
    TA2 = SA0;
    TB1 = vaddq_u32(MB1, vld1q_u32(&k_sha256[0x24]));
    MB1 = vsha256su0q_u32(MB1, MB2);
    TB2 = SB0;
    SA0 = vsha256hq_u32(SA0, SA1, TA1);
    SA1 = vsha256h2q_u32(SA1, TA2, TA1);
    MA1 = vsha256su1q_u32(MA1, MA3, MA0);
    SB0 = vsha256hq_u32(SB0, SB1, TB1);
    SB1 = vsha256h2q_u32(SB1, TB2, TB1);
    MB1 = vsha256su1q_u32(MB1, MB3, MB0);

    /* ---- Rounds 40-43 ---- */
    TA0 = vaddq_u32(MA2, vld1q_u32(&k_sha256[0x28]));
    MA2 = vsha256su0q_u32(MA2, MA3);
    TA2 = SA0;
    TB0 = vaddq_u32(MB2, vld1q_u32(&k_sha256[0x28]));
    MB2 = vsha256su0q_u32(MB2, MB3);
    TB2 = SB0;
    SA0 = vsha256hq_u32(SA0, SA1, TA0);
    SA1 = vsha256h2q_u32(SA1, TA2, TA0);
    MA2 = vsha256su1q_u32(MA2, MA0, MA1);
    SB0 = vsha256hq_u32(SB0, SB1, TB0);
    SB1 = vsha256h2q_u32(SB1, TB2, TB0);
    MB2 = vsha256su1q_u32(MB2, MB0, MB1);

    /* ---- Rounds 44-47 ---- */
    TA1 = vaddq_u32(MA3, vld1q_u32(&k_sha256[0x2c]));
    MA3 = vsha256su0q_u32(MA3, MA0);
    TA2 = SA0;
    TB1 = vaddq_u32(MB3, vld1q_u32(&k_sha256[0x2c]));
    MB3 = vsha256su0q_u32(MB3, MB0);
    TB2 = SB0;
    SA0 = vsha256hq_u32(SA0, SA1, TA1);
    SA1 = vsha256h2q_u32(SA1, TA2, TA1);
    MA3 = vsha256su1q_u32(MA3, MA1, MA2);
    SB0 = vsha256hq_u32(SB0, SB1, TB1);
    SB1 = vsha256h2q_u32(SB1, TB2, TB1);
    MB3 = vsha256su1q_u32(MB3, MB1, MB2);

    /* ---- Rounds 48-51 ---- */
    TA0 = vaddq_u32(MA0, vld1q_u32(&k_sha256[0x30]));
    TA2 = SA0;
    TB0 = vaddq_u32(MB0, vld1q_u32(&k_sha256[0x30]));
    TB2 = SB0;
    SA0 = vsha256hq_u32(SA0, SA1, TA0);
    SA1 = vsha256h2q_u32(SA1, TA2, TA0);
    SB0 = vsha256hq_u32(SB0, SB1, TB0);
    SB1 = vsha256h2q_u32(SB1, TB2, TB0);

    /* ---- Rounds 52-55 ---- */
    TA1 = vaddq_u32(MA1, vld1q_u32(&k_sha256[0x34]));
    TA2 = SA0;
    TB1 = vaddq_u32(MB1, vld1q_u32(&k_sha256[0x34]));
    TB2 = SB0;
    SA0 = vsha256hq_u32(SA0, SA1, TA1);
    SA1 = vsha256h2q_u32(SA1, TA2, TA1);
    SB0 = vsha256hq_u32(SB0, SB1, TB1);
    SB1 = vsha256h2q_u32(SB1, TB2, TB1);

    /* ---- Rounds 56-59 ---- */
    TA0 = vaddq_u32(MA2, vld1q_u32(&k_sha256[0x38]));
    TA2 = SA0;
    TB0 = vaddq_u32(MB2, vld1q_u32(&k_sha256[0x38]));
    TB2 = SB0;
    SA0 = vsha256hq_u32(SA0, SA1, TA0);
    SA1 = vsha256h2q_u32(SA1, TA2, TA0);
    SB0 = vsha256hq_u32(SB0, SB1, TB0);
    SB1 = vsha256h2q_u32(SB1, TB2, TB0);

    /* ---- Rounds 60-63 ---- */
    TA1 = vaddq_u32(MA3, vld1q_u32(&k_sha256[0x3c]));
    TA2 = SA0;
    TB1 = vaddq_u32(MB3, vld1q_u32(&k_sha256[0x3c]));
    TB2 = SB0;
    SA0 = vsha256hq_u32(SA0, SA1, TA1);
    SA1 = vsha256h2q_u32(SA1, TA2, TA1);
    SB0 = vsha256hq_u32(SB0, SB1, TB1);
    SB1 = vsha256h2q_u32(SB1, TB2, TB1);

    SA0 = vaddq_u32(SA0, ABEF_A);
    SA1 = vaddq_u32(SA1, CDGH_A);
    SB0 = vaddq_u32(SB0, ABEF_B);
    SB1 = vaddq_u32(SB1, CDGH_B);

    vst1q_u32(&stA->h[0], SA0);
    vst1q_u32(&stA->h[4], SA1);
    vst1q_u32(&stB->h[0], SB0);
    vst1q_u32(&stB->h[4], SB1);
}
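#ifdef SHA256_BACKEND_EXAMPLES
/*
 * Sketch of the 2-way contract (illustrative; the SHA256_BACKEND_EXAMPLES
 * guard is hypothetical and never defined by the build): interleaving only
 * changes instruction scheduling, so one 2-way call must be bit-for-bit
 * identical to two independent 1-way calls.
 */
static int example_2way_matches_1way(const uint8_t blkA[64], const uint8_t blkB[64])
{
    sha256_state_t a1, b1, a2, b2;

    sha256_state_init(&a1);
    b1 = a1;
    a2 = a1;
    b2 = a1;
    sha256_transform_armv8(&a1, blkA);
    sha256_transform_armv8(&b1, blkB);
    sha256_transform_armv8_2way(&a2, blkA, &b2, blkB);
    return memcmp(a1.h, a2.h, sizeof(a1.h)) == 0 &&
           memcmp(b1.h, b2.h, sizeof(b1.h)) == 0;
}
#endif /* SHA256_BACKEND_EXAMPLES */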
/*
 * Constant message words W[8..15] for the second SHA-256 pass.
 * block2 = [digest (32 bytes) | 0x80 | zeros | length (0x100 bits)]
 * After the vrev32q_u8 byte swap that the first-pass transform applies,
 * these padding words are always: W[8]=0x80000000, W[9..14]=0,
 * W[15]=0x00000100.  Preloading them avoids redundant memory reads per call.
 */
static const uint32_t k_block2_msg2[4] = { 0x80000000U, 0U, 0U, 0U };
static const uint32_t k_block2_msg3[4] = { 0U, 0U, 0U, 0x00000100U };

/*
 * Specialized 2-way second pass for SHA256d of the 80-byte header.
 *
 * Eliminates the double byte swap that would occur if we used the generic
 * transform path:
 *
 *   sha256_state_to_digest()       state->h[i] (native uint32) -> BE bytes
 *   sha256_transform_armv8_2way()  loads BE bytes -> vrev32q_u8 -> native uint32
 *
 * These two conversions cancel out.  This function loads the pass-1 state
 * words directly into MSG0/MSG1 without any byte reversal, saving ~52
 * shift/store/load/vrev operations per 4-nonce group.
 *
 * stA / stB: on entry, contain the pass-1 output state (used as the message);
 *            on exit, overwritten with the pass-2 (final) SHA-256 digest state.
 */
static void sha256_transform_armv8_2way_pass2(sha256_state_t *stA, sha256_state_t *stB)
{
    /* Chain A */
    uint32x4_t SA0, SA1, ABEF_A, CDGH_A;
    uint32x4_t MA0, MA1, MA2, MA3;
    uint32x4_t TA0, TA1, TA2;
    /* Chain B */
    uint32x4_t SB0, SB1, ABEF_B, CDGH_B;
    uint32x4_t MB0, MB1, MB2, MB3;
    uint32x4_t TB0, TB1, TB2;

    /* Load MSG0/MSG1 from the pass-1 state -- no bswap, the state words are
     * already in the native form expected by the SHA2 compression
     * instructions. */
    MA0 = vld1q_u32(&stA->h[0]);
    MA1 = vld1q_u32(&stA->h[4]);
    MB0 = vld1q_u32(&stB->h[0]);
    MB1 = vld1q_u32(&stB->h[4]);

    /* Constant padding (shared between both chains) */
    MA2 = vld1q_u32(k_block2_msg2);
    MA3 = vld1q_u32(k_block2_msg3);
    MB2 = MA2;
    MB3 = MA3;

    /* Initialize compression state to the SHA-256 IV */
    SA0 = vld1q_u32(&k_sha256_iv[0]);
    SA1 = vld1q_u32(&k_sha256_iv[4]);
    ABEF_A = SA0;
    CDGH_A = SA1;
    SB0 = SA0;
    SB1 = SA1;
    ABEF_B = SB0;
    CDGH_B = SB1;

    /* ---- Rounds 0-3 ---- */
    TA0 = vaddq_u32(MA0, vld1q_u32(&k_sha256[0x00]));
    MA0 = vsha256su0q_u32(MA0, MA1);
    TA2 = SA0;
    TB0 = vaddq_u32(MB0, vld1q_u32(&k_sha256[0x00]));
    MB0 = vsha256su0q_u32(MB0, MB1);
    TB2 = SB0;
    SA0 = vsha256hq_u32(SA0, SA1, TA0);
    SA1 = vsha256h2q_u32(SA1, TA2, TA0);
    MA0 = vsha256su1q_u32(MA0, MA2, MA3);
    SB0 = vsha256hq_u32(SB0, SB1, TB0);
    SB1 = vsha256h2q_u32(SB1, TB2, TB0);
    MB0 = vsha256su1q_u32(MB0, MB2, MB3);

    /* ---- Rounds 4-7 ---- */
    TA1 = vaddq_u32(MA1, vld1q_u32(&k_sha256[0x04]));
    MA1 = vsha256su0q_u32(MA1, MA2);
    TA2 = SA0;
    TB1 = vaddq_u32(MB1, vld1q_u32(&k_sha256[0x04]));
    MB1 = vsha256su0q_u32(MB1, MB2);
    TB2 = SB0;
    SA0 = vsha256hq_u32(SA0, SA1, TA1);
    SA1 = vsha256h2q_u32(SA1, TA2, TA1);
    MA1 = vsha256su1q_u32(MA1, MA3, MA0);
    SB0 = vsha256hq_u32(SB0, SB1, TB1);
    SB1 = vsha256h2q_u32(SB1, TB2, TB1);
    MB1 = vsha256su1q_u32(MB1, MB3, MB0);

    /* ---- Rounds 8-11 ---- */
    TA0 = vaddq_u32(MA2, vld1q_u32(&k_sha256[0x08]));
    MA2 = vsha256su0q_u32(MA2, MA3);
    TA2 = SA0;
    TB0 = vaddq_u32(MB2, vld1q_u32(&k_sha256[0x08]));
    MB2 = vsha256su0q_u32(MB2, MB3);
    TB2 = SB0;
    SA0 = vsha256hq_u32(SA0, SA1, TA0);
    SA1 = vsha256h2q_u32(SA1, TA2, TA0);
    MA2 = vsha256su1q_u32(MA2, MA0, MA1);
    SB0 = vsha256hq_u32(SB0, SB1, TB0);
    SB1 = vsha256h2q_u32(SB1, TB2, TB0);
    MB2 = vsha256su1q_u32(MB2, MB0, MB1);

    /* ---- Rounds 12-15 ---- */
    TA1 = vaddq_u32(MA3, vld1q_u32(&k_sha256[0x0c]));
    MA3 = vsha256su0q_u32(MA3, MA0);
    TA2 = SA0;
    TB1 = vaddq_u32(MB3, vld1q_u32(&k_sha256[0x0c]));
    MB3 = vsha256su0q_u32(MB3, MB0);
    TB2 = SB0;
    SA0 = vsha256hq_u32(SA0, SA1, TA1);
    SA1 = vsha256h2q_u32(SA1, TA2, TA1);
    MA3 = vsha256su1q_u32(MA3, MA1, MA2);
    SB0 = vsha256hq_u32(SB0, SB1, TB1);
    SB1 = vsha256h2q_u32(SB1, TB2, TB1);
    MB3 = vsha256su1q_u32(MB3, MB1, MB2);

    /* ---- Rounds 16-19 ---- */
    TA0 = vaddq_u32(MA0, vld1q_u32(&k_sha256[0x10]));
    MA0 = vsha256su0q_u32(MA0, MA1);
    TA2 = SA0;
    TB0 = vaddq_u32(MB0, vld1q_u32(&k_sha256[0x10]));
    MB0 = vsha256su0q_u32(MB0, MB1);
    TB2 = SB0;
    SA0 = vsha256hq_u32(SA0, SA1, TA0);
    SA1 = vsha256h2q_u32(SA1, TA2, TA0);
    MA0 = vsha256su1q_u32(MA0, MA2, MA3);
    SB0 = vsha256hq_u32(SB0, SB1, TB0);
    SB1 = vsha256h2q_u32(SB1, TB2, TB0);
    MB0 = vsha256su1q_u32(MB0, MB2, MB3);

    /* ---- Rounds 20-23 ---- */
    TA1 = vaddq_u32(MA1, vld1q_u32(&k_sha256[0x14]));
    MA1 = vsha256su0q_u32(MA1, MA2);
    TA2 = SA0;
    TB1 = vaddq_u32(MB1, vld1q_u32(&k_sha256[0x14]));
    MB1 = vsha256su0q_u32(MB1, MB2);
    TB2 = SB0;
    SA0 = vsha256hq_u32(SA0, SA1, TA1);
    SA1 = vsha256h2q_u32(SA1, TA2, TA1);
    MA1 = vsha256su1q_u32(MA1, MA3, MA0);
    SB0 = vsha256hq_u32(SB0, SB1, TB1);
    SB1 = vsha256h2q_u32(SB1, TB2, TB1);
    MB1 = vsha256su1q_u32(MB1, MB3, MB0);

    /* ---- Rounds 24-27 ---- */
    TA0 = vaddq_u32(MA2, vld1q_u32(&k_sha256[0x18]));
    MA2 = vsha256su0q_u32(MA2, MA3);
    TA2 = SA0;
    TB0 = vaddq_u32(MB2, vld1q_u32(&k_sha256[0x18]));
    MB2 = vsha256su0q_u32(MB2, MB3);
    TB2 = SB0;
    SA0 = vsha256hq_u32(SA0, SA1, TA0);
    SA1 = vsha256h2q_u32(SA1, TA2, TA0);
    MA2 = vsha256su1q_u32(MA2, MA0, MA1);
    SB0 = vsha256hq_u32(SB0, SB1, TB0);
    SB1 = vsha256h2q_u32(SB1, TB2, TB0);
    MB2 = vsha256su1q_u32(MB2, MB0, MB1);

    /* ---- Rounds 28-31 ---- */
    TA1 = vaddq_u32(MA3, vld1q_u32(&k_sha256[0x1c]));
    MA3 = vsha256su0q_u32(MA3, MA0);
    TA2 = SA0;
    TB1 = vaddq_u32(MB3, vld1q_u32(&k_sha256[0x1c]));
    MB3 = vsha256su0q_u32(MB3, MB0);
    TB2 = SB0;
    SA0 = vsha256hq_u32(SA0, SA1, TA1);
    SA1 = vsha256h2q_u32(SA1, TA2, TA1);
    MA3 = vsha256su1q_u32(MA3, MA1, MA2);
    SB0 = vsha256hq_u32(SB0, SB1, TB1);
    SB1 = vsha256h2q_u32(SB1, TB2, TB1);
    MB3 = vsha256su1q_u32(MB3, MB1, MB2);

    /* ---- Rounds 32-35 ---- */
    TA0 = vaddq_u32(MA0, vld1q_u32(&k_sha256[0x20]));
    MA0 = vsha256su0q_u32(MA0, MA1);
    TA2 = SA0;
    TB0 = vaddq_u32(MB0, vld1q_u32(&k_sha256[0x20]));
    MB0 = vsha256su0q_u32(MB0, MB1);
    TB2 = SB0;
    SA0 = vsha256hq_u32(SA0, SA1, TA0);
    SA1 = vsha256h2q_u32(SA1, TA2, TA0);
    MA0 = vsha256su1q_u32(MA0, MA2, MA3);
    SB0 = vsha256hq_u32(SB0, SB1, TB0);
    SB1 = vsha256h2q_u32(SB1, TB2, TB0);
    MB0 = vsha256su1q_u32(MB0, MB2, MB3);

    /* ---- Rounds 36-39 ---- */
    TA1 = vaddq_u32(MA1, vld1q_u32(&k_sha256[0x24]));
    MA1 = vsha256su0q_u32(MA1, MA2);
    TA2 = SA0;
    TB1 = vaddq_u32(MB1, vld1q_u32(&k_sha256[0x24]));
    MB1 = vsha256su0q_u32(MB1, MB2);
    TB2 = SB0;
    SA0 = vsha256hq_u32(SA0, SA1, TA1);
    SA1 = vsha256h2q_u32(SA1, TA2, TA1);
    MA1 = vsha256su1q_u32(MA1, MA3, MA0);
    SB0 = vsha256hq_u32(SB0, SB1, TB1);
    SB1 = vsha256h2q_u32(SB1, TB2, TB1);
    MB1 = vsha256su1q_u32(MB1, MB3, MB0);

    /* ---- Rounds 40-43 ---- */
    TA0 = vaddq_u32(MA2, vld1q_u32(&k_sha256[0x28]));
    MA2 = vsha256su0q_u32(MA2, MA3);
    TA2 = SA0;
    TB0 = vaddq_u32(MB2, vld1q_u32(&k_sha256[0x28]));
    MB2 = vsha256su0q_u32(MB2, MB3);
    TB2 = SB0;
    SA0 = vsha256hq_u32(SA0, SA1, TA0);
    SA1 = vsha256h2q_u32(SA1, TA2, TA0);
    MA2 = vsha256su1q_u32(MA2, MA0, MA1);
    SB0 = vsha256hq_u32(SB0, SB1, TB0);
    SB1 = vsha256h2q_u32(SB1, TB2, TB0);
    MB2 = vsha256su1q_u32(MB2, MB0, MB1);

    /* ---- Rounds 44-47 ---- */
    TA1 = vaddq_u32(MA3, vld1q_u32(&k_sha256[0x2c]));
    MA3 = vsha256su0q_u32(MA3, MA0);
    TA2 = SA0;
    TB1 = vaddq_u32(MB3, vld1q_u32(&k_sha256[0x2c]));
    MB3 = vsha256su0q_u32(MB3, MB0);
    TB2 = SB0;
    SA0 = vsha256hq_u32(SA0, SA1, TA1);
    SA1 = vsha256h2q_u32(SA1, TA2, TA1);
    MA3 = vsha256su1q_u32(MA3, MA1, MA2);
    SB0 = vsha256hq_u32(SB0, SB1, TB1);
    SB1 = vsha256h2q_u32(SB1, TB2, TB1);
    MB3 = vsha256su1q_u32(MB3, MB1, MB2);

    /* ---- Rounds 48-51 ---- */
    TA0 = vaddq_u32(MA0, vld1q_u32(&k_sha256[0x30]));
    TA2 = SA0;
    TB0 = vaddq_u32(MB0, vld1q_u32(&k_sha256[0x30]));
    TB2 = SB0;
    SA0 = vsha256hq_u32(SA0, SA1, TA0);
    SA1 = vsha256h2q_u32(SA1, TA2, TA0);
    SB0 = vsha256hq_u32(SB0, SB1, TB0);
    SB1 = vsha256h2q_u32(SB1, TB2, TB0);

    /* ---- Rounds 52-55 ---- */
    TA1 = vaddq_u32(MA1, vld1q_u32(&k_sha256[0x34]));
    TA2 = SA0;
    TB1 = vaddq_u32(MB1, vld1q_u32(&k_sha256[0x34]));
    TB2 = SB0;
    SA0 = vsha256hq_u32(SA0, SA1, TA1);
    SA1 = vsha256h2q_u32(SA1, TA2, TA1);
    SB0 = vsha256hq_u32(SB0, SB1, TB1);
    SB1 = vsha256h2q_u32(SB1, TB2, TB1);

    /* ---- Rounds 56-59 ---- */
    TA0 = vaddq_u32(MA2, vld1q_u32(&k_sha256[0x38]));
    TA2 = SA0;
    TB0 = vaddq_u32(MB2, vld1q_u32(&k_sha256[0x38]));
    TB2 = SB0;
    SA0 = vsha256hq_u32(SA0, SA1, TA0);
    SA1 = vsha256h2q_u32(SA1, TA2, TA0);
    SB0 = vsha256hq_u32(SB0, SB1, TB0);
    SB1 = vsha256h2q_u32(SB1, TB2, TB0);

    /* ---- Rounds 60-63 ---- */
    TA1 = vaddq_u32(MA3, vld1q_u32(&k_sha256[0x3c]));
    TA2 = SA0;
    TB1 = vaddq_u32(MB3, vld1q_u32(&k_sha256[0x3c]));
    TB2 = SB0;
    SA0 = vsha256hq_u32(SA0, SA1, TA1);
    SA1 = vsha256h2q_u32(SA1, TA2, TA1);
    SB0 = vsha256hq_u32(SB0, SB1, TB1);
    SB1 = vsha256h2q_u32(SB1, TB2, TB1);

    SA0 = vaddq_u32(SA0, ABEF_A);
    SA1 = vaddq_u32(SA1, CDGH_A);
    SB0 = vaddq_u32(SB0, ABEF_B);
    SB1 = vaddq_u32(SB1, CDGH_B);

    vst1q_u32(&stA->h[0], SA0);
    vst1q_u32(&stA->h[4], SA1);
    vst1q_u32(&stB->h[0], SB0);
    vst1q_u32(&stB->h[4], SB1);
}
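/*
 * For reference, the pass-2 block that k_block2_msg2/k_block2_msg3 encode,
 * laid out as bytes (this mirrors block2_template as built by
 * sha256d80_midstate_init_with_fn() below):
 *
 *   bytes  0..31   pass-1 digest (the 32-byte message)
 *   byte   32      0x80 padding marker
 *   bytes 33..61   zeros
 *   bytes 62..63   0x01 0x00 -- big-endian bit length, 32 * 8 = 256 = 0x100
 *
 * Read as big-endian 32-bit message words: W[8] = 0x80000000, W[9..14] = 0,
 * W[15] = 0x00000100.
 */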
/*
 * 4-way hash using 2-way interleaved transforms.
 * Pass 1: two calls to sha256_transform_armv8_2way (pairs 0,1 and 2,3).
 * Pass 2: two calls to sha256_transform_armv8_2way_pass2, which reads the
 *         pass-1 state words directly -- no intermediate byte serialization,
 *         no double bswap overhead.
 */
static void sha256d80_hash_4way_armv8_2way(
    const sha256d80_midstate_t *mid,
    uint32_t start_nonce,
    sha256_state_t out_states[4]
)
{
    uint8_t block1A[64], block1B[64];

    memcpy(block1A, mid->block1_template, 64);
    memcpy(block1B, mid->block1_template, 64);

    /* --- Pass 1, pair (0,1) --- */
    out_states[0] = mid->first_chunk_state;
    out_states[1] = mid->first_chunk_state;
    set_block1_nonce(block1A, start_nonce + 0U);
    set_block1_nonce(block1B, start_nonce + 1U);
    sha256_transform_armv8_2way(&out_states[0], block1A, &out_states[1], block1B);

    /* --- Pass 1, pair (2,3) --- */
    out_states[2] = mid->first_chunk_state;
    out_states[3] = mid->first_chunk_state;
    set_block1_nonce(block1A, start_nonce + 2U);
    set_block1_nonce(block1B, start_nonce + 3U);
    sha256_transform_armv8_2way(&out_states[2], block1A, &out_states[3], block1B);

    /* --- Pass 2: state words fed directly, no bswap round trip --- */
    sha256_transform_armv8_2way_pass2(&out_states[0], &out_states[1]);
    sha256_transform_armv8_2way_pass2(&out_states[2], &out_states[3]);
}

#endif /* __aarch64__ && __ARM_FEATURE_CRYPTO */

static sha256_transform_fn_t g_transform_fn = sha256_transform_openssl;
static sha256d80_hash4_fn_t g_hash4_fn;
static pthread_once_t g_backend_once = PTHREAD_ONCE_INIT;

static int cpu_has_sha2(void)
{
#if defined(__aarch64__) && defined(HWCAP_SHA2)
    unsigned long caps = getauxval(AT_HWCAP);
    return (caps & HWCAP_SHA2) != 0UL;
#else
    return 0;
#endif
}

static void sha256d80_midstate_init_with_fn(
    sha256d80_midstate_t *mid,
    const uint8_t header_76[76],
    sha256_transform_fn_t tf
)
{
    sha256_state_init(&mid->init_state);
    mid->first_chunk_state = mid->init_state;
    tf(&mid->first_chunk_state, header_76);  /* chunk 0: header bytes 0..63 */

    /* Block 1: header bytes 64..75, nonce at offset 12, then SHA-256
     * padding for an 80-byte message: 0x80 marker, zeros, and the 64-bit
     * big-endian bit length 640 = 0x0280 in the last two bytes. */
    memset(mid->block1_template, 0, sizeof(mid->block1_template));
    memcpy(mid->block1_template, header_76 + 64, 12);
    mid->block1_template[16] = 0x80;
    mid->block1_template[62] = 0x02;
    mid->block1_template[63] = 0x80;

    /* Block 2: the 32-byte pass-1 digest, then padding for a 32-byte
     * message: 0x80 marker, zeros, bit length 256 = 0x0100. */
    memset(mid->block2_template, 0, sizeof(mid->block2_template));
    mid->block2_template[32] = 0x80;
    mid->block2_template[62] = 0x01;
    mid->block2_template[63] = 0x00;
}

static void sha256d80_hash_4way_with_fn(
    const sha256d80_midstate_t *mid,
    uint32_t start_nonce,
    sha256_state_t out_states[4],
    sha256_transform_fn_t tf
)
{
    uint8_t block1[4][64];
    uint8_t block2[4][64];
    int i;

    for (i = 0; i < 4; i++) {
        out_states[i] = mid->first_chunk_state;
        memcpy(block1[i], mid->block1_template, sizeof(mid->block1_template));
        memcpy(block2[i], mid->block2_template, sizeof(mid->block2_template));
    }

    set_block1_nonce(block1[0], start_nonce + 0U);
    set_block1_nonce(block1[1], start_nonce + 1U);
    set_block1_nonce(block1[2], start_nonce + 2U);
    set_block1_nonce(block1[3], start_nonce + 3U);

    /* Pass 1: finish SHA-256 of the 80-byte header. */
    tf(&out_states[0], block1[0]);
    tf(&out_states[1], block1[1]);
    tf(&out_states[2], block1[2]);
    tf(&out_states[3], block1[3]);

    /* Serialize the pass-1 digests into the pass-2 message blocks. */
    sha256_state_to_digest(&out_states[0], block2[0]);
    sha256_state_to_digest(&out_states[1], block2[1]);
    sha256_state_to_digest(&out_states[2], block2[2]);
    sha256_state_to_digest(&out_states[3], block2[3]);

    /* Pass 2: SHA-256 of each 32-byte digest. */
    out_states[0] = mid->init_state;
    out_states[1] = mid->init_state;
    out_states[2] = mid->init_state;
    out_states[3] = mid->init_state;
    tf(&out_states[0], block2[0]);
    tf(&out_states[1], block2[1]);
    tf(&out_states[2], block2[2]);
    tf(&out_states[3], block2[3]);
}
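#ifdef SHA256_BACKEND_EXAMPLES
/*
 * Minimal sketch (compiled out of real builds; the SHA256_BACKEND_EXAMPLES
 * guard is hypothetical): the midstate path above must agree with a
 * straightforward one-shot double SHA-256 of the assembled 80-byte header.
 * Uses OpenSSL's one-shot SHA256() for the reference side.
 */
static int example_midstate_matches_oneshot(const uint8_t header_76[76], uint32_t nonce)
{
    uint8_t header80[80], d1[32], d2[32], via_mid[32];
    sha256d80_midstate_t mid;
    sha256_state_t st[4];

    /* Reference: assemble the header (nonce is little-endian at 76..79,
     * matching set_block1_nonce) and hash it twice. */
    memcpy(header80, header_76, 76);
    write_u32_le(header80 + 76, nonce);
    SHA256(header80, 80, d1);
    SHA256(d1, 32, d2);

    /* Midstate path: st[0] is the result for start_nonce + 0 == nonce. */
    sha256d80_midstate_init_with_fn(&mid, header_76, sha256_transform_openssl);
    sha256d80_hash_4way_with_fn(&mid, nonce, st, sha256_transform_openssl);
    sha256_state_to_digest(&st[0], via_mid);

    return memcmp(d2, via_mid, 32) == 0;
}
#endif /* SHA256_BACKEND_EXAMPLES */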
static void sha256d80_hash_4way_generic(
    const sha256d80_midstate_t *mid,
    uint32_t start_nonce,
    sha256_state_t out_states[4]
)
{
    sha256d80_hash_4way_with_fn(mid, start_nonce, out_states, g_transform_fn);
}

#if defined(__aarch64__) && defined(__ARM_FEATURE_CRYPTO)

void sha256d80_4way_aarch64_impl(
    const sha256d80_midstate_t *mid,
    uint32_t start_nonce,
    sha256_state_t out_states[4]
)
{
    sha256d80_hash_4way_with_fn(mid, start_nonce, out_states, sha256_transform_armv8);
}

/* Verify the ARMv8 transform against OpenSSL on one synthetic block. */
static int arm_transform_selftest(void)
{
    uint8_t block[64];
    sha256_state_t ref;
    sha256_state_t test;
    int i;

    sha256_state_init(&ref);
    sha256_state_init(&test);
    for (i = 0; i < 64; i++) {
        block[i] = (uint8_t)(i * 3 + 1);
    }
    sha256_transform_openssl(&ref, block);
    sha256_transform_armv8(&test, block);
    return memcmp(ref.h, test.h, sizeof(ref.h)) == 0;
}

/* Verify the interleaved 4-way path against the OpenSSL reference path. */
static int arm_hash4_selftest(void)
{
    uint8_t header_76[76];
    sha256d80_midstate_t mid;
    sha256_state_t ref[4];
    sha256_state_t test[4];
    int i;

    for (i = 0; i < 76; i++) {
        header_76[i] = (uint8_t)(i * 7 + 11);
    }
    sha256d80_midstate_init_with_fn(&mid, header_76, sha256_transform_openssl);
    sha256d80_hash_4way_with_fn(&mid, 0x10203040U, ref, sha256_transform_openssl);
    sha256d80_hash_4way_armv8_2way(&mid, 0x10203040U, test);
    return memcmp(ref, test, sizeof(ref)) == 0;
}

#endif /* __aarch64__ && __ARM_FEATURE_CRYPTO */

static void sha256_backend_init_once(void)
{
    g_hash4_fn = sha256d80_hash_4way_generic;
    g_transform_fn = sha256_transform_openssl;
#if defined(__aarch64__) && defined(__ARM_FEATURE_CRYPTO)
    if (cpu_has_sha2() && arm_transform_selftest() && arm_hash4_selftest()) {
        g_transform_fn = sha256_transform_armv8;
        g_hash4_fn = sha256d80_hash_4way_armv8_2way;
        fprintf(stderr, "[miner] backend SHA: ARMv8 2-way interleaved enabled\n");
    } else {
        fprintf(stderr, "[miner] backend SHA: OpenSSL fallback\n");
    }
#else
    fprintf(stderr, "[miner] backend SHA: OpenSSL fallback\n");
#endif
}

void sha256_state_init(sha256_state_t *state)
{
    memcpy(state->h, k_sha256_iv, sizeof(state->h));
}

void sha256_state_to_digest(const sha256_state_t *state, uint8_t out[32])
{
    int i;

    for (i = 0; i < 8; i++) {
        write_u32_be(out + i * 4, state->h[i]);
    }
}

void sha256_transform_fast(sha256_state_t *state, const uint8_t block[64])
{
    pthread_once(&g_backend_once, sha256_backend_init_once);
    g_transform_fn(state, block);
}

void sha256_transform_fast_2way(
    sha256_state_t *stA, const uint8_t blkA[64],
    sha256_state_t *stB, const uint8_t blkB[64]
)
{
    pthread_once(&g_backend_once, sha256_backend_init_once);
#if defined(__aarch64__) && defined(__ARM_FEATURE_CRYPTO)
    if (g_transform_fn == sha256_transform_armv8) {
        sha256_transform_armv8_2way(stA, blkA, stB, blkB);
        return;
    }
#endif
    g_transform_fn(stA, blkA);
    g_transform_fn(stB, blkB);
}

void sha256d80_midstate_init(sha256d80_midstate_t *mid, const uint8_t header_76[76])
{
    pthread_once(&g_backend_once, sha256_backend_init_once);
    sha256d80_midstate_init_with_fn(mid, header_76, g_transform_fn);
}

void sha256d80_hash_4way(
    const sha256d80_midstate_t *mid,
    uint32_t start_nonce,
    sha256_state_t out_states[4]
)
{
    pthread_once(&g_backend_once, sha256_backend_init_once);
    g_hash4_fn(mid, start_nonce, out_states);
}

uint32_t sha256d80_scan_4way(
    const sha256d80_midstate_t *mid,
    uint32_t start_nonce,
    const uint32_t target_words[8],
    sha256_state_t out_states[4]
)
{
    uint32_t mask = 0;

    pthread_once(&g_backend_once, sha256_backend_init_once);
    g_hash4_fn(mid, start_nonce, out_states);
    if (state_meets_target_words(&out_states[0], target_words)) { mask |= 1U; }
    if (state_meets_target_words(&out_states[1], target_words)) { mask |= 2U; }
    if (state_meets_target_words(&out_states[2], target_words)) { mask |= 4U; }
    if (state_meets_target_words(&out_states[3], target_words)) { mask |= 8U; }
    return mask;
}

void sha256_backend_ensure_init(void)
{
    pthread_once(&g_backend_once, sha256_backend_init_once);
}

/*
 * Same as sha256d80_scan_4way but without the pthread_once guard on the hot
 * path; callers are expected to have initialized the backend first, e.g.
 * via sha256_backend_ensure_init().
 */
uint32_t sha256d80_scan_4way_direct(
    const sha256d80_midstate_t *mid,
    uint32_t start_nonce,
    const uint32_t target_words[8],
    sha256_state_t out_states[4]
)
{
    uint32_t mask = 0;

    g_hash4_fn(mid, start_nonce, out_states);
    if (state_meets_target_words(&out_states[0], target_words)) { mask |= 1U; }
    if (state_meets_target_words(&out_states[1], target_words)) { mask |= 2U; }
    if (state_meets_target_words(&out_states[2], target_words)) { mask |= 4U; }
    if (state_meets_target_words(&out_states[3], target_words)) { mask |= 8U; }
    return mask;
}
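#ifdef SHA256_BACKEND_EXAMPLES
/*
 * End-to-end usage sketch (illustrative only; the guard is hypothetical and
 * the "no hit" sentinel is a simplification): scan a nonce range four
 * hashes at a time and return the first nonce that meets the target.
 * Assumes first_nonce + count does not wrap around.
 */
static uint32_t example_scan_range(const uint8_t header_76[76],
                                   const uint32_t target_words[8],
                                   uint32_t first_nonce, uint32_t count)
{
    sha256d80_midstate_t mid;
    sha256_state_t states[4];
    uint32_t nonce;

    sha256d80_midstate_init(&mid, header_76);
    for (nonce = first_nonce; nonce < first_nonce + count; nonce += 4U) {
        uint32_t mask = sha256d80_scan_4way(&mid, nonce, target_words, states);
        if (mask != 0U) {
            /* Bit i set means nonce + i meets the target. */
            return nonce + (uint32_t)__builtin_ctz(mask);
        }
    }
    return 0U;  /* 0 used as a "no hit" sentinel in this sketch */
}
#endif /* SHA256_BACKEND_EXAMPLES */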