From 28b8908347c0b49938d90c27970088fbe81c5ec3 Mon Sep 17 00:00:00 2001
From: Nic Barker
Date: Wed, 19 Mar 2025 11:30:00 +1300
Subject: [PATCH] Interim commit

---
 clay.h | 158 ++++++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 146 insertions(+), 12 deletions(-)

diff --git a/clay.h b/clay.h
index 2f69878..85b3770 100644
--- a/clay.h
+++ b/clay.h
@@ -1348,22 +1348,156 @@ Clay_ElementId Clay__HashString(Clay_String key, const uint32_t offset, const ui
     return CLAY__INIT(Clay_ElementId) { .id = hash + 1, .offset = offset, .baseId = base + 1, .stringId = key }; // Reserve the hash result of zero as "null id"
 }
 
+#if !defined(CLAY_DISABLE_SIMD) && defined(__AVX2__) && (defined(__x86_64__) || defined(_M_X64) || defined(_M_AMD64))
+// AVX2 path. Gated on __AVX2__ so builds that do not enable AVX2 use the portable fallback below
+// instead of compiling 256-bit intrinsics for a target that was not asked for.
+// Rotate each 64-bit lane left by r bits, 0 < r < 64 (equivalent to _mm256_rol_epi64 in AVX512)
+static inline __m256i rol64(__m256i x, int r) {
+    return _mm256_or_si256(_mm256_slli_epi64(x, r), _mm256_srli_epi64(x, 64 - r));
+}
+
+// A simple ARX mix function: a += b, then b = rol64(b, 17) ^ a
+static inline void arx_mix(__m256i *a, __m256i *b) {
+    *a = _mm256_add_epi64(*a, *b);
+    *b = _mm256_xor_si256(rol64(*b, 17), *a);
+}
+
+// SIMD ARX hash function (AVX2). Hashes len bytes starting at data, 32 bytes per round, and
+// returns the XOR of the four 64-bit lanes of the final state. data is only read when len > 0.
+uint64_t arx_simd_hash(const uint8_t *data, size_t len) {
+    // Pinched these constants from the BLAKE implementation
+    __m256i v0 = _mm256_set1_epi64x(0x6a09e667f3bcc908ULL);
+    __m256i v1 = _mm256_set1_epi64x(0xbb67ae8584caa73bULL);
+    __m256i v2 = _mm256_set1_epi64x(0x3c6ef372fe94f82bULL);
+    __m256i v3 = _mm256_set1_epi64x(0xa54ff53a5f1d36f1ULL);
+
+    // Process 32-byte chunks
+    while (len > 0) {
+        __m256i msg;
+        if (len >= 32) {
+            msg = _mm256_loadu_si256((const __m256i *)data);
+            data += 32;
+            len -= 32;
+        } else {
+            // Zero-pad the final partial chunk. The staging buffer must be a full 32 bytes: up to
+            // 31 bytes are copied in and _mm256_loadu_si256 always reads 32.
+            uint8_t tail[32] = {0};
+            for (size_t i = 0; i < len; i++) {
+                tail[i] = data[i];
+            }
+            msg = _mm256_loadu_si256((const __m256i *)tail);
+            len = 0;
+        }
+
+        v0 = _mm256_xor_si256(v0, msg);
+        arx_mix(&v0, &v1);
+        arx_mix(&v2, &v3);
+
+        // Cross-lane mixing
+        v0 = _mm256_add_epi64(v0, v2);
+        v1 = _mm256_add_epi64(v1, v3);
+    }
+
+    // Final mixing rounds
+    arx_mix(&v0, &v1);
+    arx_mix(&v2, &v3);
+    v0 = _mm256_add_epi64(v0, v2);
+    v1 = _mm256_add_epi64(v1, v3);
+
+    // Extract final hash
+    uint64_t result[4];
+    _mm256_storeu_si256((__m256i *)result, v0);
+
+    return result[0] ^ result[1] ^ result[2] ^ result[3];
+}
+#elif !defined(CLAY_DISABLE_SIMD) && defined(__aarch64__)
+// Rotate each 64-bit lane left by r bits, 0 < r < 64. vshlq_u64 takes a signed per-lane count and
+// shifts right when it is negative, so r does not have to be a compile-time constant.
+static inline uint64x2_t rol64(uint64x2_t x, int r) {
+    return vorrq_u64(vshlq_u64(x, vdupq_n_s64(r)), vshlq_u64(x, vdupq_n_s64(r - 64)));
+}
+
+// A simple ARX mix function: a += b, then b = rol64(b, 17) ^ a
+static inline void arx_mix(uint64x2_t *a, uint64x2_t *b) {
+    *a = vaddq_u64(*a, *b);
+    *b = veorq_u64(rol64(*b, 17), *a);
+}
+
+// SIMD ARX hash function (NEON). Hashes len bytes starting at data, up to 16 bytes per round, and
+// returns the XOR of the two 64-bit lanes of the final state. data is only read when len > 0.
+uint64_t arx_simd_hash(const uint8_t *data, size_t len) {
+    // Pinched these constants from the BLAKE implementation
+    uint64x2_t v0 = vdupq_n_u64(0x6a09e667f3bcc908ULL);
+    uint64x2_t v1 = vdupq_n_u64(0xbb67ae8584caa73bULL);
+    uint64x2_t v2 = vdupq_n_u64(0x3c6ef372fe94f82bULL);
+    uint64x2_t v3 = vdupq_n_u64(0xa54ff53a5f1d36f1ULL);
+
+    // Process 16-byte chunks. Bytes are loaded with the u8 intrinsics and reinterpreted, so data
+    // is never accessed through a uint64_t pointer it may not be aligned for.
+    while (len > 0) {
+        uint64x2_t msg;
+        if (len >= 16) {
+            msg = vreinterpretq_u64_u8(vld1q_u8(data));
+            data += 16;
+            len -= 16;
+        } else if (len > 8) {
+            // 9-15 bytes left: absorb 8 now (upper lane zero), the rest goes through the branch below
+            msg = vcombine_u64(vreinterpret_u64_u8(vld1_u8(data)), vdup_n_u64(0));
+            data += 8;
+            len -= 8;
+        } else {
+            // Last 1-8 bytes: zero-pad into the lower lane, upper lane zero
+            uint8_t tail[8] = {0};
+            for (size_t i = 0; i < len; i++) {
+                tail[i] = data[i];
+            }
+            msg = vreinterpretq_u64_u8(vcombine_u8(vld1_u8(tail), vdup_n_u8(0)));
+            len = 0;
+        }
+
+        v0 = veorq_u64(v0, msg);
+        arx_mix(&v0, &v1);
+        arx_mix(&v2, &v3);
+
+        // Cross-lane mixing
+        v0 = vaddq_u64(v0, v2);
+        v1 = vaddq_u64(v1, v3);
+    }
+
+    // Final mixing rounds
+    arx_mix(&v0, &v1);
+    arx_mix(&v2, &v3);
+    v0 = vaddq_u64(v0, v2);
+    v1 = vaddq_u64(v1, v3);
+
+    // Extract final hash
+    uint64_t result[2];
+    vst1q_u64(result, v0);
+
+    return result[0] ^ result[1];
+}
+#else
+// Portable fallback so arx_simd_hash is defined on every target: Clay__HashTextWithConfig calls it
+// unconditionally. Byte-at-a-time add / shift / xor mix, widened to 64 bits.
+uint64_t arx_simd_hash(const uint8_t *data, size_t len) {
+    uint64_t hash = 0x6a09e667f3bcc908ULL;
+    for (size_t i = 0; i < len; i++) {
+        hash += data[i];
+        hash += (hash << 10);
+        hash ^= (hash >> 6);
+    }
+    hash += (hash << 3);
+    hash ^= (hash >> 11);
+    hash += (hash << 15);
+    return hash;
+}
+#endif
+
 uint32_t Clay__HashTextWithConfig(Clay_String *text, Clay_TextElementConfig *config) {
     uint32_t hash = 0;
     uintptr_t pointerAsNumber = (uintptr_t)text->chars;
 
-    if (config->hashStringContents) {
-        uint32_t maxLengthToHash = CLAY__MIN(text->length, 256);
-        for (uint32_t i = 0; i < maxLengthToHash; i++) {
-            hash += text->chars[i];
-            hash += (hash << 10);
-            hash ^= (hash >> 6);
-        }
-    } else {
-        hash += pointerAsNumber;
-        hash += (hash << 10);
-        hash ^= (hash >> 6);
-    }
+    hash = arx_simd_hash((const uint8_t *)text->chars, text->length) % UINT32_MAX;
     hash += text->length;
     hash += (hash << 10);