Compare commits

..

No commits in common. "3f355b578c9c2579d89ef3bd09fcd997a3f3741b" and "65a34cf353e6c014610f02268771a4ecf15bf87f" have entirely different histories.

159
clay.h
View File

@ -384,6 +384,10 @@ typedef struct {
// CLAY_TEXT_ALIGN_CENTER - Horizontally aligns wrapped lines of text to the center of their bounding box. // CLAY_TEXT_ALIGN_CENTER - Horizontally aligns wrapped lines of text to the center of their bounding box.
// CLAY_TEXT_ALIGN_RIGHT - Horizontally aligns wrapped lines of text to the right hand side of their bounding box. // CLAY_TEXT_ALIGN_RIGHT - Horizontally aligns wrapped lines of text to the right hand side of their bounding box.
Clay_TextAlignment textAlignment; Clay_TextAlignment textAlignment;
// When set to true, clay will hash the entire text contents of this string as an identifier for its internal
// text measurement cache, rather than just the pointer and length. This will incur significant performance cost for
// long bodies of text.
bool hashStringContents;
} Clay_TextElementConfig; } Clay_TextElementConfig;
CLAY__WRAPPER_STRUCT(Clay_TextElementConfig); CLAY__WRAPPER_STRUCT(Clay_TextElementConfig);
@ -872,7 +876,8 @@ CLAY_DLL_EXPORT int32_t Clay_GetMaxMeasureTextCacheWordCount(void);
// Modifies the maximum number of measured "words" (whitespace separated runs of characters) that Clay can store in its internal text measurement cache. // Modifies the maximum number of measured "words" (whitespace separated runs of characters) that Clay can store in its internal text measurement cache.
// This may require reallocating additional memory, and re-calling Clay_Initialize(); // This may require reallocating additional memory, and re-calling Clay_Initialize();
CLAY_DLL_EXPORT void Clay_SetMaxMeasureTextCacheWordCount(int32_t maxMeasureTextCacheWordCount); CLAY_DLL_EXPORT void Clay_SetMaxMeasureTextCacheWordCount(int32_t maxMeasureTextCacheWordCount);
// Resets Clay's internal text measurement cache. Useful if font mappings have changed or fonts have been reloaded. // Resets Clay's internal text measurement cache, useful if memory to represent strings is being re-used.
// Similar behaviour can be achieved on an individual text element level by using Clay_TextElementConfig.hashStringContents
CLAY_DLL_EXPORT void Clay_ResetMeasureTextCache(void); CLAY_DLL_EXPORT void Clay_ResetMeasureTextCache(void);
// Internal API functions required by macros ---------------------- // Internal API functions required by macros ----------------------
@ -1343,68 +1348,76 @@ Clay_ElementId Clay__HashString(Clay_String key, const uint32_t offset, const ui
} }
#if !defined(CLAY_DISABLE_SIMD) && (defined(__x86_64__) || defined(_M_X64) || defined(_M_AMD64)) #if !defined(CLAY_DISABLE_SIMD) && (defined(__x86_64__) || defined(_M_X64) || defined(_M_AMD64))
static inline __m128i Clay__SIMDRotateLeft(__m128i x, int r) { // Rotate left in AVX2 (equivalent to _mm256_rol_epi64 in AVX512)
return _mm_or_si128(_mm_slli_epi64(x, r), _mm_srli_epi64(x, 64 - r)); static inline __m256i rol64(__m256i x, int r) {
return _mm256_or_si256(_mm256_slli_epi64(x, r), _mm256_srli_epi64(x, 64 - r));
} }
static inline void Clay__SIMDARXMix(__m128i* a, __m128i* b) { // A simple ARX mix function
*a = _mm_add_epi64(*a, *b); static inline void arx_mix(__m256i *a, __m256i *b) {
*b = _mm_xor_si128(Clay__SIMDRotateLeft(*b, 17), *a); *a = _mm256_add_epi64(*a, *b);
*b = _mm256_xor_si256(rol64(*b, 17), *a);
} }
uint64_t Clay__HashData(const uint8_t* data, size_t length) { // SIMD ARX hash function (AVX2)
uint64_t arx_simd_hash(const uint8_t *data, size_t len) {
// Pinched these constants from the BLAKE implementation // Pinched these constants from the BLAKE implementation
__m128i v0 = _mm_set1_epi64x(0x6a09e667f3bcc908ULL); __m256i v0 = _mm256_set1_epi64x(0x6a09e667f3bcc908ULL);
__m128i v1 = _mm_set1_epi64x(0xbb67ae8584caa73bULL); __m256i v1 = _mm256_set1_epi64x(0xbb67ae8584caa73bULL);
__m128i v2 = _mm_set1_epi64x(0x3c6ef372fe94f82bULL); __m256i v2 = _mm256_set1_epi64x(0x3c6ef372fe94f82bULL);
__m128i v3 = _mm_set1_epi64x(0xa54ff53a5f1d36f1ULL); __m256i v3 = _mm256_set1_epi64x(0xa54ff53a5f1d36f1ULL);
uint8_t overflowBuffer[16] = {0}; // Temporary buffer for small inputs uint8_t overflowBuffer[16] = {0}; // Temporary buffer for small inputs
while (length > 0) { // Process 32-byte chunks
__m128i msg; while (len > 0) {
if (length >= 16) { __m256i msg;
msg = _mm_loadu_si128((const __m128i*)data); if (len >= 32) {
data += 16; msg = _mm256_loadu_si256((const __m256i *)data);
length -= 16; data += 32;
} len -= 32;
else { } else {
for (int i = 0; i < length; i++) { memset(overflowBuffer, 0, 16);
overflowBuffer[i] = data[i]; memcpy(overflowBuffer, data, len);
} msg = _mm256_loadu_si256((const __m256i *)overflowBuffer);
msg = _mm_loadu_si128((const __m128i*)overflowBuffer); len = 0;
length = 0;
} }
v0 = _mm_xor_si128(v0, msg); v0 = _mm256_xor_si256(v0, msg);
Clay__SIMDARXMix(&v0, &v1); arx_mix(&v0, &v1);
Clay__SIMDARXMix(&v2, &v3); arx_mix(&v2, &v3);
v0 = _mm_add_epi64(v0, v2); // Cross-lane mixing
v1 = _mm_add_epi64(v1, v3); v0 = _mm256_add_epi64(v0, v2);
v1 = _mm256_add_epi64(v1, v3);
} }
Clay__SIMDARXMix(&v0, &v1); // Final mixing rounds
Clay__SIMDARXMix(&v2, &v3); arx_mix(&v0, &v1);
v0 = _mm_add_epi64(v0, v2); arx_mix(&v2, &v3);
v1 = _mm_add_epi64(v1, v3); v0 = _mm256_add_epi64(v0, v2);
v1 = _mm256_add_epi64(v1, v3);
uint64_t result[2]; // Extract final hash
_mm_storeu_si128((__m128i*)result, v0); uint64_t result[4];
_mm256_storeu_si256((__m256i *)result, v0);
return result[0] ^ result[1]; return result[0] ^ result[1] ^ result[2] ^ result[3];
} }
#elif !defined(CLAY_DISABLE_SIMD) && defined(__aarch64__) #elif !defined(CLAY_DISABLE_SIMD) && defined(__aarch64__)
static inline uint64x2_t Clay__SIMDRotateLeft(uint64x2_t x, int r) { // Rotate left in NEON (simulating _mm256_rol_epi64)
static inline uint64x2_t rol64(uint64x2_t x, int r) {
return vorrq_u64(vshlq_n_u64(x, 17), vshrq_n_u64(x, 64 - 17)); return vorrq_u64(vshlq_n_u64(x, 17), vshrq_n_u64(x, 64 - 17));
} }
static inline void Clay__SIMDARXMix(uint64x2_t* a, uint64x2_t* b) { // A simple ARX mix function
static inline void arx_mix(uint64x2_t *a, uint64x2_t *b) {
*a = vaddq_u64(*a, *b); *a = vaddq_u64(*a, *b);
*b = veorq_u64(Clay__SIMDRotateLeft(*b, 17), *a); *b = veorq_u64(rol64(*b, 17), *a);
} }
uint64_t Clay__HashData(const uint8_t* data, size_t length) { // SIMD ARX hash function (NEON)
uint64_t arx_simd_hash(const uint8_t *data, size_t len) {
// Pinched these constants from the BLAKE implementation // Pinched these constants from the BLAKE implementation
uint64x2_t v0 = vdupq_n_u64(0x6a09e667f3bcc908ULL); uint64x2_t v0 = vdupq_n_u64(0x6a09e667f3bcc908ULL);
uint64x2_t v1 = vdupq_n_u64(0xbb67ae8584caa73bULL); uint64x2_t v1 = vdupq_n_u64(0xbb67ae8584caa73bULL);
@ -1413,59 +1426,57 @@ uint64_t Clay__HashData(const uint8_t* data, size_t length) {
uint8_t overflowBuffer[8] = {0}; uint8_t overflowBuffer[8] = {0};
while (length > 0) { // Process 16-byte chunks
while (len > 0) {
uint64x2_t msg; uint64x2_t msg;
if (length > 16) { if (len > 16) {
msg = vld1q_u64((const uint64_t *)data); msg = vld1q_u64((const uint64_t *)data);
data += 16; data += 16;
length -= 16; len -= 16;
} } else if (len > 8) {
else if (length > 8) {
msg = vcombine_u64(vld1_u64((const uint64_t *)data), vdup_n_u64(0)); msg = vcombine_u64(vld1_u64((const uint64_t *)data), vdup_n_u64(0));
data += 8; data += 8;
length -= 8; len -= 8;
} } else {
else { for (int i = 0; i < len; i++) {
for (int i = 0; i < length; i++) {
overflowBuffer[i] = data[i]; overflowBuffer[i] = data[i];
} }
uint8x8_t lower = vld1_u8(overflowBuffer); uint8x8_t lower = vld1_u8(overflowBuffer); // Load up to 8 bytes
msg = vcombine_u8(lower, vdup_n_u8(0)); msg = vcombine_u8(lower, vdup_n_u8(0)); // Zero upper 8 bytes
length = 0; len = 0;
} }
v0 = veorq_u64(v0, msg); v0 = veorq_u64(v0, msg);
Clay__SIMDARXMix(&v0, &v1); arx_mix(&v0, &v1);
Clay__SIMDARXMix(&v2, &v3); arx_mix(&v2, &v3);
// Cross-lane mixing
v0 = vaddq_u64(v0, v2); v0 = vaddq_u64(v0, v2);
v1 = vaddq_u64(v1, v3); v1 = vaddq_u64(v1, v3);
} }
Clay__SIMDARXMix(&v0, &v1); // Final mixing rounds
Clay__SIMDARXMix(&v2, &v3); arx_mix(&v0, &v1);
arx_mix(&v2, &v3);
v0 = vaddq_u64(v0, v2); v0 = vaddq_u64(v0, v2);
v1 = vaddq_u64(v1, v3); v1 = vaddq_u64(v1, v3);
// Extract final hash
uint64_t result[2]; uint64_t result[2];
vst1q_u64(result, v0); vst1q_u64(result, v0);
return result[0] ^ result[1]; return result[0] ^ result[1];
} }
#else
// Scalar fallback for Clay__HashData, compiled when neither SIMD branch above is selected.
// Folds `length` bytes starting at `data` into a 64 bit hash, one byte at a time:
// each byte is added to the running value, followed by a shift-add and an xor-shift mixing step.
// - data:   pointer to the bytes to hash. Not dereferenced when length is 0.
// - length: number of bytes to read from data.
// Returns the accumulated hash; an empty input yields 0.
uint64_t Clay__HashData(const uint8_t* data, size_t length) {
    uint64_t hash = 0;
    // The index is size_t so it matches the type of `length`: a signed 32 bit counter would be
    // compared against an unsigned bound and would overflow for inputs longer than INT32_MAX bytes.
    for (size_t i = 0; i < length; i++) {
        hash += data[i];
        hash += (hash << 10);
        hash ^= (hash >> 6);
    }
    return hash;
}
#endif #endif
uint32_t Clay__HashStringContentsWithConfig(Clay_String *text, Clay_TextElementConfig *config) { uint32_t Clay__HashTextWithConfig(Clay_String *text, Clay_TextElementConfig *config) {
uint32_t hash = Clay__HashData((const uint8_t *)text->chars, text->length) % UINT32_MAX; uint32_t hash = 0;
uintptr_t pointerAsNumber = (uintptr_t)text->chars;
hash = arx_simd_hash((const uint8_t *)text->chars, text->length) % UINT32_MAX;
hash += text->length;
hash += (hash << 10);
hash ^= (hash >> 6);
hash += config->fontId; hash += config->fontId;
hash += (hash << 10); hash += (hash << 10);
@ -1475,10 +1486,18 @@ uint32_t Clay__HashStringContentsWithConfig(Clay_String *text, Clay_TextElementC
hash += (hash << 10); hash += (hash << 10);
hash ^= (hash >> 6); hash ^= (hash >> 6);
hash += config->lineHeight;
hash += (hash << 10);
hash ^= (hash >> 6);
hash += config->letterSpacing; hash += config->letterSpacing;
hash += (hash << 10); hash += (hash << 10);
hash ^= (hash >> 6); hash ^= (hash >> 6);
hash += config->wrapMode;
hash += (hash << 10);
hash ^= (hash >> 6);
hash += (hash << 3); hash += (hash << 3);
hash ^= (hash >> 11); hash ^= (hash >> 11);
hash += (hash << 15); hash += (hash << 15);
@ -1513,7 +1532,7 @@ Clay__MeasureTextCacheItem *Clay__MeasureTextCached(Clay_String *text, Clay_Text
return &Clay__MeasureTextCacheItem_DEFAULT; return &Clay__MeasureTextCacheItem_DEFAULT;
} }
#endif #endif
uint32_t id = Clay__HashStringContentsWithConfig(text, config); uint32_t id = Clay__HashTextWithConfig(text, config);
uint32_t hashBucket = id % (context->maxMeasureTextCacheWordCount / 32); uint32_t hashBucket = id % (context->maxMeasureTextCacheWordCount / 32);
int32_t elementIndexPrevious = 0; int32_t elementIndexPrevious = 0;
int32_t elementIndex = context->measureTextHashMap.internalArray[hashBucket]; int32_t elementIndex = context->measureTextHashMap.internalArray[hashBucket];