From cb136cddd93046d46c6a8c5ee2a3f5c30a001e97 Mon Sep 17 00:00:00 2001 From: Edward Thomson Date: Wed, 14 Apr 2021 22:22:11 +0100 Subject: [PATCH] utf8: introduce git_utf8_char_length Introduce a function to determine the number of Unicode characters in a given UTF-8 string. --- src/utf8.c | 18 ++++++++++++++++++ src/utf8.h | 20 ++++++++++++++++++++ tests/core/utf8.c | 19 +++++++++++++++++++ 3 files changed, 57 insertions(+) create mode 100644 tests/core/utf8.c diff --git a/src/utf8.c b/src/utf8.c index 1a37da6fb..77065cb71 100644 --- a/src/utf8.c +++ b/src/utf8.c @@ -114,6 +114,24 @@ int git_utf8_iterate(uint32_t *out, const char *_str, size_t str_len) return length; } +size_t git_utf8_char_length(const char *_str, size_t str_len) +{ + const uint8_t *str = (const uint8_t *)_str; + size_t offset = 0, count = 0; + + while (offset < str_len) { + int length = utf8_charlen(str + offset, str_len - offset); + + if (length < 0) + length = 1; /* utf8_charlen reported failure: step over a single byte */ + + offset += length; + count++; /* every step counts as one character, whether or not it decoded */ + } + + return count; +} + size_t git_utf8_valid_buf_length(const char *_str, size_t str_len) { const uint8_t *str = (const uint8_t *)_str; diff --git a/src/utf8.h b/src/utf8.h index 71c8f3bee..dff91b294 100644 --- a/src/utf8.h +++ b/src/utf8.h @@ -19,6 +19,26 @@ */ extern int git_utf8_iterate(uint32_t *out, const char *str, size_t str_len); +/** + * Returns the number of characters in the given string. + * + * This function will count invalid codepoints; if any given byte is + * not part of a valid UTF-8 codepoint, then it will be counted toward + * the length in characters. 
+ * + * In other words: + * 0x24 (U+0024 "$") has length 1 + * 0xc2 0xa2 (U+00A2 "¢") has length 1 + * 0x24 0xc2 0xa2 (U+0024 U+00A2 "$¢") has length 2 + * 0xf0 0x90 0x8d 0x88 (U+10348 "𐍈") has length 1 + * 0x24 0xc0 0xc1 0x34 (U+0024, invalid byte, invalid byte, U+0034) has length 4 + * + * @param str string to scan + * @param str_len size of the string in bytes + * @return length in characters of the string + */ +extern size_t git_utf8_char_length(const char *str, size_t str_len); + /** * Iterate through an UTF-8 string and stops after finding any invalid UTF-8 * codepoints. diff --git a/tests/core/utf8.c b/tests/core/utf8.c new file mode 100644 index 000000000..021828e9e --- /dev/null +++ b/tests/core/utf8.c @@ -0,0 +1,19 @@ +#include "clar_libgit2.h" + +void test_core_utf8__char_length(void) +{ + cl_assert_equal_i(0, git_utf8_char_length("", 0)); + cl_assert_equal_i(1, git_utf8_char_length("$", 1)); + cl_assert_equal_i(5, git_utf8_char_length("abcde", 5)); + cl_assert_equal_i(1, git_utf8_char_length("\xc2\xa2", 2)); + cl_assert_equal_i(2, git_utf8_char_length("\x24\xc2\xa2", 3)); + cl_assert_equal_i(1, git_utf8_char_length("\xf0\x90\x8d\x88", 4)); + + /* uncontinued characters are counted as single characters */ + cl_assert_equal_i(2, git_utf8_char_length("\x24\xc2", 2)); + cl_assert_equal_i(3, git_utf8_char_length("\x24\xc2\xc2\xa2", 4)); + + /* invalid characters are counted as single characters */ + cl_assert_equal_i(4, git_utf8_char_length("\x24\xc0\xc0\x34", 4)); + cl_assert_equal_i(4, git_utf8_char_length("\x24\xf5\xfd\xc2", 4)); +}