mirror of
https://github.com/libgit2/libgit2.git
synced 2026-06-22 06:26:26 +00:00
utf8: introduce git_utf8_char_length
Introduce a function to determine the number of Unicode characters in a given UTF-8 string.
This commit is contained in:
18
src/utf8.c
18
src/utf8.c
@@ -114,6 +114,24 @@ int git_utf8_iterate(uint32_t *out, const char *_str, size_t str_len)
|
||||
return length;
|
||||
}
|
||||
|
||||
/*
 * Count the characters found in the first `str_len` bytes of `_str`.
 *
 * The buffer is walked one character at a time, using utf8_charlen to
 * learn how many bytes the character at the current position occupies.
 * When utf8_charlen reports a negative length, the byte at the current
 * position is counted as a character of its own and the walk resumes
 * at the following byte.
 *
 * _str:    buffer to scan
 * str_len: number of bytes of `_str` to examine
 *
 * Returns the number of characters counted; 0 when `str_len` is 0.
 */
size_t git_utf8_char_length(const char *_str, size_t str_len)
{
	const uint8_t *bytes = (const uint8_t *)_str;
	size_t pos, chars = 0;

	/* every pass through the loop accounts for exactly one character */
	for (pos = 0; pos < str_len; chars++) {
		int step = utf8_charlen(bytes + pos, str_len - pos);

		/* negative length: consume a single byte instead */
		pos += (step < 0) ? 1 : step;
	}

	return chars;
}
|
||||
|
||||
size_t git_utf8_valid_buf_length(const char *_str, size_t str_len)
|
||||
{
|
||||
const uint8_t *str = (const uint8_t *)_str;
|
||||
|
||||
20
src/utf8.h
20
src/utf8.h
@@ -19,6 +19,26 @@
|
||||
*/
|
||||
extern int git_utf8_iterate(uint32_t *out, const char *str, size_t str_len);
|
||||
|
||||
/**
|
||||
* Returns the number of characters in the given string.
|
||||
*
|
||||
* This function will count invalid codepoints; if any given byte is
|
||||
* not part of a valid UTF-8 codepoint, then it will be counted toward
|
||||
* the length in characters.
|
||||
*
|
||||
* In other words:
|
||||
* 0x24 (U+0024 "$") has length 1
|
||||
* 0xc2 0xa2 (U+00A2 "¢") has length 1
|
||||
* 0x24 0xc2 0xa2 (U+0024 U+00A2 "$¢") has length 2
|
||||
* 0xf0 0x90 0x8d 0x88 (U+10348 "𐍈") has length 1
|
||||
* 0x24 0xc0 0xc1 0x34 (U+0024 <invalid> <invalid> U+0034 "4") has length 4
|
||||
*
|
||||
* @param str string to scan
|
||||
* @param str_len size of the string
|
||||
* @return length in characters of the string
|
||||
*/
|
||||
extern size_t git_utf8_char_length(const char *str, size_t str_len);
|
||||
|
||||
/**
|
||||
* Iterates through a UTF-8 string and stops after finding any invalid UTF-8
|
||||
* codepoints.
|
||||
|
||||
19
tests/core/utf8.c
Normal file
19
tests/core/utf8.c
Normal file
@@ -0,0 +1,19 @@
|
||||
#include "clar_libgit2.h"
|
||||
|
||||
/*
 * Checks the character counts reported by git_utf8_char_length for
 * empty, single-byte and multi-byte input, and for input that contains
 * truncated or invalid byte sequences.
 */
void test_core_utf8__char_length(void)
|
||||
{
|
||||
cl_assert_equal_i(0, git_utf8_char_length("", 0));
|
||||
cl_assert_equal_i(1, git_utf8_char_length("$", 1));
|
||||
cl_assert_equal_i(5, git_utf8_char_length("abcde", 5));
|
||||
cl_assert_equal_i(1, git_utf8_char_length("\xc2\xa2", 2));
|
||||
cl_assert_equal_i(2, git_utf8_char_length("\x24\xc2\xa2", 3));
|
||||
cl_assert_equal_i(1, git_utf8_char_length("\xf0\x90\x8d\x88", 4));
|
||||
|
||||
/* a lead byte that is missing its continuation byte is expected to count as one character */
|
||||
cl_assert_equal_i(2, git_utf8_char_length("\x24\xc2", 2));
|
||||
cl_assert_equal_i(3, git_utf8_char_length("\x24\xc2\xc2\xa2", 4));
|
||||
|
||||
/* each invalid byte is expected to count as one character */
|
||||
cl_assert_equal_i(4, git_utf8_char_length("\x24\xc0\xc0\x34", 4));
|
||||
cl_assert_equal_i(4, git_utf8_char_length("\x24\xf5\xfd\xc2", 4));
|
||||
}
|
||||
Reference in New Issue
Block a user