Blender V4.3
string_utf8.cc File Reference
#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <cwchar>
#include <cwctype>
#include <wcwidth.h>
#include "BLI_utildefines.h"
#include "BLI_string.h"
#include "BLI_string_utf8.h"
#include "BLI_strict_flags.h"

Go to the source code of this file.

Macros

#define UTF8_VARS_FROM_CHAR32(Char, First, Len)
 

Functions

ptrdiff_t BLI_str_utf8_invalid_byte (const char *str, size_t length)
 
int BLI_str_utf8_invalid_strip (char *str, size_t length)
 
BLI_INLINE char * str_utf8_copy_max_bytes_impl (char *dst, const char *src, size_t dst_maxncpy)
 
char * BLI_strncpy_utf8 (char *__restrict dst, const char *__restrict src, size_t dst_maxncpy)
 
size_t BLI_strncpy_utf8_rlen (char *__restrict dst, const char *__restrict src, size_t dst_maxncpy)
 
size_t BLI_strncpy_wchar_as_utf8 (char *__restrict dst, const wchar_t *__restrict src, const size_t dst_maxncpy)
 
size_t BLI_wstrlen_utf8 (const wchar_t *src)
 
size_t BLI_strlen_utf8_ex (const char *strc, size_t *r_len_bytes)
 
size_t BLI_strlen_utf8 (const char *strc)
 
size_t BLI_strnlen_utf8_ex (const char *strc, const size_t strc_maxlen, size_t *r_len_bytes)
 
size_t BLI_strnlen_utf8 (const char *strc, const size_t strc_maxlen)
 
size_t BLI_strncpy_wchar_from_utf8 (wchar_t *__restrict dst_w, const char *__restrict src_c, const size_t dst_w_maxncpy)
 
int BLI_wcwidth_or_error (char32_t ucs)
 
int BLI_wcwidth_safe (char32_t ucs)
 
int BLI_wcswidth_or_error (const char32_t *pwcs, size_t n)
 
int BLI_str_utf8_char_width_or_error (const char *p)
 
int BLI_str_utf8_char_width_safe (const char *p)
 
int BLI_str_utf8_size_or_error (const char *p)
 
int BLI_str_utf8_size_safe (const char *p)
 
uint BLI_str_utf8_as_unicode_or_error (const char *p)
 
uint BLI_str_utf8_as_unicode_safe (const char *p)
 
uint BLI_str_utf8_as_unicode_step_or_error (const char *__restrict p, const size_t p_len, size_t *__restrict index)
 
uint BLI_str_utf8_as_unicode_step_safe (const char *__restrict p, const size_t p_len, size_t *__restrict index)
 
size_t BLI_str_utf8_from_unicode_len (const uint c)
 
size_t BLI_str_utf8_from_unicode (uint c, char *dst, const size_t dst_maxncpy)
 
size_t BLI_str_utf8_as_utf32 (char32_t *__restrict dst_w, const char *__restrict src_c, const size_t dst_w_maxncpy)
 
size_t BLI_str_utf32_as_utf8 (char *__restrict dst, const char32_t *__restrict src, const size_t dst_maxncpy)
 
size_t BLI_str_utf32_as_utf8_len_ex (const char32_t *src, const size_t src_maxlen)
 
size_t BLI_str_utf32_as_utf8_len (const char32_t *src)
 
const char * BLI_str_find_prev_char_utf8 (const char *p, const char *str_start)
 
const char * BLI_str_find_next_char_utf8 (const char *p, const char *str_end)
 
size_t BLI_str_partition_utf8 (const char *str, const uint delim[], const char **r_sep, const char **r_suf)
 
size_t BLI_str_rpartition_utf8 (const char *str, const uint delim[], const char **r_sep, const char **r_suf)
 
size_t BLI_str_partition_ex_utf8 (const char *str, const char *end, const uint delim[], const char **r_sep, const char **r_suf, const bool from_right)
 
UTF8 Character Decoding (Skip & Mask Lookup)

Derived from GLIB gutf8.c.

Ranges (zero based, inclusive):

  • 000..127: 1 byte.
  • 128..191: invalid.
  • 192..223: 2 bytes.
  • 224..239: 3 bytes.
  • 240..247: 4 bytes.
  • 248..251: 5 bytes.
  • 252..253: 6 bytes.
  • 254..255: invalid.

Invalid values fall back to 1 byte or -1 (for an error value).

Note
From testing string copying via BLI_strncpy_utf8 with large (multi-megabyte) strings, using a function instead of a lookup-table is between 2 & 3 times faster.
BLI_INLINE int utf8_char_compute_skip (const char c)
 
BLI_INLINE int utf8_char_compute_skip_or_error (const char c)
 
BLI_INLINE int utf8_char_compute_skip_or_error_with_mask (const char c, char *r_mask)
 
BLI_INLINE uint utf8_char_decode (const char *p, const char mask, const int len, const uint err)
 
UTF32 Case Conversion
Warning
The lower/uppercase form of some characters uses multiple characters. These cases are not accounted for by this conversion function. A common example is the German eszett / scharfes S (ß). Supporting such cases would require operating on a character array, with support for resizing (for reference, Python's upper/lower functions support this).
char32_t BLI_str_utf32_char_to_upper (const char32_t wc)
 
char32_t BLI_str_utf32_char_to_lower (const char32_t wc)
 
Offset Conversion in Strings
Note
Regarding the assertion BLI_assert(offset <= offset_target): the offset_target is likely in the middle of a UTF8 byte-sequence. Most likely the offset passed in is incorrect, although it may be impractical to avoid this happening in the case of invalid UTF8 byte sequences. If the assert is impractical to avoid, it could be demoted to a warning.
int BLI_str_utf8_offset_to_index (const char *str, const size_t str_len, const int offset_target)
 
int BLI_str_utf8_offset_from_index (const char *str, const size_t str_len, const int index_target)
 
int BLI_str_utf8_offset_to_column (const char *str, const size_t str_len, const int offset_target)
 
int BLI_str_utf8_offset_from_column (const char *str, const size_t str_len, const int column_target)
 
int BLI_str_utf8_offset_to_column_with_tabs (const char *str, const size_t str_len, const int offset_target, const int tab_width)
 
int BLI_str_utf8_offset_from_column_with_tabs (const char *str, const size_t str_len, const int column_target, const int tab_width)
 

Macro Definition Documentation

◆ UTF8_VARS_FROM_CHAR32

#define UTF8_VARS_FROM_CHAR32 ( Char,
First,
Len )
Value:
if (Char < 0x80) { \
First = 0; \
Len = 1; \
} \
else if (Char < 0x800) { \
First = 0xc0; \
Len = 2; \
} \
else if (Char < 0x10000) { \
First = 0xe0; \
Len = 3; \
} \
else if (Char < 0x200000) { \
First = 0xf0; \
Len = 4; \
} \
else if (Char < 0x4000000) { \
First = 0xf8; \
Len = 5; \
} \
else { \
First = 0xfc; \
Len = 6; \
} \
(void)0

Definition at line 822 of file string_utf8.cc.

Referenced by BLI_str_utf8_from_unicode(), and BLI_str_utf8_from_unicode_len().

Function Documentation

◆ BLI_str_find_next_char_utf8()

const char * BLI_str_find_next_char_utf8 ( const char * p,
const char * str_end )

Definition at line 976 of file string_utf8.cc.

References BLI_assert.

Referenced by BLI_str_utf8_as_utf32().

◆ BLI_str_find_prev_char_utf8()

const char * BLI_str_find_prev_char_utf8 ( const char * p,
const char * str_start )

Definition at line 961 of file string_utf8.cc.

References BLI_assert.

Referenced by BLI_str_partition_ex_utf8().

◆ BLI_str_partition_ex_utf8()

size_t BLI_str_partition_ex_utf8 ( const char * str,
const char * end,
const uint delim[],
const char ** r_sep,
const char ** r_suf,
const bool from_right )

◆ BLI_str_partition_utf8()

size_t BLI_str_partition_utf8 ( const char * str,
const uint delim[],
const char ** r_sep,
const char ** r_suf )

Definition at line 989 of file string_utf8.cc.

References BLI_str_partition_ex_utf8(), and str.

◆ BLI_str_rpartition_utf8()

size_t BLI_str_rpartition_utf8 ( const char * str,
const uint delim[],
const char ** r_sep,
const char ** r_suf )

Definition at line 997 of file string_utf8.cc.

References BLI_str_partition_ex_utf8(), and str.

◆ BLI_str_utf32_as_utf8()

size_t BLI_str_utf32_as_utf8 ( char *__restrict dst,
const char32_t *__restrict src,
const size_t dst_maxncpy )

◆ BLI_str_utf32_as_utf8_len()

size_t BLI_str_utf32_as_utf8_len ( const char32_t * src)
Returns
The length of the UTF-32 string when encoded as UTF-8.

Definition at line 950 of file string_utf8.cc.

References BLI_str_utf8_from_unicode_len(), and len.

Referenced by BKE_vfont_clipboard_set(), and ED_curve_editfont_load().

◆ BLI_str_utf32_as_utf8_len_ex()

size_t BLI_str_utf32_as_utf8_len_ex ( const char32_t * src,
size_t src_maxlen )
Returns
The length of the UTF-32 string when encoded as UTF-8, with the source length clamped to src_maxlen.

Definition at line 938 of file string_utf8.cc.

References BLI_str_utf8_from_unicode_len(), and len.

Referenced by font_select_to_buffer().

◆ BLI_str_utf32_char_to_lower()

char32_t BLI_str_utf32_char_to_lower ( char32_t wc)

Return the lowercase of a 32-bit character or the character when no case change is needed.

Note
A 1:1 mapping doesn't account for multiple characters as part of conversion in some cases.

Definition at line 642 of file string_utf8.cc.

References ARRAY_SIZE, max, and min.

Referenced by set_case().

◆ BLI_str_utf32_char_to_upper()

char32_t BLI_str_utf32_char_to_upper ( char32_t wc)

Return the uppercase of a 32-bit character or the character when no case change is needed.

Note
A 1:1 mapping doesn't account for multiple characters as part of conversion in some cases.

Definition at line 531 of file string_utf8.cc.

References ARRAY_SIZE, max, and min.

Referenced by set_case().

◆ BLI_str_utf8_as_unicode_or_error()

uint BLI_str_utf8_as_unicode_or_error ( const char * p)
Parameters
p: a pointer to a Unicode character encoded as UTF-8.

Converts a sequence of bytes encoded as UTF-8 to a Unicode character. If p does not point to a valid UTF-8 encoded character, results are undefined. If you are not sure that the bytes are complete valid Unicode characters, you should use g_utf8_get_char_validated() instead.

Return value: the resulting character

Definition at line 760 of file string_utf8.cc.

References BLI_UTF8_ERR, len, UNLIKELY, utf8_char_compute_skip_or_error_with_mask(), and utf8_char_decode().

Referenced by BLI_str_partition_ex_utf8(), BLI_str_utf8_as_unicode_safe(), BLI_str_utf8_char_width_or_error(), BLI_str_utf8_char_width_safe(), insert_text_invoke(), key_event_glyph_or_text(), text_autocomplete_build(), and text_insert_invoke().

◆ BLI_str_utf8_as_unicode_safe()

uint BLI_str_utf8_as_unicode_safe ( const char * p)

◆ BLI_str_utf8_as_unicode_step_or_error()

uint BLI_str_utf8_as_unicode_step_or_error ( const char *__restrict p,
const size_t p_len,
size_t *__restrict index )

◆ BLI_str_utf8_as_unicode_step_safe()

◆ BLI_str_utf8_as_utf32()

size_t BLI_str_utf8_as_utf32 ( char32_t *__restrict dst_w,
const char *__restrict src_c,
const size_t dst_w_maxncpy )

◆ BLI_str_utf8_char_width_or_error()

int BLI_str_utf8_char_width_or_error ( const char * p)

◆ BLI_str_utf8_char_width_safe()

◆ BLI_str_utf8_from_unicode()

size_t BLI_str_utf8_from_unicode ( unsigned int c,
char * dst,
size_t dst_maxncpy )

BLI_str_utf8_from_unicode:

Parameters
c: a Unicode character code.
dst: output buffer, must have at least dst_maxncpy bytes of space. If the length required by c exceeds dst_maxncpy, the available bytes will be zeroed and dst_maxncpy returned.

Converts a single character to UTF-8.

Returns
number of bytes written.

Definition at line 861 of file string_utf8.cc.

References BLI_string_debug_size, len, UNLIKELY, and UTF8_VARS_FROM_CHAR32.

Referenced by BLI_str_utf32_as_utf8(), BLI_strncpy_wchar_as_utf8(), find_family_object(), blender::io::usd::make_safe_name(), txt_add_char_intern(), txt_extended_ascii_as_utf8(), txt_replace_char(), and wm_event_add_ghostevent().

◆ BLI_str_utf8_from_unicode_len()

size_t BLI_str_utf8_from_unicode_len ( const uint c)

◆ BLI_str_utf8_invalid_byte()

ptrdiff_t BLI_str_utf8_invalid_byte ( const char * str,
size_t length )

Find first UTF-8 invalid byte in given str, of length bytes.

Returns
the offset of the first invalid byte.

Definition at line 150 of file string_utf8.cc.

References ELEM, str, and utf8_char_compute_skip().

Referenced by BLI_str_utf8_invalid_strip(), and txt_extended_ascii_as_utf8().

◆ BLI_str_utf8_invalid_strip()

int BLI_str_utf8_invalid_strip ( char * str,
size_t length )

Remove any invalid UTF-8 bytes (taking multi-byte sequences into account).

Returns
number of stripped bytes.

Definition at line 285 of file string_utf8.cc.

References BLI_assert, BLI_str_utf8_invalid_byte(), and str.

Referenced by BKE_id_new_name_validate(), BKE_vfontdata_from_freetypefont(), id_name_final_build(), outputNumInput(), SEQ_edit_sequence_name_set(), TEST(), ui_textedit_end(), and wm_clipboard_text_get_ex().

◆ BLI_str_utf8_offset_from_column()

int BLI_str_utf8_offset_from_column ( const char * str,
const size_t str_len,
const int column_target )

Definition at line 1105 of file string_utf8.cc.

References BLI_str_utf8_as_unicode_step_safe(), BLI_wcwidth_safe(), int, and str.

◆ BLI_str_utf8_offset_from_column_with_tabs()

int BLI_str_utf8_offset_from_column_with_tabs ( const char * str,
const size_t str_len,
const int column_target,
const int tab_width )

◆ BLI_str_utf8_offset_from_index()

int BLI_str_utf8_offset_from_index ( const char * str,
size_t str_len,
int index_target )

Return the byte offset in str from index_target.

Parameters
index_target: The Unicode index, where multi-byte characters are counted once. There is no need to clamp this value; the index is logically clamped to BLI_strlen_utf8(str) or below.

Definition at line 1077 of file string_utf8.cc.

References BLI_assert, BLI_str_utf8_as_unicode_step_safe(), int, str, and UNUSED_VARS.

Referenced by blender::nodes::node_geo_string_to_curves_cc::get_text_layout(), blender::nodes::node_fn_slice_string_cc::node_build_multi_function(), TEST(), and txt_sel_set().

◆ BLI_str_utf8_offset_to_column()

int BLI_str_utf8_offset_to_column ( const char * str,
const size_t str_len,
const int offset_target )

◆ BLI_str_utf8_offset_to_column_with_tabs()

int BLI_str_utf8_offset_to_column_with_tabs ( const char * str,
const size_t str_len,
const int offset_target,
const int tab_width )

◆ BLI_str_utf8_offset_to_index()

int BLI_str_utf8_offset_to_index ( const char * str,
const size_t str_len,
const int offset_target )

◆ BLI_str_utf8_size_or_error()

int BLI_str_utf8_size_or_error ( const char * p)
Returns
The size (in bytes) of a single UTF-8 char.
Warning
Can return -1 on bad chars.

Definition at line 750 of file string_utf8.cc.

References utf8_char_compute_skip_or_error().

Referenced by handleNumInput(), ui_do_but_textedit(), ui_handle_menu_letter_press_search(), wm_event_add_ghostevent(), WM_event_print(), and WM_event_utf8_to_ascii().

◆ BLI_str_utf8_size_safe()

◆ BLI_strlen_utf8()

◆ BLI_strlen_utf8_ex()

size_t BLI_strlen_utf8_ex ( const char * strc,
size_t * r_len_bytes )

Definition at line 396 of file string_utf8.cc.

References BLI_str_utf8_size_safe(), len, and UNLIKELY.

Referenced by BLI_strlen_utf8().

◆ BLI_strncpy_utf8()

char * BLI_strncpy_utf8 ( char *__restrict dst,
const char *__restrict src,
size_t dst_maxncpy )

Definition at line 343 of file string_utf8.cc.

References BLI_assert, BLI_string_debug_size, and str_utf8_copy_max_bytes_impl().

◆ BLI_strncpy_utf8_rlen()

size_t BLI_strncpy_utf8_rlen ( char *__restrict dst,
const char *__restrict src,
size_t dst_maxncpy )

Definition at line 352 of file string_utf8.cc.

References BLI_assert, BLI_string_debug_size, and str_utf8_copy_max_bytes_impl().

◆ BLI_strncpy_wchar_as_utf8()

size_t BLI_strncpy_wchar_as_utf8 ( char *__restrict dst,
const wchar_t *__restrict src,
const size_t dst_maxncpy )

◆ BLI_strncpy_wchar_from_utf8()

size_t BLI_strncpy_wchar_from_utf8 ( wchar_t *__restrict dst_w,
const char *__restrict src_c,
const size_t dst_w_maxncpy )

◆ BLI_strnlen_utf8()

size_t BLI_strnlen_utf8 ( const char * strc,
size_t strc_maxlen )
Parameters
strc: the string whose length is measured.
strc_maxlen: the string length (in bytes).
Returns
the unicode length (not in bytes!)

Definition at line 455 of file string_utf8.cc.

References BLI_strnlen_utf8_ex().

Referenced by blender::string_search::count_utf8_code_points(), TEST(), and ui_text_position_to_hidden().

◆ BLI_strnlen_utf8_ex()

size_t BLI_strnlen_utf8_ex ( const char * strc,
const size_t strc_maxlen,
size_t * r_len_bytes )

Definition at line 427 of file string_utf8.cc.

References BLI_str_utf8_size_safe(), len, and UNLIKELY.

Referenced by BLI_strnlen_utf8().

◆ BLI_wcswidth_or_error()

int BLI_wcswidth_or_error ( const char32_t * pwcs,
size_t n )

Definition at line 496 of file string_utf8.cc.

◆ BLI_wcwidth_or_error()

int BLI_wcwidth_or_error ( char32_t ucs)

Count the columns that a character/string occupies (based on wcwidth.c).

Definition at line 478 of file string_utf8.cc.

Referenced by blf_glyph_render(), BLI_str_cursor_step_next_utf32(), BLI_str_cursor_step_prev_utf32(), BLI_str_utf8_char_width_or_error(), and BLI_wcwidth_safe().

◆ BLI_wcwidth_safe()

◆ BLI_wstrlen_utf8()

size_t BLI_wstrlen_utf8 ( const wchar_t * src)
Returns
The length of the wchar_t string when encoded as UTF-8.

Definition at line 385 of file string_utf8.cc.

References BLI_str_utf8_from_unicode_len(), and len.

◆ str_utf8_copy_max_bytes_impl()

BLI_INLINE char * str_utf8_copy_max_bytes_impl ( char * dst,
const char * src,
size_t dst_maxncpy )

Internal utility for implementing BLI_strncpy_utf8 / BLI_strncpy_utf8_rlen.

Compatible with BLI_strncpy, but ensure no partial UTF8 chars.

Note
Currently we don't attempt to deal with invalid UTF-8 characters. See BLI_str_utf8_invalid_strip if that is needed.

Definition at line 318 of file string_utf8.cc.

References ATTR_FALLTHROUGH, UNLIKELY, and utf8_char_compute_skip().

Referenced by BLI_strncpy_utf8(), and BLI_strncpy_utf8_rlen().

◆ utf8_char_compute_skip()

BLI_INLINE int utf8_char_compute_skip ( const char c)

◆ utf8_char_compute_skip_or_error()

BLI_INLINE int utf8_char_compute_skip_or_error ( const char c)

Definition at line 78 of file string_utf8.cc.

Referenced by BLI_str_utf8_size_or_error().

◆ utf8_char_compute_skip_or_error_with_mask()

BLI_INLINE int utf8_char_compute_skip_or_error_with_mask ( const char c,
char * r_mask )

◆ utf8_char_decode()

BLI_INLINE uint utf8_char_decode ( const char * p,
const char mask,
const int len,
const uint err )

Decode a UTF8 code-point, use in combination with utf8_char_compute_skip_or_error_with_mask.

Definition at line 134 of file string_utf8.cc.

References count, len, mask(), and result.

Referenced by BLI_str_utf8_as_unicode_or_error(), and BLI_str_utf8_as_unicode_step_or_error().