Blender V5.0
string_utf8.cc
Go to the documentation of this file.
1/* SPDX-FileCopyrightText: 1999 Tom Tromey
2 * SPDX-FileCopyrightText: 2000 Red Hat, Inc. All rights reserved.
3 * SPDX-FileCopyrightText: 2011 Blender Authors
4 *
5 * SPDX-License-Identifier: GPL-2.0-or-later
6 *
7 * Code from `gutf8.c` by Tom Tromey & Red Hat, Inc. */
8
12
13#include <algorithm>
14#include <cstdio>
15#include <cstdlib>
16#include <cstring>
17#include <cwchar>
18#include <cwctype>
19#include <wcwidth.h>
20
21#include "BLI_utildefines.h"
22
23#include "BLI_string.h" /* #BLI_string_debug_size. */
24#include "BLI_string_utf8.h" /* own include */
25#ifdef WIN32
26# include "utfconv.hh"
27#endif
28#ifdef __GNUC__
29# pragma GCC diagnostic error "-Wsign-conversion"
30#endif
31
32#include "BLI_strict_flags.h" /* IWYU pragma: keep. Keep last. */
33
34static size_t str_utf8_truncate_at_size_unchecked(char *str, const size_t str_size);
35
36/* -------------------------------------------------------------------- */
57
59{
60 if (UNLIKELY(c >= 192)) {
61 if ((c & 0xe0) == 0xc0) {
62 return 2;
63 }
64 if ((c & 0xf0) == 0xe0) {
65 return 3;
66 }
67 if ((c & 0xf8) == 0xf0) {
68 return 4;
69 }
70 if ((c & 0xfc) == 0xf8) {
71 return 5;
72 }
73 if ((c & 0xfe) == 0xfc) {
74 return 6;
75 }
76 }
77 return 1;
78}
79
81{
82 if (c < 128) {
83 return 1;
84 }
85 if ((c & 0xe0) == 0xc0) {
86 return 2;
87 }
88 if ((c & 0xf0) == 0xe0) {
89 return 3;
90 }
91 if ((c & 0xf8) == 0xf0) {
92 return 4;
93 }
94 if ((c & 0xfc) == 0xf8) {
95 return 5;
96 }
97 if ((c & 0xfe) == 0xfc) {
98 return 6;
99 }
100 return -1;
101}
102
104{
105 /* Originally from GLIB `UTF8_COMPUTE` macro. */
106 if (c < 128) {
107 *r_mask = 0x7f;
108 return 1;
109 }
110 if ((c & 0xe0) == 0xc0) {
111 *r_mask = 0x1f;
112 return 2;
113 }
114 if ((c & 0xf0) == 0xe0) {
115 *r_mask = 0x0f;
116 return 3;
117 }
118 if ((c & 0xf8) == 0xf0) {
119 *r_mask = 0x07;
120 return 4;
121 }
122 if ((c & 0xfc) == 0xf8) {
123 *r_mask = 0x03;
124 return 5;
125 }
126 if ((c & 0xfe) == 0xfc) {
127 *r_mask = 0x01;
128 return 6;
129 }
130 return -1;
131}
132
136BLI_INLINE uint utf8_char_decode(const char *p, const char mask, const int len, const uint err)
137{
138 /* Originally from GLIB `UTF8_GET` macro, added an 'err' argument. */
139 uint result = p[0] & mask;
140 for (int count = 1; count < len; count++) {
141 if ((p[count] & 0xc0) != 0x80) {
142 return err;
143 }
144 result <<= 6;
145 result |= p[count] & 0x3f;
146 }
147 return result;
148}
149
151
152ptrdiff_t BLI_str_utf8_invalid_byte(const char *str, size_t str_len)
153{
154 /* NOTE(@ideasman42): from libswish3, originally called `u8_isvalid()`,
155 * modified to return the index of the bad character (byte index not UTF).
156 * http://svn.swish-e.org/libswish3/trunk/src/libswish3/utf8.c r3044.
157 *
158 * Comment from code in: `libswish3`.
159 * Based on the `valid_utf8` routine from the PCRE library by Philip Hazel
160 *
161 * length is in bytes, since without knowing whether the string is valid
162 * it's hard to know how many characters there are! */
163
164 const uchar *p, *perr, *pend = (const uchar *)str + str_len;
165 uchar c;
166 int ab;
167
168 for (p = (const uchar *)str; p < pend; p++, str_len--) {
169 c = *p;
170 perr = p; /* Erroneous char is always the first of an invalid UTF8 sequence... */
171 if (ELEM(c, 0xfe, 0xff, 0x00)) {
172 /* Those three values are not allowed in UTF8 string. */
173 goto utf8_error;
174 }
175 if (c < 128) {
176 continue;
177 }
178 if ((c & 0xc0) != 0xc0) {
179 goto utf8_error;
180 }
181
182 /* Note that since we always increase p (and decrease length) by one byte in main loop,
183 * we only add/subtract extra UTF8 bytes in code below
184 * (ab number, aka number of bytes remaining in the UTF8 sequence after the initial one). */
185 ab = utf8_char_compute_skip(c) - 1;
186 if (str_len <= size_t(ab)) {
187 goto utf8_error;
188 }
189
190 /* Check top bits in the second byte */
191 p++;
192 str_len--;
193 if ((*p & 0xc0) != 0x80) {
194 goto utf8_error;
195 }
196
197 /* Check for overlong sequences for each different length */
198 switch (ab) {
199 case 1:
200 /* Check for: `XX00 000X`. */
201 if ((c & 0x3e) == 0) {
202 goto utf8_error;
203 }
204 continue; /* We know there aren't any more bytes to check */
205
206 case 2:
207 /* Check for: `1110 0000, XX0X XXXX`. */
208 if (c == 0xe0 && (*p & 0x20) == 0) {
209 goto utf8_error;
210 }
211 /* Some special cases, see section 5 of UTF8 decoder stress-test by Markus Kuhn
212 * (https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt). */
213 /* From section 5.1 (and 5.2) */
214 if (c == 0xed) {
215 if (*p == 0xa0 && *(p + 1) == 0x80) {
216 goto utf8_error;
217 }
218 if (*p == 0xad && *(p + 1) == 0xbf) {
219 goto utf8_error;
220 }
221 if (*p == 0xae && *(p + 1) == 0x80) {
222 goto utf8_error;
223 }
224 if (*p == 0xaf && *(p + 1) == 0xbf) {
225 goto utf8_error;
226 }
227 if (*p == 0xb0 && *(p + 1) == 0x80) {
228 goto utf8_error;
229 }
230 if (*p == 0xbe && *(p + 1) == 0x80) {
231 goto utf8_error;
232 }
233 if (*p == 0xbf && *(p + 1) == 0xbf) {
234 goto utf8_error;
235 }
236 }
237 /* From section 5.3 */
238 if (c == 0xef) {
239 if (*p == 0xbf && *(p + 1) == 0xbe) {
240 goto utf8_error;
241 }
242 if (*p == 0xbf && *(p + 1) == 0xbf) {
243 goto utf8_error;
244 }
245 }
246 break;
247
248 case 3:
249 /* Check for: `1111 0000, XX00 XXXX`. */
250 if (c == 0xf0 && (*p & 0x30) == 0) {
251 goto utf8_error;
252 }
253 break;
254
255 case 4:
256 /* Check for `1111 1000, XX00 0XXX`. */
257 if (c == 0xf8 && (*p & 0x38) == 0) {
258 goto utf8_error;
259 }
260 break;
261
262 case 5:
263 /* Check for: `1111 1100, XX00 00XX`. */
264 if (c == 0xfc && (*p & 0x3c) == 0) {
265 goto utf8_error;
266 }
267 break;
268 }
269
270 /* Check for valid bytes after the 2nd, if any; all must start 10. */
271 while (--ab > 0) {
272 p++;
273 str_len--;
274 if ((*p & 0xc0) != 0x80) {
275 goto utf8_error;
276 }
277 }
278 }
279
280 return -1;
281
282utf8_error:
283
284 return ((const char *)perr - (const char *)str);
285}
286
287int BLI_str_utf8_invalid_strip(char *str, size_t str_len)
288{
289 ptrdiff_t bad_char;
290 int tot = 0;
291
292 BLI_assert(str[str_len] == '\0');
293
294 while ((bad_char = BLI_str_utf8_invalid_byte(str, str_len)) != -1) {
295 str += bad_char;
296 str_len -= size_t(bad_char + 1);
297
298 if (str_len == 0) {
299 /* last character bad, strip it */
300 *str = '\0';
301 tot++;
302 break;
303 }
304 /* strip, keep looking */
305 memmove(str, str + 1, str_len + 1); /* +1 for null char! */
306 tot++;
307 }
308
309 return tot;
310}
311
312int BLI_str_utf8_invalid_substitute(char *str, size_t str_len, const char substitute)
313{
314 BLI_assert(substitute);
315 ptrdiff_t bad_char;
316 int tot = 0;
317
318 BLI_assert(str[str_len] == '\0');
319
320 while ((bad_char = BLI_str_utf8_invalid_byte(str, str_len)) != -1) {
321 str[bad_char] = substitute;
322 bad_char += 1; /* Step over the bad character. */
323 str += bad_char;
324 str_len -= size_t(bad_char);
325 tot++;
326 }
327
328 return tot;
329}
330
332 const size_t str_len,
333 const char substitute,
334 char *buf,
335 const size_t buf_maxncpy)
336{
337 BLI_assert(str[str_len] == '\0');
338 const ptrdiff_t bad_char = BLI_str_utf8_invalid_byte(str, str_len);
339 if (LIKELY(bad_char == -1)) {
340 return str;
341 }
342 BLI_assert(bad_char >= 0);
343
344 /* In the case a bad character is outside the buffer limit,
345 * simply perform a truncating UTF8 copy into the buffer and return that. */
346 if (UNLIKELY(size_t(bad_char) >= buf_maxncpy)) {
347 BLI_strncpy_utf8(buf, str, buf_maxncpy);
348 return buf;
349 }
350
351 size_t buf_len;
352 if (str_len < buf_maxncpy) {
353 memcpy(buf, str, str_len + 1);
354 buf_len = str_len;
355 }
356 else {
357 buf_len = BLI_strncpy_rlen(buf, str, buf_maxncpy);
358 }
359
360 /* Skip the good characters. */
361 BLI_str_utf8_invalid_substitute(buf + bad_char, buf_len - size_t(bad_char), substitute);
362 return buf;
363}
364
378BLI_INLINE char *str_utf8_copy_max_bytes_impl(char *dst, const char *src, size_t dst_maxncpy)
379{
380 /* Cast to `uint8_t` is a no-op, quiets array subscript of type `char` warning.
381 * No need to check `src` points to a nil byte as this will return from the switch statement. */
382 size_t utf8_size;
383 while ((utf8_size = size_t(utf8_char_compute_skip(*src))) <= dst_maxncpy) {
384 dst_maxncpy -= utf8_size;
385 /* Prefer more compact block. */
386 /* NOLINTBEGIN: bugprone-assignment-in-if-condition */
387 /* clang-format off */
388 switch (utf8_size) {
389 case 6: if (UNLIKELY(!(*dst = *src++))) { return dst; } dst++; ATTR_FALLTHROUGH;
390 case 5: if (UNLIKELY(!(*dst = *src++))) { return dst; } dst++; ATTR_FALLTHROUGH;
391 case 4: if (UNLIKELY(!(*dst = *src++))) { return dst; } dst++; ATTR_FALLTHROUGH;
392 case 3: if (UNLIKELY(!(*dst = *src++))) { return dst; } dst++; ATTR_FALLTHROUGH;
393 case 2: if (UNLIKELY(!(*dst = *src++))) { return dst; } dst++; ATTR_FALLTHROUGH;
394 case 1: if (UNLIKELY(!(*dst = *src++))) { return dst; } dst++;
395 }
396 /* clang-format on */
397 /* NOLINTEND: bugprone-assignment-in-if-condition */
398 }
399 return dst;
400}
401
402char *BLI_strncpy_utf8(char *__restrict dst, const char *__restrict src, size_t dst_maxncpy)
403{
404 BLI_assert(dst_maxncpy != 0);
405 BLI_string_debug_size(dst, dst_maxncpy);
406
407 char *dst_end = str_utf8_copy_max_bytes_impl(dst, src, dst_maxncpy - 1);
408 *dst_end = '\0';
409 return dst;
410}
411
412size_t BLI_strncpy_utf8_rlen(char *__restrict dst, const char *__restrict src, size_t dst_maxncpy)
413{
414 BLI_assert(dst_maxncpy != 0);
415 BLI_string_debug_size(dst, dst_maxncpy);
416
417 char *r_dst = dst;
418 dst = str_utf8_copy_max_bytes_impl(dst, src, dst_maxncpy - 1);
419 *dst = '\0';
420
421 return size_t(dst - r_dst);
422}
423
424size_t BLI_strncpy_utf8_rlen_unterminated(char *__restrict dst,
425 const char *__restrict src,
426 size_t dst_maxncpy)
427{
428 BLI_string_debug_size(dst, dst_maxncpy);
429
430 char *r_dst = dst;
431 dst = str_utf8_copy_max_bytes_impl(dst, src, dst_maxncpy);
432
433 return size_t(dst - r_dst);
434}
435
436/* -------------------------------------------------------------------- */
437/* wchar_t / UTF8 functions */
438
439size_t BLI_strncpy_wchar_as_utf8(char *__restrict dst,
440 const wchar_t *__restrict src,
441 const size_t dst_maxncpy)
442{
443 BLI_assert(dst_maxncpy != 0);
444 BLI_string_debug_size(dst, dst_maxncpy);
445
446 size_t len = 0;
447 while (*src && len < dst_maxncpy) {
448 len += BLI_str_utf8_from_unicode(uint(*src++), dst + len, dst_maxncpy - len);
449 }
450 dst[len] = '\0';
451 /* Return the correct length when part of the final byte did not fit into the string. */
452 while ((len > 0) && UNLIKELY(dst[len - 1] == '\0')) {
453 len--;
454 }
455 return len;
456}
457
458size_t BLI_wstrlen_utf8(const wchar_t *src)
459{
460 size_t len = 0;
461
462 while (*src) {
464 }
465
466 return len;
467}
468
469size_t BLI_strlen_utf8_ex(const char *strc, size_t *r_len_bytes)
470{
471 size_t len = 0;
472 const char *strc_orig = strc;
473
474 while (*strc) {
475 int step = BLI_str_utf8_size_safe(strc);
476
477 /* Detect null bytes within multi-byte sequences.
478 * This matches the behavior of #BLI_strncpy_utf8 for incomplete byte sequences. */
479 for (int i = 1; i < step; i++) {
480 if (UNLIKELY(strc[i] == '\0')) {
481 step = i;
482 break;
483 }
484 }
485
486 strc += step;
487 len++;
488 }
489
490 *r_len_bytes = size_t(strc - strc_orig);
491 return len;
492}
493
494size_t BLI_strlen_utf8(const char *strc)
495{
496 size_t len_bytes;
497 return BLI_strlen_utf8_ex(strc, &len_bytes);
498}
499
500size_t BLI_strnlen_utf8_ex(const char *strc, const size_t strc_maxlen, size_t *r_len_bytes)
501{
502 size_t len = 0;
503 const char *strc_orig = strc;
504 const char *strc_end = strc + strc_maxlen;
505
506 while (*strc) {
507 int step = BLI_str_utf8_size_safe(strc);
508 if (strc + step > strc_end) {
509 break;
510 }
511
512 /* Detect null bytes within multi-byte sequences.
513 * This matches the behavior of #BLI_strncpy_utf8 for incomplete byte sequences. */
514 for (int i = 1; i < step; i++) {
515 if (UNLIKELY(strc[i] == '\0')) {
516 step = i;
517 break;
518 }
519 }
520 strc += step;
521 len++;
522 }
523
524 *r_len_bytes = size_t(strc - strc_orig);
525 return len;
526}
527
528size_t BLI_strnlen_utf8(const char *strc, const size_t strc_maxlen)
529{
530 size_t len_bytes;
531 return BLI_strnlen_utf8_ex(strc, strc_maxlen, &len_bytes);
532}
533
534size_t BLI_strncpy_wchar_from_utf8(wchar_t *__restrict dst_w,
535 const char *__restrict src_c,
536 const size_t dst_w_maxncpy)
537{
538#ifdef WIN32
539 BLI_string_debug_size(dst_w, dst_w_maxncpy);
540 conv_utf_8_to_16(src_c, dst_w, dst_w_maxncpy);
541 /* NOTE: it would be more efficient to calculate the length as part of #conv_utf_8_to_16. */
542 return wcslen(dst_w);
543#else
544 return BLI_str_utf8_as_utf32((char32_t *)dst_w, src_c, dst_w_maxncpy);
545#endif
546}
547
548/* End wchar_t / UTF8 functions. */
549/* -------------------------------------------------------------------- */
550
551size_t BLI_vsnprintf_utf8(char *__restrict dst,
552 size_t dst_maxncpy,
553 const char *__restrict format,
554 va_list arg)
555{
556 /* NOTE: a clone of #BLI_vsnprintf that trims the end. */
557 BLI_string_debug_size(dst, dst_maxncpy);
558
559 BLI_assert(dst != nullptr);
560 BLI_assert(dst_maxncpy > 0);
561 BLI_assert(format != nullptr);
562
563 const size_t n = size_t(vsnprintf(dst, dst_maxncpy, format, arg));
564 if (n < dst_maxncpy) {
565 dst[n] = '\0';
566 }
567 else {
568 str_utf8_truncate_at_size_unchecked(dst, dst_maxncpy);
569 }
570
571 return n;
572}
573
574size_t BLI_vsnprintf_utf8_rlen(char *__restrict dst,
575 size_t dst_maxncpy,
576 const char *__restrict format,
577 va_list arg)
578{
579 BLI_string_debug_size(dst, dst_maxncpy);
580
581 BLI_assert(dst != nullptr);
582 BLI_assert(dst_maxncpy > 0);
583 BLI_assert(format != nullptr);
584
585 size_t n = size_t(vsnprintf(dst, dst_maxncpy, format, arg));
586 if (n < dst_maxncpy) {
587 dst[n] = '\0';
588 }
589 else {
590 n = str_utf8_truncate_at_size_unchecked(dst, dst_maxncpy);
591 }
592 return n;
593}
594
595size_t BLI_snprintf_utf8(char *__restrict dst,
596 size_t dst_maxncpy,
597 const char *__restrict format,
598 ...)
599{
600 BLI_string_debug_size(dst, dst_maxncpy);
601
602 va_list arg;
603 va_start(arg, format);
604 const size_t n = BLI_vsnprintf_utf8(dst, dst_maxncpy, format, arg);
605 va_end(arg);
606
607 return n;
608}
609
610size_t BLI_snprintf_utf8_rlen(char *__restrict dst,
611 size_t dst_maxncpy,
612 const char *__restrict format,
613 ...)
614{
615 BLI_string_debug_size(dst, dst_maxncpy);
616
617 va_list arg;
618 va_start(arg, format);
619 const size_t n = BLI_vsnprintf_utf8_rlen(dst, dst_maxncpy, format, arg);
620 va_end(arg);
621
622 return n;
623}
624
625int BLI_wcwidth_or_error(char32_t ucs)
626{
627 /* Treat private use areas (icon fonts), symbols, and emoticons as double-width. */
628 if (ucs >= 0xf0000 || (ucs >= 0xe000 && ucs < 0xf8ff) || (ucs >= 0x1f300 && ucs < 0x1fbff)) {
629 return 2;
630 }
631 return mk_wcwidth(ucs);
632}
633
634int BLI_wcwidth_safe(char32_t ucs)
635{
636 const int columns = BLI_wcwidth_or_error(ucs);
637 if (columns >= 0) {
638 return columns;
639 }
640 return 1;
641}
642
643int BLI_wcswidth_or_error(const char32_t *pwcs, size_t n)
644{
645 return mk_wcswidth(pwcs, n);
646}
647
649{
651 if (unicode == BLI_UTF8_ERR) {
652 return -1;
653 }
654
655 return BLI_wcwidth_or_error(char32_t(unicode));
656}
657
659{
661 if (unicode == BLI_UTF8_ERR) {
662 return 1;
663 }
664
665 return BLI_wcwidth_safe(char32_t(unicode));
666}
667
668/* -------------------------------------------------------------------- */
677
678char32_t BLI_str_utf32_char_to_upper(const char32_t wc)
679{
680 if (wc < U'\xFF') { /* Latin. */
681 if ((wc <= U'z' && wc >= U'a') || (wc <= U'\xF6' && wc >= U'\xE0') ||
682 /* Correct but the first case is know, only check the second */
683 // (wc <= U'\xFE' && wc >= U'\xF8')
684 (wc >= U'\xF8'))
685 {
686 return wc - 32;
687 }
688 return wc;
689 }
690
691 if ((wc <= U'\x137' && wc >= U'\x101') || (wc <= U'\x1E95' && wc >= U'\x1E01')) {
692 /* Latin Extended. */
693 return (wc & 1) ? wc - 1 : wc;
694 }
695 if ((wc <= U'\x586' && wc >= U'\x561') || (wc <= U'\x10F5' && wc >= U'\x10D0')) {
696 /* Armenian and Georgian */
697 return wc - 48;
698 }
699 if (wc <= U'\x24E9' && wc >= U'\x24D0') { /* Enclosed Numerals. */
700 return wc - 26;
701 }
702 if (wc <= U'\xFF5A' && wc >= U'\xFF41') { /* Full-width Forms. */
703 return wc - 32;
704 }
705
706 /* There are only three remaining ranges that contain capitalization. */
707 if (!(wc <= U'\x0292' && wc >= U'\x00FF') && !(wc <= U'\x04F9' && wc >= U'\x03AC') &&
708 !(wc <= U'\x1FE1' && wc >= U'\x1E01'))
709 {
710 return wc;
711 }
712
713 static const char32_t from[] =
714 U"\x00FF\x013A\x013C\x013E\x0140\x0142\x0144\x0146\x0148\x014B\x014D\x014F\x0151\x0153\x0155"
715 U"\x0157\x0159\x015B\x015D\x015F\x0161\x0163\x0165\x0167\x0169\x016B\x016D\x016F\x0171\x0173"
716 U"\x0175\x0177\x017A\x017C\x017E\x0183\x0185\x0188\x018C\x0192\x0199\x01A1\x01A3\x01A5\x01A8"
717 U"\x01AD\x01B0\x01B4\x01B6\x01B9\x01BD\x01C6\x01C9\x01CC\x01CE\x01D0\x01D2\x01D4\x01D6\x01D8"
718 U"\x01DA\x01DC\x01DF\x01E1\x01E3\x01E5\x01E7\x01E9\x01EB\x01ED\x01EF\x01F3\x01F5\x01FB\x01FD"
719 U"\x01FF\x0201\x0203\x0205\x0207\x0209\x020B\x020D\x020F\x0211\x0213\x0215\x0217\x0253\x0254"
720 U"\x0257\x0258\x0259\x025B\x0260\x0263\x0268\x0269\x026F\x0272\x0275\x0283\x0288\x028A\x028B"
721 U"\x0292\x03AC\x03AD\x03AE\x03AF\x03B1\x03B2\x03B3\x03B4\x03B5\x03B6\x03B7\x03B8\x03B9\x03BA"
722 U"\x03BB\x03BC\x03BD\x03BE\x03BF\x03C0\x03C1\x03C3\x03C4\x03C5\x03C6\x03C7\x03C8\x03C9\x03CA"
723 U"\x03CB\x03CC\x03CD\x03CE\x03E3\x03E5\x03E7\x03E9\x03EB\x03ED\x03EF\x0430\x0431\x0432\x0433"
724 U"\x0434\x0435\x0436\x0437\x0438\x0439\x043A\x043B\x043C\x043D\x043E\x043F\x0440\x0441\x0442"
725 U"\x0443\x0444\x0445\x0446\x0447\x0448\x0449\x044A\x044B\x044C\x044D\x044E\x044F\x0451\x0452"
726 U"\x0453\x0454\x0455\x0456\x0457\x0458\x0459\x045A\x045B\x045C\x045E\x045F\x0461\x0463\x0465"
727 U"\x0467\x0469\x046B\x046D\x046F\x0471\x0473\x0475\x0477\x0479\x047B\x047D\x047F\x0481\x0491"
728 U"\x0493\x0495\x0497\x0499\x049B\x049D\x049F\x04A1\x04A3\x04A5\x04A7\x04A9\x04AB\x04AD\x04AF"
729 U"\x04B1\x04B3\x04B5\x04B7\x04B9\x04BB\x04BD\x04BF\x04C2\x04C4\x04C8\x04CC\x04D1\x04D3\x04D5"
730 U"\x04D7\x04D9\x04DB\x04DD\x04DF\x04E1\x04E3\x04E5\x04E7\x04E9\x04EB\x04EF\x04F1\x04F3\x04F5"
731 U"\x04F9\x1EA1\x1EA3\x1EA5\x1EA7\x1EA9\x1EAB\x1EAD\x1EAF\x1EB1\x1EB3\x1EB5\x1EB7\x1EB9\x1EBB"
732 U"\x1EBD\x1EBF\x1EC1\x1EC3\x1EC5\x1EC7\x1EC9\x1ECB\x1ECD\x1ECF\x1ED1\x1ED3\x1ED5\x1ED7\x1ED9"
733 U"\x1EDB\x1EDD\x1EDF\x1EE1\x1EE3\x1EE5\x1EE7\x1EE9\x1EEB\x1EED\x1EEF\x1EF1\x1EF3\x1EF5\x1EF7"
734 U"\x1EF9\x1F00\x1F01\x1F02\x1F03\x1F04\x1F05\x1F06\x1F07\x1F10\x1F11\x1F12\x1F13\x1F14\x1F15"
735 U"\x1F20\x1F21\x1F22\x1F23\x1F24\x1F25\x1F26\x1F27\x1F30\x1F31\x1F32\x1F33\x1F34\x1F35\x1F36"
736 U"\x1F37\x1F40\x1F41\x1F42\x1F43\x1F44\x1F45\x1F51\x1F53\x1F55\x1F57\x1F60\x1F61\x1F62\x1F63"
737 U"\x1F64\x1F65\x1F66\x1F67\x1F80\x1F81\x1F82\x1F83\x1F84\x1F85\x1F86\x1F87\x1F90\x1F91\x1F92"
738 U"\x1F93\x1F94\x1F95\x1F96\x1F97\x1FA0\x1FA1\x1FA2\x1FA3\x1FA4\x1FA5\x1FA6\x1FA7\x1FB0\x1FB1"
739 U"\x1FD0\x1FD1\x1FE0\x1FE1";
740 static const char32_t to[] =
741 U"\x0178\x0139\x013B\x013D\x013F\x0141\x0143\x0145\x0147\x014A\x014C\x014E\x0150\x0152\x0154"
742 U"\x0156\x0158\x015A\x015C\x015E\x0160\x0162\x0164\x0166\x0168\x016A\x016C\x016E\x0170\x0172"
743 U"\x0174\x0176\x0179\x017B\x017D\x0182\x0184\x0187\x018B\x0191\x0198\x01A0\x01A2\x01A4\x01A7"
744 U"\x01AC\x01AF\x01B3\x01B5\x01B8\x01BC\x01C4\x01C7\x01CA\x01CD\x01CF\x01D1\x01D3\x01D5\x01D7"
745 U"\x01D9\x01DB\x01DE\x01E0\x01E2\x01E4\x01E6\x01E8\x01EA\x01EC\x01EE\x01F1\x01F4\x01FA\x01FC"
746 U"\x01FE\x0200\x0202\x0204\x0206\x0208\x020A\x020C\x020E\x0210\x0212\x0214\x0216\x0181\x0186"
747 U"\x018A\x018E\x018F\x0190\x0193\x0194\x0197\x0196\x019C\x019D\x019F\x01A9\x01AE\x01B1\x01B2"
748 U"\x01B7\x0386\x0388\x0389\x038A\x0391\x0392\x0393\x0394\x0395\x0396\x0397\x0398\x0399\x039A"
749 U"\x039B\x039C\x039D\x039E\x039F\x03A0\x03A1\x03A3\x03A4\x03A5\x03A6\x03A7\x03A8\x03A9\x03AA"
750 U"\x03AB\x038C\x038E\x038F\x03E2\x03E4\x03E6\x03E8\x03EA\x03EC\x03EE\x0410\x0411\x0412\x0413"
751 U"\x0414\x0415\x0416\x0417\x0418\x0419\x041A\x041B\x041C\x041D\x041E\x041F\x0420\x0421\x0422"
752 U"\x0423\x0424\x0425\x0426\x0427\x0428\x0429\x042A\x042B\x042C\x042D\x042E\x042F\x0401\x0402"
753 U"\x0403\x0404\x0405\x0406\x0407\x0408\x0409\x040A\x040B\x040C\x040E\x040F\x0460\x0462\x0464"
754 U"\x0466\x0468\x046A\x046C\x046E\x0470\x0472\x0474\x0476\x0478\x047A\x047C\x047E\x0480\x0490"
755 U"\x0492\x0494\x0496\x0498\x049A\x049C\x049E\x04A0\x04A2\x04A4\x04A6\x04A8\x04AA\x04AC\x04AE"
756 U"\x04B0\x04B2\x04B4\x04B6\x04B8\x04BA\x04BC\x04BE\x04C1\x04C3\x04C7\x04CB\x04D0\x04D2\x04D4"
757 U"\x04D6\x04D8\x04DA\x04DC\x04DE\x04E0\x04E2\x04E4\x04E6\x04E8\x04EA\x04EE\x04F0\x04F2\x04F4"
758 U"\x04F8\x1EA0\x1EA2\x1EA4\x1EA6\x1EA8\x1EAA\x1EAC\x1EAE\x1EB0\x1EB2\x1EB4\x1EB6\x1EB8\x1EBA"
759 U"\x1EBC\x1EBE\x1EC0\x1EC2\x1EC4\x1EC6\x1EC8\x1ECA\x1ECC\x1ECE\x1ED0\x1ED2\x1ED4\x1ED6\x1ED8"
760 U"\x1EDA\x1EDC\x1EDE\x1EE0\x1EE2\x1EE4\x1EE6\x1EE8\x1EEA\x1EEC\x1EEE\x1EF0\x1EF2\x1EF4\x1EF6"
761 U"\x1EF8\x1F08\x1F09\x1F0A\x1F0B\x1F0C\x1F0D\x1F0E\x1F0F\x1F18\x1F19\x1F1A\x1F1B\x1F1C\x1F1D"
762 U"\x1F28\x1F29\x1F2A\x1F2B\x1F2C\x1F2D\x1F2E\x1F2F\x1F38\x1F39\x1F3A\x1F3B\x1F3C\x1F3D\x1F3E"
763 U"\x1F3F\x1F48\x1F49\x1F4A\x1F4B\x1F4C\x1F4D\x1F59\x1F5B\x1F5D\x1F5F\x1F68\x1F69\x1F6A\x1F6B"
764 U"\x1F6C\x1F6D\x1F6E\x1F6F\x1F88\x1F89\x1F8A\x1F8B\x1F8C\x1F8D\x1F8E\x1F8F\x1F98\x1F99\x1F9A"
765 U"\x1F9B\x1F9C\x1F9D\x1F9E\x1F9F\x1FA8\x1FA9\x1FAA\x1FAB\x1FAC\x1FAD\x1FAE\x1FAF\x1FB8\x1FB9"
766 U"\x1FD8\x1FD9\x1FE8\x1FE9";
767
768 if (wc >= from[0] && wc <= from[ARRAY_SIZE(from) - 2]) {
769 /* Binary search since these are sorted. */
770 size_t min = 0;
771 size_t max = ARRAY_SIZE(from) - 2;
772 while (max >= min) {
773 const size_t mid = (min + max) / 2;
774 if (wc > from[mid]) {
775 min = mid + 1;
776 }
777 else if (wc < from[mid]) {
778 max = mid - 1;
779 }
780 else {
781 return to[mid];
782 }
783 }
784 }
785
786 return wc;
787}
788
789char32_t BLI_str_utf32_char_to_lower(const char32_t wc)
790{
791 if (wc < U'\xD8') { /* Latin. */
792 if ((wc <= U'Z' && wc >= U'A') || (wc <= U'\xD6' && wc >= U'\xC0')) {
793 return wc + 32;
794 }
795 return wc;
796 }
797 if ((wc <= U'\x136' && wc >= U'\x100') || (wc <= U'\x1E94' && wc >= U'\x1E00')) {
798 /* Latin Extended. */
799 return (wc % 2 == 0) ? wc + 1 : wc;
800 }
801 if ((wc <= U'\x556' && wc >= U'\x531') || (wc <= U'\x10C5' && wc >= U'\x10A0')) {
802 /* Armenian and Georgian. */
803 return wc + 48;
804 }
805 if (wc <= U'\x24CF' && wc >= U'\x24B6') { /* Enclosed Numerals. */
806 return wc + 26;
807 }
808 if (wc <= U'\xFF3A' && wc >= U'\xFF21') { /* Full-width Forms. */
809 return wc + 32;
810 }
811
812 /* There are only three remaining ranges that contain capitalization. */
813 if (!(wc <= U'\x0216' && wc >= U'\x00D8') && !(wc <= U'\x04F8' && wc >= U'\x0386') &&
814 !(wc <= U'\x1FE9' && wc >= U'\x1E00'))
815 {
816 return wc;
817 }
818
819 static const char32_t from[] =
820 U"\x00D8\x00D9\x00DA\x00DB\x00DC\x00DD\x00DE\x0139\x013B\x013D\x013F\x0141\x0143\x0145\x0147"
821 U"\x014A\x014C\x014E\x0150\x0152\x0154\x0156\x0158\x015A\x015C\x015E\x0160\x0162\x0164\x0166"
822 U"\x0168\x016A\x016C\x016E\x0170\x0172\x0174\x0176\x0178\x0179\x017B\x017D\x0181\x0182\x0184"
823 U"\x0186\x0187\x018A\x018B\x018E\x018F\x0190\x0191\x0193\x0194\x0196\x0197\x0198\x019C\x019D"
824 U"\x019F\x01A0\x01A2\x01A4\x01A7\x01A9\x01AC\x01AE\x01AF\x01B1\x01B2\x01B3\x01B5\x01B7\x01B8"
825 U"\x01BC\x01C4\x01C5\x01C7\x01C8\x01CA\x01CB\x01CD\x01CF\x01D1\x01D3\x01D5\x01D7\x01D9\x01DB"
826 U"\x01DE\x01E0\x01E2\x01E4\x01E6\x01E8\x01EA\x01EC\x01EE\x01F1\x01F4\x01FA\x01FC\x01FE\x0200"
827 U"\x0202\x0204\x0206\x0208\x020A\x020C\x020E\x0210\x0212\x0214\x0216\x0386\x0388\x0389\x038A"
828 U"\x038C\x038E\x038F\x0391\x0392\x0393\x0394\x0395\x0396\x0397\x0398\x0399\x039A\x039B\x039C"
829 U"\x039D\x039E\x039F\x03A0\x03A1\x03A3\x03A4\x03A5\x03A6\x03A7\x03A8\x03A9\x03AA\x03AB\x03E2"
830 U"\x03E4\x03E6\x03E8\x03EA\x03EC\x03EE\x0401\x0402\x0403\x0404\x0405\x0406\x0407\x0408\x0409"
831 U"\x040A\x040B\x040C\x040E\x040F\x0410\x0411\x0412\x0413\x0414\x0415\x0416\x0417\x0418\x0419"
832 U"\x041A\x041B\x041C\x041D\x041E\x041F\x0420\x0421\x0422\x0423\x0424\x0425\x0426\x0427\x0428"
833 U"\x0429\x042A\x042B\x042C\x042D\x042E\x042F\x0460\x0462\x0464\x0466\x0468\x046A\x046C\x046E"
834 U"\x0470\x0472\x0474\x0476\x0478\x047A\x047C\x047E\x0480\x0490\x0492\x0494\x0496\x0498\x049A"
835 U"\x049C\x049E\x04A0\x04A2\x04A4\x04A6\x04A8\x04AA\x04AC\x04AE\x04B0\x04B2\x04B4\x04B6\x04B8"
836 U"\x04BA\x04BC\x04BE\x04C1\x04C3\x04C7\x04CB\x04D0\x04D2\x04D4\x04D6\x04D8\x04DA\x04DC\x04DE"
837 U"\x04E0\x04E2\x04E4\x04E6\x04E8\x04EA\x04EE\x04F0\x04F2\x04F4\x04F8\x1EA0\x1EA2\x1EA4\x1EA6"
838 U"\x1EA8\x1EAA\x1EAC\x1EAE\x1EB0\x1EB2\x1EB4\x1EB6\x1EB8\x1EBA\x1EBC\x1EBE\x1EC0\x1EC2\x1EC4"
839 U"\x1EC6\x1EC8\x1ECA\x1ECC\x1ECE\x1ED0\x1ED2\x1ED4\x1ED6\x1ED8\x1EDA\x1EDC\x1EDE\x1EE0\x1EE2"
840 U"\x1EE4\x1EE6\x1EE8\x1EEA\x1EEC\x1EEE\x1EF0\x1EF2\x1EF4\x1EF6\x1EF8\x1F08\x1F09\x1F0A\x1F0B"
841 U"\x1F0C\x1F0D\x1F0E\x1F0F\x1F18\x1F19\x1F1A\x1F1B\x1F1C\x1F1D\x1F28\x1F29\x1F2A\x1F2B\x1F2C"
842 U"\x1F2D\x1F2E\x1F2F\x1F38\x1F39\x1F3A\x1F3B\x1F3C\x1F3D\x1F3E\x1F3F\x1F48\x1F49\x1F4A\x1F4B"
843 U"\x1F4C\x1F4D\x1F59\x1F5B\x1F5D\x1F5F\x1F68\x1F69\x1F6A\x1F6B\x1F6C\x1F6D\x1F6E\x1F6F\x1F88"
844 U"\x1F89\x1F8A\x1F8B\x1F8C\x1F8D\x1F8E\x1F8F\x1F98\x1F99\x1F9A\x1F9B\x1F9C\x1F9D\x1F9E\x1F9F"
845 U"\x1FA8\x1FA9\x1FAA\x1FAB\x1FAC\x1FAD\x1FAE\x1FAF\x1FB8\x1FB9\x1FD8\x1FD9\x1FE8\x1FE9";
846 static const char32_t to[] =
847 U"\x00F8\x00F9\x00FA\x00FB\x00FC\x00FD\x00FE\x013A\x013C\x013E\x0140\x0142\x0144\x0146\x0148"
848 U"\x014B\x014D\x014F\x0151\x0153\x0155\x0157\x0159\x015B\x015D\x015F\x0161\x0163\x0165\x0167"
849 U"\x0169\x016B\x016D\x016F\x0171\x0173\x0175\x0177\x00FF\x017A\x017C\x017E\x0253\x0183\x0185"
850 U"\x0254\x0188\x0257\x018C\x0258\x0259\x025B\x0192\x0260\x0263\x0269\x0268\x0199\x026f\x0272"
851 U"\x0275\x01A1\x01A3\x01A5\x01A8\x0283\x01AD\x0288\x01B0\x028A\x028B\x01B4\x01B6\x0292\x01B9"
852 U"\x01BD\x01C6\x01C6\x01C9\x01C9\x01CC\x01CC\x01CE\x01D0\x01D2\x01D4\x01D6\x01D8\x01DA\x01DC"
853 U"\x01DF\x01E1\x01E3\x01E5\x01E7\x01E9\x01EB\x01ED\x01EF\x01F3\x01F5\x01FB\x01FD\x01FF\x0201"
854 U"\x0203\x0205\x0207\x0209\x020B\x020D\x020F\x0211\x0213\x0215\x0217\x03AC\x03AD\x03AE\x03AF"
855 U"\x03CC\x03CD\x03CE\x03B1\x03B2\x03B3\x03B4\x03B5\x03B6\x03B7\x03B8\x03B9\x03BA\x03BB\x03BC"
856 U"\x03BD\x03BE\x03BF\x03C0\x03C1\x03C3\x03C4\x03C5\x03C6\x03C7\x03C8\x03C9\x03CA\x03CB\x03E3"
857 U"\x03E5\x03E7\x03E9\x03EB\x03ED\x03EF\x0451\x0452\x0453\x0454\x0455\x0456\x0457\x0458\x0459"
858 U"\x045A\x045B\x045C\x045E\x045F\x0430\x0431\x0432\x0433\x0434\x0435\x0436\x0437\x0438\x0439"
859 U"\x043A\x043B\x043C\x043D\x043E\x043F\x0440\x0441\x0442\x0443\x0444\x0445\x0446\x0447\x0448"
860 U"\x0449\x044A\x044B\x044C\x044D\x044E\x044F\x0461\x0463\x0465\x0467\x0469\x046B\x046D\x046F"
861 U"\x0471\x0473\x0475\x0477\x0479\x047B\x047D\x047F\x0481\x0491\x0493\x0495\x0497\x0499\x049B"
862 U"\x049D\x049F\x04A1\x04A3\x04A5\x04A7\x04A9\x04AB\x04AD\x04AF\x04B1\x04B3\x04B5\x04B7\x04B9"
863 U"\x04BB\x04BD\x04BF\x04C2\x04C4\x04C8\x04CC\x04D1\x04D3\x04D5\x04D7\x04D9\x04DB\x04DD\x04DF"
864 U"\x04E1\x04E3\x04E5\x04E7\x04E9\x04EB\x04EF\x04F1\x04F3\x04F5\x04F9\x1EA1\x1EA3\x1EA5\x1EA7"
865 U"\x1EA9\x1EAB\x1EAD\x1EAF\x1EB1\x1EB3\x1EB5\x1EB7\x1EB9\x1EBB\x1EBD\x1EBF\x1EC1\x1EC3\x1EC5"
866 U"\x1EC7\x1EC9\x1ECB\x1ECD\x1ECF\x1ED1\x1ED3\x1ED5\x1ED7\x1ED9\x1EDB\x1EDD\x1EDF\x1EE1\x1EE3"
867 U"\x1EE5\x1EE7\x1EE9\x1EEB\x1EED\x1EEF\x1EF1\x1EF3\x1EF5\x1EF7\x1EF9\x1F00\x1F01\x1F02\x1F03"
868 U"\x1F04\x1F05\x1F06\x1F07\x1F10\x1F11\x1F12\x1F13\x1F14\x1F15\x1F20\x1F21\x1F22\x1F23\x1F24"
869 U"\x1F25\x1F26\x1F27\x1F30\x1F31\x1F32\x1F33\x1F34\x1F35\x1F36\x1F37\x1F40\x1F41\x1F42\x1F43"
870 U"\x1F44\x1F45\x1F51\x1F53\x1F55\x1F57\x1F60\x1F61\x1F62\x1F63\x1F64\x1F65\x1F66\x1F67\x1F80"
871 U"\x1F81\x1F82\x1F83\x1F84\x1F85\x1F86\x1F87\x1F90\x1F91\x1F92\x1F93\x1F94\x1F95\x1F96\x1F97"
872 U"\x1FA0\x1FA1\x1FA2\x1FA3\x1FA4\x1FA5\x1FA6\x1FA7\x1FB0\x1FB1\x1FD0\x1FD1\x1FE0\x1FE1";
873
874 if (wc >= from[0] && wc <= from[ARRAY_SIZE(from) - 2]) {
875 /* Binary search since these are sorted. */
876 size_t min = 0;
877 size_t max = ARRAY_SIZE(from) - 2;
878 while (max >= min) {
879 const size_t mid = (min + max) / 2;
880 if (wc > from[mid]) {
881 min = mid + 1;
882 }
883 else if (wc < from[mid]) {
884 max = mid - 1;
885 }
886 else {
887 return to[mid];
888 }
889 }
890 }
891
892 return wc;
893}
894
895/* -------------------------------------------------------------------- */
901
903{
904 /* Invisible (and so can be removed at end of wrapped line) spacing characters
905 * according to the Unicode Line Breaking Algorithm (Standard Annex #14). Note
906 * to always ignore U+200B (zero-width space) and U+2060 (word joiner). */
907 return ELEM(codepoint,
908 ' ', /* Space. */
909 0x1680, /* Ogham space mark. */
910 0x2000, /* En quad. */
911 0x2001, /* Em quad. */
912 0x2002, /* En space. */
913 0x2003, /* Em space. */
914 0x2004, /* Three-per-em space. */
915 0x2005, /* Four-per-em space. */
916 0x2006, /* Six-per-em space. */
917 0x2008, /* Punctuation space. */
918 0x2009, /* Thin space. */
919 0x200A, /* Hair space. */
920 0x205F, /* Medium mathematical space. */
921 0x3000); /* Ideographic space. */
922}
923
924bool BLI_str_utf32_char_is_optional_break_after(char32_t codepoint, char32_t codepoint_prev)
925{
926 /* Subset of the characters that are line breaking opportunities
927 * according to the Unicode Line Breaking Algorithm (Standard Annex #14).
928 * Can be expanded but please no rules that differ by language. */
929
930 /* Punctuation. Backslash can be used as path separator */
931 if (ELEM(codepoint, '\\', '_')) {
932 return true;
933 }
934
935 /* Do not break on solidus if previous is a number. */
936 if (codepoint == '/' && !(codepoint_prev >= '0' && codepoint_prev <= '9')) {
937 return true;
938 }
939
940 /* Do not break on dash, hyphen, em dash if previous is space */
941 if (ELEM(codepoint, '-', 0x2010, 0x2014) &&
943 {
944 return true;
945 }
946
947 if ((codepoint >= 0x2E80 && codepoint <= 0x2FFF) || /* CJK, Kangxi Radicals. */
948 (codepoint >= 0x3040 && codepoint <= 0x309F) || /* Hiragana (except small characters). */
949 (codepoint >= 0x30A2 && codepoint <= 0x30FA) || /* Katakana (except small characters). */
950 (codepoint >= 0x3400 && codepoint <= 0x4DBF) || /* CJK Unified Ideographs Extension A. */
951 (codepoint >= 0x4E00 && codepoint <= 0x9FFF) || /* CJK Unified Ideographs. */
952 (codepoint >= 0x3040 && codepoint <= 0x309F) || /* CJK Unified Ideographs. */
953 (codepoint >= 0x3130 && codepoint <= 0x318F)) /* Hangul Compatibility Jamo. */
954 {
955 return true;
956 }
957
958 if (ELEM(codepoint, 0x0F0D, 0x0F0B)) {
959 return true; /* Tibetan shad mark and intersyllabic tsheg. */
960 }
961
962 return false;
963}
964
965bool BLI_str_utf32_char_is_optional_break_before(char32_t codepoint, char32_t codepoint_prev)
966{
967 /* Do not break on any of these if a space follows. */
969 return false;
970 }
971
972 /* Infix Numeric Separators. Allow break on these if not numbers afterward. */
973 if (ELEM(codepoint_prev,
974 ',', /* Comma. */
975 ':', /* Colon. */
976 ';', /* Semicolon. */
977 0x037E, /* Greek question mark. */
978 0x0589, /* Armenian full stop. */
979 0x060C, /* Arabic comma. */
980 0x060D, /* Arabic date separator. */
981 0x07F8, /* N'Ko comma. */
982 0x2044) /* Fraction slash. */
983 && !(codepoint >= '0' && codepoint <= '9'))
984 {
985 return true;
986 }
987
988 /* Break on full stop only if not followed by another, or by a number. */
989 if (codepoint_prev == '.' && codepoint != '.' && !(codepoint >= '0' && codepoint <= '9')) {
990 return true;
991 }
992
993 /* Close punctuation. */
994 if (ELEM(codepoint_prev,
995 0x3001, /* Ideographic comma. */
996 0x3002, /* Ideographic full stop. */
997 0xFE10, /* Presentation form for vertical ideographic comma. */
998 0xFE11, /* Presentation form for vertical ideographic full stop. */
999 0xFE12, /* Presentation form for vertical ideographic colon. */
1000 0xFE50, /* Small comma. */
1001 0xFE52, /* Small full stop. */
1002 0xFF0C, /* Full-width comma. */
1003 0xFF0E, /* Full-width full stop. */
1004 0XFF61, /* Half-width ideographic full stop. */
1005 0Xff64)) /* Half-width ideographic comma. */
1006 {
1007 return true;
1008 }
1009
1010 /* Exclamation/Interrogation. */
1011 if (ELEM(codepoint_prev,
1012 '!', /* Exclamation mark. */
1013 '?', /* Question mark. */
1014 0x05C6, /* Hebrew punctuation `maqaf`. */
1015 0x061B, /* Arabic semicolon. */
1016 0x061E, /* Arabic triple dot. */
1017 0x061F, /* Arabic question mark. */
1018 0x06D4, /* Arabic full stop. */
1019 0x07F9, /* N'Ko question mark. */
1020 0x0F0D, /* Tibetan shad mark. */
1021 0xFF01, /* Full-width exclamation mark. */
1022 0xff1f)) /* full-width question mark. */
1023 {
1024 return true;
1025 }
1026
1027 return false;
1028}
1029 /* -------------------------------------------------------------------- */
1031
1033{
1035}
1036
1037int BLI_str_utf8_size_safe(const char *p)
1038{
1039 return utf8_char_compute_skip(*p);
1040}
1041
1043{
1044 /* Originally `g_utf8_get_char` in GLIB. */
1045
1046 const uchar c = uchar(*p);
1047
1048 char mask = 0;
1050 if (UNLIKELY(len == -1)) {
1051 return BLI_UTF8_ERR;
1052 }
1054}
1055
1057{
1059 if (UNLIKELY(result == BLI_UTF8_ERR)) {
1060 return *p;
1061 }
1062 return result;
1063}
1064
1066 const size_t p_len,
1067 size_t *__restrict index)
1068{
1069 const uchar c = uchar(*(p += *index));
1070
1071 BLI_assert(*index < p_len);
1072 BLI_assert(c != '\0');
1073
1074 char mask = 0;
1076 if (UNLIKELY(len == -1) || (*index + size_t(len) > p_len)) {
1077 return BLI_UTF8_ERR;
1078 }
1079
1081 if (UNLIKELY(result == BLI_UTF8_ERR)) {
1082 return BLI_UTF8_ERR;
1083 }
1084 *index += size_t(len);
1085 BLI_assert(*index <= p_len);
1086 return result;
1087}
1088
1090 const size_t p_len,
1091 size_t *__restrict index)
1092{
1094 if (UNLIKELY(result == BLI_UTF8_ERR)) {
1095 result = uint(p[*index]);
1096 *index += 1;
1097 }
1098 BLI_assert(*index <= p_len);
1099 return result;
1100}
1101
1102/* was g_unichar_to_utf8 */
1103
/**
 * Select the UTF-8 lead-byte prefix and the encoded byte count for a code-point.
 *
 * \param Char: Code-point value. May be any integer expression,
 * note that it is evaluated more than once.
 * \param First: Lvalue receiving the lead-byte prefix bits (0 for a single byte).
 * \param Len: Lvalue receiving the encoded length in bytes (1 to 6).
 *
 * Every argument is parenthesized so operators inside an argument keep their meaning,
 * and the body is wrapped in `do {} while (0)` so the macro acts as one statement
 * (safe inside an unbraced `if` / `else`).
 */
#define UTF8_VARS_FROM_CHAR32(Char, First, Len) \
  do { \
    if ((Char) < 0x80) { \
      (First) = 0; \
      (Len) = 1; \
    } \
    else if ((Char) < 0x800) { \
      (First) = 0xc0; \
      (Len) = 2; \
    } \
    else if ((Char) < 0x10000) { \
      (First) = 0xe0; \
      (Len) = 3; \
    } \
    else if ((Char) < 0x200000) { \
      (First) = 0xf0; \
      (Len) = 4; \
    } \
    else if ((Char) < 0x4000000) { \
      (First) = 0xf8; \
      (Len) = 5; \
    } \
    else { \
      (First) = 0xfc; \
      (Len) = 6; \
    } \
  } while (0)
1130
1132{
1133 /* If this gets modified, also update the copy in g_string_insert_unichar() */
1134 uint len = 0;
1135 uint first;
1136
1137 UTF8_VARS_FROM_CHAR32(c, first, len);
1138 (void)first;
1139
1140 return len;
1141}
1142
1143size_t BLI_str_utf8_from_unicode(uint c, char *dst, const size_t dst_maxncpy)
1144
1145{
1146 BLI_string_debug_size(dst, dst_maxncpy);
1147
1148 /* If this gets modified, also update the copy in g_string_insert_unichar() */
1149 uint len = 0;
1150 uint first;
1151
1152 UTF8_VARS_FROM_CHAR32(c, first, len);
1153
1154 if (UNLIKELY(dst_maxncpy < len)) {
1155 /* Null terminate instead of writing a partial byte. */
1156 memset(dst, 0x0, dst_maxncpy);
1157 return dst_maxncpy;
1158 }
1159
1160 for (uint i = len - 1; i > 0; i--) {
1161 dst[i] = char((c & 0x3f) | 0x80);
1162 c >>= 6;
1163 }
1164 dst[0] = char(c | first);
1165
1166 return len;
1167}
1168
1169size_t BLI_str_utf8_as_utf32(char32_t *__restrict dst_w,
1170 const char *__restrict src_c,
1171 const size_t dst_w_maxncpy)
1172{
1173 BLI_assert(dst_w_maxncpy != 0);
1174 BLI_string_debug_size(dst_w, dst_w_maxncpy);
1175
1176 const size_t maxlen = dst_w_maxncpy - 1;
1177 size_t len = 0;
1178
1179 const size_t src_c_len = strlen(src_c);
1180 const char *src_c_end = src_c + src_c_len;
1181 size_t index = 0;
1182 while ((index < src_c_len) && (len != maxlen)) {
1183 const uint unicode = BLI_str_utf8_as_unicode_step_or_error(src_c, src_c_len, &index);
1184 if (unicode != BLI_UTF8_ERR) {
1185 *dst_w = unicode;
1186 }
1187 else {
1188 *dst_w = '?';
1189 const char *src_c_next = BLI_str_find_next_char_utf8(src_c + index, src_c_end);
1190 index = size_t(src_c_next - src_c);
1191 }
1192 dst_w++;
1193 len++;
1194 }
1195
1196 *dst_w = 0;
1197
1198 return len;
1199}
1200
1201size_t BLI_str_utf32_as_utf8(char *__restrict dst,
1202 const char32_t *__restrict src,
1203 const size_t dst_maxncpy)
1204{
1205 BLI_assert(dst_maxncpy != 0);
1206 BLI_string_debug_size(dst, dst_maxncpy);
1207
1208 size_t len = 0;
1209 while (*src && len < dst_maxncpy) {
1210 len += BLI_str_utf8_from_unicode(uint(*src++), dst + len, dst_maxncpy - len);
1211 }
1212 dst[len] = '\0';
1213 /* Return the correct length when part of the final byte did not fit into the string. */
1214 while ((len > 0) && UNLIKELY(dst[len - 1] == '\0')) {
1215 len--;
1216 }
1217 return len;
1218}
1219
1220size_t BLI_str_utf32_as_utf8_len_ex(const char32_t *src, const size_t src_maxlen)
1221{
1222 size_t len = 0;
1223 const char32_t *src_end = src + src_maxlen;
1224
1225 while ((src < src_end) && *src) {
1227 }
1228
1229 return len;
1230}
1231
1232size_t BLI_str_utf32_as_utf8_len(const char32_t *src)
1233{
1234 size_t len = 0;
1235
1236 while (*src) {
1238 }
1239
1240 return len;
1241}
1242
1243const char *BLI_str_find_prev_char_utf8(const char *p, const char *str_start)
1244{
1245 /* Originally `g_utf8_find_prev_char` in GLIB. */
1246
1247 BLI_assert(p >= str_start);
1248 if (str_start < p) {
1249 for (--p; p >= str_start; p--) {
1250 if ((*p & 0xc0) != 0x80) {
1251 return (char *)p;
1252 }
1253 }
1254 }
1255 return p;
1256}
1257
1258const char *BLI_str_find_next_char_utf8(const char *p, const char *str_end)
1259{
1260 /* Originally `g_utf8_find_next_char` in GLIB. */
1261
1262 BLI_assert(p <= str_end);
1263 if ((p < str_end) && (*p != '\0')) {
1264 for (++p; p < str_end && (*p & 0xc0) == 0x80; p++) {
1265 /* do nothing */
1266 }
1267 }
1268 return p;
1269}
1270
1271size_t BLI_str_partition_utf8(const char *str,
1272 const uint delim[],
1273 const char **r_sep,
1274 const char **r_suf)
1275{
1276 return BLI_str_partition_ex_utf8(str, nullptr, delim, r_sep, r_suf, false);
1277}
1278
1279size_t BLI_str_rpartition_utf8(const char *str,
1280 const uint delim[],
1281 const char **r_sep,
1282 const char **r_suf)
1283{
1284 return BLI_str_partition_ex_utf8(str, nullptr, delim, r_sep, r_suf, true);
1285}
1286
1288 const char *end,
1289 const uint delim[],
1290 const char **r_sep,
1291 const char **r_suf,
1292 const bool from_right)
1293{
1294 const size_t str_len = end ? size_t(end - str) : strlen(str);
1295 if (end == nullptr) {
1296 end = str + str_len;
1297 }
1298
1299 /* Note that here, we assume end points to a valid UTF8 char! */
1301
1302 char *suf = (char *)(str + str_len);
1303 size_t index = 0;
1304 for (char *sep = (char *)(from_right ? BLI_str_find_prev_char_utf8(end, str) : str);
1305 from_right ? (sep > str) : ((sep < end) && (*sep != '\0'));
1306 sep = (char *)(from_right ? (str != sep ? BLI_str_find_prev_char_utf8(sep, str) : nullptr) :
1307 str + index))
1308 {
1309 size_t index_ofs = 0;
1310 const uint c = BLI_str_utf8_as_unicode_step_or_error(sep, size_t(end - sep), &index_ofs);
1311 if (UNLIKELY(c == BLI_UTF8_ERR)) {
1312 break;
1313 }
1314 index += index_ofs;
1315
1316 for (const uint *d = delim; *d != '\0'; d++) {
1317 if (*d == c) {
1318 /* `suf` is already correct in case from_right is true. */
1319 *r_sep = sep;
1320 *r_suf = from_right ? suf : (char *)(str + index);
1321 return size_t(sep - str);
1322 }
1323 }
1324
1325 suf = sep; /* Useful in 'from_right' case! */
1326 }
1327
1328 *r_suf = *r_sep = nullptr;
1329 return str_len;
1330}
1331
1335static size_t str_utf8_truncate_at_size_unchecked(char *str, const size_t str_size)
1336{
1337 BLI_assert(str_size > 0);
1338 BLI_assert(!std::memchr(str, '\0', str_size - 1));
1339 size_t str_len_trim;
1340 BLI_strnlen_utf8_ex(str, str_size - 1, &str_len_trim);
1341 str[str_len_trim] = '\0';
1342 return str_len_trim;
1343}
1344
1345bool BLI_str_utf8_truncate_at_size(char *str, const size_t str_size)
1346{
1347 BLI_assert(str_size > 0);
1348 if (std::memchr(str, '\0', str_size)) {
1349 return false;
1350 }
1351
1353 return true;
1354}
1355
1356/* -------------------------------------------------------------------- */
1365
1366int BLI_str_utf8_offset_to_index(const char *str, const size_t str_len, const int offset_target)
1367{
1368 BLI_assert(offset_target >= 0);
1369 const size_t offset_target_as_size = size_t(offset_target);
1370 size_t offset = 0;
1371 int index = 0;
1372 /* Note that `offset != offset_target_as_size` works for valid UTF8 strings. */
1373 while ((offset < str_len) && (offset < offset_target_as_size)) {
1374 /* Use instead of #BLI_str_utf8_size_safe to match behavior when limiting the string length. */
1375 const uint code = BLI_str_utf8_as_unicode_step_safe(str, str_len, &offset);
1376 UNUSED_VARS(code);
1377 index++;
1378 BLI_assert(offset <= offset_target_as_size); /* See DOXY section comment. */
1379 }
1380 return index;
1381}
1382
1383int BLI_str_utf8_offset_from_index(const char *str, const size_t str_len, const int index_target)
1384{
1385 BLI_assert(index_target >= 0);
1386 size_t offset = 0;
1387 int index = 0;
1388 while ((offset < str_len) && (index < index_target)) {
1389 /* Use instead of #BLI_str_utf8_size_safe to match behavior when limiting the string length. */
1390 const uint code = BLI_str_utf8_as_unicode_step_safe(str, str_len, &offset);
1391 UNUSED_VARS(code);
1392 index++;
1393 }
1394 return int(offset);
1395}
1396
1397int BLI_str_utf8_offset_to_column(const char *str, const size_t str_len, const int offset_target)
1398{
1399 BLI_assert(offset_target >= 0);
1400 const size_t offset_target_clamp = std::min(size_t(offset_target), str_len);
1401 size_t offset = 0;
1402 int column = 0;
1403 while (offset < offset_target_clamp) {
1404 const uint code = BLI_str_utf8_as_unicode_step_safe(str, str_len, &offset);
1405 column += BLI_wcwidth_safe(code);
1406 BLI_assert(offset <= size_t(offset_target)); /* See DOXY section comment. */
1407 }
1408 return column;
1409}
1410
1411int BLI_str_utf8_offset_from_column(const char *str, const size_t str_len, const int column_target)
1412{
1413 size_t offset = 0, offset_next = 0;
1414 int column = 0;
1415 while ((offset < str_len) && (column < column_target)) {
1416 const uint code = BLI_str_utf8_as_unicode_step_safe(str, str_len, &offset_next);
1417 column += BLI_wcwidth_safe(code);
1418 if (column > column_target) {
1419 break;
1420 }
1421 offset = offset_next;
1422 }
1423 return int(offset);
1424}
1425
1427 const size_t str_len,
1428 const int offset_target,
1429 const int tab_width)
1430{
1431 BLI_assert(offset_target >= 0);
1432 const size_t offset_target_clamp = std::min(size_t(offset_target), str_len);
1433 size_t offset = 0;
1434 int column = 0;
1435 while (offset < offset_target_clamp) {
1436 const uint code = BLI_str_utf8_as_unicode_step_safe(str, str_len, &offset);
1437 /* The following line is the only change compared with #BLI_str_utf8_offset_to_column. */
1438 column += (code == '\t') ? (tab_width - (column % tab_width)) : BLI_wcwidth_safe(code);
1439 BLI_assert(offset <= size_t(offset_target)); /* See DOXY section comment. */
1440 }
1441 return column;
1442}
1443
1445 const size_t str_len,
1446 const int column_target,
1447 const int tab_width)
1448{
1449 size_t offset = 0, offset_next = 0;
1450 int column = 0;
1451 while ((offset < str_len) && (column < column_target)) {
1452 const uint code = BLI_str_utf8_as_unicode_step_safe(str, str_len, &offset_next);
1453 /* The following line is the only change compared with #BLI_str_utf8_offset_from_column. */
1454 column += (code == '\t') ? (tab_width - (column % tab_width)) : BLI_wcwidth_safe(code);
1455 if (column > column_target) {
1456 break;
1457 }
1458 offset = offset_next;
1459 }
1460 return int(offset);
1461}
1462
1463int BLI_str_utf8_column_count(const char *str, size_t str_len)
1464{
1465 return BLI_str_utf8_offset_to_column(str, str_len, int(str_len));
1466}
1467
#define BLI_assert(a)
Definition BLI_assert.h:46
#define ATTR_FALLTHROUGH
#define BLI_INLINE
#define BLI_string_debug_size(str, str_maxncpy)
Definition BLI_string.h:676
char char size_t BLI_strncpy_rlen(char *__restrict dst, const char *__restrict src, size_t dst_maxncpy) ATTR_WARN_UNUSED_RESULT ATTR_NONNULL(1
#define BLI_UTF8_ERR
unsigned char uchar
unsigned int uint
#define ARRAY_SIZE(arr)
#define UNUSED_VARS(...)
#define UNLIKELY(x)
#define ELEM(...)
#define LIKELY(x)
#define U
#define str(s)
VecBase< float, D > step(VecOp< float, D >, VecOp< float, D >) RET
int count
format
ccl_device_inline float2 mask(const MaskType mask, const float2 a)
#define min(a, b)
Definition sort.cc:36
BLI_INLINE int utf8_char_compute_skip_or_error(const char c)
bool BLI_str_utf32_char_is_breaking_space(char32_t codepoint)
uint BLI_str_utf8_as_unicode_or_error(const char *p)
bool BLI_str_utf8_truncate_at_size(char *str, const size_t str_size)
int BLI_wcswidth_or_error(const char32_t *pwcs, size_t n)
int BLI_str_utf8_offset_from_column(const char *str, const size_t str_len, const int column_target)
int BLI_str_utf8_char_width_or_error(const char *p)
BLI_INLINE int utf8_char_compute_skip_or_error_with_mask(const char c, char *r_mask)
size_t BLI_str_utf8_from_unicode_len(const uint c)
BLI_INLINE int utf8_char_compute_skip(const char c)
size_t BLI_str_partition_ex_utf8(const char *str, const char *end, const uint delim[], const char **r_sep, const char **r_suf, const bool from_right)
size_t BLI_strlen_utf8_ex(const char *strc, size_t *r_len_bytes)
bool BLI_str_utf32_char_is_optional_break_before(char32_t codepoint, char32_t codepoint_prev)
char * BLI_strncpy_utf8(char *__restrict dst, const char *__restrict src, size_t dst_maxncpy)
size_t BLI_strncpy_utf8_rlen_unterminated(char *__restrict dst, const char *__restrict src, size_t dst_maxncpy)
size_t BLI_strlen_utf8(const char *strc)
int BLI_str_utf8_invalid_strip(char *str, size_t str_len)
char32_t BLI_str_utf32_char_to_lower(const char32_t wc)
#define UTF8_VARS_FROM_CHAR32(Char, First, Len)
size_t BLI_strnlen_utf8(const char *strc, const size_t strc_maxlen)
size_t BLI_str_utf32_as_utf8_len(const char32_t *src)
char32_t BLI_str_utf32_char_to_upper(const char32_t wc)
ptrdiff_t BLI_str_utf8_invalid_byte(const char *str, size_t str_len)
uint BLI_str_utf8_as_unicode_safe(const char *p)
BLI_INLINE uint utf8_char_decode(const char *p, const char mask, const int len, const uint err)
static size_t str_utf8_truncate_at_size_unchecked(char *str, const size_t str_size)
size_t BLI_strncpy_wchar_as_utf8(char *__restrict dst, const wchar_t *__restrict src, const size_t dst_maxncpy)
size_t BLI_strncpy_wchar_from_utf8(wchar_t *__restrict dst_w, const char *__restrict src_c, const size_t dst_w_maxncpy)
size_t BLI_snprintf_utf8(char *__restrict dst, size_t dst_maxncpy, const char *__restrict format,...)
const char * BLI_str_find_next_char_utf8(const char *p, const char *str_end)
size_t BLI_str_utf32_as_utf8_len_ex(const char32_t *src, const size_t src_maxlen)
size_t BLI_vsnprintf_utf8_rlen(char *__restrict dst, size_t dst_maxncpy, const char *__restrict format, va_list arg)
const char * BLI_str_find_prev_char_utf8(const char *p, const char *str_start)
size_t BLI_wstrlen_utf8(const wchar_t *src)
int BLI_wcwidth_safe(char32_t ucs)
int BLI_str_utf8_offset_to_index(const char *str, const size_t str_len, const int offset_target)
int BLI_str_utf8_offset_to_column_with_tabs(const char *str, const size_t str_len, const int offset_target, const int tab_width)
size_t BLI_str_utf32_as_utf8(char *__restrict dst, const char32_t *__restrict src, const size_t dst_maxncpy)
int BLI_str_utf8_offset_to_column(const char *str, const size_t str_len, const int offset_target)
size_t BLI_vsnprintf_utf8(char *__restrict dst, size_t dst_maxncpy, const char *__restrict format, va_list arg)
BLI_INLINE char * str_utf8_copy_max_bytes_impl(char *dst, const char *src, size_t dst_maxncpy)
uint BLI_str_utf8_as_unicode_step_safe(const char *__restrict p, const size_t p_len, size_t *__restrict index)
bool BLI_str_utf32_char_is_optional_break_after(char32_t codepoint, char32_t codepoint_prev)
size_t BLI_str_utf8_as_utf32(char32_t *__restrict dst_w, const char *__restrict src_c, const size_t dst_w_maxncpy)
size_t BLI_str_utf8_from_unicode(uint c, char *dst, const size_t dst_maxncpy)
const char * BLI_str_utf8_invalid_substitute_if_needed(const char *str, const size_t str_len, const char substitute, char *buf, const size_t buf_maxncpy)
int BLI_str_utf8_size_safe(const char *p)
int BLI_str_utf8_column_count(const char *str, size_t str_len)
size_t BLI_str_rpartition_utf8(const char *str, const uint delim[], const char **r_sep, const char **r_suf)
int BLI_wcwidth_or_error(char32_t ucs)
uint BLI_str_utf8_as_unicode_step_or_error(const char *__restrict p, const size_t p_len, size_t *__restrict index)
size_t BLI_strncpy_utf8_rlen(char *__restrict dst, const char *__restrict src, size_t dst_maxncpy)
size_t BLI_str_partition_utf8(const char *str, const uint delim[], const char **r_sep, const char **r_suf)
int BLI_str_utf8_offset_from_index(const char *str, const size_t str_len, const int index_target)
int BLI_str_utf8_char_width_safe(const char *p)
size_t BLI_snprintf_utf8_rlen(char *__restrict dst, size_t dst_maxncpy, const char *__restrict format,...)
int BLI_str_utf8_size_or_error(const char *p)
int BLI_str_utf8_offset_from_column_with_tabs(const char *str, const size_t str_len, const int column_target, const int tab_width)
size_t BLI_strnlen_utf8_ex(const char *strc, const size_t strc_maxlen, size_t *r_len_bytes)
int BLI_str_utf8_invalid_substitute(char *str, size_t str_len, const char substitute)
i
Definition text_draw.cc:230
max
Definition text_draw.cc:251
int conv_utf_8_to_16(const char *in8, wchar_t *out16, size_t size16)
Definition utfconv.cc:182
uint len