Blender V5.0
BLI_string_utf8_test.cc
Go to the documentation of this file.
1/* SPDX-FileCopyrightText: 2023 Blender Authors
2 *
3 * SPDX-License-Identifier: Apache-2.0 */
4
5#include "testing/testing.h"
6
7#include "BLI_rand.h"
8#include "BLI_string.h"
10#include "BLI_string_utf8.h"
11#include "BLI_utildefines.h"
12
13/* Note that 'common' UTF8 variants of string functions (like copy, etc.) are tested in
14 * `BLI_string_test.cc` However, tests below are specific UTF8 conformance ones,
15 * and since they eat quite their share of lines, they deserved their own file. */
16
51
52#define STR_MB_ALPHA_1 "\x41"
53#define STR_MB_ALPHA_2 "\xc2\xaa"
54#define STR_MB_ALPHA_3 "\xe0\xa0\x80"
55#define STR_MB_ALPHA_4 "\xf0\x90\x80\x80"
56
/* These don't decode into valid code-points and won't work in all UTF8 functions.
58 * Use them for functions which support up to #BLI_UTF8_MAX, where failure to test
59 * 5 & 6 byte sequences would cause test coverage to be incomplete.
60 * See https://stackoverflow.com/a/35027139 for details. */
61#define STR_MB_ALPHA_5 "\xf8\x80\x80\x80\x80"
62#define STR_MB_ALPHA_6 "\xfc\x80\x80\x80\x80\x80"
63
64/* -------------------------------------------------------------------- */
67
68/* Breaking strings is confusing here, prefer over-long lines. */
69/* clang-format off */
70
71/* Each test is made of a 79 bytes (80 with null char) string to test, expected string result after
72 * stripping invalid UTF8 bytes, and a single-byte string encoded with expected number of errors.
73 *
74 * Based on UTF8 decoder stress-test (https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt)
75 * by Markus Kuhn <http://www.cl.cam.ac.uk/~mgk25/> - 2015-08-28 - CC BY 4.0
76 */
77static const char *utf8_invalid_tests[][3] = {
78/* 1 Some correct UTF-8 text. */
79 {"You should see the Greek word 'kosme': \"\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5\" |",
80 "You should see the Greek word 'kosme': \"\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5\" |", "\x00"},
81
82/* 2 Boundary condition test cases
83 * Note that those will pass for us, those are not erroneous unicode code points
84 * (aside from \x00, which is only valid as string terminator).
85 * 2.1 First possible sequence of a certain length */
86 {"2.1.1 1 byte (U-00000000): \"\x00\" |",
87 "2.1.1 1 byte (U-00000000): \"\" |", "\x01"},
88 {"2.1.2 2 bytes (U-00000080): \"\xc2\x80\" |",
89 "2.1.2 2 bytes (U-00000080): \"\xc2\x80\" |", "\x00"},
90 {"2.1.3 3 bytes (U-00000800): \"\xe0\xa0\x80\" |",
91 "2.1.3 3 bytes (U-00000800): \"\xe0\xa0\x80\" |", "\x00"},
92 {"2.1.4 4 bytes (U-00010000): \"\xf0\x90\x80\x80\" |",
93 "2.1.4 4 bytes (U-00010000): \"\xf0\x90\x80\x80\" |", "\x00"},
94 {"2.1.5 5 bytes (U-00200000): \"\xf8\x88\x80\x80\x80\" |",
95 "2.1.5 5 bytes (U-00200000): \"\xf8\x88\x80\x80\x80\" |", "\x00"},
96 {"2.1.6 6 bytes (U-04000000): \"\xfc\x84\x80\x80\x80\x80\" |",
97 "2.1.6 6 bytes (U-04000000): \"\xfc\x84\x80\x80\x80\x80\" |", "\x00"},
98/* 2.2 Last possible sequence of a certain length */
99 {"2.2.1 1 byte (U-0000007F): \"\x7f\" |",
100 "2.2.1 1 byte (U-0000007F): \"\x7f\" |", "\x00"},
101 {"2.2.2 2 bytes (U-000007FF): \"\xdf\xbf\" |",
102 "2.2.2 2 bytes (U-000007FF): \"\xdf\xbf\" |", "\x00"},
103 {"2.2.3 3 bytes (U-0000FFFF): \"\xef\xbf\xbf\" |",
104 "2.2.3 3 bytes (U-0000FFFF): \"\" |", "\x03"}, /* matches one of 5.3 sequences... */
105 {"2.2.4 4 bytes (U-001FFFFF): \"\xf7\xbf\xbf\xbf\" |",
106 "2.2.4 4 bytes (U-001FFFFF): \"\xf7\xbf\xbf\xbf\" |", "\x00"},
107 {"2.2.5 5 bytes (U-03FFFFFF): \"\xfb\xbf\xbf\xbf\xbf\" |",
108 "2.2.5 5 bytes (U-03FFFFFF): \"\xfb\xbf\xbf\xbf\xbf\" |", "\x00"},
109 {"2.2.6 6 bytes (U-7FFFFFFF): \"\xfd\xbf\xbf\xbf\xbf\xbf\" |",
110 "2.2.6 6 bytes (U-7FFFFFFF): \"\xfd\xbf\xbf\xbf\xbf\xbf\" |", "\x00"},
111/* 2.3 Other boundary conditions */
112 {"2.3.1 U-0000D7FF = ed 9f bf = \"\xed\x9f\xbf\" |",
113 "2.3.1 U-0000D7FF = ed 9f bf = \"\xed\x9f\xbf\" |", "\x00"},
114 {"2.3.2 U-0000E000 = ee 80 80 = \"\xee\x80\x80\" |",
115 "2.3.2 U-0000E000 = ee 80 80 = \"\xee\x80\x80\" |", "\x00"},
116 {"2.3.3 U-0000FFFD = ef bf bd = \"\xef\xbf\xbd\" |",
117 "2.3.3 U-0000FFFD = ef bf bd = \"\xef\xbf\xbd\" |", "\x00"},
118 {"2.3.4 U-0010FFFF = f4 8f bf bf = \"\xf4\x8f\xbf\xbf\" |",
119 "2.3.4 U-0010FFFF = f4 8f bf bf = \"\xf4\x8f\xbf\xbf\" |", "\x00"},
120 {"2.3.5 U-00110000 = f4 90 80 80 = \"\xf4\x90\x80\x80\" |",
121 "2.3.5 U-00110000 = f4 90 80 80 = \"\xf4\x90\x80\x80\" |", "\x00"},
122
123/* 3 Malformed sequences
124 * 3.1 Unexpected continuation bytes
125 * Each unexpected continuation byte should be separately signaled as a malformed sequence of its own. */
126 {"3.1.1 First continuation byte 0x80: \"\x80\" |",
127 "3.1.1 First continuation byte 0x80: \"\" |", "\x01"},
128 {"3.1.2 Last continuation byte 0xbf: \"\xbf\" |",
129 "3.1.2 Last continuation byte 0xbf: \"\" |", "\x01"},
130 {"3.1.3 2 continuation bytes: \"\x80\xbf\" |",
131 "3.1.3 2 continuation bytes: \"\" |", "\x02"},
132 {"3.1.4 3 continuation bytes: \"\x80\xbf\x80\" |",
133 "3.1.4 3 continuation bytes: \"\" |", "\x03"},
134 {"3.1.5 4 continuation bytes: \"\x80\xbf\x80\xbf\" |",
135 "3.1.5 4 continuation bytes: \"\" |", "\x04"},
136 {"3.1.6 5 continuation bytes: \"\x80\xbf\x80\xbf\x80\" |",
137 "3.1.6 5 continuation bytes: \"\" |", "\x05"},
138 {"3.1.7 6 continuation bytes: \"\x80\xbf\x80\xbf\x80\xbf\" |",
139 "3.1.7 6 continuation bytes: \"\" |", "\x06"},
140 {"3.1.8 7 continuation bytes: \"\x80\xbf\x80\xbf\x80\xbf\x80\" |",
141 "3.1.8 7 continuation bytes: \"\" |", "\x07"},
142/* 3.1.9 Sequence of all 64 possible continuation bytes (0x80-0xbf): | */
143 {"3.1.9 \"\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f"
144 "\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f"
145 "\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf"
146 "\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\" |",
147 "3.1.9 \"\" |", "\x40"}, /* NOLINT: modernize-raw-string-literal. */
148/* 3.2 Lonely start characters
149 * 3.2.1 All 32 first bytes of 2-byte sequences (0xc0-0xdf), each followed by a space character: */
150 {"3.2.1 \"\xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf "
151 "\xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf \" |",
152 "3.2.1 \" \" |", "\x20"}, /* NOLINT: modernize-raw-string-literal. */
153/* 3.2.2 All 16 first bytes of 3-byte sequences (0xe0-0xef), each followed by a space character: */
154 {"3.2.2 \"\xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef \" |",
155 "3.2.2 \" \" |", "\x10"},
156/* 3.2.3 All 8 first bytes of 4-byte sequences (0xf0-0xf7), each followed by a space character: */
157 {"3.2.3 \"\xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \" |",
158 "3.2.3 \" \" |", "\x08"},
159/* 3.2.4 All 4 first bytes of 5-byte sequences (0xf8-0xfb), each followed by a space character: */
160 {"3.2.4 \"\xf8 \xf9 \xfa \xfb \" |",
161 "3.2.4 \" \" |", "\x04"},
162/* 3.2.5 All 2 first bytes of 6-byte sequences (0xfc-0xfd), each followed by a space character: */
163 {"3.2.4 \"\xfc \xfd \" |",
164 "3.2.4 \" \" |", "\x02"},
165/* 3.3 Sequences with last continuation byte missing
166 * All bytes of an incomplete sequence should be signaled as a single malformed sequence,
167 * i.e., you should see only a single replacement character in each of the next 10 tests.
168 * (Characters as in section 2) */
169 {"3.3.1 2-byte sequence with last byte missing (U+0000): \"\xc0\" |",
170 "3.3.1 2-byte sequence with last byte missing (U+0000): \"\" |", "\x01"},
171 {"3.3.2 3-byte sequence with last byte missing (U+0000): \"\xe0\x80\" |",
172 "3.3.2 3-byte sequence with last byte missing (U+0000): \"\" |", "\x02"},
173 {"3.3.3 4-byte sequence with last byte missing (U+0000): \"\xf0\x80\x80\" |",
174 "3.3.3 4-byte sequence with last byte missing (U+0000): \"\" |", "\x03"},
175 {"3.3.4 5-byte sequence with last byte missing (U+0000): \"\xf8\x80\x80\x80\" |",
176 "3.3.4 5-byte sequence with last byte missing (U+0000): \"\" |", "\x04"},
177 {"3.3.5 6-byte sequence with last byte missing (U+0000): \"\xfc\x80\x80\x80\x80\" |",
178 "3.3.5 6-byte sequence with last byte missing (U+0000): \"\" |", "\x05"},
179 {"3.3.6 2-byte sequence with last byte missing (U-000007FF): \"\xdf\" |",
180 "3.3.6 2-byte sequence with last byte missing (U-000007FF): \"\" |", "\x01"},
181 {"3.3.7 3-byte sequence with last byte missing (U-0000FFFF): \"\xef\xbf\" |",
182 "3.3.7 3-byte sequence with last byte missing (U-0000FFFF): \"\" |", "\x02"},
183 {"3.3.8 4-byte sequence with last byte missing (U-001FFFFF): \"\xf7\xbf\xbf\" |",
184 "3.3.8 4-byte sequence with last byte missing (U-001FFFFF): \"\" |", "\x03"},
185 {"3.3.9 5-byte sequence with last byte missing (U-03FFFFFF): \"\xfb\xbf\xbf\xbf\" |",
186 "3.3.9 5-byte sequence with last byte missing (U-03FFFFFF): \"\" |", "\x04"},
187 {"3.3.10 6-byte sequence with last byte missing (U-7FFFFFFF): \"\xfd\xbf\xbf\xbf\xbf\" |",
188 "3.3.10 6-byte sequence with last byte missing (U-7FFFFFFF): \"\" |", "\x05"},
189/* 3.4 Concatenation of incomplete sequences
190 * All the 10 sequences of 3.3 concatenated, you should see 10 malformed sequences being signaled: */
191 {"3.4 \"\xc0\xe0\x80\xf0\x80\x80\xf8\x80\x80\x80\xfc\x80\x80\x80\x80"
192 "\xdf\xef\xbf\xf7\xbf\xbf\xfb\xbf\xbf\xbf\xfd\xbf\xbf\xbf\xbf\""
193 " |",
194 "3.4 \"\" |", "\x1e"},
195/* 3.5 Impossible bytes
196 * The following two bytes cannot appear in a correct UTF-8 string */
197 {"3.5.1 fe = \"\xfe\" |",
198 "3.5.1 fe = \"\" |", "\x01"},
199 {"3.5.2 ff = \"\xff\" |",
200 "3.5.2 ff = \"\" |", "\x01"},
201 {"3.5.3 fe fe ff ff = \"\xfe\xfe\xff\xff\" |",
202 "3.5.3 fe fe ff ff = \"\" |", "\x04"},
203
204/* 4 Overlong sequences
205 * The following sequences are not malformed according to the letter of the Unicode 2.0 standard.
 * However, they are longer than necessary and a correct UTF-8 encoder is not allowed to produce them.
207 * A "safe UTF-8 decoder" should reject them just like malformed sequences for two reasons:
208 * (1) It helps to debug applications if overlong sequences are not treated as valid representations
209 * of characters, because this helps to spot problems more quickly. (2) Overlong sequences provide
210 * alternative representations of characters, that could maliciously be used to bypass filters that check
211 * only for ASCII characters. For instance, a 2-byte encoded line feed (LF) would not be caught by a
212 * line counter that counts only 0x0a bytes, but it would still be processed as a line feed by an unsafe
213 * UTF-8 decoder later in the pipeline. From a security point of view, ASCII compatibility of UTF-8
214 * sequences means also, that ASCII characters are *only* allowed to be represented by ASCII bytes
215 * in the range 0x00-0x7f. To ensure this aspect of ASCII compatibility, use only "safe UTF-8 decoders"
216 * that reject overlong UTF-8 sequences for which a shorter encoding exists.
217 *
218 * 4.1 Examples of an overlong ASCII character
219 * With a safe UTF-8 decoder, all of the following five overlong representations of the ASCII character
220 * slash ("/") should be rejected like a malformed UTF-8 sequence, for instance by substituting it with
221 * a replacement character. If you see a slash below, you do not have a safe UTF-8 decoder! */
222 {"4.1.1 U+002F = c0 af = \"\xc0\xaf\" |",
223 "4.1.1 U+002F = c0 af = \"\" |", "\x02"},
224 {"4.1.2 U+002F = e0 80 af = \"\xe0\x80\xaf\" |",
225 "4.1.2 U+002F = e0 80 af = \"\" |", "\x03"},
226 {"4.1.3 U+002F = f0 80 80 af = \"\xf0\x80\x80\xaf\" |",
227 "4.1.3 U+002F = f0 80 80 af = \"\" |", "\x04"},
228 {"4.1.4 U+002F = f8 80 80 80 af = \"\xf8\x80\x80\x80\xaf\" |",
229 "4.1.4 U+002F = f8 80 80 80 af = \"\" |", "\x05"},
230 {"4.1.5 U+002F = fc 80 80 80 80 af = \"\xfc\x80\x80\x80\x80\xaf\" |",
231 "4.1.5 U+002F = fc 80 80 80 80 af = \"\" |", "\x06"},
232/* 4.2 Maximum overlong sequences
233 * Below you see the highest Unicode value that is still resulting in an overlong sequence if represented
234 * with the given number of bytes. This is a boundary test for safe UTF-8 decoders. All five characters
235 * should be rejected like malformed UTF-8 sequences. */
236 {"4.2.1 U-0000007F = c1 bf = \"\xc1\xbf\" |",
237 "4.2.1 U-0000007F = c1 bf = \"\" |", "\x02"},
238 {"4.2.2 U-000007FF = e0 9f bf = \"\xe0\x9f\xbf\" |",
239 "4.2.2 U-000007FF = e0 9f bf = \"\" |", "\x03"},
240 {"4.2.3 U-0000FFFF = f0 8f bf bf = \"\xf0\x8f\xbf\xbf\" |",
241 "4.2.3 U-0000FFFF = f0 8f bf bf = \"\" |", "\x04"},
242 {"4.2.4 U-001FFFFF = f8 87 bf bf bf = \"\xf8\x87\xbf\xbf\xbf\" |",
243 "4.2.4 U-001FFFFF = f8 87 bf bf bf = \"\" |", "\x05"},
244 {"4.2.5 U+0000 = fc 83 bf bf bf bf = \"\xfc\x83\xbf\xbf\xbf\xbf\" |",
245 "4.2.5 U+0000 = fc 83 bf bf bf bf = \"\" |", "\x06"},
246/* 4.3 Overlong representation of the NUL character
247 * The following five sequences should also be rejected like malformed UTF-8 sequences and should not be
248 * treated like the ASCII NUL character. */
249 {"4.3.1 U+0000 = c0 80 = \"\xc0\x80\" |",
250 "4.3.1 U+0000 = c0 80 = \"\" |", "\x02"},
251 {"4.3.2 U+0000 = e0 80 80 = \"\xe0\x80\x80\" |",
252 "4.3.2 U+0000 = e0 80 80 = \"\" |", "\x03"},
253 {"4.3.3 U+0000 = f0 80 80 80 = \"\xf0\x80\x80\x80\" |",
254 "4.3.3 U+0000 = f0 80 80 80 = \"\" |", "\x04"},
255 {"4.3.4 U+0000 = f8 80 80 80 80 = \"\xf8\x80\x80\x80\x80\" |",
256 "4.3.4 U+0000 = f8 80 80 80 80 = \"\" |", "\x05"},
257 {"4.3.5 U+0000 = fc 80 80 80 80 80 = \"\xfc\x80\x80\x80\x80\x80\" |",
258 "4.3.5 U+0000 = fc 80 80 80 80 80 = \"\" |", "\x06"},
259
260/* 5 Illegal code positions
261 * The following UTF-8 sequences should be rejected like malformed sequences, because they never represent
262 * valid ISO 10646 characters and a UTF-8 decoder that accepts them might introduce security problems
263 * comparable to overlong UTF-8 sequences.
264 * 5.1 Single UTF-16 surrogates */
265 {"5.1.1 U+D800 = ed a0 80 = \"\xed\xa0\x80\" |",
266 "5.1.1 U+D800 = ed a0 80 = \"\" |", "\x03"},
267 {"5.1.2 U+DB7F = ed ad bf = \"\xed\xad\xbf\" |",
268 "5.1.2 U+DB7F = ed ad bf = \"\" |", "\x03"},
269 {"5.1.3 U+DB80 = ed ae 80 = \"\xed\xae\x80\" |",
270 "5.1.3 U+DB80 = ed ae 80 = \"\" |", "\x03"},
271 {"5.1.4 U+DBFF = ed af bf = \"\xed\xaf\xbf\" |",
272 "5.1.4 U+DBFF = ed af bf = \"\" |", "\x03"},
273 {"5.1.5 U+DC00 = ed b0 80 = \"\xed\xb0\x80\" |",
274 "5.1.5 U+DC00 = ed b0 80 = \"\" |", "\x03"},
275 {"5.1.6 U+DF80 = ed be 80 = \"\xed\xbe\x80\" |",
276 "5.1.6 U+DF80 = ed be 80 = \"\" |", "\x03"},
277 {"5.1.7 U+DFFF = ed bf bf = \"\xed\xbf\xbf\" |",
278 "5.1.7 U+DFFF = ed bf bf = \"\" |", "\x03"},
279/* 5.2 Paired UTF-16 surrogates */
280 {"5.2.1 U+D800 U+DC00 = ed a0 80 ed b0 80 = \"\xed\xa0\x80\xed\xb0\x80\" |",
281 "5.2.1 U+D800 U+DC00 = ed a0 80 ed b0 80 = \"\" |", "\x06"},
282 {"5.2.2 U+D800 U+DFFF = ed a0 80 ed bf bf = \"\xed\xa0\x80\xed\xbf\xbf\" |",
283 "5.2.2 U+D800 U+DFFF = ed a0 80 ed bf bf = \"\" |", "\x06"},
284 {"5.2.3 U+DB7F U+DC00 = ed ad bf ed b0 80 = \"\xed\xad\xbf\xed\xb0\x80\" |",
285 "5.2.3 U+DB7F U+DC00 = ed ad bf ed b0 80 = \"\" |", "\x06"},
286 {"5.2.4 U+DB7F U+DFFF = ed ad bf ed bf bf = \"\xed\xad\xbf\xed\xbf\xbf\" |",
287 "5.2.4 U+DB7F U+DFFF = ed ad bf ed bf bf = \"\" |", "\x06"},
288 {"5.2.5 U+DB80 U+DC00 = ed ae 80 ed b0 80 = \"\xed\xae\x80\xed\xb0\x80\" |",
289 "5.2.5 U+DB80 U+DC00 = ed ae 80 ed b0 80 = \"\" |", "\x06"},
290 {"5.2.6 U+DB80 U+DFFF = ed ae 80 ed bf bf = \"\xed\xae\x80\xed\xbf\xbf\" |",
291 "5.2.6 U+DB80 U+DFFF = ed ae 80 ed bf bf = \"\" |", "\x06"},
292 {"5.2.7 U+DBFF U+DC00 = ed af bf ed b0 80 = \"\xed\xaf\xbf\xed\xb0\x80\" |",
293 "5.2.7 U+DBFF U+DC00 = ed af bf ed b0 80 = \"\" |", "\x06"},
294 {"5.2.8 U+DBFF U+DFFF = ed af bf ed bf bf = \"\xed\xaf\xbf\xed\xbf\xbf\" |",
295 "5.2.8 U+DBFF U+DFFF = ed af bf ed bf bf = \"\" |", "\x06"},
296/* 5.3 Non-character code positions
297 * The following "non-characters" are "reserved for internal use" by applications, and according to older versions
298 * of the Unicode Standard "should never be interchanged". Unicode Corrigendum #9 dropped the latter restriction.
299 * Nevertheless, their presence in incoming UTF-8 data can remain a potential security risk, depending
300 * on what use is made of these codes subsequently. Examples of such internal use:
301 * - Some file APIs with 16-bit characters may use the integer value -1 = U+FFFF to signal
302 * an end-of-file (EOF) or error condition.
303 * - In some UTF-16 receivers, code point U+FFFE might trigger a byte-swap operation
304 * (to convert between UTF-16LE and UTF-16BE).
305 * With such internal use of non-characters, it may be desirable and safer to block those code points in
306 * UTF-8 decoders, as they should never occur legitimately in incoming UTF-8 data, and could trigger
307 * unsafe behavior in subsequent processing.
308 *
309 * Particularly problematic non-characters in 16-bit applications: */
310 {"5.3.1 U+FFFE = ef bf be = \"\xef\xbf\xbe\" |",
311 "5.3.1 U+FFFE = ef bf be = \"\" |", "\x03"},
312 {"5.3.2 U+FFFF = ef bf bf = \"\xef\xbf\xbf\" |",
313 "5.3.2 U+FFFF = ef bf bf = \"\" |", "\x03"},
314 /* For now, we ignore those, they do not seem to be crucial anyway... */
315/* 5.3.3 U+FDD0 .. U+FDEF
316 * 5.3.4 U+nFFFE U+nFFFF (for n = 1..10) */
317 {nullptr, nullptr, nullptr},
318};
319/* clang-format on */
320
321/* BLI_str_utf8_invalid_strip (and indirectly, BLI_str_utf8_invalid_byte). */
322TEST(string, Utf8InvalidBytesStrip)
323{
324 for (int i = 0; utf8_invalid_tests[i][0] != nullptr; i++) {
325 const char *tst = utf8_invalid_tests[i][0];
326 const char *tst_stripped = utf8_invalid_tests[i][1];
327 const int errors_num = int(utf8_invalid_tests[i][2][0]);
328
329 char buff[80];
330 memcpy(buff, tst, sizeof(buff));
331
332 const int errors_found_num = BLI_str_utf8_invalid_strip(buff, sizeof(buff) - 1);
333
334 printf("[%02d] -> [%02d] \"%s\" -> \"%s\"\n", errors_num, errors_found_num, tst, buff);
335 EXPECT_EQ(errors_found_num, errors_num);
336 EXPECT_STREQ(buff, tst_stripped);
337 }
338}
339
340/* BLI_str_utf8_invalid_substitute (and indirectly, BLI_str_utf8_invalid_byte). */
341TEST(string, Utf8InvalidBytesSubstitute)
342{
343 for (int i = 0; utf8_invalid_tests[i][0] != nullptr; i++) {
344 const char *tst = utf8_invalid_tests[i][0];
345 const int errors_num = int(utf8_invalid_tests[i][2][0]);
346
347 char buff[80];
348 memcpy(buff, tst, sizeof(buff));
349
350 const int errors_found_num = BLI_str_utf8_invalid_substitute(buff, sizeof(buff) - 1, '?');
351
352 EXPECT_EQ(errors_found_num, errors_num);
353 EXPECT_EQ(BLI_str_utf8_invalid_byte(buff, sizeof(buff) - 1), -1);
354 EXPECT_EQ(strlen(buff), sizeof(buff) - 1);
355 }
356}
357
358TEST(string, Utf8InvalidBytesSubstitutePatterns)
359{
360#define TEST_SIMPLE(src_chars, expected_error_count, expected_str) \
361 { \
362 char buff[] = src_chars; \
363 EXPECT_EQ(BLI_str_utf8_invalid_substitute(buff, strlen(buff), '?'), expected_error_count); \
364 EXPECT_STREQ(buff, expected_str); \
365 } \
366 ((void)0)
367
368#define ARRAY_ARG(...) __VA_ARGS__
369
370 /* Empty string. */
371 TEST_SIMPLE(ARRAY_ARG({0x0}), 0, "");
372 /* One good. */
373 TEST_SIMPLE(ARRAY_ARG({'A', 0x0}), 0, "A");
374 /* One bad. */
375 TEST_SIMPLE(ARRAY_ARG({0xff, 0x0}), 1, "?");
376
377 /* Additional patterns. */
378 TEST_SIMPLE(ARRAY_ARG({0xe0, 0xef, 0x0}), 2, "??");
379 TEST_SIMPLE(ARRAY_ARG({0xe0, 'A', 0x0}), 1, "?A");
380 TEST_SIMPLE(ARRAY_ARG({'A', 0xef, 0x0}), 1, "A?");
381 TEST_SIMPLE(ARRAY_ARG({0xe0, 'A', 0xed, 0x0}), 2, "?A?");
382
383#undef ARRAY_ARG
384#undef TEST_SIMPLE
385}
386
388
389/* -------------------------------------------------------------------- */
392
393TEST(string, StringNLenUTF8)
394{
395 EXPECT_EQ(BLI_strnlen_utf8("", 0), 0);
396 EXPECT_EQ(BLI_strnlen_utf8("", 1), 0);
398}
399
400TEST(string, StringNLenUTF8_Incomplete)
401{
402 const char *ref_str =
404 char buf[22];
405 const size_t ref_str_len = 21;
406
407#define EXPECT_BYTE_OFFSET(truncate_ofs, expect_nchars) \
408 { \
409 size_t buf_ofs = 0; \
410 STRNCPY(buf, ref_str); \
411 buf[truncate_ofs] = '\0'; \
412 EXPECT_EQ(BLI_strnlen_utf8_ex(buf, ref_str_len, &buf_ofs), expect_nchars); \
413 EXPECT_EQ(buf_ofs, truncate_ofs); \
414 }
415
416 EXPECT_BYTE_OFFSET(0, 0);
417 EXPECT_BYTE_OFFSET(1, 1);
418
419 EXPECT_BYTE_OFFSET(2, 2);
420 EXPECT_BYTE_OFFSET(3, 2);
421
422 EXPECT_BYTE_OFFSET(4, 3);
423 EXPECT_BYTE_OFFSET(5, 3);
424 EXPECT_BYTE_OFFSET(6, 3);
425
426 EXPECT_BYTE_OFFSET(7, 4);
427 EXPECT_BYTE_OFFSET(8, 4);
428 EXPECT_BYTE_OFFSET(9, 4);
429 EXPECT_BYTE_OFFSET(10, 4);
430
431 EXPECT_BYTE_OFFSET(11, 5);
432 EXPECT_BYTE_OFFSET(12, 5);
433 EXPECT_BYTE_OFFSET(13, 5);
434 EXPECT_BYTE_OFFSET(14, 5);
435 EXPECT_BYTE_OFFSET(15, 5);
436
437 EXPECT_BYTE_OFFSET(16, 6);
438 EXPECT_BYTE_OFFSET(17, 6);
439 EXPECT_BYTE_OFFSET(18, 6);
440 EXPECT_BYTE_OFFSET(19, 6);
441 EXPECT_BYTE_OFFSET(20, 6);
442 EXPECT_BYTE_OFFSET(21, 6);
443
444#undef EXPECT_BYTE_OFFSET
445}
446
448
449/* -------------------------------------------------------------------- */
452
453TEST(string, StrCopyUTF8_ASCII)
454{
455#define STRNCPY_UTF8_ASCII(...) \
456 { \
457 const char src[] = {__VA_ARGS__, 0}; \
458 char dst[sizeof(src)]; \
459 memset(dst, 0xff, sizeof(dst)); \
460 STRNCPY_UTF8(dst, src); \
461 EXPECT_EQ(strlen(dst), sizeof(dst) - 1); \
462 EXPECT_STREQ(dst, src); \
463 }
464
466 STRNCPY_UTF8_ASCII('a', 'b', 'c');
467
468#undef STRNCPY_UTF8_ASCII
469}
470
471TEST(string, StrCopyUTF8_ASCII_Truncate)
472{
473#define STRNCPY_UTF8_ASCII_TRUNCATE(maxncpy, ...) \
474 { \
475 char src[] = {__VA_ARGS__}; \
476 char dst[sizeof(src)]; \
477 memset(dst, 0xff, sizeof(dst)); \
478 BLI_strncpy_utf8(dst, src, maxncpy); \
479 int len_expect = std::min<int>(sizeof(src), maxncpy) - 1; \
480 src[len_expect] = '\0'; /* To be able to use `EXPECT_STREQ`. */ \
481 EXPECT_EQ(strlen(dst), len_expect); \
482 EXPECT_STREQ(dst, src); \
483 }
484
486 STRNCPY_UTF8_ASCII_TRUNCATE(3, 'A', 'A', 'A', 'A');
487
488#undef STRNCPY_UTF8_ASCII_TRUNCATE
489}
490
491TEST(string, StrCopyUTF8_TruncateEncoding)
492{
493 /* Ensure copying one byte less than the code-point results in it being ignored entirely. */
494#define STRNCPY_UTF8_TRUNCATE(byte_size, ...) \
495 { \
496 const char src[] = {__VA_ARGS__, 0}; \
497 EXPECT_EQ(BLI_str_utf8_size_or_error(src), byte_size); \
498 char dst[sizeof(src)]; \
499 memset(dst, 0xff, sizeof(dst)); \
500 STRNCPY_UTF8(dst, src); \
501 EXPECT_EQ(strlen(dst), sizeof(dst) - 1); \
502 EXPECT_STREQ(dst, src); \
503 BLI_strncpy_utf8(dst, src, sizeof(dst) - 1); \
504 EXPECT_STREQ(dst, ""); \
505 }
506
507 STRNCPY_UTF8_TRUNCATE(6, 252, 1, 1, 1, 1, 1);
508 STRNCPY_UTF8_TRUNCATE(5, 248, 1, 1, 1, 1);
509 STRNCPY_UTF8_TRUNCATE(4, 240, 1, 1, 1);
510 STRNCPY_UTF8_TRUNCATE(3, 224, 1, 1);
511 STRNCPY_UTF8_TRUNCATE(2, 192, 1);
513
514#undef STRNCPY_UTF8_TRUNCATE
515}
516
517TEST(string, StrCopyUTF8_TruncateEncodingMulti)
518{
519#define STRNCPY_UTF8_TRUNC_EXPECT(src, dst_expect, dst_maxncpy) \
520 { \
521 char dst[dst_maxncpy + 1]; \
522 dst[dst_maxncpy] = 0xff; \
523 size_t len = BLI_strncpy_utf8_rlen(dst, src, dst_maxncpy); \
524 EXPECT_EQ(len, strlen(dst)); \
525 EXPECT_STREQ(dst, dst_expect); \
526 EXPECT_EQ(dst[dst_maxncpy], 0xff); \
527 }
528
529 /* Single characters. */
532
536
541
547
554
562
563 /* Multiple characters. */
569
577
587
599
613
614#undef STRNCPY_UTF8_TRUNC_EXPECT
615}
616
617TEST(string, StrCopyUTF8_TerminateEncodingEarly)
618{
619 /* A UTF8 sequence that has a null byte before the sequence ends.
620 * Ensure the UTF8 sequence does not step over the null byte. */
621#define STRNCPY_UTF8_TERMINATE_EARLY(byte_size, ...) \
622 { \
623 char src[] = {__VA_ARGS__, 0}; \
624 EXPECT_EQ(BLI_str_utf8_size_or_error(src), byte_size); \
625 char dst[sizeof(src)]; \
626 memset(dst, 0xff, sizeof(dst)); \
627 STRNCPY_UTF8(dst, src); \
628 EXPECT_EQ(strlen(dst), sizeof(dst) - 1); \
629 EXPECT_STREQ(dst, src); \
630 for (int i = sizeof(dst) - 1; i > 1; i--) { \
631 src[i] = '\0'; \
632 memset(dst, 0xff, sizeof(dst)); \
633 const int dst_copied = STRNCPY_UTF8_RLEN(dst, src); \
634 EXPECT_STREQ(dst, src); \
635 EXPECT_EQ(strlen(dst), i); \
636 EXPECT_EQ(dst_copied, i); \
637 } \
638 }
639
640 STRNCPY_UTF8_TERMINATE_EARLY(6, 252, 1, 1, 1, 1, 1);
641 STRNCPY_UTF8_TERMINATE_EARLY(5, 248, 1, 1, 1, 1);
642 STRNCPY_UTF8_TERMINATE_EARLY(4, 240, 1, 1, 1);
643 STRNCPY_UTF8_TERMINATE_EARLY(3, 224, 1, 1);
646
647#undef STRNCPY_UTF8_TERMINATE_EARLY
648}
649
651
652/* -------------------------------------------------------------------- */
655
656TEST(string, StrPrintfUTF8_ASCII)
657{
658#define SNPRINTF_UTF8_ASCII(...) \
659 { \
660 const char src[] = {__VA_ARGS__, 0}; \
661 char dst[sizeof(src)]; \
662 memset(dst, 0xff, sizeof(dst)); \
663 SNPRINTF_UTF8(dst, "%s", src); \
664 EXPECT_EQ(strlen(dst), sizeof(dst) - 1); \
665 EXPECT_STREQ(dst, src); \
666 }
667
669 SNPRINTF_UTF8_ASCII('a', 'b', 'c');
670
671#undef SNPRINTF_UTF8_ASCII
672}
673
674TEST(string, StrPrintfUTF8_TerminateEncodingEarly)
675{
676 /* A UTF8 sequence that has a null byte before the sequence ends.
677 * Ensure the UTF8 sequence does not step over the null byte. */
678#define SNPRINTF_UTF8_TERMINATE_EARLY(byte_size, ...) \
679 { \
680 char src[] = {__VA_ARGS__, 0}; \
681 EXPECT_EQ(BLI_str_utf8_size_or_error(src), byte_size); \
682 char dst[sizeof(src)]; \
683 memset(dst, 0xff, sizeof(dst)); \
684 SNPRINTF_UTF8(dst, "%s", src); \
685 EXPECT_EQ(strlen(dst), sizeof(dst) - 1); \
686 EXPECT_STREQ(dst, src); \
687 for (int i = sizeof(dst) - 1; i > 1; i--) { \
688 src[i] = '\0'; \
689 memset(dst, 0xff, sizeof(dst)); \
690 const int dst_copied = SNPRINTF_UTF8_RLEN(dst, "%s", src); \
691 EXPECT_STREQ(dst, src); \
692 EXPECT_EQ(strlen(dst), i); \
693 EXPECT_EQ(dst_copied, i); \
694 } \
695 }
696
697 SNPRINTF_UTF8_TERMINATE_EARLY(6, 252, 1, 1, 1, 1, 1);
698 SNPRINTF_UTF8_TERMINATE_EARLY(5, 248, 1, 1, 1, 1);
699 SNPRINTF_UTF8_TERMINATE_EARLY(4, 240, 1, 1, 1);
700 SNPRINTF_UTF8_TERMINATE_EARLY(3, 224, 1, 1);
703
704#undef STRNCPY_UTF8_TERMINATE_EARLY
705}
706
707TEST(string, StrPrintfUTF8_TruncateEncodingMulti)
708{
709#define SNPRINTF_UTF8_TRUNC_EXPECT(src, dst_expect, dst_maxncpy) \
710 { \
711 char dst[dst_maxncpy + 1]; \
712 dst[dst_maxncpy] = 0xff; \
713 size_t len = BLI_snprintf_utf8_rlen(dst, dst_maxncpy, "%s", src); \
714 EXPECT_EQ(len, strlen(dst)); \
715 EXPECT_STREQ(dst, dst_expect); \
716 EXPECT_EQ(dst[dst_maxncpy], 0xff); \
717 }
718
719 /* Single characters. */
722
726
731
737
744
752
753#undef STRNCPY_UTF8_TRUNC_EXPECT
754}
755
757
758/* -------------------------------------------------------------------- */
761
762TEST(string, Utf8OffsetFromIndex_ClampedIndex)
763{
764 /* Ensure an index that exceeds the number of multi-byte characters in the
765 * string has the same behavior as an index which is clamped by the number of code-points. */
766 const char *test_strings[] = {
767 "",
768 "TEST",
775 };
776 for (int i = 0; i < ARRAY_SIZE(test_strings); i++) {
777 const char *str = test_strings[i];
778 const size_t str_len = strlen(str);
779 const int str_len_utf8 = BLI_strlen_utf8(str);
780
781 const int str_offset = BLI_str_utf8_offset_from_index(str, str_len, str_len_utf8);
782 EXPECT_EQ(BLI_str_utf8_offset_from_index(str, str_len, str_len_utf8 + 1), str_offset);
783 EXPECT_EQ(BLI_str_utf8_offset_from_index(str, str_len, str_len_utf8 + 10), str_offset);
784 }
785}
786
788
789/* -------------------------------------------------------------------- */
792
793static size_t utf8_as_char32(const char *str, const char str_len, char32_t *r_result)
794{
795 size_t i = 0, result_len = 0;
796 while ((i < str_len) && (str[i] != '\0')) {
797 r_result[result_len++] = BLI_str_utf8_as_unicode_step_safe(str, str_len, &i);
798 }
799 return i;
800}
801
802template<size_t Size, size_t SizeWithPadding>
803void utf8_as_char32_test_compare_with_pad_bytes(const char utf8_src[Size])
804{
805 char utf8_src_with_pad[SizeWithPadding] = {0};
806
807 memcpy(utf8_src_with_pad, utf8_src, Size);
808
809 char32_t unicode_dst_a[Size], unicode_dst_b[Size];
810
811 memset(unicode_dst_a, 0xff, sizeof(unicode_dst_a));
812 const size_t index_a = utf8_as_char32(utf8_src, Size, unicode_dst_a);
813
814 /* Test with padded and un-padded size,
815 * to ensure that extra available space doesn't yield a different result. */
816 for (int pass = 0; pass < 2; pass++) {
817 memset(unicode_dst_b, 0xff, sizeof(unicode_dst_b));
818 const size_t index_b = utf8_as_char32(
819 utf8_src_with_pad, pass ? Size : SizeWithPadding, unicode_dst_b);
820
821 /* Check the resulting content matches. */
822 EXPECT_EQ_ARRAY(unicode_dst_a, unicode_dst_b, Size);
823 /* Check the index of the source strings match. */
824 EXPECT_EQ(index_a, index_b);
825 }
826}
827
828template<size_t Size> void utf8_as_char32_test_compare(const char utf8_src[Size])
829{
830 /* Note that 7 is a little arbitrary,
831 * chosen since it's the maximum length of multi-byte character + 1
832 * to account for any errors that read past null bytes. */
835}
836
837template<size_t Size> void utf8_as_char32_test_at_buffer_size()
838{
839 char utf8_src[Size];
840
841 /* Test uniform bytes, also with offsets ascending & descending. */
842 for (int i = 0; i <= 0xff; i++) {
843 memset(utf8_src, i, sizeof(utf8_src));
845
846 /* Offset trailing bytes up and down in steps of 1, 2, 4 .. etc. */
847 if (Size > 1) {
848 for (int mul = 1; mul < 256; mul *= 2) {
849 for (int ofs = 1; ofs < int(Size); ofs++) {
850 utf8_src[ofs] = char(i + (ofs * mul));
851 }
853
854 for (int ofs = 1; ofs < int(Size); ofs++) {
855 utf8_src[ofs] = char(i - (ofs * mul));
856 }
858 }
859 }
860 }
861
862 /* Random bytes. */
863 RNG *rng = BLI_rng_new(1);
864 for (int i = 0; i < 256; i++) {
865 BLI_rng_get_char_n(rng, utf8_src, sizeof(utf8_src));
867 }
868 BLI_rng_free(rng);
869}
870
888
890
891/* -------------------------------------------------------------------- */
894
895TEST(string, StrCursorStepNextUtf32Empty)
896{
897 const char32_t empty[] = U"";
898 const size_t len = 0;
899 int pos = 0;
900 EXPECT_FALSE(BLI_str_cursor_step_next_utf32(empty, len, &pos));
901 pos = 1;
902 EXPECT_FALSE(BLI_str_cursor_step_next_utf32(empty, len, &pos));
903}
904
906
907/* -------------------------------------------------------------------- */
910
911TEST(string, StrCursorStepNextUtf32Single)
912
913{
914 const char32_t single[] = U"0";
915 const size_t len = 1;
916 int pos = 0;
917 EXPECT_TRUE(BLI_str_cursor_step_next_utf32(single, len, &pos) && pos == 1);
918 EXPECT_FALSE(BLI_str_cursor_step_next_utf32(single, len, &pos));
919}
920
922
923/* -------------------------------------------------------------------- */
926
927TEST(string, StrCursorStepNextUtf32Simple)
928{
929 const char32_t simple[] = U"012";
930 const size_t len = 3;
931 int pos = 0;
932 EXPECT_TRUE(BLI_str_cursor_step_next_utf32(simple, len, &pos) && pos == 1);
933 EXPECT_TRUE(BLI_str_cursor_step_next_utf32(simple, len, &pos) && pos == 2);
934 EXPECT_FALSE(BLI_str_cursor_step_next_utf32(simple, len - 1, &pos));
935 EXPECT_TRUE(BLI_str_cursor_step_next_utf32(simple, len, &pos) && pos == 3);
936 EXPECT_FALSE(BLI_str_cursor_step_next_utf32(simple, len, &pos));
937}
938
940
941/* -------------------------------------------------------------------- */
944
945TEST(string, StrCursorStepNextUtf32AllCombining)
946{
947 const char32_t allcombining[] = U"\u0300\u0300\u0300";
948 const size_t len = 3;
949 int pos = 0;
950 EXPECT_TRUE(BLI_str_cursor_step_next_utf32(allcombining, len, &pos) && pos == 3);
951 pos = 1;
952 EXPECT_TRUE(BLI_str_cursor_step_next_utf32(allcombining, len, &pos) && pos == 3);
953 pos = 2;
954 EXPECT_TRUE(BLI_str_cursor_step_next_utf32(allcombining, len, &pos) && pos == 3);
955 pos = 3;
956 EXPECT_FALSE(BLI_str_cursor_step_next_utf32(allcombining, len, &pos));
957}
958
960
961/* -------------------------------------------------------------------- */
964
965TEST(string, StrCursorStepNextUtf32Complex)
966{
967 /* Combining character, "A", two combining characters, "B". */
968 const char32_t complex[] = U"\u0300\u0041\u0300\u0320\u0042";
969 const size_t len = 5;
970 int pos = 0;
971 EXPECT_TRUE(BLI_str_cursor_step_next_utf32(complex, len, &pos) && pos == 1);
972 pos = 1;
973 EXPECT_TRUE(BLI_str_cursor_step_next_utf32(complex, len, &pos) && pos == 4);
974 pos = 2;
975 EXPECT_TRUE(BLI_str_cursor_step_next_utf32(complex, len, &pos) && pos == 4);
976 pos = 3;
977 EXPECT_TRUE(BLI_str_cursor_step_next_utf32(complex, len, &pos) && pos == 4);
978 pos = 4;
979 EXPECT_TRUE(BLI_str_cursor_step_next_utf32(complex, len, &pos) && pos == 5);
980 pos = 5;
982}
983
985
986/* -------------------------------------------------------------------- */
989
990TEST(string, StrCursorStepNextUtf32Invalid)
991{
992 /* Latin1 "À", tab, carriage return, linefeed, separated by combining characters. */
993 const char32_t invalid[] = U"\u00C0\u0300\u0009\u0300\u000D\u0300\u000A\u0300";
994 const size_t len = 8;
995 int pos = 0;
996 EXPECT_TRUE(BLI_str_cursor_step_next_utf32(invalid, len, &pos) && pos == 2);
997 pos = 1;
998 EXPECT_TRUE(BLI_str_cursor_step_next_utf32(invalid, len, &pos) && pos == 2);
999 pos = 2;
1000 EXPECT_TRUE(BLI_str_cursor_step_next_utf32(invalid, len, &pos) && pos == 4);
1001 pos = 3;
1002 EXPECT_TRUE(BLI_str_cursor_step_next_utf32(invalid, len, &pos) && pos == 4);
1003 pos = 4;
1004 EXPECT_TRUE(BLI_str_cursor_step_next_utf32(invalid, len, &pos) && pos == 6);
1005 pos = 5;
1006 EXPECT_TRUE(BLI_str_cursor_step_next_utf32(invalid, len, &pos) && pos == 6);
1007 pos = 6;
1008 EXPECT_TRUE(BLI_str_cursor_step_next_utf32(invalid, len, &pos) && pos == 8);
1009 pos = 7;
1010 EXPECT_TRUE(BLI_str_cursor_step_next_utf32(invalid, len, &pos) && pos == 8);
1011 pos = 8;
1012 EXPECT_FALSE(BLI_str_cursor_step_next_utf32(invalid, len, &pos));
1013}
1014
1016
1017/* -------------------------------------------------------------------- */
1020
1021TEST(string, StrCursorStepPrevUtf32Empty)
1022{
1023 const char32_t emtpy[] = U"";
1024 const size_t len = 0;
1025 int pos = 0;
1026 EXPECT_FALSE(BLI_str_cursor_step_prev_utf32(emtpy, len, &pos));
1027}
1028
1030
1031/* -------------------------------------------------------------------- */
1034
1035TEST(string, StrCursorStepPrevUtf32Single)
1036{
1037 const char32_t single[] = U"0";
1038 const size_t len = 1;
1039 int pos = 1;
1040 EXPECT_TRUE(BLI_str_cursor_step_prev_utf32(single, len, &pos) && pos == 0);
1041 EXPECT_FALSE(BLI_str_cursor_step_prev_utf32(single, len, &pos));
1042}
1043
1045
1046/* -------------------------------------------------------------------- */
1049
1050TEST(string, StrCursorStepPrevUtf32Simple)
1051{
1052 const char32_t simple[] = U"012";
1053 const size_t len = 3;
1054 int pos = 3;
1055 EXPECT_TRUE(BLI_str_cursor_step_prev_utf32(simple, len, &pos) && pos == 2);
1056 EXPECT_TRUE(BLI_str_cursor_step_prev_utf32(simple, len, &pos) && pos == 1);
1057 EXPECT_TRUE(BLI_str_cursor_step_prev_utf32(simple, len, &pos) && pos == 0);
1058 EXPECT_FALSE(BLI_str_cursor_step_prev_utf32(simple, len, &pos));
1059}
1060
1062
1063/* -------------------------------------------------------------------- */
1066
1067TEST(string, StrCursorStepPrevUtf32AllCombining)
1068{
1069 const char32_t allcombining[] = U"\u0300\u0300\u0300";
1070 const size_t len = 3;
1071 int pos = 3;
1072 EXPECT_TRUE(BLI_str_cursor_step_prev_utf32(allcombining, len, &pos) && pos == 0);
1073 pos = 2;
1074 EXPECT_TRUE(BLI_str_cursor_step_prev_utf32(allcombining, len, &pos) && pos == 0);
1075 pos = 1;
1076 EXPECT_TRUE(BLI_str_cursor_step_prev_utf32(allcombining, len, &pos) && pos == 0);
1077 pos = 0;
1078 EXPECT_FALSE(BLI_str_cursor_step_prev_utf32(allcombining, len, &pos));
1079}
1080
1082
1083/* -------------------------------------------------------------------- */
1086
1087TEST(string, StrCursorStepPrevUtf32Complex)
1088{
1089 /* Combining character, "A", two combining characters, "B". */
1090 const char32_t complex[] = U"\u0300\u0041\u0300\u0320\u0042";
1091 const size_t len = 5;
1092 int pos = 5;
1093 EXPECT_TRUE(BLI_str_cursor_step_prev_utf32(complex, len, &pos) && pos == 4);
1094 pos = 4;
1095 EXPECT_TRUE(BLI_str_cursor_step_prev_utf32(complex, len, &pos) && pos == 1);
1096 pos = 3;
1097 EXPECT_TRUE(BLI_str_cursor_step_prev_utf32(complex, len, &pos) && pos == 1);
1098 pos = 2;
1099 EXPECT_TRUE(BLI_str_cursor_step_prev_utf32(complex, len, &pos) && pos == 1);
1100 pos = 1;
1101 EXPECT_TRUE(BLI_str_cursor_step_prev_utf32(complex, len, &pos) && pos == 0);
1102 pos = 0;
1104}
1105
1107
1108/* -------------------------------------------------------------------- */
1111
1112TEST(string, StrCursorStepPrevUtf32Invalid)
1113{
1114 /* Latin1 "À", tab, carriage return, linefeed, separated by combining characters. */
1115 const char32_t invalid[] = U"\u00C0\u0300\u0009\u0300\u000D\u0300\u000A\u0300";
1116 const size_t len = 8;
1117 int pos = 8;
1118 EXPECT_TRUE(BLI_str_cursor_step_prev_utf32(invalid, len, &pos) && pos == 6);
1119 pos = 7;
1120 EXPECT_TRUE(BLI_str_cursor_step_prev_utf32(invalid, len, &pos) && pos == 6);
1121 pos = 6;
1122 EXPECT_TRUE(BLI_str_cursor_step_prev_utf32(invalid, len, &pos) && pos == 4);
1123 pos = 5;
1124 EXPECT_TRUE(BLI_str_cursor_step_prev_utf32(invalid, len, &pos) && pos == 4);
1125 pos = 4;
1126 EXPECT_TRUE(BLI_str_cursor_step_prev_utf32(invalid, len, &pos) && pos == 2);
1127 pos = 3;
1128 EXPECT_TRUE(BLI_str_cursor_step_prev_utf32(invalid, len, &pos) && pos == 2);
1129 pos = 2;
1130 EXPECT_TRUE(BLI_str_cursor_step_prev_utf32(invalid, len, &pos) && pos == 0);
1131 pos = 1;
1132 EXPECT_TRUE(BLI_str_cursor_step_prev_utf32(invalid, len, &pos) && pos == 0);
1133 pos = 0;
1134 EXPECT_FALSE(BLI_str_cursor_step_prev_utf32(invalid, len, &pos));
1135}
1136
1138
1139/* -------------------------------------------------------------------- */
1142TEST(string, StrCursorStepNextUtf8Empty)
1143{
1144 const char empty[] = "";
1145 const size_t len = 0;
1146 int pos = 0;
1147 EXPECT_FALSE(BLI_str_cursor_step_next_utf8(empty, len, &pos));
1148 pos = 1;
1149 EXPECT_FALSE(BLI_str_cursor_step_next_utf8(empty, len, &pos));
1150}
1151
1153
1154/* -------------------------------------------------------------------- */
1157TEST(string, StrCursorStepNextUtf8Single)
1158{
1159 const char single[] = "0";
1160 const size_t len = 1;
1161 int pos = 0;
1162 EXPECT_TRUE(BLI_str_cursor_step_next_utf8(single, len, &pos) && pos == 1);
1163 EXPECT_FALSE(BLI_str_cursor_step_next_utf8(single, len, &pos));
1164}
1165
1167
1168/* -------------------------------------------------------------------- */
1171
1172TEST(string, StrCursorStepNextUtf8Simple)
1173{
1174 const char simple[] = "012";
1175 const size_t len = 3;
1176 int pos = 0;
1177 EXPECT_TRUE(BLI_str_cursor_step_next_utf8(simple, len, &pos) && pos == 1);
1178 EXPECT_TRUE(BLI_str_cursor_step_next_utf8(simple, len, &pos) && pos == 2);
1179 EXPECT_FALSE(BLI_str_cursor_step_next_utf8(simple, len - 1, &pos));
1180 EXPECT_TRUE(BLI_str_cursor_step_next_utf8(simple, len, &pos) && pos == 3);
1181 EXPECT_FALSE(BLI_str_cursor_step_next_utf8(simple, len, &pos));
1182}
1183
1185
1186/* -------------------------------------------------------------------- */
1189
1190TEST(string, StrCursorStepNextUtf8AllCombining)
1191{
1192 const char allcombining[] = "\xCC\x80\xCC\x80\xCC\x80";
1193 const size_t len = 6;
1194 int pos = 0;
1195 EXPECT_TRUE(BLI_str_cursor_step_next_utf8(allcombining, len, &pos) && pos == 6);
1196 pos = 1;
1197 EXPECT_TRUE(BLI_str_cursor_step_next_utf8(allcombining, len, &pos) && pos == 6);
1198 pos = 2;
1199 EXPECT_TRUE(BLI_str_cursor_step_next_utf8(allcombining, len, &pos) && pos == 6);
1200 pos = 3;
1201 EXPECT_TRUE(BLI_str_cursor_step_next_utf8(allcombining, len, &pos) && pos == 6);
1202 pos = 4;
1203 EXPECT_TRUE(BLI_str_cursor_step_next_utf8(allcombining, len, &pos) && pos == 6);
1204 pos = 5;
1205 EXPECT_TRUE(BLI_str_cursor_step_next_utf8(allcombining, len, &pos) && pos == 6);
1206 pos = 6;
1207 EXPECT_FALSE(BLI_str_cursor_step_next_utf8(allcombining, len, &pos));
1208}
1209
1211
1212/* -------------------------------------------------------------------- */
1215
1216TEST(string, StrCursorStepNextUtf8AllComplex)
1217{
1218 /* Combining character, "A", "©", two combining characters, "B". */
1219 const char complex[] = "\xCC\x80\x41\xC2\xA9\xCC\x80\xCC\xA0\x42";
1220 const size_t len = 10;
1221 int pos = 0;
1222 EXPECT_TRUE(BLI_str_cursor_step_next_utf8(complex, len, &pos) && pos == 2);
1223 pos = 1;
1224 EXPECT_TRUE(BLI_str_cursor_step_next_utf8(complex, len, &pos) && pos == 2);
1225 pos = 2;
1226 EXPECT_TRUE(BLI_str_cursor_step_next_utf8(complex, len, &pos) && pos == 3);
1227 pos = 3;
1228 EXPECT_TRUE(BLI_str_cursor_step_next_utf8(complex, len, &pos) && pos == 9);
1229 pos = 4;
1230 EXPECT_TRUE(BLI_str_cursor_step_next_utf8(complex, len, &pos) && pos == 9);
1231 pos = 5;
1232 EXPECT_TRUE(BLI_str_cursor_step_next_utf8(complex, len, &pos) && pos == 9);
1233 pos = 6;
1234 EXPECT_TRUE(BLI_str_cursor_step_next_utf8(complex, len, &pos) && pos == 9);
1235 pos = 7;
1236 EXPECT_TRUE(BLI_str_cursor_step_next_utf8(complex, len, &pos) && pos == 9);
1237 pos = 8;
1238 EXPECT_TRUE(BLI_str_cursor_step_next_utf8(complex, len, &pos) && pos == 9);
1239 pos = 9;
1240 EXPECT_TRUE(BLI_str_cursor_step_next_utf8(complex, len, &pos) && pos == 10);
1241 pos = 10;
1242 EXPECT_FALSE(BLI_str_cursor_step_next_utf8(complex, len, &pos));
1243}
1244
1246
1247/* -------------------------------------------------------------------- */
1250
1251TEST(string, StrCursorStepNextUtf8Invalid)
1252{
1253 /* Latin1 "À", combining, tab, carriage return, linefeed, combining. */
1254 const char invalid[] = "\xC0\xCC\x80\x09\x0D\x0A\xCC\x80";
1255 const size_t len = 8;
1256 int pos = 0;
1257 EXPECT_EQ(BLI_str_cursor_step_next_utf8(invalid, len, &pos) ? pos : -1, 3);
1258 pos = 1;
1259 EXPECT_EQ(BLI_str_cursor_step_next_utf8(invalid, len, &pos) ? pos : -1, 3);
1260 pos = 2;
1261 EXPECT_EQ(BLI_str_cursor_step_next_utf8(invalid, len, &pos) ? pos : -1, 3);
1262 pos = 3;
1263 EXPECT_EQ(BLI_str_cursor_step_next_utf8(invalid, len, &pos) ? pos : -1, 4);
1264 pos = 4;
1265 EXPECT_EQ(BLI_str_cursor_step_next_utf8(invalid, len, &pos) ? pos : -1, 5);
1266 pos = 5;
1267 EXPECT_EQ(BLI_str_cursor_step_next_utf8(invalid, len, &pos) ? pos : -1, 8);
1268 pos = 6;
1269 EXPECT_EQ(BLI_str_cursor_step_next_utf8(invalid, len, &pos) ? pos : -1, 8);
1270 pos = 7;
1271 EXPECT_EQ(BLI_str_cursor_step_next_utf8(invalid, len, &pos) ? pos : -1, 8);
1272 pos = 8;
1273 EXPECT_FALSE(BLI_str_cursor_step_next_utf8(invalid, len, &pos));
1274}
1275
1277
1278/* -------------------------------------------------------------------- */
1281
1282TEST(string, StrCursorStepPrevUtf8Empty)
1283{
1284 const char empty[] = "";
1285 const size_t len = 0;
1286 int pos = 0;
1287 EXPECT_FALSE(BLI_str_cursor_step_prev_utf8(empty, len, &pos));
1288 pos = 1;
1289 EXPECT_FALSE(BLI_str_cursor_step_prev_utf8(empty, len, &pos));
1290}
1291
1293
1294/* -------------------------------------------------------------------- */
1297
1298TEST(string, StrCursorStepPrevUtf8Single)
1299{
1300 const char single[] = "0";
1301 const size_t len = 1;
1302 int pos = 1;
1303 EXPECT_TRUE(BLI_str_cursor_step_prev_utf8(single, len, &pos) && pos == 0);
1304 EXPECT_FALSE(BLI_str_cursor_step_prev_utf8(single, len, &pos));
1305}
1306
1308
1309/* -------------------------------------------------------------------- */
1312
1313TEST(string, StrCursorStepPrevUtf8Simple)
1314{
1315 const char simple[] = "012";
1316 const size_t len = 3;
1317 int pos = 3;
1318 EXPECT_TRUE(BLI_str_cursor_step_prev_utf8(simple, len, &pos) && pos == 2);
1319 EXPECT_TRUE(BLI_str_cursor_step_prev_utf8(simple, len, &pos) && pos == 1);
1320 EXPECT_TRUE(BLI_str_cursor_step_prev_utf8(simple, len, &pos) && pos == 0);
1321 EXPECT_FALSE(BLI_str_cursor_step_prev_utf8(simple, len, &pos));
1322}
1323
1325
1326/* -------------------------------------------------------------------- */
1329
1330TEST(string, StrCursorStepPrevUtf8AllCombining)
1331{
1332 const char allcombining[] = "\xCC\x80\xCC\x80\xCC\x80";
1333 const size_t len = 6;
1334 int pos = 6;
1335 EXPECT_TRUE(BLI_str_cursor_step_prev_utf8(allcombining, len, &pos) && pos == 0);
1336 pos = 5;
1337 EXPECT_TRUE(BLI_str_cursor_step_prev_utf8(allcombining, len, &pos) && pos == 0);
1338 pos = 4;
1339 EXPECT_TRUE(BLI_str_cursor_step_prev_utf8(allcombining, len, &pos) && pos == 0);
1340 pos = 3;
1341 EXPECT_TRUE(BLI_str_cursor_step_prev_utf8(allcombining, len, &pos) && pos == 0);
1342 pos = 2;
1343 EXPECT_TRUE(BLI_str_cursor_step_prev_utf8(allcombining, len, &pos) && pos == 0);
1344 pos = 1;
1345 EXPECT_TRUE(BLI_str_cursor_step_prev_utf8(allcombining, len, &pos) && pos == 0);
1346 pos = 0;
1347 EXPECT_FALSE(BLI_str_cursor_step_prev_utf8(allcombining, len, &pos));
1348}
1349
1351
1352/* -------------------------------------------------------------------- */
1355
1356TEST(string, StrCursorStepPrevUtf8Complex)
1357{
1358 /* Combining character, "A", "©", two combining characters, "B". */
1359 const char complex[] = "\xCC\x80\x41\xC2\xA9\xCC\x80\xCC\xA0\x42";
1360 const size_t len = 10;
1361 int pos = 10;
1362 EXPECT_TRUE(BLI_str_cursor_step_prev_utf8(complex, len, &pos) && pos == 9);
1363 pos = 9;
1364 EXPECT_TRUE(BLI_str_cursor_step_prev_utf8(complex, len, &pos) && pos == 3);
1365 pos = 8;
1366 EXPECT_TRUE(BLI_str_cursor_step_prev_utf8(complex, len, &pos) && pos == 3);
1367 pos = 7;
1368 EXPECT_TRUE(BLI_str_cursor_step_prev_utf8(complex, len, &pos) && pos == 3);
1369 pos = 6;
1370 EXPECT_TRUE(BLI_str_cursor_step_prev_utf8(complex, len, &pos) && pos == 3);
1371 pos = 5;
1372 EXPECT_TRUE(BLI_str_cursor_step_prev_utf8(complex, len, &pos) && pos == 3);
1373 pos = 4;
1374 EXPECT_TRUE(BLI_str_cursor_step_prev_utf8(complex, len, &pos) && pos == 3);
1375 pos = 3;
1376 EXPECT_TRUE(BLI_str_cursor_step_prev_utf8(complex, len, &pos) && pos == 2);
1377 pos = 2;
1378 EXPECT_TRUE(BLI_str_cursor_step_prev_utf8(complex, len, &pos) && pos == 0);
1379 pos = 1;
1380 EXPECT_TRUE(BLI_str_cursor_step_prev_utf8(complex, len, &pos) && pos == 0);
1381 pos = 0;
1382 EXPECT_FALSE(BLI_str_cursor_step_prev_utf8(complex, len, &pos));
1383}
1384
1386
1387/* -------------------------------------------------------------------- */
1390
1391TEST(string, StrCursorStepPrevUtf8Invalid)
1392{
1393 /* Latin1 "À", combining, tab, carriage return, linefeed, combining. */
1394 const char invalid[] = "\xC0\xCC\x80\x09\x0D\x0A\xCC\x80";
1395 const size_t len = 8;
1396 int pos = 8;
1397 EXPECT_TRUE(BLI_str_cursor_step_prev_utf8(invalid, len, &pos) && pos == 5);
1398 pos = 7;
1399 EXPECT_TRUE(BLI_str_cursor_step_prev_utf8(invalid, len, &pos) && pos == 5);
1400 pos = 6;
1401 EXPECT_TRUE(BLI_str_cursor_step_prev_utf8(invalid, len, &pos) && pos == 5);
1402 pos = 5;
1403 EXPECT_TRUE(BLI_str_cursor_step_prev_utf8(invalid, len, &pos) && pos == 4);
1404 pos = 4;
1405 EXPECT_TRUE(BLI_str_cursor_step_prev_utf8(invalid, len, &pos) && pos == 3);
1406 pos = 3;
1407 EXPECT_TRUE(BLI_str_cursor_step_prev_utf8(invalid, len, &pos) && pos == 0);
1408 pos = 2;
1409 EXPECT_TRUE(BLI_str_cursor_step_prev_utf8(invalid, len, &pos) && pos == 0);
1410 pos = 1;
1411 EXPECT_TRUE(BLI_str_cursor_step_prev_utf8(invalid, len, &pos) && pos == 0);
1412 pos = 0;
1413 EXPECT_FALSE(BLI_str_cursor_step_prev_utf8(invalid, len, &pos));
1414}
1415
EXPECT_EQ(BLI_expr_pylike_eval(expr, nullptr, 0, &result), EXPR_PYLIKE_INVALID)
Random number functions.
struct RNG * BLI_rng_new(unsigned int seed)
Definition rand.cc:39
void BLI_rng_free(struct RNG *rng) ATTR_NONNULL(1)
Definition rand.cc:53
void BLI_rng_get_char_n(RNG *rng, char *bytes, size_t bytes_len) ATTR_NONNULL(1
bool BLI_str_cursor_step_prev_utf32(const char32_t *str, int str_maxlen, int *pos)
bool BLI_str_cursor_step_next_utf8(const char *str, int str_maxlen, int *pos)
bool BLI_str_cursor_step_prev_utf8(const char *str, int str_maxlen, int *pos)
bool BLI_str_cursor_step_next_utf32(const char32_t *str, int str_maxlen, int *pos)
int BLI_str_utf8_offset_from_index(const char *str, size_t str_len, int index_target) ATTR_WARN_UNUSED_RESULT ATTR_NONNULL(1)
int BLI_str_utf8_invalid_substitute(char *str, size_t str_len, const char substitute) ATTR_NONNULL(1)
ptrdiff_t BLI_str_utf8_invalid_byte(const char *str, size_t str_len) ATTR_NONNULL(1)
size_t size_t BLI_strnlen_utf8(const char *strc, size_t strc_maxlen) ATTR_NONNULL(1) ATTR_WARN_UNUSED_RESULT
size_t BLI_strlen_utf8(const char *strc) ATTR_NONNULL(1) ATTR_WARN_UNUSED_RESULT
unsigned int BLI_str_utf8_as_unicode_step_safe(const char *__restrict p, size_t p_len, size_t *__restrict index) ATTR_WARN_UNUSED_RESULT ATTR_NONNULL(1
int BLI_str_utf8_invalid_strip(char *str, size_t str_len) ATTR_NONNULL(1)
#define STR_MB_ALPHA_1
void utf8_as_char32_test_at_buffer_size()
#define STR_MB_ALPHA_5
#define SNPRINTF_UTF8_ASCII(...)
#define STRNCPY_UTF8_TRUNCATE(byte_size,...)
#define SNPRINTF_UTF8_TERMINATE_EARLY(byte_size,...)
#define EXPECT_BYTE_OFFSET(truncate_ofs, expect_nchars)
#define STRNCPY_UTF8_ASCII_TRUNCATE(maxncpy,...)
#define STRNCPY_UTF8_ASCII(...)
#define STR_MB_ALPHA_3
#define STRNCPY_UTF8_TERMINATE_EARLY(byte_size,...)
static size_t utf8_as_char32(const char *str, const char str_len, char32_t *r_result)
#define STR_MB_ALPHA_2
#define SNPRINTF_UTF8_TRUNC_EXPECT(src, dst_expect, dst_maxncpy)
void utf8_as_char32_test_compare_with_pad_bytes(const char utf8_src[Size])
void utf8_as_char32_test_compare(const char utf8_src[Size])
TEST(string, Utf8InvalidBytesStrip)
#define STR_MB_ALPHA_6
#define STRNCPY_UTF8_TRUNC_EXPECT(src, dst_expect, dst_maxncpy)
static const char * utf8_invalid_tests[][3]
#define ARRAY_ARG(...)
#define STR_MB_ALPHA_4
#define TEST_SIMPLE(src_chars, expected_error_count, expected_str)
#define ARRAY_SIZE(arr)
#define U
static void mul(btAlignedObjectArray< T > &items, const Q &value)
#define str(s)
uint pos
#define printf(...)
Definition rand.cc:33
i
Definition text_draw.cc:230
uint len