Blender V4.3
string_search.cc
Go to the documentation of this file.
1/* SPDX-FileCopyrightText: 2023 Blender Authors
2 *
3 * SPDX-License-Identifier: GPL-2.0-or-later */
4
5#include "BLI_array.hh"
8#include "BLI_span.hh"
9#include "BLI_string.h"
10#include "BLI_string_ref.hh"
11#include "BLI_string_search.hh"
12#include "BLI_string_utf8.h"
14#include "BLI_task.hh"
15
16/* Right arrow, keep in sync with #UI_MENU_ARROW_SEP in `UI_interface.hh`. */
17#define UI_MENU_ARROW_SEP BLI_STR_UTF8_BLACK_RIGHT_POINTING_SMALL_TRIANGLE
18#define UI_MENU_ARROW_SEP_UNICODE 0x25b8
19
20namespace blender::string_search {
21
23{
24 return int64_t(BLI_strnlen_utf8(str.data(), size_t(str.size())));
25}
26
28{
29 constexpr int deletion_cost = 1;
30 constexpr int insertion_cost = 1;
31 constexpr int substitution_cost = 1;
32 constexpr int transposition_cost = 1;
33
34 const int size_a = count_utf8_code_points(a);
35 const int size_b = count_utf8_code_points(b);
36
37 /* Instead of keeping the entire table in memory, only keep three rows. The algorithm only
38 * accesses these rows and nothing older.
39 * All three rows are usually allocated on the stack. At most a single heap allocation is done,
40 * if the reserved stack space is too small. */
41 const int row_length = size_b + 1;
42 Array<int, 64> rows(row_length * 3);
43
44 /* Store rows as spans so that it is cheap to swap them. */
45 MutableSpan v0{rows.data() + row_length * 0, row_length};
46 MutableSpan v1{rows.data() + row_length * 1, row_length};
47 MutableSpan v2{rows.data() + row_length * 2, row_length};
48
49 /* Only v1 needs to be initialized. */
50 for (const int i : IndexRange(row_length)) {
51 v1[i] = i * insertion_cost;
52 }
53
54 uint32_t prev_unicode_a;
55 size_t offset_a = 0;
56 for (const int i : IndexRange(size_a)) {
57 v2[0] = (i + 1) * deletion_cost;
58
59 const uint32_t unicode_a = BLI_str_utf8_as_unicode_step_safe(a.data(), a.size(), &offset_a);
60
61 uint32_t prev_unicode_b;
62 size_t offset_b = 0;
63 for (const int j : IndexRange(size_b)) {
64 const uint32_t unicode_b = BLI_str_utf8_as_unicode_step_safe(b.data(), b.size(), &offset_b);
65
66 /* Check how costly the different operations would be and pick the cheapest - the one with
67 * minimal cost. */
68 int new_cost = std::min({v1[j + 1] + deletion_cost,
69 v2[j] + insertion_cost,
70 v1[j] + (unicode_a != unicode_b) * substitution_cost});
71 if (i > 0 && j > 0) {
72 if (unicode_a == prev_unicode_b && prev_unicode_a == unicode_b) {
73 new_cost = std::min(new_cost, v0[j - 1] + transposition_cost);
74 }
75 }
76
77 v2[j + 1] = new_cost;
78 prev_unicode_b = unicode_b;
79 }
80
81 /* Swap the three rows, so that the next row can be computed. */
82 std::tie(v0, v1, v2) = std::tuple<MutableSpan<int>, MutableSpan<int>, MutableSpan<int>>(
83 v1, v2, v0);
84 prev_unicode_a = unicode_a;
85 }
86
87 return v1.last();
88}
89
91{
92 /* If it is a perfect partial match, return immediately. */
93 if (full.find(query) != StringRef::not_found) {
94 return 0;
95 }
96
97 const int query_size = count_utf8_code_points(query);
98 const int full_size = count_utf8_code_points(full);
99
100 /* If there is only a single character which is not in the full string, this is not a match. */
101 if (query_size == 1) {
102 return -1;
103 }
104 BLI_assert(query.size() >= 2);
105
106 /* Allow more errors when the size grows larger. */
107 const int max_errors = query_size <= 1 ? 0 : query_size / 8 + 1;
108
109 /* If the query is too large, this cannot be a match. */
110 if (query_size - full_size > max_errors) {
111 return -1;
112 }
113
114 const uint32_t query_first_unicode = BLI_str_utf8_as_unicode_safe(query.data());
115 const uint32_t query_second_unicode = BLI_str_utf8_as_unicode_safe(
116 query.data() + BLI_str_utf8_size_safe(query.data()));
117
118 const char *full_begin = full.begin();
119 const char *full_end = full.end();
120
121 const char *window_begin = full_begin;
122 const char *window_end = window_begin;
123 const int window_size = std::min(query_size + max_errors, full_size);
124 const int extra_chars = window_size - query_size;
125 const int max_acceptable_distance = max_errors + extra_chars;
126
127 for (int i = 0; i < window_size; i++) {
128 window_end += BLI_str_utf8_size_safe(window_end);
129 }
130
131 while (true) {
132 StringRef window{window_begin, window_end};
133 const uint32_t window_begin_unicode = BLI_str_utf8_as_unicode_safe(window_begin);
134 int distance = 0;
135 /* Expect that the first or second character of the query is correct. This helps to avoid
136 * computing the more expensive distance function. */
137 if (ELEM(window_begin_unicode, query_first_unicode, query_second_unicode)) {
138 distance = damerau_levenshtein_distance(query, window);
139 if (distance <= max_acceptable_distance) {
140 return distance;
141 }
142 }
143 if (window_end == full_end) {
144 return -1;
145 }
146
147 /* When the distance is way too large, we can skip a couple of code points, because the
148 * distance can't possibly become as short as required. */
149 const int window_offset = std::max(1, distance / 2);
150 for (int i = 0; i < window_offset && window_end < full_end; i++) {
151 window_begin += BLI_str_utf8_size_safe(window_begin);
152 window_end += BLI_str_utf8_size_safe(window_end);
153 }
154 }
155}
156
157static constexpr int unused_word = -1;
158
161
163 {
164 int count = 0;
165 for (const int i : this->matched_word_indices) {
166 if (item.word_group_ids[i] == item.main_group_id) {
167 count++;
168 }
169 }
170 return count;
171 }
172
173 bool better_than(const InitialsMatch &other, const SearchItem &item) const
174 {
175 return this->count_main_group_matches(item) > other.count_main_group_matches(item);
176 }
177};
178
186static std::optional<InitialsMatch> match_word_initials(StringRef query,
187 const SearchItem &item,
188 const Span<int> word_match_map,
189 int start = 0)
190{
191 const Span<StringRef> words = item.normalized_words;
192 if (start >= words.size()) {
193 return std::nullopt;
194 }
195
196 InitialsMatch match;
197
198 size_t query_index = 0;
199 int word_index = start;
200 size_t char_index = 0;
201
202 int first_found_word_index = -1;
203
204 while (query_index < query.size()) {
205 const uint query_unicode = BLI_str_utf8_as_unicode_step_safe(
206 query.data(), query.size(), &query_index);
207 while (true) {
208 /* We are at the end of words, no complete match has been found yet. */
209 if (word_index >= words.size()) {
210 if (first_found_word_index >= 0) {
211 /* Try starting to match at another word. In some cases one can still find matches this
212 * way. */
213 return match_word_initials(query, item, word_match_map, first_found_word_index + 1);
214 }
215 return std::nullopt;
216 }
217
218 /* Skip words that the caller does not want us to use. */
219 if (word_match_map[word_index] != unused_word) {
220 word_index++;
221 BLI_assert(char_index == 0);
222 continue;
223 }
224
225 StringRef word = words[word_index];
226 /* Try to match the current character with the current word. */
227 if (int(char_index) < word.size()) {
228 const uint32_t char_unicode = BLI_str_utf8_as_unicode_step_safe(
229 word.data(), word.size(), &char_index);
230 if (query_unicode == char_unicode) {
231 match.matched_word_indices.append(word_index);
232 if (first_found_word_index == -1) {
233 first_found_word_index = word_index;
234 }
235 break;
236 }
237 }
238
239 /* Could not find a match in the current word, go to the beginning of the next word. */
240 word_index += 1;
241 char_index = 0;
242 }
243 }
244 /* Check if we can find a better match that starts at a later word. */
245 if (std::optional<InitialsMatch> sub_match = match_word_initials(
246 query, item, word_match_map, first_found_word_index + 1))
247 {
248 if (sub_match->better_than(match, item)) {
249 return sub_match;
250 }
251 }
252 return match;
253}
254
259 const SearchItem &item,
260 Span<int> word_match_map,
261 Span<StringRef> remaining_query_words)
262{
263 /* If there is another word in the remaining full query that contains the current word, we have
264 * to pick the shortest word. If we pick a longer one, it can happen that the other query word
265 * does not have a match anymore. This can lead to a situation where a query does not match
266 * itself anymore.
267 *
268 * E.g. the full query `T > Test` wouldn't match itself anymore if `Test` has a higher weight.
269 * That's because the `T` would be matched with the `Test`, but then `Test` can't match `Test
270 * anymore because that's taken up already.
271 *
272 * If we don't have to pick the shortest match for correctness, pick the one with the largest
273 * weight instead.
274 */
275 bool use_shortest_match = false;
276 for (const StringRef other_word : remaining_query_words) {
277 if (other_word.startswith(query)) {
278 use_shortest_match = true;
279 break;
280 }
281 }
282
283 int best_word_size = INT32_MAX;
284 int best_word_index = -1;
285 bool best_word_in_main_group = false;
286 for (const int i : item.normalized_words.index_range()) {
287 if (word_match_map[i] != unused_word) {
288 continue;
289 }
290 StringRef word = item.normalized_words[i];
291 const bool word_in_main_group = item.word_group_ids[i] == item.main_group_id;
292 if (word.startswith(query)) {
293 bool found_new_best = false;
294 if (use_shortest_match) {
295 if (word.size() < best_word_size) {
296 found_new_best = true;
297 }
298 }
299 else {
300 if (!best_word_in_main_group) {
301 found_new_best = true;
302 }
303 }
304 if (found_new_best) {
305 best_word_index = i;
306 best_word_size = word.size();
307 best_word_in_main_group = word_in_main_group;
308 }
309 }
310 }
311 return best_word_index;
312}
313
315 Span<StringRef> words,
316 Span<int> word_match_map,
317 int *r_error_count)
318{
319 for (const int i : words.index_range()) {
320 if (word_match_map[i] != unused_word) {
321 continue;
322 }
323 StringRef word = words[i];
324 const int error_count = get_fuzzy_match_errors(query, word);
325 if (error_count >= 0) {
326 *r_error_count = error_count;
327 return i;
328 }
329 }
330 return -1;
331}
332
337static std::optional<float> score_query_against_words(Span<StringRef> query_words,
338 const SearchItem &item)
339{
340 /* A mapping from #result_words to #query_words. It's mainly used to determine if a word has been
341 * matched already to avoid matching it again. */
342 Array<int, 64> word_match_map(item.normalized_words.size(), unused_word);
343
344 /* Start with some high score, because otherwise the final score might become negative. */
345 float total_match_score = item.is_deprecated ? 500 : 1000;
346
347 for (const int query_word_index : query_words.index_range()) {
348 const StringRef query_word = query_words[query_word_index];
349 {
350 /* Check if any result word begins with the query word. */
351 const int word_index = get_best_word_index_that_startswith(
352 query_word, item, word_match_map, query_words.drop_front(query_word_index + 1));
353 if (word_index >= 0) {
354 /* Give a match in a main group higher priority. */
355 const bool is_main_group = item.word_group_ids[word_index] == item.main_group_id;
356 total_match_score += is_main_group ? 10 : 9;
357 word_match_map[word_index] = query_word_index;
358 continue;
359 }
360 }
361 {
362 /* Try to match against word initials. */
363 if (std::optional<InitialsMatch> match = match_word_initials(
364 query_word, item, word_match_map))
365 {
366 /* If the all matched words are in the main group, give the match a higher priority. */
367 bool all_main_group_matches = match->count_main_group_matches(item) ==
368 match->matched_word_indices.size();
369 total_match_score += all_main_group_matches ? 4 : 3;
370 for (const int i : match->matched_word_indices) {
371 word_match_map[i] = query_word_index;
372 }
373 continue;
374 }
375 }
376 {
377 /* Fuzzy match against words. */
378 int error_count = 0;
379 const int word_index = get_word_index_that_fuzzy_matches(
380 query_word, item.normalized_words, word_match_map, &error_count);
381 if (word_index >= 0) {
382 total_match_score += 3 - error_count;
383 word_match_map[word_index] = query_word_index;
384 continue;
385 }
386 }
387
388 /* Couldn't match query word with anything. */
389 return std::nullopt;
390 }
391
392 {
393 /* Add penalty when query words are not in the correct order. */
394 Vector<int> match_indices;
395 for (const int index : word_match_map) {
396 if (index != unused_word) {
397 match_indices.append(index);
398 }
399 }
400 if (!match_indices.is_empty()) {
401 for (const int i : IndexRange(match_indices.size() - 1)) {
402 if (match_indices[i] > match_indices[i + 1]) {
403 total_match_score -= 1;
404 }
405 }
406 }
407 }
408
409 return total_match_score;
410}
411
413 LinearAllocator<> &allocator,
414 Vector<StringRef, 64> &r_words,
415 Vector<int, 64> &r_word_group_ids)
416{
417 const uint32_t unicode_space = uint32_t(' ');
418 const uint32_t unicode_dash = uint32_t('-');
419 const uint32_t unicode_underscore = uint32_t('_');
420 const uint32_t unicode_slash = uint32_t('/');
421 const uint32_t unicode_right_triangle = UI_MENU_ARROW_SEP_UNICODE;
422
423 BLI_assert(unicode_space == BLI_str_utf8_as_unicode_safe(" "));
424 BLI_assert(unicode_dash == BLI_str_utf8_as_unicode_safe("-"));
425 BLI_assert(unicode_underscore == BLI_str_utf8_as_unicode_safe("_"));
426 BLI_assert(unicode_slash == BLI_str_utf8_as_unicode_safe("/"));
428
429 auto is_separator = [&](uint32_t unicode) {
430 return ELEM(unicode,
431 unicode_space,
432 unicode_dash,
433 unicode_underscore,
434 unicode_slash,
435 unicode_right_triangle);
436 };
437
438 Vector<int, 64> section_indices;
439
440 /* Make a copy of the string so that we can edit it. */
441 StringRef str_copy = allocator.copy_string(str);
442 char *mutable_copy = const_cast<char *>(str_copy.data());
443 const size_t str_size_in_bytes = size_t(str.size());
444 BLI_str_tolower_ascii(mutable_copy, str_size_in_bytes);
445
446 /* Iterate over all unicode code points to split individual words. */
447 int group_id = 0;
448 bool is_in_word = false;
449 size_t word_start = 0;
450 size_t offset = 0;
451 while (offset < str_size_in_bytes) {
452 size_t size = offset;
453 uint32_t unicode = BLI_str_utf8_as_unicode_step_safe(str.data(), str.size(), &size);
454 size -= offset;
455 if (is_separator(unicode)) {
456 if (is_in_word) {
457 const StringRef word = str_copy.substr(int(word_start), int(offset - word_start));
458 r_words.append(word);
459 r_word_group_ids.append(group_id);
460 is_in_word = false;
461 }
462 }
463 else {
464 if (!is_in_word) {
465 word_start = offset;
466 is_in_word = true;
467 }
468 }
469 if (unicode == unicode_right_triangle) {
470 group_id++;
471 }
472 offset += size;
473 }
474 /* If the last word is not followed by a separator, it has to be handled separately. */
475 if (is_in_word) {
476 const StringRef word = str_copy.drop_prefix(int(word_start));
477 r_words.append(word);
478 r_word_group_ids.append(group_id);
479 }
480}
481
482void StringSearchBase::add_impl(const StringRef str, void *user_data, const float weight)
483{
485 Vector<int, 64> word_group_ids;
487 const int recent_time = recent_cache_ ?
489 -1;
490 int main_group_id = 0;
491 if (!word_group_ids.is_empty()) {
492 switch (main_words_heuristic_) {
494 main_group_id = 0;
495 break;
496 }
498 main_group_id = word_group_ids.last();
499 break;
500 }
502 main_group_id = 0;
503 word_group_ids.fill(0);
504 break;
505 }
506 }
507 }
508
509 int main_group_length = 0;
510 for (const int i : words.index_range()) {
511 if (word_group_ids[i] == main_group_id) {
512 main_group_length += int(words[i].size());
513 }
514 }
515
516 /* Not checking for the "D" to avoid problems with upper/lower-case. */
517 const bool is_deprecated = str.find("eprecated") != StringRef::not_found;
518
519 items_.append({user_data,
521 allocator_.construct_array_copy(word_group_ids.as_span()),
522 main_group_id,
523 main_group_length,
524 int(str.size()),
525 weight,
526 recent_time,
527 is_deprecated});
528}
529
531{
532 LinearAllocator<> allocator;
533 Vector<StringRef, 64> query_words;
534 /* This is just a dummy value that is not used for the query. */
535 Vector<int, 64> word_group_ids;
536 string_search::extract_normalized_words(query, allocator, query_words, word_group_ids);
537
538 /* Compute score of every result. */
539 Array<std::optional<float>> all_scores(items_.size());
540 threading::parallel_for(items_.index_range(), 256, [&](const IndexRange range) {
541 for (const int i : range) {
542 const SearchItem &item = items_[i];
543 const std::optional<float> score = string_search::score_query_against_words(query_words,
544 item);
545 all_scores[i] = score;
546 }
547 });
548 MultiValueMap<float, int> result_indices_by_score;
549 for (const int i : items_.index_range()) {
550 const std::optional<float> score = all_scores[i];
551 if (score.has_value()) {
552 result_indices_by_score.add(*score, i);
553 }
554 }
555
556 Vector<float> found_scores;
557 for (const float score : result_indices_by_score.keys()) {
558 found_scores.append(score);
559 }
560 std::sort(found_scores.begin(), found_scores.end(), std::greater<>());
561
562 /* Add results to output vector in correct order. First come the results with the best match
563 * score. Results with the same score are in the order they have been added to the search. */
564 Vector<int> sorted_result_indices;
565 for (const float score : found_scores) {
566 MutableSpan<int> indices = result_indices_by_score.lookup(score);
567 if (score == found_scores[0]) {
568 if (!query.is_empty()) {
569 /* Sort items with best score by length. Shorter items are more likely the ones you are
570 * looking for. This also ensures that exact matches will be at the top, even if the query
571 * is a sub-string of another item. */
572 std::sort(indices.begin(), indices.end(), [&](int a, int b) {
573 const SearchItem &item_a = items_[a];
574 const SearchItem &item_b = items_[b];
575 /* The length of the main group has priority over the total length. */
576 return item_a.main_group_length < item_b.main_group_length ||
577 (item_a.main_group_length == item_b.main_group_length &&
578 item_a.total_length < item_b.total_length);
579 });
580 /* Prefer items with larger weights. Use `stable_sort` so that if the weights are the same,
581 * the order won't be changed. */
582 std::stable_sort(indices.begin(), indices.end(), [&](int a, int b) {
583 return items_[a].weight > items_[b].weight;
584 });
585 }
586 /* If the query gets longer, it's less likely that accessing recent items is desired. Better
587 * always show the best match in this case. */
588 if (query.size() <= 1) {
589 /* Prefer items that have been selected recently. */
590 std::stable_sort(indices.begin(), indices.end(), [&](int a, int b) {
591 return items_[a].recent_time > items_[b].recent_time;
592 });
593 }
594 }
595 sorted_result_indices.extend(indices);
596 }
597
598 Vector<void *> sorted_data(sorted_result_indices.size());
599 for (const int i : sorted_result_indices.index_range()) {
600 const int result_index = sorted_result_indices[i];
601 const SearchItem &item = items_[result_index];
602 sorted_data[i] = item.user_data;
603 }
604 return sorted_data;
605}
606
607} // namespace blender::string_search
#define BLI_assert(a)
Definition BLI_assert.h:50
void BLI_str_tolower_ascii(char *str, size_t len) ATTR_NONNULL(1)
Definition string.c:952
size_t size_t BLI_strnlen_utf8(const char *strc, size_t strc_maxlen) ATTR_NONNULL(1) ATTR_WARN_UNUSED_RESULT
int BLI_str_utf8_size_safe(const char *p) ATTR_WARN_UNUSED_RESULT ATTR_NONNULL(1)
unsigned int BLI_str_utf8_as_unicode_step_safe(const char *__restrict p, size_t p_len, size_t *__restrict index) ATTR_WARN_UNUSED_RESULT ATTR_NONNULL(1
unsigned int BLI_str_utf8_as_unicode_safe(const char *p) ATTR_WARN_UNUSED_RESULT ATTR_NONNULL(1)
unsigned int uint
#define ELEM(...)
ATTR_WARN_UNUSED_RESULT const BMVert * v2
static DBVT_INLINE btScalar size(const btDbvtVolume &a)
Definition btDbvt.cpp:52
int64_t size() const
Definition BLI_array.hh:245
const T * data() const
Definition BLI_array.hh:301
StringRefNull copy_string(StringRef str)
MutableSpan< T > construct_array_copy(Span< T > src)
Value lookup_default(const Key &key, const Value &default_value) const
Definition BLI_map.hh:531
void add(const Key &key, const Value &value)
constexpr T & last(const int64_t n=0) const
Definition BLI_span.hh:690
constexpr Span drop_front(int64_t n) const
Definition BLI_span.hh:172
constexpr int64_t size() const
Definition BLI_span.hh:253
constexpr IndexRange index_range() const
Definition BLI_span.hh:402
static constexpr int64_t not_found
constexpr StringRef substr(int64_t start, int64_t size) const
constexpr bool startswith(StringRef prefix) const
constexpr int64_t size() const
constexpr const char * data() const
constexpr StringRef drop_prefix(int64_t n) const
int64_t size() const
void append(const T &value)
const T & last(const int64_t n=0) const
bool is_empty() const
IndexRange index_range() const
void fill(const T &value) const
Span< T > as_span() const
void add_impl(StringRef str, void *user_data, float weight)
Vector< void * > query_impl(StringRef query) const
local_group_size(16, 16) .push_constant(Type b
draw_view push_constant(Type::INT, "radiance_src") .push_constant(Type capture_info_buf storage_buf(1, Qualifier::READ, "ObjectBounds", "bounds_buf[]") .push_constant(Type draw_view int
#define str(s)
int count
static int64_t count_utf8_code_points(StringRef str)
static constexpr int unused_word
int get_fuzzy_match_errors(StringRef query, StringRef full)
static std::optional< InitialsMatch > match_word_initials(StringRef query, const SearchItem &item, const Span< int > word_match_map, int start=0)
static int get_best_word_index_that_startswith(StringRef query, const SearchItem &item, Span< int > word_match_map, Span< StringRef > remaining_query_words)
int damerau_levenshtein_distance(StringRef a, StringRef b)
static std::optional< float > score_query_against_words(Span< StringRef > query_words, const SearchItem &item)
static int get_word_index_that_fuzzy_matches(StringRef query, Span< StringRef > words, Span< int > word_match_map, int *r_error_count)
void extract_normalized_words(StringRef str, LinearAllocator<> &allocator, Vector< StringRef, 64 > &r_words, Vector< int, 64 > &r_word_group_ids)
void parallel_for(const IndexRange range, const int64_t grain_size, const Function &function, const TaskSizeHints &size_hints=detail::TaskSizeHints_Static(1))
Definition BLI_task.hh:95
float distance(float a, float b)
#define INT32_MAX
Definition stdint.h:137
unsigned int uint32_t
Definition stdint.h:80
__int64 int64_t
Definition stdint.h:89
#define UI_MENU_ARROW_SEP
#define UI_MENU_ARROW_SEP_UNICODE
int count_main_group_matches(const SearchItem &item) const
bool better_than(const InitialsMatch &other, const SearchItem &item) const
Map< std::string, int > logical_time_by_str