Blender V5.0
string_search.cc
Go to the documentation of this file.
1/* SPDX-FileCopyrightText: 2023 Blender Authors
2 *
3 * SPDX-License-Identifier: GPL-2.0-or-later */
4
8
9#include "BLI_array.hh"
12#include "BLI_span.hh"
13#include "BLI_string.h"
14#include "BLI_string_ref.hh"
15#include "BLI_string_search.hh"
16#include "BLI_string_utf8.h"
18#include "BLI_task.hh"
19
20/* Right arrow, keep in sync with #UI_MENU_ARROW_SEP in `UI_interface_c.hh`. */
21#define UI_MENU_ARROW_SEP BLI_STR_UTF8_BLACK_RIGHT_POINTING_SMALL_TRIANGLE
22#define UI_MENU_ARROW_SEP_UNICODE 0x25b8
23
24namespace blender::string_search {
25
27{
28 return int64_t(BLI_strnlen_utf8(str.data(), size_t(str.size())));
29}
30
32{
33 constexpr int deletion_cost = 1;
34 constexpr int insertion_cost = 1;
35 constexpr int substitution_cost = 1;
36 constexpr int transposition_cost = 1;
37
38 const int size_a = count_utf8_code_points(a);
39 const int size_b = count_utf8_code_points(b);
40
41 /* Instead of keeping the entire table in memory, only keep three rows. The algorithm only
42 * accesses these rows and nothing older.
43 * All three rows are usually allocated on the stack. At most a single heap allocation is done,
44 * if the reserved stack space is too small. */
45 const int row_length = size_b + 1;
46 Array<int, 64> rows(row_length * 3);
47
48 /* Store rows as spans so that it is cheap to swap them. */
49 MutableSpan v0{rows.data() + row_length * 0, row_length};
50 MutableSpan v1{rows.data() + row_length * 1, row_length};
51 MutableSpan v2{rows.data() + row_length * 2, row_length};
52
53 /* Only v1 needs to be initialized. */
54 for (const int i : IndexRange(row_length)) {
55 v1[i] = i * insertion_cost;
56 }
57
58 uint32_t prev_unicode_a;
59 size_t offset_a = 0;
60 for (const int i : IndexRange(size_a)) {
61 v2[0] = (i + 1) * deletion_cost;
62
63 const uint32_t unicode_a = BLI_str_utf8_as_unicode_step_safe(a.data(), a.size(), &offset_a);
64
65 uint32_t prev_unicode_b;
66 size_t offset_b = 0;
67 for (const int j : IndexRange(size_b)) {
68 const uint32_t unicode_b = BLI_str_utf8_as_unicode_step_safe(b.data(), b.size(), &offset_b);
69
70 /* Check how costly the different operations would be and pick the cheapest - the one with
71 * minimal cost. */
72 int new_cost = std::min({v1[j + 1] + deletion_cost,
73 v2[j] + insertion_cost,
74 v1[j] + (unicode_a != unicode_b) * substitution_cost});
75 if (i > 0 && j > 0) {
76 if (unicode_a == prev_unicode_b && prev_unicode_a == unicode_b) {
77 new_cost = std::min(new_cost, v0[j - 1] + transposition_cost);
78 }
79 }
80
81 v2[j + 1] = new_cost;
82 prev_unicode_b = unicode_b;
83 }
84
85 /* Swap the three rows, so that the next row can be computed. */
86 std::tie(v0, v1, v2) = std::tuple<MutableSpan<int>, MutableSpan<int>, MutableSpan<int>>(
87 v1, v2, v0);
88 prev_unicode_a = unicode_a;
89 }
90
91 return v1.last();
92}
93
95{
96 /* If it is a perfect partial match, return immediately. */
97 if (full.find(query) != StringRef::not_found) {
98 return 0;
99 }
100
101 const int query_size = count_utf8_code_points(query);
102 const int full_size = count_utf8_code_points(full);
103
104 /* If there is only a single character which is not in the full string, this is not a match. */
105 if (query_size == 1) {
106 return -1;
107 }
108 BLI_assert(query.size() >= 2);
109
110 /* Allow more errors when the size grows larger. */
111 const int max_errors = query_size <= 1 ? 0 : query_size / 8 + 1;
112
113 /* If the query is too large, this cannot be a match. */
114 if (query_size - full_size > max_errors) {
115 return -1;
116 }
117
118 const uint32_t query_first_unicode = BLI_str_utf8_as_unicode_safe(query.data());
119 const uint32_t query_second_unicode = BLI_str_utf8_as_unicode_safe(
120 query.data() + BLI_str_utf8_size_safe(query.data()));
121
122 const char *full_begin = full.begin();
123 const char *full_end = full.end();
124
125 const char *window_begin = full_begin;
126 const char *window_end = window_begin;
127 const int window_size = std::min(query_size + max_errors, full_size);
128 const int extra_chars = std::max(window_size - query_size, 0);
129 const int max_acceptable_distance = max_errors + extra_chars;
130
131 for (int i = 0; i < window_size; i++) {
132 window_end += BLI_str_utf8_size_safe(window_end);
133 }
134
135 while (true) {
136 StringRef window{window_begin, window_end};
137 const uint32_t window_begin_unicode = BLI_str_utf8_as_unicode_safe(window_begin);
138 int distance = 0;
139 /* Expect that the first or second character of the query is correct. This helps to avoid
140 * computing the more expensive distance function. */
141 if (ELEM(window_begin_unicode, query_first_unicode, query_second_unicode)) {
142 distance = damerau_levenshtein_distance(query, window);
143 if (distance <= max_acceptable_distance) {
144 return distance;
145 }
146 }
147 if (window_end == full_end) {
148 return -1;
149 }
150
151 /* When the distance is way too large, we can skip a couple of code points, because the
152 * distance can't possibly become as short as required. */
153 const int window_offset = std::max(1, distance / 2);
154 for (int i = 0; i < window_offset && window_end < full_end; i++) {
155 window_begin += BLI_str_utf8_size_safe(window_begin);
156 window_end += BLI_str_utf8_size_safe(window_end);
157 }
158 }
159}
160
161static constexpr int unused_word = -1;
162
165
167 {
168 int count = 0;
169 for (const int i : this->matched_word_indices) {
170 if (item.word_group_ids[i] == item.main_group_id) {
171 count++;
172 }
173 }
174 return count;
175 }
176
177 bool better_than(const InitialsMatch &other, const SearchItem &item) const
178 {
179 return this->count_main_group_matches(item) > other.count_main_group_matches(item);
180 }
181};
182
190static std::optional<InitialsMatch> match_word_initials(StringRef query,
191 const SearchItem &item,
192 const Span<int> word_match_map,
193 int start = 0)
194{
195 const Span<StringRef> words = item.normalized_words;
196 if (start >= words.size()) {
197 return std::nullopt;
198 }
199
200 InitialsMatch match;
201
202 size_t query_index = 0;
203 int word_index = start;
204 size_t char_index = 0;
205
206 int first_found_word_index = -1;
207
208 while (query_index < query.size()) {
209 const uint query_unicode = BLI_str_utf8_as_unicode_step_safe(
210 query.data(), query.size(), &query_index);
211 while (true) {
212 /* We are at the end of words, no complete match has been found yet. */
213 if (word_index >= words.size()) {
214 if (first_found_word_index >= 0) {
215 /* Try starting to match at another word. In some cases one can still find matches this
216 * way. */
217 return match_word_initials(query, item, word_match_map, first_found_word_index + 1);
218 }
219 return std::nullopt;
220 }
221
222 /* Skip words that the caller does not want us to use. */
223 if (word_match_map[word_index] != unused_word) {
224 word_index++;
225 BLI_assert(char_index == 0);
226 continue;
227 }
228
229 StringRef word = words[word_index];
230 /* Try to match the current character with the current word. */
231 if (int(char_index) < word.size()) {
232 const uint32_t char_unicode = BLI_str_utf8_as_unicode_step_safe(
233 word.data(), word.size(), &char_index);
234 if (query_unicode == char_unicode) {
235 match.matched_word_indices.append(word_index);
236 if (first_found_word_index == -1) {
237 first_found_word_index = word_index;
238 }
239 break;
240 }
241 }
242
243 /* Could not find a match in the current word, go to the beginning of the next word. */
244 word_index += 1;
245 char_index = 0;
246 }
247 }
248 /* Check if we can find a better match that starts at a later word. */
249 if (std::optional<InitialsMatch> sub_match = match_word_initials(
250 query, item, word_match_map, first_found_word_index + 1))
251 {
252 if (sub_match->better_than(match, item)) {
253 return sub_match;
254 }
255 }
256 return match;
257}
258
263 const SearchItem &item,
264 Span<int> word_match_map,
265 Span<StringRef> remaining_query_words)
266{
267 /* If there is another word in the remaining full query that contains the current word, we have
268 * to pick the shortest word. If we pick a longer one, it can happen that the other query word
269 * does not have a match anymore. This can lead to a situation where a query does not match
270 * itself anymore.
271 *
272 * E.g. the full query `T > Test` wouldn't match itself anymore if `Test` has a higher weight.
273 * That's because the `T` would be matched with the `Test`, but then `Test` can't match `Test
274 * anymore because that's taken up already.
275 *
276 * If we don't have to pick the shortest match for correctness, pick the one with the largest
277 * weight instead.
278 */
279 bool use_shortest_match = false;
280 for (const StringRef other_word : remaining_query_words) {
281 if (other_word.startswith(query)) {
282 use_shortest_match = true;
283 break;
284 }
285 }
286
287 int best_word_size = INT32_MAX;
288 int best_word_index = -1;
289 bool best_word_in_main_group = false;
290 for (const int i : item.normalized_words.index_range()) {
291 if (word_match_map[i] != unused_word) {
292 continue;
293 }
294 StringRef word = item.normalized_words[i];
295 const bool word_in_main_group = item.word_group_ids[i] == item.main_group_id;
296 if (word.startswith(query)) {
297 bool found_new_best = false;
298 if (use_shortest_match) {
299 if (word.size() < best_word_size) {
300 found_new_best = true;
301 }
302 }
303 else {
304 if (!best_word_in_main_group) {
305 found_new_best = true;
306 }
307 }
308 if (found_new_best) {
309 best_word_index = i;
310 best_word_size = word.size();
311 best_word_in_main_group = word_in_main_group;
312 }
313 }
314 }
315 return best_word_index;
316}
317
319 Span<StringRef> words,
320 Span<int> word_match_map,
321 int *r_error_count)
322{
323 for (const int i : words.index_range()) {
324 if (word_match_map[i] != unused_word) {
325 continue;
326 }
327 StringRef word = words[i];
328 const int error_count = get_fuzzy_match_errors(query, word);
329 if (error_count >= 0) {
330 *r_error_count = error_count;
331 return i;
332 }
333 }
334 return -1;
335}
336
341static std::optional<float> score_query_against_words(Span<StringRef> query_words,
342 const SearchItem &item)
343{
344 /* A mapping from #result_words to #query_words. It's mainly used to determine if a word has been
345 * matched already to avoid matching it again. */
346 Array<int, 64> word_match_map(item.normalized_words.size(), unused_word);
347
348 /* Start with some high score, because otherwise the final score might become negative. */
349 float total_match_score = item.is_deprecated ? 500 : 1000;
350
351 for (const int query_word_index : query_words.index_range()) {
352 const StringRef query_word = query_words[query_word_index];
353 {
354 /* Check if any result word begins with the query word. */
355 const int word_index = get_best_word_index_that_startswith(
356 query_word, item, word_match_map, query_words.drop_front(query_word_index + 1));
357 if (word_index >= 0) {
358 /* Give a match in a main group higher priority. */
359 const bool is_main_group = item.word_group_ids[word_index] == item.main_group_id;
360 total_match_score += is_main_group ? 10 : 9;
361 word_match_map[word_index] = query_word_index;
362 continue;
363 }
364 }
365 {
366 /* Try to match against word initials. */
367 if (std::optional<InitialsMatch> match = match_word_initials(
368 query_word, item, word_match_map))
369 {
370 /* If the all matched words are in the main group, give the match a higher priority. */
371 bool all_main_group_matches = match->count_main_group_matches(item) ==
372 match->matched_word_indices.size();
373 total_match_score += all_main_group_matches ? 4 : 3;
374 for (const int i : match->matched_word_indices) {
375 word_match_map[i] = query_word_index;
376 }
377 continue;
378 }
379 }
380 {
381 /* Fuzzy match against words. */
382 int error_count = 0;
383 const int word_index = get_word_index_that_fuzzy_matches(
384 query_word, item.normalized_words, word_match_map, &error_count);
385 if (word_index >= 0) {
386 total_match_score += 3 - error_count;
387 word_match_map[word_index] = query_word_index;
388 continue;
389 }
390 }
391
392 /* Couldn't match query word with anything. */
393 return std::nullopt;
394 }
395
396 {
397 /* Add penalty when query words are not in the correct order. */
398 Vector<int> match_indices;
399 for (const int index : word_match_map) {
400 if (index != unused_word) {
401 match_indices.append(index);
402 }
403 }
404 if (!match_indices.is_empty()) {
405 for (const int i : IndexRange(match_indices.size() - 1)) {
406 if (match_indices[i] > match_indices[i + 1]) {
407 total_match_score -= 1;
408 }
409 }
410 }
411 }
412
413 return total_match_score;
414}
415
417 LinearAllocator<> &allocator,
418 Vector<StringRef, 64> &r_words,
419 Vector<int, 64> &r_word_group_ids)
420{
421 const uint32_t unicode_space = uint32_t(' ');
422 const uint32_t unicode_dash = uint32_t('-');
423 const uint32_t unicode_underscore = uint32_t('_');
424 const uint32_t unicode_slash = uint32_t('/');
425 const uint32_t unicode_right_triangle = UI_MENU_ARROW_SEP_UNICODE;
426
427 BLI_assert(unicode_space == BLI_str_utf8_as_unicode_safe(" "));
428 BLI_assert(unicode_dash == BLI_str_utf8_as_unicode_safe("-"));
429 BLI_assert(unicode_underscore == BLI_str_utf8_as_unicode_safe("_"));
430 BLI_assert(unicode_slash == BLI_str_utf8_as_unicode_safe("/"));
432
433 auto is_separator = [&](uint32_t unicode) {
434 return ELEM(unicode,
435 unicode_space,
436 unicode_dash,
437 unicode_underscore,
438 unicode_slash,
439 unicode_right_triangle);
440 };
441
442 Vector<int, 64> section_indices;
443
444 /* Make a copy of the string so that we can edit it. */
445 StringRef str_copy = allocator.copy_string(str);
446 char *mutable_copy = const_cast<char *>(str_copy.data());
447 const size_t str_size_in_bytes = size_t(str.size());
448 BLI_str_tolower_ascii(mutable_copy, str_size_in_bytes);
449
450 /* Iterate over all unicode code points to split individual words. */
451 int group_id = 0;
452 bool is_in_word = false;
453 size_t word_start = 0;
454 size_t offset = 0;
455 while (offset < str_size_in_bytes) {
456 size_t size = offset;
457 uint32_t unicode = BLI_str_utf8_as_unicode_step_safe(str.data(), str.size(), &size);
458 size -= offset;
459 if (is_separator(unicode)) {
460 if (is_in_word) {
461 const StringRef word = str_copy.substr(int(word_start), int(offset - word_start));
462 r_words.append(word);
463 r_word_group_ids.append(group_id);
464 is_in_word = false;
465 }
466 }
467 else {
468 if (!is_in_word) {
469 word_start = offset;
470 is_in_word = true;
471 }
472 }
473 if (unicode == unicode_right_triangle) {
474 group_id++;
475 }
476 offset += size;
477 }
478 /* If the last word is not followed by a separator, it has to be handled separately. */
479 if (is_in_word) {
480 const StringRef word = str_copy.drop_prefix(int(word_start));
481 r_words.append(word);
482 r_word_group_ids.append(group_id);
483 }
484}
485
486void StringSearchBase::add_impl(const StringRef str, void *user_data, const float weight)
487{
488 BLI_assert(BLI_str_utf8_invalid_byte(str.data(), str.size()) == -1);
490 Vector<int, 64> word_group_ids;
492 const int recent_time = recent_cache_ ?
493 recent_cache_->logical_time_by_str.lookup_default(str, -1) :
494 -1;
495 int main_group_id = 0;
496 if (!word_group_ids.is_empty()) {
497 switch (main_words_heuristic_) {
499 main_group_id = 0;
500 break;
501 }
503 main_group_id = word_group_ids.last();
504 break;
505 }
507 main_group_id = 0;
508 word_group_ids.fill(0);
509 break;
510 }
511 }
512 }
513
514 int main_group_length = 0;
515 for (const int i : words.index_range()) {
516 if (word_group_ids[i] == main_group_id) {
517 main_group_length += int(words[i].size());
518 }
519 }
520
521 /* Not checking for the "D" to avoid problems with upper/lower-case. */
522 const bool is_deprecated = str.find("eprecated") != StringRef::not_found;
523
524 items_.append({user_data,
525 allocator_.construct_array_copy(words.as_span()),
526 allocator_.construct_array_copy(word_group_ids.as_span()),
527 main_group_id,
528 main_group_length,
529 int(str.size()),
530 weight,
531 recent_time,
532 is_deprecated});
533}
534
536{
537 LinearAllocator<> allocator;
538 Vector<StringRef, 64> query_words;
539 /* This is just a dummy value that is not used for the query. */
540 Vector<int, 64> word_group_ids;
541 string_search::extract_normalized_words(query, allocator, query_words, word_group_ids);
542
543 /* Compute score of every result. */
544 Array<std::optional<float>> all_scores(items_.size());
545 threading::parallel_for(items_.index_range(), 256, [&](const IndexRange range) {
546 for (const int i : range) {
547 const SearchItem &item = items_[i];
548 const std::optional<float> score = string_search::score_query_against_words(query_words,
549 item);
550 all_scores[i] = score;
551 }
552 });
553 MultiValueMap<float, int> result_indices_by_score;
554 for (const int i : items_.index_range()) {
555 const std::optional<float> score = all_scores[i];
556 if (score.has_value()) {
557 result_indices_by_score.add(*score, i);
558 }
559 }
560
561 Vector<float> found_scores;
562 for (const float score : result_indices_by_score.keys()) {
563 found_scores.append(score);
564 }
565 std::sort(found_scores.begin(), found_scores.end(), std::greater<>());
566
567 /* Add results to output vector in correct order. First come the results with the best match
568 * score. Results with the same score are in the order they have been added to the search. */
569 Vector<int> sorted_result_indices;
570 for (const float score : found_scores) {
571 MutableSpan<int> indices = result_indices_by_score.lookup(score);
572 if (score == found_scores[0]) {
573 if (!query.is_empty()) {
574 /* Sort items with best score by length. Shorter items are more likely the ones you are
575 * looking for. This also ensures that exact matches will be at the top, even if the query
576 * is a sub-string of another item. */
577 std::sort(indices.begin(), indices.end(), [&](int a, int b) {
578 const SearchItem &item_a = items_[a];
579 const SearchItem &item_b = items_[b];
580 /* The length of the main group has priority over the total length. */
581 return item_a.main_group_length < item_b.main_group_length ||
582 (item_a.main_group_length == item_b.main_group_length &&
583 item_a.total_length < item_b.total_length);
584 });
585 /* Prefer items with larger weights. Use `stable_sort` so that if the weights are the same,
586 * the order won't be changed. */
587 std::stable_sort(indices.begin(), indices.end(), [&](int a, int b) {
588 return items_[a].weight > items_[b].weight;
589 });
590 }
591 /* If the query gets longer, it's less likely that accessing recent items is desired. Better
592 * always show the best match in this case. */
593 if (query.size() <= 1) {
594 /* Prefer items that have been selected recently. */
595 std::stable_sort(indices.begin(), indices.end(), [&](int a, int b) {
596 return items_[a].recent_time > items_[b].recent_time;
597 });
598 }
599 }
600 sorted_result_indices.extend(indices);
601 }
602
603 Vector<void *> sorted_data(sorted_result_indices.size());
604 for (const int i : sorted_result_indices.index_range()) {
605 const int result_index = sorted_result_indices[i];
606 const SearchItem &item = items_[result_index];
607 sorted_data[i] = item.user_data;
608 }
609 return sorted_data;
610}
611
612} // namespace blender::string_search
#define BLI_assert(a)
Definition BLI_assert.h:46
void BLI_str_tolower_ascii(char *str, size_t len) ATTR_NONNULL(1)
Definition string.cc:956
ptrdiff_t BLI_str_utf8_invalid_byte(const char *str, size_t str_len) ATTR_NONNULL(1)
size_t size_t BLI_strnlen_utf8(const char *strc, size_t strc_maxlen) ATTR_NONNULL(1) ATTR_WARN_UNUSED_RESULT
int BLI_str_utf8_size_safe(const char *p) ATTR_WARN_UNUSED_RESULT ATTR_NONNULL(1)
unsigned int BLI_str_utf8_as_unicode_step_safe(const char *__restrict p, size_t p_len, size_t *__restrict index) ATTR_WARN_UNUSED_RESULT ATTR_NONNULL(1
unsigned int BLI_str_utf8_as_unicode_safe(const char *p) ATTR_WARN_UNUSED_RESULT ATTR_NONNULL(1)
unsigned int uint
#define ELEM(...)
ATTR_WARN_UNUSED_RESULT const BMVert * v2
long long int int64_t
static DBVT_INLINE btScalar size(const btDbvtVolume &a)
Definition btDbvt.cpp:52
int64_t size() const
void append(const T &value)
IndexRange index_range() const
void extend(Span< T > array)
T * end()
T * begin()
const T * data() const
Definition BLI_array.hh:312
StringRefNull copy_string(StringRef str)
MapType::KeyIterator keys() const
Span< Value > lookup(const Key &key) const
void add(const Key &key, const Value &value)
constexpr T & last(const int64_t n=0) const
Definition BLI_span.hh:689
constexpr Span drop_front(int64_t n) const
Definition BLI_span.hh:171
constexpr int64_t size() const
Definition BLI_span.hh:252
constexpr IndexRange index_range() const
Definition BLI_span.hh:401
static constexpr int64_t not_found
constexpr int64_t find(char c, int64_t pos=0) const
constexpr const char * begin() const
constexpr const char * end() const
constexpr StringRef substr(int64_t start, int64_t size) const
constexpr bool startswith(StringRef prefix) const
constexpr int64_t size() const
constexpr const char * data() const
constexpr StringRef drop_prefix(int64_t n) const
int64_t size() const
void append(const T &value)
const T & last(const int64_t n=0) const
bool is_empty() const
IndexRange index_range() const
void fill(const T &value) const
Span< T > as_span() const
void add_impl(StringRef str, void *user_data, float weight)
Vector< void * > query_impl(StringRef query) const
#define str(s)
static ushort indices[]
#define INT32_MAX
float distance(VecOp< float, D >, VecOp< float, D >) RET
int count
static int64_t count_utf8_code_points(StringRef str)
static constexpr int unused_word
int get_fuzzy_match_errors(StringRef query, StringRef full)
static std::optional< InitialsMatch > match_word_initials(StringRef query, const SearchItem &item, const Span< int > word_match_map, int start=0)
static int get_best_word_index_that_startswith(StringRef query, const SearchItem &item, Span< int > word_match_map, Span< StringRef > remaining_query_words)
int damerau_levenshtein_distance(StringRef a, StringRef b)
static std::optional< float > score_query_against_words(Span< StringRef > query_words, const SearchItem &item)
static int get_word_index_that_fuzzy_matches(StringRef query, Span< StringRef > words, Span< int > word_match_map, int *r_error_count)
void extract_normalized_words(StringRef str, LinearAllocator<> &allocator, Vector< StringRef, 64 > &r_words, Vector< int, 64 > &r_word_group_ids)
void parallel_for(const IndexRange range, const int64_t grain_size, const Function &function, const TaskSizeHints &size_hints=detail::TaskSizeHints_Static(1))
Definition BLI_task.hh:93
#define UI_MENU_ARROW_SEP
#define UI_MENU_ARROW_SEP_UNICODE
int count_main_group_matches(const SearchItem &item) const
bool better_than(const InitialsMatch &other, const SearchItem &item) const
i
Definition text_draw.cc:230