Blender V5.0
csv_parse.cc
Go to the documentation of this file.
1/* SPDX-FileCopyrightText: 2025 Blender Authors
2 *
3 * SPDX-License-Identifier: GPL-2.0-or-later */
4
8
9#include "BLI_csv_parse.hh"
11#include "BLI_task.hh"
12
13#include <atomic>
14
15namespace blender::csv_parse {
16
21static int64_t guess_next_record_start(const Span<char> buffer, const int64_t start)
22{
23 int64_t i = start;
24 while (i < buffer.size()) {
25 const char c = buffer[i];
26 if (c == '\n') {
27 return i + 1;
28 }
29 i++;
30 }
31 return buffer.size();
32}
33
40 int64_t approximate_chunk_size)
41{
42 approximate_chunk_size = std::max<int64_t>(approximate_chunk_size, 1);
43 Vector<Span<char>> chunks;
44 int64_t start = 0;
45 while (start < buffer.size()) {
46 int64_t end = std::min(start + approximate_chunk_size, buffer.size());
47 end = guess_next_record_start(buffer, end);
48 chunks.append(buffer.slice(IndexRange::from_begin_end(start, end)));
49 start = end;
50 }
51 return chunks;
52}
53
59static std::optional<CsvRecords> parse_records(const Span<char> buffer,
61 Vector<int64_t> &r_data_offsets,
62 Vector<Span<char>> &r_data_fields)
63{
64 using namespace detail;
65 /* Clear the data that may still be in there, but do not free the memory. */
66 r_data_offsets.clear();
67 r_data_fields.clear();
68
69 r_data_offsets.append(0);
70 int64_t start = 0;
71 while (start < buffer.size()) {
72 const std::optional<int64_t> next_record_start = parse_record_fields(
73 buffer,
74 start,
75 options.delimiter,
76 options.quote,
77 options.quote_escape_chars,
78 r_data_fields);
79 if (!next_record_start.has_value()) {
80 return std::nullopt;
81 }
82 /* Ignore empty lines. While those are not great practice, they can occur in practice. */
83 if (r_data_fields.size() > r_data_offsets.last()) {
84 r_data_offsets.append(r_data_fields.size());
85 }
86 start = *next_record_start;
87 }
88 return CsvRecords(OffsetIndices<int64_t>(r_data_offsets), r_data_fields);
89}
90
91std::optional<Vector<Any<>>> parse_csv_in_chunks(
92 const Span<char> buffer,
94 FunctionRef<void(const CsvRecord &record)> process_header,
95 FunctionRef<Any<>(const CsvRecords &records)> process_records)
96{
97 using namespace detail;
98
99 /* First parse the first row to get the column names. */
100 Vector<Span<char>> header_fields;
101 const std::optional<int64_t> first_data_record_start = parse_record_fields(
102 buffer, 0, options.delimiter, options.quote, options.quote_escape_chars, header_fields);
103 if (!first_data_record_start.has_value()) {
104 return std::nullopt;
105 }
106 /* Call this before starting to process the remaining data. This allows the caller to do some
107 * preprocessing that is used during chunk parsing. */
108 process_header(CsvRecord(header_fields));
109
110 /* This buffer contains only the data records, without the header. */
111 const Span<char> data_buffer = buffer.drop_front(*first_data_record_start);
112 /* Split the buffer into chunks that can be processed in parallel. */
113 const Vector<Span<char>> data_buffer_chunks = split_into_aligned_chunks(
114 data_buffer, options.chunk_size_bytes);
115
116 /* It's not common, but it can happen that .csv files contain quoted multi-line values. In the
117 * unlucky case that we split the buffer in the middle of such a multi-line field, there will be
118 * malformed chunks. In this case we fallback to parsing the whole buffer with a single thread.
119 * If this case becomes more common, we could try to avoid splitting into malformed chunks by
120 * making the splitting logic a bit smarter. */
121 std::atomic<bool> found_malformed_chunk = false;
122 Vector<std::optional<Any<>>> chunk_results(data_buffer_chunks.size());
123 struct TLS {
124 Vector<int64_t> data_offsets;
125 Vector<Span<char>> data_fields;
126 };
128 threading::parallel_for(chunk_results.index_range(), 1, [&](const IndexRange range) {
129 TLS &tls = all_tls.local();
130 for (const int64_t i : range) {
131 if (found_malformed_chunk.load(std::memory_order_relaxed)) {
132 /* All work is cancelled when there was a malformed chunk. */
133 return;
134 }
135 const Span<char> chunk_buffer = data_buffer_chunks[i];
136 const std::optional<CsvRecords> records = parse_records(
137 chunk_buffer, options, tls.data_offsets, tls.data_fields);
138 if (!records.has_value()) {
139 found_malformed_chunk.store(true, std::memory_order_relaxed);
140 return;
141 }
142 chunk_results[i] = process_records(*records);
143 }
144 });
145
146 /* If there was a malformed chunk, process the data again in a single thread without splitting
147 * the input into chunks. This should happen quite rarely but is important for overall
148 * correctness. */
149 if (found_malformed_chunk) {
150 chunk_results.clear();
151 TLS &tls = all_tls.local();
152 const std::optional<CsvRecords> records = parse_records(
153 data_buffer, options, tls.data_offsets, tls.data_fields);
154 if (!records.has_value()) {
155 return std::nullopt;
156 }
157 chunk_results.append(process_records(*records));
158 }
159
160 /* Prepare the return value. */
161 Vector<Any<>> results;
162 for (std::optional<Any<>> &result : chunk_results) {
163 BLI_assert(result.has_value());
164 results.append(std::move(result.value()));
165 }
166 return results;
167}
168
171 LinearAllocator<> &allocator)
172{
173 const StringRef escape_chars{options.quote_escape_chars};
174 if (str.find_first_of(escape_chars) == StringRef::not_found) {
175 return str;
176 }
177 /* The actual unescaped string may be shorter, but not longer. */
178 MutableSpan<char> unescaped_str = allocator.allocate_array<char>(str.size());
179 int64_t i = 0;
180 int64_t escaped_size = 0;
181 while (i < str.size()) {
182 const char c = str[i];
183 if (options.quote_escape_chars.contains(c)) {
184 if (i + 1 < str.size() && str[i + 1] == options.quote) {
185 /* Ignore the current escape character. */
186 unescaped_str[escaped_size++] = options.quote;
187 i += 2;
188 continue;
189 }
190 }
191 unescaped_str[escaped_size++] = c;
192 i++;
193 }
194 return StringRef(unescaped_str.take_front(escaped_size));
195}
196
197namespace detail {
198
199std::optional<int64_t> parse_record_fields(const Span<char> buffer,
200 const int64_t start,
201 const char delimiter,
202 const char quote,
203 const Span<char> quote_escape_chars,
204 Vector<Span<char>> &r_fields)
205{
206 using namespace detail;
207
208 const auto handle_potentially_trailing_delimiter = [&](const int64_t i) {
209 if (i <= buffer.size()) {
210 if (i < buffer.size()) {
211 if (ELEM(buffer[i], '\n', '\r')) {
212 r_fields.append({});
213 }
214 }
215 else {
216 r_fields.append({});
217 }
218 }
219 };
220
221 int64_t i = start;
222 while (i < buffer.size()) {
223 const char c = buffer[i];
224 if (c == '\n') {
225 return i + 1;
226 }
227 if (c == '\r') {
228 i++;
229 continue;
230 }
231 if (c == delimiter) {
232 r_fields.append({});
233 i++;
234 handle_potentially_trailing_delimiter(i);
235 continue;
236 }
237 if (c == quote) {
238 i++;
239 const std::optional<int64_t> end_of_field = find_end_of_quoted_field(
240 buffer, i, quote, quote_escape_chars);
241 if (!end_of_field.has_value()) {
242 return std::nullopt;
243 }
244 r_fields.append(buffer.slice(IndexRange::from_begin_end(i, *end_of_field)));
245 i = *end_of_field;
246 while (i < buffer.size()) {
247 const char inner_c = buffer[i];
248 if (inner_c == quote) {
249 i++;
250 continue;
251 }
252 if (inner_c == delimiter) {
253 i++;
254 handle_potentially_trailing_delimiter(i);
255 break;
256 }
257 if (ELEM(inner_c, '\n', '\r')) {
258 break;
259 }
260 i++;
261 }
262 continue;
263 }
264 const int64_t end_of_field = find_end_of_simple_field(buffer, i, delimiter);
265 r_fields.append(buffer.slice(IndexRange::from_begin_end(i, end_of_field)));
266 i = end_of_field;
267 while (i < buffer.size()) {
268 const char inner_c = buffer[i];
269 if (inner_c == delimiter) {
270 i++;
271 handle_potentially_trailing_delimiter(i);
272 break;
273 }
274 if (ELEM(inner_c, '\n', '\r')) {
275 break;
276 }
278 }
279 }
280
281 return buffer.size();
282}
283
285 const int64_t start,
286 const char delimiter)
287{
288 int64_t i = start;
289 while (i < buffer.size()) {
290 const char c = buffer[i];
291 if (ELEM(c, delimiter, '\n', '\r')) {
292 return i;
293 }
294 i++;
295 }
296 return buffer.size();
297}
298
299std::optional<int64_t> find_end_of_quoted_field(const Span<char> buffer,
300 const int64_t start,
301 const char quote,
302 const Span<char> escape_chars)
303{
304 int64_t i = start;
305 while (i < buffer.size()) {
306 const char c = buffer[i];
307 if (escape_chars.contains(c)) {
308 if (i + 1 < buffer.size() && buffer[i + 1] == quote) {
309 i += 2;
310 continue;
311 }
312 }
313 if (c == quote) {
314 return i;
315 }
316 i++;
317 }
318 return std::nullopt;
319}
320
321} // namespace detail
322
323} // namespace blender::csv_parse
#define BLI_assert_unreachable()
Definition BLI_assert.h:93
#define BLI_assert(a)
Definition BLI_assert.h:46
#define ELEM(...)
long long int int64_t
void append(const T &value)
static constexpr IndexRange from_begin_end(const int64_t begin, const int64_t end)
MutableSpan< T > allocate_array(int64_t size)
constexpr MutableSpan take_front(const int64_t n) const
Definition BLI_span.hh:629
constexpr Span drop_front(int64_t n) const
Definition BLI_span.hh:171
constexpr Span slice(int64_t start, int64_t size) const
Definition BLI_span.hh:137
constexpr int64_t size() const
Definition BLI_span.hh:252
constexpr bool contains(const T &value) const
Definition BLI_span.hh:277
static constexpr int64_t not_found
int64_t size() const
void append(const T &value)
const T & last(const int64_t n=0) const
IndexRange index_range() const
CCL_NAMESPACE_BEGIN struct Options options
#define str(s)
int64_t find_end_of_simple_field(Span< char > buffer, int64_t start, char delimiter)
Definition csv_parse.cc:284
std::optional< int64_t > parse_record_fields(const Span< char > buffer, const int64_t start, const char delimiter, const char quote, const Span< char > quote_escape_chars, Vector< Span< char > > &r_fields)
Definition csv_parse.cc:199
std::optional< int64_t > find_end_of_quoted_field(Span< char > buffer, int64_t start, char quote, Span< char > escape_chars)
Definition csv_parse.cc:299
std::optional< Vector< Any<> > > parse_csv_in_chunks(const Span< char > buffer, const CsvParseOptions &options, FunctionRef< void(const CsvRecord &record)> process_header, FunctionRef< Any<>(const CsvRecords &records)> process_records)
Definition csv_parse.cc:91
static int64_t guess_next_record_start(const Span< char > buffer, const int64_t start)
Definition csv_parse.cc:21
StringRef unescape_field(const StringRef str, const CsvParseOptions &options, LinearAllocator<> &allocator)
Definition csv_parse.cc:169
static Vector< Span< char > > split_into_aligned_chunks(const Span< char > buffer, int64_t approximate_chunk_size)
Definition csv_parse.cc:39
static std::optional< CsvRecords > parse_records(const Span< char > buffer, const CsvParseOptions &options, Vector< int64_t > &r_data_offsets, Vector< Span< char > > &r_data_fields)
Definition csv_parse.cc:59
void parallel_for(const IndexRange range, const int64_t grain_size, const Function &function, const TaskSizeHints &size_hints=detail::TaskSizeHints_Static(1))
Definition BLI_task.hh:93
i
Definition text_draw.cc:230