rapidyaml  0.8.0
parse and emit YAML, and do it fast
filter_processor.hpp
Go to the documentation of this file.
1 #ifndef _C4_YML_FILTER_PROCESSOR_HPP_
2 #define _C4_YML_FILTER_PROCESSOR_HPP_
3 
4 #include "c4/yml/common.hpp"
5 
6 #ifdef RYML_DBG
7 #include "c4/charconv.hpp"
8 #include "c4/yml/detail/parser_dbg.hpp"
9 #endif
10 
11 namespace c4 {
12 namespace yml {
13 
14 /** @defgroup doc_filter_processors Scalar filter processors
15  *
16  * These are internal classes used by @ref ParseEngine to parse the
17  * scalars; normally there is no reason for a user to be manually
18  * using these classes.
19  *
20  * @ingroup doc_parse */
21 /** @{ */
22 
23 //-----------------------------------------------------------------------------
24 
25 /** Filters an input string into a different output string */
27 {
28  csubstr src;
29  substr dst;
30  size_t rpos; ///< read position
31  size_t wpos; ///< write position
32 
33  C4_ALWAYS_INLINE FilterProcessorSrcDst(csubstr src_, substr dst_) noexcept
34  : src(src_)
35  , dst(dst_)
36  , rpos(0)
37  , wpos(0)
38  {
39  RYML_ASSERT(!dst.overlaps(src));
40  }
41 
42  C4_ALWAYS_INLINE void setwpos(size_t wpos_) noexcept { wpos = wpos_; }
43  C4_ALWAYS_INLINE void setpos(size_t rpos_, size_t wpos_) noexcept { rpos = rpos_; wpos = wpos_; }
44  C4_ALWAYS_INLINE void set_at_end() noexcept { skip(src.len - rpos); }
45 
46  C4_ALWAYS_INLINE bool has_more_chars() const noexcept { return rpos < src.len; }
47  C4_ALWAYS_INLINE bool has_more_chars(size_t maxpos) const noexcept { RYML_ASSERT(maxpos <= src.len); return rpos < maxpos; }
48 
49  C4_ALWAYS_INLINE csubstr rem() const noexcept { return src.sub(rpos); }
50  C4_ALWAYS_INLINE csubstr sofar() const noexcept { return csubstr(dst.str, wpos <= dst.len ? wpos : dst.len); }
51  C4_ALWAYS_INLINE FilterResult result() const noexcept
52  {
53  FilterResult ret;
54  ret.str.str = wpos <= dst.len ? dst.str : nullptr;
55  ret.str.len = wpos;
56  return ret;
57  }
58 
59  C4_ALWAYS_INLINE char curr() const noexcept { RYML_ASSERT(rpos < src.len); return src[rpos]; }
60  C4_ALWAYS_INLINE char next() const noexcept { return rpos+1 < src.len ? src[rpos+1] : '\0'; }
61  C4_ALWAYS_INLINE bool skipped_chars() const noexcept { return wpos != rpos; }
62 
63  C4_ALWAYS_INLINE void skip() noexcept { ++rpos; }
64  C4_ALWAYS_INLINE void skip(size_t num) noexcept { rpos += num; }
65 
66  C4_ALWAYS_INLINE void set_at(size_t pos, char c) noexcept // NOLINT(readability-make-member-function-const)
67  {
68  RYML_ASSERT(pos < wpos);
69  dst.str[pos] = c;
70  }
71  C4_ALWAYS_INLINE void set(char c) noexcept
72  {
73  if(wpos < dst.len)
74  dst.str[wpos] = c;
75  ++wpos;
76  }
77  C4_ALWAYS_INLINE void set(char c, size_t num) noexcept
78  {
79  RYML_ASSERT(num > 0);
80  if(wpos + num <= dst.len)
81  memset(dst.str + wpos, c, num);
82  wpos += num;
83  }
84 
85  C4_ALWAYS_INLINE void copy() noexcept
86  {
87  RYML_ASSERT(rpos < src.len);
88  if(wpos < dst.len)
89  dst.str[wpos] = src.str[rpos];
90  ++wpos;
91  ++rpos;
92  }
93  C4_ALWAYS_INLINE void copy(size_t num) noexcept
94  {
95  RYML_ASSERT(num);
96  RYML_ASSERT(rpos+num <= src.len);
97  if(wpos + num <= dst.len)
98  memcpy(dst.str + wpos, src.str + rpos, num);
99  wpos += num;
100  rpos += num;
101  }
102 
103  C4_ALWAYS_INLINE void translate_esc(char c) noexcept
104  {
105  if(wpos < dst.len)
106  dst.str[wpos] = c;
107  ++wpos;
108  rpos += 2;
109  }
110  C4_ALWAYS_INLINE void translate_esc_bulk(const char *C4_RESTRICT s, size_t nw, size_t nr) noexcept
111  {
112  RYML_ASSERT(nw > 0);
113  RYML_ASSERT(nr > 0);
114  RYML_ASSERT(rpos+nr <= src.len);
115  if(wpos+nw <= dst.len)
116  memcpy(dst.str + wpos, s, nw);
117  wpos += nw;
118  rpos += 1 + nr;
119  }
120  C4_ALWAYS_INLINE void translate_esc_extending(const char *C4_RESTRICT s, size_t nw, size_t nr) noexcept
121  {
122  translate_esc_bulk(s, nw, nr);
123  }
124 };
125 
126 
127 //-----------------------------------------------------------------------------
128 // filter in place
129 
130 // debugging scaffold: _c4dbgip() forwards to _c4dbgpf() only when enabled in the #if below
131 /** @cond dev */
132 #if defined(RYML_DBG) && 0 // the `&& 0` keeps the scaffold disabled even when RYML_DBG is defined
133 #define _c4dbgip(...) _c4dbgpf(__VA_ARGS__)
134 #else
135 #define _c4dbgip(...) // expands to nothing: the arguments are discarded and never evaluated
136 #endif
137 /** @endcond */
138 
139 /** Filters in place. While the result may be larger than the source,
140  * any extending happens only at the end of the string. Consequently,
141  * it's impossible for characters to be left unfiltered.
142  *
143  * @see FilterProcessorInplaceMidExtending */
145 {
146  substr src; ///< the subject string
147  size_t wcap; ///< write capacity - the capacity of the subject string's buffer
148  size_t rpos; ///< read position
149  size_t wpos; ///< write position
150 
151  C4_ALWAYS_INLINE FilterProcessorInplaceEndExtending(substr src_, size_t wcap_) noexcept
152  : src(src_)
153  , wcap(wcap_)
154  , rpos(0)
155  , wpos(0)
156  {
157  RYML_ASSERT(wcap >= src.len);
158  }
159 
160  C4_ALWAYS_INLINE void setwpos(size_t wpos_) noexcept { wpos = wpos_; }
161  C4_ALWAYS_INLINE void setpos(size_t rpos_, size_t wpos_) noexcept { rpos = rpos_; wpos = wpos_; }
162  C4_ALWAYS_INLINE void set_at_end() noexcept { skip(src.len - rpos); }
163 
164  C4_ALWAYS_INLINE bool has_more_chars() const noexcept { return rpos < src.len; }
165  C4_ALWAYS_INLINE bool has_more_chars(size_t maxpos) const noexcept { RYML_ASSERT(maxpos <= src.len); return rpos < maxpos; }
166 
167  C4_ALWAYS_INLINE FilterResult result() const noexcept
168  {
169  _c4dbgip("inplace: wpos={} wcap={} small={}", wpos, wcap, wpos > rpos);
170  FilterResult ret;
171  ret.str.str = (wpos <= wcap) ? src.str : nullptr;
172  ret.str.len = wpos;
173  return ret;
174  }
175  C4_ALWAYS_INLINE csubstr sofar() const noexcept { return csubstr(src.str, wpos <= wcap ? wpos : wcap); }
176  C4_ALWAYS_INLINE csubstr rem() const noexcept { return src.sub(rpos); }
177 
178  C4_ALWAYS_INLINE char curr() const noexcept { RYML_ASSERT(rpos < src.len); return src[rpos]; }
179  C4_ALWAYS_INLINE char next() const noexcept { return rpos+1 < src.len ? src[rpos+1] : '\0'; }
180 
181  C4_ALWAYS_INLINE void skip() noexcept { ++rpos; }
182  C4_ALWAYS_INLINE void skip(size_t num) noexcept { rpos += num; }
183 
184  void set_at(size_t pos, char c) noexcept
185  {
186  RYML_ASSERT(pos < wpos);
187  const size_t save = wpos;
188  wpos = pos;
189  set(c);
190  wpos = save;
191  }
192  void set(char c) noexcept
193  {
194  if(wpos < wcap) // respect write-capacity
195  src.str[wpos] = c;
196  ++wpos;
197  }
198  void set(char c, size_t num) noexcept
199  {
200  RYML_ASSERT(num);
201  if(wpos + num <= wcap) // respect write-capacity
202  memset(src.str + wpos, c, num);
203  wpos += num;
204  }
205 
206  void copy() noexcept
207  {
208  RYML_ASSERT(wpos <= rpos);
209  RYML_ASSERT(rpos < src.len);
210  if(wpos < wcap) // respect write-capacity
211  src.str[wpos] = src.str[rpos];
212  ++rpos;
213  ++wpos;
214  }
215  void copy(size_t num) noexcept
216  {
217  RYML_ASSERT(num);
218  RYML_ASSERT(rpos+num <= src.len);
219  RYML_ASSERT(wpos <= rpos);
220  if(wpos + num <= wcap) // respect write-capacity
221  {
222  if(wpos + num <= rpos) // there is no overlap
223  memcpy(src.str + wpos, src.str + rpos, num);
224  else // there is overlap
225  memmove(src.str + wpos, src.str + rpos, num);
226  }
227  rpos += num;
228  wpos += num;
229  }
230 
231  void translate_esc(char c) noexcept
232  {
233  RYML_ASSERT(rpos + 2 <= src.len);
234  RYML_ASSERT(wpos <= rpos);
235  if(wpos < wcap) // respect write-capacity
236  src.str[wpos] = c;
237  rpos += 2; // add 1u to account for the escape character
238  ++wpos;
239  }
240 
241  void translate_esc_bulk(const char *C4_RESTRICT s, size_t nw, size_t nr) noexcept
242  {
243  RYML_ASSERT(nw > 0);
244  RYML_ASSERT(nr > 0);
245  RYML_ASSERT(nw <= nr + 1u);
246  RYML_ASSERT(rpos+nr <= src.len);
247  RYML_ASSERT(wpos <= rpos);
248  const size_t wpos_next = wpos + nw;
249  const size_t rpos_next = rpos + nr + 1u; // add 1u to account for the escape character
250  RYML_ASSERT(wpos_next <= rpos_next);
251  if(wpos_next <= wcap)
252  memcpy(src.str + wpos, s, nw);
253  rpos = rpos_next;
254  wpos = wpos_next;
255  }
256 
257  C4_ALWAYS_INLINE void translate_esc_extending(const char *C4_RESTRICT s, size_t nw, size_t nr) noexcept
258  {
259  translate_esc_bulk(s, nw, nr);
260  }
261 };
262 
263 
264 //-----------------------------------------------------------------------------
265 //-----------------------------------------------------------------------------
266 //-----------------------------------------------------------------------------
267 
268 /** Filters in place. The result may be larger than the source, and
269  * extending may happen anywhere. As a result some characters may be
270  * left unfiltered when there is no slack in the buffer and the
271  * write-position would overlap the read-position. Consequently, it's
272  * possible for characters to be left unfiltered. In YAML, this
273  * happens only with double-quoted strings, and only with a small
274  * number of escape sequences such as `\L` which is substituted by three
275  * bytes. These escape sequences cause a call to translate_esc_extending()
276  * which is the only entry point to this unfiltered situation.
277  *
278  * @see FilterProcessorInplaceMidExtending */
280 {
281  substr src; ///< the subject string
282  size_t wcap; ///< write capacity - the capacity of the subject string's buffer
283  size_t rpos; ///< read position
284  size_t wpos; ///< write position
285  size_t maxcap; ///< the max capacity needed for filtering the string. This may be larger than the final string size.
286  bool unfiltered_chars; ///< number of characters that were not added to wpos from lack of capacity
287 
288  C4_ALWAYS_INLINE FilterProcessorInplaceMidExtending(substr src_, size_t wcap_) noexcept
289  : src(src_)
290  , wcap(wcap_)
291  , rpos(0)
292  , wpos(0)
293  , maxcap(src.len)
294  , unfiltered_chars(false)
295  {
296  RYML_ASSERT(wcap >= src.len);
297  }
298 
299  C4_ALWAYS_INLINE void setwpos(size_t wpos_) noexcept { wpos = wpos_; }
300  C4_ALWAYS_INLINE void setpos(size_t rpos_, size_t wpos_) noexcept { rpos = rpos_; wpos = wpos_; }
301  C4_ALWAYS_INLINE void set_at_end() noexcept { skip(src.len - rpos); }
302 
303  C4_ALWAYS_INLINE bool has_more_chars() const noexcept { return rpos < src.len; }
304  C4_ALWAYS_INLINE bool has_more_chars(size_t maxpos) const noexcept { RYML_ASSERT(maxpos <= src.len); return rpos < maxpos; }
305 
306  C4_ALWAYS_INLINE FilterResultExtending result() const noexcept
307  {
308  _c4dbgip("inplace: wpos={} wcap={} unfiltered={} maxcap={}", this->wpos, this->wcap, this->unfiltered_chars, this->maxcap);
309  FilterResultExtending ret;
310  ret.str.str = (wpos <= wcap && !unfiltered_chars) ? src.str : nullptr;
311  ret.str.len = wpos;
312  ret.reqlen = maxcap;
313  return ret;
314  }
315  C4_ALWAYS_INLINE csubstr sofar() const noexcept { return csubstr(src.str, wpos <= wcap ? wpos : wcap); }
316  C4_ALWAYS_INLINE csubstr rem() const noexcept { return src.sub(rpos); }
317 
318  C4_ALWAYS_INLINE char curr() const noexcept { RYML_ASSERT(rpos < src.len); return src[rpos]; }
319  C4_ALWAYS_INLINE char next() const noexcept { return rpos+1 < src.len ? src[rpos+1] : '\0'; }
320 
321  C4_ALWAYS_INLINE void skip() noexcept { ++rpos; }
322  C4_ALWAYS_INLINE void skip(size_t num) noexcept { rpos += num; }
323 
324  void set_at(size_t pos, char c) noexcept
325  {
326  RYML_ASSERT(pos < wpos);
327  const size_t save = wpos;
328  wpos = pos;
329  set(c);
330  wpos = save;
331  }
332  void set(char c) noexcept
333  {
334  if(wpos < wcap) // respect write-capacity
335  {
336  if((wpos <= rpos) && !unfiltered_chars)
337  src.str[wpos] = c;
338  }
339  else
340  {
341  _c4dbgip("inplace: add unwritten {}->{} maxcap={}->{}!", unfiltered_chars, true, maxcap, (wpos+1u > maxcap ? wpos+1u : maxcap));
342  unfiltered_chars = true;
343  }
344  ++wpos;
345  maxcap = wpos > maxcap ? wpos : maxcap;
346  }
347  void set(char c, size_t num) noexcept
348  {
349  RYML_ASSERT(num);
350  if(wpos + num <= wcap) // respect write-capacity
351  {
352  if((wpos <= rpos) && !unfiltered_chars)
353  memset(src.str + wpos, c, num);
354  }
355  else
356  {
357  _c4dbgip("inplace: add unwritten {}->{} maxcap={}->{}!", unfiltered_chars, true, maxcap, (wpos+num > maxcap ? wpos+num : maxcap));
358  unfiltered_chars = true;
359  }
360  wpos += num;
361  maxcap = wpos > maxcap ? wpos : maxcap;
362  }
363 
364  void copy() noexcept
365  {
366  RYML_ASSERT(rpos < src.len);
367  if(wpos < wcap) // respect write-capacity
368  {
369  if((wpos < rpos) && !unfiltered_chars) // write only if wpos is behind rpos
370  src.str[wpos] = src.str[rpos];
371  }
372  else
373  {
374  _c4dbgip("inplace: add unwritten {}->{} (wpos={}!=rpos={})={} (wpos={}<wcap={}) maxcap={}->{}!", unfiltered_chars, true, wpos, rpos, wpos!=rpos, wpos, wcap, wpos<wcap, maxcap, (wpos+1u > maxcap ? wpos+1u : maxcap));
375  unfiltered_chars = true;
376  }
377  ++rpos;
378  ++wpos;
379  maxcap = wpos > maxcap ? wpos : maxcap;
380  }
381  void copy(size_t num) noexcept
382  {
383  RYML_ASSERT(num);
384  RYML_ASSERT(rpos+num <= src.len);
385  if(wpos + num <= wcap) // respect write-capacity
386  {
387  if((wpos < rpos) && !unfiltered_chars) // write only if wpos is behind rpos
388  {
389  if(wpos + num <= rpos) // there is no overlap
390  memcpy(src.str + wpos, src.str + rpos, num);
391  else // there is overlap
392  memmove(src.str + wpos, src.str + rpos, num);
393  }
394  }
395  else
396  {
397  _c4dbgip("inplace: add unwritten {}->{} (wpos={}!=rpos={})={} (wpos={}<wcap={}) maxcap={}->{}!", unfiltered_chars, true, wpos, rpos, wpos!=rpos, wpos, wcap, wpos<wcap);
398  unfiltered_chars = true;
399  }
400  rpos += num;
401  wpos += num;
402  maxcap = wpos > maxcap ? wpos : maxcap;
403  }
404 
405  void translate_esc(char c) noexcept
406  {
407  RYML_ASSERT(rpos + 2 <= src.len);
408  if(wpos < wcap) // respect write-capacity
409  {
410  if((wpos <= rpos) && !unfiltered_chars)
411  src.str[wpos] = c;
412  }
413  else
414  {
415  _c4dbgip("inplace: add unfiltered {}->{} maxcap={}->{}!", unfiltered_chars, true, maxcap, (wpos+1u > maxcap ? wpos+1u : maxcap));
416  unfiltered_chars = true;
417  }
418  rpos += 2;
419  ++wpos;
420  maxcap = wpos > maxcap ? wpos : maxcap;
421  }
422 
423  C4_NO_INLINE void translate_esc_bulk(const char *C4_RESTRICT s, size_t nw, size_t nr) noexcept
424  {
425  RYML_ASSERT(nw > 0);
426  RYML_ASSERT(nr > 0);
427  RYML_ASSERT(nr+1u >= nw);
428  const size_t wpos_next = wpos + nw;
429  const size_t rpos_next = rpos + nr + 1u; // add 1u to account for the escape character
430  if(wpos_next <= wcap) // respect write-capacity
431  {
432  if((wpos <= rpos) && !unfiltered_chars) // write only if wpos is behind rpos
433  memcpy(src.str + wpos, s, nw);
434  }
435  else
436  {
437  _c4dbgip("inplace: add unwritten {}->{} (wpos={}!=rpos={})={} (wpos={}<wcap={}) maxcap={}->{}!", unfiltered_chars, true, wpos, rpos, wpos!=rpos, wpos, wcap, wpos<wcap);
438  unfiltered_chars = true;
439  }
440  rpos = rpos_next;
441  wpos = wpos_next;
442  maxcap = wpos > maxcap ? wpos : maxcap;
443  }
444 
445  C4_NO_INLINE void translate_esc_extending(const char *C4_RESTRICT s, size_t nw, size_t nr) noexcept
446  {
447  RYML_ASSERT(nw > 0);
448  RYML_ASSERT(nr > 0);
449  RYML_ASSERT(rpos+nr <= src.len);
450  const size_t wpos_next = wpos + nw;
451  const size_t rpos_next = rpos + nr + 1u; // add 1u to account for the escape character
452  if(wpos_next <= rpos_next) // read and write do not overlap. just do a vanilla copy.
453  {
454  if((wpos_next <= wcap) && !unfiltered_chars)
455  memcpy(src.str + wpos, s, nw);
456  rpos = rpos_next;
457  wpos = wpos_next;
458  maxcap = wpos > maxcap ? wpos : maxcap;
459  }
460  else // there is overlap. move the (to-be-read) string to the right.
461  {
462  const size_t excess = wpos_next - rpos_next;
463  RYML_ASSERT(wpos_next > rpos_next);
464  if(src.len + excess <= wcap) // ensure we do not go past the end
465  {
466  RYML_ASSERT(rpos+nr+excess <= src.len);
467  if(wpos_next <= wcap)
468  {
469  if(!unfiltered_chars)
470  {
471  memmove(src.str + wpos_next, src.str + rpos_next, src.len - rpos_next);
472  memcpy(src.str + wpos, s, nw);
473  }
474  rpos = wpos_next; // wpos, not rpos
475  }
476  else
477  {
478  rpos = rpos_next;
479  //const size_t unw = nw > (nr + 1u) ? nw - (nr + 1u) : 0;
480  _c4dbgip("inplace: add unfiltered {}->{} maxcap={}->{}!", unfiltered_chars, true);
481  unfiltered_chars = true;
482  }
483  wpos = wpos_next;
484  // extend the string up to capacity
485  src.len += excess;
486  maxcap = wpos > maxcap ? wpos : maxcap;
487  }
488  else
489  {
490  //const size_t unw = nw > (nr + 1u) ? nw - (nr + 1u) : 0;
491  RYML_ASSERT(rpos_next <= src.len);
492  const size_t required_size = wpos_next + (src.len - rpos_next);
493  _c4dbgip("inplace: add unfiltered {}->{} maxcap={}->{}!", unfiltered_chars, true, maxcap, required_size > maxcap ? required_size : maxcap);
494  RYML_ASSERT(required_size > wcap);
495  unfiltered_chars = true;
496  maxcap = required_size > maxcap ? required_size : maxcap;
497  wpos = wpos_next;
498  rpos = rpos_next;
499  }
500  }
501  }
502 };
503 
504 #undef _c4dbgip // the debugging scaffold macro is not meant to be visible past this point
505 
506 
507 /** @} */
508 
509 } // namespace yml
510 } // namespace c4
511 
512 #endif /* _C4_YML_FILTER_PROCESSOR_HPP_ */
Lightweight generic type-safe wrappers for converting individual values to/from strings.
Common utilities and infrastructure used by ryml.
Definition: common.cpp:12
FilterProcessorInplaceEndExtending(substr src_, size_t wcap_) noexcept
size_t wcap
write capacity - the capacity of the subject string's buffer
void setpos(size_t rpos_, size_t wpos_) noexcept
void translate_esc_extending(const char *s, size_t nw, size_t nr) noexcept
void set(char c, size_t num) noexcept
void translate_esc_bulk(const char *s, size_t nw, size_t nr) noexcept
bool has_more_chars(size_t maxpos) const noexcept
void set_at(size_t pos, char c) noexcept
FilterResultExtending result() const noexcept
void translate_esc_extending(const char *s, size_t nw, size_t nr) noexcept
void translate_esc_bulk(const char *s, size_t nw, size_t nr) noexcept
FilterProcessorInplaceMidExtending(substr src_, size_t wcap_) noexcept
size_t maxcap
the max capacity needed for filtering the string. This may be larger than the final string size.
bool has_more_chars(size_t maxpos) const noexcept
void setpos(size_t rpos_, size_t wpos_) noexcept
void set_at(size_t pos, char c) noexcept
size_t wcap
write capacity - the capacity of the subject string's buffer
bool unfiltered_chars
true once any write did not fit within the write capacity (this is a flag, not a count)
void set(char c, size_t num) noexcept
Filters an input string into a different output string.
void translate_esc_bulk(const char *s, size_t nw, size_t nr) noexcept
void set_at(size_t pos, char c) noexcept
void setpos(size_t rpos_, size_t wpos_) noexcept
void copy(size_t num) noexcept
csubstr rem() const noexcept
FilterResult result() const noexcept
void skip(size_t num) noexcept
FilterProcessorSrcDst(csubstr src_, substr dst_) noexcept
void setwpos(size_t wpos_) noexcept
bool has_more_chars() const noexcept
void translate_esc(char c) noexcept
bool skipped_chars() const noexcept
csubstr sofar() const noexcept
void translate_esc_extending(const char *s, size_t nw, size_t nr) noexcept
void set(char c, size_t num) noexcept
bool has_more_chars(size_t maxpos) const noexcept