rapidyaml  0.13.0
parse and emit YAML, and do it fast
tag.cpp
Go to the documentation of this file.
1 #include "c4/yml/tag.hpp"
2 #include "c4/yml/error.hpp"
3 #include "c4/yml/detail/dbgprint.hpp"
4 
5 
6 namespace c4 {
7 namespace yml {
8 
9 bool is_custom_tag(csubstr tag)
10 {
11  if((tag.len > 2) && (tag.str[0] == '!'))
12  {
13  size_t pos = tag.find('!', 1);
14  return pos != npos && pos > 1 && tag.str[1] != '<';
15  }
16  return false;
17 }
18 
19 csubstr normalize_tag(csubstr tag)
20 {
21  YamlTag_e t = to_tag(tag);
22  if(t != TAG_NONE)
23  return from_tag(t);
24  if(tag.begins_with("!<"))
25  tag = tag.sub(1);
26  if(tag.begins_with("<!"))
27  return tag;
28  return tag;
29 }
30 
31 csubstr normalize_tag_long(csubstr tag)
32 {
33  YamlTag_e t = to_tag(tag);
34  if(t != TAG_NONE)
35  return from_tag_long(t);
36  if(tag.begins_with("!<"))
37  tag = tag.sub(1);
38  if(tag.begins_with("<!"))
39  return tag;
40  return tag;
41 }
42 
43 csubstr normalize_tag_long(csubstr tag, substr output)
44 {
45  csubstr result = normalize_tag_long(tag);
46  if(result.begins_with("!!"))
47  {
48  _RYML_CHECK_BASIC(!output.overlaps(tag));
49  tag = tag.sub(2);
50  const csubstr pfx = "<tag:yaml.org,2002:";
51  const size_t len = pfx.len + tag.len + 1;
52  if(len <= output.len)
53  {
54  memcpy(output.str , pfx.str, pfx.len);
55  memcpy(output.str + pfx.len, tag.str, tag.len);
56  output[pfx.len + tag.len] = '>';
57  result = output.first(len);
58  }
59  else
60  {
61  result.str = nullptr;
62  result.len = len;
63  }
64  }
65  return result;
66 }
67 
68 YamlTag_e to_tag(csubstr tag)
69 {
70  if(tag.begins_with("!<"))
71  tag = tag.sub(1);
72  if(tag.begins_with("!!"))
73  {
74  tag = tag.sub(2);
75  }
76  else if(tag.begins_with('!'))
77  {
78  return TAG_NONE;
79  }
80  else
81  {
82  csubstr pfx = "<tag:yaml.org,2002:";
83  csubstr pfx2 = pfx.sub(1);
84  if(tag.begins_with(pfx2))
85  {
86  tag = tag.sub(pfx2.len);
87  }
88  else if(tag.begins_with(pfx))
89  {
90  tag = tag.sub(pfx.len);
91  if(!tag.len)
92  return TAG_NONE;
93  tag = tag.offs(0, 1);
94  }
95  }
96  if(tag == "map")
97  return TAG_MAP;
98  else if(tag == "omap")
99  return TAG_OMAP;
100  else if(tag == "pairs")
101  return TAG_PAIRS;
102  else if(tag == "set")
103  return TAG_SET;
104  else if(tag == "seq")
105  return TAG_SEQ;
106  else if(tag == "binary")
107  return TAG_BINARY;
108  else if(tag == "bool")
109  return TAG_BOOL;
110  else if(tag == "float")
111  return TAG_FLOAT;
112  else if(tag == "int")
113  return TAG_INT;
114  else if(tag == "merge")
115  return TAG_MERGE;
116  else if(tag == "null")
117  return TAG_NULL;
118  else if(tag == "str")
119  return TAG_STR;
120  else if(tag == "timestamp")
121  return TAG_TIMESTAMP;
122  else if(tag == "value")
123  return TAG_VALUE;
124  else if(tag == "yaml")
125  return TAG_YAML;
126 
127  return TAG_NONE;
128 }
129 
131 {
132  switch(tag)
133  {
134  case TAG_MAP:
135  return {"<tag:yaml.org,2002:map>"};
136  case TAG_OMAP:
137  return {"<tag:yaml.org,2002:omap>"};
138  case TAG_PAIRS:
139  return {"<tag:yaml.org,2002:pairs>"};
140  case TAG_SET:
141  return {"<tag:yaml.org,2002:set>"};
142  case TAG_SEQ:
143  return {"<tag:yaml.org,2002:seq>"};
144  case TAG_BINARY:
145  return {"<tag:yaml.org,2002:binary>"};
146  case TAG_BOOL:
147  return {"<tag:yaml.org,2002:bool>"};
148  case TAG_FLOAT:
149  return {"<tag:yaml.org,2002:float>"};
150  case TAG_INT:
151  return {"<tag:yaml.org,2002:int>"};
152  case TAG_MERGE:
153  return {"<tag:yaml.org,2002:merge>"};
154  case TAG_NULL:
155  return {"<tag:yaml.org,2002:null>"};
156  case TAG_STR:
157  return {"<tag:yaml.org,2002:str>"};
158  case TAG_TIMESTAMP:
159  return {"<tag:yaml.org,2002:timestamp>"};
160  case TAG_VALUE:
161  return {"<tag:yaml.org,2002:value>"};
162  case TAG_YAML:
163  return {"<tag:yaml.org,2002:yaml>"};
164  case TAG_NONE:
165  default:
166  return {""};
167  }
168 }
169 
170 csubstr from_tag(YamlTag_e tag)
171 {
172  switch(tag)
173  {
174  case TAG_MAP:
175  return {"!!map"};
176  case TAG_OMAP:
177  return {"!!omap"};
178  case TAG_PAIRS:
179  return {"!!pairs"};
180  case TAG_SET:
181  return {"!!set"};
182  case TAG_SEQ:
183  return {"!!seq"};
184  case TAG_BINARY:
185  return {"!!binary"};
186  case TAG_BOOL:
187  return {"!!bool"};
188  case TAG_FLOAT:
189  return {"!!float"};
190  case TAG_INT:
191  return {"!!int"};
192  case TAG_MERGE:
193  return {"!!merge"};
194  case TAG_NULL:
195  return {"!!null"};
196  case TAG_STR:
197  return {"!!str"};
198  case TAG_TIMESTAMP:
199  return {"!!timestamp"};
200  case TAG_VALUE:
201  return {"!!value"};
202  case TAG_YAML:
203  return {"!!yaml"};
204  case TAG_NONE:
205  default:
206  return {""};
207  }
208 }
209 
210 bool is_valid_tag_handle(csubstr handle)
211 {
212  if(handle.begins_with('!') && handle.ends_with('!'))
213  {
214  _c4dbgpf("handle={}", _prs(handle, true));
215  csubstr trimmed = handle.sub(1);
216  if(trimmed.ends_with('!'))
217  trimmed = trimmed.offs(0, 1);
218  _c4dbgpf("handle_trimmed={}", _prs(trimmed, true));
219  // https://yaml.org/spec/1.2.2/#rule-ns-word-char
220  for(char c : trimmed)
221  {
222  bool ok = (c >= '0' && c <= '9')
223  || (c >= 'a' && c <= 'z')
224  || (c >= 'A' && c <= 'Z')
225  || c == '-';
226  if(!ok)
227  {
228  _c4dbgpf("invalid handle character: '{}'", _c4prc(c));
229  return false;
230  }
231  }
232  return true;
233  }
234  return false;
235 }
236 
237 namespace {
/** Tells whether a character may appear in a tag suffix: an ASCII
 * letter, a decimal digit, or one of the punctuation characters
 * listed below. '%' is accepted here; validating the escape it
 * introduces is left to the caller.
 * See https://yaml.org/spec/1.2.2/#691-node-tags */
bool is_valid_tag_char(char c)
{
    if(c >= '0' && c <= '9')
        return true;
    if(c >= 'a' && c <= 'z')
        return true;
    if(c >= 'A' && c <= 'Z')
        return true;
    switch(c)
    {
    case '-': case '#': case ';': case '/': case '?':
    case ':': case '@': case '&': case '=': case '+':
    case '$': case '_': case '.': case '~': case '*':
    case '\'': case '(': case ')': case '%':
        return true;
    default:
        return false;
    }
}
272 bool read_hex_char(csubstr suffix, size_t pos, char *out)
273 {
274  // must be succeeded by 2 hex digits
275  if(pos + 3 > suffix.len)
276  return false;
277  suffix = suffix.range(pos + 1, pos + 3);
278  uint8_t val = 0;
279  if(C4_UNLIKELY(!read_hex(suffix, &val) || val > 127))
280  return false;
281  *out = static_cast<char>(val);
282  return true;
283 }
284 } // namespace
285 
286 
287 size_t transform_tag(substr output, csubstr handle, csubstr prefix, csubstr tag,
288  Callbacks const& callbacks, Location const& ymlloc,
289  bool with_brackets)
290 {
291  _RYML_ASSERT_BASIC_(callbacks, tag.len >= handle.len);
292  _RYML_ASSERT_BASIC_(callbacks, !output.overlaps(tag));
293  _RYML_ASSERT_BASIC_(callbacks, prefix.len > 0);
294  csubstr rest = tag.sub(handle.len);
295  _c4dbgpf("%TAG: rest={}", _prs(rest));
296  size_t rpos = 0, wpos = 0;
297  auto appendstr = [&](csubstr s) {
298  if(s.len && wpos + s.len <= output.len)
299  memcpy(output.str + wpos, s.str, s.len);
300  wpos += s.len;
301  };
302  auto appendchar = [&](char c) {
303  if(wpos < output.len)
304  output.str[wpos] = c;
305  ++wpos;
306  };
307  if(with_brackets)
308  appendchar('<');
309  appendstr(prefix);
310  const char *errmsg = nullptr;
311  for(size_t pos = 0; pos < rest.len; ++pos)
312  {
313  char c = rest.str[pos];
314  if(C4_LIKELY(is_valid_tag_char(c)))
315  {
316  if(c != '%')
317  continue;
318  else if(read_hex_char(rest, pos, &c))
319  {
320  appendstr(rest.range(rpos, pos));
321  appendchar(c);
322  pos += 2;
323  rpos = pos + 1;
324  continue;
325  }
326  }
327  errmsg = "invalid tag";
328  goto err; // NOLINT
329  }
330  appendstr(rest.sub(rpos));
331  if(with_brackets)
332  appendchar('>');
333  return wpos;
334 err:
335  if(ymlloc)
336  {
337  _RYML_ERR_PARSE_(callbacks, ymlloc, errmsg);
338  }
339  else
340  {
341  _RYML_ERR_BASIC_(callbacks, errmsg);
342  }
343 }
344 
345 
346 //-----------------------------------------------------------------------------
347 
348 id_type TagDirectives::size() const noexcept
349 {
350  // this assumes we have a very small number of tag directives
351  id_type i = 0;
352  for(; i < RYML_MAX_TAG_DIRECTIVES; ++i)
353  if(m_directives[i].handle.empty())
354  break;
355  return i;
356 }
357 
358 TagDirective const* TagDirectives::add(csubstr handle, csubstr prefix, id_type doc_id) noexcept
359 {
360  id_type pos = size();
361  TagDirective *C4_RESTRICT td = nullptr;
362  if(pos < RYML_MAX_TAG_DIRECTIVES)
363  {
364  td = &m_directives[pos];
365  td->handle = handle;
366  td->prefix = prefix;
367  td->doc_id = doc_id;
368  _c4dbgpf("tagd[{}]: added! handle={} prefix={} doc={}", pos, td->handle, td->prefix, td->doc_id);
369  }
370  return td;
371 }
372 
373 void TagDirectives::clear() noexcept
374 {
375  for(TagDirective &td : m_directives)
376  {
377  td.handle = {};
378  td.prefix = {};
379  td.doc_id = NONE;
380  }
381 }
382 
384 {
385  TagDirective const* first = nullptr;
386  TagDirective const* last = nullptr;
387  for(id_type i = 0; i < RYML_MAX_TAG_DIRECTIVES; ++i)
388  {
389  TagDirective const& C4_RESTRICT td = m_directives[i];
390  if(doc_id == td.doc_id)
391  {
392  first = m_directives + i;
393  break;
394  }
395  else if(td.handle.empty())
396  {
397  break;
398  }
399  }
400  if(first)
401  {
402  last = m_directives + RYML_MAX_TAG_DIRECTIVES;
403  for(TagDirective const* C4_RESTRICT td = first; td < last; ++td)
404  {
405  if(doc_id != td->doc_id || td->handle.empty())
406  {
407  last = td;
408  break;
409  }
410  }
411  }
412  else
413  {
414  first = last = m_directives;
415  }
416  return TagDirectiveRange{first, last};
417 }
418 
419 TagDirective const* TagDirectives::lookup(csubstr tag, id_type doc_id) const noexcept
420 {
421  _c4dbgpf("tagd: searching for {}, doc_id={}", _prs(tag), doc_id);
422  for(id_type i = 0; i < RYML_MAX_TAG_DIRECTIVES; ++i)
423  {
424  TagDirective const& C4_RESTRICT td = m_directives[i];
425  if(td.handle.empty())
426  {
427  continue;
428  }
429  _c4dbgpf("tagd[{}]: handle={} prefix={} doc_id={}", i, td.handle, td.prefix, td.doc_id);
430  if(tag.begins_with(td.handle))
431  {
432  if(td.handle == '!' && (
433  tag.begins_with("!!")
434  || tag.begins_with('<')
435  || tag.begins_with("!<")
436  || is_custom_tag(tag)))
437  continue;
438  _c4dbgpf("tagd[{}]: matches handle!", i);
439  if(doc_id == td.doc_id)
440  {
441  _c4dbgpf("tagd[{}]: matches doc={}!", i, doc_id);
442  return &td;
443  }
444  }
445  }
446  return nullptr;
447 }
448 
449 csubstr TagDirectives::resolve(substr buf, size_t *bufsz, csubstr tag, id_type id, Location const& ymlloc, Callbacks const& callbacks, bool with_brackets) const
450 {
451  _RYML_ASSERT_BASIC_(callbacks, !buf.overlaps(tag));
452  TagDirective const* C4_RESTRICT td = lookup(tag, id);
453  *bufsz = 0;
454  csubstr handle, prefix, ret;
455  const char *errmsg = nullptr;
456  size_t len;
457  if(td)
458  {
459  handle = td->handle;
460  prefix = td->prefix;
461  }
462  else
463  {
464  _c4dbgp("tagd: no directive found");
465  if(tag.begins_with('<'))
466  {
467  _c4dbgp("tagd: already resolved");
468  if(C4_UNLIKELY(!tag.ends_with('>')))
469  {
470  errmsg = "malformed tag";
471  goto err; // NOLINT
472  }
473  return tag;
474  }
475  else if(tag.begins_with("!<"))
476  {
477  _c4dbgp("tagd: already resolved");
478  if(C4_UNLIKELY(!tag.ends_with('>')))
479  {
480  errmsg = "malformed tag";
481  goto err; // NOLINT
482  }
483  return tag.sub(1);
484  }
485  else if(tag.begins_with("!!"))
486  {
487  _c4dbgp("tagd: !!");
488  YamlTag_e tagenum = to_tag(tag);
489  if(tagenum != TAG_NONE)
490  {
491  _c4dbgpf("tagd: standard tag: {} -> {}", tag, from_tag_long(tagenum));
492  tag = from_tag_long(tagenum);
493  return with_brackets ? tag : tag.offs(1, 1);
494  }
495  handle = "!!";
496  prefix = "tag:yaml.org,2002:";
497  }
498  else if(C4_UNLIKELY(is_custom_tag(tag)))
499  {
500  _c4dbgp("tagd: custom_tag");
501  _c4dbgpf("tag '{}' at id={}: no matching directive was found", tag, id);
502  errmsg = "tag without matching directive";
503  goto err; // NOLINT
504  }
505  else
506  {
507  _c4dbgp("tagd: !");
508  handle = prefix = "!";
509  }
510  }
511  len = transform_tag(buf, handle, prefix, tag, callbacks, ymlloc, with_brackets);
512  *bufsz = len;
513  if(len <= buf.len)
514  {
515  ret = buf.first(len);
516  }
517  else
518  {
519  _c4dbgp("tagd: not enough room");
520  ret.str = nullptr;
521  ret.len = len;
522  }
523  return ret;
524 err:
525  if(ymlloc)
526  {
527  _RYML_ERR_PARSE_(callbacks, ymlloc, errmsg);
528  }
529  else
530  {
531  _RYML_ERR_BASIC_(callbacks, errmsg);
532  }
533 }
534 
535 
536 //-----------------------------------------------------------------------------
537 TagCache::LookupResult TagCache::find(csubstr tag, id_type doc_id, id_type linear_threshold) const noexcept
538 {
539  LookupResult ret = {};
540  id_type sz = m_entries.size();
541  if(sz < linear_threshold) // do a linear search on small size
542  {
543  for(size_t i = 0; i < sz; ++i)
544  {
545  Entry const& C4_RESTRICT e = m_entries[i];
546  if(e.tag == tag && e.doc_id == doc_id)
547  {
548  ret.resolved = e.resolved;
549  ret.pos = i;
550  return ret;
551  }
552  else if(e.tag > tag || ((e.tag == tag) && e.doc_id > doc_id))
553  {
554  ret.pos = i;
555  return ret;
556  }
557  }
558  ret.pos = sz;
559  }
560  else // do a binary search on larger size
561  {
562  id_type first = 0;
563  id_type count = sz;
564  while(count)
565  {
566  id_type halfsz = count / id_type(2);
567  id_type mid = first + halfsz;
568  _RYML_ASSERT_BASIC_(m_entries.m_callbacks, mid < sz);
569  Entry const& C4_RESTRICT e = m_entries[mid];
570  if(e.tag < tag || (e.tag == tag && e.doc_id < doc_id))
571  {
572  first = mid + 1;
573  _RYML_ASSERT_BASIC_(m_entries.m_callbacks, count >= halfsz + 1);
574  count -= halfsz + 1;
575  }
576  else
577  {
578  count = halfsz;
579  }
580  }
581  ret.pos = first;
582  if(first < sz)
583  {
584  Entry const& C4_RESTRICT e = m_entries[first];
585  if(e.tag == tag && e.doc_id == doc_id)
586  {
587  ret.resolved = m_entries[first].resolved;
588  }
589  }
590  }
591  return ret;
592 }
593 
594 void TagCache::add(csubstr tag, csubstr resolved, id_type doc_id, const_iterator pos) RYML_NOEXCEPT
595 {
596  const id_type sz = m_entries.size();
597  _RYML_ASSERT_BASIC_(m_entries.m_callbacks, pos <= sz);
598  _RYML_ASSERT_BASIC_(m_entries.m_callbacks, pos == sz || tag < m_entries[pos].tag || (tag == m_entries[pos].tag && doc_id < m_entries[pos].doc_id));
599  m_entries.resize(sz + 1);
600  if(pos < sz)
601  memmove(m_entries.m_stack + pos + 1, m_entries.m_stack + pos, (sz - pos) * sizeof(Entry));
602  m_entries.m_stack[pos].tag = tag;
603  m_entries.m_stack[pos].resolved = resolved;
604  m_entries.m_stack[pos].doc_id = doc_id;
605  _c4dbgpf("tagcache: add entry @pos={}: docid={} {} -> {}", pos, doc_id, tag, _maybe_null_str(resolved));
606 }
607 
608 } // namespace yml
609 } // namespace c4
#define RYML_NOEXCEPT
Conditionally expands to noexcept when RYML_USE_ASSERT is 0 and is empty otherwise.
Definition: common.hpp:192
Error utilities used by ryml.
bool read_hex(csubstr s, I *v) noexcept
read a hexadecimal integer from a string.
Definition: charconv.hpp:889
csubstr from_tag_long(YamlTag_e tag)
Definition: tag.cpp:130
bool is_valid_tag_handle(csubstr handle)
Definition: tag.cpp:210
bool is_custom_tag(csubstr tag)
is a tag of the form !handle!tag?
Definition: tag.cpp:9
csubstr normalize_tag_long(csubstr tag)
Definition: tag.cpp:31
YamlTag_e
a bit mask for marking tags for types
Definition: tag.hpp:33
size_t transform_tag(substr output, csubstr handle, csubstr prefix, csubstr tag, Callbacks const &callbacks, Location const &ymlloc, bool with_brackets)
returns the length of the transformed tag, or 0 to signal that the tag is local and cannot be resolve...
Definition: tag.cpp:287
csubstr normalize_tag(csubstr tag)
Definition: tag.cpp:19
csubstr from_tag(YamlTag_e tag)
Definition: tag.cpp:170
YamlTag_e to_tag(csubstr tag)
Definition: tag.cpp:68
#define RYML_MAX_TAG_DIRECTIVES
the maximum number of tag directives in a Tree
Definition: tag.hpp:26
@ TAG_SET
!!set Unordered set of non-equal values.
Definition: tag.hpp:39
@ TAG_MERGE
!!merge Specify one or more mapping to be merged with the current one.
Definition: tag.hpp:46
@ TAG_INT
!!int Mathematical integers.
Definition: tag.hpp:45
@ TAG_SEQ
!!seq Sequence of arbitrary values.
Definition: tag.hpp:40
@ TAG_NULL
!!null Devoid of value.
Definition: tag.hpp:47
@ TAG_YAML
!!yaml Keys for encoding YAML in YAML https://yaml.org/type/yaml.html
Definition: tag.hpp:51
@ TAG_TIMESTAMP
!!timestamp A point in time https://yaml.org/type/timestamp.html
Definition: tag.hpp:49
@ TAG_NONE
Definition: tag.hpp:34
@ TAG_STR
!!str A sequence of zero or more Unicode characters.
Definition: tag.hpp:48
@ TAG_BOOL
!!bool Mathematical Booleans.
Definition: tag.hpp:43
@ TAG_MAP
!!map Unordered set of key: value pairs without duplicates.
Definition: tag.hpp:36
@ TAG_BINARY
!!binary A sequence of zero or more octets (8 bit values).
Definition: tag.hpp:42
@ TAG_PAIRS
!!pairs Ordered sequence of key: value pairs allowing duplicates.
Definition: tag.hpp:38
@ TAG_VALUE
!!value Specify the default value of a mapping https://yaml.org/type/value.html
Definition: tag.hpp:50
@ TAG_OMAP
!!omap Ordered sequence of key: value pairs without duplicates.
Definition: tag.hpp:37
@ TAG_FLOAT
!!float Floating-point approximation to real numbers.
Definition: tag.hpp:44
RYML_ID_TYPE id_type
The type of a node id in the YAML tree; to override the default type, define the macro RYML_ID_TYPE t...
Definition: common.hpp:244
@ npos
a null string position
Definition: common.hpp:258
@ NONE
an index to none
Definition: common.hpp:251
(Undefined by default) Use shorter error message from checks/asserts: do not show the check condition...
Definition: common.cpp:14
A c-style callbacks class to customize behavior on errors or allocation.
Definition: common.hpp:541
holds a source or yaml file position, for example when an error is detected; See also location_format...
Definition: common.hpp:284
LookupResult find(csubstr tag, id_type doc_id, id_type linear_threshold=Entries::sso_size) const noexcept
Definition: tag.cpp:537
id_type const_iterator
Definition: tag.hpp:79
void add(csubstr tag, csubstr resolved, id_type doc_id, const_iterator pos) RYML_NOEXCEPT
Definition: tag.cpp:594
csubstr handle
Eg.
Definition: tag.hpp:109
void clear() noexcept
Definition: tag.cpp:373
TagDirective m_directives[RYML_MAX_TAG_DIRECTIVES]
Definition: tag.hpp:127
id_type size() const noexcept
Definition: tag.cpp:348
TagDirectiveRange lookup_range(id_type doc_id) const noexcept
Definition: tag.cpp:383
csubstr resolve(substr buf, size_t *bufsz, csubstr tag, id_type doc_id, Location const &ymlloc, Callbacks const &callbacks, bool with_brackets=true) const
Definition: tag.cpp:449
TagDirective const * add(csubstr handle, csubstr prefix, id_type doc_id) noexcept
Definition: tag.cpp:358
TagDirective const * lookup(csubstr tag, id_type id) const noexcept
Definition: tag.cpp:419