rapidyaml 0.15.2
parse and emit YAML, and do it fast
Loading...
Searching...
No Matches
UTF utilities

Functions

substr c4::skip_bom (substr s)
 skip the Byte Order Mark, or get the full string if there is no Byte Order Mark.
csubstr c4::skip_bom (csubstr s)
 skip the Byte Order Mark, or get the full string if there is no Byte Order Mark
substr c4::get_bom (substr s)
 get the Byte Order Mark, or an empty string if there is no Byte Order Mark
csubstr c4::get_bom (csubstr s)
 get the Byte Order Mark, or an empty string if there is no Byte Order Mark
size_t c4::first_non_bom (csubstr s)
 return the position of the first character not belonging to the Byte Order Mark, or 0 if there is no Byte Order Mark.
substr c4::decode_code_point (substr out, csubstr code_point)
 decode the given code_point, writing into the output string in out.
size_t c4::decode_code_point (uint8_t *buf, size_t buflen, uint32_t code)
 decode the given code point into its UTF-8 byte sequence, writing into the output string buf, of size buflen

Detailed Description

Function Documentation

◆ skip_bom() [1/2]

substr c4::skip_bom ( substr s)

skip the Byte Order Mark, or get the full string if there is no Byte Order Mark.

See also
Implements the Byte Order Marks as described in https://en.wikipedia.org/wiki/Byte_order_mark#Byte-order_marks_by_encoding

Definition at line 103 of file utf.cpp.

104{
105 return s.sub(first_non_bom(s));
106}
size_t first_non_bom(csubstr s)
return the position of the first character not belonging to the Byte Order Mark, or 0 if there is no ...
Definition utf.cpp:59
basic_substring sub(size_t first) const noexcept
return [first,len[
Definition substr.hpp:502

◆ skip_bom() [2/2]

csubstr c4::skip_bom ( csubstr s)

skip the Byte Order Mark, or get the full string if there is no Byte Order Mark

See also
Implements the Byte Order Marks as described in https://en.wikipedia.org/wiki/Byte_order_mark#Byte-order_marks_by_encoding

Definition at line 107 of file utf.cpp.

108{
109 return s.sub(first_non_bom(s));
110}

◆ get_bom() [1/2]

substr c4::get_bom ( substr s)

get the Byte Order Mark, or an empty string if there is no Byte Order Mark

See also
Implements the Byte Order Marks as described in https://en.wikipedia.org/wiki/Byte_order_mark#Byte-order_marks_by_encoding

Definition at line 95 of file utf.cpp.

96{
97 return s.first(first_non_bom(s));
98}
basic_substring first(size_t num) const noexcept
return the first num elements: [0,num[
Definition substr.hpp:529

◆ get_bom() [2/2]

csubstr c4::get_bom ( csubstr s)

get the Byte Order Mark, or an empty string if there is no Byte Order Mark

See also
Implements the Byte Order Marks as described in https://en.wikipedia.org/wiki/Byte_order_mark#Byte-order_marks_by_encoding

Definition at line 99 of file utf.cpp.

100{
101 return s.first(first_non_bom(s));
102}

◆ first_non_bom()

size_t c4::first_non_bom ( csubstr s)

return the position of the first character not belonging to the Byte Order Mark, or 0 if there is no Byte Order Mark.

See also
Implements the Byte Order Marks as described in https://en.wikipedia.org/wiki/Byte_order_mark#Byte-order_marks_by_encoding

Definition at line 59 of file utf.cpp.

60{
61 #define c4check2_(s, c0, c1) ((s).len >= 2) && (((s).str[0] == (c0)) && ((s).str[1] == (c1)))
62 #define c4check3_(s, c0, c1, c2) ((s).len >= 3) && (((s).str[0] == (c0)) && ((s).str[1] == (c1)) && ((s).str[2] == (c2)))
63 #define c4check4_(s, c0, c1, c2, c3) ((s).len >= 4) && (((s).str[0] == (c0)) && ((s).str[1] == (c1)) && ((s).str[2] == (c2)) && ((s).str[3] == (c3)))
64 // see https://en.wikipedia.org/wiki/Byte_order_mark#Byte-order_marks_by_encoding
65 if(s.len < 2u)
66 return false;
67 else if(c4check3_(s, '\xef', '\xbb', '\xbf')) // UTF-8
68 return 3u;
69 else if(c4check4_(s, '\x00', '\x00', '\xfe', '\xff')) // UTF-32BE
70 return 4u;
71 else if(c4check4_(s, '\xff', '\xfe', '\x00', '\x00')) // UTF-32LE
72 return 4u;
73 else if(c4check2_(s, '\xfe', '\xff')) // UTF-16BE
74 return 2u;
75 else if(c4check2_(s, '\xff', '\xfe')) // UTF-16LE
76 return 2u;
77 else if(c4check3_(s, '\x2b', '\x2f', '\x76')) // UTF-7
78 return 3u;
79 else if(c4check3_(s, '\xf7', '\x64', '\x4c')) // UTF-1
80 return 3u;
81 else if(c4check4_(s, '\xdd', '\x73', '\x66', '\x73')) // UTF-EBCDIC
82 return 4u;
83 else if(c4check3_(s, '\x0e', '\xfe', '\xff')) // SCSU
84 return 3u;
85 else if(c4check3_(s, '\xfb', '\xee', '\x28')) // BOCU-1
86 return 3u;
87 else if(c4check4_(s, '\x84', '\x31', '\x95', '\x33')) // GB18030
88 return 4u;
89 return 0u;
90 #undef c4check2_
91 #undef c4check3_
92 #undef c4check4_
93}
size_t len
the length of the substring
Definition substr.hpp:218
#define c4check3_(s, c0, c1, c2)
#define c4check2_(s, c0, c1)
#define c4check4_(s, c0, c1, c2, c3)

Referenced by get_bom(), get_bom(), skip_bom(), and skip_bom().

◆ decode_code_point() [1/2]

substr c4::decode_code_point ( substr out,
csubstr code_point )

decode the given code_point, writing into the output string in out.

Parameters
out: the output string. must have at least 4 bytes (this is asserted), and must not be a null string.
code_point: the code point as a hexadecimal string. must have length in ]0,8], and must not begin with any of
`U+`,`\x`,`\u`,`\U`,`0` 
(asserted)
Returns
the part of out that was written, which will always be at most 4 bytes.

Definition at line 42 of file utf.cpp.

43{
44 C4_ASSERT(out.len >= 4);
45 C4_ASSERT(!code_point.begins_with("U+"));
46 C4_ASSERT(!code_point.begins_with("\\x"));
47 C4_ASSERT(!code_point.begins_with("\\u"));
48 C4_ASSERT(!code_point.begins_with("\\U"));
49 C4_ASSERT(!code_point.begins_with('0'));
50 C4_ASSERT(code_point.len <= 8);
51 C4_ASSERT(code_point.len > 0);
52 uint32_t code_point_val;
53 C4_CHECK(read_hex(code_point, &code_point_val));
54 size_t ret = decode_code_point((uint8_t*)out.str, out.len, code_point_val);
55 C4_ASSERT(ret <= 4);
56 return out.first(ret);
57}
bool read_hex(csubstr s, I *v) noexcept
read a hexadecimal integer from a string.
Definition charconv.hpp:902
substr decode_code_point(substr out, csubstr code_point)
decode the given code_point, writing into the output string in out.
Definition utf.cpp:42
bool begins_with(const C c) const noexcept
true if the first character of the string is c
Definition substr.hpp:850
C * str
a restricted pointer to the first character of the substring
Definition substr.hpp:216

Referenced by decode_code_point(), and decode_code_point().

◆ decode_code_point() [2/2]

size_t c4::decode_code_point ( uint8_t * buf,
size_t buflen,
uint32_t code )

decode the given code point into its UTF-8 byte sequence, writing into the output string buf, of size buflen

Parameters
buf: the output string. must have at least 4 bytes (this is asserted), and must not be null
buflen: the length of the output string. must be at least 4
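code: the numeric value of the code point to write as UTF-8.
Because this overload takes a number rather than a string, the length and prefix restrictions (`U+`,`\x`,`\u`,`\U`,`0`) of the string overload do not apply.
If code is greater than 0x10ffff, nothing is written and 0 is returned.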
Returns
the number of bytes written, which will always be at most 4; 0 if code is greater than 0x10ffff.

Definition at line 8 of file utf.cpp.

9{
10 C4_ASSERT(buf);
11 C4_ASSERT(buflen >= 4);
12 C4_UNUSED(buflen);
13 if (code <= UINT32_C(0x7f))
14 {
15 buf[0] = (uint8_t)code;
16 return 1u;
17 }
18 else if(code <= UINT32_C(0x7ff))
19 {
20 buf[0] = (uint8_t)(UINT32_C(0xc0) | (code >> 6u)); /* 110xxxxx */
21 buf[1] = (uint8_t)(UINT32_C(0x80) | (code & UINT32_C(0x3f))); /* 10xxxxxx */
22 return 2u;
23 }
24 else if(code <= UINT32_C(0xffff))
25 {
26 buf[0] = (uint8_t)(UINT32_C(0xe0) | ((code >> 12u))); /* 1110xxxx */ // NOLINT
27 buf[1] = (uint8_t)(UINT32_C(0x80) | ((code >> 6u) & UINT32_C(0x3f))); /* 10xxxxxx */
28 buf[2] = (uint8_t)(UINT32_C(0x80) | ((code ) & UINT32_C(0x3f))); /* 10xxxxxx */ // NOLINT
29 return 3u;
30 }
31 else if(code <= UINT32_C(0x10ffff))
32 {
33 buf[0] = (uint8_t)(UINT32_C(0xf0) | ((code >> 18u))); /* 11110xxx */ // NOLINT
34 buf[1] = (uint8_t)(UINT32_C(0x80) | ((code >> 12u) & UINT32_C(0x3f))); /* 10xxxxxx */
35 buf[2] = (uint8_t)(UINT32_C(0x80) | ((code >> 6u) & UINT32_C(0x3f))); /* 10xxxxxx */
36 buf[3] = (uint8_t)(UINT32_C(0x80) | ((code ) & UINT32_C(0x3f))); /* 10xxxxxx */ // NOLINT
37 return 4u;
38 }
39 return 0;
40}