rapidyaml 0.15.2
parse and emit YAML, and do it fast
Loading...
Searching...
No Matches
utf.cpp
Go to the documentation of this file.
1#include "c4/utf.hpp"
2#include "c4/charconv.hpp"
3
4namespace c4 {
5
6C4_SUPPRESS_WARNING_GCC_CLANG_WITH_PUSH("-Wold-style-cast")
7
8size_t decode_code_point(uint8_t *C4_RESTRICT buf, size_t buflen, const uint32_t code)
9{
10 C4_ASSERT(buf);
11 C4_ASSERT(buflen >= 4);
12 C4_UNUSED(buflen);
13 if (code <= UINT32_C(0x7f))
14 {
15 buf[0] = (uint8_t)code;
16 return 1u;
17 }
18 else if(code <= UINT32_C(0x7ff))
19 {
20 buf[0] = (uint8_t)(UINT32_C(0xc0) | (code >> 6u)); /* 110xxxxx */
21 buf[1] = (uint8_t)(UINT32_C(0x80) | (code & UINT32_C(0x3f))); /* 10xxxxxx */
22 return 2u;
23 }
24 else if(code <= UINT32_C(0xffff))
25 {
26 buf[0] = (uint8_t)(UINT32_C(0xe0) | ((code >> 12u))); /* 1110xxxx */ // NOLINT
27 buf[1] = (uint8_t)(UINT32_C(0x80) | ((code >> 6u) & UINT32_C(0x3f))); /* 10xxxxxx */
28 buf[2] = (uint8_t)(UINT32_C(0x80) | ((code ) & UINT32_C(0x3f))); /* 10xxxxxx */ // NOLINT
29 return 3u;
30 }
31 else if(code <= UINT32_C(0x10ffff))
32 {
33 buf[0] = (uint8_t)(UINT32_C(0xf0) | ((code >> 18u))); /* 11110xxx */ // NOLINT
34 buf[1] = (uint8_t)(UINT32_C(0x80) | ((code >> 12u) & UINT32_C(0x3f))); /* 10xxxxxx */
35 buf[2] = (uint8_t)(UINT32_C(0x80) | ((code >> 6u) & UINT32_C(0x3f))); /* 10xxxxxx */
36 buf[3] = (uint8_t)(UINT32_C(0x80) | ((code ) & UINT32_C(0x3f))); /* 10xxxxxx */ // NOLINT
37 return 4u;
38 }
39 return 0;
40}
41
43{
44 C4_ASSERT(out.len >= 4);
45 C4_ASSERT(!code_point.begins_with("U+"));
46 C4_ASSERT(!code_point.begins_with("\\x"));
47 C4_ASSERT(!code_point.begins_with("\\u"));
48 C4_ASSERT(!code_point.begins_with("\\U"));
49 C4_ASSERT(!code_point.begins_with('0'));
50 C4_ASSERT(code_point.len <= 8);
51 C4_ASSERT(code_point.len > 0);
52 uint32_t code_point_val;
53 C4_CHECK(read_hex(code_point, &code_point_val));
54 size_t ret = decode_code_point((uint8_t*)out.str, out.len, code_point_val);
55 C4_ASSERT(ret <= 4);
56 return out.first(ret);
57}
58
60{
61 #define c4check2_(s, c0, c1) ((s).len >= 2) && (((s).str[0] == (c0)) && ((s).str[1] == (c1)))
62 #define c4check3_(s, c0, c1, c2) ((s).len >= 3) && (((s).str[0] == (c0)) && ((s).str[1] == (c1)) && ((s).str[2] == (c2)))
63 #define c4check4_(s, c0, c1, c2, c3) ((s).len >= 4) && (((s).str[0] == (c0)) && ((s).str[1] == (c1)) && ((s).str[2] == (c2)) && ((s).str[3] == (c3)))
64 // see https://en.wikipedia.org/wiki/Byte_order_mark#Byte-order_marks_by_encoding
65 if(s.len < 2u)
66 return false;
67 else if(c4check3_(s, '\xef', '\xbb', '\xbf')) // UTF-8
68 return 3u;
69 else if(c4check4_(s, '\x00', '\x00', '\xfe', '\xff')) // UTF-32BE
70 return 4u;
71 else if(c4check4_(s, '\xff', '\xfe', '\x00', '\x00')) // UTF-32LE
72 return 4u;
73 else if(c4check2_(s, '\xfe', '\xff')) // UTF-16BE
74 return 2u;
75 else if(c4check2_(s, '\xff', '\xfe')) // UTF-16BE
76 return 2u;
77 else if(c4check3_(s, '\x2b', '\x2f', '\x76')) // UTF-7
78 return 3u;
79 else if(c4check3_(s, '\xf7', '\x64', '\x4c')) // UTF-1
80 return 3u;
81 else if(c4check4_(s, '\xdd', '\x73', '\x66', '\x73')) // UTF-EBCDIC
82 return 4u;
83 else if(c4check3_(s, '\x0e', '\xfe', '\xff')) // SCSU
84 return 3u;
85 else if(c4check3_(s, '\xfb', '\xee', '\x28')) // BOCU-1
86 return 3u;
87 else if(c4check4_(s, '\x84', '\x31', '\x95', '\x33')) // GB18030
88 return 4u;
89 return 0u;
90 #undef c4check2_
91 #undef c4check3_
92 #undef c4check4_
93}
94
96{
97 return s.first(first_non_bom(s));
98}
100{
 // Returns the leading first_non_bom(s) characters of s, i.e. the Byte Order
 // Mark, which is empty when first_non_bom(s) is 0.
 // NOTE(review): the signature line (utf.cpp:99) is absent from this listing;
 // presumably this is the csubstr overload of get_bom -- confirm against the
 // original source.
101 return s.first(first_non_bom(s));
102}
104{
105 return s.sub(first_non_bom(s));
106}
108{
 // Returns the part of s starting at position first_non_bom(s), i.e. s without
 // its Byte Order Mark; the whole of s when first_non_bom(s) is 0.
 // NOTE(review): the signature line (utf.cpp:107) is absent from this listing;
 // presumably this is the csubstr overload of skip_bom -- confirm against the
 // original source.
109 return s.sub(first_non_bom(s));
110}
111
112C4_SUPPRESS_WARNING_GCC_CLANG_POP
113
114} // namespace c4
Lightweight generic type-safe wrappers for converting individual values to/from strings.
bool read_hex(csubstr s, I *v) noexcept
read a hexadecimal integer from a string.
Definition charconv.hpp:902
basic_substring< char > substr
a mutable string view
Definition substr.hpp:2355
basic_substring< const char > csubstr
an immutable string view
Definition substr.hpp:2356
substr skip_bom(substr s)
skip the Byte Order Mark, or get the full string if there is no Byte Order Mark.
Definition utf.cpp:103
substr get_bom(substr s)
get the Byte Order Mark, or an empty string if there is no Byte Order Mark
Definition utf.cpp:95
size_t first_non_bom(csubstr s)
return the position of the first character not belonging to the Byte Order Mark, or 0 if there is no Byte Order Mark.
Definition utf.cpp:59
substr decode_code_point(substr out, csubstr code_point)
decode the given code_point, writing into the output string in out.
Definition utf.cpp:42
bool begins_with(const C c) const noexcept
true if the first character of the string is c
Definition substr.hpp:850
size_t len
the length of the substring
Definition substr.hpp:218
basic_substring first(size_t num) const noexcept
return the first num elements: [0,num[
Definition substr.hpp:529
basic_substring sub(size_t first) const noexcept
return [first,len[
Definition substr.hpp:502
C * str
a restricted pointer to the first character of the substring
Definition substr.hpp:216
#define c4check3_(s, c0, c1, c2)
#define c4check2_(s, c0, c1)
#define c4check4_(s, c0, c1, c2, c3)
utilities for UTF and Byte Order Mark