Ada 2.8.0
Fast spec-compliant URL parser
Loading...
Searching...
No Matches
unicode.cpp
Go to the documentation of this file.
1#include "ada.h"
3#include "ada/common_defs.h"
4#include "ada/unicode.h"
5
7#include "ada_idna.cpp"
9
10#include <algorithm>
11#if ADA_NEON
12#include <arm_neon.h>
13#elif ADA_SSE2
14#include <emmintrin.h>
15#endif
16
17namespace ada::unicode {
18
19constexpr uint64_t broadcast(uint8_t v) noexcept {
20 return 0x101010101010101ull * v;
21}
22
23constexpr bool to_lower_ascii(char* input, size_t length) noexcept {
24 uint64_t broadcast_80 = broadcast(0x80);
25 uint64_t broadcast_Ap = broadcast(128 - 'A');
26 uint64_t broadcast_Zp = broadcast(128 - 'Z' - 1);
27 uint64_t non_ascii = 0;
28 size_t i = 0;
29
30 for (; i + 7 < length; i += 8) {
31 uint64_t word{};
32 memcpy(&word, input + i, sizeof(word));
33 non_ascii |= (word & broadcast_80);
34 word ^=
35 (((word + broadcast_Ap) ^ (word + broadcast_Zp)) & broadcast_80) >> 2;
36 memcpy(input + i, &word, sizeof(word));
37 }
38 if (i < length) {
39 uint64_t word{};
40 memcpy(&word, input + i, length - i);
41 non_ascii |= (word & broadcast_80);
42 word ^=
43 (((word + broadcast_Ap) ^ (word + broadcast_Zp)) & broadcast_80) >> 2;
44 memcpy(input + i, &word, length - i);
45 }
46 return non_ascii == 0;
47}
48#if ADA_NEON
49ada_really_inline bool has_tabs_or_newline(
50 std::string_view user_input) noexcept {
51 // first check for short strings in which case we do it naively.
52 if (user_input.size() < 16) { // slow path
53 for (size_t i = 0; i < user_input.size(); i++) {
54 if (user_input[i] == '\r' || user_input[i] == '\n' ||
55 user_input[i] == '\t') {
56 return true;
57 }
58 }
59 return false;
60 }
61 // fast path for long strings (expected to be common)
62 size_t i = 0;
75 static uint8_t rnt_array[16] = {1, 0, 0, 0, 0, 0, 0, 0,
76 0, 9, 10, 0, 0, 13, 0, 0};
77 const uint8x16_t rnt = vld1q_u8(rnt_array);
78 // m['0xd', '0xa', '0x9']
79 uint8x16_t running{0};
80 for (; i + 15 < user_input.size(); i += 16) {
81 uint8x16_t word = vld1q_u8((const uint8_t*)user_input.data() + i);
82
83 running = vorrq_u8(running, vceqq_u8(vqtbl1q_u8(rnt, word), word));
84 }
85 if (i < user_input.size()) {
86 uint8x16_t word =
87 vld1q_u8((const uint8_t*)user_input.data() + user_input.length() - 16);
88 running = vorrq_u8(running, vceqq_u8(vqtbl1q_u8(rnt, word), word));
89 }
90 return vmaxvq_u32(vreinterpretq_u32_u8(running)) != 0;
91}
92#elif ADA_SSE2
93ada_really_inline bool has_tabs_or_newline(
94 std::string_view user_input) noexcept {
95 // first check for short strings in which case we do it naively.
96 if (user_input.size() < 16) { // slow path
97 for (size_t i = 0; i < user_input.size(); i++) {
98 if (user_input[i] == '\r' || user_input[i] == '\n' ||
99 user_input[i] == '\t') {
100 return true;
101 }
102 }
103 return false;
104 }
105 // fast path for long strings (expected to be common)
106 size_t i = 0;
107 const __m128i mask1 = _mm_set1_epi8('\r');
108 const __m128i mask2 = _mm_set1_epi8('\n');
109 const __m128i mask3 = _mm_set1_epi8('\t');
110 // If we supported SSSE3, we could use the algorithm that we use for NEON.
111 __m128i running{0};
112 for (; i + 15 < user_input.size(); i += 16) {
113 __m128i word = _mm_loadu_si128((const __m128i*)(user_input.data() + i));
114 running = _mm_or_si128(
115 _mm_or_si128(running, _mm_or_si128(_mm_cmpeq_epi8(word, mask1),
116 _mm_cmpeq_epi8(word, mask2))),
117 _mm_cmpeq_epi8(word, mask3));
118 }
119 if (i < user_input.size()) {
120 __m128i word = _mm_loadu_si128(
121 (const __m128i*)(user_input.data() + user_input.length() - 16));
122 running = _mm_or_si128(
123 _mm_or_si128(running, _mm_or_si128(_mm_cmpeq_epi8(word, mask1),
124 _mm_cmpeq_epi8(word, mask2))),
125 _mm_cmpeq_epi8(word, mask3));
126 }
127 return _mm_movemask_epi8(running) != 0;
128}
129#else
130ada_really_inline bool has_tabs_or_newline(
131 std::string_view user_input) noexcept {
132 auto has_zero_byte = [](uint64_t v) {
133 return ((v - 0x0101010101010101) & ~(v) & 0x8080808080808080);
134 };
135 size_t i = 0;
136 uint64_t mask1 = broadcast('\r');
137 uint64_t mask2 = broadcast('\n');
138 uint64_t mask3 = broadcast('\t');
139 uint64_t running{0};
140 for (; i + 7 < user_input.size(); i += 8) {
141 uint64_t word{};
142 memcpy(&word, user_input.data() + i, sizeof(word));
143 uint64_t xor1 = word ^ mask1;
144 uint64_t xor2 = word ^ mask2;
145 uint64_t xor3 = word ^ mask3;
146 running |= has_zero_byte(xor1) | has_zero_byte(xor2) | has_zero_byte(xor3);
147 }
148 if (i < user_input.size()) {
149 uint64_t word{};
150 memcpy(&word, user_input.data() + i, user_input.size() - i);
151 uint64_t xor1 = word ^ mask1;
152 uint64_t xor2 = word ^ mask2;
153 uint64_t xor3 = word ^ mask3;
154 running |= has_zero_byte(xor1) | has_zero_byte(xor2) | has_zero_byte(xor3);
155 }
156 return running;
157}
158#endif
159
160// A forbidden host code point is U+0000 NULL, U+0009 TAB, U+000A LF, U+000D CR,
161// U+0020 SPACE, U+0023 (#), U+002F (/), U+003A (:), U+003C (<), U+003E (>),
162// U+003F (?), U+0040 (@), U+005B ([), U+005C (\‍), U+005D (]), U+005E (^), or
163// U+007C (|).
164constexpr static std::array<uint8_t, 256> is_forbidden_host_code_point_table =
165 []() constexpr {
166 std::array<uint8_t, 256> result{};
167 for (uint8_t c : {'\0', '\x09', '\x0a', '\x0d', ' ', '#', '/', ':', '<',
168 '>', '?', '@', '[', '\\', ']', '^', '|'}) {
169 result[c] = true;
170 }
171 return result;
172 }();
173
174ada_really_inline constexpr bool is_forbidden_host_code_point(
175 const char c) noexcept {
176 return is_forbidden_host_code_point_table[uint8_t(c)];
177}
178
179constexpr static std::array<uint8_t, 256> is_forbidden_domain_code_point_table =
180 []() constexpr {
181 std::array<uint8_t, 256> result{};
182 for (uint8_t c : {'\0', '\x09', '\x0a', '\x0d', ' ', '#', '/', ':', '<',
183 '>', '?', '@', '[', '\\', ']', '^', '|', '%'}) {
184 result[c] = true;
185 }
186 for (uint8_t c = 0; c <= 32; c++) {
187 result[c] = true;
188 }
189 for (size_t c = 127; c < 255; c++) {
190 result[c] = true;
191 }
192 return result;
193 }();
194
195static_assert(sizeof(is_forbidden_domain_code_point_table) == 256);
196
197ada_really_inline constexpr bool is_forbidden_domain_code_point(
198 const char c) noexcept {
199 return is_forbidden_domain_code_point_table[uint8_t(c)];
200}
201
202ada_really_inline constexpr bool contains_forbidden_domain_code_point(
203 const char* input, size_t length) noexcept {
204 size_t i = 0;
205 uint8_t accumulator{};
206 for (; i + 4 <= length; i += 4) {
207 accumulator |= is_forbidden_domain_code_point_table[uint8_t(input[i])];
208 accumulator |= is_forbidden_domain_code_point_table[uint8_t(input[i + 1])];
209 accumulator |= is_forbidden_domain_code_point_table[uint8_t(input[i + 2])];
210 accumulator |= is_forbidden_domain_code_point_table[uint8_t(input[i + 3])];
211 }
212 for (; i < length; i++) {
213 accumulator |= is_forbidden_domain_code_point_table[uint8_t(input[i])];
214 }
215 return accumulator;
216}
217
218constexpr static std::array<uint8_t, 256>
220 std::array<uint8_t, 256> result{};
221 for (uint8_t c : {'\0', '\x09', '\x0a', '\x0d', ' ', '#', '/', ':', '<',
222 '>', '?', '@', '[', '\\', ']', '^', '|', '%'}) {
223 result[c] = 1;
224 }
225 for (uint8_t c = 'A'; c <= 'Z'; c++) {
226 result[c] = 2;
227 }
228 for (uint8_t c = 0; c <= 32; c++) {
229 result[c] = 1;
230 }
231 for (size_t c = 127; c < 255; c++) {
232 result[c] = 1;
233 }
234 return result;
235 }();
236
237ada_really_inline constexpr uint8_t
238contains_forbidden_domain_code_point_or_upper(const char* input,
239 size_t length) noexcept {
240 size_t i = 0;
241 uint8_t accumulator{};
242 for (; i + 4 <= length; i += 4) {
243 accumulator |=
245 accumulator |=
247 accumulator |=
249 accumulator |=
251 }
252 for (; i < length; i++) {
253 accumulator |=
255 }
256 return accumulator;
257}
258
259// std::isalnum(c) || c == '+' || c == '-' || c == '.') is true for
260constexpr static std::array<bool, 256> is_alnum_plus_table = []() constexpr {
261 std::array<bool, 256> result{};
262 for (size_t c = 0; c < 256; c++) {
263 result[c] = (c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') ||
264 (c >= 'A' && c <= 'Z') || c == '+' || c == '-' || c == '.';
265 }
266 return result;
267}();
268
269ada_really_inline constexpr bool is_alnum_plus(const char c) noexcept {
270 return is_alnum_plus_table[uint8_t(c)];
271 // A table is almost surely much faster than the
272 // following under most compilers: return
273 // return (std::isalnum(c) || c == '+' || c == '-' || c == '.');
274}
275
276ada_really_inline constexpr bool is_ascii_hex_digit(const char c) noexcept {
277 return (c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') ||
278 (c >= 'a' && c <= 'f');
279}
280
281ada_really_inline constexpr bool is_c0_control_or_space(const char c) noexcept {
282 return (unsigned char)c <= ' ';
283}
284
285ada_really_inline constexpr bool is_ascii_tab_or_newline(
286 const char c) noexcept {
287 return c == '\t' || c == '\n' || c == '\r';
288}
289
290constexpr std::string_view table_is_double_dot_path_segment[] = {
291 "..", "%2e.", ".%2e", "%2e%2e"};
292
293ada_really_inline ada_constexpr bool is_double_dot_path_segment(
294 std::string_view input) noexcept {
295 // This will catch most cases:
296 // The length must be 2,4 or 6.
297 // We divide by two and require
298 // that the result be between 1 and 3 inclusively.
299 uint64_t half_length = uint64_t(input.size()) / 2;
300 if (half_length - 1 > 2) {
301 return false;
302 }
303 // We have a string of length 2, 4 or 6.
304 // We now check the first character:
305 if ((input[0] != '.') && (input[0] != '%')) {
306 return false;
307 }
308 // We are unlikely the get beyond this point.
309 int hash_value = (input.size() + (unsigned)(input[0])) & 3;
310 const std::string_view target = table_is_double_dot_path_segment[hash_value];
311 if (target.size() != input.size()) {
312 return false;
313 }
314 // We almost never get here.
315 // Optimizing the rest is relatively unimportant.
316 auto prefix_equal_unsafe = [](std::string_view a, std::string_view b) {
317 uint16_t A, B;
318 memcpy(&A, a.data(), sizeof(A));
319 memcpy(&B, b.data(), sizeof(B));
320 return A == B;
321 };
322 if (!prefix_equal_unsafe(input, target)) {
323 return false;
324 }
325 for (size_t i = 2; i < input.size(); i++) {
326 char c = input[i];
327 if ((uint8_t((c | 0x20) - 0x61) <= 25 ? (c | 0x20) : c) != target[i]) {
328 return false;
329 }
330 }
331 return true;
332 // The above code might be a bit better than the code below. Compilers
333 // are not stupid and may use the fact that these strings have length 2,4 and
334 // 6 and other tricks.
335 // return input == ".." ||
336 // input == ".%2e" || input == ".%2E" ||
337 // input == "%2e." || input == "%2E." ||
338 // input == "%2e%2e" || input == "%2E%2E" || input == "%2E%2e" || input ==
339 // "%2e%2E";
340}
341
342ada_really_inline constexpr bool is_single_dot_path_segment(
343 std::string_view input) noexcept {
344 return input == "." || input == "%2e" || input == "%2E";
345}
346
347ada_really_inline constexpr bool is_lowercase_hex(const char c) noexcept {
348 return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f');
349}
350
351constexpr static char hex_to_binary_table[] = {
352 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 0, 0, 0, 0, 0, 0, 10, 11,
353 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
354 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 11, 12, 13, 14, 15};
355unsigned constexpr convert_hex_to_binary(const char c) noexcept {
356 return hex_to_binary_table[c - '0'];
357}
358
359std::string percent_decode(const std::string_view input, size_t first_percent) {
360 // next line is for safety only, we expect users to avoid calling
361 // percent_decode when first_percent is outside the range.
362 if (first_percent == std::string_view::npos) {
363 return std::string(input);
364 }
365 std::string dest;
366 dest.reserve(input.length());
367 dest.append(input.substr(0, first_percent));
368 const char* pointer = input.data() + first_percent;
369 const char* end = input.data() + input.size();
370 // Optimization opportunity: if the following code gets
371 // called often, it can be optimized quite a bit.
372 while (pointer < end) {
373 const char ch = pointer[0];
374 size_t remaining = end - pointer - 1;
375 if (ch != '%' || remaining < 2 ||
376 ( // ch == '%' && // It is unnecessary to check that ch == '%'.
377 (!is_ascii_hex_digit(pointer[1]) ||
378 !is_ascii_hex_digit(pointer[2])))) {
379 dest += ch;
380 pointer++;
381 continue;
382 } else {
383 unsigned a = convert_hex_to_binary(pointer[1]);
384 unsigned b = convert_hex_to_binary(pointer[2]);
385 char c = static_cast<char>(a * 16 + b);
386 dest += c;
387 pointer += 3;
388 }
389 }
390 return dest;
391}
392
393std::string percent_encode(const std::string_view input,
394 const uint8_t character_set[]) {
395 auto pointer =
396 std::find_if(input.begin(), input.end(), [character_set](const char c) {
397 return character_sets::bit_at(character_set, c);
398 });
399 // Optimization: Don't iterate if percent encode is not required
400 if (pointer == input.end()) {
401 return std::string(input);
402 }
403
404 std::string result;
405 result.reserve(input.length()); // in the worst case, percent encoding might
406 // produce 3 characters.
407 result.append(input.substr(0, std::distance(input.begin(), pointer)));
408
409 for (; pointer != input.end(); pointer++) {
410 if (character_sets::bit_at(character_set, *pointer)) {
411 result.append(character_sets::hex + uint8_t(*pointer) * 4, 3);
412 } else {
413 result += *pointer;
414 }
415 }
416
417 return result;
418}
419
420template <bool append>
421bool percent_encode(const std::string_view input, const uint8_t character_set[],
422 std::string& out) {
423 ada_log("percent_encode ", input, " to output string while ",
424 append ? "appending" : "overwriting");
425 auto pointer =
426 std::find_if(input.begin(), input.end(), [character_set](const char c) {
427 return character_sets::bit_at(character_set, c);
428 });
429 ada_log("percent_encode done checking, moved to ",
430 std::distance(input.begin(), pointer));
431
432 // Optimization: Don't iterate if percent encode is not required
433 if (pointer == input.end()) {
434 ada_log("percent_encode encoding not needed.");
435 return false;
436 }
437 if (!append) {
438 out.clear();
439 }
440 ada_log("percent_encode appending ", std::distance(input.begin(), pointer),
441 " bytes");
442 out.append(input.data(), std::distance(input.begin(), pointer));
443 ada_log("percent_encode processing ", std::distance(pointer, input.end()),
444 " bytes");
445 for (; pointer != input.end(); pointer++) {
446 if (character_sets::bit_at(character_set, *pointer)) {
447 out.append(character_sets::hex + uint8_t(*pointer) * 4, 3);
448 } else {
449 out += *pointer;
450 }
451 }
452 return true;
453}
454
455bool to_ascii(std::optional<std::string>& out, const std::string_view plain,
456 size_t first_percent) {
457 std::string percent_decoded_buffer;
458 std::string_view input = plain;
459 if (first_percent != std::string_view::npos) {
460 percent_decoded_buffer = unicode::percent_decode(plain, first_percent);
461 input = percent_decoded_buffer;
462 }
463 // input is a non-empty UTF-8 string, must be percent decoded
464 std::string idna_ascii = ada::idna::to_ascii(input);
465 if (idna_ascii.empty() || contains_forbidden_domain_code_point(
466 idna_ascii.data(), idna_ascii.size())) {
467 return false;
468 }
469 out = std::move(idna_ascii);
470 return true;
471}
472
473std::string percent_encode(const std::string_view input,
474 const uint8_t character_set[], size_t index) {
475 std::string out;
476 out.append(input.data(), index);
477 auto pointer = input.begin() + index;
478 for (; pointer != input.end(); pointer++) {
479 if (character_sets::bit_at(character_set, *pointer)) {
480 out.append(character_sets::hex + uint8_t(*pointer) * 4, 3);
481 } else {
482 out += *pointer;
483 }
484 }
485 return out;
486}
487
488std::string to_unicode(std::string_view input) {
489 return ada::idna::to_unicode(input);
490}
491
492} // namespace ada::unicode
Includes all definitions for Ada.
Definitions of the character sets used by unicode functions.
Common definitions for cross-platform compiler support.
#define ada_constexpr
#define ADA_PUSH_DISABLE_ALL_WARNINGS
#define ADA_POP_DISABLE_WARNINGS
#define ada_really_inline
Definition common_defs.h:84
ada_really_inline bool bit_at(const uint8_t a[], const uint8_t i)
constexpr char hex[1024]
std::string to_ascii(std::string_view ut8_string)
std::string to_unicode(std::string_view input)
bool contains_forbidden_domain_code_point(std::string_view ascii_string)
Includes the declarations for unicode operations.
static constexpr std::array< uint8_t, 256 > is_forbidden_domain_code_point_table
Definition unicode.cpp:179
static constexpr std::array< uint8_t, 256 > is_forbidden_domain_code_point_table_or_upper
Definition unicode.cpp:219
static constexpr char hex_to_binary_table[]
Definition unicode.cpp:351
constexpr uint64_t broadcast(uint8_t v) noexcept
Definition unicode.cpp:19
constexpr std::string_view table_is_double_dot_path_segment[]
Definition unicode.cpp:290
static constexpr std::array< uint8_t, 256 > is_forbidden_host_code_point_table
Definition unicode.cpp:164
static constexpr std::array< bool, 256 > is_alnum_plus_table
Definition unicode.cpp:260
tl::expected< result_type, ada::errors > result
Definitions for all unicode specific functions.