1/* (c) Magnus Auvinen. See licence.txt in the root of the distribution for more information. */
2/* If you are missing that file, acquire a complete release at teeworlds.com. */
3
4#include "str.h"
5
6#include "dbg.h"
7#include "detect.h"
8#include "math.h"
9#include "mem.h"
10
#include <cctype>
#include <cerrno>
#include <charconv> // std::to_chars
#include <climits>
#include <cstdarg>
#include <cstdio>
#include <cstdlib>
#include <cstring>
17
18int str_copy(char *dst, const char *src, int dst_size)
19{
20 dst[0] = '\0';
21 strncat(dest: dst, src: src, n: dst_size - 1);
22 return str_utf8_fix_truncation(str: dst);
23}
24
25void str_append(char *dst, const char *src, int dst_size)
26{
27 int s = str_length(str: dst);
28 int i = 0;
29 while(s < dst_size)
30 {
31 dst[s] = src[i];
32 if(!src[i]) /* check for null termination */
33 break;
34 s++;
35 i++;
36 }
37
38 dst[dst_size - 1] = 0; /* assure null termination */
39 str_utf8_fix_truncation(str: dst);
40}
41
42void str_truncate(char *dst, int dst_size, const char *src, int truncation_len)
43{
44 int size = dst_size;
45 if(truncation_len < size)
46 {
47 size = truncation_len + 1;
48 }
49 str_copy(dst, src, dst_size: size);
50}
51
/* Returns the length of str in bytes (terminator excluded) as an int. */
int str_length(const char *str)
{
	const size_t length = strlen(str);
	return (int)length;
}
56
57int str_format_v(char *buffer, int buffer_size, const char *format, va_list args)
58{
59#if defined(CONF_FAMILY_WINDOWS)
60 _vsprintf_p(buffer, buffer_size, format, args);
61 buffer[buffer_size - 1] = 0; /* assure null termination */
62#else
63 vsnprintf(s: buffer, maxlen: buffer_size, format: format, arg: args);
64 /* null termination is assured by definition of vsnprintf */
65#endif
66 return str_utf8_fix_truncation(str: buffer);
67}
68
#if !defined(CONF_DEBUG)
/*
 * Writes the decimal representation of value into buffer using std::to_chars
 * and terminates it. Only compiled when CONF_DEBUG is not defined.
 *
 * Returns the number of bytes between the start of the buffer and the
 * terminator.
 */
int str_format_int(char *buffer, size_t buffer_size, int value)
{
	// Fix false positive clang-analyzer-core.UndefinedBinaryOperatorResult when using result
	*buffer = '\0';
	// The last byte of the buffer is kept free for the terminator.
	char *const limit = buffer + buffer_size - 1;
	const auto conversion = std::to_chars(buffer, limit, value);
	*conversion.ptr = '\0';
	return conversion.ptr - buffer;
}
#endif
78
79#undef str_format
80int str_format(char *buffer, int buffer_size, const char *format, ...)
81{
82 va_list args;
83 va_start(args, format);
84 int length = str_format_v(buffer, buffer_size, format, args);
85 va_end(args);
86 return length;
87}
88#if !defined(CONF_DEBUG)
89#define str_format str_format_opt
90#endif
91
/* Converts an ASCII lowercase letter to uppercase; any other value is returned unchanged. */
char str_uppercase(char c)
{
	return ('a' <= c && c <= 'z') ? (char)('A' + (c - 'a')) : c;
}
98
/* Reports whether c is one of the ASCII digits '0' to '9'. */
bool str_isnum(char c)
{
	return !(c < '0' || c > '9');
}
103
104int str_isallnum(const char *str)
105{
106 while(*str)
107 {
108 if(!str_isnum(c: *str))
109 return 0;
110 str++;
111 }
112 return 1;
113}
114
115int str_isallnum_hex(const char *str)
116{
117 while(*str)
118 {
119 if(!str_isnum(c: *str) && !(*str >= 'a' && *str <= 'f') && !(*str >= 'A' && *str <= 'F'))
120 return 0;
121 str++;
122 }
123 return 1;
124}
125
/* Returns 1 for space, line feed, carriage return and tab; 0 for anything else. */
int str_isspace(char c)
{
	switch(c)
	{
	case ' ':
	case '\n':
	case '\r':
	case '\t':
		return 1;
	default:
		return 0;
	}
}
130
131const char *str_trim_words(const char *str, int words)
132{
133 while(*str && str_isspace(c: *str))
134 str++;
135 while(words && *str)
136 {
137 if(str_isspace(c: *str) && !str_isspace(c: *(str + 1)))
138 words--;
139 str++;
140 }
141 return str;
142}
143
/* Reports whether str contains any byte with a value below 32 (a control character). */
bool str_has_cc(const char *str)
{
	for(const unsigned char *cursor = (const unsigned char *)str; *cursor != 0; ++cursor)
	{
		if(*cursor < 32)
			return true;
	}
	return false;
}
157
/* makes sure that the string only contains bytes between 32 and 255: every byte below 32 is replaced by a space */
void str_sanitize_cc(char *str_in)
{
	for(unsigned char *cursor = (unsigned char *)str_in; *cursor != 0; ++cursor)
	{
		if(*cursor < 32)
			*cursor = ' ';
	}
}
169
/* makes sure that the string only contains bytes between 32 and 255 plus \r, \n and \t: every other byte is replaced by a space */
void str_sanitize(char *str_in)
{
	for(unsigned char *cursor = (unsigned char *)str_in; *cursor != 0; ++cursor)
	{
		const unsigned char c = *cursor;
		if(c >= 32 || c == '\r' || c == '\n' || c == '\t')
			continue;
		*cursor = ' ';
	}
}
181
/*
 * Replaces every byte that is not accepted in a filename with a space:
 * control characters (0x00-0x1F), 0x7F and the characters \ / | : * ? < > "
 */
void str_sanitize_filename(char *str_in)
{
	static const char DISALLOWED[] = "\\/|:*?<>\"";
	for(unsigned char *cursor = (unsigned char *)str_in; *cursor != 0; ++cursor)
	{
		const unsigned char c = *cursor;
		// c is never 0 inside the loop, so strchr cannot match the terminator of DISALLOWED.
		if(c <= 0x1F || c == 0x7F || strchr(DISALLOWED, c))
		{
			*cursor = ' ';
		}
	}
}
195
196bool str_valid_filename(const char *str)
197{
198 // References:
199 // - https://en.wikipedia.org/w/index.php?title=Filename&oldid=1281340521#Comparison_of_filename_limitations
200 // - https://learn.microsoft.com/en-us/windows/win32/fileio/naming-a-file (last update 2024-08-28)
201 if(str[0] == '\0')
202 {
203 return false; // empty name not allowed
204 }
205
206 bool prev_space = false;
207 bool prev_period = false;
208 bool first_space_checked = false;
209 const char *iterator = str;
210 while(*iterator)
211 {
212 const int code = str_utf8_decode(ptr: &iterator);
213 if(code <= 0x1F || code == 0x7F || code == '\\' || code == '/' || code == '|' || code == ':' ||
214 code == '*' || code == '?' || code == '<' || code == '>' || code == '"')
215 {
216 return false; // disallowed characters, mostly for Windows
217 }
218 else if(str_utf8_isspace(code) && code != ' ')
219 {
220 return false; // we only allow regular space characters
221 }
222 if(code == ' ')
223 {
224 if(!first_space_checked)
225 {
226 return false; // leading spaces not allowed
227 }
228 if(prev_space)
229 {
230 return false; // multiple consecutive spaces not allowed
231 }
232 prev_space = true;
233 prev_period = false;
234 }
235 else
236 {
237 prev_space = false;
238 prev_period = code == '.';
239 first_space_checked = true;
240 }
241 }
242 if(prev_space || prev_period)
243 {
244 return false; // trailing spaces and periods not allowed
245 }
246
247 static constexpr const char *RESERVED_NAMES[] = {
248 "CON", "PRN", "AUX", "NUL",
249 "COM0", "COM1", "COM2", "COM3", "COM4", "COM5", "COM6", "COM7", "COM8", "COM9", "COM¹", "COM²", "COM³",
250 "LPT0", "LPT1", "LPT2", "LPT3", "LPT4", "LPT5", "LPT6", "LPT7", "LPT8", "LPT9", "LPT¹", "LPT²", "LPT³"};
251 for(const char *reserved_name : RESERVED_NAMES)
252 {
253 const char *prefix = str_startswith_nocase(str, prefix: reserved_name);
254 if(prefix != nullptr && (prefix[0] == '\0' || prefix[0] == '.'))
255 {
256 return false; // reserved name not allowed when it makes up the entire filename or when followed by period
257 }
258 }
259
260 return true;
261}
262
/* Reports whether c is an ASCII decimal digit. */
static int str_comp_filenames_isdigit(char c)
{
	return c >= '0' && c <= '9';
}

/*
 * Compares two filenames case-insensitively while ordering embedded runs of
 * digits by magnitude ("file2" sorts before "file10").
 *
 * When both strings have a digit at the same position, the two digit runs are
 * compared: the longer run is the greater one, and for runs of equal length
 * the first differing digit decides.
 *
 * Returns a negative value, zero or a positive value when a sorts before,
 * equal to or after b.
 */
int str_comp_filenames(const char *a, const char *b)
{
	for(; *a && *b; ++a, ++b)
	{
		if(str_comp_filenames_isdigit(*a) && str_comp_filenames_isdigit(*b))
		{
			// Remember the first differing digit while walking both runs in lockstep.
			int run_order = 0;
			do
			{
				if(!run_order)
					run_order = *a - *b;
				++a;
				++b;
			} while(str_comp_filenames_isdigit(*a) && str_comp_filenames_isdigit(*b));

			// A run with digits left over is longer and therefore greater.
			if(str_comp_filenames_isdigit(*a))
				return 1;
			if(str_comp_filenames_isdigit(*b))
				return -1;
			if(run_order)
				return run_order;
			// The runs are equal. If either string ends here the remaining
			// characters decide, so "a1" sorts before "a1b" instead of
			// comparing equal.
			if(*a == '\0' || *b == '\0')
				break;
		}

		// tolower requires a value representable as unsigned char.
		const int order = tolower((unsigned char)*a) - tolower((unsigned char)*b);
		if(order)
			return order;
	}
	// Compare as unsigned bytes so that a string which is a prefix of the
	// other always sorts first, also when the next byte is not ASCII.
	return (unsigned char)*a - (unsigned char)*b;
}
294
/*
 * Normalizes spaces in place: leading and trailing spaces are removed and
 * every run of spaces between other characters is collapsed into one space.
 * Only the space character ' ' is considered.
 */
void str_clean_whitespaces(char *str)
{
	const char *in = str;
	char *out = str;

	/* skip initial whitespace */
	while(*in == ' ')
		in++;

	for(;;)
	{
		/* consume a run of spaces and remember that there was one */
		bool skipped_space = false;
		while(*in == ' ')
		{
			in++;
			skipped_space = true;
		}

		/* spaces followed by the end of the string are dropped */
		if(*in == '\0')
			break;

		if(skipped_space)
			*out++ = ' ';
		*out++ = *in++;
	}
	*out = '\0';
}
325
326char *str_skip_to_whitespace(char *str)
327{
328 while(*str && !str_isspace(c: *str))
329 str++;
330 return str;
331}
332
333const char *str_skip_to_whitespace_const(const char *str)
334{
335 while(*str && !str_isspace(c: *str))
336 str++;
337 return str;
338}
339
340char *str_skip_whitespaces(char *str)
341{
342 while(*str && str_isspace(c: *str))
343 str++;
344 return str;
345}
346
347const char *str_skip_whitespaces_const(const char *str)
348{
349 while(*str && str_isspace(c: *str))
350 str++;
351 return str;
352}
353
/* case */
/*
 * Compares a and b ignoring case by forwarding to the platform routine
 * (_stricmp on Windows, strcasecmp elsewhere) and returns its result.
 */
int str_comp_nocase(const char *a, const char *b)
{
#if !defined(CONF_FAMILY_WINDOWS)
	return strcasecmp(a, b);
#else
	return _stricmp(a, b);
#endif
}
363
/*
 * Compares at most num characters of a and b ignoring case by forwarding to
 * the platform routine (_strnicmp on Windows, strncasecmp elsewhere) and
 * returns its result.
 */
int str_comp_nocase_num(const char *a, const char *b, int num)
{
#if !defined(CONF_FAMILY_WINDOWS)
	return strncasecmp(a, b, num);
#else
	return _strnicmp(a, b, num);
#endif
}
372
/* Case-sensitive comparison of a and b; returns the result of strcmp. */
int str_comp(const char *a, const char *b)
{
	const int order = strcmp(a, b);
	return order;
}
377
/* Case-sensitive comparison of at most num characters of a and b; returns the result of strncmp. */
int str_comp_num(const char *a, const char *b, int num)
{
	const int order = strncmp(a, b, num);
	return order;
}
382
383const char *str_startswith_nocase(const char *str, const char *prefix)
384{
385 int prefixl = str_length(str: prefix);
386 if(str_comp_nocase_num(a: str, b: prefix, num: prefixl) == 0)
387 {
388 return str + prefixl;
389 }
390 else
391 {
392 return nullptr;
393 }
394}
395
396const char *str_startswith(const char *str, const char *prefix)
397{
398 int prefixl = str_length(str: prefix);
399 if(str_comp_num(a: str, b: prefix, num: prefixl) == 0)
400 {
401 return str + prefixl;
402 }
403 else
404 {
405 return nullptr;
406 }
407}
408
409const char *str_endswith_nocase(const char *str, const char *suffix)
410{
411 int strl = str_length(str);
412 int suffixl = str_length(str: suffix);
413 const char *strsuffix;
414 if(strl < suffixl)
415 {
416 return nullptr;
417 }
418 strsuffix = str + strl - suffixl;
419 if(str_comp_nocase(a: strsuffix, b: suffix) == 0)
420 {
421 return strsuffix;
422 }
423 else
424 {
425 return nullptr;
426 }
427}
428
429const char *str_endswith(const char *str, const char *suffix)
430{
431 int strl = str_length(str);
432 int suffixl = str_length(str: suffix);
433 const char *strsuffix;
434 if(strl < suffixl)
435 {
436 return nullptr;
437 }
438 strsuffix = str + strl - suffixl;
439 if(str_comp(a: strsuffix, b: suffix) == 0)
440 {
441 return strsuffix;
442 }
443 else
444 {
445 return nullptr;
446 }
447}
448
/*
 * Finds the first occurrence of needle in haystack, comparing bytes with
 * tolower applied to both sides.
 *
 * Returns a pointer to the match inside haystack, or nullptr if there is
 * none. An empty haystack never matches, not even an empty needle.
 */
const char *str_find_nocase(const char *haystack, const char *needle)
{
	for(; *haystack != '\0'; haystack++) /* native implementation */
	{
		const char *hay_pos = haystack;
		const char *needle_pos = needle;
		for(; *hay_pos && *needle_pos; hay_pos++, needle_pos++)
		{
			if(tolower((unsigned char)*hay_pos) != tolower((unsigned char)*needle_pos))
				break;
		}
		// The whole needle was consumed, so it matches at this position.
		if(*needle_pos == '\0')
			return haystack;
	}

	return nullptr;
}
467
/*
 * Finds the first occurrence of needle in haystack (case-sensitive).
 *
 * Returns a pointer to the match inside haystack, or nullptr if there is
 * none. An empty haystack never matches, not even an empty needle.
 */
const char *str_find(const char *haystack, const char *needle)
{
	for(; *haystack != '\0'; haystack++) /* native implementation */
	{
		const char *hay_pos = haystack;
		const char *needle_pos = needle;
		for(; *hay_pos && *needle_pos; hay_pos++, needle_pos++)
		{
			if(*hay_pos != *needle_pos)
				break;
		}
		// The whole needle was consumed, so it matches at this position.
		if(*needle_pos == '\0')
			return haystack;
	}

	return nullptr;
}
486
/*
 * Locates the next token in str, where tokens are separated by any of the
 * characters in delim.
 *
 * Returns a pointer to the start of the token and stores its length in
 * *length, or returns nullptr (leaving *length untouched) when the end of the
 * string is reached.
 */
static const char *str_token_get(const char *str, const char *delim, int *length)
{
	const size_t leading_delims = strspn(str, delim);
	// Of several delimiters in a row only one is skipped, so the empty
	// tokens between them are still reported to the caller.
	str += leading_delims > 1 ? 1 : leading_delims;
	if(*str == '\0')
		return nullptr;

	*length = strcspn(str, delim);
	return str;
}
500
501const char *str_next_token(const char *str, const char *delim, char *buffer, int buffer_size)
502{
503 int len = 0;
504 const char *tok = str_token_get(str, delim, length: &len);
505 if(len < 0 || tok == nullptr)
506 {
507 buffer[0] = '\0';
508 return nullptr;
509 }
510
511 len = buffer_size > len ? len : buffer_size - 1;
512 mem_copy(dest: buffer, source: tok, size: len);
513 buffer[len] = '\0';
514
515 return tok + len;
516}
517
518int str_in_list(const char *list, const char *delim, const char *needle)
519{
520 const char *tok = list;
521 int len = 0, notfound = 1, needlelen = str_length(str: needle);
522
523 while(notfound && (tok = str_token_get(str: tok, delim, length: &len)))
524 {
525 notfound = needlelen != len || str_comp_num(a: tok, b: needle, num: len);
526 tok = tok + len;
527 }
528
529 return !notfound;
530}
531
532bool str_delimiters_around_offset(const char *haystack, const char *delim, int offset, int *start, int *end)
533{
534 bool found = true;
535 const char *search = haystack;
536 const int delim_len = str_length(str: delim);
537 *start = 0;
538 while(str_find(haystack: search, needle: delim))
539 {
540 const char *test = str_find(haystack: search, needle: delim) + delim_len;
541 int distance = test - haystack;
542 if(distance > offset)
543 break;
544
545 *start = distance;
546 search = test;
547 }
548 if(search == haystack)
549 found = false;
550
551 if(str_find(haystack: search, needle: delim))
552 {
553 *end = str_find(haystack: search, needle: delim) - haystack;
554 }
555 else
556 {
557 *end = str_length(str: haystack);
558 found = false;
559 }
560
561 return found;
562}
563
/* Returns a pointer to the last occurrence of needle in haystack as reported by strrchr (null if absent). */
const char *str_rchr(const char *haystack, char needle)
{
	const char *last = strrchr(haystack, needle);
	return last;
}
568
/* Counts how many times needle occurs in haystack. */
int str_countchr(const char *haystack, char needle)
{
	int occurrences = 0;
	for(const char *cursor = haystack; *cursor != '\0'; ++cursor)
	{
		if(*cursor == needle)
			occurrences++;
	}
	return occurrences;
}
580
/*
 * Writes the bytes of data as uppercase two-digit hex values, each followed
 * by a space, into dst. Output stops early once another three characters
 * plus the terminator would no longer fit into dst_size bytes.
 */
void str_hex(char *dst, int dst_size, const void *data, int data_size)
{
	static const char DIGITS[] = "0123456789ABCDEF";
	const unsigned char *bytes = (const unsigned char *)data;
	int out = 0;
	for(int i = 0; i < data_size && out < dst_size - 3; i++)
	{
		dst[out++] = DIGITS[bytes[i] >> 4];
		dst[out++] = DIGITS[bytes[i] & 0xf];
		dst[out++] = ' ';
	}
	dst[out] = '\0';
}
595
/*
 * Writes the bytes of data as a C-style initializer list ("0x01, 0xAB, ...")
 * into dst, starting a new line after every bytes_per_line values. Output
 * stops early once another six characters plus the terminator would no
 * longer fit into dst_size bytes. The separator after the last value is
 * removed.
 */
void str_hex_cstyle(char *dst, int dst_size, const void *data, int data_size, int bytes_per_line)
{
	static const char DIGITS[] = "0123456789ABCDEF";
	const unsigned char *bytes = (const unsigned char *)data;
	int out = 0;
	int left_on_line = bytes_per_line;
	for(int i = 0; i < data_size && out < dst_size - 6; i++, out += 6)
	{
		char *cell = dst + out;
		cell[0] = '0';
		cell[1] = 'x';
		cell[2] = DIGITS[bytes[i] >> 4];
		cell[3] = DIGITS[bytes[i] & 0xf];
		cell[4] = ',';

		--left_on_line;
		if(left_on_line != 0)
		{
			cell[5] = ' ';
		}
		else
		{
			cell[5] = '\n';
			left_on_line = bytes_per_line;
		}
	}
	dst[out] = '\0';
	// Remove trailing comma and space/newline
	if(out >= 1)
		dst[out - 1] = '\0';
	if(out >= 2)
		dst[out - 2] = '\0';
}
628
/* Returns the value (0-15) of the hexadecimal digit x, accepting both cases, or -1 if x is not a hex digit. */
static int hexval(char x)
{
	if(x >= '0' && x <= '9')
		return x - '0';
	if(x >= 'a' && x <= 'f')
		return x - 'a' + 10;
	if(x >= 'A' && x <= 'F')
		return x - 'A' + 10;
	return -1;
}
658
659static int byteval(const char *hex, unsigned char *dst)
660{
661 int v1 = hexval(x: hex[0]);
662 int v2 = hexval(x: hex[1]);
663
664 if(v1 < 0 || v2 < 0)
665 return 1;
666
667 *dst = v1 * 16 + v2;
668 return 0;
669}
670
671int str_hex_decode(void *dst, int dst_size, const char *src)
672{
673 unsigned char *cdst = (unsigned char *)dst;
674 int slen = str_length(str: src);
675 int len = slen / 2;
676 int i;
677 if(slen != dst_size * 2)
678 return 2;
679
680 for(i = 0; i < len && dst_size; i++, dst_size--)
681 {
682 if(byteval(hex: src + i * 2, dst: cdst++))
683 return 1;
684 }
685 return 0;
686}
687
/*
 * Encodes data_size bytes of data_raw as base64 with '=' padding into dst.
 * The output is always terminated; when dst_size is too small the encoded
 * text is cut off after dst_size - 1 characters.
 */
void str_base64(char *dst, int dst_size, const void *data_raw, int data_size)
{
	static const char ALPHABET[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

	const unsigned char *input = (const unsigned char *)data_raw;
	unsigned bit_buffer = 0; // most recently read input bits live in the low bits
	int bits_pending = 0; // number of not yet emitted bits in bit_buffer
	int in_pos = 0;
	int out_pos = 0;

	// The last byte of dst is reserved for the terminator.
	const int capacity = dst_size - 1;
	dst[capacity] = '\0';
	while(true)
	{
		// Refill whenever fewer than six bits are available and input remains.
		if(bits_pending < 6 && in_pos < data_size)
		{
			bit_buffer = (bit_buffer << 8) | input[in_pos];
			bits_pending += 8;
			in_pos++;
		}
		if(out_pos == capacity)
		{
			return; // output buffer exhausted
		}

		if(bits_pending > 0)
		{
			// Take the top six pending bits; a final partial group is
			// padded with zero bits on the right.
			const unsigned shifted = bits_pending >= 6 ? (bit_buffer >> (bits_pending - 6)) : (bit_buffer << (6 - bits_pending));
			dst[out_pos] = ALPHABET[shifted & 0x3f];
			bits_pending -= 6;
			out_pos++;
		}
		else if(out_pos % 4 != 0)
		{
			// Pad the output to a multiple of four characters.
			dst[out_pos] = '=';
			out_pos++;
		}
		else
		{
			dst[out_pos] = '\0';
			return;
		}
	}
}
739
/* Returns the 6-bit value (0-63) of a base64 digit, or -1 for any other character (including '='). */
static int base64_digit_value(char digit)
{
	if(digit >= 'A' && digit <= 'Z')
		return digit - 'A';
	if(digit >= 'a' && digit <= 'z')
		return digit - 'a' + 26;
	if(digit >= '0' && digit <= '9')
		return digit - '0' + 52;

	switch(digit)
	{
	case '+':
		return 62;
	case '/':
		return 63;
	default:
		return -1;
	}
}
764
765int str_base64_decode(void *dst_raw, int dst_size, const char *data)
766{
767 unsigned char *dst = (unsigned char *)dst_raw;
768 int data_len = str_length(str: data);
769
770 int i;
771 int o = 0;
772
773 if(data_len % 4 != 0)
774 {
775 return -3;
776 }
777 if(data_len / 4 * 3 > dst_size)
778 {
779 // Output buffer too small.
780 return -2;
781 }
782 for(i = 0; i < data_len; i += 4)
783 {
784 int num_output_bytes = 3;
785 char copy[4];
786 int d[4];
787 int value;
788 int b;
789 mem_copy(dest: copy, source: data + i, size: sizeof(copy));
790 if(i == data_len - 4)
791 {
792 if(copy[3] == '=')
793 {
794 copy[3] = 'A';
795 num_output_bytes = 2;
796 if(copy[2] == '=')
797 {
798 copy[2] = 'A';
799 num_output_bytes = 1;
800 }
801 }
802 }
803 d[0] = base64_digit_value(digit: copy[0]);
804 d[1] = base64_digit_value(digit: copy[1]);
805 d[2] = base64_digit_value(digit: copy[2]);
806 d[3] = base64_digit_value(digit: copy[3]);
807 if(d[0] == -1 || d[1] == -1 || d[2] == -1 || d[3] == -1)
808 {
809 // Invalid digit.
810 return -1;
811 }
812 value = (d[0] << 18) | (d[1] << 12) | (d[2] << 6) | d[3];
813 for(b = 0; b < 3; b++)
814 {
815 unsigned char byte_value = (value >> (16 - 8 * b)) & 0xff;
816 if(b < num_output_bytes)
817 {
818 dst[o] = byte_value;
819 o += 1;
820 }
821 else
822 {
823 if(byte_value != 0)
824 {
825 // Padding not zeroed.
826 return -2;
827 }
828 }
829 }
830 }
831 return o;
832}
833
/*
 * Appends src to the buffer at *dst, putting a backslash in front of every
 * '"' and '\\'. Writing never reaches end; one byte is always kept for the
 * terminator, and an escaped character is only written if both of its bytes
 * fit. *dst is advanced to the terminator of the output.
 */
void str_escape(char **dst, const char *src, const char *end)
{
	for(; *src && *dst + 1 < end; src++)
	{
		const bool needs_escape = *src == '"' || *src == '\\'; // escape \ and "
		if(needs_escape)
		{
			// Never emit a lone backslash without its character.
			if(!(*dst + 2 < end))
				break;
			*(*dst)++ = '\\';
		}
		*(*dst)++ = *src;
	}
	**dst = 0;
}
849
850int str_toint(const char *str)
851{
852 return str_toint_base(str, base: 10);
853}
854
/*
 * Parses str as a base-10 integer with strtol.
 *
 * Returns true and stores the value in *out (if out is not null) only when
 * the whole string was consumed, at least one character was converted and
 * the value fits into an int. Otherwise returns false and leaves *out
 * untouched.
 *
 * Note: errno is reset to detect overflow reported by strtol.
 */
bool str_toint(const char *str, int *out)
{
	char *end;
	errno = 0;
	const long parsed = strtol(str, &end, 10);
	if(end == str || *end != '\0')
		return false; // nothing converted, or trailing characters
	// strtol clamps to the limits of long and sets ERANGE on overflow; the
	// explicit comparison covers platforms where long is wider than int.
	if(errno == ERANGE || parsed < INT_MIN || parsed > INT_MAX)
		return false;
	if(out != nullptr)
		*out = (int)parsed;
	return true;
}
866
/* Parses str in the given base with strtol and returns the result converted to int; no error reporting. */
int str_toint_base(const char *str, int base)
{
	const long parsed = strtol(str, nullptr, base);
	return (int)parsed;
}
871
/* Parses str in the given base with strtoul and returns the result; no error reporting. */
unsigned long str_toulong_base(const char *str, int base)
{
	const unsigned long parsed = strtoul(str, nullptr, base);
	return parsed;
}
876
/* Parses str in the given base with strtoll and returns the result; no error reporting. */
int64_t str_toint64_base(const char *str, int base)
{
	const long long parsed = strtoll(str, nullptr, base);
	return parsed;
}
881
/* Parses str with strtod and returns the result converted to float; no error reporting. */
float str_tofloat(const char *str)
{
	const double parsed = strtod(str, nullptr);
	return (float)parsed;
}
886
/*
 * Parses str as a floating point number with strtod.
 *
 * Returns true and stores the value in *out (if out is not null) only when
 * the whole string was consumed and at least one character was converted.
 * Otherwise returns false and leaves *out untouched; in particular the empty
 * string is rejected because no conversion takes place.
 */
bool str_tofloat(const char *str, float *out)
{
	char *end;
	const float value = strtod(str, &end);
	// strtod leaves end at str when it could not convert anything.
	if(end == str || *end != '\0')
		return false;
	if(out != nullptr)
		*out = value;
	return true;
}
898
/*
 * Computes a simple multiplicative hash of str: starting from 5381, each
 * character updates the hash as hash * 33 + c (unsigned arithmetic).
 */
unsigned str_quickhash(const char *str)
{
	unsigned hash = 5381;
	while(*str)
	{
		hash = hash * 33 + (*str);
		str++;
	}
	return hash;
}
906
/*
 * Encodes the code point chr as UTF-8 into ptr (no terminator is written).
 *
 * Returns the number of bytes written (1 to 4), or 0 without writing
 * anything when chr is greater than 0x10FFFF. Any value up to 0x7F,
 * including negative ones, is stored as a single byte.
 */
int str_utf8_encode(char *ptr, int chr)
{
	if(chr <= 0x7F)
	{
		// 0xxxxxxx
		ptr[0] = (char)chr;
		return 1;
	}
	if(chr <= 0x7FF)
	{
		// 110xxxxx 10xxxxxx
		ptr[0] = 0xC0 | ((chr >> 6) & 0x1F);
		ptr[1] = 0x80 | (chr & 0x3F);
		return 2;
	}
	if(chr <= 0xFFFF)
	{
		// 1110xxxx 10xxxxxx 10xxxxxx
		ptr[0] = 0xE0 | ((chr >> 12) & 0x0F);
		ptr[1] = 0x80 | ((chr >> 6) & 0x3F);
		ptr[2] = 0x80 | (chr & 0x3F);
		return 3;
	}
	if(chr <= 0x10FFFF)
	{
		// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
		ptr[0] = 0xF0 | ((chr >> 18) & 0x07);
		ptr[1] = 0x80 | ((chr >> 12) & 0x3F);
		ptr[2] = 0x80 | ((chr >> 6) & 0x3F);
		ptr[3] = 0x80 | (chr & 0x3F);
		return 4;
	}

	return 0;
}
939
/* Returns the byte at *ptr as an unsigned value and advances *ptr by one. */
static unsigned char str_byte_next(const char **ptr)
{
	const unsigned char current = **ptr;
	++*ptr;
	return current;
}

/* Moves *ptr back by one byte. */
static void str_byte_rewind(const char **ptr)
{
	--*ptr;
}

/*
 * Decodes one UTF-8 encoded code point starting at *ptr and advances *ptr
 * past the bytes that were consumed.
 *
 * Returns the code point (0 for the terminator, which is consumed as well),
 * or -1 for an invalid sequence. On an invalid lead byte that single byte is
 * consumed; on an invalid continuation byte the offending byte is not
 * consumed, so decoding resumes with it on the next call.
 */
int str_utf8_decode(const char **ptr)
{
	// As per https://encoding.spec.whatwg.org/#utf-8-decoder.
	unsigned char lower_bound = 0x80;
	unsigned char upper_bound = 0xBF;
	int code_point = 0;
	int bytes_seen = 0;
	int bytes_needed = 0;
	while(true)
	{
		const unsigned char byte_value = str_byte_next(ptr);

		if(bytes_needed != 0)
		{
			// Continuation byte of a multi-byte sequence.
			if(byte_value < lower_bound || byte_value > upper_bound)
			{
				// Resetting variables not necessary, will be done when
				// the function is called again.
				str_byte_rewind(ptr);
				return -1;
			}
			// Only the first continuation byte has a restricted range.
			lower_bound = 0x80;
			upper_bound = 0xBF;
			bytes_seen += 1;
			code_point = code_point + ((byte_value - 0x80) << (6 * (bytes_needed - bytes_seen)));
			if(bytes_seen == bytes_needed)
			{
				// Resetting variables not necessary, see above.
				return code_point;
			}
			continue;
		}

		// Lead byte: determines the length of the sequence.
		if(byte_value <= 0x7F)
		{
			return byte_value;
		}
		if(0xC2 <= byte_value && byte_value <= 0xDF)
		{
			bytes_needed = 1;
			code_point = byte_value - 0xC0;
		}
		else if(0xE0 <= byte_value && byte_value <= 0xEF)
		{
			if(byte_value == 0xE0)
				lower_bound = 0xA0;
			if(byte_value == 0xED)
				upper_bound = 0x9F;
			bytes_needed = 2;
			code_point = byte_value - 0xE0;
		}
		else if(0xF0 <= byte_value && byte_value <= 0xF4)
		{
			if(byte_value == 0xF0)
				lower_bound = 0x90;
			if(byte_value == 0xF4)
				upper_bound = 0x8F;
			bytes_needed = 3;
			code_point = byte_value - 0xF0;
		}
		else
		{
			return -1; // Error.
		}
		// Move the payload bits of the lead byte to their final position.
		code_point = code_point << (6 * bytes_needed);
	}
}
1018
1019void str_utf8_truncate(char *dst, int dst_size, const char *src, int truncation_len)
1020{
1021 int size = -1;
1022 const char *cursor = src;
1023 int pos = 0;
1024 while(pos <= truncation_len && cursor - src < dst_size && size != cursor - src)
1025 {
1026 size = cursor - src;
1027 if(str_utf8_decode(ptr: &cursor) == 0)
1028 {
1029 break;
1030 }
1031 pos++;
1032 }
1033 str_copy(dst, src, dst_size: size + 1);
1034}
1035
1036int str_utf8_fix_truncation(char *str)
1037{
1038 int len = str_length(str);
1039 if(len > 0)
1040 {
1041 int last_char_index = str_utf8_rewind(str, cursor: len);
1042 const char *last_char = str + last_char_index;
1043 // Fix truncated UTF-8.
1044 if(str_utf8_decode(ptr: &last_char) == -1)
1045 {
1046 str[last_char_index] = 0;
1047 return last_char_index;
1048 }
1049 }
1050 return len;
1051}
1052
1053void str_utf8_trim_right(char *param)
1054{
1055 const char *str = param;
1056 char *end = nullptr;
1057 while(*str)
1058 {
1059 char *str_old = (char *)str;
1060 int code = str_utf8_decode(ptr: &str);
1061
1062 // check if unicode is not empty
1063 if(!str_utf8_isspace(code))
1064 {
1065 end = nullptr;
1066 }
1067 else if(!end)
1068 {
1069 end = str_old;
1070 }
1071 }
1072 if(end)
1073 {
1074 *end = 0;
1075 }
1076}
1077
1078void str_utf8_tolower(const char *input, char *output, size_t size)
1079{
1080 size_t out_pos = 0;
1081 while(*input)
1082 {
1083 const int code = str_utf8_tolower_codepoint(code: str_utf8_decode(ptr: &input));
1084 char encoded_code[4];
1085 const int code_size = str_utf8_encode(ptr: encoded_code, chr: code);
1086 if(out_pos + code_size + 1 > size) // +1 for null termination
1087 {
1088 break;
1089 }
1090 mem_copy(dest: &output[out_pos], source: encoded_code, size: code_size);
1091 out_pos += code_size;
1092 }
1093 output[out_pos] = '\0';
1094}
1095
/*
 * Returns 1 when code is treated as whitespace, 0 otherwise. Every value up
 * to 0x20 (including negative values such as the decoder's error result)
 * counts as whitespace, as do the individual code points and ranges below.
 */
int str_utf8_isspace(int code)
{
	if(code <= 0x0020)
		return 1;

	switch(code)
	{
	case 0x0085:
	case 0x00A0:
	case 0x034F:
	case 0x115F:
	case 0x1160:
	case 0x1680:
	case 0x180E:
	case 0x2800:
	case 0x3000:
	case 0x3164:
	case 0xFEFF:
	case 0xFFA0:
		return 1;
	default:
		break;
	}

	return (code >= 0x2000 && code <= 0x200F) ||
	       (code >= 0x2028 && code <= 0x202F) ||
	       (code >= 0x205F && code <= 0x2064) ||
	       (code >= 0x206A && code <= 0x206F) ||
	       (code >= 0xFE00 && code <= 0xFE0F) ||
	       (code >= 0xFFF9 && code <= 0xFFFC);
}
1106
/* Returns 0 when c is a UTF-8 continuation byte (bit pattern 10xxxxxx) and 1 for any other byte. */
int str_utf8_isstart(char c)
{
	return (c & 0xC0) != 0x80;
}
1113
1114int str_utf8_rewind(const char *str, int cursor)
1115{
1116 while(cursor)
1117 {
1118 cursor--;
1119 if(str_utf8_isstart(c: *(str + cursor)))
1120 break;
1121 }
1122 return cursor;
1123}
1124
1125const char *str_utf8_find_nocase(const char *haystack, const char *needle, const char **end)
1126{
1127 while(*haystack) /* native implementation */
1128 {
1129 const char *a = haystack;
1130 const char *b = needle;
1131 const char *a_next = a;
1132 const char *b_next = b;
1133 while(*a && *b && str_utf8_tolower_codepoint(code: str_utf8_decode(ptr: &a_next)) == str_utf8_tolower_codepoint(code: str_utf8_decode(ptr: &b_next)))
1134 {
1135 a = a_next;
1136 b = b_next;
1137 }
1138 if(!(*b))
1139 {
1140 if(end != nullptr)
1141 *end = a_next;
1142 return haystack;
1143 }
1144 str_utf8_decode(ptr: &haystack);
1145 }
1146
1147 if(end != nullptr)
1148 *end = nullptr;
1149 return nullptr;
1150}
1151
1152int str_utf8_comp_nocase(const char *a, const char *b)
1153{
1154 int code_a;
1155 int code_b;
1156
1157 while(*a && *b)
1158 {
1159 code_a = str_utf8_tolower_codepoint(code: str_utf8_decode(ptr: &a));
1160 code_b = str_utf8_tolower_codepoint(code: str_utf8_decode(ptr: &b));
1161
1162 if(code_a != code_b)
1163 return code_a - code_b;
1164 }
1165 return (unsigned char)*a - (unsigned char)*b;
1166}
1167
1168int str_utf8_comp_nocase_num(const char *a, const char *b, int num)
1169{
1170 int code_a;
1171 int code_b;
1172 const char *old_a = a;
1173
1174 if(num <= 0)
1175 return 0;
1176
1177 while(*a && *b)
1178 {
1179 code_a = str_utf8_tolower_codepoint(code: str_utf8_decode(ptr: &a));
1180 code_b = str_utf8_tolower_codepoint(code: str_utf8_decode(ptr: &b));
1181
1182 if(code_a != code_b)
1183 return code_a - code_b;
1184
1185 if(a - old_a >= num)
1186 return 0;
1187 }
1188
1189 return (unsigned char)*a - (unsigned char)*b;
1190}
1191
1192const char *str_utf8_skip_whitespaces(const char *str)
1193{
1194 const char *str_old;
1195 int code;
1196
1197 while(*str)
1198 {
1199 str_old = str;
1200 code = str_utf8_decode(ptr: &str);
1201
1202 // check if unicode is not empty
1203 if(!str_utf8_isspace(code))
1204 {
1205 return str_old;
1206 }
1207 }
1208
1209 return str;
1210}
1211
1212int str_utf8_forward(const char *str, int cursor)
1213{
1214 const char *ptr = str + cursor;
1215 if(str_utf8_decode(ptr: &ptr) == 0)
1216 {
1217 return cursor;
1218 }
1219 return ptr - str;
1220}
1221
1222int str_utf8_check(const char *str)
1223{
1224 int codepoint;
1225 while((codepoint = str_utf8_decode(ptr: &str)))
1226 {
1227 if(codepoint == -1)
1228 {
1229 return 0;
1230 }
1231 }
1232 return 1;
1233}
1234
1235void str_utf8_copy_num(char *dst, const char *src, int dst_size, int num)
1236{
1237 int new_cursor;
1238 int cursor = 0;
1239
1240 while(src[cursor] && num > 0)
1241 {
1242 new_cursor = str_utf8_forward(str: src, cursor);
1243 if(new_cursor >= dst_size) // reserve 1 byte for the null termination
1244 break;
1245 else
1246 cursor = new_cursor;
1247 --num;
1248 }
1249
1250 str_copy(dst, src, dst_size: cursor < dst_size ? cursor + 1 : dst_size);
1251}
1252
1253void str_utf8_stats(const char *str, size_t max_size, size_t max_count, size_t *size, size_t *count)
1254{
1255 const char *cursor = str;
1256 *size = 0;
1257 *count = 0;
1258 while(*size < max_size && *count < max_count)
1259 {
1260 if(str_utf8_decode(ptr: &cursor) == 0)
1261 {
1262 break;
1263 }
1264 if((size_t)(cursor - str) >= max_size)
1265 {
1266 break;
1267 }
1268 *size = cursor - str;
1269 ++(*count);
1270 }
1271}
1272
1273size_t str_utf8_offset_bytes_to_chars(const char *str, size_t byte_offset)
1274{
1275 size_t char_offset = 0;
1276 size_t current_offset = 0;
1277 while(current_offset < byte_offset)
1278 {
1279 const size_t prev_byte_offset = current_offset;
1280 current_offset = str_utf8_forward(str, cursor: current_offset);
1281 if(current_offset == prev_byte_offset)
1282 break;
1283 char_offset++;
1284 }
1285 return char_offset;
1286}
1287
1288size_t str_utf8_offset_chars_to_bytes(const char *str, size_t char_offset)
1289{
1290 size_t byte_offset = 0;
1291 for(size_t i = 0; i < char_offset; i++)
1292 {
1293 const size_t prev_byte_offset = byte_offset;
1294 byte_offset = str_utf8_forward(str, cursor: byte_offset);
1295 if(byte_offset == prev_byte_offset)
1296 break;
1297 }
1298 return byte_offset;
1299}
1300
1301int str_utf8_dist(const char *a, const char *b)
1302{
1303 int buf_len = 2 * (str_length(str: a) + 1 + str_length(str: b) + 1);
1304 int *buf = (int *)calloc(nmemb: buf_len, size: sizeof(*buf));
1305 int result = str_utf8_dist_buffer(a, b, buf, buf_len);
1306 free(ptr: buf);
1307 return result;
1308}
1309
1310static int str_to_utf32_unchecked(const char *str, int **out)
1311{
1312 int out_len = 0;
1313 while((**out = str_utf8_decode(ptr: &str)))
1314 {
1315 (*out)++;
1316 out_len++;
1317 }
1318 return out_len;
1319}
1320
1321int str_utf8_dist_buffer(const char *a_utf8, const char *b_utf8, int *buf, int buf_len)
1322{
1323 int a_utf8_len = str_length(str: a_utf8);
1324 int b_utf8_len = str_length(str: b_utf8);
1325 int *a, *b; // UTF-32
1326 int a_len, b_len; // UTF-32 length
1327 dbg_assert(buf_len >= 2 * (a_utf8_len + 1 + b_utf8_len + 1), "buffer too small");
1328 if(a_utf8_len > b_utf8_len)
1329 {
1330 const char *tmp2 = a_utf8;
1331 a_utf8 = b_utf8;
1332 b_utf8 = tmp2;
1333 }
1334 a = buf;
1335 a_len = str_to_utf32_unchecked(str: a_utf8, out: &buf);
1336 b = buf;
1337 b_len = str_to_utf32_unchecked(str: b_utf8, out: &buf);
1338 return str_utf32_dist_buffer(a, a_len, b, b_len, buf, buf_len: buf_len - b_len - a_len);
1339}
1340
1341int str_utf32_dist_buffer(const int *a, int a_len, const int *b, int b_len, int *buf, int buf_len)
1342{
1343 int i, j;
1344 dbg_assert(buf_len >= (a_len + 1) + (b_len + 1), "buffer too small");
1345 if(a_len > b_len)
1346 {
1347 int tmp1 = a_len;
1348 const int *tmp2 = a;
1349
1350 a_len = b_len;
1351 a = b;
1352
1353 b_len = tmp1;
1354 b = tmp2;
1355 }
1356#define B(i, j) buf[((j) & 1) * (a_len + 1) + (i)]
1357 for(i = 0; i <= a_len; i++)
1358 {
1359 B(i, 0) = i;
1360 }
1361 for(j = 1; j <= b_len; j++)
1362 {
1363 B(0, j) = j;
1364 for(i = 1; i <= a_len; i++)
1365 {
1366 int subst = (a[i - 1] != b[j - 1]);
1367 B(i, j) = minimum(
1368 B(i - 1, j) + 1,
1369 B(i, j - 1) + 1,
1370 B(i - 1, j - 1) + subst);
1371 }
1372 }
1373 return B(a_len, b_len);
1374#undef B
1375}
1376