unicode.h source code [include/c++/16.1.1/bits/unicode.h]

1	// Unicode utilities -- C++ --
2
3	// Copyright The GNU Toolchain Authors.
4	//
5	// This file is part of the GNU ISO C++ Library. This library is free
6	// software; you can redistribute it and/or modify it under the
7	// terms of the GNU General Public License as published by the
8	// Free Software Foundation; either version 3, or (at your option)
9	// any later version.
10
11	// This library is distributed in the hope that it will be useful,
12	// but WITHOUT ANY WARRANTY; without even the implied warranty of
13	// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14	// GNU General Public License for more details.
15
16	// Under Section 7 of GPL version 3, you are granted additional
17	// permissions described in the GCC Runtime Library Exception, version
18	// 3.1, as published by the Free Software Foundation.
19
20	// You should have received a copy of the GNU General Public License and
21	// a copy of the GCC Runtime Library Exception along with this program;
22	// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23	// <http://www.gnu.org/licenses/>.
24
25	/* @file include/bits/unicode.h*
26	* This is an internal header file, included by other library headers.
27	* Do not attempt to use it directly. @headername{format}
28	*/
29
30	#ifndef _GLIBCXX_UNICODE_H
31	#define _GLIBCXX_UNICODE_H 1
32
33	#if __cplusplus >= 202002L
34	#include <array>
35	#include <bit> // bit_width
36	#include <charconv> // __detail::__from_chars_alnum_to_val_table
37	#include <string_view>
38	#include <cstdint>
39	#include <bits/stl_algo.h>
40	#include <bits/stl_iterator.h>
41	#include <bits/ranges_base.h> // iterator_t, sentinel_t, input_range, etc.
42	#include <bits/ranges_util.h> // view_interface
43
44	namespace std _GLIBCXX_VISIBILITY(default)
45	{
46	_GLIBCXX_BEGIN_NAMESPACE_VERSION
47	namespace __unicode
48	{
49	// A Unicode code point that is not a high or low surrogate.
50	constexpr bool
51	__is_scalar_value(char32_t __c)
52	{
53	if (__c < `0xD800`) [[likely]]
54	return true;
55	return `0xDFFF` < __c && __c <= `0x10FFFF`;
56	}
57
58	// A code point that can be encoded in a single code unit of type _CharT.
59	template<typename _CharT>
60	constexpr bool
61	__is_single_code_unit(char32_t __c)
62	{
63	if constexpr (__gnu_cxx::__int_traits<_CharT>::__max <= `0xFF`)
64	return __c <= `0x7F`; // ASCII character
65	else
66	return __c < __gnu_cxx::__int_traits<_CharT>::__max
67	&& __is_scalar_value(__c);
68	}
69
70	// Based on https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2023/p2728r6.html#add-the-transcoding-iterator-template
71
72	struct _Repl
73	{
74	constexpr char32_t
75	operator()() const noexcept
76	{ return `0xFFFD`; }
77	};
78
79	struct _Null_sentinel_t
80	{
81	template<input_iterator _It>
82	requires default_initializable<iter_value_t<_It>>
83	&& equality_comparable_with<iter_reference_t<_It>, iter_value_t<_It>>
84	friend constexpr auto
85	operator==(_It __it, _Null_sentinel_t)
86	{ return *__it == iter_value_t<_It>{}; }
87	};
88
89	// An iterator over an input range of FromFmt code units that yields either
90	// UTF-8, UTF-16, or UTF-32, as a range of ToFmt code units.
91	// The code units from the input range are interpreted as Unicode code points
92	// and the iterator produces the individual code unit for each code point.
93	// Invalid sequences in the input are replaced with U+FFDD so that the result
94	// is always valid UTF-8, UTF-16, or UTF-32.
95	//
96	// The iterator knows the bounds of the underlying input range and will not
97	// read outside those bounds (incrementing or decrementing at the boundary
98	// is erroneously idempotent).
99	//
100	// On construction, the iterator attemps to decode a single code point from
101	// the input range and then encode it into an internal buffer in the output
102	// format, e.g. if the input is UTF-8 and the output is UTF-16, it might read
103	// three char8_t code units from the input and store two char16_t code units
104	// in its buffer. Incrementing the iterator will first iterate over buffer,
105	// yielding each code unit in turn, and then extract another code point from
106	// the input. Failure to extract a valid code point from the input will store
107	// U+FFFD in the buffer, encoded as the appropriate code units of type ToFmt.
108	template<typename _FromFmt, typename _ToFmt,
109	input_iterator _Iter, sentinel_for<_Iter> _Sent = _Iter,
110	typename _ErrorHandler = _Repl>
111	requires convertible_to<iter_value_t<_Iter>, _FromFmt>
112	class _Utf_iterator
113	{
114	static_assert(forward_iterator<_Iter> \|\| noexcept(_ErrorHandler()()));
115
116	public:
117	using value_type = _ToFmt;
118	using difference_type = iter_difference_t<_Iter>;
119	using reference = value_type;
120	using iterator_concept
121	= std::__detail::__clamp_iter_cat<__iter_category_t<_Iter>,
122	bidirectional_iterator_tag>;
123
124	constexpr _Utf_iterator() = default;
125
126	constexpr
127	_Utf_iterator(_Iter __first, _Iter __it, _Sent __last)
128	requires bidirectional_iterator<_Iter>
129	: _M_first_and_curr{__first, __it}, _M_last(__last)
130	{
131	if (_M_curr() != _M_last)
132	_M_read();
133	else
134	_M_buf = {};
135	}
136
137	constexpr
138	_Utf_iterator(_Iter __it, _Sent __last)
139	requires (!bidirectional_iterator<_Iter>)
140	: _M_first_and_curr{__it}, _M_last(__last)
141	{
142	if (_M_curr() != _M_last)
143	_M_read();
144	else
145	_M_buf = {};
146	}
147
148	template<class _Iter2, class _Sent2>
149	requires convertible_to<_Iter2, _Iter> && convertible_to<_Sent2, _Sent>
150	constexpr
151	_Utf_iterator(const _Utf_iterator<_FromFmt, _ToFmt, _Iter2, _Sent2,
152	_ErrorHandler>& __other)
153	: _M_buf(__other._M_buf), _M_first_and_curr(__other._M_first_and_curr),
154	_M_buf_index(__other._M_buf_index), _M_buf_last(__other._M_buf_last),
155	_M_last(__other._M_last)
156	{ }
157
158	[[nodiscard]]
159	constexpr _Iter
160	begin() const requires bidirectional_iterator<_Iter>
161	{ return _M_first(); }
162
163	[[nodiscard]]
164	constexpr _Sent
165	end() const { return _M_last; }
166
167	[[nodiscard]]
168	constexpr _Iter
169	base() const requires forward_iterator<_Iter>
170	{ return _M_curr(); }
171
172	[[nodiscard]]
173	constexpr iter_difference_t<_Iter>
174	_M_units() const requires forward_iterator<_Iter>
175	{ return _M_to_increment; }
176
177	[[nodiscard]]
178	constexpr value_type
179	operator() const* { return _M_buf[_M_buf_index]; }
180
181	constexpr _Utf_iterator&
182	operator++()
183	{
184	if (_M_buf_index + `1` < _M_buf_last)
185	++_M_buf_index; // Move to the next code unit in the buffer.
186	else if (_M_curr() != _M_last)
187	{
188	// Advance past the current code point (for non-forward iterators
189	// we already moved there after decoding the last code point).
190	if constexpr (forward_iterator<_Iter>)
191	std::advance(_M_curr(), _M_to_increment);
192	if (_M_curr() == _M_last)
193	_M_buf_index = `0`;
194	else // Decode next code point from the input and update buffer.
195	_M_read();
196	}
197	// else erroneous, but ignored for now.
198	return *this;
199	}
200
201	constexpr _Utf_iterator
202	operator++(int)
203	{
204	auto __tmp = *this;
205	++*this;
206	return __tmp;
207	}
208
209	constexpr _Utf_iterator&
210	operator--() requires bidirectional_iterator<_Iter>
211	{
212	if (_M_buf_index > `0`)
213	--_M_buf_index;
214	else if (_M_curr() != _M_first())
215	{
216	_M_read_reverse();
217	_M_buf_index = _M_buf_last - `1`;
218	ranges::advance(_M_curr(), -_M_to_increment);
219	}
220	// else erroneous, but ignored for now.
221	return *this;
222	}
223
224	constexpr _Utf_iterator
225	operator--(int)
226	{
227	auto __tmp = *this;
228	--*this;
229	return __tmp;
230	}
231
232	[[nodiscard]]
233	friend constexpr bool
234	operator==(_Utf_iterator __lhs, _Utf_iterator __rhs)
235	requires forward_iterator<_Iter> \|\| requires (_Iter __i) { __i != __i; }
236	{
237	if constexpr (forward_iterator<_Iter>)
238	return __lhs._M_curr() == __rhs._M_curr()
239	&& __lhs._M_buf_index == __rhs._M_buf_index;
240	else if (__lhs._M_curr() != __rhs._M_curr())
241	return false;
242	else if (__lhs._M_buf_index == __rhs._M_buf_index
243	&& __lhs._M_buf_last == __rhs._M_buf_last)
244	return true;
245	else
246	return __lhs._M_buf_index == __lhs._M_buf_last
247	&& __rhs._M_buf_index == __rhs._M_buf_last;
248	}
249
250	[[nodiscard]]
251	friend constexpr bool
252	operator==(_Utf_iterator __lhs, _Sent __rhs)
253	{
254	if constexpr (forward_iterator<_Iter>)
255	return __lhs._M_curr() == __rhs;
256	else
257	return __lhs._M_curr() == __rhs
258	&& __lhs._M_buf_index == __lhs._M_buf_last;
259	}
260
261	private:
262	constexpr void
263	_M_read()
264	{
265	if constexpr (sizeof(_FromFmt) == sizeof(uint8_t))
266	_M_read_utf8();
267	else if constexpr (sizeof(_FromFmt) == sizeof(uint16_t))
268	_M_read_utf16();
269	else
270	{
271	static_assert(sizeof(_FromFmt) == sizeof(uint32_t));
272	_M_read_utf32();
273	}
274	}
275
276	constexpr void
277	_M_read_reverse() requires bidirectional_iterator<_Iter>
278	{
279	if constexpr (sizeof(_FromFmt) == sizeof(uint8_t))
280	_M_read_reverse_utf8();
281	else if constexpr (sizeof(_FromFmt) == sizeof(uint16_t))
282	_M_read_reverse_utf16();
283	else
284	{
285	static_assert(sizeof(_FromFmt) == sizeof(uint32_t));
286	_M_read_reverse_utf32();
287	}
288	}
289
290	template<typename>
291	struct _Guard
292	{
293	_Guard(void*, _Iter&) { }
294	};
295
296	template<typename _It> requires forward_iterator<_It>
297	struct _Guard<_It>
298	{
299	constexpr ~_Guard() { _M_this->_M_curr() = std::move(_M_orig); }
300	_Utf_iterator* _M_this;
301	_It _M_orig;
302	};
303
304	constexpr char32_t
305	_M_read_utf8()
306	{
307	_Guard<_Iter> __g{this, _M_curr()};
308	char32_t __c{};
309	const uint8_t __lo_bound = `0x80`, __hi_bound = `0xBF`;
310	uint8_t __u = *_M_curr()++;
311	uint8_t __to_incr = `1`;
312	auto __incr = [&, this] {
313	++__to_incr;
314	return ++_M_curr();
315	};
316
317	if (__u <= `0x7F`) [[likely]] // 0x00 to 0x7F
318	__c = __u;
319	else if (__u < `0xC2`) [[unlikely]]
320	__c = _S_error();
321	else if (_M_curr() == _M_last) [[unlikely]]
322	__c = _S_error();
323	else if (__u <= `0xDF`) // 0xC2 to 0xDF
324	{
325	__c = __u & `0x1F`;
326	__u = *_M_curr();
327
328	if (__u < __lo_bound \|\| __u > __hi_bound) [[unlikely]]
329	__c = _S_error();
330	else
331	{
332	__c = (__c << `6`) \| (__u & `0x3F`);
333	__incr();
334	}
335	}
336	else if (__u <= `0xEF`) // 0xE0 to 0xEF
337	{
338	const uint8_t __lo_bound_2 = __u == `0xE0` ? `0xA0` : __lo_bound;
339	const uint8_t __hi_bound_2 = __u == `0xED` ? `0x9F` : __hi_bound;
340
341	__c = __u & `0x0F`;
342	__u = *_M_curr();
343
344	if (__u < __lo_bound_2 \|\| __u > __hi_bound_2) [[unlikely]]
345	__c = _S_error();
346	else if (__incr() == _M_last) [[unlikely]]
347	__c = _S_error();
348	else
349	{
350	__c = (__c << `6`) \| (__u & `0x3F`);
351	__u = *_M_curr();
352
353	if (__u < __lo_bound \|\| __u > __hi_bound) [[unlikely]]
354	__c = _S_error();
355	else
356	{
357	__c = (__c << `6`) \| (__u & `0x3F`);
358	__incr();
359	}
360	}
361	}
362	else if (__u <= `0xF4`) // 0xF0 to 0xF4
363	{
364	const uint8_t __lo_bound_2 = __u == `0xF0` ? `0x90` : __lo_bound;
365	const uint8_t __hi_bound_2 = __u == `0xF4` ? `0x8F` : __hi_bound;
366
367	__c = __u & `0x07`;
368	__u = *_M_curr();
369
370	if (__u < __lo_bound_2 \|\| __u > __hi_bound_2) [[unlikely]]
371	__c = _S_error();
372	else if (__incr() == _M_last) [[unlikely]]
373	__c = _S_error();
374	else
375	{
376	__c = (__c << `6`) \| (__u & `0x3F`);
377	__u = *_M_curr();
378
379	if (__u < __lo_bound \|\| __u > __hi_bound) [[unlikely]]
380	__c = _S_error();
381	else if (__incr() == _M_last) [[unlikely]]
382	__c = _S_error();
383	else
384	{
385	__c = (__c << `6`) \| (__u & `0x3F`);
386	__u = *_M_curr();
387
388	if (__u < __lo_bound \|\| __u > __hi_bound) [[unlikely]]
389	__c = _S_error();
390	else
391	{
392	__c = (__c << `6`) \| (__u & `0x3F`);
393	__incr();
394	}
395	}
396	}
397	}
398	else [[unlikely]]
399	__c = _S_error();
400
401	_M_update(__c, __to_incr);
402
403	return __c;
404	}
405
406	constexpr void
407	_M_read_utf16()
408	{
409	_Guard<_Iter> __g{this, _M_curr()};
410	char32_t __c{};
411	uint16_t __u = *_M_curr()++;
412	uint8_t __to_incr = `1`;
413
414	if (__u < `0xD800` \|\| __u > `0xDFFF`) [[likely]]
415	__c = __u;
416	else if (__u < `0xDC00` && _M_curr() != _M_last)
417	{
418	uint16_t __u2 = *_M_curr();
419	if (__u2 < `0xDC00` \|\| __u2 > `0xDFFF`) [[unlikely]]
420	__c = _S_error();
421	else
422	{
423	++_M_curr();
424	__to_incr = `2`;
425	uint32_t __x = (__u & `0x3F`) << `10` \| (__u2 & `0x3FF`);
426	uint32_t __w = (__u >> `6`) & `0x1F`;
427	__c = (__w + `1`) << `16` \| __x;
428	}
429	}
430	else
431	__c = _S_error();
432
433	_M_update(__c, __to_incr);
434	}
435
436	constexpr void
437	_M_read_utf32()
438	{
439	_Guard<_Iter> __g{this, _M_curr()};
440	char32_t __c = *_M_curr()++;
441	if (!__is_scalar_value(__c)) [[unlikely]]
442	__c = _S_error();
443	_M_update(__c, to_incr: `1`);
444	}
445
446	constexpr void
447	_M_read_reverse_utf8() requires bidirectional_iterator<_Iter>
448	{
449	const auto __first = _M_first();
450	auto __curr = _M_curr();
451	// The code point we decode:
452	char32_t __c{};
453	// The last code unit read:
454	uint8_t __u = *--__curr;
455	// Count of bytes read:
456	uint8_t __to_incr = `1`;
457
458	if (__u <= `0x7F`) [[likely]]
459	{
460	_M_update(c: __u, to_incr: `1`);
461	return;
462	}
463
464	// Continuation bytes match 10xxxxxx
465	auto __is_continuation = [](uint8_t __b) {
466	return (__b & `0xC0`) == `0x80`;
467	};
468	// 0xC0 and 0xC1 would produce overlong encodings of ASCII characters.
469	// 0xF5-0xFF would produce code points above U+10FFFF
470	auto __is_invalid = [](uint8_t __b) {
471	return (__b & `0xFE`) == `0xC0` \|\| __b >= `0xF5`;
472	};
473
474	// No valid or invalid multibyte sequence is longer than 4 bytes,
475	// so skip back over at most four bytes.
476	while (__is_continuation(__u) && __to_incr < `4` && __curr != __first)
477	{
478	++__to_incr;
479	__u = *--__curr;
480	}
481
482	// If the last byte read was a continuation byte then either we read
483	// four continuation bytes, or stopped at the start of the sequence.
484	// Either way, the maximal subparts are the individual continuation
485	// bytes so each one should be replaced with U+FFFD.
486	if (__is_continuation(__u) \|\| __is_invalid(__u)) [[unlikely]]
487	{
488	// Either found four continuation bytes (maximum allowed is three)
489	// or first non-continuation byte is an invalid UTF-8 code unit.
490	_M_update(c: _S_error(), to_incr: `1`);
491	return;
492	}
493	// __u is a valid start byte so use countl_one to get the expected
494	// length of the multibyte sequence that starts with this byte.
495	int __seq_length = std::countl_one(x: (unsigned char)__u);
496	if (__seq_length < __to_incr) [[unlikely]]
497	{
498	// If the expected number of continuation bytes is less than
499	// the number we found, then the last continuation byte is a
500	// maximal subpart and the decremented iterator points to it.
501	_M_update(c: _S_error(), to_incr: `1`);
502	return;
503	}
504
505	auto __orig = std::__exchange(_M_curr(), std::move(__curr));
506	if (_M_read_utf8() == _S_error()) [[unlikely]]
507	{
508	if (_M_to_increment < __to_incr) // Read truncated sequence, set
509	_M_to_increment = `1`; // curr to last continuation byte.
510	}
511
512	_M_curr() = std::move(__orig);
513	// operator--() will move back by _M_to_increment
514	}
515
516	constexpr void
517	_M_read_reverse_utf16() requires bidirectional_iterator<_Iter>
518	{
519	_Guard<_Iter> __g{this, _M_curr()};
520	char32_t __c{};
521	uint16_t __u = *--_M_curr();
522	uint8_t __to_incr = `1`;
523
524	if (__u < `0xD800` \|\| __u > `0xDFFF`) [[likely]]
525	__c = __u;
526	else if (__u >= `0xDC00` && _M_curr() != _M_first()) [[likely]]
527	{
528	// read a low surrogate, expect a high surrogate before it.
529	uint16_t __u2 = *--_M_curr();
530	if (__u2 < `0xD800` \|\| __u2 >= `0xDC00`) [[unlikely]]
531	__c = _S_error(); // unpaired low surrogate
532	else
533	{
534	__to_incr = `2`;
535	uint32_t __x = (__u2 & `0x3F`) << `10` \| (__u & `0x3FF`);
536	uint32_t __w = (__u2 >> `6`) & `0x1F`;
537	__c = (__w + `1`) << `16` \| __x;
538	}
539	}
540	else
541	__c = _S_error(); // unpaired surrogate
542
543	_M_update(__c, __to_incr);
544	}
545
546	constexpr void
547	_M_read_reverse_utf32() requires bidirectional_iterator<_Iter>
548	{
549	_Guard<_Iter> __g{this, _M_curr()};
550	char32_t __c = *--_M_curr();
551	if (!__is_scalar_value(__c)) [[unlikely]]
552	__c = _S_error();
553	_M_update(__c, to_incr: `1`);
554	}
555
556	// Encode the code point __c as one or more code units in _M_buf.
557	constexpr void
558	_M_update(char32_t __c, uint8_t __to_incr)
559	{
560	_M_to_increment = __to_incr;
561	_M_buf_index = `0`;
562	if constexpr (sizeof(_ToFmt) == sizeof(uint32_t))
563	{
564	_M_buf[`0`] = __c;
565	_M_buf_last = `1`;
566	}
567	else if constexpr (sizeof(_ToFmt) == sizeof(uint16_t))
568	{
569	if (__is_single_code_unit<_ToFmt>(__c))
570	{
571	_M_buf[`0`] = __c;
572	_M_buf[`1`] = `0`;
573	_M_buf_last = `1`;
574	}
575	else
576	{
577	// From http://www.unicode.org/faq/utf_bom.html#utf16-4
578	const char32_t __lead_offset = `0xD800` - (`0x10000` >> `10`);
579	char16_t __lead = __lead_offset + (__c >> `10`);
580	char16_t __trail = `0xDC00` + (__c & `0x3FF`);
581	_M_buf[`0`] = __lead;
582	_M_buf[`1`] = __trail;
583	_M_buf_last = `2`;
584	}
585	}
586	else
587	{
588	static_assert(sizeof(_ToFmt) == `1`);
589	int __bits = std::bit_width(x: (uint32_t)__c);
590	if (__bits <= `7`) [[likely]]
591	{
592	_M_buf[`0`] = __c;
593	_M_buf[`1`] = _M_buf[`2`] = _M_buf[`3`] = `0`;
594	_M_buf_last = `1`;
595	}
596	else if (__bits <= `11`)
597	{
598	_M_buf[`0`] = `0xC0` \| (__c >> `6`);
599	_M_buf[`1`] = `0x80` \| (__c & `0x3F`);
600	_M_buf[`2`] = _M_buf[`3`] = `0`;
601	_M_buf_last = `2`;
602	}
603	else if (__bits <= `16`)
604	{
605	_M_buf[`0`] = `0xE0` \| (__c >> `12`);
606	_M_buf[`1`] = `0x80` \| ((__c >> `6`) & `0x3F`);
607	_M_buf[`2`] = `0x80` \| (__c & `0x3F`);
608	_M_buf[`3`] = `0`;
609	_M_buf_last = `3`;
610	}
611	else
612	{
613	_M_buf[`0`] = `0xF0` \| ((__c >> `18`) & `0x07`);
614	_M_buf[`1`] = `0x80` \| ((__c >> `12`) & `0x3F`);
615	_M_buf[`2`] = `0x80` \| ((__c >> `6`) & `0x3F`);
616	_M_buf[`3`] = `0x80` \| (__c & `0x3F`);
617	_M_buf_last = `4`;
618	}
619	}
620	}
621
622	constexpr char32_t
623	_S_error()
624	{
625	char32_t __c = _ErrorHandler()();
626	__glibcxx_assert(__is_scalar_value(__c));
627	return __c;
628	}
629
630	constexpr _Iter
631	_M_first() const requires bidirectional_iterator<_Iter>
632	{ return _M_first_and_curr._M_first; }
633
634	constexpr _Iter&
635	_M_curr() { return _M_first_and_curr._M_curr; }
636
637	constexpr _Iter
638	_M_curr() const { return _M_first_and_curr._M_curr; }
639
640	// _M_first is not needed for non-bidirectional ranges.
641	template<typename _It>
642	struct _First_and_curr
643	{
644	_First_and_curr() = default;
645
646	constexpr
647	_First_and_curr(_It __curr) : _M_curr(__curr) { }
648
649	template<convertible_to<_It> _It2>
650	constexpr
651	_First_and_curr(const _First_and_curr<_It2>& __other)
652	: _M_curr(__other._M_curr) { }
653
654	// First code unit of the current code point for forward iterators,
655	// past-the-end of the current code point for input iterators.
656	_It _M_curr;
657	};
658
659	template<typename _It> requires bidirectional_iterator<_It>
660	struct _First_and_curr<_It>
661	{
662	_First_and_curr() = default;
663
664	constexpr
665	_First_and_curr(_It __first, _It __curr)
666	: _M_first(__first), _M_curr(__curr) { }
667
668	template<convertible_to<_It> _It2>
669	constexpr
670	_First_and_curr(const _First_and_curr<_It2>& __other)
671	: _M_first(__other._M_first), _M_curr(__other._M_curr) { }
672
673	_It _M_first; // Start of the underlying range.
674	_It _M_curr; // First code unit of the current code point.
675	};
676
677	// Iterators pointing to the start of the underlying range and to the
678	// start (or end, for non-forward iterators) of the current code point.
679	_First_and_curr<_Iter> _M_first_and_curr;
680
681	// The end of the underlying input range.
682	[[no_unique_address]] _Sent _M_last;
683
684	// Buffer holding the individual code units of the current code point.
685	array<value_type, `4` / sizeof(_ToFmt)> _M_buf;
686
687	uint8_t _M_buf_index = `0`; // Index of current code unit in the buffer.
688	uint8_t _M_buf_last = `0`; // Number of code units in the buffer.
689	uint8_t _M_to_increment = `0`; // How far to advance _M_curr on increment.
690
691	template<typename _FromFmt2, typename _ToFmt2,
692	input_iterator _Iter2, sentinel_for<_Iter2> _Sent2,
693	typename _ErrHandler>
694	requires convertible_to<iter_value_t<_Iter2>, _FromFmt2>
695	friend class _Utf_iterator;
696	};
697
698	template<typename _ToFormat, ranges::input_range _View>
699	requires ranges::view<_View>
700	class _Utf_view
701	: public ranges::view_interface<_Utf_view<_ToFormat, _View>>
702	{
703	using _Iterator = _Utf_iterator<ranges::range_value_t<_View>,
704	_ToFormat, ranges::iterator_t<_View>,
705	ranges::sentinel_t<_View>>;
706
707	template<typename _Iter, typename _Sent>
708	constexpr auto
709	_M_begin(_Iter __first, _Sent __last)
710	{
711	if constexpr (bidirectional_iterator<_Iter>)
712	return _Iterator(__first, __first, __last);
713	else
714	return _Iterator(__first, __last);
715	}
716
717	template<typename _Iter, typename _Sent>
718	constexpr auto
719	_M_end(_Iter __first, _Sent __last)
720	{
721	if constexpr (!is_same_v<_Iter, _Sent>)
722	return __last;
723	else if constexpr (bidirectional_iterator<_Iter>)
724	return _Iterator(__first, __last, __last);
725	else
726	return _Iterator(__last, __last);
727	}
728
729	_View _M_base;
730
731	public:
732	constexpr explicit
733	_Utf_view(_View __r) : _M_base(std::move(__r)) { }
734
735	constexpr auto begin()
736	{ return _M_begin(ranges::begin(_M_base), ranges::end(_M_base)); }
737
738	constexpr auto end()
739	{ return _M_end(ranges::begin(_M_base), ranges::end(_M_base)); }
740
741	constexpr bool empty() const { return ranges::empty(_M_base); }
742	};
743
744	#ifdef __cpp_char8_t
745	template<typename _View>
746	using _Utf8_view = _Utf_view<char8_t, _View>;
747	#else
748	template<typename _View>
749	using _Utf8_view = _Utf_view<char, _View>;
750	#endif
751	template<typename _View>
752	using _Utf16_view = _Utf_view<char16_t, _View>;
753	template<typename _View>
754	using _Utf32_view = _Utf_view<char32_t, _View>;
755
756	inline namespace __v16_0_0
757	{
758	#define _GLIBCXX_GET_UNICODE_DATA 160000
759	#include "unicode-data.h"
760	#ifdef _GLIBCXX_GET_UNICODE_DATA
761	# error "Invalid unicode data"
762	#endif
763
764	// The field width of a code point.
765	constexpr int
766	__field_width(char32_t __c) noexcept
767	{
768	if (__c < __width_edges[`0`]) [[likely]]
769	return `1`;
770
771	auto* __p = std::upper_bound(first: __width_edges, last: std::end(arr: __width_edges), val: __c);
772	return (__p - __width_edges) % `2` + `1`;
773	}
774
775	// @pre c <= 0x10FFFF
776	constexpr bool
777	__should_escape_category(char32_t __c) noexcept
778	{
779	constexpr uint32_t __mask = `0x01`;
780	auto* __end = std::end(arr: __escape_edges);
781	auto* __p = std::lower_bound(first: __escape_edges, last: __end,
782	val: (__c << `1u`) + `2`);
783	return __p[-`1`] & __mask;
784	}
785
786
787	// @pre c <= 0x10FFFF
788	constexpr _Gcb_property
789	__grapheme_cluster_break_property(char32_t __c) noexcept
790	{
791	constexpr uint32_t __mask = (`1` << __gcb_shift_bits) - `1`;
792	auto* __end = std::end(arr: __gcb_edges);
793	auto* __p = std::lower_bound(first: __gcb_edges, last: __end,
794	val: (__c << __gcb_shift_bits) \| __mask);
795	return _Gcb_property(__p[-`1`] & __mask);
796	}
797
798	constexpr bool
799	__is_incb_linker(char32_t __c) noexcept
800	{
801	const auto __end = std::end(arr: __incb_linkers);
802	// Array is small enough that linear search is faster than binary search.
803	return _GLIBCXX_STD_A::find(first: __incb_linkers, last: __end, val: __c) != __end;
804	}
805
806	// @pre c <= 0x10FFFF
807	constexpr _InCB
808	__incb_property(char32_t __c) noexcept
809	{
810	if ((__c << `2`) < __incb_edges[`0`]) [[likely]]
811	return _InCB(`0`);
812
813	constexpr uint32_t __mask = `0x3`;
814	auto* __end = std::end(arr: __incb_edges);
815	auto* __p = std::lower_bound(first: __incb_edges, last: __end, val: (__c << `2`) \| __mask);
816	return _InCB(__p[-`1`] & __mask);
817	}
818
819	constexpr bool
820	__is_extended_pictographic(char32_t __c)
821	{
822	if (__c < __xpicto_edges[`0`]) [[likely]]
823	return `0`;
824
825	auto* __p = std::upper_bound(first: __xpicto_edges, last: std::end(arr: __xpicto_edges), val: __c);
826	return (__p - __xpicto_edges) % `2`;
827	}
828
829	struct _Grapheme_cluster_iterator_base
830	{
831	char32_t _M_c; // First code point in the cluster.
832	_Gcb_property _M_prop; // GCB property of _M_c.
833	enum class _XPicto : unsigned char { _Init, _Zwj, _Matched, _Failed };
834	_XPicto _M_xpicto_seq_state = _XPicto::_Init;
835	unsigned char _M_RI_count = `0`;
836	bool _M_incb_linker_seen = false;
837
838	constexpr void
839	_M_reset(char32_t __c, _Gcb_property __p)
840	{
841	_M_c = __c;
842	_M_prop = __p;
843	_M_xpicto_seq_state = _XPicto::_Init;
844	_M_RI_count = `0`;
845	_M_incb_linker_seen = false;
846	}
847
848	constexpr void
849	_M_update_xpicto_seq_state(char32_t __c, _Gcb_property __p)
850	{
851	if (_M_xpicto_seq_state == _XPicto::_Failed)
852	return;
853
854	auto __next_state = _XPicto::_Failed;
855	if (_M_xpicto_seq_state != _XPicto::_Zwj) // i.e. Init or Matched
856	{
857	if (__p == _Gcb_property::_Gcb_ZWJ)
858	{
859	if (_M_xpicto_seq_state == _XPicto::_Matched)
860	__next_state = _XPicto::_Zwj;
861	// We check _M_c here so that we do the lookup at most once,
862	// and only for clusters containing at least one ZWJ.
863	else if (__is_extended_pictographic(c: _M_c))
864	__next_state = _XPicto::_Zwj;
865	}
866	else if (__p == _Gcb_property::_Gcb_Extend)
867	__next_state = _M_xpicto_seq_state; // no change
868	}
869	else // Zwj
870	{
871	// This assumes that all \p{Extended_Pictographic} emoji have
872	// Grapheme_Cluster_Break=Other.
873	if (__p == _Gcb_property::_Gcb_Other
874	&& __is_extended_pictographic(__c))
875	__next_state = _XPicto::_Matched;
876	}
877	_M_xpicto_seq_state = __next_state;
878	}
879
880	constexpr void
881	_M_update_ri_count(_Gcb_property __p)
882	{
883	if (__p == _Gcb_property::_Gcb_Regional_Indicator)
884	++_M_RI_count;
885	else
886	_M_RI_count = `0`;
887	}
888
889	constexpr void
890	_M_update_incb_state(char32_t __c, _Gcb_property)
891	{
892	if (__is_incb_linker(__c))
893	_M_incb_linker_seen = true;
894	}
895	};
896
897	// Split a range into extended grapheme clusters.
898	template<ranges::forward_range _View> requires ranges::view<_View>
899	class _Grapheme_cluster_view
900	: public ranges::view_interface<_Grapheme_cluster_view<_View>>
901	{
902	public:
903
904	constexpr
905	_Grapheme_cluster_view(_View __v)
906	: _M_begin(_Utf32_view<_View>(std::move(__v)).begin())
907	{ }
908
909	constexpr auto begin() const { return _M_begin; }
910	constexpr auto end() const { return _M_begin.end(); }
911
912	private:
913	struct _Iterator : private _Grapheme_cluster_iterator_base
914	{
915	private:
916	// Iterator over the underlying code points.
917	using _U32_iterator = ranges::iterator_t<_Utf32_view<_View>>;
918
919	public:
920	// TODO: Change value_type to be subrange<_U32_iterator> instead?
921	// Alternatively, value_type could be _Utf32_view<iterator_t<_View>>.
922	// That would be the whole cluster, not just the first code point.
923	// Would need to store two iterators and find end of current cluster
924	// on increment, so operator returns value_type(_M_base, _M_next).*
925	using value_type = char32_t;
926	using iterator_concept = forward_iterator_tag;
927	using difference_type = ptrdiff_t;
928
929	constexpr
930	_Iterator(_U32_iterator __i)
931	: _M_base(__i)
932	{
933	if (__i != __i.end())
934	{
935	_M_c = *__i;
936	_M_prop = __grapheme_cluster_break_property(_M_c);
937	}
938	}
939
940	// The first code point of the current extended grapheme cluster.
941	constexpr value_type
942	operator() const*
943	{ return _M_c; }
944
945	constexpr auto
946	operator->() const
947	{ return &_M_c; }
948
949	// Move to the next extended grapheme cluster.
950	constexpr _Iterator&
951	operator++()
952	{
953	const auto __end = _M_base.end();
954	if (_M_base != __end)
955	{
956	auto __p_prev = _M_prop;
957	auto __it = _M_base;
958	while (++__it != __end)
959	{
960	char32_t __c = *__it;
961	auto __p = __grapheme_cluster_break_property(*__it);
962	_M_update_xpicto_seq_state(__c, __p);
963	_M_update_ri_count(__p);
964	_M_update_incb_state(__c, __p);
965	if (_M_is_break(p1: __p_prev, p2: __p, curr: __it))
966	{
967	// Found a grapheme cluster break
968	_M_reset(__c, __p);
969	break;
970	}
971	__p_prev = __p;
972	}
973	_M_base = __it;
974	}
975	return *this;
976	}
977
978	constexpr _Iterator
979	operator++(int)
980	{
981	auto __tmp = *this;
982	++*this;
983	return __tmp;
984	}
985
986	constexpr bool
987	operator==(const _Iterator& __i) const
988	{ return _M_base == __i._M_base; }
989
990	// This supports iter != iter.end()
991	constexpr bool
992	operator==(const ranges::sentinel_t<_View>& __i) const
993	{ return _M_base == __i; }
994
995	// Iterator to the start of the current cluster.
996	constexpr auto base() const { return _M_base.base(); }
997
998	// The end of the underlying view (not the end of the current cluster!)
999	constexpr auto end() const { return _M_base.end(); }
1000
1001	// Field width of the first code point in the cluster.
1002	constexpr int
1003	width() const noexcept
1004	{ return __field_width(_M_c); }
1005
1006	private:
1007	_U32_iterator _M_base;
1008
1009	// Implement the Grapheme Cluster Boundary Rules from Unicode Annex #29
1010	// http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
1011	// This implements the rules from TR29 revision 43 in Unicode 15.1.0.
1012	// Return true if there is a break between code point with property p1
1013	// and code point with property p2.
1014	constexpr bool
1015	_M_is_break(_Gcb_property __p1, _Gcb_property __p2,
1016	_U32_iterator __curr) const
1017	{
1018	using enum _Gcb_property;
1019
1020	if (__p1 == _Gcb_Control \|\| __p1 == _Gcb_LF)
1021	return true; // Break after Control or LF.
1022
1023	if (__p1 == _Gcb_CR)
1024	return __p2 != _Gcb_LF; // Do not break between a CR and LF.
1025
1026	// Rule GB5
1027	if (__p2 == _Gcb_Control \|\| __p2 == _Gcb_CR \|\| __p2 == _Gcb_LF)
1028	return true; // Break before Control, CR or LF.
1029
1030	// Rule GB6
1031	if (__p1 == _Gcb_L)
1032	switch (__p2)
1033	{
1034	case _Gcb_L:
1035	case _Gcb_V:
1036	case _Gcb_LV:
1037	case _Gcb_LVT:
1038	return false; // Do not break Hangul syllable sequences.
1039	default:
1040	return true;
1041	}
1042
1043	// Rule GB7
1044	if (__p1 == _Gcb_LV \|\| __p1 == _Gcb_V)
1045	switch (__p2)
1046	{
1047	case _Gcb_V:
1048	case _Gcb_T:
1049	return false; // Do not break Hangul syllable sequences.
1050	default:
1051	return true;
1052	}
1053
1054	// Rule GB8
1055	if (__p1 == _Gcb_LVT \|\| __p1 == _Gcb_T)
1056	return __p2 != _Gcb_T; // Do not break Hangul syllable sequences.
1057
1058	// Rule GB9
1059	if (__p2 == _Gcb_Extend \|\| __p2 == _Gcb_ZWJ)
1060	return false; // Do not break before extending characters or ZWJ.
1061
1062	// The following GB9x rules only apply to extended grapheme clusters,
1063	// which is what the C++ standard uses (not legacy grapheme clusters).
1064
1065	// Rule GB9a
1066	if (__p2 == _Gcb_SpacingMark)
1067	return false; // Do not break before SpacingMarks,
1068	// Rule GB9b
1069	if (__p1 == _Gcb_Prepend)
1070	return false; // or after Prepend characters.
1071
1072	// Rule GB9c (Unicode 15.1.0)
1073	// Do not break within certain combinations with
1074	// Indic_Conjunct_Break (InCB)=Linker.
1075	if (_M_incb_linker_seen
1076	&& __incb_property(_M_c) == _InCB::_Consonant
1077	&& __incb_property(*__curr) == _InCB::_Consonant)
1078	{
1079	// Match [_M_base, __curr] against regular expression
1080	// Consonant ([Extend Linker] Linker [Extend Linker]* Consonant)+*
1081	bool __have_linker = false;
1082	auto __it = _M_base;
1083	while (++__it != __curr)
1084	{
1085	if (__is_incb_linker(*__it))
1086	__have_linker = true;
1087	else
1088	{
1089	auto __incb = __incb_property(*__it);
1090	if (__incb == _InCB::_Consonant)
1091	__have_linker = false;
1092	else if (__incb != _InCB::_Extend)
1093	break;
1094	}
1095	}
1096	if (__it == __curr && __have_linker)
1097	return false;
1098	}
1099
1100	// Rule GB11
1101	// Do not break within emoji modifier sequences
1102	// or emoji zwj sequences.
1103	if (__p1 == _Gcb_ZWJ && _M_xpicto_seq_state == _XPicto::_Matched)
1104	return false;
1105
1106	// Rules GB12 and GB13
1107	// Do not break within emoji flag sequences. That is, do not break
1108	// between regional indicator (RI) symbols if there is an odd number
1109	// of RI characters before the break point.
1110	if (__p1 == _Gcb_property::_Gcb_Regional_Indicator && __p1 == __p2)
1111	return (_M_RI_count & `1`) == `0`;
1112
1113	// Rule GB999
1114	return true; // Otherwise, break everywhere.
1115	}
1116	};
1117
1118	_Iterator _M_begin;
1119	};
1120
1121	} // namespace __v16_0_0
1122
1123	// Return the field width of a string.
1124	template<typename _CharT>
1125	constexpr size_t
1126	__field_width(basic_string_view<_CharT> __s)
1127	{
1128	if (__s.empty()) [[unlikely]]
1129	return `0`;
1130	_Grapheme_cluster_view<basic_string_view<_CharT>> __gc(__s);
1131	auto __it = __gc.begin();
1132	const auto __end = __gc.end();
1133	size_t __n = __it.width();
1134	while (++__it != __end)
1135	__n += __it.width();
1136	return __n;
1137	}
1138
1139	// Truncate a string to at most `__max` field width units, and return the
1140	// resulting field width.
1141	template<typename _CharT>
1142	constexpr size_t
1143	__truncate(basic_string_view<_CharT>& __s, size_t __max)
1144	{
1145	if (__s.empty()) [[unlikely]]
1146	return `0`;
1147
1148	_Grapheme_cluster_view<basic_string_view<_CharT>> __gc(__s);
1149	auto __it = __gc.begin();
1150	const auto __end = __gc.end();
1151	size_t __n = __it.width();
1152	if (__n > __max)
1153	{
1154	__s = {};
1155	return `0`;
1156	}
1157	while (++__it != __end)
1158	{
1159	size_t __n2 = __n + __it.width();
1160	if (__n2 > __max)
1161	{
1162	__s = basic_string_view<_CharT>(__s.begin(), __it.base());
1163	return __n;
1164	}
1165	__n = __n2;
1166	}
1167	return __n;
1168	}
1169
1170	template<typename _CharT>
1171	consteval bool
1172	__literal_encoding_is_unicode()
1173	{
1174	if constexpr (is_same_v<_CharT, char16_t>)
1175	return true;
1176	else if constexpr (is_same_v<_CharT, char32_t>)
1177	return true;
1178	#ifdef __cpp_char8_t
1179	else if constexpr (is_same_v<_CharT, char8_t>)
1180	return true;
1181	#endif
1182
1183	const char* __enc = "";
1184
1185	#ifdef __GNUC_EXECUTION_CHARSET_NAME
1186	auto __remove_iso10646_prefix = [](const char* __s) {
1187	// GNU iconv allows "ISO-10646/" prefix (case-insensitive).
1188	if (__s[`0`] == `'I'` \|\| __s[`0`] == `'i'`)
1189	if (__s[`1`] == `'S'` \|\| __s[`1`] == `'s'`)
1190	if (__s[`2`] == `'O'` \|\| __s[`2`] == `'o'`)
1191	if (string_view(__s + `3`).starts_with("-10646/"))
1192	return __s + `10`;
1193	return __s;
1194	};
1195
1196	if constexpr (is_same_v<_CharT, char>)
1197	__enc = __remove_iso10646_prefix(__GNUC_EXECUTION_CHARSET_NAME);
1198	# if defined _GLIBCXX_USE_WCHAR_T && defined __GNUC_WIDE_EXECUTION_CHARSET_NAME
1199	else
1200	__enc = __remove_iso10646_prefix(__GNUC_WIDE_EXECUTION_CHARSET_NAME);
1201	# endif
1202
1203	if ((__enc[`0`] == `'U'` \|\| __enc[`0`] == `'u'`)
1204	&& (__enc[`1`] == `'T'` \|\| __enc[`1`] == `'t'`)
1205	&& (__enc[`2`] == `'F'` \|\| __enc[`2`] == `'f'`))
1206	{
1207	__enc += `3`;
1208	if (__enc[`0`] == `'-'`)
1209	++__enc;
1210	if (__enc[`0`] == `'8'`)
1211	return __enc[`1`] == `'\0'` \|\| string_view(__enc + `1`) == "//";
1212	else if constexpr (!is_same_v<_CharT, char>)
1213	{
1214	string_view __s(__enc);
1215	if (__s.ends_with("//"))
1216	__s.remove_suffix(`2`);
1217	if (__s.ends_with("LE") \|\| __s.ends_with("BE"))
1218	__s.remove_suffix(`2`);
1219	return __s == "16" \|\| __s == "32";
1220	}
1221	}
1222	#elif defined __clang_literal_encoding__
1223	if constexpr (is_same_v<_CharT, char>)
1224	__enc = __clang_literal_encoding__;
1225	# if defined _GLIBCXX_USE_WCHAR_T && defined __clang_wide_literal_encoding__
1226	else
1227	__enc = __clang_wide_literal_encoding__;
1228	# endif
1229	// Clang accepts "-fexec-charset=utf-8" but the macro is still uppercase.
1230	string_view __s(__enc);
1231	if (__s == "UTF-8")
1232	return true;
1233	else if constexpr (!is_same_v<_CharT, char>)
1234	return __s == "UTF-16" \|\| __s == "UTF-32";
1235	#endif
1236
1237	return false;
1238	}
1239
1240	consteval bool
1241	__literal_encoding_is_utf8()
1242	{ return __literal_encoding_is_unicode<char>(); }
1243
1244	consteval bool
1245	__literal_encoding_is_extended_ascii()
1246	{
1247	return `'0'` == `0x30` && `'A'` == `0x41` && `'Z'` == `0x5a`
1248	&& `'a'` == `0x61` && `'z'` == `0x7a`;
1249	}
1250
1251	// https://www.unicode.org/reports/tr22/tr22-8.html#Charset_Alias_Matching
1252	constexpr bool
1253	__charset_alias_match(string_view __a, string_view __b)
1254	{
1255	// Map alphanumeric chars to their base 64 value, everything else to 127.
1256	auto __map = [](char __c, bool& __num) -> unsigned char {
1257	if (__c == `'0'`) [[unlikely]]
1258	return __num ? `0` : `127`;
1259	const auto __v = __detail::__from_chars_alnum_to_val(__c);
1260	__num = __v < `10`;
1261	return __v;
1262	};
1263
1264	auto __ptr_a = __a.begin(), __end_a = __a.end();
1265	auto __ptr_b = __b.begin(), __end_b = __b.end();
1266	bool __num_a = false, __num_b = false;
1267
1268	while (true)
1269	{
1270	// Find the value of the next alphanumeric character in each string.
1271	unsigned char __val_a{}, __val_b{};
1272	while (__ptr_a != __end_a
1273	&& (__val_a = __map(*__ptr_a, __num_a)) == `127`)
1274	++__ptr_a;
1275	while (__ptr_b != __end_b
1276	&& (__val_b = __map(*__ptr_b, __num_b)) == `127`)
1277	++__ptr_b;
1278	// Stop when we reach the end of a string, or get a mismatch.
1279	if (__ptr_a == __end_a)
1280	return __ptr_b == __end_b;
1281	else if (__ptr_b == __end_b)
1282	return false;
1283	else if (__val_a != __val_b)
1284	return false; // Found non-matching characters.
1285	++__ptr_a;
1286	++__ptr_b;
1287	}
1288	return true;
1289	}
1290
1291	} // namespace __unicode
1292
1293	namespace ranges
1294	{
1295	template<typename _To, typename _Range>
1296	inline constexpr bool
1297	enable_borrowed_range<std::__unicode::_Utf_view<_To, _Range>>
1298	= enable_borrowed_range<_Range>;
1299
1300	template<typename _Range>
1301	inline constexpr bool
1302	enable_borrowed_range<std::__unicode::_Grapheme_cluster_view<_Range>>
1303	= enable_borrowed_range<_Range>;
1304	} // namespace ranges
1305
1306	_GLIBCXX_END_NAMESPACE_VERSION
1307	} // namespace std
1308	#endif // C++20
1309	#endif // _GLIBCXX_UNICODE_H
1310

Browse the source code of include/c++/16.1.1/bits/unicode.h