nulib.text.unicode source code

1 /**
2     Unicode Parsing and Utilities.
3     
4     Copyright:
5         Copyright © 2023-2025, Kitsunebi Games
6         Copyright © 2023-2025, Inochi2D Project
7     
8     License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
9     Authors:   Luna Nielsen
10 */
11 module nulib.text.unicode;
12 import nulib.memory.endian;
13 import nulib.collections.vector;
14 import nulib.string;
15 
16 public import nulib.text.unicode.utf8;
17 public import nulib.text.unicode.utf16;
18 public import nulib.text.unicode.utf32;
19 
20 // For encoding dispatch
21 import utf8 = nulib.text.unicode.utf8;
22 import utf16 = nulib.text.unicode.utf16;
23 import utf32 = nulib.text.unicode.utf32;
24 
25 @nogc:
26 
27 /**
28     A unicode codepoint
29 */
30 alias codepoint = uint;
31 
32 /**
33     Codepoint for the unicode byte-order-mark
34 */
35 enum codepoint UNICODE_BOM = 0xFEFF;
36 
/**
    Checks whether a codepoint is acceptable as a unicode value.

    Params:
        code = The codepoint to check.

    Returns:
        $(D true) if $(D code) is no greater than 0x10FFFF and does not
        fall in the surrogate range, $(D false) otherwise.
*/
bool validate(codepoint code) @safe {
    // Anything above the last unicode plane is out of spec.
    if (code > 0x10FFFF)
        return false;

    // Surrogates are only meaningful in UTF-16 and are not valid on their own.
    return !hasSurrogatePairs(code);
}
43 
/**
    Gets whether the codepoint lies within the surrogate range,
    i.e. whether surrogate values were mistakenly stored in it.

    Params:
        code = The codepoint to check.

    Returns:
        $(D true) if $(D code) is within 0xD800 to 0xDFFF (inclusive),
        $(D false) otherwise.
*/
bool hasSurrogatePairs(codepoint code) @safe {
    enum codepoint surrogateFirst = 0x0000D800;
    enum codepoint surrogateLast = 0x0000DFFF;

    return !(code < surrogateFirst || code > surrogateLast);
}
50 
/**
    Gets whether the given value is a byte-order-mark, in either
    byte order.

    Params:
        c = The value to check.

    Returns:
        $(D true) if $(D c) is accepted by either isLittleEndianBOM
        or isBigEndianBOM, $(D false) otherwise.
*/
bool isBOM(codepoint c) @safe {
    if (isLittleEndianBOM(c))
        return true;

    return isBigEndianBOM(c);
}
57 
/**
    Gets whether the given value is a little endian byte-order-mark.

    Params:
        c = The value to check.

    Returns:
        $(D true) if $(D c) equals 0x0000FFFE or 0xFFFE0000,
        $(D false) otherwise.
*/
pragma(inline, true)
bool isLittleEndianBOM(codepoint c) @safe {
    // Kept as a single expression so the forced inlining stays trivial.
    return c == 0x0000FFFE || c == 0xFFFE0000;
}
65 
/**
    Gets whether the given value is a big endian byte-order-mark.

    Params:
        c = The value to check.

    Returns:
        $(D true) if $(D c) equals 0x0000FEFF or 0xFEFF0000,
        $(D false) otherwise.
*/
pragma(inline, true)
bool isBigEndianBOM(codepoint c) @safe {
    // Kept as a single expression so the forced inlining stays trivial.
    return c == 0x0000FEFF || c == 0xFEFF0000;
}
73 
/**
    Gets the endianess indicated by a byte-order-mark.

    Params:
        c = The byte-order-mark to inspect.

    Returns:
        $(D Endianess.bigEndian) if $(D c) is a big endian BOM,
        $(D Endianess.littleEndian) for every other value.

    Note:
        Values which are not a BOM at all also yield
        $(D Endianess.littleEndian); use isBOM first if the
        distinction matters.
*/
Endianess getEndianFromBOM(codepoint c) @safe {
    if (isBigEndianBOM(c))
        return Endianess.bigEndian;

    return Endianess.littleEndian;
}
82 
/**
    Decodes a string into a sequence of codepoints, dispatching to the
    UTF-8, UTF-16 or UTF-32 decoder based on the character size of $(D T).

    Params:
        str =       The string to decode.
        stripBOM =  Passed on to the UTF-16 and UTF-32 decoders.
                    It is not forwarded to the UTF-8 decoder.

    Returns:
        The $(D UnicodeSequence) produced by the selected decoder.
*/
UnicodeSequence decode(T)(auto ref T str, bool stripBOM = false) if (isSomeSafeString!T) {
    static if (StringCharSize!T == 1)
        return utf8.decode(str);
    else static if (StringCharSize!T == 2)
        return utf16.decode(str, stripBOM);
    else static if (StringCharSize!T == 4)
        return utf32.decode(str, stripBOM);
    else
        // The dispatch is resolved at compile time, so an unsupported
        // character width is rejected at compile time as well.
        static assert(0, "String type not supported.");
}
96 
/**
    Encodes a sequence of codepoints into a string of type $(D T),
    dispatching to the UTF-8, UTF-16 or UTF-32 encoder based on the
    character size of $(D T).

    Params:
        seq =       The codepoint sequence to encode.
        addBOM =    Passed on to the UTF-16 and UTF-32 encoders.
                    It is not forwarded to the UTF-8 encoder.

    Returns:
        The string produced by the selected encoder.
*/
T encode(T)(auto ref UnicodeSequence seq, bool addBOM = false) if (isSomeNString!T) {
    static if (StringCharSize!T == 1)
        return utf8.encode(seq);
    else static if (StringCharSize!T == 2)
        return utf16.encode(seq, addBOM);
    else static if (StringCharSize!T == 4)
        return utf32.encode(seq, addBOM);
    else
        // The dispatch is resolved at compile time, so an unsupported
        // character width is rejected at compile time as well.
        static assert(0, "String type not supported.");
}
110 
/**
    Converts the given string to a UTF-8 string.

    A new $(D nstring) is constructed even when the input is
    already UTF-8.

    Params:
        from = The string to convert.

    Returns:
        The input as a UTF-8 $(D nstring). Input of another width is
        decoded with BOM stripping requested and re-encoded without
        a BOM.
*/
auto ref toUTF8(FromT)(auto ref FromT from) if (isSomeSafeString!FromT) {
    static if (StringCharSize!FromT != 1) {

        // Different width; go through the codepoint representation.
        return encode!nstring(decode(from, true), false);
    } else {

        // Already UTF-8; construct a new string from the input.
        return nstring(from);
    }
}
122 
/**
    Converts the given string to a UTF-16 string.

    A new $(D nwstring) is constructed even when the input is
    already UTF-16.

    Params:
        from =      The string to convert.
        addBOM =    Passed on to the encoder when the input has a
                    different width. Not consulted when the input
                    is already UTF-16.

    Returns:
        The input as a UTF-16 $(D nwstring). Input of another width is
        decoded with BOM stripping requested before being re-encoded.
*/
auto ref toUTF16(FromT)(auto ref FromT from, bool addBOM = false) if (isSomeSafeString!FromT) {
    static if (StringCharSize!FromT != 2) {

        // Different width; go through the codepoint representation.
        return encode!nwstring(decode(from, true), addBOM);
    } else {

        // Already UTF-16; construct a new string from the input.
        return nwstring(from);
    }
}
134 
/**
    Converts the given string to a UTF-32 string.

    A new $(D ndstring) is constructed even when the input is
    already UTF-32.

    Params:
        from =      The string to convert.
        addBOM =    Passed on to the encoder when the input has a
                    different width. Not consulted when the input
                    is already UTF-32.

    Returns:
        The input as a UTF-32 $(D ndstring). Input of another width is
        decoded with BOM stripping requested before being re-encoded.
*/
auto ref toUTF32(FromT)(auto ref FromT from, bool addBOM = false) if (isSomeSafeString!FromT) {
    static if (StringCharSize!FromT != 4) {

        // Different width; go through the codepoint representation.
        return encode!ndstring(decode(from, true), addBOM);
    } else {

        // Already UTF-32; construct a new string from the input.
        return ndstring(from);
    }
}
146 
147 /**
148     The unicode replacement character (U+FFFD).
149 */
150 __gshared codepoint unicodeReplacementCharacter = 0xFFFD;
151 
152 /**
153     A unicode codepoint sequence
154 */
155 alias UnicodeSequence = vector!codepoint;
156 
157 /**
158     A slice of unicode codepoints
159 */
160 alias UnicodeSlice = codepoint[];
161 
/**
    A unicode grapheme
*/
struct Grapheme {

    // Internal bookkeeping; not accessible outside of this module.
    private size_t state;

    /**
        Byte offset
    */
    public size_t offset;

    /**
        Cluster of codepoints, memory belonging to the original UnicodeSequence
    */
    public codepoint[] cluster;
}
181 
182 /**
183     A sequence of graphemes
184 */
185 alias GraphemeSequence = weak_vector!Grapheme;