nulib.text.unicode.utf16 source code

1 /**
2     UTF-16 Utilities
3     
4     Copyright:
5         Copyright © 2023-2025, Kitsunebi Games
6         Copyright © 2023-2025, Inochi2D Project
7     
8     License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
9     Authors:   Luna Nielsen
10 */
11 module nulib.text.unicode.utf16;
12 import nulib.text.unicode.utf32;
13 import nulib.text.unicode;
14 import nulib.memory.endian;
15 import nulib.string;
16 
17 @nogc:
18 
private {

    /// Mask selecting the six high bits that identify a surrogate code unit.
    enum ushort utf16_smask = 0xFC00;

    /// Mask selecting the ten low bits that carry a surrogate's payload.
    enum ushort utf16_dmask = 0x03FF;

    /// Bit pattern shared by all leading surrogates (0xD800..0xDBFF).
    enum wchar utf16_lead = 0xD800;

    /// Bit pattern shared by all trailing surrogates (0xDC00..0xDFFF).
    enum wchar utf16_trail = 0xDC00;
}
33 
/**
    Checks whether the given pair of code units begins with a
    well-formed UTF-16 sequence.

    The pair is accepted when the first unit is not a surrogate (the
    second unit is then ignored), or when the first unit is a leading
    surrogate and the second unit is a trailing surrogate.
*/
bool validate(wchar[2] c) {
    const first = c[0];

    // Any unit outside the surrogate range is complete on its own.
    if (first <= 0xD7FF || first >= 0xE000)
        return true;

    // Inside the surrogate range only a lead/trail pair is acceptable.
    return (first & utf16_smask) == utf16_lead
        && (c[1] & utf16_smask) == utf16_trail;
}
42 
/**
    Validates whether the given nwstring is a valid UTF-16 string.

    The whole string is handed to the slice overload of validate,
    which performs the actual checks.
*/
bool validate(nwstring str) {
    auto units = str[];
    return validate(units);
}
52 
53 
/**
    Validates whether the given slice of code units is a valid
    UTF-16 string.

    If the slice starts with a byte order mark that indicates a
    non-native byte order, a copy converted to machine order is
    validated instead. Without a byte order mark the data is assumed
    to already be in machine-native endianness.

    Returns:
        true if every code unit is part of a complete, well-formed
        sequence (an empty slice is valid), false otherwise.
*/
bool validate(inout(wchar)[] str) {
    nwstring tmp = str;

    // Handle endianness.
    codepoint bom = getBOM(str);
    if (bom != 0 && getEndianFromBOM(bom) != NATIVE_ENDIAN) {
        tmp = toMachineOrder(str);
    }

    size_t i = 0;
    while(i < tmp.length) {

        // A length of 0 means the unit can not start a sequence
        // (stray trailing surrogate).
        size_t clen = getLength(tmp[i]);
        if (clen == 0) return false;

        // The sequence has to fit into what is left of the string;
        // a leading surrogate in the last position is truncated.
        if (i + clen > tmp.length) return false;

        // For single-unit sequences the second slot keeps its default
        // value; the pair validator does not look at it in that case.
        wchar[2] txt;
        txt[0..clen] = tmp[i..i+clen];
        if (!validate(txt)) return false;

        i += clen;
    }

    return true;
}
86 
/**
    Gets the byte order mark of the given slice of code units,
    if it starts with one.

    Only the first code unit is inspected. It is widened to a
    codepoint and returned when isBOM accepts it.

    Returns:
        The byte order mark as found in the first code unit,
        or 0 (NUL) if the slice is empty or does not start with one.
*/
codepoint getBOM(inout(wchar)[] str) {
    if (str.length > 0) {
        codepoint first = cast(codepoint)str[0];
        if (isBOM(first))
            return first;
    }

    return 0;
}
109 
/**
    Gets the byte order mark of the nwstring if it has one.

    Forwards the string's contents to the slice overload of getBOM.

    Returns:
        The byte order mark, or 0 (NUL) if there is none.
*/
codepoint getBOM(nwstring str) {
    auto units = str[];
    return getBOM(units);
}
118 
/**
    Gets how many UTF-16 code units make up the sequence that
    starts with the given code unit.

    Returns:
        1 for a code unit outside the surrogate range,
        2 for a leading surrogate,
        0 for a trailing surrogate, which can not start a sequence.
*/
size_t getLength(wchar c) {
    const kind = c & utf16_smask;

    if (kind == utf16_lead)
        return 2;

    if (kind == utf16_trail)
        return 0;

    return 1;
}
127 
@("UTF-16 char len")
unittest {
    // Units outside the surrogate range stand alone.
    assert('a'.getLength == 1);
    assert('あ'.getLength == 1);

    // A leading surrogate announces a two-unit sequence.
    assert(utf16_lead.getLength() == 2);

    // A trailing surrogate can not start a sequence.
    assert(utf16_trail.getLength() == 0);
}
134 
/**
    Gets how many UTF-16 code units are needed to encode the
    given codepoint.

    Returns:
        1 for codepoints up to 0xFFFF outside the surrogate range,
        2 for codepoints from 0x10000 up to 0x10FFFF,
        0 if the codepoint can't be represented (surrogate range
        or above 0x10FFFF).
*/
size_t getUTF16Length(codepoint code) {

    // Beyond the last valid codepoint.
    if (code > 0x10FFFF)
        return 0;

    // Supplementary planes need a surrogate pair.
    if (code >= 0x010000)
        return 2;

    // The surrogate range itself is not encodable.
    if (code >= 0xD800 && code <= 0xDFFF)
        return 0;

    return 1;
}
145 
@("UTF-16 codepoint len")
unittest {
    assert(0xF4.getUTF16Length == 1);
    assert(0x10FFFF.getUTF16Length == 2);
    assert(0x11FFFF.getUTF16Length == 0);

    // Boundaries around the surrogate range, which is not encodable.
    assert(getUTF16Length(0xD7FF) == 1);
    assert(getUTF16Length(0xD800) == 0);
    assert(getUTF16Length(0xDFFF) == 0);
    assert(getUTF16Length(0xE000) == 1);

    // Boundary between one-unit and two-unit codepoints.
    assert(getUTF16Length(0xFFFF) == 1);
    assert(getUTF16Length(0x10000) == 2);
}
152 
/**
    Returns a string which is [str] converted to machine order.

    The source byte order is taken from the byte order mark at the
    start of [str]. If there is no byte order mark, the given
    fallback endianness is used instead.

    Returns:
        A new nwstring in machine order; an empty nwstring if
        [str] is empty.
*/
nwstring toMachineOrder(inout(wchar)[] str, Endianess fallbackEndian = NATIVE_ENDIAN) {

    if (str.length == 0)
        return nwstring.init;

    // The BOM decides the source order; without one the fallback applies.
    codepoint bom = getBOM(str);
    Endianess source = getEndianFromBOM(bom);
    if (bom == 0)
        source = fallbackEndian;

    // Already local order, nothing to convert.
    if (source == NATIVE_ENDIAN)
        return nwstring(str);

    // Convert every code unit from the source order to host order.
    nwstring converted;
    foreach(const(wchar) unit; str) {
        converted ~= unit.nu_etoh(source);
    }
    return converted;
}
181 
/**
    Returns a string which is [str] converted to machine order.

    Forwards to the slice overload with its default fallback, so a
    string without a byte order mark is assumed to already be in
    machine order.
*/
nwstring toMachineOrder(nwstring str) {
    auto units = str[];
    return toMachineOrder(units);
}
191 
/**
    Decodes a single UTF-16 character.

    The code units are assumed to be in the same
    endianness as the system!

    Params:
        chr =   The code units to decode; the second unit is only
                inspected when the first is a leading surrogate.
        read =  Receives the number of code units that were consumed.
                For malformed input this is 1, so that a caller can
                resume decoding at the following unit.

    Returns:
        The decoded codepoint, or the unicode replacement character
        if the units do not form a well-formed sequence (stray
        trailing surrogate, or leading surrogate that is not
        followed by a trailing surrogate).
*/
codepoint decode(wchar[2] chr, ref size_t read) {
    read = chr[0].getLength();

    // Single unit outside of the surrogate range.
    if (read == 1)
        return cast(codepoint)chr[0];

    // Surrogate pair: the leading unit carries the high 10 bits and
    // the trailing unit the low 10 bits of (codepoint - 0x10000).
    if (read == 2 && (chr[1] & utf16_smask) == utf16_trail) {
        const uint high = chr[0] & utf16_dmask;
        const uint low  = chr[1] & utf16_dmask;
        return cast(codepoint)(0x10000 + ((high << 10) | low));
    }

    // Malformed; consume only the offending unit.
    read = 1;
    return unicodeReplacementCharacter;
}
218 
/**
    Decodes a single UTF-16 character from a nwstring.

    The string is converted to machine order before decoding.

    Params:
        str =       The string to decode from.
        offset =    How many characters (not code units) to skip
                    before decoding. A malformed code unit counts
                    as one character while skipping.

    Returns:
        The decoded codepoint, or the unicode replacement character
        if the string is empty, the offset lies beyond the end of
        the string, or the sequence at the offset is malformed.
*/
codepoint decodeOne(nwstring str, size_t offset = 0) {
    if (str.length == 0)
        return unicodeReplacementCharacter;

    // Gets the string in the current machine order.
    str = str.toMachineOrder();

    // Walk over the characters in front of the requested one.
    size_t pos = 0;
    for (size_t skipped = 0; skipped < offset; skipped++) {

        // We're out of characters to read.
        if (pos >= str.length)
            return unicodeReplacementCharacter;

        // Only a complete surrogate pair spans two units; everything
        // else, including malformed units, advances by one.
        const bool isPair =
            getLength(str[pos]) == 2 &&
            pos + 1 < str.length &&
            (str[pos + 1] & utf16_smask) == utf16_trail;
        pos += isPair ? 2 : 1;
    }

    if (pos >= str.length)
        return unicodeReplacementCharacter;

    // Reject sequences that can not be decoded: a stray trailing
    // surrogate, or a leading surrogate without its trailing partner.
    size_t read = getLength(str[pos]);
    if (read == 0)
        return unicodeReplacementCharacter;
    if (read == 2 && (pos + 1 >= str.length || (str[pos + 1] & utf16_smask) != utf16_trail))
        return unicodeReplacementCharacter;

    // Hand the units to the single character decoder
    // to avoid duplication of effort.
    wchar[2] tmp;
    tmp[0..read] = str[pos..pos+read];
    return decode(tmp, read);
}
248 
/**
    Decodes a UTF-16 string into a sequence of codepoints.

    The input is first converted to machine order, which detects a
    leading byte order mark and handles endianness where applicable.
    Code units which can not start a sequence, and sequences cut off
    by the end of the string, are decoded as the unicode replacement
    character.

    Params:
        str =       The code units to decode.
        stripBOM =  Whether a leading byte order mark should be
                    left out of the result.
*/
UnicodeSequence decode(inout(wchar)[] str, bool stripBOM = false) {
    UnicodeSequence result;

    // Work on a copy that is in the current machine order.
    nwstring native = str.toMachineOrder();

    // Start after the BOM if it should be stripped and there is one.
    size_t pos = (stripBOM && getBOM(native)) ? 1 : 0;

    while(pos < native.length) {
        size_t unitCount = native[pos].getLength();

        // Invalid or truncated sequence; emit FFFD and move on one unit.
        const bool malformed = unitCount == 0 || pos + unitCount > native.length;
        if (malformed) {
            result ~= unicodeReplacementCharacter;
            pos++;
            continue;
        }

        wchar[2] buffer;
        buffer[0..unitCount] = native[pos..pos+unitCount];

        // The decoder reports how many units it consumed through unitCount.
        result ~= buffer.decode(unitCount);
        pos += unitCount;
    }

    return result;
}
285 
/**
    Decodes a UTF-16 string.

    Forwards the string's contents to the slice overload of decode,
    which detects byte order marks and handles endianness where
    applicable.
*/
UnicodeSequence decode(nwstring str, bool stripBOM = false) {
    auto units = str[];
    return decode(units, stripBOM);
}
295 
@("UTF-16 decode string")
unittest {
    // Codepoints of the plain greeting.
    codepoint[8] expectedPlain = [0x3053, 0x3093, 0x306b, 0x3061, 0x306f, 0x4e16, 0x754c, 0xff01];

    // Same greeting with the second character replaced by U+FFFD.
    codepoint[8] expectedReplaced = expectedPlain;
    expectedReplaced[1] = unicodeReplacementCharacter;

    auto plain = decode(nwstring("こんにちは世界！"w));
    assert(plain[0..$] == expectedPlain);

    auto replaced = decode(nwstring("こ\uFFFDにちは世界！"w));
    assert(replaced[0..$] == expectedReplaced);
}
303 
/**
    Encodes a unicode sequence to UTF-16.

    Codepoints which can not be represented in UTF-16 (surrogate
    range, or above 0x10FFFF) are skipped.

    Params:
        slice =     The codepoints to encode.
        addBOM =    Whether to prepend a byte order mark. No mark is
                    added if the input is empty or already starts
                    with one.

    Returns:
        The encoded string.
*/
nwstring encode(UnicodeSlice slice, bool addBOM = false) {
    nwstring out_;

    // Add BOM if requested.
    if (addBOM && slice.length > 0 && slice[0] != UNICODE_BOM) {
        out_ ~= cast(wchar)UNICODE_BOM;
    }

    for (size_t i = 0; i < slice.length; i++) {
        const size_t clen = slice[i].getUTF16Length();

        if (clen == 1) {

            // Fits into a single code unit as-is.
            out_ ~= cast(wchar)slice[i];
        } else if (clen == 2) {

            // Surrogate pair: the leading unit carries the high 10 bits
            // and the trailing unit the low 10 bits of (codepoint - 0x10000).
            const codepoint c = slice[i] - 0x10000;
            out_ ~= cast(wchar)(utf16_lead | (c >> 10));
            out_ ~= cast(wchar)(utf16_trail | (c & utf16_dmask));
        }

        // clen == 0: not representable, skip the codepoint.
    }

    return out_;
}
339 
/**
    Encodes a series of unicode codepoints to UTF-16.

    Forwards the full contents of the sequence to the
    slice overload of encode.
*/
nwstring encode(ref UnicodeSequence sequence, bool addBOM = false) {
    UnicodeSlice everything = sequence[0..$];
    return encode(everything, addBOM);
}
346 
@("UTF-16 encode")
unittest {
    // Codepoints of the plain greeting.
    codepoint[8] plain = [0x3053, 0x3093, 0x306b, 0x3061, 0x306f, 0x4e16, 0x754c, 0xff01];

    // Same greeting with the second character replaced by U+FFFD.
    codepoint[8] replaced = plain;
    replaced[1] = unicodeReplacementCharacter;

    assert(encode(plain) == "こんにちは世界！"w);
    assert(encode(replaced) == "こ\uFFFDにちは世界！"w);
}