/**
    UTF-16 Utilities

    Copyright:
        Copyright © 2023-2025, Kitsunebi Games
        Copyright © 2023-2025, Inochi2D Project

    License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
    Authors:   Luna Nielsen
*/
module nulib.text.unicode.utf16;
import nulib.text.unicode.utf32;
import nulib.text.unicode;
import nulib.memory.endian;
import nulib.string;

@nogc:

private {

    // Surrogate mask; the top 6 bits identify which kind of surrogate a unit is.
    enum ushort utf16_smask = 0b11111100_00000000;

    // Data mask; the low 10 bits carry the payload of a surrogate.
    enum ushort utf16_dmask = cast(ushort)(~utf16_smask);

    /// Leading surrogate
    enum wchar utf16_lead = 0b11011000_00000000;

    /// Trailing surrogate
    enum wchar utf16_trail = 0b11011100_00000000;
}

/**
    Validates whether the given character is a valid UTF-16 sequence.

    Params:
        c = The code units of the character; the 2nd unit is only
            inspected when the 1st unit is a leading surrogate.

    Returns:
        $(D true) if $(D c[0]) is a non-surrogate unit, or if $(D c)
        is a leading surrogate followed by a trailing surrogate,
        $(D false) otherwise.
*/
bool validate(wchar[2] c) {

    // Anything outside of the surrogate range is complete by itself.
    if (c[0] <= 0xD7FF || c[0] >= 0xE000)
        return true;

    return
        (c[0] & utf16_smask) == utf16_lead &&
        (c[1] & utf16_smask) == utf16_trail;
}

/**
    Validates whether the given nwstring is a valid UTF-16 string.

    A leading BOM is used to detect the endianness of the string,
    without a BOM the string is assumed to be in machine-native
    endianness.

    Params:
        str = The string to validate.

    Returns:
        $(D true) if every character in the string is well-formed,
        $(D false) otherwise.
*/
bool validate(nwstring str) {
    return validate(str[]);
}


/**
    Validates whether the given slice is a valid UTF-16 string.

    A leading BOM is used to detect the endianness of the string,
    without a BOM the string is assumed to be in machine-native
    endianness.

    Params:
        str = The UTF-16 code units to validate.

    Returns:
        $(D true) if every character in the slice is well-formed,
        $(D false) otherwise. An empty slice is considered valid.
*/
bool validate(inout(wchar)[] str) {
    nwstring tmp = str;

    // Handle endianness.
    codepoint bom = getBOM(str);
    if (bom != 0 && getEndianFromBOM(bom) != NATIVE_ENDIAN) {
        tmp = toMachineOrder(str);
    }

    size_t i = 0;
    while(i < tmp.length) {
        wchar[2] txt;

        // A length of 0 means a stray trailing surrogate; a character
        // extending past the end of the string is a truncated pair.
        size_t clen = getLength(tmp[i]);
        if (clen == 0 || i+clen > tmp.length)
            return false;

        txt[0..clen] = tmp[i..i+clen];
        if (!validate(txt))
            return false;

        i += clen;
    }

    return true;
}

@("UTF-16 validate")
unittest {
    wchar[3] good = ['a', 0xD83D, 0xDE00];
    wchar[3] truncated = ['a', 'b', 0xD83D];
    wchar[3] stray = ['a', 0xDE00, 'b'];

    assert(validate(nwstring("a"w)));
    assert(validate(good[]));
    assert(!validate(truncated[]));
    assert(!validate(stray[]));
}

/**
    Gets the BOM of the string if it has one.

    Params:
        str = The UTF-16 code units to inspect.

    Returns:
        The first code unit of the string if it is a BOM,
        otherwise a NUL character.
*/
codepoint getBOM(inout(wchar)[] str) {
    if (str.length == 0)
        return 0;

    codepoint first = cast(codepoint)str[0];
    if (isBOM(first)) {
        return first;
    }

    return 0;
}

/**
    Gets the BOM of the nwstring if it has one.

    Params:
        str = The string to inspect.

    Returns:
        The first code unit of the string if it is a BOM,
        otherwise a NUL character.
*/
codepoint getBOM(nwstring str) {
    return getBOM(str[]);
}

/**
    Gets how many utf-16 units are in the specified character

    Params:
        c = The first code unit of the character.

    Returns:
        1 for a non-surrogate unit, 2 for a leading surrogate and
        0 for a trailing surrogate, which can't start a character.
*/
size_t getLength(wchar c) {
    if (c <= 0xD7FF || c >= 0xE000)
        return 1;

    if ((c & utf16_smask) == utf16_lead)
        return 2;

    return 0;
}

@("UTF-16 char len")
unittest {
    assert('a'.getLength == 1);
    assert('あ'.getLength == 1);
    assert(utf16_lead.getLength() == 2);
    assert(utf16_trail.getLength() == 0); // A trailing surrogate can't start a character
}

/**
    Gets how many utf-16 units are in the specified codepoint

    Params:
        code = The codepoint to measure.

    Returns:
        The amount of UTF-16 code units needed to encode the codepoint,
        0 if the codepoint can't be represented.
*/
size_t getUTF16Length(codepoint code) {
    if (code <= 0xD7FF || (code >= 0xE000 && code <= 0xFFFF))
        return 1;

    if (code >= 0x010000 && code <= 0x10FFFF)
        return 2;

    return 0;
}

@("UTF-16 codepoint len")
unittest {
    assert(0xF4.getUTF16Length == 1);
    assert(0x10FFFF.getUTF16Length == 2);
    assert(0x11FFFF.getUTF16Length == 0);
}

/**
    Returns a string which is [str] converted to machine order.

    If the string has no BOM the specified fallback endian will be used.

    Params:
        str =            The UTF-16 code units to convert.
        fallbackEndian = The endianness to assume if there's no BOM.

    Returns:
        A new nwstring in machine order; empty if $(D str) is empty.
*/
nwstring toMachineOrder(inout(wchar)[] str, Endianess fallbackEndian = NATIVE_ENDIAN) {

    if (str.length == 0)
        return nwstring.init;

    // Only ask for the endianness of the BOM if there is one.
    codepoint bom = getBOM(str);
    Endianess endian = bom == 0 ? fallbackEndian : getEndianFromBOM(bom);

    if (endian != NATIVE_ENDIAN) {

        // Flip all the bytes around.
        nwstring tmp;
        foreach(ref const(wchar) c; str) {
            tmp ~= c.nu_etoh(endian);
        }
        return tmp;
    }

    // Already local order.
    return nwstring(str);
}

/**
    Returns a string which is [str] converted to machine order.

    If the string has no BOM it is assumed it's already in
    machine order.

    Params:
        str = The string to convert.

    Returns:
        A new nwstring in machine order.
*/
nwstring toMachineOrder(nwstring str) {
    return toMachineOrder(str[]);
}

/**
    Decodes a single utf-16 character,

    Character is assumed to be in the same
    endianness as the system!

    Params:
        chr =  The code units of the character; the 2nd unit is only
               inspected when the 1st unit is a leading surrogate.
        read = Receives the amount of code units which were consumed,
               this is always 1 or 2.

    Returns:
        The decoded codepoint; the unicode replacement character
        if the code units are malformed, in which case a single
        code unit is consumed.
*/
codepoint decode(wchar[2] chr, ref size_t read) {
    read = chr[0].getLength();

    // Basic Multilingual Plane; the unit is the codepoint.
    if (read == 1)
        return cast(codepoint)chr[0];

    // Stray trailing surrogate, or a leading surrogate that isn't
    // followed by a trailing one. Only the offending unit is consumed
    // so that the following unit gets decoded by itself.
    if (read != 2 || (chr[1] & utf16_smask) != utf16_trail) {
        read = 1;
        return unicodeReplacementCharacter;
    }

    // The leading surrogate carries the high 10 bits, the trailing
    // surrogate the low 10 bits of (codepoint - 0x10000).
    return cast(codepoint)(
        (((chr[0] & utf16_dmask) << 10) | (chr[1] & utf16_dmask)) + 0x10000
    );
}

@("UTF-16 decode char")
unittest {
    size_t read;

    wchar[2] bmp = ['a', 'b'];
    assert(decode(bmp, read) == 'a');
    assert(read == 1);

    wchar[2] pair = [0xD83D, 0xDE00];
    assert(decode(pair, read) == 0x1F600);
    assert(read == 2);

    wchar[2] unpaired = [0xD83D, 'a'];
    assert(decode(unpaired, read) == unicodeReplacementCharacter);
    assert(read == 1);

    wchar[2] stray = [0xDE00, 'a'];
    assert(decode(stray, read) == unicodeReplacementCharacter);
    assert(read == 1);
}

/**
    Decodes a single utf-16 character from a
    nwstring.

    Params:
        str =    The string to decode from.
        offset = How many characters (not code units) to skip
                 before decoding; a malformed code unit counts
                 as a single character.

    Returns:
        The decoded codepoint; the unicode replacement character
        if the string is empty, the offset is past the end of the
        string or the character is malformed.
*/
codepoint decodeOne(nwstring str, size_t offset = 0) {
    if (str.length == 0)
        return unicodeReplacementCharacter;

    // Gets the string in the current machine order.
    str = str.toMachineOrder();

    size_t pos = 0;
    for (size_t index = 0; pos < str.length; index++) {
        codepoint code = unicodeReplacementCharacter;

        size_t clen = getLength(str[pos]);
        if (clen != 0 && pos+clen <= str.length) {
            wchar[2] tmp;
            tmp[0..clen] = str[pos..pos+clen];
            code = decode(tmp, clen);
        } else {

            // Stray or truncated surrogate, step over
            // the offending unit only.
            clen = 1;
        }

        if (index == offset)
            return code;

        pos += clen;
    }

    // We're out of characters to read.
    return unicodeReplacementCharacter;
}

@("UTF-16 decode one")
unittest {
    wchar[3] units = ['a', 0xD83D, 0xDE00];
    assert(decodeOne(nwstring(units[])) == 'a');
    assert(decodeOne(nwstring(units[]), 1) == 0x1F600);
    assert(decodeOne(nwstring(units[]), 2) == unicodeReplacementCharacter);
}

/**
    Decodes a UTF-16 string.

    This function will automatically detect BOMs
    and handle endianness where applicable.

    Params:
        str =      The UTF-16 code units to decode.
        stripBOM = Whether to leave a leading BOM out of the result.

    Returns:
        The decoded codepoints; every malformed code unit is
        decoded to the unicode replacement character.
*/
UnicodeSequence decode(inout(wchar)[] str, bool stripBOM = false) {
    UnicodeSequence code;

    // Gets the string in the current machine order.
    nwstring tmp = str.toMachineOrder();
    size_t i = 0;

    // Strip BOM if there is one.
    if (stripBOM && getBOM(tmp) != 0) {
        i++;
    }

    while(i < tmp.length) {
        wchar[2] txt;

        // Validate length, add FFFD if invalid.
        size_t clen = tmp[i].getLength();
        if (clen == 0 || i+clen > tmp.length) {
            code ~= unicodeReplacementCharacter;
            i++;
            continue;
        }

        // NOTE: decode updates clen to the amount of
        // units which it actually consumed.
        txt[0..clen] = tmp[i..i+clen];
        code ~= txt.decode(clen);
        i += clen;
    }

    return code;
}

/**
    Decodes a UTF-16 string.

    This function will automatically detect BOMs
    and handle endianness where applicable.

    Params:
        str =      The string to decode.
        stripBOM = Whether to leave a leading BOM out of the result.

    Returns:
        The decoded codepoints; every malformed code unit is
        decoded to the unicode replacement character.
*/
UnicodeSequence decode(nwstring str, bool stripBOM = false) {
    return decode(str[], stripBOM);
}

@("UTF-16 decode string")
unittest {
    codepoint[8] seq1 = [0x3053, 0x3093, 0x306b, 0x3061, 0x306f, 0x4e16, 0x754c, 0xff01];
    codepoint[8] seq2 = [0x3053, unicodeReplacementCharacter, 0x306b, 0x3061, 0x306f, 0x4e16, 0x754c, 0xff01];
    codepoint[2] seq3 = [0x1F600, 0x61];
    assert(decode(nwstring("こんにちは世界\uFF01"w))[0..$] == seq1);
    assert(decode(nwstring("こ\uFFFDにちは世界\uFF01"w))[0..$] == seq2);
    assert(decode(nwstring("\U0001F600a"w))[0..$] == seq3);
}

/**
    Encodes a unicode sequence to UTF-16

    Params:
        slice =  The codepoints to encode.
        addBOM = Whether to prepend a BOM, if the slice
                 doesn't already begin with one.

    Returns:
        The encoded string, codepoints which can't be represented
        in UTF-16 are left out of it.
*/
nwstring encode(UnicodeSlice slice, bool addBOM = false) {
    nwstring out_;

    // Add BOM if requested.
    if (addBOM && slice.length > 0 && slice[0] != UNICODE_BOM) {
        out_ ~= cast(wchar)UNICODE_BOM;
    }

    for (size_t i = 0; i < slice.length; i++) {
        size_t clen = slice[i].getUTF16Length();

        if (clen == 1) {
            out_ ~= cast(wchar)slice[i];
        } else if (clen == 2) {
            wchar[2] txt;

            // The leading surrogate carries the high 10 bits, the trailing
            // surrogate the low 10 bits of (codepoint - 0x10000).
            codepoint c = slice[i] - 0x10000;
            txt[0] = cast(wchar)((c >> 10) + 0xD800);
            txt[1] = cast(wchar)((c & utf16_dmask) + 0xDC00);
            out_ ~= cast(wstring)txt[0..$];
        }

        // NOTE: a length of 0 means that the codepoint
        // can't be encoded, so it's skipped.
    }

    return out_;
}

/**
    Encodes a series of unicode codepoints to UTF-16

    Params:
        sequence = The codepoints to encode.
        addBOM =   Whether to prepend a BOM, if the sequence
                   doesn't already begin with one.

    Returns:
        The encoded string, codepoints which can't be represented
        in UTF-16 are left out of it.
*/
nwstring encode(ref UnicodeSequence sequence, bool addBOM = false) {
    return encode(sequence[0..$], addBOM);
}

@("UTF-16 encode")
unittest {
    codepoint[8] seq1 = [0x3053, 0x3093, 0x306b, 0x3061, 0x306f, 0x4e16, 0x754c, 0xff01];
    codepoint[8] seq2 = [0x3053, unicodeReplacementCharacter, 0x306b, 0x3061, 0x306f, 0x4e16, 0x754c, 0xff01];
    codepoint[2] seq3 = [0x1F600, 0x61];
    assert(encode(seq1) == "こんにちは世界\uFF01"w);
    assert(encode(seq2) == "こ\uFFFDにちは世界\uFF01"w);
    assert(encode(seq3) == "\U0001F600a"w);
}