/**
    UTF-8 Utilities

    Copyright:
        Copyright © 2023-2025, Kitsunebi Games
        Copyright © 2023-2025, Inochi2D Project

    License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
    Authors:   Luna Nielsen
*/
module nulib.text.unicode.utf8;
import nulib.text.unicode;
import nulib.text.unicode : validate;
import nulib.string;

@nogc:

private {

    // Highest ascii value in UTF8
    enum utf8_ascii = 0x7F;

    // Data mask: the low (8 - offset) bits of a byte.
    enum ubyte utf8_datamask(uint offset) = 0xFF >> offset;

    // Lead mask: the high `offset` bits of a byte.
    enum ubyte utf8_leadmask(uint offset) = cast(ubyte)(~utf8_datamask!offset);

    // Lookup table containing the lead byte bit patterns for each
    // utf8 codepoint size.
    const ubyte[4] utf8_leadmasks = [
        utf8_leadmask!0, // Lead byte (1 byte),  0xxxxxxx
        utf8_leadmask!2, // Lead byte (2 bytes), 110xxxxx
        utf8_leadmask!3, // Lead byte (3 bytes), 1110xxxx
        utf8_leadmask!4, // Lead byte (4 bytes), 11110xxx
    ];

    // UTF-8 Well-Formed Byte Sequence Table
    // A translation of Table 3-7 in the unicode conformance documents.
    // Indexed as [row][byte index][0 = lowest, 1 = highest allowed value].
    const ubyte[2][4][9] utf8_wfbseqtable = [
        [[0x00, 0x7F], [0x00, 0xFF], [0x00, 0xFF], [0x00, 0xFF]],
        [[0xC2, 0xDF], [0x80, 0xBF], [0x00, 0xFF], [0x00, 0xFF]],
        [[0xE0, 0xE0], [0xA0, 0xBF], [0x80, 0xBF], [0x00, 0xFF]],
        [[0xE1, 0xEC], [0x80, 0xBF], [0x80, 0xBF], [0x00, 0xFF]],
        [[0xED, 0xED], [0x80, 0x9F], [0x80, 0xBF], [0x00, 0xFF]],
        [[0xEE, 0xEF], [0x80, 0xBF], [0x80, 0xBF], [0x00, 0xFF]],
        [[0xF0, 0xF0], [0x90, 0xBF], [0x80, 0xBF], [0x80, 0xBF]],
        [[0xF1, 0xF3], [0x80, 0xBF], [0x80, 0xBF], [0x80, 0xBF]],
        [[0xF4, 0xF4], [0x80, 0x8F], [0x80, 0xBF], [0x80, 0xBF]],
    ];

    // Returns whether the first `len` bytes of `seq` all fall within the
    // ranges listed in row `row` of the well-formed byte sequence table.
    bool utf8_matchesRow(const(char)[4] seq, size_t row, size_t len) {
        foreach(i; 0..len) {
            if (seq[i] < utf8_wfbseqtable[row][i][0] || seq[i] > utf8_wfbseqtable[row][i][1])
                return false;
        }
        return true;
    }
}

/**
    Validates a utf-8 character sequence.

    Only the bytes covered by the length implied by the lead byte
    are inspected; trailing bytes of the 4 byte buffer are ignored.

    Params:
        seq = Buffer holding a single UTF-8 encoded character.

    Returns:
        $(D true) if the sequence is well-formed, $(D false) otherwise.
*/
bool validate(const(char)[4] seq) {

    // The lead byte decides which rows of the table may apply.
    switch(getLength(seq[0])) {
        case 1:
            return utf8_matchesRow(seq, 0, 1);

        case 2:
            return utf8_matchesRow(seq, 1, 2);

        case 3:
            foreach(row; 2..6) {
                if (utf8_matchesRow(seq, row, 3))
                    return true;
            }
            return false;

        case 4:
            foreach(row; 6..9) {
                if (utf8_matchesRow(seq, row, 4))
                    return true;
            }
            return false;

        default:
            // Malformed lead byte.
            return false;
    }
}

@("validate: UTF-8 byte sequence")
unittest {

    assert( validate([0x24, 0x00, 0x00, 0x00]));
    assert( validate([0xF4, 0x80, 0x83, 0x92]));

    assert(!validate([0xC0, 0xAF, 0x00, 0x00]));
    assert(!validate([0xE0, 0x9F, 0x80, 0x00]));
    assert(!validate([0xFF, 0x80, 0x80, 0x80]));
}

/**
    Returns whether the given nstring is a valid UTF-8 string

    Params:
        str = The string to validate.

    Returns:
        $(D true) if every character in the string is a well-formed
        UTF-8 sequence (an empty string is valid), $(D false) otherwise.
*/
bool validate(inout(char)[] str) {
    size_t i = 0;
    while(i < str.length) {

        // Validate length
        size_t clen = getLength(str[i]);
        if (clen == 0) return false;

        // Sequence is cut off by the end of the string.
        // (i < str.length holds here, so the subtraction can't wrap.)
        if (clen > str.length - i) return false;

        // Validate sequence
        char[4] txt;
        txt[0..clen] = str[i..i+clen];
        if (!validate(txt)) return false;

        // iteration
        i += clen;
    }

    return true;
}

@("validate: UTF-8 string")
unittest {

    assert( validate(nstring("Hello, world!")));
    assert( validate(nstring("こんにちは世界!")));
    assert( validate(nstring("a"))); // Single byte string

    // Invalid sequence test
    const char[3] seq1 = [0xC1, 0xBF, 0xCC];
    const char[4] seq2 = [0xF4, 0x9F, 0xBF, 0xBF];
    const char[2] seq3 = [0xF4, 0x80];
    const char[4] seq4 = [0x61, 0x62, 0xF4, 0x80];
    assert(!validate(nstring(seq1[0..$])));
    assert(!validate(nstring(seq2[0..$])));
    assert(!validate(nstring(seq3[0..$]))); // Sequence is cut off
    assert(!validate(nstring(seq4[0..$]))); // Sequence is cut off after valid text
}

/**
    Gets the expected byte-size of the specified character

    Params:
        c = The leading byte of a UTF-8 sequence.

    Returns:
        The length in bytes (1 to 4) implied by the bit pattern of
        the leading byte, 0 on malformed leading byte.
        Continuation bytes (10xxxxxx) and bytes 0xF8 and up are
        considered malformed.
*/
size_t getLength(char c) {

    // Each check masks off the marker bits of the lead byte and compares
    // them against the pattern for that length.
    if ((c & utf8_leadmask!1) == utf8_leadmasks[0]) return 1; // 0xxxxxxx
    if ((c & utf8_leadmask!3) == utf8_leadmasks[1]) return 2; // 110xxxxx
    if ((c & utf8_leadmask!4) == utf8_leadmasks[2]) return 3; // 1110xxxx
    if ((c & utf8_leadmask!5) == utf8_leadmasks[3]) return 4; // 11110xxx

    // Malformed leading byte
    return 0;
}

@("getLength: get UTF-8 character length")
unittest {
    assert('a'.getLength == 1);
    assert((0b11110000).getLength == 4);
    assert((0xC0).getLength() == 2);
    assert((0b10010101).getLength() == 0); // Malformed leading byte
    assert((0xF8).getLength() == 0); // Malformed leading byte
    assert((0xFF).getLength() == 0); // Malformed leading byte
}

/**
    Gets how many utf-8 units are in the specified codepoint

    Params:
        code = The codepoint to measure.

    Returns:
        The amount of bytes (1 to 4) needed to encode the codepoint,
        0 if the codepoint can't be represented.
*/
size_t getUTF8Length(codepoint code) {
    if (code <= 0x7F) return 1;
    else if (code <= 0x07FF) return 2;
    else if (code <= 0xFFFF) return 3;
    else if (code <= 0x10FFFF) return 4;
    return 0;
}

@("getUTF8Length: calculate codepoint length")
unittest {
    assert(0x1.getUTF8Length == 1);
    assert(0xF4.getUTF8Length == 2);
    assert(0x0801.getUTF8Length == 3);
    assert(0x010001.getUTF8Length == 4);
    assert(0x11FFFF.getUTF8Length == 0);
}


/**
    Decodes a UTF-8 character

    Params:
        utf  = Buffer holding the UTF-8 sequence to decode.
        read = Receives the amount of bytes which were consumed;
               always at least 1. When an invalid continuation byte
               is met, this is the amount of bytes before that byte.

    Returns:
        The decoded codepoint, unicodeReplacementCharacter if the
        sequence is malformed.
*/
codepoint decode(const(char)[4] utf, ref size_t read) {
    codepoint code = 0x00;
    size_t needed = 0;

    // Allowed range of the next continuation byte.
    ubyte lower = 0x80;
    ubyte upper = 0xBF;

    // Unless the whole sequence decodes, only the lead byte is consumed.
    read = 1;

    switch(getLength(utf[0])) {
        case 1:

            // ASCII
            return utf[0];

        case 2:

            // 2 byte code, 0xC0 and 0xC1 would only encode overlong forms.
            if (utf[0] < 0xC2)
                return unicodeReplacementCharacter;

            needed = 1;
            code = utf[0] & 0x1F;
            break;

        case 3:

            // 3 byte code
            if (utf[0] == 0xE0) lower = 0xA0; // Excludes overlong forms
            if (utf[0] == 0xED) upper = 0x9F; // Excludes surrogates
            needed = 2;
            code = utf[0] & 0xF;
            break;

        case 4:

            // 4 byte code, 0xF5 and up would exceed U+10FFFF.
            if (utf[0] > 0xF4)
                return unicodeReplacementCharacter;

            if (utf[0] == 0xF0) lower = 0x90; // Excludes overlong forms
            if (utf[0] == 0xF4) upper = 0x8F; // Excludes values past U+10FFFF
            needed = 3;
            code = utf[0] & 0x7;
            break;

        default:

            // Replacement character \uFFFD
            return unicodeReplacementCharacter;
    }

    // Decoding
    foreach(i; 1..needed+1) {

        // Invalid character!
        if (utf[i] < lower || utf[i] > upper) {
            read = i;
            return unicodeReplacementCharacter;
        }

        // The narrowed range only applies to the first continuation byte.
        lower = 0x80;
        upper = 0xBF;

        code = (code << 6) | (utf[i] & 0x3F);
    }

    // Return how many bytes are read
    read = needed+1;
    return code;
}


/**
    Decodes the specified UTF-8 character

    Returns unicodeReplacementCharacter if character is a malformed UTF-8 sequence
*/
codepoint decode(const(char)[4] utf) {
    size_t throwaway;
    return decode(utf, throwaway);
}

@("decode: UTF-8 char")
unittest {
    assert(decode(['a', 0x00, 0x00, 0x00]) == cast(uint)'a');
    assert(decode([0xEB, 0x9D, 0xB7, 0x00]) == 0xB777);
    assert(decode([0xFF, 0xFF, 0xFF, 0xFF]) == unicodeReplacementCharacter);

    // 4 byte sequences with low continuation bytes.
    assert(decode([0xF0, 0x9F, 0x98, 0x80]) == 0x1F600);
    assert(decode([0xF0, 0x90, 0x80, 0x80]) == 0x10000);

    // Overlong forms, surrogates and out of range values.
    assert(decode([0xC0, 0xAF, 0x00, 0x00]) == unicodeReplacementCharacter);
    assert(decode([0xE0, 0x80, 0x80, 0x00]) == unicodeReplacementCharacter);
    assert(decode([0xED, 0xA0, 0x80, 0x00]) == unicodeReplacementCharacter);
    assert(decode([0xF5, 0x80, 0x80, 0x80]) == unicodeReplacementCharacter);

    // Amount of bytes read.
    const char[4] ascii = [0x61, 0x00, 0x00, 0x00];
    const char[4] three = [0xEB, 0x9D, 0xB7, 0x00];
    size_t read = 0;
    assert(decode(ascii, read) == 0x61);
    assert(read == 1);
    assert(decode(three, read) == 0xB777);
    assert(read == 3);
}

/**
    Decodes a string to a vector of codepoints.
    Invalid codes will be replaced with unicodeReplacementCharacter

    Params:
        str = The UTF-8 string to decode.

    Returns:
        The sequence of decoded codepoints.
*/
UnicodeSequence decode(inout(char)[] str) {
    UnicodeSequence code;

    size_t i = 0;
    while(i < str.length) {
        char[4] txt;

        // Validate length, add FFFD if invalid.
        size_t clen = str[i].getLength();
        if (clen == 0 || clen > str.length - i) {
            code ~= unicodeReplacementCharacter;
            i++;
            continue;
        }

        // clen is updated to the amount of bytes actually consumed.
        txt[0..clen] = str[i..i+clen];
        code ~= txt.decode(clen);
        i += clen;
    }

    return code;
}

@("decode: UTF-8 string")
unittest {
    assert(decode(nstring("Hello, world!"))[0..$] == [72, 101, 108, 108, 111, 44, 32, 119, 111, 114, 108, 100, 33]);
    assert(decode(nstring("こんにちは世界!"))[0..$] == [0x3053, 0x3093, 0x306b, 0x3061, 0x306f, 0x4e16, 0x754c, 0xff01]);

    assert(decode(nstring("こ\xF0\xA4\xADにちは世界!"))[0..$] == [0x3053, unicodeReplacementCharacter, 0x306b, 0x3061, 0x306f, 0x4e16, 0x754c, 0xff01]);
    assert(decode(nstring("\U0001F600"))[0..$] == [0x1F600]);
}

/**
    Encodes a series of unicode codepoints to UTF-8

    Codepoints which fail validation, or which lie beyond U+10FFFF,
    are skipped.

    Params:
        slice = The codepoints to encode.

    Returns:
        The UTF-8 encoded string.
*/
nstring encode(UnicodeSlice slice) {
    nstring out_;

    for (size_t i = 0; i < slice.length; i++) {
        codepoint c = slice[i];

        // Skip invalid codepoints.
        if (!c.validate())
            continue;

        if (c <= utf8_ascii) {

            // Single-byte ascii
            out_ ~= cast(char)c;
            continue;
        }

        // Amount of continuation bytes, and marker bits of the lead byte.
        ptrdiff_t count = 0;
        ptrdiff_t offset = 0;
        if (c <= 0x07FF) {

            // 2 byte
            count = 1;
            offset = 0xC0;
        } else if (c <= 0xFFFF) {

            // 3 byte
            count = 2;
            offset = 0xE0;
        } else if (c <= 0x10FFFF) {

            // 4 byte
            count = 3;
            offset = 0xF0;
        } else {

            // Can't be represented in UTF-8.
            continue;
        }

        // The magic where things get stitched back together.
        // The lead byte gets the topmost bits, every continuation
        // byte gets the next 6 bits below that.
        char[4] bytes;
        bytes[0] = cast(char)((c >> (6 * count)) + offset);
        size_t ix = 1;
        while (count > 0) {
            count--;
            bytes[ix++] = cast(char)(0x80 | ((c >> (6 * count)) & 0x3F));
        }

        out_ ~= bytes[0..ix];
    }

    return out_;
}

/**
    Encodes a series of unicode codepoints to UTF-8
*/
nstring encode(ref UnicodeSequence sequence) {
    return encode(sequence[0..$]);
}

@("encode: UTF-8")
unittest {
    codepoint[8] seq1 = [0x3053, 0x3093, 0x306b, 0x3061, 0x306f, 0x4e16, 0x754c, 0xff01];
    codepoint[8] seq2 = [0x3053, unicodeReplacementCharacter, 0x306b, 0x3061, 0x306f, 0x4e16, 0x754c, 0xff01];
    codepoint[2] seq3 = [0xE9, 0x1F600];
    assert(encode(seq1) == "こんにちは世界!");
    assert(encode(seq2) == "こ\uFFFDにちは世界!");
    assert(encode(seq3) == "\u00E9\U0001F600");
}