/**
    Unicode Parsing and Utilities.

    Copyright:
        Copyright © 2023-2025, Kitsunebi Games
        Copyright © 2023-2025, Inochi2D Project

    License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
    Authors:   Luna Nielsen
*/
module nulib.text.unicode;
import nulib.memory.endian;
import nulib.collections.vector;
import nulib.string;

public import nulib.text.unicode.utf8;
public import nulib.text.unicode.utf16;
public import nulib.text.unicode.utf32;

// For encoding dispatch
import utf8 = nulib.text.unicode.utf8;
import utf16 = nulib.text.unicode.utf16;
import utf32 = nulib.text.unicode.utf32;

@nogc:

/**
    A unicode codepoint
*/
alias codepoint = uint;

/**
    Codepoint for the unicode byte-order-mark
*/
enum codepoint UNICODE_BOM = 0xFEFF;

/**
    Validates whether the codepoint is within spec.

    Params:
        code = The codepoint to validate.

    Returns:
        $(D true) if the codepoint is no larger than 0x10FFFF and
        does not lie in the surrogate range (see $(D hasSurrogatePairs)),
        $(D false) otherwise.
*/
bool validate(codepoint code) @safe {
    return code <= 0x10FFFF && !hasSurrogatePairs(code);
}

/**
    Gets whether the codepoint mistakenly has surrogate pairs encoded within it.

    Params:
        code = The codepoint to check.

    Returns:
        $(D true) if the codepoint lies within 0xD800 .. 0xDFFF (inclusive),
        $(D false) otherwise.
*/
bool hasSurrogatePairs(codepoint code) @safe {
    return (code >= 0x0000D800 && code <= 0x0000DFFF);
}

/**
    Gets whether the character is a BOM

    Params:
        c = The value to check.

    Returns:
        $(D true) if the value matches either $(D isLittleEndianBOM)
        or $(D isBigEndianBOM), $(D false) otherwise.
*/
bool isBOM(codepoint c) @safe {
    return isLittleEndianBOM(c) || isBigEndianBOM(c);
}

/**
    Gets whether the byte order mark is little endian

    Params:
        c = The value to check.

    Returns:
        $(D true) if the value is 0xFFFE0000 or 0x0000FFFE,
        $(D false) otherwise.
*/
pragma(inline, true)
bool isLittleEndianBOM(codepoint c) @safe {
    return (c == 0xFFFE0000 || c == 0x0000FFFE);
}

/**
    Gets whether the byte order mark is big endian

    Params:
        c = The value to check.

    Returns:
        $(D true) if the value is 0xFEFF0000 or 0x0000FEFF,
        $(D false) otherwise.
*/
pragma(inline, true)
bool isBigEndianBOM(codepoint c) @safe {
    return (c == 0xFEFF0000 || c == 0x0000FEFF);
}

/**
    Gets the endianness from a BOM

    Params:
        c = The BOM value to inspect.

    Returns:
        $(D Endianess.bigEndian) if the value is a big endian BOM,
        $(D Endianess.littleEndian) for every other value.

    Note:
        The value is not checked with $(D isBOM) first; anything that is
        not a big endian BOM is reported as little endian.
*/
Endianess getEndianFromBOM(codepoint c) @safe {
    return isBigEndianBOM(c) ?
        Endianess.bigEndian :
        Endianess.littleEndian;
}

/**
    Decodes a string

    The decoder is picked at compile time from the character
    size of the string type (1 = UTF-8, 2 = UTF-16, 4 = UTF-32).

    Params:
        str =       The string to decode.
        stripBOM =  Passed on to the UTF-16 and UTF-32 decoders.

    Returns:
        The sequence returned by the selected decoder.
*/
UnicodeSequence decode(T)(auto ref T str, bool stripBOM = false) if (isSomeSafeString!T) {
    static if (StringCharSize!T == 1) {

        // NOTE(review): stripBOM is not forwarded to the UTF-8 decoder;
        // confirm whether utf8.decode deals with a leading BOM on its own.
        return utf8.decode(str);
    } else static if (StringCharSize!T == 2) {
        return utf16.decode(str, stripBOM);
    } else static if (StringCharSize!T == 4) {
        return utf32.decode(str, stripBOM);
    } else {

        // Unsupported character widths are rejected at compile time
        // rather than aborting at runtime.
        static assert(0, "String type not supported.");
    }
}

/**
    Encodes a string

    The encoder is picked at compile time from the character
    size of the requested string type (1 = UTF-8, 2 = UTF-16, 4 = UTF-32).

    Params:
        seq =       The codepoint sequence to encode.
        addBOM =    Passed on to the UTF-16 and UTF-32 encoders.

    Returns:
        The string returned by the selected encoder.
*/
T encode(T)(auto ref UnicodeSequence seq, bool addBOM = false) if (isSomeNString!T) {
    static if (StringCharSize!T == 1) {

        // NOTE(review): addBOM is not forwarded to the UTF-8 encoder;
        // presumably intentional, confirm against utf8.encode.
        return utf8.encode(seq);
    } else static if (StringCharSize!T == 2) {
        return utf16.encode(seq, addBOM);
    } else static if (StringCharSize!T == 4) {
        return utf32.encode(seq, addBOM);
    } else {

        // Unsupported character widths are rejected at compile time
        // rather than aborting at runtime.
        static assert(0, "String type not supported.");
    }
}

/**
    Converts the given string to a UTF-8 string.

    This will always create a copy.

    Params:
        from = The string to convert.

    Returns:
        A new $(D nstring); constructed directly from the input if it
        already has 1-byte characters, otherwise decoded (requesting
        BOM stripping) and re-encoded without a BOM.
*/
auto ref toUTF8(FromT)(auto ref FromT from) if (isSomeSafeString!FromT) {
    static if (StringCharSize!FromT == 1)
        return nstring(from);
    else
        return encode!nstring(decode(from, true), false);
}

/**
    Converts the given string to a UTF-16 string.

    This will always create a copy.

    Params:
        from =      The string to convert.
        addBOM =    Passed on to the encoder when the input needs to be
                    re-encoded; not used when the input already has
                    2-byte characters.

    Returns:
        A new $(D nwstring); constructed directly from the input if it
        already has 2-byte characters, otherwise decoded (requesting
        BOM stripping) and re-encoded.
*/
auto ref toUTF16(FromT)(auto ref FromT from, bool addBOM = false) if (isSomeSafeString!FromT) {
    static if (StringCharSize!FromT == 2)
        return nwstring(from);
    else
        return encode!nwstring(decode(from, true), addBOM);
}

/**
    Converts the given string to a UTF-32 string.

    This will always create a copy.

    Params:
        from =      The string to convert.
        addBOM =    Passed on to the encoder when the input needs to be
                    re-encoded; not used when the input already has
                    4-byte characters.

    Returns:
        A new $(D ndstring); constructed directly from the input if it
        already has 4-byte characters, otherwise decoded (requesting
        BOM stripping) and re-encoded.
*/
auto ref toUTF32(FromT)(auto ref FromT from, bool addBOM = false) if (isSomeSafeString!FromT) {
    static if (StringCharSize!FromT == 4)
        return ndstring(from);
    else
        return encode!ndstring(decode(from, true), addBOM);
}

/**
    The unicode replacement character (U+FFFD).

    Note:
        This is a mutable $(D __gshared) global, not a constant.
*/
__gshared codepoint unicodeReplacementCharacter = 0xFFFD;

/**
    A unicode codepoint sequence
*/
alias UnicodeSequence = vector!codepoint;

/**
    A slice of unicode codepoints
*/
alias UnicodeSlice = codepoint[];

/**
    A unicode grapheme
*/
struct Grapheme {
private:
    size_t state;

public:

    /**
        Byte offset
    */
    size_t offset;

    /**
        Cluster of codepoints, memory belonging to the original UnicodeSequence
    */
    codepoint[] cluster;
}

/**
    A sequence of graphemes
*/
alias GraphemeSequence = weak_vector!Grapheme;