/**
    UTF-32 Utilities

    Copyright:
        Copyright © 2023-2025, Kitsunebi Games
        Copyright © 2023-2025, Inochi2D Project

    License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
    Authors:   Luna Nielsen
*/
module nulib.text.unicode.utf32;
import nulib.text.unicode;
import nulib.memory.endian;
import nulib.string;

@nogc:

/**
    Validates a UTF-32 codepoint.

    Params:
        c = The codepoint to validate.

    Returns:
        Whatever the generic unicode `validate` reports for $(D c).
*/
bool validate(dchar c) {

    // Name conflict with this module's own `validate` overloads,
    // so the generic unicode module is imported locally under an alias.
    import uni = nulib.text.unicode;
    return uni.validate(c);
}

/**
    Validates a UTF-32 string.

    Params:
        str = The string to validate.

    Returns:
        $(D true) if every codepoint of the string validates,
        $(D false) otherwise.
*/
bool validate(ndstring str) {
    return validate(str[]);
}

/**
    Validates a UTF-32 string.

    If the string begins with a BOM that indicates a non-native byte order,
    the string is validated after conversion to machine order.

    Params:
        str = The string to validate.

    Returns:
        $(D true) if every codepoint of the string validates,
        $(D false) otherwise. An empty string is considered valid.
*/
bool validate(inout(dchar)[] str) {

    // Handle endianness; toMachineOrder only swaps bytes when the string
    // carries a BOM for the non-native byte order, otherwise it yields
    // a plain copy.
    ndstring tmp = toMachineOrder(str);

    foreach(dchar c; tmp) {
        if (!validate(c))
            return false;
    }

    return true;
}

/**
    Gets the BOM of a UTF-32 string.

    Params:
        str = The string to inspect.

    Returns:
        The first element of $(D str) if `isBOM` accepts it,
        $(D 0) if the string is empty or does not start with a BOM.
*/
codepoint getBOM(inout(dchar)[] str) {
    if (str.length == 0)
        return 0;

    // This is UTF32, so the BOM (if any) is a single element.
    if (isBOM(str[0]))
        return str[0];

    return 0;
}

/**
    Returns a string which is [str] converted to machine order.

    If the string has no BOM the specified fallback endian will be used.

    Params:
        str =               The string to convert.
        fallbackEndian =    The byte order to assume when $(D str) has no BOM.

    Returns:
        A new string in machine order; an empty string if $(D str) is empty.
*/
ndstring toMachineOrder(inout(dchar)[] str, Endianess fallbackEndian = NATIVE_ENDIAN) {

    // Empty string early escape.
    if (str.length == 0)
        return ndstring.init;

    // Only ask for the BOM's endianness when there actually is a BOM.
    codepoint bom = getBOM(str);
    Endianess endian = (bom == 0) ? fallbackEndian : getEndianFromBOM(bom);

    if (endian != NATIVE_ENDIAN) {

        // Flip all the bytes around
        ndstring tmp;
        foreach(ref const(dchar) c; str) {
            tmp ~= c.nu_etoh(endian);
        }

        return tmp;
    }

    return ndstring(str);
}

/**
    Decodes a single UTF-32 character.

    Params:
        c = The character to decode.

    Returns:
        $(D c) if it validates, otherwise `unicodeReplacementCharacter`.
*/
codepoint decode(dchar c) {
    if (!validate(c))
        return unicodeReplacementCharacter;
    return c;
}

/**
    Decodes a single UTF-32 string.

    The string is first brought into machine order (based on its BOM, if any),
    then every codepoint is decoded individually; codepoints which fail
    validation are replaced with `unicodeReplacementCharacter`.

    Params:
        str =       The string to decode.
        stripBOM =  Whether a leading BOM should be omitted from the result.

    Returns:
        The decoded sequence of codepoints.
*/
UnicodeSequence decode(inout(dchar)[] str, bool stripBOM = false) {

    // Handle BOM: convert once, then decode from the converted data only,
    // so the input is neither duplicated nor decoded in the wrong byte order.
    ndstring native = toMachineOrder(str);
    size_t start = (stripBOM && getBOM(str) != 0) ? 1 : 0;

    auto src = native[];
    ndstring decoded;
    foreach(dchar c; src[start..$]) {

        // Keep the full 32-bit value; narrowing to wchar would
        // corrupt every codepoint above U+FFFF.
        decoded ~= cast(dchar)decode(c);
    }

    return UnicodeSequence(cast(uint[])decoded[]);
}

/**
    Decodes a single UTF-32 string.

    Params:
        str =       The string to decode.
        stripBOM =  Whether a leading BOM should be omitted from the result.

    Returns:
        The decoded sequence of codepoints.
*/
UnicodeSequence decode(ndstring str, bool stripBOM = false) {
    return decode(str[], stripBOM);
}

/**
    Encodes a UTF-32 string.

    Since UnicodeSequence is already technically
    UTF-32 this doesn't do much other than
    throw the data into a ndstring.

    Params:
        slice =     The codepoints to encode.
        addBOM =    Whether to prepend a BOM; it is only added when the
                    slice is non-empty and does not already start with one.

    Returns:
        The encoded UTF-32 string.
*/
ndstring encode(UnicodeSlice slice, bool addBOM = false) {
    ndstring out_;

    if (addBOM && slice.length > 0 && slice[0] != UNICODE_BOM) {
        out_ ~= UNICODE_BOM;
    }

    out_ ~= ndstring(cast(dchar[])slice[0..$]);
    return out_;
}

/**
    Encodes a UTF-32 string.

    Since UnicodeSequence is already technically
    UTF-32 this doesn't do much other than
    throw the data into a ndstring.

    Params:
        seq =       The codepoint sequence to encode.
        addBOM =    Whether to prepend a BOM; it is only added when the
                    sequence is non-empty and does not already start with one.

    Returns:
        The encoded UTF-32 string.
*/
ndstring encode(ref UnicodeSequence seq, bool addBOM = false) {
    return encode(seq[0..$], addBOM);
}