1 /**
2     UTF-32 Utilities
3     
4     Copyright:
5         Copyright © 2023-2025, Kitsunebi Games
6         Copyright © 2023-2025, Inochi2D Project
7     
8     License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
9     Authors:   Luna Nielsen
10 */
11 module nulib.text.unicode.utf32;
12 import nulib.text.unicode;
13 import nulib.memory.endian;
14 import nulib.string;
15 
16 @nogc:
17 
/**
    Checks whether a single UTF-32 code unit is acceptable.

    The check itself is delegated to the `validate` function
    of `nulib.text.unicode`.

    Params:
        c = The UTF-32 code unit to check.

    Returns:
        Whatever `nulib.text.unicode.validate` reports for $(D c).
*/
bool validate(dchar c) {

    // This module declares its own `validate` overloads, so the
    // package-level function is reached through a renamed local import.
    import unicodeBase = nulib.text.unicode;

    return unicodeBase.validate(c);
}
27 
/**
    Checks whether every code unit of a UTF-32 string is valid.

    Params:
        str = The string to check.

    Returns:
        The result of validating the slice obtained from $(D str).
*/
bool validate(ndstring str) {

    // Forward to the slice overload, which does the actual work.
    auto units = str[];
    return validate(units);
}
34 
/**
    Checks whether every code unit of a UTF-32 slice is valid.

    If the slice starts with a BOM which indicates a non-native
    byte order, the contents are converted to machine order
    before the individual code units are checked.

    Params:
        str = The slice to check.

    Returns:
        $(D true) if all code units pass validation (an empty slice
        therefore passes), $(D false) on the first one that does not.
*/
bool validate(inout(dchar)[] str) {
    ndstring native = str;

    // Handle endianness; only a BOM tells us that a swap is needed.
    codepoint mark = getBOM(str);
    if (mark != 0) {
        if (getEndianFromBOM(mark) != NATIVE_ENDIAN)
            native = toMachineOrder(str);
    }

    foreach(dchar unit; native) {
        if (validate(unit))
            continue;

        return false;
    }

    return true;
}
54 
/**
    Gets the byte-order mark of a UTF-32 slice.

    Params:
        str = The slice to inspect.

    Returns:
        The first code unit of $(D str) if `isBOM` accepts it,
        $(D 0) if the slice is empty or does not start with a BOM.
*/
codepoint getBOM(inout(dchar)[] str) {

    // In UTF-32 a BOM occupies exactly one code unit,
    // so only the first element needs to be looked at.
    if (str.length > 0 && isBOM(str[0]))
        return str[0];

    return 0;
}
68 
/**
    Returns a copy of [str] converted to machine byte order.

    The source byte order is taken from the BOM of the string;
    if the string has no BOM the specified fallback endian
    is used instead.

    Params:
        str            = The slice to convert.
        fallbackEndian = Byte order to assume when $(D str) has no BOM.

    Returns:
        A new string in machine order, or an empty string
        if $(D str) is empty.
*/
ndstring toMachineOrder(inout(dchar)[] str, Endianess fallbackEndian = NATIVE_ENDIAN) {

    // Nothing to convert.
    if (str.length == 0)
        return ndstring.init;

    // Work out which byte order the input is stored in.
    codepoint mark = getBOM(str);
    Endianess source = getEndianFromBOM(mark);
    if (mark == 0)
        source = fallbackEndian;

    // Already in machine order; a plain copy is enough.
    if (source == NATIVE_ENDIAN)
        return ndstring(str);

    // Convert every code unit from the source order to host order.
    ndstring swapped;
    foreach(ref const(dchar) unit; str) {
        swapped ~= unit.nu_etoh(source);
    }

    return swapped;
}
98 
/**
    Decodes a single UTF-32 code unit.

    Params:
        c = The code unit to decode.

    Returns:
        $(D c) itself if it passes validation, otherwise
        `unicodeReplacementCharacter`.
*/
codepoint decode(dchar c) {
    if (validate(c))
        return c;

    // Invalid input is substituted rather than rejected.
    return unicodeReplacementCharacter;
}
107 
/**
    Decodes a UTF-32 string into a sequence of codepoints.

    If the string starts with a BOM it is first converted to
    machine byte order. Every code unit is then passed through
    the single-unit `decode`, so code units which fail validation
    are substituted with `unicodeReplacementCharacter`.

    Params:
        str      = The slice to decode.
        stripBOM = If $(D true) and $(D str) starts with a BOM,
                   the BOM is left out of the result.

    Returns:
        A UnicodeSequence with one codepoint per decoded code unit.
*/
UnicodeSequence decode(inout(dchar)[] str, bool stripBOM = false) {
    ndstring ordered;
    const(dchar)[] units = str;
    size_t start = 0;

    // Handle BOM; from here on only the machine-order copy is read,
    // the original (possibly byte-swapped) units must not be decoded.
    if (getBOM(str) != 0) {
        ordered = toMachineOrder(str);
        units = ordered[];
        start = stripBOM ? 1 : 0;
    }

    // The result is built separately from the machine-order copy
    // so that no code unit ends up in the output twice.
    ndstring decoded;
    foreach(c; units[start..$]) {

        // Keep the full 32 bits; narrowing to wchar would
        // truncate every codepoint above U+FFFF.
        decoded ~= cast(dchar)decode(c);
    }

    return UnicodeSequence(cast(uint[])decoded[]);
}
127 
/**
    Decodes a UTF-32 string into a sequence of codepoints.

    Params:
        str      = The string to decode.
        stripBOM = Forwarded unchanged to the slice overload
                   of `decode`.

    Returns:
        The result of decoding the slice obtained from $(D str).
*/
UnicodeSequence decode(ndstring str, bool stripBOM = false) {

    // Forward to the slice overload, which does the actual work.
    auto units = str[];
    return decode(units, stripBOM);
}
134 
/**
    Encodes a slice of codepoints as a UTF-32 string.

    Since a UnicodeSlice is already technically UTF-32,
    this doesn't do much other than copy the data into
    a ndstring.

    Params:
        slice  = The codepoints to encode.
        addBOM = If $(D true), `UNICODE_BOM` is prepended, unless
                 $(D slice) is empty or already starts with it.

    Returns:
        A ndstring containing the codepoints of $(D slice).
*/
ndstring encode(UnicodeSlice slice, bool addBOM = false) {
    ndstring result;

    // A BOM is only written for non-empty input which lacks one.
    bool needsBOM = addBOM && slice.length > 0 && slice[0] != UNICODE_BOM;
    if (needsBOM) {
        result ~= UNICODE_BOM;
    }

    // Reinterpret the codepoints as UTF-32 code units and append them.
    result ~= ndstring(cast(dchar[])slice[0..$]);
    return result;
}
152 
/**
    Encodes a sequence of codepoints as a UTF-32 string.

    Since UnicodeSequence is already technically UTF-32,
    this doesn't do much other than copy the data into
    a ndstring.

    Params:
        seq    = The sequence to encode.
        addBOM = Forwarded unchanged to the slice overload
                 of `encode`.

    Returns:
        The result of encoding the full slice of $(D seq).
*/
ndstring encode(ref UnicodeSequence seq, bool addBOM = false) {

    // Forward to the slice overload, which does the actual work.
    auto everything = seq[0..$];
    return encode(everything, addBOM);
}