utf8-decode.c - libgrapheme - unicode string library | |
git clone git://git.suckless.org/libgrapheme | |
Log | |
Files | |
Refs | |
README | |
LICENSE | |
--- | |
utf8-decode.c (7826B) | |
--- | |
1 /* See LICENSE file for copyright and license details. */ | |
2 #include <stddef.h> | |
3 #include <stdint.h> | |
4 #include <stdio.h> | |
5 #include <string.h> | |
6 | |
7 #include "../grapheme.h" | |
8 #include "util.h" | |
9 | |
/*
 * Test vectors for the UTF-8 decoder test in main(): each entry pairs an
 * input byte sequence (arr, len) with the length (exp_len) and codepoint
 * (exp_cp) that main() compares against the results of
 * grapheme_decode_utf8().
 *
 * Note that exp_len is not bounded by len: for inputs that end before
 * the sequence announced by the lead byte is complete, the entries
 * expect a length larger than len (e.g. a lone 0xC3 expects 2), while
 * entries with a malformed continuation byte expect the number of
 * bytes preceding it.
 */
10 static const struct { | |
11 char *arr; /* UTF-8 byte sequence */ | |
12 size_t len; /* length of UTF-8 byte sequence */ | |
13 size_t exp_len; /* expected length returned */ | |
14 uint_least32_t exp_cp; /* expected codepoint returned */ | |
15 } dec_test[] = { | |
16 { | |
17 /* empty sequence | |
18 * [ ] -> | |
19 * INVALID | |
20 */ | |
21 .arr = NULL, | |
22 .len = 0, | |
23 .exp_len = 0, | |
24 .exp_cp = GRAPHEME_INVALID_CODEPOINT, | |
25 }, | |
26 { | |
27 /* invalid lead byte | |
28 * [ 11111101 ] -> | |
29 * INVALID | |
30 */ | |
31 .arr = (char *)(unsigned char[]) { 0xFD }, | |
32 .len = 1, | |
33 .exp_len = 1, | |
34 .exp_cp = GRAPHEME_INVALID_CODEPOINT, | |
35 }, | |
36 { | |
37 /* valid 1-byte sequence | |
38 * [ 00000001 ] -> | |
39 * 0000001 | |
40 */ | |
41 .arr = (char *)(unsigned char[]) { 0x01 }, | |
42 .len = 1, | |
43 .exp_len = 1, | |
44 .exp_cp = 0x1, | |
45 }, | |
46 { | |
47 /* valid 2-byte sequence | |
48 * [ 11000011 10111111 ] -> | |
49 * 00011111111 | |
50 */ | |
51 .arr = (char *)(unsigned char[]) { 0xC3, 0xBF }, | |
52 .len = 2, | |
53 .exp_len = 2, | |
54 .exp_cp = 0xFF, | |
55 }, | |
56 { | |
57 /* invalid 2-byte sequence (second byte missing) | |
58 * [ 11000011 ] -> | |
59 * INVALID | |
60 */ | |
61 .arr = (char *)(unsigned char[]) { 0xC3 }, | |
62 .len = 1, | |
63 .exp_len = 2, | |
64 .exp_cp = GRAPHEME_INVALID_CODEPOINT, | |
65 }, | |
66 { | |
67 /* invalid 2-byte sequence (second byte malformed) | |
68 * [ 11000011 11111111 ] -> | |
69 * INVALID | |
70 */ | |
71 .arr = (char *)(unsigned char[]) { 0xC3, 0xFF }, | |
72 .len = 2, | |
73 .exp_len = 1, | |
74 .exp_cp = GRAPHEME_INVALID_CODEPOINT, | |
75 }, | |
76 { | |
77 /* invalid 2-byte sequence (overlong encoded) | |
78 * [ 11000001 10111111 ] -> | |
79 * INVALID | |
80 */ | |
81 .arr = (char *)(unsigned char[]) { 0xC1, 0xBF }, | |
82 .len = 2, | |
83 .exp_len = 2, | |
84 .exp_cp = GRAPHEME_INVALID_CODEPOINT, | |
85 }, | |
86 { | |
87 /* valid 3-byte sequence | |
88 * [ 11100000 10111111 10111111 ] -> | |
89 * 0000111111111111 | |
90 */ | |
91 .arr = (char *)(unsigned char[]) { 0xE0, 0xBF, 0xBF }, | |
92 .len = 3, | |
93 .exp_len = 3, | |
94 .exp_cp = 0xFFF, | |
95 }, | |
96 { | |
97 /* invalid 3-byte sequence (second byte missing) | |
98 * [ 11100000 ] -> | |
99 * INVALID | |
100 */ | |
101 .arr = (char *)(unsigned char[]) { 0xE0 }, | |
102 .len = 1, | |
103 .exp_len = 3, | |
104 .exp_cp = GRAPHEME_INVALID_CODEPOINT, | |
105 }, | |
106 { | |
107 /* invalid 3-byte sequence (second byte malformed) | |
108 * [ 11100000 01111111 10111111 ] -> | |
109 * INVALID | |
110 */ | |
111 .arr = (char *)(unsigned char[]) { 0xE0, 0x7F, 0xBF }, | |
112 .len = 3, | |
113 .exp_len = 1, | |
114 .exp_cp = GRAPHEME_INVALID_CODEPOINT, | |
115 }, | |
116 { | |
117 /* invalid 3-byte sequence (short string, second byte malformed) | |
118 * [ 11100000 01111111 ] -> | |
119 * INVALID | |
120 */ | |
121 .arr = (char *)(unsigned char[]) { 0xE0, 0x7F }, | |
122 .len = 2, | |
123 .exp_len = 1, | |
124 .exp_cp = GRAPHEME_INVALID_CODEPOINT, | |
125 }, | |
126 { | |
127 /* invalid 3-byte sequence (third byte missing) | |
128 * [ 11100000 10111111 ] -> | |
129 * INVALID | |
130 */ | |
131 .arr = (char *)(unsigned char[]) { 0xE0, 0xBF }, | |
132 .len = 2, | |
133 .exp_len = 3, | |
134 .exp_cp = GRAPHEME_INVALID_CODEPOINT, | |
135 }, | |
136 { | |
137 /* invalid 3-byte sequence (third byte malformed) | |
138 * [ 11100000 10111111 01111111 ] -> | |
139 * INVALID | |
140 */ | |
141 .arr = (char *)(unsigned char[]) { 0xE0, 0xBF, 0x7F }, | |
142 .len = 3, | |
143 .exp_len = 2, | |
144 .exp_cp = GRAPHEME_INVALID_CODEPOINT, | |
145 }, | |
146 { | |
147 /* invalid 3-byte sequence (overlong encoded) | |
148 * [ 11100000 10011111 10111111 ] -> | |
149 * INVALID | |
150 */ | |
151 .arr = (char *)(unsigned char[]) { 0xE0, 0x9F, 0xBF }, | |
152 .len = 3, | |
153 .exp_len = 3, | |
154 .exp_cp = GRAPHEME_INVALID_CODEPOINT, | |
155 }, | |
156 { | |
157 /* invalid 3-byte sequence (UTF-16 surrogate half) | |
158 * [ 11101101 10100000 10000000 ] -> | |
159 * INVALID | |
160 */ | |
161 .arr = (char *)(unsigned char[]) { 0xED, 0xA0, 0x80 }, | |
162 .len = 3, | |
163 .exp_len = 3, | |
164 .exp_cp = GRAPHEME_INVALID_CODEPOINT, | |
165 }, | |
166 { | |
167 /* valid 4-byte sequence | |
168 * [ 11110011 10111111 10111111 10111111 ] -> | |
169 * 011111111111111111111 | |
170 */ | |
171 .arr = (char *)(unsigned char[]) { 0xF3, 0xBF, 0xBF, 0xB… | |
172 .len = 4, | |
173 .exp_len = 4, | |
174 .exp_cp = UINT32_C(0xFFFFF), | |
175 }, | |
176 { | |
177 /* invalid 4-byte sequence (second byte missing) | |
178 * [ 11110011 ] -> | |
179 * INVALID | |
180 */ | |
181 .arr = (char *)(unsigned char[]) { 0xF3 }, | |
182 .len = 1, | |
183 .exp_len = 4, | |
184 .exp_cp = GRAPHEME_INVALID_CODEPOINT, | |
185 }, | |
186 { | |
187 /* invalid 4-byte sequence (second byte malformed) | |
188 * [ 11110011 01111111 10111111 10111111 ] -> | |
189 * INVALID | |
190 */ | |
191 .arr = (char *)(unsigned char[]) { 0xF3, 0x7F, 0xBF, 0xB… | |
192 .len = 4, | |
193 .exp_len = 1, | |
194 .exp_cp = GRAPHEME_INVALID_CODEPOINT, | |
195 }, | |
196 { | |
197 /* invalid 4-byte sequence (short string 1, second byte | |
198 * malformed) [ 11110011 01111111 ] -> INVALID | |
199 */ | |
200 .arr = (char *)(unsigned char[]) { 0xF3, 0x7F }, | |
201 .len = 2, | |
202 .exp_len = 1, | |
203 .exp_cp = GRAPHEME_INVALID_CODEPOINT, | |
204 }, | |
205 { | |
206 /* invalid 4-byte sequence (short string 2, second byte | |
207 * malformed) [ 11110011 01111111 10111111 ] -> INVALID | |
208 */ | |
209 .arr = (char *)(unsigned char[]) { 0xF3, 0x7F, 0xBF }, | |
210 .len = 3, | |
211 .exp_len = 1, | |
212 .exp_cp = GRAPHEME_INVALID_CODEPOINT, | |
213 }, | |
214 | |
215 { | |
216 /* invalid 4-byte sequence (third byte missing) | |
217 * [ 11110011 10111111 ] -> | |
218 * INVALID | |
219 */ | |
220 .arr = (char *)(unsigned char[]) { 0xF3, 0xBF }, | |
221 .len = 2, | |
222 .exp_len = 4, | |
223 .exp_cp = GRAPHEME_INVALID_CODEPOINT, | |
224 }, | |
225 { | |
226 /* invalid 4-byte sequence (third byte malformed) | |
227 * [ 11110011 10111111 01111111 10111111 ] -> | |
228 * INVALID | |
229 */ | |
230 .arr = (char *)(unsigned char[]) { 0xF3, 0xBF, 0x7F, 0xB… | |
231 .len = 4, | |
232 .exp_len = 2, | |
233 .exp_cp = GRAPHEME_INVALID_CODEPOINT, | |
234 }, | |
235 { | |
236 /* invalid 4-byte sequence (short string, third byte malformed) | |
237 * [ 11110011 10111111 01111111 ] -> | |
238 * INVALID | |
239 */ | |
240 .arr = (char *)(unsigned char[]) { 0xF3, 0xBF, 0x7F }, | |
241 .len = 3, | |
242 .exp_len = 2, | |
243 .exp_cp = GRAPHEME_INVALID_CODEPOINT, | |
244 }, | |
245 { | |
246 /* invalid 4-byte sequence (fourth byte missing) | |
247 * [ 11110011 10111111 10111111 ] -> | |
248 * INVALID | |
249 */ | |
250 .arr = (char *)(unsigned char[]) { 0xF3, 0xBF, 0xBF }, | |
251 .len = 3, | |
252 .exp_len = 4, | |
253 .exp_cp = GRAPHEME_INVALID_CODEPOINT, | |
254 }, | |
255 { | |
256 /* invalid 4-byte sequence (fourth byte malformed) | |
257 * [ 11110011 10111111 10111111 01111111 ] -> | |
258 * INVALID | |
259 */ | |
260 .arr = (char *)(unsigned char[]) { 0xF3, 0xBF, 0xBF, 0x7… | |
261 .len = 4, | |
262 .exp_len = 3, | |
263 .exp_cp = GRAPHEME_INVALID_CODEPOINT, | |
264 }, | |
265 { | |
266 /* invalid 4-byte sequence (overlong encoded) | |
267 * [ 11110000 10000000 10000001 10111111 ] -> | |
268 * INVALID | |
269 */ | |
270 .arr = (char *)(unsigned char[]) { 0xF0, 0x80, 0x81, 0xB… | |
271 .len = 4, | |
272 .exp_len = 4, | |
273 .exp_cp = GRAPHEME_INVALID_CODEPOINT, | |
274 }, | |
275 { | |
276 /* invalid 4-byte sequence (UTF-16-unrepresentable) | |
277 * [ 11110100 10010000 10000000 10000000 ] -> | |
278 * INVALID | |
279 */ | |
280 .arr = (char *)(unsigned char[]) { 0xF4, 0x90, 0x80, 0x8… | |
281 .len = 4, | |
282 .exp_len = 4, | |
283 .exp_cp = GRAPHEME_INVALID_CODEPOINT, | |
284 }, | |
285 }; | |
286 | |
287 int | |
288 main(int argc, char *argv[]) | |
289 { | |
290 size_t i, failed; | |
291 | |
292 (void)argc; | |
293 | |
294 /* UTF-8 decoder test */ | |
295 for (i = 0, failed = 0; i < LEN(dec_test); i++) { | |
296 size_t len; | |
297 uint_least32_t cp; | |
298 | |
299 len = grapheme_decode_utf8(dec_test[i].arr, dec_test[i].… | |
300 &cp); | |
301 | |
302 if (len != dec_test[i].exp_len || cp != dec_test[i].exp_… | |
303 fprintf(stderr, | |
304 "%s: Failed test %zu: " | |
305 "Expected (%zx,%u), but got (%zx,%u).\n", | |
306 argv[0], i, dec_test[i].exp_len, | |
307 dec_test[i].exp_cp, len, cp); | |
308 failed++; | |
309 } | |
310 } | |
311 printf("%s: %zu/%zu unit tests passed.\n", argv[0], | |
312 LEN(dec_test) - failed, LEN(dec_test)); | |
313 | |
314 return (failed > 0) ? 1 : 0; | |
315 } |