Introduction
Introduction Statistics Contact Development Disclaimer Help
utf8-decode.c - libgrapheme - unicode string library
git clone git://git.suckless.org/libgrapheme
Log
Files
Refs
README
LICENSE
---
utf8-decode.c (7826B)
---
1 /* See LICENSE file for copyright and license details. */
2 #include <stddef.h>
3 #include <stdint.h>
4 #include <stdio.h>
5 #include <string.h>
6
7 #include "../grapheme.h"
8 #include "util.h"
9
10 static const struct {
11 char *arr; /* UTF-8 byte sequence */
12 size_t len; /* length of UTF-8 byte sequence */
13 size_t exp_len; /* expected length returned */
14 uint_least32_t exp_cp; /* expected codepoint returned */
15 } dec_test[] = {
16 {
17 /* empty sequence
18 * [ ] ->
19 * INVALID
20 */
21 .arr = NULL,
22 .len = 0,
23 .exp_len = 0,
24 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
25 },
26 {
27 /* invalid lead byte
28 * [ 11111101 ] ->
29 * INVALID
30 */
31 .arr = (char *)(unsigned char[]) { 0xFD },
32 .len = 1,
33 .exp_len = 1,
34 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
35 },
36 {
37 /* valid 1-byte sequence
38 * [ 00000001 ] ->
39 * 0000001
40 */
41 .arr = (char *)(unsigned char[]) { 0x01 },
42 .len = 1,
43 .exp_len = 1,
44 .exp_cp = 0x1,
45 },
46 {
47 /* valid 2-byte sequence
48 * [ 11000011 10111111 ] ->
49 * 00011111111
50 */
51 .arr = (char *)(unsigned char[]) { 0xC3, 0xBF },
52 .len = 2,
53 .exp_len = 2,
54 .exp_cp = 0xFF,
55 },
56 {
57 /* invalid 2-byte sequence (second byte missing)
58 * [ 11000011 ] ->
59 * INVALID
60 */
61 .arr = (char *)(unsigned char[]) { 0xC3 },
62 .len = 1,
63 .exp_len = 2,
64 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
65 },
66 {
67 /* invalid 2-byte sequence (second byte malformed)
68 * [ 11000011 11111111 ] ->
69 * INVALID
70 */
71 .arr = (char *)(unsigned char[]) { 0xC3, 0xFF },
72 .len = 2,
73 .exp_len = 1,
74 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
75 },
76 {
77 /* invalid 2-byte sequence (overlong encoded)
78 * [ 11000001 10111111 ] ->
79 * INVALID
80 */
81 .arr = (char *)(unsigned char[]) { 0xC1, 0xBF },
82 .len = 2,
83 .exp_len = 2,
84 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
85 },
86 {
87 /* valid 3-byte sequence
88 * [ 11100000 10111111 10111111 ] ->
89 * 0000111111111111
90 */
91 .arr = (char *)(unsigned char[]) { 0xE0, 0xBF, 0xBF },
92 .len = 3,
93 .exp_len = 3,
94 .exp_cp = 0xFFF,
95 },
96 {
97 /* invalid 3-byte sequence (second byte missing)
98 * [ 11100000 ] ->
99 * INVALID
100 */
101 .arr = (char *)(unsigned char[]) { 0xE0 },
102 .len = 1,
103 .exp_len = 3,
104 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
105 },
106 {
107 /* invalid 3-byte sequence (second byte malformed)
108 * [ 11100000 01111111 10111111 ] ->
109 * INVALID
110 */
111 .arr = (char *)(unsigned char[]) { 0xE0, 0x7F, 0xBF },
112 .len = 3,
113 .exp_len = 1,
114 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
115 },
116 {
117 /* invalid 3-byte sequence (short string, second byte ma…
118 * [ 11100000 01111111 ] ->
119 * INVALID
120 */
121 .arr = (char *)(unsigned char[]) { 0xE0, 0x7F },
122 .len = 2,
123 .exp_len = 1,
124 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
125 },
126 {
127 /* invalid 3-byte sequence (third byte missing)
128 * [ 11100000 10111111 ] ->
129 * INVALID
130 */
131 .arr = (char *)(unsigned char[]) { 0xE0, 0xBF },
132 .len = 2,
133 .exp_len = 3,
134 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
135 },
136 {
137 /* invalid 3-byte sequence (third byte malformed)
138 * [ 11100000 10111111 01111111 ] ->
139 * INVALID
140 */
141 .arr = (char *)(unsigned char[]) { 0xE0, 0xBF, 0x7F },
142 .len = 3,
143 .exp_len = 2,
144 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
145 },
146 {
147 /* invalid 3-byte sequence (overlong encoded)
148 * [ 11100000 10011111 10111111 ] ->
149 * INVALID
150 */
151 .arr = (char *)(unsigned char[]) { 0xE0, 0x9F, 0xBF },
152 .len = 3,
153 .exp_len = 3,
154 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
155 },
156 {
157 /* invalid 3-byte sequence (UTF-16 surrogate half)
158 * [ 11101101 10100000 10000000 ] ->
159 * INVALID
160 */
161 .arr = (char *)(unsigned char[]) { 0xED, 0xA0, 0x80 },
162 .len = 3,
163 .exp_len = 3,
164 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
165 },
166 {
167 /* valid 4-byte sequence
168 * [ 11110011 10111111 10111111 10111111 ] ->
169 * 011111111111111111111
170 */
171 .arr = (char *)(unsigned char[]) { 0xF3, 0xBF, 0xBF, 0xB…
172 .len = 4,
173 .exp_len = 4,
174 .exp_cp = UINT32_C(0xFFFFF),
175 },
176 {
177 /* invalid 4-byte sequence (second byte missing)
178 * [ 11110011 ] ->
179 * INVALID
180 */
181 .arr = (char *)(unsigned char[]) { 0xF3 },
182 .len = 1,
183 .exp_len = 4,
184 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
185 },
186 {
187 /* invalid 4-byte sequence (second byte malformed)
188 * [ 11110011 01111111 10111111 10111111 ] ->
189 * INVALID
190 */
191 .arr = (char *)(unsigned char[]) { 0xF3, 0x7F, 0xBF, 0xB…
192 .len = 4,
193 .exp_len = 1,
194 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
195 },
196 {
197 /* invalid 4-byte sequence (short string 1, second byte
198 * malformed) [ 11110011 011111111 ] -> INVALID
199 */
200 .arr = (char *)(unsigned char[]) { 0xF3, 0x7F },
201 .len = 2,
202 .exp_len = 1,
203 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
204 },
205 {
206 /* invalid 4-byte sequence (short string 2, second byte
207 * malformed) [ 11110011 011111111 10111111 ] -> INVALID
208 */
209 .arr = (char *)(unsigned char[]) { 0xF3, 0x7F, 0xBF },
210 .len = 3,
211 .exp_len = 1,
212 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
213 },
214
215 {
216 /* invalid 4-byte sequence (third byte missing)
217 * [ 11110011 10111111 ] ->
218 * INVALID
219 */
220 .arr = (char *)(unsigned char[]) { 0xF3, 0xBF },
221 .len = 2,
222 .exp_len = 4,
223 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
224 },
225 {
226 /* invalid 4-byte sequence (third byte malformed)
227 * [ 11110011 10111111 01111111 10111111 ] ->
228 * INVALID
229 */
230 .arr = (char *)(unsigned char[]) { 0xF3, 0xBF, 0x7F, 0xB…
231 .len = 4,
232 .exp_len = 2,
233 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
234 },
235 {
236 /* invalid 4-byte sequence (short string, third byte mal…
237 * [ 11110011 10111111 01111111 ] ->
238 * INVALID
239 */
240 .arr = (char *)(unsigned char[]) { 0xF3, 0xBF, 0x7F },
241 .len = 3,
242 .exp_len = 2,
243 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
244 },
245 {
246 /* invalid 4-byte sequence (fourth byte missing)
247 * [ 11110011 10111111 10111111 ] ->
248 * INVALID
249 */
250 .arr = (char *)(unsigned char[]) { 0xF3, 0xBF, 0xBF },
251 .len = 3,
252 .exp_len = 4,
253 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
254 },
255 {
256 /* invalid 4-byte sequence (fourth byte malformed)
257 * [ 11110011 10111111 10111111 01111111 ] ->
258 * INVALID
259 */
260 .arr = (char *)(unsigned char[]) { 0xF3, 0xBF, 0xBF, 0x7…
261 .len = 4,
262 .exp_len = 3,
263 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
264 },
265 {
266 /* invalid 4-byte sequence (overlong encoded)
267 * [ 11110000 10000000 10000001 10111111 ] ->
268 * INVALID
269 */
270 .arr = (char *)(unsigned char[]) { 0xF0, 0x80, 0x81, 0xB…
271 .len = 4,
272 .exp_len = 4,
273 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
274 },
275 {
276 /* invalid 4-byte sequence (UTF-16-unrepresentable)
277 * [ 11110100 10010000 10000000 10000000 ] ->
278 * INVALID
279 */
280 .arr = (char *)(unsigned char[]) { 0xF4, 0x90, 0x80, 0x8…
281 .len = 4,
282 .exp_len = 4,
283 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
284 },
285 };
286
287 int
288 main(int argc, char *argv[])
289 {
290 size_t i, failed;
291
292 (void)argc;
293
294 /* UTF-8 decoder test */
295 for (i = 0, failed = 0; i < LEN(dec_test); i++) {
296 size_t len;
297 uint_least32_t cp;
298
299 len = grapheme_decode_utf8(dec_test[i].arr, dec_test[i].…
300 &cp);
301
302 if (len != dec_test[i].exp_len || cp != dec_test[i].exp_…
303 fprintf(stderr,
304 "%s: Failed test %zu: "
305 "Expected (%zx,%u), but got (%zx,%u).\n",
306 argv[0], i, dec_test[i].exp_len,
307 dec_test[i].exp_cp, len, cp);
308 failed++;
309 }
310 }
311 printf("%s: %zu/%zu unit tests passed.\n", argv[0],
312 LEN(dec_test) - failed, LEN(dec_test));
313
314 return (failed > 0) ? 1 : 0;
315 }
You are viewing proxied material from suckless.org. The copyright of proxied material belongs to its original authors. Any comments or complaints in relation to proxied material should be directed to the original authors of the content concerned. Please see the disclaimer for more details.