bidirectional.c - libgrapheme - unicode string library | |
git clone git://git.suckless.org/libgrapheme | |
Log | |
Files | |
Refs | |
README | |
LICENSE | |
--- | |
bidirectional.c (12515B) | |
--- | |
1 /* See LICENSE file for copyright and license details. */ | |
2 #include <errno.h> | |
3 #include <inttypes.h> | |
4 #include <stddef.h> | |
5 #include <stdio.h> | |
6 #include <stdlib.h> | |
7 #include <string.h> | |
8 | |
9 #include "util.h" | |
10 | |
/* Unicode data files read by this generator (paths as used by the calls below) */
#define FILE_BIDI_BRACKETS  "data/BidiBrackets.txt"
#define FILE_BIDI_CLASS     "data/DerivedBidiClass.txt"
#define FILE_BIDI_MIRRORING "data/BidiMirroring.txt"
#define FILE_UNICODE_DATA   "data/UnicodeData.txt"

/* capacity of the base- and pair-alias lists of a single bracket entry */
#define NUM_BRACKET_ALIASES 20
17 | |
18 static const struct property_spec bidi_property[] = { | |
19 { | |
20 /* default */ | |
21 .enumname = "L", | |
22 .file = FILE_BIDI_CLASS, | |
23 .ucdname = "L", | |
24 }, | |
25 { | |
26 .enumname = "AL", | |
27 .file = FILE_BIDI_CLASS, | |
28 .ucdname = "AL", | |
29 }, | |
30 { | |
31 .enumname = "AN", | |
32 .file = FILE_BIDI_CLASS, | |
33 .ucdname = "AN", | |
34 }, | |
35 { | |
36 .enumname = "B", | |
37 .file = FILE_BIDI_CLASS, | |
38 .ucdname = "B", | |
39 }, | |
40 { | |
41 .enumname = "BN", | |
42 .file = FILE_BIDI_CLASS, | |
43 .ucdname = "BN", | |
44 }, | |
45 { | |
46 .enumname = "CS", | |
47 .file = FILE_BIDI_CLASS, | |
48 .ucdname = "CS", | |
49 }, | |
50 { | |
51 .enumname = "EN", | |
52 .file = FILE_BIDI_CLASS, | |
53 .ucdname = "EN", | |
54 }, | |
55 { | |
56 .enumname = "ES", | |
57 .file = FILE_BIDI_CLASS, | |
58 .ucdname = "ES", | |
59 }, | |
60 { | |
61 .enumname = "ET", | |
62 .file = FILE_BIDI_CLASS, | |
63 .ucdname = "ET", | |
64 }, | |
65 { | |
66 .enumname = "FSI", | |
67 .file = FILE_BIDI_CLASS, | |
68 .ucdname = "FSI", | |
69 }, | |
70 { | |
71 .enumname = "LRE", | |
72 .file = FILE_BIDI_CLASS, | |
73 .ucdname = "LRE", | |
74 }, | |
75 { | |
76 .enumname = "LRI", | |
77 .file = FILE_BIDI_CLASS, | |
78 .ucdname = "LRI", | |
79 }, | |
80 { | |
81 .enumname = "LRO", | |
82 .file = FILE_BIDI_CLASS, | |
83 .ucdname = "LRO", | |
84 }, | |
85 { | |
86 .enumname = "NSM", | |
87 .file = FILE_BIDI_CLASS, | |
88 .ucdname = "NSM", | |
89 }, | |
90 { | |
91 .enumname = "ON", | |
92 .file = FILE_BIDI_CLASS, | |
93 .ucdname = "ON", | |
94 }, | |
95 { | |
96 .enumname = "PDF", | |
97 .file = FILE_BIDI_CLASS, | |
98 .ucdname = "PDF", | |
99 }, | |
100 { | |
101 .enumname = "PDI", | |
102 .file = FILE_BIDI_CLASS, | |
103 .ucdname = "PDI", | |
104 }, | |
105 { | |
106 .enumname = "R", | |
107 .file = FILE_BIDI_CLASS, | |
108 .ucdname = "R", | |
109 }, | |
110 { | |
111 .enumname = "RLE", | |
112 .file = FILE_BIDI_CLASS, | |
113 .ucdname = "RLE", | |
114 }, | |
115 { | |
116 .enumname = "RLI", | |
117 .file = FILE_BIDI_CLASS, | |
118 .ucdname = "RLI", | |
119 }, | |
120 { | |
121 .enumname = "RLO", | |
122 .file = FILE_BIDI_CLASS, | |
123 .ucdname = "RLO", | |
124 }, | |
125 { | |
126 .enumname = "S", | |
127 .file = FILE_BIDI_CLASS, | |
128 .ucdname = "S", | |
129 }, | |
130 { | |
131 .enumname = "WS", | |
132 .file = FILE_BIDI_CLASS, | |
133 .ucdname = "WS", | |
134 }, | |
135 }; | |
136 | |
/*
 * In/out argument handed to decomposition_callback() through the
 * payload pointer of parse_file_with_callback().
 */
struct decomposition_payload {
	/* in: the codepoint whose decomposition is looked up */
	uint_least32_t cp;
	/* out: its decomposition, or cp itself if the mapping is empty */
	uint_least32_t decomposition;
};
141 | |
142 static int | |
143 decomposition_callback(const char *file, char **field, size_t nfields, | |
144 char *comment, void *payload) | |
145 { | |
146 char *p; | |
147 struct decomposition_payload *decomp = | |
148 (struct decomposition_payload *)payload; | |
149 uint_least32_t cp; | |
150 | |
151 (void)file; | |
152 (void)comment; | |
153 | |
154 if (nfields < 6) { | |
155 /* we have fewer than 6 fields, discard the line */ | |
156 return 0; | |
157 } | |
158 | |
159 hextocp(field[0], strlen(field[0]), &cp); | |
160 | |
161 if (decomp->cp == cp) { | |
162 /* we hit the line that contains our decomposition targe… | |
163 if (strlen(field[5]) > 0) { | |
164 p = field[5]; | |
165 if (*p == '<') { | |
166 /* | |
167 * the decomposition contains some metad… | |
168 * <...> we skip | |
169 */ | |
170 for (; *p != '\0'; p++) { | |
171 if (*p == '>') { | |
172 p++; | |
173 while (*p == ' ') { | |
174 p++; | |
175 } | |
176 break; | |
177 } | |
178 } | |
179 } | |
180 hextocp(p, strlen(p), &(decomp->decomposition)); | |
181 } else { | |
182 decomp->decomposition = decomp->cp; | |
183 } | |
184 } | |
185 | |
186 return 0; | |
187 } | |
188 | |
189 static struct { | |
190 uint_least32_t base[NUM_BRACKET_ALIASES]; | |
191 size_t baselen; | |
192 uint_least32_t pair[NUM_BRACKET_ALIASES]; | |
193 size_t pairlen; | |
194 uint_least8_t class; | |
195 char type; | |
196 } *b = NULL; | |
197 | |
198 static size_t blen; | |
199 static uint_least8_t bracket_class_count = 1; | |
200 | |
201 static int | |
202 bracket_callback(const char *file, char **field, size_t nfields, char *c… | |
203 void *payload) | |
204 { | |
205 size_t i, j; | |
206 struct decomposition_payload decomp_base, decomp_pair; | |
207 uint_least32_t cp_base, cp_pair; | |
208 | |
209 (void)file; | |
210 (void)comment; | |
211 (void)payload; | |
212 | |
213 if (nfields < 3) { | |
214 /* we have fewer than 3 fields, discard the line */ | |
215 return 0; | |
216 } | |
217 | |
218 /* parse field data */ | |
219 hextocp(field[0], strlen(field[0]), &cp_base); | |
220 hextocp(field[1], strlen(field[1]), &cp_pair); | |
221 | |
222 /* determine decomposition of the base and pair codepoints */ | |
223 decomp_base.cp = cp_base; | |
224 decomp_pair.cp = cp_pair; | |
225 parse_file_with_callback(FILE_UNICODE_DATA, decomposition_callba… | |
226 &decomp_base); | |
227 parse_file_with_callback(FILE_UNICODE_DATA, decomposition_callba… | |
228 &decomp_pair); | |
229 | |
230 /* | |
231 * check if we already have the canonical form in the bracket ar… | |
232 * per convention the canonical form is the first element of the… | |
233 * array | |
234 */ | |
235 for (i = 0; i < blen; i++) { | |
236 if (decomp_base.decomposition == b[i].base[0]) { | |
237 /* we have a match, check type */ | |
238 if (strlen(field[2]) != 1 || | |
239 (field[2][0] != 'o' && field[2][0] != 'c')) { | |
240 /* malformed line */ | |
241 return 1; | |
242 } else if (b[i].type != field[2][0]) { | |
243 /* mismatching types */ | |
244 return 1; | |
245 } | |
246 | |
247 /* | |
248 * add our base alias to the base array unless i… | |
249 * already in it | |
250 */ | |
251 for (j = 0; j < b[i].baselen; j++) { | |
252 if (cp_base == b[i].base[j]) { | |
253 /* already in array, do nothing … | |
254 break; | |
255 } | |
256 } | |
257 if (j == b[i].baselen) { | |
258 /* | |
259 * the base alias is not already in the … | |
260 * add it | |
261 */ | |
262 if (b[i].baselen == NUM_BRACKET_ALIASES)… | |
263 fprintf(stderr, "too many aliase… | |
264 return 1; | |
265 } | |
266 b[i].baselen++; | |
267 b[i].base[b[i].baselen - 1] = cp_base; | |
268 } | |
269 | |
270 /* | |
271 * also add our pair alias to the pair array unl… | |
272 * it isn't already in it | |
273 */ | |
274 for (j = 0; j < b[i].pairlen; j++) { | |
275 if (cp_pair == b[i].pair[j]) { | |
276 /* already in array, do nothing … | |
277 break; | |
278 } | |
279 } | |
280 if (j == b[i].pairlen) { | |
281 /* | |
282 * the pair alias is not already in the … | |
283 * add it | |
284 */ | |
285 if (b[i].pairlen == NUM_BRACKET_ALIASES)… | |
286 fprintf(stderr, "too many aliase… | |
287 return 1; | |
288 } | |
289 b[i].pairlen++; | |
290 b[i].pair[b[i].pairlen - 1] = cp_pair; | |
291 } | |
292 | |
293 return 0; | |
294 } | |
295 } | |
296 | |
297 /* extend bracket pair array, as this is a new bracket type */ | |
298 if (!(b = realloc(b, (++blen) * sizeof(*b)))) { | |
299 fprintf(stderr, "realloc: %s\n", strerror(errno)); | |
300 exit(1); | |
301 } | |
302 | |
303 /* fill field data by adding the canonical form first */ | |
304 b[blen - 1].base[0] = decomp_base.decomposition; | |
305 b[blen - 1].baselen = 1; | |
306 b[blen - 1].pair[0] = decomp_pair.decomposition; | |
307 b[blen - 1].pairlen = 1; | |
308 | |
309 /* add alias if it differs from the canonical form */ | |
310 if (cp_base != decomp_base.decomposition) { | |
311 b[blen - 1].base[1] = cp_base; | |
312 b[blen - 1].baselen = 2; | |
313 } | |
314 if (cp_pair != decomp_pair.decomposition) { | |
315 b[blen - 1].pair[1] = cp_pair; | |
316 b[blen - 1].pairlen = 2; | |
317 } | |
318 | |
319 /* add bracket type */ | |
320 if (strlen(field[2]) != 1 || | |
321 (field[2][0] != 'o' && field[2][0] != 'c')) { | |
322 /* malformed line */ | |
323 return 1; | |
324 } else { | |
325 b[blen - 1].type = field[2][0]; | |
326 } | |
327 | |
328 /* | |
329 * determine bracket class by iterating over the bracket-array | |
330 * and seeing if our current canonical cp already has a matching… | |
331 * We only need to check the first entry in each bracket alias | |
332 * list, as this is, per convention, the canonical form. | |
333 * If not, add a new class. | |
334 */ | |
335 for (i = 0; i + 1 < blen; i++) { | |
336 if (b[i].pair[0] == b[blen - 1].base[0]) { | |
337 /* matched class */ | |
338 b[blen - 1].class = b[i].class; | |
339 break; | |
340 } | |
341 } | |
342 if (i + 1 == blen) { | |
343 /* no match, assign a new class */ | |
344 b[blen - 1].class = bracket_class_count++; | |
345 } | |
346 | |
347 return 0; | |
348 } | |
349 | |
350 static void | |
351 post_process(struct properties *prop) | |
352 { | |
353 size_t i, j; | |
354 | |
355 for (i = 0; i < blen; i++) { | |
356 /* | |
357 * given the base property fits in 5 bits, we simply | |
358 * store the bracket-offset in the bits above that. | |
359 * | |
360 * All those properties that are not set here implicitly | |
361 * have offset 0, which we prepared to contain a stub | |
362 * for a character that is not a bracket. | |
363 */ | |
364 for (j = 0; j < b[i].baselen; j++) { | |
365 prop[b[i].base[j]].property |= (i << 5); | |
366 } | |
367 } | |
368 } | |
369 | |
/*
 * Return the default bidirectional class of cp as an index into
 * bidi_property[], for codepoints that have no explicit class.
 */
static uint_least8_t
fill_missing(uint_least32_t cp)
{
	/* positions of the classes within bidi_property[] */
	enum {
		CLASS_L = 0,
		CLASS_AL = 1,
		CLASS_ET = 8,
		CLASS_R = 17
	};
	/* based on the @missing-properties in data/DerivedBidiClass.txt */
	static const struct {
		uint_least32_t first;
		uint_least32_t last;
		uint_least8_t class;
	} range[] = {
		{ UINT32_C(0x0590), UINT32_C(0x05FF), CLASS_R },
		{ UINT32_C(0x07C0), UINT32_C(0x085F), CLASS_R },
		{ UINT32_C(0xFB1D), UINT32_C(0xFB4F), CLASS_R },
		{ UINT32_C(0x10800), UINT32_C(0x10CFF), CLASS_R },
		{ UINT32_C(0x10D40), UINT32_C(0x10EBF), CLASS_R },
		{ UINT32_C(0x10F00), UINT32_C(0x10F2F), CLASS_R },
		{ UINT32_C(0x10F70), UINT32_C(0x10FFF), CLASS_R },
		{ UINT32_C(0x1E800), UINT32_C(0x1EC6F), CLASS_R },
		{ UINT32_C(0x1ECC0), UINT32_C(0x1ECFF), CLASS_R },
		{ UINT32_C(0x1ED50), UINT32_C(0x1EDFF), CLASS_R },
		{ UINT32_C(0x1EF00), UINT32_C(0x1EFFF), CLASS_R },
		{ UINT32_C(0x0600), UINT32_C(0x07BF), CLASS_AL },
		{ UINT32_C(0x0860), UINT32_C(0x08FF), CLASS_AL },
		{ UINT32_C(0xFB50), UINT32_C(0xFDCF), CLASS_AL },
		{ UINT32_C(0xFDF0), UINT32_C(0xFDFF), CLASS_AL },
		{ UINT32_C(0xFE70), UINT32_C(0xFEFF), CLASS_AL },
		{ UINT32_C(0x10D00), UINT32_C(0x10D3F), CLASS_AL },
		{ UINT32_C(0x10EC0), UINT32_C(0x10EFF), CLASS_AL },
		{ UINT32_C(0x10F30), UINT32_C(0x10F6F), CLASS_AL },
		{ UINT32_C(0x1EC70), UINT32_C(0x1ECBF), CLASS_AL },
		{ UINT32_C(0x1ED00), UINT32_C(0x1ED4F), CLASS_AL },
		{ UINT32_C(0x1EE00), UINT32_C(0x1EEFF), CLASS_AL },
		{ UINT32_C(0x20A0), UINT32_C(0x20CF), CLASS_ET },
	};
	size_t i;

	for (i = 0; i < sizeof(range) / sizeof(*range); i++) {
		if (cp >= range[i].first && cp <= range[i].last) {
			return range[i].class;
		}
	}

	/* everything else defaults to class L */
	return CLASS_L;
}
404 | |
405 static struct properties *prop_mirror = NULL; | |
406 | |
407 static int | |
408 mirror_callback(const char *file, char **field, size_t nfields, char *co… | |
409 void *payload) | |
410 { | |
411 uint_least32_t cp, cp_mirror; | |
412 | |
413 (void)file; | |
414 (void)comment; | |
415 (void)payload; | |
416 | |
417 hextocp(field[0], strlen(field[0]), &cp); | |
418 | |
419 cp_mirror = cp; | |
420 | |
421 if (nfields >= 2 && strlen(field[1]) > 0 && | |
422 hextocp(field[1], strlen(field[1]), &cp_mirror)) { | |
423 return 1; | |
424 } | |
425 | |
426 prop_mirror[cp].property = (int_least32_t)cp_mirror - (int_least… | |
427 | |
428 return 0; | |
429 } | |
430 | |
431 static int_least64_t | |
432 get_value(const struct properties *prop, size_t offset) | |
433 { | |
434 return prop[offset].property; | |
435 } | |
436 | |
437 int | |
438 main(int argc, char *argv[]) | |
439 { | |
440 struct properties_compressed comp_mirror; | |
441 struct properties_major_minor mm_mirror; | |
442 size_t i; | |
443 | |
444 (void)argc; | |
445 | |
446 /* | |
447 * the first element in the bracket array is initialized to | |
448 * all-zeros, as we use the implicit 0-offset for all those | |
449 * codepoints that are not a bracket | |
450 */ | |
451 if (!(b = calloc((blen = 1), sizeof(*b)))) { | |
452 fprintf(stderr, "calloc: %s\n", strerror(errno)); | |
453 exit(1); | |
454 } | |
455 parse_file_with_callback(FILE_BIDI_BRACKETS, bracket_callback, N… | |
456 | |
457 properties_generate_break_property(bidi_property, LEN(bidi_prope… | |
458 fill_missing, NULL, post_proc… | |
459 "bidi", argv[0]); | |
460 | |
461 printf("\nenum bracket_type {\n\tBIDI_BRACKET_NONE,\n\t" | |
462 "BIDI_BRACKET_OPEN,\n\tBIDI_BRACKET_CLOSE,\n};\n\n" | |
463 "static const struct bracket {\n\tenum bracket_type type;… | |
464 "\tuint_least8_t class;\n} bidi_bracket[] = {\n"); | |
465 for (i = 0; i < blen; i++) { | |
466 printf("\t{\n\t\t.type = %s,\n\t\t.class = " | |
467 "%" PRIuLEAST8 ",\n\t},\n", | |
468 (b[i].type == 'o') ? "BIDI_BRACKET_OPEN" : | |
469 (b[i].type == 'c') ? "BIDI_BRACKET_CLOSE" : | |
470 "BIDI_BRACKET_NONE", | |
471 b[i].class); | |
472 } | |
473 printf("};\n"); | |
474 | |
475 /* | |
476 * allocate property buffer for all 0x110000 codepoints | |
477 * | |
478 * the buffers contain the offset from the "base" character | |
479 * to the respective mirrored character. By callocing we set all | |
480 * fields to zero, which is also the Unicode "default" in the se… | |
481 * that the coe point is its mirror (unless we fill it in) | |
482 */ | |
483 if (!(prop_mirror = calloc(UINT32_C(0x110000), sizeof(*prop_mirr… | |
484 fprintf(stderr, "calloc: %s\n", strerror(errno)); | |
485 exit(1); | |
486 } | |
487 parse_file_with_callback(FILE_BIDI_MIRRORING, mirror_callback, N… | |
488 | |
489 /* compress properties */ | |
490 properties_compress(prop_mirror, &comp_mirror); | |
491 | |
492 fprintf(stderr, "%s: mirror-LUT compression-ratio: %.2f%%\n", ar… | |
493 properties_get_major_minor(&comp_mirror, &mm_mirror)); | |
494 | |
495 /* print tables */ | |
496 properties_print_lookup_table("mirror_major", mm_mirror.major, 0… | |
497 printf("\n"); | |
498 properties_print_derived_lookup_table("mirror_minor", mm_mirror.… | |
499 mm_mirror.minorlen, get_va… | |
500 comp_mirror.data); | |
501 | |
502 free(comp_mirror.data); | |
503 free(comp_mirror.offset); | |
504 free(mm_mirror.major); | |
505 free(mm_mirror.minor); | |
506 | |
507 return 0; | |
508 } |