Introduction
Introduction Statistics Contact Development Disclaimer Help
bidirectional.c - libgrapheme - unicode string library
git clone git://git.suckless.org/libgrapheme
Log
Files
Refs
README
LICENSE
---
bidirectional.c (12515B)
---
1 /* See LICENSE file for copyright and license details. */
2 #include <errno.h>
3 #include <inttypes.h>
4 #include <stddef.h>
5 #include <stdio.h>
6 #include <stdlib.h>
7 #include <string.h>
8
9 #include "util.h"
10
11 #define FILE_BIDI_BRACKETS "data/BidiBrackets.txt"
12 #define FILE_BIDI_CLASS "data/DerivedBidiClass.txt"
13 #define FILE_BIDI_MIRRORING "data/BidiMirroring.txt"
14 #define FILE_UNICODE_DATA "data/UnicodeData.txt"
15
16 #define NUM_BRACKET_ALIASES 20
17
18 static const struct property_spec bidi_property[] = {
19 {
20 /* default */
21 .enumname = "L",
22 .file = FILE_BIDI_CLASS,
23 .ucdname = "L",
24 },
25 {
26 .enumname = "AL",
27 .file = FILE_BIDI_CLASS,
28 .ucdname = "AL",
29 },
30 {
31 .enumname = "AN",
32 .file = FILE_BIDI_CLASS,
33 .ucdname = "AN",
34 },
35 {
36 .enumname = "B",
37 .file = FILE_BIDI_CLASS,
38 .ucdname = "B",
39 },
40 {
41 .enumname = "BN",
42 .file = FILE_BIDI_CLASS,
43 .ucdname = "BN",
44 },
45 {
46 .enumname = "CS",
47 .file = FILE_BIDI_CLASS,
48 .ucdname = "CS",
49 },
50 {
51 .enumname = "EN",
52 .file = FILE_BIDI_CLASS,
53 .ucdname = "EN",
54 },
55 {
56 .enumname = "ES",
57 .file = FILE_BIDI_CLASS,
58 .ucdname = "ES",
59 },
60 {
61 .enumname = "ET",
62 .file = FILE_BIDI_CLASS,
63 .ucdname = "ET",
64 },
65 {
66 .enumname = "FSI",
67 .file = FILE_BIDI_CLASS,
68 .ucdname = "FSI",
69 },
70 {
71 .enumname = "LRE",
72 .file = FILE_BIDI_CLASS,
73 .ucdname = "LRE",
74 },
75 {
76 .enumname = "LRI",
77 .file = FILE_BIDI_CLASS,
78 .ucdname = "LRI",
79 },
80 {
81 .enumname = "LRO",
82 .file = FILE_BIDI_CLASS,
83 .ucdname = "LRO",
84 },
85 {
86 .enumname = "NSM",
87 .file = FILE_BIDI_CLASS,
88 .ucdname = "NSM",
89 },
90 {
91 .enumname = "ON",
92 .file = FILE_BIDI_CLASS,
93 .ucdname = "ON",
94 },
95 {
96 .enumname = "PDF",
97 .file = FILE_BIDI_CLASS,
98 .ucdname = "PDF",
99 },
100 {
101 .enumname = "PDI",
102 .file = FILE_BIDI_CLASS,
103 .ucdname = "PDI",
104 },
105 {
106 .enumname = "R",
107 .file = FILE_BIDI_CLASS,
108 .ucdname = "R",
109 },
110 {
111 .enumname = "RLE",
112 .file = FILE_BIDI_CLASS,
113 .ucdname = "RLE",
114 },
115 {
116 .enumname = "RLI",
117 .file = FILE_BIDI_CLASS,
118 .ucdname = "RLI",
119 },
120 {
121 .enumname = "RLO",
122 .file = FILE_BIDI_CLASS,
123 .ucdname = "RLO",
124 },
125 {
126 .enumname = "S",
127 .file = FILE_BIDI_CLASS,
128 .ucdname = "S",
129 },
130 {
131 .enumname = "WS",
132 .file = FILE_BIDI_CLASS,
133 .ucdname = "WS",
134 },
135 };
136
/*
 * Lookup record passed to decomposition_callback() through its payload
 * pointer: the caller fills in cp, the callback fills in decomposition.
 */
struct decomposition_payload {
	/* in: code point whose UnicodeData.txt line is wanted */
	uint_least32_t cp;
	/* out: its decomposition, or cp itself if it has none */
	uint_least32_t decomposition;
};
141
142 static int
143 decomposition_callback(const char *file, char **field, size_t nfields,
144 char *comment, void *payload)
145 {
146 char *p;
147 struct decomposition_payload *decomp =
148 (struct decomposition_payload *)payload;
149 uint_least32_t cp;
150
151 (void)file;
152 (void)comment;
153
154 if (nfields < 6) {
155 /* we have fewer than 6 fields, discard the line */
156 return 0;
157 }
158
159 hextocp(field[0], strlen(field[0]), &cp);
160
161 if (decomp->cp == cp) {
162 /* we hit the line that contains our decomposition targe…
163 if (strlen(field[5]) > 0) {
164 p = field[5];
165 if (*p == '<') {
166 /*
167 * the decomposition contains some metad…
168 * <...> we skip
169 */
170 for (; *p != '\0'; p++) {
171 if (*p == '>') {
172 p++;
173 while (*p == ' ') {
174 p++;
175 }
176 break;
177 }
178 }
179 }
180 hextocp(p, strlen(p), &(decomp->decomposition));
181 } else {
182 decomp->decomposition = decomp->cp;
183 }
184 }
185
186 return 0;
187 }
188
189 static struct {
190 uint_least32_t base[NUM_BRACKET_ALIASES];
191 size_t baselen;
192 uint_least32_t pair[NUM_BRACKET_ALIASES];
193 size_t pairlen;
194 uint_least8_t class;
195 char type;
196 } *b = NULL;
197
198 static size_t blen;
199 static uint_least8_t bracket_class_count = 1;
200
201 static int
202 bracket_callback(const char *file, char **field, size_t nfields, char *c…
203 void *payload)
204 {
205 size_t i, j;
206 struct decomposition_payload decomp_base, decomp_pair;
207 uint_least32_t cp_base, cp_pair;
208
209 (void)file;
210 (void)comment;
211 (void)payload;
212
213 if (nfields < 3) {
214 /* we have fewer than 3 fields, discard the line */
215 return 0;
216 }
217
218 /* parse field data */
219 hextocp(field[0], strlen(field[0]), &cp_base);
220 hextocp(field[1], strlen(field[1]), &cp_pair);
221
222 /* determine decomposition of the base and pair codepoints */
223 decomp_base.cp = cp_base;
224 decomp_pair.cp = cp_pair;
225 parse_file_with_callback(FILE_UNICODE_DATA, decomposition_callba…
226 &decomp_base);
227 parse_file_with_callback(FILE_UNICODE_DATA, decomposition_callba…
228 &decomp_pair);
229
230 /*
231 * check if we already have the canonical form in the bracket ar…
232 * per convention the canonical form is the first element of the…
233 * array
234 */
235 for (i = 0; i < blen; i++) {
236 if (decomp_base.decomposition == b[i].base[0]) {
237 /* we have a match, check type */
238 if (strlen(field[2]) != 1 ||
239 (field[2][0] != 'o' && field[2][0] != 'c')) {
240 /* malformed line */
241 return 1;
242 } else if (b[i].type != field[2][0]) {
243 /* mismatching types */
244 return 1;
245 }
246
247 /*
248 * add our base alias to the base array unless i…
249 * already in it
250 */
251 for (j = 0; j < b[i].baselen; j++) {
252 if (cp_base == b[i].base[j]) {
253 /* already in array, do nothing …
254 break;
255 }
256 }
257 if (j == b[i].baselen) {
258 /*
259 * the base alias is not already in the …
260 * add it
261 */
262 if (b[i].baselen == NUM_BRACKET_ALIASES)…
263 fprintf(stderr, "too many aliase…
264 return 1;
265 }
266 b[i].baselen++;
267 b[i].base[b[i].baselen - 1] = cp_base;
268 }
269
270 /*
271 * also add our pair alias to the pair array unl…
272 * it isn't already in it
273 */
274 for (j = 0; j < b[i].pairlen; j++) {
275 if (cp_pair == b[i].pair[j]) {
276 /* already in array, do nothing …
277 break;
278 }
279 }
280 if (j == b[i].pairlen) {
281 /*
282 * the pair alias is not already in the …
283 * add it
284 */
285 if (b[i].pairlen == NUM_BRACKET_ALIASES)…
286 fprintf(stderr, "too many aliase…
287 return 1;
288 }
289 b[i].pairlen++;
290 b[i].pair[b[i].pairlen - 1] = cp_pair;
291 }
292
293 return 0;
294 }
295 }
296
297 /* extend bracket pair array, as this is a new bracket type */
298 if (!(b = realloc(b, (++blen) * sizeof(*b)))) {
299 fprintf(stderr, "realloc: %s\n", strerror(errno));
300 exit(1);
301 }
302
303 /* fill field data by adding the canonical form first */
304 b[blen - 1].base[0] = decomp_base.decomposition;
305 b[blen - 1].baselen = 1;
306 b[blen - 1].pair[0] = decomp_pair.decomposition;
307 b[blen - 1].pairlen = 1;
308
309 /* add alias if it differs from the canonical form */
310 if (cp_base != decomp_base.decomposition) {
311 b[blen - 1].base[1] = cp_base;
312 b[blen - 1].baselen = 2;
313 }
314 if (cp_pair != decomp_pair.decomposition) {
315 b[blen - 1].pair[1] = cp_pair;
316 b[blen - 1].pairlen = 2;
317 }
318
319 /* add bracket type */
320 if (strlen(field[2]) != 1 ||
321 (field[2][0] != 'o' && field[2][0] != 'c')) {
322 /* malformed line */
323 return 1;
324 } else {
325 b[blen - 1].type = field[2][0];
326 }
327
328 /*
329 * determine bracket class by iterating over the bracket-array
330 * and seeing if our current canonical cp already has a matching…
331 * We only need to check the first entry in each bracket alias
332 * list, as this is, per convention, the canonical form.
333 * If not, add a new class.
334 */
335 for (i = 0; i + 1 < blen; i++) {
336 if (b[i].pair[0] == b[blen - 1].base[0]) {
337 /* matched class */
338 b[blen - 1].class = b[i].class;
339 break;
340 }
341 }
342 if (i + 1 == blen) {
343 /* no match, assign a new class */
344 b[blen - 1].class = bracket_class_count++;
345 }
346
347 return 0;
348 }
349
350 static void
351 post_process(struct properties *prop)
352 {
353 size_t i, j;
354
355 for (i = 0; i < blen; i++) {
356 /*
357 * given the base property fits in 5 bits, we simply
358 * store the bracket-offset in the bits above that.
359 *
360 * All those properties that are not set here implicitly
361 * have offset 0, which we prepared to contain a stub
362 * for a character that is not a bracket.
363 */
364 for (j = 0; j < b[i].baselen; j++) {
365 prop[b[i].base[j]].property |= (i << 5);
366 }
367 }
368 }
369
/*
 * Default bidi class of a code point that has no explicit entry, based
 * on the @missing-properties in data/DerivedBidiClass.txt.
 *
 * The return value is a position in bidi_property: 17 (R), 1 (AL),
 * 8 (ET), or 0 (L) for everything outside the listed ranges.
 */
static uint_least8_t
fill_missing(uint_least32_t cp)
{
	/* inclusive ranges, searched in order; the first hit wins */
	static const struct {
		uint_least32_t first;
		uint_least32_t last;
		uint_least8_t class;
	} range[] = {
		/* class R */
		{ UINT32_C(0x0590), UINT32_C(0x05FF), 17 },
		{ UINT32_C(0x07C0), UINT32_C(0x085F), 17 },
		{ UINT32_C(0xFB1D), UINT32_C(0xFB4F), 17 },
		{ UINT32_C(0x10800), UINT32_C(0x10CFF), 17 },
		{ UINT32_C(0x10D40), UINT32_C(0x10EBF), 17 },
		{ UINT32_C(0x10F00), UINT32_C(0x10F2F), 17 },
		{ UINT32_C(0x10F70), UINT32_C(0x10FFF), 17 },
		{ UINT32_C(0x1E800), UINT32_C(0x1EC6F), 17 },
		{ UINT32_C(0x1ECC0), UINT32_C(0x1ECFF), 17 },
		{ UINT32_C(0x1ED50), UINT32_C(0x1EDFF), 17 },
		{ UINT32_C(0x1EF00), UINT32_C(0x1EFFF), 17 },
		/* class AL */
		{ UINT32_C(0x0600), UINT32_C(0x07BF), 1 },
		{ UINT32_C(0x0860), UINT32_C(0x08FF), 1 },
		{ UINT32_C(0xFB50), UINT32_C(0xFDCF), 1 },
		{ UINT32_C(0xFDF0), UINT32_C(0xFDFF), 1 },
		{ UINT32_C(0xFE70), UINT32_C(0xFEFF), 1 },
		{ UINT32_C(0x10D00), UINT32_C(0x10D3F), 1 },
		{ UINT32_C(0x10EC0), UINT32_C(0x10EFF), 1 },
		{ UINT32_C(0x10F30), UINT32_C(0x10F6F), 1 },
		{ UINT32_C(0x1EC70), UINT32_C(0x1ECBF), 1 },
		{ UINT32_C(0x1ED00), UINT32_C(0x1ED4F), 1 },
		{ UINT32_C(0x1EE00), UINT32_C(0x1EEFF), 1 },
		/* class ET */
		{ UINT32_C(0x20A0), UINT32_C(0x20CF), 8 },
	};
	size_t i;

	for (i = 0; i < sizeof(range) / sizeof(*range); i++) {
		if (cp >= range[i].first && cp <= range[i].last) {
			return range[i].class;
		}
	}

	return 0; /* class L */
}
404
405 static struct properties *prop_mirror = NULL;
406
407 static int
408 mirror_callback(const char *file, char **field, size_t nfields, char *co…
409 void *payload)
410 {
411 uint_least32_t cp, cp_mirror;
412
413 (void)file;
414 (void)comment;
415 (void)payload;
416
417 hextocp(field[0], strlen(field[0]), &cp);
418
419 cp_mirror = cp;
420
421 if (nfields >= 2 && strlen(field[1]) > 0 &&
422 hextocp(field[1], strlen(field[1]), &cp_mirror)) {
423 return 1;
424 }
425
426 prop_mirror[cp].property = (int_least32_t)cp_mirror - (int_least…
427
428 return 0;
429 }
430
431 static int_least64_t
432 get_value(const struct properties *prop, size_t offset)
433 {
434 return prop[offset].property;
435 }
436
437 int
438 main(int argc, char *argv[])
439 {
440 struct properties_compressed comp_mirror;
441 struct properties_major_minor mm_mirror;
442 size_t i;
443
444 (void)argc;
445
446 /*
447 * the first element in the bracket array is initialized to
448 * all-zeros, as we use the implicit 0-offset for all those
449 * codepoints that are not a bracket
450 */
451 if (!(b = calloc((blen = 1), sizeof(*b)))) {
452 fprintf(stderr, "calloc: %s\n", strerror(errno));
453 exit(1);
454 }
455 parse_file_with_callback(FILE_BIDI_BRACKETS, bracket_callback, N…
456
457 properties_generate_break_property(bidi_property, LEN(bidi_prope…
458 fill_missing, NULL, post_proc…
459 "bidi", argv[0]);
460
461 printf("\nenum bracket_type {\n\tBIDI_BRACKET_NONE,\n\t"
462 "BIDI_BRACKET_OPEN,\n\tBIDI_BRACKET_CLOSE,\n};\n\n"
463 "static const struct bracket {\n\tenum bracket_type type;…
464 "\tuint_least8_t class;\n} bidi_bracket[] = {\n");
465 for (i = 0; i < blen; i++) {
466 printf("\t{\n\t\t.type = %s,\n\t\t.class = "
467 "%" PRIuLEAST8 ",\n\t},\n",
468 (b[i].type == 'o') ? "BIDI_BRACKET_OPEN" :
469 (b[i].type == 'c') ? "BIDI_BRACKET_CLOSE" :
470 "BIDI_BRACKET_NONE",
471 b[i].class);
472 }
473 printf("};\n");
474
475 /*
476 * allocate property buffer for all 0x110000 codepoints
477 *
478 * the buffers contain the offset from the "base" character
479 * to the respective mirrored character. By callocing we set all
480 * fields to zero, which is also the Unicode "default" in the se…
481 * that the code point is its mirror (unless we fill it in)
482 */
483 if (!(prop_mirror = calloc(UINT32_C(0x110000), sizeof(*prop_mirr…
484 fprintf(stderr, "calloc: %s\n", strerror(errno));
485 exit(1);
486 }
487 parse_file_with_callback(FILE_BIDI_MIRRORING, mirror_callback, N…
488
489 /* compress properties */
490 properties_compress(prop_mirror, &comp_mirror);
491
492 fprintf(stderr, "%s: mirror-LUT compression-ratio: %.2f%%\n", ar…
493 properties_get_major_minor(&comp_mirror, &mm_mirror));
494
495 /* print tables */
496 properties_print_lookup_table("mirror_major", mm_mirror.major, 0…
497 printf("\n");
498 properties_print_derived_lookup_table("mirror_minor", mm_mirror.…
499 mm_mirror.minorlen, get_va…
500 comp_mirror.data);
501
502 free(comp_mirror.data);
503 free(comp_mirror.offset);
504 free(mm_mirror.major);
505 free(mm_mirror.minor);
506
507 return 0;
508 }
You are viewing proxied material from suckless.org. The copyright of proxied material belongs to its original authors. Any comments or complaints in relation to proxied material should be directed to the original authors of the content concerned. Please see the disclaimer for more details.