| bidirectional.c - libgrapheme - unicode string library | |
| git clone git://git.suckless.org/libgrapheme | |
| Log | |
| Files | |
| Refs | |
| README | |
| LICENSE | |
| --- | |
| bidirectional.c (12515B) | |
| --- | |
| 1 /* See LICENSE file for copyright and license details. */ | |
| 2 #include <errno.h> | |
| 3 #include <inttypes.h> | |
| 4 #include <stddef.h> | |
| 5 #include <stdio.h> | |
| 6 #include <stdlib.h> | |
| 7 #include <string.h> | |
| 8 | |
| 9 #include "util.h" | |
| 10 | |
/* relative paths of the Unicode data files this generator parses */
#define FILE_BIDI_BRACKETS  "data/BidiBrackets.txt"
#define FILE_BIDI_CLASS     "data/DerivedBidiClass.txt"
#define FILE_BIDI_MIRRORING "data/BidiMirroring.txt"
#define FILE_UNICODE_DATA   "data/UnicodeData.txt"

/* capacity of the base[] and pair[] alias lists of one bracket entry */
#define NUM_BRACKET_ALIASES 20
| 17 | |
| 18 static const struct property_spec bidi_property[] = { | |
| 19 { | |
| 20 /* default */ | |
| 21 .enumname = "L", | |
| 22 .file = FILE_BIDI_CLASS, | |
| 23 .ucdname = "L", | |
| 24 }, | |
| 25 { | |
| 26 .enumname = "AL", | |
| 27 .file = FILE_BIDI_CLASS, | |
| 28 .ucdname = "AL", | |
| 29 }, | |
| 30 { | |
| 31 .enumname = "AN", | |
| 32 .file = FILE_BIDI_CLASS, | |
| 33 .ucdname = "AN", | |
| 34 }, | |
| 35 { | |
| 36 .enumname = "B", | |
| 37 .file = FILE_BIDI_CLASS, | |
| 38 .ucdname = "B", | |
| 39 }, | |
| 40 { | |
| 41 .enumname = "BN", | |
| 42 .file = FILE_BIDI_CLASS, | |
| 43 .ucdname = "BN", | |
| 44 }, | |
| 45 { | |
| 46 .enumname = "CS", | |
| 47 .file = FILE_BIDI_CLASS, | |
| 48 .ucdname = "CS", | |
| 49 }, | |
| 50 { | |
| 51 .enumname = "EN", | |
| 52 .file = FILE_BIDI_CLASS, | |
| 53 .ucdname = "EN", | |
| 54 }, | |
| 55 { | |
| 56 .enumname = "ES", | |
| 57 .file = FILE_BIDI_CLASS, | |
| 58 .ucdname = "ES", | |
| 59 }, | |
| 60 { | |
| 61 .enumname = "ET", | |
| 62 .file = FILE_BIDI_CLASS, | |
| 63 .ucdname = "ET", | |
| 64 }, | |
| 65 { | |
| 66 .enumname = "FSI", | |
| 67 .file = FILE_BIDI_CLASS, | |
| 68 .ucdname = "FSI", | |
| 69 }, | |
| 70 { | |
| 71 .enumname = "LRE", | |
| 72 .file = FILE_BIDI_CLASS, | |
| 73 .ucdname = "LRE", | |
| 74 }, | |
| 75 { | |
| 76 .enumname = "LRI", | |
| 77 .file = FILE_BIDI_CLASS, | |
| 78 .ucdname = "LRI", | |
| 79 }, | |
| 80 { | |
| 81 .enumname = "LRO", | |
| 82 .file = FILE_BIDI_CLASS, | |
| 83 .ucdname = "LRO", | |
| 84 }, | |
| 85 { | |
| 86 .enumname = "NSM", | |
| 87 .file = FILE_BIDI_CLASS, | |
| 88 .ucdname = "NSM", | |
| 89 }, | |
| 90 { | |
| 91 .enumname = "ON", | |
| 92 .file = FILE_BIDI_CLASS, | |
| 93 .ucdname = "ON", | |
| 94 }, | |
| 95 { | |
| 96 .enumname = "PDF", | |
| 97 .file = FILE_BIDI_CLASS, | |
| 98 .ucdname = "PDF", | |
| 99 }, | |
| 100 { | |
| 101 .enumname = "PDI", | |
| 102 .file = FILE_BIDI_CLASS, | |
| 103 .ucdname = "PDI", | |
| 104 }, | |
| 105 { | |
| 106 .enumname = "R", | |
| 107 .file = FILE_BIDI_CLASS, | |
| 108 .ucdname = "R", | |
| 109 }, | |
| 110 { | |
| 111 .enumname = "RLE", | |
| 112 .file = FILE_BIDI_CLASS, | |
| 113 .ucdname = "RLE", | |
| 114 }, | |
| 115 { | |
| 116 .enumname = "RLI", | |
| 117 .file = FILE_BIDI_CLASS, | |
| 118 .ucdname = "RLI", | |
| 119 }, | |
| 120 { | |
| 121 .enumname = "RLO", | |
| 122 .file = FILE_BIDI_CLASS, | |
| 123 .ucdname = "RLO", | |
| 124 }, | |
| 125 { | |
| 126 .enumname = "S", | |
| 127 .file = FILE_BIDI_CLASS, | |
| 128 .ucdname = "S", | |
| 129 }, | |
| 130 { | |
| 131 .enumname = "WS", | |
| 132 .file = FILE_BIDI_CLASS, | |
| 133 .ucdname = "WS", | |
| 134 }, | |
| 135 }; | |
| 136 | |
/*
 * Request/response record handed to decomposition_callback() as its
 * payload: the caller sets cp, the callback fills in decomposition
 * when it encounters the line describing cp.
 */
struct decomposition_payload {
	uint_least32_t cp;            /* in: codepoint to look up */
	uint_least32_t decomposition; /* out: its decomposition (or cp itself) */
};
| 141 | |
| 142 static int | |
| 143 decomposition_callback(const char *file, char **field, size_t nfields, | |
| 144 char *comment, void *payload) | |
| 145 { | |
| 146 char *p; | |
| 147 struct decomposition_payload *decomp = | |
| 148 (struct decomposition_payload *)payload; | |
| 149 uint_least32_t cp; | |
| 150 | |
| 151 (void)file; | |
| 152 (void)comment; | |
| 153 | |
| 154 if (nfields < 6) { | |
| 155 /* we have fewer than 6 fields, discard the line */ | |
| 156 return 0; | |
| 157 } | |
| 158 | |
| 159 hextocp(field[0], strlen(field[0]), &cp); | |
| 160 | |
| 161 if (decomp->cp == cp) { | |
| 162 /* we hit the line that contains our decomposition targe… | |
| 163 if (strlen(field[5]) > 0) { | |
| 164 p = field[5]; | |
| 165 if (*p == '<') { | |
| 166 /* | |
| 167 * the decomposition contains some metad… | |
| 168 * <...> we skip | |
| 169 */ | |
| 170 for (; *p != '\0'; p++) { | |
| 171 if (*p == '>') { | |
| 172 p++; | |
| 173 while (*p == ' ') { | |
| 174 p++; | |
| 175 } | |
| 176 break; | |
| 177 } | |
| 178 } | |
| 179 } | |
| 180 hextocp(p, strlen(p), &(decomp->decomposition)); | |
| 181 } else { | |
| 182 decomp->decomposition = decomp->cp; | |
| 183 } | |
| 184 } | |
| 185 | |
| 186 return 0; | |
| 187 } | |
| 188 | |
/*
 * Bracket table built by bracket_callback(). Each element groups one
 * canonical bracket with its aliases; element 0 is an all-zero stub
 * (allocated in main()) that stands for "not a bracket".
 */
static struct {
	/* base[0] is the canonical form, base[1..baselen) are aliases */
	uint_least32_t base[NUM_BRACKET_ALIASES];
	size_t baselen;
	/* pair[0] is the canonical counterpart, followed by its aliases */
	uint_least32_t pair[NUM_BRACKET_ALIASES];
	size_t pairlen;
	/* shared by an entry and the entry whose pair[0] is its base[0] */
	uint_least8_t class;
	/* 'o' or 'c' as read from the data file; 0 for the stub */
	char type;
} *b = NULL;

/* number of elements in b */
static size_t blen;
/* next unused class number; 0 is what the zeroed stub carries */
static uint_least8_t bracket_class_count = 1;
| 200 | |
| 201 static int | |
| 202 bracket_callback(const char *file, char **field, size_t nfields, char *c… | |
| 203 void *payload) | |
| 204 { | |
| 205 size_t i, j; | |
| 206 struct decomposition_payload decomp_base, decomp_pair; | |
| 207 uint_least32_t cp_base, cp_pair; | |
| 208 | |
| 209 (void)file; | |
| 210 (void)comment; | |
| 211 (void)payload; | |
| 212 | |
| 213 if (nfields < 3) { | |
| 214 /* we have fewer than 3 fields, discard the line */ | |
| 215 return 0; | |
| 216 } | |
| 217 | |
| 218 /* parse field data */ | |
| 219 hextocp(field[0], strlen(field[0]), &cp_base); | |
| 220 hextocp(field[1], strlen(field[1]), &cp_pair); | |
| 221 | |
| 222 /* determine decomposition of the base and pair codepoints */ | |
| 223 decomp_base.cp = cp_base; | |
| 224 decomp_pair.cp = cp_pair; | |
| 225 parse_file_with_callback(FILE_UNICODE_DATA, decomposition_callba… | |
| 226 &decomp_base); | |
| 227 parse_file_with_callback(FILE_UNICODE_DATA, decomposition_callba… | |
| 228 &decomp_pair); | |
| 229 | |
| 230 /* | |
| 231 * check if we already have the canonical form in the bracket ar… | |
| 232 * per convention the canonical form is the first element of the… | |
| 233 * array | |
| 234 */ | |
| 235 for (i = 0; i < blen; i++) { | |
| 236 if (decomp_base.decomposition == b[i].base[0]) { | |
| 237 /* we have a match, check type */ | |
| 238 if (strlen(field[2]) != 1 || | |
| 239 (field[2][0] != 'o' && field[2][0] != 'c')) { | |
| 240 /* malformed line */ | |
| 241 return 1; | |
| 242 } else if (b[i].type != field[2][0]) { | |
| 243 /* mismatching types */ | |
| 244 return 1; | |
| 245 } | |
| 246 | |
| 247 /* | |
| 248 * add our base alias to the base array unless i… | |
| 249 * already in it | |
| 250 */ | |
| 251 for (j = 0; j < b[i].baselen; j++) { | |
| 252 if (cp_base == b[i].base[j]) { | |
| 253 /* already in array, do nothing … | |
| 254 break; | |
| 255 } | |
| 256 } | |
| 257 if (j == b[i].baselen) { | |
| 258 /* | |
| 259 * the base alias is not already in the … | |
| 260 * add it | |
| 261 */ | |
| 262 if (b[i].baselen == NUM_BRACKET_ALIASES)… | |
| 263 fprintf(stderr, "too many aliase… | |
| 264 return 1; | |
| 265 } | |
| 266 b[i].baselen++; | |
| 267 b[i].base[b[i].baselen - 1] = cp_base; | |
| 268 } | |
| 269 | |
| 270 /* | |
| 271 * also add our pair alias to the pair array unl… | |
| 272 * it isn't already in it | |
| 273 */ | |
| 274 for (j = 0; j < b[i].pairlen; j++) { | |
| 275 if (cp_pair == b[i].pair[j]) { | |
| 276 /* already in array, do nothing … | |
| 277 break; | |
| 278 } | |
| 279 } | |
| 280 if (j == b[i].pairlen) { | |
| 281 /* | |
| 282 * the pair alias is not already in the … | |
| 283 * add it | |
| 284 */ | |
| 285 if (b[i].pairlen == NUM_BRACKET_ALIASES)… | |
| 286 fprintf(stderr, "too many aliase… | |
| 287 return 1; | |
| 288 } | |
| 289 b[i].pairlen++; | |
| 290 b[i].pair[b[i].pairlen - 1] = cp_pair; | |
| 291 } | |
| 292 | |
| 293 return 0; | |
| 294 } | |
| 295 } | |
| 296 | |
| 297 /* extend bracket pair array, as this is a new bracket type */ | |
| 298 if (!(b = realloc(b, (++blen) * sizeof(*b)))) { | |
| 299 fprintf(stderr, "realloc: %s\n", strerror(errno)); | |
| 300 exit(1); | |
| 301 } | |
| 302 | |
| 303 /* fill field data by adding the canonical form first */ | |
| 304 b[blen - 1].base[0] = decomp_base.decomposition; | |
| 305 b[blen - 1].baselen = 1; | |
| 306 b[blen - 1].pair[0] = decomp_pair.decomposition; | |
| 307 b[blen - 1].pairlen = 1; | |
| 308 | |
| 309 /* add alias if it differs from the canonical form */ | |
| 310 if (cp_base != decomp_base.decomposition) { | |
| 311 b[blen - 1].base[1] = cp_base; | |
| 312 b[blen - 1].baselen = 2; | |
| 313 } | |
| 314 if (cp_pair != decomp_pair.decomposition) { | |
| 315 b[blen - 1].pair[1] = cp_pair; | |
| 316 b[blen - 1].pairlen = 2; | |
| 317 } | |
| 318 | |
| 319 /* add bracket type */ | |
| 320 if (strlen(field[2]) != 1 || | |
| 321 (field[2][0] != 'o' && field[2][0] != 'c')) { | |
| 322 /* malformed line */ | |
| 323 return 1; | |
| 324 } else { | |
| 325 b[blen - 1].type = field[2][0]; | |
| 326 } | |
| 327 | |
| 328 /* | |
| 329 * determine bracket class by iterating over the bracket-array | |
| 330 * and seeing if our current canonical cp already has a matching… | |
| 331 * We only need to check the first entry in each bracket alias | |
| 332 * list, as this is, per convention, the canonical form. | |
| 333 * If not, add a new class. | |
| 334 */ | |
| 335 for (i = 0; i + 1 < blen; i++) { | |
| 336 if (b[i].pair[0] == b[blen - 1].base[0]) { | |
| 337 /* matched class */ | |
| 338 b[blen - 1].class = b[i].class; | |
| 339 break; | |
| 340 } | |
| 341 } | |
| 342 if (i + 1 == blen) { | |
| 343 /* no match, assign a new class */ | |
| 344 b[blen - 1].class = bracket_class_count++; | |
| 345 } | |
| 346 | |
| 347 return 0; | |
| 348 } | |
| 349 | |
| 350 static void | |
| 351 post_process(struct properties *prop) | |
| 352 { | |
| 353 size_t i, j; | |
| 354 | |
| 355 for (i = 0; i < blen; i++) { | |
| 356 /* | |
| 357 * given the base property fits in 5 bits, we simply | |
| 358 * store the bracket-offset in the bits above that. | |
| 359 * | |
| 360 * All those properties that are not set here implicitly | |
| 361 * have offset 0, which we prepared to contain a stub | |
| 362 * for a character that is not a bracket. | |
| 363 */ | |
| 364 for (j = 0; j < b[i].baselen; j++) { | |
| 365 prop[b[i].base[j]].property |= (i << 5); | |
| 366 } | |
| 367 } | |
| 368 } | |
| 369 | |
/*
 * Return the default bidi class, as a position in bidi_property, for a
 * codepoint that has no explicit class assigned.
 *
 * The ranges are based on the @missing-properties in
 * data/DerivedBidiClass.txt. Codepoints outside all ranges default to
 * class L (0).
 */
static uint_least8_t
fill_missing(uint_least32_t cp)
{
	static const struct {
		uint_least32_t lower;
		uint_least32_t upper;
		uint_least8_t class;
	} range[] = {
		/* class R */
		{ UINT32_C(0x0590),  UINT32_C(0x05FF),  17 },
		{ UINT32_C(0x07C0),  UINT32_C(0x085F),  17 },
		{ UINT32_C(0xFB1D),  UINT32_C(0xFB4F),  17 },
		{ UINT32_C(0x10800), UINT32_C(0x10CFF), 17 },
		{ UINT32_C(0x10D40), UINT32_C(0x10EBF), 17 },
		{ UINT32_C(0x10F00), UINT32_C(0x10F2F), 17 },
		{ UINT32_C(0x10F70), UINT32_C(0x10FFF), 17 },
		{ UINT32_C(0x1E800), UINT32_C(0x1EC6F), 17 },
		{ UINT32_C(0x1ECC0), UINT32_C(0x1ECFF), 17 },
		{ UINT32_C(0x1ED50), UINT32_C(0x1EDFF), 17 },
		{ UINT32_C(0x1EF00), UINT32_C(0x1EFFF), 17 },
		/* class AL */
		{ UINT32_C(0x0600),  UINT32_C(0x07BF),  1 },
		{ UINT32_C(0x0860),  UINT32_C(0x08FF),  1 },
		{ UINT32_C(0xFB50),  UINT32_C(0xFDCF),  1 },
		{ UINT32_C(0xFDF0),  UINT32_C(0xFDFF),  1 },
		{ UINT32_C(0xFE70),  UINT32_C(0xFEFF),  1 },
		{ UINT32_C(0x10D00), UINT32_C(0x10D3F), 1 },
		{ UINT32_C(0x10EC0), UINT32_C(0x10EFF), 1 },
		{ UINT32_C(0x10F30), UINT32_C(0x10F6F), 1 },
		{ UINT32_C(0x1EC70), UINT32_C(0x1ECBF), 1 },
		{ UINT32_C(0x1ED00), UINT32_C(0x1ED4F), 1 },
		{ UINT32_C(0x1EE00), UINT32_C(0x1EEFF), 1 },
		/* class ET */
		{ UINT32_C(0x20A0),  UINT32_C(0x20CF),  8 },
	};
	size_t i;

	/* the ranges are disjoint, so the first hit is the only hit */
	for (i = 0; i < sizeof(range) / sizeof(*range); i++) {
		if (cp >= range[i].lower && cp <= range[i].upper) {
			return range[i].class;
		}
	}

	return 0; /* class L */
}
| 404 | |
| 405 static struct properties *prop_mirror = NULL; | |
| 406 | |
| 407 static int | |
| 408 mirror_callback(const char *file, char **field, size_t nfields, char *co… | |
| 409 void *payload) | |
| 410 { | |
| 411 uint_least32_t cp, cp_mirror; | |
| 412 | |
| 413 (void)file; | |
| 414 (void)comment; | |
| 415 (void)payload; | |
| 416 | |
| 417 hextocp(field[0], strlen(field[0]), &cp); | |
| 418 | |
| 419 cp_mirror = cp; | |
| 420 | |
| 421 if (nfields >= 2 && strlen(field[1]) > 0 && | |
| 422 hextocp(field[1], strlen(field[1]), &cp_mirror)) { | |
| 423 return 1; | |
| 424 } | |
| 425 | |
| 426 prop_mirror[cp].property = (int_least32_t)cp_mirror - (int_least… | |
| 427 | |
| 428 return 0; | |
| 429 } | |
| 430 | |
| 431 static int_least64_t | |
| 432 get_value(const struct properties *prop, size_t offset) | |
| 433 { | |
| 434 return prop[offset].property; | |
| 435 } | |
| 436 | |
/*
 * Generator entry point. Builds the bracket table from
 * FILE_BIDI_BRACKETS, emits the bidi property tables, the bidi_bracket
 * array and the compressed mirror lookup tables on stdout, and reports
 * the mirror table compression ratio on stderr. argv[0] is only used
 * to label output.
 *
 * NOTE(review): this function was restored from a listing in which
 * several long lines were cut off; the places where the missing text
 * could not be derived from the rest of the file are marked below.
 */
int
main(int argc, char *argv[])
{
	struct properties_compressed comp_mirror;
	struct properties_major_minor mm_mirror;
	size_t i;

	(void)argc;

	/*
	 * the first element in the bracket array is initialized to
	 * all-zeros, as we use the implicit 0-offset for all those
	 * codepoints that are not a bracket
	 */
	if (!(b = calloc((blen = 1), sizeof(*b)))) {
		fprintf(stderr, "calloc: %s\n", strerror(errno));
		exit(1);
	}
	parse_file_with_callback(FILE_BIDI_BRACKETS, bracket_callback, NULL);

	properties_generate_break_property(bidi_property, LEN(bidi_property),
	                                   fill_missing, NULL, post_process,
	                                   "bidi", argv[0]);

	/*
	 * emit one bidi_bracket element per entry of b, in the same
	 * order, so the index stored by post_process() selects it
	 *
	 * NOTE(review): the third string literal was truncated after
	 * "type;" in the listing; the trailing "\n" is assumed
	 */
	printf("\nenum bracket_type {\n\tBIDI_BRACKET_NONE,\n\t"
	       "BIDI_BRACKET_OPEN,\n\tBIDI_BRACKET_CLOSE,\n};\n\n"
	       "static const struct bracket {\n\tenum bracket_type type;\n"
	       "\tuint_least8_t class;\n} bidi_bracket[] = {\n");
	for (i = 0; i < blen; i++) {
		printf("\t{\n\t\t.type = %s,\n\t\t.class = "
		       "%" PRIuLEAST8 ",\n\t},\n",
		       (b[i].type == 'o') ? "BIDI_BRACKET_OPEN" :
		       (b[i].type == 'c') ? "BIDI_BRACKET_CLOSE" :
		                            "BIDI_BRACKET_NONE",
		       b[i].class);
	}
	printf("};\n");

	/*
	 * allocate property buffer for all 0x110000 codepoints
	 *
	 * the buffers contain the offset from the "base" character
	 * to the respective mirrored character. By callocing we set all
	 * fields to zero, which is also the Unicode "default" in the sense
	 * that the code point is its mirror (unless we fill it in)
	 */
	if (!(prop_mirror = calloc(UINT32_C(0x110000), sizeof(*prop_mirror)))) {
		fprintf(stderr, "calloc: %s\n", strerror(errno));
		exit(1);
	}
	parse_file_with_callback(FILE_BIDI_MIRRORING, mirror_callback, NULL);

	/* compress properties */
	properties_compress(prop_mirror, &comp_mirror);

	fprintf(stderr, "%s: mirror-LUT compression-ratio: %.2f%%\n", argv[0],
	        properties_get_major_minor(&comp_mirror, &mm_mirror));

	/*
	 * print tables
	 *
	 * NOTE(review): the length argument of the major table (0x1100)
	 * and the member passed as minor table (mm_mirror.minor) were
	 * truncated in the listing and are assumptions to be confirmed
	 */
	properties_print_lookup_table("mirror_major", mm_mirror.major, 0x1100);
	printf("\n");
	properties_print_derived_lookup_table("mirror_minor", mm_mirror.minor,
	                                      mm_mirror.minorlen, get_value,
	                                      comp_mirror.data);

	free(comp_mirror.data);
	free(comp_mirror.offset);
	free(mm_mirror.major);
	free(mm_mirror.minor);

	return 0;
}