| line.c - libgrapheme - unicode string library | |
| git clone git://git.suckless.org/libgrapheme | |
| Log | |
| Files | |
| Refs | |
| README | |
| LICENSE | |
| --- | |
| line.c (11273B) | |
| --- | |
| 1 /* See LICENSE file for copyright and license details. */ | |
| 2 #include <stdio.h> | |
| 3 #include <stdlib.h> | |
| 4 #include <string.h> | |
| 5 | |
| 6 #include "util.h" | |
| 7 | |
| 8 #define FILE_EAW "data/EastAsianWidth.txt" | |
| 9 #define FILE_EMOJI "data/emoji-data.txt" | |
| 10 #define FILE_LINE "data/LineBreak.txt" | |
| 11 | |
| 12 static const struct property_spec line_break_property[] = { | |
| 13 { | |
| 14 .enumname = "AL", | |
| 15 .file = FILE_LINE, | |
| 16 .ucdname = "AL", | |
| 17 }, | |
| 18 /* | |
| 19 * Both extended pictographic and cn are large classes, | |
| 20 * but we are only interested in their intersection for LB30b, | |
| 21 * so we have the following two temporary classes. At first | |
| 22 * the extpict-class is filled, then the cn-class, which leads | |
| 23 * to conflicts (that we handle by putting them in the "proper" | |
| 24 * class BOTH_CN_EXTPICT). We make use of the fact that there | |
| 25 * is no intersection between AL and Cn. | |
| 26 * | |
| 27 * Any consecutive conflicts are permitted to overwrite | |
| 28 * TMP_EXTENDED_PICTOGRAPHIC and TMP_CN, because we don't need | |
| 29 * them, and in the final postprocessing we "reset" all | |
| 30 * remaining matches (that then didn't fit any of the other | |
| 31 * classes) to the generic class AL. | |
| 32 */ | |
| 33 { | |
| 34 .enumname = "TMP_CN", | |
| 35 .file = FILE_LINE, | |
| 36 .ucdname = "Cn", | |
| 37 }, | |
| 38 { | |
| 39 .enumname = "TMP_EXTENDED_PICTOGRAPHIC", | |
| 40 .file = FILE_EMOJI, | |
| 41 .ucdname = "Extended_Pictographic", | |
| 42 }, | |
| 43 /* end of special block */ | |
| 44 { | |
| 45 .enumname = "B2", | |
| 46 .file = FILE_LINE, | |
| 47 .ucdname = "B2", | |
| 48 }, | |
| 49 { | |
| 50 .enumname = "BA", | |
| 51 .file = FILE_LINE, | |
| 52 .ucdname = "BA", | |
| 53 }, | |
| 54 { | |
| 55 .enumname = "BB", | |
| 56 .file = FILE_LINE, | |
| 57 .ucdname = "BB", | |
| 58 }, | |
| 59 { | |
| 60 .enumname = "BK", | |
| 61 .file = FILE_LINE, | |
| 62 .ucdname = "BK", | |
| 63 }, | |
| 64 { | |
| 65 .enumname = "BOTH_CN_EXTPICT", | |
| 66 .file = NULL, | |
| 67 .ucdname = NULL, | |
| 68 }, | |
| 69 { | |
| 70 .enumname = "CB", | |
| 71 .file = FILE_LINE, | |
| 72 .ucdname = "CB", | |
| 73 }, | |
| 74 { | |
| 75 .enumname = "CL", | |
| 76 .file = FILE_LINE, | |
| 77 .ucdname = "CL", | |
| 78 }, | |
| 79 { | |
| 80 .enumname = "CM", | |
| 81 .file = FILE_LINE, | |
| 82 .ucdname = "CM", | |
| 83 }, | |
| 84 { | |
| 85 .enumname = "CP_WITHOUT_EAW_HWF", | |
| 86 .file = FILE_LINE, | |
| 87 .ucdname = "CP", | |
| 88 }, | |
| 89 { | |
| 90 .enumname = "CP_WITH_EAW_HWF", | |
| 91 .file = NULL, | |
| 92 .ucdname = NULL, | |
| 93 }, | |
| 94 { | |
| 95 .enumname = "CR", | |
| 96 .file = FILE_LINE, | |
| 97 .ucdname = "CR", | |
| 98 }, | |
| 99 { | |
| 100 .enumname = "EB", | |
| 101 .file = FILE_LINE, | |
| 102 .ucdname = "EB", | |
| 103 }, | |
| 104 { | |
| 105 .enumname = "EM", | |
| 106 .file = FILE_LINE, | |
| 107 .ucdname = "EM", | |
| 108 }, | |
| 109 { | |
| 110 .enumname = "EX", | |
| 111 .file = FILE_LINE, | |
| 112 .ucdname = "EX", | |
| 113 }, | |
| 114 { | |
| 115 .enumname = "GL", | |
| 116 .file = FILE_LINE, | |
| 117 .ucdname = "GL", | |
| 118 }, | |
| 119 { | |
| 120 .enumname = "H2", | |
| 121 .file = FILE_LINE, | |
| 122 .ucdname = "H2", | |
| 123 }, | |
| 124 { | |
| 125 .enumname = "H3", | |
| 126 .file = FILE_LINE, | |
| 127 .ucdname = "H3", | |
| 128 }, | |
| 129 { | |
| 130 .enumname = "HL", | |
| 131 .file = FILE_LINE, | |
| 132 .ucdname = "HL", | |
| 133 }, | |
| 134 { | |
| 135 .enumname = "HY", | |
| 136 .file = FILE_LINE, | |
| 137 .ucdname = "HY", | |
| 138 }, | |
| 139 { | |
| 140 .enumname = "ID", | |
| 141 .file = FILE_LINE, | |
| 142 .ucdname = "ID", | |
| 143 }, | |
| 144 { | |
| 145 .enumname = "IN", | |
| 146 .file = FILE_LINE, | |
| 147 .ucdname = "IN", | |
| 148 }, | |
| 149 { | |
| 150 .enumname = "IS", | |
| 151 .file = FILE_LINE, | |
| 152 .ucdname = "IS", | |
| 153 }, | |
| 154 { | |
| 155 .enumname = "JL", | |
| 156 .file = FILE_LINE, | |
| 157 .ucdname = "JL", | |
| 158 }, | |
| 159 { | |
| 160 .enumname = "JT", | |
| 161 .file = FILE_LINE, | |
| 162 .ucdname = "JT", | |
| 163 }, | |
| 164 { | |
| 165 .enumname = "JV", | |
| 166 .file = FILE_LINE, | |
| 167 .ucdname = "JV", | |
| 168 }, | |
| 169 { | |
| 170 .enumname = "LF", | |
| 171 .file = FILE_LINE, | |
| 172 .ucdname = "LF", | |
| 173 }, | |
| 174 { | |
| 175 .enumname = "NL", | |
| 176 .file = FILE_LINE, | |
| 177 .ucdname = "NL", | |
| 178 }, | |
| 179 { | |
| 180 .enumname = "NS", | |
| 181 .file = FILE_LINE, | |
| 182 .ucdname = "NS", | |
| 183 }, | |
| 184 { | |
| 185 .enumname = "NU", | |
| 186 .file = FILE_LINE, | |
| 187 .ucdname = "NU", | |
| 188 }, | |
| 189 { | |
| 190 .enumname = "OP_WITHOUT_EAW_HWF", | |
| 191 .file = FILE_LINE, | |
| 192 .ucdname = "OP", | |
| 193 }, | |
| 194 { | |
| 195 .enumname = "OP_WITH_EAW_HWF", | |
| 196 .file = NULL, | |
| 197 .ucdname = NULL, | |
| 198 }, | |
| 199 { | |
| 200 .enumname = "PO", | |
| 201 .file = FILE_LINE, | |
| 202 .ucdname = "PO", | |
| 203 }, | |
| 204 { | |
| 205 .enumname = "PR", | |
| 206 .file = FILE_LINE, | |
| 207 .ucdname = "PR", | |
| 208 }, | |
| 209 { | |
| 210 .enumname = "QU", | |
| 211 .file = FILE_LINE, | |
| 212 .ucdname = "QU", | |
| 213 }, | |
| 214 { | |
| 215 .enumname = "RI", | |
| 216 .file = FILE_LINE, | |
| 217 .ucdname = "RI", | |
| 218 }, | |
| 219 { | |
| 220 .enumname = "SP", | |
| 221 .file = FILE_LINE, | |
| 222 .ucdname = "SP", | |
| 223 }, | |
| 224 { | |
| 225 .enumname = "SY", | |
| 226 .file = FILE_LINE, | |
| 227 .ucdname = "SY", | |
| 228 }, | |
| 229 { | |
| 230 .enumname = "WJ", | |
| 231 .file = FILE_LINE, | |
| 232 .ucdname = "WJ", | |
| 233 }, | |
| 234 { | |
| 235 .enumname = "ZW", | |
| 236 .file = FILE_LINE, | |
| 237 .ucdname = "ZW", | |
| 238 }, | |
| 239 { | |
| 240 .enumname = "ZWJ", | |
| 241 .file = FILE_LINE, | |
| 242 .ucdname = "ZWJ", | |
| 243 }, | |
| 244 { | |
| 245 .enumname = "TMP_AI", | |
| 246 .file = FILE_LINE, | |
| 247 .ucdname = "AI", | |
| 248 }, | |
| 249 { | |
| 250 .enumname = "TMP_CJ", | |
| 251 .file = FILE_LINE, | |
| 252 .ucdname = "CJ", | |
| 253 }, | |
| 254 { | |
| 255 .enumname = "TMP_XX", | |
| 256 .file = NULL, | |
| 257 .ucdname = NULL, | |
| 258 }, | |
| 259 { | |
| 260 .enumname = "TMP_MN", | |
| 261 .file = FILE_LINE, | |
| 262 .ucdname = "Mn", | |
| 263 }, | |
| 264 { | |
| 265 .enumname = "TMP_MC", | |
| 266 .file = FILE_LINE, | |
| 267 .ucdname = "Mc", | |
| 268 }, | |
| 269 { | |
| 270 .enumname = "TMP_SA_WITHOUT_MN_OR_MC", | |
| 271 .file = FILE_LINE, | |
| 272 .ucdname = "SA", | |
| 273 }, | |
| 274 { | |
| 275 .enumname = "TMP_SA_WITH_MN_OR_MC", | |
| 276 .file = FILE_LINE, | |
| 277 .ucdname = "SA", | |
| 278 }, | |
| 279 { | |
| 280 .enumname = "TMP_SG", | |
| 281 .file = FILE_LINE, | |
| 282 .ucdname = "SG", | |
| 283 }, | |
| 284 { | |
| 285 .enumname = "TMP_EAW_H", | |
| 286 .file = FILE_EAW, | |
| 287 .ucdname = "H", | |
| 288 }, | |
| 289 { | |
| 290 .enumname = "TMP_EAW_W", | |
| 291 .file = FILE_EAW, | |
| 292 .ucdname = "W", | |
| 293 }, | |
| 294 { | |
| 295 .enumname = "TMP_EAW_F", | |
| 296 .file = FILE_EAW, | |
| 297 .ucdname = "F", | |
| 298 }, | |
| 299 }; | |
| 300 | |
| 301 static uint_least8_t | |
| 302 handle_conflict(uint_least32_t cp, uint_least8_t prop1, uint_least8_t pr… | |
| 303 { | |
| 304 uint_least8_t result = prop2; | |
| 305 char *target = NULL; | |
| 306 | |
| 307 (void)cp; | |
| 308 | |
| 309 if ((!strcmp(line_break_property[prop1].enumname, "TMP_EAW_H") || | |
| 310 !strcmp(line_break_property[prop1].enumname, "TMP_EAW_W") || | |
| 311 !strcmp(line_break_property[prop1].enumname, "TMP_EAW_F")) … | |
| 312 (!strcmp(line_break_property[prop2].enumname, "TMP_EAW_H") || | |
| 313 !strcmp(line_break_property[prop2].enumname, "TMP_EAW_W") || | |
| 314 !strcmp(line_break_property[prop2].enumname, "TMP_EAW_F")))… | |
| 315 if (!strcmp(line_break_property[prop1].enumname, | |
| 316 "CP_WITHOUT_EAW_HWF") || | |
| 317 !strcmp(line_break_property[prop2].enumname, | |
| 318 "CP_WITHOUT_EAW_HWF")) { | |
| 319 target = "CP_WITH_EAW_HWF"; | |
| 320 } else if (!strcmp(line_break_property[prop1].enumname, | |
| 321 "OP_WITHOUT_EAW_HWF") || | |
| 322 !strcmp(line_break_property[prop2].enumname, | |
| 323 "OP_WITHOUT_EAW_HWF")) { | |
| 324 target = "OP_WITH_EAW_HWF"; | |
| 325 } else { | |
| 326 /* ignore EAW for the rest */ | |
| 327 if ((!strcmp(line_break_property[prop1].enumname, | |
| 328 "TMP_EAW_H") || | |
| 329 !strcmp(line_break_property[prop1].enumname, | |
| 330 "TMP_EAW_W") || | |
| 331 !strcmp(line_break_property[prop1].enumname, | |
| 332 "TMP_EAW_F"))) { | |
| 333 result = prop2; | |
| 334 } else { | |
| 335 result = prop1; | |
| 336 } | |
| 337 } | |
| 338 } else if ((!strcmp(line_break_property[prop1].enumname, "TMP_MN… | |
| 339 !strcmp(line_break_property[prop1].enumname, "TMP_MC… | |
| 340 (!strcmp(line_break_property[prop2].enumname, "TMP_MN… | |
| 341 !strcmp(line_break_property[prop2].enumname, "TMP_MC… | |
| 342 if (!strcmp(line_break_property[prop1].enumname, | |
| 343 "SA_WITHOUT_MN_OR_MC") || | |
| 344 !strcmp(line_break_property[prop2].enumname, | |
| 345 "SA_WITHOUT_MN_OR_MC")) { | |
| 346 target = "SA_WITH_MN_OR_MC"; | |
| 347 } else { | |
| 348 /* ignore Mn and Mc for the rest */ | |
| 349 if ((!strcmp(line_break_property[prop1].enumname, | |
| 350 "TMP_MN") || | |
| 351 !strcmp(line_break_property[prop1].enumname, | |
| 352 "TMP_MC"))) { | |
| 353 result = prop2; | |
| 354 } else { | |
| 355 result = prop1; | |
| 356 } | |
| 357 } | |
| 358 } else if (!strcmp(line_break_property[prop1].enumname, "TMP_CN"… | |
| 359 !strcmp(line_break_property[prop2].enumname, "TMP_CN"… | |
| 360 if (!strcmp(line_break_property[prop1].enumname, | |
| 361 "TMP_EXTENDED_PICTOGRAPHIC") || | |
| 362 !strcmp(line_break_property[prop2].enumname, | |
| 363 "TMP_EXTENDED_PICTOGRAPHIC")) { | |
| 364 target = "BOTH_CN_EXTPICT"; | |
| 365 } else { | |
| 366 /* ignore Cn for all the other properties */ | |
| 367 if (!strcmp(line_break_property[prop1].enumname, | |
| 368 "TMP_CN")) { | |
| 369 result = prop2; | |
| 370 } else { | |
| 371 result = prop1; | |
| 372 } | |
| 373 } | |
| 374 } else if (!strcmp(line_break_property[prop1].enumname, | |
| 375 "TMP_EXTENDED_PICTOGRAPHIC") || | |
| 376 !strcmp(line_break_property[prop2].enumname, | |
| 377 "TMP_EXTENDED_PICTOGRAPHIC")) { | |
| 378 if (!strcmp(line_break_property[prop1].enumname, "TMP_CN… | |
| 379 !strcmp(line_break_property[prop2].enumname, "TMP_CN… | |
| 380 target = "BOTH_CN_EXTPICT"; | |
| 381 } else { | |
| 382 /* ignore Extended_Pictographic for all the other | |
| 383 * properties */ | |
| 384 if (!strcmp(line_break_property[prop1].enumname, | |
| 385 "TMP_EXTENDED_PICTOGRAPHIC")) { | |
| 386 result = prop2; | |
| 387 } else { | |
| 388 result = prop1; | |
| 389 } | |
| 390 } | |
| 391 } else { | |
| 392 fprintf(stderr, | |
| 393 "handle_conflict: Cannot handle conflict %s <- %… | |
| 394 line_break_property[prop1].enumname, | |
| 395 line_break_property[prop2].enumname); | |
| 396 exit(1); | |
| 397 } | |
| 398 | |
| 399 if (target) { | |
| 400 for (result = 0; result < LEN(line_break_property); resu… | |
| 401 if (!strcmp(line_break_property[result].enumname, | |
| 402 target)) { | |
| 403 break; | |
| 404 } | |
| 405 } | |
| 406 if (result == LEN(line_break_property)) { | |
| 407 fprintf(stderr, "handle_conflict: Internal error… | |
| 408 exit(1); | |
| 409 } | |
| 410 } | |
| 411 | |
| 412 return result; | |
| 413 } | |
| 414 | |
| 415 static void | |
| 416 post_process(struct properties *prop) | |
| 417 { | |
| 418 const char *target; | |
| 419 uint_least8_t result; | |
| 420 size_t i; | |
| 421 | |
| 422 /* post-mapping according to the line breaking algorithm */ | |
| 423 for (i = 0; i < UINT32_C(0x110000); i++) { | |
| 424 /* LB1 */ | |
| 425 if (!strcmp(line_break_property[prop[i].property].enumna… | |
| 426 "TMP_AI") || | |
| 427 !strcmp(line_break_property[prop[i].property].enumna… | |
| 428 "TMP_SG") || | |
| 429 !strcmp(line_break_property[prop[i].property].enumna… | |
| 430 "TMP_XX")) { | |
| 431 /* map AI, SG and XX to AL */ | |
| 432 target = "AL"; | |
| 433 } else if (!strcmp(line_break_property[prop[i].property] | |
| 434 .enumname, | |
| 435 "TMP_SA_WITH_MN_OR_MC")) { | |
| 436 /* map SA (with General_Category Mn or Mc) to CM… | |
| 437 target = "CM"; | |
| 438 } else if (!strcmp(line_break_property[prop[i].property] | |
| 439 .enumname, | |
| 440 "TMP_SA_WITHOUT_MN_OR_MC")) { | |
| 441 /* map SA (without General_Category Mn or Mc) to… | |
| 442 target = "AL"; | |
| 443 } else if (!strcmp(line_break_property[prop[i].property] | |
| 444 .enumname, | |
| 445 "TMP_CJ")) { | |
| 446 /* map CJ to NS */ | |
| 447 target = "NS"; | |
| 448 } else if ( | |
| 449 !strcmp(line_break_property[prop[i].property].en… | |
| 450 "TMP_CN") || | |
| 451 !strcmp(line_break_property[prop[i].property].en… | |
| 452 "TMP_EXTENDED_PICTOGRAPHIC") || | |
| 453 !strcmp(line_break_property[prop[i].property].en… | |
| 454 "TMP_MN") || | |
| 455 !strcmp(line_break_property[prop[i].property].en… | |
| 456 "TMP_MC") || | |
| 457 !strcmp(line_break_property[prop[i].property].en… | |
| 458 "TMP_EAW_H") || | |
| 459 !strcmp(line_break_property[prop[i].property].en… | |
| 460 "TMP_EAW_W") || | |
| 461 !strcmp(line_break_property[prop[i].property].en… | |
| 462 "TMP_EAW_F")) { | |
| 463 /* map all the temporary classes "residue" to AL… | |
| 464 target = "AL"; | |
| 465 } else { | |
| 466 target = NULL; | |
| 467 } | |
| 468 | |
| 469 if (target) { | |
| 470 for (result = 0; result < LEN(line_break_propert… | |
| 471 result++) { | |
| 472 if (!strcmp(line_break_property[result] | |
| 473 .enumname, | |
| 474 target)) { | |
| 475 break; | |
| 476 } | |
| 477 } | |
| 478 if (result == LEN(line_break_property)) { | |
| 479 fprintf(stderr, | |
| 480 "handle_conflict: Internal error… | |
| 481 exit(1); | |
| 482 } | |
| 483 | |
| 484 prop[i].property = result; | |
| 485 } | |
| 486 } | |
| 487 } | |
| 488 | |
| 489 int | |
| 490 main(int argc, char *argv[]) | |
| 491 { | |
| 492 (void)argc; | |
| 493 | |
| 494 properties_generate_break_property( | |
| 495 line_break_property, LEN(line_break_property), NULL, | |
| 496 handle_conflict, post_process, "line_break", argv[0]); | |
| 497 | |
| 498 return 0; | |
| 499 } |