| case.c - libgrapheme - unicode string library | |
| git clone git://git.suckless.org/libgrapheme | |
| Log | |
| Files | |
| Refs | |
| README | |
| LICENSE | |
| --- | |
| case.c (12993B) | |
| --- | |
| 1 /* See LICENSE file for copyright and license details. */ | |
| 2 #include <stddef.h> | |
| 3 #include <stdint.h> | |
| 4 | |
| 5 #include "../gen/case.h" | |
| 6 #include "../grapheme.h" | |
| 7 #include "util.h" | |
| 8 | |
| 9 static inline enum case_property | |
| 10 get_case_property(uint_least32_t cp) | |
| 11 { | |
| 12 if (likely(cp <= UINT32_C(0x10FFFF))) { | |
| 13 return (enum case_property) | |
| 14 case_minor[case_major[cp >> 8] + (cp & 0xFF)]; | |
| 15 } else { | |
| 16 return CASE_PROP_OTHER; | |
| 17 } | |
| 18 } | |
| 19 | |
/*
 * Fetch the case-mapping entry for the codepoint cp from the two-stage
 * table given by major and minor: major is indexed with cp >> 8 and
 * yields a base offset into minor, to which the lowest 8 bits of cp
 * are added.
 *
 * The returned entry might be larger than or equal to 0x110000, which
 * denotes a special-case-mapping; the caller has to handle that case
 * separately. Values of cp above 0x10FFFF are not looked up and
 * yield 0.
 */
static inline int_least32_t
get_case_offset(uint_least32_t cp, const uint_least16_t *major,
                const int_least32_t *minor)
{
	uint_least32_t block, slot;

	/* out-of-range values must never index the tables */
	if (unlikely(cp > UINT32_C(0x10FFFF))) {
		return 0;
	}

	block = cp >> 8;
	slot = cp & 0xFF;

	return minor[major[block] + slot];
}
| 35 | |
| 36 static inline size_t | |
| 37 to_case(HERODOTUS_READER *r, HERODOTUS_WRITER *w, | |
| 38 uint_least8_t final_sigma_level, const uint_least16_t *major, | |
| 39 const int_least32_t *minor, const struct special_case *sc) | |
| 40 { | |
| 41 HERODOTUS_READER tmp; | |
| 42 enum case_property prop; | |
| 43 enum herodotus_status s; | |
| 44 size_t off, i; | |
| 45 uint_least32_t cp, tmp_cp; | |
| 46 int_least32_t map; | |
| 47 | |
| 48 for (; herodotus_read_codepoint(r, true, &cp) == | |
| 49 HERODOTUS_STATUS_SUCCESS;) { | |
| 50 if (sc == lower_special) { | |
| 51 /* | |
| 52 * For the special Final_Sigma-rule (see | |
| 53 * SpecialCasing.txt), which is the only non-loc… | |
| 54 * case-dependent rule, we apply a different map… | |
| 55 * when a sigma is at the end of a word. | |
| 56 * | |
| 57 * Before: cased case-ignorable* | |
| 58 * After: not(case-ignorable* cased) | |
| 59 * | |
| 60 * We check the after-condition on demand, but t… | |
| 61 * before- condition is best checked using the | |
| 62 * "level"-heuristic also used in the sentence a… | |
| 63 * breaking-implementations. | |
| 64 */ | |
| 65 if (cp == UINT32_C(0x03A3) && /* GREEK CAPITAL L… | |
| 66 SIGMA */ | |
| 67 (final_sigma_level == 1 || | |
| 68 final_sigma_level == 2)) { | |
| 69 /* | |
| 70 * check succeeding characters by first … | |
| 71 * all case-ignorable characters and then | |
| 72 * checking if the succeeding character … | |
| 73 * cased, invalidating the after-conditi… | |
| 74 */ | |
| 75 herodotus_reader_copy(r, &tmp); | |
| 76 for (prop = NUM_CASE_PROPS; | |
| 77 (s = herodotus_read_codepoint(&tmp,… | |
| 78 &tmp_… | |
| 79 HERODOTUS_STATUS_SUCCESS;) { | |
| 80 prop = get_case_property(tmp_cp); | |
| 81 | |
| 82 if (prop != CASE_PROP_CASE_IGNOR… | |
| 83 prop != CASE_PROP_BOTH_CASED… | |
| 84 break; | |
| 85 } | |
| 86 } | |
| 87 | |
| 88 /* | |
| 89 * Now prop is something other than | |
| 90 * case-ignorable or the source-string e… | |
| 91 * it is something other than cased, we … | |
| 92 * that the after-condition holds | |
| 93 */ | |
| 94 if (s != HERODOTUS_STATUS_SUCCESS || | |
| 95 (prop != CASE_PROP_CASED && | |
| 96 prop != CASE_PROP_BOTH_CASED_CASE_I… | |
| 97 /* | |
| 98 * write GREEK SMALL LETTER FINA… | |
| 99 * to destination | |
| 100 */ | |
| 101 herodotus_write_codepoint( | |
| 102 w, UINT32_C(0x03C2)); | |
| 103 | |
| 104 /* reset Final_Sigma-state and c… | |
| 105 */ | |
| 106 final_sigma_level = 0; | |
| 107 continue; | |
| 108 } | |
| 109 } | |
| 110 | |
| 111 /* update state */ | |
| 112 prop = get_case_property(cp); | |
| 113 if ((final_sigma_level == 0 || | |
| 114 final_sigma_level == 1) && | |
| 115 (prop == CASE_PROP_CASED || | |
| 116 prop == CASE_PROP_BOTH_CASED_CASE_IGNORABLE… | |
| 117 /* sequence has begun */ | |
| 118 final_sigma_level = 1; | |
| 119 } else if ( | |
| 120 (final_sigma_level == 1 || | |
| 121 final_sigma_level == 2) && | |
| 122 (prop == CASE_PROP_CASE_IGNORABLE || | |
| 123 prop == CASE_PROP_BOTH_CASED_CASE_IGNOR… | |
| 124 /* case-ignorable sequence begins or con… | |
| 125 */ | |
| 126 final_sigma_level = 2; | |
| 127 } else { | |
| 128 /* sequence broke */ | |
| 129 final_sigma_level = 0; | |
| 130 } | |
| 131 } | |
| 132 | |
| 133 /* get and handle case mapping */ | |
| 134 if (unlikely((map = get_case_offset(cp, major, minor)) >= | |
| 135 INT32_C(0x110000))) { | |
| 136 /* we have a special case and the offset in the … | |
| 137 * is the difference to 0x110000*/ | |
| 138 off = (uint_least32_t)map - UINT32_C(0x110000); | |
| 139 | |
| 140 for (i = 0; i < sc[off].cplen; i++) { | |
| 141 herodotus_write_codepoint(w, sc[off].cp[… | |
| 142 } | |
| 143 } else { | |
| 144 /* we have a simple mapping */ | |
| 145 herodotus_write_codepoint( | |
| 146 w, (uint_least32_t)((int_least32_t)cp + … | |
| 147 } | |
| 148 } | |
| 149 | |
| 150 herodotus_writer_nul_terminate(w); | |
| 151 | |
| 152 return herodotus_writer_number_written(w); | |
| 153 } | |
| 154 | |
| 155 static size_t | |
| 156 herodotus_next_word_break(const HERODOTUS_READER *r) | |
| 157 { | |
| 158 HERODOTUS_READER tmp; | |
| 159 | |
| 160 herodotus_reader_copy(r, &tmp); | |
| 161 | |
| 162 if (r->type == HERODOTUS_TYPE_CODEPOINT) { | |
| 163 return grapheme_next_word_break(tmp.src, tmp.srclen); | |
| 164 } else { /* r->type == HERODOTUS_TYPE_UTF8 */ | |
| 165 return grapheme_next_word_break_utf8(tmp.src, tmp.srclen… | |
| 166 } | |
| 167 } | |
| 168 | |
| 169 static inline size_t | |
| 170 to_titlecase(HERODOTUS_READER *r, HERODOTUS_WRITER *w) | |
| 171 { | |
| 172 enum case_property prop; | |
| 173 enum herodotus_status s; | |
| 174 uint_least32_t cp; | |
| 175 size_t nwb; | |
| 176 | |
| 177 for (; (nwb = herodotus_next_word_break(r)) > 0;) { | |
| 178 herodotus_reader_push_advance_limit(r, nwb); | |
| 179 for (; (s = herodotus_read_codepoint(r, false, &cp)) == | |
| 180 HERODOTUS_STATUS_SUCCESS;) { | |
| 181 /* check if we have a cased character */ | |
| 182 prop = get_case_property(cp); | |
| 183 if (prop == CASE_PROP_CASED || | |
| 184 prop == CASE_PROP_BOTH_CASED_CASE_IGNORABLE)… | |
| 185 break; | |
| 186 } else { | |
| 187 /* write the data to the output verbatim… | |
| 188 * permits */ | |
| 189 herodotus_write_codepoint(w, cp); | |
| 190 | |
| 191 /* increment reader */ | |
| 192 herodotus_read_codepoint(r, true, &cp); | |
| 193 } | |
| 194 } | |
| 195 | |
| 196 if (s == HERODOTUS_STATUS_END_OF_BUFFER) { | |
| 197 /* we are done */ | |
| 198 herodotus_reader_pop_limit(r); | |
| 199 break; | |
| 200 } else if (s == HERODOTUS_STATUS_SOFT_LIMIT_REACHED) { | |
| 201 /* | |
| 202 * we did not encounter any cased character | |
| 203 * up to the word break | |
| 204 */ | |
| 205 herodotus_reader_pop_limit(r); | |
| 206 continue; | |
| 207 } else { | |
| 208 /* | |
| 209 * we encountered a cased character before the w… | |
| 210 * break, convert it to titlecase | |
| 211 */ | |
| 212 herodotus_reader_push_advance_limit( | |
| 213 r, herodotus_reader_next_codepoint_break… | |
| 214 to_case(r, w, 0, title_major, title_minor, | |
| 215 title_special); | |
| 216 herodotus_reader_pop_limit(r); | |
| 217 } | |
| 218 | |
| 219 /* cast the rest of the codepoints in the word to lowerc… | |
| 220 to_case(r, w, 1, lower_major, lower_minor, lower_special… | |
| 221 | |
| 222 /* remove the limit on the word before the next iteratio… | |
| 223 herodotus_reader_pop_limit(r); | |
| 224 } | |
| 225 | |
| 226 herodotus_writer_nul_terminate(w); | |
| 227 | |
| 228 return herodotus_writer_number_written(w); | |
| 229 } | |
| 230 | |
| 231 size_t | |
| 232 grapheme_to_uppercase(const uint_least32_t *src, size_t srclen, | |
| 233 uint_least32_t *dest, size_t destlen) | |
| 234 { | |
| 235 HERODOTUS_READER r; | |
| 236 HERODOTUS_WRITER w; | |
| 237 | |
| 238 herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen); | |
| 239 herodotus_writer_init(&w, HERODOTUS_TYPE_CODEPOINT, dest, destle… | |
| 240 | |
| 241 return to_case(&r, &w, 0, upper_major, upper_minor, upper_specia… | |
| 242 } | |
| 243 | |
| 244 size_t | |
| 245 grapheme_to_lowercase(const uint_least32_t *src, size_t srclen, | |
| 246 uint_least32_t *dest, size_t destlen) | |
| 247 { | |
| 248 HERODOTUS_READER r; | |
| 249 HERODOTUS_WRITER w; | |
| 250 | |
| 251 herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen); | |
| 252 herodotus_writer_init(&w, HERODOTUS_TYPE_CODEPOINT, dest, destle… | |
| 253 | |
| 254 return to_case(&r, &w, 0, lower_major, lower_minor, lower_specia… | |
| 255 } | |
| 256 | |
| 257 size_t | |
| 258 grapheme_to_titlecase(const uint_least32_t *src, size_t srclen, | |
| 259 uint_least32_t *dest, size_t destlen) | |
| 260 { | |
| 261 HERODOTUS_READER r; | |
| 262 HERODOTUS_WRITER w; | |
| 263 | |
| 264 herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen); | |
| 265 herodotus_writer_init(&w, HERODOTUS_TYPE_CODEPOINT, dest, destle… | |
| 266 | |
| 267 return to_titlecase(&r, &w); | |
| 268 } | |
| 269 | |
| 270 size_t | |
| 271 grapheme_to_uppercase_utf8(const char *src, size_t srclen, char *dest, | |
| 272 size_t destlen) | |
| 273 { | |
| 274 HERODOTUS_READER r; | |
| 275 HERODOTUS_WRITER w; | |
| 276 | |
| 277 herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen); | |
| 278 herodotus_writer_init(&w, HERODOTUS_TYPE_UTF8, dest, destlen); | |
| 279 | |
| 280 return to_case(&r, &w, 0, upper_major, upper_minor, upper_specia… | |
| 281 } | |
| 282 | |
| 283 size_t | |
| 284 grapheme_to_lowercase_utf8(const char *src, size_t srclen, char *dest, | |
| 285 size_t destlen) | |
| 286 { | |
| 287 HERODOTUS_READER r; | |
| 288 HERODOTUS_WRITER w; | |
| 289 | |
| 290 herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen); | |
| 291 herodotus_writer_init(&w, HERODOTUS_TYPE_UTF8, dest, destlen); | |
| 292 | |
| 293 return to_case(&r, &w, 0, lower_major, lower_minor, lower_specia… | |
| 294 } | |
| 295 | |
| 296 size_t | |
| 297 grapheme_to_titlecase_utf8(const char *src, size_t srclen, char *dest, | |
| 298 size_t destlen) | |
| 299 { | |
| 300 HERODOTUS_READER r; | |
| 301 HERODOTUS_WRITER w; | |
| 302 | |
| 303 herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen); | |
| 304 herodotus_writer_init(&w, HERODOTUS_TYPE_UTF8, dest, destlen); | |
| 305 | |
| 306 return to_titlecase(&r, &w); | |
| 307 } | |
| 308 | |
| 309 static inline bool | |
| 310 is_case(HERODOTUS_READER *r, const uint_least16_t *major, | |
| 311 const int_least32_t *minor, const struct special_case *sc, | |
| 312 size_t *output) | |
| 313 { | |
| 314 size_t off, i; | |
| 315 bool ret = true; | |
| 316 uint_least32_t cp; | |
| 317 int_least32_t map; | |
| 318 | |
| 319 for (; herodotus_read_codepoint(r, false, &cp) == | |
| 320 HERODOTUS_STATUS_SUCCESS;) { | |
| 321 /* get and handle case mapping */ | |
| 322 if (unlikely((map = get_case_offset(cp, major, minor)) >= | |
| 323 INT32_C(0x110000))) { | |
| 324 /* we have a special case and the offset in the … | |
| 325 * is the difference to 0x110000*/ | |
| 326 off = (uint_least32_t)map - UINT32_C(0x110000); | |
| 327 | |
| 328 for (i = 0; i < sc[off].cplen; i++) { | |
| 329 if (herodotus_read_codepoint(r, false, &… | |
| 330 HERODOTUS_STATUS_SUCCESS) { | |
| 331 if (cp != sc[off].cp[i]) { | |
| 332 ret = false; | |
| 333 goto done; | |
| 334 } else { | |
| 335 /* move forward */ | |
| 336 herodotus_read_codepoint( | |
| 337 r, true, &cp); | |
| 338 } | |
| 339 } else { | |
| 340 /* | |
| 341 * input ended and we didn't see | |
| 342 * any difference so far, so this | |
| 343 * string is in fact okay | |
| 344 */ | |
| 345 ret = true; | |
| 346 goto done; | |
| 347 } | |
| 348 } | |
| 349 } else { | |
| 350 /* we have a simple mapping */ | |
| 351 if (cp != (uint_least32_t)((int_least32_t)cp + m… | |
| 352 /* we have a difference */ | |
| 353 ret = false; | |
| 354 goto done; | |
| 355 } else { | |
| 356 /* move forward */ | |
| 357 herodotus_read_codepoint(r, true, &cp); | |
| 358 } | |
| 359 } | |
| 360 } | |
| 361 done: | |
| 362 if (output) { | |
| 363 *output = herodotus_reader_number_read(r); | |
| 364 } | |
| 365 return ret; | |
| 366 } | |
| 367 | |
| 368 static inline bool | |
| 369 is_titlecase(HERODOTUS_READER *r, size_t *output) | |
| 370 { | |
| 371 enum case_property prop; | |
| 372 enum herodotus_status s; | |
| 373 bool ret = true; | |
| 374 uint_least32_t cp; | |
| 375 size_t nwb; | |
| 376 | |
| 377 for (; (nwb = herodotus_next_word_break(r)) > 0;) { | |
| 378 herodotus_reader_push_advance_limit(r, nwb); | |
| 379 for (; (s = herodotus_read_codepoint(r, false, &cp)) == | |
| 380 HERODOTUS_STATUS_SUCCESS;) { | |
| 381 /* check if we have a cased character */ | |
| 382 prop = get_case_property(cp); | |
| 383 if (prop == CASE_PROP_CASED || | |
| 384 prop == CASE_PROP_BOTH_CASED_CASE_IGNORABLE)… | |
| 385 break; | |
| 386 } else { | |
| 387 /* increment reader */ | |
| 388 herodotus_read_codepoint(r, true, &cp); | |
| 389 } | |
| 390 } | |
| 391 | |
| 392 if (s == HERODOTUS_STATUS_END_OF_BUFFER) { | |
| 393 /* we are done */ | |
| 394 break; | |
| 395 } else if (s == HERODOTUS_STATUS_SOFT_LIMIT_REACHED) { | |
| 396 /* | |
| 397 * we did not encounter any cased character | |
| 398 * up to the word break | |
| 399 */ | |
| 400 herodotus_reader_pop_limit(r); | |
| 401 continue; | |
| 402 } else { | |
| 403 /* | |
| 404 * we encountered a cased character before the w… | |
| 405 * break, check if it's titlecase | |
| 406 */ | |
| 407 herodotus_reader_push_advance_limit( | |
| 408 r, herodotus_reader_next_codepoint_break… | |
| 409 if (!is_case(r, title_major, title_minor, title_… | |
| 410 NULL)) { | |
| 411 ret = false; | |
| 412 goto done; | |
| 413 } | |
| 414 herodotus_reader_pop_limit(r); | |
| 415 } | |
| 416 | |
| 417 /* check if the rest of the codepoints in the word are l… | |
| 418 */ | |
| 419 if (!is_case(r, lower_major, lower_minor, lower_special, | |
| 420 NULL)) { | |
| 421 ret = false; | |
| 422 goto done; | |
| 423 } | |
| 424 | |
| 425 /* remove the limit on the word before the next iteratio… | |
| 426 herodotus_reader_pop_limit(r); | |
| 427 } | |
| 428 done: | |
| 429 if (output) { | |
| 430 *output = herodotus_reader_number_read(r); | |
| 431 } | |
| 432 return ret; | |
| 433 } | |
| 434 | |
| 435 bool | |
| 436 grapheme_is_uppercase(const uint_least32_t *src, size_t srclen, size_t *… | |
| 437 { | |
| 438 HERODOTUS_READER r; | |
| 439 | |
| 440 herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen); | |
| 441 | |
| 442 return is_case(&r, upper_major, upper_minor, upper_special, case… | |
| 443 } | |
| 444 | |
| 445 bool | |
| 446 grapheme_is_lowercase(const uint_least32_t *src, size_t srclen, size_t *… | |
| 447 { | |
| 448 HERODOTUS_READER r; | |
| 449 | |
| 450 herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen); | |
| 451 | |
| 452 return is_case(&r, lower_major, lower_minor, lower_special, case… | |
| 453 } | |
| 454 | |
| 455 bool | |
| 456 grapheme_is_titlecase(const uint_least32_t *src, size_t srclen, size_t *… | |
| 457 { | |
| 458 HERODOTUS_READER r; | |
| 459 | |
| 460 herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen); | |
| 461 | |
| 462 return is_titlecase(&r, caselen); | |
| 463 } | |
| 464 | |
| 465 bool | |
| 466 grapheme_is_uppercase_utf8(const char *src, size_t srclen, size_t *casel… | |
| 467 { | |
| 468 HERODOTUS_READER r; | |
| 469 | |
| 470 herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen); | |
| 471 | |
| 472 return is_case(&r, upper_major, upper_minor, upper_special, case… | |
| 473 } | |
| 474 | |
| 475 bool | |
| 476 grapheme_is_lowercase_utf8(const char *src, size_t srclen, size_t *casel… | |
| 477 { | |
| 478 HERODOTUS_READER r; | |
| 479 | |
| 480 herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen); | |
| 481 | |
| 482 return is_case(&r, lower_major, lower_minor, lower_special, case… | |
| 483 } | |
| 484 | |
| 485 bool | |
| 486 grapheme_is_titlecase_utf8(const char *src, size_t srclen, size_t *casel… | |
| 487 { | |
| 488 HERODOTUS_READER r; | |
| 489 | |
| 490 herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen); | |
| 491 | |
| 492 return is_titlecase(&r, caselen); | |
| 493 } |