GopherProxy

	case.c - libgrapheme - unicode string library
	git clone git://git.suckless.org/libgrapheme
	Log
	Files
	Refs
	README
	LICENSE
	---
	case.c (12993B)
	---
	1 /* See LICENSE file for copyright and license details. */
	2 #include <stddef.h>
	3 #include <stdint.h>
	4
	5 #include "../gen/case.h"
	6 #include "../grapheme.h"
	7 #include "util.h"
	8
	9 static inline enum case_property
	10 get_case_property(uint_least32_t cp)
	11 {
	12 if (likely(cp <= UINT32_C(0x10FFFF))) {
	13 return (enum case_property)
	14 case_minor[case_major[cp >> 8] + (cp & 0xFF)];
	15 } else {
	16 return CASE_PROP_OTHER;
	17 }
	18 }
	19
	/*
	 * Fetch the case-mapping offset of a codepoint from a two-stage table.
	 *
	 * major is indexed with the codepoint shifted right by 8 bits and
	 * yields a base index into minor, to which the low byte of the
	 * codepoint is added. Codepoints above 0x10FFFF yield 0.
	 *
	 * The returned value can be larger than or equal to 0x110000; such a
	 * value marks a special-case mapping and has to be handled separately
	 * by the caller.
	 */
	static inline int_least32_t
	get_case_offset(uint_least32_t cp, const uint_least16_t *major,
	                const int_least32_t *minor)
	{
		const uint_least32_t page = cp >> 8;
		const uint_least32_t cell = cp & 0xFF;

		/* out-of-range input: answer without a table access */
		if (!likely(cp <= UINT32_C(0x10FFFF))) {
			return 0;
		}

		return minor[major[page] + cell];
	}
	35
	/*
	 * Case-map every codepoint that can be read from r and write the
	 * result to w.
	 *
	 * For each codepoint the offset is fetched with
	 * get_case_offset(cp, major, minor). An offset of 0x110000 or more
	 * selects an entry of the special-case table sc (index = offset minus
	 * 0x110000) whose cplen codepoints are written; any other offset goes
	 * down the simple-mapping path.
	 *
	 * Only when sc is lower_special, the Final_Sigma rule is applied:
	 * U+03A3 is written as U+03C2 if final_sigma_level is 1 or 2 and a
	 * look-ahead on a copy of the reader finds either the end of the input
	 * or a codepoint that is not cased. final_sigma_level is the initial
	 * state of that rule and is updated per codepoint:
	 *   0 - no cased sequence is active,
	 *   1 - a cased sequence has begun,
	 *   2 - a case-ignorable run follows a cased sequence.
	 *
	 * The writer is NUL-terminated before returning, and the return value
	 * is herodotus_writer_number_written(w).
	 *
	 * NOTE(review): this listing looks like it lost the tails of long
	 * lines ("…") and some pointer declarators during extraction (sc is
	 * compared with lower_special and indexed, r and w are passed on as
	 * pointers); the code lines are kept verbatim, so compare them with
	 * the upstream file before relying on them.
	 */
	36 static inline size_t
	37 to_case(HERODOTUS_READER r, HERODOTUS_WRITER w,
	38 uint_least8_t final_sigma_level, const uint_least16_t *major,
	39 const int_least32_t minor, const struct special_case sc)
	40 {
	41 HERODOTUS_READER tmp;
	42 enum case_property prop;
	43 enum herodotus_status s;
	44 size_t off, i;
	45 uint_least32_t cp, tmp_cp;
	46 int_least32_t map;
	47
	48 for (; herodotus_read_codepoint(r, true, &cp) ==
	49 HERODOTUS_STATUS_SUCCESS;) {
	50 if (sc == lower_special) {
	51 /*
	52 * For the special Final_Sigma-rule (see
	53 * SpecialCasing.txt), which is the only non-loc…
	54 * case-dependent rule, we apply a different map…
	55 * when a sigma is at the end of a word.
	56 *
	57 * Before: cased case-ignorable*
	58 * After: not(case-ignorable* cased)
	59 *
	60 * We check the after-condition on demand, but t…
	61 * before- condition is best checked using the
	62 * "level"-heuristic also used in the sentence a…
	63 * breaking-implementations.
	64 */
	65 if (cp == UINT32_C(0x03A3) && /* GREEK CAPITAL L…
	66 SIGMA */
	67 (final_sigma_level == 1 \|\|
	68 final_sigma_level == 2)) {
	69 /*
	70 * check succeeding characters by first …
	71 * all case-ignorable characters and then
	72 * checking if the succeeding character …
	73 * cased, invalidating the after-conditi…
	74 */
	75 herodotus_reader_copy(r, &tmp);
	76 for (prop = NUM_CASE_PROPS;
	77 (s = herodotus_read_codepoint(&tmp,…
	78 &tmp_…
	79 HERODOTUS_STATUS_SUCCESS;) {
	80 prop = get_case_property(tmp_cp);
	81
	82 if (prop != CASE_PROP_CASE_IGNOR…
	83 prop != CASE_PROP_BOTH_CASED…
	84 break;
	85 }
	86 }
	87
	88 /*
	89 * Now prop is something other than
	90 * case-ignorable or the source-string e…
	91 * it is something other than cased, we …
	92 * that the after-condition holds
	93 */
	94 if (s != HERODOTUS_STATUS_SUCCESS \|\|
	95 (prop != CASE_PROP_CASED &&
	96 prop != CASE_PROP_BOTH_CASED_CASE_I…
	97 /*
	98 * write GREEK SMALL LETTER FINA…
	99 * to destination
	100 */
	101 herodotus_write_codepoint(
	102 w, UINT32_C(0x03C2));
	103
	104 /* reset Final_Sigma-state and c…
	105 */
	106 final_sigma_level = 0;
	107 continue;
	108 }
	109 }
	110
	111 /* update state */
	112 prop = get_case_property(cp);
	113 if ((final_sigma_level == 0 \|\|
	114 final_sigma_level == 1) &&
	115 (prop == CASE_PROP_CASED \|\|
	116 prop == CASE_PROP_BOTH_CASED_CASE_IGNORABLE…
	117 /* sequence has begun */
	118 final_sigma_level = 1;
	119 } else if (
	120 (final_sigma_level == 1 \|\|
	121 final_sigma_level == 2) &&
	122 (prop == CASE_PROP_CASE_IGNORABLE \|\|
	123 prop == CASE_PROP_BOTH_CASED_CASE_IGNOR…
	124 /* case-ignorable sequence begins or con…
	125 */
	126 final_sigma_level = 2;
	127 } else {
	128 /* sequence broke */
	129 final_sigma_level = 0;
	130 }
	131 }
	132
	133 /* get and handle case mapping */
	134 if (unlikely((map = get_case_offset(cp, major, minor)) >=
	135 INT32_C(0x110000))) {
	136 /* we have a special case and the offset in the …
	137 * is the difference to 0x110000*/
	138 off = (uint_least32_t)map - UINT32_C(0x110000);
	139
	140 for (i = 0; i < sc[off].cplen; i++) {
	141 herodotus_write_codepoint(w, sc[off].cp[…
	142 }
	143 } else {
	144 /* we have a simple mapping */
	145 herodotus_write_codepoint(
	146 w, (uint_least32_t)((int_least32_t)cp + …
	147 }
	148 }
	149
	150 herodotus_writer_nul_terminate(w);
	151
	152 return herodotus_writer_number_written(w);
	153 }
	154
	/*
	 * Return the offset of the next word break as seen from reader r.
	 *
	 * The reader is first copied into a local (tmp) and the word-break
	 * function matching r->type is then called on the copy's src and
	 * srclen: grapheme_next_word_break() for HERODOTUS_TYPE_CODEPOINT,
	 * grapheme_next_word_break_utf8() otherwise. That function's result is
	 * returned unchanged.
	 */
	155 static size_t
	156 herodotus_next_word_break(const HERODOTUS_READER *r)
	157 {
	158 HERODOTUS_READER tmp;
	159
	160 herodotus_reader_copy(r, &tmp);
	161
	162 if (r->type == HERODOTUS_TYPE_CODEPOINT) {
	163 return grapheme_next_word_break(tmp.src, tmp.srclen);
	164 } else { /* r->type == HERODOTUS_TYPE_UTF8 */
	165 return grapheme_next_word_break_utf8(tmp.src, tmp.srclen…
	166 }
	167 }
	168
	/*
	 * Titlecase the input of r word by word and write the result to w.
	 *
	 * For every word (length taken from herodotus_next_word_break(r) and
	 * pushed as a limit on the reader):
	 *  - codepoints are looked at without being consumed; those that are
	 *    neither CASE_PROP_CASED nor CASE_PROP_BOTH_CASED_CASE_IGNORABLE
	 *    are written verbatim and then consumed,
	 *  - if the buffer ends, the limit is popped and the loop is left,
	 *  - if the word limit is reached without a cased codepoint, the limit
	 *    is popped and the next word is processed,
	 *  - otherwise a second limit is pushed, the cased codepoint is
	 *    converted by to_case() with the title tables, and that limit is
	 *    popped again.
	 * The remainder of the word is then converted by to_case() with the
	 * lower tables. It is started with final_sigma_level 1, and this point
	 * is only reached after a cased codepoint was handled. Finally the
	 * word limit is popped.
	 *
	 * The writer is NUL-terminated before returning (to_case() also does
	 * so on each call), and the return value is
	 * herodotus_writer_number_written(w).
	 *
	 * NOTE(review): long lines in this listing are cut off ("…") and r/w
	 * are used as pointers although the parameter list shows no
	 * declarator; the code lines are kept verbatim.
	 */
	169 static inline size_t
	170 to_titlecase(HERODOTUS_READER r, HERODOTUS_WRITER w)
	171 {
	172 enum case_property prop;
	173 enum herodotus_status s;
	174 uint_least32_t cp;
	175 size_t nwb;
	176
	177 for (; (nwb = herodotus_next_word_break(r)) > 0;) {
	178 herodotus_reader_push_advance_limit(r, nwb);
	179 for (; (s = herodotus_read_codepoint(r, false, &cp)) ==
	180 HERODOTUS_STATUS_SUCCESS;) {
	181 /* check if we have a cased character */
	182 prop = get_case_property(cp);
	183 if (prop == CASE_PROP_CASED \|\|
	184 prop == CASE_PROP_BOTH_CASED_CASE_IGNORABLE)…
	185 break;
	186 } else {
	187 /* write the data to the output verbatim…
	188 * permits */
	189 herodotus_write_codepoint(w, cp);
	190
	191 /* increment reader */
	192 herodotus_read_codepoint(r, true, &cp);
	193 }
	194 }
	195
	196 if (s == HERODOTUS_STATUS_END_OF_BUFFER) {
	197 /* we are done */
	198 herodotus_reader_pop_limit(r);
	199 break;
	200 } else if (s == HERODOTUS_STATUS_SOFT_LIMIT_REACHED) {
	201 /*
	202 * we did not encounter any cased character
	203 * up to the word break
	204 */
	205 herodotus_reader_pop_limit(r);
	206 continue;
	207 } else {
	208 /*
	209 * we encountered a cased character before the w…
	210 * break, convert it to titlecase
	211 */
	212 herodotus_reader_push_advance_limit(
	213 r, herodotus_reader_next_codepoint_break…
	214 to_case(r, w, 0, title_major, title_minor,
	215 title_special);
	216 herodotus_reader_pop_limit(r);
	217 }
	218
	219 /* cast the rest of the codepoints in the word to lowerc…
	220 to_case(r, w, 1, lower_major, lower_minor, lower_special…
	221
	222 /* remove the limit on the word before the next iteratio…
	223 herodotus_reader_pop_limit(r);
	224 }
	225
	226 herodotus_writer_nul_terminate(w);
	227
	228 return herodotus_writer_number_written(w);
	229 }
	230
	/*
	 * Uppercase a codepoint array.
	 *
	 * Sets up a HERODOTUS_TYPE_CODEPOINT reader over src/srclen and a
	 * HERODOTUS_TYPE_CODEPOINT writer over dest, then returns the result
	 * of to_case() called with final_sigma_level 0 and the upper tables.
	 */
	231 size_t
	232 grapheme_to_uppercase(const uint_least32_t *src, size_t srclen,
	233 uint_least32_t *dest, size_t destlen)
	234 {
	235 HERODOTUS_READER r;
	236 HERODOTUS_WRITER w;
	237
	238 herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
	239 herodotus_writer_init(&w, HERODOTUS_TYPE_CODEPOINT, dest, destle…
	240
	241 return to_case(&r, &w, 0, upper_major, upper_minor, upper_specia…
	242 }
	243
	/*
	 * Lowercase a codepoint array.
	 *
	 * Sets up a HERODOTUS_TYPE_CODEPOINT reader over src/srclen and a
	 * HERODOTUS_TYPE_CODEPOINT writer over dest, then returns the result
	 * of to_case() called with final_sigma_level 0 and the lower tables.
	 */
	244 size_t
	245 grapheme_to_lowercase(const uint_least32_t *src, size_t srclen,
	246 uint_least32_t *dest, size_t destlen)
	247 {
	248 HERODOTUS_READER r;
	249 HERODOTUS_WRITER w;
	250
	251 herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
	252 herodotus_writer_init(&w, HERODOTUS_TYPE_CODEPOINT, dest, destle…
	253
	254 return to_case(&r, &w, 0, lower_major, lower_minor, lower_specia…
	255 }
	256
	/*
	 * Titlecase a codepoint array.
	 *
	 * Sets up a HERODOTUS_TYPE_CODEPOINT reader over src/srclen and a
	 * HERODOTUS_TYPE_CODEPOINT writer over dest, then returns the result
	 * of to_titlecase().
	 */
	257 size_t
	258 grapheme_to_titlecase(const uint_least32_t *src, size_t srclen,
	259 uint_least32_t *dest, size_t destlen)
	260 {
	261 HERODOTUS_READER r;
	262 HERODOTUS_WRITER w;
	263
	264 herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
	265 herodotus_writer_init(&w, HERODOTUS_TYPE_CODEPOINT, dest, destle…
	266
	267 return to_titlecase(&r, &w);
	268 }
	269
	/*
	 * Uppercase a UTF-8 buffer.
	 *
	 * Sets up a HERODOTUS_TYPE_UTF8 reader over src/srclen and a
	 * HERODOTUS_TYPE_UTF8 writer over dest/destlen, then returns the
	 * result of to_case() called with final_sigma_level 0 and the upper
	 * tables.
	 *
	 * NOTE(review): src and dest are shown without a pointer declarator
	 * here; presumably lost in extraction - confirm against upstream.
	 */
	270 size_t
	271 grapheme_to_uppercase_utf8(const char src, size_t srclen, char dest,
	272 size_t destlen)
	273 {
	274 HERODOTUS_READER r;
	275 HERODOTUS_WRITER w;
	276
	277 herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
	278 herodotus_writer_init(&w, HERODOTUS_TYPE_UTF8, dest, destlen);
	279
	280 return to_case(&r, &w, 0, upper_major, upper_minor, upper_specia…
	281 }
	282
	/*
	 * Lowercase a UTF-8 buffer.
	 *
	 * Sets up a HERODOTUS_TYPE_UTF8 reader over src/srclen and a
	 * HERODOTUS_TYPE_UTF8 writer over dest/destlen, then returns the
	 * result of to_case() called with final_sigma_level 0 and the lower
	 * tables.
	 *
	 * NOTE(review): src and dest are shown without a pointer declarator
	 * here; presumably lost in extraction - confirm against upstream.
	 */
	283 size_t
	284 grapheme_to_lowercase_utf8(const char src, size_t srclen, char dest,
	285 size_t destlen)
	286 {
	287 HERODOTUS_READER r;
	288 HERODOTUS_WRITER w;
	289
	290 herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
	291 herodotus_writer_init(&w, HERODOTUS_TYPE_UTF8, dest, destlen);
	292
	293 return to_case(&r, &w, 0, lower_major, lower_minor, lower_specia…
	294 }
	295
	/*
	 * Titlecase a UTF-8 buffer.
	 *
	 * Sets up a HERODOTUS_TYPE_UTF8 reader over src/srclen and a
	 * HERODOTUS_TYPE_UTF8 writer over dest/destlen, then returns the
	 * result of to_titlecase().
	 *
	 * NOTE(review): src and dest are shown without a pointer declarator
	 * here; presumably lost in extraction - confirm against upstream.
	 */
	296 size_t
	297 grapheme_to_titlecase_utf8(const char src, size_t srclen, char dest,
	298 size_t destlen)
	299 {
	300 HERODOTUS_READER r;
	301 HERODOTUS_WRITER w;
	302
	303 herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
	304 herodotus_writer_init(&w, HERODOTUS_TYPE_UTF8, dest, destlen);
	305
	306 return to_titlecase(&r, &w);
	307 }
	308
	/*
	 * Check whether the input of r is already in the case described by
	 * the given tables.
	 *
	 * Each codepoint is first read with `false` as second argument and is
	 * only consumed (second argument `true`, the calls commented "move
	 * forward") once it has been accepted:
	 *  - if its offset from get_case_offset() is 0x110000 or more, the
	 *    special-case entry sc[offset - 0x110000] is used: the following
	 *    input has to match that entry's cplen codepoints one by one. A
	 *    mismatch yields false; running out of input while everything
	 *    matched so far yields true,
	 *  - otherwise the simple-mapping comparison decides; a difference
	 *    yields false.
	 * If the loop runs to its end the result is true.
	 *
	 * If output is non-NULL, herodotus_reader_number_read(r) is stored
	 * through it on every exit path.
	 *
	 * NOTE(review): long lines in this listing are cut off ("…") and
	 * r/major/minor/sc are shown without pointer declarators although
	 * they are indexed or passed on as pointers; the code lines are kept
	 * verbatim.
	 */
	309 static inline bool
	310 is_case(HERODOTUS_READER r, const uint_least16_t major,
	311 const int_least32_t minor, const struct special_case sc,
	312 size_t *output)
	313 {
	314 size_t off, i;
	315 bool ret = true;
	316 uint_least32_t cp;
	317 int_least32_t map;
	318
	319 for (; herodotus_read_codepoint(r, false, &cp) ==
	320 HERODOTUS_STATUS_SUCCESS;) {
	321 /* get and handle case mapping */
	322 if (unlikely((map = get_case_offset(cp, major, minor)) >=
	323 INT32_C(0x110000))) {
	324 /* we have a special case and the offset in the …
	325 * is the difference to 0x110000*/
	326 off = (uint_least32_t)map - UINT32_C(0x110000);
	327
	328 for (i = 0; i < sc[off].cplen; i++) {
	329 if (herodotus_read_codepoint(r, false, &…
	330 HERODOTUS_STATUS_SUCCESS) {
	331 if (cp != sc[off].cp[i]) {
	332 ret = false;
	333 goto done;
	334 } else {
	335 /* move forward */
	336 herodotus_read_codepoint(
	337 r, true, &cp);
	338 }
	339 } else {
	340 /*
	341 * input ended and we didn't see
	342 * any difference so far, so this
	343 * string is in fact okay
	344 */
	345 ret = true;
	346 goto done;
	347 }
	348 }
	349 } else {
	350 /* we have a simple mapping */
	351 if (cp != (uint_least32_t)((int_least32_t)cp + m…
	352 /* we have a difference */
	353 ret = false;
	354 goto done;
	355 } else {
	356 /* move forward */
	357 herodotus_read_codepoint(r, true, &cp);
	358 }
	359 }
	360 }
	361 done:
	362 if (output) {
	363 *output = herodotus_reader_number_read(r);
	364 }
	365 return ret;
	366 }
	367
	/*
	 * Check whether the input of r is in titlecase, word by word.
	 *
	 * For every word (length taken from herodotus_next_word_break(r) and
	 * pushed as a limit on the reader):
	 *  - codepoints that are neither CASE_PROP_CASED nor
	 *    CASE_PROP_BOTH_CASED_CASE_IGNORABLE are skipped,
	 *  - if the buffer ends, the loop is left,
	 *  - if the word limit is reached without a cased codepoint, the limit
	 *    is popped and the next word is checked,
	 *  - otherwise a second limit is pushed and the cased codepoint is
	 *    checked by is_case() with the title tables; the rest of the word
	 *    is then checked by is_case() with the lower tables.
	 * The first failing is_case() makes the function return false.
	 *
	 * If output is non-zero, herodotus_reader_number_read(r) is stored
	 * through it on every exit path.
	 *
	 * NOTE(review): the end-of-buffer branch and both `goto done` paths
	 * leave without calling herodotus_reader_pop_limit(), unlike the
	 * end-of-buffer branch of to_titlecase(); presumably harmless because
	 * the reader is not used afterwards - confirm.
	 * NOTE(review): long lines in this listing are cut off ("…") and
	 * r/output are shown without pointer declarators although output is
	 * dereferenced; the code lines are kept verbatim.
	 */
	368 static inline bool
	369 is_titlecase(HERODOTUS_READER r, size_t output)
	370 {
	371 enum case_property prop;
	372 enum herodotus_status s;
	373 bool ret = true;
	374 uint_least32_t cp;
	375 size_t nwb;
	376
	377 for (; (nwb = herodotus_next_word_break(r)) > 0;) {
	378 herodotus_reader_push_advance_limit(r, nwb);
	379 for (; (s = herodotus_read_codepoint(r, false, &cp)) ==
	380 HERODOTUS_STATUS_SUCCESS;) {
	381 /* check if we have a cased character */
	382 prop = get_case_property(cp);
	383 if (prop == CASE_PROP_CASED \|\|
	384 prop == CASE_PROP_BOTH_CASED_CASE_IGNORABLE)…
	385 break;
	386 } else {
	387 /* increment reader */
	388 herodotus_read_codepoint(r, true, &cp);
	389 }
	390 }
	391
	392 if (s == HERODOTUS_STATUS_END_OF_BUFFER) {
	393 /* we are done */
	394 break;
	395 } else if (s == HERODOTUS_STATUS_SOFT_LIMIT_REACHED) {
	396 /*
	397 * we did not encounter any cased character
	398 * up to the word break
	399 */
	400 herodotus_reader_pop_limit(r);
	401 continue;
	402 } else {
	403 /*
	404 * we encountered a cased character before the w…
	405 * break, check if it's titlecase
	406 */
	407 herodotus_reader_push_advance_limit(
	408 r, herodotus_reader_next_codepoint_break…
	409 if (!is_case(r, title_major, title_minor, title_…
	410 NULL)) {
	411 ret = false;
	412 goto done;
	413 }
	414 herodotus_reader_pop_limit(r);
	415 }
	416
	417 /* check if the rest of the codepoints in the word are l…
	418 */
	419 if (!is_case(r, lower_major, lower_minor, lower_special,
	420 NULL)) {
	421 ret = false;
	422 goto done;
	423 }
	424
	425 /* remove the limit on the word before the next iteratio…
	426 herodotus_reader_pop_limit(r);
	427 }
	428 done:
	429 if (output) {
	430 *output = herodotus_reader_number_read(r);
	431 }
	432 return ret;
	433 }
	434
	/*
	 * Check whether a codepoint array is uppercase.
	 *
	 * Sets up a HERODOTUS_TYPE_CODEPOINT reader over src/srclen and
	 * returns the result of is_case() called with the upper tables.
	 *
	 * NOTE(review): the parameter list and the last is_case() argument
	 * are cut off ("…") in this listing; presumably a caselen output
	 * parameter is passed through - confirm against upstream.
	 */
	435 bool
	436 grapheme_is_uppercase(const uint_least32_t src, size_t srclen, size_t …
	437 {
	438 HERODOTUS_READER r;
	439
	440 herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
	441
	442 return is_case(&r, upper_major, upper_minor, upper_special, case…
	443 }
	444
	/*
	 * Check whether a codepoint array is lowercase.
	 *
	 * Sets up a HERODOTUS_TYPE_CODEPOINT reader over src/srclen and
	 * returns the result of is_case() called with the lower tables.
	 *
	 * NOTE(review): the parameter list and the last is_case() argument
	 * are cut off ("…") in this listing; presumably a caselen output
	 * parameter is passed through - confirm against upstream.
	 */
	445 bool
	446 grapheme_is_lowercase(const uint_least32_t src, size_t srclen, size_t …
	447 {
	448 HERODOTUS_READER r;
	449
	450 herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
	451
	452 return is_case(&r, lower_major, lower_minor, lower_special, case…
	453 }
	454
	/*
	 * Check whether a codepoint array is titlecase.
	 *
	 * Sets up a HERODOTUS_TYPE_CODEPOINT reader over src/srclen and
	 * returns the result of is_titlecase(), passing caselen through.
	 *
	 * NOTE(review): the parameter list is cut off ("…") in this listing,
	 * so the declaration of caselen is not visible here.
	 */
	455 bool
	456 grapheme_is_titlecase(const uint_least32_t src, size_t srclen, size_t …
	457 {
	458 HERODOTUS_READER r;
	459
	460 herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
	461
	462 return is_titlecase(&r, caselen);
	463 }
	464
	/*
	 * Check whether a UTF-8 buffer is uppercase.
	 *
	 * Sets up a HERODOTUS_TYPE_UTF8 reader over src/srclen and returns the
	 * result of is_case() called with the upper tables.
	 *
	 * NOTE(review): the parameter list and the last is_case() argument
	 * are cut off ("…") in this listing; presumably a caselen output
	 * parameter is passed through - confirm against upstream.
	 */
	465 bool
	466 grapheme_is_uppercase_utf8(const char src, size_t srclen, size_t casel…
	467 {
	468 HERODOTUS_READER r;
	469
	470 herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
	471
	472 return is_case(&r, upper_major, upper_minor, upper_special, case…
	473 }
	474
	/*
	 * Check whether a UTF-8 buffer is lowercase.
	 *
	 * Sets up a HERODOTUS_TYPE_UTF8 reader over src/srclen and returns the
	 * result of is_case() called with the lower tables.
	 *
	 * NOTE(review): the parameter list and the last is_case() argument
	 * are cut off ("…") in this listing; presumably a caselen output
	 * parameter is passed through - confirm against upstream.
	 */
	475 bool
	476 grapheme_is_lowercase_utf8(const char src, size_t srclen, size_t casel…
	477 {
	478 HERODOTUS_READER r;
	479
	480 herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
	481
	482 return is_case(&r, lower_major, lower_minor, lower_special, case…
	483 }
	484
	/*
	 * Check whether a UTF-8 buffer is titlecase.
	 *
	 * Sets up a HERODOTUS_TYPE_UTF8 reader over src/srclen and returns the
	 * result of is_titlecase(), passing caselen through.
	 *
	 * NOTE(review): the parameter list is cut off ("…") in this listing,
	 * so the declaration of caselen is not fully visible here.
	 */
	485 bool
	486 grapheme_is_titlecase_utf8(const char src, size_t srclen, size_t casel…
	487 {
	488 HERODOTUS_READER r;
	489
	490 herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
	491
	492 return is_titlecase(&r, caselen);
	493 }