case.c - libgrapheme - unicode string library | |
git clone git://git.suckless.org/libgrapheme | |
Log | |
Files | |
Refs | |
README | |
LICENSE | |
--- | |
case.c (12993B) | |
--- | |
1 /* See LICENSE file for copyright and license details. */ | |
2 #include <stddef.h> | |
3 #include <stdint.h> | |
4 | |
5 #include "../gen/case.h" | |
6 #include "../grapheme.h" | |
7 #include "util.h" | |
8 | |
9 static inline enum case_property | |
10 get_case_property(uint_least32_t cp) | |
11 { | |
12 if (likely(cp <= UINT32_C(0x10FFFF))) { | |
13 return (enum case_property) | |
14 case_minor[case_major[cp >> 8] + (cp & 0xFF)]; | |
15 } else { | |
16 return CASE_PROP_OTHER; | |
17 } | |
18 } | |
19 | |
/*
 * Fetch the case-mapping entry for a codepoint from the given two-stage
 * table (major selects a block by the upper bits, the low byte selects
 * the entry in minor). Codepoints above U+10FFFF map to 0.
 *
 * The returned value might be larger than or equal to 0x110000, which
 * marks a special-case mapping; callers have to handle that separately.
 */
static inline int_least32_t
get_case_offset(uint_least32_t cp, const uint_least16_t *major,
                const int_least32_t *minor)
{
	const uint_least32_t block = cp >> 8;
	const uint_least32_t low = cp & 0xFF;

	if (unlikely(cp > UINT32_C(0x10FFFF))) {
		/* outside of the Unicode codespace, nothing to map */
		return 0;
	}

	return minor[major[block] + low];
}
35 | |
36 static inline size_t | |
37 to_case(HERODOTUS_READER *r, HERODOTUS_WRITER *w, | |
38 uint_least8_t final_sigma_level, const uint_least16_t *major, | |
39 const int_least32_t *minor, const struct special_case *sc) | |
40 { | |
41 HERODOTUS_READER tmp; | |
42 enum case_property prop; | |
43 enum herodotus_status s; | |
44 size_t off, i; | |
45 uint_least32_t cp, tmp_cp; | |
46 int_least32_t map; | |
47 | |
48 for (; herodotus_read_codepoint(r, true, &cp) == | |
49 HERODOTUS_STATUS_SUCCESS;) { | |
50 if (sc == lower_special) { | |
51 /* | |
52 * For the special Final_Sigma-rule (see | |
53 * SpecialCasing.txt), which is the only non-loc… | |
54 * case-dependent rule, we apply a different map… | |
55 * when a sigma is at the end of a word. | |
56 * | |
57 * Before: cased case-ignorable* | |
58 * After: not(case-ignorable* cased) | |
59 * | |
60 * We check the after-condition on demand, but t… | |
61 * before- condition is best checked using the | |
62 * "level"-heuristic also used in the sentence a… | |
63 * breaking-implementations. | |
64 */ | |
65 if (cp == UINT32_C(0x03A3) && /* GREEK CAPITAL L… | |
66 SIGMA */ | |
67 (final_sigma_level == 1 || | |
68 final_sigma_level == 2)) { | |
69 /* | |
70 * check succeeding characters by first … | |
71 * all case-ignorable characters and then | |
72 * checking if the succeeding character … | |
73 * cased, invalidating the after-conditi… | |
74 */ | |
75 herodotus_reader_copy(r, &tmp); | |
76 for (prop = NUM_CASE_PROPS; | |
77 (s = herodotus_read_codepoint(&tmp,… | |
78 &tmp_… | |
79 HERODOTUS_STATUS_SUCCESS;) { | |
80 prop = get_case_property(tmp_cp); | |
81 | |
82 if (prop != CASE_PROP_CASE_IGNOR… | |
83 prop != CASE_PROP_BOTH_CASED… | |
84 break; | |
85 } | |
86 } | |
87 | |
88 /* | |
89 * Now prop is something other than | |
90 * case-ignorable or the source-string e… | |
91 * it is something other than cased, we … | |
92 * that the after-condition holds | |
93 */ | |
94 if (s != HERODOTUS_STATUS_SUCCESS || | |
95 (prop != CASE_PROP_CASED && | |
96 prop != CASE_PROP_BOTH_CASED_CASE_I… | |
97 /* | |
98 * write GREEK SMALL LETTER FINA… | |
99 * to destination | |
100 */ | |
101 herodotus_write_codepoint( | |
102 w, UINT32_C(0x03C2)); | |
103 | |
104 /* reset Final_Sigma-state and c… | |
105 */ | |
106 final_sigma_level = 0; | |
107 continue; | |
108 } | |
109 } | |
110 | |
111 /* update state */ | |
112 prop = get_case_property(cp); | |
113 if ((final_sigma_level == 0 || | |
114 final_sigma_level == 1) && | |
115 (prop == CASE_PROP_CASED || | |
116 prop == CASE_PROP_BOTH_CASED_CASE_IGNORABLE… | |
117 /* sequence has begun */ | |
118 final_sigma_level = 1; | |
119 } else if ( | |
120 (final_sigma_level == 1 || | |
121 final_sigma_level == 2) && | |
122 (prop == CASE_PROP_CASE_IGNORABLE || | |
123 prop == CASE_PROP_BOTH_CASED_CASE_IGNOR… | |
124 /* case-ignorable sequence begins or con… | |
125 */ | |
126 final_sigma_level = 2; | |
127 } else { | |
128 /* sequence broke */ | |
129 final_sigma_level = 0; | |
130 } | |
131 } | |
132 | |
133 /* get and handle case mapping */ | |
134 if (unlikely((map = get_case_offset(cp, major, minor)) >= | |
135 INT32_C(0x110000))) { | |
136 /* we have a special case and the offset in the … | |
137 * is the difference to 0x110000*/ | |
138 off = (uint_least32_t)map - UINT32_C(0x110000); | |
139 | |
140 for (i = 0; i < sc[off].cplen; i++) { | |
141 herodotus_write_codepoint(w, sc[off].cp[… | |
142 } | |
143 } else { | |
144 /* we have a simple mapping */ | |
145 herodotus_write_codepoint( | |
146 w, (uint_least32_t)((int_least32_t)cp + … | |
147 } | |
148 } | |
149 | |
150 herodotus_writer_nul_terminate(w); | |
151 | |
152 return herodotus_writer_number_written(w); | |
153 } | |
154 | |
155 static size_t | |
156 herodotus_next_word_break(const HERODOTUS_READER *r) | |
157 { | |
158 HERODOTUS_READER tmp; | |
159 | |
160 herodotus_reader_copy(r, &tmp); | |
161 | |
162 if (r->type == HERODOTUS_TYPE_CODEPOINT) { | |
163 return grapheme_next_word_break(tmp.src, tmp.srclen); | |
164 } else { /* r->type == HERODOTUS_TYPE_UTF8 */ | |
165 return grapheme_next_word_break_utf8(tmp.src, tmp.srclen… | |
166 } | |
167 } | |
168 | |
169 static inline size_t | |
170 to_titlecase(HERODOTUS_READER *r, HERODOTUS_WRITER *w) | |
171 { | |
172 enum case_property prop; | |
173 enum herodotus_status s; | |
174 uint_least32_t cp; | |
175 size_t nwb; | |
176 | |
177 for (; (nwb = herodotus_next_word_break(r)) > 0;) { | |
178 herodotus_reader_push_advance_limit(r, nwb); | |
179 for (; (s = herodotus_read_codepoint(r, false, &cp)) == | |
180 HERODOTUS_STATUS_SUCCESS;) { | |
181 /* check if we have a cased character */ | |
182 prop = get_case_property(cp); | |
183 if (prop == CASE_PROP_CASED || | |
184 prop == CASE_PROP_BOTH_CASED_CASE_IGNORABLE)… | |
185 break; | |
186 } else { | |
187 /* write the data to the output verbatim… | |
188 * permits */ | |
189 herodotus_write_codepoint(w, cp); | |
190 | |
191 /* increment reader */ | |
192 herodotus_read_codepoint(r, true, &cp); | |
193 } | |
194 } | |
195 | |
196 if (s == HERODOTUS_STATUS_END_OF_BUFFER) { | |
197 /* we are done */ | |
198 herodotus_reader_pop_limit(r); | |
199 break; | |
200 } else if (s == HERODOTUS_STATUS_SOFT_LIMIT_REACHED) { | |
201 /* | |
202 * we did not encounter any cased character | |
203 * up to the word break | |
204 */ | |
205 herodotus_reader_pop_limit(r); | |
206 continue; | |
207 } else { | |
208 /* | |
209 * we encountered a cased character before the w… | |
210 * break, convert it to titlecase | |
211 */ | |
212 herodotus_reader_push_advance_limit( | |
213 r, herodotus_reader_next_codepoint_break… | |
214 to_case(r, w, 0, title_major, title_minor, | |
215 title_special); | |
216 herodotus_reader_pop_limit(r); | |
217 } | |
218 | |
219 /* cast the rest of the codepoints in the word to lowerc… | |
220 to_case(r, w, 1, lower_major, lower_minor, lower_special… | |
221 | |
222 /* remove the limit on the word before the next iteratio… | |
223 herodotus_reader_pop_limit(r); | |
224 } | |
225 | |
226 herodotus_writer_nul_terminate(w); | |
227 | |
228 return herodotus_writer_number_written(w); | |
229 } | |
230 | |
231 size_t | |
232 grapheme_to_uppercase(const uint_least32_t *src, size_t srclen, | |
233 uint_least32_t *dest, size_t destlen) | |
234 { | |
235 HERODOTUS_READER r; | |
236 HERODOTUS_WRITER w; | |
237 | |
238 herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen); | |
239 herodotus_writer_init(&w, HERODOTUS_TYPE_CODEPOINT, dest, destle… | |
240 | |
241 return to_case(&r, &w, 0, upper_major, upper_minor, upper_specia… | |
242 } | |
243 | |
244 size_t | |
245 grapheme_to_lowercase(const uint_least32_t *src, size_t srclen, | |
246 uint_least32_t *dest, size_t destlen) | |
247 { | |
248 HERODOTUS_READER r; | |
249 HERODOTUS_WRITER w; | |
250 | |
251 herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen); | |
252 herodotus_writer_init(&w, HERODOTUS_TYPE_CODEPOINT, dest, destle… | |
253 | |
254 return to_case(&r, &w, 0, lower_major, lower_minor, lower_specia… | |
255 } | |
256 | |
257 size_t | |
258 grapheme_to_titlecase(const uint_least32_t *src, size_t srclen, | |
259 uint_least32_t *dest, size_t destlen) | |
260 { | |
261 HERODOTUS_READER r; | |
262 HERODOTUS_WRITER w; | |
263 | |
264 herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen); | |
265 herodotus_writer_init(&w, HERODOTUS_TYPE_CODEPOINT, dest, destle… | |
266 | |
267 return to_titlecase(&r, &w); | |
268 } | |
269 | |
270 size_t | |
271 grapheme_to_uppercase_utf8(const char *src, size_t srclen, char *dest, | |
272 size_t destlen) | |
273 { | |
274 HERODOTUS_READER r; | |
275 HERODOTUS_WRITER w; | |
276 | |
277 herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen); | |
278 herodotus_writer_init(&w, HERODOTUS_TYPE_UTF8, dest, destlen); | |
279 | |
280 return to_case(&r, &w, 0, upper_major, upper_minor, upper_specia… | |
281 } | |
282 | |
283 size_t | |
284 grapheme_to_lowercase_utf8(const char *src, size_t srclen, char *dest, | |
285 size_t destlen) | |
286 { | |
287 HERODOTUS_READER r; | |
288 HERODOTUS_WRITER w; | |
289 | |
290 herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen); | |
291 herodotus_writer_init(&w, HERODOTUS_TYPE_UTF8, dest, destlen); | |
292 | |
293 return to_case(&r, &w, 0, lower_major, lower_minor, lower_specia… | |
294 } | |
295 | |
296 size_t | |
297 grapheme_to_titlecase_utf8(const char *src, size_t srclen, char *dest, | |
298 size_t destlen) | |
299 { | |
300 HERODOTUS_READER r; | |
301 HERODOTUS_WRITER w; | |
302 | |
303 herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen); | |
304 herodotus_writer_init(&w, HERODOTUS_TYPE_UTF8, dest, destlen); | |
305 | |
306 return to_titlecase(&r, &w); | |
307 } | |
308 | |
309 static inline bool | |
310 is_case(HERODOTUS_READER *r, const uint_least16_t *major, | |
311 const int_least32_t *minor, const struct special_case *sc, | |
312 size_t *output) | |
313 { | |
314 size_t off, i; | |
315 bool ret = true; | |
316 uint_least32_t cp; | |
317 int_least32_t map; | |
318 | |
319 for (; herodotus_read_codepoint(r, false, &cp) == | |
320 HERODOTUS_STATUS_SUCCESS;) { | |
321 /* get and handle case mapping */ | |
322 if (unlikely((map = get_case_offset(cp, major, minor)) >= | |
323 INT32_C(0x110000))) { | |
324 /* we have a special case and the offset in the … | |
325 * is the difference to 0x110000*/ | |
326 off = (uint_least32_t)map - UINT32_C(0x110000); | |
327 | |
328 for (i = 0; i < sc[off].cplen; i++) { | |
329 if (herodotus_read_codepoint(r, false, &… | |
330 HERODOTUS_STATUS_SUCCESS) { | |
331 if (cp != sc[off].cp[i]) { | |
332 ret = false; | |
333 goto done; | |
334 } else { | |
335 /* move forward */ | |
336 herodotus_read_codepoint( | |
337 r, true, &cp); | |
338 } | |
339 } else { | |
340 /* | |
341 * input ended and we didn't see | |
342 * any difference so far, so this | |
343 * string is in fact okay | |
344 */ | |
345 ret = true; | |
346 goto done; | |
347 } | |
348 } | |
349 } else { | |
350 /* we have a simple mapping */ | |
351 if (cp != (uint_least32_t)((int_least32_t)cp + m… | |
352 /* we have a difference */ | |
353 ret = false; | |
354 goto done; | |
355 } else { | |
356 /* move forward */ | |
357 herodotus_read_codepoint(r, true, &cp); | |
358 } | |
359 } | |
360 } | |
361 done: | |
362 if (output) { | |
363 *output = herodotus_reader_number_read(r); | |
364 } | |
365 return ret; | |
366 } | |
367 | |
368 static inline bool | |
369 is_titlecase(HERODOTUS_READER *r, size_t *output) | |
370 { | |
371 enum case_property prop; | |
372 enum herodotus_status s; | |
373 bool ret = true; | |
374 uint_least32_t cp; | |
375 size_t nwb; | |
376 | |
377 for (; (nwb = herodotus_next_word_break(r)) > 0;) { | |
378 herodotus_reader_push_advance_limit(r, nwb); | |
379 for (; (s = herodotus_read_codepoint(r, false, &cp)) == | |
380 HERODOTUS_STATUS_SUCCESS;) { | |
381 /* check if we have a cased character */ | |
382 prop = get_case_property(cp); | |
383 if (prop == CASE_PROP_CASED || | |
384 prop == CASE_PROP_BOTH_CASED_CASE_IGNORABLE)… | |
385 break; | |
386 } else { | |
387 /* increment reader */ | |
388 herodotus_read_codepoint(r, true, &cp); | |
389 } | |
390 } | |
391 | |
392 if (s == HERODOTUS_STATUS_END_OF_BUFFER) { | |
393 /* we are done */ | |
394 break; | |
395 } else if (s == HERODOTUS_STATUS_SOFT_LIMIT_REACHED) { | |
396 /* | |
397 * we did not encounter any cased character | |
398 * up to the word break | |
399 */ | |
400 herodotus_reader_pop_limit(r); | |
401 continue; | |
402 } else { | |
403 /* | |
404 * we encountered a cased character before the w… | |
405 * break, check if it's titlecase | |
406 */ | |
407 herodotus_reader_push_advance_limit( | |
408 r, herodotus_reader_next_codepoint_break… | |
409 if (!is_case(r, title_major, title_minor, title_… | |
410 NULL)) { | |
411 ret = false; | |
412 goto done; | |
413 } | |
414 herodotus_reader_pop_limit(r); | |
415 } | |
416 | |
417 /* check if the rest of the codepoints in the word are l… | |
418 */ | |
419 if (!is_case(r, lower_major, lower_minor, lower_special, | |
420 NULL)) { | |
421 ret = false; | |
422 goto done; | |
423 } | |
424 | |
425 /* remove the limit on the word before the next iteratio… | |
426 herodotus_reader_pop_limit(r); | |
427 } | |
428 done: | |
429 if (output) { | |
430 *output = herodotus_reader_number_read(r); | |
431 } | |
432 return ret; | |
433 } | |
434 | |
435 bool | |
436 grapheme_is_uppercase(const uint_least32_t *src, size_t srclen, size_t *… | |
437 { | |
438 HERODOTUS_READER r; | |
439 | |
440 herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen); | |
441 | |
442 return is_case(&r, upper_major, upper_minor, upper_special, case… | |
443 } | |
444 | |
445 bool | |
446 grapheme_is_lowercase(const uint_least32_t *src, size_t srclen, size_t *… | |
447 { | |
448 HERODOTUS_READER r; | |
449 | |
450 herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen); | |
451 | |
452 return is_case(&r, lower_major, lower_minor, lower_special, case… | |
453 } | |
454 | |
455 bool | |
456 grapheme_is_titlecase(const uint_least32_t *src, size_t srclen, size_t *… | |
457 { | |
458 HERODOTUS_READER r; | |
459 | |
460 herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen); | |
461 | |
462 return is_titlecase(&r, caselen); | |
463 } | |
464 | |
465 bool | |
466 grapheme_is_uppercase_utf8(const char *src, size_t srclen, size_t *casel… | |
467 { | |
468 HERODOTUS_READER r; | |
469 | |
470 herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen); | |
471 | |
472 return is_case(&r, upper_major, upper_minor, upper_special, case… | |
473 } | |
474 | |
475 bool | |
476 grapheme_is_lowercase_utf8(const char *src, size_t srclen, size_t *casel… | |
477 { | |
478 HERODOTUS_READER r; | |
479 | |
480 herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen); | |
481 | |
482 return is_case(&r, lower_major, lower_minor, lower_special, case… | |
483 } | |
484 | |
485 bool | |
486 grapheme_is_titlecase_utf8(const char *src, size_t srclen, size_t *casel… | |
487 { | |
488 HERODOTUS_READER r; | |
489 | |
490 herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen); | |
491 | |
492 return is_titlecase(&r, caselen); | |
493 } |