Introduction
Introduction Statistics Contact Development Disclaimer Help
case.c - libgrapheme - unicode string library
git clone git://git.suckless.org/libgrapheme
Log
Files
Refs
README
LICENSE
---
case.c (12993B)
---
1 /* See LICENSE file for copyright and license details. */
2 #include <stddef.h>
3 #include <stdint.h>
4
5 #include "../gen/case.h"
6 #include "../grapheme.h"
7 #include "util.h"
8
9 static inline enum case_property
10 get_case_property(uint_least32_t cp)
11 {
12 if (likely(cp <= UINT32_C(0x10FFFF))) {
13 return (enum case_property)
14 case_minor[case_major[cp >> 8] + (cp & 0xFF)];
15 } else {
16 return CASE_PROP_OTHER;
17 }
18 }
19
/*
 * Fetch the case-mapping entry for cp from the two-stage table given
 * by major (indexed by cp >> 8) and minor (indexed by that entry plus
 * the low byte of cp).
 *
 * Values of cp above 0x10FFFF are not looked up and yield 0.
 *
 * The returned entry may be larger than or equal to 0x110000; such a
 * value denotes a special-case mapping and has to be handled
 * separately by the caller.
 */
static inline int_least32_t
get_case_offset(uint_least32_t cp, const uint_least16_t *major,
                const int_least32_t *minor)
{
	if (unlikely(cp > UINT32_C(0x10FFFF))) {
		return 0;
	}

	return minor[major[cp >> 8] + (cp & 0xFF)];
}
35
36 static inline size_t
37 to_case(HERODOTUS_READER *r, HERODOTUS_WRITER *w,
38 uint_least8_t final_sigma_level, const uint_least16_t *major,
39 const int_least32_t *minor, const struct special_case *sc)
40 {
41 HERODOTUS_READER tmp;
42 enum case_property prop;
43 enum herodotus_status s;
44 size_t off, i;
45 uint_least32_t cp, tmp_cp;
46 int_least32_t map;
47
48 for (; herodotus_read_codepoint(r, true, &cp) ==
49 HERODOTUS_STATUS_SUCCESS;) {
50 if (sc == lower_special) {
51 /*
52 * For the special Final_Sigma-rule (see
53 * SpecialCasing.txt), which is the only non-loc…
54 * case-dependent rule, we apply a different map…
55 * when a sigma is at the end of a word.
56 *
57 * Before: cased case-ignorable*
58 * After: not(case-ignorable* cased)
59 *
60 * We check the after-condition on demand, but t…
61 * before- condition is best checked using the
62 * "level"-heuristic also used in the sentence a…
63 * breaking-implementations.
64 */
65 if (cp == UINT32_C(0x03A3) && /* GREEK CAPITAL L…
66 SIGMA */
67 (final_sigma_level == 1 ||
68 final_sigma_level == 2)) {
69 /*
70 * check succeeding characters by first …
71 * all case-ignorable characters and then
72 * checking if the succeeding character …
73 * cased, invalidating the after-conditi…
74 */
75 herodotus_reader_copy(r, &tmp);
76 for (prop = NUM_CASE_PROPS;
77 (s = herodotus_read_codepoint(&tmp,…
78 &tmp_…
79 HERODOTUS_STATUS_SUCCESS;) {
80 prop = get_case_property(tmp_cp);
81
82 if (prop != CASE_PROP_CASE_IGNOR…
83 prop != CASE_PROP_BOTH_CASED…
84 break;
85 }
86 }
87
88 /*
89 * Now prop is something other than
90 * case-ignorable or the source-string e…
91 * it is something other than cased, we …
92 * that the after-condition holds
93 */
94 if (s != HERODOTUS_STATUS_SUCCESS ||
95 (prop != CASE_PROP_CASED &&
96 prop != CASE_PROP_BOTH_CASED_CASE_I…
97 /*
98 * write GREEK SMALL LETTER FINA…
99 * to destination
100 */
101 herodotus_write_codepoint(
102 w, UINT32_C(0x03C2));
103
104 /* reset Final_Sigma-state and c…
105 */
106 final_sigma_level = 0;
107 continue;
108 }
109 }
110
111 /* update state */
112 prop = get_case_property(cp);
113 if ((final_sigma_level == 0 ||
114 final_sigma_level == 1) &&
115 (prop == CASE_PROP_CASED ||
116 prop == CASE_PROP_BOTH_CASED_CASE_IGNORABLE…
117 /* sequence has begun */
118 final_sigma_level = 1;
119 } else if (
120 (final_sigma_level == 1 ||
121 final_sigma_level == 2) &&
122 (prop == CASE_PROP_CASE_IGNORABLE ||
123 prop == CASE_PROP_BOTH_CASED_CASE_IGNOR…
124 /* case-ignorable sequence begins or con…
125 */
126 final_sigma_level = 2;
127 } else {
128 /* sequence broke */
129 final_sigma_level = 0;
130 }
131 }
132
133 /* get and handle case mapping */
134 if (unlikely((map = get_case_offset(cp, major, minor)) >=
135 INT32_C(0x110000))) {
136 /* we have a special case and the offset in the …
137 * is the difference to 0x110000*/
138 off = (uint_least32_t)map - UINT32_C(0x110000);
139
140 for (i = 0; i < sc[off].cplen; i++) {
141 herodotus_write_codepoint(w, sc[off].cp[…
142 }
143 } else {
144 /* we have a simple mapping */
145 herodotus_write_codepoint(
146 w, (uint_least32_t)((int_least32_t)cp + …
147 }
148 }
149
150 herodotus_writer_nul_terminate(w);
151
152 return herodotus_writer_number_written(w);
153 }
154
155 static size_t
156 herodotus_next_word_break(const HERODOTUS_READER *r)
157 {
158 HERODOTUS_READER tmp;
159
160 herodotus_reader_copy(r, &tmp);
161
162 if (r->type == HERODOTUS_TYPE_CODEPOINT) {
163 return grapheme_next_word_break(tmp.src, tmp.srclen);
164 } else { /* r->type == HERODOTUS_TYPE_UTF8 */
165 return grapheme_next_word_break_utf8(tmp.src, tmp.srclen…
166 }
167 }
168
169 static inline size_t
170 to_titlecase(HERODOTUS_READER *r, HERODOTUS_WRITER *w)
171 {
172 enum case_property prop;
173 enum herodotus_status s;
174 uint_least32_t cp;
175 size_t nwb;
176
177 for (; (nwb = herodotus_next_word_break(r)) > 0;) {
178 herodotus_reader_push_advance_limit(r, nwb);
179 for (; (s = herodotus_read_codepoint(r, false, &cp)) ==
180 HERODOTUS_STATUS_SUCCESS;) {
181 /* check if we have a cased character */
182 prop = get_case_property(cp);
183 if (prop == CASE_PROP_CASED ||
184 prop == CASE_PROP_BOTH_CASED_CASE_IGNORABLE)…
185 break;
186 } else {
187 /* write the data to the output verbatim…
188 * permits */
189 herodotus_write_codepoint(w, cp);
190
191 /* increment reader */
192 herodotus_read_codepoint(r, true, &cp);
193 }
194 }
195
196 if (s == HERODOTUS_STATUS_END_OF_BUFFER) {
197 /* we are done */
198 herodotus_reader_pop_limit(r);
199 break;
200 } else if (s == HERODOTUS_STATUS_SOFT_LIMIT_REACHED) {
201 /*
202 * we did not encounter any cased character
203 * up to the word break
204 */
205 herodotus_reader_pop_limit(r);
206 continue;
207 } else {
208 /*
209 * we encountered a cased character before the w…
210 * break, convert it to titlecase
211 */
212 herodotus_reader_push_advance_limit(
213 r, herodotus_reader_next_codepoint_break…
214 to_case(r, w, 0, title_major, title_minor,
215 title_special);
216 herodotus_reader_pop_limit(r);
217 }
218
219 /* cast the rest of the codepoints in the word to lowerc…
220 to_case(r, w, 1, lower_major, lower_minor, lower_special…
221
222 /* remove the limit on the word before the next iteratio…
223 herodotus_reader_pop_limit(r);
224 }
225
226 herodotus_writer_nul_terminate(w);
227
228 return herodotus_writer_number_written(w);
229 }
230
231 size_t
232 grapheme_to_uppercase(const uint_least32_t *src, size_t srclen,
233 uint_least32_t *dest, size_t destlen)
234 {
235 HERODOTUS_READER r;
236 HERODOTUS_WRITER w;
237
238 herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
239 herodotus_writer_init(&w, HERODOTUS_TYPE_CODEPOINT, dest, destle…
240
241 return to_case(&r, &w, 0, upper_major, upper_minor, upper_specia…
242 }
243
244 size_t
245 grapheme_to_lowercase(const uint_least32_t *src, size_t srclen,
246 uint_least32_t *dest, size_t destlen)
247 {
248 HERODOTUS_READER r;
249 HERODOTUS_WRITER w;
250
251 herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
252 herodotus_writer_init(&w, HERODOTUS_TYPE_CODEPOINT, dest, destle…
253
254 return to_case(&r, &w, 0, lower_major, lower_minor, lower_specia…
255 }
256
257 size_t
258 grapheme_to_titlecase(const uint_least32_t *src, size_t srclen,
259 uint_least32_t *dest, size_t destlen)
260 {
261 HERODOTUS_READER r;
262 HERODOTUS_WRITER w;
263
264 herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
265 herodotus_writer_init(&w, HERODOTUS_TYPE_CODEPOINT, dest, destle…
266
267 return to_titlecase(&r, &w);
268 }
269
270 size_t
271 grapheme_to_uppercase_utf8(const char *src, size_t srclen, char *dest,
272 size_t destlen)
273 {
274 HERODOTUS_READER r;
275 HERODOTUS_WRITER w;
276
277 herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
278 herodotus_writer_init(&w, HERODOTUS_TYPE_UTF8, dest, destlen);
279
280 return to_case(&r, &w, 0, upper_major, upper_minor, upper_specia…
281 }
282
283 size_t
284 grapheme_to_lowercase_utf8(const char *src, size_t srclen, char *dest,
285 size_t destlen)
286 {
287 HERODOTUS_READER r;
288 HERODOTUS_WRITER w;
289
290 herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
291 herodotus_writer_init(&w, HERODOTUS_TYPE_UTF8, dest, destlen);
292
293 return to_case(&r, &w, 0, lower_major, lower_minor, lower_specia…
294 }
295
296 size_t
297 grapheme_to_titlecase_utf8(const char *src, size_t srclen, char *dest,
298 size_t destlen)
299 {
300 HERODOTUS_READER r;
301 HERODOTUS_WRITER w;
302
303 herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
304 herodotus_writer_init(&w, HERODOTUS_TYPE_UTF8, dest, destlen);
305
306 return to_titlecase(&r, &w);
307 }
308
309 static inline bool
310 is_case(HERODOTUS_READER *r, const uint_least16_t *major,
311 const int_least32_t *minor, const struct special_case *sc,
312 size_t *output)
313 {
314 size_t off, i;
315 bool ret = true;
316 uint_least32_t cp;
317 int_least32_t map;
318
319 for (; herodotus_read_codepoint(r, false, &cp) ==
320 HERODOTUS_STATUS_SUCCESS;) {
321 /* get and handle case mapping */
322 if (unlikely((map = get_case_offset(cp, major, minor)) >=
323 INT32_C(0x110000))) {
324 /* we have a special case and the offset in the …
325 * is the difference to 0x110000*/
326 off = (uint_least32_t)map - UINT32_C(0x110000);
327
328 for (i = 0; i < sc[off].cplen; i++) {
329 if (herodotus_read_codepoint(r, false, &…
330 HERODOTUS_STATUS_SUCCESS) {
331 if (cp != sc[off].cp[i]) {
332 ret = false;
333 goto done;
334 } else {
335 /* move forward */
336 herodotus_read_codepoint(
337 r, true, &cp);
338 }
339 } else {
340 /*
341 * input ended and we didn't see
342 * any difference so far, so this
343 * string is in fact okay
344 */
345 ret = true;
346 goto done;
347 }
348 }
349 } else {
350 /* we have a simple mapping */
351 if (cp != (uint_least32_t)((int_least32_t)cp + m…
352 /* we have a difference */
353 ret = false;
354 goto done;
355 } else {
356 /* move forward */
357 herodotus_read_codepoint(r, true, &cp);
358 }
359 }
360 }
361 done:
362 if (output) {
363 *output = herodotus_reader_number_read(r);
364 }
365 return ret;
366 }
367
368 static inline bool
369 is_titlecase(HERODOTUS_READER *r, size_t *output)
370 {
371 enum case_property prop;
372 enum herodotus_status s;
373 bool ret = true;
374 uint_least32_t cp;
375 size_t nwb;
376
377 for (; (nwb = herodotus_next_word_break(r)) > 0;) {
378 herodotus_reader_push_advance_limit(r, nwb);
379 for (; (s = herodotus_read_codepoint(r, false, &cp)) ==
380 HERODOTUS_STATUS_SUCCESS;) {
381 /* check if we have a cased character */
382 prop = get_case_property(cp);
383 if (prop == CASE_PROP_CASED ||
384 prop == CASE_PROP_BOTH_CASED_CASE_IGNORABLE)…
385 break;
386 } else {
387 /* increment reader */
388 herodotus_read_codepoint(r, true, &cp);
389 }
390 }
391
392 if (s == HERODOTUS_STATUS_END_OF_BUFFER) {
393 /* we are done */
394 break;
395 } else if (s == HERODOTUS_STATUS_SOFT_LIMIT_REACHED) {
396 /*
397 * we did not encounter any cased character
398 * up to the word break
399 */
400 herodotus_reader_pop_limit(r);
401 continue;
402 } else {
403 /*
404 * we encountered a cased character before the w…
405 * break, check if it's titlecase
406 */
407 herodotus_reader_push_advance_limit(
408 r, herodotus_reader_next_codepoint_break…
409 if (!is_case(r, title_major, title_minor, title_…
410 NULL)) {
411 ret = false;
412 goto done;
413 }
414 herodotus_reader_pop_limit(r);
415 }
416
417 /* check if the rest of the codepoints in the word are l…
418 */
419 if (!is_case(r, lower_major, lower_minor, lower_special,
420 NULL)) {
421 ret = false;
422 goto done;
423 }
424
425 /* remove the limit on the word before the next iteratio…
426 herodotus_reader_pop_limit(r);
427 }
428 done:
429 if (output) {
430 *output = herodotus_reader_number_read(r);
431 }
432 return ret;
433 }
434
435 bool
436 grapheme_is_uppercase(const uint_least32_t *src, size_t srclen, size_t *…
437 {
438 HERODOTUS_READER r;
439
440 herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
441
442 return is_case(&r, upper_major, upper_minor, upper_special, case…
443 }
444
445 bool
446 grapheme_is_lowercase(const uint_least32_t *src, size_t srclen, size_t *…
447 {
448 HERODOTUS_READER r;
449
450 herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
451
452 return is_case(&r, lower_major, lower_minor, lower_special, case…
453 }
454
455 bool
456 grapheme_is_titlecase(const uint_least32_t *src, size_t srclen, size_t *…
457 {
458 HERODOTUS_READER r;
459
460 herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
461
462 return is_titlecase(&r, caselen);
463 }
464
465 bool
466 grapheme_is_uppercase_utf8(const char *src, size_t srclen, size_t *casel…
467 {
468 HERODOTUS_READER r;
469
470 herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
471
472 return is_case(&r, upper_major, upper_minor, upper_special, case…
473 }
474
475 bool
476 grapheme_is_lowercase_utf8(const char *src, size_t srclen, size_t *casel…
477 {
478 HERODOTUS_READER r;
479
480 herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
481
482 return is_case(&r, lower_major, lower_minor, lower_special, case…
483 }
484
485 bool
486 grapheme_is_titlecase_utf8(const char *src, size_t srclen, size_t *casel…
487 {
488 HERODOTUS_READER r;
489
490 herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
491
492 return is_titlecase(&r, caselen);
493 }
You are viewing proxied material from suckless.org. The copyright of proxied material belongs to its original authors. Any comments or complaints in relation to proxied material should be directed to the original authors of the content concerned. Please see the disclaimer for more details.