util.c - libgrapheme - unicode string library | |
git clone git://git.suckless.org/libgrapheme | |
Log | |
Files | |
Refs | |
README | |
LICENSE | |
--- | |
util.c (11480B) | |
--- | |
1 /* See LICENSE file for copyright and license details. */ | |
2 #include <limits.h> | |
3 #include <stdbool.h> | |
4 #include <stddef.h> | |
5 #include <stdint.h> | |
6 | |
7 #include "../gen/types.h" | |
8 #include "../grapheme.h" | |
9 #include "util.h" | |
10 | |
11 void | |
12 herodotus_reader_init(HERODOTUS_READER *r, enum herodotus_type type, | |
13 const void *src, size_t srclen) | |
14 { | |
15 size_t i; | |
16 | |
17 r->type = type; | |
18 r->src = src; | |
19 r->srclen = srclen; | |
20 r->off = 0; | |
21 r->terminated_by_null = false; | |
22 | |
23 for (i = 0; i < LEN(r->soft_limit); i++) { | |
24 r->soft_limit[i] = SIZE_MAX; | |
25 } | |
26 } | |
27 | |
28 void | |
29 herodotus_reader_copy(const HERODOTUS_READER *src, HERODOTUS_READER *des… | |
30 { | |
31 size_t i; | |
32 | |
33 /* | |
34 * we copy such that we have a "fresh" start and build on the | |
35 * fact that src->soft_limit[i] for any i and src->srclen are | |
36 * always larger or equal to src->off | |
37 */ | |
38 dest->type = src->type; | |
39 if (src->type == HERODOTUS_TYPE_CODEPOINT) { | |
40 dest->src = | |
41 (src->src == NULL) ? | |
42 NULL : | |
43 ((const uint_least32_t *)(src->src)) + s… | |
44 } else { /* src->type == HERODOTUS_TYPE_UTF8 */ | |
45 dest->src = (src->src == NULL) ? | |
46 NULL : | |
47 ((const char *)(src->src)) + src->of… | |
48 } | |
49 if (src->srclen == SIZE_MAX) { | |
50 dest->srclen = SIZE_MAX; | |
51 } else { | |
52 dest->srclen = | |
53 (src->off < src->srclen) ? src->srclen - src->of… | |
54 } | |
55 dest->off = 0; | |
56 dest->terminated_by_null = src->terminated_by_null; | |
57 | |
58 for (i = 0; i < LEN(src->soft_limit); i++) { | |
59 if (src->soft_limit[i] == SIZE_MAX) { | |
60 dest->soft_limit[i] = SIZE_MAX; | |
61 } else { | |
62 /* | |
63 * if we have a degenerate case where the offset… | |
64 * higher than the soft-limit, we simply clamp t… | |
65 * soft-limit to zero given we can't decide here | |
66 * to release the limit and, instead, we just | |
67 * prevent any more reads | |
68 */ | |
69 dest->soft_limit[i] = | |
70 (src->off < src->soft_limit[i]) ? | |
71 src->soft_limit[i] - src->off : | |
72 0; | |
73 } | |
74 } | |
75 } | |
76 | |
77 void | |
78 herodotus_reader_push_advance_limit(HERODOTUS_READER *r, size_t count) | |
79 { | |
80 size_t i; | |
81 | |
82 for (i = LEN(r->soft_limit) - 1; i >= 1; i--) { | |
83 r->soft_limit[i] = r->soft_limit[i - 1]; | |
84 } | |
85 r->soft_limit[0] = r->off + count; | |
86 } | |
87 | |
88 void | |
89 herodotus_reader_pop_limit(HERODOTUS_READER *r) | |
90 { | |
91 size_t i; | |
92 | |
93 for (i = 0; i < LEN(r->soft_limit) - 1; i++) { | |
94 r->soft_limit[i] = r->soft_limit[i + 1]; | |
95 } | |
96 r->soft_limit[LEN(r->soft_limit) - 1] = SIZE_MAX; | |
97 } | |
98 | |
99 size_t | |
100 herodotus_reader_next_word_break(const HERODOTUS_READER *r) | |
101 { | |
102 if (r->type == HERODOTUS_TYPE_CODEPOINT) { | |
103 return grapheme_next_word_break( | |
104 (const uint_least32_t *)(r->src) + r->off, | |
105 MIN(r->srclen, r->soft_limit[0]) - r->off); | |
106 } else { /* r->type == HERODOTUS_TYPE_UTF8 */ | |
107 return grapheme_next_word_break_utf8( | |
108 (const char *)(r->src) + r->off, | |
109 MIN(r->srclen, r->soft_limit[0]) - r->off); | |
110 } | |
111 } | |
112 | |
113 size_t | |
114 herodotus_reader_next_codepoint_break(const HERODOTUS_READER *r) | |
115 { | |
116 if (r->type == HERODOTUS_TYPE_CODEPOINT) { | |
117 return (r->off < MIN(r->srclen, r->soft_limit[0])) ? 1 :… | |
118 } else { /* r->type == HERODOTUS_TYPE_UTF8 */ | |
119 return grapheme_decode_utf8( | |
120 (const char *)(r->src) + r->off, | |
121 MIN(r->srclen, r->soft_limit[0]) - r->off, NULL); | |
122 } | |
123 } | |
124 | |
125 size_t | |
126 herodotus_reader_number_read(const HERODOTUS_READER *r) | |
127 { | |
128 return r->off; | |
129 } | |
130 | |
131 enum herodotus_status | |
132 herodotus_read_codepoint(HERODOTUS_READER *r, bool advance, uint_least32… | |
133 { | |
134 size_t ret; | |
135 | |
136 if (r->terminated_by_null || r->off >= r->srclen || r->src == NU… | |
137 *cp = GRAPHEME_INVALID_CODEPOINT; | |
138 return HERODOTUS_STATUS_END_OF_BUFFER; | |
139 } | |
140 | |
141 if (r->off >= r->soft_limit[0]) { | |
142 *cp = GRAPHEME_INVALID_CODEPOINT; | |
143 return HERODOTUS_STATUS_SOFT_LIMIT_REACHED; | |
144 } | |
145 | |
146 if (r->type == HERODOTUS_TYPE_CODEPOINT) { | |
147 *cp = ((const uint_least32_t *)(r->src))[r->off]; | |
148 ret = 1; | |
149 } else { /* r->type == HERODOTUS_TYPE_UTF8 */ | |
150 ret = grapheme_decode_utf8( | |
151 (const char *)r->src + r->off, | |
152 MIN(r->srclen, r->soft_limit[0]) - r->off, cp); | |
153 } | |
154 | |
155 if (unlikely(r->srclen == SIZE_MAX && *cp == 0)) { | |
156 /* | |
157 * We encountered a null-codepoint. Don't increment | |
158 * offset and return as if the buffer had ended here all | |
159 * along | |
160 */ | |
161 r->terminated_by_null = true; | |
162 return HERODOTUS_STATUS_END_OF_BUFFER; | |
163 } | |
164 | |
165 if (r->off + ret > MIN(r->srclen, r->soft_limit[0])) { | |
166 /* | |
167 * we want more than we have; instead of returning | |
168 * garbage we terminate here. | |
169 */ | |
170 return HERODOTUS_STATUS_END_OF_BUFFER; | |
171 } | |
172 | |
173 /* | |
174 * Increase offset which we now know won't surpass the limits, | |
175 * unless we got told otherwise | |
176 */ | |
177 if (advance) { | |
178 r->off += ret; | |
179 } | |
180 | |
181 return HERODOTUS_STATUS_SUCCESS; | |
182 } | |
183 | |
184 void | |
185 herodotus_writer_init(HERODOTUS_WRITER *w, enum herodotus_type type, voi… | |
186 size_t destlen) | |
187 { | |
188 w->type = type; | |
189 w->dest = dest; | |
190 w->destlen = destlen; | |
191 w->off = 0; | |
192 w->first_unwritable_offset = SIZE_MAX; | |
193 } | |
194 | |
195 void | |
196 herodotus_writer_nul_terminate(HERODOTUS_WRITER *w) | |
197 { | |
198 if (w->dest == NULL) { | |
199 return; | |
200 } | |
201 | |
202 if (w->off < w->destlen) { | |
203 /* We still have space in the buffer. Simply use it */ | |
204 if (w->type == HERODOTUS_TYPE_CODEPOINT) { | |
205 ((uint_least32_t *)(w->dest))[w->off] = 0; | |
206 } else { /* w->type == HERODOTUS_TYPE_UTF8 */ | |
207 ((char *)(w->dest))[w->off] = '\0'; | |
208 } | |
209 } else if (w->first_unwritable_offset < w->destlen) { | |
210 /* | |
211 * There is no more space in the buffer. However, | |
212 * we have noted down the first offset we couldn't | |
213 * use to write into the buffer and it's smaller than | |
214 * destlen. Thus we bailed writing into the | |
215 * destination when a multibyte-codepoint couldn't be | |
216 * written. So the last "real" byte might be at | |
217 * destlen-4, destlen-3, destlen-2 or destlen-1 | |
218 * (the last case meaning truncation). | |
219 */ | |
220 if (w->type == HERODOTUS_TYPE_CODEPOINT) { | |
221 ((uint_least32_t | |
222 *)(w->dest))[w->first_unwritable_offse… | |
223 } else { /* w->type == HERODOTUS_TYPE_UTF8 */ | |
224 ((char *)(w->dest))[w->first_unwritable_offset] … | |
225 } | |
226 } else if (w->destlen > 0) { | |
227 /* | |
228 * In this case, there is no more space in the buffer and | |
229 * the last unwritable offset is larger than | |
230 * or equal to the destination buffer length. This means | |
231 * that we are forced to simply write into the last | |
232 * byte. | |
233 */ | |
234 if (w->type == HERODOTUS_TYPE_CODEPOINT) { | |
235 ((uint_least32_t *)(w->dest))[w->destlen - 1] = … | |
236 } else { /* w->type == HERODOTUS_TYPE_UTF8 */ | |
237 ((char *)(w->dest))[w->destlen - 1] = '\0'; | |
238 } | |
239 } | |
240 | |
241 /* w->off is not incremented in any case */ | |
242 } | |
243 | |
244 size_t | |
245 herodotus_writer_number_written(const HERODOTUS_WRITER *w) | |
246 { | |
247 return w->off; | |
248 } | |
249 | |
250 void | |
251 herodotus_write_codepoint(HERODOTUS_WRITER *w, uint_least32_t cp) | |
252 { | |
253 size_t ret; | |
254 | |
255 /* | |
256 * This function will always faithfully say how many codepoints | |
257 * were written, even if the buffer ends. This is used to enable | |
258 * truncation detection. | |
259 */ | |
260 if (w->type == HERODOTUS_TYPE_CODEPOINT) { | |
261 if (w->dest != NULL && w->off < w->destlen) { | |
262 ((uint_least32_t *)(w->dest))[w->off] = cp; | |
263 } | |
264 | |
265 w->off += 1; | |
266 } else { /* w->type == HERODOTUS_TYPE_UTF8 */ | |
267 /* | |
268 * First determine how many bytes we need to encode the | |
269 * codepoint | |
270 */ | |
271 ret = grapheme_encode_utf8(cp, NULL, 0); | |
272 | |
273 if (w->dest != NULL && w->off + ret < w->destlen) { | |
274 /* we still have enough room in the buffer */ | |
275 grapheme_encode_utf8(cp, (char *)(w->dest) + w->… | |
276 w->destlen - w->off); | |
277 } else if (w->first_unwritable_offset == SIZE_MAX) { | |
278 /* | |
279 * the first unwritable offset has not been | |
280 * noted down, so this is the first time we can't | |
281 * write (completely) to an offset | |
282 */ | |
283 w->first_unwritable_offset = w->off; | |
284 } | |
285 | |
286 w->off += ret; | |
287 } | |
288 } | |
289 | |
290 void | |
291 proper_init(const HERODOTUS_READER *r, void *state, uint_least8_t no_pro… | |
292 uint_least8_t (*get_break_prop)(uint_least32_t), | |
293 bool (*is_skippable_prop)(uint_least8_t), | |
294 void (*skip_shift_callback)(uint_least8_t, void *), | |
295 struct proper *p) | |
296 { | |
297 uint_least8_t prop; | |
298 uint_least32_t cp; | |
299 size_t i; | |
300 | |
301 /* set internal variables */ | |
302 p->state = state; | |
303 p->no_prop = no_prop; | |
304 p->get_break_prop = get_break_prop; | |
305 p->is_skippable_prop = is_skippable_prop; | |
306 p->skip_shift_callback = skip_shift_callback; | |
307 | |
308 /* | |
309 * Initialize mid-reader, which is basically just there | |
310 * to reflect the current position of the viewing-line | |
311 */ | |
312 herodotus_reader_copy(r, &(p->mid_reader)); | |
313 | |
314 /* | |
315 * In the initialization, we simply (try to) fill in next_prop. | |
316 * If we cannot read in more (due to the buffer ending), we | |
317 * fill in the prop as invalid | |
318 */ | |
319 | |
320 /* | |
321 * initialize the previous properties to have no property | |
322 * (given we are at the start of the buffer) | |
323 */ | |
324 p->raw.prev_prop[1] = p->raw.prev_prop[0] = p->no_prop; | |
325 p->skip.prev_prop[1] = p->skip.prev_prop[0] = p->no_prop; | |
326 | |
327 /* | |
328 * initialize the next properties | |
329 */ | |
330 | |
331 /* initialize the raw reader */ | |
332 herodotus_reader_copy(r, &(p->raw_reader)); | |
333 | |
334 /* fill in the two next raw properties (after no-initialization)… | |
335 p->raw.next_prop[0] = p->raw.next_prop[1] = p->no_prop; | |
336 for (i = 0; | |
337 i < 2 && herodotus_read_codepoint(&(p->raw_reader), true, &… | |
338 HERODOTUS_STATUS_SUCCESS;) { | |
339 p->raw.next_prop[i++] = p->get_break_prop(cp); | |
340 } | |
341 | |
342 /* initialize the skip reader */ | |
343 herodotus_reader_copy(r, &(p->skip_reader)); | |
344 | |
345 /* fill in the two next skip properties (after no-initialization… | |
346 p->skip.next_prop[0] = p->skip.next_prop[1] = p->no_prop; | |
347 for (i = 0; | |
348 i < 2 && herodotus_read_codepoint(&(p->skip_reader), true, … | |
349 HERODOTUS_STATUS_SUCCESS;) { | |
350 prop = p->get_break_prop(cp); | |
351 if (!p->is_skippable_prop(prop)) { | |
352 p->skip.next_prop[i++] = prop; | |
353 } | |
354 } | |
355 } | |
356 | |
357 int | |
358 proper_advance(struct proper *p) | |
359 { | |
360 uint_least8_t prop; | |
361 uint_least32_t cp; | |
362 | |
363 /* read in next "raw" property */ | |
364 if (herodotus_read_codepoint(&(p->raw_reader), true, &cp) == | |
365 HERODOTUS_STATUS_SUCCESS) { | |
366 prop = p->get_break_prop(cp); | |
367 } else { | |
368 prop = p->no_prop; | |
369 } | |
370 | |
371 /* | |
372 * do a shift-in, unless we find that the property that is to | |
373 * be moved past the "raw-viewing-line" (this property is stored | |
374 * in p->raw.next_prop[0]) is a no_prop, indicating that | |
375 * we are at the end of the buffer. | |
376 */ | |
377 if (p->raw.next_prop[0] == p->no_prop) { | |
378 return 1; | |
379 } | |
380 | |
381 /* shift in the properties */ | |
382 p->raw.prev_prop[1] = p->raw.prev_prop[0]; | |
383 p->raw.prev_prop[0] = p->raw.next_prop[0]; | |
384 p->raw.next_prop[0] = p->raw.next_prop[1]; | |
385 p->raw.next_prop[1] = prop; | |
386 | |
387 /* advance the middle reader viewing-line */ | |
388 (void)herodotus_read_codepoint(&(p->mid_reader), true, &cp); | |
389 | |
390 /* check skippability-property */ | |
391 if (!p->is_skippable_prop(p->raw.prev_prop[0])) { | |
392 /* | |
393 * the property that has moved past the "raw-viewing-lin… | |
394 * (this property is now (after the raw-shift) stored in | |
395 * p->raw.prev_prop[0] and guaranteed not to be a no-pro… | |
396 * guaranteeing that we won't shift a no-prop past the | |
397 * "viewing-line" in the skip-properties) is not a skipp… | |
398 * property, thus we need to shift the skip property as … | |
399 */ | |
400 p->skip.prev_prop[1] = p->skip.prev_prop[0]; | |
401 p->skip.prev_prop[0] = p->skip.next_prop[0]; | |
402 p->skip.next_prop[0] = p->skip.next_prop[1]; | |
403 | |
404 /* | |
405 * call the skip-shift-callback on the property that | |
406 * passed the skip-viewing-line (this property is now | |
407 * stored in p->skip.prev_prop[0]). | |
408 */ | |
409 p->skip_shift_callback(p->skip.prev_prop[0], p->state); | |
410 | |
411 /* determine the next shift property */ | |
412 p->skip.next_prop[1] = p->no_prop; | |
413 while (herodotus_read_codepoint(&(p->skip_reader), true,… | |
414 HERODOTUS_STATUS_SUCCESS) { | |
415 prop = p->get_break_prop(cp); | |
416 if (!p->is_skippable_prop(prop)) { | |
417 p->skip.next_prop[1] = prop; | |
418 break; | |
419 } | |
420 } | |
421 } | |
422 | |
423 return 0; | |
424 } |