| util.c - libgrapheme - unicode string library | |
| git clone git://git.suckless.org/libgrapheme | |
| Log | |
| Files | |
| Refs | |
| README | |
| LICENSE | |
| --- | |
| util.c (11480B) | |
| --- | |
| 1 /* See LICENSE file for copyright and license details. */ | |
| 2 #include <limits.h> | |
| 3 #include <stdbool.h> | |
| 4 #include <stddef.h> | |
| 5 #include <stdint.h> | |
| 6 | |
| 7 #include "../gen/types.h" | |
| 8 #include "../grapheme.h" | |
| 9 #include "util.h" | |
| 10 | |
| 11 void | |
| 12 herodotus_reader_init(HERODOTUS_READER *r, enum herodotus_type type, | |
| 13 const void *src, size_t srclen) | |
| 14 { | |
| 15 size_t i; | |
| 16 | |
| 17 r->type = type; | |
| 18 r->src = src; | |
| 19 r->srclen = srclen; | |
| 20 r->off = 0; | |
| 21 r->terminated_by_null = false; | |
| 22 | |
| 23 for (i = 0; i < LEN(r->soft_limit); i++) { | |
| 24 r->soft_limit[i] = SIZE_MAX; | |
| 25 } | |
| 26 } | |
| 27 | |
| 28 void | |
| 29 herodotus_reader_copy(const HERODOTUS_READER *src, HERODOTUS_READER *des… | |
| 30 { | |
| 31 size_t i; | |
| 32 | |
| 33 /* | |
| 34 * we copy such that we have a "fresh" start and build on the | |
| 35 * fact that src->soft_limit[i] for any i and src->srclen are | |
| 36 * always larger or equal to src->off | |
| 37 */ | |
| 38 dest->type = src->type; | |
| 39 if (src->type == HERODOTUS_TYPE_CODEPOINT) { | |
| 40 dest->src = | |
| 41 (src->src == NULL) ? | |
| 42 NULL : | |
| 43 ((const uint_least32_t *)(src->src)) + s… | |
| 44 } else { /* src->type == HERODOTUS_TYPE_UTF8 */ | |
| 45 dest->src = (src->src == NULL) ? | |
| 46 NULL : | |
| 47 ((const char *)(src->src)) + src->of… | |
| 48 } | |
| 49 if (src->srclen == SIZE_MAX) { | |
| 50 dest->srclen = SIZE_MAX; | |
| 51 } else { | |
| 52 dest->srclen = | |
| 53 (src->off < src->srclen) ? src->srclen - src->of… | |
| 54 } | |
| 55 dest->off = 0; | |
| 56 dest->terminated_by_null = src->terminated_by_null; | |
| 57 | |
| 58 for (i = 0; i < LEN(src->soft_limit); i++) { | |
| 59 if (src->soft_limit[i] == SIZE_MAX) { | |
| 60 dest->soft_limit[i] = SIZE_MAX; | |
| 61 } else { | |
| 62 /* | |
| 63 * if we have a degenerate case where the offset… | |
| 64 * higher than the soft-limit, we simply clamp t… | |
| 65 * soft-limit to zero given we can't decide here | |
| 66 * to release the limit and, instead, we just | |
| 67 * prevent any more reads | |
| 68 */ | |
| 69 dest->soft_limit[i] = | |
| 70 (src->off < src->soft_limit[i]) ? | |
| 71 src->soft_limit[i] - src->off : | |
| 72 0; | |
| 73 } | |
| 74 } | |
| 75 } | |
| 76 | |
| 77 void | |
| 78 herodotus_reader_push_advance_limit(HERODOTUS_READER *r, size_t count) | |
| 79 { | |
| 80 size_t i; | |
| 81 | |
| 82 for (i = LEN(r->soft_limit) - 1; i >= 1; i--) { | |
| 83 r->soft_limit[i] = r->soft_limit[i - 1]; | |
| 84 } | |
| 85 r->soft_limit[0] = r->off + count; | |
| 86 } | |
| 87 | |
| 88 void | |
| 89 herodotus_reader_pop_limit(HERODOTUS_READER *r) | |
| 90 { | |
| 91 size_t i; | |
| 92 | |
| 93 for (i = 0; i < LEN(r->soft_limit) - 1; i++) { | |
| 94 r->soft_limit[i] = r->soft_limit[i + 1]; | |
| 95 } | |
| 96 r->soft_limit[LEN(r->soft_limit) - 1] = SIZE_MAX; | |
| 97 } | |
| 98 | |
| 99 size_t | |
| 100 herodotus_reader_next_word_break(const HERODOTUS_READER *r) | |
| 101 { | |
| 102 if (r->type == HERODOTUS_TYPE_CODEPOINT) { | |
| 103 return grapheme_next_word_break( | |
| 104 (const uint_least32_t *)(r->src) + r->off, | |
| 105 MIN(r->srclen, r->soft_limit[0]) - r->off); | |
| 106 } else { /* r->type == HERODOTUS_TYPE_UTF8 */ | |
| 107 return grapheme_next_word_break_utf8( | |
| 108 (const char *)(r->src) + r->off, | |
| 109 MIN(r->srclen, r->soft_limit[0]) - r->off); | |
| 110 } | |
| 111 } | |
| 112 | |
| 113 size_t | |
| 114 herodotus_reader_next_codepoint_break(const HERODOTUS_READER *r) | |
| 115 { | |
| 116 if (r->type == HERODOTUS_TYPE_CODEPOINT) { | |
| 117 return (r->off < MIN(r->srclen, r->soft_limit[0])) ? 1 :… | |
| 118 } else { /* r->type == HERODOTUS_TYPE_UTF8 */ | |
| 119 return grapheme_decode_utf8( | |
| 120 (const char *)(r->src) + r->off, | |
| 121 MIN(r->srclen, r->soft_limit[0]) - r->off, NULL); | |
| 122 } | |
| 123 } | |
| 124 | |
| 125 size_t | |
| 126 herodotus_reader_number_read(const HERODOTUS_READER *r) | |
| 127 { | |
| 128 return r->off; | |
| 129 } | |
| 130 | |
| 131 enum herodotus_status | |
| 132 herodotus_read_codepoint(HERODOTUS_READER *r, bool advance, uint_least32… | |
| 133 { | |
| 134 size_t ret; | |
| 135 | |
| 136 if (r->terminated_by_null || r->off >= r->srclen || r->src == NU… | |
| 137 *cp = GRAPHEME_INVALID_CODEPOINT; | |
| 138 return HERODOTUS_STATUS_END_OF_BUFFER; | |
| 139 } | |
| 140 | |
| 141 if (r->off >= r->soft_limit[0]) { | |
| 142 *cp = GRAPHEME_INVALID_CODEPOINT; | |
| 143 return HERODOTUS_STATUS_SOFT_LIMIT_REACHED; | |
| 144 } | |
| 145 | |
| 146 if (r->type == HERODOTUS_TYPE_CODEPOINT) { | |
| 147 *cp = ((const uint_least32_t *)(r->src))[r->off]; | |
| 148 ret = 1; | |
| 149 } else { /* r->type == HERODOTUS_TYPE_UTF8 */ | |
| 150 ret = grapheme_decode_utf8( | |
| 151 (const char *)r->src + r->off, | |
| 152 MIN(r->srclen, r->soft_limit[0]) - r->off, cp); | |
| 153 } | |
| 154 | |
| 155 if (unlikely(r->srclen == SIZE_MAX && *cp == 0)) { | |
| 156 /* | |
| 157 * We encountered a null-codepoint. Don't increment | |
| 158 * offset and return as if the buffer had ended here all | |
| 159 * along | |
| 160 */ | |
| 161 r->terminated_by_null = true; | |
| 162 return HERODOTUS_STATUS_END_OF_BUFFER; | |
| 163 } | |
| 164 | |
| 165 if (r->off + ret > MIN(r->srclen, r->soft_limit[0])) { | |
| 166 /* | |
| 167 * we want more than we have; instead of returning | |
| 168 * garbage we terminate here. | |
| 169 */ | |
| 170 return HERODOTUS_STATUS_END_OF_BUFFER; | |
| 171 } | |
| 172 | |
| 173 /* | |
| 174 * Increase offset which we now know won't surpass the limits, | |
| 175 * unless we got told otherwise | |
| 176 */ | |
| 177 if (advance) { | |
| 178 r->off += ret; | |
| 179 } | |
| 180 | |
| 181 return HERODOTUS_STATUS_SUCCESS; | |
| 182 } | |
| 183 | |
| 184 void | |
| 185 herodotus_writer_init(HERODOTUS_WRITER *w, enum herodotus_type type, voi… | |
| 186 size_t destlen) | |
| 187 { | |
| 188 w->type = type; | |
| 189 w->dest = dest; | |
| 190 w->destlen = destlen; | |
| 191 w->off = 0; | |
| 192 w->first_unwritable_offset = SIZE_MAX; | |
| 193 } | |
| 194 | |
| 195 void | |
| 196 herodotus_writer_nul_terminate(HERODOTUS_WRITER *w) | |
| 197 { | |
| 198 if (w->dest == NULL) { | |
| 199 return; | |
| 200 } | |
| 201 | |
| 202 if (w->off < w->destlen) { | |
| 203 /* We still have space in the buffer. Simply use it */ | |
| 204 if (w->type == HERODOTUS_TYPE_CODEPOINT) { | |
| 205 ((uint_least32_t *)(w->dest))[w->off] = 0; | |
| 206 } else { /* w->type == HERODOTUS_TYPE_UTF8 */ | |
| 207 ((char *)(w->dest))[w->off] = '\0'; | |
| 208 } | |
| 209 } else if (w->first_unwritable_offset < w->destlen) { | |
| 210 /* | |
| 211 * There is no more space in the buffer. However, | |
| 212 * we have noted down the first offset we couldn't | |
| 213 * use to write into the buffer and it's smaller than | |
| 214 * destlen. Thus we bailed writing into the | |
| 215 * destination when a multibyte-codepoint couldn't be | |
| 216 * written. So the last "real" byte might be at | |
| 217 * destlen-4, destlen-3, destlen-2 or destlen-1 | |
| 218 * (the last case meaning truncation). | |
| 219 */ | |
| 220 if (w->type == HERODOTUS_TYPE_CODEPOINT) { | |
| 221 ((uint_least32_t | |
| 222 *)(w->dest))[w->first_unwritable_offse… | |
| 223 } else { /* w->type == HERODOTUS_TYPE_UTF8 */ | |
| 224 ((char *)(w->dest))[w->first_unwritable_offset] … | |
| 225 } | |
| 226 } else if (w->destlen > 0) { | |
| 227 /* | |
| 228 * In this case, there is no more space in the buffer and | |
| 229 * the last unwritable offset is larger than | |
| 230 * or equal to the destination buffer length. This means | |
| 231 * that we are forced to simply write into the last | |
| 232 * byte. | |
| 233 */ | |
| 234 if (w->type == HERODOTUS_TYPE_CODEPOINT) { | |
| 235 ((uint_least32_t *)(w->dest))[w->destlen - 1] = … | |
| 236 } else { /* w->type == HERODOTUS_TYPE_UTF8 */ | |
| 237 ((char *)(w->dest))[w->destlen - 1] = '\0'; | |
| 238 } | |
| 239 } | |
| 240 | |
| 241 /* w->off is not incremented in any case */ | |
| 242 } | |
| 243 | |
| 244 size_t | |
| 245 herodotus_writer_number_written(const HERODOTUS_WRITER *w) | |
| 246 { | |
| 247 return w->off; | |
| 248 } | |
| 249 | |
| 250 void | |
| 251 herodotus_write_codepoint(HERODOTUS_WRITER *w, uint_least32_t cp) | |
| 252 { | |
| 253 size_t ret; | |
| 254 | |
| 255 /* | |
| 256 * This function will always faithfully say how many codepoints | |
| 257 * were written, even if the buffer ends. This is used to enable | |
| 258 * truncation detection. | |
| 259 */ | |
| 260 if (w->type == HERODOTUS_TYPE_CODEPOINT) { | |
| 261 if (w->dest != NULL && w->off < w->destlen) { | |
| 262 ((uint_least32_t *)(w->dest))[w->off] = cp; | |
| 263 } | |
| 264 | |
| 265 w->off += 1; | |
| 266 } else { /* w->type == HERODOTUS_TYPE_UTF8 */ | |
| 267 /* | |
| 268 * First determine how many bytes we need to encode the | |
| 269 * codepoint | |
| 270 */ | |
| 271 ret = grapheme_encode_utf8(cp, NULL, 0); | |
| 272 | |
| 273 if (w->dest != NULL && w->off + ret < w->destlen) { | |
| 274 /* we still have enough room in the buffer */ | |
| 275 grapheme_encode_utf8(cp, (char *)(w->dest) + w->… | |
| 276 w->destlen - w->off); | |
| 277 } else if (w->first_unwritable_offset == SIZE_MAX) { | |
| 278 /* | |
| 279 * the first unwritable offset has not been | |
| 280 * noted down, so this is the first time we can't | |
| 281 * write (completely) to an offset | |
| 282 */ | |
| 283 w->first_unwritable_offset = w->off; | |
| 284 } | |
| 285 | |
| 286 w->off += ret; | |
| 287 } | |
| 288 } | |
| 289 | |
| 290 void | |
| 291 proper_init(const HERODOTUS_READER *r, void *state, uint_least8_t no_pro… | |
| 292 uint_least8_t (*get_break_prop)(uint_least32_t), | |
| 293 bool (*is_skippable_prop)(uint_least8_t), | |
| 294 void (*skip_shift_callback)(uint_least8_t, void *), | |
| 295 struct proper *p) | |
| 296 { | |
| 297 uint_least8_t prop; | |
| 298 uint_least32_t cp; | |
| 299 size_t i; | |
| 300 | |
| 301 /* set internal variables */ | |
| 302 p->state = state; | |
| 303 p->no_prop = no_prop; | |
| 304 p->get_break_prop = get_break_prop; | |
| 305 p->is_skippable_prop = is_skippable_prop; | |
| 306 p->skip_shift_callback = skip_shift_callback; | |
| 307 | |
| 308 /* | |
| 309 * Initialize mid-reader, which is basically just there | |
| 310 * to reflect the current position of the viewing-line | |
| 311 */ | |
| 312 herodotus_reader_copy(r, &(p->mid_reader)); | |
| 313 | |
| 314 /* | |
| 315 * In the initialization, we simply (try to) fill in next_prop. | |
| 316 * If we cannot read in more (due to the buffer ending), we | |
| 317 * fill in the prop as invalid | |
| 318 */ | |
| 319 | |
| 320 /* | |
| 321 * initialize the previous properties to have no property | |
| 322 * (given we are at the start of the buffer) | |
| 323 */ | |
| 324 p->raw.prev_prop[1] = p->raw.prev_prop[0] = p->no_prop; | |
| 325 p->skip.prev_prop[1] = p->skip.prev_prop[0] = p->no_prop; | |
| 326 | |
| 327 /* | |
| 328 * initialize the next properties | |
| 329 */ | |
| 330 | |
| 331 /* initialize the raw reader */ | |
| 332 herodotus_reader_copy(r, &(p->raw_reader)); | |
| 333 | |
| 334 /* fill in the two next raw properties (after no-initialization)… | |
| 335 p->raw.next_prop[0] = p->raw.next_prop[1] = p->no_prop; | |
| 336 for (i = 0; | |
| 337 i < 2 && herodotus_read_codepoint(&(p->raw_reader), true, &… | |
| 338 HERODOTUS_STATUS_SUCCESS;) { | |
| 339 p->raw.next_prop[i++] = p->get_break_prop(cp); | |
| 340 } | |
| 341 | |
| 342 /* initialize the skip reader */ | |
| 343 herodotus_reader_copy(r, &(p->skip_reader)); | |
| 344 | |
| 345 /* fill in the two next skip properties (after no-initialization… | |
| 346 p->skip.next_prop[0] = p->skip.next_prop[1] = p->no_prop; | |
| 347 for (i = 0; | |
| 348 i < 2 && herodotus_read_codepoint(&(p->skip_reader), true, … | |
| 349 HERODOTUS_STATUS_SUCCESS;) { | |
| 350 prop = p->get_break_prop(cp); | |
| 351 if (!p->is_skippable_prop(prop)) { | |
| 352 p->skip.next_prop[i++] = prop; | |
| 353 } | |
| 354 } | |
| 355 } | |
| 356 | |
| 357 int | |
| 358 proper_advance(struct proper *p) | |
| 359 { | |
| 360 uint_least8_t prop; | |
| 361 uint_least32_t cp; | |
| 362 | |
| 363 /* read in next "raw" property */ | |
| 364 if (herodotus_read_codepoint(&(p->raw_reader), true, &cp) == | |
| 365 HERODOTUS_STATUS_SUCCESS) { | |
| 366 prop = p->get_break_prop(cp); | |
| 367 } else { | |
| 368 prop = p->no_prop; | |
| 369 } | |
| 370 | |
| 371 /* | |
| 372 * do a shift-in, unless we find that the property that is to | |
| 373 * be moved past the "raw-viewing-line" (this property is stored | |
| 374 * in p->raw.next_prop[0]) is a no_prop, indicating that | |
| 375 * we are at the end of the buffer. | |
| 376 */ | |
| 377 if (p->raw.next_prop[0] == p->no_prop) { | |
| 378 return 1; | |
| 379 } | |
| 380 | |
| 381 /* shift in the properties */ | |
| 382 p->raw.prev_prop[1] = p->raw.prev_prop[0]; | |
| 383 p->raw.prev_prop[0] = p->raw.next_prop[0]; | |
| 384 p->raw.next_prop[0] = p->raw.next_prop[1]; | |
| 385 p->raw.next_prop[1] = prop; | |
| 386 | |
| 387 /* advance the middle reader viewing-line */ | |
| 388 (void)herodotus_read_codepoint(&(p->mid_reader), true, &cp); | |
| 389 | |
| 390 /* check skippability-property */ | |
| 391 if (!p->is_skippable_prop(p->raw.prev_prop[0])) { | |
| 392 /* | |
| 393 * the property that has moved past the "raw-viewing-lin… | |
| 394 * (this property is now (after the raw-shift) stored in | |
| 395 * p->raw.prev_prop[0] and guaranteed not to be a no-pro… | |
| 396 * guaranteeing that we won't shift a no-prop past the | |
| 397 * "viewing-line" in the skip-properties) is not a skipp… | |
| 398 * property, thus we need to shift the skip property as … | |
| 399 */ | |
| 400 p->skip.prev_prop[1] = p->skip.prev_prop[0]; | |
| 401 p->skip.prev_prop[0] = p->skip.next_prop[0]; | |
| 402 p->skip.next_prop[0] = p->skip.next_prop[1]; | |
| 403 | |
| 404 /* | |
| 405 * call the skip-shift-callback on the property that | |
| 406 * passed the skip-viewing-line (this property is now | |
| 407 * stored in p->skip.prev_prop[0]). | |
| 408 */ | |
| 409 p->skip_shift_callback(p->skip.prev_prop[0], p->state); | |
| 410 | |
| 411 /* determine the next shift property */ | |
| 412 p->skip.next_prop[1] = p->no_prop; | |
| 413 while (herodotus_read_codepoint(&(p->skip_reader), true,… | |
| 414 HERODOTUS_STATUS_SUCCESS) { | |
| 415 prop = p->get_break_prop(cp); | |
| 416 if (!p->is_skippable_prop(prop)) { | |
| 417 p->skip.next_prop[1] = prop; | |
| 418 break; | |
| 419 } | |
| 420 } | |
| 421 } | |
| 422 | |
| 423 return 0; | |
| 424 } |