GopherProxy

	util.c - libgrapheme - unicode string library
	git clone git://git.suckless.org/libgrapheme
	Log
	Files
	Refs
	README
	LICENSE
	---
	util.c (11480B)
	---
	1 /* See LICENSE file for copyright and license details. */
	2 #include <limits.h>
	3 #include <stdbool.h>
	4 #include <stddef.h>
	5 #include <stdint.h>
	6
	7 #include "../gen/types.h"
	8 #include "../grapheme.h"
	9 #include "util.h"
	10
	11 void
	12 herodotus_reader_init(HERODOTUS_READER *r, enum herodotus_type type,
	13 const void *src, size_t srclen)
	14 {
	15 size_t i;
	16
	17 r->type = type;
	18 r->src = src;
	19 r->srclen = srclen;
	20 r->off = 0;
	21 r->terminated_by_null = false;
	22
	23 for (i = 0; i < LEN(r->soft_limit); i++) {
	24 r->soft_limit[i] = SIZE_MAX;
	25 }
	26 }
	27
	28 void
	29 herodotus_reader_copy(const HERODOTUS_READER src, HERODOTUS_READER des…
	30 {
	31 size_t i;
	32
	33 /*
	34 * we copy such that we have a "fresh" start and build on the
	35 * fact that src->soft_limit[i] for any i and src->srclen are
	36 * always larger or equal to src->off
	37 */
	38 dest->type = src->type;
	39 if (src->type == HERODOTUS_TYPE_CODEPOINT) {
	40 dest->src =
	41 (src->src == NULL) ?
	42 NULL :
	43 ((const uint_least32_t *)(src->src)) + s…
	44 } else { /* src->type == HERODOTUS_TYPE_UTF8 */
	45 dest->src = (src->src == NULL) ?
	46 NULL :
	47 ((const char *)(src->src)) + src->of…
	48 }
	49 if (src->srclen == SIZE_MAX) {
	50 dest->srclen = SIZE_MAX;
	51 } else {
	52 dest->srclen =
	53 (src->off < src->srclen) ? src->srclen - src->of…
	54 }
	55 dest->off = 0;
	56 dest->terminated_by_null = src->terminated_by_null;
	57
	58 for (i = 0; i < LEN(src->soft_limit); i++) {
	59 if (src->soft_limit[i] == SIZE_MAX) {
	60 dest->soft_limit[i] = SIZE_MAX;
	61 } else {
	62 /*
	63 * if we have a degenerate case where the offset…
	64 * higher than the soft-limit, we simply clamp t…
	65 * soft-limit to zero given we can't decide here
	66 * to release the limit and, instead, we just
	67 * prevent any more reads
	68 */
	69 dest->soft_limit[i] =
	70 (src->off < src->soft_limit[i]) ?
	71 src->soft_limit[i] - src->off :
	72 0;
	73 }
	74 }
	75 }
	76
	77 void
	78 herodotus_reader_push_advance_limit(HERODOTUS_READER *r, size_t count)
	79 {
	80 size_t i;
	81
	82 for (i = LEN(r->soft_limit) - 1; i >= 1; i--) {
	83 r->soft_limit[i] = r->soft_limit[i - 1];
	84 }
	85 r->soft_limit[0] = r->off + count;
	86 }
	87
	88 void
	89 herodotus_reader_pop_limit(HERODOTUS_READER *r)
	90 {
	91 size_t i;
	92
	93 for (i = 0; i < LEN(r->soft_limit) - 1; i++) {
	94 r->soft_limit[i] = r->soft_limit[i + 1];
	95 }
	96 r->soft_limit[LEN(r->soft_limit) - 1] = SIZE_MAX;
	97 }
	98
	99 size_t
	100 herodotus_reader_next_word_break(const HERODOTUS_READER *r)
	101 {
	102 if (r->type == HERODOTUS_TYPE_CODEPOINT) {
	103 return grapheme_next_word_break(
	104 (const uint_least32_t *)(r->src) + r->off,
	105 MIN(r->srclen, r->soft_limit[0]) - r->off);
	106 } else { /* r->type == HERODOTUS_TYPE_UTF8 */
	107 return grapheme_next_word_break_utf8(
	108 (const char *)(r->src) + r->off,
	109 MIN(r->srclen, r->soft_limit[0]) - r->off);
	110 }
	111 }
	112
	113 size_t
	114 herodotus_reader_next_codepoint_break(const HERODOTUS_READER *r)
	115 {
	116 if (r->type == HERODOTUS_TYPE_CODEPOINT) {
	117 return (r->off < MIN(r->srclen, r->soft_limit[0])) ? 1 :…
	118 } else { /* r->type == HERODOTUS_TYPE_UTF8 */
	119 return grapheme_decode_utf8(
	120 (const char *)(r->src) + r->off,
	121 MIN(r->srclen, r->soft_limit[0]) - r->off, NULL);
	122 }
	123 }
	124
	125 size_t
	126 herodotus_reader_number_read(const HERODOTUS_READER *r)
	127 {
	128 return r->off;
	129 }
	130
	131 enum herodotus_status
	132 herodotus_read_codepoint(HERODOTUS_READER *r, bool advance, uint_least32…
	133 {
	134 size_t ret;
	135
	136 if (r->terminated_by_null \|\| r->off >= r->srclen \|\| r->src == NU…
	137 *cp = GRAPHEME_INVALID_CODEPOINT;
	138 return HERODOTUS_STATUS_END_OF_BUFFER;
	139 }
	140
	141 if (r->off >= r->soft_limit[0]) {
	142 *cp = GRAPHEME_INVALID_CODEPOINT;
	143 return HERODOTUS_STATUS_SOFT_LIMIT_REACHED;
	144 }
	145
	146 if (r->type == HERODOTUS_TYPE_CODEPOINT) {
	147 cp = ((const uint_least32_t )(r->src))[r->off];
	148 ret = 1;
	149 } else { /* r->type == HERODOTUS_TYPE_UTF8 */
	150 ret = grapheme_decode_utf8(
	151 (const char *)r->src + r->off,
	152 MIN(r->srclen, r->soft_limit[0]) - r->off, cp);
	153 }
	154
	155 if (unlikely(r->srclen == SIZE_MAX && *cp == 0)) {
	156 /*
	157 * We encountered a null-codepoint. Don't increment
	158 * offset and return as if the buffer had ended here all
	159 * along
	160 */
	161 r->terminated_by_null = true;
	162 return HERODOTUS_STATUS_END_OF_BUFFER;
	163 }
	164
	165 if (r->off + ret > MIN(r->srclen, r->soft_limit[0])) {
	166 /*
	167 * we want more than we have; instead of returning
	168 * garbage we terminate here.
	169 */
	170 return HERODOTUS_STATUS_END_OF_BUFFER;
	171 }
	172
	173 /*
	174 * Increase offset which we now know won't surpass the limits,
	175 * unless we got told otherwise
	176 */
	177 if (advance) {
	178 r->off += ret;
	179 }
	180
	181 return HERODOTUS_STATUS_SUCCESS;
	182 }
	183
	184 void
	185 herodotus_writer_init(HERODOTUS_WRITER *w, enum herodotus_type type, voi…
	186 size_t destlen)
	187 {
	188 w->type = type;
	189 w->dest = dest;
	190 w->destlen = destlen;
	191 w->off = 0;
	192 w->first_unwritable_offset = SIZE_MAX;
	193 }
	194
	195 void
	196 herodotus_writer_nul_terminate(HERODOTUS_WRITER *w)
	197 {
	198 if (w->dest == NULL) {
	199 return;
	200 }
	201
	202 if (w->off < w->destlen) {
	203 /* We still have space in the buffer. Simply use it */
	204 if (w->type == HERODOTUS_TYPE_CODEPOINT) {
	205 ((uint_least32_t *)(w->dest))[w->off] = 0;
	206 } else { /* w->type == HERODOTUS_TYPE_UTF8 */
	207 ((char *)(w->dest))[w->off] = '\0';
	208 }
	209 } else if (w->first_unwritable_offset < w->destlen) {
	210 /*
	211 * There is no more space in the buffer. However,
	212 * we have noted down the first offset we couldn't
	213 * use to write into the buffer and it's smaller than
	214 * destlen. Thus we bailed writing into the
	215 * destination when a multibyte-codepoint couldn't be
	216 * written. So the last "real" byte might be at
	217 * destlen-4, destlen-3, destlen-2 or destlen-1
	218 * (the last case meaning truncation).
	219 */
	220 if (w->type == HERODOTUS_TYPE_CODEPOINT) {
	221 ((uint_least32_t
	222 *)(w->dest))[w->first_unwritable_offse…
	223 } else { /* w->type == HERODOTUS_TYPE_UTF8 */
	224 ((char *)(w->dest))[w->first_unwritable_offset] …
	225 }
	226 } else if (w->destlen > 0) {
	227 /*
	228 * In this case, there is no more space in the buffer and
	229 * the last unwritable offset is larger than
	230 * or equal to the destination buffer length. This means
	231 * that we are forced to simply write into the last
	232 * byte.
	233 */
	234 if (w->type == HERODOTUS_TYPE_CODEPOINT) {
	235 ((uint_least32_t *)(w->dest))[w->destlen - 1] = …
	236 } else { /* w->type == HERODOTUS_TYPE_UTF8 */
	237 ((char *)(w->dest))[w->destlen - 1] = '\0';
	238 }
	239 }
	240
	241 /* w->off is not incremented in any case */
	242 }
	243
	244 size_t
	245 herodotus_writer_number_written(const HERODOTUS_WRITER *w)
	246 {
	247 return w->off;
	248 }
	249
	250 void
	251 herodotus_write_codepoint(HERODOTUS_WRITER *w, uint_least32_t cp)
	252 {
	253 size_t ret;
	254
	255 /*
	256 * This function will always faithfully say how many codepoints
	257 * were written, even if the buffer ends. This is used to enable
	258 * truncation detection.
	259 */
	260 if (w->type == HERODOTUS_TYPE_CODEPOINT) {
	261 if (w->dest != NULL && w->off < w->destlen) {
	262 ((uint_least32_t *)(w->dest))[w->off] = cp;
	263 }
	264
	265 w->off += 1;
	266 } else { /* w->type == HERODOTUS_TYPE_UTF8 */
	267 /*
	268 * First determine how many bytes we need to encode the
	269 * codepoint
	270 */
	271 ret = grapheme_encode_utf8(cp, NULL, 0);
	272
	273 if (w->dest != NULL && w->off + ret < w->destlen) {
	274 /* we still have enough room in the buffer */
	275 grapheme_encode_utf8(cp, (char *)(w->dest) + w->…
	276 w->destlen - w->off);
	277 } else if (w->first_unwritable_offset == SIZE_MAX) {
	278 /*
	279 * the first unwritable offset has not been
	280 * noted down, so this is the first time we can't
	281 * write (completely) to an offset
	282 */
	283 w->first_unwritable_offset = w->off;
	284 }
	285
	286 w->off += ret;
	287 }
	288 }
	289
	290 void
	291 proper_init(const HERODOTUS_READER r, void state, uint_least8_t no_pro…
	292 uint_least8_t (*get_break_prop)(uint_least32_t),
	293 bool (*is_skippable_prop)(uint_least8_t),
	294 void (skip_shift_callback)(uint_least8_t, void ),
	295 struct proper *p)
	296 {
	297 uint_least8_t prop;
	298 uint_least32_t cp;
	299 size_t i;
	300
	301 /* set internal variables */
	302 p->state = state;
	303 p->no_prop = no_prop;
	304 p->get_break_prop = get_break_prop;
	305 p->is_skippable_prop = is_skippable_prop;
	306 p->skip_shift_callback = skip_shift_callback;
	307
	308 /*
	309 * Initialize mid-reader, which is basically just there
	310 * to reflect the current position of the viewing-line
	311 */
	312 herodotus_reader_copy(r, &(p->mid_reader));
	313
	314 /*
	315 * In the initialization, we simply (try to) fill in next_prop.
	316 * If we cannot read in more (due to the buffer ending), we
	317 * fill in the prop as invalid
	318 */
	319
	320 /*
	321 * initialize the previous properties to have no property
	322 * (given we are at the start of the buffer)
	323 */
	324 p->raw.prev_prop[1] = p->raw.prev_prop[0] = p->no_prop;
	325 p->skip.prev_prop[1] = p->skip.prev_prop[0] = p->no_prop;
	326
	327 /*
	328 * initialize the next properties
	329 */
	330
	331 /* initialize the raw reader */
	332 herodotus_reader_copy(r, &(p->raw_reader));
	333
	334 /* fill in the two next raw properties (after no-initialization)…
	335 p->raw.next_prop[0] = p->raw.next_prop[1] = p->no_prop;
	336 for (i = 0;
	337 i < 2 && herodotus_read_codepoint(&(p->raw_reader), true, &…
	338 HERODOTUS_STATUS_SUCCESS;) {
	339 p->raw.next_prop[i++] = p->get_break_prop(cp);
	340 }
	341
	342 /* initialize the skip reader */
	343 herodotus_reader_copy(r, &(p->skip_reader));
	344
	345 /* fill in the two next skip properties (after no-initialization…
	346 p->skip.next_prop[0] = p->skip.next_prop[1] = p->no_prop;
	347 for (i = 0;
	348 i < 2 && herodotus_read_codepoint(&(p->skip_reader), true, …
	349 HERODOTUS_STATUS_SUCCESS;) {
	350 prop = p->get_break_prop(cp);
	351 if (!p->is_skippable_prop(prop)) {
	352 p->skip.next_prop[i++] = prop;
	353 }
	354 }
	355 }
	356
	357 int
	358 proper_advance(struct proper *p)
	359 {
	360 uint_least8_t prop;
	361 uint_least32_t cp;
	362
	363 /* read in next "raw" property */
	364 if (herodotus_read_codepoint(&(p->raw_reader), true, &cp) ==
	365 HERODOTUS_STATUS_SUCCESS) {
	366 prop = p->get_break_prop(cp);
	367 } else {
	368 prop = p->no_prop;
	369 }
	370
	371 /*
	372 * do a shift-in, unless we find that the property that is to
	373 * be moved past the "raw-viewing-line" (this property is stored
	374 * in p->raw.next_prop[0]) is a no_prop, indicating that
	375 * we are at the end of the buffer.
	376 */
	377 if (p->raw.next_prop[0] == p->no_prop) {
	378 return 1;
	379 }
	380
	381 /* shift in the properties */
	382 p->raw.prev_prop[1] = p->raw.prev_prop[0];
	383 p->raw.prev_prop[0] = p->raw.next_prop[0];
	384 p->raw.next_prop[0] = p->raw.next_prop[1];
	385 p->raw.next_prop[1] = prop;
	386
	387 /* advance the middle reader viewing-line */
	388 (void)herodotus_read_codepoint(&(p->mid_reader), true, &cp);
	389
	390 /* check skippability-property */
	391 if (!p->is_skippable_prop(p->raw.prev_prop[0])) {
	392 /*
	393 * the property that has moved past the "raw-viewing-lin…
	394 * (this property is now (after the raw-shift) stored in
	395 * p->raw.prev_prop[0] and guaranteed not to be a no-pro…
	396 * guaranteeing that we won't shift a no-prop past the
	397 * "viewing-line" in the skip-properties) is not a skipp…
	398 * property, thus we need to shift the skip property as …
	399 */
	400 p->skip.prev_prop[1] = p->skip.prev_prop[0];
	401 p->skip.prev_prop[0] = p->skip.next_prop[0];
	402 p->skip.next_prop[0] = p->skip.next_prop[1];
	403
	404 /*
	405 * call the skip-shift-callback on the property that
	406 * passed the skip-viewing-line (this property is now
	407 * stored in p->skip.prev_prop[0]).
	408 */
	409 p->skip_shift_callback(p->skip.prev_prop[0], p->state);
	410
	411 /* determine the next shift property */
	412 p->skip.next_prop[1] = p->no_prop;
	413 while (herodotus_read_codepoint(&(p->skip_reader), true,…
	414 HERODOTUS_STATUS_SUCCESS) {
	415 prop = p->get_break_prop(cp);
	416 if (!p->is_skippable_prop(prop)) {
	417 p->skip.next_prop[1] = prop;
	418 break;
	419 }
	420 }
	421 }
	422
	423 return 0;
	424 }