Add helper structure for reading from and writing into buffers - libgrapheme - … | |
git clone git://git.suckless.org/libgrapheme | |
Log | |
Files | |
Refs | |
README | |
LICENSE | |
--- | |
commit 563eb65bfbaa4f27c77d73ae81b51882c916993d | |
parent 6d0595242a027c1fcb06136e632f6d727388c4ec | |
Author: Laslo Hunhold <[email protected]> | |
Date: Wed, 21 Sep 2022 20:11:55 +0200 | |
Add helper structure for reading from and writing into buffers | |
The logic behind the input and output buffers is quite intricate | |
and leads to numerous subtle bugs that are best handled with a | |
refactoring using an abstraction layer that hides most of the | |
gory details. | |
The Herodotus reader/writer elegantly does all the magic in the | |
background, allowing us to focus on the algorithms in the front | |
instead. This especially helps with handling NUL-terminated strings, | |
as we are guaranteed not to accidentally read too far. | |
Signed-off-by: Laslo Hunhold <[email protected]> | |
Diffstat: | |
M src/util.c | 240 +++++++++++++++++++++++++++++… | |
M src/util.h | 64 +++++++++++++++++++++++++++++… | |
2 files changed, 304 insertions(+), 0 deletions(-) | |
--- | |
diff --git a/src/util.c b/src/util.c | |
@@ -7,6 +7,246 @@ | |
#include "../grapheme.h" | |
#include "util.h" | |
+void | |
+herodotus_reader_init(HERODOTUS_READER *r, enum herodotus_type type, | |
+ const void *src, size_t srclen) | |
+{ | |
+ size_t i; | |
+ | |
+ r->type = type; | |
+ r->src = src; | |
+ r->srclen = srclen; | |
+ r->off = 0; | |
+ r->terminated_by_null = false; | |
+ | |
+ for (i = 0; i < LEN(r->soft_limit); i++) { | |
+ r->soft_limit[i] = SIZE_MAX; | |
+ } | |
+} | |
+ | |
+void | |
+herodotus_reader_copy(const HERODOTUS_READER *src, HERODOTUS_READER *dest) | |
+{ | |
+ size_t i; | |
+ | |
+ dest->type = src->type; | |
+ dest->src = src->src; | |
+ dest->srclen = src->srclen; | |
+ dest->off = src->off; | |
+ dest->terminated_by_null = src->terminated_by_null; | |
+ | |
+ for (i = 0; i < LEN(src->soft_limit); i++) { | |
+ dest->soft_limit[i] = src->soft_limit[i]; | |
+ } | |
+} | |
+ | |
+void | |
+herodotus_reader_push_advance_limit(HERODOTUS_READER *r, size_t count) | |
+{ | |
+ size_t i; | |
+ | |
+ for (i = LEN(r->soft_limit) - 1; i >= 1; i--) { | |
+ r->soft_limit[i] = r->soft_limit[i - 1]; | |
+ } | |
+ r->soft_limit[0] = r->off + count; | |
+} | |
+ | |
+void | |
+herodotus_reader_pop_limit(HERODOTUS_READER *r) | |
+{ | |
+ size_t i; | |
+ | |
+ for (i = 0; i < LEN(r->soft_limit) - 1; i++) { | |
+ r->soft_limit[i] = r->soft_limit[i + 1]; | |
+ } | |
+ r->soft_limit[LEN(r->soft_limit) - 1] = SIZE_MAX; | |
+} | |
+ | |
+size_t | |
+herodotus_reader_next_word_break(const HERODOTUS_READER *r) | |
+{ | |
+ if (r->type == HERODOTUS_TYPE_CODEPOINT) { | |
+ return grapheme_next_word_break( | |
+ (const uint_least32_t *)(r->src) + r->off, | |
+ MIN(r->srclen, r->soft_limit[0]) - r->off); | |
+ } else { /* r->type == HERODOTUS_TYPE_UTF8 */ | |
+ return grapheme_next_word_break_utf8( | |
+ (const char *)(r->src) + r->off, | |
+ MIN(r->srclen, r->soft_limit[0]) - r->off); | |
+ } | |
+} | |
+ | |
+size_t | |
+herodotus_reader_next_codepoint_break(const HERODOTUS_READER *r) | |
+{ | |
+ if (r->type == HERODOTUS_TYPE_CODEPOINT) { | |
+ return (r->off < MIN(r->srclen, r->soft_limit[0])) ? 1 : 0; | |
+ } else { /* r->type == HERODOTUS_TYPE_UTF8 */ | |
+ return grapheme_decode_utf8( | |
+ (const char *)(r->src) + r->off, | |
+ MIN(r->srclen, r->soft_limit[0]) - r->off, NULL); | |
+ } | |
+} | |
+ | |
+enum herodotus_status | |
+herodotus_read_codepoint(HERODOTUS_READER *r, bool advance, uint_least32_t *cp) | |
+{ | |
+ size_t ret; | |
+ | |
+ if (r->terminated_by_null || r->off >= r->srclen || r->src == NULL) { | |
+ *cp = GRAPHEME_INVALID_CODEPOINT; | |
+ return HERODOTUS_STATUS_END_OF_BUFFER; | |
+ } | |
+ | |
+ if (r->off >= r->soft_limit[0]) { | |
+ *cp = GRAPHEME_INVALID_CODEPOINT; | |
+ return HERODOTUS_STATUS_SOFT_LIMIT_REACHED; | |
+ } | |
+ | |
+ if (r->type == HERODOTUS_TYPE_CODEPOINT) { | |
+ *cp = ((const uint_least32_t *)(r->src))[r->off++]; | |
+ } else { /* r->type == HERODOTUS_TYPE_UTF8 */ | |
+ ret = grapheme_decode_utf8((const char *)r->src + r->off, | |
+ MIN(r->srclen, r->soft_limit[0]) - | |
+ r->off, cp); | |
+ | |
+ if (unlikely(r->srclen == SIZE_MAX && *cp == 0)) { | |
+ /* | |
+ * We encountered a NUL-byte. Don't increment | |
+ * offset and return as if the buffer had ended | |
+ * here all along | |
+ */ | |
+ r->terminated_by_null = true; | |
+ return HERODOTUS_STATUS_END_OF_BUFFER; | |
+ } | |
+ | |
+ if (r->off + ret > MIN(r->srclen, r->soft_limit[0])) { | |
+ /* | |
+ * we want more than we have; instead of | |
+ * returning garbage we terminate here. | |
+ */ | |
+ return HERODOTUS_STATUS_END_OF_BUFFER; | |
+ } | |
+ | |
+ /* | |
+ * Increase offset which we now know won't surpass | |
+ * the limits, unless we got told otherwise | |
+ */ | |
+ if (advance) { | |
+ r->off += ret; | |
+ } | |
+ } | |
+ | |
+ return HERODOTUS_STATUS_SUCCESS; | |
+} | |
+ | |
+void | |
+herodotus_writer_init(HERODOTUS_WRITER *w, enum herodotus_type type, | |
+ void *dest, size_t destlen) | |
+{ | |
+ w->type = type; | |
+ w->dest = dest; | |
+ w->destlen = destlen; | |
+ w->off = 0; | |
+ w->first_unwritable_offset = SIZE_MAX; | |
+} | |
+ | |
+void | |
+herodotus_writer_nul_terminate(HERODOTUS_WRITER *w) | |
+{ | |
+ if (w->dest == NULL) { | |
+ return; | |
+ } | |
+ | |
+ if (w->off < w->destlen) { | |
+ /* We still have space in the buffer. Simply use it */ | |
+ if (w->type == HERODOTUS_TYPE_CODEPOINT) { | |
+ ((uint_least32_t *)(w->dest))[w->off] = 0; | |
+ } else { /* w->type == HERODOTUS_TYPE_UTF8 */ | |
+ ((char *)(w->dest))[w->off] = '\0'; | |
+ } | |
+ } else if (w->first_unwritable_offset < w->destlen) { | |
+ /* | |
+ * There is no more space in the buffer. However, | |
+ * we have noted down the first offset we couldn't | |
+ * use to write into the buffer and it's smaller than | |
+ * destlen. Thus we bailed writing into the | |
+ * destination when a multibyte-codepoint couldn't be | |
+ * written. So the last "real" byte might be at | |
+ * destlen-4, destlen-3, destlen-2 or destlen-1 | |
+ * (the last case meaning truncation). | |
+ */ | |
+ if (w->type == HERODOTUS_TYPE_CODEPOINT) { | |
+ ((uint_least32_t *)(w->dest)) | |
+ [w->first_unwritable_offset] = 0; | |
+ } else { /* w->type == HERODOTUS_TYPE_UTF8 */ | |
+ ((char *)(w->dest))[w->first_unwritable_offset] = '\0'; | |
+ } | |
+ } else { | |
+ /* | |
+ * In this case, there is no more space in the buffer and | |
+ * the last unwritable offset is larger than | |
+ * or equal to the destination buffer length. This means | |
+ * that we are forced to simply write into the last | |
+ * byte. | |
+ */ | |
+ if (w->type == HERODOTUS_TYPE_CODEPOINT) { | |
+ ((uint_least32_t *)(w->dest)) | |
+ [w->destlen - 1] = 0; | |
+ } else { /* w->type == HERODOTUS_TYPE_UTF8 */ | |
+ ((char *)(w->dest))[w->destlen - 1] = '\0'; | |
+ } | |
+ } | |
+ | |
+ /* w->off is not incremented in any case */ | |
+} | |
+ | |
+size_t | |
+herodotus_writer_number_written(HERODOTUS_WRITER *w) | |
+{ | |
+ return w->off; | |
+} | |
+ | |
+void | |
+herodotus_write_codepoint(HERODOTUS_WRITER *w, uint_least32_t cp) | |
+{ | |
+ size_t ret; | |
+ | |
+ /* | |
+ * This function will always faithfully say how many codepoints | |
+ * were written, even if the buffer ends. This is used to enable | |
+ * truncation detection. | |
+ */ | |
+ if (w->type == HERODOTUS_TYPE_CODEPOINT) { | |
+ if (w->dest != NULL && w->off < w->destlen) { | |
+ ((uint_least32_t *)(w->dest))[w->off] = cp; | |
+ } | |
+ | |
+ w->off += 1; | |
+ } else { /* w->type == HERODOTUS_TYPE_UTF8 */ | |
+ /* | |
+ * First determine how many bytes we need to encode the | |
+ * codepoint | |
+ */ | |
+ ret = grapheme_encode_utf8(cp, NULL, 0); | |
+ | |
+ if (w->dest != NULL && w->off + ret < w->destlen) { | |
+ /* we still have enough room in the buffer */ | |
+ grapheme_encode_utf8(cp, (char *)(w->dest) + | |
+ w->off, w->destlen - w->off); | |
+ } else if (w->first_unwritable_offset == SIZE_MAX) { | |
+ /* | |
+ * the first unwritable offset has not been | |
+ * noted down, so this is the first time we can't | |
+ * write (completely) to an offset | |
+ */ | |
+ w->first_unwritable_offset = w->off; | |
+ } | |
+ | |
+ w->off += ret; | |
+ } | |
+} | |
+ | |
inline size_t | |
get_codepoint(const void *str, size_t len, size_t offset, uint_least32_t *cp) | |
{ | |
diff --git a/src/util.h b/src/util.h | |
@@ -2,12 +2,16 @@ | |
#ifndef UTIL_H | |
#define UTIL_H | |
+#include <stdbool.h> | |
#include <stddef.h> | |
#include <stdint.h> | |
#include "../gen/types.h" | |
#include "../grapheme.h" | |
+#undef MIN | |
+#define MIN(x,y) ((x) < (y) ? (x) : (y)) | |
+#undef LEN | |
#define LEN(x) (sizeof(x) / sizeof(*(x))) | |
#undef likely | |
@@ -25,6 +29,66 @@ | |
#define unlikely(expr) (expr) | |
#endif | |
+/* | |
+ * Herodotus, the ancient greek historian and geographer, | |
+ * was criticized for including legends and other fantastic | |
+ * accounts into his works, among others by his contemporary | |
+ * Thucydides. | |
+ * | |
+ * The Herodotus readers and writers are tailored towards the needs | |
+ * of the library interface, doing all the dirty work behind the | |
+ * scenes. While the reader is relatively faithful in his accounts, | |
+ * the Herodotus writer will never fail and always claim to write the | |
+ * data. Internally, it only writes as much as it can, and will simply | |
+ * keep account of the rest. This way, we can properly signal truncation. | |
+ * | |
+ * In this sense, explaining the naming, the writer is always a bit | |
+ * inaccurate in his accounts. | |
+ * | |
+ */ | |
+enum herodotus_status { | |
+ HERODOTUS_STATUS_SUCCESS, | |
+ HERODOTUS_STATUS_END_OF_BUFFER, | |
+ HERODOTUS_STATUS_SOFT_LIMIT_REACHED, | |
+}; | |
+ | |
+enum herodotus_type { | |
+ HERODOTUS_TYPE_CODEPOINT, | |
+ HERODOTUS_TYPE_UTF8, | |
+}; | |
+ | |
+typedef struct herodotus_reader { | |
+ enum herodotus_type type; | |
+ const void *src; | |
+ size_t srclen; | |
+ size_t off; | |
+ bool terminated_by_null; | |
+ size_t soft_limit[10]; | |
+} HERODOTUS_READER; | |
+ | |
+typedef struct herodotus_writer { | |
+ enum herodotus_type type; | |
+ void *dest; | |
+ size_t destlen; | |
+ size_t off; | |
+ size_t first_unwritable_offset; | |
+} HERODOTUS_WRITER; | |
+ | |
+void herodotus_reader_init(HERODOTUS_READER *, enum herodotus_type, | |
+ const void *, size_t); | |
+void herodotus_reader_copy(const HERODOTUS_READER *, HERODOTUS_READER *); | |
+void herodotus_reader_push_advance_limit(HERODOTUS_READER *, size_t); | |
+void herodotus_reader_pop_limit(HERODOTUS_READER *); | |
+size_t herodotus_reader_next_word_break(const HERODOTUS_READER *); | |
+size_t herodotus_reader_next_codepoint_break(const HERODOTUS_READER *); | |
+enum herodotus_status herodotus_read_codepoint(HERODOTUS_READER *, bool, uint_… | |
+ | |
+void herodotus_writer_init(HERODOTUS_WRITER *, enum herodotus_type, void *, | |
+ size_t); | |
+void herodotus_writer_nul_terminate(HERODOTUS_WRITER *); | |
+size_t herodotus_writer_number_written(HERODOTUS_WRITER *); | |
+void herodotus_write_codepoint(HERODOTUS_WRITER *, uint_least32_t); | |
+ | |
size_t get_codepoint(const void *, size_t, size_t, uint_least32_t *); | |
size_t get_codepoint_utf8(const void *, size_t, size_t, uint_least32_t *); | |