Convert GRAPHEME_STATE to uint_least16_t and remove it - libgrapheme - unicode … | |
git clone git://git.suckless.org/libgrapheme | |
Log | |
Files | |
Refs | |
README | |
LICENSE | |
--- | |
commit fc73d06fed76dd7cde37d3704949d01391ea0032 | |
parent a815be4b5de7f7df2da664049fdb04874d37016a | |
Author: Laslo Hunhold <[email protected]> | |
Date: Mon, 3 Oct 2022 23:13:26 +0200 | |
Convert GRAPHEME_STATE to uint_least16_t and remove it | |
I was never quite happy with the fact that the internal state-struct | |
was visible in the grapheme.h-header, given the declaration of the | |
fields only namely served internal purposes and were useless noise to | |
the reader. | |
To keep it in was merely a choice made because I had always hoped | |
to be able to implement maybe a few more state-based pairwise | |
segmentation check functions and use the GRAPHEME_STATE type in more | |
places, but now after implementing the algorithms it becomes clear that | |
they all do not satisfy these pairwise semantics. | |
The first logical step was to convert the struct to an uint_least16_t, | |
which provides enough space (at least 16 bits) to store all the complete | |
state, and have internal deserialiation and serialization functions. | |
The remaining question was if the | |
typedef uint_least16_t GRAPHEME_STATE | |
should be removed. I took inspiration from the Linux kernel coding | |
style[0], which in section 5b lays out the exact case of typedeffing | |
an integer that is meant to store flags (just like in our case). | |
It is argued there that there needs to be a good reason to typedef an | |
integer (e.g. given it might change by architecture or maybe change in | |
later versions). Both cases are not given here (we will _never_ need | |
more than 16 bits to store the grapheme cluster break state and you can | |
even reduce more wastage, e.g. for storing the prop which never exceeds | |
4 bits given NUM_CHAR_BREAK_PROPS == 14 < 15 == 2^4-1), and I must admit | |
that it improves readability a bit given you finally know what you're | |
dealing with. | |
The expression | |
GRAPHEME_STATE state = 0; | |
admittedly looks a little fishy, given you don't really know what happens | |
behind the scenes unless you look in the header, and I want all of the | |
semantics to be crystal clear to the end-user. | |
[0]:https://www.kernel.org/doc/html/latest/process/coding-style.html#typedefs | |
Signed-off-by: Laslo Hunhold <[email protected]> | |
Diffstat: | |
M benchmark/character.c | 2 +- | |
M grapheme.h | 9 +-------- | |
M man/grapheme_is_character_break.sh | 4 ++-- | |
M src/character.c | 60 +++++++++++++++++++++++------… | |
4 files changed, 49 insertions(+), 26 deletions(-) | |
--- | |
diff --git a/benchmark/character.c b/benchmark/character.c | |
@@ -23,7 +23,7 @@ struct break_benchmark_payload { | |
void | |
libgrapheme(const void *payload) | |
{ | |
- GRAPHEME_STATE state = { 0 }; | |
+ uint_least16_t state = 0; | |
const struct break_benchmark_payload *p = payload; | |
size_t i; | |
diff --git a/grapheme.h b/grapheme.h | |
@@ -6,19 +6,12 @@ | |
#include <stddef.h> | |
#include <stdint.h> | |
-typedef struct grapheme_internal_segmentation_state { | |
- uint_least8_t prop; | |
- bool prop_set; | |
- bool gb11_flag; | |
- bool gb12_13_flag; | |
-} GRAPHEME_STATE; | |
- | |
#define GRAPHEME_INVALID_CODEPOINT UINT32_C(0xFFFD) | |
size_t grapheme_decode_utf8(const char *, size_t, uint_least32_t *); | |
size_t grapheme_encode_utf8(uint_least32_t, char *, size_t); | |
-bool grapheme_is_character_break(uint_least32_t, uint_least32_t, GRAPHEME_STAT… | |
+bool grapheme_is_character_break(uint_least32_t, uint_least32_t, uint_least16_… | |
bool grapheme_is_lowercase(const uint_least32_t *, size_t, size_t *); | |
bool grapheme_is_uppercase(const uint_least32_t *, size_t, size_t *); | |
diff --git a/man/grapheme_is_character_break.sh b/man/grapheme_is_character_bre… | |
@@ -8,7 +8,7 @@ cat << EOF | |
.Sh SYNOPSIS | |
.In grapheme.h | |
.Ft size_t | |
-.Fn grapheme_is_character_break "uint_least32_t cp1" "uint_least32_t cp2" "GRA… | |
+.Fn grapheme_is_character_break "uint_least32_t cp1" "uint_least32_t cp2" "uin… | |
.Sh DESCRIPTION | |
The | |
.Fn grapheme_is_character_break | |
@@ -52,7 +52,7 @@ if there is not. | |
int | |
main(void) | |
{ | |
- GRAPHEME_STATE state = { 0 }; | |
+ uint_least16_t state = 0; | |
uint_least32_t s1[] = ..., s2[] = ...; /* two input arrays */ | |
size_t i; | |
diff --git a/src/character.c b/src/character.c | |
@@ -7,6 +7,13 @@ | |
#include "../grapheme.h" | |
#include "util.h" | |
+struct character_break_state { | |
+ uint_least8_t prop; | |
+ bool prop_set; | |
+ bool gb11_flag; | |
+ bool gb12_13_flag; | |
+}; | |
+ | |
static const uint_least16_t dont_break[NUM_CHAR_BREAK_PROPS] = { | |
[CHAR_BREAK_PROP_OTHER] = | |
UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ | |
@@ -106,38 +113,59 @@ get_break_prop(uint_least32_t cp) | |
{ | |
if (likely(cp <= 0x10FFFF)) { | |
return (enum char_break_property) | |
- char_break_minor[char_break_major[cp >> 8] + (cp & 0xff… | |
+ char_break_minor[char_break_major[cp >> 8] + (cp & 0xFF… | |
} else { | |
return CHAR_BREAK_PROP_OTHER; | |
} | |
} | |
+static inline void | |
+state_serialize(const struct character_break_state *in, uint_least16_t *out) | |
+{ | |
+ *out = (uint_least16_t)(in->prop & UINT8_C(0xFF)) | … | |
+ (uint_least16_t)(((uint_least16_t)(in->prop_set)) << 8) | … | |
+ (uint_least16_t)(((uint_least16_t)(in->gb11_flag)) << 9) | … | |
+ (uint_least16_t)(((uint_least16_t)(in->gb12_13_flag)) << 10); … | |
+} | |
+ | |
+static inline void | |
+state_deserialize(uint_least16_t in, struct character_break_state *out) | |
+{ | |
+ out->prop = in & UINT8_C(0xFF); | |
+ out->prop_set = in & (((uint_least16_t)(1)) << 8); | |
+ out->gb11_flag = in & (((uint_least16_t)(1)) << 9); | |
+ out->gb12_13_flag = in & (((uint_least16_t)(1)) << 10); | |
+} | |
+ | |
bool | |
-grapheme_is_character_break(uint_least32_t cp0, uint_least32_t cp1, GRAPHEME_S… | |
+grapheme_is_character_break(uint_least32_t cp0, uint_least32_t cp1, uint_least… | |
{ | |
+ struct character_break_state state; | |
enum char_break_property cp0_prop, cp1_prop; | |
bool notbreak = false; | |
- if (likely(state)) { | |
- if (likely(state->prop_set)) { | |
- cp0_prop = state->prop; | |
+ if (likely(s)) { | |
+ state_deserialize(*s, &state); | |
+ | |
+ if (likely(state.prop_set)) { | |
+ cp0_prop = state.prop; | |
} else { | |
cp0_prop = get_break_prop(cp0); | |
} | |
cp1_prop = get_break_prop(cp1); | |
/* preserve prop of right codepoint for next iteration */ | |
- state->prop = (uint_least8_t)cp1_prop; | |
- state->prop_set = true; | |
+ state.prop = (uint_least8_t)cp1_prop; | |
+ state.prop_set = true; | |
/* update flags */ | |
- state->gb11_flag = | |
+ state.gb11_flag = | |
flag_update_gb11[cp0_prop + NUM_CHAR_BREAK_PROPS * | |
- state->gb11_flag] & | |
+ state.gb11_flag] & | |
UINT16_C(1) << cp1_prop; | |
- state->gb12_13_flag = | |
+ state.gb12_13_flag = | |
flag_update_gb12_13[cp0_prop + NUM_CHAR_BREAK_PROPS * | |
- state->gb12_13_flag] & | |
+ state.gb12_13_flag] & | |
UINT16_C(1) << cp1_prop; | |
/* | |
@@ -145,17 +173,19 @@ grapheme_is_character_break(uint_least32_t cp0, uint_leas… | |
* http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_… | |
*/ | |
notbreak = (dont_break[cp0_prop] & (UINT16_C(1) << cp1_prop)) … | |
- (dont_break_gb11[cp0_prop + state->gb11_flag * | |
+ (dont_break_gb11[cp0_prop + state.gb11_flag * | |
NUM_CHAR_BREAK_PROPS] & | |
(UINT16_C(1) << cp1_prop)) || | |
- (dont_break_gb12_13[cp0_prop + state->gb12_13_flag * | |
+ (dont_break_gb12_13[cp0_prop + state.gb12_13_flag * | |
NUM_CHAR_BREAK_PROPS] & | |
(UINT16_C(1) << cp1_prop)); | |
/* update or reset flags (when we have a break) */ | |
if (likely(!notbreak)) { | |
- state->gb11_flag = state->gb12_13_flag = false; | |
+ state.gb11_flag = state.gb12_13_flag = false; | |
} | |
+ | |
+ state_serialize(&state, s); | |
} else { | |
cp0_prop = get_break_prop(cp0); | |
cp1_prop = get_break_prop(cp1); | |
@@ -178,7 +208,7 @@ grapheme_is_character_break(uint_least32_t cp0, uint_least3… | |
static size_t | |
next_character_break(HERODOTUS_READER *r) | |
{ | |
- GRAPHEME_STATE state = { 0 }; | |
+ uint_least16_t state = 0; | |
uint_least32_t cp0 = 0, cp1 = 0; | |
for (herodotus_read_codepoint(r, true, &cp0); |