Update grapheme break algorithm to Unicode version 15.1.0 - libgrapheme - unico… | |
git clone git://git.suckless.org/libgrapheme | |
Log | |
Files | |
Refs | |
README | |
LICENSE | |
--- | |
commit 65b354f0fcb1d925f4340dbb4415ea06e8af2bec | |
parent 3ee106e4ab1d5fe4696ab9089f052706d7cb9a48 | |
Author: Laslo Hunhold <[email protected]> | |
Date: Sun, 1 Sep 2024 22:42:18 +0200 | |
Update grapheme break algorithm to Unicode version 15.1.0 | |
While the change to the algorithm looks harmless in the specification, | |
it comes at the price of more complexity because we have to keep track | |
of a relatively complex state for a sequence of Indic conjunct breaks. | |
Fortunately, adding so many additional classes only decreases the | |
compression ratio for the grapheme cluster LUTs by ~0.5%. | |
We now pass all 1187 character tests. | |
Signed-off-by: Laslo Hunhold <[email protected]> | |
Diffstat: | |
M Makefile | 4 ++-- | |
M gen/character.c | 104 +++++++++++++++++++++++++++++… | |
M gen/util.c | 5 ++++- | |
M gen/util.h | 1 + | |
M src/character.c | 376 +++++++++++++++++++++++++----… | |
5 files changed, 409 insertions(+), 81 deletions(-) | |
--- | |
diff --git a/Makefile b/Makefile | |
@@ -196,7 +196,7 @@ src/sentence.o: src/sentence.c Makefile config.mk gen/sente… | |
src/utf8.o: src/utf8.c Makefile config.mk grapheme.h | |
src/util.o: src/util.c Makefile config.mk gen/types.h grapheme.h src/util.h | |
src/word.o: src/word.c Makefile config.mk gen/word.h grapheme.h src/util.h | |
-test/bidirectional.o: test/bidirectional.c Makefile config.mk gen/bidirectiona… | |
+test/bidirectional.o: test/bidirectional.c Makefile config.mk gen/bidirectiona… | |
test/case.o: test/case.c Makefile config.mk grapheme.h test/util.h | |
test/character.o: test/character.c Makefile config.mk gen/character-test.h gra… | |
test/line.o: test/line.c Makefile config.mk gen/line-test.h grapheme.h test/ut… | |
@@ -236,7 +236,7 @@ test/word$(BINSUFFIX): test/word.o test/util.o $(ANAME) | |
gen/bidirectional.h: data/BidiBrackets.txt data/BidiMirroring.txt data/Derived… | |
gen/bidirectional-test.h: data/BidiCharacterTest.txt data/BidiTest.txt gen/bid… | |
gen/case.h: data/DerivedCoreProperties.txt data/UnicodeData.txt data/SpecialCa… | |
-gen/character.h: data/emoji-data.txt data/GraphemeBreakProperty.txt gen/charac… | |
+gen/character.h: data/DerivedCoreProperties.txt data/emoji-data.txt data/Graph… | |
gen/character-test.h: data/GraphemeBreakTest.txt gen/character-test$(BINSUFFIX) | |
gen/line.h: data/emoji-data.txt data/EastAsianWidth.txt data/LineBreak.txt gen… | |
gen/line-test.h: data/LineBreakTest.txt gen/line-test$(BINSUFFIX) | |
diff --git a/gen/character.c b/gen/character.c | |
@@ -1,8 +1,12 @@ | |
/* See LICENSE file for copyright and license details. */ | |
#include <stddef.h> | |
+#include <stdio.h> | |
+#include <stdlib.h> | |
+#include <string.h> | |
#include "util.h" | |
+#define FILE_DCP "data/DerivedCoreProperties.txt" | |
#define FILE_EMOJI "data/emoji-data.txt" | |
#define FILE_GRAPHEME "data/GraphemeBreakProperty.txt" | |
@@ -13,6 +17,21 @@ static const struct property_spec char_break_property[] = { | |
.ucdname = NULL, | |
}, | |
{ | |
+ .enumname = "BOTH_EXTEND_ICB_EXTEND", | |
+ .file = NULL, | |
+ .ucdname = NULL, | |
+ }, | |
+ { | |
+ .enumname = "BOTH_EXTEND_ICB_LINKER", | |
+ .file = NULL, | |
+ .ucdname = NULL, | |
+ }, | |
+ { | |
+ .enumname = "BOTH_ZWJ_ICB_EXTEND", | |
+ .file = NULL, | |
+ .ucdname = NULL, | |
+ }, | |
+ { | |
.enumname = "CONTROL", | |
.file = FILE_GRAPHEME, | |
.ucdname = "Control", | |
@@ -58,6 +77,24 @@ static const struct property_spec char_break_property[] = { | |
.ucdname = "LVT", | |
}, | |
{ | |
+ .enumname = "ICB_CONSONANT", | |
+ .file = FILE_DCP, | |
+ .ucdname = "InCB", | |
+ .ucdsubname = "Consonant", | |
+ }, | |
+ { | |
+ .enumname = "ICB_EXTEND", | |
+ .file = FILE_DCP, | |
+ .ucdname = "InCB", | |
+ .ucdsubname = "Extend", | |
+ }, | |
+ { | |
+ .enumname = "ICB_LINKER", | |
+ .file = FILE_DCP, | |
+ .ucdname = "InCB", | |
+ .ucdsubname = "Linker", | |
+ }, | |
+ { | |
.enumname = "LF", | |
.file = FILE_GRAPHEME, | |
.ucdname = "LF", | |
@@ -84,14 +121,75 @@ static const struct property_spec char_break_property[] = { | |
}, | |
}; | |
+static uint_least8_t | |
+handle_conflict(uint_least32_t cp, uint_least8_t prop1, uint_least8_t prop2) | |
+{ | |
+ uint_least8_t result; | |
+ | |
+ (void)cp; | |
+ | |
+ if ((!strcmp(char_break_property[prop1].enumname, "EXTEND") && | |
+ !strcmp(char_break_property[prop2].enumname, "ICB_EXTEND")) || | |
+ (!strcmp(char_break_property[prop1].enumname, "ICB_EXTEND") && | |
+ !strcmp(char_break_property[prop2].enumname, "EXTEND"))) { | |
+ for (result = 0; result < LEN(char_break_property); result++) { | |
+ if (!strcmp(char_break_property[result].enumname, | |
+ "BOTH_EXTEND_ICB_EXTEND")) { | |
+ break; | |
+ } | |
+ } | |
+ if (result == LEN(char_break_property)) { | |
+ fprintf(stderr, "handle_conflict: Internal error.\n"); | |
+ exit(1); | |
+ } | |
+ } else if ((!strcmp(char_break_property[prop1].enumname, "EXTEND") && | |
+ !strcmp(char_break_property[prop2].enumname, | |
+ "ICB_LINKER")) || | |
+ (!strcmp(char_break_property[prop1].enumname, | |
+ "ICB_LINKER") && | |
+ !strcmp(char_break_property[prop2].enumname, "EXTEND"))) { | |
+ for (result = 0; result < LEN(char_break_property); result++) { | |
+ if (!strcmp(char_break_property[result].enumname, | |
+ "BOTH_EXTEND_ICB_LINKER")) { | |
+ break; | |
+ } | |
+ } | |
+ if (result == LEN(char_break_property)) { | |
+ fprintf(stderr, "handle_conflict: Internal error.\n"); | |
+ exit(1); | |
+ } | |
+ } else if ((!strcmp(char_break_property[prop1].enumname, "ZWJ") && | |
+ !strcmp(char_break_property[prop2].enumname, | |
+ "ICB_EXTEND")) || | |
+ (!strcmp(char_break_property[prop1].enumname, | |
+ "ICB_EXTEND") && | |
+ !strcmp(char_break_property[prop2].enumname, "ZWJ"))) { | |
+ for (result = 0; result < LEN(char_break_property); result++) { | |
+ if (!strcmp(char_break_property[result].enumname, | |
+ "BOTH_ZWJ_ICB_EXTEND")) { | |
+ break; | |
+ } | |
+ } | |
+ if (result == LEN(char_break_property)) { | |
+ fprintf(stderr, "handle_conflict: Internal error.\n"); | |
+ exit(1); | |
+ } | |
+ } else { | |
+ fprintf(stderr, "handle_conflict: Cannot handle conflict.\n"); | |
+ exit(1); | |
+ } | |
+ | |
+ return result; | |
+} | |
+ | |
int | |
main(int argc, char *argv[]) | |
{ | |
(void)argc; | |
- properties_generate_break_property(char_break_property, | |
- LEN(char_break_property), NULL, NUL… | |
- NULL, "char_break", argv[0]); | |
+ properties_generate_break_property( | |
+ char_break_property, LEN(char_break_property), NULL, | |
+ handle_conflict, NULL, "char_break", argv[0]); | |
return 0; | |
} | |
diff --git a/gen/util.c b/gen/util.c | |
@@ -317,7 +317,10 @@ properties_callback(const char *file, char **field, size_t… | |
(comment != NULL && | |
!strncmp(p->spec[i].ucdname, comment, | |
strlen(p->spec[i].ucdname)) && | |
- comment[strlen(p->spec[i].ucdname)] == ' '))) { | |
+ comment[strlen(p->spec[i].ucdname)] == ' ')) && | |
+ (p->spec[i].ucdsubname == NULL || | |
+ (nfields >= 3 && | |
+ !strcmp(p->spec[i].ucdsubname, field[2])))) { | |
/* parse range in first field */ | |
if (range_parse(field[0], &r)) { | |
return 1; | |
diff --git a/gen/util.h b/gen/util.h | |
@@ -13,6 +13,7 @@ struct property_spec { | |
const char *enumname; | |
const char *file; | |
const char *ucdname; | |
+ const char *ucdsubname; | |
}; | |
struct properties { | |
diff --git a/src/character.c b/src/character.c | |
@@ -1,3 +1,5 @@ | |
+#include <stdio.h> | |
+ | |
/* See LICENSE file for copyright and license details. */ | |
#include <limits.h> | |
#include <stdbool.h> | |
@@ -12,97 +14,239 @@ struct character_break_state { | |
bool prop_set; | |
bool gb11_flag; | |
bool gb12_13_flag; | |
+ uint_least8_t gb9c_level; | |
}; | |
-static const uint_least16_t dont_break[NUM_CHAR_BREAK_PROPS] = { | |
+static const uint_least32_t dont_break[NUM_CHAR_BREAK_PROPS] = { | |
[CHAR_BREAK_PROP_OTHER] = | |
- UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ | |
- UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ | |
- UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ | |
- [CHAR_BREAK_PROP_CR] = UINT16_C(1) << CHAR_BREAK_PROP_LF, /* GB3 */ | |
+ UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ | |
+ UINT32_C(1) | |
+ << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */ | |
+ UINT32_C(1) | |
+ << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */ | |
+ UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ | |
+ UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */ | |
+ UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ | |
+ [CHAR_BREAK_PROP_ICB_CONSONANT] = | |
+ UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ | |
+ UINT32_C(1) | |
+ << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */ | |
+ UINT32_C(1) | |
+ << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */ | |
+ UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ | |
+ UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */ | |
+ UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ | |
+ [CHAR_BREAK_PROP_ICB_EXTEND] = | |
+ UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ | |
+ UINT32_C(1) | |
+ << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */ | |
+ UINT32_C(1) | |
+ << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */ | |
+ UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ | |
+ UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */ | |
+ UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ | |
+ [CHAR_BREAK_PROP_ICB_LINKER] = | |
+ UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ | |
+ UINT32_C(1) | |
+ << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */ | |
+ UINT32_C(1) | |
+ << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */ | |
+ UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ | |
+ UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */ | |
+ UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ | |
+ [CHAR_BREAK_PROP_CR] = UINT32_C(1) << CHAR_BREAK_PROP_LF, /* GB3 */ | |
[CHAR_BREAK_PROP_EXTEND] = | |
- UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ | |
- UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ | |
- UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ | |
+ UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ | |
+ UINT32_C(1) | |
+ << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */ | |
+ UINT32_C(1) | |
+ << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */ | |
+ UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ | |
+ UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */ | |
+ UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ | |
+ [CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND] = | |
+ UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ | |
+ UINT32_C(1) | |
+ << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */ | |
+ UINT32_C(1) | |
+ << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */ | |
+ UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ | |
+ UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */ | |
+ UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ | |
+ [CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER] = | |
+ UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ | |
+ UINT32_C(1) | |
+ << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */ | |
+ UINT32_C(1) | |
+ << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */ | |
+ UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ | |
+ UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */ | |
+ UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ | |
[CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC] = | |
- UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ | |
- UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ | |
- UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ | |
+ UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ | |
+ UINT32_C(1) | |
+ << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */ | |
+ UINT32_C(1) | |
+ << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */ | |
+ UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ | |
+ UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */ | |
+ UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ | |
[CHAR_BREAK_PROP_HANGUL_L] = | |
- UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_L | /* GB6 */ | |
- UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_V | /* GB6 */ | |
- UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_LV | /* GB6 */ | |
- UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_LVT | /* GB6 */ | |
- UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ | |
- UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ | |
- UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ | |
+ UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_L | /* GB6 */ | |
+ UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_V | /* GB6 */ | |
+ UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_LV | /* GB6 */ | |
+ UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_LVT | /* GB6 */ | |
+ UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ | |
+ UINT32_C(1) | |
+ << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */ | |
+ UINT32_C(1) | |
+ << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */ | |
+ UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ | |
+ UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */ | |
+ UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ | |
[CHAR_BREAK_PROP_HANGUL_V] = | |
- UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_V | /* GB7 */ | |
- UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB7 */ | |
- UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ | |
- UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ | |
- UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ | |
+ UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_V | /* GB7 */ | |
+ UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB7 */ | |
+ UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ | |
+ UINT32_C(1) | |
+ << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */ | |
+ UINT32_C(1) | |
+ << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */ | |
+ UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ | |
+ UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */ | |
+ UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ | |
[CHAR_BREAK_PROP_HANGUL_T] = | |
- UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB8 */ | |
- UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ | |
- UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ | |
- UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ | |
+ UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB8 */ | |
+ UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ | |
+ UINT32_C(1) | |
+ << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */ | |
+ UINT32_C(1) | |
+ << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */ | |
+ UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ | |
+ UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */ | |
+ UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ | |
[CHAR_BREAK_PROP_HANGUL_LV] = | |
- UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_V | /* GB7 */ | |
- UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB7 */ | |
- UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ | |
- UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ | |
- UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ | |
+ UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_V | /* GB7 */ | |
+ UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB7 */ | |
+ UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ | |
+ UINT32_C(1) | |
+ << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */ | |
+ UINT32_C(1) | |
+ << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */ | |
+ UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ | |
+ UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */ | |
+ UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ | |
[CHAR_BREAK_PROP_HANGUL_LVT] = | |
- UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB8 */ | |
- UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ | |
- UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ | |
- UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ | |
+ UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB8 */ | |
+ UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ | |
+ UINT32_C(1) | |
+ << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */ | |
+ UINT32_C(1) | |
+ << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */ | |
+ UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ | |
+ UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */ | |
+ UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ | |
[CHAR_BREAK_PROP_PREPEND] = | |
- UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ | |
- UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ | |
- UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK | /* GB9a */ | |
- (UINT16_C(0xFFFF) & | |
- ~(UINT16_C(1) << CHAR_BREAK_PROP_CR | | |
- UINT16_C(1) << CHAR_BREAK_PROP_LF | | |
- UINT16_C(1) << CHAR_BREAK_PROP_CONTROL)), /* GB9b */ | |
+ UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ | |
+ UINT32_C(1) | |
+ << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */ | |
+ UINT32_C(1) | |
+ << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */ | |
+ UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ | |
+ UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */ | |
+ UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK | /* GB9a */ | |
+ (UINT32_C(0xFFFFFFFF) & | |
+ ~(UINT32_C(1) << CHAR_BREAK_PROP_CR | | |
+ UINT32_C(1) << CHAR_BREAK_PROP_LF | | |
+ UINT32_C(1) << CHAR_BREAK_PROP_CONTROL)), /* GB9b */ | |
[CHAR_BREAK_PROP_REGIONAL_INDICATOR] = | |
- UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ | |
- UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ | |
- UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ | |
+ UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ | |
+ UINT32_C(1) | |
+ << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */ | |
+ UINT32_C(1) | |
+ << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */ | |
+ UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ | |
+ UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */ | |
+ UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ | |
[CHAR_BREAK_PROP_SPACINGMARK] = | |
- UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ | |
- UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ | |
- UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ | |
+ UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ | |
+ UINT32_C(1) | |
+ << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */ | |
+ UINT32_C(1) | |
+ << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */ | |
+ UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ | |
+ UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */ | |
+ UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ | |
[CHAR_BREAK_PROP_ZWJ] = | |
- UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ | |
- UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ | |
- UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ | |
+ UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ | |
+ UINT32_C(1) | |
+ << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */ | |
+ UINT32_C(1) | |
+ << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */ | |
+ UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ | |
+ UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */ | |
+ UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ | |
+ [CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND] = | |
+ UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ | |
+ UINT32_C(1) | |
+ << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */ | |
+ UINT32_C(1) | |
+ << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */ | |
+ UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ | |
+ UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */ | |
+ UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ | |
+ | |
}; | |
-static const uint_least16_t flag_update_gb11[2 * NUM_CHAR_BREAK_PROPS] = { | |
+static const uint_least32_t flag_update_gb11[2 * NUM_CHAR_BREAK_PROPS] = { | |
[CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC] = | |
- UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | | |
- UINT16_C(1) << CHAR_BREAK_PROP_EXTEND, | |
+ UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | | |
+ UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */ | |
+ UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ | |
+ UINT32_C(1) | |
+ << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */ | |
+ UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER, /* GB9 … | |
[CHAR_BREAK_PROP_ZWJ + NUM_CHAR_BREAK_PROPS] = | |
- UINT16_C(1) << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC, | |
+ UINT32_C(1) << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC, | |
+ [CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND + NUM_CHAR_BREAK_PROPS] = | |
+ UINT32_C(1) << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC, | |
[CHAR_BREAK_PROP_EXTEND + NUM_CHAR_BREAK_PROPS] = | |
- UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | | |
- UINT16_C(1) << CHAR_BREAK_PROP_ZWJ, | |
+ UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | | |
+ UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | | |
+ UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | | |
+ UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | | |
+ UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND, | |
+ [CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND + NUM_CHAR_BREAK_PROPS] = | |
+ UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | | |
+ UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | | |
+ UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | | |
+ UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | | |
+ UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND, | |
+ [CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER + NUM_CHAR_BREAK_PROPS] = | |
+ UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | | |
+ UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | | |
+ UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | | |
+ UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | | |
+ UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND, | |
[CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC + NUM_CHAR_BREAK_PROPS] = | |
- UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | | |
- UINT16_C(1) << CHAR_BREAK_PROP_EXTEND, | |
+ UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | | |
+ UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | | |
+ UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | | |
+ UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | | |
+ UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER, | |
}; | |
-static const uint_least16_t dont_break_gb11[2 * NUM_CHAR_BREAK_PROPS] = { | |
+static const uint_least32_t dont_break_gb11[2 * NUM_CHAR_BREAK_PROPS] = { | |
[CHAR_BREAK_PROP_ZWJ + NUM_CHAR_BREAK_PROPS] = | |
- UINT16_C(1) << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC, | |
+ UINT32_C(1) << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC, | |
+ [CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND + NUM_CHAR_BREAK_PROPS] = | |
+ UINT32_C(1) << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC, | |
}; | |
-static const uint_least16_t flag_update_gb12_13[2 * NUM_CHAR_BREAK_PROPS] = { | |
+static const uint_least32_t flag_update_gb12_13[2 * NUM_CHAR_BREAK_PROPS] = { | |
[CHAR_BREAK_PROP_REGIONAL_INDICATOR] = | |
- UINT16_C(1) << CHAR_BREAK_PROP_REGIONAL_INDICATOR, | |
+ UINT32_C(1) << CHAR_BREAK_PROP_REGIONAL_INDICATOR, | |
}; | |
-static const uint_least16_t dont_break_gb12_13[2 * NUM_CHAR_BREAK_PROPS] = { | |
+static const uint_least32_t dont_break_gb12_13[2 * NUM_CHAR_BREAK_PROPS] = { | |
[CHAR_BREAK_PROP_REGIONAL_INDICATOR + NUM_CHAR_BREAK_PROPS] = | |
- UINT16_C(1) << CHAR_BREAK_PROP_REGIONAL_INDICATOR, | |
+ UINT32_C(1) << CHAR_BREAK_PROP_REGIONAL_INDICATOR, | |
}; | |
static inline enum char_break_property | |
@@ -126,7 +270,9 @@ state_serialize(const struct character_break_state *in, uin… | |
(uint_least16_t)(((uint_least16_t)(in->gb11_flag)) | |
<< 9) | /* 10th bit */ | |
(uint_least16_t)(((uint_least16_t)(in->gb12_13_flag)) | |
- << 10); /* 11th bit */ | |
+ << 10) | /* 11th bit */ | |
+ (uint_least16_t)(((uint_least16_t)(in->gb9c_level & 0x3)) | |
+ << 11); /* 12th and 13th bit */ | |
} | |
static inline void | |
@@ -136,6 +282,7 @@ state_deserialize(uint_least16_t in, struct character_break… | |
out->prop_set = in & (UINT16_C(1) << 8); | |
out->gb11_flag = in & (UINT16_C(1) << 9); | |
out->gb12_13_flag = in & (UINT16_C(1) << 10); | |
+ out->gb9c_level = (uint_least8_t)(in >> 11) & UINT8_C(0x3); | |
} | |
bool | |
@@ -164,26 +311,105 @@ grapheme_is_character_break(uint_least32_t cp0, uint_lea… | |
state.gb11_flag = | |
flag_update_gb11[cp0_prop + NUM_CHAR_BREAK_PROPS * | |
state.gb11_flag] & | |
- UINT16_C(1) << cp1_prop; | |
+ UINT32_C(1) << cp1_prop; | |
state.gb12_13_flag = | |
flag_update_gb12_13[cp0_prop + | |
NUM_CHAR_BREAK_PROPS * | |
state.gb12_13_flag] & | |
- UINT16_C(1) << cp1_prop; | |
+ UINT32_C(1) << cp1_prop; | |
+ | |
+ /* | |
+ * update GB9c state, which deals with indic conjunct breaks. | |
+ * We want to detect the following prefix: | |
+ * | |
+ * ICB_CONSONANT | |
+ * [ICB_EXTEND ICB_LINKER]* | |
+ * ICB_LINKER | |
+ * [ICB_EXTEND ICB_LINKER]* | |
+ * | |
+ * This representation is not ideal: In reality, what is | |
+ * meant is that the prefix is a sequence of [ICB_EXTEND | |
+ * ICB_LINKER]*, following an ICB_CONSONANT, that contains at | |
+ * least one ICB_LINKER. We thus use the following equivalent | |
+ * representation that allows us to store the levels 0..3 in 2 | |
+ * bits. | |
+ * | |
+ * ICB_CONSONANT -- Level 1 | |
+ * ICB_EXTEND* -- Level 2 | |
+ * ICB_LINKER -- Level 3 | |
+ * [ICB_EXTEND ICB_LINKER]* -- Level 3 | |
+ * | |
+ * The following chain of if-else-blocks is a bit redundant and | |
+ * of course could be optimised, but this is kept as is for | |
+ * best readability. | |
+ */ | |
+ if (state.gb9c_level == 0 && | |
+ cp0_prop == CHAR_BREAK_PROP_ICB_CONSONANT) { | |
+ /* the sequence has begun */ | |
+ state.gb9c_level = 1; | |
+ } else if ((state.gb9c_level == 1 || state.gb9c_level == 2) && | |
+ (cp0_prop == CHAR_BREAK_PROP_ICB_EXTEND || | |
+ cp0_prop == CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND || | |
+ cp0_prop == | |
+ CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND)) { | |
+ /* | |
+ * either the level is 1 and thus the ICB consonant is | |
+ * followed by an ICB extend, where we jump | |
+ * to level 2, or we are at level 2 and just witness | |
+ * more ICB extends, staying at level 2. | |
+ */ | |
+ state.gb9c_level = 2; | |
+ } else if ((state.gb9c_level == 1 || state.gb9c_level == 2) && | |
+ (cp0_prop == CHAR_BREAK_PROP_ICB_LINKER || | |
+ cp0_prop == | |
+ CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER)) { | |
+ /* | |
+ * witnessing an ICB linker directly lifts us up to | |
+ * level 3 | |
+ */ | |
+ state.gb9c_level = 3; | |
+ } else if (state.gb9c_level == 3 && | |
+ (cp0_prop == CHAR_BREAK_PROP_ICB_EXTEND || | |
+ cp0_prop == CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND || | |
+ cp0_prop == | |
+ CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND || | |
+ cp0_prop == CHAR_BREAK_PROP_ICB_LINKER || | |
+ cp0_prop == | |
+ CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER)) { | |
+ /* | |
+ * we stay at level 3 when we observe either ICB | |
+ * extends or linkers | |
+ */ | |
+ state.gb9c_level = 3; | |
+ } else { | |
+ /* | |
+ * the sequence has collapsed, but it could be | |
+ * that the left property is ICB consonant, which | |
+ * means that we jump right back to level 1 instead | |
+ * of 0 | |
+ */ | |
+ if (cp0_prop == CHAR_BREAK_PROP_ICB_CONSONANT) { | |
+ state.gb9c_level = 1; | |
+ } else { | |
+ state.gb9c_level = 0; | |
+ } | |
+ } | |
/* | |
* Apply grapheme cluster breaking algorithm (UAX #29), see | |
* http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_… | |
*/ | |
- notbreak = (dont_break[cp0_prop] & (UINT16_C(1) << cp1_prop)) … | |
+ notbreak = (dont_break[cp0_prop] & (UINT32_C(1) << cp1_prop)) … | |
+ (state.gb9c_level == 3 && | |
+ cp1_prop == CHAR_BREAK_PROP_ICB_CONSONANT) || | |
(dont_break_gb11[cp0_prop + | |
state.gb11_flag * | |
NUM_CHAR_BREAK_PROPS] & | |
- (UINT16_C(1) << cp1_prop)) || | |
+ (UINT32_C(1) << cp1_prop)) || | |
(dont_break_gb12_13[cp0_prop + | |
state.gb12_13_flag * | |
NUM_CHAR_BREAK_PROPS] & | |
- (UINT16_C(1) << cp1_prop)); | |
+ (UINT32_C(1) << cp1_prop)); | |
/* update or reset flags (when we have a break) */ | |
if (likely(!notbreak)) { | |
@@ -202,11 +428,11 @@ grapheme_is_character_break(uint_least32_t cp0, uint_leas… | |
* Given we have no state, this behaves as if the state-boolea… | |
* were all set to false | |
*/ | |
- notbreak = (dont_break[cp0_prop] & (UINT16_C(1) << cp1_prop)) … | |
+ notbreak = (dont_break[cp0_prop] & (UINT32_C(1) << cp1_prop)) … | |
(dont_break_gb11[cp0_prop] & | |
- (UINT16_C(1) << cp1_prop)) || | |
+ (UINT32_C(1) << cp1_prop)) || | |
(dont_break_gb12_13[cp0_prop] & | |
- (UINT16_C(1) << cp1_prop)); | |
+ (UINT32_C(1) << cp1_prop)); | |
} | |
return !notbreak; |