| character.c - libgrapheme - unicode string library | |
| git clone git://git.suckless.org/libgrapheme | |
| Log | |
| Files | |
| Refs | |
| README | |
| LICENSE | |
| --- | |
| character.c (18706B) | |
| --- | |
| 1 #include <stdio.h> | |
| 2 | |
| 3 /* See LICENSE file for copyright and license details. */ | |
| 4 #include <limits.h> | |
| 5 #include <stdbool.h> | |
| 6 #include <stddef.h> | |
| 7 | |
| 8 #include "../gen/character.h" | |
| 9 #include "../grapheme.h" | |
| 10 #include "util.h" | |
| 11 | |
/*
 * Unpacked form of the 16-bit state word that a caller carries between
 * successive grapheme_is_character_break() calls.
 */
struct character_break_state {
	uint_least8_t prop;       /* break property of the previous right-hand codepoint */
	bool prop_set;            /* true once prop has been filled in */
	bool gb11_flag;           /* selects the upper half of the GB11 tables */
	bool gb12_13_flag;        /* selects the upper half of the GB12/GB13 tables */
	uint_least8_t gb9c_level; /* GB9c progress; only the low two bits are serialized */
};
| 19 | |
| 20 static const uint_least32_t dont_break[NUM_CHAR_BREAK_PROPS] = { | |
| 21 [CHAR_BREAK_PROP_OTHER] = | |
| 22 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ | |
| 23 UINT32_C(1) | |
| 24 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* G… | |
| 25 UINT32_C(1) | |
| 26 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* … | |
| 27 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* … | |
| 28 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* … | |
| 29 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* … | |
| 30 [CHAR_BREAK_PROP_ICB_CONSONANT] = | |
| 31 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ | |
| 32 UINT32_C(1) | |
| 33 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* G… | |
| 34 UINT32_C(1) | |
| 35 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* … | |
| 36 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* … | |
| 37 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* … | |
| 38 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* … | |
| 39 [CHAR_BREAK_PROP_ICB_EXTEND] = | |
| 40 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ | |
| 41 UINT32_C(1) | |
| 42 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* G… | |
| 43 UINT32_C(1) | |
| 44 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* … | |
| 45 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* … | |
| 46 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* … | |
| 47 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* … | |
| 48 [CHAR_BREAK_PROP_ICB_LINKER] = | |
| 49 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ | |
| 50 UINT32_C(1) | |
| 51 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* G… | |
| 52 UINT32_C(1) | |
| 53 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* … | |
| 54 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* … | |
| 55 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* … | |
| 56 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* … | |
| 57 [CHAR_BREAK_PROP_CR] = UINT32_C(1) << CHAR_BREAK_PROP_LF, /* … | |
| 58 [CHAR_BREAK_PROP_EXTEND] = | |
| 59 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ | |
| 60 UINT32_C(1) | |
| 61 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* G… | |
| 62 UINT32_C(1) | |
| 63 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* … | |
| 64 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* … | |
| 65 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* … | |
| 66 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* … | |
| 67 [CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND] = | |
| 68 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ | |
| 69 UINT32_C(1) | |
| 70 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* G… | |
| 71 UINT32_C(1) | |
| 72 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* … | |
| 73 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* … | |
| 74 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* … | |
| 75 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* … | |
| 76 [CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER] = | |
| 77 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ | |
| 78 UINT32_C(1) | |
| 79 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* G… | |
| 80 UINT32_C(1) | |
| 81 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* … | |
| 82 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* … | |
| 83 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* … | |
| 84 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* … | |
| 85 [CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC] = | |
| 86 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ | |
| 87 UINT32_C(1) | |
| 88 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* G… | |
| 89 UINT32_C(1) | |
| 90 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* … | |
| 91 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* … | |
| 92 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* … | |
| 93 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* … | |
| 94 [CHAR_BREAK_PROP_HANGUL_L] = | |
| 95 UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_L | /* GB6 */ | |
| 96 UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_V | /* GB6 */ | |
| 97 UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_LV | /* GB6 */ | |
| 98 UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_LVT | /* GB6 */ | |
| 99 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ | |
| 100 UINT32_C(1) | |
| 101 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* G… | |
| 102 UINT32_C(1) | |
| 103 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* … | |
| 104 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* … | |
| 105 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* … | |
| 106 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* … | |
| 107 [CHAR_BREAK_PROP_HANGUL_V] = | |
| 108 UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_V | /* GB7 */ | |
| 109 UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB7 */ | |
| 110 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ | |
| 111 UINT32_C(1) | |
| 112 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* G… | |
| 113 UINT32_C(1) | |
| 114 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* … | |
| 115 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* … | |
| 116 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* … | |
| 117 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* … | |
| 118 [CHAR_BREAK_PROP_HANGUL_T] = | |
| 119 UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB8 */ | |
| 120 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ | |
| 121 UINT32_C(1) | |
| 122 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* G… | |
| 123 UINT32_C(1) | |
| 124 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* … | |
| 125 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* … | |
| 126 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* … | |
| 127 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* … | |
| 128 [CHAR_BREAK_PROP_HANGUL_LV] = | |
| 129 UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_V | /* GB7 */ | |
| 130 UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB7 */ | |
| 131 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ | |
| 132 UINT32_C(1) | |
| 133 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* G… | |
| 134 UINT32_C(1) | |
| 135 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* … | |
| 136 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* … | |
| 137 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* … | |
| 138 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* … | |
| 139 [CHAR_BREAK_PROP_HANGUL_LVT] = | |
| 140 UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB8 */ | |
| 141 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ | |
| 142 UINT32_C(1) | |
| 143 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* G… | |
| 144 UINT32_C(1) | |
| 145 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* … | |
| 146 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* … | |
| 147 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* … | |
| 148 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* … | |
| 149 [CHAR_BREAK_PROP_PREPEND] = | |
| 150 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ | |
| 151 UINT32_C(1) | |
| 152 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* G… | |
| 153 UINT32_C(1) | |
| 154 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* … | |
| 155 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* … | |
| 156 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* … | |
| 157 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK | /* … | |
| 158 (UINT32_C(0xFFFFFFFF) & | |
| 159 ~(UINT32_C(1) << CHAR_BREAK_PROP_CR | | |
| 160 UINT32_C(1) << CHAR_BREAK_PROP_LF | | |
| 161 UINT32_C(1) << CHAR_BREAK_PROP_CONTROL)), /* GB9b */ | |
| 162 [CHAR_BREAK_PROP_REGIONAL_INDICATOR] = | |
| 163 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ | |
| 164 UINT32_C(1) | |
| 165 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* G… | |
| 166 UINT32_C(1) | |
| 167 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* … | |
| 168 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* … | |
| 169 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* … | |
| 170 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* … | |
| 171 [CHAR_BREAK_PROP_SPACINGMARK] = | |
| 172 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ | |
| 173 UINT32_C(1) | |
| 174 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* G… | |
| 175 UINT32_C(1) | |
| 176 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* … | |
| 177 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* … | |
| 178 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* … | |
| 179 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* … | |
| 180 [CHAR_BREAK_PROP_ZWJ] = | |
| 181 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ | |
| 182 UINT32_C(1) | |
| 183 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* G… | |
| 184 UINT32_C(1) | |
| 185 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* … | |
| 186 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* … | |
| 187 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* … | |
| 188 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* … | |
| 189 [CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND] = | |
| 190 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ | |
| 191 UINT32_C(1) | |
| 192 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* G… | |
| 193 UINT32_C(1) | |
| 194 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* … | |
| 195 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* … | |
| 196 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* … | |
| 197 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* … | |
| 198 | |
| 199 }; | |
| 200 static const uint_least32_t flag_update_gb11[2 * NUM_CHAR_BREAK_PROPS] =… | |
| 201 [CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC] = | |
| 202 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | | |
| 203 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* … | |
| 204 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* … | |
| 205 UINT32_C(1) | |
| 206 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* G… | |
| 207 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER, /… | |
| 208 [CHAR_BREAK_PROP_ZWJ + NUM_CHAR_BREAK_PROPS] = | |
| 209 UINT32_C(1) << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC, | |
| 210 [CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND + NUM_CHAR_BREAK_PROPS] = | |
| 211 UINT32_C(1) << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC, | |
| 212 [CHAR_BREAK_PROP_EXTEND + NUM_CHAR_BREAK_PROPS] = | |
| 213 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | | |
| 214 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | | |
| 215 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | | |
| 216 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | | |
| 217 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND, | |
| 218 [CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND + NUM_CHAR_BREAK_PROPS] = | |
| 219 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | | |
| 220 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | | |
| 221 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | | |
| 222 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | | |
| 223 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND, | |
| 224 [CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER + NUM_CHAR_BREAK_PROPS] = | |
| 225 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | | |
| 226 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | | |
| 227 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | | |
| 228 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | | |
| 229 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND, | |
| 230 [CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC + NUM_CHAR_BREAK_PROPS] = | |
| 231 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | | |
| 232 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | | |
| 233 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | | |
| 234 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | | |
| 235 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER, | |
| 236 }; | |
/*
 * GB11 no-break table, indexed by
 * left property + NUM_CHAR_BREAK_PROPS * gb11_flag. Only the flag-set
 * half has entries: with the flag set, a ZWJ-like left-hand side does
 * not break before an Extended_Pictographic. The flag-clear half is all
 * zero, so this table never suppresses a break while the flag is clear.
 */
static const uint_least32_t dont_break_gb11[2 * NUM_CHAR_BREAK_PROPS] = {
	[CHAR_BREAK_PROP_ZWJ + NUM_CHAR_BREAK_PROPS] =
		UINT32_C(1) << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC,
	[CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND + NUM_CHAR_BREAK_PROPS] =
		UINT32_C(1) << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC,
};
/*
 * Transition table for the GB12/GB13 flag, indexed by
 * left property + NUM_CHAR_BREAK_PROPS * (current flag). The only entry
 * lives in the flag-clear half: the flag becomes set when a regional
 * indicator follows a regional indicator. The flag-set half is all zero,
 * so a set flag is always cleared again on the next step.
 */
static const uint_least32_t flag_update_gb12_13[2 * NUM_CHAR_BREAK_PROPS] = {
	[CHAR_BREAK_PROP_REGIONAL_INDICATOR] =
		UINT32_C(1) << CHAR_BREAK_PROP_REGIONAL_INDICATOR,
};
/*
 * GB12/GB13 no-break table, indexed by
 * left property + NUM_CHAR_BREAK_PROPS * gb12_13_flag. The only entry
 * lives in the flag-set half: with the flag set, there is no break
 * between two regional indicators.
 */
static const uint_least32_t dont_break_gb12_13[2 * NUM_CHAR_BREAK_PROPS] = {
	[CHAR_BREAK_PROP_REGIONAL_INDICATOR + NUM_CHAR_BREAK_PROPS] =
		UINT32_C(1) << CHAR_BREAK_PROP_REGIONAL_INDICATOR,
};
| 251 | |
| 252 static inline enum char_break_property | |
| 253 get_break_prop(uint_least32_t cp) | |
| 254 { | |
| 255 if (likely(cp <= UINT32_C(0x10FFFF))) { | |
| 256 return (enum char_break_property) | |
| 257 char_break_minor[char_break_major[cp >> 8] + | |
| 258 (cp & 0xFF)]; | |
| 259 } else { | |
| 260 return CHAR_BREAK_PROP_OTHER; | |
| 261 } | |
| 262 } | |
| 263 | |
| 264 static inline void | |
| 265 state_serialize(const struct character_break_state *in, uint_least16_t *… | |
| 266 { | |
| 267 *out = (uint_least16_t)(in->prop & UINT8_C(0xFF)) | /* first 8 b… | |
| 268 (uint_least16_t)(((uint_least16_t)(in->prop_set)) | |
| 269 << 8) | /* 9th bit */ | |
| 270 (uint_least16_t)(((uint_least16_t)(in->gb11_flag)) | |
| 271 << 9) | /* 10th bit */ | |
| 272 (uint_least16_t)(((uint_least16_t)(in->gb12_13_flag)) | |
| 273 << 10) | /* 11th bit */ | |
| 274 (uint_least16_t)(((uint_least16_t)(in->gb9c_level & 0x3)) | |
| 275 << 11); /* 12th and 13th bit */ | |
| 276 } | |
| 277 | |
| 278 static inline void | |
| 279 state_deserialize(uint_least16_t in, struct character_break_state *out) | |
| 280 { | |
| 281 out->prop = in & UINT8_C(0xFF); | |
| 282 out->prop_set = in & (UINT16_C(1) << 8); | |
| 283 out->gb11_flag = in & (UINT16_C(1) << 9); | |
| 284 out->gb12_13_flag = in & (UINT16_C(1) << 10); | |
| 285 out->gb9c_level = (uint_least8_t)(in >> 11) & UINT8_C(0x3); | |
| 286 } | |
| 287 | |
/*
 * Decide whether there is a grapheme cluster boundary between the
 * codepoints cp0 (left) and cp1 (right).
 *
 * s is an optional pointer to a 16-bit state word that is read, updated
 * and written back on every call. When s is NULL only the stateless
 * rules are applied (GB9c is not evaluated and the GB11/GB12/GB13 flags
 * count as cleared). When s is given and already carries a property
 * from a previous call, that stored property is used for the left-hand
 * side and cp0 itself is not looked up.
 *
 * Returns true if there is a break between cp0 and cp1, false otherwise.
 */
bool
grapheme_is_character_break(uint_least32_t cp0, uint_least32_t cp1,
                            uint_least16_t *s)
{
	struct character_break_state state;
	enum char_break_property cp0_prop, cp1_prop;
	bool notbreak = false;

	if (likely(s)) {
		state_deserialize(*s, &state);

		/* reuse the property remembered from the previous call */
		if (likely(state.prop_set)) {
			cp0_prop = state.prop;
		} else {
			cp0_prop = get_break_prop(cp0);
		}
		cp1_prop = get_break_prop(cp1);

		/* preserve prop of right codepoint for next iteration */
		state.prop = (uint_least8_t)cp1_prop;
		state.prop_set = true;

		/*
		 * update flags: each table is indexed by the left property
		 * plus NUM_CHAR_BREAK_PROPS times the current flag, and the
		 * new flag is the bit of the right property in that entry
		 */
		state.gb11_flag =
			flag_update_gb11[cp0_prop + NUM_CHAR_BREAK_PROPS *
			                                    state.gb11_flag] &
			UINT32_C(1) << cp1_prop;
		state.gb12_13_flag =
			flag_update_gb12_13[cp0_prop +
			                    NUM_CHAR_BREAK_PROPS *
			                            state.gb12_13_flag] &
			UINT32_C(1) << cp1_prop;

		/*
		 * update GB9c state, which deals with indic conjunct breaks.
		 * We want to detect the following prefix:
		 *
		 *   ICB_CONSONANT
		 *   [ICB_EXTEND ICB_LINKER]*
		 *   ICB_LINKER
		 *   [ICB_EXTEND ICB_LINKER]*
		 *
		 * This representation is not ideal: In reality, what is
		 * meant is that the prefix is a sequence of [ICB_EXTEND
		 * ICB_LINKER]*, following an ICB_CONSONANT, that contains at
		 * least one ICB_LINKER. We thus use the following equivalent
		 * representation that allows us to store the levels 0..3 in
		 * two bits.
		 *
		 *   ICB_CONSONANT              -- Level 1
		 *   ICB_EXTEND*                -- Level 2
		 *   ICB_LINKER                 -- Level 3
		 *   [ICB_EXTEND ICB_LINKER]*   -- Level 3
		 *
		 * The following chain of if-else-blocks is a bit redundant
		 * and of course could be optimised, but this is kept as is
		 * for best readability. Note that the level is driven by the
		 * left property (cp0_prop) only.
		 */
		if (state.gb9c_level == 0 &&
		    cp0_prop == CHAR_BREAK_PROP_ICB_CONSONANT) {
			/* the sequence has begun */
			state.gb9c_level = 1;
		} else if ((state.gb9c_level == 1 || state.gb9c_level == 2) &&
		           (cp0_prop == CHAR_BREAK_PROP_ICB_EXTEND ||
		            cp0_prop == CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND ||
		            cp0_prop ==
		                    CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND)) {
			/*
			 * either the level is 1 and thus the ICB consonant is
			 * followed by an ICB extend, where we jump
			 * to level 2, or we are at level 2 and just witness
			 * more ICB extends, staying at level 2.
			 */
			state.gb9c_level = 2;
		} else if ((state.gb9c_level == 1 || state.gb9c_level == 2) &&
		           (cp0_prop == CHAR_BREAK_PROP_ICB_LINKER ||
		            cp0_prop ==
		                    CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER)) {
			/*
			 * witnessing an ICB linker directly lifts us up to
			 * level 3
			 */
			state.gb9c_level = 3;
		} else if (state.gb9c_level == 3 &&
		           (cp0_prop == CHAR_BREAK_PROP_ICB_EXTEND ||
		            cp0_prop == CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND ||
		            cp0_prop ==
		                    CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND ||
		            cp0_prop == CHAR_BREAK_PROP_ICB_LINKER ||
		            cp0_prop ==
		                    CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER)) {
			/*
			 * we stay at level 3 when we observe either ICB
			 * extends or linkers
			 */
			state.gb9c_level = 3;
		} else {
			/*
			 * the sequence has collapsed, but it could be
			 * that the left property is ICB consonant, which
			 * means that we jump right back to level 1 instead
			 * of 0
			 */
			if (cp0_prop == CHAR_BREAK_PROP_ICB_CONSONANT) {
				state.gb9c_level = 1;
			} else {
				state.gb9c_level = 0;
			}
		}

		/*
		 * Apply grapheme cluster breaking algorithm (UAX #29), see
		 * http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
		 *
		 * The pair is kept together if any of these hold: the
		 * stateless table says so, a completed GB9c prefix (level 3)
		 * is followed by an ICB consonant, or the GB11 or GB12/GB13
		 * table selected by the freshly updated flag says so.
		 */
		notbreak = (dont_break[cp0_prop] & (UINT32_C(1) << cp1_prop)) ||
		           (state.gb9c_level == 3 &&
		            cp1_prop == CHAR_BREAK_PROP_ICB_CONSONANT) ||
		           (dont_break_gb11[cp0_prop +
		                            state.gb11_flag *
		                                    NUM_CHAR_BREAK_PROPS] &
		            (UINT32_C(1) << cp1_prop)) ||
		           (dont_break_gb12_13[cp0_prop +
		                               state.gb12_13_flag *
		                                       NUM_CHAR_BREAK_PROPS] &
		            (UINT32_C(1) << cp1_prop));

		/*
		 * update or reset flags (when we have a break); the GB9c
		 * level is deliberately left as computed above
		 */
		if (likely(!notbreak)) {
			state.gb11_flag = state.gb12_13_flag = false;
		}

		/* hand the updated state back to the caller */
		state_serialize(&state, s);
	} else {
		cp0_prop = get_break_prop(cp0);
		cp1_prop = get_break_prop(cp1);

		/*
		 * Apply grapheme cluster breaking algorithm (UAX #29), see
		 * http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
		 *
		 * Given we have no state, this behaves as if the
		 * state-booleans were all set to false: only the flag-clear
		 * halves of the GB11 and GB12/GB13 tables are consulted and
		 * GB9c is not evaluated.
		 */
		notbreak = (dont_break[cp0_prop] & (UINT32_C(1) << cp1_prop)) ||
		           (dont_break_gb11[cp0_prop] &
		            (UINT32_C(1) << cp1_prop)) ||
		           (dont_break_gb12_13[cp0_prop] &
		            (UINT32_C(1) << cp1_prop));
	}

	return !notbreak;
}
| 440 | |
| 441 static size_t | |
| 442 next_character_break(HERODOTUS_READER *r) | |
| 443 { | |
| 444 uint_least16_t state = 0; | |
| 445 uint_least32_t cp0 = 0, cp1 = 0; | |
| 446 | |
| 447 for (herodotus_read_codepoint(r, true, &cp0); | |
| 448 herodotus_read_codepoint(r, false, &cp1) == | |
| 449 HERODOTUS_STATUS_SUCCESS; | |
| 450 herodotus_read_codepoint(r, true, &cp0)) { | |
| 451 if (grapheme_is_character_break(cp0, cp1, &state)) { | |
| 452 break; | |
| 453 } | |
| 454 } | |
| 455 | |
| 456 return herodotus_reader_number_read(r); | |
| 457 } | |
| 458 | |
| 459 size_t | |
| 460 grapheme_next_character_break(const uint_least32_t *str, size_t len) | |
| 461 { | |
| 462 HERODOTUS_READER r; | |
| 463 | |
| 464 herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, str, len); | |
| 465 | |
| 466 return next_character_break(&r); | |
| 467 } | |
| 468 | |
| 469 size_t | |
| 470 grapheme_next_character_break_utf8(const char *str, size_t len) | |
| 471 { | |
| 472 HERODOTUS_READER r; | |
| 473 | |
| 474 herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, str, len); | |
| 475 | |
| 476 return next_character_break(&r); | |
| 477 } |