word.c - libgrapheme - unicode string library | |
git clone git://git.suckless.org/libgrapheme | |
Log | |
Files | |
Refs | |
README | |
LICENSE | |
--- | |
word.c (8052B) | |
--- | |
1 /* See LICENSE file for copyright and license details. */ | |
2 #include <stdbool.h> | |
3 #include <stddef.h> | |
4 | |
5 #include "../gen/word.h" | |
6 #include "../grapheme.h" | |
7 #include "util.h" | |
8 | |
/*
 * Mutable state carried across one word-break search. It is handed to
 * proper_init() by next_word_break() and updated by
 * word_skip_shift_callback().
 */
struct word_break_state {
	/*
	 * true while the run of consecutive regional indicators to the
	 * left of the current position has even length (zero included)
	 */
	bool ri_even;
};
12 | |
13 static inline uint_least8_t | |
14 get_word_break_prop(uint_least32_t cp) | |
15 { | |
16 if (likely(cp <= UINT32_C(0x10FFFF))) { | |
17 return (uint_least8_t) | |
18 word_break_minor[word_break_major[cp >> 8] + | |
19 (cp & 0xff)]; | |
20 } else { | |
21 return WORD_BREAK_PROP_OTHER; | |
22 } | |
23 } | |
24 | |
/*
 * Predicate handed to proper_init() by next_word_break(): reports
 * whether a word-break property belongs to the set that is skipped
 * over. Returns true for WORD_BREAK_PROP_EXTEND and
 * WORD_BREAK_PROP_FORMAT, plus one further property.
 *
 * NOTE(review): the third comparison is cut off in this listing; it is
 * presumably WORD_BREAK_PROP_ZWJ, mirroring the WB4 check in
 * next_word_break() - confirm against the repository.
 */
25 static bool | |
26 is_skippable_word_prop(uint_least8_t prop) | |
27 { | |
28 return prop == WORD_BREAK_PROP_EXTEND || | |
29 prop == WORD_BREAK_PROP_FORMAT || prop == WORD_BREAK_PROP… | |
30 } | |
31 | |
32 static void | |
33 word_skip_shift_callback(uint_least8_t prop, void *s) | |
34 { | |
35 struct word_break_state *state = (struct word_break_state *)s; | |
36 | |
37 if (prop == WORD_BREAK_PROP_REGIONAL_INDICATOR) { | |
38 /* | |
39 * The property we just shifted in is | |
40 * a regional indicator, increasing the | |
41 * number of consecutive RIs on the left | |
42 * side of the breakpoint by one, changing | |
43 * the oddness. | |
44 * | |
45 */ | |
46 state->ri_even = !(state->ri_even); | |
47 } else { | |
48 /* | |
49 * We saw no regional indicator, so the | |
50 * number of consecutive RIs on the left | |
51 * side of the breakpoint is zero, which | |
52 * is an even number. | |
53 * | |
54 */ | |
55 state->ri_even = true; | |
56 } | |
57 } | |
58 | |
/*
 * Locate the next word boundary in the input behind reader r.
 *
 * A struct proper is set up through proper_init() with the word-break
 * property lookup, the skippable-property predicate and the
 * regional-indicator parity callback. proper_advance() is then called
 * repeatedly; after each call the UAX #29 word boundary rules are
 * tested in order and the first matching rule decides: `continue`
 * (a "do not break" rule, keep advancing) or `break` (a boundary was
 * found, leave the loop). The loop also ends once proper_advance()
 * returns non-zero.
 *
 * Rules WB3-WB4 look at p.raw, later rules at p.skip; presumably
 * p.skip is the view with skippable properties (see
 * is_skippable_word_prop()) removed, as WB4 requires - confirm in
 * util.h. Index [0] is the property adjacent to the candidate
 * position, index [1] the one after it in the same direction.
 *
 * Returns herodotus_reader_number_read() for p.mid_reader, presumably
 * the number of input units up to the boundary - confirm in util.h.
 *
 * NOTE(review): several condition lines are cut off in this listing;
 * the rule summaries below follow the visible operands and UAX #29.
 */
59 static size_t | |
60 next_word_break(HERODOTUS_READER *r) | |
61 { | |
62 struct proper p; | |
63 struct word_break_state state = { .ri_even = true }; | |
64 | |
65 /* | |
66 * Apply word breaking algorithm (UAX #29), see | |
67 * https://unicode.org/reports/tr29/#Word_Boundary_Rules | |
68 */ | |
69 proper_init(r, &state, NUM_WORD_BREAK_PROPS, get_word_break_prop, | |
70 is_skippable_word_prop, word_skip_shift_callback, &p… | |
71 | |
72 while (!proper_advance(&p)) { | |
73 /* WB3: do not break between CR and LF */ | |
74 if (p.raw.prev_prop[0] == WORD_BREAK_PROP_CR && | |
75 p.raw.next_prop[0] == WORD_BREAK_PROP_LF) { | |
76 continue; | |
77 } | |
78 | |
79 /* WB3a: break after Newline, CR or LF */ | |
80 if (p.raw.prev_prop[0] == WORD_BREAK_PROP_NEWLINE || | |
81 p.raw.prev_prop[0] == WORD_BREAK_PROP_CR || | |
82 p.raw.prev_prop[0] == WORD_BREAK_PROP_LF) { | |
83 break; | |
84 } | |
85 | |
86 /* WB3b: break before Newline, CR or LF */ | |
87 if (p.raw.next_prop[0] == WORD_BREAK_PROP_NEWLINE || | |
88 p.raw.next_prop[0] == WORD_BREAK_PROP_CR || | |
89 p.raw.next_prop[0] == WORD_BREAK_PROP_LF) { | |
90 break; | |
91 } | |
92 | |
93 /* WB3c: do not break between ZWJ and an extended pictographic */ | |
94 if (p.raw.prev_prop[0] == WORD_BREAK_PROP_ZWJ && | |
95 (p.raw.next_prop[0] == | |
96 WORD_BREAK_PROP_EXTENDED_PICTOGRAPHIC || | |
97 p.raw.next_prop[0] == | |
98 WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT)) { | |
99 continue; | |
100 } | |
101 | |
102 /* WB3d: do not break between two WSegSpace characters */ | |
103 if (p.raw.prev_prop[0] == WORD_BREAK_PROP_WSEGSPACE && | |
104 p.raw.next_prop[0] == WORD_BREAK_PROP_WSEGSPACE) { | |
105 continue; | |
106 } | |
107 | |
108 /* WB4: do not break before Extend, Format or ZWJ */ | |
109 if (p.raw.next_prop[0] == WORD_BREAK_PROP_EXTEND || | |
110 p.raw.next_prop[0] == WORD_BREAK_PROP_FORMAT || | |
111 p.raw.next_prop[0] == WORD_BREAK_PROP_ZWJ) { | |
112 continue; | |
113 } | |
114 | |
115 /* WB5: do not break between letters (ALetter/Hebrew_Letter) */ | |
116 if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_ALETTER || | |
117 p.skip.prev_prop[0] == | |
118 WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT || | |
119 p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTE… | |
120 (p.skip.next_prop[0] == WORD_BREAK_PROP_ALETTER || | |
121 p.skip.next_prop[0] == | |
122 WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT || | |
123 p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTE… | |
124 continue; | |
125 } | |
126 | |
127 /* WB6: letter x (MidLetter | MidNumLet | Single_Quote) letter */ | |
128 if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_ALETTER || | |
129 p.skip.prev_prop[0] == | |
130 WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT || | |
131 p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTE… | |
132 (p.skip.next_prop[0] == WORD_BREAK_PROP_MIDLETTER || | |
133 p.skip.next_prop[0] == WORD_BREAK_PROP_MIDNUMLET || | |
134 p.skip.next_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE… | |
135 (p.skip.next_prop[1] == WORD_BREAK_PROP_ALETTER || | |
136 p.skip.next_prop[1] == | |
137 WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT || | |
138 p.skip.next_prop[1] == WORD_BREAK_PROP_HEBREW_LETTE… | |
139 continue; | |
140 } | |
141 | |
142 /* WB7: letter (MidLetter | MidNumLet | Single_Quote) x letter */ | |
143 if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_MIDLETTER || | |
144 p.skip.prev_prop[0] == WORD_BREAK_PROP_MIDNUMLET || | |
145 p.skip.prev_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE… | |
146 (p.skip.next_prop[0] == WORD_BREAK_PROP_ALETTER || | |
147 p.skip.next_prop[0] == | |
148 WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT || | |
149 p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTE… | |
150 (p.skip.prev_prop[1] == WORD_BREAK_PROP_ALETTER || | |
151 p.skip.prev_prop[1] == | |
152 WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT || | |
153 p.skip.prev_prop[1] == WORD_BREAK_PROP_HEBREW_LETTE… | |
154 continue; | |
155 } | |
156 | |
157 /* WB7a: do not break between a Hebrew letter and a single quote */ | |
158 if (p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER… | |
159 p.skip.next_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE)… | |
160 continue; | |
161 } | |
162 | |
163 /* WB7b: Hebrew letter x double quote, Hebrew letter */ | |
164 if (p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER… | |
165 p.skip.next_prop[0] == WORD_BREAK_PROP_DOUBLE_QUOTE … | |
166 p.skip.next_prop[1] == WORD_BREAK_PROP_HEBREW_LETTER… | |
167 continue; | |
168 } | |
169 | |
170 /* WB7c: Hebrew letter, double quote x Hebrew letter */ | |
171 if (p.skip.prev_prop[0] == WORD_BREAK_PROP_DOUBLE_QUOTE … | |
172 p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER… | |
173 p.skip.prev_prop[1] == WORD_BREAK_PROP_HEBREW_LETTER… | |
174 continue; | |
175 } | |
176 | |
177 /* WB8: do not break within sequences of digits */ | |
178 if (p.skip.prev_prop[0] == WORD_BREAK_PROP_NUMERIC && | |
179 p.skip.next_prop[0] == WORD_BREAK_PROP_NUMERIC) { | |
180 continue; | |
181 } | |
182 | |
183 /* WB9: do not break between a letter and a following digit */ | |
184 if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_ALETTER || | |
185 p.skip.prev_prop[0] == | |
186 WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT || | |
187 p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTE… | |
188 p.skip.next_prop[0] == WORD_BREAK_PROP_NUMERIC) { | |
189 continue; | |
190 } | |
191 | |
192 /* WB10: do not break between a digit and a following letter */ | |
193 if (p.skip.prev_prop[0] == WORD_BREAK_PROP_NUMERIC && | |
194 (p.skip.next_prop[0] == WORD_BREAK_PROP_ALETTER || | |
195 p.skip.next_prop[0] == | |
196 WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT || | |
197 p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTE… | |
198 continue; | |
199 } | |
200 | |
201 /* WB11: digit (MidNum | MidNumLet | Single_Quote) x digit */ | |
202 if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_MIDNUM || | |
203 p.skip.prev_prop[0] == WORD_BREAK_PROP_MIDNUMLET || | |
204 p.skip.prev_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE… | |
205 p.skip.next_prop[0] == WORD_BREAK_PROP_NUMERIC && | |
206 p.skip.prev_prop[1] == WORD_BREAK_PROP_NUMERIC) { | |
207 continue; | |
208 } | |
209 | |
210 /* WB12: digit x (MidNum | MidNumLet | Single_Quote) digit */ | |
211 if (p.skip.prev_prop[0] == WORD_BREAK_PROP_NUMERIC && | |
212 (p.skip.next_prop[0] == WORD_BREAK_PROP_MIDNUM || | |
213 p.skip.next_prop[0] == WORD_BREAK_PROP_MIDNUMLET || | |
214 p.skip.next_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE… | |
215 p.skip.next_prop[1] == WORD_BREAK_PROP_NUMERIC) { | |
216 continue; | |
217 } | |
218 | |
219 /* WB13: do not break between Katakana characters */ | |
220 if (p.skip.prev_prop[0] == WORD_BREAK_PROP_KATAKANA && | |
221 p.skip.next_prop[0] == WORD_BREAK_PROP_KATAKANA) { | |
222 continue; | |
223 } | |
224 | |
225 /* WB13a: (letter | digit | Katakana | ExtendNumLet) x ExtendNumLet */ | |
226 if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_ALETTER || | |
227 p.skip.prev_prop[0] == | |
228 WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT || | |
229 p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTE… | |
230 p.skip.prev_prop[0] == WORD_BREAK_PROP_NUMERIC || | |
231 p.skip.prev_prop[0] == WORD_BREAK_PROP_KATAKANA || | |
232 p.skip.prev_prop[0] == WORD_BREAK_PROP_EXTENDNUMLET… | |
233 p.skip.next_prop[0] == WORD_BREAK_PROP_EXTENDNUMLET)… | |
234 continue; | |
235 } | |
236 | |
237 /* WB13b: ExtendNumLet x (letter | digit | Katakana) */ | |
238 if (p.skip.prev_prop[0] == WORD_BREAK_PROP_EXTENDNUMLET … | |
239 (p.skip.next_prop[0] == WORD_BREAK_PROP_ALETTER || | |
240 p.skip.next_prop[0] == | |
241 WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT || | |
242 p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTE… | |
243 p.skip.next_prop[0] == WORD_BREAK_PROP_NUMERIC || | |
244 p.skip.next_prop[0] == WORD_BREAK_PROP_KATAKANA)) { | |
245 continue; | |
246 } | |
247 | |
248 /* WB15 and WB16: no break after an odd run of RIs when another RI follows */ | |
249 if (!state.ri_even && | |
250 p.skip.next_prop[0] == WORD_BREAK_PROP_REGIONAL_INDI… | |
251 continue; | |
252 } | |
253 | |
254 /* WB999: no rule matched, so break here */ | |
255 break; | |
256 } | |
257 | |
258 return herodotus_reader_number_read(&(p.mid_reader)); | |
259 } | |
260 | |
261 size_t | |
262 grapheme_next_word_break(const uint_least32_t *str, size_t len) | |
263 { | |
264 HERODOTUS_READER r; | |
265 | |
266 herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, str, len); | |
267 | |
268 return next_word_break(&r); | |
269 } | |
270 | |
271 size_t | |
272 grapheme_next_word_break_utf8(const char *str, size_t len) | |
273 { | |
274 HERODOTUS_READER r; | |
275 | |
276 herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, str, len); | |
277 | |
278 return next_word_break(&r); | |
279 } |