Introduction
Introduction Statistics Contact Development Disclaimer Help
word.c - libgrapheme - unicode string library
git clone git://git.suckless.org/libgrapheme
Log
Files
Refs
README
LICENSE
---
word.c (8052B)
---
1 /* See LICENSE file for copyright and license details. */
2 #include <stdbool.h>
3 #include <stddef.h>
4
5 #include "../gen/word.h"
6 #include "../grapheme.h"
7 #include "util.h"
8
/*
 * Scanner state that is updated for every property shifted past the
 * candidate break point (see word_skip_shift_callback()).
 */
struct word_break_state {
	/*
	 * true while the run of consecutive regional indicators on the
	 * left side of the break point has even length; a run of length
	 * zero counts as even
	 */
	bool ri_even;
};
12
13 static inline uint_least8_t
14 get_word_break_prop(uint_least32_t cp)
15 {
16 if (likely(cp <= UINT32_C(0x10FFFF))) {
17 return (uint_least8_t)
18 word_break_minor[word_break_major[cp >> 8] +
19 (cp & 0xff)];
20 } else {
21 return WORD_BREAK_PROP_OTHER;
22 }
23 }
24
25 static bool
26 is_skippable_word_prop(uint_least8_t prop)
27 {
28 return prop == WORD_BREAK_PROP_EXTEND ||
29 prop == WORD_BREAK_PROP_FORMAT || prop == WORD_BREAK_PROP…
30 }
31
32 static void
33 word_skip_shift_callback(uint_least8_t prop, void *s)
34 {
35 struct word_break_state *state = (struct word_break_state *)s;
36
37 if (prop == WORD_BREAK_PROP_REGIONAL_INDICATOR) {
38 /*
39 * The property we just shifted in is
40 * a regional indicator, increasing the
41 * number of consecutive RIs on the left
42 * side of the breakpoint by one, changing
43 * the oddness.
44 *
45 */
46 state->ri_even = !(state->ri_even);
47 } else {
48 /*
49 * We saw no regional indicator, so the
50 * number of consecutive RIs on the left
51 * side of the breakpoint is zero, which
52 * is an even number.
53 *
54 */
55 state->ri_even = true;
56 }
57 }
58
59 static size_t
60 next_word_break(HERODOTUS_READER *r)
61 {
62 struct proper p;
63 struct word_break_state state = { .ri_even = true };
64
65 /*
66 * Apply word breaking algorithm (UAX #29), see
67 * https://unicode.org/reports/tr29/#Word_Boundary_Rules
68 */
69 proper_init(r, &state, NUM_WORD_BREAK_PROPS, get_word_break_prop,
70 is_skippable_word_prop, word_skip_shift_callback, &p…
71
72 while (!proper_advance(&p)) {
73 /* WB3 */
74 if (p.raw.prev_prop[0] == WORD_BREAK_PROP_CR &&
75 p.raw.next_prop[0] == WORD_BREAK_PROP_LF) {
76 continue;
77 }
78
79 /* WB3a */
80 if (p.raw.prev_prop[0] == WORD_BREAK_PROP_NEWLINE ||
81 p.raw.prev_prop[0] == WORD_BREAK_PROP_CR ||
82 p.raw.prev_prop[0] == WORD_BREAK_PROP_LF) {
83 break;
84 }
85
86 /* WB3b */
87 if (p.raw.next_prop[0] == WORD_BREAK_PROP_NEWLINE ||
88 p.raw.next_prop[0] == WORD_BREAK_PROP_CR ||
89 p.raw.next_prop[0] == WORD_BREAK_PROP_LF) {
90 break;
91 }
92
93 /* WB3c */
94 if (p.raw.prev_prop[0] == WORD_BREAK_PROP_ZWJ &&
95 (p.raw.next_prop[0] ==
96 WORD_BREAK_PROP_EXTENDED_PICTOGRAPHIC ||
97 p.raw.next_prop[0] ==
98 WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT)) {
99 continue;
100 }
101
102 /* WB3d */
103 if (p.raw.prev_prop[0] == WORD_BREAK_PROP_WSEGSPACE &&
104 p.raw.next_prop[0] == WORD_BREAK_PROP_WSEGSPACE) {
105 continue;
106 }
107
108 /* WB4 */
109 if (p.raw.next_prop[0] == WORD_BREAK_PROP_EXTEND ||
110 p.raw.next_prop[0] == WORD_BREAK_PROP_FORMAT ||
111 p.raw.next_prop[0] == WORD_BREAK_PROP_ZWJ) {
112 continue;
113 }
114
115 /* WB5 */
116 if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_ALETTER ||
117 p.skip.prev_prop[0] ==
118 WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
119 p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTE…
120 (p.skip.next_prop[0] == WORD_BREAK_PROP_ALETTER ||
121 p.skip.next_prop[0] ==
122 WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
123 p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTE…
124 continue;
125 }
126
127 /* WB6 */
128 if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_ALETTER ||
129 p.skip.prev_prop[0] ==
130 WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
131 p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTE…
132 (p.skip.next_prop[0] == WORD_BREAK_PROP_MIDLETTER ||
133 p.skip.next_prop[0] == WORD_BREAK_PROP_MIDNUMLET ||
134 p.skip.next_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE…
135 (p.skip.next_prop[1] == WORD_BREAK_PROP_ALETTER ||
136 p.skip.next_prop[1] ==
137 WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
138 p.skip.next_prop[1] == WORD_BREAK_PROP_HEBREW_LETTE…
139 continue;
140 }
141
142 /* WB7 */
143 if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_MIDLETTER ||
144 p.skip.prev_prop[0] == WORD_BREAK_PROP_MIDNUMLET ||
145 p.skip.prev_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE…
146 (p.skip.next_prop[0] == WORD_BREAK_PROP_ALETTER ||
147 p.skip.next_prop[0] ==
148 WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
149 p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTE…
150 (p.skip.prev_prop[1] == WORD_BREAK_PROP_ALETTER ||
151 p.skip.prev_prop[1] ==
152 WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
153 p.skip.prev_prop[1] == WORD_BREAK_PROP_HEBREW_LETTE…
154 continue;
155 }
156
157 /* WB7a */
158 if (p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER…
159 p.skip.next_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE)…
160 continue;
161 }
162
163 /* WB7b */
164 if (p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER…
165 p.skip.next_prop[0] == WORD_BREAK_PROP_DOUBLE_QUOTE …
166 p.skip.next_prop[1] == WORD_BREAK_PROP_HEBREW_LETTER…
167 continue;
168 }
169
170 /* WB7c */
171 if (p.skip.prev_prop[0] == WORD_BREAK_PROP_DOUBLE_QUOTE …
172 p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER…
173 p.skip.prev_prop[1] == WORD_BREAK_PROP_HEBREW_LETTER…
174 continue;
175 }
176
177 /* WB8 */
178 if (p.skip.prev_prop[0] == WORD_BREAK_PROP_NUMERIC &&
179 p.skip.next_prop[0] == WORD_BREAK_PROP_NUMERIC) {
180 continue;
181 }
182
183 /* WB9 */
184 if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_ALETTER ||
185 p.skip.prev_prop[0] ==
186 WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
187 p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTE…
188 p.skip.next_prop[0] == WORD_BREAK_PROP_NUMERIC) {
189 continue;
190 }
191
192 /* WB10 */
193 if (p.skip.prev_prop[0] == WORD_BREAK_PROP_NUMERIC &&
194 (p.skip.next_prop[0] == WORD_BREAK_PROP_ALETTER ||
195 p.skip.next_prop[0] ==
196 WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
197 p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTE…
198 continue;
199 }
200
201 /* WB11 */
202 if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_MIDNUM ||
203 p.skip.prev_prop[0] == WORD_BREAK_PROP_MIDNUMLET ||
204 p.skip.prev_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE…
205 p.skip.next_prop[0] == WORD_BREAK_PROP_NUMERIC &&
206 p.skip.prev_prop[1] == WORD_BREAK_PROP_NUMERIC) {
207 continue;
208 }
209
210 /* WB12 */
211 if (p.skip.prev_prop[0] == WORD_BREAK_PROP_NUMERIC &&
212 (p.skip.next_prop[0] == WORD_BREAK_PROP_MIDNUM ||
213 p.skip.next_prop[0] == WORD_BREAK_PROP_MIDNUMLET ||
214 p.skip.next_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE…
215 p.skip.next_prop[1] == WORD_BREAK_PROP_NUMERIC) {
216 continue;
217 }
218
219 /* WB13 */
220 if (p.skip.prev_prop[0] == WORD_BREAK_PROP_KATAKANA &&
221 p.skip.next_prop[0] == WORD_BREAK_PROP_KATAKANA) {
222 continue;
223 }
224
225 /* WB13a */
226 if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_ALETTER ||
227 p.skip.prev_prop[0] ==
228 WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
229 p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTE…
230 p.skip.prev_prop[0] == WORD_BREAK_PROP_NUMERIC ||
231 p.skip.prev_prop[0] == WORD_BREAK_PROP_KATAKANA ||
232 p.skip.prev_prop[0] == WORD_BREAK_PROP_EXTENDNUMLET…
233 p.skip.next_prop[0] == WORD_BREAK_PROP_EXTENDNUMLET)…
234 continue;
235 }
236
237 /* WB13b */
238 if (p.skip.prev_prop[0] == WORD_BREAK_PROP_EXTENDNUMLET …
239 (p.skip.next_prop[0] == WORD_BREAK_PROP_ALETTER ||
240 p.skip.next_prop[0] ==
241 WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
242 p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTE…
243 p.skip.next_prop[0] == WORD_BREAK_PROP_NUMERIC ||
244 p.skip.next_prop[0] == WORD_BREAK_PROP_KATAKANA)) {
245 continue;
246 }
247
248 /* WB15 and WB16 */
249 if (!state.ri_even &&
250 p.skip.next_prop[0] == WORD_BREAK_PROP_REGIONAL_INDI…
251 continue;
252 }
253
254 /* WB999 */
255 break;
256 }
257
258 return herodotus_reader_number_read(&(p.mid_reader));
259 }
260
261 size_t
262 grapheme_next_word_break(const uint_least32_t *str, size_t len)
263 {
264 HERODOTUS_READER r;
265
266 herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, str, len);
267
268 return next_word_break(&r);
269 }
270
271 size_t
272 grapheme_next_word_break_utf8(const char *str, size_t len)
273 {
274 HERODOTUS_READER r;
275
276 herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, str, len);
277
278 return next_word_break(&r);
279 }
You are viewing proxied material from suckless.org. The copyright of proxied material belongs to its original authors. Any comments or complaints in relation to proxied material should be directed to the original authors of the content concerned. Please see the disclaimer for more details.