sentence.c - libgrapheme - unicode string library | |
git clone git://git.suckless.org/libgrapheme | |
Log | |
Files | |
Refs | |
README | |
LICENSE | |
--- | |
sentence.c (8420B) | |
--- | |
1 /* See LICENSE file for copyright and license details. */ | |
2 #include <stdbool.h> | |
3 #include <stddef.h> | |
4 | |
5 #include "../gen/sentence.h" | |
6 #include "../grapheme.h" | |
7 #include "util.h" | |
8 | |
/*
 * Progress trackers for the left-hand sides of the rules SB8 to SB11.
 * Both are updated by sentence_skip_shift_callback() and read by
 * next_sentence_break(); a value of 0 means "not inside the sequence".
 */
struct sentence_break_state {
	/* position within "ATerm Close* Sp*", from 0 up to 3 */
	uint_least8_t aterm_close_sp_level;
	/* position within "SATerm Close* Sp* ParaSep?", from 0 up to 4 */
	uint_least8_t saterm_close_sp_parasep_level;
};
13 | |
/*
 * Map the codepoint cp to its sentence-break property.
 *
 * Codepoints up to and including U+10FFFF are looked up through the
 * sentence_break_major and sentence_break_minor tables, with the low
 * byte (cp & 0xff) forming part of the index into the minor table.
 * Anything above U+10FFFF yields SENTENCE_BREAK_PROP_OTHER.
 *
 * NOTE(review): the lookup expression is cut off in this listing
 * (everything after "cp >>" on that line is missing), so the exact
 * index arithmetic has to be confirmed against the real source file.
 */
14 static inline uint_least8_t | |
15 get_sentence_break_prop(uint_least32_t cp) | |
16 { | |
17 if (likely(cp <= UINT32_C(0x10FFFF))) { | |
18 return (uint_least8_t) | |
19 sentence_break_minor[sentence_break_major[cp >> … | |
20 (cp & 0xff)]; | |
21 } else { | |
22 return SENTENCE_BREAK_PROP_OTHER; | |
23 } | |
24 } | |
25 | |
26 static bool | |
27 is_skippable_sentence_prop(uint_least8_t prop) | |
28 { | |
29 return prop == SENTENCE_BREAK_PROP_EXTEND || | |
30 prop == SENTENCE_BREAK_PROP_FORMAT; | |
31 } | |
32 | |
33 static void | |
34 sentence_skip_shift_callback(uint_least8_t prop, void *s) | |
35 { | |
36 struct sentence_break_state *state = (struct sentence_break_stat… | |
37 | |
38 /* | |
39 * Here comes a bit of magic. The rules | |
40 * SB8, SB8a, SB9 and SB10 have very complicated | |
41 * left-hand-side-rules of the form | |
42 * | |
43 * ATerm Close* Sp* | |
44 * SATerm Close* | |
45 * SATerm Close* Sp* | |
46 * SATerm Close* Sp* ParaSep? | |
47 * | |
48 * but instead of backtracking, we keep the | |
49 * state as some kind of "power level" in | |
50 * two state-variables | |
51 * | |
52 * aterm_close_sp_level | |
53 * saterm_close_sp_parasep_level | |
54 * | |
55 * that go from 0 to 3/4: | |
56 * | |
57 * 0: we are not in the sequence | |
58 * 1: we have one ATerm/SATerm to the left of | |
59 * the middle spot | |
60 * 2: we have one ATerm/SATerm and one or more | |
61 * Close to the left of the middle spot | |
62 * 3: we have one ATerm/SATerm, zero or more | |
63 * Close and one or more Sp to the left of | |
64 * the middle spot. | |
65 * 4: we have one SATerm, zero or more Close, | |
66 * zero or more Sp and one ParaSep to the | |
67 * left of the middle spot. | |
68 * | |
69 */ | |
70 if ((state->aterm_close_sp_level == 0 || | |
71 state->aterm_close_sp_level == 1) && | |
72 prop == SENTENCE_BREAK_PROP_ATERM) { | |
73 /* sequence has begun */ | |
74 state->aterm_close_sp_level = 1; | |
75 } else if ((state->aterm_close_sp_level == 1 || | |
76 state->aterm_close_sp_level == 2) && | |
77 prop == SENTENCE_BREAK_PROP_CLOSE) { | |
78 /* close-sequence begins or continued */ | |
79 state->aterm_close_sp_level = 2; | |
80 } else if ((state->aterm_close_sp_level == 1 || | |
81 state->aterm_close_sp_level == 2 || | |
82 state->aterm_close_sp_level == 3) && | |
83 prop == SENTENCE_BREAK_PROP_SP) { | |
84 /* sp-sequence begins or continued */ | |
85 state->aterm_close_sp_level = 3; | |
86 } else { | |
87 /* sequence broke */ | |
88 state->aterm_close_sp_level = 0; | |
89 } | |
90 | |
91 if ((state->saterm_close_sp_parasep_level == 0 || | |
92 state->saterm_close_sp_parasep_level == 1) && | |
93 (prop == SENTENCE_BREAK_PROP_STERM || | |
94 prop == SENTENCE_BREAK_PROP_ATERM)) { | |
95 /* sequence has begun */ | |
96 state->saterm_close_sp_parasep_level = 1; | |
97 } else if ((state->saterm_close_sp_parasep_level == 1 || | |
98 state->saterm_close_sp_parasep_level == 2) && | |
99 prop == SENTENCE_BREAK_PROP_CLOSE) { | |
100 /* close-sequence begins or continued */ | |
101 state->saterm_close_sp_parasep_level = 2; | |
102 } else if ((state->saterm_close_sp_parasep_level == 1 || | |
103 state->saterm_close_sp_parasep_level == 2 || | |
104 state->saterm_close_sp_parasep_level == 3) && | |
105 prop == SENTENCE_BREAK_PROP_SP) { | |
106 /* sp-sequence begins or continued */ | |
107 state->saterm_close_sp_parasep_level = 3; | |
108 } else if ((state->saterm_close_sp_parasep_level == 1 || | |
109 state->saterm_close_sp_parasep_level == 2 || | |
110 state->saterm_close_sp_parasep_level == 3) && | |
111 (prop == SENTENCE_BREAK_PROP_SEP || | |
112 prop == SENTENCE_BREAK_PROP_CR || | |
113 prop == SENTENCE_BREAK_PROP_LF)) { | |
114 /* ParaSep at the end of the sequence */ | |
115 state->saterm_close_sp_parasep_level = 4; | |
116 } else { | |
117 /* sequence broke */ | |
118 state->saterm_close_sp_parasep_level = 0; | |
119 } | |
120 } | |
121 | |
/*
 * Find the next sentence break in the input behind the reader r.
 *
 * A zero-initialised sentence_break_state is handed to proper_init()
 * together with get_sentence_break_prop, is_skippable_sentence_prop
 * and sentence_skip_shift_callback. The loop then checks the UAX #29
 * rules SB3 to SB11 in order for each position proper_advance()
 * offers: a rule that forbids a break continues the loop, a rule
 * that demands one leaves it, and SB998 (no break) is the fallback.
 * The rules SB8 to SB11 do not look backwards themselves; they read
 * the two level counters kept in state.
 *
 * The return value is herodotus_reader_number_read() applied to
 * p.mid_reader once the loop has ended, either through a break rule
 * or because proper_advance() returned non-zero (presumably at the
 * end of the input; confirm against its definition in util).
 *
 * NOTE(review): several lines of this listing are cut off (marked
 * with an ellipsis), among them the read-loop condition and parts of
 * the SB8 and SB8a conditions; consult the real source before
 * changing them.
 */
122 static size_t | |
123 next_sentence_break(HERODOTUS_READER *r) | |
124 { | |
125 HERODOTUS_READER tmp; | |
126 enum sentence_break_property prop; | |
127 struct proper p; | |
128 struct sentence_break_state state = { 0 }; | |
129 uint_least32_t cp; | |
130 | |
131 /* | |
132 * Apply sentence breaking algorithm (UAX #29), see | |
133 * https://unicode.org/reports/tr29/#Sentence_Boundary_Rules | |
134 */ | |
135 proper_init(r, &state, NUM_SENTENCE_BREAK_PROPS, | |
136 get_sentence_break_prop, is_skippable_sentence_prop, | |
137 sentence_skip_shift_callback, &p); | |
138 | |
139 while (!proper_advance(&p)) { | |
140 /* SB3 */ | |
141 if (p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_CR && | |
142 p.raw.next_prop[0] == SENTENCE_BREAK_PROP_LF) { | |
143 continue; | |
144 } | |
145 | |
146 /* SB4 */ | |
147 if (p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_SEP || | |
148 p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_CR || | |
149 p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_LF) { | |
150 break; | |
151 } | |
152 | |
153 /* SB5 */ | |
154 if (p.raw.next_prop[0] == SENTENCE_BREAK_PROP_EXTEND || | |
155 p.raw.next_prop[0] == SENTENCE_BREAK_PROP_FORMAT) { | |
156 continue; | |
157 } | |
158 | |
159 /* SB6 */ | |
160 if (p.skip.prev_prop[0] == SENTENCE_BREAK_PROP_ATERM && | |
161 p.skip.next_prop[0] == SENTENCE_BREAK_PROP_NUMERIC) { | |
162 continue; | |
163 } | |
164 | |
165 /* SB7 */ | |
166 if ((p.skip.prev_prop[1] == SENTENCE_BREAK_PROP_UPPER || | |
167 p.skip.prev_prop[1] == SENTENCE_BREAK_PROP_LOWER) && | |
168 p.skip.prev_prop[0] == SENTENCE_BREAK_PROP_ATERM && | |
169 p.skip.next_prop[0] == SENTENCE_BREAK_PROP_UPPER) { | |
170 continue; | |
171 } | |
172 | |
173 /* SB8 */ | |
174 if (state.aterm_close_sp_level == 1 || | |
175 state.aterm_close_sp_level == 2 || | |
176 state.aterm_close_sp_level == 3) { | |
177 /* | |
178 * This is the most complicated rule, requiring | |
179 * the right-hand-side to satisfy the regular ex… | |
180 * | |
181 * ( ¬(OLetter | Upper | Lower | ParaSep | SAT… | |
182 * Lower | |
183 * | |
184 * which we simply check "manually" given LUT-lo… | |
185 * are very cheap by starting at the mid_reader. | |
186 * | |
187 */ | |
188 herodotus_reader_copy(&(p.mid_reader), &tmp); | |
189 | |
190 prop = NUM_SENTENCE_BREAK_PROPS; | |
191 while (herodotus_read_codepoint(&tmp, true, &cp)… | |
192 HERODOTUS_STATUS_SUCCESS) { | |
193 prop = get_sentence_break_prop(cp); | |
194 | |
195 /* | |
196 * the skippable properties are ignored | |
197 * automatically here given they do not | |
198 * match the following condition | |
199 */ | |
200 if (prop == SENTENCE_BREAK_PROP_OLETTER … | |
201 prop == SENTENCE_BREAK_PROP_UPPER || | |
202 prop == SENTENCE_BREAK_PROP_LOWER || | |
203 prop == SENTENCE_BREAK_PROP_SEP || | |
204 prop == SENTENCE_BREAK_PROP_CR || | |
205 prop == SENTENCE_BREAK_PROP_LF || | |
206 prop == SENTENCE_BREAK_PROP_STERM || | |
207 prop == SENTENCE_BREAK_PROP_ATERM) { | |
208 break; | |
209 } | |
210 } | |
211 | |
212 if (prop == SENTENCE_BREAK_PROP_LOWER) { | |
213 continue; | |
214 } | |
215 } | |
216 | |
217 /* SB8a */ | |
218 if ((state.saterm_close_sp_parasep_level == 1 || | |
219 state.saterm_close_sp_parasep_level == 2 || | |
220 state.saterm_close_sp_parasep_level == 3) && | |
221 (p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SCONTINU… | |
222 p.skip.next_prop[0] == SENTENCE_BREAK_PROP_STERM || | |
223 p.skip.next_prop[0] == SENTENCE_BREAK_PROP_ATERM)) { | |
224 continue; | |
225 } | |
226 | |
227 /* SB9 */ | |
228 if ((state.saterm_close_sp_parasep_level == 1 || | |
229 state.saterm_close_sp_parasep_level == 2) && | |
230 (p.skip.next_prop[0] == SENTENCE_BREAK_PROP_CLOSE || | |
231 p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SP || | |
232 p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SEP || | |
233 p.skip.next_prop[0] == SENTENCE_BREAK_PROP_CR || | |
234 p.skip.next_prop[0] == SENTENCE_BREAK_PROP_LF)) { | |
235 continue; | |
236 } | |
237 | |
238 /* SB10 */ | |
239 if ((state.saterm_close_sp_parasep_level == 1 || | |
240 state.saterm_close_sp_parasep_level == 2 || | |
241 state.saterm_close_sp_parasep_level == 3) && | |
242 (p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SP || | |
243 p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SEP || | |
244 p.skip.next_prop[0] == SENTENCE_BREAK_PROP_CR || | |
245 p.skip.next_prop[0] == SENTENCE_BREAK_PROP_LF)) { | |
246 continue; | |
247 } | |
248 | |
249 /* SB11 */ | |
250 if (state.saterm_close_sp_parasep_level == 1 || | |
251 state.saterm_close_sp_parasep_level == 2 || | |
252 state.saterm_close_sp_parasep_level == 3 || | |
253 state.saterm_close_sp_parasep_level == 4) { | |
254 break; | |
255 } | |
256 | |
257 /* SB998 */ | |
258 continue; | |
259 } | |
260 | |
261 return herodotus_reader_number_read(&(p.mid_reader)); | |
262 } | |
263 | |
264 size_t | |
265 grapheme_next_sentence_break(const uint_least32_t *str, size_t len) | |
266 { | |
267 HERODOTUS_READER r; | |
268 | |
269 herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, str, len); | |
270 | |
271 return next_sentence_break(&r); | |
272 } | |
273 | |
274 size_t | |
275 grapheme_next_sentence_break_utf8(const char *str, size_t len) | |
276 { | |
277 HERODOTUS_READER r; | |
278 | |
279 herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, str, len); | |
280 | |
281 return next_sentence_break(&r); | |
282 } |