Introduction
Introduction Statistics Contact Development Disclaimer Help
sentence.c - libgrapheme - unicode string library
git clone git://git.suckless.org/libgrapheme
Log
Files
Refs
README
LICENSE
---
sentence.c (8420B)
---
1 /* See LICENSE file for copyright and license details. */
2 #include <stdbool.h>
3 #include <stddef.h>
4
5 #include "../gen/sentence.h"
6 #include "../grapheme.h"
7 #include "util.h"
8
9 struct sentence_break_state {
10 uint_least8_t aterm_close_sp_level;
11 uint_least8_t saterm_close_sp_parasep_level;
12 };
13
14 static inline uint_least8_t
15 get_sentence_break_prop(uint_least32_t cp)
16 {
17 if (likely(cp <= UINT32_C(0x10FFFF))) {
18 return (uint_least8_t)
19 sentence_break_minor[sentence_break_major[cp >> …
20 (cp & 0xff)];
21 } else {
22 return SENTENCE_BREAK_PROP_OTHER;
23 }
24 }
25
26 static bool
27 is_skippable_sentence_prop(uint_least8_t prop)
28 {
29 return prop == SENTENCE_BREAK_PROP_EXTEND ||
30 prop == SENTENCE_BREAK_PROP_FORMAT;
31 }
32
33 static void
34 sentence_skip_shift_callback(uint_least8_t prop, void *s)
35 {
36 struct sentence_break_state *state = (struct sentence_break_stat…
37
38 /*
39 * Here comes a bit of magic. The rules
40 * SB8, SB8a, SB9 and SB10 have very complicated
41 * left-hand-side-rules of the form
42 *
43 * ATerm Close* Sp*
44 * SATerm Close*
45 * SATerm Close* Sp*
46 * SATerm Close* Sp* ParaSep?
47 *
48 * but instead of backtracking, we keep the
49 * state as some kind of "power level" in
50 * two state-variables
51 *
52 * aterm_close_sp_level
53 * saterm_close_sp_parasep_level
54 *
55 * that go from 0 to 3/4:
56 *
57 * 0: we are not in the sequence
58 * 1: we have one ATerm/SATerm to the left of
59 * the middle spot
60 * 2: we have one ATerm/SATerm and one or more
61 * Close to the left of the middle spot
62 * 3: we have one ATerm/SATerm, zero or more
63 * Close and one or more Sp to the left of
64 * the middle spot.
65 * 4: we have one SATerm, zero or more Close,
66 * zero or more Sp and one ParaSep to the
67 * left of the middle spot.
68 *
69 */
70 if ((state->aterm_close_sp_level == 0 ||
71 state->aterm_close_sp_level == 1) &&
72 prop == SENTENCE_BREAK_PROP_ATERM) {
73 /* sequence has begun */
74 state->aterm_close_sp_level = 1;
75 } else if ((state->aterm_close_sp_level == 1 ||
76 state->aterm_close_sp_level == 2) &&
77 prop == SENTENCE_BREAK_PROP_CLOSE) {
78 /* close-sequence begins or continued */
79 state->aterm_close_sp_level = 2;
80 } else if ((state->aterm_close_sp_level == 1 ||
81 state->aterm_close_sp_level == 2 ||
82 state->aterm_close_sp_level == 3) &&
83 prop == SENTENCE_BREAK_PROP_SP) {
84 /* sp-sequence begins or continued */
85 state->aterm_close_sp_level = 3;
86 } else {
87 /* sequence broke */
88 state->aterm_close_sp_level = 0;
89 }
90
91 if ((state->saterm_close_sp_parasep_level == 0 ||
92 state->saterm_close_sp_parasep_level == 1) &&
93 (prop == SENTENCE_BREAK_PROP_STERM ||
94 prop == SENTENCE_BREAK_PROP_ATERM)) {
95 /* sequence has begun */
96 state->saterm_close_sp_parasep_level = 1;
97 } else if ((state->saterm_close_sp_parasep_level == 1 ||
98 state->saterm_close_sp_parasep_level == 2) &&
99 prop == SENTENCE_BREAK_PROP_CLOSE) {
100 /* close-sequence begins or continued */
101 state->saterm_close_sp_parasep_level = 2;
102 } else if ((state->saterm_close_sp_parasep_level == 1 ||
103 state->saterm_close_sp_parasep_level == 2 ||
104 state->saterm_close_sp_parasep_level == 3) &&
105 prop == SENTENCE_BREAK_PROP_SP) {
106 /* sp-sequence begins or continued */
107 state->saterm_close_sp_parasep_level = 3;
108 } else if ((state->saterm_close_sp_parasep_level == 1 ||
109 state->saterm_close_sp_parasep_level == 2 ||
110 state->saterm_close_sp_parasep_level == 3) &&
111 (prop == SENTENCE_BREAK_PROP_SEP ||
112 prop == SENTENCE_BREAK_PROP_CR ||
113 prop == SENTENCE_BREAK_PROP_LF)) {
114 /* ParaSep at the end of the sequence */
115 state->saterm_close_sp_parasep_level = 4;
116 } else {
117 /* sequence broke */
118 state->saterm_close_sp_parasep_level = 0;
119 }
120 }
121
122 static size_t
123 next_sentence_break(HERODOTUS_READER *r)
124 {
125 HERODOTUS_READER tmp;
126 enum sentence_break_property prop;
127 struct proper p;
128 struct sentence_break_state state = { 0 };
129 uint_least32_t cp;
130
131 /*
132 * Apply sentence breaking algorithm (UAX #29), see
133 * https://unicode.org/reports/tr29/#Sentence_Boundary_Rules
134 */
135 proper_init(r, &state, NUM_SENTENCE_BREAK_PROPS,
136 get_sentence_break_prop, is_skippable_sentence_prop,
137 sentence_skip_shift_callback, &p);
138
139 while (!proper_advance(&p)) {
140 /* SB3 */
141 if (p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_CR &&
142 p.raw.next_prop[0] == SENTENCE_BREAK_PROP_LF) {
143 continue;
144 }
145
146 /* SB4 */
147 if (p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_SEP ||
148 p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_CR ||
149 p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_LF) {
150 break;
151 }
152
153 /* SB5 */
154 if (p.raw.next_prop[0] == SENTENCE_BREAK_PROP_EXTEND ||
155 p.raw.next_prop[0] == SENTENCE_BREAK_PROP_FORMAT) {
156 continue;
157 }
158
159 /* SB6 */
160 if (p.skip.prev_prop[0] == SENTENCE_BREAK_PROP_ATERM &&
161 p.skip.next_prop[0] == SENTENCE_BREAK_PROP_NUMERIC) {
162 continue;
163 }
164
165 /* SB7 */
166 if ((p.skip.prev_prop[1] == SENTENCE_BREAK_PROP_UPPER ||
167 p.skip.prev_prop[1] == SENTENCE_BREAK_PROP_LOWER) &&
168 p.skip.prev_prop[0] == SENTENCE_BREAK_PROP_ATERM &&
169 p.skip.next_prop[0] == SENTENCE_BREAK_PROP_UPPER) {
170 continue;
171 }
172
173 /* SB8 */
174 if (state.aterm_close_sp_level == 1 ||
175 state.aterm_close_sp_level == 2 ||
176 state.aterm_close_sp_level == 3) {
177 /*
178 * This is the most complicated rule, requiring
179 * the right-hand-side to satisfy the regular ex…
180 *
181 * ( ¬(OLetter | Upper | Lower | ParaSep | SAT…
182 * Lower
183 *
184 * which we simply check "manually" given LUT-lo…
185 * are very cheap by starting at the mid_reader.
186 *
187 */
188 herodotus_reader_copy(&(p.mid_reader), &tmp);
189
190 prop = NUM_SENTENCE_BREAK_PROPS;
191 while (herodotus_read_codepoint(&tmp, true, &cp)…
192 HERODOTUS_STATUS_SUCCESS) {
193 prop = get_sentence_break_prop(cp);
194
195 /*
196 * the skippable properties are ignored
197 * automatically here given they do not
198 * match the following condition
199 */
200 if (prop == SENTENCE_BREAK_PROP_OLETTER …
201 prop == SENTENCE_BREAK_PROP_UPPER ||
202 prop == SENTENCE_BREAK_PROP_LOWER ||
203 prop == SENTENCE_BREAK_PROP_SEP ||
204 prop == SENTENCE_BREAK_PROP_CR ||
205 prop == SENTENCE_BREAK_PROP_LF ||
206 prop == SENTENCE_BREAK_PROP_STERM ||
207 prop == SENTENCE_BREAK_PROP_ATERM) {
208 break;
209 }
210 }
211
212 if (prop == SENTENCE_BREAK_PROP_LOWER) {
213 continue;
214 }
215 }
216
217 /* SB8a */
218 if ((state.saterm_close_sp_parasep_level == 1 ||
219 state.saterm_close_sp_parasep_level == 2 ||
220 state.saterm_close_sp_parasep_level == 3) &&
221 (p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SCONTINU…
222 p.skip.next_prop[0] == SENTENCE_BREAK_PROP_STERM ||
223 p.skip.next_prop[0] == SENTENCE_BREAK_PROP_ATERM)) {
224 continue;
225 }
226
227 /* SB9 */
228 if ((state.saterm_close_sp_parasep_level == 1 ||
229 state.saterm_close_sp_parasep_level == 2) &&
230 (p.skip.next_prop[0] == SENTENCE_BREAK_PROP_CLOSE ||
231 p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SP ||
232 p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SEP ||
233 p.skip.next_prop[0] == SENTENCE_BREAK_PROP_CR ||
234 p.skip.next_prop[0] == SENTENCE_BREAK_PROP_LF)) {
235 continue;
236 }
237
238 /* SB10 */
239 if ((state.saterm_close_sp_parasep_level == 1 ||
240 state.saterm_close_sp_parasep_level == 2 ||
241 state.saterm_close_sp_parasep_level == 3) &&
242 (p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SP ||
243 p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SEP ||
244 p.skip.next_prop[0] == SENTENCE_BREAK_PROP_CR ||
245 p.skip.next_prop[0] == SENTENCE_BREAK_PROP_LF)) {
246 continue;
247 }
248
249 /* SB11 */
250 if (state.saterm_close_sp_parasep_level == 1 ||
251 state.saterm_close_sp_parasep_level == 2 ||
252 state.saterm_close_sp_parasep_level == 3 ||
253 state.saterm_close_sp_parasep_level == 4) {
254 break;
255 }
256
257 /* SB998 */
258 continue;
259 }
260
261 return herodotus_reader_number_read(&(p.mid_reader));
262 }
263
264 size_t
265 grapheme_next_sentence_break(const uint_least32_t *str, size_t len)
266 {
267 HERODOTUS_READER r;
268
269 herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, str, len);
270
271 return next_sentence_break(&r);
272 }
273
274 size_t
275 grapheme_next_sentence_break_utf8(const char *str, size_t len)
276 {
277 HERODOTUS_READER r;
278
279 herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, str, len);
280
281 return next_sentence_break(&r);
282 }
You are viewing proxied material from suckless.org. The copyright of proxied material belongs to its original authors. Any comments or complaints in relation to proxied material should be directed to the original authors of the content concerned. Please see the disclaimer for more details.