GopherProxy

	sentence.c - libgrapheme - unicode string library
	git clone git://git.suckless.org/libgrapheme
	Log
	Files
	Refs
	README
	LICENSE
	---
	sentence.c (8420B)
	---
	1 /* See LICENSE file for copyright and license details. */
	2 #include <stdbool.h>
	3 #include <stddef.h>
	4
	5 #include "../gen/sentence.h"
	6 #include "../grapheme.h"
	7 #include "util.h"
	8
	/*
	 * Progress through the left-hand-side sequences of rules SB8 to SB11
	 * (UAX #29), kept as small counters instead of backtracking.
	 * Both start at 0, meaning "not inside such a sequence".
	 */
	struct sentence_break_state {
		/* progress through "ATerm Close* Sp*": 0 to 3 */
		uint_least8_t aterm_close_sp_level;

		/* progress through "SATerm Close* Sp* ParaSep?": 0 to 4 */
		uint_least8_t saterm_close_sp_parasep_level;
	};
	13
	14 static inline uint_least8_t
	15 get_sentence_break_prop(uint_least32_t cp)
	16 {
	17 if (likely(cp <= UINT32_C(0x10FFFF))) {
	18 return (uint_least8_t)
	19 sentence_break_minor[sentence_break_major[cp >> …
	20 (cp & 0xff)];
	21 } else {
	22 return SENTENCE_BREAK_PROP_OTHER;
	23 }
	24 }
	25
	26 static bool
	27 is_skippable_sentence_prop(uint_least8_t prop)
	28 {
	29 return prop == SENTENCE_BREAK_PROP_EXTEND \|\|
	30 prop == SENTENCE_BREAK_PROP_FORMAT;
	31 }
	32
	33 static void
	34 sentence_skip_shift_callback(uint_least8_t prop, void *s)
	35 {
	36 struct sentence_break_state *state = (struct sentence_break_stat…
	37
	38 /*
	39 * Here comes a bit of magic. The rules
	40 * SB8, SB8a, SB9 and SB10 have very complicated
	41 * left-hand-side-rules of the form
	42 *
	43 * ATerm Close* Sp*
	44 * SATerm Close*
	45 * SATerm Close* Sp*
	46 * SATerm Close* Sp* ParaSep?
	47 *
	48 * but instead of backtracking, we keep the
	49 * state as some kind of "power level" in
	50 * two state-variables
	51 *
	52 * aterm_close_sp_level
	53 * saterm_close_sp_parasep_level
	54 *
	55 * that go from 0 to 3/4:
	56 *
	57 * 0: we are not in the sequence
	58 * 1: we have one ATerm/SATerm to the left of
	59 * the middle spot
	60 * 2: we have one ATerm/SATerm and one or more
	61 * Close to the left of the middle spot
	62 * 3: we have one ATerm/SATerm, zero or more
	63 * Close and one or more Sp to the left of
	64 * the middle spot.
	65 * 4: we have one SATerm, zero or more Close,
	66 * zero or more Sp and one ParaSep to the
	67 * left of the middle spot.
	68 *
	69 */
	70 if ((state->aterm_close_sp_level == 0 \|\|
	71 state->aterm_close_sp_level == 1) &&
	72 prop == SENTENCE_BREAK_PROP_ATERM) {
	73 /* sequence has begun */
	74 state->aterm_close_sp_level = 1;
	75 } else if ((state->aterm_close_sp_level == 1 \|\|
	76 state->aterm_close_sp_level == 2) &&
	77 prop == SENTENCE_BREAK_PROP_CLOSE) {
	78 /* close-sequence begins or continued */
	79 state->aterm_close_sp_level = 2;
	80 } else if ((state->aterm_close_sp_level == 1 \|\|
	81 state->aterm_close_sp_level == 2 \|\|
	82 state->aterm_close_sp_level == 3) &&
	83 prop == SENTENCE_BREAK_PROP_SP) {
	84 /* sp-sequence begins or continued */
	85 state->aterm_close_sp_level = 3;
	86 } else {
	87 /* sequence broke */
	88 state->aterm_close_sp_level = 0;
	89 }
	90
	91 if ((state->saterm_close_sp_parasep_level == 0 \|\|
	92 state->saterm_close_sp_parasep_level == 1) &&
	93 (prop == SENTENCE_BREAK_PROP_STERM \|\|
	94 prop == SENTENCE_BREAK_PROP_ATERM)) {
	95 /* sequence has begun */
	96 state->saterm_close_sp_parasep_level = 1;
	97 } else if ((state->saterm_close_sp_parasep_level == 1 \|\|
	98 state->saterm_close_sp_parasep_level == 2) &&
	99 prop == SENTENCE_BREAK_PROP_CLOSE) {
	100 /* close-sequence begins or continued */
	101 state->saterm_close_sp_parasep_level = 2;
	102 } else if ((state->saterm_close_sp_parasep_level == 1 \|\|
	103 state->saterm_close_sp_parasep_level == 2 \|\|
	104 state->saterm_close_sp_parasep_level == 3) &&
	105 prop == SENTENCE_BREAK_PROP_SP) {
	106 /* sp-sequence begins or continued */
	107 state->saterm_close_sp_parasep_level = 3;
	108 } else if ((state->saterm_close_sp_parasep_level == 1 \|\|
	109 state->saterm_close_sp_parasep_level == 2 \|\|
	110 state->saterm_close_sp_parasep_level == 3) &&
	111 (prop == SENTENCE_BREAK_PROP_SEP \|\|
	112 prop == SENTENCE_BREAK_PROP_CR \|\|
	113 prop == SENTENCE_BREAK_PROP_LF)) {
	114 /* ParaSep at the end of the sequence */
	115 state->saterm_close_sp_parasep_level = 4;
	116 } else {
	117 /* sequence broke */
	118 state->saterm_close_sp_parasep_level = 0;
	119 }
	120 }
	121
	122 static size_t
	123 next_sentence_break(HERODOTUS_READER *r)
	124 {
	125 HERODOTUS_READER tmp;
	126 enum sentence_break_property prop;
	127 struct proper p;
	128 struct sentence_break_state state = { 0 };
	129 uint_least32_t cp;
	130
	131 /*
	132 * Apply sentence breaking algorithm (UAX #29), see
	133 * https://unicode.org/reports/tr29/#Sentence_Boundary_Rules
	134 */
	135 proper_init(r, &state, NUM_SENTENCE_BREAK_PROPS,
	136 get_sentence_break_prop, is_skippable_sentence_prop,
	137 sentence_skip_shift_callback, &p);
	138
	139 while (!proper_advance(&p)) {
	140 /* SB3 */
	141 if (p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_CR &&
	142 p.raw.next_prop[0] == SENTENCE_BREAK_PROP_LF) {
	143 continue;
	144 }
	145
	146 /* SB4 */
	147 if (p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_SEP \|\|
	148 p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_CR \|\|
	149 p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_LF) {
	150 break;
	151 }
	152
	153 /* SB5 */
	154 if (p.raw.next_prop[0] == SENTENCE_BREAK_PROP_EXTEND \|\|
	155 p.raw.next_prop[0] == SENTENCE_BREAK_PROP_FORMAT) {
	156 continue;
	157 }
	158
	159 /* SB6 */
	160 if (p.skip.prev_prop[0] == SENTENCE_BREAK_PROP_ATERM &&
	161 p.skip.next_prop[0] == SENTENCE_BREAK_PROP_NUMERIC) {
	162 continue;
	163 }
	164
	165 /* SB7 */
	166 if ((p.skip.prev_prop[1] == SENTENCE_BREAK_PROP_UPPER \|\|
	167 p.skip.prev_prop[1] == SENTENCE_BREAK_PROP_LOWER) &&
	168 p.skip.prev_prop[0] == SENTENCE_BREAK_PROP_ATERM &&
	169 p.skip.next_prop[0] == SENTENCE_BREAK_PROP_UPPER) {
	170 continue;
	171 }
	172
	173 /* SB8 */
	174 if (state.aterm_close_sp_level == 1 \|\|
	175 state.aterm_close_sp_level == 2 \|\|
	176 state.aterm_close_sp_level == 3) {
	177 /*
	178 * This is the most complicated rule, requiring
	179 * the right-hand-side to satisfy the regular ex…
	180 *
	181 * ( ¬(OLetter \| Upper \| Lower \| ParaSep \| SAT…
	182 * Lower
	183 *
	184 * which we simply check "manually" given LUT-lo…
	185 * are very cheap by starting at the mid_reader.
	186 *
	187 */
	188 herodotus_reader_copy(&(p.mid_reader), &tmp);
	189
	190 prop = NUM_SENTENCE_BREAK_PROPS;
	191 while (herodotus_read_codepoint(&tmp, true, &cp)…
	192 HERODOTUS_STATUS_SUCCESS) {
	193 prop = get_sentence_break_prop(cp);
	194
	195 /*
	196 * the skippable properties are ignored
	197 * automatically here given they do not
	198 * match the following condition
	199 */
	200 if (prop == SENTENCE_BREAK_PROP_OLETTER …
	201 prop == SENTENCE_BREAK_PROP_UPPER \|\|
	202 prop == SENTENCE_BREAK_PROP_LOWER \|\|
	203 prop == SENTENCE_BREAK_PROP_SEP \|\|
	204 prop == SENTENCE_BREAK_PROP_CR \|\|
	205 prop == SENTENCE_BREAK_PROP_LF \|\|
	206 prop == SENTENCE_BREAK_PROP_STERM \|\|
	207 prop == SENTENCE_BREAK_PROP_ATERM) {
	208 break;
	209 }
	210 }
	211
	212 if (prop == SENTENCE_BREAK_PROP_LOWER) {
	213 continue;
	214 }
	215 }
	216
	217 /* SB8a */
	218 if ((state.saterm_close_sp_parasep_level == 1 \|\|
	219 state.saterm_close_sp_parasep_level == 2 \|\|
	220 state.saterm_close_sp_parasep_level == 3) &&
	221 (p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SCONTINU…
	222 p.skip.next_prop[0] == SENTENCE_BREAK_PROP_STERM \|\|
	223 p.skip.next_prop[0] == SENTENCE_BREAK_PROP_ATERM)) {
	224 continue;
	225 }
	226
	227 /* SB9 */
	228 if ((state.saterm_close_sp_parasep_level == 1 \|\|
	229 state.saterm_close_sp_parasep_level == 2) &&
	230 (p.skip.next_prop[0] == SENTENCE_BREAK_PROP_CLOSE \|\|
	231 p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SP \|\|
	232 p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SEP \|\|
	233 p.skip.next_prop[0] == SENTENCE_BREAK_PROP_CR \|\|
	234 p.skip.next_prop[0] == SENTENCE_BREAK_PROP_LF)) {
	235 continue;
	236 }
	237
	238 /* SB10 */
	239 if ((state.saterm_close_sp_parasep_level == 1 \|\|
	240 state.saterm_close_sp_parasep_level == 2 \|\|
	241 state.saterm_close_sp_parasep_level == 3) &&
	242 (p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SP \|\|
	243 p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SEP \|\|
	244 p.skip.next_prop[0] == SENTENCE_BREAK_PROP_CR \|\|
	245 p.skip.next_prop[0] == SENTENCE_BREAK_PROP_LF)) {
	246 continue;
	247 }
	248
	249 /* SB11 */
	250 if (state.saterm_close_sp_parasep_level == 1 \|\|
	251 state.saterm_close_sp_parasep_level == 2 \|\|
	252 state.saterm_close_sp_parasep_level == 3 \|\|
	253 state.saterm_close_sp_parasep_level == 4) {
	254 break;
	255 }
	256
	257 /* SB998 */
	258 continue;
	259 }
	260
	261 return herodotus_reader_number_read(&(p.mid_reader));
	262 }
	263
	264 size_t
	265 grapheme_next_sentence_break(const uint_least32_t *str, size_t len)
	266 {
	267 HERODOTUS_READER r;
	268
	269 herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, str, len);
	270
	271 return next_sentence_break(&r);
	272 }
	273
	274 size_t
	275 grapheme_next_sentence_break_utf8(const char *str, size_t len)
	276 {
	277 HERODOTUS_READER r;
	278
	279 herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, str, len);
	280
	281 return next_sentence_break(&r);
	282 }