line.c - libgrapheme - unicode string library | |
git clone git://git.suckless.org/libgrapheme | |
Log | |
Files | |
Refs | |
README | |
LICENSE | |
--- | |
line.c (11273B) | |
--- | |
1 /* See LICENSE file for copyright and license details. */ | |
2 #include <stdio.h> | |
3 #include <stdlib.h> | |
4 #include <string.h> | |
5 | |
6 #include "util.h" | |
7 | |
8 #define FILE_EAW "data/EastAsianWidth.txt" | |
9 #define FILE_EMOJI "data/emoji-data.txt" | |
10 #define FILE_LINE "data/LineBreak.txt" | |
11 | |
12 static const struct property_spec line_break_property[] = { | |
13 { | |
14 .enumname = "AL", | |
15 .file = FILE_LINE, | |
16 .ucdname = "AL", | |
17 }, | |
18 /* | |
19 * Both extended pictographic and cn are large classes, | |
20 * but we are only interested in their intersection for LB30b, | |
21 * so we have the following two temporary classes. At first | |
22 * the extpict-class is filled, then the cn-class, which leads | |
23 * to conflicts (that we handle by putting them in the "proper" | |
24 * class BOTH_CN_EXTPICT). We make use of the fact that there | |
25 * is no intersection between AL and Cn. | |
26 * | |
27 * Any consecutive conflicts are permitted to overwrite | |
28 * TMP_EXTENDED_PICTOGRAPHIC and TMP_CN, because we don't need | |
29 * them, and in the final postprocessing we "reset" all | |
30 * remaining matches (that then didn't fit any of the other | |
31 * classes) to the generic class AL. | |
32 */ | |
33 { | |
34 .enumname = "TMP_CN", | |
35 .file = FILE_LINE, | |
36 .ucdname = "Cn", | |
37 }, | |
38 { | |
39 .enumname = "TMP_EXTENDED_PICTOGRAPHIC", | |
40 .file = FILE_EMOJI, | |
41 .ucdname = "Extended_Pictographic", | |
42 }, | |
43 /* end of special block */ | |
44 { | |
45 .enumname = "B2", | |
46 .file = FILE_LINE, | |
47 .ucdname = "B2", | |
48 }, | |
49 { | |
50 .enumname = "BA", | |
51 .file = FILE_LINE, | |
52 .ucdname = "BA", | |
53 }, | |
54 { | |
55 .enumname = "BB", | |
56 .file = FILE_LINE, | |
57 .ucdname = "BB", | |
58 }, | |
59 { | |
60 .enumname = "BK", | |
61 .file = FILE_LINE, | |
62 .ucdname = "BK", | |
63 }, | |
64 { | |
65 .enumname = "BOTH_CN_EXTPICT", | |
66 .file = NULL, | |
67 .ucdname = NULL, | |
68 }, | |
69 { | |
70 .enumname = "CB", | |
71 .file = FILE_LINE, | |
72 .ucdname = "CB", | |
73 }, | |
74 { | |
75 .enumname = "CL", | |
76 .file = FILE_LINE, | |
77 .ucdname = "CL", | |
78 }, | |
79 { | |
80 .enumname = "CM", | |
81 .file = FILE_LINE, | |
82 .ucdname = "CM", | |
83 }, | |
84 { | |
85 .enumname = "CP_WITHOUT_EAW_HWF", | |
86 .file = FILE_LINE, | |
87 .ucdname = "CP", | |
88 }, | |
89 { | |
90 .enumname = "CP_WITH_EAW_HWF", | |
91 .file = NULL, | |
92 .ucdname = NULL, | |
93 }, | |
94 { | |
95 .enumname = "CR", | |
96 .file = FILE_LINE, | |
97 .ucdname = "CR", | |
98 }, | |
99 { | |
100 .enumname = "EB", | |
101 .file = FILE_LINE, | |
102 .ucdname = "EB", | |
103 }, | |
104 { | |
105 .enumname = "EM", | |
106 .file = FILE_LINE, | |
107 .ucdname = "EM", | |
108 }, | |
109 { | |
110 .enumname = "EX", | |
111 .file = FILE_LINE, | |
112 .ucdname = "EX", | |
113 }, | |
114 { | |
115 .enumname = "GL", | |
116 .file = FILE_LINE, | |
117 .ucdname = "GL", | |
118 }, | |
119 { | |
120 .enumname = "H2", | |
121 .file = FILE_LINE, | |
122 .ucdname = "H2", | |
123 }, | |
124 { | |
125 .enumname = "H3", | |
126 .file = FILE_LINE, | |
127 .ucdname = "H3", | |
128 }, | |
129 { | |
130 .enumname = "HL", | |
131 .file = FILE_LINE, | |
132 .ucdname = "HL", | |
133 }, | |
134 { | |
135 .enumname = "HY", | |
136 .file = FILE_LINE, | |
137 .ucdname = "HY", | |
138 }, | |
139 { | |
140 .enumname = "ID", | |
141 .file = FILE_LINE, | |
142 .ucdname = "ID", | |
143 }, | |
144 { | |
145 .enumname = "IN", | |
146 .file = FILE_LINE, | |
147 .ucdname = "IN", | |
148 }, | |
149 { | |
150 .enumname = "IS", | |
151 .file = FILE_LINE, | |
152 .ucdname = "IS", | |
153 }, | |
154 { | |
155 .enumname = "JL", | |
156 .file = FILE_LINE, | |
157 .ucdname = "JL", | |
158 }, | |
159 { | |
160 .enumname = "JT", | |
161 .file = FILE_LINE, | |
162 .ucdname = "JT", | |
163 }, | |
164 { | |
165 .enumname = "JV", | |
166 .file = FILE_LINE, | |
167 .ucdname = "JV", | |
168 }, | |
169 { | |
170 .enumname = "LF", | |
171 .file = FILE_LINE, | |
172 .ucdname = "LF", | |
173 }, | |
174 { | |
175 .enumname = "NL", | |
176 .file = FILE_LINE, | |
177 .ucdname = "NL", | |
178 }, | |
179 { | |
180 .enumname = "NS", | |
181 .file = FILE_LINE, | |
182 .ucdname = "NS", | |
183 }, | |
184 { | |
185 .enumname = "NU", | |
186 .file = FILE_LINE, | |
187 .ucdname = "NU", | |
188 }, | |
189 { | |
190 .enumname = "OP_WITHOUT_EAW_HWF", | |
191 .file = FILE_LINE, | |
192 .ucdname = "OP", | |
193 }, | |
194 { | |
195 .enumname = "OP_WITH_EAW_HWF", | |
196 .file = NULL, | |
197 .ucdname = NULL, | |
198 }, | |
199 { | |
200 .enumname = "PO", | |
201 .file = FILE_LINE, | |
202 .ucdname = "PO", | |
203 }, | |
204 { | |
205 .enumname = "PR", | |
206 .file = FILE_LINE, | |
207 .ucdname = "PR", | |
208 }, | |
209 { | |
210 .enumname = "QU", | |
211 .file = FILE_LINE, | |
212 .ucdname = "QU", | |
213 }, | |
214 { | |
215 .enumname = "RI", | |
216 .file = FILE_LINE, | |
217 .ucdname = "RI", | |
218 }, | |
219 { | |
220 .enumname = "SP", | |
221 .file = FILE_LINE, | |
222 .ucdname = "SP", | |
223 }, | |
224 { | |
225 .enumname = "SY", | |
226 .file = FILE_LINE, | |
227 .ucdname = "SY", | |
228 }, | |
229 { | |
230 .enumname = "WJ", | |
231 .file = FILE_LINE, | |
232 .ucdname = "WJ", | |
233 }, | |
234 { | |
235 .enumname = "ZW", | |
236 .file = FILE_LINE, | |
237 .ucdname = "ZW", | |
238 }, | |
239 { | |
240 .enumname = "ZWJ", | |
241 .file = FILE_LINE, | |
242 .ucdname = "ZWJ", | |
243 }, | |
244 { | |
245 .enumname = "TMP_AI", | |
246 .file = FILE_LINE, | |
247 .ucdname = "AI", | |
248 }, | |
249 { | |
250 .enumname = "TMP_CJ", | |
251 .file = FILE_LINE, | |
252 .ucdname = "CJ", | |
253 }, | |
254 { | |
255 .enumname = "TMP_XX", | |
256 .file = NULL, | |
257 .ucdname = NULL, | |
258 }, | |
259 { | |
260 .enumname = "TMP_MN", | |
261 .file = FILE_LINE, | |
262 .ucdname = "Mn", | |
263 }, | |
264 { | |
265 .enumname = "TMP_MC", | |
266 .file = FILE_LINE, | |
267 .ucdname = "Mc", | |
268 }, | |
269 { | |
270 .enumname = "TMP_SA_WITHOUT_MN_OR_MC", | |
271 .file = FILE_LINE, | |
272 .ucdname = "SA", | |
273 }, | |
274 { | |
275 .enumname = "TMP_SA_WITH_MN_OR_MC", | |
276 .file = FILE_LINE, | |
277 .ucdname = "SA", | |
278 }, | |
279 { | |
280 .enumname = "TMP_SG", | |
281 .file = FILE_LINE, | |
282 .ucdname = "SG", | |
283 }, | |
284 { | |
285 .enumname = "TMP_EAW_H", | |
286 .file = FILE_EAW, | |
287 .ucdname = "H", | |
288 }, | |
289 { | |
290 .enumname = "TMP_EAW_W", | |
291 .file = FILE_EAW, | |
292 .ucdname = "W", | |
293 }, | |
294 { | |
295 .enumname = "TMP_EAW_F", | |
296 .file = FILE_EAW, | |
297 .ucdname = "F", | |
298 }, | |
299 }; | |
300 | |
301 static uint_least8_t | |
302 handle_conflict(uint_least32_t cp, uint_least8_t prop1, uint_least8_t pr… | |
303 { | |
304 uint_least8_t result = prop2; | |
305 char *target = NULL; | |
306 | |
307 (void)cp; | |
308 | |
309 if ((!strcmp(line_break_property[prop1].enumname, "TMP_EAW_H") || | |
310 !strcmp(line_break_property[prop1].enumname, "TMP_EAW_W") || | |
311 !strcmp(line_break_property[prop1].enumname, "TMP_EAW_F")) … | |
312 (!strcmp(line_break_property[prop2].enumname, "TMP_EAW_H") || | |
313 !strcmp(line_break_property[prop2].enumname, "TMP_EAW_W") || | |
314 !strcmp(line_break_property[prop2].enumname, "TMP_EAW_F")))… | |
315 if (!strcmp(line_break_property[prop1].enumname, | |
316 "CP_WITHOUT_EAW_HWF") || | |
317 !strcmp(line_break_property[prop2].enumname, | |
318 "CP_WITHOUT_EAW_HWF")) { | |
319 target = "CP_WITH_EAW_HWF"; | |
320 } else if (!strcmp(line_break_property[prop1].enumname, | |
321 "OP_WITHOUT_EAW_HWF") || | |
322 !strcmp(line_break_property[prop2].enumname, | |
323 "OP_WITHOUT_EAW_HWF")) { | |
324 target = "OP_WITH_EAW_HWF"; | |
325 } else { | |
326 /* ignore EAW for the rest */ | |
327 if ((!strcmp(line_break_property[prop1].enumname, | |
328 "TMP_EAW_H") || | |
329 !strcmp(line_break_property[prop1].enumname, | |
330 "TMP_EAW_W") || | |
331 !strcmp(line_break_property[prop1].enumname, | |
332 "TMP_EAW_F"))) { | |
333 result = prop2; | |
334 } else { | |
335 result = prop1; | |
336 } | |
337 } | |
338 } else if ((!strcmp(line_break_property[prop1].enumname, "TMP_MN… | |
339 !strcmp(line_break_property[prop1].enumname, "TMP_MC… | |
340 (!strcmp(line_break_property[prop2].enumname, "TMP_MN… | |
341 !strcmp(line_break_property[prop2].enumname, "TMP_MC… | |
342 if (!strcmp(line_break_property[prop1].enumname, | |
343 "SA_WITHOUT_MN_OR_MC") || | |
344 !strcmp(line_break_property[prop2].enumname, | |
345 "SA_WITHOUT_MN_OR_MC")) { | |
346 target = "SA_WITH_MN_OR_MC"; | |
347 } else { | |
348 /* ignore Mn and Mc for the rest */ | |
349 if ((!strcmp(line_break_property[prop1].enumname, | |
350 "TMP_MN") || | |
351 !strcmp(line_break_property[prop1].enumname, | |
352 "TMP_MC"))) { | |
353 result = prop2; | |
354 } else { | |
355 result = prop1; | |
356 } | |
357 } | |
358 } else if (!strcmp(line_break_property[prop1].enumname, "TMP_CN"… | |
359 !strcmp(line_break_property[prop2].enumname, "TMP_CN"… | |
360 if (!strcmp(line_break_property[prop1].enumname, | |
361 "TMP_EXTENDED_PICTOGRAPHIC") || | |
362 !strcmp(line_break_property[prop2].enumname, | |
363 "TMP_EXTENDED_PICTOGRAPHIC")) { | |
364 target = "BOTH_CN_EXTPICT"; | |
365 } else { | |
366 /* ignore Cn for all the other properties */ | |
367 if (!strcmp(line_break_property[prop1].enumname, | |
368 "TMP_CN")) { | |
369 result = prop2; | |
370 } else { | |
371 result = prop1; | |
372 } | |
373 } | |
374 } else if (!strcmp(line_break_property[prop1].enumname, | |
375 "TMP_EXTENDED_PICTOGRAPHIC") || | |
376 !strcmp(line_break_property[prop2].enumname, | |
377 "TMP_EXTENDED_PICTOGRAPHIC")) { | |
378 if (!strcmp(line_break_property[prop1].enumname, "TMP_CN… | |
379 !strcmp(line_break_property[prop2].enumname, "TMP_CN… | |
380 target = "BOTH_CN_EXTPICT"; | |
381 } else { | |
382 /* ignore Extended_Pictographic for all the other | |
383 * properties */ | |
384 if (!strcmp(line_break_property[prop1].enumname, | |
385 "TMP_EXTENDED_PICTOGRAPHIC")) { | |
386 result = prop2; | |
387 } else { | |
388 result = prop1; | |
389 } | |
390 } | |
391 } else { | |
392 fprintf(stderr, | |
393 "handle_conflict: Cannot handle conflict %s <- %… | |
394 line_break_property[prop1].enumname, | |
395 line_break_property[prop2].enumname); | |
396 exit(1); | |
397 } | |
398 | |
399 if (target) { | |
400 for (result = 0; result < LEN(line_break_property); resu… | |
401 if (!strcmp(line_break_property[result].enumname, | |
402 target)) { | |
403 break; | |
404 } | |
405 } | |
406 if (result == LEN(line_break_property)) { | |
407 fprintf(stderr, "handle_conflict: Internal error… | |
408 exit(1); | |
409 } | |
410 } | |
411 | |
412 return result; | |
413 } | |
414 | |
415 static void | |
416 post_process(struct properties *prop) | |
417 { | |
418 const char *target; | |
419 uint_least8_t result; | |
420 size_t i; | |
421 | |
422 /* post-mapping according to the line breaking algorithm */ | |
423 for (i = 0; i < UINT32_C(0x110000); i++) { | |
424 /* LB1 */ | |
425 if (!strcmp(line_break_property[prop[i].property].enumna… | |
426 "TMP_AI") || | |
427 !strcmp(line_break_property[prop[i].property].enumna… | |
428 "TMP_SG") || | |
429 !strcmp(line_break_property[prop[i].property].enumna… | |
430 "TMP_XX")) { | |
431 /* map AI, SG and XX to AL */ | |
432 target = "AL"; | |
433 } else if (!strcmp(line_break_property[prop[i].property] | |
434 .enumname, | |
435 "TMP_SA_WITH_MN_OR_MC")) { | |
436 /* map SA (with General_Category Mn or Mc) to CM… | |
437 target = "CM"; | |
438 } else if (!strcmp(line_break_property[prop[i].property] | |
439 .enumname, | |
440 "TMP_SA_WITHOUT_MN_OR_MC")) { | |
441 /* map SA (without General_Category Mn or Mc) to… | |
442 target = "AL"; | |
443 } else if (!strcmp(line_break_property[prop[i].property] | |
444 .enumname, | |
445 "TMP_CJ")) { | |
446 /* map CJ to NS */ | |
447 target = "NS"; | |
448 } else if ( | |
449 !strcmp(line_break_property[prop[i].property].en… | |
450 "TMP_CN") || | |
451 !strcmp(line_break_property[prop[i].property].en… | |
452 "TMP_EXTENDED_PICTOGRAPHIC") || | |
453 !strcmp(line_break_property[prop[i].property].en… | |
454 "TMP_MN") || | |
455 !strcmp(line_break_property[prop[i].property].en… | |
456 "TMP_MC") || | |
457 !strcmp(line_break_property[prop[i].property].en… | |
458 "TMP_EAW_H") || | |
459 !strcmp(line_break_property[prop[i].property].en… | |
460 "TMP_EAW_W") || | |
461 !strcmp(line_break_property[prop[i].property].en… | |
462 "TMP_EAW_F")) { | |
463 /* map all the temporary classes "residue" to AL… | |
464 target = "AL"; | |
465 } else { | |
466 target = NULL; | |
467 } | |
468 | |
469 if (target) { | |
470 for (result = 0; result < LEN(line_break_propert… | |
471 result++) { | |
472 if (!strcmp(line_break_property[result] | |
473 .enumname, | |
474 target)) { | |
475 break; | |
476 } | |
477 } | |
478 if (result == LEN(line_break_property)) { | |
479 fprintf(stderr, | |
480 "handle_conflict: Internal error… | |
481 exit(1); | |
482 } | |
483 | |
484 prop[i].property = result; | |
485 } | |
486 } | |
487 } | |
488 | |
489 int | |
490 main(int argc, char *argv[]) | |
491 { | |
492 (void)argc; | |
493 | |
494 properties_generate_break_property( | |
495 line_break_property, LEN(line_break_property), NULL, | |
496 handle_conflict, post_process, "line_break", argv[0]); | |
497 | |
498 return 0; | |
499 } |