case.c - libgrapheme - unicode string library | |
git clone git://git.suckless.org/libgrapheme | |
Log | |
Files | |
Refs | |
README | |
LICENSE | |
--- | |
case.c (8442B) | |
--- | |
1 /* See LICENSE file for copyright and license details. */ | |
2 #include <errno.h> | |
3 #include <stdint.h> | |
4 #include <stdio.h> | |
5 #include <stdlib.h> | |
6 #include <string.h> | |
7 | |
8 #include "util.h" | |
9 | |
10 #define FILE_DCP "data/DerivedCoreProperties.txt" | |
11 | |
12 static const struct property_spec case_property[] = { | |
13 { | |
14 .enumname = "OTHER", | |
15 .file = NULL, | |
16 .ucdname = NULL, | |
17 }, | |
18 { | |
19 .enumname = "BOTH_CASED_CASE_IGNORABLE", | |
20 .file = NULL, | |
21 .ucdname = NULL, | |
22 }, | |
23 { | |
24 .enumname = "CASED", | |
25 .file = FILE_DCP, | |
26 .ucdname = "Cased", | |
27 }, | |
28 { | |
29 .enumname = "CASE_IGNORABLE", | |
30 .file = FILE_DCP, | |
31 .ucdname = "Case_Ignorable", | |
32 }, | |
33 { | |
34 .enumname = "UNCASED", | |
35 .file = FILE_DCP, | |
36 .ucdname = "Uncased", | |
37 }, | |
38 }; | |
39 | |
/*
 * Decide which case property to keep when two of them clash.
 * (Presumably called by properties_generate_break_property(), which
 * receives this function as an argument in main() -- confirm in util.c.)
 *
 * prop1 and the second property argument are indices into
 * case_property.  Only the pair CASED / CASE_IGNORABLE, in either
 * order, is resolvable: the result is then the index of the
 * BOTH_CASED_CASE_IGNORABLE entry.  For any other pair, or if that
 * entry is missing from the table, a message is written to stderr and
 * the program exits with status 1.  cp is not used.
 *
 * NOTE(review): the signature line and both fprintf() lines are cut
 * off ("…") in this copy of the file; the code is left as found.
 */
40 static uint_least8_t | |
41 handle_conflict(uint_least32_t cp, uint_least8_t prop1, uint_least8_t pr… | |
42 { | |
43 uint_least8_t result; | |
44 | |
45 (void)cp; | |
46 | |
/* the two properties are identified by enum name, not by table position */
47 if ((!strcmp(case_property[prop1].enumname, "CASED") && | |
48 !strcmp(case_property[prop2].enumname, "CASE_IGNORABLE")) || | |
49 (!strcmp(case_property[prop1].enumname, "CASE_IGNORABLE") && | |
50 !strcmp(case_property[prop2].enumname, "CASED"))) { | |
/* find the combined class by name instead of hard-coding its index */
51 for (result = 0; result < LEN(case_property); result++) { | |
52 if (!strcmp(case_property[result].enumname, | |
53 "BOTH_CASED_CASE_IGNORABLE")) { | |
54 break; | |
55 } | |
56 } | |
/* the loop ran off the end: the table has no combined class */
57 if (result == LEN(case_property)) { | |
58 fprintf(stderr, "handle_conflict: Internal error… | |
59 exit(1); | |
60 } | |
61 } else { | |
62 fprintf(stderr, "handle_conflict: Cannot handle conflict… | |
63 exit(1); | |
64 } | |
65 | |
/* only reached on the resolvable path; both failure paths exit above */
66 return result; | |
67 } | |
68 | |
/*
 * Per-codepoint tables for the upper-, lower- and titlecase mappings.
 * They are allocated in main() and written by unicodedata_callback()
 * and specialcasing_callback().
 */
static struct properties *prop_upper = NULL;
static struct properties *prop_lower = NULL;
static struct properties *prop_title = NULL;

/*
 * One record per SpecialCasing.txt rule accepted by
 * specialcasing_callback(): a codepoint sequence and its length for
 * each of the three mappings.
 */
struct special_case {
	struct {
		uint_least32_t *cp;
		size_t cplen;
	} upper, lower, title;
};

/* array of special-case records, grown by specialcasing_callback() */
static struct special_case *sc = NULL;

/* number of records currently stored in sc */
static size_t sclen = 0;
79 | |
80 static int | |
81 unicodedata_callback(const char *file, char **field, size_t nfields, | |
82 char *comment, void *payload) | |
83 { | |
84 uint_least32_t cp, upper, lower, title; | |
85 | |
86 (void)file; | |
87 (void)comment; | |
88 (void)payload; | |
89 | |
90 hextocp(field[0], strlen(field[0]), &cp); | |
91 | |
92 upper = lower = title = cp; | |
93 | |
94 if ((strlen(field[12]) > 0 && | |
95 hextocp(field[12], strlen(field[12]), &upper)) || | |
96 (strlen(field[13]) > 0 && | |
97 hextocp(field[13], strlen(field[13]), &lower)) || | |
98 (nfields >= 15 && strlen(field[14]) > 0 && | |
99 hextocp(field[14], strlen(field[14]), &title))) { | |
100 return 1; | |
101 } | |
102 | |
103 prop_upper[cp].property = (int_least32_t)upper - (int_least32_t)… | |
104 prop_lower[cp].property = (int_least32_t)lower - (int_least32_t)… | |
105 prop_title[cp].property = (int_least32_t)title - (int_least32_t)… | |
106 | |
107 return 0; | |
108 } | |
109 | |
110 static int | |
111 specialcasing_callback(const char *file, char **field, size_t nfields, | |
112 char *comment, void *payload) | |
113 { | |
114 uint_least32_t cp; | |
115 | |
116 (void)file; | |
117 (void)comment; | |
118 (void)payload; | |
119 | |
120 if (nfields > 4 && strlen(field[4]) > 0) { | |
121 /* | |
122 * we have more than 4 fields, i.e. the rule has a | |
123 * condition (language-sensitive, etc.) and is discarded | |
124 */ | |
125 return 0; | |
126 } | |
127 | |
128 /* parse affected codepoint */ | |
129 hextocp(field[0], strlen(field[0]), &cp); | |
130 | |
131 /* extend special case array */ | |
132 if (!(sc = realloc(sc, (++sclen) * sizeof(*sc)))) { | |
133 fprintf(stderr, "realloc: %s\n", strerror(errno)); | |
134 exit(1); | |
135 } | |
136 | |
137 /* parse field data */ | |
138 parse_cp_list(field[3], &(sc[sclen - 1].upper.cp), | |
139 &(sc[sclen - 1].upper.cplen)); | |
140 parse_cp_list(field[1], &(sc[sclen - 1].lower.cp), | |
141 &(sc[sclen - 1].lower.cplen)); | |
142 parse_cp_list(field[2], &(sc[sclen - 1].title.cp), | |
143 &(sc[sclen - 1].title.cplen)); | |
144 | |
145 /* | |
146 * overwrite value in "single mapping" property table by the | |
147 * special value 0x110000 + (offset in special case array), | |
148 * even if the special case has length 1 | |
149 */ | |
150 prop_upper[cp].property = | |
151 (int_least64_t)(UINT32_C(0x110000) + (sclen - 1)); | |
152 prop_lower[cp].property = | |
153 (int_least64_t)(UINT32_C(0x110000) + (sclen - 1)); | |
154 prop_title[cp].property = | |
155 (int_least64_t)(UINT32_C(0x110000) + (sclen - 1)); | |
156 | |
157 return 0; | |
158 } | |
159 | |
160 static int_least64_t | |
161 get_value(const struct properties *prop, size_t offset) | |
162 { | |
163 return prop[offset].property; | |
164 } | |
165 | |
/*
 * Generator entry point.
 *
 * Generates the case property table from case_property, then builds
 * the per-codepoint upper-, lower- and titlecase tables from
 * data/UnicodeData.txt and data/SpecialCasing.txt, compresses them and
 * prints the lookup tables plus the upper_special, lower_special and
 * title_special arrays as C source on stdout.  The compression ratios
 * go to stderr.  argc is unused; argv[0] appears in the messages and
 * in the generated header comment.
 *
 * Returns 0; exits with status 1 if a table cannot be allocated.
 *
 * NOTE(review): several argument lists and one format string are cut
 * off ("…") in this copy of the file, so the code is left as found.
 */
166 int | |
167 main(int argc, char *argv[]) | |
168 { | |
169 struct properties_compressed comp_upper, comp_lower, comp_title; | |
170 struct properties_major_minor mm_upper, mm_lower, mm_title; | |
171 size_t i, j; | |
172 | |
173 (void)argc; | |
174 | |
175 /* generate case property table from the specification */ | |
176 properties_generate_break_property(case_property, LEN(case_prope… | |
177 NULL, handle_conflict, NULL, … | |
178 argv[0]); | |
179 | |
180 /* | |
181 * allocate property buffers for all 0x110000 codepoints | |
182 * | |
183 * the buffers contain the offset from the "base" character | |
184 * to the respective case mapping. By callocing we set all fields | |
185 * to zero, which is also the Unicode "default" in the sense that | |
186 * there is no case mapping by default (unless we fill it in) | |
187 */ | |
188 if (!(prop_upper = calloc(UINT32_C(0x110000), sizeof(*prop_upper… | |
189 !(prop_lower = calloc(UINT32_C(0x110000), sizeof(*prop_lower… | |
190 !(prop_title = calloc(UINT32_C(0x110000), sizeof(*prop_title… | |
191 fprintf(stderr, "calloc: %s\n", strerror(errno)); | |
192 exit(1); | |
193 } | |
/*
 * order matters: the simple mappings are stored first, then
 * specialcasing_callback() overwrites the entries of codepoints that
 * have an unconditional special-casing rule
 */
194 parse_file_with_callback("data/UnicodeData.txt", unicodedata_cal… | |
195 NULL); | |
196 parse_file_with_callback("data/SpecialCasing.txt", | |
197 specialcasing_callback, NULL); | |
198 | |
199 /* compress properties */ | |
200 properties_compress(prop_upper, &comp_upper); | |
201 properties_compress(prop_lower, &comp_lower); | |
202 properties_compress(prop_title, &comp_title); | |
203 | |
/*
 * the value returned by properties_get_major_minor() is printed as the
 * ratio; the mm_* structs are only read after these calls, so the
 * function presumably fills them in -- confirm in util.c
 */
204 fprintf(stderr, | |
205 "%s: LUT compression-ratios: upper=%.2f%%, lower=%.2f%%,… | |
206 "title=%.2f%%\n", | |
207 argv[0], properties_get_major_minor(&comp_upper, &mm_upp… | |
208 properties_get_major_minor(&comp_lower, &mm_lower), | |
209 properties_get_major_minor(&comp_title, &mm_title)); | |
210 | |
211 /* print tables */ | |
212 printf("/* Automatically generated by %s */\n#include " | |
213 "<stdint.h>\n#include <stddef.h>\n\n", | |
214 argv[0]); | |
215 | |
/*
 * the emitted struct special_case holds a single cp/cplen pair, unlike
 * the three-way struct of the same name used inside this generator
 */
216 printf("struct special_case {\n\tuint_least32_t *cp;\n\tsize_t " | |
217 "cplen;\n};\n\n"); | |
218 | |
219 properties_print_lookup_table("upper_major", mm_upper.major, 0x1… | |
220 printf("\n"); | |
221 properties_print_derived_lookup_table("upper_minor", mm_upper.mi… | |
222 mm_upper.minorlen, get_val… | |
223 comp_upper.data); | |
224 printf("\n"); | |
225 properties_print_lookup_table("lower_major", mm_lower.major, 0x1… | |
226 printf("\n"); | |
227 properties_print_derived_lookup_table("lower_minor", mm_lower.mi… | |
228 mm_lower.minorlen, get_val… | |
229 comp_lower.data); | |
230 printf("\n"); | |
231 properties_print_lookup_table("title_major", mm_title.major, 0x1… | |
232 printf("\n"); | |
233 properties_print_derived_lookup_table("title_minor", mm_title.mi… | |
234 mm_title.minorlen, get_val… | |
235 comp_title.data); | |
236 printf("\n"); | |
237 | |
/*
 * emit one array per mapping; every element lists the codepoints of
 * one sc record as a compound literal, followed by their count
 *
 * NOTE(review): the three loops below differ only in the member they
 * read (upper/lower/title) and could share one helper.
 * NOTE(review): %X expects unsigned int, but the argument is a
 * uint_least32_t -- consider PRIXLEAST32 from <inttypes.h>.
 */
238 printf("static const struct special_case upper_special[] = {\n"); | |
239 for (i = 0; i < sclen; i++) { | |
240 printf("\t{\n"); | |
241 | |
242 printf("\t\t.cp = (uint_least32_t[]){"); | |
243 for (j = 0; j < sc[i].upper.cplen; j++) { | |
244 printf(" UINT32_C(0x%06X)", sc[i].upper.cp[j]); | |
/* comma between elements, none after the last one */
245 if (j + 1 < sc[i].upper.cplen) { | |
246 putchar(','); | |
247 } | |
248 } | |
249 printf(" },\n"); | |
250 printf("\t\t.cplen = %zu,\n", sc[i].upper.cplen); | |
251 printf("\t},\n"); | |
252 } | |
253 printf("};\n\n"); | |
254 | |
255 printf("static const struct special_case lower_special[] = {\n"); | |
256 for (i = 0; i < sclen; i++) { | |
257 printf("\t{\n"); | |
258 | |
259 printf("\t\t.cp = (uint_least32_t[]){"); | |
260 for (j = 0; j < sc[i].lower.cplen; j++) { | |
261 printf(" UINT32_C(0x%06X)", sc[i].lower.cp[j]); | |
262 if (j + 1 < sc[i].lower.cplen) { | |
263 putchar(','); | |
264 } | |
265 } | |
266 printf(" },\n"); | |
267 printf("\t\t.cplen = %zu,\n", sc[i].lower.cplen); | |
268 printf("\t},\n"); | |
269 } | |
270 printf("};\n\n"); | |
271 | |
272 printf("static const struct special_case title_special[] = {\n"); | |
273 for (i = 0; i < sclen; i++) { | |
274 printf("\t{\n"); | |
275 | |
276 printf("\t\t.cp = (uint_least32_t[]){"); | |
277 for (j = 0; j < sc[i].title.cplen; j++) { | |
278 printf(" UINT32_C(0x%06X)", sc[i].title.cp[j]); | |
279 if (j + 1 < sc[i].title.cplen) { | |
280 putchar(','); | |
281 } | |
282 } | |
283 printf(" },\n"); | |
284 printf("\t\t.cplen = %zu,\n", sc[i].title.cplen); | |
285 printf("\t},\n"); | |
286 } | |
287 printf("};\n\n"); | |
288 | |
/*
 * release the compressed tables and the major/minor lookup tables
 *
 * NOTE(review): prop_upper, prop_lower, prop_title, sc and the
 * codepoint lists inside sc are not freed here; the process ends
 * right after, so this only matters to leak checkers.
 */
289 free(comp_lower.data); | |
290 free(comp_lower.offset); | |
291 free(comp_title.data); | |
292 free(comp_title.offset); | |
293 free(comp_upper.data); | |
294 free(comp_upper.offset); | |
295 free(mm_lower.major); | |
296 free(mm_lower.minor); | |
297 free(mm_title.major); | |
298 free(mm_title.minor); | |
299 free(mm_upper.major); | |
300 free(mm_upper.minor); | |
301 | |
302 return 0; | |
303 } |