| case.c - libgrapheme - unicode string library | |
| git clone git://git.suckless.org/libgrapheme | |
| Log | |
| Files | |
| Refs | |
| README | |
| LICENSE | |
| --- | |
| case.c (8442B) | |
| --- | |
| 1 /* See LICENSE file for copyright and license details. */ | |
| 2 #include <errno.h> | |
| 3 #include <stdint.h> | |
| 4 #include <stdio.h> | |
| 5 #include <stdlib.h> | |
| 6 #include <string.h> | |
| 7 | |
| 8 #include "util.h" | |
| 9 | |
| 10 #define FILE_DCP "data/DerivedCoreProperties.txt" | |
| 11 | |
| 12 static const struct property_spec case_property[] = { | |
| 13 { | |
| 14 .enumname = "OTHER", | |
| 15 .file = NULL, | |
| 16 .ucdname = NULL, | |
| 17 }, | |
| 18 { | |
| 19 .enumname = "BOTH_CASED_CASE_IGNORABLE", | |
| 20 .file = NULL, | |
| 21 .ucdname = NULL, | |
| 22 }, | |
| 23 { | |
| 24 .enumname = "CASED", | |
| 25 .file = FILE_DCP, | |
| 26 .ucdname = "Cased", | |
| 27 }, | |
| 28 { | |
| 29 .enumname = "CASE_IGNORABLE", | |
| 30 .file = FILE_DCP, | |
| 31 .ucdname = "Case_Ignorable", | |
| 32 }, | |
| 33 { | |
| 34 .enumname = "UNCASED", | |
| 35 .file = FILE_DCP, | |
| 36 .ucdname = "Uncased", | |
| 37 }, | |
| 38 }; | |
| 39 | |
/*
 * Resolve a conflict between two property indices (both index into
 * case_property) that were assigned to the same codepoint.
 *
 * cp is the affected codepoint (unused), prop1 is the first property
 * index and the last parameter (referred to as prop2 in the body; its
 * declaration is cut off in this view) is the second one.
 *
 * If one index names "CASED" and the other "CASE_IGNORABLE" (in either
 * order), the index of the "BOTH_CASED_CASE_IGNORABLE" entry is
 * returned. Any other combination, or a missing
 * "BOTH_CASED_CASE_IGNORABLE" entry, prints a message to stderr and
 * terminates the program with exit status 1.
 */
| 40 static uint_least8_t | |
| 41 handle_conflict(uint_least32_t cp, uint_least8_t prop1, uint_least8_t pr… | |
| 42 { | |
| 43 uint_least8_t result; | |
| 44 | |
/* the codepoint itself does not influence the decision */
| 45 (void)cp; | |
| 46 | |
/* properties are compared by enum name, not by their numeric index */
| 47 if ((!strcmp(case_property[prop1].enumname, "CASED") && | |
| 48 !strcmp(case_property[prop2].enumname, "CASE_IGNORABLE")) || | |
| 49 (!strcmp(case_property[prop1].enumname, "CASE_IGNORABLE") && | |
| 50 !strcmp(case_property[prop2].enumname, "CASED"))) { | |
/* look up the index of the combined property by name */
| 51 for (result = 0; result < LEN(case_property); result++) { | |
| 52 if (!strcmp(case_property[result].enumname, | |
| 53 "BOTH_CASED_CASE_IGNORABLE")) { | |
| 54 break; | |
| 55 } | |
| 56 } | |
/* the loop ran to the end: the combined property is not in the table */
| 57 if (result == LEN(case_property)) { | |
| 58 fprintf(stderr, "handle_conflict: Internal error… | |
| 59 exit(1); | |
| 60 } | |
| 61 } else { | |
/* every other pair of conflicting properties is fatal */
| 62 fprintf(stderr, "handle_conflict: Cannot handle conflict… | |
| 63 exit(1); | |
| 64 } | |
| 65 | |
| 66 return result; | |
| 67 } | |
| 68 | |
/*
 * Per-codepoint tables for the upper-, lower- and titlecase mappings.
 * They are allocated in main() and filled by the parser callbacks.
 */
static struct properties *prop_upper = NULL;
static struct properties *prop_lower = NULL;
static struct properties *prop_title = NULL;

/*
 * One special-casing rule: for each of the three cases a list of
 * codepoints (cp) together with its length (cplen).
 */
struct special_case {
	struct {
		uint_least32_t *cp;
		size_t cplen;
	} upper, lower, title;
};

/* growing array of special-casing rules and its number of entries */
static struct special_case *sc = NULL;
static size_t sclen = 0;
| 79 | |
| 80 static int | |
| 81 unicodedata_callback(const char *file, char **field, size_t nfields, | |
| 82 char *comment, void *payload) | |
| 83 { | |
| 84 uint_least32_t cp, upper, lower, title; | |
| 85 | |
| 86 (void)file; | |
| 87 (void)comment; | |
| 88 (void)payload; | |
| 89 | |
| 90 hextocp(field[0], strlen(field[0]), &cp); | |
| 91 | |
| 92 upper = lower = title = cp; | |
| 93 | |
| 94 if ((strlen(field[12]) > 0 && | |
| 95 hextocp(field[12], strlen(field[12]), &upper)) || | |
| 96 (strlen(field[13]) > 0 && | |
| 97 hextocp(field[13], strlen(field[13]), &lower)) || | |
| 98 (nfields >= 15 && strlen(field[14]) > 0 && | |
| 99 hextocp(field[14], strlen(field[14]), &title))) { | |
| 100 return 1; | |
| 101 } | |
| 102 | |
| 103 prop_upper[cp].property = (int_least32_t)upper - (int_least32_t)… | |
| 104 prop_lower[cp].property = (int_least32_t)lower - (int_least32_t)… | |
| 105 prop_title[cp].property = (int_least32_t)title - (int_least32_t)… | |
| 106 | |
| 107 return 0; | |
| 108 } | |
| 109 | |
| 110 static int | |
| 111 specialcasing_callback(const char *file, char **field, size_t nfields, | |
| 112 char *comment, void *payload) | |
| 113 { | |
| 114 uint_least32_t cp; | |
| 115 | |
| 116 (void)file; | |
| 117 (void)comment; | |
| 118 (void)payload; | |
| 119 | |
| 120 if (nfields > 4 && strlen(field[4]) > 0) { | |
| 121 /* | |
| 122 * we have more than 4 fields, i.e. the rule has a | |
| 123 * condition (language-sensitive, etc.) and is discarded | |
| 124 */ | |
| 125 return 0; | |
| 126 } | |
| 127 | |
| 128 /* parse affected codepoint */ | |
| 129 hextocp(field[0], strlen(field[0]), &cp); | |
| 130 | |
| 131 /* extend special case array */ | |
| 132 if (!(sc = realloc(sc, (++sclen) * sizeof(*sc)))) { | |
| 133 fprintf(stderr, "realloc: %s\n", strerror(errno)); | |
| 134 exit(1); | |
| 135 } | |
| 136 | |
| 137 /* parse field data */ | |
| 138 parse_cp_list(field[3], &(sc[sclen - 1].upper.cp), | |
| 139 &(sc[sclen - 1].upper.cplen)); | |
| 140 parse_cp_list(field[1], &(sc[sclen - 1].lower.cp), | |
| 141 &(sc[sclen - 1].lower.cplen)); | |
| 142 parse_cp_list(field[2], &(sc[sclen - 1].title.cp), | |
| 143 &(sc[sclen - 1].title.cplen)); | |
| 144 | |
| 145 /* | |
| 146 * overwrite value in "single mapping" property table by the | |
| 147 * special value 0x110000 + (offset in special case array), | |
| 148 * even if the special case has length 1 | |
| 149 */ | |
| 150 prop_upper[cp].property = | |
| 151 (int_least64_t)(UINT32_C(0x110000) + (sclen - 1)); | |
| 152 prop_lower[cp].property = | |
| 153 (int_least64_t)(UINT32_C(0x110000) + (sclen - 1)); | |
| 154 prop_title[cp].property = | |
| 155 (int_least64_t)(UINT32_C(0x110000) + (sclen - 1)); | |
| 156 | |
| 157 return 0; | |
| 158 } | |
| 159 | |
| 160 static int_least64_t | |
| 161 get_value(const struct properties *prop, size_t offset) | |
| 162 { | |
| 163 return prop[offset].property; | |
| 164 } | |
| 165 | |
/*
 * Generator entry point.
 *
 * Emits the case property table, then builds the upper-, lower- and
 * titlecase mapping tables from data/UnicodeData.txt and
 * data/SpecialCasing.txt, compresses them and prints the resulting
 * lookup tables and the three special-case arrays as C source to
 * stdout. The compression ratios are reported on stderr.
 *
 * argc is unused; argv[0] is used in the diagnostic output and in the
 * header comment of the generated source.
 *
 * Returns 0; allocation failures terminate the program with status 1.
 *
 * NOTE(review): several lines of this function are cut off in this
 * view (marked by an ellipsis), so the full argument lists cannot be
 * verified here.
 */
| 166 int | |
| 167 main(int argc, char *argv[]) | |
| 168 { | |
| 169 struct properties_compressed comp_upper, comp_lower, comp_title; | |
| 170 struct properties_major_minor mm_upper, mm_lower, mm_title; | |
| 171 size_t i, j; | |
| 172 | |
| 173 (void)argc; | |
| 174 | |
| 175 /* generate case property table from the specification */ | |
| 176 properties_generate_break_property(case_property, LEN(case_prope… | |
| 177 NULL, handle_conflict, NULL, … | |
| 178 argv[0]); | |
| 179 | |
| 180 /* | |
| 181 * allocate property buffers for all 0x110000 codepoints | |
| 182 * | |
| 183 * the buffers contain the offset from the "base" character | |
| 184 * to the respective case mapping. By callocing we set all fields | |
| 185 * to zero, which is also the Unicode "default" in the sense that | |
| 186 * there is no case mapping by default (unless we fill it in) | |
| 187 */ | |
| 188 if (!(prop_upper = calloc(UINT32_C(0x110000), sizeof(*prop_upper… | |
| 189 !(prop_lower = calloc(UINT32_C(0x110000), sizeof(*prop_lower… | |
| 190 !(prop_title = calloc(UINT32_C(0x110000), sizeof(*prop_title… | |
| 191 fprintf(stderr, "calloc: %s\n", strerror(errno)); | |
| 192 exit(1); | |
| 193 } | |
/*
 * fill the buffers: the special-casing file is parsed second, and its
 * callback overwrites the table entries of the codepoints it handles
 */
| 194 parse_file_with_callback("data/UnicodeData.txt", unicodedata_cal… | |
| 195 NULL); | |
| 196 parse_file_with_callback("data/SpecialCasing.txt", | |
| 197 specialcasing_callback, NULL); | |
| 198 | |
| 199 /* compress properties */ | |
| 200 properties_compress(prop_upper, &comp_upper); | |
| 201 properties_compress(prop_lower, &comp_lower); | |
| 202 properties_compress(prop_title, &comp_title); | |
| 203 | |
/*
 * properties_get_major_minor() is called inside the argument list, so
 * this diagnostic also fills mm_upper, mm_lower and mm_title
 */
| 204 fprintf(stderr, | |
| 205 "%s: LUT compression-ratios: upper=%.2f%%, lower=%.2f%%,… | |
| 206 "title=%.2f%%\n", | |
| 207 argv[0], properties_get_major_minor(&comp_upper, &mm_upp… | |
| 208 properties_get_major_minor(&comp_lower, &mm_lower), | |
| 209 properties_get_major_minor(&comp_title, &mm_title)); | |
| 210 | |
| 211 /* print tables */ | |
| 212 printf("/* Automatically generated by %s */\n#include " | |
| 213 "<stdint.h>\n#include <stddef.h>\n\n", | |
| 214 argv[0]); | |
| 215 | |
/* type of the entries of the three *_special arrays printed below */
| 216 printf("struct special_case {\n\tuint_least32_t *cp;\n\tsize_t " | |
| 217 "cplen;\n};\n\n"); | |
| 218 | |
/* one major and one minor lookup table per case */
| 219 properties_print_lookup_table("upper_major", mm_upper.major, 0x1… | |
| 220 printf("\n"); | |
| 221 properties_print_derived_lookup_table("upper_minor", mm_upper.mi… | |
| 222 mm_upper.minorlen, get_val… | |
| 223 comp_upper.data); | |
| 224 printf("\n"); | |
| 225 properties_print_lookup_table("lower_major", mm_lower.major, 0x1… | |
| 226 printf("\n"); | |
| 227 properties_print_derived_lookup_table("lower_minor", mm_lower.mi… | |
| 228 mm_lower.minorlen, get_val… | |
| 229 comp_lower.data); | |
| 230 printf("\n"); | |
| 231 properties_print_lookup_table("title_major", mm_title.major, 0x1… | |
| 232 printf("\n"); | |
| 233 properties_print_derived_lookup_table("title_minor", mm_title.mi… | |
| 234 mm_title.minorlen, get_val… | |
| 235 comp_title.data); | |
| 236 printf("\n"); | |
| 237 | |
/*
 * print the special-case arrays; all three have sclen entries, so the
 * same index is valid for upper, lower and title
 *
 * NOTE(review): the three loops below differ only in the struct member
 * and the array name and could share one helper.
 * NOTE(review): "%06X" is used with a uint_least32_t argument, which
 * assumes that type is unsigned int -- confirm or use <inttypes.h>.
 */
| 238 printf("static const struct special_case upper_special[] = {\n"); | |
| 239 for (i = 0; i < sclen; i++) { | |
| 240 printf("\t{\n"); | |
| 241 | |
| 242 printf("\t\t.cp = (uint_least32_t[]){"); | |
| 243 for (j = 0; j < sc[i].upper.cplen; j++) { | |
| 244 printf(" UINT32_C(0x%06X)", sc[i].upper.cp[j]); | |
/* separate the elements by commas, none after the last one */
| 245 if (j + 1 < sc[i].upper.cplen) { | |
| 246 putchar(','); | |
| 247 } | |
| 248 } | |
| 249 printf(" },\n"); | |
| 250 printf("\t\t.cplen = %zu,\n", sc[i].upper.cplen); | |
| 251 printf("\t},\n"); | |
| 252 } | |
| 253 printf("};\n\n"); | |
| 254 | |
| 255 printf("static const struct special_case lower_special[] = {\n"); | |
| 256 for (i = 0; i < sclen; i++) { | |
| 257 printf("\t{\n"); | |
| 258 | |
| 259 printf("\t\t.cp = (uint_least32_t[]){"); | |
| 260 for (j = 0; j < sc[i].lower.cplen; j++) { | |
| 261 printf(" UINT32_C(0x%06X)", sc[i].lower.cp[j]); | |
| 262 if (j + 1 < sc[i].lower.cplen) { | |
| 263 putchar(','); | |
| 264 } | |
| 265 } | |
| 266 printf(" },\n"); | |
| 267 printf("\t\t.cplen = %zu,\n", sc[i].lower.cplen); | |
| 268 printf("\t},\n"); | |
| 269 } | |
| 270 printf("};\n\n"); | |
| 271 | |
| 272 printf("static const struct special_case title_special[] = {\n"); | |
| 273 for (i = 0; i < sclen; i++) { | |
| 274 printf("\t{\n"); | |
| 275 | |
| 276 printf("\t\t.cp = (uint_least32_t[]){"); | |
| 277 for (j = 0; j < sc[i].title.cplen; j++) { | |
| 278 printf(" UINT32_C(0x%06X)", sc[i].title.cp[j]); | |
| 279 if (j + 1 < sc[i].title.cplen) { | |
| 280 putchar(','); | |
| 281 } | |
| 282 } | |
| 283 printf(" },\n"); | |
| 284 printf("\t\t.cplen = %zu,\n", sc[i].title.cplen); | |
| 285 printf("\t},\n"); | |
| 286 } | |
| 287 printf("};\n\n"); | |
| 288 | |
/*
 * release the compressed tables and the major/minor tables
 *
 * NOTE(review): prop_upper, prop_lower, prop_title and sc are not
 * freed in this function before it returns.
 */
| 289 free(comp_lower.data); | |
| 290 free(comp_lower.offset); | |
| 291 free(comp_title.data); | |
| 292 free(comp_title.offset); | |
| 293 free(comp_upper.data); | |
| 294 free(comp_upper.offset); | |
| 295 free(mm_lower.major); | |
| 296 free(mm_lower.minor); | |
| 297 free(mm_title.major); | |
| 298 free(mm_title.minor); | |
| 299 free(mm_upper.major); | |
| 300 free(mm_upper.minor); | |
| 301 | |
| 302 return 0; | |
| 303 } | |