utf8.c - libgrapheme - unicode string library | |
git clone git://git.suckless.org/libgrapheme | |
Log | |
Files | |
Refs | |
README | |
LICENSE | |
--- | |
utf8.c (6131B) | |
--- | |
1 /* See LICENSE file for copyright and license details. */ | |
2 #include <stddef.h> | |
3 #include <stdint.h> | |
4 | |
5 #include "../grapheme.h" | |
6 #include "util.h" | |
7 | |
8 #define BETWEEN(c, l, u) ((c) >= (l) && (c) <= (u)) | |
9 | |
10 /* lookup-table for the types of sequence first bytes */ | |
11 static const struct { | |
12 uint_least8_t lower; /* lower bound of sequence first byte */ | |
13 uint_least8_t upper; /* upper bound of sequence first byte */ | |
14 uint_least32_t mincp; /* smallest non-overlong encoded codepoint… | |
15 uint_least32_t maxcp; /* largest encodable codepoint */ | |
16 /* | |
17 * implicit: table-offset represents the n… | |
18 * bytes of the form 10xxxxxx (6 bits capa… | |
19 */ | |
20 } lut[] = { | |
21 [0] = { | |
22 /* 0xxxxxxx */ | |
23 .lower = 0x00, /* 00000000 */ | |
24 .upper = 0x7F, /* 01111111 */ | |
25 .mincp = (uint_least32_t)0, | |
26 .maxcp = ((uint_least32_t)1 << 7) - 1, /* 7 bits capacit… | |
27 }, | |
28 [1] = { | |
29 /* 110xxxxx */ | |
30 .lower = 0xC0, /* 11000000 */ | |
31 .upper = 0xDF, /* 11011111 */ | |
32 .mincp = (uint_least32_t)1 << 7, | |
33 .maxcp = ((uint_least32_t)1 << 11) - 1, /* 5+6=11 bits c… | |
34 }, | |
35 [2] = { | |
36 /* 1110xxxx */ | |
37 .lower = 0xE0, /* 11100000 */ | |
38 .upper = 0xEF, /* 11101111 */ | |
39 .mincp = (uint_least32_t)1 << 11, | |
40 .maxcp = ((uint_least32_t)1 << 16) - 1, /* 4+6+6=16 bits… | |
41 }, | |
42 [3] = { | |
43 /* 11110xxx */ | |
44 .lower = 0xF0, /* 11110000 */ | |
45 .upper = 0xF7, /* 11110111 */ | |
46 .mincp = (uint_least32_t)1 << 16, | |
47 .maxcp = ((uint_least32_t)1 << 21) - 1, /* 3+6+6+6=21 bi… | |
48 }, | |
49 }; | |
50 | |
51 size_t | |
52 grapheme_decode_utf8(const char *str, size_t len, uint_least32_t *cp) | |
53 { | |
54 size_t off, i; | |
55 uint_least32_t tmp; | |
56 | |
57 if (cp == NULL) { | |
58 /* | |
59 * instead of checking every time if cp is NULL within | |
60 * the decoder, simply point it at a dummy variable here. | |
61 */ | |
62 cp = &tmp; | |
63 } | |
64 | |
65 if (str == NULL || len == 0) { | |
66 /* a sequence must be at least 1 byte long */ | |
67 *cp = GRAPHEME_INVALID_CODEPOINT; | |
68 return 0; | |
69 } | |
70 | |
71 /* identify sequence type with the first byte */ | |
72 for (off = 0; off < LEN(lut); off++) { | |
73 if (BETWEEN(((const unsigned char *)str)[0], lut[off].lo… | |
74 lut[off].upper)) { | |
75 /* | |
76 * first byte is within the bounds; fill | |
77 * p with the the first bits contained in | |
78 * the first byte (by subtracting the high bits) | |
79 */ | |
80 *cp = ((const unsigned char *)str)[0] - lut[off]… | |
81 break; | |
82 } | |
83 } | |
84 if (off == LEN(lut)) { | |
85 /* | |
86 * first byte does not match a sequence type; | |
87 * set cp as invalid and return 1 byte processed | |
88 * | |
89 * this also includes the cases where bits higher than | |
90 * the 8th are set on systems with CHAR_BIT > 8 | |
91 */ | |
92 *cp = GRAPHEME_INVALID_CODEPOINT; | |
93 return 1; | |
94 } | |
95 if (1 + off > len) { | |
96 /* | |
97 * input is not long enough, set cp as invalid | |
98 */ | |
99 *cp = GRAPHEME_INVALID_CODEPOINT; | |
100 | |
101 /* | |
102 * count the following continuation bytes, but nothing | |
103 * else in case we have a "rogue" case where e.g. such a | |
104 * sequence starter occurs right before a NUL-byte. | |
105 */ | |
106 for (i = 0; 1 + i < len; i++) { | |
107 if (!BETWEEN(((const unsigned char *)str)[1 + i]… | |
108 0xBF)) { | |
109 break; | |
110 } | |
111 } | |
112 | |
113 /* | |
114 * if the continuation bytes do not continue until | |
115 * the end, return the incomplete sequence length. | |
116 * Otherwise return the number of bytes we actually | |
117 * expected, which is larger than n. | |
118 */ | |
119 return ((1 + i) < len) ? (1 + i) : (1 + off); | |
120 } | |
121 | |
122 /* | |
123 * process 'off' following bytes, each of the form 10xxxxxx | |
124 * (i.e. between 0x80 (10000000) and 0xBF (10111111)) | |
125 */ | |
126 for (i = 1; i <= off; i++) { | |
127 if (!BETWEEN(((const unsigned char *)str)[i], 0x80, 0xBF… | |
128 /* | |
129 * byte does not match format; return | |
130 * number of bytes processed excluding the | |
131 * unexpected character as recommended since | |
132 * Unicode 6 (chapter 3) | |
133 * | |
134 * this also includes the cases where bits | |
135 * higher than the 8th are set on systems | |
136 * with CHAR_BIT > 8 | |
137 */ | |
138 *cp = GRAPHEME_INVALID_CODEPOINT; | |
139 return 1 + (i - 1); | |
140 } | |
141 /* | |
142 * shift codepoint by 6 bits and add the 6 stored bits | |
143 * in s[i] to it using the bitmask 0x3F (00111111) | |
144 */ | |
145 *cp = (*cp << 6) | (((const unsigned char *)str)[i] & 0x… | |
146 } | |
147 | |
148 if (*cp < lut[off].mincp || | |
149 BETWEEN(*cp, UINT32_C(0xD800), UINT32_C(0xDFFF)) || | |
150 *cp > UINT32_C(0x10FFFF)) { | |
151 /* | |
152 * codepoint is overlong encoded in the sequence, is a | |
153 * high or low UTF-16 surrogate half (0xD800..0xDFFF) or | |
154 * not representable in UTF-16 (>0x10FFFF) (RFC-3629 | |
155 * specifies the latter two conditions) | |
156 */ | |
157 *cp = GRAPHEME_INVALID_CODEPOINT; | |
158 } | |
159 | |
160 return 1 + off; | |
161 } | |
162 | |
163 size_t | |
164 grapheme_encode_utf8(uint_least32_t cp, char *str, size_t len) | |
165 { | |
166 size_t off, i; | |
167 | |
168 if (BETWEEN(cp, UINT32_C(0xD800), UINT32_C(0xDFFF)) || | |
169 cp > UINT32_C(0x10FFFF)) { | |
170 /* | |
171 * codepoint is a high or low UTF-16 surrogate half | |
172 * (0xD800..0xDFFF) or not representable in UTF-16 | |
173 * (>0x10FFFF), which RFC-3629 deems invalid for UTF-8. | |
174 */ | |
175 cp = GRAPHEME_INVALID_CODEPOINT; | |
176 } | |
177 | |
178 /* determine necessary sequence type */ | |
179 for (off = 0; off < LEN(lut); off++) { | |
180 if (cp <= lut[off].maxcp) { | |
181 break; | |
182 } | |
183 } | |
184 if (1 + off > len || str == NULL || len == 0) { | |
185 /* | |
186 * specified buffer is too small to store sequence or | |
187 * the caller just wanted to know how many bytes the | |
188 * codepoint needs by passing a NULL-buffer. | |
189 */ | |
190 return 1 + off; | |
191 } | |
192 | |
193 /* build sequence by filling cp-bits into each byte */ | |
194 | |
195 /* | |
196 * lut[off].lower is the bit-format for the first byte and | |
197 * the bits to fill into it are determined by shifting the | |
198 * cp 6 times the number of following bytes, as each | |
199 * following byte stores 6 bits, yielding the wanted bits. | |
200 * | |
201 * We do not overwrite the mask because we guaranteed earlier | |
202 * that there are no bits higher than the mask allows. | |
203 */ | |
204 ((unsigned char *)str)[0] = | |
205 lut[off].lower | (uint_least8_t)(cp >> (6 * off)); | |
206 | |
207 for (i = 1; i <= off; i++) { | |
208 /* | |
209 * the bit-format for following bytes is 10000000 (0x80) | |
210 * and it each stores 6 bits in the 6 low bits that we | |
211 * extract from the properly-shifted value using the | |
212 * mask 00111111 (0x3F) | |
213 */ | |
214 ((unsigned char *)str)[i] = | |
215 0x80 | ((cp >> (6 * (off - i))) & 0x3F); | |
216 } | |
217 | |
218 return 1 + off; | |
219 } |