Introduction
Introduction Statistics Contact Development Disclaimer Help
utf8.c - libgrapheme - unicode string library
git clone git://git.suckless.org/libgrapheme
Log
Files
Refs
README
LICENSE
---
utf8.c (6131B)
---
1 /* See LICENSE file for copyright and license details. */
2 #include <stddef.h>
3 #include <stdint.h>
4
5 #include "../grapheme.h"
6 #include "util.h"
7
/*
 * Evaluates to non-zero iff l <= c <= u (both bounds inclusive).
 * The argument c is expanded twice, so it must be free of side effects.
 */
#define BETWEEN(c, l, u) ((c) >= (l) && (c) <= (u))
9
10 /* lookup-table for the types of sequence first bytes */
/*
 * lookup-table for the types of sequence first bytes
 *
 * The index of an entry is the number of continuation bytes
 * (10xxxxxx, 6 payload bits each) that follow the first byte.
 */
static const struct {
	uint_least8_t lower;  /* lower bound of sequence first byte */
	uint_least8_t upper;  /* upper bound of sequence first byte */
	uint_least32_t mincp; /* smallest non-overlong encoded codepoint */
	uint_least32_t maxcp; /* largest encodable codepoint */
} lut[] = {
	[0] = {
		/* 0xxxxxxx */
		.lower = 0x00, /* 00000000 */
		.upper = 0x7F, /* 01111111 */
		.mincp = (uint_least32_t)0,
		.maxcp = ((uint_least32_t)1 << 7) - 1, /* 7 bits */
	},
	[1] = {
		/* 110xxxxx */
		.lower = 0xC0, /* 11000000 */
		.upper = 0xDF, /* 11011111 */
		.mincp = (uint_least32_t)1 << 7,
		.maxcp = ((uint_least32_t)1 << 11) - 1, /* 5+6=11 bits */
	},
	[2] = {
		/* 1110xxxx */
		.lower = 0xE0, /* 11100000 */
		.upper = 0xEF, /* 11101111 */
		.mincp = (uint_least32_t)1 << 11,
		.maxcp = ((uint_least32_t)1 << 16) - 1, /* 4+6+6=16 bits */
	},
	[3] = {
		/* 11110xxx */
		.lower = 0xF0, /* 11110000 */
		.upper = 0xF7, /* 11110111 */
		.mincp = (uint_least32_t)1 << 16,
		.maxcp = ((uint_least32_t)1 << 21) - 1, /* 3+6+6+6=21 bits */
	},
};
50
51 size_t
52 grapheme_decode_utf8(const char *str, size_t len, uint_least32_t *cp)
53 {
54 size_t off, i;
55 uint_least32_t tmp;
56
57 if (cp == NULL) {
58 /*
59 * instead of checking every time if cp is NULL within
60 * the decoder, simply point it at a dummy variable here.
61 */
62 cp = &tmp;
63 }
64
65 if (str == NULL || len == 0) {
66 /* a sequence must be at least 1 byte long */
67 *cp = GRAPHEME_INVALID_CODEPOINT;
68 return 0;
69 }
70
71 /* identify sequence type with the first byte */
72 for (off = 0; off < LEN(lut); off++) {
73 if (BETWEEN(((const unsigned char *)str)[0], lut[off].lo…
74 lut[off].upper)) {
75 /*
76 * first byte is within the bounds; fill
77 * p with the the first bits contained in
78 * the first byte (by subtracting the high bits)
79 */
80 *cp = ((const unsigned char *)str)[0] - lut[off]…
81 break;
82 }
83 }
84 if (off == LEN(lut)) {
85 /*
86 * first byte does not match a sequence type;
87 * set cp as invalid and return 1 byte processed
88 *
89 * this also includes the cases where bits higher than
90 * the 8th are set on systems with CHAR_BIT > 8
91 */
92 *cp = GRAPHEME_INVALID_CODEPOINT;
93 return 1;
94 }
95 if (1 + off > len) {
96 /*
97 * input is not long enough, set cp as invalid
98 */
99 *cp = GRAPHEME_INVALID_CODEPOINT;
100
101 /*
102 * count the following continuation bytes, but nothing
103 * else in case we have a "rogue" case where e.g. such a
104 * sequence starter occurs right before a NUL-byte.
105 */
106 for (i = 0; 1 + i < len; i++) {
107 if (!BETWEEN(((const unsigned char *)str)[1 + i]…
108 0xBF)) {
109 break;
110 }
111 }
112
113 /*
114 * if the continuation bytes do not continue until
115 * the end, return the incomplete sequence length.
116 * Otherwise return the number of bytes we actually
117 * expected, which is larger than n.
118 */
119 return ((1 + i) < len) ? (1 + i) : (1 + off);
120 }
121
122 /*
123 * process 'off' following bytes, each of the form 10xxxxxx
124 * (i.e. between 0x80 (10000000) and 0xBF (10111111))
125 */
126 for (i = 1; i <= off; i++) {
127 if (!BETWEEN(((const unsigned char *)str)[i], 0x80, 0xBF…
128 /*
129 * byte does not match format; return
130 * number of bytes processed excluding the
131 * unexpected character as recommended since
132 * Unicode 6 (chapter 3)
133 *
134 * this also includes the cases where bits
135 * higher than the 8th are set on systems
136 * with CHAR_BIT > 8
137 */
138 *cp = GRAPHEME_INVALID_CODEPOINT;
139 return 1 + (i - 1);
140 }
141 /*
142 * shift codepoint by 6 bits and add the 6 stored bits
143 * in s[i] to it using the bitmask 0x3F (00111111)
144 */
145 *cp = (*cp << 6) | (((const unsigned char *)str)[i] & 0x…
146 }
147
148 if (*cp < lut[off].mincp ||
149 BETWEEN(*cp, UINT32_C(0xD800), UINT32_C(0xDFFF)) ||
150 *cp > UINT32_C(0x10FFFF)) {
151 /*
152 * codepoint is overlong encoded in the sequence, is a
153 * high or low UTF-16 surrogate half (0xD800..0xDFFF) or
154 * not representable in UTF-16 (>0x10FFFF) (RFC-3629
155 * specifies the latter two conditions)
156 */
157 *cp = GRAPHEME_INVALID_CODEPOINT;
158 }
159
160 return 1 + off;
161 }
162
163 size_t
164 grapheme_encode_utf8(uint_least32_t cp, char *str, size_t len)
165 {
166 size_t off, i;
167
168 if (BETWEEN(cp, UINT32_C(0xD800), UINT32_C(0xDFFF)) ||
169 cp > UINT32_C(0x10FFFF)) {
170 /*
171 * codepoint is a high or low UTF-16 surrogate half
172 * (0xD800..0xDFFF) or not representable in UTF-16
173 * (>0x10FFFF), which RFC-3629 deems invalid for UTF-8.
174 */
175 cp = GRAPHEME_INVALID_CODEPOINT;
176 }
177
178 /* determine necessary sequence type */
179 for (off = 0; off < LEN(lut); off++) {
180 if (cp <= lut[off].maxcp) {
181 break;
182 }
183 }
184 if (1 + off > len || str == NULL || len == 0) {
185 /*
186 * specified buffer is too small to store sequence or
187 * the caller just wanted to know how many bytes the
188 * codepoint needs by passing a NULL-buffer.
189 */
190 return 1 + off;
191 }
192
193 /* build sequence by filling cp-bits into each byte */
194
195 /*
196 * lut[off].lower is the bit-format for the first byte and
197 * the bits to fill into it are determined by shifting the
198 * cp 6 times the number of following bytes, as each
199 * following byte stores 6 bits, yielding the wanted bits.
200 *
201 * We do not overwrite the mask because we guaranteed earlier
202 * that there are no bits higher than the mask allows.
203 */
204 ((unsigned char *)str)[0] =
205 lut[off].lower | (uint_least8_t)(cp >> (6 * off));
206
207 for (i = 1; i <= off; i++) {
208 /*
209 * the bit-format for following bytes is 10000000 (0x80)
210 * and it each stores 6 bits in the 6 low bits that we
211 * extract from the properly-shifted value using the
212 * mask 00111111 (0x3F)
213 */
214 ((unsigned char *)str)[i] =
215 0x80 | ((cp >> (6 * (off - i))) & 0x3F);
216 }
217
218 return 1 + off;
219 }
You are viewing proxied material from suckless.org. The copyright of proxied material belongs to its original authors. Any comments or complaints in relation to proxied material should be directed to the original authors of the content concerned. Please see the disclaimer for more details.