GopherProxy

	utf8.c - libgrapheme - unicode string library
	git clone git://git.suckless.org/libgrapheme
	Log
	Files
	Refs
	README
	LICENSE
	---
	utf8.c (6131B)
	---
	1 /* See LICENSE file for copyright and license details. */
	2 #include <stddef.h>
	3 #include <stdint.h>
	4
	5 #include "../grapheme.h"
	6 #include "util.h"
	7
	/*
	 * BETWEEN(c, l, u) is nonzero iff l <= c <= u (both bounds inclusive).
	 * c may be evaluated twice, so it must be free of side effects.
	 */
	#define BETWEEN(c, l, u) ((c) >= (l) && (c) <= (u))
	9
	/*
	 * lookup-table for the types of sequence first bytes
	 *
	 * Each entry describes one sequence length: the inclusive range of
	 * valid first bytes and the inclusive range of codepoints that may be
	 * encoded with that length. 'lower' doubles as the fixed high-bit
	 * pattern of the first byte.
	 */
	static const struct {
		uint_least8_t lower;  /* lower bound of sequence first byte */
		uint_least8_t upper;  /* upper bound of sequence first byte */
		uint_least32_t mincp; /* smallest non-overlong encoded codepoint */
		uint_least32_t maxcp; /* largest encodable codepoint */
		/*
		 * implicit: table-offset represents the number of following
		 * bytes of the form 10xxxxxx (6 bits capacity each)
		 */
	} lut[] = {
		[0] = {
			/* 0xxxxxxx */
			.lower = 0x00, /* 00000000 */
			.upper = 0x7F, /* 01111111 */
			.mincp = (uint_least32_t)0,
			.maxcp = ((uint_least32_t)1 << 7) - 1, /* 7 bits capacity */
		},
		[1] = {
			/* 110xxxxx */
			.lower = 0xC0, /* 11000000 */
			.upper = 0xDF, /* 11011111 */
			.mincp = (uint_least32_t)1 << 7,
			.maxcp = ((uint_least32_t)1 << 11) - 1, /* 5+6=11 bits capacity */
		},
		[2] = {
			/* 1110xxxx */
			.lower = 0xE0, /* 11100000 */
			.upper = 0xEF, /* 11101111 */
			.mincp = (uint_least32_t)1 << 11,
			.maxcp = ((uint_least32_t)1 << 16) - 1, /* 4+6+6=16 bits capacity */
		},
		[3] = {
			/* 11110xxx */
			.lower = 0xF0, /* 11110000 */
			.upper = 0xF7, /* 11110111 */
			.mincp = (uint_least32_t)1 << 16,
			.maxcp = ((uint_least32_t)1 << 21) - 1, /* 3+6+6+6=21 bits capacity */
		},
	};
	50
	51 size_t
	52 grapheme_decode_utf8(const char str, size_t len, uint_least32_t cp)
	53 {
	54 size_t off, i;
	55 uint_least32_t tmp;
	56
	57 if (cp == NULL) {
	58 /*
	59 * instead of checking every time if cp is NULL within
	60 * the decoder, simply point it at a dummy variable here.
	61 */
	62 cp = &tmp;
	63 }
	64
	65 if (str == NULL \|\| len == 0) {
	66 /* a sequence must be at least 1 byte long */
	67 *cp = GRAPHEME_INVALID_CODEPOINT;
	68 return 0;
	69 }
	70
	71 /* identify sequence type with the first byte */
	72 for (off = 0; off < LEN(lut); off++) {
	73 if (BETWEEN(((const unsigned char *)str)[0], lut[off].lo…
	74 lut[off].upper)) {
	75 /*
	76 * first byte is within the bounds; fill
	77 * p with the the first bits contained in
	78 * the first byte (by subtracting the high bits)
	79 */
	80 cp = ((const unsigned char )str)[0] - lut[off]…
	81 break;
	82 }
	83 }
	84 if (off == LEN(lut)) {
	85 /*
	86 * first byte does not match a sequence type;
	87 * set cp as invalid and return 1 byte processed
	88 *
	89 * this also includes the cases where bits higher than
	90 * the 8th are set on systems with CHAR_BIT > 8
	91 */
	92 *cp = GRAPHEME_INVALID_CODEPOINT;
	93 return 1;
	94 }
	95 if (1 + off > len) {
	96 /*
	97 * input is not long enough, set cp as invalid
	98 */
	99 *cp = GRAPHEME_INVALID_CODEPOINT;
	100
	101 /*
	102 * count the following continuation bytes, but nothing
	103 * else in case we have a "rogue" case where e.g. such a
	104 * sequence starter occurs right before a NUL-byte.
	105 */
	106 for (i = 0; 1 + i < len; i++) {
	107 if (!BETWEEN(((const unsigned char *)str)[1 + i]…
	108 0xBF)) {
	109 break;
	110 }
	111 }
	112
	113 /*
	114 * if the continuation bytes do not continue until
	115 * the end, return the incomplete sequence length.
	116 * Otherwise return the number of bytes we actually
	117 * expected, which is larger than n.
	118 */
	119 return ((1 + i) < len) ? (1 + i) : (1 + off);
	120 }
	121
	122 /*
	123 * process 'off' following bytes, each of the form 10xxxxxx
	124 * (i.e. between 0x80 (10000000) and 0xBF (10111111))
	125 */
	126 for (i = 1; i <= off; i++) {
	127 if (!BETWEEN(((const unsigned char *)str)[i], 0x80, 0xBF…
	128 /*
	129 * byte does not match format; return
	130 * number of bytes processed excluding the
	131 * unexpected character as recommended since
	132 * Unicode 6 (chapter 3)
	133 *
	134 * this also includes the cases where bits
	135 * higher than the 8th are set on systems
	136 * with CHAR_BIT > 8
	137 */
	138 *cp = GRAPHEME_INVALID_CODEPOINT;
	139 return 1 + (i - 1);
	140 }
	141 /*
	142 * shift codepoint by 6 bits and add the 6 stored bits
	143 * in s[i] to it using the bitmask 0x3F (00111111)
	144 */
	145 cp = (cp << 6) \| (((const unsigned char *)str)[i] & 0x…
	146 }
	147
	148 if (*cp < lut[off].mincp \|\|
	149 BETWEEN(*cp, UINT32_C(0xD800), UINT32_C(0xDFFF)) \|\|
	150 *cp > UINT32_C(0x10FFFF)) {
	151 /*
	152 * codepoint is overlong encoded in the sequence, is a
	153 * high or low UTF-16 surrogate half (0xD800..0xDFFF) or
	154 * not representable in UTF-16 (>0x10FFFF) (RFC-3629
	155 * specifies the latter two conditions)
	156 */
	157 *cp = GRAPHEME_INVALID_CODEPOINT;
	158 }
	159
	160 return 1 + off;
	161 }
	162
	163 size_t
	164 grapheme_encode_utf8(uint_least32_t cp, char *str, size_t len)
	165 {
	166 size_t off, i;
	167
	168 if (BETWEEN(cp, UINT32_C(0xD800), UINT32_C(0xDFFF)) \|\|
	169 cp > UINT32_C(0x10FFFF)) {
	170 /*
	171 * codepoint is a high or low UTF-16 surrogate half
	172 * (0xD800..0xDFFF) or not representable in UTF-16
	173 * (>0x10FFFF), which RFC-3629 deems invalid for UTF-8.
	174 */
	175 cp = GRAPHEME_INVALID_CODEPOINT;
	176 }
	177
	178 /* determine necessary sequence type */
	179 for (off = 0; off < LEN(lut); off++) {
	180 if (cp <= lut[off].maxcp) {
	181 break;
	182 }
	183 }
	184 if (1 + off > len \|\| str == NULL \|\| len == 0) {
	185 /*
	186 * specified buffer is too small to store sequence or
	187 * the caller just wanted to know how many bytes the
	188 * codepoint needs by passing a NULL-buffer.
	189 */
	190 return 1 + off;
	191 }
	192
	193 /* build sequence by filling cp-bits into each byte */
	194
	195 /*
	196 * lut[off].lower is the bit-format for the first byte and
	197 * the bits to fill into it are determined by shifting the
	198 * cp 6 times the number of following bytes, as each
	199 * following byte stores 6 bits, yielding the wanted bits.
	200 *
	201 * We do not overwrite the mask because we guaranteed earlier
	202 * that there are no bits higher than the mask allows.
	203 */
	204 ((unsigned char *)str)[0] =
	205 lut[off].lower \| (uint_least8_t)(cp >> (6 * off));
	206
	207 for (i = 1; i <= off; i++) {
	208 /*
	209 * the bit-format for following bytes is 10000000 (0x80)
	210 * and it each stores 6 bits in the 6 low bits that we
	211 * extract from the properly-shifted value using the
	212 * mask 00111111 (0x3F)
	213 */
	214 ((unsigned char *)str)[i] =
	215 0x80 \| ((cp >> (6 * (off - i))) & 0x3F);
	216 }
	217
	218 return 1 + off;
	219 }