GopherProxy

	tenc-unicode.lisp - clic - Clic is an command line interactive client for gophe…
	git clone git://bitreich.org/clic/ git://hg6vgqziawt5s4dj.onion/clic/
	Log
	Files
	Refs
	Tags
	LICENSE
	---
	tenc-unicode.lisp (42416B)
	---
	1 ;;;; -- Mode: lisp; indent-tabs-mode: nil --
	2 ;;;
	3 ;;; enc-unicode.lisp --- Unicode encodings.
	4 ;;;
	5 ;;; Copyright (C) 2007, Luis Oliveira <[email protected]>
	6 ;;;
	7 ;;; Permission is hereby granted, free of charge, to any person
	8 ;;; obtaining a copy of this software and associated documentation
	9 ;;; files (the "Software"), to deal in the Software without
	10 ;;; restriction, including without limitation the rights to use, copy,
	11 ;;; modify, merge, publish, distribute, sublicense, and/or sell copies
	12 ;;; of the Software, and to permit persons to whom the Software is
	13 ;;; furnished to do so, subject to the following conditions:
	14 ;;;
	15 ;;; The above copyright notice and this permission notice shall be
	16 ;;; included in all copies or substantial portions of the Software.
	17 ;;;
	18 ;;; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
	19 ;;; EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
	20 ;;; MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
	21 ;;; NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
	22 ;;; HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
	23 ;;; WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
	24 ;;; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
	25 ;;; DEALINGS IN THE SOFTWARE.
	26
	27 ;;; This implementation is largely based on OpenMCL's l1-unicode.lisp
	28 ;;; Copyright (C) 2006 Clozure Associates and contributors.
	29
	30 (in-package #:babel-encodings)
	31
	32 (eval-when (:compile-toplevel :load-toplevel :execute)
	33 (defconstant +repl+ #xfffd "Unicode replacement character code point.")
	34 (defconstant +byte-order-mark-code+ #xfeff)
	35 (defconstant +swapped-byte-order-mark-code+ #xfffe)
	36 (defconstant +swapped-byte-order-mark-code-32+ #xfffe0000))
	37
	38 ;;; Some convenience macros adding FIXNUM declarations.
	39 (defmacro f-ash (integer count) `(the fixnum (ash ,integer ,count)))
	40 (defmacro f-logior (&rest integers) `(the fixnum (logior ,@integers)))
	41 (defmacro f-logand (&rest integers) `(the fixnum (logand ,@integers)))
	42 (defmacro f-logxor (&rest integers) `(the fixnum (logxor ,@integers)))
	43
	44 ;;;; UTF-8
	45
	46 (define-character-encoding :utf-8
	47 "An 8-bit, variable-length character encoding in which
	48 character code points in the range #x00-#x7f can be encoded in a
	49 single octet; characters with larger code values can be encoded
	50 in 2 to 4 bytes."
	51 :max-units-per-char 4
	52 :literal-char-code-limit #x80
	53 :bom-encoding #(#xef #xbb #xbf)
	54 :default-replacement #xfffd)
	55
	56 (define-condition invalid-utf8-starter-byte (character-decoding-error)
	57 ()
	58 (:documentation "Signalled when an invalid UTF-8 starter byte is found…
	59
	60 (define-condition invalid-utf8-continuation-byte (character-decoding-err…
	61 ()
	62 (:documentation
	63 "Signalled when an invalid UTF-8 continuation byte is found."))
	64
	65 (define-condition overlong-utf8-sequence (character-decoding-error)
	66 ()
	67 (:documentation "Signalled upon overlong UTF-8 sequences."))
	68
	69 (define-octet-counter :utf-8 (getter type)
	70 `(named-lambda utf-8-octet-counter (seq start end max)
	71 (declare (type ,type seq) (fixnum start end max))
	72 (loop with noctets fixnum = 0
	73 for i fixnum from start below end
	74 for code of-type code-point = (,getter seq i) do
	75 (let ((new (+ (cond ((< code #x80) 1)
	76 ((< code #x800) 2)
	77 ((< code #x10000) 3)
	78 (t 4))
	79 noctets)))
	80 (if (and (plusp max) (> new max))
	81 (loop-finish)
	82 (setq noctets new)))
	83 finally (return (values noctets i)))))
	84
	85 (define-code-point-counter :utf-8 (getter type)
	86 `(named-lambda utf-8-code-point-counter (seq start end max)
	87 (declare (type ,type seq) (fixnum start end max))
	88 (loop with nchars fixnum = 0
	89 with i fixnum = start
	90 while (< i end) do
	91 ;; check for invalid continuation bytes
	92 (macrolet ((invalid-cb-p (n)
	93 `(and (< (+ i ,n) end)
	94 (not (< #x7f (,',getter seq (+ i ,n)) #xc0…
	95 ;; wrote this code with LET instead of FOR because CLISP's
	96 ;; LOOP doesn't like WHILE clauses before FOR clauses.
	97 (let* ((octet (,getter seq i))
	98 (next-i (+ i (cond ((or (< octet #xc0) (invalid-cb-p…
	99 ((or (< octet #xe0) (invalid-cb-p…
	100 ((or (< octet #xf0) (invalid-cb-p…
	101 ((or (< octet #xf8) (invalid-cb-p…
	102 ((or (< octet #xfc) (invalid-cb-p…
	103 (t 6)))))
	104 (declare (type ub8 octet) (fixnum next-i))
	105 (cond
	106 ((> next-i end)
	107 ;; Should we add restarts to this error, we'll have
	108 ;; to figure out a way to communicate with the
	109 ;; decoder since we probably want to do something
	110 ;; about it right here when we have a chance to
	111 ;; change the count or something. (Like an
	112 ;; alternative replacement character or perhaps the
	113 ;; existence of this error so that the decoder
	114 ;; doesn't have to check for it on every iteration
	115 ;; like we do.)
	116 ;;
	117 ;; FIXME: The data for this error is not right.
	118 (decoding-error (vector octet) :utf-8 seq i
	119 nil 'end-of-input-in-character)
	120 (return (values (1+ nchars) end)))
	121 (t
	122 (setq nchars (1+ nchars)
	123 i next-i)
	124 (when (and (plusp max) (= nchars max))
	125 (return (values nchars i)))))))
	126 finally (progn
	127 (assert (= i end))
	128 (return (values nchars i))))))
	129
	130 (define-encoder :utf-8 (getter src-type setter dest-type)
	131 `(named-lambda utf-8-encoder (src start end dest d-start)
	132 (declare (type ,src-type src)
	133 (type ,dest-type dest)
	134 (fixnum start end d-start))
	135 (loop with di fixnum = d-start
	136 for i fixnum from start below end
	137 for code of-type code-point = (,getter src i) do
	138 (macrolet ((set-octet (offset value)
	139 `(,',setter ,value dest (the fixnum (+ di ,offse…
	140 (cond
	141 ;; 1 octet
	142 ((< code #x80)
	143 (set-octet 0 code)
	144 (incf di))
	145 ;; 2 octets
	146 ((< code #x800)
	147 (set-octet 0 (logior #xc0 (f-ash code -6)))
	148 (set-octet 1 (logior #x80 (f-logand code #x3f)))
	149 (incf di 2))
	150 ;; 3 octets
	151 ((< code #x10000)
	152 (set-octet 0 (logior #xe0 (f-ash code -12)))
	153 (set-octet 1 (logior #x80 (f-logand #x3f (f-ash code -6)…
	154 (set-octet 2 (logior #x80 (f-logand code #x3f)))
	155 (incf di 3))
	156 ;; 4 octets
	157 (t
	158 (set-octet 0 (logior #xf0 (f-logand #x07 (f-ash code -18…
	159 (set-octet 1 (logior #x80 (f-logand #x3f (f-ash code -12…
	160 (set-octet 2 (logior #x80 (f-logand #x3f (f-ash code -6)…
	161 (set-octet 3 (logior #x80 (logand code #x3f)))
	162 (incf di 4))))
	163 finally (return (the fixnum (- di d-start))))))
	164
	165 (define-decoder :utf-8 (getter src-type setter dest-type)
	166 `(named-lambda utf-8-decoder (src start end dest d-start)
	167 (declare (type ,src-type src)
	168 (type ,dest-type dest)
	169 (fixnum start end d-start))
	170 (let ((u2 0) (u3 0) (u4 0) (u5 0) (u6 0))
	171 (declare (type ub8 u2 u3 u4 u5 u6))
	172 (loop for di fixnum from d-start
	173 for i fixnum from start below end
	174 for u1 of-type ub8 = (,getter src i) do
	175 ;; Note: CONSUME-OCTET doesn't check if I is being
	176 ;; incremented past END. We're assuming that END has
	177 ;; been calculated with the CODE-POINT-POINTER above that
	178 ;; checks this.
	179 (macrolet
	180 ((consume-octet ()
	181 `(let ((next-i (incf i)))
	182 (if (= next-i end)
	183 ;; FIXME: data for this error is incomplete.
	184 ;; and signalling this error twice
	185 (return-from setter-block
	186 (decoding-error nil :utf-8 src i +repl+
	187 'end-of-input-in-character))
	188 (,',getter src next-i))))
	189 (handle-error (n &optional (c 'character-decoding-erro…
	190 `(decoding-error
	191 (vector ,@(subseq '(u1 u2 u3 u4 u5 u6) 0 n))
	192 :utf-8 src (1+ (- i ,n)) +repl+ ',c))
	193 (handle-error-if-icb (var n)
	194 `(when (not (< #x7f ,var #xc0))
	195 (decf i)
	196 (return-from setter-block
	197 (handle-error ,n invalid-utf8-continuation-byte…
	198 (,setter
	199 (block setter-block
	200 (cond
	201 ((< u1 #x80) u1) ; 1 octet
	202 ((< u1 #xc0)
	203 (handle-error 1 invalid-utf8-starter-byte))
	204 (t
	205 (setq u2 (consume-octet))
	206 (handle-error-if-icb u2 1)
	207 (cond
	208 ((< u1 #xc2)
	209 (handle-error 2 overlong-utf8-sequence))
	210 ((< u1 #xe0) ; 2 octets
	211 (logior (f-ash (f-logand #x1f u1) 6)
	212 (f-logxor u2 #x80)))
	213 (t
	214 (setq u3 (consume-octet))
	215 (handle-error-if-icb u3 2)
	216 (cond
	217 ((and (= u1 #xe0) (< u2 #xa0))
	218 (handle-error 3 overlong-utf8-sequence))
	219 ((< u1 #xf0) ; 3 octets
	220 (let ((start (f-logior (f-ash (f-logand u1 #x…
	221 (f-ash (f-logand u2 #x…
	222 (if (<= #xd800 start #xdfc0)
	223 (handle-error 3 character-out-of-range)
	224 (logior start (f-logand u3 #x3f)))))
	225 (t ; 4 octets
	226 (setq u4 (consume-octet))
	227 (handle-error-if-icb u4 3)
	228 (cond
	229 ((and (= u1 #xf0) (< u2 #x90))
	230 (handle-error 4 overlong-utf8-sequence))
	231 ((< u1 #xf8)
	232 (if (or (> u1 #xf4) (and (= u1 #xf4) (> u2…
	233 (handle-error 4 character-out-of-range)
	234 (f-logior (f-ash (f-logand u1 7) 18)
	235 (f-ash (f-logxor u2 #x80) 12)
	236 (f-ash (f-logxor u3 #x80) 6)
	237 (f-logxor u4 #x80))))
	238 ;; from here on we'll be getting either
	239 ;; invalid continuation bytes or overlong
	240 ;; 5-byte or 6-byte sequences.
	241 (t
	242 (setq u5 (consume-octet))
	243 (handle-error-if-icb u5 4)
	244 (cond
	245 ((and (= u1 #xf8) (< u2 #x88))
	246 (handle-error 5 overlong-utf8-sequence))
	247 ((< u1 #xfc)
	248 (handle-error 5 character-out-of-range))
	249 (t
	250 (setq u6 (consume-octet))
	251 (handle-error-if-icb u6 5)
	252 (cond
	253 ((and (= u1 #xfc) (< u2 #x84))
	254 (handle-error 6 overlong-utf8-sequen…
	255 (t
	256 (handle-error 6 character-out-of-ran…
	257 )))))))))))))
	258 dest di))
	259 finally (return (the fixnum (- di d-start)))))))
	260
	261 ;;;; UTF-8B
	262
	263 ;;; The following excerpt from a linux-utf8 message by Markus Kuhn is
	264 ;;; the closest thing to a UTF-8B specification:
	265 ;;;
	266 ;;; <http://mail.nl.linux.org/linux-utf8/2000-07/msg00040.html>
	267 ;;;
	268 ;;; "D) Emit a malformed UTF-16 sequence for every byte in a malformed
	269 ;;; UTF-8 sequence
	270 ;;;
	271 ;;; All the previous options for converting malformed UTF-8 sequences
	272 ;;; to UTF-16 destroy information. This can be highly undesirable in
	273 ;;; applications such as text file editors, where guaranteed binary
	274 ;;; transparency is a desireable feature. (E.g., I frequently edit
	275 ;;; executable code or graphic files with the Emacs text editor and I
	276 ;;; hate the idea that my editor might automatically make U+FFFD
	277 ;;; substitutions at locations that I haven't even edited when I save
	278 ;;; the file again.)
	279 ;;;
	280 ;;; I therefore suggested 1999-11-02 on the unicode@xxxxxxxxxxx
	281 ;;; mailing list the following approach. Instead of using U+FFFD,
	282 ;;; simply encode malformed UTF-8 sequences as malformed UTF-16
	283 ;;; sequences. Malformed UTF-8 sequences consist excludively of the
	284 ;;; bytes 0x80 - 0xff, and each of these bytes can be represented
	285 ;;; using a 16-bit value from the UTF-16 low-half surrogate zone
	286 ;;; U+DC80 to U+DCFF. Thus, the overlong "K" (U+004B) 0xc1 0x8b from
	287 ;;; the above example would be represented in UTF-16 as U+DCC1
	288 ;;; U+DC8B. If we simply make sure that every UTF-8 encoded surrogate
	289 ;;; character is also treated like a malformed sequence, then there
	290 ;;; is no way that a single high-half surrogate could precede the
	291 ;;; encoded malformed sequence and cause a valid UTF-16 sequence to
	292 ;;; emerge.
	293 ;;;
	294 ;;; This way 100% binary transparent UTF-8 -> UTF-16 -> UTF-8
	295 ;;; round-trip compatibility can be achieved quite easily.
	296 ;;;
	297 ;;; On an output device, a lonely low-half surrogate character should
	298 ;;; be treated just like a character outside the adopted subset of
	299 ;;; representable characters, that is for the end user, the display
	300 ;;; would look exactly like with semantics B), i.e. one symbol per
	301 ;;; byte of a malformed sequence. However in contrast to semantics
	302 ;;; B), no information is thrown away, and a cut&paste in an editor
	303 ;;; or terminal emulator will be guaranteed to reconstruct the
	304 ;;; original byte sequence. This should greatly reduce the incidence
	305 ;;; of accidental corruption of binary data by UTF-8 -> UTF-16 ->
	306 ;;; UTF-8 conversion round trips."
	307
	308 (define-character-encoding :utf-8b
	309 "An 8-bit, variable-length character encoding in which
	310 character code points in the range #x00-#x7f can be encoded in a
	311 single octet; characters with larger code values can be encoded
	312 in 2 to 4 bytes. Invalid UTF-8 sequences are encoded with #xDCXX
	313 code points for each invalid byte."
	314 :max-units-per-char 4
	315 :literal-char-code-limit #x80
	316 :bom-encoding #(#xef #xbb #xbf)
	317 :default-replacement nil)
	318
	319 ;;; TODO: reuse the :UTF-8 octet counter through a simple macro.
	320 (define-octet-counter :utf-8b (getter type)
	321 `(named-lambda utf-8b-octet-counter (seq start end max)
	322 (declare (type ,type seq) (fixnum start end max))
	323 (loop with noctets fixnum = 0
	324 for i fixnum from start below end
	325 for code of-type code-point = (,getter seq i) do
	326 (let ((new (+ (cond ((< code #x80) 1)
	327 ((< code #x800) 2)
	328 ((<= #xdc80 code #xdcff) 1)
	329 ((< code #x10000) 3)
	330 (t 4))
	331 noctets)))
	332 (if (and (plusp max) (> new max))
	333 (loop-finish)
	334 (setq noctets new)))
	335 finally (return (values noctets i)))))
	336
	337 (define-code-point-counter :utf-8b (getter type)
	338 `(named-lambda utf-8b-code-point-counter (seq start end max)
	339 (declare (type ,type seq) (fixnum start end max))
	340 (loop with nchars fixnum = 0
	341 with i fixnum = start
	342 while (< i end) do
	343 ;; wrote this code with LET instead of FOR because CLISP's
	344 ;; LOOP doesn't like WHILE clauses before FOR clauses.
	345 (let* ((octet (,getter seq i))
	346 (noctets (cond ((< octet #x80) 1)
	347 ((< octet #xe0) 2)
	348 ((< octet #xf0) 3)
	349 (t 4))))
	350 (declare (type ub8 octet) (fixnum noctets))
	351 (cond
	352 ((> (+ i noctets) end)
	353 ;; If this error is suppressed these last few bytes
	354 ;; will be encoded as raw bytes later.
	355 (decoding-error (vector octet) :utf-8 seq i
	356 nil 'end-of-input-in-character)
	357 (return (values (+ nchars (- end i)) end)))
	358 (t
	359 ;; FIXME: clean this mess up.
	360 (let* ((u1 octet)
	361 (u2 (if (>= noctets 2) (,getter seq (1+ i)) 0))
	362 (u3 (if (>= noctets 3) (,getter seq (+ i 2)) 0))
	363 (u4 (if (= noctets 4) (,getter seq (+ i 3)) 0))
	364 (inc (or (and (> noctets 1)
	365 (< u1 #xc2))
	366 (and (= noctets 2)
	367 (not (logior u2 #x40)))
	368 (and (= noctets 3)
	369 (not (and (< (f-logxor u2 #x80) #x4…
	370 (< (f-logxor u3 #x80) #x4…
	371 (or (>= u1 #xe1) (>= u2 #…
	372 (or (/= u1 #xed) (< u2 #x…
	373 (and (= noctets 4)
	374 (not
	375 (and (< (f-logxor u2 #x80) #x40)
	376 (< (f-logxor u3 #x80) #x40)
	377 (< (f-logxor u4 #x80) #x40)
	378 (or (>= u1 #xf1) (>= u2 #x90)…
	379 (let ((new-nchars (if inc (+ nchars noctets) (1+ nchar…
	380 (when (and (plusp max) (> new-nchars max))
	381 (return (values nchars i)))
	382 (incf i noctets)
	383 (setq nchars new-nchars))))))
	384 finally (progn
	385 (assert (= i end))
	386 (return (values nchars i))))))
	387
	388 ;;; TODO: reuse the :UTF-8 encoder with through a simple macro.
	389 (define-encoder :utf-8b (getter src-type setter dest-type)
	390 `(named-lambda utf-8b-encoder (src start end dest d-start)
	391 (declare (type ,src-type src)
	392 (type ,dest-type dest)
	393 (fixnum start end d-start))
	394 (loop with di fixnum = d-start
	395 for i fixnum from start below end
	396 for code of-type code-point = (,getter src i) do
	397 (macrolet ((set-octet (offset value)
	398 `(,',setter ,value dest (the fixnum (+ di ,offse…
	399 (cond
	400 ;; 1 octet
	401 ((< code #x80)
	402 (set-octet 0 code)
	403 (incf di))
	404 ;; 2 octets
	405 ((< code #x800)
	406 (set-octet 0 (logior #xc0 (f-ash code -6)))
	407 (set-octet 1 (logior #x80 (f-logand code #x3f)))
	408 (incf di 2))
	409 ;; 1 octet (invalid octet)
	410 ((<= #xdc80 code #xdcff)
	411 (set-octet 0 (f-logand code #xff))
	412 (incf di))
	413 ;; 3 octets
	414 ((< code #x10000)
	415 (set-octet 0 (logior #xe0 (f-ash code -12)))
	416 (set-octet 1 (logior #x80 (f-logand #x3f (f-ash code -6)…
	417 (set-octet 2 (logior #x80 (f-logand code #x3f)))
	418 (incf di 3))
	419 ;; 4 octets
	420 (t
	421 (set-octet 0 (logior #xf0 (f-logand #x07 (f-ash code -18…
	422 (set-octet 1 (logior #x80 (f-logand #x3f (f-ash code -12…
	423 (set-octet 2 (logior #x80 (f-logand #x3f (f-ash code -6)…
	424 (set-octet 3 (logand #x3f code))
	425 (incf di 4))))
	426 finally (return (the fixnum (- di d-start))))))
	427
	428 (define-decoder :utf-8b (getter src-type setter dest-type)
	429 `(named-lambda utf-8b-decoder (src start end dest d-start)
	430 (declare (type ,src-type src)
	431 (type ,dest-type dest)
	432 (fixnum start end d-start))
	433 (let ((u2 0) (u3 0) (u4 0))
	434 (declare (type ub8 u2 u3 u4))
	435 (loop for di fixnum from d-start
	436 for i fixnum from start below end
	437 for u1 of-type ub8 = (,getter src i) do
	438 ;; Unlike the UTF-8 version, this version of
	439 ;; CONSUME-OCTET needs to check if I is being incremented
	440 ;; past END because we might have trailing binary
	441 ;; garbage.
	442 (macrolet
	443 ((consume-octet (n)
	444 `(if (= i (1- end))
	445 (encode-raw-octets ,n)
	446 (,',getter src (incf i))))
	447 (encode-raw-octets (n)
	448 `(progn
	449 ,@(loop for i below n and var in '(u1 u2 u3 u4)
	450 collect `(,',setter (logior #xdc00 ,var) …
	451 unless (= i (1- n))
	452 collect '(incf di))
	453 (return-from set-body))))
	454 (block set-body
	455 (,setter (cond
	456 ((< u1 #x80) ; 1 octet
	457 u1)
	458 ((>= u1 #xc2)
	459 (setq u2 (consume-octet 1))
	460 (cond
	461 ((< u1 #xe0) ; 2 octets
	462 (if (< (f-logxor u2 #x80) #x40)
	463 (logior (f-ash (f-logand #x1f u1) 6)
	464 (f-logxor u2 #x80))
	465 (encode-raw-octets 2)))
	466 (t
	467 (setq u3 (consume-octet 2))
	468 (cond
	469 ((< u1 #xf0) ; 3 octets
	470 (if (and (< (f-logxor u2 #x80) #x40)
	471 (< (f-logxor u3 #x80) #x40)
	472 (or (>= u1 #xe1) (>= u2 #xa0…
	473 (let ((start (f-logior (f-ash (f-…
	474 (f-ash (f-…
	475 (if (<= #xd800 start #xdfc0)
	476 (encode-raw-octets 3)
	477 (logior start (f-logand u3 …
	478 (encode-raw-octets 3)))
	479 (t ; 4 octets
	480 (setq u4 (consume-octet 3))
	481 (if (and (< (f-logxor u2 #x80) #x40)
	482 (< (f-logxor u3 #x80) #x40)
	483 (< (f-logxor u4 #x80) #x40)
	484 (or (>= u1 #xf1) (>= u2 #x90…
	485 (logior
	486 (f-logior (f-ash (f-logand u1 7)…
	487 (f-ash (f-logxor u2 #x…
	488 (f-logior (f-ash (f-logxor u3 #x…
	489 (f-logxor u4 #x80)))
	490 (encode-raw-octets 4)))))))
	491 (t (encode-raw-octets 1)))
	492 dest di)))
	493 finally (return (the fixnum (- di d-start)))))))
	494
	495 ;;;; UTF-16
	496
	497 ;;; TODO: add a way to pass some info at compile-time telling us that,
	498 ;;; for example, the maximum code-point will always be < #x10000 in
	499 ;;; which case we could simply return (* 2 (- end start)).
	500 (defmacro utf16-octet-counter (getter type)
	501 `(named-lambda utf-16-octet-counter (seq start end max)
	502 (declare (type ,type seq) (fixnum start end max))
	503 (loop with noctets fixnum = 0
	504 for i fixnum from start below end
	505 for code of-type code-point = (,getter seq i)
	506 do (let ((new (the fixnum (+ (if (< code #x10000) 2 4) noctet…
	507 (if (and (plusp max) (> new max))
	508 (loop-finish)
	509 (setq noctets new)))
	510 finally (return (values noctets i)))))
	511
	512 (defmacro utf-16-combine-surrogate-pairs (u1 u2)
	513 `(the (unsigned-byte 21)
	514 (+ #x10000
	515 (the (unsigned-byte 20)
	516 (logior
	517 (the (unsigned-byte 20)
	518 (ash (the (unsigned-byte 10) (- ,u1 #xd800)) 10))
	519 (the (unsigned-byte 10)
	520 (- ,u2 #xdc00)))))))
	521
	522 (defmacro define-utf-16 (name &optional endianness)
	523 (check-type endianness (or null (eql :be) (eql :le)))
	524 (check-type name keyword)
	525 (let ((swap-var (gensym "SWAP"))
	526 (code-point-counter-name
	527 (format-symbol t '#:~a-code-point-counter (string name)))
	528 (encoder-name (format-symbol t '#:~a-encoder (string name)))
	529 (decoder-name (format-symbol t '#:~a-decoder (string name))))
	530 (labels ((make-bom-check-form (end start getter seq)
	531 (if (null endianness)
	532 ``((,',swap-var
	533 (when (> ,,end ,,start)
	534 (case (,,getter ,,seq ,,start 2 :ne)
	535 (#.+byte-order-mark-code+ (incf ,,start 2) nil)
	536 (#.+swapped-byte-order-mark-code+ (incf ,,star…
	537 (t #+little-endian t)))))
	538 '()))
	539 (make-getter-form (getter src i)
	540 (case endianness
	541 (:le ``(,,getter ,,src ,,i 2 :le))
	542 (:be ``(,,getter ,,src ,,i 2 :be))
	543 (t ``(if ,',swap-var
	544 (,,getter ,,src ,,i 2 :re)
	545 (,,getter ,,src ,,i 2 :ne)))))
	546 (make-setter-form (setter code dest di)
	547 (case endianness
	548 (:be ``(,,setter ,,code ,,dest ,,di 2 :be))
	549 (:le ``(,,setter ,,code ,,dest ,,di 2 :le))
	550 (t ``(,,setter ,,code ,,dest ,,di 2 :ne)))))
	551 `(progn
	552 (define-octet-counter ,name (getter type)
	553 `(utf16-octet-counter ,getter ,type))
	554 (define-code-point-counter ,name (getter type)
	555 `(named-lambda ,',code-point-counter-name (seq start end max)
	556 (declare (type ,type seq) (fixnum start end max))
	557 (let* ,,(make-bom-check-form ''end ''start 'getter ''seq)
	558 (loop with count fixnum = 0
	559 with i fixnum = start
	560 while (<= i (- end 2)) do
	561 (let* ((code ,,(make-getter-form 'getter ''seq ''i))
	562 (next-i (+ i (if (or (< code #xd800) (>= code #…
	563 2
	564 4))))
	565 (declare (type (unsigned-byte 16) code) (fixnum next…
	566 (cond
	567 ((> next-i end)
	568 (decoding-error
	569 (vector (,getter seq i) (,getter seq (1+ i)))
	570 ,',name seq i nil 'end-of-input-in-character)
	571 (return (values count i)))
	572 (t
	573 (setq i next-i
	574 count (1+ count))
	575 (when (and (plusp max) (= count max))
	576 (return (values count i))))))
	577 finally (progn
	578 (assert (= i end))
	579 (return (values count i)))))))
	580 (define-encoder ,name (getter src-type setter dest-type)
	581 `(named-lambda ,',encoder-name (src start end dest d-start)
	582 (declare (type ,src-type src)
	583 (type ,dest-type dest)
	584 (fixnum start end d-start))
	585 (loop with di fixnum = d-start
	586 for i fixnum from start below end
	587 for code of-type code-point = (,getter src i)
	588 for high-bits fixnum = (- code #x10000) do
	589 (cond ((< high-bits 0)
	590 ,,(make-setter-form 'setter ''code ''dest ''di)
	591 (incf di 2))
	592 (t
	593 ,,(make-setter-form
	594 'setter ''(logior #xd800 (f-ash high-bits …
	595 ''dest ''di)
	596 ,,(make-setter-form
	597 'setter ''(logior #xdc00 (f-logand high-bi…
	598 ''dest ''(+ di 2))
	599 (incf di 4)))
	600 finally (return (the fixnum (- di d-start))))))
	601 (define-decoder ,name (getter src-type setter dest-type)
	602 `(named-lambda ,',decoder-name (src start end dest d-start)
	603 (declare (type ,src-type src)
	604 (type ,dest-type dest)
	605 (fixnum start end d-start))
	606 (let ,,(make-bom-check-form ''end ''start 'getter ''src)
	607 (loop with i fixnum = start
	608 for di fixnum from d-start
	609 until (= i end) do
	610 (let ((u1 ,,(make-getter-form 'getter ''src ''i)))
	611 (declare (type (unsigned-byte 16) u1))
	612 (incf i 2)
	613 (,setter (cond
	614 ((or (< u1 #xd800) (>= u1 #xe000)) ; 2 oc…
	615 u1)
	616 ((< u1 #xdc00) ; 4 octets
	617 (let ((u2 ,,(make-getter-form 'getter ''…
	618 (declare (type (unsigned-byte 16) u2))
	619 (incf i 2)
	620 (if (and (>= u2 #xdc00) (< u2 #xe000))
	621 (utf-16-combine-surrogate-pairs u1 u…
	622 (decoding-error
	623 (vector (,getter src (- i 4))
	624 (,getter src (- i 3))
	625 (,getter src (- i 2))
	626 (,getter src (- i 1)))
	627 ,',name src i +repl+))))
	628 (t
	629 (decoding-error (vector (,getter src (-…
	630 (,getter src (-…
	631 ,',name src i +repl+)))
	632 dest di))
	633 finally (return (the fixnum (- di d-start)))))))
	634 ',name))))
	635
	636 (define-character-encoding :utf-16
	637 "A 16-bit, variable-length encoding in which characters with
	638 code points less than #x10000 can be encoded in a single 16-bit
	639 word and characters with larger codes can be encoded in a pair of
	640 16-bit words. The endianness of the encoded data is indicated by
	641 the endianness of a byte-order-mark character (#\u+feff)
	642 prepended to the data; in the absence of such a character on
	643 input, the data is assumed to be in big-endian order. Output is
	644 written in native byte-order with a leading byte-order mark."
	645 :max-units-per-char 2
	646 :code-unit-size 16
	647 :native-endianness t ; not necessarily true when decoding
	648 :decode-literal-code-unit-limit #xd800
	649 :encode-literal-code-unit-limit #x10000
	650 :use-bom #+big-endian :utf-16be #+little-endian :utf-16le
	651 :bom-encoding #+big-endian #(#xfe #xff) #+little-endian #(#xff #xfe)
	652 :nul-encoding #(0 0)
	653 :default-replacement #xfffd
	654 :ambiguous #+little-endian t #+big-endian nil)
	655
	656 (define-utf-16 :utf-16)
	657
	658 (define-character-encoding :utf-16le
	659 "A 16-bit, variable-length encoding in which characters with
	660 code points less than #x10000 can be encoded in a single 16-bit
	661 word and characters with larger codes can be encoded in a pair of
	662 16-bit words. The data is assumed to be in little-endian order. Output…
	663 written in little-endian byte-order without a leading byte-order mark."
	664 :aliases '(:utf-16/le)
	665 :max-units-per-char 2
	666 :code-unit-size 16
	667 :native-endianness #+little-endian t #+big-endian nil
	668 :decode-literal-code-unit-limit #xd800
	669 :encode-literal-code-unit-limit #x10000
	670 :nul-encoding #(0 0)
	671 :default-replacement #xfffd)
	672
	673 (define-utf-16 :utf-16le :le)
	674
	675 (define-character-encoding :utf-16be
	676 "A 16-bit, variable-length encoding in which characters with
	677 code points less than #x10000 can be encoded in a single 16-bit
	678 word and characters with larger codes can be encoded in a pair of
	679 16-bit words. The data is assumed to be in big-endian order. Output is
	680 written in big-endian byte-order without a leading byte-order mark."
	681 :aliases '(:utf-16/be)
	682 :max-units-per-char 2
	683 :code-unit-size 16
	684 :native-endianness #+little-endian nil #+big-endian t
	685 :decode-literal-code-unit-limit #xd800
	686 :encode-literal-code-unit-limit #x10000
	687 :nul-encoding #(0 0)
	688 :default-replacement #xfffd)
	689
	690 (define-utf-16 :utf-16be :be)
	691
	692 (defmacro define-ucs (name bytes &optional endianness (limit #x110000))
	693 (check-type name keyword)
	694 (check-type bytes (or (eql 2) (eql 4)))
	695 (check-type endianness (or null (eql :le) (eql :be)))
	696 (let ((swap-var (gensym "SWAP"))
	697 (code-point-counter-name
	698 (format-symbol t '#:~a-code-point-counter (string name)))
	699 (encoder-name
	700 (format-symbol t '#:~a-encoder (string name)))
	701 (decoder-name
	702 (format-symbol t '#:~a-decoder (string name))))
	703 (labels ((make-bom-check-form (end start getter src)
	704 (if (null endianness)
	705 ``(when (not (zerop (- ,,end ,,start)))
	706 (case (,,getter ,,src 0 ,',bytes :ne)
	707 (#.+byte-order-mark-code+
	708 (incf ,,start ,',bytes) nil)
	709 (#.+swapped-byte-order-mark-code-32+
	710 (incf ,,start ,',bytes) t)
	711 (t #+little-endian t)))
	712 '()))
	713 (make-setter-form (setter code dest di)
	714 ``(,,setter ,,code ,,dest ,,di ,',bytes
	715 ,',(or endianness :ne)))
	716 (make-getter-form (getter src i)
	717 (if (null endianness)
	718 ``(if ,',swap-var
	719 (,,getter ,,src ,,i ,',bytes :re)
	720 (,,getter ,,src ,,i ,',bytes :ne))
	721 ``(,,getter ,,src ,,i ,',bytes ,',endianness))))
	722 `(progn
	723 (define-code-point-counter ,name (getter type)
	724 `(named-lambda ,',code-point-counter-name (seq start end max)
	725 (declare (type ,type seq) (fixnum start end max))
	726 ;; check for bom
	727 ,,(make-bom-check-form ''end ''start 'getter ''seq)
	728 (multiple-value-bind (count rem)
	729 (floor (- end start) ,',bytes)
	730 (cond
	731 ((and (plusp max) (> count max))
	732 (values max (the fixnum (+ start (* ,',bytes max)))))
	733 (t
	734 ;; check for incomplete last character
	735 (unless (zerop rem)
	736 (let ((vector (make-array ,',bytes :fill-pointer 0)…
	737 (dotimes (i rem)
	738 (vector-push (,getter seq (+ i (- end rem))) ve…
	739 (decoding-error vector ,',name seq (the fixnum (-…
	740 'end-of-input-in-character)
	741 (decf end rem)))
	742 (values count end))))))
	743 (define-encoder ,name (getter src-type setter dest-type)
	744 `(named-lambda ,',encoder-name (src start end dest d-start)
	745 (declare (type ,src-type src)
	746 (type ,dest-type dest)
	747 (fixnum start end d-start))
	748 (loop for i fixnum from start below end
	749 and di fixnum from d-start by ,',bytes
	750 for code of-type code-point = (,getter src i)
	751 do (if (>= code ,',limit)
	752 (encoding-error code ,',name src i +repl+)
	753 ,,(make-setter-form 'setter ''code ''dest ''d…
	754 finally (return (the fixnum (- di d-start))))))
	755 (define-decoder ,name (getter src-type setter dest-type)
	756 `(named-lambda ,',decoder-name (src start end dest d-start)
	757 (declare (type ,src-type src)
	758 (type ,dest-type dest)
	759 (fixnum start end d-start))
	760 (let ((,',swap-var ,,(make-bom-check-form ''end ''start 'g…
	761 (declare (ignorable ,',swap-var))
	762 (loop for i fixnum from start below end by ,',bytes
	763 and di from d-start
	764 do (,setter (let ((unit ,,(make-getter-form 'gette…
	765 (if (>= unit ,',limit)
	766 (decoding-error
	767 (vector (,getter src i)
	768 (,getter src (+ i 1))
	769 ,@,(if (= bytes 4)
	770 ``((,getter src …
	771 (,getter src …
	772 ,',name src i +repl+
	773 'character-out-of-range)
	774 unit))
	775 dest di)
	776 finally (return (the fixnum (- di d-start)))))))
	777 ',name))))
	778
	779 ;;;; UTF-32
	780
	781 (define-character-encoding :utf-32
	782 "A 32-bit, fixed-length encoding in which all Unicode
	783 characters can be encoded in a single 32-bit word. The
	784 endianness of the encoded data is indicated by the endianness of
	785 a byte-order-mark character (#\u+feff) prepended to the data; in
	786 the absence of such a character on input, input data is assumed
	787 to be in big-endian order. Output is written in native byte
	788 order with a leading byte-order mark."
	789 :aliases '(:ucs-4)
	790 :max-units-per-char 1
	791 :code-unit-size 32
	792 :native-endianness t ; not necessarily true when decoding
	793 :literal-char-code-limit #x110000
	794 :use-bom #+little-endian :utf-32le #+big-endian :utf-32be
	795 :bom-encoding
	796 #+big-endian #(#x00 #x00 #xfe #xff)
	797 #+little-endian #(#xff #xfe #x00 #x00)
	798 :nul-encoding #(0 0 0 0)
	799 :ambiguous #+little-endian t #+big-endian nil)
	800
	801 (define-ucs :utf-32 4)
	802
	803 (define-character-encoding :utf-32le
	804 "A 32-bit, fixed-length encoding in which all Unicode
	805 characters can be encoded in a single 32-bit word. Input data is assumed
	806 to be in little-endian order. Output is also written in little-endian b…
	807 order without a leading byte-order mark."
	808 :max-units-per-char 1
	809 :code-unit-size 32
	810 :aliases '(:utf-32/le :ucs-4le :ucs-4/le)
	811 :native-endianness #+little-endian t #+big-endian nil
	812 :literal-char-code-limit #x110000
	813 :nul-encoding #(0 0 0 0))
	814
	815 (define-ucs :utf-32le 4 :le)
	816
	817 (define-character-encoding :utf-32be
	818 "A 32-bit, fixed-length encoding in which all Unicode
	819 characters can be encoded in a single 32-bit word. Input data is assumed
	820 to be in big-endian order. Output is also written in big-endian byte
	821 order without a leading byte-order mark."
	822 :max-units-per-char 1
	823 :code-unit-size 32
	824 :aliases '(:utf-32/be :ucs-4be :ucs-4/be)
	825 :native-endianness #+little-endian nil #+big-endian t
	826 :literal-char-code-limit #x110000
	827 :nul-encoding #(0 0 0 0))
	828
	829 (define-ucs :utf-32be 4 :be)
	830
	831 ;; UCS-2
	832
	833 (define-character-encoding :ucs-2
	834 "A 16-bit, fixed-length encoding in which all Unicode
	835 characters can be encoded in a single 16-bit word. The
	836 endianness of the encoded data is indicated by the endianness of
	837 a byte-order-mark character (#\u+feff) prepended to the data; in
	838 the absence of such a character on input, input data is assumed
	839 to be in big-endian order. Output is written in native byte
	840 order with a leading byte-order mark."
	841 :aliases '(:ucs-2)
	842 :max-units-per-char 1
	843 :code-unit-size 16
	844 :native-endianness t ; not necessarily true when decoding
	845 :literal-char-code-limit #x10000
	846 :use-bom #+little-endian :ucs-2le #+big-endian :ucs-2be
	847 :bom-encoding
	848 #+big-endian #(#xfe #xff)
	849 #+little-endian #(#xff #xfe)
	850 :nul-encoding #(0 0)
	851 :ambiguous #+little-endian t #+big-endian nil)
	852
	853 (define-ucs :ucs-2 2 nil #x10000)
	854
	855 (define-character-encoding :ucs-2le
	856 "A 16-bit, fixed-length encoding in which all Unicode
	857 characters can be encoded in a single 16-bit word. Input data is assumed
	858 to be in little-endian order. Output is also written in little-endian b…
	859 order without a leading byte-order mark."
	860 :max-units-per-char 1
	861 :code-unit-size 16
	862 :aliases '(:ucs-2/le)
	863 :native-endianness #+little-endian t #+big-endian nil
	864 :literal-char-code-limit #x10000
	865 :nul-encoding #(0 0))
	866
	867 (define-ucs :ucs-2le 2 :le #x10000)
	868
	869 (define-character-encoding :ucs-2be
	870 "A 16-bit, fixed-length encoding in which all Unicode
	871 characters can be encoded in a single 16-bit word. Input data is assumed
	872 to be in big-endian order. Output is also written in big-endian byte
	873 order without a leading byte-order mark."
	874 :max-units-per-char 1
	875 :code-unit-size 16
	876 :aliases '(:ucs-2/be)
	877 :native-endianness #+little-endian nil #+big-endian t
	878 :literal-char-code-limit #x10000
	879 :nul-encoding #(0 0))
	880
	881 (define-ucs :ucs-2be 2 :be #x10000)