Introduction
Introduction Statistics Contact Development Disclaimer Help
tenc-unicode.lisp - clic - Clic is an command line interactive client for gophe…
git clone git://bitreich.org/clic/ git://hg6vgqziawt5s4dj.onion/clic/
Log
Files
Refs
Tags
LICENSE
---
tenc-unicode.lisp (42416B)
---
1 ;;;; -*- Mode: lisp; indent-tabs-mode: nil -*-
2 ;;;
3 ;;; enc-unicode.lisp --- Unicode encodings.
4 ;;;
5 ;;; Copyright (C) 2007, Luis Oliveira <[email protected]>
6 ;;;
7 ;;; Permission is hereby granted, free of charge, to any person
8 ;;; obtaining a copy of this software and associated documentation
9 ;;; files (the "Software"), to deal in the Software without
10 ;;; restriction, including without limitation the rights to use, copy,
11 ;;; modify, merge, publish, distribute, sublicense, and/or sell copies
12 ;;; of the Software, and to permit persons to whom the Software is
13 ;;; furnished to do so, subject to the following conditions:
14 ;;;
15 ;;; The above copyright notice and this permission notice shall be
16 ;;; included in all copies or substantial portions of the Software.
17 ;;;
18 ;;; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 ;;; EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 ;;; MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
21 ;;; NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
22 ;;; HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
23 ;;; WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24 ;;; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 ;;; DEALINGS IN THE SOFTWARE.
26
27 ;;; This implementation is largely based on OpenMCL's l1-unicode.lisp
28 ;;; Copyright (C) 2006 Clozure Associates and contributors.
29
30 (in-package #:babel-encodings)
31
32 (eval-when (:compile-toplevel :load-toplevel :execute)
33 (defconstant +repl+ #xfffd "Unicode replacement character code point.")
34 (defconstant +byte-order-mark-code+ #xfeff)
35 (defconstant +swapped-byte-order-mark-code+ #xfffe)
36 (defconstant +swapped-byte-order-mark-code-32+ #xfffe0000))
37
38 ;;; Some convenience macros adding FIXNUM declarations.
39 (defmacro f-ash (integer count) `(the fixnum (ash ,integer ,count)))
40 (defmacro f-logior (&rest integers) `(the fixnum (logior ,@integers)))
41 (defmacro f-logand (&rest integers) `(the fixnum (logand ,@integers)))
42 (defmacro f-logxor (&rest integers) `(the fixnum (logxor ,@integers)))
43
44 ;;;; UTF-8
45
46 (define-character-encoding :utf-8
47 "An 8-bit, variable-length character encoding in which
48 character code points in the range #x00-#x7f can be encoded in a
49 single octet; characters with larger code values can be encoded
50 in 2 to 4 bytes."
51 :max-units-per-char 4
52 :literal-char-code-limit #x80
53 :bom-encoding #(#xef #xbb #xbf)
54 :default-replacement #xfffd)
55
56 (define-condition invalid-utf8-starter-byte (character-decoding-error)
57 ()
58 (:documentation "Signalled when an invalid UTF-8 starter byte is found…
59
60 (define-condition invalid-utf8-continuation-byte (character-decoding-err…
61 ()
62 (:documentation
63 "Signalled when an invalid UTF-8 continuation byte is found."))
64
65 (define-condition overlong-utf8-sequence (character-decoding-error)
66 ()
67 (:documentation "Signalled upon overlong UTF-8 sequences."))
68
69 (define-octet-counter :utf-8 (getter type)
70 `(named-lambda utf-8-octet-counter (seq start end max)
71 (declare (type ,type seq) (fixnum start end max))
72 (loop with noctets fixnum = 0
73 for i fixnum from start below end
74 for code of-type code-point = (,getter seq i) do
75 (let ((new (+ (cond ((< code #x80) 1)
76 ((< code #x800) 2)
77 ((< code #x10000) 3)
78 (t 4))
79 noctets)))
80 (if (and (plusp max) (> new max))
81 (loop-finish)
82 (setq noctets new)))
83 finally (return (values noctets i)))))
84
85 (define-code-point-counter :utf-8 (getter type)
86 `(named-lambda utf-8-code-point-counter (seq start end max)
87 (declare (type ,type seq) (fixnum start end max))
88 (loop with nchars fixnum = 0
89 with i fixnum = start
90 while (< i end) do
91 ;; check for invalid continuation bytes
92 (macrolet ((invalid-cb-p (n)
93 `(and (< (+ i ,n) end)
94 (not (< #x7f (,',getter seq (+ i ,n)) #xc0…
95 ;; wrote this code with LET instead of FOR because CLISP's
96 ;; LOOP doesn't like WHILE clauses before FOR clauses.
97 (let* ((octet (,getter seq i))
98 (next-i (+ i (cond ((or (< octet #xc0) (invalid-cb-p…
99 ((or (< octet #xe0) (invalid-cb-p…
100 ((or (< octet #xf0) (invalid-cb-p…
101 ((or (< octet #xf8) (invalid-cb-p…
102 ((or (< octet #xfc) (invalid-cb-p…
103 (t 6)))))
104 (declare (type ub8 octet) (fixnum next-i))
105 (cond
106 ((> next-i end)
107 ;; Should we add restarts to this error, we'll have
108 ;; to figure out a way to communicate with the
109 ;; decoder since we probably want to do something
110 ;; about it right here when we have a chance to
111 ;; change the count or something. (Like an
112 ;; alternative replacement character or perhaps the
113 ;; existence of this error so that the decoder
114 ;; doesn't have to check for it on every iteration
115 ;; like we do.)
116 ;;
117 ;; FIXME: The data for this error is not right.
118 (decoding-error (vector octet) :utf-8 seq i
119 nil 'end-of-input-in-character)
120 (return (values (1+ nchars) end)))
121 (t
122 (setq nchars (1+ nchars)
123 i next-i)
124 (when (and (plusp max) (= nchars max))
125 (return (values nchars i)))))))
126 finally (progn
127 (assert (= i end))
128 (return (values nchars i))))))
129
130 (define-encoder :utf-8 (getter src-type setter dest-type)
131 `(named-lambda utf-8-encoder (src start end dest d-start)
132 (declare (type ,src-type src)
133 (type ,dest-type dest)
134 (fixnum start end d-start))
135 (loop with di fixnum = d-start
136 for i fixnum from start below end
137 for code of-type code-point = (,getter src i) do
138 (macrolet ((set-octet (offset value)
139 `(,',setter ,value dest (the fixnum (+ di ,offse…
140 (cond
141 ;; 1 octet
142 ((< code #x80)
143 (set-octet 0 code)
144 (incf di))
145 ;; 2 octets
146 ((< code #x800)
147 (set-octet 0 (logior #xc0 (f-ash code -6)))
148 (set-octet 1 (logior #x80 (f-logand code #x3f)))
149 (incf di 2))
150 ;; 3 octets
151 ((< code #x10000)
152 (set-octet 0 (logior #xe0 (f-ash code -12)))
153 (set-octet 1 (logior #x80 (f-logand #x3f (f-ash code -6)…
154 (set-octet 2 (logior #x80 (f-logand code #x3f)))
155 (incf di 3))
156 ;; 4 octets
157 (t
158 (set-octet 0 (logior #xf0 (f-logand #x07 (f-ash code -18…
159 (set-octet 1 (logior #x80 (f-logand #x3f (f-ash code -12…
160 (set-octet 2 (logior #x80 (f-logand #x3f (f-ash code -6)…
161 (set-octet 3 (logior #x80 (logand code #x3f)))
162 (incf di 4))))
163 finally (return (the fixnum (- di d-start))))))
164
165 (define-decoder :utf-8 (getter src-type setter dest-type)
166 `(named-lambda utf-8-decoder (src start end dest d-start)
167 (declare (type ,src-type src)
168 (type ,dest-type dest)
169 (fixnum start end d-start))
170 (let ((u2 0) (u3 0) (u4 0) (u5 0) (u6 0))
171 (declare (type ub8 u2 u3 u4 u5 u6))
172 (loop for di fixnum from d-start
173 for i fixnum from start below end
174 for u1 of-type ub8 = (,getter src i) do
175 ;; Note: CONSUME-OCTET doesn't check if I is being
176 ;; incremented past END. We're assuming that END has
177 ;; been calculated with the CODE-POINT-POINTER above that
178 ;; checks this.
179 (macrolet
180 ((consume-octet ()
181 `(let ((next-i (incf i)))
182 (if (= next-i end)
183 ;; FIXME: data for this error is incomplete.
184 ;; and signalling this error twice
185 (return-from setter-block
186 (decoding-error nil :utf-8 src i +repl+
187 'end-of-input-in-character))
188 (,',getter src next-i))))
189 (handle-error (n &optional (c 'character-decoding-erro…
190 `(decoding-error
191 (vector ,@(subseq '(u1 u2 u3 u4 u5 u6) 0 n))
192 :utf-8 src (1+ (- i ,n)) +repl+ ',c))
193 (handle-error-if-icb (var n)
194 `(when (not (< #x7f ,var #xc0))
195 (decf i)
196 (return-from setter-block
197 (handle-error ,n invalid-utf8-continuation-byte…
198 (,setter
199 (block setter-block
200 (cond
201 ((< u1 #x80) u1) ; 1 octet
202 ((< u1 #xc0)
203 (handle-error 1 invalid-utf8-starter-byte))
204 (t
205 (setq u2 (consume-octet))
206 (handle-error-if-icb u2 1)
207 (cond
208 ((< u1 #xc2)
209 (handle-error 2 overlong-utf8-sequence))
210 ((< u1 #xe0) ; 2 octets
211 (logior (f-ash (f-logand #x1f u1) 6)
212 (f-logxor u2 #x80)))
213 (t
214 (setq u3 (consume-octet))
215 (handle-error-if-icb u3 2)
216 (cond
217 ((and (= u1 #xe0) (< u2 #xa0))
218 (handle-error 3 overlong-utf8-sequence))
219 ((< u1 #xf0) ; 3 octets
220 (let ((start (f-logior (f-ash (f-logand u1 #x…
221 (f-ash (f-logand u2 #x…
222 (if (<= #xd800 start #xdfc0)
223 (handle-error 3 character-out-of-range)
224 (logior start (f-logand u3 #x3f)))))
225 (t ; 4 octets
226 (setq u4 (consume-octet))
227 (handle-error-if-icb u4 3)
228 (cond
229 ((and (= u1 #xf0) (< u2 #x90))
230 (handle-error 4 overlong-utf8-sequence))
231 ((< u1 #xf8)
232 (if (or (> u1 #xf4) (and (= u1 #xf4) (> u2…
233 (handle-error 4 character-out-of-range)
234 (f-logior (f-ash (f-logand u1 7) 18)
235 (f-ash (f-logxor u2 #x80) 12)
236 (f-ash (f-logxor u3 #x80) 6)
237 (f-logxor u4 #x80))))
238 ;; from here on we'll be getting either
239 ;; invalid continuation bytes or overlong
240 ;; 5-byte or 6-byte sequences.
241 (t
242 (setq u5 (consume-octet))
243 (handle-error-if-icb u5 4)
244 (cond
245 ((and (= u1 #xf8) (< u2 #x88))
246 (handle-error 5 overlong-utf8-sequence))
247 ((< u1 #xfc)
248 (handle-error 5 character-out-of-range))
249 (t
250 (setq u6 (consume-octet))
251 (handle-error-if-icb u6 5)
252 (cond
253 ((and (= u1 #xfc) (< u2 #x84))
254 (handle-error 6 overlong-utf8-sequen…
255 (t
256 (handle-error 6 character-out-of-ran…
257 )))))))))))))
258 dest di))
259 finally (return (the fixnum (- di d-start)))))))
260
261 ;;;; UTF-8B
262
263 ;;; The following excerpt from a linux-utf8 message by Markus Kuhn is
264 ;;; the closest thing to a UTF-8B specification:
265 ;;;
266 ;;; <http://mail.nl.linux.org/linux-utf8/2000-07/msg00040.html>
267 ;;;
268 ;;; "D) Emit a malformed UTF-16 sequence for every byte in a malformed
269 ;;; UTF-8 sequence
270 ;;;
271 ;;; All the previous options for converting malformed UTF-8 sequences
272 ;;; to UTF-16 destroy information. This can be highly undesirable in
273 ;;; applications such as text file editors, where guaranteed binary
274 ;;; transparency is a desireable feature. (E.g., I frequently edit
275 ;;; executable code or graphic files with the Emacs text editor and I
276 ;;; hate the idea that my editor might automatically make U+FFFD
277 ;;; substitutions at locations that I haven't even edited when I save
278 ;;; the file again.)
279 ;;;
280 ;;; I therefore suggested 1999-11-02 on the unicode@xxxxxxxxxxx
281 ;;; mailing list the following approach. Instead of using U+FFFD,
282 ;;; simply encode malformed UTF-8 sequences as malformed UTF-16
283 ;;; sequences. Malformed UTF-8 sequences consist excludively of the
284 ;;; bytes 0x80 - 0xff, and each of these bytes can be represented
285 ;;; using a 16-bit value from the UTF-16 low-half surrogate zone
286 ;;; U+DC80 to U+DCFF. Thus, the overlong "K" (U+004B) 0xc1 0x8b from
287 ;;; the above example would be represented in UTF-16 as U+DCC1
288 ;;; U+DC8B. If we simply make sure that every UTF-8 encoded surrogate
289 ;;; character is also treated like a malformed sequence, then there
290 ;;; is no way that a single high-half surrogate could precede the
291 ;;; encoded malformed sequence and cause a valid UTF-16 sequence to
292 ;;; emerge.
293 ;;;
294 ;;; This way 100% binary transparent UTF-8 -> UTF-16 -> UTF-8
295 ;;; round-trip compatibility can be achieved quite easily.
296 ;;;
297 ;;; On an output device, a lonely low-half surrogate character should
298 ;;; be treated just like a character outside the adopted subset of
299 ;;; representable characters, that is for the end user, the display
300 ;;; would look exactly like with semantics B), i.e. one symbol per
301 ;;; byte of a malformed sequence. However in contrast to semantics
302 ;;; B), no information is thrown away, and a cut&paste in an editor
303 ;;; or terminal emulator will be guaranteed to reconstruct the
304 ;;; original byte sequence. This should greatly reduce the incidence
305 ;;; of accidental corruption of binary data by UTF-8 -> UTF-16 ->
306 ;;; UTF-8 conversion round trips."
307
308 (define-character-encoding :utf-8b
309 "An 8-bit, variable-length character encoding in which
310 character code points in the range #x00-#x7f can be encoded in a
311 single octet; characters with larger code values can be encoded
312 in 2 to 4 bytes. Invalid UTF-8 sequences are encoded with #xDCXX
313 code points for each invalid byte."
314 :max-units-per-char 4
315 :literal-char-code-limit #x80
316 :bom-encoding #(#xef #xbb #xbf)
317 :default-replacement nil)
318
319 ;;; TODO: reuse the :UTF-8 octet counter through a simple macro.
320 (define-octet-counter :utf-8b (getter type)
321 `(named-lambda utf-8b-octet-counter (seq start end max)
322 (declare (type ,type seq) (fixnum start end max))
323 (loop with noctets fixnum = 0
324 for i fixnum from start below end
325 for code of-type code-point = (,getter seq i) do
326 (let ((new (+ (cond ((< code #x80) 1)
327 ((< code #x800) 2)
328 ((<= #xdc80 code #xdcff) 1)
329 ((< code #x10000) 3)
330 (t 4))
331 noctets)))
332 (if (and (plusp max) (> new max))
333 (loop-finish)
334 (setq noctets new)))
335 finally (return (values noctets i)))))
336
337 (define-code-point-counter :utf-8b (getter type)
338 `(named-lambda utf-8b-code-point-counter (seq start end max)
339 (declare (type ,type seq) (fixnum start end max))
340 (loop with nchars fixnum = 0
341 with i fixnum = start
342 while (< i end) do
343 ;; wrote this code with LET instead of FOR because CLISP's
344 ;; LOOP doesn't like WHILE clauses before FOR clauses.
345 (let* ((octet (,getter seq i))
346 (noctets (cond ((< octet #x80) 1)
347 ((< octet #xe0) 2)
348 ((< octet #xf0) 3)
349 (t 4))))
350 (declare (type ub8 octet) (fixnum noctets))
351 (cond
352 ((> (+ i noctets) end)
353 ;; If this error is suppressed these last few bytes
354 ;; will be encoded as raw bytes later.
355 (decoding-error (vector octet) :utf-8 seq i
356 nil 'end-of-input-in-character)
357 (return (values (+ nchars (- end i)) end)))
358 (t
359 ;; FIXME: clean this mess up.
360 (let* ((u1 octet)
361 (u2 (if (>= noctets 2) (,getter seq (1+ i)) 0))
362 (u3 (if (>= noctets 3) (,getter seq (+ i 2)) 0))
363 (u4 (if (= noctets 4) (,getter seq (+ i 3)) 0))
364 (inc (or (and (> noctets 1)
365 (< u1 #xc2))
366 (and (= noctets 2)
367 (not (logior u2 #x40)))
368 (and (= noctets 3)
369 (not (and (< (f-logxor u2 #x80) #x4…
370 (< (f-logxor u3 #x80) #x4…
371 (or (>= u1 #xe1) (>= u2 #…
372 (or (/= u1 #xed) (< u2 #x…
373 (and (= noctets 4)
374 (not
375 (and (< (f-logxor u2 #x80) #x40)
376 (< (f-logxor u3 #x80) #x40)
377 (< (f-logxor u4 #x80) #x40)
378 (or (>= u1 #xf1) (>= u2 #x90)…
379 (let ((new-nchars (if inc (+ nchars noctets) (1+ nchar…
380 (when (and (plusp max) (> new-nchars max))
381 (return (values nchars i)))
382 (incf i noctets)
383 (setq nchars new-nchars))))))
384 finally (progn
385 (assert (= i end))
386 (return (values nchars i))))))
387
388 ;;; TODO: reuse the :UTF-8 encoder with through a simple macro.
389 (define-encoder :utf-8b (getter src-type setter dest-type)
390 `(named-lambda utf-8b-encoder (src start end dest d-start)
391 (declare (type ,src-type src)
392 (type ,dest-type dest)
393 (fixnum start end d-start))
394 (loop with di fixnum = d-start
395 for i fixnum from start below end
396 for code of-type code-point = (,getter src i) do
397 (macrolet ((set-octet (offset value)
398 `(,',setter ,value dest (the fixnum (+ di ,offse…
399 (cond
400 ;; 1 octet
401 ((< code #x80)
402 (set-octet 0 code)
403 (incf di))
404 ;; 2 octets
405 ((< code #x800)
406 (set-octet 0 (logior #xc0 (f-ash code -6)))
407 (set-octet 1 (logior #x80 (f-logand code #x3f)))
408 (incf di 2))
409 ;; 1 octet (invalid octet)
410 ((<= #xdc80 code #xdcff)
411 (set-octet 0 (f-logand code #xff))
412 (incf di))
413 ;; 3 octets
414 ((< code #x10000)
415 (set-octet 0 (logior #xe0 (f-ash code -12)))
416 (set-octet 1 (logior #x80 (f-logand #x3f (f-ash code -6)…
417 (set-octet 2 (logior #x80 (f-logand code #x3f)))
418 (incf di 3))
419 ;; 4 octets
420 (t
421 (set-octet 0 (logior #xf0 (f-logand #x07 (f-ash code -18…
422 (set-octet 1 (logior #x80 (f-logand #x3f (f-ash code -12…
423 (set-octet 2 (logior #x80 (f-logand #x3f (f-ash code -6)…
424 (set-octet 3 (logand #x3f code))
425 (incf di 4))))
426 finally (return (the fixnum (- di d-start))))))
427
428 (define-decoder :utf-8b (getter src-type setter dest-type)
429 `(named-lambda utf-8b-decoder (src start end dest d-start)
430 (declare (type ,src-type src)
431 (type ,dest-type dest)
432 (fixnum start end d-start))
433 (let ((u2 0) (u3 0) (u4 0))
434 (declare (type ub8 u2 u3 u4))
435 (loop for di fixnum from d-start
436 for i fixnum from start below end
437 for u1 of-type ub8 = (,getter src i) do
438 ;; Unlike the UTF-8 version, this version of
439 ;; CONSUME-OCTET needs to check if I is being incremented
440 ;; past END because we might have trailing binary
441 ;; garbage.
442 (macrolet
443 ((consume-octet (n)
444 `(if (= i (1- end))
445 (encode-raw-octets ,n)
446 (,',getter src (incf i))))
447 (encode-raw-octets (n)
448 `(progn
449 ,@(loop for i below n and var in '(u1 u2 u3 u4)
450 collect `(,',setter (logior #xdc00 ,var) …
451 unless (= i (1- n))
452 collect '(incf di))
453 (return-from set-body))))
454 (block set-body
455 (,setter (cond
456 ((< u1 #x80) ; 1 octet
457 u1)
458 ((>= u1 #xc2)
459 (setq u2 (consume-octet 1))
460 (cond
461 ((< u1 #xe0) ; 2 octets
462 (if (< (f-logxor u2 #x80) #x40)
463 (logior (f-ash (f-logand #x1f u1) 6)
464 (f-logxor u2 #x80))
465 (encode-raw-octets 2)))
466 (t
467 (setq u3 (consume-octet 2))
468 (cond
469 ((< u1 #xf0) ; 3 octets
470 (if (and (< (f-logxor u2 #x80) #x40)
471 (< (f-logxor u3 #x80) #x40)
472 (or (>= u1 #xe1) (>= u2 #xa0…
473 (let ((start (f-logior (f-ash (f-…
474 (f-ash (f-…
475 (if (<= #xd800 start #xdfc0)
476 (encode-raw-octets 3)
477 (logior start (f-logand u3 …
478 (encode-raw-octets 3)))
479 (t ; 4 octets
480 (setq u4 (consume-octet 3))
481 (if (and (< (f-logxor u2 #x80) #x40)
482 (< (f-logxor u3 #x80) #x40)
483 (< (f-logxor u4 #x80) #x40)
484 (or (>= u1 #xf1) (>= u2 #x90…
485 (logior
486 (f-logior (f-ash (f-logand u1 7)…
487 (f-ash (f-logxor u2 #x…
488 (f-logior (f-ash (f-logxor u3 #x…
489 (f-logxor u4 #x80)))
490 (encode-raw-octets 4)))))))
491 (t (encode-raw-octets 1)))
492 dest di)))
493 finally (return (the fixnum (- di d-start)))))))
494
495 ;;;; UTF-16
496
497 ;;; TODO: add a way to pass some info at compile-time telling us that,
498 ;;; for example, the maximum code-point will always be < #x10000 in
499 ;;; which case we could simply return (* 2 (- end start)).
500 (defmacro utf16-octet-counter (getter type)
501 `(named-lambda utf-16-octet-counter (seq start end max)
502 (declare (type ,type seq) (fixnum start end max))
503 (loop with noctets fixnum = 0
504 for i fixnum from start below end
505 for code of-type code-point = (,getter seq i)
506 do (let ((new (the fixnum (+ (if (< code #x10000) 2 4) noctet…
507 (if (and (plusp max) (> new max))
508 (loop-finish)
509 (setq noctets new)))
510 finally (return (values noctets i)))))
511
512 (defmacro utf-16-combine-surrogate-pairs (u1 u2)
513 `(the (unsigned-byte 21)
514 (+ #x10000
515 (the (unsigned-byte 20)
516 (logior
517 (the (unsigned-byte 20)
518 (ash (the (unsigned-byte 10) (- ,u1 #xd800)) 10))
519 (the (unsigned-byte 10)
520 (- ,u2 #xdc00)))))))
521
522 (defmacro define-utf-16 (name &optional endianness)
523 (check-type endianness (or null (eql :be) (eql :le)))
524 (check-type name keyword)
525 (let ((swap-var (gensym "SWAP"))
526 (code-point-counter-name
527 (format-symbol t '#:~a-code-point-counter (string name)))
528 (encoder-name (format-symbol t '#:~a-encoder (string name)))
529 (decoder-name (format-symbol t '#:~a-decoder (string name))))
530 (labels ((make-bom-check-form (end start getter seq)
531 (if (null endianness)
532 ``((,',swap-var
533 (when (> ,,end ,,start)
534 (case (,,getter ,,seq ,,start 2 :ne)
535 (#.+byte-order-mark-code+ (incf ,,start 2) nil)
536 (#.+swapped-byte-order-mark-code+ (incf ,,star…
537 (t #+little-endian t)))))
538 '()))
539 (make-getter-form (getter src i)
540 (case endianness
541 (:le ``(,,getter ,,src ,,i 2 :le))
542 (:be ``(,,getter ,,src ,,i 2 :be))
543 (t ``(if ,',swap-var
544 (,,getter ,,src ,,i 2 :re)
545 (,,getter ,,src ,,i 2 :ne)))))
546 (make-setter-form (setter code dest di)
547 (case endianness
548 (:be ``(,,setter ,,code ,,dest ,,di 2 :be))
549 (:le ``(,,setter ,,code ,,dest ,,di 2 :le))
550 (t ``(,,setter ,,code ,,dest ,,di 2 :ne)))))
551 `(progn
552 (define-octet-counter ,name (getter type)
553 `(utf16-octet-counter ,getter ,type))
554 (define-code-point-counter ,name (getter type)
555 `(named-lambda ,',code-point-counter-name (seq start end max)
556 (declare (type ,type seq) (fixnum start end max))
557 (let* ,,(make-bom-check-form ''end ''start 'getter ''seq)
558 (loop with count fixnum = 0
559 with i fixnum = start
560 while (<= i (- end 2)) do
561 (let* ((code ,,(make-getter-form 'getter ''seq ''i))
562 (next-i (+ i (if (or (< code #xd800) (>= code #…
563 2
564 4))))
565 (declare (type (unsigned-byte 16) code) (fixnum next…
566 (cond
567 ((> next-i end)
568 (decoding-error
569 (vector (,getter seq i) (,getter seq (1+ i)))
570 ,',name seq i nil 'end-of-input-in-character)
571 (return (values count i)))
572 (t
573 (setq i next-i
574 count (1+ count))
575 (when (and (plusp max) (= count max))
576 (return (values count i))))))
577 finally (progn
578 (assert (= i end))
579 (return (values count i)))))))
580 (define-encoder ,name (getter src-type setter dest-type)
581 `(named-lambda ,',encoder-name (src start end dest d-start)
582 (declare (type ,src-type src)
583 (type ,dest-type dest)
584 (fixnum start end d-start))
585 (loop with di fixnum = d-start
586 for i fixnum from start below end
587 for code of-type code-point = (,getter src i)
588 for high-bits fixnum = (- code #x10000) do
589 (cond ((< high-bits 0)
590 ,,(make-setter-form 'setter ''code ''dest ''di)
591 (incf di 2))
592 (t
593 ,,(make-setter-form
594 'setter ''(logior #xd800 (f-ash high-bits …
595 ''dest ''di)
596 ,,(make-setter-form
597 'setter ''(logior #xdc00 (f-logand high-bi…
598 ''dest ''(+ di 2))
599 (incf di 4)))
600 finally (return (the fixnum (- di d-start))))))
601 (define-decoder ,name (getter src-type setter dest-type)
602 `(named-lambda ,',decoder-name (src start end dest d-start)
603 (declare (type ,src-type src)
604 (type ,dest-type dest)
605 (fixnum start end d-start))
606 (let ,,(make-bom-check-form ''end ''start 'getter ''src)
607 (loop with i fixnum = start
608 for di fixnum from d-start
609 until (= i end) do
610 (let ((u1 ,,(make-getter-form 'getter ''src ''i)))
611 (declare (type (unsigned-byte 16) u1))
612 (incf i 2)
613 (,setter (cond
614 ((or (< u1 #xd800) (>= u1 #xe000)) ; 2 oc…
615 u1)
616 ((< u1 #xdc00) ; 4 octets
617 (let ((u2 ,,(make-getter-form 'getter ''…
618 (declare (type (unsigned-byte 16) u2))
619 (incf i 2)
620 (if (and (>= u2 #xdc00) (< u2 #xe000))
621 (utf-16-combine-surrogate-pairs u1 u…
622 (decoding-error
623 (vector (,getter src (- i 4))
624 (,getter src (- i 3))
625 (,getter src (- i 2))
626 (,getter src (- i 1)))
627 ,',name src i +repl+))))
628 (t
629 (decoding-error (vector (,getter src (-…
630 (,getter src (-…
631 ,',name src i +repl+)))
632 dest di))
633 finally (return (the fixnum (- di d-start)))))))
634 ',name))))
635
636 (define-character-encoding :utf-16
637 "A 16-bit, variable-length encoding in which characters with
638 code points less than #x10000 can be encoded in a single 16-bit
639 word and characters with larger codes can be encoded in a pair of
640 16-bit words. The endianness of the encoded data is indicated by
641 the endianness of a byte-order-mark character (#\u+feff)
642 prepended to the data; in the absence of such a character on
643 input, the data is assumed to be in big-endian order. Output is
644 written in native byte-order with a leading byte-order mark."
645 :max-units-per-char 2
646 :code-unit-size 16
647 :native-endianness t ; not necessarily true when decoding
648 :decode-literal-code-unit-limit #xd800
649 :encode-literal-code-unit-limit #x10000
650 :use-bom #+big-endian :utf-16be #+little-endian :utf-16le
651 :bom-encoding #+big-endian #(#xfe #xff) #+little-endian #(#xff #xfe)
652 :nul-encoding #(0 0)
653 :default-replacement #xfffd
654 :ambiguous #+little-endian t #+big-endian nil)
655
656 (define-utf-16 :utf-16)
657
658 (define-character-encoding :utf-16le
659 "A 16-bit, variable-length encoding in which characters with
660 code points less than #x10000 can be encoded in a single 16-bit
661 word and characters with larger codes can be encoded in a pair of
662 16-bit words. The data is assumed to be in little-endian order. Output…
663 written in little-endian byte-order without a leading byte-order mark."
664 :aliases '(:utf-16/le)
665 :max-units-per-char 2
666 :code-unit-size 16
667 :native-endianness #+little-endian t #+big-endian nil
668 :decode-literal-code-unit-limit #xd800
669 :encode-literal-code-unit-limit #x10000
670 :nul-encoding #(0 0)
671 :default-replacement #xfffd)
672
673 (define-utf-16 :utf-16le :le)
674
675 (define-character-encoding :utf-16be
676 "A 16-bit, variable-length encoding in which characters with
677 code points less than #x10000 can be encoded in a single 16-bit
678 word and characters with larger codes can be encoded in a pair of
679 16-bit words. The data is assumed to be in big-endian order. Output is
680 written in big-endian byte-order without a leading byte-order mark."
681 :aliases '(:utf-16/be)
682 :max-units-per-char 2
683 :code-unit-size 16
684 :native-endianness #+little-endian nil #+big-endian t
685 :decode-literal-code-unit-limit #xd800
686 :encode-literal-code-unit-limit #x10000
687 :nul-encoding #(0 0)
688 :default-replacement #xfffd)
689
690 (define-utf-16 :utf-16be :be)
691
692 (defmacro define-ucs (name bytes &optional endianness (limit #x110000))
693 (check-type name keyword)
694 (check-type bytes (or (eql 2) (eql 4)))
695 (check-type endianness (or null (eql :le) (eql :be)))
696 (let ((swap-var (gensym "SWAP"))
697 (code-point-counter-name
698 (format-symbol t '#:~a-code-point-counter (string name)))
699 (encoder-name
700 (format-symbol t '#:~a-encoder (string name)))
701 (decoder-name
702 (format-symbol t '#:~a-decoder (string name))))
703 (labels ((make-bom-check-form (end start getter src)
704 (if (null endianness)
705 ``(when (not (zerop (- ,,end ,,start)))
706 (case (,,getter ,,src 0 ,',bytes :ne)
707 (#.+byte-order-mark-code+
708 (incf ,,start ,',bytes) nil)
709 (#.+swapped-byte-order-mark-code-32+
710 (incf ,,start ,',bytes) t)
711 (t #+little-endian t)))
712 '()))
713 (make-setter-form (setter code dest di)
714 ``(,,setter ,,code ,,dest ,,di ,',bytes
715 ,',(or endianness :ne)))
716 (make-getter-form (getter src i)
717 (if (null endianness)
718 ``(if ,',swap-var
719 (,,getter ,,src ,,i ,',bytes :re)
720 (,,getter ,,src ,,i ,',bytes :ne))
721 ``(,,getter ,,src ,,i ,',bytes ,',endianness))))
722 `(progn
723 (define-code-point-counter ,name (getter type)
724 `(named-lambda ,',code-point-counter-name (seq start end max)
725 (declare (type ,type seq) (fixnum start end max))
726 ;; check for bom
727 ,,(make-bom-check-form ''end ''start 'getter ''seq)
728 (multiple-value-bind (count rem)
729 (floor (- end start) ,',bytes)
730 (cond
731 ((and (plusp max) (> count max))
732 (values max (the fixnum (+ start (* ,',bytes max)))))
733 (t
734 ;; check for incomplete last character
735 (unless (zerop rem)
736 (let ((vector (make-array ,',bytes :fill-pointer 0)…
737 (dotimes (i rem)
738 (vector-push (,getter seq (+ i (- end rem))) ve…
739 (decoding-error vector ,',name seq (the fixnum (-…
740 'end-of-input-in-character)
741 (decf end rem)))
742 (values count end))))))
743 (define-encoder ,name (getter src-type setter dest-type)
744 `(named-lambda ,',encoder-name (src start end dest d-start)
745 (declare (type ,src-type src)
746 (type ,dest-type dest)
747 (fixnum start end d-start))
748 (loop for i fixnum from start below end
749 and di fixnum from d-start by ,',bytes
750 for code of-type code-point = (,getter src i)
751 do (if (>= code ,',limit)
752 (encoding-error code ,',name src i +repl+)
753 ,,(make-setter-form 'setter ''code ''dest ''d…
754 finally (return (the fixnum (- di d-start))))))
755 (define-decoder ,name (getter src-type setter dest-type)
756 `(named-lambda ,',decoder-name (src start end dest d-start)
757 (declare (type ,src-type src)
758 (type ,dest-type dest)
759 (fixnum start end d-start))
760 (let ((,',swap-var ,,(make-bom-check-form ''end ''start 'g…
761 (declare (ignorable ,',swap-var))
762 (loop for i fixnum from start below end by ,',bytes
763 and di from d-start
764 do (,setter (let ((unit ,,(make-getter-form 'gette…
765 (if (>= unit ,',limit)
766 (decoding-error
767 (vector (,getter src i)
768 (,getter src (+ i 1))
769 ,@,(if (= bytes 4)
770 ``((,getter src …
771 (,getter src …
772 ,',name src i +repl+
773 'character-out-of-range)
774 unit))
775 dest di)
776 finally (return (the fixnum (- di d-start)))))))
777 ',name))))
778
779 ;;;; UTF-32
780
781 (define-character-encoding :utf-32
782 "A 32-bit, fixed-length encoding in which all Unicode
783 characters can be encoded in a single 32-bit word. The
784 endianness of the encoded data is indicated by the endianness of
785 a byte-order-mark character (#\u+feff) prepended to the data; in
786 the absence of such a character on input, input data is assumed
787 to be in big-endian order. Output is written in native byte
788 order with a leading byte-order mark."
789 :aliases '(:ucs-4)
790 :max-units-per-char 1
791 :code-unit-size 32
792 :native-endianness t ; not necessarily true when decoding
793 :literal-char-code-limit #x110000
794 :use-bom #+little-endian :utf-32le #+big-endian :utf-32be
795 :bom-encoding
796 #+big-endian #(#x00 #x00 #xfe #xff)
797 #+little-endian #(#xff #xfe #x00 #x00)
798 :nul-encoding #(0 0 0 0)
799 :ambiguous #+little-endian t #+big-endian nil)
800
801 (define-ucs :utf-32 4)
802
803 (define-character-encoding :utf-32le
804 "A 32-bit, fixed-length encoding in which all Unicode
805 characters can be encoded in a single 32-bit word. Input data is assumed
806 to be in little-endian order. Output is also written in little-endian b…
807 order without a leading byte-order mark."
808 :max-units-per-char 1
809 :code-unit-size 32
810 :aliases '(:utf-32/le :ucs-4le :ucs-4/le)
811 :native-endianness #+little-endian t #+big-endian nil
812 :literal-char-code-limit #x110000
813 :nul-encoding #(0 0 0 0))
814
815 (define-ucs :utf-32le 4 :le)
816
817 (define-character-encoding :utf-32be
818 "A 32-bit, fixed-length encoding in which all Unicode
819 characters can be encoded in a single 32-bit word. Input data is assumed
820 to be in big-endian order. Output is also written in big-endian byte
821 order without a leading byte-order mark."
822 :max-units-per-char 1
823 :code-unit-size 32
824 :aliases '(:utf-32/be :ucs-4be :ucs-4/be)
825 :native-endianness #+little-endian nil #+big-endian t
826 :literal-char-code-limit #x110000
827 :nul-encoding #(0 0 0 0))
828
829 (define-ucs :utf-32be 4 :be)
830
831 ;; UCS-2
832
833 (define-character-encoding :ucs-2
834 "A 16-bit, fixed-length encoding in which all Unicode
835 characters can be encoded in a single 16-bit word. The
836 endianness of the encoded data is indicated by the endianness of
837 a byte-order-mark character (#\u+feff) prepended to the data; in
838 the absence of such a character on input, input data is assumed
839 to be in big-endian order. Output is written in native byte
840 order with a leading byte-order mark."
841 :aliases '(:ucs-2)
842 :max-units-per-char 1
843 :code-unit-size 16
844 :native-endianness t ; not necessarily true when decoding
845 :literal-char-code-limit #x10000
846 :use-bom #+little-endian :ucs-2le #+big-endian :ucs-2be
847 :bom-encoding
848 #+big-endian #(#xfe #xff)
849 #+little-endian #(#xff #xfe)
850 :nul-encoding #(0 0)
851 :ambiguous #+little-endian t #+big-endian nil)
852
853 (define-ucs :ucs-2 2 nil #x10000)
854
855 (define-character-encoding :ucs-2le
856 "A 16-bit, fixed-length encoding in which all Unicode
857 characters can be encoded in a single 16-bit word. Input data is assumed
858 to be in little-endian order. Output is also written in little-endian b…
859 order without a leading byte-order mark."
860 :max-units-per-char 1
861 :code-unit-size 16
862 :aliases '(:ucs-2/le)
863 :native-endianness #+little-endian t #+big-endian nil
864 :literal-char-code-limit #x10000
865 :nul-encoding #(0 0))
866
867 (define-ucs :ucs-2le 2 :le #x10000)
868
869 (define-character-encoding :ucs-2be
870 "A 16-bit, fixed-length encoding in which all Unicode
871 characters can be encoded in a single 16-bit word. Input data is assumed
872 to be in big-endian order. Output is also written in big-endian byte
873 order without a leading byte-order mark."
874 :max-units-per-char 1
875 :code-unit-size 16
876 :aliases '(:ucs-2/be)
877 :native-endianness #+little-endian nil #+big-endian t
878 :literal-char-code-limit #x10000
879 :nul-encoding #(0 0))
880
881 (define-ucs :ucs-2be 2 :be #x10000)
You are viewing proxied material from bitreich.org. The copyright of proxied material belongs to its original authors. Any comments or complaints in relation to proxied material should be directed to the original authors of the content concerned. Please see the disclaimer for more details.