enc-unicode.lisp - clic - Clic is an command line interactive client for gopher… | |
git clone git://bitreich.org/clic/ git://enlrupgkhuxnvlhsf6lc3fziv5h2hhfrinws65… | |
Log | |
Files | |
Refs | |
Tags | |
README | |
LICENSE | |
--- | |
enc-unicode.lisp (42416B) | |
--- | |
1 ;;;; -*- Mode: lisp; indent-tabs-mode: nil -*- | |
2 ;;; | |
3 ;;; enc-unicode.lisp --- Unicode encodings. | |
4 ;;; | |
5 ;;; Copyright (C) 2007, Luis Oliveira <[email protected]> | |
6 ;;; | |
7 ;;; Permission is hereby granted, free of charge, to any person | |
8 ;;; obtaining a copy of this software and associated documentation | |
9 ;;; files (the "Software"), to deal in the Software without | |
10 ;;; restriction, including without limitation the rights to use, copy, | |
11 ;;; modify, merge, publish, distribute, sublicense, and/or sell copies | |
12 ;;; of the Software, and to permit persons to whom the Software is | |
13 ;;; furnished to do so, subject to the following conditions: | |
14 ;;; | |
15 ;;; The above copyright notice and this permission notice shall be | |
16 ;;; included in all copies or substantial portions of the Software. | |
17 ;;; | |
18 ;;; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | |
19 ;;; EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | |
20 ;;; MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | |
21 ;;; NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT | |
22 ;;; HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, | |
23 ;;; WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
24 ;;; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER | |
25 ;;; DEALINGS IN THE SOFTWARE. | |
26 | |
27 ;;; This implementation is largely based on OpenMCL's l1-unicode.lisp | |
28 ;;; Copyright (C) 2006 Clozure Associates and contributors. | |
29 | |
30 (in-package #:babel-encodings) | |
31 | |
32 (eval-when (:compile-toplevel :load-toplevel :execute) | |
33 (defconstant +repl+ #xfffd "Unicode replacement character code point.") | |
34 (defconstant +byte-order-mark-code+ #xfeff) | |
35 (defconstant +swapped-byte-order-mark-code+ #xfffe) | |
36 (defconstant +swapped-byte-order-mark-code-32+ #xfffe0000)) | |
37 | |
38 ;;; Some convenience macros adding FIXNUM declarations. | |
39 (defmacro f-ash (integer count) `(the fixnum (ash ,integer ,count))) | |
40 (defmacro f-logior (&rest integers) `(the fixnum (logior ,@integers))) | |
41 (defmacro f-logand (&rest integers) `(the fixnum (logand ,@integers))) | |
42 (defmacro f-logxor (&rest integers) `(the fixnum (logxor ,@integers))) | |
43 | |
44 ;;;; UTF-8 | |
45 | |
46 (define-character-encoding :utf-8 | |
47 "An 8-bit, variable-length character encoding in which | |
48 character code points in the range #x00-#x7f can be encoded in a | |
49 single octet; characters with larger code values can be encoded | |
50 in 2 to 4 bytes." | |
51 :max-units-per-char 4 | |
52 :literal-char-code-limit #x80 | |
53 :bom-encoding #(#xef #xbb #xbf) | |
54 :default-replacement #xfffd) | |
55 | |
56 (define-condition invalid-utf8-starter-byte (character-decoding-error) | |
57 () | |
58 (:documentation "Signalled when an invalid UTF-8 starter byte is found… | |
59 | |
60 (define-condition invalid-utf8-continuation-byte (character-decoding-err… | |
61 () | |
62 (:documentation | |
63 "Signalled when an invalid UTF-8 continuation byte is found.")) | |
64 | |
65 (define-condition overlong-utf8-sequence (character-decoding-error) | |
66 () | |
67 (:documentation "Signalled upon overlong UTF-8 sequences.")) | |
68 | |
69 (define-octet-counter :utf-8 (getter type) | |
70 `(named-lambda utf-8-octet-counter (seq start end max) | |
71 (declare (type ,type seq) (fixnum start end max)) | |
72 (loop with noctets fixnum = 0 | |
73 for i fixnum from start below end | |
74 for code of-type code-point = (,getter seq i) do | |
75 (let ((new (+ (cond ((< code #x80) 1) | |
76 ((< code #x800) 2) | |
77 ((< code #x10000) 3) | |
78 (t 4)) | |
79 noctets))) | |
80 (if (and (plusp max) (> new max)) | |
81 (loop-finish) | |
82 (setq noctets new))) | |
83 finally (return (values noctets i))))) | |
84 | |
85 (define-code-point-counter :utf-8 (getter type) | |
86 `(named-lambda utf-8-code-point-counter (seq start end max) | |
87 (declare (type ,type seq) (fixnum start end max)) | |
88 (loop with nchars fixnum = 0 | |
89 with i fixnum = start | |
90 while (< i end) do | |
91 ;; check for invalid continuation bytes | |
92 (macrolet ((invalid-cb-p (n) | |
93 `(and (< (+ i ,n) end) | |
94 (not (< #x7f (,',getter seq (+ i ,n)) #xc0… | |
95 ;; wrote this code with LET instead of FOR because CLISP's | |
96 ;; LOOP doesn't like WHILE clauses before FOR clauses. | |
97 (let* ((octet (,getter seq i)) | |
98 (next-i (+ i (cond ((or (< octet #xc0) (invalid-cb-p… | |
99 ((or (< octet #xe0) (invalid-cb-p… | |
100 ((or (< octet #xf0) (invalid-cb-p… | |
101 ((or (< octet #xf8) (invalid-cb-p… | |
102 ((or (< octet #xfc) (invalid-cb-p… | |
103 (t 6))))) | |
104 (declare (type ub8 octet) (fixnum next-i)) | |
105 (cond | |
106 ((> next-i end) | |
107 ;; Should we add restarts to this error, we'll have | |
108 ;; to figure out a way to communicate with the | |
109 ;; decoder since we probably want to do something | |
110 ;; about it right here when we have a chance to | |
111 ;; change the count or something. (Like an | |
112 ;; alternative replacement character or perhaps the | |
113 ;; existence of this error so that the decoder | |
114 ;; doesn't have to check for it on every iteration | |
115 ;; like we do.) | |
116 ;; | |
117 ;; FIXME: The data for this error is not right. | |
118 (decoding-error (vector octet) :utf-8 seq i | |
119 nil 'end-of-input-in-character) | |
120 (return (values (1+ nchars) end))) | |
121 (t | |
122 (setq nchars (1+ nchars) | |
123 i next-i) | |
124 (when (and (plusp max) (= nchars max)) | |
125 (return (values nchars i))))))) | |
126 finally (progn | |
127 (assert (= i end)) | |
128 (return (values nchars i)))))) | |
129 | |
130 (define-encoder :utf-8 (getter src-type setter dest-type) | |
131 `(named-lambda utf-8-encoder (src start end dest d-start) | |
132 (declare (type ,src-type src) | |
133 (type ,dest-type dest) | |
134 (fixnum start end d-start)) | |
135 (loop with di fixnum = d-start | |
136 for i fixnum from start below end | |
137 for code of-type code-point = (,getter src i) do | |
138 (macrolet ((set-octet (offset value) | |
139 `(,',setter ,value dest (the fixnum (+ di ,offse… | |
140 (cond | |
141 ;; 1 octet | |
142 ((< code #x80) | |
143 (set-octet 0 code) | |
144 (incf di)) | |
145 ;; 2 octets | |
146 ((< code #x800) | |
147 (set-octet 0 (logior #xc0 (f-ash code -6))) | |
148 (set-octet 1 (logior #x80 (f-logand code #x3f))) | |
149 (incf di 2)) | |
150 ;; 3 octets | |
151 ((< code #x10000) | |
152 (set-octet 0 (logior #xe0 (f-ash code -12))) | |
153 (set-octet 1 (logior #x80 (f-logand #x3f (f-ash code -6)… | |
154 (set-octet 2 (logior #x80 (f-logand code #x3f))) | |
155 (incf di 3)) | |
156 ;; 4 octets | |
157 (t | |
158 (set-octet 0 (logior #xf0 (f-logand #x07 (f-ash code -18… | |
159 (set-octet 1 (logior #x80 (f-logand #x3f (f-ash code -12… | |
160 (set-octet 2 (logior #x80 (f-logand #x3f (f-ash code -6)… | |
161 (set-octet 3 (logior #x80 (logand code #x3f))) | |
162 (incf di 4)))) | |
163 finally (return (the fixnum (- di d-start)))))) | |
164 | |
165 (define-decoder :utf-8 (getter src-type setter dest-type) | |
166 `(named-lambda utf-8-decoder (src start end dest d-start) | |
167 (declare (type ,src-type src) | |
168 (type ,dest-type dest) | |
169 (fixnum start end d-start)) | |
170 (let ((u2 0) (u3 0) (u4 0) (u5 0) (u6 0)) | |
171 (declare (type ub8 u2 u3 u4 u5 u6)) | |
172 (loop for di fixnum from d-start | |
173 for i fixnum from start below end | |
174 for u1 of-type ub8 = (,getter src i) do | |
175 ;; Note: CONSUME-OCTET doesn't check if I is being | |
176 ;; incremented past END. We're assuming that END has | |
177 ;; been calculated with the CODE-POINT-POINTER above that | |
178 ;; checks this. | |
179 (macrolet | |
180 ((consume-octet () | |
181 `(let ((next-i (incf i))) | |
182 (if (= next-i end) | |
183 ;; FIXME: data for this error is incomplete. | |
184 ;; and signalling this error twice | |
185 (return-from setter-block | |
186 (decoding-error nil :utf-8 src i +repl+ | |
187 'end-of-input-in-character)) | |
188 (,',getter src next-i)))) | |
189 (handle-error (n &optional (c 'character-decoding-erro… | |
190 `(decoding-error | |
191 (vector ,@(subseq '(u1 u2 u3 u4 u5 u6) 0 n)) | |
192 :utf-8 src (1+ (- i ,n)) +repl+ ',c)) | |
193 (handle-error-if-icb (var n) | |
194 `(when (not (< #x7f ,var #xc0)) | |
195 (decf i) | |
196 (return-from setter-block | |
197 (handle-error ,n invalid-utf8-continuation-byte… | |
198 (,setter | |
199 (block setter-block | |
200 (cond | |
201 ((< u1 #x80) u1) ; 1 octet | |
202 ((< u1 #xc0) | |
203 (handle-error 1 invalid-utf8-starter-byte)) | |
204 (t | |
205 (setq u2 (consume-octet)) | |
206 (handle-error-if-icb u2 1) | |
207 (cond | |
208 ((< u1 #xc2) | |
209 (handle-error 2 overlong-utf8-sequence)) | |
210 ((< u1 #xe0) ; 2 octets | |
211 (logior (f-ash (f-logand #x1f u1) 6) | |
212 (f-logxor u2 #x80))) | |
213 (t | |
214 (setq u3 (consume-octet)) | |
215 (handle-error-if-icb u3 2) | |
216 (cond | |
217 ((and (= u1 #xe0) (< u2 #xa0)) | |
218 (handle-error 3 overlong-utf8-sequence)) | |
219 ((< u1 #xf0) ; 3 octets | |
220 (let ((start (f-logior (f-ash (f-logand u1 #x… | |
221 (f-ash (f-logand u2 #x… | |
222 (if (<= #xd800 start #xdfc0) | |
223 (handle-error 3 character-out-of-range) | |
224 (logior start (f-logand u3 #x3f))))) | |
225 (t ; 4 octets | |
226 (setq u4 (consume-octet)) | |
227 (handle-error-if-icb u4 3) | |
228 (cond | |
229 ((and (= u1 #xf0) (< u2 #x90)) | |
230 (handle-error 4 overlong-utf8-sequence)) | |
231 ((< u1 #xf8) | |
232 (if (or (> u1 #xf4) (and (= u1 #xf4) (> u2… | |
233 (handle-error 4 character-out-of-range) | |
234 (f-logior (f-ash (f-logand u1 7) 18) | |
235 (f-ash (f-logxor u2 #x80) 12) | |
236 (f-ash (f-logxor u3 #x80) 6) | |
237 (f-logxor u4 #x80)))) | |
238 ;; from here on we'll be getting either | |
239 ;; invalid continuation bytes or overlong | |
240 ;; 5-byte or 6-byte sequences. | |
241 (t | |
242 (setq u5 (consume-octet)) | |
243 (handle-error-if-icb u5 4) | |
244 (cond | |
245 ((and (= u1 #xf8) (< u2 #x88)) | |
246 (handle-error 5 overlong-utf8-sequence)) | |
247 ((< u1 #xfc) | |
248 (handle-error 5 character-out-of-range)) | |
249 (t | |
250 (setq u6 (consume-octet)) | |
251 (handle-error-if-icb u6 5) | |
252 (cond | |
253 ((and (= u1 #xfc) (< u2 #x84)) | |
254 (handle-error 6 overlong-utf8-sequen… | |
255 (t | |
256 (handle-error 6 character-out-of-ran… | |
257 ))))))))))))) | |
258 dest di)) | |
259 finally (return (the fixnum (- di d-start))))))) | |
260 | |
261 ;;;; UTF-8B | |
262 | |
263 ;;; The following excerpt from a linux-utf8 message by Markus Kuhn is | |
264 ;;; the closest thing to a UTF-8B specification: | |
265 ;;; | |
266 ;;; <http://mail.nl.linux.org/linux-utf8/2000-07/msg00040.html> | |
267 ;;; | |
268 ;;; "D) Emit a malformed UTF-16 sequence for every byte in a malformed | |
269 ;;; UTF-8 sequence | |
270 ;;; | |
271 ;;; All the previous options for converting malformed UTF-8 sequences | |
272 ;;; to UTF-16 destroy information. This can be highly undesirable in | |
273 ;;; applications such as text file editors, where guaranteed binary | |
274 ;;; transparency is a desireable feature. (E.g., I frequently edit | |
275 ;;; executable code or graphic files with the Emacs text editor and I | |
276 ;;; hate the idea that my editor might automatically make U+FFFD | |
277 ;;; substitutions at locations that I haven't even edited when I save | |
278 ;;; the file again.) | |
279 ;;; | |
280 ;;; I therefore suggested 1999-11-02 on the unicode@xxxxxxxxxxx | |
281 ;;; mailing list the following approach. Instead of using U+FFFD, | |
282 ;;; simply encode malformed UTF-8 sequences as malformed UTF-16 | |
283 ;;; sequences. Malformed UTF-8 sequences consist excludively of the | |
284 ;;; bytes 0x80 - 0xff, and each of these bytes can be represented | |
285 ;;; using a 16-bit value from the UTF-16 low-half surrogate zone | |
286 ;;; U+DC80 to U+DCFF. Thus, the overlong "K" (U+004B) 0xc1 0x8b from | |
287 ;;; the above example would be represented in UTF-16 as U+DCC1 | |
288 ;;; U+DC8B. If we simply make sure that every UTF-8 encoded surrogate | |
289 ;;; character is also treated like a malformed sequence, then there | |
290 ;;; is no way that a single high-half surrogate could precede the | |
291 ;;; encoded malformed sequence and cause a valid UTF-16 sequence to | |
292 ;;; emerge. | |
293 ;;; | |
294 ;;; This way 100% binary transparent UTF-8 -> UTF-16 -> UTF-8 | |
295 ;;; round-trip compatibility can be achieved quite easily. | |
296 ;;; | |
297 ;;; On an output device, a lonely low-half surrogate character should | |
298 ;;; be treated just like a character outside the adopted subset of | |
299 ;;; representable characters, that is for the end user, the display | |
300 ;;; would look exactly like with semantics B), i.e. one symbol per | |
301 ;;; byte of a malformed sequence. However in contrast to semantics | |
302 ;;; B), no information is thrown away, and a cut&paste in an editor | |
303 ;;; or terminal emulator will be guaranteed to reconstruct the | |
304 ;;; original byte sequence. This should greatly reduce the incidence | |
305 ;;; of accidental corruption of binary data by UTF-8 -> UTF-16 -> | |
306 ;;; UTF-8 conversion round trips." | |
307 | |
308 (define-character-encoding :utf-8b | |
309 "An 8-bit, variable-length character encoding in which | |
310 character code points in the range #x00-#x7f can be encoded in a | |
311 single octet; characters with larger code values can be encoded | |
312 in 2 to 4 bytes. Invalid UTF-8 sequences are encoded with #xDCXX | |
313 code points for each invalid byte." | |
314 :max-units-per-char 4 | |
315 :literal-char-code-limit #x80 | |
316 :bom-encoding #(#xef #xbb #xbf) | |
317 :default-replacement nil) | |
318 | |
319 ;;; TODO: reuse the :UTF-8 octet counter through a simple macro. | |
320 (define-octet-counter :utf-8b (getter type) | |
321 `(named-lambda utf-8b-octet-counter (seq start end max) | |
322 (declare (type ,type seq) (fixnum start end max)) | |
323 (loop with noctets fixnum = 0 | |
324 for i fixnum from start below end | |
325 for code of-type code-point = (,getter seq i) do | |
326 (let ((new (+ (cond ((< code #x80) 1) | |
327 ((< code #x800) 2) | |
328 ((<= #xdc80 code #xdcff) 1) | |
329 ((< code #x10000) 3) | |
330 (t 4)) | |
331 noctets))) | |
332 (if (and (plusp max) (> new max)) | |
333 (loop-finish) | |
334 (setq noctets new))) | |
335 finally (return (values noctets i))))) | |
336 | |
337 (define-code-point-counter :utf-8b (getter type) | |
338 `(named-lambda utf-8b-code-point-counter (seq start end max) | |
339 (declare (type ,type seq) (fixnum start end max)) | |
340 (loop with nchars fixnum = 0 | |
341 with i fixnum = start | |
342 while (< i end) do | |
343 ;; wrote this code with LET instead of FOR because CLISP's | |
344 ;; LOOP doesn't like WHILE clauses before FOR clauses. | |
345 (let* ((octet (,getter seq i)) | |
346 (noctets (cond ((< octet #x80) 1) | |
347 ((< octet #xe0) 2) | |
348 ((< octet #xf0) 3) | |
349 (t 4)))) | |
350 (declare (type ub8 octet) (fixnum noctets)) | |
351 (cond | |
352 ((> (+ i noctets) end) | |
353 ;; If this error is suppressed these last few bytes | |
354 ;; will be encoded as raw bytes later. | |
355 (decoding-error (vector octet) :utf-8 seq i | |
356 nil 'end-of-input-in-character) | |
357 (return (values (+ nchars (- end i)) end))) | |
358 (t | |
359 ;; FIXME: clean this mess up. | |
360 (let* ((u1 octet) | |
361 (u2 (if (>= noctets 2) (,getter seq (1+ i)) 0)) | |
362 (u3 (if (>= noctets 3) (,getter seq (+ i 2)) 0)) | |
363 (u4 (if (= noctets 4) (,getter seq (+ i 3)) 0)) | |
364 (inc (or (and (> noctets 1) | |
365 (< u1 #xc2)) | |
366 (and (= noctets 2) | |
367 (not (logior u2 #x40))) | |
368 (and (= noctets 3) | |
369 (not (and (< (f-logxor u2 #x80) #x4… | |
370 (< (f-logxor u3 #x80) #x4… | |
371 (or (>= u1 #xe1) (>= u2 #… | |
372 (or (/= u1 #xed) (< u2 #x… | |
373 (and (= noctets 4) | |
374 (not | |
375 (and (< (f-logxor u2 #x80) #x40) | |
376 (< (f-logxor u3 #x80) #x40) | |
377 (< (f-logxor u4 #x80) #x40) | |
378 (or (>= u1 #xf1) (>= u2 #x90)… | |
379 (let ((new-nchars (if inc (+ nchars noctets) (1+ nchar… | |
380 (when (and (plusp max) (> new-nchars max)) | |
381 (return (values nchars i))) | |
382 (incf i noctets) | |
383 (setq nchars new-nchars)))))) | |
384 finally (progn | |
385 (assert (= i end)) | |
386 (return (values nchars i)))))) | |
387 | |
388 ;;; TODO: reuse the :UTF-8 encoder with through a simple macro. | |
389 (define-encoder :utf-8b (getter src-type setter dest-type) | |
390 `(named-lambda utf-8b-encoder (src start end dest d-start) | |
391 (declare (type ,src-type src) | |
392 (type ,dest-type dest) | |
393 (fixnum start end d-start)) | |
394 (loop with di fixnum = d-start | |
395 for i fixnum from start below end | |
396 for code of-type code-point = (,getter src i) do | |
397 (macrolet ((set-octet (offset value) | |
398 `(,',setter ,value dest (the fixnum (+ di ,offse… | |
399 (cond | |
400 ;; 1 octet | |
401 ((< code #x80) | |
402 (set-octet 0 code) | |
403 (incf di)) | |
404 ;; 2 octets | |
405 ((< code #x800) | |
406 (set-octet 0 (logior #xc0 (f-ash code -6))) | |
407 (set-octet 1 (logior #x80 (f-logand code #x3f))) | |
408 (incf di 2)) | |
409 ;; 1 octet (invalid octet) | |
410 ((<= #xdc80 code #xdcff) | |
411 (set-octet 0 (f-logand code #xff)) | |
412 (incf di)) | |
413 ;; 3 octets | |
414 ((< code #x10000) | |
415 (set-octet 0 (logior #xe0 (f-ash code -12))) | |
416 (set-octet 1 (logior #x80 (f-logand #x3f (f-ash code -6)… | |
417 (set-octet 2 (logior #x80 (f-logand code #x3f))) | |
418 (incf di 3)) | |
419 ;; 4 octets | |
420 (t | |
421 (set-octet 0 (logior #xf0 (f-logand #x07 (f-ash code -18… | |
422 (set-octet 1 (logior #x80 (f-logand #x3f (f-ash code -12… | |
423 (set-octet 2 (logior #x80 (f-logand #x3f (f-ash code -6)… | |
424 (set-octet 3 (logand #x3f code)) | |
425 (incf di 4)))) | |
426 finally (return (the fixnum (- di d-start)))))) | |
427 | |
428 (define-decoder :utf-8b (getter src-type setter dest-type) | |
429 `(named-lambda utf-8b-decoder (src start end dest d-start) | |
430 (declare (type ,src-type src) | |
431 (type ,dest-type dest) | |
432 (fixnum start end d-start)) | |
433 (let ((u2 0) (u3 0) (u4 0)) | |
434 (declare (type ub8 u2 u3 u4)) | |
435 (loop for di fixnum from d-start | |
436 for i fixnum from start below end | |
437 for u1 of-type ub8 = (,getter src i) do | |
438 ;; Unlike the UTF-8 version, this version of | |
439 ;; CONSUME-OCTET needs to check if I is being incremented | |
440 ;; past END because we might have trailing binary | |
441 ;; garbage. | |
442 (macrolet | |
443 ((consume-octet (n) | |
444 `(if (= i (1- end)) | |
445 (encode-raw-octets ,n) | |
446 (,',getter src (incf i)))) | |
447 (encode-raw-octets (n) | |
448 `(progn | |
449 ,@(loop for i below n and var in '(u1 u2 u3 u4) | |
450 collect `(,',setter (logior #xdc00 ,var) … | |
451 unless (= i (1- n)) | |
452 collect '(incf di)) | |
453 (return-from set-body)))) | |
454 (block set-body | |
455 (,setter (cond | |
456 ((< u1 #x80) ; 1 octet | |
457 u1) | |
458 ((>= u1 #xc2) | |
459 (setq u2 (consume-octet 1)) | |
460 (cond | |
461 ((< u1 #xe0) ; 2 octets | |
462 (if (< (f-logxor u2 #x80) #x40) | |
463 (logior (f-ash (f-logand #x1f u1) 6) | |
464 (f-logxor u2 #x80)) | |
465 (encode-raw-octets 2))) | |
466 (t | |
467 (setq u3 (consume-octet 2)) | |
468 (cond | |
469 ((< u1 #xf0) ; 3 octets | |
470 (if (and (< (f-logxor u2 #x80) #x40) | |
471 (< (f-logxor u3 #x80) #x40) | |
472 (or (>= u1 #xe1) (>= u2 #xa0… | |
473 (let ((start (f-logior (f-ash (f-… | |
474 (f-ash (f-… | |
475 (if (<= #xd800 start #xdfc0) | |
476 (encode-raw-octets 3) | |
477 (logior start (f-logand u3 … | |
478 (encode-raw-octets 3))) | |
479 (t ; 4 octets | |
480 (setq u4 (consume-octet 3)) | |
481 (if (and (< (f-logxor u2 #x80) #x40) | |
482 (< (f-logxor u3 #x80) #x40) | |
483 (< (f-logxor u4 #x80) #x40) | |
484 (or (>= u1 #xf1) (>= u2 #x90… | |
485 (logior | |
486 (f-logior (f-ash (f-logand u1 7)… | |
487 (f-ash (f-logxor u2 #x… | |
488 (f-logior (f-ash (f-logxor u3 #x… | |
489 (f-logxor u4 #x80))) | |
490 (encode-raw-octets 4))))))) | |
491 (t (encode-raw-octets 1))) | |
492 dest di))) | |
493 finally (return (the fixnum (- di d-start))))))) | |
494 | |
495 ;;;; UTF-16 | |
496 | |
497 ;;; TODO: add a way to pass some info at compile-time telling us that, | |
498 ;;; for example, the maximum code-point will always be < #x10000 in | |
499 ;;; which case we could simply return (* 2 (- end start)). | |
500 (defmacro utf16-octet-counter (getter type) | |
501 `(named-lambda utf-16-octet-counter (seq start end max) | |
502 (declare (type ,type seq) (fixnum start end max)) | |
503 (loop with noctets fixnum = 0 | |
504 for i fixnum from start below end | |
505 for code of-type code-point = (,getter seq i) | |
506 do (let ((new (the fixnum (+ (if (< code #x10000) 2 4) noctet… | |
507 (if (and (plusp max) (> new max)) | |
508 (loop-finish) | |
509 (setq noctets new))) | |
510 finally (return (values noctets i))))) | |
511 | |
512 (defmacro utf-16-combine-surrogate-pairs (u1 u2) | |
513 `(the (unsigned-byte 21) | |
514 (+ #x10000 | |
515 (the (unsigned-byte 20) | |
516 (logior | |
517 (the (unsigned-byte 20) | |
518 (ash (the (unsigned-byte 10) (- ,u1 #xd800)) 10)) | |
519 (the (unsigned-byte 10) | |
520 (- ,u2 #xdc00))))))) | |
521 | |
522 (defmacro define-utf-16 (name &optional endianness) | |
523 (check-type endianness (or null (eql :be) (eql :le))) | |
524 (check-type name keyword) | |
525 (let ((swap-var (gensym "SWAP")) | |
526 (code-point-counter-name | |
527 (format-symbol t '#:~a-code-point-counter (string name))) | |
528 (encoder-name (format-symbol t '#:~a-encoder (string name))) | |
529 (decoder-name (format-symbol t '#:~a-decoder (string name)))) | |
530 (labels ((make-bom-check-form (end start getter seq) | |
531 (if (null endianness) | |
532 ``((,',swap-var | |
533 (when (> ,,end ,,start) | |
534 (case (,,getter ,,seq ,,start 2 :ne) | |
535 (#.+byte-order-mark-code+ (incf ,,start 2) nil) | |
536 (#.+swapped-byte-order-mark-code+ (incf ,,star… | |
537 (t #+little-endian t))))) | |
538 '())) | |
539 (make-getter-form (getter src i) | |
540 (case endianness | |
541 (:le ``(,,getter ,,src ,,i 2 :le)) | |
542 (:be ``(,,getter ,,src ,,i 2 :be)) | |
543 (t ``(if ,',swap-var | |
544 (,,getter ,,src ,,i 2 :re) | |
545 (,,getter ,,src ,,i 2 :ne))))) | |
546 (make-setter-form (setter code dest di) | |
547 (case endianness | |
548 (:be ``(,,setter ,,code ,,dest ,,di 2 :be)) | |
549 (:le ``(,,setter ,,code ,,dest ,,di 2 :le)) | |
550 (t ``(,,setter ,,code ,,dest ,,di 2 :ne))))) | |
551 `(progn | |
552 (define-octet-counter ,name (getter type) | |
553 `(utf16-octet-counter ,getter ,type)) | |
554 (define-code-point-counter ,name (getter type) | |
555 `(named-lambda ,',code-point-counter-name (seq start end max) | |
556 (declare (type ,type seq) (fixnum start end max)) | |
557 (let* ,,(make-bom-check-form ''end ''start 'getter ''seq) | |
558 (loop with count fixnum = 0 | |
559 with i fixnum = start | |
560 while (<= i (- end 2)) do | |
561 (let* ((code ,,(make-getter-form 'getter ''seq ''i)) | |
562 (next-i (+ i (if (or (< code #xd800) (>= code #… | |
563 2 | |
564 4)))) | |
565 (declare (type (unsigned-byte 16) code) (fixnum next… | |
566 (cond | |
567 ((> next-i end) | |
568 (decoding-error | |
569 (vector (,getter seq i) (,getter seq (1+ i))) | |
570 ,',name seq i nil 'end-of-input-in-character) | |
571 (return (values count i))) | |
572 (t | |
573 (setq i next-i | |
574 count (1+ count)) | |
575 (when (and (plusp max) (= count max)) | |
576 (return (values count i)))))) | |
577 finally (progn | |
578 (assert (= i end)) | |
579 (return (values count i))))))) | |
580 (define-encoder ,name (getter src-type setter dest-type) | |
581 `(named-lambda ,',encoder-name (src start end dest d-start) | |
582 (declare (type ,src-type src) | |
583 (type ,dest-type dest) | |
584 (fixnum start end d-start)) | |
585 (loop with di fixnum = d-start | |
586 for i fixnum from start below end | |
587 for code of-type code-point = (,getter src i) | |
588 for high-bits fixnum = (- code #x10000) do | |
589 (cond ((< high-bits 0) | |
590 ,,(make-setter-form 'setter ''code ''dest ''di) | |
591 (incf di 2)) | |
592 (t | |
593 ,,(make-setter-form | |
594 'setter ''(logior #xd800 (f-ash high-bits … | |
595 ''dest ''di) | |
596 ,,(make-setter-form | |
597 'setter ''(logior #xdc00 (f-logand high-bi… | |
598 ''dest ''(+ di 2)) | |
599 (incf di 4))) | |
600 finally (return (the fixnum (- di d-start)))))) | |
601 (define-decoder ,name (getter src-type setter dest-type) | |
602 `(named-lambda ,',decoder-name (src start end dest d-start) | |
603 (declare (type ,src-type src) | |
604 (type ,dest-type dest) | |
605 (fixnum start end d-start)) | |
606 (let ,,(make-bom-check-form ''end ''start 'getter ''src) | |
607 (loop with i fixnum = start | |
608 for di fixnum from d-start | |
609 until (= i end) do | |
610 (let ((u1 ,,(make-getter-form 'getter ''src ''i))) | |
611 (declare (type (unsigned-byte 16) u1)) | |
612 (incf i 2) | |
613 (,setter (cond | |
614 ((or (< u1 #xd800) (>= u1 #xe000)) ; 2 oc… | |
615 u1) | |
616 ((< u1 #xdc00) ; 4 octets | |
617 (let ((u2 ,,(make-getter-form 'getter ''… | |
618 (declare (type (unsigned-byte 16) u2)) | |
619 (incf i 2) | |
620 (if (and (>= u2 #xdc00) (< u2 #xe000)) | |
621 (utf-16-combine-surrogate-pairs u1 u… | |
622 (decoding-error | |
623 (vector (,getter src (- i 4)) | |
624 (,getter src (- i 3)) | |
625 (,getter src (- i 2)) | |
626 (,getter src (- i 1))) | |
627 ,',name src i +repl+)))) | |
628 (t | |
629 (decoding-error (vector (,getter src (-… | |
630 (,getter src (-… | |
631 ,',name src i +repl+))) | |
632 dest di)) | |
633 finally (return (the fixnum (- di d-start))))))) | |
634 ',name)))) | |
635 | |
636 (define-character-encoding :utf-16 | |
637 "A 16-bit, variable-length encoding in which characters with | |
638 code points less than #x10000 can be encoded in a single 16-bit | |
639 word and characters with larger codes can be encoded in a pair of | |
640 16-bit words. The endianness of the encoded data is indicated by | |
641 the endianness of a byte-order-mark character (#\u+feff) | |
642 prepended to the data; in the absence of such a character on | |
643 input, the data is assumed to be in big-endian order. Output is | |
644 written in native byte-order with a leading byte-order mark." | |
645 :max-units-per-char 2 | |
646 :code-unit-size 16 | |
647 :native-endianness t ; not necessarily true when decoding | |
648 :decode-literal-code-unit-limit #xd800 | |
649 :encode-literal-code-unit-limit #x10000 | |
650 :use-bom #+big-endian :utf-16be #+little-endian :utf-16le | |
651 :bom-encoding #+big-endian #(#xfe #xff) #+little-endian #(#xff #xfe) | |
652 :nul-encoding #(0 0) | |
653 :default-replacement #xfffd | |
654 :ambiguous #+little-endian t #+big-endian nil) | |
655 | |
656 (define-utf-16 :utf-16) | |
657 | |
658 (define-character-encoding :utf-16le | |
659 "A 16-bit, variable-length encoding in which characters with | |
660 code points less than #x10000 can be encoded in a single 16-bit | |
661 word and characters with larger codes can be encoded in a pair of | |
662 16-bit words. The data is assumed to be in little-endian order. Output… | |
663 written in little-endian byte-order without a leading byte-order mark." | |
664 :aliases '(:utf-16/le) | |
665 :max-units-per-char 2 | |
666 :code-unit-size 16 | |
667 :native-endianness #+little-endian t #+big-endian nil | |
668 :decode-literal-code-unit-limit #xd800 | |
669 :encode-literal-code-unit-limit #x10000 | |
670 :nul-encoding #(0 0) | |
671 :default-replacement #xfffd) | |
672 | |
673 (define-utf-16 :utf-16le :le) | |
674 | |
675 (define-character-encoding :utf-16be | |
676 "A 16-bit, variable-length encoding in which characters with | |
677 code points less than #x10000 can be encoded in a single 16-bit | |
678 word and characters with larger codes can be encoded in a pair of | |
679 16-bit words. The data is assumed to be in big-endian order. Output is | |
680 written in big-endian byte-order without a leading byte-order mark." | |
681 :aliases '(:utf-16/be) | |
682 :max-units-per-char 2 | |
683 :code-unit-size 16 | |
684 :native-endianness #+little-endian nil #+big-endian t | |
685 :decode-literal-code-unit-limit #xd800 | |
686 :encode-literal-code-unit-limit #x10000 | |
687 :nul-encoding #(0 0) | |
688 :default-replacement #xfffd) | |
689 | |
690 (define-utf-16 :utf-16be :be) | |
691 | |
692 (defmacro define-ucs (name bytes &optional endianness (limit #x110000)) | |
693 (check-type name keyword) | |
694 (check-type bytes (or (eql 2) (eql 4))) | |
695 (check-type endianness (or null (eql :le) (eql :be))) | |
696 (let ((swap-var (gensym "SWAP")) | |
697 (code-point-counter-name | |
698 (format-symbol t '#:~a-code-point-counter (string name))) | |
699 (encoder-name | |
700 (format-symbol t '#:~a-encoder (string name))) | |
701 (decoder-name | |
702 (format-symbol t '#:~a-decoder (string name)))) | |
703 (labels ((make-bom-check-form (end start getter src) | |
704 (if (null endianness) | |
705 ``(when (not (zerop (- ,,end ,,start))) | |
706 (case (,,getter ,,src 0 ,',bytes :ne) | |
707 (#.+byte-order-mark-code+ | |
708 (incf ,,start ,',bytes) nil) | |
709 (#.+swapped-byte-order-mark-code-32+ | |
710 (incf ,,start ,',bytes) t) | |
711 (t #+little-endian t))) | |
712 '())) | |
713 (make-setter-form (setter code dest di) | |
714 ``(,,setter ,,code ,,dest ,,di ,',bytes | |
715 ,',(or endianness :ne))) | |
716 (make-getter-form (getter src i) | |
717 (if (null endianness) | |
718 ``(if ,',swap-var | |
719 (,,getter ,,src ,,i ,',bytes :re) | |
720 (,,getter ,,src ,,i ,',bytes :ne)) | |
721 ``(,,getter ,,src ,,i ,',bytes ,',endianness)))) | |
722 `(progn | |
723 (define-code-point-counter ,name (getter type) | |
724 `(named-lambda ,',code-point-counter-name (seq start end max) | |
725 (declare (type ,type seq) (fixnum start end max)) | |
726 ;; check for bom | |
727 ,,(make-bom-check-form ''end ''start 'getter ''seq) | |
728 (multiple-value-bind (count rem) | |
729 (floor (- end start) ,',bytes) | |
730 (cond | |
731 ((and (plusp max) (> count max)) | |
732 (values max (the fixnum (+ start (* ,',bytes max))))) | |
733 (t | |
734 ;; check for incomplete last character | |
735 (unless (zerop rem) | |
736 (let ((vector (make-array ,',bytes :fill-pointer 0)… | |
737 (dotimes (i rem) | |
738 (vector-push (,getter seq (+ i (- end rem))) ve… | |
739 (decoding-error vector ,',name seq (the fixnum (-… | |
740 'end-of-input-in-character) | |
741 (decf end rem))) | |
742 (values count end)))))) | |
743 (define-encoder ,name (getter src-type setter dest-type) | |
744 `(named-lambda ,',encoder-name (src start end dest d-start) | |
745 (declare (type ,src-type src) | |
746 (type ,dest-type dest) | |
747 (fixnum start end d-start)) | |
748 (loop for i fixnum from start below end | |
749 and di fixnum from d-start by ,',bytes | |
750 for code of-type code-point = (,getter src i) | |
751 do (if (>= code ,',limit) | |
752 (encoding-error code ,',name src i +repl+) | |
753 ,,(make-setter-form 'setter ''code ''dest ''d… | |
754 finally (return (the fixnum (- di d-start)))))) | |
755 (define-decoder ,name (getter src-type setter dest-type) | |
756 `(named-lambda ,',decoder-name (src start end dest d-start) | |
757 (declare (type ,src-type src) | |
758 (type ,dest-type dest) | |
759 (fixnum start end d-start)) | |
760 (let ((,',swap-var ,,(make-bom-check-form ''end ''start 'g… | |
761 (declare (ignorable ,',swap-var)) | |
762 (loop for i fixnum from start below end by ,',bytes | |
763 and di from d-start | |
764 do (,setter (let ((unit ,,(make-getter-form 'gette… | |
765 (if (>= unit ,',limit) | |
766 (decoding-error | |
767 (vector (,getter src i) | |
768 (,getter src (+ i 1)) | |
769 ,@,(if (= bytes 4) | |
770 ``((,getter src … | |
771 (,getter src … | |
772 ,',name src i +repl+ | |
773 'character-out-of-range) | |
774 unit)) | |
775 dest di) | |
776 finally (return (the fixnum (- di d-start))))))) | |
777 ',name)))) | |
778 | |
779 ;;;; UTF-32 | |
780 | |
781 (define-character-encoding :utf-32 | |
782 "A 32-bit, fixed-length encoding in which all Unicode | |
783 characters can be encoded in a single 32-bit word. The | |
784 endianness of the encoded data is indicated by the endianness of | |
785 a byte-order-mark character (#\u+feff) prepended to the data; in | |
786 the absence of such a character on input, input data is assumed | |
787 to be in big-endian order. Output is written in native byte | |
788 order with a leading byte-order mark." | |
789 :aliases '(:ucs-4) | |
790 :max-units-per-char 1 | |
791 :code-unit-size 32 | |
792 :native-endianness t ; not necessarily true when decoding | |
793 :literal-char-code-limit #x110000 | |
794 :use-bom #+little-endian :utf-32le #+big-endian :utf-32be | |
795 :bom-encoding | |
796 #+big-endian #(#x00 #x00 #xfe #xff) | |
797 #+little-endian #(#xff #xfe #x00 #x00) | |
798 :nul-encoding #(0 0 0 0) | |
799 :ambiguous #+little-endian t #+big-endian nil) | |
800 | |
801 (define-ucs :utf-32 4) | |
802 | |
803 (define-character-encoding :utf-32le | |
804 "A 32-bit, fixed-length encoding in which all Unicode | |
805 characters can be encoded in a single 32-bit word. Input data is assumed | |
806 to be in little-endian order. Output is also written in little-endian b… | |
807 order without a leading byte-order mark." | |
808 :max-units-per-char 1 | |
809 :code-unit-size 32 | |
810 :aliases '(:utf-32/le :ucs-4le :ucs-4/le) | |
811 :native-endianness #+little-endian t #+big-endian nil | |
812 :literal-char-code-limit #x110000 | |
813 :nul-encoding #(0 0 0 0)) | |
814 | |
815 (define-ucs :utf-32le 4 :le) | |
816 | |
817 (define-character-encoding :utf-32be | |
818 "A 32-bit, fixed-length encoding in which all Unicode | |
819 characters can be encoded in a single 32-bit word. Input data is assumed | |
820 to be in big-endian order. Output is also written in big-endian byte | |
821 order without a leading byte-order mark." | |
822 :max-units-per-char 1 | |
823 :code-unit-size 32 | |
824 :aliases '(:utf-32/be :ucs-4be :ucs-4/be) | |
825 :native-endianness #+little-endian nil #+big-endian t | |
826 :literal-char-code-limit #x110000 | |
827 :nul-encoding #(0 0 0 0)) | |
828 | |
829 (define-ucs :utf-32be 4 :be) | |
830 | |
831 ;; UCS-2 | |
832 | |
833 (define-character-encoding :ucs-2 | |
834 "A 16-bit, fixed-length encoding in which all Unicode | |
835 characters can be encoded in a single 16-bit word. The | |
836 endianness of the encoded data is indicated by the endianness of | |
837 a byte-order-mark character (#\u+feff) prepended to the data; in | |
838 the absence of such a character on input, input data is assumed | |
839 to be in big-endian order. Output is written in native byte | |
840 order with a leading byte-order mark." | |
841 :aliases '(:ucs-2) | |
842 :max-units-per-char 1 | |
843 :code-unit-size 16 | |
844 :native-endianness t ; not necessarily true when decoding | |
845 :literal-char-code-limit #x10000 | |
846 :use-bom #+little-endian :ucs-2le #+big-endian :ucs-2be | |
847 :bom-encoding | |
848 #+big-endian #(#xfe #xff) | |
849 #+little-endian #(#xff #xfe) | |
850 :nul-encoding #(0 0) | |
851 :ambiguous #+little-endian t #+big-endian nil) | |
852 | |
853 (define-ucs :ucs-2 2 nil #x10000) | |
854 | |
855 (define-character-encoding :ucs-2le | |
856 "A 16-bit, fixed-length encoding in which all Unicode | |
857 characters can be encoded in a single 16-bit word. Input data is assumed | |
858 to be in little-endian order. Output is also written in little-endian b… | |
859 order without a leading byte-order mark." | |
860 :max-units-per-char 1 | |
861 :code-unit-size 16 | |
862 :aliases '(:ucs-2/le) | |
863 :native-endianness #+little-endian t #+big-endian nil | |
864 :literal-char-code-limit #x10000 | |
865 :nul-encoding #(0 0)) | |
866 | |
867 (define-ucs :ucs-2le 2 :le #x10000) | |
868 | |
869 (define-character-encoding :ucs-2be | |
870 "A 16-bit, fixed-length encoding in which all Unicode | |
871 characters can be encoded in a single 16-bit word. Input data is assumed | |
872 to be in big-endian order. Output is also written in big-endian byte | |
873 order without a leading byte-order mark." | |
874 :max-units-per-char 1 | |
875 :code-unit-size 16 | |
876 :aliases '(:ucs-2/be) | |
877 :native-endianness #+little-endian nil #+big-endian t | |
878 :literal-char-code-limit #x10000 | |
879 :nul-encoding #(0 0)) | |
880 | |
881 (define-ucs :ucs-2be 2 :be #x10000) |