decode.lisp - clic - Clic is an command line interactive client for gopher writ… | |
git clone git://bitreich.org/clic/ git://enlrupgkhuxnvlhsf6lc3fziv5h2hhfrinws65… | |
Log | |
Files | |
Refs | |
Tags | |
README | |
LICENSE | |
--- | |
decode.lisp (25456B) | |
--- | |
1 ;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: FLEXI-STREAMS; Base: 1… | |
2 ;;; $Header: /usr/local/cvsrep/flexi-streams/decode.lisp,v 1.35 2008/08/… | |
3 | |
4 ;;; Copyright (c) 2005-2008, Dr. Edmund Weitz. All rights reserved. | |
5 | |
6 ;;; Redistribution and use in source and binary forms, with or without | |
7 ;;; modification, are permitted provided that the following conditions | |
8 ;;; are met: | |
9 | |
10 ;;; * Redistributions of source code must retain the above copyright | |
11 ;;; notice, this list of conditions and the following disclaimer. | |
12 | |
13 ;;; * Redistributions in binary form must reproduce the above | |
14 ;;; copyright notice, this list of conditions and the following | |
15 ;;; disclaimer in the documentation and/or other materials | |
16 ;;; provided with the distribution. | |
17 | |
18 ;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED | |
19 ;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED | |
20 ;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
21 ;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY | |
22 ;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
23 ;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE | |
24 ;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |
25 ;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, | |
26 ;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING | |
27 ;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | |
28 ;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
29 | |
30 (in-package :flexi-streams) | |
31 | |
32 (defun recover-from-encoding-error (external-format format-control &rest… | |
33 "Helper function used by OCTETS-TO-CHAR-CODE below to deal with | |
34 encoding errors. Checks if *SUBSTITUTION-CHAR* is not NIL and returns | |
35 its character code in this case. Otherwise signals an | |
36 EXTERNAL-FORMAT-ENCODING-ERROR as determined by the arguments to this | |
37 function and provides a corresponding USE-VALUE restart." | |
38 (when *substitution-char* | |
39 (return-from recover-from-encoding-error (char-code *substitution-ch… | |
40 (restart-case | |
41 (apply #'signal-encoding-error external-format format-control form… | |
42 (use-value (char) | |
43 :report "Specify a character to be used instead." | |
44 :interactive (lambda () | |
45 (loop | |
46 (format *query-io* "Type a character: ") | |
47 (let ((line (read-line *query-io*))) | |
48 (when (= 1 (length line)) | |
49 (return (list (char line 0))))))) | |
50 (char-code char)))) | |
51 | |
52 (defgeneric octets-to-char-code (format reader) | |
53 (declare #.*standard-optimize-settings*) | |
54 (:documentation "Converts a sequence of octets to a character code | |
55 \(which is returned, or NIL in case of EOF) using the external format | |
56 FORMAT. The sequence is obtained by calling the function \(which must | |
57 be a functional object) READER with no arguments which should return | |
58 one octet per call. In the case of EOF, READER should return NIL. | |
59 | |
60 The special variable *CURRENT-UNREADER* must be bound correctly | |
61 whenever this function is called.")) | |
62 | |
63 (defgeneric octets-to-string* (format sequence start end) | |
64 (declare #.*standard-optimize-settings*) | |
65 (:documentation "A generic function which dispatches on the external | |
66 format and does the real work for OCTETS-TO-STRING.")) | |
67 | |
68 (defmethod octets-to-string* :around (format (list list) start end) | |
69 (declare #.*standard-optimize-settings*) | |
70 (octets-to-string* format (coerce list 'vector) start end)) | |
71 | |
72 (defmacro define-sequence-readers ((format-class) &body body) | |
73 "Non-hygienic utility macro which defines methods for READ-SEQUENCE* | |
74 and OCTETS-TO-STRING* for the class FORMAT-CLASS. BODY is described | |
75 in the docstring of DEFINE-CHAR-ENCODERS but can additionally contain | |
76 a form \(UNGET <form>) which has to be replaced by the correct code to | |
77 `unread' the octets for the character designated by <form>." | |
78 (let* ((body `((block char-decoder | |
79 (locally | |
80 (declare #.*fixnum-optimize-settings*) | |
81 ,@body))))) | |
82 `(progn | |
83 (defmethod read-sequence* ((format ,format-class) flexi-input-str… | |
84 (with-accessors ((position flexi-stream-position) | |
85 (bound flexi-stream-bound) | |
86 (octet-stack flexi-stream-octet-stack) | |
87 (last-octet flexi-stream-last-octet) | |
88 (last-char-code flexi-stream-last-char-code) | |
89 (stream flexi-stream-stream)) | |
90 flexi-input-stream | |
91 (let* (buffer | |
92 (buffer-pos 0) | |
93 (buffer-end 0) | |
94 (index start) | |
95 donep | |
96 ;; whether we will later be able to rewind the stream … | |
97 ;; needed (to get rid of unused octets in the buffer) | |
98 (can-rewind-p (maybe-rewind stream 0)) | |
99 (factor (encoding-factor format)) | |
100 (integer-factor (floor factor)) | |
101 ;; it's an interesting question whether it makes sense | |
102 ;; performance-wise to make RESERVE significantly bigg… | |
103 ;; (and thus put potentially a lot more octets into | |
104 ;; OCTET-STACK), especially for UTF-8 | |
105 (reserve (cond ((or (not (floatp factor)) | |
106 (not can-rewind-p)) 0) | |
107 (t (ceiling (* (- factor integer-factor… | |
108 (declare (fixnum buffer-pos buffer-end index integer-factor… | |
109 (boolean can-rewind-p)) | |
110 (flet ((compute-fill-amount () | |
111 "Computes the amount of octets we can savely read … | |
112 the buffer without violating the stream's bound \(if there is one) and | |
113 without potentially reading much more than we need \(unless we can | |
114 rewind afterwards)." | |
115 (let ((minimum (min (the fixnum (+ (the fixnum (* … | |
116 … | |
117 reserve)) | |
118 +buffer-size+))) | |
119 (cond (bound (min minimum (- bound position))) | |
120 (t minimum)))) | |
121 (fill-buffer (end) | |
122 "Tries to fill the buffer from BUFFER-POS to END a… | |
123 returns NIL if the buffer doesn't contain any new data." | |
124 (when donep | |
125 (return-from fill-buffer nil)) | |
126 ;; put data from octet stack into buffer if there … | |
127 (loop | |
128 (when (>= buffer-pos end) | |
129 (return)) | |
130 (let ((next-octet (pop octet-stack))) | |
131 (cond (next-octet | |
132 (setf (aref (the (array octet *) buffer)… | |
133 (incf buffer-pos)) | |
134 (t (return))))) | |
135 (setq buffer-end (read-sequence buffer stream | |
136 :start buffer-pos | |
137 :end end)) | |
138 ;; we reached EOF, so we remember this | |
139 (when (< buffer-end end) | |
140 (setq donep t)) | |
141 ;; BUFFER-POS is only greater than zero if the buf… | |
142 ;; already contains unread data from the octet sta… | |
143 ;; (see below), so we test for ZEROP here and do /… | |
144 ;; compare with BUFFER-POS | |
145 (unless (zerop buffer-end) | |
146 (incf position buffer-end)))) | |
147 (let ((minimum (compute-fill-amount))) | |
148 (declare (fixnum minimum)) | |
149 (setq buffer (make-octet-buffer minimum)) | |
150 ;; fill buffer for the first time or return immediately… | |
151 ;; we don't succeed | |
152 (unless (fill-buffer minimum) | |
153 (return-from read-sequence* start))) | |
154 (setq buffer-pos 0) | |
155 (macrolet ((iterate (set-place) | |
156 "A very unhygienic macro to implement the | |
157 actual iteration through the sequence including housekeeping for the | |
158 flexi stream. SET-PLACE is the place \(using the index INDEX) used to | |
159 access the sequence." | |
160 `(flet ((leave () | |
161 "This is the function used to | |
162 abort the LOOP iteration below." | |
163 (when (> index start) | |
164 (setq last-octet nil | |
165 last-char-code ,(sublis '(… | |
166 (return-from read-sequence* index)… | |
167 (loop | |
168 (when (>= index end) | |
169 ;; check if there are octets in the | |
170 ;; buffer we didn't use - see | |
171 ;; COMPUTE-FILL-AMOUNT above | |
172 (let ((rest (- buffer-end buffer-pos))) | |
173 (when (plusp rest) | |
174 (or (and can-rewind-p | |
175 (maybe-rewind stream rest… | |
176 (loop | |
177 (when (>= buffer-pos buffer-e… | |
178 (return)) | |
179 (decf buffer-end) | |
180 (push (aref (the (array octet… | |
181 octet-stack))))) | |
182 (leave)) | |
183 (let ((next-char-code | |
184 (progn (symbol-macrolet | |
185 ((octet-getter | |
186 ;; this is the code … | |
187 ;; NIL) and to fill … | |
188 (block next-octet | |
189 (when (>= buffer-p… | |
190 (setq buffer-pos… | |
191 (unless (fill-bu… | |
192 (return-from n… | |
193 (prog1 | |
194 (aref (the (ar… | |
195 (incf buffer-pos… | |
196 (macrolet ((unget (form) | |
197 `(unread-ch… | |
198 ,',@body))))) | |
199 (unless next-char-code | |
200 (leave)) | |
201 (setf ,set-place (code-char next-char-… | |
202 (incf index)))))) | |
203 (etypecase sequence | |
204 (string (iterate (char sequence index))) | |
205 (array (iterate (aref sequence index))) | |
206 (list (iterate (nth index sequence))))))))) | |
207 (defmethod octets-to-string* ((format ,format-class) sequence sta… | |
208 (declare #.*standard-optimize-settings*) | |
209 (declare (fixnum start end)) | |
210 (let* ((i start) | |
211 (string-length (compute-number-of-chars format sequence … | |
212 (string (make-array string-length :element-type 'char*))) | |
213 (declare (fixnum i string-length)) | |
214 (loop for j of-type fixnum from 0 below string-length | |
215 do (setf (schar string j) | |
216 (code-char (macrolet ((unget (form) | |
217 `(decf i (character-le… | |
218 (symbol-macrolet ((octet-getter (… | |
219 … | |
220 … | |
221 … | |
222 ,@body)))) | |
223 finally (return string))))))) | |
224 | |
225 (defmacro define-char-decoders ((lf-format-class cr-format-class crlf-fo… | |
226 "Non-hygienic utility macro which defines several decoding-related | |
227 methods for the classes LF-FORMAT-CLASS, CR-FORMAT-CLASS, and | |
228 CRLF-FORMAT-CLASS where it is assumed that CR-FORMAT-CLASS is the same | |
229 encoding as LF-FORMAT-CLASS but with CR instead of LF line endings and | |
230 similar for CRLF-FORMAT-CLASS, i.e. LF-FORMAT-CLASS is the base class. | |
231 BODY is a code template for the code to read octets and return one | |
232 character code. BODY must contain a symbol OCTET-GETTER representing | |
233 the form which is used to obtain the next octet." | |
234 (let* ((body (with-unique-names (char-code) | |
235 `((let ((,char-code (progn ,@body))) | |
236 (when (and ,char-code | |
237 (or (<= #xd8 (logand* #x00ff (ash* ,char… | |
238 (> ,char-code #x10ffff))) | |
239 (recover-from-encoding-error format "Illegal code… | |
240 ,char-code))))) | |
241 `(progn | |
242 (defmethod octets-to-char-code ((format ,lf-format-class) reader) | |
243 (declare #.*fixnum-optimize-settings*) | |
244 (declare (function reader)) | |
245 (symbol-macrolet ((octet-getter (funcall reader))) | |
246 ,@(sublis '((char-decoder . octets-to-char-code)) | |
247 body))) | |
248 (define-sequence-readers (,lf-format-class) ,@body) | |
249 (define-sequence-readers (,cr-format-class) | |
250 ,(with-unique-names (char-code) | |
251 `(let ((,char-code (progn ,@body))) | |
252 (case ,char-code | |
253 (#.+cr+ #.(char-code #\Newline)) | |
254 (otherwise ,char-code))))) | |
255 (define-sequence-readers (,crlf-format-class) | |
256 ,(with-unique-names (char-code next-char-code get-char-code) | |
257 `(flet ((,get-char-code () ,@body)) | |
258 (let ((,char-code (,get-char-code))) | |
259 (case ,char-code | |
260 (#.+cr+ | |
261 (let ((,next-char-code (,get-char-code))) | |
262 (case ,next-char-code | |
263 (#.+lf+ #.(char-code #\Newline)) | |
264 ;; we saw a CR but no LF afterwards, but then th… | |
265 ;; ended, so we just return #\Return | |
266 ((nil) +cr+) | |
267 ;; if the character we peeked at wasn't a | |
268 ;; linefeed character we unread its constituents | |
269 (otherwise (unget (code-char ,next-char-code)) | |
270 ,char-code)))) | |
271 (otherwise ,char-code))))))))) | |
272 | |
273 (define-char-decoders (flexi-latin-1-format flexi-cr-latin-1-format flex… | |
274 octet-getter) | |
275 | |
276 (define-char-decoders (flexi-ascii-format flexi-cr-ascii-format flexi-cr… | |
277 (when-let (octet octet-getter) | |
278 (if (> (the octet octet) 127) | |
279 (recover-from-encoding-error format | |
280 "No character which corresponds to oc… | |
281 octet))) | |
282 | |
283 (define-char-decoders (flexi-8-bit-format flexi-cr-8-bit-format flexi-cr… | |
284 (with-accessors ((decoding-table external-format-decoding-table)) | |
285 format | |
286 (when-let (octet octet-getter) | |
287 (let ((char-code (aref (the (simple-array char-code-integer *) dec… | |
288 (the octet octet)))) | |
289 (if (or (null char-code) | |
290 (= (the char-code-integer char-code) 65533)) | |
291 (recover-from-encoding-error format | |
292 "No character which corresponds t… | |
293 char-code))))) | |
294 | |
295 (define-char-decoders (flexi-utf-8-format flexi-cr-utf-8-format flexi-cr… | |
296 (let (first-octet-seen) | |
297 (declare (boolean first-octet-seen)) | |
298 (macrolet ((read-next-byte () | |
299 '(prog1 | |
300 (or octet-getter | |
301 (cond (first-octet-seen | |
302 (return-from char-decoder | |
303 (recover-from-encoding-error format | |
304 "End of … | |
305 (t (return-from char-decoder nil)))) | |
306 (setq first-octet-seen t)))) | |
307 (flet ((recover-from-overlong-sequence (value) | |
308 (restart-case | |
309 (recover-from-encoding-error format "`Overlong' UTF-8… | |
310 value) | |
311 (accept-overlong-sequence () | |
312 :report "Accept the code point and continue." | |
313 value)))) | |
314 (let ((octet (read-next-byte))) | |
315 (declare (type octet octet)) | |
316 (block utf-8-sequence | |
317 (multiple-value-bind (start count) | |
318 (cond ((not (logbitp 7 octet)) | |
319 ;; avoid the overlong checks below | |
320 (return-from utf-8-sequence octet)) | |
321 ((= #b11000000 (logand* octet #b11100000)) | |
322 (values (logand* octet #b00011111) 1)) | |
323 ((= #b11100000 (logand* octet #b11110000)) | |
324 (values (logand* octet #b00001111) 2)) | |
325 ((= #b11110000 (logand* octet #b11111000)) | |
326 (values (logand* octet #b00000111) 3)) | |
327 (t (return-from char-decoder | |
328 (recover-from-encoding-error format | |
329 "Unexpected valu… | |
330 octet)))) | |
331 (declare (fixnum count)) | |
332 (loop for result of-type code-point | |
333 = start then (+ (ash* result 6) | |
334 (logand* octet #b111111)) | |
335 repeat count | |
336 for octet of-type octet = (read-next-byte) | |
337 unless (= #b10000000 (logand* octet #b11000000)) | |
338 do (return-from char-decoder | |
339 (recover-from-encoding-error format | |
340 "Unexpected value … | |
341 finally (return (cond ((< result (ecase count | |
342 (1 #x00080) | |
343 (2 #x00800) | |
344 (3 #x10000))) | |
345 (recover-from-overlong-sequen… | |
346 (t result))))))))))) | |
347 | |
348 (define-char-decoders (flexi-utf-16-le-format flexi-cr-utf-16-le-format … | |
349 (let (first-octet-seen) | |
350 (declare (boolean first-octet-seen)) | |
351 (macrolet ((read-next-byte () | |
352 '(prog1 | |
353 (or octet-getter | |
354 (cond (first-octet-seen | |
355 (return-from char-decoder | |
356 (recover-from-encoding-error format | |
357 "End of … | |
358 (t (return-from char-decoder nil)))) | |
359 (setq first-octet-seen t)))) | |
360 (flet ((read-next-word () | |
361 (+ (the octet (read-next-byte)) | |
362 (ash* (the octet (read-next-byte)) 8)))) | |
363 (declare (inline read-next-word)) | |
364 (let ((word (read-next-word))) | |
365 (declare (type (unsigned-byte 16) word)) | |
366 (cond ((<= #xd800 word #xdfff) | |
367 (let ((next-word (read-next-word))) | |
368 (declare (type (unsigned-byte 16) next-word)) | |
369 (unless (<= #xdc00 next-word #xdfff) | |
370 (return-from char-decoder | |
371 (recover-from-encoding-error format | |
372 "Unexpected UTF-16 w… | |
373 next-word word))) | |
374 (+ (ash* (logand* #b1111111111 word) 10) | |
375 (logand* #b1111111111 next-word) | |
376 #x10000))) | |
377 (t word))))))) | |
378 | |
379 (define-char-decoders (flexi-utf-16-be-format flexi-cr-utf-16-be-format … | |
380 (let (first-octet-seen) | |
381 (declare (boolean first-octet-seen)) | |
382 (macrolet ((read-next-byte () | |
383 '(prog1 | |
384 (or octet-getter | |
385 (cond (first-octet-seen | |
386 (return-from char-decoder | |
387 (recover-from-encoding-error format | |
388 "End of … | |
389 (t (return-from char-decoder nil)))) | |
390 (setq first-octet-seen t)))) | |
391 (flet ((read-next-word () | |
392 (+ (ash* (the octet (read-next-byte)) 8) | |
393 (the octet (read-next-byte))))) | |
394 (declare (inline read-next-word)) | |
395 (let ((word (read-next-word))) | |
396 (declare (type (unsigned-byte 16) word)) | |
397 (cond ((<= #xd800 word #xdfff) | |
398 (let ((next-word (read-next-word))) | |
399 (declare (type (unsigned-byte 16) next-word)) | |
400 (unless (<= #xdc00 next-word #xdfff) | |
401 (return-from char-decoder | |
402 (recover-from-encoding-error format | |
403 "Unexpected UTF-16 w… | |
404 next-word word))) | |
405 (+ (ash* (logand* #b1111111111 word) 10) | |
406 (logand* #b1111111111 next-word) | |
407 #x10000))) | |
408 (t word))))))) | |
409 | |
410 (define-char-decoders (flexi-utf-32-le-format flexi-cr-utf-32-le-format … | |
411 (let (first-octet-seen) | |
412 (declare (boolean first-octet-seen)) | |
413 (macrolet ((read-next-byte () | |
414 '(prog1 | |
415 (or octet-getter | |
416 (cond (first-octet-seen | |
417 (return-from char-decoder | |
418 (recover-from-encoding-error format | |
419 "End of … | |
420 (t (return-from char-decoder nil)))) | |
421 (setq first-octet-seen t)))) | |
422 (loop for count of-type fixnum from 0 to 24 by 8 | |
423 for octet of-type octet = (read-next-byte) | |
424 sum (ash* octet count))))) | |
425 | |
426 (define-char-decoders (flexi-utf-32-be-format flexi-cr-utf-32-be-format … | |
427 (let (first-octet-seen) | |
428 (declare (boolean first-octet-seen)) | |
429 (macrolet ((read-next-byte () | |
430 '(prog1 | |
431 (or octet-getter | |
432 (cond (first-octet-seen | |
433 (return-from char-decoder | |
434 (recover-from-encoding-error format | |
435 "End of … | |
436 (t (return-from char-decoder nil)))) | |
437 (setq first-octet-seen t)))) | |
438 (loop for count of-type fixnum from 24 downto 0 by 8 | |
439 for octet of-type octet = (read-next-byte) | |
440 sum (ash* octet count))))) | |
441 | |
442 (defmethod octets-to-char-code ((format flexi-cr-mixin) reader) | |
443 (declare #.*fixnum-optimize-settings*) | |
444 (declare (ignore reader)) | |
445 (let ((char-code (call-next-method))) | |
446 (case char-code | |
447 (#.+cr+ #.(char-code #\Newline)) | |
448 (otherwise char-code)))) | |
449 | |
450 (defmethod octets-to-char-code ((format flexi-crlf-mixin) reader) | |
451 (declare #.*fixnum-optimize-settings*) | |
452 (declare (function *current-unreader*)) | |
453 (declare (ignore reader)) | |
454 (let ((char-code (call-next-method))) | |
455 (case char-code | |
456 (#.+cr+ | |
457 (let ((next-char-code (call-next-method))) | |
458 (case next-char-code | |
459 (#.+lf+ #.(char-code #\Newline)) | |
460 ;; we saw a CR but no LF afterwards, but then the data | |
461 ;; ended, so we just return #\Return | |
462 ((nil) +cr+) | |
463 ;; if the character we peeked at wasn't a | |
464 ;; linefeed character we unread its constituents | |
465 (otherwise (funcall *current-unreader* (code-char next-char-c… | |
466 char-code)))) | |
467 (otherwise char-code)))) | |
468 |