GopherProxy

	decode.lisp - clic - Clic is an command line interactive client for gopher writ…
	git clone git://bitreich.org/clic/ git://enlrupgkhuxnvlhsf6lc3fziv5h2hhfrinws65…
	Log
	Files
	Refs
	Tags
	README
	LICENSE
	---
	decode.lisp (25456B)
	---
	1 ;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: FLEXI-STREAMS; Base: 1…
	2 ;;; $Header: /usr/local/cvsrep/flexi-streams/decode.lisp,v 1.35 2008/08/…
	3
	4 ;;; Copyright (c) 2005-2008, Dr. Edmund Weitz. All rights reserved.
	5
	6 ;;; Redistribution and use in source and binary forms, with or without
	7 ;;; modification, are permitted provided that the following conditions
	8 ;;; are met:
	9
	10 ;;; * Redistributions of source code must retain the above copyright
	11 ;;; notice, this list of conditions and the following disclaimer.
	12
	13 ;;; * Redistributions in binary form must reproduce the above
	14 ;;; copyright notice, this list of conditions and the following
	15 ;;; disclaimer in the documentation and/or other materials
	16 ;;; provided with the distribution.
	17
	18 ;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
	19 ;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
	20 ;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	21 ;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
	22 ;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	23 ;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
	24 ;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
	25 ;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
	26 ;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
	27 ;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
	28 ;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	29
	30 (in-package :flexi-streams)
	31
	32 (defun recover-from-encoding-error (external-format format-control &rest…
	33 "Helper function used by OCTETS-TO-CHAR-CODE below to deal with
	34 encoding errors. Checks if SUBSTITUTION-CHAR is not NIL and returns
	35 its character code in this case. Otherwise signals an
	36 EXTERNAL-FORMAT-ENCODING-ERROR as determined by the arguments to this
	37 function and provides a corresponding USE-VALUE restart."
	38 (when substitution-char
	39 (return-from recover-from-encoding-error (char-code *substitution-ch…
	40 (restart-case
	41 (apply #'signal-encoding-error external-format format-control form…
	42 (use-value (char)
	43 :report "Specify a character to be used instead."
	44 :interactive (lambda ()
	45 (loop
	46 (format query-io "Type a character: ")
	47 (let ((line (read-line query-io)))
	48 (when (= 1 (length line))
	49 (return (list (char line 0)))))))
	50 (char-code char))))
	51
	52 (defgeneric octets-to-char-code (format reader)
	53 (declare #.standard-optimize-settings)
	54 (:documentation "Converts a sequence of octets to a character code
	55 \(which is returned, or NIL in case of EOF) using the external format
	56 FORMAT. The sequence is obtained by calling the function \(which must
	57 be a functional object) READER with no arguments which should return
	58 one octet per call. In the case of EOF, READER should return NIL.
	59
	60 The special variable CURRENT-UNREADER must be bound correctly
	61 whenever this function is called."))
	62
	63 (defgeneric octets-to-string* (format sequence start end)
	64 (declare #.standard-optimize-settings)
	65 (:documentation "A generic function which dispatches on the external
	66 format and does the real work for OCTETS-TO-STRING."))
	67
	68 (defmethod octets-to-string* :around (format (list list) start end)
	69 (declare #.standard-optimize-settings)
	70 (octets-to-string* format (coerce list 'vector) start end))
	71
	72 (defmacro define-sequence-readers ((format-class) &body body)
	73 "Non-hygienic utility macro which defines methods for READ-SEQUENCE*
	74 and OCTETS-TO-STRING* for the class FORMAT-CLASS. BODY is described
	75 in the docstring of DEFINE-CHAR-ENCODERS but can additionally contain
	76 a form \(UNGET <form>) which has to be replaced by the correct code to
	77 `unread' the octets for the character designated by <form>."
	78 (let* ((body `((block char-decoder
	79 (locally
	80 (declare #.fixnum-optimize-settings)
	81 ,@body)))))
	82 `(progn
	83 (defmethod read-sequence* ((format ,format-class) flexi-input-str…
	84 (with-accessors ((position flexi-stream-position)
	85 (bound flexi-stream-bound)
	86 (octet-stack flexi-stream-octet-stack)
	87 (last-octet flexi-stream-last-octet)
	88 (last-char-code flexi-stream-last-char-code)
	89 (stream flexi-stream-stream))
	90 flexi-input-stream
	91 (let* (buffer
	92 (buffer-pos 0)
	93 (buffer-end 0)
	94 (index start)
	95 donep
	96 ;; whether we will later be able to rewind the stream …
	97 ;; needed (to get rid of unused octets in the buffer)
	98 (can-rewind-p (maybe-rewind stream 0))
	99 (factor (encoding-factor format))
	100 (integer-factor (floor factor))
	101 ;; it's an interesting question whether it makes sense
	102 ;; performance-wise to make RESERVE significantly bigg…
	103 ;; (and thus put potentially a lot more octets into
	104 ;; OCTET-STACK), especially for UTF-8
	105 (reserve (cond ((or (not (floatp factor))
	106 (not can-rewind-p)) 0)
	107 (t (ceiling (* (- factor integer-factor…
	108 (declare (fixnum buffer-pos buffer-end index integer-factor…
	109 (boolean can-rewind-p))
	110 (flet ((compute-fill-amount ()
	111 "Computes the amount of octets we can savely read …
	112 the buffer without violating the stream's bound \(if there is one) and
	113 without potentially reading much more than we need \(unless we can
	114 rewind afterwards)."
	115 (let ((minimum (min (the fixnum (+ (the fixnum (* …
	116 …
	117 reserve))
	118 +buffer-size+)))
	119 (cond (bound (min minimum (- bound position)))
	120 (t minimum))))
	121 (fill-buffer (end)
	122 "Tries to fill the buffer from BUFFER-POS to END a…
	123 returns NIL if the buffer doesn't contain any new data."
	124 (when donep
	125 (return-from fill-buffer nil))
	126 ;; put data from octet stack into buffer if there …
	127 (loop
	128 (when (>= buffer-pos end)
	129 (return))
	130 (let ((next-octet (pop octet-stack)))
	131 (cond (next-octet
	132 (setf (aref (the (array octet *) buffer)…
	133 (incf buffer-pos))
	134 (t (return)))))
	135 (setq buffer-end (read-sequence buffer stream
	136 :start buffer-pos
	137 :end end))
	138 ;; we reached EOF, so we remember this
	139 (when (< buffer-end end)
	140 (setq donep t))
	141 ;; BUFFER-POS is only greater than zero if the buf…
	142 ;; already contains unread data from the octet sta…
	143 ;; (see below), so we test for ZEROP here and do /…
	144 ;; compare with BUFFER-POS
	145 (unless (zerop buffer-end)
	146 (incf position buffer-end))))
	147 (let ((minimum (compute-fill-amount)))
	148 (declare (fixnum minimum))
	149 (setq buffer (make-octet-buffer minimum))
	150 ;; fill buffer for the first time or return immediately…
	151 ;; we don't succeed
	152 (unless (fill-buffer minimum)
	153 (return-from read-sequence* start)))
	154 (setq buffer-pos 0)
	155 (macrolet ((iterate (set-place)
	156 "A very unhygienic macro to implement the
	157 actual iteration through the sequence including housekeeping for the
	158 flexi stream. SET-PLACE is the place \(using the index INDEX) used to
	159 access the sequence."
	160 `(flet ((leave ()
	161 "This is the function used to
	162 abort the LOOP iteration below."
	163 (when (> index start)
	164 (setq last-octet nil
	165 last-char-code ,(sublis '(…
	166 (return-from read-sequence* index)…
	167 (loop
	168 (when (>= index end)
	169 ;; check if there are octets in the
	170 ;; buffer we didn't use - see
	171 ;; COMPUTE-FILL-AMOUNT above
	172 (let ((rest (- buffer-end buffer-pos)))
	173 (when (plusp rest)
	174 (or (and can-rewind-p
	175 (maybe-rewind stream rest…
	176 (loop
	177 (when (>= buffer-pos buffer-e…
	178 (return))
	179 (decf buffer-end)
	180 (push (aref (the (array octet…
	181 octet-stack)))))
	182 (leave))
	183 (let ((next-char-code
	184 (progn (symbol-macrolet
	185 ((octet-getter
	186 ;; this is the code …
	187 ;; NIL) and to fill …
	188 (block next-octet
	189 (when (>= buffer-p…
	190 (setq buffer-pos…
	191 (unless (fill-bu…
	192 (return-from n…
	193 (prog1
	194 (aref (the (ar…
	195 (incf buffer-pos…
	196 (macrolet ((unget (form)
	197 `(unread-ch…
	198 ,',@body)))))
	199 (unless next-char-code
	200 (leave))
	201 (setf ,set-place (code-char next-char-…
	202 (incf index))))))
	203 (etypecase sequence
	204 (string (iterate (char sequence index)))
	205 (array (iterate (aref sequence index)))
	206 (list (iterate (nth index sequence)))))))))
	207 (defmethod octets-to-string* ((format ,format-class) sequence sta…
	208 (declare #.standard-optimize-settings)
	209 (declare (fixnum start end))
	210 (let* ((i start)
	211 (string-length (compute-number-of-chars format sequence …
	212 (string (make-array string-length :element-type 'char*)))
	213 (declare (fixnum i string-length))
	214 (loop for j of-type fixnum from 0 below string-length
	215 do (setf (schar string j)
	216 (code-char (macrolet ((unget (form)
	217 `(decf i (character-le…
	218 (symbol-macrolet ((octet-getter (…
	219 …
	220 …
	221 …
	222 ,@body))))
	223 finally (return string)))))))
	224
	225 (defmacro define-char-decoders ((lf-format-class cr-format-class crlf-fo…
	226 "Non-hygienic utility macro which defines several decoding-related
	227 methods for the classes LF-FORMAT-CLASS, CR-FORMAT-CLASS, and
	228 CRLF-FORMAT-CLASS where it is assumed that CR-FORMAT-CLASS is the same
	229 encoding as LF-FORMAT-CLASS but with CR instead of LF line endings and
	230 similar for CRLF-FORMAT-CLASS, i.e. LF-FORMAT-CLASS is the base class.
	231 BODY is a code template for the code to read octets and return one
	232 character code. BODY must contain a symbol OCTET-GETTER representing
	233 the form which is used to obtain the next octet."
	234 (let* ((body (with-unique-names (char-code)
	235 `((let ((,char-code (progn ,@body)))
	236 (when (and ,char-code
	237 (or (<= #xd8 (logand* #x00ff (ash* ,char…
	238 (> ,char-code #x10ffff)))
	239 (recover-from-encoding-error format "Illegal code…
	240 ,char-code)))))
	241 `(progn
	242 (defmethod octets-to-char-code ((format ,lf-format-class) reader)
	243 (declare #.fixnum-optimize-settings)
	244 (declare (function reader))
	245 (symbol-macrolet ((octet-getter (funcall reader)))
	246 ,@(sublis '((char-decoder . octets-to-char-code))
	247 body)))
	248 (define-sequence-readers (,lf-format-class) ,@body)
	249 (define-sequence-readers (,cr-format-class)
	250 ,(with-unique-names (char-code)
	251 `(let ((,char-code (progn ,@body)))
	252 (case ,char-code
	253 (#.+cr+ #.(char-code #\Newline))
	254 (otherwise ,char-code)))))
	255 (define-sequence-readers (,crlf-format-class)
	256 ,(with-unique-names (char-code next-char-code get-char-code)
	257 `(flet ((,get-char-code () ,@body))
	258 (let ((,char-code (,get-char-code)))
	259 (case ,char-code
	260 (#.+cr+
	261 (let ((,next-char-code (,get-char-code)))
	262 (case ,next-char-code
	263 (#.+lf+ #.(char-code #\Newline))
	264 ;; we saw a CR but no LF afterwards, but then th…
	265 ;; ended, so we just return #\Return
	266 ((nil) +cr+)
	267 ;; if the character we peeked at wasn't a
	268 ;; linefeed character we unread its constituents
	269 (otherwise (unget (code-char ,next-char-code))
	270 ,char-code))))
	271 (otherwise ,char-code)))))))))
	272
	273 (define-char-decoders (flexi-latin-1-format flexi-cr-latin-1-format flex…
	274 octet-getter)
	275
	276 (define-char-decoders (flexi-ascii-format flexi-cr-ascii-format flexi-cr…
	277 (when-let (octet octet-getter)
	278 (if (> (the octet octet) 127)
	279 (recover-from-encoding-error format
	280 "No character which corresponds to oc…
	281 octet)))
	282
	283 (define-char-decoders (flexi-8-bit-format flexi-cr-8-bit-format flexi-cr…
	284 (with-accessors ((decoding-table external-format-decoding-table))
	285 format
	286 (when-let (octet octet-getter)
	287 (let ((char-code (aref (the (simple-array char-code-integer *) dec…
	288 (the octet octet))))
	289 (if (or (null char-code)
	290 (= (the char-code-integer char-code) 65533))
	291 (recover-from-encoding-error format
	292 "No character which corresponds t…
	293 char-code)))))
	294
	295 (define-char-decoders (flexi-utf-8-format flexi-cr-utf-8-format flexi-cr…
	296 (let (first-octet-seen)
	297 (declare (boolean first-octet-seen))
	298 (macrolet ((read-next-byte ()
	299 '(prog1
	300 (or octet-getter
	301 (cond (first-octet-seen
	302 (return-from char-decoder
	303 (recover-from-encoding-error format
	304 "End of …
	305 (t (return-from char-decoder nil))))
	306 (setq first-octet-seen t))))
	307 (flet ((recover-from-overlong-sequence (value)
	308 (restart-case
	309 (recover-from-encoding-error format "`Overlong' UTF-8…
	310 value)
	311 (accept-overlong-sequence ()
	312 :report "Accept the code point and continue."
	313 value))))
	314 (let ((octet (read-next-byte)))
	315 (declare (type octet octet))
	316 (block utf-8-sequence
	317 (multiple-value-bind (start count)
	318 (cond ((not (logbitp 7 octet))
	319 ;; avoid the overlong checks below
	320 (return-from utf-8-sequence octet))
	321 ((= #b11000000 (logand* octet #b11100000))
	322 (values (logand* octet #b00011111) 1))
	323 ((= #b11100000 (logand* octet #b11110000))
	324 (values (logand* octet #b00001111) 2))
	325 ((= #b11110000 (logand* octet #b11111000))
	326 (values (logand* octet #b00000111) 3))
	327 (t (return-from char-decoder
	328 (recover-from-encoding-error format
	329 "Unexpected valu…
	330 octet))))
	331 (declare (fixnum count))
	332 (loop for result of-type code-point
	333 = start then (+ (ash* result 6)
	334 (logand* octet #b111111))
	335 repeat count
	336 for octet of-type octet = (read-next-byte)
	337 unless (= #b10000000 (logand* octet #b11000000))
	338 do (return-from char-decoder
	339 (recover-from-encoding-error format
	340 "Unexpected value …
	341 finally (return (cond ((< result (ecase count
	342 (1 #x00080)
	343 (2 #x00800)
	344 (3 #x10000)))
	345 (recover-from-overlong-sequen…
	346 (t result)))))))))))
	347
	348 (define-char-decoders (flexi-utf-16-le-format flexi-cr-utf-16-le-format …
	349 (let (first-octet-seen)
	350 (declare (boolean first-octet-seen))
	351 (macrolet ((read-next-byte ()
	352 '(prog1
	353 (or octet-getter
	354 (cond (first-octet-seen
	355 (return-from char-decoder
	356 (recover-from-encoding-error format
	357 "End of …
	358 (t (return-from char-decoder nil))))
	359 (setq first-octet-seen t))))
	360 (flet ((read-next-word ()
	361 (+ (the octet (read-next-byte))
	362 (ash* (the octet (read-next-byte)) 8))))
	363 (declare (inline read-next-word))
	364 (let ((word (read-next-word)))
	365 (declare (type (unsigned-byte 16) word))
	366 (cond ((<= #xd800 word #xdfff)
	367 (let ((next-word (read-next-word)))
	368 (declare (type (unsigned-byte 16) next-word))
	369 (unless (<= #xdc00 next-word #xdfff)
	370 (return-from char-decoder
	371 (recover-from-encoding-error format
	372 "Unexpected UTF-16 w…
	373 next-word word)))
	374 (+ (ash* (logand* #b1111111111 word) 10)
	375 (logand* #b1111111111 next-word)
	376 #x10000)))
	377 (t word)))))))
	378
	379 (define-char-decoders (flexi-utf-16-be-format flexi-cr-utf-16-be-format …
	380 (let (first-octet-seen)
	381 (declare (boolean first-octet-seen))
	382 (macrolet ((read-next-byte ()
	383 '(prog1
	384 (or octet-getter
	385 (cond (first-octet-seen
	386 (return-from char-decoder
	387 (recover-from-encoding-error format
	388 "End of …
	389 (t (return-from char-decoder nil))))
	390 (setq first-octet-seen t))))
	391 (flet ((read-next-word ()
	392 (+ (ash* (the octet (read-next-byte)) 8)
	393 (the octet (read-next-byte)))))
	394 (declare (inline read-next-word))
	395 (let ((word (read-next-word)))
	396 (declare (type (unsigned-byte 16) word))
	397 (cond ((<= #xd800 word #xdfff)
	398 (let ((next-word (read-next-word)))
	399 (declare (type (unsigned-byte 16) next-word))
	400 (unless (<= #xdc00 next-word #xdfff)
	401 (return-from char-decoder
	402 (recover-from-encoding-error format
	403 "Unexpected UTF-16 w…
	404 next-word word)))
	405 (+ (ash* (logand* #b1111111111 word) 10)
	406 (logand* #b1111111111 next-word)
	407 #x10000)))
	408 (t word)))))))
	409
	410 (define-char-decoders (flexi-utf-32-le-format flexi-cr-utf-32-le-format …
	411 (let (first-octet-seen)
	412 (declare (boolean first-octet-seen))
	413 (macrolet ((read-next-byte ()
	414 '(prog1
	415 (or octet-getter
	416 (cond (first-octet-seen
	417 (return-from char-decoder
	418 (recover-from-encoding-error format
	419 "End of …
	420 (t (return-from char-decoder nil))))
	421 (setq first-octet-seen t))))
	422 (loop for count of-type fixnum from 0 to 24 by 8
	423 for octet of-type octet = (read-next-byte)
	424 sum (ash* octet count)))))
	425
	426 (define-char-decoders (flexi-utf-32-be-format flexi-cr-utf-32-be-format …
	427 (let (first-octet-seen)
	428 (declare (boolean first-octet-seen))
	429 (macrolet ((read-next-byte ()
	430 '(prog1
	431 (or octet-getter
	432 (cond (first-octet-seen
	433 (return-from char-decoder
	434 (recover-from-encoding-error format
	435 "End of …
	436 (t (return-from char-decoder nil))))
	437 (setq first-octet-seen t))))
	438 (loop for count of-type fixnum from 24 downto 0 by 8
	439 for octet of-type octet = (read-next-byte)
	440 sum (ash* octet count)))))
	441
	442 (defmethod octets-to-char-code ((format flexi-cr-mixin) reader)
	443 (declare #.fixnum-optimize-settings)
	444 (declare (ignore reader))
	445 (let ((char-code (call-next-method)))
	446 (case char-code
	447 (#.+cr+ #.(char-code #\Newline))
	448 (otherwise char-code))))
	449
	450 (defmethod octets-to-char-code ((format flexi-crlf-mixin) reader)
	451 (declare #.fixnum-optimize-settings)
	452 (declare (function current-unreader))
	453 (declare (ignore reader))
	454 (let ((char-code (call-next-method)))
	455 (case char-code
	456 (#.+cr+
	457 (let ((next-char-code (call-next-method)))
	458 (case next-char-code
	459 (#.+lf+ #.(char-code #\Newline))
	460 ;; we saw a CR but no LF afterwards, but then the data
	461 ;; ended, so we just return #\Return
	462 ((nil) +cr+)
	463 ;; if the character we peeked at wasn't a
	464 ;; linefeed character we unread its constituents
	465 (otherwise (funcall current-unreader (code-char next-char-c…
	466 char-code))))
	467 (otherwise char-code))))
	468