Introduction
Introduction Statistics Contact Development Disclaimer Help
tencodings.lisp - clic - Clic is an command line interactive client for gopher …
git clone git://bitreich.org/clic/ git://hg6vgqziawt5s4dj.onion/clic/
Log
Files
Refs
Tags
LICENSE
---
tencodings.lisp (22651B)
---
1 ;;;; -*- Mode: lisp; indent-tabs-mode: nil -*-
2 ;;;
3 ;;; encodings.lisp --- Character encodings and mappings.
4 ;;;
5 ;;; Copyright (C) 2007, Luis Oliveira <[email protected]>
6 ;;;
7 ;;; Permission is hereby granted, free of charge, to any person
8 ;;; obtaining a copy of this software and associated documentation
9 ;;; files (the "Software"), to deal in the Software without
10 ;;; restriction, including without limitation the rights to use, copy,
11 ;;; modify, merge, publish, distribute, sublicense, and/or sell copies
12 ;;; of the Software, and to permit persons to whom the Software is
13 ;;; furnished to do so, subject to the following conditions:
14 ;;;
15 ;;; The above copyright notice and this permission notice shall be
16 ;;; included in all copies or substantial portions of the Software.
17 ;;;
18 ;;; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 ;;; EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 ;;; MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
21 ;;; NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
22 ;;; HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
23 ;;; WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24 ;;; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 ;;; DEALINGS IN THE SOFTWARE.
26
27 (in-package #:babel-encodings)
28
29 ;;;; Character Encodings
30
31 (defclass character-encoding ()
32 ((name :initarg :name :reader enc-name
33 :initform (error "Must specify a NAME for this character encodi…
34 ;; Most of these documentation strings are taken from OpenMCL.
35 (documentation
36 :initarg :documentation :reader enc-documentation :initform nil)
37 ;; A non-exhaustive list of aliases for the encoding.
38 (aliases :initarg :aliases :initform nil :reader enc-aliases)
39 ;; Specified in bits. Usually 8, 16 or 32.
40 (code-unit-size
41 :initarg :code-unit-size :reader enc-code-unit-size :initform 8)
42 (max-units-per-char
43 :initarg :max-units-per-char :reader enc-max-units-per-char :initfor…
44 ;; If NIL, it is necessary to swap 16- and 32-bit units.
45 (native-endianness
46 :initarg :native-endianness :reader enc-native-endianness :initform …
47 ;; Code units less than this value map to themselves on input.
48 (decode-literal-code-unit-limit
49 :initarg :decode-literal-code-unit-limit :initform 0
50 :reader enc-decode-literal-code-unit-limit)
51 ;; Code points less than this value map to themselves on output.
52 (encode-literal-code-unit-limit
53 :initarg :encode-literal-code-unit-limit :initform 0
54 :reader enc-encode-literal-code-unit-limit)
55 ;; Defines whether it is necessary to prepend a byte-order-mark to
56 ;; determine the endianness.
57 (use-bom :initarg :use-bom :initform nil :reader enc-use-bom)
58 ;; How the byte-order-mark should be encoded, specified as a
59 ;; sequence of octets. NIL if it cannot be encoded.
60 (bom-encoding
61 :initarg :bom-encoding :reader enc-bom-encoding :initform nil)
62 ;; How should NUL be encoded, specified as sequence of octets.
63 (nul-encoding
64 :initarg :nul-encoding :reader enc-nul-encoding :initform #(0))
65 ;; Preferred replacement character code point.
66 (default-replacement
67 :initarg :default-replacement :reader enc-default-replacement
68 :initform #x1a)
69 ;; Does VALID-STRING => OCTETS => STRING2 guarantee a valid
70 ;; STRING2? UTF-{16,32} on little-endian plaforms don't because
71 ;; they assume different endianness on each direction.
72 (ambiguous
73 :initarg :ambiguous :reader ambiguous-encoding-p :initform nil)))
74
75 ;;; I'm too lazy to write all the identical limits twice.
76 (defmethod initialize-instance :after ((enc character-encoding)
77 &key literal-char-code-limit)
78 (when literal-char-code-limit
79 (setf (slot-value enc 'encode-literal-code-unit-limit)
80 literal-char-code-limit)
81 (setf (slot-value enc 'decode-literal-code-unit-limit)
82 literal-char-code-limit)))
83
84 #-(and)
85 (defmethod describe-object ((enc character-encoding) s)
86 "Prints out the name, aliases and documentation slots of a
87 character encoding object."
88 (with-slots (name aliases documentation) enc
89 (format s "~&~S" name)
90 (when aliases
91 (format s " [Aliases:~{ ~S~}]" aliases))
92 (format s "~&~A~%~%" documentation))
93 (call-next-method))
94
95 (defvar *supported-character-encodings* nil)
96
97 (defun list-character-encodings ()
98 "List of keyword symbols denoting supported character
99 encodings. This list does not include aliases."
100 *supported-character-encodings*)
101
102 (defvar *character-encodings* (make-hash-table :test 'eq))
103
104 (defvar *default-character-encoding* :utf-8
105 "Special variable used to determine the default character
106 encoding.")
107
108 (defun get-character-encoding (name)
109 "Lookups the character encoding denoted by the keyword symbol
110 NAME. Signals an error if one is not found. If NAME is already
111 a CHARACTER-ENCONDING object, it is returned unmodified."
112 (when (typep name 'character-encoding)
113 (return-from get-character-encoding name))
114 (when (eq name :default)
115 (setq name *default-character-encoding*))
116 (or (gethash name *character-encodings*)
117 (error "Unknown character encoding: ~S" name)))
118
119 (defmethod ambiguous-encoding-p ((encoding symbol))
120 (ambiguous-encoding-p (get-character-encoding encoding)))
121
122 (defun notice-character-encoding (enc)
123 (pushnew (enc-name enc) *supported-character-encodings*)
124 (dolist (kw (cons (enc-name enc) (enc-aliases enc)))
125 (setf (gethash kw *character-encodings*) enc))
126 (enc-name enc))
127
128 (defmacro define-character-encoding (name docstring &body options)
129 `(notice-character-encoding
130 (make-instance 'character-encoding :name ,name ,@options
131 :documentation ,docstring)))
132
133 ;;;; Mappings
134
135 ;;; TODO: describe what mappings are
136
137 (defun make-fixed-width-counter (getter type &optional (unit-size-in-bit…
138 (declare (ignore getter type))
139 (check-type unit-size-in-bits positive-fixnum)
140 (let ((unit-size-in-bytes (/ unit-size-in-bits 8)))
141 `(named-lambda fixed-width-counter (seq start end max)
142 (declare (ignore seq) (fixnum start end max))
143 ;; XXX: the result can be bigger than a fixnum when (> unit-size
144 ;; 1) and we don't want that to happen. Possible solution: signal
145 ;; a warning (hmm, make that an actual error) and truncate.
146 (if (plusp max)
147 (let ((count (the fixnum (min (floor max ,unit-size-in-bytes)
148 (the fixnum (- end start))))))
149 (values (the fixnum (* count ,unit-size-in-bytes))
150 (the fixnum (+ start count))))
151 (values (the fixnum (* (the fixnum (- end start))
152 ,unit-size-in-bytes))
153 (the fixnum end))))))
154
155 ;;; Useful to develop new encodings incrementally starting with octet
156 ;;; and code-unit counters.
157 (defun make-dummy-coder (sg st ds dt)
158 (declare (ignore sg st ds dt))
159 `(named-lambda dummy-coder (src s e dest i)
160 (declare (ignore src s e dest i))
161 (error "this encoder/decoder hasn't been implemented yet")))
162
163 ;;; TODO: document here
164 ;;;
165 ;;; ENCODER -- (lambda (src-getter src-type dest-setter dest-type) ...)
166 ;;; DECODER -- (lambda (src-getter src-type dest-setter dest-type) ...)
167 ;;;
168 ;;; OCTET-COUNTER -- (lambda (getter type) ...)
169 ;;; CODE-POINT-COUNTER -- (lambda (getter type) ...)
170 (defclass abstract-mapping ()
171 ((encoder-factory :accessor encoder-factory :initform 'make-dummy-code…
172 (decoder-factory :accessor decoder-factory :initform 'make-dummy-code…
173 (octet-counter-factory :accessor octet-counter-factory
174 :initform 'make-fixed-width-counter)
175 (code-point-counter-factory :accessor code-point-counter-factory
176 :initform 'make-fixed-width-counter)))
177
178 ;;; TODO: document these
179 ;;;
180 ;;; ENCODER -- (lambda (src start end dest d-start) ...)
181 ;;; DECODER -- (lambda (src start end dest d-start) ...)
182 ;;;
183 ;;; OCTET-COUNTER -- (lambda (seq start end max-octets) ...)
184 ;;; CODE-POINT-COUNTER -- (lambda (seq start end max-chars) ...)
185 ;;; => N-CHARS NEW-END
186 ;;; (important: describe NEW-END)
187 (defclass concrete-mapping ()
188 ((encoder :accessor encoder)
189 (decoder :accessor decoder)
190 (octet-counter :accessor octet-counter)
191 (code-point-counter :accessor code-point-counter)))
192
193 (defparameter *abstract-mappings* (make-hash-table :test 'eq))
194
195 (defun get-abstract-mapping (encoding)
196 (gethash encoding *abstract-mappings*))
197
198 (defun (setf get-abstract-mapping) (value encoding)
199 (setf (gethash encoding *abstract-mappings*) value))
200
201 (defun %register-mapping-part (encoding slot-name fn)
202 (let ((mapping (get-abstract-mapping encoding)))
203 (unless mapping
204 (setq mapping (make-instance 'abstract-mapping))
205 (setf (get-abstract-mapping encoding) mapping))
206 (setf (slot-value mapping slot-name) fn)))
207
208 ;;; See enc-*.lisp for example usages of these 4 macros.
209
210 (defmacro define-encoder (encoding (sa st da dt) &body body)
211 `(%register-mapping-part ,encoding 'encoder-factory
212 (named-lambda encoder (,sa ,st ,da ,dt)
213 ,@body)))
214
215 (defmacro define-decoder (encoding (sa st da dt) &body body)
216 `(%register-mapping-part ,encoding 'decoder-factory
217 (named-lambda decoder (,sa ,st ,da ,dt)
218 ,@body)))
219
220 (defmacro define-octet-counter (encoding (acc type) &body body)
221 `(%register-mapping-part ,encoding 'octet-counter-factory
222 (named-lambda octet-counter-factory (,acc ,ty…
223 ,@body)))
224
225 (defmacro define-code-point-counter (encoding (acc type) &body body)
226 `(%register-mapping-part ,encoding 'code-point-counter-factory
227 (named-lambda code-point-counter (,acc ,type)
228 ,@body)))
229
230 (defun instantiate-encoder (encoding am octet-seq-getter octet-seq-type
231 code-point-seq-setter code-point-seq-type)
232 (declare (ignore encoding))
233 (funcall (encoder-factory am)
234 octet-seq-getter
235 octet-seq-type
236 code-point-seq-setter
237 code-point-seq-type))
238
239 (defun instantiate-decoder (encoding am octet-seq-getter octet-seq-type
240 code-point-seq-setter code-point-seq-type)
241 (declare (ignore encoding))
242 (funcall (decoder-factory am)
243 octet-seq-getter
244 octet-seq-type
245 code-point-seq-setter
246 code-point-seq-type))
247
248 (defun instantiate-code-point-counter (encoding am octet-seq-getter
249 octet-seq-type)
250 (declare (ignore encoding))
251 (funcall (code-point-counter-factory am)
252 octet-seq-getter
253 octet-seq-type))
254
255 (defun instantiate-octet-counter (encoding am code-point-seq-getter
256 code-point-seq-type)
257 (if (= 1 (enc-max-units-per-char encoding))
258 (make-fixed-width-counter code-point-seq-getter code-point-seq-type
259 (enc-code-unit-size encoding))
260 (funcall (octet-counter-factory am)
261 code-point-seq-getter
262 code-point-seq-type)))
263
264 ;;; Expands into code generated by the available abstract mappings
265 ;;; that will be compiled into concrete mappings. This is used in
266 ;;; e.g. strings.lisp to define mappings between strings and
267 ;;; (unsigned-byte 8) vectors.
268 ;;;
269 ;;; For each encoding funcall the abstract mappings at macro-expansion
270 ;;; time with the src/dest accessors and types to generate the
271 ;;; appropriate code for the concrete mappings. These functions are
272 ;;; then saved in their respective slots of the CONCRETE-MAPPING
273 ;;; object.
274 (defmacro instantiate-concrete-mappings
275 (&key (encodings (hash-table-keys *abstract-mappings*))
276 (optimize '((speed 3) (debug 0) (compilation-speed 0)))
277 octet-seq-getter octet-seq-setter octet-seq-type
278 code-point-seq-getter code-point-seq-setter code-point-seq-type
279 (instantiate-decoders t))
280 `(let ((ht (make-hash-table :test 'eq)))
281 (declare (optimize ,@optimize)
282 #+sbcl (sb-ext:muffle-conditions sb-ext:compiler-note))
283 (flet ((notice-mapping (encoding-name cm)
284 (let* ((encoding (get-character-encoding encoding-name))
285 (aliases (enc-aliases encoding)))
286 (dolist (kw (cons (enc-name encoding) aliases))
287 (setf (gethash kw ht) cm)))))
288 ,@(loop for encoding-name in encodings
289 for encoding = (get-character-encoding encoding-name)
290 for am = (gethash encoding-name *abstract-mappings*)
291 collect
292 `(let ((cm (make-instance 'concrete-mapping)))
293 (setf (encoder cm)
294 ,(instantiate-encoder encoding am
295 code-point-seq-getter
296 code-point-seq-type
297 octet-seq-setter
298 octet-seq-type))
299 ,(when instantiate-decoders
300 `(progn
301 (setf (decoder cm)
302 ,(instantiate-decoder encoding am
303 octet-seq-getter
304 octet-seq-type
305 code-point-seq-setter
306 code-point-seq-type))
307 (setf (code-point-counter cm)
308 ,(instantiate-code-point-counter
309 encoding am octet-seq-getter octet-seq-t…
310 (setf (octet-counter cm)
311 ,(instantiate-octet-counter encoding am
312 code-point-seq-getter
313 code-point-seq-type))
314 (notice-mapping ,encoding-name cm))))
315 ht))
316
317 ;;; debugging stuff
318
319 #-(and)
320 (defun pprint-instantiate-concrete-mappings
321 (&key (encodings (hash-table-keys *abstract-mappings*))
322 (optimize '((debug 3) (safety 3)))
323 (octet-seq-setter 'ub-set) (octet-seq-getter 'ub-get)
324 (octet-seq-type '(simple-array (unsigned-byte 8) (*)))
325 (code-point-seq-setter 'string-set)
326 (code-point-seq-getter 'string-get)
327 (code-point-seq-type 'simple-unicode-string))
328 (let ((encodings (ensure-list encodings))
329 (*package* (find-package :babel-encodings))
330 (*print-case* :downcase))
331 (pprint
332 (macroexpand
333 `(instantiate-concrete-mappings
334 :encodings ,encodings
335 :optimize ,optimize
336 :octet-seq-getter ,octet-seq-getter
337 :octet-seq-setter ,octet-seq-setter
338 :octet-seq-type ,octet-seq-type
339 :code-point-seq-getter ,code-point-seq-getter
340 :code-point-seq-setter ,code-point-seq-setter
341 :code-point-seq-type ,code-point-seq-type))))
342 (values))
343
344 ;;;; Utilities used in enc-*.lisp
345
346 (defconstant +default-substitution-code-point+ #x1a
347 "Default ASCII substitution character code point used in case of an en…
348
349 ;;; We're converting between objects of the (UNSIGNED-BYTE 8) and
350 ;;; (MOD #x110000) types which are aliased here to UB8 and CODE-POINT
351 ;;; for convenience.
352 (deftype ub8 () '(unsigned-byte 8))
353 (deftype code-point () '(mod #x110000))
354
355 ;;; Utility macro around DEFINE-ENCODER that takes care of most of the
356 ;;; work need to deal with an 8-bit, fixed-width character encoding.
357 ;;;
358 ;;; BODY will be inside a loop and its return value will placed in the
359 ;;; destination buffer. BODY will be surounded by lexical BLOCK which
360 ;;; will have the ENCODING's name, usually a keyword. It handles all
361 ;;; sorts of type declarations.
362 ;;;
363 ;;; See enc-ascii.lisp for a simple usage example.
364 (defmacro define-unibyte-encoder (encoding (code) &body body)
365 (with-unique-names (s-getter s-type d-setter d-type
366 src start end dest d-start i di)
367 `(define-encoder ,encoding (,s-getter ,s-type ,d-setter ,d-type)
368 `(named-lambda ,',(symbolicate encoding '#:-unibyte-encoder)
369 (,',src ,',start ,',end ,',dest ,',d-start)
370 (declare (type ,,s-type ,',src)
371 (type ,,d-type ,',dest)
372 (fixnum ,',start ,',end ,',d-start))
373 (loop for ,',i fixnum from ,',start below ,',end
374 and ,',di fixnum from ,',d-start do
375 (,,d-setter
376 (macrolet
377 ;; this should probably be a function...
378 ((handle-error (&optional (c ''character-encoding-e…
379 `(encoding-error
380 ,',',code ,',',encoding ,',',src ,',',i
381 +default-substitution-code-point+ ,c)))
382 (let ((,',code (,,s-getter ,',src ,',i)))
383 (declare (type code-point ,',code))
384 (block ,',encoding ,@',body)))
385 ,',dest ,',di)
386 finally (return (the fixnum (- ,',di ,',d-start))))))))
387
388 ;;; The decoder version of the above macro.
389 (defmacro define-unibyte-decoder (encoding (octet) &body body)
390 (with-unique-names (s-getter s-type d-setter d-type
391 src start end dest d-start i di)
392 `(define-decoder ,encoding (,s-getter ,s-type ,d-setter ,d-type)
393 `(named-lambda ,',(symbolicate encoding '#:-unibyte-encoder)
394 (,',src ,',start ,',end ,',dest ,',d-start)
395 (declare (type ,,s-type ,',src)
396 (type ,,d-type ,',dest)
397 (fixnum ,',start ,',end ,',d-start))
398 (loop for ,',i fixnum from ,',start below ,',end
399 and ,',di fixnum from ,',d-start do
400 (,,d-setter
401 (macrolet
402 ;; this should probably be a function...
403 ((handle-error (&optional (c ''character-decoding-e…
404 `(decoding-error
405 (vector ,',',octet) ,',',encoding ,',',src ,',…
406 +default-substitution-code-point+ ,c)))
407 (let ((,',octet (,,s-getter ,',src ,',i)))
408 (declare (type ub8 ,',octet))
409 (block ,',encoding ,@',body)))
410 ,',dest ,',di)
411 finally (return (the fixnum (- ,',di ,',d-start))))))))
412
413 ;;;; Error Conditions
414 ;;;
415 ;;; For now, we don't define any actual restarts. The only mechanism
416 ;;; for "restarting" a coding error is the
417 ;;; *SUPPRESS-CHARACTER-CODING-ERRORS* special variable which, when
418 ;;; bound to T (the default), suppresses any error and uses a default
419 ;;; replacement character instead.
420 ;;;
421 ;;; If it turns out that other more options are necessary, possible
422 ;;; alternative approaches include:
423 ;;;
424 ;;; a) use a *REPLACEMENT-CHARACTER* special variable that lets us
425 ;;; pick our own replacement character. The encoder must do
426 ;;; additional work to check if this is character is encodable.
427 ;;;
428 ;;; b) offer a restart to pick a replacement character. Same
429 ;;; problem as above.
430 ;;;
431 ;;; Both approaches pose encoding problems when dealing with a
432 ;;; variable-width encodings because different replacement characters
433 ;;; will need different numbers of octets. This is not a problem for
434 ;;; UTF but will be a problem for the CJK charsets. Approach (a) is
435 ;;; nevertheless easier since the replacement character is known in
436 ;;; advance and therefore the octet-counter can account for it.
437 ;;;
438 ;;; For more complex restarts like SBCL's -- that'll let you specify
439 ;;; _several_ replacement characters for a single character error --
440 ;;; will probably need extra support code outside the encoder/decoder
441 ;;; (i.e. in the string-to-octets function, for example) since the
442 ;;; encoders/decoders deal with pre-allocated fixed-length buffers.
443 ;;;
444 ;;; SBCL has ASCII-specific (MALFORMED-ASCII) and UTF8-specific
445 ;;; errors. Why? Do we want to add some of those too?
446
447 ;;; FIXME: We used to deal with this with an extra ERRORP argument for
448 ;;; encoders, decoders, etc... Still undecided on the best way to do
449 ;;; it. We could also use a simple restart instead of this...
450 ;;;
451 ;;; In any case, this is not for the users to bind and it's not
452 ;;; exported from the BABEL package.
453 (defvar *suppress-character-coding-errors* nil
454 "If non-NIL, encoding or decoding errors are suppressed and the
455 the current character encoding's default replacement character is
456 used.")
457
458 ;;; All of Babel's error conditions are subtypes of
459 ;;; CHARACTER-CODING-ERROR. This error hierarchy is based on SBCL's.
460 (define-condition character-coding-error (error)
461 ((buffer :initarg :buffer :reader character-coding-error-buffer)
462 (position :initarg :position :reader character-coding-error-position)
463 (encoding :initarg :encoding :reader character-coding-error-encoding)…
464
465 (define-condition character-encoding-error (character-coding-error)
466 ((code :initarg :code :reader character-encoding-error-code))
467 (:report (lambda (c s)
468 (format s "Unable to encode character code point ~A as ~S."
469 (character-encoding-error-code c)
470 (character-coding-error-encoding c)))))
471
472 (declaim (inline encoding-error))
473 (defun encoding-error (code enc buf pos &optional
474 (sub +default-substitution-code-point+)
475 (e 'character-encoding-error))
476 (unless *suppress-character-coding-errors*
477 (error e :encoding enc :buffer buf :position pos :code code))
478 sub)
479
480 (define-condition character-decoding-error (character-coding-error)
481 ((octets :initarg :octets :reader character-decoding-error-octets))
482 (:report (lambda (c s)
483 (format s "Illegal ~S character starting at position ~D."
484 (character-coding-error-encoding c)
485 (character-coding-error-position c)))))
486
487 (define-condition end-of-input-in-character (character-decoding-error)
488 ()
489 (:documentation "Signalled by DECODERs or CODE-POINT-COUNTERs
490 of variable-width character encodings."))
491
492 (define-condition character-out-of-range (character-decoding-error)
493 ()
494 (:documentation
495 "Signalled when the character being decoded is out of range."))
496
497 (declaim (inline decoding-error))
498 (defun decoding-error (octets enc buf pos &optional
499 (sub +default-substitution-code-point+)
500 (e 'character-decoding-error))
501 (unless *suppress-character-coding-errors*
502 (error e :octets octets :encoding enc :buffer buf :position pos))
503 sub)
You are viewing proxied material from bitreich.org. The copyright of proxied material belongs to its original authors. Any comments or complaints in relation to proxied material should be directed to the original authors of the content concerned. Please see the disclaimer for more details.