| tencodings.lisp - clic - Clic is an command line interactive client for gopher … | |
| git clone git://bitreich.org/clic/ git://hg6vgqziawt5s4dj.onion/clic/ | |
| Log | |
| Files | |
| Refs | |
| Tags | |
| LICENSE | |
| --- | |
| tencodings.lisp (22651B) | |
| --- | |
| 1 ;;;; -*- Mode: lisp; indent-tabs-mode: nil -*- | |
| 2 ;;; | |
| 3 ;;; encodings.lisp --- Character encodings and mappings. | |
| 4 ;;; | |
| 5 ;;; Copyright (C) 2007, Luis Oliveira <[email protected]> | |
| 6 ;;; | |
| 7 ;;; Permission is hereby granted, free of charge, to any person | |
| 8 ;;; obtaining a copy of this software and associated documentation | |
| 9 ;;; files (the "Software"), to deal in the Software without | |
| 10 ;;; restriction, including without limitation the rights to use, copy, | |
| 11 ;;; modify, merge, publish, distribute, sublicense, and/or sell copies | |
| 12 ;;; of the Software, and to permit persons to whom the Software is | |
| 13 ;;; furnished to do so, subject to the following conditions: | |
| 14 ;;; | |
| 15 ;;; The above copyright notice and this permission notice shall be | |
| 16 ;;; included in all copies or substantial portions of the Software. | |
| 17 ;;; | |
| 18 ;;; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | |
| 19 ;;; EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | |
| 20 ;;; MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | |
| 21 ;;; NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT | |
| 22 ;;; HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, | |
| 23 ;;; WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
| 24 ;;; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER | |
| 25 ;;; DEALINGS IN THE SOFTWARE. | |
| 26 | |
| 27 (in-package #:babel-encodings) | |
| 28 | |
| 29 ;;;; Character Encodings | |
| 30 | |
| 31 (defclass character-encoding () | |
| 32 ((name :initarg :name :reader enc-name | |
| 33 :initform (error "Must specify a NAME for this character encodi… | |
| 34 ;; Most of these documentation strings are taken from OpenMCL. | |
| 35 (documentation | |
| 36 :initarg :documentation :reader enc-documentation :initform nil) | |
| 37 ;; A non-exhaustive list of aliases for the encoding. | |
| 38 (aliases :initarg :aliases :initform nil :reader enc-aliases) | |
| 39 ;; Specified in bits. Usually 8, 16 or 32. | |
| 40 (code-unit-size | |
| 41 :initarg :code-unit-size :reader enc-code-unit-size :initform 8) | |
| 42 (max-units-per-char | |
| 43 :initarg :max-units-per-char :reader enc-max-units-per-char :initfor… | |
| 44 ;; If NIL, it is necessary to swap 16- and 32-bit units. | |
| 45 (native-endianness | |
| 46 :initarg :native-endianness :reader enc-native-endianness :initform … | |
| 47 ;; Code units less than this value map to themselves on input. | |
| 48 (decode-literal-code-unit-limit | |
| 49 :initarg :decode-literal-code-unit-limit :initform 0 | |
| 50 :reader enc-decode-literal-code-unit-limit) | |
| 51 ;; Code points less than this value map to themselves on output. | |
| 52 (encode-literal-code-unit-limit | |
| 53 :initarg :encode-literal-code-unit-limit :initform 0 | |
| 54 :reader enc-encode-literal-code-unit-limit) | |
| 55 ;; Defines whether it is necessary to prepend a byte-order-mark to | |
| 56 ;; determine the endianness. | |
| 57 (use-bom :initarg :use-bom :initform nil :reader enc-use-bom) | |
| 58 ;; How the byte-order-mark should be encoded, specified as a | |
| 59 ;; sequence of octets. NIL if it cannot be encoded. | |
| 60 (bom-encoding | |
| 61 :initarg :bom-encoding :reader enc-bom-encoding :initform nil) | |
| 62 ;; How should NUL be encoded, specified as sequence of octets. | |
| 63 (nul-encoding | |
| 64 :initarg :nul-encoding :reader enc-nul-encoding :initform #(0)) | |
| 65 ;; Preferred replacement character code point. | |
| 66 (default-replacement | |
| 67 :initarg :default-replacement :reader enc-default-replacement | |
| 68 :initform #x1a) | |
| 69 ;; Does VALID-STRING => OCTETS => STRING2 guarantee a valid | |
| 70 ;; STRING2? UTF-{16,32} on little-endian plaforms don't because | |
| 71 ;; they assume different endianness on each direction. | |
| 72 (ambiguous | |
| 73 :initarg :ambiguous :reader ambiguous-encoding-p :initform nil))) | |
| 74 | |
| 75 ;;; I'm too lazy to write all the identical limits twice. | |
| 76 (defmethod initialize-instance :after ((enc character-encoding) | |
| 77 &key literal-char-code-limit) | |
| 78 (when literal-char-code-limit | |
| 79 (setf (slot-value enc 'encode-literal-code-unit-limit) | |
| 80 literal-char-code-limit) | |
| 81 (setf (slot-value enc 'decode-literal-code-unit-limit) | |
| 82 literal-char-code-limit))) | |
| 83 | |
| 84 #-(and) | |
| 85 (defmethod describe-object ((enc character-encoding) s) | |
| 86 "Prints out the name, aliases and documentation slots of a | |
| 87 character encoding object." | |
| 88 (with-slots (name aliases documentation) enc | |
| 89 (format s "~&~S" name) | |
| 90 (when aliases | |
| 91 (format s " [Aliases:~{ ~S~}]" aliases)) | |
| 92 (format s "~&~A~%~%" documentation)) | |
| 93 (call-next-method)) | |
| 94 | |
| 95 (defvar *supported-character-encodings* nil) | |
| 96 | |
| 97 (defun list-character-encodings () | |
| 98 "List of keyword symbols denoting supported character | |
| 99 encodings. This list does not include aliases." | |
| 100 *supported-character-encodings*) | |
| 101 | |
| 102 (defvar *character-encodings* (make-hash-table :test 'eq)) | |
| 103 | |
| 104 (defvar *default-character-encoding* :utf-8 | |
| 105 "Special variable used to determine the default character | |
| 106 encoding.") | |
| 107 | |
| 108 (defun get-character-encoding (name) | |
| 109 "Lookups the character encoding denoted by the keyword symbol | |
| 110 NAME. Signals an error if one is not found. If NAME is already | |
| 111 a CHARACTER-ENCONDING object, it is returned unmodified." | |
| 112 (when (typep name 'character-encoding) | |
| 113 (return-from get-character-encoding name)) | |
| 114 (when (eq name :default) | |
| 115 (setq name *default-character-encoding*)) | |
| 116 (or (gethash name *character-encodings*) | |
| 117 (error "Unknown character encoding: ~S" name))) | |
| 118 | |
| 119 (defmethod ambiguous-encoding-p ((encoding symbol)) | |
| 120 (ambiguous-encoding-p (get-character-encoding encoding))) | |
| 121 | |
| 122 (defun notice-character-encoding (enc) | |
| 123 (pushnew (enc-name enc) *supported-character-encodings*) | |
| 124 (dolist (kw (cons (enc-name enc) (enc-aliases enc))) | |
| 125 (setf (gethash kw *character-encodings*) enc)) | |
| 126 (enc-name enc)) | |
| 127 | |
| 128 (defmacro define-character-encoding (name docstring &body options) | |
| 129 `(notice-character-encoding | |
| 130 (make-instance 'character-encoding :name ,name ,@options | |
| 131 :documentation ,docstring))) | |
| 132 | |
| 133 ;;;; Mappings | |
| 134 | |
| 135 ;;; TODO: describe what mappings are | |
| 136 | |
| 137 (defun make-fixed-width-counter (getter type &optional (unit-size-in-bit… | |
| 138 (declare (ignore getter type)) | |
| 139 (check-type unit-size-in-bits positive-fixnum) | |
| 140 (let ((unit-size-in-bytes (/ unit-size-in-bits 8))) | |
| 141 `(named-lambda fixed-width-counter (seq start end max) | |
| 142 (declare (ignore seq) (fixnum start end max)) | |
| 143 ;; XXX: the result can be bigger than a fixnum when (> unit-size | |
| 144 ;; 1) and we don't want that to happen. Possible solution: signal | |
| 145 ;; a warning (hmm, make that an actual error) and truncate. | |
| 146 (if (plusp max) | |
| 147 (let ((count (the fixnum (min (floor max ,unit-size-in-bytes) | |
| 148 (the fixnum (- end start)))))) | |
| 149 (values (the fixnum (* count ,unit-size-in-bytes)) | |
| 150 (the fixnum (+ start count)))) | |
| 151 (values (the fixnum (* (the fixnum (- end start)) | |
| 152 ,unit-size-in-bytes)) | |
| 153 (the fixnum end)))))) | |
| 154 | |
| 155 ;;; Useful to develop new encodings incrementally starting with octet | |
| 156 ;;; and code-unit counters. | |
| 157 (defun make-dummy-coder (sg st ds dt) | |
| 158 (declare (ignore sg st ds dt)) | |
| 159 `(named-lambda dummy-coder (src s e dest i) | |
| 160 (declare (ignore src s e dest i)) | |
| 161 (error "this encoder/decoder hasn't been implemented yet"))) | |
| 162 | |
| 163 ;;; TODO: document here | |
| 164 ;;; | |
| 165 ;;; ENCODER -- (lambda (src-getter src-type dest-setter dest-type) ...) | |
| 166 ;;; DECODER -- (lambda (src-getter src-type dest-setter dest-type) ...) | |
| 167 ;;; | |
| 168 ;;; OCTET-COUNTER -- (lambda (getter type) ...) | |
| 169 ;;; CODE-POINT-COUNTER -- (lambda (getter type) ...) | |
| 170 (defclass abstract-mapping () | |
| 171 ((encoder-factory :accessor encoder-factory :initform 'make-dummy-code… | |
| 172 (decoder-factory :accessor decoder-factory :initform 'make-dummy-code… | |
| 173 (octet-counter-factory :accessor octet-counter-factory | |
| 174 :initform 'make-fixed-width-counter) | |
| 175 (code-point-counter-factory :accessor code-point-counter-factory | |
| 176 :initform 'make-fixed-width-counter))) | |
| 177 | |
| 178 ;;; TODO: document these | |
| 179 ;;; | |
| 180 ;;; ENCODER -- (lambda (src start end dest d-start) ...) | |
| 181 ;;; DECODER -- (lambda (src start end dest d-start) ...) | |
| 182 ;;; | |
| 183 ;;; OCTET-COUNTER -- (lambda (seq start end max-octets) ...) | |
| 184 ;;; CODE-POINT-COUNTER -- (lambda (seq start end max-chars) ...) | |
| 185 ;;; => N-CHARS NEW-END | |
| 186 ;;; (important: describe NEW-END) | |
| 187 (defclass concrete-mapping () | |
| 188 ((encoder :accessor encoder) | |
| 189 (decoder :accessor decoder) | |
| 190 (octet-counter :accessor octet-counter) | |
| 191 (code-point-counter :accessor code-point-counter))) | |
| 192 | |
| 193 (defparameter *abstract-mappings* (make-hash-table :test 'eq)) | |
| 194 | |
| 195 (defun get-abstract-mapping (encoding) | |
| 196 (gethash encoding *abstract-mappings*)) | |
| 197 | |
| 198 (defun (setf get-abstract-mapping) (value encoding) | |
| 199 (setf (gethash encoding *abstract-mappings*) value)) | |
| 200 | |
| 201 (defun %register-mapping-part (encoding slot-name fn) | |
| 202 (let ((mapping (get-abstract-mapping encoding))) | |
| 203 (unless mapping | |
| 204 (setq mapping (make-instance 'abstract-mapping)) | |
| 205 (setf (get-abstract-mapping encoding) mapping)) | |
| 206 (setf (slot-value mapping slot-name) fn))) | |
| 207 | |
| 208 ;;; See enc-*.lisp for example usages of these 4 macros. | |
| 209 | |
| 210 (defmacro define-encoder (encoding (sa st da dt) &body body) | |
| 211 `(%register-mapping-part ,encoding 'encoder-factory | |
| 212 (named-lambda encoder (,sa ,st ,da ,dt) | |
| 213 ,@body))) | |
| 214 | |
| 215 (defmacro define-decoder (encoding (sa st da dt) &body body) | |
| 216 `(%register-mapping-part ,encoding 'decoder-factory | |
| 217 (named-lambda decoder (,sa ,st ,da ,dt) | |
| 218 ,@body))) | |
| 219 | |
| 220 (defmacro define-octet-counter (encoding (acc type) &body body) | |
| 221 `(%register-mapping-part ,encoding 'octet-counter-factory | |
| 222 (named-lambda octet-counter-factory (,acc ,ty… | |
| 223 ,@body))) | |
| 224 | |
| 225 (defmacro define-code-point-counter (encoding (acc type) &body body) | |
| 226 `(%register-mapping-part ,encoding 'code-point-counter-factory | |
| 227 (named-lambda code-point-counter (,acc ,type) | |
| 228 ,@body))) | |
| 229 | |
| 230 (defun instantiate-encoder (encoding am octet-seq-getter octet-seq-type | |
| 231 code-point-seq-setter code-point-seq-type) | |
| 232 (declare (ignore encoding)) | |
| 233 (funcall (encoder-factory am) | |
| 234 octet-seq-getter | |
| 235 octet-seq-type | |
| 236 code-point-seq-setter | |
| 237 code-point-seq-type)) | |
| 238 | |
| 239 (defun instantiate-decoder (encoding am octet-seq-getter octet-seq-type | |
| 240 code-point-seq-setter code-point-seq-type) | |
| 241 (declare (ignore encoding)) | |
| 242 (funcall (decoder-factory am) | |
| 243 octet-seq-getter | |
| 244 octet-seq-type | |
| 245 code-point-seq-setter | |
| 246 code-point-seq-type)) | |
| 247 | |
| 248 (defun instantiate-code-point-counter (encoding am octet-seq-getter | |
| 249 octet-seq-type) | |
| 250 (declare (ignore encoding)) | |
| 251 (funcall (code-point-counter-factory am) | |
| 252 octet-seq-getter | |
| 253 octet-seq-type)) | |
| 254 | |
| 255 (defun instantiate-octet-counter (encoding am code-point-seq-getter | |
| 256 code-point-seq-type) | |
| 257 (if (= 1 (enc-max-units-per-char encoding)) | |
| 258 (make-fixed-width-counter code-point-seq-getter code-point-seq-type | |
| 259 (enc-code-unit-size encoding)) | |
| 260 (funcall (octet-counter-factory am) | |
| 261 code-point-seq-getter | |
| 262 code-point-seq-type))) | |
| 263 | |
| 264 ;;; Expands into code generated by the available abstract mappings | |
| 265 ;;; that will be compiled into concrete mappings. This is used in | |
| 266 ;;; e.g. strings.lisp to define mappings between strings and | |
| 267 ;;; (unsigned-byte 8) vectors. | |
| 268 ;;; | |
| 269 ;;; For each encoding funcall the abstract mappings at macro-expansion | |
| 270 ;;; time with the src/dest accessors and types to generate the | |
| 271 ;;; appropriate code for the concrete mappings. These functions are | |
| 272 ;;; then saved in their respective slots of the CONCRETE-MAPPING | |
| 273 ;;; object. | |
| 274 (defmacro instantiate-concrete-mappings | |
| 275 (&key (encodings (hash-table-keys *abstract-mappings*)) | |
| 276 (optimize '((speed 3) (debug 0) (compilation-speed 0))) | |
| 277 octet-seq-getter octet-seq-setter octet-seq-type | |
| 278 code-point-seq-getter code-point-seq-setter code-point-seq-type | |
| 279 (instantiate-decoders t)) | |
| 280 `(let ((ht (make-hash-table :test 'eq))) | |
| 281 (declare (optimize ,@optimize) | |
| 282 #+sbcl (sb-ext:muffle-conditions sb-ext:compiler-note)) | |
| 283 (flet ((notice-mapping (encoding-name cm) | |
| 284 (let* ((encoding (get-character-encoding encoding-name)) | |
| 285 (aliases (enc-aliases encoding))) | |
| 286 (dolist (kw (cons (enc-name encoding) aliases)) | |
| 287 (setf (gethash kw ht) cm))))) | |
| 288 ,@(loop for encoding-name in encodings | |
| 289 for encoding = (get-character-encoding encoding-name) | |
| 290 for am = (gethash encoding-name *abstract-mappings*) | |
| 291 collect | |
| 292 `(let ((cm (make-instance 'concrete-mapping))) | |
| 293 (setf (encoder cm) | |
| 294 ,(instantiate-encoder encoding am | |
| 295 code-point-seq-getter | |
| 296 code-point-seq-type | |
| 297 octet-seq-setter | |
| 298 octet-seq-type)) | |
| 299 ,(when instantiate-decoders | |
| 300 `(progn | |
| 301 (setf (decoder cm) | |
| 302 ,(instantiate-decoder encoding am | |
| 303 octet-seq-getter | |
| 304 octet-seq-type | |
| 305 code-point-seq-setter | |
| 306 code-point-seq-type)) | |
| 307 (setf (code-point-counter cm) | |
| 308 ,(instantiate-code-point-counter | |
| 309 encoding am octet-seq-getter octet-seq-t… | |
| 310 (setf (octet-counter cm) | |
| 311 ,(instantiate-octet-counter encoding am | |
| 312 code-point-seq-getter | |
| 313 code-point-seq-type)) | |
| 314 (notice-mapping ,encoding-name cm)))) | |
| 315 ht)) | |
| 316 | |
| 317 ;;; debugging stuff | |
| 318 | |
| 319 #-(and) | |
| 320 (defun pprint-instantiate-concrete-mappings | |
| 321 (&key (encodings (hash-table-keys *abstract-mappings*)) | |
| 322 (optimize '((debug 3) (safety 3))) | |
| 323 (octet-seq-setter 'ub-set) (octet-seq-getter 'ub-get) | |
| 324 (octet-seq-type '(simple-array (unsigned-byte 8) (*))) | |
| 325 (code-point-seq-setter 'string-set) | |
| 326 (code-point-seq-getter 'string-get) | |
| 327 (code-point-seq-type 'simple-unicode-string)) | |
| 328 (let ((encodings (ensure-list encodings)) | |
| 329 (*package* (find-package :babel-encodings)) | |
| 330 (*print-case* :downcase)) | |
| 331 (pprint | |
| 332 (macroexpand | |
| 333 `(instantiate-concrete-mappings | |
| 334 :encodings ,encodings | |
| 335 :optimize ,optimize | |
| 336 :octet-seq-getter ,octet-seq-getter | |
| 337 :octet-seq-setter ,octet-seq-setter | |
| 338 :octet-seq-type ,octet-seq-type | |
| 339 :code-point-seq-getter ,code-point-seq-getter | |
| 340 :code-point-seq-setter ,code-point-seq-setter | |
| 341 :code-point-seq-type ,code-point-seq-type)))) | |
| 342 (values)) | |
| 343 | |
| 344 ;;;; Utilities used in enc-*.lisp | |
| 345 | |
| 346 (defconstant +default-substitution-code-point+ #x1a | |
| 347 "Default ASCII substitution character code point used in case of an en… | |
| 348 | |
| 349 ;;; We're converting between objects of the (UNSIGNED-BYTE 8) and | |
| 350 ;;; (MOD #x110000) types which are aliased here to UB8 and CODE-POINT | |
| 351 ;;; for convenience. | |
| 352 (deftype ub8 () '(unsigned-byte 8)) | |
| 353 (deftype code-point () '(mod #x110000)) | |
| 354 | |
| 355 ;;; Utility macro around DEFINE-ENCODER that takes care of most of the | |
| 356 ;;; work need to deal with an 8-bit, fixed-width character encoding. | |
| 357 ;;; | |
| 358 ;;; BODY will be inside a loop and its return value will placed in the | |
| 359 ;;; destination buffer. BODY will be surounded by lexical BLOCK which | |
| 360 ;;; will have the ENCODING's name, usually a keyword. It handles all | |
| 361 ;;; sorts of type declarations. | |
| 362 ;;; | |
| 363 ;;; See enc-ascii.lisp for a simple usage example. | |
| 364 (defmacro define-unibyte-encoder (encoding (code) &body body) | |
| 365 (with-unique-names (s-getter s-type d-setter d-type | |
| 366 src start end dest d-start i di) | |
| 367 `(define-encoder ,encoding (,s-getter ,s-type ,d-setter ,d-type) | |
| 368 `(named-lambda ,',(symbolicate encoding '#:-unibyte-encoder) | |
| 369 (,',src ,',start ,',end ,',dest ,',d-start) | |
| 370 (declare (type ,,s-type ,',src) | |
| 371 (type ,,d-type ,',dest) | |
| 372 (fixnum ,',start ,',end ,',d-start)) | |
| 373 (loop for ,',i fixnum from ,',start below ,',end | |
| 374 and ,',di fixnum from ,',d-start do | |
| 375 (,,d-setter | |
| 376 (macrolet | |
| 377 ;; this should probably be a function... | |
| 378 ((handle-error (&optional (c ''character-encoding-e… | |
| 379 `(encoding-error | |
| 380 ,',',code ,',',encoding ,',',src ,',',i | |
| 381 +default-substitution-code-point+ ,c))) | |
| 382 (let ((,',code (,,s-getter ,',src ,',i))) | |
| 383 (declare (type code-point ,',code)) | |
| 384 (block ,',encoding ,@',body))) | |
| 385 ,',dest ,',di) | |
| 386 finally (return (the fixnum (- ,',di ,',d-start)))))))) | |
| 387 | |
| 388 ;;; The decoder version of the above macro. | |
| 389 (defmacro define-unibyte-decoder (encoding (octet) &body body) | |
| 390 (with-unique-names (s-getter s-type d-setter d-type | |
| 391 src start end dest d-start i di) | |
| 392 `(define-decoder ,encoding (,s-getter ,s-type ,d-setter ,d-type) | |
| 393 `(named-lambda ,',(symbolicate encoding '#:-unibyte-encoder) | |
| 394 (,',src ,',start ,',end ,',dest ,',d-start) | |
| 395 (declare (type ,,s-type ,',src) | |
| 396 (type ,,d-type ,',dest) | |
| 397 (fixnum ,',start ,',end ,',d-start)) | |
| 398 (loop for ,',i fixnum from ,',start below ,',end | |
| 399 and ,',di fixnum from ,',d-start do | |
| 400 (,,d-setter | |
| 401 (macrolet | |
| 402 ;; this should probably be a function... | |
| 403 ((handle-error (&optional (c ''character-decoding-e… | |
| 404 `(decoding-error | |
| 405 (vector ,',',octet) ,',',encoding ,',',src ,',… | |
| 406 +default-substitution-code-point+ ,c))) | |
| 407 (let ((,',octet (,,s-getter ,',src ,',i))) | |
| 408 (declare (type ub8 ,',octet)) | |
| 409 (block ,',encoding ,@',body))) | |
| 410 ,',dest ,',di) | |
| 411 finally (return (the fixnum (- ,',di ,',d-start)))))))) | |
| 412 | |
| 413 ;;;; Error Conditions | |
| 414 ;;; | |
| 415 ;;; For now, we don't define any actual restarts. The only mechanism | |
| 416 ;;; for "restarting" a coding error is the | |
| 417 ;;; *SUPPRESS-CHARACTER-CODING-ERRORS* special variable which, when | |
| 418 ;;; bound to T (the default), suppresses any error and uses a default | |
| 419 ;;; replacement character instead. | |
| 420 ;;; | |
| 421 ;;; If it turns out that other more options are necessary, possible | |
| 422 ;;; alternative approaches include: | |
| 423 ;;; | |
| 424 ;;; a) use a *REPLACEMENT-CHARACTER* special variable that lets us | |
| 425 ;;; pick our own replacement character. The encoder must do | |
| 426 ;;; additional work to check if this is character is encodable. | |
| 427 ;;; | |
| 428 ;;; b) offer a restart to pick a replacement character. Same | |
| 429 ;;; problem as above. | |
| 430 ;;; | |
| 431 ;;; Both approaches pose encoding problems when dealing with a | |
| 432 ;;; variable-width encodings because different replacement characters | |
| 433 ;;; will need different numbers of octets. This is not a problem for | |
| 434 ;;; UTF but will be a problem for the CJK charsets. Approach (a) is | |
| 435 ;;; nevertheless easier since the replacement character is known in | |
| 436 ;;; advance and therefore the octet-counter can account for it. | |
| 437 ;;; | |
| 438 ;;; For more complex restarts like SBCL's -- that'll let you specify | |
| 439 ;;; _several_ replacement characters for a single character error -- | |
| 440 ;;; will probably need extra support code outside the encoder/decoder | |
| 441 ;;; (i.e. in the string-to-octets function, for example) since the | |
| 442 ;;; encoders/decoders deal with pre-allocated fixed-length buffers. | |
| 443 ;;; | |
| 444 ;;; SBCL has ASCII-specific (MALFORMED-ASCII) and UTF8-specific | |
| 445 ;;; errors. Why? Do we want to add some of those too? | |
| 446 | |
| 447 ;;; FIXME: We used to deal with this with an extra ERRORP argument for | |
| 448 ;;; encoders, decoders, etc... Still undecided on the best way to do | |
| 449 ;;; it. We could also use a simple restart instead of this... | |
| 450 ;;; | |
| 451 ;;; In any case, this is not for the users to bind and it's not | |
| 452 ;;; exported from the BABEL package. | |
| 453 (defvar *suppress-character-coding-errors* nil | |
| 454 "If non-NIL, encoding or decoding errors are suppressed and the | |
| 455 the current character encoding's default replacement character is | |
| 456 used.") | |
| 457 | |
| 458 ;;; All of Babel's error conditions are subtypes of | |
| 459 ;;; CHARACTER-CODING-ERROR. This error hierarchy is based on SBCL's. | |
| 460 (define-condition character-coding-error (error) | |
| 461 ((buffer :initarg :buffer :reader character-coding-error-buffer) | |
| 462 (position :initarg :position :reader character-coding-error-position) | |
| 463 (encoding :initarg :encoding :reader character-coding-error-encoding)… | |
| 464 | |
| 465 (define-condition character-encoding-error (character-coding-error) | |
| 466 ((code :initarg :code :reader character-encoding-error-code)) | |
| 467 (:report (lambda (c s) | |
| 468 (format s "Unable to encode character code point ~A as ~S." | |
| 469 (character-encoding-error-code c) | |
| 470 (character-coding-error-encoding c))))) | |
| 471 | |
| 472 (declaim (inline encoding-error)) | |
| 473 (defun encoding-error (code enc buf pos &optional | |
| 474 (sub +default-substitution-code-point+) | |
| 475 (e 'character-encoding-error)) | |
| 476 (unless *suppress-character-coding-errors* | |
| 477 (error e :encoding enc :buffer buf :position pos :code code)) | |
| 478 sub) | |
| 479 | |
| 480 (define-condition character-decoding-error (character-coding-error) | |
| 481 ((octets :initarg :octets :reader character-decoding-error-octets)) | |
| 482 (:report (lambda (c s) | |
| 483 (format s "Illegal ~S character starting at position ~D." | |
| 484 (character-coding-error-encoding c) | |
| 485 (character-coding-error-position c))))) | |
| 486 | |
| 487 (define-condition end-of-input-in-character (character-decoding-error) | |
| 488 () | |
| 489 (:documentation "Signalled by DECODERs or CODE-POINT-COUNTERs | |
| 490 of variable-width character encodings.")) | |
| 491 | |
| 492 (define-condition character-out-of-range (character-decoding-error) | |
| 493 () | |
| 494 (:documentation | |
| 495 "Signalled when the character being decoded is out of range.")) | |
| 496 | |
| 497 (declaim (inline decoding-error)) | |
| 498 (defun decoding-error (octets enc buf pos &optional | |
| 499 (sub +default-substitution-code-point+) | |
| 500 (e 'character-decoding-error)) | |
| 501 (unless *suppress-character-coding-errors* | |
| 502 (error e :octets octets :encoding enc :buffer buf :position pos)) | |
| 503 sub) |