strings.lisp - clic - Clic is an command line interactive client for gopher wri… | |
git clone git://bitreich.org/clic/ git://enlrupgkhuxnvlhsf6lc3fziv5h2hhfrinws65… | |
Log | |
Files | |
Refs | |
Tags | |
README | |
LICENSE | |
--- | |
strings.lisp (15562B) | |
--- | |
1 ;;;; -*- Mode: lisp; indent-tabs-mode: nil -*- | |
2 ;;; | |
3 ;;; strings.lisp --- Conversions between strings and UB8 vectors. | |
4 ;;; | |
5 ;;; Copyright (C) 2007, Luis Oliveira <[email protected]> | |
6 ;;; | |
7 ;;; Permission is hereby granted, free of charge, to any person | |
8 ;;; obtaining a copy of this software and associated documentation | |
9 ;;; files (the "Software"), to deal in the Software without | |
10 ;;; restriction, including without limitation the rights to use, copy, | |
11 ;;; modify, merge, publish, distribute, sublicense, and/or sell copies | |
12 ;;; of the Software, and to permit persons to whom the Software is | |
13 ;;; furnished to do so, subject to the following conditions: | |
14 ;;; | |
15 ;;; The above copyright notice and this permission notice shall be | |
16 ;;; included in all copies or substantial portions of the Software. | |
17 ;;; | |
18 ;;; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | |
19 ;;; EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | |
20 ;;; MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | |
21 ;;; NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT | |
22 ;;; HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, | |
23 ;;; WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
24 ;;; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER | |
25 ;;; DEALINGS IN THE SOFTWARE. | |
26 | |
27 (in-package #:babel) | |
28 | |
29 ;;; The usefulness of this string/octets interface of Babel's is very | |
30 ;;; limited on Lisps with 8-bit characters which will in effect only | |
31 ;;; support the latin-1 subset of Unicode. That is, all encodings are | |
32 ;;; supported but we can only store the first 256 code points in Lisp | |
33 ;;; strings. Support for using other 8-bit encodings for strings on | |
34 ;;; these Lisps could be added with an extra encoding/decoding step. | |
35 ;;; Supporting other encodings with larger code units would be silly | |
36 ;;; (it would break expectations about common string operations) and | |
37 ;;; better done with something like Closure's runes. | |
38 | |
39 ;;; Can we handle unicode fully? | |
40 (eval-when (:compile-toplevel :load-toplevel :execute) | |
41 ;; The EVAL is just here to avoid warnings... | |
42 (case (eval char-code-limit) | |
43 (#x100 (pushnew '8-bit-chars *features*)) | |
44 (#x10000 (pushnew 'ucs-2-chars *features*)) | |
45 (#x110000 #| yay |#) | |
46 ;; This is here mostly because if the CHAR-CODE-LIMIT is bigger | |
47 ;; than #x11000, strange things might happen but we probably | |
48 ;; shouldn't descriminate against other, smaller, values. | |
49 (t (error "Strange CHAR-CODE-LIMIT (#x~X), bailing out." | |
50 char-code-limit)))) | |
51 | |
52 ;;; Adapted from Ironclad. TODO: check if it's worthwhile adding | |
53 ;;; implementation-specific accessors such as SAP-REF-* for SBCL. | |
54 (defmacro ub-get (vector index &optional (bytes 1) (endianness :ne)) | |
55 (let ((big-endian (member endianness | |
56 '(:be #+big-endian :ne #+little-endian :re))… | |
57 (once-only (vector index) | |
58 `(logand | |
59 ,(1- (ash 1 (* 8 bytes))) | |
60 (logior | |
61 ,@(loop for i from 0 below bytes | |
62 for offset = (if big-endian i (- bytes i 1)) | |
63 for shift = (if big-endian | |
64 (* (- bytes i 1) 8) | |
65 (* offset 8)) | |
66 collect `(ash (aref ,vector (+ ,index ,offset)) ,shift)… | |
67 | |
68 (defmacro ub-set (value vector index &optional (bytes 1) (endianness :ne… | |
69 (let ((big-endian (member endianness | |
70 '(:be #+big-endian :ne #+little-endian :re))… | |
71 `(progn | |
72 ,@(loop for i from 1 to bytes | |
73 for offset = (if big-endian (- bytes i) (1- i)) collect | |
74 `(setf (aref ,vector (+ ,index ,offset)) | |
75 (ldb (byte 8 ,(* 8 (1- i))) ,value))) | |
76 (values)))) | |
77 | |
78 (defmacro string-get (string index) | |
79 `(char-code (schar ,string ,index))) | |
80 | |
81 (defmacro string-set (code string index) | |
82 `(setf (schar ,string ,index) (code-char ,code))) | |
83 | |
84 ;;; SIMPLE-BASE-STRING would also be a subtype of SIMPLE-STRING so we | |
85 ;;; don't use that because on SBCL BASE-CHARs can only hold ASCII. | |
86 ;;; Also, with (> SPEED SAFETY) (setf (schar base-str n) big-char) | |
87 ;;; will quietly work, sort of. | |
88 ;;; | |
89 ;;; XXX: test this on various lisps. | |
90 | |
91 (defconstant unicode-char-code-limit | |
92 char-code-limit | |
93 "An alias for CL:CHAR-CODE-LIMIT which might be lower than | |
94 #x110000 on some Lisps.") | |
95 | |
96 (deftype unicode-char () | |
97 "This character type can hold any characters whose CHAR-CODEs | |
98 are less than UNICODE-CHAR-CODE-LIMIT." | |
99 #+lispworks 'lw:simple-char | |
100 #-lispworks 'character) | |
101 | |
102 (deftype simple-unicode-string () | |
103 "Alias for (SIMPLE-ARRAY UNICODE-CHAR (*))." | |
104 '(simple-array unicode-char (*))) | |
105 | |
106 (deftype unicode-string () | |
107 "Alias for (VECTOR UNICODE-CHAR *)." | |
108 '(vector unicode-char *)) | |
109 | |
110 (defparameter *string-vector-mappings* | |
111 (instantiate-concrete-mappings | |
112 ;; :optimize ((speed 3) (safety 0) (debug 0) (compilation-speed 0)) | |
113 :octet-seq-setter ub-set | |
114 :octet-seq-getter ub-get | |
115 :octet-seq-type (simple-array (unsigned-byte 8) (*)) | |
116 :code-point-seq-setter string-set | |
117 :code-point-seq-getter string-get | |
118 :code-point-seq-type simple-unicode-string)) | |
119 | |
120 #+sbcl | |
121 (defparameter *simple-base-string-vector-mappings* | |
122 (instantiate-concrete-mappings | |
123 ;; :optimize ((speed 3) (safety 0) (debug 0) (compilation-speed 0)) | |
124 :instantiate-decoders nil | |
125 :octet-seq-setter ub-set | |
126 :octet-seq-getter ub-get | |
127 :octet-seq-type (simple-array (unsigned-byte 8) (*)) | |
128 :code-point-seq-setter string-set | |
129 :code-point-seq-getter string-get | |
130 :code-point-seq-type simple-base-string)) | |
131 | |
132 ;;; Do we want a more a specific error condition here? | |
133 (defun check-vector-bounds (vector start end) | |
134 (unless (<= 0 start end (length vector)) | |
135 (error "Invalid start (~A) and end (~A) values for vector of length … | |
136 start end (length vector)))) | |
137 | |
138 (defmacro with-simple-vector (((v vector) (s start) (e end)) &body body) | |
139 "If VECTOR is a displaced or adjustable array, binds V to the | |
140 underlying simple vector, adds an adequate offset to START and | |
141 END and binds those offset values to S and E. Otherwise, if | |
142 VECTOR is already a simple array, it's simply bound to V with no | |
143 further changes. | |
144 | |
145 START and END are unchecked and assumed to be within bounds. | |
146 | |
147 Note that in some Lisps, a slow copying implementation is | |
148 necessary to obtain a simple vector thus V will be bound to a | |
149 copy of VECTOR coerced to a simple-vector. Therefore, you | |
150 shouldn't attempt to modify V." | |
151 #+sbcl | |
152 `(sb-kernel:with-array-data ((,v ,vector) (,s ,start) (,e ,end)) | |
153 ,@body) | |
154 #+(or cmu scl) | |
155 `(lisp::with-array-data ((,v ,vector) (,s ,start) (,e ,end)) | |
156 ,@body) | |
157 #+openmcl | |
158 (with-unique-names (offset) | |
159 `(multiple-value-bind (,v ,offset) | |
160 (ccl::array-data-and-offset ,vector) | |
161 (let ((,s (+ ,start ,offset)) | |
162 (,e (+ ,end ,offset))) | |
163 ,@body))) | |
164 #+allegro | |
165 (with-unique-names (offset) | |
166 `(excl::with-underlying-simple-vector (,vector ,v ,offset) | |
167 (let ((,e (+ ,end ,offset)) | |
168 (,s (+ ,start ,offset))) | |
169 ,@body))) | |
170 ;; slow, copying implementation | |
171 #-(or sbcl cmu scl openmcl allegro) | |
172 (once-only (vector) | |
173 `(funcall (if (adjustable-array-p ,vector) | |
174 #'call-with-array-data/copy | |
175 #'call-with-array-data/fast) | |
176 ,vector ,start ,end | |
177 (lambda (,v ,s ,e) ,@body)))) | |
178 | |
179 #-(or sbcl cmu scl openmcl allegro) | |
180 (progn | |
181 ;; Stolen from f2cl. | |
182 (defun array-data-and-offset (array) | |
183 (loop with offset = 0 do | |
184 (multiple-value-bind (displaced-to index-offset) | |
185 (array-displacement array) | |
186 (when (null displaced-to) | |
187 (return-from array-data-and-offset | |
188 (values array offset))) | |
189 (incf offset index-offset) | |
190 (setf array displaced-to)))) | |
191 | |
192 (defun call-with-array-data/fast (vector start end fn) | |
193 (multiple-value-bind (data offset) | |
194 (array-data-and-offset vector) | |
195 (funcall fn data (+ offset start) (+ offset end)))) | |
196 | |
197 (defun call-with-array-data/copy (vector start end fn) | |
198 (funcall fn (replace (make-array (- end start) :element-type | |
199 (array-element-type vector)) | |
200 vector :start2 start :end2 end) | |
201 0 (- end start)))) | |
202 | |
203 (defmacro with-checked-simple-vector (((v vector) (s start) (e end)) &bo… | |
204 "Like WITH-SIMPLE-VECTOR but bound-checks START and END." | |
205 (once-only (vector start) | |
206 `(let ((,e (or ,end (length ,vector)))) | |
207 (check-vector-bounds ,vector ,start ,e) | |
208 (with-simple-vector ((,v ,vector) (,s ,start) (,e ,e)) | |
209 ,@body)))) | |
210 | |
211 ;;; Future features these functions should have: | |
212 ;;; | |
213 ;;; * null-terminate | |
214 ;;; * specify target vector/string + offset | |
215 ;;; * documentation :) | |
216 | |
217 (declaim (inline octets-to-string string-to-octets string-size-in-octets | |
218 vector-size-in-chars concatenate-strings-to-octets | |
219 bom-vector)) | |
220 | |
221 (defun octets-to-string (vector &key (start 0) end | |
222 (errorp (not *suppress-character-coding-errors*… | |
223 (encoding *default-character-encoding*)) | |
224 (check-type vector (vector (unsigned-byte 8))) | |
225 (with-checked-simple-vector ((vector vector) (start start) (end end)) | |
226 (declare (type (simple-array (unsigned-byte 8) (*)) vector)) | |
227 (let ((*suppress-character-coding-errors* (not errorp)) | |
228 (mapping (lookup-mapping *string-vector-mappings* encoding))) | |
229 (multiple-value-bind (size new-end) | |
230 (funcall (code-point-counter mapping) vector start end -1) | |
231 ;; TODO we could optimize ASCII here: the result should | |
232 ;; be a simple-base-string filled using code-char... | |
233 (let ((string (make-string size :element-type 'unicode-char))) | |
234 (funcall (decoder mapping) vector start new-end string 0) | |
235 string))))) | |
236 | |
237 (defun bom-vector (encoding use-bom) | |
238 (check-type use-bom (member :default t nil)) | |
239 (the simple-vector | |
240 (if (null use-bom) | |
241 #() | |
242 (let ((enc (typecase encoding | |
243 (external-format (external-format-encoding encoding… | |
244 (t (get-character-encoding encoding))))) | |
245 (if (or (eq use-bom t) | |
246 (and (eq use-bom :default) (enc-use-bom enc))) | |
247 ;; VALUES avoids a "type assertion too complex to check" n… | |
248 (values (enc-bom-encoding enc)) | |
249 #()))))) | |
250 | |
251 (defun string-to-octets (string &key (encoding *default-character-encodi… | |
252 (start 0) end (use-bom :default) | |
253 (errorp (not *suppress-character-coding-errors*… | |
254 (declare (optimize (speed 3) (safety 2))) | |
255 (let ((*suppress-character-coding-errors* (not errorp))) | |
256 (etypecase string | |
257 ;; On some lisps (e.g. clisp and ccl) all strings are BASE-STRING … | |
258 ;; characters are BASE-CHAR. So, only enable this optimization for | |
259 ;; selected targets. | |
260 #+sbcl | |
261 (simple-base-string | |
262 (unless end | |
263 (setf end (length string))) | |
264 (check-vector-bounds string start end) | |
265 (let* ((mapping (lookup-mapping *simple-base-string-vector-mappin… | |
266 encoding)) | |
267 (bom (bom-vector encoding use-bom)) | |
268 (bom-length (length bom)) | |
269 ;; OPTIMIZE: we could use the (length string) information … | |
270 ;; because it's a simple-base-string where each character … | |
271 (result (make-array | |
272 (+ (the array-index | |
273 (funcall (the function (octet-counter mappin… | |
274 string start end -1)) | |
275 bom-length) | |
276 :element-type '(unsigned-byte 8)))) | |
277 (replace result bom) | |
278 (funcall (the function (encoder mapping)) | |
279 string start end result bom-length) | |
280 result)) | |
281 (string | |
282 ;; FIXME: we shouldn't really need that coercion to UNICODE-STRING | |
283 ;; but we kind of because it's declared all over. To avoid that, | |
284 ;; we'd need different types for input and output strings. Or ma… | |
285 ;; this is not a problem; figure that out. | |
286 (with-checked-simple-vector ((string (coerce string 'unicode-stri… | |
287 (start start) (end end)) | |
288 (declare (type simple-unicode-string string)) | |
289 (let* ((mapping (lookup-mapping *string-vector-mappings* encodi… | |
290 (bom (bom-vector encoding use-bom)) | |
291 (bom-length (length bom)) | |
292 (result (make-array | |
293 (+ (the array-index | |
294 (funcall (the function (octet-counter mapp… | |
295 string start end -1)) | |
296 bom-length) | |
297 :element-type '(unsigned-byte 8)))) | |
298 (replace result bom) | |
299 (funcall (the function (encoder mapping)) | |
300 string start end result bom-length) | |
301 result)))))) | |
302 | |
303 (defun concatenate-strings-to-octets (encoding &rest strings) | |
304 "Optimized equivalent of | |
305 \(string-to-octets \(apply #'concatenate 'string strings) | |
306 :encoding encoding)" | |
307 (declare (dynamic-extent strings)) | |
308 (let* ((mapping (lookup-mapping *string-vector-mappings* encoding)) | |
309 (octet-counter (octet-counter mapping)) | |
310 (vector (make-array | |
311 (the array-index | |
312 (reduce #'+ strings | |
313 :key (lambda (string) | |
314 (funcall octet-counter | |
315 string 0 (length string) -1)… | |
316 :element-type '(unsigned-byte 8))) | |
317 (current-index 0)) | |
318 (declare (type array-index current-index)) | |
319 (dolist (string strings) | |
320 (check-type string string) | |
321 (with-checked-simple-vector ((string (coerce string 'unicode-strin… | |
322 (start 0) (end (length string))) | |
323 (declare (type simple-unicode-string string)) | |
324 (incf current-index | |
325 (funcall (encoder mapping) | |
326 string start end vector current-index)))) | |
327 vector)) | |
328 | |
329 (defun string-size-in-octets (string &key (start 0) end (max -1 maxp) | |
330 (errorp (not *suppress-character-coding-er… | |
331 (encoding *default-character-encoding*)) | |
332 (check-type string string) | |
333 (with-checked-simple-vector ((string (coerce string 'unicode-string)) | |
334 (start start) (end end)) | |
335 (declare (type simple-unicode-string string)) | |
336 (let ((mapping (lookup-mapping *string-vector-mappings* encoding)) | |
337 (*suppress-character-coding-errors* (not errorp))) | |
338 (when maxp (assert (plusp max))) | |
339 (funcall (octet-counter mapping) string start end max)))) | |
340 | |
341 (defun vector-size-in-chars (vector &key (start 0) end (max -1 maxp) | |
342 (errorp (not *suppress-character-coding-err… | |
343 (encoding *default-character-encoding*)) | |
344 (check-type vector (vector (unsigned-byte 8))) | |
345 (with-checked-simple-vector ((vector vector) (start start) (end end)) | |
346 (declare (type (simple-array (unsigned-byte 8) (*)) vector)) | |
347 (let ((mapping (lookup-mapping *string-vector-mappings* encoding)) | |
348 (*suppress-character-coding-errors* (not errorp))) | |
349 (when maxp (assert (plusp max))) | |
350 (funcall (code-point-counter mapping) vector start end max)))) | |
351 | |
352 (declaim (notinline octets-to-string string-to-octets string-size-in-oct… | |
353 vector-size-in-chars concatenate-strings-to-octets)) |