Introduction
Introduction Statistics Contact Development Disclaimer Help
tenc-gbk.lisp - clic - Clic is an command line interactive client for gopher wr…
git clone git://bitreich.org/clic/ git://hg6vgqziawt5s4dj.onion/clic/
Log
Files
Refs
Tags
LICENSE
---
tenc-gbk.lisp (8297B)
---
1 ;;;; -*- Mode: lisp; indent-tabs-mode: nil -*-
2 ;;;
3 ;;; enc-gbk.lisp --- GBK encodings.
4 ;;;
5 ;;; Copyright (C) 2011, Li Wenpeng <[email protected]>
6 ;;;
7 ;;; Permission is hereby granted, free of charge, to any person
8 ;;; obtaining a copy of this software and associated documentation
9 ;;; files (the "Software"), to deal in the Software without
10 ;;; restriction, including without limitation the rights to use, copy,
11 ;;; modify, merge, publish, distribute, sublicense, and/or sell copies
12 ;;; of the Software, and to permit persons to whom the Software is
13 ;;; furnished to do so, subject to the following conditions:
14 ;;;
15 ;;; The above copyright notice and this permission notice shall be
16 ;;; included in all copies or substantial portions of the Software.
17 ;;;
18 ;;; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 ;;; EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 ;;; MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
21 ;;; NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
22 ;;; HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
23 ;;; WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24 ;;; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 ;;; DEALINGS IN THE SOFTWARE.
26
27 (in-package #:babel-encodings)
28
29 ;; populated in gbk-map.lisp
30 (defvar *gbk-unicode-mapping*)
31
32 (defconstant +gbk2-offset+ 0)
33 (defconstant +gbk3-offset+ 6763)
34 (defconstant +gbk4-offset+ (+ 6763 6080))
35 (defconstant +gbk1-offset+ 20902)
36 (defconstant +gbk5-offset+ (+ 20902 846))
37
38 (define-character-encoding :gbk
39 "GBK is an extension of the GB2312 character set for simplified
40 Chinese characters, used in the People's Republic of China."
41 :max-units-per-char 4
42 :literal-char-code-limit #x80)
43
44 (define-condition invalid-gbk-byte (character-decoding-error)
45 ()
46 (:documentation "Signalled when an invalid GBK byte is found."))
47
48 (define-condition invalid-gbk-character (character-encoding-error)
49 ()
50 (:documentation "Signalled when an invalid GBK character is found."))
51
52 (define-octet-counter :gbk (getter type)
53 `(lambda (seq start end max)
54 (declare (type ,type seq) (fixnum start end max))
55 (let ((noctets 0))
56 (loop for i from start below end
57 for u1 of-type code-point = (,getter seq i)
58 do (cond ((< u1 #x80) (incf noctets))
59 (t (incf noctets 2)))
60 (when (and (plusp max) (= noctets max))
61 (return (values noctets i)))
62 finally (return (values noctets i))))))
63
64 (define-code-point-counter :gbk (getter type)
65 `(lambda (seq start end max)
66 (declare (type ,type seq))
67 (let (u1 (noctets 0))
68 (loop with i = start
69 while (< i end)
70 do (setf u1 (,getter seq i))
71 (cond
72 ((eq 0 (logand u1 #x80)) (incf i))
73 (t (incf i 2)))
74 (incf noctets)
75 (when (and (plusp max) (= noctets max))
76 (return (values noctets i)))
77 finally (return (values noctets i))))))
78
79 (define-encoder :gbk (getter src-type setter dest-type)
80 `(lambda (src start end dest d-start)
81 (declare (type ,src-type src)
82 (type ,dest-type dest)
83 (fixnum start end d-start))
84 (macrolet
85 ((do-encoding (index)
86 `(let ((u1 0) (u2 0))
87 (cond
88 ((<= +gbk2-offset+ ,index (- +gbk3-offset+ 1)) ; gbk/2
89 (setf u1 (+ #xB0 (truncate (/ ,index 94))))
90 (setf u2 (+ #xA1 (mod ,index 94))))
91 ((<= +gbk3-offset+ ,index (- +gbk4-offset+ 1)) ; gbk/3
92 (setf index (- ,index +gbk3-offset+))
93 (setf u1 (+ #x81 (truncate (/ ,index 190))))
94 (setf u2 (+ #x40 (mod ,index 190)))
95 (if (>= u2 #x7F) (incf u2)))
96 ((<= +gbk4-offset+ ,index (- +gbk1-offset+ 1)) ; gbk/4
97 (setf index (- ,index +gbk4-offset+))
98 (setf u1 (+ #xAA (truncate (/ ,index 96))))
99 (setf u2 (+ #x40 (mod ,index 96)))
100 (if (>= u2 #x7F) (incf u2)))
101 ((<= +gbk1-offset+ ,index (- +gbk5-offset+ 1)) ; gbk/1
102 (setf index (- ,index +gbk1-offset+))
103 (setf u1 (+ #xA1 (truncate (/ ,index 94))))
104 (setf u2 (+ #xA1 (mod ,index 94))))
105 ((<= +gbk5-offset+ ,index (length *gbk-unicode-mapping*…
106 (setf index (- ,index +gbk5-offset+))
107 (setf u1 (+ #xA8 (truncate (/ ,index 96))))
108 (setf u2 (+ #x40 (mod ,index 96)))
109 (if (>= u2 #x7F) (incf u2))))
110 (values u1 u2))))
111 (let ((c 0) index (noctets 0))
112 (loop for i from start below end
113 for code of-type code-point = (,getter src i)
114 do (macrolet
115 ((handle-error (&optional (c 'character-encoding-e…
116 `(encoding-error code :gbk src i +repl+ ',c)))
117 (setf c (code-char code))
118 (cond
119 ((< code #x80) ; ascii
120 (,setter code dest noctets)
121 (incf noctets))
122 (t ; gbk
123 (setf index
124 (position c *gbk-unicode-mapping*))
125
126 (if (not index)
127 (handle-error invalid-gbk-character))
128 (multiple-value-bind (uh ul) (do-encoding index)
129 (,setter uh dest noctets)
130 (,setter ul dest (+ 1 noctets))
131 (incf noctets 2)))))
132 finally (return (the fixnum (- noctets d-start))))))))
133
134 (define-decoder :gbk (getter src-type setter dest-type)
135 `(lambda (src start end dest d-start)
136 (declare (type ,src-type src)
137 (type ,dest-type dest))
138 (let ((u1 0) (u2 0) (index 0) (tmp 0) (noctets 0))
139 (loop with i = start
140 while (< i end)
141 do (macrolet
142 ((handle-error (&optional (c 'character-decoding-err…
143 `(decoding-error #(u1 u2) :gbk src i +repl+ ',c)))
144 (setf u1 (,getter src i))
145 (incf i)
146 (cond
147 ((eq 0 (logand u1 #x80))
148 (,setter u1 dest noctets))
149 (t
150 (setf u2 (,getter src i))
151 (incf i)
152 (setf index
153 (block setter-block
154 (cond
155 ((and (<= #xB0 u1 #xF7) (<= #xA1 u2 #xFE))
156 (+ +gbk2-offset+ (+ (* 94 (- u1 #xB0)) (…
157
158 ((and (<= #x81 u1 #xA0) (<= #x40 u2 #xFE))
159 (cond ((> u2 #x7F) (setf tmp 1))
160 (t (setf tmp 0)))
161 (+ +gbk3-offset+ (* 190 (- u1 #x81)) (- …
162
163 ((and (<= #xAA u1 #xFE) (<= #x40 #xA0))
164 (cond ((> u2 #x7F) (setf tmp 1))
165 (t (setf tmp 0)))
166 (+ +gbk4-offset+ (* 96 (- u1 #xAA)) (- u…
167
168 ((and (<= #xA1 u1 #xA9) (<= #xA1 u2 #xFE))
169 (+ +gbk1-offset+ (* 94 (- u1 #xA1)) (- u…
170
171 ((and (<= #xA8 u1 #xA9) (<= #x40 #xA0))
172 (cond ((> u2 #x7F) (setf tmp 1))
173 (t (setf tmp 0)))
174 (+ +gbk5-offset+ (* 96 (- u1 #xA8)) (- u…
175 (t
176 (handle-error invalid-gbk-byte)))))
177
178 (when (>= index (length *gbk-unicode-mapping*))
179 (handle-error invalid-gbk-byte))
180 (,setter (char-code
181 (elt *gbk-unicode-mapping* index))
182 dest noctets)))
183 (incf noctets))
184 finally (return (the fixnum (- noctets d-start)))))))
You are viewing proxied material from bitreich.org. The copyright of proxied material belongs to its original authors. Any comments or complaints in relation to proxied material should be directed to the original authors of the content concerned. Please see the disclaimer for more details.