enc-gbk.lisp - clic - Clic is an command line interactive client for gopher wri… | |
git clone git://bitreich.org/clic/ git://enlrupgkhuxnvlhsf6lc3fziv5h2hhfrinws65… | |
Log | |
Files | |
Refs | |
Tags | |
README | |
LICENSE | |
--- | |
enc-gbk.lisp (8297B) | |
--- | |
1 ;;;; -*- Mode: lisp; indent-tabs-mode: nil -*- | |
2 ;;; | |
3 ;;; enc-gbk.lisp --- GBK encodings. | |
4 ;;; | |
5 ;;; Copyright (C) 2011, Li Wenpeng <[email protected]> | |
6 ;;; | |
7 ;;; Permission is hereby granted, free of charge, to any person | |
8 ;;; obtaining a copy of this software and associated documentation | |
9 ;;; files (the "Software"), to deal in the Software without | |
10 ;;; restriction, including without limitation the rights to use, copy, | |
11 ;;; modify, merge, publish, distribute, sublicense, and/or sell copies | |
12 ;;; of the Software, and to permit persons to whom the Software is | |
13 ;;; furnished to do so, subject to the following conditions: | |
14 ;;; | |
15 ;;; The above copyright notice and this permission notice shall be | |
16 ;;; included in all copies or substantial portions of the Software. | |
17 ;;; | |
18 ;;; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | |
19 ;;; EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | |
20 ;;; MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | |
21 ;;; NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT | |
22 ;;; HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, | |
23 ;;; WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
24 ;;; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER | |
25 ;;; DEALINGS IN THE SOFTWARE. | |
26 | |
27 (in-package #:babel-encodings) | |
28 | |
29 ;; populated in gbk-map.lisp | |
30 (defvar *gbk-unicode-mapping*) | |
31 | |
32 (defconstant +gbk2-offset+ 0) | |
33 (defconstant +gbk3-offset+ 6763) | |
34 (defconstant +gbk4-offset+ (+ 6763 6080)) | |
35 (defconstant +gbk1-offset+ 20902) | |
36 (defconstant +gbk5-offset+ (+ 20902 846)) | |
37 | |
38 (define-character-encoding :gbk | |
39 "GBK is an extension of the GB2312 character set for simplified | |
40 Chinese characters, used in the People's Republic of China." | |
41 :max-units-per-char 4 | |
42 :literal-char-code-limit #x80) | |
43 | |
44 (define-condition invalid-gbk-byte (character-decoding-error) | |
45 () | |
46 (:documentation "Signalled when an invalid GBK byte is found.")) | |
47 | |
48 (define-condition invalid-gbk-character (character-encoding-error) | |
49 () | |
50 (:documentation "Signalled when an invalid GBK character is found.")) | |
51 | |
52 (define-octet-counter :gbk (getter type) | |
53 `(lambda (seq start end max) | |
54 (declare (type ,type seq) (fixnum start end max)) | |
55 (let ((noctets 0)) | |
56 (loop for i from start below end | |
57 for u1 of-type code-point = (,getter seq i) | |
58 do (cond ((< u1 #x80) (incf noctets)) | |
59 (t (incf noctets 2))) | |
60 (when (and (plusp max) (= noctets max)) | |
61 (return (values noctets i))) | |
62 finally (return (values noctets i)))))) | |
63 | |
64 (define-code-point-counter :gbk (getter type) | |
65 `(lambda (seq start end max) | |
66 (declare (type ,type seq)) | |
67 (let (u1 (noctets 0)) | |
68 (loop with i = start | |
69 while (< i end) | |
70 do (setf u1 (,getter seq i)) | |
71 (cond | |
72 ((eq 0 (logand u1 #x80)) (incf i)) | |
73 (t (incf i 2))) | |
74 (incf noctets) | |
75 (when (and (plusp max) (= noctets max)) | |
76 (return (values noctets i))) | |
77 finally (return (values noctets i)))))) | |
78 | |
79 (define-encoder :gbk (getter src-type setter dest-type) | |
80 `(lambda (src start end dest d-start) | |
81 (declare (type ,src-type src) | |
82 (type ,dest-type dest) | |
83 (fixnum start end d-start)) | |
84 (macrolet | |
85 ((do-encoding (index) | |
86 `(let ((u1 0) (u2 0)) | |
87 (cond | |
88 ((<= +gbk2-offset+ ,index (- +gbk3-offset+ 1)) ; gbk/2 | |
89 (setf u1 (+ #xB0 (truncate (/ ,index 94)))) | |
90 (setf u2 (+ #xA1 (mod ,index 94)))) | |
91 ((<= +gbk3-offset+ ,index (- +gbk4-offset+ 1)) ; gbk/3 | |
92 (setf index (- ,index +gbk3-offset+)) | |
93 (setf u1 (+ #x81 (truncate (/ ,index 190)))) | |
94 (setf u2 (+ #x40 (mod ,index 190))) | |
95 (if (>= u2 #x7F) (incf u2))) | |
96 ((<= +gbk4-offset+ ,index (- +gbk1-offset+ 1)) ; gbk/4 | |
97 (setf index (- ,index +gbk4-offset+)) | |
98 (setf u1 (+ #xAA (truncate (/ ,index 96)))) | |
99 (setf u2 (+ #x40 (mod ,index 96))) | |
100 (if (>= u2 #x7F) (incf u2))) | |
101 ((<= +gbk1-offset+ ,index (- +gbk5-offset+ 1)) ; gbk/1 | |
102 (setf index (- ,index +gbk1-offset+)) | |
103 (setf u1 (+ #xA1 (truncate (/ ,index 94)))) | |
104 (setf u2 (+ #xA1 (mod ,index 94)))) | |
105 ((<= +gbk5-offset+ ,index (length *gbk-unicode-mapping*… | |
106 (setf index (- ,index +gbk5-offset+)) | |
107 (setf u1 (+ #xA8 (truncate (/ ,index 96)))) | |
108 (setf u2 (+ #x40 (mod ,index 96))) | |
109 (if (>= u2 #x7F) (incf u2)))) | |
110 (values u1 u2)))) | |
111 (let ((c 0) index (noctets 0)) | |
112 (loop for i from start below end | |
113 for code of-type code-point = (,getter src i) | |
114 do (macrolet | |
115 ((handle-error (&optional (c 'character-encoding-e… | |
116 `(encoding-error code :gbk src i +repl+ ',c))) | |
117 (setf c (code-char code)) | |
118 (cond | |
119 ((< code #x80) ; ascii | |
120 (,setter code dest noctets) | |
121 (incf noctets)) | |
122 (t ; gbk | |
123 (setf index | |
124 (position c *gbk-unicode-mapping*)) | |
125 | |
126 (if (not index) | |
127 (handle-error invalid-gbk-character)) | |
128 (multiple-value-bind (uh ul) (do-encoding index) | |
129 (,setter uh dest noctets) | |
130 (,setter ul dest (+ 1 noctets)) | |
131 (incf noctets 2))))) | |
132 finally (return (the fixnum (- noctets d-start)))))))) | |
133 | |
134 (define-decoder :gbk (getter src-type setter dest-type) | |
135 `(lambda (src start end dest d-start) | |
136 (declare (type ,src-type src) | |
137 (type ,dest-type dest)) | |
138 (let ((u1 0) (u2 0) (index 0) (tmp 0) (noctets 0)) | |
139 (loop with i = start | |
140 while (< i end) | |
141 do (macrolet | |
142 ((handle-error (&optional (c 'character-decoding-err… | |
143 `(decoding-error #(u1 u2) :gbk src i +repl+ ',c))) | |
144 (setf u1 (,getter src i)) | |
145 (incf i) | |
146 (cond | |
147 ((eq 0 (logand u1 #x80)) | |
148 (,setter u1 dest noctets)) | |
149 (t | |
150 (setf u2 (,getter src i)) | |
151 (incf i) | |
152 (setf index | |
153 (block setter-block | |
154 (cond | |
155 ((and (<= #xB0 u1 #xF7) (<= #xA1 u2 #xFE)) | |
156 (+ +gbk2-offset+ (+ (* 94 (- u1 #xB0)) (… | |
157 | |
158 ((and (<= #x81 u1 #xA0) (<= #x40 u2 #xFE)) | |
159 (cond ((> u2 #x7F) (setf tmp 1)) | |
160 (t (setf tmp 0))) | |
161 (+ +gbk3-offset+ (* 190 (- u1 #x81)) (- … | |
162 | |
163 ((and (<= #xAA u1 #xFE) (<= #x40 #xA0)) | |
164 (cond ((> u2 #x7F) (setf tmp 1)) | |
165 (t (setf tmp 0))) | |
166 (+ +gbk4-offset+ (* 96 (- u1 #xAA)) (- u… | |
167 | |
168 ((and (<= #xA1 u1 #xA9) (<= #xA1 u2 #xFE)) | |
169 (+ +gbk1-offset+ (* 94 (- u1 #xA1)) (- u… | |
170 | |
171 ((and (<= #xA8 u1 #xA9) (<= #x40 #xA0)) | |
172 (cond ((> u2 #x7F) (setf tmp 1)) | |
173 (t (setf tmp 0))) | |
174 (+ +gbk5-offset+ (* 96 (- u1 #xA8)) (- u… | |
175 (t | |
176 (handle-error invalid-gbk-byte))))) | |
177 | |
178 (when (>= index (length *gbk-unicode-mapping*)) | |
179 (handle-error invalid-gbk-byte)) | |
180 (,setter (char-code | |
181 (elt *gbk-unicode-mapping* index)) | |
182 dest noctets))) | |
183 (incf noctets)) | |
184 finally (return (the fixnum (- noctets d-start))))))) |