re.c - 9base - revived minimalist port of Plan 9 userland to Unix | |
git clone git://git.suckless.org/9base | |
Log | |
Files | |
Refs | |
README | |
LICENSE | |
--- | |
re.c (6951B) | |
--- | |
1 /**************************************************************** | |
2 Copyright (C) Lucent Technologies 1997 | |
3 All Rights Reserved | |
4 | |
5 Permission to use, copy, modify, and distribute this software and | |
6 its documentation for any purpose and without fee is hereby | |
7 granted, provided that the above copyright notice appear in all | |
8 copies and that both that the copyright notice and this | |
9 permission notice and warranty disclaimer appear in supporting | |
10 documentation, and that the name Lucent Technologies or any of | |
11 its entities not be used in advertising or publicity pertaining | |
12 to distribution of the software without specific, written prior | |
13 permission. | |
14 | |
15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, | |
16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. | |
17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY | |
18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES | |
19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER | |
20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, | |
21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF | |
22 THIS SOFTWARE. | |
23 ****************************************************************/ | |
24 | |
25 | |
26 #define DEBUG | |
27 #include <stdio.h> | |
28 #include <u.h> | |
29 #include <libc.h> | |
30 #include <ctype.h> | |
31 #include <bio.h> | |
32 #include <regexp.h> | |
33 #include "awk.h" | |
34 #include "y.tab.h" | |
35 | |
36 /* This file provides the interface between the main body of | |
37 * awk and the pattern matching package. It preprocesses | |
38 * patterns prior to compilation to provide awk-like semantics | |
39 * to character sequences not supported by the pattern package. | |
40 * The following conversions are performed: | |
41 * | |
42 * "()" -> "[]" | |
43 * "[-" -> "[\-" | |
44 * "[^-" -> "[^\-" | |
45 * "-]" -> "\-]" | |
46 * "[]" -> "[]*" | |
47 * "\xdddd" -> "\z" where 'z' is the UTF se… | |
48 * for the hex value | |
49 * "\ddd" -> "\o" where 'o' is a ch… | |
50 * "\b" -> "\B" where 'B' is… | |
51 * "\t" -> "\T" where 'T' is… | |
52 * "\f" -> "\F" where 'F' is… | |
53 * "\n" -> "\N" where 'N' is… | |
54 * "\r" -> "\r" where 'C' is… | |
55 */ | |
56 | |
57 #define MAXRE 512 | |
58 | |
59 static char re[MAXRE]; /* copy buffer */ | |
60 | |
61 char *patbeg; | |
62 int patlen; /* number of chars in pattern … | |
63 | |
64 #define NPATS 20 /* number of slots in patt… | |
65 | |
66 static struct pat_list /* dynamic pattern cache */ | |
67 { | |
68 char *re; | |
69 int use; | |
70 Reprog *program; | |
71 } pattern[NPATS]; | |
72 | |
73 static int npats; /* cache fill level */ | |
74 | |
75 /* Compile a pattern */ | |
76 void | |
77 *compre(char *pat) | |
78 { | |
79 int i, j, inclass; | |
80 char c, *p, *s; | |
81 Reprog *program; | |
82 | |
83 if (!compile_time) { /* search cache for dynamic pattern … | |
84 for (i = 0; i < npats; i++) | |
85 if (!strcmp(pat, pattern[i].re)) { | |
86 pattern[i].use++; | |
87 return((void *) pattern[i].program); | |
88 } | |
89 } | |
90 /* Preprocess Pattern for compilation */ | |
91 p = re; | |
92 s = pat; | |
93 inclass = 0; | |
94 while (c = *s++) { | |
95 if (c == '\\') { | |
96 quoted(&s, &p, re+MAXRE); | |
97 continue; | |
98 } | |
99 else if (!inclass && c == '(' && *s == ')') { | |
100 if (p < re+MAXRE-2) { /* '()' -> '[]*' */ | |
101 *p++ = '['; | |
102 *p++ = ']'; | |
103 c = '*'; | |
104 s++; | |
105 } | |
106 else overflow(); | |
107 } | |
108 else if (c == '['){ /* '[-' -> '[… | |
109 inclass = 1; | |
110 if (*s == '-') { | |
111 if (p < re+MAXRE-2) { | |
112 *p++ = '['; | |
113 *p++ = '\\'; | |
114 c = *s++; | |
115 } | |
116 else overflow(); | |
117 } /* '[^-' -> '[^… | |
118 else if (*s == '^' && s[1] == '-'){ | |
119 if (p < re+MAXRE-3) { | |
120 *p++ = '['; | |
121 *p++ = *s++; | |
122 *p++ = '\\'; | |
123 c = *s++; | |
124 } | |
125 else overflow(); | |
126 } | |
127 else if (*s == '['){ /* skip '[['… | |
128 if (p < re+MAXRE-1) | |
129 *p++ = c; | |
130 else overflow(); | |
131 c = *s++; | |
132 } | |
133 else if (*s == '^' && s[1] == '[') { /* s… | |
134 if (p < re+MAXRE-2) { | |
135 *p++ = c; | |
136 *p++ = *s++; | |
137 c = *s++; | |
138 } | |
139 else overflow(); | |
140 } | |
141 else if (*s == ']') { /* '[]' -> … | |
142 if (p < re+MAXRE-2) { | |
143 *p++ = c; | |
144 *p++ = *s++; | |
145 c = '*'; | |
146 inclass = 0; | |
147 } | |
148 else overflow(); | |
149 } | |
150 } | |
151 else if (c == '-' && *s == ']') { /* '-]' -> '\-]… | |
152 if (p < re+MAXRE-1) | |
153 *p++ = '\\'; | |
154 else overflow(); | |
155 } | |
156 else if (c == ']') | |
157 inclass = 0; | |
158 if (p < re+MAXRE-1) | |
159 *p++ = c; | |
160 else overflow(); | |
161 } | |
162 *p = 0; | |
163 program = regcomp(re); /* compile pattern */ | |
164 if (!compile_time) { | |
165 if (npats < NPATS) /* Room in cache */ | |
166 i = npats++; | |
167 else { /* Throw out least used */ | |
168 int use = pattern[0].use; | |
169 i = 0; | |
170 for (j = 1; j < NPATS; j++) { | |
171 if (pattern[j].use < use) { | |
172 use = pattern[j].use; | |
173 i = j; | |
174 } | |
175 } | |
176 xfree(pattern[i].program); | |
177 xfree(pattern[i].re); | |
178 } | |
179 pattern[i].re = tostring(pat); | |
180 pattern[i].program = program; | |
181 pattern[i].use = 1; | |
182 } | |
183 return((void *) program); | |
184 } | |
185 | |
186 /* T/F match indication - matched string not exported */ | |
187 int | |
188 match(void *p, char *s, char *start) | |
189 { | |
190 return regexec((Reprog *) p, (char *) s, 0, 0); | |
191 } | |
192 | |
193 /* match and delimit the matched string */ | |
194 int | |
195 pmatch(void *p, char *s, char *start) | |
196 { | |
197 Resub m; | |
198 | |
199 m.s.sp = start; | |
200 m.e.ep = 0; | |
201 if (regexec((Reprog *) p, (char *) s, &m, 1)) { | |
202 patbeg = m.s.sp; | |
203 patlen = m.e.ep-m.s.sp; | |
204 return 1; | |
205 } | |
206 patlen = -1; | |
207 patbeg = start; | |
208 return 0; | |
209 } | |
210 | |
211 /* perform a non-empty match */ | |
212 int | |
213 nematch(void *p, char *s, char *start) | |
214 { | |
215 if (pmatch(p, s, start) == 1 && patlen > 0) | |
216 return 1; | |
217 patlen = -1; | |
218 patbeg = start; | |
219 return 0; | |
220 } | |
221 /* in the parsing of regular expressions, metacharacters like . have */ | |
222 /* to be seen literally; \056 is not a metacharacter. */ | |
223 | |
224 int | |
225 hexstr(char **pp) /* find and eval hex string at pp, return new p… | |
226 { | |
227 char c; | |
228 int n = 0; | |
229 int i; | |
230 | |
231 for (i = 0, c = (*pp)[i]; i < 4 && isxdigit(c); i++, c = (*pp)[i… | |
232 if (isdigit(c)) | |
233 n = 16 * n + c - '0'; | |
234 else if ('a' <= c && c <= 'f') | |
235 n = 16 * n + c - 'a' + 10; | |
236 else if ('A' <= c && c <= 'F') | |
237 n = 16 * n + c - 'A' + 10; | |
238 } | |
239 *pp += i; | |
240 return n; | |
241 } | |
242 | |
243 /* look for awk-specific escape sequences */ | |
244 | |
245 #define isoctdigit(c) ((c) >= '0' && (c) <= '7') /* multiple use of arg … | |
246 | |
247 void | |
248 quoted(char **s, char **to, char *end) /* handle escaped sequence… | |
249 { | |
250 char *p = *s; | |
251 char *t = *to; | |
252 wchar_t c; | |
253 | |
254 switch(c = *p++) { | |
255 case 't': | |
256 c = '\t'; | |
257 break; | |
258 case 'n': | |
259 c = '\n'; | |
260 break; | |
261 case 'f': | |
262 c = '\f'; | |
263 break; | |
264 case 'r': | |
265 c = '\r'; | |
266 break; | |
267 case 'b': | |
268 c = '\b'; | |
269 break; | |
270 default: | |
271 if (t < end-1) /* all else must be escape… | |
272 *t++ = '\\'; | |
273 if (c == 'x') { /* hexadecimal goo follow… | |
274 c = hexstr(&p); | |
275 if (t < end-MB_CUR_MAX) | |
276 t += wctomb(t, c); | |
277 else overflow(); | |
278 *to = t; | |
279 *s = p; | |
280 return; | |
281 } else if (isoctdigit(c)) { /* \d \dd \ddd */ | |
282 c -= '0'; | |
283 if (isoctdigit(*p)) { | |
284 c = 8 * c + *p++ - '0'; | |
285 if (isoctdigit(*p)) | |
286 c = 8 * c + *p++ - '0'; | |
287 } | |
288 } | |
289 break; | |
290 } | |
291 if (t < end-1) | |
292 *t++ = c; | |
293 *s = p; | |
294 *to = t; | |
295 } | |
296 /* count rune positions */ | |
297 int | |
298 countposn(char *s, int n) | |
299 { | |
300 int i, j; | |
301 char *end; | |
302 | |
303 for (i = 0, end = s+n; *s && s < end; i++){ | |
304 j = mblen(s, n); | |
305 if(j <= 0) | |
306 j = 1; | |
307 s += j; | |
308 } | |
309 return(i); | |
310 } | |
311 | |
312 /* pattern package error handler */ | |
313 | |
314 void | |
315 regerror(char *s) | |
316 { | |
317 FATAL("%s", s); | |
318 } | |
319 | |
320 void | |
321 overflow(void) | |
322 { | |
323 FATAL("%s", "regular expression too big"); | |
324 } | |
325 |