regcomp.c - 9base - revived minimalist port of Plan 9 userland to Unix | |
git clone git://git.suckless.org/9base | |
Log | |
Files | |
Refs | |
README | |
LICENSE | |
--- | |
regcomp.c (9743B) | |
--- | |
1 #include "lib9.h" | |
2 #include "regexp9.h" | |
3 #include "regcomp.h" | |
4 | |
5 #define TRUE 1 | |
6 #define FALSE 0 | |
7 | |
8 /* | |
9 * Parser Information | |
10 */ | |
11 typedef | |
12 struct Node | |
13 { | |
14 Reinst* first; | |
15 Reinst* last; | |
16 }Node; | |
17 | |
18 #define NSTACK 20 | |
19 static Node andstack[NSTACK]; | |
20 static Node *andp; | |
21 static int atorstack[NSTACK]; | |
22 static int* atorp; | |
23 static int cursubid; /* id of current subex… | |
24 static int subidstack[NSTACK]; /* parallel to ators… | |
25 static int* subidp; | |
26 static int lastwasand; /* Last token was operand */ | |
27 static int nbra; | |
28 static char* exprp; /* pointer to next char… | |
29 static int lexdone; | |
30 static int nclass; | |
31 static Reclass*classp; | |
32 static Reinst* freep; | |
33 static int errors; | |
34 static Rune yyrune; /* last lex'd rune */ | |
35 static Reclass*yyclassp; /* last lex'd class */ | |
36 | |
37 /* predeclared crap */ | |
38 static void operator(int); | |
39 static void pushand(Reinst*, Reinst*); | |
40 static void pushator(int); | |
41 static void evaluntil(int); | |
42 static int bldcclass(void); | |
43 | |
44 static jmp_buf regkaboom; | |
45 | |
46 static void | |
47 rcerror(char *s) | |
48 { | |
49 errors++; | |
50 regerror(s); | |
51 longjmp(regkaboom, 1); | |
52 } | |
53 | |
54 static Reinst* | |
55 newinst(int t) | |
56 { | |
57 freep->type = t; | |
58 freep->u2.left = 0; | |
59 freep->u1.right = 0; | |
60 return freep++; | |
61 } | |
62 | |
63 static void | |
64 operand(int t) | |
65 { | |
66 Reinst *i; | |
67 | |
68 if(lastwasand) | |
69 operator(CAT); /* catenate is implicit */ | |
70 i = newinst(t); | |
71 | |
72 if(t == CCLASS || t == NCCLASS) | |
73 i->u1.cp = yyclassp; | |
74 if(t == RUNE) | |
75 i->u1.r = yyrune; | |
76 | |
77 pushand(i, i); | |
78 lastwasand = TRUE; | |
79 } | |
80 | |
81 static void | |
82 operator(int t) | |
83 { | |
84 if(t==RBRA && --nbra<0) | |
85 rcerror("unmatched right paren"); | |
86 if(t==LBRA){ | |
87 if(++cursubid >= NSUBEXP) | |
88 rcerror ("too many subexpressions"); | |
89 nbra++; | |
90 if(lastwasand) | |
91 operator(CAT); | |
92 } else | |
93 evaluntil(t); | |
94 if(t != RBRA) | |
95 pushator(t); | |
96 lastwasand = FALSE; | |
97 if(t==STAR || t==QUEST || t==PLUS || t==RBRA) | |
98 lastwasand = TRUE; /* these look like operands */ | |
99 } | |
100 | |
101 static void | |
102 regerr2(char *s, int c) | |
103 { | |
104 char buf[100]; | |
105 char *cp = buf; | |
106 while(*s) | |
107 *cp++ = *s++; | |
108 *cp++ = c; | |
109 *cp = '\0'; | |
110 rcerror(buf); | |
111 } | |
112 | |
113 static void | |
114 cant(char *s) | |
115 { | |
116 char buf[100]; | |
117 strcpy(buf, "can't happen: "); | |
118 strcat(buf, s); | |
119 rcerror(buf); | |
120 } | |
121 | |
122 static void | |
123 pushand(Reinst *f, Reinst *l) | |
124 { | |
125 if(andp >= &andstack[NSTACK]) | |
126 cant("operand stack overflow"); | |
127 andp->first = f; | |
128 andp->last = l; | |
129 andp++; | |
130 } | |
131 | |
132 static void | |
133 pushator(int t) | |
134 { | |
135 if(atorp >= &atorstack[NSTACK]) | |
136 cant("operator stack overflow"); | |
137 *atorp++ = t; | |
138 *subidp++ = cursubid; | |
139 } | |
140 | |
141 static Node* | |
142 popand(int op) | |
143 { | |
144 Reinst *inst; | |
145 | |
146 if(andp <= &andstack[0]){ | |
147 regerr2("missing operand for ", op); | |
148 inst = newinst(NOP); | |
149 pushand(inst,inst); | |
150 } | |
151 return --andp; | |
152 } | |
153 | |
154 static int | |
155 popator(void) | |
156 { | |
157 if(atorp <= &atorstack[0]) | |
158 cant("operator stack underflow"); | |
159 --subidp; | |
160 return *--atorp; | |
161 } | |
162 | |
163 static void | |
164 evaluntil(int pri) | |
165 { | |
166 Node *op1, *op2; | |
167 Reinst *inst1, *inst2; | |
168 | |
169 while(pri==RBRA || atorp[-1]>=pri){ | |
170 switch(popator()){ | |
171 default: | |
172 rcerror("unknown operator in evaluntil"); | |
173 break; | |
174 case LBRA: /* must have been RBRA */ | |
175 op1 = popand('('); | |
176 inst2 = newinst(RBRA); | |
177 inst2->u1.subid = *subidp; | |
178 op1->last->u2.next = inst2; | |
179 inst1 = newinst(LBRA); | |
180 inst1->u1.subid = *subidp; | |
181 inst1->u2.next = op1->first; | |
182 pushand(inst1, inst2); | |
183 return; | |
184 case OR: | |
185 op2 = popand('|'); | |
186 op1 = popand('|'); | |
187 inst2 = newinst(NOP); | |
188 op2->last->u2.next = inst2; | |
189 op1->last->u2.next = inst2; | |
190 inst1 = newinst(OR); | |
191 inst1->u1.right = op1->first; | |
192 inst1->u2.left = op2->first; | |
193 pushand(inst1, inst2); | |
194 break; | |
195 case CAT: | |
196 op2 = popand(0); | |
197 op1 = popand(0); | |
198 op1->last->u2.next = op2->first; | |
199 pushand(op1->first, op2->last); | |
200 break; | |
201 case STAR: | |
202 op2 = popand('*'); | |
203 inst1 = newinst(OR); | |
204 op2->last->u2.next = inst1; | |
205 inst1->u1.right = op2->first; | |
206 pushand(inst1, inst1); | |
207 break; | |
208 case PLUS: | |
209 op2 = popand('+'); | |
210 inst1 = newinst(OR); | |
211 op2->last->u2.next = inst1; | |
212 inst1->u1.right = op2->first; | |
213 pushand(op2->first, inst1); | |
214 break; | |
215 case QUEST: | |
216 op2 = popand('?'); | |
217 inst1 = newinst(OR); | |
218 inst2 = newinst(NOP); | |
219 inst1->u2.left = inst2; | |
220 inst1->u1.right = op2->first; | |
221 op2->last->u2.next = inst2; | |
222 pushand(inst1, inst2); | |
223 break; | |
224 } | |
225 } | |
226 } | |
227 | |
228 static Reprog* | |
229 optimize(Reprog *pp) | |
230 { | |
231 Reinst *inst, *target; | |
232 int size; | |
233 Reprog *npp; | |
234 Reclass *cl; | |
235 int diff; | |
236 | |
237 /* | |
238 * get rid of NOOP chains | |
239 */ | |
240 for(inst=pp->firstinst; inst->type!=END; inst++){ | |
241 target = inst->u2.next; | |
242 while(target->type == NOP) | |
243 target = target->u2.next; | |
244 inst->u2.next = target; | |
245 } | |
246 | |
247 /* | |
248 * The original allocation is for an area larger than | |
249 * necessary. Reallocate to the actual space used | |
250 * and then relocate the code. | |
251 */ | |
252 size = sizeof(Reprog) + (freep - pp->firstinst)*sizeof(Reinst); | |
253 npp = realloc(pp, size); | |
254 if(npp==0 || npp==pp) | |
255 return pp; | |
256 diff = (char *)npp - (char *)pp; | |
257 freep = (Reinst *)((char *)freep + diff); | |
258 for(inst=npp->firstinst; inst<freep; inst++){ | |
259 switch(inst->type){ | |
260 case OR: | |
261 case STAR: | |
262 case PLUS: | |
263 case QUEST: | |
264 inst->u1.right = (void*)((char*)inst->u1.right +… | |
265 break; | |
266 case CCLASS: | |
267 case NCCLASS: | |
268 inst->u1.right = (void*)((char*)inst->u1.right +… | |
269 cl = inst->u1.cp; | |
270 cl->end = (void*)((char*)cl->end + diff); | |
271 break; | |
272 } | |
273 inst->u2.left = (void*)((char*)inst->u2.left + diff); | |
274 } | |
275 npp->startinst = (void*)((char*)npp->startinst + diff); | |
276 return npp; | |
277 } | |
278 | |
279 #ifdef DEBUG | |
280 static void | |
281 dumpstack(void){ | |
282 Node *stk; | |
283 int *ip; | |
284 | |
285 print("operators\n"); | |
286 for(ip=atorstack; ip<atorp; ip++) | |
287 print("0%o\n", *ip); | |
288 print("operands\n"); | |
289 for(stk=andstack; stk<andp; stk++) | |
290 print("0%o\t0%o\n", stk->first->type, stk->last->type); | |
291 } | |
292 | |
293 static void | |
294 dump(Reprog *pp) | |
295 { | |
296 Reinst *l; | |
297 Rune *p; | |
298 | |
299 l = pp->firstinst; | |
300 do{ | |
301 print("%d:\t0%o\t%d\t%d", l-pp->firstinst, l->type, | |
302 l->u2.left-pp->firstinst, l->u1.right-pp->firsti… | |
303 if(l->type == RUNE) | |
304 print("\t%C\n", l->u1.r); | |
305 else if(l->type == CCLASS || l->type == NCCLASS){ | |
306 print("\t["); | |
307 if(l->type == NCCLASS) | |
308 print("^"); | |
309 for(p = l->u1.cp->spans; p < l->u1.cp->end; p +=… | |
310 if(p[0] == p[1]) | |
311 print("%C", p[0]); | |
312 else | |
313 print("%C-%C", p[0], p[1]); | |
314 print("]\n"); | |
315 } else | |
316 print("\n"); | |
317 }while(l++->type); | |
318 } | |
319 #endif | |
320 | |
321 static Reclass* | |
322 newclass(void) | |
323 { | |
324 if(nclass >= NCLASS) | |
325 regerr2("too many character classes; limit", NCLASS+'0'); | |
326 return &(classp[nclass++]); | |
327 } | |
328 | |
329 static int | |
330 nextc(Rune *rp) | |
331 { | |
332 if(lexdone){ | |
333 *rp = 0; | |
334 return 1; | |
335 } | |
336 exprp += chartorune(rp, exprp); | |
337 if(*rp == '\\'){ | |
338 exprp += chartorune(rp, exprp); | |
339 return 1; | |
340 } | |
341 if(*rp == 0) | |
342 lexdone = 1; | |
343 return 0; | |
344 } | |
345 | |
346 static int | |
347 lex(int literal, int dot_type) | |
348 { | |
349 int quoted; | |
350 | |
351 quoted = nextc(&yyrune); | |
352 if(literal || quoted){ | |
353 if(yyrune == 0) | |
354 return END; | |
355 return RUNE; | |
356 } | |
357 | |
358 switch(yyrune){ | |
359 case 0: | |
360 return END; | |
361 case '*': | |
362 return STAR; | |
363 case '?': | |
364 return QUEST; | |
365 case '+': | |
366 return PLUS; | |
367 case '|': | |
368 return OR; | |
369 case '.': | |
370 return dot_type; | |
371 case '(': | |
372 return LBRA; | |
373 case ')': | |
374 return RBRA; | |
375 case '^': | |
376 return BOL; | |
377 case '$': | |
378 return EOL; | |
379 case '[': | |
380 return bldcclass(); | |
381 } | |
382 return RUNE; | |
383 } | |
384 | |
385 static int | |
386 bldcclass(void) | |
387 { | |
388 int type; | |
389 Rune r[NCCRUNE]; | |
390 Rune *p, *ep, *np; | |
391 Rune rune; | |
392 int quoted; | |
393 | |
394 /* we have already seen the '[' */ | |
395 type = CCLASS; | |
396 yyclassp = newclass(); | |
397 | |
398 /* look ahead for negation */ | |
399 /* SPECIAL CASE!!! negated classes don't match \n */ | |
400 ep = r; | |
401 quoted = nextc(&rune); | |
402 if(!quoted && rune == '^'){ | |
403 type = NCCLASS; | |
404 quoted = nextc(&rune); | |
405 *ep++ = '\n'; | |
406 *ep++ = '\n'; | |
407 } | |
408 | |
409 /* parse class into a set of spans */ | |
410 for(; ep<&r[NCCRUNE];){ | |
411 if(rune == 0){ | |
412 rcerror("malformed '[]'"); | |
413 return 0; | |
414 } | |
415 if(!quoted && rune == ']') | |
416 break; | |
417 if(!quoted && rune == '-'){ | |
418 if(ep == r){ | |
419 rcerror("malformed '[]'"); | |
420 return 0; | |
421 } | |
422 quoted = nextc(&rune); | |
423 if((!quoted && rune == ']') || rune == 0){ | |
424 rcerror("malformed '[]'"); | |
425 return 0; | |
426 } | |
427 *(ep-1) = rune; | |
428 } else { | |
429 *ep++ = rune; | |
430 *ep++ = rune; | |
431 } | |
432 quoted = nextc(&rune); | |
433 } | |
434 | |
435 /* sort on span start */ | |
436 for(p = r; p < ep; p += 2){ | |
437 for(np = p; np < ep; np += 2) | |
438 if(*np < *p){ | |
439 rune = np[0]; | |
440 np[0] = p[0]; | |
441 p[0] = rune; | |
442 rune = np[1]; | |
443 np[1] = p[1]; | |
444 p[1] = rune; | |
445 } | |
446 } | |
447 | |
448 /* merge spans */ | |
449 np = yyclassp->spans; | |
450 p = r; | |
451 if(r == ep) | |
452 yyclassp->end = np; | |
453 else { | |
454 np[0] = *p++; | |
455 np[1] = *p++; | |
456 for(; p < ep; p += 2) | |
457 if(p[0] <= np[1]){ | |
458 if(p[1] > np[1]) | |
459 np[1] = p[1]; | |
460 } else { | |
461 np += 2; | |
462 np[0] = p[0]; | |
463 np[1] = p[1]; | |
464 } | |
465 yyclassp->end = np+2; | |
466 } | |
467 | |
468 return type; | |
469 } | |
470 | |
471 static Reprog* | |
472 regcomp1(char *s, int literal, int dot_type) | |
473 { | |
474 int token; | |
475 Reprog *volatile pp; | |
476 | |
477 /* get memory for the program */ | |
478 pp = malloc(sizeof(Reprog) + 6*sizeof(Reinst)*strlen(s)); | |
479 if(pp == 0){ | |
480 regerror("out of memory"); | |
481 return 0; | |
482 } | |
483 freep = pp->firstinst; | |
484 classp = pp->class; | |
485 errors = 0; | |
486 | |
487 if(setjmp(regkaboom)) | |
488 goto out; | |
489 | |
490 /* go compile the sucker */ | |
491 lexdone = 0; | |
492 exprp = s; | |
493 nclass = 0; | |
494 nbra = 0; | |
495 atorp = atorstack; | |
496 andp = andstack; | |
497 subidp = subidstack; | |
498 lastwasand = FALSE; | |
499 cursubid = 0; | |
500 | |
501 /* Start with a low priority operator to prime parser */ | |
502 pushator(START-1); | |
503 while((token = lex(literal, dot_type)) != END){ | |
504 if((token&0300) == OPERATOR) | |
505 operator(token); | |
506 else | |
507 operand(token); | |
508 } | |
509 | |
510 /* Close with a low priority operator */ | |
511 evaluntil(START); | |
512 | |
513 /* Force END */ | |
514 operand(END); | |
515 evaluntil(START); | |
516 #ifdef DEBUG | |
517 dumpstack(); | |
518 #endif | |
519 if(nbra) | |
520 rcerror("unmatched left paren"); | |
521 --andp; /* points to first and only operand */ | |
522 pp->startinst = andp->first; | |
523 #ifdef DEBUG | |
524 dump(pp); | |
525 #endif | |
526 pp = optimize(pp); | |
527 #ifdef DEBUG | |
528 print("start: %d\n", andp->first-pp->firstinst); | |
529 dump(pp); | |
530 #endif | |
531 out: | |
532 if(errors){ | |
533 free(pp); | |
534 pp = 0; | |
535 } | |
536 return pp; | |
537 } | |
538 | |
539 extern Reprog* | |
540 regcomp(char *s) | |
541 { | |
542 return regcomp1(s, 0, ANY); | |
543 } | |
544 | |
545 extern Reprog* | |
546 regcomplit(char *s) | |
547 { | |
548 return regcomp1(s, 1, ANY); | |
549 } | |
550 | |
551 extern Reprog* | |
552 regcompnl(char *s) | |
553 { | |
554 return regcomp1(s, 0, ANYNL); | |
555 } |