Introduction
Introduction Statistics Contact Development Disclaimer Help
regcomp.c - 9base - revived minimalist port of Plan 9 userland to Unix
git clone git://git.suckless.org/9base
Log
Files
Refs
README
LICENSE
---
regcomp.c (9743B)
---
1 #include "lib9.h"
2 #include "regexp9.h"
3 #include "regcomp.h"
4
5 #define TRUE 1
6 #define FALSE 0
7
8 /*
9 * Parser Information
10 */
11 typedef
12 struct Node
13 {
14 Reinst* first;
15 Reinst* last;
16 }Node;
17
18 #define NSTACK 20
19 static Node andstack[NSTACK];
20 static Node *andp;
21 static int atorstack[NSTACK];
22 static int* atorp;
23 static int cursubid; /* id of current subex…
24 static int subidstack[NSTACK]; /* parallel to ators…
25 static int* subidp;
26 static int lastwasand; /* Last token was operand */
27 static int nbra;
28 static char* exprp; /* pointer to next char…
29 static int lexdone;
30 static int nclass;
31 static Reclass*classp;
32 static Reinst* freep;
33 static int errors;
34 static Rune yyrune; /* last lex'd rune */
35 static Reclass*yyclassp; /* last lex'd class */
36
37 /* predeclared crap */
38 static void operator(int);
39 static void pushand(Reinst*, Reinst*);
40 static void pushator(int);
41 static void evaluntil(int);
42 static int bldcclass(void);
43
44 static jmp_buf regkaboom;
45
46 static void
47 rcerror(char *s)
48 {
49 errors++;
50 regerror(s);
51 longjmp(regkaboom, 1);
52 }
53
54 static Reinst*
55 newinst(int t)
56 {
57 freep->type = t;
58 freep->u2.left = 0;
59 freep->u1.right = 0;
60 return freep++;
61 }
62
63 static void
64 operand(int t)
65 {
66 Reinst *i;
67
68 if(lastwasand)
69 operator(CAT); /* catenate is implicit */
70 i = newinst(t);
71
72 if(t == CCLASS || t == NCCLASS)
73 i->u1.cp = yyclassp;
74 if(t == RUNE)
75 i->u1.r = yyrune;
76
77 pushand(i, i);
78 lastwasand = TRUE;
79 }
80
81 static void
82 operator(int t)
83 {
84 if(t==RBRA && --nbra<0)
85 rcerror("unmatched right paren");
86 if(t==LBRA){
87 if(++cursubid >= NSUBEXP)
88 rcerror ("too many subexpressions");
89 nbra++;
90 if(lastwasand)
91 operator(CAT);
92 } else
93 evaluntil(t);
94 if(t != RBRA)
95 pushator(t);
96 lastwasand = FALSE;
97 if(t==STAR || t==QUEST || t==PLUS || t==RBRA)
98 lastwasand = TRUE; /* these look like operands */
99 }
100
101 static void
102 regerr2(char *s, int c)
103 {
104 char buf[100];
105 char *cp = buf;
106 while(*s)
107 *cp++ = *s++;
108 *cp++ = c;
109 *cp = '\0';
110 rcerror(buf);
111 }
112
113 static void
114 cant(char *s)
115 {
116 char buf[100];
117 strcpy(buf, "can't happen: ");
118 strcat(buf, s);
119 rcerror(buf);
120 }
121
122 static void
123 pushand(Reinst *f, Reinst *l)
124 {
125 if(andp >= &andstack[NSTACK])
126 cant("operand stack overflow");
127 andp->first = f;
128 andp->last = l;
129 andp++;
130 }
131
132 static void
133 pushator(int t)
134 {
135 if(atorp >= &atorstack[NSTACK])
136 cant("operator stack overflow");
137 *atorp++ = t;
138 *subidp++ = cursubid;
139 }
140
141 static Node*
142 popand(int op)
143 {
144 Reinst *inst;
145
146 if(andp <= &andstack[0]){
147 regerr2("missing operand for ", op);
148 inst = newinst(NOP);
149 pushand(inst,inst);
150 }
151 return --andp;
152 }
153
154 static int
155 popator(void)
156 {
157 if(atorp <= &atorstack[0])
158 cant("operator stack underflow");
159 --subidp;
160 return *--atorp;
161 }
162
163 static void
164 evaluntil(int pri)
165 {
166 Node *op1, *op2;
167 Reinst *inst1, *inst2;
168
169 while(pri==RBRA || atorp[-1]>=pri){
170 switch(popator()){
171 default:
172 rcerror("unknown operator in evaluntil");
173 break;
174 case LBRA: /* must have been RBRA */
175 op1 = popand('(');
176 inst2 = newinst(RBRA);
177 inst2->u1.subid = *subidp;
178 op1->last->u2.next = inst2;
179 inst1 = newinst(LBRA);
180 inst1->u1.subid = *subidp;
181 inst1->u2.next = op1->first;
182 pushand(inst1, inst2);
183 return;
184 case OR:
185 op2 = popand('|');
186 op1 = popand('|');
187 inst2 = newinst(NOP);
188 op2->last->u2.next = inst2;
189 op1->last->u2.next = inst2;
190 inst1 = newinst(OR);
191 inst1->u1.right = op1->first;
192 inst1->u2.left = op2->first;
193 pushand(inst1, inst2);
194 break;
195 case CAT:
196 op2 = popand(0);
197 op1 = popand(0);
198 op1->last->u2.next = op2->first;
199 pushand(op1->first, op2->last);
200 break;
201 case STAR:
202 op2 = popand('*');
203 inst1 = newinst(OR);
204 op2->last->u2.next = inst1;
205 inst1->u1.right = op2->first;
206 pushand(inst1, inst1);
207 break;
208 case PLUS:
209 op2 = popand('+');
210 inst1 = newinst(OR);
211 op2->last->u2.next = inst1;
212 inst1->u1.right = op2->first;
213 pushand(op2->first, inst1);
214 break;
215 case QUEST:
216 op2 = popand('?');
217 inst1 = newinst(OR);
218 inst2 = newinst(NOP);
219 inst1->u2.left = inst2;
220 inst1->u1.right = op2->first;
221 op2->last->u2.next = inst2;
222 pushand(inst1, inst2);
223 break;
224 }
225 }
226 }
227
228 static Reprog*
229 optimize(Reprog *pp)
230 {
231 Reinst *inst, *target;
232 int size;
233 Reprog *npp;
234 Reclass *cl;
235 int diff;
236
237 /*
238 * get rid of NOOP chains
239 */
240 for(inst=pp->firstinst; inst->type!=END; inst++){
241 target = inst->u2.next;
242 while(target->type == NOP)
243 target = target->u2.next;
244 inst->u2.next = target;
245 }
246
247 /*
248 * The original allocation is for an area larger than
249 * necessary. Reallocate to the actual space used
250 * and then relocate the code.
251 */
252 size = sizeof(Reprog) + (freep - pp->firstinst)*sizeof(Reinst);
253 npp = realloc(pp, size);
254 if(npp==0 || npp==pp)
255 return pp;
256 diff = (char *)npp - (char *)pp;
257 freep = (Reinst *)((char *)freep + diff);
258 for(inst=npp->firstinst; inst<freep; inst++){
259 switch(inst->type){
260 case OR:
261 case STAR:
262 case PLUS:
263 case QUEST:
264 inst->u1.right = (void*)((char*)inst->u1.right +…
265 break;
266 case CCLASS:
267 case NCCLASS:
268 inst->u1.right = (void*)((char*)inst->u1.right +…
269 cl = inst->u1.cp;
270 cl->end = (void*)((char*)cl->end + diff);
271 break;
272 }
273 inst->u2.left = (void*)((char*)inst->u2.left + diff);
274 }
275 npp->startinst = (void*)((char*)npp->startinst + diff);
276 return npp;
277 }
278
279 #ifdef DEBUG
280 static void
281 dumpstack(void){
282 Node *stk;
283 int *ip;
284
285 print("operators\n");
286 for(ip=atorstack; ip<atorp; ip++)
287 print("0%o\n", *ip);
288 print("operands\n");
289 for(stk=andstack; stk<andp; stk++)
290 print("0%o\t0%o\n", stk->first->type, stk->last->type);
291 }
292
293 static void
294 dump(Reprog *pp)
295 {
296 Reinst *l;
297 Rune *p;
298
299 l = pp->firstinst;
300 do{
301 print("%d:\t0%o\t%d\t%d", l-pp->firstinst, l->type,
302 l->u2.left-pp->firstinst, l->u1.right-pp->firsti…
303 if(l->type == RUNE)
304 print("\t%C\n", l->u1.r);
305 else if(l->type == CCLASS || l->type == NCCLASS){
306 print("\t[");
307 if(l->type == NCCLASS)
308 print("^");
309 for(p = l->u1.cp->spans; p < l->u1.cp->end; p +=…
310 if(p[0] == p[1])
311 print("%C", p[0]);
312 else
313 print("%C-%C", p[0], p[1]);
314 print("]\n");
315 } else
316 print("\n");
317 }while(l++->type);
318 }
319 #endif
320
321 static Reclass*
322 newclass(void)
323 {
324 if(nclass >= NCLASS)
325 regerr2("too many character classes; limit", NCLASS+'0');
326 return &(classp[nclass++]);
327 }
328
329 static int
330 nextc(Rune *rp)
331 {
332 if(lexdone){
333 *rp = 0;
334 return 1;
335 }
336 exprp += chartorune(rp, exprp);
337 if(*rp == '\\'){
338 exprp += chartorune(rp, exprp);
339 return 1;
340 }
341 if(*rp == 0)
342 lexdone = 1;
343 return 0;
344 }
345
346 static int
347 lex(int literal, int dot_type)
348 {
349 int quoted;
350
351 quoted = nextc(&yyrune);
352 if(literal || quoted){
353 if(yyrune == 0)
354 return END;
355 return RUNE;
356 }
357
358 switch(yyrune){
359 case 0:
360 return END;
361 case '*':
362 return STAR;
363 case '?':
364 return QUEST;
365 case '+':
366 return PLUS;
367 case '|':
368 return OR;
369 case '.':
370 return dot_type;
371 case '(':
372 return LBRA;
373 case ')':
374 return RBRA;
375 case '^':
376 return BOL;
377 case '$':
378 return EOL;
379 case '[':
380 return bldcclass();
381 }
382 return RUNE;
383 }
384
385 static int
386 bldcclass(void)
387 {
388 int type;
389 Rune r[NCCRUNE];
390 Rune *p, *ep, *np;
391 Rune rune;
392 int quoted;
393
394 /* we have already seen the '[' */
395 type = CCLASS;
396 yyclassp = newclass();
397
398 /* look ahead for negation */
399 /* SPECIAL CASE!!! negated classes don't match \n */
400 ep = r;
401 quoted = nextc(&rune);
402 if(!quoted && rune == '^'){
403 type = NCCLASS;
404 quoted = nextc(&rune);
405 *ep++ = '\n';
406 *ep++ = '\n';
407 }
408
409 /* parse class into a set of spans */
410 for(; ep<&r[NCCRUNE];){
411 if(rune == 0){
412 rcerror("malformed '[]'");
413 return 0;
414 }
415 if(!quoted && rune == ']')
416 break;
417 if(!quoted && rune == '-'){
418 if(ep == r){
419 rcerror("malformed '[]'");
420 return 0;
421 }
422 quoted = nextc(&rune);
423 if((!quoted && rune == ']') || rune == 0){
424 rcerror("malformed '[]'");
425 return 0;
426 }
427 *(ep-1) = rune;
428 } else {
429 *ep++ = rune;
430 *ep++ = rune;
431 }
432 quoted = nextc(&rune);
433 }
434
435 /* sort on span start */
436 for(p = r; p < ep; p += 2){
437 for(np = p; np < ep; np += 2)
438 if(*np < *p){
439 rune = np[0];
440 np[0] = p[0];
441 p[0] = rune;
442 rune = np[1];
443 np[1] = p[1];
444 p[1] = rune;
445 }
446 }
447
448 /* merge spans */
449 np = yyclassp->spans;
450 p = r;
451 if(r == ep)
452 yyclassp->end = np;
453 else {
454 np[0] = *p++;
455 np[1] = *p++;
456 for(; p < ep; p += 2)
457 if(p[0] <= np[1]){
458 if(p[1] > np[1])
459 np[1] = p[1];
460 } else {
461 np += 2;
462 np[0] = p[0];
463 np[1] = p[1];
464 }
465 yyclassp->end = np+2;
466 }
467
468 return type;
469 }
470
471 static Reprog*
472 regcomp1(char *s, int literal, int dot_type)
473 {
474 int token;
475 Reprog *volatile pp;
476
477 /* get memory for the program */
478 pp = malloc(sizeof(Reprog) + 6*sizeof(Reinst)*strlen(s));
479 if(pp == 0){
480 regerror("out of memory");
481 return 0;
482 }
483 freep = pp->firstinst;
484 classp = pp->class;
485 errors = 0;
486
487 if(setjmp(regkaboom))
488 goto out;
489
490 /* go compile the sucker */
491 lexdone = 0;
492 exprp = s;
493 nclass = 0;
494 nbra = 0;
495 atorp = atorstack;
496 andp = andstack;
497 subidp = subidstack;
498 lastwasand = FALSE;
499 cursubid = 0;
500
501 /* Start with a low priority operator to prime parser */
502 pushator(START-1);
503 while((token = lex(literal, dot_type)) != END){
504 if((token&0300) == OPERATOR)
505 operator(token);
506 else
507 operand(token);
508 }
509
510 /* Close with a low priority operator */
511 evaluntil(START);
512
513 /* Force END */
514 operand(END);
515 evaluntil(START);
516 #ifdef DEBUG
517 dumpstack();
518 #endif
519 if(nbra)
520 rcerror("unmatched left paren");
521 --andp; /* points to first and only operand */
522 pp->startinst = andp->first;
523 #ifdef DEBUG
524 dump(pp);
525 #endif
526 pp = optimize(pp);
527 #ifdef DEBUG
528 print("start: %d\n", andp->first-pp->firstinst);
529 dump(pp);
530 #endif
531 out:
532 if(errors){
533 free(pp);
534 pp = 0;
535 }
536 return pp;
537 }
538
539 extern Reprog*
540 regcomp(char *s)
541 {
542 return regcomp1(s, 0, ANY);
543 }
544
545 extern Reprog*
546 regcomplit(char *s)
547 {
548 return regcomp1(s, 1, ANY);
549 }
550
551 extern Reprog*
552 regcompnl(char *s)
553 {
554 return regcomp1(s, 0, ANYNL);
555 }
You are viewing proxied material from suckless.org. The copyright of proxied material belongs to its original authors. Any comments or complaints in relation to proxied material should be directed to the original authors of the content concerned. Please see the disclaimer for more details.