Introduction
Introduction Statistics Contact Development Disclaimer Help
regexp.c - 9base - revived minimalist port of Plan 9 userland to Unix
git clone git://git.suckless.org/9base
Log
Files
Refs
README
LICENSE
---
regexp.c (15453B)
---
1 #include "sam.h"
2
3 Rangeset sel;
4 String lastregexp;
5 /*
6 * Machine Information
7 */
8 typedef struct Inst Inst;
9
10 struct Inst
11 {
12 long type; /* < 0x10000 ==> literal, otherwise act…
13 union {
14 int rsid;
15 int rsubid;
16 int class;
17 struct Inst *rother;
18 struct Inst *rright;
19 } r;
20 union{
21 struct Inst *lleft;
22 struct Inst *lnext;
23 } l;
24 };
25 #define sid r.rsid
26 #define subid r.rsubid
27 #define rclass r.class
28 #define other r.rother
29 #define right r.rright
30 #define left l.lleft
31 #define next l.lnext
32
33 #define NPROG 1024
34 Inst program[NPROG];
35 Inst *progp;
36 Inst *startinst; /* First inst. of program; might not be p…
37 Inst *bstartinst; /* same for backwards machine */
38
39 typedef struct Ilist Ilist;
40 struct Ilist
41 {
42 Inst *inst; /* Instruction of the thread */
43 Rangeset se;
44 Posn startp; /* first char of match */
45 };
46
47 #define NLIST 127
48
49 Ilist *tl, *nl; /* This list, next list */
50 Ilist list[2][NLIST+1]; /* +1 for trailing null */
51 static Rangeset sempty;
52
53 /*
54 * Actions and Tokens
55 *
56 * 0x100xx are operators, value == precedence
57 * 0x200xx are tokens, i.e. operands for operators
58 */
59 #define OPERATOR 0x10000 /* Bitmask of all operator…
60 #define START 0x10000 /* Start, used for ma…
61 #define RBRA 0x10001 /* Right bracket, ) */
62 #define LBRA 0x10002 /* Left bracket, ( */
63 #define OR 0x10003 /* Alternation, | */
64 #define CAT 0x10004 /* Concatentation, impl…
65 #define STAR 0x10005 /* Closure, * */
66 #define PLUS 0x10006 /* a+ == aa* */
67 #define QUEST 0x10007 /* a? == a|nothing, i…
68 #define ANY 0x20000 /* Any character but ne…
69 #define NOP 0x20001 /* No operation, intern…
70 #define BOL 0x20002 /* Beginning of line, ^…
71 #define EOL 0x20003 /* End of line, $ */
72 #define CCLASS 0x20004 /* Character class, …
73 #define NCCLASS 0x20005 /* Negated characte…
74 #define END 0x20077 /* Terminate: match fou…
75
76 #define ISATOR 0x10000
77 #define ISAND 0x20000
78
79 /*
80 * Parser Information
81 */
82 typedef struct Node Node;
83 struct Node
84 {
85 Inst *first;
86 Inst *last;
87 };
88
89 #define NSTACK 20
90 Node andstack[NSTACK];
91 Node *andp;
92 int atorstack[NSTACK];
93 int *atorp;
94 int lastwasand; /* Last token was operand */
95 int cursubid;
96 int subidstack[NSTACK];
97 int *subidp;
98 int backwards;
99 int nbra;
100 Rune *exprp; /* pointer to next character in sourc…
101 #define DCLASS 10 /* allocation increment */
102 int nclass; /* number active */
103 int Nclass; /* high water mark */
104 Rune **class;
105 int negateclass;
106
107 int addinst(Ilist *l, Inst *inst, Rangeset *sep);
108 void newmatch(Rangeset*);
109 void bnewmatch(Rangeset*);
110 void pushand(Inst*, Inst*);
111 void pushator(int);
112 Node *popand(int);
113 int popator(void);
114 void startlex(Rune*);
115 int lex(void);
116 void operator(int);
117 void operand(int);
118 void evaluntil(int);
119 void optimize(Inst*);
120 void bldcclass(void);
121
122 void
123 regerror(Err e)
124 {
125 Strzero(&lastregexp);
126 error(e);
127 }
128
129 void
130 regerror_c(Err e, int c)
131 {
132 Strzero(&lastregexp);
133 error_c(e, c);
134 }
135
136 Inst *
137 newinst(int t)
138 {
139 if(progp >= &program[NPROG])
140 regerror(Etoolong);
141 progp->type = t;
142 progp->left = 0;
143 progp->right = 0;
144 return progp++;
145 }
146
147 Inst *
148 realcompile(Rune *s)
149 {
150 int token;
151
152 startlex(s);
153 atorp = atorstack;
154 andp = andstack;
155 subidp = subidstack;
156 cursubid = 0;
157 lastwasand = FALSE;
158 /* Start with a low priority operator to prime parser */
159 pushator(START-1);
160 while((token=lex()) != END){
161 if((token&ISATOR) == OPERATOR)
162 operator(token);
163 else
164 operand(token);
165 }
166 /* Close with a low priority operator */
167 evaluntil(START);
168 /* Force END */
169 operand(END);
170 evaluntil(START);
171 if(nbra)
172 regerror(Eleftpar);
173 --andp; /* points to first and only operand */
174 return andp->first;
175 }
176
177 void
178 compile(String *s)
179 {
180 int i;
181 Inst *oprogp;
182
183 if(Strcmp(s, &lastregexp)==0)
184 return;
185 for(i=0; i<nclass; i++)
186 free(class[i]);
187 nclass = 0;
188 progp = program;
189 backwards = FALSE;
190 startinst = realcompile(s->s);
191 optimize(program);
192 oprogp = progp;
193 backwards = TRUE;
194 bstartinst = realcompile(s->s);
195 optimize(oprogp);
196 Strduplstr(&lastregexp, s);
197 }
198
199 void
200 operand(int t)
201 {
202 Inst *i;
203 if(lastwasand)
204 operator(CAT); /* catenate is implicit */
205 i = newinst(t);
206 if(t == CCLASS){
207 if(negateclass)
208 i->type = NCCLASS; /* UGH */
209 i->rclass = nclass-1; /* UGH */
210 }
211 pushand(i, i);
212 lastwasand = TRUE;
213 }
214
215 void
216 operator(int t)
217 {
218 if(t==RBRA && --nbra<0)
219 regerror(Erightpar);
220 if(t==LBRA){
221 /*
222 * if(++cursubid >= NSUBEXP)
223 * regerror(Esubexp);
224 */
225 cursubid++; /* silently ignored */
226 nbra++;
227 if(lastwasand)
228 operator(CAT);
229 }else
230 evaluntil(t);
231 if(t!=RBRA)
232 pushator(t);
233 lastwasand = FALSE;
234 if(t==STAR || t==QUEST || t==PLUS || t==RBRA)
235 lastwasand = TRUE; /* these look like operands */
236 }
237
238 void
239 cant(char *s)
240 {
241 char buf[100];
242
243 sprint(buf, "regexp: can't happen: %s", s);
244 panic(buf);
245 }
246
247 void
248 pushand(Inst *f, Inst *l)
249 {
250 if(andp >= &andstack[NSTACK])
251 cant("operand stack overflow");
252 andp->first = f;
253 andp->last = l;
254 andp++;
255 }
256
257 void
258 pushator(int t)
259 {
260 if(atorp >= &atorstack[NSTACK])
261 cant("operator stack overflow");
262 *atorp++=t;
263 if(cursubid >= NSUBEXP)
264 *subidp++= -1;
265 else
266 *subidp++=cursubid;
267 }
268
269 Node *
270 popand(int op)
271 {
272 if(andp <= &andstack[0])
273 if(op)
274 regerror_c(Emissop, op);
275 else
276 regerror(Ebadregexp);
277 return --andp;
278 }
279
280 int
281 popator(void)
282 {
283 if(atorp <= &atorstack[0])
284 cant("operator stack underflow");
285 --subidp;
286 return *--atorp;
287 }
288
289 void
290 evaluntil(int pri)
291 {
292 Node *op1, *op2, *t;
293 Inst *inst1, *inst2;
294
295 while(pri==RBRA || atorp[-1]>=pri){
296 switch(popator()){
297 case LBRA:
298 op1 = popand('(');
299 inst2 = newinst(RBRA);
300 inst2->subid = *subidp;
301 op1->last->next = inst2;
302 inst1 = newinst(LBRA);
303 inst1->subid = *subidp;
304 inst1->next = op1->first;
305 pushand(inst1, inst2);
306 return; /* must have been RBRA */
307 default:
308 panic("unknown regexp operator");
309 break;
310 case OR:
311 op2 = popand('|');
312 op1 = popand('|');
313 inst2 = newinst(NOP);
314 op2->last->next = inst2;
315 op1->last->next = inst2;
316 inst1 = newinst(OR);
317 inst1->right = op1->first;
318 inst1->left = op2->first;
319 pushand(inst1, inst2);
320 break;
321 case CAT:
322 op2 = popand(0);
323 op1 = popand(0);
324 if(backwards && op2->first->type!=END)
325 t = op1, op1 = op2, op2 = t;
326 op1->last->next = op2->first;
327 pushand(op1->first, op2->last);
328 break;
329 case STAR:
330 op2 = popand('*');
331 inst1 = newinst(OR);
332 op2->last->next = inst1;
333 inst1->right = op2->first;
334 pushand(inst1, inst1);
335 break;
336 case PLUS:
337 op2 = popand('+');
338 inst1 = newinst(OR);
339 op2->last->next = inst1;
340 inst1->right = op2->first;
341 pushand(op2->first, inst1);
342 break;
343 case QUEST:
344 op2 = popand('?');
345 inst1 = newinst(OR);
346 inst2 = newinst(NOP);
347 inst1->left = inst2;
348 inst1->right = op2->first;
349 op2->last->next = inst2;
350 pushand(inst1, inst2);
351 break;
352 }
353 }
354 }
355
356
357 void
358 optimize(Inst *start)
359 {
360 Inst *inst, *target;
361
362 for(inst=start; inst->type!=END; inst++){
363 target = inst->next;
364 while(target->type == NOP)
365 target = target->next;
366 inst->next = target;
367 }
368 }
369
370 #ifdef DEBUG
371 void
372 dumpstack(void){
373 Node *stk;
374 int *ip;
375
376 dprint("operators\n");
377 for(ip = atorstack; ip<atorp; ip++)
378 dprint("0%o\n", *ip);
379 dprint("operands\n");
380 for(stk = andstack; stk<andp; stk++)
381 dprint("0%o\t0%o\n", stk->first->type, stk->last->type);
382 }
383 void
384 dump(void){
385 Inst *l;
386
387 l = program;
388 do{
389 dprint("%d:\t0%o\t%d\t%d\n", l-program, l->type,
390 l->left-program, l->right-program);
391 }while(l++->type);
392 }
393 #endif
394
395 void
396 startlex(Rune *s)
397 {
398 exprp = s;
399 nbra = 0;
400 }
401
402
403 int
404 lex(void){
405 int c= *exprp++;
406
407 switch(c){
408 case '\\':
409 if(*exprp)
410 if((c= *exprp++)=='n')
411 c='\n';
412 break;
413 case 0:
414 c = END;
415 --exprp; /* In case we come here again */
416 break;
417 case '*':
418 c = STAR;
419 break;
420 case '?':
421 c = QUEST;
422 break;
423 case '+':
424 c = PLUS;
425 break;
426 case '|':
427 c = OR;
428 break;
429 case '.':
430 c = ANY;
431 break;
432 case '(':
433 c = LBRA;
434 break;
435 case ')':
436 c = RBRA;
437 break;
438 case '^':
439 c = BOL;
440 break;
441 case '$':
442 c = EOL;
443 break;
444 case '[':
445 c = CCLASS;
446 bldcclass();
447 break;
448 }
449 return c;
450 }
451
452 long
453 nextrec(void){
454 if(exprp[0]==0 || (exprp[0]=='\\' && exprp[1]==0))
455 regerror(Ebadclass);
456 if(exprp[0] == '\\'){
457 exprp++;
458 if(*exprp=='n'){
459 exprp++;
460 return '\n';
461 }
462 return *exprp++|0x10000;
463 }
464 return *exprp++;
465 }
466
467 void
468 bldcclass(void)
469 {
470 long c1, c2, n, na;
471 Rune *classp;
472
473 classp = emalloc(DCLASS*RUNESIZE);
474 n = 0;
475 na = DCLASS;
476 /* we have already seen the '[' */
477 if(*exprp == '^'){
478 classp[n++] = '\n'; /* don't match newline in neg…
479 negateclass = TRUE;
480 exprp++;
481 }else
482 negateclass = FALSE;
483 while((c1 = nextrec()) != ']'){
484 if(c1 == '-'){
485 Error:
486 free(classp);
487 regerror(Ebadclass);
488 }
489 if(n+4 >= na){ /* 3 runes plus NUL */
490 na += DCLASS;
491 classp = erealloc(classp, na*RUNESIZE);
492 }
493 if(*exprp == '-'){
494 exprp++; /* eat '-' */
495 if((c2 = nextrec()) == ']')
496 goto Error;
497 classp[n+0] = Runemax;
498 classp[n+1] = c1;
499 classp[n+2] = c2;
500 n += 3;
501 }else
502 classp[n++] = c1;
503 }
504 classp[n] = 0;
505 if(nclass == Nclass){
506 Nclass += DCLASS;
507 class = erealloc(class, Nclass*sizeof(Rune*));
508 }
509 class[nclass++] = classp;
510 }
511
512 int
513 classmatch(int classno, int c, int negate)
514 {
515 Rune *p;
516
517 p = class[classno];
518 while(*p){
519 if(*p == Runemax){
520 if(p[1]<=c && c<=p[2])
521 return !negate;
522 p += 3;
523 }else if(*p++ == c)
524 return !negate;
525 }
526 return negate;
527 }
528
529 /*
530 * Note optimization in addinst:
531 * *l must be pending when addinst called; if *l has been looked
532 * at already, the optimization is a bug.
533 */
534 int
535 addinst(Ilist *l, Inst *inst, Rangeset *sep)
536 {
537 Ilist *p;
538
539 for(p = l; p->inst; p++){
540 if(p->inst==inst){
541 if((sep)->p[0].p1 < p->se.p[0].p1)
542 p->se= *sep; /* this would be bug…
543 return 0; /* It's already there */
544 }
545 }
546 p->inst = inst;
547 p->se= *sep;
548 (p+1)->inst = 0;
549 return 1;
550 }
551
552 int
553 execute(File *f, Posn startp, Posn eof)
554 {
555 int flag = 0;
556 Inst *inst;
557 Ilist *tlp;
558 Posn p = startp;
559 int nnl = 0, ntl;
560 int c;
561 int wrapped = 0;
562 int startchar = startinst->type<OPERATOR? startinst->type : 0;
563
564 list[0][0].inst = list[1][0].inst = 0;
565 sel.p[0].p1 = -1;
566 /* Execute machine once for each character */
567 for(;;p++){
568 doloop:
569 c = filereadc(f, p);
570 if(p>=eof || c<0){
571 switch(wrapped++){
572 case 0: /* let loop run one more …
573 case 2:
574 break;
575 case 1: /* expired; wrap to begin…
576 if(sel.p[0].p1>=0 || eof!=INFINITY)
577 goto Return;
578 list[0][0].inst = list[1][0].inst = 0;
579 p = 0;
580 goto doloop;
581 default:
582 goto Return;
583 }
584 }else if(((wrapped && p>=startp) || sel.p[0].p1>0) && nn…
585 break;
586 /* fast check for first char */
587 if(startchar && nnl==0 && c!=startchar)
588 continue;
589 tl = list[flag];
590 nl = list[flag^=1];
591 nl->inst = 0;
592 ntl = nnl;
593 nnl = 0;
594 if(sel.p[0].p1<0 && (!wrapped || p<startp || startp==eof…
595 /* Add first instruction to this list */
596 sempty.p[0].p1 = p;
597 if(addinst(tl, startinst, &sempty))
598 if(++ntl >= NLIST)
599 Overflow:
600 error(Eoverflow);
601 }
602 /* Execute machine until this list is empty */
603 for(tlp = tl; inst = tlp->inst; tlp++){ /* assign…
604 Switchstmt:
605 switch(inst->type){
606 default: /* regular character */
607 if(inst->type==c){
608 Addinst:
609 if(addinst(nl, inst->next, &tlp-…
610 if(++nnl >= NLIST)
611 goto Overflow;
612 }
613 break;
614 case LBRA:
615 if(inst->subid>=0)
616 tlp->se.p[inst->subid].p1 = p;
617 inst = inst->next;
618 goto Switchstmt;
619 case RBRA:
620 if(inst->subid>=0)
621 tlp->se.p[inst->subid].p2 = p;
622 inst = inst->next;
623 goto Switchstmt;
624 case ANY:
625 if(c!='\n')
626 goto Addinst;
627 break;
628 case BOL:
629 if(p==0 || filereadc(f, p - 1)=='\n'){
630 Step:
631 inst = inst->next;
632 goto Switchstmt;
633 }
634 break;
635 case EOL:
636 if(c == '\n')
637 goto Step;
638 break;
639 case CCLASS:
640 if(c>=0 && classmatch(inst->rclass, c, 0…
641 goto Addinst;
642 break;
643 case NCCLASS:
644 if(c>=0 && classmatch(inst->rclass, c, 1…
645 goto Addinst;
646 break;
647 case OR:
648 /* evaluate right choice later */
649 if(addinst(tl, inst->right, &tlp->se))
650 if(++ntl >= NLIST)
651 goto Overflow;
652 /* efficiency: advance and re-evaluate */
653 inst = inst->left;
654 goto Switchstmt;
655 case END: /* Match! */
656 tlp->se.p[0].p2 = p;
657 newmatch(&tlp->se);
658 break;
659 }
660 }
661 }
662 Return:
663 return sel.p[0].p1>=0;
664 }
665
666 void
667 newmatch(Rangeset *sp)
668 {
669 int i;
670
671 if(sel.p[0].p1<0 || sp->p[0].p1<sel.p[0].p1 ||
672 (sp->p[0].p1==sel.p[0].p1 && sp->p[0].p2>sel.p[0].p2))
673 for(i = 0; i<NSUBEXP; i++)
674 sel.p[i] = sp->p[i];
675 }
676
677 int
678 bexecute(File *f, Posn startp)
679 {
680 int flag = 0;
681 Inst *inst;
682 Ilist *tlp;
683 Posn p = startp;
684 int nnl = 0, ntl;
685 int c;
686 int wrapped = 0;
687 int startchar = bstartinst->type<OPERATOR? bstartinst->type : 0;
688
689 list[0][0].inst = list[1][0].inst = 0;
690 sel.p[0].p1= -1;
691 /* Execute machine once for each character, including terminal N…
692 for(;;--p){
693 doloop:
694 if((c = filereadc(f, p - 1))==-1){
695 switch(wrapped++){
696 case 0: /* let loop run one more …
697 case 2:
698 break;
699 case 1: /* expired; wrap to end */
700 if(sel.p[0].p1>=0)
701 case 3:
702 goto Return;
703 list[0][0].inst = list[1][0].inst = 0;
704 p = f->b.nc;
705 goto doloop;
706 default:
707 goto Return;
708 }
709 }else if(((wrapped && p<=startp) || sel.p[0].p1>0) && nn…
710 break;
711 /* fast check for first char */
712 if(startchar && nnl==0 && c!=startchar)
713 continue;
714 tl = list[flag];
715 nl = list[flag^=1];
716 nl->inst = 0;
717 ntl = nnl;
718 nnl = 0;
719 if(sel.p[0].p1<0 && (!wrapped || p>startp)){
720 /* Add first instruction to this list */
721 /* the minus is so the optimizations in addinst …
722 sempty.p[0].p1 = -p;
723 if(addinst(tl, bstartinst, &sempty))
724 if(++ntl >= NLIST)
725 Overflow:
726 error(Eoverflow);
727 }
728 /* Execute machine until this list is empty */
729 for(tlp = tl; inst = tlp->inst; tlp++){ /* assign…
730 Switchstmt:
731 switch(inst->type){
732 default: /* regular character */
733 if(inst->type == c){
734 Addinst:
735 if(addinst(nl, inst->next, &tlp-…
736 if(++nnl >= NLIST)
737 goto Overflow;
738 }
739 break;
740 case LBRA:
741 if(inst->subid>=0)
742 tlp->se.p[inst->subid].p1 = p;
743 inst = inst->next;
744 goto Switchstmt;
745 case RBRA:
746 if(inst->subid >= 0)
747 tlp->se.p[inst->subid].p2 = p;
748 inst = inst->next;
749 goto Switchstmt;
750 case ANY:
751 if(c != '\n')
752 goto Addinst;
753 break;
754 case BOL:
755 if(c=='\n' || p==0){
756 Step:
757 inst = inst->next;
758 goto Switchstmt;
759 }
760 break;
761 case EOL:
762 if(p==f->b.nc || filereadc(f, p)=='\n')
763 goto Step;
764 break;
765 case CCLASS:
766 if(c>=0 && classmatch(inst->rclass, c, 0…
767 goto Addinst;
768 break;
769 case NCCLASS:
770 if(c>=0 && classmatch(inst->rclass, c, 1…
771 goto Addinst;
772 break;
773 case OR:
774 /* evaluate right choice later */
775 if(addinst(tlp, inst->right, &tlp->se))
776 if(++ntl >= NLIST)
777 goto Overflow;
778 /* efficiency: advance and re-evaluate */
779 inst = inst->left;
780 goto Switchstmt;
781 case END: /* Match! */
782 tlp->se.p[0].p1 = -tlp->se.p[0].p1; /* m…
783 tlp->se.p[0].p2 = p;
784 bnewmatch(&tlp->se);
785 break;
786 }
787 }
788 }
789 Return:
790 return sel.p[0].p1>=0;
791 }
792
793 void
794 bnewmatch(Rangeset *sp)
795 {
796 int i;
797 if(sel.p[0].p1<0 || sp->p[0].p1>sel.p[0].p2 || (sp->p[0].p1==sel…
798 for(i = 0; i<NSUBEXP; i++){ /* note the reversal; …
799 sel.p[i].p1 = sp->p[i].p2;
800 sel.p[i].p2 = sp->p[i].p1;
801 }
802 }
You are viewing proxied material from suckless.org. The copyright of proxied material belongs to its original authors. Any comments or complaints in relation to proxied material should be directed to the original authors of the content concerned. Please see the disclaimer for more details.