Introduction
Introduction Statistics Contact Development Disclaimer Help
tregex: import regex.c from neatlibc - neatvi - [fork] simple vi-type editor wi…
git clone git://src.adamsgaard.dk/neatvi
Log
Files
Refs
README
---
commit 67f3f8c955e80f17bc4ca092e2454a0104d13b18
parent 0225a76aa55a2d0dae9ac2a6d98f9c19de3b52c2
Author: Ali Gholami Rudi <[email protected]>
Date: Tue, 16 Jun 2015 14:51:07 +0430
regex: import regex.c from neatlibc
Diffstat:
M Makefile | 2 +-
A regex.c | 593 +++++++++++++++++++++++++++++…
A regex.h | 18 ++++++++++++++++++
M rset.c | 2 +-
4 files changed, 613 insertions(+), 2 deletions(-)
---
diff --git a/Makefile b/Makefile
t@@ -3,7 +3,7 @@ CFLAGS = -Wall -O2
LDFLAGS =
OBJS = vi.o ex.o lbuf.o mot.o sbuf.o ren.o dir.o syn.o reg.o led.o \
- uc.o term.o rset.o cmd.o conf.o
+ uc.o term.o rset.o regex.o cmd.o conf.o
all: vi
diff --git a/regex.c b/regex.c
t@@ -0,0 +1,593 @@
+#include <ctype.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "regex.h"
+
+#define NGRPS 64
+#define NREPS 128
+
+#define MAX(a, b) ((a) < (b) ? (b) : (a))
+#define LEN(a) (sizeof(a) / sizeof((a)[0]))
+
+/* regular expressions atoms */
+#define RA_CHR '\0' /* character literal */
+#define RA_BEG '^' /* string start */
+#define RA_END '$' /* string end */
+#define RA_ANY '.' /* any character */
+#define RA_BRK '[' /* bracket expression */
+#define RA_WBEG '<' /* word start */
+#define RA_WEND '>' /* word end */
+
+/* regular expression node types */
+#define RN_ATOM '\0' /* regular expression */
+#define RN_CAT 'c' /* concatenation */
+#define RN_ALT '|' /* alternate expressions */
+#define RN_GRP '(' /* pattern group */
+
+/* regular expression program instructions */
+#define RI_ATOM '\0' /* regular expression */
+#define RI_FORK 'f' /* fork the execution */
+#define RI_JUMP 'j' /* jump to the given instruction */
+#define RI_MARK 'm' /* mark the current position */
+#define RI_MATCH 'q' /* the pattern is matched */
+
+/* regular expression atom */
+struct ratom {
+ int ra; /* atom type (RA_*) */
+ char *s; /* atom argument */
+};
+
+/* regular expression instruction */
+struct rinst {
+ int ri; /* instruction type (RI_*) */
+ struct ratom ra; /* regular expression atom (RI_ATOM) */
+ int a1, a2; /* destination of RE_FORK and RI_JUMP */
+ int mark; /* mark (RI_MARK) */
+};
+
+/* regular expression program */
+struct regex {
+ struct rinst *p; /* the program */
+ int n; /* number of instructions */
+ int grpcnt; /* number of groups */
+ int flg; /* regcomp() flags */
+};
+
+/* regular expression matching state */
+struct rstate {
+ int mark[NGRPS * 2]; /* marks for RI_MARK */
+ int pc; /* program counter */
+ char *s; /* the current position in the string */
+ char *o; /* the beginning of the string */
+ int flg; /* flags passed to regcomp() and regexec() */
+};
+
+/* regular expression tree; used for parsing */
+struct rnode {
+ int rn; /* node type (RN_*) */
+ struct ratom ra; /* regular expression atom (RN_ATOM) */
+ int mincnt, maxcnt; /* number of repetitions */
+ struct rnode *c1, *c2; /* children */
+};
+
+static struct rnode *rnode_make(int rn, struct rnode *c1, struct rnode *c2)
+{
+ struct rnode *rnode = malloc(sizeof(*rnode));
+ memset(rnode, 0, sizeof(*rnode));
+ rnode->rn = rn;
+ rnode->c1 = c1;
+ rnode->c2 = c2;
+ rnode->mincnt = 1;
+ rnode->maxcnt = 1;
+ return rnode;
+}
+
+static void rnode_free(struct rnode *rnode)
+{
+ if (rnode->c1)
+ rnode_free(rnode->c1);
+ if (rnode->c2)
+ rnode_free(rnode->c2);
+ free(rnode->ra.s);
+ free(rnode);
+}
+
+static int uc_len(char *s)
+{
+ int c = (unsigned char) s[0];
+ if (c > 0 && c <= 0x7f)
+ return 1;
+ if (c >= 0xfc)
+ return 6;
+ if (c >= 0xf8)
+ return 5;
+ if (c >= 0xf0)
+ return 4;
+ if (c >= 0xe0)
+ return 3;
+ if (c >= 0xc0)
+ return 2;
+ return c != 0;
+}
+
+static int uc_dec(char *s)
+{
+ int result;
+ int l = uc_len(s);
+ if (l <= 1)
+ return (unsigned char) *s;
+ result = (0x3f >> --l) & (unsigned char) *s++;
+ while (l--)
+ result = (result << 6) | ((unsigned char) *s++ & 0x3f);
+ return result;
+}
+
+static void ratom_copy(struct ratom *dst, struct ratom *src)
+{
+ dst->ra = src->ra;
+ dst->s = NULL;
+ if (src->s) {
+ int len = strlen(src->s);
+ dst->s = malloc(len + 1);
+ memcpy(dst->s, src->s, len + 1);
+ }
+}
+
+static int brk_len(char *s)
+{
+ int n = 1;
+ if (s[n] == '^') /* exclusion mark */
+ n++;
+ if (s[n] == ']') /* handling []a] */
+ n++;
+ while (s[n] && s[n] != ']') {
+ if (s[n] == '[' && (s[n + 1] == ':' || s[n + 1] == '='))
+ while (s[n] && s[n] != ']')
+ n++;
+ if (s[n])
+ n++;
+ }
+ return s[n] == ']' ? n + 1 : n;
+}
+
+static void ratom_readbrk(struct ratom *ra, char **pat)
+{
+ int len = brk_len(*pat);
+ ra->ra = RA_BRK;
+ ra->s = malloc(len + 1);
+ memcpy(ra->s, *pat, len);
+ ra->s[len] = '\0';
+ *pat += len;
+}
+
+static void ratom_read(struct ratom *ra, char **pat)
+{
+ int len;
+ switch ((unsigned char) **pat) {
+ case '.':
+ ra->ra = RA_ANY;
+ (*pat)++;
+ break;
+ case '^':
+ ra->ra = RA_BEG;
+ (*pat)++;
+ break;
+ case '$':
+ ra->ra = RA_END;
+ (*pat)++;
+ break;
+ case '[':
+ ratom_readbrk(ra, pat);
+ break;
+ case '\\':
+ if ((*pat)[1] == '<' || (*pat)[1] == '>') {
+ ra->ra = (*pat)[1] == '<' ? RA_WBEG : RA_WEND;
+ *pat += 2;
+ break;
+ }
+ (*pat)++;
+ default:
+ ra->ra = RA_CHR;
+ len = uc_len(*pat);
+ ra->s = malloc(8);
+ memcpy(ra->s, *pat, len);
+ ra->s[len] = '\0';
+ *pat += len;
+ }
+}
+
+static char *uc_beg(char *beg, char *s)
+{
+ while (s > beg && (((unsigned char) *s) & 0xc0) == 0x80)
+ s--;
+ return s;
+}
+
+static int isword(char *s)
+{
+ int c = (unsigned char) s[0];
+ return isalnum(c) || c == '_' || c > 127;
+}
+
+static char *brk_classes[][2] = {
+ {":alnum:", "a-zA-Z0-9"},
+ {":alpha:", "a-zA-Z"},
+ {":blank:", " \t"},
+ {":digit:", "0-9"},
+ {":lower:", "a-z"},
+ {":print:", "\x20-\x7e"},
+ {":punct:", "][!\"#$%&'()*+,./:;<=>?@\\^_`{|}~-"},
+ {":space:", " \t\r\n\v\f"},
+ {":upper:", "A-Z"},
+ {":word:", "a-zA-Z0-9_"},
+ {":xdigit:", "a-fA-F0-9"},
+};
+
+static int brk_match(char *brk, int c, int flg)
+{
+ int beg, end;
+ int i;
+ int not = brk[0] == '^';
+ char *p = not ? brk + 1 : brk;
+ char *p0 = p;
+ if (flg & REG_ICASE && c < 128 && isupper(c))
+ c = tolower(c);
+ while (*p && (p == p0 || *p != ']')) {
+ if (p[0] == '[' && p[1] == ':') {
+ for (i = 0; i < LEN(brk_classes); i++) {
+ char *cc = brk_classes[i][0];
+ char *cp = brk_classes[i][1];
+ if (!strncmp(cc, p + 1, strlen(cc)))
+ if (!brk_match(cp, c, flg))
+ return not;
+ }
+ p += brk_len(p);
+ continue;
+ }
+ beg = uc_dec(p);
+ p += uc_len(p);
+ end = beg;
+ if (p[0] == '-' && p[1] && p[1] != ']') {
+ p++;
+ end = uc_dec(p);
+ p += uc_len(p);
+ }
+ if (flg & REG_ICASE && beg < 128 && isupper(beg))
+ beg = tolower(beg);
+ if (flg & REG_ICASE && end < 128 && isupper(end))
+ end = tolower(end);
+ if (c >= beg && c <= end)
+ return not;
+ }
+ return !not;
+}
+
+static int ratom_match(struct ratom *ra, struct rstate *rs)
+{
+ if (ra->ra == RA_CHR) {
+ int c1 = uc_dec(ra->s);
+ int c2 = uc_dec(rs->s);
+ if (rs->flg & REG_ICASE && c1 < 128 && isupper(c1))
+ c1 = tolower(c1);
+ if (rs->flg & REG_ICASE && c2 < 128 && isupper(c2))
+ c2 = tolower(c2);
+ if (c1 != c2)
+ return 1;
+ rs->s += uc_len(ra->s);
+ return 0;
+ }
+ if (ra->ra == RA_ANY) {
+ if (!rs->s[0])
+ return 1;
+ rs->s += uc_len(rs->s);
+ return 0;
+ }
+ if (ra->ra == RA_BRK) {
+ int c = uc_dec(rs->s);
+ if (!c)
+ return 1;
+ rs->s += uc_len(rs->s);
+ return brk_match(ra->s + 1, c, rs->flg);
+ }
+ if (ra->ra == RA_BEG && !(rs->flg & REG_NOTBOL))
+ return rs->s != rs->o;
+ if (ra->ra == RA_END && !(rs->flg & REG_NOTEOL))
+ return rs->s[0] != '\0';
+ if (ra->ra == RA_WBEG)
+ return !((rs->s == rs->o || !isword(uc_beg(rs->o, rs->s - 1)))…
+ isword(rs->s));
+ if (ra->ra == RA_WEND)
+ return !(rs->s != rs->o && isword(uc_beg(rs->o, rs->s - 1)) &&
+ (!rs->s[0] || !isword(rs->s)));
+ return 1;
+}
+
+static struct rnode *rnode_parse(char **pat);
+
+static struct rnode *rnode_grp(char **pat)
+{
+ struct rnode *rnode;
+ if ((*pat)[0] != '(')
+ return NULL;
+ *pat += 1;
+ rnode = rnode_parse(pat);
+ if ((*pat)[0] != ')') {
+ rnode_free(rnode);
+ return NULL;
+ }
+ *pat += 1;
+ return rnode_make(RN_GRP, rnode, NULL);
+}
+
+static struct rnode *rnode_atom(char **pat)
+{
+ struct rnode *rnode;
+ if (!**pat)
+ return NULL;
+ if ((*pat)[0] == '|' || (*pat)[0] == ')')
+ return NULL;
+ if ((*pat)[0] == '(') {
+ rnode = rnode_grp(pat);
+ } else {
+ rnode = rnode_make(RN_ATOM, NULL, NULL);
+ ratom_read(&rnode->ra, pat);
+ }
+ if ((*pat)[0] == '*' || (*pat)[0] == '?') {
+ rnode->mincnt = 0;
+ rnode->maxcnt = (*pat)[0] == '*' ? -1 : 1;
+ (*pat)++;
+ }
+ if ((*pat)[0] == '+') {
+ rnode->mincnt = 1;
+ rnode->maxcnt = -1;
+ *pat += 1;
+ }
+ if ((*pat)[0] == '{') {
+ rnode->mincnt = 0;
+ rnode->maxcnt = 0;
+ *pat += 1;
+ while (isdigit((unsigned char) **pat))
+ rnode->mincnt = rnode->mincnt * 10 + *(*pat)++ - '0';
+ if (**pat == ',') {
+ (*pat)++;
+ if ((*pat)[0] == '}')
+ rnode->maxcnt = -1;
+ while (isdigit((unsigned char) **pat))
+ rnode->maxcnt = rnode->maxcnt * 10 + *(*pat)++…
+ } else {
+ rnode->maxcnt = rnode->mincnt;
+ }
+ *pat += 1;
+ if (rnode->mincnt > NREPS || rnode->maxcnt > NREPS) {
+ rnode_free(rnode);
+ return NULL;
+ }
+ }
+ return rnode;
+}
+
+static struct rnode *rnode_seq(char **pat)
+{
+ struct rnode *c1 = rnode_atom(pat);
+ struct rnode *c2;
+ if (!c1)
+ return NULL;
+ c2 = rnode_seq(pat);
+ return c2 ? rnode_make(RN_CAT, c1, c2) : c1;
+}
+
+static struct rnode *rnode_parse(char **pat)
+{
+ struct rnode *c1 = rnode_seq(pat);
+ struct rnode *c2;
+ if ((*pat)[0] != '|')
+ return c1;
+ *pat += 1;
+ c2 = rnode_parse(pat);
+ return c2 ? rnode_make(RN_ALT, c1, c2) : c1;
+}
+
+static int rnode_count(struct rnode *rnode)
+{
+ int n = 1;
+ if (!rnode)
+ return 0;
+ if (rnode->rn == RN_CAT)
+ n = rnode_count(rnode->c1) + rnode_count(rnode->c2);
+ if (rnode->rn == RN_ALT)
+ n = rnode_count(rnode->c1) + rnode_count(rnode->c2) + 2;
+ if (rnode->rn == RN_GRP)
+ n = rnode_count(rnode->c1) + 2;
+ if (rnode->mincnt == 0 && rnode->maxcnt == 0)
+ return 0;
+ if (rnode->mincnt == 1 && rnode->maxcnt == 1)
+ return n;
+ if (rnode->maxcnt < 0) {
+ n = (rnode->mincnt + 1) * n + 1;
+ } else {
+ n = (rnode->mincnt + rnode->maxcnt) * n +
+ rnode->maxcnt - rnode->mincnt;
+ }
+ if (!rnode->mincnt)
+ n++;
+ return n;
+}
+
+static int re_insert(struct regex *p, int ri)
+{
+ p->p[p->n++].ri = ri;
+ return p->n - 1;
+}
+
+static void rnode_emit(struct rnode *n, struct regex *p);
+
+static void rnode_emitnorep(struct rnode *n, struct regex *p)
+{
+ int fork, done, mark;
+ if (n->rn == RN_ALT) {
+ fork = re_insert(p, RI_FORK);
+ p->p[fork].a1 = p->n;
+ rnode_emit(n->c1, p);
+ done = re_insert(p, RI_JUMP);
+ p->p[fork].a2 = p->n;
+ rnode_emit(n->c2, p);
+ p->p[done].a1 = p->n;
+ }
+ if (n->rn == RN_CAT) {
+ rnode_emit(n->c1, p);
+ rnode_emit(n->c2, p);
+ }
+ if (n->rn == RN_GRP) {
+ int grp = p->grpcnt++ + 1;
+ mark = re_insert(p, RI_MARK);
+ p->p[mark].mark = 2 * grp;
+ rnode_emit(n->c1, p);
+ mark = re_insert(p, RI_MARK);
+ p->p[mark].mark = 2 * grp + 1;
+ }
+ if (n->rn == RN_ATOM) {
+ int atom = re_insert(p, RI_ATOM);
+ ratom_copy(&p->p[atom].ra, &n->ra);
+ }
+}
+
+static void rnode_emit(struct rnode *n, struct regex *p)
+{
+ int last;
+ int jmpend[NREPS];
+ int jmpend_cnt = 0;
+ int i;
+ if (!n)
+ return;
+ if (n->mincnt == 0 && n->maxcnt == 0)
+ return;
+ if (n->mincnt == 1 && n->maxcnt == 1) {
+ rnode_emitnorep(n, p);
+ return;
+ }
+ if (n->mincnt == 0) {
+ int fork = re_insert(p, RI_FORK);
+ p->p[fork].a1 = p->n;
+ jmpend[jmpend_cnt++] = fork;
+ }
+ for (i = 0; i < MAX(1, n->mincnt); i++) {
+ last = p->n;
+ rnode_emitnorep(n, p);
+ }
+ if (n->maxcnt < 0) {
+ int fork;
+ fork = re_insert(p, RI_FORK);
+ p->p[fork].a1 = last;
+ p->p[fork].a2 = p->n;
+ }
+ for (i = MAX(1, n->mincnt); i < n->maxcnt; i++) {
+ int fork = re_insert(p, RI_FORK);
+ p->p[fork].a1 = p->n;
+ jmpend[jmpend_cnt++] = fork;
+ rnode_emitnorep(n, p);
+ }
+ for (i = 0; i < jmpend_cnt; i++)
+ p->p[jmpend[i]].a2 = p->n;
+}
+
+int regcomp(regex_t *preg, char *pat, int flg)
+{
+ struct rnode *rnode = rnode_parse(&pat);
+ struct regex *re = malloc(sizeof(*re));
+ int n = rnode_count(rnode) + 3;
+ int mark;
+ memset(re, 0, sizeof(*re));
+ re->p = malloc(n * sizeof(re->p[0]));
+ memset(re->p, 0, n * sizeof(re->p[0]));
+ mark = re_insert(re, RI_MARK);
+ re->p[mark].mark = 0;
+ rnode_emit(rnode, re);
+ mark = re_insert(re, RI_MARK);
+ re->p[mark].mark = 1;
+ mark = re_insert(re, RI_MATCH);
+ rnode_free(rnode);
+ re->flg = flg;
+ *preg = re;
+ return 0;
+}
+
+void regfree(regex_t *preg)
+{
+ struct regex *re = *preg;
+ int i;
+ for (i = 0; i < re->n; i++)
+ if (re->p[i].ri == RI_ATOM)
+ free(re->p[i].ra.s);
+ free(re->p);
+ free(re);
+}
+
+static int re_rec(struct regex *re, struct rstate *rs)
+{
+ struct rinst *ri = &re->p[rs->pc];
+ if (ri->ri == RI_ATOM) {
+ if (ratom_match(&ri->ra, rs))
+ return 1;
+ rs->pc++;
+ return re_rec(re, rs);
+ }
+ if (ri->ri == RI_MARK) {
+ if (ri->mark < NGRPS)
+ rs->mark[ri->mark] = rs->s - rs->o;
+ rs->pc++;
+ return re_rec(re, rs);
+ }
+ if (ri->ri == RI_JUMP) {
+ rs->pc = ri->a1;
+ return re_rec(re, rs);
+ }
+ if (ri->ri == RI_FORK) {
+ struct rstate base = *rs;
+ rs->pc = ri->a1;
+ if (!re_rec(re, rs))
+ return 0;
+ *rs = base;
+ rs->pc = ri->a2;
+ return re_rec(re, rs);
+ }
+ if (ri->ri == RI_MATCH)
+ return 0;
+ return 1;
+}
+
+static int re_recmatch(struct regex *re, struct rstate *rs, int nsub, regmatch…
+{
+ int i;
+ rs->pc = 0;
+ for (i = 0; i < LEN(rs->mark); i++)
+ rs->mark[i] = -1;
+ if (!re_rec(re, rs)) {
+ for (i = 0; i < nsub; i++) {
+ psub[i].rm_so = i * 2 < LEN(rs->mark) ? rs->mark[i * 2…
+ psub[i].rm_eo = i * 2 < LEN(rs->mark) ? rs->mark[i * 2…
+ }
+ return 0;
+ }
+ return 1;
+}
+
+int regexec(regex_t *preg, char *s, int nsub, regmatch_t psub[], int flg)
+{
+ struct regex *re = *preg;
+ struct rstate rs;
+ memset(&rs, 0, sizeof(rs));
+ rs.flg = re->flg | flg;
+ rs.o = s;
+ while (*s) {
+ rs.s = s++;
+ if (!re_recmatch(re, &rs, flg & REG_NOSUB ? 0 : nsub, psub))
+ return 0;
+ }
+ return 1;
+}
+
+int regerror(int errcode, regex_t *preg, char *errbuf, int errbuf_size)
+{
+ return 0;
+}
diff --git a/regex.h b/regex.h
t@@ -0,0 +1,18 @@
+#define REG_EXTENDED 0x01
+#define REG_NOSUB 0x02
+#define REG_ICASE 0x04
+#define REG_NEWLINE 0x08
+#define REG_NOTBOL 0x10
+#define REG_NOTEOL 0x20
+
+typedef struct {
+ long rm_so;
+ long rm_eo;
+} regmatch_t;
+
+typedef struct regex *regex_t;
+
+int regcomp(regex_t *preg, char *regex, int cflags);
+int regexec(regex_t *preg, char *str, int nmatch, regmatch_t pmatch[], int efl…
+int regerror(int errcode, regex_t *preg, char *errbuf, int errbuf_size);
+void regfree(regex_t *preg);
diff --git a/rset.c b/rset.c
t@@ -1,7 +1,7 @@
-#include <regex.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
+#include "regex.h"
#include "vi.h"
/* regular expression set */
You are viewing proxied material from mx1.adamsgaard.dk. The copyright of proxied material belongs to its original authors. Any comments or complaints in relation to proxied material should be directed to the original authors of the content concerned. Please see the disclaimer for more details.