Don't punt on encoding errors. - sam - An updated version of the sam text editor | |
git clone git://vernunftzentrum.de/sam.git | |
Log | |
Files | |
Refs | |
LICENSE | |
--- | |
commit ce79fc47ddd96ac43d9ab40078a80fe28a93ec12 | |
parent c6c0085b825c6060c598acfb6e2831a246e4fd31 | |
Author: Rob King <[email protected]> | |
Date: Fri, 27 Jan 2017 10:08:55 -0600 | |
Don't punt on encoding errors. | |
Originally, if files contained encoding errors (i.e. they weren't | |
valid text files), we would punt with a "file is not text" error. | |
This was considered sub-optimal, as there are many files that users | |
might want to edit that aren't correctly encoded. | |
We now replace invalid characters with the Unicode replacement | |
character (0xfffd) and warn. The dirty flag is handled "correctly" | |
as well. | |
Diffstat: | |
sam/io.c | 132 ++++++++++++++++++++++++++++++- | |
sam/sam.c | 2 ++ | |
sam/sam.h | 3 +++ | |
3 files changed, 136 insertions(+), 1 deletion(-) | |
--- | |
diff --git a/sam/io.c b/sam/io.c | |
@@ -10,6 +10,8 @@ | |
#define NSYSFILE 3 | |
#define NOFILE 128 | |
+#define MIN(x, y) ((x) < (y)? (x) : (y)) | |
+ | |
void | |
checkqid(File *f) | |
{ | |
@@ -76,9 +78,137 @@ writef(File *f) | |
} | |
} | |
+static wchar_t | |
+finishpartialchar(File *f, const char *s, size_t n, size_t *p) | |
+{ | |
+ size_t lp = *p; | |
+ wchar_t w = 0; | |
+ | |
+ while (!w && f->mblen && lp < n && f->mblen < BLOCKSIZE){ | |
+ mbstate_t ts = f->ps; | |
+ size_t rc = 0; | |
+ wchar_t c = 0; | |
+ | |
+ switch (rc = mbrtowc(&c, f->mbbuf, f->mblen, &ts)){ | |
+ case (size_t)-1: | |
+ memset(&f->ps, 0, sizeof(f->ps)); | |
+ w = UNICODE_REPLACEMENT_CHAR; | |
+ lp++; | |
+ break; | |
+ | |
+ case (size_t)-2: | |
+ f->mbbuf[f->mblen++] = s[lp++]; | |
+ break; | |
+ | |
+ default: | |
+ f->ps = ts; | |
+ w = c; | |
+ break; | |
+ } | |
+ } | |
+ | |
+ *p = lp; | |
+ f->mblen = 0; | |
+ memset(f->mbbuf, 0, sizeof(f->mbbuf)); | |
+ | |
+ return w? w : UNICODE_REPLACEMENT_CHAR; | |
+} | |
+ | |
+static size_t | |
+insertbuf(File *f, const char *s, size_t n, bool *nulls) | |
+{ | |
+ wchar_t wbuf[BLOCKSIZE + 1] = {0}; | |
+ size_t nw = 0; | |
+ size_t nt = 0; | |
+ size_t p = 0; | |
+ Posn pos = addr.r.p2; | |
+ | |
+ if (f->mblen) | |
+ wbuf[nw++] = finishpartialchar(f, s, n, &p); | |
+ | |
+ while (p < n){ | |
+ mbstate_t ts = f->ps; | |
+ wchar_t c = 0; | |
+ size_t rc = mbrtowc(&c, s + p, n - p, &ts); | |
+ switch (rc){ | |
+ case (size_t)0: | |
+ if (p < n){ | |
+ memset(&f->ps, 0, sizeof(f->ps)); | |
+ wbuf[nw++] = UNICODE_REPLACEMENT_CHAR; | |
+ *nulls = true; | |
+ p++; | |
+ } | |
+ break; | |
+ | |
+ case (size_t)-1: | |
+ memset(&f->ps, 0, sizeof(f->ps)); | |
+ wbuf[nw++] = UNICODE_REPLACEMENT_CHAR; | |
+ p++; | |
+ *nulls = true; | |
+ break; | |
+ | |
+ case (size_t)-2: | |
+ Finsert(f, tmprstr(wbuf, nw), pos); | |
+ memcpy(f->mbbuf, s + p, MIN(n - p, BLOCKSIZE)); | |
+ f->mblen = MIN(n - p, BLOCKSIZE); | |
+ return nt + nw; | |
+ | |
+ default: | |
+ f->ps = ts; | |
+ p += rc; | |
+ wbuf[nw++] = c; | |
+ break; | |
+ } | |
+ | |
+ if (nw >= BLOCKSIZE){ | |
+ Finsert(f, tmprstr(wbuf, nw), pos); | |
+ memset(wbuf, 0, sizeof(wbuf)); | |
+ nt += nw; | |
+ nw = 0; | |
+ } | |
+ } | |
+ | |
+ Finsert(f, tmprstr(wbuf, nw), pos); | |
+ return nt + nw; | |
+} | |
+ | |
Posn | |
readio(File *f, bool *nulls, bool setdate) | |
{ | |
+ char buf[(BLOCKSIZE * MB_LEN_MAX) + 1] = {0}; | |
+ wchar_t wbuf[BLOCKSIZE + 1] = {0}; | |
+ size_t nw = 0; | |
+ size_t p = 0; | |
+ size_t n = 0; | |
+ size_t nt = 0; | |
+ Posn pos = addr.r.p2; | |
+ uint64_t dev, qid; | |
+ int64_t mtime; | |
+ | |
+ n = read(io, buf, BLOCKSIZE); | |
+ while (n > 0){ | |
+ if ((ssize_t)n < 0) | |
+ return nt; | |
+ | |
+ nt += insertbuf(f, buf, n, nulls); | |
+ n = read(io, buf, BLOCKSIZE); | |
+ } | |
+ | |
+ if (setdate){ | |
+ if (statfd(io, &dev, &qid, &mtime, 0, 0) > 0){ | |
+ f->dev = dev; | |
+ f->qid = qid; | |
+ f->date = mtime; | |
+ checkqid(f); | |
+ } | |
+ } | |
+ | |
+ return nt; | |
+} | |
+ | |
+/* Posn | |
+readio(File *f, bool *nulls, bool setdate) | |
+{ | |
size_t n = 0; | |
size_t nt = 0; | |
Posn p = addr.r.p2; | |
@@ -117,7 +247,7 @@ readio(File *f, bool *nulls, bool setdate) | |
} | |
return nt; | |
-} | |
+} */ | |
Posn | |
writeio(File *f) | |
diff --git a/sam/sam.c b/sam/sam.c | |
@@ -516,6 +516,8 @@ edit(File *f, int cmd) | |
error_s(Eopen, genc); | |
} | |
p = readio(f, &nulls, empty); | |
+ if (nulls) | |
+ warn(Wnulls); | |
closeio((cmd=='e' || cmd=='I')? -1 : p); | |
if(cmd == 'r') | |
f->ndot.r.p1 = addr.r.p2, f->ndot.r.p2 = addr.r.p2+p; | |
diff --git a/sam/sam.h b/sam/sam.h | |
@@ -119,6 +119,9 @@ struct File | |
Posn cp1, cp2; /* Write-behind cache positions and */ | |
String cache; /* string */ | |
wchar_t getcbuf[NGETC]; | |
+ char mbbuf[BUFSIZ]; /* partial character during read */ | |
+ size_t mblen; /* number of bytes in partial character */ | |
+ mbstate_t ps; /* state of multibyte decoding */ | |
int ngetc; | |
int getci; | |
Posn getcp; |