Introduction
Introduction Statistics Contact Development Disclaimer Help
Don't punt on encoding errors. - sam - An updated version of the sam text edito…
git clone git://vernunftzentrum.de/sam.git
Log
Files
Refs
LICENSE
---
commit ce79fc47ddd96ac43d9ab40078a80fe28a93ec12
parent c6c0085b825c6060c598acfb6e2831a246e4fd31
Author: Rob King <[email protected]>
Date: Fri, 27 Jan 2017 10:08:55 -0600
Don't punt on encoding errors.
Originally, if files contained encoding errors (i.e. they weren't
valid text files), we would punt with a "file is not text" error.
This was considered sub-optimal, as there are many files that users
might want to edit that aren't correctly encoded.
We now replace invalid characters with the Unicode replacement
character (0xfffd) and warn. The dirty flag is handled "correctly"
as well.
Diffstat:
sam/io.c | 132 ++++++++++++++++++++++++++++++-
sam/sam.c | 2 ++
sam/sam.h | 3 +++
3 files changed, 136 insertions(+), 1 deletion(-)
---
diff --git a/sam/io.c b/sam/io.c
@@ -10,6 +10,8 @@
#define NSYSFILE 3
#define NOFILE 128
+#define MIN(x, y) ((x) < (y)? (x) : (y)) /* smaller of x and y; each argument may be evaluated twice, so pass side-effect-free expressions */
+
void
checkqid(File *f)
{
@@ -76,9 +78,137 @@ writef(File *f)
}
}
+/*
+ * Complete the multibyte character whose leading bytes were stashed in
+ * f->mbbuf (f->mblen bytes) at the end of an earlier buffer.
+ *
+ * Bytes are taken one at a time from s[*p .. n-1] and appended to the stash
+ * until mbrtowc() either decodes a character or rejects the sequence.  On
+ * return *p is the index of the first byte of s that was not consumed, and
+ * the stash is always emptied.
+ *
+ * Returns the decoded character, or UNICODE_REPLACEMENT_CHAR when the
+ * sequence is invalid, decodes to a null, or cannot be completed from the
+ * bytes available.  The visible caller only invokes this when f->mblen is
+ * non-zero.
+ */
+static wchar_t
+finishpartialchar(File *f, const char *s, size_t n, size_t *p)
+{
+    size_t lp = *p;
+    wchar_t w = 0;
+    bool done = false;
+
+    /*
+     * Decode first, append second: a character completed by the very last
+     * byte of s must still get its decoding attempt.
+     */
+    while (!done){
+        mbstate_t ts = f->ps;
+        wchar_t c = 0;
+
+        switch (mbrtowc(&c, f->mbbuf, f->mblen, &ts)){
+        case (size_t)-1:
+            /* Invalid sequence: restart decoding from the initial state. */
+            memset(&f->ps, 0, sizeof(f->ps));
+            w = UNICODE_REPLACEMENT_CHAR;
+            /*
+             * The byte appended last is what broke the sequence; hand it
+             * back so the caller decodes it on its own instead of losing it.
+             */
+            if (lp > *p)
+                lp--;
+            done = true;
+            break;
+
+        case (size_t)-2:
+            /*
+             * Still incomplete.  Stop when the input is exhausted or the
+             * stash is full; mbbuf's capacity is sizeof(f->mbbuf), which is
+             * not necessarily BLOCKSIZE.
+             */
+            if (lp >= n || f->mblen >= sizeof(f->mbbuf))
+                done = true;
+            else
+                f->mbbuf[f->mblen++] = s[lp++];
+            break;
+
+        default:
+            /* Decoded (a null is mapped to the replacement character below). */
+            f->ps = ts;
+            w = c;
+            done = true;
+            break;
+        }
+    }
+
+    *p = lp;
+    f->mblen = 0;
+    memset(f->mbbuf, 0, sizeof(f->mbbuf));
+
+    return w? w : UNICODE_REPLACEMENT_CHAR;
+}
+
+/*
+ * Decode the n bytes at s as multibyte text and insert the resulting wide
+ * characters into f at addr.r.p2, flushing every BLOCKSIZE characters.
+ *
+ * A character left unfinished by the previous buffer (f->mblen != 0) is
+ * completed first.  Embedded nulls and undecodable bytes are each replaced
+ * by UNICODE_REPLACEMENT_CHAR and reported through *nulls.  If the buffer
+ * ends in the middle of a character, the trailing bytes are stashed in
+ * f->mbbuf/f->mblen for the next call.
+ *
+ * Returns the number of wide characters inserted.
+ */
+static size_t
+insertbuf(File *f, const char *s, size_t n, bool *nulls)
+{
+    wchar_t wbuf[BLOCKSIZE + 1] = {0};
+    size_t nw = 0; /* characters waiting in wbuf */
+    size_t nt = 0; /* characters already flushed */
+    size_t p = 0;  /* read offset into s */
+    Posn pos = addr.r.p2;
+
+    if (f->mblen)
+        wbuf[nw++] = finishpartialchar(f, s, n, &p);
+
+    while (p < n){
+        mbstate_t ts = f->ps;
+        wchar_t c = 0;
+        size_t left = n - p;
+        size_t rc = mbrtowc(&c, s + p, left, &ts);
+
+        if (rc == (size_t)-2){
+            /*
+             * The buffer ends inside a character: flush what we have and
+             * keep the tail.  The copy is limited by the real capacity of
+             * mbbuf, which is not necessarily BLOCKSIZE.
+             */
+            size_t keep = MIN(left, sizeof(f->mbbuf));
+
+            Finsert(f, tmprstr(wbuf, nw), pos);
+            memcpy(f->mbbuf, s + p, keep);
+            f->mblen = keep;
+            return nt + nw;
+        }
+
+        if (rc == 0 || rc == (size_t)-1){
+            /* Null or undecodable byte: substitute, warn, skip one byte. */
+            memset(&f->ps, 0, sizeof(f->ps));
+            wbuf[nw++] = UNICODE_REPLACEMENT_CHAR;
+            *nulls = true;
+            p++;
+        } else{
+            /* Commit the conversion state only after a successful decode. */
+            f->ps = ts;
+            p += rc;
+            wbuf[nw++] = c;
+        }
+
+        if (nw >= BLOCKSIZE){
+            Finsert(f, tmprstr(wbuf, nw), pos);
+            memset(wbuf, 0, sizeof(wbuf));
+            nt += nw;
+            nw = 0;
+        }
+    }
+
+    Finsert(f, tmprstr(wbuf, nw), pos);
+    return nt + nw;
+}
+
Posn
readio(File *f, bool *nulls, bool setdate)
{
+ char buf[(BLOCKSIZE * MB_LEN_MAX) + 1] = {0};
+ wchar_t wbuf[BLOCKSIZE + 1] = {0};
+ size_t nw = 0;
+ size_t p = 0;
+ size_t n = 0;
+ size_t nt = 0;
+ Posn pos = addr.r.p2;
+ uint64_t dev, qid;
+ int64_t mtime;
+
+ n = read(io, buf, BLOCKSIZE);
+ while (n > 0){
+ if ((ssize_t)n < 0)
+ return nt;
+
+ nt += insertbuf(f, buf, n, nulls);
+ n = read(io, buf, BLOCKSIZE);
+ }
+
+ if (setdate){
+ if (statfd(io, &dev, &qid, &mtime, 0, 0) > 0){
+ f->dev = dev;
+ f->qid = qid;
+ f->date = mtime;
+ checkqid(f);
+ }
+ }
+
+ return nt;
+}
+
+/* Posn
+readio(File *f, bool *nulls, bool setdate)
+{
size_t n = 0;
size_t nt = 0;
Posn p = addr.r.p2;
@@ -117,7 +247,7 @@ readio(File *f, bool *nulls, bool setdate)
}
return nt;
-}
+} */
Posn
writeio(File *f)
diff --git a/sam/sam.c b/sam/sam.c
@@ -516,6 +516,8 @@ edit(File *f, int cmd)
error_s(Eopen, genc);
}
p = readio(f, &nulls, empty);
+ if (nulls)
+ warn(Wnulls);
closeio((cmd=='e' || cmd=='I')? -1 : p);
if(cmd == 'r')
f->ndot.r.p1 = addr.r.p2, f->ndot.r.p2 = addr.r.p2+p;
diff --git a/sam/sam.h b/sam/sam.h
@@ -119,6 +119,9 @@ struct File
Posn cp1, cp2; /* Write-behind cache positions and */
String cache; /* string */
wchar_t getcbuf[NGETC];
+ char mbbuf[BUFSIZ]; /* partial character during read */
+ size_t mblen; /* number of bytes in partial character */
+ mbstate_t ps; /* state of multibyte decoding */
int ngetc;
int getci;
Posn getcp;
You are viewing proxied material from vernunftzentrum.de. The copyright of proxied material belongs to its original authors. Any comments or complaints in relation to proxied material should be directed to the original authors of the content concerned. Please see the disclaimer for more details.