/*
* File system devices.
* Follows device config in Ken's file server.
* Builds mirrors, concatenations, interleavings, and partitions
* of devices out of other (inner) devices.
* It is ok if inner devices are provided by this driver.
*
* Built files are grouped into separate directories
* (called trees, which are used to represent disks).
* The "#k/fs" tree is always available and never goes away.
* Configuration changes happen only while no I/O is in progress.
*
* Default sector size is one byte unless changed by the "disk" ctl.
*/
enum
{
	/* device types and configuration verbs */
	Fnone		= 0,
	Fmirror		= 1,	/* mirror of the inner devices */
	Fcat		= 2,	/* catenation of the inner devices */
	Finter		= 3,	/* interleaving of the inner devices */
	Fpart		= 4,	/* part of another device */
	Fclear		= 5,	/* start over */
	Fdel		= 6,	/* delete a configured device */
	Fdisk		= 7,	/* set default tree and sector size */
	Fcrypt		= 8,	/* encrypted device */

	Sectorsz	= 1,		/* default sector size */
	Blksize		= 8 << 10,	/* used by Finter only */
	Cryptsectsz	= 1 << 9,	/* used by Fcrypt only */

	Incr		= 5,	/* growth step for the dev array */

	/*
	 * Every qid carries its tree number.
	 * Tree 0 is #k/fs: it is added automatically, its first
	 * qid belongs to the ctl file, and it never goes away.
	 */
	Qtop		= 0,	/* #k */
	Qdir		= 1,	/* directory (#k/fs) */
	Qctl		= 2,	/* ctl, only for #k/fs/ctl */
	Qfirst		= 3,	/* first qid assigned to a device */

	Iswrite		= 0,
	Isread		= 1,

	Optional	= 0,
	Mustexist	= 1,

	/* tunable parameters */
	Maxconf		= 4 << 10,	/* max length for config */
	Ndevs		= 32,		/* max. inner devs per command */
	Ntrees		= 128,		/* max. number of trees */
	Maxretries	= 3,		/* max. retries of i/o errors */
	Retrypause	= 5000,		/* ms. to pause between retries */
};
/*
 * One configured device: a mirror, catenation, interleaving,
 * partition or encrypted view built from up to Ndevs inner devices.
 */
struct Fsdev
{
Ref; /* one per Chan doing I/O */
int gone; /* true if removed; mdelctl sets it before tearing the device down */
int vers; /* qid version for this device */
int type; /* Fnone, Fmirror, ... */
char *name; /* name for this fsdev */
Tree* tree; /* where the device is kept */
vlong size; /* min(inner[X].isize); reads and writes are clipped to it */
vlong start; /* start address (for Fpart); also added to the offset for Fcrypt i/o */
uint ndevs; /* number of inner devices */
Inner *inner[Ndevs]; /* inner devices */
Key *key; /* crypt key; its AES states are used by cryptio */
};
/*
 * A directory of devices, served as #k/<name>.
 */
struct Tree
{
char *name; /* name for #k/<name> */
Fsdev **devs; /* devices in dir.; scans cover nadevs slots and skip nil ones */
uint ndevs; /* number of devices */
uint nadevs; /* number of allocated devices in devs */
};
/*
 * Key material for an Fcrypt device: the two AES states that
 * cryptio hands to aes_xts_encrypt() and aes_xts_decrypt().
 */
struct Key {
AESstate tweak, ecb;
};
/*
 * Debug print.  This expands to a bare `if(debug)print', so a
 * dprint(...) statement must never be followed directly by an else.
 */
#define dprint if(debug)print
extern Dev fsdevtab; /* forward */
static RWlock lck; /* r: use devices; w: change config */
static Tree fstree; /* The main "fs" tree. Never goes away */
static Tree *trees[Ntrees]; /* internal representation of config */
static int ntrees; /* bound for scans of trees[]; slots below it may be nil */
static int qidvers;
static char *disk; /* default tree name used */
static char *source; /* default inner device used */
static int sectorsz = Sectorsz; /* default sector size */
static char confstr[Maxconf]; /* textual configuration */
if(mp == nil)
return seprint(s, e, "<null Fsdev>");
if(mp->type < 0 || mp->type >= nelem(tnames) || tnames[mp->type] == nil)
return seprint(s, e, "bad device type %d\n", mp->type);
s = strecpy(s, e, tnames[mp->type]);
if(mp->tree != &fstree)
s = seprint(s, e, " %s/%s", mp->tree->name, mp->name);
else
s = seprint(s, e, " %s", mp->name);
for(i = 0; i < mp->ndevs; i++)
s = seprint(s, e, " %s", mp->inner[i]->iname);
switch(mp->type){
case Fmirror:
case Fcat:
case Finter:
case Fcrypt:
s = strecpy(s, e, "\n");
break;
case Fpart:
s = seprint(s, e, " %ulld %ulld\n", mp->start, mp->size);
break;
default:
panic("#k: seprintdev bug");
}
return s;
}
dprint("deltree %s\n", t->name);
for(i = 0; i < ntrees; i++)
if(trees[i] == t){
if(i > 0){ /* "fs" never goes away */
free(t->name);
free(t->devs);
free(t);
trees[i] = nil;
}
return;
}
panic("#k: deltree: bug: tree not found");
}
/*
* A device is gone and we know that all its users are gone.
* A tree is gone when all its devices are gone ("fs" is never gone).
* Must close devices outside locks, so we could nest our own devices.
*/
static void
mdeldev(Fsdev *mp)
{
int i;
Inner *in;
Tree *t;
/*
 * Delete one or all devices in one or all trees.
 * tname and dname select the tree and the device by name; "*" in
 * either position matches everything.  Every matching device is
 * marked gone; one that has no references left is handed to
 * mdeldev() right away.
 * Raises Enonexist when nothing matched and a specific tree was named.
 */
static void
mdelctl(char *tname, char *dname)
{
	int i, j, alldevs, alltrees, some;
	Fsdev *mp;
	Tree *t;

	dprint("delctl %s\n", dname);
	alldevs = strcmp(dname, "*") == 0;
	alltrees = strcmp(tname, "*") == 0;
	some = 0;
Again:
	wlock(&lck);
	for(i = 0; i < ntrees; i++){
		t = trees[i];
		if(t == nil)
			continue;
		if(alltrees == 0 && strcmp(t->name, tname) != 0)
			continue;
		/*
		 * The device scan needs its own index.  Sharing i with
		 * the tree scan would make the outer loop resume at
		 * t->nadevs+1, skipping trees or revisiting the same
		 * one forever while holding the write lock.
		 */
		for(j = 0; j < t->nadevs; j++){
			mp = t->devs[j];
			if(mp == nil)
				continue;
			if(alldevs == 0 && strcmp(mp->name, dname) != 0)
				continue;
			/*
			 * Careful: must close outside locks and that
			 * may change the file tree we are looking at.
			 */
			some++;
			mp->gone = 1;
			if(mp->ref == 0){
				incref(mp);	/* keep it there */
				wunlock(&lck);
				mdeldev(mp);
				goto Again;	/* tree can change */
			}
		}
	}
	wunlock(&lck);
	if(some == 0 && alltrees == 0)
		error(Enonexist);
}
/*
* Process a single line of configuration,
* often of the form "cmd newname idev0 idev1".
* locking is tricky, because we need a write lock to
* add/remove devices yet adding/removing them may lead
* to calls to this driver that require a read lock (when
* inner devices are also provided by us).
*/
static void
mconfig(char* a, long n)
{
int i;
vlong size, start;
vlong *ilen;
char *tname, *dname, *fakef[4];
uchar key[2*256/8];
int keylen;
Chan **idev;
Cmdbuf *cb;
Cmdtab *ct;
Fsdev *mp;
Inner *inprv;
Tree *t;
/*
* Open all inner devices while we have only a read lock.
*/
poperror();
rlock(&lck);
if(waserror()){
runlock(&lck);
Fail:
for(i = 1; i < cb->nf; i++)
if(idev != nil && idev[i-1] != nil)
cclose(idev[i-1]);
if(mp != nil)
mdeldev(mp);
free(idev);
free(ilen);
free(cb);
nexterror();
}
idev = smalloc(sizeof(Chan*) * Ndevs);
ilen = smalloc(sizeof(vlong) * Ndevs);
for(i = 1; i < cb->nf; i++){
idev[i-1] = namec(cb->f[i], Aopen, ORDWR, 0);
ilen[i-1] = getlen(idev[i-1]);
}
poperror();
runlock(&lck);
/*
* Get a write lock and add the device if we can.
*/
wlock(&lck);
if(waserror()){
wunlock(&lck);
goto Fail;
}
t = lookuptree(tname);
if(t != nil)
validdevname(t, dname);
else{
t = treealloc(tname);
if(t == nil)
error("no more trees");
}
mp = devalloc(t, dname);
if(mp == nil){
if(t->ndevs == 0) /* it was created for us */
deltree(t); /* but we will not mdeldev() */
error(Enomem);
}
/* validate, copy and erase config; mconfig will repopulate confstr */
if (strncmp(confstr, cfgstr, sizeof cfgstr - 1) != 0)
error("bad #k config, first line must be: 'fsdev:\\n'");
c = nil;
kstrdup(&c, confstr + sizeof cfgstr - 1);
if(waserror()){
free(c);
nexterror();
}
memset(confstr, 0, sizeof confstr);
/* process config copy one line at a time */
for (p = c; p != nil && *p != '\0'; p = e){
e = strchr(p, '\n');
if (e == nil)
e = p + strlen(p);
else
e++;
mconfig(p, e - p);
}
poperror();
free(c);
poperror(); /* mustrd */
}
static int
mgen(Chan *c, char*, Dirtab*, int, int i, Dir *dp)
{
int treeno;
Fsdev *mp;
Qid qid;
Tree *t;
/*
 * I/O on an encrypted device: data passes through AES-XTS in
 * Cryptsectsz units on its way to or from inner[0].
 * Both off and n must be multiples of Cryptsectsz, else Ebadarg.
 * Reads are decrypted in place in a.  Writes are encrypted into a
 * bounce buffer of at most SDmaxio bytes, so the caller's data is
 * left untouched; a short write raises Eio.
 * Returns the number of bytes transferred.
 */
static long
cryptio(Fsdev *mp, int isread, uchar *a, long n, vlong off)
{
	long done, chunk, pos, bufsz;
	uchar *buf;

	if((((ulong)off|n) & (Cryptsectsz-1)))
		error(Ebadarg);

	if(isread){
		done = io(mp, mp->inner[0], Isread, a, n, off);
		if(done <= 0)
			return done;
		/* only whole sectors can be decrypted */
		done &= ~(Cryptsectsz-1);
		pos = 0;
		while(pos < done){
			aes_xts_decrypt(&mp->key->tweak, &mp->key->ecb,
				off+pos, a+pos, a+pos, Cryptsectsz);
			pos += Cryptsectsz;
		}
		return done;
	}

	bufsz = n;
	if(bufsz >= SDmaxio)
		bufsz = SDmaxio;

	/* keep asking until a bounce buffer is available */
	for(;;){
		buf = sdmalloc(bufsz);
		if(buf != nil)
			break;
		if(!waserror()){
			resrcwait("no memory for cryptio");
			poperror();
		}
	}

	/* release the bounce buffer if anything below raises an error */
	if(waserror()){
		sdfree(buf);
		nexterror();
	}
	done = 0;
	while((chunk = n - done) > 0){
		if(chunk > bufsz)
			chunk = bufsz;
		for(pos = 0; pos < chunk; pos += Cryptsectsz)
			aes_xts_encrypt(&mp->key->tweak, &mp->key->ecb,
				off+pos, a+pos, buf+pos, Cryptsectsz);
		if(io(mp, mp->inner[0], Iswrite, buf, chunk, off) != chunk)
			error(Eio);
		off += chunk;
		a += chunk;
		done += chunk;
	}
	sdfree(buf);
	poperror();
	return done;
}
/*
 * I/O on a catenation: walk the inner devices in order, skip the
 * ones that lie wholly before off, and move as much of the request
 * as each following device can hold.
 * NB: one transfer may span several inner devices.
 * Raises Eio on a short transfer; returns the number of bytes moved.
 */
static long
catio(Fsdev *mp, int isread, void *a, long n, vlong off)
{
	int idx;
	long chunk, want;
	Inner *dev;

	if(debug)
		print("catio %d %p %ld %lld\n", isread, a, n, off);

	want = n;
	for(idx = 0; idx < mp->ndevs && n > 0; idx++){
		dev = mp->inner[idx];
		if(off >= dev->isize){
			/* not there yet */
			off -= dev->isize;
			continue;
		}

		/* clip the piece to what is left of this device */
		chunk = n;
		if(off + n > dev->isize)
			chunk = dev->isize - off;
		if(debug)
			print("\tdev %d %p %ld %lld\n", idx, a, chunk, off);
		if(io(mp, dev, isread, a, chunk, off) != chunk)
			error(Eio);

		/* whatever remains starts at the beginning of the next device */
		n -= chunk;
		off = 0;
		a = (char*)a + chunk;
	}

	if(debug)
		print("\tres %ld\n", want - n);
	return want - n;
}
/*
 * I/O on an interleaved device.  Block number blk (of Blksize
 * bytes) lives on inner device blk % ndevs, at block blk / ndevs
 * within that device.  The request is cut at block boundaries and
 * each piece goes to the device that holds it.
 * Raises Eio on a short transfer; returns n as passed in.
 */
static long
interio(Fsdev *mp, int isread, void *a, long n, vlong off)
{
	int dev;
	long inblk, total, chunk, room;
	vlong devoff, blk;

	total = n;
	blk = off / Blksize;
	inblk = off % Blksize;

	/* only the first piece can start in the middle of a block */
	for(room = Blksize - inblk; n > 0; room = Blksize){
		dev = blk % mp->ndevs;
		devoff = (blk / mp->ndevs)*Blksize + inblk;
		chunk = n > room ? room : n;
		if(io(mp, mp->inner[dev], isread, a, chunk, devoff) != chunk)
			error(Eio);
		blk++;
		inblk = 0;
		a = (char*)a + chunk;
		n -= chunk;
	}
	return total;
}
static char*
seprintconf(char *s, char *e)
{
int i, j;
Tree *t;
*s = 0;
for(i = 0; i < ntrees; i++){
t = trees[i];
if(t != nil)
for(j = 0; j < t->nadevs; j++)
if(t->devs[j] != nil)
s = seprintdev(s, e, t->devs[j]);
}
return s;
}
/*
 * Read handler for the files served by this driver.
 * Directories are listed through devdirread() with mgen.  The ctl
 * file returns the configuration text, regenerated into confstr by
 * seprintconf().  Anything else is a device read: the request is
 * clipped to mp->size and dispatched on mp->type.
 * The read lock is held for the whole call and is released both on
 * the normal path (Done) and by the waserror() handler.
 * Returns what the selected routine reports; 0 at or past mp->size.
 *
 * NOTE(review): in the code visible here mp is never assigned before
 * it is dereferenced, and t is declared but unused.  The lookup that
 * maps c->qid.path to a device appears to be missing after the ctl
 * case -- confirm against the complete file.
 */
static long
mread(Chan *c, void *a, long n, vlong off)
{
int i, retry;
long l, res;
Fsdev *mp;
Tree *t;
dprint("mread %llux\n", c->qid.path);
rlock(&lck);
/* drop the read lock if anything below raises an error */
if(waserror()){
runlock(&lck);
nexterror();
}
res = -1;
if(c->qid.type & QTDIR){
res = devdirread(c, a, n, 0, 0, mgen);
goto Done;
}
if(c->qid.path == Qctl){
/* rebuild the textual configuration, then serve it as a string */
seprintconf(confstr, confstr + sizeof(confstr));
res = readstr((long)off, a, n, confstr);
goto Done;
}
/* clip the request to the device size */
if(off >= mp->size){
res = 0;
goto Done;
}
if(off + n > mp->size)
n = mp->size - off;
if(n == 0){
res = 0;
goto Done;
}
switch(mp->type){
case Fcat:
res = catio(mp, Isread, a, n, off);
break;
case Finter:
res = interio(mp, Isread, a, n, off);
break;
case Fpart:
res = io(mp, mp->inner[0], Isread, a, n, mp->start + off);
break;
case Fmirror:
/*
 * Try the inner devices in order; the first read that
 * returns without raising an error supplies the result.
 * If none does, pause and retry, up to Maxretries times.
 */
retry = 0;
do {
if (retry > 0) {
print("#k/%s: retry %d read for byte %,lld "
"count %ld: %s\n", mp->name, retry, off,
n, (up && up->errstr? up->errstr: ""));
/*
* pause before retrying in case it's due to
* a transient bus or controller problem.
*/
tsleep(&up->sleep, return0, 0, Retrypause);
}
for (i = 0; i < mp->ndevs; i++){
/* an error raised by io() lands here: move on to the next copy */
if (waserror())
continue;
l = io(mp, mp->inner[i], Isread, a, n, off);
poperror();
if (l >= 0){
res = l;
break; /* read a good copy */
}
}
/* i == mp->ndevs means the loop ran out of copies without a good read */
} while (i == mp->ndevs && ++retry <= Maxretries);
if (retry > Maxretries) {
/* no mirror had a good copy of the block */
print("#k/%s: byte %,lld count %ld: CAN'T READ "
"from mirror: %s\n", mp->name, off, n,
(up && up->errstr? up->errstr: ""));
error(Eio);
} else if (retry > 0)
print("#k/%s: byte %,lld count %ld: retry read OK "
"from mirror: %s\n", mp->name, off, n,
(up && up->errstr? up->errstr: ""));
break;
case Fcrypt:
res = cryptio(mp, Isread, a, n, mp->start + off);
break;
}
Done:
/* pops the error label pushed at the top, then releases the read lock */
poperror();
runlock(&lck);
return res;
}
/*
 * Write handler for the files served by this driver.
 * Directories cannot be written (Eisdir).  A write to the ctl file
 * is passed to mconfig() and reports n.  Anything else is a device
 * write: the request is clipped to mp->size and dispatched on
 * mp->type.  A mirror write goes to every inner device and is
 * retried while any copy fails, up to Maxretries times; it raises
 * Eio only if no copy took the data on the last pass.
 * Device writes run under the read lock, which is released both at
 * Done and by the waserror() handler.
 * Returns the number of bytes written; 0 at or past mp->size.
 *
 * NOTE(review): in the code visible here mp is never assigned before
 * it is dereferenced, and t is declared but unused.  The lookup that
 * maps c->qid.path to a device appears to be missing after the ctl
 * case -- confirm against the complete file.
 */
static long
mwrite(Chan *c, void *a, long n, vlong off)
{
	int i, allbad, anybad, retry;
	long l, res;
	Fsdev *mp;
	Tree *t;

	dprint("mwrite %llux\n", c->qid.path);
	if (c->qid.type & QTDIR)
		error(Eisdir);
	if (c->qid.path == Qctl){
		/* mconfig does its own locking, so call it with lck not held */
		mconfig(a, n);
		return n;
	}

	/*
	 * Done: pops an error label and releases the read lock, so
	 * both must be set up here, the same way mread does it.
	 */
	rlock(&lck);
	if(waserror()){
		runlock(&lck);
		nexterror();
	}

	/* clip the request to the device size */
	if(off >= mp->size){
		res = 0;
		goto Done;
	}
	if(off + n > mp->size)
		n = mp->size - off;
	if(n == 0){
		res = 0;
		goto Done;
	}
	res = n;
	switch(mp->type){
	case Fcat:
		res = catio(mp, Iswrite, a, n, off);
		break;
	case Finter:
		res = interio(mp, Iswrite, a, n, off);
		break;
	case Fpart:
		res = io(mp, mp->inner[0], Iswrite, a, n, mp->start + off);
		if (res != n)
			error(Eio);
		break;
	case Fmirror:
		retry = 0;
		do {
			if (retry > 0) {
				print("#k/%s: retry %d write for byte %,lld "
					"count %ld: %s\n", mp->name, retry, off,
					n, (up && up->errstr? up->errstr: ""));
				/*
				 * pause before retrying in case it's due to
				 * a transient bus or controller problem.
				 */
				tsleep(&up->sleep, return0, 0, Retrypause);
			}
			allbad = 1;
			anybad = 0;
			for (i = mp->ndevs - 1; i >= 0; i--){
				/* an error raised by io() lands here: note it, try the next copy */
				if (waserror()) {
					anybad = 1;
					continue;
				}
				l = io(mp, mp->inner[i], Iswrite, a, n, off);
				poperror();
				if (l == n)
					allbad = 0;	/* wrote a good copy */
				else
					anybad = 1;
			}
		} while (anybad && ++retry <= Maxretries);
		if (allbad) {
			/* no mirror took a good copy of the block */
			print("#k/%s: byte %,lld count %ld: CAN'T WRITE "
				"to mirror: %s\n", mp->name, off, n,
				(up && up->errstr? up->errstr: ""));
			error(Eio);
		} else if (retry > 0)
			print("#k/%s: byte %,lld count %ld: retry wrote OK "
				"to mirror: %s\n", mp->name, off, n,
				(up && up->errstr? up->errstr: ""));
		break;
	case Fcrypt:
		res = cryptio(mp, Iswrite, a, n, mp->start + off);
		break;
	}
Done:
	poperror();
	runlock(&lck);
	return res;
}