/*      $NetBSD: rumpblk.c,v 1.64 2016/07/07 06:55:44 msaitoh Exp $     */

/*
* Copyright (c) 2009 Antti Kantee.  All Rights Reserved.
*
* Development of this software was supported by the
* Finnish Cultural Foundation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
*    notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
*    notice, this list of conditions and the following disclaimer in the
*    documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
* OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/

/*
* Block device emulation.  Presents a block device interface and
* uses rumpuser system calls to satisfy I/O requests.
*
* We provide fault injection.  The driver can be made to fail
* I/O occasionally.
*/

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: rumpblk.c,v 1.64 2016/07/07 06:55:44 msaitoh Exp $");

#include <sys/param.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/condvar.h>
#include <sys/disklabel.h>
#include <sys/evcnt.h>
#include <sys/fcntl.h>
#include <sys/kmem.h>
#include <sys/malloc.h>
#include <sys/queue.h>
#include <sys/stat.h>
#include <sys/cprng.h>

#include <rump-sys/kern.h>
#include <rump-sys/vfs.h>

#include <rump/rumpuser.h>

/*
* Debug tracing.  With the condition below set to 0, DPRINTF() expands
* to nothing; change it to 1 to make DPRINTF() expand to printf().
* Callers wrap the whole printf argument list in an extra set of
* parentheses, e.g. DPRINTF(("fmt %d\n", val)).
*/
#if 0
#define DPRINTF(x) printf x
#else
#define DPRINTF(x)
#endif

/* Number of slots in minors[]; the slot index doubles as the minor number. */
#define RUMPBLK_SIZE 16
/*
* Per-minor state describing one registered host file.  A slot is
* considered free while rblk_path is NULL.
*/
static struct rblkdev {
       char *rblk_path;                /* copy of the host path, NULL if free */
       int rblk_fd;                    /* host descriptor, -1 when not open */
       int rblk_mode;                  /* FREAD or FREAD|FWRITE, set on open */

       uint64_t rblk_size;             /* bytes exposed through this device */
       uint64_t rblk_hostoffset;       /* byte offset added to every host I/O */
       uint64_t rblk_hostsize;         /* length from rumpuser_getfileinfo() */
       int rblk_ftype;                 /* RUMPUSER_FT_* type of the host file */

       struct disklabel rblk_label;    /* label made by makedefaultlabel() */
} minors[RUMPBLK_SIZE];

/*
* Event counters.  They are attached in rumpblk_init() under the group
* name "rumpblk" and updated in dostrategy().
*/
static struct evcnt ev_io_total;       /* attached as "I/O reqs" */
static struct evcnt ev_io_async;       /* attached as "async I/O" */

static struct evcnt ev_bwrite_total;   /* attached as "bytes written" */
static struct evcnt ev_bwrite_async;   /* attached as "bytes written async" */
static struct evcnt ev_bread_total;    /* attached as "bytes read" */

/*
* Declare the device entry points through the dev_type_*() macros.
* NOTE(review): rumpblk_dump and rumpblk_size are declared here, but the
* switch tables below use nodump/nosize and no definition of either
* appears in the code shown -- confirm whether these are still needed.
*/
dev_type_open(rumpblk_open);
dev_type_close(rumpblk_close);
dev_type_read(rumpblk_read);
dev_type_write(rumpblk_write);
dev_type_ioctl(rumpblk_ioctl);
dev_type_strategy(rumpblk_strategy);
dev_type_strategy(rumpblk_strategy_fail);
dev_type_dump(rumpblk_dump);
dev_type_size(rumpblk_size);

/*
* Block device switch used when fault injection is off; rumpblk_init()
* attaches it when blkfail is zero.
*/
static const struct bdevsw rumpblk_bdevsw = {
       .d_open = rumpblk_open,
       .d_close = rumpblk_close,
       .d_strategy = rumpblk_strategy,
       .d_ioctl = rumpblk_ioctl,
       .d_dump = nodump,
       .d_psize = nosize,
       .d_discard = nodiscard,
       .d_flag = D_DISK
};

/*
* Block device switch used when fault injection is on; rumpblk_init()
* attaches it when blkfail is nonzero.  It differs from rumpblk_bdevsw
* only in its strategy routine, rumpblk_strategy_fail.
*/
static const struct bdevsw rumpblk_bdevsw_fail = {
       .d_open = rumpblk_open,
       .d_close = rumpblk_close,
       .d_strategy = rumpblk_strategy_fail,
       .d_ioctl = rumpblk_ioctl,
       .d_dump = nodump,
       .d_psize = nosize,
       .d_discard = nodiscard,
       .d_flag = D_DISK
};

/*
* Character device switch.  It shares open/close/ioctl with the block
* device; rumpblk_read and rumpblk_write go through do_physio().
*/
static const struct cdevsw rumpblk_cdevsw = {
       .d_open = rumpblk_open,
       .d_close = rumpblk_close,
       .d_read = rumpblk_read,
       .d_write = rumpblk_write,
       .d_ioctl = rumpblk_ioctl,
       .d_stop = nostop,
       .d_tty = notty,
       .d_poll = nopoll,
       .d_mmap = nommap,
       .d_kqfilter = nokqfilter,
       .d_discard = nodiscard,
       .d_flag = D_DISK
};

/* Open and close the host file behind one minor; defined further down. */
static int backend_open(struct rblkdev *, const char *);
static int backend_close(struct rblkdev *);

/* fail every n out of BLKFAIL_MAX */
#define BLKFAIL_MAX 10000
static int blkfail;            /* n; from RUMP_BLKFAIL, 0 disables injection */
static unsigned randstate;     /* state of the private PRNG in gimmerand() */
static kmutex_t rumpblk_lock;  /* held while scanning/claiming minors[] */
static int sectshift = DEV_BSHIFT; /* log2 sector size; see RUMP_BLKSECTSHIFT */

static void
makedefaultlabel(struct disklabel *lp, off_t size, int part)
{
       int i;

       memset(lp, 0, sizeof(*lp));

       lp->d_secperunit = size;
       lp->d_secsize = 1 << sectshift;
       lp->d_nsectors = size >> sectshift;
       lp->d_ntracks = 1;
       lp->d_ncylinders = 1;
       lp->d_secpercyl = lp->d_nsectors;

       /* oh dear oh dear */
       strncpy(lp->d_typename, "rumpd", sizeof(lp->d_typename));
       strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));

       lp->d_type = DKTYPE_RUMPD;
       lp->d_rpm = 11;
       lp->d_interleave = 1;
       lp->d_flags = 0;

       /* XXX: RAW_PART handling? */
       for (i = 0; i < part; i++) {
               lp->d_partitions[i].p_fstype = FS_UNUSED;
       }
       lp->d_partitions[part].p_size = size >> sectshift;
       lp->d_npartitions = part+1;
       /* XXX: file system type? */

       lp->d_magic = DISKMAGIC;
       lp->d_magic2 = DISKMAGIC;
       lp->d_checksum = 0; /* XXX */
}

int
rumpblk_init(void)
{
       char buf[64];
       devmajor_t rumpblkmaj = RUMPBLK_DEVMAJOR;
       unsigned tmp;
       int i;

       mutex_init(&rumpblk_lock, MUTEX_DEFAULT, IPL_NONE);

       if (rumpuser_getparam("RUMP_BLKFAIL", buf, sizeof(buf)) == 0) {
               blkfail = strtoul(buf, NULL, 10);
               /* fail everything */
               if (blkfail > BLKFAIL_MAX)
                       blkfail = BLKFAIL_MAX;
               if (rumpuser_getparam("RUMP_BLKFAIL_SEED",
                   buf, sizeof(buf)) == 0) {
                       randstate = strtoul(buf, NULL, 10);
               } else {
                       randstate = cprng_fast32();
               }
               printf("rumpblk: FAULT INJECTION ACTIVE! fail %d/%d. "
                   "seed %u\n", blkfail, BLKFAIL_MAX, randstate);
       } else {
               blkfail = 0;
       }

       if (rumpuser_getparam("RUMP_BLKSECTSHIFT", buf, sizeof(buf)) == 0) {
               printf("rumpblk: ");
               tmp = strtoul(buf, NULL, 10);
               if (tmp >= DEV_BSHIFT)
                       sectshift = tmp;
               else
                       printf("RUMP_BLKSECTSHIFT must be least %d (now %d), ",
                          DEV_BSHIFT, tmp);
               printf("using %d for sector shift (size %d)\n",
                   sectshift, 1<<sectshift);
       }

       memset(minors, 0, sizeof(minors));
       for (i = 0; i < RUMPBLK_SIZE; i++) {
               minors[i].rblk_fd = -1;
       }

       evcnt_attach_dynamic(&ev_io_total, EVCNT_TYPE_MISC, NULL,
           "rumpblk", "I/O reqs");
       evcnt_attach_dynamic(&ev_io_async, EVCNT_TYPE_MISC, NULL,
           "rumpblk", "async I/O");

       evcnt_attach_dynamic(&ev_bread_total, EVCNT_TYPE_MISC, NULL,
           "rumpblk", "bytes read");
       evcnt_attach_dynamic(&ev_bwrite_total, EVCNT_TYPE_MISC, NULL,
           "rumpblk", "bytes written");
       evcnt_attach_dynamic(&ev_bwrite_async, EVCNT_TYPE_MISC, NULL,
           "rumpblk", "bytes written async");

       if (blkfail) {
               return devsw_attach("rumpblk",
                   &rumpblk_bdevsw_fail, &rumpblkmaj,
                   &rumpblk_cdevsw, &rumpblkmaj);
       } else {
               return devsw_attach("rumpblk",
                   &rumpblk_bdevsw, &rumpblkmaj,
                   &rumpblk_cdevsw, &rumpblkmaj);
       }
}

/*
* Register the host file `path' as a rumpblk device and hand its minor
* number back through *dmin.  `offset' and `size' select the window of
* the host file to expose; a size of RUMPBLK_SIZENOTSET means
* "everything from offset to the end of the file".
*
* If the path is already registered, its existing minor is returned.
* Returns 0 on success, EINVAL for an unsupported host file type, EBUSY
* when all slots are taken, or the error from the host calls.
*/
int
rumpblk_register(const char *path, devminor_t *dmin,
       uint64_t offset, uint64_t size)
{
       struct rblkdev *slot;
       uint64_t hostlen;
       size_t pathlen;
       int hosttype, rv, idx;

       /* devices might not report correct size unless they're open */
       rv = rumpuser_getfileinfo(path, &hostlen, &hosttype);
       if (rv != 0)
               return rv;

       /* only regular files and block/character devices are supported */
       if (hosttype != RUMPUSER_FT_REG
           && hosttype != RUMPUSER_FT_BLK
           && hosttype != RUMPUSER_FT_CHR)
               return EINVAL;

       mutex_enter(&rumpblk_lock);

       /* first pass: is this path registered already? */
       idx = 0;
       while (idx < RUMPBLK_SIZE) {
               if (minors[idx].rblk_path != NULL &&
                   strcmp(minors[idx].rblk_path, path) == 0) {
                       mutex_exit(&rumpblk_lock);
                       *dmin = idx;
                       return 0;
               }
               idx++;
       }

       /* second pass: find the lowest free slot */
       idx = 0;
       while (idx < RUMPBLK_SIZE && minors[idx].rblk_path != NULL)
               idx++;
       if (idx == RUMPBLK_SIZE) {
               mutex_exit(&rumpblk_lock);
               return EBUSY;
       }

       /*
        * Claim the slot with a placeholder so that it no longer looks
        * free, then finish setting it up without holding the lock.
        */
       slot = &minors[idx];
       slot->rblk_path = __UNCONST("taken");
       mutex_exit(&rumpblk_lock);

       pathlen = strlen(path);
       slot->rblk_path = malloc(pathlen + 1, M_TEMP, M_WAITOK);
       strcpy(slot->rblk_path, path);

       slot->rblk_hostoffset = offset;
       if (size == RUMPBLK_SIZENOTSET) {
               KASSERT(offset < hostlen);
               slot->rblk_size = hostlen - offset;
       } else {
               KASSERT(size + offset <= hostlen);
               slot->rblk_size = size;
       }
       slot->rblk_hostsize = hostlen;
       slot->rblk_ftype = hosttype;
       makedefaultlabel(&slot->rblk_label, slot->rblk_size, idx);

       rv = backend_open(slot, path);
       if (rv != 0) {
               /* undo the claim so that the slot can be reused */
               memset(&slot->rblk_label, 0, sizeof(slot->rblk_label));
               free(slot->rblk_path, M_TEMP);
               slot->rblk_path = NULL;
               return rv;
       }

       *dmin = idx;
       return 0;
}

/*
* Unregister rumpblk.  It's the callers responsibility to make
* sure it's no longer in use.
*/
int
rumpblk_deregister(const char *path)
{
       struct rblkdev *rblk;
       int i;

       mutex_enter(&rumpblk_lock);
       for (i = 0; i < RUMPBLK_SIZE; i++) {
               if (minors[i].rblk_path&&strcmp(minors[i].rblk_path, path)==0) {
                       break;
               }
       }
       mutex_exit(&rumpblk_lock);

       if (i == RUMPBLK_SIZE)
               return ENOENT;

       rblk = &minors[i];
       backend_close(rblk);

       free(rblk->rblk_path, M_TEMP);
       memset(&rblk->rblk_label, 0, sizeof(rblk->rblk_label));
       rblk->rblk_path = NULL;

       return 0;
}

/*
* Release all backend resources, to be called only when the rump
* kernel is being shut down.
* This routine does not do a full "fini" since we're going down anyway.
*/
void
rumpblk_fini(void)
{
       int i;

       for (i = 0; i < RUMPBLK_SIZE; i++) {
               struct rblkdev *rblk;

               rblk = &minors[i];
               if (rblk->rblk_fd != -1)
                       backend_close(rblk);
       }
}

static int
backend_open(struct rblkdev *rblk, const char *path)
{
       int error, fd;

       KASSERT(rblk->rblk_fd == -1);
       error = rumpuser_open(path,
           RUMPUSER_OPEN_RDWR | RUMPUSER_OPEN_BIO, &fd);
       if (error) {
               error = rumpuser_open(path,
                   RUMPUSER_OPEN_RDONLY | RUMPUSER_OPEN_BIO, &fd);
               if (error)
                       return error;
               rblk->rblk_mode = FREAD;
       } else {
               rblk->rblk_mode = FREAD|FWRITE;
       }

       rblk->rblk_fd = fd;
       KASSERT(rblk->rblk_fd != -1);
       return 0;
}

static int
backend_close(struct rblkdev *rblk)
{

       rumpuser_close(rblk->rblk_fd);
       rblk->rblk_fd = -1;

       return 0;
}

/*
* Device open entry point.
*
* Returns ENXIO if the minor number is outside minors[] or has no open
* backend, EACCES if the caller asks for read/write access that the
* host open did not grant, and 0 otherwise.
*/
int
rumpblk_open(dev_t dev, int flag, int fmt, struct lwp *l)
{
       const devminor_t dmin = minor(dev);
       struct rblkdev *rblk;

       /*
        * A device node can carry any minor number, so check it before
        * indexing minors[].  The unsigned cast also catches a negative
        * value should devminor_t be a signed type.
        */
       if ((unsigned)dmin >= RUMPBLK_SIZE)
               return ENXIO;
       rblk = &minors[dmin];

       if (rblk->rblk_fd == -1)
               return ENXIO;

       /* refuse any requested access bit missing from the host mode */
       if (((flag & (FREAD|FWRITE)) & ~rblk->rblk_mode) != 0) {
               return EACCES;
       }

       return 0;
}

/*
* Device close entry point.  Nothing is released here; the host file is
* closed by backend_close(), which rumpblk_deregister() and
* rumpblk_fini() call.
*/
int
rumpblk_close(dev_t dev, int flag, int fmt, struct lwp *l)
{

       return 0;
}

/*
* Device ioctl entry point.  Only a handful of disk ioctls are
* supported:
*   DIOCGDINFO      copy out the minor's disklabel
*   DIOCGPARTINFO   describe the partition selected by DISKPART()
*   DIOCCACHESYNC   no-op
*   DIOCGMEDIASIZE  report rblk_size as an off_t
* Anything else yields ENOTTY; supported requests return 0.
*/
int
rumpblk_ioctl(dev_t dev, u_long xfer, void *addr, int flag, struct lwp *l)
{
       devminor_t unit = minor(dev);
       struct rblkdev *rblk = &minors[unit];
       struct disklabel *label = &rblk->rblk_label;
       struct partinfo *info;
       struct partition *pp;

       /* we should support a few more, but we don't for now */
       switch (xfer) {
       case DIOCGDINFO:
               *(struct disklabel *)addr = *label;
               break;

       case DIOCGPARTINFO:
               pp = &label->d_partitions[DISKPART(unit)];
               info = addr;
               info->pi_offset = pp->p_offset;
               info->pi_size = pp->p_size;
               info->pi_secsize = label->d_secsize;
               info->pi_bsize = BLKDEV_IOSIZE;
               info->pi_fstype = pp->p_fstype;
               info->pi_fsize = pp->p_fsize;
               info->pi_frag = pp->p_frag;
               info->pi_cpg = pp->p_cpg;
               break;

       case DIOCCACHESYNC:
               /* it's synced enough along the write path */
               break;

       case DIOCGMEDIASIZE:
               *(off_t *)addr = (off_t)rblk->rblk_size;
               break;

       default:
               return ENOTTY;
       }

       return 0;
}

/*
* Common helper for the character-device read and write paths.  Runs
* physio() with the strategy routine that matches the fault injection
* setting; `which' is passed straight through as the I/O direction.
*/
static int
do_physio(dev_t dev, struct uio *uio, int which)
{
       void (*strategy)(struct buf *) =
           blkfail ? rumpblk_strategy_fail : rumpblk_strategy;

       return physio(strategy, NULL, dev, which, minphys, uio);
}

/* Character-device read entry point: do_physio() with B_READ. */
int
rumpblk_read(dev_t dev, struct uio *uio, int flags)
{

       return do_physio(dev, uio, B_READ);
}

/* Character-device write entry point: do_physio() with B_WRITE. */
int
rumpblk_write(dev_t dev, struct uio *uio, int flags)
{

       return do_physio(dev, uio, B_WRITE);
}

static void
dostrategy(struct buf *bp)
{
       struct rblkdev *rblk = &minors[minor(bp->b_dev)];
       off_t off;
       int async = bp->b_flags & B_ASYNC;
       int op;

       if (bp->b_bcount % (1<<sectshift) != 0) {
               rump_biodone(bp, 0, EINVAL);
               return;
       }

       /* collect statistics */
       ev_io_total.ev_count++;
       if (async)
               ev_io_async.ev_count++;
       if (BUF_ISWRITE(bp)) {
               ev_bwrite_total.ev_count += bp->b_bcount;
               if (async)
                       ev_bwrite_async.ev_count += bp->b_bcount;
       } else {
               ev_bread_total.ev_count++;
       }

       /*
        * b_blkno is always in terms of DEV_BSIZE, and since we need
        * to translate to a byte offset for the host read, this
        * calculation does not need sectshift.
        */
       off = bp->b_blkno << DEV_BSHIFT;

       /*
        * Do bounds checking if we're working on a file.  Otherwise
        * invalid file systems might attempt to read beyond EOF.  This
        * is bad(tm) especially on mmapped images.  This is essentially
        * the kernel bounds_check() routines.
        */
       if (off + bp->b_bcount > rblk->rblk_size) {
               int64_t sz = rblk->rblk_size - off;

               /* EOF */
               if (sz == 0) {
                       rump_biodone(bp, 0, 0);
                       return;
               }
               /* beyond EOF ==> error */
               if (sz < 0) {
                       rump_biodone(bp, 0, EINVAL);
                       return;
               }

               /* truncate to device size */
               bp->b_bcount = sz;
       }

       off += rblk->rblk_hostoffset;
       DPRINTF(("rumpblk_strategy: 0x%x bytes %s off 0x%" PRIx64
           " (0x%" PRIx64 " - 0x%" PRIx64 "), %ssync\n",
           bp->b_bcount, BUF_ISREAD(bp) ? "READ" : "WRITE",
           off, off, (off + bp->b_bcount), async ? "a" : ""));

       op = BUF_ISREAD(bp) ? RUMPUSER_BIO_READ : RUMPUSER_BIO_WRITE;
       if (BUF_ISWRITE(bp) && !async)
               op |= RUMPUSER_BIO_SYNC;

       rumpuser_bio(rblk->rblk_fd, op, bp->b_data, bp->b_bcount, off,
           rump_biodone, bp);
}

/*
* Block-device strategy entry point used when fault injection is off;
* passes the buffer straight to dostrategy().
*/
void
rumpblk_strategy(struct buf *bp)
{

       dostrategy(bp);
}

/*
* Small private pseudo-random number generator.  Keeping our own
* generator and state (randstate) lets a given seed reproduce exactly
* which blocks fail.
*
* <mlelstv> pooka, rand()
* <mlelstv> [paste]
*/
static unsigned
gimmerand(void)
{
       const unsigned next = randstate * 1103515245 + 12345;

       /* advance the state, then return it reduced modulo 2^31 */
       randstate = next;
       return next % (0x80000000L);
}

/*
* Strategy routine with very simple fault injection.  Out of every
* BLKFAIL_MAX requests, n are failed with EIO, where n comes from the
* env variable RUMP_BLKFAIL; the rest go to dostrategy() as usual.
*/
void
rumpblk_strategy_fail(struct buf *bp)
{

       if (gimmerand() % BLKFAIL_MAX < blkfail) {
               /* this request drew a failure: complete it with EIO */
               printf("block fault injection: failing I/O on block %lld\n",
                   (long long)bp->b_blkno);
               bp->b_error = EIO;
               biodone(bp);
               return;
       }

       dostrategy(bp);
}