<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
"http://www.w3.org/TR/html4/loose.dtd">
<html>
<head>
<meta http-equiv=Content-Type content="text/html; charset=utf8">
<title>/usr/web/sources/contrib/geoff/sdnvme.c - Plan 9 from Bell Labs</title>
<!-- THIS FILE IS AUTOMATICALLY GENERATED. -->
<!-- EDIT sources.tr INSTEAD. -->
</meta>
</head>
<body>
<p style="margin-top: 0; margin-bottom: 0.17in"></p>
<p style="line-height: 1.2em; margin-left: 1.00in; text-indent: 0.00in; margin-right: 1.00in; margin-top: 0; margin-bottom: 0; text-align: center;">
<span style="font-size: 10pt"><a href="/plan9/">Plan 9 from Bell Labs</a>&rsquo;s /usr/web/sources/contrib/geoff/sdnvme.c</span></p>
<p style="margin-top: 0; margin-bottom: 0.17in"></p>
<p style="margin-top: 0; margin-bottom: 0.17in"></p>
<center><font size=-1>
Copyright © 2009 Alcatel-Lucent.<br />
Distributed under the
<a href="/plan9/license.html">Lucent Public License version 1.02</a>.
<br />
<a href="/plan9/download.html">Download the Plan 9 distribution.</a>
</font>
</center>
<p style="margin-top: 0; margin-bottom: 0.17in"></p>
<table width="100%" cellspacing=0 border=0><tr><td align="center">
<table cellspacing=0 cellpadding=5 bgcolor="#eeeeff"><tr><td align="left">
<pre>
<!-- END HEADER -->
/*
* driver for NVM Express 1.1 interface to PCI-Express solid state disk
* (i.e., flash memory).
*
* currently the controller is in the drive, so there's no multiplexing
* of drives through the controller.  multiple namespaces (actually number
* spaces) are assumed to refer to different views of the same disk
* (different block sizes).
*
* many features of NVME are ignored in the interest of simplicity and speed.
* many of them are intended to jump on a bandwagon (e.g., VMs) or check a box.
* using interrupts rather than polling costs us about 4% in large-block
* sequential read performance.
*/
#include "u.h"
#include "../port/lib.h"
#include "mem.h"
#include "dat.h"
#include "fns.h"
#include "io.h"
#include "../port/error.h"
#include "../port/sd.h"

/*
* address of the start of the nvme page (of ctlr-&gt;pgsz bytes) containing p.
* the mask arithmetic assumes ctlr-&gt;pgsz is a power of two.
*/
#define PAGEOF(ctlr, p) ((uintptr)(p) &amp; ~((ctlr)-&gt;pgsz-1))

/*
* submission queue occupancy tests for a Qpair: full when stepping the tail
* index forward by one would reach the head, empty when head equals tail.
*/
#define QFULL(qp)       ((qp)-&gt;qidx.hd == qidxplus1((qp), (qp)-&gt;qidx.tl))
#define QEMPTY(qp)      ((qp)-&gt;qidx.hd == (qp)-&gt;qidx.tl)

/* issue op on the admin queue pair, with no SDreq and an lba of 0 */
#define nvmeadmissue(ctlr, op, nsid, buf) \
       nvmeissue(ctlr, &amp;ctlr-&gt;qpair[Qadmin], nil, op, nsid, buf, 0)

/* queue ordinals, identify selectors and tunable parameters */
enum {
       /* fundamental constants */
       Qadmin,                 /* queue-pair ordinals; Qadmin fixed at 0 */
       Qio,
       Nqueues,

       Vall = 1&lt;&lt;Qadmin | 1&lt;&lt;Qio,  /* all interesting vector */

       /* ordinals of the doorbell registers within one queue's pair */
       Subq = 0,
       Complq,
       Qtypes,

       /* namespace ids; mkcmd turns Admid with Nsall into identify-controller */
       Nsunused = 0,
       Nsall   = ~0ul,

       /* values mkcmd places in cdw10[0] of an Admid command */
       Idns    = 0,
       Idctlr,
       Idnsids,

       Minsect = 512,          /* smallest sector size nvmerio will accept */

       /* tunable parameters */
       Debugintr = 0,          /* non-zero: iprint issue &amp; interrupt traces */
       Debugns = 0,

       /* in ms.: passed to tsleep and compared against TK2MS(sys-&gt;ticks) */
       Timeout = 20*1000,      /* adjust to taste. started at 2000 ms. */

       /*
        * NVME page size must be &gt;= sector size.  anything over 8K only
        * benefits bulk copies and benchmarks.
        */
       Startpgsz = Sdalign,    /* on samsung sm951, 4k ≤ page_size ≤ 128MB */

       Qlen    = 32,   /* defaults; queue lengths must be powers of 2 &lt; 4K */
       Cqlen   = 16,

       NCtlr   = 8,    /* each takes a pci-e or m.2 slot */
       NCtlrdrv= 1,
       NDrive  = NCtlr * NCtlrdrv,

       /* getfreexfr marks a claimed but not yet issued Transfer with this */
       Reserved = (ushort)~0,          /* placeholder cmdid */
};

/* admin commands: opcodes that mkcmd handles on the Qadmin queue */
enum Adminops {
       Admmkiosq       = 1,    /* create i/o submission q */
       Admmkiocq       = 5,    /* create i/o completion q */
       Admid           = 6,    /* identify */
};

/*
* I/O commands.  of these, the code shown here only issues Cmdread and
* Cmdwrite (see scsiop2nvme).
*/
enum Opcode {
       Cmdflush        = 0,
       Cmdwrite        = 1,
       Cmdread         = 2,
       Cmdwriteuncorr  = 4,
       Cmdcompare      = 5,
       Cmddsm          = 9,
};

/* short names for the structures declared below */
typedef struct Cmd Cmd;
typedef struct Completion Completion;
typedef struct Ctlr Ctlr;
typedef struct Ctlrid Ctlrid;
typedef struct Doorbell Doorbell;
typedef struct Lbafmt Lbafmt;
typedef struct Nsid Nsid;
typedef struct Nvindx Nvindx;
typedef struct Qpair Qpair;
typedef struct Regs Regs;
typedef struct Transfer Transfer;

/* this driver's SDifc; nvmeprobe stores its address in each SDev */
extern SDifc sdnvmeifc;

/* head and tail indices into one circular queue */
struct Nvindx {
       unsigned hd;                    /* remove at this index */
       unsigned tl;                    /* add at this index */
};

/* a submission queue and its paired completion queue */
struct Qpair {
       Cmd     *q;                     /* base of Cmd array */
       Nvindx  qidx;                   /* submission queue head &amp; tail */
       int     sqlen;                  /* entries in q; qidx wraps here */
       /* qdrain copies r-&gt;write here; also indexes ctlr-&gt;maxqlen */
       int     writelast;              /* flag: read or write in last cmd? */

       Completion *cmpl;               /* base of Completion array */
       Nvindx  cidx;                   /* only cidx.hd is used in the code shown */
       int     cqlen;
       /*
        * nvmeinterrupt treats the head completion entry as new only while
        * its phase bit differs from this; cidxincr flips it on each wrap.
        */
       int     phase;                  /* initial phase bit setting in cmpl */
};

/*
* per-command bookkeeping, chained from ctlr-&gt;xfrs.
* these are reused and never freed
*/
struct Transfer {
       Transfer *next;
       Rendez;                         /* unnamed: the issuer tsleeps on the Transfer itself */
       int     done;                   /* flag for rendezvous */
       int     status;                 /* from completion */
       ulong   qtm;                    /* time at enqueue */
       uvlong  stcyc;                  /* cycles at enqueue */
       /* Reserved between getfreexfr and sendcmd, then the command's id */
       ushort  cmdid;                  /* 0 means available */
       int     rdwr;                   /* index into ctlr-&gt;maxcyc: Read, Write or 2 */
};

/* per-controller state; since the drive is in the controller, per-drive too */
struct Ctlr {
       Regs*   regs;                   /* memory-mapped I/O registers */
       SDev*   sdev;
       Intrcommon;                     /* presumably supplies irq, used by nvmeprobe -- confirm */
       uintptr port;                   /* physical addr of I/O registers */

       int     pgsz;                   /* size of an `nvme page' */
       int     minpgsz;                /* smallest page size per regs-&gt;cap (pickpgsz) */
       int     mdts;                   /* actual value, not log2; unit minpgsz */
       int     sqlen;                  /* sub q len */
       int     cqlen;                  /* compl q len */
       int     stride; /* bytes from base of one doorbell reg. to the next */

       /* per-drive scalars, since there is only one drive */
       vlong   sectors;                /* total, copy to SDunits */
       int     secsize;                /* sector size, copy to SDunits */
       int     ns;                     /* namespace of the single drive */
       /* stats */
       int     maxqlen[2];     /* high water marks of read, write queues */
       /* example results: rd 89 µs, wr 325 µs */
       uvlong  maxcyc[3];  /* high water marks of read, write, admin cycles */

       /* per controller */
       QLock;                          /* serialise q notifications */
       Rendez;                         /* q empty/full notifications */
       Lock;                           /* intr svc */
       Lock    issuelock;              /* inflight &amp; q heads &amp; tail mostly */
       Lock    xfrlock;                /* held by getfreexfr while claiming a Transfer */
       Lock    shutlock;
       int     inflight;               /* count of xfrs in progress */
       int     intrsena;               /* interrupts we care about */
       Transfer *xfrs;                 /* transfers in flight or done */
       Qpair   qpair[Nqueues];         /* use a single admin queue pair */

       /* per-drive arrays */
       char    serial[20+1];
       char    model[40+1];
       char    fw[8+1];
};

struct Regs {
       uvlong  cap;            /* controller capabilities */
       ulong   vs;             /* version */
       /* intm* bits are actually vector number offsets */
       ulong   intmset;        /* intr mask set: bit # is i/o completion q # */
       ulong   intmclr;        /* intr mask clear: " */
       ulong   cc;             /* controller configuration */
       ulong   nssrc;          /* reset, iff cap.nssrs set */
       ulong   csts;           /* controller status */
       ulong   _rsvd2;         /* reserved */
       ulong   aqa;            /* admin queue attributes */
       uvlong  asq;            /* admin submit queue base address */
       uvlong  acq;            /* admin completion queue base address */
       uchar   _pad0[0x1000 - 0x38];
       /* this is the nominal doorbell layout, with stride of 4 */
       struct Doorbell {
               ulong   sqtl;   /* submission queue tail */
               ulong   cqhd;   /* completion queue head */
       } doorbell[Nqueues];
};

/*
* making the doorbell stride variable at run time requires changing the
* declaration and addressing of the Regs-&gt;doorbell array, making it clunkier.
* supposedly non-zero strides are only desirable in VMs, for efficiency.
*/
/* clunky doorbell register addressing for any stride */
/*
* despite its name, the second argument `qp' is a queue ordinal (Qadmin or
* Qio), not a Qpair pointer: it is multiplied by Qtypes to select a register.
*/
/* instead of &amp;ctlr-&gt;regs-&gt;doorbell[qid].sqtl */
#define doorbellsqtl(ctlr, qp) (ulong *)\
       ((char *)(ctlr)-&gt;regs-&gt;doorbell + (ctlr)-&gt;stride*(Qtypes*(qp) + Subq))
/* instead of &amp;ctlr-&gt;regs-&gt;doorbell[qid].cqhd */
#define doorbellcqhd(ctlr, qp) (ulong *)\
       ((char *)(ctlr)-&gt;regs-&gt;doorbell + (ctlr)-&gt;stride*(Qtypes*(qp) + Complq))

/* bit definitions for the cap, cc and csts registers in Regs */
enum {
       /* cap */
       Nssrs           = 1ull &lt;&lt; 36,       /* nvm subsystem reset supported */

       /* cc */
       Enable          = 1 &lt;&lt; 0,
       Cssnvm          = 0 &lt;&lt; 4,         /* nvm command set */
       Cssmask         = 7 &lt;&lt; 4,
       Shnnone         = 0 &lt;&lt; 14,                /* shutdown style */
       Shnnormal       = 1 &lt;&lt; 14,
       Shnabrupt       = 2 &lt;&lt; 14,
       Shnmask         = 3 &lt;&lt; 14,

       /* csts */
       Rdy             = 1 &lt;&lt; 0,         /* okay to add to sub. q */
       Cfs             = 1 &lt;&lt; 1,         /* controller fatal status */
       Shstnormal      = 0 &lt;&lt; 2,         /* shutdown status */
       Shstoccur       = 1 &lt;&lt; 2,
       Shstcmplt       = 2 &lt;&lt; 2,
       Shstmask        = 3 &lt;&lt; 2,
       Nssro           = 1 &lt;&lt; 4,         /* nvm subsystem reset occurred */
};

/*
* a submission queue entry.  CTASSERT below requires it to be 64 bytes.
* mkcmd zeroes the whole entry before filling in the fields it needs.
*/
struct Cmd {
       /* common 40-byte header */
       uchar   opcode;         /* command dword 0 */
       uchar   flags;
       ushort  cmdid;          /* from cidalloc; echoed in Completion.cmdid */
       ulong   nsid;
       ulong   cdw2[2];        /* not used */
       uvlong  metadata;
       uvlong  prp1;           /* buffer memory address */
       uvlong  prp2;           /* zero, buffer addr, or prp list addr */
       union {
               ulong   cdw10[6]; /* admin: command dwords 10-15 */
               struct {        /* nvm i/o */
                       uvlong  slba;           /* starting lba */
                       ushort  length;         /* sector count - 1 (see mkcmd) */
                       ushort  control;
                       ulong   dsmgmt;
                       /* rest are for end-to-end protection only */
                       ulong   reftag;
                       ushort  apptag;
                       ushort  appmask;
               };
       };
};

enum {
       /* cdw10[1] for Admmkiocq; mkcmd also sets Pc for Admmkiosq */
       Ien     = 1&lt;&lt;1,           /* intr enabled */
       Pc      = 1&lt;&lt;0,           /* physically contiguous */
};

/* a completion queue entry.  CTASSERT below requires it to be 16 bytes. */
struct Completion {
       ulong   specific;
       ulong   _pad;
       ushort  sqhd;           /* advancesqhd copies this into the sub. q head */
       ushort  sqid;
       ushort  cmdid;          /* matched against Transfer.cmdid by completexfr */
       ushort  stsphs;         /* status + 1 phase bit */
};

enum {
       /* completexfr masks this off to get the status it stores */
       Phase   = 1,            /* phase bit in stsphs */
};

/*
* leading part of the controller identify data.  the numbers in the padding
* member names and sizes are byte offsets: mdts is at 77, nns at 516.
*/
struct Ctlrid {
       ushort  pcivid;
       ushort  pcissvid;
       char    serial[20];     /* space-padded, unterminated strings */
       char    model[40];
       char    fw[8];
       char    _72_[77-72];
       uchar   mdts;           /* log2(max data xfr size), unit: min pg sz */
                               /* 0 is unlimited */
       char    _516_[516-78];  /* ... lots of uninteresting stuff ... */
       ulong   nns;            /* number of namespaces present */
       /* ... lots of uninteresting stuff ... */
};

/*
* leading part of the namespace identify data.  _pad0 places the lbafmt
* array at byte offset 128.
*/
struct Nsid {
       uvlong  size;
       uvlong  cap;
       uvlong  used;
       uchar   feat;
       uchar   lnbafmts;
       uchar   fmtlbasz;
       uchar   mdcap;
       uchar   dpc;
       uchar   dps;
       uchar   optnmic;
       uchar   optrescap;
       uchar   _pad0[128-32];
       struct Lbafmt {
               ushort  mdsize;
               uchar   lglbasize;      /* log2(lba size) */
               uchar   relperf;
       } lbafmt[16];
       /* ... uninteresting stuff ... */
};

/* queue entries must have exactly these sizes */
CTASSERT(sizeof(Cmd) == 64, cmd_wrong_size);
CTASSERT(sizeof(Completion) == 16, compl_wrong_size);

static Lock clocklck;
static int clockrunning;
/* set by sendcmd; zeroed by nvmeinterrupt once nothing is in flight */
static ulong iosttck;           /* tick of most recently-started i/o */
static int nctlrs;
static Ctlr *ctlrs[NCtlr];

/*
* step qp's completion queue head index forward one entry.  on reaching
* ctlr-&gt;cqlen it wraps to zero and the expected phase bit is flipped.
*/
static void
cidxincr(Ctlr *ctlr, Qpair *qp)
{
       qp-&gt;cidx.hd++;
       if (qp-&gt;cidx.hd &lt; ctlr-&gt;cqlen)
               return;
       /* wrapped */
       qp-&gt;cidx.hd = 0;
       qp-&gt;phase ^= Phase;
}

#ifdef unused
/* panic, naming `where', if csts reports controller fatal status */
static void
isfatal(Regs *regs, char *where)
{
       if ((regs-&gt;csts &amp; Cfs) == 0)
               return;
       panic("nvme: fatal controller error %s", where);
}
#endif

/*
* return the first Transfer on ctlr-&gt;xfrs whose cmdid matches, or nil if
* there is none.  a cmdid of 0 finds a free Transfer.
*/
static Transfer *
findxfr(Ctlr *ctlr, int cmdid)
{
       Transfer *t;

       t = ctlr-&gt;xfrs;
       while (t != nil &amp;&amp; t-&gt;cmdid != cmdid)
               t = t-&gt;next;
       return t;
}

/*
* cqhd is head of the completion queue.
* mark its transfer done, notify anybody waiting for it.
*
* the Transfer is found by matching cqhd-&gt;cmdid; it is a panic if none
* matches.  qid is used only in the debugging print.
*/
static void
completexfr(Ctlr *ctlr, Completion *cqhd, int qid)
{
       uvlong cycs;
       Transfer *xfr;

       if (Debugintr)
               iprint("intr q %d cmdid %d...", qid, cqhd-&gt;cmdid);
       xfr = findxfr(ctlr, cqhd-&gt;cmdid);
       if (xfr == nil)
               panic("sd%C0: nvmeinterrupt: unexpected completion cmd id %d",
                       ctlr-&gt;sdev-&gt;idno, cqhd-&gt;cmdid);
       /* complain, but carry on, if it completed only after Timeout ms. */
       if (xfr-&gt;qtm &amp;&amp; TK2MS(sys-&gt;ticks) - xfr-&gt;qtm &gt;= Timeout)
               iprint("sd%C0: nvmeinterrupt: completed cmd id %d but "
                       "took more than %d s.\n",
                       ctlr-&gt;sdev-&gt;idno, cqhd-&gt;cmdid, Timeout/1000);

       /* cycle-based measurements */
       /* elapsed cycles since sendcmd; keep the maximum per rdwr kind */
       cycles(&amp;cycs);
       cycs -= xfr-&gt;stcyc;
       if (cycs &gt; ctlr-&gt;maxcyc[xfr-&gt;rdwr])
               ctlr-&gt;maxcyc[xfr-&gt;rdwr] = cycs;

       xfr-&gt;status = cqhd-&gt;stsphs &amp; ~Phase;      /* status without phase bit */
       xfr-&gt;done = 1;
       xfr-&gt;qtm = 0;
       wakeup(xfr);            /* notify of completion */
}

/*
* move the software copy of the submission queue head to the value the
* controller reported in this completion, then wake anyone sleeping on
* the controller (waiting for queue space or for the queue to drain).
*/
static void
advancesqhd(Ctlr *ctlr, Qpair *qp, Completion *cqhd, int qid)
{
       if (Debugintr)
               iprint("sw q %d sqhd set to %d...", qid, cqhd-&gt;sqhd);

       qp-&gt;qidx.hd = cqhd-&gt;sqhd;

       /* notify of sqhd advance */
       wakeup(ctlr);
}

/*
* advance compl. q head, notify ctlr., which will extinguish intr source
* (by acknowledging this completion) and remove cqhd from the compl. q.
*
* qid selects which queue's completion-head doorbell register is written.
*/
static void
advancecqhd(Ctlr *ctlr, Qpair *qp, int qid)
{
       cidxincr(ctlr, qp);     /* may wrap and toggle qp-&gt;phase */
       if (Debugintr)
               iprint("doorbell q %d cqhd set to %d\n", qid, qp-&gt;cidx.hd);
       /* tell the controller the new head index */
       *doorbellcqhd(ctlr, qid) = qp-&gt;cidx.hd;
       coherence();
}

/*
* Act on and clear the interrupt(s).
* In order to share PCI IRQs, just ignore spurious interrupts.
* Advances queue head indices past completed operations.
*
* arg is the Ctlr.  Returns Intrforme if at least one completion was
* processed, otherwise Intrnotforme.  Also called directly by nvmeissue
* (with a nil Ureg) to poll after a timed-out sleep.
*/
static Intrsvcret
nvmeinterrupt(Ureg *, void* arg)
{
       int qid, ndone, donepass; /* qid's not a great name (see path.qid) */
       ulong causes;
       Completion *cqhd;
       Ctlr *ctlr;
       Qpair *qp;
       Regs *regs;

       ctlr = arg;
       regs = ctlr-&gt;regs;
       /* the register is read but the value is not otherwise used */
       causes = regs-&gt;intmset;
       USED(causes);
       ilock(&amp;ctlr-&gt;issuelock); /* keep other cpus out of intr svc, indices */
       if (ctlr-&gt;inflight == 0) {   /* not expecting an interrupt? */
               /* probably lost a race with polling: nothing to do */
               iunlock(&amp;ctlr-&gt;issuelock);
               return Intrnotforme;
       }

       ndone = 0;
       /* rescan all queues until a whole pass finds no new completion */
       do {
               donepass = 0;
               for (qid = Nqueues - 1; qid &gt;= 0; qid--) /* scan i/o q 1st */
                       for (qp = &amp;ctlr-&gt;qpair[qid]; ; ) {
                               cqhd = &amp;qp-&gt;cmpl[qp-&gt;cidx.hd];
                               /*
                                * the head entry is new only if its phase
                                * bit differs from qp-&gt;phase.
                                */
                               if ((cqhd-&gt;stsphs &amp; Phase) == qp-&gt;phase)
                                       break;
                               completexfr(ctlr, cqhd, qid);
                               advancesqhd(ctlr, qp, cqhd, qid);
                               /*
                                * toggles qp-&gt;phase if qp-&gt;cidx.hd wraps when
                                * incr'd.
                                */
                               advancecqhd(ctlr, qp, qid);
                               if (--ctlr-&gt;inflight &lt; 0)
                                       iprint("nvmeinterrupt: inflight botch\n");
                               ndone++, donepass++;
                       }
       } while (donepass &gt; 0);
       /* unmask intr. sources of interest iff transfers are in flight */
       if (ctlr-&gt;inflight == 0) {
               iosttck = 0;
               ctlr-&gt;intrsena = 0;
       } else
               regs-&gt;intmclr = Vall;
       iunlock(&amp;ctlr-&gt;issuelock);
       if (ndone &gt; 0)
               return Intrforme;
       else
               return Intrnotforme;
}

/* return cmd id other than zero and Reserved */
static int
cidalloc(void)
{
       int thisid;
       static int cid;
       static Lock cidlck;

       ilock(&amp;cidlck);
       ++cid;
       if ((ushort)cid == 0 || (ushort)cid == Reserved)
               cid = 1;
       thisid = cid;
       iunlock(&amp;cidlck);
       return thisid;
}

/*
* fill in submission queue entry *cmd
*
* op is the opcode and qid the queue ordinal it will be sent on (Qadmin
* selects the admin encodings, anything else the i/o encodings).
* buf, if non-zero, is the kernel virtual address of the data buffer.
* r may be nil for admin commands; it is dereferenced for Cmdread and
* Cmdwrite.  lba is used only for Cmdread and Cmdwrite.
*/
static void
mkcmd(Ctlr *ctlr, Cmd *cmd, SDreq *r, int op, ulong nsid, void *buf, int qid,
       vlong lba)
{
       long count;
       uintptr addr;

       memset(cmd, 0, sizeof *cmd);
       cmd-&gt;opcode = op;
       cmd-&gt;cmdid = cidalloc();
       cmd-&gt;nsid = nsid;
       addr = (uintptr)buf;
       if (addr != 0) {
               if (addr &lt; KZERO)
                       print("nvme mkcmd: %#p not kernel virtual address\n",
                               addr);
               /* each prp entry points to at most a page */
               cmd-&gt;prp1 = PCIWADDR((void *)addr);
               /*
                * a transfer longer than one nvme page but no longer than
                * two gets a second entry, for the page after prp1's.
                * NOTE(review): this looks like it relies on buf being
                * aligned to the nvme page size -- confirm with callers.
                */
               if (r &amp;&amp; r-&gt;dlen &gt; ctlr-&gt;pgsz &amp;&amp; r-&gt;dlen &lt;= 2*ctlr-&gt;pgsz)
                       cmd-&gt;prp2 = PAGEOF(ctlr, cmd-&gt;prp1) + ctlr-&gt;pgsz;
               else
                       cmd-&gt;prp2 = 0;
       }
       switch (qid) {
       case Qadmin:
               /* we are using single-message msi */
               switch (op) {
               case Admmkiocq:
                       /* queue size is given as length - 1, with the queue id */
                       cmd-&gt;cdw10[0] = (ctlr-&gt;cqlen - 1)&lt;&lt;16 | Qio;
                       cmd-&gt;cdw10[1] = Ien | Pc;  /* vector 0 since no msi-x */
                       break;
               case Admmkiosq:
                       cmd-&gt;cdw10[0] = (ctlr-&gt;sqlen - 1)&lt;&lt;16 | Qio;
                       cmd-&gt;cdw10[1] = Qio&lt;&lt;16 | Pc;  /* completion q id */
                       break;
               case Admid:
                       /* Nsall asks for the controller's identity, not a namespace's */
                       if (nsid == Nsall) {
                               cmd-&gt;cdw10[0] = Idctlr;
                               cmd-&gt;nsid = 0;
                       } else
                               cmd-&gt;cdw10[0] = Idns;
                       break;
               }
               break;
       default:
               switch (op) {
               case Cmdread:
               case Cmdwrite:
                       count = r-&gt;dlen / r-&gt;unit-&gt;secsize;
                       if (count == 0) {
                               /* only reported; slba and length stay zero */
                               print("nvmeissue: zero sector count for i/o "
                                       "of length %d\n", r-&gt;dlen);
                               break;
                       }
                       cmd-&gt;slba = lba;
                       cmd-&gt;length = (ushort)(count - 1);   /* sectors */
                       assert(r-&gt;data == buf);
                       assert(r-&gt;unit-&gt;secsize * count &lt;= r-&gt;dlen);
                       assert(nsid);
                       break;
               }
               break;
       }
}

/*
* record a new high-water mark for the submission queue depth, kept
* separately for reads and writes (selected by qp-&gt;writelast).
*/
static void
updmaxqlen(Ctlr *ctlr, Qpair *qp)
{
       int depth, *hiwater;

       hiwater = &amp;ctlr-&gt;maxqlen[qp-&gt;writelast];
       /* entries between head and tail, allowing for wrap-around */
       depth = (qp-&gt;qidx.tl + qp-&gt;sqlen - qp-&gt;qidx.hd) % qp-&gt;sqlen;
       if (*hiwater &lt; depth)
               *hiwater = depth;
}

/*
* send a command via the submission queue.
* call with ctlr-&gt;issuelock held.
*
* the caller has already composed the command at *qtl and advanced the
* submission queue's tail index past it; this routine records the
* command in xfr, publishes the new tail via the queue's doorbell
* register and unmasks interrupts.
*/
static void
sendcmd(Ctlr *ctlr, Qpair *qp, Cmd *qtl, Transfer *xfr)
{
       int qid;

       xfr-&gt;done = 0;
       xfr-&gt;cmdid = qtl-&gt;cmdid;
       xfr-&gt;qtm = TK2MS(sys-&gt;ticks);
       qid = qp - ctlr-&gt;qpair;
       if (Debugintr)
               iprint("issue q %d cmdid %d...", qid, xfr-&gt;cmdid);

       /*
        * Notify controller of new submission queue entry,
        * which triggers execution of it.
        */
       updmaxqlen(ctlr, qp);
       cycles(&amp;xfr-&gt;stcyc);

       ctlr-&gt;inflight++;
       iosttck = sys-&gt;ticks;
       /*
        * make sure the command just composed in memory is visible
        * before the controller is told to fetch it.
        */
       coherence();
       *doorbellsqtl(ctlr, qid) = qp-&gt;qidx.tl;              /* start i/o */
       coherence();
       ctlr-&gt;regs-&gt;intmclr = ctlr-&gt;intrsena = Vall;   /* unmask intrs */
}

static int
doneio(void* arg)
{
       return ((Transfer *)arg)-&gt;done;
}

/* return the submission queue index after idx, wrapping to 0 at qp-&gt;sqlen */
static uint
qidxplus1(Qpair *qp, uint idx)
{
       idx++;
       return idx &gt;= qp-&gt;sqlen? 0: idx;
}

static int
qnotfull(void *arg)
{
       return !QFULL((Qpair *)arg);
}

static int
qempty(void *arg)
{
       return QEMPTY((Qpair *)arg);
}

/*
* claim a Transfer for a new command: reuse a free one (cmdid 0) from
* ctlr-&gt;xfrs, or allocate another and push it onto the front of that
* list.  the result is marked with cmdid Reserved until it is issued.
* panics if memory is exhausted.
*/
static Transfer *
getfreexfr(Ctlr *ctlr)
{
       Transfer *t;

       ilock(&amp;ctlr-&gt;xfrlock);
       if ((t = findxfr(ctlr, 0)) == nil) {
               /* nothing free: grow the chain */
               if ((t = malloc(sizeof *t)) == nil)
                       panic("nvmeissue: out of memory");
               t-&gt;next = ctlr-&gt;xfrs;
               ctlr-&gt;xfrs = t;
       }
       t-&gt;cmdid = Reserved;
       t-&gt;qtm = 0;
       iunlock(&amp;ctlr-&gt;xfrlock);
       return t;
}

/*
* if needed, wait for the sub q to drain a lot or a little.
* not infallible, so test afterward under lock.
*
* r may be nil (admin commands), in which case only the queue-full wait
* applies.  may sleep, so must not be called with issuelock held.
*/
static void
qdrain(Ctlr *ctlr, Qpair *qp, SDreq *r)
{
       if (QFULL(qp)) {
               qlock(ctlr);                    /* wait for q space */
               while (QFULL(qp))
                       sleep(ctlr, qnotfull, qp);
               qunlock(ctlr);
       }
       /*
        * don't mix reads and writes in the queue, to avoid read-before-write
        * problems.
        */
       if (r &amp;&amp; qp-&gt;writelast != r-&gt;write) {
               qlock(ctlr);
               /* re-check now that the qlock is held */
               if (qp-&gt;writelast != r-&gt;write)
                       sleep(ctlr, qempty, qp);  /* changing, so drain */
               qp-&gt;writelast = r-&gt;write;
               qunlock(ctlr);
       }
}

/*
* wait until a command for r may be queued on qp, and return with
* ctlr-&gt;issuelock held.  the wait itself happens unlocked in qdrain, so
* the conditions are checked again once the lock is taken; if another
* process got in first, drop the lock and start over.
*/
static void
qdrainilock(Ctlr *ctlr, Qpair *qp, SDreq *r)
{
       for (;;) {
               qdrain(ctlr, qp, r);

               ilock(&amp;ctlr-&gt;issuelock);
               if (!QFULL(qp) &amp;&amp; (r == nil || qp-&gt;writelast == r-&gt;write))
                       return;         /* issuelock still held */

               /* lost a race; uncommon case */
               iunlock(&amp;ctlr-&gt;issuelock);
       }
}

/*
* print a non-zero completion status, broken into its fields: status code
* (bits 1-8), code type (9-11), `more' (14) and `do not retry' (15).
*/
static void
prerr(int sts)
{
       if (sts == 0)
               return;
       iprint("nvmeissue: cmd error status %#ux: "
               "code %#ux type %d more %d do-not-retry %d\n",
               sts,
               (sts &gt;&gt; 1) &amp; MASK(8),
               (sts &gt;&gt; 9) &amp; MASK(3),
               (sts &gt;&gt; 14) &amp; MASK(1),
               (sts &gt;&gt; 15) &amp; MASK(1));
}

/*
* add new nvme command to tail of submission queue of Qpair,
* and wait for it to complete.  return status with phase bit zeroed.
*
* r is the SCSI request for i/o commands and nil for admin commands.
* may sleep.  panics if the command has not completed after the Timeout
* sleep and one explicit poll of the completion queues.
*/
static int
nvmeissue(Ctlr *ctlr, Qpair *qp, SDreq *r, int op, ulong nsid, void *buf,
       vlong lba)
{
       ushort sts;
       Cmd *qtl;
       Transfer *xfr;

       xfr = getfreexfr(ctlr);
       /* rdwr picks the ctlr-&gt;maxcyc slot; 2 is for everything but read/write */
       if (op == Cmdwrite)
               xfr-&gt;rdwr = Write;
       else if (op == Cmdread)
               xfr-&gt;rdwr = Read;
       else
               xfr-&gt;rdwr = 2;

       /* serialise composition of cmd in place at sq tail */
       qdrainilock(ctlr, qp, r);
       /* ctlr-&gt;issuelock is now held */

       /* Reserve a space and update sub. q tail index past it. */
       qtl = &amp;qp-&gt;q[qp-&gt;qidx.tl];
       qp-&gt;qidx.tl = qidxplus1(qp, qp-&gt;qidx.tl);

       /*
        * Compose the command struct at the tail of the submission queue.
        * mkcmd converts buf to physical address space.
        */
       mkcmd(ctlr, qtl, r, op, nsid, buf, qp - ctlr-&gt;qpair, lba);
       sendcmd(ctlr, qp, qtl, xfr);                    /* start cmd */
       iunlock(&amp;ctlr-&gt;issuelock);

       /* this is the only process waiting for this xfr. */
       /*
        * an error raised during the sleep lands back at waserror, which
        * is simply re-armed before sleeping again -- presumably so that
        * an interrupted sleep cannot abandon a command still in flight.
        */
       while(waserror())
               ;
       tsleep(xfr, doneio, xfr, Timeout);
       poperror();
       if (!xfr-&gt;done) {
               /* we see this with the Samsung 983 DCT. */
               /* poll the completion queues by hand in case an intr. was missed */
               nvmeinterrupt(nil, ctlr);
               if (!xfr-&gt;done)
                       panic("sd%C0: nvmeissue: cmd id %d didn't complete "
                               "in %d s.", ctlr-&gt;sdev-&gt;idno, xfr-&gt;cmdid,
                               Timeout/1000);
       }

       sts = xfr-&gt;status;
       xfr-&gt;cmdid = 0;                              /* xfr available for re-use */
       if (sts)
               prerr(sts);
       return sts;
}

/*
* translate the opcode byte at the start of a scsi command into the
* corresponding nvm i/o opcode.  anything other than a read or write is
* reported and yields -1.
*/
static int
scsiop2nvme(uchar* cmd)
{
       if (isscsiread(*cmd))
               return Cmdread;
       if (isscsiwrite(*cmd))
               return Cmdwrite;

       iprint("scsiop2nvme: scsi cmd %#ux unexpected\n", *cmd);
       return -1;
}

/*
* carry out the scsi read or write in r as one or more nvme commands on
* the i/o queue, each covering at most two nvme pages.
* advances r-&gt;data past the data transferred and leaves r-&gt;dlen set to
* the size of the last piece; the caller (nvmerio) restores both.
* returns 0, or the status of the first command that failed.
* raises an error if the scsi opcode is neither a read nor a write.
*/
static int
issueios(SDreq *r)
{
       int n, max, iostat, nvmcmd;
       ulong count;                    /* sectors */
       uvlong lba;
       Ctlr *ctlr;
       SDunit *unit;

       unit = r-&gt;unit;
       ctlr = unit-&gt;dev-&gt;ctlr;
       nvmcmd = scsiop2nvme(r-&gt;cmd);
       if (nvmcmd == -1)
               error("nvme: scsi cmd unexpected");
       scsilbacount(r-&gt;cmd, r-&gt;clen, &amp;lba, &amp;count);
       /* don't transfer more than the caller's buffer holds */
       if(count * unit-&gt;secsize &gt; r-&gt;dlen)
               count = r-&gt;dlen / unit-&gt;secsize;
       max = 2*ctlr-&gt;pgsz / unit-&gt;secsize;       /* needs 1 or 2 prp addrs */
       /* to do this in generality, need to allocate a prp list page */
       /* disabled: would size transfers from the controller's mdts instead */
       if (0)
               max = (ctlr-&gt;mdts? ctlr-&gt;mdts * ctlr-&gt;minpgsz: 128*KB) /
                       unit-&gt;secsize;
       iostat = 0;

       /* n is the number of sectors in this piece */
       for (; count &gt; 0; count -= n){
               n = MIN(count, max);
               r-&gt;dlen = n * unit-&gt;secsize;
               iostat = nvmeissue(ctlr, &amp;ctlr-&gt;qpair[Qio], r, nvmcmd,
                       ctlr-&gt;ns, r-&gt;data, lba);
               if (iostat)
                       break;
               lba += n;
               r-&gt;data = (uchar *)r-&gt;data + r-&gt;dlen;
       }
       return iostat;
}

/*
* Issue an I/O (SCSI) command to a controller and wait for it to complete.
* The command and its length are contained in r-&gt;cmd and r-&gt;clen.
* If any data is to be returned, r-&gt;dlen should be non-zero, and
* the returned data will be placed in r-&gt;data.
*
* Returns the resulting status, which is also stored in r-&gt;status
* except on the early return for a nil r-&gt;data.  r-&gt;rlen is set to the
* number of bytes transferred.
*/
static int
nvmerio(SDreq* r)
{
       int i, iostat;
       ulong origdlen;
       uchar *origdata;
       static char info[256];

       /* cache synchronisation requests are answered with success; nothing is issued */
       if(*r-&gt;cmd == ScmdSynccache || *r-&gt;cmd == ScmdSynccache16)
               return sdsetsense(r, SDok, 0, 0, 0);

       /* scsi command to get information about the drive or disk? */
       if((i = sdfakescsi(r, info, sizeof info)) != SDnostatus){
               r-&gt;status = i;
               return i;
       }

       if(r-&gt;data == nil)
               return SDok;

       /*
        * Cap the size of individual transfers and repeat if needed.
        * Save r-&gt;data and r-&gt;dlen, and restore them after the loop.
        * could call scsibio, which allocates an SDreq.
        */
       origdata = r-&gt;data;
       origdlen = r-&gt;dlen;

       assert(r-&gt;unit-&gt;secsize &gt;= Minsect &amp;&amp;
               r-&gt;unit-&gt;secsize &lt;= ((Ctlr *)r-&gt;unit-&gt;dev-&gt;ctlr)-&gt;pgsz);
       iostat = issueios(r);

       /* issueios advanced r-&gt;data past whatever was transferred */
       r-&gt;rlen = (uchar *)r-&gt;data - origdata;
       r-&gt;data = origdata;
       r-&gt;dlen = origdlen;
       r-&gt;status = SDok;
       if (iostat != 0) {
               r-&gt;status = SDeio;
               /* 3, 0xc, 2: write error, reallocation failed */
               /* NOTE(review): the same sense data is used for failed reads */
               sdsetsense(r, SDcheck, 3, 0xc, 2);
       }
       return r-&gt;status;
}

/*
* format unit-specific status text into p, which holds l bytes: the cc,
* cap and csts registers, the serial number and, if the unit has any
* sectors, its geometry.  returns the sum of the snprint results, or 0
* if the unit has no controller.
*/
static int
nvmerctl(SDunit* unit, char* p, int l)
{
       int n;
       Ctlr *ctlr;
       Regs *regs;

       if((ctlr = unit-&gt;dev-&gt;ctlr) == nil)
               return 0;
       regs = ctlr-&gt;regs;
       n = snprint(p, l, "config %#lux capabilities %#llux status %#lux\n",
               regs-&gt;cc, regs-&gt;cap, regs-&gt;csts);
       /*
        * devsd has already generated "inquiry" line using the model,
        * so printing ctlr-&gt;model here would be redundant.
        */
       n += snprint(p+n, l-n, "serial %s\n", ctlr-&gt;serial);
       if(unit-&gt;sectors)
               n += snprint(p+n, l-n, "geometry %lld %lud\n",
                       unit-&gt;sectors, unit-&gt;secsize);
       return n;
}

/*
* format the controller's summary line into the buffer from p up to e:
* register address, irq, and the queue-length and cycle-count high-water
* marks for reads and writes.  returns seprint's result.
* must emit exactly one line per controller (sd(3))
*/
static char*
nvmertopctl(SDev *sdev, char *p, char *e)
{
       int *qlens;
       uvlong *cycs;
       Ctlr *ctlr;

       ctlr = sdev-&gt;ctlr;
       qlens = ctlr-&gt;maxqlen;
       cycs = ctlr-&gt;maxcyc;
       return seprint(p, e, "sd%c nvme regs %#p irq %d: max q lens, rd %d "
               "wr %d; max cycs, rd %lld wr %lld\n",
               sdev-&gt;idno, ctlr-&gt;port, ctlr-&gt;irq,
               qlens[Read], qlens[Write],
               cycs[Read], cycs[Write]);
}

/*
* disable the controller: clear Enable in cc if it is set (after first
* waiting for Rdy to come on), then wait for Rdy to go off.
* a failed wait is only reported with print; the function carries on.
*/
static void
reset(Regs *regs)
{
       if (regs-&gt;cc &amp; Enable) {
               /* awaitbitpat evidently returns &lt; 0 if the bits never match */
               if (awaitbitpat(&amp;regs-&gt;csts, Rdy, Rdy) &lt; 0)
                       print("nvme reset timed out awaiting ready\n");
               regs-&gt;cc &amp;= ~Enable;
               coherence();
       }
       /* else may have previously cleared Enable &amp; be waiting for not ready */
       if (awaitbitpat(&amp;regs-&gt;csts, Rdy, 0) &lt; 0)
               print("nvme reset timed out awaiting not ready\n");
}

/*
* fill in unit's sense and inquiry data and copy the sector size and count
* from the controller, then announce the drive on the console.
*/
static void
nvmedrive(SDunit *unit)
{
       uchar *p;
       Ctlr *ctlr;

       unit-&gt;sense[0] = 0x70;
       unit-&gt;sense[7] = sizeof(unit-&gt;sense)-7;

       memset(unit-&gt;inquiry, 0, sizeof unit-&gt;inquiry);
       unit-&gt;inquiry[0] = SDperdisk;
       unit-&gt;inquiry[2] = 2;
       unit-&gt;inquiry[3] = 2;
       unit-&gt;inquiry[4] = sizeof unit-&gt;inquiry - 4;
       /* the model string goes at offset 8 of the inquiry data */
       p = &amp;unit-&gt;inquiry[8];
       ctlr = unit-&gt;dev-&gt;ctlr;
       /* model is smaller than unit-&gt;inquiry-8 */
       strncpy((char *)p, ctlr-&gt;model, sizeof ctlr-&gt;model);

       unit-&gt;secsize = ctlr-&gt;secsize;
       unit-&gt;sectors = ctlr-&gt;sectors;
       print("sd%C%d: nvme %,lld sectors: %s fw %s serial %s\n",
               unit-&gt;dev-&gt;idno, unit-&gt;subno, unit-&gt;sectors,
               ctlr-&gt;model, ctlr-&gt;fw, ctlr-&gt;serial);
}

/*
* choose the nvme page size.  the controller's limits come from two
* 4-bit fields of regs-&gt;cap (at bits 48 and 52), each giving a size of
* 2^(12+field) bytes.  records the minimum in ctlr-&gt;minpgsz and sets
* ctlr-&gt;pgsz to Startpgsz clamped to those limits, then to at most
* Sdalign (when Sdalign is at least 4K) and at least 4K.
*/
static void
pickpgsz(Ctlr *ctlr)
{
       ulong minpgsz, maxpgsz;

       minpgsz = 1 &lt;&lt; (12 + ((ctlr-&gt;regs-&gt;cap &gt;&gt; 48) &amp; MASK(4)));
       maxpgsz = 1 &lt;&lt; (12 + ((ctlr-&gt;regs-&gt;cap &gt;&gt; 52) &amp; MASK(4)));
       ctlr-&gt;minpgsz = minpgsz;             /* for Ctlrid-&gt;mdts */
       ctlr-&gt;pgsz = MIN(Startpgsz, maxpgsz);
       if (ctlr-&gt;pgsz &lt; minpgsz)
               ctlr-&gt;pgsz = minpgsz;
       /*
        * NOTE(review): this clamp is applied after the minimum above, so
        * it could leave pgsz below minpgsz if minpgsz exceeds Sdalign.
        */
       if (Sdalign &gt;= 4*KB &amp;&amp; ctlr-&gt;pgsz &gt; Sdalign)
               ctlr-&gt;pgsz = Sdalign;
       if (ctlr-&gt;pgsz &lt; 4*KB)                    /* sanity */
               ctlr-&gt;pgsz = 4*KB;
}

/*
* set the submission and completion queue lengths to our preferred
* sizes (Qlen, Cqlen), limited by what the controller supports.
*/
static void
pickqlens(Ctlr *ctlr)
{
       ulong maxents;

       /* low 16 bits of cap: max i/o [sc] q len, stored zero-based */
       maxents = 1 + (ctlr-&gt;regs-&gt;cap &amp; MASK(16));
       ctlr-&gt;sqlen = MIN(maxents, Qlen);
       ctlr-&gt;cqlen = MIN(maxents, Cqlen);
}

/*
* map the registers of the nvme controller at pci device p, reset the
* controller, and allocate and fill in its Ctlr and SDev.
* returns the new SDev (with nunit set to 1), or nil if the registers
* can't be mapped or memory is exhausted.
* panics if the controller reports a non-zero doorbell stride.
*/
static SDev*
nvmeprobe(Pcidev *p)
{
       int logstride;
       uintptr port;
       Ctlr *ctlr;
       Regs *regs;
       SDev *sdev;
       static int count;            /* controllers announced so far */

       assert(p-&gt;mem[1].bar == 0);  /* upper 32 bits of 64-bit addr */
       /* the low 4 bits of the bar are masked off to get the address */
       port = p-&gt;mem[0].bar &amp; ~0x0f;
       regs = vmap(port, p-&gt;mem[0].size);
       if(regs == nil){
               print("nvmeprobe: phys address %#p in use did=%#ux\n",
                       port, p-&gt;did);
               return nil;
       }

       /*
        * if ctlr's allocation fails, sdev's is never attempted;
        * only ctlr can need freeing here, and it may be nil.
        */
       if ((ctlr = malloc(sizeof(Ctlr))) == nil ||
           (sdev = malloc(sizeof(SDev))) == nil) {
               free(ctlr);
               vunmap(regs, p-&gt;mem[0].size);
               return nil;
       }
       ctlr-&gt;regs = regs;
       ctlr-&gt;port = port;
       ctlr-&gt;irq = p-&gt;intl;
       /*
        * Attempt to hard-reset the board.
        */
       reset(regs);
       logstride = ((regs-&gt;cap &gt;&gt; 32) &amp; MASK(4)); /* doorbell stride */
       if (logstride != 0)
               panic("nvmeprobe: doorbell stride must be 0 (for now), not %d",
                       logstride);
       ctlr-&gt;stride = 1 &lt;&lt; (2 + logstride);   /* 2^(2+logstride) */
       /* the constant 0 disables this: subsystem reset is never attempted */
       if (0 &amp;&amp; regs-&gt;cap &amp; Nssrs) {            /* nvm subsys reset avail.? */
               regs-&gt;cc |= Nssro;           /* clear Nssro by setting it */
               regs-&gt;nssrc = 'N'&lt;&lt;24 | 'V'&lt;&lt;16 | 'M'&lt;&lt;8 | 'e';
               if (awaitbitpat(&amp;regs-&gt;csts, Nssro, Nssro) &lt; 0)
                       print("nvme subsys reset timed out awaiting Nssro\n");
       }

       /* derive page size and queue lengths from the cap register */
       pickpgsz(ctlr);
       pickqlens(ctlr);

       /* link the SDev and Ctlr to each other */
       sdev-&gt;ifc = &amp;sdnvmeifc;
       sdev-&gt;ctlr = ctlr;
       sdev-&gt;idno = 'n';    /* actually assigned in sdadddevs() */
       sdev-&gt;nunit = NCtlrdrv;      /* max. drives (can be number found) */
       ctlr-&gt;sdev = sdev;

       /*
        * we (pnp) don't have a `spec' argument, so
        * we'll assume that sdn0 goes to the first nvme host
        * adapter found, sdo0 to the next, etc.
        */
       print("#S/sd%c: nvme: irq %d regs %#p page size %d\n",
               sdev-&gt;idno + count++, ctlr-&gt;irq, ctlr-&gt;port, ctlr-&gt;pgsz);

       /* would probe for drives here if there could be more than one. */
       /* upon return, this many sdev-&gt;units will be allocated. */
       sdev-&gt;nunit = 1;
       return sdev;
}

/*
* append sdev to the singly-linked list whose first and last
* elements are *head and *tail; an empty list has *head == nil.
*/
static void
sdevadd(SDev *sdev, SDev **head, SDev **tail)
{
       if(*head == nil)
               *head = sdev;                /* first element */
       else
               (*tail)-&gt;next = sdev;
       *tail = sdev;
}

/*
* find all nvme controllers
*/
static SDev*
nvmepnp(void)
{
       Ctlr *ctlr;
       Pcidev *p;
       SDev *sdev, *head, *tail;

       p = nil;
       head = tail = nil;
       while(p = pcimatch(p, 0, 0)){
               /* ccrp 2 is NVME */
               if(p-&gt;ccrb != Pcibcstore || p-&gt;ccru != Pciscnvm || p-&gt;ccrp != 2)
                       continue;
               if((sdev = nvmeprobe(p)) == nil)
                       continue;
               ctlr = sdev-&gt;ctlr;
               ctlr-&gt;pcidev = p;
               sdevadd(sdev, &amp;head, &amp;tail);
               if (nctlrs &gt;= NCtlr)
                       print("too many nvme controllers\n");
               else
                       ctlrs[nctlrs++] = ctlr;
       }
       return head;
}

/*
* allocate qp's submission and completion queues, sized from ctlr's
* queue lengths and aligned to ctlr-&gt;pgsz.
* panics if either allocation fails.
*/
static void
allocqpair(Ctlr *ctlr, Qpair *qp)
{
       assert(ctlr-&gt;pgsz);          /* page size must already be chosen */

       qp-&gt;sqlen = ctlr-&gt;sqlen;
       qp-&gt;q = mallocalign(qp-&gt;sqlen * sizeof *qp-&gt;q, ctlr-&gt;pgsz, 0, 0);

       qp-&gt;cqlen = ctlr-&gt;cqlen;
       qp-&gt;cmpl = mallocalign(qp-&gt;cqlen * sizeof *qp-&gt;cmpl, ctlr-&gt;pgsz, 0, 0);

       if (qp-&gt;q != nil &amp;&amp; qp-&gt;cmpl != nil)
               return;
       panic("nvmectlrenable: out of memory for queues");
}

/*
* program the admin queue registers (aqa, asq, acq) and the controller
* configuration register (cc) from ctlr's queue lengths and page size
* and from the addresses of qpadm's queues.
* nvmectlrenable notes that this must be done while the controller
* is disabled.
*/
static void
configure(Ctlr *ctlr, Qpair *qpadm)
{
       Regs *regs = ctlr-&gt;regs;

       /* zero-based lengths: completion q in the high half, submission low */
       regs-&gt;aqa = (ctlr-&gt;cqlen - 1)&lt;&lt;16 | (ctlr-&gt;sqlen - 1);
       regs-&gt;asq = PCIWADDR((void *)qpadm-&gt;q);
       regs-&gt;acq = PCIWADDR((void *)qpadm-&gt;cmpl);
       /*
        * log2 of the completion and submission entry sizes go at bits
        * 20 and 16, log2(page size)-12 at bit 7.  this assignment
        * replaces all of cc.
        */
       regs-&gt;cc = log2(sizeof(Completion))&lt;&lt;20 | log2(sizeof(Cmd))&lt;&lt;16 |
               (log2(ctlr-&gt;pgsz)-12) &lt;&lt; 7 | Cssnvm;
       coherence();
}

/*
* set Enable in cc if it isn't already set, then wait for the
* controller to report Rdy in csts.
* timeouts are reported on the console but are otherwise ignored.
*/
static void
enable(Regs *regs)
{
       if (!(regs-&gt;cc &amp; Enable)) {
               /* wait for Rdy to be clear before setting Enable */
               if (awaitbitpat(&amp;regs-&gt;csts, Rdy, 0) &lt; 0)
                       print("nvme enable timed out awaiting not ready\n");
               regs-&gt;cc |= Enable;
               coherence();
       }
       /* else may have previously set Enable &amp; be waiting for ready */
       if (awaitbitpat(&amp;regs-&gt;csts, Rdy, Rdy) &lt; 0)
               print("nvme enable timed out awaiting ready\n");
}

/*
* ns numbers start at 1 and are densely-packed.
* pick the first namespace offering Minsect-byte blocks without
* metadata; failing that, settle for the last one seen offering
* 4096-byte blocks without metadata.
*
* returns the chosen namespace id, or 0 if none is suitable, and
* the index of its preferred lba format via *lbafmtp.
* on success, *nsid holds the identify data of the namespace returned.
* panics if an identify command fails.
*/
static int
bestns(Ctlr *ctlr, int nns, Nsid *nsid, int *lbafmtp)
{
       int i, ns, second, nssecond, lbasize;
       Lbafmt *lbafmt;

       second = 0;                  /* lba format index of the 4k fallback */
       nssecond = 0;                /* namespace of the 4k fallback; 0 is none */
       *lbafmtp = 0;
       for (ns = 1; ns &lt;= nns; ns++) {
               if (nvmeadmissue(ctlr, Admid, ns, nsid) != 0)
                       panic("nvmectlrenable: Admid(%d) failed", ns);
               for (i = 0; i &lt; nelem(nsid-&gt;lbafmt); i++) {
                       lbafmt = &amp;nsid-&gt;lbafmt[i];
                       if (lbafmt-&gt;lglbasize == 0)  /* end lbafmt list? */
                               break;
                       lbasize = 1 &lt;&lt; lbafmt-&gt;lglbasize;
                       if (Debugns)
                               print("nvme ns %d: lba %d mdsize %d perf %d\n",
                                       ns, lbasize, lbafmt-&gt;mdsize,
                                       lbafmt-&gt;relperf &amp; 3);
                       if (lbafmt-&gt;mdsize == 0 &amp;&amp; lbasize == Minsect) {
                               *lbafmtp = i;
                               return ns;
                       }
                       /* settle for 4k if that's all there is */
                       if (lbafmt-&gt;mdsize == 0 &amp;&amp; lbasize == 4096) {
                               second = i;
                               nssecond = ns;
                       }
               }
       }
       if (nssecond == 0)
               return 0;            /* nothing suitable */

       /*
        * fall back to the 4k namespace.  the value returned must be the
        * namespace id, not the lba format index: the index may be 0
        * (which the caller takes to mean failure) or may not name the
        * namespace that we chose.
        */
       *lbafmtp = second;
       /*
        * the loop ran to completion, so *nsid describes namespace nns.
        * refetch if we chose a different one, so that the caller reads
        * the right capacity and lba formats.
        */
       if (nssecond != nns &amp;&amp;
           nvmeadmissue(ctlr, Admid, nssecond, nsid) != 0)
               panic("nvmectlrenable: Admid(%d) failed", nssecond);
       return nssecond;
}

/*
* copy id string from controller, trim trailing blanks, downcase.
* assumes src is unterminated and dest is at least one byte larger.
*/
static void
idcopy(char *dest, char *src, int size)
{
       char *p, *pend;

       memmove(dest, src, size);
       pend = &amp;dest[size];
       *pend-- = '\0';
       for (p = pend; p &gt; dest &amp;&amp; *p == ' '; p--)
               *p = '\0';
       for (p = dest; p &lt;= pend &amp;&amp; *p != '\0'; p++)
               *p = tolower(*p);
}

/*
* register nvmeinterrupt as the interrupt handler for sdev's controller,
* under a name built from the sd letter and interface name, then set
* every bit of the controller's interrupt mask-set register.
*/
static void
nvmeintron(SDev *sdev)
{
       Ctlr *ctlr = sdev-&gt;ctlr;
       char intrname[32];

       snprint(intrname, sizeof(intrname), "sd%c (%s)",
               sdev-&gt;idno, sdev-&gt;ifc-&gt;name);
       enableintr(ctlr, nvmeinterrupt, ctlr, intrname);

       /* mask all interrupt sources */
       ctlr-&gt;regs-&gt;intmset = ~0;
}

/*
* reset the head and tail indices of both queues in qp to zero.
*/
static void
zeroqhdtls(Qpair *qp)
{
       qp-&gt;qidx.tl = 0;
       qp-&gt;cidx.hd = 0;
       /* the remaining two are paranoia */
       qp-&gt;qidx.hd = 0;
       qp-&gt;cidx.tl = 0;
       coherence();
}

/*
* bring up a controller: allocate the admin and i/o queue pairs,
* configure and enable the controller, hook up its interrupt,
* identify the controller, create the i/o queues, and choose a
* namespace, recording its sector size and count in ctlr.
* always returns 1; every failure detected here panics.
*/
static int
nvmectlrenable(Ctlr* ctlr)
{
       int i, nns, gotns;
       char *idpage;
       Ctlrid *ctlrid;
       Lbafmt *lbafmt;
       Nsid *nsid;
       Qpair *qpadm, *qpio;
       Regs *regs = ctlr-&gt;regs;
       SDev *sdev = ctlr-&gt;sdev;

       /* we need at least one admin queue and one i/o queue */
       qpadm = &amp;ctlr-&gt;qpair[Qadmin];
       allocqpair(ctlr, qpadm);
       qpio = &amp;ctlr-&gt;qpair[Qio];
       allocqpair(ctlr, qpio);

       assert(!(regs-&gt;cc &amp; Enable));
       configure(ctlr, qpadm); /* must do this while ctlr is disabled */
       enable(regs);
       zeroqhdtls(qpadm);              /* paranoia */

       regs-&gt;intmset = ~0;          /* mask all interrupt sources */
       nvmeintron(sdev);

       /*
        * one buffer serves for both identify results: the controller's
        * (Ctlrid) here and, later, a namespace's (Nsid).
        */
       idpage = mallocalign(BY2PG, ctlr-&gt;pgsz, 0, 0);
       if (idpage == nil)
               panic("nvmectlrenable: out of memory");
       if (nvmeadmissue(ctlr, Admid, Nsall, idpage) != 0)
               panic("nvmectlrenable: Admid(Nsall) failed");
       ctlrid = (Ctlrid *)idpage;
       nns = ctlrid-&gt;nns;           /* save before idpage is overwritten */

       /* smuggle hw id strings into ctlr for later printing */
       idcopy(ctlr-&gt;serial, ctlrid-&gt;serial, sizeof ctlrid-&gt;serial);
       idcopy(ctlr-&gt;model, ctlrid-&gt;model, sizeof ctlrid-&gt;model);
       idcopy(ctlr-&gt;fw, ctlrid-&gt;fw, sizeof ctlrid-&gt;fw);
       /* mdts is a log2 value; if it's zero, ctlr-&gt;mdts is left alone */
       if (ctlrid-&gt;mdts)
               ctlr-&gt;mdts = 1 &lt;&lt; ctlrid-&gt;mdts;
//      iprint("nvme: max xfr size %d\n", ctlr-&gt;mdts * ctlr-&gt;minpgsz);

       /*
        * create first i/o queue with admin queue cmds.
        * completion queue must be created first.
        */
       if (nvmeadmissue(ctlr, Admmkiocq, Nsunused, qpio-&gt;cmpl) != 0)
               panic("nvmectlrenable: Admmkiocq failed");
       if (nvmeadmissue(ctlr, Admmkiosq, Nsunused, qpio-&gt;q) != 0)
               panic("nvmectlrenable: Admmkiosq failed");
       zeroqhdtls(qpio);               /* paranoia */

       /* find a suitable namespace */
       nsid = (Nsid *)idpage;
       /* i receives the index of the chosen lba format */
       gotns = bestns(ctlr, nns, nsid, &amp;i);        /* fills in nsid page */
       if (gotns == 0)
               panic("nvmectlrenable: no suitable namespace found");
       lbafmt = &amp;nsid-&gt;lbafmt[i];
       ctlr-&gt;secsize = 1 &lt;&lt; lbafmt-&gt;lglbasize;     /* remember for SDunit */
       ctlr-&gt;sectors = nsid-&gt;cap;                /* remember for SDunit */
       ctlr-&gt;ns = gotns;
       /* everything needed has been copied out of the identify page */
       free(idpage);
       if (Debugns)
               print("nvme best ns: %d: sectors %,lld of %d bytes\n",
                       ctlr-&gt;ns, ctlr-&gt;sectors, ctlr-&gt;secsize);
       return 1;
}

/*
* release both queues of qp and forget their addresses, so that the
* pair reads as unallocated afterward.
*/
static void
freeqpair(Qpair *qp)
{
       free(qp-&gt;q);
       qp-&gt;q = nil;

       free(qp-&gt;cmpl);
       qp-&gt;cmpl = nil;
}

static void
ckstuck(void)
{
       int i;
       static int whined;

       for (i = 0; i &lt; nctlrs; i++)
               nvmeinterrupt(nil, ctlrs[i]);
       if (iosttck &amp;&amp; sys-&gt;ticks - iosttck &gt; 5*HZ &amp;&amp; ++whined &lt; 5)
               iprint("nvme: stuck for 5 s.\n");
}

/*
* activate a single nvme controller, sdev.
* upon return, sdev-&gt;nunit SDunits will be allocated.
* returns 0 if the admin queue is already allocated or if
* nvmectlrenable fails, otherwise 1.
*/
static int
nvmeenable(SDev* sdev)
{
       Ctlr *ctlr = sdev-&gt;ctlr;

       if(ctlr-&gt;qpair[Qadmin].q != nil)
               return 0;

       pcisetbme(ctlr-&gt;pcidev);
       if(nvmectlrenable(ctlr) == 0) {
               /* undo the queue allocations */
               freeqpair(&amp;ctlr-&gt;qpair[Qadmin]);
               freeqpair(&amp;ctlr-&gt;qpair[Qio]);
               return 0;
       }

       /*
        * watch for hardware bugs: install ckstuck once, however
        * many controllers are enabled.
        */
       lock(&amp;clocklck);
       if (clockrunning == 0) {
               addclock0link(ckstuck, 1000);
               clockrunning = 1;
       }
       unlock(&amp;clocklck);
       return 1;
}

/*
* set every bit of the controller's interrupt mask-set register,
* then unregister nvmeinterrupt, using the same name that
* nvmeintron registered it under.
*/
static void
nvmeintroff(SDev *sdev)
{
       Ctlr *ctlr = sdev-&gt;ctlr;
       char intrname[32];

       /* mask all interrupt sources */
       ctlr-&gt;regs-&gt;intmset = ~0;

       snprint(intrname, sizeof(intrname), "sd%c (%s)",
               sdev-&gt;idno, sdev-&gt;ifc-&gt;name);
       disableintr(ctlr, nvmeinterrupt, ctlr, intrname);
}

/*
* returns when all in-flight transfers are done, or after 1000
* delay(1) calls, whichever comes first.
* call with shutlock &amp; issuelock held.  both locks are released
* around each delay and are held again upon return.
*/
static void
waitnoxfrs(Ctlr *ctlr)
{
       int i;

       for (i = 0; i &lt; 1000 &amp;&amp; ctlr-&gt;inflight &gt; 0; i++) {
               /* release in the reverse of the order acquired */
               iunlock(&amp;ctlr-&gt;shutlock);
               iunlock(&amp;ctlr-&gt;issuelock);
               delay(1);
               ilock(&amp;ctlr-&gt;issuelock);
               ilock(&amp;ctlr-&gt;shutlock);
       }
       /*
        * complain only if transfers really are still outstanding.
        * testing the loop counter instead would falsely report a
        * timeout when the last transfer finished during the final
        * delay.
        */
       if (ctlr-&gt;inflight &gt; 0)
               iprint("sdnvme: %d transfers still in flight after 1 s.\n",
                       ctlr-&gt;inflight);
}

/*
* quiesce sdev's controller: issue a flush, wait for in-flight
* transfers to finish, then turn off its interrupt and clear pci
* bus mastering.  always returns 1, even if sdev has no controller.
*/
static int
nvmedisable(SDev* sdev)                 /* disable interrupts for this sdev */
{
       Ctlr *ctlr;

       ctlr = sdev-&gt;ctlr;
       if (ctlr == nil)
               return 1;
       /* the flush is issued before the locks below are taken */
       nvmeissue(ctlr, &amp;ctlr-&gt;qpair[Qio], nil, Cmdflush, Nsall, nil, 0);

       /* waitnoxfrs requires both locks to be held */
       ilock(&amp;ctlr-&gt;issuelock);
       ilock(&amp;ctlr-&gt;shutlock);
       waitnoxfrs(ctlr);
       nvmeintroff(sdev);
       pciclrbme(ctlr-&gt;pcidev);
       iunlock(&amp;ctlr-&gt;shutlock);
       iunlock(&amp;ctlr-&gt;issuelock);
       return 1;
}

/*
* clear the interface for this sdev: with both controller locks held,
* wait for in-flight transfers and then reset the controller.
* does nothing if sdev has no controller; skips the reset if the
* controller has no registers.
*/
static void
nvmeclear(SDev* sdev)
{
       Ctlr *ctlr;

       if ((ctlr = sdev-&gt;ctlr) == nil)
               return;

       ilock(&amp;ctlr-&gt;issuelock);
       ilock(&amp;ctlr-&gt;shutlock);
       if (ctlr-&gt;regs != nil) {
               waitnoxfrs(ctlr);
               /* ctlrs and drives are one-to-one */
               reset(ctlr-&gt;regs);
       }
       iunlock(&amp;ctlr-&gt;shutlock);
       iunlock(&amp;ctlr-&gt;issuelock);
}

/*
* see if a particular drive exists: only sub-unit 0 does.
* must not set unit-&gt;sectors here, but rather in nvmeonline.
* returns boolean for existence.
*/
static int
nvmeverify(SDunit *unit)
{
       return unit-&gt;subno == 0;
}

/*
* initialise a drive known to exist.
* the first successful call for sub-unit 0 runs scsionline and then
* fills in the unit from the controller; once unit-&gt;sectors is
* non-zero, later calls just report success.
* returns boolean for success.
*/
static int
nvmeonline(SDunit *unit)
{
       if (unit-&gt;subno != 0)                /* not me? */
               return 0;
       if (unit-&gt;sectors == 0) {            /* not yet inited? */
               if (scsionline(unit) == 0)
                       return 0;
               nvmedrive(unit);
               /*
                * could hang around until disks are spun up and thus
                * available as nvram, dos file systems, etc.  you wouldn't
                * expect it, but the intel 330 sata ssd takes a while to
                * `spin up'.
                */
       }
       return 1;                       /* drive ready */
}

/*
* the nvme driver's entry points, as seen by the generic sd code.
* the initialisers are positional, so their order must match the
* declaration of SDifc.  nil entries are operations that this
* driver does not supply.
*/
SDifc sdnvmeifc = {
       "nvme",                         /* name */

       nvmepnp,                        /* pnp */
       nil,                            /* legacy */
       nvmeenable,                     /* enable */
       nvmedisable,                    /* disable */

       nvmeverify,                     /* verify */
       nvmeonline,                     /* online */
       nvmerio,                        /* rio */
       nvmerctl,                       /* rctl */
       nil,                            /* wctl */

       scsibio,                        /* bio */
       nil,                            /* probe */
       nvmeclear,                      /* clear */
       nvmertopctl,                    /* rtopctl */
       nil,                            /* wtopctl */
};
<!-- BEGIN TAIL -->
</pre>
</td></tr></table>
</td></tr></table>
<p style="margin-top: 0; margin-bottom: 0.17in"></p>
<p style="line-height: 1.2em; margin-left: 1.00in; text-indent: 0.00in; margin-right: 1.00in; margin-top: 0; margin-bottom: 0; text-align: center;">
<span style="font-size: 10pt"></span></p>
<p style="margin-top: 0; margin-bottom: 0.50in"></p>
<p style="margin-top: 0; margin-bottom: 0.33in"></p>
<center><table border="0"><tr>
<td valign="middle"><a href="http://www.alcatel-lucent.com/"><img border="0" src="/plan9/img/logo_ft.gif" alt="Bell Labs" />
</a></td>
<td valign="middle"><a href="http://www.opensource.org"><img border="0" alt="OSI certified" src="/plan9/img/osi-certified-60x50.gif" />
</a></td>
<td><img style="padding-right: 45px;" alt="Powered by Plan 9" src="/plan9/img/power36.gif" />
</td>
</tr></table></center>
<p style="margin-top: 0; margin-bottom: 0.17in"></p>
<center>
<span style="font-size: 10pt">(<a href="/plan9/">Return to Plan 9 Home Page</a>)</span>
</center>
<p style="margin-top: 0; margin-bottom: 0.17in"></p>
<center><font size=-1>
<span style="font-size: 10pt"><a href="http://www.lucent.com/copyright.html">Copyright</a></span>
<span style="font-size: 10pt">© 2009 Alcatel-Lucent.</span>
<span style="font-size: 10pt">All Rights Reserved.</span>
<br />
<span style="font-size: 10pt">Comments to</span>
<span style="font-size: 10pt"><a href="mailto:[email protected]">[email protected]</a>.</span>
</font></center>
</body>
</html>