<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
"http://www.w3.org/TR/html4/loose.dtd">
<html>
<head>
<meta http-equiv=Content-Type content="text/html; charset=utf-8">
<title>/usr/web/sources/contrib/geoff/sdnvme.c - Plan 9 from Bell Labs</title>
<!-- THIS FILE IS AUTOMATICALLY GENERATED. -->
<!-- EDIT sources.tr INSTEAD. -->
</meta>
</head>
<body>
<p style="margin-top: 0; margin-bottom: 0.17in"></p>
<p style="line-height: 1.2em; margin-left: 1.00in; text-indent: 0.00in; margin-right: 1.00in; margin-top: 0; margin-bottom: 0; text-align: center;">
<span style="font-size: 10pt"><a href="/plan9/">Plan 9 from Bell Labs</a>’s /usr/web/sources/contrib/geoff/sdnvme.c</span></p>
<p style="margin-top: 0; margin-bottom: 0.17in"></p>
<p style="margin-top: 0; margin-bottom: 0.17in"></p>
<center><font size=-1>
Copyright © 2009 Alcatel-Lucent.<br />
Distributed under the
<a href="/plan9/license.html">Lucent Public License version 1.02</a>.
<br />
<a href="/plan9/download.html">Download the Plan 9 distribution.</a>
</font>
</center>
<p style="margin-top: 0; margin-bottom: 0.17in"></p>
<table width="100%" cellspacing=0 border=0><tr><td align="center">
<table cellspacing=0 cellpadding=5 bgcolor="#eeeeff"><tr><td align="left">
<pre>
<!-- END HEADER -->
/*
* driver for NVM Express 1.1 interface to PCI-Express solid state disk
* (i.e., flash memory).
*
* currently the controller is in the drive, so there's no multiplexing
* of drives through the controller. multiple namespaces (actually number
* spaces) are assumed to refer to different views of the same disk
* (different block sizes).
*
* many features of NVME are ignored in the interest of simplicity and speed.
* many of them are intended to jump on a bandwagon (e.g., VMs) or check a box.
* using interrupts rather than polling costs us about 4% in large-block
* sequential read performance.
*/
#include "u.h"
#include "../port/lib.h"
#include "mem.h"
#include "dat.h"
#include "fns.h"
#include "io.h"
#include "../port/error.h"
#include "../port/sd.h"
/* address of p with the bits below ctlr->pgsz cleared (pgsz is a power of 2) */
#define PAGEOF(ctlr, p) ((uintptr)(p) & ~((ctlr)->pgsz-1))
/* submission queue is full when advancing the tail by one would reach the head */
#define QFULL(qp) ((qp)->qidx.hd == qidxplus1((qp), (qp)->qidx.tl))
/* submission queue is empty when its head and tail indices are equal */
#define QEMPTY(qp) ((qp)->qidx.hd == (qp)->qidx.tl)
/* issue op on the admin queue pair with no SDreq and an lba of 0; see nvmeissue */
#define nvmeadmissue(ctlr, op, nsid, buf) \
nvmeissue(ctlr, &ctlr->qpair[Qadmin], nil, op, nsid, buf, 0)
/*
* driver-wide constants: queue-pair and doorbell ordinals, namespace and
* identify selectors, and tunable sizes and limits.
*/
enum {
/* fundamental constants */
Qadmin, /* queue-pair ordinals; Qadmin fixed at 0 */
Qio,
Nqueues,
Vall = 1<<Qadmin | 1<<Qio, /* all interesting vector */
Subq = 0, /* doorbell ordinals within a queue pair (see doorbellsqtl) */
Complq,
Qtypes,
Nsunused = 0, /* nsid passed with the queue-creation admin commands */
Nsall = ~0ul, /* with Admid, makes mkcmd ask for Idctlr */
Idns = 0, /* values placed in cdw10[0] for Admid */
Idctlr,
Idnsids,
Minsect = 512, /* smallest sector size accepted (see nvmerio, bestns) */
/* tunable parameters */
Debugintr = 0,
Debugns = 0,
Timeout = 20*1000, /* adjust to taste. started at 2000 ms. */
/*
* NVME page size must be >= sector size. anything over 8K only
* benefits bulk copies and benchmarks.
*/
Startpgsz = Sdalign, /* on samsung sm951, 4k ≤ page_size ≤ 128MB */
Qlen = 32, /* defaults; queue lengths must be powers of 2 < 4K */
Cqlen = 16,
NCtlr = 8, /* each takes a pci-e or m.2 slot */
NCtlrdrv= 1,
NDrive = NCtlr * NCtlrdrv,
Reserved = (ushort)~0, /* placeholder cmdid */
};
/* admin commands: opcodes issued on the admin queue (see mkcmd's Qadmin case) */
enum Adminops {
Admmkiosq = 1, /* create i/o submission q */
Admmkiocq = 5, /* create i/o completion q */
Admid = 6, /* identify */
};
/*
* I/O commands: opcodes issued on the i/o queue.
* only Cmdflush, Cmdwrite and Cmdread are issued by the code in this file.
*/
enum Opcode {
Cmdflush = 0,
Cmdwrite = 1,
Cmdread = 2,
Cmdwriteuncorr = 4,
Cmdcompare = 5,
Cmddsm = 9,
};
/* forward typedefs for the structures defined below */
typedef struct Cmd Cmd;
typedef struct Completion Completion;
typedef struct Ctlr Ctlr;
typedef struct Ctlrid Ctlrid;
typedef struct Doorbell Doorbell;
typedef struct Lbafmt Lbafmt;
typedef struct Nsid Nsid;
typedef struct Nvindx Nvindx;
typedef struct Qpair Qpair;
typedef struct Regs Regs;
typedef struct Transfer Transfer;
/* interface table, defined at the end of this file; nvmeprobe points sdevs at it */
extern SDifc sdnvmeifc;
/* head and tail indices into a circular queue */
struct Nvindx {
unsigned hd; /* remove at this index */
unsigned tl; /* add at this index */
};
/* a submission queue and its paired completion queue; filled in by allocqpair */
struct Qpair {
Cmd *q; /* base of Cmd array */
Nvindx qidx; /* submission q indices */
int sqlen; /* entries in q */
int writelast; /* flag: read or write in last cmd? */
Completion *cmpl; /* base of Completion array */
Nvindx cidx; /* completion q indices; only hd is advanced (cidxincr) */
int cqlen; /* entries in cmpl */
int phase; /* initial phase bit setting in cmpl */
};
/*
* one per command in flight; chained from Ctlr.xfrs.
* these are reused and never freed.
*/
struct Transfer {
Transfer *next;
Rendez; /* issuing process sleeps here until done (nvmeissue) */
int done; /* flag for rendezvous */
int status; /* from completion */
ulong qtm; /* time at enqueue */
uvlong stcyc; /* cycles at enqueue */
ushort cmdid; /* 0 means available */
int rdwr; /* index into Ctlr.maxcyc: Read, Write, or 2 for others */
};
/* per-controller state; since there is one drive per controller, also per-drive */
struct Ctlr {
Regs* regs; /* memory-mapped I/O registers */
SDev* sdev;
Intrcommon;
uintptr port; /* physical addr of I/O registers */
int pgsz; /* size of an `nvme page' */
int minpgsz;
int mdts; /* actual value, not log2; unit minpgsz */
int sqlen; /* sub q len */
int cqlen; /* compl q len */
int stride; /* bytes from base of one doorbell reg. to the next */
/* per-drive scalars, since there is only one drive */
vlong sectors; /* total, copy to SDunits */
int secsize; /* sector size, copy to SDunits */
int ns; /* namespace of the single drive */
/* stats */
int maxqlen[2]; /* high water marks of read, write queues */
/* example results: rd 89 µs, wr 325 µs */
uvlong maxcyc[3]; /* high water marks of read, write, admin cycles */
/* per controller */
QLock; /* serialise q notifications */
Rendez; /* q empty/full notifications */
Lock; /* intr svc */
Lock issuelock; /* inflight & q heads & tail mostly */
Lock xfrlock; /* held by getfreexfr while picking or adding a Transfer */
Lock shutlock; /* held with issuelock by nvmedisable and nvmeclear */
int inflight; /* count of xfrs in progress */
int intrsena; /* interrupts we care about */
Transfer *xfrs; /* transfers in flight or done */
Qpair qpair[Nqueues]; /* use a single admin queue pair */
/* per-drive arrays */
char serial[20+1];
char model[40+1];
char fw[8+1];
};
/*
 * memory-mapped controller registers, starting at offset 0 of the
 * register BAR.  the doorbells start at offset 0x1000.
 * in the NVMe register map, offset 0x18 is reserved, csts is at 0x1c
 * and the subsystem reset register is at 0x20.
 */
struct Regs {
	uvlong	cap;		/* 0x00: controller capabilities */
	ulong	vs;		/* 0x08: version */
	/* intm* bits are actually vector number offsets */
	ulong	intmset;	/* 0x0c: intr mask set: bit # is i/o completion q # */
	ulong	intmclr;	/* 0x10: intr mask clear: " */
	ulong	cc;		/* 0x14: controller configuration */
	ulong	_rsvd2;		/* 0x18: reserved */
	ulong	csts;		/* 0x1c: controller status */
	ulong	nssrc;		/* 0x20: reset, iff cap.nssrs set */
	ulong	aqa;		/* 0x24: admin queue attributes */
	uvlong	asq;		/* 0x28: admin submit queue base address */
	uvlong	acq;		/* 0x30: admin completion queue base address */
	uchar	_pad0[0x1000 - 0x38];

	/* this is the nominal doorbell layout, with stride of 4 */
	struct Doorbell {
		ulong	sqtl;	/* submission queue tail */
		ulong	cqhd;	/* completion queue head */
	} doorbell[Nqueues];
};
/*
* making the doorbell stride variable at run time requires changing the
* declaration and addressing of the Regs->doorbell array, making it clunkier.
* supposedly non-zero strides are only desirable in VMs, for efficiency.
*/
/*
* clunky doorbell register addressing for any stride.
* both macros take a queue-pair ordinal (Qadmin or Qio) as qp, not a Qpair*,
* and yield a pointer to the register stride*(Qtypes*qp + Subq or Complq)
* bytes past the first doorbell.
*/
/* instead of &ctlr->regs->doorbell[qid].sqtl */
#define doorbellsqtl(ctlr, qp) (ulong *)\
((char *)(ctlr)->regs->doorbell + (ctlr)->stride*(Qtypes*(qp) + Subq))
/* instead of &ctlr->regs->doorbell[qid].cqhd */
#define doorbellcqhd(ctlr, qp) (ulong *)\
((char *)(ctlr)->regs->doorbell + (ctlr)->stride*(Qtypes*(qp) + Complq))
/* bit definitions for the cap, cc and csts registers of Regs */
enum {
/* cap */
Nssrs = 1ull << 36,
/* cc */
Enable = 1 << 0,
Cssnvm = 0 << 4, /* nvm command set */
Cssmask = 7 << 4,
Shnnone = 0 << 14, /* shutdown style */
Shnnormal = 1 << 14,
Shnabrupt = 2 << 14,
Shnmask = 3 << 14,
/* csts */
Rdy = 1 << 0, /* okay to add to sub. q */
Cfs = 1 << 1, /* controller fatal status */
Shstnormal = 0 << 2, /* shutdown status */
Shstoccur = 1 << 2,
Shstcmplt = 2 << 2,
Shstmask = 3 << 2,
Nssro = 1 << 4,
};
/*
* submission queue entry; CTASSERT below checks that it is 64 bytes.
* mkcmd zeroes and fills one in place at the tail of a submission queue.
*/
struct Cmd {
/* common 40-byte header */
uchar opcode; /* command dword 0 */
uchar flags;
ushort cmdid;
ulong nsid;
ulong cdw2[2]; /* not used */
uvlong metadata;
uvlong prp1; /* buffer memory address */
uvlong prp2; /* zero, buffer addr, or prp list addr */
union {
ulong cdw10[6]; /* admin: command dwords 10-15 */
struct { /* nvm i/o */
uvlong slba; /* starting lba */
ushort length; /* sector count minus one (see mkcmd) */
ushort control;
ulong dsmgmt;
/* rest are for end-to-end protection only */
ulong reftag;
ushort apptag;
ushort appmask;
};
};
};
/* flag bits for queue creation; mkcmd uses Pc for Admmkiosq too */
enum {
/* cdw10[1] for Admmkiocq */
Ien = 1<<1, /* intr enabled */
Pc = 1<<0, /* physically contiguous */
};
/* completion queue entry; CTASSERT below checks that it is 16 bytes */
struct Completion {
ulong specific;
ulong _pad;
ushort sqhd; /* copied into the sub. q head index by advancesqhd */
ushort sqid;
ushort cmdid; /* matched against Transfer.cmdid by completexfr */
ushort stsphs; /* status + 1 phase bit */
};
/* bits of Completion.stsphs */
enum {
Phase = 1, /* phase bit in stsphs */
};
/*
* leading part of the page returned by Admid with Idctlr.
* the padding array names give the byte offset at which each begins.
*/
struct Ctlrid {
ushort pcivid;
ushort pcissvid;
char serial[20]; /* space-padded, unterminated strings */
char model[40];
char fw[8];
char _72_[77-72];
uchar mdts; /* log2(max data xfr size), unit: min pg sz */
/* 0 is unlimited */
char _516_[516-78]; /* ... lots of uninteresting stuff ... */
ulong nns; /* number of namespaces present */
/* ... lots of uninteresting stuff ... */
};
/*
* leading part of the page returned by Admid with Idns for one namespace.
* of these fields, this file reads only cap and the lbafmt array.
*/
struct Nsid {
uvlong size;
uvlong cap; /* nvmectlrenable takes the sector count from here */
uvlong used;
uchar feat;
uchar lnbafmts;
uchar fmtlbasz;
uchar mdcap;
uchar dpc;
uchar dps;
uchar optnmic;
uchar optrescap;
uchar _pad0[128-32];
struct Lbafmt {
ushort mdsize; /* bestns accepts only formats where this is 0 */
uchar lglbasize; /* log2(lba size) */
uchar relperf;
} lbafmt[16];
/* ... uninteresting stuff ... */
};
/* compile-time checks that the queue entry structs have the expected sizes */
CTASSERT(sizeof(Cmd) == 64, cmd_wrong_size);
CTASSERT(sizeof(Completion) == 16, compl_wrong_size);
static Lock clocklck; /* guards clockrunning (nvmeenable) */
static int clockrunning; /* set once ckstuck has been added as a clock link */
static ulong iosttck; /* tick of most recently-started i/o */
static int nctlrs; /* number of entries in use in ctlrs */
static Ctlr *ctlrs[NCtlr]; /* controllers found by nvmepnp; polled by ckstuck */
/*
 * step qp's completion queue head index by one entry.
 * when it wraps back to zero, flip qp->phase.
 */
static void
cidxincr(Ctlr *ctlr, Qpair *qp)
{
	qp->cidx.hd++;
	if (qp->cidx.hd < ctlr->cqlen)
		return;
	/* wrapped */
	qp->cidx.hd = 0;
	qp->phase ^= Phase;
}
#ifdef unused
/* panic, naming where, if the controller status register has Cfs set */
static void
isfatal(Regs *regs, char *where)
{
if (regs->csts & Cfs)
panic("nvme: fatal controller error %s", where);
}
#endif
/*
 * return the first Transfer on ctlr's chain whose cmdid equals cmdid,
 * or nil if there is none.
 */
static Transfer *
findxfr(Ctlr *ctlr, int cmdid)
{
	Transfer *t;

	t = ctlr->xfrs;
	while (t != nil && t->cmdid != cmdid)
		t = t->next;
	return t;
}
/*
 * cqhd is the entry at the head of completion queue qid.
 * find the transfer with its cmdid (panic if there is none), record the
 * status and elapsed-cycle high water mark, mark the transfer done and
 * wake whoever sleeps on it.
 */
static void
completexfr(Ctlr *ctlr, Completion *cqhd, int qid)
{
	uvlong elapsed;
	Transfer *t;

	if (Debugintr)
		iprint("intr q %d cmdid %d...", qid, cqhd->cmdid);

	t = findxfr(ctlr, cqhd->cmdid);
	if (t == nil)
		panic("sd%C0: nvmeinterrupt: unexpected completion cmd id %d",
			ctlr->sdev->idno, cqhd->cmdid);

	/* a zero qtm means there is no enqueue time to compare against */
	if (t->qtm != 0 && TK2MS(sys->ticks) - t->qtm >= Timeout)
		iprint("sd%C0: nvmeinterrupt: completed cmd id %d but "
			"took more than %d s.\n",
			ctlr->sdev->idno, cqhd->cmdid, Timeout/1000);

	/* cycle-based measurements */
	cycles(&elapsed);
	elapsed -= t->stcyc;
	if (ctlr->maxcyc[t->rdwr] < elapsed)
		ctlr->maxcyc[t->rdwr] = elapsed;

	t->status = cqhd->stsphs & ~Phase;
	t->done = 1;
	t->qtm = 0;
	wakeup(t);			/* notify of completion */
}
/*
 * set the software copy of the submission queue head to the value
 * reported in completion cqhd, then wake processes sleeping on ctlr.
 */
static void
advancesqhd(Ctlr *ctlr, Qpair *qp, Completion *cqhd, int qid)
{
	if (Debugintr)
		iprint("sw q %d sqhd set to %d...", qid, cqhd->sqhd);
	qp->qidx.hd = cqhd->sqhd;

	/* notify of sqhd advance */
	wakeup(ctlr);
}
/*
* advance compl. q head, notify ctlr., which will extinguish intr source
* (by acknowledging this completion) and remove cqhd from the compl. q.
* qid is the queue-pair ordinal used to select the doorbell register.
*/
static void
advancecqhd(Ctlr *ctlr, Qpair *qp, int qid)
{
/* may also toggle qp->phase, if the head index wraps */
cidxincr(ctlr, qp);
if (Debugintr)
iprint("doorbell q %d cqhd set to %d\n", qid, qp->cidx.hd);
/* tell the controller the new head via the queue's cqhd doorbell */
*doorbellcqhd(ctlr, qid) = qp->cidx.hd;
coherence();
}
/*
* Act on and clear the interrupt(s).
* In order to share PCI IRQs, just ignore spurious interrupts.
* Advances queue head indices past completed operations.
*
* arg is the Ctlr.  Also called directly, with a nil Ureg, from ckstuck
* and from nvmeissue after a timeout, as a form of polling.
* Returns Intrforme if at least one completion was processed,
* otherwise Intrnotforme.
*/
static Intrsvcret
nvmeinterrupt(Ureg *, void* arg)
{
int qid, ndone, donepass; /* qid's not a great name (see path.qid) */
ulong causes;
Completion *cqhd;
Ctlr *ctlr;
Qpair *qp;
Regs *regs;
ctlr = arg;
regs = ctlr->regs;
/* the value read is not otherwise used */
causes = regs->intmset;
USED(causes);
ilock(&ctlr->issuelock); /* keep other cpus out of intr svc, indices */
if (ctlr->inflight == 0) { /* not expecting an interrupt? */
/* probably lost a race with polling: nothing to do */
iunlock(&ctlr->issuelock);
return Intrnotforme;
}
ndone = 0;
/* rescan all queues until a whole pass finds no new completions */
do {
donepass = 0;
for (qid = Nqueues - 1; qid >= 0; qid--) /* scan i/o q 1st */
for (qp = &ctlr->qpair[qid]; ; ) {
cqhd = &qp->cmpl[qp->cidx.hd];
/* phase bit still equal to qp->phase: no new entry here */
if ((cqhd->stsphs & Phase) == qp->phase)
break;
completexfr(ctlr, cqhd, qid);
advancesqhd(ctlr, qp, cqhd, qid);
/*
* toggles qp->phase if qp->cidx.hd wraps when
* incr'd.
*/
advancecqhd(ctlr, qp, qid);
if (--ctlr->inflight < 0)
iprint("nvmeinterrupt: inflight botch\n");
ndone++, donepass++;
}
} while (donepass > 0);
/* unmask intr. sources of interest iff transfers are in flight */
if (ctlr->inflight == 0) {
iosttck = 0; /* nothing outstanding, so nothing can be stuck */
ctlr->intrsena = 0;
} else
regs->intmclr = Vall;
iunlock(&ctlr->issuelock);
if (ndone > 0)
return Intrforme;
else
return Intrnotforme;
}
/* return cmd id other than zero and Reserved */
static int
cidalloc(void)
{
int thisid;
static int cid;
static Lock cidlck;
ilock(&cidlck);
++cid;
if ((ushort)cid == 0 || (ushort)cid == Reserved)
cid = 1;
thisid = cid;
iunlock(&cidlck);
return thisid;
}
/*
* fill in submission queue entry *cmd.
* r may be nil (admin commands and flush); op is an Adminops or Opcode value;
* buf is a kernel virtual address or nil; qid is the queue-pair ordinal,
* which selects admin or i/o interpretation of op; lba is used only for
* Cmdread and Cmdwrite.
*/
static void
mkcmd(Ctlr *ctlr, Cmd *cmd, SDreq *r, int op, ulong nsid, void *buf, int qid,
vlong lba)
{
long count;
uintptr addr;
memset(cmd, 0, sizeof *cmd);
cmd->opcode = op;
cmd->cmdid = cidalloc();
cmd->nsid = nsid;
addr = (uintptr)buf;
if (addr != 0) {
/* complain but carry on */
if (addr < KZERO)
print("nvme mkcmd: %#p not kernel virtual address\n",
addr);
/* each prp entry points to at most a page */
cmd->prp1 = PCIWADDR((void *)addr);
/*
* transfers of more than one and at most two pages get a second
* prp entry: the page following the one containing prp1.
*/
if (r && r->dlen > ctlr->pgsz && r->dlen <= 2*ctlr->pgsz)
cmd->prp2 = PAGEOF(ctlr, cmd->prp1) + ctlr->pgsz;
else
cmd->prp2 = 0;
}
switch (qid) {
case Qadmin:
/* we are using single-message msi */
switch (op) {
case Admmkiocq:
/* queue length minus one in the top half, queue id below */
cmd->cdw10[0] = (ctlr->cqlen - 1)<<16 | Qio;
cmd->cdw10[1] = Ien | Pc; /* vector 0 since no msi-x */
break;
case Admmkiosq:
cmd->cdw10[0] = (ctlr->sqlen - 1)<<16 | Qio;
cmd->cdw10[1] = Qio<<16 | Pc; /* completion q id */
break;
case Admid:
/* Nsall asks about the controller itself, not a namespace */
if (nsid == Nsall) {
cmd->cdw10[0] = Idctlr;
cmd->nsid = 0;
} else
cmd->cdw10[0] = Idns;
break;
}
break;
default:
switch (op) {
case Cmdread:
case Cmdwrite:
count = r->dlen / r->unit->secsize;
if (count == 0) {
/* slba and length stay zero from the memset above */
print("nvmeissue: zero sector count for i/o "
"of length %d\n", r->dlen);
break;
}
cmd->slba = lba;
cmd->length = (ushort)(count - 1); /* sectors */
assert(r->data == buf);
assert(r->unit->secsize * count <= r->dlen);
assert(nsid);
break;
}
break;
}
}
/*
 * compute the current occupancy of qp's submission queue and raise the
 * matching high water mark in ctlr->maxqlen, indexed by qp->writelast.
 */
static void
updmaxqlen(Ctlr *ctlr, Qpair *qp)
{
	int depth;
	int *hiwater;

	hiwater = &ctlr->maxqlen[qp->writelast];
	depth = (qp->qidx.tl + qp->sqlen - qp->qidx.hd) % qp->sqlen;
	if (*hiwater < depth)
		*hiwater = depth;
}
/*
* send a command via the submission queue.
* call with ctlr->issuelock held.
* advances submission queue's tail index.
*
* qtl is the already-composed entry; xfr is the Transfer that will track it.
* the doorbell is written with qp->qidx.tl, which the caller (nvmeissue)
* has already stepped past qtl.
*/
static void
sendcmd(Ctlr *ctlr, Qpair *qp, Cmd *qtl, Transfer *xfr)
{
int qid;
/* tie the transfer to this command so that completexfr can find it */
xfr->done = 0;
xfr->cmdid = qtl->cmdid;
xfr->qtm = TK2MS(sys->ticks);
qid = qp - ctlr->qpair; /* queue-pair ordinal from array position */
if (Debugintr)
iprint("issue q %d cmdid %d...", qid, xfr->cmdid);
/*
* Notify controller of new submission queue entry,
* which triggers execution of it.
*/
updmaxqlen(ctlr, qp);
cycles(&xfr->stcyc);
ctlr->inflight++;
iosttck = sys->ticks; /* for ckstuck's stuck-i/o check */
*doorbellsqtl(ctlr, qid) = qp->qidx.tl; /* start i/o */
coherence();
ctlr->regs->intmclr = ctlr->intrsena = Vall; /* unmask intrs */
}
static int
doneio(void* arg)
{
return ((Transfer *)arg)->done;
}
/* return the submission queue index after idx, wrapping to 0 at qp->sqlen */
static uint
qidxplus1(Qpair *qp, uint idx)
{
	idx++;
	return idx >= qp->sqlen? 0: idx;
}
static int
qnotfull(void *arg)
{
return !QFULL((Qpair *)arg);
}
static int
qempty(void *arg)
{
return QEMPTY((Qpair *)arg);
}
/*
 * return a Transfer for a new command: reuse one whose cmdid is 0,
 * else allocate another and push it on ctlr's chain.
 * the result has cmdid Reserved, so it will not be handed out again
 * until nvmeissue zeroes its cmdid.
 */
static Transfer *
getfreexfr(Ctlr *ctlr)
{
	Transfer *t;

	ilock(&ctlr->xfrlock);		/* allocate xfr */
	if ((t = findxfr(ctlr, 0)) == nil) {
		t = malloc(sizeof(Transfer));
		if (t == nil)
			panic("nvmeissue: out of memory");
		/* add new xfr to chain */
		t->next = ctlr->xfrs;
		ctlr->xfrs = t;
	}
	t->cmdid = Reserved;
	t->qtm = 0;
	iunlock(&ctlr->xfrlock);
	return t;
}
/*
* if needed, wait for the sub q to drain a lot or a little.
* not infallible, so test afterward under lock.
*
* r may be nil, in which case only the queue-full wait applies.
* sleeps on ctlr's Rendez, which advancesqhd wakes; ctlr's QLock
* serialises the sleepers.
*/
static void
qdrain(Ctlr *ctlr, Qpair *qp, SDreq *r)
{
/* a little: wait until there is room for one more entry */
if (QFULL(qp)) {
qlock(ctlr); /* wait for q space */
while (QFULL(qp))
sleep(ctlr, qnotfull, qp);
qunlock(ctlr);
}
/*
* don't mix reads and writes in the queue, to avoid read-before-write
* problems.
*/
if (r && qp->writelast != r->write) {
qlock(ctlr);
/* re-test now that we hold the qlock */
if (qp->writelast != r->write)
sleep(ctlr, qempty, qp); /* changing, so drain */
qp->writelast = r->write;
qunlock(ctlr);
}
}
/*
 * wait (via qdrain) until a command for r may be queued on qp, then
 * return with ctlr->issuelock held.  the conditions are re-tested under
 * the lock, and the wait repeated if another process got in first.
 */
static void
qdrainilock(Ctlr *ctlr, Qpair *qp, SDreq *r)
{
	/* serialise composition of cmd in place at sq tail */
	for (;;) {
		qdrain(ctlr, qp, r);
		ilock(&ctlr->issuelock);
		/* test again under lock */
		if (!QFULL(qp) && (r == nil || qp->writelast == r->write))
			return;		/* issuelock still held */
		/* lost a race; uncommon case */
		iunlock(&ctlr->issuelock);
	}
}
/*
 * if sts is non-zero, print it along with its fields:
 * bits 1-8 code, bits 9-11 type, bit 14 more, bit 15 do-not-retry.
 */
static void
prerr(int sts)
{
	if (sts == 0)
		return;
	iprint("nvmeissue: cmd error status %#ux: "
		"code %#ux type %d more %d do-not-retry %d\n", sts,
		(sts >> 1) & MASK(8), (sts >> 9) & MASK(3),
		(sts >> 14) & MASK(1), (sts >> 15) & MASK(1));
}
/*
* add new nvme command to tail of submission queue of Qpair,
* and wait for it to complete. return status with phase bit zeroed.
*
* r is the SCSI request being served, or nil for admin and flush commands.
* panics if the command has not completed after Timeout ms and a
* subsequent poll of the completion queues.
*/
static int
nvmeissue(Ctlr *ctlr, Qpair *qp, SDreq *r, int op, ulong nsid, void *buf,
vlong lba)
{
ushort sts;
Cmd *qtl;
Transfer *xfr;
xfr = getfreexfr(ctlr);
/* rdwr selects the slot of ctlr->maxcyc updated by completexfr */
if (op == Cmdwrite)
xfr->rdwr = Write;
else if (op == Cmdread)
xfr->rdwr = Read;
else
xfr->rdwr = 2;
/* serialise composition of cmd in place at sq tail */
qdrainilock(ctlr, qp, r);
/* ctlr->issuelock is now held */
/* Reserve a space and update sub. q tail index past it. */
qtl = &qp->q[qp->qidx.tl];
qp->qidx.tl = qidxplus1(qp, qp->qidx.tl);
/*
* Compose the command struct at the tail of the submission queue.
* mkcmd converts buf to physical address space.
*/
mkcmd(ctlr, qtl, r, op, nsid, buf, qp - ctlr->qpair, lba);
sendcmd(ctlr, qp, qtl, xfr); /* start cmd */
iunlock(&ctlr->issuelock);
/* this is the only process waiting for this xfr. */
/*
* NOTE(review): presumably the empty loop body makes an error raised
* during the sleep simply restart the wait -- confirm.
*/
while(waserror())
;
tsleep(xfr, doneio, xfr, Timeout);
poperror();
if (!xfr->done) {
/* we see this with the Samsung 983 DCT. */
/* poll the completion queues in case the interrupt was missed */
nvmeinterrupt(nil, ctlr);
if (!xfr->done)
panic("sd%C0: nvmeissue: cmd id %d didn't complete "
"in %d s.", ctlr->sdev->idno, xfr->cmdid,
Timeout/1000);
}
sts = xfr->status;
xfr->cmdid = 0; /* xfr available for re-use */
if (sts)
prerr(sts);
return sts;
}
/*
 * map the scsi opcode at *cmd to Cmdread or Cmdwrite.
 * anything else is reported and yields -1.
 */
static int
scsiop2nvme(uchar* cmd)
{
	if (isscsiread(*cmd))
		return Cmdread;
	if (isscsiwrite(*cmd))
		return Cmdwrite;
	iprint("scsiop2nvme: scsi cmd %#ux unexpected\n", *cmd);
	return -1;
}
/*
* perform the read or write described by r as a series of nvme commands,
* each covering at most two nvme pages.
* advances r->data past the data transferred and overwrites r->dlen;
* the caller (nvmerio) saves and restores both.
* returns 0, or the status of the first command that failed.
*/
static int
issueios(SDreq *r)
{
int n, max, iostat, nvmcmd;
ulong count; /* sectors */
uvlong lba;
Ctlr *ctlr;
SDunit *unit;
unit = r->unit;
ctlr = unit->dev->ctlr;
nvmcmd = scsiop2nvme(r->cmd);
if (nvmcmd == -1)
error("nvme: scsi cmd unexpected");
scsilbacount(r->cmd, r->clen, &lba, &count);
/* don't transfer more than the buffer holds */
if(count * unit->secsize > r->dlen)
count = r->dlen / unit->secsize;
max = 2*ctlr->pgsz / unit->secsize; /* needs 1 or 2 prp addrs */
/* to do this in generality, need to allocate a prp list page */
if (0)
max = (ctlr->mdts? ctlr->mdts * ctlr->minpgsz: 128*KB) /
unit->secsize;
iostat = 0;
for (; count > 0; count -= n){
n = MIN(count, max);
r->dlen = n * unit->secsize; /* mkcmd derives the sector count from this */
iostat = nvmeissue(ctlr, &ctlr->qpair[Qio], r, nvmcmd,
ctlr->ns, r->data, lba);
if (iostat)
break; /* r->data is left at the start of the failed piece */
lba += n;
r->data = (uchar *)r->data + r->dlen;
}
return iostat;
}
/*
* Issue an I/O (SCSI) command to a controller and wait for it to complete.
* The command and its length are contained in r->cmd and r->clen.
* If any data is to be returned, r->dlen should be non-zero, and
* the returned data will be placed in r->data.
*/
static int
nvmerio(SDreq* r)
{
int i, iostat;
ulong origdlen;
uchar *origdata;
static char info[256];
/* cache synchronisation requests are reported as successful at once */
if(*r->cmd == ScmdSynccache || *r->cmd == ScmdSynccache16)
return sdsetsense(r, SDok, 0, 0, 0);
/* scsi command to get information about the drive or disk? */
if((i = sdfakescsi(r, info, sizeof info)) != SDnostatus){
r->status = i;
return i;
}
/* no buffer, so nothing to transfer */
if(r->data == nil)
return SDok;
/*
* Cap the size of individual transfers and repeat if needed.
* Save r->data and r->dlen, and restore them after the loop.
* could call scsibio, which allocates an SDreq.
*/
origdata = r->data;
origdlen = r->dlen;
assert(r->unit->secsize >= Minsect &&
r->unit->secsize <= ((Ctlr *)r->unit->dev->ctlr)->pgsz);
iostat = issueios(r);
/* issueios advanced r->data past whatever was transferred */
r->rlen = (uchar *)r->data - origdata;
r->data = origdata;
r->dlen = origdlen;
r->status = SDok;
if (iostat != 0) {
r->status = SDeio;
/* 3, 0xc, 2: write error, reallocation failed */
sdsetsense(r, SDcheck, 3, 0xc, 2);
}
return r->status;
}
/*
 * format a description of unit's controller into p, of size l:
 * three registers, the serial number and, once the size is known,
 * the geometry.  returns the sum of the snprint results, or 0 if the
 * unit has no controller.
 */
static int
nvmerctl(SDunit* unit, char* p, int l)
{
	int len;
	Ctlr *ctlr;
	Regs *regs;

	ctlr = unit->dev->ctlr;
	if (ctlr == nil)
		return 0;
	regs = ctlr->regs;
	len = snprint(p, l, "config %#lux capabilities %#llux status %#lux\n",
		regs->cc, regs->cap, regs->csts);
	/*
	 * devsd has already generated "inquiry" line using the model,
	 * so printing ctlr->model here would be redundant.
	 */
	len += snprint(p+len, l-len, "serial %s\n", ctlr->serial);
	if (unit->sectors != 0)
		len += snprint(p+len, l-len, "geometry %lld %lud\n",
			unit->sectors, unit->secsize);
	return len;
}
/*
 * format one line of controller-wide statistics into [p, e) and
 * return seprint's result.
 * must emit exactly one line per controller (sd(3)).
 */
static char*
nvmertopctl(SDev *sdev, char *p, char *e)
{
	char *end;
	Ctlr *ctlr;

	ctlr = sdev->ctlr;
	end = seprint(p, e, "sd%c nvme regs %#p irq %d: max q lens, rd %d "
		"wr %d; max cycs, rd %lld wr %lld\n", sdev->idno, ctlr->port,
		ctlr->irq, ctlr->maxqlen[Read], ctlr->maxqlen[Write],
		ctlr->maxcyc[Read], ctlr->maxcyc[Write]);
	return end;
}
/*
* disable the controller by clearing Enable in cc, then wait for Rdy
* in csts to clear.  timeouts are reported but otherwise ignored.
*/
static void
reset(Regs *regs)
{
if (regs->cc & Enable) {
/* let any enable in progress finish before disabling */
if (awaitbitpat(&regs->csts, Rdy, Rdy) < 0)
print("nvme reset timed out awaiting ready\n");
regs->cc &= ~Enable;
coherence();
}
/* else may have previously cleared Enable & be waiting for not ready */
if (awaitbitpat(&regs->csts, Rdy, 0) < 0)
print("nvme reset timed out awaiting not ready\n");
}
/*
 * fill in unit's sense and inquiry data and its geometry from what
 * nvmectlrenable stored in the controller, then announce the drive.
 */
static void
nvmedrive(SDunit *unit)
{
	uchar *inq;
	Ctlr *ctlr;

	ctlr = unit->dev->ctlr;

	unit->sense[0] = 0x70;
	unit->sense[7] = sizeof(unit->sense)-7;

	inq = unit->inquiry;
	memset(inq, 0, sizeof unit->inquiry);
	inq[0] = SDperdisk;
	inq[2] = 2;
	inq[3] = 2;
	inq[4] = sizeof unit->inquiry - 4;
	/* model is smaller than unit->inquiry-8 */
	strncpy((char *)&inq[8], ctlr->model, sizeof ctlr->model);

	unit->secsize = ctlr->secsize;
	unit->sectors = ctlr->sectors;
	print("sd%C%d: nvme %,lld sectors: %s fw %s serial %s\n",
		unit->dev->idno, unit->subno, unit->sectors,
		ctlr->model, ctlr->fw, ctlr->serial);
}
/*
 * choose the nvme page size: start from Startpgsz limited to the
 * controller's maximum, raise it to the controller's minimum, cap it at
 * Sdalign (when Sdalign is at least 4K) and never go below 4K.
 * also records the controller's minimum page size in ctlr->minpgsz.
 */
static void
pickpgsz(Ctlr *ctlr)
{
	int pgsz;
	ulong minpgsz, maxpgsz;

	/* both limits are encoded in cap as log2(size) - 12 */
	minpgsz = 1 << (12 + ((ctlr->regs->cap >> 48) & MASK(4)));
	maxpgsz = 1 << (12 + ((ctlr->regs->cap >> 52) & MASK(4)));
	ctlr->minpgsz = minpgsz;		/* for Ctlrid->mdts */

	pgsz = MIN(Startpgsz, maxpgsz);
	if (pgsz < minpgsz)
		pgsz = minpgsz;
	if (Sdalign >= 4*KB && pgsz > Sdalign)
		pgsz = Sdalign;
	if (pgsz < 4*KB)			/* sanity */
		pgsz = 4*KB;
	ctlr->pgsz = pgsz;
}
/*
 * set the submission and completion queue lengths to the defaults
 * (Qlen, Cqlen), limited by the maximum the controller reports in cap.
 */
static void
pickqlens(Ctlr *ctlr)
{
	ulong limit;

	/* max i/o [sc] q len; the low 16 bits of cap hold it minus one */
	limit = (ctlr->regs->cap & MASK(16)) + 1;
	ctlr->cqlen = MIN(limit, Cqlen);
	ctlr->sqlen = MIN(limit, Qlen);
}
/*
* set up one nvme controller found on the pci bus: map its registers,
* allocate its Ctlr and SDev, reset it, and choose page size and queue
* lengths.  returns the new SDev, or nil if mapping or allocation fails.
* queues are not allocated here; that happens in nvmectlrenable.
*/
static SDev*
nvmeprobe(Pcidev *p)
{
int logstride;
uintptr port;
Ctlr *ctlr;
Regs *regs;
SDev *sdev;
static int count;
assert(p->mem[1].bar == 0); /* upper 32 bits of 64-bit addr */
port = p->mem[0].bar & ~0x0f; /* strip the BAR's low flag bits */
regs = vmap(port, p->mem[0].size);
if(regs == nil){
print("nvmeprobe: phys address %#p in use did=%#ux\n",
port, p->did);
return nil;
}
/* if the second malloc fails, ctlr is freed; if the first fails, ctlr is nil */
if ((ctlr = malloc(sizeof(Ctlr))) == nil ||
(sdev = malloc(sizeof(SDev))) == nil) {
free(ctlr);
vunmap(regs, p->mem[0].size);
return nil;
}
ctlr->regs = regs;
ctlr->port = port;
ctlr->irq = p->intl;
/*
* Attempt to hard-reset the board.
*/
reset(regs);
logstride = ((regs->cap >> 32) & MASK(4)); /* doorbell stride */
if (logstride != 0)
panic("nvmeprobe: doorbell stride must be 0 (for now), not %d",
logstride);
ctlr->stride = 1 << (2 + logstride); /* 2^(2+logstride) */
/*
* disabled by the leading 0.
* NOTE(review): Nssro is listed among the csts bits in the enum above,
* yet is written to cc here -- confirm before enabling.
*/
if (0 && regs->cap & Nssrs) { /* nvm subsys reset avail.? */
regs->cc |= Nssro; /* clear Nssro by setting it */
regs->nssrc = 'N'<<24 | 'V'<<16 | 'M'<<8 | 'e';
if (awaitbitpat(&regs->csts, Nssro, Nssro) < 0)
print("nvme subsys reset timed out awaiting Nssro\n");
}
pickpgsz(ctlr);
pickqlens(ctlr);
sdev->ifc = &sdnvmeifc;
sdev->ctlr = ctlr;
sdev->idno = 'n'; /* actually assigned in sdadddevs() */
sdev->nunit = NCtlrdrv; /* max. drives (can be number found) */
ctlr->sdev = sdev;
/*
* we (pnp) don't have a `spec' argument, so
* we'll assume that sdn0 goes to the first nvme host
* adapter found, sdo0 to the next, etc.
*/
print("#S/sd%c: nvme: irq %d regs %#p page size %d\n",
sdev->idno + count++, ctlr->irq, ctlr->port, ctlr->pgsz);
/* would probe for drives here if there could be more than one. */
/* upon return, this many sdev->units will be allocated. */
sdev->nunit = 1;
return sdev;
}
/* append sdev to the list described by *head and *tail */
static void
sdevadd(SDev *sdev, SDev **head, SDev **tail)
{
	if (*head == nil)
		*head = sdev;		/* first on the list */
	else
		(*tail)->next = sdev;
	*tail = sdev;
}
/*
 * find all nvme controllers: probe every matching pci device and return
 * the list of resulting SDevs (nil if none).
 * the first NCtlr controllers are also recorded in ctlrs for ckstuck.
 */
static SDev*
nvmepnp(void)
{
	Ctlr *ctlr;
	Pcidev *p;
	SDev *sdev, *head, *tail;

	head = tail = nil;
	for (p = pcimatch(nil, 0, 0); p != nil; p = pcimatch(p, 0, 0)) {
		/* ccrp 2 is NVME */
		if (!(p->ccrb == Pcibcstore && p->ccru == Pciscnvm &&
		    p->ccrp == 2))
			continue;
		sdev = nvmeprobe(p);
		if (sdev == nil)
			continue;
		ctlr = sdev->ctlr;
		ctlr->pcidev = p;
		sdevadd(sdev, &head, &tail);
		if (nctlrs < NCtlr)
			ctlrs[nctlrs++] = ctlr;
		else
			print("too many nvme controllers\n");
	}
	return head;
}
/*
 * allocate qp's submission and completion queues, aligned to the nvme
 * page size and with the lengths chosen for ctlr.  panics if memory
 * runs out.
 */
static void
allocqpair(Ctlr *ctlr, Qpair *qp)
{
	Cmd *sq;
	Completion *cq;

	assert(ctlr->pgsz);
	sq = mallocalign(ctlr->sqlen * sizeof(Cmd), ctlr->pgsz, 0, 0);
	cq = mallocalign(ctlr->cqlen * sizeof(Completion), ctlr->pgsz, 0, 0);
	qp->sqlen = ctlr->sqlen;
	qp->cqlen = ctlr->cqlen;
	qp->q = sq;
	qp->cmpl = cq;
	if (sq == nil || cq == nil)
		panic("nvmectlrenable: out of memory for queues");
}
/*
* program the admin queue attributes and base addresses and the
* controller configuration register.  the caller (nvmectlrenable) does
* this while the controller is disabled.
*/
static void
configure(Ctlr *ctlr, Qpair *qpadm)
{
Regs *regs = ctlr->regs;
/* queue lengths are written minus one: completion q high, submission q low */
regs->aqa = (ctlr->cqlen - 1)<<16 | (ctlr->sqlen - 1);
regs->asq = PCIWADDR((void *)qpadm->q);
regs->acq = PCIWADDR((void *)qpadm->cmpl);
/* entry sizes are given as log2; page size as log2 minus 12 */
regs->cc = log2(sizeof(Completion))<<20 | log2(sizeof(Cmd))<<16 |
(log2(ctlr->pgsz)-12) << 7 | Cssnvm;
coherence();
}
/*
* enable the controller by setting Enable in cc, then wait for Rdy in
* csts to be set.  timeouts are reported but otherwise ignored.
*/
static void
enable(Regs *regs)
{
if (!(regs->cc & Enable)) {
/* let any disable in progress finish before enabling */
if (awaitbitpat(&regs->csts, Rdy, 0) < 0)
print("nvme enable timed out awaiting not ready\n");
regs->cc |= Enable;
coherence();
}
/* else may have previously set Enable & be waiting for ready */
if (awaitbitpat(&regs->csts, Rdy, Rdy) < 0)
print("nvme enable timed out awaiting ready\n");
}
/*
 * ns numbers start at 1 and are densely-packed.
 * pick one with 512-byte blocks, return preferred lbafmt via *lbafmtp.
 * failing that, settle for one with 4096-byte blocks.
 * only formats without metadata (mdsize 0) are considered.
 *
 * returns the chosen namespace number, or 0 if none is suitable.
 * on success, the page at nsid holds the identify data of the
 * namespace returned.  panics if an identify command fails.
 */
static int
bestns(Ctlr *ctlr, int nns, Nsid *nsid, int *lbafmtp)
{
	int i, ns, second, nssecond, lbasize;
	Lbafmt *lbafmt;

	second = 0;
	nssecond = 0;
	*lbafmtp = 0;
	for (ns = 1; ns <= nns; ns++) {
		if (nvmeadmissue(ctlr, Admid, ns, nsid) != 0)
			panic("nvmectlrenable: Admid(%d) failed", ns);
		for (i = 0; i < nelem(nsid->lbafmt); i++) {
			lbafmt = &nsid->lbafmt[i];
			if (lbafmt->lglbasize == 0)	/* end lbafmt list? */
				break;
			lbasize = 1 << lbafmt->lglbasize;
			if (Debugns)
				print("nvme ns %d: lba %d mdsize %d perf %d\n",
					ns, lbasize, lbafmt->mdsize,
					lbafmt->relperf & 3);
			if (lbafmt->mdsize == 0 && lbasize == Minsect) {
				*lbafmtp = i;
				return ns;
			}
			/* settle for 4k if that's all there is */
			if (lbafmt->mdsize == 0 && lbasize == 4096) {
				second = i;
				nssecond = ns;
			}
		}
	}
	if (nssecond == 0)
		return 0;		/* nothing suitable */

	/*
	 * the page at nsid describes the last namespace examined (nns);
	 * if the fallback is a different one, fetch its data again so
	 * that the caller reads the right lbafmt array and capacity.
	 */
	if (nssecond != nns && nvmeadmissue(ctlr, Admid, nssecond, nsid) != 0)
		panic("nvmectlrenable: Admid(%d) failed", nssecond);
	*lbafmtp = second;
	/* the namespace number, not the lba format index */
	return nssecond;
}
/*
 * copy id string from controller, trim trailing blanks, downcase.
 * assumes src is unterminated and dest is at least one byte larger.
 *
 * dest receives size bytes of src plus a terminating NUL.
 * a field consisting entirely of blanks yields the empty string.
 * a size of zero or less yields the empty string too.
 */
static void
idcopy(char *dest, char *src, int size)
{
	char *p;

	if (size < 0)
		size = 0;
	memmove(dest, src, size);
	dest[size] = '\0';

	/* walk back from the end, replacing each trailing blank with NUL */
	for (p = &dest[size]; p > dest && p[-1] == ' '; )
		*--p = '\0';

	/* the cast keeps bytes with the top bit set from going negative */
	for (p = dest; *p != '\0'; p++)
		*p = tolower((unsigned char)*p);
}
/*
 * register nvmeinterrupt as the interrupt handler for sdev's controller,
 * then mask all of the controller's interrupt sources.
 */
static void
nvmeintron(SDev *sdev)
{
	char name[32];
	Ctlr *ctlr = sdev->ctlr;

	snprint(name, sizeof name, "sd%c (%s)", sdev->idno, sdev->ifc->name);
	enableintr(ctlr, nvmeinterrupt, ctlr, name);
	/* mask all interrupt sources */
	ctlr->regs->intmset = ~0;
}
/* zero all four head and tail indices of qp's two queues */
static void
zeroqhdtls(Qpair *qp)
{
	qp->qidx.tl = 0;
	qp->cidx.hd = 0;
	/* paranoia */
	qp->qidx.hd = 0;
	qp->cidx.tl = 0;
	coherence();
}
/*
* bring up a controller: allocate and configure the admin and i/o queue
* pairs, enable the controller and its interrupt, identify the
* controller, create the i/o queues, and choose a namespace.
* records id strings, mdts, sector size, sector count and namespace in
* ctlr.  every failure panics; the only value returned is 1.
*/
static int
nvmectlrenable(Ctlr* ctlr)
{
int i, nns, gotns;
char *idpage;
Ctlrid *ctlrid;
Lbafmt *lbafmt;
Nsid *nsid;
Qpair *qpadm, *qpio;
Regs *regs = ctlr->regs;
SDev *sdev = ctlr->sdev;
/* we need at least one admin queue and one i/o queue */
qpadm = &ctlr->qpair[Qadmin];
allocqpair(ctlr, qpadm);
qpio = &ctlr->qpair[Qio];
allocqpair(ctlr, qpio);
assert(!(regs->cc & Enable));
configure(ctlr, qpadm); /* must do this while ctlr is disabled */
enable(regs);
zeroqhdtls(qpadm); /* paranoia */
regs->intmset = ~0; /* mask all interrupt sources */
nvmeintron(sdev);
/* one page serves for the controller and then the namespace identify data */
idpage = mallocalign(BY2PG, ctlr->pgsz, 0, 0);
if (idpage == nil)
panic("nvmectlrenable: out of memory");
if (nvmeadmissue(ctlr, Admid, Nsall, idpage) != 0)
panic("nvmectlrenable: Admid(Nsall) failed");
ctlrid = (Ctlrid *)idpage;
nns = ctlrid->nns;
/* smuggle hw id strings into ctlr for later printing */
idcopy(ctlr->serial, ctlrid->serial, sizeof ctlrid->serial);
idcopy(ctlr->model, ctlrid->model, sizeof ctlrid->model);
idcopy(ctlr->fw, ctlrid->fw, sizeof ctlrid->fw);
/* ctlrid->mdts is a log2; zero (unlimited) leaves ctlr->mdts untouched */
if (ctlrid->mdts)
ctlr->mdts = 1 << ctlrid->mdts;
// iprint("nvme: max xfr size %d\n", ctlr->mdts * ctlr->minpgsz);
/*
* create first i/o queue with admin queue cmds.
* completion queue must be created first.
*/
if (nvmeadmissue(ctlr, Admmkiocq, Nsunused, qpio->cmpl) != 0)
panic("nvmectlrenable: Admmkiocq failed");
if (nvmeadmissue(ctlr, Admmkiosq, Nsunused, qpio->q) != 0)
panic("nvmectlrenable: Admmkiosq failed");
zeroqhdtls(qpio); /* paranoia */
/* find a suitable namespace */
nsid = (Nsid *)idpage; /* the controller data in idpage is overwritten from here on */
gotns = bestns(ctlr, nns, nsid, &i); /* fills in nsid page */
if (gotns == 0)
panic("nvmectlrenable: no suitable namespace found");
lbafmt = &nsid->lbafmt[i];
ctlr->secsize = 1 << lbafmt->lglbasize; /* remember for SDunit */
ctlr->sectors = nsid->cap; /* remember for SDunit */
ctlr->ns = gotns;
free(idpage);
if (Debugns)
print("nvme best ns: %d: sectors %,lld of %d bytes\n",
ctlr->ns, ctlr->sectors, ctlr->secsize);
return 1;
}
/* release qp's two queues and forget their addresses */
static void
freeqpair(Qpair *qp)
{
	free(qp->q);
	qp->q = nil;

	free(qp->cmpl);
	qp->cmpl = nil;
}
static void
ckstuck(void)
{
int i;
static int whined;
for (i = 0; i < nctlrs; i++)
nvmeinterrupt(nil, ctlrs[i]);
if (iosttck && sys->ticks - iosttck > 5*HZ && ++whined < 5)
iprint("nvme: stuck for 5 s.\n");
}
/*
 * activate a single nvme controller, sdev.
 * upon return, sdev->nunit SDunits will be allocated.
 * returns 0 if the controller already has an admin queue or if bringing
 * it up fails, else 1.  the first success also starts the ckstuck
 * clock link.
 */
static int
nvmeenable(SDev* sdev)
{
	Ctlr *ctlr;

	ctlr = sdev->ctlr;
	if (ctlr->qpair[Qadmin].q != nil)
		return 0;

	pcisetbme(ctlr->pcidev);
	if (nvmectlrenable(ctlr) == 0) {
		freeqpair(&ctlr->qpair[Qadmin]);
		freeqpair(&ctlr->qpair[Qio]);
		return 0;
	}

	/* watch for hardware bugs */
	lock(&clocklck);
	if (clockrunning == 0) {
		addclock0link(ckstuck, 1000);
		clockrunning = 1;
	}
	unlock(&clocklck);
	return 1;
}
/*
 * mask all interrupt sources of sdev's controller, then unregister
 * nvmeinterrupt as its handler.
 */
static void
nvmeintroff(SDev *sdev)
{
	char name[32];
	Ctlr *ctlr = sdev->ctlr;

	snprint(name, sizeof name, "sd%c (%s)", sdev->idno, sdev->ifc->name);
	/* mask all interrupt sources */
	ctlr->regs->intmset = ~0;
	disableintr(ctlr, nvmeinterrupt, ctlr, name);
}
/*
 * returns when all in-flight transfers are done, or after about a
 * second of polling, whichever comes first.
 * call with shutlock & issuelock held; both are released around each
 * 1 ms delay and are held again on return.
 */
static void
waitnoxfrs(Ctlr *ctlr)
{
	int i;

	for (i = 1000; i-- > 0 && ctlr->inflight > 0; ) {
		/* let the interrupt service routine at issuelock */
		iunlock(&ctlr->shutlock);
		iunlock(&ctlr->issuelock);
		delay(1);
		ilock(&ctlr->issuelock);
		ilock(&ctlr->shutlock);
	}
	/*
	 * test the count itself, not the loop counter: i is also 0 when
	 * the last transfer completed during the final poll.
	 */
	if (ctlr->inflight > 0)
		iprint("sdnvme: %d transfers still in flight after 1 s.\n",
			ctlr->inflight);
}
/*
* quiesce sdev's controller: issue a flush, wait for outstanding
* transfers, turn off its interrupt and clear pci bus mastering.
* always returns 1.
*/
static int
nvmedisable(SDev* sdev) /* disable interrupts for this sdev */
{
Ctlr *ctlr;
ctlr = sdev->ctlr;
if (ctlr == nil)
return 1;
/* flush with nsid Nsall; nvmeissue waits for it to complete */
nvmeissue(ctlr, &ctlr->qpair[Qio], nil, Cmdflush, Nsall, nil, 0);
/* waitnoxfrs expects both locks, taken in this order */
ilock(&ctlr->issuelock);
ilock(&ctlr->shutlock);
waitnoxfrs(ctlr);
nvmeintroff(sdev);
pciclrbme(ctlr->pcidev);
iunlock(&ctlr->shutlock);
iunlock(&ctlr->issuelock);
return 1;
}
/*
* wait for outstanding transfers on sdev's controller, then reset
* (disable) it.  does nothing if sdev has no controller.
*/
static void
nvmeclear(SDev* sdev) /* clear the interface for this sdev */
{
Ctlr *ctlr;
ctlr = sdev->ctlr;
if (ctlr == nil)
return;
/* waitnoxfrs expects both locks, taken in this order */
ilock(&ctlr->issuelock);
ilock(&ctlr->shutlock);
if (ctlr->regs) {
waitnoxfrs(ctlr);
reset(ctlr->regs); /* ctlrs and drives are one-to-one */
}
iunlock(&ctlr->shutlock);
iunlock(&ctlr->issuelock);
}
/*
 * see if a particular drive exists: only sub-unit 0 does.
 * must not set unit->sectors here, but rather in nvmeonline.
 */
static int
nvmeverify(SDunit *unit)
{
	return unit->subno == 0;
}
/*
 * initialise a drive known to exist.
 * returns boolean for success: 0 for any sub-unit but 0 or if
 * scsionline fails, 1 if the drive is, or already was, set up.
 */
static int
nvmeonline(SDunit *unit)
{
	if (unit->subno != 0)		/* not me? */
		return 0;
	if (unit->sectors)		/* already inited? */
		return 1;
	if (scsionline(unit) == 0)
		return 0;
	nvmedrive(unit);
	/*
	 * could hang around until disks are spun up and thus available as
	 * nvram, dos file systems, etc.  you wouldn't expect it, but
	 * the intel 330 sata ssd takes a while to `spin up'.
	 */
	return 1;			/* drive ready */
}
/*
* the driver's interface table: the functions above, in the slots
* named by the comments.  nvmeprobe points each SDev's ifc at it.
*/
SDifc sdnvmeifc = {
"nvme", /* name */
nvmepnp, /* pnp */
nil, /* legacy */
nvmeenable, /* enable */
nvmedisable, /* disable */
nvmeverify, /* verify */
nvmeonline, /* online */
nvmerio, /* rio */
nvmerctl, /* rctl */
nil, /* wctl */
scsibio, /* bio */
nil, /* probe */
nvmeclear, /* clear */
nvmertopctl, /* rtopctl */
nil, /* wtopctl */
};
<!-- BEGIN TAIL -->
</pre>
</td></tr></table>
</td></tr></table>
<p style="margin-top: 0; margin-bottom: 0.17in"></p>
<p style="line-height: 1.2em; margin-left: 1.00in; text-indent: 0.00in; margin-right: 1.00in; margin-top: 0; margin-bottom: 0; text-align: center;">
<span style="font-size: 10pt"></span></p>
<p style="margin-top: 0; margin-bottom: 0.50in"></p>
<p style="margin-top: 0; margin-bottom: 0.33in"></p>
<center><table border="0"><tr>
<td valign="middle"><a href="
http://www.alcatel-lucent.com/"><img border="0" src="/plan9/img/logo_ft.gif" alt="Bell Labs" />
</a></td>
<td valign="middle"><a href="
http://www.opensource.org"><img border="0" alt="OSI certified" src="/plan9/img/osi-certified-60x50.gif" />
</a></td>
<td><img style="padding-right: 45px;" alt="Powered by Plan 9" src="/plan9/img/power36.gif" />
</td>
</tr></table></center>
<p style="margin-top: 0; margin-bottom: 0.17in"></p>
<center>
<span style="font-size: 10pt">(<a href="/plan9/">Return to Plan 9 Home Page</a>)</span>
</center>
<p style="margin-top: 0; margin-bottom: 0.17in"></p>
<center><font size=-1>
<span style="font-size: 10pt"><a href="
http://www.lucent.com/copyright.html">Copyright</a></span>
<span style="font-size: 10pt">© 2009 Alcatel-Lucent.</span>
<span style="font-size: 10pt">All Rights Reserved.</span>
<br />
<span style="font-size: 10pt">Comments to</span>
<span style="font-size: 10pt"><a href="mailto:
[email protected]">
[email protected]</a>.</span>
</font></center>
</body>
</html>