[SeaBIOS] [PATCH] block: add NVMe boot support

Kevin O'Connor kevin at koconnor.net
Tue Jan 24 18:26:06 CET 2017


On Fri, Jan 20, 2017 at 12:26:25PM -0800, Julian Stecklina wrote:
> This patch enables SeaBIOS to boot from NVMe. Finding namespaces and basic I/O
> work. Testing has been done in QEMU, and so far it works with GRUB, syslinux,
> and the FreeBSD loader. You need a recent QEMU (>= October 2016), because older
> versions have buggy NVMe support.

Thanks.  See my comments below.  Mostly minor things I noticed.

> 
> Signed-off-by: Julian Stecklina <jsteckli at amazon.com>
> ---
>  Makefile          |   2 +-
>  src/Kconfig       |   6 +
>  src/block.c       |   4 +
>  src/block.h       |   1 +
>  src/hw/nvme-int.h | 199 +++++++++++++++++
>  src/hw/nvme.c     | 622 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  src/hw/nvme.h     |  15 ++
>  src/hw/pci_ids.h  |   1 +
>  8 files changed, 849 insertions(+), 1 deletion(-)
>  create mode 100644 src/hw/nvme-int.h
>  create mode 100644 src/hw/nvme.c
>  create mode 100644 src/hw/nvme.h
> 
> diff --git a/Makefile b/Makefile
> index 3b94ee0..946df7e 100644
> --- a/Makefile
> +++ b/Makefile
> @@ -43,7 +43,7 @@ SRC32FLAT=$(SRCBOTH) post.c e820map.c malloc.c romfile.c x86.c optionroms.c \
>      fw/paravirt.c fw/shadow.c fw/pciinit.c fw/smm.c fw/smp.c fw/mtrr.c fw/xen.c \
>      fw/acpi.c fw/mptable.c fw/pirtable.c fw/smbios.c fw/romfile_loader.c \
>      hw/virtio-ring.c hw/virtio-pci.c hw/virtio-blk.c hw/virtio-scsi.c \
> -    hw/tpm_drivers.c
> +    hw/tpm_drivers.c hw/nvme.c
>  SRC32SEG=string.c output.c pcibios.c apm.c stacks.c hw/pci.c hw/serialio.c
>  DIRS=src src/hw src/fw vgasrc
>  
> diff --git a/src/Kconfig b/src/Kconfig
> index 457d082..77ec9c7 100644
> --- a/src/Kconfig
> +++ b/src/Kconfig
> @@ -227,6 +227,12 @@ menu "Hardware support"
>          help
>              Support floppy images stored in coreboot flash or from
>              QEMU fw_cfg.
> +    config NVME
> +        depends on DRIVES
> +        bool "NVMe controllers"
> +        default y
> +        help
> +            Support for NVMe disk code.

Is this device also available in real hardware?  Is it expected to
work on real hardware and/or has it been tested?

If it hasn't been tested on real hardware, make this dependent on
QEMU_HARDWARE and add a runningOnQEMU() runtime check (see lsi-scsi.c
for an example).
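
Untested sketch - the Kconfig entry would gain "depends on DRIVES &&
QEMU_HARDWARE" like lsi-scsi does, and nvme_setup() would bail out early
when not running under QEMU (runningOnQEMU() is in fw/paravirt.h):

    void
    nvme_setup(void)
    {
        ASSERT32FLAT();
        if (!CONFIG_NVME || !runningOnQEMU())
            return;

        dprintf(3, "init nvme\n");
        nvme_scan();
    }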

>      config PS2PORT
>          depends on KEYBOARD || MOUSE
> diff --git a/src/block.c b/src/block.c
> index f7280cf..d104f6d 100644
> --- a/src/block.c
> +++ b/src/block.c
> @@ -20,6 +20,7 @@
>  #include "hw/usb-uas.h" // uas_process_op
>  #include "hw/virtio-blk.h" // process_virtio_blk_op
>  #include "hw/virtio-scsi.h" // virtio_scsi_process_op
> +#include "hw/nvme.h" // nvme_process_op
>  #include "malloc.h" // malloc_low
>  #include "output.h" // dprintf
>  #include "stacks.h" // call32
> @@ -502,6 +503,7 @@ block_setup(void)
>      megasas_setup();
>      pvscsi_setup();
>      mpt_scsi_setup();
> +    nvme_setup();
>  }
>  
>  // Fallback handler for command requests not implemented by drivers
> @@ -571,6 +573,8 @@ process_op_32(struct disk_op_s *op)
>          return virtio_scsi_process_op(op);
>      case DTYPE_PVSCSI:
>          return pvscsi_process_op(op);
> +    case DTYPE_NVME:
> +        return nvme_process_op(op);
>      default:
>          return process_op_both(op);
>      }
> diff --git a/src/block.h b/src/block.h
> index 0f15ff9..f03ec38 100644
> --- a/src/block.h
> +++ b/src/block.h
> @@ -82,6 +82,7 @@ struct drive_s {
>  #define DTYPE_PVSCSI       0x83
>  #define DTYPE_MPT_SCSI     0x84
>  #define DTYPE_SDCARD       0x90
> +#define DTYPE_NVME         0x91
>  
>  #define MAXDESCSIZE 80
>  
> diff --git a/src/hw/nvme-int.h b/src/hw/nvme-int.h
> new file mode 100644
> index 0000000..873ee71
> --- /dev/null
> +++ b/src/hw/nvme-int.h
> @@ -0,0 +1,199 @@
> +// NVMe data structures and constants
> +//
> +// Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved.
> +
> +#ifndef __NVME_INT_H
> +#define __NVME_INT_H
> +
> +#include "types.h" // u32
> +#include "pcidevice.h" // struct pci_device
> +
> +/* Data structures */
> +
> +/* The register file of an NVMe host controller. This struct follows the naming
> +   scheme in the NVMe specification. */
> +struct nvme_reg {
> +    u64 cap;                    /* controller capabilities */
> +    u32 vs;                     /* version */
> +    u32 intms;                  /* interrupt mask set */
> +    u32 intmc;                  /* interrupt mask clear */
> +    u32 cc;                     /* controller configuration */
> +    u32 _res0;
> +    u32 csts;                   /* controller status */
> +    u32 _res1;
> +    u32 aqa;                    /* admin queue attributes */
> +    u64 asq;                    /* admin submission queue base address */
> +    u64 acq;                    /* admin completion queue base address */
> +};
> +
> +/* Submission queue entry */
> +struct nvme_sqe {
> +    union {
> +        u32 dword[16];
> +        struct {
> +            u32 cdw0;           /* Command DWORD 0 */
> +            u32 nsid;           /* Namespace ID */
> +            u64 _res0;
> +            u64 mptr;           /* metadata ptr */
> +
> +            u64 dptr_prp1;
> +            u64 dptr_prp2;
> +        };
> +    };
> +};
> +
> +/* Completion queue entry */
> +struct nvme_cqe {
> +    union {
> +        u32 dword[4];
> +        struct {
> +            u32 cdw0;
> +            u32 _res0;
> +            u16 sq_head;
> +            u16 sq_id;
> +            u16 cid;
> +            u16 status;
> +        };
> +    };
> +};
> +
> +/* The common part of every submission or completion queue. */
> +struct nvme_queue {
> +    u32 volatile *dbl;          /* doorbell */
> +    u16 mask;                   /* length - 1 */
> +};
> +
> +struct nvme_cq {
> +    struct nvme_queue common;
> +    struct nvme_cqe *cqe;
> +
> +    /* We have read up to (but not including) this entry in the queue. */
> +    u16 head;
> +
> +    /* The current phase bit the controller uses to indicate that it has written
> +       a new entry. This is inverted after each wrap. */
> +    unsigned phase : 1;
> +};
> +
> +struct nvme_sq {
> +    struct nvme_queue common;
> +    struct nvme_sqe *sqe;
> +
> +    /* Corresponding completion queue. We only support a single SQ per CQ. */
> +    struct nvme_cq *cq;
> +
> +    /* The last entry the controller has fetched. */
> +    u16 head;
> +
> +    /* The last value we have written to the tail doorbell. */
> +    u16 tail;
> +};
> +
> +struct nvme_ctrl {
> +    struct pci_device *pci;
> +    struct nvme_reg volatile *reg;
> +
> +    u32 doorbell_stride;        /* in bytes */
> +
> +    struct nvme_sq admin_sq;
> +    struct nvme_cq admin_cq;
> +
> +    u32 ns_count;
> +    struct nvme_namespace *ns;
> +
> +    struct nvme_sq io_sq;
> +    struct nvme_cq io_cq;
> +};
> +
> +struct nvme_namespace {
> +    struct drive_s drive;
> +    struct nvme_ctrl *ctrl;
> +
> +    u32 ns_id;
> +
> +    u64 lba_count;              /* The total number of sectors. */
> +
> +    u32 block_size;
> +    u32 metadata_size;
> +
> +    /* Page aligned buffer of size NVME_PAGE_SIZE. */
> +    char *dma_buffer;
> +};
> +
> +/* Data structures for NVMe admin identify commands */
> +
> +struct nvme_identify_ctrl {
> +    u16 vid;
> +    u16 ssvid;
> +    char sn[20];
> +    char mn[40];
> +    char fr[8];
> +
> +    char _boring[516 - 72];
> +
> +    u32 nn;                     /* number of namespaces */
> +};
> +
> +struct nvme_identify_ns_list {
> +    u32 ns_id[1024];
> +};
> +
> +struct nvme_lba_format {
> +    u16 ms;
> +    u8  lbads;
> +    u8  rp;
> +    u8  res;
> +};
> +
> +struct nvme_identify_ns {
> +    u64 nsze;
> +    u64 ncap;
> +    u64 nuse;
> +    u8  nsfeat;
> +    u8  nlbaf;
> +    u8  flbas;
> +
> +    char _boring[128 - 27];
> +
> +    struct nvme_lba_format lbaf[16];
> +};
> +
> +union nvme_identify {
> +    struct nvme_identify_ns      ns;
> +    struct nvme_identify_ctrl    ctrl;
> +    struct nvme_identify_ns_list ns_list;
> +};
> +
> +/* NVMe constants */
> +
> +#define NVME_CAP_CSS_NVME (1ULL << 37)
> +
> +#define NVME_CSTS_FATAL   (1U <<  1)
> +#define NVME_CSTS_RDY     (1U <<  0)
> +
> +#define NVME_CC_EN        (1U <<  0)
> +
> +#define NVME_SQE_OPC_ADMIN_CREATE_IO_SQ 1U
> +#define NVME_SQE_OPC_ADMIN_CREATE_IO_CQ 5U
> +#define NVME_SQE_OPC_ADMIN_IDENTIFY     6U
> +
> +#define NVME_SQE_OPC_IO_WRITE 1U
> +#define NVME_SQE_OPC_IO_READ  2U
> +
> +#define NVME_ADMIN_IDENTIFY_CNS_ID_NS       0U
> +#define NVME_ADMIN_IDENTIFY_CNS_ID_CTRL     1U
> +#define NVME_ADMIN_IDENTIFY_CNS_GET_NS_LIST 2U
> +
> +#define NVME_CQE_DW3_P (1U << 16)
> +
> +#define NVME_PAGE_SIZE 4096
> +
> +/* Length for the queue entries. */
> +#define NVME_SQE_SIZE_LOG 6
> +#define NVME_CQE_SIZE_LOG 4
> +_Static_assert(sizeof(struct nvme_sqe) == 1U << NVME_SQE_SIZE_LOG, "invalid queue entry size");
> +_Static_assert(sizeof(struct nvme_cqe) == 1U << NVME_CQE_SIZE_LOG, "invalid queue entry size");

Current SeaBIOS supports being compiled with gcc v3.4, and this construct
isn't supported there.  That compiler is pretty old, but for now it's
going to be easier to just change this in your patch.
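
For example, the usual pre-C11 negative-array-size trick builds fine on
old gcc (the macro name here is arbitrary):

    #define NVME_STATIC_ASSERT(cond) \
        extern char nvme_static_assert_failed[(cond) ? 1 : -1]

    NVME_STATIC_ASSERT(sizeof(struct nvme_sqe) == 1U << NVME_SQE_SIZE_LOG);
    NVME_STATIC_ASSERT(sizeof(struct nvme_cqe) == 1U << NVME_CQE_SIZE_LOG);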

> +
> +#endif
> +
> +/* EOF */
> diff --git a/src/hw/nvme.c b/src/hw/nvme.c
> new file mode 100644
> index 0000000..1266d7f
> --- /dev/null
> +++ b/src/hw/nvme.c
> @@ -0,0 +1,622 @@
> +// Low level NVMe disk access
> +//
> +// Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved.
> +
> +#include "blockcmd.h"
> +#include "malloc.h" // malloc_fseq
> +#include "string.h" // memset
> +#include "output.h" // dprintf
> +#include "pci.h"
> +#include "pcidevice.h" // foreachpci
> +#include "pci_ids.h" // PCI_CLASS_STORAGE_NVME
> +#include "pci_regs.h" // PCI_BASE_ADDRESS_0
> +#include "util.h" // boot_add_hd
> +#include "std/disk.h" // DISK_RET_
> +#include "util.h" // timer_calc
> +#include "x86.h" // cpu_relax
> +
> +#include "nvme.h"
> +#include "nvme-int.h"
> +
> +/* Sequentially consistent read. */
> +static u32 nvme_seq_read(u32 *p) { return *(_Atomic u32 *)p; }
> +
> +/* Sequentially consistent writes. We have a volatile version for doorbell registers. */
> +static void nvme_seq_writev(u32 volatile *p, u32 v) { *(_Atomic volatile u32 *)p = v; }

Same gcc v3.4 issue with _Atomic.  Is _Atomic necessary or just a
decoration?  The seabios code typically uses readl/writel to make the
accesses atomic.
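
Untested, but since this file already includes x86.h, something like the
following should work and drops the C11 dependency:

    /* readl/writel (x86.h) are single aligned volatile accesses */
    static u32 nvme_seq_read(u32 *p) { return readl(p); }
    static void nvme_seq_writev(u32 volatile *p, u32 v) { writel((void*)p, v); }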

> +
> +static void *
> +zalloc_page_aligned_high(u32 size)
> +{
> +    void *res = memalign_high(NVME_PAGE_SIZE, size);
> +    if (res) memset(res, 0, size);
> +    return res;
> +}
> +
> +static void
> +nvme_init_queue_common(struct nvme_ctrl *ctrl, struct nvme_queue *q, u16 q_idx, u16 length)
> +{
> +    memset(q, 0, sizeof(*q));
> +    q->dbl = (u32 volatile *)((char *)ctrl->reg + 0x1000 + q_idx * ctrl->doorbell_stride);
> +    dprintf(3, " q %p q_idx %u dbl %p\n", q, q_idx, q->dbl);
> +    q->mask = length - 1;
> +}
> +
> +static void
> +nvme_init_sq(struct nvme_ctrl *ctrl, struct nvme_sq *sq, u16 q_idx, u16 length,
> +             struct nvme_cq *cq)
> +{
> +    nvme_init_queue_common(ctrl, &sq->common, q_idx, length);
> +    sq->sqe = zalloc_page_aligned_high(sizeof(*sq->sqe) * length);
> +    dprintf(3, "sq %p q_idx %u sqe %p\n", sq, q_idx, sq->sqe);
> +    sq->cq   = cq;
> +    sq->head = 0;
> +    sq->tail = 0;
> +}
> +
> +static void
> +nvme_init_cq(struct nvme_ctrl *ctrl, struct nvme_cq *cq, u16 q_idx, u16 length)
> +{
> +    nvme_init_queue_common(ctrl, &cq->common, q_idx, length);
> +    cq->cqe = zalloc_page_aligned_high(sizeof(*cq->cqe) * length);
> +
> +    cq->head = 0;
> +
> +    /* All CQE phase bits are initialized to zero. This means initially we wait
> +       for the host controller to set these to 1. */
> +    cq->phase = 1;
> +}
> +
> +static int
> +nvme_poll_cq(struct nvme_cq *cq)
> +{
> +    u32 dw3 = nvme_seq_read(&cq->cqe[cq->head].dword[3]);
> +    return (!!(dw3 & NVME_CQE_DW3_P) == cq->phase);
> +}
> +
> +static int
> +nvme_is_cqe_success(struct nvme_cqe const *cqe)
> +{
> +    return (cqe->status & 0xFF) >> 1 == 0;
> +}
> +
> +static struct nvme_cqe
> +nvme_consume_cqe(struct nvme_sq *sq)
> +{
> +    struct nvme_cq *cq = sq->cq;
> +
> +    if (!nvme_poll_cq(cq)) {
> +        panic("nvme: can't consume cqe when not ready!\n");

We don't want to panic() in any driver - halting the BIOS can make a
machine inoperable.  Instead, this (and the other locations) should
call warn_internalerror() and return early.
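
For example (sketch - nvme_timeout_cqe() would need to move above this
function or gain a forward declaration):

    if (!nvme_poll_cq(cq)) {
        warn_internalerror();
        return nvme_timeout_cqe();
    }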

> +    }
> +
> +    struct nvme_cqe *cqe = &cq->cqe[cq->head];
> +    u16 cq_next_head = (cq->head + 1) & cq->common.mask;
> +    dprintf(4, "cq %p head %u -> %u\n", cq, cq->head, cq_next_head);
> +    if (cq_next_head < cq->head) {
> +        dprintf(3, "cq %p wrap\n", cq);
> +        cq->phase = ~cq->phase;
> +    }
> +    cq->head = cq_next_head;
> +
> +    /* Update the submission queue head. */
> +    if (cqe->sq_head != sq->head) {
> +        sq->head = cqe->sq_head;
> +        dprintf(4, "sq %p advanced to %u\n", sq, cqe->sq_head);
> +    }
> +
> +    /* Tell the controller that we consumed the completion. */
> +    nvme_seq_writev(cq->common.dbl, cq->head);
> +
> +    return *cqe;
> +}
> +
> +static struct nvme_cqe
> +nvme_timeout_cqe(void)
> +{
> +    struct nvme_cqe r;
> +
> +    /* 0xFF is a vendor specific status code != success. Should be okay for
> +       indicating failure. */
> +    memset(&r, 0xFF, sizeof(r));
> +    return r;
> +}
> +
> +static struct nvme_cqe
> +nvme_wait(struct nvme_sq *sq)
> +{
> +    static const unsigned nvme_timeout = 500 /* ms */;
> +    u32 to = timer_calc(nvme_timeout);
> +    while (!nvme_poll_cq(sq->cq)) {
> +        cpu_relax();

Unless I'm missing something subtle, this should be yield() so that
irqs may be serviced.

> +
> +        if (timer_check(to)) {
> +            warn_timeout();
> +            return nvme_timeout_cqe();
> +        }
> +    }
> +
> +    return nvme_consume_cqe(sq);
> +}
> +
> +/* Returns the next submission queue entry (or NULL if the queue is full). It
> +   also fills out Command Dword 0 and clears the rest. */
> +static struct nvme_sqe *
> +nvme_get_next_sqe(struct nvme_sq *sq, u8 opc, void *metadata, void *data)
> +{
> +    if (((sq->head + 1) & sq->common.mask) == sq->tail) {
> +        dprintf(3, "submission queue is full\n");
> +        return NULL;
> +    }
> +
> +    struct nvme_sqe *sqe = &sq->sqe[sq->tail];
> +    dprintf(4, "sq %p next_sqe %u\n", sq, sq->tail);
> +
> +    memset(sqe, 0, sizeof(*sqe));
> +    sqe->cdw0 = opc | (sq->tail << 16 /* CID */);
> +    sqe->mptr = (u32)metadata;
> +    sqe->dptr_prp1 = (u32)data;
> +
> +    if (sqe->dptr_prp1 & (NVME_PAGE_SIZE - 1)) {
> +        panic("data buffer not page aligned: %p\n", data);
> +    }
> +
> +    return sqe;
> +}
> +
> +/* Call this after you've filled out an sqe that you've got from nvme_get_next_sqe. */
> +static void
> +nvme_commit_sqe(struct nvme_sq *sq)
> +{
> +    dprintf(4, "sq %p commit_sqe %u\n", sq, sq->tail);
> +    sq->tail = (sq->tail + 1) & sq->common.mask;
> +    nvme_seq_writev(sq->common.dbl, sq->tail);
> +}
> +
> +/* The caller needs to free the returned pointer, because this is the 80s and we
> +   can't use unique_ptr. */
> +static union nvme_identify *
> +nvme_admin_identify(struct nvme_ctrl *ctrl, u8 cns, u32 nsid)
> +{
> +    union nvme_identify *identify_buf = zalloc_page_aligned_high(4096);

For temporary memory allocations, use memalign_tmp() - using
memalign_high and then freeing the memory can result in memory
fragmentation of the permanent memory pool.
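
Eg, something like (untested):

    union nvme_identify *identify_buf = memalign_tmp(NVME_PAGE_SIZE,
                                                     NVME_PAGE_SIZE);
    if (identify_buf)
        memset(identify_buf, 0, NVME_PAGE_SIZE);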

> +    if (!identify_buf) {
> +        panic("NVMe couldn't allocate identify buffer");
> +    }
> +
> +    struct nvme_sqe *cmd_identify = nvme_get_next_sqe(&ctrl->admin_sq, NVME_SQE_OPC_ADMIN_IDENTIFY,
> +                                                      NULL, identify_buf);
> +    if (!cmd_identify) { panic("admin queue full\n"); }
> +
> +    cmd_identify->nsid = nsid;
> +    cmd_identify->dword[10] = cns;
> +
> +    nvme_commit_sqe(&ctrl->admin_sq);
> +
> +    struct nvme_cqe cqe = nvme_wait(&ctrl->admin_sq);
> +
> +    if (!nvme_is_cqe_success(&cqe)) {
> +        free(identify_buf);
> +        return NULL;
> +    }
> +
> +    return identify_buf;
> +}
> +
> +static struct nvme_identify_ctrl *
> +nvme_admin_identify_ctrl(struct nvme_ctrl *ctrl)
> +{
> +    union nvme_identify *identify_buf = nvme_admin_identify(ctrl, NVME_ADMIN_IDENTIFY_CNS_ID_CTRL, 0);
> +    /* C question: Is it safe to skip the nullptr check here? */

Yes - if identify_buf == NULL then &identify_buf->ctrl will also
always be NULL.  I don't mind the explicit check though, and I suspect
the compiler will optimize it away anyway.

> +    return identify_buf ? &identify_buf->ctrl : NULL;
> +}
> +
> +static struct nvme_identify_ns_list *
> +nvme_admin_identify_get_ns_list(struct nvme_ctrl *ctrl)
> +{
> +    union nvme_identify *identify_buf = nvme_admin_identify(ctrl, NVME_ADMIN_IDENTIFY_CNS_GET_NS_LIST, 0);
> +    return identify_buf ? &identify_buf->ns_list : NULL;
> +}
> +
> +static struct nvme_identify_ns *
> +nvme_admin_identify_ns(struct nvme_ctrl *ctrl, u32 ns_id)
> +{
> +    union nvme_identify *identify_buf = nvme_admin_identify(ctrl, NVME_ADMIN_IDENTIFY_CNS_ID_NS, ns_id);
> +    return identify_buf ? &identify_buf->ns : NULL;
> +}
> +
> +static void
> +nvme_probe_ns(struct nvme_ctrl *ctrl, struct nvme_namespace *ns, u32 ns_id)
> +{
> +    ns->ctrl  = ctrl;
> +    ns->ns_id = ns_id;
> +
> +    struct nvme_identify_ns *id = nvme_admin_identify_ns(ctrl, ns_id);
> +    if (!id) {
> +        dprintf(2, "NVMe couldn't identify namespace %u.\n", ns_id);
> +        goto free_buffer;
> +    }
> +
> +    u8 current_lba_format = id->flbas & 0xF;
> +    if (current_lba_format > id->nlbaf) {
> +        dprintf(2, "NVMe NS %u: current LBA format %u is beyond what the namespace supports (%u)?\n",
> +                ns_id, current_lba_format, id->nlbaf + 1);
> +        goto free_buffer;
> +    }
> +
> +    ns->lba_count = id->nsze;
> +
> +    struct nvme_lba_format *fmt = &id->lbaf[current_lba_format];
> +
> +    ns->block_size    = 1U << fmt->lbads;
> +    ns->metadata_size = fmt->ms;
> +
> +    if (ns->block_size > NVME_PAGE_SIZE) {
> +        panic("Cannot DMA a single block from our buffer: %u vs %u", ns->block_size, NVME_PAGE_SIZE);
> +    }
> +
> +    ns->drive.cntl_id   = ns - ctrl->ns;
> +    ns->drive.removable = 0;
> +    ns->drive.type      = DTYPE_NVME;
> +    ns->drive.blksize   = ns->block_size;
> +    ns->drive.sectors   = ns->lba_count;
> +
> +    ns->dma_buffer = zalloc_page_aligned_high(NVME_PAGE_SIZE);
> +
> +    char *desc = znprintf(MAXDESCSIZE, "NVMe NS %u: %llu MiB (%llu %u-byte blocks + %u-byte metadata)\n",
> +                          ns_id, (ns->lba_count * ns->block_size) >> 20, ns->lba_count, ns->block_size,
> +                          ns->metadata_size);
> +
> +    dprintf(3, "%s", desc);
> +    boot_add_hd(&ns->drive, desc, bootprio_find_pci_device(ctrl->pci));
> +
> + free_buffer:
> +    free(id);
> +}
> +
> +/* Returns 0 on success. */
> +static int
> +nvme_create_io_cq(struct nvme_ctrl *ctrl, struct nvme_cq *cq, u16 q_idx)
> +{
> +    nvme_init_cq(ctrl, cq, q_idx, NVME_PAGE_SIZE / sizeof(struct nvme_cqe));
> +    struct nvme_sqe *cmd_create_cq = nvme_get_next_sqe(&ctrl->admin_sq, NVME_SQE_OPC_ADMIN_CREATE_IO_CQ,
> +                                                       NULL, cq->cqe);
> +    if (!cmd_create_cq) {
> +        return -1;
> +    }
> +
> +    cmd_create_cq->dword[10] = (cq->common.mask << 16) | (q_idx >> 1);
> +    cmd_create_cq->dword[11] = 1 /* physically contiguous */;
> +
> +    nvme_commit_sqe(&ctrl->admin_sq);
> +
> +    struct nvme_cqe cqe = nvme_wait(&ctrl->admin_sq);
> +
> +    if (!nvme_is_cqe_success(&cqe)) {
> +        dprintf(2, "create io cq failed: %08x %08x %08x %08x\n",
> +                cqe.dword[0], cqe.dword[1], cqe.dword[2], cqe.dword[3]);
> +
> +        return -1;
> +    }
> +
> +    return 0;
> +}
> +
> +/* Returns 0 on success. */
> +static int
> +nvme_create_io_sq(struct nvme_ctrl *ctrl, struct nvme_sq *sq, u16 q_idx, struct nvme_cq *cq)
> +{
> +    nvme_init_sq(ctrl, sq, q_idx, NVME_PAGE_SIZE / sizeof(struct nvme_cqe), cq);
> +    struct nvme_sqe *cmd_create_sq = nvme_get_next_sqe(&ctrl->admin_sq, NVME_SQE_OPC_ADMIN_CREATE_IO_SQ,
> +                                                       NULL, sq->sqe);
> +    if (!cmd_create_sq) {
> +        return -1;
> +    }
> +
> +    cmd_create_sq->dword[10] = (sq->common.mask << 16) | (q_idx >> 1);
> +    cmd_create_sq->dword[11] = (q_idx >> 1) << 16 | 1 /* physically contiguous */;
> +    dprintf(3, "sq %p create dword10 %08x dword11 %08x\n", sq, cmd_create_sq->dword[10], cmd_create_sq->dword[11]);
> +
> +    nvme_commit_sqe(&ctrl->admin_sq);
> +
> +    struct nvme_cqe cqe = nvme_wait(&ctrl->admin_sq);
> +
> +    if (!nvme_is_cqe_success(&cqe)) {
> +        dprintf(2, "create io sq failed: %08x %08x %08x %08x\n",
> +                cqe.dword[0], cqe.dword[1], cqe.dword[2], cqe.dword[3]);
> +        return -1;
> +    }
> +
> +    return 0;
> +}
> +
> +/* Reads or writes count sectors from/to buf. Returns DISK_RET_*. The
> +   buffer cannot cross page boundaries. */
> +static int
> +nvme_io_readwrite(struct nvme_namespace *ns, u64 lba, char *buf, u16 count, int write)
> +{
> +    if ((u32)buf & 0x3) panic("buf %p is not DWORD aligned", buf);
> +
> +    if (((u32)buf & ~(NVME_PAGE_SIZE - 1))
> +        != (((u32)buf + ns->block_size * count - 1) & ~(NVME_PAGE_SIZE - 1))) {
> +        panic("IO read crosses page boundary: buf %p bs %u count %u", buf, ns->block_size, count);
> +    }
> +
> +    struct nvme_sqe *io_read = nvme_get_next_sqe(&ns->ctrl->io_sq,
> +                                                 write ? NVME_SQE_OPC_IO_WRITE : NVME_SQE_OPC_IO_READ,
> +                                                 NULL, buf);

It would be preferable if the code were line-wrapped to 80 characters.

> +    io_read->nsid = ns->ns_id;
> +    io_read->dword[10] = (u32)lba;
> +    io_read->dword[11] = (u32)(lba >> 32);
> +    io_read->dword[12] = (1U << 31 /* limited retry */) | (count - 1);
> +
> +    nvme_commit_sqe(&ns->ctrl->io_sq);
> +
> +    struct nvme_cqe cqe = nvme_wait(&ns->ctrl->io_sq);
> +
> +    if (!nvme_is_cqe_success(&cqe)) {
> +        dprintf(2, "read io: %08x %08x %08x %08x\n",
> +                cqe.dword[0], cqe.dword[1], cqe.dword[2], cqe.dword[3]);
> +
> +        return DISK_RET_EBADTRACK;
> +    }
> +
> +    return DISK_RET_SUCCESS;
> +}
> +
> +
> +static int
> +nvme_create_io_queues(struct nvme_ctrl *ctrl)
> +{
> +    if (nvme_create_io_cq(ctrl, &ctrl->io_cq, 3))
> +        return -1;
> +
> +    if (nvme_create_io_sq(ctrl, &ctrl->io_sq, 2, &ctrl->io_cq))
> +        return -1;
> +
> +    return 0;
> +}
> +
> +/* Waits for CSTS.RDY to match rdy. Returns 0 on success. */
> +static int
> +nvme_wait_csts_rdy(struct nvme_ctrl *ctrl, unsigned rdy)
> +{
> +    u32 const max_to = 500 /* ms */ * ((ctrl->reg->cap >> 24) & 0xFFU);
> +    u32 to = timer_calc(max_to);
> +    u32 csts;
> +
> +    while (rdy != ((csts = ctrl->reg->csts) & NVME_CSTS_RDY)) {
> +        cpu_relax();

yield()

> +
> +        if (csts & NVME_CSTS_FATAL) {
> +            dprintf(3, "NVMe fatal error during controller shutdown\n");
> +            return -1;
> +        }
> +
> +        if (timer_check(to)) {
> +            warn_timeout();
> +            return -1;
> +        }
> +    }
> +
> +    return 0;
> +}
> +
> +static void
> +nvme_controller_init(struct nvme_ctrl *ctrl)

It would be better to avoid the _init suffix as functions with an
_init suffix have a special meaning in seabios' boot order.

> +{
> +    pci_enable_busmaster(ctrl->pci);
> +
> +    /* Turn the controller off. */
> +    ctrl->reg->cc = 0;
> +    if (nvme_wait_csts_rdy(ctrl, 0)) {
> +        dprintf(2, "NVMe fatal error during controller shutdown\n");
> +        return;
> +    }
> +
> +    ctrl->doorbell_stride = 4U << ((ctrl->reg->cap >> 32) & 0xF);
> +
> +    nvme_init_cq(ctrl, &ctrl->admin_cq, 1, NVME_PAGE_SIZE / sizeof(struct nvme_cqe));
> +    nvme_init_sq(ctrl, &ctrl->admin_sq, 0, NVME_PAGE_SIZE / sizeof(struct nvme_sqe), &ctrl->admin_cq);
> +
> +    ctrl->reg->aqa = ctrl->admin_cq.common.mask << 16 | ctrl->admin_sq.common.mask;
> +
> +    /* Create the admin queue pair */
> +    if (!ctrl->admin_sq.sqe || !ctrl->admin_cq.cqe) goto out_of_memory;
> +
> +    ctrl->reg->asq = (u32)ctrl->admin_sq.sqe;
> +    ctrl->reg->acq = (u32)ctrl->admin_cq.cqe;
> +
> +    dprintf(3, "  admin submission queue: %p\n", ctrl->admin_sq.sqe);
> +    dprintf(3, "  admin completion queue: %p\n", ctrl->admin_cq.cqe);
> +
> +    ctrl->reg->cc = NVME_CC_EN | (NVME_CQE_SIZE_LOG << 20) | (NVME_SQE_SIZE_LOG << 16 /* IOSQES */);
> +    if (nvme_wait_csts_rdy(ctrl, 1)) {
> +        dprintf(2, "NVMe fatal error while enabling controller\n");
> +        return;
> +    }
> +    /* The admin queue is set up and the controller is ready. Let's figure out
> +       what namespaces we have. */
> +
> +    struct nvme_identify_ctrl *identify = nvme_admin_identify_ctrl(ctrl);
> +
> +    if (!identify) {
> +        dprintf(2, "NVMe couldn't identify controller.\n");
> +        goto failed;
> +    }
> +
> +    /* TODO Print model/serial info. */
> +    dprintf(3, "NVMe has %u namespace%s.\n",
> +            identify->nn, (identify->nn == 1) ? "" : "s");
> +
> +    ctrl->ns_count = identify->nn;
> +    free(identify);
> +
> +    if ((ctrl->ns_count == 0) || nvme_create_io_queues(ctrl)) {
> +        /* No point to continue, if the controller says it doesn't have
> +           namespaces or we couldn't create I/O queues. */
> +        goto failed;
> +    }
> +
> +    ctrl->ns = malloc_fseg(sizeof(*ctrl->ns) * ctrl->ns_count);
> +    if (!ctrl->ns) goto out_of_memory;
> +    memset(ctrl->ns, 0, sizeof(*ctrl->ns) * ctrl->ns_count);
> +
> +    struct nvme_identify_ns_list *ns_list = nvme_admin_identify_get_ns_list(ctrl);
> +    if (!ns_list) {
> +        dprintf(2, "NVMe couldn't get namespace list.\n");
> +        goto failed;
> +    }
> +
> +    /* Populate namespace IDs */
> +    int ns_idx;
> +    for (ns_idx = 0;
> +         ns_idx < ARRAY_SIZE(ns_list->ns_id)
> +             && ns_idx < ctrl->ns_count
> +             && ns_list->ns_id[ns_idx];
> +         ns_idx++) {
> +        nvme_probe_ns(ctrl, &ctrl->ns[ns_idx], ns_list->ns_id[ns_idx]);
> +    }
> +
> +    free(ns_list);
> +
> +    /* If for some reason the namespace list gives us fewer namespaces, we just go along. */
> +    if (ns_idx != ctrl->ns_count) {
> +        dprintf(2, "NVMe namespace list has only %u namespaces?\n", ns_idx);
> +        ctrl->ns_count = ns_idx;
> +    }
> +
> +    dprintf(3, "NVMe initialization complete!\n");
> +    return;
> +
> + out_of_memory:
> +    warn_noalloc();
> + failed:
> +    free(ctrl->admin_sq.sqe);
> +    free(ctrl->admin_cq.cqe);

Shouldn't this free(ctrl->ns) also?
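
Ie, the error path could become:

     out_of_memory:
        warn_noalloc();
     failed:
        free(ctrl->admin_sq.sqe);
        free(ctrl->admin_cq.cqe);
        free(ctrl->ns);
        return;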

> +    return;
> +}
> +
> +/* Initialize an NVMe controller and detect its drives. */
> +static void
> +nvme_controller_setup(struct pci_device *pci)
> +{
> +    if (create_bounce_buf() < 0)
> +        return;
> +
> +    struct nvme_reg volatile *reg = pci_enable_membar(pci, PCI_BASE_ADDRESS_0);
> +    if (!reg)
> +        return;
> +
> +    u32 version = reg->vs;
> +    dprintf(3, "Found NVMe controller with version %u.%u.%u.\n",
> +            version >> 16, (version >> 8) & 0xFF, version & 0xFF);
> +    dprintf(3, "  Capabilities %016llx\n", reg->cap);
> +
> +    if (version < 0x00010100U) {
> +        dprintf(3, "Need at least 1.1.0! Skipping.\n");
> +        return;
> +    }
> +
> +    if (~reg->cap & NVME_CAP_CSS_NVME) {
> +        dprintf(3, "Controller doesn't speak NVMe command set. Skipping.\n");
> +        return;
> +    }
> +
> +    struct nvme_ctrl *ctrl = malloc_fseg(sizeof(*ctrl));

I think this could be malloc_high() here - the fseg space is fairly
limited and there isn't much need to use it for 32bit only drivers.
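
Ie:

    struct nvme_ctrl *ctrl = malloc_high(sizeof(*ctrl));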

> +    if (!ctrl) {
> +        warn_noalloc();
> +        return;
> +    }
> +
> +    memset(ctrl, 0, sizeof(*ctrl));
> +
> +    ctrl->reg = reg;
> +    ctrl->pci = pci;
> +
> +    nvme_controller_init(ctrl);
> +}
> +
> +// Locate and init NVMe controllers
> +static void
> +nvme_scan(void)
> +{
> +    // Scan PCI bus for NVMe controllers
> +    struct pci_device *pci;
> +
> +    foreachpci(pci) {
> +        if (pci->class != PCI_CLASS_STORAGE_NVME)
> +            continue;
> +        if (pci->prog_if != 2 /* as of NVM 1.0e */) {
> +            dprintf(3, "Found incompatble NVMe: prog-if=%02x\n", pci->prog_if);
> +            continue;
> +        }
> +
> +        nvme_controller_setup(pci);

Ideally the code would start a thread here:
    run_thread(nvme_controller_setup, pci);

> +    }
> +}
> +
> +static int
> +nvme_cmd_readwrite(struct nvme_namespace *ns, struct disk_op_s *op, int write)
> +{
> +    int res = DISK_RET_SUCCESS;
> +    u16 const max_blocks = NVME_PAGE_SIZE / ns->block_size;
> +
> +    if (write) {
> +        panic("XXX Writes are implemented, but not tested."
> +              " Remove this panic, if you are sure what you are doing!");

If the driver isn't tested with writes, just return
DISK_RET_EWRITEPROTECT for CMD_FORMAT and CMD_WRITE in
nvme_process_op().  Is it difficult to test writes?
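
Something like this in the nvme_process_op() switch:

    case CMD_WRITE:
    case CMD_FORMAT:
        return DISK_RET_EWRITEPROTECT;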

> +    }
> +
> +    for (u16 i = 0; i < op->count && res == DISK_RET_SUCCESS;) {

Older versions of gcc don't like the 'u16 i' in the for() declaration.
It can be fixed by moving the declaration just above the for loop.
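
Ie:

    u16 i;
    for (i = 0; i < op->count && res == DISK_RET_SUCCESS;) {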

> +        u16 const blocks_remaining = op->count - i;
> +        u16 const blocks = blocks_remaining < max_blocks ? blocks_remaining : max_blocks;
> +        char * const op_buf = op->buf_fl + i * ns->block_size;
> +
> +        if (write) {
> +            memcpy(ns->dma_buffer, op_buf, blocks * ns->block_size);
> +        }
> +
> +        res = nvme_io_readwrite(ns, op->lba + i, ns->dma_buffer, blocks, write);
> +        dprintf(3, "ns %u %s lba %llu+%u: %d\n", ns->ns_id, write ? "write" : "read",
> +                op->lba + i, blocks, res);
> +
> +        if (!write && res == DISK_RET_SUCCESS) {
> +            memcpy(op_buf, ns->dma_buffer, blocks * ns->block_size);
> +        }
> +
> +        i += blocks;
> +    }
> +
> +    return res;
> +}
> +
> +int
> +nvme_process_op(struct disk_op_s *op)
> +{
> +    if (!CONFIG_NVME)
> +        return DISK_RET_SUCCESS;
> +
> +    struct nvme_namespace *ns = container_of(op->drive_gf, struct nvme_namespace, drive);
> +
> +    switch (op->command) {
> +    case CMD_READ:
> +    case CMD_WRITE:
> +        return nvme_cmd_readwrite(ns, op, op->command == CMD_WRITE);
> +    default:
> +        dprintf(2, "NVMe disk op %u not implemented\n", op->command);
> +        return DISK_RET_EBADTRACK;

Various bootloaders tend to make funky calls, so logging here could
flood the log.  This should call default_process_op(op) instead.
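
Ie:

    default:
        return default_process_op(op);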

> +    }
> +}
> +
> +void
> +nvme_setup(void)
> +{
> +    ASSERT32FLAT();
> +    if (!CONFIG_NVME)
> +        return;
> +
> +    dprintf(3, "init nvme\n");
> +    nvme_scan();
> +}
> +
> +/* Local Variables:      */
> +/* indent-tabs-mode: nil */
> +/* c-basic-offset: 4     */
> +/* End:                  */
> diff --git a/src/hw/nvme.h b/src/hw/nvme.h
> new file mode 100644
> index 0000000..1555dc7
> --- /dev/null
> +++ b/src/hw/nvme.h
> @@ -0,0 +1,15 @@
> +// External interfaces for low level NVMe support
> +//
> +// Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved.
> +
> +#ifndef __NVME_H
> +#define __NVME_H
> +
> +#include "block.h" // struct disk_op_s
> +
> +void nvme_setup(void);
> +int nvme_process_op(struct disk_op_s *op);
> +
> +#endif
> +
> +/* EOF */
> diff --git a/src/hw/pci_ids.h b/src/hw/pci_ids.h
> index cdf9b3c..4ac73b4 100644
> --- a/src/hw/pci_ids.h
> +++ b/src/hw/pci_ids.h
> @@ -18,6 +18,7 @@
>  #define PCI_CLASS_STORAGE_SATA		0x0106
>  #define PCI_CLASS_STORAGE_SATA_AHCI	0x010601
>  #define PCI_CLASS_STORAGE_SAS		0x0107
> +#define PCI_CLASS_STORAGE_NVME		0x0108
>  #define PCI_CLASS_STORAGE_OTHER		0x0180
>  
>  #define PCI_BASE_CLASS_NETWORK		0x02
> -- 
> 2.7.4


