This patch enables SeaBIOS to boot from NVMe. Finding namespaces and basic I/O works. Testing has been done in QEMU and so far it works with GRUB, syslinux, and the FreeBSD loader. You need a recent QEMU (>= October 2016), because older versions have buggy NVMe support.
Signed-off-by: Julian Stecklina <jsteckli@amazon.com>
---
 Makefile          |   2 +-
 src/Kconfig       |   6 +
 src/block.c       |   4 +
 src/block.h       |   1 +
 src/hw/nvme-int.h | 199 +++++++++++++++++
 src/hw/nvme.c     | 622 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/hw/nvme.h     |  15 ++
 src/hw/pci_ids.h  |   1 +
 8 files changed, 849 insertions(+), 1 deletion(-)
 create mode 100644 src/hw/nvme-int.h
 create mode 100644 src/hw/nvme.c
 create mode 100644 src/hw/nvme.h
diff --git a/Makefile b/Makefile
index 3b94ee0..946df7e 100644
--- a/Makefile
+++ b/Makefile
@@ -43,7 +43,7 @@ SRC32FLAT=$(SRCBOTH) post.c e820map.c malloc.c romfile.c x86.c optionroms.c \
     fw/paravirt.c fw/shadow.c fw/pciinit.c fw/smm.c fw/smp.c fw/mtrr.c fw/xen.c \
     fw/acpi.c fw/mptable.c fw/pirtable.c fw/smbios.c fw/romfile_loader.c \
     hw/virtio-ring.c hw/virtio-pci.c hw/virtio-blk.c hw/virtio-scsi.c \
-    hw/tpm_drivers.c
+    hw/tpm_drivers.c hw/nvme.c
 SRC32SEG=string.c output.c pcibios.c apm.c stacks.c hw/pci.c hw/serialio.c
 DIRS=src src/hw src/fw vgasrc
diff --git a/src/Kconfig b/src/Kconfig
index 457d082..77ec9c7 100644
--- a/src/Kconfig
+++ b/src/Kconfig
@@ -227,6 +227,12 @@ menu "Hardware support"
         help
             Support floppy images stored in coreboot flash or from QEMU fw_cfg.

+    config NVME
+        depends on DRIVES
+        bool "NVMe controllers"
+        default y
+        help
+            Support for NVMe disk code.
 config PS2PORT
     depends on KEYBOARD || MOUSE

diff --git a/src/block.c b/src/block.c
index f7280cf..d104f6d 100644
--- a/src/block.c
+++ b/src/block.c
@@ -20,6 +20,7 @@
 #include "hw/usb-uas.h" // uas_process_op
 #include "hw/virtio-blk.h" // process_virtio_blk_op
 #include "hw/virtio-scsi.h" // virtio_scsi_process_op
+#include "hw/nvme.h" // nvme_process_op
 #include "malloc.h" // malloc_low
 #include "output.h" // dprintf
 #include "stacks.h" // call32
@@ -502,6 +503,7 @@ block_setup(void)
     megasas_setup();
     pvscsi_setup();
     mpt_scsi_setup();
+    nvme_setup();
 }
// Fallback handler for command requests not implemented by drivers
@@ -571,6 +573,8 @@ process_op_32(struct disk_op_s *op)
         return virtio_scsi_process_op(op);
     case DTYPE_PVSCSI:
         return pvscsi_process_op(op);
+    case DTYPE_NVME:
+        return nvme_process_op(op);
     default:
         return process_op_both(op);
     }

diff --git a/src/block.h b/src/block.h
index 0f15ff9..f03ec38 100644
--- a/src/block.h
+++ b/src/block.h
@@ -82,6 +82,7 @@ struct drive_s {
 #define DTYPE_PVSCSI   0x83
 #define DTYPE_MPT_SCSI 0x84
 #define DTYPE_SDCARD   0x90
+#define DTYPE_NVME     0x91
#define MAXDESCSIZE 80
diff --git a/src/hw/nvme-int.h b/src/hw/nvme-int.h new file mode 100644 index 0000000..873ee71 --- /dev/null +++ b/src/hw/nvme-int.h @@ -0,0 +1,199 @@ +// NVMe datastructures and constants +// +// Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved. + +#ifndef __NVME_INT_H +#define __NVME_INT_H + +#include "types.h" // u32 +#include "pcidevice.h" // struct pci_device + +/* Data structures */ + +/* The register file of a NVMe host controller. This struct follows the naming + scheme in the NVMe specification. */ +struct nvme_reg { + u64 cap; /* controller capabilities */ + u32 vs; /* version */ + u32 intms; /* interrupt mask set */ + u32 intmc; /* interrupt mask clear */ + u32 cc; /* controller configuration */ + u32 _res0; + u32 csts; /* controller status */ + u32 _res1; + u32 aqa; /* admin queue attributes */ + u64 asq; /* admin submission queue base address */ + u64 acq; /* admin completion queue base address */ +}; + +/* Submission queue entry */ +struct nvme_sqe { + union { + u32 dword[16]; + struct { + u32 cdw0; /* Command DWORD 0 */ + u32 nsid; /* Namespace ID */ + u64 _res0; + u64 mptr; /* metadata ptr */ + + u64 dptr_prp1; + u64 dptr_prp2; + }; + }; +}; + +/* Completion queue entry */ +struct nvme_cqe { + union { + u32 dword[4]; + struct { + u32 cdw0; + u32 _res0; + u16 sq_head; + u16 sq_id; + u16 cid; + u16 status; + }; + }; +}; + +/* The common part of every submission or completion queue. */ +struct nvme_queue { + u32 volatile *dbl; /* doorbell */ + u16 mask; /* length - 1 */ +}; + +struct nvme_cq { + struct nvme_queue common; + struct nvme_cqe *cqe; + + /* We have read upto (but not including) this entry in the queue. */ + u16 head; + + /* The current phase bit the controller uses to indicate that it has written + a new entry. This is inverted after each wrap. */ + unsigned phase : 1; +}; + +struct nvme_sq { + struct nvme_queue common; + struct nvme_sqe *sqe; + + /* Corresponding completion queue. We only support a single SQ per CQ. */ + struct nvme_cq *cq; + + /* The last entry the controller has fetched. */ + u16 head; + + /* The last value we have written to the tail doorbell. */ + u16 tail; +}; + +struct nvme_ctrl { + struct pci_device *pci; + struct nvme_reg volatile *reg; + + u32 doorbell_stride; /* in bytes */ + + struct nvme_sq admin_sq; + struct nvme_cq admin_cq; + + u32 ns_count; + struct nvme_namespace *ns; + + struct nvme_sq io_sq; + struct nvme_cq io_cq; +}; + +struct nvme_namespace { + struct drive_s drive; + struct nvme_ctrl *ctrl; + + u32 ns_id; + + u64 lba_count; /* The total amount of sectors. */ + + u32 block_size; + u32 metadata_size; + + /* Page aligned buffer of size NVME_PAGE_SIZE. 
*/ + char *dma_buffer; +}; + +/* Data structures for NVMe admin identify commands */ + +struct nvme_identify_ctrl { + u16 vid; + u16 ssvid; + char sn[20]; + char mn[40]; + char fr[8]; + + char _boring[516 - 72]; + + u32 nn; /* number of namespaces */ +}; + +struct nvme_identify_ns_list { + u32 ns_id[1024]; +}; + +struct nvme_lba_format { + u16 ms; + u8 lbads; + u8 rp; + u8 res; +}; + +struct nvme_identify_ns { + u64 nsze; + u64 ncap; + u64 nuse; + u8 nsfeat; + u8 nlbaf; + u8 flbas; + + char _boring[128 - 27]; + + struct nvme_lba_format lbaf[16]; +}; + +union nvme_identify { + struct nvme_identify_ns ns; + struct nvme_identify_ctrl ctrl; + struct nvme_identify_ns_list ns_list; +}; + +/* NVMe constants */ + +#define NVME_CAP_CSS_NVME (1ULL << 37) + +#define NVME_CSTS_FATAL (1U << 1) +#define NVME_CSTS_RDY (1U << 0) + +#define NVME_CC_EN (1U << 0) + +#define NVME_SQE_OPC_ADMIN_CREATE_IO_SQ 1U +#define NVME_SQE_OPC_ADMIN_CREATE_IO_CQ 5U +#define NVME_SQE_OPC_ADMIN_IDENTIFY 6U + +#define NVME_SQE_OPC_IO_WRITE 1U +#define NVME_SQE_OPC_IO_READ 2U + +#define NVME_ADMIN_IDENTIFY_CNS_ID_NS 0U +#define NVME_ADMIN_IDENTIFY_CNS_ID_CTRL 1U +#define NVME_ADMIN_IDENTIFY_CNS_GET_NS_LIST 2U + +#define NVME_CQE_DW3_P (1U << 16) + +#define NVME_PAGE_SIZE 4096 + +/* Length for the queue entries. */ +#define NVME_SQE_SIZE_LOG 6 +#define NVME_CQE_SIZE_LOG 4 +_Static_assert(sizeof(struct nvme_sqe) == 1U << NVME_SQE_SIZE_LOG, "invalid queue entry size"); +_Static_assert(sizeof(struct nvme_cqe) == 1U << NVME_CQE_SIZE_LOG, "invalid queue entry size"); + +#endif + +/* EOF */ diff --git a/src/hw/nvme.c b/src/hw/nvme.c new file mode 100644 index 0000000..1266d7f --- /dev/null +++ b/src/hw/nvme.c @@ -0,0 +1,622 @@ +// Low level NVMe disk access +// +// Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved. + +#include "blockcmd.h" +#include "malloc.h" // malloc_fseq +#include "string.h" // memset +#include "output.h" // dprintf +#include "pci.h" +#include "pcidevice.h" // foreachpci +#include "pci_ids.h" // PCI_CLASS_STORAGE_NVME +#include "pci_regs.h" // PCI_BASE_ADDRESS_0 +#include "util.h" // boot_add_hd +#include "std/disk.h" // DISK_RET_ +#include "util.h" // timer_calc +#include "x86.h" // cpu_relax + +#include "nvme.h" +#include "nvme-int.h" + +/* Sequentially consistent read. */ +static u32 nvme_seq_read(u32 *p) { return *(_Atomic u32 *)p; } + +/* Sequentially consistent writes. We have a volatile version for doorbell registers. 
*/ +static void nvme_seq_writev(u32 volatile *p, u32 v) { *(_Atomic volatile u32 *)p = v; } + +static void * +zalloc_page_aligned_high(u32 size) +{ + void *res = memalign_high(NVME_PAGE_SIZE, size); + if (res) memset(res, 0, size); + return res; +} + +static void +nvme_init_queue_common(struct nvme_ctrl *ctrl, struct nvme_queue *q, u16 q_idx, u16 length) +{ + memset(q, 0, sizeof(*q)); + q->dbl = (u32 volatile *)((char *)ctrl->reg + 0x1000 + q_idx * ctrl->doorbell_stride); + dprintf(3, " q %p q_idx %u dbl %p\n", q, q_idx, q->dbl); + q->mask = length - 1; +} + +static void +nvme_init_sq(struct nvme_ctrl *ctrl, struct nvme_sq *sq, u16 q_idx, u16 length, + struct nvme_cq *cq) +{ + nvme_init_queue_common(ctrl, &sq->common, q_idx, length); + sq->sqe = zalloc_page_aligned_high(sizeof(*sq->sqe) * length); + dprintf(3, "sq %p q_idx %u sqe %p\n", sq, q_idx, sq->sqe); + sq->cq = cq; + sq->head = 0; + sq->tail = 0; +} + +static void +nvme_init_cq(struct nvme_ctrl *ctrl, struct nvme_cq *cq, u16 q_idx, u16 length) +{ + nvme_init_queue_common(ctrl, &cq->common, q_idx, length); + cq->cqe = zalloc_page_aligned_high(sizeof(*cq->cqe) * length); + + cq->head = 0; + + /* All CQE phase bits are initialized to zero. This means initially we wait + for the host controller to set these to 1. */ + cq->phase = 1; +} + +static int +nvme_poll_cq(struct nvme_cq *cq) +{ + u32 dw3 = nvme_seq_read(&cq->cqe[cq->head].dword[3]); + return (!!(dw3 & NVME_CQE_DW3_P) == cq->phase); +} + +static int +nvme_is_cqe_success(struct nvme_cqe const *cqe) +{ + return (cqe->status & 0xFF) >> 1 == 0; +} + +static struct nvme_cqe +nvme_consume_cqe(struct nvme_sq *sq) +{ + struct nvme_cq *cq = sq->cq; + + if (!nvme_poll_cq(cq)) { + panic("nvme: can't consume cqe when not ready!\n"); + } + + struct nvme_cqe *cqe = &cq->cqe[cq->head]; + u16 cq_next_head = (cq->head + 1) & cq->common.mask; + dprintf(4, "cq %p head %u -> %u\n", cq, cq->head, cq_next_head); + if (cq_next_head < cq->head) { + dprintf(3, "cq %p wrap\n", cq); + cq->phase = ~cq->phase; + } + cq->head = cq_next_head; + + /* Update the submission queue head. */ + if (cqe->sq_head != sq->head) { + sq->head = cqe->sq_head; + dprintf(4, "sq %p advanced to %u\n", sq, cqe->sq_head); + } + + /* Tell the controller that we consumed the completion. */ + nvme_seq_writev(cq->common.dbl, cq->head); + + return *cqe; +} + +static struct nvme_cqe +nvme_timeout_cqe(void) +{ + struct nvme_cqe r; + + /* 0xFF is a vendor specific status code != success. Should be okay for + indicating failure. */ + memset(&r, 0xFF, sizeof(r)); + return r; +} + +static struct nvme_cqe +nvme_wait(struct nvme_sq *sq) +{ + static const unsigned nvme_timeout = 500 /* ms */; + u32 to = timer_calc(nvme_timeout); + while (!nvme_poll_cq(sq->cq)) { + cpu_relax(); + + if (timer_check(to)) { + warn_timeout(); + return nvme_timeout_cqe(); + } + } + + return nvme_consume_cqe(sq); +} + +/* Returns the next submission queue entry (or NULL if the queue is full). It + also fills out Command Dword 0 and clears the rest. 
*/ +static struct nvme_sqe * +nvme_get_next_sqe(struct nvme_sq *sq, u8 opc, void *metadata, void *data) +{ + if (((sq->head + 1) & sq->common.mask) == sq->tail) { + dprintf(3, "submission queue is full"); + return NULL; + } + + struct nvme_sqe *sqe = &sq->sqe[sq->tail]; + dprintf(4, "sq %p next_sqe %u\n", sq, sq->tail); + + memset(sqe, 0, sizeof(*sqe)); + sqe->cdw0 = opc | (sq->tail << 16 /* CID */); + sqe->mptr = (u32)metadata; + sqe->dptr_prp1 = (u32)data; + + if (sqe->dptr_prp1 & (NVME_PAGE_SIZE - 1)) { + panic("data buffer not page aligned: %p\n", data); + } + + return sqe; +} + +/* Call this after you've filled out an sqe that you've got from nvme_get_next_sqe. */ +static void +nvme_commit_sqe(struct nvme_sq *sq) +{ + dprintf(4, "sq %p commit_sqe %u\n", sq, sq->tail); + sq->tail = (sq->tail + 1) & sq->common.mask; + nvme_seq_writev(sq->common.dbl, sq->tail); +} + +/* The caller needs to free the returned pointer, because this is the 80s and we + can't use unique_ptr. */ +static union nvme_identify * +nvme_admin_identify(struct nvme_ctrl *ctrl, u8 cns, u32 nsid) +{ + union nvme_identify *identify_buf = zalloc_page_aligned_high(4096); + if (!identify_buf) { + panic("NVMe couldn't allocate identify buffer"); + } + + struct nvme_sqe *cmd_identify = nvme_get_next_sqe(&ctrl->admin_sq, NVME_SQE_OPC_ADMIN_IDENTIFY, + NULL, identify_buf); + if (!cmd_identify) { panic("admin queue full\n"); } + + cmd_identify->nsid = nsid; + cmd_identify->dword[10] = cns; + + nvme_commit_sqe(&ctrl->admin_sq); + + struct nvme_cqe cqe = nvme_wait(&ctrl->admin_sq); + + if (!nvme_is_cqe_success(&cqe)) { + free(identify_buf); + return NULL; + } + + return identify_buf; +} + +static struct nvme_identify_ctrl * +nvme_admin_identify_ctrl(struct nvme_ctrl *ctrl) +{ + union nvme_identify *identify_buf = nvme_admin_identify(ctrl, NVME_ADMIN_IDENTIFY_CNS_ID_CTRL, 0); + /* C question: Is it safe to skip the nullptr check here? */ + return identify_buf ? &identify_buf->ctrl : NULL; +} + +static struct nvme_identify_ns_list * +nvme_admin_identify_get_ns_list(struct nvme_ctrl *ctrl) +{ + union nvme_identify *identify_buf = nvme_admin_identify(ctrl, NVME_ADMIN_IDENTIFY_CNS_GET_NS_LIST, 0); + return identify_buf ? &identify_buf->ns_list : NULL; +} + +static struct nvme_identify_ns * +nvme_admin_identify_ns(struct nvme_ctrl *ctrl, u32 ns_id) +{ + union nvme_identify *identify_buf = nvme_admin_identify(ctrl, NVME_ADMIN_IDENTIFY_CNS_ID_NS, ns_id); + return identify_buf ? 
&identify_buf->ns : NULL; +} + +static void +nvme_probe_ns(struct nvme_ctrl *ctrl, struct nvme_namespace *ns, u32 ns_id) +{ + ns->ctrl = ctrl; + ns->ns_id = ns_id; + + struct nvme_identify_ns *id = nvme_admin_identify_ns(ctrl, ns_id); + if (!id) { + dprintf(2, "NVMe couldn't identify namespace %u.\n", ns_id); + goto free_buffer; + } + + u8 current_lba_format = id->flbas & 0xF; + if (current_lba_format > id->nlbaf) { + dprintf(2, "NVMe NS %u: current LBA format %u is beyond what the namespace supports (%u)?\n", + ns_id, current_lba_format, id->nlbaf + 1); + goto free_buffer; + } + + ns->lba_count = id->nsze; + + struct nvme_lba_format *fmt = &id->lbaf[current_lba_format]; + + ns->block_size = 1U << fmt->lbads; + ns->metadata_size = fmt->ms; + + if (ns->block_size > NVME_PAGE_SIZE) { + panic("Cannot DMA a single block from our buffer: %u vs %u", ns->block_size, NVME_PAGE_SIZE); + } + + ns->drive.cntl_id = ns - ctrl->ns; + ns->drive.removable = 0; + ns->drive.type = DTYPE_NVME; + ns->drive.blksize = ns->block_size; + ns->drive.sectors = ns->lba_count; + + ns->dma_buffer = zalloc_page_aligned_high(NVME_PAGE_SIZE); + + char *desc = znprintf(MAXDESCSIZE, "NVMe NS %u: %llu MiB (%llu %u-byte blocks + %u-byte metadata)\n", + ns_id, (ns->lba_count * ns->block_size) >> 20, ns->lba_count, ns->block_size, + ns->metadata_size); + + dprintf(3, "%s", desc); + boot_add_hd(&ns->drive, desc, bootprio_find_pci_device(ctrl->pci)); + + free_buffer: + free (id); + } + +/* Returns 0 on success. */ +static int +nvme_create_io_cq(struct nvme_ctrl *ctrl, struct nvme_cq *cq, u16 q_idx) +{ + nvme_init_cq(ctrl, cq, q_idx, NVME_PAGE_SIZE / sizeof(struct nvme_cqe)); + struct nvme_sqe *cmd_create_cq = nvme_get_next_sqe(&ctrl->admin_sq, NVME_SQE_OPC_ADMIN_CREATE_IO_CQ, + NULL, cq->cqe); + if (!cmd_create_cq) { + return -1; + } + + cmd_create_cq->dword[10] = (cq->common.mask << 16) | (q_idx >> 1); + cmd_create_cq->dword[11] = 1 /* physically contiguous */; + + nvme_commit_sqe(&ctrl->admin_sq); + + struct nvme_cqe cqe = nvme_wait(&ctrl->admin_sq); + + if (!nvme_is_cqe_success(&cqe)) { + dprintf(2, "create io cq failed: %08x %08x %08x %08x\n", + cqe.dword[0], cqe.dword[1], cqe.dword[2], cqe.dword[3]); + + return -1; + } + + return 0; +} + +/* Returns 0 on success. */ +static int +nvme_create_io_sq(struct nvme_ctrl *ctrl, struct nvme_sq *sq, u16 q_idx, struct nvme_cq *cq) +{ + nvme_init_sq(ctrl, sq, q_idx, NVME_PAGE_SIZE / sizeof(struct nvme_cqe), cq); + struct nvme_sqe *cmd_create_sq = nvme_get_next_sqe(&ctrl->admin_sq, NVME_SQE_OPC_ADMIN_CREATE_IO_SQ, + NULL, sq->sqe); + if (!cmd_create_sq) { + return -1; + } + + cmd_create_sq->dword[10] = (sq->common.mask << 16) | (q_idx >> 1); + cmd_create_sq->dword[11] = (q_idx >> 1) << 16 | 1 /* physically contiguous */; + dprintf(3, "sq %p create dword10 %08x dword11 %08x\n", sq, cmd_create_sq->dword[10], cmd_create_sq->dword[11]); + + nvme_commit_sqe(&ctrl->admin_sq); + + struct nvme_cqe cqe = nvme_wait(&ctrl->admin_sq); + + if (!nvme_is_cqe_success(&cqe)) { + dprintf(2, "create io sq failed: %08x %08x %08x %08x\n", + cqe.dword[0], cqe.dword[1], cqe.dword[2], cqe.dword[3]); + return -1; + } + + return 0; +} + +/* Reads count sectors into buf. Returns DISK_RET_*. The buffer cannot cross + page boundaries. 
*/ +static int +nvme_io_readwrite(struct nvme_namespace *ns, u64 lba, char *buf, u16 count, int write) +{ + if ((u32)buf & 0x3) panic("buf %p is not DWORD aligned", buf); + + if (((u32)buf & ~(NVME_PAGE_SIZE - 1)) + != (((u32)buf + ns->block_size * count - 1) & ~(NVME_PAGE_SIZE - 1))) { + panic("IO read crosses page boundary: buf %p bs %u count %u", buf, ns->block_size, count); + } + + struct nvme_sqe *io_read = nvme_get_next_sqe(&ns->ctrl->io_sq, + write ? NVME_SQE_OPC_IO_WRITE : NVME_SQE_OPC_IO_READ, + NULL, buf); + io_read->nsid = ns->ns_id; + io_read->dword[10] = (u32)lba; + io_read->dword[11] = (u32)(lba >> 32); + io_read->dword[12] = (1U << 31 /* limited retry */) | (count - 1); + + nvme_commit_sqe(&ns->ctrl->io_sq); + + struct nvme_cqe cqe = nvme_wait(&ns->ctrl->io_sq); + + if (!nvme_is_cqe_success(&cqe)) { + dprintf(2, "read io: %08x %08x %08x %08x\n", + cqe.dword[0], cqe.dword[1], cqe.dword[2], cqe.dword[3]); + + return DISK_RET_EBADTRACK; + } + + return DISK_RET_SUCCESS; +} + + +static int +nvme_create_io_queues(struct nvme_ctrl *ctrl) +{ + if (nvme_create_io_cq(ctrl, &ctrl->io_cq, 3)) + return -1; + + if (nvme_create_io_sq(ctrl, &ctrl->io_sq, 2, &ctrl->io_cq)) + return -1; + + return 0; +} + +/* Waits for CSTS.RDY to match rdy. Returns 0 on success. */ +static int +nvme_wait_csts_rdy(struct nvme_ctrl *ctrl, unsigned rdy) +{ + u32 const max_to = 500 /* ms */ * ((ctrl->reg->cap >> 24) & 0xFFU); + u32 to = timer_calc(max_to); + u32 csts; + + while (rdy != ((csts = ctrl->reg->csts) & NVME_CSTS_RDY)) { + cpu_relax(); + + if (csts & NVME_CSTS_FATAL) { + dprintf(3, "NVMe fatal error during controller shutdown\n"); + return -1; + } + + if (timer_check(to)) { + warn_timeout(); + return -1; + } + } + + return 0; +} + +static void +nvme_controller_init(struct nvme_ctrl *ctrl) +{ + pci_enable_busmaster(ctrl->pci); + + /* Turn the controller off. */ + ctrl->reg->cc = 0; + if (nvme_wait_csts_rdy(ctrl, 0)) { + dprintf(2, "NVMe fatal error during controller shutdown\n"); + return; + } + + ctrl->doorbell_stride = 4U << ((ctrl->reg->cap >> 32) & 0xF); + + nvme_init_cq(ctrl, &ctrl->admin_cq, 1, NVME_PAGE_SIZE / sizeof(struct nvme_cqe)); + nvme_init_sq(ctrl, &ctrl->admin_sq, 0, NVME_PAGE_SIZE / sizeof(struct nvme_sqe), &ctrl->admin_cq); + + ctrl->reg->aqa = ctrl->admin_cq.common.mask << 16 | ctrl->admin_sq.common.mask; + + /* Create the admin queue pair */ + if (!ctrl->admin_sq.sqe || !ctrl->admin_cq.cqe) goto out_of_memory; + + ctrl->reg->asq = (u32)ctrl->admin_sq.sqe; + ctrl->reg->acq = (u32)ctrl->admin_cq.cqe; + + dprintf(3, " admin submission queue: %p\n", ctrl->admin_sq.sqe); + dprintf(3, " admin completion queue: %p\n", ctrl->admin_cq.cqe); + + ctrl->reg->cc = NVME_CC_EN | (NVME_CQE_SIZE_LOG << 20) | (NVME_SQE_SIZE_LOG << 16 /* IOSQES */); + if (nvme_wait_csts_rdy(ctrl, 1)) { + dprintf(2, "NVMe fatal error while enabling controller\n"); + return; + } + /* The admin queue is set up and the controller is ready. Let's figure out + what namespaces we have. */ + + struct nvme_identify_ctrl *identify = nvme_admin_identify_ctrl(ctrl); + + if (!identify) { + dprintf(2, "NVMe couldn't identify controller.\n"); + goto failed; + } + + /* TODO Print model/serial info. */ + dprintf(3, "NVMe has %u namespace%s.\n", + identify->nn, (identify->nn == 1) ? "" : "s"); + + ctrl->ns_count = identify->nn; + free(identify); + + if ((ctrl->ns_count == 0) || nvme_create_io_queues(ctrl)) { + /* No point to continue, if the controller says it doesn't have + namespaces or we couldn't create I/O queues. 
*/ + goto failed; + } + + ctrl->ns = malloc_fseg(sizeof(*ctrl->ns) * ctrl->ns_count); + if (!ctrl->ns) goto out_of_memory; + memset(ctrl->ns, 0, sizeof(*ctrl->ns) * ctrl->ns_count); + + struct nvme_identify_ns_list *ns_list = nvme_admin_identify_get_ns_list(ctrl); + if (!ns_list) { + dprintf(2, "NVMe couldn't get namespace list.\n"); + goto failed; + } + + /* Populate namespace IDs */ + int ns_idx; + for (ns_idx = 0; + ns_idx < ARRAY_SIZE(ns_list->ns_id) + && ns_idx < ctrl->ns_count + && ns_list->ns_id[ns_idx]; + ns_idx++) { + nvme_probe_ns(ctrl, &ctrl->ns[ns_idx], ns_list->ns_id[ns_idx]); + } + + free(ns_list); + + /* If for some reason the namespace list gives us fewer namespaces, we just go along. */ + if (ns_idx != ctrl->ns_count) { + dprintf(2, "NVMe namespace list has only %u namespaces?\n", ns_idx); + ctrl->ns_count = ns_idx; + } + + dprintf(3, "NVMe initialization complete!\n"); + return; + + out_of_memory: + warn_noalloc(); + failed: + free(ctrl->admin_sq.sqe); + free(ctrl->admin_cq.cqe); + return; +} + +/* Initialize an NVMe controller and detect its drives. */ +static void +nvme_controller_setup(struct pci_device *pci) +{ + if (create_bounce_buf() < 0) + return; + + struct nvme_reg volatile *reg = pci_enable_membar(pci, PCI_BASE_ADDRESS_0); + if (!reg) + return; + + u32 version = reg->vs; + dprintf(3, "Found NVMe controller with version %u.%u.%u.\n", + version >> 16, (version >> 8) & 0xFF, version & 0xFF); + dprintf(3, " Capabilities %016llx\n", reg->cap); + + if (version < 0x00010100U) { + dprintf(3, "Need at least 1.1.0! Skipping.\n"); + return; + } + + if (~reg->cap & NVME_CAP_CSS_NVME) { + dprintf(3, "Controller doesn't speak NVMe command set. Skipping.\n"); + return; + } + + struct nvme_ctrl *ctrl = malloc_fseg(sizeof(*ctrl)); + if (!ctrl) { + warn_noalloc(); + return; + } + + memset(ctrl, 0, sizeof(*ctrl)); + + ctrl->reg = reg; + ctrl->pci = pci; + + nvme_controller_init(ctrl); +} + +// Locate and init NVMe controllers +static void +nvme_scan(void) +{ + // Scan PCI bus for ATA adapters + struct pci_device *pci; + + foreachpci(pci) { + if (pci->class != PCI_CLASS_STORAGE_NVME) + continue; + if (pci->prog_if != 2 /* as of NVM 1.0e */) { + dprintf(3, "Found incompatble NVMe: prog-if=%02x\n", pci->prog_if); + continue; + } + + nvme_controller_setup(pci); + } +} + +static int +nvme_cmd_readwrite(struct nvme_namespace *ns, struct disk_op_s *op, int write) +{ + int res = DISK_RET_SUCCESS; + u16 const max_blocks = NVME_PAGE_SIZE / ns->block_size; + + if (write) { + panic("XXX Writes are implemented, but not tested." + " Remove this panic, if you are sure what you are doing!"); + } + + for (u16 i = 0; i < op->count || res != DISK_RET_SUCCESS;) { + u16 const blocks_remaining = op->count - i; + u16 const blocks = blocks_remaining < max_blocks ? blocks_remaining : max_blocks; + char * const op_buf = op->buf_fl + i * ns->block_size; + + if (write) { + memcpy(ns->dma_buffer, op_buf, blocks * ns->block_size); + } + + res = nvme_io_readwrite(ns, op->lba + i, ns->dma_buffer, blocks, write); + dprintf(3, "ns %u %s lba %llu+%u: %d\n", ns->ns_id, write ? 
"write" : "read", + op->lba + i, blocks, res); + + if (!write && res == DISK_RET_SUCCESS) { + memcpy(op_buf, ns->dma_buffer, blocks * ns->block_size); + } + + i += blocks; + } + + return res; +} + +int +nvme_process_op(struct disk_op_s *op) +{ + if (!CONFIG_NVME) + return DISK_RET_SUCCESS; + + struct nvme_namespace *ns = container_of(op->drive_gf, struct nvme_namespace, drive); + + switch (op->command) { + case CMD_READ: + case CMD_WRITE: + return nvme_cmd_readwrite(ns, op, op->command == CMD_WRITE); + default: + dprintf(2, "NVMe disk op %u not implemented\n", op->command); + return DISK_RET_EBADTRACK; + } +} + +void +nvme_setup(void) +{ + ASSERT32FLAT(); + if (!CONFIG_NVME) + return; + + dprintf(3, "init nvme\n"); + nvme_scan(); +} + +/* Local Variables: */ +/* indent-tabs-mode: nil */ +/* c-basic-offset: 4 */ +/* End: */ diff --git a/src/hw/nvme.h b/src/hw/nvme.h new file mode 100644 index 0000000..1555dc7 --- /dev/null +++ b/src/hw/nvme.h @@ -0,0 +1,15 @@ +// External interfaces for low level NVMe support +// +// Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved. + +#ifndef __NVME_H +#define __NVME_H + +#include "block.h" // struct disk_op_s + +void nvme_setup(void); +int nvme_process_op(struct disk_op_s *op); + +#endif + +/* EOF */ diff --git a/src/hw/pci_ids.h b/src/hw/pci_ids.h index cdf9b3c..4ac73b4 100644 --- a/src/hw/pci_ids.h +++ b/src/hw/pci_ids.h @@ -18,6 +18,7 @@ #define PCI_CLASS_STORAGE_SATA 0x0106 #define PCI_CLASS_STORAGE_SATA_AHCI 0x010601 #define PCI_CLASS_STORAGE_SAS 0x0107 +#define PCI_CLASS_STORAGE_NVME 0x0108 #define PCI_CLASS_STORAGE_OTHER 0x0180
#define PCI_BASE_CLASS_NETWORK 0x02
On Fri, Jan 20, 2017 at 12:26:25PM -0800, Julian Stecklina wrote:
This patch enables SeaBIOS to boot from NVMe. Finding namespaces and basic I/O works. Testing has been done in QEMU and so far it works with GRUB, syslinux, and the FreeBSD loader. You need a recent QEMU (>= October 2016), because older versions have buggy NVMe support.
Thanks. I haven't had a chance to do a review, but one thing I noticed:
--- /dev/null
+++ b/src/hw/nvme-int.h
@@ -0,0 +1,199 @@
+// NVMe datastructures and constants
+//
+// Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved.
The copyright would need to say the file is licensed under the LGPLv3. Preference would be to use what the other files have:
// This file may be distributed under the terms of the GNU LGPLv3 license.
-Kevin
On Fri, Jan 20, 2017 at 12:26:25PM -0800, Julian Stecklina wrote:
This patch enables SeaBIOS to boot from NVMe. Finding namespaces and basic I/O works. Testing has been done in QEMU and so far it works with GRUB, syslinux, and the FreeBSD loader. You need a recent QEMU (>= October 2016), because older versions have buggy NVMe support.
Thanks. See my comments below. Mostly minor things I noticed.
Signed-off-by: Julian Stecklina <jsteckli@amazon.com>
 Makefile          |   2 +-
 src/Kconfig       |   6 +
 src/block.c       |   4 +
 src/block.h       |   1 +
 src/hw/nvme-int.h | 199 +++++++++++++++++
 src/hw/nvme.c     | 622 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/hw/nvme.h     |  15 ++
 src/hw/pci_ids.h  |   1 +
 8 files changed, 849 insertions(+), 1 deletion(-)
 create mode 100644 src/hw/nvme-int.h
 create mode 100644 src/hw/nvme.c
 create mode 100644 src/hw/nvme.h
diff --git a/Makefile b/Makefile
index 3b94ee0..946df7e 100644
--- a/Makefile
+++ b/Makefile
@@ -43,7 +43,7 @@ SRC32FLAT=$(SRCBOTH) post.c e820map.c malloc.c romfile.c x86.c optionroms.c \
     fw/paravirt.c fw/shadow.c fw/pciinit.c fw/smm.c fw/smp.c fw/mtrr.c fw/xen.c \
     fw/acpi.c fw/mptable.c fw/pirtable.c fw/smbios.c fw/romfile_loader.c \
     hw/virtio-ring.c hw/virtio-pci.c hw/virtio-blk.c hw/virtio-scsi.c \
-    hw/tpm_drivers.c
+    hw/tpm_drivers.c hw/nvme.c
 SRC32SEG=string.c output.c pcibios.c apm.c stacks.c hw/pci.c hw/serialio.c
 DIRS=src src/hw src/fw vgasrc
diff --git a/src/Kconfig b/src/Kconfig
index 457d082..77ec9c7 100644
--- a/src/Kconfig
+++ b/src/Kconfig
@@ -227,6 +227,12 @@ menu "Hardware support"
         help
             Support floppy images stored in coreboot flash or from QEMU fw_cfg.

+    config NVME
+        depends on DRIVES
+        bool "NVMe controllers"
+        default y
+        help
+            Support for NVMe disk code.
Is this device also available in real hardware? Is it expected to work on real hardware and/or has it been tested?
If it hasn't been tested on real hardware, make this dependent on QEMU_HARDWARE and add a runningOnQEMU() runtime check (see lsi-scsi.c for an example).
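For example, roughly (a sketch following the lsi-scsi.c pattern; runningOnQEMU() is declared in fw/paravirt.h, the other names are from the patch):

void
nvme_setup(void)
{
    ASSERT32FLAT();
    if (!CONFIG_NVME)
        return;
    /* Untested on real hardware so far - only probe when running on QEMU. */
    if (!runningOnQEMU())
        return;
    dprintf(3, "init nvme\n");
    nvme_scan();
}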
 config PS2PORT
     depends on KEYBOARD || MOUSE
diff --git a/src/block.c b/src/block.c
index f7280cf..d104f6d 100644
--- a/src/block.c
+++ b/src/block.c
@@ -20,6 +20,7 @@
 #include "hw/usb-uas.h" // uas_process_op
 #include "hw/virtio-blk.h" // process_virtio_blk_op
 #include "hw/virtio-scsi.h" // virtio_scsi_process_op
+#include "hw/nvme.h" // nvme_process_op
 #include "malloc.h" // malloc_low
 #include "output.h" // dprintf
 #include "stacks.h" // call32
@@ -502,6 +503,7 @@ block_setup(void)
     megasas_setup();
     pvscsi_setup();
     mpt_scsi_setup();
+    nvme_setup();
 }
// Fallback handler for command requests not implemented by drivers
@@ -571,6 +573,8 @@ process_op_32(struct disk_op_s *op)
         return virtio_scsi_process_op(op);
     case DTYPE_PVSCSI:
         return pvscsi_process_op(op);
+    case DTYPE_NVME:
+        return nvme_process_op(op);
     default:
         return process_op_both(op);
     }
diff --git a/src/block.h b/src/block.h
index 0f15ff9..f03ec38 100644
--- a/src/block.h
+++ b/src/block.h
@@ -82,6 +82,7 @@ struct drive_s {
 #define DTYPE_PVSCSI   0x83
 #define DTYPE_MPT_SCSI 0x84
 #define DTYPE_SDCARD   0x90
+#define DTYPE_NVME     0x91
#define MAXDESCSIZE 80
diff --git a/src/hw/nvme-int.h b/src/hw/nvme-int.h
new file mode 100644
index 0000000..873ee71
--- /dev/null
+++ b/src/hw/nvme-int.h
@@ -0,0 +1,199 @@
+// NVMe datastructures and constants
+//
+// Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#ifndef __NVME_INT_H +#define __NVME_INT_H
+#include "types.h" // u32 +#include "pcidevice.h" // struct pci_device
+/* Data structures */
+/* The register file of a NVMe host controller. This struct follows the naming
- scheme in the NVMe specification. */
+struct nvme_reg {
- u64 cap; /* controller capabilities */
- u32 vs; /* version */
- u32 intms; /* interrupt mask set */
- u32 intmc; /* interrupt mask clear */
- u32 cc; /* controller configuration */
- u32 _res0;
- u32 csts; /* controller status */
- u32 _res1;
- u32 aqa; /* admin queue attributes */
- u64 asq; /* admin submission queue base address */
- u64 acq; /* admin completion queue base address */
+};
+/* Submission queue entry */ +struct nvme_sqe {
- union {
u32 dword[16];
struct {
u32 cdw0; /* Command DWORD 0 */
u32 nsid; /* Namespace ID */
u64 _res0;
u64 mptr; /* metadata ptr */
u64 dptr_prp1;
u64 dptr_prp2;
};
- };
+};
+/* Completion queue entry */ +struct nvme_cqe {
- union {
u32 dword[4];
struct {
u32 cdw0;
u32 _res0;
u16 sq_head;
u16 sq_id;
u16 cid;
u16 status;
};
- };
+};
+/* The common part of every submission or completion queue. */ +struct nvme_queue {
- u32 volatile *dbl; /* doorbell */
- u16 mask; /* length - 1 */
+};
+struct nvme_cq {
- struct nvme_queue common;
- struct nvme_cqe *cqe;
- /* We have read up to (but not including) this entry in the queue. */
- u16 head;
- /* The current phase bit the controller uses to indicate that it has written
a new entry. This is inverted after each wrap. */
- unsigned phase : 1;
+};
+struct nvme_sq {
- struct nvme_queue common;
- struct nvme_sqe *sqe;
- /* Corresponding completion queue. We only support a single SQ per CQ. */
- struct nvme_cq *cq;
- /* The last entry the controller has fetched. */
- u16 head;
- /* The last value we have written to the tail doorbell. */
- u16 tail;
+};
+struct nvme_ctrl {
- struct pci_device *pci;
- struct nvme_reg volatile *reg;
- u32 doorbell_stride; /* in bytes */
- struct nvme_sq admin_sq;
- struct nvme_cq admin_cq;
- u32 ns_count;
- struct nvme_namespace *ns;
- struct nvme_sq io_sq;
- struct nvme_cq io_cq;
+};
+struct nvme_namespace {
- struct drive_s drive;
- struct nvme_ctrl *ctrl;
- u32 ns_id;
- u64 lba_count; /* The total number of sectors. */
- u32 block_size;
- u32 metadata_size;
- /* Page aligned buffer of size NVME_PAGE_SIZE. */
- char *dma_buffer;
+};
+/* Data structures for NVMe admin identify commands */
+struct nvme_identify_ctrl {
- u16 vid;
- u16 ssvid;
- char sn[20];
- char mn[40];
- char fr[8];
- char _boring[516 - 72];
- u32 nn; /* number of namespaces */
+};
+struct nvme_identify_ns_list {
- u32 ns_id[1024];
+};
+struct nvme_lba_format {
- u16 ms;
- u8 lbads;
- u8 rp;
- u8 res;
+};
+struct nvme_identify_ns {
- u64 nsze;
- u64 ncap;
- u64 nuse;
- u8 nsfeat;
- u8 nlbaf;
- u8 flbas;
- char _boring[128 - 27];
- struct nvme_lba_format lbaf[16];
+};
+union nvme_identify {
- struct nvme_identify_ns ns;
- struct nvme_identify_ctrl ctrl;
- struct nvme_identify_ns_list ns_list;
+};
+/* NVMe constants */
+#define NVME_CAP_CSS_NVME (1ULL << 37)
+#define NVME_CSTS_FATAL (1U << 1) +#define NVME_CSTS_RDY (1U << 0)
+#define NVME_CC_EN (1U << 0)
+#define NVME_SQE_OPC_ADMIN_CREATE_IO_SQ 1U +#define NVME_SQE_OPC_ADMIN_CREATE_IO_CQ 5U +#define NVME_SQE_OPC_ADMIN_IDENTIFY 6U
+#define NVME_SQE_OPC_IO_WRITE 1U +#define NVME_SQE_OPC_IO_READ 2U
+#define NVME_ADMIN_IDENTIFY_CNS_ID_NS 0U +#define NVME_ADMIN_IDENTIFY_CNS_ID_CTRL 1U +#define NVME_ADMIN_IDENTIFY_CNS_GET_NS_LIST 2U
+#define NVME_CQE_DW3_P (1U << 16)
+#define NVME_PAGE_SIZE 4096
+/* Length for the queue entries. */ +#define NVME_SQE_SIZE_LOG 6 +#define NVME_CQE_SIZE_LOG 4 +_Static_assert(sizeof(struct nvme_sqe) == 1U << NVME_SQE_SIZE_LOG, "invalid queue entry size"); +_Static_assert(sizeof(struct nvme_cqe) == 1U << NVME_CQE_SIZE_LOG, "invalid queue entry size");
Current SeaBIOS supports being compiled on gcc v3.4 and this construct isn't supported there. That compiler is pretty old, but for now it's going to be easier to just change this in your patch.
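A common pre-C11 idiom that would work here (the macro name is made up) is a negative array size:

/* Fails to compile when cond is false; C89-compatible. */
#define NVME_STATIC_ASSERT(name, cond) \
    typedef char nvme_assert_##name[(cond) ? 1 : -1]

NVME_STATIC_ASSERT(sqe_size, sizeof(struct nvme_sqe) == 1U << NVME_SQE_SIZE_LOG);
NVME_STATIC_ASSERT(cqe_size, sizeof(struct nvme_cqe) == 1U << NVME_CQE_SIZE_LOG);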
+#endif
+/* EOF */ diff --git a/src/hw/nvme.c b/src/hw/nvme.c new file mode 100644 index 0000000..1266d7f --- /dev/null +++ b/src/hw/nvme.c @@ -0,0 +1,622 @@ +// Low level NVMe disk access +// +// Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#include "blockcmd.h" +#include "malloc.h" // malloc_fseq +#include "string.h" // memset +#include "output.h" // dprintf +#include "pci.h" +#include "pcidevice.h" // foreachpci +#include "pci_ids.h" // PCI_CLASS_STORAGE_NVME +#include "pci_regs.h" // PCI_BASE_ADDRESS_0 +#include "util.h" // boot_add_hd +#include "std/disk.h" // DISK_RET_ +#include "util.h" // timer_calc +#include "x86.h" // cpu_relax
+#include "nvme.h" +#include "nvme-int.h"
+/* Sequentially consistent read. */ +static u32 nvme_seq_read(u32 *p) { return *(_Atomic u32 *)p; }
+/* Sequentially consistent writes. We have a volatile version for doorbell registers. */ +static void nvme_seq_writev(u32 volatile *p, u32 v) { *(_Atomic volatile u32 *)p = v; }
Same gcc v3.4 issue with _Atomic. Is _Atomic necessary or just a decoration? The seabios code typically uses readl/writel to make the accesses atomic.
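For comparison, the readl/writel versions (src/x86.h) would be roughly:

/* readl/writel already compile down to single volatile 32-bit accesses,
   which is all that's needed here - no C11 atomics required. */
static u32 nvme_seq_read(u32 *p) { return readl(p); }
static void nvme_seq_writev(u32 volatile *p, u32 v) { writel((void*)p, v); }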
+static void * +zalloc_page_aligned_high(u32 size) +{
- void *res = memalign_high(NVME_PAGE_SIZE, size);
- if (res) memset(res, 0, size);
- return res;
+}
+static void +nvme_init_queue_common(struct nvme_ctrl *ctrl, struct nvme_queue *q, u16 q_idx, u16 length) +{
- memset(q, 0, sizeof(*q));
- q->dbl = (u32 volatile *)((char *)ctrl->reg + 0x1000 + q_idx * ctrl->doorbell_stride);
- dprintf(3, " q %p q_idx %u dbl %p\n", q, q_idx, q->dbl);
- q->mask = length - 1;
+}
+static void +nvme_init_sq(struct nvme_ctrl *ctrl, struct nvme_sq *sq, u16 q_idx, u16 length,
struct nvme_cq *cq)
+{
- nvme_init_queue_common(ctrl, &sq->common, q_idx, length);
- sq->sqe = zalloc_page_aligned_high(sizeof(*sq->sqe) * length);
- dprintf(3, "sq %p q_idx %u sqe %p\n", sq, q_idx, sq->sqe);
- sq->cq = cq;
- sq->head = 0;
- sq->tail = 0;
+}
+static void +nvme_init_cq(struct nvme_ctrl *ctrl, struct nvme_cq *cq, u16 q_idx, u16 length) +{
- nvme_init_queue_common(ctrl, &cq->common, q_idx, length);
- cq->cqe = zalloc_page_aligned_high(sizeof(*cq->cqe) * length);
- cq->head = 0;
- /* All CQE phase bits are initialized to zero. This means initially we wait
for the host controller to set these to 1. */
- cq->phase = 1;
+}
+static int +nvme_poll_cq(struct nvme_cq *cq) +{
- u32 dw3 = nvme_seq_read(&cq->cqe[cq->head].dword[3]);
- return (!!(dw3 & NVME_CQE_DW3_P) == cq->phase);
+}
+static int +nvme_is_cqe_success(struct nvme_cqe const *cqe) +{
- return (cqe->status & 0xFF) >> 1 == 0;
+}
+static struct nvme_cqe +nvme_consume_cqe(struct nvme_sq *sq) +{
- struct nvme_cq *cq = sq->cq;
- if (!nvme_poll_cq(cq)) {
panic("nvme: can't consume cqe when not ready!\n");
We don't want to panic() in any driver - halting the BIOS can make a machine inoperable. Instead, this (and the other locations) should call warn_internalerror() and return early.
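Concretely, the first one could look like this (reusing the patch's all-0xFF nvme_timeout_cqe() as the failure value; that helper would have to move above this function):

static struct nvme_cqe
nvme_consume_cqe(struct nvme_sq *sq)
{
    struct nvme_cq *cq = sq->cq;

    if (!nvme_poll_cq(cq)) {
        /* Warn and bail out with a failure completion instead of
           halting the machine. */
        warn_internalerror();
        return nvme_timeout_cqe();
    }

    struct nvme_cqe *cqe = &cq->cqe[cq->head];
    u16 cq_next_head = (cq->head + 1) & cq->common.mask;
    dprintf(4, "cq %p head %u -> %u\n", cq, cq->head, cq_next_head);
    if (cq_next_head < cq->head) {
        dprintf(3, "cq %p wrap\n", cq);
        cq->phase = ~cq->phase;
    }
    cq->head = cq_next_head;

    /* Update the submission queue head. */
    if (cqe->sq_head != sq->head) {
        sq->head = cqe->sq_head;
        dprintf(4, "sq %p advanced to %u\n", sq, cqe->sq_head);
    }

    /* Tell the controller that we consumed the completion. */
    nvme_seq_writev(cq->common.dbl, cq->head);

    return *cqe;
}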
- }
- struct nvme_cqe *cqe = &cq->cqe[cq->head];
- u16 cq_next_head = (cq->head + 1) & cq->common.mask;
- dprintf(4, "cq %p head %u -> %u\n", cq, cq->head, cq_next_head);
- if (cq_next_head < cq->head) {
dprintf(3, "cq %p wrap\n", cq);
cq->phase = ~cq->phase;
- }
- cq->head = cq_next_head;
- /* Update the submission queue head. */
- if (cqe->sq_head != sq->head) {
sq->head = cqe->sq_head;
dprintf(4, "sq %p advanced to %u\n", sq, cqe->sq_head);
- }
- /* Tell the controller that we consumed the completion. */
- nvme_seq_writev(cq->common.dbl, cq->head);
- return *cqe;
+}
+static struct nvme_cqe +nvme_timeout_cqe(void) +{
- struct nvme_cqe r;
- /* 0xFF is a vendor specific status code != success. Should be okay for
indicating failure. */
- memset(&r, 0xFF, sizeof(r));
- return r;
+}
+static struct nvme_cqe +nvme_wait(struct nvme_sq *sq) +{
- static const unsigned nvme_timeout = 500 /* ms */;
- u32 to = timer_calc(nvme_timeout);
- while (!nvme_poll_cq(sq->cq)) {
cpu_relax();
Unless I'm missing something subtle, this should be yield() so that irqs may be serviced.
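I.e. the same loop with yield() (stacks.h):

static struct nvme_cqe
nvme_wait(struct nvme_sq *sq)
{
    static const unsigned nvme_timeout = 500; /* ms */
    u32 to = timer_calc(nvme_timeout);
    while (!nvme_poll_cq(sq->cq)) {
        /* yield() instead of cpu_relax() so timer irqs are serviced
           and other boot threads can run while we poll. */
        yield();

        if (timer_check(to)) {
            warn_timeout();
            return nvme_timeout_cqe();
        }
    }

    return nvme_consume_cqe(sq);
}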
if (timer_check(to)) {
warn_timeout();
return nvme_timeout_cqe();
}
- }
- return nvme_consume_cqe(sq);
+}
+/* Returns the next submission queue entry (or NULL if the queue is full). It
- also fills out Command Dword 0 and clears the rest. */
+static struct nvme_sqe * +nvme_get_next_sqe(struct nvme_sq *sq, u8 opc, void *metadata, void *data) +{
- if (((sq->head + 1) & sq->common.mask) == sq->tail) {
dprintf(3, "submission queue is full");
return NULL;
- }
- struct nvme_sqe *sqe = &sq->sqe[sq->tail];
- dprintf(4, "sq %p next_sqe %u\n", sq, sq->tail);
- memset(sqe, 0, sizeof(*sqe));
- sqe->cdw0 = opc | (sq->tail << 16 /* CID */);
- sqe->mptr = (u32)metadata;
- sqe->dptr_prp1 = (u32)data;
- if (sqe->dptr_prp1 & (NVME_PAGE_SIZE - 1)) {
panic("data buffer not page aligned: %p\n", data);
- }
- return sqe;
+}
+/* Call this after you've filled out an sqe that you've got from nvme_get_next_sqe. */ +static void +nvme_commit_sqe(struct nvme_sq *sq) +{
- dprintf(4, "sq %p commit_sqe %u\n", sq, sq->tail);
- sq->tail = (sq->tail + 1) & sq->common.mask;
- nvme_seq_writev(sq->common.dbl, sq->tail);
+}
+/* The caller needs to free the returned pointer, because this is the 80s and we
- can't use unique_ptr. */
+static union nvme_identify * +nvme_admin_identify(struct nvme_ctrl *ctrl, u8 cns, u32 nsid) +{
- union nvme_identify *identify_buf = zalloc_page_aligned_high(4096);
For temporary memory allocations, use memalign_tmp() - using memalign_high and then freeing the memory can result in memory fragmentation of the permanent memory pool.
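E.g. a tmp-zone twin of the patch's helper (the name is just a sketch; free() works on any zone):

/* Same as zalloc_page_aligned_high(), but from the temporary pool so
   that freeing it can't fragment the permanent memory zones. */
static void *
zalloc_page_aligned_tmp(u32 size)
{
    void *res = memalign_tmp(NVME_PAGE_SIZE, size);
    if (res)
        memset(res, 0, size);
    return res;
}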
- if (!identify_buf) {
panic("NVMe couldn't allocate identify buffer");
- }
- struct nvme_sqe *cmd_identify = nvme_get_next_sqe(&ctrl->admin_sq, NVME_SQE_OPC_ADMIN_IDENTIFY,
NULL, identify_buf);
- if (!cmd_identify) { panic("admin queue full\n"); }
- cmd_identify->nsid = nsid;
- cmd_identify->dword[10] = cns;
- nvme_commit_sqe(&ctrl->admin_sq);
- struct nvme_cqe cqe = nvme_wait(&ctrl->admin_sq);
- if (!nvme_is_cqe_success(&cqe)) {
free(identify_buf);
return NULL;
- }
- return identify_buf;
+}
+static struct nvme_identify_ctrl * +nvme_admin_identify_ctrl(struct nvme_ctrl *ctrl) +{
- union nvme_identify *identify_buf = nvme_admin_identify(ctrl, NVME_ADMIN_IDENTIFY_CNS_ID_CTRL, 0);
- /* C question: Is it safe to skip the nullptr check here? */
Yes - if identify_buf == NULL then &identify_buf->ctrl will also always be NULL. I don't mind the explicit check though, and I suspect the compiler will optimize it away anyway.
- return identify_buf ? &identify_buf->ctrl : NULL;
+}
+static struct nvme_identify_ns_list * +nvme_admin_identify_get_ns_list(struct nvme_ctrl *ctrl) +{
- union nvme_identify *identify_buf = nvme_admin_identify(ctrl, NVME_ADMIN_IDENTIFY_CNS_GET_NS_LIST, 0);
- return identify_buf ? &identify_buf->ns_list : NULL;
+}
+static struct nvme_identify_ns * +nvme_admin_identify_ns(struct nvme_ctrl *ctrl, u32 ns_id) +{
- union nvme_identify *identify_buf = nvme_admin_identify(ctrl, NVME_ADMIN_IDENTIFY_CNS_ID_NS, ns_id);
- return identify_buf ? &identify_buf->ns : NULL;
+}
+static void +nvme_probe_ns(struct nvme_ctrl *ctrl, struct nvme_namespace *ns, u32 ns_id) +{
- ns->ctrl = ctrl;
- ns->ns_id = ns_id;
- struct nvme_identify_ns *id = nvme_admin_identify_ns(ctrl, ns_id);
- if (!id) {
dprintf(2, "NVMe couldn't identify namespace %u.\n", ns_id);
goto free_buffer;
- }
- u8 current_lba_format = id->flbas & 0xF;
- if (current_lba_format > id->nlbaf) {
dprintf(2, "NVMe NS %u: current LBA format %u is beyond what the namespace supports (%u)?\n",
ns_id, current_lba_format, id->nlbaf + 1);
goto free_buffer;
- }
- ns->lba_count = id->nsze;
- struct nvme_lba_format *fmt = &id->lbaf[current_lba_format];
- ns->block_size = 1U << fmt->lbads;
- ns->metadata_size = fmt->ms;
- if (ns->block_size > NVME_PAGE_SIZE) {
panic("Cannot DMA a single block from our buffer: %u vs %u", ns->block_size, NVME_PAGE_SIZE);
- }
- ns->drive.cntl_id = ns - ctrl->ns;
- ns->drive.removable = 0;
- ns->drive.type = DTYPE_NVME;
- ns->drive.blksize = ns->block_size;
- ns->drive.sectors = ns->lba_count;
- ns->dma_buffer = zalloc_page_aligned_high(NVME_PAGE_SIZE);
- char *desc = znprintf(MAXDESCSIZE, "NVMe NS %u: %llu MiB (%llu %u-byte blocks + %u-byte metadata)\n",
ns_id, (ns->lba_count * ns->block_size) >> 20, ns->lba_count, ns->block_size,
ns->metadata_size);
- dprintf(3, "%s", desc);
- boot_add_hd(&ns->drive, desc, bootprio_find_pci_device(ctrl->pci));
- free_buffer:
- free (id);
- }
+/* Returns 0 on success. */ +static int +nvme_create_io_cq(struct nvme_ctrl *ctrl, struct nvme_cq *cq, u16 q_idx) +{
- nvme_init_cq(ctrl, cq, q_idx, NVME_PAGE_SIZE / sizeof(struct nvme_cqe));
- struct nvme_sqe *cmd_create_cq = nvme_get_next_sqe(&ctrl->admin_sq, NVME_SQE_OPC_ADMIN_CREATE_IO_CQ,
NULL, cq->cqe);
- if (!cmd_create_cq) {
return -1;
- }
- cmd_create_cq->dword[10] = (cq->common.mask << 16) | (q_idx >> 1);
- cmd_create_cq->dword[11] = 1 /* physically contiguous */;
- nvme_commit_sqe(&ctrl->admin_sq);
- struct nvme_cqe cqe = nvme_wait(&ctrl->admin_sq);
- if (!nvme_is_cqe_success(&cqe)) {
dprintf(2, "create io cq failed: %08x %08x %08x %08x\n",
cqe.dword[0], cqe.dword[1], cqe.dword[2], cqe.dword[3]);
return -1;
- }
- return 0;
+}
+/* Returns 0 on success. */ +static int +nvme_create_io_sq(struct nvme_ctrl *ctrl, struct nvme_sq *sq, u16 q_idx, struct nvme_cq *cq) +{
- nvme_init_sq(ctrl, sq, q_idx, NVME_PAGE_SIZE / sizeof(struct nvme_cqe), cq);
- struct nvme_sqe *cmd_create_sq = nvme_get_next_sqe(&ctrl->admin_sq, NVME_SQE_OPC_ADMIN_CREATE_IO_SQ,
NULL, sq->sqe);
- if (!cmd_create_sq) {
return -1;
- }
- cmd_create_sq->dword[10] = (sq->common.mask << 16) | (q_idx >> 1);
- cmd_create_sq->dword[11] = (q_idx >> 1) << 16 | 1 /* physically contiguous */;
- dprintf(3, "sq %p create dword10 %08x dword11 %08x\n", sq, cmd_create_sq->dword[10], cmd_create_sq->dword[11]);
- nvme_commit_sqe(&ctrl->admin_sq);
- struct nvme_cqe cqe = nvme_wait(&ctrl->admin_sq);
- if (!nvme_is_cqe_success(&cqe)) {
dprintf(2, "create io sq failed: %08x %08x %08x %08x\n",
cqe.dword[0], cqe.dword[1], cqe.dword[2], cqe.dword[3]);
return -1;
- }
- return 0;
+}
+/* Reads count sectors into buf. Returns DISK_RET_*. The buffer cannot cross
- page boundaries. */
+static int +nvme_io_readwrite(struct nvme_namespace *ns, u64 lba, char *buf, u16 count, int write) +{
- if ((u32)buf & 0x3) panic("buf %p is not DWORD aligned", buf);
- if (((u32)buf & ~(NVME_PAGE_SIZE - 1))
!= (((u32)buf + ns->block_size * count - 1) & ~(NVME_PAGE_SIZE - 1))) {
panic("IO read crosses page boundary: buf %p bs %u count %u", buf, ns->block_size, count);
- }
- struct nvme_sqe *io_read = nvme_get_next_sqe(&ns->ctrl->io_sq,
write ? NVME_SQE_OPC_IO_WRITE : NVME_SQE_OPC_IO_READ,
NULL, buf);
It would be preferable if the code was line-wrapped to 80 characters.
- io_read->nsid = ns->ns_id;
- io_read->dword[10] = (u32)lba;
- io_read->dword[11] = (u32)(lba >> 32);
- io_read->dword[12] = (1U << 31 /* limited retry */) | (count - 1);
- nvme_commit_sqe(&ns->ctrl->io_sq);
- struct nvme_cqe cqe = nvme_wait(&ns->ctrl->io_sq);
- if (!nvme_is_cqe_success(&cqe)) {
dprintf(2, "read io: %08x %08x %08x %08x\n",
cqe.dword[0], cqe.dword[1], cqe.dword[2], cqe.dword[3]);
return DISK_RET_EBADTRACK;
- }
- return DISK_RET_SUCCESS;
+}
+static int +nvme_create_io_queues(struct nvme_ctrl *ctrl) +{
- if (nvme_create_io_cq(ctrl, &ctrl->io_cq, 3))
return -1;
- if (nvme_create_io_sq(ctrl, &ctrl->io_sq, 2, &ctrl->io_cq))
return -1;
- return 0;
+}
+/* Waits for CSTS.RDY to match rdy. Returns 0 on success. */ +static int +nvme_wait_csts_rdy(struct nvme_ctrl *ctrl, unsigned rdy) +{
- u32 const max_to = 500 /* ms */ * ((ctrl->reg->cap >> 24) & 0xFFU);
- u32 to = timer_calc(max_to);
- u32 csts;
- while (rdy != ((csts = ctrl->reg->csts) & NVME_CSTS_RDY)) {
cpu_relax();
yield()
if (csts & NVME_CSTS_FATAL) {
dprintf(3, "NVMe fatal error during controller shutdown\n");
return -1;
}
if (timer_check(to)) {
warn_timeout();
return -1;
}
- }
- return 0;
+}
+static void +nvme_controller_init(struct nvme_ctrl *ctrl)
It would be better to avoid the _init suffix as functions with an _init suffix have a special meaning in seabios' boot order.
+{
- pci_enable_busmaster(ctrl->pci);
- /* Turn the controller off. */
- ctrl->reg->cc = 0;
- if (nvme_wait_csts_rdy(ctrl, 0)) {
dprintf(2, "NVMe fatal error during controller shutdown\n");
return;
- }
- ctrl->doorbell_stride = 4U << ((ctrl->reg->cap >> 32) & 0xF);
- nvme_init_cq(ctrl, &ctrl->admin_cq, 1, NVME_PAGE_SIZE / sizeof(struct nvme_cqe));
- nvme_init_sq(ctrl, &ctrl->admin_sq, 0, NVME_PAGE_SIZE / sizeof(struct nvme_sqe), &ctrl->admin_cq);
- ctrl->reg->aqa = ctrl->admin_cq.common.mask << 16 | ctrl->admin_sq.common.mask;
- /* Create the admin queue pair */
- if (!ctrl->admin_sq.sqe || !ctrl->admin_cq.cqe) goto out_of_memory;
- ctrl->reg->asq = (u32)ctrl->admin_sq.sqe;
- ctrl->reg->acq = (u32)ctrl->admin_cq.cqe;
- dprintf(3, " admin submission queue: %p\n", ctrl->admin_sq.sqe);
- dprintf(3, " admin completion queue: %p\n", ctrl->admin_cq.cqe);
- ctrl->reg->cc = NVME_CC_EN | (NVME_CQE_SIZE_LOG << 20) | (NVME_SQE_SIZE_LOG << 16 /* IOSQES */);
- if (nvme_wait_csts_rdy(ctrl, 1)) {
dprintf(2, "NVMe fatal error while enabling controller\n");
return;
- }
- /* The admin queue is set up and the controller is ready. Let's figure out
what namespaces we have. */
- struct nvme_identify_ctrl *identify = nvme_admin_identify_ctrl(ctrl);
- if (!identify) {
dprintf(2, "NVMe couldn't identify controller.\n");
goto failed;
- }
- /* TODO Print model/serial info. */
- dprintf(3, "NVMe has %u namespace%s.\n",
identify->nn, (identify->nn == 1) ? "" : "s");
- ctrl->ns_count = identify->nn;
- free(identify);
- if ((ctrl->ns_count == 0) || nvme_create_io_queues(ctrl)) {
/* No point to continue, if the controller says it doesn't have
namespaces or we couldn't create I/O queues. */
goto failed;
- }
- ctrl->ns = malloc_fseg(sizeof(*ctrl->ns) * ctrl->ns_count);
- if (!ctrl->ns) goto out_of_memory;
- memset(ctrl->ns, 0, sizeof(*ctrl->ns) * ctrl->ns_count);
- struct nvme_identify_ns_list *ns_list = nvme_admin_identify_get_ns_list(ctrl);
- if (!ns_list) {
dprintf(2, "NVMe couldn't get namespace list.\n");
goto failed;
- }
- /* Populate namespace IDs */
- int ns_idx;
- for (ns_idx = 0;
ns_idx < ARRAY_SIZE(ns_list->ns_id)
&& ns_idx < ctrl->ns_count
&& ns_list->ns_id[ns_idx];
ns_idx++) {
nvme_probe_ns(ctrl, &ctrl->ns[ns_idx], ns_list->ns_id[ns_idx]);
- }
- free(ns_list);
- /* If for some reason the namespace list gives us fewer namespaces, we just go along. */
- if (ns_idx != ctrl->ns_count) {
dprintf(2, "NVMe namespace list has only %u namespaces?\n", ns_idx);
ctrl->ns_count = ns_idx;
- }
- dprintf(3, "NVMe initialization complete!\n");
- return;
- out_of_memory:
- warn_noalloc();
- failed:
- free(ctrl->admin_sq.sqe);
- free(ctrl->admin_cq.cqe);
Shouldn't this free(ctrl->ns) also?
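Something like (free(NULL) is a no-op, so this is safe even when the allocation never happened):

 out_of_memory:
     warn_noalloc();
 failed:
+    free(ctrl->ns);
     free(ctrl->admin_sq.sqe);
     free(ctrl->admin_cq.cqe);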
- return;
+}
+/* Initialize an NVMe controller and detect its drives. */ +static void +nvme_controller_setup(struct pci_device *pci) +{
- if (create_bounce_buf() < 0)
return;
- struct nvme_reg volatile *reg = pci_enable_membar(pci, PCI_BASE_ADDRESS_0);
- if (!reg)
return;
- u32 version = reg->vs;
- dprintf(3, "Found NVMe controller with version %u.%u.%u.\n",
version >> 16, (version >> 8) & 0xFF, version & 0xFF);
- dprintf(3, " Capabilities %016llx\n", reg->cap);
- if (version < 0x00010100U) {
dprintf(3, "Need at least 1.1.0! Skipping.\n");
return;
- }
- if (~reg->cap & NVME_CAP_CSS_NVME) {
dprintf(3, "Controller doesn't speak NVMe command set. Skipping.\n");
return;
- }
- struct nvme_ctrl *ctrl = malloc_fseg(sizeof(*ctrl));
I think this could be malloc_high() here - the fseg space is fairly limited and there isn't much need to use it for 32bit only drivers.
- if (!ctrl) {
warn_noalloc();
return;
- }
- memset(ctrl, 0, sizeof(*ctrl));
- ctrl->reg = reg;
- ctrl->pci = pci;
- nvme_controller_init(ctrl);
+}
+// Locate and init NVMe controllers +static void +nvme_scan(void) +{
- // Scan PCI bus for NVMe controllers
- struct pci_device *pci;
- foreachpci(pci) {
if (pci->class != PCI_CLASS_STORAGE_NVME)
continue;
if (pci->prog_if != 2 /* as of NVM 1.0e */) {
dprintf(3, "Found incompatble NVMe: prog-if=%02x\n", pci->prog_if);
continue;
}
nvme_controller_setup(pci);
Ideally the code would start a thread here: run_thread(nvme_controller_setup, pci);
- }
+}
+static int +nvme_cmd_readwrite(struct nvme_namespace *ns, struct disk_op_s *op, int write) +{
- int res = DISK_RET_SUCCESS;
- u16 const max_blocks = NVME_PAGE_SIZE / ns->block_size;
- if (write) {
panic("XXX Writes are implemented, but not tested."
" Remove this panic, if you are sure what you are doing!");
If the driver isn't tested with writes, just return DISK_RET_EWRITEPROTECT for CMD_FORMAT and CMD_WRITE in nvme_process_op(). Is it difficult to test writes?
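E.g., in nvme_process_op() (sketch; DISK_RET_EWRITEPROTECT comes from std/disk.h):

     switch (op->command) {
     case CMD_READ:
-    case CMD_WRITE:
-        return nvme_cmd_readwrite(ns, op, op->command == CMD_WRITE);
+        return nvme_cmd_readwrite(ns, op, 0);
+    case CMD_WRITE:
+    case CMD_FORMAT:
+        /* Write path is untested so far - refuse rather than risk data. */
+        return DISK_RET_EWRITEPROTECT;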
- }
- for (u16 i = 0; i < op->count && res == DISK_RET_SUCCESS;) {
Older versions of gcc don't like the 'u16 i' in the for() declaration. It can be fixed by moving the declaration just above the for loop.
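I.e. something like:

-    for (u16 i = 0; i < op->count && res == DISK_RET_SUCCESS;) {
+    u16 i;
+    for (i = 0; i < op->count && res == DISK_RET_SUCCESS;) {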
u16 const blocks_remaining = op->count - i;
u16 const blocks = blocks_remaining < max_blocks ? blocks_remaining : max_blocks;
char * const op_buf = op->buf_fl + i * ns->block_size;
if (write) {
memcpy(ns->dma_buffer, op_buf, blocks * ns->block_size);
}
res = nvme_io_readwrite(ns, op->lba + i, ns->dma_buffer, blocks, write);
dprintf(3, "ns %u %s lba %llu+%u: %d\n", ns->ns_id, write ? "write" : "read",
op->lba + i, blocks, res);
if (!write && res == DISK_RET_SUCCESS) {
memcpy(op_buf, ns->dma_buffer, blocks * ns->block_size);
}
i += blocks;
- }
- return res;
+}
+int +nvme_process_op(struct disk_op_s *op) +{
- if (!CONFIG_NVME)
return DISK_RET_SUCCESS;
- struct nvme_namespace *ns = container_of(op->drive_gf, struct nvme_namespace, drive);
- switch (op->command) {
- case CMD_READ:
- case CMD_WRITE:
return nvme_cmd_readwrite(ns, op, op->command == CMD_WRITE);
- default:
dprintf(2, "NVMe disk op %u not implemented\n", op->command);
return DISK_RET_EBADTRACK;
Various bootloaders tend to make funky calls, so logging here could flood the log. This should call default_process_op(op) instead.
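I.e.:

     default:
-        dprintf(2, "NVMe disk op %u not implemented\n", op->command);
-        return DISK_RET_EBADTRACK;
+        return default_process_op(op);
     }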
- }
+}
+void +nvme_setup(void) +{
- ASSERT32FLAT();
- if (!CONFIG_NVME)
return;
- dprintf(3, "init nvme\n");
- nvme_scan();
+}
+/* Local Variables: */ +/* indent-tabs-mode: nil */ +/* c-basic-offset: 4 */ +/* End: */ diff --git a/src/hw/nvme.h b/src/hw/nvme.h new file mode 100644 index 0000000..1555dc7 --- /dev/null +++ b/src/hw/nvme.h @@ -0,0 +1,15 @@ +// External interfaces for low level NVMe support +// +// Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#ifndef __NVME_H +#define __NVME_H
+#include "block.h" // struct disk_op_s
+void nvme_setup(void); +int nvme_process_op(struct disk_op_s *op);
+#endif
+/* EOF */ diff --git a/src/hw/pci_ids.h b/src/hw/pci_ids.h index cdf9b3c..4ac73b4 100644 --- a/src/hw/pci_ids.h +++ b/src/hw/pci_ids.h @@ -18,6 +18,7 @@ #define PCI_CLASS_STORAGE_SATA 0x0106 #define PCI_CLASS_STORAGE_SATA_AHCI 0x010601 #define PCI_CLASS_STORAGE_SAS 0x0107 +#define PCI_CLASS_STORAGE_NVME 0x0108 #define PCI_CLASS_STORAGE_OTHER 0x0180
#define PCI_BASE_CLASS_NETWORK 0x02
2.7.4
On Tue, 2017-01-24 at 12:26 -0500, Kevin O'Connor wrote:
Thanks.  See my comments below.  Mostly minor things I noticed.
Thanks for looking into this. It'll take me a couple of days to post a new patch, because I'm currently traveling. Expect a new patch next week.
Btw, some of the issues you pointed out also apply to the AHCI code, because that was what I was looking at while implementing the NVMe code. ;)
Some questions below:
+    config NVME
+        depends on DRIVES
+        bool "NVMe controllers"
+        default y
+        help
+            Support for NVMe disk code.
Is this device also available in real hardware? Is it expected to work on real hardware and/or has it been tested?
It's expected to work on real hardware. I'll see whether I can find something to test it on other than the Qemu code.
+_Static_assert(sizeof(struct nvme_sqe) == 1U << NVME_SQE_SIZE_LOG, "invalid queue entry size"); +_Static_assert(sizeof(struct nvme_cqe) == 1U << NVME_CQE_SIZE_LOG, "invalid queue entry size");
Current SeaBIOS supports being compiled on gcc v3.4 and this construct isn't supported there.  That compiler is pretty old, but for now it's going to be easier to just change this in your patch.
I'll update the patch to remove C11 features and be C89 compatible.
+/* Sequentially consistent writes. We have a volatile version for doorbell registers. */ +static void nvme_seq_writev(u32 volatile *p, u32 v) { *(_Atomic volatile u32 *)p = v; }
Same gcc v3.4 issue with _Atomic.  Is _Atomic necessary or just a decoration?  The seabios code typically uses readl/writel to make the accesses atomic.
The access needs to order memory operations. I'll try to follow what the rest of SeaBIOS does.
+        nvme_controller_setup(pci);
Ideally the code would start a thread here: run_thread(nvme_controller_setup, pci);
What's the advantage of that?
Julian
On Tue, Jan 24, 2017 at 09:43:41PM +0000, Stecklina, Julian wrote:
On Tue, 2017-01-24 at 12:26 -0500, Kevin O'Connor wrote:
Thanks.  See my comments below.  Mostly minor things I noticed.
Thanks for looking into this. It'll take me a couple of days to post a new patch, because I'm currently traveling. Expect a new patch next week.
Thanks.
Btw, some of the issues you pointed out also apply to the AHCI code, because that was what I was looking at while implementing the NVMe code. ;)
Can you point me to which ones? I see one of the AHCI allocations could potentially be moved out of the fseg, but I don't see anything else.
[...]
+_Static_assert(sizeof(struct nvme_sqe) == 1U << NVME_SQE_SIZE_LOG, "invalid queue entry size"); +_Static_assert(sizeof(struct nvme_cqe) == 1U << NVME_CQE_SIZE_LOG, "invalid queue entry size");
Current SeaBIOS supports being compiled on gcc v3.4 and this construct isn't supported there.  That compiler is pretty old, but for now it's going to be easier to just change this in your patch.
I'll update the patch to remove C11 features and be C89 compatible.
Thanks. C99 is fine - it's just _Atomic, _Static_assert, and the one for(u16 i ...).
[...]
Ideally the code would start a thread here: run_thread(nvme_controller_setup, pci);
What's the advantage of that?
It runs the controller setup in its own "thread" - see:
https://www.seabios.org/Execution_and_code_flow#Threads
It has no real impact on VMs, but it can be significant on real hardware. I prefer to see the drivers use it regardless because the drivers tend to get copied.
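For this driver that would look roughly like the following (a sketch, not the final patch; note run_thread() from stacks.h passes a void*, so the setup function's signature changes):

static void
nvme_controller_setup(void *opaque)
{
    struct pci_device *pci = opaque;
    /* ... BAR mapping and controller bring-up exactly as in the patch ... */
}

static void
nvme_scan(void)
{
    struct pci_device *pci;
    foreachpci(pci) {
        if (pci->class != PCI_CLASS_STORAGE_NVME || pci->prog_if != 2)
            continue;
        /* Bring up each controller in its own thread so the rest of
           POST continues while the controller resets. */
        run_thread(nvme_controller_setup, pci);
    }
}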
-Kevin
This patch enables SeaBIOS to boot from NVMe. Finding namespaces and basic I/O works. Testing has been done in QEMU and so far it works with GRUB, syslinux, and the FreeBSD loader. You need a recent QEMU (>= 2.7.0), because older versions have buggy NVMe support.
The NVMe code is currently only enabled on QEMU due to lack of testing on real hardware.
Signed-off-by: Julian Stecklina <jsteckli@amazon.de>
---
 Makefile          |   2 +-
 src/Kconfig       |   6 +
 src/block.c       |   4 +
 src/block.h       |   1 +
 src/hw/nvme-int.h | 199 +++++++++++++++++
 src/hw/nvme.c     | 651 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/hw/nvme.h     |  17 ++
 src/hw/pci_ids.h  |   1 +
 8 files changed, 880 insertions(+), 1 deletion(-)
 create mode 100644 src/hw/nvme-int.h
 create mode 100644 src/hw/nvme.c
 create mode 100644 src/hw/nvme.h
diff --git a/Makefile b/Makefile
index 3b94ee0..946df7e 100644
--- a/Makefile
+++ b/Makefile
@@ -43,7 +43,7 @@ SRC32FLAT=$(SRCBOTH) post.c e820map.c malloc.c romfile.c x86.c optionroms.c \
     fw/paravirt.c fw/shadow.c fw/pciinit.c fw/smm.c fw/smp.c fw/mtrr.c fw/xen.c \
     fw/acpi.c fw/mptable.c fw/pirtable.c fw/smbios.c fw/romfile_loader.c \
     hw/virtio-ring.c hw/virtio-pci.c hw/virtio-blk.c hw/virtio-scsi.c \
-    hw/tpm_drivers.c
+    hw/tpm_drivers.c hw/nvme.c
 SRC32SEG=string.c output.c pcibios.c apm.c stacks.c hw/pci.c hw/serialio.c
 DIRS=src src/hw src/fw vgasrc
diff --git a/src/Kconfig b/src/Kconfig index 457d082..e1b83a4 100644 --- a/src/Kconfig +++ b/src/Kconfig @@ -227,6 +227,12 @@ menu "Hardware support" help Support floppy images stored in coreboot flash or from QEMU fw_cfg. + config NVME + depends on DRIVES && QEMU_HARDWARE + bool "NVMe controllers" + default y + help + Support for NVMe disk code.
config PS2PORT depends on KEYBOARD || MOUSE diff --git a/src/block.c b/src/block.c index f7280cf..d104f6d 100644 --- a/src/block.c +++ b/src/block.c @@ -20,6 +20,7 @@ #include "hw/usb-uas.h" // uas_process_op #include "hw/virtio-blk.h" // process_virtio_blk_op #include "hw/virtio-scsi.h" // virtio_scsi_process_op +#include "hw/nvme.h" // nvme_process_op #include "malloc.h" // malloc_low #include "output.h" // dprintf #include "stacks.h" // call32 @@ -502,6 +503,7 @@ block_setup(void) megasas_setup(); pvscsi_setup(); mpt_scsi_setup(); + nvme_setup(); }
// Fallback handler for command requests not implemented by drivers @@ -571,6 +573,8 @@ process_op_32(struct disk_op_s *op) return virtio_scsi_process_op(op); case DTYPE_PVSCSI: return pvscsi_process_op(op); + case DTYPE_NVME: + return nvme_process_op(op); default: return process_op_both(op); } diff --git a/src/block.h b/src/block.h index 0f15ff9..f03ec38 100644 --- a/src/block.h +++ b/src/block.h @@ -82,6 +82,7 @@ struct drive_s { #define DTYPE_PVSCSI 0x83 #define DTYPE_MPT_SCSI 0x84 #define DTYPE_SDCARD 0x90 +#define DTYPE_NVME 0x91
#define MAXDESCSIZE 80
diff --git a/src/hw/nvme-int.h b/src/hw/nvme-int.h new file mode 100644 index 0000000..9f95dd8 --- /dev/null +++ b/src/hw/nvme-int.h @@ -0,0 +1,199 @@ +// NVMe datastructures and constants +// +// Copyright 2017 Amazon.com, Inc. or its affiliates. +// +// This file may be distributed under the terms of the GNU LGPLv3 license. + +#ifndef __NVME_INT_H +#define __NVME_INT_H + +#include "types.h" // u32 +#include "pcidevice.h" // struct pci_device + +/* Data structures */ + +/* The register file of a NVMe host controller. This struct follows the naming + scheme in the NVMe specification. */ +struct nvme_reg { + u64 cap; /* controller capabilities */ + u32 vs; /* version */ + u32 intms; /* interrupt mask set */ + u32 intmc; /* interrupt mask clear */ + u32 cc; /* controller configuration */ + u32 _res0; + u32 csts; /* controller status */ + u32 _res1; + u32 aqa; /* admin queue attributes */ + u64 asq; /* admin submission queue base address */ + u64 acq; /* admin completion queue base address */ +}; + +/* Submission queue entry */ +struct nvme_sqe { + union { + u32 dword[16]; + struct { + u32 cdw0; /* Command DWORD 0 */ + u32 nsid; /* Namespace ID */ + u64 _res0; + u64 mptr; /* metadata ptr */ + + u64 dptr_prp1; + u64 dptr_prp2; + }; + }; +}; + +/* Completion queue entry */ +struct nvme_cqe { + union { + u32 dword[4]; + struct { + u32 cdw0; + u32 _res0; + u16 sq_head; + u16 sq_id; + u16 cid; + u16 status; + }; + }; +}; + +/* The common part of every submission or completion queue. */ +struct nvme_queue { + u32 *dbl; /* doorbell */ + u16 mask; /* length - 1 */ +}; + +struct nvme_cq { + struct nvme_queue common; + struct nvme_cqe *cqe; + + /* We have read upto (but not including) this entry in the queue. */ + u16 head; + + /* The current phase bit the controller uses to indicate that it has written + a new entry. This is inverted after each wrap. */ + unsigned phase : 1; +}; + +struct nvme_sq { + struct nvme_queue common; + struct nvme_sqe *sqe; + + /* Corresponding completion queue. We only support a single SQ per CQ. */ + struct nvme_cq *cq; + + /* The last entry the controller has fetched. */ + u16 head; + + /* The last value we have written to the tail doorbell. */ + u16 tail; +}; + +struct nvme_ctrl { + struct pci_device *pci; + struct nvme_reg volatile *reg; + + u32 doorbell_stride; /* in bytes */ + + struct nvme_sq admin_sq; + struct nvme_cq admin_cq; + + u32 ns_count; + struct nvme_namespace *ns; + + struct nvme_sq io_sq; + struct nvme_cq io_cq; +}; + +struct nvme_namespace { + struct drive_s drive; + struct nvme_ctrl *ctrl; + + u32 ns_id; + + u64 lba_count; /* The total amount of sectors. */ + + u32 block_size; + u32 metadata_size; + + /* Page aligned buffer of size NVME_PAGE_SIZE. 
*/ + char *dma_buffer; +}; + +/* Data structures for NVMe admin identify commands */ + +struct nvme_identify_ctrl { + u16 vid; + u16 ssvid; + char sn[20]; + char mn[40]; + char fr[8]; + + char _boring[516 - 72]; + + u32 nn; /* number of namespaces */ +}; + +struct nvme_identify_ns_list { + u32 ns_id[1024]; +}; + +struct nvme_lba_format { + u16 ms; + u8 lbads; + u8 rp; + u8 res; +}; + +struct nvme_identify_ns { + u64 nsze; + u64 ncap; + u64 nuse; + u8 nsfeat; + u8 nlbaf; + u8 flbas; + + char _boring[128 - 27]; + + struct nvme_lba_format lbaf[16]; +}; + +union nvme_identify { + struct nvme_identify_ns ns; + struct nvme_identify_ctrl ctrl; + struct nvme_identify_ns_list ns_list; +}; + +/* NVMe constants */ + +#define NVME_CAP_CSS_NVME (1ULL << 37) + +#define NVME_CSTS_FATAL (1U << 1) +#define NVME_CSTS_RDY (1U << 0) + +#define NVME_CC_EN (1U << 0) + +#define NVME_SQE_OPC_ADMIN_CREATE_IO_SQ 1U +#define NVME_SQE_OPC_ADMIN_CREATE_IO_CQ 5U +#define NVME_SQE_OPC_ADMIN_IDENTIFY 6U + +#define NVME_SQE_OPC_IO_WRITE 1U +#define NVME_SQE_OPC_IO_READ 2U + +#define NVME_ADMIN_IDENTIFY_CNS_ID_NS 0U +#define NVME_ADMIN_IDENTIFY_CNS_ID_CTRL 1U +#define NVME_ADMIN_IDENTIFY_CNS_GET_NS_LIST 2U + +#define NVME_CQE_DW3_P (1U << 16) + +#define NVME_PAGE_SIZE 4096 + +/* Length for the queue entries. */ +#define NVME_SQE_SIZE_LOG 6 +#define NVME_CQE_SIZE_LOG 4 + +#endif + +/* EOF */ diff --git a/src/hw/nvme.c b/src/hw/nvme.c new file mode 100644 index 0000000..366b177 --- /dev/null +++ b/src/hw/nvme.c @@ -0,0 +1,651 @@ +// Low level NVMe disk access +// +// Copyright 2017 Amazon.com, Inc. or its affiliates. +// +// This file may be distributed under the terms of the GNU LGPLv3 license. + +#include "blockcmd.h" +#include "fw/paravirt.h" // runningOnQEMU +#include "malloc.h" // malloc_high +#include "output.h" // dprintf +#include "pci.h" +#include "pci_ids.h" // PCI_CLASS_STORAGE_NVME +#include "pci_regs.h" // PCI_BASE_ADDRESS_0 +#include "pcidevice.h" // foreachpci +#include "stacks.h" // yield +#include "std/disk.h" // DISK_RET_ +#include "string.h" // memset +#include "util.h" // boot_add_hd +#include "x86.h" // readl + +#include "nvme.h" +#include "nvme-int.h" + +static void * +zalloc_page_aligned(void *(*memalign)(u32 /* align */, u32 /* size */), + u32 size) +{ + void *res = memalign(NVME_PAGE_SIZE, size); + if (res) memset(res, 0, size); + return res; +} + +static void +nvme_init_queue_common(struct nvme_ctrl *ctrl, struct nvme_queue *q, u16 q_idx, + u16 length) +{ + memset(q, 0, sizeof(*q)); + q->dbl = (u32 *)((char *)ctrl->reg + 0x1000 + q_idx * ctrl->doorbell_stride); + dprintf(3, " q %p q_idx %u dbl %p\n", q, q_idx, q->dbl); + q->mask = length - 1; +} + +static void +nvme_init_sq(struct nvme_ctrl *ctrl, struct nvme_sq *sq, u16 q_idx, u16 length, + struct nvme_cq *cq) +{ + nvme_init_queue_common(ctrl, &sq->common, q_idx, length); + sq->sqe = zalloc_page_aligned(memalign_high, sizeof(*sq->sqe) * length); + dprintf(3, "sq %p q_idx %u sqe %p\n", sq, q_idx, sq->sqe); + sq->cq = cq; + sq->head = 0; + sq->tail = 0; +} + +static void +nvme_init_cq(struct nvme_ctrl *ctrl, struct nvme_cq *cq, u16 q_idx, u16 length) +{ + nvme_init_queue_common(ctrl, &cq->common, q_idx, length); + cq->cqe = zalloc_page_aligned(memalign_high, sizeof(*cq->cqe) * length); + + cq->head = 0; + + /* All CQE phase bits are initialized to zero. This means initially we wait + for the host controller to set these to 1. 
*/ + cq->phase = 1; +} + +static int +nvme_poll_cq(struct nvme_cq *cq) +{ + u32 dw3 = readl(&cq->cqe[cq->head].dword[3]); + return (!!(dw3 & NVME_CQE_DW3_P) == cq->phase); +} + +static int +nvme_is_cqe_success(struct nvme_cqe const *cqe) +{ + return (cqe->status & 0xFF) >> 1 == 0; +} + + +static struct nvme_cqe +nvme_error_cqe(void) +{ + struct nvme_cqe r; + + /* 0xFF is a vendor specific status code != success. Should be okay for + indicating failure. */ + memset(&r, 0xFF, sizeof(r)); + return r; +} + +static struct nvme_cqe +nvme_consume_cqe(struct nvme_sq *sq) +{ + struct nvme_cq *cq = sq->cq; + + if (!nvme_poll_cq(cq)) { + /* Cannot consume a completion queue entry, if there is none ready. */ + return nvme_error_cqe(); + } + + struct nvme_cqe *cqe = &cq->cqe[cq->head]; + u16 cq_next_head = (cq->head + 1) & cq->common.mask; + dprintf(4, "cq %p head %u -> %u\n", cq, cq->head, cq_next_head); + if (cq_next_head < cq->head) { + dprintf(3, "cq %p wrap\n", cq); + cq->phase = ~cq->phase; + } + cq->head = cq_next_head; + + /* Update the submission queue head. */ + if (cqe->sq_head != sq->head) { + sq->head = cqe->sq_head; + dprintf(4, "sq %p advanced to %u\n", sq, cqe->sq_head); + } + + /* Tell the controller that we consumed the completion. */ + writel(cq->common.dbl, cq->head); + + return *cqe; +} + +static struct nvme_cqe +nvme_wait(struct nvme_sq *sq) +{ + static const unsigned nvme_timeout = 500 /* ms */; + u32 to = timer_calc(nvme_timeout); + while (!nvme_poll_cq(sq->cq)) { + yield(); + + if (timer_check(to)) { + warn_timeout(); + return nvme_error_cqe(); + } + } + + return nvme_consume_cqe(sq); +} + +/* Returns the next submission queue entry (or NULL if the queue is full). It + also fills out Command Dword 0 and clears the rest. */ +static struct nvme_sqe * +nvme_get_next_sqe(struct nvme_sq *sq, u8 opc, void *metadata, void *data) +{ + if (((sq->head + 1) & sq->common.mask) == sq->tail) { + dprintf(3, "submission queue is full"); + return NULL; + } + + struct nvme_sqe *sqe = &sq->sqe[sq->tail]; + dprintf(4, "sq %p next_sqe %u\n", sq, sq->tail); + + memset(sqe, 0, sizeof(*sqe)); + sqe->cdw0 = opc | (sq->tail << 16 /* CID */); + sqe->mptr = (u32)metadata; + sqe->dptr_prp1 = (u32)data; + + if (sqe->dptr_prp1 & (NVME_PAGE_SIZE - 1)) { + /* Data buffer not page aligned. */ + panic("data buffer not page aligned: %p\n", data); + warn_internalerror(); + } + + return sqe; +} + +/* Call this after you've filled out an sqe that you've got from nvme_get_next_sqe. */ +static void +nvme_commit_sqe(struct nvme_sq *sq) +{ + dprintf(4, "sq %p commit_sqe %u\n", sq, sq->tail); + sq->tail = (sq->tail + 1) & sq->common.mask; + writel(sq->common.dbl, sq->tail); +} + +/* Perform an identify command on the admin queue and return the resulting + buffer. This may be a NULL pointer, if something failed. This function + cannot be used after initialization, because it uses buffers in tmp zone. */ +static union nvme_identify * +nvme_admin_identify(struct nvme_ctrl *ctrl, u8 cns, u32 nsid) +{ + union nvme_identify *identify_buf = zalloc_page_aligned(memalign_tmp, 4096); + if (!identify_buf) { + /* Could not allocate identify buffer. 
*/ + warn_internalerror(); + return NULL; + } + + struct nvme_sqe *cmd_identify; + cmd_identify = nvme_get_next_sqe(&ctrl->admin_sq, + NVME_SQE_OPC_ADMIN_IDENTIFY, NULL, + identify_buf); + + if (!cmd_identify) { panic("admin queue full\n"); } + + cmd_identify->nsid = nsid; + cmd_identify->dword[10] = cns; + + nvme_commit_sqe(&ctrl->admin_sq); + + struct nvme_cqe cqe = nvme_wait(&ctrl->admin_sq); + + if (!nvme_is_cqe_success(&cqe)) { + free(identify_buf); + return NULL; + } + + return identify_buf; +} + +static struct nvme_identify_ctrl * +nvme_admin_identify_ctrl(struct nvme_ctrl *ctrl) +{ + return &nvme_admin_identify(ctrl, NVME_ADMIN_IDENTIFY_CNS_ID_CTRL, 0)->ctrl; +} + +static struct nvme_identify_ns_list * +nvme_admin_identify_get_ns_list(struct nvme_ctrl *ctrl) +{ + return &nvme_admin_identify(ctrl, NVME_ADMIN_IDENTIFY_CNS_GET_NS_LIST, + 0)->ns_list; +} + +static struct nvme_identify_ns * +nvme_admin_identify_ns(struct nvme_ctrl *ctrl, u32 ns_id) +{ + return &nvme_admin_identify(ctrl, NVME_ADMIN_IDENTIFY_CNS_ID_NS, + ns_id)->ns; +} + +static void +nvme_probe_ns(struct nvme_ctrl *ctrl, struct nvme_namespace *ns, u32 ns_id) +{ + ns->ctrl = ctrl; + ns->ns_id = ns_id; + + struct nvme_identify_ns *id = nvme_admin_identify_ns(ctrl, ns_id); + if (!id) { + dprintf(2, "NVMe couldn't identify namespace %u.\n", ns_id); + goto free_buffer; + } + + u8 current_lba_format = id->flbas & 0xF; + if (current_lba_format > id->nlbaf) { + dprintf(2, "NVMe NS %u: current LBA format %u is beyond what the " + " namespace supports (%u)?\n", + ns_id, current_lba_format, id->nlbaf + 1); + goto free_buffer; + } + + ns->lba_count = id->nsze; + + struct nvme_lba_format *fmt = &id->lbaf[current_lba_format]; + + ns->block_size = 1U << fmt->lbads; + ns->metadata_size = fmt->ms; + + if (ns->block_size > NVME_PAGE_SIZE) { + /* If we see devices that trigger this path, we need to increase our + buffer size. */ + warn_internalerror(); + goto free_buffer; + } + + ns->drive.cntl_id = ns - ctrl->ns; + ns->drive.removable = 0; + ns->drive.type = DTYPE_NVME; + ns->drive.blksize = ns->block_size; + ns->drive.sectors = ns->lba_count; + + ns->dma_buffer = zalloc_page_aligned(memalign_high, NVME_PAGE_SIZE); + + char *desc = znprintf(MAXDESCSIZE, "NVMe NS %u: %llu MiB (%llu %u-byte " + "blocks + %u-byte metadata)\n", + ns_id, (ns->lba_count * ns->block_size) >> 20, + ns->lba_count, ns->block_size, ns->metadata_size); + + dprintf(3, "%s", desc); + boot_add_hd(&ns->drive, desc, bootprio_find_pci_device(ctrl->pci)); + + free_buffer: + free (id); + } + +/* Returns 0 on success. */ +static int +nvme_create_io_cq(struct nvme_ctrl *ctrl, struct nvme_cq *cq, u16 q_idx) +{ + struct nvme_sqe *cmd_create_cq; + + nvme_init_cq(ctrl, cq, q_idx, NVME_PAGE_SIZE / sizeof(struct nvme_cqe)); + cmd_create_cq = nvme_get_next_sqe(&ctrl->admin_sq, + NVME_SQE_OPC_ADMIN_CREATE_IO_CQ, NULL, + cq->cqe); + if (!cmd_create_cq) { + return -1; + } + + cmd_create_cq->dword[10] = (cq->common.mask << 16) | (q_idx >> 1); + cmd_create_cq->dword[11] = 1 /* physically contiguous */; + + nvme_commit_sqe(&ctrl->admin_sq); + + struct nvme_cqe cqe = nvme_wait(&ctrl->admin_sq); + + if (!nvme_is_cqe_success(&cqe)) { + dprintf(2, "create io cq failed: %08x %08x %08x %08x\n", + cqe.dword[0], cqe.dword[1], cqe.dword[2], cqe.dword[3]); + + return -1; + } + + return 0; +} + +/* Returns 0 on success. 
*/ +static int +nvme_create_io_sq(struct nvme_ctrl *ctrl, struct nvme_sq *sq, u16 q_idx, struct nvme_cq *cq) +{ + struct nvme_sqe *cmd_create_sq; + + nvme_init_sq(ctrl, sq, q_idx, NVME_PAGE_SIZE / sizeof(struct nvme_cqe), cq); + cmd_create_sq = nvme_get_next_sqe(&ctrl->admin_sq, + NVME_SQE_OPC_ADMIN_CREATE_IO_SQ, NULL, + sq->sqe); + if (!cmd_create_sq) { + return -1; + } + + cmd_create_sq->dword[10] = (sq->common.mask << 16) | (q_idx >> 1); + cmd_create_sq->dword[11] = (q_idx >> 1) << 16 | 1 /* contiguous */; + dprintf(3, "sq %p create dword10 %08x dword11 %08x\n", sq, + cmd_create_sq->dword[10], cmd_create_sq->dword[11]); + + nvme_commit_sqe(&ctrl->admin_sq); + + struct nvme_cqe cqe = nvme_wait(&ctrl->admin_sq); + + if (!nvme_is_cqe_success(&cqe)) { + dprintf(2, "create io sq failed: %08x %08x %08x %08x\n", + cqe.dword[0], cqe.dword[1], cqe.dword[2], cqe.dword[3]); + return -1; + } + + return 0; +} + +/* Reads count sectors into buf. Returns DISK_RET_*. The buffer cannot cross + page boundaries. */ +static int +nvme_io_readwrite(struct nvme_namespace *ns, u64 lba, char *buf, u16 count, + int write) +{ + u32 buf_addr = (u32)buf; + + if ((buf_addr & 0x3) || + ((buf_addr & ~(NVME_PAGE_SIZE - 1)) != + ((buf_addr + ns->block_size * count - 1) & ~(NVME_PAGE_SIZE - 1)))) { + /* Buffer is misaligned or crosses page boundary */ + warn_internalerror(); + return DISK_RET_EBADTRACK; + } + + struct nvme_sqe *io_read = nvme_get_next_sqe(&ns->ctrl->io_sq, + write ? NVME_SQE_OPC_IO_WRITE + : NVME_SQE_OPC_IO_READ, + NULL, buf); + io_read->nsid = ns->ns_id; + io_read->dword[10] = (u32)lba; + io_read->dword[11] = (u32)(lba >> 32); + io_read->dword[12] = (1U << 31 /* limited retry */) | (count - 1); + + nvme_commit_sqe(&ns->ctrl->io_sq); + + struct nvme_cqe cqe = nvme_wait(&ns->ctrl->io_sq); + + if (!nvme_is_cqe_success(&cqe)) { + dprintf(2, "read io: %08x %08x %08x %08x\n", + cqe.dword[0], cqe.dword[1], cqe.dword[2], cqe.dword[3]); + + return DISK_RET_EBADTRACK; + } + + return DISK_RET_SUCCESS; +} + + +static int +nvme_create_io_queues(struct nvme_ctrl *ctrl) +{ + if (nvme_create_io_cq(ctrl, &ctrl->io_cq, 3)) + return -1; + + if (nvme_create_io_sq(ctrl, &ctrl->io_sq, 2, &ctrl->io_cq)) + return -1; + + return 0; +} + +/* Waits for CSTS.RDY to match rdy. Returns 0 on success. */ +static int +nvme_wait_csts_rdy(struct nvme_ctrl *ctrl, unsigned rdy) +{ + u32 const max_to = 500 /* ms */ * ((ctrl->reg->cap >> 24) & 0xFFU); + u32 to = timer_calc(max_to); + u32 csts; + + while (rdy != ((csts = ctrl->reg->csts) & NVME_CSTS_RDY)) { + yield(); + + if (csts & NVME_CSTS_FATAL) { + dprintf(3, "NVMe fatal error during controller shutdown\n"); + return -1; + } + + if (timer_check(to)) { + warn_timeout(); + return -1; + } + } + + return 0; +} + +static void +nvme_controller_enable(struct nvme_ctrl *ctrl) +{ + pci_enable_busmaster(ctrl->pci); + + /* Turn the controller off. 
*/ + ctrl->reg->cc = 0; + if (nvme_wait_csts_rdy(ctrl, 0)) { + dprintf(2, "NVMe fatal error during controller shutdown\n"); + return; + } + + ctrl->doorbell_stride = 4U << ((ctrl->reg->cap >> 32) & 0xF); + + nvme_init_cq(ctrl, &ctrl->admin_cq, 1, + NVME_PAGE_SIZE / sizeof(struct nvme_cqe)); + + nvme_init_sq(ctrl, &ctrl->admin_sq, 0, + NVME_PAGE_SIZE / sizeof(struct nvme_sqe), &ctrl->admin_cq); + + ctrl->reg->aqa = ctrl->admin_cq.common.mask << 16 + | ctrl->admin_sq.common.mask; + + /* Create the admin queue pair */ + if (!ctrl->admin_sq.sqe || !ctrl->admin_cq.cqe) goto out_of_memory; + + ctrl->reg->asq = (u32)ctrl->admin_sq.sqe; + ctrl->reg->acq = (u32)ctrl->admin_cq.cqe; + + dprintf(3, " admin submission queue: %p\n", ctrl->admin_sq.sqe); + dprintf(3, " admin completion queue: %p\n", ctrl->admin_cq.cqe); + + ctrl->reg->cc = NVME_CC_EN | (NVME_CQE_SIZE_LOG << 20) + | (NVME_SQE_SIZE_LOG << 16 /* IOSQES */); + + if (nvme_wait_csts_rdy(ctrl, 1)) { + dprintf(2, "NVMe fatal error while enabling controller\n"); + return; + } + /* The admin queue is set up and the controller is ready. Let's figure out + what namespaces we have. */ + + struct nvme_identify_ctrl *identify = nvme_admin_identify_ctrl(ctrl); + + if (!identify) { + dprintf(2, "NVMe couldn't identify controller.\n"); + goto failed; + } + + /* TODO Print model/serial info. */ + dprintf(3, "NVMe has %u namespace%s.\n", + identify->nn, (identify->nn == 1) ? "" : "s"); + + ctrl->ns_count = identify->nn; + free(identify); + + if ((ctrl->ns_count == 0) || nvme_create_io_queues(ctrl)) { + /* No point to continue, if the controller says it doesn't have + namespaces or we couldn't create I/O queues. */ + goto failed; + } + + ctrl->ns = malloc_fseg(sizeof(*ctrl->ns) * ctrl->ns_count); + if (!ctrl->ns) goto out_of_memory; + memset(ctrl->ns, 0, sizeof(*ctrl->ns) * ctrl->ns_count); + + struct nvme_identify_ns_list *ns_list = nvme_admin_identify_get_ns_list(ctrl); + if (!ns_list) { + dprintf(2, "NVMe couldn't get namespace list.\n"); + goto failed; + } + + /* Populate namespace IDs */ + int ns_idx; + for (ns_idx = 0; + ns_idx < ARRAY_SIZE(ns_list->ns_id) + && ns_idx < ctrl->ns_count + && ns_list->ns_id[ns_idx]; + ns_idx++) { + nvme_probe_ns(ctrl, &ctrl->ns[ns_idx], ns_list->ns_id[ns_idx]); + } + + free(ns_list); + + /* If for some reason the namespace list gives us fewer namespaces, we just + go along. */ + if (ns_idx != ctrl->ns_count) { + dprintf(2, "NVMe namespace list has only %u namespaces?\n", ns_idx); + ctrl->ns_count = ns_idx; + } + + dprintf(3, "NVMe initialization complete!\n"); + return; + + out_of_memory: + warn_noalloc(); + failed: + free(ctrl->admin_sq.sqe); + free(ctrl->admin_cq.cqe); + free(ctrl->ns); + return; +} + +/* Initialize an NVMe controller and detect its drives. */ +static void +nvme_controller_setup(void *opaque) +{ + struct pci_device *pci = opaque; + + if (create_bounce_buf() < 0) + return; + + struct nvme_reg volatile *reg = pci_enable_membar(pci, PCI_BASE_ADDRESS_0); + if (!reg) + return; + + u32 version = reg->vs; + dprintf(3, "Found NVMe controller with version %u.%u.%u.\n", + version >> 16, (version >> 8) & 0xFF, version & 0xFF); + dprintf(3, " Capabilities %016llx\n", reg->cap); + + if (version < 0x00010100U) { + dprintf(3, "Need at least 1.1.0! Skipping.\n"); + return; + } + + if (~reg->cap & NVME_CAP_CSS_NVME) { + dprintf(3, "Controller doesn't speak NVMe command set. 
Skipping.\n"); + return; + } + + struct nvme_ctrl *ctrl = malloc_high(sizeof(*ctrl)); + if (!ctrl) { + warn_noalloc(); + return; + } + + memset(ctrl, 0, sizeof(*ctrl)); + + ctrl->reg = reg; + ctrl->pci = pci; + + nvme_controller_enable(ctrl); +} + +// Locate and init NVMe controllers +static void +nvme_scan(void) +{ + // Scan PCI bus for ATA adapters + struct pci_device *pci; + + foreachpci(pci) { + if (pci->class != PCI_CLASS_STORAGE_NVME) + continue; + if (pci->prog_if != 2 /* as of NVM 1.0e */) { + dprintf(3, "Found incompatble NVMe: prog-if=%02x\n", pci->prog_if); + continue; + } + + run_thread(nvme_controller_setup, pci); + } +} + +static int +nvme_cmd_readwrite(struct nvme_namespace *ns, struct disk_op_s *op, int write) +{ + int res = DISK_RET_SUCCESS; + u16 const max_blocks = NVME_PAGE_SIZE / ns->block_size; + u16 i; + + for (i = 0; i < op->count || res != DISK_RET_SUCCESS;) { + u16 blocks_remaining = op->count - i; + u16 blocks = blocks_remaining < max_blocks ? blocks_remaining + : max_blocks; + char *op_buf = op->buf_fl + i * ns->block_size; + + if (write) { + memcpy(ns->dma_buffer, op_buf, blocks * ns->block_size); + } + + res = nvme_io_readwrite(ns, op->lba + i, ns->dma_buffer, blocks, write); + dprintf(3, "ns %u %s lba %llu+%u: %d\n", ns->ns_id, write ? "write" + : "read", + op->lba + i, blocks, res); + + if (!write && res == DISK_RET_SUCCESS) { + memcpy(op_buf, ns->dma_buffer, blocks * ns->block_size); + } + + i += blocks; + } + + return res; +} + +int +nvme_process_op(struct disk_op_s *op) +{ + if (!CONFIG_NVME || !runningOnQEMU()) + return DISK_RET_SUCCESS; + + struct nvme_namespace *ns = container_of(op->drive_gf, struct nvme_namespace, + drive); + + switch (op->command) { + case CMD_READ: + case CMD_WRITE: + return nvme_cmd_readwrite(ns, op, op->command == CMD_WRITE); + default: + return default_process_op(op); + } +} + +void +nvme_setup(void) +{ + ASSERT32FLAT(); + if (!CONFIG_NVME) + return; + + dprintf(3, "init nvme\n"); + nvme_scan(); +} + +/* EOF */ diff --git a/src/hw/nvme.h b/src/hw/nvme.h new file mode 100644 index 0000000..4dbb70a --- /dev/null +++ b/src/hw/nvme.h @@ -0,0 +1,17 @@ +// External interfaces for low level NVMe support +// +// Copyright 2017 Amazon.com, Inc. or its affiliates. +// +// This file may be distributed under the terms of the GNU LGPLv3 license. + +#ifndef __NVME_H +#define __NVME_H + +#include "block.h" // struct disk_op_s + +void nvme_setup(void); +int nvme_process_op(struct disk_op_s *op); + +#endif + +/* EOF */ diff --git a/src/hw/pci_ids.h b/src/hw/pci_ids.h index cdf9b3c..4ac73b4 100644 --- a/src/hw/pci_ids.h +++ b/src/hw/pci_ids.h @@ -18,6 +18,7 @@ #define PCI_CLASS_STORAGE_SATA 0x0106 #define PCI_CLASS_STORAGE_SATA_AHCI 0x010601 #define PCI_CLASS_STORAGE_SAS 0x0107 +#define PCI_CLASS_STORAGE_NVME 0x0108 #define PCI_CLASS_STORAGE_OTHER 0x0180
#define PCI_BASE_CLASS_NETWORK 0x02
On Wed, Feb 01, 2017 at 09:55:33AM +0100, Julian Stecklina wrote:
This patch enables SeaBIOS to boot from NVMe. Finding namespaces and basic I/O works. Testing has been done in qemu and so far it works with Grub, syslinux, and the FreeBSD loader. You need a recent Qemu (>= 2.7.0), because older versions have buggy NVMe support.
The NVMe code is currently only enabled on Qemu due to lack of testing on real hardware.
Signed-off-by: Julian Stecklina jsteckli@amazon.de
Thanks Julian. I have some comments below - most of them minor. However, the use of panic() in a driver is a blocker - we don't want any drivers to use panic as its use could render a machine inoperable.
[...]
--- /dev/null +++ b/src/hw/nvme.c @@ -0,0 +1,651 @@ +// Low level NVMe disk access +// +// Copyright 2017 Amazon.com, Inc. or its affiliates. +// +// This file may be distributed under the terms of the GNU LGPLv3 license.
+#include "blockcmd.h" +#include "fw/paravirt.h" // runningOnQEMU +#include "malloc.h" // malloc_high +#include "output.h" // dprintf +#include "pci.h" +#include "pci_ids.h" // PCI_CLASS_STORAGE_NVME +#include "pci_regs.h" // PCI_BASE_ADDRESS_0 +#include "pcidevice.h" // foreachpci +#include "stacks.h" // yield +#include "std/disk.h" // DISK_RET_ +#include "string.h" // memset +#include "util.h" // boot_add_hd +#include "x86.h" // readl
+#include "nvme.h" +#include "nvme-int.h"
+static void * +zalloc_page_aligned(void *(*memalign)(u32 /* align */, u32 /* size */),
u32 size)
+{
- void *res = memalign(NVME_PAGE_SIZE, size);
- if (res) memset(res, 0, size);
- return res;
+}
memalign_high() and memalign_tmp() are thin wrappers around _malloc() - see malloc.h. Instead of passing pointers to these thin wrapper functions it would be preferable to pass the malloc zone and call _malloc() directly.
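For comparison, the zone-based form this suggests (and which the v3 patch below adopts):

    static void *
    zalloc_page_aligned(struct zone_s *zone, u32 size)
    {
        void *res = _malloc(zone, size, NVME_PAGE_SIZE);
        if (res)
            memset(res, 0, size);
        return res;
    }

    /* Callers then name the zone directly, e.g.
       zalloc_page_aligned(&ZoneHigh, ...) for queue memory and
       zalloc_page_aligned(&ZoneTmpHigh, ...) for identify buffers. */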
+static void +nvme_init_queue_common(struct nvme_ctrl *ctrl, struct nvme_queue *q, u16 q_idx,
u16 length)
+{
- memset(q, 0, sizeof(*q));
- q->dbl = (u32 *)((char *)ctrl->reg + 0x1000 + q_idx * ctrl->doorbell_stride);
- dprintf(3, " q %p q_idx %u dbl %p\n", q, q_idx, q->dbl);
- q->mask = length - 1;
+}
+static void +nvme_init_sq(struct nvme_ctrl *ctrl, struct nvme_sq *sq, u16 q_idx, u16 length,
struct nvme_cq *cq)
+{
- nvme_init_queue_common(ctrl, &sq->common, q_idx, length);
- sq->sqe = zalloc_page_aligned(memalign_high, sizeof(*sq->sqe) * length);
- dprintf(3, "sq %p q_idx %u sqe %p\n", sq, q_idx, sq->sqe);
- sq->cq = cq;
- sq->head = 0;
- sq->tail = 0;
+}
+static void +nvme_init_cq(struct nvme_ctrl *ctrl, struct nvme_cq *cq, u16 q_idx, u16 length) +{
- nvme_init_queue_common(ctrl, &cq->common, q_idx, length);
- cq->cqe = zalloc_page_aligned(memalign_high, sizeof(*cq->cqe) * length);
- cq->head = 0;
- /* All CQE phase bits are initialized to zero. This means initially we wait
for the host controller to set these to 1. */
- cq->phase = 1;
+}
+static int +nvme_poll_cq(struct nvme_cq *cq) +{
- u32 dw3 = readl(&cq->cqe[cq->head].dword[3]);
- return (!!(dw3 & NVME_CQE_DW3_P) == cq->phase);
+}
+static int +nvme_is_cqe_success(struct nvme_cqe const *cqe) +{
- return (cqe->status & 0xFF) >> 1 == 0;
+}
+static struct nvme_cqe +nvme_error_cqe(void) +{
- struct nvme_cqe r;
- /* 0xFF is a vendor specific status code != success. Should be okay for
indicating failure. */
- memset(&r, 0xFF, sizeof(r));
- return r;
+}
+static struct nvme_cqe +nvme_consume_cqe(struct nvme_sq *sq) +{
- struct nvme_cq *cq = sq->cq;
- if (!nvme_poll_cq(cq)) {
/* Cannot consume a completion queue entry, if there is none ready. */
return nvme_error_cqe();
- }
- struct nvme_cqe *cqe = &cq->cqe[cq->head];
- u16 cq_next_head = (cq->head + 1) & cq->common.mask;
- dprintf(4, "cq %p head %u -> %u\n", cq, cq->head, cq_next_head);
- if (cq_next_head < cq->head) {
dprintf(3, "cq %p wrap\n", cq);
cq->phase = ~cq->phase;
- }
- cq->head = cq_next_head;
- /* Update the submission queue head. */
- if (cqe->sq_head != sq->head) {
sq->head = cqe->sq_head;
dprintf(4, "sq %p advanced to %u\n", sq, cqe->sq_head);
- }
- /* Tell the controller that we consumed the completion. */
- writel(cq->common.dbl, cq->head);
- return *cqe;
+}
+static struct nvme_cqe +nvme_wait(struct nvme_sq *sq) +{
- static const unsigned nvme_timeout = 500 /* ms */;
- u32 to = timer_calc(nvme_timeout);
- while (!nvme_poll_cq(sq->cq)) {
yield();
if (timer_check(to)) {
warn_timeout();
return nvme_error_cqe();
}
- }
- return nvme_consume_cqe(sq);
+}
+/* Returns the next submission queue entry (or NULL if the queue is full). It
- also fills out Command Dword 0 and clears the rest. */
+static struct nvme_sqe * +nvme_get_next_sqe(struct nvme_sq *sq, u8 opc, void *metadata, void *data) +{
- if (((sq->head + 1) & sq->common.mask) == sq->tail) {
dprintf(3, "submission queue is full");
return NULL;
- }
- struct nvme_sqe *sqe = &sq->sqe[sq->tail];
- dprintf(4, "sq %p next_sqe %u\n", sq, sq->tail);
- memset(sqe, 0, sizeof(*sqe));
- sqe->cdw0 = opc | (sq->tail << 16 /* CID */);
- sqe->mptr = (u32)metadata;
- sqe->dptr_prp1 = (u32)data;
- if (sqe->dptr_prp1 & (NVME_PAGE_SIZE - 1)) {
/* Data buffer not page aligned. */
panic("data buffer not page aligned: %p\n", data);
warn_internalerror();
Can't call panic().
- }
- return sqe;
+}
+/* Call this after you've filled out an sqe that you've got from nvme_get_next_sqe. */ +static void +nvme_commit_sqe(struct nvme_sq *sq) +{
- dprintf(4, "sq %p commit_sqe %u\n", sq, sq->tail);
- sq->tail = (sq->tail + 1) & sq->common.mask;
- writel(sq->common.dbl, sq->tail);
+}
+/* Perform an identify command on the admin queue and return the resulting
- buffer. This may be a NULL pointer, if something failed. This function
- cannot be used after initialization, because it uses buffers in tmp zone. */
+static union nvme_identify * +nvme_admin_identify(struct nvme_ctrl *ctrl, u8 cns, u32 nsid) +{
- union nvme_identify *identify_buf = zalloc_page_aligned(memalign_tmp, 4096);
- if (!identify_buf) {
/* Could not allocate identify buffer. */
warn_internalerror();
return NULL;
- }
- struct nvme_sqe *cmd_identify;
- cmd_identify = nvme_get_next_sqe(&ctrl->admin_sq,
NVME_SQE_OPC_ADMIN_IDENTIFY, NULL,
identify_buf);
- if (!cmd_identify) { panic("admin queue full\n"); }
Can't call panic().
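The v3 revision below replaces this panic() with a warning and an error path that releases the identify buffer:

    if (!cmd_identify) {
        warn_internalerror();   /* warn instead of panic */
        goto error;
    }
    /* ... command is submitted and awaited here ... */
    error:
    free(identify_buf);
    return NULL;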
- cmd_identify->nsid = nsid;
- cmd_identify->dword[10] = cns;
- nvme_commit_sqe(&ctrl->admin_sq);
- struct nvme_cqe cqe = nvme_wait(&ctrl->admin_sq);
- if (!nvme_is_cqe_success(&cqe)) {
free(identify_buf);
return NULL;
- }
- return identify_buf;
+}
+static struct nvme_identify_ctrl * +nvme_admin_identify_ctrl(struct nvme_ctrl *ctrl) +{
- return &nvme_admin_identify(ctrl, NVME_ADMIN_IDENTIFY_CNS_ID_CTRL, 0)->ctrl;
+}
+static struct nvme_identify_ns_list * +nvme_admin_identify_get_ns_list(struct nvme_ctrl *ctrl) +{
- return &nvme_admin_identify(ctrl, NVME_ADMIN_IDENTIFY_CNS_GET_NS_LIST,
0)->ns_list;
+}
+static struct nvme_identify_ns * +nvme_admin_identify_ns(struct nvme_ctrl *ctrl, u32 ns_id) +{
- return &nvme_admin_identify(ctrl, NVME_ADMIN_IDENTIFY_CNS_ID_NS,
ns_id)->ns;
+}
+static void +nvme_probe_ns(struct nvme_ctrl *ctrl, struct nvme_namespace *ns, u32 ns_id) +{
- ns->ctrl = ctrl;
- ns->ns_id = ns_id;
- struct nvme_identify_ns *id = nvme_admin_identify_ns(ctrl, ns_id);
- if (!id) {
dprintf(2, "NVMe couldn't identify namespace %u.\n", ns_id);
goto free_buffer;
- }
- u8 current_lba_format = id->flbas & 0xF;
- if (current_lba_format > id->nlbaf) {
dprintf(2, "NVMe NS %u: current LBA format %u is beyond what the "
" namespace supports (%u)?\n",
ns_id, current_lba_format, id->nlbaf + 1);
goto free_buffer;
- }
- ns->lba_count = id->nsze;
- struct nvme_lba_format *fmt = &id->lbaf[current_lba_format];
- ns->block_size = 1U << fmt->lbads;
- ns->metadata_size = fmt->ms;
- if (ns->block_size > NVME_PAGE_SIZE) {
/* If we see devices that trigger this path, we need to increase our
buffer size. */
warn_internalerror();
goto free_buffer;
- }
- ns->drive.cntl_id = ns - ctrl->ns;
- ns->drive.removable = 0;
- ns->drive.type = DTYPE_NVME;
- ns->drive.blksize = ns->block_size;
- ns->drive.sectors = ns->lba_count;
- ns->dma_buffer = zalloc_page_aligned(memalign_high, NVME_PAGE_SIZE);
- char *desc = znprintf(MAXDESCSIZE, "NVMe NS %u: %llu MiB (%llu %u-byte "
"blocks + %u-byte metadata)\n",
ns_id, (ns->lba_count * ns->block_size) >> 20,
ns->lba_count, ns->block_size, ns->metadata_size);
- dprintf(3, "%s", desc);
- boot_add_hd(&ns->drive, desc, bootprio_find_pci_device(ctrl->pci));
- free_buffer:
- free (id);
- }
+/* Returns 0 on success. */ +static int +nvme_create_io_cq(struct nvme_ctrl *ctrl, struct nvme_cq *cq, u16 q_idx) +{
- struct nvme_sqe *cmd_create_cq;
- nvme_init_cq(ctrl, cq, q_idx, NVME_PAGE_SIZE / sizeof(struct nvme_cqe));
- cmd_create_cq = nvme_get_next_sqe(&ctrl->admin_sq,
NVME_SQE_OPC_ADMIN_CREATE_IO_CQ, NULL,
cq->cqe);
- if (!cmd_create_cq) {
return -1;
- }
- cmd_create_cq->dword[10] = (cq->common.mask << 16) | (q_idx >> 1);
- cmd_create_cq->dword[11] = 1 /* physically contiguous */;
- nvme_commit_sqe(&ctrl->admin_sq);
- struct nvme_cqe cqe = nvme_wait(&ctrl->admin_sq);
- if (!nvme_is_cqe_success(&cqe)) {
dprintf(2, "create io cq failed: %08x %08x %08x %08x\n",
cqe.dword[0], cqe.dword[1], cqe.dword[2], cqe.dword[3]);
return -1;
- }
- return 0;
+}
+/* Returns 0 on success. */ +static int +nvme_create_io_sq(struct nvme_ctrl *ctrl, struct nvme_sq *sq, u16 q_idx, struct nvme_cq *cq) +{
- struct nvme_sqe *cmd_create_sq;
- nvme_init_sq(ctrl, sq, q_idx, NVME_PAGE_SIZE / sizeof(struct nvme_cqe), cq);
- cmd_create_sq = nvme_get_next_sqe(&ctrl->admin_sq,
NVME_SQE_OPC_ADMIN_CREATE_IO_SQ, NULL,
sq->sqe);
- if (!cmd_create_sq) {
return -1;
- }
- cmd_create_sq->dword[10] = (sq->common.mask << 16) | (q_idx >> 1);
- cmd_create_sq->dword[11] = (q_idx >> 1) << 16 | 1 /* contiguous */;
- dprintf(3, "sq %p create dword10 %08x dword11 %08x\n", sq,
cmd_create_sq->dword[10], cmd_create_sq->dword[11]);
- nvme_commit_sqe(&ctrl->admin_sq);
- struct nvme_cqe cqe = nvme_wait(&ctrl->admin_sq);
- if (!nvme_is_cqe_success(&cqe)) {
dprintf(2, "create io sq failed: %08x %08x %08x %08x\n",
cqe.dword[0], cqe.dword[1], cqe.dword[2], cqe.dword[3]);
return -1;
- }
- return 0;
+}
+/* Reads count sectors into buf. Returns DISK_RET_*. The buffer cannot cross
- page boundaries. */
+static int +nvme_io_readwrite(struct nvme_namespace *ns, u64 lba, char *buf, u16 count,
int write)
+{
- u32 buf_addr = (u32)buf;
- if ((buf_addr & 0x3) ||
((buf_addr & ~(NVME_PAGE_SIZE - 1)) !=
((buf_addr + ns->block_size * count - 1) & ~(NVME_PAGE_SIZE - 1)))) {
/* Buffer is misaligned or crosses page boundary */
warn_internalerror();
return DISK_RET_EBADTRACK;
- }
- struct nvme_sqe *io_read = nvme_get_next_sqe(&ns->ctrl->io_sq,
write ? NVME_SQE_OPC_IO_WRITE
: NVME_SQE_OPC_IO_READ,
NULL, buf);
- io_read->nsid = ns->ns_id;
- io_read->dword[10] = (u32)lba;
- io_read->dword[11] = (u32)(lba >> 32);
- io_read->dword[12] = (1U << 31 /* limited retry */) | (count - 1);
- nvme_commit_sqe(&ns->ctrl->io_sq);
- struct nvme_cqe cqe = nvme_wait(&ns->ctrl->io_sq);
- if (!nvme_is_cqe_success(&cqe)) {
dprintf(2, "read io: %08x %08x %08x %08x\n",
cqe.dword[0], cqe.dword[1], cqe.dword[2], cqe.dword[3]);
return DISK_RET_EBADTRACK;
- }
- return DISK_RET_SUCCESS;
+}
+static int +nvme_create_io_queues(struct nvme_ctrl *ctrl) +{
- if (nvme_create_io_cq(ctrl, &ctrl->io_cq, 3))
return -1;
- if (nvme_create_io_sq(ctrl, &ctrl->io_sq, 2, &ctrl->io_cq))
return -1;
- return 0;
+}
+/* Waits for CSTS.RDY to match rdy. Returns 0 on success. */ +static int +nvme_wait_csts_rdy(struct nvme_ctrl *ctrl, unsigned rdy) +{
- u32 const max_to = 500 /* ms */ * ((ctrl->reg->cap >> 24) & 0xFFU);
- u32 to = timer_calc(max_to);
- u32 csts;
- while (rdy != ((csts = ctrl->reg->csts) & NVME_CSTS_RDY)) {
yield();
if (csts & NVME_CSTS_FATAL) {
dprintf(3, "NVMe fatal error during controller shutdown\n");
return -1;
}
if (timer_check(to)) {
warn_timeout();
return -1;
}
- }
- return 0;
+}
+static void +nvme_controller_enable(struct nvme_ctrl *ctrl) +{
- pci_enable_busmaster(ctrl->pci);
- /* Turn the controller off. */
- ctrl->reg->cc = 0;
- if (nvme_wait_csts_rdy(ctrl, 0)) {
dprintf(2, "NVMe fatal error during controller shutdown\n");
return;
- }
- ctrl->doorbell_stride = 4U << ((ctrl->reg->cap >> 32) & 0xF);
- nvme_init_cq(ctrl, &ctrl->admin_cq, 1,
NVME_PAGE_SIZE / sizeof(struct nvme_cqe));
- nvme_init_sq(ctrl, &ctrl->admin_sq, 0,
NVME_PAGE_SIZE / sizeof(struct nvme_sqe), &ctrl->admin_cq);
- ctrl->reg->aqa = ctrl->admin_cq.common.mask << 16
| ctrl->admin_sq.common.mask;
- /* Create the admin queue pair */
- if (!ctrl->admin_sq.sqe || !ctrl->admin_cq.cqe) goto out_of_memory;
- ctrl->reg->asq = (u32)ctrl->admin_sq.sqe;
- ctrl->reg->acq = (u32)ctrl->admin_cq.cqe;
- dprintf(3, " admin submission queue: %p\n", ctrl->admin_sq.sqe);
- dprintf(3, " admin completion queue: %p\n", ctrl->admin_cq.cqe);
- ctrl->reg->cc = NVME_CC_EN | (NVME_CQE_SIZE_LOG << 20)
| (NVME_SQE_SIZE_LOG << 16 /* IOSQES */);
- if (nvme_wait_csts_rdy(ctrl, 1)) {
dprintf(2, "NVMe fatal error while enabling controller\n");
return;
Shouldn't this free the allocated memory on failure (ie, goto failed)?
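The v3 revision below does exactly that, turning nvme_controller_enable() into an int-returning function and routing this case through the cleanup label:

    if (nvme_wait_csts_rdy(ctrl, 1)) {
        dprintf(2, "NVMe fatal error while enabling controller\n");
        goto failed;   /* frees the admin SQ/CQ allocations, returns -1 */
    }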
- }
- /* The admin queue is set up and the controller is ready. Let's figure out
what namespaces we have. */
- struct nvme_identify_ctrl *identify = nvme_admin_identify_ctrl(ctrl);
- if (!identify) {
dprintf(2, "NVMe couldn't identify controller.\n");
goto failed;
- }
- /* TODO Print model/serial info. */
- dprintf(3, "NVMe has %u namespace%s.\n",
identify->nn, (identify->nn == 1) ? "" : "s");
- ctrl->ns_count = identify->nn;
- free(identify);
- if ((ctrl->ns_count == 0) || nvme_create_io_queues(ctrl)) {
/* No point to continue, if the controller says it doesn't have
namespaces or we couldn't create I/O queues. */
goto failed;
- }
- ctrl->ns = malloc_fseg(sizeof(*ctrl->ns) * ctrl->ns_count);
- if (!ctrl->ns) goto out_of_memory;
- memset(ctrl->ns, 0, sizeof(*ctrl->ns) * ctrl->ns_count);
- struct nvme_identify_ns_list *ns_list = nvme_admin_identify_get_ns_list(ctrl);
- if (!ns_list) {
dprintf(2, "NVMe couldn't get namespace list.\n");
goto failed;
- }
- /* Populate namespace IDs */
- int ns_idx;
- for (ns_idx = 0;
ns_idx < ARRAY_SIZE(ns_list->ns_id)
&& ns_idx < ctrl->ns_count
&& ns_list->ns_id[ns_idx];
ns_idx++) {
nvme_probe_ns(ctrl, &ctrl->ns[ns_idx], ns_list->ns_id[ns_idx]);
- }
- free(ns_list);
- /* If for some reason the namespace list gives us fewer namespaces, we just
go along. */
- if (ns_idx != ctrl->ns_count) {
dprintf(2, "NVMe namespace list has only %u namespaces?\n", ns_idx);
ctrl->ns_count = ns_idx;
- }
- dprintf(3, "NVMe initialization complete!\n");
- return;
- out_of_memory:
- warn_noalloc();
- failed:
- free(ctrl->admin_sq.sqe);
- free(ctrl->admin_cq.cqe);
- free(ctrl->ns);
- return;
+}
+/* Initialize an NVMe controller and detect its drives. */ +static void +nvme_controller_setup(void *opaque) +{
- struct pci_device *pci = opaque;
- if (create_bounce_buf() < 0)
return;
The bounce buffer isn't used anywhere in the code, so it's not necessary to create it.
- struct nvme_reg volatile *reg = pci_enable_membar(pci, PCI_BASE_ADDRESS_0);
- if (!reg)
return;
- u32 version = reg->vs;
- dprintf(3, "Found NVMe controller with version %u.%u.%u.\n",
version >> 16, (version >> 8) & 0xFF, version & 0xFF);
- dprintf(3, " Capabilities %016llx\n", reg->cap);
- if (version < 0x00010100U) {
dprintf(3, "Need at least 1.1.0! Skipping.\n");
return;
- }
- if (~reg->cap & NVME_CAP_CSS_NVME) {
dprintf(3, "Controller doesn't speak NVMe command set. Skipping.\n");
return;
- }
- struct nvme_ctrl *ctrl = malloc_high(sizeof(*ctrl));
- if (!ctrl) {
warn_noalloc();
return;
- }
- memset(ctrl, 0, sizeof(*ctrl));
- ctrl->reg = reg;
- ctrl->pci = pci;
- nvme_controller_enable(ctrl);
Shouldn't ctrl be free'd on failure?
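Addressed in v3, where nvme_controller_enable() reports failure and the caller frees the allocation:

    if (nvme_controller_enable(ctrl)) {
        /* Initialization failed */
        free(ctrl);
    }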
+}
+// Locate and init NVMe controllers +static void +nvme_scan(void) +{
- // Scan PCI bus for ATA adapters
- struct pci_device *pci;
- foreachpci(pci) {
if (pci->class != PCI_CLASS_STORAGE_NVME)
continue;
if (pci->prog_if != 2 /* as of NVM 1.0e */) {
dprintf(3, "Found incompatble NVMe: prog-if=%02x\n", pci->prog_if);
continue;
}
run_thread(nvme_controller_setup, pci);
- }
+}
+static int +nvme_cmd_readwrite(struct nvme_namespace *ns, struct disk_op_s *op, int write) +{
- int res = DISK_RET_SUCCESS;
- u16 const max_blocks = NVME_PAGE_SIZE / ns->block_size;
- u16 i;
- for (i = 0; i < op->count || res != DISK_RET_SUCCESS;) {
u16 blocks_remaining = op->count - i;
u16 blocks = blocks_remaining < max_blocks ? blocks_remaining
: max_blocks;
char *op_buf = op->buf_fl + i * ns->block_size;
if (write) {
memcpy(ns->dma_buffer, op_buf, blocks * ns->block_size);
}
res = nvme_io_readwrite(ns, op->lba + i, ns->dma_buffer, blocks, write);
dprintf(3, "ns %u %s lba %llu+%u: %d\n", ns->ns_id, write ? "write"
: "read",
op->lba + i, blocks, res);
if (!write && res == DISK_RET_SUCCESS) {
memcpy(op_buf, ns->dma_buffer, blocks * ns->block_size);
}
i += blocks;
- }
- return res;
+}
+int +nvme_process_op(struct disk_op_s *op) +{
- if (!CONFIG_NVME || !runningOnQEMU())
return DISK_RET_SUCCESS;
- struct nvme_namespace *ns = container_of(op->drive_gf, struct nvme_namespace,
drive);
- switch (op->command) {
- case CMD_READ:
- case CMD_WRITE:
return nvme_cmd_readwrite(ns, op, op->command == CMD_WRITE);
- default:
return default_process_op(op);
- }
+}
+void +nvme_setup(void) +{
- ASSERT32FLAT();
- if (!CONFIG_NVME)
return;
With the driver set to require QEMU_HARDWARE, this test should be: if (!CONFIG_NVME || !runningOnQEMU()) return; to make sure the code is actually running on QEMU.
- dprintf(3, "init nvme\n");
- nvme_scan();
+}
+/* EOF */
-Kevin
This patch enables SeaBIOS to boot from NVMe. Finding namespaces and basic I/O works. Testing has been done in qemu and so far it works with Grub, syslinux, and the FreeBSD loader. You need a recent Qemu (>= 2.7.0), because older versions have buggy NVMe support.
The NVMe code is currently only enabled on Qemu due to lack of testing on real hardware.
Signed-off-by: Julian Stecklina jsteckli@amazon.de --- Makefile | 2 +- src/Kconfig | 6 + src/block.c | 4 + src/block.h | 1 + src/hw/nvme-int.h | 199 +++++++++++++++++ src/hw/nvme.c | 655 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ src/hw/nvme.h | 17 ++ src/hw/pci_ids.h | 1 + 8 files changed, 884 insertions(+), 1 deletion(-) create mode 100644 src/hw/nvme-int.h create mode 100644 src/hw/nvme.c create mode 100644 src/hw/nvme.h
diff --git a/Makefile b/Makefile index 3b94ee0..946df7e 100644 --- a/Makefile +++ b/Makefile @@ -43,7 +43,7 @@ SRC32FLAT=$(SRCBOTH) post.c e820map.c malloc.c romfile.c x86.c optionroms.c \ fw/paravirt.c fw/shadow.c fw/pciinit.c fw/smm.c fw/smp.c fw/mtrr.c fw/xen.c \ fw/acpi.c fw/mptable.c fw/pirtable.c fw/smbios.c fw/romfile_loader.c \ hw/virtio-ring.c hw/virtio-pci.c hw/virtio-blk.c hw/virtio-scsi.c \ - hw/tpm_drivers.c + hw/tpm_drivers.c hw/nvme.c SRC32SEG=string.c output.c pcibios.c apm.c stacks.c hw/pci.c hw/serialio.c DIRS=src src/hw src/fw vgasrc
diff --git a/src/Kconfig b/src/Kconfig index 457d082..e1b83a4 100644 --- a/src/Kconfig +++ b/src/Kconfig @@ -227,6 +227,12 @@ menu "Hardware support" help Support floppy images stored in coreboot flash or from QEMU fw_cfg. + config NVME + depends on DRIVES && QEMU_HARDWARE + bool "NVMe controllers" + default y + help + Support for NVMe disk code.
config PS2PORT depends on KEYBOARD || MOUSE diff --git a/src/block.c b/src/block.c index f7280cf..d104f6d 100644 --- a/src/block.c +++ b/src/block.c @@ -20,6 +20,7 @@ #include "hw/usb-uas.h" // uas_process_op #include "hw/virtio-blk.h" // process_virtio_blk_op #include "hw/virtio-scsi.h" // virtio_scsi_process_op +#include "hw/nvme.h" // nvme_process_op #include "malloc.h" // malloc_low #include "output.h" // dprintf #include "stacks.h" // call32 @@ -502,6 +503,7 @@ block_setup(void) megasas_setup(); pvscsi_setup(); mpt_scsi_setup(); + nvme_setup(); }
// Fallback handler for command requests not implemented by drivers @@ -571,6 +573,8 @@ process_op_32(struct disk_op_s *op) return virtio_scsi_process_op(op); case DTYPE_PVSCSI: return pvscsi_process_op(op); + case DTYPE_NVME: + return nvme_process_op(op); default: return process_op_both(op); } diff --git a/src/block.h b/src/block.h index 0f15ff9..f03ec38 100644 --- a/src/block.h +++ b/src/block.h @@ -82,6 +82,7 @@ struct drive_s { #define DTYPE_PVSCSI 0x83 #define DTYPE_MPT_SCSI 0x84 #define DTYPE_SDCARD 0x90 +#define DTYPE_NVME 0x91
#define MAXDESCSIZE 80
diff --git a/src/hw/nvme-int.h b/src/hw/nvme-int.h new file mode 100644 index 0000000..9f95dd8 --- /dev/null +++ b/src/hw/nvme-int.h @@ -0,0 +1,199 @@ +// NVMe datastructures and constants +// +// Copyright 2017 Amazon.com, Inc. or its affiliates. +// +// This file may be distributed under the terms of the GNU LGPLv3 license. + +#ifndef __NVME_INT_H +#define __NVME_INT_H + +#include "types.h" // u32 +#include "pcidevice.h" // struct pci_device + +/* Data structures */ + +/* The register file of a NVMe host controller. This struct follows the naming + scheme in the NVMe specification. */ +struct nvme_reg { + u64 cap; /* controller capabilities */ + u32 vs; /* version */ + u32 intms; /* interrupt mask set */ + u32 intmc; /* interrupt mask clear */ + u32 cc; /* controller configuration */ + u32 _res0; + u32 csts; /* controller status */ + u32 _res1; + u32 aqa; /* admin queue attributes */ + u64 asq; /* admin submission queue base address */ + u64 acq; /* admin completion queue base address */ +}; + +/* Submission queue entry */ +struct nvme_sqe { + union { + u32 dword[16]; + struct { + u32 cdw0; /* Command DWORD 0 */ + u32 nsid; /* Namespace ID */ + u64 _res0; + u64 mptr; /* metadata ptr */ + + u64 dptr_prp1; + u64 dptr_prp2; + }; + }; +}; + +/* Completion queue entry */ +struct nvme_cqe { + union { + u32 dword[4]; + struct { + u32 cdw0; + u32 _res0; + u16 sq_head; + u16 sq_id; + u16 cid; + u16 status; + }; + }; +}; + +/* The common part of every submission or completion queue. */ +struct nvme_queue { + u32 *dbl; /* doorbell */ + u16 mask; /* length - 1 */ +}; + +struct nvme_cq { + struct nvme_queue common; + struct nvme_cqe *cqe; + + /* We have read upto (but not including) this entry in the queue. */ + u16 head; + + /* The current phase bit the controller uses to indicate that it has written + a new entry. This is inverted after each wrap. */ + unsigned phase : 1; +}; + +struct nvme_sq { + struct nvme_queue common; + struct nvme_sqe *sqe; + + /* Corresponding completion queue. We only support a single SQ per CQ. */ + struct nvme_cq *cq; + + /* The last entry the controller has fetched. */ + u16 head; + + /* The last value we have written to the tail doorbell. */ + u16 tail; +}; + +struct nvme_ctrl { + struct pci_device *pci; + struct nvme_reg volatile *reg; + + u32 doorbell_stride; /* in bytes */ + + struct nvme_sq admin_sq; + struct nvme_cq admin_cq; + + u32 ns_count; + struct nvme_namespace *ns; + + struct nvme_sq io_sq; + struct nvme_cq io_cq; +}; + +struct nvme_namespace { + struct drive_s drive; + struct nvme_ctrl *ctrl; + + u32 ns_id; + + u64 lba_count; /* The total amount of sectors. */ + + u32 block_size; + u32 metadata_size; + + /* Page aligned buffer of size NVME_PAGE_SIZE. 
*/ + char *dma_buffer; +}; + +/* Data structures for NVMe admin identify commands */ + +struct nvme_identify_ctrl { + u16 vid; + u16 ssvid; + char sn[20]; + char mn[40]; + char fr[8]; + + char _boring[516 - 72]; + + u32 nn; /* number of namespaces */ +}; + +struct nvme_identify_ns_list { + u32 ns_id[1024]; +}; + +struct nvme_lba_format { + u16 ms; + u8 lbads; + u8 rp; + u8 res; +}; + +struct nvme_identify_ns { + u64 nsze; + u64 ncap; + u64 nuse; + u8 nsfeat; + u8 nlbaf; + u8 flbas; + + char _boring[128 - 27]; + + struct nvme_lba_format lbaf[16]; +}; + +union nvme_identify { + struct nvme_identify_ns ns; + struct nvme_identify_ctrl ctrl; + struct nvme_identify_ns_list ns_list; +}; + +/* NVMe constants */ + +#define NVME_CAP_CSS_NVME (1ULL << 37) + +#define NVME_CSTS_FATAL (1U << 1) +#define NVME_CSTS_RDY (1U << 0) + +#define NVME_CC_EN (1U << 0) + +#define NVME_SQE_OPC_ADMIN_CREATE_IO_SQ 1U +#define NVME_SQE_OPC_ADMIN_CREATE_IO_CQ 5U +#define NVME_SQE_OPC_ADMIN_IDENTIFY 6U + +#define NVME_SQE_OPC_IO_WRITE 1U +#define NVME_SQE_OPC_IO_READ 2U + +#define NVME_ADMIN_IDENTIFY_CNS_ID_NS 0U +#define NVME_ADMIN_IDENTIFY_CNS_ID_CTRL 1U +#define NVME_ADMIN_IDENTIFY_CNS_GET_NS_LIST 2U + +#define NVME_CQE_DW3_P (1U << 16) + +#define NVME_PAGE_SIZE 4096 + +/* Length for the queue entries. */ +#define NVME_SQE_SIZE_LOG 6 +#define NVME_CQE_SIZE_LOG 4 + +#endif + +/* EOF */ diff --git a/src/hw/nvme.c b/src/hw/nvme.c new file mode 100644 index 0000000..31edf29 --- /dev/null +++ b/src/hw/nvme.c @@ -0,0 +1,655 @@ +// Low level NVMe disk access +// +// Copyright 2017 Amazon.com, Inc. or its affiliates. +// +// This file may be distributed under the terms of the GNU LGPLv3 license. + +#include "blockcmd.h" +#include "fw/paravirt.h" // runningOnQEMU +#include "malloc.h" // malloc_high +#include "output.h" // dprintf +#include "pci.h" +#include "pci_ids.h" // PCI_CLASS_STORAGE_NVME +#include "pci_regs.h" // PCI_BASE_ADDRESS_0 +#include "pcidevice.h" // foreachpci +#include "stacks.h" // yield +#include "std/disk.h" // DISK_RET_ +#include "string.h" // memset +#include "util.h" // boot_add_hd +#include "x86.h" // readl + +#include "nvme.h" +#include "nvme-int.h" + +static void * +zalloc_page_aligned(struct zone_s *zone, u32 size) +{ + void *res = _malloc(zone, size, NVME_PAGE_SIZE); + if (res) memset(res, 0, size); + return res; +} + +static void +nvme_init_queue_common(struct nvme_ctrl *ctrl, struct nvme_queue *q, u16 q_idx, + u16 length) +{ + memset(q, 0, sizeof(*q)); + q->dbl = (u32 *)((char *)ctrl->reg + 0x1000 + q_idx * ctrl->doorbell_stride); + dprintf(3, " q %p q_idx %u dbl %p\n", q, q_idx, q->dbl); + q->mask = length - 1; +} + +static void +nvme_init_sq(struct nvme_ctrl *ctrl, struct nvme_sq *sq, u16 q_idx, u16 length, + struct nvme_cq *cq) +{ + nvme_init_queue_common(ctrl, &sq->common, q_idx, length); + sq->sqe = zalloc_page_aligned(&ZoneHigh, sizeof(*sq->sqe) * length); + dprintf(3, "sq %p q_idx %u sqe %p\n", sq, q_idx, sq->sqe); + sq->cq = cq; + sq->head = 0; + sq->tail = 0; +} + +static void +nvme_init_cq(struct nvme_ctrl *ctrl, struct nvme_cq *cq, u16 q_idx, u16 length) +{ + nvme_init_queue_common(ctrl, &cq->common, q_idx, length); + cq->cqe = zalloc_page_aligned(&ZoneHigh, sizeof(*cq->cqe) * length); + + cq->head = 0; + + /* All CQE phase bits are initialized to zero. This means initially we wait + for the host controller to set these to 1. 
*/ + cq->phase = 1; +} + +static int +nvme_poll_cq(struct nvme_cq *cq) +{ + u32 dw3 = readl(&cq->cqe[cq->head].dword[3]); + return (!!(dw3 & NVME_CQE_DW3_P) == cq->phase); +} + +static int +nvme_is_cqe_success(struct nvme_cqe const *cqe) +{ + return (cqe->status & 0xFF) >> 1 == 0; +} + + +static struct nvme_cqe +nvme_error_cqe(void) +{ + struct nvme_cqe r; + + /* 0xFF is a vendor specific status code != success. Should be okay for + indicating failure. */ + memset(&r, 0xFF, sizeof(r)); + return r; +} + +static struct nvme_cqe +nvme_consume_cqe(struct nvme_sq *sq) +{ + struct nvme_cq *cq = sq->cq; + + if (!nvme_poll_cq(cq)) { + /* Cannot consume a completion queue entry, if there is none ready. */ + return nvme_error_cqe(); + } + + struct nvme_cqe *cqe = &cq->cqe[cq->head]; + u16 cq_next_head = (cq->head + 1) & cq->common.mask; + dprintf(4, "cq %p head %u -> %u\n", cq, cq->head, cq_next_head); + if (cq_next_head < cq->head) { + dprintf(3, "cq %p wrap\n", cq); + cq->phase = ~cq->phase; + } + cq->head = cq_next_head; + + /* Update the submission queue head. */ + if (cqe->sq_head != sq->head) { + sq->head = cqe->sq_head; + dprintf(4, "sq %p advanced to %u\n", sq, cqe->sq_head); + } + + /* Tell the controller that we consumed the completion. */ + writel(cq->common.dbl, cq->head); + + return *cqe; +} + +static struct nvme_cqe +nvme_wait(struct nvme_sq *sq) +{ + static const unsigned nvme_timeout = 500 /* ms */; + u32 to = timer_calc(nvme_timeout); + while (!nvme_poll_cq(sq->cq)) { + yield(); + + if (timer_check(to)) { + warn_timeout(); + return nvme_error_cqe(); + } + } + + return nvme_consume_cqe(sq); +} + +/* Returns the next submission queue entry (or NULL if the queue is full). It + also fills out Command Dword 0 and clears the rest. */ +static struct nvme_sqe * +nvme_get_next_sqe(struct nvme_sq *sq, u8 opc, void *metadata, void *data) +{ + if (((sq->head + 1) & sq->common.mask) == sq->tail) { + dprintf(3, "submission queue is full"); + return NULL; + } + + struct nvme_sqe *sqe = &sq->sqe[sq->tail]; + dprintf(4, "sq %p next_sqe %u\n", sq, sq->tail); + + memset(sqe, 0, sizeof(*sqe)); + sqe->cdw0 = opc | (sq->tail << 16 /* CID */); + sqe->mptr = (u32)metadata; + sqe->dptr_prp1 = (u32)data; + + if (sqe->dptr_prp1 & (NVME_PAGE_SIZE - 1)) { + /* Data buffer not page aligned. */ + warn_internalerror(); + } + + return sqe; +} + +/* Call this after you've filled out an sqe that you've got from nvme_get_next_sqe. */ +static void +nvme_commit_sqe(struct nvme_sq *sq) +{ + dprintf(4, "sq %p commit_sqe %u\n", sq, sq->tail); + sq->tail = (sq->tail + 1) & sq->common.mask; + writel(sq->common.dbl, sq->tail); +} + +/* Perform an identify command on the admin queue and return the resulting + buffer. This may be a NULL pointer, if something failed. This function + cannot be used after initialization, because it uses buffers in tmp zone. */ +static union nvme_identify * +nvme_admin_identify(struct nvme_ctrl *ctrl, u8 cns, u32 nsid) +{ + union nvme_identify *identify_buf = zalloc_page_aligned(&ZoneTmpHigh, 4096); + if (!identify_buf) { + /* Could not allocate identify buffer. 
*/ + warn_internalerror(); + return NULL; + } + + struct nvme_sqe *cmd_identify; + cmd_identify = nvme_get_next_sqe(&ctrl->admin_sq, + NVME_SQE_OPC_ADMIN_IDENTIFY, NULL, + identify_buf); + + if (!cmd_identify) { + warn_internalerror(); + goto error; + } + + cmd_identify->nsid = nsid; + cmd_identify->dword[10] = cns; + + nvme_commit_sqe(&ctrl->admin_sq); + + struct nvme_cqe cqe = nvme_wait(&ctrl->admin_sq); + + if (!nvme_is_cqe_success(&cqe)) { + goto error; + } + + return identify_buf; + error: + free(identify_buf); + return NULL; +} + +static struct nvme_identify_ctrl * +nvme_admin_identify_ctrl(struct nvme_ctrl *ctrl) +{ + return &nvme_admin_identify(ctrl, NVME_ADMIN_IDENTIFY_CNS_ID_CTRL, 0)->ctrl; +} + +static struct nvme_identify_ns_list * +nvme_admin_identify_get_ns_list(struct nvme_ctrl *ctrl) +{ + return &nvme_admin_identify(ctrl, NVME_ADMIN_IDENTIFY_CNS_GET_NS_LIST, + 0)->ns_list; +} + +static struct nvme_identify_ns * +nvme_admin_identify_ns(struct nvme_ctrl *ctrl, u32 ns_id) +{ + return &nvme_admin_identify(ctrl, NVME_ADMIN_IDENTIFY_CNS_ID_NS, + ns_id)->ns; +} + +static void +nvme_probe_ns(struct nvme_ctrl *ctrl, struct nvme_namespace *ns, u32 ns_id) +{ + ns->ctrl = ctrl; + ns->ns_id = ns_id; + + struct nvme_identify_ns *id = nvme_admin_identify_ns(ctrl, ns_id); + if (!id) { + dprintf(2, "NVMe couldn't identify namespace %u.\n", ns_id); + goto free_buffer; + } + + u8 current_lba_format = id->flbas & 0xF; + if (current_lba_format > id->nlbaf) { + dprintf(2, "NVMe NS %u: current LBA format %u is beyond what the " + " namespace supports (%u)?\n", + ns_id, current_lba_format, id->nlbaf + 1); + goto free_buffer; + } + + ns->lba_count = id->nsze; + + struct nvme_lba_format *fmt = &id->lbaf[current_lba_format]; + + ns->block_size = 1U << fmt->lbads; + ns->metadata_size = fmt->ms; + + if (ns->block_size > NVME_PAGE_SIZE) { + /* If we see devices that trigger this path, we need to increase our + buffer size. */ + warn_internalerror(); + goto free_buffer; + } + + ns->drive.cntl_id = ns - ctrl->ns; + ns->drive.removable = 0; + ns->drive.type = DTYPE_NVME; + ns->drive.blksize = ns->block_size; + ns->drive.sectors = ns->lba_count; + + ns->dma_buffer = zalloc_page_aligned(&ZoneHigh, NVME_PAGE_SIZE); + + char *desc = znprintf(MAXDESCSIZE, "NVMe NS %u: %llu MiB (%llu %u-byte " + "blocks + %u-byte metadata)\n", + ns_id, (ns->lba_count * ns->block_size) >> 20, + ns->lba_count, ns->block_size, ns->metadata_size); + + dprintf(3, "%s", desc); + boot_add_hd(&ns->drive, desc, bootprio_find_pci_device(ctrl->pci)); + + free_buffer: + free (id); + } + +/* Returns 0 on success. */ +static int +nvme_create_io_cq(struct nvme_ctrl *ctrl, struct nvme_cq *cq, u16 q_idx) +{ + struct nvme_sqe *cmd_create_cq; + + nvme_init_cq(ctrl, cq, q_idx, NVME_PAGE_SIZE / sizeof(struct nvme_cqe)); + cmd_create_cq = nvme_get_next_sqe(&ctrl->admin_sq, + NVME_SQE_OPC_ADMIN_CREATE_IO_CQ, NULL, + cq->cqe); + if (!cmd_create_cq) { + return -1; + } + + cmd_create_cq->dword[10] = (cq->common.mask << 16) | (q_idx >> 1); + cmd_create_cq->dword[11] = 1 /* physically contiguous */; + + nvme_commit_sqe(&ctrl->admin_sq); + + struct nvme_cqe cqe = nvme_wait(&ctrl->admin_sq); + + if (!nvme_is_cqe_success(&cqe)) { + dprintf(2, "create io cq failed: %08x %08x %08x %08x\n", + cqe.dword[0], cqe.dword[1], cqe.dword[2], cqe.dword[3]); + + return -1; + } + + return 0; +} + +/* Returns 0 on success. 
*/ +static int +nvme_create_io_sq(struct nvme_ctrl *ctrl, struct nvme_sq *sq, u16 q_idx, struct nvme_cq *cq) +{ + struct nvme_sqe *cmd_create_sq; + + nvme_init_sq(ctrl, sq, q_idx, NVME_PAGE_SIZE / sizeof(struct nvme_cqe), cq); + cmd_create_sq = nvme_get_next_sqe(&ctrl->admin_sq, + NVME_SQE_OPC_ADMIN_CREATE_IO_SQ, NULL, + sq->sqe); + if (!cmd_create_sq) { + return -1; + } + + cmd_create_sq->dword[10] = (sq->common.mask << 16) | (q_idx >> 1); + cmd_create_sq->dword[11] = (q_idx >> 1) << 16 | 1 /* contiguous */; + dprintf(3, "sq %p create dword10 %08x dword11 %08x\n", sq, + cmd_create_sq->dword[10], cmd_create_sq->dword[11]); + + nvme_commit_sqe(&ctrl->admin_sq); + + struct nvme_cqe cqe = nvme_wait(&ctrl->admin_sq); + + if (!nvme_is_cqe_success(&cqe)) { + dprintf(2, "create io sq failed: %08x %08x %08x %08x\n", + cqe.dword[0], cqe.dword[1], cqe.dword[2], cqe.dword[3]); + return -1; + } + + return 0; +} + +/* Reads count sectors into buf. Returns DISK_RET_*. The buffer cannot cross + page boundaries. */ +static int +nvme_io_readwrite(struct nvme_namespace *ns, u64 lba, char *buf, u16 count, + int write) +{ + u32 buf_addr = (u32)buf; + + if ((buf_addr & 0x3) || + ((buf_addr & ~(NVME_PAGE_SIZE - 1)) != + ((buf_addr + ns->block_size * count - 1) & ~(NVME_PAGE_SIZE - 1)))) { + /* Buffer is misaligned or crosses page boundary */ + warn_internalerror(); + return DISK_RET_EBADTRACK; + } + + struct nvme_sqe *io_read = nvme_get_next_sqe(&ns->ctrl->io_sq, + write ? NVME_SQE_OPC_IO_WRITE + : NVME_SQE_OPC_IO_READ, + NULL, buf); + io_read->nsid = ns->ns_id; + io_read->dword[10] = (u32)lba; + io_read->dword[11] = (u32)(lba >> 32); + io_read->dword[12] = (1U << 31 /* limited retry */) | (count - 1); + + nvme_commit_sqe(&ns->ctrl->io_sq); + + struct nvme_cqe cqe = nvme_wait(&ns->ctrl->io_sq); + + if (!nvme_is_cqe_success(&cqe)) { + dprintf(2, "read io: %08x %08x %08x %08x\n", + cqe.dword[0], cqe.dword[1], cqe.dword[2], cqe.dword[3]); + + return DISK_RET_EBADTRACK; + } + + return DISK_RET_SUCCESS; +} + + +static int +nvme_create_io_queues(struct nvme_ctrl *ctrl) +{ + if (nvme_create_io_cq(ctrl, &ctrl->io_cq, 3)) + return -1; + + if (nvme_create_io_sq(ctrl, &ctrl->io_sq, 2, &ctrl->io_cq)) + return -1; + + return 0; +} + +/* Waits for CSTS.RDY to match rdy. Returns 0 on success. */ +static int +nvme_wait_csts_rdy(struct nvme_ctrl *ctrl, unsigned rdy) +{ + u32 const max_to = 500 /* ms */ * ((ctrl->reg->cap >> 24) & 0xFFU); + u32 to = timer_calc(max_to); + u32 csts; + + while (rdy != ((csts = ctrl->reg->csts) & NVME_CSTS_RDY)) { + yield(); + + if (csts & NVME_CSTS_FATAL) { + dprintf(3, "NVMe fatal error during controller shutdown\n"); + return -1; + } + + if (timer_check(to)) { + warn_timeout(); + return -1; + } + } + + return 0; +} + +/* Returns 0 on success. */ +static int +nvme_controller_enable(struct nvme_ctrl *ctrl) +{ + pci_enable_busmaster(ctrl->pci); + + /* Turn the controller off. 
*/ + ctrl->reg->cc = 0; + if (nvme_wait_csts_rdy(ctrl, 0)) { + dprintf(2, "NVMe fatal error during controller shutdown\n"); + return -1; + } + + ctrl->doorbell_stride = 4U << ((ctrl->reg->cap >> 32) & 0xF); + + nvme_init_cq(ctrl, &ctrl->admin_cq, 1, + NVME_PAGE_SIZE / sizeof(struct nvme_cqe)); + + nvme_init_sq(ctrl, &ctrl->admin_sq, 0, + NVME_PAGE_SIZE / sizeof(struct nvme_sqe), &ctrl->admin_cq); + + ctrl->reg->aqa = ctrl->admin_cq.common.mask << 16 + | ctrl->admin_sq.common.mask; + + /* Bail out if the admin queue pair couldn't be allocated */ + if (!ctrl->admin_sq.sqe || !ctrl->admin_cq.cqe) goto out_of_memory; + + ctrl->reg->asq = (u32)ctrl->admin_sq.sqe; + ctrl->reg->acq = (u32)ctrl->admin_cq.cqe; + + dprintf(3, " admin submission queue: %p\n", ctrl->admin_sq.sqe); + dprintf(3, " admin completion queue: %p\n", ctrl->admin_cq.cqe); + + ctrl->reg->cc = NVME_CC_EN | (NVME_CQE_SIZE_LOG << 20 /* IOCQES */) + | (NVME_SQE_SIZE_LOG << 16 /* IOSQES */); + + if (nvme_wait_csts_rdy(ctrl, 1)) { + dprintf(2, "NVMe fatal error while enabling controller\n"); + goto failed; + } + /* The admin queue is set up and the controller is ready. Let's figure out + what namespaces we have. */ + + struct nvme_identify_ctrl *identify = nvme_admin_identify_ctrl(ctrl); + + if (!identify) { + dprintf(2, "NVMe couldn't identify controller.\n"); + goto failed; + } + + /* TODO Print model/serial info. */ + dprintf(3, "NVMe has %u namespace%s.\n", + identify->nn, (identify->nn == 1) ? "" : "s"); + + ctrl->ns_count = identify->nn; + free(identify); + + if ((ctrl->ns_count == 0) || nvme_create_io_queues(ctrl)) { + /* No point in continuing if the controller says it doesn't have + namespaces or we couldn't create I/O queues. */ + goto failed; + } + + ctrl->ns = malloc_fseg(sizeof(*ctrl->ns) * ctrl->ns_count); + if (!ctrl->ns) goto out_of_memory; + memset(ctrl->ns, 0, sizeof(*ctrl->ns) * ctrl->ns_count); + + struct nvme_identify_ns_list *ns_list = nvme_admin_identify_get_ns_list(ctrl); + if (!ns_list) { + dprintf(2, "NVMe couldn't get namespace list.\n"); + goto failed; + } + + /* Populate namespace IDs */ + int ns_idx; + for (ns_idx = 0; + ns_idx < ARRAY_SIZE(ns_list->ns_id) + && ns_idx < ctrl->ns_count + && ns_list->ns_id[ns_idx]; + ns_idx++) { + nvme_probe_ns(ctrl, &ctrl->ns[ns_idx], ns_list->ns_id[ns_idx]); + } + + free(ns_list); + + /* If the namespace list gives us fewer namespaces than the controller + advertised, just go with that. */ + if (ns_idx != ctrl->ns_count) { + dprintf(2, "NVMe namespace list has only %u namespaces?\n", ns_idx); + ctrl->ns_count = ns_idx; + } + + dprintf(3, "NVMe initialization complete!\n"); + return 0; + + out_of_memory: + warn_noalloc(); + failed: + free(ctrl->admin_sq.sqe); + free(ctrl->admin_cq.cqe); + free(ctrl->ns); + return -1; +} + +/* Initialize an NVMe controller and detect its drives. */ +static void +nvme_controller_setup(void *opaque) +{ + struct pci_device *pci = opaque; + + struct nvme_reg volatile *reg = pci_enable_membar(pci, PCI_BASE_ADDRESS_0); + if (!reg) + return; + + u32 version = reg->vs; + dprintf(3, "Found NVMe controller with version %u.%u.%u.\n", + version >> 16, (version >> 8) & 0xFF, version & 0xFF); + dprintf(3, " Capabilities %016llx\n", reg->cap); + + if (version < 0x00010100U) { + dprintf(3, "Need at least 1.1.0! Skipping.\n"); + return; + } + + if (~reg->cap & NVME_CAP_CSS_NVME) { + dprintf(3, "Controller doesn't speak NVMe command set. Skipping.\n"); + return; + } + + struct nvme_ctrl *ctrl = malloc_high(sizeof(*ctrl)); + if (!ctrl) { + warn_noalloc(); + return; + } + + memset(ctrl, 0, sizeof(*ctrl)); + + ctrl->reg = reg; + ctrl->pci = pci; + + if (nvme_controller_enable(ctrl)) { + /* Initialization failed */ + free(ctrl); + } +} + +// Locate and init NVMe controllers +static void +nvme_scan(void) +{ + // Scan PCI bus for NVMe controllers + struct pci_device *pci; + + foreachpci(pci) { + if (pci->class != PCI_CLASS_STORAGE_NVME) + continue; + if (pci->prog_if != 2 /* as of NVM 1.0e */) { + dprintf(3, "Found incompatible NVMe: prog-if=%02x\n", pci->prog_if); + continue; + } + + run_thread(nvme_controller_setup, pci); + } +} + +static int +nvme_cmd_readwrite(struct nvme_namespace *ns, struct disk_op_s *op, int write) +{ + int res = DISK_RET_SUCCESS; + u16 const max_blocks = NVME_PAGE_SIZE / ns->block_size; + u16 i; + + for (i = 0; i < op->count && res == DISK_RET_SUCCESS;) { + u16 blocks_remaining = op->count - i; + u16 blocks = blocks_remaining < max_blocks ? blocks_remaining + : max_blocks; + char *op_buf = op->buf_fl + i * ns->block_size; + + if (write) { + memcpy(ns->dma_buffer, op_buf, blocks * ns->block_size); + } + + res = nvme_io_readwrite(ns, op->lba + i, ns->dma_buffer, blocks, write); + dprintf(3, "ns %u %s lba %llu+%u: %d\n", ns->ns_id, write ? "write" + : "read", + op->lba + i, blocks, res); + + if (!write && res == DISK_RET_SUCCESS) { + memcpy(op_buf, ns->dma_buffer, blocks * ns->block_size); + } + + i += blocks; + } + + return res; +} + +int +nvme_process_op(struct disk_op_s *op) +{ + if (!CONFIG_NVME || !runningOnQEMU()) + return DISK_RET_SUCCESS; + + struct nvme_namespace *ns = container_of(op->drive_gf, struct nvme_namespace, + drive); + + switch (op->command) { + case CMD_READ: + case CMD_WRITE: + return nvme_cmd_readwrite(ns, op, op->command == CMD_WRITE); + default: + return default_process_op(op); + } +} + +void +nvme_setup(void) +{ + ASSERT32FLAT(); + if (!CONFIG_NVME || !runningOnQEMU()) + return; + + dprintf(3, "init nvme\n"); + nvme_scan(); +} + +/* EOF */ diff --git a/src/hw/nvme.h b/src/hw/nvme.h new file mode 100644 index 0000000..4dbb70a --- /dev/null +++ b/src/hw/nvme.h @@ -0,0 +1,17 @@ +// External interfaces for low level NVMe support +// +// Copyright 2017 Amazon.com, Inc. or its affiliates. +// +// This file may be distributed under the terms of the GNU LGPLv3 license. + +#ifndef __NVME_H +#define __NVME_H + +#include "block.h" // struct disk_op_s + +void nvme_setup(void); +int nvme_process_op(struct disk_op_s *op); + +#endif + +/* EOF */ diff --git a/src/hw/pci_ids.h b/src/hw/pci_ids.h index cdf9b3c..4ac73b4 100644 --- a/src/hw/pci_ids.h +++ b/src/hw/pci_ids.h @@ -18,6 +18,7 @@ #define PCI_CLASS_STORAGE_SATA 0x0106 #define PCI_CLASS_STORAGE_SATA_AHCI 0x010601 #define PCI_CLASS_STORAGE_SAS 0x0107 +#define PCI_CLASS_STORAGE_NVME 0x0108 #define PCI_CLASS_STORAGE_OTHER 0x0180
#define PCI_BASE_CLASS_NETWORK 0x02
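A note on the data path in the patch above: nvme_cmd_readwrite() never DMAs into the caller's buffer. Every transfer is bounced through the namespace's single page-aligned DMA buffer, so each NVMe command moves at most NVME_PAGE_SIZE bytes and never crosses a page boundary, which is what lets nvme_io_readwrite() get away without building PRP lists. The following standalone sketch (not part of the patch; split_request, fake_io, and the constants are made up for illustration) reproduces the same chunking arithmetic:

/* Standalone sketch: how a BIOS disk request gets split into I/Os that
 * fit a one-page bounce buffer. Compile with any C99 compiler. */
#include <stdio.h>
#include <stdint.h>

#define NVME_PAGE_SIZE 4096u

/* Hypothetical stand-in for nvme_io_readwrite(): one command per chunk. */
static int fake_io(uint64_t lba, uint16_t blocks)
{
    printf("  I/O: lba=%llu count=%u\n",
           (unsigned long long)lba, (unsigned)blocks);
    return 0; /* DISK_RET_SUCCESS */
}

static int split_request(uint64_t lba, uint16_t count, uint32_t block_size)
{
    /* Same bound the driver computes: blocks per bounce-buffer page. */
    uint16_t max_blocks = NVME_PAGE_SIZE / block_size;
    uint16_t i;
    int res = 0;

    for (i = 0; i < count && res == 0;) {
        uint16_t remaining = count - i;
        uint16_t blocks = remaining < max_blocks ? remaining : max_blocks;
        /* The real driver memcpys into ns->dma_buffer before a write and
           out of it after a successful read. */
        res = fake_io(lba + i, blocks);
        i += blocks;
    }
    return res;
}

int main(void)
{
    /* 20 sectors of 512 bytes => chunks of 8 + 8 + 4 sectors. */
    return split_request(1000, 20, 512);
}

Running it for a 20-sector request with 512-byte blocks prints three I/Os of 8, 8, and 4 sectors. The extra memcpy per chunk costs some throughput, but for a boot loader the simplicity is a reasonable trade.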
On Mon, Feb 13, 2017 at 10:03:59AM +0100, Julian Stecklina wrote:
This patch enables SeaBIOS to boot from NVMe. Finding namespaces and basic I/O works. Testing has been done in qemu and so far it works with Grub, syslinux, and the FreeBSD loader. You need a recent Qemu (>= 2.7.0), because older versions have buggy NVMe support.
The NVMe code is currently only enabled on Qemu due to lack of testing on real hardware.
Thanks. I committed this patch.
-Kevin
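For readers wondering how nvme_wait() in the patch can detect a new completion without interrupts: NVMe completion queues use a phase bit that the controller writes into each entry's status field and inverts on every wrap of the queue, so a freshly written entry is distinguishable from a stale one without zeroing memory. Below is a minimal, self-contained simulation of that protocol (this is not the driver code; the queue length, the structures, and the simulated device are illustrative only):

/* Standalone sketch of the NVMe completion-queue phase-bit protocol. */
#include <stdio.h>
#include <stdint.h>

#define QUEUE_LEN 4        /* illustrative; real queues are larger */
#define STATUS_PHASE 0x1   /* assume the phase bit sits in bit 0 here */

struct cqe {
    uint16_t cid;      /* command identifier */
    uint16_t status;   /* status field carrying the phase bit */
};

static struct cqe cq[QUEUE_LEN]; /* zero-initialized: all entries stale */

/* Simulated controller posting a completion with its current phase. */
static void device_post(unsigned slot, uint16_t cid, unsigned phase)
{
    cq[slot].cid = cid;
    cq[slot].status = phase ? STATUS_PHASE : 0;
}

/* Host side: is the entry at 'head' new for the phase we expect? */
static int cqe_ready(unsigned head, unsigned phase)
{
    return (cq[head].status & STATUS_PHASE) == (phase ? STATUS_PHASE : 0);
}

int main(void)
{
    unsigned head = 0, phase = 1; /* host starts out expecting phase 1 */

    for (unsigned i = 0; i < 6; i++) {
        /* The controller's phase starts at 1 and inverts on each wrap. */
        unsigned dev_phase = ((i / QUEUE_LEN) % 2) ? 0 : 1;
        device_post(i % QUEUE_LEN, (uint16_t)i, dev_phase);

        while (!cqe_ready(head, phase))
            ; /* the real driver yields to other SeaBIOS threads here */

        printf("completed cid %u (head=%u phase=%u)\n",
               (unsigned)cq[head].cid, head, phase);

        if (++head == QUEUE_LEN) { /* wrap: flip the expected phase */
            head = 0;
            phase ^= 1;
        }
    }
    return 0;
}

The six completions deliberately wrap the four-entry queue once, so the host's expected phase flips from 1 to 0 partway through, mirroring what the driver's per-queue phase field tracks.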