While looking at VM bootup times, we stumbled over the fact that the NVMe code only does I/O operations of up to 4kb at a given point in time. This is usually ok, but if you have an OS that loads a lot of data on boot in combination to network backed storage, it shows in bootup times.
There is no need to restrict ourselves to 4kb though. The INT13 call we receive gives us much larger chunks which we can just map into a native bigger NVMe I/O call if the request buffer is page aligned.
This patch implements all logic required to do the above and gives a substantial performance boost on boot.
v1 -> v2:
- fix dprintf - Fix bounds check on PRP list add logic - Reduce PRP list to 15 entries embedded in the ns struct. This reduces BIOS reserved memory footprint by 4kb.
Alexander Graf (3): nvme: Record maximum allowed request size nvme: Allow to set PRP2 nvme: Pass large I/O requests as PRP lists
src/hw/nvme-int.h | 16 ++++++++- src/hw/nvme.c | 106 +++++++++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 105 insertions(+), 17 deletions(-)
NVMe has a limit on how many sectors it can handle at most within a single request. Remember that number, so that in a follow-up patch, we can verify that we don't exceed it.
Signed-off-by: Alexander Graf graf@amazon.com
---
v1 -> v2:
- fix dprintf --- src/hw/nvme-int.h | 8 +++++++- src/hw/nvme.c | 13 +++++++++++-- 2 files changed, 18 insertions(+), 3 deletions(-)
diff --git a/src/hw/nvme-int.h b/src/hw/nvme-int.h index 9f95dd8..674008a 100644 --- a/src/hw/nvme-int.h +++ b/src/hw/nvme-int.h @@ -117,6 +117,7 @@ struct nvme_namespace {
u32 block_size; u32 metadata_size; + u32 max_req_size;
/* Page aligned buffer of size NVME_PAGE_SIZE. */ char *dma_buffer; @@ -131,7 +132,12 @@ struct nvme_identify_ctrl { char mn[40]; char fr[8];
- char _boring[516 - 72]; + u8 rab; + u8 ieee[3]; + u8 cmic; + u8 mdts; + + char _boring[516 - 78];
u32 nn; /* number of namespaces */ }; diff --git a/src/hw/nvme.c b/src/hw/nvme.c index 6a01204..7e56134 100644 --- a/src/hw/nvme.c +++ b/src/hw/nvme.c @@ -238,7 +238,8 @@ nvme_admin_identify_ns(struct nvme_ctrl *ctrl, u32 ns_id) }
static void -nvme_probe_ns(struct nvme_ctrl *ctrl, struct nvme_namespace *ns, u32 ns_id) +nvme_probe_ns(struct nvme_ctrl *ctrl, struct nvme_namespace *ns, u32 ns_id, + u8 mdts) { ns->ctrl = ctrl; ns->ns_id = ns_id; @@ -281,6 +282,14 @@ nvme_probe_ns(struct nvme_ctrl *ctrl, struct nvme_namespace *ns, u32 ns_id) ns->drive.blksize = ns->block_size; ns->drive.sectors = ns->lba_count;
+ if (mdts) { + ns->max_req_size = 1U << mdts; + dprintf(3, "NVME NS %u max request size: %d sectors\n", + ns_id, ns->max_req_size); + } else { + ns->max_req_size = -1U; + } + ns->dma_buffer = zalloc_page_aligned(&ZoneHigh, NVME_PAGE_SIZE);
char *desc = znprintf(MAXDESCSIZE, "NVMe NS %u: %llu MiB (%llu %u-byte " @@ -567,7 +576,7 @@ nvme_controller_enable(struct nvme_ctrl *ctrl) /* Populate namespace IDs */ int ns_idx; for (ns_idx = 0; ns_idx < ctrl->ns_count; ns_idx++) { - nvme_probe_ns(ctrl, &ctrl->ns[ns_idx], ns_idx + 1); + nvme_probe_ns(ctrl, &ctrl->ns[ns_idx], ns_idx + 1, identify->mdts); }
dprintf(3, "NVMe initialization complete!\n");
When creating a PRP based I/O request, we pass in the pointer to operate on. Going forward, we will want to be able to pass additional pointers though for mappings above 4k.
This patch adds a parameter to nvme_get_next_sqe() to pass in the PRP2 value of an NVMe I/O request, paving the way for a future patch to implement PRP lists.
Signed-off-by: Alexander Graf graf@amazon.com Reviewed-by: Filippo Sironi sironi@amazon.de --- src/hw/nvme.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-)
diff --git a/src/hw/nvme.c b/src/hw/nvme.c index 7e56134..04bbba4 100644 --- a/src/hw/nvme.c +++ b/src/hw/nvme.c @@ -152,7 +152,7 @@ nvme_wait(struct nvme_sq *sq) /* Returns the next submission queue entry (or NULL if the queue is full). It also fills out Command Dword 0 and clears the rest. */ static struct nvme_sqe * -nvme_get_next_sqe(struct nvme_sq *sq, u8 opc, void *metadata, void *data) +nvme_get_next_sqe(struct nvme_sq *sq, u8 opc, void *metadata, void *data, void *data2) { if (((sq->head + 1) & sq->common.mask) == sq->tail) { dprintf(3, "submission queue is full"); @@ -166,6 +166,7 @@ nvme_get_next_sqe(struct nvme_sq *sq, u8 opc, void *metadata, void *data) sqe->cdw0 = opc | (sq->tail << 16 /* CID */); sqe->mptr = (u32)metadata; sqe->dptr_prp1 = (u32)data; + sqe->dptr_prp2 = (u32)data2;
if (sqe->dptr_prp1 & (NVME_PAGE_SIZE - 1)) { /* Data buffer not page aligned. */ @@ -200,7 +201,7 @@ nvme_admin_identify(struct nvme_ctrl *ctrl, u8 cns, u32 nsid) struct nvme_sqe *cmd_identify; cmd_identify = nvme_get_next_sqe(&ctrl->admin_sq, NVME_SQE_OPC_ADMIN_IDENTIFY, NULL, - identify_buf); + identify_buf, NULL);
if (!cmd_identify) { warn_internalerror(); @@ -338,7 +339,7 @@ nvme_create_io_cq(struct nvme_ctrl *ctrl, struct nvme_cq *cq, u16 q_idx)
cmd_create_cq = nvme_get_next_sqe(&ctrl->admin_sq, NVME_SQE_OPC_ADMIN_CREATE_IO_CQ, NULL, - cq->cqe); + cq->cqe, NULL); if (!cmd_create_cq) { goto err_destroy_cq; } @@ -382,7 +383,7 @@ nvme_create_io_sq(struct nvme_ctrl *ctrl, struct nvme_sq *sq, u16 q_idx, struct
cmd_create_sq = nvme_get_next_sqe(&ctrl->admin_sq, NVME_SQE_OPC_ADMIN_CREATE_IO_SQ, NULL, - sq->sqe); + sq->sqe, NULL); if (!cmd_create_sq) { goto err_destroy_sq; } @@ -429,7 +430,7 @@ nvme_io_readwrite(struct nvme_namespace *ns, u64 lba, char *buf, u16 count, struct nvme_sqe *io_read = nvme_get_next_sqe(&ns->ctrl->io_sq, write ? NVME_SQE_OPC_IO_WRITE : NVME_SQE_OPC_IO_READ, - NULL, buf); + NULL, buf, NULL); io_read->nsid = ns->ns_id; io_read->dword[10] = (u32)lba; io_read->dword[11] = (u32)(lba >> 32);
Today, we split every I/O request into at most 4kb chunks and wait for these requests to finish. We encountered issues where the backing storage is network based, so every I/O request needs to go over the network with associated latency cost. A few ms of latency when loading 100MB initrd in 4kb chunks does add up.
NVMe implements a feature to allow I/O requests spanning multiple pages, called PRP lists. This patch takes larger I/O operations and checks if they can be directly passed to the NVMe backing device as PRP list. At least for grub, read operations can always be mapped directly into PRP list items.
This reduces the number of I/O operations required during a typical boot path by roughly a factor of 5.
Signed-off-by: Alexander Graf graf@amazon.com
---
v1 -> v2:
- Fix bounds check on PRP list add logic - Reduce PRP list to 15 entries embedded in the ns struct. This reduces BIOS reserved memory footprint by 4kb. --- src/hw/nvme-int.h | 8 ++++++ src/hw/nvme.c | 84 ++++++++++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 82 insertions(+), 10 deletions(-)
diff --git a/src/hw/nvme-int.h b/src/hw/nvme-int.h index 674008a..7947b29 100644 --- a/src/hw/nvme-int.h +++ b/src/hw/nvme-int.h @@ -10,6 +10,8 @@ #include "types.h" // u32 #include "pcidevice.h" // struct pci_device
+#define NVME_MAX_PRPL_ENTRIES 15 /* Allows requests up to 64kb */ + /* Data structures */
/* The register file of a NVMe host controller. This struct follows the naming @@ -121,6 +123,11 @@ struct nvme_namespace {
/* Page aligned buffer of size NVME_PAGE_SIZE. */ char *dma_buffer; + + /* Page List */ + u32 prpl_len; + void *prp1; + u64 prpl[NVME_MAX_PRPL_ENTRIES]; };
/* Data structures for NVMe admin identify commands */ @@ -195,6 +202,7 @@ union nvme_identify { #define NVME_CQE_DW3_P (1U << 16)
#define NVME_PAGE_SIZE 4096 +#define NVME_PAGE_MASK ~(NVME_PAGE_SIZE - 1)
/* Length for the queue entries. */ #define NVME_SQE_SIZE_LOG 6 diff --git a/src/hw/nvme.c b/src/hw/nvme.c index 04bbba4..ec05836 100644 --- a/src/hw/nvme.c +++ b/src/hw/nvme.c @@ -168,11 +168,6 @@ nvme_get_next_sqe(struct nvme_sq *sq, u8 opc, void *metadata, void *data, void * sqe->dptr_prp1 = (u32)data; sqe->dptr_prp2 = (u32)data2;
- if (sqe->dptr_prp1 & (NVME_PAGE_SIZE - 1)) { - /* Data buffer not page aligned. */ - warn_internalerror(); - } - return sqe; }
@@ -418,19 +413,29 @@ nvme_io_readwrite(struct nvme_namespace *ns, u64 lba, char *buf, u16 count, int write) { u32 buf_addr = (u32)buf; + void *prp2;
- if ((buf_addr & 0x3) || - ((buf_addr & ~(NVME_PAGE_SIZE - 1)) != - ((buf_addr + ns->block_size * count - 1) & ~(NVME_PAGE_SIZE - 1)))) { - /* Buffer is misaligned or crosses page boundary */ + if (buf_addr & 0x3) { + /* Buffer is misaligned */ warn_internalerror(); return DISK_RET_EBADTRACK; }
+ if ((ns->block_size * count) > (NVME_PAGE_SIZE * 2)) { + /* We need to describe more than 2 pages, rely on PRP List */ + prp2 = ns->prpl; + } else if ((ns->block_size * count) > NVME_PAGE_SIZE) { + /* Directly embed the 2nd page if we only need 2 pages */ + prp2 = (void *)(long)ns->prpl[0]; + } else { + /* One page is enough, don't expose anything else */ + prp2 = NULL; + } + struct nvme_sqe *io_read = nvme_get_next_sqe(&ns->ctrl->io_sq, write ? NVME_SQE_OPC_IO_WRITE : NVME_SQE_OPC_IO_READ, - NULL, buf, NULL); + NULL, buf, prp2); io_read->nsid = ns->ns_id; io_read->dword[10] = (u32)lba; io_read->dword[11] = (u32)(lba >> 32); @@ -450,6 +455,60 @@ nvme_io_readwrite(struct nvme_namespace *ns, u64 lba, char *buf, u16 count, return DISK_RET_SUCCESS; }
+static void nvme_reset_prpl(struct nvme_namespace *ns) +{ + ns->prpl_len = 0; +} + +static int nvme_add_prpl(struct nvme_namespace *ns, u64 base) +{ + if (ns->prpl_len >= NVME_MAX_PRPL_ENTRIES) + return -1; + + ns->prpl[ns->prpl_len++] = base; + + return 0; +} + +int nvme_build_prpl(struct nvme_namespace *ns, struct disk_op_s *op) +{ + int first_page = 1; + u32 base = (long)op->buf_fl; + s32 size = op->count * ns->block_size; + + if (op->count > ns->max_req_size) + return -1; + + nvme_reset_prpl(ns); + + /* Special case for transfers that fit into PRP1, but are unaligned */ + if (((size + (base & ~NVME_PAGE_MASK)) <= NVME_PAGE_SIZE)) { + ns->prp1 = op->buf_fl; + return 0; + } + + /* Every request has to be page aligned */ + if (base & ~NVME_PAGE_MASK) + return -1; + + /* Make sure a full block fits into the last chunk */ + if (size & (ns->block_size - 1ULL)) + return -1; + + for (; size > 0; base += NVME_PAGE_SIZE, size -= NVME_PAGE_SIZE) { + if (first_page) { + /* First page is special */ + ns->prp1 = (void*)base; + first_page = 0; + continue; + } + if (nvme_add_prpl(ns, base)) + return -1; + } + + return 0; +} + static int nvme_create_io_queues(struct nvme_ctrl *ctrl) { @@ -668,6 +727,11 @@ nvme_cmd_readwrite(struct nvme_namespace *ns, struct disk_op_s *op, int write) u16 const max_blocks = NVME_PAGE_SIZE / ns->block_size; u16 i;
+ if (!nvme_build_prpl(ns, op)) { + /* Request goes via PRP List logic */ + return nvme_io_readwrite(ns, op->lba, ns->prp1, op->count, write); + } + for (i = 0; i < op->count && res == DISK_RET_SUCCESS;) { u16 blocks_remaining = op->count - i; u16 blocks = blocks_remaining < max_blocks ? blocks_remaining