Hi Alex,
I was looking at your recent fix for SeaBIOS NVME. I agree that it is not valid to write to the f-segment during runtime. However, I was struggling to understand the PRP list management in the nvme.c code.
I came up with a different implementation that avoids allocating another buffer in the "high zone". Instead, the code in this patch series builds the page list in the existing "dma bounce buffer".
I don't have a good way to test this on real hardware. It passes simple qemu tests (both with and without kvm). For example:
qemu-system-x86_64 -k en-us -snapshot -L test -chardev stdio,id=seabios -device isa-debugcon,iobase=0x402,chardev=seabios -m 512 -drive file=dos-drivec,if=none,id=drive0 -device nvme,drive=drive0,serial=nvme-1 -boot menu=on
Thanks, -Kevin
Kevin O'Connor (5): nvme: Rework nvme_io_readwrite() to return -1 on error nvme: Add nvme_bounce_xfer() helper function nvme: Convert nvme_build_prpl() to nvme_prpl_xfer() nvme: Pass prp1 and prp2 directly to nvme_io_xfer() nvme: Build the page list in the existing dma buffer
src/hw/nvme-int.h | 7 --- src/hw/nvme.c | 143 ++++++++++++++++++++-------------------------- 2 files changed, 61 insertions(+), 89 deletions(-)
Rename nvme_io_readwrite() to nvme_io_xfer() and change it so it implements the debugging dprintf() and it returns -1 on an error.
Signed-off-by: Kevin O'Connor kevin@koconnor.net --- src/hw/nvme.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-)
diff --git a/src/hw/nvme.c b/src/hw/nvme.c index f035fa2..a97501b 100644 --- a/src/hw/nvme.c +++ b/src/hw/nvme.c @@ -417,8 +417,8 @@ err: /* Reads count sectors into buf. Returns DISK_RET_*. The buffer cannot cross page boundaries. */ static int -nvme_io_readwrite(struct nvme_namespace *ns, u64 lba, char *buf, u16 count, - int write) +nvme_io_xfer(struct nvme_namespace *ns, u64 lba, void *buf, u16 count, + int write) { u32 buf_addr = (u32)buf; void *prp2; @@ -426,7 +426,7 @@ nvme_io_readwrite(struct nvme_namespace *ns, u64 lba, char *buf, u16 count, if (buf_addr & 0x3) { /* Buffer is misaligned */ warn_internalerror(); - return DISK_RET_EBADTRACK; + return -1; }
if ((ns->block_size * count) > (NVME_PAGE_SIZE * 2)) { @@ -457,10 +457,12 @@ nvme_io_readwrite(struct nvme_namespace *ns, u64 lba, char *buf, u16 count, dprintf(2, "read io: %08x %08x %08x %08x\n", cqe.dword[0], cqe.dword[1], cqe.dword[2], cqe.dword[3]);
- return DISK_RET_EBADTRACK; + return -1; }
- return DISK_RET_SUCCESS; + dprintf(5, "ns %u %s lba %llu+%u\n", ns->ns_id, write ? "write" : "read", + lba, count); + return count; }
static void nvme_reset_prpl(struct nvme_namespace *ns) @@ -716,20 +718,18 @@ nvme_scan(void) static int nvme_cmd_readwrite(struct nvme_namespace *ns, struct disk_op_s *op, int write) { - int res = DISK_RET_SUCCESS; u16 const max_blocks = NVME_PAGE_SIZE / ns->block_size; u16 i, blocks;
- for (i = 0; i < op->count && res == DISK_RET_SUCCESS;) { + for (i = 0; i < op->count;) { u16 blocks_remaining = op->count - i; char *op_buf = op->buf_fl + i * ns->block_size;
blocks = nvme_build_prpl(ns, op_buf, blocks_remaining); if (blocks) { - res = nvme_io_readwrite(ns, op->lba + i, ns->prp1, blocks, write); - dprintf(5, "ns %u %s lba %llu+%u: %d\n", ns->ns_id, write ? "write" - : "read", - op->lba, blocks, res); + int res = nvme_io_xfer(ns, op->lba + i, ns->prp1, blocks, write); + if (res < 0) + return DISK_RET_EBADTRACK; } else { blocks = blocks_remaining < max_blocks ? blocks_remaining : max_blocks; @@ -738,12 +738,12 @@ nvme_cmd_readwrite(struct nvme_namespace *ns, struct disk_op_s *op, int write) memcpy(ns->dma_buffer, op_buf, blocks * ns->block_size); }
- res = nvme_io_readwrite(ns, op->lba + i, ns->dma_buffer, blocks, write); - dprintf(5, "ns %u %s lba %llu+%u: %d\n", ns->ns_id, write ? "write" - : "read", - op->lba + i, blocks, res); + int res = nvme_io_xfer(ns, op->lba + i, ns->dma_buffer, + blocks, write); + if (res < 0) + return DISK_RET_EBADTRACK;
- if (!write && res == DISK_RET_SUCCESS) { + if (!write) { memcpy(op_buf, ns->dma_buffer, blocks * ns->block_size); } } @@ -751,7 +751,7 @@ nvme_cmd_readwrite(struct nvme_namespace *ns, struct disk_op_s *op, int write) i += blocks; }
- return res; + return DISK_RET_SUCCESS; }
int
Move bounce buffer processing to a new helper function.
Signed-off-by: Kevin O'Connor kevin@koconnor.net --- src/hw/nvme.c | 35 +++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-)
diff --git a/src/hw/nvme.c b/src/hw/nvme.c index a97501b..fd7c1d0 100644 --- a/src/hw/nvme.c +++ b/src/hw/nvme.c @@ -465,6 +465,25 @@ nvme_io_xfer(struct nvme_namespace *ns, u64 lba, void *buf, u16 count, return count; }
+// Transfer up to one page of data using the internal dma bounce buffer +static int +nvme_bounce_xfer(struct nvme_namespace *ns, u64 lba, void *buf, u16 count, + int write) +{ + u16 const max_blocks = NVME_PAGE_SIZE / ns->block_size; + u16 blocks = count < max_blocks ? count : max_blocks; + + if (write) + memcpy(ns->dma_buffer, buf, blocks * ns->block_size); + + int res = nvme_io_xfer(ns, lba, ns->dma_buffer, blocks, write); + + if (!write && res >= 0) + memcpy(buf, ns->dma_buffer, res * ns->block_size); + + return res; +} + static void nvme_reset_prpl(struct nvme_namespace *ns) { ns->prpl_len = 0; @@ -718,7 +737,6 @@ nvme_scan(void) static int nvme_cmd_readwrite(struct nvme_namespace *ns, struct disk_op_s *op, int write) { - u16 const max_blocks = NVME_PAGE_SIZE / ns->block_size; u16 i, blocks;
for (i = 0; i < op->count;) { @@ -731,21 +749,10 @@ nvme_cmd_readwrite(struct nvme_namespace *ns, struct disk_op_s *op, int write) if (res < 0) return DISK_RET_EBADTRACK; } else { - blocks = blocks_remaining < max_blocks ? blocks_remaining - : max_blocks; - - if (write) { - memcpy(ns->dma_buffer, op_buf, blocks * ns->block_size); - } - - int res = nvme_io_xfer(ns, op->lba + i, ns->dma_buffer, - blocks, write); + int res = nvme_bounce_xfer(ns, op->lba + i, op_buf, blocks, write); if (res < 0) return DISK_RET_EBADTRACK; - - if (!write) { - memcpy(op_buf, ns->dma_buffer, blocks * ns->block_size); - } + blocks = res; }
i += blocks;
Rename nvme_build_prpl() to nvme_prpl_xfer() and directly invoke nvme_io_xfer() or nvme_bounce_xfer() from that function.
Signed-off-by: Kevin O'Connor kevin@koconnor.net --- src/hw/nvme-int.h | 1 - src/hw/nvme.c | 42 ++++++++++++++++-------------------------- 2 files changed, 16 insertions(+), 27 deletions(-)
diff --git a/src/hw/nvme-int.h b/src/hw/nvme-int.h index a4c1555..9564c17 100644 --- a/src/hw/nvme-int.h +++ b/src/hw/nvme-int.h @@ -125,7 +125,6 @@ struct nvme_namespace {
/* Page List */ u32 prpl_len; - void *prp1; u64 prpl[NVME_MAX_PRPL_ENTRIES]; };
diff --git a/src/hw/nvme.c b/src/hw/nvme.c index fd7c1d0..bafe8bf 100644 --- a/src/hw/nvme.c +++ b/src/hw/nvme.c @@ -499,10 +499,13 @@ static int nvme_add_prpl(struct nvme_namespace *ns, u64 base) return 0; }
-static int nvme_build_prpl(struct nvme_namespace *ns, void *op_buf, u16 count) +// Transfer data using page list (if applicable) +static int +nvme_prpl_xfer(struct nvme_namespace *ns, u64 lba, void *buf, u16 count, + int write) { int first_page = 1; - u32 base = (long)op_buf; + u32 base = (long)buf; s32 size;
if (count > ns->max_req_size) @@ -512,31 +515,28 @@ static int nvme_build_prpl(struct nvme_namespace *ns, void *op_buf, u16 count)
size = count * ns->block_size; /* Special case for transfers that fit into PRP1, but are unaligned */ - if (((size + (base & ~NVME_PAGE_MASK)) <= NVME_PAGE_SIZE)) { - ns->prp1 = op_buf; - return count; - } + if (((size + (base & ~NVME_PAGE_MASK)) <= NVME_PAGE_SIZE)) + return nvme_io_xfer(ns, lba, buf, count, write);
/* Every request has to be page aligned */ if (base & ~NVME_PAGE_MASK) - return 0; + return nvme_bounce_xfer(ns, lba, buf, count, write);
/* Make sure a full block fits into the last chunk */ if (size & (ns->block_size - 1ULL)) - return 0; + return nvme_bounce_xfer(ns, lba, buf, count, write);
for (; size > 0; base += NVME_PAGE_SIZE, size -= NVME_PAGE_SIZE) { if (first_page) { /* First page is special */ - ns->prp1 = (void*)base; first_page = 0; continue; } if (nvme_add_prpl(ns, base)) - return 0; + return nvme_bounce_xfer(ns, lba, buf, count, write); }
- return count; + return nvme_io_xfer(ns, lba, buf, count, write); }
static int @@ -737,24 +737,14 @@ nvme_scan(void) static int nvme_cmd_readwrite(struct nvme_namespace *ns, struct disk_op_s *op, int write) { - u16 i, blocks; - + int i; for (i = 0; i < op->count;) { u16 blocks_remaining = op->count - i; char *op_buf = op->buf_fl + i * ns->block_size; - - blocks = nvme_build_prpl(ns, op_buf, blocks_remaining); - if (blocks) { - int res = nvme_io_xfer(ns, op->lba + i, ns->prp1, blocks, write); - if (res < 0) - return DISK_RET_EBADTRACK; - } else { - int res = nvme_bounce_xfer(ns, op->lba + i, op_buf, blocks, write); - if (res < 0) - return DISK_RET_EBADTRACK; - blocks = res; - } - + int blocks = nvme_prpl_xfer(ns, op->lba + i, op_buf, + blocks_remaining, write); + if (blocks < 0) + return DISK_RET_EBADTRACK; i += blocks; }
When using a prp2 parameter, build it in nvme_prpl_xfer() and pass it directly to nvme_io_xfer().
Signed-off-by: Kevin O'Connor kevin@koconnor.net --- src/hw/nvme.c | 39 ++++++++++++++++++--------------------- 1 file changed, 18 insertions(+), 21 deletions(-)
diff --git a/src/hw/nvme.c b/src/hw/nvme.c index bafe8bf..3a73784 100644 --- a/src/hw/nvme.c +++ b/src/hw/nvme.c @@ -417,33 +417,19 @@ err: /* Reads count sectors into buf. Returns DISK_RET_*. The buffer cannot cross page boundaries. */ static int -nvme_io_xfer(struct nvme_namespace *ns, u64 lba, void *buf, u16 count, - int write) +nvme_io_xfer(struct nvme_namespace *ns, u64 lba, void *prp1, void *prp2, + u16 count, int write) { - u32 buf_addr = (u32)buf; - void *prp2; - - if (buf_addr & 0x3) { + if (((u32)prp1 & 0x3) || ((u32)prp2 & 0x3)) { /* Buffer is misaligned */ warn_internalerror(); return -1; }
- if ((ns->block_size * count) > (NVME_PAGE_SIZE * 2)) { - /* We need to describe more than 2 pages, rely on PRP List */ - prp2 = ns->prpl; - } else if ((ns->block_size * count) > NVME_PAGE_SIZE) { - /* Directly embed the 2nd page if we only need 2 pages */ - prp2 = (void *)(long)ns->prpl[0]; - } else { - /* One page is enough, don't expose anything else */ - prp2 = NULL; - } - struct nvme_sqe *io_read = nvme_get_next_sqe(&ns->ctrl->io_sq, write ? NVME_SQE_OPC_IO_WRITE : NVME_SQE_OPC_IO_READ, - NULL, buf, prp2); + NULL, prp1, prp2); io_read->nsid = ns->ns_id; io_read->dword[10] = (u32)lba; io_read->dword[11] = (u32)(lba >> 32); @@ -476,7 +462,7 @@ nvme_bounce_xfer(struct nvme_namespace *ns, u64 lba, void *buf, u16 count, if (write) memcpy(ns->dma_buffer, buf, blocks * ns->block_size);
- int res = nvme_io_xfer(ns, lba, ns->dma_buffer, blocks, write); + int res = nvme_io_xfer(ns, lba, ns->dma_buffer, NULL, blocks, write);
if (!write && res >= 0) memcpy(buf, ns->dma_buffer, res * ns->block_size); @@ -516,7 +502,7 @@ nvme_prpl_xfer(struct nvme_namespace *ns, u64 lba, void *buf, u16 count, size = count * ns->block_size; /* Special case for transfers that fit into PRP1, but are unaligned */ if (((size + (base & ~NVME_PAGE_MASK)) <= NVME_PAGE_SIZE)) - return nvme_io_xfer(ns, lba, buf, count, write); + return nvme_io_xfer(ns, lba, buf, NULL, count, write);
/* Every request has to be page aligned */ if (base & ~NVME_PAGE_MASK) @@ -536,7 +522,18 @@ nvme_prpl_xfer(struct nvme_namespace *ns, u64 lba, void *buf, u16 count, return nvme_bounce_xfer(ns, lba, buf, count, write); }
- return nvme_io_xfer(ns, lba, buf, count, write); + void *prp2; + if ((ns->block_size * count) > (NVME_PAGE_SIZE * 2)) { + /* We need to describe more than 2 pages, rely on PRP List */ + prp2 = ns->prpl; + } else if ((ns->block_size * count) > NVME_PAGE_SIZE) { + /* Directly embed the 2nd page if we only need 2 pages */ + prp2 = (void *)(long)ns->prpl[0]; + } else { + /* One page is enough, don't expose anything else */ + prp2 = NULL; + } + return nvme_io_xfer(ns, lba, buf, prp2, count, write); }
static int
Commit 01f2736cc905d ("nvme: Pass large I/O requests as PRP lists") introduced multi-page requests using the NVMe PRP mechanism. To store the list and "first page to write to" hints, it added fields to the NVMe namespace struct.
Unfortunately, that struct resides in fseg which is read-only at runtime. While KVM ignores the read-only part and allows writes, real hardware and TCG adhere to the semantics and ignore writes to the fseg region. The net effect of that is that reads and writes were always happening on address 0, unless they went through the bounce buffer logic.
This patch builds the PRP maintenance data in the existing "dma bounce buffer" and only builds it when needed.
Fixes: 01f2736cc905d ("nvme: Pass large I/O requests as PRP lists") Reported-by: Matt DeVillier matt.devillier@gmail.com Signed-off-by: Alexander Graf graf@amazon.com Signed-off-by: Kevin O'Connor kevin@koconnor.net --- src/hw/nvme-int.h | 6 ------ src/hw/nvme.c | 51 +++++++++++++++++------------------------------ 2 files changed, 18 insertions(+), 39 deletions(-)
diff --git a/src/hw/nvme-int.h b/src/hw/nvme-int.h index 9564c17..f0d717d 100644 --- a/src/hw/nvme-int.h +++ b/src/hw/nvme-int.h @@ -10,8 +10,6 @@ #include "types.h" // u32 #include "pcidevice.h" // struct pci_device
-#define NVME_MAX_PRPL_ENTRIES 15 /* Allows requests up to 64kb */ - /* Data structures */
/* The register file of a NVMe host controller. This struct follows the naming @@ -122,10 +120,6 @@ struct nvme_namespace {
/* Page aligned buffer of size NVME_PAGE_SIZE. */ char *dma_buffer; - - /* Page List */ - u32 prpl_len; - u64 prpl[NVME_MAX_PRPL_ENTRIES]; };
/* Data structures for NVMe admin identify commands */ diff --git a/src/hw/nvme.c b/src/hw/nvme.c index 3a73784..39b9138 100644 --- a/src/hw/nvme.c +++ b/src/hw/nvme.c @@ -470,35 +470,19 @@ nvme_bounce_xfer(struct nvme_namespace *ns, u64 lba, void *buf, u16 count, return res; }
-static void nvme_reset_prpl(struct nvme_namespace *ns) -{ - ns->prpl_len = 0; -} - -static int nvme_add_prpl(struct nvme_namespace *ns, u64 base) -{ - if (ns->prpl_len >= NVME_MAX_PRPL_ENTRIES) - return -1; - - ns->prpl[ns->prpl_len++] = base; - - return 0; -} +#define NVME_MAX_PRPL_ENTRIES 15 /* Allows requests up to 64kb */
// Transfer data using page list (if applicable) static int nvme_prpl_xfer(struct nvme_namespace *ns, u64 lba, void *buf, u16 count, int write) { - int first_page = 1; u32 base = (long)buf; s32 size;
if (count > ns->max_req_size) count = ns->max_req_size;
- nvme_reset_prpl(ns); - size = count * ns->block_size; /* Special case for transfers that fit into PRP1, but are unaligned */ if (((size + (base & ~NVME_PAGE_MASK)) <= NVME_PAGE_SIZE)) @@ -512,28 +496,29 @@ nvme_prpl_xfer(struct nvme_namespace *ns, u64 lba, void *buf, u16 count, if (size & (ns->block_size - 1ULL)) return nvme_bounce_xfer(ns, lba, buf, count, write);
- for (; size > 0; base += NVME_PAGE_SIZE, size -= NVME_PAGE_SIZE) { - if (first_page) { - /* First page is special */ - first_page = 0; - continue; - } - if (nvme_add_prpl(ns, base)) - return nvme_bounce_xfer(ns, lba, buf, count, write); - } - - void *prp2; if ((ns->block_size * count) > (NVME_PAGE_SIZE * 2)) { - /* We need to describe more than 2 pages, rely on PRP List */ - prp2 = ns->prpl; + /* We need to describe more than 2 pages, build PRP List */ + u32 prpl_len = 0; + u64 *prpl = (void*)ns->dma_buffer; + int first_page = 1; + for (; size > 0; base += NVME_PAGE_SIZE, size -= NVME_PAGE_SIZE) { + if (first_page) { + /* First page is special */ + first_page = 0; + continue; + } + if (prpl_len >= NVME_MAX_PRPL_ENTRIES) + return nvme_bounce_xfer(ns, lba, buf, count, write); + prpl[prpl_len++] = base; + } + return nvme_io_xfer(ns, lba, buf, prpl, count, write); } else if ((ns->block_size * count) > NVME_PAGE_SIZE) { /* Directly embed the 2nd page if we only need 2 pages */ - prp2 = (void *)(long)ns->prpl[0]; + return nvme_io_xfer(ns, lba, buf, buf + NVME_PAGE_SIZE, count, write); } else { /* One page is enough, don't expose anything else */ - prp2 = NULL; + return nvme_io_xfer(ns, lba, buf, NULL, count, write); } - return nvme_io_xfer(ns, lba, buf, prp2, count, write); }
static int
patch set tested successfully along with the subsequent single bounce buffer patch on a Purism Librem 15v4 w/Samsung 960 EVO
On Wed, Jan 19, 2022 at 12:45 PM Kevin O'Connor kevin@koconnor.net wrote:
Hi Alex,
I was looking at your recent fix for SeaBIOS NVME. I agree that it is not valid to write to the f-segment during runtime. However, I was struggling to understand the PRP list management in the nvme.c code.
I came up with a different implementation that avoids allocating another buffer in the "high zone". Instead, the code in this patch series builds the page list in the existing "dma bounce buffer".
I don't have a good way to test this on real hardware. It passes simple qemu tests (both with and without kvm). For example:
qemu-system-x86_64 -k en-us -snapshot -L test -chardev stdio,id=seabios -device isa-debugcon,iobase=0x402,chardev=seabios -m 512 -drive file=dos-drivec,if=none,id=drive0 -device nvme,drive=drive0,serial=nvme-1 -boot menu=on
Thanks, -Kevin
Kevin O'Connor (5): nvme: Rework nvme_io_readwrite() to return -1 on error nvme: Add nvme_bounce_xfer() helper function nvme: Convert nvme_build_prpl() to nvme_prpl_xfer() nvme: Pass prp1 and prp2 directly to nvme_io_xfer() nvme: Build the page list in the existing dma buffer
src/hw/nvme-int.h | 7 --- src/hw/nvme.c | 143 ++++++++++++++++++++-------------------------- 2 files changed, 61 insertions(+), 89 deletions(-)
-- 2.31.1
On 19.01.22 19:45, Kevin O'Connor wrote:
Move bounce buffer processing to a new helper function.
Signed-off-by: Kevin O'Connor kevin@koconnor.net
Reviewed-by: Alexander Graf graf@amazon.com
Alex
src/hw/nvme.c | 35 +++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-)
diff --git a/src/hw/nvme.c b/src/hw/nvme.c index a97501b..fd7c1d0 100644 --- a/src/hw/nvme.c +++ b/src/hw/nvme.c @@ -465,6 +465,25 @@ nvme_io_xfer(struct nvme_namespace *ns, u64 lba, void *buf, u16 count, return count; }
+// Transfer up to one page of data using the internal dma bounce buffer +static int +nvme_bounce_xfer(struct nvme_namespace *ns, u64 lba, void *buf, u16 count,
int write)
+{
- u16 const max_blocks = NVME_PAGE_SIZE / ns->block_size;
- u16 blocks = count < max_blocks ? count : max_blocks;
- if (write)
memcpy(ns->dma_buffer, buf, blocks * ns->block_size);
- int res = nvme_io_xfer(ns, lba, ns->dma_buffer, blocks, write);
- if (!write && res >= 0)
memcpy(buf, ns->dma_buffer, res * ns->block_size);
- return res;
+}
- static void nvme_reset_prpl(struct nvme_namespace *ns) { ns->prpl_len = 0;
@@ -718,7 +737,6 @@ nvme_scan(void) static int nvme_cmd_readwrite(struct nvme_namespace *ns, struct disk_op_s *op, int write) {
u16 const max_blocks = NVME_PAGE_SIZE / ns->block_size; u16 i, blocks;
for (i = 0; i < op->count;) {
@@ -731,21 +749,10 @@ nvme_cmd_readwrite(struct nvme_namespace *ns, struct disk_op_s *op, int write) if (res < 0) return DISK_RET_EBADTRACK; } else {
blocks = blocks_remaining < max_blocks ? blocks_remaining
: max_blocks;
if (write) {
memcpy(ns->dma_buffer, op_buf, blocks * ns->block_size);
}
int res = nvme_io_xfer(ns, op->lba + i, ns->dma_buffer,
blocks, write);
int res = nvme_bounce_xfer(ns, op->lba + i, op_buf, blocks, write); if (res < 0) return DISK_RET_EBADTRACK;
if (!write) {
memcpy(op_buf, ns->dma_buffer, blocks * ns->block_size);
}
blocks = res; } i += blocks;
-- 2.31.1
SeaBIOS mailing list -- seabios@seabios.org To unsubscribe send an email to seabios-leave@seabios.org
Amazon Development Center Germany GmbH Krausenstr. 38 10117 Berlin Geschaeftsfuehrung: Christian Schlaeger, Jonathan Weiss Eingetragen am Amtsgericht Charlottenburg unter HRB 149173 B Sitz: Berlin Ust-ID: DE 289 237 879
On 19.01.22 19:45, Kevin O'Connor wrote:
Rename nvme_io_readwrite() to nvme_io_xfer() and change it so it implements the debugging dprintf() and it returns -1 on an error.
Signed-off-by: Kevin O'Connor kevin@koconnor.net
Reviewed-by: Alexander Graf graf@amazon.com
Alex
src/hw/nvme.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-)
diff --git a/src/hw/nvme.c b/src/hw/nvme.c index f035fa2..a97501b 100644 --- a/src/hw/nvme.c +++ b/src/hw/nvme.c @@ -417,8 +417,8 @@ err: /* Reads count sectors into buf. Returns DISK_RET_*. The buffer cannot cross page boundaries. */ static int -nvme_io_readwrite(struct nvme_namespace *ns, u64 lba, char *buf, u16 count,
int write)
+nvme_io_xfer(struct nvme_namespace *ns, u64 lba, void *buf, u16 count,
{ u32 buf_addr = (u32)buf; void *prp2;int write)
@@ -426,7 +426,7 @@ nvme_io_readwrite(struct nvme_namespace *ns, u64 lba, char *buf, u16 count, if (buf_addr & 0x3) { /* Buffer is misaligned */ warn_internalerror();
return DISK_RET_EBADTRACK;
return -1; } if ((ns->block_size * count) > (NVME_PAGE_SIZE * 2)) {
@@ -457,10 +457,12 @@ nvme_io_readwrite(struct nvme_namespace *ns, u64 lba, char *buf, u16 count, dprintf(2, "read io: %08x %08x %08x %08x\n", cqe.dword[0], cqe.dword[1], cqe.dword[2], cqe.dword[3]);
return DISK_RET_EBADTRACK;
return -1; }
- return DISK_RET_SUCCESS;
dprintf(5, "ns %u %s lba %llu+%u\n", ns->ns_id, write ? "write" : "read",
lba, count);
return count; }
static void nvme_reset_prpl(struct nvme_namespace *ns)
@@ -716,20 +718,18 @@ nvme_scan(void) static int nvme_cmd_readwrite(struct nvme_namespace *ns, struct disk_op_s *op, int write) {
int res = DISK_RET_SUCCESS; u16 const max_blocks = NVME_PAGE_SIZE / ns->block_size; u16 i, blocks;
for (i = 0; i < op->count && res == DISK_RET_SUCCESS;) {
for (i = 0; i < op->count;) { u16 blocks_remaining = op->count - i; char *op_buf = op->buf_fl + i * ns->block_size;
blocks = nvme_build_prpl(ns, op_buf, blocks_remaining); if (blocks) {
res = nvme_io_readwrite(ns, op->lba + i, ns->prp1, blocks, write);
dprintf(5, "ns %u %s lba %llu+%u: %d\n", ns->ns_id, write ? "write"
: "read",
op->lba, blocks, res);
int res = nvme_io_xfer(ns, op->lba + i, ns->prp1, blocks, write);
if (res < 0)
return DISK_RET_EBADTRACK; } else { blocks = blocks_remaining < max_blocks ? blocks_remaining : max_blocks;
@@ -738,12 +738,12 @@ nvme_cmd_readwrite(struct nvme_namespace *ns, struct disk_op_s *op, int write) memcpy(ns->dma_buffer, op_buf, blocks * ns->block_size); }
res = nvme_io_readwrite(ns, op->lba + i, ns->dma_buffer, blocks, write);
dprintf(5, "ns %u %s lba %llu+%u: %d\n", ns->ns_id, write ? "write"
: "read",
op->lba + i, blocks, res);
int res = nvme_io_xfer(ns, op->lba + i, ns->dma_buffer,
blocks, write);
if (res < 0)
return DISK_RET_EBADTRACK;
if (!write && res == DISK_RET_SUCCESS) {
if (!write) { memcpy(op_buf, ns->dma_buffer, blocks * ns->block_size); } }
@@ -751,7 +751,7 @@ nvme_cmd_readwrite(struct nvme_namespace *ns, struct disk_op_s *op, int write) i += blocks; }
- return res;
return DISK_RET_SUCCESS; }
int
-- 2.31.1
SeaBIOS mailing list -- seabios@seabios.org To unsubscribe send an email to seabios-leave@seabios.org
Amazon Development Center Germany GmbH Krausenstr. 38 10117 Berlin Geschaeftsfuehrung: Christian Schlaeger, Jonathan Weiss Eingetragen am Amtsgericht Charlottenburg unter HRB 149173 B Sitz: Berlin Ust-ID: DE 289 237 879
On 19.01.22 19:45, Kevin O'Connor wrote:
Rename nvme_io_readwrite() to nvme_io_xfer() and change it so it implements the debugging dprintf() and it returns -1 on an error.
Signed-off-by: Kevin O'Connor kevin@koconnor.net
src/hw/nvme.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-)
diff --git a/src/hw/nvme.c b/src/hw/nvme.c index f035fa2..a97501b 100644 --- a/src/hw/nvme.c +++ b/src/hw/nvme.c @@ -417,8 +417,8 @@ err: /* Reads count sectors into buf. Returns DISK_RET_*. The buffer cannot cross
This comment no longer is accurate after the patch. I guess it was half stale before already, but now it's completely wrong :).
Alex
page boundaries. */
static int -nvme_io_readwrite(struct nvme_namespace *ns, u64 lba, char *buf, u16 count,
int write)
+nvme_io_xfer(struct nvme_namespace *ns, u64 lba, void *buf, u16 count,
{ u32 buf_addr = (u32)buf; void *prp2;int write)
@@ -426,7 +426,7 @@ nvme_io_readwrite(struct nvme_namespace *ns, u64 lba, char *buf, u16 count, if (buf_addr & 0x3) { /* Buffer is misaligned */ warn_internalerror();
return DISK_RET_EBADTRACK;
return -1; } if ((ns->block_size * count) > (NVME_PAGE_SIZE * 2)) {
@@ -457,10 +457,12 @@ nvme_io_readwrite(struct nvme_namespace *ns, u64 lba, char *buf, u16 count, dprintf(2, "read io: %08x %08x %08x %08x\n", cqe.dword[0], cqe.dword[1], cqe.dword[2], cqe.dword[3]);
return DISK_RET_EBADTRACK;
return -1; }
- return DISK_RET_SUCCESS;
dprintf(5, "ns %u %s lba %llu+%u\n", ns->ns_id, write ? "write" : "read",
lba, count);
return count; }
static void nvme_reset_prpl(struct nvme_namespace *ns)
@@ -716,20 +718,18 @@ nvme_scan(void) static int nvme_cmd_readwrite(struct nvme_namespace *ns, struct disk_op_s *op, int write) {
int res = DISK_RET_SUCCESS; u16 const max_blocks = NVME_PAGE_SIZE / ns->block_size; u16 i, blocks;
for (i = 0; i < op->count && res == DISK_RET_SUCCESS;) {
for (i = 0; i < op->count;) { u16 blocks_remaining = op->count - i; char *op_buf = op->buf_fl + i * ns->block_size;
blocks = nvme_build_prpl(ns, op_buf, blocks_remaining); if (blocks) {
res = nvme_io_readwrite(ns, op->lba + i, ns->prp1, blocks, write);
dprintf(5, "ns %u %s lba %llu+%u: %d\n", ns->ns_id, write ? "write"
: "read",
op->lba, blocks, res);
int res = nvme_io_xfer(ns, op->lba + i, ns->prp1, blocks, write);
if (res < 0)
return DISK_RET_EBADTRACK; } else { blocks = blocks_remaining < max_blocks ? blocks_remaining : max_blocks;
@@ -738,12 +738,12 @@ nvme_cmd_readwrite(struct nvme_namespace *ns, struct disk_op_s *op, int write) memcpy(ns->dma_buffer, op_buf, blocks * ns->block_size); }
res = nvme_io_readwrite(ns, op->lba + i, ns->dma_buffer, blocks, write);
dprintf(5, "ns %u %s lba %llu+%u: %d\n", ns->ns_id, write ? "write"
: "read",
op->lba + i, blocks, res);
int res = nvme_io_xfer(ns, op->lba + i, ns->dma_buffer,
blocks, write);
if (res < 0)
return DISK_RET_EBADTRACK;
if (!write && res == DISK_RET_SUCCESS) {
if (!write) { memcpy(op_buf, ns->dma_buffer, blocks * ns->block_size); } }
@@ -751,7 +751,7 @@ nvme_cmd_readwrite(struct nvme_namespace *ns, struct disk_op_s *op, int write) i += blocks; }
- return res;
return DISK_RET_SUCCESS; }
int
-- 2.31.1
SeaBIOS mailing list -- seabios@seabios.org To unsubscribe send an email to seabios-leave@seabios.org
Amazon Development Center Germany GmbH Krausenstr. 38 10117 Berlin Geschaeftsfuehrung: Christian Schlaeger, Jonathan Weiss Eingetragen am Amtsgericht Charlottenburg unter HRB 149173 B Sitz: Berlin Ust-ID: DE 289 237 879
On 19.01.22 19:45, Kevin O'Connor wrote:
Rename nvme_build_prpl() to nvme_prpl_xfer() and directly invoke nvme_io_xfer() or nvme_bounce_xfer() from that function.
Signed-off-by: Kevin O'Connor kevin@koconnor.net
src/hw/nvme-int.h | 1 - src/hw/nvme.c | 42 ++++++++++++++++-------------------------- 2 files changed, 16 insertions(+), 27 deletions(-)
diff --git a/src/hw/nvme-int.h b/src/hw/nvme-int.h index a4c1555..9564c17 100644 --- a/src/hw/nvme-int.h +++ b/src/hw/nvme-int.h @@ -125,7 +125,6 @@ struct nvme_namespace {
/* Page List */ u32 prpl_len;
- void *prp1; u64 prpl[NVME_MAX_PRPL_ENTRIES]; };
diff --git a/src/hw/nvme.c b/src/hw/nvme.c index fd7c1d0..bafe8bf 100644 --- a/src/hw/nvme.c +++ b/src/hw/nvme.c @@ -499,10 +499,13 @@ static int nvme_add_prpl(struct nvme_namespace *ns, u64 base) return 0; }
-static int nvme_build_prpl(struct nvme_namespace *ns, void *op_buf, u16 count) +// Transfer data using page list (if applicable) +static int +nvme_prpl_xfer(struct nvme_namespace *ns, u64 lba, void *buf, u16 count,
{ int first_page = 1;int write)
- u32 base = (long)op_buf;
u32 base = (long)buf; s32 size;
if (count > ns->max_req_size)
@@ -512,31 +515,28 @@ static int nvme_build_prpl(struct nvme_namespace *ns, void *op_buf, u16 count)
size = count * ns->block_size; /* Special case for transfers that fit into PRP1, but are unaligned */
- if (((size + (base & ~NVME_PAGE_MASK)) <= NVME_PAGE_SIZE)) {
ns->prp1 = op_buf;
return count;
- }
if (((size + (base & ~NVME_PAGE_MASK)) <= NVME_PAGE_SIZE))
return nvme_io_xfer(ns, lba, buf, count, write); /* Every request has to be page aligned */ if (base & ~NVME_PAGE_MASK)
return 0;
return nvme_bounce_xfer(ns, lba, buf, count, write); /* Make sure a full block fits into the last chunk */ if (size & (ns->block_size - 1ULL))
return 0;
return nvme_bounce_xfer(ns, lba, buf, count, write); for (; size > 0; base += NVME_PAGE_SIZE, size -= NVME_PAGE_SIZE) { if (first_page) { /* First page is special */
ns->prp1 = (void*)base; first_page = 0; continue; } if (nvme_add_prpl(ns, base))
return 0;
return nvme_bounce_xfer(ns, lba, buf, count, write);
I think this is correct, but reasoning about all of the bounce invocations is truly making my head hurt :). How about we split the "Does this fit into a PRP request" logic from the "do request" part? Can we just at the end of the function have something like this?
return nvme_io_xfer(ns, lba, buf, count, write); bounce: Â Â Â return nvme_bounce_xfer(ns, lba, buf, count, write);
and then goto bounce every time we realize we have to turn the request into an up-to-one-page bounce request?
Alex
}
- return count;
return nvme_io_xfer(ns, lba, buf, count, write); }
static int
@@ -737,24 +737,14 @@ nvme_scan(void) static int nvme_cmd_readwrite(struct nvme_namespace *ns, struct disk_op_s *op, int write) {
- u16 i, blocks;
- int i; for (i = 0; i < op->count;) { u16 blocks_remaining = op->count - i; char *op_buf = op->buf_fl + i * ns->block_size;
blocks = nvme_build_prpl(ns, op_buf, blocks_remaining);
if (blocks) {
int res = nvme_io_xfer(ns, op->lba + i, ns->prp1, blocks, write);
if (res < 0)
return DISK_RET_EBADTRACK;
} else {
int res = nvme_bounce_xfer(ns, op->lba + i, op_buf, blocks, write);
if (res < 0)
return DISK_RET_EBADTRACK;
blocks = res;
}
int blocks = nvme_prpl_xfer(ns, op->lba + i, op_buf,
blocks_remaining, write);
if (blocks < 0)
return DISK_RET_EBADTRACK; i += blocks; }
-- 2.31.1
SeaBIOS mailing list -- seabios@seabios.org To unsubscribe send an email to seabios-leave@seabios.org
Amazon Development Center Germany GmbH Krausenstr. 38 10117 Berlin Geschaeftsfuehrung: Christian Schlaeger, Jonathan Weiss Eingetragen am Amtsgericht Charlottenburg unter HRB 149173 B Sitz: Berlin Ust-ID: DE 289 237 879
On 19.01.22 19:45, Kevin O'Connor wrote:
When using a prp2 parameter, build it in nvme_prpl_xfer() and pass it directly to nvme_io_xfer().
Signed-off-by: Kevin O'Connor kevin@koconnor.net
Reviewed-by: Alexander Graf graf@amazon.com
Alex
src/hw/nvme.c | 39 ++++++++++++++++++--------------------- 1 file changed, 18 insertions(+), 21 deletions(-)
diff --git a/src/hw/nvme.c b/src/hw/nvme.c index bafe8bf..3a73784 100644 --- a/src/hw/nvme.c +++ b/src/hw/nvme.c @@ -417,33 +417,19 @@ err: /* Reads count sectors into buf. Returns DISK_RET_*. The buffer cannot cross page boundaries. */ static int -nvme_io_xfer(struct nvme_namespace *ns, u64 lba, void *buf, u16 count,
int write)
+nvme_io_xfer(struct nvme_namespace *ns, u64 lba, void *prp1, void *prp2,
{u16 count, int write)
- u32 buf_addr = (u32)buf;
- void *prp2;
- if (buf_addr & 0x3) {
- if (((u32)prp1 & 0x3) || ((u32)prp2 & 0x3)) { /* Buffer is misaligned */ warn_internalerror(); return -1; }
- if ((ns->block_size * count) > (NVME_PAGE_SIZE * 2)) {
/* We need to describe more than 2 pages, rely on PRP List */
prp2 = ns->prpl;
- } else if ((ns->block_size * count) > NVME_PAGE_SIZE) {
/* Directly embed the 2nd page if we only need 2 pages */
prp2 = (void *)(long)ns->prpl[0];
- } else {
/* One page is enough, don't expose anything else */
prp2 = NULL;
- }
struct nvme_sqe *io_read = nvme_get_next_sqe(&ns->ctrl->io_sq, write ? NVME_SQE_OPC_IO_WRITE : NVME_SQE_OPC_IO_READ,
NULL, buf, prp2);
NULL, prp1, prp2); io_read->nsid = ns->ns_id; io_read->dword[10] = (u32)lba; io_read->dword[11] = (u32)(lba >> 32);
@@ -476,7 +462,7 @@ nvme_bounce_xfer(struct nvme_namespace *ns, u64 lba, void *buf, u16 count, if (write) memcpy(ns->dma_buffer, buf, blocks * ns->block_size);
- int res = nvme_io_xfer(ns, lba, ns->dma_buffer, blocks, write);
int res = nvme_io_xfer(ns, lba, ns->dma_buffer, NULL, blocks, write);
if (!write && res >= 0) memcpy(buf, ns->dma_buffer, res * ns->block_size);
@@ -516,7 +502,7 @@ nvme_prpl_xfer(struct nvme_namespace *ns, u64 lba, void *buf, u16 count, size = count * ns->block_size; /* Special case for transfers that fit into PRP1, but are unaligned */ if (((size + (base & ~NVME_PAGE_MASK)) <= NVME_PAGE_SIZE))
return nvme_io_xfer(ns, lba, buf, count, write);
return nvme_io_xfer(ns, lba, buf, NULL, count, write); /* Every request has to be page aligned */ if (base & ~NVME_PAGE_MASK)
@@ -536,7 +522,18 @@ nvme_prpl_xfer(struct nvme_namespace *ns, u64 lba, void *buf, u16 count, return nvme_bounce_xfer(ns, lba, buf, count, write); }
- return nvme_io_xfer(ns, lba, buf, count, write);
void *prp2;
if ((ns->block_size * count) > (NVME_PAGE_SIZE * 2)) {
/* We need to describe more than 2 pages, rely on PRP List */
prp2 = ns->prpl;
} else if ((ns->block_size * count) > NVME_PAGE_SIZE) {
/* Directly embed the 2nd page if we only need 2 pages */
prp2 = (void *)(long)ns->prpl[0];
} else {
/* One page is enough, don't expose anything else */
prp2 = NULL;
}
return nvme_io_xfer(ns, lba, buf, prp2, count, write); }
static int
-- 2.31.1
SeaBIOS mailing list -- seabios@seabios.org To unsubscribe send an email to seabios-leave@seabios.org
Amazon Development Center Germany GmbH Krausenstr. 38 10117 Berlin Geschaeftsfuehrung: Christian Schlaeger, Jonathan Weiss Eingetragen am Amtsgericht Charlottenburg unter HRB 149173 B Sitz: Berlin Ust-ID: DE 289 237 879
On 19.01.22 19:45, Kevin O'Connor wrote:
Commit 01f2736cc905d ("nvme: Pass large I/O requests as PRP lists") introduced multi-page requests using the NVMe PRP mechanism. To store the list and "first page to write to" hints, it added fields to the NVMe namespace struct.
Unfortunately, that struct resides in fseg which is read-only at runtime. While KVM ignores the read-only part and allows writes, real hardware and TCG adhere to the semantics and ignore writes to the fseg region. The net effect of that is that reads and writes were always happening on address 0, unless they went through the bounce buffer logic.
This patch builds the PRP maintenance data in the existing "dma bounce buffer" and only builds it when needed.
Fixes: 01f2736cc905d ("nvme: Pass large I/O requests as PRP lists") Reported-by: Matt DeVillier matt.devillier@gmail.com Signed-off-by: Alexander Graf graf@amazon.com Signed-off-by: Kevin O'Connor kevin@koconnor.net
src/hw/nvme-int.h | 6 ------ src/hw/nvme.c | 51 +++++++++++++++++------------------------------ 2 files changed, 18 insertions(+), 39 deletions(-)
diff --git a/src/hw/nvme-int.h b/src/hw/nvme-int.h index 9564c17..f0d717d 100644 --- a/src/hw/nvme-int.h +++ b/src/hw/nvme-int.h @@ -10,8 +10,6 @@ #include "types.h" // u32 #include "pcidevice.h" // struct pci_device
-#define NVME_MAX_PRPL_ENTRIES 15 /* Allows requests up to 64kb */
/* Data structures */
/* The register file of a NVMe host controller. This struct follows the naming
@@ -122,10 +120,6 @@ struct nvme_namespace {
/* Page aligned buffer of size NVME_PAGE_SIZE. */ char *dma_buffer;
/* Page List */
u32 prpl_len;
u64 prpl[NVME_MAX_PRPL_ENTRIES]; };
/* Data structures for NVMe admin identify commands */
diff --git a/src/hw/nvme.c b/src/hw/nvme.c index 3a73784..39b9138 100644 --- a/src/hw/nvme.c +++ b/src/hw/nvme.c @@ -470,35 +470,19 @@ nvme_bounce_xfer(struct nvme_namespace *ns, u64 lba, void *buf, u16 count, return res; }
-static void nvme_reset_prpl(struct nvme_namespace *ns) -{
- ns->prpl_len = 0;
-}
-static int nvme_add_prpl(struct nvme_namespace *ns, u64 base) -{
- if (ns->prpl_len >= NVME_MAX_PRPL_ENTRIES)
return -1;
- ns->prpl[ns->prpl_len++] = base;
- return 0;
-} +#define NVME_MAX_PRPL_ENTRIES 15 /* Allows requests up to 64kb */
// Transfer data using page list (if applicable) static int nvme_prpl_xfer(struct nvme_namespace *ns, u64 lba, void *buf, u16 count, int write) {
int first_page = 1; u32 base = (long)buf; s32 size;
if (count > ns->max_req_size) count = ns->max_req_size;
nvme_reset_prpl(ns);
size = count * ns->block_size; /* Special case for transfers that fit into PRP1, but are unaligned */ if (((size + (base & ~NVME_PAGE_MASK)) <= NVME_PAGE_SIZE))
@@ -512,28 +496,29 @@ nvme_prpl_xfer(struct nvme_namespace *ns, u64 lba, void *buf, u16 count, if (size & (ns->block_size - 1ULL)) return nvme_bounce_xfer(ns, lba, buf, count, write);
- for (; size > 0; base += NVME_PAGE_SIZE, size -= NVME_PAGE_SIZE) {
if (first_page) {
/* First page is special */
first_page = 0;
continue;
}
if (nvme_add_prpl(ns, base))
return nvme_bounce_xfer(ns, lba, buf, count, write);
- }
- void *prp2; if ((ns->block_size * count) > (NVME_PAGE_SIZE * 2)) {
/* We need to describe more than 2 pages, rely on PRP List */
prp2 = ns->prpl;
/* We need to describe more than 2 pages, build PRP List */
u32 prpl_len = 0;
u64 *prpl = (void*)ns->dma_buffer;
At 15*8=120 bytes of data, do we even need to put the prpl into dma_buffer or can we just use the stack? Will the array be guaranteed 64bit aligned or would we need to add an attribute to it?
u64 prpl[NVME_MAX_PRPL_ENTRIES];
Either way, I don't have strong feelings one way or another. It just seems more natural to keep the bounce buffer purely as bounce buffer :).
Alex
int first_page = 1;
for (; size > 0; base += NVME_PAGE_SIZE, size -= NVME_PAGE_SIZE) {
if (first_page) {
/* First page is special */
first_page = 0;
continue;
}
if (prpl_len >= NVME_MAX_PRPL_ENTRIES)
return nvme_bounce_xfer(ns, lba, buf, count, write);
prpl[prpl_len++] = base;
}
return nvme_io_xfer(ns, lba, buf, prpl, count, write); } else if ((ns->block_size * count) > NVME_PAGE_SIZE) { /* Directly embed the 2nd page if we only need 2 pages */
prp2 = (void *)(long)ns->prpl[0];
return nvme_io_xfer(ns, lba, buf, buf + NVME_PAGE_SIZE, count, write); } else { /* One page is enough, don't expose anything else */
prp2 = NULL;
return nvme_io_xfer(ns, lba, buf, NULL, count, write); }
return nvme_io_xfer(ns, lba, buf, prp2, count, write); }
static int
-- 2.31.1
Amazon Development Center Germany GmbH Krausenstr. 38 10117 Berlin Geschaeftsfuehrung: Christian Schlaeger, Jonathan Weiss Eingetragen am Amtsgericht Charlottenburg unter HRB 149173 B Sitz: Berlin Ust-ID: DE 289 237 879
On Fri, Jan 21, 2022 at 03:28:33PM +0100, Alexander Graf wrote:
On 19.01.22 19:45, Kevin O'Connor wrote:
Commit 01f2736cc905d ("nvme: Pass large I/O requests as PRP lists") introduced multi-page requests using the NVMe PRP mechanism. To store the list and "first page to write to" hints, it added fields to the NVMe namespace struct.
Unfortunately, that struct resides in fseg which is read-only at runtime. While KVM ignores the read-only part and allows writes, real hardware and TCG adhere to the semantics and ignore writes to the fseg region. The net effect of that is that reads and writes were always happening on address 0, unless they went through the bounce buffer logic.
This patch builds the PRP maintenance data in the existing "dma bounce buffer" and only builds it when needed.
Fixes: 01f2736cc905d ("nvme: Pass large I/O requests as PRP lists") Reported-by: Matt DeVillier matt.devillier@gmail.com Signed-off-by: Alexander Graf graf@amazon.com Signed-off-by: Kevin O'Connor kevin@koconnor.net
src/hw/nvme-int.h | 6 ------ src/hw/nvme.c | 51 +++++++++++++++++------------------------------ 2 files changed, 18 insertions(+), 39 deletions(-)
diff --git a/src/hw/nvme-int.h b/src/hw/nvme-int.h index 9564c17..f0d717d 100644 --- a/src/hw/nvme-int.h +++ b/src/hw/nvme-int.h @@ -10,8 +10,6 @@ #include "types.h" // u32 #include "pcidevice.h" // struct pci_device
-#define NVME_MAX_PRPL_ENTRIES 15 /* Allows requests up to 64kb */
/* Data structures */
/* The register file of a NVMe host controller. This struct follows the naming
@@ -122,10 +120,6 @@ struct nvme_namespace {
/* Page aligned buffer of size NVME_PAGE_SIZE. */ char *dma_buffer;
/* Page List */
u32 prpl_len;
u64 prpl[NVME_MAX_PRPL_ENTRIES]; };
/* Data structures for NVMe admin identify commands */
diff --git a/src/hw/nvme.c b/src/hw/nvme.c index 3a73784..39b9138 100644 --- a/src/hw/nvme.c +++ b/src/hw/nvme.c @@ -470,35 +470,19 @@ nvme_bounce_xfer(struct nvme_namespace *ns, u64 lba, void *buf, u16 count, return res; }
-static void nvme_reset_prpl(struct nvme_namespace *ns) -{
- ns->prpl_len = 0;
-}
-static int nvme_add_prpl(struct nvme_namespace *ns, u64 base) -{
- if (ns->prpl_len >= NVME_MAX_PRPL_ENTRIES)
return -1;
- ns->prpl[ns->prpl_len++] = base;
- return 0;
-} +#define NVME_MAX_PRPL_ENTRIES 15 /* Allows requests up to 64kb */
// Transfer data using page list (if applicable) static int nvme_prpl_xfer(struct nvme_namespace *ns, u64 lba, void *buf, u16 count, int write) {
int first_page = 1; u32 base = (long)buf; s32 size;
if (count > ns->max_req_size) count = ns->max_req_size;
nvme_reset_prpl(ns);
size = count * ns->block_size; /* Special case for transfers that fit into PRP1, but are unaligned */ if (((size + (base & ~NVME_PAGE_MASK)) <= NVME_PAGE_SIZE))
@@ -512,28 +496,29 @@ nvme_prpl_xfer(struct nvme_namespace *ns, u64 lba, void *buf, u16 count, if (size & (ns->block_size - 1ULL)) return nvme_bounce_xfer(ns, lba, buf, count, write);
- for (; size > 0; base += NVME_PAGE_SIZE, size -= NVME_PAGE_SIZE) {
if (first_page) {
/* First page is special */
first_page = 0;
continue;
}
if (nvme_add_prpl(ns, base))
return nvme_bounce_xfer(ns, lba, buf, count, write);
- }
- void *prp2; if ((ns->block_size * count) > (NVME_PAGE_SIZE * 2)) {
/* We need to describe more than 2 pages, rely on PRP List */
prp2 = ns->prpl;
/* We need to describe more than 2 pages, build PRP List */
u32 prpl_len = 0;
u64 *prpl = (void*)ns->dma_buffer;
At 15*8=120 bytes of data, do we even need to put the prpl into dma_buffer or can we just use the stack? Will the array be guaranteed 64bit aligned or would we need to add an attribute to it?
u64 prpl[NVME_MAX_PRPL_ENTRIES];
Either way, I don't have strong feelings one way or another. It just seems more natural to keep the bounce buffer purely as bounce buffer :).
In general it's possible to DMA from the stack (eg, src/hw/ohci.c:ohci_send_pipe() ). However, SeaBIOS doesn't guarantee stack alignment so it would need to be done manually. Also, I'm not sure how tight the nvme request completion code is - if it returns early for some reason it could cause havoc (device dma into random memory).
Another option might be to make a single global prpl array, but that would consume the memory even if nvme isn't in use.
FWIW though, I don't see a harm in the single ( u64 *prpl = (void*)ns->dma_buffer ) line.
Thanks, -Kevin
Alex
int first_page = 1;
for (; size > 0; base += NVME_PAGE_SIZE, size -= NVME_PAGE_SIZE) {
if (first_page) {
/* First page is special */
first_page = 0;
continue;
}
if (prpl_len >= NVME_MAX_PRPL_ENTRIES)
return nvme_bounce_xfer(ns, lba, buf, count, write);
prpl[prpl_len++] = base;
}
return nvme_io_xfer(ns, lba, buf, prpl, count, write); } else if ((ns->block_size * count) > NVME_PAGE_SIZE) { /* Directly embed the 2nd page if we only need 2 pages */
prp2 = (void *)(long)ns->prpl[0];
return nvme_io_xfer(ns, lba, buf, buf + NVME_PAGE_SIZE, count, write); } else { /* One page is enough, don't expose anything else */
prp2 = NULL;
return nvme_io_xfer(ns, lba, buf, NULL, count, write); }
return nvme_io_xfer(ns, lba, buf, prp2, count, write); }
static int
-- 2.31.1
Amazon Development Center Germany GmbH Krausenstr. 38 10117 Berlin Geschaeftsfuehrung: Christian Schlaeger, Jonathan Weiss Eingetragen am Amtsgericht Charlottenburg unter HRB 149173 B Sitz: Berlin Ust-ID: DE 289 237 879
On 21.01.22 17:02, Kevin O'Connor wrote:
On Fri, Jan 21, 2022 at 03:28:33PM +0100, Alexander Graf wrote:
On 19.01.22 19:45, Kevin O'Connor wrote:
Commit 01f2736cc905d ("nvme: Pass large I/O requests as PRP lists") introduced multi-page requests using the NVMe PRP mechanism. To store the list and "first page to write to" hints, it added fields to the NVMe namespace struct.
Unfortunately, that struct resides in fseg which is read-only at runtime. While KVM ignores the read-only part and allows writes, real hardware and TCG adhere to the semantics and ignore writes to the fseg region. The net effect of that is that reads and writes were always happening on address 0, unless they went through the bounce buffer logic.
This patch builds the PRP maintenance data in the existing "dma bounce buffer" and only builds it when needed.
Fixes: 01f2736cc905d ("nvme: Pass large I/O requests as PRP lists") Reported-by: Matt DeVillier matt.devillier@gmail.com Signed-off-by: Alexander Graf graf@amazon.com Signed-off-by: Kevin O'Connor kevin@koconnor.net
src/hw/nvme-int.h | 6 ------ src/hw/nvme.c | 51 +++++++++++++++++------------------------------ 2 files changed, 18 insertions(+), 39 deletions(-)
diff --git a/src/hw/nvme-int.h b/src/hw/nvme-int.h index 9564c17..f0d717d 100644 --- a/src/hw/nvme-int.h +++ b/src/hw/nvme-int.h @@ -10,8 +10,6 @@ #include "types.h" // u32 #include "pcidevice.h" // struct pci_device
-#define NVME_MAX_PRPL_ENTRIES 15 /* Allows requests up to 64kb */
/* Data structures */
/* The register file of a NVMe host controller. This struct follows the naming
@@ -122,10 +120,6 @@ struct nvme_namespace {
/* Page aligned buffer of size NVME_PAGE_SIZE. */ char *dma_buffer;
/* Page List */
u32 prpl_len;
u64 prpl[NVME_MAX_PRPL_ENTRIES]; };
/* Data structures for NVMe admin identify commands */
diff --git a/src/hw/nvme.c b/src/hw/nvme.c index 3a73784..39b9138 100644 --- a/src/hw/nvme.c +++ b/src/hw/nvme.c @@ -470,35 +470,19 @@ nvme_bounce_xfer(struct nvme_namespace *ns, u64 lba, void *buf, u16 count, return res; }
-static void nvme_reset_prpl(struct nvme_namespace *ns) -{
- ns->prpl_len = 0;
-}
-static int nvme_add_prpl(struct nvme_namespace *ns, u64 base) -{
- if (ns->prpl_len >= NVME_MAX_PRPL_ENTRIES)
return -1;
- ns->prpl[ns->prpl_len++] = base;
- return 0;
-} +#define NVME_MAX_PRPL_ENTRIES 15 /* Allows requests up to 64kb */
// Transfer data using page list (if applicable) static int nvme_prpl_xfer(struct nvme_namespace *ns, u64 lba, void *buf, u16 count, int write) {
int first_page = 1; u32 base = (long)buf; s32 size;
if (count > ns->max_req_size) count = ns->max_req_size;
nvme_reset_prpl(ns);
size = count * ns->block_size; /* Special case for transfers that fit into PRP1, but are unaligned */ if (((size + (base & ~NVME_PAGE_MASK)) <= NVME_PAGE_SIZE))
@@ -512,28 +496,29 @@ nvme_prpl_xfer(struct nvme_namespace *ns, u64 lba, void *buf, u16 count, if (size & (ns->block_size - 1ULL)) return nvme_bounce_xfer(ns, lba, buf, count, write);
- for (; size > 0; base += NVME_PAGE_SIZE, size -= NVME_PAGE_SIZE) {
if (first_page) {
/* First page is special */
first_page = 0;
continue;
}
if (nvme_add_prpl(ns, base))
return nvme_bounce_xfer(ns, lba, buf, count, write);
- }
- void *prp2; if ((ns->block_size * count) > (NVME_PAGE_SIZE * 2)) {
/* We need to describe more than 2 pages, rely on PRP List */
prp2 = ns->prpl;
/* We need to describe more than 2 pages, build PRP List */
u32 prpl_len = 0;
u64 *prpl = (void*)ns->dma_buffer;
At 15*8=120 bytes of data, do we even need to put the prpl into dma_buffer or can we just use the stack? Will the array be guaranteed 64bit aligned or would we need to add an attribute to it?
u64 prpl[NVME_MAX_PRPL_ENTRIES];
Either way, I don't have strong feelings one way or another. It just seems more natural to keep the bounce buffer purely as bounce buffer :).
In general it's possible to DMA from the stack (eg, src/hw/ohci.c:ohci_send_pipe() ). However, SeaBIOS doesn't guarantee stack alignment so it would need to be done manually. Also, I'm not sure how tight the nvme request completion code is - if it returns early for some reason it could cause havoc (device dma into random memory).
Another option might be to make a single global prpl array, but that would consume the memory even if nvme isn't in use.
FWIW though, I don't see a harm in the single ( u64 *prpl = (void*)ns->dma_buffer ) line.
Fair, works for me. But then we probably want to also adjust MAX_PRPL_ENTRIES to match the full page we now have available, no?
Alex
Amazon Development Center Germany GmbH Krausenstr. 38 10117 Berlin Geschaeftsfuehrung: Christian Schlaeger, Jonathan Weiss Eingetragen am Amtsgericht Charlottenburg unter HRB 149173 B Sitz: Berlin Ust-ID: DE 289 237 879
On Fri, Jan 21, 2022 at 05:41:17PM +0100, Alexander Graf wrote:
On 21.01.22 17:02, Kevin O'Connor wrote:
On Fri, Jan 21, 2022 at 03:28:33PM +0100, Alexander Graf wrote:
On 19.01.22 19:45, Kevin O'Connor wrote:
if ((ns->block_size * count) > (NVME_PAGE_SIZE * 2)) {
/* We need to describe more than 2 pages, rely on PRP List */
prp2 = ns->prpl;
/* We need to describe more than 2 pages, build PRP List */
u32 prpl_len = 0;
u64 *prpl = (void*)ns->dma_buffer;
At 15*8=120 bytes of data, do we even need to put the prpl into dma_buffer or can we just use the stack? Will the array be guaranteed 64bit aligned or would we need to add an attribute to it?
u64 prpl[NVME_MAX_PRPL_ENTRIES];
Either way, I don't have strong feelings one way or another. It just seems more natural to keep the bounce buffer purely as bounce buffer :).
In general it's possible to DMA from the stack (eg, src/hw/ohci.c:ohci_send_pipe() ). However, SeaBIOS doesn't guarantee stack alignment so it would need to be done manually. Also, I'm not sure how tight the nvme request completion code is - if it returns early for some reason it could cause havoc (device dma into random memory).
Another option might be to make a single global prpl array, but that would consume the memory even if nvme isn't in use.
FWIW though, I don't see a harm in the single ( u64 *prpl = (void*)ns->dma_buffer ) line.
Fair, works for me. But then we probably want to also adjust MAX_PRPL_ENTRIES to match the full page we now have available, no?
I don't think a BIOS disk request can be over 64KiB so I don't think it matters. I got the impression the current checks are just "sanity checks". I don't see a harm in keeping the sanity check and that size is as good as any other as far as I know.
Cheers, -Kevin
On 21.01.22 17:54, Kevin O'Connor wrote:
On Fri, Jan 21, 2022 at 05:41:17PM +0100, Alexander Graf wrote:
On 21.01.22 17:02, Kevin O'Connor wrote:
On Fri, Jan 21, 2022 at 03:28:33PM +0100, Alexander Graf wrote:
On 19.01.22 19:45, Kevin O'Connor wrote:
if ((ns->block_size * count) > (NVME_PAGE_SIZE * 2)) {
/* We need to describe more than 2 pages, rely on PRP List */
prp2 = ns->prpl;
/* We need to describe more than 2 pages, build PRP List */
u32 prpl_len = 0;
u64 *prpl = (void*)ns->dma_buffer;
At 15*8=120 bytes of data, do we even need to put the prpl into dma_buffer or can we just use the stack? Will the array be guaranteed 64bit aligned or would we need to add an attribute to it?
u64 prpl[NVME_MAX_PRPL_ENTRIES];
Either way, I don't have strong feelings one way or another. It just seems more natural to keep the bounce buffer purely as bounce buffer :).
In general it's possible to DMA from the stack (eg, src/hw/ohci.c:ohci_send_pipe() ). However, SeaBIOS doesn't guarantee stack alignment so it would need to be done manually. Also, I'm not sure how tight the nvme request completion code is - if it returns early for some reason it could cause havoc (device dma into random memory).
Another option might be to make a single global prpl array, but that would consume the memory even if nvme isn't in use.
FWIW though, I don't see a harm in the single ( u64 *prpl = (void*)ns->dma_buffer ) line.
Fair, works for me. But then we probably want to also adjust MAX_PRPL_ENTRIES to match the full page we now have available, no?
I don't think a BIOS disk request can be over 64KiB so I don't think it matters. I got the impression the current checks are just "sanity checks". I don't see a harm in keeping the sanity check and that size is as good as any other as far as I know.
It's a bit of both: Sanity checks and code that potentially can be reused outside of the SeaBIOS context. So I would try as hard as possible to not make assumptions that we can only handle max 64kb I/O requests. Plus, if we really wanted to, we could even introduce a new SeaBIOS specific INT to do larger I/O requests.
I won't make a fuss if you want to keep it at 64kb max request size though :).
Alex
Amazon Development Center Germany GmbH Krausenstr. 38 10117 Berlin Geschaeftsfuehrung: Christian Schlaeger, Jonathan Weiss Eingetragen am Amtsgericht Charlottenburg unter HRB 149173 B Sitz: Berlin Ust-ID: DE 289 237 879
On Fri, Jan 21, 2022 at 09:44:46PM +0100, Alexander Graf wrote:
On 21.01.22 17:54, Kevin O'Connor wrote:
On Fri, Jan 21, 2022 at 05:41:17PM +0100, Alexander Graf wrote:
On 21.01.22 17:02, Kevin O'Connor wrote:
On Fri, Jan 21, 2022 at 03:28:33PM +0100, Alexander Graf wrote:
On 19.01.22 19:45, Kevin O'Connor wrote:
if ((ns->block_size * count) > (NVME_PAGE_SIZE * 2)) {
/* We need to describe more than 2 pages, rely on PRP List */
prp2 = ns->prpl;
/* We need to describe more than 2 pages, build PRP List */
u32 prpl_len = 0;
u64 *prpl = (void*)ns->dma_buffer;
At 15*8=120 bytes of data, do we even need to put the prpl into dma_buffer or can we just use the stack? Will the array be guaranteed 64bit aligned or would we need to add an attribute to it?
u64 prpl[NVME_MAX_PRPL_ENTRIES];
Either way, I don't have strong feelings one way or another. It just seems more natural to keep the bounce buffer purely as bounce buffer :).
In general it's possible to DMA from the stack (eg, src/hw/ohci.c:ohci_send_pipe() ). However, SeaBIOS doesn't guarantee stack alignment so it would need to be done manually. Also, I'm not sure how tight the nvme request completion code is - if it returns early for some reason it could cause havoc (device dma into random memory).
Another option might be to make a single global prpl array, but that would consume the memory even if nvme isn't in use.
FWIW though, I don't see a harm in the single ( u64 *prpl = (void*)ns->dma_buffer ) line.
Fair, works for me. But then we probably want to also adjust MAX_PRPL_ENTRIES to match the full page we now have available, no?
I don't think a BIOS disk request can be over 64KiB so I don't think it matters. I got the impression the current checks are just "sanity checks". I don't see a harm in keeping the sanity check and that size is as good as any other as far as I know.
It's a bit of both: Sanity checks and code that potentially can be reused outside of the SeaBIOS context. So I would try as hard as possible to not make assumptions that we can only handle max 64kb I/O requests.
I agree. I also think it would be good to address the two items I raised at: https://www.mail-archive.com/seabios@seabios.org/msg12833.html
I'd prefer if someone more familiar with nvme (and has a better testing infrastructure) could look at the above items though. I'm not familiar with the nvme hardware specs and I'm just testing by "checking if qemu starts".
I'd be inclined to go forward with the current patch series as the above items seem orthogonal. That is, if there is interest, I'd guess they would be best merged on top of this series anyway.
Cheers, -Kevin
Am 22.01.2022 um 17:34 schrieb Kevin O'Connor kevin@koconnor.net:
On Fri, Jan 21, 2022 at 09:44:46PM +0100, Alexander Graf wrote:
On 21.01.22 17:54, Kevin O'Connor wrote: On Fri, Jan 21, 2022 at 05:41:17PM +0100, Alexander Graf wrote:
On 21.01.22 17:02, Kevin O'Connor wrote:
On Fri, Jan 21, 2022 at 03:28:33PM +0100, Alexander Graf wrote:
On 19.01.22 19:45, Kevin O'Connor wrote: > if ((ns->block_size * count) > (NVME_PAGE_SIZE * 2)) { > - /* We need to describe more than 2 pages, rely on PRP List */ > - prp2 = ns->prpl; > + /* We need to describe more than 2 pages, build PRP List */ > + u32 prpl_len = 0; > + u64 *prpl = (void*)ns->dma_buffer; At 15*8=120 bytes of data, do we even need to put the prpl into dma_buffer or can we just use the stack? Will the array be guaranteed 64bit aligned or would we need to add an attribute to it?
u64 prpl[NVME_MAX_PRPL_ENTRIES];
Either way, I don't have strong feelings one way or another. It just seems more natural to keep the bounce buffer purely as bounce buffer :).
In general it's possible to DMA from the stack (eg, src/hw/ohci.c:ohci_send_pipe() ). However, SeaBIOS doesn't guarantee stack alignment so it would need to be done manually. Also, I'm not sure how tight the nvme request completion code is - if it returns early for some reason it could cause havoc (device dma into random memory).
Another option might be to make a single global prpl array, but that would consume the memory even if nvme isn't in use.
FWIW though, I don't see a harm in the single ( u64 *prpl = (void*)ns->dma_buffer ) line.
Fair, works for me. But then we probably want to also adjust MAX_PRPL_ENTRIES to match the full page we now have available, no?
I don't think a BIOS disk request can be over 64KiB so I don't think it matters. I got the impression the current checks are just "sanity checks". I don't see a harm in keeping the sanity check and that size is as good as any other as far as I know.
It's a bit of both: Sanity checks and code that potentially can be reused outside of the SeaBIOS context. So I would try as hard as possible to not make assumptions that we can only handle max 64kb I/O requests.
I agree. I also think it would be good to address the two items I raised at: https://www.mail-archive.com/seabios@seabios.org/msg12833.html
I'd prefer if someone more familiar with nvme (and has a better testing infrastructure) could look at the above items though. I'm not familiar with the nvme hardware specs and I'm just testing by "checking if qemu starts".
I'd be inclined to go forward with the current patch series as the above items seem orthogonal. That is, if there is interest, I'd guess they would be best merged on top of this series anyway.
I agree with that plan :). Let me see if I can find a few cycles to be that person some time soon.
Alex
Cheers, -Kevin
Amazon Development Center Germany GmbH Krausenstr. 38 10117 Berlin Geschaeftsfuehrung: Christian Schlaeger, Jonathan Weiss Eingetragen am Amtsgericht Charlottenburg unter HRB 149173 B Sitz: Berlin Ust-ID: DE 289 237 879