This is an interesting enough setting to add a dprintf() for.
Signed-off-by: John Levon john.levon@nutanix.com --- src/fw/pciinit.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/src/fw/pciinit.c b/src/fw/pciinit.c index bb44dc29..9ad4e41a 100644 --- a/src/fw/pciinit.c +++ b/src/fw/pciinit.c @@ -1197,8 +1197,10 @@ pci_setup(void) } }
- if (CPUPhysBits >= 36 && CPULongMode && RamSizeOver4G) + if (CPUPhysBits >= 36 && CPULongMode && RamSizeOver4G) { + dprintf(1, "setting pci_pad_mem64=1\n"); pci_pad_mem64 = 1; + }
dprintf(1, "=== PCI bus & bridge init ===\n"); if (pci_probe_host() != 0) {
qemu_cfg_e820() reports RamSize* at debug level 1. Do the same in qemu_early_e820().
Signed-off-by: John Levon john.levon@nutanix.com --- src/fw/paravirt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/src/fw/paravirt.c b/src/fw/paravirt.c index e5d4eca0..0ff5d0a4 100644 --- a/src/fw/paravirt.c +++ b/src/fw/paravirt.c @@ -784,7 +784,7 @@ static int qemu_early_e820(void) } }
- dprintf(3, "qemu/e820: RamSize: 0x%08x\n", RamSize); - dprintf(3, "qemu/e820: RamSizeOver4G: 0x%016llx\n", RamSizeOver4G); + dprintf(1, "qemu/e820: RamSize: 0x%08x\n", RamSize); + dprintf(1, "qemu/e820: RamSizeOver4G: 0x%016llx\n", RamSizeOver4G); return 1; }
Signed-off-by: John Levon john.levon@nutanix.com --- src/fw/paravirt.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-)
diff --git a/src/fw/paravirt.c b/src/fw/paravirt.c index 0ff5d0a4..3ad9094b 100644 --- a/src/fw/paravirt.c +++ b/src/fw/paravirt.c @@ -28,6 +28,8 @@ #include "xen.h" // xen_biostable_setup #include "stacks.h" // yield
+#define MEM_4G (0x100000000ULL) + // Amount of continuous ram under 4Gig u32 RamSize; // Amount of continuous ram >4Gig @@ -589,7 +591,7 @@ qemu_cfg_e820(void) | ((u32)rtc_read(CMOS_MEM_HIGHMEM_MID) << 24) | ((u64)rtc_read(CMOS_MEM_HIGHMEM_HIGH) << 32)); RamSizeOver4G = high; - e820_add(0x100000000ull, high, E820_RAM); + e820_add(MEM_4G, high, E820_RAM); dprintf(1, "RamSizeOver4G: 0x%016llx [cmos]\n", RamSizeOver4G); }
@@ -772,14 +774,14 @@ static int qemu_early_e820(void) e820_add(table.address, table.length, table.type); dprintf(1, "qemu/e820: addr 0x%016llx len 0x%016llx [RAM]\n", table.address, table.length); - if (table.address < 0x100000000LL) { + if (table.address < MEM_4G) { // below 4g if (RamSize < table.address + table.length) RamSize = table.address + table.length; } else { // above 4g - if (RamSizeOver4G < table.address + table.length - 0x100000000LL) - RamSizeOver4G = table.address + table.length - 0x100000000LL; + if (RamSizeOver4G < table.address + table.length - MEM_4G) + RamSizeOver4G = table.address + table.length - MEM_4G; } } }
Older 32-bit Linux VMs (including Ubuntu 16.10) have issues with the 64-bit pci io window, failing during boot with errors like:
virtio_balloon virtio2: virtio: device uses modern interface but does not have VIRTIO_F_VERSION_19734-565debf7b362 virtio_net virtio0: virtio: device uses modern interface but does not have VIRTIO_F_VERSION_1 Virtio_scsi virtio1: virtio: device uses modern interface but does not have VIRTIO_F_VERSION_1 Gave up waiting for root device. Common problems: - Boot args (cat /proc/cmdline) - Check rootdelay= (did the system wait long enough?) - Check root= (did the system wait for the right device?) - Missing modules (cat /proc/modules; ls /dev) ALERT! /dev/disk/by-uuid/86859879-3f17-443d-a226-077c435291e2 does not exist. Dropping to a shell!
Be a bit more conservative, and only enable the window by default when the ram size extends beyond 60G. Due to the mmio window this translates to an effective working configuration limit of 58G/59G, depending on machine type.
Fixes: 96a8d130 ("be less conservative with the 64bit pci io window") Signed-off-by: John Levon john.levon@nutanix.com --- src/fw/paravirt.c | 28 ++++++++++++++++++++++++---- src/fw/paravirt.h | 1 + src/fw/pciinit.c | 6 +++++- 3 files changed, 30 insertions(+), 5 deletions(-)
diff --git a/src/fw/paravirt.c b/src/fw/paravirt.c index 3ad9094b..b1806361 100644 --- a/src/fw/paravirt.c +++ b/src/fw/paravirt.c @@ -29,11 +29,14 @@ #include "stacks.h" // yield
#define MEM_4G (0x100000000ULL) +#define MEM_60G (15 * 0x100000000ULL)
// Amount of continuous ram under 4Gig u32 RamSize; // Amount of continuous ram >4Gig u64 RamSizeOver4G; +// Amount of continuous ram >60Gig +u64 RamSizeOver60G; // physical address space bits u8 CPUPhysBits; // 64bit processor @@ -591,8 +594,12 @@ qemu_cfg_e820(void) | ((u32)rtc_read(CMOS_MEM_HIGHMEM_MID) << 24) | ((u64)rtc_read(CMOS_MEM_HIGHMEM_HIGH) << 32)); RamSizeOver4G = high; + RamSizeOver60G = 0; + if (high + MEM_4G > MEM_60G) + RamSizeOver60G = high + MEM_4G - MEM_60G; e820_add(MEM_4G, high, E820_RAM); dprintf(1, "RamSizeOver4G: 0x%016llx [cmos]\n", RamSizeOver4G); + dprintf(1, "RamSizeOver60G: 0x%016llx [cmos]\n", RamSizeOver60G); }
// Populate romfile entries for legacy fw_cfg ports (that predate the @@ -774,19 +781,32 @@ static int qemu_early_e820(void) e820_add(table.address, table.length, table.type); dprintf(1, "qemu/e820: addr 0x%016llx len 0x%016llx [RAM]\n", table.address, table.length); + // address below 4g? if (table.address < MEM_4G) { - // below 4g if (RamSize < table.address + table.length) RamSize = table.address + table.length; } else { - // above 4g - if (RamSizeOver4G < table.address + table.length - MEM_4G) - RamSizeOver4G = table.address + table.length - MEM_4G; + u64 table_end = table.address + table.length; + + /* + * Note that this would ignore any span that crosses the 4G + * boundary. For RamSizeOver60G, we do account for any spans + * that cross the 60G boundary. + */ + if (RamSizeOver4G < table_end - MEM_4G) + RamSizeOver4G = table_end - MEM_4G; + + // crosses 60G ? + if (table_end > MEM_60G) { + if (RamSizeOver60G < table_end - MEM_60G) + RamSizeOver60G = table_end - MEM_60G; + } } } }
dprintf(1, "qemu/e820: RamSize: 0x%08x\n", RamSize); dprintf(1, "qemu/e820: RamSizeOver4G: 0x%016llx\n", RamSizeOver4G); + dprintf(1, "qemu/e820: RamSizeOver60G: 0x%016llx\n", RamSizeOver60G); return 1; } diff --git a/src/fw/paravirt.h b/src/fw/paravirt.h index 62a2cd07..e5601d7e 100644 --- a/src/fw/paravirt.h +++ b/src/fw/paravirt.h @@ -30,6 +30,7 @@ typedef struct QemuCfgDmaAccess {
extern u32 RamSize; extern u64 RamSizeOver4G; +extern u64 RamSizeOver60G; extern int PlatformRunningOn; extern u8 CPUPhysBits; extern u8 CPULongMode; diff --git a/src/fw/pciinit.c b/src/fw/pciinit.c index 9ad4e41a..7446d3d8 100644 --- a/src/fw/pciinit.c +++ b/src/fw/pciinit.c @@ -1197,7 +1197,11 @@ pci_setup(void) } }
- if (CPUPhysBits >= 36 && CPULongMode && RamSizeOver4G) { + /* + * Only enable this if we exceed 60G: some older 32-bit Linux VMs cannot + * handle this correctly. + */ + if (CPUPhysBits >= 36 && CPULongMode && RamSizeOver60G) { dprintf(1, "setting pci_pad_mem64=1\n"); pci_pad_mem64 = 1; }
On Thu, Jun 13, 2024 at 06:00:29PM GMT, John Levon wrote:
This is an interesting enough setting to add a dprintf() for.
Indeed.
- if (CPUPhysBits >= 36 && CPULongMode && RamSizeOver4G)
- if (CPUPhysBits >= 36 && CPULongMode && RamSizeOver4G) {
dprintf(1, "setting pci_pad_mem64=1\n");
Should be more descriptive, so people can have an idea what this means without looking at the source code. Something along the lines of "enabling 64-bit pci mmio window".
take care, Gerd
Hi,
Be a bit more conservative, and only enable the window by default when the ram size extends beyond 60G. Due to the mmio window this translates to an effective working configuration limit of 58G/59G, depending on machine type.
Why 60GB not 64GB?
take care, Gerd
On Fri, Jun 14, 2024 at 12:54:28PM +0200, Gerd Hoffmann wrote:
Be a bit more conservative, and only enable the window by default when the ram size extends beyond 60G. Due to the mmio window this translates to an effective working configuration limit of 58G/59G, depending on machine type.
Why 60GB not 64GB?
I just realised I mis-understood your suggestion here:
I think it makes sense to check for "RamSizeOver4G > 60GB" then.
We actually prefer 64G so will send out that version when I'm back online next week.
thanks john
On Fri, Jun 14, 2024 at 12:54:28PM +0200, Gerd Hoffmann wrote:
Hi,
Be a bit more conservative, and only enable the window by default when the ram size extends beyond 60G. Due to the mmio window this translates to an effective working configuration limit of 58G/59G, depending on machine type.
Why 60GB not 64GB?
Could this change introduce a further regression - guests with 32GB of ram that need extra pci space now break?
If ram size is not a good indicator of being 64bit capable, are there other indicators that we could use (eg, looking for how much memory the PCI devices are requesting)?
Cheers, -Kevin
On Fri, Jun 14, 2024 at 01:05:44PM GMT, Kevin O'Connor wrote:
On Fri, Jun 14, 2024 at 12:54:28PM +0200, Gerd Hoffmann wrote:
Hi,
Be a bit more conservative, and only enable the window by default when the ram size extends beyond 60G. Due to the mmio window this translates to an effective working configuration limit of 58G/59G, depending on machine type.
Why 60GB not 64GB?
Could this change introduce a further regression - guests with 32GB of ram that need extra pci space now break?
Unlikely.
If ram size is not a good indicator of being 64bit capable, are there other indicators that we could use (eg, looking for how much memory the PCI devices are requesting)?
If the pci bars of coldplugged devices do not fit into the 32-bit mmio window below 4G seabios will enable the 64-bit mmio window too.
Turning on the 64-bit mmio window in more cases is indented to make hotplugging devices with large pci bars easier.
So the only case where a regression could happen is if hotplugging is involved, for example a 32G guest gets a device with a 4G pci bar hotplugged. This can be handled by manually setting up pci(e) bridge window size hints in qemu. Same way this case would have been handled before commit 96a8d130a8c2 ("be less conservative with the 64bit pci io window").
take care, Gerd
Dear John,
Thank you for your patch. Maybe be more specific in the summary/title:
paravirt: Log RamSize* at level 1 instead of 3
Reviewed-by: Paul Menzel pmenzel@molgen.mpg.de
Kind regards,
Paul
On Mon, 17 Jun 2024 13:42:20 +0200 Gerd Hoffmann kraxel@redhat.com wrote:
On Fri, Jun 14, 2024 at 01:05:44PM GMT, Kevin O'Connor wrote:
On Fri, Jun 14, 2024 at 12:54:28PM +0200, Gerd Hoffmann wrote:
Hi,
Be a bit more conservative, and only enable the window by default when the ram size extends beyond 60G. Due to the mmio window this translates to an effective working configuration limit of 58G/59G, depending on machine type.
Why 60GB not 64GB?
Could this change introduce a further regression - guests with 32GB of ram that need extra pci space now break?
Unlikely.
If ram size is not a good indicator of being 64bit capable, are there other indicators that we could use (eg, looking for how much memory the PCI devices are requesting)?
If the pci bars of coldplugged devices do not fit into the 32-bit mmio window below 4G seabios will enable the 64-bit mmio window too.
Turning on the 64-bit mmio window in more cases is indented to make hotplugging devices with large pci bars easier.
So the only case where a regression could happen is if hotplugging is involved, for example a 32G guest gets a device with a 4G pci bar hotplugged. This can be handled by manually setting up pci(e) bridge window size hints in qemu. Same way this case would have been handled before commit 96a8d130a8c2 ("be less conservative with the 64bit pci io window").
we had a number of hotplug related issues due to insufficient MMIO window size, especially when it comes to linux guests. Until 96a8d130a8c2 made those complains go away in most cases. So it would 'regress' those cases. Also I'd say RAM size is not a good indicator that guest doesn't need/can't handle 64-bit MMIO window, it's totally valid config to have low RAM guest with devices that have large PCI bars.
While it's possible to tweak MMIO windows size on QEMU CLI, it's inconvenient and that cascades over to upper layers (libvirt, whatnot on top of that) eventually ending up at end-user somewhere if that config change supported.
So this patch would 'break' pci hotplug on modern linux guest which could or couldn't be fixed by enduser (depending on used sw stack).
Is there any other way to fix old 32 linux guest issues while keeping modern ones happy as well?
take care, Gerd
SeaBIOS mailing list -- seabios@seabios.org To unsubscribe send an email to seabios-leave@seabios.org
Hi,
While it's possible to tweak MMIO windows size on QEMU CLI, it's inconvenient and that cascades over to upper layers (libvirt, whatnot on top of that) eventually ending up at end-user somewhere if that config change supported.
So this patch would 'break' pci hotplug on modern linux guest which could or couldn't be fixed by enduser (depending on used sw stack).
Is there any other way to fix old 32 linux guest issues while keeping modern ones happy as well?
Well, the fundamental issue is that seabios simply doesn't know whenever it is booting a 32-bit or 64-bit os. So we are doing heuristics for the default configuration, trying to make things work well without manual invention in as many cases as possible. Because any kind of manual configuration has the drawbacks you've just outlined above.
One hint seabios takes is long mode support. Booting 32-bit guests with 'qemu -cpu $model,lm=off' will disable the 64-bit mmio window even in case there is alot of memory present. But that is manual configuration too ...
I'm open to ideas, but it's a hard problem unfortunately.
take care, Gerd
On Mon, 17 Jun 2024 16:04:29 +0200 Gerd Hoffmann kraxel@redhat.com wrote:
Hi,
While it's possible to tweak MMIO windows size on QEMU CLI, it's inconvenient and that cascades over to upper layers (libvirt, whatnot on top of that) eventually ending up at end-user somewhere if that config change supported.
So this patch would 'break' pci hotplug on modern linux guest which could or couldn't be fixed by enduser (depending on used sw stack).
Is there any other way to fix old 32 linux guest issues while keeping modern ones happy as well?
Well, the fundamental issue is that seabios simply doesn't know whenever it is booting a 32-bit or 64-bit os. So we are doing heuristics for the default configuration, trying to make things work well without manual invention in as many cases as possible. Because any kind of manual configuration has the drawbacks you've just outlined above.
One hint seabios takes is long mode support. Booting 32-bit guests with 'qemu -cpu $model,lm=off' will disable the 64-bit mmio window even in case there is alot of memory present. But that is manual configuration too ...
another manual way to boot such guest could be to force pci realloc on kernel CLI.
or use some older SeaBIOS version where it doesn't assign huge 64-bit MMIO windows.
Maybe fix guest kernel (commit message isn't clear on what exactly fails on guest side)
I'm open to ideas, but it's a hard problem unfortunately.
I don't have any (it is not the issue that qemu/seabios can reasonably deal with, it's more like upper layers problem to properly configure VM according to installed OS).
Regardless of which way is chosen some users will suffer one way or another. My vote would be to keep current behavior so 'modern' guests would work without issues.
old 32-bit PAE ones can still use workaround(s) to deal with the problem.
take care, Gerd
On Mon, Jun 17, 2024 at 01:42:20PM +0200, Gerd Hoffmann wrote:
On Fri, Jun 14, 2024 at 01:05:44PM GMT, Kevin O'Connor wrote:
On Fri, Jun 14, 2024 at 12:54:28PM +0200, Gerd Hoffmann wrote:
Hi,
Be a bit more conservative, and only enable the window by default when the ram size extends beyond 60G. Due to the mmio window this translates to an effective working configuration limit of 58G/59G, depending on machine type.
Why 60GB not 64GB?
Could this change introduce a further regression - guests with 32GB of ram that need extra pci space now break?
Unlikely.
Well, there are a very large number of SeaBIOS installs. It's hard to quantify the impact to guests that need large PCI hotplug without much ram, compared to the impact to guests that need 64bit ram but can't handle 64bit pci.
Just to review - the original change (96a8d130) was made to encourage padded PCI mappings so that hotplug has a better chance of succeeding. It was thought that there would be few guests that would have >4G ram that could not also handle >4G pci mappings. Did I miss anything?
I'm leery of moving this heuristic to 64G of ram. I can understand the logic of >4G of ram indicating support for >4G pci. However, it seems strange to me that there would be guests with 50G of ram that can't handle >4G pci, but not similar guests with 70G of ram.. It also seems strange to me that only pci hotplug users with more than 64G of ram will benfit from padded mappings.
Some possible alternatives:
* We could do nothing for now and continue to gauge how much of a problem this is.
* We could revert 96a8d130, and go back to explicit qemu hints for pci hotplug users that need large mappings.
Are there other possibilities or other heuristics?
-Kevin
Dear Kevin, dear SeaBIOS folks,
Am 20.06.24 um 19:29 schrieb Kevin O'Connor:
On Mon, Jun 17, 2024 at 01:42:20PM +0200, Gerd Hoffmann wrote:
On Fri, Jun 14, 2024 at 01:05:44PM GMT, Kevin O'Connor wrote:
On Fri, Jun 14, 2024 at 12:54:28PM +0200, Gerd Hoffmann wrote:
Hi,
Be a bit more conservative, and only enable the window by default when the ram size extends beyond 60G. Due to the mmio window this translates to an effective working configuration limit of 58G/59G, depending on machine type.
Why 60GB not 64GB?
Could this change introduce a further regression - guests with 32GB of ram that need extra pci space now break?
Unlikely.
Well, there are a very large number of SeaBIOS installs. It's hard to quantify the impact to guests that need large PCI hotplug without much ram, compared to the impact to guests that need 64bit ram but can't handle 64bit pci.
Just to review - the original change (96a8d130) was made to encourage padded PCI mappings so that hotplug has a better chance of succeeding. It was thought that there would be few guests that would have >4G ram that could not also handle >4G pci mappings. Did I miss anything?
I'm leery of moving this heuristic to 64G of ram. I can understand the logic of >4G of ram indicating support for >4G pci. However, it seems strange to me that there would be guests with 50G of ram that can't handle >4G pci, but not similar guests with 70G of ram.. It also seems strange to me that only pci hotplug users with more than 64G of ram will benfit from padded mappings.
Some possible alternatives:
We could do nothing for now and continue to gauge how much of a problem this is.
We could revert 96a8d130, and go back to explicit qemu hints for pci hotplug users that need large mappings.
Are there other possibilities or other heuristics?
I know SeaBIOS does not promise anything regarding the version number, but maybe reverting the change and releasing 1.16.4, and then applying the change and increasing the number to 1.17 with a big note/warning in the announcement would lessen the impact of the change..
Kind regards,
Paul
On Thu, Jun 20, 2024 at 01:29:11PM -0400, Kevin O'Connor wrote:
I'm leery of moving this heuristic to 64G of ram. I can understand the logic of >4G of ram indicating support for >4G pci. However, it seems strange to me that there would be guests with 50G of ram that can't handle >4G pci, but not similar guests with 70G of ram.. It
Such a guest cannot possibly address that much, so it doesn't seem at all unreasonable to require a config change there. That is, by definition there can't be a workload in the OS that's relying on 70G of RAM.
And yes, from what we've been able to ascertain, there really are 32-bit VMs with 50G of RAM in production...
regards john
On Thu, Jun 20, 2024 at 07:08:38PM +0100, John Levon wrote:
On Thu, Jun 20, 2024 at 01:29:11PM -0400, Kevin O'Connor wrote:
I'm leery of moving this heuristic to 64G of ram. I can understand the logic of >4G of ram indicating support for >4G pci. However, it seems strange to me that there would be guests with 50G of ram that can't handle >4G pci, but not similar guests with 70G of ram.. It
Such a guest cannot possibly address that much, so it doesn't seem at all unreasonable to require a config change there. That is, by definition there can't be a workload in the OS that's relying on 70G of RAM.
Thanks. I missed that PAE is limited to 36bits (64GB).
Is the problem that SeaBIOS created PCI mappings >4G or is the problem that SeaBIOS created PCI mappings >64G?
-Kevin
On Thu, Jun 20, 2024 at 02:48:03PM GMT, Kevin O'Connor wrote:
On Thu, Jun 20, 2024 at 07:08:38PM +0100, John Levon wrote:
On Thu, Jun 20, 2024 at 01:29:11PM -0400, Kevin O'Connor wrote:
I'm leery of moving this heuristic to 64G of ram. I can understand the logic of >4G of ram indicating support for >4G pci. However, it seems strange to me that there would be guests with 50G of ram that can't handle >4G pci, but not similar guests with 70G of ram.. It
Such a guest cannot possibly address that much, so it doesn't seem at all unreasonable to require a config change there. That is, by definition there can't be a workload in the OS that's relying on 70G of RAM.
Thanks. I missed that PAE is limited to 36bits (64GB).
I had assumed that, but it's not. That was the case in all 32-bit processors with PAE support.
But 64-bit processors have the same physical address space limit in both long mode and pae paging mode. For early intel processors that happened to be 64GB too.
Is the problem that SeaBIOS created PCI mappings >4G or is the problem that SeaBIOS created PCI mappings >64G?
Good question. Older linux kernels have known problems with mappings above 64TB (aka 46 phys-bits). This is addressed by commit a6ed6b701f0a ("limit address space used for pci devices.").
John, can you check with your guests? Try reduce pci_mem64_top and see if things start working at some point.
take care, Gerd
On Thu, Jun 20, 2024 at 07:34:28PM +0200, Paul Menzel wrote:
Am 20.06.24 um 19:29 schrieb Kevin O'Connor:
Some possible alternatives:
We could do nothing for now and continue to gauge how much of a problem this is.
We could revert 96a8d130, and go back to explicit qemu hints for pci hotplug users that need large mappings.
Are there other possibilities or other heuristics?
I know SeaBIOS does not promise anything regarding the version number, but maybe reverting the change and releasing 1.16.4, and then applying the change and increasing the number to 1.17 with a big note/warning in the announcement would lessen the impact of the change..
Thanks. It seems an improved work around for the underlying issue was found, so I'm not sure we need to revert the existing heuristic.
That said, I think we probably should plan a v1.17.0 release.
Cheers, -Kevin
On Fri, Jun 21, 2024 at 03:45:43PM GMT, Kevin O'Connor wrote:
On Thu, Jun 20, 2024 at 07:34:28PM +0200, Paul Menzel wrote:
I know SeaBIOS does not promise anything regarding the version number, but maybe reverting the change and releasing 1.16.4, and then applying the change and increasing the number to 1.17 with a big note/warning in the announcement would lessen the impact of the change..
Thanks. It seems an improved work around for the underlying issue was found, so I'm not sure we need to revert the existing heuristic.
Patch just sent to the list.
That said, I think we probably should plan a v1.17.0 release.
Agree.
take care, Gerd