Gerd Hoffmann (4):
  better kvm detection
  detect physical address space size
  move 64bit pci window to end of address space
  be less conservative with the 64bit pci io window
 src/fw/paravirt.h |  2 ++
 src/fw/paravirt.c | 87 ++++++++++++++++++++++++++++++++++++++++++-----
 src/fw/pciinit.c  | 16 ++++++++-
 3 files changed, 96 insertions(+), 9 deletions(-)
In case kvm emulates features of another hypervisor (for example hyperv), two VMM CPUID blocks will be present, one for the emulated hypervisor and one for kvm itself.
This patch makes seabios loop over the VMM CPUID blocks to make sure it will properly detect kvm when multiple blocks are present.
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
---
 src/fw/paravirt.c | 27 +++++++++++++++++++--------
 1 file changed, 19 insertions(+), 8 deletions(-)
diff --git a/src/fw/paravirt.c b/src/fw/paravirt.c
index fba4e52db684..c880cb10a1bc 100644
--- a/src/fw/paravirt.c
+++ b/src/fw/paravirt.c
@@ -56,20 +56,31 @@ inline int qemu_cfg_dma_enabled(void)
 
 static void kvm_detect(void)
 {
+    unsigned int i, kvmbase = 0, max = 0;
     unsigned int eax, ebx, ecx, edx;
     char signature[13];
 
-    cpuid(KVM_CPUID_SIGNATURE, &eax, &ebx, &ecx, &edx);
-    memcpy(signature + 0, &ebx, 4);
-    memcpy(signature + 4, &ecx, 4);
-    memcpy(signature + 8, &edx, 4);
-    signature[12] = 0;
+    for (i = KVM_CPUID_SIGNATURE;; i += 0x100) {
+        eax = 0;
+        cpuid(i, &eax, &ebx, &ecx, &edx);
+        if (eax < i)
+            break;
+        memcpy(signature + 0, &ebx, 4);
+        memcpy(signature + 4, &ecx, 4);
+        memcpy(signature + 8, &edx, 4);
+        signature[12] = 0;
+        dprintf(1, "cpuid 0x%x: eax %x, signature '%s'\n", i, eax, signature);
+        if (strcmp(signature, "KVMKVMKVM") == 0) {
+            kvmbase = i;
+            max = eax;
+        }
+    }
 
-    if (strcmp(signature, "KVMKVMKVM") == 0) {
+    if (kvmbase) {
         dprintf(1, "Running on KVM\n");
         PlatformRunningOn |= PF_KVM;
-        if (eax >= KVM_CPUID_SIGNATURE + 0x10) {
-            cpuid(KVM_CPUID_SIGNATURE + 0x10, &eax, &ebx, &ecx, &edx);
+        if (max >= kvmbase + 0x10) {
+            cpuid(kvmbase + 0x10, &eax, &ebx, &ecx, &edx);
             dprintf(1, "kvm: have invtsc, freq %u kHz\n", eax);
             tsctimer_setfreq(eax, "invtsc");
         }
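For reference, the convention the loop relies on: hypervisor CPUID leaves start at 0x40000000, and when one hypervisor emulates another the blocks are stacked at 0x100 offsets, each reporting its own maximum leaf in eax. A minimal userspace sketch of the same enumeration (assuming gcc/clang and their cpuid.h; not SeaBIOS code, which has its own cpuid() wrapper):

#include <stdio.h>
#include <string.h>
#include <cpuid.h>              /* gcc/clang __cpuid() macro */

#define VMM_CPUID_BASE 0x40000000

int main(void)
{
    unsigned int i, eax, ebx, ecx, edx;
    char sig[13];

    /* Production code should first check the hypervisor bit
     * (CPUID.1:ECX bit 31) before poking the 0x4000xxxx range. */
    for (i = VMM_CPUID_BASE;; i += 0x100) {
        eax = 0;
        __cpuid(i, eax, ebx, ecx, edx);
        if (eax < i)            /* no (further) VMM block here */
            break;
        memcpy(sig + 0, &ebx, 4);
        memcpy(sig + 4, &ecx, 4);
        memcpy(sig + 8, &edx, 4);
        sig[12] = 0;
        printf("block 0x%x: max leaf 0x%x, signature '%s'\n", i, eax, sig);
    }
    return 0;
}

On a hyperv-on-kvm guest this prints two blocks, 'Microsoft Hv' at 0x40000000 and 'KVMKVMKVM' at 0x40000100, which is exactly the case the patch handles.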
On Mon, 2022-11-21 at 11:32 +0100, Gerd Hoffmann wrote:
In case kvm emulates features of another hypervisor (for example hyperv) two VMM CPUID blocks will be present, one for the emulated hypervisor and one for kvm itself.
That isn't the case for emulating Xen on KVM, FWIW. Only for Hyper-V on KVM (and also for Hyper-V on Xen).
Check for pae and long mode using cpuid. If present, also read the physical address bits. Apply some qemu sanity checks (see below). Record the results in the PhysBits and LongMode variables. In case we are not sure what the address space size is, leave the PhysBits variable unset.
On qemu we have the problem that for historical reasons x86_64 processors advertise 40 physical address space bits by default, even in case the host supports less than that, so actually using the whole address space will not work.
Because of that the code applies some extra sanity checks in case we find 40 (or fewer) physical address space bits advertised. Only known-good values (40 for amd processors, 36 and 39 for intel processors) will be accepted as valid.
The recommendation is to use 'qemu -cpu ${name},host-phys-bits=on' to advertise valid physical address space bits to the guest. Some distro builds enable this by default, and most likely the qemu default will change in the near future too.
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
---
 src/fw/paravirt.h |  2 ++
 src/fw/paravirt.c | 60 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 62 insertions(+)
diff --git a/src/fw/paravirt.h b/src/fw/paravirt.h
index 4e2e993ba9d3..7ca3fdcc06d4 100644
--- a/src/fw/paravirt.h
+++ b/src/fw/paravirt.h
@@ -31,6 +31,8 @@ typedef struct QemuCfgDmaAccess {
 extern u32 RamSize;
 extern u64 RamSizeOver4G;
 extern int PlatformRunningOn;
+extern u8 PhysBits;
+extern int LongMode;
 
 static inline int runningOnQEMU(void) {
     return CONFIG_QEMU || (
diff --git a/src/fw/paravirt.c b/src/fw/paravirt.c
index c880cb10a1bc..d50a8a7e9d55 100644
--- a/src/fw/paravirt.c
+++ b/src/fw/paravirt.c
@@ -32,6 +32,10 @@ u32 RamSize;
 // Amount of continuous ram >4Gig
 u64 RamSizeOver4G;
+// physical address space bits
+u8 PhysBits;
+// 64bit processor
+int LongMode;
 // Type of emulator platform.
 int PlatformRunningOn VARFSEG;
 // cfg enabled
@@ -129,6 +133,61 @@ static void kvmclock_init(void)
     tsctimer_setfreq(MHz * 1000, "kvmclock");
 }
 
+static void physbits(int qemu_quirk)
+{
+    unsigned int max, eax, ebx, ecx, edx;
+    unsigned int physbits;
+    char signature[13];
+    int pae = 0, valid = 0;
+
+    cpuid(0, &eax, &ebx, &ecx, &edx);
+    memcpy(signature + 0, &ebx, 4);
+    memcpy(signature + 4, &edx, 4);
+    memcpy(signature + 8, &ecx, 4);
+    signature[12] = 0;
+    if (eax >= 1) {
+        cpuid(1, &eax, &ebx, &ecx, &edx);
+        pae = (edx & (1 << 6));
+    }
+
+    cpuid(0x80000000, &eax, &ebx, &ecx, &edx);
+    max = eax;
+
+    if (max >= 0x80000001) {
+        cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
+        LongMode = (edx & (1 << 29));
+    }
+
+    if (pae && LongMode && max >= 0x80000008) {
+        cpuid(0x80000008, &eax, &ebx, &ecx, &edx);
+        physbits = (u8)eax;
+        if (!qemu_quirk) {
+            valid = 1;
+        } else if (physbits >= 41) {
+            valid = 1;
+        } else if (strcmp(signature, "GenuineIntel") == 0) {
+            if ((physbits == 36) || (physbits == 39))
+                valid = 1;
+        } else if (strcmp(signature, "AuthenticAMD") == 0) {
+            if (physbits == 40)
+                valid = 1;
+        }
+    } else {
+        physbits = pae ? 36 : 32;
+        valid = 1;
+    }
+
+    dprintf(1, "%s: signature=\"%s\", pae=%s, lm=%s, phys-bits=%d, valid=%s\n",
+            __func__, signature, pae ? "yes" : "no", LongMode ? "yes" : "no",
+            physbits, valid ? "yes" : "no");
+
+    if (valid) {
+        if (physbits > 47)
+            physbits = 47;
+        PhysBits = physbits;
+    }
+}
+
 static void qemu_detect(void)
 {
     if (!CONFIG_QEMU_HARDWARE)
@@ -161,6 +220,7 @@ static void qemu_detect(void)
         dprintf(1, "Running on QEMU (unknown nb: %04x:%04x)\n", v, d);
         break;
     }
+    physbits(1);
 }
 
 static int qemu_early_e820(void);
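The qemu sanity check is easiest to follow pulled out as a pure predicate. A sketch restating the patch's accept/reject logic (not the actual SeaBIOS code):

#include <string.h>

/* Accept the CPUID-reported phys-bits unless it could be qemu's
 * historical 40-bit default; in that case only known-good values
 * for the given vendor signature pass. */
static int physbits_valid(unsigned int physbits, const char *signature,
                          int qemu_quirk)
{
    if (!qemu_quirk)
        return 1;        /* not qemu: trust cpuid as-is */
    if (physbits >= 41)
        return 1;        /* above the bogus default, so set on purpose */
    if (strcmp(signature, "GenuineIntel") == 0)
        return physbits == 36 || physbits == 39;
    if (strcmp(signature, "AuthenticAMD") == 0)
        return physbits == 40;
    return 0;
}

So physbits_valid(40, "GenuineIntel", 1) is 0: a 40-bit report from an intel-flavored qemu cpu model is assumed to be the untrustworthy default, while the same 40-bit report on an amd model is accepted.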
When the size of the physical address space is known (PhysBits is not zero), move the 64bit pci io window to the end of the address space.
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
---
 src/fw/pciinit.c | 8 ++++++++
 1 file changed, 8 insertions(+)
diff --git a/src/fw/pciinit.c b/src/fw/pciinit.c
index badf13d3233b..ad6def93633b 100644
--- a/src/fw/pciinit.c
+++ b/src/fw/pciinit.c
@@ -1128,6 +1128,14 @@ static void pci_bios_map_devices(struct pci_bus *busses)
     r64_mem.base = le64_to_cpu(romfile_loadint("etc/reserved-memory-end", 0));
     if (r64_mem.base < 0x100000000LL + RamSizeOver4G)
         r64_mem.base = 0x100000000LL + RamSizeOver4G;
+    if (PhysBits) {
+        u64 top = 1LL << PhysBits;
+        u64 size = (ALIGN(sum_mem, (1LL<<30)) +
+                    ALIGN(sum_pref, (1LL<<30)));
+        if (r64_mem.base < top - size) {
+            r64_mem.base = top - size;
+        }
+    }
     r64_mem.base = ALIGN(r64_mem.base, align_mem);
     r64_mem.base = ALIGN(r64_mem.base, (1LL<<30)); // 1G hugepage
     r64_pref.base = r64_mem.base + sum_mem;
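To put concrete numbers on the hunk above (made-up window sums, purely for illustration): with phys-bits=40, 2G of RAM above 4G, and window sums of 3G non-prefetchable plus 16G prefetchable, the window base moves from just above RAM to just below the top of the address space. A standalone sketch of the same arithmetic:

#include <stdio.h>

/* round x up to a power-of-two boundary, like SeaBIOS's ALIGN() */
#define ALIGN(x, a) (((x) + (a) - 1) & ~((unsigned long long)(a) - 1))

int main(void)
{
    unsigned long long physbits = 40;
    unsigned long long ram_over_4g = 2ULL << 30;   /* 2G */
    unsigned long long sum_mem  = 3ULL << 30;      /* 3G non-pref */
    unsigned long long sum_pref = 16ULL << 30;     /* 16G pref */

    unsigned long long base = 0x100000000ULL + ram_over_4g;
    unsigned long long top  = 1ULL << physbits;
    unsigned long long size = ALIGN(sum_mem, 1ULL << 30)
                            + ALIGN(sum_pref, 1ULL << 30);
    if (base < top - size)
        base = top - size;
    base = ALIGN(base, 1ULL << 30);                /* 1G hugepage */

    /* prints "64bit window: fb40000000 .. 10000000000" instead of
     * the old placement at 180000000 */
    printf("64bit window: %llx .. %llx\n", base, top);
    return 0;
}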
Dear Gerd,
Thank you for the patch.
On 21.11.22 at 11:32, Gerd Hoffmann wrote:
When the size of the physical address space is known (PhysBits is not zero), move the 64bit pci io window to the end of the address space.
It’d be great if you elaborated on why this is an improvement.
Kind regards,
Paul
On Mon, Nov 21, 2022 at 11:57:35AM +0100, Paul Menzel wrote:
Dear Gerd,
Thank you for the patch.
On 21.11.22 at 11:32, Gerd Hoffmann wrote:
When the size of the physical address space is known (PhysBits is not zero), move the 64bit pci io window to the end of the address space.
It’d be great if you elaborated on why this is an improvement.
It makes seabios follow a common pattern: the real mode address space has io resources mapped high (below 1M), and the 32-bit address space has io resources mapped high too (below 4G). This does the same for 64-bit resources.
Physical hardware does similar things; here is my workstation:
# sudo cat /proc/iomem
[ ... regions below 4G snipped ... ]
 100000000-a36ffffff : System RAM
   2e6000000-2e7002607 : Kernel code
   2e7200000-2e7e84fff : Kernel rodata
   2e8000000-2e8326e7f : Kernel data
   2e8b6e000-2e8ffffff : Kernel bss
 a37000000-a37ffffff : RAM buffer
2000000000-2fffffffff : PCI Bus 0000:00
  2ffff00000-2ffff0ffff : 0000:00:1f.3
  2ffff10000-2ffff1ffff : 0000:00:14.0
    2ffff10000-2ffff1ffff : xhci-hcd
  2ffff20000-2ffff23fff : 0000:00:1f.3
    2ffff20000-2ffff23fff : ICH HD audio
  2ffff24000-2ffff240ff : 0000:00:1f.4
  2ffff25000-2ffff25fff : 0000:00:16.0
    2ffff25000-2ffff25fff : mei_me
There is a larger gap between the end of memory and the start of the pci io window. It's not at the end of the *cpu* physical address space (that would be 7fffffffff, aka phys-bits=39). Maybe there are additional constraints in the io chipset (which is a non-issue for virtual machines).
take care, Gerd
Current seabios code will only enable and use the 64bit pci io window in case it runs out of space in the 32bit pci mmio window below 4G.
This patch will also enable the 64bit pci io window when (a) RAM above 4G is present, and (b) the physical address space size is known, and (c) seabios is running on a 64bit capable processor.
This operates with the assumption that guests which are ok with memory above 4G most likely can handle mmio above 4G too.
In case the 64bit pci io window is enabled also assign more memory to prefetchable pci bridge windows (scale with address space).
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
---
 src/fw/pciinit.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/src/fw/pciinit.c b/src/fw/pciinit.c
index ad6def93633b..3e9636b139a4 100644
--- a/src/fw/pciinit.c
+++ b/src/fw/pciinit.c
@@ -51,6 +51,7 @@ u64 pcimem_end = BUILD_PCIMEM_END;
 u64 pcimem64_start = BUILD_PCIMEM64_START;
 u64 pcimem64_end = BUILD_PCIMEM64_END;
 u64 pci_io_low_end = 0xa000;
+u32 pci_use_64bit = 0;
 
 struct pci_region_entry {
     struct pci_device *dev;
@@ -920,6 +921,8 @@ static int pci_bios_check_devices(struct pci_bus *busses)
         for (type = 0; type < PCI_REGION_TYPE_COUNT; type++) {
             u64 align = (type == PCI_REGION_TYPE_IO) ?
                 PCI_BRIDGE_IO_MIN : PCI_BRIDGE_MEM_MIN;
+            if (pci_use_64bit && (type == PCI_REGION_TYPE_PREFMEM))
+                align = (u64)1 << (PhysBits - 11);
             if (!pci_bridge_has_region(s->bus_dev, type))
                 continue;
             u64 size = 0;
@@ -1108,7 +1111,7 @@ static void pci_bios_map_devices(struct pci_bus *busses)
         panic("PCI: out of I/O address space\n");
 
     dprintf(1, "PCI: 32: %016llx - %016llx\n", pcimem_start, pcimem_end);
-    if (pci_bios_init_root_regions_mem(busses)) {
+    if (pci_use_64bit || pci_bios_init_root_regions_mem(busses)) {
         struct pci_region r64_mem, r64_pref;
         r64_mem.list.first = NULL;
         r64_pref.list.first = NULL;
@@ -1174,6 +1177,9 @@ pci_setup(void)
 
     dprintf(3, "pci setup\n");
 
+    if (PhysBits >= 36 && LongMode && RamSizeOver4G)
+        pci_use_64bit = 1;
+
     dprintf(1, "=== PCI bus & bridge init ===\n");
     if (pci_probe_host() != 0) {
         return;
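The (u64)1 << (PhysBits - 11) line sizes prefetchable bridge windows at 1/2048th of the address space. A throwaway calculation (not SeaBIOS code) showing what that yields:

#include <stdio.h>

int main(void)
{
    unsigned int bits;
    for (bits = 36; bits <= 46; bits += 2) {
        unsigned long long align = 1ULL << (bits - 11);
        /* e.g. 36 -> 32 MiB, 40 -> 512 MiB, 46 -> 32768 MiB */
        printf("phys-bits %u -> bridge prefmem align %llu MiB\n",
               bits, align >> 20);
    }
    return 0;
}

So on a default 40-bit qemu guest each bridge's prefetchable window is carved out in 512 MiB chunks rather than the much smaller PCI_BRIDGE_MEM_MIN default, which is where the 'scale with address space' wording in the commit message comes from.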
On Mon, Nov 21, 2022 at 11:32:13AM +0100, Gerd Hoffmann wrote:
Current seabios code will only enable and use the 64bit pci io window in case it runs out of space in the 32bit pci mmio window below 4G.
This patch will also enable the 64bit pci io window when (a) RAM above 4G is present, and (b) the physical address space size is known, and (c) seabios is running on a 64bit capable processor.
This operates with the assumption that guests which are ok with memory above 4G most likely can handle mmio above 4G too.
Thanks. In general, the series looks good to me. Can you elaborate on the background to this change though? It sounds like there is a (small) risk of a regression, so I think it would be good to have a high level understanding of what is driving this memory reorg.
Cheers, -Kevin
On Tue, Nov 22, 2022 at 01:43:16PM -0500, Kevin O'Connor wrote:
On Mon, Nov 21, 2022 at 11:32:13AM +0100, Gerd Hoffmann wrote:
Current seabios code will only enable and use the 64bit pci io window in case it runs out of space in the 32bit pci mmio window below 4G.
This patch will also enable the 64bit pci io window when (a) RAM above 4G is present, and (b) the physical address space size is known, and (c) seabios is running on a 64bit capable processor.
This operates with the assumption that guests which are ok with memory above 4G most likely can handle mmio above 4G too.
Thanks. In general, the series looks good to me. Can you elaborate on the background to this change though? It sounds like there is a (small) risk of a regression, so I think it would be good to have a high level understanding of what is driving this memory reorg.
Well, the idea is to adapt to the world moving forward. Running a 64-bit capable OS is standard these days, and the resources needed by devices (especially GPUs) are becoming larger and larger.
Yes, there is the risk that (old) guests are unhappy with their PCI bars suddenly being mapped above 4G. Can happen only in case seabios handles pci initialization (i.e. when running on qemu, otherwise coreboot initializes the pci bars). I hope the memory check handles the 'old guest' case: when the guest can't handle addresses above 4G it is unlikely that qemu is configured to have memory mapped above 4G ...
take care, Gerd
On Wed, 23 Nov 2022 11:25:08 +0100 Gerd Hoffmann kraxel@redhat.com wrote:
Well, the idea is to adapt to the world moving forward. Running a 64-bit capable OS is standard these days, and the resources needed by devices (especially GPUs) are becoming larger and larger.
Yes, there is the risk that (old) guests are unhappy with their PCI bars suddenly being mapped above 4G. Can happen only in case seabios handles pci initialization (i.e. when running on qemu, otherwise coreboot initializes the pci bars). I hope the memory check handles the 'old guest' case: when the guest can't handle addresses above 4G it is unlikely that qemu is configured to have memory mapped above 4G ...
does it break 32-bit PAE enabled guests (which can have more than 4G RAM configured)?
Hi,
does it break 32-bit PAE enabled guests (which can have more than 4G RAM configured)?
Well, it depends on the guest OS I guess. Modern Linux copes just fine: it either uses PAE paging to access the PCI bars (seen with Debian; this works even in case the PCI bars are above the 64G limit of 32bit processors), or it remaps the bars to places below 4G (seen with Alpine, which compiles i386 kernels with PAE=n).
take care, Gerd