v3 changes: - rename variables, use u8 for CPULongMode. v2 changes: - e820 conflict fix
Gerd Hoffmann (6): better kvm detection detect physical address space size move 64bit pci window to end of address space be less conservative with the 64bit pci io window qemu: log reservations in fw_cfg e820 table check for e820 conflict
src/e820map.h | 1 + src/fw/paravirt.h | 2 ++ src/e820map.c | 15 +++++++++ src/fw/paravirt.c | 86 ++++++++++++++++++++++++++++++++++++++++++----- src/fw/pciinit.c | 20 ++++++++++- 5 files changed, 114 insertions(+), 10 deletions(-)
In case kvm emulates features of another hypervisor (for example hyperv) two VMM CPUID blocks will be present, one for the emulated hypervisor and one for kvm itself.
This patch makes seabios loop over the VMM CPUID blocks to make sure it will properly detect kvm when multiple blocks are present.
Signed-off-by: Gerd Hoffmann kraxel@redhat.com --- src/fw/paravirt.c | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-)
diff --git a/src/fw/paravirt.c b/src/fw/paravirt.c index fba4e52db684..c880cb10a1bc 100644 --- a/src/fw/paravirt.c +++ b/src/fw/paravirt.c @@ -56,20 +56,31 @@ inline int qemu_cfg_dma_enabled(void)
static void kvm_detect(void) { + unsigned int i, kvmbase = 0, max = 0; unsigned int eax, ebx, ecx, edx; char signature[13];
- cpuid(KVM_CPUID_SIGNATURE, &eax, &ebx, &ecx, &edx); - memcpy(signature + 0, &ebx, 4); - memcpy(signature + 4, &ecx, 4); - memcpy(signature + 8, &edx, 4); - signature[12] = 0; + for (i = KVM_CPUID_SIGNATURE;; i += 0x100) { + eax = 0; + cpuid(i, &eax, &ebx, &ecx, &edx); + if (eax < i) + break; + memcpy(signature + 0, &ebx, 4); + memcpy(signature + 4, &ecx, 4); + memcpy(signature + 8, &edx, 4); + signature[12] = 0; + dprintf(1, "cpuid 0x%x: eax %x, signature '%s'\n", i, eax, signature); + if (strcmp(signature, "KVMKVMKVM") == 0) { + kvmbase = i; + max = eax; + } + }
- if (strcmp(signature, "KVMKVMKVM") == 0) { + if (kvmbase) { dprintf(1, "Running on KVM\n"); PlatformRunningOn |= PF_KVM; - if (eax >= KVM_CPUID_SIGNATURE + 0x10) { - cpuid(KVM_CPUID_SIGNATURE + 0x10, &eax, &ebx, &ecx, &edx); + if (max >= kvmbase + 0x10) { + cpuid(kvmbase + 0x10, &eax, &ebx, &ecx, &edx); dprintf(1, "kvm: have invtsc, freq %u kHz\n", eax); tsctimer_setfreq(eax, "invtsc"); }
On Fri, May 05, 2023 at 09:11:12AM +0200, Gerd Hoffmann wrote:
In case kvm emulates features of another hypervisor (for example hyperv) two VMM CPUID blocks will be present, one for the emulated hypervisor and one for kvm itself.
This patch makes seabios loop over the VMM CPUID blocks to make sure it will properly detect kvm when multiple blocks are present.
Signed-off-by: Gerd Hoffmann kraxel@redhat.com
This patch forgot to adapt the kvmclock code to also use the detected kvm cpuid base. Updated patch below.
take care, Gerd
------------------------ ✂️ cut here ------------------------- From 8548ec47c68e198d75e174f82db9959eebb701be Mon Sep 17 00:00:00 2001 From: Gerd Hoffmann kraxel@redhat.com Date: Fri, 9 Sep 2022 07:45:13 +0200 Subject: [PATCH 1/6] better kvm detection
In case kvm emulates features of another hypervisor (for example hyperv) two VMM CPUID blocks will be present, one for the emulated hypervisor and one for kvm itself.
This patch makes seabios loop over the VMM CPUID blocks to make sure it will properly detect kvm when multiple blocks are present.
Signed-off-by: Gerd Hoffmann kraxel@redhat.com --- src/fw/paravirt.c | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-)
diff --git a/src/fw/paravirt.c b/src/fw/paravirt.c index fba4e52db684..42abac8460c5 100644 --- a/src/fw/paravirt.c +++ b/src/fw/paravirt.c @@ -53,23 +53,35 @@ inline int qemu_cfg_dma_enabled(void) * should be used to determine that a VM is running under KVM. */ #define KVM_CPUID_SIGNATURE 0x40000000 +static unsigned int kvm_cpuid_base = 0;
static void kvm_detect(void) { + unsigned int i, max = 0; unsigned int eax, ebx, ecx, edx; char signature[13];
- cpuid(KVM_CPUID_SIGNATURE, &eax, &ebx, &ecx, &edx); - memcpy(signature + 0, &ebx, 4); - memcpy(signature + 4, &ecx, 4); - memcpy(signature + 8, &edx, 4); - signature[12] = 0; + for (i = KVM_CPUID_SIGNATURE;; i += 0x100) { + eax = 0; + cpuid(i, &eax, &ebx, &ecx, &edx); + if (eax < i) + break; + memcpy(signature + 0, &ebx, 4); + memcpy(signature + 4, &ecx, 4); + memcpy(signature + 8, &edx, 4); + signature[12] = 0; + dprintf(1, "cpuid 0x%x: eax %x, signature '%s'\n", i, eax, signature); + if (strcmp(signature, "KVMKVMKVM") == 0) { + kvm_cpuid_base = i; + max = eax; + } + }
- if (strcmp(signature, "KVMKVMKVM") == 0) { + if (kvm_cpuid_base) { dprintf(1, "Running on KVM\n"); PlatformRunningOn |= PF_KVM; - if (eax >= KVM_CPUID_SIGNATURE + 0x10) { - cpuid(KVM_CPUID_SIGNATURE + 0x10, &eax, &ebx, &ecx, &edx); + if (max >= kvm_cpuid_base + 0x10) { + cpuid(kvm_cpuid_base + 0x10, &eax, &ebx, &ecx, &edx); dprintf(1, "kvm: have invtsc, freq %u kHz\n", eax); tsctimer_setfreq(eax, "invtsc"); } @@ -93,7 +105,7 @@ static void kvmclock_init(void) if (!runningOnKVM()) return;
- cpuid(KVM_CPUID_SIGNATURE + 0x01, &eax, &ebx, &ecx, &edx); + cpuid(kvm_cpuid_base + 0x01, &eax, &ebx, &ecx, &edx); if (eax & (1 << KVM_FEATURE_CLOCKSOURCE2)) msr = MSR_KVM_SYSTEM_TIME_NEW; else if (eax & (1 << KVM_FEATURE_CLOCKSOURCE))
Check for pae and long mode using cpuid. If present, also read the physical address bits. Apply some qemu sanity checks (see below). Record the results in the CPUPhysBits and CPULongMode variables. If we are not sure what the address space size is, leave the CPUPhysBits variable unset.
On qemu we have the problem that, for historical reasons, x86_64 processors advertise 40 physical address space bits by default, even if the host supports fewer than that, so actually using the whole address space will not work.
Because of that, the code applies some extra sanity checks in case we find 40 (or fewer) physical address space bits advertised. Only known-good values (40 for AMD processors, 36 or 39 for Intel processors) will be accepted as valid.
The recommendation is to use 'qemu -cpu ${name},host-phys-bits=on' to advertise valid physical address space bits to the guest. Some distro builds enable this by default, and most likely the qemu default will change in the near future too.
Signed-off-by: Gerd Hoffmann kraxel@redhat.com --- src/fw/paravirt.h | 2 ++ src/fw/paravirt.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+)
diff --git a/src/fw/paravirt.h b/src/fw/paravirt.h index 4e2e993ba9d3..62a2cd075d2b 100644 --- a/src/fw/paravirt.h +++ b/src/fw/paravirt.h @@ -31,6 +31,8 @@ typedef struct QemuCfgDmaAccess { extern u32 RamSize; extern u64 RamSizeOver4G; extern int PlatformRunningOn; +extern u8 CPUPhysBits; +extern u8 CPULongMode;
static inline int runningOnQEMU(void) { return CONFIG_QEMU || ( diff --git a/src/fw/paravirt.c b/src/fw/paravirt.c index c880cb10a1bc..fc308bf1ef1d 100644 --- a/src/fw/paravirt.c +++ b/src/fw/paravirt.c @@ -32,6 +32,10 @@ u32 RamSize; // Amount of continuous ram >4Gig u64 RamSizeOver4G; +// physical address space bits +u8 CPUPhysBits; +// 64bit processor +u8 CPULongMode; // Type of emulator platform. int PlatformRunningOn VARFSEG; // cfg enabled @@ -129,6 +133,58 @@ static void kvmclock_init(void) tsctimer_setfreq(MHz * 1000, "kvmclock"); }
+static void physbits(int qemu_quirk) +{ + unsigned int max, eax, ebx, ecx, edx; + unsigned int physbits; + char signature[13]; + int pae = 0, valid = 0; + + cpuid(0, &eax, &ebx, &ecx, &edx); + memcpy(signature + 0, &ebx, 4); + memcpy(signature + 4, &edx, 4); + memcpy(signature + 8, &ecx, 4); + signature[12] = 0; + if (eax >= 1) { + cpuid(1, &eax, &ebx, &ecx, &edx); + pae = (edx & (1 << 6)); + } + + cpuid(0x80000000, &eax, &ebx, &ecx, &edx); + max = eax; + + if (max >= 0x80000001) { + cpuid(0x80000001, &eax, &ebx, &ecx, &edx); + CPULongMode = !!(edx & (1 << 29)); + } + + if (pae && CPULongMode && max >= 0x80000008) { + cpuid(0x80000008, &eax, &ebx, &ecx, &edx); + physbits = (u8)eax; + if (!qemu_quirk) { + valid = 1; + } else if (physbits >= 41) { + valid = 1; + } else if (strcmp(signature, "GenuineIntel") == 0) { + if ((physbits == 36) || (physbits == 39)) + valid = 1; + } else if (strcmp(signature, "AuthenticAMD") == 0) { + if (physbits == 40) + valid = 1; + } + } else { + physbits = pae ? 36 : 32; + valid = 1; + } + + dprintf(1, "%s: signature="%s", pae=%s, lm=%s, phys-bits=%d, valid=%s\n", + __func__, signature, pae ? "yes" : "no", CPULongMode ? "yes" : "no", + physbits, valid ? "yes" : "no"); + + if (valid) + CPUPhysBits = physbits; +} + static void qemu_detect(void) { if (!CONFIG_QEMU_HARDWARE) @@ -161,6 +217,7 @@ static void qemu_detect(void) dprintf(1, "Running on QEMU (unknown nb: %04x:%04x)\n", v, d); break; } + physbits(1); }
static int qemu_early_e820(void);
When the size of the physical address space is known (CPUPhysBits is not zero), move the 64bit pci io window to the end of the address space.
Signed-off-by: Gerd Hoffmann kraxel@redhat.com --- src/fw/pciinit.c | 8 ++++++++ 1 file changed, 8 insertions(+)
diff --git a/src/fw/pciinit.c b/src/fw/pciinit.c index badf13d3233b..0fcd2be598a2 100644 --- a/src/fw/pciinit.c +++ b/src/fw/pciinit.c @@ -1128,6 +1128,14 @@ static void pci_bios_map_devices(struct pci_bus *busses) r64_mem.base = le64_to_cpu(romfile_loadint("etc/reserved-memory-end", 0)); if (r64_mem.base < 0x100000000LL + RamSizeOver4G) r64_mem.base = 0x100000000LL + RamSizeOver4G; + if (CPUPhysBits) { + u64 top = 1LL << CPUPhysBits; + u64 size = (ALIGN(sum_mem, (1LL<<30)) + + ALIGN(sum_pref, (1LL<<30))); + if (r64_mem.base < top - size) { + r64_mem.base = top - size; + } + } r64_mem.base = ALIGN(r64_mem.base, align_mem); r64_mem.base = ALIGN(r64_mem.base, (1LL<<30)); // 1G hugepage r64_pref.base = r64_mem.base + sum_mem;
Current seabios code will only enable and use the 64bit pci io window in case it runs out of space in the 32bit pci mmio window below 4G.
This patch will also enable the 64bit pci io window when (a) RAM above 4G is present, and (b) the physical address space size is known, and (c) seabios is running on a 64bit capable processor.
This operates with the assumption that guests which are ok with memory above 4G most likely can handle mmio above 4G too.
In case the 64bit pci io window is enabled also assign more memory to prefetchable pci bridge windows and the complete 64bit pci io window.
The total mmio window size is 1/8 of the physical address space. The minimum bridge window size is 1/256 of the total mmio window size.
Signed-off-by: Gerd Hoffmann kraxel@redhat.com --- src/fw/pciinit.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-)
diff --git a/src/fw/pciinit.c b/src/fw/pciinit.c index 0fcd2be598a2..a6b5dff12bd3 100644 --- a/src/fw/pciinit.c +++ b/src/fw/pciinit.c @@ -51,6 +51,7 @@ u64 pcimem_end = BUILD_PCIMEM_END; u64 pcimem64_start = BUILD_PCIMEM64_START; u64 pcimem64_end = BUILD_PCIMEM64_END; u64 pci_io_low_end = 0xa000; +u32 pci_use_64bit = 0;
struct pci_region_entry { struct pci_device *dev; @@ -920,6 +921,8 @@ static int pci_bios_check_devices(struct pci_bus *busses) for (type = 0; type < PCI_REGION_TYPE_COUNT; type++) { u64 align = (type == PCI_REGION_TYPE_IO) ? PCI_BRIDGE_IO_MIN : PCI_BRIDGE_MEM_MIN; + if (pci_use_64bit && (type == PCI_REGION_TYPE_PREFMEM)) + align = (u64)1 << (CPUPhysBits - 11); if (!pci_bridge_has_region(s->bus_dev, type)) continue; u64 size = 0; @@ -1108,7 +1111,7 @@ static void pci_bios_map_devices(struct pci_bus *busses) panic("PCI: out of I/O address space\n");
dprintf(1, "PCI: 32: %016llx - %016llx\n", pcimem_start, pcimem_end); - if (pci_bios_init_root_regions_mem(busses)) { + if (pci_use_64bit || pci_bios_init_root_regions_mem(busses)) { struct pci_region r64_mem, r64_pref; r64_mem.list.first = NULL; r64_pref.list.first = NULL; @@ -1132,6 +1135,8 @@ static void pci_bios_map_devices(struct pci_bus *busses) u64 top = 1LL << CPUPhysBits; u64 size = (ALIGN(sum_mem, (1LL<<30)) + ALIGN(sum_pref, (1LL<<30))); + if (pci_use_64bit) + size = ALIGN(size, (1LL<<(CPUPhysBits-3))); if (r64_mem.base < top - size) { r64_mem.base = top - size; } @@ -1174,6 +1179,9 @@ pci_setup(void)
dprintf(3, "pci setup\n");
+ if (CPUPhysBits >= 36 && CPULongMode && RamSizeOver4G) + pci_use_64bit = 1; + dprintf(1, "=== PCI bus & bridge init ===\n"); if (pci_probe_host() != 0) { return;
Log the reservations with loglevel 1 (the same level we use for RAM entries), so they are included in the firmware log by default.
Signed-off-by: Gerd Hoffmann kraxel@redhat.com --- src/fw/paravirt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/fw/paravirt.c b/src/fw/paravirt.c index fc308bf1ef1d..02351b24caea 100644 --- a/src/fw/paravirt.c +++ b/src/fw/paravirt.c @@ -764,7 +764,7 @@ static int qemu_early_e820(void) switch (table.type) { case E820_RESERVED: e820_add(table.address, table.length, table.type); - dprintf(3, "qemu/e820: addr 0x%016llx len 0x%016llx [reserved]\n", + dprintf(1, "qemu/e820: addr 0x%016llx len 0x%016llx [reserved]\n", table.address, table.length); break; case E820_RAM:
On 5/5/23 09:11, Gerd Hoffmann wrote:
With loglevel 1 (same we use for RAM entries), so it is included in the firmware log by default.
Signed-off-by: Gerd Hoffmann kraxel@redhat.com
src/fw/paravirt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
Reviewed-by: Philippe Mathieu-Daudé philmd@linaro.org
Add support to check for overlaps with e820 entries. In case the 64bit pci io window has conflicts, move it down.
The only known case where this happens is AMD processors with a 1TB address space, which have some space just below 1TB reserved for HT.
Signed-off-by: Gerd Hoffmann kraxel@redhat.com --- src/e820map.h | 1 + src/e820map.c | 15 +++++++++++++++ src/fw/pciinit.c | 2 ++ 3 files changed, 18 insertions(+)
diff --git a/src/e820map.h b/src/e820map.h index de8b523003c5..07ce16ec213f 100644 --- a/src/e820map.h +++ b/src/e820map.h @@ -18,6 +18,7 @@ struct e820entry { void e820_add(u64 start, u64 size, u32 type); void e820_remove(u64 start, u64 size); void e820_prepboot(void); +int e820_is_used(u64 start, u64 size);
// e820 map storage extern struct e820entry e820_list[]; diff --git a/src/e820map.c b/src/e820map.c index 39445cf6399d..c761e5e98a75 100644 --- a/src/e820map.c +++ b/src/e820map.c @@ -150,3 +150,18 @@ e820_prepboot(void) { dump_map(); } + +int +e820_is_used(u64 start, u64 size) +{ + int i; + for (i=0; i<e820_count; i++) { + struct e820entry *e = &e820_list[i]; + if (start + size <= e->start) + continue; + if (start >= e->start + e->size) + continue; + return 1; + } + return 0; +} diff --git a/src/fw/pciinit.c b/src/fw/pciinit.c index a6b5dff12bd3..1e56ee47fa86 100644 --- a/src/fw/pciinit.c +++ b/src/fw/pciinit.c @@ -1140,6 +1140,8 @@ static void pci_bios_map_devices(struct pci_bus *busses) if (r64_mem.base < top - size) { r64_mem.base = top - size; } + if (e820_is_used(r64_mem.base, size)) + r64_mem.base -= size; } r64_mem.base = ALIGN(r64_mem.base, align_mem); r64_mem.base = ALIGN(r64_mem.base, (1LL<<30)); // 1G hugepage
On Fri, May 05, 2023 at 09:11:11AM +0200, Gerd Hoffmann wrote:
v3 changes:
- rename variables, use u8 for CPULongMode.
v2 changes:
- e820 conflict fix
Thanks. Looks fine to me.
-Kevin
Gerd Hoffmann (6): better kvm detection detect physical address space size move 64bit pci window to end of address space be less conservative with the 64bit pci io window qemu: log reservations in fw_cfg e820 table check for e820 conflict
src/e820map.h | 1 + src/fw/paravirt.h | 2 ++ src/e820map.c | 15 +++++++++ src/fw/paravirt.c | 86 ++++++++++++++++++++++++++++++++++++++++++----- src/fw/pciinit.c | 20 ++++++++++- 5 files changed, 114 insertions(+), 10 deletions(-)
-- 2.40.1
SeaBIOS mailing list -- seabios@seabios.org To unsubscribe send an email to seabios-leave@seabios.org