v4 changes: - fix handling of 32bit memory bars. v3 changes: - rename variables, use u8 for CPULongMode. v2 changes: - e820 conflict fix
Gerd Hoffmann (6): better kvm detection detect physical address space size move 64bit pci window to end of address space be less conservative with the 64bit pci io window qemu: log reservations in fw_cfg e820 table check for e820 conflict
src/e820map.h | 1 + src/fw/paravirt.h | 2 ++ src/e820map.c | 15 ++++++++ src/fw/paravirt.c | 89 +++++++++++++++++++++++++++++++++++++++++------ src/fw/pciinit.c | 26 +++++++++++--- 5 files changed, 119 insertions(+), 14 deletions(-)
In case kvm emulates features of another hypervisor (for example hyperv) two VMM CPUID blocks will be present, one for the emulated hypervisor and one for kvm itself.
This patch makes seabios loop over the VMM CPUID blocks to make sure it will properly detect kvm when multiple blocks are present.
Signed-off-by: Gerd Hoffmann kraxel@redhat.com --- src/fw/paravirt.c | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-)
diff --git a/src/fw/paravirt.c b/src/fw/paravirt.c index fba4e52db684..42abac8460c5 100644 --- a/src/fw/paravirt.c +++ b/src/fw/paravirt.c @@ -53,23 +53,35 @@ inline int qemu_cfg_dma_enabled(void) * should be used to determine that a VM is running under KVM. */ #define KVM_CPUID_SIGNATURE 0x40000000 +static unsigned int kvm_cpuid_base = 0;
static void kvm_detect(void) { + unsigned int i, max = 0; unsigned int eax, ebx, ecx, edx; char signature[13];
- cpuid(KVM_CPUID_SIGNATURE, &eax, &ebx, &ecx, &edx); - memcpy(signature + 0, &ebx, 4); - memcpy(signature + 4, &ecx, 4); - memcpy(signature + 8, &edx, 4); - signature[12] = 0; + for (i = KVM_CPUID_SIGNATURE;; i += 0x100) { + eax = 0; + cpuid(i, &eax, &ebx, &ecx, &edx); + if (eax < i) + break; + memcpy(signature + 0, &ebx, 4); + memcpy(signature + 4, &ecx, 4); + memcpy(signature + 8, &edx, 4); + signature[12] = 0; + dprintf(1, "cpuid 0x%x: eax %x, signature '%s'\n", i, eax, signature); + if (strcmp(signature, "KVMKVMKVM") == 0) { + kvm_cpuid_base = i; + max = eax; + } + }
- if (strcmp(signature, "KVMKVMKVM") == 0) { + if (kvm_cpuid_base) { dprintf(1, "Running on KVM\n"); PlatformRunningOn |= PF_KVM; - if (eax >= KVM_CPUID_SIGNATURE + 0x10) { - cpuid(KVM_CPUID_SIGNATURE + 0x10, &eax, &ebx, &ecx, &edx); + if (max >= kvm_cpuid_base + 0x10) { + cpuid(kvm_cpuid_base + 0x10, &eax, &ebx, &ecx, &edx); dprintf(1, "kvm: have invtsc, freq %u kHz\n", eax); tsctimer_setfreq(eax, "invtsc"); } @@ -93,7 +105,7 @@ static void kvmclock_init(void) if (!runningOnKVM()) return;
- cpuid(KVM_CPUID_SIGNATURE + 0x01, &eax, &ebx, &ecx, &edx); + cpuid(kvm_cpuid_base + 0x01, &eax, &ebx, &ecx, &edx); if (eax & (1 << KVM_FEATURE_CLOCKSOURCE2)) msr = MSR_KVM_SYSTEM_TIME_NEW; else if (eax & (1 << KVM_FEATURE_CLOCKSOURCE))
Check for PAE and long mode using cpuid. If present, also read the physical address bits. Apply some qemu sanity checks (see below). Record the results in the CPUPhysBits and CPULongMode variables. In case we are not sure what the address space size is, leave the CPUPhysBits variable unset.
On qemu we have the problem that for historical reasons x86_64 processors advertise 40 physical address space bits by default, even in case the host supports less than that so actually using the whole address space will not work.
Because of that, the code applies some extra sanity checks in case we find 40 (or fewer) physical address space bits advertised. Only known-good values (40 for AMD processors, 36 or 39 for Intel processors) will be accepted as valid.
The recommendation is to use 'qemu -cpu ${name},host-phys-bits=on' to advertise valid physical address space bits to the guest. Some distro builds enable this by default, and most likely the qemu default will change in the near future too.
Signed-off-by: Gerd Hoffmann kraxel@redhat.com --- src/fw/paravirt.h | 2 ++ src/fw/paravirt.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+)
diff --git a/src/fw/paravirt.h b/src/fw/paravirt.h index 4e2e993ba9d3..62a2cd075d2b 100644 --- a/src/fw/paravirt.h +++ b/src/fw/paravirt.h @@ -31,6 +31,8 @@ typedef struct QemuCfgDmaAccess { extern u32 RamSize; extern u64 RamSizeOver4G; extern int PlatformRunningOn; +extern u8 CPUPhysBits; +extern u8 CPULongMode;
static inline int runningOnQEMU(void) { return CONFIG_QEMU || ( diff --git a/src/fw/paravirt.c b/src/fw/paravirt.c index 42abac8460c5..3aee4c0d4e6b 100644 --- a/src/fw/paravirt.c +++ b/src/fw/paravirt.c @@ -32,6 +32,10 @@ u32 RamSize; // Amount of continuous ram >4Gig u64 RamSizeOver4G; +// physical address space bits +u8 CPUPhysBits; +// 64bit processor +u8 CPULongMode; // Type of emulator platform. int PlatformRunningOn VARFSEG; // cfg enabled @@ -130,6 +134,58 @@ static void kvmclock_init(void) tsctimer_setfreq(MHz * 1000, "kvmclock"); }
+static void physbits(int qemu_quirk) +{ + unsigned int max, eax, ebx, ecx, edx; + unsigned int physbits; + char signature[13]; + int pae = 0, valid = 0; + + cpuid(0, &eax, &ebx, &ecx, &edx); + memcpy(signature + 0, &ebx, 4); + memcpy(signature + 4, &edx, 4); + memcpy(signature + 8, &ecx, 4); + signature[12] = 0; + if (eax >= 1) { + cpuid(1, &eax, &ebx, &ecx, &edx); + pae = (edx & (1 << 6)); + } + + cpuid(0x80000000, &eax, &ebx, &ecx, &edx); + max = eax; + + if (max >= 0x80000001) { + cpuid(0x80000001, &eax, &ebx, &ecx, &edx); + CPULongMode = !!(edx & (1 << 29)); + } + + if (pae && CPULongMode && max >= 0x80000008) { + cpuid(0x80000008, &eax, &ebx, &ecx, &edx); + physbits = (u8)eax; + if (!qemu_quirk) { + valid = 1; + } else if (physbits >= 41) { + valid = 1; + } else if (strcmp(signature, "GenuineIntel") == 0) { + if ((physbits == 36) || (physbits == 39)) + valid = 1; + } else if (strcmp(signature, "AuthenticAMD") == 0) { + if (physbits == 40) + valid = 1; + } + } else { + physbits = pae ? 36 : 32; + valid = 1; + } + + dprintf(1, "%s: signature="%s", pae=%s, lm=%s, phys-bits=%d, valid=%s\n", + __func__, signature, pae ? "yes" : "no", CPULongMode ? "yes" : "no", + physbits, valid ? "yes" : "no"); + + if (valid) + CPUPhysBits = physbits; +} + static void qemu_detect(void) { if (!CONFIG_QEMU_HARDWARE) @@ -162,6 +218,7 @@ static void qemu_detect(void) dprintf(1, "Running on QEMU (unknown nb: %04x:%04x)\n", v, d); break; } + physbits(1); }
static int qemu_early_e820(void);
When the size of the physical address space is known (CPUPhysBits is not zero), move the 64bit pci io window to the end of the address space.
Signed-off-by: Gerd Hoffmann kraxel@redhat.com --- src/fw/pciinit.c | 8 ++++++++ 1 file changed, 8 insertions(+)
diff --git a/src/fw/pciinit.c b/src/fw/pciinit.c index badf13d3233b..0fcd2be598a2 100644 --- a/src/fw/pciinit.c +++ b/src/fw/pciinit.c @@ -1128,6 +1128,14 @@ static void pci_bios_map_devices(struct pci_bus *busses) r64_mem.base = le64_to_cpu(romfile_loadint("etc/reserved-memory-end", 0)); if (r64_mem.base < 0x100000000LL + RamSizeOver4G) r64_mem.base = 0x100000000LL + RamSizeOver4G; + if (CPUPhysBits) { + u64 top = 1LL << CPUPhysBits; + u64 size = (ALIGN(sum_mem, (1LL<<30)) + + ALIGN(sum_pref, (1LL<<30))); + if (r64_mem.base < top - size) { + r64_mem.base = top - size; + } + } r64_mem.base = ALIGN(r64_mem.base, align_mem); r64_mem.base = ALIGN(r64_mem.base, (1LL<<30)); // 1G hugepage r64_pref.base = r64_mem.base + sum_mem;
Current seabios code will only enable and use the 64bit pci io window in case it runs out of space in the 32bit pci mmio window below 4G.
This patch will also enable the 64bit pci io window when (a) RAM above 4G is present, and (b) the physical address space size is known, and (c) seabios is running on a 64bit capable processor.
This operates with the assumption that guests which are ok with memory above 4G most likely can handle mmio above 4G too.
In case the 64bit pci io window is enabled also assign more memory to prefetchable pci bridge windows and the complete 64bit pci io window.
The total mmio window size is 1/8 of the physical address space. The minimum bridge window size is 1/256 of the total mmio window size.
Signed-off-by: Gerd Hoffmann kraxel@redhat.com --- src/fw/pciinit.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-)
diff --git a/src/fw/pciinit.c b/src/fw/pciinit.c index 0fcd2be598a2..b52bd1d5054b 100644 --- a/src/fw/pciinit.c +++ b/src/fw/pciinit.c @@ -51,6 +51,7 @@ u64 pcimem_end = BUILD_PCIMEM_END; u64 pcimem64_start = BUILD_PCIMEM64_START; u64 pcimem64_end = BUILD_PCIMEM64_END; u64 pci_io_low_end = 0xa000; +u32 pci_use_64bit = 0;
struct pci_region_entry { struct pci_device *dev; @@ -960,10 +961,14 @@ static int pci_bios_check_devices(struct pci_bus *busses) if (pci_region_align(&s->r[type]) > align) align = pci_region_align(&s->r[type]); u64 sum = pci_region_sum(&s->r[type]); + int is64 = pci_bios_bridge_region_is64(&s->r[type], + s->bus_dev, type); int resource_optional = 0; if (hotplug_support == HOTPLUG_PCIE) resource_optional = pcie_cap && (type == PCI_REGION_TYPE_IO); - if (!sum && hotplug_support && !resource_optional) + if (hotplug_support && pci_use_64bit && is64 && (type == PCI_REGION_TYPE_PREFMEM)) + align = (u64)1 << (CPUPhysBits - 11); + if (align > sum && hotplug_support && !resource_optional) sum = align; /* reserve min size for hot-plug */ if (size > sum) { dprintf(1, "PCI: QEMU resource reserve cap: " @@ -975,8 +980,6 @@ static int pci_bios_check_devices(struct pci_bus *busses) } else { size = ALIGN(sum, align); } - int is64 = pci_bios_bridge_region_is64(&s->r[type], - s->bus_dev, type); // entry->bar is -1 if the entry represents a bridge region struct pci_region_entry *entry = pci_region_create_entry( parent, s->bus_dev, -1, size, align, type, is64); @@ -1108,7 +1111,7 @@ static void pci_bios_map_devices(struct pci_bus *busses) panic("PCI: out of I/O address space\n");
dprintf(1, "PCI: 32: %016llx - %016llx\n", pcimem_start, pcimem_end); - if (pci_bios_init_root_regions_mem(busses)) { + if (pci_use_64bit || pci_bios_init_root_regions_mem(busses)) { struct pci_region r64_mem, r64_pref; r64_mem.list.first = NULL; r64_pref.list.first = NULL; @@ -1132,6 +1135,8 @@ static void pci_bios_map_devices(struct pci_bus *busses) u64 top = 1LL << CPUPhysBits; u64 size = (ALIGN(sum_mem, (1LL<<30)) + ALIGN(sum_pref, (1LL<<30))); + if (pci_use_64bit) + size = ALIGN(size, (1LL<<(CPUPhysBits-3))); if (r64_mem.base < top - size) { r64_mem.base = top - size; } @@ -1174,6 +1179,9 @@ pci_setup(void)
dprintf(3, "pci setup\n");
+ if (CPUPhysBits >= 36 && CPULongMode && RamSizeOver4G) + pci_use_64bit = 1; + dprintf(1, "=== PCI bus & bridge init ===\n"); if (pci_probe_host() != 0) { return;
Log the reservations in the fw_cfg e820 table with loglevel 1 (the same level we use for RAM entries), so they are included in the firmware log by default.
Signed-off-by: Gerd Hoffmann kraxel@redhat.com --- src/fw/paravirt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/fw/paravirt.c b/src/fw/paravirt.c index 3aee4c0d4e6b..e5d4eca0cb5a 100644 --- a/src/fw/paravirt.c +++ b/src/fw/paravirt.c @@ -765,7 +765,7 @@ static int qemu_early_e820(void) switch (table.type) { case E820_RESERVED: e820_add(table.address, table.length, table.type); - dprintf(3, "qemu/e820: addr 0x%016llx len 0x%016llx [reserved]\n", + dprintf(1, "qemu/e820: addr 0x%016llx len 0x%016llx [reserved]\n", table.address, table.length); break; case E820_RAM:
On 24/8/23 10:57, Gerd Hoffmann wrote:
With loglevel 1 (same we use for RAM entries), so it is included in the firmware log by default.
Signed-off-by: Gerd Hoffmann kraxel@redhat.com
src/fw/paravirt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
Reviewed-by: Philippe Mathieu-Daudé philmd@linaro.org
Add support to check for overlaps with e820 entries. In case the 64bit pci io window has conflicts move it down.
The only known case where this happens is AMD processors with a 1TB address space, which have some space just below 1TB reserved for HT (HyperTransport).
Signed-off-by: Gerd Hoffmann kraxel@redhat.com --- src/e820map.h | 1 + src/e820map.c | 15 +++++++++++++++ src/fw/pciinit.c | 2 ++ 3 files changed, 18 insertions(+)
diff --git a/src/e820map.h b/src/e820map.h index de8b523003c5..07ce16ec213f 100644 --- a/src/e820map.h +++ b/src/e820map.h @@ -18,6 +18,7 @@ struct e820entry { void e820_add(u64 start, u64 size, u32 type); void e820_remove(u64 start, u64 size); void e820_prepboot(void); +int e820_is_used(u64 start, u64 size);
// e820 map storage extern struct e820entry e820_list[]; diff --git a/src/e820map.c b/src/e820map.c index 39445cf6399d..c761e5e98a75 100644 --- a/src/e820map.c +++ b/src/e820map.c @@ -150,3 +150,18 @@ e820_prepboot(void) { dump_map(); } + +int +e820_is_used(u64 start, u64 size) +{ + int i; + for (i=0; i<e820_count; i++) { + struct e820entry *e = &e820_list[i]; + if (start + size <= e->start) + continue; + if (start >= e->start + e->size) + continue; + return 1; + } + return 0; +} diff --git a/src/fw/pciinit.c b/src/fw/pciinit.c index b52bd1d5054b..c7084f5e397e 100644 --- a/src/fw/pciinit.c +++ b/src/fw/pciinit.c @@ -1140,6 +1140,8 @@ static void pci_bios_map_devices(struct pci_bus *busses) if (r64_mem.base < top - size) { r64_mem.base = top - size; } + if (e820_is_used(r64_mem.base, size)) + r64_mem.base -= size; } r64_mem.base = ALIGN(r64_mem.base, align_mem); r64_mem.base = ALIGN(r64_mem.base, (1LL<<30)); // 1G hugepage
On Thu, Aug 24, 2023 at 10:57:50AM +0200, Gerd Hoffmann wrote:
v4 changes:
- fix handling of 32bit memory bars.
v3 changes:
- rename variables, use u8 for CPULongMode.
v2 changes:
- e820 conflict fix
Gerd Hoffmann (6): better kvm detection detect physical address space size move 64bit pci window to end of address space be less conservative with the 64bit pci io window qemu: log reservations in fw_cfg e820 table check for e820 conflict
Patch series committed now.
take care, Gerd