v2 changes: - e820 conflict fix
Gerd Hoffmann (6):
  better kvm detection
  detect physical address space size
  move 64bit pci window to end of address space
  be less conservative with the 64bit pci io window
  qemu: log reservations in fw_cfg e820 table
  check for e820 conflict
 src/e820map.h     |  1 +
 src/fw/paravirt.h |  2 ++
 src/e820map.c     | 15 +++++++++
 src/fw/paravirt.c | 86 ++++++++++++++++++++++++++++++++++++++++++----
 src/fw/pciinit.c  | 20 ++++++++++-
 5 files changed, 114 insertions(+), 10 deletions(-)
In case kvm emulates features of another hypervisor (for example hyperv) two VMM CPUID blocks will be present, one for the emulated hypervisor and one for kvm itself.
This patch makes seabios loop over the VMM CPUID blocks to make sure it will properly detect kvm when multiple blocks are present.
Signed-off-by: Gerd Hoffmann kraxel@redhat.com --- src/fw/paravirt.c | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-)
diff --git a/src/fw/paravirt.c b/src/fw/paravirt.c index fba4e52db684..c880cb10a1bc 100644 --- a/src/fw/paravirt.c +++ b/src/fw/paravirt.c @@ -56,20 +56,31 @@ inline int qemu_cfg_dma_enabled(void)
static void kvm_detect(void) { + unsigned int i, kvmbase = 0, max = 0; unsigned int eax, ebx, ecx, edx; char signature[13];
- cpuid(KVM_CPUID_SIGNATURE, &eax, &ebx, &ecx, &edx); - memcpy(signature + 0, &ebx, 4); - memcpy(signature + 4, &ecx, 4); - memcpy(signature + 8, &edx, 4); - signature[12] = 0; + for (i = KVM_CPUID_SIGNATURE;; i += 0x100) { + eax = 0; + cpuid(i, &eax, &ebx, &ecx, &edx); + if (eax < i) + break; + memcpy(signature + 0, &ebx, 4); + memcpy(signature + 4, &ecx, 4); + memcpy(signature + 8, &edx, 4); + signature[12] = 0; + dprintf(1, "cpuid 0x%x: eax %x, signature '%s'\n", i, eax, signature); + if (strcmp(signature, "KVMKVMKVM") == 0) { + kvmbase = i; + max = eax; + } + }
- if (strcmp(signature, "KVMKVMKVM") == 0) { + if (kvmbase) { dprintf(1, "Running on KVM\n"); PlatformRunningOn |= PF_KVM; - if (eax >= KVM_CPUID_SIGNATURE + 0x10) { - cpuid(KVM_CPUID_SIGNATURE + 0x10, &eax, &ebx, &ecx, &edx); + if (max >= kvmbase + 0x10) { + cpuid(kvmbase + 0x10, &eax, &ebx, &ecx, &edx); dprintf(1, "kvm: have invtsc, freq %u kHz\n", eax); tsctimer_setfreq(eax, "invtsc"); }
Check for pae and long mode using cpuid. If present, also read the physical address bits. Apply some qemu sanity checks (see below). Record the results in the PhysBits and LongMode variables. In case we are not sure what the address space size is, leave the PhysBits variable unset.
On qemu we have the problem that, for historical reasons, x86_64 processors advertise 40 physical address space bits by default, even in case the host supports less than that, so actually using the whole address space will not work.
Because of that, the code applies some extra sanity checks in case we find 40 (or less) physical address space bits advertised. Only known-good values (40 for AMD processors, 36 or 39 for Intel processors) will be accepted as valid.
The recommendation is to use 'qemu -cpu ${name},host-phys-bits=on' to advertise valid physical address space bits to the guest. Some distro builds enable this by default, and most likely the qemu default will change in the near future too.
Signed-off-by: Gerd Hoffmann kraxel@redhat.com --- src/fw/paravirt.h | 2 ++ src/fw/paravirt.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+)
diff --git a/src/fw/paravirt.h b/src/fw/paravirt.h index 4e2e993ba9d3..7ca3fdcc06d4 100644 --- a/src/fw/paravirt.h +++ b/src/fw/paravirt.h @@ -31,6 +31,8 @@ typedef struct QemuCfgDmaAccess { extern u32 RamSize; extern u64 RamSizeOver4G; extern int PlatformRunningOn; +extern u8 PhysBits; +extern int LongMode;
static inline int runningOnQEMU(void) { return CONFIG_QEMU || ( diff --git a/src/fw/paravirt.c b/src/fw/paravirt.c index c880cb10a1bc..a84968661aee 100644 --- a/src/fw/paravirt.c +++ b/src/fw/paravirt.c @@ -32,6 +32,10 @@ u32 RamSize; // Amount of continuous ram >4Gig u64 RamSizeOver4G; +// physical address space bits +u8 PhysBits; +// 64bit processor +int LongMode; // Type of emulator platform. int PlatformRunningOn VARFSEG; // cfg enabled @@ -129,6 +133,58 @@ static void kvmclock_init(void) tsctimer_setfreq(MHz * 1000, "kvmclock"); }
+static void physbits(int qemu_quirk) +{ + unsigned int max, eax, ebx, ecx, edx; + unsigned int physbits; + char signature[13]; + int pae = 0, valid = 0; + + cpuid(0, &eax, &ebx, &ecx, &edx); + memcpy(signature + 0, &ebx, 4); + memcpy(signature + 4, &edx, 4); + memcpy(signature + 8, &ecx, 4); + signature[12] = 0; + if (eax >= 1) { + cpuid(1, &eax, &ebx, &ecx, &edx); + pae = (edx & (1 << 6)); + } + + cpuid(0x80000000, &eax, &ebx, &ecx, &edx); + max = eax; + + if (max >= 0x80000001) { + cpuid(0x80000001, &eax, &ebx, &ecx, &edx); + LongMode = (edx & (1 << 29)); + } + + if (pae && LongMode && max >= 0x80000008) { + cpuid(0x80000008, &eax, &ebx, &ecx, &edx); + physbits = (u8)eax; + if (!qemu_quirk) { + valid = 1; + } else if (physbits >= 41) { + valid = 1; + } else if (strcmp(signature, "GenuineIntel") == 0) { + if ((physbits == 36) || (physbits == 39)) + valid = 1; + } else if (strcmp(signature, "AuthenticAMD") == 0) { + if (physbits == 40) + valid = 1; + } + } else { + physbits = pae ? 36 : 32; + valid = 1; + } + + dprintf(1, "%s: signature="%s", pae=%s, lm=%s, phys-bits=%d, valid=%s\n", + __func__, signature, pae ? "yes" : "no", LongMode ? "yes" : "no", + physbits, valid ? "yes" : "no"); + + if (valid) + PhysBits = physbits; +} + static void qemu_detect(void) { if (!CONFIG_QEMU_HARDWARE) @@ -161,6 +217,7 @@ static void qemu_detect(void) dprintf(1, "Running on QEMU (unknown nb: %04x:%04x)\n", v, d); break; } + physbits(1); }
static int qemu_early_e820(void);
On Wed, May 03, 2023 at 11:20:54AM +0200, Gerd Hoffmann wrote:
Check for pae and long mode using cpuid. If present also read the physical address bits. Apply some qemu sanity checks (see below). Record results in PhysBits and LongMode variables. In case we are not sure what the address space size is leave the PhysBits variable unset.
On qemu we have the problem that for historical reasons x86_64 processors advertise 40 physical address space bits by default, even in case the host supports less than that so actually using the whole address space will not work.
Because of that the code applies some extra sanity checks in case we find 40 (or less) physical address space bits advertised. Only known-good values (which is 40 for amd processors and 36+39 for intel processors) will be accepted as valid.
Recommendation is to use 'qemu -cpu ${name},host-phys-bits=on' to advertise valid physical address space bits to the guest. Some distro builds enable this by default, and most likely the qemu default will change in near future too.
Signed-off-by: Gerd Hoffmann kraxel@redhat.com
src/fw/paravirt.h | 2 ++ src/fw/paravirt.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+)
diff --git a/src/fw/paravirt.h b/src/fw/paravirt.h index 4e2e993ba9d3..7ca3fdcc06d4 100644 --- a/src/fw/paravirt.h +++ b/src/fw/paravirt.h @@ -31,6 +31,8 @@ typedef struct QemuCfgDmaAccess { extern u32 RamSize; extern u64 RamSizeOver4G; extern int PlatformRunningOn; +extern u8 PhysBits; +extern int LongMode;
static inline int runningOnQEMU(void) { return CONFIG_QEMU || ( diff --git a/src/fw/paravirt.c b/src/fw/paravirt.c index c880cb10a1bc..a84968661aee 100644 --- a/src/fw/paravirt.c +++ b/src/fw/paravirt.c @@ -32,6 +32,10 @@ u32 RamSize; // Amount of continuous ram >4Gig u64 RamSizeOver4G; +// physical address space bits +u8 PhysBits; +// 64bit processor +int LongMode;
Odd that LongMode is an "int" while PhysBits is a u8. It would probably be simpler to do: LongMode = !!(edx & (1 << 29)) - and then treat it as a flag variable.
Since these variables are exported, it might also be worthwhile to give them more descriptive names - for example CPUPhysBits and CPULongMode.
Separately, series looks fine to me. -Kevin
// Type of emulator platform. int PlatformRunningOn VARFSEG; // cfg enabled @@ -129,6 +133,58 @@ static void kvmclock_init(void) tsctimer_setfreq(MHz * 1000, "kvmclock"); }
+static void physbits(int qemu_quirk)
+{
+    unsigned int max, eax, ebx, ecx, edx;
+    unsigned int physbits;
+    char signature[13];
+    int pae = 0, valid = 0;
+
+    cpuid(0, &eax, &ebx, &ecx, &edx);
+    memcpy(signature + 0, &ebx, 4);
+    memcpy(signature + 4, &edx, 4);
+    memcpy(signature + 8, &ecx, 4);
+    signature[12] = 0;
+    if (eax >= 1) {
+        cpuid(1, &eax, &ebx, &ecx, &edx);
+        pae = (edx & (1 << 6));
+    }
+
+    cpuid(0x80000000, &eax, &ebx, &ecx, &edx);
+    max = eax;
+
+    if (max >= 0x80000001) {
+        cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
+        LongMode = (edx & (1 << 29));
+    }
+
+    if (pae && LongMode && max >= 0x80000008) {
+        cpuid(0x80000008, &eax, &ebx, &ecx, &edx);
+        physbits = (u8)eax;
+        if (!qemu_quirk) {
+            valid = 1;
+        } else if (physbits >= 41) {
+            valid = 1;
+        } else if (strcmp(signature, "GenuineIntel") == 0) {
+            if ((physbits == 36) || (physbits == 39))
+                valid = 1;
+        } else if (strcmp(signature, "AuthenticAMD") == 0) {
+            if (physbits == 40)
+                valid = 1;
+        }
+    } else {
+        physbits = pae ? 36 : 32;
+        valid = 1;
+    }
+
+    dprintf(1, "%s: signature=\"%s\", pae=%s, lm=%s, phys-bits=%d, valid=%s\n",
+            __func__, signature, pae ? "yes" : "no", LongMode ? "yes" : "no",
+            physbits, valid ? "yes" : "no");
+
+    if (valid)
+        PhysBits = physbits;
+}
static void qemu_detect(void) { if (!CONFIG_QEMU_HARDWARE) @@ -161,6 +217,7 @@ static void qemu_detect(void) dprintf(1, "Running on QEMU (unknown nb: %04x:%04x)\n", v, d); break; }
+    physbits(1);
}
static int qemu_early_e820(void);
2.40.1
When the size of the physical address space is known (PhysBits is not zero), move the 64bit pci io window to the end of the address space.
Signed-off-by: Gerd Hoffmann kraxel@redhat.com --- src/fw/pciinit.c | 8 ++++++++ 1 file changed, 8 insertions(+)
diff --git a/src/fw/pciinit.c b/src/fw/pciinit.c index badf13d3233b..ad6def93633b 100644 --- a/src/fw/pciinit.c +++ b/src/fw/pciinit.c @@ -1128,6 +1128,14 @@ static void pci_bios_map_devices(struct pci_bus *busses) r64_mem.base = le64_to_cpu(romfile_loadint("etc/reserved-memory-end", 0)); if (r64_mem.base < 0x100000000LL + RamSizeOver4G) r64_mem.base = 0x100000000LL + RamSizeOver4G; + if (PhysBits) { + u64 top = 1LL << PhysBits; + u64 size = (ALIGN(sum_mem, (1LL<<30)) + + ALIGN(sum_pref, (1LL<<30))); + if (r64_mem.base < top - size) { + r64_mem.base = top - size; + } + } r64_mem.base = ALIGN(r64_mem.base, align_mem); r64_mem.base = ALIGN(r64_mem.base, (1LL<<30)); // 1G hugepage r64_pref.base = r64_mem.base + sum_mem;
Current seabios code will only enable and use the 64bit pci io window in case it runs out of space in the 32bit pci mmio window below 4G.
This patch will also enable the 64bit pci io window when (a) RAM above 4G is present, and (b) the physical address space size is known, and (c) seabios is running on a 64bit capable processor.
This operates with the assumption that guests which are ok with memory above 4G most likely can handle mmio above 4G too.
In case the 64bit pci io window is enabled, also assign more memory to prefetchable pci bridge windows and to the complete 64bit pci io window.
The total mmio window size is 1/8 of the physical address space. The minimum bridge window size is 1/256 of the total mmio window size.
Signed-off-by: Gerd Hoffmann kraxel@redhat.com --- src/fw/pciinit.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-)
diff --git a/src/fw/pciinit.c b/src/fw/pciinit.c index ad6def93633b..b715bf2d772e 100644 --- a/src/fw/pciinit.c +++ b/src/fw/pciinit.c @@ -51,6 +51,7 @@ u64 pcimem_end = BUILD_PCIMEM_END; u64 pcimem64_start = BUILD_PCIMEM64_START; u64 pcimem64_end = BUILD_PCIMEM64_END; u64 pci_io_low_end = 0xa000; +u32 pci_use_64bit = 0;
struct pci_region_entry { struct pci_device *dev; @@ -920,6 +921,8 @@ static int pci_bios_check_devices(struct pci_bus *busses) for (type = 0; type < PCI_REGION_TYPE_COUNT; type++) { u64 align = (type == PCI_REGION_TYPE_IO) ? PCI_BRIDGE_IO_MIN : PCI_BRIDGE_MEM_MIN; + if (pci_use_64bit && (type == PCI_REGION_TYPE_PREFMEM)) + align = (u64)1 << (PhysBits - 11); if (!pci_bridge_has_region(s->bus_dev, type)) continue; u64 size = 0; @@ -1108,7 +1111,7 @@ static void pci_bios_map_devices(struct pci_bus *busses) panic("PCI: out of I/O address space\n");
dprintf(1, "PCI: 32: %016llx - %016llx\n", pcimem_start, pcimem_end); - if (pci_bios_init_root_regions_mem(busses)) { + if (pci_use_64bit || pci_bios_init_root_regions_mem(busses)) { struct pci_region r64_mem, r64_pref; r64_mem.list.first = NULL; r64_pref.list.first = NULL; @@ -1132,6 +1135,8 @@ static void pci_bios_map_devices(struct pci_bus *busses) u64 top = 1LL << PhysBits; u64 size = (ALIGN(sum_mem, (1LL<<30)) + ALIGN(sum_pref, (1LL<<30))); + if (pci_use_64bit) + size = ALIGN(size, (1LL<<(PhysBits-3))); if (r64_mem.base < top - size) { r64_mem.base = top - size; } @@ -1174,6 +1179,9 @@ pci_setup(void)
dprintf(3, "pci setup\n");
+ if (PhysBits >= 36 && LongMode && RamSizeOver4G) + pci_use_64bit = 1; + dprintf(1, "=== PCI bus & bridge init ===\n"); if (pci_probe_host() != 0) { return;
Log the reservations with loglevel 1 (the same level we use for RAM entries), so they are included in the firmware log by default.
Signed-off-by: Gerd Hoffmann kraxel@redhat.com --- src/fw/paravirt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/fw/paravirt.c b/src/fw/paravirt.c index a84968661aee..2c9ed4c3f4d6 100644 --- a/src/fw/paravirt.c +++ b/src/fw/paravirt.c @@ -764,7 +764,7 @@ static int qemu_early_e820(void) switch (table.type) { case E820_RESERVED: e820_add(table.address, table.length, table.type); - dprintf(3, "qemu/e820: addr 0x%016llx len 0x%016llx [reserved]\n", + dprintf(1, "qemu/e820: addr 0x%016llx len 0x%016llx [reserved]\n", table.address, table.length); break; case E820_RAM:
Add support to check for overlaps with e820 entries. In case the 64bit pci io window has conflicts, move it down.
The only known case where this happens is AMD processors with a 1TB address space, which have some space just below 1TB reserved for HT.
Signed-off-by: Gerd Hoffmann kraxel@redhat.com --- src/e820map.h | 1 + src/e820map.c | 15 +++++++++++++++ src/fw/pciinit.c | 2 ++ 3 files changed, 18 insertions(+)
diff --git a/src/e820map.h b/src/e820map.h index de8b523003c5..07ce16ec213f 100644 --- a/src/e820map.h +++ b/src/e820map.h @@ -18,6 +18,7 @@ struct e820entry { void e820_add(u64 start, u64 size, u32 type); void e820_remove(u64 start, u64 size); void e820_prepboot(void); +int e820_is_used(u64 start, u64 size);
// e820 map storage extern struct e820entry e820_list[]; diff --git a/src/e820map.c b/src/e820map.c index 39445cf6399d..c761e5e98a75 100644 --- a/src/e820map.c +++ b/src/e820map.c @@ -150,3 +150,18 @@ e820_prepboot(void) { dump_map(); } + +int +e820_is_used(u64 start, u64 size) +{ + int i; + for (i=0; i<e820_count; i++) { + struct e820entry *e = &e820_list[i]; + if (start + size <= e->start) + continue; + if (start >= e->start + e->size) + continue; + return 1; + } + return 0; +} diff --git a/src/fw/pciinit.c b/src/fw/pciinit.c index b715bf2d772e..190e79216f86 100644 --- a/src/fw/pciinit.c +++ b/src/fw/pciinit.c @@ -1140,6 +1140,8 @@ static void pci_bios_map_devices(struct pci_bus *busses) if (r64_mem.base < top - size) { r64_mem.base = top - size; } + if (e820_is_used(r64_mem.base, size)) + r64_mem.base -= size; } r64_mem.base = ALIGN(r64_mem.base, align_mem); r64_mem.base = ALIGN(r64_mem.base, (1LL<<30)); // 1G hugepage