Implementation of the FW CFG DMA interface.
When running a Linux guest on top of QEMU with the -kernel option, these are the timing improvements for x86:
QEMU commit 2be4f242b50a8 and SeaBIOS commit 908a58c1d5ff: QEMU startup time: .078; BIOS startup time: .060; Kernel setup time: .578; Total time: .716
QEMU with this patch series and SeaBIOS with this patch series: QEMU startup time: .080; BIOS startup time: .039; Kernel setup time: .002; Total time: .121
QEMU startup time is the time between the start and the first kvm_entry. BIOS startup time is the time between the first kvm_entry and the start of function do_boot, in SeaBIOS. Kernel setup time is the time between the start of the function do_boot in SeaBIOS and the jump to the Linux kernel.
As you can see, both the BIOS (because of ACPI tables and other configurations) and the Linux kernel boot (because of the copy to memory) are greatly improved with this new interface.
Also, this new interface is an add-on to the old interface. The two interfaces are compatible and interchangeable.
Implement guest-side of the QEMU FW CFG DMA interface for x86
Marc Marí (2): Add QEMU fw_cfg DMA interface Boot Linux using QEMU fw_cfg DMA interface
src/boot.c | 14 +++++- src/fw/paravirt.c | 131 ++++++++++++++++++++++++++++++++++++++++++++++-------- src/fw/paravirt.h | 51 +++++++++++++++++++++ src/romlayout.S | 20 +++++++++ 4 files changed, 197 insertions(+), 19 deletions(-)
Add support for the new fw_cfg DMA interface. The protocol is explained in QEMU documentation.
Signed-off-by: Marc Marí markmb@redhat.com --- src/fw/paravirt.c | 48 +++++++++++++++++++++++++++++++++++++++++++++--- src/fw/paravirt.h | 17 +++++++++++++++++ 2 files changed, 62 insertions(+), 3 deletions(-)
diff --git a/src/fw/paravirt.c b/src/fw/paravirt.c index db22ae8..287bf23 100644 --- a/src/fw/paravirt.c +++ b/src/fw/paravirt.c @@ -30,6 +30,13 @@ u32 RamSize; u64 RamSizeOver4G; // Type of emulator platform. int PlatformRunningOn VARFSEG; +// cfg_dma enabled +int cfg_dma_enabled = 0; + +inline int qemu_cfg_dma_enabled(void) +{ + return cfg_dma_enabled; +}
/* This CPUID returns the signature 'KVMKVMKVM' in ebx, ecx, and edx. It * should be used to determine that a VM is running under KVM. @@ -201,14 +208,39 @@ qemu_cfg_select(u16 f) static void qemu_cfg_read(void *buf, int len) { - insb(PORT_QEMU_CFG_DATA, buf, len); + if (qemu_cfg_dma_enabled()) { + QemuCfgDmaAccess access; + + access.address = (u64)(u32)buf; + access.length = len; + access.control = QEMU_CFG_DMA_CTL_READ; + + /* + * The out is done before the write of the variables on memory. This + * causes misread on the QEMU side. + */ + barrier(); + + outl((u32)&access, PORT_QEMU_CFG_DMA_ADDR); + while(access.length != 0 && !(access.control & QEMU_CFG_DMA_CTL_ERROR)); + } else { + insb(PORT_QEMU_CFG_DATA, buf, len); + } }
static void qemu_cfg_skip(int len) { - while (len--) - inb(PORT_QEMU_CFG_DATA); + if (len == 0) { + return; + } + + if (qemu_cfg_dma_enabled()) { + qemu_cfg_read(NULL, len); + } else { + while (len--) + inb(PORT_QEMU_CFG_DATA); + } }
static void @@ -422,8 +454,18 @@ void qemu_cfg_init(void) for (i = 0; i < 4; i++) if (inb(PORT_QEMU_CFG_DATA) != sig[i]) return; + dprintf(1, "Found QEMU fw_cfg\n");
+ // Detect DMA interface. + u32 id; + qemu_cfg_read_entry(&id, QEMU_CFG_ID, sizeof(id)); + + if (id == QEMU_CFG_DMA_ID) { + dprintf(1, "QEMU fw_cfg DMA interface supported\n"); + cfg_dma_enabled = 1; + } + // Populate romfiles for legacy fw_cfg entries qemu_cfg_legacy();
diff --git a/src/fw/paravirt.h b/src/fw/paravirt.h index 95ffb92..05b4997 100644 --- a/src/fw/paravirt.h +++ b/src/fw/paravirt.h @@ -9,6 +9,12 @@ #define PF_XEN (1<<1) #define PF_KVM (1<<2)
+typedef struct QemuCfgDmaAccess { + u64 address; + u32 length; + u32 control; +} QemuCfgDmaAccess; + extern u32 RamSize; extern u64 RamSizeOver4G; extern int PlatformRunningOn; @@ -29,9 +35,20 @@ static inline int runningOnKVM(void) { #define PORT_SMI_STATUS 0x00b3 #define PORT_QEMU_CFG_CTL 0x0510 #define PORT_QEMU_CFG_DATA 0x0511 +#define PORT_QEMU_CFG_DMA_ADDR 0x0512 + +// QEMU_CFG_DMA_CONTROL bits +#define QEMU_CFG_DMA_CTL_ERROR 0x01 +#define QEMU_CFG_DMA_CTL_READ 0x02 +#define QEMU_CFG_DMA_CTL_MASK 0x03 + +// QEMU_CFG_DMA ID bit +#define QEMU_CFG_DMA_ID 2
+int qemu_cfg_dma_enabled(void); void qemu_preinit(void); void qemu_platform_setup(void); void qemu_cfg_init(void); +void qemu_cfg_dma_boot_linux();
#endif
On Thu, Aug 6, 2015 at 12:02 PM, Marc Marí markmb@redhat.com wrote:
- if (qemu_cfg_dma_enabled()) {
QemuCfgDmaAccess access;
access.address = (u64)(u32)buf;
access.length = len;
access.control = QEMU_CFG_DMA_CTL_READ;
/*
* The out is done before the write of the variables on memory. This
* causes misread on the QEMU side.
*/
barrier();
outl((u32)&access, PORT_QEMU_CFG_DMA_ADDR);
I thought PORT_QEMU_CFG_DMA_ADDR is a 64-bit register according to the spec you posted?
while(access.length != 0 && !(access.control & QEMU_CFG_DMA_CTL_ERROR));
Either the field accesses need to be marked volatile, or a barrier is needed to force the compiler to reload these fields from memory on each iteration of the loop.
I think the problem is that there are no "sequence points" (according to the C language specification) in this loop, so the compiler may assume that access.length and access.control will not change. https://en.wikipedia.org/wiki/Sequence_point
Stefan
Reading Linux from the fw_cfg interface is faster using the DMA interface. For this reason, add a Linux loader that can benefit from this interface.
Signed-off-by: Marc Marí markmb@redhat.com --- src/boot.c | 14 +++++++++- src/fw/paravirt.c | 83 +++++++++++++++++++++++++++++++++++++++++++++---------- src/fw/paravirt.h | 34 +++++++++++++++++++++++ src/romlayout.S | 20 ++++++++++++++ 4 files changed, 135 insertions(+), 16 deletions(-)
diff --git a/src/boot.c b/src/boot.c index e0f73a3..7d187c5 100644 --- a/src/boot.c +++ b/src/boot.c @@ -684,6 +684,14 @@ boot_rom(u32 vector) call_boot_entry(so, 0); }
+// Boot from a linuxboot ROM when QEMU cfg is in DMA mode +static void +boot_linux_cfg_dma(void) +{ + printf("Booting Linux from fw_cfg...\n"); + qemu_cfg_dma_boot_linux(); +} + // Unable to find bootable device - warn user and eventually retry. static void boot_fail(void) @@ -732,7 +740,11 @@ do_boot(int seq_nr) boot_cbfs((void*)ie->vector); break; case IPL_TYPE_BEV: - boot_rom(ie->vector); + if (qemu_cfg_dma_enabled()) { + boot_linux_cfg_dma(); + } else { + boot_rom(ie->vector); + } break; case IPL_TYPE_HALT: boot_fail(); diff --git a/src/fw/paravirt.c b/src/fw/paravirt.c index 287bf23..d9ac918 100644 --- a/src/fw/paravirt.c +++ b/src/fw/paravirt.c @@ -23,6 +23,8 @@ #include "util.h" // pci_setup #include "x86.h" // cpuid #include "xen.h" // xen_biostable_setup +#include "bregs.h" // struct bregs +#include "stacks.h" // farcall16big
// Amount of continuous ram under 4Gig u32 RamSize; @@ -184,21 +186,6 @@ qemu_platform_setup(void) * QEMU firmware config (fw_cfg) interface ****************************************************************/
-// List of QEMU fw_cfg entries. DO NOT ADD MORE. (All new content -// should be passed via the fw_cfg "file" interface.) -#define QEMU_CFG_SIGNATURE 0x00 -#define QEMU_CFG_ID 0x01 -#define QEMU_CFG_UUID 0x02 -#define QEMU_CFG_NUMA 0x0d -#define QEMU_CFG_BOOT_MENU 0x0e -#define QEMU_CFG_MAX_CPUS 0x0f -#define QEMU_CFG_FILE_DIR 0x19 -#define QEMU_CFG_ARCH_LOCAL 0x8000 -#define QEMU_CFG_ACPI_TABLES (QEMU_CFG_ARCH_LOCAL + 0) -#define QEMU_CFG_SMBIOS_ENTRIES (QEMU_CFG_ARCH_LOCAL + 1) -#define QEMU_CFG_IRQ0_OVERRIDE (QEMU_CFG_ARCH_LOCAL + 2) -#define QEMU_CFG_E820_TABLE (QEMU_CFG_ARCH_LOCAL + 3) - static void qemu_cfg_select(u16 f) { @@ -488,3 +475,69 @@ void qemu_cfg_init(void) dprintf(1, "Moving pm_base to 0x%x\n", acpi_pm_base); } } + +void qemu_cfg_dma_boot_linux(void) +{ + dprintf(1, "Loading kernel\n"); + void *setup_addr; + u32 setup_size; + qemu_cfg_read_entry(&setup_addr, QEMU_CFG_SETUP_ADDR, 4); + qemu_cfg_read_entry(&setup_size, QEMU_CFG_SETUP_SIZE, 4); + qemu_cfg_read_entry(setup_addr, QEMU_CFG_SETUP_DATA, setup_size); + + if (readl(setup_addr + 0x202) != 0x53726448) { + dprintf(1, "Not valid kernel\n"); + return; + } + + u16 protocol = readw(setup_addr + 0x206); + if (protocol < 0x203) { + dprintf(1, "Old kernel (v %x) not supported\n", protocol); + return; + } + + void *kernel_addr; + u32 kernel_size; + + qemu_cfg_read_entry(&kernel_addr, QEMU_CFG_KERNEL_ADDR, 4); + qemu_cfg_read_entry(&kernel_size, QEMU_CFG_KERNEL_SIZE, 4); + qemu_cfg_read_entry(kernel_addr, QEMU_CFG_KERNEL_DATA, kernel_size); + + void *cmdline_addr; + u32 cmdline_size; + qemu_cfg_read_entry(&cmdline_addr, QEMU_CFG_CMDLINE_ADDR, 4); + qemu_cfg_read_entry(&cmdline_size, QEMU_CFG_CMDLINE_SIZE, 4); + if (cmdline_size) { + qemu_cfg_read_entry(cmdline_addr, QEMU_CFG_CMDLINE_DATA, cmdline_size); + } + + void *initrd_addr; + u32 initrd_size; + qemu_cfg_read_entry(&initrd_addr, QEMU_CFG_INITRD_ADDR, 4); + qemu_cfg_read_entry(&initrd_size, QEMU_CFG_INITRD_SIZE, 4); + if (initrd_size) { + 
qemu_cfg_read_entry(initrd_addr, QEMU_CFG_INITRD_DATA, initrd_size); + } + + // Last configurations + writel(setup_addr + 0x228, (u32)cmdline_addr); + writeb(setup_addr + 0x210, 0xB0); + writeb(setup_addr + 0x211, readb(setup_addr + 0x211) | 0x80); + writew(setup_addr + 0x224, cmdline_addr - setup_addr - 0x200); + writel(setup_addr + 0x218, (u32)initrd_addr); + writel(setup_addr + 0x21c, initrd_size); + + dprintf(1, "Jumping to kernel %d@%x %d@%x %d@%x %d@%x\n" + , setup_size, (u32)setup_addr, cmdline_size, (u32)cmdline_addr + , kernel_size, (u32)kernel_addr, initrd_size, (u32)initrd_addr); + struct bregs br; + memset(&br, 0, sizeof(br)); + extern void kernel_stub(void); + br.ebx = (u32)setup_addr >> 4; + br.edx = (u32)cmdline_addr - (u32)setup_addr - 16; + br.code = SEGOFF(SEG_BIOS, (u32)kernel_stub - BUILD_BIOS_ADDR); + + outb(0xa, 0xf4); + + farcall16big(&br); +} diff --git a/src/fw/paravirt.h b/src/fw/paravirt.h index 05b4997..c709151 100644 --- a/src/fw/paravirt.h +++ b/src/fw/paravirt.h @@ -45,6 +45,40 @@ static inline int runningOnKVM(void) { // QEMU_CFG_DMA ID bit #define QEMU_CFG_DMA_ID 2
+// List of QEMU fw_cfg entries. DO NOT ADD MORE. (All new content +// should be passed via the fw_cfg "file" interface.) +#define QEMU_CFG_SIGNATURE 0x00 +#define QEMU_CFG_ID 0x01 +#define QEMU_CFG_UUID 0x02 +#define QEMU_CFG_RAM_SIZE 0x03 +#define QEMU_CFG_NOGRAPHIC 0x04 +#define QEMU_CFG_NB_CPUS 0x05 +#define QEMU_CFG_MACHINE_ID 0x06 +#define QEMU_CFG_KERNEL_ADDR 0x07 +#define QEMU_CFG_KERNEL_SIZE 0x08 +#define QEMU_CFG_KERNEL_CMDLINE 0x09 +#define QEMU_CFG_INITRD_ADDR 0x0a +#define QEMU_CFG_INITRD_SIZE 0x0b +#define QEMU_CFG_BOOT_DEVICE 0x0c +#define QEMU_CFG_NUMA 0x0d +#define QEMU_CFG_BOOT_MENU 0x0e +#define QEMU_CFG_MAX_CPUS 0x0f +#define QEMU_CFG_KERNEL_ENTRY 0x10 +#define QEMU_CFG_KERNEL_DATA 0x11 +#define QEMU_CFG_INITRD_DATA 0x12 +#define QEMU_CFG_CMDLINE_ADDR 0x13 +#define QEMU_CFG_CMDLINE_SIZE 0x14 +#define QEMU_CFG_CMDLINE_DATA 0x15 +#define QEMU_CFG_SETUP_ADDR 0x16 +#define QEMU_CFG_SETUP_SIZE 0x17 +#define QEMU_CFG_SETUP_DATA 0x18 +#define QEMU_CFG_FILE_DIR 0x19 +#define QEMU_CFG_ARCH_LOCAL 0x8000 +#define QEMU_CFG_ACPI_TABLES (QEMU_CFG_ARCH_LOCAL + 0) +#define QEMU_CFG_SMBIOS_ENTRIES (QEMU_CFG_ARCH_LOCAL + 1) +#define QEMU_CFG_IRQ0_OVERRIDE (QEMU_CFG_ARCH_LOCAL + 2) +#define QEMU_CFG_E820_TABLE (QEMU_CFG_ARCH_LOCAL + 3) + int qemu_cfg_dma_enabled(void); void qemu_preinit(void); void qemu_platform_setup(void); diff --git a/src/romlayout.S b/src/romlayout.S index 7938e22..1c641c2 100644 --- a/src/romlayout.S +++ b/src/romlayout.S @@ -196,6 +196,26 @@ __farcall16: IRQ_TRAMPOLINE 1c IRQ_TRAMPOLINE 4a
+ DECLFUNC kernel_stub +kernel_stub: + movw %bx, %ds + movw %bx, %es + movw %bx, %fs + movw %bx, %gs + movw %bx, %ss + movl %edx, %esp + addw $0x20, %bx + pushw %bx // push CS + pushw $0 // push IP + xorl %eax, %eax + xorl %ebx, %ebx + xorl %ecx, %ecx + xorl %edx, %edx + xorl %edi, %edi + xorl %esi, %esi + xorl %ebp, %ebp + lretw +
/**************************************************************** * Misc. entry points.
On Thu, Aug 6, 2015 at 12:02 PM, Marc Marí markmb@redhat.com wrote:
case IPL_TYPE_BEV:
boot_rom(ie->vector);
if (qemu_cfg_dma_enabled()) {
boot_linux_cfg_dma();
} else {
boot_rom(ie->vector);
} break;
I thought the fw_cfg DMA interface is equivalent to the PIO interface?
Therefore, your Linux loading code completely replaces the linuxboot.rom. Why would you want to keep using linuxboot.rom in the PIO case if SeaBIOS now has the code to launch the kernel?
By the way, I think this change is incorrect. We shouldn't hijack the BEV. Instead fw_cfg should simply add a higher priority IPL entry when the kernel file is present. That way we just don't execute the option ROM.
Stefan
On Thu, Aug 6, 2015 at 12:00 PM, Marc Marí markmb@redhat.com wrote:
When running a Linux guest on top of QEMU, using the -kernel options, this is the timing improvement for x86:
QEMU commit 2be4f242b50a8 and SeaBIOS commit 908a58c1d5ff QEMU startup time: .078 BIOS startup time: .060 Kernel setup time: .578 Total time: .716
QEMU with this patch series and SeaBIOS with this patch series QEMU startup time: .080 BIOS startup time: .039 Kernel setup time: .002 Total time: .121
Impressive results!
Is this a fully-featured QEMU build or did you disable things?
Is this the default SeaBIOS build or did you disable things?
Stefan