Implement support for booting from a PMEM region compatible with the NVDIMM and NFIT specifications.
The current implementation is still slow. When running a Linux guest on top of QEMU on x86:
Using the current -kernel option:
  QEMU startup time:  .080
  BIOS startup time:  .060
  Kernel setup time:  .590
  Total time:         .730

Using the -kernel option and the fw_cfg DMA patch:
  QEMU startup time:  .080
  BIOS startup time:  .039
  Kernel setup time:  .002
  Total time:         .121

Using this patch series and adding the vmlinuz as an NVDIMM device:
  QEMU startup time:  .080
  BIOS startup time:  .065
  Kernel setup time:  .015
  Total time:         .160
This patch series is not ready for merging. There are things missing and questions to be answered:
- Is it necessary to retrieve any other data from the NVDIMM?
- Is there any other nicer (and faster) option for the page table?
- Make NVDIMM a compile-time option, as the other devices.
- Add some kind of wrapper to the boot files instead of a direct vmlinux.
- Add NVDIMM boot priority support.
- Add support for initrd and cmdline. Maybe in another NVDIMM, or in the same, or through fw_cfg.
- Reorder code to change to long mode fewer times.
Any other proposals for improvements are appreciated.
Marc Marí (3):
  Basic NVDIMM PMEM support
  Transitions to and from 64 bits
  Add NVDIMM booting support
 Makefile            |   3 +-
 src/boot.c          |  18 ++++++++
 src/config.h        |   2 +
 src/fw/biostables.c |  80 ++++++++++++++++++++++++++++++++
 src/hw/nvdimm.c     | 128 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/hw/nvdimm.h     |  12 +++++
 src/misc.c          |   4 ++
 src/post.c          |   2 +
 src/romlayout.S     | 126 +++++++++++++++++++++++++++++++++++++++++++++++++++
 src/stacks.c        |  77 +++++++++++++++++++++++++++++++
 src/stacks.h        |   2 +
 src/std/acpi.h      |  40 ++++++++++++++++
 src/util.h          |   3 ++
 src/x86.h           |   1 +
 14 files changed, 497 insertions(+), 1 deletion(-)
 create mode 100644 src/hw/nvdimm.c
 create mode 100644 src/hw/nvdimm.h
Follows the ACPI NFIT specification.
Signed-off-by: Marc Marí <markmb@redhat.com>
---
 Makefile            |  3 +-
 src/fw/biostables.c | 80 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/hw/nvdimm.c     | 21 ++++++++++++++
 src/hw/nvdimm.h     | 11 ++++++++
 src/post.c          |  2 ++
 src/std/acpi.h      | 40 +++++++++++++++++++++++++++
 src/util.h          |  2 ++
 7 files changed, 158 insertions(+), 1 deletion(-)
 create mode 100644 src/hw/nvdimm.c
 create mode 100644 src/hw/nvdimm.h
diff --git a/Makefile b/Makefile
index 3a0d2e8..69a85a1 100644
--- a/Makefile
+++ b/Makefile
@@ -34,7 +34,8 @@ SRCBOTH=misc.c stacks.c output.c string.c block.c cdrom.c disk.c mouse.c kbd.c \
     hw/usb.c hw/usb-uhci.c hw/usb-ohci.c hw/usb-ehci.c \
     hw/usb-hid.c hw/usb-msc.c hw/usb-uas.c \
     hw/blockcmd.c hw/floppy.c hw/ata.c hw/ramdisk.c \
-    hw/lsi-scsi.c hw/esp-scsi.c hw/megasas.c
+    hw/lsi-scsi.c hw/esp-scsi.c hw/megasas.c \
+    hw/nvdimm.c
 SRC16=$(SRCBOTH)
 SRC32FLAT=$(SRCBOTH) post.c memmap.c malloc.c romfile.c x86.c optionroms.c \
     pmm.c font.c boot.c bootsplash.c jpeg.c bmp.c tcgbios.c sha1.c \
diff --git a/src/fw/biostables.c b/src/fw/biostables.c
index 71a1a0d..cb0f768 100644
--- a/src/fw/biostables.c
+++ b/src/fw/biostables.c
@@ -17,6 +17,7 @@
 #include "string.h" // memcpy
 #include "util.h" // copy_table
 #include "x86.h" // outb
+#include "hw/nvdimm.h"
struct pir_header *PirAddr VARFSEG;
@@ -172,6 +173,29 @@ find_resume_vector(void)
     return facs->firmware_waking_vector;
 }
+static struct nfit_descriptor *
+find_nfit(void)
+{
+    dprintf(4, "rsdp=%p\n", RsdpAddr);
+    if (!RsdpAddr || RsdpAddr->signature != RSDP_SIGNATURE)
+        return NULL;
+    struct rsdt_descriptor_rev1 *rsdt = (void*)RsdpAddr->rsdt_physical_address;
+    dprintf(4, "rsdt=%p\n", rsdt);
+    if (!rsdt || rsdt->signature != RSDT_SIGNATURE)
+        return NULL;
+    void *end = (void*)rsdt + rsdt->length;
+    int i;
+    for (i=0; (void*)&rsdt->table_offset_entry[i] < end; i++) {
+        struct nfit_descriptor *nfit = (void*)rsdt->table_offset_entry[i];
+        if (!nfit || nfit->signature != NFIT_SIGNATURE)
+            continue;
+        dprintf(4, "nfit=%p\n", nfit);
+        return nfit;
+    }
+    dprintf(4, "no nfit found\n");
+    return NULL;
+}
+
 static struct acpi_20_generic_address acpi_reset_reg;
 static u8 acpi_reset_val;
 u32 acpi_pm1a_cnt VARFSEG;
@@ -456,6 +480,62 @@ smbios_setup(void)
         smbios_legacy_setup();
 }
+/****************************************************************
+ * NFIT
+ ****************************************************************/
+
+#define NVDIMM_MAX 10 /* To avoid using dynamic arrays. Any nicer solution? */
+struct nfit_descriptor *NfitAddr;
+struct nvdimm_addr NvdimmAddr[NVDIMM_MAX + 1];
+
+int nfit_setup(void) {
+    if (NfitAddr) {
+        return 1;
+    }
+
+    NfitAddr = find_nfit();
+
+    if (!NfitAddr) {
+        return 0;
+    }
+
+    void *addr = (void *)NfitAddr + sizeof(*NfitAddr);
+    struct nfit_spa *spa;
+    int index_nvdimm = 0;
+
+    while (addr < ((void *)NfitAddr + NfitAddr->length)) {
+        u16 type = *((u16 *)addr);
+        u16 length = *((u16 *)(addr + 2));
+        if (type == NFIT_TABLE_SPA) {
+            spa = addr;
+
+            if (*(u64 *)(&spa->type_guid[0]) == *(u64 *)(&type_guid_pmem[0]) &&
+                *(u64 *)(&spa->type_guid[8]) == *(u64 *)(&type_guid_pmem[8])) {
+
+                NvdimmAddr[index_nvdimm].addr = spa->spa_base;
+                NvdimmAddr[index_nvdimm].length = spa->spa_length;
+                dprintf(1, "Found NVDIMM at address 0x%llx, size %llx\n",
+                        spa->spa_base, spa->spa_length);
+                ++index_nvdimm;
+
+                if (index_nvdimm == NVDIMM_MAX) {
+                    dprintf(1, "Too many NVDIMMs. No more will be processed\n");
+                    return 1;
+                }
+            }
+        }
+
+        addr += length;
+    }
+
+    return 1;
+}
+
+struct nvdimm_addr *nfit_get_pmem_addr(void)
+{
+    return NvdimmAddr;
+}
+
 void
 copy_table(void *pos)
 {
diff --git a/src/hw/nvdimm.c b/src/hw/nvdimm.c
new file mode 100644
index 0000000..f7c91a1
--- /dev/null
+++ b/src/hw/nvdimm.c
@@ -0,0 +1,21 @@
+// Support for finding and booting from NVDIMM
+//
+// Copyright (C) 2015 Marc Marí <markmb@redhat.com>
+//
+// This file may be distributed under the terms of the GNU LGPLv3 license.
+
+#include "std/acpi.h"
+#include "util.h"
+#include "output.h"
+#include "memmap.h"
+#include "malloc.h"
+
+void nvdimm_setup(void)
+{
+    if (!nfit_setup()) {
+        dprintf(1, "No NVDIMMs found\n");
+        return;
+    }
+
+    dprintf(1, "NVDIMMs found\n");
+}
diff --git a/src/hw/nvdimm.h b/src/hw/nvdimm.h
new file mode 100644
index 0000000..1591b97
--- /dev/null
+++ b/src/hw/nvdimm.h
@@ -0,0 +1,11 @@
+#ifndef __NVDIMM_H
+#define __NVDIMM_H
+
+struct nvdimm_addr {
+    u64 addr;
+    u64 length;
+};
+
+void nvdimm_setup(void);
+
+#endif
diff --git a/src/post.c b/src/post.c
index 6803585..a4d9737 100644
--- a/src/post.c
+++ b/src/post.c
@@ -23,6 +23,7 @@
 #include "hw/usb.h" // usb_setup
 #include "hw/virtio-blk.h" // virtio_blk_setup
 #include "hw/virtio-scsi.h" // virtio_scsi_setup
+#include "hw/nvdimm.h" // nvdimm setup
 #include "malloc.h" // malloc_init
 #include "memmap.h" // add_e820
 #include "output.h" // dprintf
@@ -153,6 +154,7 @@ device_hardware_setup(void)
     esp_scsi_setup();
     megasas_setup();
     pvscsi_setup();
+    nvdimm_setup();
 }
 static void
diff --git a/src/std/acpi.h b/src/std/acpi.h
index b672bbe..2562976 100644
--- a/src/std/acpi.h
+++ b/src/std/acpi.h
@@ -313,5 +313,45 @@ struct tcpa_descriptor_rev2
 #define TCPA_ACPI_CLASS_CLIENT 0
 #define TCPA_ACPI_CLASS_SERVER 1
+/*
+ * NFIT (NVDIMM firmware interface) table
+ */
+
+enum {
+    NFIT_TABLE_SPA    = 0,
+    NFIT_TABLE_MEM    = 1,
+    NFIT_TABLE_IDT    = 2,
+    NFIT_TABLE_SMBIOS = 3,
+    NFIT_TABLE_DCR    = 4,
+    NFIT_TABLE_BDW    = 5,
+    NFIT_TABLE_FLUSH  = 6,
+};
+
+#define NFIT_SIGNATURE 0x5449464E // NFIT
+struct nfit_descriptor
+{
+    ACPI_TABLE_HEADER_DEF
+    u32 reserved;
+} PACKED;
+
+/*
+ * struct nfit_spa - System Physical Address Range Structure
+ */
+struct nfit_spa {
+    u16 type;
+    u16 length;
+    u16 spa_index;
+    u16 flags;
+    u32 reserved;
+    u32 proximity_domain;
+    u8 type_guid[16];
+    u64 spa_base;
+    u64 spa_length;
+    u64 mem_attr;
+} PACKED;
+
+/* Converted into little endian */
+static const u8 type_guid_pmem[16] = {0x79, 0xD3, 0xF0, 0x66, 0xF3, 0xB4, 0x74, 0x40,
+                                      0xAC, 0x43, 0x0D, 0x33, 0x18, 0xB7, 0x8C, 0xDB};
 #endif // acpi.h
diff --git a/src/util.h b/src/util.h
index 327abeb..84478f8 100644
--- a/src/util.h
+++ b/src/util.h
@@ -80,6 +80,8 @@ void copy_smbios(void *pos);
 void display_uuid(void);
 void copy_table(void *pos);
 void smbios_setup(void);
+int nfit_setup(void);
+struct nvdimm_addr *nfit_get_pmem_addr(void);
 // fw/coreboot.c
 extern const char *CBvendor, *CBpart;
Signed-off-by: Marc Marí <markmb@redhat.com>
---
 src/config.h    |   2 ++
 src/misc.c      |   4 +++
 src/romlayout.S | 106 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/stacks.c    |  77 ++++++++++++++++++++++++++++++++++++++++
 src/stacks.h    |   2 ++
 src/x86.h       |   1 +
 6 files changed, 192 insertions(+)
diff --git a/src/config.h b/src/config.h
index 6c47f16..dfbed89 100644
--- a/src/config.h
+++ b/src/config.h
@@ -69,6 +69,8 @@
 #define SEG32_MODE16_DS (4 << 3)
 #define SEG32_MODE16BIG_CS (5 << 3)
 #define SEG32_MODE16BIG_DS (6 << 3)
+#define SEG32_MODE64_CS (7 << 3)
+#define SEG32_MODE64_DS (8 << 3)
 // Debugging levels. If non-zero and CONFIG_DEBUG_LEVEL is greater
 // than the specified value, then the corresponding irq handler will
diff --git a/src/misc.c b/src/misc.c
index 8caaf31..6c9a490 100644
--- a/src/misc.c
+++ b/src/misc.c
@@ -168,6 +168,10 @@ u64 rombios32_gdt[] VARFSEG __aligned(8) = {
     GDT_GRANLIMIT(0xffffffff) | GDT_CODE | GDT_BASE(BUILD_BIOS_ADDR),
     // 16 bit data segment base=0 limit=0xffffffff (SEG32_MODE16BIG_DS)
     GDT_GRANLIMIT(0xffffffff) | GDT_DATA,
+    // 64 bit code segment (SEG32_MODE64_CS)
+    GDT_GRANLIMIT(0xffffffff) | GDT_CODE | GDT_L,
+    // 64 bit data segment (SEG32_MODE64_DS)
+    GDT_GRANLIMIT(0xffffffff) | GDT_DATA | GDT_L,
 };
 // GDT descriptor
diff --git a/src/romlayout.S b/src/romlayout.S
index fefc212..4cb9f4c 100644
--- a/src/romlayout.S
+++ b/src/romlayout.S
@@ -115,6 +115,112 @@ transition16big:
jmpl *%edx
+// Place CPU into 64bit mode from 32bit mode.
+// %edi = Valid page table.
+// %edx = return location (in 32bit mode)
+// Clobbers: ebx, ecx, edi, flags, control registers
+
+        DECLFUNC transition64
+        .code32
+transition64:
+        // Paging is already disabled
+
+        // Enable PAE
+        movl %cr4, %ebx
+        orl $0x20, %ebx
+        movl %ebx, %cr4
+
+        // Point CR3 at the PML4
+        movl %edi, %cr3
+
+        // Save valuable info in %eax and %edx before rdmsr erases it
+        movl %edx, %ebx
+        movl %eax, %edi
+
+        // Enable IA-32E mode
+        movl $0xC0000080, %ecx
+        rdmsr
+        orl $0x100, %eax
+        wrmsr
+
+        // Restore saved info
+        movl %ebx, %edx
+        movl %edi, %eax
+
+        // Enable paging
+        movl %cr0, %ebx
+        orl $0x80000000, %ebx
+        movl %ebx, %cr0
+
+        ljmpl $SEG32_MODE64_CS, $(BUILD_BIOS_ADDR + 1f)
+
+        // Set segment descriptors
+1:      movl $SEG32_MODE64_DS, %ebx
+        movw %bx, %ds
+        movw %bx, %es
+        movw %bx, %ss
+        movw %bx, %fs
+        movw %bx, %gs
+
+        // Jump
+        jmpl *%edx
+
+        .code16
+
+// Place CPU into 32bit mode from 64bit mode.
+// %edx = return location (in 32bit mode)
+// Clobbers: ebx, ecx, edi, flags, control registers
+
+        DECLFUNC transition32_from_64
+transition32_from_64:
+        // Jump into compatibility mode
+        .code64
+        movl $SEG32_MODE32_CS, %ecx
+        pushq %rcx                      // CS
+        movl $(BUILD_BIOS_ADDR + 1f), %ecx
+        pushq %rcx                      // IP
+        retfq
+
+        .code32
+        // Set 32 bit mode segments
+1:      movl $SEG32_MODE32_DS, %ebx
+        movw %bx, %ds
+        movw %bx, %es
+        movw %bx, %ss
+        movw %bx, %fs
+        movw %bx, %gs
+
+        // Disable paging
+        movl %cr0, %ebx
+        andl $(~0x80000000), %ebx
+        movl %ebx, %cr0
+
+        // Point cr3 to 0 again
+        movl $0, %ebx
+        movl %ebx, %cr3
+
+        // Disable PAE
+        movl %cr4, %ebx
+        andl $(~0x20), %ebx
+        movl %ebx, %cr4
+
+        // Save valuable info in %eax and %edx before rdmsr erases it
+        movl %edx, %ebx
+        movl %eax, %edi
+
+        // Disable IA-32E mode
+        movl $0xC0000080, %ecx
+        rdmsr
+        andl $(~0x100), %eax
+        wrmsr
+
+        // Restore saved info
+        movl %ebx, %edx
+        movl %edi, %eax
+
+        // Jump
+        jmpl *%edx
+        .code16
 /****************************************************************
  * External calling trampolines
  ****************************************************************/
diff --git a/src/stacks.c b/src/stacks.c
index 850a335..f71a184 100644
--- a/src/stacks.c
+++ b/src/stacks.c
@@ -15,6 +15,8 @@
 #include "stacks.h" // struct mutex_s
 #include "string.h" // memset
 #include "util.h" // useRTC
+#include "memmap.h" // PAGE_SIZE
+#include "string.h" // memset
#define MAIN_STACK_MAX (1024*1024)
@@ -106,6 +108,7 @@ call16_helper(u32 eax, u32 edx, u32 (*func)(u32 eax, u32 edx))
 #define ASM32_BACK32 " .popsection\n .code32\n"
 #define ASM16_SWITCH32 " .code32\n"
 #define ASM16_BACK16 " .code16gcc\n"
+#define ASM32_SWITCH64 " .code64\n"
 // Call a SeaBIOS C function in 32bit mode using smm trampoline
 static u32
@@ -308,6 +311,24 @@ call16big(u32 eax, u32 edx, void *func)
     return call16_back(eax, edx, func);
 }
+u32 call64(void *page_table, void *func, u32 eax)
+{
+    ASSERT32FLAT();
+
+    asm volatile(
+        // Transition to 64bit mode
+        "  movl $1f, %%edx\n"
+        "  jmp transition64\n"
+        "1:movl %1, %%edx\n"
+        "  calll *%%edx\n"
+        "  movl $2f, %%edx\n"
+        "  jmp transition32_from_64\n"
+        "2:\n"
+        : "+a" (eax)
+        : "g" (func), "D" (page_table)
+        : "edx", "ebx", "ecx", "cc", "memory");
+    return eax;
+}
 /****************************************************************
  * Extra 16bit stack
@@ -762,3 +783,59 @@ call32_params(void *func, u32 eax, u32 edx, u32 ecx, u32 errret)
     return call32(_cfunc32flat_call32_params_helper
                   , (u32)MAKE_FLATPTR(GET_SEG(SS), &params), errret);
 }
+
+/****************************************************************
+ * Page table
+ ****************************************************************/
+void *gen_identity_page_table(u64 max_addr)
+{
+    /* Map directly all the addresses */
+    u32 pt_entries = (max_addr + 0xFFF) >> 12;
+    u32 pdt_entries = (pt_entries + 0x1FF) >> 9;
+    u32 pdpt_entries = (pdt_entries + 0x1FF) >> 9;
+    u32 pml4_entries = (pdpt_entries + 0x1FF) >> 9;
+
+    if (pml4_entries > 1) {
+        dprintf(1, "Page table too big\n");
+        return NULL;
+    }
+
+    u32 table_size = (pdt_entries << 12)                    // PT size
+                     + (pdpt_entries << 12)                 // PDT size
+                     + (pml4_entries << 12)                 // PDPT size
+                     + (((pml4_entries + 0x1FF) >> 9) << 12); // PML4 size
+
+    void *table = memalign_tmp(PAGE_SIZE, table_size);
+    u64 *cur_pos = table;
+    u32 i;
+
+    memset(table, 0, table_size);
+
+    void *pt_start = cur_pos;
+    for (i = 0; i < pt_entries; ++i, ++cur_pos) {
+        *cur_pos = ((u64)i << 12) | 3;
+    }
+
+    cur_pos = (u64 *)ALIGN((u32)cur_pos, PAGE_SIZE);
+    void *pdt_start = cur_pos;
+
+    for (i = 0; i < pdt_entries; ++i, ++cur_pos) {
+        *cur_pos = ((u32)pt_start + (i << 12)) | 3;
+    }
+
+    cur_pos = (u64 *)ALIGN((u32)cur_pos, PAGE_SIZE);
+    void *pdpt_start = cur_pos;
+
+    for (i = 0; i < pdpt_entries; ++i, ++cur_pos) {
+        *cur_pos = ((u32)pdt_start + (i << 12)) | 3;
+    }
+
+    cur_pos = (u64 *)ALIGN((u32)cur_pos, PAGE_SIZE);
+    void *pml4_start = cur_pos;
+
+    for (i = 0; i < pml4_entries; ++i, ++cur_pos) {
+        *cur_pos = ((u32)pdpt_start + (i << 12)) | 3;
+    }
+
+    return pml4_start;
+}
diff --git a/src/stacks.h b/src/stacks.h
index a3b031c..d29aa88 100644
--- a/src/stacks.h
+++ b/src/stacks.h
@@ -40,6 +40,8 @@ void finish_preempt(void);
 int wait_preempt(void);
 void check_preempt(void);
 u32 call32_params(void *func, u32 eax, u32 edx, u32 ecx, u32 errret);
+u32 call64(void *page_table, void *func, u32 eax);
+void *gen_identity_page_table(u64 max_addr);
// Inline functions
diff --git a/src/x86.h b/src/x86.h
index 234a6e2..8da4d74 100644
--- a/src/x86.h
+++ b/src/x86.h
@@ -218,6 +218,7 @@ static inline u8 readb(const void *addr) {
 // GDT bits
 #define GDT_CODE     (0x9bULL << 40) // Code segment - P,R,A bits also set
 #define GDT_DATA     (0x93ULL << 40) // Data segment - W,A bits also set
+#define GDT_L        (0x1ULL << 53)  // Long flag
 #define GDT_B        (0x1ULL << 54)  // Big flag
 #define GDT_G        (0x1ULL << 55)  // Granularity flag
 // GDT bits for segment base
Hi,
+/****************************************************************
+ * Page table
+ ****************************************************************/
+void *gen_identity_page_table(u64 max_addr)
+{
+    /* Map directly all the addresses */
+    u32 pt_entries = (max_addr + 0xFFF) >> 12;
+    u32 pdt_entries = (pt_entries + 0x1FF) >> 9;
+    u32 pdpt_entries = (pdt_entries + 0x1FF) >> 9;
+    u32 pml4_entries = (pdpt_entries + 0x1FF) >> 9;
You can use hugepages. Then you'll need _a lot_ less memory for the pagetables, and the copying probably goes faster too.
You might consider mapping the nvdimm below 4G instead of entering long mode.
cheers, Gerd
On Mon, 21 Sep 2015 15:57:56 +0200 Gerd Hoffmann <kraxel@redhat.com> wrote:
Hi,
+/****************************************************************
+ * Page table
+ ****************************************************************/
+void *gen_identity_page_table(u64 max_addr)
+{
+    /* Map directly all the addresses */
+    u32 pt_entries = (max_addr + 0xFFF) >> 12;
+    u32 pdt_entries = (pt_entries + 0x1FF) >> 9;
+    u32 pdpt_entries = (pdt_entries + 0x1FF) >> 9;
+    u32 pml4_entries = (pdpt_entries + 0x1FF) >> 9;
You can use hugepages. Then you'll need _a lot_ less memory for the pagetables, and the copying probably goes faster too.
True. I tried with 2M pages. The memory used went down from 8M to roughly 24K, and the time for the copying went down by 4ms (from 15ms to 11ms). The other option is 1GB pages. I'll test later whether they're enabled in QEMU CPUs.
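For reference, a 2MB-page variant of the generator could look roughly like the following untested sketch. It reuses memalign_tmp()/PAGE_SIZE/memset from this series; the function name gen_identity_page_table_2m is invented:

void *gen_identity_page_table_2m(u64 max_addr)
{
    /* One PD entry maps 2MB, one PD page maps 1GB, one PDPT page maps 512GB */
    u32 pd_entries = (max_addr + 0x1FFFFF) >> 21;
    u32 pdpt_entries = (pd_entries + 0x1FF) >> 9;

    if (pdpt_entries > 512) {
        dprintf(1, "Page table too big\n");
        return NULL;
    }

    /* One page for the PML4, one for the PDPT, pdpt_entries PD pages */
    u32 table_size = (2 + pdpt_entries) * PAGE_SIZE;
    u64 *table = memalign_tmp(PAGE_SIZE, table_size);
    if (!table)
        return NULL;
    memset(table, 0, table_size);

    u64 *pml4 = table, *pdpt = table + 512, *pd = table + 1024;
    u32 i;

    pml4[0] = (u32)pdpt | 3;                        /* present + writable */
    for (i = 0; i < pdpt_entries; ++i)
        pdpt[i] = ((u32)pd + (i << 12)) | 3;
    for (i = 0; i < pd_entries; ++i)
        pd[i] = ((u64)i << 21) | 0x83;              /* P + RW + PS (2MB page) */

    return pml4;
}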
You might consider mapping the nvdimm below 4G instead of entering long mode.
The QEMU NVDIMM driver maps it directly above 4G to avoid the trouble of looking for empty spaces in the below 4G section. I didn't want to rework the driver or add more complexity to it.
If you were talking about just enabling paging and mapping the NVDIMM below 4G, that would only work for a few gigs above 4G (I don't remember the exact number). So it's better to map it all.
And SeaBIOS may need 64 bit support for other things in the future, so this might be useful later.
Thanks Marc
On 21/09/2015 16:38, Marc Marí wrote:
True. I tried with 2M pages. The memory used went down from 8M to roughly 24K, and the time for the copying went down by 4ms (from 15ms to 11ms). The other option is 1GB pages. I'll test later whether they're enabled in QEMU CPUs.
Only in some AMD CPUs (plus "-cpu host" if supported by your machine) because Intel only makes 1GB pages available on Xeons. But you can just check the guest CPUID inside SeaBIOS, and test on a suitable machine.
Paolo
On Mon, 21 Sep 2015 17:00:51 +0200 Paolo Bonzini <pbonzini@redhat.com> wrote:
On 21/09/2015 16:38, Marc Marí wrote:
True. I tried with 2M pages. The memory used went down from 8M to roughly 24K, and the time for the copying went down by 4ms (from 15ms to 11ms). The other option is 1GB pages. I'll test later whether they're enabled in QEMU CPUs.
Only in some AMD CPUs (plus "-cpu host" if supported by your machine) because Intel only makes 1GB pages available on Xeons. But you can just check the guest CPUID inside SeaBIOS, and test on a suitable machine.
And I forgot to mention that CPUID checking for long mode support is also missing. So I'll need to check the CPUID anyway.
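Something along these lines should cover both checks; a rough, untested sketch using the cpuid() helper from src/x86.h (bit positions as documented for CPUID leaf 0x80000001):

// Returns nonzero if long mode is available; also reports 1GB page support.
static int check_long_mode(int *has_1g_pages)
{
    u32 eax, ebx, ecx, edx;
    cpuid(0x80000000, &eax, &ebx, &ecx, &edx);
    if (eax < 0x80000001)
        return 0;                        // no extended leaves, no long mode
    cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
    *has_1g_pages = !!(edx & (1 << 26)); // PDPE1GB
    return !!(edx & (1 << 29));          // LM
}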
Thanks Marc
On Mon, Sep 21, 2015 at 04:38:00PM +0200, Marc Marí wrote:
On Mon, 21 Sep 2015 15:57:56 +0200 Gerd Hoffmann <kraxel@redhat.com> wrote:
+/****************************************************************
+ * Page table
+ ****************************************************************/
+void *gen_identity_page_table(u64 max_addr)
+{
+    /* Map directly all the addresses */
+    u32 pt_entries = (max_addr + 0xFFF) >> 12;
+    u32 pdt_entries = (pt_entries + 0x1FF) >> 9;
+    u32 pdpt_entries = (pdt_entries + 0x1FF) >> 9;
+    u32 pml4_entries = (pdpt_entries + 0x1FF) >> 9;
You can use hugepages. Then you'll need _a lot_ less memory for the pagetables, and the copying probably goes faster too.
True. I tried with 2M pages. The memory used went down from 8M to roughly 24K, and the time for the copying went down by 4ms (from 15ms to 11ms). The other option is 1GB pages. I'll test later whether they're enabled in QEMU CPUs.
You might consider mapping the nvdimm below 4G instead of entering long mode.
The QEMU NVDIMM driver maps it directly above 4G to avoid the trouble of looking for empty spaces in the below 4G section. I didn't want to rework the driver or add more complexity to it.
If you were talking about just enabling paging and mapping the NVDIMM below 4G, that would only work for a few gigs above 4G (I don't remember the exact number). So it's better to map it all.
According to my Intel docs, PAE supports the same 52bit physical address range that x86_64 supports.
Switching to/from PAE looks like it would be notably simpler (it's just four 4KB page directories loaded with 2MB pages) and I think it may be possible to do the switch entirely in C code. But it does require the pain of "bank switching" to access the high memory.
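For illustration, the whole PAE structure fits in a few static arrays; a rough, untested sketch (all names invented; note that in PAE the PDPTEs only take the present bit, since RW/US are reserved there):

// Identity-map the low 4GB with PAE: a 4-entry PDPT pointing at four
// page directories, each PDE a 2MB page.
static u64 pae_pdpt[4] __aligned(32);
static u64 pae_pd[4][512] __aligned(4096);

static void pae_identity_setup(void)
{
    int i, j;
    for (i = 0; i < 4; i++) {
        pae_pdpt[i] = (u32)pae_pd[i] | 1;                     // present
        for (j = 0; j < 512; j++)
            pae_pd[i][j] = ((u64)(i * 512 + j) << 21) | 0x83; // P + RW + PS
    }
    // Enabling would then be: set CR4.PAE, load CR3 with &pae_pdpt, set CR0.PG.
}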
And SeaBIOS may need 64 bit support for other things in the future, so this might be useful later.
If that's the case, those things will only work on 64bit CPUs, and SeaBIOS currently supports machines as old as i386. (BTW, your patch should check the cpuid stuff before jumping to 64bit mode.)
Also, your code seems to run regular 32bit code when in "long mode" - is that valid?
-Kevin
On Mon, 21 Sep 2015 12:03:29 -0400 "Kevin O'Connor" <kevin@koconnor.net> wrote:
On Mon, Sep 21, 2015 at 04:38:00PM +0200, Marc Marí wrote:
On Mon, 21 Sep 2015 15:57:56 +0200 Gerd Hoffmann <kraxel@redhat.com> wrote:
+/****************************************************************
+ * Page table
+ ****************************************************************/
+void *gen_identity_page_table(u64 max_addr)
+{
+    /* Map directly all the addresses */
+    u32 pt_entries = (max_addr + 0xFFF) >> 12;
+    u32 pdt_entries = (pt_entries + 0x1FF) >> 9;
+    u32 pdpt_entries = (pdt_entries + 0x1FF) >> 9;
+    u32 pml4_entries = (pdpt_entries + 0x1FF) >> 9;
You can use hugepages. Then you'll need _a lot_ less memory for the pagetables, and the copying probably goes faster too.
True. I tried with 2M pages. The memory used went down from 8M to roughly 24K, and the time for the copying went down by 4ms (from 15ms to 11ms). The other option is 1GB pages. I'll test later whether they're enabled in QEMU CPUs.
You might consider mapping the nvdimm below 4G instead of entering long mode.
The QEMU NVDIMM driver maps it directly above 4G to avoid the trouble of looking for empty spaces in the below 4G section. I didn't want to rework the driver or add more complexity to it.
If you were talking about just enabling paging and mapping the NVDIMM below 4G, that would only work for a few gigs above 4G (I don't remember the exact number). So it's better to map it all.
According to my Intel docs, PAE supports the same 52bit physical address range that x86_64 supports.
Ok, maybe I messed up. I'll check again.
Switching to/from PAE looks like it would be notably simpler (it's just four 4KB page directories loaded with 2MB pages) and I think it may be possible to do the switch entirely in C code. But it does require the pain of "bank switching" to access the high memory.
And SeaBIOS may need 64 bit support for other things in the future, so this might be useful later.
If that's the case, those things will only work on 64bit CPUs, and SeaBIOS currently supports machines as old as i386. (BTW, your patch should check the cpuid stuff before jumping to 64bit mode.)
Yes, it's one of the things missing, although I forgot to write it in the cover letter.
Also, your code seems to run regular 32bit code when in "long mode" - is that valid?
It doesn't crash. And I think it's valid. When disassembling, the 64 bit operations have the prefix 0x48, the same as in a "normal" 64 bit application.
Thanks Marc
On Mon, Sep 21, 2015 at 06:23:54PM +0200, Marc Marí wrote:
On Mon, 21 Sep 2015 12:03:29 -0400 "Kevin O'Connor" <kevin@koconnor.net> wrote:
Also, your code seems to run regular 32bit code when in "long mode" - is that valid?
It doesn't crash. And I think it's valid. When disassembling, the 64 bit operations have the prefix 0x48, the same as in a "normal" 64 bit application.
It looks like "push" and "pop" have different semantics in "long mode". It looks like they decrement the stack by 8 bytes even for 32bit operands. There may be other instructions as well.
So, I don't think we'd want to run gcc-generated 32bit code while in 64bit mode.
-Kevin
On Mon, Sep 21, 2015 at 12:03:29PM -0400, Kevin O'Connor wrote:
According to my Intel docs, PAE supports the same 52bit physical address range that x86_64 supports.
Thinking about this further, I could see changing SeaBIOS' 32bit code to always use PAE mode (when available). Doing so would enable support for PCI devices mapped above 4G (there is already a USB exception in pciinit.c because of this). It would mean implementing ioremap and teaching malloc and the drivers to use it, but that seems doable. A simple PAE ioremap implementation could use 2MB pages exclusively, which would reduce complexity.
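A loose sketch of that idea, with a fixed 2MB window at the top of the address space (the window address, pd_top, and the one-mapping-at-a-time limitation are all assumptions for illustration):

// Map a physical region at a fixed virtual window by rewriting one PDE
// of the PAE page directory covering 0xC0000000-0xFFFFFFFF, then
// flushing the stale TLB entry.
#define IOREMAP_WINDOW 0xFFE00000u  // last 2MB slot of the address space

static u64 *pd_top;                 // PD for the top 1GB, set up at POST

static void *ioremap_2m(u64 phys)
{
    // P + RW + PCD (uncached, for MMIO) + PS (2MB page)
    pd_top[511] = (phys & ~0x1FFFFFULL) | 0x93;
    asm volatile("invlpg (%0)" : : "r" (IOREMAP_WINDOW) : "memory");
    return (void*)(IOREMAP_WINDOW + (u32)(phys & 0x1FFFFF));
}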
-Kevin
Detect NVDIMMs, check for a Linux kernel, and copy the code to low memory so the kernel can be booted.
Signed-off-by: Marc Marí <markmb@redhat.com>
---
 src/boot.c      |  18 +++++++++
 src/hw/nvdimm.c | 111 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 src/hw/nvdimm.h |   1 +
 src/romlayout.S |  20 ++++++++++
 src/util.h      |   1 +
 5 files changed, 149 insertions(+), 2 deletions(-)
diff --git a/src/boot.c b/src/boot.c
index e0f73a3..3e4dc93 100644
--- a/src/boot.c
+++ b/src/boot.c
@@ -20,6 +20,7 @@
 #include "string.h" // memset
 #include "util.h" // irqtimer_calc
 #include "tcgbios.h" // tpm_*
+#include "hw/nvdimm.h"
 /****************************************************************
@@ -304,6 +305,7 @@ static struct hlist_head BootList VARVERIFY32INIT;
 #define IPL_TYPE_HARDDISK  0x02
 #define IPL_TYPE_CDROM     0x03
 #define IPL_TYPE_CBFS      0x20
+#define IPL_TYPE_NVDIMM    0x21
 #define IPL_TYPE_BEV       0x80
 #define IPL_TYPE_BCV       0x81
 #define IPL_TYPE_HALT      0xf0
@@ -398,6 +400,12 @@ boot_add_cbfs(void *data, const char *desc, int prio)
     bootentry_add(IPL_TYPE_CBFS, defPrio(prio, DEFAULT_PRIO), (u32)data, desc);
 }
+void
+boot_add_nvdimm(void *data, const char *desc, int prio)
+{
+    bootentry_add(IPL_TYPE_NVDIMM, defPrio(prio, DEFAULT_PRIO), (u32)data, desc);
+}
+
 /****************************************************************
  * Keyboard calls
@@ -674,6 +682,13 @@ boot_cbfs(struct cbfs_file *file)
     cbfs_run_payload(file);
 }
+static void
+boot_nvdimm(struct nvdimm_addr* NvdimmAddr)
+{
+    printf("Booting from NVDIMM...\n");
+    nvdimm_boot(NvdimmAddr);
+}
+
 // Boot from a BEV entry on an optionrom.
 static void
 boot_rom(u32 vector)
@@ -731,6 +746,9 @@ do_boot(int seq_nr)
     case IPL_TYPE_CBFS:
         boot_cbfs((void*)ie->vector);
         break;
+    case IPL_TYPE_NVDIMM:
+        boot_nvdimm((void *)ie->vector);
+        break;
     case IPL_TYPE_BEV:
         boot_rom(ie->vector);
         break;
diff --git a/src/hw/nvdimm.c b/src/hw/nvdimm.c
index f7c91a1..6022ce0 100644
--- a/src/hw/nvdimm.c
+++ b/src/hw/nvdimm.c
@@ -7,8 +7,65 @@
 #include "std/acpi.h"
 #include "util.h"
 #include "output.h"
-#include "memmap.h"
+#include "stacks.h"
+#include "x86.h"
+#include "string.h"
+#include "bregs.h"
+#include "farptr.h"
 #include "malloc.h"
+#include "nvdimm.h"
+
+void *page_table;
+
+static u32 nvdimm_check(struct nvdimm_addr *NvdimmAddr)
+{
+    u32 eax;
+
+    // Registers are 32 bits. Pass through stack
+    asm volatile(
+        ".code64\n"
+        "movq %1, %%rdx\n"
+        "movq 0x202(%%rdx), %%rax\n"
+        "subq $0x53726448, %%rax\n" // Check HdrS signature
+        ".code32\n"
+        : "=a"(eax)
+        : "m"(NvdimmAddr->addr)
+        : "edx");
+
+    if (!eax) {
+        return 1;
+    } else {
+        return 0;
+    }
+}
+
+static void nvdimm_copy(struct nvdimm_addr *NvdimmAddr)
+{
+    u32 real_addr = 0x10000, prot_addr = 0x100000;
+
+    asm volatile(
+        ".code64\n"
+        "movq %0, %%rdx\n"
+        "xorq %%rbx, %%rbx\n"
+        "movb 0x1f1(%%rdx), %%bl\n"
+        "addq $1, %%rbx\n"
+        "shlq $9, %%rbx\n"
+        "movq %%rbx, %%rcx\n"       // Setup size
+        "movq %%rdx, %%rsi\n"       // Address from
+        "movq %2, %%rdi\n"          // Address to
+        "rep movsb\n"               // Copy setup section to "real_addr"
+        "movq %1, %%rcx\n"
+        "subq %%rbx, %%rcx\n"       // Kernel size
+        "movq %%rdx, %%rsi\n"
+        "addq %%rbx, %%rsi\n"       // Address from
+        "movq %3, %%rdi\n"          // Address to
+        "rep movsb\n"               // Copy rest of the kernel to "prot_addr"
+        ".code32\n"
+        :
+        : "m"(NvdimmAddr->addr), "g"(NvdimmAddr->length),
+          "g"(real_addr), "g"(prot_addr)
+        : "ebx", "ecx", "edx", "edi", "esi", "memory");
+}
 void nvdimm_setup(void)
 {
@@ -17,5 +74,55 @@ void nvdimm_setup(void)
         return;
     }
-    dprintf(1, "NVDIMMs found\n");
+    u64 top_addr = 0x100000000ULL;
+    struct nvdimm_addr *NvdimmAddr = nfit_get_pmem_addr();
+
+    int i = 0;
+    while (NvdimmAddr[i].addr != 0) {
+        if (NvdimmAddr[i].addr + NvdimmAddr[i].length > top_addr) {
+            top_addr = NvdimmAddr[i].addr + NvdimmAddr[i].length;
+        }
+
+        ++i;
+    }
+
+    page_table = gen_identity_page_table(top_addr);
+
+    i = 0;
+    while (NvdimmAddr[i].addr != 0) {
+        if (NvdimmAddr[i].length > 0x300) {
+            if (call64(page_table, (void *)nvdimm_check, (u32)&NvdimmAddr[i])) {
+                boot_add_nvdimm(&NvdimmAddr[i], "NVDIMM", 0);
+            }
+        }
+        ++i;
+    }
+}
+
+void nvdimm_boot(struct nvdimm_addr *NvdimmAddr)
+{
+    dprintf(1, "Loading kernel from NVDIMM\n");
+
+    u32 real_addr = 0x10000, cmdline_addr = 0x20000;
+
+    call64(page_table, (void *)nvdimm_copy, (u32)NvdimmAddr);
+
+    writel((void *)cmdline_addr, 0);
+
+    // Last configurations
+    writeb((void *)real_addr + 0x210, 0xB0);
+    writeb((void *)real_addr + 0x211, readb((void *)real_addr + 0x211) | 0x80);
+    writel((void *)real_addr + 0x218, 0);
+    writel((void *)real_addr + 0x21c, 0);
+    writew((void *)real_addr + 0x224, cmdline_addr - real_addr - 0x200);
+    writel((void *)real_addr + 0x228, cmdline_addr);
+
+    struct bregs br;
+    memset(&br, 0, sizeof(br));
+    extern void kernel_stub(void);
+    br.ebx = real_addr >> 4;
+    br.edx = cmdline_addr - real_addr - 16;
+    br.code = SEGOFF(SEG_BIOS, (u32)kernel_stub - BUILD_BIOS_ADDR);
+
+    farcall16big(&br);
 }
diff --git a/src/hw/nvdimm.h b/src/hw/nvdimm.h
index 1591b97..385c1e4 100644
--- a/src/hw/nvdimm.h
+++ b/src/hw/nvdimm.h
@@ -7,5 +7,6 @@ struct nvdimm_addr {
 };
 void nvdimm_setup(void);
+void nvdimm_boot(struct nvdimm_addr *NvdimmAddr);
 #endif
diff --git a/src/romlayout.S b/src/romlayout.S
index 4cb9f4c..305c7c3 100644
--- a/src/romlayout.S
+++ b/src/romlayout.S
@@ -285,6 +285,26 @@ __farcall16:
         IRQ_TRAMPOLINE 1c
         IRQ_TRAMPOLINE 4a
+        DECLFUNC kernel_stub
+kernel_stub:
+        movw %bx, %ds
+        movw %bx, %es
+        movw %bx, %fs
+        movw %bx, %gs
+        movw %bx, %ss
+        movl %edx, %esp
+        addw $0x20, %bx
+        pushw %bx               // push CS
+        pushw $0                // push IP
+        xorl %eax, %eax
+        xorl %ebx, %ebx
+        xorl %ecx, %ecx
+        xorl %edx, %edx
+        xorl %edi, %edi
+        xorl %esi, %esi
+        xorl %ebp, %ebp
+        lretw
+
 /****************************************************************
  * Misc. entry points.
diff --git a/src/util.h b/src/util.h
index 84478f8..524b724 100644
--- a/src/util.h
+++ b/src/util.h
@@ -25,6 +25,7 @@ void boot_add_floppy(struct drive_s *drive_g, const char *desc, int prio);
 void boot_add_hd(struct drive_s *drive_g, const char *desc, int prio);
 void boot_add_cd(struct drive_s *drive_g, const char *desc, int prio);
 void boot_add_cbfs(void *data, const char *desc, int prio);
+void boot_add_nvdimm(void *data, const char *desc, int prio);
 void interactive_bootmenu(void);
 void bcv_prepboot(void);
 struct pci_device;
On 21/09/2015 15:14, Marc Marí wrote:
This patch series is not ready for merging. There are things missing and questions to be answered:
- Is it necessary to retrieve any other data from the NVDIMM?
- Is there any other nicer (and faster) option for the page table?
- Make NVDIMM a compile-time option, as the other devices.
- Add some kind of wrapper to the boot files instead of a direct vmlinux.
- Add NVDIMM boot priority support.
- Add support for initrd and cmdline. Maybe in another NVDIMM, or in the same, or through fw_cfg.
Perhaps store a CBFS image in the NVDIMM? That could include the kernel, initrd and cmdline.
Paolo
- Reorder code to change to long mode fewer times.

Any other proposals for improvements are appreciated.
On Mon, 21 Sep 2015 16:58:31 +0200 Paolo Bonzini <pbonzini@redhat.com> wrote:
On 21/09/2015 15:14, Marc Marí wrote:
This patch series is not ready for merging. There are things missing and questions to be answered:
- Is it necessary to retrieve any other data from the NVDIMM?
- Is there any other nicer (and faster) option for the page table?
- Make NVDIMM a compile-time option, as the other devices.
- Add some kind of wrapper to the boot files instead of a direct vmlinux.
- Add NVDIMM boot priority support.
- Add support for initrd and cmdline. Maybe in another NVDIMM, or in the same, or through fw_cfg.
Perhaps store a CBFS image in the NVDIMM? That could include the kernel, initrd and cmdline.
And that links with: "Add some kind of wrapper to the boot files instead of a direct vmlinux". CBFS is an option. I don't know if there are many other options to choose, but I'll check anyway.
Thanks Marc
Paolo
- Reorder code to change to long mode fewer times.

Any other proposals for improvements are appreciated.
On 09/21/2015 04:58 PM, Paolo Bonzini wrote:
On 21/09/2015 15:14, Marc Marí wrote:
This patch series is not ready for merging. There are things missing and questions to be answered:
- Is it necessary to retrieve any other data from the NVDIMM?
- Is there any other nicer (and faster) option for the page table?
- Make NVDIMM a compile-time option, as the other devices.
- Add some kind of wrapper to the boot files instead of a direct vmlinux.
- Add NVDIMM boot priority support.
- Add support for initrd and cmdline. Maybe in another NVDIMM, or in the same, or through fw_cfg.
Perhaps store a CBFS image in the NVDIMM? That could include the kernel, initrd and cmdline.
Any idea how the standards body envisioned this? Surely there's something in there...
Matthew?
Cheers,
Hannes
On Mon, 21 Sep 2015 22:36:43 +0200 Hannes Reinecke <hare@suse.de> wrote:
On 09/21/2015 04:58 PM, Paolo Bonzini wrote:
On 21/09/2015 15:14, Marc Marí wrote:
This patch series is not ready for merging. There are things missing and questions to be answered:
- Is it necessary to retrieve any other data from the NVDIMM?
- Is there any other nicer (and faster) option for the page table?
- Make NVDIMM a compile-time option, as the other devices.
- Add some kind of wrapper to the boot files instead of a direct vmlinux.
- Add NVDIMM boot priority support.
- Add support for initrd and cmdline. Maybe in another NVDIMM, or in the same, or through fw_cfg.
Perhaps store a CBFS image in the NVDIMM? That could include the kernel, initrd and cmdline.
Any idea how the standards body envisioned this? Surely there's something in there...
Matthew?
Cheers,
Hannes
As far as I know, the booting support is planned just for block mode. And it will probably be designed as booting from an SSD or any other block device.
And, if I remember correctly, no NVDIMM/NFIT specification mentions anything about booting.
Thanks Marc
On Mon, Sep 21, 2015 at 03:14:04PM +0200, Marc Marí wrote:
Implement support for booting from a PMEM region compatible with the NVDIMM and NFIT specifications.
The current implementation is still slow. When running a Linux guest on top of QEMU on x86:
Using the current -kernel option:
  QEMU startup time:  .080
  BIOS startup time:  .060
  Kernel setup time:  .590
  Total time:         .730

Using the -kernel option and the fw_cfg DMA patch:
  QEMU startup time:  .080
  BIOS startup time:  .039
  Kernel setup time:  .002
  Total time:         .121

Using this patch series and adding the vmlinuz as an NVDIMM device:
  QEMU startup time:  .080
  BIOS startup time:  .065
  Kernel setup time:  .015
  Total time:         .160
Thanks Marc - very interesting!
Do you think NVDIMM support will ultimately be faster than DMA or are there other advantages to using NVDIMMs?
-Kevin
On Mon, 21 Sep 2015 12:17:53 -0400 "Kevin O'Connor" <kevin@koconnor.net> wrote:
On Mon, Sep 21, 2015 at 03:14:04PM +0200, Marc Marí wrote:
Implement support for booting from a PMEM region compatible with the NVDIMM and NFIT specifications.
The current implementation is still slow. When running a Linux guest on top of QEMU on x86:
Using the current -kernel option:
  QEMU startup time:  .080
  BIOS startup time:  .060
  Kernel setup time:  .590
  Total time:         .730

Using the -kernel option and the fw_cfg DMA patch:
  QEMU startup time:  .080
  BIOS startup time:  .039
  Kernel setup time:  .002
  Total time:         .121

Using this patch series and adding the vmlinuz as an NVDIMM device:
  QEMU startup time:  .080
  BIOS startup time:  .065
  Kernel setup time:  .015
  Total time:         .160
Thanks Marc - very interesting!
Do you think NVDIMM support will ultimately be faster than DMA or are there other advantages to using NVDIMMs?
I've been experimenting with different approaches to improve speed, as you've already seen. I still don't know which will be faster.
The NVDIMM code is missing some improvements. I think it will be around the same speed as fw_cfg DMA. Unless I missed a big improvement.
But the advantage of NVDIMMs is flexibility. It is just a memory region. So you can put there anything that you want to access fast. It may be a vmlinuz, an initramfs, a normal file, or whatever. Maybe combining both fw_cfg DMA and NVDIMM can give good times for booting a minimal system up to the userspace.
At the moment, this NVDIMM code is just experimental and for testing to see its performance. But I tend to focus on the tree I have in front of me instead of the forest, so it's better if I share this code and get a few comments on it before drawing conclusions.
(When I say NVDIMM I should say NVDIMM PMEM or DAX. But as there's still no block mode in QEMU, it doesn't really matter).
Thanks Marc
Hi,
Do you think NVDIMM support will ultimately be faster than DMA or are there other advantages to using NVDIMMs?
Guest can write to nvdimm. So it can update the kernel & reboot into the new kernel without help from the host.
cheers, Gerd