On Thu, Jul 12, 2012 at 07:48:04PM +0000, Blue Swirl wrote:
On Wed, Jul 11, 2012 at 10:31 AM, Vasilis Liaskovitis <vasilis.liaskovitis@profitbricks.com> wrote:
The numa_fw_cfg paravirt interface is extended to include SRAT information for all hotpluggable dimms. There are 3 words for each hotpluggable memory slot, denoting start address, size and node proximity. The new info is appended after the existing NUMA info, so that the existing fw_cfg layout does not break. This information is used by SeaBIOS to build hotplug memory device objects at runtime. nb_numa_nodes is set to 1 by default (not 0), so that we always pass SRAT info to SeaBIOS.
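For illustration, the three words per slot can be pictured as the following C struct (a sketch only: the patch stores raw little-endian 64-bit words in fw_cfg, and this struct name is made up):

    #include <stdint.h>

    /* Hypothetical view of one per-dimm triplet in the fw_cfg blob. */
    struct hp_dimm_entry {
        uint64_t start; /* guest-physical start address of the dimm */
        uint64_t size;  /* dimm size in bytes */
        uint64_t pxm;   /* NUMA node proximity of the dimm */
    };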
v1->v2:
Dimm SRAT info (#dimms) is appended at the end of the existing numa fw_cfg in
order not to break the existing layout. Documentation of the new fw_cfg layout
is included in docs/specs/fwcfg.txt.
Signed-off-by: Vasilis Liaskovitis <vasilis.liaskovitis@profitbricks.com>
 docs/specs/fwcfg.txt |   28 ++++++++++++++++++++++++++
 hw/pc.c              |   53 ++++++++++++++++++++++++++++++++++++++++++++++++-
 vl.c                 |    2 +-
 3 files changed, 80 insertions(+), 3 deletions(-)
 create mode 100644 docs/specs/fwcfg.txt
diff --git a/docs/specs/fwcfg.txt b/docs/specs/fwcfg.txt
new file mode 100644
index 0000000..e6fcd8f
--- /dev/null
+++ b/docs/specs/fwcfg.txt
@@ -0,0 +1,28 @@
+QEMU<->BIOS Paravirt Documentation
+----------------------------------
+This document describes paravirt data structures passed from QEMU to BIOS.
+fw_cfg SRAT paravirt info
+-------------------------
+The SRAT info passed from QEMU to BIOS has the following layout:
+------------------------------------------------------------------------------------------------
+#nodes | cpu0_pxm | cpu1_pxm | ... | cpulast_pxm | node0_mem | node1_mem | ... | nodelast_mem
+------------------------------------------------------------------------------------------------
+#dimms | dimm0_start | dimm0_sz | dimm0_pxm | ... | dimmlast_start | dimmlast_sz | dimmlast_pxm
+Entry 0 contains the number of NUMA nodes (nb_numa_nodes).
+Entries 1..max_cpus: the next max_cpus entries describe the node proximity
+for each one of the vCPUs in the system.
+Entries max_cpus+1..max_cpus+nb_numa_nodes: the next nb_numa_nodes entries
+describe the memory size for each one of the NUMA nodes in the system.
+Entry max_cpus+nb_numa_nodes+1 contains the number of memory dimms
+(nb_hp_dimms).
+The last 3 * nb_hp_dimms entries are organized in triplets: each triplet
+contains the physical address offset, size (in bytes), and node proximity
+for the respective dimm.
The size and endianness are not specified; you are using LE 64-bit values for each item.
Thanks, I'll update.
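To make the layout above concrete, here is a minimal consumer-side sketch of walking the blob in order (not part of the patch: fw_cfg_read_le64() and parse_numa_fw_cfg() are hypothetical names, and a real reader would store the values somewhere):

    #include <stdint.h>

    uint64_t fw_cfg_read_le64(void); /* hypothetical: next LE64 word of FW_CFG_NUMA */

    static void parse_numa_fw_cfg(uint64_t max_cpus)
    {
        uint64_t i, nb_nodes, nb_dimms;

        nb_nodes = fw_cfg_read_le64();           /* entry 0: #nodes */
        for (i = 0; i < max_cpus; i++) {
            (void)fw_cfg_read_le64();            /* entries 1..max_cpus: vCPU proximity */
        }
        for (i = 0; i < nb_nodes; i++) {
            (void)fw_cfg_read_le64();            /* per-node memory size */
        }
        nb_dimms = fw_cfg_read_le64();           /* #dimms */
        for (i = 0; i < nb_dimms; i++) {
            uint64_t start = fw_cfg_read_le64(); /* dimm start address */
            uint64_t size  = fw_cfg_read_le64(); /* dimm size in bytes */
            uint64_t pxm   = fw_cfg_read_le64(); /* dimm node proximity */
            (void)start; (void)size; (void)pxm;
        }
    }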
diff --git a/hw/pc.c b/hw/pc.c
index ef9901a..cf651d0 100644
--- a/hw/pc.c
+++ b/hw/pc.c
@@ -598,12 +598,15 @@ int e820_add_entry(uint64_t address, uint64_t length, uint32_t type)
     return index;
 }
+static void setup_hp_dimms(uint64_t *fw_cfg_slots);
+
 static void *bochs_bios_init(void)
 {
     void *fw_cfg;
     uint8_t *smbios_table;
     size_t smbios_len;
     uint64_t *numa_fw_cfg;
+    uint64_t *hp_dimms_fw_cfg;
     int i, j;

     register_ioport_write(0x400, 1, 2, bochs_bios_write, NULL);
@@ -638,8 +641,10 @@ static void *bochs_bios_init(void)
     /* allocate memory for the NUMA channel: one (64bit) word for the number
      * of nodes, one word for each VCPU->node and one word for each node to
      * hold the amount of memory.
+     * Finally one word for the number of hotplug memory slots and three words
+     * for each hotplug memory slot (start address, size and node proximity).
      */
-    numa_fw_cfg = g_malloc0((1 + max_cpus + nb_numa_nodes) * 8);
+    numa_fw_cfg = g_malloc0((2 + max_cpus + nb_numa_nodes + 3 * nb_hp_dimms) * 8);
     numa_fw_cfg[0] = cpu_to_le64(nb_numa_nodes);
     for (i = 0; i < max_cpus; i++) {
         for (j = 0; j < nb_numa_nodes; j++) {
@@ -652,8 +657,15 @@ static void *bochs_bios_init(void)
     for (i = 0; i < nb_numa_nodes; i++) {
         numa_fw_cfg[max_cpus + 1 + i] = cpu_to_le64(node_mem[i]);
     }
+    numa_fw_cfg[1 + max_cpus + nb_numa_nodes] = cpu_to_le64(nb_hp_dimms);
+    hp_dimms_fw_cfg = numa_fw_cfg + 2 + max_cpus + nb_numa_nodes;
+    if (nb_hp_dimms)
+        setup_hp_dimms(hp_dimms_fw_cfg);
Braces.
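As a quick sizing check (numbers chosen only for illustration): with max_cpus = 4, nb_numa_nodes = 2 and nb_hp_dimms = 2, the new allocation is (2 + 4 + 2 + 3 * 2) * 8 = 112 bytes: one word for the node count, four vCPU proximity words, two node-size words, one word for the dimm count, and six words for the two dimm triplets.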
-    fw_cfg_add_bytes(fw_cfg, FW_CFG_NUMA, (uint8_t *)numa_fw_cfg,
-                     (1 + max_cpus + nb_numa_nodes) * 8);
+    fw_cfg_add_bytes(fw_cfg, FW_CFG_NUMA, (uint8_t *)numa_fw_cfg,
+                     (2 + max_cpus + nb_numa_nodes + 3 * nb_hp_dimms) * 8);
     return fw_cfg;
 }

@@ -1223,3 +1235,40 @@ target_phys_addr_t pc_set_hp_memory_offset(uint64_t size)
     return ret;
 }
+static void setup_hp_dimms(uint64_t *fw_cfg_slots)
+{
+    int i = 0;
+    Error *err = NULL;
+    DeviceState *dev;
+    DimmState *slot;
+    const char *type;
+    BusChild *kid;
+    BusState *bus = sysbus_get_default();
+
+    QTAILQ_FOREACH(kid, &bus->children, sibling) {
+        dev = kid->child;
+        type = object_property_get_str(OBJECT(dev), "type", &err);
+        if (err) {
+            error_free(err);
+            fprintf(stderr, "error getting device type\n");
+            exit(1);
+        }
+        if (!strcmp(type, "dimm")) {
+            if (!dev->id) {
+                fprintf(stderr, "error getting dimm device id\n");
+                exit(1);
+            }
+            slot = DIMM(dev);
+            /* determine starting physical address for this memory slot */
+            assert(slot->start);
+            fw_cfg_slots[3 * slot->idx] = cpu_to_le64(slot->start);
+            fw_cfg_slots[3 * slot->idx + 1] = cpu_to_le64(slot->size);
+            fw_cfg_slots[3 * slot->idx + 2] = cpu_to_le64(slot->node);
+            i++;
+        }
+    }
+    assert(i == nb_hp_dimms);
+}
diff --git a/vl.c b/vl.c
index 0ff8818..37c9798 100644
--- a/vl.c
+++ b/vl.c
@@ -2335,7 +2335,7 @@ int main(int argc, char **argv, char **envp)
         node_cpumask[i] = 0;
     }
-    nb_numa_nodes = 0;
+    nb_numa_nodes = 1;
     nb_nics = 0;
     autostart= 1;
--
1.7.9