Hi,
There were a number of requests about support of 64bit PCI BAR allocations.
Also we have observed the issue on guests with older linux version (2.6.18): if we have a 64bit BAR allocated within first 4GB, the OS may hang during start process. (I guess it is an OS bug, but we need to take care of this).
This patch addresses these two issues and allows 64bit BARs to be allocated in ranges above 4GB. Patch consists of three parts: 1. Add new range above 4GB in _CRS table to let Windows 2008 work properly. Thanks a lot to Michael S. Triskin for this brilliant idea. 2. Added new PCI_REGION_TYPE_PREFMEM_64 region type in pciinit and changed types of variables. 3. Take care about PCI devices with 64bit BARs on secondary buses.
Patches have been tested on several configurations which includes linux 2.6.18 - 3.0 & windows 2008. Everything works quite well.
Since Windows is using CRS table for PCI resource allocation, this patch allows Windows guests to work with PCI devices when PCI BAR allocation is above 4GB. This also might be helpful on Linux when use_crs kernel boot option is set.
Michael I've added you to signed-off if you don't mind.
Signed-off-by: Alexey Korolev alexey.korolev@endace.com Signed-off-by: Michael S. Tsirkin mst@redhat.com --- src/acpi-dsdt.dsl | 7 +++++ src/acpi-dsdt.hex | 66 +++++++++++++++++++++++++++++++++++++++++++++-------- src/config.h | 2 + 3 files changed, 65 insertions(+), 10 deletions(-)
diff --git a/src/acpi-dsdt.dsl b/src/acpi-dsdt.dsl index 7082b65..c17e947 100644 --- a/src/acpi-dsdt.dsl +++ b/src/acpi-dsdt.dsl @@ -175,6 +175,13 @@ DefinitionBlock ( 0x00000000, // Address Translation Offset 0x1EC00000, // Address Length ,, , AddressRangeMemory, TypeStatic) + QWordMemory (ResourceProducer, PosDecode, MinFixed, MaxFixed, Cacheable, ReadWrite, + 0x00000000, // Address Space Granularity + 0x8000000000, // Address Range Minimum + 0xFFFFFFFFFF, // Address Range Maximum + 0x00000000, // Address Translation Offset + 0x8000000000, // Address Length + ,, , AddressRangeMemory, TypeStatic) }) } } diff --git a/src/acpi-dsdt.hex b/src/acpi-dsdt.hex index 5dc7bb4..fe4c17e 100644 --- a/src/acpi-dsdt.hex +++ b/src/acpi-dsdt.hex @@ -3,12 +3,12 @@ static unsigned char AmlCode[] = { 0x53, 0x44, 0x54, -0xd3, -0x10, +0x1, +0x11, 0x0, 0x0, 0x1, -0x2d, +0xe, 0x42, 0x58, 0x50, @@ -110,16 +110,16 @@ static unsigned char AmlCode[] = { 0x47, 0x42, 0x10, -0x44, -0x81, +0x42, +0x84, 0x5f, 0x53, 0x42, 0x5f, 0x5b, 0x82, -0x4c, -0x80, +0x4a, +0x83, 0x50, 0x43, 0x49, @@ -2064,10 +2064,10 @@ static unsigned char AmlCode[] = { 0x52, 0x53, 0x11, -0x42, -0x7, +0x40, +0xa, 0xa, -0x6e, +0x9c, 0x88, 0xd, 0x0, @@ -2176,6 +2176,52 @@ static unsigned char AmlCode[] = { 0x0, 0xc0, 0x1e, +0x8a, +0x2b, +0x0, +0x0, +0xc, +0x3, +0x0, +0x0, +0x0, +0x0, +0x0, +0x0, +0x0, +0x0, +0x0, +0x0, +0x0, +0x0, +0x80, +0x0, +0x0, +0x0, +0xff, +0xff, +0xff, +0xff, +0xff, +0x0, +0x0, +0x0, +0x0, +0x0, +0x0, +0x0, +0x0, +0x0, +0x0, +0x0, +0x0, +0x0, +0x0, +0x0, +0x80, +0x0, +0x0, +0x0, 0x79, 0x0, 0x10, diff --git a/src/config.h b/src/config.h index b0187a4..236f2e9 100644 --- a/src/config.h +++ b/src/config.h @@ -47,6 +47,8 @@
#define BUILD_PCIMEM_START 0xe0000000 #define BUILD_PCIMEM_END 0xfec00000 /* IOAPIC is mapped at */ +#define BUILD_PCIMEM_64_START 0x8000000000LL +#define BUILD_PCIMEM_64_END 0x10000000000LL
#define BUILD_IOAPIC_ADDR 0xfec00000 #define BUILD_HPET_ADDRESS 0xfed00000
This patch adds PCI_REGION_TYPE_PREFMEM_64 region type and modifies types of variables to make it possible to work with 64 bit addresses.
Why I've added just one region type PCI_REGION_TYPE_PREFMEM_64 and haven't added PCI_REGION_TYPE_MEM_64? According to PCI architecture specification, the bridges can describe 64bit ranges for prefetchable type of memory only. So it's very unlikely that devices exporting 64bit non-prefetchable BARs. Anyway this code will work with 64bit non-prefetchable BARs unless the PCI device is not behind the secondary bus.
Signed-off-by: Alexey Korolev alexey.korolev@endace.com --- src/pci.h | 1 - src/pciinit.c | 59 +++++++++++++++++++++++++++++++++----------------------- 2 files changed, 35 insertions(+), 25 deletions(-)
diff --git a/src/pci.h b/src/pci.h index a2a5a4c..71f15fe 100644 --- a/src/pci.h +++ b/src/pci.h @@ -54,7 +54,6 @@ struct pci_device { struct { u32 addr; u32 size; - int is64; } bars[PCI_NUM_REGIONS];
// Local information on device. diff --git a/src/pciinit.c b/src/pciinit.c index 7d83368..a574e38 100644 --- a/src/pciinit.c +++ b/src/pciinit.c @@ -22,6 +22,7 @@ enum pci_region_type { PCI_REGION_TYPE_IO, PCI_REGION_TYPE_MEM, PCI_REGION_TYPE_PREFMEM, + PCI_REGION_TYPE_PREFMEM_64, PCI_REGION_TYPE_COUNT, };
@@ -29,18 +30,20 @@ static const char *region_type_name[] = { [ PCI_REGION_TYPE_IO ] = "io", [ PCI_REGION_TYPE_MEM ] = "mem", [ PCI_REGION_TYPE_PREFMEM ] = "prefmem", + [ PCI_REGION_TYPE_PREFMEM_64 ] = "prefmem64", };
struct pci_bus { struct { - /* pci region stats */ - u32 count[32 - PCI_MEM_INDEX_SHIFT]; - u32 sum, max; /* seconday bus region sizes */ u32 size; - /* pci region assignments */ - u32 bases[32 - PCI_MEM_INDEX_SHIFT]; - u32 base; + /* pci region stats */ + u32 max; + u32 count[32 - PCI_MEM_INDEX_SHIFT]; + s64 sum; + /* pci region assignments */ + s64 base; + s64 bases[32 - PCI_MEM_INDEX_SHIFT]; } r[PCI_REGION_TYPE_COUNT]; struct pci_device *bus_dev; }; @@ -69,6 +72,8 @@ static enum pci_region_type pci_addr_to_type(u32 addr) { if (addr & PCI_BASE_ADDRESS_SPACE_IO) return PCI_REGION_TYPE_IO; + if (addr & PCI_BASE_ADDRESS_MEM_TYPE_64) + return PCI_REGION_TYPE_PREFMEM_64; if (addr & PCI_BASE_ADDRESS_MEM_PREFETCH) return PCI_REGION_TYPE_PREFMEM; return PCI_REGION_TYPE_MEM; @@ -378,19 +383,16 @@ static void pci_bios_check_devices(struct pci_bus *busses) struct pci_bus *bus = &busses[pci_bdf_to_bus(pci->bdf)]; int i; for (i = 0; i < PCI_NUM_REGIONS; i++) { - u32 val, size; + u32 val, size, type; pci_bios_get_bar(pci, i, &val, &size); if (val == 0) continue;
- pci_bios_bus_reserve(bus, pci_addr_to_type(val), size); + type = pci_addr_to_type(val); + pci_bios_bus_reserve(bus, type, size); pci->bars[i].addr = val; pci->bars[i].size = size; - pci->bars[i].is64 = (!(val & PCI_BASE_ADDRESS_SPACE_IO) && - (val & PCI_BASE_ADDRESS_MEM_TYPE_MASK) - == PCI_BASE_ADDRESS_MEM_TYPE_64); - - if (pci->bars[i].is64) + if (type == PCI_REGION_TYPE_PREFMEM_64) i++; } } @@ -426,6 +428,7 @@ static void pci_bios_check_devices(struct pci_bus *busses) static int pci_bios_init_root_regions(struct pci_bus *bus, u32 start, u32 end) { bus->r[PCI_REGION_TYPE_IO].base = 0xc000; + bus->r[PCI_REGION_TYPE_PREFMEM_64].base = BUILD_PCIMEM_64_START;
int reg1 = PCI_REGION_TYPE_PREFMEM, reg2 = PCI_REGION_TYPE_MEM; if (bus->r[reg1].sum < bus->r[reg2].sum) { @@ -449,29 +452,34 @@ static int pci_bios_init_root_regions(struct pci_bus *bus, u32 start, u32 end)
static void pci_bios_init_bus_bases(struct pci_bus *bus) { - u32 base, newbase, size; + u32 size; + s64 base, newbase; int type, i;
for (type = 0; type < PCI_REGION_TYPE_COUNT; type++) { - dprintf(1, " type %s max %x sum %x base %x\n", region_type_name[type], - bus->r[type].max, bus->r[type].sum, bus->r[type].base); + dprintf(1, " type %s max %x sum 0x%08x%08x base %08x%08x\n", + region_type_name[type], bus->r[type].max, + (u32)(bus->r[type].sum>>32), (u32)bus->r[type].sum, + (u32)(bus->r[type].base>>32), (u32)bus->r[type].base); base = bus->r[type].base; for (i = ARRAY_SIZE(bus->r[type].count)-1; i >= 0; i--) { size = pci_index_to_size(i, type); if (!bus->r[type].count[i]) continue; newbase = base + size * bus->r[type].count[i]; - dprintf(1, " size %8x: %d bar(s), %8x -> %8x\n", - size, bus->r[type].count[i], base, newbase - 1); + dprintf(1, " size %8x: %d bar(s), %08x%08x -> %08x%08x\n", + size, bus->r[type].count[i], (u32)(base>>32), (u32)base, + (u32)((newbase - 1)>>32), (u32)newbase - 1); bus->r[type].bases[i] = base; base = newbase; } } }
-static u32 pci_bios_bus_get_addr(struct pci_bus *bus, int type, u32 size) +static s64 pci_bios_bus_get_addr(struct pci_bus *bus, int type, u32 size) { - u32 index, addr; + u32 index; + s64 addr;
index = pci_size_to_index(size, type); addr = bus->r[type].bases[index]; @@ -540,13 +548,16 @@ static void pci_bios_map_devices(struct pci_bus *busses) continue;
int type = pci_addr_to_type(pci->bars[i].addr); - u32 addr = pci_bios_bus_get_addr(bus, type, pci->bars[i].size); - dprintf(1, " bar %d, addr %x, size %x [%s]\n", - i, addr, pci->bars[i].size, region_type_name[type]); + s64 addr = pci_bios_bus_get_addr(bus, type, pci->bars[i].size); + dprintf(1, " bar %d, addr %08x%08x, size %x [%s]\n", + i, (u32)(addr >> 32), (u32)addr, pci->bars[i].size, + region_type_name[type]); pci_set_io_region_addr(pci, i, addr);
- if (pci->bars[i].is64) + if (type == PCI_REGION_TYPE_PREFMEM_64) { i++; + pci_set_io_region_addr(pci, i, addr >> 32); + } } } }
All devices behind a bridge need to have all their regions consecutive and not overlapping with all the normal memory ranges. Since prefetchable memory is described by one record, we must avoid the situations when 32bit and 64bit prefetchable regions are present within one secondary bus.
Signed-off-by: Alexey Korolevalexey.korolev@endace.com --- src/pciinit.c | 69 +++++++++++++++++++++++++++++++++++++++----------------- 1 files changed, 48 insertions(+), 21 deletions(-)
diff --git a/src/pciinit.c b/src/pciinit.c index a574e38..92942d5 100644 --- a/src/pciinit.c +++ b/src/pciinit.c @@ -17,6 +17,7 @@
#define PCI_BRIDGE_IO_MIN 0x1000 #define PCI_BRIDGE_MEM_MIN 0x100000 +#define PCI_BRIDGE_MEM_MAX 0x80000000
enum pci_region_type { PCI_REGION_TYPE_IO, @@ -45,6 +46,7 @@ struct pci_bus { s64 base; s64 bases[32 - PCI_MEM_INDEX_SHIFT]; } r[PCI_REGION_TYPE_COUNT]; + int is64; struct pci_device *bus_dev; };
@@ -369,6 +371,26 @@ static void pci_bios_bus_reserve(struct pci_bus *bus, int type, u32 size) bus->r[type].max = size; }
+static void pci_bios_secondary_bus_reserve(struct pci_bus *parent, + struct pci_bus *s, int type) +{ + u32 limit = (type == PCI_REGION_TYPE_IO) ? + PCI_BRIDGE_IO_MIN : PCI_BRIDGE_MEM_MIN; + + if (s->r[type].sum> PCI_BRIDGE_MEM_MAX) { + panic("Size: %08x%08x is too big\n", + (u32)s->r[type].sum, (u32)(s->r[type].sum>>32)); + } + s->r[type].size = (u32)s->r[type].sum; + if (s->r[type].size< limit) + s->r[type].size = limit; + s->r[type].size = pci_size_roundup(s->r[type].size); + + pci_bios_bus_reserve(parent, type, s->r[type].size); + dprintf(1, " size: %x, type %s\n", + s->r[type].size, region_type_name[type]); +} + static void pci_bios_check_devices(struct pci_bus *busses) { dprintf(1, "PCI: check devices\n"); @@ -392,8 +414,10 @@ static void pci_bios_check_devices(struct pci_bus *busses) pci_bios_bus_reserve(bus, type, size); pci->bars[i].addr = val; pci->bars[i].size = size; - if (type == PCI_REGION_TYPE_PREFMEM_64) + if (type == PCI_REGION_TYPE_PREFMEM_64) { + bus->is64 = 1; i++; + } } }
@@ -404,22 +428,21 @@ static void pci_bios_check_devices(struct pci_bus *busses) if (!s->bus_dev) continue; struct pci_bus *parent =&busses[pci_bdf_to_bus(s->bus_dev->bdf)]; + + if (s->r[PCI_REGION_TYPE_PREFMEM_64].sum&& + s->r[PCI_REGION_TYPE_PREFMEM].sum) { + panic("Sparse PCI prefmem regions on the bus %d\n", secondary_bus); + } + + dprintf(1, "PCI: secondary bus %d\n", secondary_bus); int type; for (type = 0; type< PCI_REGION_TYPE_COUNT; type++) { - u32 limit = (type == PCI_REGION_TYPE_IO) ? - PCI_BRIDGE_IO_MIN : PCI_BRIDGE_MEM_MIN; - s->r[type].size = s->r[type].sum; - if (s->r[type].size< limit) - s->r[type].size = limit; - s->r[type].size = pci_size_roundup(s->r[type].size); - pci_bios_bus_reserve(parent, type, s->r[type].size); - } - dprintf(1, "PCI: secondary bus %d sizes: io %x, mem %x, prefmem %x\n", - secondary_bus, - s->r[PCI_REGION_TYPE_IO].size, - s->r[PCI_REGION_TYPE_MEM].size, - s->r[PCI_REGION_TYPE_PREFMEM].size); - } + if ((type == PCI_REGION_TYPE_PREFMEM_64&& !s->is64) || + (type == PCI_REGION_TYPE_PREFMEM&& s->is64)) + continue; + pci_bios_secondary_bus_reserve(parent, s, type); + } + } }
#define ROOT_BASE(top, sum, max) ALIGN_DOWN((top)-(sum),(max) ?: 1) @@ -507,14 +530,17 @@ static void pci_bios_map_devices(struct pci_bus *busses) struct pci_bus *parent =&busses[pci_bdf_to_bus(bdf)]; int type; for (type = 0; type< PCI_REGION_TYPE_COUNT; type++) { + if ((type == PCI_REGION_TYPE_PREFMEM_64&& !s->is64) || + (type == PCI_REGION_TYPE_PREFMEM&& s->is64)) + continue; s->r[type].base = pci_bios_bus_get_addr( parent, type, s->r[type].size); } dprintf(1, "PCI: init bases bus %d (secondary)\n", secondary_bus); pci_bios_init_bus_bases(s);
- u32 base = s->r[PCI_REGION_TYPE_IO].base; - u32 limit = base + s->r[PCI_REGION_TYPE_IO].size - 1; + s64 base = s->r[PCI_REGION_TYPE_IO].base; + s64 limit = base + s->r[PCI_REGION_TYPE_IO].size - 1; pci_config_writeb(bdf, PCI_IO_BASE, base>> PCI_IO_SHIFT); pci_config_writew(bdf, PCI_IO_BASE_UPPER16, 0); pci_config_writeb(bdf, PCI_IO_LIMIT, limit>> PCI_IO_SHIFT); @@ -525,12 +551,13 @@ static void pci_bios_map_devices(struct pci_bus *busses) pci_config_writew(bdf, PCI_MEMORY_BASE, base>> PCI_MEMORY_SHIFT); pci_config_writew(bdf, PCI_MEMORY_LIMIT, limit>> PCI_MEMORY_SHIFT);
- base = s->r[PCI_REGION_TYPE_PREFMEM].base; - limit = base + s->r[PCI_REGION_TYPE_PREFMEM].size - 1; + type = s->is64 ? PCI_REGION_TYPE_PREFMEM_64 : PCI_REGION_TYPE_PREFMEM; + base = s->r[type].base; + limit = base + s->r[type].size - 1; pci_config_writew(bdf, PCI_PREF_MEMORY_BASE, base>> PCI_PREF_MEMORY_SHIFT); pci_config_writew(bdf, PCI_PREF_MEMORY_LIMIT, limit>> PCI_PREF_MEMORY_SHIFT); - pci_config_writel(bdf, PCI_PREF_BASE_UPPER32, 0); - pci_config_writel(bdf, PCI_PREF_LIMIT_UPPER32, 0); + pci_config_writel(bdf, PCI_PREF_BASE_UPPER32, base>> 32); + pci_config_writel(bdf, PCI_PREF_LIMIT_UPPER32, limit>> 32); }
// Map regions on each device.
I hate thunderbird. Will resend the patches tomorrow.
pci_config_writel(bdf, PCI_PREF_BASE_UPPER32, base>> 32);
On Wed, Dec 28, 2011 at 06:26:05PM +1300, Alexey Korolev wrote:
This patch adds PCI_REGION_TYPE_PREFMEM_64 region type and modifies types of variables to make it possible to work with 64 bit addresses.
Why I've added just one region type PCI_REGION_TYPE_PREFMEM_64 and haven't added PCI_REGION_TYPE_MEM_64? According to PCI architecture specification, the bridges can describe 64bit ranges for prefetchable type of memory only. So it's very unlikely that devices exporting 64bit non-prefetchable BARs.
Might happen for system devices I guess.
Anyway this code will work with 64bit non-prefetchable BARs unless the PCI device is not behind the secondary bus.
So what happens if such a device is on root bus?
Signed-off-by: Alexey Korolev alexey.korolev@endace.com
src/pci.h | 1 - src/pciinit.c | 59 +++++++++++++++++++++++++++++++++----------------------- 2 files changed, 35 insertions(+), 25 deletions(-)
diff --git a/src/pci.h b/src/pci.h index a2a5a4c..71f15fe 100644 --- a/src/pci.h +++ b/src/pci.h @@ -54,7 +54,6 @@ struct pci_device { struct { u32 addr; u32 size;
int is64;
} bars[PCI_NUM_REGIONS];
// Local information on device.
diff --git a/src/pciinit.c b/src/pciinit.c index 7d83368..a574e38 100644 --- a/src/pciinit.c +++ b/src/pciinit.c @@ -22,6 +22,7 @@ enum pci_region_type { PCI_REGION_TYPE_IO, PCI_REGION_TYPE_MEM, PCI_REGION_TYPE_PREFMEM,
- PCI_REGION_TYPE_PREFMEM_64, PCI_REGION_TYPE_COUNT,
};
@@ -29,18 +30,20 @@ static const char *region_type_name[] = { [ PCI_REGION_TYPE_IO ] = "io", [ PCI_REGION_TYPE_MEM ] = "mem", [ PCI_REGION_TYPE_PREFMEM ] = "prefmem",
- [ PCI_REGION_TYPE_PREFMEM_64 ] = "prefmem64",
};
struct pci_bus { struct {
/* pci region stats */
u32 count[32 - PCI_MEM_INDEX_SHIFT];
u32 sum, max; /* seconday bus region sizes */ u32 size;
/* pci region assignments */
u32 bases[32 - PCI_MEM_INDEX_SHIFT];
u32 base;
/* pci region stats */
u32 max;
u32 count[32 - PCI_MEM_INDEX_SHIFT];
s64 sum;
/* pci region assignments */
s64 base;
} r[PCI_REGION_TYPE_COUNT]; struct pci_device *bus_dev;s64 bases[32 - PCI_MEM_INDEX_SHIFT];
}; @@ -69,6 +72,8 @@ static enum pci_region_type pci_addr_to_type(u32 addr) { if (addr & PCI_BASE_ADDRESS_SPACE_IO) return PCI_REGION_TYPE_IO;
- if (addr & PCI_BASE_ADDRESS_MEM_TYPE_64)
if (addr & PCI_BASE_ADDRESS_MEM_PREFETCH) return PCI_REGION_TYPE_PREFMEM; return PCI_REGION_TYPE_MEM;return PCI_REGION_TYPE_PREFMEM_64;
@@ -378,19 +383,16 @@ static void pci_bios_check_devices(struct pci_bus *busses) struct pci_bus *bus = &busses[pci_bdf_to_bus(pci->bdf)]; int i; for (i = 0; i < PCI_NUM_REGIONS; i++) {
u32 val, size;
u32 val, size, type; pci_bios_get_bar(pci, i, &val, &size); if (val == 0) continue;
pci_bios_bus_reserve(bus, pci_addr_to_type(val), size);
type = pci_addr_to_type(val);
pci_bios_bus_reserve(bus, type, size); pci->bars[i].addr = val; pci->bars[i].size = size;
pci->bars[i].is64 = (!(val & PCI_BASE_ADDRESS_SPACE_IO) &&
(val & PCI_BASE_ADDRESS_MEM_TYPE_MASK)
== PCI_BASE_ADDRESS_MEM_TYPE_64);
if (pci->bars[i].is64)
}if (type == PCI_REGION_TYPE_PREFMEM_64) i++; }
@@ -426,6 +428,7 @@ static void pci_bios_check_devices(struct pci_bus *busses) static int pci_bios_init_root_regions(struct pci_bus *bus, u32 start, u32 end) { bus->r[PCI_REGION_TYPE_IO].base = 0xc000;
bus->r[PCI_REGION_TYPE_PREFMEM_64].base = BUILD_PCIMEM_64_START;
int reg1 = PCI_REGION_TYPE_PREFMEM, reg2 = PCI_REGION_TYPE_MEM; if (bus->r[reg1].sum < bus->r[reg2].sum) {
@@ -449,29 +452,34 @@ static int pci_bios_init_root_regions(struct pci_bus *bus, u32 start, u32 end)
static void pci_bios_init_bus_bases(struct pci_bus *bus) {
- u32 base, newbase, size;
u32 size;
s64 base, newbase; int type, i;
for (type = 0; type < PCI_REGION_TYPE_COUNT; type++) {
dprintf(1, " type %s max %x sum %x base %x\n",
region_type_name[type],
bus->r[type].max, bus->r[type].sum, bus->r[type].base);
dprintf(1, " type %s max %x sum 0x%08x%08x base %08x%08x\n",
region_type_name[type], bus->r[type].max,
(u32)(bus->r[type].sum>>32), (u32)bus->r[type].sum,
(u32)(bus->r[type].base>>32), (u32)bus->r[type].base); base = bus->r[type].base; for (i = ARRAY_SIZE(bus->r[type].count)-1; i >= 0; i--) { size = pci_index_to_size(i, type); if (!bus->r[type].count[i]) continue; newbase = base + size * bus->r[type].count[i];
dprintf(1, " size %8x: %d bar(s), %8x -> %8x\n",
size, bus->r[type].count[i], base, newbase - 1);
dprintf(1, " size %8x: %d bar(s), %08x%08x -> %08x%08x\n",
size, bus->r[type].count[i], (u32)(base>>32),
(u32)base,
}(u32)((newbase - 1)>>32), (u32)newbase - 1); bus->r[type].bases[i] = base; base = newbase; }
}
-static u32 pci_bios_bus_get_addr(struct pci_bus *bus, int type, u32 size) +static s64 pci_bios_bus_get_addr(struct pci_bus *bus, int type, u32 size) {
- u32 index, addr;
u32 index;
s64 addr;
index = pci_size_to_index(size, type); addr = bus->r[type].bases[index];
@@ -540,13 +548,16 @@ static void pci_bios_map_devices(struct pci_bus *busses) continue;
int type = pci_addr_to_type(pci->bars[i].addr);
u32 addr = pci_bios_bus_get_addr(bus, type, pci->bars[i].size);
dprintf(1, " bar %d, addr %x, size %x [%s]\n",
i, addr, pci->bars[i].size, region_type_name[type]);
s64 addr = pci_bios_bus_get_addr(bus, type, pci->bars[i].size);
dprintf(1, " bar %d, addr %08x%08x, size %x [%s]\n",
i, (u32)(addr >> 32), (u32)addr, pci->bars[i].size,
region_type_name[type]); pci_set_io_region_addr(pci, i, addr);
if (pci->bars[i].is64)
if (type == PCI_REGION_TYPE_PREFMEM_64) { i++;
pci_set_io_region_addr(pci, i, addr >> 32);
}} }
}
1.7.5.4
On Wed, Dec 28, 2011 at 06:35:55PM +1300, Alexey Korolev wrote:
All devices behind a bridge need to have all their regions consecutive and not overlapping with all the normal memory ranges. Since prefetchable memory is described by one record, we must avoid the situations when 32bit and 64bit prefetchable regions are present within one secondary bus.
How do we avoid this? Assume we have two devices: a 32 bit and a 64 bit one, behind a bridge. There are two main things we can do: 1. Make the 64 bit device only use the low 32 bit 2. Put the 32 bit one in the non-prefetcheable range
1 probably makes more sense for small BARs 2 probably makes more sense for large ones
Try also looking at e.g. linux bus scanning code for more ideas.
Another thing I don't see addressed here is that support for 64 bit ranges is I think optional in the bridge.
Signed-off-by: Alexey Korolevalexey.korolev@endace.com
Whitespace is corrupted: checkyour mail setup? There should be spaces around operators: a < b, I see a< b. Sometimes a< b (two spaces after).
src/pciinit.c | 69 +++++++++++++++++++++++++++++++++++++++----------------- 1 files changed, 48 insertions(+), 21 deletions(-)
diff --git a/src/pciinit.c b/src/pciinit.c index a574e38..92942d5 100644 --- a/src/pciinit.c +++ b/src/pciinit.c @@ -17,6 +17,7 @@
#define PCI_BRIDGE_IO_MIN 0x1000 #define PCI_BRIDGE_MEM_MIN 0x100000 +#define PCI_BRIDGE_MEM_MAX 0x80000000
enum pci_region_type { PCI_REGION_TYPE_IO, @@ -45,6 +46,7 @@ struct pci_bus { s64 base; s64 bases[32 - PCI_MEM_INDEX_SHIFT]; } r[PCI_REGION_TYPE_COUNT];
- int is64; struct pci_device *bus_dev;
};
@@ -369,6 +371,26 @@ static void pci_bios_bus_reserve(struct pci_bus *bus, int type, u32 size) bus->r[type].max = size; }
+static void pci_bios_secondary_bus_reserve(struct pci_bus *parent,
struct pci_bus *s, int type)
+{
- u32 limit = (type == PCI_REGION_TYPE_IO) ?
PCI_BRIDGE_IO_MIN : PCI_BRIDGE_MEM_MIN;
- if (s->r[type].sum> PCI_BRIDGE_MEM_MAX) {
panic("Size: %08x%08x is too big\n",
(u32)s->r[type].sum, (u32)(s->r[type].sum>>32));
- }
- s->r[type].size = (u32)s->r[type].sum;
- if (s->r[type].size< limit)
s->r[type].size = limit;
- s->r[type].size = pci_size_roundup(s->r[type].size);
- pci_bios_bus_reserve(parent, type, s->r[type].size);
- dprintf(1, " size: %x, type %s\n",
s->r[type].size, region_type_name[type]);
+}
static void pci_bios_check_devices(struct pci_bus *busses) { dprintf(1, "PCI: check devices\n"); @@ -392,8 +414,10 @@ static void pci_bios_check_devices(struct pci_bus *busses) pci_bios_bus_reserve(bus, type, size); pci->bars[i].addr = val; pci->bars[i].size = size;
if (type == PCI_REGION_TYPE_PREFMEM_64)
if (type == PCI_REGION_TYPE_PREFMEM_64) {
bus->is64 = 1; i++;
}} }
@@ -404,22 +428,21 @@ static void pci_bios_check_devices(struct pci_bus *busses) if (!s->bus_dev) continue; struct pci_bus *parent =&busses[pci_bdf_to_bus(s->bus_dev->bdf)];
if (s->r[PCI_REGION_TYPE_PREFMEM_64].sum&&
Space before && here and elsewhere.
s->r[PCI_REGION_TYPE_PREFMEM].sum) {
panic("Sparse PCI prefmem regions on the bus %d\n", secondary_bus);
}
dprintf(1, "PCI: secondary bus %d\n", secondary_bus); int type; for (type = 0; type< PCI_REGION_TYPE_COUNT; type++) {
u32 limit = (type == PCI_REGION_TYPE_IO) ?
PCI_BRIDGE_IO_MIN : PCI_BRIDGE_MEM_MIN;
s->r[type].size = s->r[type].sum;
if (s->r[type].size< limit)
s->r[type].size = limit;
s->r[type].size = pci_size_roundup(s->r[type].size);
pci_bios_bus_reserve(parent, type, s->r[type].size);
}
dprintf(1, "PCI: secondary bus %d sizes: io %x, mem %x, prefmem %x\n",
secondary_bus,
s->r[PCI_REGION_TYPE_IO].size,
s->r[PCI_REGION_TYPE_MEM].size,
s->r[PCI_REGION_TYPE_PREFMEM].size);
- }
if ((type == PCI_REGION_TYPE_PREFMEM_64&& !s->is64) ||
(type == PCI_REGION_TYPE_PREFMEM&& s->is64))
continue;
Can't figure this out. What does this do?
pci_bios_secondary_bus_reserve(parent, s, type);
}
- }
}
#define ROOT_BASE(top, sum, max) ALIGN_DOWN((top)-(sum),(max) ?: 1) @@ -507,14 +530,17 @@ static void pci_bios_map_devices(struct pci_bus *busses) struct pci_bus *parent =&busses[pci_bdf_to_bus(bdf)]; int type; for (type = 0; type< PCI_REGION_TYPE_COUNT; type++) {
if ((type == PCI_REGION_TYPE_PREFMEM_64&& !s->is64) ||
(type == PCI_REGION_TYPE_PREFMEM&& s->is64))
continue; s->r[type].base = pci_bios_bus_get_addr( parent, type, s->r[type].size); } dprintf(1, "PCI: init bases bus %d (secondary)\n", secondary_bus); pci_bios_init_bus_bases(s);
u32 base = s->r[PCI_REGION_TYPE_IO].base;
u32 limit = base + s->r[PCI_REGION_TYPE_IO].size - 1;
s64 base = s->r[PCI_REGION_TYPE_IO].base;
s64 limit = base + s->r[PCI_REGION_TYPE_IO].size - 1; pci_config_writeb(bdf, PCI_IO_BASE, base>> PCI_IO_SHIFT); pci_config_writew(bdf, PCI_IO_BASE_UPPER16, 0); pci_config_writeb(bdf, PCI_IO_LIMIT, limit>> PCI_IO_SHIFT);
@@ -525,12 +551,13 @@ static void pci_bios_map_devices(struct pci_bus *busses) pci_config_writew(bdf, PCI_MEMORY_BASE, base>> PCI_MEMORY_SHIFT); pci_config_writew(bdf, PCI_MEMORY_LIMIT, limit>> PCI_MEMORY_SHIFT);
base = s->r[PCI_REGION_TYPE_PREFMEM].base;
limit = base + s->r[PCI_REGION_TYPE_PREFMEM].size - 1;
type = s->is64 ? PCI_REGION_TYPE_PREFMEM_64 : PCI_REGION_TYPE_PREFMEM;
base = s->r[type].base;
limit = base + s->r[type].size - 1; pci_config_writew(bdf, PCI_PREF_MEMORY_BASE, base>> PCI_PREF_MEMORY_SHIFT); pci_config_writew(bdf, PCI_PREF_MEMORY_LIMIT, limit>> PCI_PREF_MEMORY_SHIFT);
pci_config_writel(bdf, PCI_PREF_BASE_UPPER32, 0);
pci_config_writel(bdf, PCI_PREF_LIMIT_UPPER32, 0);
pci_config_writel(bdf, PCI_PREF_BASE_UPPER32, base>> 32);
pci_config_writel(bdf, PCI_PREF_LIMIT_UPPER32, limit>> 32);
}
// Map regions on each device.
-- 1.7.5.4
On Wed, Dec 28, 2011 at 05:41:20PM +1300, Alexey Korolev wrote:
Hi,
There were a number of requests about support of 64bit PCI BAR allocations.
Also we have observed the issue on guests with older linux version (2.6.18): if we have a 64bit BAR allocated within first 4GB, the OS may hang during start process. (I guess it is an OS bug, but we need to take care of this).
This patch addresses these two issues and allows 64bit BARs to be allocated in ranges above 4GB. Patch consists of three parts:
- Add new range above 4GB in _CRS table to let Windows 2008 work
properly. Thanks a lot to Michael S. Triskin for this brilliant idea. 2. Added new PCI_REGION_TYPE_PREFMEM_64 region type in pciinit and changed types of variables. 3. Take care about PCI devices with 64bit BARs on secondary buses.
Patches have been tested on several configurations which includes linux 2.6.18 - 3.0 & windows 2008. Everything works quite well.
Which qemu version did you use?
On Wed, Dec 28, 2011 at 06:26:05PM +1300, Alexey Korolev wrote:
This patch adds PCI_REGION_TYPE_PREFMEM_64 region type and modifies types of variables to make it possible to work with 64 bit addresses.
Why I've added just one region type PCI_REGION_TYPE_PREFMEM_64 and haven't added PCI_REGION_TYPE_MEM_64? According to PCI architecture specification, the bridges can describe 64bit ranges for prefetchable type of memory only. So it's very unlikely that devices exporting 64bit non-prefetchable BARs. Anyway this code will work with 64bit non-prefetchable BARs unless the PCI device is not behind the secondary bus.
[...]
--- a/src/pciinit.c +++ b/src/pciinit.c @@ -22,6 +22,7 @@ enum pci_region_type { PCI_REGION_TYPE_IO, PCI_REGION_TYPE_MEM, PCI_REGION_TYPE_PREFMEM,
- PCI_REGION_TYPE_PREFMEM_64, PCI_REGION_TYPE_COUNT,
};
This doesn't seem right. A 64bit bar is not a new category - it's just a way of representing larger values within the existing categories. Tracking of 64bit prefmem sections separately from regular prefmem sections doesn't make sense, because both need to be allocated from the same pool when behind a bridge.
struct pci_bus { struct {
/* pci region stats */
u32 count[32 - PCI_MEM_INDEX_SHIFT];
u32 sum, max; /* seconday bus region sizes */ u32 size;
/* pci region assignments */
u32 bases[32 - PCI_MEM_INDEX_SHIFT];
u32 base;
/* pci region stats */
u32 max;
u32 count[32 - PCI_MEM_INDEX_SHIFT];
s64 sum;
/* pci region assignments */
s64 base;
s64 bases[32 - PCI_MEM_INDEX_SHIFT];
Why the choice of s64 over u64? Given the amount of bit manipulation on these values, I think using u64 would be safer.
@@ -69,6 +72,8 @@ static enum pci_region_type pci_addr_to_type(u32 addr) { if (addr & PCI_BASE_ADDRESS_SPACE_IO) return PCI_REGION_TYPE_IO;
- if (addr & PCI_BASE_ADDRESS_MEM_TYPE_64)
return PCI_REGION_TYPE_PREFMEM_64;
This seems dangerous - a 64bit bar can be non-prefetchable - getting this wrong could cause random (hard to debug) crashes.
@@ -378,19 +383,16 @@ static void pci_bios_check_devices(struct pci_bus *busses) struct pci_bus *bus = &busses[pci_bdf_to_bus(pci->bdf)]; int i; for (i = 0; i < PCI_NUM_REGIONS; i++) {
u32 val, size;
u32 val, size, type; pci_bios_get_bar(pci, i, &val, &size); if (val == 0) continue;
pci_bios_bus_reserve(bus, pci_addr_to_type(val), size);
type = pci_addr_to_type(val);
pci_bios_bus_reserve(bus, type, size); pci->bars[i].addr = val; pci->bars[i].size = size;
pci->bars[i].is64 = (!(val & PCI_BASE_ADDRESS_SPACE_IO) &&
(val & PCI_BASE_ADDRESS_MEM_TYPE_MASK)
== PCI_BASE_ADDRESS_MEM_TYPE_64);
if (pci->bars[i].is64)
if (type == PCI_REGION_TYPE_PREFMEM_64) i++;
If there is a 64bit bar, then the size could be over 32bits - the code really needs to handle this (or it could cause overlapping regions which result in random crashes).
-Kevin
On 29/12/11 00:30, Michael S. Tsirkin wrote:
On Wed, Dec 28, 2011 at 06:26:05PM +1300, Alexey Korolev wrote:
This patch adds PCI_REGION_TYPE_PREFMEM_64 region type and modifies types of variables to make it possible to work with 64 bit addresses.
Why I've added just one region type PCI_REGION_TYPE_PREFMEM_64 and haven't added PCI_REGION_TYPE_MEM_64? According to PCI architecture specification, the bridges can describe 64bit ranges for prefetchable type of memory only. So it's very unlikely that devices exporting 64bit non-prefetchable BARs.
Might happen for system devices I guess.
Anyway this code will work with 64bit non-prefetchable BARs unless the PCI device is not behind the secondary bus.
So what happens if such a device is on root bus?
If a device is on the root bus and have BAR flags 0x4 (TYPE_MEMORY and 64 bit), memory will be allocated in 64bit range all flags remain the same. I did this just out of curiosity and this appears to work well.
On 29/12/11 15:56, Kevin O'Connor wrote:
On Wed, Dec 28, 2011 at 06:26:05PM +1300, Alexey Korolev wrote:
This patch adds PCI_REGION_TYPE_PREFMEM_64 region type and modifies types of variables to make it possible to work with 64 bit addresses.
Why I've added just one region type PCI_REGION_TYPE_PREFMEM_64 and haven't added PCI_REGION_TYPE_MEM_64? According to PCI architecture specification, the bridges can describe 64bit ranges for prefetchable type of memory only. So it's very unlikely that devices exporting 64bit non-prefetchable BARs. Anyway this code will work with 64bit non-prefetchable BARs unless the PCI device is not behind the secondary bus.
[...]
--- a/src/pciinit.c +++ b/src/pciinit.c @@ -22,6 +22,7 @@ enum pci_region_type { PCI_REGION_TYPE_IO, PCI_REGION_TYPE_MEM, PCI_REGION_TYPE_PREFMEM,
- PCI_REGION_TYPE_PREFMEM_64, PCI_REGION_TYPE_COUNT, };
This doesn't seem right. A 64bit bar is not a new category - it's just a way of representing larger values within the existing categories. Tracking of 64bit prefmem sections separately from regular prefmem sections doesn't make sense, because both need to be allocated from the same pool when behind a bridge.
It's a way to account all memory sections on the root bus, as on the root bus we can have all 4 regions. I don't like this part as well, it causes confusions.
One possible solution is to have different descriptors for secondary buses and the root bus. In that case we can have 3 sections per secondary bus and the root bus will contain memory regions without any 'physical' meaning.
struct pci_bus { struct {
/* pci region stats */
u32 count[32 - PCI_MEM_INDEX_SHIFT];
u32 sum, max; /* seconday bus region sizes */ u32 size;
/* pci region assignments */
u32 bases[32 - PCI_MEM_INDEX_SHIFT];
u32 base;
/* pci region stats */
u32 max;
u32 count[32 - PCI_MEM_INDEX_SHIFT];
s64 sum;
/* pci region assignments */
s64 base;
s64 bases[32 - PCI_MEM_INDEX_SHIFT];
Why the choice of s64 over u64? Given the amount of bit manipulation on these values, I think using u64 would be safer.
No problem. Addresses could not exceed 40bit's so we basically may touch the last bit only when negative value is stored.
@@ -69,6 +72,8 @@ static enum pci_region_type pci_addr_to_type(u32 addr) { if (addr& PCI_BASE_ADDRESS_SPACE_IO) return PCI_REGION_TYPE_IO;
- if (addr& PCI_BASE_ADDRESS_MEM_TYPE_64)
return PCI_REGION_TYPE_PREFMEM_64;
This seems dangerous - a 64bit bar can be non-prefetchable - getting this wrong could cause random (hard to debug) crashes.
It is bit confusing but this doesn't touch actual types. So even if we have a 64bit not-prefetchable BAR code will be working.
@@ -378,19 +383,16 @@ static void pci_bios_check_devices(struct pci_bus *busses) struct pci_bus *bus =&busses[pci_bdf_to_bus(pci->bdf)]; int i; for (i = 0; i< PCI_NUM_REGIONS; i++) {
u32 val, size;
u32 val, size, type; pci_bios_get_bar(pci, i,&val,&size); if (val == 0) continue;
pci_bios_bus_reserve(bus, pci_addr_to_type(val), size);
type = pci_addr_to_type(val);
pci_bios_bus_reserve(bus, type, size); pci->bars[i].addr = val; pci->bars[i].size = size;
pci->bars[i].is64 = (!(val& PCI_BASE_ADDRESS_SPACE_IO)&&
(val& PCI_BASE_ADDRESS_MEM_TYPE_MASK)
== PCI_BASE_ADDRESS_MEM_TYPE_64);
if (pci->bars[i].is64)
if (type == PCI_REGION_TYPE_PREFMEM_64) i++;
If there is a 64bit bar, then the size could be over 32bits - the code really needs to handle this (or it could cause overlapping regions which result in random crashes).
Agree, something has hold in my mind that BAR size it is limited to 4GB, but just looked into PCI spec - there are no limitations stated. Well this requires bigger changes, as 64bit BAR size accounting touches more computations comparing to 64bit BAR addresses.
It makes sense to figure out how shall we account all memory sections on the root bus. Will it be separated structures for root bus and secondary buses? Do you have another idea?
@@ -69,6 +72,8 @@ static enum pci_region_type pci_addr_to_type(u32 addr) { if (addr& PCI_BASE_ADDRESS_SPACE_IO) return PCI_REGION_TYPE_IO;
- if (addr& PCI_BASE_ADDRESS_MEM_TYPE_64)
return PCI_REGION_TYPE_PREFMEM_64;
This seems dangerous - a 64bit bar can be non-prefetchable - getting this wrong could cause random (hard to debug) crashes.
Just out of curiosity - how this could happen? Having 64bit non-prefetchable BAR implies that the device is not behind any bridge (as bridges describe 64bit ranges for prefetchable memory only). Is it possible on nowadays systems?
On 29/12/11 00:43, Michael S. Tsirkin wrote:
On Wed, Dec 28, 2011 at 06:35:55PM +1300, Alexey Korolev wrote:
All devices behind a bridge need to have all their regions consecutive and not overlapping with all the normal memory ranges. Since prefetchable memory is described by one record, we must avoid the situations when 32bit and 64bit prefetchable regions are present within one secondary bus.
How do we avoid this? Assume we have two devices: a 32 bit and a 64 bit one, behind a bridge. There are two main things we can do:
- Make the 64 bit device only use the low 32 bit
It was my first implementation. Unfortunately older versions of Linux (Like 2.6.18) hang during startup with this. As far as I remember it was qemu-0.15 so may be 1.0 have no such an issue. I will check this.
- Put the 32 bit one in the non-prefetcheable range
I'd rather not do this. Bios should not change memory region types. It will confuse guest OS drivers.
1 probably makes more sense for small BARs 2 probably makes more sense for large ones
Try also looking at e.g. linux bus scanning code for more ideas.
Another thing I don't see addressed here is that support for 64 bit ranges is I think optional in the bridge.
Signed-off-by: Alexey Korolevalexey.korolev@endace.com
Whitespace is corrupted: checkyour mail setup? There should be spaces around operators: a< b, I see a< b. Sometimes a< b (two spaces after).
Yes, it's thunderbird :(. Sorry about that. It seems the patches need to have some functional changes.
src/pciinit.c | 69 +++++++++++++++++++++++++++++++++++++++----------------- 1 files changed, 48 insertions(+), 21 deletions(-)
diff --git a/src/pciinit.c b/src/pciinit.c index a574e38..92942d5 100644 --- a/src/pciinit.c +++ b/src/pciinit.c @@ -17,6 +17,7 @@
#define PCI_BRIDGE_IO_MIN 0x1000 #define PCI_BRIDGE_MEM_MIN 0x100000 +#define PCI_BRIDGE_MEM_MAX 0x80000000
enum pci_region_type { PCI_REGION_TYPE_IO, @@ -45,6 +46,7 @@ struct pci_bus { s64 base; s64 bases[32 - PCI_MEM_INDEX_SHIFT]; } r[PCI_REGION_TYPE_COUNT];
- int is64; struct pci_device *bus_dev; };
@@ -369,6 +371,26 @@ static void pci_bios_bus_reserve(struct pci_bus *bus, int type, u32 size) bus->r[type].max = size; }
+static void pci_bios_secondary_bus_reserve(struct pci_bus *parent,
struct pci_bus *s, int type)
+{
- u32 limit = (type == PCI_REGION_TYPE_IO) ?
PCI_BRIDGE_IO_MIN : PCI_BRIDGE_MEM_MIN;
- if (s->r[type].sum> PCI_BRIDGE_MEM_MAX) {
panic("Size: %08x%08x is too big\n",
(u32)s->r[type].sum, (u32)(s->r[type].sum>>32));
- }
- s->r[type].size = (u32)s->r[type].sum;
- if (s->r[type].size< limit)
s->r[type].size = limit;
- s->r[type].size = pci_size_roundup(s->r[type].size);
- pci_bios_bus_reserve(parent, type, s->r[type].size);
- dprintf(1, " size: %x, type %s\n",
s->r[type].size, region_type_name[type]);
+}
- static void pci_bios_check_devices(struct pci_bus *busses) { dprintf(1, "PCI: check devices\n");
@@ -392,8 +414,10 @@ static void pci_bios_check_devices(struct pci_bus *busses) pci_bios_bus_reserve(bus, type, size); pci->bars[i].addr = val; pci->bars[i].size = size;
if (type == PCI_REGION_TYPE_PREFMEM_64)
if (type == PCI_REGION_TYPE_PREFMEM_64) {
bus->is64 = 1; i++;
} } }
@@ -404,22 +428,21 @@ static void pci_bios_check_devices(struct pci_bus *busses) if (!s->bus_dev) continue; struct pci_bus *parent =&busses[pci_bdf_to_bus(s->bus_dev->bdf)];
if (s->r[PCI_REGION_TYPE_PREFMEM_64].sum&&
Space before&& here and elsewhere.
s->r[PCI_REGION_TYPE_PREFMEM].sum) {
panic("Sparse PCI prefmem regions on the bus %d\n", secondary_bus);
}
dprintf(1, "PCI: secondary bus %d\n", secondary_bus); int type; for (type = 0; type< PCI_REGION_TYPE_COUNT; type++) {
u32 limit = (type == PCI_REGION_TYPE_IO) ?
PCI_BRIDGE_IO_MIN : PCI_BRIDGE_MEM_MIN;
s->r[type].size = s->r[type].sum;
if (s->r[type].size< limit)
s->r[type].size = limit;
s->r[type].size = pci_size_roundup(s->r[type].size);
pci_bios_bus_reserve(parent, type, s->r[type].size);
}
dprintf(1, "PCI: secondary bus %d sizes: io %x, mem %x, prefmem %x\n",
secondary_bus,
s->r[PCI_REGION_TYPE_IO].size,
s->r[PCI_REGION_TYPE_MEM].size,
s->r[PCI_REGION_TYPE_PREFMEM].size);
- }
if ((type == PCI_REGION_TYPE_PREFMEM_64&& !s->is64) ||
(type == PCI_REGION_TYPE_PREFMEM&& s->is64))
continue;
Can't figure this out. What does this do?
Bios will panic if it founds prefmem BARs in both 32bit and 64bit areas. Otherwise we will pick one which exists or 32bit one if both are not present. Anyway this part is very likely to be rewritten.
On 29/12/11 00:43, Michael S. Tsirkin wrote:
On Wed, Dec 28, 2011 at 06:35:55PM +1300, Alexey Korolev wrote:
All devices behind a bridge need to have all their regions consecutive and not overlapping with all the normal memory ranges. Since prefetchable memory is described by one record, we must avoid the situations when 32bit and 64bit prefetchable regions are present within one secondary bus.
How do we avoid this? Assume we have two devices: a 32 bit and a 64 bit one, behind a bridge. There are two main things we can do:
- Make the 64 bit device only use the low 32 bit
It was my first implementation. Unfortunately older versions of Linux (Like 2.6.18) hang during startup with this. As far as I remember it was qemu-0.15 so may be 1.0 have no such an issue. I will check this.
- Put the 32 bit one in the non-prefetcheable range
I'd rather not do this. Bios should not change memory region types. It will confuse guest OS drivers.
1 probably makes more sense for small BARs 2 probably makes more sense for large ones
Try also looking at e.g. linux bus scanning code for more ideas.
Another thing I don't see addressed here is that support for 64 bit ranges is I think optional in the bridge.
Signed-off-by: Alexey Korolevalexey.korolev@endace.com
Whitespace is corrupted: checkyour mail setup? There should be spaces around operators: a< b, I see a< b. Sometimes a< b (two spaces after).
Yes, it's thunderbird :(. Sorry about that. It seems the patches need to have some functional changes anyway.
src/pciinit.c | 69 +++++++++++++++++++++++++++++++++++++++----------------- 1 files changed, 48 insertions(+), 21 deletions(-)
diff --git a/src/pciinit.c b/src/pciinit.c index a574e38..92942d5 100644 --- a/src/pciinit.c +++ b/src/pciinit.c @@ -17,6 +17,7 @@
#define PCI_BRIDGE_IO_MIN 0x1000 #define PCI_BRIDGE_MEM_MIN 0x100000 +#define PCI_BRIDGE_MEM_MAX 0x80000000
enum pci_region_type { PCI_REGION_TYPE_IO, @@ -45,6 +46,7 @@ struct pci_bus { s64 base; s64 bases[32 - PCI_MEM_INDEX_SHIFT]; } r[PCI_REGION_TYPE_COUNT];
- int is64; struct pci_device *bus_dev; };
@@ -369,6 +371,26 @@ static void pci_bios_bus_reserve(struct pci_bus *bus, int type, u32 size) bus->r[type].max = size; }
+static void pci_bios_secondary_bus_reserve(struct pci_bus *parent,
struct pci_bus *s, int type)
+{
- u32 limit = (type == PCI_REGION_TYPE_IO) ?
PCI_BRIDGE_IO_MIN : PCI_BRIDGE_MEM_MIN;
- if (s->r[type].sum> PCI_BRIDGE_MEM_MAX) {
panic("Size: %08x%08x is too big\n",
(u32)s->r[type].sum, (u32)(s->r[type].sum>>32));
- }
- s->r[type].size = (u32)s->r[type].sum;
- if (s->r[type].size< limit)
s->r[type].size = limit;
- s->r[type].size = pci_size_roundup(s->r[type].size);
- pci_bios_bus_reserve(parent, type, s->r[type].size);
- dprintf(1, " size: %x, type %s\n",
s->r[type].size, region_type_name[type]);
+}
- static void pci_bios_check_devices(struct pci_bus *busses) { dprintf(1, "PCI: check devices\n");
@@ -392,8 +414,10 @@ static void pci_bios_check_devices(struct pci_bus *busses) pci_bios_bus_reserve(bus, type, size); pci->bars[i].addr = val; pci->bars[i].size = size;
if (type == PCI_REGION_TYPE_PREFMEM_64)
if (type == PCI_REGION_TYPE_PREFMEM_64) {
bus->is64 = 1; i++;
} } }
@@ -404,22 +428,21 @@ static void pci_bios_check_devices(struct pci_bus *busses) if (!s->bus_dev) continue; struct pci_bus *parent =&busses[pci_bdf_to_bus(s->bus_dev->bdf)];
if (s->r[PCI_REGION_TYPE_PREFMEM_64].sum&&
Space before&& here and elsewhere.
s->r[PCI_REGION_TYPE_PREFMEM].sum) {
panic("Sparse PCI prefmem regions on the bus %d\n", secondary_bus);
}
dprintf(1, "PCI: secondary bus %d\n", secondary_bus); int type; for (type = 0; type< PCI_REGION_TYPE_COUNT; type++) {
u32 limit = (type == PCI_REGION_TYPE_IO) ?
PCI_BRIDGE_IO_MIN : PCI_BRIDGE_MEM_MIN;
s->r[type].size = s->r[type].sum;
if (s->r[type].size< limit)
s->r[type].size = limit;
s->r[type].size = pci_size_roundup(s->r[type].size);
pci_bios_bus_reserve(parent, type, s->r[type].size);
}
dprintf(1, "PCI: secondary bus %d sizes: io %x, mem %x, prefmem %x\n",
secondary_bus,
s->r[PCI_REGION_TYPE_IO].size,
s->r[PCI_REGION_TYPE_MEM].size,
s->r[PCI_REGION_TYPE_PREFMEM].size);
- }
if ((type == PCI_REGION_TYPE_PREFMEM_64&& !s->is64) ||
(type == PCI_REGION_TYPE_PREFMEM&& s->is64))
continue;
Can't figure this out. What does this do?
Bios will panic if it founds prefmem BARs in both 32bit and 64bit areas. Otherwise we will pick one which exists or 32bit one if both are not present.
Thanks Alexey
On Thu, Dec 29, 2011 at 06:32:37PM +1300, Alexey Korolev wrote:
@@ -69,6 +72,8 @@ static enum pci_region_type pci_addr_to_type(u32 addr) { if (addr& PCI_BASE_ADDRESS_SPACE_IO) return PCI_REGION_TYPE_IO;
- if (addr& PCI_BASE_ADDRESS_MEM_TYPE_64)
return PCI_REGION_TYPE_PREFMEM_64;
This seems dangerous - a 64bit bar can be non-prefetchable - getting this wrong could cause random (hard to debug) crashes.
Just out of curiosity - how this could happen? Having 64bit non-prefetchable BAR implies that the device is not behind any bridge (as bridges describe 64bit ranges for prefetchable memory only). Is it possible on nowadays systems?
Yes. qemu puts all devices directly on the root bus. system devices are also commonly located on the root bus.
On Thu, Dec 29, 2011 at 06:40:26PM +1300, Alexey Korolev wrote:
On 29/12/11 00:43, Michael S. Tsirkin wrote:
On Wed, Dec 28, 2011 at 06:35:55PM +1300, Alexey Korolev wrote:
All devices behind a bridge need to have all their regions consecutive and not overlapping with all the normal memory ranges. Since prefetchable memory is described by one record, we must avoid the situations when 32bit and 64bit prefetchable regions are present within one secondary bus.
How do we avoid this? Assume we have two devices: a 32 bit and a 64 bit one, behind a bridge. There are two main things we can do:
- Make the 64 bit device only use the low 32 bit
It was my first implementation. Unfortunately older versions of Linux (Like 2.6.18) hang during startup with this. As far as I remember it was qemu-0.15 so may be 1.0 have no such an issue. I will check this.
- Put the 32 bit one in the non-prefetcheable range
I'd rather not do this. Bios should not change memory region types. It will confuse guest OS drivers.
It shouldn't, it's a legal behavious. Prefetcheable BAR allows certain optimizations but does not require them.
1 probably makes more sense for small BARs 2 probably makes more sense for large ones
Try also looking at e.g. linux bus scanning code for more ideas.
Another thing I don't see addressed here is that support for 64 bit ranges is I think optional in the bridge.
Signed-off-by: Alexey Korolevalexey.korolev@endace.com
Whitespace is corrupted: checkyour mail setup? There should be spaces around operators: a< b, I see a< b. Sometimes a< b (two spaces after).
Yes, it's thunderbird :(. Sorry about that. It seems the patches need to have some functional changes.
src/pciinit.c | 69 +++++++++++++++++++++++++++++++++++++++----------------- 1 files changed, 48 insertions(+), 21 deletions(-)
diff --git a/src/pciinit.c b/src/pciinit.c index a574e38..92942d5 100644 --- a/src/pciinit.c +++ b/src/pciinit.c @@ -17,6 +17,7 @@
#define PCI_BRIDGE_IO_MIN 0x1000 #define PCI_BRIDGE_MEM_MIN 0x100000 +#define PCI_BRIDGE_MEM_MAX 0x80000000
enum pci_region_type { PCI_REGION_TYPE_IO, @@ -45,6 +46,7 @@ struct pci_bus { s64 base; s64 bases[32 - PCI_MEM_INDEX_SHIFT]; } r[PCI_REGION_TYPE_COUNT];
- int is64; struct pci_device *bus_dev;
};
@@ -369,6 +371,26 @@ static void pci_bios_bus_reserve(struct pci_bus *bus, int type, u32 size) bus->r[type].max = size; }
+static void pci_bios_secondary_bus_reserve(struct pci_bus *parent,
struct pci_bus *s, int type)
+{
- u32 limit = (type == PCI_REGION_TYPE_IO) ?
PCI_BRIDGE_IO_MIN : PCI_BRIDGE_MEM_MIN;
- if (s->r[type].sum> PCI_BRIDGE_MEM_MAX) {
panic("Size: %08x%08x is too big\n",
(u32)s->r[type].sum, (u32)(s->r[type].sum>>32));
- }
- s->r[type].size = (u32)s->r[type].sum;
- if (s->r[type].size< limit)
s->r[type].size = limit;
- s->r[type].size = pci_size_roundup(s->r[type].size);
- pci_bios_bus_reserve(parent, type, s->r[type].size);
- dprintf(1, " size: %x, type %s\n",
s->r[type].size, region_type_name[type]);
+}
static void pci_bios_check_devices(struct pci_bus *busses) { dprintf(1, "PCI: check devices\n"); @@ -392,8 +414,10 @@ static void pci_bios_check_devices(struct pci_bus *busses) pci_bios_bus_reserve(bus, type, size); pci->bars[i].addr = val; pci->bars[i].size = size;
if (type == PCI_REGION_TYPE_PREFMEM_64)
if (type == PCI_REGION_TYPE_PREFMEM_64) {
bus->is64 = 1; i++;
}} }
@@ -404,22 +428,21 @@ static void pci_bios_check_devices(struct pci_bus *busses) if (!s->bus_dev) continue; struct pci_bus *parent =&busses[pci_bdf_to_bus(s->bus_dev->bdf)];
if (s->r[PCI_REGION_TYPE_PREFMEM_64].sum&&
Space before&& here and elsewhere.
s->r[PCI_REGION_TYPE_PREFMEM].sum) {
panic("Sparse PCI prefmem regions on the bus %d\n", secondary_bus);
}
dprintf(1, "PCI: secondary bus %d\n", secondary_bus); int type; for (type = 0; type< PCI_REGION_TYPE_COUNT; type++) {
u32 limit = (type == PCI_REGION_TYPE_IO) ?
PCI_BRIDGE_IO_MIN : PCI_BRIDGE_MEM_MIN;
s->r[type].size = s->r[type].sum;
if (s->r[type].size< limit)
s->r[type].size = limit;
s->r[type].size = pci_size_roundup(s->r[type].size);
pci_bios_bus_reserve(parent, type, s->r[type].size);
}
dprintf(1, "PCI: secondary bus %d sizes: io %x, mem %x, prefmem %x\n",
secondary_bus,
s->r[PCI_REGION_TYPE_IO].size,
s->r[PCI_REGION_TYPE_MEM].size,
s->r[PCI_REGION_TYPE_PREFMEM].size);
- }
if ((type == PCI_REGION_TYPE_PREFMEM_64&& !s->is64) ||
(type == PCI_REGION_TYPE_PREFMEM&& s->is64))
continue;
Can't figure this out. What does this do?
Bios will panic if it founds prefmem BARs in both 32bit and 64bit areas. Otherwise we will pick one which exists or 32bit one if both are not present. Anyway this part is very likely to be rewritten.
On Thu, Dec 29, 2011 at 06:41:36PM +1300, Alexey Korolev wrote:
On 29/12/11 00:43, Michael S. Tsirkin wrote:
On Wed, Dec 28, 2011 at 06:35:55PM +1300, Alexey Korolev wrote:
All devices behind a bridge need to have all their regions consecutive and not overlapping with all the normal memory ranges. Since prefetchable memory is described by one record, we must avoid the situations when 32bit and 64bit prefetchable regions are present within one secondary bus.
How do we avoid this? Assume we have two devices: a 32 bit and a 64 bit one, behind a bridge. There are two main things we can do:
- Make the 64 bit device only use the low 32 bit
It was my first implementation. Unfortunately older versions of Linux (Like 2.6.18) hang during startup with this.
Can you find out why? qemu has a flag to easily debug guests with gdb, shouldn't be hard.
As far as I remember it was qemu-0.15 so may be 1.0 have no such an issue. I will check this.
- Put the 32 bit one in the non-prefetcheable range
I'd rather not do this. Bios should not change memory region types. It will confuse guest OS drivers.
1 probably makes more sense for small BARs 2 probably makes more sense for large ones
Try also looking at e.g. linux bus scanning code for more ideas.
Another thing I don't see addressed here is that support for 64 bit ranges is I think optional in the bridge.
Signed-off-by: Alexey Korolevalexey.korolev@endace.com
Whitespace is corrupted: checkyour mail setup? There should be spaces around operators: a< b, I see a< b. Sometimes a< b (two spaces after).
Yes, it's thunderbird :(. Sorry about that. It seems the patches need to have some functional changes anyway.
src/pciinit.c | 69 +++++++++++++++++++++++++++++++++++++++----------------- 1 files changed, 48 insertions(+), 21 deletions(-)
diff --git a/src/pciinit.c b/src/pciinit.c index a574e38..92942d5 100644 --- a/src/pciinit.c +++ b/src/pciinit.c @@ -17,6 +17,7 @@
#define PCI_BRIDGE_IO_MIN 0x1000 #define PCI_BRIDGE_MEM_MIN 0x100000 +#define PCI_BRIDGE_MEM_MAX 0x80000000
enum pci_region_type { PCI_REGION_TYPE_IO, @@ -45,6 +46,7 @@ struct pci_bus { s64 base; s64 bases[32 - PCI_MEM_INDEX_SHIFT]; } r[PCI_REGION_TYPE_COUNT];
- int is64; struct pci_device *bus_dev;
};
@@ -369,6 +371,26 @@ static void pci_bios_bus_reserve(struct pci_bus *bus, int type, u32 size) bus->r[type].max = size; }
+static void pci_bios_secondary_bus_reserve(struct pci_bus *parent,
struct pci_bus *s, int type)
+{
- u32 limit = (type == PCI_REGION_TYPE_IO) ?
PCI_BRIDGE_IO_MIN : PCI_BRIDGE_MEM_MIN;
- if (s->r[type].sum> PCI_BRIDGE_MEM_MAX) {
panic("Size: %08x%08x is too big\n",
(u32)s->r[type].sum, (u32)(s->r[type].sum>>32));
- }
- s->r[type].size = (u32)s->r[type].sum;
- if (s->r[type].size< limit)
s->r[type].size = limit;
- s->r[type].size = pci_size_roundup(s->r[type].size);
- pci_bios_bus_reserve(parent, type, s->r[type].size);
- dprintf(1, " size: %x, type %s\n",
s->r[type].size, region_type_name[type]);
+}
static void pci_bios_check_devices(struct pci_bus *busses) { dprintf(1, "PCI: check devices\n"); @@ -392,8 +414,10 @@ static void pci_bios_check_devices(struct pci_bus *busses) pci_bios_bus_reserve(bus, type, size); pci->bars[i].addr = val; pci->bars[i].size = size;
if (type == PCI_REGION_TYPE_PREFMEM_64)
if (type == PCI_REGION_TYPE_PREFMEM_64) {
bus->is64 = 1; i++;
}} }
@@ -404,22 +428,21 @@ static void pci_bios_check_devices(struct pci_bus *busses) if (!s->bus_dev) continue; struct pci_bus *parent =&busses[pci_bdf_to_bus(s->bus_dev->bdf)];
if (s->r[PCI_REGION_TYPE_PREFMEM_64].sum&&
Space before&& here and elsewhere.
s->r[PCI_REGION_TYPE_PREFMEM].sum) {
panic("Sparse PCI prefmem regions on the bus %d\n", secondary_bus);
}
dprintf(1, "PCI: secondary bus %d\n", secondary_bus); int type; for (type = 0; type< PCI_REGION_TYPE_COUNT; type++) {
u32 limit = (type == PCI_REGION_TYPE_IO) ?
PCI_BRIDGE_IO_MIN : PCI_BRIDGE_MEM_MIN;
s->r[type].size = s->r[type].sum;
if (s->r[type].size< limit)
s->r[type].size = limit;
s->r[type].size = pci_size_roundup(s->r[type].size);
pci_bios_bus_reserve(parent, type, s->r[type].size);
}
dprintf(1, "PCI: secondary bus %d sizes: io %x, mem %x, prefmem %x\n",
secondary_bus,
s->r[PCI_REGION_TYPE_IO].size,
s->r[PCI_REGION_TYPE_MEM].size,
s->r[PCI_REGION_TYPE_PREFMEM].size);
- }
if ((type == PCI_REGION_TYPE_PREFMEM_64&& !s->is64) ||
(type == PCI_REGION_TYPE_PREFMEM&& s->is64))
continue;
Can't figure this out. What does this do?
Bios will panic if it founds prefmem BARs in both 32bit and 64bit areas. Otherwise we will pick one which exists or 32bit one if both are not present.
Thanks Alexey
On Thu, Dec 29, 2011 at 06:41:36PM +1300, Alexey Korolev wrote:
Can't figure this out. What does this do?
Bios will panic if it founds prefmem BARs in both 32bit and 64bit areas.
That's not good, it's a legal configuration.
Otherwise we will pick one which exists or 32bit one if both are not present.
On 30/12/11 05:18, Michael S. Tsirkin wrote:
On Thu, Dec 29, 2011 at 06:40:26PM +1300, Alexey Korolev wrote:
On 29/12/11 00:43, Michael S. Tsirkin wrote:
On Wed, Dec 28, 2011 at 06:35:55PM +1300, Alexey Korolev wrote:
All devices behind a bridge need to have all their regions consecutive and not overlapping with all the normal memory ranges. Since prefetchable memory is described by one record, we must avoid the situations when 32bit and 64bit prefetchable regions are present within one secondary bus.
How do we avoid this? Assume we have two devices: a 32 bit and a 64 bit one, behind a bridge. There are two main things we can do:
- Make the 64 bit device only use the low 32 bit
It was my first implementation. Unfortunately older versions of Linux (Like 2.6.18) hang during startup with this. As far as I remember it was qemu-0.15 so may be 1.0 have no such an issue. I will check this.
- Put the 32 bit one in the non-prefetcheable range
I'd rather not do this. Bios should not change memory region types. It will confuse guest OS drivers.
It shouldn't, it's a legal behavious. Prefetcheable BAR allows certain optimizations but does not require them.
Well it is really difficult to say what is a "legal behavior". Yes it is legal behavior for PCI operations. But it could be a problem in software . I won't be surprised if a driver fail to start if we substitute the regions. It will be very hard to check all possible cases.
On Wed, Dec 28, 2011 at 01:43:02PM +0200, Michael S. Tsirkin wrote:
On Wed, Dec 28, 2011 at 06:35:55PM +1300, Alexey Korolev wrote:
All devices behind a bridge need to have all their regions consecutive and not overlapping with all the normal memory ranges. Since prefetchable memory is described by one record, we must avoid the situations when 32bit and 64bit prefetchable regions are present within one secondary bus.
How do we avoid this? Assume we have two devices: a 32 bit and a 64 bit one, behind a bridge. There are two main things we can do:
- Make the 64 bit device only use the low 32 bit
- Put the 32 bit one in the non-prefetcheable range
I would vote for option 1. Supporting option 2 seems complex and it would only help the case where there is a large 64bit prefmem bar (that can't reliably fit in low memory) behind a bridge that supports 64bit prefmem bars that also has a small 32bit prefmem bar (which can reliably fit in low memory) behind it. This condition seems too unlikely to be worth supporting the additional complexity.
-Kevin
On 30/12/11 05:21, Michael S. Tsirkin wrote:
On Thu, Dec 29, 2011 at 06:41:36PM +1300, Alexey Korolev wrote:
Can't figure this out. What does this do?
Bios will panic if it founds prefmem BARs in both 32bit and 64bit areas.
That's not good, it's a legal configuration.
To be more complete : Bios will panic if it founds prefmem BARs in both 32bit and 64bit areas on the same secondary Bus. I agree about that this limitation is too strict at the moment. But this is just a limitation, there are plenty of them in Seabios.
To answer if it is a legal or not we need to have a look how real BIOS would behave in a such scenario. Unfortunately I have no chance to have this configuration on hand. If you have (may have) this I would appreciate if you let us know how the Bios will behave?
On 30/12/11 05:21, Michael S. Tsirkin wrote:
On Thu, Dec 29, 2011 at 06:41:36PM +1300, Alexey Korolev wrote:
Can't figure this out. What does this do?
Bios will panic if it founds prefmem BARs in both 32bit and 64bit areas.
That's not good, it's a legal configuration.
To be more complete : Bios will panic if it founds prefmem BARs in both 32bit and 64bit areas on the same secondary Bus. I agree about that this limitation is too strict at the moment.
To answer if it is a legal or not we need to have a look how real BIOS would behave in a such scenario. Unfortunately I have no chance to have this configuration on hand. If you have (may have) this I would appreciate if you let us know how the Bios will behave?
On Thu, Dec 29, 2011 at 06:00:04PM +1300, Alexey Korolev wrote:
On 29/12/11 15:56, Kevin O'Connor wrote:
On Wed, Dec 28, 2011 at 06:26:05PM +1300, Alexey Korolev wrote:
--- a/src/pciinit.c +++ b/src/pciinit.c @@ -22,6 +22,7 @@ enum pci_region_type { PCI_REGION_TYPE_IO, PCI_REGION_TYPE_MEM, PCI_REGION_TYPE_PREFMEM,
- PCI_REGION_TYPE_PREFMEM_64, PCI_REGION_TYPE_COUNT,
};
This doesn't seem right. A 64bit bar is not a new category - it's just a way of representing larger values within the existing categories. Tracking of 64bit prefmem sections separately from regular prefmem sections doesn't make sense, because both need to be allocated from the same pool when behind a bridge.
It's a way to account all memory sections on the root bus, as on the root bus we can have all 4 regions. I don't like this part as well, it causes confusions.
The root bus can have five regions - 64bit non-prefmem is also possible.
Given that the allocation for devices on the root bus is distinct from devices behind bridges, I think one could separate out the root bus assignment pass into its own code. Indeed, much of the root bus allocation is already separate.
It makes sense to figure out how shall we account all memory sections on the root bus. Will it be separated structures for root bus and secondary buses? Do you have another idea?
As an idea - keep the current three buckets (io, mem, prefmem) in struct pci_bus, but expand them to track 64bit sizes. Add a new flag to each bucket to track if every device on the given bus is 64bit capable for that resource. When assigning the resources for a bridge, only use 64bit prefmem if all prefmem bars behind that bridge are 64bit capable.
-Kevin
On Fri, Dec 30, 2011 at 06:10:13PM +1300, Alexey Korolev wrote:
On 30/12/11 05:21, Michael S. Tsirkin wrote:
On Thu, Dec 29, 2011 at 06:41:36PM +1300, Alexey Korolev wrote:
Can't figure this out. What does this do?
Bios will panic if it founds prefmem BARs in both 32bit and 64bit areas.
That's not good, it's a legal configuration.
To be more complete : Bios will panic if it founds prefmem BARs in both 32bit and 64bit areas on the same secondary Bus. I agree about that this limitation is too strict at the moment. But this is just a limitation, there are plenty of them in Seabios.
There are very few panic calls in seabios. Halting in the system firmware is very unfriendly and should be avoided.
As described in my other email, if there is a mix of prefmem bars on a secondary bus, then just allocate all those bars in the first 4gig. No reason to halt the boot.
To answer if it is a legal or not we need to have a look how real BIOS would behave in a such scenario. Unfortunately I have no chance to have this configuration on hand. If you have (may have) this I would appreciate if you let us know how the Bios will behave?
You can look at the pci allocation code in coreboot.
-Kevin
On Thu, Dec 29, 2011 at 06:41:36PM +1300, Alexey Korolev wrote:
On 29/12/11 00:43, Michael S. Tsirkin wrote:
On Wed, Dec 28, 2011 at 06:35:55PM +1300, Alexey Korolev wrote:
All devices behind a bridge need to have all their regions consecutive and not overlapping with all the normal memory ranges. Since prefetchable memory is described by one record, we must avoid the situations when 32bit and 64bit prefetchable regions are present within one secondary bus.
How do we avoid this? Assume we have two devices: a 32 bit and a 64 bit one, behind a bridge. There are two main things we can do:
- Make the 64 bit device only use the low 32 bit
It was my first implementation. Unfortunately older versions of Linux (Like 2.6.18) hang during startup with this. As far as I remember it was qemu-0.15 so may be 1.0 have no such an issue. I will check this.
That seems really odd - there's nothing unusual with a 64bit bar being set to an address under 4gig.
Looking at the current code, it's not initializing 64bit bars properly (it doesn't initialize the top bits of the 64bit address). Does just the patch below (totally untested) improve things for you?
-Kevin
--- a/src/pciinit.c +++ b/src/pciinit.c @@ -545,8 +545,10 @@ static void pci_bios_map_devices(struct pci_bus *busses) i, addr, pci->bars[i].size, region_type_name[type]); pci_set_io_region_addr(pci, i, addr);
- if (pci->bars[i].is64) + if (pci->bars[i].is64) { i++; + pci_set_io_region_addr(pci, i, 0); + } } } }
There are two main things we can do:
- Make the 64 bit device only use the low 32 bit
It was my first implementation. Unfortunately older versions of Linux (Like 2.6.18) hang during startup with this. As far as I remember it was qemu-0.15 so may be 1.0 have no such an issue. I will check this.
That seems really odd - there's nothing unusual with a 64bit bar being set to an address under 4gig.
Looking at the current code, it's not initializing 64bit bars properly (it doesn't initialize the top bits of the 64bit address). Does just the patch below (totally untested) improve things for you?
Yes I see. Will test after short holidays, now it's quite late in New Zealand. BTW: Tried today version 1.0 to reproduce this issue and could not reproduce it.
On 12/29/11 03:56, Kevin O'Connor wrote:
On Wed, Dec 28, 2011 at 06:26:05PM +1300, Alexey Korolev wrote:
This patch adds PCI_REGION_TYPE_PREFMEM_64 region type and modifies types of variables to make it possible to work with 64 bit addresses.
Why I've added just one region type PCI_REGION_TYPE_PREFMEM_64 and haven't added PCI_REGION_TYPE_MEM_64? According to PCI architecture specification, the bridges can describe 64bit ranges for prefetchable type of memory only. So it's very unlikely that devices exporting 64bit non-prefetchable BARs. Anyway this code will work with 64bit non-prefetchable BARs unless the PCI device is not behind the secondary bus.
[...]
--- a/src/pciinit.c +++ b/src/pciinit.c @@ -22,6 +22,7 @@ enum pci_region_type { PCI_REGION_TYPE_IO, PCI_REGION_TYPE_MEM, PCI_REGION_TYPE_PREFMEM,
- PCI_REGION_TYPE_PREFMEM_64, PCI_REGION_TYPE_COUNT,
};
This doesn't seem right. A 64bit bar is not a new category - it's just a way of representing larger values within the existing categories.
We need to take care to not confuse things here. We have pci bars which can be 64bit capable. We (will) have a memory region (aka address space) above 4G. We may or may not map 64bit bars there.
Tracking of 64bit prefmem sections separately from regular prefmem sections doesn't make sense, because both need to be allocated from the same pool when behind a bridge.
Yea, bridges make the picture a bit more complicated. A bridge itself might not be 64bit capable, which makes it impossible to map devices behind it above 4G. Likewise with multiple devices where one of them has a 32bit prefmem bar: The (single) prefmem memory window must be below 4G then too.
I think we'll have to keep track for each bar whenever we'll go map it above or below 4G. For starters this logic can be simply to only map 64bit bars of root bus devices above 4G, which completely sidesteps the bridge issues for now.
Once qemu has full pci-e support we probably want refine this logic, which probably requires an extra pass between pci bar discovery and placing the bars into the memory regions we have.
cheers, Gerd
On Tue, Jan 03, 2012 at 04:14:58PM +0100, Gerd Hoffmann wrote:
On 12/29/11 03:56, Kevin O'Connor wrote:
Tracking of 64bit prefmem sections separately from regular prefmem sections doesn't make sense, because both need to be allocated from the same pool when behind a bridge.
Yea, bridges make the picture a bit more complicated. A bridge itself might not be 64bit capable, which makes it impossible to map devices behind it above 4G. Likewise with multiple devices where one of them has a 32bit prefmem bar: The (single) prefmem memory window must be below 4G then too.
I think we'll have to keep track for each bar whenever we'll go map it above or below 4G. For starters this logic can be simply to only map 64bit bars of root bus devices above 4G, which completely sidesteps the bridge issues for now.
Once qemu has full pci-e support we probably want refine this logic, which probably requires an extra pass between pci bar discovery and placing the bars into the memory regions we have.
I was looking at this last week, along with the coreboot PCI code. I think this could be done in four "passes" - first pass to find the total size, minimum alignment, and maximum address of all resources for all regular devices. The second pass propagates this info to bridges. The third pass allocates the resources. The fourth pass assigns the resources to the devices.
The current code already effectively has these four "passes" - the thing I think is missing is tracking the "maximum address" of resources, propagating that to bridges, and assigning allocations above 4G.
-Kevin