Hello Tim Chu,
I'd like you to do a code review. Please visit
https://review.coreboot.org/c/coreboot/+/46898
to review the following change.
Change subject: cpu/x86: increase timeout for CPUs to check in after 2nd SIPI ......................................................................
cpu/x86: increase timeout for CPUs to check in after 2nd SIPI
Increase timeout for CPUs to check in after 2nd SIPI completion from 10ms to 100ms.
Update logging level for mp init failure cases from BIOS_DEBUG to BIOS_ERR.
Without this patch, "mp initialization failure" happens on some reboots on the DeltaLake server. As a consequence, not all 52 CPUs come up in Linux: [root@localhost ~]# lscpu ... CPU(s): 40
The following hardware errors are also seen: [ 4.365762] mce: [Hardware Error]: Machine check events logged [ 4.366565] mce: [Hardware Error]: CPU 0: Machine Check: 0 Bank 9: ee2000000003110a [ 4.367561] mce: [Hardware Error]: TSC 0 ADDR fe9e0000 MISC 228aa040101086 [ 4.368563] mce: [Hardware Error]: PROCESSOR 0:5065b TIME 948438164 SOCKET 0 APIC 0 microcode 700001d
With this patch, no such failure was observed in 370 reboots.
Signed-off-by: Tim Chu <Tim.Chu@quantatw.com>
Signed-off-by: Jonathan Zhang <jonzhang@fb.com>
Change-Id: Iab10f116dd4af152c24d5d8f999928c038a5b208
---
M src/cpu/x86/mp_init.c
1 file changed, 7 insertions(+), 7 deletions(-)
git pull ssh://review.coreboot.org:29418/coreboot refs/changes/98/46898/1
diff --git a/src/cpu/x86/mp_init.c b/src/cpu/x86/mp_init.c index 4870529..db21262 100644 --- a/src/cpu/x86/mp_init.c +++ b/src/cpu/x86/mp_init.c @@ -434,7 +434,7 @@ if ((lapic_read(LAPIC_ICR) & LAPIC_ICR_BUSY)) { printk(BIOS_DEBUG, "Waiting for ICR not to be busy..."); if (apic_wait_timeout(1000 /* 1 ms */, 50)) { - printk(BIOS_DEBUG, "timed out. Aborting.\n"); + printk(BIOS_ERR, "timed out. Aborting.\n"); return -1; } printk(BIOS_DEBUG, "done.\n"); @@ -451,7 +451,7 @@ if ((lapic_read(LAPIC_ICR) & LAPIC_ICR_BUSY)) { printk(BIOS_DEBUG, "Waiting for ICR not to be busy..."); if (apic_wait_timeout(1000 /* 1 ms */, 50)) { - printk(BIOS_DEBUG, "timed out. Aborting.\n"); + printk(BIOS_ERR, "timed out. Aborting.\n"); return -1; } printk(BIOS_DEBUG, "done.\n"); @@ -462,7 +462,7 @@ LAPIC_DM_STARTUP | sipi_vector); printk(BIOS_DEBUG, "Waiting for 1st SIPI to complete..."); if (apic_wait_timeout(10000 /* 10 ms */, 50 /* us */)) { - printk(BIOS_DEBUG, "timed out.\n"); + printk(BIOS_ERR, "timed out.\n"); return -1; } printk(BIOS_DEBUG, "done.\n"); @@ -477,7 +477,7 @@ if ((lapic_read(LAPIC_ICR) & LAPIC_ICR_BUSY)) { printk(BIOS_DEBUG, "Waiting for ICR not to be busy..."); if (apic_wait_timeout(1000 /* 1 ms */, 50)) { - printk(BIOS_DEBUG, "timed out. Aborting.\n"); + printk(BIOS_ERR, "timed out. Aborting.\n"); return -1; } printk(BIOS_DEBUG, "done.\n"); @@ -488,14 +488,14 @@ LAPIC_DM_STARTUP | sipi_vector); printk(BIOS_DEBUG, "Waiting for 2nd SIPI to complete..."); if (apic_wait_timeout(10000 /* 10 ms */, 50 /* us */)) { - printk(BIOS_DEBUG, "timed out.\n"); + printk(BIOS_ERR, "timed out.\n"); return -1; } printk(BIOS_DEBUG, "done.\n");
/* Wait for CPUs to check in. */ - if (wait_for_aps(num_aps, ap_count, 10000 /* 10 ms */, 50 /* us */)) { - printk(BIOS_DEBUG, "Not all APs checked in: %d/%d.\n", + if (wait_for_aps(num_aps, ap_count, 100000 /* 100 ms */, 50 /* us */)) { + printk(BIOS_ERR, "Not all APs checked in: %d/%d.\n", atomic_read(num_aps), ap_count); return -1; }