OS usually expects BIOS to set certain bits in MSR_IA32_FEATURE_CONTROL for some features (e.g. VMX and LMCE). QEMU provides a fw_cfg file "etc/msr_feature_control" to advise bits that should be set in MSR_IA32_FEATURE_CONTROL. If this file exists, SeaBIOS will set the advised bits in that MSR.
Signed-off-by: Haozhong Zhang haozhong.zhang@intel.com Reviewed-by: Paolo Bonzini pbonzini@redhat.com --- Changes in v3: * Move msr_feature_control_setup() to paravirt.c.
Changes in v2: * Call msr_feature_control_setup() before smp_setup(). * Use wrmsr_smp() instead of wrmsr() on BSP. * Rename smp_mtrr and smp_mtrr_count to smp_msr and smp_msr_count as they are not only used for MTRR now. --- src/fw/paravirt.c | 12 +++++++++++- src/fw/smp.c | 20 ++++++++++---------- 2 files changed, 21 insertions(+), 11 deletions(-)
diff --git a/src/fw/paravirt.c b/src/fw/paravirt.c index 8ed4380..73a08f0 100644 --- a/src/fw/paravirt.c +++ b/src/fw/paravirt.c @@ -131,6 +131,15 @@ qemu_preinit(void) dprintf(1, "RamSize: 0x%08x [cmos]\n", RamSize); }
+#define MSR_IA32_FEATURE_CONTROL 0x0000003a + +static void msr_feature_control_setup(void) +{ + u64 feature_control_bits = romfile_loadint("etc/msr_feature_control", 0); + if (feature_control_bits) + wrmsr_smp(MSR_IA32_FEATURE_CONTROL, feature_control_bits); +} + void qemu_platform_setup(void) { @@ -149,8 +158,9 @@ qemu_platform_setup(void) smm_device_setup(); smm_setup();
- // Initialize mtrr and smp + // Initialize mtrr, msr_feature_control and smp mtrr_setup(); + msr_feature_control_setup(); smp_setup();
// Create bios tables diff --git a/src/fw/smp.c b/src/fw/smp.c index 579acdb..6e706e4 100644 --- a/src/fw/smp.c +++ b/src/fw/smp.c @@ -10,7 +10,7 @@ #include "output.h" // dprintf #include "romfile.h" // romfile_loadint #include "stacks.h" // yield -#include "util.h" // smp_setup +#include "util.h" // smp_setup, msr_feature_control_setup #include "x86.h" // wrmsr
#define APIC_ICR_LOW ((u8*)BUILD_APIC_ADDR + 0x300) @@ -20,20 +20,20 @@
#define APIC_ENABLED 0x0100
-static struct { u32 index; u64 val; } smp_mtrr[32]; -static u32 smp_mtrr_count; +static struct { u32 index; u64 val; } smp_msr[32]; +static u32 smp_msr_count;
void wrmsr_smp(u32 index, u64 val) { wrmsr(index, val); - if (smp_mtrr_count >= ARRAY_SIZE(smp_mtrr)) { + if (smp_msr_count >= ARRAY_SIZE(smp_msr)) { warn_noalloc(); return; } - smp_mtrr[smp_mtrr_count].index = index; - smp_mtrr[smp_mtrr_count].val = val; - smp_mtrr_count++; + smp_msr[smp_msr_count].index = index; + smp_msr[smp_msr_count].val = val; + smp_msr_count++; }
u32 MaxCountCPUs; @@ -58,10 +58,10 @@ handle_smp(void) u8 apic_id = ebx>>24; dprintf(DEBUG_HDL_smp, "handle_smp: apic_id=%d\n", apic_id);
- // MTRR setup + // MTRR and MSR_IA32_FEATURE_CONTROL setup int i; - for (i=0; i<smp_mtrr_count; i++) - wrmsr(smp_mtrr[i].index, smp_mtrr[i].val); + for (i=0; i<smp_msr_count; i++) + wrmsr(smp_msr[i].index, smp_msr[i].val);
// Set bit on FoundAPICIDs FoundAPICIDs[apic_id/32] |= (1 << (apic_id % 32));
On Mi, 2016-06-22 at 14:53 +0800, Haozhong Zhang wrote:
OS usually expects BIOS to set certain bits in MSR_IA32_FEATURE_CONTROL for some features (e.g. VMX and LMCE). QEMU provides a fw_cfg file "etc/msr_feature_control" to advise bits that should be set in MSR_IA32_FEATURE_CONTROL. If this file exists, SeaBIOS will set the advised bits in that MSR.
Committed to master, cherry-picked into 1.9-stable.
thanks, Gerd
On 06/22/16 08:53, Haozhong Zhang wrote:
OS usually expects BIOS to set certain bits in MSR_IA32_FEATURE_CONTROL for some features (e.g. VMX and LMCE). QEMU provides a fw_cfg file "etc/msr_feature_control" to advise bits that should be set in MSR_IA32_FEATURE_CONTROL. If this file exists, SeaBIOS will set the advised bits in that MSR.
Signed-off-by: Haozhong Zhang haozhong.zhang@intel.com Reviewed-by: Paolo Bonzini pbonzini@redhat.com
Changes in v3:
- Move msr_feature_control_setup() to paravirt.c.
Changes in v2:
- Call msr_feature_control_setup() before smp_setup().
- Use wrmsr_smp() instead of wrmsr() on BSP.
- Rename smp_mtrr and smp_mtrr_count to smp_msr and smp_msr_count as they are not only used for MTRR now.
src/fw/paravirt.c | 12 +++++++++++- src/fw/smp.c | 20 ++++++++++---------- 2 files changed, 21 insertions(+), 11 deletions(-)
diff --git a/src/fw/paravirt.c b/src/fw/paravirt.c index 8ed4380..73a08f0 100644 --- a/src/fw/paravirt.c +++ b/src/fw/paravirt.c @@ -131,6 +131,15 @@ qemu_preinit(void) dprintf(1, "RamSize: 0x%08x [cmos]\n", RamSize); }
+#define MSR_IA32_FEATURE_CONTROL 0x0000003a
+static void msr_feature_control_setup(void) +{
- u64 feature_control_bits = romfile_loadint("etc/msr_feature_control", 0);
- if (feature_control_bits)
wrmsr_smp(MSR_IA32_FEATURE_CONTROL, feature_control_bits);
+}
void qemu_platform_setup(void) { @@ -149,8 +158,9 @@ qemu_platform_setup(void) smm_device_setup(); smm_setup();
- // Initialize mtrr and smp
// Initialize mtrr, msr_feature_control and smp mtrr_setup();
msr_feature_control_setup(); smp_setup();
// Create bios tables
diff --git a/src/fw/smp.c b/src/fw/smp.c index 579acdb..6e706e4 100644 --- a/src/fw/smp.c +++ b/src/fw/smp.c @@ -10,7 +10,7 @@ #include "output.h" // dprintf #include "romfile.h" // romfile_loadint #include "stacks.h" // yield -#include "util.h" // smp_setup +#include "util.h" // smp_setup, msr_feature_control_setup #include "x86.h" // wrmsr
#define APIC_ICR_LOW ((u8*)BUILD_APIC_ADDR + 0x300) @@ -20,20 +20,20 @@
#define APIC_ENABLED 0x0100
-static struct { u32 index; u64 val; } smp_mtrr[32]; -static u32 smp_mtrr_count; +static struct { u32 index; u64 val; } smp_msr[32]; +static u32 smp_msr_count;
void wrmsr_smp(u32 index, u64 val) { wrmsr(index, val);
- if (smp_mtrr_count >= ARRAY_SIZE(smp_mtrr)) {
- if (smp_msr_count >= ARRAY_SIZE(smp_msr)) { warn_noalloc(); return; }
- smp_mtrr[smp_mtrr_count].index = index;
- smp_mtrr[smp_mtrr_count].val = val;
- smp_mtrr_count++;
- smp_msr[smp_msr_count].index = index;
- smp_msr[smp_msr_count].val = val;
- smp_msr_count++;
}
u32 MaxCountCPUs; @@ -58,10 +58,10 @@ handle_smp(void) u8 apic_id = ebx>>24; dprintf(DEBUG_HDL_smp, "handle_smp: apic_id=%d\n", apic_id);
- // MTRR setup
- // MTRR and MSR_IA32_FEATURE_CONTROL setup int i;
- for (i=0; i<smp_mtrr_count; i++)
wrmsr(smp_mtrr[i].index, smp_mtrr[i].val);
for (i=0; i<smp_msr_count; i++)
wrmsr(smp_msr[i].index, smp_msr[i].val);
// Set bit on FoundAPICIDs FoundAPICIDs[apic_id/32] |= (1 << (apic_id % 32));
For https://github.com/tianocore/edk2/issues/97 (i.e., doing the same in OVMF), I've been trying to come up with designs.
The combined requirement that the MSR be written on all CPUs *and* at S3 resume too is a big complication for OVMF. If we can relax either of those (that is, write the MSR on all CPUs but only on cold boot, or else, write the MSR at cold boot and resume too, but only on the BSP), that would be a relief for OVMF.
So I recalled that this patch programmed the MSR on all APs, but I wanted to see if it really did the same on S3 resume. And, I think it doesn't. I tried to construct a "maximal" call tree from the bottom up, and this is all I can come up with:
reset_vector() [src/romlayout.S] entry_post() [src/romlayout.S] handle_post() [src/post.c] dopost() [src/post.c] reloc_preinit() [src/post.c] maininit() [src/post.c] platform_hardware_setup() [src/post.c] qemu_platform_setup() [src/fw/paravirt.c] msr_feature_control_setup() [src/fw/paravirt.c] wrmsr_smp() [src/fw/smp.c] wrmsr() [src/x86.h] -- on the BSP // + stash in array smp_setup() [src/fw/smp.c] entry_smp() [src/romlayout.S] -- on all APs handle_smp() [src/fw/smp.c] wrmsr() from array [src/x86.h]
smp_setup() is not reached on the S3 resume path:
reset_vector() [src/romlayout.S] entry_post() [src/romlayout.S] entry_resume() [src/romlayout.S] <--- on a different branch here handle_resume() [src/resume.c] handle_resume32() [src/resume.c] s3_resume() [src/resume.c] ... farcall16big()
I also checked "docs/Execution_and_code_flow.md", and my findings seem to be consistent with it.
Now, MSR_IA32_FEATURE_CONTROL not being set on S3 is perfectly fine with me. I just want to avoid implementing something for OVMF that is several times harder than necessary.
So, is the fact (*) that SeaBIOS does not program the MSR on S3 a bug (or missing feature) in SeaBIOS, or is it okay? (For example because the resume code in Linux copies the MSR's value from the BSP to the APs anyway?)
If it is okay, then we should remove the S3 requirement from https://github.com/tianocore/edk2/issues/97.
(*) "fact" unless I missed something, of course
Thanks! Laszlo
On 07/06/16 00:19, Laszlo Ersek wrote:
On 06/22/16 08:53, Haozhong Zhang wrote:
OS usually expects BIOS to set certain bits in MSR_IA32_FEATURE_CONTROL for some features (e.g. VMX and LMCE). QEMU provides a fw_cfg file "etc/msr_feature_control" to advise bits that should be set in MSR_IA32_FEATURE_CONTROL. If this file exists, SeaBIOS will set the advised bits in that MSR.
Signed-off-by: Haozhong Zhang haozhong.zhang@intel.com Reviewed-by: Paolo Bonzini pbonzini@redhat.com
Changes in v3:
- Move msr_feature_control_setup() to paravirt.c.
Changes in v2:
- Call msr_feature_control_setup() before smp_setup().
- Use wrmsr_smp() instead of wrmsr() on BSP.
- Rename smp_mtrr and smp_mtrr_count to smp_msr and smp_msr_count as they are not only used for MTRR now.
src/fw/paravirt.c | 12 +++++++++++- src/fw/smp.c | 20 ++++++++++---------- 2 files changed, 21 insertions(+), 11 deletions(-)
diff --git a/src/fw/paravirt.c b/src/fw/paravirt.c index 8ed4380..73a08f0 100644 --- a/src/fw/paravirt.c +++ b/src/fw/paravirt.c @@ -131,6 +131,15 @@ qemu_preinit(void) dprintf(1, "RamSize: 0x%08x [cmos]\n", RamSize); }
+#define MSR_IA32_FEATURE_CONTROL 0x0000003a
+static void msr_feature_control_setup(void) +{
- u64 feature_control_bits = romfile_loadint("etc/msr_feature_control", 0);
- if (feature_control_bits)
wrmsr_smp(MSR_IA32_FEATURE_CONTROL, feature_control_bits);
+}
void qemu_platform_setup(void) { @@ -149,8 +158,9 @@ qemu_platform_setup(void) smm_device_setup(); smm_setup();
- // Initialize mtrr and smp
// Initialize mtrr, msr_feature_control and smp mtrr_setup();
msr_feature_control_setup(); smp_setup();
// Create bios tables
diff --git a/src/fw/smp.c b/src/fw/smp.c index 579acdb..6e706e4 100644 --- a/src/fw/smp.c +++ b/src/fw/smp.c @@ -10,7 +10,7 @@ #include "output.h" // dprintf #include "romfile.h" // romfile_loadint #include "stacks.h" // yield -#include "util.h" // smp_setup +#include "util.h" // smp_setup, msr_feature_control_setup #include "x86.h" // wrmsr
#define APIC_ICR_LOW ((u8*)BUILD_APIC_ADDR + 0x300) @@ -20,20 +20,20 @@
#define APIC_ENABLED 0x0100
-static struct { u32 index; u64 val; } smp_mtrr[32]; -static u32 smp_mtrr_count; +static struct { u32 index; u64 val; } smp_msr[32]; +static u32 smp_msr_count;
void wrmsr_smp(u32 index, u64 val) { wrmsr(index, val);
- if (smp_mtrr_count >= ARRAY_SIZE(smp_mtrr)) {
- if (smp_msr_count >= ARRAY_SIZE(smp_msr)) { warn_noalloc(); return; }
- smp_mtrr[smp_mtrr_count].index = index;
- smp_mtrr[smp_mtrr_count].val = val;
- smp_mtrr_count++;
- smp_msr[smp_msr_count].index = index;
- smp_msr[smp_msr_count].val = val;
- smp_msr_count++;
}
u32 MaxCountCPUs; @@ -58,10 +58,10 @@ handle_smp(void) u8 apic_id = ebx>>24; dprintf(DEBUG_HDL_smp, "handle_smp: apic_id=%d\n", apic_id);
- // MTRR setup
- // MTRR and MSR_IA32_FEATURE_CONTROL setup int i;
- for (i=0; i<smp_mtrr_count; i++)
wrmsr(smp_mtrr[i].index, smp_mtrr[i].val);
for (i=0; i<smp_msr_count; i++)
wrmsr(smp_msr[i].index, smp_msr[i].val);
// Set bit on FoundAPICIDs FoundAPICIDs[apic_id/32] |= (1 << (apic_id % 32));
For https://github.com/tianocore/edk2/issues/97 (i.e., doing the same in OVMF), I've been trying to come up with designs.
The combined requirement that the MSR be written on all CPUs *and* at S3 resume too is a big complication for OVMF. If we can relax either of those (that is, write the MSR on all CPUs but only on cold boot, or else, write the MSR at cold boot and resume too, but only on the BSP), that would be a relief for OVMF.
So I recalled that this patch programmed the MSR on all APs, but I wanted to see if it really did the same on S3 resume. And, I think it doesn't. I tried to construct a "maximal" call tree from the bottom up, and this is all I can come up with:
reset_vector() [src/romlayout.S] entry_post() [src/romlayout.S] handle_post() [src/post.c] dopost() [src/post.c] reloc_preinit() [src/post.c] maininit() [src/post.c] platform_hardware_setup() [src/post.c] qemu_platform_setup() [src/fw/paravirt.c] msr_feature_control_setup() [src/fw/paravirt.c] wrmsr_smp() [src/fw/smp.c] wrmsr() [src/x86.h] -- on the BSP // + stash in array smp_setup() [src/fw/smp.c] entry_smp() [src/romlayout.S] -- on all APs handle_smp() [src/fw/smp.c] wrmsr() from array [src/x86.h]
smp_setup() is not reached on the S3 resume path:
reset_vector() [src/romlayout.S] entry_post() [src/romlayout.S] entry_resume() [src/romlayout.S] <--- on a different branch here handle_resume() [src/resume.c] handle_resume32() [src/resume.c] s3_resume() [src/resume.c] ... farcall16big()
I also checked "docs/Execution_and_code_flow.md", and my findings seem to be consistent with it.
I forgot to restore MSR_IA32_FEATURE_CONTROL in the resume path, and MSR_IA32_FEATURE_CONTROL is zero after S3 resume.
Now, MSR_IA32_FEATURE_CONTROL not being set on S3 is perfectly fine with me. I just want to avoid implementing something for OVMF that is several times harder than necessary.
So, is the fact (*) that SeaBIOS does not program the MSR on S3 a bug (or missing feature) in SeaBIOS, or is it okay? (For example because the resume code in Linux copies the MSR's value from the BSP to the APs anyway?)
Not sure if it can be treated as a bug or not.
Not restore MSR_IA32_FEATURE_CONTROL during S3 resume does not affect at least Linux guest (tested 4.5). Current QEMU may advise the guest firmware to set bit 20 (for LMCE), bit 2 (for VMX) and bit 0 (lock bit). - For LMCE, Linux only checks bit 20 and bit 0 at boot time and then keeps using the result even after resume. - For VMX, if KVM in Linux (guest) finds bit 20 and bit 0 are cleared in MSR_IA32_FEATURE_CONTROL while cpuid.1 says VMX is supported, it will set those bits. Therefore, KVM just works after resume even though MSR_IA32_FEATURE_CONTROL is not properly set by firmware.
I'm not sure whether other operating systems or virtual machine monitors can work properly in this case. If all of them are fine with this, I think we can remove the S3 requirement.
Thanks, Haozhong
If it is okay, then we should remove the S3 requirement from https://github.com/tianocore/edk2/issues/97.
(*) "fact" unless I missed something, of course
Thanks! Laszlo
I forgot to restore MSR_IA32_FEATURE_CONTROL in the resume path, and MSR_IA32_FEATURE_CONTROL is zero after S3 resume.
This is a bug. Sorry Laszlo. :)
Not restore MSR_IA32_FEATURE_CONTROL during S3 resume does not affect at least Linux guest (tested 4.5). Current QEMU may advise the guest firmware to set bit 20 (for LMCE), bit 2 (for VMX) and bit 0 (lock bit).
- For LMCE, Linux only checks bit 20 and bit 0 at boot time and then keeps using the result even after resume.
On real hardware, LMCE would not be enabled after resume. I'm not sure what would happen, but it wouldn't be good.
Paolo
Hi Ashok,
On 07/06/16 02:18, Paolo Bonzini wrote:
I forgot to restore MSR_IA32_FEATURE_CONTROL in the resume path, and MSR_IA32_FEATURE_CONTROL is zero after S3 resume.
This is a bug. Sorry Laszlo. :)
Not restore MSR_IA32_FEATURE_CONTROL during S3 resume does not affect at least Linux guest (tested 4.5). Current QEMU may advise the guest firmware to set bit 20 (for LMCE), bit 2 (for VMX) and bit 0 (lock bit).
- For LMCE, Linux only checks bit 20 and bit 0 at boot time and then keeps using the result even after resume.
On real hardware, LMCE would not be enabled after resume. I'm not sure what would happen, but it wouldn't be good.
Could you help to check if the LMCE bit in MSR_IA32_FEATURE_CONTROL is set after S3 resume on the real hardware?
Thanks, Haozhong
On 07/06/16 08:28, Haozhong Zhang wrote:
Hi Ashok,
On 07/06/16 02:18, Paolo Bonzini wrote:
I forgot to restore MSR_IA32_FEATURE_CONTROL in the resume path, and MSR_IA32_FEATURE_CONTROL is zero after S3 resume.
This is a bug. Sorry Laszlo. :)
Not restore MSR_IA32_FEATURE_CONTROL during S3 resume does not affect at least Linux guest (tested 4.5). Current QEMU may advise the guest firmware to set bit 20 (for LMCE), bit 2 (for VMX) and bit 0 (lock bit).
- For LMCE, Linux only checks bit 20 and bit 0 at boot time and then keeps using the result even after resume.
On real hardware, LMCE would not be enabled after resume. I'm not sure what would happen, but it wouldn't be good.
Could you help to check if the LMCE bit in MSR_IA32_FEATURE_CONTROL is set after S3 resume on the real hardware?
The SDM says that IA32_FEATURE_CONTROL is zeroed on logical processor reset.
23.7 ENABLING AND ENTERING VMX OPERATION
[...] VMXON is also controlled by the IA32_FEATURE_CONTROL MSR (MSR address 3AH). This MSR is cleared to zero when a logical processor is reset. [...]
Thanks Laszlo
On 07/06/16 08:42, Laszlo Ersek wrote:
On 07/06/16 08:28, Haozhong Zhang wrote:
Hi Ashok,
On 07/06/16 02:18, Paolo Bonzini wrote:
I forgot to restore MSR_IA32_FEATURE_CONTROL in the resume path, and MSR_IA32_FEATURE_CONTROL is zero after S3 resume.
This is a bug. Sorry Laszlo. :)
Not restore MSR_IA32_FEATURE_CONTROL during S3 resume does not affect at least Linux guest (tested 4.5). Current QEMU may advise the guest firmware to set bit 20 (for LMCE), bit 2 (for VMX) and bit 0 (lock bit).
- For LMCE, Linux only checks bit 20 and bit 0 at boot time and then keeps using the result even after resume.
On real hardware, LMCE would not be enabled after resume. I'm not sure what would happen, but it wouldn't be good.
Could you help to check if the LMCE bit in MSR_IA32_FEATURE_CONTROL is set after S3 resume on the real hardware?
The SDM says that IA32_FEATURE_CONTROL is zeroed on logical processor reset.
23.7 ENABLING AND ENTERING VMX OPERATION
[...] VMXON is also controlled by the IA32_FEATURE_CONTROL MSR (MSR address 3AH). This MSR is cleared to zero when a logical processor is reset. [...]
Ah, I missed a bit in my question. I meant to check whether the firmware on the real machine sets the LMCE bit and other necessary bits in MSR_IA32_FEATURE_CONTROL after S3 resume.
On 07/06/16 08:49, Haozhong Zhang wrote:
On 07/06/16 08:42, Laszlo Ersek wrote:
On 07/06/16 08:28, Haozhong Zhang wrote:
Hi Ashok,
On 07/06/16 02:18, Paolo Bonzini wrote:
I forgot to restore MSR_IA32_FEATURE_CONTROL in the resume path, and MSR_IA32_FEATURE_CONTROL is zero after S3 resume.
This is a bug. Sorry Laszlo. :)
Not restore MSR_IA32_FEATURE_CONTROL during S3 resume does not affect at least Linux guest (tested 4.5). Current QEMU may advise the guest firmware to set bit 20 (for LMCE), bit 2 (for VMX) and bit 0 (lock bit).
- For LMCE, Linux only checks bit 20 and bit 0 at boot time and then keeps using the result even after resume.
On real hardware, LMCE would not be enabled after resume. I'm not sure what would happen, but it wouldn't be good.
Could you help to check if the LMCE bit in MSR_IA32_FEATURE_CONTROL is set after S3 resume on the real hardware?
The SDM says that IA32_FEATURE_CONTROL is zeroed on logical processor reset.
23.7 ENABLING AND ENTERING VMX OPERATION
[...] VMXON is also controlled by the IA32_FEATURE_CONTROL MSR (MSR address 3AH). This MSR is cleared to zero when a logical processor is reset. [...]
Ah, I missed a bit in my question. I meant to check whether the firmware on the real machine sets the LMCE bit and other necessary bits in MSR_IA32_FEATURE_CONTROL after S3 resume.
I attached a minimal kernel module (reproducer / tester) to the github issue here:
https://github.com/tianocore/edk2/issues/97#issuecomment-230697897
We can use it for both testing the feature in guests, and for querying the MSR on physical machines.
Specifically on my ThinkPad W541, the MSR has value 0x5:
[ 2885.877339] MSR 0x3a on CPU 0: 0x5 [ 2908.151693] MSR 0x3a on CPU 1: 0x5
which, according to
#define FEATURE_CONTROL_LOCKED (1<<0) #define FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX (1<<1) #define FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX (1<<2) #define FEATURE_CONTROL_LMCE (1<<20)
corresponds to
(FEATURE_CONTROL_LOCKED | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX).
FEATURE_CONTROL_LMCE is not set at all. (I didn't do any S3 cycles in my current laptopt boot.)
My CPU is i7-4810MQ. It's a pretty modern laptop, so I think it is capable of LMCE, hardware-wise (if LMCE is hw-dependent, to begin with).
In order to verify if my laptop was indeed capable of LCME, I read the IA32_MCG_CAP MSR as well:
# insmod ./rdmsr.ko msr=0x00000179 MSR 0x179 on CPU 0: 0xc09
"MCG_LMCE_P" is bit 27 (value 0x8000000). So, apparently, I was wrong; my laptop does not support LMCE, and it's not surprising the BIOS doesn't set the LCME bit in the feature control MSR :)
Anyway I think you should be able to use the kernel module for experimenting with MSRs on other hosts.
Thanks Laszlo
On 07/06/16 09:44, Laszlo Ersek wrote:
On 07/06/16 08:49, Haozhong Zhang wrote:
On 07/06/16 08:42, Laszlo Ersek wrote:
On 07/06/16 08:28, Haozhong Zhang wrote:
Hi Ashok,
On 07/06/16 02:18, Paolo Bonzini wrote:
I forgot to restore MSR_IA32_FEATURE_CONTROL in the resume path, and MSR_IA32_FEATURE_CONTROL is zero after S3 resume.
This is a bug. Sorry Laszlo. :)
Not restore MSR_IA32_FEATURE_CONTROL during S3 resume does not affect at least Linux guest (tested 4.5). Current QEMU may advise the guest firmware to set bit 20 (for LMCE), bit 2 (for VMX) and bit 0 (lock bit).
- For LMCE, Linux only checks bit 20 and bit 0 at boot time and then keeps using the result even after resume.
On real hardware, LMCE would not be enabled after resume. I'm not sure what would happen, but it wouldn't be good.
Could you help to check if the LMCE bit in MSR_IA32_FEATURE_CONTROL is set after S3 resume on the real hardware?
The SDM says that IA32_FEATURE_CONTROL is zeroed on logical processor reset.
23.7 ENABLING AND ENTERING VMX OPERATION
[...] VMXON is also controlled by the IA32_FEATURE_CONTROL MSR (MSR address 3AH). This MSR is cleared to zero when a logical processor is reset. [...]
Ah, I missed a bit in my question. I meant to check whether the firmware on the real machine sets the LMCE bit and other necessary bits in MSR_IA32_FEATURE_CONTROL after S3 resume.
I attached a minimal kernel module (reproducer / tester) to the github issue here:
https://github.com/tianocore/edk2/issues/97#issuecomment-230697897
We can use it for both testing the feature in guests, and for querying the MSR on physical machines.
Thanks for the kernel module! In fact, I'm using the rdmsr command provided by msr-tools (which is included in some Linux distros).
Specifically on my ThinkPad W541, the MSR has value 0x5:
[ 2885.877339] MSR 0x3a on CPU 0: 0x5 [ 2908.151693] MSR 0x3a on CPU 1: 0x5
which, according to
#define FEATURE_CONTROL_LOCKED (1<<0) #define FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX (1<<1) #define FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX (1<<2) #define FEATURE_CONTROL_LMCE (1<<20)
corresponds to
(FEATURE_CONTROL_LOCKED | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX).
FEATURE_CONTROL_LMCE is not set at all. (I didn't do any S3 cycles in my current laptopt boot.)
My CPU is i7-4810MQ. It's a pretty modern laptop, so I think it is capable of LMCE, hardware-wise (if LMCE is hw-dependent, to begin with).
LMCE is supported only by server CPUs (Skylake-EX or later) and not available on desktop CPUs.
For test in guest, you could follow what I added in "How to verify?" https://github.com/tianocore/edk2/issues/97.
Thanks, Haozhong
In order to verify if my laptop was indeed capable of LCME, I read the IA32_MCG_CAP MSR as well:
# insmod ./rdmsr.ko msr=0x00000179 MSR 0x179 on CPU 0: 0xc09
"MCG_LMCE_P" is bit 27 (value 0x8000000). So, apparently, I was wrong; my laptop does not support LMCE, and it's not surprising the BIOS doesn't set the LCME bit in the feature control MSR :)
Anyway I think you should be able to use the kernel module for experimenting with MSRs on other hosts.
Thanks Laszlo
On 07/06/16 10:48, Haozhong Zhang wrote:
On 07/06/16 09:44, Laszlo Ersek wrote:
On 07/06/16 08:49, Haozhong Zhang wrote:
On 07/06/16 08:42, Laszlo Ersek wrote:
On 07/06/16 08:28, Haozhong Zhang wrote:
Hi Ashok,
On 07/06/16 02:18, Paolo Bonzini wrote:
> I forgot to restore MSR_IA32_FEATURE_CONTROL in the resume path, and > MSR_IA32_FEATURE_CONTROL is zero after S3 resume.
This is a bug. Sorry Laszlo. :)
> Not restore MSR_IA32_FEATURE_CONTROL during S3 resume does not affect > at least Linux guest (tested 4.5). Current QEMU may advise the guest > firmware to set bit 20 (for LMCE), bit 2 (for VMX) and bit 0 (lock > bit). > - For LMCE, Linux only checks bit 20 and bit 0 at boot time and then > keeps using the result even after resume.
On real hardware, LMCE would not be enabled after resume. I'm not sure what would happen, but it wouldn't be good.
Could you help to check if the LMCE bit in MSR_IA32_FEATURE_CONTROL is set after S3 resume on the real hardware?
The SDM says that IA32_FEATURE_CONTROL is zeroed on logical processor reset.
23.7 ENABLING AND ENTERING VMX OPERATION
[...] VMXON is also controlled by the IA32_FEATURE_CONTROL MSR (MSR address 3AH). This MSR is cleared to zero when a logical processor is reset. [...]
Ah, I missed a bit in my question. I meant to check whether the firmware on the real machine sets the LMCE bit and other necessary bits in MSR_IA32_FEATURE_CONTROL after S3 resume.
I attached a minimal kernel module (reproducer / tester) to the github issue here:
https://github.com/tianocore/edk2/issues/97#issuecomment-230697897
We can use it for both testing the feature in guests, and for querying the MSR on physical machines.
Thanks for the kernel module! In fact, I'm using the rdmsr command provided by msr-tools (which is included in some Linux distros).
Welp, that package is available on RHEL-7 too (from EPEL), I just didn't have it installed. (I guessed that the command would be called "rdmsr", and tried to run it, but when that failed, I didn't look for any packages.)
Specifically on my ThinkPad W541, the MSR has value 0x5:
[ 2885.877339] MSR 0x3a on CPU 0: 0x5 [ 2908.151693] MSR 0x3a on CPU 1: 0x5
which, according to
#define FEATURE_CONTROL_LOCKED (1<<0) #define FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX (1<<1) #define FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX (1<<2) #define FEATURE_CONTROL_LMCE (1<<20)
corresponds to
(FEATURE_CONTROL_LOCKED | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX).
FEATURE_CONTROL_LMCE is not set at all. (I didn't do any S3 cycles in my current laptopt boot.)
My CPU is i7-4810MQ. It's a pretty modern laptop, so I think it is capable of LMCE, hardware-wise (if LMCE is hw-dependent, to begin with).
LMCE is supported only by server CPUs (Skylake-EX or later) and not available on desktop CPUs.
For test in guest, you could follow what I added in "How to verify?" https://github.com/tianocore/edk2/issues/97.
Great, apparently I can't even read. Those steps are in the first comment. :/
Thanks Laszlo
On 07/06/16 11:00, Laszlo Ersek wrote:
On 07/06/16 10:48, Haozhong Zhang wrote:
For test in guest, you could follow what I added in "How to verify?" https://github.com/tianocore/edk2/issues/97.
Great, apparently I can't even read. Those steps are in the first comment. :/
Ah wait, you must have edited that comment in-place, just now. I checked my github issue mail, and the first version of that comment didn't contain the verification steps.
I absolutely hate that github allows in-place comment edits (without sending them out in separate emails). Bugzilla doesn't allow this (unless a user has extra privileges).
So, thank you very much for the verification steps :), and next time please add them in a separate comment. Otherwise I might not find them :)
Thanks! Laszlo
On 07/06/16 11:00, Laszlo Ersek wrote:
On 07/06/16 10:48, Haozhong Zhang wrote:
On 07/06/16 09:44, Laszlo Ersek wrote:
On 07/06/16 08:49, Haozhong Zhang wrote:
On 07/06/16 08:42, Laszlo Ersek wrote:
On 07/06/16 08:28, Haozhong Zhang wrote:
Hi Ashok,
On 07/06/16 02:18, Paolo Bonzini wrote: >> I forgot to restore MSR_IA32_FEATURE_CONTROL in the resume path, and >> MSR_IA32_FEATURE_CONTROL is zero after S3 resume. > > This is a bug. Sorry Laszlo. :) > >> Not restore MSR_IA32_FEATURE_CONTROL during S3 resume does not affect >> at least Linux guest (tested 4.5). Current QEMU may advise the guest >> firmware to set bit 20 (for LMCE), bit 2 (for VMX) and bit 0 (lock >> bit). >> - For LMCE, Linux only checks bit 20 and bit 0 at boot time and then >> keeps using the result even after resume. > > On real hardware, LMCE would not be enabled after resume. I'm not > sure what would happen, but it wouldn't be good.
Could you help to check if the LMCE bit in MSR_IA32_FEATURE_CONTROL is set after S3 resume on the real hardware?
The SDM says that IA32_FEATURE_CONTROL is zeroed on logical processor reset.
23.7 ENABLING AND ENTERING VMX OPERATION
[...] VMXON is also controlled by the IA32_FEATURE_CONTROL MSR (MSR address 3AH). This MSR is cleared to zero when a logical processor is reset. [...]
Ah, I missed a bit in my question. I meant to check whether the firmware on the real machine sets the LMCE bit and other necessary bits in MSR_IA32_FEATURE_CONTROL after S3 resume.
I attached a minimal kernel module (reproducer / tester) to the github issue here:
https://github.com/tianocore/edk2/issues/97#issuecomment-230697897
We can use it for both testing the feature in guests, and for querying the MSR on physical machines.
Thanks for the kernel module! In fact, I'm using the rdmsr command provided by msr-tools (which is included in some Linux distros).
Welp, that package is available on RHEL-7 too (from EPEL), I just didn't have it installed. (I guessed that the command would be called "rdmsr", and tried to run it, but when that failed, I didn't look for any packages.)
Specifically on my ThinkPad W541, the MSR has value 0x5:
[ 2885.877339] MSR 0x3a on CPU 0: 0x5 [ 2908.151693] MSR 0x3a on CPU 1: 0x5
which, according to
#define FEATURE_CONTROL_LOCKED (1<<0) #define FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX (1<<1) #define FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX (1<<2) #define FEATURE_CONTROL_LMCE (1<<20)
corresponds to
(FEATURE_CONTROL_LOCKED | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX).
FEATURE_CONTROL_LMCE is not set at all. (I didn't do any S3 cycles in my current laptopt boot.)
My CPU is i7-4810MQ. It's a pretty modern laptop, so I think it is capable of LMCE, hardware-wise (if LMCE is hw-dependent, to begin with).
LMCE is supported only by server CPUs (Skylake-EX or later) and not available on desktop CPUs.
For test in guest, you could follow what I added in "How to verify?" https://github.com/tianocore/edk2/issues/97.
Great, apparently I can't even read. Those steps are in the first comment. :/
I supposed every modification in github issues would trigger mail notifications to others, but apparently it does not. I'll take care in the future.
Haozhong
On 07/06/16 11:07, Haozhong Zhang wrote:
I supposed every modification in github issues would trigger mail notifications to others, but apparently it does not. I'll take care in the future.
That's actually a valid assumption, given that Bugzilla is sane like that, for example. GitHub perpetrates even worse atrocities: people don't get emails about their own comments nor metadata changes (labels etc). github's email integration is practically unusable. A bugzilla instance for edk2 has been in the making for a while now, but it's not ready just yet. I can't wait.
Thanks! Laszlo
On 07/06/16 08:42, Laszlo Ersek wrote:
On 07/06/16 08:28, Haozhong Zhang wrote:
Hi Ashok,
On 07/06/16 02:18, Paolo Bonzini wrote:
I forgot to restore MSR_IA32_FEATURE_CONTROL in the resume path, and MSR_IA32_FEATURE_CONTROL is zero after S3 resume.
This is a bug. Sorry Laszlo. :)
Not restore MSR_IA32_FEATURE_CONTROL during S3 resume does not affect at least Linux guest (tested 4.5). Current QEMU may advise the guest firmware to set bit 20 (for LMCE), bit 2 (for VMX) and bit 0 (lock bit).
- For LMCE, Linux only checks bit 20 and bit 0 at boot time and then keeps using the result even after resume.
On real hardware, LMCE would not be enabled after resume. I'm not sure what would happen, but it wouldn't be good.
Could you help to check if the LMCE bit in MSR_IA32_FEATURE_CONTROL is set after S3 resume on the real hardware?
The SDM says that IA32_FEATURE_CONTROL is zeroed on logical processor reset.
23.7 ENABLING AND ENTERING VMX OPERATION
[...] VMXON is also controlled by the IA32_FEATURE_CONTROL MSR (MSR address 3AH). This MSR is cleared to zero when a logical processor is reset. [...]
Actually, I think there is a bug in KVM at the moment. I ran the following test:
- modified OVMF to set the MSR to value 0x5 on just the BSP - booted an i440fx and a Q35 (SMM-enabled) OVMF guest - checked "rdmsr -a 0x3a" in both - ran "pm-suspend" in both guests, woke them - repeated the rdmsr command
The result is that the BSP had the 0x5 MSR value both after cold boot and after S3 resume. So, KVM does not seem to implement clearing of the MSR.
I checked kvm/next (currently at 196f20ca52e8c7281932663c348fa54b82d03914), and vmx_vcpu_reset() does not seem to zero vmx->msr_ia32_feature_control.
Now, what I absolutely can't tell you is whether this zeroing should happen regardless of "init_event", or just for a specific value of "init_event". Whenever I look at "init_event", I have to track it down to the commit that added it, then locate all the commits that fixed it, then guess whether the SDM language "logical processor reset" implies a specific "init_event" value or not. So, I have no idea about "init_event".
Either way, why it is important to me: the edk2 modules that are built into OVMF use loads of INIT-SIPI-SIPI sequences, for implementing multiprocessing in the firmware (regardless of the feature at hand). For example, EFI_MP_SERVICES_PROTOCOL.StartupAllAps() boots all APs with a programmer-specified routine, after which the APs are HLTed. (Until the next such call.) INIT-SIPI-SIPI is used by the implementation for booting the APs.
So I need to know whether those INIT-SIPI-SIPI sequences are supposed to clear the MSR -- in other words, whether I have to write patches that explicitly sustain the MSR across these IPIs.
If the answer is "yes", then I would like to ask for a KVM patch that zeroes vmx->msr_ia32_feature_control "under the right circumstances".
I can only attempt to re-program the MSR in all the necessary places in edk2 if I can trust that KVM will clear the MSR (visibly to the "rdmsr" command in the guest) as long as I miss at least one location in edk2.
Thanks! Laszlo
On 06/07/2016 13:04, Laszlo Ersek wrote:
Actually, I think there is a bug in KVM at the moment. I ran the following test:
- modified OVMF to set the MSR to value 0x5 on just the BSP
- booted an i440fx and a Q35 (SMM-enabled) OVMF guest
- checked "rdmsr -a 0x3a" in both
- ran "pm-suspend" in both guests, woke them
- repeated the rdmsr command
The result is that the BSP had the 0x5 MSR value both after cold boot and after S3 resume. So, KVM does not seem to implement clearing of the MSR.
I suspect that the KVM module is getting in the way. Try removing it or go lower-level. I did the following test:
- Applied the following patch to x86/s3.c from kvm-unit-tests:
diff --git a/x86/s3.c b/x86/s3.c index cef956e..0f2b48f 100644 --- a/x86/s3.c +++ b/x86/s3.c @@ -1,6 +1,7 @@ #include "libcflat.h" #include "x86/acpi.h" #include "asm/io.h" +#include "x86/processor.h"
u32* find_resume_vector_addr(void) { @@ -62,6 +63,7 @@ int main(int argc, char **argv) rtc_out(RTC_HOURS_ALARM, RTC_ALARM_DONT_CARE); rtc_out(RTC_REG_B, rtc_in(RTC_REG_B) | REG_B_AIE);
+ printf("Current value of MSR is 0x%x\nValue after S3 resume is ", (int)rdmsr(0x3a)); *(volatile int*)0 = 0; asm volatile("outw %0, %1" :: "a"((short)0x2400), "d"((short)fadt->pm1a_cnt_blk):"memory"); while(1) @@ -75,6 +77,13 @@ asm ( ".global resume_end\n" ".code16\n" "resume_start:\n" + "mov $0x3a, %ecx\n" + "rdmsr\n" + "mov $0x3f8, %dx\n" + "add $0x30, %al\n" + "out %al, %dx\n" + "mov $0xa, %al\n" + "out %al, %dx\n" "mov 0x0, %eax\n" "mov $0xf4, %dx\n" "out %eax, %dx\n"
- Compiled the latest SeaBIOS
- Applied the QEMU patches for LMCE
- ran the following:
../qemu/+build/x86_64-softmmu/qemu-system-x86_64 -display none \ --enable-kvm -m 512 -serial mon:stdio \ -cpu host,+vmx -kernel ./x86/s3.flat \ -global PIIX4_PM.disable_s3=0 -device isa-debug-exit,iobase=0xf4
and my output is:
enabling apic FACS is at 0x1ffe0000 resume vector addr is 0x1ffe000c copy resume code from 0x400340 PM1a event registers at 600 Current value of MSR is 0x5 Value after S3 resume is 0
I also tried using BITS from https://biosbits.org/downloads/bits-2073.zip with OVMF:
import bits cpu = bits.cpus()[0] bits.rdmsr(cpu, 0x3a)
0L
bits.wrmsr(cpu, 0x3a, 5)
True
bits.rdmsr(cpu, 0x3a)
5L
After a full system reset (I don't know if BITS can do S3! :)) the rdmsr gave zero again. Tracing KVM confirmed that the OVMF I used doesn't touch the MSR.
I checked kvm/next (currently at 196f20ca52e8c7281932663c348fa54b82d03914), and vmx_vcpu_reset() does not seem to zero vmx->msr_ia32_feature_control.
This is true, but QEMU does zero it.
Now, what I absolutely can't tell you is whether this zeroing should happen regardless of "init_event", or just for a specific value of "init_event". Whenever I look at "init_event", I have to track it down to the commit that added it, then locate all the commits that fixed it, then guess whether the SDM language "logical processor reset" implies a specific "init_event" value or not. So, I have no idea about "init_event".
It should be preserved by INIT, but not by reset or S3.
So I need to know whether those INIT-SIPI-SIPI sequences are supposed to clear the MSR -- in other words, whether I have to write patches that explicitly sustain the MSR across these IPIs.
No, INIT hardly changes any MSR.
Paolo
On 07/06/16 14:18, Paolo Bonzini wrote:
On 06/07/2016 13:04, Laszlo Ersek wrote:
I checked kvm/next (currently at 196f20ca52e8c7281932663c348fa54b82d03914), and vmx_vcpu_reset() does not seem to zero vmx->msr_ia32_feature_control.
This is true, but QEMU does zero it.
Indeed. Sorry for wasting your time.
Although, I am very happy about the following:
Now, what I absolutely can't tell you is whether this zeroing should happen regardless of "init_event", or just for a specific value of "init_event". Whenever I look at "init_event", I have to track it down to the commit that added it, then locate all the commits that fixed it, then guess whether the SDM language "logical processor reset" implies a specific "init_event" value or not. So, I have no idea about "init_event".
It should be preserved by INIT, but not by reset or S3.
This is great (I hope!) because this way I have a chance to stay out of the implementations of EFI_MP_SERVICES_PROTOCOL and EFI_PEI_MP_SERVICES_PPI. (They live under UefiCpuPkg.)
I've worried that if I only *call* these interfaces to set the MSR, then the next (independent) use of the same interfaces would clear the MSR through the INIT-SIPI-SIPI. That would have forced me to modify the protocol / PPI implementations so that any use of them would reprogram the MSR every time, after the INIT-SIPI-SIPI.
This way however (hopefully) it should suffice to call the PPI only -- the results should survive from PEI to DXE to the runtime OS on the normal boot path, and from PEI to the runtime OS on the S3 resume path.
So I need to know whether those INIT-SIPI-SIPI sequences are supposed to clear the MSR -- in other words, whether I have to write patches that explicitly sustain the MSR across these IPIs.
No, INIT hardly changes any MSR.
Thank you for your help! Laszlo
I've worried that if I only *call* these interfaces to set the MSR, then the next (independent) use of the same interfaces would clear the MSR through the INIT-SIPI-SIPI. That would have forced me to modify the protocol / PPI implementations so that any use of them would reprogram the MSR every time, after the INIT-SIPI-SIPI.
This way however (hopefully) it should suffice to call the PPI only -- the results should survive from PEI to DXE to the runtime OS on the normal boot path, and from PEI to the runtime OS on the S3 resume path.
This is correct (for what I understand). Out of curiosity, why is it not enough to just add the MSR to the ACPI_CPU_DATA? Or is it what you're doing, but OVMF was not restoring MTRRs at S3 resume time either?
Paolo
On 07/07/16 14:12, Paolo Bonzini wrote:
I've worried that if I only *call* these interfaces to set the MSR, then the next (independent) use of the same interfaces would clear the MSR through the INIT-SIPI-SIPI. That would have forced me to modify the protocol / PPI implementations so that any use of them would reprogram the MSR every time, after the INIT-SIPI-SIPI.
This way however (hopefully) it should suffice to call the PPI only -- the results should survive from PEI to DXE to the runtime OS on the normal boot path, and from PEI to the runtime OS on the S3 resume path.
This is correct (for what I understand). Out of curiosity, why is it not enough to just add the MSR to the ACPI_CPU_DATA? Or is it what you're doing, but OVMF was not restoring MTRRs at S3 resume time either?
At the time of writing the above emails, I had not done anything in particular yet. I was just focusing on an idea that would hopefully allow me to keep things simple (and under OvmfPkg).
The idea is that I set the feature control MSR once in PlatformPei (on all the CPUs), using EFI_PEI_MP_SERVICES_PPI. This would happen regardless of the boot path (S3 vs. normal boot path). Then the MSR set thus would survive into the OS (through DXE on the nromal boot path, and directly on the S3 boot path).
Until you confirmed that INIT IPIs would not clear the MSR, the above idea was not feasible, for the following reason:
- on the S3 resume path, any other PEIM clients of EFI_PEI_MP_SERVICES_PPI might invoke EFI_PEI_MP_SERVICES_PPI, raising an INIT IPI (for utilizing the APs), thereby clearing the MSR;
- on the normal boot path, the same, *plus*: in DXE, any client of EFI_MP_SERVICES_PROTOCOL could cause the same. (In fact just the startup of the EFI_MP_SERVICES_PROTOCOL implementation in CpuDxe involves INIT-SIPI-SIPI.)
Ultimately, the idea I'm working on is pretty simple, same as any other chipset register configuration we do in PlatformPei. The difference is "only" that I need to pull in the EFI_PEI_MP_SERVICES_PPI implementation (CpuMpPei), and then call that from PlatformPei to write the MSR on all APs as well.
The risk was that any other INIT IPIs (involved in further use of EFI_PEI_MP_SERVICES_PPI through PEI, and EFI_MP_SERVICES_PROTOCOL through DXE) would undo that work. But, you excluded this risk, so the idea should be fine. (I'm about to test it soon...)
The ACPI_CPU_DATA field would be related to PiSmmCpuDxeSmm and CpuS3DataDxe. With the above idea, I don't expect to have to touch those at all (I could be proved wrong, of course -- this is why it is important for QEMU to clear the MSR whenever it is architecturally required). PlatformPei should program the MSR independently of the SMM driver stack.
So this is something I have to test in four situations: { cold boot, S3 resume } x { SMM driver stack absent, SMM driver stack present }.
Regarding MTRRs... that's a bit messy. PlatformPei only progams the MTRRs only on the BSP. For the normal boot path, this is no problem, because when EFI_MP_SERVICES_PROTOCOL starts up (in CpuDxe), the MTRR settings are broad-cast to all APs. It is also not a problem for the S3 resume path, when the SMM driver stack is used, because CpuS3DataDxe saves the MTRRs at End-of-DXE, and at S3 resume, PiSmmCpuDxeSmm restores them.
There is one path where the firmware does not restore MTRRs on APs: S3 resume without the SMM driver stack. In practice this doesn't seem to cause problems. Maybe Linux restores those MTRRs anyway, when the APs are onlined after resume -- even at cold boot, Linux checks the MTRR config, and if it's inconsistent between BSP and APs, the APs are adapted.
(If I understand correctly, on S3 resume, SeaBIOS doesn't reprogram the MTRRs even on the BSP, and historically this has caused no problems. So in that sense OVMF is "no worse". :))
Thanks Laszlo
On 07/07/2016 14:44, Laszlo Ersek wrote:
Regarding MTRRs... that's a bit messy. PlatformPei only progams the MTRRs only on the BSP. For the normal boot path, this is no problem, because when EFI_MP_SERVICES_PROTOCOL starts up (in CpuDxe), the MTRR settings are broad-cast to all APs. It is also not a problem for the S3 resume path, when the SMM driver stack is used, because CpuS3DataDxe saves the MTRRs at End-of-DXE, and at S3 resume, PiSmmCpuDxeSmm restores them.
There is one path where the firmware does not restore MTRRs on APs: S3 resume without the SMM driver stack. In practice this doesn't seem to cause problems. Maybe Linux restores those MTRRs anyway, when the APs are onlined after resume -- even at cold boot, Linux checks the MTRR config, and if it's inconsistent between BSP and APs, the APs are adapted.
Oh, that helps indeed.
(If I understand correctly, on S3 resume, SeaBIOS doesn't reprogram the MTRRs even on the BSP, and historically this has caused no problems. So in that sense OVMF is "no worse". :))
MTRRs hardly have any effect on virt platforms, which kinda explains that. We need to fix that now that smp_setup is used for MSR_IA32_FEATURE_CONTROL.
Paolo
On 07/06/16 13:04, Laszlo Ersek wrote:
On 07/06/16 08:42, Laszlo Ersek wrote:
On 07/06/16 08:28, Haozhong Zhang wrote:
Hi Ashok,
On 07/06/16 02:18, Paolo Bonzini wrote:
I forgot to restore MSR_IA32_FEATURE_CONTROL in the resume path, and MSR_IA32_FEATURE_CONTROL is zero after S3 resume.
This is a bug. Sorry Laszlo. :)
Not restore MSR_IA32_FEATURE_CONTROL during S3 resume does not affect at least Linux guest (tested 4.5). Current QEMU may advise the guest firmware to set bit 20 (for LMCE), bit 2 (for VMX) and bit 0 (lock bit).
- For LMCE, Linux only checks bit 20 and bit 0 at boot time and then keeps using the result even after resume.
On real hardware, LMCE would not be enabled after resume. I'm not sure what would happen, but it wouldn't be good.
Could you help to check if the LMCE bit in MSR_IA32_FEATURE_CONTROL is set after S3 resume on the real hardware?
The SDM says that IA32_FEATURE_CONTROL is zeroed on logical processor reset.
23.7 ENABLING AND ENTERING VMX OPERATION
[...] VMXON is also controlled by the IA32_FEATURE_CONTROL MSR (MSR address 3AH). This MSR is cleared to zero when a logical processor is reset. [...]
Actually, I think there is a bug in KVM at the moment. I ran the following test:
- modified OVMF to set the MSR to value 0x5 on just the BSP
- booted an i440fx and a Q35 (SMM-enabled) OVMF guest
- checked "rdmsr -a 0x3a" in both
- ran "pm-suspend" in both guests, woke them
- repeated the rdmsr command
The result is that the BSP had the 0x5 MSR value both after cold boot and after S3 resume. So, KVM does not seem to implement clearing of the MSR.
Interesting result, is setting MSR on BSP also called after S3 resume? I went through your test steps with OVMF replaced by a modified SeaBIOS which only sets MSR_IA32_FEATURE_CONTROL on BSP at boot time, the result before S3 resume is the value on BSP is 5 and others are 0, and the result after S3 resume is values on all CPUs are 0.
I checked kvm/next (currently at 196f20ca52e8c7281932663c348fa54b82d03914), and vmx_vcpu_reset() does not seem to zero vmx->msr_ia32_feature_control.
The function reset cpu state in QEMU after S3 resume is x86_cpu_reset(CPUState *s) in target-i386/cpu.c which is called for all vcpus and does
memset(env, 0, offsetof(CPUX86State, cpuid_level));
CPUX86State.msr_ia32_feature_control is before .cpuid_level, so guest MSR_IA32_FEATURE_CONTROL on all vcpus should be zero after S3 resume.
Thanks, Haozhong
Now, what I absolutely can't tell you is whether this zeroing should happen regardless of "init_event", or just for a specific value of "init_event". Whenever I look at "init_event", I have to track it down to the commit that added it, then locate all the commits that fixed it, then guess whether the SDM language "logical processor reset" implies a specific "init_event" value or not. So, I have no idea about "init_event".
Either way, why it is important to me: the edk2 modules that are built into OVMF use loads of INIT-SIPI-SIPI sequences, for implementing multiprocessing in the firmware (regardless of the feature at hand). For example, EFI_MP_SERVICES_PROTOCOL.StartupAllAps() boots all APs with a programmer-specified routine, after which the APs are HLTed. (Until the next such call.) INIT-SIPI-SIPI is used by the implementation for booting the APs.
So I need to know whether those INIT-SIPI-SIPI sequences are supposed to clear the MSR -- in other words, whether I have to write patches that explicitly sustain the MSR across these IPIs.
If the answer is "yes", then I would like to ask for a KVM patch that zeroes vmx->msr_ia32_feature_control "under the right circumstances".
I can only attempt to re-program the MSR in all the necessary places in edk2 if I can trust that KVM will clear the MSR (visibly to the "rdmsr" command in the guest) as long as I miss at least one location in edk2.
Thanks! Laszlo
On 07/06/16 15:03, Haozhong Zhang wrote:
On 07/06/16 13:04, Laszlo Ersek wrote:
On 07/06/16 08:42, Laszlo Ersek wrote:
On 07/06/16 08:28, Haozhong Zhang wrote:
Hi Ashok,
On 07/06/16 02:18, Paolo Bonzini wrote:
I forgot to restore MSR_IA32_FEATURE_CONTROL in the resume path, and MSR_IA32_FEATURE_CONTROL is zero after S3 resume.
This is a bug. Sorry Laszlo. :)
Not restore MSR_IA32_FEATURE_CONTROL during S3 resume does not affect at least Linux guest (tested 4.5). Current QEMU may advise the guest firmware to set bit 20 (for LMCE), bit 2 (for VMX) and bit 0 (lock bit).
- For LMCE, Linux only checks bit 20 and bit 0 at boot time and then keeps using the result even after resume.
On real hardware, LMCE would not be enabled after resume. I'm not sure what would happen, but it wouldn't be good.
Could you help to check if the LMCE bit in MSR_IA32_FEATURE_CONTROL is set after S3 resume on the real hardware?
The SDM says that IA32_FEATURE_CONTROL is zeroed on logical processor reset.
23.7 ENABLING AND ENTERING VMX OPERATION
[...] VMXON is also controlled by the IA32_FEATURE_CONTROL MSR (MSR address 3AH). This MSR is cleared to zero when a logical processor is reset. [...]
Actually, I think there is a bug in KVM at the moment. I ran the following test:
- modified OVMF to set the MSR to value 0x5 on just the BSP
- booted an i440fx and a Q35 (SMM-enabled) OVMF guest
- checked "rdmsr -a 0x3a" in both
- ran "pm-suspend" in both guests, woke them
- repeated the rdmsr command
The result is that the BSP had the 0x5 MSR value both after cold boot and after S3 resume. So, KVM does not seem to implement clearing of the MSR.
Interesting result, is setting MSR on BSP also called after S3 resume?
Yes. Henceforth my middle name should be "bumblebee", because today I've been bumbling from error to error.
In short, I messed up my ad-hoc OVMF patch, and added the wrmsr to a location that is on both the normal boot path and the S3 resume path. I fixed the patch and now I get the same result as you -- the MSR is indeed clear after S3 resume.
I went through your test steps with OVMF replaced by a modified SeaBIOS which only sets MSR_IA32_FEATURE_CONTROL on BSP at boot time, the result before S3 resume is the value on BSP is 5 and others are 0, and the result after S3 resume is values on all CPUs are 0.
Right.
I checked kvm/next (currently at 196f20ca52e8c7281932663c348fa54b82d03914), and vmx_vcpu_reset() does not seem to zero vmx->msr_ia32_feature_control.
The function reset cpu state in QEMU after S3 resume is x86_cpu_reset(CPUState *s) in target-i386/cpu.c which is called for all vcpus and does
memset(env, 0, offsetof(CPUX86State, cpuid_level));
CPUX86State.msr_ia32_feature_control is before .cpuid_level, so guest MSR_IA32_FEATURE_CONTROL on all vcpus should be zero after S3 resume.
Thank you for the analysis and sorry about the noise.
Laszlo