[SeaBIOS] [PATCH RFC] x86: use volatile asm for read/write{b, w, l} implementations

Laszlo Ersek lersek at redhat.com
Wed Jan 3 18:58:35 CET 2018


On 01/03/18 14:41, Vitaly Kuznetsov wrote:
> QEMU/KVM guests running nested on top of Hyper-V fail to boot with
> virtio-blk-pci disks, the debug log ends with
>
> Booting from Hard Disk...
> call32_smm 0x000edd01 e97a0
> handle_smi cmd=b5 smbase=0x000a0000
> vp notify fe007000 (2) -- 0x0
> vp read   fe005000 (1) -> 0x0
> handle_smi cmd=b5 smbase=0x000a0000
> call32_smm done 0x000edd01 0
> Booting from 0000:7c00
> call32_smm 0x000edd01 e97a4
> handle_smi cmd=b5 smbase=0x000a0000
> vp notify fe007000 (2) -- 0x0
> In resume (status=0)
> In 32bit resume
> Attempting a hard reboot
> ...
>
> I bisected the breakage to the following commit:
>
> commit f46739b1a819750c63fb5849844d99cc2ab001e8
> Author: Kevin O'Connor <kevin at koconnor.net>
> Date:   Tue Feb 2 22:34:27 2016 -0500
>
>     virtio: Convert to new PCI BAR helper functions
>
> But the commit itself appears to be correct. The problem is in how the
> writew() function compiles into vp_notify(). For example, if we drop the
> 'volatile' qualifier from the current writew() implementation,
> everything starts to work. If we disassemble the two versions (as of
> f46739b1a), the difference is:
>
> 00000000 <vp_notify>:
>
>    With 'volatile' (current)              Without 'volatile'
>
>    0:   push   %ebx			   0:   push   %ebx
>    1:   mov    %eax,%ecx   		   1:   mov    %eax,%ecx
>    3:   mov    0x1518(%edx),%eax	   3:   mov    0x1518(%edx),%eax
>    9:   cmpb   $0x0,0x2c(%ecx)		   9:   cmpb   $0x0,0x2c(%ecx)
>    d:   je     2f <vp_notify+0x2f>	   d:   je     2e <vp_notify+0x2e>
>    f:   mov    0x151c(%edx),%edx	   f:   mov    0x151c(%edx),%edx
>   15:   mov    0x28(%ecx),%ebx
>   18:   imul   %edx,%ebx                  15:   imul   0x28(%ecx),%edx
>   1b:   mov    0x8(%ecx),%edx             19:   mov    0x8(%ecx),%ebx
>   1e:   add    %ebx,%edx
>   20:   cmpb   $0x0,0xe(%ecx)             1c:   cmpb   $0x0,0xe(%ecx)
>   24:   je     2a <vp_notify+0x2a>        20:   je     28 <vp_notify+0x28>
>                                           22:   add    %ebx,%edx
>   26:   out    %ax,(%dx)                  24:   out    %ax,(%dx)
>   28:   jmp    48 <vp_notify+0x48>        26:   jmp    47 <vp_notify+0x47>
>   2a:   mov    %ax,(%edx)                 28:   mov    %ax,(%ebx,%edx,1)
>   2d:   jmp    48 <vp_notify+0x48>        2c:   jmp    47 <vp_notify+0x47>
>   2f:   lea    0x20(%ecx),%ebx            2e:   lea    0x20(%ecx),%ebx
>   32:   cltd                              31:   cltd
>   33:   push   %edx                       32:   push   %edx
>   34:   push   %eax                       33:   push   %eax
>   35:   mov    $0x2,%ecx                  34:   mov    $0x2,%ecx
>   3a:   mov    $0x10,%edx                 39:   mov    $0x10,%edx
>   3f:   mov    %ebx,%eax                  3e:   mov    %ebx,%eax
>   41:   call   42 <vp_notify+0x42>        40:   call   41 <vp_notify+0x41>
>   46:   pop    %eax                       45:   pop    %eax
>   47:   pop    %edx                       46:   pop    %edx
>   48:   pop    %ebx                       47:   pop    %ebx
>   49:   ret                               48:   ret
>
> My eyes fail to see an obvious compiler flaw here, but probably the
> mov difference (at '2a' old, '28' new) is to blame. Making some other
> subtle changes (e.g. adding local variables to the function) helps in
> some cases too. At this point I got a bit lost with my debugging, so I
> looked at how Linux does this stuff, and it seems Linux does not use a
> plain '*(volatile u16 *)addr = ' store there. Rewriting
> write/read{b,w,l} as volatile asm helps.
>
> Signed-off-by: Vitaly Kuznetsov <vkuznets at redhat.com>
> ---
> RFC: This is rather an ongoing debugging effort, as I'm not able to
> point a finger at the real culprit yet; I'd be grateful for any help
> and suggestions. In particular, I don't quite understand why nested
> virtualization makes a difference here.
> ---
>  src/x86.h | 21 +++++++++------------
>  1 file changed, 9 insertions(+), 12 deletions(-)
>
> diff --git a/src/x86.h b/src/x86.h
> index 53378e9..d45122c 100644
> --- a/src/x86.h
> +++ b/src/x86.h
> @@ -199,30 +199,27 @@ static inline void smp_wmb(void) {
>  }
>
>  static inline void writel(void *addr, u32 val) {
> -    barrier();
> -    *(volatile u32 *)addr = val;
> +    asm volatile("movl %0, %1" : : "d"(val), "m"(*(u32 *)addr) : "memory");
>  }
>  static inline void writew(void *addr, u16 val) {
> -    barrier();
> -    *(volatile u16 *)addr = val;
> +    asm volatile("movw %0, %1" : : "d"(val), "m"(*(u16 *)addr) : "memory");
>  }
>  static inline void writeb(void *addr, u8 val) {
> -    barrier();
> -    *(volatile u8 *)addr = val;
> +    asm volatile("movb %0, %1" : : "d"(val), "m"(*(u8 *)addr) : "memory");
>  }
>  static inline u32 readl(const void *addr) {
> -    u32 val = *(volatile const u32 *)addr;
> -    barrier();
> +    u32 val;
> +    asm volatile("movl %1, %0" : "=d"(val) : "m"(*(u32 *)addr) : "memory");
>      return val;
>  }
>  static inline u16 readw(const void *addr) {
> -    u16 val = *(volatile const u16 *)addr;
> -    barrier();
> +    u16 val;
> +    asm volatile("movw %1, %0" : "=d"(val) : "m"(*(u16 *)addr) : "memory");
>      return val;
>  }
>  static inline u8 readb(const void *addr) {
> -    u8 val = *(volatile const u8 *)addr;
> -    barrier();
> +    u8 val;
> +    asm volatile("movb %1, %0" : "=d"(val) : "m"(*(u8 *)addr) : "memory");
>      return val;
>  }
>
>
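
(An aside on the inline asm itself, not on the root cause: using the
memory operand as an *input* plus a "memory" clobber mirrors what the
Linux build_mmio_write() macros do, and the clobber keeps the compiler
honest; the more textbook GCC form would make the store an output
operand, something like

  asm volatile("movw %1, %0" : "=m"(*(u16 *)addr) : "d"(val) : "memory");

Either spelling should pin down the access; I only mention it for
clarity.)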

barrier() is defined like this, in "src/types.h":

> #define barrier() __asm__ __volatile__("": : :"memory")

and "src/x86.h" has some interesting stuff too, just above the code that
you modify:

> /* Compiler barrier is enough as an x86 CPU does not reorder reads or writes */
> static inline void smp_rmb(void) {
>     barrier();
> }
> static inline void smp_wmb(void) {
>     barrier();
> }
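
For illustration (my own sketch, not code from the tree): a compiler
barrier emits no machine instruction at all; it merely forbids GCC from
caching or reordering memory accesses across it:

  int data, flag;

  void producer(void)
  {
      data = 42;
      smp_wmb();   /* compiler fence only: no instruction is emitted */
      flag = 1;    /* compiled after the data store, but not fenced */
  }

On bare-metal x86 that is sufficient, because the CPU does not reorder
the two stores anyway; the question is whether the same assumption
still holds once an L1 hypervisor intercepts and emulates the accesses.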


Is it possible that the current barrier() is not sufficient for the
intended purpose in an L2 guest?

What happens if you drop your current patch, but replace

  __asm__ __volatile__("": : :"memory")

in the barrier() macro definition, with a real, heavy-weight barrier,
such as

  __asm__ __volatile__("mfence": : :"memory")

(See mb() in "arch/x86/include/asm/barrier.h" in the kernel.)
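
(From memory, and ignoring the alternatives machinery for older 32-bit
CPUs, the kernel's heavy-weight x86 barriers boil down to:

  #define mb()  asm volatile("mfence" ::: "memory")
  #define rmb() asm volatile("lfence" ::: "memory")
  #define wmb() asm volatile("sfence" ::: "memory")

so the experiment above would turn every barrier() in SeaBIOS into a
full mb().)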


... I think running in L2 could play a role here; see
"Documentation/memory-barriers.txt", section "VIRTUAL MACHINE GUESTS";
from kernel commit 6a65d26385bf ("asm-generic: implement virt_xxx memory
barriers", 2016-01-12).

See also the commit message.

Thanks,
Laszlo


