[PATCH] tsc: use kvmclock for calibration - SeaBIOS

9 Aug 2012

Use kvmclock for tsc calibration when running on kvm.  Without this the
tsc frequency calibrated by seabios can be *way* off in case the virtual
machine is booted on a loaded host.  I've seen seabios calibrating 27
instead of ca. 2800 MHz, resulting in timeouts being too short by a factor
of 100.  Which in turn leads to disk I/O errors due to timeouts, especially
as I/O requests tend to take a bit longer than usual on a loaded box ...
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
---
 src/clock.c    |    9 +++++
 src/paravirt.c |   90 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/paravirt.h |    1 +
 3 files changed, 100 insertions(+), 0 deletions(-)

diff --git a/src/clock.c b/src/clock.c
index 69e9f17..5883b1a 100644
--- a/src/clock.c
+++ b/src/clock.c
@@ -13,6 +13,7 @@
 #include "bregs.h" // struct bregs
 #include "biosvar.h" // GET_GLOBAL
 #include "usb-hid.h" // usb_check_event
+#include "paravirt.h" // kvm clock
// RTC register flags
 #define RTC_A_UIP 0x80
@@ -80,6 +81,14 @@ calibrate_tsc(void)
         return;
     }
+    if (kvm_para_available()) {
+        u32 khz = kvm_tsc_khz();
+        if (khz != 0) {
+            SET_GLOBAL(cpu_khz, khz);
+            return;
+        }
+    }
+
     // Setup "timer2"
     u8 orig = inb(PORT_PS2_CTRLB);
     outb((orig & ~PPCB_SPKR) | PPCB_T2GATE, PORT_PS2_CTRLB);
diff --git a/src/paravirt.c b/src/paravirt.c
index 2a98d53..942ce11 100644
--- a/src/paravirt.c
+++ b/src/paravirt.c
@@ -12,6 +12,7 @@
 #include "ioport.h" // outw
 #include "paravirt.h" // qemu_cfg_port_probe
 #include "smbios.h" // struct smbios_structure_header
+#include "biosvar.h" // GET_GLOBAL
int qemu_cfg_present;
@@ -346,3 +347,92 @@ void qemu_cfg_romfile_setup(void)
         dprintf(3, "Found fw_cfg file: %s (size=%d)\n", file->name, file->size);
     }
 }
+
+#define KVM_CPUID_SIGNATURE       0x40000000  // CPUID leaf: hypervisor signature (not used in the code below)
+#define KVM_CPUID_FEATURES        0x40000001  // CPUID leaf: KVM feature flags, read from EAX by kvm_tsc_khz()
+#define KVM_FEATURE_CLOCKSOURCE            0  // bit position (not a mask) in the feature EAX: kvmclock via MSR_KVM_SYSTEM_TIME
+#define KVM_FEATURE_CLOCKSOURCE2           3  // bit position (not a mask) in the feature EAX: kvmclock via MSR_KVM_SYSTEM_TIME_NEW
+#define MSR_KVM_SYSTEM_TIME             0x12  // MSR used when only KVM_FEATURE_CLOCKSOURCE is offered
+#define MSR_KVM_SYSTEM_TIME_NEW   0x4b564d01  // MSR preferred when KVM_FEATURE_CLOCKSOURCE2 is offered
+
+struct pvclock_vcpu_time_info {  // time record the KVM hypervisor is asked to fill in (see kvm_tsc_khz)
+	u32   version;            // kvm_tsc_khz() treats values < 2 as "record not filled in"
+	u32   pad0;               // padding, not read by this file
+	u64   tsc_timestamp;      // not read by this file
+	u64   system_time;        // not read by this file
+	u32   tsc_to_system_mul;  // divisor in pvclock_tsc_khz(); 0 is rejected by kvm_tsc_khz()
+	s8    tsc_shift;          // signed: negative shifts the result left, otherwise right (pvclock_tsc_khz)
+	u8    flags;              // not read by this file
+	u8    pad[2];             // padding, not read by this file
+} PACKED;
+
+/*
+ * do_div(n, base): divide the 64-bit value n by the 32-bit value base.
+ *
+ * do_div() is NOT a C function. It wants to return
+ * two values (the quotient and the remainder), but
+ * since that doesn't work very well in C, what it
+ * does is:
+ *
+ * - modifies the 64-bit dividend _in_place_
+ * - returns the 32-bit remainder
+ *
+ * This ends up being the most efficient "calling
+ * convention" on x86.
+ *
+ * How it works: n is split into its low and high halves.  The high
+ * half is divided by base in plain C; the remainder of that division
+ * becomes the upper word of the dividend for a single "divl", which
+ * yields the low half of the quotient and the final remainder.  The
+ * two quotient halves are then recombined and stored back into n.
+ *
+ * n must be a modifiable lvalue and appears more than once in the
+ * expansion; base is evaluated once.  A base of 0 is not checked for.
+ * NOTE(review): the "A" constraints and the unsigned long temporaries
+ * look like they assume a 32-bit x86 build - confirm before reusing.
+ */
+#define do_div(n, base)                                                 \
+    ({                                                                  \
+        unsigned long __upper, __low, __high, __mod, __base;            \
+        __base = (base);                                                \
+        asm("" : "=a" (__low), "=d" (__high) : "A" (n));                \
+        __upper = __high;                                               \
+        if (__high) {                                                   \
+            __upper = __high % (__base);                                \
+            __high = __high / (__base);                                 \
+        }                                                               \
+        asm("divl %2" : "=a" (__low), "=d" (__mod)                      \
+            : "rm" (__base), "0" (__low), "1" (__upper));               \
+        asm("" : "=A" (n) : "a" (__low), "d" (__high));                 \
+        __mod;                                                          \
+    })
+
+// Convert a filled-in pvclock record into a TSC frequency in kHz:
+// (1000000 << 32) / tsc_to_system_mul, then scaled by tsc_shift in the
+// opposite direction (left for a negative shift, right otherwise).
+// The caller must make sure tsc_to_system_mul is non-zero.
+static u64 pvclock_tsc_khz(struct pvclock_vcpu_time_info *src)
+{
+    u64 khz = 1000000ULL << 32;
+    int shift;
+
+    do_div(khz, src->tsc_to_system_mul);
+    shift = src->tsc_shift;
+    return (shift < 0) ? (khz << -shift) : (khz >> shift);
+}
+
+/*
+ * Query the TSC frequency from the KVM paravirtual clock (kvmclock).
+ *
+ * Reads the KVM feature leaf to pick the system-time MSR, asks the
+ * hypervisor to fill a pvclock_vcpu_time_info record on the stack,
+ * writes 0 to the MSR again and converts the record to a frequency.
+ *
+ * Returns the TSC frequency in kHz, or 0 if neither kvmclock feature
+ * bit is set or the record does not look filled in.  The caller
+ * (calibrate_tsc) checks kvm_para_available() before calling this.
+ */
+u64 kvm_tsc_khz(void)
+{
+    u32 eax, ebx, ecx, edx, msr;
+    struct pvclock_vcpu_time_info time;
+    u32 addr = (u32)(&time);
+    u64 khz;
+
+    /*
+     * Check presence and figure msr number.  The KVM_FEATURE_* values
+     * are bit positions in EAX, not masks, so they have to be shifted
+     * before testing: masking with 0 never matches, and masking with
+     * 3 tests bits 0 and 1 instead of bit 3.
+     */
+    cpuid(KVM_CPUID_FEATURES, &eax, &ebx, &ecx, &edx);
+    if (eax & (1 << KVM_FEATURE_CLOCKSOURCE2)) {
+        msr = MSR_KVM_SYSTEM_TIME_NEW;
+    } else if (eax & (1 << KVM_FEATURE_CLOCKSOURCE)) {
+        msr = MSR_KVM_SYSTEM_TIME;
+    } else {
+        return 0;
+    }
+
+    /*
+     * Ask kvm hypervisor to fill struct: write the record's address
+     * with bit 0 set, then write 0 (presumably so the hypervisor stops
+     * updating this stack memory once we return - confirm).
+     * NOTE(review): the struct is written behind the compiler's back;
+     * if wrmsr() has no memory clobber the reads below may need a
+     * compiler barrier - confirm against the wrmsr() definition.
+     */
+    memset(&time, 0, sizeof(time));
+    wrmsr(msr, addr | 1);
+    wrmsr(msr, 0);
+    /* a still-zeroed record means the hypervisor did not fill it in */
+    if (time.version < 2 || time.tsc_to_system_mul == 0)
+        return 0;
+
+    /* go figure tsc frequency */
+    khz = pvclock_tsc_khz(&time);
+    dprintf(1, "Using kvmclock, msr 0x%x, tsc %d MHz\n",
+            msr, (u32)khz / 1000);
+    return khz;
+}
diff --git a/src/paravirt.h b/src/paravirt.h
index a284c41..eedfcc3 100644
--- a/src/paravirt.h
+++ b/src/paravirt.h
@@ -27,6 +27,7 @@ static inline int kvm_para_available(void)
return 0;
 }
+extern u64 kvm_tsc_khz(void);
#define QEMU_CFG_SIGNATURE              0x00
 #define QEMU_CFG_ID                     0x01
-- 
1.7.1