Patrick Rudolph (siro@das-labor.org) just uploaded a new patch set to gerrit, which you can find at https://review.coreboot.org/18346
-gerrit
commit 7d5fe62a626b5e7e39c1d4f5254ca5654931d95f
Author: Patrick Rudolph <siro@das-labor.org>
Date:   Sun Feb 12 10:10:44 2017 +0100
libpayload/string: Add SSE memcpy functions
Improve memcpy speed by using SSE for large transfers. The performance gain is about 40% on an Intel Core 2 Duo with a transfer size of 16MiB. Keep using REP for small transfers, as it has less overhead and SSE is slower there anyway.
Change-Id: Ia03c27232f33f130cdf7eea8a3da86e77a221e07
Signed-off-by: Patrick Rudolph <siro@das-labor.org>
---
 payloads/libpayload/arch/x86/string.c | 116 +++++++++++++++++++++++++++++++++-
 1 file changed, 114 insertions(+), 2 deletions(-)
diff --git a/payloads/libpayload/arch/x86/string.c b/payloads/libpayload/arch/x86/string.c
index 2e38f94..6ec8243 100644
--- a/payloads/libpayload/arch/x86/string.c
+++ b/payloads/libpayload/arch/x86/string.c
@@ -2,6 +2,7 @@
  * Copyright (C) 1991,1992,1993,1997,1998,2003, 2005 Free Software Foundation, Inc.
  * This file is part of the GNU C Library.
  * Copyright (c) 2011 The Chromium OS Authors.
+ * Copyright (c) 2017 Patrick Rudolph <siro@das-labor.org>
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License as
@@ -17,7 +18,7 @@
 /* From glibc-2.14, sysdeps/i386/memset.c */
 
 #include <stdint.h>
-
+#include <x86/arch/cpuid.h>
 #include "string.h"
 
 typedef uint32_t op_t;
 
@@ -78,7 +79,7 @@ void *memset(void *dstpp, int c, size_t len)
 	return dstpp;
 }
 
-void *memcpy(void *dest, const void *src, size_t n)
+static void *__memcpy_unaligned(void *dest, const void *src, size_t n)
 {
 	unsigned long d0, d1, d2;
 
@@ -93,3 +94,114 @@ void *memcpy(void *dest, const void *src, size_t n)
 
 	return dest;
 }
+
+#ifdef __SSE__
+static void *__memcpy_sse2_unaligned(u8 *to, const u8 *from, size_t n)
+{
+	size_t i;
+
+	for (i = n >> 6; i > 0; i--) {
+		__asm__ __volatile__ (
+			"movups 0x00(%0), %%xmm0\n"
+			"movups 0x10(%0), %%xmm1\n"
+			"movups 0x20(%0), %%xmm2\n"
+			"movups 0x30(%0), %%xmm3\n"
+			"movntps %%xmm0, 0x00(%1)\n"
+			"movntps %%xmm1, 0x10(%1)\n"
+			"movntps %%xmm2, 0x20(%1)\n"
+			"movntps %%xmm3, 0x30(%1)\n"
+			:: "r" (from), "r" (to)
+			: "memory", "xmm0", "xmm1", "xmm2", "xmm3");
+		from += 64;
+		to += 64;
+	}
+
+	return to;
+}
+
+static void *__memcpy_sse2(u8 *to, const u8 *from, size_t n)
+{
+	size_t i;
+
+	for (i = n >> 6; i > 0; i--) {
+		__asm__ __volatile__ (
+			"movaps 0x00(%0), %%xmm0\n"
+			"movaps 0x10(%0), %%xmm1\n"
+			"movaps 0x20(%0), %%xmm2\n"
+			"movaps 0x30(%0), %%xmm3\n"
+			"movntps %%xmm0, 0x00(%1)\n"
+			"movntps %%xmm1, 0x10(%1)\n"
+			"movntps %%xmm2, 0x20(%1)\n"
+			"movntps %%xmm3, 0x30(%1)\n"
+			:: "r" (from), "r" (to)
+			: "memory", "xmm0", "xmm1", "xmm2", "xmm3");
+		from += 64;
+		to += 64;
+	}
+
+	return to;
+}
+
+static void *__memcpy_sse4(u8 *to, const u8 *from, size_t n)
+{
+	size_t i;
+
+	for (i = n >> 6; i > 0; i--) {
+		__asm__ __volatile__ (
+			"movntdqa 0x00(%0), %%xmm0\n"
+			"movntdqa 0x10(%0), %%xmm1\n"
+			"movntdqa 0x20(%0), %%xmm2\n"
+			"movntdqa 0x30(%0), %%xmm3\n"
+			"movntps %%xmm0, 0x00(%1)\n"
+			"movntps %%xmm1, 0x10(%1)\n"
+			"movntps %%xmm2, 0x20(%1)\n"
+			"movntps %%xmm3, 0x30(%1)\n"
+			:: "r" (from), "r" (to)
+			: "memory", "xmm0", "xmm1", "xmm2", "xmm3");
+		from += 64;
+		to += 64;
+	}
+
+	return to;
+}
+#endif
+
+void *memcpy(void *dest, const void *src, size_t len)
+{
+	const u8 *from = (const u8 *)src;
+	u8 *to = (u8 *)dest;
+
+	/* Tests showed that REP is faster for small memory transfers.
+	 * Use SSE to minimize cache pollution on large transfers. */
+#ifdef __SSE__
+	if (cpu_id.fid.bits.sse2 && len >= (1024 * 256)) {
+		size_t delta;
+
+		delta = ((uintptr_t)to) & 0xf;
+		if (delta) {
+			delta = 0x10 - delta;
+			len -= delta;
+			__memcpy_unaligned(to, from, delta);
+			to += delta;
+			from += delta;
+		}
+		if (((uintptr_t)from) & 0xf)
+			__memcpy_sse2_unaligned(to, from, len);
+		else if (cpu_id.fid.bits.sse41)
+			__memcpy_sse4(to, from, len);
+		else
+			__memcpy_sse2(to, from, len);
+
+		__asm__ __volatile__ ("sfence":::"memory");
+
+		to += (len >> 6) * 64;
+		from += (len >> 6) * 64;
+		len &= 63;
+	}
+
+	if (len)
+#endif
+		__memcpy_unaligned(to, from, len);
+
+	return dest;
+}
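
Editor's note: for reviewers who want to sanity-check the split performed by the patched memcpy() outside of libpayload, below is a minimal host-side sketch of the same arithmetic. It mirrors the three phases (a head copy up to 16-byte destination alignment, a bulk phase consumed in 64-byte chunks, and a remaining tail) but uses plain byte copies instead of REP/SSE, so it builds and runs on any host. The helper name split_copy() and the threshold macro are stand-ins for illustration only; they are not part of the patch.

/* Host-side illustration of the head/bulk/tail split in the patched memcpy().
 * Plain memcpy() calls stand in for the REP and SSE paths. */
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#define SSE_THRESHOLD (1024 * 256)	/* same cut-off as in the patch */

static void split_copy(void *dest, const void *src, size_t len)
{
	uint8_t *to = dest;
	const uint8_t *from = src;

	if (len >= SSE_THRESHOLD) {
		/* Head: bring the destination up to 16-byte alignment,
		 * as required by the non-temporal movntps stores. */
		size_t delta = (uintptr_t)to & 0xf;
		if (delta) {
			delta = 0x10 - delta;
			memcpy(to, from, delta);	/* stands in for the REP copy */
			to += delta;
			from += delta;
			len -= delta;
		}

		/* Bulk: whole 64-byte chunks, as the SSE loops consume them. */
		size_t bulk = len & ~(size_t)63;
		memcpy(to, from, bulk);		/* stands in for the SSE loop */
		to += bulk;
		from += bulk;
		len &= 63;
	}

	/* Tail, and all small transfers: the REP path. */
	if (len)
		memcpy(to, from, len);
}

int main(void)
{
	size_t len = 16 * 1024 * 1024 + 7;	/* deliberately not a multiple of 64 */
	uint8_t *src = malloc(len), *dst = malloc(len);

	for (size_t i = 0; i < len; i++)
		src[i] = (uint8_t)i;
	split_copy(dst, src, len);
	printf("copies match: %d\n", memcmp(dst, src, len) == 0);

	free(src);
	free(dst);
	return 0;
}

With this split, only the head and tail (each under 64 bytes) go through the REP-based copy, so on large transfers the non-temporal stores dominate and, per the commit message, keep cache pollution low.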