[coreboot-gerrit] New patch to review for coreboot: libpayload/string: Add SSE memcpy functions
Patrick Rudolph (siro@das-labor.org)
gerrit at coreboot.org
Sun Feb 12 13:04:34 CET 2017
Patrick Rudolph (siro at das-labor.org) just uploaded a new patch set to gerrit, which you can find at https://review.coreboot.org/18346
-gerrit
commit 7d5fe62a626b5e7e39c1d4f5254ca5654931d95f
Author: Patrick Rudolph <siro at das-labor.org>
Date: Sun Feb 12 10:10:44 2017 +0100
libpayload/string: Add SSE memcpy functions
Improve memcpy speed by using SSE for large transfers.
Performance gain is about 40% on Intel C2D with a transfer size
of 16MiB.
Keep using REP for small transfers, as it has less overhead and
SSE is slower for them anyway.
Change-Id: Ia03c27232f33f130cdf7eea8a3da86e77a221e07
Signed-off-by: Patrick Rudolph <siro at das-labor.org>
---
payloads/libpayload/arch/x86/string.c | 116 +++++++++++++++++++++++++++++++++-
1 file changed, 114 insertions(+), 2 deletions(-)
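
For reference, the renamed __memcpy_unaligned() keeps the pre-existing REP
string copy; the diff context below elides its body. A sketch of the likely
body, per the stock libpayload implementation (copy n/4 dwords with MOVSL,
then the remaining n%4 bytes with MOVSB):

static void *__memcpy_unaligned(void *dest, const void *src, size_t n)
{
	unsigned long d0, d1, d2;

	asm volatile(
		"rep ; movsl\n\t"	/* copy n/4 dwords */
		"movl %4, %%ecx\n\t"	/* then the n%4 tail bytes */
		"rep ; movsb\n\t"
		: "=&c" (d0), "=&D" (d1), "=&S" (d2)
		: "0" (n >> 2), "g" (n & 3), "1" (dest), "2" (src)
		: "memory");

	return dest;
}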
diff --git a/payloads/libpayload/arch/x86/string.c b/payloads/libpayload/arch/x86/string.c
index 2e38f94..6ec8243 100644
--- a/payloads/libpayload/arch/x86/string.c
+++ b/payloads/libpayload/arch/x86/string.c
@@ -2,6 +2,7 @@
* Copyright (C) 1991,1992,1993,1997,1998,2003, 2005 Free Software Foundation, Inc.
* This file is part of the GNU C Library.
* Copyright (c) 2011 The Chromium OS Authors.
+ * Copyright (c) 2017 Patrick Rudolph <siro at das-labor.org>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
@@ -17,7 +18,7 @@
/* From glibc-2.14, sysdeps/i386/memset.c */
#include <stdint.h>
-
+#include <x86/arch/cpuid.h>
#include "string.h"
typedef uint32_t op_t;
@@ -78,7 +79,7 @@ void *memset(void *dstpp, int c, size_t len)
return dstpp;
}
-void *memcpy(void *dest, const void *src, size_t n)
+static void *__memcpy_unaligned(void *dest, const void *src, size_t n)
{
unsigned long d0, d1, d2;
@@ -93,3 +94,114 @@ void *memcpy(void *dest, const void *src, size_t n)
return dest;
}
+
+#ifdef __SSE__
+static void *__memcpy_sse2_unaligned(u8 *to, const u8 *from, size_t n)
+{
+ size_t i;
+
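+ /* Unaligned source: MOVUPS tolerates any source alignment, while
+ * MOVNTPS streams to the 16-byte aligned destination without
+ * polluting the cache. 64 bytes are moved per iteration. */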
+ for (i = n >> 6; i > 0; i--) {
+ __asm__ __volatile__ (
+ "movups 0x00(%0), %%xmm0\n"
+ "movups 0x10(%0), %%xmm1\n"
+ "movups 0x20(%0), %%xmm2\n"
+ "movups 0x30(%0), %%xmm3\n"
+ "movntps %%xmm0, 0x00(%1)\n"
+ "movntps %%xmm1, 0x10(%1)\n"
+ "movntps %%xmm2, 0x20(%1)\n"
+ "movntps %%xmm3, 0x30(%1)\n"
+ :: "r" (from), "r" (to)
+ : "memory", "xmm0", "xmm1", "xmm2", "xmm3");
+ from += 64;
+ to += 64;
+ }
+
+ return to;
+}
+
+static void *__memcpy_sse2(u8 *to, const u8 *from, size_t n)
+{
+ size_t i;
+
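+ /* Both pointers are 16-byte aligned here: MOVAPS performs aligned
+ * loads, MOVNTPS performs non-temporal stores that bypass the cache. */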
+ for (i = n >> 6; i > 0; i--) {
+ __asm__ __volatile__ (
+ "movaps 0x00(%0), %%xmm0\n"
+ "movaps 0x10(%0), %%xmm1\n"
+ "movaps 0x20(%0), %%xmm2\n"
+ "movaps 0x30(%0), %%xmm3\n"
+ "movntps %%xmm0, 0x00(%1)\n"
+ "movntps %%xmm1, 0x10(%1)\n"
+ "movntps %%xmm2, 0x20(%1)\n"
+ "movntps %%xmm3, 0x30(%1)\n"
+ :: "r" (from), "r" (to)
+ : "memory", "xmm0", "xmm1", "xmm2", "xmm3");
+ from += 64;
+ to += 64;
+ }
+
+ return to;
+}
+
+static void *__memcpy_sse4(u8 *to, const u8 *from, size_t n)
+{
+ size_t i;
+
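+ /* SSE4.1 variant: MOVNTDQA adds streaming loads. The non-temporal
+ * hint mainly helps on write-combining memory and is harmless on
+ * ordinary write-back memory. */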
+ for (i = n >> 6; i > 0; i--) {
+ __asm__ __volatile__ (
+ "movntdqa 0x00(%0), %%xmm0\n"
+ "movntdqa 0x10(%0), %%xmm1\n"
+ "movntdqa 0x20(%0), %%xmm2\n"
+ "movntdqa 0x30(%0), %%xmm3\n"
+ "movntps %%xmm0, 0x00(%1)\n"
+ "movntps %%xmm1, 0x10(%1)\n"
+ "movntps %%xmm2, 0x20(%1)\n"
+ "movntps %%xmm3, 0x30(%1)\n"
+ :: "r" (from), "r" (to)
+ : "memory", "xmm0", "xmm1", "xmm2", "xmm3");
+ from += 64;
+ to += 64;
+ }
+
+ return to;
+}
+#endif
+
+void *memcpy(void *dest, const void *src, size_t len)
+{
+ const u8 *from = (const u8 *)src;
+ u8 *to = (u8 *)dest;
+
+ /* Tests showed that REP is faster for small memory transfers.
+ * Use non-temporal SSE stores to minimize cache pollution on
+ * large transfers (>= 256 KiB). */
+#ifdef __SSE__
+ if (cpu_id.fid.bits.sse2 && len >= (1024 * 256)) {
+ size_t delta;
+
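+ /* MOVNTPS requires a 16-byte aligned destination, so copy the
+ * first few bytes with REP until `to' is aligned. */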
+ delta = ((uintptr_t)to) & 0xf;
+ if (delta) {
+ delta = 0x10 - delta;
+ len -= delta;
+ __memcpy_unaligned(to, from, delta);
+ to += delta;
+ from += delta;
+ }
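+ /* The destination is now aligned; choose a variant based on the
+ * source alignment and the available ISA extensions. */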
+ if (((uintptr_t)from) & 0xf)
+ __memcpy_sse2_unaligned(to, from, len);
+ else if (cpu_id.fid.bits.sse41)
+ __memcpy_sse4(to, from, len);
+ else
+ __memcpy_sse2(to, from, len);
+
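+ /* Non-temporal stores are weakly ordered; fence before the REP
+ * tail copy below and before callers read the destination. */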
+ __asm__ __volatile__ ("sfence":::"memory");
+
+ to += (len >> 6) * 64;
+ from += (len >> 6) * 64;
+ len &= 63;
+ }
+
+ if (len)
+#endif
+ __memcpy_unaligned(to, from, len);
+
+ return dest;
+}
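
The ~40% figure from the commit message can be checked with a simple
cycle-count benchmark inside a payload. A minimal sketch, assuming
libpayload's malloc()/printf() and the rdtsc() helper from <arch/rdtsc.h>
(substitute any cycle counter if that helper differs on your tree):

#include <libpayload.h>
#include <arch/rdtsc.h>

/* Time one large memcpy() in TSC cycles; 16 MiB matches the
 * transfer size quoted in the commit message. */
static void bench_memcpy(void)
{
	const size_t len = 16 * 1024 * 1024;
	u8 *src = malloc(len);
	u8 *dst = malloc(len);
	unsigned long long start, cycles;

	if (!src || !dst) {
		free(src);
		free(dst);
		return;
	}

	memset(src, 0x5a, len);	/* fault in the source buffer once */

	start = rdtsc();
	memcpy(dst, src, len);
	cycles = rdtsc() - start;

	printf("memcpy of %u MiB: %llu cycles\n",
	       (unsigned)(len >> 20), cycles);

	free(src);
	free(dst);
}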