[coreboot-gerrit] Patch set updated for coreboot: 025761e libpayload: get rid of GPL'd code

Patrick Georgi (pgeorgi@google.com) gerrit@coreboot.org
Sat Jun 6 11:11:43 CEST 2015


Patrick Georgi (pgeorgi@google.com) just uploaded a new patch set to gerrit, which you can find at http://review.coreboot.org/10413

-gerrit

commit 025761edf8362883ded7b26c88663f24302d8cf2
Author: Patrick Georgi <patrick@georgi-clan.de>
Date:   Thu Jun 4 00:44:59 2015 +0200

    libpayload: get rid of GPL'd code
    
    Replace the optimized memory/string handling that came from Linux under
    GPLv2 with BSD-licensed implementations from OpenBSD, simplifying
    libpayload's licensing situation.
    
    x86: The new memcpy and memset are slightly faster, too.
    arm: Testing required.
    
    Change-Id: I7c5e070e842bd4a32f1b0821d7ed2d1932ecd6ca
    Signed-off-by: Patrick Georgi <patrick@georgi-clan.de>
---
 payloads/libpayload/LICENSES              |  13 +-
 payloads/libpayload/arch/arm/Makefile.inc |   3 +-
 payloads/libpayload/arch/arm/_memcpy.S    | 462 ++++++++++++++++++++++++++++++
 payloads/libpayload/arch/arm/asmlib.h     |  72 -----
 payloads/libpayload/arch/arm/memcpy.S     | 265 +++--------------
 payloads/libpayload/arch/arm/memmove.S    | 221 ++------------
 payloads/libpayload/arch/arm/memset.S     | 220 +++++++-------
 payloads/libpayload/arch/x86/Makefile.inc |   5 +-
 payloads/libpayload/arch/x86/memmove.S    | 106 +++++++
 payloads/libpayload/arch/x86/memset.S     |  54 ++++
 payloads/libpayload/arch/x86/string.c     | 102 -------
 11 files changed, 810 insertions(+), 713 deletions(-)
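
Since the ARM side still needs testing, a quick smoke test of the three
replaced functions can be run from a payload. This is a minimal sketch and
not part of the patch; it assumes only libpayload's standard libc (printf,
the string functions) and exercises the unaligned and overlapping cases the
new assembly special-cases:

#include <libpayload.h>

static void check(const char *name, int ok)
{
	printf("%s: %s\n", name, ok ? "ok" : "FAIL");
}

int main(void)
{
	char buf[64], ref[64];
	int i;

	/* memset: unaligned start, length not a multiple of 4 */
	memset(buf + 1, 0xa5, 37);
	check("memset", buf[1] == (char)0xa5 && buf[37] == (char)0xa5);

	/* memcpy: source and destination misaligned differently */
	for (i = 0; i < 64; i++)
		ref[i] = i;
	memcpy(buf + 2, ref + 1, 40);
	check("memcpy", buf[2] == 1 && buf[41] == 40);

	/* memmove: overlapping regions, both copy directions */
	memmove(ref + 4, ref, 32);
	check("memmove fwd", ref[4] == 0 && ref[35] == 31);
	memmove(ref, ref + 4, 32);
	check("memmove bwd", ref[0] == 0 && ref[31] == 31);

	return 0;
}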

diff --git a/payloads/libpayload/LICENSES b/payloads/libpayload/LICENSES
index f340ead..621c428 100644
--- a/payloads/libpayload/LICENSES
+++ b/payloads/libpayload/LICENSES
@@ -122,12 +122,7 @@ holders, and the exact license terms that apply.
   Original files: src/lib/libc/hash/sha1.c
   Current version we use: CVS revision 1.20 2005/08/08
 
-* arch/arm/mem*.S: GPLv2
-  Source: Linux, http://www.kernel.org
-  Original files: arch/arm/lib/mem*.S
-  Current version we use: 3.9 (418df63adac56841ef6b0f1fcf435bc64d4ed177)
-
-* arch/x86/string.c: LGPLv2.1, modified to GPLv2 under the terms of section 3
-  Source: GNU C Library (glibc), http://www.gnu.org/software/libc/libc.html
-  Original files: sysdeps/i386/memset.c
-  Current version we use: 2.14
+* arch/{arm,x86}/*mem*.S: 2-clause BSD license
+  Source: OpenBSD
+          http://www.openbsd.org/cgi-bin/cvsweb/src/lib/libc/arch/*
+  Current version we use: stated in the $OpenBSD$ header of each file
diff --git a/payloads/libpayload/arch/arm/Makefile.inc b/payloads/libpayload/arch/arm/Makefile.inc
index 6c8667a..e82c49f 100644
--- a/payloads/libpayload/arch/arm/Makefile.inc
+++ b/payloads/libpayload/arch/arm/Makefile.inc
@@ -39,8 +39,7 @@ libc-y += exception_asm.S exception.c
 libc-y += cache.c cpu.S
 libc-y += selfboot.c
 
-# Will fall back to default_memXXX() in libc/memory.c if GPL not allowed.
-libc-$(CONFIG_LP_GPL) += memcpy.S memset.S memmove.S
+libc-y += memcpy.S memset.S memmove.S _memcpy.S
 
 libgdb-y += gdb.c
 
diff --git a/payloads/libpayload/arch/arm/_memcpy.S b/payloads/libpayload/arch/arm/_memcpy.S
new file mode 100644
index 0000000..3e6acc1
--- /dev/null
+++ b/payloads/libpayload/arch/arm/_memcpy.S
@@ -0,0 +1,462 @@
+/*	$OpenBSD: _memcpy.S,v 1.3 2008/06/26 05:42:04 ray Exp $	*/
+/*	$NetBSD: _memcpy.S,v 1.4 2003/04/05 23:08:52 bjh21 Exp $	*/
+
+/*-
+ * Copyright (c) 1997 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Neil A. Carson and Mark Brinicombe
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <arch/asm.h>
+
+/*
+ * This is one fun bit of code ...
+ * Some easy listening music is suggested while trying to understand this
+ * code e.g. Iron Maiden
+ *
+ * For anyone attempting to understand it :
+ *
+ * The core code is implemented here with simple stubs for memcpy()
+ * memmove() and bcopy().
+ *
+ * All local labels are prefixed with Lmemcpy_
+ * Following the prefix a label starting f is used in the forward copy code
+ * while a label using b is used in the backwards copy code
+ * The source and destination addresses determine whether a forward or
+ * backward copy is performed.
+ * Separate bits of code are used to deal with the following situations
+ * for both the forward and backwards copy.
+ * unaligned source address
+ * unaligned destination address
+ * Separate copy routines are used to produce an optimised result for each
+ * of these cases.
+ * The copy code will use LDM/STM instructions to copy up to 32 bytes at
+ * a time where possible.
+ *
+ * Note: r12 (aka ip) can be trashed during the function along with
+ * r0-r3 although r0-r2 have defined uses i.e. src, dest, len through out.
+ * Additional registers are preserved prior to use i.e. r4, r5 & lr
+ *
+ * Apologies for the state of the comments ;-)
+ */
+
+ENTRY(_memcpy)
+	/* Determine copy direction */
+	cmp	r1, r0
+	bcc	.Lmemcpy_backwards
+
+	moveq	r0, #0			/* Quick abort for len=0 */
+	moveq	pc, lr
+
+	stmdb	sp!, {r0, lr}		/* memcpy() returns dest addr */
+	subs	r2, r2, #4
+	blt	.Lmemcpy_fl4		/* less than 4 bytes */
+	ands	r12, r0, #3
+	bne	.Lmemcpy_fdestul	/* oh unaligned destination addr */
+	ands	r12, r1, #3
+	bne	.Lmemcpy_fsrcul		/* oh unaligned source addr */
+
+.Lmemcpy_ft8:
+	/* We have aligned source and destination */
+	subs	r2, r2, #8
+	blt	.Lmemcpy_fl12		/* less than 12 bytes (4 from above) */
+	subs	r2, r2, #0x14
+	blt	.Lmemcpy_fl32		/* less than 32 bytes (12 from above) */
+	stmdb	sp!, {r4}		/* borrow r4 */
+
+	/* blat 32 bytes at a time */
+	/* XXX for really big copies perhaps we should use more registers */
+.Lmemcpy_floop32:
+	ldmia	r1!, {r3, r4, r12, lr}
+	stmia	r0!, {r3, r4, r12, lr}
+	ldmia	r1!, {r3, r4, r12, lr}
+	stmia	r0!, {r3, r4, r12, lr}
+	subs	r2, r2, #0x20
+	bge	.Lmemcpy_floop32
+
+	cmn	r2, #0x10
+	ldmgeia	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
+	stmgeia	r0!, {r3, r4, r12, lr}
+	subge	r2, r2, #0x10
+	ldmia	sp!, {r4}		/* return r4 */
+
+.Lmemcpy_fl32:
+	adds	r2, r2, #0x14
+
+	/* blat 12 bytes at a time */
+.Lmemcpy_floop12:
+	ldmgeia	r1!, {r3, r12, lr}
+	stmgeia	r0!, {r3, r12, lr}
+	subges	r2, r2, #0x0c
+	bge	.Lmemcpy_floop12
+
+.Lmemcpy_fl12:
+	adds	r2, r2, #8
+	blt	.Lmemcpy_fl4
+
+	subs	r2, r2, #4
+	ldrlt	r3, [r1], #4
+	strlt	r3, [r0], #4
+	ldmgeia	r1!, {r3, r12}
+	stmgeia	r0!, {r3, r12}
+	subge	r2, r2, #4
+
+.Lmemcpy_fl4:
+	/* less than 4 bytes to go */
+	adds	r2, r2, #4
+	ldmeqia	sp!, {r0, pc}		/* done */
+
+	/* copy the crud byte at a time */
+	cmp	r2, #2
+	ldrb	r3, [r1], #1
+	strb	r3, [r0], #1
+	ldrgeb	r3, [r1], #1
+	strgeb	r3, [r0], #1
+	ldrgtb	r3, [r1], #1
+	strgtb	r3, [r0], #1
+	ldmia	sp!, {r0, pc}
+
+	/* erg - unaligned destination */
+.Lmemcpy_fdestul:
+	rsb	r12, r12, #4
+	cmp	r12, #2
+
+	/* align destination with byte copies */
+	ldrb	r3, [r1], #1
+	strb	r3, [r0], #1
+	ldrgeb	r3, [r1], #1
+	strgeb	r3, [r0], #1
+	ldrgtb	r3, [r1], #1
+	strgtb	r3, [r0], #1
+	subs	r2, r2, r12
+	blt	.Lmemcpy_fl4		/* less than 4 bytes */
+
+	ands	r12, r1, #3
+	beq	.Lmemcpy_ft8		/* we have an aligned source */
+
+	/* erg - unaligned source */
+	/* This is where it gets nasty ... */
+.Lmemcpy_fsrcul:
+	bic	r1, r1, #3
+	ldr	lr, [r1], #4
+	cmp	r12, #2
+	bgt	.Lmemcpy_fsrcul3
+	beq	.Lmemcpy_fsrcul2
+	cmp	r2, #0x0c
+	blt	.Lmemcpy_fsrcul1loop4
+	sub	r2, r2, #0x0c
+	stmdb	sp!, {r4, r5}
+
+.Lmemcpy_fsrcul1loop16:
+	mov	r3, lr, lsr #8
+	ldmia	r1!, {r4, r5, r12, lr}
+	orr	r3, r3, r4, lsl #24
+	mov	r4, r4, lsr #8
+	orr	r4, r4, r5, lsl #24
+	mov	r5, r5, lsr #8
+	orr	r5, r5, r12, lsl #24
+	mov	r12, r12, lsr #8
+	orr	r12, r12, lr, lsl #24
+	stmia	r0!, {r3-r5, r12}
+	subs	r2, r2, #0x10
+	bge	.Lmemcpy_fsrcul1loop16
+	ldmia	sp!, {r4, r5}
+	adds	r2, r2, #0x0c
+	blt	.Lmemcpy_fsrcul1l4
+
+.Lmemcpy_fsrcul1loop4:
+	mov	r12, lr, lsr #8
+	ldr	lr, [r1], #4
+	orr	r12, r12, lr, lsl #24
+	str	r12, [r0], #4
+	subs	r2, r2, #4
+	bge	.Lmemcpy_fsrcul1loop4
+
+.Lmemcpy_fsrcul1l4:
+	sub	r1, r1, #3
+	b	.Lmemcpy_fl4
+
+.Lmemcpy_fsrcul2:
+	cmp	r2, #0x0c
+	blt	.Lmemcpy_fsrcul2loop4
+	sub	r2, r2, #0x0c
+	stmdb	sp!, {r4, r5}
+
+.Lmemcpy_fsrcul2loop16:
+	mov	r3, lr, lsr #16
+	ldmia	r1!, {r4, r5, r12, lr}
+	orr	r3, r3, r4, lsl #16
+	mov	r4, r4, lsr #16
+	orr	r4, r4, r5, lsl #16
+	mov	r5, r5, lsr #16
+	orr	r5, r5, r12, lsl #16
+	mov	r12, r12, lsr #16
+	orr	r12, r12, lr, lsl #16
+	stmia	r0!, {r3-r5, r12}
+	subs	r2, r2, #0x10
+	bge	.Lmemcpy_fsrcul2loop16
+	ldmia	sp!, {r4, r5}
+	adds	r2, r2, #0x0c
+	blt	.Lmemcpy_fsrcul2l4
+
+.Lmemcpy_fsrcul2loop4:
+	mov	r12, lr, lsr #16
+	ldr	lr, [r1], #4
+	orr	r12, r12, lr, lsl #16
+	str	r12, [r0], #4
+	subs	r2, r2, #4
+	bge	.Lmemcpy_fsrcul2loop4
+
+.Lmemcpy_fsrcul2l4:
+	sub	r1, r1, #2
+	b	.Lmemcpy_fl4
+
+.Lmemcpy_fsrcul3:
+	cmp	r2, #0x0c
+	blt	.Lmemcpy_fsrcul3loop4
+	sub	r2, r2, #0x0c
+	stmdb	sp!, {r4, r5}
+
+.Lmemcpy_fsrcul3loop16:
+	mov	r3, lr, lsr #24
+	ldmia	r1!, {r4, r5, r12, lr}
+	orr	r3, r3, r4, lsl #8
+	mov	r4, r4, lsr #24
+	orr	r4, r4, r5, lsl #8
+	mov	r5, r5, lsr #24
+	orr	r5, r5, r12, lsl #8
+	mov	r12, r12, lsr #24
+	orr	r12, r12, lr, lsl #8
+	stmia	r0!, {r3-r5, r12}
+	subs	r2, r2, #0x10
+	bge	.Lmemcpy_fsrcul3loop16
+	ldmia	sp!, {r4, r5}
+	adds	r2, r2, #0x0c
+	blt	.Lmemcpy_fsrcul3l4
+
+.Lmemcpy_fsrcul3loop4:
+	mov	r12, lr, lsr #24
+	ldr	lr, [r1], #4
+	orr	r12, r12, lr, lsl #8
+	str	r12, [r0], #4
+	subs	r2, r2, #4
+	bge	.Lmemcpy_fsrcul3loop4
+
+.Lmemcpy_fsrcul3l4:
+	sub	r1, r1, #1
+	b	.Lmemcpy_fl4
+
+.Lmemcpy_backwards:
+	add	r1, r1, r2
+	add	r0, r0, r2
+	subs	r2, r2, #4
+	blt	.Lmemcpy_bl4		/* less than 4 bytes */
+	ands	r12, r0, #3
+	bne	.Lmemcpy_bdestul	/* oh unaligned destination addr */
+	ands	r12, r1, #3
+	bne	.Lmemcpy_bsrcul		/* oh unaligned source addr */
+
+.Lmemcpy_bt8:
+	/* We have aligned source and destination */
+	subs	r2, r2, #8
+	blt	.Lmemcpy_bl12		/* less than 12 bytes (4 from above) */
+	stmdb	sp!, {r4, lr}
+	subs	r2, r2, #0x14		/* less than 32 bytes (12 from above) */
+	blt	.Lmemcpy_bl32
+
+	/* blat 32 bytes at a time */
+	/* XXX for really big copies perhaps we should use more registers */
+.Lmemcpy_bloop32:
+	ldmdb	r1!, {r3, r4, r12, lr}
+	stmdb	r0!, {r3, r4, r12, lr}
+	ldmdb	r1!, {r3, r4, r12, lr}
+	stmdb	r0!, {r3, r4, r12, lr}
+	subs	r2, r2, #0x20
+	bge	.Lmemcpy_bloop32
+
+.Lmemcpy_bl32:
+	cmn	r2, #0x10
+	ldmgedb	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
+	stmgedb	r0!, {r3, r4, r12, lr}
+	subge	r2, r2, #0x10
+	adds	r2, r2, #0x14
+	ldmgedb	r1!, {r3, r12, lr}	/* blat a remaining 12 bytes */
+	stmgedb	r0!, {r3, r12, lr}
+	subge	r2, r2, #0x0c
+	ldmia	sp!, {r4, lr}
+
+.Lmemcpy_bl12:
+	adds	r2, r2, #8
+	blt	.Lmemcpy_bl4
+	subs	r2, r2, #4
+	ldrlt	r3, [r1, #-4]!
+	strlt	r3, [r0, #-4]!
+	ldmgedb	r1!, {r3, r12}
+	stmgedb	r0!, {r3, r12}
+	subge	r2, r2, #4
+
+.Lmemcpy_bl4:
+	/* less than 4 bytes to go */
+	adds	r2, r2, #4
+	moveq	pc, lr			/* done */
+
+	/* copy the crud byte at a time */
+	cmp	r2, #2
+	ldrb	r3, [r1, #-1]!
+	strb	r3, [r0, #-1]!
+	ldrgeb	r3, [r1, #-1]!
+	strgeb	r3, [r0, #-1]!
+	ldrgtb	r3, [r1, #-1]!
+	strgtb	r3, [r0, #-1]!
+	mov	pc, lr
+
+	/* erg - unaligned destination */
+.Lmemcpy_bdestul:
+	cmp	r12, #2
+
+	/* align destination with byte copies */
+	ldrb	r3, [r1, #-1]!
+	strb	r3, [r0, #-1]!
+	ldrgeb	r3, [r1, #-1]!
+	strgeb	r3, [r0, #-1]!
+	ldrgtb	r3, [r1, #-1]!
+	strgtb	r3, [r0, #-1]!
+	subs	r2, r2, r12
+	blt	.Lmemcpy_bl4		/* less than 4 bytes to go */
+	ands	r12, r1, #3
+	beq	.Lmemcpy_bt8		/* we have an aligned source */
+
+	/* erg - unaligned source */
+	/* This is where it gets nasty ... */
+.Lmemcpy_bsrcul:
+	bic	r1, r1, #3
+	ldr	r3, [r1, #0]
+	cmp	r12, #2
+	blt	.Lmemcpy_bsrcul1
+	beq	.Lmemcpy_bsrcul2
+	cmp	r2, #0x0c
+	blt	.Lmemcpy_bsrcul3loop4
+	sub	r2, r2, #0x0c
+	stmdb	sp!, {r4, r5, lr}
+
+.Lmemcpy_bsrcul3loop16:
+	mov	lr, r3, lsl #8
+	ldmdb	r1!, {r3-r5, r12}
+	orr	lr, lr, r12, lsr #24
+	mov	r12, r12, lsl #8
+	orr	r12, r12, r5, lsr #24
+	mov	r5, r5, lsl #8
+	orr	r5, r5, r4, lsr #24
+	mov	r4, r4, lsl #8
+	orr	r4, r4, r3, lsr #24
+	stmdb	r0!, {r4, r5, r12, lr}
+	subs	r2, r2, #0x10
+	bge	.Lmemcpy_bsrcul3loop16
+	ldmia	sp!, {r4, r5, lr}
+	adds	r2, r2, #0x0c
+	blt	.Lmemcpy_bsrcul3l4
+
+.Lmemcpy_bsrcul3loop4:
+	mov	r12, r3, lsl #8
+	ldr	r3, [r1, #-4]!
+	orr	r12, r12, r3, lsr #24
+	str	r12, [r0, #-4]!
+	subs	r2, r2, #4
+	bge	.Lmemcpy_bsrcul3loop4
+
+.Lmemcpy_bsrcul3l4:
+	add	r1, r1, #3
+	b	.Lmemcpy_bl4
+
+.Lmemcpy_bsrcul2:
+	cmp	r2, #0x0c
+	blt	.Lmemcpy_bsrcul2loop4
+	sub	r2, r2, #0x0c
+	stmdb	sp!, {r4, r5, lr}
+
+.Lmemcpy_bsrcul2loop16:
+	mov	lr, r3, lsl #16
+	ldmdb	r1!, {r3-r5, r12}
+	orr	lr, lr, r12, lsr #16
+	mov	r12, r12, lsl #16
+	orr	r12, r12, r5, lsr #16
+	mov	r5, r5, lsl #16
+	orr	r5, r5, r4, lsr #16
+	mov	r4, r4, lsl #16
+	orr	r4, r4, r3, lsr #16
+	stmdb	r0!, {r4, r5, r12, lr}
+	subs	r2, r2, #0x10
+	bge	.Lmemcpy_bsrcul2loop16
+	ldmia	sp!, {r4, r5, lr}
+	adds	r2, r2, #0x0c
+	blt	.Lmemcpy_bsrcul2l4
+
+.Lmemcpy_bsrcul2loop4:
+	mov	r12, r3, lsl #16
+	ldr	r3, [r1, #-4]!
+	orr	r12, r12, r3, lsr #16
+	str	r12, [r0, #-4]!
+	subs	r2, r2, #4
+	bge	.Lmemcpy_bsrcul2loop4
+
+.Lmemcpy_bsrcul2l4:
+	add	r1, r1, #2
+	b	.Lmemcpy_bl4
+
+.Lmemcpy_bsrcul1:
+	cmp	r2, #0x0c
+	blt	.Lmemcpy_bsrcul1loop4
+	sub	r2, r2, #0x0c
+	stmdb	sp!, {r4, r5, lr}
+
+.Lmemcpy_bsrcul1loop32:
+	mov	lr, r3, lsl #24
+	ldmdb	r1!, {r3-r5, r12}
+	orr	lr, lr, r12, lsr #8
+	mov	r12, r12, lsl #24
+	orr	r12, r12, r5, lsr #8
+	mov	r5, r5, lsl #24
+	orr	r5, r5, r4, lsr #8
+	mov	r4, r4, lsl #24
+	orr	r4, r4, r3, lsr #8
+	stmdb	r0!, {r4, r5, r12, lr}
+	subs	r2, r2, #0x10
+	bge	.Lmemcpy_bsrcul1loop32
+	ldmia	sp!, {r4, r5, lr}
+	adds	r2, r2, #0x0c
+	blt	.Lmemcpy_bsrcul1l4
+
+.Lmemcpy_bsrcul1loop4:
+	mov	r12, r3, lsl #24
+	ldr	r3, [r1, #-4]!
+	orr	r12, r12, r3, lsr #8
+	str	r12, [r0, #-4]!
+	subs	r2, r2, #4
+	bge	.Lmemcpy_bsrcul1loop4
+
+.Lmemcpy_bsrcul1l4:
+	add	r1, r1, #1
+	b	.Lmemcpy_bl4
diff --git a/payloads/libpayload/arch/arm/asmlib.h b/payloads/libpayload/arch/arm/asmlib.h
deleted file mode 100644
index 8b3fa22..0000000
--- a/payloads/libpayload/arch/arm/asmlib.h
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- *  arch/arm/asmlib.h
- *
- *  Adapted from Linux arch/arm/include/assembler.h
- *
- *  Copyright (C) 1996-2000 Russell King
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- *  This file contains arm architecture specific defines
- *  for the different processors.
- *
- *  Do not include any C declarations in this file - it is included by
- *  assembler source.
- */
-
-/*
- * WARNING: This file is *only* meant for memcpy.S and friends which were copied
- * from Linux and require some weird macros. It does unspeakable things like
- * redefining "push", so do *not* try to turn it into a general assembly macro
- * file, and keep it out of global include directories.
- */
-
-#ifndef __ARM_ASMLIB_H__
-#define __ARM_ASMLIB_H__
-
-/*
- * Endian independent macros for shifting bytes within registers.
- */
-#ifndef __ARMEB__
-#define pull		lsr
-#define push		lsl
-#define get_byte_0	lsl #0
-#define get_byte_1	lsr #8
-#define get_byte_2	lsr #16
-#define get_byte_3	lsr #24
-#define put_byte_0	lsl #0
-#define put_byte_1	lsl #8
-#define put_byte_2	lsl #16
-#define put_byte_3	lsl #24
-#else
-#define pull		lsl
-#define push		lsr
-#define get_byte_0	lsr #24
-#define get_byte_1	lsr #16
-#define get_byte_2	lsr #8
-#define get_byte_3      lsl #0
-#define put_byte_0	lsl #24
-#define put_byte_1	lsl #16
-#define put_byte_2	lsl #8
-#define put_byte_3      lsl #0
-#endif
-
-/*
- * Data preload for architectures that support it
- */
-#if 1	/* TODO: differentiate once libpayload supports more ARM versions */
-#define PLD(code...)	code
-#else
-#define PLD(code...)
-#endif
-
-/*
- * This can be used to enable code to cacheline align the destination
- * pointer when bulk writing to memory. Linux doesn't enable this except
- * for the "Feroceon" processor, so we better just leave it out.
- */
-#define CALGN(code...)
-
-#endif	/* __ARM_ASMLIB_H */
diff --git a/payloads/libpayload/arch/arm/memcpy.S b/payloads/libpayload/arch/arm/memcpy.S
index 1388d05..6d68639 100644
--- a/payloads/libpayload/arch/arm/memcpy.S
+++ b/payloads/libpayload/arch/arm/memcpy.S
@@ -1,237 +1,44 @@
-/*
- *  linux/arch/arm/lib/memcpy.S
+/*	$OpenBSD: memcpy.S,v 1.4 2014/11/30 19:43:56 deraadt Exp $	*/
+/*	$NetBSD: memcpy.S,v 1.3 2003/04/05 23:08:52 bjh21 Exp $	*/
+
+/*-
+ * Copyright (c) 1997 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Neil A. Carson and Mark Brinicombe
  *
- *  Author:	Nicolas Pitre
- *  Created:	Sep 28, 2005
- *  Copyright:	MontaVista Software, Inc.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
  *
- *  This program is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License version 2 as
- *  published by the Free Software Foundation.
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <arch/asm.h>
-#include "asmlib.h"
-
-#define LDR1W_SHIFT	0
-#define STR1W_SHIFT	0
-
-	.macro ldr1w ptr reg abort
-	W(ldr) \reg, [\ptr], #4
-	.endm
-
-	.macro ldr4w ptr reg1 reg2 reg3 reg4 abort
-	ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4}
-	.endm
-
-	.macro ldr8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
-	ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8}
-	.endm
-
-	.macro ldr1b ptr reg cond=al abort
-	ldr\cond\()b \reg, [\ptr], #1
-	.endm
-
-	.macro str1w ptr reg abort
-	W(str) \reg, [\ptr], #4
-	.endm
-
-	.macro str8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
-	stmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8}
-	.endm
-
-	.macro str1b ptr reg cond=al abort
-	str\cond\()b \reg, [\ptr], #1
-	.endm
-
-	.macro enter reg1 reg2
-	stmdb sp!, {r0, \reg1, \reg2}
-	.endm
 
-	.macro exit reg1 reg2
-	ldmfd sp!, {r0, \reg1, \reg2}
-	.endm
-
-/* Prototype: void *memcpy(void *dest, const void *src, size_t n); */
+/*
+ * XXX
+ * the _memcpy function which this calls is actually a _memmove
+ * variant which handles overlaps...  That should be fixed.
+ */
 
 ENTRY(memcpy)
-
-		enter	r4, lr
-
-		subs	r2, r2, #4
-		blt	8f
-		ands	ip, r0, #3
-	PLD(	pld	[r1, #0]		)
-		bne	9f
-		ands	ip, r1, #3
-		bne	10f
-
-1:		subs	r2, r2, #(28)
-		stmfd	sp!, {r5 - r8}
-		blt	5f
-
-	CALGN(	ands	ip, r0, #31		)
-	CALGN(	rsb	r3, ip, #32		)
-	CALGN(	sbcnes	r4, r3, r2		)  @ C is always set here
-	CALGN(	bcs	2f			)
-	CALGN(	adr	r4, 6f			)
-	CALGN(	subs	r2, r2, r3		)  @ C gets set
-	CALGN(	add	pc, r4, ip		)
-
-	PLD(	pld	[r1, #0]		)
-2:	PLD(	subs	r2, r2, #96		)
-	PLD(	pld	[r1, #28]		)
-	PLD(	blt	4f			)
-	PLD(	pld	[r1, #60]		)
-	PLD(	pld	[r1, #92]		)
-
-3:	PLD(	pld	[r1, #124]		)
-4:		ldr8w	r1, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
-		subs	r2, r2, #32
-		str8w	r0, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
-		bge	3b
-	PLD(	cmn	r2, #96			)
-	PLD(	bge	4b			)
-
-5:		ands	ip, r2, #28
-		rsb	ip, ip, #32
-#if LDR1W_SHIFT > 0
-		lsl	ip, ip, #LDR1W_SHIFT
-#endif
-		addne	pc, pc, ip		@ C is always clear here
-		b	7f
-6:
-		.rept	(1 << LDR1W_SHIFT)
-		W(nop)
-		.endr
-		ldr1w	r1, r3, abort=20f
-		ldr1w	r1, r4, abort=20f
-		ldr1w	r1, r5, abort=20f
-		ldr1w	r1, r6, abort=20f
-		ldr1w	r1, r7, abort=20f
-		ldr1w	r1, r8, abort=20f
-		ldr1w	r1, lr, abort=20f
-
-#if LDR1W_SHIFT < STR1W_SHIFT
-		lsl	ip, ip, #STR1W_SHIFT - LDR1W_SHIFT
-#elif LDR1W_SHIFT > STR1W_SHIFT
-		lsr	ip, ip, #LDR1W_SHIFT - STR1W_SHIFT
-#endif
-		add	pc, pc, ip
-		nop
-		.rept	(1 << STR1W_SHIFT)
-		W(nop)
-		.endr
-		str1w	r0, r3, abort=20f
-		str1w	r0, r4, abort=20f
-		str1w	r0, r5, abort=20f
-		str1w	r0, r6, abort=20f
-		str1w	r0, r7, abort=20f
-		str1w	r0, r8, abort=20f
-		str1w	r0, lr, abort=20f
-
-	CALGN(	bcs	2b			)
-
-7:		ldmfd	sp!, {r5 - r8}
-
-8:		movs	r2, r2, lsl #31
-		ldr1b	r1, r3, ne, abort=21f
-		ldr1b	r1, r4, cs, abort=21f
-		ldr1b	r1, ip, cs, abort=21f
-		str1b	r0, r3, ne, abort=21f
-		str1b	r0, r4, cs, abort=21f
-		str1b	r0, ip, cs, abort=21f
-
-		exit	r4, pc
-
-9:		rsb	ip, ip, #4
-		cmp	ip, #2
-		ldr1b	r1, r3, gt, abort=21f
-		ldr1b	r1, r4, ge, abort=21f
-		ldr1b	r1, lr, abort=21f
-		str1b	r0, r3, gt, abort=21f
-		str1b	r0, r4, ge, abort=21f
-		subs	r2, r2, ip
-		str1b	r0, lr, abort=21f
-		blt	8b
-		ands	ip, r1, #3
-		beq	1b
-
-10:		bic	r1, r1, #3
-		cmp	ip, #2
-		ldr1w	r1, lr, abort=21f
-		beq	17f
-		bgt	18f
-
-
-		.macro	forward_copy_shift pull push
-
-		subs	r2, r2, #28
-		blt	14f
-
-	CALGN(	ands	ip, r0, #31		)
-	CALGN(	rsb	ip, ip, #32		)
-	CALGN(	sbcnes	r4, ip, r2		)  @ C is always set here
-	CALGN(	subcc	r2, r2, ip		)
-	CALGN(	bcc	15f			)
-
-11:		stmfd	sp!, {r5 - r9}
-
-	PLD(	pld	[r1, #0]		)
-	PLD(	subs	r2, r2, #96		)
-	PLD(	pld	[r1, #28]		)
-	PLD(	blt	13f			)
-	PLD(	pld	[r1, #60]		)
-	PLD(	pld	[r1, #92]		)
-
-12:	PLD(	pld	[r1, #124]		)
-13:		ldr4w	r1, r4, r5, r6, r7, abort=19f
-		mov	r3, lr, pull #\pull
-		subs	r2, r2, #32
-		ldr4w	r1, r8, r9, ip, lr, abort=19f
-		orr	r3, r3, r4, push #\push
-		mov	r4, r4, pull #\pull
-		orr	r4, r4, r5, push #\push
-		mov	r5, r5, pull #\pull
-		orr	r5, r5, r6, push #\push
-		mov	r6, r6, pull #\pull
-		orr	r6, r6, r7, push #\push
-		mov	r7, r7, pull #\pull
-		orr	r7, r7, r8, push #\push
-		mov	r8, r8, pull #\pull
-		orr	r8, r8, r9, push #\push
-		mov	r9, r9, pull #\pull
-		orr	r9, r9, ip, push #\push
-		mov	ip, ip, pull #\pull
-		orr	ip, ip, lr, push #\push
-		str8w	r0, r3, r4, r5, r6, r7, r8, r9, ip, , abort=19f
-		bge	12b
-	PLD(	cmn	r2, #96			)
-	PLD(	bge	13b			)
-
-		ldmfd	sp!, {r5 - r9}
-
-14:		ands	ip, r2, #28
-		beq	16f
-
-15:		mov	r3, lr, pull #\pull
-		ldr1w	r1, lr, abort=21f
-		subs	ip, ip, #4
-		orr	r3, r3, lr, push #\push
-		str1w	r0, r3, abort=21f
-		bgt	15b
-	CALGN(	cmp	r2, #0			)
-	CALGN(	bge	11b			)
-
-16:		sub	r1, r1, #(\push / 8)
-		b	8b
-
-		.endm
-
-
-		forward_copy_shift	pull=8	push=24
-
-17:		forward_copy_shift	pull=16	push=16
-
-18:		forward_copy_shift	pull=24	push=8
-ENDPROC(memcpy)
+	stmfd	sp!, {r0, lr}
+	bl	_memcpy
+	ldmfd	sp!, {r0, pc}
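
In C terms, the stub above together with the _memcpy core behaves like the
sketch below: a raw pointer comparison selects the copy direction, which is
why the same core can back both memcpy() and memmove() (hence the XXX note
above). The helper name is made up for illustration, and comparing pointers
into unrelated objects is formally undefined in ISO C, so read this purely
as documentation of the assembly:

#include <stddef.h>

static void *copy_either_direction(void *dst, const void *src, size_t len)
{
	unsigned char *d = dst;
	const unsigned char *s = src;

	if (s < d) {		/* "cmp r1, r0; bcc .Lmemcpy_backwards" */
		d += len;	/* overlap-safe backwards copy */
		s += len;
		while (len--)
			*--d = *--s;
	} else {
		while (len--)	/* forwards copy */
			*d++ = *s++;
	}
	return dst;
}
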
diff --git a/payloads/libpayload/arch/arm/memmove.S b/payloads/libpayload/arch/arm/memmove.S
index bd5f8f1..f071068 100644
--- a/payloads/libpayload/arch/arm/memmove.S
+++ b/payloads/libpayload/arch/arm/memmove.S
@@ -1,197 +1,38 @@
-/*
- *  linux/arch/arm/lib/memmove.S
+/*	$OpenBSD: memmove.S,v 1.3 2008/06/26 05:42:04 ray Exp $	*/
+/*	$NetBSD: memmove.S,v 1.3 2003/04/05 23:08:52 bjh21 Exp $	*/
+
+/*-
+ * Copyright (c) 1997 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Neil A. Carson and Mark Brinicombe
  *
- *  Author:	Nicolas Pitre
- *  Created:	Sep 28, 2005
- *  Copyright:	(C) MontaVista Software Inc.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
  *
- *  This program is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License version 2 as
- *  published by the Free Software Foundation.
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <arch/asm.h>
-#include "asmlib.h"
-
-/*
- * Prototype: void *memmove(void *dest, const void *src, size_t n);
- *
- * Note:
- *
- * If the memory regions don't overlap, we simply branch to memcpy which is
- * normally a bit faster. Otherwise the copy is done going downwards.  This
- * is a transposition of the code from copy_template.S but with the copy
- * occurring in the opposite direction.
- */
 
 ENTRY(memmove)
-
-		subs	ip, r0, r1
-		cmphi	r2, ip
-		bls	memcpy
-
-		stmfd	sp!, {r0, r4, lr}
-		add	r1, r1, r2
-		add	r0, r0, r2
-		subs	r2, r2, #4
-		blt	8f
-		ands	ip, r0, #3
-	PLD(	pld	[r1, #-4]		)
-		bne	9f
-		ands	ip, r1, #3
-		bne	10f
-
-1:		subs	r2, r2, #(28)
-		stmfd	sp!, {r5 - r8}
-		blt	5f
-
-	CALGN(	ands	ip, r0, #31		)
-	CALGN(	sbcnes	r4, ip, r2		)  @ C is always set here
-	CALGN(	bcs	2f			)
-	CALGN(	adr	r4, 6f			)
-	CALGN(	subs	r2, r2, ip		)  @ C is set here
-	CALGN(	rsb	ip, ip, #32		)
-	CALGN(	add	pc, r4, ip		)
-
-	PLD(	pld	[r1, #-4]		)
-2:	PLD(	subs	r2, r2, #96		)
-	PLD(	pld	[r1, #-32]		)
-	PLD(	blt	4f			)
-	PLD(	pld	[r1, #-64]		)
-	PLD(	pld	[r1, #-96]		)
-
-3:	PLD(	pld	[r1, #-128]		)
-4:		ldmdb	r1!, {r3, r4, r5, r6, r7, r8, ip, lr}
-		subs	r2, r2, #32
-		stmdb	r0!, {r3, r4, r5, r6, r7, r8, ip, lr}
-		bge	3b
-	PLD(	cmn	r2, #96			)
-	PLD(	bge	4b			)
-
-5:		ands	ip, r2, #28
-		rsb	ip, ip, #32
-		addne	pc, pc, ip		@ C is always clear here
-		b	7f
-6:		W(nop)
-		W(ldr)	r3, [r1, #-4]!
-		W(ldr)	r4, [r1, #-4]!
-		W(ldr)	r5, [r1, #-4]!
-		W(ldr)	r6, [r1, #-4]!
-		W(ldr)	r7, [r1, #-4]!
-		W(ldr)	r8, [r1, #-4]!
-		W(ldr)	lr, [r1, #-4]!
-
-		add	pc, pc, ip
-		nop
-		W(nop)
-		W(str)	r3, [r0, #-4]!
-		W(str)	r4, [r0, #-4]!
-		W(str)	r5, [r0, #-4]!
-		W(str)	r6, [r0, #-4]!
-		W(str)	r7, [r0, #-4]!
-		W(str)	r8, [r0, #-4]!
-		W(str)	lr, [r0, #-4]!
-
-	CALGN(	bcs	2b			)
-
-7:		ldmfd	sp!, {r5 - r8}
-
-8:		movs	r2, r2, lsl #31
-		ldrneb	r3, [r1, #-1]!
-		ldrcsb	r4, [r1, #-1]!
-		ldrcsb	ip, [r1, #-1]
-		strneb	r3, [r0, #-1]!
-		strcsb	r4, [r0, #-1]!
-		strcsb	ip, [r0, #-1]
-		ldmfd	sp!, {r0, r4, pc}
-
-9:		cmp	ip, #2
-		ldrgtb	r3, [r1, #-1]!
-		ldrgeb	r4, [r1, #-1]!
-		ldrb	lr, [r1, #-1]!
-		strgtb	r3, [r0, #-1]!
-		strgeb	r4, [r0, #-1]!
-		subs	r2, r2, ip
-		strb	lr, [r0, #-1]!
-		blt	8b
-		ands	ip, r1, #3
-		beq	1b
-
-10:		bic	r1, r1, #3
-		cmp	ip, #2
-		ldr	r3, [r1, #0]
-		beq	17f
-		blt	18f
-
-
-		.macro	backward_copy_shift push pull
-
-		subs	r2, r2, #28
-		blt	14f
-
-	CALGN(	ands	ip, r0, #31		)
-	CALGN(	sbcnes	r4, ip, r2		)  @ C is always set here
-	CALGN(	subcc	r2, r2, ip		)
-	CALGN(	bcc	15f			)
-
-11:		stmfd	sp!, {r5 - r9}
-
-	PLD(	pld	[r1, #-4]		)
-	PLD(	subs	r2, r2, #96		)
-	PLD(	pld	[r1, #-32]		)
-	PLD(	blt	13f			)
-	PLD(	pld	[r1, #-64]		)
-	PLD(	pld	[r1, #-96]		)
-
-12:	PLD(	pld	[r1, #-128]		)
-13:		ldmdb   r1!, {r7, r8, r9, ip}
-		mov     lr, r3, push #\push
-		subs    r2, r2, #32
-		ldmdb   r1!, {r3, r4, r5, r6}
-		orr     lr, lr, ip, pull #\pull
-		mov     ip, ip, push #\push
-		orr     ip, ip, r9, pull #\pull
-		mov     r9, r9, push #\push
-		orr     r9, r9, r8, pull #\pull
-		mov     r8, r8, push #\push
-		orr     r8, r8, r7, pull #\pull
-		mov     r7, r7, push #\push
-		orr     r7, r7, r6, pull #\pull
-		mov     r6, r6, push #\push
-		orr     r6, r6, r5, pull #\pull
-		mov     r5, r5, push #\push
-		orr     r5, r5, r4, pull #\pull
-		mov     r4, r4, push #\push
-		orr     r4, r4, r3, pull #\pull
-		stmdb   r0!, {r4 - r9, ip, lr}
-		bge	12b
-	PLD(	cmn	r2, #96			)
-	PLD(	bge	13b			)
-
-		ldmfd	sp!, {r5 - r9}
-
-14:		ands	ip, r2, #28
-		beq	16f
-
-15:		mov     lr, r3, push #\push
-		ldr	r3, [r1, #-4]!
-		subs	ip, ip, #4
-		orr	lr, lr, r3, pull #\pull
-		str	lr, [r0, #-4]!
-		bgt	15b
-	CALGN(	cmp	r2, #0			)
-	CALGN(	bge	11b			)
-
-16:		add	r1, r1, #(\pull / 8)
-		b	8b
-
-		.endm
-
-
-		backward_copy_shift	push=8	pull=24
-
-17:		backward_copy_shift	push=16	pull=16
-
-18:		backward_copy_shift	push=24	pull=8
-
-ENDPROC(memmove)
+	stmfd	sp!, {r0, lr}
+	bl	_memcpy
+	ldmfd	sp!, {r0, pc}
diff --git a/payloads/libpayload/arch/arm/memset.S b/payloads/libpayload/arch/arm/memset.S
index 0c1102d..b0a2bc9 100644
--- a/payloads/libpayload/arch/arm/memset.S
+++ b/payloads/libpayload/arch/arm/memset.S
@@ -1,121 +1,127 @@
+/*	$OpenBSD: memset.S,v 1.2 2004/02/01 05:40:52 drahn Exp $	*/
+/*	$NetBSD: memset.S,v 1.3 2003/04/05 23:08:52 bjh21 Exp $	*/
+
 /*
- *  linux/arch/arm/lib/memset.S
- *
- *  Copyright (C) 1995-2000 Russell King
+ * Copyright (c) 1995 Mark Brinicombe.
+ * All rights reserved.
  *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by Mark Brinicombe.
+ * 4. The name of the company nor the name of the author may be used to
+ *    endorse or promote products derived from this software without specific
+ *    prior written permission.
  *
- *  ASM optimised string functions
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
  */
 
 #include <arch/asm.h>
-#include "asmlib.h"
 
-ENTRY(memset)
-	ands	r3, r0, #3		@ 1 unaligned?
-	mov	ip, r0			@ preserve r0 as return value
-	bne	6f			@ 1
 /*
- * we know that the pointer in ip is aligned to a word boundary.
+ * Sets a block of memory to the specified value
+ *
+ * On entry:
+ *   r0 - dest address
+ *   r1 - byte to write
+ *   r2 - number of bytes to write
+ *
+ * On exit:
+ *   r0 - dest address
  */
-1:	orr	r1, r1, r1, lsl #8
-	orr	r1, r1, r1, lsl #16
-	mov	r3, r1
-	cmp	r2, #16
-	blt	4f
 
-#if ! CALGN(1)+0
+ENTRY(memset)
+	stmfd	sp!, {r0}		/* Remember address for return value */
+	and	r1, r1, #0x000000ff	/* We write bytes */
 
-/*
- * We need 2 extra registers for this loop - use r8 and the LR
- */
-	stmfd	sp!, {r8, lr}
-	mov	r8, r1
-	mov	lr, r1
-
-2:	subs	r2, r2, #64
-	stmgeia	ip!, {r1, r3, r8, lr}	@ 64 bytes at a time.
-	stmgeia	ip!, {r1, r3, r8, lr}
-	stmgeia	ip!, {r1, r3, r8, lr}
-	stmgeia	ip!, {r1, r3, r8, lr}
-	bgt	2b
-	ldmeqfd	sp!, {r8, pc}		@ Now <64 bytes to go.
-/*
- * No need to correct the count; we're only testing bits from now on
- */
-	tst	r2, #32
-	stmneia	ip!, {r1, r3, r8, lr}
-	stmneia	ip!, {r1, r3, r8, lr}
-	tst	r2, #16
-	stmneia	ip!, {r1, r3, r8, lr}
-	ldmfd	sp!, {r8, lr}
+	cmp	r2, #0x00000004		/* Do we have less than 4 bytes */
+	blt	.Lmemset_lessthanfour
 
-#else
+	/* Ok first we will word align the address */
 
-/*
- * This version aligns the destination pointer in order to write
- * whole cache lines at once.
- */
+	ands	r3, r0, #0x00000003	/* Get the bottom two bits */
+	beq	.Lmemset_addraligned	/* The address is word aligned */
 
-	stmfd	sp!, {r4-r8, lr}
-	mov	r4, r1
-	mov	r5, r1
-	mov	r6, r1
-	mov	r7, r1
-	mov	r8, r1
-	mov	lr, r1
-
-	cmp	r2, #96
-	tstgt	ip, #31
-	ble	3f
-
-	and	r8, ip, #31
-	rsb	r8, r8, #32
-	sub	r2, r2, r8
-	movs	r8, r8, lsl #(32 - 4)
-	stmcsia	ip!, {r4, r5, r6, r7}
-	stmmiia	ip!, {r4, r5}
-	tst	r8, #(1 << 30)
-	mov	r8, r1
-	strne	r1, [ip], #4
-
-3:	subs	r2, r2, #64
-	stmgeia	ip!, {r1, r3-r8, lr}
-	stmgeia	ip!, {r1, r3-r8, lr}
-	bgt	3b
-	ldmeqfd	sp!, {r4-r8, pc}
-
-	tst	r2, #32
-	stmneia	ip!, {r1, r3-r8, lr}
-	tst	r2, #16
-	stmneia	ip!, {r4-r7}
-	ldmfd	sp!, {r4-r8, lr}
-
-#endif
-
-4:	tst	r2, #8
-	stmneia	ip!, {r1, r3}
-	tst	r2, #4
-	strne	r1, [ip], #4
-/*
- * When we get here, we've got less than 4 bytes to zero.  We
- * may have an unaligned pointer as well.
- */
-5:	tst	r2, #2
-	strneb	r1, [ip], #1
-	strneb	r1, [ip], #1
-	tst	r2, #1
-	strneb	r1, [ip], #1
-	mov	pc, lr
-
-6:	subs	r2, r2, #4		@ 1 do we have enough
-	blt	5b			@ 1 bytes to align with?
-	cmp	r3, #2			@ 1
-	strltb	r1, [ip], #1		@ 1
-	strleb	r1, [ip], #1		@ 1
-	strb	r1, [ip], #1		@ 1
-	add	r2, r2, r3		@ 1 (r2 = r2 - (4 - r3))
-	b	1b
-ENDPROC(memset)
+	rsb	r3, r3, #0x00000004
+	sub	r2, r2, r3
+	cmp	r3, #0x00000002
+	strb	r1, [r0], #0x0001	/* Set 1 byte */
+	strgeb	r1, [r0], #0x0001	/* Set another byte */
+	strgtb	r1, [r0], #0x0001	/* and a third */
+
+	cmp	r2, #0x00000004
+	blt	.Lmemset_lessthanfour
+
+	/* Now we must be word aligned */
+
+.Lmemset_addraligned:
+
+	orr	r3, r1, r1, lsl #8	/* Repeat the byte into a word */
+	orr	r3, r3, r3, lsl #16
+
+	/* We know we have at least 4 bytes ... */
+
+	cmp	r2, #0x00000020		/* If less than 32 then use words */
+	blt	.Lmemset_lessthan32
+
+	/* We have at least 32 so lets use quad words */
+
+	stmfd	sp!, {r4-r6}		/* Store registers */
+	mov	r4, r3			/* Duplicate data */
+	mov	r5, r3
+	mov	r6, r3
+
+.Lmemset_loop16:
+	stmia	r0!, {r3-r6}		/* Store 16 bytes */
+	sub	r2, r2, #0x00000010	/* Adjust count */
+	cmp	r2, #0x00000010		/* Still got at least 16 bytes ? */
+	bgt	.Lmemset_loop16
+
+	ldmfd	sp!, {r4-r6}		/* Restore registers */
+
+	/* Do we need to set some words as well ? */
+
+	cmp	r2, #0x00000004
+	blt	.Lmemset_lessthanfour
+
+	/* Have either less than 16 or less than 32 depending on route taken */
+
+.Lmemset_lessthan32:
+
+	/* We have at least 4 bytes so copy as words */
+
+.Lmemset_loop4:
+	str	r3, [r0], #0x0004
+	sub	r2, r2, #0x0004
+	cmp	r2, #0x00000004
+	bge	.Lmemset_loop4
+
+.Lmemset_lessthanfour:
+	cmp	r2, #0x00000000
+	ldmeqfd	sp!, {r0}
+	moveq	pc, lr			/* Zero length so exit */
+
+	cmp	r2, #0x00000002
+	strb	r1, [r0], #0x0001	/* Set 1 byte */
+	strgeb	r1, [r0], #0x0001	/* Set another byte */
+	strgtb	r1, [r0], #0x0001	/* and a third */
+
+	ldmfd	sp!, {r0}
+	mov	pc, lr			/* Exit */
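
Both the ARM memset above and the x86 version added below rely on the same
word-fill trick: replicate the fill byte across a 32-bit word, then store
word-sized (or larger) chunks. For reference, that step in C (a sketch; the
function name is illustrative):

#include <stdint.h>
#include <stddef.h>

/* "orr r3, r1, r1, lsl #8; orr r3, r3, r3, lsl #16" on ARM, or
 * "movb %al,%ah; ... sall $16,%eax; orl" on x86 */
static void fill_words(uint32_t *p, int c, size_t nwords)
{
	uint32_t w = (uint8_t)c;	/* keep only the low byte */

	w |= w << 8;			/* byte -> halfword */
	w |= w << 16;			/* halfword -> word */
	while (nwords--)
		*p++ = w;
}
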
diff --git a/payloads/libpayload/arch/x86/Makefile.inc b/payloads/libpayload/arch/x86/Makefile.inc
index 87b3e9e..19784f5 100644
--- a/payloads/libpayload/arch/x86/Makefile.inc
+++ b/payloads/libpayload/arch/x86/Makefile.inc
@@ -34,8 +34,9 @@ libc-y += exec.S virtual.c
 libc-y += selfboot.c
 libc-y += exception_asm.S exception.c
 
-# Will fall back to default_memXXX() in libc/memory.c if GPL not allowed.
-libc-$(CONFIG_LP_GPL) += string.c
+libc-y += memset.S
+# also contains memcpy
+libc-y += memmove.S
 
 libgdb-y += gdb.c
 
diff --git a/payloads/libpayload/arch/x86/memmove.S b/payloads/libpayload/arch/x86/memmove.S
new file mode 100644
index 0000000..3e9e57f
--- /dev/null
+++ b/payloads/libpayload/arch/x86/memmove.S
@@ -0,0 +1,106 @@
+/*	$OpenBSD: memmove.S,v 1.5 2014/12/02 03:07:13 tedu Exp $	*/
+
+/*-
+ * Copyright (c) 1993, 1994, 1995 Charles M. Hannum.  All rights reserved.
+ * Copyright (c) 1990 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * William Jolitz.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Emulate bcopy() by swapping the first two arguments, and jumping
+ * into memmove(), which handles overlapping regions.
+ */
+//ENTRY(bcopy)
+	pushl	%esi
+	pushl	%edi
+	movl	12(%esp),%esi
+	movl	16(%esp),%edi
+	jmp	docopy
+
+/*
+ * memmove(caddr_t dst, caddr_t src, size_t len);
+ * Copy len bytes, coping with overlapping space.
+ */
+	.globl memmove
+memmove:
+	pushl	%esi
+	pushl	%edi
+	movl	12(%esp),%edi
+	movl	16(%esp),%esi
+docopy:
+	movl	20(%esp),%ecx
+	movl	%edi,%eax
+	subl	%esi,%eax
+	cmpl	%ecx,%eax		# overlapping?
+	jb	1f
+	jmp	docopyf			# nope
+/*
+ * memcpy() doesn't worry about overlap and always copies forward
+ */
+	.globl memcpy
+memcpy:
+	pushl	%esi
+	pushl	%edi
+	movl	12(%esp),%edi
+	movl	16(%esp),%esi
+	movl	20(%esp),%ecx
+docopyf:
+	movl	%edi,%eax		# setup return value for memcpy/memmove
+	shrl	$2,%ecx			# copy by 32-bit words
+	rep
+	movsl
+	movl	20(%esp),%ecx
+	andl	$3,%ecx			# any bytes left?
+	rep
+	movsb
+	popl	%edi
+	popl	%esi
+	ret
+
+1:	movl	%edi,%eax		# setup return value for memmove
+	addl	%ecx,%edi		# copy backward
+	addl	%ecx,%esi
+	std
+	andl	$3,%ecx			# any fractional bytes?
+	decl	%edi
+	decl	%esi
+	rep
+	movsb
+	movl	20(%esp),%ecx		# copy remainder by 32-bit words
+	shrl	$2,%ecx
+	subl	$3,%esi
+	subl	$3,%edi
+	rep
+	movsl
+	popl	%edi
+	popl	%esi
+	cld
+	ret
+
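
The overlap test at the top of memmove ("subl %esi,%eax; cmpl %ecx,%eax;
jb 1f") boils down to one unsigned comparison: since the subtraction wraps,
a single compare covers both "source above destination" and "regions far
enough apart". A C equivalent, illustrative only (unrelated-pointer
arithmetic is formally undefined in ISO C):

#include <stdint.h>
#include <stddef.h>

static int must_copy_backward(const void *dst, const void *src, size_t len)
{
	return (uintptr_t)dst - (uintptr_t)src < len;
}
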
diff --git a/payloads/libpayload/arch/x86/memset.S b/payloads/libpayload/arch/x86/memset.S
new file mode 100644
index 0000000..1a8cbd9
--- /dev/null
+++ b/payloads/libpayload/arch/x86/memset.S
@@ -0,0 +1,54 @@
+/*	$OpenBSD: memset.S,v 1.4 2007/05/25 20:32:29 krw Exp $ */
+/*
+ * Written by J.T. Conklin <jtc at netbsd.org>.
+ * Public domain.
+ */
+
+	.globl memset
+memset:
+	pushl	%edi
+	pushl	%ebx
+	movl	12(%esp),%edi
+	movzbl	16(%esp),%eax		/* unsigned char, zero extend */
+	movl	20(%esp),%ecx
+	pushl	%edi			/* push address of buffer */
+
+	cld				/* set fill direction forward */
+
+	/*
+	 * if the string is too short, it's really not worth the overhead
+	 * of aligning to word boundaries, etc.  So we jump to a plain
+	 * unaligned set.
+	 */
+	cmpl	$0x0f,%ecx
+	jle	L1
+
+	movb	%al,%ah			/* copy char to all bytes in word */
+	movl	%eax,%edx
+	sall	$16,%eax
+	orl	%edx,%eax
+
+	movl	%edi,%edx		/* compute misalignment */
+	negl	%edx
+	andl	$3,%edx
+	movl	%ecx,%ebx
+	subl	%edx,%ebx
+
+	movl	%edx,%ecx		/* set until word aligned */
+	rep
+	stosb
+
+	movl	%ebx,%ecx
+	shrl	$2,%ecx			/* set by words */
+	rep
+	stosl
+
+	movl	%ebx,%ecx		/* set remainder by bytes */
+	andl	$3,%ecx
+L1:	rep
+	stosb
+
+	popl	%eax			/* pop address of buffer */
+	popl	%ebx
+	popl	%edi
+	ret
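
One idiom worth noting in the memset above: "negl %edx; andl $3,%edx"
computes how many bytes are needed to reach the next 4-byte boundary. The
glibc-derived string.c removed below spelled the same computation as
"(-dstp) % sizeof(op_t)". In C (sketch only, hypothetical helper name):

#include <stdint.h>

static unsigned bytes_to_align4(uintptr_t addr)
{
	return (unsigned)(-addr & 3);	/* 0..3 bytes to the next boundary */
}
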
diff --git a/payloads/libpayload/arch/x86/string.c b/payloads/libpayload/arch/x86/string.c
deleted file mode 100644
index 60de812..0000000
--- a/payloads/libpayload/arch/x86/string.c
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
- * Copyright (C) 1991,1992,1993,1997,1998,2003, 2005 Free Software Foundation, Inc.
- * This file is part of the GNU C Library.
- * Copyright (c) 2011 The Chromium OS Authors.
- *
- * See file CREDITS for list of people who contributed to this
- * project.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation; either version 2 of
- * the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc.
- */
-
-/* From glibc-2.14, sysdeps/i386/memset.c */
-
-#include <stdint.h>
-
-#include "string.h"
-
-typedef uint32_t op_t;
-
-void *memset(void *dstpp, int c, size_t len)
-{
-	int d0;
-	unsigned long int dstp = (unsigned long int) dstpp;
-
-	/* This explicit register allocation improves code very much indeed. */
-	register op_t x asm("ax");
-
-	x = (unsigned char) c;
-
-	/* Clear the direction flag, so filling will move forward.  */
-	asm volatile("cld");
-
-	/* This threshold value is optimal.  */
-	if (len >= 12) {
-		/* Fill X with four copies of the char we want to fill with. */
-		x |= (x << 8);
-		x |= (x << 16);
-
-		/* Adjust LEN for the bytes handled in the first loop.  */
-		len -= (-dstp) % sizeof(op_t);
-
-		/*
-		 * There are at least some bytes to set. No need to test for
-		 * LEN == 0 in this alignment loop.
-		 */
-
-		/* Fill bytes until DSTP is aligned on a longword boundary. */
-		asm volatile(
-			"rep\n"
-			"stosb" /* %0, %2, %3 */ :
-			"=D" (dstp), "=c" (d0) :
-			"0" (dstp), "1" ((-dstp) % sizeof(op_t)), "a" (x) :
-			"memory");
-
-		/* Fill longwords.  */
-		asm volatile(
-			"rep\n"
-			"stosl" /* %0, %2, %3 */ :
-			"=D" (dstp), "=c" (d0) :
-			"0" (dstp), "1" (len / sizeof(op_t)), "a" (x) :
-			"memory");
-		len %= sizeof(op_t);
-	}
-
-	/* Write the last few bytes. */
-	asm volatile(
-		"rep\n"
-		"stosb" /* %0, %2, %3 */ :
-		"=D" (dstp), "=c" (d0) :
-		"0" (dstp), "1" (len), "a" (x) :
-		"memory");
-
-	return dstpp;
-}
-
-void *memcpy(void *dest, const void *src, size_t n)
-{
-	unsigned long d0, d1, d2;
-
-	asm volatile(
-		"rep ; movsl\n\t"
-		"movl %4,%%ecx\n\t"
-		"rep ; movsb\n\t"
-		: "=&c" (d0), "=&D" (d1), "=&S" (d2)
-		: "0" (n >> 2), "g" (n & 3), "1" (dest), "2" (src)
-		: "memory"
-	);
-
-	return dest;
-}


