Martin L Roth has submitted this change. ( https://review.coreboot.org/c/coreboot/+/69231 )
Change subject: arch/x86/memmove: Add 64bit version ......................................................................
arch/x86/memmove: Add 64bit version
The 64-bit version handles 64-bit input variables properly.
TESTED: Both QEMU and real hardware can properly use LZ4, which uses this code.
Change-Id: Ib43ec19df97194d6b1c18bfacb5fe8211ba0ffe5 Signed-off-by: Arthur Heymans arthur@aheymans.xyz Reviewed-on: https://review.coreboot.org/c/coreboot/+/69231 Tested-by: build bot (Jenkins) no-reply@coreboot.org Reviewed-by: Angel Pons th3fanbus@gmail.com --- M src/arch/x86/Makefile.inc R src/arch/x86/memmove_32.c A src/arch/x86/memmove_64.S 3 files changed, 229 insertions(+), 8 deletions(-)
Approvals: build bot (Jenkins): Verified Angel Pons: Looks good to me, approved
diff --git a/src/arch/x86/Makefile.inc b/src/arch/x86/Makefile.inc index 50c344c..d281037 100644 --- a/src/arch/x86/Makefile.inc +++ b/src/arch/x86/Makefile.inc @@ -85,7 +85,8 @@ bootblock-$(CONFIG_IDT_IN_EVERY_STAGE) += idt.S bootblock-y += memcpy.c bootblock-y += memset.c -bootblock-y += memmove.c +bootblock-$(CONFIG_ARCH_BOOTBLOCK_X86_32) += memmove_32.c +bootblock-$(CONFIG_ARCH_BOOTBLOCK_X86_64) += memmove_64.S bootblock-$(CONFIG_COLLECT_TIMESTAMPS_TSC) += timestamp.c bootblock-$(CONFIG_X86_TOP4G_BOOTMEDIA_MAP) += mmap_boot.c bootblock-$(CONFIG_DEBUG_NULL_DEREF_BREAKPOINTS_IN_ALL_STAGES) += null_breakpoint.c @@ -134,7 +135,8 @@ verstage-y += cpu_common.c verstage-y += memset.c verstage-y += memcpy.c -verstage-y += memmove.c +verstage-$(CONFIG_ARCH_VERSTAGE_X86_32) += memmove_32.c +verstage-$(CONFIG_ARCH_VERSTAGE_X86_64) += memmove_64.S verstage-$(CONFIG_X86_TOP4G_BOOTMEDIA_MAP) += mmap_boot.c verstage-$(CONFIG_DEBUG_NULL_DEREF_BREAKPOINTS_IN_ALL_STAGES) += null_breakpoint.c # If verstage is a separate stage it means there's no need @@ -172,7 +174,8 @@ romstage-$(CONFIG_IDT_IN_EVERY_STAGE) += exception.c romstage-$(CONFIG_IDT_IN_EVERY_STAGE) += idt.S romstage-y += memcpy.c -romstage-y += memmove.c +romstage-$(CONFIG_ARCH_ROMSTAGE_X86_32) += memmove_32.c +romstage-$(CONFIG_ARCH_ROMSTAGE_X86_64) += memmove_64.S romstage-y += memset.c romstage-$(CONFIG_X86_TOP4G_BOOTMEDIA_MAP) += mmap_boot.c romstage-$(CONFIG_DEBUG_NULL_DEREF_BREAKPOINTS_IN_ALL_STAGES) += null_breakpoint.c @@ -217,7 +220,8 @@ postcar-$(CONFIG_IDT_IN_EVERY_STAGE) += idt.S postcar-y += exit_car.S postcar-y += memcpy.c -postcar-y += memmove.c +postcar-$(CONFIG_ARCH_POSTCAR_X86_32) += memmove_32.c +postcar-$(CONFIG_ARCH_POSTCAR_X86_64) += memmove_64.S postcar-y += memset.c postcar-$(CONFIG_X86_TOP4G_BOOTMEDIA_MAP) += mmap_boot.c postcar-$(CONFIG_DEBUG_NULL_DEREF_BREAKPOINTS_IN_ALL_STAGES) += null_breakpoint.c @@ -261,7 +265,8 @@ ramstage-y += idt.S ramstage-$(CONFIG_IOAPIC) += ioapic.c ramstage-y += 
memcpy.c -ramstage-y += memmove.c +ramstage-$(CONFIG_ARCH_RAMSTAGE_X86_32) += memmove_32.c +ramstage-$(CONFIG_ARCH_RAMSTAGE_X86_64) += memmove_64.S ramstage-y += memset.c ramstage-$(CONFIG_X86_TOP4G_BOOTMEDIA_MAP) += mmap_boot.c ramstage-$(CONFIG_GENERATE_MP_TABLE) += mpspec.c @@ -278,11 +283,11 @@ ramstage-$(CONFIG_HAVE_CF9_RESET) += cf9_reset.c
rmodules_x86_32-y += memcpy.c -rmodules_x86_32-y += memmove.c +rmodules_x86_32-y += memmove_32.c rmodules_x86_32-y += memset.c
rmodules_x86_64-y += memcpy.c -rmodules_x86_64-y += memmove.c +rmodules_x86_64-y += memmove_64.S rmodules_x86_64-y += memset.c
ifeq ($(CONFIG_ARCH_RAMSTAGE_X86_32),y) @@ -324,7 +329,8 @@ smm-$(CONFIG_IDT_IN_EVERY_STAGE) += exception.c smm-$(CONFIG_IDT_IN_EVERY_STAGE) += idt.S smm-y += memcpy.c -smm-y += memmove.c +smm-$(CONFIG_ARCH_RAMSTAGE_X86_32) += memmove_32.c +smm-$(CONFIG_ARCH_RAMSTAGE_X86_64) += memmove_64.S smm-y += memset.c smm-$(CONFIG_X86_TOP4G_BOOTMEDIA_MAP) += mmap_boot.c smm-$(CONFIG_DEBUG_NULL_DEREF_BREAKPOINTS_IN_ALL_STAGES) += null_breakpoint.c diff --git a/src/arch/x86/memmove.c b/src/arch/x86/memmove_32.c similarity index 100% rename from src/arch/x86/memmove.c rename to src/arch/x86/memmove_32.c diff --git a/src/arch/x86/memmove_64.S b/src/arch/x86/memmove_64.S new file mode 100644 index 0000000..ebec8ee --- /dev/null +++ b/src/arch/x86/memmove_64.S @@ -0,0 +1,197 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +/* This code originates from Linux 5.19 */ + +/* + * Implement memmove(). This can handle overlap between src and dst. + * + * Input: + * rdi: dest + * rsi: src + * rdx: count + * + * Output: + * rax: dest + * + * Registers modified (depending on the path taken): rax, rcx, rdx, rsi, + * rdi, r8-r11 and the flags. + */ +.global memmove +memmove: + + mov %rdi, %rax + + /* + * Decide forward/backward copy mode: copy forward when src >= dest or + * when src + count <= dest, otherwise copy backward (label 2). + * NOTE(review): jge/jg are signed comparisons on addresses - confirm + * this is acceptable for the address ranges in use. + */ + cmp %rdi, %rsi + jge .Lmemmove_begin_forward + mov %rsi, %r8 + add %rdx, %r8 + cmp %rdi, %r8 + jg 2f + + /* Unlike Linux, don't optimize for FSRM and ERMS */ +.Lmemmove_begin_forward: + cmp $0x20, %rdx + jb 1f + + /* + * The movsq instruction has a high startup latency, + * so small sizes are handled with general-purpose registers. + */ + cmp $680, %rdx + jb 3f + /* + * The movsq instruction is only good for the aligned case, so it is + * used only when the low bytes of src and dest are equal. + */ + + cmpb %dil, %sil + je 4f +3: + sub $0x20, %rdx + /* + * We gobble 32 bytes forward in each loop. The jae at the bottom + * tests the flags of the sub at the top of the loop; the movq and + * leaq instructions in between do not modify the flags. + */ +5: + sub $0x20, %rdx + movq 0*8(%rsi), %r11 + movq 1*8(%rsi), %r10 + movq 2*8(%rsi), %r9 + movq 3*8(%rsi), %r8 + leaq 4*8(%rsi), %rsi + + movq %r11, 0*8(%rdi) + movq %r10, 1*8(%rdi) + movq %r9, 2*8(%rdi) + movq %r8, 3*8(%rdi) + leaq 4*8(%rdi), %rdi + jae 5b + addq $0x20, %rdx + jmp 1f + /* + * Handle data forward by movsq. The last 8 source bytes are loaded + * before the copy and stored after it, which covers any tail left + * over when count is not a multiple of 8.
+ */ + .p2align 4 +4: + movq %rdx, %rcx + movq -8(%rsi, %rdx), %r11 + lea -8(%rdi, %rdx), %r10 + shrq $3, %rcx + rep movsq + movq %r11, (%r10) + jmp 13f +.Lmemmove_end_forward: + + /* + * Handle data backward by movsq. The first 8 source bytes are loaded + * before the copy and stored after it. std makes rep movsq walk + * downward; cld restores the forward direction afterwards. + */ + .p2align 4 +7: + movq %rdx, %rcx + movq (%rsi), %r11 + movq %rdi, %r10 + leaq -8(%rsi, %rdx), %rsi + leaq -8(%rdi, %rdx), %rdi + shrq $3, %rcx + std + rep movsq + cld + movq %r11, (%r10) + jmp 13f + + /* + * Prepare for backward copy, using the same size and low-byte + * checks as the forward case. + */ + .p2align 4 +2: + cmp $0x20, %rdx + jb 1f + cmp $680, %rdx + jb 6f + cmp %dil, %sil + je 7b +6: + /* + * Point src and dest just past the end of their buffers. + */ + addq %rdx, %rsi + addq %rdx, %rdi + subq $0x20, %rdx + /* + * We gobble 32 bytes backward in each loop. As in the forward loop, + * the jae at the bottom tests the flags of the subq at the top. + */ +8: + subq $0x20, %rdx + movq -1*8(%rsi), %r11 + movq -2*8(%rsi), %r10 + movq -3*8(%rsi), %r9 + movq -4*8(%rsi), %r8 + leaq -4*8(%rsi), %rsi + + movq %r11, -1*8(%rdi) + movq %r10, -2*8(%rdi) + movq %r9, -3*8(%rdi) + movq %r8, -4*8(%rdi) + leaq -4*8(%rdi), %rdi + jae 8b + /* + * Restore the remaining byte count (fewer than 32) and move src and + * dest back to the head of those remaining bytes. + */ + addq $0x20, %rdx + subq %rdx, %rsi + subq %rdx, %rdi +1: + cmpq $16, %rdx + jb 9f + /* + * Move 16 to 31 bytes. Head and tail are both loaded before + * anything is stored, so overlapping buffers are handled. + */ + movq 0*8(%rsi), %r11 + movq 1*8(%rsi), %r10 + movq -2*8(%rsi, %rdx), %r9 + movq -1*8(%rsi, %rdx), %r8 + movq %r11, 0*8(%rdi) + movq %r10, 1*8(%rdi) + movq %r9, -2*8(%rdi, %rdx) + movq %r8, -1*8(%rdi, %rdx) + jmp 13f + .p2align 4 +9: + cmpq $8, %rdx + jb 10f + /* + * Move 8 to 15 bytes (loads first, then stores). + */ + movq 0*8(%rsi), %r11 + movq -1*8(%rsi, %rdx), %r10 + movq %r11, 0*8(%rdi) + movq %r10, -1*8(%rdi, %rdx) + jmp 13f +10: + cmpq $4, %rdx + jb 11f + /* + * Move 4 to 7 bytes (loads first, then stores). + */ + movl (%rsi), %r11d + movl -4(%rsi, %rdx), %r10d + movl %r11d, (%rdi) + movl %r10d, -4(%rdi, %rdx) + jmp 13f +11: + cmp $2, %rdx + jb 12f + /* + * Move 2 to 3 bytes (loads first, then stores).
+ */ + movw (%rsi), %r11w + movw -2(%rsi, %rdx), %r10w + movw %r11w, (%rdi) + movw %r10w, -2(%rdi, %rdx) + jmp 13f +12: + cmp $1, %rdx + jb 13f + /* + * Move a single byte; a count of 0 skips straight to the return. + */ + movb (%rsi), %r11b + movb %r11b, (%rdi) +13: + RET