83a7a2ad2a
We already have cpufeature indices above 255, so use a 16-bit number for the alternatives index. This consumes a padding field and so doesn't add any size, but it means that abusing the padding field to create assembly errors on overflow no longer works. We can retain the test simply by redirecting it to the .discard section, however. [ v3: updated to include open-coded locations ] Signed-off-by: H. Peter Anvin <hpa@linux.intel.com> LKML-Reference: <tip-f88731e3068f9d1392ba71cc9f50f035d26a0d4f@git.kernel.org> Signed-off-by: H. Peter Anvin <hpa@zytor.com>
144 lines
2.3 KiB
ArmAsm
144 lines
2.3 KiB
ArmAsm
/* Copyright 2002 Andi Kleen */
|
|
|
|
#include <linux/linkage.h>
|
|
|
|
#include <asm/cpufeature.h>
|
|
#include <asm/dwarf2.h>
|
|
|
|
/*
 * memcpy - Copy a memory block.
 *
 * Input:
 *  rdi destination
 *  rsi source
 *  rdx count (in bytes)
 *
 * Output:
 *  rax original destination
 */
|
|
|
|
/*
 * memcpy_c() - fast string ops (REP MOVSQ) based variant.
 *
 * This gets patched over the unrolled variant (below) via the
 * alternative instructions framework.
 *
 * In:   %rdi = destination, %rsi = source, %rdx = byte count
 * Out:  %rax = original destination
 *
 * The quadword count is computed with 64-bit operations: REP MOVSQ
 * takes its count from the full %rcx, so deriving it from %edx would
 * silently truncate copies of 4 GiB and above.
 */
	.section .altinstr_replacement, "ax", @progbits
.Lmemcpy_c:
	movq %rdi, %rax		/* return value: original destination */

	movq %rdx, %rcx
	shrq $3, %rcx		/* %rcx = number of whole quadwords */
	andl $7, %edx		/* %edx = leftover bytes (0..7) */
	rep movsq
	movl %edx, %ecx		/* 32-bit write zero-extends into %rcx */
	rep movsb
	ret
.Lmemcpy_e:
	.previous
|
|
|
|
ENTRY(__memcpy)
ENTRY(memcpy)
	CFI_STARTPROC

	/*
	 * Unrolled copy.
	 *
	 * In:      %rdi = destination, %rsi = source, %rdx = byte count
	 * Out:     %rax = original destination
	 * Scratch: %rcx, %r8-%r11; %rdi and %rsi are advanced past the
	 *          copied data.
	 *
	 * Put the number of full 64-byte blocks into %rcx. The block count
	 * is kept as a 64-bit value so that counts of 4 GiB and above are
	 * not truncated. Tail portion is handled at the end:
	 */
	movq %rdi, %rax
	movq %rdx, %rcx
	shrq $6, %rcx
	jz .Lhandle_tail

	.p2align 4
.Lloop_64:
	/*
	 * We decrement the loop index here - and the zero-flag is
	 * checked at the end of the loop (the movq/leaq instructions
	 * in between do not change the zero flag):
	 */
	decq %rcx

	/*
	 * Move in blocks of 4x16 bytes:
	 */
	movq 0*8(%rsi),		%r11
	movq 1*8(%rsi),		%r8
	movq %r11,		0*8(%rdi)
	movq %r8,		1*8(%rdi)

	movq 2*8(%rsi),		%r9
	movq 3*8(%rsi),		%r10
	movq %r9,		2*8(%rdi)
	movq %r10,		3*8(%rdi)

	movq 4*8(%rsi),		%r11
	movq 5*8(%rsi),		%r8
	movq %r11,		4*8(%rdi)
	movq %r8,		5*8(%rdi)

	movq 6*8(%rsi),		%r9
	movq 7*8(%rsi),		%r10
	movq %r9,		6*8(%rdi)
	movq %r10,		7*8(%rdi)

	/* Advance both pointers with leaq so the flags from decq survive. */
	leaq 64(%rsi), %rsi
	leaq 64(%rdi), %rdi

	jnz .Lloop_64

.Lhandle_tail:
	/*
	 * Fewer than 64 bytes remain. Only the low six bits of the count
	 * matter from here on, so 32-bit arithmetic is sufficient.
	 * %ecx = number of whole quadwords left (0..7):
	 */
	movl %edx, %ecx
	andl $63, %ecx
	shrl $3, %ecx
	jz .Lhandle_7

	.p2align 4
.Lloop_8:
	decl %ecx
	movq (%rsi),		%r8
	movq %r8,		(%rdi)
	leaq 8(%rdi), %rdi
	leaq 8(%rsi), %rsi
	jnz .Lloop_8

.Lhandle_7:
	/* %ecx = trailing bytes left (0..7): */
	movl %edx, %ecx
	andl $7, %ecx
	jz .Lend

	.p2align 4
.Lloop_1:
	movb (%rsi), %r8b
	movb %r8b, (%rdi)
	incq %rdi
	incq %rsi
	decl %ecx
	jnz .Lloop_1

.Lend:
	ret
	CFI_ENDPROC
ENDPROC(memcpy)
ENDPROC(__memcpy)
|
|
|
|
/*
 * On CPUs where the string copy instructions are the faster choice,
 * prefer the REP-based variant - it is also considerably simpler.
 * The record below describes that substitution:
 */

	.section .altinstructions, "a"
	.balign 8
	.quad memcpy			/* code that gets patched */
	.quad .Lmemcpy_c		/* code that is patched in */
	.word X86_FEATURE_REP_GOOD	/* 16-bit feature index */

	/*
	 * Both length fields carry the size of the replacement, so only
	 * the beginning of memcpy is replaced. memcpy itself is used to
	 * apply alternatives, and overwriting the rest of it with nops
	 * while it runs could only end in a reboot.
	 */
	.byte .Lmemcpy_e - .Lmemcpy_c
	.byte .Lmemcpy_e - .Lmemcpy_c
	.previous
|