kernel-ark/arch/x86_64/lib/csum-copy.S
Linus Torvalds 1da177e4c3 Linux-2.6.12-rc2
Initial git repository build. I'm not bothering with the full history,
even though we have it. We can create a separate "historical" git
archive of that later if we want to, and in the meantime it's about
3.2GB when imported into git - space that would just make the early
git days unnecessarily complicated, when we don't have a lot of good
infrastructure for it.

Let it rip!
2005-04-16 15:20:36 -07:00

234 lines
3.7 KiB
ArmAsm

/*
* Copyright 2002,2003 Andi Kleen, SuSE Labs.
*
* This file is subject to the terms and conditions of the GNU General Public
* License. See the file COPYING in the main directory of this archive
* for more details. No warranty for anything given at all.
*/
#include <linux/linkage.h>
#include <asm/errno.h>
/*
* Checksum copy with exception handling.
* On exceptions src_err_ptr or dst_err_ptr is set to -EFAULT and the
* destination is zeroed.
*
* Input
* rdi source
* rsi destination
* edx len (32bit)
* ecx sum (32bit)
* r8 src_err_ptr (int)
* r9 dst_err_ptr (int)
*
* Output
* eax 64bit sum. undefined in case of exception.
*
* Wrappers need to take care of valid exception sum and zeroing.
* They also should align source or destination to 8 bytes.
*/
.macro source
10:
.section __ex_table,"a"
.align 8
.quad 10b,.Lbad_source
.previous
.endm
.macro dest
20:
.section __ex_table,"a"
.align 8
.quad 20b,.Lbad_dest
.previous
.endm
.macro ignore L=.Lignore
30:
.section __ex_table,"a"
.align 8
.quad 30b,\L
.previous
.endm
.globl csum_partial_copy_generic
.p2align 4
csum_partial_copy_generic:
cmpl $3*64,%edx
jle .Lignore
.Lignore:
subq $7*8,%rsp
movq %rbx,2*8(%rsp)
movq %r12,3*8(%rsp)
movq %r14,4*8(%rsp)
movq %r13,5*8(%rsp)
movq %rbp,6*8(%rsp)
movq %r8,(%rsp)
movq %r9,1*8(%rsp)
movl %ecx,%eax
movl %edx,%ecx
xorl %r9d,%r9d
movq %rcx,%r12
shrq $6,%r12
jz .Lhandle_tail /* < 64 */
clc
/* main loop. clear in 64 byte blocks */
/* r9: zero, r8: temp2, rbx: temp1, rax: sum, rcx: saved length */
/* r11: temp3, rdx: temp4, r12 loopcnt */
/* r10: temp5, rbp: temp6, r14 temp7, r13 temp8 */
.p2align 4
.Lloop:
source
movq (%rdi),%rbx
source
movq 8(%rdi),%r8
source
movq 16(%rdi),%r11
source
movq 24(%rdi),%rdx
source
movq 32(%rdi),%r10
source
movq 40(%rdi),%rbp
source
movq 48(%rdi),%r14
source
movq 56(%rdi),%r13
ignore 2f
prefetcht0 5*64(%rdi)
2:
adcq %rbx,%rax
adcq %r8,%rax
adcq %r11,%rax
adcq %rdx,%rax
adcq %r10,%rax
adcq %rbp,%rax
adcq %r14,%rax
adcq %r13,%rax
decl %r12d
dest
movq %rbx,(%rsi)
dest
movq %r8,8(%rsi)
dest
movq %r11,16(%rsi)
dest
movq %rdx,24(%rsi)
dest
movq %r10,32(%rsi)
dest
movq %rbp,40(%rsi)
dest
movq %r14,48(%rsi)
dest
movq %r13,56(%rsi)
3:
leaq 64(%rdi),%rdi
leaq 64(%rsi),%rsi
jnz .Lloop
adcq %r9,%rax
/* do last upto 56 bytes */
.Lhandle_tail:
/* ecx: count */
movl %ecx,%r10d
andl $63,%ecx
shrl $3,%ecx
jz .Lfold
clc
.p2align 4
.Lloop_8:
source
movq (%rdi),%rbx
adcq %rbx,%rax
decl %ecx
dest
movq %rbx,(%rsi)
leaq 8(%rsi),%rsi /* preserve carry */
leaq 8(%rdi),%rdi
jnz .Lloop_8
adcq %r9,%rax /* add in carry */
.Lfold:
/* reduce checksum to 32bits */
movl %eax,%ebx
shrq $32,%rax
addl %ebx,%eax
adcl %r9d,%eax
/* do last upto 6 bytes */
.Lhandle_7:
movl %r10d,%ecx
andl $7,%ecx
shrl $1,%ecx
jz .Lhandle_1
movl $2,%edx
xorl %ebx,%ebx
clc
.p2align 4
.Lloop_1:
source
movw (%rdi),%bx
adcl %ebx,%eax
dest
decl %ecx
movw %bx,(%rsi)
leaq 2(%rdi),%rdi
leaq 2(%rsi),%rsi
jnz .Lloop_1
adcl %r9d,%eax /* add in carry */
/* handle last odd byte */
.Lhandle_1:
testl $1,%r10d
jz .Lende
xorl %ebx,%ebx
source
movb (%rdi),%bl
dest
movb %bl,(%rsi)
addl %ebx,%eax
adcl %r9d,%eax /* carry */
.Lende:
movq 2*8(%rsp),%rbx
movq 3*8(%rsp),%r12
movq 4*8(%rsp),%r14
movq 5*8(%rsp),%r13
movq 6*8(%rsp),%rbp
addq $7*8,%rsp
ret
/* Exception handlers. Very simple, zeroing is done in the wrappers */
.Lbad_source:
movq (%rsp),%rax
testq %rax,%rax
jz .Lende
movl $-EFAULT,(%rax)
jmp .Lende
.Lbad_dest:
movq 8(%rsp),%rax
testq %rax,%rax
jz .Lende
movl $-EFAULT,(%rax)
jmp .Lende