! Copyright (C) 2008-2012 Imagination Technologies Ltd.
	.text
	.global	_memcpy
	.type	_memcpy,function
! D1Ar1 dst
! D0Ar2 src
! D1Ar3 cnt
! D0Re0 dst
_memcpy:
	CMP	D1Ar3, #16
	MOV	A1.2, D0Ar2		! source pointer
	MOV	A0.2, D1Ar1		! destination pointer
	MOV	A0.3, D1Ar1		! for return value
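	! A1.2/A0.2 serve as post-incremented source and destination pointers,
	! and A0.3 preserves the original dst so it can be returned in D0Re0
	! at the end, as C memcpy semantics require.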
	! If there are fewer than 16 bytes to copy, use the byte copy loop
	BGE	$Llong_copy

$Lbyte_copy:
	! Simply copy a byte at a time
	SUBS	TXRPT, D1Ar3, #1
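	! SUBS arms TXRPT, the hardware repeat counter, with cnt - 1 (and sets
	! the flags for the BLT below); the BR at the bottom of the loop then
	! re-runs the body until the counter is exhausted, so no explicit
	! decrement/compare is needed (treat this reading of the Meta TXRPT/BR
	! hardware loop as an informed assumption).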
	BLT	$Lend
$Lloop_byte:
	GETB	D1Re0, [A1.2++]
	SETB	[A0.2++], D1Re0
	BR	$Lloop_byte

$Lend:
	! Finally set return value and return
	MOV	D0Re0, A0.3
	MOV	PC, D1RtP

$Llong_copy:
	ANDS	D1Ar5, D1Ar1, #7	! test destination alignment
	BZ	$Laligned_dst

	! The destination address is not 8 byte aligned. We will copy bytes from
	! the source to the destination until the remaining data has an 8 byte
	! destination address alignment (i.e. we should never copy more than 7
	! bytes here).
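	! For example, with dst % 8 == 5 the loop below copies 3 bytes,
	! stopping once D1Ar5 has been incremented from 5 up to 8.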
$Lalign_dst:
	GETB	D0Re0, [A1.2++]
	ADD	D1Ar5, D1Ar5, #1	! dest is aligned when D1Ar5 reaches #8
	SUB	D1Ar3, D1Ar3, #1	! decrement count of remaining bytes
	SETB	[A0.2++], D0Re0
	CMP	D1Ar5, #8
	BNE	$Lalign_dst

	! We have at least (16 - 7) = 9 bytes to copy - calculate the number of 8 byte
	! blocks, then jump to the unaligned copy loop or fall through to the aligned
	! copy loop as appropriate.
$Laligned_dst:
	MOV	D0Ar4, A1.2
	LSR	D1Ar5, D1Ar3, #3	! D1Ar5 = number of 8 byte blocks
	ANDS	D0Ar4, D0Ar4, #7	! test source alignment
	BNZ	$Lunaligned_copy	! if unaligned, use unaligned copy loop
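	! The 8 byte block count in D1Ar5 is carried to the TXRPT setup in the
	! unaligned path; the aligned path below recomputes a 32 byte block
	! count instead.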

	! Both source and destination are 8 byte aligned - the easy case.
$Laligned_copy:
	LSRS	D1Ar5, D1Ar3, #5	! D1Ar5 = number of 32 byte blocks
	BZ	$Lbyte_copy
	SUB	TXRPT, D1Ar5, #1

$Laligned_32:
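	! Four GETL/SETL pairs, each moving 8 bytes through a 64-bit register
	! pair, so every hardware loop iteration copies 32 bytes.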
	GETL	D0Re0, D1Re0, [A1.2++]
	GETL	D0Ar6, D1Ar5, [A1.2++]
	SETL	[A0.2++], D0Re0, D1Re0
	SETL	[A0.2++], D0Ar6, D1Ar5
	GETL	D0Re0, D1Re0, [A1.2++]
	GETL	D0Ar6, D1Ar5, [A1.2++]
	SETL	[A0.2++], D0Re0, D1Re0
	SETL	[A0.2++], D0Ar6, D1Ar5
	BR	$Laligned_32

	! If there are any remaining bytes use the byte copy loop, otherwise we are done
	ANDS	D1Ar3, D1Ar3, #0x1f
	BNZ	$Lbyte_copy
	B	$Lend

	! The destination is 8 byte aligned but the source is not, and there are 8
	! or more bytes to be copied.
$Lunaligned_copy:
	! Adjust the source pointer (A1.2) to the 8 byte boundary before its
	! current value
	MOV	D0Ar4, A1.2
	MOV	D0Ar6, A1.2
	ANDMB	D0Ar4, D0Ar4, #0xfff8
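	! ANDMB ANDs the immediate into the bottom 16 bits of the value
	! (assuming the usual reading of Meta's masked AND ops), clearing the
	! low 3 bits and rounding down to an 8 byte boundary.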
	MOV	A1.2, D0Ar4
	! Save the number of bytes of mis-alignment in D0Ar4 for use later
	SUBS	D0Ar6, D0Ar6, D0Ar4
	MOV	D0Ar4, D0Ar6
	! if there is no mis-alignment after all, use the aligned copy loop
	BZ	$Laligned_copy

	! prefetch 8 bytes
	GETL	D0Re0, D1Re0, [A1.2]

	SUB	TXRPT, D1Ar5, #1
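	! Note the GETL above does not post-increment A1.2: the loops below
	! advance the pointer with [++A1.2] pre-increments, fetching each new
	! 8 byte block while the previous one is still held in registers.
	! TXRPT is armed with the 8 byte block count from $Laligned_dst.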

	! There are 3 mis-alignment cases to be considered. Less than 4 bytes, exactly
	! 4 bytes, and more than 4 bytes.
	CMP	D0Ar6, #4
	BLT	$Lunaligned_1_2_3	! use 1-3 byte mis-alignment loop
	BZ	$Lunaligned_4		! use 4 byte mis-alignment loop
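	! The three-way split exists because data is merged in 32-bit halves
	! of each 64-bit block: a 4 byte offset is a pure register shuffle
	! with no shifting, while offsets below and above 4 need the same
	! shift-and-or merge but with the words staged differently.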

	! The mis-alignment is more than 4 bytes
$Lunaligned_5_6_7:
	SUB	D0Ar6, D0Ar6, #4
	! Calculate the bit offsets required for the shift operations necessary
	! to align the data.
	! D0Ar6 = bit offset, D1Ar5 = (32 - bit offset)
	MULW	D0Ar6, D0Ar6, #8
	MOV	D1Ar5, #32
	SUB	D1Ar5, D1Ar5, D0Ar6
	! Move data 4 bytes before we enter the main loop
	MOV	D0Re0, D1Re0
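	! Worked example: a 6 byte mis-alignment gives D0Ar6 = (6 - 4) * 8 = 16
	! and D1Ar5 = 16; each output word below is then
	! (cur >> D0Ar6) | (next << D1Ar5), with the OR done via ADD since the
	! shifted bit ranges cannot overlap.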

$Lloop_5_6_7:
	GETL	D0Ar2, D1Ar1, [++A1.2]
	! form 64-bit data in D0Re0, D1Re0
	LSR	D0Re0, D0Re0, D0Ar6
	MOV	D1Re0, D0Ar2
	LSL	D1Re0, D1Re0, D1Ar5
	ADD	D0Re0, D0Re0, D1Re0

	LSR	D0Ar2, D0Ar2, D0Ar6
	LSL	D1Re0, D1Ar1, D1Ar5
	ADD	D1Re0, D1Re0, D0Ar2

	SETL	[A0.2++], D0Re0, D1Re0
	MOV	D0Re0, D1Ar1
	BR	$Lloop_5_6_7
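	! BR falls through once the hardware loop count is exhausted, hence
	! the explicit branch to the common tail below.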

	B	$Lunaligned_end

$Lunaligned_1_2_3:
	! Calculate the bit offsets required for the shift operations necessary
	! to align the data.
	! D0Ar6 = bit offset, D1Ar5 = (32 - bit offset)
	MULW	D0Ar6, D0Ar6, #8
	MOV	D1Ar5, #32
	SUB	D1Ar5, D1Ar5, D0Ar6
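	! Same shift-and-or merge as $Lloop_5_6_7, but with less than 4 bytes
	! of mis-alignment both words of the previous pair contribute to the
	! output low word, so D0FrT stages the shifted high word across the
	! GETL that overwrites D0Ar2/D1Ar1.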

$Lloop_1_2_3:
	! form 64-bit data in D0Re0,D1Re0
	LSR	D0Re0, D0Re0, D0Ar6
	LSL	D1Ar1, D1Re0, D1Ar5
	ADD	D0Re0, D0Re0, D1Ar1
	MOV	D0Ar2, D1Re0
	LSR	D0FrT, D0Ar2, D0Ar6
	GETL	D0Ar2, D1Ar1, [++A1.2]

	MOV	D1Re0, D0Ar2
	LSL	D1Re0, D1Re0, D1Ar5
	ADD	D1Re0, D1Re0, D0FrT

	SETL	[A0.2++], D0Re0, D1Re0
	MOV	D0Re0, D0Ar2
	MOV	D1Re0, D1Ar1
	BR	$Lloop_1_2_3

	B	$Lunaligned_end

	! The 4 byte mis-alignment case - this does not require any shifting, just a
	! shuffling of registers.
$Lunaligned_4:
	MOV	D0Re0, D1Re0
$Lloop_4:
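	! Emit the high word carried over from the previous 8 byte block
	! together with the low word of the newly fetched one - a one-word
	! rotation of the register window instead of a shift.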
	GETL	D0Ar2, D1Ar1, [++A1.2]
	MOV	D1Re0, D0Ar2
	SETL	[A0.2++], D0Re0, D1Re0
	MOV	D0Re0, D1Ar1
	BR	$Lloop_4

$Lunaligned_end:
	! If there are no remaining bytes to copy, we are done.
	ANDS	D1Ar3, D1Ar3, #7
	BZ	$Lend
	! Re-adjust the source pointer (A1.2) back to the actual (unaligned) byte
	! address of the remaining bytes, and fall through to the byte copy loop.
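	! D0Ar4 still holds the source mis-alignment saved at $Lunaligned_copy,
	! and D1Ar3 now holds cnt mod 8 for the byte copy loop to finish off.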
	MOV	D0Ar6, A1.2
	ADD	D1Ar5, D0Ar4, D0Ar6
	MOV	A1.2, D1Ar5
	B	$Lbyte_copy

	.size	_memcpy,.-_memcpy