Add upstream aarch64 NEON fix, re-enable SIMD on aarch64
This commit is contained in:
parent
7e72dfb78e
commit
66cb5e9964
282
libjpeg-turbo-aarch64-fixneon.patch
Normal file
282
libjpeg-turbo-aarch64-fixneon.patch
Normal file
@ -0,0 +1,282 @@
|
||||
From cb88e5da8003afcdc443b787fdcb77285e5a8a02 Mon Sep 17 00:00:00 2001
|
||||
From: mayeut <mayeut@users.noreply.github.com>
|
||||
Date: Tue, 20 Sep 2016 21:06:24 +0200
|
||||
Subject: [PATCH] ARM64 NEON: Fix another ABI conformance issue
|
||||
|
||||
Based on
|
||||
https://github.com/mayeut/libjpeg-turbo/commit/98a5a9dc899aa9265858a3cbe0a96289a31a1322
|
||||
with wordsmithing by DRC.
|
||||
|
||||
In the AArch64 ABI, as in many others, it's forbidden to read/store data
|
||||
below the stack pointer. Some SIMD functions were doing just that
|
||||
(stack pointer misuse) when trying to preserve callee-saved registers,
|
||||
and this resulted in those registers being restored with incorrect
|
||||
contents under certain circumstances.
|
||||
|
||||
This patch fixes that behavior, and callee-saved registers are now
|
||||
stored above the stack pointer throughout the function call. The patch
|
||||
also removes register saving in places where it is unnecessary for this
|
||||
ABI, or it makes use of unused scratch regiters instead of callee-saved
|
||||
registers.
|
||||
|
||||
Fixes #97. Closes #101.
|
||||
|
||||
Refer also to https://bugzilla.redhat.com/show_bug.cgi?id=1368569
|
||||
---
|
||||
simd/jsimd_arm64_neon.S | 108 +++++++++++++-----------------------------------
|
||||
2 files changed, 34 insertions(+), 80 deletions(-)
|
||||
|
||||
diff --git a/simd/jsimd_arm64_neon.S b/simd/jsimd_arm64_neon.S
|
||||
index 6c1a959..3309858 100644
|
||||
--- a/simd/jsimd_arm64_neon.S
|
||||
+++ b/simd/jsimd_arm64_neon.S
|
||||
@@ -217,8 +217,9 @@ asm_function jsimd_idct_islow_neon
|
||||
|
||||
sub sp, sp, #64
|
||||
adr x15, Ljsimd_idct_islow_neon_consts
|
||||
- st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], #32
|
||||
- st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], #32
|
||||
+ mov x10, sp
|
||||
+ st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], #32
|
||||
+ st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], #32
|
||||
ld1 {v0.8h, v1.8h}, [x15]
|
||||
ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [COEF_BLOCK], #64
|
||||
ld1 {v18.8h, v19.8h, v20.8h, v21.8h}, [DCT_TABLE], #64
|
||||
@@ -243,7 +244,6 @@ asm_function jsimd_idct_islow_neon
|
||||
shl v10.8h, v2.8h, #(PASS1_BITS)
|
||||
sqxtn v16.8b, v15.8h
|
||||
mov TMP1, v16.d[0]
|
||||
- sub sp, sp, #64
|
||||
mvn TMP2, TMP1
|
||||
|
||||
cbnz TMP2, 2f
|
||||
@@ -1117,18 +1117,12 @@ asm_function jsimd_idct_4x4_neon
|
||||
uxtw x3, w3
|
||||
|
||||
/* Save all used NEON registers */
|
||||
- sub sp, sp, 272
|
||||
- str x15, [sp], 16
|
||||
+ sub sp, sp, 64
|
||||
+ mov x9, sp
|
||||
/* Load constants (v3.4h is just used for padding) */
|
||||
adr TMP4, Ljsimd_idct_4x4_neon_consts
|
||||
- st1 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
|
||||
- st1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
|
||||
- st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
|
||||
- st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
|
||||
- st1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
|
||||
- st1 {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
|
||||
- st1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
|
||||
- st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
|
||||
+ st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
|
||||
+ st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
|
||||
ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [TMP4]
|
||||
|
||||
/* Load all COEF_BLOCK into NEON registers with the following allocation:
|
||||
@@ -1237,16 +1231,8 @@ asm_function jsimd_idct_4x4_neon
|
||||
#endif
|
||||
|
||||
/* vpop {v8.4h - v15.4h} ;not available */
|
||||
- sub sp, sp, #272
|
||||
- ldr x15, [sp], 16
|
||||
- ld1 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
|
||||
- ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
|
||||
ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
|
||||
ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
|
||||
- ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
|
||||
- ld1 {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
|
||||
- ld1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
|
||||
- ld1 {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
|
||||
blr x30
|
||||
|
||||
.unreq DCT_TABLE
|
||||
@@ -1320,18 +1306,13 @@ asm_function jsimd_idct_2x2_neon
|
||||
uxtw x3, w3
|
||||
|
||||
/* vpush {v8.4h - v15.4h} ; not available */
|
||||
- sub sp, sp, 208
|
||||
- str x15, [sp], 16
|
||||
+ sub sp, sp, 64
|
||||
+ mov x9, sp
|
||||
|
||||
/* Load constants */
|
||||
adr TMP2, Ljsimd_idct_2x2_neon_consts
|
||||
- st1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
|
||||
- st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
|
||||
- st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
|
||||
- st1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
|
||||
- st1 {v21.8b, v22.8b}, [sp], 16
|
||||
- st1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
|
||||
- st1 {v30.8b, v31.8b}, [sp], 16
|
||||
+ st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
|
||||
+ st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
|
||||
ld1 {v14.4h}, [TMP2]
|
||||
|
||||
/* Load all COEF_BLOCK into NEON registers with the following allocation:
|
||||
@@ -1431,15 +1412,8 @@ asm_function jsimd_idct_2x2_neon
|
||||
st1 {v26.b}[1], [TMP2], 1
|
||||
st1 {v27.b}[5], [TMP2], 1
|
||||
|
||||
- sub sp, sp, #208
|
||||
- ldr x15, [sp], 16
|
||||
- ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
|
||||
ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
|
||||
ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
|
||||
- ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
|
||||
- ld1 {v21.8b, v22.8b}, [sp], 16
|
||||
- ld1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
|
||||
- ld1 {v30.8b, v31.8b}, [sp], 16
|
||||
blr x30
|
||||
|
||||
.unreq DCT_TABLE
|
||||
@@ -1719,13 +1693,13 @@ asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3
|
||||
INPUT_BUF2 .req x1
|
||||
|
||||
RGB .req x7
|
||||
- Y .req x8
|
||||
- U .req x9
|
||||
- V .req x10
|
||||
+ Y .req x9
|
||||
+ U .req x10
|
||||
+ V .req x11
|
||||
N .req w15
|
||||
|
||||
- sub sp, sp, 336
|
||||
- str x15, [sp], 16
|
||||
+ sub sp, sp, 64
|
||||
+ mov x9, sp
|
||||
|
||||
/* Load constants to d1, d2, d3 (v0.4h is just used for padding) */
|
||||
.if \fast_st3 == 1
|
||||
@@ -1735,23 +1709,11 @@ asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3
|
||||
.endif
|
||||
|
||||
/* Save NEON registers */
|
||||
- st1 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
|
||||
- st1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
|
||||
- st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
|
||||
- st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
|
||||
- st1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
|
||||
- st1 {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
|
||||
- st1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
|
||||
- st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
|
||||
+ st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
|
||||
+ st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
|
||||
ld1 {v0.4h, v1.4h}, [x15], 16
|
||||
ld1 {v2.8h}, [x15]
|
||||
|
||||
- /* Save ARM registers and handle input arguments */
|
||||
- /* push {x4, x5, x6, x7, x8, x9, x10, x30} */
|
||||
- stp x4, x5, [sp], 16
|
||||
- stp x6, x7, [sp], 16
|
||||
- stp x8, x9, [sp], 16
|
||||
- stp x10, x30, [sp], 16
|
||||
ldr INPUT_BUF0, [INPUT_BUF]
|
||||
ldr INPUT_BUF1, [INPUT_BUF, #8]
|
||||
ldr INPUT_BUF2, [INPUT_BUF, #16]
|
||||
@@ -1818,21 +1780,8 @@ asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3
|
||||
b.gt 0b
|
||||
9:
|
||||
/* Restore all registers and return */
|
||||
- sub sp, sp, #336
|
||||
- ldr x15, [sp], 16
|
||||
- ld1 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
|
||||
- ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
|
||||
ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
|
||||
ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
|
||||
- ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
|
||||
- ld1 {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
|
||||
- ld1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
|
||||
- ld1 {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
|
||||
- /* pop {r4, r5, r6, r7, r8, r9, r10, pc} */
|
||||
- ldp x4, x5, [sp], 16
|
||||
- ldp x6, x7, [sp], 16
|
||||
- ldp x8, x9, [sp], 16
|
||||
- ldp x10, x30, [sp], 16
|
||||
br x30
|
||||
.unreq OUTPUT_WIDTH
|
||||
.unreq INPUT_ROW
|
||||
@@ -2101,8 +2050,9 @@ asm_function jsimd_\colorid\()_ycc_convert_neon_slowld3
|
||||
|
||||
/* Save NEON registers */
|
||||
sub sp, sp, #64
|
||||
- st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
|
||||
- st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
|
||||
+ mov x9, sp
|
||||
+ st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
|
||||
+ st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
|
||||
|
||||
/* Outer loop over scanlines */
|
||||
cmp NUM_ROWS, #1
|
||||
@@ -2155,7 +2105,6 @@ asm_function jsimd_\colorid\()_ycc_convert_neon_slowld3
|
||||
b.gt 0b
|
||||
9:
|
||||
/* Restore all registers and return */
|
||||
- sub sp, sp, #64
|
||||
ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
|
||||
ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
|
||||
br x30
|
||||
@@ -2359,8 +2308,9 @@ asm_function jsimd_fdct_islow_neon
|
||||
|
||||
/* Save NEON registers */
|
||||
sub sp, sp, #64
|
||||
- st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
|
||||
- st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
|
||||
+ mov x10, sp
|
||||
+ st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], 32
|
||||
+ st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], 32
|
||||
|
||||
/* Load all DATA into NEON registers with the following allocation:
|
||||
* 0 1 2 3 | 4 5 6 7
|
||||
@@ -2590,7 +2540,6 @@ asm_function jsimd_fdct_islow_neon
|
||||
st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
|
||||
|
||||
/* Restore NEON registers */
|
||||
- sub sp, sp, #64
|
||||
ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
|
||||
ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
|
||||
|
||||
@@ -3104,7 +3053,7 @@ asm_function jsimd_huff_encode_one_block_neon_slowtbl
|
||||
sub sp, sp, 272
|
||||
sub BUFFER, BUFFER, #0x1 /* BUFFER=buffer-- */
|
||||
/* Save ARM registers */
|
||||
- stp x19, x20, [sp], 16
|
||||
+ stp x19, x20, [sp]
|
||||
.if \fast_tbl == 1
|
||||
adr x15, Ljsimd_huff_encode_one_block_neon_consts
|
||||
.else
|
||||
@@ -3318,7 +3267,7 @@ asm_function jsimd_huff_encode_one_block_neon_slowtbl
|
||||
and v18.16b, v18.16b, v23.16b
|
||||
add x3, x4, #0x400 /* r1 = dctbl->ehufsi */
|
||||
and v20.16b, v20.16b, v23.16b
|
||||
- add x15, sp, #0x80 /* x15 = t2 */
|
||||
+ add x15, sp, #0x90 /* x15 = t2 */
|
||||
and v22.16b, v22.16b, v23.16b
|
||||
ldr w10, [x4, x12, lsl #2]
|
||||
addp v16.16b, v16.16b, v18.16b
|
||||
@@ -3341,7 +3290,7 @@ asm_function jsimd_huff_encode_one_block_neon_slowtbl
|
||||
rbit x9, x9 /* x9 = index0 */
|
||||
ldrb w14, [x4, #0xf0] /* x14 = actbl->ehufsi[0xf0] */
|
||||
cmp w12, #(64-8)
|
||||
- mov x11, sp
|
||||
+ add x11, sp, #16
|
||||
b.lt 4f
|
||||
cbz x9, 6f
|
||||
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64
|
||||
@@ -3445,7 +3394,7 @@ asm_function jsimd_huff_encode_one_block_neon_slowtbl
|
||||
put_bits x3, x11
|
||||
cbnz x9, 1b
|
||||
6:
|
||||
- add x13, sp, #0xfe
|
||||
+ add x13, sp, #0x10e
|
||||
cmp x15, x13
|
||||
b.hs 1f
|
||||
ldr w12, [x5]
|
||||
@@ -3453,7 +3402,6 @@ asm_function jsimd_huff_encode_one_block_neon_slowtbl
|
||||
checkbuf47
|
||||
put_bits x12, x14
|
||||
1:
|
||||
- sub sp, sp, 16
|
||||
str PUT_BUFFER, [x0, #0x10]
|
||||
str PUT_BITSw, [x0, #0x18]
|
||||
ldp x19, x20, [sp], 16
|
@ -1,6 +1,6 @@
|
||||
Name: libjpeg-turbo
|
||||
Version: 1.5.0
|
||||
Release: 3%{?dist}
|
||||
Release: 4%{?dist}
|
||||
Summary: A MMX/SSE2/SIMD accelerated library for manipulating JPEG image files
|
||||
License: IJG
|
||||
URL: http://sourceforge.net/projects/libjpeg-turbo
|
||||
@ -10,6 +10,7 @@ Patch0: libjpeg-turbo14-noinst.patch
|
||||
Patch1: libjpeg-turbo-header-files.patch
|
||||
Patch2: libjpeg-turbo-aarch64.patch
|
||||
Patch3: libjpeg-turbo-arm-neon.patch
|
||||
Patch4: libjpeg-turbo-aarch64-fixneon.patch
|
||||
|
||||
BuildRequires: autoconf
|
||||
BuildRequires: automake
|
||||
@ -75,14 +76,11 @@ manipulate JPEG files using the TurboJPEG library.
|
||||
%patch1 -p1 -b .header-files
|
||||
%patch2 -p1 -b .aarch64
|
||||
%patch3 -p1 -b .neon
|
||||
%patch4 -p1 -b .a64-neon
|
||||
|
||||
%build
|
||||
autoreconf -vif
|
||||
%configure \
|
||||
%ifarch aarch64
|
||||
--without-simd \
|
||||
%endif
|
||||
--disable-static
|
||||
%configure --disable-static
|
||||
|
||||
make %{?_smp_mflags} V=1
|
||||
|
||||
@ -176,6 +174,9 @@ make test %{?_smp_mflags}
|
||||
%{_libdir}/pkgconfig/libturbojpeg.pc
|
||||
|
||||
%changelog
|
||||
* Wed Sep 21 2016 Peter Robinson <pbrobinson@fedoraproject.org> 1.5.0-4
|
||||
- Add upstream aarch64 NEON fix, re-enable SIMD on aarch64
|
||||
|
||||
* Mon Sep 19 2016 Peter Robinson <pbrobinson@fedoraproject.org> 1.5.0-3
|
||||
- Temporarily disable SIMD on aarch64 until upstream #97 is fixed
|
||||
- Add NEON fix for ARMv7
|
||||
|
Loading…
x
Reference in New Issue
Block a user