Add upstream aarch64 NEON fix, re-enable SIMD on aarch64

This commit is contained in:
Peter Robinson 2016-09-21 19:29:28 +01:00
parent 7e72dfb78e
commit 66cb5e9964
2 changed files with 289 additions and 6 deletions

View File

@ -0,0 +1,282 @@
From cb88e5da8003afcdc443b787fdcb77285e5a8a02 Mon Sep 17 00:00:00 2001
From: mayeut <mayeut@users.noreply.github.com>
Date: Tue, 20 Sep 2016 21:06:24 +0200
Subject: [PATCH] ARM64 NEON: Fix another ABI conformance issue
Based on
https://github.com/mayeut/libjpeg-turbo/commit/98a5a9dc899aa9265858a3cbe0a96289a31a1322
with wordsmithing by DRC.
In the AArch64 ABI, as in many others, it's forbidden to read/store data
below the stack pointer. Some SIMD functions were doing just that
(stack pointer misuse) when trying to preserve callee-saved registers,
and this resulted in those registers being restored with incorrect
contents under certain circumstances.
This patch fixes that behavior, and callee-saved registers are now
stored above the stack pointer throughout the function call. The patch
also removes register saving in places where it is unnecessary for this
ABI, or it makes use of unused scratch registers instead of callee-saved
registers.
Fixes #97. Closes #101.
Refer also to https://bugzilla.redhat.com/show_bug.cgi?id=1368569
---
simd/jsimd_arm64_neon.S | 108 +++++++++++++-----------------------------------
2 files changed, 34 insertions(+), 80 deletions(-)
diff --git a/simd/jsimd_arm64_neon.S b/simd/jsimd_arm64_neon.S
index 6c1a959..3309858 100644
--- a/simd/jsimd_arm64_neon.S
+++ b/simd/jsimd_arm64_neon.S
@@ -217,8 +217,9 @@ asm_function jsimd_idct_islow_neon
sub sp, sp, #64
adr x15, Ljsimd_idct_islow_neon_consts
- st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], #32
- st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], #32
+ mov x10, sp
+ st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], #32
+ st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], #32
ld1 {v0.8h, v1.8h}, [x15]
ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [COEF_BLOCK], #64
ld1 {v18.8h, v19.8h, v20.8h, v21.8h}, [DCT_TABLE], #64
@@ -243,7 +244,6 @@ asm_function jsimd_idct_islow_neon
shl v10.8h, v2.8h, #(PASS1_BITS)
sqxtn v16.8b, v15.8h
mov TMP1, v16.d[0]
- sub sp, sp, #64
mvn TMP2, TMP1
cbnz TMP2, 2f
@@ -1117,18 +1117,12 @@ asm_function jsimd_idct_4x4_neon
uxtw x3, w3
/* Save all used NEON registers */
- sub sp, sp, 272
- str x15, [sp], 16
+ sub sp, sp, 64
+ mov x9, sp
/* Load constants (v3.4h is just used for padding) */
adr TMP4, Ljsimd_idct_4x4_neon_consts
- st1 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
- st1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
- st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
- st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
- st1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
- st1 {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
- st1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
- st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
+ st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
+ st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [TMP4]
/* Load all COEF_BLOCK into NEON registers with the following allocation:
@@ -1237,16 +1231,8 @@ asm_function jsimd_idct_4x4_neon
#endif
/* vpop {v8.4h - v15.4h} ;not available */
- sub sp, sp, #272
- ldr x15, [sp], 16
- ld1 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
- ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
- ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
- ld1 {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
- ld1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
- ld1 {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
blr x30
.unreq DCT_TABLE
@@ -1320,18 +1306,13 @@ asm_function jsimd_idct_2x2_neon
uxtw x3, w3
/* vpush {v8.4h - v15.4h} ; not available */
- sub sp, sp, 208
- str x15, [sp], 16
+ sub sp, sp, 64
+ mov x9, sp
/* Load constants */
adr TMP2, Ljsimd_idct_2x2_neon_consts
- st1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
- st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
- st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
- st1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
- st1 {v21.8b, v22.8b}, [sp], 16
- st1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
- st1 {v30.8b, v31.8b}, [sp], 16
+ st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
+ st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
ld1 {v14.4h}, [TMP2]
/* Load all COEF_BLOCK into NEON registers with the following allocation:
@@ -1431,15 +1412,8 @@ asm_function jsimd_idct_2x2_neon
st1 {v26.b}[1], [TMP2], 1
st1 {v27.b}[5], [TMP2], 1
- sub sp, sp, #208
- ldr x15, [sp], 16
- ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
- ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
- ld1 {v21.8b, v22.8b}, [sp], 16
- ld1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
- ld1 {v30.8b, v31.8b}, [sp], 16
blr x30
.unreq DCT_TABLE
@@ -1719,13 +1693,13 @@ asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3
INPUT_BUF2 .req x1
RGB .req x7
- Y .req x8
- U .req x9
- V .req x10
+ Y .req x9
+ U .req x10
+ V .req x11
N .req w15
- sub sp, sp, 336
- str x15, [sp], 16
+ sub sp, sp, 64
+ mov x9, sp
/* Load constants to d1, d2, d3 (v0.4h is just used for padding) */
.if \fast_st3 == 1
@@ -1735,23 +1709,11 @@ asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3
.endif
/* Save NEON registers */
- st1 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
- st1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
- st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
- st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
- st1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
- st1 {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
- st1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
- st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
+ st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
+ st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
ld1 {v0.4h, v1.4h}, [x15], 16
ld1 {v2.8h}, [x15]
- /* Save ARM registers and handle input arguments */
- /* push {x4, x5, x6, x7, x8, x9, x10, x30} */
- stp x4, x5, [sp], 16
- stp x6, x7, [sp], 16
- stp x8, x9, [sp], 16
- stp x10, x30, [sp], 16
ldr INPUT_BUF0, [INPUT_BUF]
ldr INPUT_BUF1, [INPUT_BUF, #8]
ldr INPUT_BUF2, [INPUT_BUF, #16]
@@ -1818,21 +1780,8 @@ asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3
b.gt 0b
9:
/* Restore all registers and return */
- sub sp, sp, #336
- ldr x15, [sp], 16
- ld1 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
- ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
- ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
- ld1 {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
- ld1 {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
- ld1 {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
- /* pop {r4, r5, r6, r7, r8, r9, r10, pc} */
- ldp x4, x5, [sp], 16
- ldp x6, x7, [sp], 16
- ldp x8, x9, [sp], 16
- ldp x10, x30, [sp], 16
br x30
.unreq OUTPUT_WIDTH
.unreq INPUT_ROW
@@ -2101,8 +2050,9 @@ asm_function jsimd_\colorid\()_ycc_convert_neon_slowld3
/* Save NEON registers */
sub sp, sp, #64
- st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
- st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
+ mov x9, sp
+ st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
+ st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
/* Outer loop over scanlines */
cmp NUM_ROWS, #1
@@ -2155,7 +2105,6 @@ asm_function jsimd_\colorid\()_ycc_convert_neon_slowld3
b.gt 0b
9:
/* Restore all registers and return */
- sub sp, sp, #64
ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
br x30
@@ -2359,8 +2308,9 @@ asm_function jsimd_fdct_islow_neon
/* Save NEON registers */
sub sp, sp, #64
- st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
- st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
+ mov x10, sp
+ st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], 32
+ st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], 32
/* Load all DATA into NEON registers with the following allocation:
* 0 1 2 3 | 4 5 6 7
@@ -2590,7 +2540,6 @@ asm_function jsimd_fdct_islow_neon
st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
/* Restore NEON registers */
- sub sp, sp, #64
ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
@@ -3104,7 +3053,7 @@ asm_function jsimd_huff_encode_one_block_neon_slowtbl
sub sp, sp, 272
sub BUFFER, BUFFER, #0x1 /* BUFFER=buffer-- */
/* Save ARM registers */
- stp x19, x20, [sp], 16
+ stp x19, x20, [sp]
.if \fast_tbl == 1
adr x15, Ljsimd_huff_encode_one_block_neon_consts
.else
@@ -3318,7 +3267,7 @@ asm_function jsimd_huff_encode_one_block_neon_slowtbl
and v18.16b, v18.16b, v23.16b
add x3, x4, #0x400 /* r1 = dctbl->ehufsi */
and v20.16b, v20.16b, v23.16b
- add x15, sp, #0x80 /* x15 = t2 */
+ add x15, sp, #0x90 /* x15 = t2 */
and v22.16b, v22.16b, v23.16b
ldr w10, [x4, x12, lsl #2]
addp v16.16b, v16.16b, v18.16b
@@ -3341,7 +3290,7 @@ asm_function jsimd_huff_encode_one_block_neon_slowtbl
rbit x9, x9 /* x9 = index0 */
ldrb w14, [x4, #0xf0] /* x14 = actbl->ehufsi[0xf0] */
cmp w12, #(64-8)
- mov x11, sp
+ add x11, sp, #16
b.lt 4f
cbz x9, 6f
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64
@@ -3445,7 +3394,7 @@ asm_function jsimd_huff_encode_one_block_neon_slowtbl
put_bits x3, x11
cbnz x9, 1b
6:
- add x13, sp, #0xfe
+ add x13, sp, #0x10e
cmp x15, x13
b.hs 1f
ldr w12, [x5]
@@ -3453,7 +3402,6 @@ asm_function jsimd_huff_encode_one_block_neon_slowtbl
checkbuf47
put_bits x12, x14
1:
- sub sp, sp, 16
str PUT_BUFFER, [x0, #0x10]
str PUT_BITSw, [x0, #0x18]
ldp x19, x20, [sp], 16

View File

@ -1,6 +1,6 @@
Name: libjpeg-turbo
Version: 1.5.0
Release: 3%{?dist}
Release: 4%{?dist}
Summary: A MMX/SSE2/SIMD accelerated library for manipulating JPEG image files
License: IJG
URL: http://sourceforge.net/projects/libjpeg-turbo
@ -10,6 +10,7 @@ Patch0: libjpeg-turbo14-noinst.patch
Patch1: libjpeg-turbo-header-files.patch
Patch2: libjpeg-turbo-aarch64.patch
Patch3: libjpeg-turbo-arm-neon.patch
Patch4: libjpeg-turbo-aarch64-fixneon.patch
BuildRequires: autoconf
BuildRequires: automake
@ -75,14 +76,11 @@ manipulate JPEG files using the TurboJPEG library.
%patch1 -p1 -b .header-files
%patch2 -p1 -b .aarch64
%patch3 -p1 -b .neon
%patch4 -p1 -b .a64-neon
%build
autoreconf -vif
%configure \
%ifarch aarch64
--without-simd \
%endif
--disable-static
%configure --disable-static
make %{?_smp_mflags} V=1
@ -176,6 +174,9 @@ make test %{?_smp_mflags}
%{_libdir}/pkgconfig/libturbojpeg.pc
%changelog
* Wed Sep 21 2016 Peter Robinson <pbrobinson@fedoraproject.org> 1.5.0-4
- Add upstream aarch64 NEON fix, re-enable SIMD on aarch64
* Mon Sep 19 2016 Peter Robinson <pbrobinson@fedoraproject.org> 1.5.0-3
- Temporarily disable SIMD on aarch64 until upstream #97 is fixed
- Add NEON fix for ARMv7