Add upstream aarch64 NEON fix, re-enable SIMD on aarch64

2016-09-21 19:29:28 +01:00 · 2016-09-21 19:29:28 +01:00 · 66cb5e9964
commit 66cb5e9964
parent 7e72dfb78e
2 changed files with 289 additions and 6 deletions
--- a/libjpeg-turbo-aarch64-fixneon.patch
+++ b/libjpeg-turbo-aarch64-fixneon.patch
@ -0,0 +1,282 @@
+From cb88e5da8003afcdc443b787fdcb77285e5a8a02 Mon Sep 17 00:00:00 2001
+From: mayeut <mayeut@users.noreply.github.com>
+Date: Tue, 20 Sep 2016 21:06:24 +0200
+Subject: [PATCH] ARM64 NEON: Fix another ABI conformance issue
+
+Based on
+https://github.com/mayeut/libjpeg-turbo/commit/98a5a9dc899aa9265858a3cbe0a96289a31a1322
+with wordsmithing by DRC.
+
+In the AArch64 ABI, as in many others, it's forbidden to read/store data
+below the stack pointer.  Some SIMD functions were doing just that
+(stack pointer misuse) when trying to preserve callee-saved registers,
+and this resulted in those registers being restored with incorrect
+contents under certain circumstances.
+
+This patch fixes that behavior, and callee-saved registers are now
+stored above the stack pointer throughout the function call.  The patch
+also removes register saving in places where it is unnecessary for this
+ABI, or it makes use of unused scratch regiters instead of callee-saved
+registers.
+
+Fixes #97.  Closes #101.
+
+Refer also to https://bugzilla.redhat.com/show_bug.cgi?id=1368569
+---
+ simd/jsimd_arm64_neon.S | 108 +++++++++++++-----------------------------------
+ 2 files changed, 34 insertions(+), 80 deletions(-)
+
+diff --git a/simd/jsimd_arm64_neon.S b/simd/jsimd_arm64_neon.S
+index 6c1a959..3309858 100644
+--- a/simd/jsimd_arm64_neon.S
+++ b/simd/jsimd_arm64_neon.S
+@@ -217,8 +217,9 @@ asm_function jsimd_idct_islow_neon
+ 
+     sub             sp, sp, #64
+     adr             x15, Ljsimd_idct_islow_neon_consts
+-    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], #32
+-    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], #32
+    mov             x10, sp
+    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], #32
+    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], #32
+     ld1             {v0.8h, v1.8h}, [x15]
+     ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [COEF_BLOCK], #64
+     ld1             {v18.8h, v19.8h, v20.8h, v21.8h}, [DCT_TABLE], #64
+@@ -243,7 +244,6 @@ asm_function jsimd_idct_islow_neon
+     shl             v10.8h, v2.8h, #(PASS1_BITS)
+     sqxtn           v16.8b, v15.8h
+     mov             TMP1, v16.d[0]
+-    sub             sp, sp, #64
+     mvn             TMP2, TMP1
+ 
+     cbnz            TMP2, 2f
+@@ -1117,18 +1117,12 @@ asm_function jsimd_idct_4x4_neon
+     uxtw x3, w3
+ 
+     /* Save all used NEON registers */
+-    sub             sp, sp, 272
+-    str             x15, [sp], 16
+    sub             sp, sp, 64
+    mov             x9, sp
+     /* Load constants (v3.4h is just used for padding) */
+     adr             TMP4, Ljsimd_idct_4x4_neon_consts
+-    st1             {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
+-    st1             {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
+-    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
+-    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
+-    st1             {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
+-    st1             {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
+-    st1             {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
+-    st1             {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
+    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
+    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
+     ld1             {v0.4h, v1.4h, v2.4h, v3.4h}, [TMP4]
+ 
+     /* Load all COEF_BLOCK into NEON registers with the following allocation:
+@@ -1237,16 +1231,8 @@ asm_function jsimd_idct_4x4_neon
+ #endif
+ 
+     /* vpop            {v8.4h - v15.4h}    ;not available */
+-    sub             sp, sp, #272
+-    ldr             x15, [sp], 16
+-    ld1             {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
+-    ld1             {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
+     ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
+     ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
+-    ld1             {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
+-    ld1             {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
+-    ld1             {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
+-    ld1             {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
+     blr             x30
+ 
+     .unreq          DCT_TABLE
+@@ -1320,18 +1306,13 @@ asm_function jsimd_idct_2x2_neon
+     uxtw x3, w3
+ 
+     /* vpush           {v8.4h - v15.4h}            ; not available */
+-    sub             sp, sp, 208
+-    str             x15, [sp], 16
+    sub             sp, sp, 64
+    mov             x9, sp
+ 
+     /* Load constants */
+     adr             TMP2, Ljsimd_idct_2x2_neon_consts
+-    st1             {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
+-    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
+-    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
+-    st1             {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
+-    st1             {v21.8b, v22.8b}, [sp], 16
+-    st1             {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
+-    st1             {v30.8b, v31.8b}, [sp], 16
+    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
+    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
+     ld1             {v14.4h}, [TMP2]
+ 
+     /* Load all COEF_BLOCK into NEON registers with the following allocation:
+@@ -1431,15 +1412,8 @@ asm_function jsimd_idct_2x2_neon
+     st1             {v26.b}[1], [TMP2], 1
+     st1             {v27.b}[5], [TMP2], 1
+ 
+-    sub             sp, sp, #208
+-    ldr             x15, [sp], 16
+-    ld1             {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
+     ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
+     ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
+-    ld1             {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
+-    ld1             {v21.8b, v22.8b}, [sp], 16
+-    ld1             {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
+-    ld1             {v30.8b, v31.8b}, [sp], 16
+     blr             x30
+ 
+     .unreq          DCT_TABLE
+@@ -1719,13 +1693,13 @@ asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3
+     INPUT_BUF2      .req x1
+ 
+     RGB             .req x7
+-    Y               .req x8
+-    U               .req x9
+-    V               .req x10
+    Y               .req x9
+    U               .req x10
+    V               .req x11
+     N               .req w15
+ 
+-    sub             sp, sp, 336
+-    str             x15, [sp], 16
+    sub             sp, sp, 64
+    mov             x9, sp
+ 
+     /* Load constants to d1, d2, d3 (v0.4h is just used for padding) */
+     .if \fast_st3 == 1
+@@ -1735,23 +1709,11 @@ asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3
+     .endif
+ 
+     /* Save NEON registers */
+-    st1             {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
+-    st1             {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
+-    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
+-    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
+-    st1             {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
+-    st1             {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
+-    st1             {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
+-    st1             {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
+    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
+    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
+     ld1             {v0.4h, v1.4h}, [x15], 16
+     ld1             {v2.8h}, [x15]
+ 
+-    /* Save ARM registers and handle input arguments */
+-    /* push            {x4, x5, x6, x7, x8, x9, x10, x30} */
+-    stp             x4, x5, [sp], 16
+-    stp             x6, x7, [sp], 16
+-    stp             x8, x9, [sp], 16
+-    stp             x10, x30, [sp], 16
+     ldr             INPUT_BUF0, [INPUT_BUF]
+     ldr             INPUT_BUF1, [INPUT_BUF, #8]
+     ldr             INPUT_BUF2, [INPUT_BUF, #16]
+@@ -1818,21 +1780,8 @@ asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3
+     b.gt            0b
+ 9:
+     /* Restore all registers and return */
+-    sub             sp, sp, #336
+-    ldr             x15, [sp], 16
+-    ld1             {v0.8b, v1.8b, v2.8b, v3.8b}, [sp], 32
+-    ld1             {v4.8b, v5.8b, v6.8b, v7.8b}, [sp], 32
+     ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
+     ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
+-    ld1             {v16.8b, v17.8b, v18.8b, v19.8b}, [sp], 32
+-    ld1             {v20.8b, v21.8b, v22.8b, v23.8b}, [sp], 32
+-    ld1             {v24.8b, v25.8b, v26.8b, v27.8b}, [sp], 32
+-    ld1             {v28.8b, v29.8b, v30.8b, v31.8b}, [sp], 32
+-    /* pop             {r4, r5, r6, r7, r8, r9, r10, pc} */
+-    ldp             x4, x5, [sp], 16
+-    ldp             x6, x7, [sp], 16
+-    ldp             x8, x9, [sp], 16
+-    ldp             x10, x30, [sp], 16
+     br              x30
+     .unreq          OUTPUT_WIDTH
+     .unreq          INPUT_ROW
+@@ -2101,8 +2050,9 @@ asm_function jsimd_\colorid\()_ycc_convert_neon_slowld3
+ 
+     /* Save NEON registers */
+     sub             sp, sp, #64
+-    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
+-    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
+    mov             x9, sp
+    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
+    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
+ 
+     /* Outer loop over scanlines */
+     cmp             NUM_ROWS, #1
+@@ -2155,7 +2105,6 @@ asm_function jsimd_\colorid\()_ycc_convert_neon_slowld3
+     b.gt            0b
+ 9:
+     /* Restore all registers and return */
+-    sub             sp, sp, #64
+     ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
+     ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
+     br              x30
+@@ -2359,8 +2308,9 @@ asm_function jsimd_fdct_islow_neon
+ 
+     /* Save NEON registers */
+     sub             sp, sp, #64
+-    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
+-    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
+    mov             x10, sp
+    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], 32
+    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], 32
+ 
+     /* Load all DATA into NEON registers with the following allocation:
+      *       0 1 2 3 | 4 5 6 7
+@@ -2590,7 +2540,6 @@ asm_function jsimd_fdct_islow_neon
+     st1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
+ 
+     /* Restore NEON registers */
+-    sub             sp, sp, #64
+     ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
+     ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
+ 
+@@ -3104,7 +3053,7 @@ asm_function jsimd_huff_encode_one_block_neon_slowtbl
+     sub             sp, sp, 272
+     sub             BUFFER, BUFFER, #0x1    /* BUFFER=buffer-- */
+     /* Save ARM registers */
+-    stp             x19, x20, [sp], 16
+    stp             x19, x20, [sp]
+ .if \fast_tbl == 1
+     adr             x15, Ljsimd_huff_encode_one_block_neon_consts
+ .else
+@@ -3318,7 +3267,7 @@ asm_function jsimd_huff_encode_one_block_neon_slowtbl
+     and             v18.16b, v18.16b, v23.16b
+       add             x3, x4, #0x400           /* r1 = dctbl->ehufsi */
+     and             v20.16b, v20.16b, v23.16b
+-      add             x15, sp, #0x80           /* x15 = t2 */
+      add             x15, sp, #0x90           /* x15 = t2 */
+     and             v22.16b, v22.16b, v23.16b
+       ldr             w10, [x4, x12, lsl #2]
+     addp            v16.16b, v16.16b, v18.16b
+@@ -3341,7 +3290,7 @@ asm_function jsimd_huff_encode_one_block_neon_slowtbl
+     rbit            x9, x9             /* x9 = index0 */
+     ldrb            w14, [x4, #0xf0]   /* x14 = actbl->ehufsi[0xf0] */
+     cmp             w12, #(64-8)
+-    mov             x11, sp
+    add             x11, sp, #16
+     b.lt            4f
+     cbz             x9, 6f
+     st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64
+@@ -3445,7 +3394,7 @@ asm_function jsimd_huff_encode_one_block_neon_slowtbl
+     put_bits        x3, x11
+     cbnz            x9, 1b
+ 6:
+-    add             x13, sp, #0xfe
+    add             x13, sp, #0x10e
+     cmp             x15, x13
+     b.hs            1f
+     ldr             w12, [x5]
+@@ -3453,7 +3402,6 @@ asm_function jsimd_huff_encode_one_block_neon_slowtbl
+     checkbuf47
+     put_bits        x12, x14
+ 1:
+-    sub             sp, sp, 16
+     str             PUT_BUFFER, [x0, #0x10]
+     str             PUT_BITSw, [x0, #0x18]
+     ldp             x19, x20, [sp], 16
--- a/libjpeg-turbo.spec
+++ b/libjpeg-turbo.spec
@ -1,6 +1,6 @@
 Name:           libjpeg-turbo
 Version:        1.5.0
-Release:        3%{?dist}
+Release:        4%{?dist}
 Summary:        A MMX/SSE2/SIMD accelerated library for manipulating JPEG image files
 License:        IJG
 URL:            http://sourceforge.net/projects/libjpeg-turbo
@ -10,6 +10,7 @@ Patch0:         libjpeg-turbo14-noinst.patch
 Patch1:         libjpeg-turbo-header-files.patch
 Patch2:         libjpeg-turbo-aarch64.patch
 Patch3:         libjpeg-turbo-arm-neon.patch
+Patch4:         libjpeg-turbo-aarch64-fixneon.patch

 BuildRequires:  autoconf
 BuildRequires:  automake
@ -75,14 +76,11 @@ manipulate JPEG files using the TurboJPEG library.
 %patch1 -p1 -b .header-files
 %patch2 -p1 -b .aarch64
 %patch3 -p1 -b .neon
+%patch4 -p1 -b .a64-neon

 %build
 autoreconf -vif
-%configure \
-%ifarch aarch64
--without-simd \
-%endif
--disable-static
+%configure --disable-static

 make %{?_smp_mflags} V=1

@ -176,6 +174,9 @@ make test %{?_smp_mflags}
 %{_libdir}/pkgconfig/libturbojpeg.pc

 %changelog
+* Wed Sep 21 2016 Peter Robinson <pbrobinson@fedoraproject.org> 1.5.0-4
+- Add upstream aarch64 NEON fix, re-enable SIMD on aarch64
+
 * Mon Sep 19 2016 Peter Robinson <pbrobinson@fedoraproject.org> 1.5.0-3
 - Temporarily disable SIMD on aarch64 until upstream #97 is fixed
 - Add NEON fix for ARMv7