Add upstream fix to fix SIMD crash on aarch64 (rhbz #1368569)

2016-09-13 20:33:52 +01:00 · 2016-09-13 20:33:52 +01:00 · 81c58f0640
parent 5af0418032
commit 81c58f0640
2 changed files with 168 additions and 2 deletions
--- a/libjpeg-turbo-aarch64.patch
+++ b/libjpeg-turbo-aarch64.patch
@@ -0,0 +1,161 @@
+From 1120ff29a178ee666504f0067e7c079a6b792296 Mon Sep 17 00:00:00 2001
+From: DRC <information@libjpeg-turbo.org>
+Date: Wed, 13 Jul 2016 12:15:02 -0500
+Subject: [PATCH] Fix AArch64 ABI conformance issue in SIMD code
+
+In the AArch64 ABI, the high (unused) DWORD of a 32-bit argument's
+register is undefined, so it was incorrect to use 64-bit
+instructions to transfer a JDIMENSION argument in the 64-bit NEON SIMD
+functions.  The code worked thus far only because the existing compiler
+optimizers weren't smart enough to do anything else with the register in
+question, so the upper 32 bits happened to be all zeroes.
+
+The latest builds of Clang/LLVM have a smarter optimizer, and under
+certain circumstances, it will attempt to load-combine adjacent 32-bit
+integers from one of the libjpeg structures into a single 64-bit integer
+and pass that 64-bit integer as a 32-bit argument to one of the SIMD
+functions (which is allowed by the ABI, since the upper 32 bits of the
+32-bit argument's register are undefined.)  This caused the
+libjpeg-turbo regression tests to crash.
+
+This patch tries to use the Wn registers whenever possible.  Otherwise,
+it uses a zero-extend instruction to avoid using the upper 32 bits of
+the 64-bit registers, which are not guaranteed to be valid for 32-bit
+arguments.
+
+Based on https://github.com/sebpop/libjpeg-turbo/commit/1fbae13021eb98f6fffdfaf8678fcdb00b0b04d9
+
+Closes #91.  Refer also to android-ndk/ndk#110 and
+https://llvm.org/bugs/show_bug.cgi?id=28393
+---
+ simd/jsimd_arm64_neon.S | 50 ++++++++++++++++++++++++++++++++++++-------------
+ 1 file changed, 37 insertions(+), 13 deletions(-)
+
+diff --git a/simd/jsimd_arm64_neon.S b/simd/jsimd_arm64_neon.S
+index 74d6c76..6c1a959 100644
+--- a/simd/jsimd_arm64_neon.S
+++ b/simd/jsimd_arm64_neon.S
+@@ -210,6 +210,11 @@ asm_function jsimd_idct_islow_neon
+     TMP7            .req x13
+     TMP8            .req x14
+ 
++    /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
++       guarantee that the upper (unused) 32 bits of x3 are valid.  This
++       instruction ensures that those bits are set to zero. */
++    uxtw x3, w3
++
+     sub             sp, sp, #64
+     adr             x15, Ljsimd_idct_islow_neon_consts
+     st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], #32
+@@ -807,6 +812,11 @@ asm_function jsimd_idct_ifast_neon
+     TMP7            .req x13
+     TMP8            .req x14
+ 
++    /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
++       guarantee that the upper (unused) 32 bits of x3 are valid.  This
++       instruction ensures that those bits are set to zero. */
++    uxtw x3, w3
++
+     /* Load and dequantize coefficients into NEON registers
+      * with the following allocation:
+      *       0 1 2 3 | 4 5 6 7
+@@ -1101,6 +1111,11 @@ asm_function jsimd_idct_4x4_neon
+     TMP3            .req x2
+     TMP4            .req x15
+ 
++    /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
++       guarantee that the upper (unused) 32 bits of x3 are valid.  This
++       instruction ensures that those bits are set to zero. */
++    uxtw x3, w3
++
+     /* Save all used NEON registers */
+     sub             sp, sp, 272
+     str             x15, [sp], 16
+@@ -1299,6 +1314,11 @@ asm_function jsimd_idct_2x2_neon
+     TMP1            .req x0
+     TMP2            .req x15
+ 
++    /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
++       guarantee that the upper (unused) 32 bits of x3 are valid.  This
++       instruction ensures that those bits are set to zero. */
++    uxtw x3, w3
++
+     /* vpush           {v8.4h - v15.4h}            ; not available */
+     sub             sp, sp, 208
+     str             x15, [sp], 16
+@@ -1688,11 +1708,11 @@ asm_function jsimd_ycc_\colorid\()_convert_neon
+ .else
+ asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3
+ .endif
+-    OUTPUT_WIDTH    .req x0
++    OUTPUT_WIDTH    .req w0
+     INPUT_BUF       .req x1
+-    INPUT_ROW       .req x2
++    INPUT_ROW       .req w2
+     OUTPUT_BUF      .req x3
+-    NUM_ROWS        .req x4
++    NUM_ROWS        .req w4
+ 
+     INPUT_BUF0      .req x5
+     INPUT_BUF1      .req x6
+@@ -1702,7 +1722,7 @@ asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3
+     Y               .req x8
+     U               .req x9
+     V               .req x10
+-    N               .req x15
++    N               .req w15
+ 
+     sub             sp, sp, 336
+     str             x15, [sp], 16
+@@ -1745,11 +1765,10 @@ asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3
+     cmp             NUM_ROWS, #1
+     b.lt            9f
+ 0:
+-    lsl             x16, INPUT_ROW, #3
+-    ldr             Y, [INPUT_BUF0, x16]
+-    ldr             U, [INPUT_BUF1, x16]
++    ldr             Y, [INPUT_BUF0, INPUT_ROW, uxtw #3]
++    ldr             U, [INPUT_BUF1, INPUT_ROW, uxtw #3]
+     mov             N, OUTPUT_WIDTH
+-    ldr             V, [INPUT_BUF2, x16]
++    ldr             V, [INPUT_BUF2, INPUT_ROW, uxtw #3]
+     add             INPUT_ROW, INPUT_ROW, #1
+     ldr             RGB, [OUTPUT_BUF], #8
+ 
+@@ -2054,8 +2073,8 @@ asm_function jsimd_\colorid\()_ycc_convert_neon_slowld3
+     OUTPUT_WIDTH    .req w0
+     INPUT_BUF       .req x1
+     OUTPUT_BUF      .req x2
+-    OUTPUT_ROW      .req x3
+-    NUM_ROWS        .req x4
++    OUTPUT_ROW      .req w3
++    NUM_ROWS        .req w4
+ 
+     OUTPUT_BUF0     .req x5
+     OUTPUT_BUF1     .req x6
+@@ -2089,10 +2108,10 @@ asm_function jsimd_\colorid\()_ycc_convert_neon_slowld3
+     cmp             NUM_ROWS, #1
+     b.lt            9f
+ 0:
+-    ldr             Y, [OUTPUT_BUF0, OUTPUT_ROW, lsl #3]
+-    ldr             U, [OUTPUT_BUF1, OUTPUT_ROW, lsl #3]
++    ldr             Y, [OUTPUT_BUF0, OUTPUT_ROW, uxtw #3]
++    ldr             U, [OUTPUT_BUF1, OUTPUT_ROW, uxtw #3]
+     mov             N, OUTPUT_WIDTH
+-    ldr             V, [OUTPUT_BUF2, OUTPUT_ROW, lsl #3]
++    ldr             V, [OUTPUT_BUF2, OUTPUT_ROW, uxtw #3]
+     add             OUTPUT_ROW, OUTPUT_ROW, #1
+     ldr             RGB, [INPUT_BUF], #8
+ 
+@@ -2199,6 +2218,11 @@ asm_function jsimd_convsamp_neon
+     TMP8            .req x4
+     TMPDUP          .req w3
+ 
++    /* START_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
++       guarantee that the upper (unused) 32 bits of x1 are valid.  This
++       instruction ensures that those bits are set to zero. */
++    uxtw x1, w1
++
+     mov             TMPDUP, #128
+     ldp             TMP1, TMP2, [SAMPLE_DATA], 16
+     ldp             TMP3, TMP4, [SAMPLE_DATA], 16
--- a/libjpeg-turbo.spec
+++ b/libjpeg-turbo.spec
@@ -1,13 +1,14 @@
 Name:           libjpeg-turbo
 Version:        1.5.0
-Release:        1%{?dist}
-Summary:        A MMX/SSE2 accelerated library for manipulating JPEG image files
+Release:        2%{?dist}
+Summary:        A MMX/SSE2/SIMD accelerated library for manipulating JPEG image files
 License:        IJG
 URL:            http://sourceforge.net/projects/libjpeg-turbo

 Source0:        http://downloads.sourceforge.net/%{name}/%{name}-%{version}.tar.gz
 Patch0:         libjpeg-turbo14-noinst.patch
 Patch1:         libjpeg-turbo-header-files.patch
+Patch2:         libjpeg-turbo-aarch64.patch

 BuildRequires:  autoconf
 BuildRequires:  automake
@@ -71,6 +72,7 @@ manipulate JPEG files using the TurboJPEG library.
 %setup -q
 %patch0 -p1 -b .noinst
 %patch1 -p1 -b .header-files
+%patch2 -p1 -b .aarch64

 %build
 autoreconf -vif
@@ -167,6 +169,9 @@ make test %{?_smp_mflags}
 %{_libdir}/pkgconfig/libturbojpeg.pc

 %changelog
+* Tue Sep 13 2016 Peter Robinson <pbrobinson@fedoraproject.org> - 1.5.0-2
+- Add upstream fix for SIMD crash on aarch64 (rhbz #1368569)
+
 * Tue Jun 21 2016 Petr Hracek <phracek@redhat.com> - 1.5.0-1
 - New upstream release 1.5.0 (#1343786)