Add upstream fix to fix SIMD crash on aarch64 (rhbz #1368569)
This commit is contained in:
parent
5af0418032
commit
81c58f0640
|
@ -0,0 +1,161 @@
|
|||
From 1120ff29a178ee666504f0067e7c079a6b792296 Mon Sep 17 00:00:00 2001
|
||||
From: DRC <information@libjpeg-turbo.org>
|
||||
Date: Wed, 13 Jul 2016 12:15:02 -0500
|
||||
Subject: [PATCH] Fix AArch64 ABI conformance issue in SIMD code
|
||||
|
||||
In the AArch64 ABI, the high (unused) DWORD of a 32-bit argument's
|
||||
register is undefined, so it was incorrect to use 64-bit
|
||||
instructions to transfer a JDIMENSION argument in the 64-bit NEON SIMD
|
||||
functions. The code worked thus far only because the existing compiler
|
||||
optimizers weren't smart enough to do anything else with the register in
|
||||
question, so the upper 32 bits happened to be all zeroes.
|
||||
|
||||
The latest builds of Clang/LLVM have a smarter optimizer, and under
|
||||
certain circumstances, it will attempt to load-combine adjacent 32-bit
|
||||
integers from one of the libjpeg structures into a single 64-bit integer
|
||||
and pass that 64-bit integer as a 32-bit argument to one of the SIMD
|
||||
functions (which is allowed by the ABI, since the upper 32 bits of the
|
||||
32-bit argument's register are undefined.) This caused the
|
||||
libjpeg-turbo regression tests to crash.
|
||||
|
||||
This patch tries to use the Wn registers whenever possible. Otherwise,
|
||||
it uses a zero-extend instruction to avoid using the upper 32 bits of
|
||||
the 64-bit registers, which are not guaranteed to be valid for 32-bit
|
||||
arguments.
|
||||
|
||||
Based on https://github.com/sebpop/libjpeg-turbo/commit/1fbae13021eb98f6fffdfaf8678fcdb00b0b04d9
|
||||
|
||||
Closes #91. Refer also to android-ndk/ndk#110 and
|
||||
https://llvm.org/bugs/show_bug.cgi?id=28393
|
||||
---
|
||||
simd/jsimd_arm64_neon.S | 50 ++++++++++++++++++++++++++++++++++++-------------
|
||||
2 files changed, 47 insertions(+), 13 deletions(-)
|
||||
|
||||
diff --git a/simd/jsimd_arm64_neon.S b/simd/jsimd_arm64_neon.S
|
||||
index 74d6c76..6c1a959 100644
|
||||
--- a/simd/jsimd_arm64_neon.S
|
||||
+++ b/simd/jsimd_arm64_neon.S
|
||||
@@ -210,6 +210,11 @@ asm_function jsimd_idct_islow_neon
|
||||
TMP7 .req x13
|
||||
TMP8 .req x14
|
||||
|
||||
+ /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
|
||||
+ guarantee that the upper (unused) 32 bits of x3 are valid. This
|
||||
+ instruction ensures that those bits are set to zero. */
|
||||
+ uxtw x3, w3
|
||||
+
|
||||
sub sp, sp, #64
|
||||
adr x15, Ljsimd_idct_islow_neon_consts
|
||||
st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], #32
|
||||
@@ -807,6 +812,11 @@ asm_function jsimd_idct_ifast_neon
|
||||
TMP7 .req x13
|
||||
TMP8 .req x14
|
||||
|
||||
+ /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
|
||||
+ guarantee that the upper (unused) 32 bits of x3 are valid. This
|
||||
+ instruction ensures that those bits are set to zero. */
|
||||
+ uxtw x3, w3
|
||||
+
|
||||
/* Load and dequantize coefficients into NEON registers
|
||||
* with the following allocation:
|
||||
* 0 1 2 3 | 4 5 6 7
|
||||
@@ -1101,6 +1111,11 @@ asm_function jsimd_idct_4x4_neon
|
||||
TMP3 .req x2
|
||||
TMP4 .req x15
|
||||
|
||||
+ /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
|
||||
+ guarantee that the upper (unused) 32 bits of x3 are valid. This
|
||||
+ instruction ensures that those bits are set to zero. */
|
||||
+ uxtw x3, w3
|
||||
+
|
||||
/* Save all used NEON registers */
|
||||
sub sp, sp, 272
|
||||
str x15, [sp], 16
|
||||
@@ -1299,6 +1314,11 @@ asm_function jsimd_idct_2x2_neon
|
||||
TMP1 .req x0
|
||||
TMP2 .req x15
|
||||
|
||||
+ /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
|
||||
+ guarantee that the upper (unused) 32 bits of x3 are valid. This
|
||||
+ instruction ensures that those bits are set to zero. */
|
||||
+ uxtw x3, w3
|
||||
+
|
||||
/* vpush {v8.4h - v15.4h} ; not available */
|
||||
sub sp, sp, 208
|
||||
str x15, [sp], 16
|
||||
@@ -1688,11 +1708,11 @@ asm_function jsimd_ycc_\colorid\()_convert_neon
|
||||
.else
|
||||
asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3
|
||||
.endif
|
||||
- OUTPUT_WIDTH .req x0
|
||||
+ OUTPUT_WIDTH .req w0
|
||||
INPUT_BUF .req x1
|
||||
- INPUT_ROW .req x2
|
||||
+ INPUT_ROW .req w2
|
||||
OUTPUT_BUF .req x3
|
||||
- NUM_ROWS .req x4
|
||||
+ NUM_ROWS .req w4
|
||||
|
||||
INPUT_BUF0 .req x5
|
||||
INPUT_BUF1 .req x6
|
||||
@@ -1702,7 +1722,7 @@ asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3
|
||||
Y .req x8
|
||||
U .req x9
|
||||
V .req x10
|
||||
- N .req x15
|
||||
+ N .req w15
|
||||
|
||||
sub sp, sp, 336
|
||||
str x15, [sp], 16
|
||||
@@ -1745,11 +1765,10 @@ asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3
|
||||
cmp NUM_ROWS, #1
|
||||
b.lt 9f
|
||||
0:
|
||||
- lsl x16, INPUT_ROW, #3
|
||||
- ldr Y, [INPUT_BUF0, x16]
|
||||
- ldr U, [INPUT_BUF1, x16]
|
||||
+ ldr Y, [INPUT_BUF0, INPUT_ROW, uxtw #3]
|
||||
+ ldr U, [INPUT_BUF1, INPUT_ROW, uxtw #3]
|
||||
mov N, OUTPUT_WIDTH
|
||||
- ldr V, [INPUT_BUF2, x16]
|
||||
+ ldr V, [INPUT_BUF2, INPUT_ROW, uxtw #3]
|
||||
add INPUT_ROW, INPUT_ROW, #1
|
||||
ldr RGB, [OUTPUT_BUF], #8
|
||||
|
||||
@@ -2054,8 +2073,8 @@ asm_function jsimd_\colorid\()_ycc_convert_neon_slowld3
|
||||
OUTPUT_WIDTH .req w0
|
||||
INPUT_BUF .req x1
|
||||
OUTPUT_BUF .req x2
|
||||
- OUTPUT_ROW .req x3
|
||||
- NUM_ROWS .req x4
|
||||
+ OUTPUT_ROW .req w3
|
||||
+ NUM_ROWS .req w4
|
||||
|
||||
OUTPUT_BUF0 .req x5
|
||||
OUTPUT_BUF1 .req x6
|
||||
@@ -2089,10 +2108,10 @@ asm_function jsimd_\colorid\()_ycc_convert_neon_slowld3
|
||||
cmp NUM_ROWS, #1
|
||||
b.lt 9f
|
||||
0:
|
||||
- ldr Y, [OUTPUT_BUF0, OUTPUT_ROW, lsl #3]
|
||||
- ldr U, [OUTPUT_BUF1, OUTPUT_ROW, lsl #3]
|
||||
+ ldr Y, [OUTPUT_BUF0, OUTPUT_ROW, uxtw #3]
|
||||
+ ldr U, [OUTPUT_BUF1, OUTPUT_ROW, uxtw #3]
|
||||
mov N, OUTPUT_WIDTH
|
||||
- ldr V, [OUTPUT_BUF2, OUTPUT_ROW, lsl #3]
|
||||
+ ldr V, [OUTPUT_BUF2, OUTPUT_ROW, uxtw #3]
|
||||
add OUTPUT_ROW, OUTPUT_ROW, #1
|
||||
ldr RGB, [INPUT_BUF], #8
|
||||
|
||||
@@ -2199,6 +2218,11 @@ asm_function jsimd_convsamp_neon
|
||||
TMP8 .req x4
|
||||
TMPDUP .req w3
|
||||
|
||||
+ /* START_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
|
||||
+ guarantee that the upper (unused) 32 bits of x1 are valid. This
|
||||
+ instruction ensures that those bits are set to zero. */
|
||||
+ uxtw x1, w1
|
||||
+
|
||||
mov TMPDUP, #128
|
||||
ldp TMP1, TMP2, [SAMPLE_DATA], 16
|
||||
ldp TMP3, TMP4, [SAMPLE_DATA], 16
|
|
@ -1,13 +1,14 @@
|
|||
Name: libjpeg-turbo
|
||||
Version: 1.5.0
|
||||
Release: 1%{?dist}
|
||||
Summary: A MMX/SSE2 accelerated library for manipulating JPEG image files
|
||||
Release: 2%{?dist}
|
||||
Summary: A MMX/SSE2/SIMD accelerated library for manipulating JPEG image files
|
||||
License: IJG
|
||||
URL: http://sourceforge.net/projects/libjpeg-turbo
|
||||
|
||||
Source0: http://downloads.sourceforge.net/%{name}/%{name}-%{version}.tar.gz
|
||||
Patch0: libjpeg-turbo14-noinst.patch
|
||||
Patch1: libjpeg-turbo-header-files.patch
|
||||
Patch2: libjpeg-turbo-aarch64.patch
|
||||
|
||||
BuildRequires: autoconf
|
||||
BuildRequires: automake
|
||||
|
@ -71,6 +72,7 @@ manipulate JPEG files using the TurboJPEG library.
|
|||
%setup -q
|
||||
%patch0 -p1 -b .noinst
|
||||
%patch1 -p1 -b .header-files
|
||||
%patch2 -p1 -b .aarch64
|
||||
|
||||
%build
|
||||
autoreconf -vif
|
||||
|
@ -167,6 +169,9 @@ make test %{?_smp_mflags}
|
|||
%{_libdir}/pkgconfig/libturbojpeg.pc
|
||||
|
||||
%changelog
|
||||
* Tue Sep 13 2016 Peter Robinson <pbrobinson@fedoraproject.org> 1.5.0-2
|
||||
- Add upstream fix to fix SIMD crash on aarch64 (rhbz #1368569)
|
||||
|
||||
* Tue Jun 21 2016 Petr Hracek <phracek@redhat.com> - 1.5.0-1
|
||||
- New upstream release 1.5.0 (#1343786)
|
||||
|
||||
|
|
Loading…
Reference in New Issue