From patchwork Mon Apr 24 21:59:48 2017 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: [Mesa-dev,1/4] vc4: Only build the NEON code on arm32. From: Eric Anholt X-Patchwork-Id: 152503 Message-Id: <20170424215951.21544-1-eric@anholt.net> To: mesa-dev@lists.freedesktop.org Cc: mesa-stable@lists.freedesktop.org Date: Mon, 24 Apr 2017 14:59:48 -0700 NEON is sufficiently different on arm64 that we can't just reuse this code. Disable it on arm64 for now. v2: Use PIPE_ARCH_ARM instead, as __ARM_ARCH may be 8 for a 32-bit build for a v8 CPU. Signed-off-by: Eric Anholt <eric@anholt.net> Cc: <mesa-stable@lists.freedesktop.org> --- src/gallium/drivers/vc4/vc4_tiling_lt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/gallium/drivers/vc4/vc4_tiling_lt.c b/src/gallium/drivers/vc4/vc4_tiling_lt.c index c9cbc65e2dbc..f37a92e9390e 100644 --- a/src/gallium/drivers/vc4/vc4_tiling_lt.c +++ b/src/gallium/drivers/vc4/vc4_tiling_lt.c @@ -61,7 +61,7 @@ static void vc4_load_utile(void *cpu, void *gpu, uint32_t cpu_stride, uint32_t cpp) { uint32_t gpu_stride = vc4_utile_stride(cpp); -#if defined(VC4_BUILD_NEON) && defined(__ARM_ARCH) +#if defined(VC4_BUILD_NEON) && defined(PIPE_ARCH_ARM) if (gpu_stride == 8) { __asm__ volatile ( /* Load from the GPU in one shot, no interleave, to @@ -118,7 +118,7 @@ vc4_store_utile(void *gpu, void *cpu, uint32_t cpu_stride, uint32_t cpp) { uint32_t gpu_stride = vc4_utile_stride(cpp); -#if defined(VC4_BUILD_NEON) && defined(__ARM_ARCH) +#if defined(VC4_BUILD_NEON) && defined(PIPE_ARCH_ARM) if (gpu_stride == 8) { __asm__ volatile ( /* Load each 8-byte line from cpu-side source, From patchwork Mon Apr 24 21:59:49 2017 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: [Mesa-dev,2/4] gallium: Enable ARM NEON CPU detection. 
From: Eric Anholt X-Patchwork-Id: 152500 Message-Id: <20170424215951.21544-2-eric@anholt.net> To: mesa-dev@lists.freedesktop.org Date: Mon, 24 Apr 2017 14:59:49 -0700 I wrote this code with reference to pixman, though I've only decided to cover Linux (what I'm testing) and Android (seems obvious enough). Linux has getauxval() as a cleaner interface to the /proc entry, but it's more glibc-specific and I didn't want to add detection for that. This will be used to enable NEON at runtime on ARMv6 builds of vc4. v2: Actually initialize the temp vars in the Android path (noticed by daniels) v3: Actually pull in the cpufeatures library (change by robher). Use O_CLOEXEC. Break out of the loop when we find our feature. v4: Drop VFP code, which was confused about what it was detecting and not actually used yet. --- src/gallium/auxiliary/util/u_cpu_detect.c | 43 +++++++++++++++++++++++++++++++ src/gallium/auxiliary/util/u_cpu_detect.h | 1 + 3 files changed, 46 insertions(+) diff --git a/src/gallium/auxiliary/util/u_cpu_detect.c b/src/gallium/auxiliary/util/u_cpu_detect.c index 845fc6b34d5c..76115bf8d55d 100644 --- a/src/gallium/auxiliary/util/u_cpu_detect.c +++ b/src/gallium/auxiliary/util/u_cpu_detect.c @@ -59,12 +59,18 @@ #if defined(PIPE_OS_LINUX) #include +#include +#include #endif #ifdef PIPE_OS_UNIX #include #endif +#if defined(PIPE_OS_ANDROID) +#include +#endif + #if defined(PIPE_OS_WINDOWS) #include #if defined(PIPE_CC_MSVC) @@ -294,6 +300,38 @@ PIPE_ALIGN_STACK static inline boolean sse2_has_daz(void) #endif /* X86 or X86_64 */ +#if defined(PIPE_ARCH_ARM) +static void +check_os_arm_support(void) +{ +#if defined(PIPE_OS_ANDROID) + AndroidCpuFamily cpu_family = android_getCpuFamily(); + uint64_t cpu_features = android_getCpuFeatures(); + + if (cpu_family == ANDROID_CPU_FAMILY_ARM) { + if (cpu_features & ANDROID_CPU_ARM_FEATURE_NEON) + util_cpu_caps.has_neon = 1; + } +#elif defined(PIPE_OS_LINUX) + Elf32_auxv_t aux; + int fd; + + fd = open("/proc/self/auxv", O_RDONLY | 
O_CLOEXEC); + if (fd >= 0) { + while (read(fd, &aux, sizeof(Elf32_auxv_t)) == sizeof(Elf32_auxv_t)) { + if (aux.a_type == AT_HWCAP) { + uint32_t hwcap = aux.a_un.a_val; + + util_cpu_caps.has_neon = (hwcap >> 12) & 1; + break; + } + } + close (fd); + } +#endif /* PIPE_OS_LINUX */ +} +#endif /* PIPE_ARCH_ARM */ + void util_cpu_detect(void) { @@ -443,6 +481,10 @@ util_cpu_detect(void) } #endif /* PIPE_ARCH_X86 || PIPE_ARCH_X86_64 */ +#if defined(PIPE_ARCH_ARM) + check_os_arm_support(); +#endif + #if defined(PIPE_ARCH_PPC) check_os_altivec_support(); #endif /* PIPE_ARCH_PPC */ @@ -471,6 +513,7 @@ util_cpu_detect(void) debug_printf("util_cpu_caps.has_3dnow_ext = %u\n", util_cpu_caps.has_3dnow_ext); debug_printf("util_cpu_caps.has_xop = %u\n", util_cpu_caps.has_xop); debug_printf("util_cpu_caps.has_altivec = %u\n", util_cpu_caps.has_altivec); + debug_printf("util_cpu_caps.has_neon = %u\n", util_cpu_caps.has_neon); debug_printf("util_cpu_caps.has_daz = %u\n", util_cpu_caps.has_daz); debug_printf("util_cpu_caps.has_avx512f = %u\n", util_cpu_caps.has_avx512f); debug_printf("util_cpu_caps.has_avx512dq = %u\n", util_cpu_caps.has_avx512dq); diff --git a/src/gallium/auxiliary/util/u_cpu_detect.h b/src/gallium/auxiliary/util/u_cpu_detect.h index 3bd7294f0759..4a34ac4d9a63 100644 --- a/src/gallium/auxiliary/util/u_cpu_detect.h +++ b/src/gallium/auxiliary/util/u_cpu_detect.h @@ -72,6 +72,7 @@ struct util_cpu_caps { unsigned has_xop:1; unsigned has_altivec:1; unsigned has_daz:1; + unsigned has_neon:1; unsigned has_avx512f:1; unsigned has_avx512dq:1; From patchwork Mon Apr 24 21:59:50 2017 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 8bit Subject: [Mesa-dev, 3/4] vc4: Use a wrapper file to set VC4_BUILD_NEON instead of CFLAGS. 
From: Eric Anholt X-Patchwork-Id: 152502 Message-Id: <20170424215951.21544-3-eric@anholt.net> To: mesa-dev@lists.freedesktop.org Date: Mon, 24 Apr 2017 14:59:50 -0700 Android.mk was setting the flag across the entire driver, so we didn't have non-NEON versions getting built. This was going to be a problem with the next commit, when I start auto-detecting NEON support and use the non-NEON version when appropriate. --- Rob: I'm happy to just drop this patch if you'd rather go the other route for the Android build. I do think this makes for a slightly faster and simpler build, due to not having the intermediate lib. src/gallium/drivers/vc4/Makefile.am | 6 ------ src/gallium/drivers/vc4/Makefile.sources | 1 + src/gallium/drivers/vc4/vc4_tiling_lt_neon.c | 30 ++++++++++++++++++++++++++++ 4 files changed, 31 insertions(+), 8 deletions(-) create mode 100644 src/gallium/drivers/vc4/vc4_tiling_lt_neon.c diff --git a/src/gallium/drivers/vc4/Makefile.am b/src/gallium/drivers/vc4/Makefile.am index b361a0c588a8..0ed49b128b2d 100644 --- a/src/gallium/drivers/vc4/Makefile.am +++ b/src/gallium/drivers/vc4/Makefile.am @@ -41,10 +41,4 @@ libvc4_la_SOURCES = $(C_SOURCES) libvc4_la_LIBADD = $(SIM_LIB) $(VC4_LIBS) libvc4_la_LDFLAGS = $(SIM_LDFLAGS) -noinst_LTLIBRARIES += libvc4_neon.la -libvc4_la_LIBADD += libvc4_neon.la - -libvc4_neon_la_SOURCES = vc4_tiling_lt.c -libvc4_neon_la_CFLAGS = $(AM_CFLAGS) -DVC4_BUILD_NEON - EXTRA_DIST = kernel/README diff --git a/src/gallium/drivers/vc4/Makefile.sources b/src/gallium/drivers/vc4/Makefile.sources index 10de34361260..442d7a561782 100644 --- a/src/gallium/drivers/vc4/Makefile.sources +++ b/src/gallium/drivers/vc4/Makefile.sources @@ -56,6 +56,7 @@ C_SOURCES := \ vc4_state.c \ vc4_tiling.c \ vc4_tiling_lt.c \ + vc4_tiling_lt_neon.c \ vc4_tiling.h \ vc4_uniforms.c \ $() diff --git a/src/gallium/drivers/vc4/vc4_tiling_lt_neon.c b/src/gallium/drivers/vc4/vc4_tiling_lt_neon.c new file mode 100644 index 000000000000..7ba66ae4cdf4 --- /dev/null +++ 
b/src/gallium/drivers/vc4/vc4_tiling_lt_neon.c @@ -0,0 +1,30 @@ +/* + * Copyright © 2017 Broadcom + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +/* Wrapper file for building vc4_tiling_lt.c with the "build NEON assembly if + * possible" flag set, since Android.mk doesn't have a way to set CFLAGS for a + * single file. + */ + +#define VC4_BUILD_NEON +#include "vc4_tiling_lt.c" From patchwork Mon Apr 24 21:59:51 2017 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: [Mesa-dev, 4/4] vc4: Use runtime CPU detection for whether NEON is available. 
From: Eric Anholt X-Patchwork-Id: 152501 Message-Id: <20170424215951.21544-4-eric@anholt.net> To: mesa-dev@lists.freedesktop.org Date: Mon, 24 Apr 2017 14:59:51 -0700 This will allow Raspbian's ARMv6 builds to take advantage of the new NEON code, and could prevent problems if vc4 ends up getting used on a v7 CPU without NEON. v2: Drop dead NEON_SUFFIX (noted by Erik Faye-Lund) --- src/gallium/drivers/vc4/vc4_screen.c | 3 +++ src/gallium/drivers/vc4/vc4_tiling.h | 27 +++++++++++++-------------- 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/src/gallium/drivers/vc4/vc4_screen.c b/src/gallium/drivers/vc4/vc4_screen.c index b5b1ced49fd5..ce6a9dbaa6cc 100644 --- a/src/gallium/drivers/vc4/vc4_screen.c +++ b/src/gallium/drivers/vc4/vc4_screen.c @@ -27,6 +27,7 @@ #include "pipe/p_screen.h" #include "pipe/p_state.h" +#include "util/u_cpu_detect.h" #include "util/u_debug.h" #include "util/u_memory.h" #include "util/u_format.h" @@ -628,6 +629,8 @@ vc4_screen_create(int fd) if (!vc4_get_chip_info(screen)) goto fail; + util_cpu_detect(); + slab_create_parent(&screen->transfer_pool, sizeof(struct vc4_transfer), 16); vc4_fence_init(screen); diff --git a/src/gallium/drivers/vc4/vc4_tiling.h b/src/gallium/drivers/vc4/vc4_tiling.h index ba1ad6fb3f7d..3168ec20a606 100644 --- a/src/gallium/drivers/vc4/vc4_tiling.h +++ b/src/gallium/drivers/vc4/vc4_tiling.h @@ -27,6 +27,7 @@ #include #include #include "util/macros.h" +#include "util/u_cpu_detect.h" /** Return the width in pixels of a 64-byte microtile. */ static inline uint32_t @@ -83,23 +84,18 @@ void vc4_store_tiled_image(void *dst, uint32_t dst_stride, uint8_t tiling_format, int cpp, const struct pipe_box *box); -/* If we're building for ARMv7 (Pi 2+), assume it has NEON. For Raspbian we - * should extend this to have some runtime detection of being built for ARMv6 - * on a Pi 2+. 
- */ -#if defined(__ARM_ARCH) && __ARM_ARCH == 7 -#define NEON_SUFFIX(x) x ## _neon -#else -#define NEON_SUFFIX(x) x ## _base -#endif - static inline void vc4_load_lt_image(void *dst, uint32_t dst_stride, void *src, uint32_t src_stride, int cpp, const struct pipe_box *box) { - NEON_SUFFIX(vc4_load_lt_image)(dst, dst_stride, src, src_stride, + if (util_cpu_caps.has_neon) { + vc4_load_lt_image_neon(dst, dst_stride, src, src_stride, cpp, box); + } else { + vc4_load_lt_image_base(dst, dst_stride, src, src_stride, + cpp, box); + } } static inline void @@ -107,10 +103,13 @@ vc4_store_lt_image(void *dst, uint32_t dst_stride, void *src, uint32_t src_stride, int cpp, const struct pipe_box *box) { - NEON_SUFFIX(vc4_store_lt_image)(dst, dst_stride, src, src_stride, + if (util_cpu_caps.has_neon) { + vc4_store_lt_image_neon(dst, dst_stride, src, src_stride, cpp, box); + } else { + vc4_store_lt_image_base(dst, dst_stride, src, src_stride, + cpp, box); + } } -#undef NEON_SUFFIX - #endif /* VC4_TILING_H */