From 1f594d5911a8835e47bd9559275b12a2237a3d53 Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Thu, 1 Apr 2021 10:53:45 +1000 Subject: [PATCH] add missing patch --- cpu-affinity-fixes-20.3.3.patch | 1583 +++++++++++++++++++++++++++++++ 1 file changed, 1583 insertions(+) create mode 100644 cpu-affinity-fixes-20.3.3.patch diff --git a/cpu-affinity-fixes-20.3.3.patch b/cpu-affinity-fixes-20.3.3.patch new file mode 100644 index 0000000..d11f5c4 --- /dev/null +++ b/cpu-affinity-fixes-20.3.3.patch @@ -0,0 +1,1583 @@ +diff --git a/src/amd/compiler/tests/main.cpp b/src/amd/compiler/tests/main.cpp +index cb646e2dd30..eac0a244adf 100644 +--- a/src/amd/compiler/tests/main.cpp ++++ b/src/amd/compiler/tests/main.cpp +@@ -34,6 +34,8 @@ + #include "aco_ir.h" + #include "framework.h" + ++#include "util/u_cpu_detect.h" ++ + static const char *help_message = + "Usage: %s [-h] [-l --list] [--no-check] [TEST [TEST ...]]\n" + "\n" +@@ -227,6 +229,8 @@ int main(int argc, char **argv) + return 99; + } + ++ util_cpu_detect(); ++ + if (do_list) { + for (auto test : tests) + printf("%s\n", test.first.c_str()); +diff --git a/src/compiler/glsl/standalone.cpp b/src/compiler/glsl/standalone.cpp +index ca187001186..2714d8b95ed 100644 +--- a/src/compiler/glsl/standalone.cpp ++++ b/src/compiler/glsl/standalone.cpp +@@ -401,6 +401,8 @@ standalone_compile_shader(const struct standalone_options *_options, + int status = EXIT_SUCCESS; + bool glsl_es = false; + ++ util_cpu_detect(); ++ + options = _options; + + switch (options->glsl_version) { +diff --git a/src/compiler/nir/tests/negative_equal_tests.cpp b/src/compiler/nir/tests/negative_equal_tests.cpp +index f83041a4fbf..76472e48309 100644 +--- a/src/compiler/nir/tests/negative_equal_tests.cpp ++++ b/src/compiler/nir/tests/negative_equal_tests.cpp +@@ -36,6 +36,7 @@ protected: + const_value_negative_equal_test() + { + glsl_type_singleton_init_or_ref(); ++ util_cpu_detect(); + + memset(c1, 0, sizeof(c1)); + memset(c2, 0, sizeof(c2)); +@@ -55,6 +56,7 @@ protected: + alu_srcs_negative_equal_test() + { + glsl_type_singleton_init_or_ref(); ++ util_cpu_detect(); + + static const nir_shader_compiler_options options = { }; + nir_builder_init_simple_shader(&bld, NULL, MESA_SHADER_VERTEX, &options); +diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c +index 165d73d94fc..33269e528fe 100644 +--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c ++++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c +@@ -104,13 +104,13 @@ lp_build_min_simple(struct lp_build_context *bld, + + /* TODO: optimize the constant case */ + +- if (type.floating && util_cpu_caps.has_sse) { ++ if (type.floating && util_get_cpu_caps()->has_sse) { + if (type.width == 32) { + if (type.length == 1) { + intrinsic = "llvm.x86.sse.min.ss"; + intr_size = 128; + } +- else if (type.length <= 4 || !util_cpu_caps.has_avx) { ++ else if (type.length <= 4 || !util_get_cpu_caps()->has_avx) { + intrinsic = "llvm.x86.sse.min.ps"; + intr_size = 128; + } +@@ -119,12 +119,12 @@ lp_build_min_simple(struct lp_build_context *bld, + intr_size = 256; + } + } +- if (type.width == 64 && util_cpu_caps.has_sse2) { ++ if (type.width == 64 && util_get_cpu_caps()->has_sse2) { + if (type.length == 1) { + intrinsic = "llvm.x86.sse2.min.sd"; + intr_size = 128; + } +- else if (type.length == 2 || !util_cpu_caps.has_avx) { ++ else if (type.length == 2 || !util_get_cpu_caps()->has_avx) { + intrinsic = "llvm.x86.sse2.min.pd"; + intr_size = 128; + } +@@ -134,7 +134,7 @@ lp_build_min_simple(struct lp_build_context *bld, + } + } + } +- else if (type.floating && util_cpu_caps.has_altivec) { ++ else if (type.floating && util_get_cpu_caps()->has_altivec) { + if (nan_behavior == GALLIVM_NAN_RETURN_NAN || + nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) { + debug_printf("%s: altivec doesn't support nan return nan behavior\n", +@@ -144,7 +144,7 @@ lp_build_min_simple(struct lp_build_context *bld, + intrinsic = "llvm.ppc.altivec.vminfp"; + intr_size = 128; + } +- } else if (util_cpu_caps.has_altivec) { ++ } else if (util_get_cpu_caps()->has_altivec) { + intr_size = 128; + if (type.width == 8) { + if (!type.sign) { +@@ -174,7 +174,7 @@ lp_build_min_simple(struct lp_build_context *bld, + * The sse intrinsics return the second operator in case of nan by + * default so we need to special code to handle those. + */ +- if (util_cpu_caps.has_sse && type.floating && ++ if (util_get_cpu_caps()->has_sse && type.floating && + nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED && + nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN && + nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) { +@@ -274,13 +274,13 @@ lp_build_max_simple(struct lp_build_context *bld, + + /* TODO: optimize the constant case */ + +- if (type.floating && util_cpu_caps.has_sse) { ++ if (type.floating && util_get_cpu_caps()->has_sse) { + if (type.width == 32) { + if (type.length == 1) { + intrinsic = "llvm.x86.sse.max.ss"; + intr_size = 128; + } +- else if (type.length <= 4 || !util_cpu_caps.has_avx) { ++ else if (type.length <= 4 || !util_get_cpu_caps()->has_avx) { + intrinsic = "llvm.x86.sse.max.ps"; + intr_size = 128; + } +@@ -289,12 +289,12 @@ lp_build_max_simple(struct lp_build_context *bld, + intr_size = 256; + } + } +- if (type.width == 64 && util_cpu_caps.has_sse2) { ++ if (type.width == 64 && util_get_cpu_caps()->has_sse2) { + if (type.length == 1) { + intrinsic = "llvm.x86.sse2.max.sd"; + intr_size = 128; + } +- else if (type.length == 2 || !util_cpu_caps.has_avx) { ++ else if (type.length == 2 || !util_get_cpu_caps()->has_avx) { + intrinsic = "llvm.x86.sse2.max.pd"; + intr_size = 128; + } +@@ -304,7 +304,7 @@ lp_build_max_simple(struct lp_build_context *bld, + } + } + } +- else if (type.floating && util_cpu_caps.has_altivec) { ++ else if (type.floating && util_get_cpu_caps()->has_altivec) { + if (nan_behavior == GALLIVM_NAN_RETURN_NAN || + nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) { + debug_printf("%s: altivec doesn't support nan return nan behavior\n", +@@ -314,7 +314,7 @@ lp_build_max_simple(struct lp_build_context *bld, + intrinsic = "llvm.ppc.altivec.vmaxfp"; + intr_size = 128; + } +- } else if (util_cpu_caps.has_altivec) { ++ } else if (util_get_cpu_caps()->has_altivec) { + intr_size = 128; + if (type.width == 8) { + if (!type.sign) { +@@ -338,7 +338,7 @@ lp_build_max_simple(struct lp_build_context *bld, + } + + if (intrinsic) { +- if (util_cpu_caps.has_sse && type.floating && ++ if (util_get_cpu_caps()->has_sse && type.floating && + nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED && + nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN && + nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) { +@@ -472,12 +472,12 @@ lp_build_add(struct lp_build_context *bld, + return lp_build_intrinsic_binary(builder, intrin, bld->vec_type, a, b); + } + if (type.width * type.length == 128) { +- if (util_cpu_caps.has_sse2) { ++ if (util_get_cpu_caps()->has_sse2) { + if (type.width == 8) + intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b"; + if (type.width == 16) + intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w"; +- } else if (util_cpu_caps.has_altivec) { ++ } else if (util_get_cpu_caps()->has_altivec) { + if (type.width == 8) + intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs"; + if (type.width == 16) +@@ -485,7 +485,7 @@ lp_build_add(struct lp_build_context *bld, + } + } + if (type.width * type.length == 256) { +- if (util_cpu_caps.has_avx2) { ++ if (util_get_cpu_caps()->has_avx2) { + if (type.width == 8) + intrinsic = type.sign ? "llvm.x86.avx2.padds.b" : "llvm.x86.avx2.paddus.b"; + if (type.width == 16) +@@ -713,11 +713,11 @@ lp_build_hadd_partial4(struct lp_build_context *bld, + tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0]; + tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0]; + +- if (util_cpu_caps.has_sse3 && bld->type.width == 32 && ++ if (util_get_cpu_caps()->has_sse3 && bld->type.width == 32 && + bld->type.length == 4) { + intrinsic = "llvm.x86.sse3.hadd.ps"; + } +- else if (util_cpu_caps.has_avx && bld->type.width == 32 && ++ else if (util_get_cpu_caps()->has_avx && bld->type.width == 32 && + bld->type.length == 8) { + intrinsic = "llvm.x86.avx.hadd.ps.256"; + } +@@ -796,12 +796,12 @@ lp_build_sub(struct lp_build_context *bld, + return lp_build_intrinsic_binary(builder, intrin, bld->vec_type, a, b); + } + if (type.width * type.length == 128) { +- if (util_cpu_caps.has_sse2) { ++ if (util_get_cpu_caps()->has_sse2) { + if (type.width == 8) + intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b"; + if (type.width == 16) + intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w"; +- } else if (util_cpu_caps.has_altivec) { ++ } else if (util_get_cpu_caps()->has_altivec) { + if (type.width == 8) + intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs"; + if (type.width == 16) +@@ -809,7 +809,7 @@ lp_build_sub(struct lp_build_context *bld, + } + } + if (type.width * type.length == 256) { +- if (util_cpu_caps.has_avx2) { ++ if (util_get_cpu_caps()->has_avx2) { + if (type.width == 8) + intrinsic = type.sign ? "llvm.x86.avx2.psubs.b" : "llvm.x86.avx2.psubus.b"; + if (type.width == 16) +@@ -1078,8 +1078,8 @@ lp_build_mul_32_lohi_cpu(struct lp_build_context *bld, + */ + if (LLVM_VERSION_MAJOR < 7 && + (bld->type.length == 4 || bld->type.length == 8) && +- ((util_cpu_caps.has_sse2 && (bld->type.sign == 0)) || +- util_cpu_caps.has_sse4_1)) { ++ ((util_get_cpu_caps()->has_sse2 && (bld->type.sign == 0)) || ++ util_get_cpu_caps()->has_sse4_1)) { + const char *intrinsic = NULL; + LLVMValueRef aeven, aodd, beven, bodd, muleven, mulodd; + LLVMValueRef shuf[LP_MAX_VECTOR_WIDTH / 32], shuf_vec; +@@ -1096,7 +1096,7 @@ lp_build_mul_32_lohi_cpu(struct lp_build_context *bld, + aodd = LLVMBuildShuffleVector(builder, aeven, bld->undef, shuf_vec, ""); + bodd = LLVMBuildShuffleVector(builder, beven, bld->undef, shuf_vec, ""); + +- if (util_cpu_caps.has_avx2 && bld->type.length == 8) { ++ if (util_get_cpu_caps()->has_avx2 && bld->type.length == 8) { + if (bld->type.sign) { + intrinsic = "llvm.x86.avx2.pmul.dq"; + } else { +@@ -1331,8 +1331,8 @@ lp_build_div(struct lp_build_context *bld, + + /* fast rcp is disabled (just uses div), so makes no sense to try that */ + if(FALSE && +- ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) || +- (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) && ++ ((util_get_cpu_caps()->has_sse && type.width == 32 && type.length == 4) || ++ (util_get_cpu_caps()->has_avx && type.width == 32 && type.length == 8)) && + type.floating) + return lp_build_mul(bld, a, lp_build_rcp(bld, b)); + +@@ -1745,7 +1745,7 @@ lp_build_abs(struct lp_build_context *bld, + return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a); + } + +- if(type.width*type.length == 128 && util_cpu_caps.has_ssse3 && LLVM_VERSION_MAJOR < 6) { ++ if(type.width*type.length == 128 && util_get_cpu_caps()->has_ssse3 && LLVM_VERSION_MAJOR < 6) { + switch(type.width) { + case 8: + return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a); +@@ -1755,7 +1755,7 @@ lp_build_abs(struct lp_build_context *bld, + return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a); + } + } +- else if (type.width*type.length == 256 && util_cpu_caps.has_avx2 && LLVM_VERSION_MAJOR < 6) { ++ else if (type.width*type.length == 256 && util_get_cpu_caps()->has_avx2 && LLVM_VERSION_MAJOR < 6) { + switch(type.width) { + case 8: + return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.b", vec_type, a); +@@ -1897,15 +1897,15 @@ lp_build_int_to_float(struct lp_build_context *bld, + static boolean + arch_rounding_available(const struct lp_type type) + { +- if ((util_cpu_caps.has_sse4_1 && ++ if ((util_get_cpu_caps()->has_sse4_1 && + (type.length == 1 || type.width*type.length == 128)) || +- (util_cpu_caps.has_avx && type.width*type.length == 256) || +- (util_cpu_caps.has_avx512f && type.width*type.length == 512)) ++ (util_get_cpu_caps()->has_avx && type.width*type.length == 256) || ++ (util_get_cpu_caps()->has_avx512f && type.width*type.length == 512)) + return TRUE; +- else if ((util_cpu_caps.has_altivec && ++ else if ((util_get_cpu_caps()->has_altivec && + (type.width == 32 && type.length == 4))) + return TRUE; +- else if (util_cpu_caps.has_neon) ++ else if (util_get_cpu_caps()->has_neon) + return TRUE; + + return FALSE; +@@ -1935,7 +1935,7 @@ lp_build_iround_nearest_sse2(struct lp_build_context *bld, + assert(type.width == 32); + + assert(lp_check_value(type, a)); +- assert(util_cpu_caps.has_sse2); ++ assert(util_get_cpu_caps()->has_sse2); + + /* This is relying on MXCSR rounding mode, which should always be nearest. */ + if (type.length == 1) { +@@ -1961,7 +1961,7 @@ lp_build_iround_nearest_sse2(struct lp_build_context *bld, + } + else { + assert(type.width*type.length == 256); +- assert(util_cpu_caps.has_avx); ++ assert(util_get_cpu_caps()->has_avx); + + intrinsic = "llvm.x86.avx.cvt.ps2dq.256"; + } +@@ -1987,7 +1987,7 @@ lp_build_round_altivec(struct lp_build_context *bld, + assert(type.floating); + + assert(lp_check_value(type, a)); +- assert(util_cpu_caps.has_altivec); ++ assert(util_get_cpu_caps()->has_altivec); + + (void)type; + +@@ -2014,7 +2014,7 @@ lp_build_round_arch(struct lp_build_context *bld, + LLVMValueRef a, + enum lp_build_round_mode mode) + { +- if (util_cpu_caps.has_sse4_1 || util_cpu_caps.has_neon) { ++ if (util_get_cpu_caps()->has_sse4_1 || util_get_cpu_caps()->has_neon) { + LLVMBuilderRef builder = bld->gallivm->builder; + const struct lp_type type = bld->type; + const char *intrinsic_root; +@@ -2042,7 +2042,7 @@ lp_build_round_arch(struct lp_build_context *bld, + lp_format_intrinsic(intrinsic, sizeof intrinsic, intrinsic_root, bld->vec_type); + return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a); + } +- else /* (util_cpu_caps.has_altivec) */ ++ else /* (util_get_cpu_caps()->has_altivec) */ + return lp_build_round_altivec(bld, a, mode); + } + +@@ -2377,9 +2377,9 @@ lp_build_iround(struct lp_build_context *bld, + + assert(lp_check_value(type, a)); + +- if ((util_cpu_caps.has_sse2 && ++ if ((util_get_cpu_caps()->has_sse2 && + ((type.width == 32) && (type.length == 1 || type.length == 4))) || +- (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) { ++ (util_get_cpu_caps()->has_avx && type.width == 32 && type.length == 8)) { + return lp_build_iround_nearest_sse2(bld, a); + } + if (arch_rounding_available(type)) { +@@ -2664,8 +2664,8 @@ lp_build_rcp(struct lp_build_context *bld, + * particular uses that require less workarounds. + */ + +- if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) || +- (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){ ++ if (FALSE && ((util_get_cpu_caps()->has_sse && type.width == 32 && type.length == 4) || ++ (util_get_cpu_caps()->has_avx && type.width == 32 && type.length == 8))){ + const unsigned num_iterations = 0; + LLVMValueRef res; + unsigned i; +@@ -2784,8 +2784,8 @@ lp_build_fast_rsqrt_available(struct lp_type type) + { + assert(type.floating); + +- if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) || +- (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) { ++ if ((util_get_cpu_caps()->has_sse && type.width == 32 && type.length == 4) || ++ (util_get_cpu_caps()->has_avx && type.width == 32 && type.length == 8)) { + return true; + } + return false; +@@ -3694,7 +3694,7 @@ lp_build_is_inf_or_nan(struct gallivm_state *gallivm, + LLVMValueRef + lp_build_fpstate_get(struct gallivm_state *gallivm) + { +- if (util_cpu_caps.has_sse) { ++ if (util_get_cpu_caps()->has_sse) { + LLVMBuilderRef builder = gallivm->builder; + LLVMValueRef mxcsr_ptr = lp_build_alloca( + gallivm, +@@ -3715,7 +3715,7 @@ void + lp_build_fpstate_set_denorms_zero(struct gallivm_state *gallivm, + boolean zero) + { +- if (util_cpu_caps.has_sse) { ++ if (util_get_cpu_caps()->has_sse) { + /* turn on DAZ (64) | FTZ (32768) = 32832 if available */ + int daz_ftz = _MM_FLUSH_ZERO_MASK; + +@@ -3724,7 +3724,7 @@ lp_build_fpstate_set_denorms_zero(struct gallivm_state *gallivm, + LLVMValueRef mxcsr = + LLVMBuildLoad(builder, mxcsr_ptr, "mxcsr"); + +- if (util_cpu_caps.has_daz) { ++ if (util_get_cpu_caps()->has_daz) { + /* Enable denormals are zero mode */ + daz_ftz |= _MM_DENORMALS_ZERO_MASK; + } +@@ -3745,7 +3745,7 @@ void + lp_build_fpstate_set(struct gallivm_state *gallivm, + LLVMValueRef mxcsr_ptr) + { +- if (util_cpu_caps.has_sse) { ++ if (util_get_cpu_caps()->has_sse) { + LLVMBuilderRef builder = gallivm->builder; + mxcsr_ptr = LLVMBuildPointerCast(builder, mxcsr_ptr, + LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), ""); +diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.c b/src/gallium/auxiliary/gallivm/lp_bld_conv.c +index c68b8850473..af445b00c1a 100644 +--- a/src/gallium/auxiliary/gallivm/lp_bld_conv.c ++++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.c +@@ -101,7 +101,7 @@ lp_build_half_to_float(struct gallivm_state *gallivm, + LLVMTypeRef int_vec_type = lp_build_vec_type(gallivm, i32_type); + LLVMValueRef h; + +- if (util_cpu_caps.has_f16c && ++ if (util_get_cpu_caps()->has_f16c && + (src_length == 4 || src_length == 8)) { + if (LLVM_VERSION_MAJOR < 11) { + const char *intrinsic = NULL; +@@ -167,7 +167,7 @@ lp_build_float_to_half(struct gallivm_state *gallivm, + * useless. + */ + +- if (util_cpu_caps.has_f16c && ++ if (util_get_cpu_caps()->has_f16c && + (length == 4 || length == 8)) { + struct lp_type i168_type = lp_type_int_vec(16, 16 * 8); + unsigned mode = 3; /* same as LP_BUILD_ROUND_TRUNCATE */ +@@ -489,7 +489,7 @@ int lp_build_conv_auto(struct gallivm_state *gallivm, + + /* Special case 4x4x32 --> 1x16x8 */ + if (src_type.length == 4 && +- (util_cpu_caps.has_sse2 || util_cpu_caps.has_altivec)) ++ (util_get_cpu_caps()->has_sse2 || util_get_cpu_caps()->has_altivec)) + { + num_dsts = (num_srcs + 3) / 4; + dst_type->length = num_srcs * 4 >= 16 ? 16 : num_srcs * 4; +@@ -500,7 +500,7 @@ int lp_build_conv_auto(struct gallivm_state *gallivm, + + /* Special case 2x8x32 --> 1x16x8 */ + if (src_type.length == 8 && +- util_cpu_caps.has_avx) ++ util_get_cpu_caps()->has_avx) + { + num_dsts = (num_srcs + 1) / 2; + dst_type->length = num_srcs * 8 >= 16 ? 16 : num_srcs * 8; +@@ -597,7 +597,7 @@ lp_build_conv(struct gallivm_state *gallivm, + ((dst_type.length == 16 && 4 * num_dsts == num_srcs) || + (num_dsts == 1 && dst_type.length * num_srcs == 16 && num_srcs != 3)) && + +- (util_cpu_caps.has_sse2 || util_cpu_caps.has_altivec)) ++ (util_get_cpu_caps()->has_sse2 || util_get_cpu_caps()->has_altivec)) + { + struct lp_build_context bld; + struct lp_type int16_type, int32_type; +@@ -710,7 +710,7 @@ lp_build_conv(struct gallivm_state *gallivm, + ((dst_type.length == 16 && 2 * num_dsts == num_srcs) || + (num_dsts == 1 && dst_type.length * num_srcs == 8)) && + +- util_cpu_caps.has_avx) { ++ util_get_cpu_caps()->has_avx) { + + struct lp_build_context bld; + struct lp_type int16_type, int32_type; +diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_s3tc.c b/src/gallium/auxiliary/gallivm/lp_bld_format_s3tc.c +index 174857e06d9..e17c7881e7d 100644 +--- a/src/gallium/auxiliary/gallivm/lp_bld_format_s3tc.c ++++ b/src/gallium/auxiliary/gallivm/lp_bld_format_s3tc.c +@@ -642,8 +642,8 @@ s3tc_dxt1_full_to_rgba_aos(struct gallivm_state *gallivm, + * XXX with sse2 and 16x8 vectors, should use pavgb even when n == 1. + * Much cheaper (but we don't care that much if n == 1). + */ +- if ((util_cpu_caps.has_sse2 && n == 4) || +- (util_cpu_caps.has_avx2 && n == 8)) { ++ if ((util_get_cpu_caps()->has_sse2 && n == 4) || ++ (util_get_cpu_caps()->has_avx2 && n == 8)) { + color2_2 = lp_build_pavgb(&bld8, colors0, colors1); + color2_2 = LLVMBuildBitCast(builder, color2_2, bld32.vec_type, ""); + } +@@ -1350,7 +1350,7 @@ s3tc_decode_block_dxt1(struct gallivm_state *gallivm, + if (is_dxt1_variant) { + LLVMValueRef color23_2, color2_2; + +- if (util_cpu_caps.has_sse2) { ++ if (util_get_cpu_caps()->has_sse2) { + LLVMValueRef intrargs[2]; + intrargs[0] = LLVMBuildBitCast(builder, color01, bld8.vec_type, ""); + /* same interleave as for lerp23 - correct result in 2nd element */ +@@ -1389,7 +1389,7 @@ s3tc_decode_block_dxt1(struct gallivm_state *gallivm, + color23 = lp_build_select(&bld32, sel_mask, color23, color23_2); + } + +- if (util_cpu_caps.has_ssse3) { ++ if (util_get_cpu_caps()->has_ssse3) { + /* + * Use pshufb as mini-lut. (Only doable with intrinsics as the + * final shuffles are non-constant. pshufb is awesome!) +@@ -1689,7 +1689,7 @@ s3tc_decode_block_dxt5(struct gallivm_state *gallivm, + type16.sign = FALSE; + sel_mask = LLVMBuildBitCast(builder, sel_mask, bld8.vec_type, ""); + +- if (!util_cpu_caps.has_ssse3) { ++ if (!util_get_cpu_caps()->has_ssse3) { + LLVMValueRef acodeg, mask1, acode0, acode1; + + /* extraction of the 3 bit values into something more useful is HARD */ +diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c b/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c +index 121452d7596..97deffe1de0 100644 +--- a/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c ++++ b/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c +@@ -90,7 +90,7 @@ uyvy_to_yuv_soa(struct gallivm_state *gallivm, + * per element. Didn't measure performance but cuts shader size + * by quite a bit (less difference if cpu has no sse4.1 support). + */ +- if (util_cpu_caps.has_sse2 && n > 1) { ++ if (util_get_cpu_caps()->has_sse2 && n > 1) { + LLVMValueRef sel, tmp, tmp2; + struct lp_build_context bld32; + +@@ -174,7 +174,7 @@ yuyv_to_yuv_soa(struct gallivm_state *gallivm, + * per element. Didn't measure performance but cuts shader size + * by quite a bit (less difference if cpu has no sse4.1 support). + */ +- if (util_cpu_caps.has_sse2 && n > 1) { ++ if (util_get_cpu_caps()->has_sse2 && n > 1) { + LLVMValueRef sel, tmp; + struct lp_build_context bld32; + +diff --git a/src/gallium/auxiliary/gallivm/lp_bld_gather.c b/src/gallium/auxiliary/gallivm/lp_bld_gather.c +index e991b0dc375..42cc17371a0 100644 +--- a/src/gallium/auxiliary/gallivm/lp_bld_gather.c ++++ b/src/gallium/auxiliary/gallivm/lp_bld_gather.c +@@ -488,7 +488,7 @@ lp_build_gather(struct gallivm_state *gallivm, + * 32bit/64bit fetches you're doing it wrong (this is gather, not + * conversion) and it would be awkward for floats. + */ +- } else if (util_cpu_caps.has_avx2 && !need_expansion && ++ } else if (util_get_cpu_caps()->has_avx2 && !need_expansion && + src_width == 32 && (length == 4 || length == 8)) { + return lp_build_gather_avx2(gallivm, length, src_width, dst_type, + base_ptr, offsets); +@@ -500,7 +500,7 @@ lp_build_gather(struct gallivm_state *gallivm, + * (In general, should be more of a win if the fetch is 256bit wide - + * this is true for the 32bit case above too.) + */ +- } else if (0 && util_cpu_caps.has_avx2 && !need_expansion && ++ } else if (0 && util_get_cpu_caps()->has_avx2 && !need_expansion && + src_width == 64 && (length == 2 || length == 4)) { + return lp_build_gather_avx2(gallivm, length, src_width, dst_type, + base_ptr, offsets); +diff --git a/src/gallium/auxiliary/gallivm/lp_bld_init.c b/src/gallium/auxiliary/gallivm/lp_bld_init.c +index 685ed0e58aa..dd428242cb9 100644 +--- a/src/gallium/auxiliary/gallivm/lp_bld_init.c ++++ b/src/gallium/auxiliary/gallivm/lp_bld_init.c +@@ -433,6 +433,7 @@ lp_build_init(void) + /* For simulating less capable machines */ + #ifdef DEBUG + if (debug_get_bool_option("LP_FORCE_SSE2", FALSE)) { ++ extern struct util_cpu_caps_t util_cpu_caps; + assert(util_cpu_caps.has_sse2); + util_cpu_caps.has_sse3 = 0; + util_cpu_caps.has_ssse3 = 0; +@@ -445,7 +446,7 @@ lp_build_init(void) + } + #endif + +- if (util_cpu_caps.has_avx2 || util_cpu_caps.has_avx) { ++ if (util_get_cpu_caps()->has_avx2 || util_get_cpu_caps()->has_avx) { + lp_native_vector_width = 256; + } else { + /* Leave it at 128, even when no SIMD extensions are available. +@@ -460,16 +461,16 @@ lp_build_init(void) + #if LLVM_VERSION_MAJOR < 4 + if (lp_native_vector_width <= 128) { + /* Hide AVX support, as often LLVM AVX intrinsics are only guarded by +- * "util_cpu_caps.has_avx" predicate, and lack the ++ * "util_get_cpu_caps()->has_avx" predicate, and lack the + * "lp_native_vector_width > 128" predicate. And also to ensure a more + * consistent behavior, allowing one to test SSE2 on AVX machines. + * XXX: should not play games with util_cpu_caps directly as it might + * get used for other things outside llvm too. + */ +- util_cpu_caps.has_avx = 0; +- util_cpu_caps.has_avx2 = 0; +- util_cpu_caps.has_f16c = 0; +- util_cpu_caps.has_fma = 0; ++ util_get_cpu_caps()->has_avx = 0; ++ util_get_cpu_caps()->has_avx2 = 0; ++ util_get_cpu_caps()->has_f16c = 0; ++ util_get_cpu_caps()->has_fma = 0; + } + #endif + +@@ -482,7 +483,7 @@ lp_build_init(void) + * Right now denorms get explicitly disabled (but elsewhere) for x86, + * whereas ppc64 explicitly enables them... + */ +- if (util_cpu_caps.has_altivec) { ++ if (util_get_cpu_caps()->has_altivec) { + unsigned short mask[] = { 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, + 0xFFFF, 0xFFFF, 0xFFFE, 0xFFFF }; + __asm ( +diff --git a/src/gallium/auxiliary/gallivm/lp_bld_logic.c b/src/gallium/auxiliary/gallivm/lp_bld_logic.c +index 315977ae745..3ed3b5a74b1 100644 +--- a/src/gallium/auxiliary/gallivm/lp_bld_logic.c ++++ b/src/gallium/auxiliary/gallivm/lp_bld_logic.c +@@ -196,7 +196,7 @@ lp_build_compare(struct gallivm_state *gallivm, + + if (!type.floating && !type.sign && + type.width * type.length == 128 && +- util_cpu_caps.has_sse2 && ++ util_get_cpu_caps()->has_sse2 && + (func == PIPE_FUNC_LESS || + func == PIPE_FUNC_LEQUAL || + func == PIPE_FUNC_GREATER || +@@ -348,11 +348,11 @@ lp_build_select(struct lp_build_context *bld, + + res = LLVMBuildSelect(builder, mask, a, b, ""); + } +- else if (((util_cpu_caps.has_sse4_1 && ++ else if (((util_get_cpu_caps()->has_sse4_1 && + type.width * type.length == 128) || +- (util_cpu_caps.has_avx && ++ (util_get_cpu_caps()->has_avx && + type.width * type.length == 256 && type.width >= 32) || +- (util_cpu_caps.has_avx2 && ++ (util_get_cpu_caps()->has_avx2 && + type.width * type.length == 256)) && + !LLVMIsConstant(a) && + !LLVMIsConstant(b) && +@@ -379,7 +379,7 @@ lp_build_select(struct lp_build_context *bld, + intrinsic = "llvm.x86.avx.blendv.ps.256"; + arg_type = LLVMVectorType(LLVMFloatTypeInContext(lc), 8); + } else { +- assert(util_cpu_caps.has_avx2); ++ assert(util_get_cpu_caps()->has_avx2); + intrinsic = "llvm.x86.avx2.pblendvb"; + arg_type = LLVMVectorType(LLVMInt8TypeInContext(lc), 32); + } +diff --git a/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp b/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp +index 9b75676a4e2..4f3e696816c 100644 +--- a/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp ++++ b/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp +@@ -400,22 +400,22 @@ lp_build_create_jit_compiler_for_module(LLVMExecutionEngineRef *OutJIT, + * http://llvm.org/PR19429 + * http://llvm.org/PR16721 + */ +- MAttrs.push_back(util_cpu_caps.has_sse ? "+sse" : "-sse" ); +- MAttrs.push_back(util_cpu_caps.has_sse2 ? "+sse2" : "-sse2" ); +- MAttrs.push_back(util_cpu_caps.has_sse3 ? "+sse3" : "-sse3" ); +- MAttrs.push_back(util_cpu_caps.has_ssse3 ? "+ssse3" : "-ssse3" ); +- MAttrs.push_back(util_cpu_caps.has_sse4_1 ? "+sse4.1" : "-sse4.1"); +- MAttrs.push_back(util_cpu_caps.has_sse4_2 ? "+sse4.2" : "-sse4.2"); ++ MAttrs.push_back(util_get_cpu_caps()->has_sse ? "+sse" : "-sse" ); ++ MAttrs.push_back(util_get_cpu_caps()->has_sse2 ? "+sse2" : "-sse2" ); ++ MAttrs.push_back(util_get_cpu_caps()->has_sse3 ? "+sse3" : "-sse3" ); ++ MAttrs.push_back(util_get_cpu_caps()->has_ssse3 ? "+ssse3" : "-ssse3" ); ++ MAttrs.push_back(util_get_cpu_caps()->has_sse4_1 ? "+sse4.1" : "-sse4.1"); ++ MAttrs.push_back(util_get_cpu_caps()->has_sse4_2 ? "+sse4.2" : "-sse4.2"); + /* + * AVX feature is not automatically detected from CPUID by the X86 target + * yet, because the old (yet default) JIT engine is not capable of + * emitting the opcodes. On newer llvm versions it is and at least some + * versions (tested with 3.3) will emit avx opcodes without this anyway. + */ +- MAttrs.push_back(util_cpu_caps.has_avx ? "+avx" : "-avx"); +- MAttrs.push_back(util_cpu_caps.has_f16c ? "+f16c" : "-f16c"); +- MAttrs.push_back(util_cpu_caps.has_fma ? "+fma" : "-fma"); +- MAttrs.push_back(util_cpu_caps.has_avx2 ? "+avx2" : "-avx2"); ++ MAttrs.push_back(util_get_cpu_caps()->has_avx ? "+avx" : "-avx"); ++ MAttrs.push_back(util_get_cpu_caps()->has_f16c ? "+f16c" : "-f16c"); ++ MAttrs.push_back(util_get_cpu_caps()->has_fma ? "+fma" : "-fma"); ++ MAttrs.push_back(util_get_cpu_caps()->has_avx2 ? "+avx2" : "-avx2"); + /* disable avx512 and all subvariants */ + MAttrs.push_back("-avx512cd"); + MAttrs.push_back("-avx512er"); +@@ -426,7 +426,7 @@ lp_build_create_jit_compiler_for_module(LLVMExecutionEngineRef *OutJIT, + MAttrs.push_back("-avx512vl"); + #endif + #if defined(PIPE_ARCH_ARM) +- if (!util_cpu_caps.has_neon) { ++ if (!util_get_cpu_caps()->has_neon) { + MAttrs.push_back("-neon"); + MAttrs.push_back("-crypto"); + MAttrs.push_back("-vfp2"); +@@ -434,7 +434,7 @@ lp_build_create_jit_compiler_for_module(LLVMExecutionEngineRef *OutJIT, + #endif + + #if defined(PIPE_ARCH_PPC) +- MAttrs.push_back(util_cpu_caps.has_altivec ? "+altivec" : "-altivec"); ++ MAttrs.push_back(util_get_cpu_caps()->has_altivec ? "+altivec" : "-altivec"); + #if (LLVM_VERSION_MAJOR < 4) + /* + * Make sure VSX instructions are disabled +@@ -444,7 +444,7 @@ lp_build_create_jit_compiler_for_module(LLVMExecutionEngineRef *OutJIT, + * https://llvm.org/bugs/show_bug.cgi?id=33531 (fixed in 4.0) + * https://llvm.org/bugs/show_bug.cgi?id=34647 (llc performance on certain unusual shader IR; intro'd in 4.0, pending as of 5.0) + */ +- if (util_cpu_caps.has_altivec) { ++ if (util_get_cpu_caps()->has_altivec) { + MAttrs.push_back("-vsx"); + } + #else +@@ -458,8 +458,8 @@ lp_build_create_jit_compiler_for_module(LLVMExecutionEngineRef *OutJIT, + * Make sure VSX instructions are ENABLED (if supported), unless + * VSX instructions are explicitly enabled/disabled via GALLIVM_VSX=1 or 0. + */ +- if (util_cpu_caps.has_altivec) { +- MAttrs.push_back(util_cpu_caps.has_vsx ? "+vsx" : "-vsx"); ++ if (util_get_cpu_caps()->has_altivec) { ++ MAttrs.push_back(util_get_cpu_caps()->has_vsx ? "+vsx" : "-vsx"); + } + #endif + #endif +diff --git a/src/gallium/auxiliary/gallivm/lp_bld_pack.c b/src/gallium/auxiliary/gallivm/lp_bld_pack.c +index e1f652a9342..76e57c52f80 100644 +--- a/src/gallium/auxiliary/gallivm/lp_bld_pack.c ++++ b/src/gallium/auxiliary/gallivm/lp_bld_pack.c +@@ -322,7 +322,7 @@ lp_build_interleave2(struct gallivm_state *gallivm, + { + LLVMValueRef shuffle; + +- if (type.length == 2 && type.width == 128 && util_cpu_caps.has_avx) { ++ if (type.length == 2 && type.width == 128 && util_get_cpu_caps()->has_avx) { + /* + * XXX: This is a workaround for llvm code generation deficiency. Strangely + * enough, while this needs vinsertf128/vextractf128 instructions (hence +@@ -484,7 +484,7 @@ lp_build_unpack2_native(struct gallivm_state *gallivm, + + /* Interleave bits */ + #if UTIL_ARCH_LITTLE_ENDIAN +- if (src_type.length * src_type.width == 256 && util_cpu_caps.has_avx2) { ++ if (src_type.length * src_type.width == 256 && util_get_cpu_caps()->has_avx2) { + *dst_lo = lp_build_interleave2_half(gallivm, src_type, src, msb, 0); + *dst_hi = lp_build_interleave2_half(gallivm, src_type, src, msb, 1); + } else { +@@ -585,22 +585,22 @@ lp_build_pack2(struct gallivm_state *gallivm, + assert(src_type.length * 2 == dst_type.length); + + /* Check for special cases first */ +- if ((util_cpu_caps.has_sse2 || util_cpu_caps.has_altivec) && ++ if ((util_get_cpu_caps()->has_sse2 || util_get_cpu_caps()->has_altivec) && + src_type.width * src_type.length >= 128) { + const char *intrinsic = NULL; + boolean swap_intrinsic_operands = FALSE; + + switch(src_type.width) { + case 32: +- if (util_cpu_caps.has_sse2) { ++ if (util_get_cpu_caps()->has_sse2) { + if (dst_type.sign) { + intrinsic = "llvm.x86.sse2.packssdw.128"; + } else { +- if (util_cpu_caps.has_sse4_1) { ++ if (util_get_cpu_caps()->has_sse4_1) { + intrinsic = "llvm.x86.sse41.packusdw"; + } + } +- } else if (util_cpu_caps.has_altivec) { ++ } else if (util_get_cpu_caps()->has_altivec) { + if (dst_type.sign) { + intrinsic = "llvm.ppc.altivec.vpkswss"; + } else { +@@ -613,18 +613,18 @@ lp_build_pack2(struct gallivm_state *gallivm, + break; + case 16: + if (dst_type.sign) { +- if (util_cpu_caps.has_sse2) { ++ if (util_get_cpu_caps()->has_sse2) { + intrinsic = "llvm.x86.sse2.packsswb.128"; +- } else if (util_cpu_caps.has_altivec) { ++ } else if (util_get_cpu_caps()->has_altivec) { + intrinsic = "llvm.ppc.altivec.vpkshss"; + #if UTIL_ARCH_LITTLE_ENDIAN + swap_intrinsic_operands = TRUE; + #endif + } + } else { +- if (util_cpu_caps.has_sse2) { ++ if (util_get_cpu_caps()->has_sse2) { + intrinsic = "llvm.x86.sse2.packuswb.128"; +- } else if (util_cpu_caps.has_altivec) { ++ } else if (util_get_cpu_caps()->has_altivec) { + intrinsic = "llvm.ppc.altivec.vpkshus"; + #if UTIL_ARCH_LITTLE_ENDIAN + swap_intrinsic_operands = TRUE; +@@ -740,7 +740,7 @@ lp_build_pack2_native(struct gallivm_state *gallivm, + + /* At this point only have special case for avx2 */ + if (src_type.length * src_type.width == 256 && +- util_cpu_caps.has_avx2) { ++ util_get_cpu_caps()->has_avx2) { + switch(src_type.width) { + case 32: + if (dst_type.sign) { +@@ -793,7 +793,7 @@ lp_build_packs2(struct gallivm_state *gallivm, + + /* All X86 SSE non-interleaved pack instructions take signed inputs and + * saturate them, so no need to clamp for those cases. */ +- if(util_cpu_caps.has_sse2 && ++ if(util_get_cpu_caps()->has_sse2 && + src_type.width * src_type.length >= 128 && + src_type.sign && + (src_type.width == 32 || src_type.width == 16)) +diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c b/src/gallium/auxiliary/gallivm/lp_bld_sample.c +index 686abc08620..98dcde912b5 100644 +--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c ++++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c +@@ -1152,7 +1152,7 @@ lp_build_minify(struct lp_build_context *bld, + LLVMValueRef size; + assert(bld->type.sign); + if (lod_scalar || +- (util_cpu_caps.has_avx2 || !util_cpu_caps.has_sse)) { ++ (util_get_cpu_caps()->has_avx2 || !util_get_cpu_caps()->has_sse)) { + size = LLVMBuildLShr(builder, base_size, level, "minify"); + size = lp_build_max(bld, size, bld->one); + } +diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c +index 2b91edd37c7..6e47640e70d 100644 +--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c ++++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c +@@ -3234,7 +3234,7 @@ lp_build_sample_soa_code(struct gallivm_state *gallivm, + * as it appears to be a loss with just AVX) + */ + if (num_quads == 1 || !use_aos || +- (util_cpu_caps.has_avx2 && ++ (util_get_cpu_caps()->has_avx2 && + (bld.num_lods == 1 || + derived_sampler_state.min_img_filter == derived_sampler_state.mag_img_filter))) { + if (use_aos) { +diff --git a/src/gallium/auxiliary/rtasm/rtasm_cpu.c b/src/gallium/auxiliary/rtasm/rtasm_cpu.c +index b1c8b990ef1..03b11f914b4 100644 +--- a/src/gallium/auxiliary/rtasm/rtasm_cpu.c ++++ b/src/gallium/auxiliary/rtasm/rtasm_cpu.c +@@ -35,10 +35,10 @@ + + DEBUG_GET_ONCE_BOOL_OPTION(nosse, "GALLIUM_NOSSE", false); + +-static struct util_cpu_caps *get_cpu_caps(void) ++static const struct util_cpu_caps_t *get_cpu_caps(void) + { + util_cpu_detect(); +- return &util_cpu_caps; ++ return util_get_cpu_caps(); + } + + int rtasm_cpu_has_sse(void) +diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c +index ad687f32853..ddd65fb6a08 100644 +--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c ++++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c +@@ -2152,17 +2152,17 @@ static void x86_init_func_common( struct x86_function *p ) + { + util_cpu_detect(); + p->caps = 0; +- if(util_cpu_caps.has_mmx) ++ if(util_get_cpu_caps()->has_mmx) + p->caps |= X86_MMX; +- if(util_cpu_caps.has_mmx2) ++ if(util_get_cpu_caps()->has_mmx2) + p->caps |= X86_MMX2; +- if(util_cpu_caps.has_sse) ++ if(util_get_cpu_caps()->has_sse) + p->caps |= X86_SSE; +- if(util_cpu_caps.has_sse2) ++ if(util_get_cpu_caps()->has_sse2) + p->caps |= X86_SSE2; +- if(util_cpu_caps.has_sse3) ++ if(util_get_cpu_caps()->has_sse3) + p->caps |= X86_SSE3; +- if(util_cpu_caps.has_sse4_1) ++ if(util_get_cpu_caps()->has_sse4_1) + p->caps |= X86_SSE4_1; + p->csr = p->store; + #if defined(PIPE_ARCH_X86) +diff --git a/src/gallium/auxiliary/util/u_threaded_context.c b/src/gallium/auxiliary/util/u_threaded_context.c +index 1eaff77724e..bf56993db09 100644 +--- a/src/gallium/auxiliary/util/u_threaded_context.c ++++ b/src/gallium/auxiliary/util/u_threaded_context.c +@@ -2071,8 +2071,8 @@ tc_set_context_param(struct pipe_context *_pipe, + if (param == PIPE_CONTEXT_PARAM_PIN_THREADS_TO_L3_CACHE) { + /* Pin the gallium thread as requested. */ + util_set_thread_affinity(tc->queue.threads[0], +- util_cpu_caps.L3_affinity_mask[value], +- NULL, UTIL_MAX_CPUS); ++ util_get_cpu_caps()->L3_affinity_mask[value], ++ NULL, util_get_cpu_caps()->num_cpu_mask_bits); + + /* Execute this immediately (without enqueuing). + * It's required to be thread-safe. +@@ -2720,7 +2720,7 @@ threaded_context_create(struct pipe_context *pipe, + + util_cpu_detect(); + +- if (!debug_get_bool_option("GALLIUM_THREAD", util_cpu_caps.nr_cpus > 1)) ++ if (!debug_get_bool_option("GALLIUM_THREAD", util_get_cpu_caps()->nr_cpus > 1)) + return pipe; + + tc = os_malloc_aligned(sizeof(struct threaded_context), 16); +diff --git a/src/gallium/drivers/llvmpipe/lp_bld_depth.c b/src/gallium/drivers/llvmpipe/lp_bld_depth.c +index 64cf72ae101..913c1bd2462 100644 +--- a/src/gallium/drivers/llvmpipe/lp_bld_depth.c ++++ b/src/gallium/drivers/llvmpipe/lp_bld_depth.c +@@ -435,7 +435,7 @@ lp_build_occlusion_count(struct gallivm_state *gallivm, + assert(type.length <= 16); + assert(type.floating); + +- if(util_cpu_caps.has_sse && type.length == 4) { ++ if(util_get_cpu_caps()->has_sse && type.length == 4) { + const char *movmskintr = "llvm.x86.sse.movmsk.ps"; + const char *popcntintr = "llvm.ctpop.i32"; + LLVMValueRef bits = LLVMBuildBitCast(builder, maskvalue, +@@ -446,7 +446,7 @@ lp_build_occlusion_count(struct gallivm_state *gallivm, + LLVMInt32TypeInContext(context), bits); + count = LLVMBuildZExt(builder, count, LLVMIntTypeInContext(context, 64), ""); + } +- else if(util_cpu_caps.has_avx && type.length == 8) { ++ else if(util_get_cpu_caps()->has_avx && type.length == 8) { + const char *movmskintr = "llvm.x86.avx.movmsk.ps.256"; + const char *popcntintr = "llvm.ctpop.i32"; + LLVMValueRef bits = LLVMBuildBitCast(builder, maskvalue, +diff --git a/src/gallium/drivers/llvmpipe/lp_screen.c b/src/gallium/drivers/llvmpipe/lp_screen.c +index f133bbf8a4d..628a4338c1e 100644 +--- a/src/gallium/drivers/llvmpipe/lp_screen.c ++++ b/src/gallium/drivers/llvmpipe/lp_screen.c +@@ -915,7 +915,7 @@ llvmpipe_create_screen(struct sw_winsys *winsys) + + screen->allow_cl = !!getenv("LP_CL"); + screen->use_tgsi = (LP_DEBUG & DEBUG_TGSI_IR); +- screen->num_threads = util_cpu_caps.nr_cpus > 1 ? util_cpu_caps.nr_cpus : 0; ++ screen->num_threads = util_get_cpu_caps()->nr_cpus > 1 ? util_get_cpu_caps()->nr_cpus : 0; + #ifdef EMBEDDED_DEVICE + screen->num_threads = 0; + #endif +diff --git a/src/gallium/drivers/llvmpipe/lp_test_arit.c b/src/gallium/drivers/llvmpipe/lp_test_arit.c +index 873dcf37fac..725854cc25c 100644 +--- a/src/gallium/drivers/llvmpipe/lp_test_arit.c ++++ b/src/gallium/drivers/llvmpipe/lp_test_arit.c +@@ -382,7 +382,7 @@ flush_denorm_to_zero(float val) + fi_val.f = val; + + #if defined(PIPE_ARCH_SSE) +- if (util_cpu_caps.has_sse) { ++ if (util_get_cpu_caps()->has_sse) { + if ((fi_val.ui & 0x7f800000) == 0) { + fi_val.ui &= 0xff800000; + } +@@ -458,7 +458,7 @@ test_unary(unsigned verbose, FILE *fp, const struct unary_test_t *test, unsigned + continue; + } + +- if (!util_cpu_caps.has_neon && ++ if (!util_get_cpu_caps()->has_neon && + test->ref == &nearbyintf && length == 2 && + ref != roundf(testval)) { + /* FIXME: The generic (non SSE) path in lp_build_iround, which is +diff --git a/src/gallium/drivers/llvmpipe/lp_texture.c b/src/gallium/drivers/llvmpipe/lp_texture.c +index 2bf223d66f9..815736166d5 100644 +--- a/src/gallium/drivers/llvmpipe/lp_texture.c ++++ b/src/gallium/drivers/llvmpipe/lp_texture.c +@@ -85,7 +85,7 @@ llvmpipe_texture_layout(struct llvmpipe_screen *screen, + * of a block for all formats) though this should not be strictly necessary + * neither. In any case it can only affect compressed or 1d textures. + */ +- unsigned mip_align = MAX2(64, util_cpu_caps.cacheline); ++ unsigned mip_align = MAX2(64, util_get_cpu_caps()->cacheline); + + assert(LP_MAX_TEXTURE_2D_LEVELS <= LP_MAX_TEXTURE_LEVELS); + assert(LP_MAX_TEXTURE_3D_LEVELS <= LP_MAX_TEXTURE_LEVELS); +@@ -123,7 +123,7 @@ llvmpipe_texture_layout(struct llvmpipe_screen *screen, + if (util_format_is_compressed(pt->format)) + lpr->row_stride[level] = nblocksx * block_size; + else +- lpr->row_stride[level] = align(nblocksx * block_size, util_cpu_caps.cacheline); ++ lpr->row_stride[level] = align(nblocksx * block_size, util_get_cpu_caps()->cacheline); + + /* if row_stride * height > LP_MAX_TEXTURE_SIZE */ + if ((uint64_t)lpr->row_stride[level] * nblocksy > LP_MAX_TEXTURE_SIZE) { +diff --git a/src/gallium/drivers/swr/swr_loader.cpp b/src/gallium/drivers/swr/swr_loader.cpp +index 97db7ca3e8b..d891b6b14e8 100644 +--- a/src/gallium/drivers/swr/swr_loader.cpp ++++ b/src/gallium/drivers/swr/swr_loader.cpp +@@ -91,7 +91,7 @@ swr_create_screen(struct sw_winsys *winsys) + + util_cpu_detect(); + +- if (util_cpu_caps.has_avx512f && util_cpu_caps.has_avx512er) { ++ if (util_get_cpu_caps()->has_avx512f && util_get_cpu_caps()->has_avx512er) { + swr_print_info("SWR detected KNL instruction support "); + #ifndef HAVE_SWR_KNL + swr_print_info("(skipping: not built).\n"); +@@ -103,7 +103,7 @@ swr_create_screen(struct sw_winsys *winsys) + #endif + } + +- if (util_cpu_caps.has_avx512f && util_cpu_caps.has_avx512bw) { ++ if (util_get_cpu_caps()->has_avx512f && util_get_cpu_caps()->has_avx512bw) { + swr_print_info("SWR detected SKX instruction support "); + #ifndef HAVE_SWR_SKX + swr_print_info("(skipping not built).\n"); +@@ -113,7 +113,7 @@ swr_create_screen(struct sw_winsys *winsys) + #endif + } + +- if (util_cpu_caps.has_avx2) { ++ if (util_get_cpu_caps()->has_avx2) { + swr_print_info("SWR detected AVX2 instruction support "); + #ifndef HAVE_SWR_AVX2 + swr_print_info("(skipping not built).\n"); +@@ -123,7 +123,7 @@ swr_create_screen(struct sw_winsys *winsys) + #endif + } + +- if (util_cpu_caps.has_avx) { ++ if (util_get_cpu_caps()->has_avx) { + swr_print_info("SWR detected AVX instruction support "); + #ifndef HAVE_SWR_AVX + swr_print_info("(skipping not built).\n"); +diff --git a/src/gallium/drivers/vc4/vc4_tiling.h b/src/gallium/drivers/vc4/vc4_tiling.h +index 66767e7f1f8..5afe32939a8 100644 +--- a/src/gallium/drivers/vc4/vc4_tiling.h ++++ b/src/gallium/drivers/vc4/vc4_tiling.h +@@ -90,7 +90,7 @@ vc4_load_lt_image(void *dst, uint32_t dst_stride, + int cpp, const struct pipe_box *box) + { + #ifdef USE_ARM_ASM +- if (util_cpu_caps.has_neon) { ++ if (util_get_cpu_caps()->has_neon) { + vc4_load_lt_image_neon(dst, dst_stride, src, src_stride, + cpp, box); + return; +@@ -106,7 +106,7 @@ vc4_store_lt_image(void *dst, uint32_t dst_stride, + int cpp, const struct pipe_box *box) + { + #ifdef USE_ARM_ASM +- if (util_cpu_caps.has_neon) { ++ if (util_get_cpu_caps()->has_neon) { + vc4_store_lt_image_neon(dst, dst_stride, src, src_stride, + cpp, box); + return; +diff --git a/src/gallium/tests/unit/translate_test.c b/src/gallium/tests/unit/translate_test.c +index 4d9c4e27ebf..782f16e7f78 100644 +--- a/src/gallium/tests/unit/translate_test.c ++++ b/src/gallium/tests/unit/translate_test.c +@@ -50,6 +50,7 @@ int main(int argc, char** argv) + { + struct translate *(*create_fn)(const struct translate_key *key) = 0; + ++ extern struct util_cpu_caps_t util_cpu_caps; + struct translate_key key; + unsigned output_format; + unsigned input_format; +@@ -87,7 +88,7 @@ int main(int argc, char** argv) + } + else if (!strcmp(argv[1], "sse")) + { +- if(!util_cpu_caps.has_sse || !rtasm_cpu_has_sse()) ++ if(!util_get_cpu_caps()->has_sse || !rtasm_cpu_has_sse()) + { + printf("Error: CPU doesn't support SSE (test with qemu)\n"); + return 2; +@@ -99,7 +100,7 @@ int main(int argc, char** argv) + } + else if (!strcmp(argv[1], "sse2")) + { +- if(!util_cpu_caps.has_sse2 || !rtasm_cpu_has_sse()) ++ if(!util_get_cpu_caps()->has_sse2 || !rtasm_cpu_has_sse()) + { + printf("Error: CPU doesn't support SSE2 (test with qemu)\n"); + return 2; +@@ -110,7 +111,7 @@ int main(int argc, char** argv) + } + else if (!strcmp(argv[1], "sse3")) + { +- if(!util_cpu_caps.has_sse3 || !rtasm_cpu_has_sse()) ++ if(!util_get_cpu_caps()->has_sse3 || !rtasm_cpu_has_sse()) + { + printf("Error: CPU doesn't support SSE3 (test with qemu)\n"); + return 2; +@@ -120,7 +121,7 @@ int main(int argc, char** argv) + } + else if (!strcmp(argv[1], "sse4.1")) + { +- if(!util_cpu_caps.has_sse4_1 || !rtasm_cpu_has_sse()) ++ if(!util_get_cpu_caps()->has_sse4_1 || !rtasm_cpu_has_sse()) + { + printf("Error: CPU doesn't support SSE4.1 (test with qemu)\n"); + return 2; +diff --git a/src/gallium/tests/unit/u_half_test.c b/src/gallium/tests/unit/u_half_test.c +index 7f2eba9382b..4474cfb82b0 100644 +--- a/src/gallium/tests/unit/u_half_test.c ++++ b/src/gallium/tests/unit/u_half_test.c +@@ -36,13 +36,14 @@ test(void) + int + main(int argc, char **argv) + { +- assert(!util_cpu_caps.has_f16c); ++ util_cpu_detect(); + test(); + +- /* Test f16c. */ +- util_cpu_detect(); +- if (util_cpu_caps.has_f16c) ++ /* Test non-f16c. */ ++ if (util_get_cpu_caps()->has_f16c) { ++ ((struct util_cpu_caps_t *)util_get_cpu_caps())->has_f16c = false; + test(); ++ } + + printf("Success!\n"); + return 0; +diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c +index 8a0aedfed64..a18362ce6ea 100644 +--- a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c ++++ b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c +@@ -312,8 +312,8 @@ static void amdgpu_pin_threads_to_L3_cache(struct radeon_winsys *rws, + struct amdgpu_winsys *ws = amdgpu_winsys(rws); + + util_set_thread_affinity(ws->cs_queue.threads[0], +- util_cpu_caps.L3_affinity_mask[cache], +- NULL, UTIL_MAX_CPUS); ++ util_get_cpu_caps()->L3_affinity_mask[cache], ++ NULL, util_get_cpu_caps()->num_cpu_mask_bits); + } + + static uint32_t kms_handle_hash(const void *key) +diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c +index f0e1b9f7df3..4430ce50466 100644 +--- a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c ++++ b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c +@@ -801,8 +801,8 @@ static void radeon_pin_threads_to_L3_cache(struct radeon_winsys *ws, + + if (util_queue_is_initialized(&rws->cs_queue)) { + util_set_thread_affinity(rws->cs_queue.threads[0], +- util_cpu_caps.L3_affinity_mask[cache], +- NULL, UTIL_MAX_CPUS); ++ util_get_cpu_caps()->L3_affinity_mask[cache], ++ NULL, util_get_cpu_caps()->num_cpu_mask_bits); + } + } + +diff --git a/src/mesa/main/glthread.c b/src/mesa/main/glthread.c +index eb8eb30cabc..c9dfef541fc 100644 +--- a/src/mesa/main/glthread.c ++++ b/src/mesa/main/glthread.c +@@ -199,19 +199,20 @@ _mesa_glthread_flush_batch(struct gl_context *ctx) + /* Pin threads regularly to the same Zen CCX that the main thread is + * running on. The main thread can move between CCXs. + */ +- if (util_cpu_caps.nr_cpus != util_cpu_caps.cores_per_L3 && ++ if (util_get_cpu_caps()->nr_cpus != util_get_cpu_caps()->cores_per_L3 && + /* driver support */ + ctx->Driver.PinDriverToL3Cache && + ++glthread->pin_thread_counter % 128 == 0) { + int cpu = util_get_current_cpu(); + + if (cpu >= 0) { +- unsigned L3_cache = util_cpu_caps.cpu_to_L3[cpu]; +- +- util_set_thread_affinity(glthread->queue.threads[0], +- util_cpu_caps.L3_affinity_mask[L3_cache], +- NULL, UTIL_MAX_CPUS); +- ctx->Driver.PinDriverToL3Cache(ctx, L3_cache); ++ uint16_t L3_cache = util_get_cpu_caps()->cpu_to_L3[cpu]; ++ if (L3_cache != U_CPU_INVALID_L3) { ++ util_set_thread_affinity(glthread->queue.threads[0], ++ util_get_cpu_caps()->L3_affinity_mask[L3_cache], ++ NULL, util_get_cpu_caps()->num_cpu_mask_bits); ++ ctx->Driver.PinDriverToL3Cache(ctx, L3_cache); ++ } + } + } + +diff --git a/src/mesa/state_tracker/st_context.c b/src/mesa/state_tracker/st_context.c +index 40364296664..f27fa7ff29c 100644 +--- a/src/mesa/state_tracker/st_context.c ++++ b/src/mesa/state_tracker/st_context.c +@@ -815,6 +815,10 @@ st_create_context_priv(struct gl_context *ctx, struct pipe_context *pipe, + !st->lower_ucp; + st->shader_has_one_variant[MESA_SHADER_COMPUTE] = st->has_shareable_shaders; + ++ if (util_get_cpu_caps()->cores_per_L3 == util_get_cpu_caps()->nr_cpus || ++ !st->pipe->set_context_param) ++ st->pin_thread_counter = ST_L3_PINNING_DISABLED; ++ + st->bitmap.cache.empty = true; + + if (ctx->Const.ForceGLNamesReuse && ctx->Shared->RefCount == 1) { +diff --git a/src/mesa/state_tracker/st_context.h b/src/mesa/state_tracker/st_context.h +index b1fda06ff3e..9ab6969de62 100644 +--- a/src/mesa/state_tracker/st_context.h ++++ b/src/mesa/state_tracker/st_context.h +@@ -55,6 +55,7 @@ struct st_program; + struct st_perf_monitor_group; + struct u_upload_mgr; + ++#define ST_L3_PINNING_DISABLED 0xffffffff + + struct st_bitmap_cache + { +@@ -130,6 +131,9 @@ struct st_context + struct draw_stage *feedback_stage; /**< For GL_FEEDBACK rendermode */ + struct draw_stage *selection_stage; /**< For GL_SELECT rendermode */ + struct draw_stage *rastpos_stage; /**< For glRasterPos */ ++ ++ unsigned pin_thread_counter; /* for L3 thread pinning on AMD Zen */ ++ + GLboolean clamp_frag_color_in_shader; + GLboolean clamp_vert_color_in_shader; + boolean clamp_frag_depth_in_shader; +@@ -235,8 +239,6 @@ struct st_context + /** This masks out unused shader resources. Only valid in draw calls. */ + uint64_t active_states; + +- unsigned pin_thread_counter; /* for L3 thread pinning on AMD Zen */ +- + /* If true, further analysis of states is required to know if something + * has changed. Used mainly for shaders. + */ +diff --git a/src/mesa/state_tracker/st_draw.c b/src/mesa/state_tracker/st_draw.c +index 996d985510c..159d7017b07 100644 +--- a/src/mesa/state_tracker/st_draw.c ++++ b/src/mesa/state_tracker/st_draw.c +@@ -124,26 +124,26 @@ prepare_draw(struct st_context *st, struct gl_context *ctx) + st_validate_state(st, ST_PIPELINE_RENDER); + } + +- struct pipe_context *pipe = st->pipe; +- + /* Pin threads regularly to the same Zen CCX that the main thread is + * running on. The main thread can move between CCXs. + */ +- if (unlikely(/* AMD Zen */ +- util_cpu_caps.nr_cpus != util_cpu_caps.cores_per_L3 && ++ if (unlikely(st->pin_thread_counter != ST_L3_PINNING_DISABLED && + /* no glthread */ + ctx->CurrentClientDispatch != ctx->MarshalExec && +- /* driver support */ +- pipe->set_context_param && + /* do it occasionally */ + ++st->pin_thread_counter % 512 == 0)) { ++ st->pin_thread_counter = 0; ++ + int cpu = util_get_current_cpu(); + if (cpu >= 0) { +- unsigned L3_cache = util_cpu_caps.cpu_to_L3[cpu]; +- +- pipe->set_context_param(pipe, +- PIPE_CONTEXT_PARAM_PIN_THREADS_TO_L3_CACHE, +- L3_cache); ++ struct pipe_context *pipe = st->pipe; ++ uint16_t L3_cache = util_get_cpu_caps()->cpu_to_L3[cpu]; ++ ++ if (L3_cache != U_CPU_INVALID_L3) { ++ pipe->set_context_param(pipe, ++ PIPE_CONTEXT_PARAM_PIN_THREADS_TO_L3_CACHE, ++ L3_cache); ++ } + } + } + } +diff --git a/src/util/half_float.h b/src/util/half_float.h +index c52bccf8d1e..8f1a1dbf11d 100644 +--- a/src/util/half_float.h ++++ b/src/util/half_float.h +@@ -59,7 +59,7 @@ static inline uint16_t + _mesa_float_to_half(float val) + { + #if defined(USE_X86_64_ASM) +- if (util_cpu_caps.has_f16c) { ++ if (util_get_cpu_caps()->has_f16c) { + __m128 in = {val}; + __m128i out; + +@@ -75,7 +75,7 @@ static inline float + _mesa_half_to_float(uint16_t val) + { + #if defined(USE_X86_64_ASM) +- if (util_cpu_caps.has_f16c) { ++ if (util_get_cpu_caps()->has_f16c) { + __m128i in = {val}; + __m128 out; + +@@ -90,7 +90,7 @@ static inline uint16_t + _mesa_float_to_float16_rtz(float val) + { + #if defined(USE_X86_64_ASM) +- if (util_cpu_caps.has_f16c) { ++ if (util_get_cpu_caps()->has_f16c) { + __m128 in = {val}; + __m128i out; + +diff --git a/src/util/tests/format/u_format_test.c b/src/util/tests/format/u_format_test.c +index f4a62a5c6a8..e6473c2bf6d 100644 +--- a/src/util/tests/format/u_format_test.c ++++ b/src/util/tests/format/u_format_test.c +@@ -850,6 +850,8 @@ int main(int argc, char **argv) + { + boolean success; + ++ util_cpu_detect(); ++ + success = test_all(); + + return success ? 0 : 1; +diff --git a/src/util/u_cpu_detect.c b/src/util/u_cpu_detect.c +index 025f2f30156..4a4b06e1bc6 100644 +--- a/src/util/u_cpu_detect.c ++++ b/src/util/u_cpu_detect.c +@@ -90,7 +90,7 @@ + DEBUG_GET_ONCE_BOOL_OPTION(dump_cpu, "GALLIUM_DUMP_CPU", false) + + +-struct util_cpu_caps util_cpu_caps; ++struct util_cpu_caps_t util_cpu_caps; + + #if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) + static int has_cpuid(void); +@@ -438,26 +438,22 @@ get_cpu_topology(void) + util_cpu_caps.cores_per_L3 = util_cpu_caps.nr_cpus; + util_cpu_caps.num_L3_caches = 1; + ++ memset(util_cpu_caps.cpu_to_L3, 0xff, sizeof(util_cpu_caps.cpu_to_L3)); ++ + #if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) + /* AMD Zen */ + if (util_cpu_caps.family >= CPU_AMD_ZEN1_ZEN2 && + util_cpu_caps.family < CPU_AMD_LAST) { + uint32_t regs[4]; + +- /* Query the L3 cache count. */ +- cpuid_count(0x8000001D, 3, regs); +- unsigned cache_level = (regs[0] >> 5) & 0x7; +- unsigned cores_per_L3 = ((regs[0] >> 14) & 0xfff) + 1; +- +- if (cache_level != 3 || cores_per_L3 == util_cpu_caps.nr_cpus) +- return; +- + uint32_t saved_mask[UTIL_MAX_CPUS / 32] = {0}; + uint32_t mask[UTIL_MAX_CPUS / 32] = {0}; +- uint32_t allowed_mask[UTIL_MAX_CPUS / 32] = {0}; +- uint32_t apic_id[UTIL_MAX_CPUS]; + bool saved = false; + ++ uint32_t L3_found[UTIL_MAX_CPUS] = {0}; ++ uint32_t num_L3_caches = 0; ++ util_affinity_mask *L3_affinity_masks = NULL; ++ + /* Query APIC IDs from each CPU core. + * + * An APIC ID is a logical ID of the CPU with respect to the cache +@@ -482,41 +478,60 @@ get_cpu_topology(void) + + if (util_set_current_thread_affinity(mask, + !saved ? saved_mask : NULL, +- UTIL_MAX_CPUS)) { ++ util_cpu_caps.num_cpu_mask_bits)) { + saved = true; +- allowed_mask[i / 32] |= cpu_bit; + + /* Query the APIC ID of the current core. */ + cpuid(0x00000001, regs); +- apic_id[i] = regs[1] >> 24; ++ unsigned apic_id = regs[1] >> 24; ++ ++ /* Query the total core count for the CPU */ ++ uint32_t core_count = 1; ++ if (regs[3] & (1 << 28)) ++ core_count = (regs[1] >> 16) & 0xff; ++ ++ core_count = util_next_power_of_two(core_count); ++ ++ /* Query the L3 cache count. */ ++ cpuid_count(0x8000001D, 3, regs); ++ unsigned cache_level = (regs[0] >> 5) & 0x7; ++ unsigned cores_per_L3 = ((regs[0] >> 14) & 0xfff) + 1; ++ ++ if (cache_level != 3) ++ continue; ++ ++ unsigned local_core_id = apic_id & (core_count - 1); ++ unsigned phys_id = (apic_id & ~(core_count - 1)) >> util_logbase2(core_count); ++ unsigned local_l3_cache_index = local_core_id / util_next_power_of_two(cores_per_L3); ++#define L3_ID(p, i) (p << 16 | i << 1 | 1); ++ ++ unsigned l3_id = L3_ID(phys_id, local_l3_cache_index); ++ int idx = -1; ++ for (unsigned c = 0; c < num_L3_caches; c++) { ++ if (L3_found[c] == l3_id) { ++ idx = c; ++ break; ++ } ++ } ++ if (idx == -1) { ++ idx = num_L3_caches; ++ L3_found[num_L3_caches++] = l3_id; ++ L3_affinity_masks = realloc(L3_affinity_masks, sizeof(util_affinity_mask) * num_L3_caches); ++ if (!L3_affinity_masks) ++ return; ++ memset(&L3_affinity_masks[num_L3_caches - 1], 0, sizeof(util_affinity_mask)); ++ } ++ util_cpu_caps.cpu_to_L3[i] = idx; ++ L3_affinity_masks[idx][i / 32] |= cpu_bit; ++ + } + mask[i / 32] = 0; + } + +- if (saved) { +- +- /* We succeeded in using at least one CPU. */ +- util_cpu_caps.num_L3_caches = util_cpu_caps.nr_cpus / cores_per_L3; +- util_cpu_caps.cores_per_L3 = cores_per_L3; +- util_cpu_caps.L3_affinity_mask = calloc(sizeof(util_affinity_mask), +- util_cpu_caps.num_L3_caches); +- +- for (unsigned i = 0; i < util_cpu_caps.nr_cpus && i < UTIL_MAX_CPUS; +- i++) { +- uint32_t cpu_bit = 1u << (i % 32); +- +- if (allowed_mask[i / 32] & cpu_bit) { +- /* Each APIC ID bit represents a topology level, so we need +- * to round up to the next power of two. +- */ +- unsigned L3_index = apic_id[i] / +- util_next_power_of_two(cores_per_L3); +- +- util_cpu_caps.L3_affinity_mask[L3_index][i / 32] |= cpu_bit; +- util_cpu_caps.cpu_to_L3[i] = L3_index; +- } +- } ++ util_cpu_caps.num_L3_caches = num_L3_caches; ++ util_cpu_caps.L3_affinity_mask = L3_affinity_masks; + ++ if (saved) { + if (debug_get_option_dump_cpu()) { + fprintf(stderr, "CPU <-> L3 cache mapping:\n"); + for (unsigned i = 0; i < util_cpu_caps.num_L3_caches; i++) { +@@ -528,7 +543,8 @@ get_cpu_topology(void) + } + + /* Restore the original affinity mask. */ +- util_set_current_thread_affinity(saved_mask, NULL, UTIL_MAX_CPUS); ++ util_set_current_thread_affinity(saved_mask, NULL, ++ util_cpu_caps.num_cpu_mask_bits); + } else { + if (debug_get_option_dump_cpu()) + fprintf(stderr, "Cannot set thread affinity for any thread.\n"); +@@ -547,7 +563,7 @@ util_cpu_detect_once(void) + { + SYSTEM_INFO system_info; + GetSystemInfo(&system_info); +- util_cpu_caps.nr_cpus = system_info.dwNumberOfProcessors; ++ util_cpu_caps.nr_cpus = MAX2(1, system_info.dwNumberOfProcessors); + } + #elif defined(PIPE_OS_UNIX) && defined(_SC_NPROCESSORS_ONLN) + util_cpu_caps.nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); +@@ -569,6 +585,8 @@ util_cpu_detect_once(void) + util_cpu_caps.nr_cpus = 1; + #endif + ++ util_cpu_caps.num_cpu_mask_bits = align(util_cpu_caps.nr_cpus, 32); ++ + /* Make the fallback cacheline size nonzero so that it can be + * safely passed to align(). + */ +diff --git a/src/util/u_cpu_detect.h b/src/util/u_cpu_detect.h +index a76fd912910..1c7239b2ec7 100644 +--- a/src/util/u_cpu_detect.h ++++ b/src/util/u_cpu_detect.h +@@ -55,7 +55,7 @@ enum cpu_family { + + typedef uint32_t util_affinity_mask[UTIL_MAX_CPUS / 32]; + +-struct util_cpu_caps { ++struct util_cpu_caps_t { + int nr_cpus; + enum cpu_family family; + +@@ -98,14 +98,27 @@ struct util_cpu_caps { + + unsigned num_L3_caches; + unsigned cores_per_L3; ++ unsigned num_cpu_mask_bits; + + uint16_t cpu_to_L3[UTIL_MAX_CPUS]; + /* Affinity masks for each L3 cache. */ + util_affinity_mask *L3_affinity_mask; + }; + +-extern struct util_cpu_caps +-util_cpu_caps; ++#define U_CPU_INVALID_L3 0xffff ++ ++static inline const struct util_cpu_caps_t * ++util_get_cpu_caps(void) ++{ ++ extern struct util_cpu_caps_t util_cpu_caps; ++ ++ /* If you hit this assert, it means that something is using the ++ * cpu-caps without having first called util_cpu_detect() ++ */ ++ assert(util_cpu_caps.nr_cpus >= 1); ++ ++ return &util_cpu_caps; ++} + + void util_cpu_detect(void); + +diff --git a/src/util/u_math.c b/src/util/u_math.c +index 9a8a9ecbbde..41e7f599eb0 100644 +--- a/src/util/u_math.c ++++ b/src/util/u_math.c +@@ -92,7 +92,7 @@ util_fpstate_get(void) + unsigned mxcsr = 0; + + #if defined(PIPE_ARCH_SSE) +- if (util_cpu_caps.has_sse) { ++ if (util_get_cpu_caps()->has_sse) { + mxcsr = _mm_getcsr(); + } + #endif +@@ -110,10 +110,10 @@ unsigned + util_fpstate_set_denorms_to_zero(unsigned current_mxcsr) + { + #if defined(PIPE_ARCH_SSE) +- if (util_cpu_caps.has_sse) { ++ if (util_get_cpu_caps()->has_sse) { + /* Enable flush to zero mode */ + current_mxcsr |= _MM_FLUSH_ZERO_MASK; +- if (util_cpu_caps.has_daz) { ++ if (util_get_cpu_caps()->has_daz) { + /* Enable denormals are zero mode */ + current_mxcsr |= _MM_DENORMALS_ZERO_MASK; + } +@@ -132,7 +132,7 @@ void + util_fpstate_set(unsigned mxcsr) + { + #if defined(PIPE_ARCH_SSE) +- if (util_cpu_caps.has_sse) { ++ if (util_get_cpu_caps()->has_sse) { + _mm_setcsr(mxcsr); + } + #endif +diff --git a/src/util/u_queue.c b/src/util/u_queue.c +index b11b297a45c..8f21f0667c6 100644 +--- a/src/util/u_queue.c ++++ b/src/util/u_queue.c +@@ -27,7 +27,7 @@ + #include "u_queue.h" + + #include "c11/threads.h" +- ++#include "util/u_cpu_detect.h" + #include "util/os_time.h" + #include "util/u_string.h" + #include "util/u_thread.h" +@@ -258,7 +258,8 @@ util_queue_thread_func(void *input) + uint32_t mask[UTIL_MAX_CPUS / 32]; + + memset(mask, 0xff, sizeof(mask)); +- util_set_current_thread_affinity(mask, NULL, UTIL_MAX_CPUS); ++ util_set_current_thread_affinity(mask, NULL, ++ util_get_cpu_caps()->num_cpu_mask_bits); + } + + #if defined(__linux__)