diff --git a/0001-Initial-public-Mesa-SWR.patch b/0001-Initial-public-Mesa-SWR.patch
new file mode 100644
index 0000000..528678e
--- /dev/null
+++ b/0001-Initial-public-Mesa-SWR.patch
@@ -0,0 +1,6428 @@
+From 293435cf5955935a6ce43bf59a6d743aad8be6d8 Mon Sep 17 00:00:00 2001
+From: Tim Rowley
+Date: Mon, 19 Oct 2015 13:31:29 -0500
+Subject: [PATCH 1/3] Initial public Mesa+SWR
+
+---
+ README.md | 33 +
+ configure.ac | 54 +
+ src/gallium/Makefile.am | 4 +
+ src/gallium/SConscript | 1 +
+ src/gallium/auxiliary/gallivm/lp_bld_flow.h | 7 +
+ src/gallium/auxiliary/gallivm/lp_bld_init.h | 7 +
+ src/gallium/auxiliary/gallivm/lp_bld_sample.h | 6 +
+ src/gallium/auxiliary/gallivm/lp_bld_tgsi.h | 8 +
+ .../auxiliary/target-helpers/inline_sw_helper.h | 13 +-
+ .../target-helpers/inline_wrapper_sw_helper.h | 2 +-
+ src/gallium/drivers/swr/.clang-format | 64 +
+ src/gallium/drivers/swr/Automake.inc | 28 +
+ src/gallium/drivers/swr/Makefile.am | 82 ++
+ src/gallium/drivers/swr/Makefile.sources | 114 ++
+ src/gallium/drivers/swr/SConscript | 69 +
+ src/gallium/drivers/swr/swr_clear.cpp | 141 ++
+ src/gallium/drivers/swr/swr_context.cpp | 392 ++++++
+ src/gallium/drivers/swr/swr_context.h | 172 +++
+ src/gallium/drivers/swr/swr_context_llvm.h | 124 ++
+ src/gallium/drivers/swr/swr_draw.cpp | 277 ++++
+ src/gallium/drivers/swr/swr_fence.cpp | 141 ++
+ src/gallium/drivers/swr/swr_fence.h | 73 ++
+ src/gallium/drivers/swr/swr_memory.h | 99 ++
+ src/gallium/drivers/swr/swr_public.h | 40 +
+ src/gallium/drivers/swr/swr_query.cpp | 334 +++++
+ src/gallium/drivers/swr/swr_query.h | 48 +
+ src/gallium/drivers/swr/swr_resource.h | 98 ++
+ src/gallium/drivers/swr/swr_scratch.cpp | 116 ++
+ src/gallium/drivers/swr/swr_scratch.h | 63 +
+ src/gallium/drivers/swr/swr_screen.cpp | 666 ++++++++++
+ src/gallium/drivers/swr/swr_screen.h | 52 +
+ src/gallium/drivers/swr/swr_shader.cpp | 608 +++++++++
+ src/gallium/drivers/swr/swr_shader.h | 61 +
+ src/gallium/drivers/swr/swr_state.cpp | 1344 ++++++++++++++++++++
+ src/gallium/drivers/swr/swr_state.h | 240 ++++
+ src/gallium/drivers/swr/swr_tex_sample.cpp | 338 +++++
+ src/gallium/drivers/swr/swr_tex_sample.h | 47 +
+ src/gallium/targets/libgl-xlib/Makefile.am | 5 +
+ src/gallium/targets/libgl-xlib/SConscript | 4 +
+ src/gallium/targets/osmesa/Makefile.am | 6 +
+ 40 files changed, 5979 insertions(+), 2 deletions(-)
+ create mode 100644 README.md
+ create mode 100644 src/gallium/drivers/swr/.clang-format
+ create mode 100644 src/gallium/drivers/swr/Automake.inc
+ create mode 100644 src/gallium/drivers/swr/Makefile.am
+ create mode 100644 src/gallium/drivers/swr/Makefile.sources
+ create mode 100644 src/gallium/drivers/swr/SConscript
+ create mode 100644 src/gallium/drivers/swr/swr_clear.cpp
+ create mode 100644 src/gallium/drivers/swr/swr_context.cpp
+ create mode 100644 src/gallium/drivers/swr/swr_context.h
+ create mode 100644 src/gallium/drivers/swr/swr_context_llvm.h
+ create mode 100644 src/gallium/drivers/swr/swr_draw.cpp
+ create mode 100644 src/gallium/drivers/swr/swr_fence.cpp
+ create mode 100644 src/gallium/drivers/swr/swr_fence.h
+ create mode 100644 src/gallium/drivers/swr/swr_memory.h
+ create mode 100644 src/gallium/drivers/swr/swr_public.h
+ create mode 100644 src/gallium/drivers/swr/swr_query.cpp
+ create mode 100644 src/gallium/drivers/swr/swr_query.h
+ create mode 100644 src/gallium/drivers/swr/swr_resource.h
+ create mode 100644 src/gallium/drivers/swr/swr_scratch.cpp
+ create mode 100644 src/gallium/drivers/swr/swr_scratch.h
+ create mode 100644 src/gallium/drivers/swr/swr_screen.cpp
+ create mode 100644 src/gallium/drivers/swr/swr_screen.h
+ create mode 100644 src/gallium/drivers/swr/swr_shader.cpp
+ create mode 100644 src/gallium/drivers/swr/swr_shader.h
+ create mode 100644 src/gallium/drivers/swr/swr_state.cpp
+ create mode 100644 src/gallium/drivers/swr/swr_state.h
+ create mode 100644 src/gallium/drivers/swr/swr_tex_sample.cpp
+ create mode 100644 src/gallium/drivers/swr/swr_tex_sample.h
+
+diff --git a/README.md b/README.md
+new file mode 100644
+index 0000000..3bf3031
+--- /dev/null
++++ b/README.md
+@@ -0,0 +1,33 @@
++OpenSWR-Mesa
++============
++
++Overview
++--------
++
++This is the repository of the integration work combining the
++high-performance, highly scalable core SWR rasterizer with Mesa. A more
++complete introduction and discussion towards upstreaming to the Mesa
++project can be found on the mesa-dev mailing list.
++
++Notes
++-----
++
++* SWR is set as the default software renderer. Use
++GALLIUM_DRIVER=llvmpipe to switch to Mesa's standard rasterizer. This
++particular change is to make it easier for people evaluating OpenSWR,
++and will not be upstreamed.
++
++* LLVM-3.6 is required.
++
++* To build SWR with autoconf, include the following in the config
++line: "--with-gallium-drivers=swr --enable-swr-native".
++
++* Build defaults to AVX2; for a version that runs on AVX, build with
++  "--with-swr-arch=AVX".
++
++* To build SWR with SCons, nothing needs to be done - it is built by
++  default.
++
++* Code for the driver is in src/gallium/drivers/swr
++
++* Code for the rasterizer is in src/gallium/drivers/swr/rasterizer
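The README's GALLIUM_DRIVER note can also be exercised programmatically: the variable is only consulted when the software screen is created (see the inline_sw_helper.h hunks later in this patch), so setting it before GL initialization is enough. A minimal sketch, assuming a POSIX environment; the harness below is invented for illustration and is not part of the patch:

```cpp
// Force Mesa's llvmpipe instead of the SWR default for this process.
// This must happen before the first GL/Gallium screen is created, because
// GALLIUM_DRIVER is read during screen creation (POSIX setenv assumed).
#include <cstdlib>

int main()
{
   setenv("GALLIUM_DRIVER", "llvmpipe", 1 /* overwrite inherited value */);

   /* ... create the GL context / OSMesa / Xlib-GL surface as usual;
      the software screen selected from here on will be llvmpipe ... */
   return 0;
}
```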
+diff --git a/configure.ac b/configure.ac
+index d3df195..f216dc7 100644
+--- a/configure.ac
++++ b/configure.ac
+@@ -1753,6 +1753,11 @@ AC_SUBST([LLVM_LIBS])
+ AC_SUBST([LLVM_LDFLAGS])
+ AC_SUBST([LLVM_INCLUDEDIR])
+ AC_SUBST([LLVM_VERSION])
++AC_SUBST([SWR_LIBDIR])
++AC_SUBST([SWR_ARCH])
++AC_SUBST([SWR_ARCH_FLAG])
++AC_SUBST([SWR_NATIVE])
++AC_SUBST([SWR_INCLUDEDIR])
+ AC_SUBST([CLANG_RESOURCE_DIR])
+
+ case "x$enable_opengl$enable_gles1$enable_gles2" in
+@@ -2177,6 +2182,9 @@ if test -n "$with_gallium_drivers"; then
+             HAVE_GALLIUM_LLVMPIPE=yes
+         fi
+         ;;
++        xswr)
++            HAVE_GALLIUM_SWR=yes
++            ;;
+         xvc4)
+             HAVE_GALLIUM_VC4=yes
+             gallium_require_drm "vc4"
+@@ -2243,6 +2251,41 @@ if test "x$MESA_LLVM" != x0; then
+     fi
+ fi
+
++dnl SWR include/library
++
++AC_ARG_WITH([swr-includedir],
++    [AS_HELP_STRING([--with-swr-includedir], [Path to SWR includes])],
++    [SWR_INCLUDEDIR="$withval"],
++    [SWR_INCLUDEDIR=''])
++
++AC_ARG_WITH([swr-libdir],
++    [AS_HELP_STRING([--with-swr-libdir], [Path to SWR library])],
++    [SWR_LIBDIR="$withval"],
++    [SWR_LIBDIR=''])
++
++AC_ARG_WITH([swr-arch],
++    [AS_HELP_STRING([--with-swr-arch], [AVX architecture for swr (AVX | CORE-AVX2) ])],
++    [SWR_ARCH="$withval"],
++    [SWR_ARCH="CORE-AVX2"])
++
++case "$SWR_ARCH" in
++"AVX")
++    SWR_ARCH_FLAG='-march=core-avx-i -DKNOB_ARCH=KNOB_ARCH_AVX '
++    ;;
++"CORE-AVX2")
++    SWR_ARCH_FLAG='-march=core-avx2 -DKNOB_ARCH=KNOB_ARCH_AVX2 '
++    ;;
++*)
++    SWR_ARCH_FLAG='-march=core-avx2 -DKNOB_ARCH=KNOB_ARCH_AVX2 '
++esac
++
++AC_ARG_ENABLE([swr-native],
++    [AS_HELP_STRING([--enable-swr-native],
++        [use in-tree version of SWR core @<:@default=disabled@:>@])],
++    [enable_swr_native="$enableval"],
++    [enable_swr_native=no]
++)
++
+ AM_CONDITIONAL(HAVE_GALLIUM_SVGA, test "x$HAVE_GALLIUM_SVGA" = xyes)
+ AM_CONDITIONAL(HAVE_GALLIUM_I915, test "x$HAVE_GALLIUM_I915" = xyes)
+ AM_CONDITIONAL(HAVE_GALLIUM_ILO, test "x$HAVE_GALLIUM_ILO" = xyes)
+@@ -2255,6 +2298,8 @@ AM_CONDITIONAL(HAVE_GALLIUM_NOUVEAU, test "x$HAVE_GALLIUM_NOUVEAU" = xyes)
+ AM_CONDITIONAL(HAVE_GALLIUM_FREEDRENO, test "x$HAVE_GALLIUM_FREEDRENO" = xyes)
+ AM_CONDITIONAL(HAVE_GALLIUM_SOFTPIPE, test "x$HAVE_GALLIUM_SOFTPIPE" = xyes)
+ AM_CONDITIONAL(HAVE_GALLIUM_LLVMPIPE, test "x$HAVE_GALLIUM_LLVMPIPE" = xyes)
++AM_CONDITIONAL(HAVE_GALLIUM_SWR, test "x$HAVE_GALLIUM_SWR" = xyes)
++AM_CONDITIONAL(SWR_NATIVE, test "x$enable_swr_native" = xyes)
+ AM_CONDITIONAL(HAVE_GALLIUM_VC4, test "x$HAVE_GALLIUM_VC4" = xyes)
+
+ AM_CONDITIONAL(HAVE_GALLIUM_STATIC_TARGETS, test "x$enable_shared_pipe_drivers" = xno)
+@@ -2374,6 +2419,7 @@ AC_CONFIG_FILES([Makefile
+                 src/gallium/drivers/rbug/Makefile
+                 src/gallium/drivers/softpipe/Makefile
+                 src/gallium/drivers/svga/Makefile
++                src/gallium/drivers/swr/Makefile
+                 src/gallium/drivers/trace/Makefile
+                 src/gallium/drivers/vc4/Makefile
+                 src/gallium/state_trackers/clover/Makefile
+@@ -2562,6 +2608,14 @@ if test "x$MESA_LLVM" = x1; then
+     echo "        LLVM_LDFLAGS:    $LLVM_LDFLAGS"
+     echo ""
+ fi
++if test "x$HAVE_GALLIUM_SWR" = xyes; then
++    echo "        SWR_INCLUDEDIR:  $SWR_INCLUDEDIR"
++    echo "        SWR_LIBDIR:      $SWR_LIBDIR"
++    echo "        SWR_ARCH:        $SWR_ARCH"
++    echo "        SWR_ARCH_FLAG:   $SWR_ARCH_FLAG"
++    echo "        SWR_NATIVE:      $enable_swr_native"
++    echo ""
++fi
+ echo "        PYTHON2:         $PYTHON2"
+
+ echo ""
+diff --git a/src/gallium/Makefile.am b/src/gallium/Makefile.am
+index a7c3606..dcce6a3 100644
+--- a/src/gallium/Makefile.am
++++ b/src/gallium/Makefile.am
+@@ -77,6 +77,10 @@ SUBDIRS += drivers/llvmpipe
+ endif
+ endif
+
++if HAVE_GALLIUM_SWR
++SUBDIRS += drivers/swr
++endif
++
+ ## vc4/rpi
+ if HAVE_GALLIUM_VC4
+ SUBDIRS += drivers/vc4 winsys/vc4/drm
+diff --git a/src/gallium/SConscript b/src/gallium/SConscript
+index fa5fa6e..766c24a 100644
+--- a/src/gallium/SConscript
++++ b/src/gallium/SConscript
+@@ -17,6 +17,7 @@ SConscript([
+     'drivers/softpipe/SConscript',
+     'drivers/svga/SConscript',
+     'drivers/trace/SConscript',
++    'drivers/swr/SConscript',
+ ])
+
+ #
+diff --git a/src/gallium/auxiliary/gallivm/lp_bld_flow.h b/src/gallium/auxiliary/gallivm/lp_bld_flow.h
+index 0da849b..083b0ad 100644
+--- a/src/gallium/auxiliary/gallivm/lp_bld_flow.h
++++ b/src/gallium/auxiliary/gallivm/lp_bld_flow.h
+@@ -37,6 +37,9 @@
+
+ #include "gallivm/lp_bld.h"
+
++#ifdef __cplusplus
++extern "C" {
++#endif
+
+ struct lp_type;
+
+@@ -198,4 +201,8 @@ lp_build_array_alloca(struct gallivm_state *gallivm,
+                       LLVMValueRef count,
+                       const char *name);
+
++#ifdef __cplusplus
++}
++#endif
++
+ #endif /* !LP_BLD_FLOW_H */
+diff --git a/src/gallium/auxiliary/gallivm/lp_bld_init.h b/src/gallium/auxiliary/gallivm/lp_bld_init.h
+index 9e50f88..ab44661 100644
+--- a/src/gallium/auxiliary/gallivm/lp_bld_init.h
++++ b/src/gallium/auxiliary/gallivm/lp_bld_init.h
+@@ -35,6 +35,9 @@
+ #include "lp_bld.h"
+ #include <llvm-c/ExecutionEngine.h>
+
++#ifdef __cplusplus
++extern "C" {
++#endif
+
+ struct gallivm_state
+ {
+@@ -82,4 +85,8 @@ void
+ lp_set_store_alignment(LLVMValueRef Inst,
+                        unsigned Align);
+
++#ifdef __cplusplus
++}
++#endif
++
+ #endif /* !LP_BLD_INIT_H */
+diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.h b/src/gallium/auxiliary/gallivm/lp_bld_sample.h
+index eba758d..5f53c47 100644
+--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.h
++++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.h
+@@ -42,6 +42,9 @@
+ #include "gallivm/lp_bld_type.h"
+ #include "gallivm/lp_bld_swizzle.h"
+
++#ifdef __cplusplus
++extern "C" {
++#endif
+
+ struct pipe_resource;
+ struct pipe_sampler_view;
+@@ -612,5 +615,8 @@ lp_build_minify(struct lp_build_context *bld,
+                 LLVMValueRef level,
+                 boolean lod_scalar);
+
++#ifdef __cplusplus
++}
++#endif
+
+ #endif /* LP_BLD_SAMPLE_H */
+diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
+index 2ca9c61..189d03d 100644
+--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
++++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
+@@ -48,6 +48,10 @@
+ #include "tgsi/tgsi_scan.h"
+ #include "tgsi/tgsi_info.h"
+
++#ifdef __cplusplus
++extern "C" {
++#endif
++
+ #define LP_CHAN_ALL ~0
+
+ #define LP_MAX_INSTRUCTIONS 256
+@@ -661,4 +665,8 @@ lp_build_tgsi_llvm(
+     struct lp_build_tgsi_context * bld_base,
+     const struct tgsi_token *tokens);
+
++#ifdef __cplusplus
++}
++#endif
++
+ #endif /* LP_BLD_TGSI_H */
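The four gallivm header changes above are the same mechanical fix: the SWR driver is C++, and without `extern "C"` guards a C++ translation unit would emit C++-mangled references for gallivm's C functions and fail at link time. A self-contained illustration of the pattern follows; all names in it are invented, none are gallivm symbols:

```cpp
// Minimal, runnable illustration of the extern "C" guard the gallivm
// headers now carry. The guard gives the function C linkage, so its symbol
// is the unmangled "c_side_add" (not something like "_Z10c_side_addii"),
// which is exactly what a separately compiled C object file exports.
#ifdef __cplusplus
extern "C" {
#endif
int c_side_add(int a, int b);
#ifdef __cplusplus
}
#endif

// Stand-in for the C implementation that would normally live in a .c file.
extern "C" int c_side_add(int a, int b) { return a + b; }

int main()
{
   return c_side_add(20, 22) == 42 ? 0 : 1;
}
```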
+diff --git a/src/gallium/auxiliary/target-helpers/inline_sw_helper.h b/src/gallium/auxiliary/target-helpers/inline_sw_helper.h
+index 5f46552..e67dd17 100644
+--- a/src/gallium/auxiliary/target-helpers/inline_sw_helper.h
++++ b/src/gallium/auxiliary/target-helpers/inline_sw_helper.h
+@@ -19,6 +19,10 @@
+ #include "llvmpipe/lp_public.h"
+ #endif
+
++#ifdef GALLIUM_SWR
++#include "swr/swr_public.h"
++#endif
++
+
+ static inline struct pipe_screen *
+ sw_screen_create_named(struct sw_winsys *winsys, const char *driver)
+@@ -30,6 +34,11 @@ sw_screen_create_named(struct sw_winsys *winsys, const char *driver)
+       screen = llvmpipe_create_screen(winsys);
+ #endif
+
++#if defined(GALLIUM_SWR)
++   if (screen == NULL && strcmp(driver, "swr") == 0)
++      screen = swr_create_screen(winsys);
++#endif
++
+ #if defined(GALLIUM_SOFTPIPE)
+    if (screen == NULL)
+       screen = softpipe_create_screen(winsys);
+@@ -45,7 +54,9 @@ sw_screen_create(struct sw_winsys *winsys)
+    const char *default_driver;
+    const char *driver;
+
+-#if defined(GALLIUM_LLVMPIPE)
++#if defined(GALLIUM_SWR)
++   default_driver = "swr";
++#elif defined(GALLIUM_LLVMPIPE)
+    default_driver = "llvmpipe";
+ #elif defined(GALLIUM_SOFTPIPE)
+    default_driver = "softpipe";
+diff --git a/src/gallium/auxiliary/target-helpers/inline_wrapper_sw_helper.h b/src/gallium/auxiliary/target-helpers/inline_wrapper_sw_helper.h
+index 4f38ba9..d707b8b 100644
+--- a/src/gallium/auxiliary/target-helpers/inline_wrapper_sw_helper.h
++++ b/src/gallium/auxiliary/target-helpers/inline_wrapper_sw_helper.h
+@@ -12,7 +12,7 @@
+ static inline struct pipe_screen *
+ sw_screen_wrap(struct pipe_screen *screen)
+ {
+-#if defined(GALLIUM_SOFTPIPE) || defined(GALLIUM_LLVMPIPE)
++#if defined(GALLIUM_SOFTPIPE) || defined(GALLIUM_LLVMPIPE) || defined(GALLIUM_SWR)
+    struct sw_winsys *sws;
+    struct pipe_screen *sw_screen = NULL;
+    const char *driver;
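Net effect of the two helper changes: when the swr driver is compiled in it becomes the preferred software screen, with llvmpipe and softpipe as fallbacks, and an explicit GALLIUM_DRIVER request still wins. A standalone restatement of that priority, illustrative only (the authoritative logic is `sw_screen_create_named()`/`sw_screen_create()` above; compiled outside Mesa, none of the GALLIUM_* macros are defined and the sketch falls through to the "(none)" case):

```cpp
// Compile-time default plus runtime override, as in inline_sw_helper.h
// after this patch. Pass getenv("GALLIUM_DRIVER") as 'requested'.
#include <cstdio>

static const char *
pick_sw_driver(const char *requested)
{
   if (requested)             // explicit GALLIUM_DRIVER request wins
      return requested;
#if defined(GALLIUM_SWR)
   return "swr";              // new default when SWR is built in
#elif defined(GALLIUM_LLVMPIPE)
   return "llvmpipe";
#elif defined(GALLIUM_SOFTPIPE)
   return "softpipe";
#else
   return nullptr;            // no software rasterizer compiled in
#endif
}

int main()
{
   const char *d = pick_sw_driver(nullptr);
   std::printf("selected: %s\n", d ? d : "(none)");
   return 0;
}
```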
+diff --git a/src/gallium/drivers/swr/.clang-format b/src/gallium/drivers/swr/.clang-format
+new file mode 100644
+index 0000000..0ec65a5
+--- /dev/null
++++ b/src/gallium/drivers/swr/.clang-format
+@@ -0,0 +1,64 @@
++---
++Language: Cpp
++AccessModifierOffset: -3
++AlignAfterOpenBracket: true
++AlignEscapedNewlinesLeft: false
++AlignOperands: false
++AlignTrailingComments: false
++AllowAllParametersOfDeclarationOnNextLine: true
++AllowShortBlocksOnASingleLine: false
++AllowShortCaseLabelsOnASingleLine: false
++AllowShortIfStatementsOnASingleLine: false
++AllowShortLoopsOnASingleLine: false
++AllowShortFunctionsOnASingleLine: All
++AlwaysBreakAfterDefinitionReturnType: true
++AlwaysBreakTemplateDeclarations: false
++AlwaysBreakBeforeMultilineStrings: false
++BreakBeforeBinaryOperators: NonAssignment
++BreakBeforeTernaryOperators: true
++BreakConstructorInitializersBeforeComma: true
++BinPackParameters: false
++BinPackArguments: false
++ColumnLimit: 78
++ConstructorInitializerAllOnOneLineOrOnePerLine: false
++ConstructorInitializerIndentWidth: 3
++DerivePointerAlignment: false
++ExperimentalAutoDetectBinPacking: false
++IndentCaseLabels: false
++IndentWrappedFunctionNames: false
++IndentFunctionDeclarationAfterType: false
++MaxEmptyLinesToKeep: 2
++KeepEmptyLinesAtTheStartOfBlocks: true
++NamespaceIndentation: Inner
++ObjCBlockIndentWidth: 3
++ObjCSpaceAfterProperty: true
++ObjCSpaceBeforeProtocolList: true
++PenaltyBreakBeforeFirstCallParameter: 19
++PenaltyBreakComment: 300
++PenaltyBreakString: 1000
++PenaltyBreakFirstLessLess: 120
++PenaltyExcessCharacter: 1000000
++PenaltyReturnTypeOnItsOwnLine: 0
++PointerAlignment: Right
++SpacesBeforeTrailingComments: 1
++Cpp11BracedListStyle: true
++Standard: Cpp11
++IndentWidth: 3
++TabWidth: 8
++UseTab: Never
++BreakBeforeBraces: Linux
++SpacesInParentheses: false
++SpacesInSquareBrackets: false
++SpacesInAngles: false
++SpaceInEmptyParentheses: false
++SpacesInCStyleCastParentheses: false
++SpaceAfterCStyleCast: false
++SpacesInContainerLiterals: true
++SpaceBeforeAssignmentOperators: true
++ContinuationIndentWidth: 3
++CommentPragmas: '^ IWYU pragma:'
++ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ]
++SpaceBeforeParens: ControlStatements
++DisableFormat: false
++...
++
+diff --git a/src/gallium/drivers/swr/Automake.inc b/src/gallium/drivers/swr/Automake.inc
+new file mode 100644
+index 0000000..8e66744
+--- /dev/null
++++ b/src/gallium/drivers/swr/Automake.inc
+@@ -0,0 +1,28 @@
++# Copyright (C) 2015 Intel Corporation. All Rights Reserved.
++#
++# Permission is hereby granted, free of charge, to any person obtaining a
++# copy of this software and associated documentation files (the "Software"),
++# to deal in the Software without restriction, including without limitation
++# the rights to use, copy, modify, merge, publish, distribute, sublicense,
++# and/or sell copies of the Software, and to permit persons to whom the
++# Software is furnished to do so, subject to the following conditions:
++#
++# The above copyright notice and this permission notice (including the next
++# paragraph) shall be included in all copies or substantial portions of the
++# Software.
++#
++# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
++# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++# IN THE SOFTWARE.
++
++if HAVE_GALLIUM_SWR
++
++TARGET_CPPFLAGS += -DGALLIUM_SWR
++TARGET_LIB_DEPS += \
++	$(top_builddir)/src/gallium/drivers/swr/libmesaswr.la
++
++endif
+diff --git a/src/gallium/drivers/swr/Makefile.am b/src/gallium/drivers/swr/Makefile.am
+new file mode 100644
+index 0000000..5dff02c
+--- /dev/null
++++ b/src/gallium/drivers/swr/Makefile.am
+@@ -0,0 +1,82 @@
++# Copyright (C) 2015 Intel Corporation. All Rights Reserved.
++#
++# Permission is hereby granted, free of charge, to any person obtaining a
++# copy of this software and associated documentation files (the "Software"),
++# to deal in the Software without restriction, including without limitation
++# the rights to use, copy, modify, merge, publish, distribute, sublicense,
++# and/or sell copies of the Software, and to permit persons to whom the
++# Software is furnished to do so, subject to the following conditions:
++#
++# The above copyright notice and this permission notice (including the next
++# paragraph) shall be included in all copies or substantial portions of the
++# Software.
++#
++# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
++# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++# IN THE SOFTWARE.
++
++AUTOMAKE_OPTIONS = subdir-objects
++
++include Makefile.sources
++include $(top_srcdir)/src/gallium/Automake.inc
++
++AM_CXXFLAGS = \
++	$(GALLIUM_DRIVER_CFLAGS) \
++	-std=c++11 -D__STDC_CONSTANT_MACROS -D__STDC_LIMIT_MACROS \
++	$(SWR_ARCH_FLAG) \
++	$(LLVM_CFLAGS)
++
++noinst_LTLIBRARIES = libmesaswr.la
++
++libmesaswr_la_SOURCES = $(CXX_SOURCES)
++
++libmesaswr_la_LDFLAGS =
++
++if SWR_NATIVE
++BUILT_SOURCES = \
++	rasterizer/scripts/gen_knobs.cpp \
++	rasterizer/scripts/gen_knobs.h \
++	rasterizer/jitter/state_llvm.h
++
++rasterizer/scripts/gen_knobs.cpp rasterizer/scripts/gen_knobs.h: rasterizer/scripts/gen_knobs.py rasterizer/scripts/knob_defs.py rasterizer/scripts/templates/knobs.template
++	$(PYTHON2) $(PYTHON_FLAGS) \
++		$(srcdir)/rasterizer/scripts/gen_knobs.py \
++		rasterizer/scripts
++
++rasterizer/jitter/state_llvm.h: rasterizer/jitter/scripts/gen_llvm_types.py rasterizer/core/state.h
++	$(PYTHON2) $(PYTHON_FLAGS) \
++		$(srcdir)/rasterizer/jitter/scripts/gen_llvm_types.py \
++		--input $(srcdir)/rasterizer/core/state.h \
++		--output rasterizer/jitter/state_llvm.h
++
++libmesaswr_la_SOURCES += \
++	$(COMMON_CXX_SOURCES) \
++	$(CORE_CXX_SOURCES) \
++	$(JITTER_CXX_SOURCES) \
++	$(MEMORY_CXX_SOURCES) \
++	rasterizer/scripts/gen_knobs.cpp \
++	rasterizer/scripts/gen_knobs.h
++AM_CXXFLAGS += \
++	-I$(srcdir)/rasterizer \
++	-I$(srcdir)/rasterizer/core \
++	-I$(srcdir)/rasterizer/jitter \
++	-I$(builddir)/rasterizer/scripts \
++	-I$(builddir)/rasterizer/jitter
++else
++libmesaswr_la_LDFLAGS += -L$(SWR_LIBDIR) -lSWR
++AM_CXXFLAGS += \
++	-I$(SWR_INCLUDEDIR) \
++	-I$(SWR_INCLUDEDIR)/core \
++	-I$(SWR_INCLUDEDIR)/jitter \
++	-I$(SWR_INCLUDEDIR)/build/jitter \
++	-I$(SWR_INCLUDEDIR)/build/scripts
++endif
++
++libmesaswr_la_LDFLAGS += -lnuma
++
++
++EXTRA_DIST = SConscript
+diff --git a/src/gallium/drivers/swr/Makefile.sources b/src/gallium/drivers/swr/Makefile.sources
+new file mode 100644
+index 0000000..1c6fe08
+--- /dev/null
++++ b/src/gallium/drivers/swr/Makefile.sources
+@@ -0,0 +1,114 @@
++# Copyright (C) 2015 Intel Corporation. All Rights Reserved.
++#
++# Permission is hereby granted, free of charge, to any person obtaining a
++# copy of this software and associated documentation files (the "Software"),
++# to deal in the Software without restriction, including without limitation
++# the rights to use, copy, modify, merge, publish, distribute, sublicense,
++# and/or sell copies of the Software, and to permit persons to whom the
++# Software is furnished to do so, subject to the following conditions:
++#
++# The above copyright notice and this permission notice (including the next
++# paragraph) shall be included in all copies or substantial portions of the
++# Software.
++#
++# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
++# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++# IN THE SOFTWARE.
++
++CXX_SOURCES := \
++	swr_clear.cpp \
++	swr_context.cpp \
++	swr_context.h \
++	swr_context_llvm.h \
++	swr_draw.cpp \
++	swr_public.h \
++	swr_resource.h \
++	swr_screen.cpp \
++	swr_screen.h \
++	swr_state.cpp \
++	swr_state.h \
++	swr_tex_sample.cpp \
++	swr_tex_sample.h \
++	swr_scratch.h \
++	swr_scratch.cpp \
++	swr_shader.cpp \
++	swr_memory.h \
++	swr_fence.h \
++	swr_fence.cpp \
++	swr_query.h \
++	swr_query.cpp
++
++COMMON_CXX_SOURCES := \
++	rasterizer/common/containers.hpp \
++	rasterizer/common/formats.cpp \
++	rasterizer/common/formats.h \
++	rasterizer/common/isa.hpp \
++	rasterizer/common/os.h \
++	rasterizer/common/rdtsc_buckets.cpp \
++	rasterizer/common/rdtsc_buckets.h \
++	rasterizer/common/rdtsc_buckets_shared.h \
++	rasterizer/common/rdtsc_buckets_shared.h \
++	rasterizer/common/simdintrin.h \
++	rasterizer/common/swr_assert.cpp \
++	rasterizer/common/swr_assert.h
++
++CORE_CXX_SOURCES := \
++	rasterizer/core/api.cpp \
++	rasterizer/core/api.h \
++	rasterizer/core/arena.cpp \
++	rasterizer/core/arena.h \
++	rasterizer/core/backend.cpp \
++	rasterizer/core/backend.h \
++	rasterizer/core/blend.h \
++	rasterizer/core/clip.cpp \
++	rasterizer/core/clip.h \
++	rasterizer/core/context.h \
++	rasterizer/core/depthstencil.h \
++	rasterizer/core/fifo.hpp \
++	rasterizer/core/format_traits.h \
++	rasterizer/core/format_types.h \
++	rasterizer/core/frontend.cpp \
++	rasterizer/core/frontend.h \
++	rasterizer/core/knobs.h \
++	rasterizer/core/knobs_init.h \
++	rasterizer/core/multisample.h \
++	rasterizer/core/pa_avx.cpp \
++	rasterizer/core/pa.h \
++	rasterizer/core/rasterizer.cpp \
++	rasterizer/core/rasterizer.h \
++	rasterizer/core/rdtsc_core.cpp \
++	rasterizer/core/rdtsc_core.h \
++	rasterizer/core/state.h \
++	rasterizer/core/threads.cpp \
++	rasterizer/core/threads.h \
++	rasterizer/core/tilemgr.cpp \
++	rasterizer/core/tilemgr.h \
++	rasterizer/core/utils.cpp \
++	rasterizer/core/utils.h
++
++JITTER_CXX_SOURCES := \
++	rasterizer/jitter/blend_jit.cpp \
++	rasterizer/jitter/blend_jit.h \
++	rasterizer/jitter/builder.cpp \
++	rasterizer/jitter/builder_gen.cpp \
++	rasterizer/jitter/builder_gen.h \
++	rasterizer/jitter/builder.h \
++	rasterizer/jitter/builder_misc.cpp \
++	rasterizer/jitter/builder_misc.h \
++	rasterizer/jitter/builder_x86.cpp \
++	rasterizer/jitter/builder_x86.h \
++	rasterizer/jitter/fetch_jit.cpp \
++	rasterizer/jitter/fetch_jit.h \
++	rasterizer/jitter/JitManager.cpp \
++	rasterizer/jitter/JitManager.h \
++	rasterizer/jitter/streamout_jit.cpp \
++	rasterizer/jitter/streamout_jit.h
++
++MEMORY_CXX_SOURCES := \
++	rasterizer/memory/ClearTile.cpp \
++	rasterizer/memory/LoadTile.cpp \
++	rasterizer/memory/StoreTile.cpp
+diff --git a/src/gallium/drivers/swr/SConscript b/src/gallium/drivers/swr/SConscript
+new file mode 100644
+index 0000000..4c8c121
+--- /dev/null
++++ b/src/gallium/drivers/swr/SConscript
+@@ -0,0 +1,69 @@
++from sys import executable as python_cmd
++import distutils.version
++
++Import('*')
++
++if not env['llvm']:
++    print 'warning: LLVM disabled: not building swr'
++    Return()
++
++env = env.Clone()
++
++env.MSVC2008Compat()
++
++env.Append(CPPDEFINES = [
++    '__STDC_CONSTANT_MACROS',
++    '__STDC_LIMIT_MACROS',
++    'KNOB_ARCH=KNOB_ARCH_AVX2',
++    ])
++
++env.Append(CCFLAGS = [
++    '-std=c++11',
++    '-march=core-avx2',
++    ])
++
++env.Prepend(CPPPATH = [
++    'rasterizer',
++    'rasterizer/core',
++    'rasterizer/jitter',
++    'rasterizer/scripts',
++    ])
++
++gen_knobs = env.CodeGenerate(
++    target = 'rasterizer/scripts/gen_knobs.cpp',
++    script = 'rasterizer/scripts/gen_knobs.py',
++    source = [],
++    command = python_cmd + ' $SCRIPT ' + Dir('rasterizer/scripts').abspath
++)
++
++gen_knobs = env.CodeGenerate(
++    target = 'rasterizer/scripts/gen_knobs.h',
++    script = 'rasterizer/scripts/gen_knobs.py',
++    source = [],
++    command = python_cmd + ' $SCRIPT ' + Dir('rasterizer/scripts').abspath
++)
++
++state_llvm = env.CodeGenerate(
++    target = 'rasterizer/jitter/state_llvm.h',
++    script = 'rasterizer/jitter/scripts/gen_llvm_types.py',
++    source = 'rasterizer/core/state.h',
++    command = python_cmd + ' $SCRIPT --input $SOURCE --output $TARGET'
++)
++
++source = ['rasterizer/scripts/gen_knobs.cpp', 'rasterizer/scripts/gen_knobs.h']
++source += env.ParseSourceList('Makefile.sources', [
++    'CXX_SOURCES',
++    'COMMON_CXX_SOURCES',
++    'CORE_CXX_SOURCES',
++    'JITTER_CXX_SOURCES',
++    'MEMORY_CXX_SOURCES'
++])
++
++swr = env.ConvenienceLibrary(
++    target = 'swr',
++    source = source,
++    )
++
++env.Alias('swr', swr)
++
++Export('swr')
+diff --git a/src/gallium/drivers/swr/swr_clear.cpp b/src/gallium/drivers/swr/swr_clear.cpp
+new file mode 100644
+index 0000000..7704359
+--- /dev/null
++++ b/src/gallium/drivers/swr/swr_clear.cpp
+@@ -0,0 +1,141 @@
++/****************************************************************************
++ * Copyright (C) 2015 Intel Corporation. All Rights Reserved.
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a
++ * copy of this software and associated documentation files (the "Software"),
++ * to deal in the Software without restriction, including without limitation
++ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
++ * and/or sell copies of the Software, and to permit persons to whom the
++ * Software is furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice (including the next
++ * paragraph) shall be included in all copies or substantial portions of the
++ * Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
++ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ ***************************************************************************/
++
++#include "swr_context.h"
++#include "swr_query.h"
++
++static void
++swr_clear(struct pipe_context *pipe,
++          unsigned buffers,
++          const union pipe_color_union *color,
++          double depth,
++          unsigned stencil)
++{
++   struct swr_context *ctx = swr_context(pipe);
++   struct pipe_framebuffer_state *fb = &ctx->framebuffer;
++
++   UINT clearMask = 0;
++
++   if (!swr_check_render_cond(pipe))
++      return;
++
++   if (ctx->dirty)
++      swr_update_derived(ctx);
++
++/* Update clearMask/targetMask */
++#if 0 /* XXX SWR currently only clears SWR_ATTACHMENT_COLOR0, don't bother \
++         checking others yet. */
++   if (buffers & PIPE_CLEAR_COLOR && fb->nr_cbufs) {
++      UINT i;
++      for (i = 0; i < fb->nr_cbufs; ++i)
++         if (fb->cbufs[i])
++            clearMask |= (SWR_CLEAR_COLOR0 << i);
++   }
++#else
++   if (buffers & PIPE_CLEAR_COLOR && fb->cbufs[0])
++      clearMask |= SWR_CLEAR_COLOR;
++#endif
++
++   if (buffers & PIPE_CLEAR_DEPTH && fb->zsbuf)
++      clearMask |= SWR_CLEAR_DEPTH;
++
++   if (buffers & PIPE_CLEAR_STENCIL && fb->zsbuf)
++      clearMask |= SWR_CLEAR_STENCIL;
++
++#if 0 // XXX HACK, override clear color alpha. On ubuntu, clears are
++      // transparent.
++   ((union pipe_color_union *)color)->f[3] = 1.0; /* cast off your const'd-ness */
++#endif
++
++   /* Reset viewport to full framebuffer width/height before clear, then
++    * restore it */
++   /* Scissor affects clear, viewport should not */
++   ctx->dirty |= SWR_NEW_VIEWPORT;
++   SWR_VIEWPORT vp = {0};
++   vp.width = ctx->framebuffer.width;
++   vp.height = ctx->framebuffer.height;
++   SwrSetViewports(ctx->swrContext, 1, &vp, NULL);
++
++   SwrClearRenderTarget(ctx->swrContext, clearMask, color->f, depth, stencil);
++}
++
++
++#if 0 // XXX, these don't get called. how to get these called? Do we need
++      // them? Docs?
++static void
++swr_clear_render_target(struct pipe_context *pipe, struct pipe_surface *ps,
++                        const union pipe_color_union *color,
++                        unsigned x, unsigned y, unsigned w, unsigned h)
++{
++   struct swr_context *ctx = swr_context(pipe);
++   fprintf(stderr, "SWR swr_clear_render_target!\n");
++
++   ctx->dirty |= SWR_NEW_FRAMEBUFFER | SWR_NEW_SCISSOR;
++}
++
++static void
++swr_clear_depth_stencil(struct pipe_context *pipe, struct pipe_surface *ps,
++                        unsigned buffers, double depth, unsigned stencil,
++                        unsigned x, unsigned y, unsigned w, unsigned h)
++{
++   struct swr_context *ctx = swr_context(pipe);
++   fprintf(stderr, "SWR swr_clear_depth_stencil!\n");
++
++   ctx->dirty |= SWR_NEW_FRAMEBUFFER | SWR_NEW_SCISSOR;
++}
++
++static void
++swr_clear_buffer(struct pipe_context *pipe,
++                 struct pipe_resource *res,
++                 unsigned offset, unsigned size,
++                 const void *data, int data_size)
++{
++   fprintf(stderr, "SWR swr_clear_buffer!\n");
++   struct swr_context *ctx = swr_context(pipe);
++   struct swr_resource *buf = swr_resource(res);
++   union pipe_color_union color;
++   enum pipe_format dst_fmt;
++   unsigned width, height, elements;
++
++   assert(res->target == PIPE_BUFFER);
++   assert(buf);
++   assert(size % data_size == 0);
++
++   SWR_SURFACE_STATE &swr_buffer = buf->swr;
++
++   ctx->dirty |= SWR_NEW_FRAMEBUFFER | SWR_NEW_SCISSOR;
++}
++#endif
++
++
++void
++swr_clear_init(struct pipe_context *pipe)
++{
++   pipe->clear = swr_clear;
++#if 0 // XXX, these don't get called. how to get these called? Do we need
++      // them? Docs?
++   pipe->clear_render_target = swr_clear_render_target;
++   pipe->clear_depth_stencil = swr_clear_depth_stencil;
++   pipe->clear_buffer = swr_clear_buffer;
++#endif
++}
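One subtlety in swr_clear() above is worth spelling out: Gallium specifies that pipe->clear() respects the scissor (when enabled) but not the viewport, while SWR's clear path is viewport-sensitive, hence the temporary full-framebuffer viewport. A caller's-eye sketch of that contract follows; it only compiles inside a Mesa tree and is not code from the patch:

```cpp
// How a state tracker drives the hook installed by swr_clear_init().
// Per Gallium semantics only the scissor may clip this clear, which is why
// swr_clear() above temporarily forces a full-framebuffer viewport.
#include "pipe/p_context.h"
#include "pipe/p_defines.h"
#include "pipe/p_state.h"

static void
clear_to_opaque_black(struct pipe_context *pipe)
{
   union pipe_color_union color = {{0.0f, 0.0f, 0.0f, 1.0f}};
   pipe->clear(pipe, PIPE_CLEAR_COLOR | PIPE_CLEAR_DEPTHSTENCIL,
               &color, 1.0 /* depth */, 0 /* stencil */);
}
```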
+diff --git a/src/gallium/drivers/swr/swr_context.cpp b/src/gallium/drivers/swr/swr_context.cpp
+new file mode 100644
+index 0000000..6269cd0
+--- /dev/null
++++ b/src/gallium/drivers/swr/swr_context.cpp
+@@ -0,0 +1,392 @@
++/****************************************************************************
++ * Copyright (C) 2015 Intel Corporation. All Rights Reserved.
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a
++ * copy of this software and associated documentation files (the "Software"),
++ * to deal in the Software without restriction, including without limitation
++ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
++ * and/or sell copies of the Software, and to permit persons to whom the
++ * Software is furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice (including the next
++ * paragraph) shall be included in all copies or substantial portions of the
++ * Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
++ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ ***************************************************************************/
++
++#include "util/u_memory.h"
++#include "util/u_inlines.h"
++#include "util/u_format.h"
++
++extern "C" {
++#include "util/u_transfer.h"
++#include "util/u_surface.h"
++}
++
++#include "swr_context.h"
++#include "swr_memory.h"
++#include "swr_screen.h"
++#include "swr_resource.h"
++#include "swr_scratch.h"
++#include "swr_query.h"
++
++#include "api.h"
++
++static struct pipe_surface *
++swr_create_surface(struct pipe_context *pipe,
++                   struct pipe_resource *pt,
++                   const struct pipe_surface *surf_tmpl)
++{
++   struct pipe_surface *ps;
++
++   ps = CALLOC_STRUCT(pipe_surface);
++   if (ps) {
++      pipe_reference_init(&ps->reference, 1);
++      pipe_resource_reference(&ps->texture, pt);
++      ps->context = pipe;
++      ps->format = surf_tmpl->format;
++      if (pt->target != PIPE_BUFFER) {
++         assert(surf_tmpl->u.tex.level <= pt->last_level);
++         ps->width = u_minify(pt->width0, surf_tmpl->u.tex.level);
++         ps->height = u_minify(pt->height0, surf_tmpl->u.tex.level);
++         ps->u.tex.level = surf_tmpl->u.tex.level;
++         ps->u.tex.first_layer = surf_tmpl->u.tex.first_layer;
++         ps->u.tex.last_layer = surf_tmpl->u.tex.last_layer;
++         if (ps->u.tex.first_layer != ps->u.tex.last_layer) {
++            debug_printf("creating surface with multiple layers, rendering "
++                         "to first layer only\n");
++         }
++      } else {
++         /* setting width as number of elements should get us correct
++          * renderbuffer width */
++         ps->width = surf_tmpl->u.buf.last_element
++            - surf_tmpl->u.buf.first_element + 1;
++         ps->height = pt->height0;
++         ps->u.buf.first_element = surf_tmpl->u.buf.first_element;
++         ps->u.buf.last_element = surf_tmpl->u.buf.last_element;
++         assert(ps->u.buf.first_element <= ps->u.buf.last_element);
++         assert(ps->u.buf.last_element < ps->width);
++      }
++   }
++   return ps;
++}
++
++static void
++swr_surface_destroy(struct pipe_context *pipe, struct pipe_surface *surf)
++{
++   assert(surf->texture);
++   struct pipe_resource *resource = surf->texture;
++
++   /* If the surface being destroyed is a current render target,
++    * call StoreTiles to resolve the hotTile state then set attachment
++    * to NULL.
++    */
++   if (resource->bind & (PIPE_BIND_RENDER_TARGET | PIPE_BIND_DEPTH_STENCIL
++                         | PIPE_BIND_DISPLAY_TARGET)) {
++      struct swr_context *ctx = swr_context(pipe);
++      struct swr_resource *spr = swr_resource(resource);
++      for (uint32_t i = 0; i < SWR_NUM_ATTACHMENTS; i++)
++         if (ctx->current.attachment[i] == &spr->swr) {
++            swr_store_render_target(ctx, i, SWR_TILE_RESOLVED);
++            ctx->current.attachment[i] = nullptr;
++            /*
++             * Mesa thinks depth/stencil are fused, so we'll never get an
++             * explicit resource for stencil. So, if checking depth, then
++             * also check for stencil.
++             */
++            if (spr->has_stencil && (i == SWR_ATTACHMENT_DEPTH)) {
++               swr_store_render_target(
++                  ctx, SWR_ATTACHMENT_STENCIL, SWR_TILE_RESOLVED);
++               ctx->current.attachment[SWR_ATTACHMENT_STENCIL] = nullptr;
++            }
++
++            SwrWaitForIdle(ctx->swrContext);
++            break;
++         }
++   }
++
++   pipe_resource_reference(&surf->texture, NULL);
++   FREE(surf);
++}
++
++
++static void *
++swr_transfer_map(struct pipe_context *pipe,
++                 struct pipe_resource *resource,
++                 unsigned level,
++                 unsigned usage,
++                 const struct pipe_box *box,
++                 struct pipe_transfer **transfer)
++{
++   struct swr_resource *spr = swr_resource(resource);
++   struct pipe_transfer *pt;
++   enum pipe_format format = resource->format;
++
++   assert(resource);
++   assert(level <= resource->last_level);
++
++   /*
++    * If mapping any attached rendertarget, store tiles and wait for idle
++    * before giving CPU access to the surface.
++    * (set postStoreTileState to SWR_TILE_INVALID so tiles are reloaded)
++    */
++   if (resource->bind & (PIPE_BIND_RENDER_TARGET | PIPE_BIND_DEPTH_STENCIL
++                         | PIPE_BIND_DISPLAY_TARGET)) {
++      struct swr_context *ctx = swr_context(pipe);
++      for (uint32_t i = 0; i < SWR_NUM_ATTACHMENTS; i++)
++         if (ctx->current.attachment[i] == &spr->swr) {
++            swr_store_render_target(ctx, i, SWR_TILE_INVALID);
++            /*
++             * Mesa thinks depth/stencil are fused, so we'll never get an
++             * explicit map for stencil. So, if mapping depth, then also
++             * store tile for stencil.
++             */
++            if (spr->has_stencil && (i == SWR_ATTACHMENT_DEPTH))
++               swr_store_render_target(
++                  ctx, SWR_ATTACHMENT_STENCIL, SWR_TILE_INVALID);
++            SwrWaitForIdle(ctx->swrContext);
++            break;
++         }
++   }
++
++
++   pt = CALLOC_STRUCT(pipe_transfer);
++   if (!pt)
++      return NULL;
++   pipe_resource_reference(&pt->resource, resource);
++   pt->level = level;
++   pt->box = *box;
++   pt->stride = spr->row_stride[level];
++   pt->layer_stride = spr->img_stride[level];
++
++   /* if we're mapping the depth/stencil, copy in stencil */
++   if (spr->base.format == PIPE_FORMAT_Z24_UNORM_S8_UINT
++       && spr->has_stencil) {
++      for (unsigned i = 0; i < spr->alignedWidth * spr->alignedHeight; i++) {
++         spr->swr.pBaseAddress[4 * i + 3] = spr->secondary.pBaseAddress[i];
++      }
++   }
++
++   unsigned offset = box->z * pt->layer_stride + box->y * pt->stride
++      + box->x * util_format_get_blocksize(format);
++
++   *transfer = pt;
++
++   return spr->swr.pBaseAddress + offset + spr->mip_offsets[level];
++}
++
++static void
++swr_transfer_unmap(struct pipe_context *pipe, struct pipe_transfer *transfer)
++{
++   assert(transfer->resource);
++
++   /*
++    * XXX TODO: use fences and come up with a real resource manager.
++    *
++    * If this resource has been mapped/unmapped, it's probably in use. Tag
++    * it with this context so we'll know to check dependencies when it's
++    * deleted.
++    */
++   struct swr_resource *res = swr_resource(transfer->resource);
++   res->bound_to_context = (void *)pipe;
++
++   /* if we're mapping the depth/stencil, copy out stencil */
++   if (res->base.format == PIPE_FORMAT_Z24_UNORM_S8_UINT
++       && res->has_stencil) {
++      for (unsigned i = 0; i < res->alignedWidth * res->alignedHeight; i++) {
++         res->secondary.pBaseAddress[i] = res->swr.pBaseAddress[4 * i + 3];
++      }
++   }
++
++   pipe_resource_reference(&transfer->resource, NULL);
++   FREE(transfer);
++}
++
++
++static void
++swr_resource_copy(struct pipe_context *pipe,
++                  struct pipe_resource *dst,
++                  unsigned dst_level,
++                  unsigned dstx,
++                  unsigned dsty,
++                  unsigned dstz,
++                  struct pipe_resource *src,
++                  unsigned src_level,
++                  const struct pipe_box *src_box)
++{
++   if ((dst->target == PIPE_BUFFER && src->target == PIPE_BUFFER)
++       || (dst->target != PIPE_BUFFER && src->target != PIPE_BUFFER)) {
++      util_resource_copy_region(
++         pipe, dst, dst_level, dstx, dsty, dstz, src, src_level, src_box);
++      return;
++   }
++
++   debug_printf("unhandled swr_resource_copy\n");
++}
++
++
++static void
++swr_blit(struct pipe_context *pipe, const struct pipe_blit_info *blit_info)
++{
++   struct swr_context *ctx = swr_context(pipe);
++   struct pipe_blit_info info = *blit_info;
++
++   if (blit_info->render_condition_enable && !swr_check_render_cond(pipe))
++      return;
++
++   if (info.src.resource->nr_samples > 1 && info.dst.resource->nr_samples <= 1
++       && !util_format_is_depth_or_stencil(info.src.resource->format)
++       && !util_format_is_pure_integer(info.src.resource->format)) {
++      debug_printf("swr: color resolve unimplemented\n");
++      return;
++   }
++
++   if (util_try_blit_via_copy_region(pipe, &info)) {
++      return; /* done */
++   }
++
++   if (info.mask & PIPE_MASK_S) {
++      debug_printf("swr: cannot blit stencil, skipping\n");
++      info.mask &= ~PIPE_MASK_S;
++   }
++
++   if (!util_blitter_is_blit_supported(ctx->blitter, &info)) {
++      debug_printf("swr: blit unsupported %s -> %s\n",
++                   util_format_short_name(info.src.resource->format),
++                   util_format_short_name(info.dst.resource->format));
++      return;
++   }
++
++   /* XXX turn off occlusion and streamout queries */
++
++   util_blitter_save_vertex_buffer_slot(ctx->blitter, ctx->vertex_buffer);
++   util_blitter_save_vertex_elements(ctx->blitter, (void *)ctx->velems);
++   util_blitter_save_vertex_shader(ctx->blitter, (void *)ctx->vs);
++   /*util_blitter_save_geometry_shader(ctx->blitter, (void*)ctx->gs);*/
++   util_blitter_save_so_targets(
++      ctx->blitter,
++      ctx->num_so_targets,
++      (struct pipe_stream_output_target **)ctx->so_targets);
++   util_blitter_save_rasterizer(ctx->blitter, (void *)ctx->rasterizer);
++   util_blitter_save_viewport(ctx->blitter, &ctx->viewport);
++   util_blitter_save_scissor(ctx->blitter, &ctx->scissor);
++   util_blitter_save_fragment_shader(ctx->blitter, ctx->fs);
++   util_blitter_save_blend(ctx->blitter, (void *)ctx->blend);
++   util_blitter_save_depth_stencil_alpha(ctx->blitter,
++                                         (void *)ctx->depth_stencil);
++   util_blitter_save_stencil_ref(ctx->blitter, &ctx->stencil_ref);
++   util_blitter_save_sample_mask(ctx->blitter, ctx->sample_mask);
++   util_blitter_save_framebuffer(ctx->blitter, &ctx->framebuffer);
++   util_blitter_save_fragment_sampler_states(
++      ctx->blitter,
++      ctx->num_samplers[PIPE_SHADER_FRAGMENT],
++      (void **)ctx->samplers[PIPE_SHADER_FRAGMENT]);
++   util_blitter_save_fragment_sampler_views(
++      ctx->blitter,
++      ctx->num_sampler_views[PIPE_SHADER_FRAGMENT],
++      ctx->sampler_views[PIPE_SHADER_FRAGMENT]);
++   util_blitter_save_render_condition(ctx->blitter,
++                                      ctx->render_cond_query,
++                                      ctx->render_cond_cond,
++                                      ctx->render_cond_mode);
++
++   util_blitter_blit(ctx->blitter, &info);
++}
++
++
++static void
++swr_destroy(struct pipe_context *pipe)
++{
++   struct swr_context *ctx = swr_context(pipe);
++
++   if (ctx->blitter)
++      util_blitter_destroy(ctx->blitter);
++
++   if (ctx->swrContext)
++      SwrDestroyContext(ctx->swrContext);
++
++   delete ctx->blendJIT;
++
++   swr_destroy_scratch_buffers(ctx);
++
++   FREE(ctx);
++}
++
++
++static void
++swr_render_condition(struct pipe_context *pipe,
++                     struct pipe_query *query,
++                     boolean condition,
++                     uint mode)
++{
++   struct swr_context *ctx = swr_context(pipe);
++
++   ctx->render_cond_query = query;
++   ctx->render_cond_mode = mode;
++   ctx->render_cond_cond = condition;
++}
++
++
++struct pipe_context *
++swr_create_context(struct pipe_screen *screen, void *priv)
++{
++   struct swr_context *ctx = CALLOC_STRUCT(swr_context);
++   ctx->blendJIT =
++      new std::unordered_map<BLEND_COMPILE_STATE, PFN_BLEND_JIT_FUNC>;
++
++   SWR_CREATECONTEXT_INFO createInfo;
++   createInfo.driver = GL;
++   createInfo.privateStateSize = sizeof(swr_draw_context);
++   createInfo.pfnLoadTile = swr_LoadHotTile;
++   createInfo.pfnStoreTile = swr_StoreHotTile;
++   createInfo.pfnClearTile = swr_StoreHotTileClear;
++   ctx->swrContext = SwrCreateContext(&createInfo);
++
++   /* Init Load/Store/ClearTiles Tables */
++   swr_InitMemoryModule();
++
++   if (ctx->swrContext == NULL)
++      goto fail;
++
++   ctx->pipe.screen = screen;
++   ctx->pipe.destroy = swr_destroy;
++   ctx->pipe.priv = priv;
++   ctx->pipe.create_surface = swr_create_surface;
++   ctx->pipe.surface_destroy = swr_surface_destroy;
++   ctx->pipe.transfer_map = swr_transfer_map;
++   ctx->pipe.transfer_unmap = swr_transfer_unmap;
++
++   ctx->pipe.transfer_flush_region = u_default_transfer_flush_region;
++   ctx->pipe.transfer_inline_write = u_default_transfer_inline_write;
++
++   ctx->pipe.resource_copy_region = swr_resource_copy;
++   ctx->pipe.render_condition = swr_render_condition;
++
++   swr_state_init(&ctx->pipe);
++   swr_clear_init(&ctx->pipe);
++   swr_draw_init(&ctx->pipe);
++   swr_query_init(&ctx->pipe);
++
++   ctx->pipe.blit = swr_blit;
++   ctx->blitter = util_blitter_create(&ctx->pipe);
++   if (!ctx->blitter) {
++      goto fail;
++   }
++
++   swr_init_scratch_buffers(ctx);
++
++   return &ctx->pipe;
++
++fail:
++   /* Should really validate the init steps and fail gracefully */
++   swr_destroy(&ctx->pipe);
++   return NULL;
++}
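swr_transfer_map()/swr_transfer_unmap() above hide a quirk worth spelling out: Gallium treats Z24S8 as one fused resource, while SWR keeps the 8-bit stencil plane in a secondary surface. Byte 3 of each little-endian 32-bit Z24S8 texel is the stencil byte, so mapping copies the plane in and unmapping copies it back out. A self-contained model of that interleave follows; the buffer names are invented:

```cpp
// Standalone model of the stencil copy done in swr_transfer_map /
// swr_transfer_unmap. In PIPE_FORMAT_Z24_UNORM_S8_UINT each 32-bit texel is
// 24 bits of depth plus 8 bits of stencil; with SWR's separate stencil
// plane, stencil lives at byte offset 3 of every 4-byte texel.
#include <cstddef>
#include <cstdint>

// map: merge the separate stencil plane into the fused Z24S8 image
static void
interleave_stencil(uint8_t *z24s8, const uint8_t *stencil, size_t texels)
{
   for (size_t i = 0; i < texels; i++)
      z24s8[4 * i + 3] = stencil[i];
}

// unmap: extract stencil written by the CPU back into the separate plane
static void
deinterleave_stencil(uint8_t *stencil, const uint8_t *z24s8, size_t texels)
{
   for (size_t i = 0; i < texels; i++)
      stencil[i] = z24s8[4 * i + 3];
}
```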
+diff --git a/src/gallium/drivers/swr/swr_context.h b/src/gallium/drivers/swr/swr_context.h
+new file mode 100644
+index 0000000..9d93a6d
+--- /dev/null
++++ b/src/gallium/drivers/swr/swr_context.h
+@@ -0,0 +1,172 @@
++/****************************************************************************
++ * Copyright (C) 2015 Intel Corporation. All Rights Reserved.
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a
++ * copy of this software and associated documentation files (the "Software"),
++ * to deal in the Software without restriction, including without limitation
++ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
++ * and/or sell copies of the Software, and to permit persons to whom the
++ * Software is furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice (including the next
++ * paragraph) shall be included in all copies or substantial portions of the
++ * Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
++ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ ***************************************************************************/
++
++#ifndef SWR_CONTEXT_H
++#define SWR_CONTEXT_H
++
++#include "pipe/p_context.h"
++#include "pipe/p_state.h"
++#include "util/u_blitter.h"
++#include "jit_api.h"
++#include "swr_state.h"
++#include <unordered_map>
++
++#define SWR_NEW_BLEND (1 << 0)
++#define SWR_NEW_RASTERIZER (1 << 1)
++#define SWR_NEW_DEPTH_STENCIL_ALPHA (1 << 2)
++#define SWR_NEW_SAMPLER (1 << 3)
++#define SWR_NEW_SAMPLER_VIEW (1 << 4)
++#define SWR_NEW_VS (1 << 5)
++#define SWR_NEW_FS (1 << 6)
++#define SWR_NEW_VSCONSTANTS (1 << 7)
++#define SWR_NEW_FSCONSTANTS (1 << 8)
++#define SWR_NEW_VERTEX (1 << 9)
++#define SWR_NEW_STIPPLE (1 << 10)
++#define SWR_NEW_SCISSOR (1 << 11)
++#define SWR_NEW_VIEWPORT (1 << 12)
++#define SWR_NEW_FRAMEBUFFER (1 << 13)
++#define SWR_NEW_CLIP (1 << 14)
++#define SWR_NEW_SO (1 << 15)
++#define SWR_NEW_ALL 0x0000ffff
++
++namespace std
++{
++template <> struct hash<BLEND_COMPILE_STATE> {
++   std::size_t operator()(const BLEND_COMPILE_STATE &k) const
++   {
++      return util_hash_crc32(&k, sizeof(k));
++   }
++};
++};
++
++struct swr_context {
++   struct pipe_context pipe; /**< base class */
++
++   HANDLE swrContext;
++
++   /** Constant state objects */
++   struct swr_blend_state *blend;
++   struct pipe_sampler_state *samplers[PIPE_SHADER_TYPES][PIPE_MAX_SAMPLERS];
++   struct pipe_depth_stencil_alpha_state *depth_stencil;
++   struct pipe_rasterizer_state *rasterizer;
++
++   struct swr_vertex_shader *vs;
++   struct swr_fragment_shader *fs;
++   struct swr_vertex_element_state *velems;
++
++   /** Other rendering state */
++   struct pipe_blend_color blend_color;
++   struct pipe_stencil_ref stencil_ref;
++   struct pipe_clip_state clip;
++   struct pipe_constant_buffer
++      constants[PIPE_SHADER_TYPES][PIPE_MAX_CONSTANT_BUFFERS];
++   struct pipe_framebuffer_state framebuffer;
++   struct pipe_poly_stipple poly_stipple;
++   struct pipe_scissor_state scissor;
++   struct pipe_sampler_view *
++      sampler_views[PIPE_SHADER_TYPES][PIPE_MAX_SHADER_SAMPLER_VIEWS];
++
++   struct pipe_viewport_state viewport;
++   struct pipe_vertex_buffer vertex_buffer[PIPE_MAX_ATTRIBS];
++   struct pipe_index_buffer index_buffer;
++
++   struct blitter_context *blitter;
++
++   /** Conditional query object and mode */
++   struct pipe_query *render_cond_query;
++   uint render_cond_mode;
++   boolean render_cond_cond;
++   unsigned active_queries;
++
++   unsigned num_vertex_buffers;
++   unsigned num_samplers[PIPE_SHADER_TYPES];
++   unsigned num_sampler_views[PIPE_SHADER_TYPES];
++
++   unsigned sample_mask;
++
++   // streamout
++   pipe_stream_output_target *so_targets[MAX_SO_STREAMS];
++   uint32_t num_so_targets;
++
++   /* Temp storage for user_buffer constants */
++   struct swr_scratch_buffers *scratch;
++
++   // blend jit functions
++   std::unordered_map<BLEND_COMPILE_STATE, PFN_BLEND_JIT_FUNC> *blendJIT;
++
++   /* Shadows of current SWR API DrawState */
++   struct swr_shadow_state current;
++
++   unsigned dirty; /**< Mask of SWR_NEW_x flags */
++};
++
++struct swr_jit_texture {
++   uint32_t width; // same as number of elements
++   uint32_t height;
++   uint32_t depth; // doubles as array size
++   uint32_t first_level;
++   uint32_t last_level;
++   const void *base_ptr;
++   uint32_t row_stride[PIPE_MAX_TEXTURE_LEVELS];
++   uint32_t img_stride[PIPE_MAX_TEXTURE_LEVELS];
++   uint32_t mip_offsets[PIPE_MAX_TEXTURE_LEVELS];
++};
++
++struct swr_jit_sampler {
++   float min_lod;
++   float max_lod;
++   float lod_bias;
++   float border_color[4];
++};
++
++struct swr_draw_context {
++   const float *constantVS[PIPE_MAX_CONSTANT_BUFFERS];
++   unsigned num_constantsVS[PIPE_MAX_CONSTANT_BUFFERS];
++   const float *constantFS[PIPE_MAX_CONSTANT_BUFFERS];
++   unsigned num_constantsFS[PIPE_MAX_CONSTANT_BUFFERS];
++
++   swr_jit_texture texturesVS[PIPE_MAX_SHADER_SAMPLER_VIEWS];
++   swr_jit_sampler samplersVS[PIPE_MAX_SAMPLERS];
++   swr_jit_texture texturesFS[PIPE_MAX_SHADER_SAMPLER_VIEWS];
++   swr_jit_sampler samplersFS[PIPE_MAX_SAMPLERS];
++
++   SWR_SURFACE_STATE renderTargets[SWR_NUM_ATTACHMENTS];
++};
++
++
++static INLINE struct swr_context *
++swr_context(struct pipe_context *pipe)
++{
++   return (struct swr_context *)pipe;
++}
++
++struct pipe_context *swr_create_context(struct pipe_screen *, void *priv);
++
++void swr_state_init(struct pipe_context *pipe);
++
++void swr_clear_init(struct pipe_context *pipe);
++
++void swr_draw_init(struct pipe_context *pipe);
++
++void swr_finish(struct pipe_context *pipe);
++#endif
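The std::hash specialization above keys the blend-JIT cache by CRC-hashing the raw bytes of BLEND_COMPILE_STATE, which is sound only because that state is plain-old-data compared byte-for-byte. A reduced, compilable version of the same caching pattern follows; the key struct, the FNV-1a stand-in for Mesa's util_hash_crc32, and the compile callback are all fabricated for illustration:

```cpp
// The pattern behind ctx->blendJIT: hash a POD state key byte-for-byte and
// memoize compiled functions in an unordered_map.
#include <cstdint>
#include <cstring>
#include <unordered_map>

struct ExampleCompileState { // must be POD: hashed/compared as raw bytes
   uint32_t blend_func;
   uint32_t src_factor;
   uint32_t dst_factor;
};

namespace std
{
template <> struct hash<ExampleCompileState> {
   std::size_t operator()(const ExampleCompileState &k) const
   {
      // byte-wise 64-bit FNV-1a; the driver uses util_hash_crc32 instead
      const uint8_t *p = reinterpret_cast<const uint8_t *>(&k);
      unsigned long long h = 14695981039346656037ull;
      for (size_t i = 0; i < sizeof(k); i++)
         h = (h ^ p[i]) * 1099511628211ull;
      return static_cast<std::size_t>(h);
   }
};
}

inline bool
operator==(const ExampleCompileState &a, const ExampleCompileState &b)
{
   return std::memcmp(&a, &b, sizeof(a)) == 0; // POD: bytewise equality
}

using JitFn = void (*)();
static std::unordered_map<ExampleCompileState, JitFn> jit_cache;

static JitFn
get_or_compile(const ExampleCompileState &key,
               JitFn (*compile)(const ExampleCompileState &))
{
   auto it = jit_cache.find(key);
   if (it != jit_cache.end())
      return it->second;    // cache hit: reuse the compiled function
   JitFn fn = compile(key); // miss: JIT-compile once...
   jit_cache[key] = fn;     // ...and memoize for identical state
   return fn;
}
```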
+diff --git a/src/gallium/drivers/swr/swr_context_llvm.h b/src/gallium/drivers/swr/swr_context_llvm.h
+new file mode 100644
+index 0000000..58da813
+--- /dev/null
++++ b/src/gallium/drivers/swr/swr_context_llvm.h
+@@ -0,0 +1,124 @@
++/****************************************************************************
++ * Copyright (C) 2015 Intel Corporation. All Rights Reserved.
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a
++ * copy of this software and associated documentation files (the "Software"),
++ * to deal in the Software without restriction, including without limitation
++ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
++ * and/or sell copies of the Software, and to permit persons to whom the
++ * Software is furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice (including the next
++ * paragraph) shall be included in all copies or substantial portions of the
++ * Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
++ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ ***************************************************************************/
++
++#pragma once
++
++//////////////////////////////////////////////////////////////////////////
++/// Generate LLVM type information for swr_jit_texture
++INLINE static StructType *
++Gen_swr_jit_texture(JitManager *pShG)
++{
++   LLVMContext &ctx = pShG->mContext;
++   std::vector<Type *> members;
++
++   members.push_back(Type::getInt32Ty(ctx)); // width
++   members.push_back(Type::getInt32Ty(ctx)); // height
++   members.push_back(Type::getInt32Ty(ctx)); // depth
++   members.push_back(Type::getInt32Ty(ctx)); // first_level
++   members.push_back(Type::getInt32Ty(ctx)); // last_level
++   members.push_back(PointerType::get(Type::getInt8Ty(ctx), 0)); // base_ptr
++   members.push_back(ArrayType::get(Type::getInt32Ty(ctx),
++                                    PIPE_MAX_TEXTURE_LEVELS)); // row_stride
++   members.push_back(ArrayType::get(Type::getInt32Ty(ctx),
++                                    PIPE_MAX_TEXTURE_LEVELS)); // img_stride
++   members.push_back(ArrayType::get(Type::getInt32Ty(ctx),
++                                    PIPE_MAX_TEXTURE_LEVELS)); // mip_offsets
++
++   return StructType::get(ctx, members, false);
++}
++
++static const UINT swr_jit_texture_width = 0;
++static const UINT swr_jit_texture_height = 1;
++static const UINT swr_jit_texture_depth = 2;
++static const UINT swr_jit_texture_first_level = 3;
++static const UINT swr_jit_texture_last_level = 4;
++static const UINT swr_jit_texture_base_ptr = 5;
++static const UINT swr_jit_texture_row_stride = 6;
++static const UINT swr_jit_texture_img_stride = 7;
++static const UINT swr_jit_texture_mip_offsets = 8;
++
++//////////////////////////////////////////////////////////////////////////
++/// Generate LLVM type information for swr_jit_sampler
++INLINE static StructType *
++Gen_swr_jit_sampler(JitManager *pShG)
++{
++   LLVMContext &ctx = pShG->mContext;
++   std::vector<Type *> members;
++
++   members.push_back(Type::getFloatTy(ctx)); // min_lod
++   members.push_back(Type::getFloatTy(ctx)); // max_lod
++   members.push_back(Type::getFloatTy(ctx)); // lod_bias
++   members.push_back(
++      ArrayType::get(Type::getFloatTy(ctx), 4)); // border_color
++
++   return StructType::get(ctx, members, false);
++}
++
++static const UINT swr_jit_sampler_min_lod = 0;
++static const UINT swr_jit_sampler_max_lod = 1;
++static const UINT swr_jit_sampler_lod_bias = 2;
++static const UINT swr_jit_sampler_border_color = 3;
++
++//////////////////////////////////////////////////////////////////////////
++/// Generate LLVM type information for swr_draw_context
++INLINE static StructType *
++Gen_swr_draw_context(JitManager *pShG)
++{
++   LLVMContext &ctx = pShG->mContext;
++   std::vector<Type *> members;
++
++   members.push_back(
++      ArrayType::get(PointerType::get(Type::getFloatTy(ctx), 0),
++                     PIPE_MAX_CONSTANT_BUFFERS)); // constantVS
++   members.push_back(ArrayType::get(
++      Type::getInt32Ty(ctx), PIPE_MAX_CONSTANT_BUFFERS)); // num_constantsVS
++   members.push_back(
++      ArrayType::get(PointerType::get(Type::getFloatTy(ctx), 0),
++                     PIPE_MAX_CONSTANT_BUFFERS)); // constantFS
++   members.push_back(ArrayType::get(
++      Type::getInt32Ty(ctx), PIPE_MAX_CONSTANT_BUFFERS)); // num_constantsFS
++   members.push_back(
++      ArrayType::get(Gen_swr_jit_texture(pShG),
++                     PIPE_MAX_SHADER_SAMPLER_VIEWS)); // texturesVS
++   members.push_back(ArrayType::get(Gen_swr_jit_sampler(pShG),
++                                    PIPE_MAX_SAMPLERS)); // samplersVS
++   members.push_back(
++      ArrayType::get(Gen_swr_jit_texture(pShG),
++                     PIPE_MAX_SHADER_SAMPLER_VIEWS)); // texturesFS
++   members.push_back(ArrayType::get(Gen_swr_jit_sampler(pShG),
++                                    PIPE_MAX_SAMPLERS)); // samplersFS
++   members.push_back(ArrayType::get(Gen_SWR_SURFACE_STATE(pShG),
++                                    SWR_NUM_ATTACHMENTS)); // renderTargets
++
++   return StructType::get(ctx, members, false);
++}
++
++static const UINT swr_draw_context_constantVS = 0;
++static const UINT swr_draw_context_num_constantsVS = 1;
++static const UINT swr_draw_context_constantFS = 2;
++static const UINT swr_draw_context_num_constantsFS = 3;
++static const UINT swr_draw_context_texturesVS = 4;
++static const UINT swr_draw_context_samplersVS = 5;
++static const UINT swr_draw_context_texturesFS = 6;
++static const UINT swr_draw_context_samplersFS = 7;
++static const UINT swr_draw_context_renderTargets = 8;
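Gen_swr_jit_texture() and friends exist so JIT-compiled code can address fields of the C-side structs by GEP index: the StructType member order must match the C declaration order exactly, which is why each Gen_* function is paired with a block of index constants. A trimmed illustration of the pairing, written against the LLVM C++ API of the 3.6 timeframe; the two-field struct is made up for the example:

```cpp
// Shape of the mirroring pattern used by Gen_swr_jit_texture() et al.:
// an LLVM StructType whose member order matches the C struct, plus GEP
// index constants so JIT code and C code cannot silently disagree.
#include <cstdint>
#include <vector>
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"

struct example_state {    // C-side layout...
   uint32_t width;        //   GEP index 0
   const void *base_ptr;  //   GEP index 1
};

static llvm::StructType *
Gen_example_state(llvm::LLVMContext &ctx)
{
   std::vector<llvm::Type *> members;
   members.push_back(llvm::Type::getInt32Ty(ctx));                // width
   members.push_back(
      llvm::PointerType::get(llvm::Type::getInt8Ty(ctx), 0));     // base_ptr
   // isPacked = false: keep natural alignment/padding, matching the C ABI
   return llvm::StructType::get(ctx, members, false);
}

// GEP indices: must track the push_back order above.
static const unsigned example_state_width = 0;
static const unsigned example_state_base_ptr = 1;
```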
++ ***************************************************************************/
++
++#include "swr_screen.h"
++#include "swr_context.h"
++#include "swr_resource.h"
++#include "swr_fence.h"
++#include "swr_query.h"
++#include "jit_api.h"
++
++#include "util/u_draw.h"
++#include "util/u_prim.h"
++
++/*
++ * Convert Mesa PIPE_PRIM_X to SWR enum PRIMITIVE_TOPOLOGY
++ */
++static INLINE enum PRIMITIVE_TOPOLOGY
++swr_convert_prim_topology(const unsigned mode)
++{
++   switch (mode) {
++   case PIPE_PRIM_POINTS:
++      return TOP_POINT_LIST;
++   case PIPE_PRIM_LINES:
++      return TOP_LINE_LIST;
++   case PIPE_PRIM_LINE_LOOP:
++      return TOP_LINE_LOOP;
++   case PIPE_PRIM_LINE_STRIP:
++      return TOP_LINE_STRIP;
++   case PIPE_PRIM_TRIANGLES:
++      return TOP_TRIANGLE_LIST;
++   case PIPE_PRIM_TRIANGLE_STRIP:
++      return TOP_TRIANGLE_STRIP;
++   case PIPE_PRIM_TRIANGLE_FAN:
++      return TOP_TRIANGLE_FAN;
++   case PIPE_PRIM_QUADS:
++      return TOP_QUAD_LIST;
++   case PIPE_PRIM_QUAD_STRIP:
++      return TOP_QUAD_STRIP;
++   case PIPE_PRIM_POLYGON:
++      return TOP_TRIANGLE_FAN; /* XXX TOP_POLYGON; */
++   case PIPE_PRIM_LINES_ADJACENCY:
++      return TOP_LINE_LIST_ADJ;
++   case PIPE_PRIM_LINE_STRIP_ADJACENCY:
++      return TOP_LISTSTRIP_ADJ;
++   case PIPE_PRIM_TRIANGLES_ADJACENCY:
++      return TOP_TRI_LIST_ADJ;
++   case PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY:
++      return TOP_TRI_STRIP_ADJ;
++   default:
++      assert(0 && "Unknown topology");
++      return TOP_UNKNOWN;
++   }
++}
++
++
++/*
++ * Draw vertex arrays, with optional indexing, optional instancing.
++ */
++static void
++swr_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
++{
++   struct swr_context *ctx = swr_context(pipe);
++
++   if (!swr_check_render_cond(pipe))
++      return;
++
++   if (info->indirect) {
++      util_draw_indirect(pipe, info);
++      return;
++   }
++
++   /* Update derived state, pass draw info to update function */
++   if (ctx->dirty)
++      swr_update_derived(ctx, info);
++
++   if (ctx->vs->pipe.stream_output.num_outputs) {
++      if (!ctx->vs->soFunc[info->mode]) {
++         STREAMOUT_COMPILE_STATE state = {0};
++         struct pipe_stream_output_info *so = &ctx->vs->pipe.stream_output;
++
++         state.numVertsPerPrim = u_vertices_per_prim(info->mode);
++
++         uint32_t offsets[MAX_SO_STREAMS] = {0};
++         uint32_t num = 0;
++
++         for (uint32_t i = 0; i < so->num_outputs; i++) {
++            assert(so->output[i].stream == 0); // @todo
++            uint32_t output_buffer = so->output[i].output_buffer;
++            if (so->output[i].dst_offset != offsets[output_buffer]) {
++               // hole - need to fill
++               state.stream.decl[num].bufferIndex = output_buffer;
++               state.stream.decl[num].hole = true;
++               state.stream.decl[num].componentMask =
++                  (1 << (so->output[i].dst_offset - offsets[output_buffer]))
++                  - 1;
++               num++;
++               offsets[output_buffer] = so->output[i].dst_offset;
++            }
++
++            state.stream.decl[num].bufferIndex = output_buffer;
++            state.stream.decl[num].attribSlot = so->output[i].register_index - 1;
++            state.stream.decl[num].componentMask =
++               ((1 << so->output[i].num_components) - 1)
++               << so->output[i].start_component;
++            state.stream.decl[num].hole = false;
++            num++;
++
++            offsets[output_buffer] += so->output[i].num_components;
++         }
++
++         state.stream.numDecls = num;
++
++         HANDLE hJitMgr = swr_screen(pipe->screen)->hJitMgr;
++         ctx->vs->soFunc[info->mode] = JitCompileStreamout(hJitMgr, state);
++         debug_printf("so shader %p\n", ctx->vs->soFunc[info->mode]);
++         assert(ctx->vs->soFunc[info->mode] && "Error: SoShader = NULL");
++      }
++
++      SwrSetSoFunc(ctx->swrContext, ctx->vs->soFunc[info->mode], 0);
++   }
++
++   struct swr_vertex_element_state *velems = ctx->velems;
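++
++   /* Fetch shaders are JIT-compiled per vertex-element layout and cached;
++    * we only recompile when no shader exists yet or when the
++    * primitive-restart state baked into the compiled shader has changed. */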
++ if (!velems->fsFunc ++ || (velems->fsState.cutIndex != info->restart_index) ++ || (velems->fsState.bEnableCutIndex != info->primitive_restart)) { ++ ++ velems->fsState.cutIndex = info->restart_index; ++ velems->fsState.bEnableCutIndex = info->primitive_restart; ++ ++ /* Create Fetch Shader */ ++ HANDLE hJitMgr = swr_screen(ctx->pipe.screen)->hJitMgr; ++ velems->fsFunc = JitCompileFetch(hJitMgr, velems->fsState); ++ ++ debug_printf("fetch shader %p\n", velems->fsFunc); ++ assert(velems->fsFunc && "Error: FetchShader = NULL"); ++ } ++ ++ SwrSetFetchFunc(ctx->swrContext, velems->fsFunc); ++ ++ if (info->indexed) ++ SwrDrawIndexedInstanced(ctx->swrContext, ++ swr_convert_prim_topology(info->mode), ++ info->count, ++ info->instance_count, ++ info->start, ++ info->index_bias, ++ info->start_instance); ++ else ++ SwrDrawInstanced(ctx->swrContext, ++ swr_convert_prim_topology(info->mode), ++ info->count, ++ info->instance_count, ++ info->start, ++ info->start_instance); ++} ++ ++ ++static void ++swr_flush(struct pipe_context *pipe, ++ struct pipe_fence_handle **fence, ++ unsigned flags) ++{ ++ struct swr_context *ctx = swr_context(pipe); ++ struct swr_screen *screen = swr_screen(pipe->screen); ++ ++ /* If the current renderTarget is the display surface, store tiles back to ++ * the surface, in ++ * preparation for present (swr_flush_frontbuffer) ++ */ ++ struct pipe_surface *cb = ctx->framebuffer.cbufs[0]; ++ if (cb && swr_resource(cb->texture)->display_target) ++ swr_store_render_target(ctx, SWR_ATTACHMENT_COLOR0, SWR_TILE_RESOLVED); ++ ++ // SwrStoreTiles is asynchronous, always submit the "flush" fence. ++ // flush_frontbuffer needs it. ++ swr_fence_submit(ctx, screen->flush_fence); ++ ++ if (fence) ++ swr_fence_reference(pipe->screen, fence, screen->flush_fence); ++} ++ ++void ++swr_finish(struct pipe_context *pipe) ++{ ++ struct swr_screen *screen = swr_screen(pipe->screen); ++ struct pipe_fence_handle *fence = NULL; ++ ++ swr_flush(pipe, &fence, 0); ++ swr_fence_finish(&screen->base, fence, 0); ++ swr_fence_reference(&screen->base, &fence, NULL); ++} ++ ++ ++/* ++ * Store SWR HotTiles back to RenderTarget surface. ++ */ ++void ++swr_store_render_target(struct swr_context *ctx, ++ uint32_t attachment, ++ enum SWR_TILE_STATE post_tile_state, ++ struct SWR_SURFACE_STATE *surface) ++{ ++ struct swr_draw_context *pDC = ++ (swr_draw_context *)SwrGetPrivateContextState(ctx->swrContext); ++ struct SWR_SURFACE_STATE *renderTarget = &pDC->renderTargets[attachment]; ++ ++ /* If the passed in surface isn't already attached, it will be attached and ++ * then restored. 
*/ ++ if (surface && (surface != ctx->current.attachment[attachment])) ++ *renderTarget = *surface; ++ ++ /* Only proceed if there's a valid surface to store to */ ++ if (renderTarget->pBaseAddress) { ++ /* Set viewport to full renderTarget width/height and disable scissor ++ * before StoreTiles */ ++ boolean change_viewport = ++ (ctx->current.vp.x != 0.0f || ctx->current.vp.y != 0.0f ++ || ctx->current.vp.width != renderTarget->width ++ || ctx->current.vp.height != renderTarget->height); ++ if (change_viewport) { ++ SWR_VIEWPORT vp = {0}; ++ vp.width = renderTarget->width; ++ vp.height = renderTarget->height; ++ SwrSetViewports(ctx->swrContext, 1, &vp, NULL); ++ } ++ ++ boolean scissor_enable = ctx->current.rastState.scissorEnable; ++ if (scissor_enable) { ++ ctx->current.rastState.scissorEnable = FALSE; ++ SwrSetRastState(ctx->swrContext, &ctx->current.rastState); ++ } ++ ++ SwrStoreTiles(ctx->swrContext, ++ (enum SWR_RENDERTARGET_ATTACHMENT)attachment, ++ post_tile_state); ++ ++ /* Restore viewport and scissor enable */ ++ if (change_viewport) ++ SwrSetViewports(ctx->swrContext, 1, &ctx->current.vp, &ctx->current.vpm); ++ if (scissor_enable) { ++ ctx->current.rastState.scissorEnable = scissor_enable; ++ SwrSetRastState(ctx->swrContext, &ctx->current.rastState); ++ } ++ ++ /* Restore surface attachment, if changed */ ++ if (surface && (surface != ctx->current.attachment[attachment])) ++ *renderTarget = *ctx->current.attachment[attachment]; ++ } ++} ++ ++ ++void ++swr_draw_init(struct pipe_context *pipe) ++{ ++ pipe->draw_vbo = swr_draw_vbo; ++ pipe->flush = swr_flush; ++} +diff --git a/src/gallium/drivers/swr/swr_fence.cpp b/src/gallium/drivers/swr/swr_fence.cpp +new file mode 100644 +index 0000000..aaf7223 +--- /dev/null ++++ b/src/gallium/drivers/swr/swr_fence.cpp +@@ -0,0 +1,141 @@ ++/**************************************************************************** ++ * Copyright (C) 2015 Intel Corporation. All Rights Reserved. ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the "Software"), ++ * to deal in the Software without restriction, including without limitation ++ * the rights to use, copy, modify, merge, publish, distribute, sublicense, ++ * and/or sell copies of the Software, and to permit persons to whom the ++ * Software is furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice (including the next ++ * paragraph) shall be included in all copies or substantial portions of the ++ * Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++ * IN THE SOFTWARE. ++ ***************************************************************************/ ++ ++#include "pipe/p_screen.h" ++#include "util/u_memory.h" ++#include "os/os_time.h" ++ ++#include "swr_context.h" ++#include "swr_screen.h" ++#include "swr_fence.h" ++ ++ ++/* ++ * Fence callback, called by back-end thread on completion of all rendering up ++ * to SwrSync call. 
++ */
++static void
++swr_sync_cb(UINT64 userData, UINT64 userData2)
++{
++   struct swr_fence *fence = (struct swr_fence *)userData;
++
++   fence->read = fence->write;
++}
++
++/*
++ * Submit an existing fence.
++ */
++void
++swr_fence_submit(struct swr_context *ctx, struct pipe_fence_handle *fh)
++{
++   struct swr_fence *fence = swr_fence(fh);
++
++   fence->write++;
++   SwrSync(ctx->swrContext, swr_sync_cb, (UINT64)fence, 0);
++}
++
++/*
++ * Create a new fence object.
++ */
++struct pipe_fence_handle *
++swr_fence_create()
++{
++   static int fence_id = 0;
++   struct swr_fence *fence = CALLOC_STRUCT(swr_fence);
++   if (!fence)
++      return NULL;
++
++   memset(fence, 0, sizeof(*fence));
++   pipe_reference_init(&fence->reference, 1);
++   fence->id = fence_id++;
++
++   return (struct pipe_fence_handle *)fence;
++}
++
++/** Destroy a fence. Called when refcount hits zero. */
++static void
++swr_fence_destroy(struct swr_fence *fence)
++{
++   FREE(fence);
++}
++
++/**
++ * Set ptr = fence, with reference counting
++ */
++void
++swr_fence_reference(struct pipe_screen *screen,
++                    struct pipe_fence_handle **ptr,
++                    struct pipe_fence_handle *f)
++{
++   struct swr_fence *fence = swr_fence(f);
++   struct swr_fence *old;
++
++   if (likely(ptr)) {
++      old = swr_fence(*ptr);
++      *ptr = f;
++   } else {
++      old = NULL;
++   }
++
++   if (pipe_reference(&old->reference, &fence->reference))
++      swr_fence_destroy(old);
++}
++
++/*
++ * Wait for the fence to finish.
++ */
++boolean
++swr_fence_finish(struct pipe_screen *screen,
++                 struct pipe_fence_handle *fence_handle,
++                 uint64_t timeout)
++{
++   struct swr_fence *fence = swr_fence(fence_handle);
++
++   while (!swr_is_fence_done(fence))
++      sched_yield();
++
++   return TRUE;
++}
++
++
++uint64_t
++swr_get_timestamp(struct pipe_screen *screen)
++{
++   return os_time_get_nano();
++}
++
++
++void
++swr_fence_init(struct pipe_screen *p_screen)
++{
++   p_screen->fence_reference = swr_fence_reference;
++   p_screen->fence_finish = swr_fence_finish;
++
++   p_screen->get_timestamp = swr_get_timestamp;
++
++   /*
++    * Create persistent "flush" fence, submitted when swr_flush is called.
++    */
++   struct swr_screen *screen = swr_screen(p_screen);
++   screen->flush_fence = swr_fence_create();
++}
+diff --git a/src/gallium/drivers/swr/swr_fence.h b/src/gallium/drivers/swr/swr_fence.h
+new file mode 100644
+index 0000000..317d74c
+--- /dev/null
++++ b/src/gallium/drivers/swr/swr_fence.h
+@@ -0,0 +1,73 @@
++/****************************************************************************
++ * Copyright (C) 2015 Intel Corporation. All Rights Reserved.
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a
++ * copy of this software and associated documentation files (the "Software"),
++ * to deal in the Software without restriction, including without limitation
++ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
++ * and/or sell copies of the Software, and to permit persons to whom the
++ * Software is furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included
++ * in all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
++ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL ++ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN ++ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN ++ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ++ ***************************************************************************/ ++ ++#ifndef SWR_FENCE_H ++#define SWR_FENCE_H ++ ++ ++#include "os/os_thread.h" ++#include "pipe/p_state.h" ++#include "util/u_inlines.h" ++ ++ ++struct pipe_screen; ++ ++struct swr_fence { ++ struct pipe_reference reference; ++ ++ uint64_t read; ++ uint64_t write; ++ ++ unsigned id; /* Just for reference */ ++}; ++ ++ ++static inline struct swr_fence * ++swr_fence(struct pipe_fence_handle *fence) ++{ ++ return (struct swr_fence *)fence; ++} ++ ++static INLINE boolean ++swr_is_fence_done(struct swr_fence *fence) ++{ ++ return (fence->read == fence->write); ++} ++ ++ ++void swr_fence_init(struct pipe_screen *screen); ++ ++struct pipe_fence_handle *swr_fence_create(); ++ ++void swr_fence_reference(struct pipe_screen *screen, ++ struct pipe_fence_handle **ptr, ++ struct pipe_fence_handle *f); ++ ++boolean swr_fence_finish(struct pipe_screen *screen, ++ struct pipe_fence_handle *fence_handle, ++ uint64_t timeout); ++ ++void ++swr_fence_submit(struct swr_context *ctx, struct pipe_fence_handle *fence); ++ ++uint64_t swr_get_timestamp(struct pipe_screen *screen); ++ ++#endif +diff --git a/src/gallium/drivers/swr/swr_memory.h b/src/gallium/drivers/swr/swr_memory.h +new file mode 100644 +index 0000000..d116781 +--- /dev/null ++++ b/src/gallium/drivers/swr/swr_memory.h +@@ -0,0 +1,99 @@ ++/**************************************************************************** ++ * Copyright (C) 2015 Intel Corporation. All Rights Reserved. ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the "Software"), ++ * to deal in the Software without restriction, including without limitation ++ * the rights to use, copy, modify, merge, publish, distribute, sublicense, ++ * and/or sell copies of the Software, and to permit persons to whom the ++ * Software is furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice (including the next ++ * paragraph) shall be included in all copies or substantial portions of the ++ * Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++ * IN THE SOFTWARE. 
++ ***************************************************************************/ ++ ++#pragma once ++ ++void LoadHotTile( ++ SWR_SURFACE_STATE *pSrcSurface, ++ SWR_FORMAT dstFormat, ++ SWR_RENDERTARGET_ATTACHMENT renderTargetIndex, ++ UINT x, UINT y, uint32_t renderTargetArrayIndex, ++ BYTE *pDstHotTile); ++ ++void StoreHotTile( ++ SWR_SURFACE_STATE *pDstSurface, ++ SWR_FORMAT srcFormat, ++ SWR_RENDERTARGET_ATTACHMENT renderTargetIndex, ++ UINT x, UINT y, uint32_t renderTargetArrayIndex, ++ BYTE *pSrcHotTile); ++ ++void StoreHotTileClear( ++ SWR_SURFACE_STATE *pDstSurface, ++ SWR_RENDERTARGET_ATTACHMENT renderTargetIndex, ++ UINT x, ++ UINT y, ++ const float* pClearColor); ++ ++INLINE void ++swr_LoadHotTile(HANDLE hPrivateContext, ++ SWR_FORMAT dstFormat, ++ SWR_RENDERTARGET_ATTACHMENT renderTargetIndex, ++ UINT x, UINT y, ++ uint32_t renderTargetArrayIndex, BYTE* pDstHotTile) ++{ ++ // Grab source surface state from private context ++ swr_draw_context *pDC = (swr_draw_context*)hPrivateContext; ++ SWR_SURFACE_STATE *pSrcSurface = &pDC->renderTargets[renderTargetIndex]; ++ ++ LoadHotTile(pSrcSurface, dstFormat, renderTargetIndex, x, y, renderTargetArrayIndex, pDstHotTile); ++} ++ ++INLINE void ++swr_StoreHotTile(HANDLE hPrivateContext, ++ SWR_FORMAT srcFormat, ++ SWR_RENDERTARGET_ATTACHMENT renderTargetIndex, ++ UINT x, UINT y, ++ uint32_t renderTargetArrayIndex, BYTE* pSrcHotTile) ++{ ++ // Grab destination surface state from private context ++ swr_draw_context *pDC = (swr_draw_context*)hPrivateContext; ++ SWR_SURFACE_STATE *pDstSurface = &pDC->renderTargets[renderTargetIndex]; ++ ++ StoreHotTile(pDstSurface, srcFormat, renderTargetIndex, x, y, renderTargetArrayIndex, pSrcHotTile); ++} ++ ++INLINE void ++swr_StoreHotTileClear(HANDLE hPrivateContext, ++ SWR_RENDERTARGET_ATTACHMENT renderTargetIndex, ++ UINT x, ++ UINT y, ++ const float* pClearColor) ++{ ++ // Grab destination surface state from private context ++ swr_draw_context *pDC = (swr_draw_context*)hPrivateContext; ++ SWR_SURFACE_STATE *pDstSurface = &pDC->renderTargets[renderTargetIndex]; ++ ++ StoreHotTileClear(pDstSurface, renderTargetIndex, x, y, pClearColor); ++} ++ ++void InitSimLoadTilesTable(); ++void InitSimStoreTilesTable(); ++void InitSimClearTilesTable(); ++ ++/* Init Load/Store/ClearTiles Tables */ ++INLINE void swr_InitMemoryModule() ++{ ++ InitSimLoadTilesTable(); ++ InitSimStoreTilesTable(); ++ InitSimClearTilesTable(); ++} +diff --git a/src/gallium/drivers/swr/swr_public.h b/src/gallium/drivers/swr/swr_public.h +new file mode 100644 +index 0000000..4d56ead +--- /dev/null ++++ b/src/gallium/drivers/swr/swr_public.h +@@ -0,0 +1,40 @@ ++/**************************************************************************** ++ * Copyright (C) 2015 Intel Corporation. All Rights Reserved. ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the "Software"), ++ * to deal in the Software without restriction, including without limitation ++ * the rights to use, copy, modify, merge, publish, distribute, sublicense, ++ * and/or sell copies of the Software, and to permit persons to whom the ++ * Software is furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice (including the next ++ * paragraph) shall be included in all copies or substantial portions of the ++ * Software. 
++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++ * IN THE SOFTWARE. ++ ***************************************************************************/ ++ ++#ifndef SWR_PUBLIC_H ++#define SWR_PUBLIC_H ++ ++struct pipe_screen; ++struct sw_winsys; ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++struct pipe_screen *swr_create_screen(struct sw_winsys *winsys); ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif +diff --git a/src/gallium/drivers/swr/swr_query.cpp b/src/gallium/drivers/swr/swr_query.cpp +new file mode 100644 +index 0000000..2510b3a +--- /dev/null ++++ b/src/gallium/drivers/swr/swr_query.cpp +@@ -0,0 +1,334 @@ ++/**************************************************************************** ++ * Copyright (C) 2015 Intel Corporation. All Rights Reserved. ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the "Software"), ++ * to deal in the Software without restriction, including without limitation ++ * the rights to use, copy, modify, merge, publish, distribute, sublicense, ++ * and/or sell copies of the Software, and to permit persons to whom the ++ * Software is furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice (including the next ++ * paragraph) shall be included in all copies or substantial portions of the ++ * Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++ * IN THE SOFTWARE. 
++ ***************************************************************************/
++
++#include "pipe/p_defines.h"
++#include "util/u_memory.h"
++#include "os/os_time.h"
++#include "swr_context.h"
++#include "swr_fence.h"
++#include "swr_query.h"
++#include "swr_screen.h"
++#include "swr_state.h"
++
++
++static struct swr_query *
++swr_query(struct pipe_query *p)
++{
++   return (struct swr_query *)p;
++}
++
++static struct pipe_query *
++swr_create_query(struct pipe_context *pipe, unsigned type, unsigned index)
++{
++   struct swr_query *pq;
++
++   assert(type < PIPE_QUERY_TYPES);
++   assert(index < MAX_SO_STREAMS);
++
++   pq = CALLOC_STRUCT(swr_query);
++
++   if (pq) {
++      pq->type = type;
++      pq->index = index;
++   }
++
++   return (struct pipe_query *)pq;
++}
++
++
++static void
++swr_destroy_query(struct pipe_context *pipe, struct pipe_query *q)
++{
++   struct swr_query *pq = swr_query(q);
++
++   if (pq->fence) {
++      if (!swr_is_fence_done(swr_fence(pq->fence))) {
++         swr_fence_submit(swr_context(pipe), pq->fence);
++         swr_fence_finish(pipe->screen, pq->fence, 0);
++      }
++      swr_fence_reference(pipe->screen, &pq->fence, NULL);
++   }
++
++   FREE(pq);
++}
++
++
++// XXX Create a fence callback, rather than stalling SwrWaitForIdle
++static void
++swr_gather_stats(struct pipe_context *pipe, struct swr_query *pq)
++{
++   struct swr_context *ctx = swr_context(pipe);
++
++   assert(pq->result);
++   union pipe_query_result *result = pq->result;
++   boolean enable_stats = pq->enable_stats;
++   SWR_STATS swr_stats = {0};
++
++   if (pq->fence) {
++      if (!swr_is_fence_done(swr_fence(pq->fence))) {
++         swr_fence_submit(ctx, pq->fence);
++         swr_fence_finish(pipe->screen, pq->fence, 0);
++      }
++      swr_fence_reference(pipe->screen, &pq->fence, NULL);
++   }
++
++   /*
++    * These queries don't need SWR Stats enabled in the core.
++    * Set and return.
++    */
++   switch (pq->type) {
++   case PIPE_QUERY_TIMESTAMP:
++   case PIPE_QUERY_TIME_ELAPSED:
++      result->u64 = swr_get_timestamp(pipe->screen);
++      return;
++      break;
++   case PIPE_QUERY_TIMESTAMP_DISJOINT:
++      /* nothing to do here */
++      return;
++      break;
++   case PIPE_QUERY_GPU_FINISHED:
++      result->b = TRUE; /* XXX TODO Add an api func to SWR to compare drawId
++                           vs LastRetiredId?
*/ ++ return; ++ break; ++ default: ++ /* Any query that needs SwrCore stats */ ++ break; ++ } ++ ++ /* ++ * All other results are collected from SwrCore counters ++ */ ++ ++ /* XXX, Should turn this into a fence callback and skip the stall */ ++ SwrGetStats(ctx->swrContext, &swr_stats); ++ /* SwrGetStats returns immediately, wait for collection */ ++ SwrWaitForIdle(ctx->swrContext); ++ ++ switch (pq->type) { ++ case PIPE_QUERY_OCCLUSION_PREDICATE: ++ case PIPE_QUERY_OCCLUSION_COUNTER: ++ result->u64 = swr_stats.DepthPassCount; ++ break; ++ case PIPE_QUERY_PRIMITIVES_GENERATED: ++ result->u64 = swr_stats.IaPrimitives; ++ break; ++ case PIPE_QUERY_PRIMITIVES_EMITTED: ++ result->u64 = swr_stats.SoNumPrimsWritten[pq->index]; ++ break; ++ case PIPE_QUERY_SO_STATISTICS: ++ case PIPE_QUERY_SO_OVERFLOW_PREDICATE: { ++ struct pipe_query_data_so_statistics *so_stats = &result->so_statistics; ++ so_stats->num_primitives_written = ++ swr_stats.SoNumPrimsWritten[pq->index]; ++ so_stats->primitives_storage_needed = ++ swr_stats.SoPrimStorageNeeded[pq->index]; ++ } break; ++ case PIPE_QUERY_PIPELINE_STATISTICS: { ++ struct pipe_query_data_pipeline_statistics *p_stats = ++ &result->pipeline_statistics; ++ p_stats->ia_vertices = swr_stats.IaVertices; ++ p_stats->ia_primitives = swr_stats.IaPrimitives; ++ p_stats->vs_invocations = swr_stats.VsInvocations; ++ p_stats->gs_invocations = swr_stats.GsInvocations; ++ p_stats->gs_primitives = swr_stats.GsPrimitives; ++ p_stats->c_invocations = swr_stats.CPrimitives; ++ p_stats->c_primitives = swr_stats.CPrimitives; ++ p_stats->ps_invocations = swr_stats.PsInvocations; ++ p_stats->hs_invocations = swr_stats.HsInvocations; ++ p_stats->ds_invocations = swr_stats.DsInvocations; ++ p_stats->cs_invocations = swr_stats.CsInvocations; ++ } break; ++ default: ++ assert(0 && "Unsupported query"); ++ break; ++ } ++ ++ /* Only change stat collection if there are no active queries */ ++ if (ctx->active_queries == 0) ++ SwrEnableStats(ctx->swrContext, enable_stats); ++} ++ ++ ++static boolean ++swr_get_query_result(struct pipe_context *pipe, ++ struct pipe_query *q, ++ boolean wait, ++ union pipe_query_result *result) ++{ ++ struct swr_context *ctx = swr_context(pipe); ++ struct swr_query *pq = swr_query(q); ++ ++ if (pq->fence) { ++ if (!swr_is_fence_done(swr_fence(pq->fence))) { ++ swr_fence_submit(ctx, pq->fence); ++ if (!wait) ++ return FALSE; ++ swr_fence_finish(pipe->screen, pq->fence, 0); ++ } ++ swr_fence_reference(pipe->screen, &pq->fence, NULL); ++ } ++ ++ /* XXX: Need to handle counter rollover */ ++ ++ switch (pq->type) { ++ /* Booleans */ ++ case PIPE_QUERY_OCCLUSION_PREDICATE: ++ result->b = pq->end.u64 != pq->start.u64 ? 
TRUE : FALSE;
++      break;
++   case PIPE_QUERY_GPU_FINISHED:
++      result->b = pq->end.b;
++      break;
++   /* Counters */
++   case PIPE_QUERY_OCCLUSION_COUNTER:
++   case PIPE_QUERY_TIMESTAMP:
++   case PIPE_QUERY_TIME_ELAPSED:
++   case PIPE_QUERY_PRIMITIVES_GENERATED:
++   case PIPE_QUERY_PRIMITIVES_EMITTED:
++      result->u64 = pq->end.u64 - pq->start.u64;
++      break;
++   /* Structures */
++   case PIPE_QUERY_SO_STATISTICS: {
++      struct pipe_query_data_so_statistics *so_stats = &result->so_statistics;
++      struct pipe_query_data_so_statistics *start = &pq->start.so_statistics;
++      struct pipe_query_data_so_statistics *end = &pq->end.so_statistics;
++      so_stats->num_primitives_written =
++         end->num_primitives_written - start->num_primitives_written;
++      so_stats->primitives_storage_needed =
++         end->primitives_storage_needed - start->primitives_storage_needed;
++   } break;
++   case PIPE_QUERY_TIMESTAMP_DISJOINT: {
++      /* os_time_get_nano returns nanoseconds */
++      result->timestamp_disjoint.frequency = UINT64_C(1000000000);
++      result->timestamp_disjoint.disjoint = FALSE;
++   } break;
++   case PIPE_QUERY_PIPELINE_STATISTICS: {
++      struct pipe_query_data_pipeline_statistics *p_stats =
++         &result->pipeline_statistics;
++      struct pipe_query_data_pipeline_statistics *start =
++         &pq->start.pipeline_statistics;
++      struct pipe_query_data_pipeline_statistics *end =
++         &pq->end.pipeline_statistics;
++      p_stats->ia_vertices = end->ia_vertices - start->ia_vertices;
++      p_stats->ia_primitives = end->ia_primitives - start->ia_primitives;
++      p_stats->vs_invocations = end->vs_invocations - start->vs_invocations;
++      p_stats->gs_invocations = end->gs_invocations - start->gs_invocations;
++      p_stats->gs_primitives = end->gs_primitives - start->gs_primitives;
++      p_stats->c_invocations = end->c_invocations - start->c_invocations;
++      p_stats->c_primitives = end->c_primitives - start->c_primitives;
++      p_stats->ps_invocations = end->ps_invocations - start->ps_invocations;
++      p_stats->hs_invocations = end->hs_invocations - start->hs_invocations;
++      p_stats->ds_invocations = end->ds_invocations - start->ds_invocations;
++      p_stats->cs_invocations = end->cs_invocations - start->cs_invocations;
++   } break;
++   case PIPE_QUERY_SO_OVERFLOW_PREDICATE: {
++      struct pipe_query_data_so_statistics *start = &pq->start.so_statistics;
++      struct pipe_query_data_so_statistics *end = &pq->end.so_statistics;
++      uint64_t num_primitives_written =
++         end->num_primitives_written - start->num_primitives_written;
++      uint64_t primitives_storage_needed =
++         end->primitives_storage_needed - start->primitives_storage_needed;
++      result->b = num_primitives_written > primitives_storage_needed;
++   } break;
++   default:
++      assert(0 && "Unsupported query");
++      break;
++   }
++
++   return TRUE;
++}
++
++static boolean
++swr_begin_query(struct pipe_context *pipe, struct pipe_query *q)
++{
++   struct swr_context *ctx = swr_context(pipe);
++   struct swr_query *pq = swr_query(q);
++
++   /* Initialize Results */
++   memset(&pq->start, 0, sizeof(pq->start));
++   memset(&pq->end, 0, sizeof(pq->end));
++
++   /* Gather start stats and enable SwrCore counters */
++   pq->result = &pq->start;
++   pq->enable_stats = TRUE;
++   swr_gather_stats(pipe, pq);
++   ctx->active_queries++;
++
++   /* override start timestamp to 0 for TIMESTAMP query */
++   if (pq->type == PIPE_QUERY_TIMESTAMP)
++      pq->start.u64 = 0;
++
++   return true;
++}
++
++static void
++swr_end_query(struct pipe_context *pipe, struct pipe_query *q)
++{
++   struct swr_context *ctx = swr_context(pipe);
++   struct swr_query *pq = swr_query(q);
++
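++   /* Stat collection in the SWR core remains enabled while any query is
++    * outstanding; active_queries tracks how many are in flight. */
++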
assert(ctx->active_queries ++ && "swr_end_query, there are no active queries!"); ++ ctx->active_queries--; ++ ++ /* Gather end stats and disable SwrCore counters */ ++ pq->result = &pq->end; ++ pq->enable_stats = FALSE; ++ swr_gather_stats(pipe, pq); ++} ++ ++ ++boolean ++swr_check_render_cond(struct pipe_context *pipe) ++{ ++ struct swr_context *ctx = swr_context(pipe); ++ boolean b, wait; ++ uint64_t result; ++ ++ if (!ctx->render_cond_query) ++ return TRUE; /* no query predicate, draw normally */ ++ ++ wait = (ctx->render_cond_mode == PIPE_RENDER_COND_WAIT ++ || ctx->render_cond_mode == PIPE_RENDER_COND_BY_REGION_WAIT); ++ ++ b = pipe->get_query_result( ++ pipe, ctx->render_cond_query, wait, (union pipe_query_result *)&result); ++ if (b) ++ return (!result == ctx->render_cond_cond); ++ else ++ return TRUE; ++} ++ ++void ++swr_query_init(struct pipe_context *pipe) ++{ ++ struct swr_context *ctx = swr_context(pipe); ++ ++ pipe->create_query = swr_create_query; ++ pipe->destroy_query = swr_destroy_query; ++ pipe->begin_query = swr_begin_query; ++ pipe->end_query = swr_end_query; ++ pipe->get_query_result = swr_get_query_result; ++ ++ ctx->active_queries = 0; ++} +diff --git a/src/gallium/drivers/swr/swr_query.h b/src/gallium/drivers/swr/swr_query.h +new file mode 100644 +index 0000000..2a2aeee +--- /dev/null ++++ b/src/gallium/drivers/swr/swr_query.h +@@ -0,0 +1,48 @@ ++/**************************************************************************** ++ * Copyright (C) 2015 Intel Corporation. All Rights Reserved. ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the "Software"), ++ * to deal in the Software without restriction, including without limitation ++ * the rights to use, copy, modify, merge, publish, distribute, sublicense, ++ * and/or sell copies of the Software, and to permit persons to whom the ++ * Software is furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice (including the next ++ * paragraph) shall be included in all copies or substantial portions of the ++ * Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++ * IN THE SOFTWARE. 
++ ***************************************************************************/ ++ ++#ifndef SWR_QUERY_H ++#define SWR_QUERY_H ++ ++ ++#include ++#include "os/os_thread.h" ++ ++ ++struct swr_query { ++ unsigned type; /* PIPE_QUERY_* */ ++ unsigned index; ++ ++ union pipe_query_result *result; ++ union pipe_query_result start; ++ union pipe_query_result end; ++ ++ struct pipe_fence_handle *fence; ++ ++ boolean enable_stats; ++}; ++ ++extern void swr_query_init(struct pipe_context *pipe); ++ ++extern boolean swr_check_render_cond(struct pipe_context *pipe); ++#endif +diff --git a/src/gallium/drivers/swr/swr_resource.h b/src/gallium/drivers/swr/swr_resource.h +new file mode 100644 +index 0000000..f7f641e +--- /dev/null ++++ b/src/gallium/drivers/swr/swr_resource.h +@@ -0,0 +1,98 @@ ++/**************************************************************************** ++ * Copyright (C) 2015 Intel Corporation. All Rights Reserved. ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the "Software"), ++ * to deal in the Software without restriction, including without limitation ++ * the rights to use, copy, modify, merge, publish, distribute, sublicense, ++ * and/or sell copies of the Software, and to permit persons to whom the ++ * Software is furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice (including the next ++ * paragraph) shall be included in all copies or substantial portions of the ++ * Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++ * IN THE SOFTWARE. 
++ ***************************************************************************/ ++ ++#ifndef SWR_RESOURCE_H ++#define SWR_RESOURCE_H ++ ++#include "pipe/p_state.h" ++#include "api.h" ++ ++struct sw_displaytarget; ++ ++struct swr_resource { ++ struct pipe_resource base; ++ ++ bool has_depth; ++ bool has_stencil; ++ ++ UINT alignedWidth; ++ UINT alignedHeight; ++ ++ SWR_SURFACE_STATE swr; ++ SWR_SURFACE_STATE secondary; // for faking depth/stencil merged formats ++ ++ struct sw_displaytarget *display_target; ++ ++ unsigned row_stride[PIPE_MAX_TEXTURE_LEVELS]; ++ unsigned img_stride[PIPE_MAX_TEXTURE_LEVELS]; ++ unsigned mip_offsets[PIPE_MAX_TEXTURE_LEVELS]; ++ ++ /* Opaque pointer to swr_context to mark resource in use */ ++ void *bound_to_context; ++}; ++ ++ ++static INLINE struct swr_resource * ++swr_resource(struct pipe_resource *resource) ++{ ++ return (struct swr_resource *)resource; ++} ++ ++static INLINE boolean ++swr_resource_is_texture(const struct pipe_resource *resource) ++{ ++ switch (resource->target) { ++ case PIPE_BUFFER: ++ return FALSE; ++ case PIPE_TEXTURE_1D: ++ case PIPE_TEXTURE_1D_ARRAY: ++ case PIPE_TEXTURE_2D: ++ case PIPE_TEXTURE_2D_ARRAY: ++ case PIPE_TEXTURE_RECT: ++ case PIPE_TEXTURE_3D: ++ case PIPE_TEXTURE_CUBE: ++ case PIPE_TEXTURE_CUBE_ARRAY: ++ return TRUE; ++ default: ++ assert(0); ++ return FALSE; ++ } ++} ++ ++ ++static INLINE void * ++swr_resource_data(struct pipe_resource *resource) ++{ ++ struct swr_resource *swr_r = swr_resource(resource); ++ ++ assert(!swr_resource_is_texture(resource)); ++ ++ return swr_r->swr.pBaseAddress; ++} ++ ++ ++void swr_store_render_target(struct swr_context *ctx, ++ uint32_t attachment, ++ enum SWR_TILE_STATE post_tile_state, ++ struct SWR_SURFACE_STATE *surface = nullptr); ++#endif +diff --git a/src/gallium/drivers/swr/swr_scratch.cpp b/src/gallium/drivers/swr/swr_scratch.cpp +new file mode 100644 +index 0000000..e6c448c +--- /dev/null ++++ b/src/gallium/drivers/swr/swr_scratch.cpp +@@ -0,0 +1,116 @@ ++/**************************************************************************** ++ * Copyright (C) 2015 Intel Corporation. All Rights Reserved. ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the "Software"), ++ * to deal in the Software without restriction, including without limitation ++ * the rights to use, copy, modify, merge, publish, distribute, sublicense, ++ * and/or sell copies of the Software, and to permit persons to whom the ++ * Software is furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice (including the next ++ * paragraph) shall be included in all copies or substantial portions of the ++ * Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++ * IN THE SOFTWARE. 
++ ***************************************************************************/ ++ ++#include "util/u_memory.h" ++#include "swr_context.h" ++#include "swr_scratch.h" ++#include "api.h" ++ ++ ++void * ++swr_copy_to_scratch_space(struct swr_context *ctx, ++ struct swr_scratch_space *space, ++ const void *user_buffer, ++ unsigned int size) ++{ ++ void *ptr; ++ assert(space); ++ assert(user_buffer); ++ assert(size); ++ ++ if (size >= 2048) { /* XXX TODO create KNOB_ for this */ ++ /* Use per draw SwrAllocDrawContextMemory for larger copies */ ++ ptr = SwrAllocDrawContextMemory(ctx->swrContext, size, 4); ++ } else { ++ /* Allocate enough so that MAX_DRAWS_IN_FLIGHT sets fit. */ ++ unsigned int max_size_in_flight = size * KNOB_MAX_DRAWS_IN_FLIGHT; ++ ++ /* Need to grow space */ ++ if (max_size_in_flight > space->current_size) { ++ /* Must idle the pipeline, this is infrequent */ ++ SwrWaitForIdle(ctx->swrContext); ++ ++ space->current_size = max_size_in_flight; ++ ++ if (space->base) { ++ align_free(space->base); ++ space->base = NULL; ++ } ++ ++ if (!space->base) { ++ space->base = (BYTE *)align_malloc(space->current_size, 4); ++ space->head = (void *)space->base; ++ } ++ } ++ ++ /* Wrap */ ++ if (((BYTE *)space->head + size) ++ >= ((BYTE *)space->base + space->current_size)) { ++ /* ++ * TODO XXX: Should add a fence on wrap. Assumption is that ++ * current_space >> size, and there are at least MAX_DRAWS_IN_FLIGHT ++ * draws in scratch. So fence would always be met on wrap. A fence ++ * would ensure that first frame in buffer is done before wrapping. ++ * If fence ever needs to be waited on, can increase buffer size. ++ * So far in testing, this hasn't been necessary. ++ */ ++ space->head = space->base; ++ } ++ ++ ptr = space->head; ++ space->head = (BYTE *)space->head + size; ++ } ++ ++ /* Copy user_buffer to scratch */ ++ memcpy(ptr, user_buffer, size); ++ ++ return ptr; ++} ++ ++ ++void ++swr_init_scratch_buffers(struct swr_context *ctx) ++{ ++ struct swr_scratch_buffers *scratch; ++ ++ scratch = CALLOC_STRUCT(swr_scratch_buffers); ++ ctx->scratch = scratch; ++} ++ ++void ++swr_destroy_scratch_buffers(struct swr_context *ctx) ++{ ++ struct swr_scratch_buffers *scratch = ctx->scratch; ++ ++ if (scratch) { ++ if (scratch->vs_constants.base) ++ align_free(scratch->vs_constants.base); ++ if (scratch->fs_constants.base) ++ align_free(scratch->fs_constants.base); ++ if (scratch->vertex_buffer.base) ++ align_free(scratch->vertex_buffer.base); ++ if (scratch->index_buffer.base) ++ align_free(scratch->index_buffer.base); ++ FREE(scratch); ++ } ++} +diff --git a/src/gallium/drivers/swr/swr_scratch.h b/src/gallium/drivers/swr/swr_scratch.h +new file mode 100644 +index 0000000..74218d6 +--- /dev/null ++++ b/src/gallium/drivers/swr/swr_scratch.h +@@ -0,0 +1,63 @@ ++/**************************************************************************** ++ * Copyright (C) 2015 Intel Corporation. All Rights Reserved. 
++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the "Software"), ++ * to deal in the Software without restriction, including without limitation ++ * the rights to use, copy, modify, merge, publish, distribute, sublicense, ++ * and/or sell copies of the Software, and to permit persons to whom the ++ * Software is furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice (including the next ++ * paragraph) shall be included in all copies or substantial portions of the ++ * Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++ * IN THE SOFTWARE. ++ ***************************************************************************/ ++ ++#ifndef SWR_SCRATCH_H ++#define SWR_SCRATCH_H ++ ++struct swr_scratch_space { ++ void *head; ++ unsigned int current_size; ++ /* TODO XXX: Add a fence for wrap condition. */ ++ ++ void *base; ++}; ++ ++struct swr_scratch_buffers { ++ struct swr_scratch_space vs_constants; ++ struct swr_scratch_space fs_constants; ++ struct swr_scratch_space vertex_buffer; ++ struct swr_scratch_space index_buffer; ++}; ++ ++ ++/* ++ * swr_copy_to_scratch_space ++ * Copies size bytes of user_buffer into the scratch ring buffer. ++ * Used to store temporary data such as client arrays and constants. ++ * ++ * Inputs: ++ * space ptr to scratch pool (vs_constants, fs_constants) ++ * user_buffer, data to copy into scratch space ++ * size to be copied ++ * Returns: ++ * pointer to data copied to scratch space. ++ */ ++void *swr_copy_to_scratch_space(struct swr_context *ctx, ++ struct swr_scratch_space *space, ++ const void *user_buffer, ++ unsigned int size); ++ ++void swr_init_scratch_buffers(struct swr_context *ctx); ++void swr_destroy_scratch_buffers(struct swr_context *ctx); ++ ++#endif +diff --git a/src/gallium/drivers/swr/swr_screen.cpp b/src/gallium/drivers/swr/swr_screen.cpp +new file mode 100644 +index 0000000..66eb58b +--- /dev/null ++++ b/src/gallium/drivers/swr/swr_screen.cpp +@@ -0,0 +1,666 @@ ++/**************************************************************************** ++ * Copyright (C) 2015 Intel Corporation. All Rights Reserved. ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the "Software"), ++ * to deal in the Software without restriction, including without limitation ++ * the rights to use, copy, modify, merge, publish, distribute, sublicense, ++ * and/or sell copies of the Software, and to permit persons to whom the ++ * Software is furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice (including the next ++ * paragraph) shall be included in all copies or substantial portions of the ++ * Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL ++ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++ * IN THE SOFTWARE. ++ ***************************************************************************/ ++ ++#include "pipe/p_screen.h" ++#include "pipe/p_defines.h" ++#include "util/u_memory.h" ++#include "util/u_format.h" ++#include "util/u_inlines.h" ++#include "util/u_cpu_detect.h" ++ ++#include "state_tracker/sw_winsys.h" ++ ++extern "C" { ++#include "gallivm/lp_bld_limits.h" ++} ++ ++#include "swr_public.h" ++#include "swr_screen.h" ++#include "swr_context.h" ++#include "swr_resource.h" ++#include "swr_fence.h" ++#include "gen_knobs.h" ++ ++#include "jit_api.h" ++ ++#include ++ ++static const char * ++swr_get_name(struct pipe_screen *screen) ++{ ++ return "SWR"; ++} ++ ++static const char * ++swr_get_vendor(struct pipe_screen *screen) ++{ ++ return "Intel Corporation"; ++} ++ ++static boolean ++swr_is_format_supported(struct pipe_screen *screen, ++ enum pipe_format format, ++ enum pipe_texture_target target, ++ unsigned sample_count, ++ unsigned bind) ++{ ++ struct sw_winsys *winsys = swr_screen(screen)->winsys; ++ const struct util_format_description *format_desc; ++ ++ assert(target == PIPE_BUFFER || target == PIPE_TEXTURE_1D ++ || target == PIPE_TEXTURE_1D_ARRAY ++ || target == PIPE_TEXTURE_2D ++ || target == PIPE_TEXTURE_2D_ARRAY ++ || target == PIPE_TEXTURE_RECT ++ || target == PIPE_TEXTURE_3D ++ || target == PIPE_TEXTURE_CUBE ++ || target == PIPE_TEXTURE_CUBE_ARRAY); ++ ++ format_desc = util_format_description(format); ++ if (!format_desc) ++ return FALSE; ++ ++ if (sample_count > 1) ++ return FALSE; ++ ++ if (bind ++ & (PIPE_BIND_DISPLAY_TARGET | PIPE_BIND_SCANOUT | PIPE_BIND_SHARED)) { ++ if (!winsys->is_displaytarget_format_supported(winsys, bind, format)) ++ return FALSE; ++ } ++ ++ if (bind & PIPE_BIND_RENDER_TARGET) { ++ if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) ++ return FALSE; ++ ++ if (mesa_to_swr_format(format) == (SWR_FORMAT)-1) ++ return FALSE; ++ ++ /* ++ * Although possible, it is unnatural to render into compressed or YUV ++ * surfaces. So disable these here to avoid going into weird paths ++ * inside the state trackers. ++ */ ++ if (format_desc->block.width != 1 || format_desc->block.height != 1) ++ return FALSE; ++ } ++ ++ /* We're going to lie and say we support all depth/stencil formats. ++ * SWR actually needs separate bindings, and only does F32 depth. 
++ */
++   if (bind & PIPE_BIND_DEPTH_STENCIL) {
++      if (format_desc->colorspace != UTIL_FORMAT_COLORSPACE_ZS)
++         return FALSE;
++   }
++
++   return TRUE;
++}
++
++static int
++swr_get_param(struct pipe_screen *screen, enum pipe_cap param)
++{
++   switch (param) {
++   case PIPE_CAP_NPOT_TEXTURES:
++   case PIPE_CAP_MIXED_FRAMEBUFFER_SIZES:
++      return 1;
++   case PIPE_CAP_TWO_SIDED_STENCIL:
++      return 1;
++   case PIPE_CAP_SM3:
++      return 1;
++   case PIPE_CAP_ANISOTROPIC_FILTER:
++      return 0;
++   case PIPE_CAP_POINT_SPRITE:
++      return 1;
++   case PIPE_CAP_MAX_RENDER_TARGETS:
++      return PIPE_MAX_COLOR_BUFS;
++   case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS:
++      return 1;
++   case PIPE_CAP_OCCLUSION_QUERY:
++   case PIPE_CAP_QUERY_TIME_ELAPSED:
++   case PIPE_CAP_QUERY_PIPELINE_STATISTICS:
++      return 1;
++   case PIPE_CAP_TEXTURE_MIRROR_CLAMP:
++      return 1;
++   case PIPE_CAP_TEXTURE_SHADOW_MAP:
++      return 1;
++   case PIPE_CAP_TEXTURE_SWIZZLE:
++      return 1;
++   case PIPE_CAP_TEXTURE_BORDER_COLOR_QUIRK:
++      return 0;
++   case PIPE_CAP_MAX_TEXTURE_2D_LEVELS:
++      return 13; // xxx This increases rendertarget max size to 4k x 4k. No
++                 // way to separate width/height.
++   case PIPE_CAP_MAX_TEXTURE_3D_LEVELS:
++      return 12; // xxx
++   case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS:
++      return 12; // xxx
++   case PIPE_CAP_BLEND_EQUATION_SEPARATE:
++      return 1;
++   case PIPE_CAP_INDEP_BLEND_ENABLE:
++      return 1;
++   case PIPE_CAP_INDEP_BLEND_FUNC:
++      return 1;
++   case PIPE_CAP_TGSI_FS_COORD_ORIGIN_LOWER_LEFT:
++      return 0; // Don't support lower left frag coord.
++   case PIPE_CAP_TGSI_FS_COORD_ORIGIN_UPPER_LEFT:
++   case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER:
++   case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_INTEGER:
++      return 1;
++   case PIPE_CAP_DEPTH_CLIP_DISABLE:
++      return 1;
++   case PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS:
++      return MAX_SO_STREAMS;
++   case PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_COMPONENTS:
++   case PIPE_CAP_MAX_STREAM_OUTPUT_INTERLEAVED_COMPONENTS:
++      return MAX_ATTRIBUTES;
++   case PIPE_CAP_MAX_GEOMETRY_OUTPUT_VERTICES:
++   case PIPE_CAP_MAX_GEOMETRY_TOTAL_OUTPUT_COMPONENTS:
++      return 1024;
++   case PIPE_CAP_MAX_VERTEX_STREAMS:
++      return 1;
++   case PIPE_CAP_MAX_VERTEX_ATTRIB_STRIDE:
++      return 2048;
++   case PIPE_CAP_PRIMITIVE_RESTART:
++      return 1;
++   case PIPE_CAP_SHADER_STENCIL_EXPORT:
++      return 1;
++   case PIPE_CAP_TGSI_INSTANCEID:
++   case PIPE_CAP_VERTEX_ELEMENT_INSTANCE_DIVISOR:
++   case PIPE_CAP_START_INSTANCE:
++      return 1;
++   case PIPE_CAP_SEAMLESS_CUBE_MAP:
++   case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE:
++      return 1;
++   case PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS:
++      return 256; /* for GL3 */
++   case PIPE_CAP_MIN_TEXEL_OFFSET:
++      return -8;
++   case PIPE_CAP_MAX_TEXEL_OFFSET:
++      return 7;
++   case PIPE_CAP_CONDITIONAL_RENDER:
++      return 1;
++   case PIPE_CAP_TEXTURE_BARRIER:
++      return 0;
++   case PIPE_CAP_FRAGMENT_COLOR_CLAMPED:
++   case PIPE_CAP_VERTEX_COLOR_UNCLAMPED: /* draw module */
++   case PIPE_CAP_VERTEX_COLOR_CLAMPED: /* draw module */
++      return 1;
++   case PIPE_CAP_MIXED_COLORBUFFER_FORMATS:
++      return 0;
++   case PIPE_CAP_GLSL_FEATURE_LEVEL:
++      return 330;
++   case PIPE_CAP_QUADS_FOLLOW_PROVOKING_VERTEX_CONVENTION:
++      return 0;
++   case PIPE_CAP_COMPUTE:
++      return 0;
++   case PIPE_CAP_USER_VERTEX_BUFFERS:
++   case PIPE_CAP_USER_INDEX_BUFFERS:
++   case PIPE_CAP_USER_CONSTANT_BUFFERS:
++   case PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME:
++   case PIPE_CAP_TGSI_VS_LAYER_VIEWPORT:
++      return 1;
++   case PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT:
++      return 16;
++   case PIPE_CAP_TGSI_CAN_COMPACT_CONSTANTS:
++   case PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY:
++   case PIPE_CAP_VERTEX_BUFFER_STRIDE_4BYTE_ALIGNED_ONLY:
++   case PIPE_CAP_VERTEX_ELEMENT_SRC_OFFSET_4BYTE_ALIGNED_ONLY:
++   case PIPE_CAP_TEXTURE_MULTISAMPLE:
++      return 0;
++   case PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT:
++      return 64;
++   case PIPE_CAP_QUERY_TIMESTAMP:
++      return 1;
++   case PIPE_CAP_CUBE_MAP_ARRAY:
++      return 0;
++   case PIPE_CAP_TEXTURE_BUFFER_OBJECTS:
++      return 1;
++   case PIPE_CAP_MAX_TEXTURE_BUFFER_SIZE:
++      return 65536;
++   case PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT:
++      return 0;
++   case PIPE_CAP_TGSI_TEXCOORD:
++   case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER:
++      return 0;
++   case PIPE_CAP_MAX_VIEWPORTS:
++      return 1;
++   case PIPE_CAP_ENDIANNESS:
++      return PIPE_ENDIAN_NATIVE;
++   case PIPE_CAP_MAX_TEXTURE_GATHER_COMPONENTS:
++   case PIPE_CAP_TEXTURE_GATHER_SM5:
++      return 0;
++   case PIPE_CAP_BUFFER_MAP_PERSISTENT_COHERENT:
++      return 1;
++   case PIPE_CAP_TEXTURE_QUERY_LOD:
++   case PIPE_CAP_SAMPLE_SHADING:
++   case PIPE_CAP_TEXTURE_GATHER_OFFSETS:
++   case PIPE_CAP_TGSI_VS_WINDOW_SPACE_POSITION:
++   case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE:
++   case PIPE_CAP_SAMPLER_VIEW_TARGET:
++      return 0;
++   case PIPE_CAP_FAKE_SW_MSAA:
++      return 1;
++   case PIPE_CAP_MIN_TEXTURE_GATHER_OFFSET:
++   case PIPE_CAP_MAX_TEXTURE_GATHER_OFFSET:
++      return 0;
++   case PIPE_CAP_DRAW_INDIRECT:
++      return 1;
++
++   case PIPE_CAP_VENDOR_ID:
++      return 0xFFFFFFFF;
++   case PIPE_CAP_DEVICE_ID:
++      return 0xFFFFFFFF;
++   case PIPE_CAP_ACCELERATED:
++      return 0;
++   case PIPE_CAP_VIDEO_MEMORY: {
++      /* XXX: Do we want to return the full amount of system memory ? */
++      uint64_t system_memory;
++
++      if (!os_get_total_physical_memory(&system_memory))
++         return 0;
++
++      return (int)(system_memory >> 20);
++   }
++   case PIPE_CAP_UMA:
++      return 1;
++   case PIPE_CAP_CONDITIONAL_RENDER_INVERTED:
++      return 1;
++   case PIPE_CAP_CLIP_HALFZ:
++      return 1;
++   case PIPE_CAP_VERTEXID_NOBASE:
++      return 0;
++   case PIPE_CAP_POLYGON_OFFSET_CLAMP:
++      return 1;
++   case PIPE_CAP_MULTISAMPLE_Z_RESOLVE:
++      return 0;
++   case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
++      return 0; // xxx
++   case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
++      return 0;
++   case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
++      return 0;
++   case PIPE_CAP_DEPTH_BOUNDS_TEST:
++      return 0; // xxx
++   case PIPE_CAP_TEXTURE_FLOAT_LINEAR:
++   case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
++      return 1;
++   }
++
++   /* should only get here on unhandled cases */
++   debug_printf("Unexpected PIPE_CAP %d query\n", param);
++   return 0;
++}
++
++static int
++swr_get_shader_param(struct pipe_screen *screen,
++                     unsigned shader,
++                     enum pipe_shader_cap param)
++{
++   if (shader == PIPE_SHADER_VERTEX || shader == PIPE_SHADER_FRAGMENT)
++      return gallivm_get_shader_param(param);
++
++   // TODO: geometry, tessellation, compute
++   return 0;
++}
++
++
++static float
++swr_get_paramf(struct pipe_screen *screen, enum pipe_capf param)
++{
++   switch (param) {
++   case PIPE_CAPF_MAX_LINE_WIDTH:
++   case PIPE_CAPF_MAX_LINE_WIDTH_AA:
++   case PIPE_CAPF_MAX_POINT_WIDTH:
++      return 255.0; /* arbitrary */
++   case PIPE_CAPF_MAX_POINT_WIDTH_AA:
++      return 0.0;
++   case PIPE_CAPF_MAX_TEXTURE_ANISOTROPY:
++      return 0.0;
++   case PIPE_CAPF_MAX_TEXTURE_LOD_BIAS:
++      return 0.0;
++   case PIPE_CAPF_GUARD_BAND_LEFT:
++   case PIPE_CAPF_GUARD_BAND_TOP:
++   case PIPE_CAPF_GUARD_BAND_RIGHT:
++   case PIPE_CAPF_GUARD_BAND_BOTTOM:
++      return 0.0;
++   }
++   /* should only get here on unhandled cases */
++   debug_printf("Unexpected PIPE_CAPF %d query\n", param);
++   return 0.0;
++}
++
++SWR_FORMAT
++mesa_to_swr_format(enum pipe_format format)
++{
++   const struct util_format_description
*format_desc = ++ util_format_description(format); ++ if (!format_desc) ++ return (SWR_FORMAT)-1; ++ ++ // more robust check would be comparing all attributes of the formats ++ // luckily format names are mostly standardized ++ for (int i = 0; i < NUM_SWR_FORMATS; i++) { ++ const SWR_FORMAT_INFO &swr_desc = GetFormatInfo((SWR_FORMAT)i); ++ ++ if (!strcasecmp(format_desc->short_name, swr_desc.name)) ++ return (SWR_FORMAT)i; ++ } ++ ++ // ... with some exceptions ++ switch (format) { ++ case PIPE_FORMAT_R8G8B8A8_SRGB: ++ return R8G8B8A8_UNORM_SRGB; ++ case PIPE_FORMAT_B8G8R8A8_SRGB: ++ return B8G8R8A8_UNORM_SRGB; ++ case PIPE_FORMAT_I8_UNORM: ++ return R8_UNORM; ++ case PIPE_FORMAT_Z24_UNORM_S8_UINT: ++ return R24_UNORM_X8_TYPELESS; ++ case PIPE_FORMAT_L8A8_UNORM: ++ return R8G8_UNORM; ++ default: ++ break; ++ } ++ ++ debug_printf("asked to convert unsupported format %s\n", ++ format_desc->name); ++ return (SWR_FORMAT)-1; ++} ++ ++static boolean ++swr_displaytarget_layout(struct swr_screen *screen, struct swr_resource *res) ++{ ++ struct sw_winsys *winsys = screen->winsys; ++ ++ UINT stride; ++ res->display_target = winsys->displaytarget_create(winsys, ++ res->base.bind, ++ res->base.format, ++ res->alignedWidth, ++ res->alignedHeight, ++ 64, ++ &stride); ++ ++ if (res->display_target == NULL) ++ return FALSE; ++ ++ /* Clear the display target surface */ ++ void *map = winsys->displaytarget_map( ++ winsys, res->display_target, PIPE_TRANSFER_WRITE); ++ ++ if (map) ++ memset(map, 0, res->alignedHeight * stride); ++ ++ winsys->displaytarget_unmap(winsys, res->display_target); ++ ++ return TRUE; ++} ++ ++static struct pipe_resource * ++swr_resource_create(struct pipe_screen *_screen, ++ const struct pipe_resource *templat) ++{ ++ struct swr_screen *screen = swr_screen(_screen); ++ struct swr_resource *res = CALLOC_STRUCT(swr_resource); ++ if (!res) ++ return NULL; ++ ++ res->base = *templat; ++ pipe_reference_init(&res->base.reference, 1); ++ res->base.screen = &screen->base; ++ ++ const struct util_format_description *desc = ++ util_format_description(templat->format); ++ res->has_depth = util_format_has_depth(desc); ++ res->has_stencil = util_format_has_stencil(desc); ++ ++ pipe_format fmt = templat->format; ++ if (res->has_depth) ++ fmt = PIPE_FORMAT_Z24_UNORM_S8_UINT; ++ if (res->has_stencil && !res->has_depth) ++ fmt = PIPE_FORMAT_R8_UINT; ++ ++ res->swr.width = templat->width0; ++ res->swr.height = templat->height0; ++ res->swr.depth = templat->depth0; ++ res->swr.type = SURFACE_2D; ++ res->swr.tileMode = SWR_TILE_NONE; ++ res->swr.format = mesa_to_swr_format(fmt); ++ res->swr.numSamples = (1 << templat->nr_samples); ++ ++ SWR_FORMAT_INFO finfo = GetFormatInfo(res->swr.format); ++ ++ unsigned total_size = 0; ++ unsigned width = templat->width0; ++ unsigned height = templat->height0; ++ unsigned depth = templat->depth0; ++ unsigned layers = templat->array_size; ++ ++ for (int level = 0; level <= templat->last_level; level++) { ++ unsigned alignedWidth, alignedHeight; ++ unsigned num_slices; ++ ++ if (templat->bind & (PIPE_BIND_DEPTH_STENCIL | PIPE_BIND_RENDER_TARGET ++ | PIPE_BIND_DISPLAY_TARGET)) { ++ alignedWidth = (width + (KNOB_MACROTILE_X_DIM - 1)) ++ & ~(KNOB_MACROTILE_X_DIM - 1); ++ alignedHeight = (height + (KNOB_MACROTILE_Y_DIM - 1)) ++ & ~(KNOB_MACROTILE_Y_DIM - 1); ++ } else { ++ alignedWidth = width; ++ alignedHeight = height; ++ } ++ ++ if (level == 0) { ++ res->alignedWidth = alignedWidth; ++ res->alignedHeight = alignedHeight; ++ } ++ ++ res->row_stride[level] = alignedWidth 
* finfo.Bpp; ++ res->img_stride[level] = res->row_stride[level] * alignedHeight; ++ res->mip_offsets[level] = total_size; ++ ++ if (templat->target == PIPE_TEXTURE_3D) ++ num_slices = depth; ++ else if (templat->target == PIPE_TEXTURE_1D_ARRAY ++ || templat->target == PIPE_TEXTURE_2D_ARRAY ++ || templat->target == PIPE_TEXTURE_CUBE ++ || templat->target == PIPE_TEXTURE_CUBE_ARRAY) ++ num_slices = layers; ++ else ++ num_slices = 1; ++ ++ total_size += res->img_stride[level] * num_slices; ++ ++ width = u_minify(width, 1); ++ height = u_minify(height, 1); ++ depth = u_minify(depth, 1); ++ } ++ ++ res->swr.halign = res->alignedWidth; ++ res->swr.valign = res->alignedHeight; ++ res->swr.pitch = res->row_stride[0]; ++ res->swr.pBaseAddress = (BYTE *)_aligned_malloc(total_size, 64); ++ ++ if (res->has_depth && res->has_stencil) { ++ res->secondary.width = templat->width0; ++ res->secondary.height = templat->height0; ++ res->secondary.depth = templat->depth0; ++ res->secondary.type = SURFACE_2D; ++ res->secondary.tileMode = SWR_TILE_NONE; ++ res->secondary.format = R8_UINT; ++ res->secondary.numSamples = (1 << templat->nr_samples); ++ ++ SWR_FORMAT_INFO finfo = GetFormatInfo(res->secondary.format); ++ res->secondary.pitch = res->alignedWidth * finfo.Bpp; ++ res->secondary.pBaseAddress = (BYTE *)_aligned_malloc( ++ res->alignedHeight * res->secondary.pitch, 64); ++ } ++ ++ if (swr_resource_is_texture(&res->base)) { ++ if (res->base.bind & (PIPE_BIND_DISPLAY_TARGET | PIPE_BIND_SCANOUT ++ | PIPE_BIND_SHARED)) { ++ /* displayable surface */ ++ if (!swr_displaytarget_layout(screen, res)) ++ goto fail; ++ } ++ } ++ ++ return &res->base; ++ ++fail: ++ FREE(res); ++ return NULL; ++} ++ ++static void ++swr_resource_destroy(struct pipe_screen *p_screen, struct pipe_resource *pt) ++{ ++ struct swr_screen *screen = swr_screen(p_screen); ++ struct swr_resource *res = swr_resource(pt); ++ ++ /* ++ * If this resource is attached to a context it may still be in use; check ++ * dependencies before freeing. ++ * XXX TODO: don't use SwrWaitForIdle, use fences and come up with a real ++ * resource manager. ++ * XXX We have seen swr_destroy arrive prior to the framebuffer resource ++ * being freed. Don't wait on it. ++ */ ++ if (res->bound_to_context && !res->display_target) { ++ struct swr_context *ctx = ++ swr_context((pipe_context *)res->bound_to_context); ++ SwrWaitForIdle( ++ ctx->swrContext); // TODO: don't SwrWaitForIdle; use a fence.
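++ // [Editor's note] The full pipeline flush above is the blunt instrument: ++ // SWR worker threads may still be reading or writing this resource's ++ // backing memory, and without per-resource fences the only safe point to ++ // free it is after the core goes idle.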
++ } ++ ++ if (res->display_target) { ++ /* display target */ ++ struct sw_winsys *winsys = screen->winsys; ++ winsys->displaytarget_destroy(winsys, res->display_target); ++ } ++ ++ _aligned_free(res->swr.pBaseAddress); ++ _aligned_free(res->secondary.pBaseAddress); ++ ++ FREE(res); ++} ++ ++ ++static void ++swr_flush_frontbuffer(struct pipe_screen *p_screen, ++ struct pipe_resource *resource, ++ unsigned level, ++ unsigned layer, ++ void *context_private, ++ struct pipe_box *sub_box) ++{ ++ SWR_SURFACE_STATE &colorBuffer = swr_resource(resource)->swr; ++ ++ struct swr_screen *screen = swr_screen(p_screen); ++ struct sw_winsys *winsys = screen->winsys; ++ struct swr_resource *res = swr_resource(resource); ++ ++ /* Ensure fence set at flush is finished, before reading frame buffer */ ++ swr_fence_finish(p_screen, screen->flush_fence, 0); ++ ++ void *map = winsys->displaytarget_map( ++ winsys, res->display_target, PIPE_TRANSFER_WRITE); ++ memcpy( ++ map, colorBuffer.pBaseAddress, colorBuffer.pitch * colorBuffer.height); ++ winsys->displaytarget_unmap(winsys, res->display_target); ++ ++ assert(res->display_target); ++ if (res->display_target) ++ winsys->displaytarget_display( ++ winsys, res->display_target, context_private, sub_box); ++} ++ ++ ++static void ++swr_destroy_screen(struct pipe_screen *p_screen) ++{ ++ struct swr_screen *screen = swr_screen(p_screen); ++ struct sw_winsys *winsys = screen->winsys; ++ ++ fprintf(stderr, "SWR destroy screen!\n"); ++ ++ swr_fence_finish(p_screen, screen->flush_fence, 0); ++ swr_fence_reference(p_screen, &screen->flush_fence, NULL); ++ ++ JitDestroyContext(screen->hJitMgr); ++ ++ if (winsys->destroy) ++ winsys->destroy(winsys); ++ ++ FREE(screen); ++} ++ ++ ++struct pipe_screen * ++swr_create_screen(struct sw_winsys *winsys) ++{ ++ struct swr_screen *screen = CALLOC_STRUCT(swr_screen); ++ ++ if (!screen) ++ return NULL; ++ ++ fprintf(stderr, "SWR create screen!\n"); ++ util_cpu_detect(); ++ if (util_cpu_caps.has_avx2) ++ fprintf(stderr, "This processor supports AVX2.\n"); ++ else if (util_cpu_caps.has_avx) ++ fprintf(stderr, "This processor supports AVX.\n"); ++ /* Exit gracefully if there is no AVX support */ ++ else { ++ fprintf(stderr, " !!! This processor does not support AVX or AVX2. " ++ "OpenSWR requires AVX.\n"); ++ exit(-1); ++ } ++ ++ if (!getenv("KNOB_MAX_PRIMS_PER_DRAW")) { ++ g_GlobalKnobs.MAX_PRIMS_PER_DRAW.Value(49152); ++ } ++ ++ screen->winsys = winsys; ++ screen->base.get_name = swr_get_name; ++ screen->base.get_vendor = swr_get_vendor; ++ screen->base.is_format_supported = swr_is_format_supported; ++ screen->base.context_create = swr_create_context; ++ ++ screen->base.destroy = swr_destroy_screen; ++ screen->base.get_param = swr_get_param; ++ screen->base.get_shader_param = swr_get_shader_param; ++ screen->base.get_paramf = swr_get_paramf; ++ ++ screen->base.resource_create = swr_resource_create; ++ screen->base.resource_destroy = swr_resource_destroy; ++ ++ screen->base.flush_frontbuffer = swr_flush_frontbuffer; ++ ++ screen->hJitMgr = JitCreateContext(KNOB_SIMD_WIDTH, KNOB_ARCH_STR); ++ ++ swr_fence_init(&screen->base); ++ ++ return &screen->base; ++} +diff --git a/src/gallium/drivers/swr/swr_screen.h b/src/gallium/drivers/swr/swr_screen.h +new file mode 100644 +index 0000000..a96dc44 +--- /dev/null ++++ b/src/gallium/drivers/swr/swr_screen.h +@@ -0,0 +1,52 @@ ++/**************************************************************************** ++ * Copyright (C) 2015 Intel Corporation. All Rights Reserved. 
++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the "Software"), ++ * to deal in the Software without restriction, including without limitation ++ * the rights to use, copy, modify, merge, publish, distribute, sublicense, ++ * and/or sell copies of the Software, and to permit persons to whom the ++ * Software is furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice (including the next ++ * paragraph) shall be included in all copies or substantial portions of the ++ * Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++ * IN THE SOFTWARE. ++ ***************************************************************************/ ++ ++#ifndef SWR_SCREEN_H ++#define SWR_SCREEN_H ++ ++#include "pipe/p_screen.h" ++#include "pipe/p_defines.h" ++#include "api.h" ++ ++struct sw_winsys; ++ ++struct swr_screen { ++ struct pipe_screen base; ++ ++ struct pipe_fence_handle *flush_fence; ++ ++ struct sw_winsys *winsys; ++ ++ HANDLE hJitMgr; ++}; ++ ++static INLINE struct swr_screen * ++swr_screen(struct pipe_screen *pipe) ++{ ++ return (struct swr_screen *)pipe; ++} ++ ++SWR_FORMAT ++mesa_to_swr_format(enum pipe_format format); ++ ++#endif +diff --git a/src/gallium/drivers/swr/swr_shader.cpp b/src/gallium/drivers/swr/swr_shader.cpp +new file mode 100644 +index 0000000..edad4c2 +--- /dev/null ++++ b/src/gallium/drivers/swr/swr_shader.cpp +@@ -0,0 +1,608 @@ ++/**************************************************************************** ++ * Copyright (C) 2015 Intel Corporation. All Rights Reserved. ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the "Software"), ++ * to deal in the Software without restriction, including without limitation ++ * the rights to use, copy, modify, merge, publish, distribute, sublicense, ++ * and/or sell copies of the Software, and to permit persons to whom the ++ * Software is furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice (including the next ++ * paragraph) shall be included in all copies or substantial portions of the ++ * Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++ * IN THE SOFTWARE. 
++ ***************************************************************************/ ++ ++#include "JitManager.h" ++#include "state.h" ++#include "state_llvm.h" ++#include "builder.h" ++ ++#include "llvm-c/Core.h" ++#include "llvm/Support/CBindingWrapping.h" ++ ++#include "tgsi/tgsi_strings.h" ++#include "gallivm/lp_bld_init.h" ++#include "gallivm/lp_bld_flow.h" ++#include "gallivm/lp_bld_struct.h" ++#include "gallivm/lp_bld_tgsi.h" ++ ++#include "swr_context.h" ++#include "swr_context_llvm.h" ++#include "swr_state.h" ++#include "swr_screen.h" ++ ++bool operator==(const swr_jit_key &lhs, const swr_jit_key &rhs) ++{ ++ return !memcmp(&lhs, &rhs, sizeof(lhs)); ++} ++ ++void ++swr_generate_fs_key(struct swr_jit_key &key, ++ struct swr_context *ctx, ++ swr_fragment_shader *swr_fs) ++{ ++ key.nr_cbufs = ctx->framebuffer.nr_cbufs; ++ key.light_twoside = ctx->rasterizer->light_twoside; ++ memcpy(&key.vs_output_semantic_name, ++ &ctx->vs->info.base.output_semantic_name, ++ sizeof(key.vs_output_semantic_name)); ++ memcpy(&key.vs_output_semantic_idx, ++ &ctx->vs->info.base.output_semantic_index, ++ sizeof(key.vs_output_semantic_idx)); ++ ++ key.nr_samplers = swr_fs->info.base.file_max[TGSI_FILE_SAMPLER] + 1; ++ ++ for (unsigned i = 0; i < key.nr_samplers; i++) { ++ if (swr_fs->info.base.file_mask[TGSI_FILE_SAMPLER] & (1 << i)) { ++ lp_sampler_static_sampler_state( ++ &key.sampler[i].sampler_state, ++ ctx->samplers[PIPE_SHADER_FRAGMENT][i]); ++ } ++ } ++ ++ /* ++ * XXX If TGSI_FILE_SAMPLER_VIEW exists assume all texture opcodes ++ * are dx10-style? Can't really have mixed opcodes, at least not ++ * if we want to skip the holes here (without rescanning tgsi). ++ */ ++ if (swr_fs->info.base.file_max[TGSI_FILE_SAMPLER_VIEW] != -1) { ++ key.nr_sampler_views = ++ swr_fs->info.base.file_max[TGSI_FILE_SAMPLER_VIEW] + 1; ++ for (unsigned i = 0; i < key.nr_sampler_views; i++) { ++ if (swr_fs->info.base.file_mask[TGSI_FILE_SAMPLER_VIEW] & (1 << i)) { ++ lp_sampler_static_texture_state( ++ &key.sampler[i].texture_state, ++ ctx->sampler_views[PIPE_SHADER_FRAGMENT][i]); ++ } ++ } ++ } else { ++ key.nr_sampler_views = key.nr_samplers; ++ for (unsigned i = 0; i < key.nr_sampler_views; i++) { ++ if (swr_fs->info.base.file_mask[TGSI_FILE_SAMPLER] & (1 << i)) { ++ lp_sampler_static_texture_state( ++ &key.sampler[i].texture_state, ++ ctx->sampler_views[PIPE_SHADER_FRAGMENT][i]); ++ } ++ } ++ } ++ ++ memcpy(&key.alphaTest, ++ &ctx->depth_stencil->alpha, ++ sizeof(struct pipe_alpha_state)); ++} ++ ++struct BuilderSWR : public Builder { ++ BuilderSWR(JitManager *pJitMgr) ++ : Builder(pJitMgr) ++ { ++ pJitMgr->SetupNewModule(); ++ } ++ ++ PFN_VERTEX_FUNC ++ CompileVS(struct pipe_context *ctx, swr_vertex_shader *swr_vs); ++ PFN_PIXEL_KERNEL CompileFS(struct swr_context *ctx, swr_jit_key &key); ++}; ++ ++PFN_VERTEX_FUNC ++BuilderSWR::CompileVS(struct pipe_context *ctx, swr_vertex_shader *swr_vs) ++{ ++ swr_vs->linkageMask = 0; ++ ++ for (unsigned i = 0; i < swr_vs->info.base.num_outputs; i++) { ++ switch (swr_vs->info.base.output_semantic_name[i]) { ++ case TGSI_SEMANTIC_POSITION: ++ break; ++ case TGSI_SEMANTIC_PSIZE: ++ swr_vs->pointSizeAttrib = i; ++ break; ++ default: ++ swr_vs->linkageMask |= (1 << i); ++ break; ++ } ++ } ++ ++ // tgsi_dump(swr_vs->pipe.tokens, 0); ++ ++ struct gallivm_state *gallivm = ++ gallivm_create("VS", wrap(&JM()->mContext)); ++ gallivm->module = wrap(JM()->mpCurrentModule); ++ ++ LLVMValueRef inputs[PIPE_MAX_SHADER_INPUTS][TGSI_NUM_CHANNELS]; ++ LLVMValueRef 
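++ // [Editor's note] gallivm generates shaders in SoA form: each ++ // [attrib][channel] slot in these arrays holds one SIMD vector carrying ++ // that channel for a whole batch of vertices; lp_type_float_vec(32, 32 * 8) ++ // further down selects 8 x f32 lanes, i.e. 256-bit AVX vectors.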
outputs[PIPE_MAX_SHADER_OUTPUTS][TGSI_NUM_CHANNELS]; ++ ++ memset(outputs, 0, sizeof(outputs)); ++ ++ AttrBuilder attrBuilder; ++ attrBuilder.addStackAlignmentAttr(JM()->mVWidth * sizeof(float)); ++ AttributeSet attrSet = AttributeSet::get( ++ JM()->mContext, AttributeSet::FunctionIndex, attrBuilder); ++ ++ std::vector vsArgs{PointerType::get(Gen_swr_draw_context(JM()), 0), ++ PointerType::get(Gen_SWR_VS_CONTEXT(JM()), 0)}; ++ FunctionType *vsFuncType = ++ FunctionType::get(Type::getVoidTy(JM()->mContext), vsArgs, false); ++ ++ // create new vertex shader function ++ auto pFunction = Function::Create(vsFuncType, ++ GlobalValue::ExternalLinkage, ++ "VS", ++ JM()->mpCurrentModule); ++ pFunction->addAttributes(AttributeSet::FunctionIndex, attrSet); ++ ++ BasicBlock *block = BasicBlock::Create(JM()->mContext, "entry", pFunction); ++ IRB()->SetInsertPoint(block); ++ LLVMPositionBuilderAtEnd(gallivm->builder, wrap(block)); ++ ++ auto argitr = pFunction->getArgumentList().begin(); ++ Value *hPrivateData = argitr++; ++ hPrivateData->setName("hPrivateData"); ++ Value *pVsCtx = argitr++; ++ pVsCtx->setName("vsCtx"); ++ ++ Value *consts_ptr = GEP(hPrivateData, {0, swr_draw_context_constantVS}); ++ consts_ptr->setName("vs_constants"); ++ Value *const_sizes_ptr = ++ GEP(hPrivateData, {0, swr_draw_context_num_constantsVS}); ++ const_sizes_ptr->setName("num_vs_constants"); ++ ++ Value *vtxInput = LOAD(pVsCtx, {0, SWR_VS_CONTEXT_pVin}); ++ ++ for (uint32_t attrib = 0; attrib < PIPE_MAX_SHADER_INPUTS; attrib++) { ++ const unsigned mask = swr_vs->info.base.input_usage_mask[attrib]; ++ for (uint32_t channel = 0; channel < TGSI_NUM_CHANNELS; channel++) { ++ if (mask & (1 << channel)) { ++ inputs[attrib][channel] = ++ wrap(LOAD(vtxInput, {0, 0, attrib, channel})); ++ } ++ } ++ } ++ ++ struct lp_bld_tgsi_system_values system_values; ++ memset(&system_values, 0, sizeof(system_values)); ++ system_values.instance_id = wrap(LOAD(pVsCtx, {0, SWR_VS_CONTEXT_InstanceID})); ++ system_values.vertex_id = wrap(LOAD(pVsCtx, {0, SWR_VS_CONTEXT_VertexID})); ++ ++ lp_build_tgsi_soa(gallivm, ++ swr_vs->pipe.tokens, ++ lp_type_float_vec(32, 32 * 8), ++ NULL, // mask ++ wrap(consts_ptr), ++ wrap(const_sizes_ptr), ++ &system_values, ++ inputs, ++ outputs, ++ NULL, // wrap(hPrivateData), (sampler context) ++ NULL, // sampler ++ &swr_vs->info.base, ++ NULL); // geometry shader face ++ ++ IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder))); ++ ++ Value *vtxOutput = LOAD(pVsCtx, {0, SWR_VS_CONTEXT_pVout}); ++ ++ for (uint32_t channel = 0; channel < TGSI_NUM_CHANNELS; channel++) { ++ for (uint32_t attrib = 0; attrib < PIPE_MAX_SHADER_OUTPUTS; attrib++) { ++ if (!outputs[attrib][channel]) ++ continue; ++ ++ Value *val = LOAD(unwrap(outputs[attrib][channel])); ++ STORE(val, vtxOutput, {0, 0, attrib, channel}); ++ } ++ } ++ ++ RET_VOID(); ++ ++ gallivm_verify_function(gallivm, wrap(pFunction)); ++ gallivm_compile_module(gallivm); ++ ++ // lp_debug_dump_value(func); ++ ++ PFN_VERTEX_FUNC pFunc = ++ (PFN_VERTEX_FUNC)gallivm_jit_function(gallivm, wrap(pFunction)); ++ ++ debug_printf("vert shader %p\n", pFunc); ++ assert(pFunc && "Error: VertShader = NULL"); ++ ++#if (LLVM_VERSION_MAJOR == 3) && (LLVM_VERSION_MINOR >= 5) ++ JM()->mIsModuleFinalized = true; ++#endif ++ ++ return pFunc; ++} ++ ++PFN_VERTEX_FUNC ++swr_compile_vs(struct pipe_context *ctx, swr_vertex_shader *swr_vs) ++{ ++ BuilderSWR builder( ++ reinterpret_cast(swr_screen(ctx->screen)->hJitMgr)); ++ return builder.CompileVS(ctx, swr_vs); ++} ++ ++static unsigned 
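++ // [Editor's note] The helper below maps an FS input semantic (name, index) ++ // to the VS output slot that feeds it. Returned slots are shifted down by ++ // one because position always occupies output 0 and is not part of the ++ // attribute linkage; 0xFFFFFFFF means no matching VS output was found.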
++locate_linkage(ubyte name, ubyte index, struct tgsi_shader_info *info) ++{ ++ for (int i = 0; i < PIPE_MAX_SHADER_OUTPUTS; i++) { ++ if ((info->output_semantic_name[i] == name) ++ && (info->output_semantic_index[i] == index)) { ++ return i - 1; // position is not part of the linkage ++ } ++ } ++ ++ if (name == TGSI_SEMANTIC_COLOR) { // BCOLOR fallback ++ for (int i = 0; i < PIPE_MAX_SHADER_OUTPUTS; i++) { ++ if ((info->output_semantic_name[i] == TGSI_SEMANTIC_BCOLOR) ++ && (info->output_semantic_index[i] == index)) { ++ return i - 1; // position is not part of the linkage ++ } ++ } ++ } ++ ++ return 0xFFFFFFFF; ++} ++ ++PFN_PIXEL_KERNEL ++BuilderSWR::CompileFS(struct swr_context *ctx, swr_jit_key &key) ++{ ++ struct swr_fragment_shader *swr_fs = ctx->fs; ++ ++ // tgsi_dump(swr_fs->pipe.tokens, 0); ++ ++ struct gallivm_state *gallivm = ++ gallivm_create("FS", wrap(&JM()->mContext)); ++ gallivm->module = wrap(JM()->mpCurrentModule); ++ ++ LLVMValueRef inputs[PIPE_MAX_SHADER_INPUTS][TGSI_NUM_CHANNELS]; ++ LLVMValueRef outputs[PIPE_MAX_SHADER_OUTPUTS][TGSI_NUM_CHANNELS]; ++ ++ memset(inputs, 0, sizeof(inputs)); ++ memset(outputs, 0, sizeof(outputs)); ++ ++ struct lp_build_sampler_soa *sampler = NULL; ++ ++ AttrBuilder attrBuilder; ++ attrBuilder.addStackAlignmentAttr(JM()->mVWidth * sizeof(float)); ++ AttributeSet attrSet = AttributeSet::get( ++ JM()->mContext, AttributeSet::FunctionIndex, attrBuilder); ++ ++ std::vector fsArgs{PointerType::get(Gen_swr_draw_context(JM()), 0), ++ PointerType::get(Gen_SWR_PS_CONTEXT(JM()), 0)}; ++ FunctionType *funcType = ++ FunctionType::get(Type::getVoidTy(JM()->mContext), fsArgs, false); ++ ++ auto pFunction = Function::Create(funcType, ++ GlobalValue::ExternalLinkage, ++ "FS", ++ JM()->mpCurrentModule); ++ pFunction->addAttributes(AttributeSet::FunctionIndex, attrSet); ++ ++ BasicBlock *block = BasicBlock::Create(JM()->mContext, "entry", pFunction); ++ IRB()->SetInsertPoint(block); ++ LLVMPositionBuilderAtEnd(gallivm->builder, wrap(block)); ++ ++ auto &args = pFunction->getArgumentList(); ++ Value *hPrivateData = args.begin(); ++ hPrivateData->setName("hPrivateData"); ++ Value *pPS = ++args.begin(); ++ pPS->setName("psCtx"); ++ ++ Value *consts_ptr = GEP(hPrivateData, {0, swr_draw_context_constantFS}); ++ consts_ptr->setName("fs_constants"); ++ Value *const_sizes_ptr = ++ GEP(hPrivateData, {0, swr_draw_context_num_constantsFS}); ++ const_sizes_ptr->setName("num_fs_constants"); ++ ++ // xxx should check for flat shading versus interpolation ++ ++ // load i ++ Value *vi = LOAD(pPS, {0, SWR_PS_CONTEXT_vI}, "i"); ++ ++ // load j ++ Value *vj = LOAD(pPS, {0, SWR_PS_CONTEXT_vJ}, "j"); ++ ++ // load/compute w ++ Value *vw = FDIV(VIMMED1(1.0f), LOAD(pPS, {0, SWR_PS_CONTEXT_vOneOverW})); ++ vw->setName("w"); ++ ++ // load *pAttribs, *pPerspAttribs ++ Value *pAttribs = LOAD(pPS, {0, SWR_PS_CONTEXT_pAttribs}, "pAttribs"); ++ Value *pPerspAttribs = ++ LOAD(pPS, {0, SWR_PS_CONTEXT_pPerspAttribs}, "pPerspAttribs"); ++ ++ swr_fs->constantMask = 0; ++ ++ for (int attrib = 0; attrib < PIPE_MAX_SHADER_INPUTS; attrib++) { ++ const unsigned mask = swr_fs->info.base.input_usage_mask[attrib]; ++ const unsigned interpMode = swr_fs->info.base.input_interpolate[attrib]; ++ ++ if (!mask) ++ continue; ++ ++ ubyte semantic_name = swr_fs->info.base.input_semantic_name[attrib]; ++ ubyte semantic_idx = swr_fs->info.base.input_semantic_index[attrib]; ++ ++ if (semantic_name == TGSI_SEMANTIC_FACE) { ++ Value *ff = ++ UI_TO_FP(LOAD(pPS, {0, SWR_PS_CONTEXT_frontFace}), mFP32Ty); ++ ff = 
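++ // [Editor's note] frontFace arrives as 0 (back) or 1 (front); the ++ // 2 * ff - 1 below remaps it to the TGSI_SEMANTIC_FACE convention of ++ // -1.0 / +1.0 before it is splatted across the SIMD lanes: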
FSUB(FMUL(ff, C(2.0f)), C(1.0f)); ++ ff = VECTOR_SPLAT(JM()->mVWidth, ff, "vFrontFace"); ++ ++ inputs[attrib][0] = wrap(ff); ++ inputs[attrib][1] = wrap(VIMMED1(0.0f)); ++ inputs[attrib][2] = wrap(VIMMED1(0.0f)); ++ inputs[attrib][3] = wrap(VIMMED1(1.0f)); ++ continue; ++ } else if (semantic_name == TGSI_SEMANTIC_POSITION) { // gl_FragCoord ++ inputs[attrib][0] = wrap(LOAD(pPS, {0, SWR_PS_CONTEXT_vX}, "vX")); ++ inputs[attrib][1] = wrap(LOAD(pPS, {0, SWR_PS_CONTEXT_vY}, "vY")); ++ inputs[attrib][2] = wrap(LOAD(pPS, {0, SWR_PS_CONTEXT_vZ}, "vZ")); ++ inputs[attrib][3] = ++ wrap(LOAD(pPS, {0, SWR_PS_CONTEXT_vOneOverW}, "vOneOverW")); ++ continue; ++ } else if (semantic_name == TGSI_SEMANTIC_PRIMID) { ++ Value *primID = LOAD(pPS, {0, SWR_PS_CONTEXT_primID}, "primID"); ++ inputs[attrib][0] = wrap(VECTOR_SPLAT(JM()->mVWidth, primID)); ++ inputs[attrib][1] = wrap(VIMMED1(0)); ++ inputs[attrib][2] = wrap(VIMMED1(0)); ++ inputs[attrib][3] = wrap(VIMMED1(0)); ++ continue; ++ } ++ ++ unsigned linkedAttrib = ++ locate_linkage(semantic_name, semantic_idx, &ctx->vs->info.base); ++ if (linkedAttrib == 0xFFFFFFFF) { ++ // not found - check for point sprite ++ if (ctx->rasterizer->sprite_coord_enable) { ++ linkedAttrib = ctx->vs->info.base.num_outputs - 1; ++ } else { ++ fprintf(stderr, ++ "Missing %s[%d]\n", ++ tgsi_semantic_names[semantic_name], ++ semantic_idx); ++ assert(0 && "attribute linkage not found"); ++ } ++ } ++ ++ if (interpMode == TGSI_INTERPOLATE_CONSTANT) { ++ swr_fs->constantMask |= 1 << linkedAttrib; ++ } ++ ++ for (int channel = 0; channel < TGSI_NUM_CHANNELS; channel++) { ++ if (mask & (1 << channel)) { ++ Value *indexA = C(linkedAttrib * 12 + channel); ++ Value *indexB = C(linkedAttrib * 12 + channel + 4); ++ Value *indexC = C(linkedAttrib * 12 + channel + 8); ++ ++ if ((semantic_name == TGSI_SEMANTIC_COLOR) ++ && ctx->rasterizer->light_twoside) { ++ unsigned bcolorAttrib = locate_linkage( ++ TGSI_SEMANTIC_BCOLOR, semantic_idx, &ctx->vs->info.base); ++ ++ unsigned diff = 12 * (bcolorAttrib - linkedAttrib); ++ ++ Value *back = ++ XOR(C(1), LOAD(pPS, {0, SWR_PS_CONTEXT_frontFace}), "backFace"); ++ ++ Value *offset = MUL(back, C(diff)); ++ offset->setName("offset"); ++ ++ indexA = ADD(indexA, offset); ++ indexB = ADD(indexB, offset); ++ indexC = ADD(indexC, offset); ++ ++ if (interpMode == TGSI_INTERPOLATE_CONSTANT) { ++ swr_fs->constantMask |= 1 << bcolorAttrib; ++ } ++ } ++ ++ Value *pAttribPtr = (interpMode == TGSI_INTERPOLATE_PERSPECTIVE) ++ ? 
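++ // [Editor's note] Attributes are packed twelve floats apart: four channels ++ // for each of the triangle's three vertices, which is why indexA/B/C above ++ // sit at linkedAttrib * 12 + channel, + 4 and + 8. The three per-vertex ++ // values are blended with the barycentric weights i, j and k = 1 - i - j ++ // below; the perspective path reads what appears to be the w-divided ++ // attribute array and rescales the result by the interpolated w: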
pPerspAttribs ++ : pAttribs; ++ ++ Value *va = ++ VECTOR_SPLAT(JM()->mVWidth, LOAD(GEP(pAttribPtr, indexA))); ++ Value *vb = ++ VECTOR_SPLAT(JM()->mVWidth, LOAD(GEP(pAttribPtr, indexB))); ++ Value *vc = ++ VECTOR_SPLAT(JM()->mVWidth, LOAD(GEP(pAttribPtr, indexC))); ++ ++ if (interpMode == TGSI_INTERPOLATE_CONSTANT) { ++ inputs[attrib][channel] = wrap(va); ++ } else { ++ Value *vk = FSUB(FSUB(VIMMED1(1.0f), vi), vj); ++ ++ vc = FMUL(vk, vc); ++ ++ Value *interp = FMUL(va, vi); ++ Value *interp1 = FMUL(vb, vj); ++ interp = FADD(interp, interp1); ++ interp = FADD(interp, vc); ++ if (interpMode == TGSI_INTERPOLATE_PERSPECTIVE) ++ interp = FMUL(interp, vw); ++ inputs[attrib][channel] = wrap(interp); ++ } ++ } ++ } ++ } ++ ++ sampler = swr_sampler_soa_create(key.sampler); ++ ++ struct lp_bld_tgsi_system_values system_values; ++ memset(&system_values, 0, sizeof(system_values)); ++ ++ struct lp_build_mask_context mask; ++ ++ if (swr_fs->info.base.uses_kill || key.alphaTest.enabled) { ++ Value *mask_val = LOAD(pPS, {0, SWR_PS_CONTEXT_mask}, "coverage_mask"); ++ lp_build_mask_begin( ++ &mask, gallivm, lp_type_float_vec(32, 32 * 8), wrap(mask_val)); ++ } ++ ++ lp_build_tgsi_soa(gallivm, ++ swr_fs->pipe.tokens, ++ lp_type_float_vec(32, 32 * 8), ++ swr_fs->info.base.uses_kill ? &mask : NULL, // mask ++ wrap(consts_ptr), ++ wrap(const_sizes_ptr), ++ &system_values, ++ inputs, ++ outputs, ++ wrap(hPrivateData), ++ sampler, // sampler ++ &swr_fs->info.base, ++ NULL); // geometry shader face ++ ++ IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder))); ++ ++ for (uint32_t attrib = 0; attrib < swr_fs->info.base.num_outputs; ++ attrib++) { ++ switch (swr_fs->info.base.output_semantic_name[attrib]) { ++ case TGSI_SEMANTIC_POSITION: { ++ // write z ++ LLVMValueRef outZ = ++ LLVMBuildLoad(gallivm->builder, outputs[attrib][2], ""); ++ STORE(unwrap(outZ), pPS, {0, SWR_PS_CONTEXT_vZ}); ++ break; ++ } ++ case TGSI_SEMANTIC_COLOR: { ++ for (uint32_t channel = 0; channel < TGSI_NUM_CHANNELS; channel++) { ++ if (!outputs[attrib][channel]) ++ continue; ++ ++ LLVMValueRef out = ++ LLVMBuildLoad(gallivm->builder, outputs[attrib][channel], ""); ++ if (swr_fs->info.base.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS]) { ++ for (uint32_t rt = 0; rt < key.nr_cbufs; rt++) { ++ STORE(unwrap(out), ++ pPS, ++ {0, SWR_PS_CONTEXT_shaded, rt, channel}); ++ } ++ } else { ++ STORE(unwrap(out), ++ pPS, ++ {0, ++ SWR_PS_CONTEXT_shaded, ++ swr_fs->info.base.output_semantic_index[attrib], ++ channel}); ++ } ++ } ++ break; ++ } ++ default: { ++ fprintf(stderr, ++ "unknown output from FS %s[%d]\n", ++ tgsi_semantic_names[swr_fs->info.base ++ .output_semantic_name[attrib]], ++ swr_fs->info.base.output_semantic_index[attrib]); ++ break; ++ } ++ } ++ } ++ ++ LLVMValueRef mask_result = 0; ++ if (swr_fs->info.base.uses_kill || key.alphaTest.enabled) { ++ mask_result = lp_build_mask_end(&mask); ++ } ++ ++ IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder))); ++ ++ if (key.alphaTest.enabled) { ++ unsigned linkage = ++ locate_linkage(TGSI_SEMANTIC_COLOR, 0, &ctx->fs->info.base) + 1; ++ ++ Value *alpha = LOAD( ++ pPS, {0, SWR_PS_CONTEXT_shaded, linkage, 3 /* alpha */}, "alpha"); ++ Value *ref = VIMMED1(key.alphaTest.ref_value); ++ ++ CmpInst::Predicate cmp = CmpInst::Predicate::FCMP_FALSE; ++ switch (key.alphaTest.func) { ++ case PIPE_FUNC_NEVER: ++ cmp = CmpInst::Predicate::FCMP_FALSE; ++ break; ++ case PIPE_FUNC_LESS: ++ cmp = CmpInst::Predicate::FCMP_OLT; ++ break; ++ case PIPE_FUNC_EQUAL: ++ cmp = 
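++ // [Editor's note] The PIPE_FUNC_* comparisons map onto LLVM's ordered ++ // floating-point predicates (FCMP_O*), which yield false when either ++ // operand is NaN, so a NaN alpha fails every test except PIPE_FUNC_ALWAYS.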
CmpInst::Predicate::FCMP_OEQ; ++ break; ++ case PIPE_FUNC_LEQUAL: ++ cmp = CmpInst::Predicate::FCMP_OLE; ++ break; ++ case PIPE_FUNC_GREATER: ++ cmp = CmpInst::Predicate::FCMP_OGT; ++ break; ++ case PIPE_FUNC_NOTEQUAL: ++ cmp = CmpInst::Predicate::FCMP_ONE; ++ break; ++ case PIPE_FUNC_GEQUAL: ++ cmp = CmpInst::Predicate::FCMP_OGE; ++ break; ++ case PIPE_FUNC_ALWAYS: ++ cmp = CmpInst::Predicate::FCMP_TRUE; ++ break; ++ } ++ ++ Value *alpha_result = ++ IRB()->CreateFCmp(cmp, alpha, ref, "alphaTestFunc"); ++ ++ mask_result = ++ wrap(AND(unwrap(mask_result), S_EXT(alpha_result, mSimdInt32Ty))); ++ } ++ ++ if (swr_fs->info.base.uses_kill || key.alphaTest.enabled) { ++ STORE(unwrap(mask_result), pPS, {0, SWR_PS_CONTEXT_mask}); ++ } ++ ++ RET_VOID(); ++ ++ gallivm_verify_function(gallivm, wrap(pFunction)); ++ ++ gallivm_compile_module(gallivm); ++ ++ PFN_PIXEL_KERNEL kernel = ++ (PFN_PIXEL_KERNEL)gallivm_jit_function(gallivm, wrap(pFunction)); ++ debug_printf("frag shader %p\n", kernel); ++ assert(kernel && "Error: FragShader = NULL"); ++ ++#if (LLVM_VERSION_MAJOR == 3) && (LLVM_VERSION_MINOR >= 5) ++ JM()->mIsModuleFinalized = true; ++#endif ++ ++ return kernel; ++} ++ ++PFN_PIXEL_KERNEL ++swr_compile_fs(struct swr_context *ctx, swr_jit_key &key) ++{ ++ BuilderSWR builder( ++ reinterpret_cast(swr_screen(ctx->pipe.screen)->hJitMgr)); ++ return builder.CompileFS(ctx, key); ++} +diff --git a/src/gallium/drivers/swr/swr_shader.h b/src/gallium/drivers/swr/swr_shader.h +new file mode 100644 +index 0000000..2962646 +--- /dev/null ++++ b/src/gallium/drivers/swr/swr_shader.h +@@ -0,0 +1,61 @@ ++/**************************************************************************** ++ * Copyright (C) 2015 Intel Corporation. All Rights Reserved. ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the "Software"), ++ * to deal in the Software without restriction, including without limitation ++ * the rights to use, copy, modify, merge, publish, distribute, sublicense, ++ * and/or sell copies of the Software, and to permit persons to whom the ++ * Software is furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice (including the next ++ * paragraph) shall be included in all copies or substantial portions of the ++ * Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++ * IN THE SOFTWARE. 
++ ***************************************************************************/ ++ ++#pragma once ++ ++class swr_vertex_shader; ++class swr_fragment_shader; ++class swr_jit_key; ++ ++PFN_VERTEX_FUNC ++swr_compile_vs(struct pipe_context *ctx, swr_vertex_shader *swr_vs); ++ ++PFN_PIXEL_KERNEL ++swr_compile_fs(struct swr_context *ctx, swr_jit_key &key); ++ ++void swr_generate_fs_key(struct swr_jit_key &key, ++ struct swr_context *ctx, ++ swr_fragment_shader *swr_fs); ++ ++struct swr_jit_key { ++ unsigned nr_cbufs; ++ unsigned light_twoside; ++ ubyte vs_output_semantic_name[PIPE_MAX_SHADER_OUTPUTS]; ++ ubyte vs_output_semantic_idx[PIPE_MAX_SHADER_OUTPUTS]; ++ unsigned nr_samplers; ++ unsigned nr_sampler_views; ++ struct swr_sampler_static_state sampler[PIPE_MAX_SHADER_SAMPLER_VIEWS]; ++ struct pipe_alpha_state alphaTest; ++}; ++ ++namespace std ++{ ++template <> struct hash { ++ std::size_t operator()(const swr_jit_key &k) const ++ { ++ return util_hash_crc32(&k, sizeof(k)); ++ } ++}; ++}; ++ ++bool operator==(const swr_jit_key &lhs, const swr_jit_key &rhs); +diff --git a/src/gallium/drivers/swr/swr_state.cpp b/src/gallium/drivers/swr/swr_state.cpp +new file mode 100644 +index 0000000..fa16844 +--- /dev/null ++++ b/src/gallium/drivers/swr/swr_state.cpp +@@ -0,0 +1,1344 @@ ++/**************************************************************************** ++ * Copyright (C) 2015 Intel Corporation. All Rights Reserved. ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the "Software"), ++ * to deal in the Software without restriction, including without limitation ++ * the rights to use, copy, modify, merge, publish, distribute, sublicense, ++ * and/or sell copies of the Software, and to permit persons to whom the ++ * Software is furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice (including the next ++ * paragraph) shall be included in all copies or substantial portions of the ++ * Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++ * IN THE SOFTWARE. ++ ***************************************************************************/ ++ ++#include "common/os.h" ++#include "jit_api.h" ++#include "JitManager.h" ++#include "state_llvm.h" ++ ++#include "gallivm/lp_bld_tgsi.h" ++#include "util/u_format.h" ++ ++#include "util/u_memory.h" ++#include "util/u_inlines.h" ++#include "util/u_helpers.h" ++#include "util/u_framebuffer.h" ++ ++#include "swr_state.h" ++#include "swr_context.h" ++#include "swr_context_llvm.h" ++#include "swr_screen.h" ++#include "swr_resource.h" ++#include "swr_tex_sample.h" ++#include "swr_scratch.h" ++#include "swr_shader.h" ++ ++/* These should be pulled out into separate files as necessary ++ * Just initializing everything here to get going. 
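++ * (Editor's note: this file wires up the gallium CSO hooks - create/bind/ ++ * delete for blend, depth-stencil, rasterizer, sampler, shader and ++ * vertex-element state - and swr_update_derived, which revalidates ++ * everything flagged in ctx->dirty before each draw.)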
*/ ++ ++static void * ++swr_create_blend_state(struct pipe_context *pipe, ++ const struct pipe_blend_state *blend) ++{ ++ struct swr_blend_state *state = CALLOC_STRUCT(swr_blend_state); ++ ++ memcpy(&state->pipe, blend, sizeof(*blend)); ++ ++ struct pipe_blend_state *pipe_blend = &state->pipe; ++ ++ for (int target = 0; ++ target < std::min(SWR_NUM_RENDERTARGETS, PIPE_MAX_COLOR_BUFS); ++ target++) { ++ state->compileState[target].independentAlphaBlendEnable = ++ pipe_blend->independent_blend_enable; ++ ++ struct pipe_rt_blend_state *rt_blend = &pipe_blend->rt[target]; ++ SWR_RENDER_TARGET_BLEND_STATE &targetState = ++ state->compileState[target].blendState; ++ ++ if (target != 0 && !pipe_blend->independent_blend_enable) { ++ memcpy(&targetState, &state->compileState[0].blendState, sizeof(SWR_RENDER_TARGET_BLEND_STATE)); ++ continue; ++ } ++ ++ targetState.colorBlendEnable = rt_blend->blend_enable; ++ if (targetState.colorBlendEnable) { ++ targetState.sourceAlphaBlendFactor = ++ swr_convert_blend_factor(rt_blend->alpha_src_factor); ++ targetState.destAlphaBlendFactor = ++ swr_convert_blend_factor(rt_blend->alpha_dst_factor); ++ targetState.sourceBlendFactor = ++ swr_convert_blend_factor(rt_blend->rgb_src_factor); ++ targetState.destBlendFactor = ++ swr_convert_blend_factor(rt_blend->rgb_dst_factor); ++ ++ targetState.colorBlendFunc = ++ swr_convert_blend_func(rt_blend->rgb_func); ++ targetState.alphaBlendFunc = ++ swr_convert_blend_func(rt_blend->alpha_func); ++ } ++ ++ targetState.writeDisableRed = ++ (rt_blend->colormask & PIPE_MASK_R) ? 0 : 1; ++ targetState.writeDisableGreen = ++ (rt_blend->colormask & PIPE_MASK_G) ? 0 : 1; ++ targetState.writeDisableBlue = ++ (rt_blend->colormask & PIPE_MASK_B) ? 0 : 1; ++ targetState.writeDisableAlpha = ++ (rt_blend->colormask & PIPE_MASK_A) ? 
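++ // [Editor's note] SWR stores per-channel write *disables*, the inverse of ++ // gallium's write-enable colormask, hence the inverted ternaries here: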
0 : 1; ++ } ++ ++ return state; ++} ++ ++static void ++swr_bind_blend_state(struct pipe_context *pipe, void *blend) ++{ ++ struct swr_context *ctx = swr_context(pipe); ++ ++ if (ctx->blend == blend) ++ return; ++ ++ ctx->blend = (swr_blend_state *)blend; ++ ++ ctx->dirty |= SWR_NEW_BLEND; ++} ++ ++static void ++swr_delete_blend_state(struct pipe_context *pipe, void *blend) ++{ ++ FREE(blend); ++} ++ ++static void ++swr_set_blend_color(struct pipe_context *pipe, ++ const struct pipe_blend_color *color) ++{ ++ struct swr_context *ctx = swr_context(pipe); ++ ++ ctx->blend_color = *color; ++ ++ ctx->dirty |= SWR_NEW_BLEND; ++} ++ ++static void ++swr_set_stencil_ref(struct pipe_context *pipe, ++ const struct pipe_stencil_ref *ref) ++{ ++ struct swr_context *ctx = swr_context(pipe); ++ ++ ctx->stencil_ref = *ref; ++ ++ ctx->dirty |= SWR_NEW_DEPTH_STENCIL_ALPHA; ++} ++ ++static void * ++swr_create_depth_stencil_state( ++ struct pipe_context *pipe, ++ const struct pipe_depth_stencil_alpha_state *depth_stencil) ++{ ++ struct pipe_depth_stencil_alpha_state *state; ++ ++ state = (pipe_depth_stencil_alpha_state *)mem_dup(depth_stencil, ++ sizeof *depth_stencil); ++ ++ return state; ++} ++ ++static void ++swr_bind_depth_stencil_state(struct pipe_context *pipe, void *depth_stencil) ++{ ++ struct swr_context *ctx = swr_context(pipe); ++ ++ if (ctx->depth_stencil == (pipe_depth_stencil_alpha_state *)depth_stencil) ++ return; ++ ++ ctx->depth_stencil = (pipe_depth_stencil_alpha_state *)depth_stencil; ++ ++ ctx->dirty |= SWR_NEW_DEPTH_STENCIL_ALPHA; ++} ++ ++static void ++swr_delete_depth_stencil_state(struct pipe_context *pipe, void *depth) ++{ ++ FREE(depth); ++} ++ ++ ++static void * ++swr_create_rasterizer_state(struct pipe_context *pipe, ++ const struct pipe_rasterizer_state *rast) ++{ ++ struct pipe_rasterizer_state *state; ++ state = (pipe_rasterizer_state *)mem_dup(rast, sizeof *rast); ++ ++ return state; ++} ++ ++static void ++swr_bind_rasterizer_state(struct pipe_context *pipe, void *handle) ++{ ++ struct swr_context *ctx = swr_context(pipe); ++ const struct pipe_rasterizer_state *rasterizer = ++ (const struct pipe_rasterizer_state *)handle; ++ ++ if (ctx->rasterizer == (pipe_rasterizer_state *)rasterizer) ++ return; ++ ++ ctx->rasterizer = (pipe_rasterizer_state *)rasterizer; ++ ++ ctx->dirty |= SWR_NEW_RASTERIZER; ++} ++ ++static void ++swr_delete_rasterizer_state(struct pipe_context *pipe, void *rasterizer) ++{ ++ FREE(rasterizer); ++} ++ ++ ++static void * ++swr_create_sampler_state(struct pipe_context *pipe, ++ const struct pipe_sampler_state *sampler) ++{ ++ struct pipe_sampler_state *state = ++ (pipe_sampler_state *)mem_dup(sampler, sizeof *sampler); ++ ++ return state; ++} ++ ++static void ++swr_bind_sampler_states(struct pipe_context *pipe, ++ unsigned shader, ++ unsigned start, ++ unsigned num, ++ void **samplers) ++{ ++ struct swr_context *ctx = swr_context(pipe); ++ unsigned i; ++ ++ assert(shader < PIPE_SHADER_TYPES); ++ assert(start + num <= Elements(ctx->samplers[shader])); ++ ++ /* set the new samplers */ ++ ctx->num_samplers[shader] = num; ++ for (i = 0; i < num; i++) { ++ ctx->samplers[shader][start + i] = (pipe_sampler_state *)samplers[i]; ++ } ++ ++ ctx->dirty |= SWR_NEW_SAMPLER; ++} ++ ++static void ++swr_delete_sampler_state(struct pipe_context *pipe, void *sampler) ++{ ++ FREE(sampler); ++} ++ ++ ++static struct pipe_sampler_view * ++swr_create_sampler_view(struct pipe_context *pipe, ++ struct pipe_resource *texture, ++ const struct pipe_sampler_view *templ) ++{ ++ struct 
pipe_sampler_view *view = CALLOC_STRUCT(pipe_sampler_view); ++ ++ if (view) { ++ *view = *templ; ++ view->reference.count = 1; ++ view->texture = NULL; ++ pipe_resource_reference(&view->texture, texture); ++ view->context = pipe; ++ } ++ ++ return view; ++} ++ ++static void ++swr_set_sampler_views(struct pipe_context *pipe, ++ unsigned shader, ++ unsigned start, ++ unsigned num, ++ struct pipe_sampler_view **views) ++{ ++ struct swr_context *ctx = swr_context(pipe); ++ uint i; ++ ++ assert(num <= PIPE_MAX_SHADER_SAMPLER_VIEWS); ++ ++ assert(shader < PIPE_SHADER_TYPES); ++ assert(start + num <= Elements(ctx->sampler_views[shader])); ++ ++ /* set the new sampler views */ ++ ctx->num_sampler_views[shader] = num; ++ for (i = 0; i < num; i++) { ++ /* Note: we're using pipe_sampler_view_release() here to work around ++ * a possible crash when the old view belongs to another context that ++ * was already destroyed. ++ */ ++ pipe_sampler_view_release(pipe, &ctx->sampler_views[shader][start + i]); ++ pipe_sampler_view_reference(&ctx->sampler_views[shader][start + i], ++ views[i]); ++ } ++ ++ ctx->dirty |= SWR_NEW_SAMPLER_VIEW; ++} ++ ++static void ++swr_sampler_view_destroy(struct pipe_context *pipe, ++ struct pipe_sampler_view *view) ++{ ++ pipe_resource_reference(&view->texture, NULL); ++ FREE(view); ++} ++ ++static void * ++swr_create_vs_state(struct pipe_context *pipe, ++ const struct pipe_shader_state *vs) ++{ ++ struct swr_vertex_shader *swr_vs = ++ (swr_vertex_shader *)CALLOC_STRUCT(swr_vertex_shader); ++ if (!swr_vs) ++ return NULL; ++ ++ swr_vs->pipe.tokens = tgsi_dup_tokens(vs->tokens); ++ swr_vs->pipe.stream_output = vs->stream_output; ++ ++ lp_build_tgsi_info(vs->tokens, &swr_vs->info); ++ ++ swr_vs->func = swr_compile_vs(pipe, swr_vs); ++ ++ swr_vs->soState = {0}; ++ ++ if (swr_vs->pipe.stream_output.num_outputs) { ++ pipe_stream_output_info *stream_output = &swr_vs->pipe.stream_output; ++ ++ swr_vs->soState.soEnable = true; ++ // soState.rasterizerDisable set on state dirty ++ // soState.streamToRasterizer not used ++ ++ for (uint32_t i = 0; i < stream_output->num_outputs; i++) { ++ swr_vs->soState.streamMasks[stream_output->output[i].stream] |= ++ 1 << (stream_output->output[i].register_index - 1); ++ } ++ for (uint32_t i = 0; i < MAX_SO_STREAMS; i++) { ++ swr_vs->soState.streamNumEntries[i] = ++ _mm_popcnt_u32(swr_vs->soState.streamMasks[i]); ++ } ++ } ++ ++ return swr_vs; ++} ++ ++static void ++swr_bind_vs_state(struct pipe_context *pipe, void *vs) ++{ ++ struct swr_context *ctx = swr_context(pipe); ++ ++ if (ctx->vs == vs) ++ return; ++ ++ ctx->vs = (swr_vertex_shader *)vs; ++ ctx->dirty |= SWR_NEW_VS; ++} ++ ++static void ++swr_delete_vs_state(struct pipe_context *pipe, void *vs) ++{ ++ struct swr_vertex_shader *swr_vs = (swr_vertex_shader *)vs; ++ FREE((void *)swr_vs->pipe.tokens); ++ FREE(vs); ++} ++ ++static void * ++swr_create_fs_state(struct pipe_context *pipe, ++ const struct pipe_shader_state *fs) ++{ ++ struct swr_fragment_shader *swr_fs = new swr_fragment_shader; ++ if (!swr_fs) ++ return NULL; ++ ++ swr_fs->pipe.tokens = tgsi_dup_tokens(fs->tokens); ++ ++ lp_build_tgsi_info(fs->tokens, &swr_fs->info); ++ ++ return swr_fs; ++} ++ ++ ++static void ++swr_bind_fs_state(struct pipe_context *pipe, void *fs) ++{ ++ struct swr_context *ctx = swr_context(pipe); ++ ++ if (ctx->fs == fs) ++ return; ++ ++ ctx->fs = (swr_fragment_shader *)fs; ++ ctx->dirty |= SWR_NEW_FS; ++} ++ ++static void ++swr_delete_fs_state(struct pipe_context *pipe, void *fs) ++{ ++ struct swr_fragment_shader 
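++ // [Editor's note] Unlike the CALLOC'd/FREE'd vertex shader CSO, the ++ // fragment shader is allocated with new and released with delete because ++ // it owns C++ members, such as the per-key variant map consulted in ++ // swr_update_derived.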
*swr_fs = (swr_fragment_shader *)fs; ++ FREE((void *)swr_fs->pipe.tokens); ++ delete swr_fs; ++} ++ ++ ++static void ++swr_set_constant_buffer(struct pipe_context *pipe, ++ uint shader, ++ uint index, ++ struct pipe_constant_buffer *cb) ++{ ++ struct swr_context *ctx = swr_context(pipe); ++ struct pipe_resource *constants = cb ? cb->buffer : NULL; ++ ++ assert(shader < PIPE_SHADER_TYPES); ++ assert(index < Elements(ctx->constants[shader])); ++ ++ /* note: reference counting */ ++ util_copy_constant_buffer(&ctx->constants[shader][index], cb); ++ ++ if (shader == PIPE_SHADER_VERTEX || shader == PIPE_SHADER_GEOMETRY) { ++ ctx->dirty |= SWR_NEW_VSCONSTANTS; ++ } else if (shader == PIPE_SHADER_FRAGMENT) { ++ ctx->dirty |= SWR_NEW_FSCONSTANTS; ++ } ++ ++ if (cb && cb->user_buffer) { ++ pipe_resource_reference(&constants, NULL); ++ } ++} ++ ++ ++static void * ++swr_create_vertex_elements_state(struct pipe_context *pipe, ++ unsigned num_elements, ++ const struct pipe_vertex_element *attribs) ++{ ++ struct swr_vertex_element_state *velems; ++ assert(num_elements <= PIPE_MAX_ATTRIBS); ++ velems = CALLOC_STRUCT(swr_vertex_element_state); ++ if (velems) { ++ velems->fsState.numAttribs = num_elements; ++ for (unsigned i = 0; i < num_elements; i++) { ++ // XXX: we should do this keyed on the VS usage info ++ ++ const struct util_format_description *desc = ++ util_format_description(attribs[i].src_format); ++ ++ velems->fsState.layout[i].AlignedByteOffset = attribs[i].src_offset; ++ velems->fsState.layout[i].Format = ++ mesa_to_swr_format(attribs[i].src_format); ++ velems->fsState.layout[i].StreamIndex = ++ attribs[i].vertex_buffer_index; ++ velems->fsState.layout[i].InstanceEnable = ++ attribs[i].instance_divisor != 0; ++ velems->fsState.layout[i].ComponentControl0 = ++ desc->channel[0].type != UTIL_FORMAT_TYPE_VOID ++ ? ComponentControl::StoreSrc ++ : ComponentControl::Store0; ++ velems->fsState.layout[i].ComponentControl1 = ++ desc->channel[1].type != UTIL_FORMAT_TYPE_VOID ++ ? ComponentControl::StoreSrc ++ : ComponentControl::Store0; ++ velems->fsState.layout[i].ComponentControl2 = ++ desc->channel[2].type != UTIL_FORMAT_TYPE_VOID ++ ? ComponentControl::StoreSrc ++ : ComponentControl::Store0; ++ velems->fsState.layout[i].ComponentControl3 = ++ desc->channel[3].type != UTIL_FORMAT_TYPE_VOID ++ ? ComponentControl::StoreSrc ++ : ComponentControl::Store1Fp; ++ velems->fsState.layout[i].ComponentPacking = ComponentEnable::XYZW; ++ velems->fsState.layout[i].InstanceDataStepRate = ++ attribs[i].instance_divisor; ++ ++ /* Calculate the pitch of each stream */ ++ const SWR_FORMAT_INFO &swr_desc = GetFormatInfo( ++ mesa_to_swr_format(attribs[i].src_format)); ++ velems->stream_pitch[attribs[i].vertex_buffer_index] += swr_desc.Bpp; ++ } ++ } ++ ++ return velems; ++} ++ ++static void ++swr_bind_vertex_elements_state(struct pipe_context *pipe, void *velems) ++{ ++ struct swr_context *ctx = swr_context(pipe); ++ struct swr_vertex_element_state *swr_velems = ++ (struct swr_vertex_element_state *)velems; ++ ++ ctx->velems = swr_velems; ++ ctx->dirty |= SWR_NEW_VERTEX; ++} ++ ++static void ++swr_delete_vertex_elements_state(struct pipe_context *pipe, void *velems) ++{ ++ /* XXX Need to destroy fetch shader? 
*/ ++ FREE(velems); ++} ++ ++ ++static void ++swr_set_vertex_buffers(struct pipe_context *pipe, ++ unsigned start_slot, ++ unsigned num_elements, ++ const struct pipe_vertex_buffer *buffers) ++{ ++ struct swr_context *ctx = swr_context(pipe); ++ ++ assert(num_elements <= PIPE_MAX_ATTRIBS); ++ ++ util_set_vertex_buffers_count(ctx->vertex_buffer, ++ &ctx->num_vertex_buffers, ++ buffers, ++ start_slot, ++ num_elements); ++ ++ ctx->dirty |= SWR_NEW_VERTEX; ++} ++ ++ ++static void ++swr_set_index_buffer(struct pipe_context *pipe, ++ const struct pipe_index_buffer *ib) ++{ ++ struct swr_context *ctx = swr_context(pipe); ++ ++ if (ib) ++ memcpy(&ctx->index_buffer, ib, sizeof(ctx->index_buffer)); ++ else ++ memset(&ctx->index_buffer, 0, sizeof(ctx->index_buffer)); ++ ++ ctx->dirty |= SWR_NEW_VERTEX; ++} ++ ++static void ++swr_set_polygon_stipple(struct pipe_context *pipe, ++ const struct pipe_poly_stipple *stipple) ++{ ++ struct swr_context *ctx = swr_context(pipe); ++ ++ ctx->poly_stipple = *stipple; /* struct copy */ ++ ctx->dirty |= SWR_NEW_STIPPLE; ++} ++ ++static void ++swr_set_clip_state(struct pipe_context *pipe, ++ const struct pipe_clip_state *clip) ++{ ++ struct swr_context *ctx = swr_context(pipe); ++ ++ ctx->clip = *clip; ++ /* XXX Unimplemented, but prevents crash */ ++ ++ ctx->dirty |= SWR_NEW_CLIP; ++} ++ ++ ++static void ++swr_set_scissor_states(struct pipe_context *pipe, ++ unsigned start_slot, ++ unsigned num_viewports, ++ const struct pipe_scissor_state *scissor) ++{ ++ struct swr_context *ctx = swr_context(pipe); ++ ++ ctx->scissor = *scissor; ++ ctx->dirty |= SWR_NEW_SCISSOR; ++} ++ ++static void ++swr_set_viewport_states(struct pipe_context *pipe, ++ unsigned start_slot, ++ unsigned num_viewports, ++ const struct pipe_viewport_state *vpt) ++{ ++ struct swr_context *ctx = swr_context(pipe); ++ ++ ctx->viewport = *vpt; ++ ctx->dirty |= SWR_NEW_VIEWPORT; ++} ++ ++ ++static void ++swr_set_framebuffer_state(struct pipe_context *pipe, ++ const struct pipe_framebuffer_state *fb) ++{ ++ struct swr_context *ctx = swr_context(pipe); ++ ++ boolean changed = !util_framebuffer_state_equal(&ctx->framebuffer, fb); ++ ++ assert(fb->width <= KNOB_GUARDBAND_WIDTH); ++ assert(fb->height <= KNOB_GUARDBAND_HEIGHT); ++ ++ if (changed) { ++ unsigned i; ++ for (i = 0; i < fb->nr_cbufs; ++i) ++ pipe_surface_reference(&ctx->framebuffer.cbufs[i], fb->cbufs[i]); ++ for (; i < ctx->framebuffer.nr_cbufs; ++i) ++ pipe_surface_reference(&ctx->framebuffer.cbufs[i], NULL); ++ ++ ctx->framebuffer.nr_cbufs = fb->nr_cbufs; ++ ++ ctx->framebuffer.width = fb->width; ++ ctx->framebuffer.height = fb->height; ++ ++ pipe_surface_reference(&ctx->framebuffer.zsbuf, fb->zsbuf); ++ ++ ctx->dirty |= SWR_NEW_FRAMEBUFFER; ++ } ++} ++ ++ ++static void ++swr_set_sample_mask(struct pipe_context *pipe, unsigned sample_mask) ++{ ++ struct swr_context *ctx = swr_context(pipe); ++ ++ if (sample_mask != ctx->sample_mask) { ++ ctx->sample_mask = sample_mask; ++ ctx->dirty |= SWR_NEW_RASTERIZER; ++ } ++} ++ ++ ++void ++swr_update_derived(struct swr_context *ctx, ++ const struct pipe_draw_info *p_draw_info) ++{ ++ /* Any state that requires dirty flags to be re-triggered sets this mask */ ++ /* For example, user_buffer vertex and index buffers. 
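++ * (Editor's note: client user_buffer contents are copied into per-draw ++ * scratch space further down, so they must be re-uploaded, and therefore ++ * re-dirtied, on every draw.)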
*/ ++ unsigned post_update_dirty_flags = 0; ++ ++ /* Render Targets */ ++ if (ctx->dirty & SWR_NEW_FRAMEBUFFER) { ++ struct pipe_framebuffer_state *fb = &ctx->framebuffer; ++ SWR_SURFACE_STATE *new_attachment[SWR_NUM_ATTACHMENTS] = {0}; ++ boolean changed, need_idle; ++ UINT i; ++ ++ /* colorbuffer targets */ ++ if (fb->nr_cbufs) ++ for (i = 0; i < fb->nr_cbufs; ++i) ++ if (fb->cbufs[i]) { ++ struct swr_resource *colorBuffer = ++ swr_resource(fb->cbufs[i]->texture); ++ new_attachment[SWR_ATTACHMENT_COLOR0 + i] = &colorBuffer->swr; ++ } ++ ++ /* depth/stencil target */ ++ if (fb->zsbuf) { ++ struct swr_resource *depthStencilBuffer = ++ swr_resource(fb->zsbuf->texture); ++ if (depthStencilBuffer->has_depth) { ++ new_attachment[SWR_ATTACHMENT_DEPTH] = &depthStencilBuffer->swr; ++ ++ if (depthStencilBuffer->has_stencil) ++ new_attachment[SWR_ATTACHMENT_STENCIL] = ++ &depthStencilBuffer->secondary; ++ ++ } else if (depthStencilBuffer->has_stencil) ++ new_attachment[SWR_ATTACHMENT_STENCIL] = &depthStencilBuffer->swr; ++ } ++ ++ /* For each attachment that has changed, store tile contents to render ++ * target */ ++ changed = FALSE; ++ need_idle = FALSE; ++ for (i = 0; i < SWR_NUM_ATTACHMENTS; i++) { ++ if ((uintptr_t)ctx->current.attachment[i] ++ ^ (uintptr_t)new_attachment[i]) { ++ if (ctx->current.attachment[i]) { ++ enum SWR_TILE_STATE post_state; ++ post_state = ++ (new_attachment[i] ? SWR_TILE_INVALID : SWR_TILE_RESOLVED); ++ swr_store_render_target(ctx, i, post_state); ++ need_idle |= TRUE; ++ } ++ changed |= TRUE; ++ } ++ } ++ ++ /* ++ * Attachments are live, don't update any until idle ++ * (all StoreTiles, called by swr_store_render_targets, finish) ++ */ ++ if (need_idle) ++ SwrWaitForIdle(ctx->swrContext); ++ ++ if (changed) { ++ /* Update actual SWR core attachments, or clear those no longer ++ * attached */ ++ swr_draw_context *pDC = ++ (swr_draw_context *)SwrGetPrivateContextState(ctx->swrContext); ++ SWR_SURFACE_STATE *renderTargets = pDC->renderTargets; ++ for (i = 0; i < SWR_NUM_ATTACHMENTS; i++) { ++ if ((uintptr_t)ctx->current.attachment[i] ++ ^ (uintptr_t)new_attachment[i]) { ++ if (new_attachment[i]) { ++ renderTargets[i] = *new_attachment[i]; ++ ctx->current.attachment[i] = new_attachment[i]; ++ } else { ++ renderTargets[i] = {0}; ++ ctx->current.attachment[i] = nullptr; ++ } ++ } ++ } ++ ++ /* rendertarget changes also necessitate updating other state */ ++ ctx->dirty |= SWR_NEW_BLEND | SWR_NEW_SAMPLER_VIEW | SWR_NEW_VS ++ | SWR_NEW_FS | SWR_NEW_RASTERIZER | SWR_NEW_VIEWPORT ++ | SWR_NEW_DEPTH_STENCIL_ALPHA; ++ } ++ } ++ ++ /* Raster state */ ++ if (ctx->dirty & (SWR_NEW_RASTERIZER | SWR_NEW_VS)) { ++ SWR_RASTSTATE *rastState = &ctx->current.rastState; ++ rastState->cullMode = swr_convert_cull_mode(ctx->rasterizer->cull_face); ++ rastState->frontWinding = ctx->rasterizer->front_ccw ++ ? SWR_FRONTWINDING_CCW ++ : SWR_FRONTWINDING_CW; ++ rastState->scissorEnable = ctx->rasterizer->scissor; ++ rastState->pointSize = ctx->rasterizer->point_size > 0.0f ++ ? ctx->rasterizer->point_size ++ : 1.0f; ++ rastState->lineWidth = ctx->rasterizer->line_width > 0.0f ++ ? 
ctx->rasterizer->line_width ++ : 1.0f; ++ ++ rastState->pointParam = ctx->rasterizer->point_size_per_vertex; ++ rastState->pointSizeAttrib = ctx->vs->pointSizeAttrib; ++ ++ rastState->pointSpriteEnable = ctx->rasterizer->sprite_coord_enable; ++ rastState->pointSpriteTopOrigin = ++ ctx->rasterizer->sprite_coord_mode == PIPE_SPRITE_COORD_UPPER_LEFT; ++ rastState->pointSpriteFESlot = ctx->vs->info.base.num_outputs; ++ ++ /* XXX TODO: Add multisample */ ++ rastState->sampleCount = SWR_MULTISAMPLE_1X; ++ ++ bool do_offset = false; ++ switch (ctx->rasterizer->fill_front) { ++ case PIPE_POLYGON_MODE_FILL: ++ do_offset = ctx->rasterizer->offset_tri; ++ break; ++ case PIPE_POLYGON_MODE_LINE: ++ do_offset = ctx->rasterizer->offset_line; ++ break; ++ case PIPE_POLYGON_MODE_POINT: ++ do_offset = ctx->rasterizer->offset_point; ++ break; ++ } ++ ++ if (do_offset) { ++ rastState->depthBias = ctx->rasterizer->offset_units; ++ rastState->slopeScaledDepthBias = ctx->rasterizer->offset_scale; ++ rastState->depthBiasClamp = ctx->rasterizer->offset_clamp; ++ } else { ++ rastState->depthBias = 0; ++ rastState->slopeScaledDepthBias = 0; ++ rastState->depthBiasClamp = 0; ++ } ++ struct pipe_surface *zb = ctx->framebuffer.zsbuf; ++ if (zb && swr_resource(zb->texture)->has_depth) ++ rastState->depthFormat = swr_resource(zb->texture)->swr.format; ++ ++ rastState->depthClipEnable = ctx->rasterizer->depth_clip; ++ ++ SwrSetRastState(ctx->swrContext, rastState); ++ } ++ ++ /* Scissor */ ++ if (ctx->dirty & SWR_NEW_SCISSOR) { ++ BBOX bbox(ctx->scissor.miny, ctx->scissor.maxy, ++ ctx->scissor.minx, ctx->scissor.maxx); ++ SwrSetScissorRects(ctx->swrContext, 1, &bbox); ++ } ++ ++ /* Viewport */ ++ if (ctx->dirty & SWR_NEW_VIEWPORT) { ++ pipe_viewport_state *state = &ctx->viewport; ++ SWR_VIEWPORT *vp = &ctx->current.vp; ++ SWR_VIEWPORT_MATRIX *vpm = &ctx->current.vpm; ++ ++ const float scale_x = fabs(state->scale[0]); ++ const float scale_y = fabs(state->scale[1]); ++ const float scale_z = fabs(state->scale[2]); ++ ++ vp->x = state->translate[0] - scale_x; ++ vp->width = state->translate[0] + scale_x; ++ vp->y = state->translate[1] - scale_y; ++ vp->height = state->translate[1] + scale_y; ++ if (ctx->rasterizer->clip_halfz == 0) { ++ vp->minZ = state->translate[2] - scale_z; ++ vp->maxZ = state->translate[2] + scale_z; ++ } else { ++ vp->minZ = state->translate[2]; ++ vp->maxZ = state->translate[2] + scale_z; ++ } ++ ++ /* Flip viewport for all targets except samplable textures. */ ++ /* XXX This may not be sufficient for multiple rendertargets */ ++ struct pipe_surface *cb = ctx->framebuffer.cbufs[0]; ++ if (cb && ++ !(swr_resource(cb->texture)->base.bind & PIPE_BIND_SAMPLER_VIEW)) { ++ /* Flip y and y-translate in the viewport matrix. */ ++ vpm->m00 = (vp->width - vp->x) / 2.0f; ++ vpm->m11 = (vp->y - vp->height) / 2.0f; ++ vpm->m22 = (vp->maxZ - vp->minZ) / 2.0f; ++ vpm->m30 = vp->x + vpm->m00; ++ vpm->m31 = vp->height + vpm->m11; ++ vpm->m32 = vp->minZ + vpm->m22; ++ } else { ++ vpm->m00 = (vp->width - vp->x) / 2.0f; ++ vpm->m11 = (vp->height - vp->y) / 2.0f; ++ vpm->m22 = (vp->maxZ - vp->minZ) / 2.0f; ++ vpm->m30 = vp->x + vpm->m00; ++ vpm->m31 = vp->y + vpm->m11; ++ vpm->m32 = vp->minZ + vpm->m22; ++ } ++ ++ /* Now that the matrix is calculated, clip the view coords to screen ++ * size. OpenGL allows for -ve x,y in the viewport. 
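++ * (Editor's note: since vp->x/vp->width hold the left/right edges computed ++ * as translate +/- |scale|, the matrix above reduces to the usual viewport ++ * transform, screen_x = ndc_x * |scale_x| + translate_x; on the flipped-y ++ * path m11 comes out negative, so NDC +Y lands at the smaller framebuffer ++ * y, i.e. the top of the render target.)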
++ */ ++ vp->x = std::max(vp->x, 0.0f); ++ vp->y = std::max(vp->y, 0.0f); ++ vp->width = std::min(vp->width, (float)ctx->framebuffer.width); ++ vp->height = std::min(vp->height, (float)ctx->framebuffer.height); ++ ++ SwrSetViewports(ctx->swrContext, 1, vp, vpm); ++ } ++ ++ /* Set vertex & index buffers */ ++ /* (using draw info if called by swr_draw_vbo) */ ++ if (ctx->dirty & SWR_NEW_VERTEX) { ++ uint32_t size, pitch, max_vertex, partial_inbounds; ++ const uint8_t *p_data; ++ ++ /* If being called by swr_draw_vbo, copy draw details */ ++ struct pipe_draw_info info = {0}; ++ if (p_draw_info) ++ info = *p_draw_info; ++ ++ /* vertex buffers */ ++ SWR_VERTEX_BUFFER_STATE swrVertexBuffers[PIPE_MAX_ATTRIBS]; ++ for (UINT i = 0; i < ctx->num_vertex_buffers; i++) { ++ pipe_vertex_buffer *vb = &ctx->vertex_buffer[i]; ++ ++ pitch = vb->stride; ++ if (!vb->user_buffer) { ++ /* VBO ++ * size is based on buffer->width0 rather than info.max_index ++ * to prevent having to validate VBO on each draw */ ++ size = vb->buffer->width0; ++ max_vertex = size / pitch; ++ partial_inbounds = size % pitch; ++ ++ p_data = (const uint8_t *)swr_resource_data(vb->buffer) ++ + vb->buffer_offset; ++ } else { ++ /* Client buffer ++ * client memory is one-time use, re-trigger SWR_NEW_VERTEX to ++ * revalidate on each draw */ ++ post_update_dirty_flags |= SWR_NEW_VERTEX; ++ ++ if (pitch) { ++ size = (info.max_index - info.min_index + 1) * pitch; ++ } else { ++ /* pitch = 0, means constant value ++ * set size to 1 vertex */ ++ size = ctx->velems->stream_pitch[i]; ++ } ++ ++ max_vertex = info.max_index + 1; ++ partial_inbounds = 0; ++ ++ /* Copy only needed vertices to scratch space */ ++ size = AlignUp(size, 4); ++ const void *ptr = (const uint8_t *) vb->user_buffer ++ + info.min_index * pitch; ++ ptr = swr_copy_to_scratch_space( ++ ctx, &ctx->scratch->vertex_buffer, ptr, size); ++ p_data = (const uint8_t *)ptr - info.min_index * pitch; ++ } ++ ++ swrVertexBuffers[i] = {0}; ++ swrVertexBuffers[i].index = i; ++ swrVertexBuffers[i].pitch = pitch; ++ swrVertexBuffers[i].pData = p_data; ++ swrVertexBuffers[i].size = size; ++ swrVertexBuffers[i].maxVertex = max_vertex; ++ swrVertexBuffers[i].partialInboundsSize = partial_inbounds; ++ } ++ ++ SwrSetVertexBuffers( ++ ctx->swrContext, ctx->num_vertex_buffers, swrVertexBuffers); ++ ++ /* index buffer, if required (info passed in by swr_draw_vbo) */ ++ SWR_FORMAT index_type = R32_UINT; /* Default for non-indexed draws */ ++ if (info.indexed) { ++ pipe_index_buffer *ib = &ctx->index_buffer; ++ ++ pitch = ib->index_size ? 
ib->index_size : sizeof(uint32_t); ++ index_type = swr_convert_index_type(pitch); ++ ++ if (!ib->user_buffer) { ++ /* VBO ++ * size is based on buffer->width0 rather than info.count ++ * to prevent having to validate VBO on each draw */ ++ size = ib->buffer->width0; ++ p_data = ++ (const uint8_t *)swr_resource_data(ib->buffer) + ib->offset; ++ } else { ++ /* Client buffer ++ * client memory is one-time use, re-trigger SWR_NEW_VERTEX to ++ * revalidate on each draw */ ++ post_update_dirty_flags |= SWR_NEW_VERTEX; ++ ++ size = info.count * pitch; ++ size = AlignUp(size, 4); ++ ++ /* Copy indices to scratch space */ ++ const void *ptr = ib->user_buffer; ++ ptr = swr_copy_to_scratch_space( ++ ctx, &ctx->scratch->index_buffer, ptr, size); ++ p_data = (const uint8_t *)ptr; ++ } ++ ++ SWR_INDEX_BUFFER_STATE swrIndexBuffer; ++ swrIndexBuffer.format = swr_convert_index_type(ib->index_size); ++ swrIndexBuffer.pIndices = p_data; ++ swrIndexBuffer.size = size; ++ ++ SwrSetIndexBuffer(ctx->swrContext, &swrIndexBuffer); ++ } ++ ++ struct swr_vertex_element_state *velems = ctx->velems; ++ if (velems && velems->fsState.indexType != index_type) { ++ velems->fsFunc = NULL; ++ velems->fsState.indexType = index_type; ++ } ++ } ++ ++ /* VertexShader */ ++ if (ctx->dirty & SWR_NEW_VS) { ++ SwrSetVertexFunc(ctx->swrContext, ctx->vs->func); ++ } ++ ++ swr_jit_key key; ++ if (ctx->dirty & (SWR_NEW_FS | SWR_NEW_SAMPLER | SWR_NEW_SAMPLER_VIEW ++ | SWR_NEW_DEPTH_STENCIL_ALPHA | SWR_NEW_RASTERIZER ++ | SWR_NEW_FRAMEBUFFER)) { ++ memset(&key, 0, sizeof(key)); ++ swr_generate_fs_key(key, ctx, ctx->fs); ++ auto search = ctx->fs->map.find(key); ++ PFN_PIXEL_KERNEL func; ++ if (search != ctx->fs->map.end()) { ++ func = search->second; ++ } else { ++ func = swr_compile_fs(ctx, key); ++ ctx->fs->map.insert(std::make_pair(key, func)); ++ } ++ SWR_PS_STATE psState = {0}; ++ psState.pfnPixelShader = func; ++ psState.killsPixel = ++ ctx->fs->info.base.uses_kill || key.alphaTest.enabled; ++ psState.writesODepth = ctx->fs->info.base.writes_z; ++ psState.usesSourceDepth = ctx->fs->info.base.reads_z; ++ psState.maxRTSlotUsed = ++ (ctx->framebuffer.nr_cbufs != 0) ? 
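++ // [Editor's note] The map lookup above is a fragment-shader variant cache: ++ // swr_jit_key is hashed with a raw crc32 and compared with memcmp (see ++ // swr_shader.h), which is only sound because the key is memset to zero ++ // first; otherwise uninitialized struct padding would defeat both.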
++ (ctx->framebuffer.nr_cbufs - 1) : ++ 0; ++ SwrSetPixelShaderState(ctx->swrContext, &psState); ++ } ++ ++ /* JIT sampler state */ ++ if (ctx->dirty & SWR_NEW_SAMPLER) { ++ swr_draw_context *pDC = ++ (swr_draw_context *)SwrGetPrivateContextState(ctx->swrContext); ++ ++ for (unsigned i = 0; i < key.nr_samplers; i++) { ++ const struct pipe_sampler_state *sampler = ++ ctx->samplers[PIPE_SHADER_FRAGMENT][i]; ++ ++ if (sampler) { ++ pDC->samplersFS[i].min_lod = sampler->min_lod; ++ pDC->samplersFS[i].max_lod = sampler->max_lod; ++ pDC->samplersFS[i].lod_bias = sampler->lod_bias; ++ COPY_4V(pDC->samplersFS[i].border_color, sampler->border_color.f); ++ } ++ } ++ } ++ ++ /* JIT sampler view state */ ++ if (ctx->dirty & SWR_NEW_SAMPLER_VIEW) { ++ swr_draw_context *pDC = ++ (swr_draw_context *)SwrGetPrivateContextState(ctx->swrContext); ++ ++ for (unsigned i = 0; i < key.nr_sampler_views; i++) { ++ struct pipe_sampler_view *view = ++ ctx->sampler_views[PIPE_SHADER_FRAGMENT][i]; ++ ++ if (view) { ++ struct pipe_resource *res = view->texture; ++ struct swr_resource *swr_res = swr_resource(res); ++ struct swr_jit_texture *jit_tex = &pDC->texturesFS[i]; ++ memset(jit_tex, 0, sizeof(*jit_tex)); ++ jit_tex->width = res->width0; ++ jit_tex->height = res->height0; ++ jit_tex->depth = res->depth0; ++ jit_tex->first_level = view->u.tex.first_level; ++ jit_tex->last_level = view->u.tex.last_level; ++ jit_tex->base_ptr = swr_res->swr.pBaseAddress; ++ ++ for (unsigned level = jit_tex->first_level; ++ level <= jit_tex->last_level; ++ level++) { ++ jit_tex->row_stride[level] = swr_res->row_stride[level]; ++ jit_tex->img_stride[level] = swr_res->img_stride[level]; ++ jit_tex->mip_offsets[level] = swr_res->mip_offsets[level]; ++ } ++ } ++ } ++ } ++ ++ /* VertexShader Constants */ ++ if (ctx->dirty & SWR_NEW_VSCONSTANTS) { ++ swr_draw_context *pDC = ++ (swr_draw_context *)SwrGetPrivateContextState(ctx->swrContext); ++ ++ for (UINT i = 0; i < PIPE_MAX_CONSTANT_BUFFERS; i++) { ++ const pipe_constant_buffer *cb = ++ &ctx->constants[PIPE_SHADER_VERTEX][i]; ++ pDC->num_constantsVS[i] = cb->buffer_size; ++ if (cb->buffer) ++ pDC->constantVS[i] = ++ (const float *)((const BYTE *)cb->buffer + cb->buffer_offset); ++ else { ++ /* Need to copy these constants to scratch space */ ++ if (cb->user_buffer && cb->buffer_size) { ++ const void *ptr = ++ ((const BYTE *)cb->user_buffer + cb->buffer_offset); ++ uint32_t size = AlignUp(cb->buffer_size, 4); ++ ptr = swr_copy_to_scratch_space( ++ ctx, &ctx->scratch->vs_constants, ptr, size); ++ pDC->constantVS[i] = (const float *)ptr; ++ } ++ } ++ } ++ } ++ ++ /* FragmentShader Constants */ ++ if (ctx->dirty & SWR_NEW_FSCONSTANTS) { ++ swr_draw_context *pDC = ++ (swr_draw_context *)SwrGetPrivateContextState(ctx->swrContext); ++ ++ for (UINT i = 0; i < PIPE_MAX_CONSTANT_BUFFERS; i++) { ++ const pipe_constant_buffer *cb = ++ &ctx->constants[PIPE_SHADER_FRAGMENT][i]; ++ pDC->num_constantsFS[i] = cb->buffer_size; ++ if (cb->buffer) ++ pDC->constantFS[i] = ++ (const float *)((const BYTE *)cb->buffer + cb->buffer_offset); ++ else { ++ /* Need to copy these constants to scratch space */ ++ if (cb->user_buffer && cb->buffer_size) { ++ const void *ptr = ++ ((const BYTE *)cb->user_buffer + cb->buffer_offset); ++ uint32_t size = AlignUp(cb->buffer_size, 4); ++ ptr = swr_copy_to_scratch_space( ++ ctx, &ctx->scratch->fs_constants, ptr, size); ++ pDC->constantFS[i] = (const float *)ptr; ++ } ++ } ++ } ++ } ++ ++ /* Depth/stencil state */ ++ if (ctx->dirty & SWR_NEW_DEPTH_STENCIL_ALPHA) { ++ struct 
pipe_depth_state *depth = &(ctx->depth_stencil->depth); ++ struct pipe_stencil_state *stencil = ctx->depth_stencil->stencil; ++ SWR_DEPTH_STENCIL_STATE depthStencilState = {{0}}; ++ ++ /* XXX, incomplete. Need to flesh out stencil & alpha test state ++ struct pipe_stencil_state *front_stencil = ++ ctx->depth_stencil.stencil[0]; ++ struct pipe_stencil_state *back_stencil = ctx->depth_stencil.stencil[1]; ++ struct pipe_alpha_state alpha; ++ */ ++ if (stencil[0].enabled) { ++ depthStencilState.stencilWriteEnable = 1; ++ depthStencilState.stencilTestEnable = 1; ++ depthStencilState.stencilTestFunc = ++ swr_convert_depth_func(stencil[0].func); ++ ++ depthStencilState.stencilPassDepthPassOp = ++ swr_convert_stencil_op(stencil[0].zpass_op); ++ depthStencilState.stencilPassDepthFailOp = ++ swr_convert_stencil_op(stencil[0].zfail_op); ++ depthStencilState.stencilFailOp = ++ swr_convert_stencil_op(stencil[0].fail_op); ++ depthStencilState.stencilWriteMask = stencil[0].writemask; ++ depthStencilState.stencilTestMask = stencil[0].valuemask; ++ depthStencilState.stencilRefValue = ctx->stencil_ref.ref_value[0]; ++ } ++ if (stencil[1].enabled) { ++ depthStencilState.doubleSidedStencilTestEnable = 1; ++ ++ depthStencilState.backfaceStencilTestFunc = ++ swr_convert_depth_func(stencil[1].func); ++ ++ depthStencilState.backfaceStencilPassDepthPassOp = ++ swr_convert_stencil_op(stencil[1].zpass_op); ++ depthStencilState.backfaceStencilPassDepthFailOp = ++ swr_convert_stencil_op(stencil[1].zfail_op); ++ depthStencilState.backfaceStencilFailOp = ++ swr_convert_stencil_op(stencil[1].fail_op); ++ depthStencilState.backfaceStencilWriteMask = stencil[1].writemask; ++ depthStencilState.backfaceStencilTestMask = stencil[1].valuemask; ++ ++ depthStencilState.backfaceStencilRefValue = ++ ctx->stencil_ref.ref_value[1]; ++ } ++ ++ depthStencilState.depthTestEnable = depth->enabled; ++ depthStencilState.depthTestFunc = swr_convert_depth_func(depth->func); ++ depthStencilState.depthWriteEnable = depth->writemask; ++ SwrSetDepthStencilState(ctx->swrContext, &depthStencilState); ++ } ++ ++ /* Blend State */ ++ if (ctx->dirty & (SWR_NEW_BLEND | SWR_NEW_FRAMEBUFFER)) { ++ struct pipe_framebuffer_state *fb = &ctx->framebuffer; ++ ++ SWR_BLEND_STATE blendState; ++ memset(&blendState, 0, sizeof(blendState)); ++ blendState.independentAlphaBlendEnable = ++ ctx->blend->pipe.independent_blend_enable; ++ blendState.constantColor[0] = ctx->blend_color.color[0]; ++ blendState.constantColor[1] = ctx->blend_color.color[1]; ++ blendState.constantColor[2] = ctx->blend_color.color[2]; ++ blendState.constantColor[3] = ctx->blend_color.color[3]; ++ ++ /* If there are no color buffers bound, disable writes on RT0 ++ * and skip loop */ ++ if (fb->nr_cbufs == 0) { ++ blendState.renderTarget[0].writeDisableRed = 1; ++ blendState.renderTarget[0].writeDisableGreen = 1; ++ blendState.renderTarget[0].writeDisableBlue = 1; ++ blendState.renderTarget[0].writeDisableAlpha = 1; ++ } ++ else ++ for (int target = 0; ++ target < std::min(SWR_NUM_RENDERTARGETS, ++ PIPE_MAX_COLOR_BUFS); ++ target++) { ++ if (!fb->cbufs[target]) ++ continue; ++ ++ BLEND_COMPILE_STATE *compileState = ++ &ctx->blend->compileState[target]; ++ ++ struct swr_resource *colorBuffer = ++ swr_resource(fb->cbufs[target]->texture); ++ compileState->format = colorBuffer->swr.format; ++ ++ memcpy(&blendState.renderTarget[target], ++ &compileState->blendState, ++ sizeof(compileState->blendState)); ++ ++ PFN_BLEND_JIT_FUNC func = NULL; ++ auto search = ctx->blendJIT->find(*compileState); ++ 
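++            /* Illustrative note (not part of the original patch): the lookup
++             * below memoizes JIT-compiled blend kernels keyed on the full
++             * BLEND_COMPILE_STATE, so only the first draw with a given
++             * blend state pays JitCompileBlend's compilation cost. */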
if (search != ctx->blendJIT->end()) {
++               func = search->second;
++            } else {
++               HANDLE hJitMgr = swr_screen(ctx->pipe.screen)->hJitMgr;
++               func = JitCompileBlend(hJitMgr, *compileState);
++               debug_printf("BLEND shader %p\n", func);
++               assert(func && "Error: BlendShader = NULL");
++
++               ctx->blendJIT->insert(std::make_pair(*compileState, func));
++            }
++            SwrSetBlendFunc(ctx->swrContext, target, func);
++         }
++
++      SwrSetBlendState(ctx->swrContext, &blendState);
++   }
++
++   if (ctx->dirty & SWR_NEW_STIPPLE) {
++      /* XXX What to do with this one??? SWR doesn't stipple */
++   }
++
++   if (ctx->dirty & (SWR_NEW_VS | SWR_NEW_SO | SWR_NEW_RASTERIZER)) {
++      ctx->vs->soState.rasterizerDisable =
++         ctx->rasterizer->rasterizer_discard;
++      SwrSetSoState(ctx->swrContext, &ctx->vs->soState);
++
++      pipe_stream_output_info *stream_output = &ctx->vs->pipe.stream_output;
++
++      for (uint32_t i = 0; i < ctx->num_so_targets; i++) {
++         SWR_STREAMOUT_BUFFER buffer = {0};
++         if (!ctx->so_targets[i])
++            continue;
++         buffer.enable = true;
++         buffer.pBuffer =
++            (uint32_t *)swr_resource_data(ctx->so_targets[i]->buffer);
++         buffer.bufferSize = ctx->so_targets[i]->buffer_size >> 2;
++         buffer.pitch = stream_output->stride[i];
++         buffer.streamOffset = ctx->so_targets[i]->buffer_offset >> 2;
++
++         SwrSetSoBuffers(ctx->swrContext, &buffer, i);
++      }
++   }
++
++   uint32_t linkage = ctx->vs->linkageMask;
++   if (ctx->rasterizer->sprite_coord_enable)
++      linkage |= (1 << ctx->vs->info.base.num_outputs);
++
++   SwrSetLinkage(ctx->swrContext, linkage, NULL);
++
++   // set up frontend state
++   SWR_FRONTEND_STATE feState = {0};
++   SwrSetFrontendState(ctx->swrContext, &feState);
++
++   // set up backend state
++   SWR_BACKEND_STATE backendState = {0};
++   backendState.numAttributes = 1;
++   backendState.numComponents[0] = 4;
++   backendState.constantInterpolationMask = ctx->fs->constantMask;
++   SwrSetBackendState(ctx->swrContext, &backendState);
++
++   ctx->dirty = post_update_dirty_flags;
++}
++
++static struct pipe_stream_output_target *
++swr_create_so_target(struct pipe_context *pipe,
++                     struct pipe_resource *buffer,
++                     unsigned buffer_offset,
++                     unsigned buffer_size)
++{
++   struct pipe_stream_output_target *target;
++
++   target = CALLOC_STRUCT(pipe_stream_output_target);
++   if (!target)
++      return NULL;
++
++   target->context = pipe;
++   target->reference.count = 1;
++   pipe_resource_reference(&target->buffer, buffer);
++   target->buffer_offset = buffer_offset;
++   target->buffer_size = buffer_size;
++   return target;
++}
++
++static void
++swr_destroy_so_target(struct pipe_context *pipe,
++                      struct pipe_stream_output_target *target)
++{
++   pipe_resource_reference(&target->buffer, NULL);
++   FREE(target);
++}
++
++static void
++swr_set_so_targets(struct pipe_context *pipe,
++                   unsigned num_targets,
++                   struct pipe_stream_output_target **targets,
++                   const unsigned *offsets)
++{
++   struct swr_context *swr = swr_context(pipe);
++   uint32_t i;
++
++   assert(num_targets < MAX_SO_STREAMS);
++
++   for (i = 0; i < num_targets; i++) {
++      pipe_so_target_reference(
++         (struct pipe_stream_output_target **)&swr->so_targets[i],
++         targets[i]);
++   }
++
++   for (/* fall-through */; i < swr->num_so_targets; i++) {
++      pipe_so_target_reference(
++         (struct pipe_stream_output_target **)&swr->so_targets[i], NULL);
++   }
++
++   swr->num_so_targets = num_targets;
++
++   swr->dirty |= SWR_NEW_SO;
++}
++
++
++void
++swr_state_init(struct pipe_context *pipe)
++{
++   pipe->create_blend_state = swr_create_blend_state;
++   pipe->bind_blend_state = swr_bind_blend_state;
++   pipe->delete_blend_state =
swr_delete_blend_state; ++ ++ pipe->create_depth_stencil_alpha_state = swr_create_depth_stencil_state; ++ pipe->bind_depth_stencil_alpha_state = swr_bind_depth_stencil_state; ++ pipe->delete_depth_stencil_alpha_state = swr_delete_depth_stencil_state; ++ ++ pipe->create_rasterizer_state = swr_create_rasterizer_state; ++ pipe->bind_rasterizer_state = swr_bind_rasterizer_state; ++ pipe->delete_rasterizer_state = swr_delete_rasterizer_state; ++ ++ pipe->create_sampler_state = swr_create_sampler_state; ++ pipe->bind_sampler_states = swr_bind_sampler_states; ++ pipe->delete_sampler_state = swr_delete_sampler_state; ++ ++ pipe->create_sampler_view = swr_create_sampler_view; ++ pipe->set_sampler_views = swr_set_sampler_views; ++ pipe->sampler_view_destroy = swr_sampler_view_destroy; ++ ++ pipe->create_vs_state = swr_create_vs_state; ++ pipe->bind_vs_state = swr_bind_vs_state; ++ pipe->delete_vs_state = swr_delete_vs_state; ++ ++ pipe->create_fs_state = swr_create_fs_state; ++ pipe->bind_fs_state = swr_bind_fs_state; ++ pipe->delete_fs_state = swr_delete_fs_state; ++ ++ pipe->set_constant_buffer = swr_set_constant_buffer; ++ ++ pipe->create_vertex_elements_state = swr_create_vertex_elements_state; ++ pipe->bind_vertex_elements_state = swr_bind_vertex_elements_state; ++ pipe->delete_vertex_elements_state = swr_delete_vertex_elements_state; ++ ++ pipe->set_vertex_buffers = swr_set_vertex_buffers; ++ pipe->set_index_buffer = swr_set_index_buffer; ++ ++ pipe->set_polygon_stipple = swr_set_polygon_stipple; ++ pipe->set_clip_state = swr_set_clip_state; ++ pipe->set_scissor_states = swr_set_scissor_states; ++ pipe->set_viewport_states = swr_set_viewport_states; ++ ++ pipe->set_framebuffer_state = swr_set_framebuffer_state; ++ ++ pipe->set_blend_color = swr_set_blend_color; ++ pipe->set_stencil_ref = swr_set_stencil_ref; ++ ++ pipe->set_sample_mask = swr_set_sample_mask; ++ ++ pipe->create_stream_output_target = swr_create_so_target; ++ pipe->stream_output_target_destroy = swr_destroy_so_target; ++ pipe->set_stream_output_targets = swr_set_so_targets; ++} +diff --git a/src/gallium/drivers/swr/swr_state.h b/src/gallium/drivers/swr/swr_state.h +new file mode 100644 +index 0000000..fdacd42 +--- /dev/null ++++ b/src/gallium/drivers/swr/swr_state.h +@@ -0,0 +1,240 @@ ++/**************************************************************************** ++ * Copyright (C) 2015 Intel Corporation. All Rights Reserved. ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the "Software"), ++ * to deal in the Software without restriction, including without limitation ++ * the rights to use, copy, modify, merge, publish, distribute, sublicense, ++ * and/or sell copies of the Software, and to permit persons to whom the ++ * Software is furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice (including the next ++ * paragraph) shall be included in all copies or substantial portions of the ++ * Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL
++ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ ***************************************************************************/
++
++#ifndef SWR_STATE_H
++#define SWR_STATE_H
++
++#include "pipe/p_defines.h"
++#include "tgsi/tgsi_scan.h"
++#include "tgsi/tgsi_parse.h"
++#include "tgsi/tgsi_dump.h"
++#include "gallivm/lp_bld_tgsi.h"
++#include "util/u_hash.h"
++#include "api.h"
++#include "swr_tex_sample.h"
++#include "swr_shader.h"
++#include <unordered_map>
++
++/* skeleton */
++struct swr_vertex_shader {
++   struct pipe_shader_state pipe;
++   struct lp_tgsi_info info;
++   unsigned linkageMask;
++   unsigned pointSizeAttrib;
++   PFN_VERTEX_FUNC func;
++   SWR_STREAMOUT_STATE soState;
++   PFN_SO_FUNC soFunc[PIPE_PRIM_MAX];
++};
++
++struct swr_fragment_shader {
++   struct pipe_shader_state pipe;
++   struct lp_tgsi_info info;
++   unsigned constantMask;
++   std::unordered_map<swr_jit_key, PFN_PIXEL_KERNEL> map;
++};
++
++/* Vertex element state */
++struct swr_vertex_element_state {
++   FETCH_COMPILE_STATE fsState;
++   PFN_FETCH_FUNC fsFunc;
++#if 1 //BMCDEBUG
++   uint32_t stream_pitch[PIPE_MAX_ATTRIBS];
++#endif
++};
++
++struct swr_blend_state {
++   struct pipe_blend_state pipe;
++   BLEND_COMPILE_STATE compileState[PIPE_MAX_COLOR_BUFS];
++};
++
++/* Shadows of SWR API DrawState */
++struct swr_shadow_state {
++   SWR_SURFACE_STATE *attachment[SWR_NUM_ATTACHMENTS];
++   SWR_RASTSTATE rastState;
++   SWR_VIEWPORT vp;
++   SWR_VIEWPORT_MATRIX vpm;
++};
++
++void swr_update_derived(struct swr_context *,
++                        const struct pipe_draw_info * = nullptr);
++
++/*
++ * Conversion functions: Convert Mesa state defines to SWR.
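++ *
++ * Each helper below is a plain switch over the Gallium enum that asserts
++ * on values SWR cannot express. Illustrative use (not part of the patch):
++ *
++ *   SWR_STENCILOP op = swr_convert_stencil_op(PIPE_STENCIL_OP_INCR);
++ *   // op == STENCILOP_INCRSAT: Gallium's INCR saturates while INCR_WRAP
++ *   // wraps, which is why the _WRAP variants map to SWR's plain INCR/DECR.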
++ */ ++ ++static INLINE SWR_STENCILOP ++swr_convert_stencil_op(const UINT op) ++{ ++ switch (op) { ++ case PIPE_STENCIL_OP_KEEP: ++ return STENCILOP_KEEP; ++ case PIPE_STENCIL_OP_ZERO: ++ return STENCILOP_ZERO; ++ case PIPE_STENCIL_OP_REPLACE: ++ return STENCILOP_REPLACE; ++ case PIPE_STENCIL_OP_INCR: ++ return STENCILOP_INCRSAT; ++ case PIPE_STENCIL_OP_DECR: ++ return STENCILOP_DECRSAT; ++ case PIPE_STENCIL_OP_INCR_WRAP: ++ return STENCILOP_INCR; ++ case PIPE_STENCIL_OP_DECR_WRAP: ++ return STENCILOP_DECR; ++ case PIPE_STENCIL_OP_INVERT: ++ return STENCILOP_INVERT; ++ default: ++ assert(0 && "Unsupported stencil op"); ++ return STENCILOP_KEEP; ++ } ++} ++ ++static INLINE SWR_FORMAT ++swr_convert_index_type(const UINT index_size) ++{ ++ switch (index_size) { ++ case sizeof(unsigned char): ++ return R8_UINT; ++ case sizeof(unsigned short): ++ return R16_UINT; ++ case sizeof(unsigned int): ++ return R32_UINT; ++ default: ++ assert(0 && "Unsupported index type"); ++ return R32_UINT; ++ } ++} ++ ++ ++static INLINE UINT ++swr_convert_depth_func(const UINT pipe_func) ++{ ++ switch (pipe_func) { ++ case PIPE_FUNC_NEVER: ++ return ZFUNC_NEVER; ++ case PIPE_FUNC_LESS: ++ return ZFUNC_LT; ++ case PIPE_FUNC_EQUAL: ++ return ZFUNC_EQ; ++ case PIPE_FUNC_LEQUAL: ++ return ZFUNC_LE; ++ case PIPE_FUNC_GREATER: ++ return ZFUNC_GT; ++ case PIPE_FUNC_NOTEQUAL: ++ return ZFUNC_NE; ++ case PIPE_FUNC_GEQUAL: ++ return ZFUNC_GE; ++ case PIPE_FUNC_ALWAYS: ++ return ZFUNC_ALWAYS; ++ default: ++ assert(0 && "Unsupported depth func"); ++ return ZFUNC_ALWAYS; ++ } ++} ++ ++ ++static INLINE SWR_CULLMODE ++swr_convert_cull_mode(const UINT cull_face) ++{ ++ switch (cull_face) { ++ case PIPE_FACE_NONE: ++ return SWR_CULLMODE_NONE; ++ case PIPE_FACE_FRONT: ++ return SWR_CULLMODE_FRONT; ++ case PIPE_FACE_BACK: ++ return SWR_CULLMODE_BACK; ++ case PIPE_FACE_FRONT_AND_BACK: ++ return SWR_CULLMODE_BOTH; ++ default: ++ assert(0 && "Invalid cull mode"); ++ return SWR_CULLMODE_NONE; ++ } ++} ++ ++static INLINE SWR_BLEND_OP ++swr_convert_blend_func(const UINT blend_func) ++{ ++ switch (blend_func) { ++ case PIPE_BLEND_ADD: ++ return BLENDOP_ADD; ++ case PIPE_BLEND_SUBTRACT: ++ return BLENDOP_SUBTRACT; ++ case PIPE_BLEND_REVERSE_SUBTRACT: ++ return BLENDOP_REVSUBTRACT; ++ case PIPE_BLEND_MIN: ++ return BLENDOP_MIN; ++ case PIPE_BLEND_MAX: ++ return BLENDOP_MAX; ++ default: ++ assert(0 && "Invalid blend func"); ++ return BLENDOP_ADD; ++ } ++} ++ ++static INLINE SWR_BLEND_FACTOR ++swr_convert_blend_factor(const UINT blend_factor) ++{ ++ switch (blend_factor) { ++ case PIPE_BLENDFACTOR_ONE: ++ return BLENDFACTOR_ONE; ++ case PIPE_BLENDFACTOR_SRC_COLOR: ++ return BLENDFACTOR_SRC_COLOR; ++ case PIPE_BLENDFACTOR_SRC_ALPHA: ++ return BLENDFACTOR_SRC_ALPHA; ++ case PIPE_BLENDFACTOR_DST_ALPHA: ++ return BLENDFACTOR_DST_ALPHA; ++ case PIPE_BLENDFACTOR_DST_COLOR: ++ return BLENDFACTOR_DST_COLOR; ++ case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: ++ return BLENDFACTOR_SRC_ALPHA_SATURATE; ++ case PIPE_BLENDFACTOR_CONST_COLOR: ++ return BLENDFACTOR_CONST_COLOR; ++ case PIPE_BLENDFACTOR_CONST_ALPHA: ++ return BLENDFACTOR_CONST_ALPHA; ++ case PIPE_BLENDFACTOR_SRC1_COLOR: ++ return BLENDFACTOR_SRC1_COLOR; ++ case PIPE_BLENDFACTOR_SRC1_ALPHA: ++ return BLENDFACTOR_SRC1_ALPHA; ++ case PIPE_BLENDFACTOR_ZERO: ++ return BLENDFACTOR_ZERO; ++ case PIPE_BLENDFACTOR_INV_SRC_COLOR: ++ return BLENDFACTOR_INV_SRC_COLOR; ++ case PIPE_BLENDFACTOR_INV_SRC_ALPHA: ++ return BLENDFACTOR_INV_SRC_ALPHA; ++ case PIPE_BLENDFACTOR_INV_DST_ALPHA: ++ return 
BLENDFACTOR_INV_DST_ALPHA; ++ case PIPE_BLENDFACTOR_INV_DST_COLOR: ++ return BLENDFACTOR_INV_DST_COLOR; ++ case PIPE_BLENDFACTOR_INV_CONST_COLOR: ++ return BLENDFACTOR_INV_CONST_COLOR; ++ case PIPE_BLENDFACTOR_INV_CONST_ALPHA: ++ return BLENDFACTOR_INV_CONST_ALPHA; ++ case PIPE_BLENDFACTOR_INV_SRC1_COLOR: ++ return BLENDFACTOR_INV_SRC1_COLOR; ++ case PIPE_BLENDFACTOR_INV_SRC1_ALPHA: ++ return BLENDFACTOR_INV_SRC1_ALPHA; ++ default: ++ assert(0 && "Invalid blend factor"); ++ return BLENDFACTOR_ONE; ++ } ++} ++#endif +diff --git a/src/gallium/drivers/swr/swr_tex_sample.cpp b/src/gallium/drivers/swr/swr_tex_sample.cpp +new file mode 100644 +index 0000000..8e01e32 +--- /dev/null ++++ b/src/gallium/drivers/swr/swr_tex_sample.cpp +@@ -0,0 +1,338 @@ ++/************************************************************************** ++ * ++ * Copyright 2009 VMware, Inc. ++ * All rights reserved. ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the ++ * "Software"), to deal in the Software without restriction, including ++ * without limitation the rights to use, copy, modify, merge, publish, ++ * distribute, sub license, and/or sell copies of the Software, and to ++ * permit persons to whom the Software is furnished to do so, subject to ++ * the following conditions: ++ * ++ * The above copyright notice and this permission notice (including the ++ * next paragraph) shall be included in all copies or substantial portions ++ * of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS ++ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF ++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. ++ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR ++ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, ++ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE ++ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ++ * ++ **************************************************************************/ ++ ++/** ++ * Largely a copy of llvmpipe's lp_tex_sample.c ++ */ ++ ++/** ++ * Texture sampling code generation ++ * ++ * This file is nothing more than ugly glue between three largely independent ++ * entities: ++ * - TGSI -> LLVM translation (i.e., lp_build_tgsi_soa) ++ * - texture sampling code generation (i.e., lp_build_sample_soa) ++ * - SWR driver ++ * ++ * All interesting code is in the functions mentioned above. There is really ++ * nothing to see here. ++ * ++ * @author Jose Fonseca ++ */ ++ ++#include "state.h" ++#include "JitManager.h" ++#include "state_llvm.h" ++ ++#include "pipe/p_defines.h" ++#include "pipe/p_shader_tokens.h" ++#include "gallivm/lp_bld_debug.h" ++#include "gallivm/lp_bld_const.h" ++#include "gallivm/lp_bld_type.h" ++#include "gallivm/lp_bld_sample.h" ++#include "gallivm/lp_bld_tgsi.h" ++#include "util/u_memory.h" ++ ++#include "swr_tex_sample.h" ++#include "swr_context_llvm.h" ++ ++ ++/** ++ * This provides the bridge between the sampler state store in ++ * lp_jit_context and lp_jit_texture and the sampler code ++ * generator. It provides the texture layout information required by ++ * the texture sampler code generator in terms of the state stored in ++ * lp_jit_context and lp_jit_texture in runtime. 
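++ *
++ * A sketch of the flow (using the callback names wired up at the bottom of
++ * this file): rather than baking texture dimensions into the compiled
++ * shader, the generated code fetches them at run time through calls such as
++ *
++ *   dynamic_state.base.width(base, gallivm, context_ptr, unit);
++ *
++ * which emits a GEP + load of swr_draw_context.texturesFS[unit].width, so
++ * one compiled sampler services whatever texture is bound at draw time.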
++ */ ++struct swr_sampler_dynamic_state { ++ struct lp_sampler_dynamic_state base; ++ ++ const struct swr_sampler_static_state *static_state; ++}; ++ ++ ++/** ++ * This is the bridge between our sampler and the TGSI translator. ++ */ ++struct swr_sampler_soa { ++ struct lp_build_sampler_soa base; ++ ++ struct swr_sampler_dynamic_state dynamic_state; ++}; ++ ++ ++/** ++ * Fetch the specified member of the lp_jit_texture structure. ++ * \param emit_load if TRUE, emit the LLVM load instruction to actually ++ * fetch the field's value. Otherwise, just emit the ++ * GEP code to address the field. ++ * ++ * @sa http://llvm.org/docs/GetElementPtr.html ++ */ ++static LLVMValueRef ++swr_texture_member(const struct lp_sampler_dynamic_state *base, ++ struct gallivm_state *gallivm, ++ LLVMValueRef context_ptr, ++ unsigned texture_unit, ++ unsigned member_index, ++ const char *member_name, ++ boolean emit_load) ++{ ++ LLVMBuilderRef builder = gallivm->builder; ++ LLVMValueRef indices[4]; ++ LLVMValueRef ptr; ++ LLVMValueRef res; ++ ++ assert(texture_unit < PIPE_MAX_SHADER_SAMPLER_VIEWS); ++ ++ /* context[0] */ ++ indices[0] = lp_build_const_int32(gallivm, 0); ++ /* context[0].textures */ ++ indices[1] = lp_build_const_int32(gallivm, swr_draw_context_texturesFS); ++ /* context[0].textures[unit] */ ++ indices[2] = lp_build_const_int32(gallivm, texture_unit); ++ /* context[0].textures[unit].member */ ++ indices[3] = lp_build_const_int32(gallivm, member_index); ++ ++ ptr = LLVMBuildGEP(builder, context_ptr, indices, Elements(indices), ""); ++ ++ if (emit_load) ++ res = LLVMBuildLoad(builder, ptr, ""); ++ else ++ res = ptr; ++ ++ lp_build_name(res, "context.texture%u.%s", texture_unit, member_name); ++ ++ return res; ++} ++ ++ ++/** ++ * Helper macro to instantiate the functions that generate the code to ++ * fetch the members of lp_jit_texture to fulfill the sampler code ++ * generator requests. ++ * ++ * This complexity is the price we have to pay to keep the texture ++ * sampler code generator a reusable module without dependencies to ++ * swr internals. ++ */ ++#define SWR_TEXTURE_MEMBER(_name, _emit_load) \ ++ static LLVMValueRef swr_texture_##_name( \ ++ const struct lp_sampler_dynamic_state *base, \ ++ struct gallivm_state *gallivm, \ ++ LLVMValueRef context_ptr, \ ++ unsigned texture_unit) \ ++ { \ ++ return swr_texture_member(base, \ ++ gallivm, \ ++ context_ptr, \ ++ texture_unit, \ ++ swr_jit_texture_##_name, \ ++ #_name, \ ++ _emit_load); \ ++ } ++ ++ ++SWR_TEXTURE_MEMBER(width, TRUE) ++SWR_TEXTURE_MEMBER(height, TRUE) ++SWR_TEXTURE_MEMBER(depth, TRUE) ++SWR_TEXTURE_MEMBER(first_level, TRUE) ++SWR_TEXTURE_MEMBER(last_level, TRUE) ++SWR_TEXTURE_MEMBER(base_ptr, TRUE) ++SWR_TEXTURE_MEMBER(row_stride, FALSE) ++SWR_TEXTURE_MEMBER(img_stride, FALSE) ++SWR_TEXTURE_MEMBER(mip_offsets, FALSE) ++ ++ ++/** ++ * Fetch the specified member of the lp_jit_sampler structure. ++ * \param emit_load if TRUE, emit the LLVM load instruction to actually ++ * fetch the field's value. Otherwise, just emit the ++ * GEP code to address the field. 
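++ *
++ * For example, sampler_unit = 1 with member_index = swr_jit_sampler_min_lod
++ * addresses context->samplersFS[1].min_lod via the GEP indices
++ * { 0, swr_draw_context_samplersFS, 1, swr_jit_sampler_min_lod }, and then
++ * loads that address when emit_load is TRUE.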
++ * ++ * @sa http://llvm.org/docs/GetElementPtr.html ++ */ ++static LLVMValueRef ++swr_sampler_member(const struct lp_sampler_dynamic_state *base, ++ struct gallivm_state *gallivm, ++ LLVMValueRef context_ptr, ++ unsigned sampler_unit, ++ unsigned member_index, ++ const char *member_name, ++ boolean emit_load) ++{ ++ LLVMBuilderRef builder = gallivm->builder; ++ LLVMValueRef indices[4]; ++ LLVMValueRef ptr; ++ LLVMValueRef res; ++ ++ assert(sampler_unit < PIPE_MAX_SAMPLERS); ++ ++ /* context[0] */ ++ indices[0] = lp_build_const_int32(gallivm, 0); ++ /* context[0].samplers */ ++ indices[1] = lp_build_const_int32(gallivm, swr_draw_context_samplersFS); ++ /* context[0].samplers[unit] */ ++ indices[2] = lp_build_const_int32(gallivm, sampler_unit); ++ /* context[0].samplers[unit].member */ ++ indices[3] = lp_build_const_int32(gallivm, member_index); ++ ++ ptr = LLVMBuildGEP(builder, context_ptr, indices, Elements(indices), ""); ++ ++ if (emit_load) ++ res = LLVMBuildLoad(builder, ptr, ""); ++ else ++ res = ptr; ++ ++ lp_build_name(res, "context.sampler%u.%s", sampler_unit, member_name); ++ ++ return res; ++} ++ ++ ++#define SWR_SAMPLER_MEMBER(_name, _emit_load) \ ++ static LLVMValueRef swr_sampler_##_name( \ ++ const struct lp_sampler_dynamic_state *base, \ ++ struct gallivm_state *gallivm, \ ++ LLVMValueRef context_ptr, \ ++ unsigned sampler_unit) \ ++ { \ ++ return swr_sampler_member(base, \ ++ gallivm, \ ++ context_ptr, \ ++ sampler_unit, \ ++ swr_jit_sampler_##_name, \ ++ #_name, \ ++ _emit_load); \ ++ } ++ ++ ++SWR_SAMPLER_MEMBER(min_lod, TRUE) ++SWR_SAMPLER_MEMBER(max_lod, TRUE) ++SWR_SAMPLER_MEMBER(lod_bias, TRUE) ++SWR_SAMPLER_MEMBER(border_color, FALSE) ++ ++ ++static void ++swr_sampler_soa_destroy(struct lp_build_sampler_soa *sampler) ++{ ++ FREE(sampler); ++} ++ ++ ++/** ++ * Fetch filtered values from texture. ++ * The 'texel' parameter returns four vectors corresponding to R, G, B, A. ++ */ ++static void ++swr_sampler_soa_emit_fetch_texel(const struct lp_build_sampler_soa *base, ++ struct gallivm_state *gallivm, ++ const struct lp_sampler_params *params) ++{ ++ struct swr_sampler_soa *sampler = (struct swr_sampler_soa *)base; ++ unsigned texture_index = params->texture_index; ++ unsigned sampler_index = params->sampler_index; ++ ++ assert(sampler_index < PIPE_MAX_SAMPLERS); ++ assert(texture_index < PIPE_MAX_SHADER_SAMPLER_VIEWS); ++ ++#if 0 ++ lp_build_sample_nop(gallivm, params->type, params->coords, params->texel); ++#else ++ lp_build_sample_soa( ++ &sampler->dynamic_state.static_state[texture_index].texture_state, ++ &sampler->dynamic_state.static_state[sampler_index].sampler_state, ++ &sampler->dynamic_state.base, ++ gallivm, ++ params); ++#endif ++} ++ ++/** ++ * Fetch the texture size. 
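++ * Used to answer size queries (e.g. the TGSI TXQ opcode): it simply
++ * forwards to lp_build_size_query_soa with this unit's static texture
++ * state plus the dynamic-state callbacks defined above.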
++ */ ++static void ++swr_sampler_soa_emit_size_query(const struct lp_build_sampler_soa *base, ++ struct gallivm_state *gallivm, ++ struct lp_type type, ++ unsigned texture_unit, ++ unsigned target, ++ LLVMValueRef context_ptr, ++ boolean is_sviewinfo, ++ enum lp_sampler_lod_property lod_property, ++ LLVMValueRef explicit_lod, /* optional */ ++ LLVMValueRef *sizes_out) ++{ ++ struct swr_sampler_soa *sampler = (struct swr_sampler_soa *)base; ++ ++ assert(texture_unit < PIPE_MAX_SHADER_SAMPLER_VIEWS); ++ ++ lp_build_size_query_soa( ++ gallivm, ++ &sampler->dynamic_state.static_state[texture_unit].texture_state, ++ &sampler->dynamic_state.base, ++ type, ++ texture_unit, ++ target, ++ context_ptr, ++ is_sviewinfo, ++ lod_property, ++ explicit_lod, ++ sizes_out); ++} ++ ++ ++struct lp_build_sampler_soa * ++swr_sampler_soa_create(const struct swr_sampler_static_state *static_state) ++{ ++ struct swr_sampler_soa *sampler; ++ ++ sampler = CALLOC_STRUCT(swr_sampler_soa); ++ if (!sampler) ++ return NULL; ++ ++ sampler->base.destroy = swr_sampler_soa_destroy; ++ sampler->base.emit_tex_sample = swr_sampler_soa_emit_fetch_texel; ++ sampler->base.emit_size_query = swr_sampler_soa_emit_size_query; ++ sampler->dynamic_state.base.width = swr_texture_width; ++ sampler->dynamic_state.base.height = swr_texture_height; ++ sampler->dynamic_state.base.depth = swr_texture_depth; ++ sampler->dynamic_state.base.first_level = swr_texture_first_level; ++ sampler->dynamic_state.base.last_level = swr_texture_last_level; ++ sampler->dynamic_state.base.base_ptr = swr_texture_base_ptr; ++ sampler->dynamic_state.base.row_stride = swr_texture_row_stride; ++ sampler->dynamic_state.base.img_stride = swr_texture_img_stride; ++ sampler->dynamic_state.base.mip_offsets = swr_texture_mip_offsets; ++ sampler->dynamic_state.base.min_lod = swr_sampler_min_lod; ++ sampler->dynamic_state.base.max_lod = swr_sampler_max_lod; ++ sampler->dynamic_state.base.lod_bias = swr_sampler_lod_bias; ++ sampler->dynamic_state.base.border_color = swr_sampler_border_color; ++ ++ sampler->dynamic_state.static_state = static_state; ++ ++ return &sampler->base; ++} +diff --git a/src/gallium/drivers/swr/swr_tex_sample.h b/src/gallium/drivers/swr/swr_tex_sample.h +new file mode 100644 +index 0000000..f5c368c +--- /dev/null ++++ b/src/gallium/drivers/swr/swr_tex_sample.h +@@ -0,0 +1,47 @@ ++/************************************************************************** ++ * ++ * Copyright 2007 VMware, Inc. ++ * All Rights Reserved. ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the ++ * "Software"), to deal in the Software without restriction, including ++ * without limitation the rights to use, copy, modify, merge, publish, ++ * distribute, sub license, and/or sell copies of the Software, and to ++ * permit persons to whom the Software is furnished to do so, subject to ++ * the following conditions: ++ * ++ * The above copyright notice and this permission notice (including the ++ * next paragraph) shall be included in all copies or substantial portions ++ * of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS ++ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF ++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
++ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR ++ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, ++ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE ++ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ++ * ++ **************************************************************************/ ++ ++#pragma once ++ ++#include "gallivm/lp_bld.h" ++ ++struct swr_sampler_static_state { ++ /* ++ * These attributes are effectively interleaved for more sane key handling. ++ * However, there might be lots of null space if the amount of samplers and ++ * textures isn't the same. ++ */ ++ struct lp_static_sampler_state sampler_state; ++ struct lp_static_texture_state texture_state; ++}; ++ ++/** ++ * Pure-LLVM texture sampling code generator. ++ * ++ */ ++struct lp_build_sampler_soa * ++swr_sampler_soa_create(const struct swr_sampler_static_state *key); +diff --git a/src/gallium/targets/libgl-xlib/Makefile.am b/src/gallium/targets/libgl-xlib/Makefile.am +index d99caae..527d01b 100644 +--- a/src/gallium/targets/libgl-xlib/Makefile.am ++++ b/src/gallium/targets/libgl-xlib/Makefile.am +@@ -84,4 +84,9 @@ endif + EXTRA_lib@GL_LIB@_la_DEPENDENCIES = libgl-xlib.sym + EXTRA_DIST = SConscript libgl-xlib.sym + ++if HAVE_GALLIUM_SWR ++lib@GL_LIB@_la_LIBADD += $(top_builddir)/src/gallium/drivers/swr/libmesaswr.la $(LLVM_LIBS) ++AM_CPPFLAGS += -DGALLIUM_SWR ++endif ++ + include $(top_srcdir)/install-gallium-links.mk +diff --git a/src/gallium/targets/libgl-xlib/SConscript b/src/gallium/targets/libgl-xlib/SConscript +index df5a220..da77ad5 100644 +--- a/src/gallium/targets/libgl-xlib/SConscript ++++ b/src/gallium/targets/libgl-xlib/SConscript +@@ -46,6 +46,10 @@ if env['llvm']: + env.Append(CPPDEFINES = ['GALLIUM_LLVMPIPE']) + env.Prepend(LIBS = [llvmpipe]) + ++if env['llvm']: ++ env.Append(CPPDEFINES = ['GALLIUM_SWR']) ++ env.Prepend(LIBS = [swr]) ++ + # Disallow undefined symbols + if env['platform'] != 'darwin': + env.Append(SHLINKFLAGS = ['-Wl,-z,defs']) +diff --git a/src/gallium/targets/osmesa/Makefile.am b/src/gallium/targets/osmesa/Makefile.am +index 38e515f..5d39486 100644 +--- a/src/gallium/targets/osmesa/Makefile.am ++++ b/src/gallium/targets/osmesa/Makefile.am +@@ -74,6 +74,12 @@ lib@OSMESA_LIB@_la_LDFLAGS += $(LLVM_LDFLAGS) + lib@OSMESA_LIB@_la_LIBADD += $(top_builddir)/src/gallium/drivers/llvmpipe/libllvmpipe.la $(LLVM_LIBS) + endif + ++if HAVE_GALLIUM_SWR ++AM_CPPFLAGS += -DGALLIUM_SWR ++lib@OSMESA_LIB@_la_LDFLAGS += $(LLVM_LDFLAGS) ++lib@OSMESA_LIB@_la_LIBADD += $(top_builddir)/src/gallium/drivers/swr/libmesaswr.la $(LLVM_LIBS) ++endif ++ + EXTRA_lib@OSMESA_LIB@_la_DEPENDENCIES = osmesa.sym + EXTRA_DIST = \ + osmesa.sym \ +-- +2.6.2 + diff --git a/0002-swr-484541-Initial-public-SWR.patch b/0002-swr-484541-Initial-public-SWR.patch new file mode 100644 index 0000000..c43d9c0 --- /dev/null +++ b/0002-swr-484541-Initial-public-SWR.patch @@ -0,0 +1,46197 @@ +From 378e7aa8e96eb976aa4fe8cea6e522c3c2566031 Mon Sep 17 00:00:00 2001 +From: Tim Rowley +Date: Mon, 19 Oct 2015 13:34:59 -0500 +Subject: [PATCH 2/3] swr-484541: Initial public SWR + +--- + .../drivers/swr/rasterizer/common/containers.hpp | 208 + + .../drivers/swr/rasterizer/common/formats.cpp | 5029 ++++++++++++++++++++ + .../drivers/swr/rasterizer/common/formats.h | 222 + + src/gallium/drivers/swr/rasterizer/common/isa.hpp | 235 + + src/gallium/drivers/swr/rasterizer/common/os.h | 194 + + .../swr/rasterizer/common/rdtsc_buckets.cpp | 176 + + 
.../drivers/swr/rasterizer/common/rdtsc_buckets.h | 195 + + .../swr/rasterizer/common/rdtsc_buckets_shared.h | 167 + + .../drivers/swr/rasterizer/common/simdintrin.h | 792 +++ + .../drivers/swr/rasterizer/common/swr_assert.cpp | 141 + + .../drivers/swr/rasterizer/common/swr_assert.h | 84 + + src/gallium/drivers/swr/rasterizer/core/api.cpp | 1461 ++++++ + src/gallium/drivers/swr/rasterizer/core/api.h | 483 ++ + src/gallium/drivers/swr/rasterizer/core/arena.cpp | 126 + + src/gallium/drivers/swr/rasterizer/core/arena.h | 63 + + .../drivers/swr/rasterizer/core/backend.cpp | 1150 +++++ + src/gallium/drivers/swr/rasterizer/core/backend.h | 45 + + src/gallium/drivers/swr/rasterizer/core/blend.h | 318 ++ + src/gallium/drivers/swr/rasterizer/core/clip.cpp | 201 + + src/gallium/drivers/swr/rasterizer/core/clip.h | 851 ++++ + src/gallium/drivers/swr/rasterizer/core/context.h | 444 ++ + .../drivers/swr/rasterizer/core/depthstencil.h | 215 + + src/gallium/drivers/swr/rasterizer/core/fifo.hpp | 144 + + .../swr/rasterizer/core/format_conversion.h | 167 + + .../drivers/swr/rasterizer/core/format_traits.h | 2954 ++++++++++++ + .../drivers/swr/rasterizer/core/format_types.h | 1053 ++++ + .../drivers/swr/rasterizer/core/frontend.cpp | 1972 ++++++++ + src/gallium/drivers/swr/rasterizer/core/frontend.h | 326 ++ + src/gallium/drivers/swr/rasterizer/core/knobs.h | 139 + + .../drivers/swr/rasterizer/core/knobs_init.h | 98 + + .../drivers/swr/rasterizer/core/multisample.h | 562 +++ + src/gallium/drivers/swr/rasterizer/core/pa.h | 1205 +++++ + src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp | 1330 ++++++ + .../drivers/swr/rasterizer/core/rasterizer.cpp | 1217 +++++ + .../drivers/swr/rasterizer/core/rasterizer.h | 34 + + .../drivers/swr/rasterizer/core/rdtsc_core.cpp | 90 + + .../drivers/swr/rasterizer/core/rdtsc_core.h | 175 + + src/gallium/drivers/swr/rasterizer/core/state.h | 918 ++++ + .../drivers/swr/rasterizer/core/tessellator.h | 88 + + .../drivers/swr/rasterizer/core/threads.cpp | 884 ++++ + src/gallium/drivers/swr/rasterizer/core/threads.h | 62 + + .../drivers/swr/rasterizer/core/tilemgr.cpp | 105 + + src/gallium/drivers/swr/rasterizer/core/tilemgr.h | 392 ++ + src/gallium/drivers/swr/rasterizer/core/utils.cpp | 148 + + src/gallium/drivers/swr/rasterizer/core/utils.h | 745 +++ + .../drivers/swr/rasterizer/jitter/JitManager.cpp | 292 ++ + .../drivers/swr/rasterizer/jitter/JitManager.h | 182 + + .../drivers/swr/rasterizer/jitter/blend_jit.cpp | 473 ++ + .../drivers/swr/rasterizer/jitter/blend_jit.h | 49 + + .../drivers/swr/rasterizer/jitter/builder.cpp | 56 + + .../drivers/swr/rasterizer/jitter/builder.h | 66 + + .../drivers/swr/rasterizer/jitter/builder_gen.cpp | 1052 ++++ + .../drivers/swr/rasterizer/jitter/builder_gen.h | 205 + + .../drivers/swr/rasterizer/jitter/builder_math.h | 34 + + .../drivers/swr/rasterizer/jitter/builder_misc.cpp | 1195 +++++ + .../drivers/swr/rasterizer/jitter/builder_misc.h | 141 + + .../drivers/swr/rasterizer/jitter/builder_x86.cpp | 242 + + .../drivers/swr/rasterizer/jitter/builder_x86.h | 65 + + .../drivers/swr/rasterizer/jitter/fetch_jit.cpp | 1450 ++++++ + .../drivers/swr/rasterizer/jitter/fetch_jit.h | 128 + + .../drivers/swr/rasterizer/jitter/jit_api.h | 105 + + .../rasterizer/jitter/scripts/gen_llvm_types.py | 334 ++ + .../swr/rasterizer/jitter/streamout_jit.cpp | 348 ++ + .../drivers/swr/rasterizer/jitter/streamout_jit.h | 91 + + .../drivers/swr/rasterizer/memory/ClearTile.cpp | 287 ++ + .../drivers/swr/rasterizer/memory/Convert.h | 698 +++ + 
.../drivers/swr/rasterizer/memory/LoadTile.cpp | 382 ++ + .../drivers/swr/rasterizer/memory/StoreTile.cpp | 1645 +++++++ + .../swr/rasterizer/memory/TilingFunctions.h | 518 ++ + .../drivers/swr/rasterizer/memory/tilingtraits.h | 239 + + .../drivers/swr/rasterizer/scripts/gen_knobs.py | 79 + + .../drivers/swr/rasterizer/scripts/knob_defs.py | 212 + + .../swr/rasterizer/scripts/mako/__init__.py | 8 + + .../swr/rasterizer/scripts/mako/_ast_util.py | 845 ++++ + .../drivers/swr/rasterizer/scripts/mako/ast.py | 178 + + .../drivers/swr/rasterizer/scripts/mako/cache.py | 238 + + .../drivers/swr/rasterizer/scripts/mako/cmd.py | 62 + + .../drivers/swr/rasterizer/scripts/mako/codegen.py | 1237 +++++ + .../drivers/swr/rasterizer/scripts/mako/compat.py | 174 + + .../swr/rasterizer/scripts/mako/exceptions.py | 373 ++ + .../drivers/swr/rasterizer/scripts/mako/filters.py | 201 + + .../drivers/swr/rasterizer/scripts/mako/lexer.py | 441 ++ + .../drivers/swr/rasterizer/scripts/mako/lookup.py | 359 ++ + .../swr/rasterizer/scripts/mako/parsetree.py | 594 +++ + .../drivers/swr/rasterizer/scripts/mako/pygen.py | 299 ++ + .../swr/rasterizer/scripts/mako/pyparser.py | 232 + + .../drivers/swr/rasterizer/scripts/mako/runtime.py | 878 ++++ + .../swr/rasterizer/scripts/mako/template.py | 705 +++ + .../drivers/swr/rasterizer/scripts/mako/util.py | 360 ++ + .../rasterizer/scripts/templates/knobs.template | 106 + + 90 files changed, 45466 insertions(+) + create mode 100644 src/gallium/drivers/swr/rasterizer/common/containers.hpp + create mode 100644 src/gallium/drivers/swr/rasterizer/common/formats.cpp + create mode 100644 src/gallium/drivers/swr/rasterizer/common/formats.h + create mode 100644 src/gallium/drivers/swr/rasterizer/common/isa.hpp + create mode 100644 src/gallium/drivers/swr/rasterizer/common/os.h + create mode 100644 src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.cpp + create mode 100644 src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.h + create mode 100644 src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets_shared.h + create mode 100644 src/gallium/drivers/swr/rasterizer/common/simdintrin.h + create mode 100644 src/gallium/drivers/swr/rasterizer/common/swr_assert.cpp + create mode 100644 src/gallium/drivers/swr/rasterizer/common/swr_assert.h + create mode 100644 src/gallium/drivers/swr/rasterizer/core/api.cpp + create mode 100644 src/gallium/drivers/swr/rasterizer/core/api.h + create mode 100644 src/gallium/drivers/swr/rasterizer/core/arena.cpp + create mode 100644 src/gallium/drivers/swr/rasterizer/core/arena.h + create mode 100644 src/gallium/drivers/swr/rasterizer/core/backend.cpp + create mode 100644 src/gallium/drivers/swr/rasterizer/core/backend.h + create mode 100644 src/gallium/drivers/swr/rasterizer/core/blend.h + create mode 100644 src/gallium/drivers/swr/rasterizer/core/clip.cpp + create mode 100644 src/gallium/drivers/swr/rasterizer/core/clip.h + create mode 100644 src/gallium/drivers/swr/rasterizer/core/context.h + create mode 100644 src/gallium/drivers/swr/rasterizer/core/depthstencil.h + create mode 100644 src/gallium/drivers/swr/rasterizer/core/fifo.hpp + create mode 100644 src/gallium/drivers/swr/rasterizer/core/format_conversion.h + create mode 100644 src/gallium/drivers/swr/rasterizer/core/format_traits.h + create mode 100644 src/gallium/drivers/swr/rasterizer/core/format_types.h + create mode 100644 src/gallium/drivers/swr/rasterizer/core/frontend.cpp + create mode 100644 src/gallium/drivers/swr/rasterizer/core/frontend.h + create mode 100644 
src/gallium/drivers/swr/rasterizer/core/knobs.h + create mode 100644 src/gallium/drivers/swr/rasterizer/core/knobs_init.h + create mode 100644 src/gallium/drivers/swr/rasterizer/core/multisample.h + create mode 100644 src/gallium/drivers/swr/rasterizer/core/pa.h + create mode 100644 src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp + create mode 100644 src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp + create mode 100644 src/gallium/drivers/swr/rasterizer/core/rasterizer.h + create mode 100644 src/gallium/drivers/swr/rasterizer/core/rdtsc_core.cpp + create mode 100644 src/gallium/drivers/swr/rasterizer/core/rdtsc_core.h + create mode 100644 src/gallium/drivers/swr/rasterizer/core/state.h + create mode 100644 src/gallium/drivers/swr/rasterizer/core/tessellator.h + create mode 100644 src/gallium/drivers/swr/rasterizer/core/threads.cpp + create mode 100644 src/gallium/drivers/swr/rasterizer/core/threads.h + create mode 100644 src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp + create mode 100644 src/gallium/drivers/swr/rasterizer/core/tilemgr.h + create mode 100644 src/gallium/drivers/swr/rasterizer/core/utils.cpp + create mode 100644 src/gallium/drivers/swr/rasterizer/core/utils.h + create mode 100644 src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp + create mode 100644 src/gallium/drivers/swr/rasterizer/jitter/JitManager.h + create mode 100644 src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp + create mode 100644 src/gallium/drivers/swr/rasterizer/jitter/blend_jit.h + create mode 100644 src/gallium/drivers/swr/rasterizer/jitter/builder.cpp + create mode 100644 src/gallium/drivers/swr/rasterizer/jitter/builder.h + create mode 100644 src/gallium/drivers/swr/rasterizer/jitter/builder_gen.cpp + create mode 100644 src/gallium/drivers/swr/rasterizer/jitter/builder_gen.h + create mode 100644 src/gallium/drivers/swr/rasterizer/jitter/builder_math.h + create mode 100644 src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp + create mode 100644 src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h + create mode 100644 src/gallium/drivers/swr/rasterizer/jitter/builder_x86.cpp + create mode 100644 src/gallium/drivers/swr/rasterizer/jitter/builder_x86.h + create mode 100644 src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp + create mode 100644 src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.h + create mode 100644 src/gallium/drivers/swr/rasterizer/jitter/jit_api.h + create mode 100644 src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_types.py + create mode 100644 src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp + create mode 100644 src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.h + create mode 100644 src/gallium/drivers/swr/rasterizer/memory/ClearTile.cpp + create mode 100644 src/gallium/drivers/swr/rasterizer/memory/Convert.h + create mode 100644 src/gallium/drivers/swr/rasterizer/memory/LoadTile.cpp + create mode 100644 src/gallium/drivers/swr/rasterizer/memory/StoreTile.cpp + create mode 100644 src/gallium/drivers/swr/rasterizer/memory/TilingFunctions.h + create mode 100644 src/gallium/drivers/swr/rasterizer/memory/tilingtraits.h + create mode 100644 src/gallium/drivers/swr/rasterizer/scripts/gen_knobs.py + create mode 100644 src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py + create mode 100644 src/gallium/drivers/swr/rasterizer/scripts/mako/__init__.py + create mode 100644 src/gallium/drivers/swr/rasterizer/scripts/mako/_ast_util.py + create mode 100644 src/gallium/drivers/swr/rasterizer/scripts/mako/ast.py + create mode 
100644 src/gallium/drivers/swr/rasterizer/scripts/mako/cache.py + create mode 100644 src/gallium/drivers/swr/rasterizer/scripts/mako/cmd.py + create mode 100644 src/gallium/drivers/swr/rasterizer/scripts/mako/codegen.py + create mode 100644 src/gallium/drivers/swr/rasterizer/scripts/mako/compat.py + create mode 100644 src/gallium/drivers/swr/rasterizer/scripts/mako/exceptions.py + create mode 100644 src/gallium/drivers/swr/rasterizer/scripts/mako/filters.py + create mode 100644 src/gallium/drivers/swr/rasterizer/scripts/mako/lexer.py + create mode 100644 src/gallium/drivers/swr/rasterizer/scripts/mako/lookup.py + create mode 100644 src/gallium/drivers/swr/rasterizer/scripts/mako/parsetree.py + create mode 100644 src/gallium/drivers/swr/rasterizer/scripts/mako/pygen.py + create mode 100644 src/gallium/drivers/swr/rasterizer/scripts/mako/pyparser.py + create mode 100644 src/gallium/drivers/swr/rasterizer/scripts/mako/runtime.py + create mode 100644 src/gallium/drivers/swr/rasterizer/scripts/mako/template.py + create mode 100644 src/gallium/drivers/swr/rasterizer/scripts/mako/util.py + create mode 100644 src/gallium/drivers/swr/rasterizer/scripts/templates/knobs.template + +diff --git a/src/gallium/drivers/swr/rasterizer/common/containers.hpp b/src/gallium/drivers/swr/rasterizer/common/containers.hpp +new file mode 100644 +index 0000000..bc96c5f +--- /dev/null ++++ b/src/gallium/drivers/swr/rasterizer/common/containers.hpp +@@ -0,0 +1,208 @@ ++/**************************************************************************** ++* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. ++* ++* Permission is hereby granted, free of charge, to any person obtaining a ++* copy of this software and associated documentation files (the "Software"), ++* to deal in the Software without restriction, including without limitation ++* the rights to use, copy, modify, merge, publish, distribute, sublicense, ++* and/or sell copies of the Software, and to permit persons to whom the ++* Software is furnished to do so, subject to the following conditions: ++* ++* The above copyright notice and this permission notice (including the next ++* paragraph) shall be included in all copies or substantial portions of the ++* Software. ++* ++* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++* IN THE SOFTWARE. 
++****************************************************************************/
++
++#ifndef SWRLIB_CONTAINERS_HPP__
++#define SWRLIB_CONTAINERS_HPP__
++
++#include <functional>
++#include "common/os.h"
++
++namespace SWRL
++{
++
++template <typename T, std::size_t NUM_ELEMENTS>
++struct UncheckedFixedVector
++{
++    UncheckedFixedVector() : mSize(0)
++    {
++    }
++
++    UncheckedFixedVector(std::size_t size, T const& exemplar)
++    {
++        this->mSize = 0;
++        for (std::size_t i = 0; i < size; ++i)
++            this->push_back(exemplar);
++    }
++
++    template <typename Iter>
++    UncheckedFixedVector(Iter fst, Iter lst)
++    {
++        this->mSize = 0;
++        for ( ; fst != lst; ++fst)
++            this->push_back(*fst);
++    }
++
++    UncheckedFixedVector(UncheckedFixedVector const& UFV)
++    {
++        this->mSize = 0;
++        for (std::size_t i = 0, N = UFV.size(); i < N; ++i)
++            (*this)[i] = UFV[i];
++        this->mSize = UFV.size();
++    }
++
++    UncheckedFixedVector& operator=(UncheckedFixedVector const& UFV)
++    {
++        for (std::size_t i = 0, N = UFV.size(); i < N; ++i)
++            (*this)[i] = UFV[i];
++        this->mSize = UFV.size();
++        return *this;
++    }
++
++    T* begin() { return &this->mElements[0]; }
++    T* end() { return &this->mElements[0] + this->mSize; }
++    T const* begin() const { return &this->mElements[0]; }
++    T const* end() const { return &this->mElements[0] + this->mSize; }
++
++    friend bool operator==(UncheckedFixedVector const& L, UncheckedFixedVector const& R)
++    {
++        if (L.size() != R.size()) return false;
++        for (std::size_t i = 0, N = L.size(); i < N; ++i)
++        {
++            if (L[i] != R[i]) return false;
++        }
++        return true;
++    }
++
++    friend bool operator!=(UncheckedFixedVector const& L, UncheckedFixedVector const& R)
++    {
++        if (L.size() != R.size()) return true;
++        for (std::size_t i = 0, N = L.size(); i < N; ++i)
++        {
++            if (L[i] != R[i]) return true;
++        }
++        return false;
++    }
++
++    T& operator[](std::size_t idx)
++    {
++        return this->mElements[idx];
++    }
++    T const& operator[](std::size_t idx) const
++    {
++        return this->mElements[idx];
++    }
++    void push_back(T const& t)
++    {
++        this->mElements[this->mSize] = t;
++        ++this->mSize;
++    }
++    void pop_back()
++    {
++        SWR_ASSERT(this->mSize > 0);
++        --this->mSize;
++    }
++    T& back()
++    {
++        return this->mElements[this->mSize-1];
++    }
++    T const& back() const
++    {
++        return this->mElements[this->mSize-1];
++    }
++    bool empty() const
++    {
++        return this->mSize == 0;
++    }
++    std::size_t size() const
++    {
++        return this->mSize;
++    }
++    void resize(std::size_t sz)
++    {
++        this->mSize = sz;
++    }
++    void clear()
++    {
++        this->resize(0);
++    }
++private:
++    std::size_t mSize;
++    T mElements[NUM_ELEMENTS];
++};
++
++template <typename T, std::size_t NUM_ELEMENTS>
++struct FixedStack : UncheckedFixedVector<T, NUM_ELEMENTS>
++{
++    FixedStack() {}
++
++    void push(T const& t)
++    {
++        this->push_back(t);
++    }
++
++    void pop()
++    {
++        this->pop_back();
++    }
++
++    T& top()
++    {
++        return this->back();
++    }
++
++    T const& top() const
++    {
++        return this->back();
++    }
++};
++
++template <typename T>
++struct CRCHash
++{
++    static_assert((sizeof(T) % sizeof(UINT)) == 0, "CRCHash expects templated type size is even multiple of 4B");
++    UINT operator()(const T& k) const
++    {
++        UINT *pData = (UINT*)&k;
++        UINT crc = 0;
++        for (UINT i = 0; i < sizeof(T) / sizeof(UINT); ++i)
++        {
++            crc = _mm_crc32_u32(crc, pData[i]);
++        }
++        return crc;
++    }
++};
++
++}// end SWRL
++
++namespace std
++{
++
++template <typename T, std::size_t NUM_ELEMENTS>
++struct hash<SWRL::UncheckedFixedVector<T, NUM_ELEMENTS>>
++{
++    size_t operator() (SWRL::UncheckedFixedVector<T, NUM_ELEMENTS> const& v) const
++    {
++        if (v.size() == 0) return 0;
++        std::hash<T> H;
++        size_t x = H(v[0]);
++        if (v.size() == 1) return x;
++        for (size_t i = 1; i < v.size(); ++i)
++            x ^= H(v[i]) + 0x9e3779b9 + (x<<6) +
(x>>2); ++ return x; ++ } ++}; ++ ++ ++}// end std. ++ ++#endif//SWRLIB_CONTAINERS_HPP__ +diff --git a/src/gallium/drivers/swr/rasterizer/common/formats.cpp b/src/gallium/drivers/swr/rasterizer/common/formats.cpp +new file mode 100644 +index 0000000..7e90ee7 +--- /dev/null ++++ b/src/gallium/drivers/swr/rasterizer/common/formats.cpp +@@ -0,0 +1,5029 @@ ++/**************************************************************************** ++* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. ++* ++* Permission is hereby granted, free of charge, to any person obtaining a ++* copy of this software and associated documentation files (the "Software"), ++* to deal in the Software without restriction, including without limitation ++* the rights to use, copy, modify, merge, publish, distribute, sublicense, ++* and/or sell copies of the Software, and to permit persons to whom the ++* Software is furnished to do so, subject to the following conditions: ++* ++* The above copyright notice and this permission notice (including the next ++* paragraph) shall be included in all copies or substantial portions of the ++* Software. ++* ++* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++* IN THE SOFTWARE. ++* ++* @file formats.cpp ++* ++* @brief auto-generated file ++* ++* DO NOT EDIT ++* ++******************************************************************************/ ++ ++#include "formats.h" ++ ++// lookup table for unorm8 srgb -> float conversion ++const uint32_t srgb8Table[256] = { ++ 0x00000000, 0x399f22b4, 0x3a1f22b4, 0x3a6eb40f, 0x3a9f22b4, 0x3ac6eb61, 0x3aeeb40f, 0x3b0b3e5e, 0x3b1f22b4, 0x3b33070b, 0x3b46eb61, 0x3b5b518d, 0x3b70f18d, 0x3b83e1c6, 0x3b8fe616, 0x3b9c87fd, ++ 0x3ba9c9b5, 0x3bb7ad6f, 0x3bc63549, 0x3bd5635f, 0x3be539c1, 0x3bf5ba70, 0x3c0373b5, 0x3c0c6152, 0x3c15a703, 0x3c1f45be, 0x3c293e6b, 0x3c3391f7, 0x3c3e4149, 0x3c494d43, 0x3c54b6c7, 0x3c607eb1, ++ 0x3c6ca5dc, 0x3c792d22, 0x3c830aa8, 0x3c89af9f, 0x3c9085db, 0x3c978dc5, 0x3c9ec7c0, 0x3ca63431, 0x3cadd37d, 0x3cb5a601, 0x3cbdac20, 0x3cc5e639, 0x3cce54ab, 0x3cd6f7d3, 0x3cdfd00e, 0x3ce8ddb9, ++ 0x3cf22131, 0x3cfb9ac6, 0x3d02a56c, 0x3d0798df, 0x3d0ca7e7, 0x3d11d2b0, 0x3d171965, 0x3d1c7c31, 0x3d21fb3c, 0x3d2796b2, 0x3d2d4ebe, 0x3d332384, 0x3d39152e, 0x3d3f23e6, 0x3d454fd4, 0x3d4b991f, ++ 0x3d51ffef, 0x3d58846a, 0x3d5f26b7, 0x3d65e6fe, 0x3d6cc564, 0x3d73c20f, 0x3d7add25, 0x3d810b66, 0x3d84b795, 0x3d887330, 0x3d8c3e4a, 0x3d9018f6, 0x3d940345, 0x3d97fd4a, 0x3d9c0716, 0x3da020bb, ++ 0x3da44a4b, 0x3da883d7, 0x3daccd70, 0x3db12728, 0x3db59110, 0x3dba0b38, 0x3dbe95b5, 0x3dc33092, 0x3dc7dbe2, 0x3dcc97b6, 0x3dd1641f, 0x3dd6412c, 0x3ddb2eef, 0x3de02d77, 0x3de53cd5, 0x3dea5d19, ++ 0x3def8e55, 0x3df4d093, 0x3dfa23e8, 0x3dff8861, 0x3e027f07, 0x3e054282, 0x3e080ea5, 0x3e0ae379, 0x3e0dc107, 0x3e10a755, 0x3e13966c, 0x3e168e53, 0x3e198f11, 0x3e1c98ae, 0x3e1fab32, 0x3e22c6a3, ++ 0x3e25eb09, 0x3e29186c, 0x3e2c4ed2, 0x3e2f8e45, 0x3e32d6c8, 0x3e362865, 0x3e398322, 0x3e3ce706, 0x3e405419, 0x3e43ca62, 0x3e4749e8, 0x3e4ad2b1, 0x3e4e64c6, 0x3e52002b, 0x3e55a4e9, 0x3e595307, ++ 0x3e5d0a8b, 0x3e60cb7c, 0x3e6495e0, 0x3e6869bf, 0x3e6c4720, 0x3e702e08, 
0x3e741e7f, 0x3e78188c, 0x3e7c1c38, 0x3e8014c2, 0x3e82203c, 0x3e84308d, 0x3e8645ba, 0x3e885fc5, 0x3e8a7eb2, 0x3e8ca283, ++ 0x3e8ecb3d, 0x3e90f8e1, 0x3e932b74, 0x3e9562f8, 0x3e979f71, 0x3e99e0e2, 0x3e9c274e, 0x3e9e72b7, 0x3ea0c322, 0x3ea31892, 0x3ea57308, 0x3ea7d289, 0x3eaa3718, 0x3eaca0b7, 0x3eaf0f69, 0x3eb18333, ++ 0x3eb3fc16, 0x3eb67a15, 0x3eb8fd34, 0x3ebb8576, 0x3ebe12e1, 0x3ec0a571, 0x3ec33d2d, 0x3ec5da17, 0x3ec87c33, 0x3ecb2383, 0x3ecdd00b, 0x3ed081cd, 0x3ed338cc, 0x3ed5f50b, 0x3ed8b68d, 0x3edb7d54, ++ 0x3ede4965, 0x3ee11ac1, 0x3ee3f16b, 0x3ee6cd67, 0x3ee9aeb6, 0x3eec955d, 0x3eef815d, 0x3ef272ba, 0x3ef56976, 0x3ef86594, 0x3efb6717, 0x3efe6e02, 0x3f00bd2b, 0x3f02460c, 0x3f03d1a5, 0x3f055ff8, ++ 0x3f06f106, 0x3f0884cf, 0x3f0a1b57, 0x3f0bb49d, 0x3f0d50a2, 0x3f0eef69, 0x3f1090f2, 0x3f123540, 0x3f13dc53, 0x3f15862d, 0x3f1732cf, 0x3f18e23b, 0x3f1a9471, 0x3f1c4973, 0x3f1e0143, 0x3f1fbbe1, ++ 0x3f217950, 0x3f23398f, 0x3f24fca2, 0x3f26c288, 0x3f288b43, 0x3f2a56d5, 0x3f2c253f, 0x3f2df681, 0x3f2fca9e, 0x3f31a197, 0x3f337b6c, 0x3f355820, 0x3f3737b3, 0x3f391a26, 0x3f3aff7e, 0x3f3ce7b7, ++ 0x3f3ed2d4, 0x3f40c0d6, 0x3f42b1c0, 0x3f44a592, 0x3f469c4d, 0x3f4895f3, 0x3f4a9284, 0x3f4c9203, 0x3f4e9470, 0x3f5099cd, 0x3f52a21a, 0x3f54ad59, 0x3f56bb8c, 0x3f58ccb3, 0x3f5ae0cf, 0x3f5cf7e2, ++ 0x3f5f11ee, 0x3f612ef2, 0x3f634eef, 0x3f6571ec, 0x3f6797e1, 0x3f69c0d8, 0x3f6beccb, 0x3f6e1bc2, 0x3f704db6, 0x3f7282b1, 0x3f74baae, 0x3f76f5b3, 0x3f7933b9, 0x3f7b74cb, 0x3f7db8e0, 0x3f800000, ++}; ++ ++// order must match SWR_FORMAT ++const SWR_FORMAT_INFO gFormatInfo[] = { ++ // R32G32B32A32_FLOAT (0x0) ++ { ++ "R32G32B32A32_FLOAT", ++ { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 0, 1, 2, 3 }, // Swizzle ++ { 32, 32, 32, 32 }, // Bits per component ++ 128, // Bits per element ++ 16, // Bytes per element ++ 4, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { false, false, false, false }, // Is normalized? ++ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // R32G32B32A32_SINT (0x1) ++ { ++ "R32G32B32A32_SINT", ++ { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT }, ++ { 0, 0, 0, 0x1 }, // Defaults for missing components ++ { 0, 1, 2, 3 }, // Swizzle ++ { 32, 32, 32, 32 }, // Bits per component ++ 128, // Bits per element ++ 16, // Bytes per element ++ 4, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { false, false, false, false }, // Is normalized? ++ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // R32G32B32A32_UINT (0x2) ++ { ++ "R32G32B32A32_UINT", ++ { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT }, ++ { 0, 0, 0, 0x1 }, // Defaults for missing components ++ { 0, 1, 2, 3 }, // Swizzle ++ { 32, 32, 32, 32 }, // Bits per component ++ 128, // Bits per element ++ 16, // Bytes per element ++ 4, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { false, false, false, false }, // Is normalized? 
++ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // 0x3 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x4 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x5 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // R32G32B32X32_FLOAT (0x6) ++ { ++ "R32G32B32X32_FLOAT", ++ { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNUSED }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 0, 1, 2, 3 }, // Swizzle ++ { 32, 32, 32, 32 }, // Bits per component ++ 128, // Bits per element ++ 16, // Bytes per element ++ 4, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { false, false, false, false }, // Is normalized? ++ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // R32G32B32A32_SSCALED (0x7) ++ { ++ "R32G32B32A32_SSCALED", ++ { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 0, 1, 2, 3 }, // Swizzle ++ { 32, 32, 32, 32 }, // Bits per component ++ 128, // Bits per element ++ 16, // Bytes per element ++ 4, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { false, false, false, false }, // Is normalized? ++ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // R32G32B32A32_USCALED (0x8) ++ { ++ "R32G32B32A32_USCALED", ++ { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 0, 1, 2, 3 }, // Swizzle ++ { 32, 32, 32, 32 }, // Bits per component ++ 128, // Bits per element ++ 16, // Bytes per element ++ 4, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { false, false, false, false }, // Is normalized? 
++ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // 0x9 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0xa (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0xb (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0xc (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0xd (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0xe (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0xf (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x10 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x11 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x12 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x13 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x14 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x15 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 
0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x16 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x17 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x18 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x19 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x1a (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x1b (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x1c (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x1d (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x1e (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x1f (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x20 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x21 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x22 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, 
SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x23 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x24 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x25 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x26 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x27 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x28 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x29 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x2a (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x2b (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x2c (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x2d (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x2e (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ 
// 0x2f (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x30 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x31 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x32 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x33 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x34 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x35 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x36 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x37 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x38 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x39 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x3a (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x3b (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, 
false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x3c (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x3d (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x3e (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x3f (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // R32G32B32_FLOAT (0x40) ++ { ++ "R32G32B32_FLOAT", ++ { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 0, 1, 2, 0 }, // Swizzle ++ { 32, 32, 32, 0 }, // Bits per component ++ 96, // Bits per element ++ 12, // Bytes per element ++ 3, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { false, false, false, false }, // Is normalized? ++ { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // R32G32B32_SINT (0x41) ++ { ++ "R32G32B32_SINT", ++ { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0x1 }, // Defaults for missing components ++ { 0, 1, 2, 0 }, // Swizzle ++ { 32, 32, 32, 0 }, // Bits per component ++ 96, // Bits per element ++ 12, // Bytes per element ++ 3, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { false, false, false, false }, // Is normalized? ++ { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // R32G32B32_UINT (0x42) ++ { ++ "R32G32B32_UINT", ++ { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0x1 }, // Defaults for missing components ++ { 0, 1, 2, 0 }, // Swizzle ++ { 32, 32, 32, 0 }, // Bits per component ++ 96, // Bits per element ++ 12, // Bytes per element ++ 3, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { false, false, false, false }, // Is normalized? 
++ { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // 0x43 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x44 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // R32G32B32_SSCALED (0x45) ++ { ++ "R32G32B32_SSCALED", ++ { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 0, 1, 2, 0 }, // Swizzle ++ { 32, 32, 32, 0 }, // Bits per component ++ 96, // Bits per element ++ 12, // Bytes per element ++ 3, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { false, false, false, false }, // Is normalized? ++ { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // R32G32B32_USCALED (0x46) ++ { ++ "R32G32B32_USCALED", ++ { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 0, 1, 2, 0 }, // Swizzle ++ { 32, 32, 32, 0 }, // Bits per component ++ 96, // Bits per element ++ 12, // Bytes per element ++ 3, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { false, false, false, false }, // Is normalized? ++ { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // 0x47 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x48 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x49 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x4a (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x4b (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x4c (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x4d (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, 
SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x4e (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x4f (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x50 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x51 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x52 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x53 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x54 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x55 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x56 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x57 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x58 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x59 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x5a (Padding) ++ { ++ "UNKNOWN", 
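++ // Padding entries such as this one keep gFormatInfo directly indexable by
++ // the SWR_FORMAT enum value ("order must match SWR_FORMAT" above); unused
++ // enum slots resolve to an all-zero "UNKNOWN" descriptor.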
++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x5b (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x5c (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x5d (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x5e (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x5f (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x60 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x61 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x62 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x63 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x64 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x65 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x66 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 
0.0f }, ++ 1, 1, }, ++ // 0x67 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x68 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x69 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x6a (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x6b (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x6c (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x6d (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x6e (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x6f (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x70 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x71 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x72 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x73 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, 
false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x74 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x75 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x76 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x77 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x78 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x79 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x7a (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x7b (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x7c (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x7d (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x7e (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x7f (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // R16G16B16A16_UNORM (0x80) ++ { ++ "R16G16B16A16_UNORM", ++ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM }, 
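++ // The defaults row below stores raw bit patterns: 0x3f800000 is the IEEE-754
++ // encoding of 1.0f for components fetched as float, while the integer
++ // formats elsewhere in the table use 0x1, so a missing alpha component
++ // expands to one in either representation.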
++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 0, 1, 2, 3 }, // Swizzle ++ { 16, 16, 16, 16 }, // Bits per component ++ 64, // Bits per element ++ 8, // Bytes per element ++ 4, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { true, true, true, true }, // Is normalized? ++ { 1.0f / 65535.0f, 1.0f / 65535.0f, 1.0f / 65535.0f, 1.0f / 65535.0f }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // R16G16B16A16_SNORM (0x81) ++ { ++ "R16G16B16A16_SNORM", ++ { SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 0, 1, 2, 3 }, // Swizzle ++ { 16, 16, 16, 16 }, // Bits per component ++ 64, // Bits per element ++ 8, // Bytes per element ++ 4, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { true, true, true, true }, // Is normalized? ++ { 1.0f / 32767.0f, 1.0f / 32767.0f, 1.0f / 32767.0f, 1.0f / 32767.0f }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // R16G16B16A16_SINT (0x82) ++ { ++ "R16G16B16A16_SINT", ++ { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT }, ++ { 0, 0, 0, 0x1 }, // Defaults for missing components ++ { 0, 1, 2, 3 }, // Swizzle ++ { 16, 16, 16, 16 }, // Bits per component ++ 64, // Bits per element ++ 8, // Bytes per element ++ 4, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { false, false, false, false }, // Is normalized? ++ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // R16G16B16A16_UINT (0x83) ++ { ++ "R16G16B16A16_UINT", ++ { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT }, ++ { 0, 0, 0, 0x1 }, // Defaults for missing components ++ { 0, 1, 2, 3 }, // Swizzle ++ { 16, 16, 16, 16 }, // Bits per component ++ 64, // Bits per element ++ 8, // Bytes per element ++ 4, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { false, false, false, false }, // Is normalized? ++ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // R16G16B16A16_FLOAT (0x84) ++ { ++ "R16G16B16A16_FLOAT", ++ { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 0, 1, 2, 3 }, // Swizzle ++ { 16, 16, 16, 16 }, // Bits per component ++ 64, // Bits per element ++ 8, // Bytes per element ++ 4, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { false, false, false, false }, // Is normalized? ++ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // R32G32_FLOAT (0x85) ++ { ++ "R32G32_FLOAT", ++ { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 0, 1, 0, 0 }, // Swizzle ++ { 32, 32, 0, 0 }, // Bits per component ++ 64, // Bits per element ++ 8, // Bytes per element ++ 2, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { false, false, false, false }, // Is normalized? 
++ { 1.0f, 1.0f, 0, 0 }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // R32G32_SINT (0x86) ++ { ++ "R32G32_SINT", ++ { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0x1 }, // Defaults for missing components ++ { 0, 1, 0, 0 }, // Swizzle ++ { 32, 32, 0, 0 }, // Bits per component ++ 64, // Bits per element ++ 8, // Bytes per element ++ 2, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { false, false, false, false }, // Is normalized? ++ { 1.0f, 1.0f, 0, 0 }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // R32G32_UINT (0x87) ++ { ++ "R32G32_UINT", ++ { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0x1 }, // Defaults for missing components ++ { 0, 1, 0, 0 }, // Swizzle ++ { 32, 32, 0, 0 }, // Bits per component ++ 64, // Bits per element ++ 8, // Bytes per element ++ 2, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { false, false, false, false }, // Is normalized? ++ { 1.0f, 1.0f, 0, 0 }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // R32_FLOAT_X8X24_TYPELESS (0x88) ++ { ++ "R32_FLOAT_X8X24_TYPELESS", ++ { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 0, 1, 0, 0 }, // Swizzle ++ { 32, 32, 0, 0 }, // Bits per component ++ 64, // Bits per element ++ 8, // Bytes per element ++ 2, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { false, false, false, false }, // Is normalized? ++ { 1.0f, 1.0f, 0, 0 }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // 0x89 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x8a (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x8b (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x8c (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x8d (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // R16G16B16X16_UNORM (0x8e) ++ { ++ "R16G16B16X16_UNORM", ++ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 0, 1, 2, 3 }, // Swizzle ++ { 16, 16, 16, 16 }, // Bits per component ++ 64, // Bits per element ++ 8, // Bytes per element ++ 4, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { true, true, 
true, true }, // Is normalized? ++ { 1.0f / 65535.0f, 1.0f / 65535.0f, 1.0f / 65535.0f, 1.0f / 65535.0f }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // R16G16B16X16_FLOAT (0x8f) ++ { ++ "R16G16B16X16_FLOAT", ++ { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNUSED }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 0, 1, 2, 3 }, // Swizzle ++ { 16, 16, 16, 16 }, // Bits per component ++ 64, // Bits per element ++ 8, // Bytes per element ++ 4, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { false, false, false, false }, // Is normalized? ++ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // 0x90 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x91 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x92 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // R16G16B16A16_SSCALED (0x93) ++ { ++ "R16G16B16A16_SSCALED", ++ { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 0, 1, 2, 3 }, // Swizzle ++ { 16, 16, 16, 16 }, // Bits per component ++ 64, // Bits per element ++ 8, // Bytes per element ++ 4, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { false, false, false, false }, // Is normalized? ++ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // R16G16B16A16_USCALED (0x94) ++ { ++ "R16G16B16A16_USCALED", ++ { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 0, 1, 2, 3 }, // Swizzle ++ { 16, 16, 16, 16 }, // Bits per component ++ 64, // Bits per element ++ 8, // Bytes per element ++ 4, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { false, false, false, false }, // Is normalized? ++ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // R32G32_SSCALED (0x95) ++ { ++ "R32G32_SSCALED", ++ { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 0, 1, 0, 0 }, // Swizzle ++ { 32, 32, 0, 0 }, // Bits per component ++ 64, // Bits per element ++ 8, // Bytes per element ++ 2, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { false, false, false, false }, // Is normalized? 
++ { 1.0f, 1.0f, 0, 0 }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // R32G32_USCALED (0x96) ++ { ++ "R32G32_USCALED", ++ { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 0, 1, 0, 0 }, // Swizzle ++ { 32, 32, 0, 0 }, // Bits per component ++ 64, // Bits per element ++ 8, // Bytes per element ++ 2, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { false, false, false, false }, // Is normalized? ++ { 1.0f, 1.0f, 0, 0 }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // 0x97 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // R32_FLOAT_X8X24_TYPELESS_LD (0x98) ++ { ++ "R32_FLOAT_X8X24_TYPELESS_LD", ++ { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 0, 1, 0, 0 }, // Swizzle ++ { 32, 32, 0, 0 }, // Bits per component ++ 64, // Bits per element ++ 8, // Bytes per element ++ 2, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { false, false, false, false }, // Is normalized? ++ { 1.0f, 1.0f, 0, 0 }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // 0x99 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x9a (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x9b (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x9c (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x9d (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x9e (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x9f (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0xa0 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, 
SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0xa1 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0xa2 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0xa3 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0xa4 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0xa5 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0xa6 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0xa7 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0xa8 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0xa9 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0xaa (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0xab (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0xac (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0xad (Padding) ++ { ++ "UNKNOWN", 
++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0xae (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0xaf (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0xb0 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0xb1 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0xb2 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0xb3 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0xb4 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0xb5 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0xb6 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0xb7 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0xb8 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0xb9 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 
0.0f }, ++ 1, 1, }, ++ // 0xba (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0xbb (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0xbc (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0xbd (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0xbe (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0xbf (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // B8G8R8A8_UNORM (0xc0) ++ { ++ "B8G8R8A8_UNORM", ++ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 2, 1, 0, 3 }, // Swizzle ++ { 8, 8, 8, 8 }, // Bits per component ++ 32, // Bits per element ++ 4, // Bytes per element ++ 4, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { true, true, true, true }, // Is normalized? ++ { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // B8G8R8A8_UNORM_SRGB (0xc1) ++ { ++ "B8G8R8A8_UNORM_SRGB", ++ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 2, 1, 0, 3 }, // Swizzle ++ { 8, 8, 8, 8 }, // Bits per component ++ 32, // Bits per element ++ 4, // Bytes per element ++ 4, // Num components ++ true, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { true, true, true, true }, // Is normalized? ++ { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // R10G10B10A2_UNORM (0xc2) ++ { ++ "R10G10B10A2_UNORM", ++ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 0, 1, 2, 3 }, // Swizzle ++ { 10, 10, 10, 2 }, // Bits per component ++ 32, // Bits per element ++ 4, // Bytes per element ++ 4, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { true, true, true, true }, // Is normalized? 
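Each fully commented entry, such as B8G8R8A8_UNORM above, spells out the row layout field by field: name, per-component type, raw defaults for missing components, swizzle, bits per component, bits and bytes per element, component count, the sRGB/BC/subsampled flags, per-component normalization, a to-float scale, and block-compression dimensions. Read in initializer order, those comments imply a row type roughly like the following; the field names are reconstructions from the comments, not the declaration used elsewhere in the patch:

    #include <cstdint>

    // Component types as named in the table; enumerator order is illustrative.
    enum SWR_TYPE_SKETCH {
        SWR_TYPE_UNKNOWN, SWR_TYPE_UNORM, SWR_TYPE_SNORM, SWR_TYPE_UINT,
        SWR_TYPE_SINT, SWR_TYPE_USCALED, SWR_TYPE_SSCALED, SWR_TYPE_FLOAT,
        SWR_TYPE_UNUSED
    };

    // Field order mirrors the initializer order of each table entry.
    struct FormatInfoSketch {
        const char*     name;
        SWR_TYPE_SKETCH type[4];        // per-component type
        uint32_t        defaults[4];    // raw bits substituted for missing components
        uint32_t        swizzle[4];     // routing of stored components to RGBA
        uint32_t        bpc[4];         // bits per component
        uint32_t        bpp;            // bits per element
        uint32_t        Bpp;            // bytes per element
        uint32_t        numComps;       // number of components
        bool            isSRGB;
        bool            isBC;           // block-compressed format?
        bool            isSubsampled;
        bool            isNormalized[4];
        float           toFloat[4];     // scale applied when converting to float
        uint32_t        bcWidth;        // compression block width (1 if uncompressed)
        uint32_t        bcHeight;       // compression block height (1 if uncompressed)
    };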
++ { 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 3.0f }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // R10G10B10A2_UNORM_SRGB (0xc3) ++ { ++ "R10G10B10A2_UNORM_SRGB", ++ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 0, 1, 2, 3 }, // Swizzle ++ { 10, 10, 10, 2 }, // Bits per component ++ 32, // Bits per element ++ 4, // Bytes per element ++ 4, // Num components ++ true, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { true, true, true, true }, // Is normalized? ++ { 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 3.0f }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // R10G10B10A2_UINT (0xc4) ++ { ++ "R10G10B10A2_UINT", ++ { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT }, ++ { 0, 0, 0, 0x1 }, // Defaults for missing components ++ { 0, 1, 2, 3 }, // Swizzle ++ { 10, 10, 10, 2 }, // Bits per component ++ 32, // Bits per element ++ 4, // Bytes per element ++ 4, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { false, false, false, false }, // Is normalized? ++ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // 0xc5 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0xc6 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // R8G8B8A8_UNORM (0xc7) ++ { ++ "R8G8B8A8_UNORM", ++ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 0, 1, 2, 3 }, // Swizzle ++ { 8, 8, 8, 8 }, // Bits per component ++ 32, // Bits per element ++ 4, // Bytes per element ++ 4, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { true, true, true, true }, // Is normalized? ++ { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // R8G8B8A8_UNORM_SRGB (0xc8) ++ { ++ "R8G8B8A8_UNORM_SRGB", ++ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 0, 1, 2, 3 }, // Swizzle ++ { 8, 8, 8, 8 }, // Bits per component ++ 32, // Bits per element ++ 4, // Bytes per element ++ 4, // Num components ++ true, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { true, true, true, true }, // Is normalized? ++ { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // R8G8B8A8_SNORM (0xc9) ++ { ++ "R8G8B8A8_SNORM", ++ { SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 0, 1, 2, 3 }, // Swizzle ++ { 8, 8, 8, 8 }, // Bits per component ++ 32, // Bits per element ++ 4, // Bytes per element ++ 4, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { true, true, true, true }, // Is normalized? 
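The recurring default of 0x3f800000 in the float and normalized entries is the IEEE-754 single-precision bit pattern of 1.0f, so a texture lacking an alpha component samples as fully opaque, while the integer entries use a plain 0x1 for the same purpose. Storing the defaults as raw bits lets one uint32_t field serve both cases; a quick check:

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    int main()
    {
        const uint32_t bits = 0x3f800000u;
        float f;
        std::memcpy(&f, &bits, sizeof f);               // portable bit-cast
        std::printf("0x3f800000 as float = %f\n", f);   // prints 1.000000
        return 0;
    }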
++ { 1.0f / 127.0f, 1.0f / 127.0f, 1.0f / 127.0f, 1.0f / 127.0f }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // R8G8B8A8_SINT (0xca) ++ { ++ "R8G8B8A8_SINT", ++ { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT }, ++ { 0, 0, 0, 0x1 }, // Defaults for missing components ++ { 0, 1, 2, 3 }, // Swizzle ++ { 8, 8, 8, 8 }, // Bits per component ++ 32, // Bits per element ++ 4, // Bytes per element ++ 4, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { false, false, false, false }, // Is normalized? ++ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // R8G8B8A8_UINT (0xcb) ++ { ++ "R8G8B8A8_UINT", ++ { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT }, ++ { 0, 0, 0, 0x1 }, // Defaults for missing components ++ { 0, 1, 2, 3 }, // Swizzle ++ { 8, 8, 8, 8 }, // Bits per component ++ 32, // Bits per element ++ 4, // Bytes per element ++ 4, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { false, false, false, false }, // Is normalized? ++ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // R16G16_UNORM (0xcc) ++ { ++ "R16G16_UNORM", ++ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 0, 1, 0, 0 }, // Swizzle ++ { 16, 16, 0, 0 }, // Bits per component ++ 32, // Bits per element ++ 4, // Bytes per element ++ 2, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { true, true, true, true }, // Is normalized? ++ { 1.0f / 65535.0f, 1.0f / 65535.0f, 0, 0 }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // R16G16_SNORM (0xcd) ++ { ++ "R16G16_SNORM", ++ { SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 0, 1, 0, 0 }, // Swizzle ++ { 16, 16, 0, 0 }, // Bits per component ++ 32, // Bits per element ++ 4, // Bytes per element ++ 2, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { true, true, true, true }, // Is normalized? ++ { 1.0f / 32767.0f, 1.0f / 32767.0f, 0, 0 }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // R16G16_SINT (0xce) ++ { ++ "R16G16_SINT", ++ { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0x1 }, // Defaults for missing components ++ { 0, 1, 0, 0 }, // Swizzle ++ { 16, 16, 0, 0 }, // Bits per component ++ 32, // Bits per element ++ 4, // Bytes per element ++ 2, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { false, false, false, false }, // Is normalized? ++ { 1.0f, 1.0f, 0, 0 }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // R16G16_UINT (0xcf) ++ { ++ "R16G16_UINT", ++ { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0x1 }, // Defaults for missing components ++ { 0, 1, 0, 0 }, // Swizzle ++ { 16, 16, 0, 0 }, // Bits per component ++ 32, // Bits per element ++ 4, // Bytes per element ++ 2, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { false, false, false, false }, // Is normalized? 
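The to-float scale factors in these entries follow mechanically from type and bit width: an n-bit UNORM spans [0, 2^n - 1], giving scales like 1/255 and 1/65535; an n-bit SNORM is scaled by 1/(2^(n-1) - 1), giving 1/127 and 1/32767; and SINT/UINT (like the SCALED types later in the table) convert at face value, so their scale stays 1.0. A sketch of the rule, with a hypothetical enum standing in for the table's component types:

    #include <cassert>

    enum class CompType { Unorm, Snorm, Uint, Sint, Uscaled, Sscaled, Float };

    // Scale that maps a raw n-bit component onto its float value.
    float ToFloatScale(CompType t, unsigned bits)
    {
        switch (t) {
        case CompType::Unorm: return 1.0f / float((1u << bits) - 1u);
        case CompType::Snorm: return 1.0f / float((1u << (bits - 1)) - 1u);
        default:              return 1.0f; // ints, scaled, floats: no renormalization
        }
    }

    int main()
    {
        assert(ToFloatScale(CompType::Unorm, 8)  == 1.0f / 255.0f);
        assert(ToFloatScale(CompType::Snorm, 16) == 1.0f / 32767.0f);
        assert(ToFloatScale(CompType::Unorm, 2)  == 1.0f / 3.0f); // the A2 channel
        return 0;
    }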
++ { 1.0f, 1.0f, 0, 0 }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // R16G16_FLOAT (0xd0) ++ { ++ "R16G16_FLOAT", ++ { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 0, 1, 0, 0 }, // Swizzle ++ { 16, 16, 0, 0 }, // Bits per component ++ 32, // Bits per element ++ 4, // Bytes per element ++ 2, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { false, false, false, false }, // Is normalized? ++ { 1.0f, 1.0f, 0, 0 }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // B10G10R10A2_UNORM (0xd1) ++ { ++ "B10G10R10A2_UNORM", ++ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 2, 1, 0, 3 }, // Swizzle ++ { 10, 10, 10, 2 }, // Bits per component ++ 32, // Bits per element ++ 4, // Bytes per element ++ 4, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { true, true, true, true }, // Is normalized? ++ { 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 3.0f }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // B10G10R10A2_UNORM_SRGB (0xd2) ++ { ++ "B10G10R10A2_UNORM_SRGB", ++ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 2, 1, 0, 3 }, // Swizzle ++ { 10, 10, 10, 2 }, // Bits per component ++ 32, // Bits per element ++ 4, // Bytes per element ++ 4, // Num components ++ true, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { true, true, true, true }, // Is normalized? ++ { 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 3.0f }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // R11G11B10_FLOAT (0xd3) ++ { ++ "R11G11B10_FLOAT", ++ { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 0, 1, 2, 0 }, // Swizzle ++ { 11, 11, 10, 0 }, // Bits per component ++ 32, // Bits per element ++ 4, // Bytes per element ++ 3, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { false, false, false, false }, // Is normalized? ++ { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // 0xd4 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0xd5 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // R32_SINT (0xd6) ++ { ++ "R32_SINT", ++ { SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0x1 }, // Defaults for missing components ++ { 0, 0, 0, 0 }, // Swizzle ++ { 32, 0, 0, 0 }, // Bits per component ++ 32, // Bits per element ++ 4, // Bytes per element ++ 1, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { false, false, false, false }, // Is normalized? 
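The swizzle column routes stored components to logical RGBA channels: entry i names the destination channel of the i-th component in memory. In B10G10R10A2_UNORM above, { 2, 1, 0, 3 } sends the first stored component (blue) to channel 2, green to channel 1, red to channel 0, and alpha to channel 3; the alpha-only formats further down use { 3, 0, 0, 0 } to steer their single component into alpha. That direction of the mapping is inferred from the alpha-only entries, so treat this as an interpretation rather than the driver's code:

    #include <cstdio>

    // Scatter stored components into RGBA according to a table swizzle.
    void ApplySwizzle(const float* stored, unsigned numComps,
                      const unsigned swizzle[4], float rgba[4])
    {
        // Start from neutral defaults: black color, opaque alpha.
        rgba[0] = rgba[1] = rgba[2] = 0.0f;
        rgba[3] = 1.0f;
        for (unsigned i = 0; i < numComps; ++i)
            rgba[swizzle[i]] = stored[i];
    }

    int main()
    {
        const unsigned bgra[4] = { 2, 1, 0, 3 };              // BGRA-style swizzle
        const float stored[4] = { 0.25f, 0.5f, 0.75f, 1.0f }; // B, G, R, A in memory
        float rgba[4];
        ApplySwizzle(stored, 4, bgra, rgba);
        std::printf("R=%g G=%g B=%g A=%g\n", rgba[0], rgba[1], rgba[2], rgba[3]);
        // -> R=0.75 G=0.5 B=0.25 A=1
        return 0;
    }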
++ { 1.0f, 0, 0, 0 }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // R32_UINT (0xd7) ++ { ++ "R32_UINT", ++ { SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0x1 }, // Defaults for missing components ++ { 0, 0, 0, 0 }, // Swizzle ++ { 32, 0, 0, 0 }, // Bits per component ++ 32, // Bits per element ++ 4, // Bytes per element ++ 1, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { false, false, false, false }, // Is normalized? ++ { 1.0f, 0, 0, 0 }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // R32_FLOAT (0xd8) ++ { ++ "R32_FLOAT", ++ { SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 0, 0, 0, 0 }, // Swizzle ++ { 32, 0, 0, 0 }, // Bits per component ++ 32, // Bits per element ++ 4, // Bytes per element ++ 1, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { false, false, false, false }, // Is normalized? ++ { 1.0f, 0, 0, 0 }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // R24_UNORM_X8_TYPELESS (0xd9) ++ { ++ "R24_UNORM_X8_TYPELESS", ++ { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 0, 0, 0, 0 }, // Swizzle ++ { 24, 0, 0, 0 }, // Bits per component ++ 32, // Bits per element ++ 4, // Bytes per element ++ 1, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { true, true, true, true }, // Is normalized? ++ { 1.0f / 16777215.0f, 0, 0, 0 }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // 0xda (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0xdb (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // R24_UNORM_X8_TYPELESS_LD (0xdc) ++ { ++ "R24_UNORM_X8_TYPELESS_LD", ++ { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 0, 0, 0, 0 }, // Swizzle ++ { 24, 0, 0, 0 }, // Bits per component ++ 32, // Bits per element ++ 4, // Bytes per element ++ 1, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { true, true, true, true }, // Is normalized? 
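R24_UNORM_X8_TYPELESS (and the _LD variant alongside it) is the classic 24-bit depth layout: a 24-bit UNORM packed into the low bits of a 32-bit element with 8 unused, typeless bits on top, which is why the entry is a single component with a 1/(2^24 - 1) scale. Unpacking is a mask and a multiply:

    #include <cstdint>
    #include <cstdio>

    // Convert one R24_UNORM_X8 element to a float depth in [0, 1].
    float UnpackZ24(uint32_t element)
    {
        const uint32_t z24 = element & 0x00ffffffu;  // drop the typeless top byte
        return float(z24) * (1.0f / 16777215.0f);    // 1 / (2^24 - 1)
    }

    int main()
    {
        std::printf("%f\n", UnpackZ24(0xff000000u)); // 0.000000: X8 bits ignored
        std::printf("%f\n", UnpackZ24(0x00ffffffu)); // 1.000000: maximum depth
        return 0;
    }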
++ { 1.0f / 16777215.0f, 0, 0, 0 }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // 0xdd (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0xde (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0xdf (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0xe0 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0xe1 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0xe2 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0xe3 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0xe4 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // A32_FLOAT (0xe5) ++ { ++ "A32_FLOAT", ++ { SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 3, 0, 0, 0 }, // Swizzle ++ { 32, 0, 0, 0 }, // Bits per component ++ 32, // Bits per element ++ 4, // Bytes per element ++ 1, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { false, false, false, false }, // Is normalized? 
++ { 1.0f, 0, 0, 0 }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // 0xe6 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0xe7 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0xe8 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // B8G8R8X8_UNORM (0xe9) ++ { ++ "B8G8R8X8_UNORM", ++ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 2, 1, 0, 3 }, // Swizzle ++ { 8, 8, 8, 8 }, // Bits per component ++ 32, // Bits per element ++ 4, // Bytes per element ++ 4, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { true, true, true, true }, // Is normalized? ++ { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // B8G8R8X8_UNORM_SRGB (0xea) ++ { ++ "B8G8R8X8_UNORM_SRGB", ++ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 2, 1, 0, 3 }, // Swizzle ++ { 8, 8, 8, 8 }, // Bits per component ++ 32, // Bits per element ++ 4, // Bytes per element ++ 4, // Num components ++ true, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { true, true, true, true }, // Is normalized? ++ { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // R8G8B8X8_UNORM (0xeb) ++ { ++ "R8G8B8X8_UNORM", ++ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 0, 1, 2, 3 }, // Swizzle ++ { 8, 8, 8, 8 }, // Bits per component ++ 32, // Bits per element ++ 4, // Bytes per element ++ 4, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { true, true, true, true }, // Is normalized? ++ { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // R8G8B8X8_UNORM_SRGB (0xec) ++ { ++ "R8G8B8X8_UNORM_SRGB", ++ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 0, 1, 2, 3 }, // Swizzle ++ { 8, 8, 8, 8 }, // Bits per component ++ 32, // Bits per element ++ 4, // Bytes per element ++ 4, // Num components ++ true, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { true, true, true, true }, // Is normalized? 
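The isSRGB flag marks the *_UNORM_SRGB variants, whose color channels (conventionally not alpha) are stored gamma-encoded and must be linearized after the UNORM decode. For reference, the standard sRGB decode, the textbook formula rather than code from the patch:

    #include <cmath>
    #include <cstdio>

    // Linearize one sRGB-encoded channel value in [0, 1].
    float SrgbToLinear(float c)
    {
        return (c <= 0.04045f) ? c / 12.92f
                               : std::pow((c + 0.055f) / 1.055f, 2.4f);
    }

    int main()
    {
        // Mid-gray in sRGB (128/255) is only about 21.6% linear intensity.
        std::printf("%f\n", SrgbToLinear(128.0f / 255.0f)); // ~0.216
        return 0;
    }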
++ { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // R9G9B9E5_SHAREDEXP (0xed) ++ { ++ "R9G9B9E5_SHAREDEXP", ++ { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT }, ++ { 0, 0, 0, 0x1 }, // Defaults for missing components ++ { 0, 1, 2, 3 }, // Swizzle ++ { 9, 9, 9, 5 }, // Bits per component ++ 32, // Bits per element ++ 4, // Bytes per element ++ 4, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { false, false, false, false }, // Is normalized? ++ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // B10G10R10X2_UNORM (0xee) ++ { ++ "B10G10R10X2_UNORM", ++ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 2, 1, 0, 3 }, // Swizzle ++ { 10, 10, 10, 2 }, // Bits per component ++ 32, // Bits per element ++ 4, // Bytes per element ++ 4, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { true, true, true, true }, // Is normalized? ++ { 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 3.0f }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // 0xef (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0xf0 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0xf1 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0xf2 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // R10G10B10X2_USCALED (0xf3) ++ { ++ "R10G10B10X2_USCALED", ++ { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_UNUSED }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 0, 1, 2, 3 }, // Swizzle ++ { 10, 10, 10, 2 }, // Bits per component ++ 32, // Bits per element ++ 4, // Bytes per element ++ 4, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { false, false, false, false }, // Is normalized? ++ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // R8G8B8A8_SSCALED (0xf4) ++ { ++ "R8G8B8A8_SSCALED", ++ { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 0, 1, 2, 3 }, // Swizzle ++ { 8, 8, 8, 8 }, // Bits per component ++ 32, // Bits per element ++ 4, // Bytes per element ++ 4, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { false, false, false, false }, // Is normalized? 
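R9G9B9E5_SHAREDEXP stands out in the table: it is typed as four UINT fields because the 9+9+9+5 layout is not three independent floats but three mantissas sharing one 5-bit exponent, so a texel has to be decoded as a unit rather than per component. A decode sketch following the standard rgb9e5 definition (exponent bias 15, 9-bit mantissas with no implicit leading 1):

    #include <cmath>
    #include <cstdint>

    // Decode a packed RGB9E5 texel into three linear floats.
    void DecodeRGB9E5(uint32_t packed, float rgb[3])
    {
        const uint32_t rm = (packed >>  0) & 0x1ffu;  // 9-bit red mantissa
        const uint32_t gm = (packed >>  9) & 0x1ffu;  // 9-bit green mantissa
        const uint32_t bm = (packed >> 18) & 0x1ffu;  // 9-bit blue mantissa
        const uint32_t e  = (packed >> 27) & 0x1fu;   // 5-bit shared exponent
        // scale = 2^(e - 15 - 9): bias of 15, 9 fractional mantissa bits
        const float scale = std::ldexp(1.0f, int(e) - 15 - 9);
        rgb[0] = float(rm) * scale;
        rgb[1] = float(gm) * scale;
        rgb[2] = float(bm) * scale;
    }

Encoding is the harder direction, since a single exponent must cover all three channels; only the decode is sketched here.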
++ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // R8G8B8A8_USCALED (0xf5) ++ { ++ "R8G8B8A8_USCALED", ++ { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 0, 1, 2, 3 }, // Swizzle ++ { 8, 8, 8, 8 }, // Bits per component ++ 32, // Bits per element ++ 4, // Bytes per element ++ 4, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { false, false, false, false }, // Is normalized? ++ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // R16G16_SSCALED (0xf6) ++ { ++ "R16G16_SSCALED", ++ { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 0, 1, 0, 0 }, // Swizzle ++ { 16, 16, 0, 0 }, // Bits per component ++ 32, // Bits per element ++ 4, // Bytes per element ++ 2, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { false, false, false, false }, // Is normalized? ++ { 1.0f, 1.0f, 0, 0 }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // R16G16_USCALED (0xf7) ++ { ++ "R16G16_USCALED", ++ { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 0, 1, 0, 0 }, // Swizzle ++ { 16, 16, 0, 0 }, // Bits per component ++ 32, // Bits per element ++ 4, // Bytes per element ++ 2, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { false, false, false, false }, // Is normalized? ++ { 1.0f, 1.0f, 0, 0 }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // R32_SSCALED (0xf8) ++ { ++ "R32_SSCALED", ++ { SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 0, 0, 0, 0 }, // Swizzle ++ { 32, 0, 0, 0 }, // Bits per component ++ 32, // Bits per element ++ 4, // Bytes per element ++ 1, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { false, false, false, false }, // Is normalized? ++ { 1.0f, 0, 0, 0 }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // R32_USCALED (0xf9) ++ { ++ "R32_USCALED", ++ { SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 0, 0, 0, 0 }, // Swizzle ++ { 32, 0, 0, 0 }, // Bits per component ++ 32, // Bits per element ++ 4, // Bytes per element ++ 1, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { false, false, false, false }, // Is normalized? 
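The USCALED/SSCALED entries carry a to-float scale of 1.0 and isNormalized = false because, unlike UNORM/SNORM, a scaled value converts to float at face value: a stored 16-bit 1000 becomes 1000.0f, not 1000/65535. These types are relevant to vertex fetch rather than render targets. The three 16-bit behaviors side by side, with illustrative helper names:

    #include <cstdint>

    float FetchUnorm16(uint16_t v)   { return float(v) / 65535.0f; } // -> [0, 1]
    float FetchUscaled16(uint16_t v) { return float(v); }            // -> [0, 65535]
    float FetchSscaled16(int16_t v)  { return float(v); }            // -> [-32768, 32767]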
++ { 1.0f, 0, 0, 0 }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // 0xfa (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0xfb (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0xfc (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0xfd (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0xfe (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0xff (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // B5G6R5_UNORM (0x100) ++ { ++ "B5G6R5_UNORM", ++ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 2, 1, 0, 0 }, // Swizzle ++ { 5, 6, 5, 0 }, // Bits per component ++ 16, // Bits per element ++ 2, // Bytes per element ++ 3, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { true, true, true, true }, // Is normalized? ++ { 1.0f / 31.0f, 1.0f / 63.0f, 1.0f / 31.0f, 0 }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // B5G6R5_UNORM_SRGB (0x101) ++ { ++ "B5G6R5_UNORM_SRGB", ++ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 2, 1, 0, 0 }, // Swizzle ++ { 5, 6, 5, 0 }, // Bits per component ++ 16, // Bits per element ++ 2, // Bytes per element ++ 3, // Num components ++ true, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { true, true, true, true }, // Is normalized? ++ { 1.0f / 31.0f, 1.0f / 63.0f, 1.0f / 31.0f, 0 }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // B5G5R5A1_UNORM (0x102) ++ { ++ "B5G5R5A1_UNORM", ++ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 2, 1, 0, 3 }, // Swizzle ++ { 5, 5, 5, 1 }, // Bits per component ++ 16, // Bits per element ++ 2, // Bytes per element ++ 4, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { true, true, true, true }, // Is normalized? 
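B5G6R5_UNORM shows how per-component bit widths and scales interact in a packed 16-bit format: green gets 6 bits (scale 1/63) while red and blue get 5 (scale 1/31). A minimal unpack matching those numbers, assuming the conventional little-endian 565 packing with blue in the low bits (consistent with the entry's { 2, 1, 0, 0 } swizzle):

    #include <cstdint>
    #include <cstdio>

    // Unpack a B5G6R5_UNORM texel (blue in the low bits) to float RGB.
    void UnpackB5G6R5(uint16_t p, float rgb[3])
    {
        rgb[2] = float((p >>  0) & 0x1fu) / 31.0f; // blue,  5 bits
        rgb[1] = float((p >>  5) & 0x3fu) / 63.0f; // green, 6 bits
        rgb[0] = float((p >> 11) & 0x1fu) / 31.0f; // red,   5 bits
    }

    int main()
    {
        float rgb[3];
        UnpackB5G6R5(0xffff, rgb);
        std::printf("%g %g %g\n", rgb[0], rgb[1], rgb[2]); // 1 1 1
        return 0;
    }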
++ { 1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 1.0f }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // B5G5R5A1_UNORM_SRGB (0x103) ++ { ++ "B5G5R5A1_UNORM_SRGB", ++ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 2, 1, 0, 3 }, // Swizzle ++ { 5, 5, 5, 1 }, // Bits per component ++ 16, // Bits per element ++ 2, // Bytes per element ++ 4, // Num components ++ true, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { true, true, true, true }, // Is normalized? ++ { 1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 1.0f }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // B4G4R4A4_UNORM (0x104) ++ { ++ "B4G4R4A4_UNORM", ++ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 2, 1, 0, 3 }, // Swizzle ++ { 4, 4, 4, 4 }, // Bits per component ++ 16, // Bits per element ++ 2, // Bytes per element ++ 4, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { true, true, true, true }, // Is normalized? ++ { 1.0f / 15.0f, 1.0f / 15.0f, 1.0f / 15.0f, 1.0f / 15.0f }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // B4G4R4A4_UNORM_SRGB (0x105) ++ { ++ "B4G4R4A4_UNORM_SRGB", ++ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 2, 1, 0, 3 }, // Swizzle ++ { 4, 4, 4, 4 }, // Bits per component ++ 16, // Bits per element ++ 2, // Bytes per element ++ 4, // Num components ++ true, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { true, true, true, true }, // Is normalized? ++ { 1.0f / 15.0f, 1.0f / 15.0f, 1.0f / 15.0f, 1.0f / 15.0f }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // R8G8_UNORM (0x106) ++ { ++ "R8G8_UNORM", ++ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 0, 1, 0, 0 }, // Swizzle ++ { 8, 8, 0, 0 }, // Bits per component ++ 16, // Bits per element ++ 2, // Bytes per element ++ 2, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { true, true, true, true }, // Is normalized? ++ { 1.0f / 255.0f, 1.0f / 255.0f, 0, 0 }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // R8G8_SNORM (0x107) ++ { ++ "R8G8_SNORM", ++ { SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 0, 1, 0, 0 }, // Swizzle ++ { 8, 8, 0, 0 }, // Bits per component ++ 16, // Bits per element ++ 2, // Bytes per element ++ 2, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { true, true, true, true }, // Is normalized? ++ { 1.0f / 127.0f, 1.0f / 127.0f, 0, 0 }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // R8G8_SINT (0x108) ++ { ++ "R8G8_SINT", ++ { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0x1 }, // Defaults for missing components ++ { 0, 1, 0, 0 }, // Swizzle ++ { 8, 8, 0, 0 }, // Bits per component ++ 16, // Bits per element ++ 2, // Bytes per element ++ 2, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { false, false, false, false }, // Is normalized? 
++ { 1.0f, 1.0f, 0, 0 }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // R8G8_UINT (0x109) ++ { ++ "R8G8_UINT", ++ { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0x1 }, // Defaults for missing components ++ { 0, 1, 0, 0 }, // Swizzle ++ { 8, 8, 0, 0 }, // Bits per component ++ 16, // Bits per element ++ 2, // Bytes per element ++ 2, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { false, false, false, false }, // Is normalized? ++ { 1.0f, 1.0f, 0, 0 }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // R16_UNORM (0x10a) ++ { ++ "R16_UNORM", ++ { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 0, 0, 0, 0 }, // Swizzle ++ { 16, 0, 0, 0 }, // Bits per component ++ 16, // Bits per element ++ 2, // Bytes per element ++ 1, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { true, true, true, true }, // Is normalized? ++ { 1.0f / 65535.0f, 0, 0, 0 }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // R16_SNORM (0x10b) ++ { ++ "R16_SNORM", ++ { SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 0, 0, 0, 0 }, // Swizzle ++ { 16, 0, 0, 0 }, // Bits per component ++ 16, // Bits per element ++ 2, // Bytes per element ++ 1, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { true, true, true, true }, // Is normalized? ++ { 1.0f / 32767.0f, 0, 0, 0 }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // R16_SINT (0x10c) ++ { ++ "R16_SINT", ++ { SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0x1 }, // Defaults for missing components ++ { 0, 0, 0, 0 }, // Swizzle ++ { 16, 0, 0, 0 }, // Bits per component ++ 16, // Bits per element ++ 2, // Bytes per element ++ 1, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { false, false, false, false }, // Is normalized? ++ { 1.0f, 0, 0, 0 }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // R16_UINT (0x10d) ++ { ++ "R16_UINT", ++ { SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0x1 }, // Defaults for missing components ++ { 0, 0, 0, 0 }, // Swizzle ++ { 16, 0, 0, 0 }, // Bits per component ++ 16, // Bits per element ++ 2, // Bytes per element ++ 1, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { false, false, false, false }, // Is normalized? ++ { 1.0f, 0, 0, 0 }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // R16_FLOAT (0x10e) ++ { ++ "R16_FLOAT", ++ { SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 0, 0, 0, 0 }, // Swizzle ++ { 16, 0, 0, 0 }, // Bits per component ++ 16, // Bits per element ++ 2, // Bytes per element ++ 1, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { false, false, false, false }, // Is normalized? 
++ { 1.0f, 0, 0, 0 }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // 0x10f (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x110 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x111 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x112 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // A16_UNORM (0x113) ++ { ++ "A16_UNORM", ++ { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 3, 0, 0, 0 }, // Swizzle ++ { 16, 0, 0, 0 }, // Bits per component ++ 16, // Bits per element ++ 2, // Bytes per element ++ 1, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { true, true, true, true }, // Is normalized? ++ { 1.0f / 65535.0f, 0, 0, 0 }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // 0x114 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x115 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x116 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // A16_FLOAT (0x117) ++ { ++ "A16_FLOAT", ++ { SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 3, 0, 0, 0 }, // Swizzle ++ { 16, 0, 0, 0 }, // Bits per component ++ 16, // Bits per element ++ 2, // Bytes per element ++ 1, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { false, false, false, false }, // Is normalized? 
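The alpha-only entries (A16_UNORM and A16_FLOAT here, A8_UNORM and A32_FLOAT elsewhere in the table) are single-component formats whose { 3, 0, 0, 0 } swizzle routes the one stored value into the alpha channel; combined with the zero color defaults, sampling such a texture yields (0, 0, 0, A). A self-contained illustration of that combination (names and values are for demonstration only):

    #include <cstdio>

    int main()
    {
        // Alpha-only swizzle from the table: stored component 0 -> channel 3.
        const unsigned swizzle[4] = { 3, 0, 0, 0 };
        const float storedAlpha = 0.5f;              // the single A16/A8 component
        float rgba[4] = { 0.0f, 0.0f, 0.0f, 1.0f };  // defaults: black, opaque
        rgba[swizzle[0]] = storedAlpha;              // only one stored component
        std::printf("%g %g %g %g\n", rgba[0], rgba[1], rgba[2], rgba[3]);
        // -> 0 0 0 0.5
        return 0;
    }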
++ { 1.0f, 0, 0, 0 }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // 0x118 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x119 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // B5G5R5X1_UNORM (0x11a) ++ { ++ "B5G5R5X1_UNORM", ++ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 2, 1, 0, 3 }, // Swizzle ++ { 5, 5, 5, 1 }, // Bits per component ++ 16, // Bits per element ++ 2, // Bytes per element ++ 4, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { true, true, true, true }, // Is normalized? ++ { 1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 1.0f }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // B5G5R5X1_UNORM_SRGB (0x11b) ++ { ++ "B5G5R5X1_UNORM_SRGB", ++ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 2, 1, 0, 3 }, // Swizzle ++ { 5, 5, 5, 1 }, // Bits per component ++ 16, // Bits per element ++ 2, // Bytes per element ++ 4, // Num components ++ true, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { true, true, true, true }, // Is normalized? ++ { 1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 1.0f }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // R8G8_SSCALED (0x11c) ++ { ++ "R8G8_SSCALED", ++ { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 0, 1, 0, 0 }, // Swizzle ++ { 8, 8, 0, 0 }, // Bits per component ++ 16, // Bits per element ++ 2, // Bytes per element ++ 2, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { false, false, false, false }, // Is normalized? ++ { 1.0f, 1.0f, 0, 0 }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // R8G8_USCALED (0x11d) ++ { ++ "R8G8_USCALED", ++ { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 0, 1, 0, 0 }, // Swizzle ++ { 8, 8, 0, 0 }, // Bits per component ++ 16, // Bits per element ++ 2, // Bytes per element ++ 2, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { false, false, false, false }, // Is normalized? ++ { 1.0f, 1.0f, 0, 0 }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // R16_SSCALED (0x11e) ++ { ++ "R16_SSCALED", ++ { SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 0, 0, 0, 0 }, // Swizzle ++ { 16, 0, 0, 0 }, // Bits per component ++ 16, // Bits per element ++ 2, // Bytes per element ++ 1, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { false, false, false, false }, // Is normalized? 
++ { 1.0f, 0, 0, 0 }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // R16_USCALED (0x11f) ++ { ++ "R16_USCALED", ++ { SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 0, 0, 0, 0 }, // Swizzle ++ { 16, 0, 0, 0 }, // Bits per component ++ 16, // Bits per element ++ 2, // Bytes per element ++ 1, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { false, false, false, false }, // Is normalized? ++ { 1.0f, 0, 0, 0 }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // 0x120 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x121 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x122 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x123 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x124 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x125 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x126 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x127 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x128 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x129 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x12a (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 
0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x12b (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x12c (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x12d (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x12e (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x12f (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x130 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x131 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x132 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x133 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x134 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x135 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x136 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x137 (Padding) ++ { ++ "UNKNOWN", ++ { 
SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x138 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x139 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x13a (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x13b (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x13c (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x13d (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x13e (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x13f (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // R8_UNORM (0x140) ++ { ++ "R8_UNORM", ++ { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 0, 0, 0, 0 }, // Swizzle ++ { 8, 0, 0, 0 }, // Bits per component ++ 8, // Bits per element ++ 1, // Bytes per element ++ 1, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { true, true, true, true }, // Is normalized? ++ { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // R8_SNORM (0x141) ++ { ++ "R8_SNORM", ++ { SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 0, 0, 0, 0 }, // Swizzle ++ { 8, 0, 0, 0 }, // Bits per component ++ 8, // Bits per element ++ 1, // Bytes per element ++ 1, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { true, true, true, true }, // Is normalized? 
++ { 1.0f / 127.0f, 0, 0, 0 }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // R8_SINT (0x142) ++ { ++ "R8_SINT", ++ { SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0x1 }, // Defaults for missing components ++ { 0, 0, 0, 0 }, // Swizzle ++ { 8, 0, 0, 0 }, // Bits per component ++ 8, // Bits per element ++ 1, // Bytes per element ++ 1, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { false, false, false, false }, // Is normalized? ++ { 1.0f, 0, 0, 0 }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // R8_UINT (0x143) ++ { ++ "R8_UINT", ++ { SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0x1 }, // Defaults for missing components ++ { 0, 0, 0, 0 }, // Swizzle ++ { 8, 0, 0, 0 }, // Bits per component ++ 8, // Bits per element ++ 1, // Bytes per element ++ 1, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { false, false, false, false }, // Is normalized? ++ { 1.0f, 0, 0, 0 }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // A8_UNORM (0x144) ++ { ++ "A8_UNORM", ++ { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 3, 0, 0, 0 }, // Swizzle ++ { 8, 0, 0, 0 }, // Bits per component ++ 8, // Bits per element ++ 1, // Bytes per element ++ 1, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { true, true, true, true }, // Is normalized? ++ { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // 0x145 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x146 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x147 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x148 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // R8_SSCALED (0x149) ++ { ++ "R8_SSCALED", ++ { SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 0, 0, 0, 0 }, // Swizzle ++ { 8, 0, 0, 0 }, // Bits per component ++ 8, // Bits per element ++ 1, // Bytes per element ++ 1, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { false, false, false, false }, // Is normalized? 
++ { 1.0f, 0, 0, 0 }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // R8_USCALED (0x14a) ++ { ++ "R8_USCALED", ++ { SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 0, 0, 0, 0 }, // Swizzle ++ { 8, 0, 0, 0 }, // Bits per component ++ 8, // Bits per element ++ 1, // Bytes per element ++ 1, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { false, false, false, false }, // Is normalized? ++ { 1.0f, 0, 0, 0 }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // 0x14b (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x14c (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x14d (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x14e (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x14f (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x150 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x151 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x152 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x153 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x154 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x155 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 
0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x156 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x157 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x158 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x159 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x15a (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x15b (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x15c (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x15d (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x15e (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x15f (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x160 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x161 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x162 (Padding) ++ { ++ "UNKNOWN", ++ { 
SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x163 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x164 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x165 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x166 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x167 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x168 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x169 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x16a (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x16b (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x16c (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x16d (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x16e (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 
0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x16f (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x170 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x171 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x172 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x173 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x174 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x175 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x176 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x177 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x178 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x179 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x17a (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x17b (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, 
false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x17c (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x17d (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x17e (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x17f (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x180 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x181 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x182 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // YCRCB_SWAPUVY (0x183) ++ { ++ "YCRCB_SWAPUVY", ++ { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT }, ++ { 0, 0, 0, 0x1 }, // Defaults for missing components ++ { 0, 1, 2, 3 }, // Swizzle ++ { 8, 8, 8, 8 }, // Bits per component ++ 32, // Bits per element ++ 4, // Bytes per element ++ 4, // Num components ++ false, // isSRGB ++ false, // isBC ++ true, // isSubsampled ++ { false, false, false, false }, // Is normalized? 
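++ // Subsampled 4:2:2 packing: each 32-bit element covers two pixels, expressed below as a 2x1 "block" (bcWidth = 2).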
++ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor ++ 2, // bcWidth ++ 1, // bcHeight ++ }, ++ // 0x184 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x185 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // BC1_UNORM (0x186) ++ { ++ "BC1_UNORM", ++ { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 0, 0, 0, 0 }, // Swizzle ++ { 8, 0, 0, 0 }, // Bits per component ++ 64, // Bits per element ++ 8, // Bytes per element ++ 1, // Num components ++ false, // isSRGB ++ true, // isBC ++ false, // isSubsampled ++ { true, true, true, true }, // Is normalized? ++ { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor ++ 4, // bcWidth ++ 4, // bcHeight ++ }, ++ // BC2_UNORM (0x187) ++ { ++ "BC2_UNORM", ++ { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 0, 0, 0, 0 }, // Swizzle ++ { 8, 0, 0, 0 }, // Bits per component ++ 128, // Bits per element ++ 16, // Bytes per element ++ 1, // Num components ++ false, // isSRGB ++ true, // isBC ++ false, // isSubsampled ++ { true, true, true, true }, // Is normalized? ++ { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor ++ 4, // bcWidth ++ 4, // bcHeight ++ }, ++ // BC3_UNORM (0x188) ++ { ++ "BC3_UNORM", ++ { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 0, 0, 0, 0 }, // Swizzle ++ { 8, 0, 0, 0 }, // Bits per component ++ 128, // Bits per element ++ 16, // Bytes per element ++ 1, // Num components ++ false, // isSRGB ++ true, // isBC ++ false, // isSubsampled ++ { true, true, true, true }, // Is normalized? ++ { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor ++ 4, // bcWidth ++ 4, // bcHeight ++ }, ++ // BC4_UNORM (0x189) ++ { ++ "BC4_UNORM", ++ { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 0, 0, 0, 0 }, // Swizzle ++ { 8, 0, 0, 0 }, // Bits per component ++ 64, // Bits per element ++ 8, // Bytes per element ++ 1, // Num components ++ false, // isSRGB ++ true, // isBC ++ false, // isSubsampled ++ { true, true, true, true }, // Is normalized? ++ { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor ++ 4, // bcWidth ++ 4, // bcHeight ++ }, ++ // BC5_UNORM (0x18a) ++ { ++ "BC5_UNORM", ++ { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 0, 0, 0, 0 }, // Swizzle ++ { 8, 0, 0, 0 }, // Bits per component ++ 128, // Bits per element ++ 16, // Bytes per element ++ 1, // Num components ++ false, // isSRGB ++ true, // isBC ++ false, // isSubsampled ++ { true, true, true, true }, // Is normalized? 
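++ // For block-compressed (BC) formats the per-element fields describe a whole 4x4 texel block, e.g. 16 bytes per BC5 block.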
++ { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor ++ 4, // bcWidth ++ 4, // bcHeight ++ }, ++ // BC1_UNORM_SRGB (0x18b) ++ { ++ "BC1_UNORM_SRGB", ++ { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 0, 0, 0, 0 }, // Swizzle ++ { 8, 0, 0, 0 }, // Bits per component ++ 64, // Bits per element ++ 8, // Bytes per element ++ 1, // Num components ++ true, // isSRGB ++ true, // isBC ++ false, // isSubsampled ++ { true, true, true, true }, // Is normalized? ++ { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor ++ 4, // bcWidth ++ 4, // bcHeight ++ }, ++ // BC2_UNORM_SRGB (0x18c) ++ { ++ "BC2_UNORM_SRGB", ++ { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 0, 0, 0, 0 }, // Swizzle ++ { 8, 0, 0, 0 }, // Bits per component ++ 128, // Bits per element ++ 16, // Bytes per element ++ 1, // Num components ++ true, // isSRGB ++ true, // isBC ++ false, // isSubsampled ++ { true, true, true, true }, // Is normalized? ++ { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor ++ 4, // bcWidth ++ 4, // bcHeight ++ }, ++ // BC3_UNORM_SRGB (0x18d) ++ { ++ "BC3_UNORM_SRGB", ++ { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 0, 0, 0, 0 }, // Swizzle ++ { 8, 0, 0, 0 }, // Bits per component ++ 128, // Bits per element ++ 16, // Bytes per element ++ 1, // Num components ++ true, // isSRGB ++ true, // isBC ++ false, // isSubsampled ++ { true, true, true, true }, // Is normalized? ++ { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor ++ 4, // bcWidth ++ 4, // bcHeight ++ }, ++ // 0x18e (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // YCRCB_SWAPUV (0x18f) ++ { ++ "YCRCB_SWAPUV", ++ { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT }, ++ { 0, 0, 0, 0x1 }, // Defaults for missing components ++ { 0, 1, 2, 3 }, // Swizzle ++ { 8, 8, 8, 8 }, // Bits per component ++ 32, // Bits per element ++ 4, // Bytes per element ++ 4, // Num components ++ false, // isSRGB ++ false, // isBC ++ true, // isSubsampled ++ { false, false, false, false }, // Is normalized? 
++ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor ++ 2, // bcWidth ++ 1, // bcHeight ++ }, ++ // 0x190 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x191 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x192 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // R8G8B8_UNORM (0x193) ++ { ++ "R8G8B8_UNORM", ++ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 0, 1, 2, 0 }, // Swizzle ++ { 8, 8, 8, 0 }, // Bits per component ++ 24, // Bits per element ++ 3, // Bytes per element ++ 3, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { true, true, true, true }, // Is normalized? ++ { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 0 }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // R8G8B8_SNORM (0x194) ++ { ++ "R8G8B8_SNORM", ++ { SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 0, 1, 2, 0 }, // Swizzle ++ { 8, 8, 8, 0 }, // Bits per component ++ 24, // Bits per element ++ 3, // Bytes per element ++ 3, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { true, true, true, true }, // Is normalized? ++ { 1.0f / 127.0f, 1.0f / 127.0f, 1.0f / 127.0f, 0 }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // R8G8B8_SSCALED (0x195) ++ { ++ "R8G8B8_SSCALED", ++ { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 0, 1, 2, 0 }, // Swizzle ++ { 8, 8, 8, 0 }, // Bits per component ++ 24, // Bits per element ++ 3, // Bytes per element ++ 3, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { false, false, false, false }, // Is normalized? ++ { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // R8G8B8_USCALED (0x196) ++ { ++ "R8G8B8_USCALED", ++ { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 0, 1, 2, 0 }, // Swizzle ++ { 8, 8, 8, 0 }, // Bits per component ++ 24, // Bits per element ++ 3, // Bytes per element ++ 3, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { false, false, false, false }, // Is normalized? 
++ { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // 0x197 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x198 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // BC4_SNORM (0x199) ++ { ++ "BC4_SNORM", ++ { SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 0, 0, 0, 0 }, // Swizzle ++ { 8, 0, 0, 0 }, // Bits per component ++ 64, // Bits per element ++ 8, // Bytes per element ++ 1, // Num components ++ false, // isSRGB ++ true, // isBC ++ false, // isSubsampled ++ { true, true, true, true }, // Is normalized? ++ { 1.0f / 127.0f, 0, 0, 0 }, // To float scale factor ++ 4, // bcWidth ++ 4, // bcHeight ++ }, ++ // BC5_SNORM (0x19a) ++ { ++ "BC5_SNORM", ++ { SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 0, 0, 0, 0 }, // Swizzle ++ { 8, 0, 0, 0 }, // Bits per component ++ 128, // Bits per element ++ 16, // Bytes per element ++ 1, // Num components ++ false, // isSRGB ++ true, // isBC ++ false, // isSubsampled ++ { true, true, true, true }, // Is normalized? ++ { 1.0f / 127.0f, 0, 0, 0 }, // To float scale factor ++ 4, // bcWidth ++ 4, // bcHeight ++ }, ++ // R16G16B16_FLOAT (0x19b) ++ { ++ "R16G16B16_FLOAT", ++ { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 0, 1, 2, 0 }, // Swizzle ++ { 16, 16, 16, 0 }, // Bits per component ++ 48, // Bits per element ++ 6, // Bytes per element ++ 3, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { false, false, false, false }, // Is normalized? ++ { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // R16G16B16_UNORM (0x19c) ++ { ++ "R16G16B16_UNORM", ++ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 0, 1, 2, 0 }, // Swizzle ++ { 16, 16, 16, 0 }, // Bits per component ++ 48, // Bits per element ++ 6, // Bytes per element ++ 3, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { true, true, true, true }, // Is normalized? ++ { 1.0f / 65535.0f, 1.0f / 65535.0f, 1.0f / 65535.0f, 0 }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // R16G16B16_SNORM (0x19d) ++ { ++ "R16G16B16_SNORM", ++ { SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 0, 1, 2, 0 }, // Swizzle ++ { 16, 16, 16, 0 }, // Bits per component ++ 48, // Bits per element ++ 6, // Bytes per element ++ 3, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { true, true, true, true }, // Is normalized? 
++ { 1.0f / 32767.0f, 1.0f / 32767.0f, 1.0f / 32767.0f, 0 }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // R16G16B16_SSCALED (0x19e) ++ { ++ "R16G16B16_SSCALED", ++ { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 0, 1, 2, 0 }, // Swizzle ++ { 16, 16, 16, 0 }, // Bits per component ++ 48, // Bits per element ++ 6, // Bytes per element ++ 3, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { false, false, false, false }, // Is normalized? ++ { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // R16G16B16_USCALED (0x19f) ++ { ++ "R16G16B16_USCALED", ++ { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 0, 1, 2, 0 }, // Swizzle ++ { 16, 16, 16, 0 }, // Bits per component ++ 48, // Bits per element ++ 6, // Bytes per element ++ 3, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { false, false, false, false }, // Is normalized? ++ { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // 0x1a0 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x1a1 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // BC7_UNORM (0x1a2) ++ { ++ "BC7_UNORM", ++ { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 0, 0, 0, 0 }, // Swizzle ++ { 8, 0, 0, 0 }, // Bits per component ++ 128, // Bits per element ++ 16, // Bytes per element ++ 1, // Num components ++ false, // isSRGB ++ true, // isBC ++ false, // isSubsampled ++ { true, true, true, true }, // Is normalized? ++ { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor ++ 4, // bcWidth ++ 4, // bcHeight ++ }, ++ // BC7_UNORM_SRGB (0x1a3) ++ { ++ "BC7_UNORM_SRGB", ++ { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 0, 0, 0, 0 }, // Swizzle ++ { 8, 0, 0, 0 }, // Bits per component ++ 128, // Bits per element ++ 16, // Bytes per element ++ 1, // Num components ++ true, // isSRGB ++ true, // isBC ++ false, // isSubsampled ++ { true, true, true, true }, // Is normalized? 
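++ // isSRGB marks gamma-encoded storage; srgb8Table (declared in formats.h below) supplies the unorm8 sRGB -> linear float conversion.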
++ { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor ++ 4, // bcWidth ++ 4, // bcHeight ++ }, ++ // 0x1a4 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x1a5 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x1a6 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x1a7 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // R8G8B8_UNORM_SRGB (0x1a8) ++ { ++ "R8G8B8_UNORM_SRGB", ++ { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 0, 1, 2, 0 }, // Swizzle ++ { 8, 8, 8, 0 }, // Bits per component ++ 24, // Bits per element ++ 3, // Bytes per element ++ 3, // Num components ++ true, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { true, true, true, true }, // Is normalized? ++ { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 0 }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // 0x1a9 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x1aa (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x1ab (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x1ac (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x1ad (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x1ae (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x1af (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, 
SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // R16G16B16_UINT (0x1b0) ++ { ++ "R16G16B16_UINT", ++ { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0x1 }, // Defaults for missing components ++ { 0, 1, 2, 0 }, // Swizzle ++ { 16, 16, 16, 0 }, // Bits per component ++ 48, // Bits per element ++ 6, // Bytes per element ++ 3, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { false, false, false, false }, // Is normalized? ++ { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // R16G16B16_SINT (0x1b1) ++ { ++ "R16G16B16_SINT", ++ { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0x1 }, // Defaults for missing components ++ { 0, 1, 2, 0 }, // Swizzle ++ { 16, 16, 16, 0 }, // Bits per component ++ 48, // Bits per element ++ 6, // Bytes per element ++ 3, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { false, false, false, false }, // Is normalized? ++ { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // 0x1b2 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // R10G10B10A2_SNORM (0x1b3) ++ { ++ "R10G10B10A2_SNORM", ++ { SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 0, 1, 2, 3 }, // Swizzle ++ { 10, 10, 10, 2 }, // Bits per component ++ 32, // Bits per element ++ 4, // Bytes per element ++ 4, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { true, true, true, true }, // Is normalized? ++ { 1.0f / 511.0f, 1.0f / 511.0f, 1.0f / 511.0f, 1.0f / 1.0f }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // R10G10B10A2_USCALED (0x1b4) ++ { ++ "R10G10B10A2_USCALED", ++ { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 0, 1, 2, 3 }, // Swizzle ++ { 10, 10, 10, 2 }, // Bits per component ++ 32, // Bits per element ++ 4, // Bytes per element ++ 4, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { false, false, false, false }, // Is normalized? ++ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // R10G10B10A2_SSCALED (0x1b5) ++ { ++ "R10G10B10A2_SSCALED", ++ { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 0, 1, 2, 3 }, // Swizzle ++ { 10, 10, 10, 2 }, // Bits per component ++ 32, // Bits per element ++ 4, // Bytes per element ++ 4, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { false, false, false, false }, // Is normalized? 
++ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // R10G10B10A2_SINT (0x1b6) ++ { ++ "R10G10B10A2_SINT", ++ { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT }, ++ { 0, 0, 0, 0x1 }, // Defaults for missing components ++ { 0, 1, 2, 3 }, // Swizzle ++ { 10, 10, 10, 2 }, // Bits per component ++ 32, // Bits per element ++ 4, // Bytes per element ++ 4, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { false, false, false, false }, // Is normalized? ++ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // B10G10R10A2_SNORM (0x1b7) ++ { ++ "B10G10R10A2_SNORM", ++ { SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 2, 1, 0, 3 }, // Swizzle ++ { 10, 10, 10, 2 }, // Bits per component ++ 32, // Bits per element ++ 4, // Bytes per element ++ 4, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { true, true, true, true }, // Is normalized? ++ { 1.0f / 511.0f, 1.0f / 511.0f, 1.0f / 511.0f, 1.0f / 1.0f }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // B10G10R10A2_USCALED (0x1b8) ++ { ++ "B10G10R10A2_USCALED", ++ { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 2, 1, 0, 3 }, // Swizzle ++ { 10, 10, 10, 2 }, // Bits per component ++ 32, // Bits per element ++ 4, // Bytes per element ++ 4, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { false, false, false, false }, // Is normalized? ++ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // B10G10R10A2_SSCALED (0x1b9) ++ { ++ "B10G10R10A2_SSCALED", ++ { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED }, ++ { 0, 0, 0, 0x3f800000 }, // Defaults for missing components ++ { 2, 1, 0, 3 }, // Swizzle ++ { 10, 10, 10, 2 }, // Bits per component ++ 32, // Bits per element ++ 4, // Bytes per element ++ 4, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { false, false, false, false }, // Is normalized? ++ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // B10G10R10A2_UINT (0x1ba) ++ { ++ "B10G10R10A2_UINT", ++ { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT }, ++ { 0, 0, 0, 0x1 }, // Defaults for missing components ++ { 2, 1, 0, 3 }, // Swizzle ++ { 10, 10, 10, 2 }, // Bits per component ++ 32, // Bits per element ++ 4, // Bytes per element ++ 4, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { false, false, false, false }, // Is normalized? ++ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // B10G10R10A2_SINT (0x1bb) ++ { ++ "B10G10R10A2_SINT", ++ { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT }, ++ { 0, 0, 0, 0x1 }, // Defaults for missing components ++ { 2, 1, 0, 3 }, // Swizzle ++ { 10, 10, 10, 2 }, // Bits per component ++ 32, // Bits per element ++ 4, // Bytes per element ++ 4, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { false, false, false, false }, // Is normalized? 
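++ // Pure-integer (UINT/SINT) formats pass values through unscaled; note their missing-component default is integer 1 (0x1) rather than float 1.0 (0x3f800000).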
++ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // 0x1bc (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x1bd (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x1be (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x1bf (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x1c0 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x1c1 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x1c2 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x1c3 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x1c4 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x1c5 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x1c6 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // 0x1c7 (Padding) ++ { ++ "UNKNOWN", ++ { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, ++ { false, false, false, false }, ++ { 0.0f, 0.0f, 0.0f, 0.0f }, ++ 1, 1, }, ++ // R8G8B8_UINT (0x1c8) ++ { ++ "R8G8B8_UINT", ++ { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, 
SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0x1 }, // Defaults for missing components ++ { 0, 1, 2, 0 }, // Swizzle ++ { 8, 8, 8, 0 }, // Bits per component ++ 24, // Bits per element ++ 3, // Bytes per element ++ 3, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { false, false, false, false }, // Is normalized? ++ { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++ // R8G8B8_SINT (0x1c9) ++ { ++ "R8G8B8_SINT", ++ { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_UNKNOWN }, ++ { 0, 0, 0, 0x1 }, // Defaults for missing components ++ { 0, 1, 2, 0 }, // Swizzle ++ { 8, 8, 8, 0 }, // Bits per component ++ 24, // Bits per element ++ 3, // Bytes per element ++ 3, // Num components ++ false, // isSRGB ++ false, // isBC ++ false, // isSubsampled ++ { false, false, false, false }, // Is normalized? ++ { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor ++ 1, // bcWidth ++ 1, // bcHeight ++ }, ++}; +diff --git a/src/gallium/drivers/swr/rasterizer/common/formats.h b/src/gallium/drivers/swr/rasterizer/common/formats.h +new file mode 100644 +index 0000000..ff1fdb2 +--- /dev/null ++++ b/src/gallium/drivers/swr/rasterizer/common/formats.h +@@ -0,0 +1,222 @@ ++ ++/**************************************************************************** ++* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. ++* ++* Permission is hereby granted, free of charge, to any person obtaining a ++* copy of this software and associated documentation files (the "Software"), ++* to deal in the Software without restriction, including without limitation ++* the rights to use, copy, modify, merge, publish, distribute, sublicense, ++* and/or sell copies of the Software, and to permit persons to whom the ++* Software is furnished to do so, subject to the following conditions: ++* ++* The above copyright notice and this permission notice (including the next ++* paragraph) shall be included in all copies or substantial portions of the ++* Software. ++* ++* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++* IN THE SOFTWARE. 
++* ++* @file formats.h ++* ++* @brief auto-generated file ++* ++* DO NOT EDIT ++* ++******************************************************************************/ ++ ++#pragma once ++ ++#include "common/os.h" ++ ++////////////////////////////////////////////////////////////////////////// ++/// SWR_TYPE - Format component type ++////////////////////////////////////////////////////////////////////////// ++enum SWR_TYPE ++{ ++ SWR_TYPE_UNKNOWN, ++ SWR_TYPE_UNUSED, ++ SWR_TYPE_UNORM, ++ SWR_TYPE_SNORM, ++ SWR_TYPE_UINT, ++ SWR_TYPE_SINT, ++ SWR_TYPE_FLOAT, ++ SWR_TYPE_SSCALED, ++ SWR_TYPE_USCALED, ++}; ++////////////////////////////////////////////////////////////////////////// ++/// SWR_FORMAT ++////////////////////////////////////////////////////////////////////////// ++enum SWR_FORMAT ++{ ++ R32G32B32A32_FLOAT = 0x0, ++ R32G32B32A32_SINT = 0x1, ++ R32G32B32A32_UINT = 0x2, ++ R32G32B32X32_FLOAT = 0x6, ++ R32G32B32A32_SSCALED = 0x7, ++ R32G32B32A32_USCALED = 0x8, ++ R32G32B32_FLOAT = 0x40, ++ R32G32B32_SINT = 0x41, ++ R32G32B32_UINT = 0x42, ++ R32G32B32_SSCALED = 0x45, ++ R32G32B32_USCALED = 0x46, ++ R16G16B16A16_UNORM = 0x80, ++ R16G16B16A16_SNORM = 0x81, ++ R16G16B16A16_SINT = 0x82, ++ R16G16B16A16_UINT = 0x83, ++ R16G16B16A16_FLOAT = 0x84, ++ R32G32_FLOAT = 0x85, ++ R32G32_SINT = 0x86, ++ R32G32_UINT = 0x87, ++ R32_FLOAT_X8X24_TYPELESS = 0x88, ++ R16G16B16X16_UNORM = 0x8E, ++ R16G16B16X16_FLOAT = 0x8F, ++ R16G16B16A16_SSCALED = 0x93, ++ R16G16B16A16_USCALED = 0x94, ++ R32G32_SSCALED = 0x95, ++ R32G32_USCALED = 0x96, ++ R32_FLOAT_X8X24_TYPELESS_LD = 0x98, ++ B8G8R8A8_UNORM = 0xC0, ++ B8G8R8A8_UNORM_SRGB = 0xC1, ++ R10G10B10A2_UNORM = 0xC2, ++ R10G10B10A2_UNORM_SRGB = 0xC3, ++ R10G10B10A2_UINT = 0xC4, ++ R8G8B8A8_UNORM = 0xC7, ++ R8G8B8A8_UNORM_SRGB = 0xC8, ++ R8G8B8A8_SNORM = 0xC9, ++ R8G8B8A8_SINT = 0xCA, ++ R8G8B8A8_UINT = 0xCB, ++ R16G16_UNORM = 0xCC, ++ R16G16_SNORM = 0xCD, ++ R16G16_SINT = 0xCE, ++ R16G16_UINT = 0xCF, ++ R16G16_FLOAT = 0xD0, ++ B10G10R10A2_UNORM = 0xD1, ++ B10G10R10A2_UNORM_SRGB = 0xD2, ++ R11G11B10_FLOAT = 0xD3, ++ R32_SINT = 0xD6, ++ R32_UINT = 0xD7, ++ R32_FLOAT = 0xD8, ++ R24_UNORM_X8_TYPELESS = 0xD9, ++ R24_UNORM_X8_TYPELESS_LD = 0xDC, ++ A32_FLOAT = 0xE5, ++ B8G8R8X8_UNORM = 0xE9, ++ B8G8R8X8_UNORM_SRGB = 0xEA, ++ R8G8B8X8_UNORM = 0xEB, ++ R8G8B8X8_UNORM_SRGB = 0xEC, ++ R9G9B9E5_SHAREDEXP = 0xED, ++ B10G10R10X2_UNORM = 0xEE, ++ R10G10B10X2_USCALED = 0xF3, ++ R8G8B8A8_SSCALED = 0xF4, ++ R8G8B8A8_USCALED = 0xF5, ++ R16G16_SSCALED = 0xF6, ++ R16G16_USCALED = 0xF7, ++ R32_SSCALED = 0xF8, ++ R32_USCALED = 0xF9, ++ B5G6R5_UNORM = 0x100, ++ B5G6R5_UNORM_SRGB = 0x101, ++ B5G5R5A1_UNORM = 0x102, ++ B5G5R5A1_UNORM_SRGB = 0x103, ++ B4G4R4A4_UNORM = 0x104, ++ B4G4R4A4_UNORM_SRGB = 0x105, ++ R8G8_UNORM = 0x106, ++ R8G8_SNORM = 0x107, ++ R8G8_SINT = 0x108, ++ R8G8_UINT = 0x109, ++ R16_UNORM = 0x10A, ++ R16_SNORM = 0x10B, ++ R16_SINT = 0x10C, ++ R16_UINT = 0x10D, ++ R16_FLOAT = 0x10E, ++ A16_UNORM = 0x113, ++ A16_FLOAT = 0x117, ++ B5G5R5X1_UNORM = 0x11A, ++ B5G5R5X1_UNORM_SRGB = 0x11B, ++ R8G8_SSCALED = 0x11C, ++ R8G8_USCALED = 0x11D, ++ R16_SSCALED = 0x11E, ++ R16_USCALED = 0x11F, ++ R8_UNORM = 0x140, ++ R8_SNORM = 0x141, ++ R8_SINT = 0x142, ++ R8_UINT = 0x143, ++ A8_UNORM = 0x144, ++ R8_SSCALED = 0x149, ++ R8_USCALED = 0x14A, ++ YCRCB_SWAPUVY = 0x183, ++ BC1_UNORM = 0x186, ++ BC2_UNORM = 0x187, ++ BC3_UNORM = 0x188, ++ BC4_UNORM = 0x189, ++ BC5_UNORM = 0x18A, ++ BC1_UNORM_SRGB = 0x18B, ++ BC2_UNORM_SRGB = 0x18C, ++ BC3_UNORM_SRGB = 0x18D, ++ YCRCB_SWAPUV = 0x18F, ++ 
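// Gaps in these enum values mirror the hardware surface-format numbering; gFormatInfo pads the unused slots with "UNKNOWN" entries. ++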
R8G8B8_UNORM = 0x193, ++ R8G8B8_SNORM = 0x194, ++ R8G8B8_SSCALED = 0x195, ++ R8G8B8_USCALED = 0x196, ++ BC4_SNORM = 0x199, ++ BC5_SNORM = 0x19A, ++ R16G16B16_FLOAT = 0x19B, ++ R16G16B16_UNORM = 0x19C, ++ R16G16B16_SNORM = 0x19D, ++ R16G16B16_SSCALED = 0x19E, ++ R16G16B16_USCALED = 0x19F, ++ BC7_UNORM = 0x1A2, ++ BC7_UNORM_SRGB = 0x1A3, ++ R8G8B8_UNORM_SRGB = 0x1A8, ++ R16G16B16_UINT = 0x1B0, ++ R16G16B16_SINT = 0x1B1, ++ R10G10B10A2_SNORM = 0x1B3, ++ R10G10B10A2_USCALED = 0x1B4, ++ R10G10B10A2_SSCALED = 0x1B5, ++ R10G10B10A2_SINT = 0x1B6, ++ B10G10R10A2_SNORM = 0x1B7, ++ B10G10R10A2_USCALED = 0x1B8, ++ B10G10R10A2_SSCALED = 0x1B9, ++ B10G10R10A2_UINT = 0x1BA, ++ B10G10R10A2_SINT = 0x1BB, ++ R8G8B8_UINT = 0x1C8, ++ R8G8B8_SINT = 0x1C9, ++ NUM_SWR_FORMATS = 0x1CA, ++}; ++////////////////////////////////////////////////////////////////////////// ++/// SWR_FORMAT_INFO - Format information ++////////////////////////////////////////////////////////////////////////// ++struct SWR_FORMAT_INFO ++{ ++ const char* name; ++ SWR_TYPE type[4]; ++ uint32_t defaults[4]; ++ uint32_t swizzle[4]; ///< swizzle per component ++ uint32_t bpc[4]; ///< bits per component ++ uint32_t bpp; ///< bits per pixel ++ uint32_t Bpp; ///< bytes per pixel ++ uint32_t numComps; ///< number of components ++ bool isSRGB; ++ bool isBC; ++ bool isSubsampled; ++ bool isNormalized[4]; ++ float toFloat[4]; ++ uint32_t bcWidth; ++ uint32_t bcHeight; ++}; ++ ++extern const SWR_FORMAT_INFO gFormatInfo[]; ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief Retrieves format info struct for given format. ++/// @param format - SWR format ++INLINE const SWR_FORMAT_INFO& GetFormatInfo(SWR_FORMAT format) ++{ ++ return gFormatInfo[format]; ++} ++ ++// lookup table for unorm8 srgb -> float conversion ++extern const uint32_t srgb8Table[256]; +diff --git a/src/gallium/drivers/swr/rasterizer/common/isa.hpp b/src/gallium/drivers/swr/rasterizer/common/isa.hpp +new file mode 100644 +index 0000000..ef38179 +--- /dev/null ++++ b/src/gallium/drivers/swr/rasterizer/common/isa.hpp +@@ -0,0 +1,235 @@ ++/**************************************************************************** ++* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. ++* ++* Permission is hereby granted, free of charge, to any person obtaining a ++* copy of this software and associated documentation files (the "Software"), ++* to deal in the Software without restriction, including without limitation ++* the rights to use, copy, modify, merge, publish, distribute, sublicense, ++* and/or sell copies of the Software, and to permit persons to whom the ++* Software is furnished to do so, subject to the following conditions: ++* ++* The above copyright notice and this permission notice (including the next ++* paragraph) shall be included in all copies or substantial portions of the ++* Software. ++* ++* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++* IN THE SOFTWARE. 
++****************************************************************************/ ++ ++#pragma once ++ ++#include <iostream> ++#include <vector> ++#include <bitset> ++#include <array> ++#include <string> ++#include <algorithm> ++ ++#if defined(_WIN32) ++#include <intrin.h> ++#else ++#include <string.h> ++#include <cpuid.h> ++#endif ++ ++class InstructionSet ++{ ++public: ++ InstructionSet() : CPU_Rep() {}; ++ ++ // getters ++ std::string Vendor(void) { return CPU_Rep.vendor_; } ++ std::string Brand(void) { return CPU_Rep.brand_; } ++ ++ bool SSE3(void) { return CPU_Rep.f_1_ECX_[0]; } ++ bool PCLMULQDQ(void) { return CPU_Rep.f_1_ECX_[1]; } ++ bool MONITOR(void) { return CPU_Rep.f_1_ECX_[3]; } ++ bool SSSE3(void) { return CPU_Rep.f_1_ECX_[9]; } ++ bool FMA(void) { return CPU_Rep.f_1_ECX_[12]; } ++ bool CMPXCHG16B(void) { return CPU_Rep.f_1_ECX_[13]; } ++ bool SSE41(void) { return CPU_Rep.f_1_ECX_[19]; } ++ bool SSE42(void) { return CPU_Rep.f_1_ECX_[20]; } ++ bool MOVBE(void) { return CPU_Rep.f_1_ECX_[22]; } ++ bool POPCNT(void) { return CPU_Rep.f_1_ECX_[23]; } ++ bool AES(void) { return CPU_Rep.f_1_ECX_[25]; } ++ bool XSAVE(void) { return CPU_Rep.f_1_ECX_[26]; } ++ bool OSXSAVE(void) { return CPU_Rep.f_1_ECX_[27]; } ++ bool RDRAND(void) { return CPU_Rep.f_1_ECX_[30]; } ++ ++ bool MSR(void) { return CPU_Rep.f_1_EDX_[5]; } ++ bool CX8(void) { return CPU_Rep.f_1_EDX_[8]; } ++ bool SEP(void) { return CPU_Rep.f_1_EDX_[11]; } ++ bool CMOV(void) { return CPU_Rep.f_1_EDX_[15]; } ++ bool CLFSH(void) { return CPU_Rep.f_1_EDX_[19]; } ++ bool MMX(void) { return CPU_Rep.f_1_EDX_[23]; } ++ bool FXSR(void) { return CPU_Rep.f_1_EDX_[24]; } ++ bool SSE(void) { return CPU_Rep.f_1_EDX_[25]; } ++ bool SSE2(void) { return CPU_Rep.f_1_EDX_[26]; } ++ ++ bool FSGSBASE(void) { return CPU_Rep.f_7_EBX_[0]; } ++ bool BMI1(void) { return CPU_Rep.f_7_EBX_[3]; } ++ bool HLE(void) { return CPU_Rep.isIntel_ && CPU_Rep.f_7_EBX_[4]; } ++ bool BMI2(void) { return CPU_Rep.f_7_EBX_[8]; } ++ bool ERMS(void) { return CPU_Rep.f_7_EBX_[9]; } ++ bool INVPCID(void) { return CPU_Rep.f_7_EBX_[10]; } ++ bool RTM(void) { return CPU_Rep.isIntel_ && CPU_Rep.f_7_EBX_[11]; } ++ bool RDSEED(void) { return CPU_Rep.f_7_EBX_[18]; } ++ bool ADX(void) { return CPU_Rep.f_7_EBX_[19]; } ++ bool SHA(void) { return CPU_Rep.f_7_EBX_[29]; } ++ ++ bool PREFETCHWT1(void) { return CPU_Rep.f_7_ECX_[0]; } ++ ++ bool LAHF(void) { return CPU_Rep.f_81_ECX_[0]; } ++ bool LZCNT(void) { return CPU_Rep.isIntel_ && CPU_Rep.f_81_ECX_[5]; } ++ bool ABM(void) { return CPU_Rep.isAMD_ && CPU_Rep.f_81_ECX_[5]; } ++ bool SSE4a(void) { return CPU_Rep.isAMD_ && CPU_Rep.f_81_ECX_[6]; } ++ bool XOP(void) { return CPU_Rep.isAMD_ && CPU_Rep.f_81_ECX_[11]; } ++ bool TBM(void) { return CPU_Rep.isAMD_ && CPU_Rep.f_81_ECX_[21]; } ++ ++ bool SYSCALL(void) { return CPU_Rep.isIntel_ && CPU_Rep.f_81_EDX_[11]; } ++ bool MMXEXT(void) { return CPU_Rep.isAMD_ && CPU_Rep.f_81_EDX_[22]; } ++ bool RDTSCP(void) { return CPU_Rep.isIntel_ && CPU_Rep.f_81_EDX_[27]; } ++ bool _3DNOWEXT(void) { return CPU_Rep.isAMD_ && CPU_Rep.f_81_EDX_[30]; } ++ bool _3DNOW(void) { return CPU_Rep.isAMD_ && CPU_Rep.f_81_EDX_[31]; } ++ ++ bool AVX(void) { return CPU_Rep.f_1_ECX_[28]; } ++ bool F16C(void) { return CPU_Rep.f_1_ECX_[29]; } ++ bool AVX2(void) { return CPU_Rep.f_7_EBX_[5]; } ++ bool AVX512F(void) { return CPU_Rep.f_7_EBX_[16]; } ++ bool AVX512PF(void) { return CPU_Rep.f_7_EBX_[26]; } ++ bool AVX512ER(void) { return CPU_Rep.f_7_EBX_[27]; } ++ bool AVX512CD(void) { return CPU_Rep.f_7_EBX_[28]; } ++ ++private: ++ class InstructionSet_Internal ++ { ++ public: ++ InstructionSet_Internal() ++ : nIds_{ 0
}, ++ nExIds_{ 0 }, ++ isIntel_{ false }, ++ isAMD_{ false }, ++ f_1_ECX_{ 0 }, ++ f_1_EDX_{ 0 }, ++ f_7_EBX_{ 0 }, ++ f_7_ECX_{ 0 }, ++ f_81_ECX_{ 0 }, ++ f_81_EDX_{ 0 }, ++ data_{}, ++ extdata_{} ++ { ++ //int cpuInfo[4] = {-1}; ++ std::array cpui; ++ ++ // Calling __cpuid with 0x0 as the function_id argument ++ // gets the number of the highest valid function ID. ++#if defined(_WIN32) ++ __cpuid(cpui.data(), 0); ++ nIds_ = cpui[0]; ++#else ++ nIds_ = __get_cpuid_max(0, NULL); ++#endif ++ ++ for (int i = 0; i <= nIds_; ++i) ++ { ++#if defined(_WIN32) ++ __cpuidex(cpui.data(), i, 0); ++#else ++ int *data = cpui.data(); ++ __cpuid_count(i, 0, data[0], data[1], data[2], data[3]); ++#endif ++ data_.push_back(cpui); ++ } ++ ++ // Capture vendor string ++ char vendor[0x20]; ++ memset(vendor, 0, sizeof(vendor)); ++ *reinterpret_cast(vendor) = data_[0][1]; ++ *reinterpret_cast(vendor + 4) = data_[0][3]; ++ *reinterpret_cast(vendor + 8) = data_[0][2]; ++ vendor_ = vendor; ++ if (vendor_ == "GenuineIntel") ++ { ++ isIntel_ = true; ++ } ++ else if (vendor_ == "AuthenticAMD") ++ { ++ isAMD_ = true; ++ } ++ ++ // load bitset with flags for function 0x00000001 ++ if (nIds_ >= 1) ++ { ++ f_1_ECX_ = data_[1][2]; ++ f_1_EDX_ = data_[1][3]; ++ } ++ ++ // load bitset with flags for function 0x00000007 ++ if (nIds_ >= 7) ++ { ++ f_7_EBX_ = data_[7][1]; ++ f_7_ECX_ = data_[7][2]; ++ } ++ ++ // Calling __cpuid with 0x80000000 as the function_id argument ++ // gets the number of the highest valid extended ID. ++#if defined(_WIN32) ++ __cpuid(cpui.data(), 0x80000000); ++ nExIds_ = cpui[0]; ++#else ++ nExIds_ = __get_cpuid_max(0x80000000, NULL); ++#endif ++ ++ char brand[0x40]; ++ memset(brand, 0, sizeof(brand)); ++ ++ for (unsigned i = 0x80000000; i <= nExIds_; ++i) ++ { ++#if defined(_WIN32) ++ __cpuidex(cpui.data(), i, 0); ++#else ++ int *data = cpui.data(); ++ __cpuid_count(i, 0, data[0], data[1], data[2], data[3]); ++#endif ++ extdata_.push_back(cpui); ++ } ++ ++ // load bitset with flags for function 0x80000001 ++ if (nExIds_ >= 0x80000001) ++ { ++ f_81_ECX_ = extdata_[1][2]; ++ f_81_EDX_ = extdata_[1][3]; ++ } ++ ++ // Interpret CPU brand string if reported ++ if (nExIds_ >= 0x80000004) ++ { ++ memcpy(brand, extdata_[2].data(), sizeof(cpui)); ++ memcpy(brand + 16, extdata_[3].data(), sizeof(cpui)); ++ memcpy(brand + 32, extdata_[4].data(), sizeof(cpui)); ++ brand_ = brand; ++ } ++ }; ++ ++ int nIds_; ++ unsigned nExIds_; ++ std::string vendor_; ++ std::string brand_; ++ bool isIntel_; ++ bool isAMD_; ++ std::bitset<32> f_1_ECX_; ++ std::bitset<32> f_1_EDX_; ++ std::bitset<32> f_7_EBX_; ++ std::bitset<32> f_7_ECX_; ++ std::bitset<32> f_81_ECX_; ++ std::bitset<32> f_81_EDX_; ++ std::vector> data_; ++ std::vector> extdata_; ++ }; ++ const InstructionSet_Internal CPU_Rep; ++}; +diff --git a/src/gallium/drivers/swr/rasterizer/common/os.h b/src/gallium/drivers/swr/rasterizer/common/os.h +new file mode 100644 +index 0000000..d7def2b +--- /dev/null ++++ b/src/gallium/drivers/swr/rasterizer/common/os.h +@@ -0,0 +1,194 @@ ++/**************************************************************************** ++* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
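
The InstructionSet helper above runs all of its CPUID queries once, in the constructor, so every feature accessor afterwards is a cheap bitset lookup. A minimal sketch of how a loader could use it to refuse to run an AVX2 (KNOB_ARCH_AVX2) build on unsupported hardware; the main() below is illustrative and not part of the patch:

    #include <cstdio>
    #include <cstdlib>

    int main()
    {
        InstructionSet isa;

        std::printf("CPU: %s (%s)\n", isa.Brand().c_str(), isa.Vendor().c_str());

        // Assumption: a -march=core-avx2 build also relies on FMA and BMI2;
        // that follows from the build flag, not from this header itself.
        if (!(isa.AVX2() && isa.FMA() && isa.BMI2()))
        {
            std::fprintf(stderr, "This build requires AVX2/FMA/BMI2 support\n");
            return EXIT_FAILURE;
        }
        return EXIT_SUCCESS;
    }
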
++*
++* Permission is hereby granted, free of charge, to any person obtaining a
++* copy of this software and associated documentation files (the "Software"),
++* to deal in the Software without restriction, including without limitation
++* the rights to use, copy, modify, merge, publish, distribute, sublicense,
++* and/or sell copies of the Software, and to permit persons to whom the
++* Software is furnished to do so, subject to the following conditions:
++*
++* The above copyright notice and this permission notice (including the next
++* paragraph) shall be included in all copies or substantial portions of the
++* Software.
++*
++* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
++* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++* IN THE SOFTWARE.
++****************************************************************************/
++
++#ifndef __SWR_OS_H__
++#define __SWR_OS_H__
++
++#include "core/knobs.h"
++
++#if (defined(FORCE_WINDOWS) || defined(_WIN32)) && !defined(FORCE_LINUX)
++
++#define SWR_API __cdecl
++
++#ifndef _CRT_SECURE_NO_WARNINGS
++#define _CRT_SECURE_NO_WARNINGS
++#endif
++
++#ifndef NOMINMAX
++#define NOMINMAX
++#endif
++#include "Windows.h"
++#include <intrin.h>
++#include <cstdint>
++
++#define OSALIGN(RWORD, WIDTH) __declspec(align(WIDTH)) RWORD
++#define THREAD __declspec(thread)
++#define INLINE __forceinline
++#define DEBUGBREAK __debugbreak()
++
++#define PRAGMA_WARNING_PUSH_DISABLE(...) \
++    __pragma(warning(push));\
++    __pragma(warning(disable:__VA_ARGS__));
++
++#define PRAGMA_WARNING_POP() __pragma(warning(pop))
++
++#if defined(_WIN32)
++#if defined(_WIN64)
++#define BitScanForwardSizeT BitScanForward64
++#define _mm_popcount_sizeT _mm_popcnt_u64
++#else
++#define BitScanForwardSizeT BitScanForward
++#define _mm_popcount_sizeT _mm_popcnt_u32
++#endif
++#endif
++
++#elif defined(FORCE_LINUX) || defined(__linux__) || defined(__gnu_linux__)
++
++#define SWR_API
++
++#include <stdlib.h>
++#include <string.h>
++#include <X11/Xmd.h>
++#include <x86intrin.h>
++#include <stdint.h>
++
++typedef void VOID;
++typedef void* LPVOID;
++typedef CARD8 BOOL;
++typedef wchar_t WCHAR;
++typedef uint16_t UINT16;
++typedef int INT;
++typedef int INT32;
++typedef unsigned int UINT;
++typedef uint32_t UINT32;
++typedef uint64_t UINT64;
++typedef int64_t INT64;
++typedef void* HANDLE;
++typedef float FLOAT;
++typedef int LONG;
++typedef CARD8 BYTE;
++typedef unsigned char UCHAR;
++typedef unsigned int DWORD;
++
++#undef FALSE
++#define FALSE 0
++
++#undef TRUE
++#define TRUE 1
++
++#define OSALIGN(RWORD, WIDTH) RWORD __attribute__((aligned(WIDTH)))
++#define THREAD __thread
++#ifndef INLINE
++#define INLINE __inline
++#endif
++#define DEBUGBREAK asm ("int $3")
++#define __cdecl
++#define __declspec(X)
++
++#define GCC_VERSION (__GNUC__ * 10000 \
++                     + __GNUC_MINOR__ * 100 \
++                     + __GNUC_PATCHLEVEL__)
++
++#if !defined(__clang__) && (__GNUC__) && (GCC_VERSION < 40500)
++inline
++uint64_t __rdtsc()
++{
++    long low, high;
++    asm volatile("rdtsc" : "=a"(low), "=d"(high));
++    return (low | ((uint64_t)high << 32));
++}
++#endif
++
++// Intrinsic not defined in gcc
++static INLINE
++void _mm256_storeu2_m128i(__m128i *hi, __m128i *lo, __m256i a)
++{
++    _mm_storeu_si128((__m128i*)lo, _mm256_castsi256_si128(a));
++    _mm_storeu_si128((__m128i*)hi, _mm256_extractf128_si256(a, 0x1));
++}
++
++inline
++unsigned char _BitScanForward(unsigned int *Index, unsigned int Mask)
++{
++    *Index = __builtin_ctz(Mask);
++    return (Mask != 0);
++}
++
++inline
++unsigned char _BitScanReverse(unsigned int *Index, unsigned int Mask)
++{
++    // __builtin_clz counts from the MSB; convert to an LSB-based bit index
++    // so the result matches the Windows _BitScanReverse semantics.
++    *Index = 31 - __builtin_clz(Mask);
++    return (Mask != 0);
++}
++
++inline
++void *_aligned_malloc(unsigned int size, unsigned int alignment)
++{
++    void *ret;
++    if (posix_memalign(&ret, alignment, size))
++    {
++        return NULL;
++    }
++    return ret;
++}
++
++inline
++unsigned char _bittest(const LONG *a, LONG b)
++{
++    return ((*(unsigned *)(a) & (1 << b)) != 0);
++}
++
++#if defined(_WIN32)
++static inline
++unsigned int _mm_popcnt_u32(unsigned int v)
++{
++    return __builtin_popcount(v);
++}
++#endif
++
++#define _aligned_free free
++#define InterlockedCompareExchange(Dest, Exchange, Comparand) __sync_val_compare_and_swap(Dest, Comparand, Exchange)
++#define InterlockedExchangeAdd(Addend, Value) __sync_fetch_and_add(Addend, Value)
++#define InterlockedDecrement(Append) __sync_sub_and_fetch(Append, 1)
++#define _ReadWriteBarrier() asm volatile("" ::: "memory")
++#define __stdcall
++
++#define PRAGMA_WARNING_PUSH_DISABLE(...)
++#define PRAGMA_WARNING_POP()
++
++#else
++
++#error Unsupported OS/system.
++
++#endif
++
++#define OSALIGNLINE(RWORD) OSALIGN(RWORD, 64)
++#if KNOB_SIMD_WIDTH == 8
++#define OSALIGNSIMD(RWORD) OSALIGN(RWORD, 32)
++#else
++#error Unknown SIMD width!
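
os.h gives the rasterizer one spelling for alignment, thread-local storage and bit-scan primitives on both platforms. A small usage sketch (TileScratch, gScratch and visitSetBits are illustrative names, not from the patch); the `while (_BitScanForward(...))` idiom below matches how the emulated gather in simdintrin.h drives the same shim:

    // A 32-byte-aligned type and a per-thread pointer, written once for both OSes.
    OSALIGNSIMD(struct) TileScratch
    {
        float row[KNOB_SIMD_WIDTH];   // one SIMD register worth of data
    };

    THREAD TileScratch* gScratch = nullptr;

    // Iterate the set bits of a coverage mask; the GCC shim above mirrors the
    // MSVC intrinsic's "returns nonzero while bits remain" contract.
    INLINE void visitSetBits(unsigned int mask)
    {
        unsigned int index;
        while (_BitScanForward(&index, mask))
        {
            mask &= ~(1u << index);
            // ... process lane 'index' ...
        }
    }
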
++#endif ++ ++#include "common/swr_assert.h" ++ ++#endif//__SWR_OS_H__ +diff --git a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.cpp b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.cpp +new file mode 100644 +index 0000000..469302b +--- /dev/null ++++ b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.cpp +@@ -0,0 +1,176 @@ ++/**************************************************************************** ++* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. ++* ++* Permission is hereby granted, free of charge, to any person obtaining a ++* copy of this software and associated documentation files (the "Software"), ++* to deal in the Software without restriction, including without limitation ++* the rights to use, copy, modify, merge, publish, distribute, sublicense, ++* and/or sell copies of the Software, and to permit persons to whom the ++* Software is furnished to do so, subject to the following conditions: ++* ++* The above copyright notice and this permission notice (including the next ++* paragraph) shall be included in all copies or substantial portions of the ++* Software. ++* ++* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++* IN THE SOFTWARE. ++* ++* @file rdtsc_buckets.cpp ++* ++* @brief implementation of rdtsc buckets. ++* ++* Notes: ++* ++******************************************************************************/ ++#include "rdtsc_buckets.h" ++#include ++ ++THREAD UINT tlsThreadId = 0; ++ ++void BucketManager::RegisterThread(const std::string& name) ++{ ++ BUCKET_THREAD newThread; ++ newThread.name = name; ++ newThread.root.children.reserve(mBuckets.size()); ++ newThread.root.id = 0; ++ newThread.root.pParent = nullptr; ++ newThread.pCurrent = &newThread.root; ++ ++ mThreadMutex.lock(); ++ ++ // assign unique thread id for this thread ++ size_t id = mThreads.size(); ++ newThread.id = (UINT)id; ++ tlsThreadId = (UINT)id; ++ ++ // open threadviz file if enabled ++ if (mThreadViz) ++ { ++ char fileName[255]; ++ sprintf(fileName, "threadviz_thread.%d.dat", newThread.id); ++ newThread.vizFile = fopen(fileName, "wb"); ++ } ++ ++ // store new thread ++ mThreads.push_back(newThread); ++ ++ mThreadMutex.unlock(); ++} ++ ++UINT BucketManager::RegisterBucket(const BUCKET_DESC& desc) ++{ ++ size_t id = mBuckets.size(); ++ mBuckets.push_back(desc); ++ return (UINT)id; ++} ++ ++void BucketManager::PrintBucket(FILE* f, UINT level, UINT64 threadCycles, UINT64 parentCycles, const BUCKET& bucket) ++{ ++ const char *arrows[] = { ++ "", ++ "|-> ", ++ " |-> ", ++ " |-> ", ++ " |-> ", ++ " |-> ", ++ " |-> " ++ }; ++ ++ // compute percent of total cycles used by this bucket ++ float percentTotal = (float)((double)bucket.elapsed / (double)threadCycles * 100.0); ++ ++ // compute percent of parent cycles used by this bucket ++ float percentParent = (float)((double)bucket.elapsed / (double)parentCycles * 100.0); ++ ++ // compute average cycle count per invocation ++ UINT64 CPE = bucket.elapsed / bucket.count; ++ ++ BUCKET_DESC &desc = mBuckets[bucket.id]; ++ ++ // construct hierarchy visualization ++ char hier[80]; ++ strcpy(hier, arrows[level]); ++ 
strcat(hier, desc.name.c_str()); ++ ++ // print out ++ fprintf(f, "%6.2f %6.2f %-10" PRIu64 " %-10" PRIu64 " %-10u %-10lu %-10u %s\n", percentTotal, percentParent, bucket.elapsed, CPE, bucket.count, (unsigned long)0, (UINT32)(0), hier); ++ ++ // dump all children of this bucket ++ for (const BUCKET& child : bucket.children) ++ { ++ if (child.count) ++ { ++ PrintBucket(f, level + 1, threadCycles, bucket.elapsed, child); ++ } ++ } ++} ++ ++void BucketManager::PrintThread(FILE* f, const BUCKET_THREAD& thread) ++{ ++ // print header ++ fprintf(f, "\nThread %u (%s)\n", thread.id, thread.name.c_str()); ++ fprintf(f, " %%Tot %%Par Cycles CPE NumEvent CPE2 NumEvent2 Bucket\n"); ++ ++ // compute thread level total cycle counts across all buckets from root ++ const BUCKET& root = thread.root; ++ UINT64 totalCycles = 0; ++ for (const BUCKET& child : root.children) ++ { ++ totalCycles += child.elapsed; ++ } ++ ++ for (const BUCKET& child : root.children) ++ { ++ if (child.count) ++ { ++ PrintBucket(f, 0, totalCycles, totalCycles, child); ++ } ++ } ++} ++ ++void BucketManager::DumpThreadViz() ++{ ++ // ensure all thread data is flushed ++ mThreadMutex.lock(); ++ for (auto& thread : mThreads) ++ { ++ fflush(thread.vizFile); ++ fclose(thread.vizFile); ++ } ++ mThreadMutex.unlock(); ++ ++ // dump bucket descriptions ++ FILE* f = fopen("threadviz_buckets.dat", "wb"); ++ for (auto& bucket : mBuckets) ++ { ++ Serialize(f, bucket); ++ } ++ fclose(f); ++} ++ ++void BucketManager::PrintReport(const std::string& filename) ++{ ++ if (mThreadViz) ++ { ++ DumpThreadViz(); ++ } ++ else ++ { ++ FILE* f = fopen(filename.c_str(), "w"); ++ ++ mThreadMutex.lock(); ++ for (const BUCKET_THREAD& thread : mThreads) ++ { ++ PrintThread(f, thread); ++ fprintf(f, "\n"); ++ } ++ mThreadMutex.unlock(); ++ ++ fclose(f); ++ } ++} +diff --git a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.h b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.h +new file mode 100644 +index 0000000..03530f5 +--- /dev/null ++++ b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.h +@@ -0,0 +1,195 @@ ++/**************************************************************************** ++* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. ++* ++* Permission is hereby granted, free of charge, to any person obtaining a ++* copy of this software and associated documentation files (the "Software"), ++* to deal in the Software without restriction, including without limitation ++* the rights to use, copy, modify, merge, publish, distribute, sublicense, ++* and/or sell copies of the Software, and to permit persons to whom the ++* Software is furnished to do so, subject to the following conditions: ++* ++* The above copyright notice and this permission notice (including the next ++* paragraph) shall be included in all copies or substantial portions of the ++* Software. ++* ++* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++* IN THE SOFTWARE. ++* ++* @file rdtsc_buckets.h ++* ++* @brief declaration for rdtsc buckets. 
++*
++* Notes:
++*
++******************************************************************************/
++#pragma once
++
++#include "os.h"
++#include <vector>
++#include <mutex>
++
++#include "rdtsc_buckets_shared.h"
++
++// unique thread id stored in thread local storage
++extern THREAD UINT tlsThreadId;
++
++//////////////////////////////////////////////////////////////////////////
++/// @brief BucketManager encapsulates a single instance of the buckets
++///        functionality. There can be one or many bucket managers active
++///        at any time. The manager owns all the threads and
++///        bucket information that have been registered to it.
++class BucketManager
++{
++public:
++    BucketManager(bool enableThreadViz) : mThreadViz(enableThreadViz) {}
++
++    // removes all registered thread data
++    void ClearThreads()
++    {
++        mThreadMutex.lock();
++        mThreads.clear();
++        mThreadMutex.unlock();
++    }
++
++    // removes all registered buckets
++    void ClearBuckets()
++    {
++        mBuckets.clear();
++    }
++
++    /// Registers a new thread with the manager.
++    /// @param name - name of thread, used for labels in reports and threadviz
++    void RegisterThread(const std::string& name);
++
++    /// Registers a new bucket type with the manager. Returns a unique
++    /// id which should be used in subsequent calls to start/stop the bucket
++    /// @param desc - description of the bucket
++    /// @return unique id
++    UINT RegisterBucket(const BUCKET_DESC& desc);
++
++    // dump threadviz data
++    void DumpThreadViz();
++
++    // print report
++    void PrintReport(const std::string& filename);
++
++    // start capturing
++    INLINE void StartCapture()
++    {
++        mCapturing = true;
++    }
++
++    // stop capturing
++    INLINE void StopCapture()
++    {
++        mCapturing = false;
++
++        // wait for all threads to pop back to root bucket
++        bool stillCapturing = true;
++        while (stillCapturing)
++        {
++            stillCapturing = false;
++            for (const BUCKET_THREAD& t : mThreads)
++            {
++                if (t.pCurrent != &t.root)
++                {
++                    stillCapturing = true;
++                    continue;
++                }
++            }
++        }
++    }
++
++    // start a bucket
++    // @param id generated by RegisterBucket
++    INLINE void StartBucket(UINT id)
++    {
++        if (!mCapturing) return;
++
++        SWR_ASSERT(tlsThreadId < mThreads.size());
++
++        BUCKET_THREAD& bt = mThreads[tlsThreadId];
++
++        // if threadviz is enabled, only need to dump start info to threads viz file
++        if (mThreadViz)
++        {
++            SWR_ASSERT(bt.vizFile != nullptr);
++            if (mBuckets[id].enableThreadViz)
++            {
++                VIZ_START_DATA data{ VIZ_START, id, __rdtsc() };
++                Serialize(bt.vizFile, data);
++            }
++        }
++        else
++        {
++            if (bt.pCurrent->children.size() < mBuckets.size())
++            {
++                bt.pCurrent->children.resize(mBuckets.size());
++            }
++            BUCKET &child = bt.pCurrent->children[id];
++            child.pParent = bt.pCurrent;
++            child.id = id;
++            child.start = __rdtsc();
++
++            // update thread's currently executing bucket
++            bt.pCurrent = &child;
++        }
++
++        bt.level++;
++    }
++
++    // stop the currently executing bucket
++    INLINE void StopBucket(UINT id)
++    {
++        SWR_ASSERT(tlsThreadId < mThreads.size());
++        BUCKET_THREAD &bt = mThreads[tlsThreadId];
++
++        if (bt.level == 0) return;
++
++        if (mThreadViz)
++        {
++            SWR_ASSERT(bt.vizFile != nullptr);
++            if (mBuckets[id].enableThreadViz)
++            {
++                VIZ_STOP_DATA data{ VIZ_STOP, __rdtsc() };
++                Serialize(bt.vizFile, data);
++            }
++        }
++        else
++        {
++            if (bt.pCurrent->start == 0) return;
++            SWR_ASSERT(bt.pCurrent->id == id, "Mismatched buckets detected");
++
++            bt.pCurrent->elapsed += (__rdtsc() - bt.pCurrent->start);
++            bt.pCurrent->count++;
++
++            // pop to parent
++            bt.pCurrent = bt.pCurrent->pParent;
++        }
++
++        bt.level--;
++    }
++
++private:
++    void PrintBucket(FILE* f, UINT level, UINT64 threadCycles, UINT64 parentCycles, const BUCKET& bucket);
++    void PrintThread(FILE* f, const BUCKET_THREAD& thread);
++
++    // list of active threads that have registered with this manager
++    std::vector<BUCKET_THREAD> mThreads;
++
++    // list of buckets registered with this manager
++    std::vector<BUCKET_DESC> mBuckets;
++
++    // is capturing currently enabled
++    volatile bool mCapturing{ false };
++
++    std::mutex mThreadMutex;
++
++    // enable threadviz
++    bool mThreadViz{ false };
++};
+diff --git a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets_shared.h b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets_shared.h
+new file mode 100644
+index 0000000..41c6d5d
+--- /dev/null
++++ b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets_shared.h
+@@ -0,0 +1,167 @@
++/****************************************************************************
++* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
++*
++* Permission is hereby granted, free of charge, to any person obtaining a
++* copy of this software and associated documentation files (the "Software"),
++* to deal in the Software without restriction, including without limitation
++* the rights to use, copy, modify, merge, publish, distribute, sublicense,
++* and/or sell copies of the Software, and to permit persons to whom the
++* Software is furnished to do so, subject to the following conditions:
++*
++* The above copyright notice and this permission notice (including the next
++* paragraph) shall be included in all copies or substantial portions of the
++* Software.
++*
++* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
++* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++* IN THE SOFTWARE.
++*
++* @file rdtsc_buckets_shared.h
++*
++* @brief shared structures for rdtsc buckets.
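
Putting the pieces together: buckets are registered once, each worker registers itself to obtain a tlsThreadId, and Start/StopBucket bracket units of work. In the printed report, %Tot is bucket.elapsed divided by the thread's total cycles, %Par divides by the parent bucket's cycles, and CPE is elapsed/count: a bucket that consumed 500 of 2,000 thread cycles over 10 calls prints 25.00 and a CPE of 50. A hypothetical usage sketch follows; gBucketMgr, ScopedBucket and the two bucket ids are illustrative, while the BucketManager API is as declared above:

    BucketManager gBucketMgr(false);   // threadviz disabled; hierarchy mode

    // Registered once at startup; the returned ids index per-node child arrays.
    static const UINT BUCKET_FE = gBucketMgr.RegisterBucket({ "FrontEnd", "", false, 0 });
    static const UINT BUCKET_BE = gBucketMgr.RegisterBucket({ "BackEnd",  "", false, 0 });

    // RAII wrapper so a Start is never left without its matching Stop.
    struct ScopedBucket
    {
        ScopedBucket(BucketManager& mgr, UINT id) : mMgr(mgr), mId(id) { mMgr.StartBucket(mId); }
        ~ScopedBucket() { mMgr.StopBucket(mId); }
        BucketManager& mMgr;
        UINT mId;
    };

    void workerMain()
    {
        gBucketMgr.RegisterThread("worker");   // assigns this thread's tlsThreadId
        gBucketMgr.StartCapture();
        {
            ScopedBucket b(gBucketMgr, BUCKET_FE);   // elapsed/count accrue on scope exit
            // ... front-end work ...
        }
        gBucketMgr.StopCapture();
        gBucketMgr.PrintReport("rdtsc.txt");
    }
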
++* ++* Notes: ++* ++******************************************************************************/ ++#pragma once ++ ++#include ++#include ++ ++struct BUCKET ++{ ++ uint32_t id{ 0 }; ++ uint64_t start{ 0 }; ++ uint64_t elapsed{ 0 }; ++ uint32_t count{ 0 }; ++ ++ BUCKET* pParent{ nullptr }; ++ std::vector children; ++}; ++ ++struct BUCKET_DESC ++{ ++ // name of bucket, used in reports ++ std::string name; ++ ++ // description of bucket, used in threadviz ++ std::string description; ++ ++ // enable for threadviz dumping ++ bool enableThreadViz; ++ ++ // threadviz color of bucket, in RGBA8_UNORM format ++ uint32_t color; ++}; ++ ++struct BUCKET_THREAD ++{ ++ // name of thread, used in reports ++ std::string name; ++ ++ // id for this thread, assigned by the thread manager ++ uint32_t id; ++ ++ // root of the bucket hierarchy for this thread ++ BUCKET root; ++ ++ // currently executing bucket somewhere in the hierarchy ++ BUCKET* pCurrent; ++ ++ // currently executing hierarchy level ++ uint32_t level{ 0 }; ++ ++ // threadviz file object ++ FILE* vizFile{ nullptr }; ++ ++ BUCKET_THREAD() {} ++ BUCKET_THREAD(const BUCKET_THREAD& that) ++ { ++ name = that.name; ++ id = that.id; ++ root = that.root; ++ pCurrent = &root; ++ vizFile = that.vizFile; ++ } ++}; ++ ++enum VIZ_TYPE ++{ ++ VIZ_START = 0, ++ VIZ_STOP = 1, ++ VIZ_DATA = 2 ++}; ++ ++struct VIZ_START_DATA ++{ ++ uint8_t type; ++ uint32_t bucketId; ++ uint64_t timestamp; ++}; ++ ++struct VIZ_STOP_DATA ++{ ++ uint8_t type; ++ uint64_t timestamp; ++}; ++ ++inline void Serialize(FILE* f, const VIZ_START_DATA& data) ++{ ++ fwrite(&data, sizeof(VIZ_START_DATA), 1, f); ++} ++ ++inline void Deserialize(FILE* f, VIZ_START_DATA& data) ++{ ++ fread(&data, sizeof(VIZ_START_DATA), 1, f); ++ assert(data.type == VIZ_START); ++} ++ ++inline void Serialize(FILE* f, const VIZ_STOP_DATA& data) ++{ ++ fwrite(&data, sizeof(VIZ_STOP_DATA), 1, f); ++} ++ ++inline void Deserialize(FILE* f, VIZ_STOP_DATA& data) ++{ ++ fread(&data, sizeof(VIZ_STOP_DATA), 1, f); ++ assert(data.type == VIZ_STOP); ++} ++ ++inline void Serialize(FILE* f, const std::string& string) ++{ ++ assert(string.size() <= 256); ++ ++ uint8_t length = (uint8_t)string.size(); ++ fwrite(&length, sizeof(length), 1, f); ++ fwrite(string.c_str(), string.size(), 1, f); ++} ++ ++inline void Deserialize(FILE* f, std::string& string) ++{ ++ char cstr[256]; ++ uint8_t length; ++ fread(&length, sizeof(length), 1, f); ++ fread(cstr, length, 1, f); ++ cstr[length] = 0; ++ string.assign(cstr); ++} ++ ++inline void Serialize(FILE* f, const BUCKET_DESC& desc) ++{ ++ Serialize(f, desc.name); ++ Serialize(f, desc.description); ++ fwrite(&desc.enableThreadViz, sizeof(desc.enableThreadViz), 1, f); ++ fwrite(&desc.color, sizeof(desc.color), 1, f); ++} ++ ++inline void Deserialize(FILE* f, BUCKET_DESC& desc) ++{ ++ Deserialize(f, desc.name); ++ Deserialize(f, desc.description); ++ fread(&desc.enableThreadViz, sizeof(desc.enableThreadViz), 1, f); ++ fread(&desc.color, sizeof(desc.color), 1, f); ++} +diff --git a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h +new file mode 100644 +index 0000000..ef7804f +--- /dev/null ++++ b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h +@@ -0,0 +1,792 @@ ++/**************************************************************************** ++* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
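
Because the VIZ_* records above are written as raw, fixed-layout structs, an offline tool can replay a threadviz stream with the same Deserialize helpers. A sketch of such a reader, assuming the reader and writer share endianness and struct padding (an assumption the whole-struct fwrite/fread scheme already makes), with the file name following RegisterThread's threadviz_thread.N.dat pattern:

    #include <cstdio>
    #include <cstdint>

    // Hypothetical offline reader for one threadviz_thread.N.dat stream.
    void dumpVizStream(const char* path)
    {
        FILE* f = fopen(path, "rb");
        if (!f) return;

        uint8_t type;
        while (fread(&type, sizeof(type), 1, f) == 1)
        {
            fseek(f, -(long)sizeof(type), SEEK_CUR);  // rewind; Deserialize re-reads the tag
            if (type == VIZ_START)
            {
                VIZ_START_DATA s;
                Deserialize(f, s);
                printf("start bucket %u @ %llu\n", s.bucketId, (unsigned long long)s.timestamp);
            }
            else
            {
                VIZ_STOP_DATA s;
                Deserialize(f, s);
                printf("stop         @ %llu\n", (unsigned long long)s.timestamp);
            }
        }
        fclose(f);
    }
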
++* ++* Permission is hereby granted, free of charge, to any person obtaining a ++* copy of this software and associated documentation files (the "Software"), ++* to deal in the Software without restriction, including without limitation ++* the rights to use, copy, modify, merge, publish, distribute, sublicense, ++* and/or sell copies of the Software, and to permit persons to whom the ++* Software is furnished to do so, subject to the following conditions: ++* ++* The above copyright notice and this permission notice (including the next ++* paragraph) shall be included in all copies or substantial portions of the ++* Software. ++* ++* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++* IN THE SOFTWARE. ++****************************************************************************/ ++ ++#ifndef __SWR_SIMDINTRIN_H__ ++#define __SWR_SIMDINTRIN_H__ ++ ++#include "os.h" ++ ++#include ++ ++#include ++#include ++#include ++ ++#if KNOB_SIMD_WIDTH == 8 ++typedef __m256 simdscalar; ++typedef __m256i simdscalari; ++typedef uint8_t simdmask; ++#else ++#error Unsupported vector width ++#endif ++ ++// simd vector ++OSALIGNSIMD(union) simdvector ++{ ++ simdscalar v[4]; ++ struct ++ { ++ simdscalar x, y, z, w; ++ }; ++ ++ simdscalar& operator[] (const int i) { return v[i]; } ++ const simdscalar& operator[] (const int i) const { return v[i]; } ++}; ++ ++#if KNOB_SIMD_WIDTH == 8 ++#define _simd128_maskstore_ps _mm_maskstore_ps ++#define _simd_load_ps _mm256_load_ps ++#define _simd_load1_ps _mm256_broadcast_ss ++#define _simd_loadu_ps _mm256_loadu_ps ++#define _simd_setzero_ps _mm256_setzero_ps ++#define _simd_set1_ps _mm256_set1_ps ++#define _simd_blend_ps _mm256_blend_ps ++#define _simd_blendv_ps _mm256_blendv_ps ++#define _simd_store_ps _mm256_store_ps ++#define _simd_mul_ps _mm256_mul_ps ++#define _simd_add_ps _mm256_add_ps ++#define _simd_sub_ps _mm256_sub_ps ++#define _simd_rsqrt_ps _mm256_rsqrt_ps ++#define _simd_min_ps _mm256_min_ps ++#define _simd_max_ps _mm256_max_ps ++#define _simd_movemask_ps _mm256_movemask_ps ++#define _simd_cvtps_epi32 _mm256_cvtps_epi32 ++#define _simd_cvttps_epi32 _mm256_cvttps_epi32 ++#define _simd_cvtepi32_ps _mm256_cvtepi32_ps ++#define _simd_cmplt_ps(a, b) _mm256_cmp_ps(a, b, _CMP_LT_OQ) ++#define _simd_cmpgt_ps(a, b) _mm256_cmp_ps(a, b, _CMP_GT_OQ) ++#define _simd_cmpneq_ps(a, b) _mm256_cmp_ps(a, b, _CMP_NEQ_OQ) ++#define _simd_cmpeq_ps(a, b) _mm256_cmp_ps(a, b, _CMP_EQ_OQ) ++#define _simd_cmpge_ps(a, b) _mm256_cmp_ps(a, b, _CMP_GE_OQ) ++#define _simd_cmple_ps(a, b) _mm256_cmp_ps(a, b, _CMP_LE_OQ) ++#define _simd_cmp_ps(a, b, imm) _mm256_cmp_ps(a, b, imm) ++#define _simd_and_ps _mm256_and_ps ++#define _simd_or_ps _mm256_or_ps ++ ++#define _simd_rcp_ps _mm256_rcp_ps ++#define _simd_div_ps _mm256_div_ps ++#define _simd_castsi_ps _mm256_castsi256_ps ++#define _simd_andnot_ps _mm256_andnot_ps ++#define _simd_round_ps _mm256_round_ps ++#define _simd_castpd_ps _mm256_castpd_ps ++#define _simd_broadcast_ps(a) _mm256_broadcast_ps((const __m128*)(a)) ++ ++#define _simd_load_sd _mm256_load_sd ++#define _simd_movemask_pd _mm256_movemask_pd ++#define _simd_castsi_pd 
_mm256_castsi256_pd ++ ++// emulated integer simd ++#define SIMD_EMU_EPI(func, intrin) \ ++INLINE \ ++__m256i func(__m256i a, __m256i b)\ ++{\ ++ __m128i aHi = _mm256_extractf128_si256(a, 1);\ ++ __m128i bHi = _mm256_extractf128_si256(b, 1);\ ++ __m128i aLo = _mm256_castsi256_si128(a);\ ++ __m128i bLo = _mm256_castsi256_si128(b);\ ++\ ++ __m128i subLo = intrin(aLo, bLo);\ ++ __m128i subHi = intrin(aHi, bHi);\ ++\ ++ __m256i result = _mm256_castsi128_si256(subLo);\ ++ result = _mm256_insertf128_si256(result, subHi, 1);\ ++\ ++ return result;\ ++} ++ ++#if (KNOB_ARCH == KNOB_ARCH_AVX) ++#define _simd_mul_epi32 _simdemu_mul_epi32 ++#define _simd_mullo_epi32 _simdemu_mullo_epi32 ++#define _simd_sub_epi32 _simdemu_sub_epi32 ++#define _simd_sub_epi64 _simdemu_sub_epi64 ++#define _simd_min_epi32 _simdemu_min_epi32 ++#define _simd_min_epu32 _simdemu_min_epu32 ++#define _simd_max_epi32 _simdemu_max_epi32 ++#define _simd_max_epu32 _simdemu_max_epu32 ++#define _simd_add_epi32 _simdemu_add_epi32 ++#define _simd_and_si _simdemu_and_si ++#define _simd_andnot_si _simdemu_andnot_si ++#define _simd_cmpeq_epi32 _simdemu_cmpeq_epi32 ++#define _simd_cmplt_epi32 _simdemu_cmplt_epi32 ++#define _simd_cmpgt_epi32 _simdemu_cmpgt_epi32 ++#define _simd_or_si _simdemu_or_si ++#define _simd_castps_si _mm256_castps_si256 ++#define _simd_adds_epu8 _simdemu_adds_epu8 ++#define _simd_subs_epu8 _simdemu_subs_epu8 ++#define _simd_add_epi8 _simdemu_add_epi8 ++#define _simd_cmpeq_epi64 _simdemu_cmpeq_epi64 ++#define _simd_cmpgt_epi64 _simdemu_cmpgt_epi64 ++ ++SIMD_EMU_EPI(_simdemu_mul_epi32, _mm_mul_epi32) ++SIMD_EMU_EPI(_simdemu_mullo_epi32, _mm_mullo_epi32) ++SIMD_EMU_EPI(_simdemu_sub_epi32, _mm_sub_epi32) ++SIMD_EMU_EPI(_simdemu_sub_epi64, _mm_sub_epi64) ++SIMD_EMU_EPI(_simdemu_min_epi32, _mm_min_epi32) ++SIMD_EMU_EPI(_simdemu_min_epu32, _mm_min_epu32) ++SIMD_EMU_EPI(_simdemu_max_epi32, _mm_max_epi32) ++SIMD_EMU_EPI(_simdemu_max_epu32, _mm_max_epu32) ++SIMD_EMU_EPI(_simdemu_add_epi32, _mm_add_epi32) ++SIMD_EMU_EPI(_simdemu_and_si, _mm_and_si128) ++SIMD_EMU_EPI(_simdemu_andnot_si, _mm_andnot_si128) ++SIMD_EMU_EPI(_simdemu_cmpeq_epi32, _mm_cmpeq_epi32) ++SIMD_EMU_EPI(_simdemu_cmplt_epi32, _mm_cmplt_epi32) ++SIMD_EMU_EPI(_simdemu_cmpgt_epi32, _mm_cmpgt_epi32) ++SIMD_EMU_EPI(_simdemu_or_si, _mm_or_si128) ++SIMD_EMU_EPI(_simdemu_adds_epu8, _mm_adds_epu8) ++SIMD_EMU_EPI(_simdemu_subs_epu8, _mm_subs_epu8) ++SIMD_EMU_EPI(_simdemu_add_epi8, _mm_add_epi8) ++SIMD_EMU_EPI(_simdemu_cmpeq_epi64, _mm_cmpeq_epi64) ++SIMD_EMU_EPI(_simdemu_cmpgt_epi64, _mm_cmpgt_epi64) ++ ++#define _simd_unpacklo_epi32(a, b) _mm256_castps_si256(_mm256_unpacklo_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))) ++#define _simd_unpackhi_epi32(a, b) _mm256_castps_si256(_mm256_unpackhi_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))) ++ ++#define _simd_srli_si(a,i) _simdemu_srli_si128(a) ++#define _simd_slli_epi32(a,i) _simdemu_slli_epi32(a) ++#define _simd_srai_epi32(a,i) _simdemu_srai_epi32(a) ++#define _simd_srli_epi32(a,i) _simdemu_srli_epi32(a) ++#define _simd_srlisi_ps(a,i) _mm256_castsi256_ps(_simdemu_srli_si128(_mm256_castps_si256(a))) ++ ++#define _simd128_fmadd_ps _mm_fmaddemu_ps ++#define _simd_fmadd_ps _mm_fmaddemu256_ps ++#define _simd_fmsub_ps _mm_fmsubemu256_ps ++#define _simd_shuffle_epi8 _simdemu_shuffle_epi8 ++SIMD_EMU_EPI(_simdemu_shuffle_epi8, _mm_shuffle_epi8) ++ ++INLINE ++__m128 _mm_fmaddemu_ps(__m128 a, __m128 b, __m128 c) ++{ ++ __m128 res = _mm_mul_ps(a, b); ++ res = _mm_add_ps(res, c); ++ return res; ++} ++ ++INLINE ++__m256 
_mm_fmaddemu256_ps(__m256 a, __m256 b, __m256 c) ++{ ++ __m256 res = _mm256_mul_ps(a, b); ++ res = _mm256_add_ps(res, c); ++ return res; ++} ++ ++INLINE ++__m256 _mm_fmsubemu256_ps(__m256 a, __m256 b, __m256 c) ++{ ++ __m256 res = _mm256_mul_ps(a, b); ++ res = _mm256_sub_ps(res, c); ++ return res; ++} ++ ++INLINE ++__m256 _simd_i32gather_ps(const float* pBase, __m256i vOffsets, const int scale) ++{ ++ uint32_t *pOffsets = (uint32_t*)&vOffsets; ++ simdscalar vResult; ++ float* pResult = (float*)&vResult; ++ for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i) ++ { ++ uint32_t offset = pOffsets[i]; ++ offset = offset * scale; ++ pResult[i] = *(float*)(((const uint8_t*)pBase + offset)); ++ } ++ ++ return vResult; ++} ++ ++INLINE ++__m256 _simd_mask_i32gather_ps(__m256 vSrc, const float* pBase, __m256i vOffsets, __m256 vMask, const int scale) ++{ ++ uint32_t *pOffsets = (uint32_t*)&vOffsets; ++ simdscalar vResult = vSrc; ++ float* pResult = (float*)&vResult; ++ DWORD index; ++ uint32_t mask = _simd_movemask_ps(vMask); ++ while (_BitScanForward(&index, mask)) ++ { ++ mask &= ~(1 << index); ++ uint32_t offset = pOffsets[index]; ++ offset = offset * scale; ++ pResult[index] = *(float*)(((const uint8_t*)pBase + offset)); ++ } ++ ++ return vResult; ++} ++ ++INLINE ++__m256i _simd_abs_epi32(__m256i a) ++{ ++ __m128i aHi = _mm256_extractf128_si256(a, 1); ++ __m128i aLo = _mm256_castsi256_si128(a); ++ __m128i absLo = _mm_abs_epi32(aLo); ++ __m128i absHi = _mm_abs_epi32(aHi); ++ __m256i result = _mm256_castsi128_si256(absLo); ++ result = _mm256_insertf128_si256(result, absHi, 1); ++ return result; ++} ++#else ++ ++#define _simd_mul_epi32 _mm256_mul_epi32 ++#define _simd_mullo_epi32 _mm256_mullo_epi32 ++#define _simd_sub_epi32 _mm256_sub_epi32 ++#define _simd_sub_epi64 _mm256_sub_epi64 ++#define _simd_min_epi32 _mm256_min_epi32 ++#define _simd_max_epi32 _mm256_max_epi32 ++#define _simd_min_epu32 _mm256_min_epu32 ++#define _simd_max_epu32 _mm256_max_epu32 ++#define _simd_add_epi32 _mm256_add_epi32 ++#define _simd_and_si _mm256_and_si256 ++#define _simd_andnot_si _mm256_andnot_si256 ++#define _simd_cmpeq_epi32 _mm256_cmpeq_epi32 ++#define _simd_cmplt_epi32(a,b) _mm256_cmpgt_epi32(b,a) ++#define _simd_cmpgt_epi32(a,b) _mm256_cmpgt_epi32(a,b) ++#define _simd_or_si _mm256_or_si256 ++#define _simd_castps_si _mm256_castps_si256 ++ ++#define _simd_unpacklo_epi32 _mm256_unpacklo_epi32 ++#define _simd_unpackhi_epi32 _mm256_unpackhi_epi32 ++ ++#define _simd_srli_si(a,i) _simdemu_srli_si128(a) ++#define _simd_slli_epi32 _mm256_slli_epi32 ++#define _simd_srai_epi32 _mm256_srai_epi32 ++#define _simd_srli_epi32 _mm256_srli_epi32 ++#define _simd_srlisi_ps(a,i) _mm256_castsi256_ps(_simdemu_srli_si128(_mm256_castps_si256(a))) ++#define _simd128_fmadd_ps _mm_fmadd_ps ++#define _simd_fmadd_ps _mm256_fmadd_ps ++#define _simd_fmsub_ps _mm256_fmsub_ps ++#define _simd_shuffle_epi8 _mm256_shuffle_epi8 ++#define _simd_adds_epu8 _mm256_adds_epu8 ++#define _simd_subs_epu8 _mm256_subs_epu8 ++#define _simd_add_epi8 _mm256_add_epi8 ++#define _simd_i32gather_ps _mm256_i32gather_ps ++#define _simd_mask_i32gather_ps _mm256_mask_i32gather_ps ++#define _simd_abs_epi32 _mm256_abs_epi32 ++ ++#define _simd_cmpeq_epi64 _mm256_cmpeq_epi64 ++#define _simd_cmpgt_epi64 _mm256_cmpgt_epi64 ++#endif ++ ++#define _simd_shuffleps_epi32(vA, vB, imm) _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(vA), _mm256_castsi256_ps(vB), imm)) ++#define _simd_shuffle_ps _mm256_shuffle_ps ++#define _simd_set1_epi32 _mm256_set1_epi32 ++#define _simd_set1_epi8 
_mm256_set1_epi8
++#define _simd_setzero_si _mm256_setzero_si256
++#define _simd_cvttps_epi32 _mm256_cvttps_epi32
++#define _simd_store_si _mm256_store_si256
++#define _simd_broadcast_ss _mm256_broadcast_ss
++#define _simd_maskstore_ps _mm256_maskstore_ps
++#define _simd_load_si _mm256_load_si256
++#define _simd_loadu_si _mm256_loadu_si256
++#define _simd_sub_ps _mm256_sub_ps
++#define _simd_testz_ps _mm256_testz_ps
++#define _simd_xor_ps _mm256_xor_ps
++
++
++INLINE
++simdscalari _simd_blendv_epi32(simdscalari a, simdscalari b, simdscalar mask)
++{
++    return _simd_castps_si(_simd_blendv_ps(_simd_castsi_ps(a), _simd_castsi_ps(b), mask));
++}
++
++// convert bitmask to vector mask
++INLINE
++simdscalar vMask(int32_t mask)
++{
++    __m256i vec = _mm256_set1_epi32(mask);
++    const __m256i bit = _mm256_set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
++    vec = _simd_and_si(vec, bit);
++    vec = _simd_cmplt_epi32(_mm256_setzero_si256(), vec);
++    return _simd_castsi_ps(vec);
++}
++
++INLINE
++void _simd_mov(simdscalar &r, unsigned int rlane, simdscalar& s, unsigned int slane)
++{
++    OSALIGNSIMD(float) rArray[KNOB_SIMD_WIDTH], sArray[KNOB_SIMD_WIDTH];
++    _mm256_store_ps(rArray, r);
++    _mm256_store_ps(sArray, s);
++    rArray[rlane] = sArray[slane];
++    r = _mm256_load_ps(rArray);
++}
++
++template <int i>
++__m256i _simdemu_srli_si128(__m256i a)
++{
++    __m128i aHi = _mm256_extractf128_si256(a, 1);
++    __m128i aLo = _mm256_castsi256_si128(a);
++
++    __m128i resHi = _mm_srli_si128(aHi, i);
++    __m128i resLo = _mm_alignr_epi8(aHi, aLo, i);
++
++    __m256i result = _mm256_castsi128_si256(resLo);
++    result = _mm256_insertf128_si256(result, resHi, 1);
++
++    return result;
++}
++
++template <int i>
++__m256i _simdemu_slli_epi32(__m256i a)
++{
++    __m128i aHi = _mm256_extractf128_si256(a, 1);
++    __m128i aLo = _mm256_castsi256_si128(a);
++
++    __m128i resHi = _mm_slli_epi32(aHi, i);
++    __m128i resLo = _mm_slli_epi32(aLo, i);
++
++    __m256i result = _mm256_castsi128_si256(resLo);
++    result = _mm256_insertf128_si256(result, resHi, 1);
++
++    return result;
++}
++
++template <int i>
++__m256i _simdemu_srai_epi32(__m256i a)
++{
++    __m128i aHi = _mm256_extractf128_si256(a, 1);
++    __m128i aLo = _mm256_castsi256_si128(a);
++
++    __m128i resHi = _mm_srai_epi32(aHi, i);
++    __m128i resLo = _mm_srai_epi32(aLo, i);
++
++    __m256i result = _mm256_castsi128_si256(resLo);
++    result = _mm256_insertf128_si256(result, resHi, 1);
++
++    return result;
++}
++
++template <int i>
++__m256i _simdemu_srli_epi32(__m256i a)
++{
++    __m128i aHi = _mm256_extractf128_si256(a, 1);
++    __m128i aLo = _mm256_castsi256_si128(a);
++
++    __m128i resHi = _mm_srli_epi32(aHi, i);
++    __m128i resLo = _mm_srli_epi32(aLo, i);
++
++    __m256i result = _mm256_castsi128_si256(resLo);
++    result = _mm256_insertf128_si256(result, resHi, 1);
++
++    return result;
++}
++
++INLINE
++void _simdvec_transpose(simdvector &v)
++{
++    SWR_ASSERT(false, "Need to implement 8 wide version");
++}
++
++#else
++#error Unsupported vector width
++#endif
++
++// Populates a simdvector from a vector. So p = xyzw becomes xxxx yyyy zzzz wwww.
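
That is, simdvector is a structure-of-arrays view: lane i of v[0]..v[3] holds the x/y/z/w of element i. A worked picture for KNOB_SIMD_WIDTH == 8, using the `_simdvec_load_ps` defined immediately below (values illustrative):

    // float pos[4] = { 1.0f, 2.0f, 3.0f, 4.0f };
    // simdvector v;
    // _simdvec_load_ps(v, pos);
    //
    // v[0] (x) = { 1, 1, 1, 1, 1, 1, 1, 1 }
    // v[1] (y) = { 2, 2, 2, 2, 2, 2, 2, 2 }
    // v[2] (z) = { 3, 3, 3, 3, 3, 3, 3, 3 }
    // v[3] (w) = { 4, 4, 4, 4, 4, 4, 4, 4 }
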
++INLINE ++void _simdvec_load_ps(simdvector& r, const float *p) ++{ ++ r[0] = _simd_set1_ps(p[0]); ++ r[1] = _simd_set1_ps(p[1]); ++ r[2] = _simd_set1_ps(p[2]); ++ r[3] = _simd_set1_ps(p[3]); ++} ++ ++INLINE ++void _simdvec_mov(simdvector& r, const simdscalar& s) ++{ ++ r[0] = s; ++ r[1] = s; ++ r[2] = s; ++ r[3] = s; ++} ++ ++INLINE ++void _simdvec_mov(simdvector& r, const simdvector& v) ++{ ++ r[0] = v[0]; ++ r[1] = v[1]; ++ r[2] = v[2]; ++ r[3] = v[3]; ++} ++ ++// just move a lane from the source simdvector to dest simdvector ++INLINE ++void _simdvec_mov(simdvector &r, unsigned int rlane, simdvector& s, unsigned int slane) ++{ ++ _simd_mov(r[0], rlane, s[0], slane); ++ _simd_mov(r[1], rlane, s[1], slane); ++ _simd_mov(r[2], rlane, s[2], slane); ++ _simd_mov(r[3], rlane, s[3], slane); ++} ++ ++INLINE ++void _simdvec_dp3_ps(simdscalar& r, const simdvector& v0, const simdvector& v1) ++{ ++ simdscalar tmp; ++ r = _simd_mul_ps(v0[0], v1[0]); // (v0.x*v1.x) ++ ++ tmp = _simd_mul_ps(v0[1], v1[1]); // (v0.y*v1.y) ++ r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) ++ ++ tmp = _simd_mul_ps(v0[2], v1[2]); // (v0.z*v1.z) ++ r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z) ++} ++ ++INLINE ++void _simdvec_dp4_ps(simdscalar& r, const simdvector& v0, const simdvector& v1) ++{ ++ simdscalar tmp; ++ r = _simd_mul_ps(v0[0], v1[0]); // (v0.x*v1.x) ++ ++ tmp = _simd_mul_ps(v0[1], v1[1]); // (v0.y*v1.y) ++ r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) ++ ++ tmp = _simd_mul_ps(v0[2], v1[2]); // (v0.z*v1.z) ++ r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z) ++ ++ tmp = _simd_mul_ps(v0[3], v1[3]); // (v0.w*v1.w) ++ r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z) ++} ++ ++INLINE ++simdscalar _simdvec_rcp_length_ps(const simdvector& v) ++{ ++ simdscalar length; ++ _simdvec_dp4_ps(length, v, v); ++ return _simd_rsqrt_ps(length); ++} ++ ++INLINE ++void _simdvec_normalize_ps(simdvector& r, const simdvector& v) ++{ ++ simdscalar vecLength; ++ vecLength = _simdvec_rcp_length_ps(v); ++ ++ r[0] = _simd_mul_ps(v[0], vecLength); ++ r[1] = _simd_mul_ps(v[1], vecLength); ++ r[2] = _simd_mul_ps(v[2], vecLength); ++ r[3] = _simd_mul_ps(v[3], vecLength); ++} ++ ++INLINE ++void _simdvec_mul_ps(simdvector& r, const simdvector& v, const simdscalar& s) ++{ ++ r[0] = _simd_mul_ps(v[0], s); ++ r[1] = _simd_mul_ps(v[1], s); ++ r[2] = _simd_mul_ps(v[2], s); ++ r[3] = _simd_mul_ps(v[3], s); ++} ++ ++INLINE ++void _simdvec_mul_ps(simdvector& r, const simdvector& v0, const simdvector& v1) ++{ ++ r[0] = _simd_mul_ps(v0[0], v1[0]); ++ r[1] = _simd_mul_ps(v0[1], v1[1]); ++ r[2] = _simd_mul_ps(v0[2], v1[2]); ++ r[3] = _simd_mul_ps(v0[3], v1[3]); ++} ++ ++INLINE ++void _simdvec_add_ps(simdvector& r, const simdvector& v0, const simdvector& v1) ++{ ++ r[0] = _simd_add_ps(v0[0], v1[0]); ++ r[1] = _simd_add_ps(v0[1], v1[1]); ++ r[2] = _simd_add_ps(v0[2], v1[2]); ++ r[3] = _simd_add_ps(v0[3], v1[3]); ++} ++ ++INLINE ++void _simdvec_min_ps(simdvector& r, const simdvector& v0, const simdscalar& s) ++{ ++ r[0] = _simd_min_ps(v0[0], s); ++ r[1] = _simd_min_ps(v0[1], s); ++ r[2] = _simd_min_ps(v0[2], s); ++ r[3] = _simd_min_ps(v0[3], s); ++} ++ ++INLINE ++void _simdvec_max_ps(simdvector& r, const simdvector& v0, const simdscalar& s) ++{ ++ r[0] = _simd_max_ps(v0[0], s); ++ r[1] = _simd_max_ps(v0[1], s); ++ r[2] = _simd_max_ps(v0[2], s); ++ r[3] = _simd_max_ps(v0[3], s); ++} ++ ++// Matrix4x4 * Vector4 ++// outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * v.w) ++// 
outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * v.w) ++// outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * v.w) ++// outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * v.w) ++INLINE ++void _simd_mat4x4_vec4_multiply( ++ simdvector& result, ++ const float *pMatrix, ++ const simdvector& v) ++{ ++ simdscalar m; ++ simdscalar r0; ++ simdscalar r1; ++ ++ m = _simd_load1_ps(pMatrix + 0*4 + 0); // m[row][0] ++ r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) ++ m = _simd_load1_ps(pMatrix + 0*4 + 1); // m[row][1] ++ r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) ++ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) ++ m = _simd_load1_ps(pMatrix + 0*4 + 2); // m[row][2] ++ r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) ++ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) ++ m = _simd_load1_ps(pMatrix + 0*4 + 3); // m[row][3] ++ r1 = _simd_mul_ps(m, v[3]); // (m3 * v.z) ++ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w) ++ result[0] = r0; ++ ++ m = _simd_load1_ps(pMatrix + 1*4 + 0); // m[row][0] ++ r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) ++ m = _simd_load1_ps(pMatrix + 1*4 + 1); // m[row][1] ++ r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) ++ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) ++ m = _simd_load1_ps(pMatrix + 1*4 + 2); // m[row][2] ++ r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) ++ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) ++ m = _simd_load1_ps(pMatrix + 1*4 + 3); // m[row][3] ++ r1 = _simd_mul_ps(m, v[3]); // (m3 * v.z) ++ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w) ++ result[1] = r0; ++ ++ m = _simd_load1_ps(pMatrix + 2*4 + 0); // m[row][0] ++ r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) ++ m = _simd_load1_ps(pMatrix + 2*4 + 1); // m[row][1] ++ r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) ++ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) ++ m = _simd_load1_ps(pMatrix + 2*4 + 2); // m[row][2] ++ r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) ++ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) ++ m = _simd_load1_ps(pMatrix + 2*4 + 3); // m[row][3] ++ r1 = _simd_mul_ps(m, v[3]); // (m3 * v.z) ++ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w) ++ result[2] = r0; ++ ++ m = _simd_load1_ps(pMatrix + 3*4 + 0); // m[row][0] ++ r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) ++ m = _simd_load1_ps(pMatrix + 3*4 + 1); // m[row][1] ++ r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) ++ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) ++ m = _simd_load1_ps(pMatrix + 3*4 + 2); // m[row][2] ++ r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) ++ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) ++ m = _simd_load1_ps(pMatrix + 3*4 + 3); // m[row][3] ++ r1 = _simd_mul_ps(m, v[3]); // (m3 * v.z) ++ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w) ++ result[3] = r0; ++} ++ ++// Matrix4x4 * Vector3 - Direction Vector where w = 0. 
++// outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * 0) ++// outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * 0) ++// outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * 0) ++// outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * 0) ++INLINE ++void _simd_mat3x3_vec3_w0_multiply( ++ simdvector& result, ++ const float *pMatrix, ++ const simdvector& v) ++{ ++ simdscalar m; ++ simdscalar r0; ++ simdscalar r1; ++ ++ m = _simd_load1_ps(pMatrix + 0*4 + 0); // m[row][0] ++ r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) ++ m = _simd_load1_ps(pMatrix + 0*4 + 1); // m[row][1] ++ r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) ++ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) ++ m = _simd_load1_ps(pMatrix + 0*4 + 2); // m[row][2] ++ r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) ++ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) ++ result[0] = r0; ++ ++ m = _simd_load1_ps(pMatrix + 1*4 + 0); // m[row][0] ++ r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) ++ m = _simd_load1_ps(pMatrix + 1*4 + 1); // m[row][1] ++ r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) ++ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) ++ m = _simd_load1_ps(pMatrix + 1*4 + 2); // m[row][2] ++ r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) ++ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) ++ result[1] = r0; ++ ++ m = _simd_load1_ps(pMatrix + 2*4 + 0); // m[row][0] ++ r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) ++ m = _simd_load1_ps(pMatrix + 2*4 + 1); // m[row][1] ++ r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) ++ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) ++ m = _simd_load1_ps(pMatrix + 2*4 + 2); // m[row][2] ++ r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) ++ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) ++ result[2] = r0; ++ ++ result[3] = _simd_setzero_ps(); ++} ++ ++// Matrix4x4 * Vector3 - Position vector where w = 1. 
++// outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * 1) ++// outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * 1) ++// outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * 1) ++// outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * 1) ++INLINE ++void _simd_mat4x4_vec3_w1_multiply( ++ simdvector& result, ++ const float *pMatrix, ++ const simdvector& v) ++{ ++ simdscalar m; ++ simdscalar r0; ++ simdscalar r1; ++ ++ m = _simd_load1_ps(pMatrix + 0*4 + 0); // m[row][0] ++ r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) ++ m = _simd_load1_ps(pMatrix + 0*4 + 1); // m[row][1] ++ r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) ++ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) ++ m = _simd_load1_ps(pMatrix + 0*4 + 2); // m[row][2] ++ r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) ++ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) ++ m = _simd_load1_ps(pMatrix + 0*4 + 3); // m[row][3] ++ r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) ++ result[0] = r0; ++ ++ m = _simd_load1_ps(pMatrix + 1*4 + 0); // m[row][0] ++ r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) ++ m = _simd_load1_ps(pMatrix + 1*4 + 1); // m[row][1] ++ r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) ++ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) ++ m = _simd_load1_ps(pMatrix + 1*4 + 2); // m[row][2] ++ r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) ++ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) ++ m = _simd_load1_ps(pMatrix + 1*4 + 3); // m[row][3] ++ r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) ++ result[1] = r0; ++ ++ m = _simd_load1_ps(pMatrix + 2*4 + 0); // m[row][0] ++ r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) ++ m = _simd_load1_ps(pMatrix + 2*4 + 1); // m[row][1] ++ r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) ++ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) ++ m = _simd_load1_ps(pMatrix + 2*4 + 2); // m[row][2] ++ r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) ++ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) ++ m = _simd_load1_ps(pMatrix + 2*4 + 3); // m[row][3] ++ r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) ++ result[2] = r0; ++ ++ m = _simd_load1_ps(pMatrix + 3*4 + 0); // m[row][0] ++ r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) ++ m = _simd_load1_ps(pMatrix + 3*4 + 1); // m[row][1] ++ r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) ++ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) ++ m = _simd_load1_ps(pMatrix + 3*4 + 2); // m[row][2] ++ r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) ++ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) ++ m = _simd_load1_ps(pMatrix + 3*4 + 3); // m[row][3] ++ result[3] = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) ++} ++ ++INLINE ++void _simd_mat4x3_vec3_w1_multiply( ++ simdvector& result, ++ const float *pMatrix, ++ const simdvector& v) ++{ ++ simdscalar m; ++ simdscalar r0; ++ simdscalar r1; ++ ++ m = _simd_load1_ps(pMatrix + 0*4 + 0); // m[row][0] ++ r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) ++ m = _simd_load1_ps(pMatrix + 0*4 + 1); // m[row][1] ++ r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) ++ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) ++ m = _simd_load1_ps(pMatrix + 0*4 + 2); // m[row][2] ++ r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) ++ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) ++ m = _simd_load1_ps(pMatrix + 0*4 + 3); // m[row][3] ++ r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) ++ result[0] = r0; ++ ++ m = 
_simd_load1_ps(pMatrix + 1*4 + 0);   // m[row][0]
++    r0 = _simd_mul_ps(m, v[0]);               // (m10 * v.x)
++    m = _simd_load1_ps(pMatrix + 1*4 + 1);    // m[row][1]
++    r1 = _simd_mul_ps(m, v[1]);               // (m11 * v.y)
++    r0 = _simd_add_ps(r0, r1);                // (m10 * v.x) + (m11 * v.y)
++    m = _simd_load1_ps(pMatrix + 1*4 + 2);    // m[row][2]
++    r1 = _simd_mul_ps(m, v[2]);               // (m12 * v.z)
++    r0 = _simd_add_ps(r0, r1);                // (m10 * v.x) + (m11 * v.y) + (m12 * v.z)
++    m = _simd_load1_ps(pMatrix + 1*4 + 3);    // m[row][3]
++    r0 = _simd_add_ps(r0, m);                 // (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * 1)
++    result[1] = r0;
++
++    m = _simd_load1_ps(pMatrix + 2*4 + 0);    // m[row][0]
++    r0 = _simd_mul_ps(m, v[0]);               // (m20 * v.x)
++    m = _simd_load1_ps(pMatrix + 2*4 + 1);    // m[row][1]
++    r1 = _simd_mul_ps(m, v[1]);               // (m21 * v.y)
++    r0 = _simd_add_ps(r0, r1);                // (m20 * v.x) + (m21 * v.y)
++    m = _simd_load1_ps(pMatrix + 2*4 + 2);    // m[row][2]
++    r1 = _simd_mul_ps(m, v[2]);               // (m22 * v.z)
++    r0 = _simd_add_ps(r0, r1);                // (m20 * v.x) + (m21 * v.y) + (m22 * v.z)
++    m = _simd_load1_ps(pMatrix + 2*4 + 3);    // m[row][3]
++    r0 = _simd_add_ps(r0, m);                 // (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * 1)
++    result[2] = r0;
++    result[3] = _simd_set1_ps(1.0f);
++}
++
++//////////////////////////////////////////////////////////////////////////
++/// @brief Compute plane equation vA * vX + vB * vY + vC
++INLINE simdscalar vplaneps(simdscalar vA, simdscalar vB, simdscalar vC, simdscalar &vX, simdscalar &vY)
++{
++    simdscalar vOut = _simd_fmadd_ps(vA, vX, vC);
++    vOut = _simd_fmadd_ps(vB, vY, vOut);
++    return vOut;
++}
++
++//////////////////////////////////////////////////////////////////////////
++/// @brief Interpolates a single component.
++/// @param vI - barycentric I
++/// @param vJ - barycentric J
++/// @param pInterpBuffer - pointer to attribute barycentric coeffs
++template <UINT Attrib, UINT Comp>
++static INLINE simdscalar InterpolateComponent(simdscalar vI, simdscalar vJ, const float *pInterpBuffer)
++{
++    const float *pInterpA = &pInterpBuffer[Attrib * 12 + 0 + Comp];
++    const float *pInterpB = &pInterpBuffer[Attrib * 12 + 4 + Comp];
++    const float *pInterpC = &pInterpBuffer[Attrib * 12 + 8 + Comp];
++
++    simdscalar vA = _simd_broadcast_ss(pInterpA);
++    simdscalar vB = _simd_broadcast_ss(pInterpB);
++    simdscalar vC = _simd_broadcast_ss(pInterpC);
++
++    simdscalar vk = _simd_sub_ps(_simd_sub_ps(_simd_set1_ps(1.0f), vI), vJ);
++    vC = _simd_mul_ps(vk, vC);
++
++    return vplaneps(vA, vB, vC, vI, vJ);
++}
++
++
++#endif//__SWR_SIMDINTRIN_H__
+diff --git a/src/gallium/drivers/swr/rasterizer/common/swr_assert.cpp b/src/gallium/drivers/swr/rasterizer/common/swr_assert.cpp
+new file mode 100644
+index 0000000..8f176e1
+--- /dev/null
++++ b/src/gallium/drivers/swr/rasterizer/common/swr_assert.cpp
+@@ -0,0 +1,141 @@
++/****************************************************************************
++* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
++*
++* Permission is hereby granted, free of charge, to any person obtaining a
++* copy of this software and associated documentation files (the "Software"),
++* to deal in the Software without restriction, including without limitation
++* the rights to use, copy, modify, merge, publish, distribute, sublicense,
++* and/or sell copies of the Software, and to permit persons to whom the
++* Software is furnished to do so, subject to the following conditions:
++*
++* The above copyright notice and this permission notice (including the next
++* paragraph) shall be included in all copies or substantial portions of the
++* Software.
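
A worked single-lane example of the interpolation helper above, with illustrative values: given per-vertex attribute coefficients A = 10, B = 20, C = 30 and barycentrics i = j = 0.25, InterpolateComponent computes k = 1 - i - j = 0.5 and then evaluates the plane equation:

    // One lane of InterpolateComponent<Attrib, Comp>, spelled out:
    //   vA = 10, vB = 20, vC = 30          (per-vertex attribute coefficients)
    //   vI = 0.25, vJ = 0.25               (barycentric i, j)
    //   vk = 1 - 0.25 - 0.25 = 0.5
    //   result = vplaneps(vA, vB, vC*vk, vI, vJ)
    //          = 10*0.25 + 20*0.25 + 30*0.5 = 22.5
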
++* ++* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++* IN THE SOFTWARE. ++****************************************************************************/ ++ ++#include "common/os.h" ++#include ++#include ++#include ++ ++#if defined(SWR_ENABLE_ASSERTS) ++ ++#if defined(_WIN32) ++#pragma comment(lib, "user32.lib") ++#endif // _WIN32 ++ ++bool SwrAssert( ++ bool& enabled, ++ const char* pExpression, ++ const char* pFileName, ++ uint32_t lineNum, ++ const char* pFmtString /* = nullptr */, ++ ...) ++{ ++ if (!enabled) return false; ++ ++#if defined(_WIN32) ++ static const int MAX_MESSAGE_LEN = 2048; ++ char msgBuf[MAX_MESSAGE_LEN]; ++ ++ sprintf_s(msgBuf, "%s(%d): assert: %s\n", pFileName, lineNum, pExpression); ++ msgBuf[MAX_MESSAGE_LEN - 2] = '\n'; ++ msgBuf[MAX_MESSAGE_LEN - 1] = 0; ++ OutputDebugStringA(msgBuf); ++ ++ int offset = 0; ++ ++ if (pFmtString) ++ { ++ va_list args; ++ va_start(args, pFmtString); ++ offset = _vsnprintf_s( ++ msgBuf, ++ sizeof(msgBuf), ++ sizeof(msgBuf), ++ pFmtString, ++ args); ++ va_end(args); ++ ++ if (offset < 0) { return true; } ++ ++ OutputDebugStringA("\t"); ++ OutputDebugStringA(msgBuf); ++ OutputDebugStringA("\n"); ++ } ++ ++ if (KNOB_ENABLE_ASSERT_DIALOGS) ++ { ++ int retval = sprintf_s( ++ &msgBuf[offset], ++ MAX_MESSAGE_LEN - offset, ++ "\n\n" ++ "File: %s\n" ++ "Line: %d\n" ++ "\n" ++ "Expression: %s\n\n" ++ "Cancel: Disable this assert for the remainder of the process\n" ++ "Try Again: Break into the debugger\n" ++ "Continue: Continue execution (but leave assert enabled)", ++ pFileName, ++ lineNum, ++ pExpression); ++ ++ if (retval < 0) { return true; } ++ ++ offset += retval; ++ ++ if (!IsDebuggerPresent()) ++ { ++ sprintf_s( ++ &msgBuf[offset], ++ MAX_MESSAGE_LEN - offset, ++ "\n\n*** NO DEBUGGER DETECTED ***\n\nPressing \"Try Again\" will cause a program crash!"); ++ } ++ ++ retval = MessageBoxA(nullptr, msgBuf, "Assert Failed", MB_CANCELTRYCONTINUE | MB_ICONEXCLAMATION); ++ ++ switch (retval) ++ { ++ case IDCANCEL: ++ enabled = false; ++ return false; ++ ++ case IDTRYAGAIN: ++ return true; ++ ++ case IDCONTINUE: ++ return false; ++ } ++ } ++ else ++ { ++ return 0 != IsDebuggerPresent(); ++ } ++ ++#else // !_WIN32 ++ fprintf(stderr, "%s(%d): assert: %s\n", pFileName, lineNum, pExpression); ++ if (pFmtString) ++ { ++ va_list args; ++ va_start(args, pFmtString); ++ vfprintf(stderr, pFmtString, args); ++ va_end(args); ++ } ++ fflush(stderr); ++ ++ /// @todo - Implement message box on non-Windows platforms ++ ++#endif ++ return true; ++} ++ ++#endif // SWR_ENABLE_ASSERTS +diff --git a/src/gallium/drivers/swr/rasterizer/common/swr_assert.h b/src/gallium/drivers/swr/rasterizer/common/swr_assert.h +new file mode 100644 +index 0000000..afc9f59 +--- /dev/null ++++ b/src/gallium/drivers/swr/rasterizer/common/swr_assert.h +@@ -0,0 +1,84 @@ ++/**************************************************************************** ++* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
++* ++* Permission is hereby granted, free of charge, to any person obtaining a ++* copy of this software and associated documentation files (the "Software"), ++* to deal in the Software without restriction, including without limitation ++* the rights to use, copy, modify, merge, publish, distribute, sublicense, ++* and/or sell copies of the Software, and to permit persons to whom the ++* Software is furnished to do so, subject to the following conditions: ++* ++* The above copyright notice and this permission notice (including the next ++* paragraph) shall be included in all copies or substantial portions of the ++* Software. ++* ++* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++* IN THE SOFTWARE. ++****************************************************************************/ ++ ++#ifndef __SWR_ASSERT_H__ ++#define __SWR_ASSERT_H__ ++ ++#if !defined(__SWR_OS_H__) ++#error swr_assert.h should not be included directly, please include "common/os.h" instead. ++#endif ++ ++#if !defined(SWR_ENABLE_ASSERTS) ++ ++#if !defined(NDEBUG) ++#define SWR_ENABLE_ASSERTS 1 ++#else ++#define SWR_ENABLE_ASSERTS 0 ++#endif // _DEBUG ++ ++#endif // SWR_ENABLE_ASSERTS ++ ++#if SWR_ENABLE_ASSERTS ++#include "assert.h" ++ ++#if !defined(__cplusplus) ++ ++#pragma message("C++ is required for SWR Asserts, falling back to assert.h") ++ ++#define SWR_ASSERT(e, ...) assert(e) ++ ++#else ++ ++#if defined(assert) ++#undef assert ++#endif ++#define assert(exp) SWR_ASSERT(exp) ++ ++bool SwrAssert( ++ bool& enabled, ++ const char* pExpression, ++ const char* pFileName, ++ uint32_t lineNum, ++ const char* pFmtString = nullptr, ++ ...); ++ ++#define SWR_ASSERT(e, ...) {\ ++ bool expFailed = !(e);\ ++ if (expFailed) {\ ++ static bool swrAssertEnabled = true;\ ++ expFailed = SwrAssert(swrAssertEnabled, #e, __FILE__, __LINE__, ##__VA_ARGS__);\ ++ if (expFailed) { DEBUGBREAK; }\ ++ }\ ++} ++ ++#endif // C++ ++ ++#else // No asserts enabled ++ ++#define SWR_ASSERT(e, ...) {} ++ ++#endif ++ ++#define SWR_NOT_IMPL SWR_ASSERT(0, "%s not implemented", __FUNCTION__) ++ ++#endif//__SWR_OS_H__ +diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp +new file mode 100644 +index 0000000..1081e28 +--- /dev/null ++++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp +@@ -0,0 +1,1461 @@ ++/**************************************************************************** ++* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. ++* ++* Permission is hereby granted, free of charge, to any person obtaining a ++* copy of this software and associated documentation files (the "Software"), ++* to deal in the Software without restriction, including without limitation ++* the rights to use, copy, modify, merge, publish, distribute, sublicense, ++* and/or sell copies of the Software, and to permit persons to whom the ++* Software is furnished to do so, subject to the following conditions: ++* ++* The above copyright notice and this permission notice (including the next ++* paragraph) shall be included in all copies or substantial portions of the ++* Software. 
++* ++* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++* IN THE SOFTWARE. ++* ++* @file api.cpp ++* ++* @brief API implementation ++* ++******************************************************************************/ ++ ++#include ++#include ++#include ++ ++#if defined(__gnu_linux__) || defined(__linux__) ++#include ++#endif ++ ++#include "core/api.h" ++#include "core/backend.h" ++#include "core/context.h" ++#include "core/frontend.h" ++#include "core/rasterizer.h" ++#include "core/rdtsc_core.h" ++#include "core/threads.h" ++#include "core/tilemgr.h" ++#include "core/clip.h" ++ ++#include "common/simdintrin.h" ++#include "common/os.h" ++ ++void SetupDefaultState(SWR_CONTEXT *pContext); ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief Create SWR Context. ++/// @param pCreateInfo - pointer to creation info. ++HANDLE SwrCreateContext( ++ const SWR_CREATECONTEXT_INFO* pCreateInfo) ++{ ++ RDTSC_RESET(); ++ RDTSC_INIT(0); ++ ++ void* pContextMem = _aligned_malloc(sizeof(SWR_CONTEXT), KNOB_SIMD_WIDTH * 4); ++ memset(pContextMem, 0, sizeof(SWR_CONTEXT)); ++ SWR_CONTEXT *pContext = new (pContextMem) SWR_CONTEXT(); ++ ++ pContext->driverType = pCreateInfo->driver; ++ pContext->privateStateSize = pCreateInfo->privateStateSize; ++ ++ pContext->dcRing = (DRAW_CONTEXT*)_aligned_malloc(sizeof(DRAW_CONTEXT)*KNOB_MAX_DRAWS_IN_FLIGHT, 64); ++ memset(pContext->dcRing, 0, sizeof(DRAW_CONTEXT)*KNOB_MAX_DRAWS_IN_FLIGHT); ++ ++ pContext->dsRing = (DRAW_STATE*)_aligned_malloc(sizeof(DRAW_STATE)*KNOB_MAX_DRAWS_IN_FLIGHT, 64); ++ memset(pContext->dsRing, 0, sizeof(DRAW_STATE)*KNOB_MAX_DRAWS_IN_FLIGHT); ++ ++ for (uint32_t dc = 0; dc < KNOB_MAX_DRAWS_IN_FLIGHT; ++dc) ++ { ++ pContext->dcRing[dc].arena.Init(); ++ pContext->dcRing[dc].inUse = false; ++ pContext->dcRing[dc].pTileMgr = new MacroTileMgr(); ++ pContext->dcRing[dc].pDispatch = new DispatchQueue(); /// @todo Could lazily allocate this if Dispatch seen. ++ ++ pContext->dsRing[dc].arena.Init(); ++ } ++ ++ if (!KNOB_SINGLE_THREADED) ++ { ++ memset(&pContext->WaitLock, 0, sizeof(pContext->WaitLock)); ++ memset(&pContext->FifosNotEmpty, 0, sizeof(pContext->FifosNotEmpty)); ++ new (&pContext->WaitLock) std::mutex(); ++ new (&pContext->FifosNotEmpty) std::condition_variable(); ++ ++ CreateThreadPool(pContext, &pContext->threadPool); ++ } ++ ++ // Calling createThreadPool() above can set SINGLE_THREADED ++ if (KNOB_SINGLE_THREADED) ++ { ++ pContext->NumWorkerThreads = 1; ++ } ++ ++ // Allocate scratch space for workers. ++ ///@note We could lazily allocate this but its rather small amount of memory. ++ for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i) ++ { ++ ///@todo Use numa API for allocations using numa information from thread data (if exists). 
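// (A note on the sizes used just below: each worker thread receives a
// private 32KB scratch allocation, aligned to one SIMD register, that is
// KNOB_SIMD_WIDTH floats of 4 bytes each, i.e. 8 * 4 = 32 bytes on an
// 8-wide AVX build. The 8-wide figure is an illustrative assumption about
// the AVX knobs, not something this patch pins down.)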
++ pContext->pScratch[i] = (uint8_t*)_aligned_malloc((32 * 1024), KNOB_SIMD_WIDTH * 4); ++ } ++ ++ pContext->LastRetiredId = 0; ++ pContext->nextDrawId = 1; ++ ++ // workers start at draw 1 ++ for (uint32_t i = 0; i < KNOB_MAX_NUM_THREADS; ++i) ++ { ++ pContext->WorkerFE[i] = 1; ++ pContext->WorkerBE[i] = 1; ++ } ++ ++ pContext->DrawEnqueued = 1; ++ ++ // State setup AFTER context is fully initialized ++ SetupDefaultState(pContext); ++ ++ // initialize hot tile manager ++ pContext->pHotTileMgr = new HotTileMgr(); ++ ++ // initialize function pointer tables ++ InitClearTilesTable(); ++ ++ // initialize store tiles function ++ pContext->pfnLoadTile = pCreateInfo->pfnLoadTile; ++ pContext->pfnStoreTile = pCreateInfo->pfnStoreTile; ++ pContext->pfnClearTile = pCreateInfo->pfnClearTile; ++ ++ return (HANDLE)pContext; ++} ++ ++void SwrDestroyContext(HANDLE hContext) ++{ ++ SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext; ++ DestroyThreadPool(pContext, &pContext->threadPool); ++ ++ // free the fifos ++ for (uint32_t i = 0; i < KNOB_MAX_DRAWS_IN_FLIGHT; ++i) ++ { ++ delete(pContext->dcRing[i].pTileMgr); ++ delete(pContext->dcRing[i].pDispatch); ++ } ++ ++ // Free scratch space. ++ for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i) ++ { ++ _aligned_free(pContext->pScratch[i]); ++ } ++ ++ _aligned_free(pContext->dcRing); ++ _aligned_free(pContext->dsRing); ++ ++ delete(pContext->pHotTileMgr); ++ ++ pContext->~SWR_CONTEXT(); ++ _aligned_free((SWR_CONTEXT*)hContext); ++} ++ ++void WakeAllThreads(SWR_CONTEXT *pContext) ++{ ++ std::unique_lock lock(pContext->WaitLock); ++ pContext->FifosNotEmpty.notify_all(); ++ lock.unlock(); ++} ++ ++bool StillDrawing(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC) ++{ ++ // For single thread nothing should still be drawing. ++ if (KNOB_SINGLE_THREADED) { return false; } ++ ++ if (pDC->isCompute) ++ { ++ if (pDC->doneCompute) ++ { ++ pDC->inUse = false; ++ return false; ++ } ++ } ++ ++ // Check if backend work is done. First make sure all triangles have been binned. ++ if (pDC->doneFE == true) ++ { ++ // ensure workers have all moved passed this draw ++ for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i) ++ { ++ if (pContext->WorkerFE[i] <= pDC->drawId) ++ { ++ return true; ++ } ++ ++ if (pContext->WorkerBE[i] <= pDC->drawId) ++ { ++ return true; ++ } ++ } ++ ++ pDC->inUse = false; // all work is done. ++ } ++ ++ return pDC->inUse; ++} ++ ++void UpdateLastRetiredId(SWR_CONTEXT *pContext) ++{ ++ uint64_t head = pContext->LastRetiredId + 1; ++ uint64_t tail = pContext->DrawEnqueued; ++ ++ // There's no guarantee the DRAW_CONTEXT associated with (LastRetiredId+1) is still valid. ++ // This is because the update to LastRetiredId can fall behind causing the range from LastRetiredId ++ // to DrawEnqueued to exceed the size of the DRAW_CONTEXT ring. 
Check for this and manually increment ++ // the head to the oldest entry of the DRAW_CONTEXT ring ++ if ((tail - head) > KNOB_MAX_DRAWS_IN_FLIGHT - 1) ++ { ++ head = tail - KNOB_MAX_DRAWS_IN_FLIGHT + 1; ++ } ++ ++ DRAW_CONTEXT *pDC = &pContext->dcRing[head % KNOB_MAX_DRAWS_IN_FLIGHT]; ++ while ((head < tail) && !StillDrawing(pContext, pDC)) ++ { ++ pContext->LastRetiredId = pDC->drawId; ++ head++; ++ pDC = &pContext->dcRing[head % KNOB_MAX_DRAWS_IN_FLIGHT]; ++ } ++} ++ ++void WaitForDependencies(SWR_CONTEXT *pContext, uint64_t drawId) ++{ ++ if (!KNOB_SINGLE_THREADED) ++ { ++ while (drawId > pContext->LastRetiredId) ++ { ++ WakeAllThreads(pContext); ++ UpdateLastRetiredId(pContext); ++ } ++ } ++} ++ ++void CopyState(DRAW_STATE& dst, const DRAW_STATE& src) ++{ ++ memcpy(&dst.state, &src.state, sizeof(API_STATE)); ++} ++ ++void QueueDraw(SWR_CONTEXT *pContext) ++{ ++ _ReadWriteBarrier(); ++ pContext->DrawEnqueued ++; ++ ++ if (KNOB_SINGLE_THREADED) ++ { ++ // flush denormals to 0 ++ uint32_t mxcsr = _mm_getcsr(); ++ _mm_setcsr(mxcsr | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON); ++ ++ std::unordered_set lockedTiles; ++ WorkOnFifoFE(pContext, 0, pContext->WorkerFE[0], 0); ++ WorkOnFifoBE(pContext, 0, pContext->WorkerBE[0], lockedTiles); ++ ++ // restore csr ++ _mm_setcsr(mxcsr); ++ } ++ else ++ { ++ RDTSC_START(APIDrawWakeAllThreads); ++ WakeAllThreads(pContext); ++ RDTSC_STOP(APIDrawWakeAllThreads, 1, 0); ++ } ++ ++ // Set current draw context to NULL so that next state call forces a new draw context to be created and populated. ++ pContext->pPrevDrawContext = pContext->pCurDrawContext; ++ pContext->pCurDrawContext = nullptr; ++} ++ ++///@todo Combine this with QueueDraw ++void QueueDispatch(SWR_CONTEXT *pContext) ++{ ++ _ReadWriteBarrier(); ++ pContext->DrawEnqueued++; ++ ++ if (KNOB_SINGLE_THREADED) ++ { ++ // flush denormals to 0 ++ uint32_t mxcsr = _mm_getcsr(); ++ _mm_setcsr(mxcsr | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON); ++ ++ WorkOnCompute(pContext, 0, pContext->WorkerBE[0]); ++ ++ // restore csr ++ _mm_setcsr(mxcsr); ++ } ++ else ++ { ++ RDTSC_START(APIDrawWakeAllThreads); ++ WakeAllThreads(pContext); ++ RDTSC_STOP(APIDrawWakeAllThreads, 1, 0); ++ } ++ ++ // Set current draw context to NULL so that next state call forces a new draw context to be created and populated. ++ pContext->pPrevDrawContext = pContext->pCurDrawContext; ++ pContext->pCurDrawContext = nullptr; ++} ++ ++DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false) ++{ ++ RDTSC_START(APIGetDrawContext); ++ // If current draw context is null then need to obtain a new draw context to use from ring. ++ if (pContext->pCurDrawContext == nullptr) ++ { ++ uint32_t dcIndex = pContext->nextDrawId % KNOB_MAX_DRAWS_IN_FLIGHT; ++ ++ DRAW_CONTEXT* pCurDrawContext = &pContext->dcRing[dcIndex]; ++ pContext->pCurDrawContext = pCurDrawContext; ++ ++ // Update LastRetiredId ++ UpdateLastRetiredId(pContext); ++ ++ // Need to wait until this draw context is available to use. ++ while (StillDrawing(pContext, pCurDrawContext)) ++ { ++ // Make sure workers are working. ++ WakeAllThreads(pContext); ++ ++ _mm_pause(); ++ } ++ ++ // Assign next available entry in DS ring to this DC. ++ uint32_t dsIndex = pContext->curStateId % KNOB_MAX_DRAWS_IN_FLIGHT; ++ pCurDrawContext->pState = &pContext->dsRing[dsIndex]; ++ ++ Arena& stateArena = pCurDrawContext->pState->arena; ++ ++ // Copy previous state to current state. 
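// (In other words, API state is snapshotted per draw: a new draw memcpys
// the previous snapshot into a fresh DS ring slot and advances curStateId,
// while the split pieces of one logical draw all alias a single DS slot.
// As a worked example: if draw 7 landed in DS slot k and gets split by
// MaxVertsPerDraw, every piece of draw 7 reads slot k, and the next
// API-visible draw 8 is seeded by copying slot k into slot k+1.)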
++ if (pContext->pPrevDrawContext) ++ { ++ DRAW_CONTEXT* pPrevDrawContext = pContext->pPrevDrawContext; ++ ++ // If we're splitting our draw then we can just use the same state from the previous ++ // draw. In this case, we won't increment the DS ring index so the next non-split ++ // draw can receive the state. ++ if (isSplitDraw == false) ++ { ++ CopyState(*pCurDrawContext->pState, *pPrevDrawContext->pState); ++ ++ stateArena.Reset(); // Reset memory. ++ ++ // Copy private state to new context. ++ if (pPrevDrawContext->pState->pPrivateState != nullptr) ++ { ++ pCurDrawContext->pState->pPrivateState = stateArena.AllocAligned(pContext->privateStateSize, KNOB_SIMD_WIDTH*sizeof(float)); ++ memcpy(pCurDrawContext->pState->pPrivateState, pPrevDrawContext->pState->pPrivateState, pContext->privateStateSize); ++ } ++ ++ pContext->curStateId++; // Progress state ring index forward. ++ } ++ else ++ { ++ // If its a split draw then just copy the state pointer over ++ // since its the same draw. ++ pCurDrawContext->pState = pPrevDrawContext->pState; ++ } ++ } ++ else ++ { ++ stateArena.Reset(); // Reset memory. ++ pContext->curStateId++; // Progress state ring index forward. ++ } ++ ++ pCurDrawContext->dependency = 0; ++ pCurDrawContext->arena.Reset(); ++ pCurDrawContext->pContext = pContext; ++ pCurDrawContext->isCompute = false; // Dispatch has to set this to true. ++ pCurDrawContext->inUse = false; ++ ++ pCurDrawContext->doneCompute = false; ++ pCurDrawContext->doneFE = false; ++ pCurDrawContext->FeLock = 0; ++ ++ pCurDrawContext->pTileMgr->initialize(); ++ ++ // Assign unique drawId for this DC ++ pCurDrawContext->drawId = pContext->nextDrawId++; ++ } ++ else ++ { ++ SWR_ASSERT(isSplitDraw == false, "Split draw should only be used when obtaining a new DC"); ++ } ++ ++ RDTSC_STOP(APIGetDrawContext, 0, 0); ++ return pContext->pCurDrawContext; ++} ++ ++API_STATE* GetDrawState(SWR_CONTEXT *pContext) ++{ ++ DRAW_CONTEXT* pDC = GetDrawContext(pContext); ++ SWR_ASSERT(pDC->pState != nullptr); ++ ++ return &pDC->pState->state; ++} ++ ++void SetupDefaultState(SWR_CONTEXT *pContext) ++{ ++ API_STATE* pState = GetDrawState(pContext); ++ ++ pState->rastState.cullMode = SWR_CULLMODE_NONE; ++ pState->rastState.frontWinding = SWR_FRONTWINDING_CCW; ++} ++ ++static INLINE SWR_CONTEXT* GetContext(HANDLE hContext) ++{ ++ return (SWR_CONTEXT*)hContext; ++} ++ ++void SwrSync(HANDLE hContext, PFN_CALLBACK_FUNC pfnFunc, uint64_t userData, uint64_t userData2) ++{ ++ RDTSC_START(APISync); ++ ++ SWR_CONTEXT *pContext = GetContext(hContext); ++ DRAW_CONTEXT* pDC = GetDrawContext(pContext); ++ ++ pDC->inUse = true; ++ ++ pDC->FeWork.type = SYNC; ++ pDC->FeWork.pfnWork = ProcessSync; ++ pDC->FeWork.desc.sync.pfnCallbackFunc = pfnFunc; ++ pDC->FeWork.desc.sync.userData = userData; ++ pDC->FeWork.desc.sync.userData2 = userData2; ++ ++ // cannot execute until all previous draws have completed ++ pDC->dependency = pDC->drawId - 1; ++ ++ //enqueue ++ QueueDraw(pContext); ++ ++ RDTSC_STOP(APISync, 1, 0); ++} ++ ++void SwrWaitForIdle(HANDLE hContext) ++{ ++ SWR_CONTEXT *pContext = GetContext(hContext); ++ ++ // Wait on the previous DrawContext's drawId, as this function doesn't queue anything. 
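// (This wait is a busy poll: WaitForDependencies() above spins, waking the
// worker threads and refreshing LastRetiredId until the target drawId
// retires. One client pattern this enables, as a sketch: SwrStoreTiles(...)
// to flush the hot tiles, then SwrWaitForIdle(...) before reading the
// destination surface on the CPU.)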
++ if (pContext->pPrevDrawContext) ++ WaitForDependencies(pContext, pContext->pPrevDrawContext->drawId); ++} ++ ++void SwrSetVertexBuffers( ++ HANDLE hContext, ++ uint32_t numBuffers, ++ const SWR_VERTEX_BUFFER_STATE* pVertexBuffers) ++{ ++ API_STATE* pState = GetDrawState(GetContext(hContext)); ++ ++ for (uint32_t i = 0; i < numBuffers; ++i) ++ { ++ const SWR_VERTEX_BUFFER_STATE *pVB = &pVertexBuffers[i]; ++ pState->vertexBuffers[pVB->index] = *pVB; ++ } ++} ++ ++void SwrSetIndexBuffer( ++ HANDLE hContext, ++ const SWR_INDEX_BUFFER_STATE* pIndexBuffer) ++{ ++ API_STATE* pState = GetDrawState(GetContext(hContext)); ++ ++ pState->indexBuffer = *pIndexBuffer; ++} ++ ++void SwrSetFetchFunc( ++ HANDLE hContext, ++ PFN_FETCH_FUNC pfnFetchFunc) ++{ ++ API_STATE* pState = GetDrawState(GetContext(hContext)); ++ ++ pState->pfnFetchFunc = pfnFetchFunc; ++} ++ ++void SwrSetSoFunc( ++ HANDLE hContext, ++ PFN_SO_FUNC pfnSoFunc, ++ uint32_t streamIndex) ++{ ++ API_STATE* pState = GetDrawState(GetContext(hContext)); ++ ++ SWR_ASSERT(streamIndex < MAX_SO_STREAMS); ++ ++ pState->pfnSoFunc[streamIndex] = pfnSoFunc; ++} ++ ++void SwrSetSoState( ++ HANDLE hContext, ++ SWR_STREAMOUT_STATE* pSoState) ++{ ++ API_STATE* pState = GetDrawState(GetContext(hContext)); ++ ++ pState->soState = *pSoState; ++} ++ ++void SwrSetSoBuffers( ++ HANDLE hContext, ++ SWR_STREAMOUT_BUFFER* pSoBuffer, ++ uint32_t slot) ++{ ++ API_STATE* pState = GetDrawState(GetContext(hContext)); ++ ++ SWR_ASSERT((slot < 4), "There are only 4 SO buffer slots [0, 3]\nSlot requested: %d", slot); ++ ++ pState->soBuffer[slot] = *pSoBuffer; ++} ++ ++void SwrSetVertexFunc( ++ HANDLE hContext, ++ PFN_VERTEX_FUNC pfnVertexFunc) ++{ ++ API_STATE* pState = GetDrawState(GetContext(hContext)); ++ ++ pState->pfnVertexFunc = pfnVertexFunc; ++} ++ ++void SwrSetFrontendState( ++ HANDLE hContext, ++ SWR_FRONTEND_STATE *pFEState) ++{ ++ API_STATE* pState = GetDrawState(GetContext(hContext)); ++ pState->frontendState = *pFEState; ++} ++ ++void SwrSetGsState( ++ HANDLE hContext, ++ SWR_GS_STATE *pGSState) ++{ ++ API_STATE* pState = GetDrawState(GetContext(hContext)); ++ pState->gsState = *pGSState; ++} ++ ++void SwrSetGsFunc( ++ HANDLE hContext, ++ PFN_GS_FUNC pfnGsFunc) ++{ ++ API_STATE* pState = GetDrawState(GetContext(hContext)); ++ pState->pfnGsFunc = pfnGsFunc; ++} ++ ++void SwrSetCsFunc( ++ HANDLE hContext, ++ PFN_CS_FUNC pfnCsFunc, ++ uint32_t totalThreadsInGroup) ++{ ++ API_STATE* pState = GetDrawState(GetContext(hContext)); ++ pState->pfnCsFunc = pfnCsFunc; ++ pState->totalThreadsInGroup = totalThreadsInGroup; ++} ++ ++void SwrSetTsState( ++ HANDLE hContext, ++ SWR_TS_STATE *pState) ++{ ++ API_STATE* pApiState = GetDrawState(GetContext(hContext)); ++ pApiState->tsState = *pState; ++} ++ ++void SwrSetHsFunc( ++ HANDLE hContext, ++ PFN_HS_FUNC pfnFunc) ++{ ++ API_STATE* pApiState = GetDrawState(GetContext(hContext)); ++ pApiState->pfnHsFunc = pfnFunc; ++} ++ ++void SwrSetDsFunc( ++ HANDLE hContext, ++ PFN_DS_FUNC pfnFunc) ++{ ++ API_STATE* pApiState = GetDrawState(GetContext(hContext)); ++ pApiState->pfnDsFunc = pfnFunc; ++} ++ ++void SwrSetDepthStencilState( ++ HANDLE hContext, ++ SWR_DEPTH_STENCIL_STATE *pDSState) ++{ ++ API_STATE* pState = GetDrawState(GetContext(hContext)); ++ ++ pState->depthStencilState = *pDSState; ++} ++ ++void SwrSetBackendState( ++ HANDLE hContext, ++ SWR_BACKEND_STATE *pBEState) ++{ ++ API_STATE* pState = GetDrawState(GetContext(hContext)); ++ ++ pState->backendState = *pBEState; ++} ++ ++void SwrSetPixelShaderState( ++ HANDLE 
hContext, ++ SWR_PS_STATE *pPSState) ++{ ++ API_STATE *pState = GetDrawState(GetContext(hContext)); ++ pState->psState = *pPSState; ++} ++ ++void SwrSetBlendState( ++ HANDLE hContext, ++ SWR_BLEND_STATE *pBlendState) ++{ ++ API_STATE *pState = GetDrawState(GetContext(hContext)); ++ memcpy(&pState->blendState, pBlendState, sizeof(SWR_BLEND_STATE)); ++} ++ ++void SwrSetBlendFunc( ++ HANDLE hContext, ++ uint32_t renderTarget, ++ PFN_BLEND_JIT_FUNC pfnBlendFunc) ++{ ++ SWR_ASSERT(renderTarget < SWR_NUM_RENDERTARGETS); ++ API_STATE *pState = GetDrawState(GetContext(hContext)); ++ pState->pfnBlendFunc[renderTarget] = pfnBlendFunc; ++} ++ ++void SwrSetLinkage( ++ HANDLE hContext, ++ uint32_t mask, ++ const uint8_t* pMap) ++{ ++ API_STATE* pState = GetDrawState(GetContext(hContext)); ++ ++ static const uint8_t IDENTITY_MAP[] = ++ { ++ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ++ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, ++ }; ++ static_assert(sizeof(IDENTITY_MAP) == sizeof(pState->linkageMap), ++ "Update for new value of MAX_ATTRIBUTES"); ++ ++ pState->linkageMask = mask; ++ pState->linkageCount = _mm_popcnt_u32(mask); ++ ++ if (!pMap) ++ { ++ pMap = IDENTITY_MAP; ++ } ++ memcpy(pState->linkageMap, pMap, pState->linkageCount); ++} ++ ++// update guardband multipliers for the viewport ++void updateGuardband(API_STATE *pState) ++{ ++ // guardband center is viewport center ++ pState->gbState.left = KNOB_GUARDBAND_WIDTH / pState->vp[0].width; ++ pState->gbState.right = KNOB_GUARDBAND_WIDTH / pState->vp[0].width; ++ pState->gbState.top = KNOB_GUARDBAND_HEIGHT / pState->vp[0].height; ++ pState->gbState.bottom = KNOB_GUARDBAND_HEIGHT / pState->vp[0].height; ++} ++ ++void SwrSetRastState( ++ HANDLE hContext, ++ const SWR_RASTSTATE *pRastState) ++{ ++ SWR_CONTEXT *pContext = GetContext(hContext); ++ API_STATE* pState = GetDrawState(pContext); ++ ++ memcpy(&pState->rastState, pRastState, sizeof(SWR_RASTSTATE)); ++} ++ ++void SwrSetViewports( ++ HANDLE hContext, ++ uint32_t numViewports, ++ const SWR_VIEWPORT* pViewports, ++ const SWR_VIEWPORT_MATRIX* pMatrices) ++{ ++ SWR_ASSERT(numViewports <= KNOB_NUM_VIEWPORTS_SCISSORS, ++ "Invalid number of viewports."); ++ ++ SWR_CONTEXT *pContext = GetContext(hContext); ++ API_STATE* pState = GetDrawState(pContext); ++ ++ memcpy(&pState->vp[0], pViewports, sizeof(SWR_VIEWPORT) * numViewports); ++ ++ if (pMatrices != nullptr) ++ { ++ memcpy(&pState->vpMatrix[0], pMatrices, sizeof(SWR_VIEWPORT_MATRIX) * numViewports); ++ } ++ else ++ { ++ // Compute default viewport transform. ++ for (uint32_t i = 0; i < numViewports; ++i) ++ { ++ if (pContext->driverType == DX) ++ { ++ pState->vpMatrix[i].m00 = pState->vp[i].width / 2.0f; ++ pState->vpMatrix[i].m11 = -pState->vp[i].height / 2.0f; ++ pState->vpMatrix[i].m22 = pState->vp[i].maxZ - pState->vp[i].minZ; ++ pState->vpMatrix[i].m30 = pState->vp[i].x + pState->vpMatrix[i].m00; ++ pState->vpMatrix[i].m31 = pState->vp[i].y - pState->vpMatrix[i].m11; ++ pState->vpMatrix[i].m32 = pState->vp[i].minZ; ++ } ++ else ++ { ++ // Standard, with the exception that Y is inverted. 
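// A worked instance of the formulas below, assuming a viewport of
// vp = {x=0, y=0, width=1024, height=768, minZ=0, maxZ=1}:
//
//   m00 = (1024 - 0) / 2 = 512      m30 = 0 + 512   = 512
//   m11 = (0 - 768) / 2  = -384     m31 = 768 - 384 = 384
//   m22 = (1 - 0) / 2    = 0.5      m32 = 0 + 0.5   = 0.5
//
// NDC (-1,-1,-1) then maps to window (0, 768, 0) and NDC (+1,+1,+1) to
// (1024, 0, 1): GL's +Y (up) lands on row 0 of the top-down raster grid,
// which is exactly the Y inversion noted above.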
++ pState->vpMatrix[i].m00 = (pState->vp[i].width - pState->vp[i].x) / 2.0f; ++ pState->vpMatrix[i].m11 = (pState->vp[i].y - pState->vp[i].height) / 2.0f; ++ pState->vpMatrix[i].m22 = (pState->vp[i].maxZ - pState->vp[i].minZ) / 2.0f; ++ pState->vpMatrix[i].m30 = pState->vp[i].x + pState->vpMatrix[i].m00; ++ pState->vpMatrix[i].m31 = pState->vp[i].height + pState->vpMatrix[i].m11; ++ pState->vpMatrix[i].m32 = pState->vp[i].minZ + pState->vpMatrix[i].m22; ++ ++ // Now that the matrix is calculated, clip the view coords to screen size. ++ // OpenGL allows for -ve x,y in the viewport. ++ pState->vp[i].x = std::max(pState->vp[i].x, 0.0f); ++ pState->vp[i].y = std::max(pState->vp[i].y, 0.0f); ++ } ++ } ++ } ++ ++ updateGuardband(pState); ++} ++ ++void SwrSetScissorRects( ++ HANDLE hContext, ++ uint32_t numScissors, ++ const BBOX* pScissors) ++{ ++ SWR_ASSERT(numScissors <= KNOB_NUM_VIEWPORTS_SCISSORS, ++ "Invalid number of scissor rects."); ++ ++ API_STATE* pState = GetDrawState(GetContext(hContext)); ++ memcpy(&pState->scissorRects[0], pScissors, numScissors * sizeof(BBOX)); ++}; ++ ++void SetupMacroTileScissors(DRAW_CONTEXT *pDC) ++{ ++ API_STATE *pState = &pDC->pState->state; ++ uint32_t left, right, top, bottom; ++ ++ // Set up scissor dimensions based on scissor or viewport ++ if (pState->rastState.scissorEnable) ++ { ++ // scissor rect right/bottom edge are exclusive, core expects scissor dimensions to be inclusive, so subtract one pixel from right/bottom edges ++ left = pState->scissorRects[0].left; ++ right = pState->scissorRects[0].right; ++ top = pState->scissorRects[0].top; ++ bottom = pState->scissorRects[0].bottom; ++ } ++ else ++ { ++ left = (int32_t)pState->vp[0].x; ++ right = (int32_t)pState->vp[0].x + (int32_t)pState->vp[0].width; ++ top = (int32_t)pState->vp[0].y; ++ bottom = (int32_t)pState->vp[0].y + (int32_t)pState->vp[0].height; ++ } ++ ++ pState->scissorInFixedPoint.left = left * FIXED_POINT_SCALE; ++ pState->scissorInFixedPoint.right = right * FIXED_POINT_SCALE - 1; ++ pState->scissorInFixedPoint.top = top * FIXED_POINT_SCALE; ++ pState->scissorInFixedPoint.bottom = bottom * FIXED_POINT_SCALE - 1; ++} ++ ++void SetupPipeline(DRAW_CONTEXT *pDC) ++{ ++ DRAW_STATE* pState = pDC->pState; ++ ++ // setup backend ++ if (pState->state.psState.pfnPixelShader == nullptr) ++ { ++ pState->pfnBackend = &BackendNullPS; ++ } ++ else ++ { ++ bool bMultisampleEnable = (pState->state.rastState.sampleCount > SWR_MULTISAMPLE_1X) ? 
1 : 0; ++ ++ // select backend function based on max slot used by PS ++ switch(pState->state.psState.shadingRate) ++ { ++ case SWR_SHADING_RATE_PIXEL: ++ if(bMultisampleEnable) ++ { ++ pState->pfnBackend = gPixelRateBackendTable[pState->state.rastState.sampleCount-1][pState->state.psState.maxRTSlotUsed]; ++ } ++ else ++ { ++ pState->pfnBackend = gSingleSampleBackendTable[pState->state.psState.maxRTSlotUsed]; ++ } ++ break; ++ case SWR_SHADING_RATE_SAMPLE: ++ ///@todo Do we need to obey sample rate ++ if (!bMultisampleEnable) ++ { ++ // If PS is set at per sample rate and multisampling is disabled, set to per pixel and single sample backend ++ pState->state.psState.shadingRate = SWR_SHADING_RATE_PIXEL; ++ pState->pfnBackend = gSingleSampleBackendTable[pState->state.psState.maxRTSlotUsed]; ++ } ++ else ++ { ++ pState->pfnBackend = gSampleRateBackendTable[pState->state.rastState.sampleCount-1][pState->state.psState.maxRTSlotUsed]; ++ } ++ break; ++ case SWR_SHADING_RATE_COARSE: ++ default: ++ assert(0 && "Invalid shading rate"); ++ break; ++ } ++ } ++ ++ PFN_PROCESS_PRIMS pfnBinner; ++ switch (pState->state.topology) ++ { ++ case TOP_POINT_LIST: ++ pState->pfnProcessPrims = CanUseSimplePoints(pDC) ? ClipPoints : ClipTriangles; ++ pfnBinner = CanUseSimplePoints(pDC) ? BinPoints : BinTriangles; ++ break; ++ case TOP_LINE_LIST: ++ case TOP_LINE_STRIP: ++ case TOP_LINE_LOOP: ++ case TOP_LINE_LIST_ADJ: ++ case TOP_LISTSTRIP_ADJ: ++ pState->pfnProcessPrims = ClipLines; ++ pfnBinner = BinLines; ++ break; ++ default: ++ pState->pfnProcessPrims = ClipTriangles; ++ pfnBinner = BinTriangles; ++ break; ++ }; ++ ++ // disable clipper if viewport transform is disabled ++ if (pState->state.frontendState.vpTransformDisable) ++ { ++ pState->pfnProcessPrims = pfnBinner; ++ } ++ ++ if ((pState->state.psState.pfnPixelShader == nullptr) && ++ (pState->state.depthStencilState.depthTestEnable == FALSE) && ++ (pState->state.depthStencilState.depthWriteEnable == FALSE) && ++ (pState->state.linkageCount == 0)) ++ { ++ pState->pfnProcessPrims = nullptr; ++ pState->state.linkageMask = 0; ++ } ++ ++ if (pState->state.soState.rasterizerDisable == true) ++ { ++ pState->pfnProcessPrims = nullptr; ++ pState->state.linkageMask = 0; ++ } ++ ++ // set up the frontend attrib mask ++ pState->state.feAttribMask = pState->state.linkageMask; ++ if (pState->state.soState.soEnable) ++ { ++ for (uint32_t i = 0; i < 4; ++i) ++ { ++ pState->state.feAttribMask |= pState->state.soState.streamMasks[i]; ++ } ++ } ++} ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief InitDraw ++/// @param pDC - Draw context to initialize for this draw. ++void InitDraw( ++ DRAW_CONTEXT *pDC, ++ bool isSplitDraw) ++{ ++ // We don't need to re-setup the scissors/pipeline state again for split draw. ++ if (isSplitDraw == false) ++ { ++ SetupMacroTileScissors(pDC); ++ SetupPipeline(pDC); ++ } ++ ++ pDC->inUse = true; // We are using this one now. 
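// (The first block below converts the rasterizer's fixed-point sample
// positions to floats once per draw. The name fixed8Scale suggests 8
// fractional bits; assuming FIXED_POINT_SCALE == 256, a value this patch
// does not show, an iSamplePos[i].x of 128 becomes the half-pixel offset
// 0.5f.)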
++
++    /// @todo: remove when we send down preset sample patterns (standard or center)
++    // If multisampling is enabled, precompute float sample offsets from fixed
++    uint32_t numSamples = pDC->pState->state.rastState.sampleCount;
++    if (numSamples > SWR_MULTISAMPLE_1X)
++    {
++        static const float fixed8Scale = 1.0f / FIXED_POINT_SCALE;
++        float* pSamplePos = pDC->pState->state.samplePos;
++        SWR_MULTISAMPLE_POS (&iSamplePos)[SWR_MAX_NUM_MULTISAMPLES] = pDC->pState->state.rastState.iSamplePos;
++
++        for (uint32_t i = 0; i < numSamples; i++)
++        {
++            *(pSamplePos++) = ((float)(iSamplePos[i].x) * fixed8Scale);
++            *(pSamplePos++) = ((float)(iSamplePos[i].y) * fixed8Scale);
++        }
++    }
++
++    // Just test the masked-off samples once per draw and use the results in the backend.
++    SWR_RASTSTATE &rastState = pDC->pState->state.rastState;
++    uint32_t sampleMask = rastState.sampleMask;
++    for (uint32_t i = 0; i < SWR_MAX_NUM_MULTISAMPLES; i++)
++    {
++        rastState.isSampleMasked[i] = !(sampleMask & 1);
++        sampleMask >>= 1;
++    }
++}
++
++//////////////////////////////////////////////////////////////////////////
++/// @brief We can split the draw for certain topologies for better performance.
++/// @param totalVerts - Total vertices for draw
++/// @param topology - Topology used for draw
++uint32_t MaxVertsPerDraw(
++    DRAW_CONTEXT* pDC,
++    uint32_t totalVerts,
++    PRIMITIVE_TOPOLOGY topology)
++{
++    API_STATE& state = pDC->pState->state;
++
++    uint32_t vertsPerDraw = totalVerts;
++
++    // Don't split draws with streamout enabled.
++    if (state.soState.soEnable)
++    {
++        return totalVerts;
++    }
++
++    switch (topology)
++    {
++    case TOP_POINT_LIST:
++    case TOP_TRIANGLE_LIST:
++        vertsPerDraw = KNOB_MAX_PRIMS_PER_DRAW;
++        break;
++
++    case TOP_PATCHLIST_1:
++    case TOP_PATCHLIST_2:
++    case TOP_PATCHLIST_3:
++    case TOP_PATCHLIST_4:
++    case TOP_PATCHLIST_5:
++    case TOP_PATCHLIST_6:
++    case TOP_PATCHLIST_7:
++    case TOP_PATCHLIST_8:
++    case TOP_PATCHLIST_9:
++    case TOP_PATCHLIST_10:
++    case TOP_PATCHLIST_11:
++    case TOP_PATCHLIST_12:
++    case TOP_PATCHLIST_13:
++    case TOP_PATCHLIST_14:
++    case TOP_PATCHLIST_15:
++    case TOP_PATCHLIST_16:
++    case TOP_PATCHLIST_17:
++    case TOP_PATCHLIST_18:
++    case TOP_PATCHLIST_19:
++    case TOP_PATCHLIST_20:
++    case TOP_PATCHLIST_21:
++    case TOP_PATCHLIST_22:
++    case TOP_PATCHLIST_23:
++    case TOP_PATCHLIST_24:
++    case TOP_PATCHLIST_25:
++    case TOP_PATCHLIST_26:
++    case TOP_PATCHLIST_27:
++    case TOP_PATCHLIST_28:
++    case TOP_PATCHLIST_29:
++    case TOP_PATCHLIST_30:
++    case TOP_PATCHLIST_31:
++    case TOP_PATCHLIST_32:
++        if (pDC->pState->state.tsState.tsEnable)
++        {
++            uint32_t vertsPerPrim = topology - TOP_PATCHLIST_BASE;
++            vertsPerDraw = vertsPerPrim * KNOB_MAX_TESS_PRIMS_PER_DRAW;
++        }
++        break;
++
++    default:
++        // We are not splitting up draws for other topologies.
++        break;
++    }
++
++    return vertsPerDraw;
++}
++
++// Recursive template used to auto-nest conditionals. Converts dynamic boolean function
++// arguments to static template arguments.
++template <bool... ArgsB>
++struct FEDrawChooser
++{
++    // Last Arg Terminator
++    static PFN_FE_WORK_FUNC GetFunc(bool bArg)
++    {
++        if (bArg)
++        {
++            return ProcessDraw<ArgsB..., true>;
++        }
++
++        return ProcessDraw<ArgsB..., false>;
++    }
++
++    // Recursively parse args
++    template <typename... TArgsT>
++    static PFN_FE_WORK_FUNC GetFunc(bool bArg, TArgsT... remainingArgs)
++    {
++        if (bArg)
++        {
++            return FEDrawChooser<ArgsB..., true>::GetFunc(remainingArgs...);
++        }
++
++        return FEDrawChooser<ArgsB..., false>::GetFunc(remainingArgs...);
++    }
++};
++
++// Selector for correct templated Draw front-end function
++INLINE
++static PFN_FE_WORK_FUNC GetFEDrawFunc(bool IsIndexed, bool HasTessellation, bool HasGeometryShader, bool HasStreamOut, bool RasterizerEnabled)
++{
++    return FEDrawChooser<>::GetFunc(IsIndexed, HasTessellation, HasGeometryShader, HasStreamOut, RasterizerEnabled);
++}
++
++
++//////////////////////////////////////////////////////////////////////////
++/// @brief DrawInstanced
++/// @param hContext - Handle passed back from SwrCreateContext
++/// @param topology - Specifies topology for draw.
++/// @param numVertices - How many vertices to read sequentially from vertex data (per instance).
++/// @param startVertex - Specifies start vertex for draw. (vertex data)
++/// @param numInstances - How many instances to render.
++/// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
++void DrawInstanced(
++    HANDLE hContext,
++    PRIMITIVE_TOPOLOGY topology,
++    uint32_t numVertices,
++    uint32_t startVertex,
++    uint32_t numInstances = 1,
++    uint32_t startInstance = 0)
++{
++    RDTSC_START(APIDraw);
++
++#if KNOB_ENABLE_TOSS_POINTS
++    if (KNOB_TOSS_DRAW)
++    {
++        return;
++    }
++#endif
++
++    SWR_CONTEXT *pContext = GetContext(hContext);
++    DRAW_CONTEXT* pDC = GetDrawContext(pContext);
++
++    int32_t maxVertsPerDraw = MaxVertsPerDraw(pDC, numVertices, topology);
++    uint32_t primsPerDraw = GetNumPrims(topology, maxVertsPerDraw);
++    int32_t remainingVerts = numVertices;
++
++    API_STATE *pState = &pDC->pState->state;
++    pState->topology = topology;
++    pState->forceFront = false;
++
++    // disable culling for points/lines
++    uint32_t oldCullMode = pState->rastState.cullMode;
++    if (topology == TOP_POINT_LIST)
++    {
++        pState->rastState.cullMode = SWR_CULLMODE_NONE;
++        pState->forceFront = true;
++    }
++
++    int draw = 0;
++    while (remainingVerts)
++    {
++        uint32_t numVertsForDraw = (remainingVerts < maxVertsPerDraw) ?
++            remainingVerts : maxVertsPerDraw;
++
++        bool isSplitDraw = (draw > 0);
++        DRAW_CONTEXT* pDC = GetDrawContext(pContext, isSplitDraw);
++        InitDraw(pDC, isSplitDraw);
++
++        pDC->FeWork.type = DRAW;
++        pDC->FeWork.pfnWork = GetFEDrawFunc(
++            false, // IsIndexed
++            pState->tsState.tsEnable,
++            pState->gsState.gsEnable,
++            pState->soState.soEnable,
++            pDC->pState->pfnProcessPrims != nullptr);
++        pDC->FeWork.desc.draw.numVerts = numVertsForDraw;
++        pDC->FeWork.desc.draw.startVertex = startVertex + draw * maxVertsPerDraw;
++        pDC->FeWork.desc.draw.numInstances = numInstances;
++        pDC->FeWork.desc.draw.startInstance = startInstance;
++        pDC->FeWork.desc.draw.startPrimID = draw * primsPerDraw;
++
++        //enqueue DC
++        QueueDraw(pContext);
++
++        remainingVerts -= numVertsForDraw;
++        draw++;
++    }
++
++    // restore culling state
++    pDC = GetDrawContext(pContext);
++    pDC->pState->state.rastState.cullMode = oldCullMode;
++
++    RDTSC_STOP(APIDraw, numVertices * numInstances, 0);
++}
++
++//////////////////////////////////////////////////////////////////////////
++/// @brief SwrDraw
++/// @param hContext - Handle passed back from SwrCreateContext
++/// @param topology - Specifies topology for draw.
++/// @param startVertex - Specifies start vertex in vertex buffer for draw.
++/// @param numVertices - Number of vertices.
++void SwrDraw( ++ HANDLE hContext, ++ PRIMITIVE_TOPOLOGY topology, ++ uint32_t startVertex, ++ uint32_t numVertices) ++{ ++ DrawInstanced(hContext, topology, numVertices, startVertex); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief SwrDrawInstanced ++/// @param hContext - Handle passed back from SwrCreateContext ++/// @param topology - Specifies topology for draw. ++/// @param numVertsPerInstance - How many vertices to read sequentially from vertex data. ++/// @param numInstances - How many instances to render. ++/// @param startVertex - Specifies start vertex for draw. (vertex data) ++/// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data) ++void SwrDrawInstanced( ++ HANDLE hContext, ++ PRIMITIVE_TOPOLOGY topology, ++ uint32_t numVertsPerInstance, ++ uint32_t numInstances, ++ uint32_t startVertex, ++ uint32_t startInstance ++ ) ++{ ++ DrawInstanced(hContext, topology, numVertsPerInstance, startVertex, numInstances, startInstance); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief DrawIndexedInstanced ++/// @param hContext - Handle passed back from SwrCreateContext ++/// @param topology - Specifies topology for draw. ++/// @param numIndices - Number of indices to read sequentially from index buffer. ++/// @param indexOffset - Starting index into index buffer. ++/// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed. ++/// @param numInstances - Number of instances to render. ++/// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data) ++void DrawIndexedInstance( ++ HANDLE hContext, ++ PRIMITIVE_TOPOLOGY topology, ++ uint32_t numIndices, ++ uint32_t indexOffset, ++ int32_t baseVertex, ++ uint32_t numInstances = 1, ++ uint32_t startInstance = 0) ++{ ++ RDTSC_START(APIDrawIndexed); ++ ++ SWR_CONTEXT *pContext = GetContext(hContext); ++ DRAW_CONTEXT* pDC = GetDrawContext(pContext); ++ API_STATE* pState = &pDC->pState->state; ++ ++ int32_t maxIndicesPerDraw = MaxVertsPerDraw(pDC, numIndices, topology); ++ uint32_t primsPerDraw = GetNumPrims(topology, maxIndicesPerDraw); ++ int32_t remainingIndices = numIndices; ++ ++ uint32_t indexSize = 0; ++ switch (pState->indexBuffer.format) ++ { ++ case R32_UINT: indexSize = sizeof(uint32_t); break; ++ case R16_UINT: indexSize = sizeof(uint16_t); break; ++ case R8_UINT: indexSize = sizeof(uint8_t); break; ++ default: ++ SWR_ASSERT(0); ++ } ++ ++ int draw = 0; ++ uint8_t *pIB = (uint8_t*)pState->indexBuffer.pIndices; ++ pIB += (uint64_t)indexOffset * (uint64_t)indexSize; ++ ++ pState->topology = topology; ++ pState->forceFront = false; ++ ++ // disable culling for points/lines ++ uint32_t oldCullMode = pState->rastState.cullMode; ++ if (topology == TOP_POINT_LIST) ++ { ++ pState->rastState.cullMode = SWR_CULLMODE_NONE; ++ pState->forceFront = true; ++ } ++ ++ while (remainingIndices) ++ { ++ uint32_t numIndicesForDraw = (remainingIndices < maxIndicesPerDraw) ? ++ remainingIndices : maxIndicesPerDraw; ++ ++ // When breaking up draw, we need to obtain new draw context for each iteration. ++ bool isSplitDraw = (draw > 0) ? 
true : false; ++ pDC = GetDrawContext(pContext, isSplitDraw); ++ InitDraw(pDC, isSplitDraw); ++ ++ pDC->FeWork.type = DRAW; ++ pDC->FeWork.pfnWork = GetFEDrawFunc( ++ true, // IsIndexed ++ pState->tsState.tsEnable, ++ pState->gsState.gsEnable, ++ pState->soState.soEnable, ++ pDC->pState->pfnProcessPrims != nullptr); ++ pDC->FeWork.desc.draw.pDC = pDC; ++ pDC->FeWork.desc.draw.numIndices = numIndicesForDraw; ++ pDC->FeWork.desc.draw.pIB = (int*)pIB; ++ pDC->FeWork.desc.draw.type = pDC->pState->state.indexBuffer.format; ++ ++ pDC->FeWork.desc.draw.numInstances = numInstances; ++ pDC->FeWork.desc.draw.startInstance = startInstance; ++ pDC->FeWork.desc.draw.baseVertex = baseVertex; ++ pDC->FeWork.desc.draw.startPrimID = draw * primsPerDraw; ++ ++ //enqueue DC ++ QueueDraw(pContext); ++ ++ pIB += maxIndicesPerDraw * indexSize; ++ remainingIndices -= numIndicesForDraw; ++ draw++; ++ } ++ ++ // restore culling state ++ pDC = GetDrawContext(pContext); ++ pDC->pState->state.rastState.cullMode = oldCullMode; ++ ++ RDTSC_STOP(APIDrawIndexed, numIndices * numInstances, 0); ++} ++ ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief DrawIndexed ++/// @param hContext - Handle passed back from SwrCreateContext ++/// @param topology - Specifies topology for draw. ++/// @param numIndices - Number of indices to read sequentially from index buffer. ++/// @param indexOffset - Starting index into index buffer. ++/// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed. ++void SwrDrawIndexed( ++ HANDLE hContext, ++ PRIMITIVE_TOPOLOGY topology, ++ uint32_t numIndices, ++ uint32_t indexOffset, ++ int32_t baseVertex ++ ) ++{ ++ DrawIndexedInstance(hContext, topology, numIndices, indexOffset, baseVertex); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief SwrDrawIndexedInstanced ++/// @param hContext - Handle passed back from SwrCreateContext ++/// @param topology - Specifies topology for draw. ++/// @param numIndices - Number of indices to read sequentially from index buffer. ++/// @param numInstances - Number of instances to render. ++/// @param indexOffset - Starting index into index buffer. ++/// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed. 
++/// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data) ++void SwrDrawIndexedInstanced( ++ HANDLE hContext, ++ PRIMITIVE_TOPOLOGY topology, ++ uint32_t numIndices, ++ uint32_t numInstances, ++ uint32_t indexOffset, ++ int32_t baseVertex, ++ uint32_t startInstance) ++{ ++ DrawIndexedInstance(hContext, topology, numIndices, indexOffset, baseVertex, numInstances, startInstance); ++} ++ ++// Attach surfaces to pipeline ++void SwrInvalidateTiles( ++ HANDLE hContext, ++ uint32_t attachmentMask) ++{ ++ SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext; ++ DRAW_CONTEXT* pDC = GetDrawContext(pContext); ++ pDC->inUse = true; ++ ++ // Queue a load to the hottile ++ pDC->FeWork.type = INVALIDATETILES; ++ pDC->FeWork.pfnWork = ProcessInvalidateTiles; ++ pDC->FeWork.desc.invalidateTiles.attachmentMask = attachmentMask; ++ ++ //enqueue ++ QueueDraw(pContext); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief SwrDispatch ++/// @param hContext - Handle passed back from SwrCreateContext ++/// @param threadGroupCountX - Number of thread groups dispatched in X direction ++/// @param threadGroupCountY - Number of thread groups dispatched in Y direction ++/// @param threadGroupCountZ - Number of thread groups dispatched in Z direction ++void SwrDispatch( ++ HANDLE hContext, ++ uint32_t threadGroupCountX, ++ uint32_t threadGroupCountY, ++ uint32_t threadGroupCountZ) ++{ ++ RDTSC_START(APIDispatch); ++ SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext; ++ DRAW_CONTEXT* pDC = GetDrawContext(pContext); ++ ++ pDC->isCompute = true; // This is a compute context. ++ pDC->inUse = true; ++ ++ COMPUTE_DESC* pTaskData = (COMPUTE_DESC*)pDC->arena.AllocAligned(sizeof(COMPUTE_DESC), 64); ++ ++ pTaskData->threadGroupCountX = threadGroupCountX; ++ pTaskData->threadGroupCountY = threadGroupCountY; ++ pTaskData->threadGroupCountZ = threadGroupCountZ; ++ ++ uint32_t totalThreadGroups = threadGroupCountX * threadGroupCountY * threadGroupCountZ; ++ pDC->pDispatch->initialize(totalThreadGroups, pTaskData); ++ ++ QueueDispatch(pContext); ++ RDTSC_STOP(APIDispatch, threadGroupCountX * threadGroupCountY * threadGroupCountZ, 0); ++} ++ ++// Deswizzles, converts and stores current contents of the hot tiles to surface ++// described by pState ++void SwrStoreTiles( ++ HANDLE hContext, ++ SWR_RENDERTARGET_ATTACHMENT attachment, ++ SWR_TILE_STATE postStoreTileState) // TODO: Implement postStoreTileState ++{ ++ RDTSC_START(APIStoreTiles); ++ ++ SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext; ++ DRAW_CONTEXT* pDC = GetDrawContext(pContext); ++ pDC->inUse = true; ++ ++ SetupMacroTileScissors(pDC); ++ ++ pDC->FeWork.type = STORETILES; ++ pDC->FeWork.pfnWork = ProcessStoreTiles; ++ pDC->FeWork.desc.storeTiles.attachment = attachment; ++ pDC->FeWork.desc.storeTiles.postStoreTileState = postStoreTileState; ++ ++ //enqueue ++ QueueDraw(pContext); ++ ++ RDTSC_STOP(APIStoreTiles, 0, 0); ++ if (attachment == SWR_ATTACHMENT_COLOR0) ++ { ++ RDTSC_ENDFRAME(); ++ } ++} ++ ++void SwrClearRenderTarget( ++ HANDLE hContext, ++ uint32_t clearMask, ++ const float clearColor[4], ++ float z, ++ BYTE stencil) ++{ ++ RDTSC_START(APIClearRenderTarget); ++ ++ SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext; ++ ++ DRAW_CONTEXT* pDC = GetDrawContext(pContext); ++ ++ SetupMacroTileScissors(pDC); ++ ++ pDC->inUse = true; ++ ++ CLEAR_FLAGS flags; ++ flags.mask = clearMask; ++ ++ pDC->FeWork.type = CLEAR; ++ pDC->FeWork.pfnWork = ProcessClear; ++ pDC->FeWork.desc.clear.flags = 
flags; ++ pDC->FeWork.desc.clear.clearDepth = z; ++ pDC->FeWork.desc.clear.clearRTColor[0] = clearColor[0]; ++ pDC->FeWork.desc.clear.clearRTColor[1] = clearColor[1]; ++ pDC->FeWork.desc.clear.clearRTColor[2] = clearColor[2]; ++ pDC->FeWork.desc.clear.clearRTColor[3] = clearColor[3]; ++ pDC->FeWork.desc.clear.clearStencil = stencil; ++ ++ // enqueue draw ++ QueueDraw(pContext); ++ ++ RDTSC_STOP(APIClearRenderTarget, 0, pDC->drawId); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief Returns a pointer to the private context state for the current ++/// draw operation. This is used for external componets such as the ++/// sampler. ++/// SWR is responsible for the allocation of the private context state. ++/// @param hContext - Handle passed back from SwrCreateContext ++VOID* SwrGetPrivateContextState( ++ HANDLE hContext) ++{ ++ SWR_CONTEXT* pContext = GetContext(hContext); ++ DRAW_CONTEXT* pDC = GetDrawContext(pContext); ++ DRAW_STATE* pState = pDC->pState; ++ ++ if (pState->pPrivateState == nullptr) ++ { ++ pState->pPrivateState = pState->arena.AllocAligned(pContext->privateStateSize, KNOB_SIMD_WIDTH*sizeof(float)); ++ } ++ ++ return pState->pPrivateState; ++} ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief Clients can use this to allocate memory for draw/dispatch ++/// operations. The memory will automatically be freed once operation ++/// has completed. Client can use this to allocate binding tables, ++/// etc. needed for shader execution. ++/// @param hContext - Handle passed back from SwrCreateContext ++/// @param size - Size of allocation ++/// @param align - Alignment needed for allocation. ++VOID* SwrAllocDrawContextMemory( ++ HANDLE hContext, ++ uint32_t size, ++ uint32_t align) ++{ ++ SWR_CONTEXT* pContext = GetContext(hContext); ++ DRAW_CONTEXT* pDC = GetDrawContext(pContext); ++ ++ return pDC->pState->arena.AllocAligned(size, align); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief Returns pointer to SWR stats. ++/// @note The counters are atomically incremented by multiple threads. ++/// When calling this, you need to ensure all previous operations ++/// have completed. ++/// @todo If necessary, add a callback to avoid stalling the pipe to ++/// sample the counters. ++/// @param hContext - Handle passed back from SwrCreateContext ++/// @param pStats - SWR will fill this out for caller. ++void SwrGetStats( ++ HANDLE hContext, ++ SWR_STATS* pStats) ++{ ++ SWR_CONTEXT *pContext = GetContext(hContext); ++ DRAW_CONTEXT* pDC = GetDrawContext(pContext); ++ ++ pDC->inUse = true; ++ ++ pDC->FeWork.type = QUERYSTATS; ++ pDC->FeWork.pfnWork = ProcessQueryStats; ++ pDC->FeWork.desc.queryStats.pStats = pStats; ++ ++ // cannot execute until all previous draws have completed ++ pDC->dependency = pDC->drawId - 1; ++ ++ //enqueue ++ QueueDraw(pContext); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief Enables stats counting ++/// @param hContext - Handle passed back from SwrCreateContext ++/// @param enable - If true then counts are incremented. 
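// A minimal usage sketch for the stats API above (hedged: SWR_STATS field
// names are not shown in this patch, so none are dereferenced here):
//
//   SWR_STATS stats = {};
//   SwrEnableStats(hContext, true);
//   /* ...draws of interest... */
//   SwrGetStats(hContext, &stats);  // enqueued; depends on all prior draws
//   SwrWaitForIdle(hContext);       // ensure the query work item has run
//
// After the wait, stats holds stable counters; they are incremented
// atomically by multiple workers, so reading them mid-frame would be racy.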
++void SwrEnableStats( ++ HANDLE hContext, ++ bool enable) ++{ ++ SWR_CONTEXT *pContext = GetContext(hContext); ++ DRAW_CONTEXT* pDC = GetDrawContext(pContext); ++ ++ pDC->pState->state.enableStats = enable; ++} +diff --git a/src/gallium/drivers/swr/rasterizer/core/api.h b/src/gallium/drivers/swr/rasterizer/core/api.h +new file mode 100644 +index 0000000..1741ef6 +--- /dev/null ++++ b/src/gallium/drivers/swr/rasterizer/core/api.h +@@ -0,0 +1,483 @@ ++/**************************************************************************** ++* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. ++* ++* Permission is hereby granted, free of charge, to any person obtaining a ++* copy of this software and associated documentation files (the "Software"), ++* to deal in the Software without restriction, including without limitation ++* the rights to use, copy, modify, merge, publish, distribute, sublicense, ++* and/or sell copies of the Software, and to permit persons to whom the ++* Software is furnished to do so, subject to the following conditions: ++* ++* The above copyright notice and this permission notice (including the next ++* paragraph) shall be included in all copies or substantial portions of the ++* Software. ++* ++* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++* IN THE SOFTWARE. ++* ++* @file api.h ++* ++* @brief API definitions ++* ++******************************************************************************/ ++ ++#ifndef __SWR_API_H__ ++#define __SWR_API_H__ ++ ++#include "common/os.h" ++ ++#include ++#include ++ ++#include "common/simdintrin.h" ++#include "common/formats.h" ++#include "core/utils.h" ++#include "core/state.h" ++ ++///@todo place all the API functions into the 'swr' namespace. 
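// The callback typedef below is the hook for building CPU-side fences:
// SwrSync() (declared further down in this header) runs the callback only
// once all rendering queued before the sync has retired. A minimal sketch,
// where ExampleFence and FenceCallback are hypothetical client code, not
// part of this API:
//
//   struct ExampleFence
//   {
//       std::atomic<uint64_t> completed{0}; // last value published by callback
//       uint64_t submitted = 0;             // last value handed to SwrSync
//   };
//
//   static void FenceCallback(uint64_t userData, uint64_t userData2)
//   {
//       reinterpret_cast<ExampleFence*>(userData)
//           ->completed.store(userData2, std::memory_order_release);
//   }
//
//   // submit: SwrSync(hContext, FenceCallback, (uint64_t)&fence, ++fence.submitted);
//   // wait:   while (fence.completed.load(std::memory_order_acquire)
//   //                    < fence.submitted) { /* spin or yield */ }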
++ ++typedef void(SWR_API *PFN_CALLBACK_FUNC)(uint64_t data, uint64_t data2); ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief Function signature for load hot tiles ++/// @param hPrivateContext - handle to private data ++/// @param dstFormat - format of the hot tile ++/// @param renderTargetIndex - render target to store, can be color, depth or stencil ++/// @param x - destination x coordinate ++/// @param y - destination y coordinate ++/// @param pDstHotTile - pointer to the hot tile surface ++typedef void(SWR_API *PFN_LOAD_TILE)(HANDLE hPrivateContext, SWR_FORMAT dstFormat, ++ SWR_RENDERTARGET_ATTACHMENT renderTargetIndex, ++ uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex, BYTE *pDstHotTile); ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief Function signature for store hot tiles ++/// @param hPrivateContext - handle to private data ++/// @param srcFormat - format of the hot tile ++/// @param renderTargetIndex - render target to store, can be color, depth or stencil ++/// @param x - destination x coordinate ++/// @param y - destination y coordinate ++/// @param pSrcHotTile - pointer to the hot tile surface ++typedef void(SWR_API *PFN_STORE_TILE)(HANDLE hPrivateContext, SWR_FORMAT srcFormat, ++ SWR_RENDERTARGET_ATTACHMENT renderTargetIndex, ++ uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex, BYTE *pSrcHotTile); ++ ++/// @brief Function signature for clearing from the hot tiles clear value ++/// @param hPrivateContext - handle to private data ++/// @param renderTargetIndex - render target to store, can be color, depth or stencil ++/// @param x - destination x coordinate ++/// @param y - destination y coordinate ++/// @param pClearColor - pointer to the hot tile's clear value ++typedef void(SWR_API *PFN_CLEAR_TILE)(HANDLE hPrivateContext, ++ SWR_RENDERTARGET_ATTACHMENT rtIndex, ++ uint32_t x, uint32_t y, const float* pClearColor); ++ ++////////////////////////////////////////////////////////////////////////// ++/// SWR_CREATECONTEXT_INFO ++///////////////////////////////////////////////////////////////////////// ++struct SWR_CREATECONTEXT_INFO ++{ ++ DRIVER_TYPE driver; ++ ++ // External functions (e.g. sampler) need per draw context state. ++ // Use SwrGetPrivateContextState() to access private state. ++ uint32_t privateStateSize; ++ ++ // tile manipulation functions ++ PFN_LOAD_TILE pfnLoadTile; ++ PFN_STORE_TILE pfnStoreTile; ++ PFN_CLEAR_TILE pfnClearTile; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// SWR_RECT ++///////////////////////////////////////////////////////////////////////// ++struct SWR_RECT ++{ ++ uint32_t left; ++ uint32_t right; ++ uint32_t top; ++ uint32_t bottom; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief Create SWR Context. ++/// @param pCreateInfo - pointer to creation info. ++HANDLE SWR_API SwrCreateContext( ++ const SWR_CREATECONTEXT_INFO* pCreateInfo); ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief Destroys SWR Context. ++/// @param hContext - Handle passed back from SwrCreateContext ++void SWR_API SwrDestroyContext( ++ HANDLE hContext); ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief Sync cmd. 
Executes the callback func when all rendering up to this sync ++/// has been completed ++/// @param hContext - Handle passed back from SwrCreateContext ++/// @param pfnFunc - pointer to callback function, ++/// @param userData - user data to pass back ++void SWR_API SwrSync( ++ HANDLE hContext, ++ PFN_CALLBACK_FUNC pfnFunc, ++ uint64_t userData, ++ uint64_t userData2); ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief Blocks until all rendering has been completed. ++/// @param hContext - Handle passed back from SwrCreateContext ++void SWR_API SwrWaitForIdle( ++ HANDLE hContext); ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief Set vertex buffer state. ++/// @param hContext - Handle passed back from SwrCreateContext ++/// @param numBuffers - Number of vertex buffer state descriptors. ++/// @param pVertexBuffers - Array of vertex buffer state descriptors. ++void SWR_API SwrSetVertexBuffers( ++ HANDLE hContext, ++ uint32_t numBuffers, ++ const SWR_VERTEX_BUFFER_STATE* pVertexBuffers); ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief Set index buffer ++/// @param hContext - Handle passed back from SwrCreateContext ++/// @param pIndexBuffer - Index buffer. ++void SWR_API SwrSetIndexBuffer( ++ HANDLE hContext, ++ const SWR_INDEX_BUFFER_STATE* pIndexBuffer); ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief Set fetch shader pointer. ++/// @param hContext - Handle passed back from SwrCreateContext ++/// @param pfnFetchFunc - Pointer to shader. ++void SWR_API SwrSetFetchFunc( ++ HANDLE hContext, ++ PFN_FETCH_FUNC pfnFetchFunc); ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief Set streamout shader pointer. ++/// @param hContext - Handle passed back from SwrCreateContext ++/// @param pfnSoFunc - Pointer to shader. ++/// @param streamIndex - specifies stream ++void SWR_API SwrSetSoFunc( ++ HANDLE hContext, ++ PFN_SO_FUNC pfnSoFunc, ++ uint32_t streamIndex); ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief Set streamout state ++/// @param hContext - Handle passed back from SwrCreateContext ++/// @param pSoState - Pointer to streamout state. ++void SWR_API SwrSetSoState( ++ HANDLE hContext, ++ SWR_STREAMOUT_STATE* pSoState); ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief Set streamout buffer state ++/// @param hContext - Handle passed back from SwrCreateContext ++/// @param pSoBuffer - Pointer to streamout buffer. ++/// @param slot - Slot to bind SO buffer to. ++void SWR_API SwrSetSoBuffers( ++ HANDLE hContext, ++ SWR_STREAMOUT_BUFFER* pSoBuffer, ++ uint32_t slot); ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief Set vertex shader pointer. ++/// @param hContext - Handle passed back from SwrCreateContext ++/// @param pfnVertexFunc - Pointer to shader. ++void SWR_API SwrSetVertexFunc( ++ HANDLE hContext, ++ PFN_VERTEX_FUNC pfnVertexFunc); ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief Set frontend state. ++/// @param hContext - Handle passed back from SwrCreateContext ++/// @param pState - Pointer to state ++void SWR_API SwrSetFrontendState( ++ HANDLE hContext, ++ SWR_FRONTEND_STATE *pState); ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief Set geometry shader state. 
++/// @param hContext - Handle passed back from SwrCreateContext ++/// @param pState - Pointer to state ++void SWR_API SwrSetGsState( ++ HANDLE hContext, ++ SWR_GS_STATE *pState); ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief Set geometry shader ++/// @param hContext - Handle passed back from SwrCreateContext ++/// @param pState - Pointer to geometry shader function ++void SWR_API SwrSetGsFunc( ++ HANDLE hContext, ++ PFN_GS_FUNC pfnGsFunc); ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief Set compute shader ++/// @param hContext - Handle passed back from SwrCreateContext ++/// @param pState - Pointer to compute shader function ++/// @param totalThreadsInGroup - product of thread group dimensions. ++void SWR_API SwrSetCsFunc( ++ HANDLE hContext, ++ PFN_CS_FUNC pfnCsFunc, ++ uint32_t totalThreadsInGroup); ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief Set tessellation state. ++/// @param hContext - Handle passed back from SwrCreateContext ++/// @param pState - Pointer to state ++void SWR_API SwrSetTsState( ++ HANDLE hContext, ++ SWR_TS_STATE *pState); ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief Set hull shader ++/// @param hContext - Handle passed back from SwrCreateContext ++/// @param pfnFunc - Pointer to shader function ++void SWR_API SwrSetHsFunc( ++ HANDLE hContext, ++ PFN_HS_FUNC pfnFunc); ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief Set domain shader ++/// @param hContext - Handle passed back from SwrCreateContext ++/// @param pfnFunc - Pointer to shader function ++void SWR_API SwrSetDsFunc( ++ HANDLE hContext, ++ PFN_DS_FUNC pfnFunc); ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief Set depth stencil state ++/// @param hContext - Handle passed back from SwrCreateContext ++/// @param pState - Pointer to state. ++void SWR_API SwrSetDepthStencilState( ++ HANDLE hContext, ++ SWR_DEPTH_STENCIL_STATE *pState); ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief Set backend state ++/// @param hContext - Handle passed back from SwrCreateContext ++/// @param pState - Pointer to state. ++void SWR_API SwrSetBackendState( ++ HANDLE hContext, ++ SWR_BACKEND_STATE *pState); ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief Set pixel shader state ++/// @param hContext - Handle passed back from SwrCreateContext ++/// @param pState - Pointer to state. ++void SWR_API SwrSetPixelShaderState( ++ HANDLE hContext, ++ SWR_PS_STATE *pState); ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief Set blend state ++/// @param hContext - Handle passed back from SwrCreateContext ++/// @param pState - Pointer to state. 
++void SWR_API SwrSetBlendState( ++ HANDLE hContext, ++ SWR_BLEND_STATE *pState); ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief Set blend function ++/// @param hContext - Handle passed back from SwrCreateContext ++/// @param renderTarget - render target index ++/// @param pfnBlendFunc - function pointer ++void SWR_API SwrSetBlendFunc( ++ HANDLE hContext, ++ uint32_t renderTarget, ++ PFN_BLEND_JIT_FUNC pfnBlendFunc); ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief Set linkage mask ++/// @param hContext - Handle passed back from SwrCreateContext ++/// @param mask - Specifies which vertex outputs are needed by PS. ++/// @param pMap - (Optional) Linkage map to specify where FE attributes are ++/// gathered from to supply PS attribute values. The length ++/// of the map buffer needs to match the number of set bits ++/// in "mask". ++void SWR_API SwrSetLinkage( ++ HANDLE hContext, ++ uint32_t mask, ++ const uint8_t* pMap); ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief SwrDraw ++/// @param hContext - Handle passed back from SwrCreateContext ++/// @param topology - Specifies topology for draw. ++/// @param startVertex - Specifies start vertex in vertex buffer for draw. ++/// @param primCount - Number of vertices. ++void SWR_API SwrDraw( ++ HANDLE hContext, ++ PRIMITIVE_TOPOLOGY topology, ++ uint32_t startVertex, ++ uint32_t primCount); ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief SwrDrawInstanced ++/// @param hContext - Handle passed back from SwrCreateContext ++/// @param topology - Specifies topology for draw. ++/// @param numVertsPerInstance - How many vertices to read sequentially from vertex data. ++/// @param numInstances - How many instances to render. ++/// @param startVertex - Specifies start vertex for draw. (vertex data) ++/// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data) ++void SWR_API SwrDrawInstanced( ++ HANDLE hContext, ++ PRIMITIVE_TOPOLOGY topology, ++ uint32_t numVertsPerInstance, ++ uint32_t numInstances, ++ uint32_t startVertex, ++ uint32_t startInstance); ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief SwrDrawIndexed ++/// @param hContext - Handle passed back from SwrCreateContext ++/// @param topology - Specifies topology for draw. ++/// @param numIndices - Number of indices to read sequentially from index buffer. ++/// @param indexOffset - Starting index into index buffer. ++/// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed. ++void SWR_API SwrDrawIndexed( ++ HANDLE hContext, ++ PRIMITIVE_TOPOLOGY topology, ++ uint32_t numIndices, ++ uint32_t indexOffset, ++ int32_t baseVertex); ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief SwrDrawIndexedInstanced ++/// @param hContext - Handle passed back from SwrCreateContext ++/// @param topology - Specifies topology for draw. ++/// @param numIndices - Number of indices to read sequentially from index buffer. ++/// @param numInstances - Number of instances to render. ++/// @param indexOffset - Starting index into index buffer. ++/// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
++/// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data) ++void SWR_API SwrDrawIndexedInstanced( ++ HANDLE hContext, ++ PRIMITIVE_TOPOLOGY topology, ++ uint32_t numIndices, ++ uint32_t numInstances, ++ uint32_t indexOffset, ++ int32_t baseVertex, ++ uint32_t startInstance); ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief SwrInvalidateTiles ++/// @param hContext - Handle passed back from SwrCreateContext ++/// @param attachmentMask - The mask specifies which surfaces attached to the hot tiles should be invalidated. ++void SWR_API SwrInvalidateTiles( ++ HANDLE hContext, ++ uint32_t attachmentMask); ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief SwrDispatch ++/// @param hContext - Handle passed back from SwrCreateContext ++/// @param threadGroupCountX - Number of thread groups dispatched in X direction ++/// @param threadGroupCountY - Number of thread groups dispatched in Y direction ++/// @param threadGroupCountZ - Number of thread groups dispatched in Z direction ++void SWR_API SwrDispatch( ++ HANDLE hContext, ++ uint32_t threadGroupCountX, ++ uint32_t threadGroupCountY, ++ uint32_t threadGroupCountZ); ++ ++ ++enum SWR_TILE_STATE ++{ ++ SWR_TILE_INVALID = 0, // tile is in an uninitialized state and should be loaded with surface contents before rendering ++ SWR_TILE_DIRTY = 2, // tile contains newer data than the surface it represents ++ SWR_TILE_RESOLVED = 3, // tile is in sync with the surface it represents ++}; ++ ++/// @todo Add a good description for what attachments are and when and why you would use the different SWR_TILE_STATEs. ++void SWR_API SwrStoreTiles( ++ HANDLE hContext, ++ SWR_RENDERTARGET_ATTACHMENT attachment, ++ SWR_TILE_STATE postStoreTileState); ++ ++void SWR_API SwrClearRenderTarget( ++ HANDLE hContext, ++ uint32_t clearMask, ++ const FLOAT clearColor[4], ++ float z, ++ BYTE stencil); ++ ++void SWR_API SwrSetRastState( ++ HANDLE hContext, ++ const SWR_RASTSTATE *pRastState); ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief SwrSetViewports ++/// @param hContext - Handle passed back from SwrCreateContext ++/// @param numViewports - number of viewports passed in ++/// @param pViewports - Specifies extents of viewport. ++/// @param pMatrices - If not specified then SWR computes a default one. ++void SWR_API SwrSetViewports( ++ HANDLE hContext, ++ uint32_t numViewports, ++ const SWR_VIEWPORT* pViewports, ++ const SWR_VIEWPORT_MATRIX* pMatrices); ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief SwrSetScissorRects ++/// @param hContext - Handle passed back from SwrCreateContext ++/// @param numScissors - number of scissors passed in ++/// @param pScissors - array of scissors ++void SWR_API SwrSetScissorRects( ++ HANDLE hContext, ++ uint32_t numScissors, ++ const BBOX* pScissors); ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief Returns a pointer to the private context state for the current ++/// draw operation. This is used for external components such as the ++/// sampler. ++/// ++/// @note Client needs to resend private state prior to each draw call. ++/// Also, SWR is responsible for the private state memory.
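As a rough illustration of how these entry points compose (a sketch only, not part of the patch: hContext, myFetchFunc, myVertexFunc, and the TOP_TRIANGLE_LIST topology value are hypothetical stand-ins for objects produced by SwrCreateContext and the driver's shader JIT):

    // Sketch: bind state, kick a draw, then wait for completion.
    SWR_VERTEX_BUFFER_STATE vbState = {0};    // filled with buffer pointer/stride/size
    SwrSetVertexBuffers(hContext, 1, &vbState);
    SwrSetFetchFunc(hContext, myFetchFunc);   // JITted vertex fetch shader (hypothetical)
    SwrSetVertexFunc(hContext, myVertexFunc); // JITted vertex shader (hypothetical)
    SwrDraw(hContext, TOP_TRIANGLE_LIST, 0, vertexCount);
    SwrWaitForIdle(hContext);                 // or SwrSync() for an asynchronous callback

The PRIMITIVE_TOPOLOGY enumerant name is assumed here; the enum itself is defined elsewhere in the rasterizer.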
++/// @param hContext - Handle passed back from SwrCreateContext ++VOID* SWR_API SwrGetPrivateContextState( ++ HANDLE hContext); ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief Clients can use this to allocate memory for draw/dispatch ++/// operations. The memory will automatically be freed once the operation ++/// has completed. Clients can use this to allocate binding tables, ++/// etc. needed for shader execution. ++/// @param hContext - Handle passed back from SwrCreateContext ++/// @param size - Size of allocation ++/// @param align - Alignment needed for allocation. ++VOID* SWR_API SwrAllocDrawContextMemory( ++ HANDLE hContext, ++ uint32_t size, ++ uint32_t align); ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief Returns pointer to SWR stats. ++/// @note The counters are incremented by multiple threads. ++/// When calling this, you need to ensure all previous operations ++/// have completed. ++/// @param hContext - Handle passed back from SwrCreateContext ++/// @param pStats - SWR will fill this out for caller. ++void SWR_API SwrGetStats( ++ HANDLE hContext, ++ SWR_STATS* pStats); ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief Enables stats counting ++/// @param hContext - Handle passed back from SwrCreateContext ++/// @param enable - If true then counts are incremented. ++void SWR_API SwrEnableStats( ++ HANDLE hContext, ++ bool enable); ++ ++#endif//__SWR_API_H__ +diff --git a/src/gallium/drivers/swr/rasterizer/core/arena.cpp b/src/gallium/drivers/swr/rasterizer/core/arena.cpp +new file mode 100644 +index 0000000..bc4cfd8 +--- /dev/null ++++ b/src/gallium/drivers/swr/rasterizer/core/arena.cpp +@@ -0,0 +1,126 @@ ++/**************************************************************************** ++* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. ++* ++* Permission is hereby granted, free of charge, to any person obtaining a ++* copy of this software and associated documentation files (the "Software"), ++* to deal in the Software without restriction, including without limitation ++* the rights to use, copy, modify, merge, publish, distribute, sublicense, ++* and/or sell copies of the Software, and to permit persons to whom the ++* Software is furnished to do so, subject to the following conditions: ++* ++* The above copyright notice and this permission notice (including the next ++* paragraph) shall be included in all copies or substantial portions of the ++* Software. ++* ++* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++* IN THE SOFTWARE. ++* ++* @file arena.cpp ++* ++* @brief Arena memory manager ++* The arena is convenient and fast for managing allocations that are ++* associated with operations and can all be freed at once when their ++* operation has completed. Allocations are cheap since most of the time ++* it's simply an increment of an offset. Also, there is no need to free ++* individual allocations; all of the arena memory can be freed at once.
++* ++******************************************************************************/ ++ ++#include "context.h" ++#include "arena.h" ++ ++#include <algorithm> // for std::max ++ ++VOID Arena::Init() ++{ ++ m_memUsed = 0; ++ m_pCurBlock = nullptr; ++ m_pUsedBlocks = nullptr; ++} ++ ++VOID* Arena::AllocAligned(uint32_t size, uint32_t align) ++{ ++ if (m_pCurBlock) ++ { ++ ArenaBlock* pCurBlock = m_pCurBlock; ++ pCurBlock->offset = AlignUp(pCurBlock->offset, align); ++ ++ if ((pCurBlock->offset + size) < pCurBlock->blockSize) ++ { ++ BYTE* pMem = (BYTE*)pCurBlock->pMem + pCurBlock->offset; ++ pCurBlock->offset += size; ++ return pMem; ++ } ++ ++ // Not enough memory in this arena so let's move to a new block. ++ pCurBlock->pNext = m_pUsedBlocks; ++ m_pUsedBlocks = pCurBlock; ++ m_pCurBlock = nullptr; ++ } ++ ++ static const uint32_t ArenaBlockSize = 1024*1024; ++ uint32_t defaultBlockSize = ArenaBlockSize; ++ if (m_pUsedBlocks == nullptr) ++ { ++ // First allocation after reset. Let's make the first block be the total ++ // memory allocated during last set of allocations prior to reset. ++ defaultBlockSize = std::max(m_memUsed, defaultBlockSize); ++ m_memUsed = 0; ++ } ++ ++ uint32_t blockSize = std::max(size, defaultBlockSize); ++ blockSize = AlignUp(blockSize, KNOB_SIMD_WIDTH*4); ++ ++ VOID *pMem = _aligned_malloc(blockSize, KNOB_SIMD_WIDTH*4); // Arena blocks are always simd byte aligned. ++ SWR_ASSERT(pMem != nullptr); ++ ++ m_pCurBlock = (ArenaBlock*)malloc(sizeof(ArenaBlock)); ++ SWR_ASSERT(m_pCurBlock != nullptr); ++ ++ if (m_pCurBlock != nullptr) ++ { ++ m_pCurBlock->pMem = pMem; ++ m_pCurBlock->blockSize = blockSize; ++ m_pCurBlock->offset = size; ++ m_memUsed += blockSize; ++ } ++ ++ return pMem; ++} ++ ++VOID* Arena::Alloc(uint32_t size) ++{ ++ return AllocAligned(size, 1); ++} ++ ++VOID Arena::Reset() ++{ ++ if (m_pCurBlock) ++ { ++ m_pCurBlock->offset = 0; ++ ++ // If we needed to allocate used blocks then reset current. ++ // The next time we allocate we'll grow the current block ++ // to match all the memory allocated for this frame. ++ if (m_pUsedBlocks) ++ { ++ m_pCurBlock->pNext = m_pUsedBlocks; ++ m_pUsedBlocks = m_pCurBlock; ++ m_pCurBlock = nullptr; ++ } ++ } ++ ++ while(m_pUsedBlocks) ++ { ++ ArenaBlock* pBlock = m_pUsedBlocks; ++ m_pUsedBlocks = pBlock->pNext; ++ ++ _aligned_free(pBlock->pMem); ++ free(pBlock); ++ } ++} +diff --git a/src/gallium/drivers/swr/rasterizer/core/arena.h b/src/gallium/drivers/swr/rasterizer/core/arena.h +new file mode 100644 +index 0000000..e98bc83 +--- /dev/null ++++ b/src/gallium/drivers/swr/rasterizer/core/arena.h +@@ -0,0 +1,63 @@ ++/**************************************************************************** ++* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. ++* ++* Permission is hereby granted, free of charge, to any person obtaining a ++* copy of this software and associated documentation files (the "Software"), ++* to deal in the Software without restriction, including without limitation ++* the rights to use, copy, modify, merge, publish, distribute, sublicense, ++* and/or sell copies of the Software, and to permit persons to whom the ++* Software is furnished to do so, subject to the following conditions: ++* ++* The above copyright notice and this permission notice (including the next ++* paragraph) shall be included in all copies or substantial portions of the ++* Software.
++* ++* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++* IN THE SOFTWARE. ++* ++* @file arena.h ++* ++* @brief Arena memory manager ++* The arena is convenient and fast for managing allocations that are ++* associated with operations and can all be freed at once when their ++* operation has completed. Allocations are cheap since most of the time ++* it's simply an increment of an offset. Also, there is no need to free ++* individual allocations; all of the arena memory can be freed at once. ++* ++******************************************************************************/ ++#pragma once ++ ++class Arena ++{ ++public: ++ Arena() : m_pCurBlock(nullptr), m_pUsedBlocks(nullptr), m_memUsed(0) { } ++ ~Arena() { } ++ ++ VOID Init(); ++ ++ VOID* AllocAligned(uint32_t size, uint32_t align); ++ VOID* Alloc(uint32_t size); ++ VOID Reset(); ++ ++private: ++ ++ struct ArenaBlock ++ { ++ ArenaBlock() : pMem(nullptr), blockSize(0), pNext(nullptr) {} ++ ++ VOID *pMem; ++ uint32_t blockSize; ++ uint32_t offset; ++ ArenaBlock *pNext; ++ }; ++ ++ ArenaBlock *m_pCurBlock; ++ ArenaBlock *m_pUsedBlocks; ++ ++ uint32_t m_memUsed; // total bytes allocated since last reset. +diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.cpp b/src/gallium/drivers/swr/rasterizer/core/backend.cpp +new file mode 100644 +index 0000000..9cf2b00 +--- /dev/null ++++ b/src/gallium/drivers/swr/rasterizer/core/backend.cpp +@@ -0,0 +1,1150 @@ ++/**************************************************************************** ++* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. ++* ++* Permission is hereby granted, free of charge, to any person obtaining a ++* copy of this software and associated documentation files (the "Software"), ++* to deal in the Software without restriction, including without limitation ++* the rights to use, copy, modify, merge, publish, distribute, sublicense, ++* and/or sell copies of the Software, and to permit persons to whom the ++* Software is furnished to do so, subject to the following conditions: ++* ++* The above copyright notice and this permission notice (including the next ++* paragraph) shall be included in all copies or substantial portions of the ++* Software. ++* ++* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++* IN THE SOFTWARE. ++* ++* @file backend.cpp ++* ++* @brief Backend handles rasterization, pixel shading and output merger ++* operations.
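A quick illustration of the lifetime model the Arena class above implements (sketch only; the driver's actual call sites differ): one Init, many cheap allocations while an operation is in flight, then a single Reset.

    // Sketch: Arena lifetime for one draw/dispatch operation.
    Arena arena;
    arena.Init();
    void* pBindTable = arena.AllocAligned(256, 64); // usually just bumps an offset
    void* pScratch   = arena.Alloc(128);
    // ... memory stays valid while the operation executes ...
    arena.Reset(); // releases everything at once; the arena keeps one block
                   // sized to the previous total usage, so the next
                   // operation rarely has to call malloc again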
++* ++******************************************************************************/ ++ ++#include <smmintrin.h> // for _mm_popcnt_u32 ++ ++#include "rdtsc_core.h" ++#include "backend.h" ++#include "depthstencil.h" ++#include "tilemgr.h" ++#include "memory/tilingtraits.h" ++#include "core/multisample.h" ++ ++#include <algorithm> // for std::max/std::min ++ ++const __m128 vTileOffsetsX = {0.5, KNOB_TILE_X_DIM - 0.5, 0.5, KNOB_TILE_X_DIM - 0.5}; ++const __m128 vTileOffsetsY = {0.5, 0.5, KNOB_TILE_Y_DIM - 0.5, KNOB_TILE_Y_DIM - 0.5}; ++ ++/// @todo move to common lib ++#define MASKTOVEC(i3,i2,i1,i0) {-i0,-i1,-i2,-i3} ++static const __m128 gMaskToVec[] = { ++ MASKTOVEC(0,0,0,0), ++ MASKTOVEC(0,0,0,1), ++ MASKTOVEC(0,0,1,0), ++ MASKTOVEC(0,0,1,1), ++ MASKTOVEC(0,1,0,0), ++ MASKTOVEC(0,1,0,1), ++ MASKTOVEC(0,1,1,0), ++ MASKTOVEC(0,1,1,1), ++ MASKTOVEC(1,0,0,0), ++ MASKTOVEC(1,0,0,1), ++ MASKTOVEC(1,0,1,0), ++ MASKTOVEC(1,0,1,1), ++ MASKTOVEC(1,1,0,0), ++ MASKTOVEC(1,1,0,1), ++ MASKTOVEC(1,1,1,0), ++ MASKTOVEC(1,1,1,1), ++}; ++ ++typedef void(*PFN_CLEAR_TILES)(DRAW_CONTEXT*, SWR_RENDERTARGET_ATTACHMENT rt, uint32_t, DWORD[4]); ++static PFN_CLEAR_TILES sClearTilesTable[NUM_SWR_FORMATS]; ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief Process compute work. ++/// @param pDC - pointer to draw context (dispatch). ++/// @param workerId - The unique worker ID that is assigned to this thread. ++/// @param threadGroupId - the linear index for the thread group within the dispatch. ++void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId) ++{ ++ RDTSC_START(BEDispatch); ++ ++ SWR_CONTEXT *pContext = pDC->pContext; ++ ++ const COMPUTE_DESC* pTaskData = (COMPUTE_DESC*)pDC->pDispatch->GetTasksData(); ++ SWR_ASSERT(pTaskData != nullptr); ++ ++ const API_STATE& state = GetApiState(pDC); ++ ++ SWR_CS_CONTEXT csContext{ 0 }; ++ csContext.tileCounter = threadGroupId; ++ csContext.dispatchDims[0] = pTaskData->threadGroupCountX; ++ csContext.dispatchDims[1] = pTaskData->threadGroupCountY; ++ csContext.dispatchDims[2] = pTaskData->threadGroupCountZ; ++ csContext.pTGSM = pContext->pScratch[workerId]; ++ ++ state.pfnCsFunc(GetPrivateState(pDC), &csContext); ++ ++ UPDATE_STAT(CsInvocations, state.totalThreadsInGroup); ++ ++ RDTSC_STOP(BEDispatch, 1, 0); ++} ++ ++void ProcessSyncBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData) ++{ ++ SYNC_DESC *pSync = (SYNC_DESC*)pUserData; ++ ++ uint32_t x, y; ++ MacroTileMgr::getTileIndices(macroTile, x, y); ++ SWR_ASSERT(x == 0 && y == 0); ++ ++ if (pSync->pfnCallbackFunc != nullptr) ++ { ++ pSync->pfnCallbackFunc(pSync->userData, pSync->userData2); ++ } ++} ++ ++void ProcessQueryStatsBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData) ++{ ++ QUERY_DESC* pQueryDesc = (QUERY_DESC*)pUserData; ++ SWR_STATS* pStats = pQueryDesc->pStats; ++ SWR_CONTEXT *pContext = pDC->pContext; ++ ++ SWR_ASSERT(pStats != nullptr); ++ ++ for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i) ++ { ++ pStats->DepthPassCount += pContext->stats[i].DepthPassCount; ++ ++ pStats->IaVertices += pContext->stats[i].IaVertices; ++ pStats->IaPrimitives += pContext->stats[i].IaPrimitives; ++ pStats->VsInvocations += pContext->stats[i].VsInvocations; ++ pStats->HsInvocations += pContext->stats[i].HsInvocations; ++ pStats->DsInvocations += pContext->stats[i].DsInvocations; ++ pStats->GsInvocations += pContext->stats[i].GsInvocations; ++ pStats->PsInvocations += pContext->stats[i].PsInvocations; ++ pStats->CInvocations += pContext->stats[i].CInvocations; ++
pStats->CsInvocations += pContext->stats[i].CsInvocations; ++ pStats->CPrimitives += pContext->stats[i].CPrimitives; ++ pStats->GsPrimitives += pContext->stats[i].GsPrimitives; ++ ++ for (uint32_t stream = 0; stream < MAX_SO_STREAMS; ++stream) ++ { ++ pStats->SoWriteOffset[stream] += pContext->stats[i].SoWriteOffset[stream]; ++ ++ /// @note client is required to provide valid write offset before every draw, so we clear ++ /// out the contents of the write offset when storing stats ++ pContext->stats[i].SoWriteOffset[stream] = 0; ++ ++ pStats->SoPrimStorageNeeded[stream] += pContext->stats[i].SoPrimStorageNeeded[stream]; ++ pStats->SoNumPrimsWritten[stream] += pContext->stats[i].SoNumPrimsWritten[stream]; ++ } ++ } ++} ++ ++template<SWR_FORMAT format> ++void ClearRasterTile(BYTE *pTileBuffer, simdvector &value) ++{ ++ auto lambda = [&](int comp) ++ { ++ FormatTraits<format>::storeSOA(comp, pTileBuffer, value.v[comp]); ++ pTileBuffer += (KNOB_SIMD_WIDTH * FormatTraits<format>::GetBPC(comp) / 8); ++ }; ++ ++ const uint32_t numIter = (KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD_TILE_X_DIM); ++ for (uint32_t i = 0; i < numIter; ++i) ++ { ++ UnrollerL<0, FormatTraits<format>::numComps, 1>::step(lambda); ++ } ++} ++ ++template<SWR_FORMAT format> ++INLINE void ClearMacroTile(DRAW_CONTEXT *pDC, SWR_RENDERTARGET_ATTACHMENT rt, uint32_t macroTile, DWORD clear[4]) ++{ ++ // convert clear color to hottile format ++ // clear color is in RGBA float/uint32 ++ simdvector vClear; ++ for (uint32_t comp = 0; comp < FormatTraits<format>::numComps; ++comp) ++ { ++ simdscalar vComp; ++ vComp = _simd_load1_ps((const float*)&clear[comp]); ++ if (FormatTraits<format>::isNormalized(comp)) ++ { ++ vComp = _simd_mul_ps(vComp, _simd_set1_ps(FormatTraits<format>::fromFloat(comp))); ++ vComp = _simd_castsi_ps(_simd_cvtps_epi32(vComp)); ++ } ++ vComp = FormatTraits<format>::pack(comp, vComp); ++ vClear.v[FormatTraits<format>::swizzle(comp)] = vComp; ++ } ++ ++ uint32_t tileX, tileY; ++ MacroTileMgr::getTileIndices(macroTile, tileX, tileY); ++ const API_STATE& state = GetApiState(pDC); ++ ++ int top = KNOB_MACROTILE_Y_DIM_FIXED * tileY; ++ int bottom = top + KNOB_MACROTILE_Y_DIM_FIXED - 1; ++ int left = KNOB_MACROTILE_X_DIM_FIXED * tileX; ++ int right = left + KNOB_MACROTILE_X_DIM_FIXED - 1; ++ ++ // intersect with scissor ++ top = std::max(top, state.scissorInFixedPoint.top); ++ left = std::max(left, state.scissorInFixedPoint.left); ++ bottom = std::min(bottom, state.scissorInFixedPoint.bottom); ++ right = std::min(right, state.scissorInFixedPoint.right); ++ ++ // translate to local hottile origin ++ top -= KNOB_MACROTILE_Y_DIM_FIXED * tileY; ++ bottom -= KNOB_MACROTILE_Y_DIM_FIXED * tileY; ++ left -= KNOB_MACROTILE_X_DIM_FIXED * tileX; ++ right -= KNOB_MACROTILE_X_DIM_FIXED * tileX; ++ ++ // convert to raster tiles ++ top >>= (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT); ++ bottom >>= (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT); ++ left >>= (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT); ++ right >>= (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT); ++ ++ const int numSamples = GetNumSamples(pDC->pState->state.rastState.sampleCount); ++ // compute steps between raster tile samples / raster tiles / macro tile rows ++ const uint32_t rasterTileSampleStep = KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<format>::bpp / 8; ++ const uint32_t rasterTileStep = (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<format>::bpp / 8)) * numSamples; ++ const uint32_t macroTileRowStep = (KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM) * rasterTileStep; ++ const uint32_t pitch = (FormatTraits<format>::bpp * KNOB_MACROTILE_X_DIM / 8); ++ ++ HOTTILE *pHotTile =
pDC->pContext->pHotTileMgr->GetHotTile(pDC->pContext, pDC, macroTile, rt, true, numSamples); ++ uint32_t rasterTileStartOffset = (ComputeTileOffset2D< TilingTraits<SWR_TILE_SWIZZLED, FormatTraits<format>::bpp> >(pitch, left, top)) * numSamples; ++ uint8_t* pRasterTileRow = pHotTile->pBuffer + rasterTileStartOffset; //(ComputeTileOffset2D< TilingTraits<SWR_TILE_SWIZZLED, FormatTraits<format>::bpp> >(pitch, x, y)) * numSamples; ++ ++ // loop over all raster tiles in the current hot tile ++ for (int y = top; y <= bottom; ++y) ++ { ++ uint8_t* pRasterTile = pRasterTileRow; ++ for (int x = left; x <= right; ++x) ++ { ++ for( int sampleNum = 0; sampleNum < numSamples; sampleNum++) ++ { ++ ClearRasterTile<format>(pRasterTile, vClear); ++ pRasterTile += rasterTileSampleStep; ++ } ++ } ++ pRasterTileRow += macroTileRowStep; ++ } ++ ++ pHotTile->state = HOTTILE_DIRTY; ++} ++ ++ ++void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData) ++{ ++ if (KNOB_FAST_CLEAR) ++ { ++ CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData; ++ SWR_CONTEXT *pContext = pDC->pContext; ++ SWR_MULTISAMPLE_COUNT sampleCount = pDC->pState->state.rastState.sampleCount; ++ uint32_t numSamples = GetNumSamples(sampleCount); ++ ++ SWR_ASSERT(pClear->flags.bits != 0); // shouldn't be here without a reason. ++ ++ RDTSC_START(BEClear); ++ ++ if (pClear->flags.mask & SWR_CLEAR_COLOR) ++ { ++ HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_COLOR0, true, numSamples); ++ // All we want to do here is to mark the hot tile as being in a "needs clear" state. ++ pHotTile->clearData[0] = *(DWORD*)&(pClear->clearRTColor[0]); ++ pHotTile->clearData[1] = *(DWORD*)&(pClear->clearRTColor[1]); ++ pHotTile->clearData[2] = *(DWORD*)&(pClear->clearRTColor[2]); ++ pHotTile->clearData[3] = *(DWORD*)&(pClear->clearRTColor[3]); ++ pHotTile->state = HOTTILE_CLEAR; ++ } ++ ++ if (pClear->flags.mask & SWR_CLEAR_DEPTH) ++ { ++ HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_DEPTH, true, numSamples); ++ pHotTile->clearData[0] = *(DWORD*)&pClear->clearDepth; ++ pHotTile->state = HOTTILE_CLEAR; ++ } ++ ++ if (pClear->flags.mask & SWR_CLEAR_STENCIL) ++ { ++ HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_STENCIL, true, numSamples); ++ ++ pHotTile->clearData[0] = *(DWORD*)&pClear->clearStencil; ++ pHotTile->state = HOTTILE_CLEAR; ++ } ++ ++ RDTSC_STOP(BEClear, 0, 0); ++ } ++ else ++ { ++ // Legacy clear ++ CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData; ++ RDTSC_START(BEClear); ++ ++ if (pClear->flags.mask & SWR_CLEAR_COLOR) ++ { ++ /// @todo clear data should come in as RGBA32_FLOAT ++ DWORD clearData[4]; ++ float clearFloat[4]; ++ clearFloat[0] = ((BYTE*)(&pClear->clearRTColor))[0] / 255.0f; ++ clearFloat[1] = ((BYTE*)(&pClear->clearRTColor))[1] / 255.0f; ++ clearFloat[2] = ((BYTE*)(&pClear->clearRTColor))[2] / 255.0f; ++ clearFloat[3] = ((BYTE*)(&pClear->clearRTColor))[3] / 255.0f; ++ clearData[0] = *(DWORD*)&clearFloat[0]; ++ clearData[1] = *(DWORD*)&clearFloat[1]; ++ clearData[2] = *(DWORD*)&clearFloat[2]; ++ clearData[3] = *(DWORD*)&clearFloat[3]; ++ ++ PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_COLOR_HOT_TILE_FORMAT]; ++ SWR_ASSERT(pfnClearTiles != nullptr); ++ ++ pfnClearTiles(pDC, SWR_ATTACHMENT_COLOR0, macroTile, clearData); ++ } ++ ++ if (pClear->flags.mask & SWR_CLEAR_DEPTH) ++ { ++ DWORD clearData[4]; ++ clearData[0] = *(DWORD*)&pClear->clearDepth; ++ PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_DEPTH_HOT_TILE_FORMAT]; ++ SWR_ASSERT(pfnClearTiles !=
nullptr); ++ ++ pfnClearTiles(pDC, SWR_ATTACHMENT_DEPTH, macroTile, clearData); ++ } ++ ++ if (pClear->flags.mask & SWR_CLEAR_STENCIL) ++ { ++ uint32_t value = pClear->clearStencil; ++ DWORD clearData[4]; ++ clearData[0] = *(DWORD*)&value; ++ PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_STENCIL_HOT_TILE_FORMAT]; ++ ++ pfnClearTiles(pDC, SWR_ATTACHMENT_STENCIL, macroTile, clearData); ++ } ++ ++ RDTSC_STOP(BEClear, 0, 0); ++ } ++} ++ ++ ++void ProcessStoreTileBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData) ++{ ++ RDTSC_START(BEStoreTiles); ++ STORE_TILES_DESC *pDesc = (STORE_TILES_DESC*)pData; ++ SWR_CONTEXT *pContext = pDC->pContext; ++ ++#ifdef KNOB_ENABLE_RDTSC ++ uint32_t numTiles = 0; ++#endif ++ SWR_FORMAT srcFormat; ++ switch (pDesc->attachment) ++ { ++ case SWR_ATTACHMENT_COLOR0: ++ case SWR_ATTACHMENT_COLOR1: ++ case SWR_ATTACHMENT_COLOR2: ++ case SWR_ATTACHMENT_COLOR3: ++ case SWR_ATTACHMENT_COLOR4: ++ case SWR_ATTACHMENT_COLOR5: ++ case SWR_ATTACHMENT_COLOR6: ++ case SWR_ATTACHMENT_COLOR7: srcFormat = KNOB_COLOR_HOT_TILE_FORMAT; break; ++ case SWR_ATTACHMENT_DEPTH: srcFormat = KNOB_DEPTH_HOT_TILE_FORMAT; break; ++ case SWR_ATTACHMENT_STENCIL: srcFormat = KNOB_STENCIL_HOT_TILE_FORMAT; break; ++ default: SWR_ASSERT(false, "Unknown attachment: %d", pDesc->attachment); srcFormat = KNOB_COLOR_HOT_TILE_FORMAT; break; ++ } ++ ++ uint32_t x, y; ++ MacroTileMgr::getTileIndices(macroTile, x, y); ++ ++ // Only need to store the hottile if it's been rendered to... ++ HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, pDesc->attachment, false); ++ if (pHotTile) ++ { ++ // clear if clear is pending (i.e., not rendered to), then mark as dirty for store. ++ if (pHotTile->state == HOTTILE_CLEAR) ++ { ++ PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[srcFormat]; ++ SWR_ASSERT(pfnClearTiles != nullptr); ++ ++ pfnClearTiles(pDC, pDesc->attachment, macroTile, pHotTile->clearData); ++ } ++ ++ if (pHotTile->state == HOTTILE_DIRTY || pDesc->postStoreTileState == (SWR_TILE_STATE)HOTTILE_DIRTY) ++ { ++ int destX = KNOB_MACROTILE_X_DIM * x; ++ int destY = KNOB_MACROTILE_Y_DIM * y; ++ ++ pContext->pfnStoreTile(GetPrivateState(pDC), srcFormat, ++ pDesc->attachment, destX, destY, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer); ++ } ++ ++ ++ if (pHotTile->state == HOTTILE_DIRTY || pHotTile->state == HOTTILE_RESOLVED) ++ { ++ pHotTile->state = (HOTTILE_STATE)pDesc->postStoreTileState; ++ } ++ } ++ RDTSC_STOP(BEStoreTiles, numTiles, pDC->drawId); ++} ++ ++ ++void ProcessInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData) ++{ ++ INVALIDATE_TILES_DESC *pDesc = (INVALIDATE_TILES_DESC*)pData; ++ SWR_CONTEXT *pContext = pDC->pContext; ++ ++ for (uint32_t i = 0; i < SWR_NUM_ATTACHMENTS; ++i) ++ { ++ if (pDesc->attachmentMask & (1 << i)) ++ { ++ HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, (SWR_RENDERTARGET_ATTACHMENT)i, false); ++ if (pHotTile) ++ { ++ pHotTile->state = HOTTILE_INVALID; ++ } ++ } ++ } ++} ++ ++#if KNOB_SIMD_WIDTH == 8 ++const __m256 vQuadCenterOffsetsX = { 0.5, 1.5, 0.5, 1.5, 2.5, 3.5, 2.5, 3.5 }; ++const __m256 vQuadCenterOffsetsY = { 0.5, 0.5, 1.5, 1.5, 0.5, 0.5, 1.5, 1.5 }; ++const __m256 vQuadULOffsetsX ={0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0}; ++const __m256 vQuadULOffsetsY ={0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0}; ++#define MASK 0xff ++#else ++#error Unsupported vector width ++#endif ++ ++INLINE ++bool CanEarlyZ(const SWR_PS_STATE *pPSState) ++{ ++ 
return (!pPSState->writesODepth && !pPSState->usesSourceDepth); ++} ++ ++simdmask ComputeUserClipMask(uint8_t clipMask, float* pUserClipBuffer, simdscalar vI, simdscalar vJ) ++{ ++ simdscalar vClipMask = _simd_setzero_ps(); ++ uint32_t numClipDistance = _mm_popcnt_u32(clipMask); ++ ++ for (uint32_t i = 0; i < numClipDistance; ++i) ++ { ++ // pull triangle clip distance values from clip buffer ++ simdscalar vA = _simd_broadcast_ss(pUserClipBuffer++); ++ simdscalar vB = _simd_broadcast_ss(pUserClipBuffer++); ++ simdscalar vC = _simd_broadcast_ss(pUserClipBuffer++); ++ ++ // interpolate ++ simdscalar vInterp = vplaneps(vA, vB, vC, vI, vJ); ++ ++ // clip if interpolated clip distance is < 0 || NAN ++ simdscalar vCull = _simd_cmp_ps(_simd_setzero_ps(), vInterp, _CMP_NLE_UQ); ++ ++ vClipMask = _simd_or_ps(vClipMask, vCull); ++ } ++ ++ return _simd_movemask_ps(vClipMask); ++} ++ ++template<uint32_t MaxRT, SWR_MULTISAMPLE_COUNT sampleCount> ++void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers) ++{ ++ RDTSC_START(BESetup); ++ ++ SWR_CONTEXT *pContext = pDC->pContext; ++ const API_STATE& state = GetApiState(pDC); ++ const SWR_RASTSTATE& rastState = state.rastState; ++ const SWR_PS_STATE *pPSState = &state.psState; ++ const SWR_BLEND_STATE *pBlendState = &state.blendState; ++ ++ // broadcast scalars ++ simdscalar vIa = _simd_broadcast_ss(&work.I[0]); ++ simdscalar vIb = _simd_broadcast_ss(&work.I[1]); ++ simdscalar vIc = _simd_broadcast_ss(&work.I[2]); ++ ++ simdscalar vJa = _simd_broadcast_ss(&work.J[0]); ++ simdscalar vJb = _simd_broadcast_ss(&work.J[1]); ++ simdscalar vJc = _simd_broadcast_ss(&work.J[2]); ++ ++ simdscalar vZa = _simd_broadcast_ss(&work.Z[0]); ++ simdscalar vZb = _simd_broadcast_ss(&work.Z[1]); ++ simdscalar vZc = _simd_broadcast_ss(&work.Z[2]); ++ ++ simdscalar vRecipDet = _simd_broadcast_ss(&work.recipDet); ++ ++ simdscalar vAOneOverW = _simd_broadcast_ss(&work.OneOverW[0]); ++ simdscalar vBOneOverW = _simd_broadcast_ss(&work.OneOverW[1]); ++ simdscalar vCOneOverW = _simd_broadcast_ss(&work.OneOverW[2]); ++ ++ uint8_t *pColorBase[SWR_NUM_RENDERTARGETS]; ++ for(uint32_t rt = 0; rt <= MaxRT; ++rt) ++ { ++ pColorBase[rt] = renderBuffers.pColor[rt]; ++ } ++ uint8_t *pDepthBase = renderBuffers.pDepth, *pStencilBase = renderBuffers.pStencil; ++ RDTSC_STOP(BESetup, 0, 0); ++ ++ SWR_PS_CONTEXT psContext; ++ psContext.pAttribs = work.pAttribs; ++ psContext.pPerspAttribs = work.pPerspAttribs; ++ psContext.frontFace = work.triFlags.frontFacing; ++ psContext.primID = work.triFlags.primID; ++ ++ // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs ++ psContext.I = work.I; ++ psContext.J = work.J; ++ psContext.recipDet = work.recipDet; ++ psContext.pSamplePos = work.pSamplePos; ++ const uint32_t numSamples = MultisampleTraits<sampleCount>::numSamples; ++ ++ for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM) ++ { ++ simdscalar vYSamplePosUL; ++ if(sampleCount == SWR_MULTISAMPLE_1X) ++ { ++ // pixel center ++ psContext.vY = _simd_add_ps(vQuadCenterOffsetsY, _simd_set1_ps((float)yy)); ++ } ++ else ++ { ++ // UL pixel corner ++ vYSamplePosUL = _simd_add_ps(vQuadULOffsetsY, _simd_set1_ps((float)yy)); ++ } ++ ++ for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM) ++ { ++ simdscalar vXSamplePosUL; ++ if(sampleCount > SWR_MULTISAMPLE_1X) ++ { ++ // UL pixel corner ++ vXSamplePosUL = _simd_add_ps(vQuadULOffsetsX, _simd_set1_ps((float)xx)); ++ } ++ ++ // @todo: uint32_t sampleMask =
state.rastState.sampleMask & MultisampleTraits<sampleCount>::sampleMask; ++ for(uint32_t sample = 0; sample < numSamples; sample++) ++ { ++ /// @todo: sampleMask / inputcoverage ++ if (work.coverageMask[sample] & MASK) ++ { ++ RDTSC_START(BEBarycentric); ++ ++ if(sampleCount == SWR_MULTISAMPLE_1X) ++ { ++ // pixel center ++ psContext.vX = _simd_add_ps(vQuadCenterOffsetsX, _simd_set1_ps((float)xx)); ++ } ++ else ++ { ++ // calculate per sample positions ++ psContext.vX = _simd_add_ps(vXSamplePosUL, MultisampleTraits<sampleCount>::vX(sample)); ++ psContext.vY = _simd_add_ps(vYSamplePosUL, MultisampleTraits<sampleCount>::vY(sample)); ++ } ++ ++ // evaluate I,J ++ psContext.vI = vplaneps(vIa, vIb, vIc, psContext.vX, psContext.vY); ++ psContext.vJ = vplaneps(vJa, vJb, vJc, psContext.vX, psContext.vY); ++ psContext.vI = _simd_mul_ps(psContext.vI, vRecipDet); ++ psContext.vJ = _simd_mul_ps(psContext.vJ, vRecipDet); ++ ++ // interpolate z ++ psContext.vZ = vplaneps(vZa, vZb, vZc, psContext.vI, psContext.vJ); ++ RDTSC_STOP(BEBarycentric, 0, 0); ++ ++ simdmask coverageMask = work.coverageMask[sample] & MASK; ++ ++ // interpolate user clip distance if available ++ if (rastState.clipDistanceMask) ++ { ++ coverageMask &= ~ComputeUserClipMask(rastState.clipDistanceMask, work.pUserClipBuffer, ++ psContext.vI, psContext.vJ); ++ } ++ ++ simdscalar depthPassMask = vMask(coverageMask); ++ ++ uint8_t *pDepthSample, *pStencilSample; ++ if(sampleCount == SWR_MULTISAMPLE_1X) ++ { ++ pDepthSample = pDepthBase; ++ pStencilSample = pStencilBase; ++ } ++ else ++ { ++ // offset depth/stencil buffers to current sample ++ pDepthSample = pDepthBase + MultisampleTraits<sampleCount>::RasterTileDepthOffset(sample); ++ pStencilSample = pStencilBase + MultisampleTraits<sampleCount>::RasterTileStencilOffset(sample); ++ } ++ ++ // Early-Z? ++ if (CanEarlyZ(pPSState)) ++ { ++ RDTSC_START(BEEarlyDepthTest); ++ depthPassMask = ZTest(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, ++ psContext.vZ, pDepthSample, depthPassMask, pStencilSample, pPSState->killsPixel); ++ RDTSC_STOP(BEEarlyDepthTest, 0, 0); ++ ++ if (!_simd_movemask_ps(depthPassMask)) ++ { ++ work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); ++ continue; ++ } ++ } ++ ++ // interpolate 1/w ++ psContext.vOneOverW = vplaneps(vAOneOverW, vBOneOverW, vCOneOverW, psContext.vI, psContext.vJ); ++ psContext.sampleIndex = sample; ++ psContext.mask = _simd_castps_si(depthPassMask); ++ ++ // execute pixel shader ++ RDTSC_START(BEPixelShader); ++ state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext); ++ RDTSC_STOP(BEPixelShader, 0, 0); ++ ++ depthPassMask = _simd_castsi_ps(psContext.mask); ++ ++ //// late-Z ++ if (!CanEarlyZ(pPSState) || pPSState->killsPixel) ++ { ++ RDTSC_START(BELateDepthTest); ++ depthPassMask = ZTest(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, ++ psContext.vZ, pDepthSample, depthPassMask, pStencilSample, false); ++ RDTSC_STOP(BELateDepthTest, 0, 0); ++ ++ if (!_simd_movemask_ps(depthPassMask)) ++ { ++ work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); ++ continue; ++ } ++ } ++ ++ uint32_t statMask = _simd_movemask_ps(depthPassMask); ++ uint32_t statCount = _mm_popcnt_u32(statMask); ++ UPDATE_STAT(DepthPassCount, statCount); ++ ++ simdscalari mask = _simd_castps_si(depthPassMask); ++ ++ // output merger ++ RDTSC_START(BEOutputMerger); ++ ++ if(sampleCount != SWR_MULTISAMPLE_1X) ++ { ++ if(rastState.isSampleMasked[sample]) ++ { ++ continue; ++ } ++ } ++ ++ uint32_t rasterTileColorOffset = MultisampleTraits<sampleCount>::RasterTileColorOffset(sample); ++ for
(uint32_t rt = 0; rt <= MaxRT; ++rt) ++ { ++ uint8_t *pColorSample; ++ if(sampleCount == SWR_MULTISAMPLE_1X) ++ { ++ pColorSample = pColorBase[rt]; ++ } ++ else ++ { ++ pColorSample = pColorBase[rt] + rasterTileColorOffset; ++ } ++ ++ const SWR_RENDER_TARGET_BLEND_STATE *pRTBlend = &pBlendState->renderTarget[rt]; ++ ++ // Blend outputs ++ if (pRTBlend->colorBlendEnable) ++ { ++ state.pfnBlendFunc[rt](pBlendState, psContext.shaded[rt], psContext.shaded[1], pColorSample, psContext.shaded[rt]); ++ } ++ ++ ///@todo can only use maskstore fast path if bpc is 32. Assuming hot tile is RGBA32_FLOAT. ++ static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, "Unsupported hot tile format"); ++ ++ const uint32_t simd = KNOB_SIMD_WIDTH * sizeof(float); ++ ++ // store with color mask ++ if (!pRTBlend->writeDisableRed) ++ { ++ _simd_maskstore_ps((float*)pColorSample, mask, psContext.shaded[rt].x); ++ } ++ if (!pRTBlend->writeDisableGreen) ++ { ++ _simd_maskstore_ps((float*)(pColorSample + simd), mask, psContext.shaded[rt].y); ++ } ++ if (!pRTBlend->writeDisableBlue) ++ { ++ _simd_maskstore_ps((float*)(pColorSample + simd * 2), mask, psContext.shaded[rt].z); ++ } ++ if (!pRTBlend->writeDisableAlpha) ++ { ++ _simd_maskstore_ps((float*)(pColorSample + simd * 3), mask, psContext.shaded[rt].w); ++ } ++ } ++ ++ RDTSC_STOP(BEOutputMerger, 0, 0); ++ } ++ work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); ++ } ++ RDTSC_START(BEEndTile); ++ pDepthBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8; ++ pStencilBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8; ++ ++ for (uint32_t rt = 0; rt <= MaxRT; ++rt) ++ { ++ pColorBase[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8; ++ } ++ RDTSC_STOP(BEEndTile, 0, 0); ++ } ++ } ++} ++ ++template<uint32_t MaxRT, SWR_MULTISAMPLE_COUNT sampleCount> ++void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers) ++{ ++ RDTSC_START(BESetup); ++ ++ SWR_CONTEXT *pContext = pDC->pContext; ++ const API_STATE& state = GetApiState(pDC); ++ const SWR_RASTSTATE& rastState = state.rastState; ++ const SWR_PS_STATE *pPSState = &state.psState; ++ const SWR_BLEND_STATE *pBlendState = &state.blendState; ++ ++ // broadcast scalars ++ simdscalar vIa = _simd_broadcast_ss(&work.I[0]); ++ simdscalar vIb = _simd_broadcast_ss(&work.I[1]); ++ simdscalar vIc = _simd_broadcast_ss(&work.I[2]); ++ ++ simdscalar vJa = _simd_broadcast_ss(&work.J[0]); ++ simdscalar vJb = _simd_broadcast_ss(&work.J[1]); ++ simdscalar vJc = _simd_broadcast_ss(&work.J[2]); ++ ++ simdscalar vZa = _simd_broadcast_ss(&work.Z[0]); ++ simdscalar vZb = _simd_broadcast_ss(&work.Z[1]); ++ simdscalar vZc = _simd_broadcast_ss(&work.Z[2]); ++ ++ simdscalar vRecipDet = _simd_broadcast_ss(&work.recipDet); ++ ++ simdscalar vAOneOverW = _simd_broadcast_ss(&work.OneOverW[0]); ++ simdscalar vBOneOverW = _simd_broadcast_ss(&work.OneOverW[1]); ++ simdscalar vCOneOverW = _simd_broadcast_ss(&work.OneOverW[2]); ++ ++ uint8_t *pColorBase[SWR_NUM_RENDERTARGETS]; ++ for(uint32_t rt = 0; rt <= MaxRT; ++rt) ++ { ++ pColorBase[rt] = renderBuffers.pColor[rt]; ++ } ++ uint8_t *pDepthBase = renderBuffers.pDepth, *pStencilBase = renderBuffers.pStencil; ++ RDTSC_STOP(BESetup, 0, 0); ++ ++ SWR_PS_CONTEXT psContext; ++ psContext.pAttribs = work.pAttribs; ++ psContext.pPerspAttribs = work.pPerspAttribs; ++ psContext.frontFace = work.triFlags.frontFacing; ++ psContext.primID = work.triFlags.primID; ++ ++ // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs ++
psContext.I = work.I; ++ psContext.J = work.J; ++ psContext.recipDet = work.recipDet; ++ psContext.pSamplePos = work.pSamplePos; ++ psContext.sampleIndex = 0; ++ ++ const uint32_t numSamples = MultisampleTraits<sampleCount>::numSamples; ++ for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM) ++ { ++ simdscalar vYSamplePosUL = _simd_add_ps(vQuadULOffsetsY, _simd_set1_ps((float)yy)); ++ simdscalar vYSamplePosCenter = _simd_add_ps(vQuadCenterOffsetsY, _simd_set1_ps((float)yy)); ++ for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM) ++ { ++ simdscalar vXSamplePosUL = _simd_add_ps(vQuadULOffsetsX, _simd_set1_ps((float)xx)); ++ simdscalar vXSamplePosCenter = _simd_add_ps(vQuadCenterOffsetsX, _simd_set1_ps((float)xx)); ++ ++ // if oDepth written to, or there is a potential to discard any samples, we need to ++ // run the PS early, then interp or broadcast Z and test ++ if(pPSState->writesODepth || pPSState->killsPixel) ++ { ++ RDTSC_START(BEBarycentric); ++ // set pixel center positions ++ psContext.vX = vXSamplePosCenter; ++ psContext.vY = vYSamplePosCenter; ++ ++ // evaluate I, J at pixel center ++ psContext.vI = vplaneps(vIa, vIb, vIc, psContext.vX, psContext.vY); ++ psContext.vJ = vplaneps(vJa, vJb, vJc, psContext.vX, psContext.vY); ++ psContext.vI = _simd_mul_ps(psContext.vI, vRecipDet); ++ psContext.vJ = _simd_mul_ps(psContext.vJ, vRecipDet); ++ ++ // interpolate z ++ psContext.vZ = vplaneps(vZa, vZb, vZc, psContext.vI, psContext.vJ); ++ ++ RDTSC_STOP(BEBarycentric, 0, 0); ++ ++ // interpolate 1/w ++ psContext.vOneOverW = vplaneps(vAOneOverW, vBOneOverW, vCOneOverW, psContext.vI, psContext.vJ); ++ ++ /// @todo: sampleMask / inputcoverage ++ // for now just pass in all 1s ++ psContext.mask = _simd_set1_epi32(-1); ++ ++ // execute pixel shader ++ RDTSC_START(BEPixelShader); ++ state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext); ++ RDTSC_STOP(BEPixelShader, 0, 0); ++ } ++ else ++ { ++ /// @todo: sampleMask / inputcoverage ++ // for now just pass through full pixel output ++ psContext.mask = _simd_set1_epi32(-1); ++ } ++ ++ simdscalar depthPassMask[numSamples]; ++ simdscalar anyDepthSamplePassed = _simd_setzero_ps(); ++ for(uint32_t sample = 0; sample < numSamples; sample++) ++ { ++ /// @todo: sampleMask / inputcoverage ++ depthPassMask[sample] = vMask(work.coverageMask[sample] & MASK); ++ // pull mask back out for any discards and AND with coverage ++ depthPassMask[sample] = _simd_and_ps(depthPassMask[sample], _simd_castsi_ps(psContext.mask)); ++ ++ if (!_simd_movemask_ps(depthPassMask[sample])) ++ { ++ depthPassMask[sample] = _simd_setzero_ps(); ++ continue; ++ } ++ ++ // if oDepth isn't written to, we need to interpolate Z for each sample ++ // if clip distances are enabled, we need to interpolate for each sample ++ if(!pPSState->writesODepth || rastState.clipDistanceMask) ++ { ++ RDTSC_START(BEBarycentric); ++ // calculate per sample positions ++ simdscalar vSamplePosX = _simd_add_ps(vXSamplePosUL, MultisampleTraits<sampleCount>::vX(sample)); ++ simdscalar vSamplePosY = _simd_add_ps(vYSamplePosUL, MultisampleTraits<sampleCount>::vY(sample)); ++ ++ // evaluate I,J at sample positions ++ psContext.vI = vplaneps(vIa, vIb, vIc, vSamplePosX, vSamplePosY); ++ psContext.vJ = vplaneps(vJa, vJb, vJc, vSamplePosX, vSamplePosY); ++ psContext.vI = _simd_mul_ps(psContext.vI, vRecipDet); ++ psContext.vJ = _simd_mul_ps(psContext.vJ, vRecipDet); ++ ++ // interpolate z ++ if (!pPSState->writesODepth) ++ { ++ psContext.vZ = vplaneps(vZa, vZb, vZc, psContext.vI, psContext.vJ); ++ } ++ ++ // interpolate
clip distances ++ if (rastState.clipDistanceMask) ++ { ++ uint8_t clipMask = ComputeUserClipMask(rastState.clipDistanceMask, work.pUserClipBuffer, ++ psContext.vI, psContext.vJ); ++ depthPassMask[sample] = _simd_and_ps(depthPassMask[sample], vMask(~clipMask)); ++ } ++ RDTSC_STOP(BEBarycentric, 0, 0); ++ } ++ // else 'broadcast' and test psContext.vZ from the PS invocation for each sample ++ ++ // offset depth/stencil buffers to current sample ++ uint8_t *pDepthSample = pDepthBase + MultisampleTraits<sampleCount>::RasterTileDepthOffset(sample); ++ uint8_t *pStencilSample = pStencilBase + MultisampleTraits<sampleCount>::RasterTileStencilOffset(sample); ++ ++ // ZTest for this sample ++ RDTSC_START(BEEarlyDepthTest); ++ depthPassMask[sample] = ZTest(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, ++ psContext.vZ, pDepthSample, depthPassMask[sample], pStencilSample, false); ++ RDTSC_STOP(BEEarlyDepthTest, 0, 0); ++ ++ anyDepthSamplePassed = _simd_or_ps(anyDepthSamplePassed, depthPassMask[sample]); ++ ++ uint32_t statMask = _simd_movemask_ps(depthPassMask[sample]); ++ uint32_t statCount = _mm_popcnt_u32(statMask); ++ UPDATE_STAT(DepthPassCount, statCount); ++ } ++ ++ // if we didn't have to execute the PS early, and at least 1 sample passed the depth test, run the PS ++ if(!pPSState->writesODepth && !pPSState->killsPixel && _simd_movemask_ps(anyDepthSamplePassed)) ++ { ++ RDTSC_START(BEBarycentric); ++ // set pixel center positions ++ psContext.vX = vXSamplePosCenter; ++ psContext.vY = vYSamplePosCenter; ++ ++ // evaluate I,J at pixel center ++ psContext.vI = vplaneps(vIa, vIb, vIc, psContext.vX, psContext.vY); ++ psContext.vJ = vplaneps(vJa, vJb, vJc, psContext.vX, psContext.vY); ++ psContext.vI = _simd_mul_ps(psContext.vI, vRecipDet); ++ psContext.vJ = _simd_mul_ps(psContext.vJ, vRecipDet); ++ ++ // interpolate z ++ psContext.vZ = vplaneps(vZa, vZb, vZc, psContext.vI, psContext.vJ); ++ RDTSC_STOP(BEBarycentric, 0, 0); ++ ++ // interpolate 1/w ++ psContext.vOneOverW = vplaneps(vAOneOverW, vBOneOverW, vCOneOverW, psContext.vI, psContext.vJ); ++ ++ // execute pixel shader ++ RDTSC_START(BEPixelShader); ++ state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext); ++ RDTSC_STOP(BEPixelShader, 0, 0); ++ } ++ else ++ { ++ goto Endtile; ++ } ++ ++ // loop over all samples, broadcasting the results of the PS to all passing pixels ++ for(uint32_t sample = 0; sample < numSamples; sample++) ++ { ++ if(sampleCount != SWR_MULTISAMPLE_1X) ++ { ++ if(rastState.isSampleMasked[sample]) ++ continue; ++ } ++ ++ // output merger ++ RDTSC_START(BEOutputMerger); ++ // skip if none of the pixels for this sample passed ++ if(!_simd_movemask_ps(depthPassMask[sample])) ++ { ++ depthPassMask[sample] = _simd_setzero_ps(); ++ continue; ++ } ++ simdscalari mask = _simd_castps_si(depthPassMask[sample]); ++ uint32_t rasterTileColorOffset = MultisampleTraits<sampleCount>::RasterTileColorOffset(sample); ++ for(uint32_t rt = 0; rt <= MaxRT; ++rt) ++ { ++ uint8_t *pColorSample = pColorBase[rt] + rasterTileColorOffset; ++ ++ const SWR_RENDER_TARGET_BLEND_STATE *pRTBlend = &pBlendState->renderTarget[rt]; ++ ++ // Blend outputs ++ if(pRTBlend->colorBlendEnable) ++ { ++ state.pfnBlendFunc[rt](pBlendState, psContext.shaded[rt], psContext.shaded[1], pColorSample, psContext.shaded[rt]); ++ } ++ ++ ///@todo can only use maskstore fast path if bpc is 32. Assuming hot tile is RGBA32_FLOAT.
++ static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, "Unsupported hot tile format"); ++ ++ const uint32_t simd = KNOB_SIMD_WIDTH * sizeof(float); ++ ++ // store with color mask ++ if(!pRTBlend->writeDisableRed) ++ { ++ _simd_maskstore_ps((float*)pColorSample, mask, psContext.shaded[rt].x); ++ } ++ if(!pRTBlend->writeDisableGreen) ++ { ++ _simd_maskstore_ps((float*)(pColorSample + simd), mask, psContext.shaded[rt].y); ++ } ++ if(!pRTBlend->writeDisableBlue) ++ { ++ _simd_maskstore_ps((float*)(pColorSample + simd * 2), mask, psContext.shaded[rt].z); ++ } ++ if(!pRTBlend->writeDisableAlpha) ++ { ++ _simd_maskstore_ps((float*)(pColorSample + simd * 3), mask, psContext.shaded[rt].w); ++ } ++ } ++ RDTSC_STOP(BEOutputMerger, 0, 0); ++ } ++ ++Endtile: ++ RDTSC_START(BEEndTile); ++ for(uint32_t sample = 0; sample < numSamples; sample++) ++ { ++ work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); ++ } ++ ++ pDepthBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8; ++ pStencilBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8; ++ ++ for(uint32_t rt = 0; rt <= MaxRT; ++rt) ++ { ++ pColorBase[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8; ++ } ++ RDTSC_STOP(BEEndTile, 0, 0); ++ } ++ } ++} ++// optimized backend flow with NULL PS ++void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers) ++{ ++ RDTSC_START(BESetup); ++ ++ SWR_CONTEXT *pContext = pDC->pContext; ++ const API_STATE& state = GetApiState(pDC); ++ // todo multisample ++ uint64_t coverageMask = work.coverageMask[0]; ++ ++ // broadcast scalars ++ simdscalar vIa = _simd_broadcast_ss(&work.I[0]); ++ simdscalar vIb = _simd_broadcast_ss(&work.I[1]); ++ simdscalar vIc = _simd_broadcast_ss(&work.I[2]); ++ ++ simdscalar vJa = _simd_broadcast_ss(&work.J[0]); ++ simdscalar vJb = _simd_broadcast_ss(&work.J[1]); ++ simdscalar vJc = _simd_broadcast_ss(&work.J[2]); ++ ++ simdscalar vZa = _simd_broadcast_ss(&work.Z[0]); ++ simdscalar vZb = _simd_broadcast_ss(&work.Z[1]); ++ simdscalar vZc = _simd_broadcast_ss(&work.Z[2]); ++ ++ simdscalar vRecipDet = _simd_broadcast_ss(&work.recipDet); ++ ++ BYTE *pDepthBase = renderBuffers.pDepth, *pStencilBase = renderBuffers.pStencil; ++ ++ RDTSC_STOP(BESetup, 0, 0); ++ ++ SWR_PS_CONTEXT psContext; ++ for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM) ++ { ++ psContext.vY = _simd_add_ps(vQuadCenterOffsetsY, _simd_set1_ps((float)yy)); ++ for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM) ++ { ++ if (coverageMask & MASK) ++ { ++ RDTSC_START(BEBarycentric); ++ ++ // calculate pixel positions ++ psContext.vX = _simd_add_ps(vQuadCenterOffsetsX, _simd_set1_ps((float)xx)); ++ ++ // evaluate I,J ++ psContext.vI = vplaneps(vIa, vIb, vIc, psContext.vX, psContext.vY); ++ psContext.vJ = vplaneps(vJa, vJb, vJc, psContext.vX, psContext.vY); ++ psContext.vI = _simd_mul_ps(psContext.vI, vRecipDet); ++ psContext.vJ = _simd_mul_ps(psContext.vJ, vRecipDet); ++ ++ // interpolate z ++ psContext.vZ = vplaneps(vZa, vZb, vZc, psContext.vI, psContext.vJ); ++ ++ RDTSC_STOP(BEBarycentric, 0, 0); ++ ++ simdscalar depthPassMask = vMask(coverageMask & MASK); ++ RDTSC_START(BEEarlyDepthTest); ++ depthPassMask = ZTest(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, ++ psContext.vZ, pDepthBase, depthPassMask, pStencilBase, false); ++ RDTSC_STOP(BEEarlyDepthTest, 0, 0); ++ ++ uint32_t statMask = _simd_movemask_ps(depthPassMask); ++ uint32_t statCount = _mm_popcnt_u32(statMask); ++
UPDATE_STAT(DepthPassCount, statCount); ++ } ++ coverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); ++ pDepthBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8; ++ pStencilBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8; ++ } ++ } ++} ++ ++void InitClearTilesTable() ++{ ++ memset(sClearTilesTable, 0, sizeof(sClearTilesTable)); ++ ++ sClearTilesTable[R8G8B8A8_UNORM] = ClearMacroTile<R8G8B8A8_UNORM>; ++ sClearTilesTable[B8G8R8A8_UNORM] = ClearMacroTile<B8G8R8A8_UNORM>; ++ sClearTilesTable[R32_FLOAT] = ClearMacroTile<R32_FLOAT>; ++ sClearTilesTable[R32G32B32A32_FLOAT] = ClearMacroTile<R32G32B32A32_FLOAT>; ++ sClearTilesTable[R8_UINT] = ClearMacroTile<R8_UINT>; ++} ++ ++// initialize backend function tables ++PFN_BACKEND_FUNC gSingleSampleBackendTable[] = { ++ BackendSampleRate<0, SWR_MULTISAMPLE_1X>, ++ BackendSampleRate<1, SWR_MULTISAMPLE_1X>, ++ BackendSampleRate<2, SWR_MULTISAMPLE_1X>, ++ BackendSampleRate<3, SWR_MULTISAMPLE_1X>, ++ BackendSampleRate<4, SWR_MULTISAMPLE_1X>, ++ BackendSampleRate<5, SWR_MULTISAMPLE_1X>, ++ BackendSampleRate<6, SWR_MULTISAMPLE_1X>, ++ BackendSampleRate<7, SWR_MULTISAMPLE_1X>, ++}; ++ ++// MSAA per sample shading rate ++PFN_BACKEND_FUNC gSampleRateBackendTable[SWR_MULTISAMPLE_TYPE_MAX-1][SWR_NUM_RENDERTARGETS] ={ ++ { ++ BackendSampleRate<0, SWR_MULTISAMPLE_2X>, ++ BackendSampleRate<1, SWR_MULTISAMPLE_2X>, ++ BackendSampleRate<2, SWR_MULTISAMPLE_2X>, ++ BackendSampleRate<3, SWR_MULTISAMPLE_2X>, ++ BackendSampleRate<4, SWR_MULTISAMPLE_2X>, ++ BackendSampleRate<5, SWR_MULTISAMPLE_2X>, ++ BackendSampleRate<6, SWR_MULTISAMPLE_2X>, ++ BackendSampleRate<7, SWR_MULTISAMPLE_2X>, ++ }, ++ { ++ BackendSampleRate<0, SWR_MULTISAMPLE_4X>, ++ BackendSampleRate<1, SWR_MULTISAMPLE_4X>, ++ BackendSampleRate<2, SWR_MULTISAMPLE_4X>, ++ BackendSampleRate<3, SWR_MULTISAMPLE_4X>, ++ BackendSampleRate<4, SWR_MULTISAMPLE_4X>, ++ BackendSampleRate<5, SWR_MULTISAMPLE_4X>, ++ BackendSampleRate<6, SWR_MULTISAMPLE_4X>, ++ BackendSampleRate<7, SWR_MULTISAMPLE_4X>, ++ }, ++ { ++ BackendSampleRate<0, SWR_MULTISAMPLE_8X>, ++ BackendSampleRate<1, SWR_MULTISAMPLE_8X>, ++ BackendSampleRate<2, SWR_MULTISAMPLE_8X>, ++ BackendSampleRate<3, SWR_MULTISAMPLE_8X>, ++ BackendSampleRate<4, SWR_MULTISAMPLE_8X>, ++ BackendSampleRate<5, SWR_MULTISAMPLE_8X>, ++ BackendSampleRate<6, SWR_MULTISAMPLE_8X>, ++ BackendSampleRate<7, SWR_MULTISAMPLE_8X>, ++ }, ++ { ++ BackendSampleRate<0, SWR_MULTISAMPLE_16X>, ++ BackendSampleRate<1, SWR_MULTISAMPLE_16X>, ++ BackendSampleRate<2, SWR_MULTISAMPLE_16X>, ++ BackendSampleRate<3, SWR_MULTISAMPLE_16X>, ++ BackendSampleRate<4, SWR_MULTISAMPLE_16X>, ++ BackendSampleRate<5, SWR_MULTISAMPLE_16X>, ++ BackendSampleRate<6, SWR_MULTISAMPLE_16X>, ++ BackendSampleRate<7, SWR_MULTISAMPLE_16X>, ++ } ++}; ++ ++// MSAA per pixel shading rate ++PFN_BACKEND_FUNC gPixelRateBackendTable[SWR_MULTISAMPLE_TYPE_MAX-1][SWR_NUM_RENDERTARGETS] ={ ++ { ++ BackendPixelRate<0, SWR_MULTISAMPLE_2X>, ++ BackendPixelRate<1, SWR_MULTISAMPLE_2X>, ++ BackendPixelRate<2, SWR_MULTISAMPLE_2X>, ++ BackendPixelRate<3, SWR_MULTISAMPLE_2X>, ++ BackendPixelRate<4, SWR_MULTISAMPLE_2X>, ++ BackendPixelRate<5, SWR_MULTISAMPLE_2X>, ++ BackendPixelRate<6, SWR_MULTISAMPLE_2X>, ++ BackendPixelRate<7, SWR_MULTISAMPLE_2X>, ++ }, ++ { ++ BackendPixelRate<0, SWR_MULTISAMPLE_4X>, ++ BackendPixelRate<1, SWR_MULTISAMPLE_4X>, ++ BackendPixelRate<2, SWR_MULTISAMPLE_4X>, ++ BackendPixelRate<3, SWR_MULTISAMPLE_4X>, ++ BackendPixelRate<4, SWR_MULTISAMPLE_4X>, ++ BackendPixelRate<5, SWR_MULTISAMPLE_4X>, ++ BackendPixelRate<6, SWR_MULTISAMPLE_4X>, ++ BackendPixelRate<7, SWR_MULTISAMPLE_4X>, ++ }, ++ {
++ BackendPixelRate<0, SWR_MULTISAMPLE_8X>, ++ BackendPixelRate<1, SWR_MULTISAMPLE_8X>, ++ BackendPixelRate<2, SWR_MULTISAMPLE_8X>, ++ BackendPixelRate<3, SWR_MULTISAMPLE_8X>, ++ BackendPixelRate<4, SWR_MULTISAMPLE_8X>, ++ BackendPixelRate<5, SWR_MULTISAMPLE_8X>, ++ BackendPixelRate<6, SWR_MULTISAMPLE_8X>, ++ BackendPixelRate<7, SWR_MULTISAMPLE_8X>, ++ }, ++ { ++ BackendPixelRate<0, SWR_MULTISAMPLE_16X>, ++ BackendPixelRate<1, SWR_MULTISAMPLE_16X>, ++ BackendPixelRate<2, SWR_MULTISAMPLE_16X>, ++ BackendPixelRate<3, SWR_MULTISAMPLE_16X>, ++ BackendPixelRate<4, SWR_MULTISAMPLE_16X>, ++ BackendPixelRate<5, SWR_MULTISAMPLE_16X>, ++ BackendPixelRate<6, SWR_MULTISAMPLE_16X>, ++ BackendPixelRate<7, SWR_MULTISAMPLE_16X>, ++ } ++}; +diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.h b/src/gallium/drivers/swr/rasterizer/core/backend.h +new file mode 100644 +index 0000000..218f5c0 +--- /dev/null ++++ b/src/gallium/drivers/swr/rasterizer/core/backend.h +@@ -0,0 +1,45 @@ ++/**************************************************************************** ++* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. ++* ++* Permission is hereby granted, free of charge, to any person obtaining a ++* copy of this software and associated documentation files (the "Software"), ++* to deal in the Software without restriction, including without limitation ++* the rights to use, copy, modify, merge, publish, distribute, sublicense, ++* and/or sell copies of the Software, and to permit persons to whom the ++* Software is furnished to do so, subject to the following conditions: ++* ++* The above copyright notice and this permission notice (including the next ++* paragraph) shall be included in all copies or substantial portions of the ++* Software. ++* ++* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++* IN THE SOFTWARE. ++* ++* @file backend.h ++* ++* @brief Backend handles rasterization, pixel shading and output merger ++* operations. 
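The three tables above are indexed by render-target count and, for the MSAA cases, by sample count. A sketch of how an entry might be selected (the indexing scheme is inferred from the table shapes, and it assumes SWR_MULTISAMPLE_1X == 0; the driver's actual selection logic lives elsewhere and may differ):

    // Sketch: pick a backend function for the current state.
    PFN_BACKEND_FUNC SelectBackendFunc(SWR_MULTISAMPLE_COUNT sampleCount,
                                       uint32_t maxRT, bool perSampleShading)
    {
        if (sampleCount == SWR_MULTISAMPLE_1X)
            return gSingleSampleBackendTable[maxRT];
        // (sampleCount - 1) indexes the 2X/4X/8X/16X rows of the MSAA tables
        return perSampleShading ? gSampleRateBackendTable[sampleCount - 1][maxRT]
                                : gPixelRateBackendTable[sampleCount - 1][maxRT];
    }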
++*
++******************************************************************************/
++#pragma once
++
++#include "common/os.h"
++#include "core/context.h"
++
++void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId);
++void ProcessSyncBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData);
++void ProcessQueryStatsBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData);
++void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData);
++void ProcessStoreTileBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData);
++void ProcessInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData);
++void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers);
++void InitClearTilesTable();
++
++extern PFN_BACKEND_FUNC gSingleSampleBackendTable[];
++extern PFN_BACKEND_FUNC gSampleRateBackendTable[SWR_MULTISAMPLE_TYPE_MAX-1][SWR_NUM_RENDERTARGETS];
++extern PFN_BACKEND_FUNC gPixelRateBackendTable[SWR_MULTISAMPLE_TYPE_MAX-1][SWR_NUM_RENDERTARGETS];
+diff --git a/src/gallium/drivers/swr/rasterizer/core/blend.h b/src/gallium/drivers/swr/rasterizer/core/blend.h
+new file mode 100644
+index 0000000..626c237
+--- /dev/null
++++ b/src/gallium/drivers/swr/rasterizer/core/blend.h
+@@ -0,0 +1,318 @@
++/****************************************************************************
++* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
++*
++* Permission is hereby granted, free of charge, to any person obtaining a
++* copy of this software and associated documentation files (the "Software"),
++* to deal in the Software without restriction, including without limitation
++* the rights to use, copy, modify, merge, publish, distribute, sublicense,
++* and/or sell copies of the Software, and to permit persons to whom the
++* Software is furnished to do so, subject to the following conditions:
++*
++* The above copyright notice and this permission notice (including the next
++* paragraph) shall be included in all copies or substantial portions of the
++* Software.
++*
++* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
++* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++* IN THE SOFTWARE.
++*
++* @file blend.h
++*
++* @brief Implementation for blending operations.
++*
++******************************************************************************/
++#include "state.h"
++
++template<bool Color, bool Alpha>
++INLINE
++void GenerateBlendFactor(SWR_BLEND_FACTOR func, simdvector &constantColor, simdvector &src, simdvector &src1, simdvector &dst, simdvector &out)
++{
++ simdvector result;
++
++ switch (func)
++ {
++ case BLENDFACTOR_ZERO:
++ result.x = _simd_setzero_ps();
++ result.y = _simd_setzero_ps();
++ result.z = _simd_setzero_ps();
++ result.w = _simd_setzero_ps();
++ break;
++
++ case BLENDFACTOR_ONE:
++ result.x = _simd_set1_ps(1.0);
++ result.y = _simd_set1_ps(1.0);
++ result.z = _simd_set1_ps(1.0);
++ result.w = _simd_set1_ps(1.0);
++ break;
++
++ case BLENDFACTOR_SRC_COLOR:
++ result = src;
++ break;
++
++ case BLENDFACTOR_DST_COLOR:
++ result = dst;
++ break;
++
++ case BLENDFACTOR_INV_SRC_COLOR:
++ result.x = _simd_sub_ps(_simd_set1_ps(1.0), src.x);
++ result.y = _simd_sub_ps(_simd_set1_ps(1.0), src.y);
++ result.z = _simd_sub_ps(_simd_set1_ps(1.0), src.z);
++ result.w = _simd_sub_ps(_simd_set1_ps(1.0), src.w);
++ break;
++
++ case BLENDFACTOR_INV_DST_COLOR:
++ result.x = _simd_sub_ps(_simd_set1_ps(1.0), dst.x);
++ result.y = _simd_sub_ps(_simd_set1_ps(1.0), dst.y);
++ result.z = _simd_sub_ps(_simd_set1_ps(1.0), dst.z);
++ result.w = _simd_sub_ps(_simd_set1_ps(1.0), dst.w);
++ break;
++
++ case BLENDFACTOR_SRC_ALPHA: result.x = src.w;
++ result.y = src.w;
++ result.z = src.w;
++ result.w = src.w;
++ break;
++
++ case BLENDFACTOR_INV_SRC_ALPHA:
++ {
++ simdscalar oneMinusSrcA = _simd_sub_ps(_simd_set1_ps(1.0), src.w);
++ result.x = oneMinusSrcA;
++ result.y = oneMinusSrcA;
++ result.z = oneMinusSrcA;
++ result.w = oneMinusSrcA;
++ break;
++ }
++
++ case BLENDFACTOR_DST_ALPHA: result.x = dst.w;
++ result.y = dst.w;
++ result.z = dst.w;
++ result.w = dst.w;
++ break;
++
++ case BLENDFACTOR_INV_DST_ALPHA:
++ {
++ simdscalar oneMinusDstA = _simd_sub_ps(_simd_set1_ps(1.0), dst.w);
++ result.x = oneMinusDstA;
++ result.y = oneMinusDstA;
++ result.z = oneMinusDstA;
++ result.w = oneMinusDstA;
++ break;
++ }
++
++ case BLENDFACTOR_SRC_ALPHA_SATURATE:
++ {
++ simdscalar sat = _simd_min_ps(src.w, _simd_sub_ps(_simd_set1_ps(1.0), dst.w));
++ result.x = sat;
++ result.y = sat;
++ result.z = sat;
++ result.w = _simd_set1_ps(1.0);
++ break;
++ }
++
++ case BLENDFACTOR_CONST_COLOR:
++ result.x = constantColor[0];
++ result.y = constantColor[1];
++ result.z = constantColor[2];
++ result.w = constantColor[3];
++ break;
++
++ case BLENDFACTOR_CONST_ALPHA:
++ result.x = result.y = result.z = result.w = constantColor[3];
++ break;
++
++ case BLENDFACTOR_INV_CONST_COLOR:
++ {
++ result.x = _simd_sub_ps(_simd_set1_ps(1.0f), constantColor[0]);
++ result.y = _simd_sub_ps(_simd_set1_ps(1.0f), constantColor[1]);
++ result.z = _simd_sub_ps(_simd_set1_ps(1.0f), constantColor[2]);
++ result.w = _simd_sub_ps(_simd_set1_ps(1.0f), constantColor[3]);
++ break;
++ }
++
++ case BLENDFACTOR_INV_CONST_ALPHA:
++ {
++ result.x = result.y = result.z = result.w = _simd_sub_ps(_simd_set1_ps(1.0f), constantColor[3]);
++ break;
++ }
++
++ case BLENDFACTOR_SRC1_COLOR:
++ result.x = src1.x;
++ result.y = src1.y;
++ result.z = src1.z;
++ result.w = src1.w;
++ break;
++
++ case BLENDFACTOR_SRC1_ALPHA:
++ result.x = result.y = result.z = result.w = src1.w;
++ break;
++
++ case BLENDFACTOR_INV_SRC1_COLOR:
++ result.x = _simd_sub_ps(_simd_set1_ps(1.0f), src1.x);
++ result.y = _simd_sub_ps(_simd_set1_ps(1.0f), src1.y);
++ result.z = _simd_sub_ps(_simd_set1_ps(1.0f), src1.z);
++ result.w = _simd_sub_ps(_simd_set1_ps(1.0f), src1.w);
++ break;
++
++ case BLENDFACTOR_INV_SRC1_ALPHA:
++ result.x = result.y = result.z = result.w = _simd_sub_ps(_simd_set1_ps(1.0f), src1.w);
++ break;
++
++ default: SWR_ASSERT(false, "Unimplemented blend factor: %d", func);
++ }
++
++ if (Color)
++ {
++ out.x = result.x;
++ out.y = result.y;
++ out.z = result.z;
++ }
++ if (Alpha)
++ {
++ out.w = result.w;
++ }
++
++}
++
++template<bool Color, bool Alpha>
++INLINE void BlendFunc(SWR_BLEND_OP blendOp, simdvector &src, simdvector &srcFactor, simdvector &dst, simdvector &dstFactor, simdvector &out)
++{
++ simdvector result;
++
++ switch (blendOp)
++ {
++ case BLENDOP_ADD:
++ result.x = _simd_fmadd_ps(srcFactor.x, src.x, _simd_mul_ps(dstFactor.x, dst.x));
++ result.y = _simd_fmadd_ps(srcFactor.y, src.y, _simd_mul_ps(dstFactor.y, dst.y));
++ result.z = _simd_fmadd_ps(srcFactor.z, src.z, _simd_mul_ps(dstFactor.z, dst.z));
++ result.w = _simd_fmadd_ps(srcFactor.w, src.w, _simd_mul_ps(dstFactor.w, dst.w));
++ break;
++
++ case BLENDOP_SUBTRACT:
++ result.x = _simd_fmsub_ps(srcFactor.x, src.x, _simd_mul_ps(dstFactor.x, dst.x));
++ result.y = _simd_fmsub_ps(srcFactor.y, src.y, _simd_mul_ps(dstFactor.y, dst.y));
++ result.z = _simd_fmsub_ps(srcFactor.z, src.z, _simd_mul_ps(dstFactor.z, dst.z));
++ result.w = _simd_fmsub_ps(srcFactor.w, src.w, _simd_mul_ps(dstFactor.w, dst.w));
++ break;
++
++ case BLENDOP_REVSUBTRACT:
++ result.x = _simd_fmsub_ps(dstFactor.x, dst.x, _simd_mul_ps(srcFactor.x, src.x));
++ result.y = _simd_fmsub_ps(dstFactor.y, dst.y, _simd_mul_ps(srcFactor.y, src.y));
++ result.z = _simd_fmsub_ps(dstFactor.z, dst.z, _simd_mul_ps(srcFactor.z, src.z));
++ result.w = _simd_fmsub_ps(dstFactor.w, dst.w, _simd_mul_ps(srcFactor.w, src.w));
++ break;
++
++ case BLENDOP_MIN:
++ result.x = _simd_min_ps(_simd_mul_ps(srcFactor.x, src.x), _simd_mul_ps(dstFactor.x, dst.x));
++ result.y = _simd_min_ps(_simd_mul_ps(srcFactor.y, src.y), _simd_mul_ps(dstFactor.y, dst.y));
++ result.z = _simd_min_ps(_simd_mul_ps(srcFactor.z, src.z), _simd_mul_ps(dstFactor.z, dst.z));
++ result.w = _simd_min_ps(_simd_mul_ps(srcFactor.w, src.w), _simd_mul_ps(dstFactor.w, dst.w));
++ break;
++
++ case BLENDOP_MAX:
++ result.x = _simd_max_ps(_simd_mul_ps(srcFactor.x, src.x), _simd_mul_ps(dstFactor.x, dst.x));
++ result.y = _simd_max_ps(_simd_mul_ps(srcFactor.y, src.y), _simd_mul_ps(dstFactor.y, dst.y));
++ result.z = _simd_max_ps(_simd_mul_ps(srcFactor.z, src.z), _simd_mul_ps(dstFactor.z, dst.z));
++ result.w = _simd_max_ps(_simd_mul_ps(srcFactor.w, src.w), _simd_mul_ps(dstFactor.w, dst.w));
++ break;
++
++ default:
++ SWR_ASSERT(false, "Unimplemented blend function: %d", blendOp);
++ }
++
++ if (Color)
++ {
++ out.x = result.x;
++ out.y = result.y;
++ out.z = result.z;
++ }
++ if (Alpha)
++ {
++ out.w = result.w;
++ }
++}
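++
++// Per lane, BLENDOP_ADD above evaluates result = srcFactor*src + dstFactor*dst with a
++// single fused multiply-add; BLENDOP_SUBTRACT and BLENDOP_REVSUBTRACT reuse the same
++// two products and differ only in which one _simd_fmsub_ps negates (revsubtract
++// computes dstFactor*dst - srcFactor*src).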
++
++template<SWR_TYPE type>
++INLINE void Clamp(simdvector &src)
++{
++ switch (type)
++ {
++ case SWR_TYPE_FLOAT:
++ break;
++
++ case SWR_TYPE_UNORM:
++ src.x = _simd_max_ps(src.x, _simd_setzero_ps());
++ src.x = _simd_min_ps(src.x, _simd_set1_ps(1.0f));
++
++ src.y = _simd_max_ps(src.y, _simd_setzero_ps());
++ src.y = _simd_min_ps(src.y, _simd_set1_ps(1.0f));
++
++ src.z = _simd_max_ps(src.z, _simd_setzero_ps());
++ src.z = _simd_min_ps(src.z, _simd_set1_ps(1.0f));
++
++ src.w = _simd_max_ps(src.w, _simd_setzero_ps());
++ src.w = _simd_min_ps(src.w, _simd_set1_ps(1.0f));
++ break;
++
++ case SWR_TYPE_SNORM:
++ src.x = _simd_max_ps(src.x, _simd_set1_ps(-1.0f));
++ src.x = _simd_min_ps(src.x, _simd_set1_ps(1.0f));
++
++ src.y = _simd_max_ps(src.y, _simd_set1_ps(-1.0f));
++ src.y = _simd_min_ps(src.y, _simd_set1_ps(1.0f));
++
++ src.z = _simd_max_ps(src.z, _simd_set1_ps(-1.0f));
++ src.z = _simd_min_ps(src.z, _simd_set1_ps(1.0f));
++
++ src.w = _simd_max_ps(src.w, _simd_set1_ps(-1.0f));
++ src.w = _simd_min_ps(src.w, _simd_set1_ps(1.0f));
++ break;
++
++ default:
++ SWR_ASSERT(false, "Unimplemented clamp: %d", type);
++ break;
++ }
++}
++
++template<SWR_TYPE srcType, SWR_TYPE dstType>
++void Blend(const SWR_BLEND_STATE *pBlendState, const SWR_RENDER_TARGET_BLEND_STATE *pState, simdvector &src, simdvector& src1, BYTE *pDst, simdvector &result)
++{
++ // load render target
++ simdvector dst;
++ LoadSOA<KNOB_COLOR_HOT_TILE_FORMAT>(pDst, dst);
++
++ simdvector constColor;
++ constColor.x = _simd_broadcast_ss(&pBlendState->constantColor[0]);
++ constColor.y = _simd_broadcast_ss(&pBlendState->constantColor[1]);
++ constColor.z = _simd_broadcast_ss(&pBlendState->constantColor[2]);
++ constColor.w = _simd_broadcast_ss(&pBlendState->constantColor[3]);
++
++ // clamp src/dst/constant
++ Clamp<srcType>(src);
++ Clamp<srcType>(src1);
++ Clamp<dstType>(dst);
++ Clamp<SWR_TYPE_FLOAT>(constColor);
++
++ simdvector srcFactor, dstFactor;
++ if (pBlendState->independentAlphaBlendEnable)
++ {
++ GenerateBlendFactor<true, false>((SWR_BLEND_FACTOR)pState->sourceBlendFactor, constColor, src, src1, dst, srcFactor);
++ GenerateBlendFactor<false, true>((SWR_BLEND_FACTOR)pState->sourceAlphaBlendFactor, constColor, src, src1, dst, srcFactor);
++
++ GenerateBlendFactor<true, false>((SWR_BLEND_FACTOR)pState->destBlendFactor, constColor, src, src1, dst, dstFactor);
++ GenerateBlendFactor<false, true>((SWR_BLEND_FACTOR)pState->destAlphaBlendFactor, constColor, src, src1, dst, dstFactor);
++
++ BlendFunc<true, false>((SWR_BLEND_OP)pState->colorBlendFunc, src, srcFactor, dst, dstFactor, result);
++ BlendFunc<false, true>((SWR_BLEND_OP)pState->alphaBlendFunc, src, srcFactor, dst, dstFactor, result);
++ }
++ else
++ {
++ GenerateBlendFactor<true, true>((SWR_BLEND_FACTOR)pState->sourceBlendFactor, constColor, src, src1, dst, srcFactor);
++ GenerateBlendFactor<true, true>((SWR_BLEND_FACTOR)pState->destBlendFactor, constColor, src, src1, dst, dstFactor);
++
++ BlendFunc<true, true>((SWR_BLEND_OP)pState->colorBlendFunc, src, srcFactor, dst, dstFactor, result);
++ }
++}
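++
++// Illustrative instantiation (hypothetical values, not code from this patch): blending
++// float source colors against a UNORM render target would resolve the templates as
++//
++// simdvector blended;
++// Blend<SWR_TYPE_FLOAT, SWR_TYPE_UNORM>(pBlendState, pRTBlendState, src, src1,
++// pDstHotTile, blended);
++//
++// where pRTBlendState is the per-render-target blend state and pDstHotTile points at
++// the SOA hot-tile data being blended against.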
+diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.cpp b/src/gallium/drivers/swr/rasterizer/core/clip.cpp
+new file mode 100644
+index 0000000..ce27bf7
+--- /dev/null
++++ b/src/gallium/drivers/swr/rasterizer/core/clip.cpp
+@@ -0,0 +1,201 @@
++/****************************************************************************
++* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
++*
++* Permission is hereby granted, free of charge, to any person obtaining a
++* copy of this software and associated documentation files (the "Software"),
++* to deal in the Software without restriction, including without limitation
++* the rights to use, copy, modify, merge, publish, distribute, sublicense,
++* and/or sell copies of the Software, and to permit persons to whom the
++* Software is furnished to do so, subject to the following conditions:
++*
++* The above copyright notice and this permission notice (including the next
++* paragraph) shall be included in all copies or substantial portions of the
++* Software.
++*
++* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
++* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++* IN THE SOFTWARE.
++*
++* @file clip.cpp
++*
++* @brief Implementation for clipping
++*
++******************************************************************************/
++
++#include
++
++#include "common/os.h"
++#include "core/clip.h"
++
++float ComputeInterpFactor(float boundaryCoord0, float boundaryCoord1)
++{
++ return (boundaryCoord0 / (boundaryCoord0 - boundaryCoord1));
++}
++
++template<SWR_CLIPCODES ClippingPlane>
++inline void intersect(
++ int s, // index to first edge vertex v0 in pInPts.
++ int p, // index to second edge vertex v1 in pInPts.
++ const float *pInPts, // array of all the input positions.
++ const float *pInAttribs, // array of all attributes for all vertices. All the attributes for each vertex are contiguous.
++ int numInAttribs, // number of attributes per vertex.
++ int i, // output index.
++ float *pOutPts, // array of output positions. We'll write our new intersection point at i*4.
++ float *pOutAttribs) // array of output attributes. We'll write our new attributes at i*numInAttribs.
++{
++ float t;
++
++ // Find the parameter of the intersection.
++ // t = (v1.w - v1.x) / ((v2.x - v1.x) - (v2.w - v1.w)) for x = w (RIGHT) plane, etc.
++ const float *v1 = &pInPts[s*4];
++ const float *v2 = &pInPts[p*4];
++
++ switch (ClippingPlane)
++ {
++ case FRUSTUM_LEFT: t = ComputeInterpFactor(v1[3] + v1[0], v2[3] + v2[0]); break;
++ case FRUSTUM_RIGHT: t = ComputeInterpFactor(v1[3] - v1[0], v2[3] - v2[0]); break;
++ case FRUSTUM_TOP: t = ComputeInterpFactor(v1[3] + v1[1], v2[3] + v2[1]); break;
++ case FRUSTUM_BOTTOM: t = ComputeInterpFactor(v1[3] - v1[1], v2[3] - v2[1]); break;
++ case FRUSTUM_NEAR: t = ComputeInterpFactor(v1[2], v2[2]); break;
++ case FRUSTUM_FAR: t = ComputeInterpFactor(v1[3] - v1[2], v2[3] - v2[2]); break;
++ default: SWR_ASSERT(false, "invalid clipping plane: %d", ClippingPlane);
++ };
++
++
++ const float *a1 = &pInAttribs[s*numInAttribs];
++ const float *a2 = &pInAttribs[p*numInAttribs];
++
++ float *pOutP = &pOutPts[i*4];
++ float *pOutA = &pOutAttribs[i*numInAttribs];
++
++ // Interpolate new position.
++ for(int j = 0; j < 4; ++j)
++ {
++ pOutP[j] = v1[j] + (v2[j]-v1[j])*t;
++ }
++
++ // Interpolate Attributes
++ for(int attr = 0; attr < numInAttribs; ++attr)
++ {
++ pOutA[attr] = a1[attr] + (a2[attr]-a1[attr])*t;
++ }
++}
++
++
++// Checks whether vertex v lies inside clipping plane
++// in homogenous coords check -w < {x,y,z} < w;
++//
++template<SWR_CLIPCODES ClippingPlane>
++inline int inside(const float v[4])
++{
++ switch (ClippingPlane)
++ {
++ case FRUSTUM_LEFT : return (v[0]>=-v[3]);
++ case FRUSTUM_RIGHT : return (v[0]<= v[3]);
++ case FRUSTUM_TOP : return (v[1]>=-v[3]);
++ case FRUSTUM_BOTTOM : return (v[1]<= v[3]);
++ case FRUSTUM_NEAR : return (v[2]>=0.0f);
++ case FRUSTUM_FAR : return (v[2]<= v[3]);
++ default:
++ SWR_ASSERT(false, "invalid clipping plane: %d", ClippingPlane);
++ return 0;
++ }
++}
++
++
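++// The routine below is one Sutherland-Hodgman clipping pass: it walks the polygon's
++// edges against a single frustum plane, emitting the intersection point when an edge
++// crosses the plane and the second endpoint when it lies inside. Clipping against a
++// convex plane adds at most one vertex per pass, which is what lets Clip() further
++// down bound its scratch storage.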
++// Clips a polygon in homogenous coordinates to a particular clipping plane.
++// Takes in vertices of the polygon (InPts) and the clipping plane
++// Puts the vertices of the clipped polygon in OutPts
++// Returns number of points in clipped polygon
++//
++template<SWR_CLIPCODES ClippingPlane>
++int ClipTriToPlane( const float *pInPts, int numInPts,
++ const float *pInAttribs, int numInAttribs,
++ float *pOutPts, float *pOutAttribs)
++{
++ int i=0; // index of the next output vertex; # of vertices in OutPts = i
++
++ for (int j = 0; j < numInPts; ++j)
++ {
++ int s = j;
++ int p = (j + 1) % numInPts;
++
++ int s_in = inside<ClippingPlane>(&pInPts[s*4]);
++ int p_in = inside<ClippingPlane>(&pInPts[p*4]);
++
++ // test if vertex is to be added to output vertices
++ if (s_in != p_in) // edge crosses clipping plane
++ {
++ // find point of intersection
++ intersect<ClippingPlane>(s, p, pInPts, pInAttribs, numInAttribs, i, pOutPts, pOutAttribs);
++ i++;
++ }
++ if (p_in) // 2nd vertex is inside clipping volume, add it to output
++ {
++ // Copy 2nd vertex position of edge over to output.
++ for(int k = 0; k < 4; ++k)
++ {
++ pOutPts[i*4 + k] = pInPts[p*4 + k];
++ }
++ // Copy 2nd vertex attributes of edge over to output.
++ for(int attr = 0; attr < numInAttribs; ++attr)
++ {
++ pOutAttribs[i*numInAttribs+attr] = pInAttribs[p*numInAttribs+attr];
++ }
++ i++;
++ }
++ // edge does not cross clipping plane and vertex outside clipping volume
++ // => do not add vertex
++ }
++ return i;
++}
++
++
++
++void Clip(const float *pTriangle, const float *pAttribs, int numAttribs, float *pOutTriangles, int *numVerts, float *pOutAttribs)
++{
++ // temp storage to hold at least 6 sets of vertices, the max number that can be created during clipping
++ OSALIGN(float, 16) tempPts[6 * 4];
++ OSALIGN(float, 16) tempAttribs[6 * KNOB_NUM_ATTRIBUTES * 4];
++
++ // we opt to clip to viewport frustum to produce smaller triangles for rasterization precision
++ int NumOutPts = ClipTriToPlane<FRUSTUM_NEAR>(pTriangle, 3, pAttribs, numAttribs, tempPts, tempAttribs);
++ NumOutPts = ClipTriToPlane<FRUSTUM_FAR>(tempPts, NumOutPts, tempAttribs, numAttribs, pOutTriangles, pOutAttribs);
++ NumOutPts = ClipTriToPlane<FRUSTUM_LEFT>(pOutTriangles, NumOutPts, pOutAttribs, numAttribs, tempPts, tempAttribs);
++ NumOutPts = ClipTriToPlane<FRUSTUM_RIGHT>(tempPts, NumOutPts, tempAttribs, numAttribs, pOutTriangles, pOutAttribs);
++ NumOutPts = ClipTriToPlane<FRUSTUM_TOP>(pOutTriangles, NumOutPts, pOutAttribs, numAttribs, tempPts, tempAttribs);
++ NumOutPts = ClipTriToPlane<FRUSTUM_BOTTOM>(tempPts, NumOutPts, tempAttribs, numAttribs, pOutTriangles, pOutAttribs);
++
++ SWR_ASSERT(NumOutPts <= 6);
++
++ *numVerts = NumOutPts;
++ return;
++}
++
++void ClipTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId)
++{
++ RDTSC_START(FEClipTriangles);
++ Clipper<3> clipper(workerId, pDC);
++ clipper.ExecuteStage(pa, prims, primMask, primId);
++ RDTSC_STOP(FEClipTriangles, 1, 0);
++}
++
++void ClipLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId)
++{
++ RDTSC_START(FEClipLines);
++ Clipper<2> clipper(workerId, pDC);
++ clipper.ExecuteStage(pa, prims, primMask, primId);
++ RDTSC_STOP(FEClipLines, 1, 0);
++}
++void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId)
++{
++ RDTSC_START(FEClipPoints);
++ Clipper<1> clipper(workerId, pDC);
++ clipper.ExecuteStage(pa, prims, primMask, primId);
++ RDTSC_STOP(FEClipPoints, 1, 0);
++}
++
+diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.h b/src/gallium/drivers/swr/rasterizer/core/clip.h
+new file mode 100644
+index 0000000..e9ba71d
+--- /dev/null ++++ b/src/gallium/drivers/swr/rasterizer/core/clip.h +@@ -0,0 +1,851 @@ ++/**************************************************************************** ++* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. ++* ++* Permission is hereby granted, free of charge, to any person obtaining a ++* copy of this software and associated documentation files (the "Software"), ++* to deal in the Software without restriction, including without limitation ++* the rights to use, copy, modify, merge, publish, distribute, sublicense, ++* and/or sell copies of the Software, and to permit persons to whom the ++* Software is furnished to do so, subject to the following conditions: ++* ++* The above copyright notice and this permission notice (including the next ++* paragraph) shall be included in all copies or substantial portions of the ++* Software. ++* ++* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++* IN THE SOFTWARE. ++* ++* @file clip.h ++* ++* @brief Definitions for clipping ++* ++******************************************************************************/ ++#pragma once ++ ++#include "common/simdintrin.h" ++#include "core/context.h" ++#include "core/pa.h" ++#include "rdtsc_core.h" ++ ++enum SWR_CLIPCODES ++{ ++ // Shift clip codes out of the mantissa to prevent denormalized values when used in float compare. ++ // Guardband is able to use a single high-bit with 4 separate LSBs, because it computes a union, rather than intersection, of clipcodes. 
++#define CLIPCODE_SHIFT 23 ++ FRUSTUM_LEFT = (0x01 << CLIPCODE_SHIFT), ++ FRUSTUM_TOP = (0x02 << CLIPCODE_SHIFT), ++ FRUSTUM_RIGHT = (0x04 << CLIPCODE_SHIFT), ++ FRUSTUM_BOTTOM = (0x08 << CLIPCODE_SHIFT), ++ ++ FRUSTUM_NEAR = (0x10 << CLIPCODE_SHIFT), ++ FRUSTUM_FAR = (0x20 << CLIPCODE_SHIFT), ++ ++ NEGW = (0x40 << CLIPCODE_SHIFT), ++ ++ GUARDBAND_LEFT = (0x80 << CLIPCODE_SHIFT | 0x1), ++ GUARDBAND_TOP = (0x80 << CLIPCODE_SHIFT | 0x2), ++ GUARDBAND_RIGHT = (0x80 << CLIPCODE_SHIFT | 0x4), ++ GUARDBAND_BOTTOM = (0x80 << CLIPCODE_SHIFT | 0x8) ++}; ++ ++#define FRUSTUM_CLIP_MASK (FRUSTUM_LEFT|FRUSTUM_TOP|FRUSTUM_RIGHT|FRUSTUM_BOTTOM|FRUSTUM_NEAR|FRUSTUM_FAR) ++#define GUARDBAND_CLIP_MASK (FRUSTUM_NEAR|FRUSTUM_FAR|GUARDBAND_LEFT|GUARDBAND_TOP|GUARDBAND_RIGHT|GUARDBAND_BOTTOM|NEGW) ++ ++void Clip(const float *pTriangle, const float *pAttribs, int numAttribs, float *pOutTriangles, ++ int *numVerts, float *pOutAttribs); ++ ++INLINE ++void ComputeClipCodes(DRIVER_TYPE type, const API_STATE& state, const simdvector& vertex, simdscalar& clipCodes) ++{ ++ clipCodes = _simd_setzero_ps(); ++ ++ // -w ++ simdscalar vNegW = _simd_mul_ps(vertex.w, _simd_set1_ps(-1.0f)); ++ ++ // FRUSTUM_LEFT ++ simdscalar vRes = _simd_cmplt_ps(vertex.x, vNegW); ++ clipCodes = _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_LEFT))); ++ ++ // FRUSTUM_TOP ++ vRes = _simd_cmplt_ps(vertex.y, vNegW); ++ clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_TOP)))); ++ ++ // FRUSTUM_RIGHT ++ vRes = _simd_cmpgt_ps(vertex.x, vertex.w); ++ clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_RIGHT)))); ++ ++ // FRUSTUM_BOTTOM ++ vRes = _simd_cmpgt_ps(vertex.y, vertex.w); ++ clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_BOTTOM)))); ++ ++ if (state.rastState.depthClipEnable) ++ { ++ // FRUSTUM_NEAR ++ // DX clips depth [0..w], GL clips [-w..w] ++ if (type == DX) ++ { ++ vRes = _simd_cmplt_ps(vertex.z, _simd_setzero_ps()); ++ } ++ else ++ { ++ vRes = _simd_cmplt_ps(vertex.z, vNegW); ++ } ++ clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_NEAR)))); ++ ++ // FRUSTUM_FAR ++ vRes = _simd_cmpgt_ps(vertex.z, vertex.w); ++ clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_FAR)))); ++ } ++ ++ // NEGW ++ vRes = _simd_cmple_ps(vertex.w, _simd_setzero_ps()); ++ clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(NEGW)))); ++ ++ // GUARDBAND_LEFT ++ simdscalar gbMult = _simd_mul_ps(vNegW, _simd_set1_ps(state.gbState.left)); ++ vRes = _simd_cmplt_ps(vertex.x, gbMult); ++ clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_LEFT)))); ++ ++ // GUARDBAND_TOP ++ gbMult = _simd_mul_ps(vNegW, _simd_set1_ps(state.gbState.top)); ++ vRes = _simd_cmplt_ps(vertex.y, gbMult); ++ clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_TOP)))); ++ ++ // GUARDBAND_RIGHT ++ gbMult = _simd_mul_ps(vertex.w, _simd_set1_ps(state.gbState.right)); ++ vRes = _simd_cmpgt_ps(vertex.x, gbMult); ++ clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_RIGHT)))); ++ ++ // GUARDBAND_BOTTOM ++ gbMult = _simd_mul_ps(vertex.w, _simd_set1_ps(state.gbState.bottom)); ++ vRes = _simd_cmpgt_ps(vertex.y, gbMult); ++ clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, 
_simd_castsi_ps(_simd_set1_epi32(GUARDBAND_BOTTOM))));
++}
++
++template<uint32_t NumVertsPerPrim>
++class Clipper
++{
++public:
++ Clipper(uint32_t in_workerId, DRAW_CONTEXT* in_pDC) :
++ workerId(in_workerId), driverType(in_pDC->pContext->driverType), pDC(in_pDC), state(GetApiState(in_pDC))
++ {
++ static_assert(NumVertsPerPrim >= 1 && NumVertsPerPrim <= 3, "Invalid NumVertsPerPrim");
++ }
++
++ void ComputeClipCodes(simdvector vertex[])
++ {
++ for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
++ {
++ ::ComputeClipCodes(this->driverType, this->state, vertex[i], this->clipCodes[i]);
++ }
++ }
++
++ simdscalar ComputeClipCodeIntersection()
++ {
++ simdscalar result = this->clipCodes[0];
++ for (uint32_t i = 1; i < NumVertsPerPrim; ++i)
++ {
++ result = _simd_and_ps(result, this->clipCodes[i]);
++ }
++ return result;
++ }
++
++ simdscalar ComputeClipCodeUnion()
++ {
++ simdscalar result = this->clipCodes[0];
++ for (uint32_t i = 1; i < NumVertsPerPrim; ++i)
++ {
++ result = _simd_or_ps(result, this->clipCodes[i]);
++ }
++ return result;
++ }
++
++ int ComputeNegWMask()
++ {
++ simdscalar clipCodeUnion = ComputeClipCodeUnion();
++ clipCodeUnion = _simd_and_ps(clipCodeUnion, _simd_castsi_ps(_simd_set1_epi32(NEGW)));
++ return _simd_movemask_ps(_simd_cmpneq_ps(clipCodeUnion, _simd_setzero_ps()));
++ }
++
++ int ComputeClipMask()
++ {
++ simdscalar clipUnion = ComputeClipCodeUnion();
++ clipUnion = _simd_and_ps(clipUnion, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_CLIP_MASK)));
++ return _simd_movemask_ps(_simd_cmpneq_ps(clipUnion, _simd_setzero_ps()));
++ }
++
++ // clipper is responsible for culling any prims with NAN coordinates
++ int ComputeNaNMask(simdvector prim[])
++ {
++ simdscalar vNanMask = _simd_setzero_ps();
++ for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
++ {
++ simdscalar vNan01 = _simd_cmp_ps(prim[e].v[0], prim[e].v[1], _CMP_UNORD_Q);
++ vNanMask = _simd_or_ps(vNanMask, vNan01);
++ simdscalar vNan23 = _simd_cmp_ps(prim[e].v[2], prim[e].v[3], _CMP_UNORD_Q);
++ vNanMask = _simd_or_ps(vNanMask, vNan23);
++ }
++
++ return _simd_movemask_ps(vNanMask);
++ }
++
++ int ComputeUserClipCullMask(PA_STATE& pa, simdvector prim[])
++ {
++ uint8_t cullMask = this->state.rastState.cullDistanceMask;
++ simdscalar vClipCullMask = _simd_setzero_ps();
++ DWORD index;
++
++ simdvector vClipCullDistLo[3];
++ simdvector vClipCullDistHi[3];
++
++ pa.Assemble(VERTEX_CLIPCULL_DIST_LO_SLOT, vClipCullDistLo);
++ pa.Assemble(VERTEX_CLIPCULL_DIST_HI_SLOT, vClipCullDistHi);
++ while (_BitScanForward(&index, cullMask))
++ {
++ cullMask &= ~(1 << index);
++ uint32_t slot = index >> 2;
++ uint32_t component = index & 0x3;
++
++ simdscalar vCullMaskElem = _simd_set1_ps(-1.0f);
++ for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
++ {
++ simdscalar vCullComp;
++ if (slot == 0)
++ {
++ vCullComp = vClipCullDistLo[e][component];
++ }
++ else
++ {
++ vCullComp = vClipCullDistHi[e][component];
++ }
++
++ // cull if cull distance < 0 || NAN
++ simdscalar vCull = _simd_cmp_ps(_mm256_setzero_ps(), vCullComp, _CMP_NLE_UQ);
++ vCullMaskElem = _simd_and_ps(vCullMaskElem, vCull);
++ }
++ vClipCullMask = _simd_or_ps(vClipCullMask, vCullMaskElem);
++ }
++
++ // clipper should also discard any primitive with NAN clip distance
++ uint8_t clipMask = this->state.rastState.clipDistanceMask;
++ while (_BitScanForward(&index, clipMask))
++ {
++ clipMask &= ~(1 << index);
++ uint32_t slot = index >> 2;
++ uint32_t component = index & 0x3;
++
++ for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
++ {
++ simdscalar vClipComp;
++ if (slot == 0)
++ {
++ vClipComp = 
vClipCullDistLo[e][component]; ++ } ++ else ++ { ++ vClipComp = vClipCullDistHi[e][component]; ++ } ++ ++ simdscalar vClip = _simd_cmp_ps(vClipComp, vClipComp, _CMP_UNORD_Q); ++ vClipCullMask = _simd_or_ps(vClipCullMask, vClip); ++ } ++ } ++ ++ return _simd_movemask_ps(vClipCullMask); ++ } ++ ++ // clip a single primitive ++ int ClipScalar(PA_STATE& pa, uint32_t primIndex, float* pOutPos, float* pOutAttribs) ++ { ++ OSALIGN(float, 16) inVerts[3 * 4]; ++ OSALIGN(float, 16) inAttribs[3 * KNOB_NUM_ATTRIBUTES * 4]; ++ ++ // transpose primitive position ++ __m128 verts[3]; ++ pa.AssembleSingle(VERTEX_POSITION_SLOT, primIndex, verts); ++ _mm_store_ps(&inVerts[0], verts[0]); ++ _mm_store_ps(&inVerts[4], verts[1]); ++ _mm_store_ps(&inVerts[8], verts[2]); ++ ++ // transpose attribs ++ uint32_t numScalarAttribs = this->state.linkageCount * 4; ++ ++ int idx = 0; ++ DWORD slot = 0; ++ uint32_t mapIdx = 0; ++ uint32_t tmpLinkage = uint32_t(this->state.linkageMask); ++ while (_BitScanForward(&slot, tmpLinkage)) ++ { ++ tmpLinkage &= ~(1 << slot); ++ // Compute absolute attrib slot in vertex array ++ uint32_t inputSlot = VERTEX_ATTRIB_START_SLOT + this->state.linkageMap[mapIdx++]; ++ __m128 attrib[3]; // triangle attribs (always 4 wide) ++ pa.AssembleSingle(inputSlot, primIndex, attrib); ++ _mm_store_ps(&inAttribs[idx], attrib[0]); ++ _mm_store_ps(&inAttribs[idx + numScalarAttribs], attrib[1]); ++ _mm_store_ps(&inAttribs[idx + numScalarAttribs * 2], attrib[2]); ++ idx += 4; ++ } ++ ++ int numVerts; ++ Clip(inVerts, inAttribs, numScalarAttribs, pOutPos, &numVerts, pOutAttribs); ++ ++ return numVerts; ++ } ++ ++ // clip SIMD primitives ++ void ClipSimd(const simdscalar& vPrimMask, const simdscalar& vClipMask, PA_STATE& pa, const simdscalari& vPrimId) ++ { ++ // input/output vertex store for clipper ++ simdvertex vertices[7]; // maximum 7 verts generated per triangle ++ ++ // assemble pos ++ simdvector tmpVector[NumVertsPerPrim]; ++ pa.Assemble(VERTEX_POSITION_SLOT, tmpVector); ++ for (uint32_t i = 0; i < NumVertsPerPrim; ++i) ++ { ++ vertices[i].attrib[VERTEX_POSITION_SLOT] = tmpVector[i]; ++ } ++ ++ // assemble attribs ++ DWORD slot = 0; ++ uint32_t mapIdx = 0; ++ uint32_t tmpLinkage = this->state.linkageMask; ++ while (_BitScanForward(&slot, tmpLinkage)) ++ { ++ tmpLinkage &= ~(1 << slot); ++ // Compute absolute attrib slot in vertex array ++ uint32_t inputSlot = VERTEX_ATTRIB_START_SLOT + this->state.linkageMap[mapIdx++]; ++ ++ pa.Assemble(inputSlot, tmpVector); ++ for (uint32_t i = 0; i < NumVertsPerPrim; ++i) ++ { ++ vertices[i].attrib[inputSlot] = tmpVector[i]; ++ } ++ } ++ ++ uint32_t numAttribs; ++ if (_BitScanReverse((DWORD*)&numAttribs, this->state.linkageMask)) ++ { ++ numAttribs++; ++ } ++ else ++ { ++ numAttribs = 0; ++ } ++ ++ simdscalari vNumClippedVerts = ClipPrims((float*)&vertices[0], vPrimMask, vClipMask, numAttribs); ++ ++ // set up new PA for binning clipped primitives ++ PFN_PROCESS_PRIMS pfnBinFunc = nullptr; ++ PRIMITIVE_TOPOLOGY clipTopology = TOP_UNKNOWN; ++ if (NumVertsPerPrim == 3) ++ { ++ pfnBinFunc = BinTriangles; ++ clipTopology = TOP_TRIANGLE_FAN; ++ ++ // so that the binner knows to bloat wide points later ++ if (pa.binTopology == TOP_POINT_LIST) ++ clipTopology = TOP_POINT_LIST; ++ } ++ else if (NumVertsPerPrim == 2) ++ { ++ pfnBinFunc = BinLines; ++ clipTopology = TOP_LINE_LIST; ++ } ++ else ++ { ++ SWR_ASSERT(0 && "Unexpected points in clipper."); ++ } ++ ++ ++ uint32_t* pVertexCount = (uint32_t*)&vNumClippedVerts; ++ uint32_t* pPrimitiveId = (uint32_t*)&vPrimId; ++ ++ 
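// The clipper wrote its output vertices in AOS order within each SIMD lane; the masked
++ // gathers below walk one float per lane, a full simdvertex apart, to pull lane
++ // inputPrim's vertices back into SOA order for reassembly.
++ 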
const simdscalari vOffsets = _mm256_set_epi32(
++ 0 * sizeof(simdvertex), // unused lane
++ 6 * sizeof(simdvertex),
++ 5 * sizeof(simdvertex),
++ 4 * sizeof(simdvertex),
++ 3 * sizeof(simdvertex),
++ 2 * sizeof(simdvertex),
++ 1 * sizeof(simdvertex),
++ 0 * sizeof(simdvertex));
++
++ // only need to gather 7 verts
++ // @todo dynamic mask based on actual # of verts generated per lane
++ const simdscalar vMask = _mm256_set_ps(0, -1, -1, -1, -1, -1, -1, -1);
++
++ uint32_t numClippedPrims = 0;
++ for (uint32_t inputPrim = 0; inputPrim < pa.NumPrims(); ++inputPrim)
++ {
++ uint32_t numEmittedVerts = pVertexCount[inputPrim];
++ if (numEmittedVerts < NumVertsPerPrim)
++ {
++ continue;
++ }
++ SWR_ASSERT(numEmittedVerts <= 7, "Unexpected vertex count from clipper.");
++
++ uint32_t numEmittedPrims = GetNumPrims(clipTopology, numEmittedVerts);
++ numClippedPrims += numEmittedPrims;
++
++ // transpose clipper output so that each lane's vertices are in SIMD order
++ // set aside space for 2 vertices, as the PA will try to read up to 16 verts
++ // for triangle fan
++ simdvertex transposedPrims[2];
++
++ // transpose pos
++ uint8_t* pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_POSITION_SLOT]) + sizeof(float) * inputPrim;
++ for (uint32_t c = 0; c < 4; ++c)
++ {
++ transposedPrims[0].attrib[VERTEX_POSITION_SLOT][c] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, vMask, 1);
++ pBase += sizeof(simdscalar);
++ }
++
++ // transpose attribs
++ pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_ATTRIB_START_SLOT]) + sizeof(float) * inputPrim;
++ for (uint32_t attrib = 0; attrib < numAttribs; ++attrib)
++ {
++ uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + attrib;
++ for (uint32_t c = 0; c < 4; ++c)
++ {
++ transposedPrims[0].attrib[attribSlot][c] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, vMask, 1);
++ pBase += sizeof(simdscalar);
++ }
++ }
++
++ PA_STATE_OPT clipPa(this->pDC, numEmittedPrims, (uint8_t*)&transposedPrims[0], numEmittedVerts, true, clipTopology);
++
++ while (clipPa.GetNextStreamOutput())
++ {
++ do
++ {
++ simdvector attrib[NumVertsPerPrim];
++ bool assemble = clipPa.Assemble(VERTEX_POSITION_SLOT, attrib);
++ if (assemble)
++ {
++ static const uint32_t primMaskMap[] = { 0x0, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f, 0xff };
++ pfnBinFunc(this->pDC, clipPa, this->workerId, attrib, primMaskMap[numEmittedPrims], _simd_set1_epi32(pPrimitiveId[inputPrim]));
++ }
++ } while (clipPa.NextPrim());
++ }
++ }
++
++ // update global pipeline stat
++ SWR_CONTEXT* pContext = this->pDC->pContext;
++ UPDATE_STAT(CPrimitives, numClippedPrims);
++ }
++
++ // execute the clipper stage
++ void ExecuteStage(PA_STATE& pa, simdvector prim[], uint32_t primMask, simdscalari primId)
++ {
++ // set up binner based on PA state
++ PFN_PROCESS_PRIMS pfnBinner;
++ switch (pa.binTopology)
++ {
++ case TOP_POINT_LIST:
++ pfnBinner = CanUseSimplePoints(pDC) ? 
BinPoints : BinTriangles; ++ break; ++ case TOP_LINE_LIST: ++ case TOP_LINE_STRIP: ++ case TOP_LINE_LOOP: ++ case TOP_LINE_LIST_ADJ: ++ case TOP_LISTSTRIP_ADJ: ++ pfnBinner = BinLines; ++ break; ++ default: ++ pfnBinner = BinTriangles; ++ break; ++ }; ++ ++ // update clipper invocations pipeline stat ++ SWR_CONTEXT* pContext = this->pDC->pContext; ++ uint32_t numInvoc = _mm_popcnt_u32(primMask); ++ UPDATE_STAT(CInvocations, numInvoc); ++ ++ ComputeClipCodes(prim); ++ ++ // cull prims with NAN coords ++ primMask &= ~ComputeNaNMask(prim); ++ ++ // user cull distance cull ++ if (this->state.rastState.cullDistanceMask) ++ { ++ primMask &= ~ComputeUserClipCullMask(pa, prim); ++ } ++ ++ // cull prims outside view frustum ++ simdscalar clipIntersection = ComputeClipCodeIntersection(); ++ int validMask = primMask & _simd_movemask_ps(_simd_cmpeq_ps(clipIntersection, _simd_setzero_ps())); ++ ++ // skip clipping for points ++ uint32_t clipMask = 0; ++ if (NumVertsPerPrim != 1) ++ { ++ clipMask = primMask & ComputeClipMask(); ++ } ++ ++ if (clipMask) ++ { ++ RDTSC_START(FEGuardbandClip); ++ // we have to clip tris, execute the clipper, which will also ++ // call the binner ++ ClipSimd(vMask(primMask), vMask(clipMask), pa, primId); ++ RDTSC_STOP(FEGuardbandClip, 1, 0); ++ } ++ else if (validMask) ++ { ++ // update CPrimitives pipeline state ++ SWR_CONTEXT* pContext = this->pDC->pContext; ++ UPDATE_STAT(CPrimitives, _mm_popcnt_u32(validMask)); ++ ++ // forward valid prims directly to binner ++ pfnBinner(this->pDC, pa, this->workerId, prim, validMask, primId); ++ } ++ } ++ ++private: ++ inline simdscalar ComputeInterpFactor(simdscalar boundaryCoord0, simdscalar boundaryCoord1) ++ { ++ return _simd_div_ps(boundaryCoord0, _simd_sub_ps(boundaryCoord0, boundaryCoord1)); ++ } ++ ++ inline simdscalari ComputeOffsets(uint32_t attrib, simdscalari vIndices, uint32_t component) ++ { ++ const uint32_t simdVertexStride = sizeof(simdvertex); ++ const uint32_t componentStride = sizeof(simdscalar); ++ const uint32_t attribStride = sizeof(simdvector); ++ const __m256i vElemOffset = _mm256_set_epi32(7 * sizeof(float), 6 * sizeof(float), 5 * sizeof(float), 4 * sizeof(float), ++ 3 * sizeof(float), 2 * sizeof(float), 1 * sizeof(float), 0 * sizeof(float)); ++ ++ // step to the simdvertex ++ simdscalari vOffsets = _simd_mullo_epi32(vIndices, _simd_set1_epi32(simdVertexStride)); ++ ++ // step to the attribute and component ++ vOffsets = _simd_add_epi32(vOffsets, _simd_set1_epi32(attribStride * attrib + componentStride * component)); ++ ++ // step to the lane ++ vOffsets = _simd_add_epi32(vOffsets, vElemOffset); ++ ++ return vOffsets; ++ } ++ ++ // gathers a single component for a given attribute for each SIMD lane ++ inline simdscalar GatherComponent(const float* pBuffer, uint32_t attrib, simdscalar vMask, simdscalari vIndices, uint32_t component) ++ { ++ simdscalari vOffsets = ComputeOffsets(attrib, vIndices, component); ++ simdscalar vSrc = _mm256_undefined_ps(); ++ return _simd_mask_i32gather_ps(vSrc, pBuffer, vOffsets, vMask, 1); ++ } ++ ++ inline void ScatterComponent(const float* pBuffer, uint32_t attrib, simdscalar vMask, simdscalari vIndices, uint32_t component, simdscalar vSrc) ++ { ++ simdscalari vOffsets = ComputeOffsets(attrib, vIndices, component); ++ ++ uint32_t* pOffsets = (uint32_t*)&vOffsets; ++ float* pSrc = (float*)&vSrc; ++ uint32_t mask = _simd_movemask_ps(vMask); ++ DWORD lane; ++ while (_BitScanForward(&lane, mask)) ++ { ++ mask &= ~(1 << lane); ++ uint8_t* pBuf = (uint8_t*)pBuffer + pOffsets[lane]; ++ 
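// AVX/AVX2 provide gathers but no scatter instruction, so each active lane
++ // is written back with a scalar store below.
++ 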
*(float*)pBuf = pSrc[lane];
++ }
++ }
++
++ template<SWR_CLIPCODES ClippingPlane>
++ inline void intersect(
++ const simdscalar& vActiveMask, // active lanes to operate on
++ const simdscalari& s, // index to first edge vertex v0 in pInPts.
++ const simdscalari& p, // index to second edge vertex v1 in pInPts.
++ const simdvector& v1, // vertex 0 position
++ const simdvector& v2, // vertex 1 position
++ simdscalari& outIndex, // output index.
++ const float *pInVerts, // array of all the input positions.
++ uint32_t numInAttribs, // number of attributes per vertex.
++ float *pOutVerts) // array of output positions. We'll write our new intersection point at i*4.
++ {
++ // compute interpolation factor
++ simdscalar t;
++ switch (ClippingPlane)
++ {
++ case FRUSTUM_LEFT: t = ComputeInterpFactor(_simd_add_ps(v1[3], v1[0]), _simd_add_ps(v2[3], v2[0])); break;
++ case FRUSTUM_RIGHT: t = ComputeInterpFactor(_simd_sub_ps(v1[3], v1[0]), _simd_sub_ps(v2[3], v2[0])); break;
++ case FRUSTUM_TOP: t = ComputeInterpFactor(_simd_add_ps(v1[3], v1[1]), _simd_add_ps(v2[3], v2[1])); break;
++ case FRUSTUM_BOTTOM: t = ComputeInterpFactor(_simd_sub_ps(v1[3], v1[1]), _simd_sub_ps(v2[3], v2[1])); break;
++ case FRUSTUM_NEAR:
++ // DX Znear plane is 0, GL is -w
++ if (this->driverType == DX)
++ {
++ t = ComputeInterpFactor(v1[2], v2[2]);
++ }
++ else
++ {
++ t = ComputeInterpFactor(_simd_add_ps(v1[3], v1[2]), _simd_add_ps(v2[3], v2[2]));
++ }
++ break;
++ case FRUSTUM_FAR: t = ComputeInterpFactor(_simd_sub_ps(v1[3], v1[2]), _simd_sub_ps(v2[3], v2[2])); break;
++ default: SWR_ASSERT(false, "invalid clipping plane: %d", ClippingPlane);
++ };
++
++ // interpolate position and store
++ for (uint32_t c = 0; c < 4; ++c)
++ {
++ simdscalar vOutPos = _simd_fmadd_ps(_simd_sub_ps(v2[c], v1[c]), t, v1[c]);
++ ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, vActiveMask, outIndex, c, vOutPos);
++ }
++
++ // interpolate attributes and store
++ for (uint32_t a = 0; a < numInAttribs; ++a)
++ {
++ uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + a;
++ for (uint32_t c = 0; c < 4; ++c)
++ {
++ simdscalar vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
++ simdscalar vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
++ simdscalar vOutAttrib = _simd_fmadd_ps(_simd_sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
++ ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
++ }
++ }
++ }
++
++ template<SWR_CLIPCODES ClippingPlane>
++ inline simdscalar inside(const simdvector& v)
++ {
++ switch (ClippingPlane)
++ {
++ case FRUSTUM_LEFT: return _simd_cmpge_ps(v[0], _simd_mul_ps(v[3], _simd_set1_ps(-1.0f)));
++ case FRUSTUM_RIGHT: return _simd_cmple_ps(v[0], v[3]);
++ case FRUSTUM_TOP: return _simd_cmpge_ps(v[1], _simd_mul_ps(v[3], _simd_set1_ps(-1.0f)));
++ case FRUSTUM_BOTTOM: return _simd_cmple_ps(v[1], v[3]);
++ case FRUSTUM_NEAR: return _simd_cmpge_ps(v[2], this->driverType == DX ? _simd_setzero_ps() : _simd_mul_ps(v[3], _simd_set1_ps(-1.0f)));
++ case FRUSTUM_FAR: return _simd_cmple_ps(v[2], v[3]);
++ default:
++ SWR_ASSERT(false, "invalid clipping plane: %d", ClippingPlane);
++ return _simd_setzero_ps();
++ }
++ }
++
++ template<SWR_CLIPCODES ClippingPlane>
++ simdscalari ClipTriToPlane(const float* pInVerts, const simdscalari& vNumInPts, uint32_t numInAttribs, float* pOutVerts)
++ {
++ simdscalari vCurIndex = _simd_setzero_si();
++ simdscalari vOutIndex = _simd_setzero_si();
++ simdscalar vActiveMask = _simd_castsi_ps(_simd_cmplt_epi32(vCurIndex, vNumInPts));
++
++ while (!_simd_testz_ps(vActiveMask, vActiveMask)) // loop until activeMask is empty
++ {
++ simdscalari s = vCurIndex;
++ simdscalari p = _simd_add_epi32(s, _simd_set1_epi32(1));
++ simdscalari underFlowMask = _simd_cmpgt_epi32(vNumInPts, p);
++ p = _simd_castps_si(_simd_blendv_ps(_simd_setzero_ps(), _simd_castsi_ps(p), _simd_castsi_ps(underFlowMask)));
++
++ // gather position
++ simdvector vInPos0, vInPos1;
++ for (uint32_t c = 0; c < 4; ++c)
++ {
++ vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c);
++ vInPos1[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, p, c);
++ }
++
++ // compute inside mask
++ simdscalar s_in = inside<ClippingPlane>(vInPos0);
++ simdscalar p_in = inside<ClippingPlane>(vInPos1);
++
++ // compute intersection mask (s_in != p_in)
++ simdscalar intersectMask = _simd_xor_ps(s_in, p_in);
++ intersectMask = _simd_and_ps(intersectMask, vActiveMask);
++
++ // store s if inside
++ s_in = _simd_and_ps(s_in, vActiveMask);
++ if (!_simd_testz_ps(s_in, s_in))
++ {
++ // store position
++ for (uint32_t c = 0; c < 4; ++c)
++ {
++ ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]);
++ }
++
++ // store attribs
++ for (uint32_t a = 0; a < numInAttribs; ++a)
++ {
++ uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + a;
++ for (uint32_t c = 0; c < 4; ++c)
++ {
++ simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
++ ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
++ }
++ }
++
++ // increment outIndex
++ vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), s_in);
++ }
++
++ // compute and store intersection
++ if (!_simd_testz_ps(intersectMask, intersectMask))
++ {
++ intersect<ClippingPlane>(intersectMask, s, p, vInPos0, vInPos1, vOutIndex, pInVerts, numInAttribs, pOutVerts);
++
++ // increment outIndex for active lanes
++ vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), intersectMask);
++ }
++
++ // increment loop index and update active mask
++ vCurIndex = _simd_add_epi32(vCurIndex, _simd_set1_epi32(1));
++ vActiveMask = _simd_castsi_ps(_simd_cmplt_epi32(vCurIndex, vNumInPts));
++ }
++
++ return vOutIndex;
++ }
++
++ template<SWR_CLIPCODES ClippingPlane>
++ simdscalari ClipLineToPlane(const float* pInVerts, const simdscalari& vNumInPts, uint32_t numInAttribs, float* pOutVerts)
++ {
++ simdscalari vCurIndex = _simd_setzero_si();
++ simdscalari vOutIndex = _simd_setzero_si();
++ simdscalar vActiveMask = _simd_castsi_ps(_simd_cmplt_epi32(vCurIndex, vNumInPts));
++
++ if (!_simd_testz_ps(vActiveMask, vActiveMask))
++ {
++ simdscalari s = vCurIndex;
++ simdscalari p = _simd_add_epi32(s, _simd_set1_epi32(1));
++
++ // gather position
++ simdvector vInPos0, vInPos1;
++ for (uint32_t c = 0; c < 4; ++c)
++ {
++ vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c);
++ vInPos1[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, p, c);
++ }
++
++ // compute inside mask
++ simdscalar s_in = inside<ClippingPlane>(vInPos0);
++ simdscalar p_in = inside<ClippingPlane>(vInPos1);
++
++ // compute intersection mask (s_in != p_in)
++ simdscalar intersectMask = _simd_xor_ps(s_in, p_in);
++ intersectMask = _simd_and_ps(intersectMask, vActiveMask);
++
++ // store s if inside
++ s_in = _simd_and_ps(s_in, vActiveMask);
++ if (!_simd_testz_ps(s_in, s_in))
++ {
++ for (uint32_t c = 0; c < 4; ++c)
++ {
++ ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]);
++ }
++
++ // interpolate attributes and store
++ for (uint32_t a = 0; a < numInAttribs; ++a)
++ {
++ uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + a;
++ for (uint32_t c = 0; c < 4; ++c)
++ {
++ simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
++ ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
++ }
++ }
++
++ // increment outIndex
++ vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), s_in);
++ }
++
++ // compute and store intersection
++ if (!_simd_testz_ps(intersectMask, intersectMask))
++ {
++ intersect<ClippingPlane>(intersectMask, s, p, vInPos0, vInPos1, vOutIndex, pInVerts, numInAttribs, pOutVerts);
++
++ // increment outIndex for active lanes
++ vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), intersectMask);
++ }
++
++ // store p if inside
++ p_in = _simd_and_ps(p_in, vActiveMask);
++ if (!_simd_testz_ps(p_in, p_in))
++ {
++ for (uint32_t c = 0; c < 4; ++c)
++ {
++ ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, p_in, vOutIndex, c, vInPos1[c]);
++ }
++
++ // interpolate attributes and store
++ for (uint32_t a = 0; a < numInAttribs; ++a)
++ {
++ uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + a;
++ for (uint32_t c = 0; c < 4; ++c)
++ {
++ simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, p_in, p, c);
++ ScatterComponent(pOutVerts, attribSlot, p_in, vOutIndex, c, vAttrib);
++ }
++ }
++
++ // increment outIndex
++ vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), p_in);
++ }
++ }
++
++ return vOutIndex;
++ }
++
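++ // ClipPrims below chains these per-plane routines, ping-ponging between pVertices
++ // and a temp buffer: the near-plane pass reads pVertices into pTempVerts, the
++ // far-plane pass reads pTempVerts back into pVertices, and so on, so after six
++ // passes the final vertices land back in pVertices.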
++ //////////////////////////////////////////////////////////////////////////
++ /// @brief Vertical clipper. Clips SIMD primitives at a time
++ /// @param pVertices - pointer to vertices in SOA form. Clipper will read input and write results to this buffer
++ /// @param vPrimMask - mask of valid input primitives, including non-clipped prims
++ /// @param numAttribs - number of valid input attribs, including position
++ simdscalari ClipPrims(float* pVertices, const simdscalar& vPrimMask, const simdscalar& vClipMask, int numAttribs)
++ {
++ // temp storage
++ simdvertex tempVertices[7];
++ float* pTempVerts = (float*)&tempVertices[0];
++
++ // zero out num input verts for non-active lanes
++ simdscalari vNumInPts = _simd_set1_epi32(NumVertsPerPrim);
++ vNumInPts = _simd_blendv_epi32(_simd_setzero_si(), vNumInPts, vClipMask);
++
++ // clip prims to frustum
++ simdscalari vNumOutPts;
++ if (NumVertsPerPrim == 3)
++ {
++ vNumOutPts = ClipTriToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts);
++ vNumOutPts = ClipTriToPlane<FRUSTUM_FAR>(pTempVerts, vNumOutPts, numAttribs, pVertices);
++ vNumOutPts = ClipTriToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts);
++ vNumOutPts = ClipTriToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices);
++ vNumOutPts = ClipTriToPlane<FRUSTUM_TOP>(pVertices, vNumOutPts, numAttribs, pTempVerts);
++ vNumOutPts = ClipTriToPlane<FRUSTUM_BOTTOM>(pTempVerts, vNumOutPts, numAttribs, pVertices);
++ }
++ else
++ {
++ SWR_ASSERT(NumVertsPerPrim == 2);
++ vNumOutPts = ClipLineToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts);
++ vNumOutPts = ClipLineToPlane<FRUSTUM_FAR>(pTempVerts, vNumOutPts, numAttribs, pVertices);
++ vNumOutPts = ClipLineToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts);
++ vNumOutPts = ClipLineToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices);
++ vNumOutPts = ClipLineToPlane<FRUSTUM_TOP>(pVertices, vNumOutPts, numAttribs, pTempVerts);
++ vNumOutPts = ClipLineToPlane<FRUSTUM_BOTTOM>(pTempVerts, vNumOutPts, numAttribs, pVertices);
++ }
++
++ // restore num verts for non-clipped, active lanes
++ simdscalar vNonClippedMask = _simd_andnot_ps(vClipMask, vPrimMask);
++ vNumOutPts = _simd_blendv_epi32(vNumOutPts, _simd_set1_epi32(NumVertsPerPrim), vNonClippedMask);
++
++ return vNumOutPts;
++ }
++
++ const uint32_t workerId;
++ const DRIVER_TYPE driverType;
++ DRAW_CONTEXT* pDC;
++ const API_STATE& state;
++ simdscalar clipCodes[NumVertsPerPrim];
++};
++
++
++// pipeline stage functions
++void ClipTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId);
++void ClipLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId);
++void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId);
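++
++// Hypothetical call site (illustrative only): a frontend that has assembled a SIMD
++// batch of triangles hands it to the clipper stage with
++// ClipTriangles(pDC, pa, workerId, prims, primMask, primID);
++// each of these entry points wraps a Clipper<N> whose ExecuteStage() culls, clips and
++// forwards the surviving primitives to the binner.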
+diff --git a/src/gallium/drivers/swr/rasterizer/core/context.h b/src/gallium/drivers/swr/rasterizer/core/context.h
+new file mode 100644
+index 0000000..c719f27
+--- /dev/null
++++ b/src/gallium/drivers/swr/rasterizer/core/context.h
+@@ -0,0 +1,444 @@
++/****************************************************************************
++* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
++*
++* Permission is hereby granted, free of charge, to any person obtaining a
++* copy of this software and associated documentation files (the "Software"),
++* to deal in the Software without restriction, including without limitation
++* the rights to use, copy, modify, merge, publish, distribute, sublicense,
++* and/or sell copies of the Software, and to permit persons to whom the
++* Software is furnished to do so, subject to the following conditions:
++*
++* The above copyright notice and this permission notice (including the next
++* paragraph) shall be included in all copies or substantial portions of the
++* Software.
++*
++* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
++* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++* IN THE SOFTWARE.
++*
++* @file context.h
++*
++* @brief Definitions for SWR_CONTEXT and DRAW_CONTEXT
++* The SWR_CONTEXT is our global context and contains the DC ring,
++* thread state, etc.
++*
++* The DRAW_CONTEXT contains all state associated with a draw operation.
++*
++******************************************************************************/
++#pragma once
++
++#include <condition_variable>
++#include <mutex>
++
++#include "core/api.h"
++#include "core/utils.h"
++#include "core/arena.h"
++#include "core/fifo.hpp"
++#include "core/knobs.h"
++#include "common/simdintrin.h"
++#include "core/threads.h"
++
++// x.8 fixed point precision values
++#define FIXED_POINT_SHIFT 8
++#define FIXED_POINT_SCALE 256
++
++// x.16 fixed point precision values
++#define FIXED_POINT16_SHIFT 16
++#define FIXED_POINT16_SCALE 65536
++
++struct SWR_CONTEXT;
++struct DRAW_CONTEXT;
++
++struct TRI_FLAGS
++{
++ uint32_t frontFacing : 1;
++ uint32_t yMajor : 1;
++ uint32_t coverageMask : (SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM);
++ uint32_t reserved : 32 - 1 - 1 - (SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM);
++ uint32_t primID;
++ uint32_t renderTargetArrayIndex;
++};
++
++//////////////////////////////////////////////////////////////////////////
++/// SWR_TRIANGLE_DESC
++/////////////////////////////////////////////////////////////////////////
++struct SWR_TRIANGLE_DESC
++{
++ float I[3];
++ float J[3];
++ float Z[3];
++ float OneOverW[3];
++ float recipDet;
++
++ float *pAttribs;
++ float *pPerspAttribs;
++ float *pSamplePos;
++ float *pUserClipBuffer;
++
++ uint64_t coverageMask[SWR_MAX_NUM_MULTISAMPLES];
++
++ TRI_FLAGS triFlags;
++};
++
++struct TRIANGLE_WORK_DESC
++{
++ float *pTriBuffer;
++ float *pAttribs;
++ float *pUserClipBuffer;
++ uint32_t numAttribs;
++ TRI_FLAGS triFlags;
++};
++
++union CLEAR_FLAGS
++{
++ struct
++ {
++ uint32_t mask : 3;
++ };
++ uint32_t bits;
++};
++
++struct CLEAR_DESC
++{
++ CLEAR_FLAGS flags;
++ float clearRTColor[4]; // RGBA_32F
++ float clearDepth; // [0..1]
++ BYTE clearStencil;
++};
++
++struct INVALIDATE_TILES_DESC
++{
++ uint32_t attachmentMask;
++};
++
++struct SYNC_DESC
++{
++ PFN_CALLBACK_FUNC pfnCallbackFunc;
++ uint64_t userData;
++ uint64_t userData2;
++};
++
++struct QUERY_DESC
++{
++ SWR_STATS* pStats;
++};
++
++struct STORE_TILES_DESC
++{
++ SWR_RENDERTARGET_ATTACHMENT attachment;
++ SWR_TILE_STATE postStoreTileState;
++};
++
++struct COMPUTE_DESC
++{
++ uint32_t threadGroupCountX;
++ uint32_t 
threadGroupCountY; ++ uint32_t threadGroupCountZ; ++}; ++ ++typedef void(*PFN_WORK_FUNC)(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pDesc); ++ ++enum WORK_TYPE ++{ ++ SYNC, ++ DRAW, ++ CLEAR, ++ INVALIDATETILES, ++ STORETILES, ++ QUERYSTATS, ++}; ++ ++struct BE_WORK ++{ ++ WORK_TYPE type; ++ PFN_WORK_FUNC pfnWork; ++ union ++ { ++ SYNC_DESC sync; ++ TRIANGLE_WORK_DESC tri; ++ CLEAR_DESC clear; ++ INVALIDATE_TILES_DESC invalidateTiles; ++ STORE_TILES_DESC storeTiles; ++ QUERY_DESC queryStats; ++ } desc; ++}; ++ ++struct DRAW_WORK ++{ ++ DRAW_CONTEXT* pDC; ++ union ++ { ++ uint32_t numIndices; // DrawIndexed: Number of indices for draw. ++ uint32_t numVerts; // Draw: Number of verts (triangles, lines, etc) ++ }; ++ union ++ { ++ const int32_t* pIB; // DrawIndexed: App supplied indices ++ uint32_t startVertex; // Draw: Starting vertex in VB to render from. ++ }; ++ int32_t baseVertex; ++ uint32_t numInstances; // Number of instances ++ uint32_t startInstance; // Instance offset ++ uint32_t startPrimID; // starting primitiveID for this draw batch ++ SWR_FORMAT type; // index buffer type ++}; ++ ++typedef void(*PFN_FE_WORK_FUNC)(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pDesc); ++struct FE_WORK ++{ ++ WORK_TYPE type; ++ PFN_FE_WORK_FUNC pfnWork; ++ union ++ { ++ SYNC_DESC sync; ++ DRAW_WORK draw; ++ CLEAR_DESC clear; ++ INVALIDATE_TILES_DESC invalidateTiles; ++ STORE_TILES_DESC storeTiles; ++ QUERY_DESC queryStats; ++ } desc; ++}; ++ ++struct GUARDBAND ++{ ++ float left, right, top, bottom; ++}; ++ ++struct PA_STATE; ++ ++// function signature for pipeline stages that execute after primitive assembly ++typedef void(*PFN_PROCESS_PRIMS)(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], ++ uint32_t primMask, simdscalari primID); ++ ++OSALIGNLINE(struct) API_STATE ++{ ++ // Vertex Buffers ++ SWR_VERTEX_BUFFER_STATE vertexBuffers[KNOB_NUM_STREAMS]; ++ ++ // Index Buffer ++ SWR_INDEX_BUFFER_STATE indexBuffer; ++ ++ // FS - Fetch Shader State ++ PFN_FETCH_FUNC pfnFetchFunc; ++ ++ // VS - Vertex Shader State ++ PFN_VERTEX_FUNC pfnVertexFunc; ++ ++ // GS - Geometry Shader State ++ PFN_GS_FUNC pfnGsFunc; ++ SWR_GS_STATE gsState; ++ ++ // CS - Compute Shader ++ PFN_CS_FUNC pfnCsFunc; ++ uint32_t totalThreadsInGroup; ++ ++ // FE - Frontend State ++ SWR_FRONTEND_STATE frontendState; ++ ++ // SOS - Streamout Shader State ++ PFN_SO_FUNC pfnSoFunc[MAX_SO_STREAMS]; ++ ++ // Streamout state ++ SWR_STREAMOUT_STATE soState; ++ mutable SWR_STREAMOUT_BUFFER soBuffer[MAX_SO_STREAMS]; ++ ++ // Tessellation State ++ PFN_HS_FUNC pfnHsFunc; ++ PFN_DS_FUNC pfnDsFunc; ++ SWR_TS_STATE tsState; ++ ++ // Specifies which VS outputs are sent to PS. 
++struct GUARDBAND
++{
++    float left, right, top, bottom;
++};
++
++struct PA_STATE;
++
++// function signature for pipeline stages that execute after primitive assembly
++typedef void(*PFN_PROCESS_PRIMS)(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[],
++                                 uint32_t primMask, simdscalari primID);
++
++OSALIGNLINE(struct) API_STATE
++{
++    // Vertex Buffers
++    SWR_VERTEX_BUFFER_STATE vertexBuffers[KNOB_NUM_STREAMS];
++
++    // Index Buffer
++    SWR_INDEX_BUFFER_STATE indexBuffer;
++
++    // FS - Fetch Shader State
++    PFN_FETCH_FUNC pfnFetchFunc;
++
++    // VS - Vertex Shader State
++    PFN_VERTEX_FUNC pfnVertexFunc;
++
++    // GS - Geometry Shader State
++    PFN_GS_FUNC pfnGsFunc;
++    SWR_GS_STATE gsState;
++
++    // CS - Compute Shader
++    PFN_CS_FUNC pfnCsFunc;
++    uint32_t totalThreadsInGroup;
++
++    // FE - Frontend State
++    SWR_FRONTEND_STATE frontendState;
++
++    // SOS - Streamout Shader State
++    PFN_SO_FUNC pfnSoFunc[MAX_SO_STREAMS];
++
++    // Streamout state
++    SWR_STREAMOUT_STATE soState;
++    mutable SWR_STREAMOUT_BUFFER soBuffer[MAX_SO_STREAMS];
++
++    // Tessellation State
++    PFN_HS_FUNC pfnHsFunc;
++    PFN_DS_FUNC pfnDsFunc;
++    SWR_TS_STATE tsState;
++
++    // Specifies which VS outputs are sent to PS.
++    // Does not include position.
++    uint32_t linkageMask;
++    uint32_t linkageCount;
++    uint8_t linkageMap[MAX_ATTRIBUTES];
++
++    // attrib mask, specifies the total set of attributes used
++    // by the frontend (vs, so, gs)
++    uint32_t feAttribMask;
++
++    PRIMITIVE_TOPOLOGY topology;
++    bool forceFront;
++
++    // RS - Rasterizer State
++    SWR_RASTSTATE rastState;
++    // floating point multisample offsets
++    float samplePos[SWR_MAX_NUM_MULTISAMPLES * 2];
++
++    GUARDBAND gbState;
++
++    SWR_VIEWPORT vp[KNOB_NUM_VIEWPORTS_SCISSORS];
++    SWR_VIEWPORT_MATRIX vpMatrix[KNOB_NUM_VIEWPORTS_SCISSORS];
++
++    BBOX scissorRects[KNOB_NUM_VIEWPORTS_SCISSORS];
++    BBOX scissorInFixedPoint;
++
++    // Backend state
++    SWR_BACKEND_STATE backendState;
++
++    // PS - Pixel shader state
++    SWR_PS_STATE psState;
++
++    SWR_DEPTH_STENCIL_STATE depthStencilState;
++
++    // OM - Output Merger State
++    SWR_BLEND_STATE blendState;
++    PFN_BLEND_JIT_FUNC pfnBlendFunc[SWR_NUM_RENDERTARGETS];
++
++    // Stats are incremented when this is true.
++    bool enableStats;
++};
++
++class MacroTileMgr;
++class DispatchQueue;
++
++struct RenderOutputBuffers
++{
++    uint8_t* pColor[SWR_NUM_RENDERTARGETS];
++    uint8_t* pDepth;
++    uint8_t* pStencil;
++};
++
++// pipeline function pointer types
++typedef void(*PFN_BACKEND_FUNC)(DRAW_CONTEXT*, uint32_t, uint32_t, uint32_t, SWR_TRIANGLE_DESC&, RenderOutputBuffers&);
++
++// Draw State
++struct DRAW_STATE
++{
++    API_STATE state;
++
++    void* pPrivateState;  // The driver is required to set this up for each draw.
++
++    // pipeline function pointers, filled in by the API thread when setting up the draw
++    PFN_BACKEND_FUNC pfnBackend;
++    PFN_PROCESS_PRIMS pfnProcessPrims;
++
++    Arena arena;          // This should only be used by the API thread.
++};
++
++// Draw Context
++// The API thread sets up a draw context that exists for the life of the draw.
++// This draw context maintains all of the state needed for the draw operation.
++struct DRAW_CONTEXT
++{
++    SWR_CONTEXT *pContext;
++
++    uint64_t drawId;
++
++    bool isCompute;      // Is this DC a compute context?
++
++    FE_WORK FeWork;
++    volatile OSALIGNLINE(uint32_t) FeLock;
++    volatile OSALIGNLINE(bool) inUse;
++    volatile OSALIGNLINE(bool) doneFE;       // Is FE work done for this draw?
++
++    uint64_t dependency;
++
++    MacroTileMgr* pTileMgr;
++
++    // The following fields are valid if isCompute is true.
++    volatile OSALIGNLINE(bool) doneCompute;  // Is this dispatch done? (isCompute)
++    DispatchQueue* pDispatch;                // Queue for thread groups. (isCompute)
++
++    DRAW_STATE* pState;
++    Arena arena;
++};
++
++INLINE const API_STATE& GetApiState(const DRAW_CONTEXT* pDC)
++{
++    SWR_ASSERT(pDC != nullptr);
++    SWR_ASSERT(pDC->pState != nullptr);
++
++    return pDC->pState->state;
++}
++
++INLINE void* GetPrivateState(const DRAW_CONTEXT* pDC)
++{
++    SWR_ASSERT(pDC != nullptr);
++    SWR_ASSERT(pDC->pState != nullptr);
++
++    return pDC->pState->pPrivateState;
++}
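++// Illustrative only (not part of the original patch): pipeline stages reach
++// draw state exclusively through these accessors, e.g.
++//
++//   const API_STATE& apiState = GetApiState(pDC);
++//   if (apiState.depthStencilState.depthTestEnable) { /* bind depth hot tile */ }
++//
++// GetPrivateState returns the driver's per-draw blob, which is opaque to the
++// core; the caller casts it to its own type.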
++class HotTileMgr;
++
++struct SWR_CONTEXT
++{
++    // Draw Context Ring
++    // Each draw needs its own state in order to support multiple draws in flight across multiple threads.
++    // We maintain N draw contexts configured as a ring. The size of the ring limits the maximum number
++    // of draws that can be in flight at any given time.
++    //
++    // Description:
++    // 1. State - When an application first sets state, we'll request a new draw context to use.
++    //    a. If there are no available draw contexts then we'll have to wait until one becomes free.
++    //    b. If one is available then set pCurDrawContext to point to it and mark it in use.
++    //    c. All state calls set state on pCurDrawContext.
++    // 2. Draw - Creates and submits a work item that is associated with the current draw context.
++    //    a. Set pPrevDrawContext = pCurDrawContext
++    //    b. Set pCurDrawContext to NULL.
++    // 3. State - When an application sets state after a draw:
++    //    a. Same as step 1.
++    //    b. State is copied from the previous draw context to the current one.
++    DRAW_CONTEXT* dcRing;
++
++    DRAW_CONTEXT *pCurDrawContext;    // Points to the DC entry in the ring for an unsubmitted draw.
++    DRAW_CONTEXT *pPrevDrawContext;   // Points to the DC entry for the previously submitted draw whose state we can copy.
++
++    // Draw State Ring
++    // When draws are very large (lots of primitives), the API thread will break them up.
++    // These split draws all have identical state. So instead of storing the state directly
++    // in the Draw Context (DC) we instead store it in a Draw State (DS). This allows multiple DCs
++    // to reference a single entry in the DS ring.
++    DRAW_STATE* dsRing;
++
++    uint32_t curStateId;              // Current index to the next available entry in the DS ring.
++
++    uint32_t NumWorkerThreads;
++
++    THREAD_POOL threadPool;           // Thread pool associated with this context
++
++    std::condition_variable FifosNotEmpty;
++    std::mutex WaitLock;
++
++    // Draw Contexts will get a unique drawId generated from this
++    uint64_t nextDrawId;
++
++    // Last retired drawId. Read/written only by the API thread.
++    uint64_t LastRetiredId;
++
++    // most recent draw id enqueued by the API thread
++    // written by the API thread, read by multiple workers
++    OSALIGNLINE(volatile uint64_t) DrawEnqueued;
++
++    // Current FE/BE status of each worker.
++    OSALIGNLINE(volatile uint64_t) WorkerFE[KNOB_MAX_NUM_THREADS];
++    OSALIGNLINE(volatile uint64_t) WorkerBE[KNOB_MAX_NUM_THREADS];
++
++    DRIVER_TYPE driverType;
++
++    uint32_t privateStateSize;
++
++    HotTileMgr *pHotTileMgr;
++
++    // tile load/store functions, passed in at create context time
++    PFN_LOAD_TILE pfnLoadTile;
++    PFN_STORE_TILE pfnStoreTile;
++    PFN_CLEAR_TILE pfnClearTile;
++
++    // Global Stats
++    SWR_STATS stats[KNOB_MAX_NUM_THREADS];
++
++    // Scratch space for workers.
++    uint8_t* pScratch[KNOB_MAX_NUM_THREADS];
++};
++
++void WaitForDependencies(SWR_CONTEXT *pContext, uint64_t drawId);
++void WakeAllThreads(SWR_CONTEXT *pContext);
++
++#define UPDATE_STAT(name, count) if (GetApiState(pDC).enableStats) { pContext->stats[workerId].name += count; }
++#define SET_STAT(name, count) if (GetApiState(pDC).enableStats) { pContext->stats[workerId].name = count; }
+diff --git a/src/gallium/drivers/swr/rasterizer/core/depthstencil.h b/src/gallium/drivers/swr/rasterizer/core/depthstencil.h
+new file mode 100644
+index 0000000..9f869ec
+--- /dev/null
++++ b/src/gallium/drivers/swr/rasterizer/core/depthstencil.h
+@@ -0,0 +1,215 @@
++/****************************************************************************
++* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
++*
++* Permission is hereby granted, free of charge, to any person obtaining a
++* copy of this software and associated documentation files (the "Software"),
++* to deal in the Software without restriction, including without limitation
++* the rights to use, copy, modify, merge, publish, distribute, sublicense,
++* and/or sell copies of the Software, and to permit persons to whom the
++* Software is furnished to do so, subject to the following conditions:
++*
++* The above copyright notice and this permission notice (including the next
++* paragraph) shall be included in all copies or substantial portions of the
++* Software.
++*
++* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
++* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++* IN THE SOFTWARE.
++*
++* @file depthstencil.h
++*
++* @brief Implements depth/stencil functionality
++*
++******************************************************************************/
++#pragma once
++#include "common/os.h"
++#include "format_conversion.h"
++
++INLINE
++void StencilOp(SWR_STENCILOP op, simdscalar mask, simdscalar stencilRefps, simdscalar &stencilps)
++{
++    simdscalari stencil = _simd_castps_si(stencilps);
++
++    switch (op)
++    {
++    case STENCILOP_KEEP:
++        break;
++    case STENCILOP_ZERO:
++        stencilps = _simd_blendv_ps(stencilps, _simd_setzero_ps(), mask);
++        break;
++    case STENCILOP_REPLACE:
++        stencilps = _simd_blendv_ps(stencilps, stencilRefps, mask);
++        break;
++    case STENCILOP_INCRSAT:
++    {
++        simdscalari stencilincr = _simd_adds_epu8(stencil, _simd_set1_epi32(1));
++        stencilps = _simd_blendv_ps(stencilps, _simd_castsi_ps(stencilincr), mask);
++        break;
++    }
++    case STENCILOP_DECRSAT:
++    {
++        simdscalari stencildecr = _simd_subs_epu8(stencil, _simd_set1_epi32(1));
++        stencilps = _simd_blendv_ps(stencilps, _simd_castsi_ps(stencildecr), mask);
++        break;
++    }
++    case STENCILOP_INCR:
++    {
++        simdscalari stencilincr = _simd_add_epi8(stencil, _simd_set1_epi32(1));
++        stencilps = _simd_blendv_ps(stencilps, _simd_castsi_ps(stencilincr), mask);
++        break;
++    }
++    case STENCILOP_DECR:
++    {
++        simdscalari stencildecr = _simd_add_epi8(stencil, _simd_set1_epi32((-1) & 0xff));
++        stencilps = _simd_blendv_ps(stencilps, _simd_castsi_ps(stencildecr), mask);
++        break;
++    }
++    case STENCILOP_INVERT:
++    {
++        simdscalar stencilinvert = _simd_andnot_ps(stencilps, _simd_cmpeq_ps(_simd_setzero_ps(), _simd_setzero_ps()));
++        stencilps = _simd_blendv_ps(stencilps, stencilinvert, mask);
++        break;
++    }
++    default:
++        break;
++    }
++}
++
++
++INLINE
++simdscalar ZTest(const SWR_VIEWPORT* pViewport, const SWR_DEPTH_STENCIL_STATE* pDSState,
++                 bool frontFacing, simdscalar interpZ, BYTE* pDepthBase, simdscalar mask, BYTE *pStencilBase,
++                 bool testOnly)
++{
++    static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
++    static_assert(KNOB_STENCIL_HOT_TILE_FORMAT == R8_UINT, "Unsupported stencil hot tile format");
++
++    simdscalar depthResult = _simd_set1_ps(-1.0f);
++    simdscalar zbuf;
++
++    // clamp Z to viewport [minZ..maxZ]
++    simdscalar vMinZ = _simd_broadcast_ss(&pViewport->minZ);
++    simdscalar vMaxZ = _simd_broadcast_ss(&pViewport->maxZ);
++    interpZ = _simd_min_ps(vMaxZ,
_simd_max_ps(vMinZ, interpZ));
++
++    if (pDSState->depthTestEnable)
++    {
++        switch (pDSState->depthTestFunc)
++        {
++        case ZFUNC_NEVER: depthResult = _simd_setzero_ps(); break;
++        case ZFUNC_ALWAYS: break;
++        default:
++            zbuf = _simd_load_ps((const float*)pDepthBase);
++        }
++
++        switch (pDSState->depthTestFunc)
++        {
++        case ZFUNC_LE: depthResult = _simd_cmple_ps(interpZ, zbuf); break;
++        case ZFUNC_LT: depthResult = _simd_cmplt_ps(interpZ, zbuf); break;
++        case ZFUNC_GT: depthResult = _simd_cmpgt_ps(interpZ, zbuf); break;
++        case ZFUNC_GE: depthResult = _simd_cmpge_ps(interpZ, zbuf); break;
++        case ZFUNC_EQ: depthResult = _simd_cmpeq_ps(interpZ, zbuf); break;
++        }
++    }
++
++    simdscalar stencilMask = _simd_set1_ps(-1.0f);
++    simdscalar stencilbuf;
++
++    uint8_t stencilRefValue;
++    uint32_t stencilTestFunc;
++    uint32_t stencilFailOp;
++    uint32_t stencilPassDepthPassOp;
++    uint32_t stencilPassDepthFailOp;
++    uint8_t stencilTestMask;
++    uint8_t stencilWriteMask;
++    if (frontFacing || !pDSState->doubleSidedStencilTestEnable)
++    {
++        stencilRefValue = pDSState->stencilRefValue;
++        stencilTestFunc = pDSState->stencilTestFunc;
++        stencilFailOp = pDSState->stencilFailOp;
++        stencilPassDepthPassOp = pDSState->stencilPassDepthPassOp;
++        stencilPassDepthFailOp = pDSState->stencilPassDepthFailOp;
++        stencilTestMask = pDSState->stencilTestMask;
++        stencilWriteMask = pDSState->stencilWriteMask;
++    }
++    else
++    {
++        stencilRefValue = pDSState->backfaceStencilRefValue;
++        stencilTestFunc = pDSState->backfaceStencilTestFunc;
++        stencilFailOp = pDSState->backfaceStencilFailOp;
++        stencilPassDepthPassOp = pDSState->backfaceStencilPassDepthPassOp;
++        stencilPassDepthFailOp = pDSState->backfaceStencilPassDepthFailOp;
++        stencilTestMask = pDSState->backfaceStencilTestMask;
++        stencilWriteMask = pDSState->backfaceStencilWriteMask;
++    }
++
++    if (pDSState->stencilTestEnable)
++    {
++        simdvector sbuf;
++        LoadSOA<R8_UINT>(pStencilBase, sbuf);
++        stencilbuf = sbuf.v[0];
++
++        // apply stencil read mask
++        simdscalar stencilWithMask = _simd_castsi_ps(_simd_and_si(_simd_castps_si(sbuf.v[0]), _simd_set1_epi32(stencilTestMask)));
++
++        // do stencil compare in float to avoid simd integer emulation in AVX1
++        stencilWithMask = _simd_cvtepi32_ps(_simd_castps_si(stencilWithMask));
++
++        simdscalar stencilRef = _simd_set1_ps((float)(stencilRefValue & stencilTestMask));
++
++        switch (stencilTestFunc)
++        {
++        case ZFUNC_ALWAYS: break;
++        case ZFUNC_NEVER: stencilMask = _simd_setzero_ps(); break;
++        case ZFUNC_LE: stencilMask = _simd_cmple_ps(stencilRef, stencilWithMask); break;
++        case ZFUNC_LT: stencilMask = _simd_cmplt_ps(stencilRef, stencilWithMask); break;
++        case ZFUNC_GT: stencilMask = _simd_cmpgt_ps(stencilRef, stencilWithMask); break;
++        case ZFUNC_GE: stencilMask = _simd_cmpge_ps(stencilRef, stencilWithMask); break;
++        case ZFUNC_EQ: stencilMask = _simd_cmpeq_ps(stencilRef, stencilWithMask); break;
++        case ZFUNC_NE: stencilMask = _simd_cmpneq_ps(stencilRef, stencilWithMask); break;
++        }
++    }
++
++    simdscalar depthWriteMask = _simd_and_ps(depthResult, stencilMask);
++    depthWriteMask = _simd_and_ps(depthWriteMask, mask);
++
++    if (testOnly) {
++        return depthWriteMask;
++    }
++
++    if (pDSState->depthWriteEnable)
++    {
++        _simd_maskstore_ps((float*)pDepthBase, _simd_castps_si(depthWriteMask), interpZ);
++    }
++
++    if (pDSState->stencilWriteEnable)
++    {
++        simdscalar stencilps = stencilbuf;
++        simdscalar stencilRefps = _simd_castsi_ps(_simd_set1_epi32(stencilRefValue));
++
++        simdscalar stencilFailMask = _simd_andnot_ps(stencilMask,
mask);
++        simdscalar stencilPassDepthPassMask = _simd_and_ps(stencilMask, depthResult);
++        simdscalar stencilPassDepthFailMask = _simd_and_ps(stencilMask, _simd_andnot_ps(depthResult, _simd_set1_ps(-1)));
++
++        simdscalar origStencil = stencilps;
++
++        StencilOp((SWR_STENCILOP)stencilFailOp, stencilFailMask, stencilRefps, stencilps);
++        StencilOp((SWR_STENCILOP)stencilPassDepthFailOp, stencilPassDepthFailMask, stencilRefps, stencilps);
++        StencilOp((SWR_STENCILOP)stencilPassDepthPassOp, stencilPassDepthPassMask, stencilRefps, stencilps);
++
++        // apply stencil write mask
++        simdscalari vWriteMask = _simd_set1_epi32(stencilWriteMask);
++        stencilps = _simd_and_ps(stencilps, _simd_castsi_ps(vWriteMask));
++        stencilps = _simd_or_ps(_simd_andnot_ps(_simd_castsi_ps(vWriteMask), origStencil), stencilps);
++
++        simdvector stencilResult;
++        stencilResult.v[0] = _simd_blendv_ps(origStencil, stencilps, mask);
++        StoreSOA<R8_UINT>(stencilResult, pStencilBase);
++    }
++
++    return depthWriteMask;
++}
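++// Illustrative only (not part of the original patch): a backend tile routine
++// would typically invoke ZTest with interpolated per-pixel Z and the hot-tile
++// pointers; vZ, vCoverage, pDepthBase and pStencilBase are assumed locals:
++//
++//   const API_STATE& apiState = GetApiState(pDC);
++//   simdscalar depthPassMask = ZTest(&apiState.vp[0], &apiState.depthStencilState,
++//                                    desc.triFlags.frontFacing, vZ, pDepthBase,
++//                                    vCoverage, pStencilBase, false /*testOnly*/);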
+diff --git a/src/gallium/drivers/swr/rasterizer/core/fifo.hpp b/src/gallium/drivers/swr/rasterizer/core/fifo.hpp
+new file mode 100644
+index 0000000..238f5ee
+--- /dev/null
++++ b/src/gallium/drivers/swr/rasterizer/core/fifo.hpp
+@@ -0,0 +1,144 @@
++/****************************************************************************
++* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
++*
++* Permission is hereby granted, free of charge, to any person obtaining a
++* copy of this software and associated documentation files (the "Software"),
++* to deal in the Software without restriction, including without limitation
++* the rights to use, copy, modify, merge, publish, distribute, sublicense,
++* and/or sell copies of the Software, and to permit persons to whom the
++* Software is furnished to do so, subject to the following conditions:
++*
++* The above copyright notice and this permission notice (including the next
++* paragraph) shall be included in all copies or substantial portions of the
++* Software.
++*
++* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
++* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++* IN THE SOFTWARE.
++*
++* @file fifo.hpp
++*
++* @brief Definitions for our fifos used for thread communication.
++*
++******************************************************************************/
++#pragma once
++
++#include "common/os.h"
++#include <vector>
++#include <cstring>
++
++template <typename T>
++struct QUEUE
++{
++    OSALIGNLINE(volatile uint32_t) mLock;
++    OSALIGNLINE(volatile uint32_t) mNumEntries;
++    std::vector<T*> mBlocks;
++    T* mCurBlock;
++    uint32_t mHead;
++    uint32_t mTail;
++    uint32_t mCurBlockIdx;
++
++    // power of 2
++    static const uint32_t mBlockSizeShift = 6;
++    static const uint32_t mBlockSize = 1 << mBlockSizeShift;
++
++    void initialize()
++    {
++        mLock = 0;
++        mHead = 0;
++        mTail = 0;
++        mNumEntries = 0;
++        mCurBlock = (T*)malloc(mBlockSize*sizeof(T));
++        mBlocks.push_back(mCurBlock);
++        mCurBlockIdx = 0;
++    }
++
++    void clear()
++    {
++        mHead = 0;
++        mTail = 0;
++        mCurBlock = mBlocks[0];
++        mCurBlockIdx = 0;
++
++        mNumEntries = 0;
++        _ReadWriteBarrier();
++        mLock = 0;
++    }
++
++    uint32_t getNumQueued()
++    {
++        return mNumEntries;
++    }
++
++    bool tryLock()
++    {
++        if (mLock)
++        {
++            return false;
++        }
++
++        // try to lock the FIFO
++        LONG initial = InterlockedCompareExchange(&mLock, 1, 0);
++        return (initial == 0);
++    }
++
++    void unlock()
++    {
++        mLock = 0;
++    }
++
++    T* peek()
++    {
++        if (mNumEntries == 0)
++        {
++            return nullptr;
++        }
++        uint32_t block = mHead >> mBlockSizeShift;
++        return &mBlocks[block][mHead & (mBlockSize-1)];
++    }
++
++    void dequeue_noinc()
++    {
++        mHead++;
++        mNumEntries--;
++    }
++
++    bool enqueue_try_nosync(const T* entry)
++    {
++        memcpy(&mCurBlock[mTail], entry, sizeof(T));
++
++        mTail++;
++        if (mTail == mBlockSize)
++        {
++            if (++mCurBlockIdx < mBlocks.size())
++            {
++                mCurBlock = mBlocks[mCurBlockIdx];
++            }
++            else
++            {
++                T* newBlock = (T*)malloc(sizeof(T)*mBlockSize);
++                SWR_ASSERT(newBlock);
++
++                mBlocks.push_back(newBlock);
++                mCurBlock = newBlock;
++            }
++
++            mTail = 0;
++        }
++
++        mNumEntries++;
++        return true;
++    }
++
++    void destroy()
++    {
++        for (uint32_t i = 0; i < mBlocks.size(); ++i)
++        {
++            free(mBlocks[i]);
++        }
++    }
++
++};
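++// Illustrative helper (not part of the original patch): drains one entry from
++// a QUEUE under its spin lock. The handler parameter is an assumption for the
++// sketch; real consumers in the threading code inline this pattern.
++template <typename T, typename FnT>
++INLINE bool TryDrainOne(QUEUE<T>& queue, FnT&& handler)
++{
++    if (!queue.tryLock())
++    {
++        return false;              // another worker owns this FIFO
++    }
++    T* pEntry = queue.peek();      // front entry, nullptr if empty
++    if (pEntry != nullptr)
++    {
++        handler(*pEntry);          // process the entry in place
++        queue.dequeue_noinc();     // pop without freeing block storage
++    }
++    queue.unlock();
++    return pEntry != nullptr;
++}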
+diff --git a/src/gallium/drivers/swr/rasterizer/core/format_conversion.h b/src/gallium/drivers/swr/rasterizer/core/format_conversion.h
+new file mode 100644
+index 0000000..af57697
+--- /dev/null
++++ b/src/gallium/drivers/swr/rasterizer/core/format_conversion.h
+@@ -0,0 +1,167 @@
++/****************************************************************************
++* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
++*
++* Permission is hereby granted, free of charge, to any person obtaining a
++* copy of this software and associated documentation files (the "Software"),
++* to deal in the Software without restriction, including without limitation
++* the rights to use, copy, modify, merge, publish, distribute, sublicense,
++* and/or sell copies of the Software, and to permit persons to whom the
++* Software is furnished to do so, subject to the following conditions:
++*
++* The above copyright notice and this permission notice (including the next
++* paragraph) shall be included in all copies or substantial portions of the
++* Software.
++*
++* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
++* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++* IN THE SOFTWARE.
++*
++* @file format_conversion.h
++*
++* @brief SIMD SOA pixel format conversion routines (SOA <-> RGBA32_FLOAT)
++*
++******************************************************************************/
++#include "format_types.h"
++#include "format_traits.h"
++
++//////////////////////////////////////////////////////////////////////////
++/// @brief Load SIMD packed pixels in SOA format and convert to
++///        SOA RGBA32_FLOAT format.
++/// @param pSrc - source data in SOA form
++/// @param dst - output data in SOA form
++template<SWR_FORMAT SrcFormat>
++INLINE void LoadSOA(const BYTE *pSrc, simdvector &dst)
++{
++    // fast path for float32
++    if ((FormatTraits<SrcFormat>::GetType(0) == SWR_TYPE_FLOAT) && (FormatTraits<SrcFormat>::GetBPC(0) == 32))
++    {
++        auto lambda = [&](int comp)
++        {
++            simdscalar vComp = _simd_load_ps((const float*)(pSrc + comp*sizeof(simdscalar)));
++
++            dst.v[FormatTraits<SrcFormat>::swizzle(comp)] = vComp;
++        };
++
++        UnrollerL<0, FormatTraits<SrcFormat>::numComps, 1>::step(lambda);
++        return;
++    }
++
++    auto lambda = [&](int comp)
++    {
++        // load SIMD components
++        simdscalar vComp = FormatTraits<SrcFormat>::loadSOA(comp, pSrc);
++
++        // unpack
++        vComp = FormatTraits<SrcFormat>::unpack(comp, vComp);
++
++        // convert
++        if (FormatTraits<SrcFormat>::isNormalized(comp))
++        {
++            vComp = _simd_cvtepi32_ps(_simd_castps_si(vComp));
++            vComp = _simd_mul_ps(vComp, _simd_set1_ps(FormatTraits<SrcFormat>::toFloat(comp)));
++        }
++
++        dst.v[FormatTraits<SrcFormat>::swizzle(comp)] = vComp;
++
++        pSrc += (FormatTraits<SrcFormat>::GetBPC(comp) * KNOB_SIMD_WIDTH) / 8;
++    };
++
++    UnrollerL<0, FormatTraits<SrcFormat>::numComps, 1>::step(lambda);
++}
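++// Illustrative only (not part of the original patch): a full SOA format
++// conversion runs through the RGBA32_FLOAT working format, e.g.
++//
++//   simdvector pixels;
++//   LoadSOA<R8G8B8A8_UNORM>(pSrcTile, pixels);    // unpack + normalize to float
++//   StoreSOA<B8G8R8A8_UNORM>(pixels, pDstTile);   // swizzle, clamp + pack (below)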
++//////////////////////////////////////////////////////////////////////////
++/// @brief Convert and store simdvector of pixels in SOA
++///        RGBA32_FLOAT to SOA format
++/// @param src - source data in SOA form
++/// @param pDst - output data in SOA form
++template<SWR_FORMAT DstFormat>
++INLINE void StoreSOA(const simdvector &src, BYTE *pDst)
++{
++    // fast path for float32
++    if ((FormatTraits<DstFormat>::GetType(0) == SWR_TYPE_FLOAT) && (FormatTraits<DstFormat>::GetBPC(0) == 32))
++    {
++        for (uint32_t comp = 0; comp < FormatTraits<DstFormat>::numComps; ++comp)
++        {
++            simdscalar vComp = src.v[FormatTraits<DstFormat>::swizzle(comp)];
++
++            // Gamma-correct
++            if (FormatTraits<DstFormat>::isSRGB)
++            {
++                if (comp < 3)   // Input format is always RGBA32_FLOAT.
++                {
++                    vComp = FormatTraits<DstFormat>::convertSrgb(comp, vComp);
++                }
++            }
++
++            _simd_store_ps((float*)(pDst + comp*sizeof(simdscalar)), vComp);
++        }
++        return;
++    }
++
++    auto lambda = [&](int comp)
++    {
++        simdscalar vComp = src.v[FormatTraits<DstFormat>::swizzle(comp)];
++
++        // Gamma-correct
++        if (FormatTraits<DstFormat>::isSRGB)
++        {
++            if (comp < 3)   // Input format is always RGBA32_FLOAT.
++            {
++                vComp = FormatTraits<DstFormat>::convertSrgb(comp, vComp);
++            }
++        }
++
++        // convert
++        if (FormatTraits<DstFormat>::isNormalized(comp))
++        {
++            if (FormatTraits<DstFormat>::GetType(comp) == SWR_TYPE_UNORM)
++            {
++                vComp = _simd_max_ps(vComp, _simd_setzero_ps());
++            }
++
++            if (FormatTraits<DstFormat>::GetType(comp) == SWR_TYPE_SNORM)
++            {
++                vComp = _simd_max_ps(vComp, _simd_set1_ps(-1.0f));
++            }
++            vComp = _simd_min_ps(vComp, _simd_set1_ps(1.0f));
++
++            vComp = _simd_mul_ps(vComp, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(comp)));
++            vComp = _simd_castsi_ps(_simd_cvtps_epi32(vComp));
++        }
++        else if (FormatTraits<DstFormat>::GetBPC(comp) < 32)
++        {
++            if (FormatTraits<DstFormat>::GetType(comp) == SWR_TYPE_UINT)
++            {
++                int iMax = (1 << FormatTraits<DstFormat>::GetBPC(comp)) - 1;
++                int iMin = 0;
++                simdscalari vCompi = _simd_castps_si(vComp);
++                vCompi = _simd_max_epu32(vCompi, _simd_set1_epi32(iMin));
++                vCompi = _simd_min_epu32(vCompi, _simd_set1_epi32(iMax));
++                vComp = _simd_castsi_ps(vCompi);
++            }
++            else if (FormatTraits<DstFormat>::GetType(comp) == SWR_TYPE_SINT)
++            {
++                int iMax = (1 << (FormatTraits<DstFormat>::GetBPC(comp) - 1)) - 1;
++                int iMin = -1 - iMax;
++                simdscalari vCompi = _simd_castps_si(vComp);
++                vCompi = _simd_max_epi32(vCompi, _simd_set1_epi32(iMin));
++                vCompi = _simd_min_epi32(vCompi, _simd_set1_epi32(iMax));
++                vComp = _simd_castsi_ps(vCompi);
++            }
++        }
++
++        // pack
++        vComp = FormatTraits<DstFormat>::pack(comp, vComp);
++
++        // store
++        FormatTraits<DstFormat>::storeSOA(comp, pDst, vComp);
++
++        pDst += (FormatTraits<DstFormat>::GetBPC(comp) * KNOB_SIMD_WIDTH) / 8;
++    };
++
++    UnrollerL<0, FormatTraits<DstFormat>::numComps, 1>::step(lambda);
++}
+diff --git a/src/gallium/drivers/swr/rasterizer/core/format_traits.h b/src/gallium/drivers/swr/rasterizer/core/format_traits.h
+new file mode 100644
+index 0000000..d39f523
+--- /dev/null
++++ b/src/gallium/drivers/swr/rasterizer/core/format_traits.h
+@@ -0,0 +1,2954 @@
++
++/****************************************************************************
++* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
++*
++* Permission is hereby granted, free of charge, to any person obtaining a
++* copy of this software and associated documentation files (the "Software"),
++* to deal in the Software without restriction, including without limitation
++* the rights to use, copy, modify, merge, publish, distribute, sublicense,
++* and/or sell copies of the Software, and to permit persons to whom the
++* Software is furnished to do so, subject to the following conditions:
++*
++* The above copyright notice and this permission notice (including the next
++* paragraph) shall be included in all copies or substantial portions of the
++* Software.
++*
++* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
++* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++* IN THE SOFTWARE.
++* ++* @file format_traits.h ++* ++* @brief auto-generated file ++* ++* DO NOT EDIT ++* ++******************************************************************************/ ++ ++#pragma once ++ ++#include "format_types.h" ++#include "utils.h" ++ ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatSwizzle - Component swizzle selects ++////////////////////////////////////////////////////////////////////////// ++template ++struct FormatSwizzle ++{ ++ // Return swizzle select for component. ++ INLINE static uint32_t swizzle(UINT c) ++ { ++ static const uint32_t s[4] = { comp0, comp1, comp2, comp3 }; ++ return s[c]; ++ } ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits ++////////////////////////////////////////////////////////////////////////// ++template ++struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0>, ++ Defaults<0, 0, 0, 0> ++{ ++ static const uint32_t bpp{ 0 }; ++ static const uint32_t numComps{ 0 }; ++ static const bool hasAlpha{ false }; ++ static const uint32_t alphaComp{ 0 }; ++ ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{1}; ++ static const uint32_t bcHeight{1}; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for R32G32B32A32_FLOAT ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0, 1, 2, 3>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 128 }; ++ static const uint32_t numComps{ 4 }; ++ static const bool hasAlpha{ true }; ++ static const uint32_t alphaComp{ 3 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef Transpose32_32_32_32 TransposeT; ++ typedef Format4<32, 32, 32, 32> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for R32G32B32A32_SINT ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0, 1, 2, 3>, ++ Defaults<0, 0, 0, 0x1> ++{ ++ static const uint32_t bpp{ 128 }; ++ static const uint32_t numComps{ 4 }; ++ static const bool hasAlpha{ true }; ++ static const uint32_t alphaComp{ 3 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef Transpose32_32_32_32 TransposeT; ++ typedef Format4<32, 32, 32, 32> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for R32G32B32A32_UINT ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0, 1, 2, 3>, ++ Defaults<0, 0, 0, 0x1> ++{ ++ static const uint32_t bpp{ 128 }; ++ static const uint32_t numComps{ 4 }; ++ static const bool hasAlpha{ true }; ++ static const uint32_t alphaComp{ 3 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const 
uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef Transpose32_32_32_32 TransposeT; ++ typedef Format4<32, 32, 32, 32> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for R32G32B32X32_FLOAT ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0, 1, 2, 3>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 128 }; ++ static const uint32_t numComps{ 3 }; ++ static const bool hasAlpha{ false }; ++ static const uint32_t alphaComp{ 0 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef Transpose32_32_32_32 TransposeT; ++ typedef Format4<32, 32, 32, 32> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for R32G32B32A32_SSCALED ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0, 1, 2, 3>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 128 }; ++ static const uint32_t numComps{ 4 }; ++ static const bool hasAlpha{ true }; ++ static const uint32_t alphaComp{ 3 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef Transpose32_32_32_32 TransposeT; ++ typedef Format4<32, 32, 32, 32> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for R32G32B32A32_USCALED ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0, 1, 2, 3>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 128 }; ++ static const uint32_t numComps{ 4 }; ++ static const bool hasAlpha{ true }; ++ static const uint32_t alphaComp{ 3 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef Transpose32_32_32_32 TransposeT; ++ typedef Format4<32, 32, 32, 32> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for R32G32B32_FLOAT ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0, 1, 2>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 96 }; ++ static const uint32_t numComps{ 3 }; ++ static const bool hasAlpha{ false }; ++ static const uint32_t alphaComp{ 0 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef Transpose32_32_32 TransposeT; ++ typedef Format3<32, 32, 32> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for R32G32B32_SINT 
++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0, 1, 2>, ++ Defaults<0, 0, 0, 0x1> ++{ ++ static const uint32_t bpp{ 96 }; ++ static const uint32_t numComps{ 3 }; ++ static const bool hasAlpha{ false }; ++ static const uint32_t alphaComp{ 0 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef Transpose32_32_32 TransposeT; ++ typedef Format3<32, 32, 32> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for R32G32B32_UINT ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0, 1, 2>, ++ Defaults<0, 0, 0, 0x1> ++{ ++ static const uint32_t bpp{ 96 }; ++ static const uint32_t numComps{ 3 }; ++ static const bool hasAlpha{ false }; ++ static const uint32_t alphaComp{ 0 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef Transpose32_32_32 TransposeT; ++ typedef Format3<32, 32, 32> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for R32G32B32_SSCALED ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0, 1, 2>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 96 }; ++ static const uint32_t numComps{ 3 }; ++ static const bool hasAlpha{ false }; ++ static const uint32_t alphaComp{ 0 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef Transpose32_32_32 TransposeT; ++ typedef Format3<32, 32, 32> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for R32G32B32_USCALED ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0, 1, 2>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 96 }; ++ static const uint32_t numComps{ 3 }; ++ static const bool hasAlpha{ false }; ++ static const uint32_t alphaComp{ 0 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef Transpose32_32_32 TransposeT; ++ typedef Format3<32, 32, 32> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for R16G16B16A16_UNORM ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0, 1, 2, 3>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 64 }; ++ static const uint32_t numComps{ 4 }; ++ static const bool hasAlpha{ true }; ++ static const uint32_t alphaComp{ 3 }; ++ static const bool isSRGB{ false }; ++ static 
const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef Transpose16_16_16_16 TransposeT; ++ typedef Format4<16, 16, 16, 16> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for R16G16B16A16_SNORM ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0, 1, 2, 3>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 64 }; ++ static const uint32_t numComps{ 4 }; ++ static const bool hasAlpha{ true }; ++ static const uint32_t alphaComp{ 3 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef Transpose16_16_16_16 TransposeT; ++ typedef Format4<16, 16, 16, 16> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for R16G16B16A16_SINT ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0, 1, 2, 3>, ++ Defaults<0, 0, 0, 0x1> ++{ ++ static const uint32_t bpp{ 64 }; ++ static const uint32_t numComps{ 4 }; ++ static const bool hasAlpha{ true }; ++ static const uint32_t alphaComp{ 3 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef Transpose16_16_16_16 TransposeT; ++ typedef Format4<16, 16, 16, 16> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for R16G16B16A16_UINT ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0, 1, 2, 3>, ++ Defaults<0, 0, 0, 0x1> ++{ ++ static const uint32_t bpp{ 64 }; ++ static const uint32_t numComps{ 4 }; ++ static const bool hasAlpha{ true }; ++ static const uint32_t alphaComp{ 3 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef Transpose16_16_16_16 TransposeT; ++ typedef Format4<16, 16, 16, 16> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for R16G16B16A16_FLOAT ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0, 1, 2, 3>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 64 }; ++ static const uint32_t numComps{ 4 }; ++ static const bool hasAlpha{ true }; ++ static const uint32_t alphaComp{ 3 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef Transpose16_16_16_16 TransposeT; ++ typedef Format4<16, 16, 16, 16> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits 
specialization for R32G32_FLOAT ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0, 1>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 64 }; ++ static const uint32_t numComps{ 2 }; ++ static const bool hasAlpha{ false }; ++ static const uint32_t alphaComp{ 0 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef Transpose32_32 TransposeT; ++ typedef Format2<32, 32> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for R32G32_SINT ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0, 1>, ++ Defaults<0, 0, 0, 0x1> ++{ ++ static const uint32_t bpp{ 64 }; ++ static const uint32_t numComps{ 2 }; ++ static const bool hasAlpha{ false }; ++ static const uint32_t alphaComp{ 0 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef Transpose32_32 TransposeT; ++ typedef Format2<32, 32> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for R32G32_UINT ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0, 1>, ++ Defaults<0, 0, 0, 0x1> ++{ ++ static const uint32_t bpp{ 64 }; ++ static const uint32_t numComps{ 2 }; ++ static const bool hasAlpha{ false }; ++ static const uint32_t alphaComp{ 0 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef Transpose32_32 TransposeT; ++ typedef Format2<32, 32> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for R32_FLOAT_X8X24_TYPELESS ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0, 1>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 64 }; ++ static const uint32_t numComps{ 1 }; ++ static const bool hasAlpha{ false }; ++ static const uint32_t alphaComp{ 3 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef Transpose32_32 TransposeT; ++ typedef Format2<32, 32> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for R16G16B16X16_UNORM ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0, 1, 2, 3>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 64 }; ++ static const uint32_t numComps{ 3 }; ++ static const bool hasAlpha{ false }; ++ static const uint32_t alphaComp{ 0 }; ++ static const bool isSRGB{ false }; ++ static const bool 
isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef Transpose16_16_16_16 TransposeT; ++ typedef Format4<16, 16, 16, 16> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for R16G16B16X16_FLOAT ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0, 1, 2, 3>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 64 }; ++ static const uint32_t numComps{ 3 }; ++ static const bool hasAlpha{ false }; ++ static const uint32_t alphaComp{ 0 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef Transpose16_16_16_16 TransposeT; ++ typedef Format4<16, 16, 16, 16> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for R16G16B16A16_SSCALED ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0, 1, 2, 3>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 64 }; ++ static const uint32_t numComps{ 4 }; ++ static const bool hasAlpha{ true }; ++ static const uint32_t alphaComp{ 3 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef Transpose16_16_16_16 TransposeT; ++ typedef Format4<16, 16, 16, 16> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for R16G16B16A16_USCALED ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0, 1, 2, 3>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 64 }; ++ static const uint32_t numComps{ 4 }; ++ static const bool hasAlpha{ true }; ++ static const uint32_t alphaComp{ 3 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef Transpose16_16_16_16 TransposeT; ++ typedef Format4<16, 16, 16, 16> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for R32G32_SSCALED ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0, 1>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 64 }; ++ static const uint32_t numComps{ 2 }; ++ static const bool hasAlpha{ false }; ++ static const uint32_t alphaComp{ 0 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef Transpose32_32 TransposeT; ++ typedef Format2<32, 32> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for 
R32G32_USCALED ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0, 1>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 64 }; ++ static const uint32_t numComps{ 2 }; ++ static const bool hasAlpha{ false }; ++ static const uint32_t alphaComp{ 0 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef Transpose32_32 TransposeT; ++ typedef Format2<32, 32> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for R32_FLOAT_X8X24_TYPELESS_LD ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0, 1>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 64 }; ++ static const uint32_t numComps{ 1 }; ++ static const bool hasAlpha{ false }; ++ static const uint32_t alphaComp{ 3 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef Transpose32_32 TransposeT; ++ typedef Format2<32, 32> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for B8G8R8A8_UNORM ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<2, 1, 0, 3>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 32 }; ++ static const uint32_t numComps{ 4 }; ++ static const bool hasAlpha{ true }; ++ static const uint32_t alphaComp{ 3 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef Transpose8_8_8_8 TransposeT; ++ typedef Format4<8, 8, 8, 8> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for B8G8R8A8_UNORM_SRGB ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<2, 1, 0, 3>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 32 }; ++ static const uint32_t numComps{ 4 }; ++ static const bool hasAlpha{ true }; ++ static const uint32_t alphaComp{ 3 }; ++ static const bool isSRGB{ true }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef Transpose8_8_8_8 TransposeT; ++ typedef Format4<8, 8, 8, 8> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for R10G10B10A2_UNORM ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0, 1, 2, 3>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 32 }; ++ static const uint32_t numComps{ 4 }; ++ static const bool hasAlpha{ true }; ++ static const uint32_t alphaComp{ 3 }; ++ static const bool isSRGB{ 
false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef Transpose10_10_10_2 TransposeT; ++ typedef Format4<10, 10, 10, 2> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for R10G10B10A2_UNORM_SRGB ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0, 1, 2, 3>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 32 }; ++ static const uint32_t numComps{ 4 }; ++ static const bool hasAlpha{ true }; ++ static const uint32_t alphaComp{ 3 }; ++ static const bool isSRGB{ true }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef Transpose10_10_10_2 TransposeT; ++ typedef Format4<10, 10, 10, 2> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for R10G10B10A2_UINT ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0, 1, 2, 3>, ++ Defaults<0, 0, 0, 0x1> ++{ ++ static const uint32_t bpp{ 32 }; ++ static const uint32_t numComps{ 4 }; ++ static const bool hasAlpha{ true }; ++ static const uint32_t alphaComp{ 3 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef Transpose10_10_10_2 TransposeT; ++ typedef Format4<10, 10, 10, 2> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for R8G8B8A8_UNORM ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0, 1, 2, 3>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 32 }; ++ static const uint32_t numComps{ 4 }; ++ static const bool hasAlpha{ true }; ++ static const uint32_t alphaComp{ 3 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef Transpose8_8_8_8 TransposeT; ++ typedef Format4<8, 8, 8, 8> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for R8G8B8A8_UNORM_SRGB ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0, 1, 2, 3>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 32 }; ++ static const uint32_t numComps{ 4 }; ++ static const bool hasAlpha{ true }; ++ static const uint32_t alphaComp{ 3 }; ++ static const bool isSRGB{ true }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef Transpose8_8_8_8 TransposeT; ++ typedef Format4<8, 8, 8, 8> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits 
specialization for R8G8B8A8_SNORM ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0, 1, 2, 3>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 32 }; ++ static const uint32_t numComps{ 4 }; ++ static const bool hasAlpha{ true }; ++ static const uint32_t alphaComp{ 3 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef Transpose8_8_8_8 TransposeT; ++ typedef Format4<8, 8, 8, 8> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for R8G8B8A8_SINT ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0, 1, 2, 3>, ++ Defaults<0, 0, 0, 0x1> ++{ ++ static const uint32_t bpp{ 32 }; ++ static const uint32_t numComps{ 4 }; ++ static const bool hasAlpha{ true }; ++ static const uint32_t alphaComp{ 3 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef Transpose8_8_8_8 TransposeT; ++ typedef Format4<8, 8, 8, 8> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for R8G8B8A8_UINT ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0, 1, 2, 3>, ++ Defaults<0, 0, 0, 0x1> ++{ ++ static const uint32_t bpp{ 32 }; ++ static const uint32_t numComps{ 4 }; ++ static const bool hasAlpha{ true }; ++ static const uint32_t alphaComp{ 3 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef Transpose8_8_8_8 TransposeT; ++ typedef Format4<8, 8, 8, 8> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for R16G16_UNORM ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0, 1>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 32 }; ++ static const uint32_t numComps{ 2 }; ++ static const bool hasAlpha{ false }; ++ static const uint32_t alphaComp{ 0 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef Transpose16_16 TransposeT; ++ typedef Format2<16, 16> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for R16G16_SNORM ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0, 1>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 32 }; ++ static const uint32_t numComps{ 2 }; ++ static const bool hasAlpha{ false }; ++ static const uint32_t alphaComp{ 0 }; ++ static const bool isSRGB{ false }; ++ 
static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef Transpose16_16 TransposeT; ++ typedef Format2<16, 16> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for R16G16_SINT ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0, 1>, ++ Defaults<0, 0, 0, 0x1> ++{ ++ static const uint32_t bpp{ 32 }; ++ static const uint32_t numComps{ 2 }; ++ static const bool hasAlpha{ false }; ++ static const uint32_t alphaComp{ 0 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef Transpose16_16 TransposeT; ++ typedef Format2<16, 16> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for R16G16_UINT ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0, 1>, ++ Defaults<0, 0, 0, 0x1> ++{ ++ static const uint32_t bpp{ 32 }; ++ static const uint32_t numComps{ 2 }; ++ static const bool hasAlpha{ false }; ++ static const uint32_t alphaComp{ 0 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef Transpose16_16 TransposeT; ++ typedef Format2<16, 16> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for R16G16_FLOAT ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0, 1>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 32 }; ++ static const uint32_t numComps{ 2 }; ++ static const bool hasAlpha{ false }; ++ static const uint32_t alphaComp{ 0 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef Transpose16_16 TransposeT; ++ typedef Format2<16, 16> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for B10G10R10A2_UNORM ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<2, 1, 0, 3>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 32 }; ++ static const uint32_t numComps{ 4 }; ++ static const bool hasAlpha{ true }; ++ static const uint32_t alphaComp{ 3 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef Transpose10_10_10_2 TransposeT; ++ typedef Format4<10, 10, 10, 2> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for B10G10R10A2_UNORM_SRGB 
++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<2, 1, 0, 3>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 32 }; ++ static const uint32_t numComps{ 4 }; ++ static const bool hasAlpha{ true }; ++ static const uint32_t alphaComp{ 3 }; ++ static const bool isSRGB{ true }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef Transpose10_10_10_2 TransposeT; ++ typedef Format4<10, 10, 10, 2> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for R11G11B10_FLOAT ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0, 1, 2>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 32 }; ++ static const uint32_t numComps{ 3 }; ++ static const bool hasAlpha{ false }; ++ static const uint32_t alphaComp{ 0 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef Transpose11_11_10 TransposeT; ++ typedef Format3<11, 11, 10> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for R32_SINT ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0>, ++ Defaults<0, 0, 0, 0x1> ++{ ++ static const uint32_t bpp{ 32 }; ++ static const uint32_t numComps{ 1 }; ++ static const bool hasAlpha{ false }; ++ static const uint32_t alphaComp{ 0 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef TransposeSingleComponent<32> TransposeT; ++ typedef Format1<32> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for R32_UINT ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0>, ++ Defaults<0, 0, 0, 0x1> ++{ ++ static const uint32_t bpp{ 32 }; ++ static const uint32_t numComps{ 1 }; ++ static const bool hasAlpha{ false }; ++ static const uint32_t alphaComp{ 0 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef TransposeSingleComponent<32> TransposeT; ++ typedef Format1<32> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for R32_FLOAT ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 32 }; ++ static const uint32_t numComps{ 1 }; ++ static const bool hasAlpha{ false }; ++ static const uint32_t alphaComp{ 0 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static 
const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef TransposeSingleComponent<32> TransposeT; ++ typedef Format1<32> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for R24_UNORM_X8_TYPELESS ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 32 }; ++ static const uint32_t numComps{ 1 }; ++ static const bool hasAlpha{ false }; ++ static const uint32_t alphaComp{ 3 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef TransposeSingleComponent<32> TransposeT; ++ typedef Format1<24> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for R24_UNORM_X8_TYPELESS_LD ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 32 }; ++ static const uint32_t numComps{ 1 }; ++ static const bool hasAlpha{ false }; ++ static const uint32_t alphaComp{ 3 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef TransposeSingleComponent<32> TransposeT; ++ typedef Format1<24> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for A32_FLOAT ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<3>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 32 }; ++ static const uint32_t numComps{ 1 }; ++ static const bool hasAlpha{ true }; ++ static const uint32_t alphaComp{ 0 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef TransposeSingleComponent<32> TransposeT; ++ typedef Format1<32> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for B8G8R8X8_UNORM ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<2, 1, 0, 3>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 32 }; ++ static const uint32_t numComps{ 3 }; ++ static const bool hasAlpha{ false }; ++ static const uint32_t alphaComp{ 0 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef Transpose8_8_8_8 TransposeT; ++ typedef Format4<8, 8, 8, 8> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for B8G8R8X8_UNORM_SRGB 
++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<2, 1, 0, 3>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 32 }; ++ static const uint32_t numComps{ 3 }; ++ static const bool hasAlpha{ false }; ++ static const uint32_t alphaComp{ 0 }; ++ static const bool isSRGB{ true }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef Transpose8_8_8_8 TransposeT; ++ typedef Format4<8, 8, 8, 8> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for R8G8B8X8_UNORM ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0, 1, 2, 3>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 32 }; ++ static const uint32_t numComps{ 3 }; ++ static const bool hasAlpha{ false }; ++ static const uint32_t alphaComp{ 0 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef Transpose8_8_8_8 TransposeT; ++ typedef Format4<8, 8, 8, 8> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for R8G8B8X8_UNORM_SRGB ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0, 1, 2, 3>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 32 }; ++ static const uint32_t numComps{ 3 }; ++ static const bool hasAlpha{ false }; ++ static const uint32_t alphaComp{ 0 }; ++ static const bool isSRGB{ true }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef Transpose8_8_8_8 TransposeT; ++ typedef Format4<8, 8, 8, 8> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for R9G9B9E5_SHAREDEXP ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0, 1, 2, 3>, ++ Defaults<0, 0, 0, 0x1> ++{ ++ static const uint32_t bpp{ 32 }; ++ static const uint32_t numComps{ 4 }; ++ static const bool hasAlpha{ false }; ++ static const uint32_t alphaComp{ 3 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef Transpose9_9_9_5 TransposeT; ++ typedef Format4<9, 9, 9, 5> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for B10G10R10X2_UNORM ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<2, 1, 0, 3>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 32 }; ++ static const uint32_t numComps{ 3 }; ++ static const bool hasAlpha{ false }; ++ static const uint32_t alphaComp{ 0 }; ++ static const bool isSRGB{ false 
}; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef Transpose10_10_10_2 TransposeT; ++ typedef Format4<10, 10, 10, 2> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for R10G10B10X2_USCALED ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0, 1, 2, 3>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 32 }; ++ static const uint32_t numComps{ 3 }; ++ static const bool hasAlpha{ false }; ++ static const uint32_t alphaComp{ 0 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef Transpose10_10_10_2 TransposeT; ++ typedef Format4<10, 10, 10, 2> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for R8G8B8A8_SSCALED ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0, 1, 2, 3>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 32 }; ++ static const uint32_t numComps{ 4 }; ++ static const bool hasAlpha{ true }; ++ static const uint32_t alphaComp{ 3 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef Transpose8_8_8_8 TransposeT; ++ typedef Format4<8, 8, 8, 8> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for R8G8B8A8_USCALED ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0, 1, 2, 3>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 32 }; ++ static const uint32_t numComps{ 4 }; ++ static const bool hasAlpha{ true }; ++ static const uint32_t alphaComp{ 3 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef Transpose8_8_8_8 TransposeT; ++ typedef Format4<8, 8, 8, 8> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for R16G16_SSCALED ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0, 1>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 32 }; ++ static const uint32_t numComps{ 2 }; ++ static const bool hasAlpha{ false }; ++ static const uint32_t alphaComp{ 0 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef Transpose16_16 TransposeT; ++ typedef Format2<16, 16> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for 
R16G16_USCALED ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0, 1>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 32 }; ++ static const uint32_t numComps{ 2 }; ++ static const bool hasAlpha{ false }; ++ static const uint32_t alphaComp{ 0 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef Transpose16_16 TransposeT; ++ typedef Format2<16, 16> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for R32_SSCALED ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 32 }; ++ static const uint32_t numComps{ 1 }; ++ static const bool hasAlpha{ false }; ++ static const uint32_t alphaComp{ 0 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef TransposeSingleComponent<32> TransposeT; ++ typedef Format1<32> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for R32_USCALED ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 32 }; ++ static const uint32_t numComps{ 1 }; ++ static const bool hasAlpha{ false }; ++ static const uint32_t alphaComp{ 0 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef TransposeSingleComponent<32> TransposeT; ++ typedef Format1<32> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for B5G6R5_UNORM ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<2, 1, 0>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 16 }; ++ static const uint32_t numComps{ 3 }; ++ static const bool hasAlpha{ false }; ++ static const uint32_t alphaComp{ 0 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef Transpose5_6_5 TransposeT; ++ typedef Format3<5, 6, 5> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for B5G6R5_UNORM_SRGB ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<2, 1, 0>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 16 }; ++ static const uint32_t numComps{ 3 }; ++ static const bool hasAlpha{ false }; ++ static const uint32_t alphaComp{ 0 }; ++ static const bool isSRGB{ true }; ++ static const bool 
isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef Transpose5_6_5 TransposeT; ++ typedef Format3<5, 6, 5> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for B5G5R5A1_UNORM ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<2, 1, 0, 3>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 16 }; ++ static const uint32_t numComps{ 4 }; ++ static const bool hasAlpha{ true }; ++ static const uint32_t alphaComp{ 3 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef Transpose5_5_5_1 TransposeT; ++ typedef Format4<5, 5, 5, 1> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for B5G5R5A1_UNORM_SRGB ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<2, 1, 0, 3>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 16 }; ++ static const uint32_t numComps{ 4 }; ++ static const bool hasAlpha{ true }; ++ static const uint32_t alphaComp{ 3 }; ++ static const bool isSRGB{ true }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef Transpose5_5_5_1 TransposeT; ++ typedef Format4<5, 5, 5, 1> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for B4G4R4A4_UNORM ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<2, 1, 0, 3>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 16 }; ++ static const uint32_t numComps{ 4 }; ++ static const bool hasAlpha{ true }; ++ static const uint32_t alphaComp{ 3 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef Transpose4_4_4_4 TransposeT; ++ typedef Format4<4, 4, 4, 4> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for B4G4R4A4_UNORM_SRGB ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<2, 1, 0, 3>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 16 }; ++ static const uint32_t numComps{ 4 }; ++ static const bool hasAlpha{ true }; ++ static const uint32_t alphaComp{ 3 }; ++ static const bool isSRGB{ true }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef Transpose4_4_4_4 TransposeT; ++ typedef Format4<4, 4, 4, 4> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for R8G8_UNORM 
++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0, 1>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 16 }; ++ static const uint32_t numComps{ 2 }; ++ static const bool hasAlpha{ false }; ++ static const uint32_t alphaComp{ 0 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef Transpose8_8 TransposeT; ++ typedef Format2<8, 8> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for R8G8_SNORM ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0, 1>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 16 }; ++ static const uint32_t numComps{ 2 }; ++ static const bool hasAlpha{ false }; ++ static const uint32_t alphaComp{ 0 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef Transpose8_8 TransposeT; ++ typedef Format2<8, 8> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for R8G8_SINT ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0, 1>, ++ Defaults<0, 0, 0, 0x1> ++{ ++ static const uint32_t bpp{ 16 }; ++ static const uint32_t numComps{ 2 }; ++ static const bool hasAlpha{ false }; ++ static const uint32_t alphaComp{ 0 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef Transpose8_8 TransposeT; ++ typedef Format2<8, 8> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for R8G8_UINT ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0, 1>, ++ Defaults<0, 0, 0, 0x1> ++{ ++ static const uint32_t bpp{ 16 }; ++ static const uint32_t numComps{ 2 }; ++ static const bool hasAlpha{ false }; ++ static const uint32_t alphaComp{ 0 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef Transpose8_8 TransposeT; ++ typedef Format2<8, 8> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for R16_UNORM ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 16 }; ++ static const uint32_t numComps{ 1 }; ++ static const bool hasAlpha{ false }; ++ static const uint32_t alphaComp{ 0 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t 
bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef TransposeSingleComponent<16> TransposeT; ++ typedef Format1<16> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for R16_SNORM ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 16 }; ++ static const uint32_t numComps{ 1 }; ++ static const bool hasAlpha{ false }; ++ static const uint32_t alphaComp{ 0 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef TransposeSingleComponent<16> TransposeT; ++ typedef Format1<16> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for R16_SINT ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0>, ++ Defaults<0, 0, 0, 0x1> ++{ ++ static const uint32_t bpp{ 16 }; ++ static const uint32_t numComps{ 1 }; ++ static const bool hasAlpha{ false }; ++ static const uint32_t alphaComp{ 0 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef TransposeSingleComponent<16> TransposeT; ++ typedef Format1<16> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for R16_UINT ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0>, ++ Defaults<0, 0, 0, 0x1> ++{ ++ static const uint32_t bpp{ 16 }; ++ static const uint32_t numComps{ 1 }; ++ static const bool hasAlpha{ false }; ++ static const uint32_t alphaComp{ 0 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef TransposeSingleComponent<16> TransposeT; ++ typedef Format1<16> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for R16_FLOAT ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 16 }; ++ static const uint32_t numComps{ 1 }; ++ static const bool hasAlpha{ false }; ++ static const uint32_t alphaComp{ 0 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef TransposeSingleComponent<16> TransposeT; ++ typedef Format1<16> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for A16_UNORM ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<3>, ++ 
Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 16 }; ++ static const uint32_t numComps{ 1 }; ++ static const bool hasAlpha{ true }; ++ static const uint32_t alphaComp{ 0 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef TransposeSingleComponent<16> TransposeT; ++ typedef Format1<16> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for A16_FLOAT ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<3>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 16 }; ++ static const uint32_t numComps{ 1 }; ++ static const bool hasAlpha{ true }; ++ static const uint32_t alphaComp{ 0 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef TransposeSingleComponent<16> TransposeT; ++ typedef Format1<16> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for B5G5R5X1_UNORM ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<2, 1, 0, 3>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 16 }; ++ static const uint32_t numComps{ 3 }; ++ static const bool hasAlpha{ false }; ++ static const uint32_t alphaComp{ 0 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef Transpose5_5_5_1 TransposeT; ++ typedef Format4<5, 5, 5, 1> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for B5G5R5X1_UNORM_SRGB ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<2, 1, 0, 3>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 16 }; ++ static const uint32_t numComps{ 3 }; ++ static const bool hasAlpha{ false }; ++ static const uint32_t alphaComp{ 0 }; ++ static const bool isSRGB{ true }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef Transpose5_5_5_1 TransposeT; ++ typedef Format4<5, 5, 5, 1> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for R8G8_SSCALED ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0, 1>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 16 }; ++ static const uint32_t numComps{ 2 }; ++ static const bool hasAlpha{ false }; ++ static const uint32_t alphaComp{ 0 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef 
Transpose8_8 TransposeT; ++ typedef Format2<8, 8> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for R8G8_USCALED ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0, 1>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 16 }; ++ static const uint32_t numComps{ 2 }; ++ static const bool hasAlpha{ false }; ++ static const uint32_t alphaComp{ 0 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef Transpose8_8 TransposeT; ++ typedef Format2<8, 8> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for R16_SSCALED ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 16 }; ++ static const uint32_t numComps{ 1 }; ++ static const bool hasAlpha{ false }; ++ static const uint32_t alphaComp{ 0 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef TransposeSingleComponent<16> TransposeT; ++ typedef Format1<16> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for R16_USCALED ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 16 }; ++ static const uint32_t numComps{ 1 }; ++ static const bool hasAlpha{ false }; ++ static const uint32_t alphaComp{ 0 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef TransposeSingleComponent<16> TransposeT; ++ typedef Format1<16> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for R8_UNORM ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 8 }; ++ static const uint32_t numComps{ 1 }; ++ static const bool hasAlpha{ false }; ++ static const uint32_t alphaComp{ 0 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef TransposeSingleComponent<8> TransposeT; ++ typedef Format1<8> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for R8_SNORM ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 8 }; ++ static 
const uint32_t numComps{ 1 }; ++ static const bool hasAlpha{ false }; ++ static const uint32_t alphaComp{ 0 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef TransposeSingleComponent<8> TransposeT; ++ typedef Format1<8> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for R8_SINT ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0>, ++ Defaults<0, 0, 0, 0x1> ++{ ++ static const uint32_t bpp{ 8 }; ++ static const uint32_t numComps{ 1 }; ++ static const bool hasAlpha{ false }; ++ static const uint32_t alphaComp{ 0 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef TransposeSingleComponent<8> TransposeT; ++ typedef Format1<8> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for R8_UINT ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0>, ++ Defaults<0, 0, 0, 0x1> ++{ ++ static const uint32_t bpp{ 8 }; ++ static const uint32_t numComps{ 1 }; ++ static const bool hasAlpha{ false }; ++ static const uint32_t alphaComp{ 0 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef TransposeSingleComponent<8> TransposeT; ++ typedef Format1<8> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for A8_UNORM ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<3>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 8 }; ++ static const uint32_t numComps{ 1 }; ++ static const bool hasAlpha{ true }; ++ static const uint32_t alphaComp{ 0 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef TransposeSingleComponent<8> TransposeT; ++ typedef Format1<8> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for R8_SSCALED ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 8 }; ++ static const uint32_t numComps{ 1 }; ++ static const bool hasAlpha{ false }; ++ static const uint32_t alphaComp{ 0 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef TransposeSingleComponent<8> TransposeT; ++ typedef Format1<8> FormatT; ++}; ++ 
++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for R8_USCALED ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 8 }; ++ static const uint32_t numComps{ 1 }; ++ static const bool hasAlpha{ false }; ++ static const uint32_t alphaComp{ 0 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef TransposeSingleComponent<8> TransposeT; ++ typedef Format1<8> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for YCRCB_SWAPUVY ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0, 1, 2, 3>, ++ Defaults<0, 0, 0, 0x1> ++{ ++ static const uint32_t bpp{ 32 }; ++ static const uint32_t numComps{ 4 }; ++ static const bool hasAlpha{ false }; ++ static const uint32_t alphaComp{ 3 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ true }; ++ static const uint32_t bcWidth{ 2 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef Transpose8_8_8_8 TransposeT; ++ typedef Format4<8, 8, 8, 8> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for BC1_UNORM ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 64 }; ++ static const uint32_t numComps{ 1 }; ++ static const bool hasAlpha{ true }; ++ static const uint32_t alphaComp{ 3 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ true }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 4 }; ++ static const uint32_t bcHeight{ 4 }; ++ ++ typedef TransposeSingleComponent<64> TransposeT; ++ typedef Format1<8> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for BC2_UNORM ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 128 }; ++ static const uint32_t numComps{ 1 }; ++ static const bool hasAlpha{ true }; ++ static const uint32_t alphaComp{ 3 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ true }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 4 }; ++ static const uint32_t bcHeight{ 4 }; ++ ++ typedef TransposeSingleComponent<128> TransposeT; ++ typedef Format1<8> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for BC3_UNORM ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 128 }; ++ static const uint32_t numComps{ 1 }; ++ static const bool hasAlpha{ 
true }; ++ static const uint32_t alphaComp{ 3 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ true }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 4 }; ++ static const uint32_t bcHeight{ 4 }; ++ ++ typedef TransposeSingleComponent<128> TransposeT; ++ typedef Format1<8> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for BC4_UNORM ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 64 }; ++ static const uint32_t numComps{ 1 }; ++ static const bool hasAlpha{ true }; ++ static const uint32_t alphaComp{ 3 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ true }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 4 }; ++ static const uint32_t bcHeight{ 4 }; ++ ++ typedef TransposeSingleComponent<64> TransposeT; ++ typedef Format1<8> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for BC5_UNORM ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 128 }; ++ static const uint32_t numComps{ 1 }; ++ static const bool hasAlpha{ true }; ++ static const uint32_t alphaComp{ 3 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ true }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 4 }; ++ static const uint32_t bcHeight{ 4 }; ++ ++ typedef TransposeSingleComponent<128> TransposeT; ++ typedef Format1<8> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for BC1_UNORM_SRGB ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 64 }; ++ static const uint32_t numComps{ 1 }; ++ static const bool hasAlpha{ true }; ++ static const uint32_t alphaComp{ 3 }; ++ static const bool isSRGB{ true }; ++ static const bool isBC{ true }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 4 }; ++ static const uint32_t bcHeight{ 4 }; ++ ++ typedef TransposeSingleComponent<64> TransposeT; ++ typedef Format1<8> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for BC2_UNORM_SRGB ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 128 }; ++ static const uint32_t numComps{ 1 }; ++ static const bool hasAlpha{ true }; ++ static const uint32_t alphaComp{ 3 }; ++ static const bool isSRGB{ true }; ++ static const bool isBC{ true }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 4 }; ++ static const uint32_t bcHeight{ 4 }; ++ ++ typedef TransposeSingleComponent<128> TransposeT; ++ typedef Format1<8> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// 
FormatTraits - Format traits specialization for BC3_UNORM_SRGB ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 128 }; ++ static const uint32_t numComps{ 1 }; ++ static const bool hasAlpha{ true }; ++ static const uint32_t alphaComp{ 3 }; ++ static const bool isSRGB{ true }; ++ static const bool isBC{ true }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 4 }; ++ static const uint32_t bcHeight{ 4 }; ++ ++ typedef TransposeSingleComponent<128> TransposeT; ++ typedef Format1<8> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for YCRCB_SWAPUV ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0, 1, 2, 3>, ++ Defaults<0, 0, 0, 0x1> ++{ ++ static const uint32_t bpp{ 32 }; ++ static const uint32_t numComps{ 4 }; ++ static const bool hasAlpha{ false }; ++ static const uint32_t alphaComp{ 3 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ true }; ++ static const uint32_t bcWidth{ 2 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef Transpose8_8_8_8 TransposeT; ++ typedef Format4<8, 8, 8, 8> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for R8G8B8_UNORM ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0, 1, 2>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 24 }; ++ static const uint32_t numComps{ 3 }; ++ static const bool hasAlpha{ false }; ++ static const uint32_t alphaComp{ 0 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef Transpose8_8_8 TransposeT; ++ typedef Format3<8, 8, 8> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for R8G8B8_SNORM ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0, 1, 2>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 24 }; ++ static const uint32_t numComps{ 3 }; ++ static const bool hasAlpha{ false }; ++ static const uint32_t alphaComp{ 0 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef Transpose8_8_8 TransposeT; ++ typedef Format3<8, 8, 8> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for R8G8B8_SSCALED ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0, 1, 2>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 24 }; ++ static const uint32_t numComps{ 3 }; ++ static const bool hasAlpha{ false }; ++ static const uint32_t alphaComp{ 0 }; ++ static const 
bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef Transpose8_8_8 TransposeT; ++ typedef Format3<8, 8, 8> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for R8G8B8_USCALED ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0, 1, 2>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 24 }; ++ static const uint32_t numComps{ 3 }; ++ static const bool hasAlpha{ false }; ++ static const uint32_t alphaComp{ 0 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef Transpose8_8_8 TransposeT; ++ typedef Format3<8, 8, 8> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for BC4_SNORM ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 64 }; ++ static const uint32_t numComps{ 1 }; ++ static const bool hasAlpha{ true }; ++ static const uint32_t alphaComp{ 3 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ true }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 4 }; ++ static const uint32_t bcHeight{ 4 }; ++ ++ typedef TransposeSingleComponent<64> TransposeT; ++ typedef Format1<8> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for BC5_SNORM ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 128 }; ++ static const uint32_t numComps{ 1 }; ++ static const bool hasAlpha{ true }; ++ static const uint32_t alphaComp{ 3 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ true }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 4 }; ++ static const uint32_t bcHeight{ 4 }; ++ ++ typedef TransposeSingleComponent<128> TransposeT; ++ typedef Format1<8> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for R16G16B16_FLOAT ++////////////////////////////////////////////////////////////////////////// ++template<> struct FormatTraits : ++ ComponentTraits, ++ FormatSwizzle<0, 1, 2>, ++ Defaults<0, 0, 0, 0x3f800000> ++{ ++ static const uint32_t bpp{ 48 }; ++ static const uint32_t numComps{ 3 }; ++ static const bool hasAlpha{ false }; ++ static const uint32_t alphaComp{ 0 }; ++ static const bool isSRGB{ false }; ++ static const bool isBC{ false }; ++ static const bool isSubsampled{ false }; ++ static const uint32_t bcWidth{ 1 }; ++ static const uint32_t bcHeight{ 1 }; ++ ++ typedef Transpose16_16_16 TransposeT; ++ typedef Format3<16, 16, 16> FormatT; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// FormatTraits - Format traits specialization for R16G16B16_UNORM 
++//////////////////////////////////////////////////////////////////////////
++template<> struct FormatTraits<R16G16B16_UNORM> :
++    ComponentTraits<SWR_TYPE_UNORM, 16, SWR_TYPE_UNORM, 16, SWR_TYPE_UNORM, 16>,
++    FormatSwizzle<0, 1, 2>,
++    Defaults<0, 0, 0, 0x3f800000>
++{
++    static const uint32_t bpp{ 48 };
++    static const uint32_t numComps{ 3 };
++    static const bool hasAlpha{ false };
++    static const uint32_t alphaComp{ 0 };
++    static const bool isSRGB{ false };
++    static const bool isBC{ false };
++    static const bool isSubsampled{ false };
++    static const uint32_t bcWidth{ 1 };
++    static const uint32_t bcHeight{ 1 };
++
++    typedef Transpose16_16_16 TransposeT;
++    typedef Format3<16, 16, 16> FormatT;
++};
++
++//////////////////////////////////////////////////////////////////////////
++/// FormatTraits<R16G16B16_SNORM> - Format traits specialization for R16G16B16_SNORM
++//////////////////////////////////////////////////////////////////////////
++template<> struct FormatTraits<R16G16B16_SNORM> :
++    ComponentTraits<SWR_TYPE_SNORM, 16, SWR_TYPE_SNORM, 16, SWR_TYPE_SNORM, 16>,
++    FormatSwizzle<0, 1, 2>,
++    Defaults<0, 0, 0, 0x3f800000>
++{
++    static const uint32_t bpp{ 48 };
++    static const uint32_t numComps{ 3 };
++    static const bool hasAlpha{ false };
++    static const uint32_t alphaComp{ 0 };
++    static const bool isSRGB{ false };
++    static const bool isBC{ false };
++    static const bool isSubsampled{ false };
++    static const uint32_t bcWidth{ 1 };
++    static const uint32_t bcHeight{ 1 };
++
++    typedef Transpose16_16_16 TransposeT;
++    typedef Format3<16, 16, 16> FormatT;
++};
++
++//////////////////////////////////////////////////////////////////////////
++/// FormatTraits<R16G16B16_SSCALED> - Format traits specialization for R16G16B16_SSCALED
++//////////////////////////////////////////////////////////////////////////
++template<> struct FormatTraits<R16G16B16_SSCALED> :
++    ComponentTraits<SWR_TYPE_SSCALED, 16, SWR_TYPE_SSCALED, 16, SWR_TYPE_SSCALED, 16>,
++    FormatSwizzle<0, 1, 2>,
++    Defaults<0, 0, 0, 0x3f800000>
++{
++    static const uint32_t bpp{ 48 };
++    static const uint32_t numComps{ 3 };
++    static const bool hasAlpha{ false };
++    static const uint32_t alphaComp{ 0 };
++    static const bool isSRGB{ false };
++    static const bool isBC{ false };
++    static const bool isSubsampled{ false };
++    static const uint32_t bcWidth{ 1 };
++    static const uint32_t bcHeight{ 1 };
++
++    typedef Transpose16_16_16 TransposeT;
++    typedef Format3<16, 16, 16> FormatT;
++};
++
++//////////////////////////////////////////////////////////////////////////
++/// FormatTraits<R16G16B16_USCALED> - Format traits specialization for R16G16B16_USCALED
++//////////////////////////////////////////////////////////////////////////
++template<> struct FormatTraits<R16G16B16_USCALED> :
++    ComponentTraits<SWR_TYPE_USCALED, 16, SWR_TYPE_USCALED, 16, SWR_TYPE_USCALED, 16>,
++    FormatSwizzle<0, 1, 2>,
++    Defaults<0, 0, 0, 0x3f800000>
++{
++    static const uint32_t bpp{ 48 };
++    static const uint32_t numComps{ 3 };
++    static const bool hasAlpha{ false };
++    static const uint32_t alphaComp{ 0 };
++    static const bool isSRGB{ false };
++    static const bool isBC{ false };
++    static const bool isSubsampled{ false };
++    static const uint32_t bcWidth{ 1 };
++    static const uint32_t bcHeight{ 1 };
++
++    typedef Transpose16_16_16 TransposeT;
++    typedef Format3<16, 16, 16> FormatT;
++};
++
++//////////////////////////////////////////////////////////////////////////
++/// FormatTraits<BC7_UNORM> - Format traits specialization for BC7_UNORM
++//////////////////////////////////////////////////////////////////////////
++template<> struct FormatTraits<BC7_UNORM> :
++    ComponentTraits<SWR_TYPE_UNORM, 8>,
++    FormatSwizzle<0>,
++    Defaults<0, 0, 0, 0x3f800000>
++{
++    static const uint32_t bpp{ 128 };
++    static const uint32_t numComps{ 1 };
++    static const bool hasAlpha{ true };
++    static const uint32_t alphaComp{ 3 };
++    static const bool isSRGB{ false };
++    static const bool isBC{ true };
++    static const bool isSubsampled{ false };
++    static const uint32_t bcWidth{ 4 };
++    static const uint32_t bcHeight{ 4 };
++
++    typedef TransposeSingleComponent<128> TransposeT;
++    typedef Format1<8> FormatT;
++};
++
++//////////////////////////////////////////////////////////////////////////
++/// FormatTraits<BC7_UNORM_SRGB> - Format traits specialization for BC7_UNORM_SRGB
++//////////////////////////////////////////////////////////////////////////
++template<> struct FormatTraits<BC7_UNORM_SRGB> :
++    ComponentTraits<SWR_TYPE_UNORM, 8>,
++    FormatSwizzle<0>,
++    Defaults<0, 0, 0, 0x3f800000>
++{
++    static const uint32_t bpp{ 128 };
++    static const uint32_t numComps{ 1 };
++    static const bool hasAlpha{ true };
++    static const uint32_t alphaComp{ 3 };
++    static const bool isSRGB{ true };
++    static const bool isBC{ true };
++    static const bool isSubsampled{ false };
++    static const uint32_t bcWidth{ 4 };
++    static const uint32_t bcHeight{ 4 };
++
++    typedef TransposeSingleComponent<128> TransposeT;
++    typedef Format1<8> FormatT;
++};
++
++//////////////////////////////////////////////////////////////////////////
++/// FormatTraits<R8G8B8_UNORM_SRGB> - Format traits specialization for R8G8B8_UNORM_SRGB
++//////////////////////////////////////////////////////////////////////////
++template<> struct FormatTraits<R8G8B8_UNORM_SRGB> :
++    ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8>,
++    FormatSwizzle<0, 1, 2>,
++    Defaults<0, 0, 0, 0x3f800000>
++{
++    static const uint32_t bpp{ 24 };
++    static const uint32_t numComps{ 3 };
++    static const bool hasAlpha{ false };
++    static const uint32_t alphaComp{ 0 };
++    static const bool isSRGB{ true };
++    static const bool isBC{ false };
++    static const bool isSubsampled{ false };
++    static const uint32_t bcWidth{ 1 };
++    static const uint32_t bcHeight{ 1 };
++
++    typedef Transpose8_8_8 TransposeT;
++    typedef Format3<8, 8, 8> FormatT;
++};
++
++//////////////////////////////////////////////////////////////////////////
++/// FormatTraits<R16G16B16_UINT> - Format traits specialization for R16G16B16_UINT
++//////////////////////////////////////////////////////////////////////////
++template<> struct FormatTraits<R16G16B16_UINT> :
++    ComponentTraits<SWR_TYPE_UINT, 16, SWR_TYPE_UINT, 16, SWR_TYPE_UINT, 16>,
++    FormatSwizzle<0, 1, 2>,
++    Defaults<0, 0, 0, 0x1>
++{
++    static const uint32_t bpp{ 48 };
++    static const uint32_t numComps{ 3 };
++    static const bool hasAlpha{ false };
++    static const uint32_t alphaComp{ 0 };
++    static const bool isSRGB{ false };
++    static const bool isBC{ false };
++    static const bool isSubsampled{ false };
++    static const uint32_t bcWidth{ 1 };
++    static const uint32_t bcHeight{ 1 };
++
++    typedef Transpose16_16_16 TransposeT;
++    typedef Format3<16, 16, 16> FormatT;
++};
++
++//////////////////////////////////////////////////////////////////////////
++/// FormatTraits<R16G16B16_SINT> - Format traits specialization for R16G16B16_SINT
++//////////////////////////////////////////////////////////////////////////
++template<> struct FormatTraits<R16G16B16_SINT> :
++    ComponentTraits<SWR_TYPE_SINT, 16, SWR_TYPE_SINT, 16, SWR_TYPE_SINT, 16>,
++    FormatSwizzle<0, 1, 2>,
++    Defaults<0, 0, 0, 0x1>
++{
++    static const uint32_t bpp{ 48 };
++    static const uint32_t numComps{ 3 };
++    static const bool hasAlpha{ false };
++    static const uint32_t alphaComp{ 0 };
++    static const bool isSRGB{ false };
++    static const bool isBC{ false };
++    static const bool isSubsampled{ false };
++    static const uint32_t bcWidth{ 1 };
++    static const uint32_t bcHeight{ 1 };
++
++    typedef Transpose16_16_16 TransposeT;
++    typedef Format3<16, 16, 16> FormatT;
++};
++
++//////////////////////////////////////////////////////////////////////////
++/// FormatTraits<R10G10B10A2_SNORM> - Format traits specialization for R10G10B10A2_SNORM
++//////////////////////////////////////////////////////////////////////////
++template<> struct FormatTraits<R10G10B10A2_SNORM> :
++    ComponentTraits<SWR_TYPE_SNORM, 10, SWR_TYPE_SNORM, 10, SWR_TYPE_SNORM, 10, SWR_TYPE_SNORM, 2>,
++    FormatSwizzle<0, 1, 2, 3>,
++    Defaults<0, 0, 0, 0x3f800000>
++{
++    static const uint32_t bpp{ 32 };
++    static const uint32_t numComps{ 4 };
++    static const bool hasAlpha{ true };
++    static const uint32_t alphaComp{ 3 };
++    static const bool isSRGB{ false };
++    static const bool isBC{ false };
++    static const bool isSubsampled{ false };
++    static const uint32_t bcWidth{ 1 };
++    static const uint32_t bcHeight{ 1 };
++
++    typedef Transpose10_10_10_2 TransposeT;
++    typedef Format4<10, 10, 10, 2> FormatT;
++};
++
++//////////////////////////////////////////////////////////////////////////
++/// FormatTraits<R10G10B10A2_USCALED> - Format traits specialization for R10G10B10A2_USCALED
++//////////////////////////////////////////////////////////////////////////
++template<> struct FormatTraits<R10G10B10A2_USCALED> :
++    ComponentTraits<SWR_TYPE_USCALED, 10, SWR_TYPE_USCALED, 10, SWR_TYPE_USCALED, 10, SWR_TYPE_USCALED, 2>,
++    FormatSwizzle<0, 1, 2, 3>,
++    Defaults<0, 0, 0, 0x3f800000>
++{
++    static const uint32_t bpp{ 32 };
++    static const uint32_t numComps{ 4 };
++    static const bool hasAlpha{ true };
++    static const uint32_t alphaComp{ 3 };
++    static const bool isSRGB{ false };
++    static const bool isBC{ false };
++    static const bool isSubsampled{ false };
++    static const uint32_t bcWidth{ 1 };
++    static const uint32_t bcHeight{ 1 };
++
++    typedef Transpose10_10_10_2 TransposeT;
++    typedef Format4<10, 10, 10, 2> FormatT;
++};
++
++//////////////////////////////////////////////////////////////////////////
++/// FormatTraits<R10G10B10A2_SSCALED> - Format traits specialization for R10G10B10A2_SSCALED
++//////////////////////////////////////////////////////////////////////////
++template<> struct FormatTraits<R10G10B10A2_SSCALED> :
++    ComponentTraits<SWR_TYPE_SSCALED, 10, SWR_TYPE_SSCALED, 10, SWR_TYPE_SSCALED, 10, SWR_TYPE_SSCALED, 2>,
++    FormatSwizzle<0, 1, 2, 3>,
++    Defaults<0, 0, 0, 0x3f800000>
++{
++    static const uint32_t bpp{ 32 };
++    static const uint32_t numComps{ 4 };
++    static const bool hasAlpha{ true };
++    static const uint32_t alphaComp{ 3 };
++    static const bool isSRGB{ false };
++    static const bool isBC{ false };
++    static const bool isSubsampled{ false };
++    static const uint32_t bcWidth{ 1 };
++    static const uint32_t bcHeight{ 1 };
++
++    typedef Transpose10_10_10_2 TransposeT;
++    typedef Format4<10, 10, 10, 2> FormatT;
++};
++
++//////////////////////////////////////////////////////////////////////////
++/// FormatTraits<R10G10B10A2_SINT> - Format traits specialization for R10G10B10A2_SINT
++//////////////////////////////////////////////////////////////////////////
++template<> struct FormatTraits<R10G10B10A2_SINT> :
++    ComponentTraits<SWR_TYPE_SINT, 10, SWR_TYPE_SINT, 10, SWR_TYPE_SINT, 10, SWR_TYPE_SINT, 2>,
++    FormatSwizzle<0, 1, 2, 3>,
++    Defaults<0, 0, 0, 0x1>
++{
++    static const uint32_t bpp{ 32 };
++    static const uint32_t numComps{ 4 };
++    static const bool hasAlpha{ true };
++    static const uint32_t alphaComp{ 3 };
++    static const bool isSRGB{ false };
++    static const bool isBC{ false };
++    static const bool isSubsampled{ false };
++    static const uint32_t bcWidth{ 1 };
++    static const uint32_t bcHeight{ 1 };
++
++    typedef Transpose10_10_10_2 TransposeT;
++    typedef Format4<10, 10, 10, 2> FormatT;
++};
++
++//////////////////////////////////////////////////////////////////////////
++/// FormatTraits<B10G10R10A2_SNORM> - Format traits specialization for B10G10R10A2_SNORM
++//////////////////////////////////////////////////////////////////////////
++template<> struct FormatTraits<B10G10R10A2_SNORM> :
++    ComponentTraits<SWR_TYPE_SNORM, 10, SWR_TYPE_SNORM, 10, SWR_TYPE_SNORM, 10, SWR_TYPE_SNORM, 2>,
++    FormatSwizzle<2, 1, 0, 3>,
++    Defaults<0, 0, 0, 0x3f800000>
++{
++    static const uint32_t bpp{ 32 };
++    static const uint32_t numComps{ 4 };
++    static const bool hasAlpha{ true };
++    static const uint32_t alphaComp{ 3 };
++    static const bool isSRGB{ false };
++    static const bool isBC{ false };
++    static const bool isSubsampled{ false };
++    static const uint32_t bcWidth{ 1 };
++    static const uint32_t bcHeight{ 1 };
++
++    typedef Transpose10_10_10_2 TransposeT;
++    typedef Format4<10, 10, 10, 2> FormatT;
++};
++
++//////////////////////////////////////////////////////////////////////////
++/// FormatTraits<B10G10R10A2_USCALED> - Format traits specialization for B10G10R10A2_USCALED
++//////////////////////////////////////////////////////////////////////////
++template<> struct FormatTraits<B10G10R10A2_USCALED> :
++    ComponentTraits<SWR_TYPE_USCALED, 10, SWR_TYPE_USCALED, 10, SWR_TYPE_USCALED, 10, SWR_TYPE_USCALED, 2>,
++    FormatSwizzle<2, 1, 0, 3>,
++    Defaults<0, 0, 0, 0x3f800000>
++{
++    static const uint32_t bpp{ 32 };
++    static const uint32_t numComps{ 4 };
++    static const bool hasAlpha{ true };
++    static const uint32_t alphaComp{ 3 };
++    static const bool isSRGB{ false };
++    static const bool isBC{ false };
++    static const bool isSubsampled{ false };
++    static const uint32_t bcWidth{ 1 };
++    static const uint32_t bcHeight{ 1 };
++
++    typedef Transpose10_10_10_2 TransposeT;
++    typedef Format4<10, 10, 10, 2> FormatT;
++};
++
++//////////////////////////////////////////////////////////////////////////
++/// FormatTraits<B10G10R10A2_SSCALED> - Format traits specialization for B10G10R10A2_SSCALED
++//////////////////////////////////////////////////////////////////////////
++template<> struct FormatTraits<B10G10R10A2_SSCALED> :
++    ComponentTraits<SWR_TYPE_SSCALED, 10, SWR_TYPE_SSCALED, 10, SWR_TYPE_SSCALED, 10, SWR_TYPE_SSCALED, 2>,
++    FormatSwizzle<2, 1, 0, 3>,
++    Defaults<0, 0, 0, 0x3f800000>
++{
++    static const uint32_t bpp{ 32 };
++    static const uint32_t numComps{ 4 };
++    static const bool hasAlpha{ true };
++    static const uint32_t alphaComp{ 3 };
++    static const bool isSRGB{ false };
++    static const bool isBC{ false };
++    static const bool isSubsampled{ false };
++    static const uint32_t bcWidth{ 1 };
++    static const uint32_t bcHeight{ 1 };
++
++    typedef Transpose10_10_10_2 TransposeT;
++    typedef Format4<10, 10, 10, 2> FormatT;
++};
++
++//////////////////////////////////////////////////////////////////////////
++/// FormatTraits<B10G10R10A2_UINT> - Format traits specialization for B10G10R10A2_UINT
++//////////////////////////////////////////////////////////////////////////
++template<> struct FormatTraits<B10G10R10A2_UINT> :
++    ComponentTraits<SWR_TYPE_UINT, 10, SWR_TYPE_UINT, 10, SWR_TYPE_UINT, 10, SWR_TYPE_UINT, 2>,
++    FormatSwizzle<2, 1, 0, 3>,
++    Defaults<0, 0, 0, 0x1>
++{
++    static const uint32_t bpp{ 32 };
++    static const uint32_t numComps{ 4 };
++    static const bool hasAlpha{ true };
++    static const uint32_t alphaComp{ 3 };
++    static const bool isSRGB{ false };
++    static const bool isBC{ false };
++    static const bool isSubsampled{ false };
++    static const uint32_t bcWidth{ 1 };
++    static const uint32_t bcHeight{ 1 };
++
++    typedef Transpose10_10_10_2 TransposeT;
++    typedef Format4<10, 10, 10, 2> FormatT;
++};
++
++//////////////////////////////////////////////////////////////////////////
++/// FormatTraits<B10G10R10A2_SINT> - Format traits specialization for B10G10R10A2_SINT
++//////////////////////////////////////////////////////////////////////////
++template<> struct FormatTraits<B10G10R10A2_SINT> :
++    ComponentTraits<SWR_TYPE_SINT, 10, SWR_TYPE_SINT, 10, SWR_TYPE_SINT, 10, SWR_TYPE_SINT, 2>,
++    FormatSwizzle<2, 1, 0, 3>,
++    Defaults<0, 0, 0, 0x1>
++{
++    static const uint32_t bpp{ 32 };
++    static const uint32_t numComps{ 4 };
++    static const bool hasAlpha{ true };
++    static const uint32_t alphaComp{ 3 };
++    static const bool isSRGB{ false };
++    static const bool isBC{ false };
++    static const bool isSubsampled{ false };
++    static const uint32_t bcWidth{ 1 };
++    static const uint32_t bcHeight{ 1 };
++
++    typedef Transpose10_10_10_2 TransposeT;
++    typedef Format4<10, 10, 10, 2> FormatT;
++};
++
++//////////////////////////////////////////////////////////////////////////
++/// FormatTraits<R8G8B8_UINT> - Format traits specialization for R8G8B8_UINT
++//////////////////////////////////////////////////////////////////////////
++template<> struct FormatTraits<R8G8B8_UINT> :
++    ComponentTraits<SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8>,
++    FormatSwizzle<0, 1, 2>,
++    Defaults<0, 0, 0, 0x1>
++{
++    static const uint32_t bpp{ 24 };
++    static const uint32_t numComps{ 3 };
++    static const bool hasAlpha{ false };
++    static const uint32_t alphaComp{ 0 };
++    static const bool isSRGB{ false };
++    static const bool isBC{ false };
++    static const bool isSubsampled{ false };
++    static const uint32_t bcWidth{ 1 };
++    static const uint32_t bcHeight{ 1 };
++
++    typedef Transpose8_8_8 TransposeT;
++    typedef Format3<8, 8, 8> FormatT;
++};
++
++//////////////////////////////////////////////////////////////////////////
++/// FormatTraits<R8G8B8_SINT> - Format traits specialization for R8G8B8_SINT
++//////////////////////////////////////////////////////////////////////////
++template<> struct FormatTraits<R8G8B8_SINT> :
++    ComponentTraits<SWR_TYPE_SINT, 8, SWR_TYPE_SINT, 8, SWR_TYPE_SINT, 8>,
++    FormatSwizzle<0, 1, 2>,
++    Defaults<0, 0, 0, 0x1>
++{
++    static const uint32_t bpp{ 24 };
++    static const uint32_t numComps{ 3 };
++    static const bool hasAlpha{ false };
++    static const uint32_t alphaComp{ 0 };
++    static const bool isSRGB{ false };
++    static const bool isBC{ false };
++    static const bool isSubsampled{ false };
++    static const uint32_t bcWidth{ 1 };
++    static const uint32_t bcHeight{ 1 };
++
++    typedef Transpose8_8_8 TransposeT;
++    typedef Format3<8, 8, 8> FormatT;
++};
++
+diff --git a/src/gallium/drivers/swr/rasterizer/core/format_types.h b/src/gallium/drivers/swr/rasterizer/core/format_types.h
+new file mode 100644
+index 0000000..92125df
+--- /dev/null
++++ b/src/gallium/drivers/swr/rasterizer/core/format_types.h
+@@ -0,0 +1,1053 @@
++/****************************************************************************
++* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
++*
++* Permission is hereby granted, free of charge, to any person obtaining a
++* copy of this software and associated documentation files (the "Software"),
++* to deal in the Software without restriction, including without limitation
++* the rights to use, copy, modify, merge, publish, distribute, sublicense,
++* and/or sell copies of the Software, and to permit persons to whom the
++* Software is furnished to do so, subject to the following conditions:
++*
++* The above copyright notice and this permission notice (including the next
++* paragraph) shall be included in all copies or substantial portions of the
++* Software.
++*
++* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
++* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++* IN THE SOFTWARE.
++*
++* @file format_types.h
++*
++* @brief Definitions for SWR_FORMAT functions.
++*
++******************************************************************************/
++#pragma once
++
++//////////////////////////////////////////////////////////////////////////
++/// PackTraits - Helpers for packing / unpacking same pixel sizes
++//////////////////////////////////////////////////////////////////////////
++template <uint32_t NumBits, bool Signed = false>
++struct PackTraits
++{
++    static const uint32_t MyNumBits = NumBits;
++    static simdscalar loadSOA(const BYTE *pSrc) = delete;
++    static void storeSOA(BYTE *pDst, simdscalar src) = delete;
++    static simdscalar unpack(simdscalar &in) = delete;
++    static simdscalar pack(simdscalar &in) = delete;
++};
++
++//////////////////////////////////////////////////////////////////////////
++/// PackTraits - Helpers for packing / unpacking unused channels
++//////////////////////////////////////////////////////////////////////////
++template <>
++struct PackTraits<0, false>
++{
++    static const uint32_t MyNumBits = 0;
++
++    static simdscalar loadSOA(const BYTE *pSrc) { return _simd_setzero_ps(); }
++    static void storeSOA(BYTE *pDst, simdscalar src) { return; }
++    static simdscalar unpack(simdscalar &in) { return _simd_setzero_ps(); }
++    static simdscalar pack(simdscalar &in) { return _simd_setzero_ps(); }
++};
++
++
++//////////////////////////////////////////////////////////////////////////
++/// PackTraits - Helpers for packing / unpacking 8 bit unsigned channels
++//////////////////////////////////////////////////////////////////////////
++template <>
++struct PackTraits<8, false>
++{
++    static const uint32_t MyNumBits = 8;
++
++    static simdscalar loadSOA(const BYTE *pSrc)
++    {
++#if KNOB_SIMD_WIDTH == 8
++        __m256 result = _mm256_setzero_ps();
++        __m128 vLo = _mm_castpd_ps(_mm_load_sd((double*)pSrc));
++        return _mm256_insertf128_ps(result, vLo, 0);
++#else
++#error Unsupported vector width
++#endif
++    }
++
++    static void storeSOA(BYTE *pDst, simdscalar src)
++    {
++        // store simd bytes
++#if KNOB_SIMD_WIDTH == 8
++        _mm_storel_pd((double*)pDst, _mm_castps_pd(_mm256_castps256_ps128(src)));
++#else
++#error Unsupported vector width
++#endif
++    }
++
++    static simdscalar unpack(simdscalar &in)
++    {
++#if KNOB_SIMD_WIDTH == 8
++#if KNOB_ARCH==KNOB_ARCH_AVX
++        __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in));
++        __m128i resLo = _mm_cvtepu8_epi32(src);
++        __m128i resHi = _mm_shuffle_epi8(src,
++            _mm_set_epi32(0x80808007, 0x80808006, 0x80808005, 0x80808004));
++
++        __m256i result = _mm256_castsi128_si256(resLo);
++        result = _mm256_insertf128_si256(result, resHi, 1);
++        return _mm256_castsi256_ps(result);
++#elif KNOB_ARCH==KNOB_ARCH_AVX2
++        return _mm256_castsi256_ps(_mm256_cvtepu8_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
++#endif
++#else
++#error Unsupported vector width
++#endif
++    }
++
++    static simdscalar pack(simdscalar &in)
++    {
++#if KNOB_SIMD_WIDTH == 8
++        simdscalari src = _simd_castps_si(in);
++        __m128i res16 = _mm_packus_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1));
++        __m128i res8 = _mm_packus_epi16(res16, _mm_undefined_si128());
++        return _mm256_castsi256_ps(_mm256_castsi128_si256(res8));
++#else
++#error Unsupported vector width
++#endif
++    }
++};
++
++//////////////////////////////////////////////////////////////////////////
++/// PackTraits - Helpers for packing / unpacking 8 bit signed channels
++//////////////////////////////////////////////////////////////////////////
++template <>
++struct PackTraits<8, true>
++{
++    static const uint32_t MyNumBits = 8;
++
++    static simdscalar loadSOA(const BYTE *pSrc)
++    {
++#if
KNOB_SIMD_WIDTH == 8 ++ __m256 result = _mm256_setzero_ps(); ++ __m128 vLo = _mm_castpd_ps(_mm_load_sd((double*)pSrc)); ++ return _mm256_insertf128_ps(result, vLo, 0); ++#else ++#error Unsupported vector width ++#endif ++ } ++ ++ static void storeSOA(BYTE *pDst, simdscalar src) ++ { ++ // store simd bytes ++#if KNOB_SIMD_WIDTH == 8 ++ _mm_storel_pd((double*)pDst, _mm_castps_pd(_mm256_castps256_ps128(src))); ++#else ++#error Unsupported vector width ++#endif ++ } ++ ++ static simdscalar unpack(simdscalar &in) ++ { ++#if KNOB_SIMD_WIDTH == 8 ++#if KNOB_ARCH==KNOB_ARCH_AVX ++ SWR_ASSERT(0); // I think this may be incorrect. ++ __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in)); ++ __m128i resLo = _mm_cvtepi8_epi32(src); ++ __m128i resHi = _mm_shuffle_epi8(src, ++ _mm_set_epi32(0x80808007, 0x80808006, 0x80808005, 0x80808004)); ++ ++ __m256i result = _mm256_castsi128_si256(resLo); ++ result = _mm256_insertf128_si256(result, resHi, 1); ++ return _mm256_castsi256_ps(result); ++#elif KNOB_ARCH==KNOB_ARCH_AVX2 ++ return _mm256_castsi256_ps(_mm256_cvtepi8_epi32(_mm_castps_si128(_mm256_castps256_ps128(in)))); ++#endif ++#else ++#error Unsupported vector width ++#endif ++ } ++ ++ static simdscalar pack(simdscalar &in) ++ { ++#if KNOB_SIMD_WIDTH == 8 ++ simdscalari src = _simd_castps_si(in); ++ __m128i res16 = _mm_packs_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1)); ++ __m128i res8 = _mm_packs_epi16(res16, _mm_undefined_si128()); ++ return _mm256_castsi256_ps(_mm256_castsi128_si256(res8)); ++#else ++#error Unsupported vector width ++#endif ++ } ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// PackTraits - Helpers for packing / unpacking 16 bit unsigned channels ++////////////////////////////////////////////////////////////////////////// ++template <> ++struct PackTraits<16, false> ++{ ++ static const uint32_t MyNumBits = 16; ++ ++ static simdscalar loadSOA(const BYTE *pSrc) ++ { ++#if KNOB_SIMD_WIDTH == 8 ++ __m256 result = _mm256_setzero_ps(); ++ __m128 vLo = _mm_load_ps((const float*)pSrc); ++ return _mm256_insertf128_ps(result, vLo, 0); ++#else ++#error Unsupported vector width ++#endif ++ } ++ ++ static void storeSOA(BYTE *pDst, simdscalar src) ++ { ++#if KNOB_SIMD_WIDTH == 8 ++ // store 16B (2B * 8) ++ _mm_store_ps((float*)pDst, _mm256_castps256_ps128(src)); ++#else ++#error Unsupported vector width ++#endif ++ } ++ ++ static simdscalar unpack(simdscalar &in) ++ { ++#if KNOB_SIMD_WIDTH == 8 ++#if KNOB_ARCH==KNOB_ARCH_AVX ++ __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in)); ++ __m128i resLo = _mm_cvtepu16_epi32(src); ++ __m128i resHi = _mm_shuffle_epi8(src, ++ _mm_set_epi32(0x80800F0E, 0x80800D0C, 0x80800B0A, 0x80800908)); ++ ++ __m256i result = _mm256_castsi128_si256(resLo); ++ result = _mm256_insertf128_si256(result, resHi, 1); ++ return _mm256_castsi256_ps(result); ++#elif KNOB_ARCH==KNOB_ARCH_AVX2 ++ return _mm256_castsi256_ps(_mm256_cvtepu16_epi32(_mm_castps_si128(_mm256_castps256_ps128(in)))); ++#endif ++#else ++#error Unsupported vector width ++#endif ++ } ++ ++ static simdscalar pack(simdscalar &in) ++ { ++#if KNOB_SIMD_WIDTH == 8 ++ simdscalari src = _simd_castps_si(in); ++ __m256i res = _mm256_castsi128_si256(_mm_packus_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1))); ++ return _mm256_castsi256_ps(res); ++#else ++#error Unsupported vector width ++#endif ++ } ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// PackTraits - Helpers for packing / 
unpacking 16 bit signed channels ++////////////////////////////////////////////////////////////////////////// ++template <> ++struct PackTraits<16, true> ++{ ++ static const uint32_t MyNumBits = 16; ++ ++ static simdscalar loadSOA(const BYTE *pSrc) ++ { ++#if KNOB_SIMD_WIDTH == 8 ++ __m256 result = _mm256_setzero_ps(); ++ __m128 vLo = _mm_load_ps((const float*)pSrc); ++ return _mm256_insertf128_ps(result, vLo, 0); ++#else ++#error Unsupported vector width ++#endif ++ } ++ ++ static void storeSOA(BYTE *pDst, simdscalar src) ++ { ++#if KNOB_SIMD_WIDTH == 8 ++ // store 16B (2B * 8) ++ _mm_store_ps((float*)pDst, _mm256_castps256_ps128(src)); ++#else ++#error Unsupported vector width ++#endif ++ } ++ ++ static simdscalar unpack(simdscalar &in) ++ { ++#if KNOB_SIMD_WIDTH == 8 ++#if KNOB_ARCH==KNOB_ARCH_AVX ++ SWR_ASSERT(0); // I think this is incorrectly implemented ++ __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in)); ++ __m128i resLo = _mm_cvtepi16_epi32(src); ++ __m128i resHi = _mm_shuffle_epi8(src, ++ _mm_set_epi32(0x80800F0E, 0x80800D0C, 0x80800B0A, 0x80800908)); ++ ++ __m256i result = _mm256_castsi128_si256(resLo); ++ result = _mm256_insertf128_si256(result, resHi, 1); ++ return _mm256_castsi256_ps(result); ++#elif KNOB_ARCH==KNOB_ARCH_AVX2 ++ return _mm256_castsi256_ps(_mm256_cvtepi16_epi32(_mm_castps_si128(_mm256_castps256_ps128(in)))); ++#endif ++#else ++#error Unsupported vector width ++#endif ++ } ++ ++ static simdscalar pack(simdscalar &in) ++ { ++#if KNOB_SIMD_WIDTH == 8 ++ simdscalari src = _simd_castps_si(in); ++ __m256i res = _mm256_castsi128_si256(_mm_packs_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1))); ++ return _mm256_castsi256_ps(res); ++#else ++#error Unsupported vector width ++#endif ++ } ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// PackTraits - Helpers for packing / unpacking 32 bit channels ++////////////////////////////////////////////////////////////////////////// ++template <> ++struct PackTraits<32, false> ++{ ++ static const uint32_t MyNumBits = 32; ++ ++ static simdscalar loadSOA(const BYTE *pSrc) { return _simd_load_ps((const float*)pSrc); } ++ static void storeSOA(BYTE *pDst, simdscalar src) { _simd_store_ps((float*)pDst, src); } ++ static simdscalar unpack(simdscalar &in) { return in; } ++ static simdscalar pack(simdscalar &in) { return in; } ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// TypeTraits - Format type traits. 
++//////////////////////////////////////////////////////////////////////////
++template<SWR_TYPE type, uint32_t NumBits>
++struct TypeTraits : PackTraits<NumBits>
++{
++    static const SWR_TYPE MyType = type;
++    static float toFloat() { return 0.0; }
++    static float fromFloat() { SWR_ASSERT(0); return 0.0; }
++    static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
++};
++
++//////////////////////////////////////////////////////////////////////////
++/// TypeTraits - Format type traits specialization for UINT8
++//////////////////////////////////////////////////////////////////////////
++template<> struct TypeTraits<SWR_TYPE_UINT, 8> : PackTraits<8>
++{
++    static const SWR_TYPE MyType = SWR_TYPE_UINT;
++    static float toFloat() { return 0.0; }
++    static float fromFloat() { SWR_ASSERT(0); return 0.0; }
++    static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
++};
++
++//////////////////////////////////////////////////////////////////////////
++/// TypeTraits - Format type traits specialization for SINT8
++//////////////////////////////////////////////////////////////////////////
++template<> struct TypeTraits<SWR_TYPE_SINT, 8> : PackTraits<8, true>
++{
++    static const SWR_TYPE MyType = SWR_TYPE_SINT;
++    static float toFloat() { return 0.0; }
++    static float fromFloat() { SWR_ASSERT(0); return 0.0; }
++    static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
++};
++
++//////////////////////////////////////////////////////////////////////////
++/// TypeTraits - Format type traits specialization for UINT16
++//////////////////////////////////////////////////////////////////////////
++template<> struct TypeTraits<SWR_TYPE_UINT, 16> : PackTraits<16>
++{
++    static const SWR_TYPE MyType = SWR_TYPE_UINT;
++    static float toFloat() { return 0.0; }
++    static float fromFloat() { SWR_ASSERT(0); return 0.0; }
++    static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
++};
++
++//////////////////////////////////////////////////////////////////////////
++/// TypeTraits - Format type traits specialization for SINT16
++//////////////////////////////////////////////////////////////////////////
++template<> struct TypeTraits<SWR_TYPE_SINT, 16> : PackTraits<16, true>
++{
++    static const SWR_TYPE MyType = SWR_TYPE_SINT;
++    static float toFloat() { return 0.0; }
++    static float fromFloat() { SWR_ASSERT(0); return 0.0; }
++    static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
++};
++
++//////////////////////////////////////////////////////////////////////////
++/// TypeTraits - Format type traits specialization for UINT32
++//////////////////////////////////////////////////////////////////////////
++template<> struct TypeTraits<SWR_TYPE_UINT, 32> : PackTraits<32>
++{
++    static const SWR_TYPE MyType = SWR_TYPE_UINT;
++    static float toFloat() { return 0.0; }
++    static float fromFloat() { SWR_ASSERT(0); return 0.0; }
++    static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
++};
++
++//////////////////////////////////////////////////////////////////////////
++/// TypeTraits - Format type traits specialization for SINT32
++//////////////////////////////////////////////////////////////////////////
++template<> struct TypeTraits<SWR_TYPE_SINT, 32> : PackTraits<32>
++{
++    static const SWR_TYPE MyType = SWR_TYPE_SINT;
++    static float toFloat() { return 0.0; }
++    static float fromFloat() { SWR_ASSERT(0); return 0.0; }
++    static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
++};
++
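Editor's note: the TypeTraits machinery above couples a SWR_TYPE with a PackTraits bit width so per-format conversion code is resolved at compile time. As an illustrative aside (not part of the patch; names are mine), the scalar model below shows what the PackTraits<8, false> unpack/pack pair does for a single SIMD lane: widen an 8-bit channel to 32 bits, then narrow back with unsigned saturation, which is what the _mm_cvtepu8_epi32 / _mm_packus_* sequences implement eight lanes at a time.

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    // Scalar stand-ins for one SIMD lane of PackTraits<8, false>.
    static uint32_t unpack_u8_lane(uint8_t v)
    {
        return v; // zero-extend, as _mm_cvtepu8_epi32 does per lane
    }

    static uint8_t pack_u8_lane(uint32_t v)
    {
        // unsigned saturation, as the packus path does per lane
        return static_cast<uint8_t>(std::min<uint32_t>(v, 255u));
    }

    int main()
    {
        const uint32_t lanes[3] = { 17u, 255u, 300u }; // 300 exceeds 8 bits
        for (uint32_t v : lanes)
            std::printf("%u -> packed %u\n", v, unsigned(pack_u8_lane(v)));
        std::printf("unpack(200) = %u\n", unpack_u8_lane(200));
        return 0;
    }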
++//////////////////////////////////////////////////////////////////////////
++/// TypeTraits - Format type traits specialization for UNORM8
++//////////////////////////////////////////////////////////////////////////
++template<> struct TypeTraits<SWR_TYPE_UNORM, 8> : PackTraits<8>
++{
++    static const SWR_TYPE MyType = SWR_TYPE_UNORM;
++    static float toFloat() { return 1.0f / 255.0f; }
++    static float fromFloat() { return 255.0f; }
++    static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
++};
++
++//////////////////////////////////////////////////////////////////////////
++/// TypeTraits - Format type traits specialization for SNORM8
++//////////////////////////////////////////////////////////////////////////
++template<> struct TypeTraits<SWR_TYPE_SNORM, 8> : PackTraits<8, true>
++{
++    static const SWR_TYPE MyType = SWR_TYPE_SNORM;
++    static float toFloat() { return 1.0f / 127.0f; }
++    static float fromFloat() { return 127.0f; }
++    static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
++};
++
++//////////////////////////////////////////////////////////////////////////
++/// TypeTraits - Format type traits specialization for UNORM16
++//////////////////////////////////////////////////////////////////////////
++template<> struct TypeTraits<SWR_TYPE_UNORM, 16> : PackTraits<16>
++{
++    static const SWR_TYPE MyType = SWR_TYPE_UNORM;
++    static float toFloat() { return 1.0f / 65535.0f; }
++    static float fromFloat() { return 65535.0f; }
++    static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
++};
++
++//////////////////////////////////////////////////////////////////////////
++/// TypeTraits - Format type traits specialization for SNORM16
++//////////////////////////////////////////////////////////////////////////
++template<> struct TypeTraits<SWR_TYPE_SNORM, 16> : PackTraits<16, true>
++{
++    static const SWR_TYPE MyType = SWR_TYPE_SNORM;
++    static float toFloat() { return 1.0f / 32767.0f; }
++    static float fromFloat() { return 32767.0f; }
++    static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
++};
++
++//////////////////////////////////////////////////////////////////////////
++/// TypeTraits - Format type traits specialization for UNORM24
++//////////////////////////////////////////////////////////////////////////
++template<>
++struct TypeTraits<SWR_TYPE_UNORM, 24> : PackTraits<32>
++{
++    static const SWR_TYPE MyType = SWR_TYPE_UNORM;
++    static float toFloat() { return 1.0f / 16777215.0f; }
++    static float fromFloat() { return 16777215.0f; }
++    static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
++};
++
++//////////////////////////////////////////////////////////////////////////
++// FLOAT Specializations from here on...
++//////////////////////////////////////////////////////////////////////////
++#define TO_M128i(a) _mm_castps_si128(a)
++#define TO_M128(a) _mm_castsi128_ps(a)
++
++#include "math.h"
++
++template< unsigned expnum, unsigned expden, unsigned coeffnum, unsigned coeffden >
++inline static __m128 fastpow(__m128 arg) {
++    __m128 ret = arg;
++
++    static const __m128 factor = _mm_set1_ps(exp2(127.0f * expden / expnum - 127.0f)
++            * powf(1.0f * coeffnum / coeffden, 1.0f * expden / expnum));
++
++    // Apply a constant pre-correction factor.
++    ret = _mm_mul_ps(ret, factor);
++
++    // Reinterpret arg as integer to obtain logarithm.
++    //asm("cvtdq2ps %1, %0" : "=x" (ret) : "x" (ret));
++    ret = _mm_cvtepi32_ps(_mm_castps_si128(ret));
++
++    // Multiply logarithm by power.
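/* Editor's aside, not part of the original patch: fastpow approximates
 * arg^(expnum/expden) with the float-bits trick. For positive x, the integer
 * reinterpretation of its bits is nearly affine in log2(x):
 *
 *     bits(x) ~= 2^23 * (log2(x) + 127)
 *
 * so the cvtdq2ps / multiply / cvtps2dq sequence here scales the logarithm,
 * and the pre-correction `factor` above absorbs the bias and approximation
 * error. A scalar sketch of the same idea:
 *
 *     float fastpow_scalar(float x, float p) {
 *         union { float f; int32_t i; } u{ x };
 *         u.i = (int32_t)(p * (u.i - 1065353216)) + 1065353216; // 127 << 23
 *         return u.f;
 *     }
 */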
++ ret = _mm_mul_ps(ret, _mm_set1_ps(1.0f * expnum / expden)); ++ ++ // Convert back to "integer" to exponentiate. ++ //asm("cvtps2dq %1, %0" : "=x" (ret) : "x" (ret)); ++ ret = _mm_castsi128_ps(_mm_cvtps_epi32(ret)); ++ ++ return ret; ++} ++ ++inline static __m128 pow512_4(__m128 arg) { ++ // 5/12 is too small, so compute the 4th root of 20/12 instead. ++ // 20/12 = 5/3 = 1 + 2/3 = 2 - 1/3. 2/3 is a suitable argument for fastpow. ++ // weighting coefficient: a^-1/2 = 2 a; a = 2^-2/3 ++ __m128 xf = fastpow< 2, 3, int(0.629960524947437 * 1e9), int(1e9) >(arg); ++ __m128 xover = _mm_mul_ps(arg, xf); ++ ++ __m128 xfm1 = _mm_rsqrt_ps(xf); ++ __m128 x2 = _mm_mul_ps(arg, arg); ++ __m128 xunder = _mm_mul_ps(x2, xfm1); ++ ++ // sqrt2 * over + 2 * sqrt2 * under ++ __m128 xavg = _mm_mul_ps(_mm_set1_ps(1.0f / (3.0f * 0.629960524947437f) * 0.999852f), ++ _mm_add_ps(xover, xunder)); ++ ++ xavg = _mm_mul_ps(xavg, _mm_rsqrt_ps(xavg)); ++ xavg = _mm_mul_ps(xavg, _mm_rsqrt_ps(xavg)); ++ return xavg; ++} ++ ++inline static __m128 powf_wrapper(__m128 Base, float Exp) ++{ ++ float *f = (float *)(&Base); ++ ++ return _mm_set_ps(powf(f[0], Exp), ++ powf(f[1], Exp), ++ powf(f[2], Exp), ++ powf(f[3], Exp)); ++} ++ ++static inline __m128 ConvertFloatToSRGB2(__m128& Src) ++{ ++ // create a mask with 0xFFFFFFFF in the DWORDs where the source is <= the minimal SRGB float value ++ __m128i CmpToSRGBThresholdMask = TO_M128i(_mm_cmpnlt_ps(_mm_set1_ps(0.0031308f), Src)); ++ ++ // squeeze the mask down to 16 bits (4 bits per DWORD) ++ int CompareResult = _mm_movemask_epi8(CmpToSRGBThresholdMask); ++ ++ __m128 Result; ++ ++ // ++ if (CompareResult == 0xFFFF) ++ { ++ // all DWORDs are <= the threshold ++ Result = _mm_mul_ps(Src, _mm_set1_ps(12.92f)); ++ } ++ else if (CompareResult == 0x0) ++ { ++ // all DWORDs are > the threshold ++ __m128 fSrc_0RGB = Src; ++ ++ // --> 1.055f * c(1.0f/2.4f) - 0.055f ++#if KNOB_USE_FAST_SRGB == TRUE ++ // 1.0f / 2.4f is 5.0f / 12.0f which is used for approximation. ++ __m128 f = pow512_4(fSrc_0RGB); ++#else ++ __m128 f = powf_wrapper(fSrc_0RGB, 1.0f / 2.4f); ++#endif ++ f = _mm_mul_ps(f, _mm_set1_ps(1.055f)); ++ Result = _mm_sub_ps(f, _mm_set1_ps(0.055f)); ++ } ++ else ++ { ++ // some DWORDs are <= the threshold and some are > threshold ++ __m128 Src_0RGB_mul_denorm = _mm_mul_ps(Src, _mm_set1_ps(12.92f)); ++ ++ __m128 fSrc_0RGB = Src; ++ ++ // --> 1.055f * c(1.0f/2.4f) - 0.055f ++#if KNOB_USE_FAST_SRGB == TRUE ++ // 1.0f / 2.4f is 5.0f / 12.0f which is used for approximation. 
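/* Editor's aside, not part of the original patch: both branches implement the
 * standard piecewise linear-to-sRGB encode, in scalar form:
 *
 *     float linear_to_srgb(float c) {
 *         return (c <= 0.0031308f) ? 12.92f * c
 *                                  : 1.055f * powf(c, 1.0f / 2.4f) - 0.055f;
 *     }
 *
 * The earlier branches are the all-below / all-above fast paths; this mixed
 * path computes both halves and blends them per lane using
 * CmpToSRGBThresholdMask.
 */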
++        __m128 f = pow512_4(fSrc_0RGB);
++#else
++        __m128 f = powf_wrapper(fSrc_0RGB, 1.0f / 2.4f);
++#endif
++        f = _mm_mul_ps(f, _mm_set1_ps(1.055f));
++        f = _mm_sub_ps(f, _mm_set1_ps(0.055f));
++
++        // Clear the alpha (is garbage after the sub)
++        __m128i i = _mm_and_si128(TO_M128i(f), _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF));
++
++        __m128i LessThanPart = _mm_and_si128(CmpToSRGBThresholdMask, TO_M128i(Src_0RGB_mul_denorm));
++        __m128i GreaterEqualPart = _mm_andnot_si128(CmpToSRGBThresholdMask, i);
++        __m128i CombinedParts = _mm_or_si128(LessThanPart, GreaterEqualPart);
++
++        Result = TO_M128(CombinedParts);
++    }
++
++    return Result;
++}
++
++//////////////////////////////////////////////////////////////////////////
++/// TypeTraits - Format type traits specialization for FLOAT16
++//////////////////////////////////////////////////////////////////////////
++template<> struct TypeTraits<SWR_TYPE_FLOAT, 16> : PackTraits<16>
++{
++    static const SWR_TYPE MyType = SWR_TYPE_FLOAT;
++    static float toFloat() { return 1.0f; }
++    static float fromFloat() { return 1.0f; }
++    static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
++
++    static simdscalar pack(const simdscalar &in)
++    {
++#if KNOB_SIMD_WIDTH == 8
++#if (KNOB_ARCH == KNOB_ARCH_AVX)
++        // input is 8 packed float32, output is 8 packed float16
++        simdscalari src = _simd_castps_si(in);
++
++        static const uint32_t FLOAT_EXP_BITS = 8;
++        static const uint32_t FLOAT_MANTISSA_BITS = 23;
++        static const uint32_t FLOAT_MANTISSA_MASK = (1U << FLOAT_MANTISSA_BITS) - 1;
++        static const uint32_t FLOAT_EXP_MASK = ((1U << FLOAT_EXP_BITS) - 1) << FLOAT_MANTISSA_BITS;
++
++        static const uint32_t HALF_EXP_BITS = 5;
++        static const uint32_t HALF_MANTISSA_BITS = 10;
++        static const uint32_t HALF_MANTISSA_MASK = (1U << HALF_MANTISSA_BITS) - 1;
++        static const uint32_t HALF_EXP_MASK = ((1U << HALF_EXP_BITS) - 1) << HALF_MANTISSA_BITS;
++
++        // minimum exponent required, exponents below this are flushed to 0.
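/* Editor's aside, not part of the original patch: IEEE binary16 is 1 sign
 * bit, 5 exponent bits (bias 15) and 10 mantissa bits, which is where the
 * constants above come from. The smallest normal binary16 exponent is -14,
 * so float32 inputs with smaller exponents become subnormals or are flushed
 * to zero. Two worked reference values:
 *
 *     1.0f    = 0x3F800000 (float32)  -> 0x3C00 (binary16)
 *     65504.f = largest finite half   -> 0x7BFF (the clamp constant below)
 */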
++ static const int32_t HALF_EXP_MIN = -14; ++ static const int32_t FLOAT_EXP_BIAS = 127; ++ static const int32_t FLOAT_EXP_MIN = HALF_EXP_MIN + FLOAT_EXP_BIAS; ++ static const int32_t FLOAT_EXP_MIN_FTZ = FLOAT_EXP_MIN - (HALF_MANTISSA_BITS + 1); // +1 for the lack of implicit significand ++ ++ // maximum exponent required, exponents above this are set to infinity ++ static const int32_t HALF_EXP_MAX = 15; ++ static const int32_t FLOAT_EXP_MAX = HALF_EXP_MAX + FLOAT_EXP_BIAS; ++ ++ const simdscalari vSignMask = _simd_set1_epi32(0x80000000); ++ const simdscalari vExpMask = _simd_set1_epi32(FLOAT_EXP_MASK); ++ const simdscalari vManMask = _simd_set1_epi32(FLOAT_MANTISSA_MASK); ++ const simdscalari vExpMin = _simd_set1_epi32(FLOAT_EXP_MASK & uint32_t(FLOAT_EXP_MIN << FLOAT_MANTISSA_BITS)); ++ const simdscalari vExpMinFtz = _simd_set1_epi32(FLOAT_EXP_MASK & uint32_t(FLOAT_EXP_MIN_FTZ << FLOAT_MANTISSA_BITS)); ++ const simdscalari vExpMax = _simd_set1_epi32(FLOAT_EXP_MASK & uint32_t(FLOAT_EXP_MAX << FLOAT_MANTISSA_BITS)); ++ ++ simdscalari vSign = _simd_and_si(src, vSignMask); ++ simdscalari vExp = _simd_and_si(src, vExpMask); ++ simdscalari vMan = _simd_and_si(src, vManMask); ++ ++ simdscalari vFTZMask = _simd_cmplt_epi32(vExp, vExpMinFtz); ++ simdscalari vDenormMask = _simd_andnot_si(vFTZMask, _simd_cmplt_epi32(vExp, vExpMin)); ++ simdscalari vInfMask = _simd_cmpeq_epi32(vExpMask, vExp); ++ simdscalari vClampMask = _simd_andnot_si(vInfMask, _simd_cmplt_epi32(vExpMax, vExp)); ++ ++ simdscalari vHalfExp = _simd_add_epi32(_simd_sub_epi32(vExp, vExpMin), _simd_set1_epi32(1U << FLOAT_MANTISSA_BITS)); ++ ++ // pack output 16-bits into the lower 16-bits of each 32-bit channel ++ simdscalari vDst = _simd_and_si(_simd_srli_epi32(vHalfExp, 13), _simd_set1_epi32(HALF_EXP_MASK)); ++ vDst = _simd_or_si(vDst, _simd_srli_epi32(vMan, FLOAT_MANTISSA_BITS - HALF_MANTISSA_BITS)); ++ ++ // Flush To Zero ++ vDst = _simd_andnot_si(vFTZMask, vDst); ++ // Apply Infinites / NaN ++ vDst = _simd_or_si(vDst, _simd_and_si(vInfMask, _simd_set1_epi32(HALF_EXP_MASK))); ++ ++ // Apply clamps ++ vDst = _simd_andnot_si(vClampMask, vDst); ++ vDst = _simd_or_si(vDst, ++ _simd_and_si(vClampMask, _simd_set1_epi32(0x7BFF))); ++ ++ // Compute Denormals (subnormals) ++ if (!_mm256_testz_si256(vDenormMask, vDenormMask)) ++ { ++ uint32_t *pDenormMask = (uint32_t*)&vDenormMask; ++ uint32_t *pExp = (uint32_t*)&vExp; ++ uint32_t *pMan = (uint32_t*)&vMan; ++ uint32_t *pDst = (uint32_t*)&vDst; ++ for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i) ++ { ++ if (pDenormMask[i]) ++ { ++ // Need to compute subnormal value ++ uint32_t exponent = pExp[i] >> FLOAT_MANTISSA_BITS; ++ uint32_t mantissa = pMan[i] | ++ (1U << FLOAT_MANTISSA_BITS); // Denorms include no "implicit" 1s. 
Make it explicit
++
++                    pDst[i] = mantissa >> ((FLOAT_EXP_MIN - exponent) + (FLOAT_MANTISSA_BITS - HALF_MANTISSA_BITS));
++                }
++            }
++        }
++
++        // Add in sign bits
++        vDst = _simd_or_si(vDst, _simd_srli_epi32(vSign, 16));
++
++        // Pack to lower 128-bits
++        vDst = _mm256_castsi128_si256(_mm_packus_epi32(_mm256_castsi256_si128(vDst), _mm256_extractf128_si256(vDst, 1)));
++
++#if 0
++#if !defined(NDEBUG)
++        simdscalari vCheck = _mm256_castsi128_si256(_mm256_cvtps_ph(in, _MM_FROUND_TRUNC));
++
++        for (uint32_t i = 0; i < 4; ++i)
++        {
++            SWR_ASSERT(vCheck.m256i_i32[i] == vDst.m256i_i32[i]);
++        }
++#endif
++#endif
++
++        return _simd_castsi_ps(vDst);
++
++#else
++        return _mm256_castsi256_ps(_mm256_castsi128_si256(_mm256_cvtps_ph(in, _MM_FROUND_TRUNC)));
++#endif
++#else
++#error Unsupported vector width
++#endif
++    }
++
++    static simdscalar unpack(const simdscalar &in)
++    {
++        // input is 8 packed float16, output is 8 packed float32
++        SWR_ASSERT(0); // @todo
++        return _simd_setzero_ps();
++    }
++};
++
++//////////////////////////////////////////////////////////////////////////
++/// TypeTraits - Format type traits specialization for FLOAT32
++//////////////////////////////////////////////////////////////////////////
++template<> struct TypeTraits<SWR_TYPE_FLOAT, 32> : PackTraits<32>
++{
++    static const SWR_TYPE MyType = SWR_TYPE_FLOAT;
++    static float toFloat() { return 1.0f; }
++    static float fromFloat() { return 1.0f; }
++    static inline simdscalar convertSrgb(simdscalar &in)
++    {
++#if (KNOB_ARCH == KNOB_ARCH_AVX || KNOB_ARCH == KNOB_ARCH_AVX2)
++        __m128 srcLo = _mm256_extractf128_ps(in, 0);
++        __m128 srcHi = _mm256_extractf128_ps(in, 1);
++
++        srcLo = ConvertFloatToSRGB2(srcLo);
++        srcHi = ConvertFloatToSRGB2(srcHi);
++
++        in = _mm256_insertf128_ps(in, srcLo, 0);
++        in = _mm256_insertf128_ps(in, srcHi, 1);
++
++#endif
++        return in;
++    }
++};
++
++//////////////////////////////////////////////////////////////////////////
++/// Format1 - Bitfield for single component formats.
++//////////////////////////////////////////////////////////////////////////
++template<uint32_t x>
++struct Format1
++{
++    union
++    {
++        uint32_t r : x;
++
++        ///@ The following are here to provide full template needed in Formats.
++        uint32_t g : x;
++        uint32_t b : x;
++        uint32_t a : x;
++    };
++};
++
++//////////////////////////////////////////////////////////////////////////
++/// Format1 - Bitfield for single component formats - 8 bit specialization
++//////////////////////////////////////////////////////////////////////////
++template<>
++struct Format1<8>
++{
++    union
++    {
++        uint8_t r;
++
++        ///@ The following are here to provide full template needed in Formats.
++        uint8_t g;
++        uint8_t b;
++        uint8_t a;
++    };
++};
++
++//////////////////////////////////////////////////////////////////////////
++/// Format1 - Bitfield for single component formats - 16 bit specialization
++//////////////////////////////////////////////////////////////////////////
++template<>
++struct Format1<16>
++{
++    union
++    {
++        uint16_t r;
++
++        ///@ The following are here to provide full template needed in Formats.
++        uint16_t g;
++        uint16_t b;
++        uint16_t a;
++    };
++};
++
++//////////////////////////////////////////////////////////////////////////
++/// Format2 - Bitfield for 2 component formats.
++//////////////////////////////////////////////////////////////////////////
++template<uint32_t x, uint32_t y>
++union Format2
++{
++    struct
++    {
++        uint32_t r : x;
++        uint32_t g : y;
++    };
++    struct
++    {
++        ///@ The following are here to provide full template needed in Formats.
++        uint32_t b : x;
++        uint32_t a : y;
++    };
++};
++
++//////////////////////////////////////////////////////////////////////////
++/// Format2 - Bitfield for 2 component formats - 16 bit specialization
++//////////////////////////////////////////////////////////////////////////
++template<>
++union Format2<8,8>
++{
++    struct
++    {
++        uint16_t r : 8;
++        uint16_t g : 8;
++    };
++    struct
++    {
++        ///@ The following are here to provide full template needed in Formats.
++        uint16_t b : 8;
++        uint16_t a : 8;
++    };
++};
++
++//////////////////////////////////////////////////////////////////////////
++/// Format3 - Bitfield for 3 component formats.
++//////////////////////////////////////////////////////////////////////////
++template<uint32_t x, uint32_t y, uint32_t z>
++union Format3
++{
++    struct
++    {
++        uint32_t r : x;
++        uint32_t g : y;
++        uint32_t b : z;
++    };
++    uint32_t a; ///@note This is here to provide full template needed in Formats.
++};
++
++//////////////////////////////////////////////////////////////////////////
++/// Format3 - Bitfield for 3 component formats - 16 bit specialization
++//////////////////////////////////////////////////////////////////////////
++template<>
++union Format3<5,6,5>
++{
++    struct
++    {
++        uint16_t r : 5;
++        uint16_t g : 6;
++        uint16_t b : 5;
++    };
++    uint16_t a; ///@note This is here to provide full template needed in Formats.
++};
++
++//////////////////////////////////////////////////////////////////////////
++/// Format4 - Bitfield for 4 component formats.
++//////////////////////////////////////////////////////////////////////////
++template<uint32_t x, uint32_t y, uint32_t z, uint32_t w>
++struct Format4
++{
++    uint32_t r : x;
++    uint32_t g : y;
++    uint32_t b : z;
++    uint32_t a : w;
++};
++
++//////////////////////////////////////////////////////////////////////////
++/// Format4 - Bitfield for 4 component formats - 16 bit specialization
++//////////////////////////////////////////////////////////////////////////
++template<>
++struct Format4<5,5,5,1>
++{
++    uint16_t r : 5;
++    uint16_t g : 5;
++    uint16_t b : 5;
++    uint16_t a : 1;
++};
++
++//////////////////////////////////////////////////////////////////////////
++/// Format4 - Bitfield for 4 component formats - 16 bit specialization
++//////////////////////////////////////////////////////////////////////////
++template<>
++struct Format4<4,4,4,4>
++{
++    uint16_t r : 4;
++    uint16_t g : 4;
++    uint16_t b : 4;
++    uint16_t a : 4;
++};
++
++//////////////////////////////////////////////////////////////////////////
++/// Defaults - Default component values
++//////////////////////////////////////////////////////////////////////////
++template<uint32_t x, uint32_t y, uint32_t z, uint32_t w>
++struct Defaults
++{
++    INLINE static uint32_t GetDefault(uint32_t comp)
++    {
++        static const uint32_t defaults[4]{ x, y, z, w };
++        return defaults[comp];
++    }
++};
++
++//////////////////////////////////////////////////////////////////////////
++/// ComponentTraits - Component type traits.
++//////////////////////////////////////////////////////////////////////////
++template<SWR_TYPE X, uint32_t NumBitsX, SWR_TYPE Y = SWR_TYPE_UNKNOWN, uint32_t NumBitsY = 0, SWR_TYPE Z = SWR_TYPE_UNKNOWN, uint32_t NumBitsZ = 0, SWR_TYPE W = SWR_TYPE_UNKNOWN, uint32_t NumBitsW = 0>
++struct ComponentTraits
++{
++    INLINE static SWR_TYPE GetType(uint32_t comp)
++    {
++        static const SWR_TYPE CompType[4]{ X, Y, Z, W };
++        return CompType[comp];
++    }
++
++    INLINE static uint32_t GetBPC(uint32_t comp)
++    {
++        static const uint32_t MyBpc[4]{ NumBitsX, NumBitsY, NumBitsZ, NumBitsW };
++        return MyBpc[comp];
++    }
++
++    INLINE static bool isNormalized(uint32_t comp)
++    {
++        switch (comp)
++        {
++        case 0:
++            return (X == SWR_TYPE_UNORM || X == SWR_TYPE_SNORM) ? true : false;
++        case 1:
++            return (Y == SWR_TYPE_UNORM || Y == SWR_TYPE_SNORM) ? true : false;
++        case 2:
++            return (Z == SWR_TYPE_UNORM || Z == SWR_TYPE_SNORM) ? true : false;
++        case 3:
++            return (W == SWR_TYPE_UNORM || W == SWR_TYPE_SNORM) ? true : false;
++        }
++        SWR_ASSERT(0);
++        return false;
++    }
++
++    INLINE static float toFloat(uint32_t comp)
++    {
++        switch (comp)
++        {
++        case 0:
++            return TypeTraits<X, NumBitsX>::toFloat();
++        case 1:
++            return TypeTraits<Y, NumBitsY>::toFloat();
++        case 2:
++            return TypeTraits<Z, NumBitsZ>::toFloat();
++        case 3:
++            return TypeTraits<W, NumBitsW>::toFloat();
++        }
++        SWR_ASSERT(0);
++        return TypeTraits<X, NumBitsX>::toFloat();
++
++    }
++
++    INLINE static float fromFloat(uint32_t comp)
++    {
++        switch (comp)
++        {
++        case 0:
++            return TypeTraits<X, NumBitsX>::fromFloat();
++        case 1:
++            return TypeTraits<Y, NumBitsY>::fromFloat();
++        case 2:
++            return TypeTraits<Z, NumBitsZ>::fromFloat();
++        case 3:
++            return TypeTraits<W, NumBitsW>::fromFloat();
++        }
++        SWR_ASSERT(0);
++        return TypeTraits<X, NumBitsX>::fromFloat();
++    }
++
++    INLINE static simdscalar loadSOA(uint32_t comp, const BYTE* pSrc)
++    {
++        switch (comp)
++        {
++        case 0:
++            return TypeTraits<X, NumBitsX>::loadSOA(pSrc);
++        case 1:
++            return TypeTraits<Y, NumBitsY>::loadSOA(pSrc);
++        case 2:
++            return TypeTraits<Z, NumBitsZ>::loadSOA(pSrc);
++        case 3:
++            return TypeTraits<W, NumBitsW>::loadSOA(pSrc);
++        }
++        SWR_ASSERT(0);
++        return TypeTraits<X, NumBitsX>::loadSOA(pSrc);
++    }
++
++    INLINE static void storeSOA(uint32_t comp, BYTE *pDst, simdscalar src)
++    {
++        switch (comp)
++        {
++        case 0:
++            TypeTraits<X, NumBitsX>::storeSOA(pDst, src);
++            return;
++        case 1:
++            TypeTraits<Y, NumBitsY>::storeSOA(pDst, src);
++            return;
++        case 2:
++            TypeTraits<Z, NumBitsZ>::storeSOA(pDst, src);
++            return;
++        case 3:
++            TypeTraits<W, NumBitsW>::storeSOA(pDst, src);
++            return;
++        }
++        SWR_ASSERT(0);
++        TypeTraits<X, NumBitsX>::storeSOA(pDst, src);
++    }
++
++    INLINE static simdscalar unpack(uint32_t comp, simdscalar &in)
++    {
++        switch (comp)
++        {
++        case 0:
++            return TypeTraits<X, NumBitsX>::unpack(in);
++        case 1:
++            return TypeTraits<Y, NumBitsY>::unpack(in);
++        case 2:
++            return TypeTraits<Z, NumBitsZ>::unpack(in);
++        case 3:
++            return TypeTraits<W, NumBitsW>::unpack(in);
++        }
++        SWR_ASSERT(0);
++        return TypeTraits<X, NumBitsX>::unpack(in);
++    }
++
++    INLINE static simdscalar pack(uint32_t comp, simdscalar &in)
++    {
++        switch (comp)
++        {
++        case 0:
++            return TypeTraits<X, NumBitsX>::pack(in);
++        case 1:
++            return TypeTraits<Y, NumBitsY>::pack(in);
++        case 2:
++            return TypeTraits<Z, NumBitsZ>::pack(in);
++        case 3:
++            return TypeTraits<W, NumBitsW>::pack(in);
++        }
++        SWR_ASSERT(0);
++        return TypeTraits<X, NumBitsX>::pack(in);
++    }
++
++    INLINE static simdscalar convertSrgb(uint32_t comp, simdscalar &in)
++    {
++        switch (comp)
++        {
++        case 0:
++            return TypeTraits<X, NumBitsX>::convertSrgb(in);
++        case 1:
++            return TypeTraits<Y, NumBitsY>::convertSrgb(in);
++        case 2:
++            return TypeTraits<Z, NumBitsZ>::convertSrgb(in);
++        case 3:
++            return TypeTraits<W, NumBitsW>::convertSrgb(in);
++        }
++        SWR_ASSERT(0);
++        return TypeTraits<X, NumBitsX>::convertSrgb(in);
++    }
++};
+diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
+new file mode 100644
+index 0000000..986e49f
+--- /dev/null
++++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
+@@ -0,0 +1,1972 @@
++/****************************************************************************
++* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
++* ++* Permission is hereby granted, free of charge, to any person obtaining a ++* copy of this software and associated documentation files (the "Software"), ++* to deal in the Software without restriction, including without limitation ++* the rights to use, copy, modify, merge, publish, distribute, sublicense, ++* and/or sell copies of the Software, and to permit persons to whom the ++* Software is furnished to do so, subject to the following conditions: ++* ++* The above copyright notice and this permission notice (including the next ++* paragraph) shall be included in all copies or substantial portions of the ++* Software. ++* ++* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++* IN THE SOFTWARE. ++* ++* @file frontend.cpp ++* ++* @brief Implementation for Frontend which handles vertex processing, ++* primitive assembly, clipping, binning, etc. ++* ++******************************************************************************/ ++ ++#include "api.h" ++#include "frontend.h" ++#include "backend.h" ++#include "context.h" ++#include "rdtsc_core.h" ++#include "rasterizer.h" ++#include "utils.h" ++#include "threads.h" ++#include "pa.h" ++#include "clip.h" ++#include "tilemgr.h" ++#include "tessellator.h" ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief Helper macro to generate a bitmask ++static INLINE uint32_t GenMask(uint32_t numBits) ++{ ++ SWR_ASSERT(numBits <= (sizeof(uint32_t) * 8), "Too many bits (%d) for %s", numBits, __FUNCTION__); ++ return ((1U << numBits) - 1); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief FE handler for SwrSync. ++/// @param pContext - pointer to SWR context. ++/// @param pDC - pointer to draw context. ++/// @param workerId - thread's worker id. Even thread has a unique id. ++/// @param pUserData - Pointer to user data passed back to sync callback. ++/// @todo This should go away when we switch this to use compute threading. ++void ProcessSync( ++ SWR_CONTEXT *pContext, ++ DRAW_CONTEXT *pDC, ++ uint32_t workerId, ++ void *pUserData) ++{ ++ SYNC_DESC *pSync = (SYNC_DESC*)pUserData; ++ BE_WORK work; ++ work.type = SYNC; ++ work.pfnWork = ProcessSyncBE; ++ work.desc.sync = *pSync; ++ ++ MacroTileMgr *pTileMgr = pDC->pTileMgr; ++ pTileMgr->enqueue(0, 0, &work); ++ ++ _ReadWriteBarrier(); ++ pDC->doneFE = true; ++} ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief FE handler for SwrGetStats. ++/// @param pContext - pointer to SWR context. ++/// @param pDC - pointer to draw context. ++/// @param workerId - thread's worker id. Even thread has a unique id. ++/// @param pUserData - Pointer to user data passed back to stats callback. ++/// @todo This should go away when we switch this to use compute threading. 
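/* Editor's aside, not part of the original patch: every FE handler in this
 * file follows the same shape -- wrap the descriptor in a BE_WORK item,
 * enqueue it on the macrotile manager, then publish completion:
 *
 *     work.pfnWork = <matching BE handler>;
 *     pTileMgr->enqueue(x, y, &work);  // make the work item visible first
 *     _ReadWriteBarrier();             // compiler fence: no reordering below
 *     pDC->doneFE = true;              // then signal FE completion to workers
 *
 * _ReadWriteBarrier() is an MSVC compiler-only fence, so this pattern relies
 * on x86 store ordering rather than a hardware memory barrier.
 */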
++void ProcessQueryStats( ++ SWR_CONTEXT *pContext, ++ DRAW_CONTEXT *pDC, ++ uint32_t workerId, ++ void *pUserData) ++{ ++ QUERY_DESC *pQueryStats = (QUERY_DESC*)pUserData; ++ BE_WORK work; ++ work.type = QUERYSTATS; ++ work.pfnWork = ProcessQueryStatsBE; ++ work.desc.queryStats = *pQueryStats; ++ ++ MacroTileMgr *pTileMgr = pDC->pTileMgr; ++ pTileMgr->enqueue(0, 0, &work); ++ ++ _ReadWriteBarrier(); ++ pDC->doneFE = true; ++} ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief FE handler for SwrClearRenderTarget. ++/// @param pContext - pointer to SWR context. ++/// @param pDC - pointer to draw context. ++/// @param workerId - thread's worker id. Even thread has a unique id. ++/// @param pUserData - Pointer to user data passed back to clear callback. ++/// @todo This should go away when we switch this to use compute threading. ++void ProcessClear( ++ SWR_CONTEXT *pContext, ++ DRAW_CONTEXT *pDC, ++ uint32_t workerId, ++ void *pUserData) ++{ ++ CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData; ++ MacroTileMgr *pTileMgr = pDC->pTileMgr; ++ ++ const API_STATE& state = GetApiState(pDC); ++ ++ // queue a clear to each macro tile ++ // compute macro tile bounds for the current scissor/viewport ++ uint32_t macroTileLeft = state.scissorInFixedPoint.left / KNOB_MACROTILE_X_DIM_FIXED; ++ uint32_t macroTileRight = state.scissorInFixedPoint.right / KNOB_MACROTILE_X_DIM_FIXED; ++ uint32_t macroTileTop = state.scissorInFixedPoint.top / KNOB_MACROTILE_Y_DIM_FIXED; ++ uint32_t macroTileBottom = state.scissorInFixedPoint.bottom / KNOB_MACROTILE_Y_DIM_FIXED; ++ ++ BE_WORK work; ++ work.type = CLEAR; ++ work.pfnWork = ProcessClearBE; ++ work.desc.clear = *pClear; ++ ++ for (uint32_t y = macroTileTop; y <= macroTileBottom; ++y) ++ { ++ for (uint32_t x = macroTileLeft; x <= macroTileRight; ++x) ++ { ++ pTileMgr->enqueue(x, y, &work); ++ } ++ } ++ ++ _ReadWriteBarrier(); ++ pDC->doneFE = true; ++} ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief FE handler for SwrStoreTiles. ++/// @param pContext - pointer to SWR context. ++/// @param pDC - pointer to draw context. ++/// @param workerId - thread's worker id. Even thread has a unique id. ++/// @param pUserData - Pointer to user data passed back to callback. ++/// @todo This should go away when we switch this to use compute threading. 
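/* Editor's aside, not part of the original patch: the store/invalidate
 * handlers below size the macrotile grid with a round-up division so a
 * partially covered edge tile still receives work. With a hypothetical
 * 64-pixel-wide macrotile (KNOB_MACROTILE_X_DIM is knob-configurable):
 *
 *     numMacroTilesX = (width + x + (64 - 1)) / 64;  // == ceil((width+x)/64)
 *
 * e.g. a 1000-pixel-wide viewport at x = 0 yields ceil(1000/64) = 16 tiles.
 */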
++void ProcessStoreTiles( ++ SWR_CONTEXT *pContext, ++ DRAW_CONTEXT *pDC, ++ uint32_t workerId, ++ void *pUserData) ++{ ++ RDTSC_START(FEProcessStoreTiles); ++ STORE_TILES_DESC *pStore = (STORE_TILES_DESC*)pUserData; ++ MacroTileMgr *pTileMgr = pDC->pTileMgr; ++ ++ const API_STATE& state = GetApiState(pDC); ++ ++ // queue a store to each macro tile ++ // compute macro tile bounds for the current render target ++ const uint32_t macroWidth = KNOB_MACROTILE_X_DIM; ++ const uint32_t macroHeight = KNOB_MACROTILE_Y_DIM; ++ ++ uint32_t numMacroTilesX = ((uint32_t)state.vp[0].width + (uint32_t)state.vp[0].x + (macroWidth - 1)) / macroWidth; ++ uint32_t numMacroTilesY = ((uint32_t)state.vp[0].height + (uint32_t)state.vp[0].y + (macroHeight - 1)) / macroHeight; ++ ++ // store tiles ++ BE_WORK work; ++ work.type = STORETILES; ++ work.pfnWork = ProcessStoreTileBE; ++ work.desc.storeTiles = *pStore; ++ ++ for (uint32_t x = 0; x < numMacroTilesX; ++x) ++ { ++ for (uint32_t y = 0; y < numMacroTilesY; ++y) ++ { ++ pTileMgr->enqueue(x, y, &work); ++ } ++ } ++ ++ _ReadWriteBarrier(); ++ pDC->doneFE = true; ++ ++ RDTSC_STOP(FEProcessStoreTiles, 0, pDC->drawId); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief FE handler for SwrInvalidateTiles. ++/// @param pContext - pointer to SWR context. ++/// @param pDC - pointer to draw context. ++/// @param workerId - thread's worker id. Even thread has a unique id. ++/// @param pUserData - Pointer to user data passed back to callback. ++/// @todo This should go away when we switch this to use compute threading. ++void ProcessInvalidateTiles( ++ SWR_CONTEXT *pContext, ++ DRAW_CONTEXT *pDC, ++ uint32_t workerId, ++ void *pUserData) ++{ ++ RDTSC_START(FEProcessInvalidateTiles); ++ INVALIDATE_TILES_DESC *pInv = (INVALIDATE_TILES_DESC*)pUserData; ++ MacroTileMgr *pTileMgr = pDC->pTileMgr; ++ ++ const API_STATE& state = GetApiState(pDC); ++ ++ // queue a store to each macro tile ++ // compute macro tile bounds for the current render target ++ uint32_t macroWidth = KNOB_MACROTILE_X_DIM; ++ uint32_t macroHeight = KNOB_MACROTILE_Y_DIM; ++ ++ uint32_t numMacroTilesX = ((uint32_t)state.vp[0].width + (uint32_t)state.vp[0].x + (macroWidth - 1)) / macroWidth; ++ uint32_t numMacroTilesY = ((uint32_t)state.vp[0].height + (uint32_t)state.vp[0].y + (macroHeight - 1)) / macroHeight; ++ ++ // load tiles ++ BE_WORK work; ++ work.type = INVALIDATETILES; ++ work.pfnWork = ProcessInvalidateTilesBE; ++ work.desc.invalidateTiles = *pInv; ++ ++ for (uint32_t x = 0; x < numMacroTilesX; ++x) ++ { ++ for (uint32_t y = 0; y < numMacroTilesY; ++y) ++ { ++ pTileMgr->enqueue(x, y, &work); ++ } ++ } ++ ++ _ReadWriteBarrier(); ++ pDC->doneFE = true; ++ ++ RDTSC_STOP(FEProcessInvalidateTiles, 0, pDC->drawId); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief Computes the number of primitives given the number of verts. ++/// @param mode - primitive topology for draw operation. ++/// @param numElements - number of vertices or indices for draw. ++/// @todo Frontend needs to be refactored. This will go in appropriate place then. ++uint32_t GetNumPrims( ++ PRIMITIVE_TOPOLOGY mode, ++ uint32_t numElements) ++{ ++ switch (mode) ++ { ++ case TOP_POINT_LIST: return numElements; ++ case TOP_TRIANGLE_LIST: return numElements / 3; ++ case TOP_TRIANGLE_STRIP: return numElements < 3 ? 0 : numElements - 2; ++ case TOP_TRIANGLE_FAN: return numElements < 3 ? 
0 : numElements - 2; ++ case TOP_TRIANGLE_DISC: return numElements < 2 ? 0 : numElements - 1; ++ case TOP_QUAD_LIST: return numElements / 4; ++ case TOP_QUAD_STRIP: return numElements < 4 ? 0 : (numElements - 2) / 2; ++ case TOP_LINE_STRIP: return numElements < 2 ? 0 : numElements - 1; ++ case TOP_LINE_LIST: return numElements / 2; ++ case TOP_LINE_LOOP: return numElements; ++ case TOP_RECT_LIST: return numElements / 3; ++ ++ case TOP_PATCHLIST_1: ++ case TOP_PATCHLIST_2: ++ case TOP_PATCHLIST_3: ++ case TOP_PATCHLIST_4: ++ case TOP_PATCHLIST_5: ++ case TOP_PATCHLIST_6: ++ case TOP_PATCHLIST_7: ++ case TOP_PATCHLIST_8: ++ case TOP_PATCHLIST_9: ++ case TOP_PATCHLIST_10: ++ case TOP_PATCHLIST_11: ++ case TOP_PATCHLIST_12: ++ case TOP_PATCHLIST_13: ++ case TOP_PATCHLIST_14: ++ case TOP_PATCHLIST_15: ++ case TOP_PATCHLIST_16: ++ case TOP_PATCHLIST_17: ++ case TOP_PATCHLIST_18: ++ case TOP_PATCHLIST_19: ++ case TOP_PATCHLIST_20: ++ case TOP_PATCHLIST_21: ++ case TOP_PATCHLIST_22: ++ case TOP_PATCHLIST_23: ++ case TOP_PATCHLIST_24: ++ case TOP_PATCHLIST_25: ++ case TOP_PATCHLIST_26: ++ case TOP_PATCHLIST_27: ++ case TOP_PATCHLIST_28: ++ case TOP_PATCHLIST_29: ++ case TOP_PATCHLIST_30: ++ case TOP_PATCHLIST_31: ++ case TOP_PATCHLIST_32: ++ return numElements / (mode - TOP_PATCHLIST_BASE); ++ ++ case TOP_LINE_LIST_ADJ: ++ case TOP_LISTSTRIP_ADJ: ++ case TOP_TRI_LIST_ADJ: ++ case TOP_TRI_STRIP_ADJ: ++ case TOP_TRI_STRIP_REVERSE: ++ case TOP_POLYGON: ++ case TOP_POINT_LIST_BF: ++ case TOP_LINE_STRIP_CONT: ++ case TOP_LINE_STRIP_BF: ++ case TOP_LINE_STRIP_CONT_BF: ++ case TOP_TRIANGLE_FAN_NOSTIPPLE: ++ case TOP_PATCHLIST_BASE: ++ case TOP_UNKNOWN: ++ SWR_ASSERT(false, "Unsupported topology: %d", mode); ++ return 0; ++ } ++ ++ return 0; ++} ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief Return number of verts per primitive. 
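++///        When includeAdjVerts is true, adjacency topologies report their
++///        full vertex count (4 for line adjacency, 6 for triangle adjacency).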
++/// @param topology - topology
++/// @param includeAdjVerts - include adjacent verts in primitive vertices
++INLINE uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVerts)
++{
++    uint32_t numVerts = 0;
++    switch (topology)
++    {
++    case TOP_POINT_LIST:
++    case TOP_POINT_LIST_BF:
++        numVerts = 1;
++        break;
++    case TOP_LINE_LIST:
++    case TOP_LINE_STRIP:
++    case TOP_LINE_LIST_ADJ:
++    case TOP_LINE_LOOP:
++    case TOP_LINE_STRIP_CONT:
++    case TOP_LINE_STRIP_BF:
++    case TOP_LISTSTRIP_ADJ:
++        numVerts = 2;
++        break;
++    case TOP_TRIANGLE_LIST:
++    case TOP_TRIANGLE_STRIP:
++    case TOP_TRIANGLE_FAN:
++    case TOP_TRI_LIST_ADJ:
++    case TOP_TRI_STRIP_ADJ:
++    case TOP_TRI_STRIP_REVERSE:
++    case TOP_RECT_LIST:
++        numVerts = 3;
++        break;
++    case TOP_QUAD_LIST:
++    case TOP_QUAD_STRIP:
++        numVerts = 4;
++        break;
++    case TOP_PATCHLIST_1:
++    case TOP_PATCHLIST_2:
++    case TOP_PATCHLIST_3:
++    case TOP_PATCHLIST_4:
++    case TOP_PATCHLIST_5:
++    case TOP_PATCHLIST_6:
++    case TOP_PATCHLIST_7:
++    case TOP_PATCHLIST_8:
++    case TOP_PATCHLIST_9:
++    case TOP_PATCHLIST_10:
++    case TOP_PATCHLIST_11:
++    case TOP_PATCHLIST_12:
++    case TOP_PATCHLIST_13:
++    case TOP_PATCHLIST_14:
++    case TOP_PATCHLIST_15:
++    case TOP_PATCHLIST_16:
++    case TOP_PATCHLIST_17:
++    case TOP_PATCHLIST_18:
++    case TOP_PATCHLIST_19:
++    case TOP_PATCHLIST_20:
++    case TOP_PATCHLIST_21:
++    case TOP_PATCHLIST_22:
++    case TOP_PATCHLIST_23:
++    case TOP_PATCHLIST_24:
++    case TOP_PATCHLIST_25:
++    case TOP_PATCHLIST_26:
++    case TOP_PATCHLIST_27:
++    case TOP_PATCHLIST_28:
++    case TOP_PATCHLIST_29:
++    case TOP_PATCHLIST_30:
++    case TOP_PATCHLIST_31:
++    case TOP_PATCHLIST_32:
++        numVerts = topology - TOP_PATCHLIST_BASE;
++        break;
++    default:
++        SWR_ASSERT(false, "Unsupported topology: %d", topology);
++        break;
++    }
++
++    if (includeAdjVerts)
++    {
++        switch (topology)
++        {
++        case TOP_LISTSTRIP_ADJ:
++        case TOP_LINE_LIST_ADJ: numVerts = 4; break;
++        case TOP_TRI_STRIP_ADJ:
++        case TOP_TRI_LIST_ADJ: numVerts = 6; break;
++        default: break;
++        }
++    }
++
++    return numVerts;
++}
++
++//////////////////////////////////////////////////////////////////////////
++/// @brief StreamOut - Streams vertex data out to SO buffers.
++///        Generally, we are only streaming out a SIMD's worth of triangles.
++/// @param pDC - pointer to draw context.
++/// @param pa - The primitive assembly object.
++/// @param workerId - thread's worker id. Each thread has a unique id.
++/// @param pPrimData - Scratch buffer used to stage one primitive's attributes for the SO shader.
++static void StreamOut(
++    DRAW_CONTEXT* pDC,
++    PA_STATE& pa,
++    uint32_t workerId,
++    uint32_t* pPrimData)
++{
++    RDTSC_START(FEStreamout);
++
++    SWR_CONTEXT* pContext = pDC->pContext;
++
++    const API_STATE& state = GetApiState(pDC);
++    const SWR_STREAMOUT_STATE &soState = state.soState;
++
++    uint32_t streamIndex = 0; ///@todo Stream index will come from PA_STATE.
++    uint32_t soVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);
++
++    // The pPrimData buffer is sparse in that we allocate memory for all 32 attributes for each vertex.
++    uint32_t primDataDwordVertexStride = (KNOB_NUM_ATTRIBUTES * sizeof(float) * 4) / sizeof(uint32_t);
++
++    SWR_STREAMOUT_CONTEXT soContext = { 0 };
++
++    // Setup buffer state pointers.
++    for (uint32_t i = 0; i < 4; ++i)
++    {
++        soContext.pBuffer[i] = &state.soBuffer[i];
++    }
++
++    uint32_t numPrims = pa.NumPrims();
++    for (uint32_t primIndex = 0; primIndex < numPrims; ++primIndex)
++    {
++        DWORD slot = 0;
++        uint32_t soMask = soState.streamMasks[streamIndex];
++
++        // Write all entries into primitive data buffer for SOS.
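++        // soMask carries one bit per streamed-out attribute slot; peel it LSB-first
++        // so attributes are staged in slot order, clearing each bit as it is consumed.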
++        while (_BitScanForward(&slot, soMask))
++        {
++            __m128 attrib[MAX_ATTRIBUTES]; // prim attribs (always 4 wide)
++            uint32_t paSlot = slot + VERTEX_ATTRIB_START_SLOT;
++            pa.AssembleSingle(paSlot, primIndex, attrib);
++
++            // Attribute offset is relative offset from start of vertex.
++            // Note that in DX attributes start at slot 1 in the PA buffer, which would
++            // call for (slot - 1) when writing prim data starting at slot 0.
++            // GL works slightly differently and routes attributes starting at slot 0,
++            // so slot is used directly here.
++            uint32_t primDataAttribOffset = slot * sizeof(float) * 4 / sizeof(uint32_t);
++
++            // Store each vertex's attrib at appropriate locations in pPrimData buffer.
++            for (uint32_t v = 0; v < soVertsPerPrim; ++v)
++            {
++                uint32_t* pPrimDataAttrib = pPrimData + primDataAttribOffset + (v * primDataDwordVertexStride);
++
++                _mm_store_ps((float*)pPrimDataAttrib, attrib[v]);
++            }
++            soMask &= ~(1 << slot);
++        }
++
++        // Update pPrimData pointer
++        soContext.pPrimData = pPrimData;
++
++        // Call SOS
++        state.pfnSoFunc[streamIndex](soContext);
++    }
++
++    // Update SO write offset. The driver provides memory for the update.
++    for (uint32_t i = 0; i < 4; ++i)
++    {
++        if (state.soBuffer[i].pWriteOffset)
++        {
++            *state.soBuffer[i].pWriteOffset = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
++
++            // The SOS increments the existing write offset, so the SoWriteOffset stat
++            // is SET to the absolute offset rather than incremented.
++            SET_STAT(SoWriteOffset[i], soContext.pBuffer[i]->streamOffset);
++        }
++    }
++
++    UPDATE_STAT(SoPrimStorageNeeded[streamIndex], soContext.numPrimStorageNeeded);
++    UPDATE_STAT(SoNumPrimsWritten[streamIndex], soContext.numPrimsWritten);
++
++    RDTSC_STOP(FEStreamout, 1, 0);
++}
++
++//////////////////////////////////////////////////////////////////////////
++/// @brief Computes number of invocations. The current index represents
++///        the start of the SIMD; the max index is the last index of all
++///        work items. If fewer than a SIMD's worth of work items remain,
++///        only the remaining count is returned.
++/// @param curIndex - The start index for the SIMD.
++/// @param maxIndex - The last index for all work items.
++static INLINE uint32_t GetNumInvocations(
++    uint32_t curIndex,
++    uint32_t maxIndex)
++{
++    uint32_t remainder = (maxIndex - curIndex);
++    return (remainder >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : remainder;
++}
++
++//////////////////////////////////////////////////////////////////////////
++/// @brief Implements GS stage.
++/// @param pDC - pointer to draw context.
++/// @param workerId - thread's worker id. Each thread has a unique id.
++/// @param pa - The primitive assembly object.
++/// @param pGsOut - output stream for GS ++template < ++ bool HasStreamOutT, ++ bool HasRastT> ++static void GeometryShaderStage( ++ DRAW_CONTEXT *pDC, ++ uint32_t workerId, ++ PA_STATE& pa, ++ void* pGsOut, ++ void* pCutBuffer, ++ uint32_t* pSoPrimData, ++ simdscalari primID) ++{ ++ RDTSC_START(FEGeometryShader); ++ ++ SWR_GS_CONTEXT gsContext; ++ SWR_CONTEXT* pContext = pDC->pContext; ++ ++ const API_STATE& state = GetApiState(pDC); ++ const SWR_GS_STATE* pState = &state.gsState; ++ ++ SWR_ASSERT(pGsOut != nullptr, "GS output buffer should be initialized"); ++ SWR_ASSERT(pCutBuffer != nullptr, "GS output cut buffer should be initialized"); ++ ++ gsContext.pStream[0] = (uint8_t*)pGsOut; ++ gsContext.pCutBuffer = (uint8_t*)pCutBuffer; ++ gsContext.PrimitiveID = primID; ++ ++ uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, true); ++ simdvector attrib[MAX_ATTRIBUTES]; ++ ++ // assemble all attributes for the input primitive ++ for (uint32_t slot = 0; slot < pState->numInputAttribs; ++slot) ++ { ++ uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + slot; ++ pa.Assemble(attribSlot, attrib); ++ ++ for (uint32_t i = 0; i < numVertsPerPrim; ++i) ++ { ++ gsContext.vert[i].attrib[attribSlot] = attrib[i]; ++ } ++ } ++ ++ // assemble position ++ pa.Assemble(VERTEX_POSITION_SLOT, attrib); ++ for (uint32_t i = 0; i < numVertsPerPrim; ++i) ++ { ++ gsContext.vert[i].attrib[VERTEX_POSITION_SLOT] = attrib[i]; ++ } ++ ++ const uint32_t vertexStride = sizeof(simdvertex); ++ const uint32_t numSimdBatches = (state.gsState.maxNumVerts + KNOB_SIMD_WIDTH - 1) / KNOB_SIMD_WIDTH; ++ const uint32_t inputPrimStride = numSimdBatches * vertexStride; ++ const uint32_t instanceStride = inputPrimStride * KNOB_SIMD_WIDTH; ++ const uint32_t cutPrimStride = (state.gsState.maxNumVerts + 7) / 8; ++ const uint32_t cutInstanceStride = cutPrimStride * KNOB_SIMD_WIDTH; ++ for (uint32_t instance = 0; instance < pState->instanceCount; ++instance) ++ { ++ gsContext.InstanceID = instance; ++ ++ // execute the geometry shader ++ state.pfnGsFunc(GetPrivateState(pDC), &gsContext); ++ ++ gsContext.pStream[0] += instanceStride; ++ gsContext.pCutBuffer += cutInstanceStride; ++ } ++ ++ // record valid prims from the frontend to avoid over binning the newly generated ++ // prims from the GS ++ uint32_t numInputPrims = pa.NumPrims(); ++ ++ // set up new binner and state for the GS output topology ++ PFN_PROCESS_PRIMS pfnClipFunc = nullptr; ++ if (HasRastT) ++ { ++ switch (pState->outputTopology) ++ { ++ case TOP_TRIANGLE_STRIP: pfnClipFunc = ClipTriangles; break; ++ case TOP_LINE_STRIP: pfnClipFunc = ClipLines; break; ++ case TOP_POINT_LIST: pfnClipFunc = ClipPoints; break; ++ default: SWR_ASSERT(false, "Unexpected GS output topology: %d", pState->outputTopology); ++ } ++ } ++ ++ // foreach input prim: ++ // - setup a new PA based on the emitted verts for that prim ++ // - loop over the new verts, calling PA to assemble each prim ++ uint32_t* pVertexCount = (uint32_t*)&gsContext.vertexCount; ++ uint32_t* pPrimitiveId = (uint32_t*)&primID; ++ ++ uint32_t totalPrimsGenerated = 0; ++ for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim) ++ { ++ uint8_t* pInstanceBase = (uint8_t*)pGsOut + inputPrim * inputPrimStride; ++ uint8_t* pCutBufferBase = (uint8_t*)pCutBuffer + inputPrim * cutPrimStride; ++ for (uint32_t instance = 0; instance < pState->instanceCount; ++instance) ++ { ++ uint32_t numEmittedVerts = pVertexCount[inputPrim]; ++ if (numEmittedVerts == 0) ++ { ++ continue; ++ } ++ ++ uint8_t* pBase = pInstanceBase + 
instance * instanceStride; ++ uint8_t* pCutBase = pCutBufferBase + instance * cutInstanceStride; ++ ++ DWORD numAttribs; ++ _BitScanReverse(&numAttribs, state.feAttribMask); ++ numAttribs++; ++ ++ PA_STATE_CUT gsPa(pDC, pBase, numEmittedVerts, pCutBase, numEmittedVerts, numAttribs, pState->outputTopology, true); ++ ++ while (gsPa.GetNextStreamOutput()) ++ { ++ do ++ { ++ bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, attrib); ++ ++ if (assemble) ++ { ++ totalPrimsGenerated += gsPa.NumPrims(); ++ ++ if (HasStreamOutT) ++ { ++ StreamOut(pDC, gsPa, workerId, pSoPrimData); ++ } ++ ++ if (HasRastT) ++ { ++ simdscalari vPrimId; ++ // pull primitiveID from the GS output if available ++ if (state.gsState.emitsPrimitiveID) ++ { ++ simdvector primIdAttrib[3]; ++ gsPa.Assemble(VERTEX_PRIMID_SLOT, primIdAttrib); ++ vPrimId = _simd_castps_si(primIdAttrib[0].x); ++ } ++ else ++ { ++ vPrimId = _simd_set1_epi32(pPrimitiveId[inputPrim]); ++ } ++ ++ pfnClipFunc(pDC, gsPa, workerId, attrib, GenMask(gsPa.NumPrims()), vPrimId); ++ } ++ } ++ } while (gsPa.NextPrim()); ++ } ++ } ++ } ++ ++ // update GS pipeline stats ++ UPDATE_STAT(GsInvocations, numInputPrims * pState->instanceCount); ++ UPDATE_STAT(GsPrimitives, totalPrimsGenerated); ++ ++ RDTSC_STOP(FEGeometryShader, 1, 0); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief Allocate GS buffers ++/// @param pDC - pointer to draw context. ++/// @param state - API state ++/// @param ppGsOut - pointer to GS output buffer allocation ++/// @param ppCutBuffer - pointer to GS output cut buffer allocation ++static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC, const API_STATE& state, void** ppGsOut, void** ppCutBuffer) ++{ ++ SWR_ASSERT(state.gsState.gsEnable); ++ // allocate arena space to hold GS output verts ++ // @todo pack attribs ++ // @todo support multiple streams ++ const uint32_t vertexStride = sizeof(simdvertex); ++ const uint32_t numSimdBatches = (state.gsState.maxNumVerts + KNOB_SIMD_WIDTH - 1) / KNOB_SIMD_WIDTH; ++ uint32_t size = state.gsState.instanceCount * numSimdBatches * vertexStride * KNOB_SIMD_WIDTH; ++ *ppGsOut = pDC->arena.AllocAligned(size, KNOB_SIMD_WIDTH * sizeof(float)); ++ ++ // allocate arena space to hold cut buffer, which is essentially a bitfield sized to the ++ // maximum vertex output as defined by the GS state, per SIMD lane, per GS instance ++ const uint32_t cutPrimStride = (state.gsState.maxNumVerts + 7) / 8; ++ const uint32_t cutBufferSize = cutPrimStride * state.gsState.instanceCount * KNOB_SIMD_WIDTH; ++ *ppCutBuffer = pDC->arena.AllocAligned(cutBufferSize, KNOB_SIMD_WIDTH * sizeof(float)); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief Generate mask from remaining work. ++/// @param numWorkItems - Number of items being worked on by a SIMD. ++static INLINE simdscalari GenerateMask(uint32_t numWorkItems) ++{ ++ uint32_t numActive = (numWorkItems >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : numWorkItems; ++ uint32_t mask = (numActive > 0) ? ((1 << numActive) - 1) : 0; ++ return _simd_castps_si(vMask(mask)); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief Contains all data generated by the HS and passed to the ++/// tessellator and DS. 
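++///        Allocated lazily, once per worker thread, and reused across draws.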
++struct TessellationThreadLocalData
++{
++    ScalarPatch patchData[KNOB_SIMD_WIDTH];
++    void* pTxCtx;
++    size_t tsCtxSize;
++
++    simdscalar* pDSOutput;
++    size_t numDSOutputVectors;
++};
++
++THREAD TessellationThreadLocalData* gt_pTessellationThreadData = nullptr;
++
++//////////////////////////////////////////////////////////////////////////
++/// @brief Allocate tessellation data for this worker thread.
++INLINE
++static void AllocateTessellationData(SWR_CONTEXT* pContext)
++{
++    /// @TODO - Don't use thread local storage. Use Worker local storage instead.
++    if (gt_pTessellationThreadData == nullptr)
++    {
++        gt_pTessellationThreadData = (TessellationThreadLocalData*)
++            _aligned_malloc(sizeof(TessellationThreadLocalData), 64);
++        memset(gt_pTessellationThreadData, 0, sizeof(*gt_pTessellationThreadData));
++    }
++}
++
++//////////////////////////////////////////////////////////////////////////
++/// @brief Implements Tessellation Stages.
++/// @param pDC - pointer to draw context.
++/// @param workerId - thread's worker id. Each thread has a unique id.
++/// @param pa - The primitive assembly object.
++/// @param pGsOut - output stream for GS
++template <
++    bool HasGeometryShaderT,
++    bool HasStreamOutT,
++    bool HasRastT>
++static void TessellationStages(
++    DRAW_CONTEXT *pDC,
++    uint32_t workerId,
++    PA_STATE& pa,
++    void* pGsOut,
++    void* pCutBuffer,
++    uint32_t* pSoPrimData,
++    simdscalari primID)
++{
++    const API_STATE& state = GetApiState(pDC);
++    const SWR_TS_STATE& tsState = state.tsState;
++    SWR_CONTEXT *pContext = pDC->pContext; // Needed for UPDATE_STATS macro
++
++    SWR_ASSERT(gt_pTessellationThreadData);
++
++    HANDLE tsCtx = TSInitCtx(
++        tsState.domain,
++        tsState.partitioning,
++        tsState.tsOutputTopology,
++        gt_pTessellationThreadData->pTxCtx,
++        gt_pTessellationThreadData->tsCtxSize);
++    if (tsCtx == nullptr)
++    {
++        gt_pTessellationThreadData->pTxCtx = _aligned_malloc(gt_pTessellationThreadData->tsCtxSize, 64);
++        tsCtx = TSInitCtx(
++            tsState.domain,
++            tsState.partitioning,
++            tsState.tsOutputTopology,
++            gt_pTessellationThreadData->pTxCtx,
++            gt_pTessellationThreadData->tsCtxSize);
++    }
++    SWR_ASSERT(tsCtx);
++
++    PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
++    if (HasRastT)
++    {
++        switch (tsState.postDSTopology)
++        {
++        case TOP_TRIANGLE_LIST: pfnClipFunc = ClipTriangles; break;
++        case TOP_LINE_LIST: pfnClipFunc = ClipLines; break;
++        case TOP_POINT_LIST: pfnClipFunc = ClipPoints; break;
++        default: SWR_ASSERT(false, "Unexpected DS output topology: %d", tsState.postDSTopology);
++        }
++    }
++
++    SWR_HS_CONTEXT hsContext;
++    hsContext.pCPout = gt_pTessellationThreadData->patchData;
++    hsContext.PrimitiveID = primID;
++
++    uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);
++    // Max storage for one attribute for an entire simdprimitive
++    simdvector simdattrib[MAX_NUM_VERTS_PER_PRIM];
++
++    // assemble all attributes for the input primitives
++    for (uint32_t slot = 0; slot < tsState.numHsInputAttribs; ++slot)
++    {
++        uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + slot;
++        pa.Assemble(attribSlot, simdattrib);
++
++        for (uint32_t i = 0; i < numVertsPerPrim; ++i)
++        {
++            hsContext.vert[i].attrib[attribSlot] = simdattrib[i];
++        }
++    }
++
++#if defined(_DEBUG)
++    memset(hsContext.pCPout, 0x90, sizeof(ScalarPatch) * KNOB_SIMD_WIDTH);
++#endif
++
++    // Run the HS
++    RDTSC_START(FEHullShader);
++    state.pfnHsFunc(GetPrivateState(pDC), &hsContext);
++    RDTSC_STOP(FEHullShader, 0, 0);
++
++    uint32_t numPrims = pa.NumPrims();
++    UPDATE_STAT(HsInvocations, numPrims);
++
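++    // The tessellator itself runs scalar: each HS output patch in the SIMD batch
++    // is tessellated individually below, with the resulting domain points then
++    // shaded by the DS a SIMD's worth at a time.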
++ const uint32_t* pPrimId = (const uint32_t*)&primID; ++ ++ for (uint32_t p = 0; p < numPrims; ++p) ++ { ++ // Run Tessellator ++ SWR_TS_TESSELLATED_DATA tsData = { 0 }; ++ RDTSC_START(FETessellation); ++ TSTessellate(tsCtx, hsContext.pCPout[p].tessFactors, tsData); ++ RDTSC_STOP(FETessellation, 0, 0); ++ ++ if (tsData.NumPrimitives == 0) ++ { ++ continue; ++ } ++ SWR_ASSERT(tsData.NumDomainPoints); ++ ++ // Allocate DS Output memory ++ uint32_t requiredDSVectorInvocations = AlignUp(tsData.NumDomainPoints, KNOB_SIMD_WIDTH) / KNOB_SIMD_WIDTH; ++ size_t requiredDSOutputVectors = requiredDSVectorInvocations * tsState.numDsOutputAttribs; ++ if (requiredDSOutputVectors > gt_pTessellationThreadData->numDSOutputVectors) ++ { ++ _aligned_free(gt_pTessellationThreadData->pDSOutput); ++ gt_pTessellationThreadData->pDSOutput = (simdscalar*)_aligned_malloc(sizeof(simdvector) * requiredDSOutputVectors, 64); ++ gt_pTessellationThreadData->numDSOutputVectors = requiredDSOutputVectors; ++ } ++ SWR_ASSERT(gt_pTessellationThreadData->pDSOutput); ++ SWR_ASSERT(gt_pTessellationThreadData->numDSOutputVectors >= requiredDSOutputVectors); ++ ++ // Run Domain Shader ++ SWR_DS_CONTEXT dsContext; ++ dsContext.PrimitiveID = pPrimId[p]; ++ dsContext.pCpIn = &hsContext.pCPout[p]; ++ dsContext.pDomainU = (simdscalar*)tsData.pDomainPointsU; ++ dsContext.pDomainV = (simdscalar*)tsData.pDomainPointsV; ++ dsContext.pOutputData = gt_pTessellationThreadData->pDSOutput; ++ dsContext.vectorStride = requiredDSVectorInvocations; ++ ++ for (dsContext.vectorOffset = 0; dsContext.vectorOffset < requiredDSVectorInvocations; ++dsContext.vectorOffset) ++ { ++ RDTSC_START(FEDomainShader); ++ state.pfnDsFunc(GetPrivateState(pDC), &dsContext); ++ RDTSC_STOP(FEDomainShader, 0, 0); ++ } ++ UPDATE_STAT(DsInvocations, tsData.NumDomainPoints); ++ ++ PA_TESS tessPa( ++ pDC, ++ dsContext.pOutputData, ++ dsContext.vectorStride, ++ tsState.numDsOutputAttribs, ++ tsData.ppIndices, ++ tsData.NumPrimitives, ++ tsState.postDSTopology); ++ ++ while (tessPa.HasWork()) ++ { ++ simdvector prim[3]; // Only deal with triangles, lines, or points ++ // PaAssemble returns false if there is not enough verts to assemble. ++ RDTSC_START(FEPAAssemble); ++ bool assemble = tessPa.Assemble(VERTEX_POSITION_SLOT, prim); ++ RDTSC_STOP(FEPAAssemble, 1, 0); ++ ++ if (assemble) ++ { ++ if (HasGeometryShaderT) ++ { ++ GeometryShaderStage( ++ pDC, workerId, tessPa, pGsOut, pCutBuffer, pSoPrimData, ++ _simd_set1_epi32(dsContext.PrimitiveID)); ++ } ++ else ++ { ++ if (HasStreamOutT) ++ { ++ StreamOut(pDC, tessPa, workerId, pSoPrimData); ++ } ++ ++ if (HasRastT) ++ { ++ SWR_ASSERT(pfnClipFunc); ++ pfnClipFunc(pDC, tessPa, workerId, prim, ++ GenMask(tessPa.NumPrims()), primID); ++ } ++ } ++ } // if (assemble) ++ ++ tessPa.NextPrim(); ++ ++ } // while (tessPa.HasWork()) ++ } // for (uint32_t p = 0; p < numPrims; ++p) ++ ++ TSDestroyCtx(tsCtx); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief FE handler for SwrDraw. ++/// @tparam IsIndexedT - Is indexed drawing enabled ++/// @tparam HasTessellationT - Is tessellation enabled ++/// @tparam HasGeometryShaderT - Is the geometry shader stage enabled ++/// @tparam HasStreamOutT - Is stream-out enabled ++/// @tparam HasRastT - Is rasterization enabled ++/// @param pContext - pointer to SWR context. ++/// @param pDC - pointer to draw context. ++/// @param workerId - thread's worker id. 
++/// @param pUserData - Pointer to DRAW_WORK ++template < ++ bool IsIndexedT, ++ bool HasTessellationT, ++ bool HasGeometryShaderT, ++ bool HasStreamOutT, ++ bool HasRastT> ++void ProcessDraw( ++ SWR_CONTEXT *pContext, ++ DRAW_CONTEXT *pDC, ++ uint32_t workerId, ++ void *pUserData) ++{ ++ ++#if KNOB_ENABLE_TOSS_POINTS ++ if (KNOB_TOSS_QUEUE_FE) ++ { ++ pDC->doneFE = 1; ++ return; ++ } ++#endif ++ ++ RDTSC_START(FEProcessDraw); ++ ++ DRAW_WORK& work = *(DRAW_WORK*)pUserData; ++ const API_STATE& state = GetApiState(pDC); ++ __m256i vScale = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); ++ SWR_VS_CONTEXT vsContext; ++ simdvertex vin; ++ ++ int indexSize = 0; ++ int32_t endVertex = work.numVerts; ++ const int32_t* pLastRequestedIndex = nullptr; ++ if (IsIndexedT) ++ { ++ switch (work.type) ++ { ++ case R32_UINT: ++ indexSize = sizeof(uint32_t); ++ pLastRequestedIndex = &(work.pIB[endVertex]); ++ break; ++ case R16_UINT: ++ indexSize = sizeof(uint16_t); ++ // nasty address offset to last index ++ pLastRequestedIndex = (int32_t*)(&(((uint16_t*)work.pIB)[endVertex])); ++ break; ++ case R8_UINT: ++ indexSize = sizeof(uint8_t); ++ // nasty address offset to last index ++ pLastRequestedIndex = (int32_t*)(&(((uint8_t*)work.pIB)[endVertex])); ++ break; ++ default: ++ SWR_ASSERT(0); ++ } ++ } ++ ++ SWR_FETCH_CONTEXT fetchInfo = { 0 }; ++ fetchInfo.pStreams = &state.vertexBuffers[0]; ++ fetchInfo.StartInstance = work.startInstance; ++ fetchInfo.StartVertex = 0; ++ ++ vsContext.pVin = &vin; ++ ++ if (IsIndexedT) ++ { ++ fetchInfo.BaseVertex = work.baseVertex; ++ ++ // if the entire index buffer isn't being consumed, set the last index ++ // so that fetches < a SIMD wide will be masked off ++ fetchInfo.pLastIndex = (const int32_t*)(((BYTE*)state.indexBuffer.pIndices) + state.indexBuffer.size); ++ if (pLastRequestedIndex < fetchInfo.pLastIndex) ++ { ++ fetchInfo.pLastIndex = pLastRequestedIndex; ++ } ++ } ++ else ++ { ++ fetchInfo.StartVertex = work.startVertex; ++ } ++ ++#ifdef KNOB_ENABLE_RDTSC ++ uint32_t numPrims = GetNumPrims(state.topology, work.numVerts); ++#endif ++ ++ void* pGsOut = nullptr; ++ void* pCutBuffer = nullptr; ++ if (HasGeometryShaderT) ++ { ++ AllocateGsBuffers(pDC, state, &pGsOut, &pCutBuffer); ++ } ++ ++ if (HasTessellationT) ++ { ++ SWR_ASSERT(state.tsState.tsEnable == true); ++ SWR_ASSERT(state.pfnHsFunc != nullptr); ++ SWR_ASSERT(state.pfnDsFunc != nullptr); ++ ++ AllocateTessellationData(pContext); ++ } ++ else ++ { ++ SWR_ASSERT(state.tsState.tsEnable == false); ++ SWR_ASSERT(state.pfnHsFunc == nullptr); ++ SWR_ASSERT(state.pfnDsFunc == nullptr); ++ } ++ ++ // allocate space for streamout input prim data ++ uint32_t* pSoPrimData = nullptr; ++ if (HasStreamOutT) ++ { ++ pSoPrimData = (uint32_t*)pDC->arena.AllocAligned(4096, 16); ++ } ++ ++ // choose primitive assembler ++ PA_FACTORY paFactory(pDC, IsIndexedT, state.topology, work.numVerts); ++ PA_STATE& pa = paFactory.GetPA(); ++ ++ /// @todo: temporarily move instance loop in the FE to ensure SO ordering ++ for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++) ++ { ++ simdscalari vIndex; ++ int32_t i = 0; ++ ++ if (IsIndexedT) ++ { ++ fetchInfo.pIndices = work.pIB; ++ } ++ else ++ { ++ vIndex = _simd_add_epi32(_simd_set1_epi32(i), vScale); ++ fetchInfo.pIndices = (const int32_t*)&vIndex; ++ } ++ ++ fetchInfo.CurInstance = instanceNum; ++ vsContext.InstanceID = instanceNum; ++ ++ while (pa.HasWork()) ++ { ++ // PaGetNextVsOutput currently has the side effect of updating some PA state machine state. 
++ // So we need to keep this outside of (i < endVertex) check. ++ simdmask* pvCutIndices = nullptr; ++ if (IsIndexedT) ++ { ++ pvCutIndices = &pa.GetNextVsIndices(); ++ } ++ ++ simdvertex& vout = pa.GetNextVsOutput(); ++ vsContext.pVout = &vout; ++ ++ if (i < endVertex) ++ { ++ ++ // 1. Execute FS/VS for a single SIMD. ++ RDTSC_START(FEFetchShader); ++ state.pfnFetchFunc(fetchInfo, vin); ++ RDTSC_STOP(FEFetchShader, 0, 0); ++ ++ // forward fetch generated vertex IDs to the vertex shader ++ vsContext.VertexID = fetchInfo.VertexID; ++ ++ // Setup active mask for vertex shader. ++ vsContext.mask = GenerateMask(endVertex - i); ++ ++ // forward cut mask to the PA ++ if (IsIndexedT) ++ { ++ *pvCutIndices = _simd_movemask_ps(_simd_castsi_ps(fetchInfo.CutMask)); ++ } ++ ++ UPDATE_STAT(IaVertices, GetNumInvocations(i, endVertex)); ++ ++#if KNOB_ENABLE_TOSS_POINTS ++ if (!KNOB_TOSS_FETCH) ++#endif ++ { ++ RDTSC_START(FEVertexShader); ++ state.pfnVertexFunc(GetPrivateState(pDC), &vsContext); ++ RDTSC_STOP(FEVertexShader, 0, 0); ++ ++ UPDATE_STAT(VsInvocations, GetNumInvocations(i, endVertex)); ++ } ++ } ++ ++ // 2. Assemble primitives given the last two SIMD. ++ do ++ { ++ simdvector prim[MAX_NUM_VERTS_PER_PRIM]; ++ // PaAssemble returns false if there is not enough verts to assemble. ++ RDTSC_START(FEPAAssemble); ++ bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim); ++ RDTSC_STOP(FEPAAssemble, 1, 0); ++ ++#if KNOB_ENABLE_TOSS_POINTS ++ if (!KNOB_TOSS_FETCH) ++#endif ++ { ++#if KNOB_ENABLE_TOSS_POINTS ++ if (!KNOB_TOSS_VS) ++#endif ++ { ++ if (assemble) ++ { ++ UPDATE_STAT(IaPrimitives, pa.NumPrims()); ++ ++ if (HasTessellationT) ++ { ++ TessellationStages( ++ pDC, workerId, pa, pGsOut, pCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID)); ++ } ++ else if (HasGeometryShaderT) ++ { ++ GeometryShaderStage( ++ pDC, workerId, pa, pGsOut, pCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID)); ++ } ++ else ++ { ++ // If streamout is enabled then stream vertices out to memory. 
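++                                // (stream-out is invoked ahead of clipping/binning
++                                // so SO buffer writes retain submission order)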
++                                if (HasStreamOutT)
++                                {
++                                    StreamOut(pDC, pa, workerId, pSoPrimData);
++                                }
++
++                                if (HasRastT)
++                                {
++                                    SWR_ASSERT(pDC->pState->pfnProcessPrims);
++                                    pDC->pState->pfnProcessPrims(pDC, pa, workerId, prim,
++                                        GenMask(pa.NumPrims()), pa.GetPrimID(work.startPrimID));
++                                }
++                            }
++                        }
++                    }
++                }
++            } while (pa.NextPrim());
++
++            i += KNOB_SIMD_WIDTH;
++            if (IsIndexedT)
++            {
++                fetchInfo.pIndices = (int*)((BYTE*)fetchInfo.pIndices + KNOB_SIMD_WIDTH * indexSize);
++            }
++            else
++            {
++                vIndex = _simd_add_epi32(vIndex, _simd_set1_epi32(KNOB_SIMD_WIDTH));
++            }
++        }
++        pa.Reset();
++    }
++
++    _ReadWriteBarrier();
++    pDC->doneFE = true;
++    RDTSC_STOP(FEProcessDraw, numPrims * work.numInstances, pDC->drawId);
++}
++// Explicit Instantiation of all combinations of
++// <IsIndexedT, HasTessellationT, HasGeometryShaderT, HasStreamOutT, HasRastT>
++template void ProcessDraw<false, false, false, false, false>(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
++template void ProcessDraw<false, false, false, false, true>(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
++template void ProcessDraw<false, false, false, true, false>(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
++template void ProcessDraw<false, false, false, true, true>(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
++template void ProcessDraw<false, false, true, false, false>(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
++template void ProcessDraw<false, false, true, false, true>(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
++template void ProcessDraw<false, false, true, true, false>(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
++template void ProcessDraw<false, false, true, true, true>(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
++template void ProcessDraw<false, true, false, false, false>(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
++template void ProcessDraw<false, true, false, false, true>(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
++template void ProcessDraw<false, true, false, true, false>(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
++template void ProcessDraw<false, true, false, true, true>(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
++template void ProcessDraw<false, true, true, false, false>(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
++template void ProcessDraw<false, true, true, false, true>(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
++template void ProcessDraw<false, true, true, true, false>(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
++template void ProcessDraw<false, true, true, true, true>(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
++template void ProcessDraw<true, false, false, false, false>(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
++template void ProcessDraw<true, false, false, false, true>(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
++template void ProcessDraw<true, false, false, true, false>(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
++template void ProcessDraw<true, false, false, true, true>(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
++template void ProcessDraw<true, false, true, false, false>(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
++template void ProcessDraw<true, false, true, false, true>(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
++template void ProcessDraw<true, false, true, true, false>(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
++template void ProcessDraw<true, false, true, true, true>(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
++template void ProcessDraw<true, true, false, false, false>(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
++template void ProcessDraw<true, true, false, false, true>(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
++template void ProcessDraw<true, true, false, true, false>(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
++template void ProcessDraw<true, true, false, true, true>(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
++template void ProcessDraw<true, true, true, false, false>(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
++template void ProcessDraw<true, true, true, false, true>(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
++template void ProcessDraw<true, true, true, true, false>(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
++template void ProcessDraw<true, true, true, true, true>(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
++
++
++//////////////////////////////////////////////////////////////////////////
++/// @brief Expand points to give them area.
++/// @param tri - SOA vertices for triangles.
++/// @param size - point size for each SIMD lane.
++static INLINE void ExpandPoint(simdvector tri[3], simdscalar size)
++{
++    const float bloat = 0.5f;
++
++    const __m256 vAdjust0X = _mm256_set_ps(-bloat, -bloat, -bloat, -bloat, -bloat, -bloat, -bloat, -bloat);
++    const __m256 vAdjust0Y = _mm256_set_ps(-bloat, -bloat, -bloat, -bloat, -bloat, -bloat, -bloat, -bloat);
++    const __m256 vAdjust1X = _mm256_set_ps(bloat, -bloat, bloat, -bloat, bloat, -bloat, bloat, -bloat);
++    const __m256 vAdjust1Y = _mm256_set_ps(bloat, bloat, bloat, bloat, bloat, bloat, bloat, bloat);
++    const __m256 vAdjust2X = _mm256_set_ps(bloat, bloat, bloat, bloat, bloat, bloat, bloat, bloat);
++    const __m256 vAdjust2Y = _mm256_set_ps(-bloat, bloat, -bloat, bloat, -bloat, bloat, -bloat, bloat);
++
++    tri[0].x = _simd_fmadd_ps(size, vAdjust0X, tri[0].x);
++    tri[0].y = _simd_fmadd_ps(size, vAdjust0Y, tri[0].y);
++    tri[1].x = _simd_fmadd_ps(size, vAdjust1X, tri[1].x);
++    tri[1].y = _simd_fmadd_ps(size, vAdjust1Y, tri[1].y);
++    tri[2].x = _simd_fmadd_ps(size, vAdjust2X, tri[2].x);
++    tri[2].y = _simd_fmadd_ps(size, vAdjust2Y, tri[2].y);
++}
++
++//////////////////////////////////////////////////////////////////////////
++/// @brief Processes attributes for the backend based on linkage mask and
++///        linkage map. Essentially just doing an SOA->AOS conversion and pack.
++/// @param pDC - Draw context
++/// @param pa - Primitive Assembly state
++/// @param linkageMask - Specifies which VS outputs are routed to PS.
++/// @param pLinkageMap - maps VS attribute slot to PS slot
++/// @param triIndex - Triangle to process attributes for
++/// @param pBuffer - Output result
++template <uint32_t NumVerts>
++INLINE void ProcessAttributes(
++    DRAW_CONTEXT *pDC,
++    PA_STATE& pa,
++    uint32_t linkageMask,
++    const uint8_t* pLinkageMap,
++    uint32_t triIndex,
++    float *pBuffer)
++{
++    DWORD slot = 0;
++    uint32_t mapIdx = 0;
++    while (_BitScanForward(&slot, linkageMask))
++    {
++        linkageMask &= ~(1 << slot); // done with this bit.
++
++        // compute absolute slot in vertex attrib array
++        uint32_t inputSlot = VERTEX_ATTRIB_START_SLOT + pLinkageMap[mapIdx++];
++
++        __m128 attrib[3]; // triangle attribs (always 4 wide)
++        pa.AssembleSingle(inputSlot, triIndex, attrib);
++
++        for (uint32_t i = 0; i < NumVerts; ++i)
++        {
++            _mm_store_ps(pBuffer, attrib[i]);
++            pBuffer += 4;
++        }
++
++        // pad out the attrib buffer to 3 verts to ensure the triangle
++        // interpolation code in the pixel shader works correctly for the
++        // 3 topologies - point, line, tri. This effectively zeros out the
++        // effect of the missing vertices in the triangle interpolation.
++        for (uint32_t i = NumVerts; i < 3; ++i)
++        {
++            _mm_store_ps(pBuffer, attrib[NumVerts - 1]);
++            pBuffer += 4;
++        }
++    }
++}
++
++//////////////////////////////////////////////////////////////////////////
++/// @brief Processes enabled user clip distances. Loads the active clip
++///        distances from the PA, sets up barycentric equations, and
++///        stores the results to the output buffer
++/// @param pa - Primitive Assembly state
++/// @param primIndex - primitive index to process
++/// @param clipDistMask - mask of enabled clip distances
++/// @param pUserClipBuffer - buffer to store results
++template <uint32_t NumVerts>
++void ProcessUserClipDist(PA_STATE& pa, uint32_t primIndex, uint8_t clipDistMask, float* pUserClipBuffer)
++{
++    DWORD clipDist;
++    while (_BitScanForward(&clipDist, clipDistMask))
++    {
++        clipDistMask &= ~(1 << clipDist);
++        uint32_t clipSlot = clipDist >> 2;
++        uint32_t clipComp = clipDist & 0x3;
++        uint32_t clipAttribSlot = clipSlot == 0 ?
++            VERTEX_CLIPCULL_DIST_LO_SLOT : VERTEX_CLIPCULL_DIST_HI_SLOT;
++
++        __m128 primClipDist[3];
++        pa.AssembleSingle(clipAttribSlot, primIndex, primClipDist);
++
++        float vertClipDist[NumVerts];
++        for (uint32_t e = 0; e < NumVerts; ++e)
++        {
++            OSALIGNSIMD(float) aVertClipDist[4];
++            _mm_store_ps(aVertClipDist, primClipDist[e]);
++            vertClipDist[e] = aVertClipDist[clipComp];
++        }
++
++        // setup plane equations for barycentric interpolation in the backend
++        float baryCoeff[NumVerts];
++        for (uint32_t e = 0; e < NumVerts - 1; ++e)
++        {
++            baryCoeff[e] = vertClipDist[e] - vertClipDist[NumVerts - 1];
++        }
++        baryCoeff[NumVerts - 1] = vertClipDist[NumVerts - 1];
++
++        for (uint32_t e = 0; e < NumVerts; ++e)
++        {
++            *(pUserClipBuffer++) = baryCoeff[e];
++        }
++    }
++}
++
++//////////////////////////////////////////////////////////////////////////
++/// @brief Bin triangle primitives to macro tiles. Performs setup, clipping,
++///        culling, viewport transform, etc.
++/// @param pDC - pointer to draw context.
++/// @param pa - The primitive assembly object.
++/// @param workerId - thread's worker id. Each thread has a unique id.
++/// @param tri - Contains triangle position data for a SIMD's worth of triangles.
++/// @param primID - Primitive ID for each triangle.
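++/// @param triMask - Mask of SIMD lanes containing valid triangles.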
++void BinTriangles( ++ DRAW_CONTEXT *pDC, ++ PA_STATE& pa, ++ uint32_t workerId, ++ simdvector tri[3], ++ uint32_t triMask, ++ simdscalari primID) ++{ ++ RDTSC_START(FEBinTriangles); ++ ++ const API_STATE& state = GetApiState(pDC); ++ const SWR_RASTSTATE& rastState = state.rastState; ++ const SWR_FRONTEND_STATE& feState = state.frontendState; ++ const SWR_GS_STATE& gsState = state.gsState; ++ ++ // Simple wireframe mode for debugging purposes only ++ ++ simdscalar vRecipW0 = _simd_set1_ps(1.0f); ++ simdscalar vRecipW1 = _simd_set1_ps(1.0f); ++ simdscalar vRecipW2 = _simd_set1_ps(1.0f); ++ ++ if (!feState.vpTransformDisable) ++ { ++ // perspective divide ++ vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), tri[0].w); ++ vRecipW1 = _simd_div_ps(_simd_set1_ps(1.0f), tri[1].w); ++ vRecipW2 = _simd_div_ps(_simd_set1_ps(1.0f), tri[2].w); ++ ++ tri[0].v[0] = _simd_mul_ps(tri[0].v[0], vRecipW0); ++ tri[1].v[0] = _simd_mul_ps(tri[1].v[0], vRecipW1); ++ tri[2].v[0] = _simd_mul_ps(tri[2].v[0], vRecipW2); ++ ++ tri[0].v[1] = _simd_mul_ps(tri[0].v[1], vRecipW0); ++ tri[1].v[1] = _simd_mul_ps(tri[1].v[1], vRecipW1); ++ tri[2].v[1] = _simd_mul_ps(tri[2].v[1], vRecipW2); ++ ++ tri[0].v[2] = _simd_mul_ps(tri[0].v[2], vRecipW0); ++ tri[1].v[2] = _simd_mul_ps(tri[1].v[2], vRecipW1); ++ tri[2].v[2] = _simd_mul_ps(tri[2].v[2], vRecipW2); ++ ++ // viewport transform to screen coords ++ viewportTransform<3>(tri, state.vpMatrix[0]); ++ } ++ ++ // bloat points to tri ++ if (pa.binTopology == TOP_POINT_LIST) ++ { ++ if (rastState.pointParam) ++ { ++ simdvector size[3]; ++ pa.Assemble(rastState.pointSizeAttrib, size); ++ ExpandPoint(tri, size[0].x); ++ } ++ else ++ { ++ ExpandPoint(tri, _simd_set1_ps(rastState.pointSize)); ++ } ++ } ++ ++ // convert to fixed point ++ simdscalari vXi[3], vYi[3]; ++ vXi[0] = fpToFixedPointVertical(tri[0].x); ++ vYi[0] = fpToFixedPointVertical(tri[0].y); ++ vXi[1] = fpToFixedPointVertical(tri[1].x); ++ vYi[1] = fpToFixedPointVertical(tri[1].y); ++ vXi[2] = fpToFixedPointVertical(tri[2].x); ++ vYi[2] = fpToFixedPointVertical(tri[2].y); ++ ++ // triangle setup ++ simdscalari vAi[3], vBi[3]; ++ triangleSetupABIntVertical(vXi, vYi, vAi, vBi); ++ ++ // determinant ++ simdscalari vDet[2]; ++ calcDeterminantIntVertical(vAi, vBi, vDet); ++ ++ // cull zero area ++ int maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet[0], _simd_setzero_si()))); ++ int maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet[1], _simd_setzero_si()))); ++ ++ int cullZeroAreaMask = maskLo | ((maskHi << KNOB_SIMD_WIDTH / 2)); ++ ++ uint32_t origTriMask = triMask; ++ triMask &= ~cullZeroAreaMask; ++ ++ // determine front winding tris ++ // CW +det ++ // CCW -det ++ maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet[0], _simd_setzero_si()))); ++ maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet[1], _simd_setzero_si()))); ++ int cwTriMask = maskLo | (maskHi << (KNOB_SIMD_WIDTH /2) ); ++ ++ uint32_t frontWindingTris; ++ if (rastState.frontWinding == SWR_FRONTWINDING_CW) ++ { ++ frontWindingTris = cwTriMask; ++ } ++ else ++ { ++ frontWindingTris = ~cwTriMask; ++ } ++ ++ // cull ++ uint32_t cullTris; ++ switch ((SWR_CULLMODE)rastState.cullMode) ++ { ++ case SWR_CULLMODE_BOTH: cullTris = 0xffffffff; break; ++ case SWR_CULLMODE_NONE: cullTris = 0x0; break; ++ case SWR_CULLMODE_FRONT: cullTris = frontWindingTris; break; ++ case SWR_CULLMODE_BACK: cullTris = ~frontWindingTris; break; ++ default: SWR_ASSERT(false, "Invalid cull mode: %d", rastState.cullMode); cullTris = 0x0; 
break; ++ } ++ ++ triMask &= ~cullTris; ++ ++ if (origTriMask ^ triMask) ++ { ++ RDTSC_EVENT(FECullZeroAreaAndBackface, _mm_popcnt_u32(origTriMask ^ triMask), 0); ++ } ++ ++ // compute per tri backface ++ uint32_t frontFaceMask = frontWindingTris; ++ ++ uint32_t *pPrimID = (uint32_t *)&primID; ++ DWORD triIndex = 0; ++ ++ if (!triMask) ++ { ++ goto endBinTriangles; ++ } ++ ++ // Calc bounding box of triangles ++ simdBBox bbox; ++ calcBoundingBoxIntVertical(vXi, vYi, bbox); ++ ++ // determine if triangle falls between pixel centers and discard ++ // only discard for non-MSAA case ++ // (left + 127) & ~255 ++ // (right + 128) & ~255 ++ ++ if(rastState.sampleCount == SWR_MULTISAMPLE_1X) ++ { ++ origTriMask = triMask; ++ ++ int cullCenterMask; ++ { ++ simdscalari left = _simd_add_epi32(bbox.left, _simd_set1_epi32(127)); ++ left = _simd_and_si(left, _simd_set1_epi32(~255)); ++ simdscalari right = _simd_add_epi32(bbox.right, _simd_set1_epi32(128)); ++ right = _simd_and_si(right, _simd_set1_epi32(~255)); ++ ++ simdscalari vMaskH = _simd_cmpeq_epi32(left, right); ++ ++ simdscalari top = _simd_add_epi32(bbox.top, _simd_set1_epi32(127)); ++ top = _simd_and_si(top, _simd_set1_epi32(~255)); ++ simdscalari bottom = _simd_add_epi32(bbox.bottom, _simd_set1_epi32(128)); ++ bottom = _simd_and_si(bottom, _simd_set1_epi32(~255)); ++ ++ simdscalari vMaskV = _simd_cmpeq_epi32(top, bottom); ++ vMaskV = _simd_or_si(vMaskH, vMaskV); ++ cullCenterMask = _simd_movemask_ps(_simd_castsi_ps(vMaskV)); ++ } ++ ++ triMask &= ~cullCenterMask; ++ ++ if(origTriMask ^ triMask) ++ { ++ RDTSC_EVENT(FECullBetweenCenters, _mm_popcnt_u32(origTriMask ^ triMask), 0); ++ } ++ } ++ ++ // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since right/bottom edge is exclusive. ++ bbox.left = _simd_max_epi32(bbox.left, _simd_set1_epi32(state.scissorInFixedPoint.left)); ++ bbox.top = _simd_max_epi32(bbox.top, _simd_set1_epi32(state.scissorInFixedPoint.top)); ++ bbox.right = _simd_min_epi32(_simd_sub_epi32(bbox.right, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.right)); ++ bbox.bottom = _simd_min_epi32(_simd_sub_epi32(bbox.bottom, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.bottom)); ++ ++ // Cull tris completely outside scissor ++ { ++ simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.left, bbox.right); ++ simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.top, bbox.bottom); ++ simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY); ++ uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY)); ++ triMask = triMask & ~maskOutsideScissor; ++ } ++ ++ if (!triMask) ++ { ++ goto endBinTriangles; ++ } ++ ++ // Convert triangle bbox to macrotile units. 
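++    // (an arithmetic right shift by the macrotile fixed-point shift maps the
++    // x.8 fixed-point coords straight to macro tile indices)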
++ bbox.left = _simd_srai_epi32(bbox.left, KNOB_MACROTILE_X_DIM_FIXED_SHIFT); ++ bbox.top = _simd_srai_epi32(bbox.top, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT); ++ bbox.right = _simd_srai_epi32(bbox.right, KNOB_MACROTILE_X_DIM_FIXED_SHIFT); ++ bbox.bottom = _simd_srai_epi32(bbox.bottom, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT); ++ ++ OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH]; ++ _simd_store_si((simdscalari*)aMTLeft, bbox.left); ++ _simd_store_si((simdscalari*)aMTRight, bbox.right); ++ _simd_store_si((simdscalari*)aMTTop, bbox.top); ++ _simd_store_si((simdscalari*)aMTBottom, bbox.bottom); ++ ++ // transpose verts needed for backend ++ /// @todo modify BE to take non-transformed verts ++ __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8]; ++ vTranspose3x8(vHorizX, tri[0].x, tri[1].x, tri[2].x); ++ vTranspose3x8(vHorizY, tri[0].y, tri[1].y, tri[2].y); ++ vTranspose3x8(vHorizZ, tri[0].z, tri[1].z, tri[2].z); ++ vTranspose3x8(vHorizW, vRecipW0, vRecipW1, vRecipW2); ++ ++ // store render target array index ++ OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH]; ++ if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex) ++ { ++ simdvector vRtai[3]; ++ pa.Assemble(VERTEX_RTAI_SLOT, vRtai); ++ simdscalari vRtaii; ++ vRtaii = _simd_castps_si(vRtai[0].x); ++ _simd_store_si((simdscalari*)aRTAI, vRtaii); ++ } ++ else ++ { ++ _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si()); ++ } ++ ++ // scan remaining valid triangles and bin each separately ++ while (_BitScanForward(&triIndex, triMask)) ++ { ++ uint32_t linkageCount = state.linkageCount; ++ uint32_t linkageMask = state.linkageMask; ++ uint32_t numScalarAttribs = linkageCount * 4; ++ ++ BE_WORK work; ++ work.type = DRAW; ++ ++ TRIANGLE_WORK_DESC &desc = work.desc.tri; ++ ++ desc.triFlags.frontFacing = state.forceFront ? 
1 : ((frontFaceMask >> triIndex) & 1);
++        desc.triFlags.primID = pPrimID[triIndex];
++        desc.triFlags.renderTargetArrayIndex = aRTAI[triIndex];
++
++        work.pfnWork = gRasterizerTable[rastState.sampleCount];
++
++        // store active attribs
++        float *pAttribs = (float*)pDC->arena.AllocAligned(numScalarAttribs*3*sizeof(float), 16);
++        desc.pAttribs = pAttribs;
++        desc.numAttribs = linkageCount;
++        ProcessAttributes<3>(pDC, pa, linkageMask, state.linkageMap, triIndex, desc.pAttribs);
++
++        // store triangle vertex data
++        desc.pTriBuffer = (float*)pDC->arena.AllocAligned(4*4*sizeof(float), 16);
++
++        _mm_store_ps(&desc.pTriBuffer[0], vHorizX[triIndex]);
++        _mm_store_ps(&desc.pTriBuffer[4], vHorizY[triIndex]);
++        _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[triIndex]);
++        _mm_store_ps(&desc.pTriBuffer[12], vHorizW[triIndex]);
++
++        // store user clip distances
++        if (rastState.clipDistanceMask)
++        {
++            uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
++            desc.pUserClipBuffer = (float*)pDC->arena.Alloc(numClipDist * 3 * sizeof(float));
++            ProcessUserClipDist<3>(pa, triIndex, rastState.clipDistanceMask, desc.pUserClipBuffer);
++        }
++
++        MacroTileMgr *pTileMgr = pDC->pTileMgr;
++        for (uint32_t y = aMTTop[triIndex]; y <= aMTBottom[triIndex]; ++y)
++        {
++            for (uint32_t x = aMTLeft[triIndex]; x <= aMTRight[triIndex]; ++x)
++            {
++#if KNOB_ENABLE_TOSS_POINTS
++                if (!KNOB_TOSS_SETUP_TRIS)
++#endif
++                {
++                    pTileMgr->enqueue(x, y, &work);
++                }
++            }
++        }
++
++        triMask &= ~(1 << triIndex);
++    }
++
++endBinTriangles:
++    RDTSC_STOP(FEBinTriangles, 1, 0);
++}
++
++
++
++//////////////////////////////////////////////////////////////////////////
++/// @brief Bin SIMD points to the backend. Only supports point size of 1
++/// @param pDC - pointer to draw context.
++/// @param pa - The primitive assembly object.
++/// @param workerId - thread's worker id. Each thread has a unique id.
++/// @param prim - Contains point position data for a SIMD's worth of points.
++/// @param primID - Primitive ID for each point.
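++/// @param primMask - Mask of SIMD lanes containing valid points.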
++void BinPoints( ++ DRAW_CONTEXT *pDC, ++ PA_STATE& pa, ++ uint32_t workerId, ++ simdvector prim[3], ++ uint32_t primMask, ++ simdscalari primID) ++{ ++ RDTSC_START(FEBinPoints); ++ ++ simdvector& primVerts = prim[0]; ++ ++ const API_STATE& state = GetApiState(pDC); ++ const SWR_GS_STATE& gsState = state.gsState; ++ ++ // perspective divide ++ simdscalar vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), primVerts.w); ++ primVerts.x = _simd_mul_ps(primVerts.x, vRecipW0); ++ primVerts.y = _simd_mul_ps(primVerts.y, vRecipW0); ++ primVerts.z = _simd_mul_ps(primVerts.z, vRecipW0); ++ ++ // viewport transform to screen coords ++ viewportTransform<1>(&primVerts, state.vpMatrix[0]); ++ ++ // convert to fixed point ++ simdscalari vXi, vYi; ++ vXi = fpToFixedPointVertical(primVerts.x); ++ vYi = fpToFixedPointVertical(primVerts.y); ++ ++ // adjust for triangle rasterization rules - ie top-left rule ++ vXi = _simd_sub_epi32(vXi, _simd_set1_epi32(1)); ++ vYi = _simd_sub_epi32(vYi, _simd_set1_epi32(1)); ++ ++ // cull points off the top-left edge of the viewport ++ primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vXi)); ++ primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vYi)); ++ ++ // compute macro tile coordinates ++ simdscalari macroX = _simd_srai_epi32(vXi, KNOB_MACROTILE_X_DIM_FIXED_SHIFT); ++ simdscalari macroY = _simd_srai_epi32(vYi, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT); ++ ++ OSALIGNSIMD(uint32_t) aMacroX[KNOB_SIMD_WIDTH], aMacroY[KNOB_SIMD_WIDTH]; ++ _simd_store_si((simdscalari*)aMacroX, macroX); ++ _simd_store_si((simdscalari*)aMacroY, macroY); ++ ++ // compute raster tile coordinates ++ simdscalari rasterX = _simd_srai_epi32(vXi, KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT); ++ simdscalari rasterY = _simd_srai_epi32(vYi, KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT); ++ ++ // compute raster tile relative x,y for coverage mask ++ simdscalari tileAlignedX = _simd_slli_epi32(rasterX, KNOB_TILE_X_DIM_SHIFT); ++ simdscalari tileAlignedY = _simd_slli_epi32(rasterY, KNOB_TILE_Y_DIM_SHIFT); ++ ++ simdscalari tileRelativeX = _simd_sub_epi32(_simd_srai_epi32(vXi, FIXED_POINT_SHIFT), tileAlignedX); ++ simdscalari tileRelativeY = _simd_sub_epi32(_simd_srai_epi32(vYi, FIXED_POINT_SHIFT), tileAlignedY); ++ ++ OSALIGNSIMD(uint32_t) aTileRelativeX[KNOB_SIMD_WIDTH]; ++ OSALIGNSIMD(uint32_t) aTileRelativeY[KNOB_SIMD_WIDTH]; ++ _simd_store_si((simdscalari*)aTileRelativeX, tileRelativeX); ++ _simd_store_si((simdscalari*)aTileRelativeY, tileRelativeY); ++ ++ OSALIGNSIMD(uint32_t) aTileAlignedX[KNOB_SIMD_WIDTH]; ++ OSALIGNSIMD(uint32_t) aTileAlignedY[KNOB_SIMD_WIDTH]; ++ _simd_store_si((simdscalari*)aTileAlignedX, tileAlignedX); ++ _simd_store_si((simdscalari*)aTileAlignedY, tileAlignedY); ++ ++ OSALIGNSIMD(float) aZ[KNOB_SIMD_WIDTH]; ++ _simd_store_ps((float*)aZ, primVerts.z); ++ ++ // store render target array index ++ OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH]; ++ if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex) ++ { ++ simdvector vRtai; ++ pa.Assemble(VERTEX_RTAI_SLOT, &vRtai); ++ simdscalari vRtaii = _simd_castps_si(vRtai.x); ++ _simd_store_si((simdscalari*)aRTAI, vRtaii); ++ } ++ else ++ { ++ _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si()); ++ } ++ ++ uint32_t *pPrimID = (uint32_t *)&primID; ++ DWORD primIndex = 0; ++ // scan remaining valid triangles and bin each separately ++ while (_BitScanForward(&primIndex, primMask)) ++ { ++ uint32_t linkageCount = state.linkageCount; ++ uint32_t linkageMask = state.linkageMask; ++ ++ uint32_t numScalarAttribs = linkageCount * 4; ++ ++ BE_WORK work; ++ work.type 
= DRAW;
++
++        TRIANGLE_WORK_DESC &desc = work.desc.tri;
++
++        // points are always front facing
++        desc.triFlags.frontFacing = 1;
++        desc.triFlags.primID = pPrimID[primIndex];
++        desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
++
++        work.pfnWork = rastPoint;
++
++        // store attributes
++        float *pAttribs = (float*)pDC->arena.AllocAligned(3 * numScalarAttribs * sizeof(float), 16);
++        desc.pAttribs = pAttribs;
++        desc.numAttribs = linkageCount;
++
++        ProcessAttributes<1>(pDC, pa, linkageMask, state.linkageMap, primIndex, pAttribs);
++
++        // store raster tile aligned x, y, perspective correct z
++        float *pTriBuffer = (float*)pDC->arena.AllocAligned(4 * sizeof(float), 16);
++        desc.pTriBuffer = pTriBuffer;
++        *(uint32_t*)pTriBuffer++ = aTileAlignedX[primIndex];
++        *(uint32_t*)pTriBuffer++ = aTileAlignedY[primIndex];
++        *pTriBuffer = aZ[primIndex];
++
++        uint32_t tX = aTileRelativeX[primIndex];
++        uint32_t tY = aTileRelativeY[primIndex];
++
++        // pack the relative x,y into the coverageMask, the rasterizer will
++        // generate the true coverage mask from it
++        work.desc.tri.triFlags.coverageMask = tX | (tY << 4);
++
++        // bin it
++        MacroTileMgr *pTileMgr = pDC->pTileMgr;
++#if KNOB_ENABLE_TOSS_POINTS
++        if (!KNOB_TOSS_SETUP_TRIS)
++#endif
++        {
++            pTileMgr->enqueue(aMacroX[primIndex], aMacroY[primIndex], &work);
++        }
++        primMask &= ~(1 << primIndex);
++    }
++
++    RDTSC_STOP(FEBinPoints, 1, 0);
++}
++
++//////////////////////////////////////////////////////////////////////////
++/// @brief Bin SIMD lines to the backend.
++/// @param pDC - pointer to draw context.
++/// @param pa - The primitive assembly object.
++/// @param workerId - thread's worker id. Each thread has a unique id.
++/// @param prim - Contains line position data for a SIMD's worth of lines.
++/// @param primID - Primitive ID for each line.
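++/// @param primMask - Mask of SIMD lanes containing valid lines.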
++void BinLines( ++ DRAW_CONTEXT *pDC, ++ PA_STATE& pa, ++ uint32_t workerId, ++ simdvector prim[], ++ uint32_t primMask, ++ simdscalari primID) ++{ ++ RDTSC_START(FEBinLines); ++ ++ const API_STATE& state = GetApiState(pDC); ++ const SWR_RASTSTATE& rastState = state.rastState; ++ const SWR_FRONTEND_STATE& feState = state.frontendState; ++ const SWR_GS_STATE& gsState = state.gsState; ++ ++ simdscalar vRecipW0 = _simd_set1_ps(1.0f); ++ simdscalar vRecipW1 = _simd_set1_ps(1.0f); ++ ++ if (!feState.vpTransformDisable) ++ { ++ // perspective divide ++ vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), prim[0].w); ++ vRecipW1 = _simd_div_ps(_simd_set1_ps(1.0f), prim[1].w); ++ ++ prim[0].v[0] = _simd_mul_ps(prim[0].v[0], vRecipW0); ++ prim[1].v[0] = _simd_mul_ps(prim[1].v[0], vRecipW1); ++ ++ prim[0].v[1] = _simd_mul_ps(prim[0].v[1], vRecipW0); ++ prim[1].v[1] = _simd_mul_ps(prim[1].v[1], vRecipW1); ++ ++ prim[0].v[2] = _simd_mul_ps(prim[0].v[2], vRecipW0); ++ prim[1].v[2] = _simd_mul_ps(prim[1].v[2], vRecipW1); ++ ++ // viewport transform to screen coords ++ viewportTransform<2>(prim, state.vpMatrix[0]); ++ } ++ ++ // convert to fixed point ++ simdscalari vXi[2], vYi[2]; ++ vXi[0] = fpToFixedPointVertical(prim[0].x); ++ vYi[0] = fpToFixedPointVertical(prim[0].y); ++ vXi[1] = fpToFixedPointVertical(prim[1].x); ++ vYi[1] = fpToFixedPointVertical(prim[1].y); ++ ++ // compute x-major vs y-major mask ++ simdscalari xLength = _simd_abs_epi32(_simd_sub_epi32(vXi[0], vXi[1])); ++ simdscalari yLength = _simd_abs_epi32(_simd_sub_epi32(vYi[0], vYi[1])); ++ simdscalar vYmajorMask = _simd_castsi_ps(_simd_cmpgt_epi32(yLength, xLength)); ++ uint32_t yMajorMask = _simd_movemask_ps(vYmajorMask); ++ ++ // cull zero-length lines ++ simdscalari vZeroLengthMask = _simd_cmpeq_epi32(xLength, _simd_setzero_si()); ++ vZeroLengthMask = _simd_and_si(vZeroLengthMask, _simd_cmpeq_epi32(yLength, _simd_setzero_si())); ++ ++ primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vZeroLengthMask)); ++ ++ uint32_t *pPrimID = (uint32_t *)&primID; ++ ++ simdscalar vUnused = _simd_setzero_ps(); ++ ++ // Calc bounding box of lines ++ simdBBox bbox; ++ bbox.left = _simd_min_epi32(vXi[0], vXi[1]); ++ bbox.right = _simd_max_epi32(vXi[0], vXi[1]); ++ bbox.top = _simd_min_epi32(vYi[0], vYi[1]); ++ bbox.bottom = _simd_max_epi32(vYi[0], vYi[1]); ++ ++ // bloat bbox by line width along minor axis ++ simdscalar vHalfWidth = _simd_set1_ps(rastState.lineWidth / 2.0f); ++ simdscalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth); ++ simdBBox bloatBox; ++ bloatBox.left = _simd_sub_epi32(bbox.left, vHalfWidthi); ++ bloatBox.right = _simd_add_epi32(bbox.right, vHalfWidthi); ++ bloatBox.top = _simd_sub_epi32(bbox.top, vHalfWidthi); ++ bloatBox.bottom = _simd_add_epi32(bbox.bottom, vHalfWidthi); ++ ++ bbox.left = _simd_blendv_epi32(bbox.left, bloatBox.left, vYmajorMask); ++ bbox.right = _simd_blendv_epi32(bbox.right, bloatBox.right, vYmajorMask); ++ bbox.top = _simd_blendv_epi32(bloatBox.top, bbox.top, vYmajorMask); ++ bbox.bottom = _simd_blendv_epi32(bloatBox.bottom, bbox.bottom, vYmajorMask); ++ ++ // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since right/bottom edge is exclusive. 
++ bbox.left = _simd_max_epi32(bbox.left, _simd_set1_epi32(state.scissorInFixedPoint.left)); ++ bbox.top = _simd_max_epi32(bbox.top, _simd_set1_epi32(state.scissorInFixedPoint.top)); ++ bbox.right = _simd_min_epi32(_simd_sub_epi32(bbox.right, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.right)); ++ bbox.bottom = _simd_min_epi32(_simd_sub_epi32(bbox.bottom, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.bottom)); ++ ++ // Cull prims completely outside scissor ++ { ++ simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.left, bbox.right); ++ simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.top, bbox.bottom); ++ simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY); ++ uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY)); ++ primMask = primMask & ~maskOutsideScissor; ++ } ++ ++ if (!primMask) ++ { ++ goto endBinLines; ++ } ++ ++ // Convert triangle bbox to macrotile units. ++ bbox.left = _simd_srai_epi32(bbox.left, KNOB_MACROTILE_X_DIM_FIXED_SHIFT); ++ bbox.top = _simd_srai_epi32(bbox.top, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT); ++ bbox.right = _simd_srai_epi32(bbox.right, KNOB_MACROTILE_X_DIM_FIXED_SHIFT); ++ bbox.bottom = _simd_srai_epi32(bbox.bottom, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT); ++ ++ OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH]; ++ _simd_store_si((simdscalari*)aMTLeft, bbox.left); ++ _simd_store_si((simdscalari*)aMTRight, bbox.right); ++ _simd_store_si((simdscalari*)aMTTop, bbox.top); ++ _simd_store_si((simdscalari*)aMTBottom, bbox.bottom); ++ ++ // transpose verts needed for backend ++ /// @todo modify BE to take non-transformed verts ++ __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8]; ++ vTranspose3x8(vHorizX, prim[0].x, prim[1].x, vUnused); ++ vTranspose3x8(vHorizY, prim[0].y, prim[1].y, vUnused); ++ vTranspose3x8(vHorizZ, prim[0].z, prim[1].z, vUnused); ++ vTranspose3x8(vHorizW, vRecipW0, vRecipW1, vUnused); ++ ++ // store render target array index ++ OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH]; ++ if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex) ++ { ++ simdvector vRtai[2]; ++ pa.Assemble(VERTEX_RTAI_SLOT, vRtai); ++ simdscalari vRtaii = _simd_castps_si(vRtai[0].x); ++ _simd_store_si((simdscalari*)aRTAI, vRtaii); ++ } ++ else ++ { ++ _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si()); ++ } ++ ++ // scan remaining valid prims and bin each separately ++ DWORD primIndex; ++ while (_BitScanForward(&primIndex, primMask)) ++ { ++ uint32_t linkageCount = state.linkageCount; ++ uint32_t linkageMask = state.linkageMask; ++ uint32_t numScalarAttribs = linkageCount * 4; ++ ++ BE_WORK work; ++ work.type = DRAW; ++ ++ TRIANGLE_WORK_DESC &desc = work.desc.tri; ++ ++ desc.triFlags.frontFacing = 1; ++ desc.triFlags.primID = pPrimID[primIndex]; ++ desc.triFlags.yMajor = (yMajorMask >> primIndex) & 1; ++ desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex]; ++ ++ work.pfnWork = RasterizeLine; ++ ++ // store active attribs ++ desc.pAttribs = (float*)pDC->arena.AllocAligned(numScalarAttribs * 3 * sizeof(float), 16); ++ desc.numAttribs = linkageCount; ++ ProcessAttributes<2>(pDC, pa, linkageMask, state.linkageMap, primIndex, desc.pAttribs); ++ ++ // store line vertex data ++ desc.pTriBuffer = (float*)pDC->arena.AllocAligned(4 * 4 * sizeof(float), 16); ++ _mm_store_ps(&desc.pTriBuffer[0], vHorizX[primIndex]); ++ _mm_store_ps(&desc.pTriBuffer[4], vHorizY[primIndex]); ++ 
_mm_store_ps(&desc.pTriBuffer[8], vHorizZ[primIndex]); ++ _mm_store_ps(&desc.pTriBuffer[12], vHorizW[primIndex]); ++ ++ // store user clip distances ++ if (rastState.clipDistanceMask) ++ { ++ uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask); ++ desc.pUserClipBuffer = (float*)pDC->arena.Alloc(numClipDist * 2 * sizeof(float)); ++ ProcessUserClipDist<2>(pa, primIndex, rastState.clipDistanceMask, desc.pUserClipBuffer); ++ } ++ ++ MacroTileMgr *pTileMgr = pDC->pTileMgr; ++ for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y) ++ { ++ for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x) ++ { ++#if KNOB_ENABLE_TOSS_POINTS ++ if (!KNOB_TOSS_SETUP_TRIS) ++#endif ++ { ++ pTileMgr->enqueue(x, y, &work); ++ } ++ } ++ } ++ ++ primMask &= ~(1 << primIndex); ++ } ++ ++endBinLines: ++ ++ RDTSC_STOP(FEBinLines, 1, 0); ++} +diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.h b/src/gallium/drivers/swr/rasterizer/core/frontend.h +new file mode 100644 +index 0000000..e8452c3 +--- /dev/null ++++ b/src/gallium/drivers/swr/rasterizer/core/frontend.h +@@ -0,0 +1,326 @@ ++/**************************************************************************** ++* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. ++* ++* Permission is hereby granted, free of charge, to any person obtaining a ++* copy of this software and associated documentation files (the "Software"), ++* to deal in the Software without restriction, including without limitation ++* the rights to use, copy, modify, merge, publish, distribute, sublicense, ++* and/or sell copies of the Software, and to permit persons to whom the ++* Software is furnished to do so, subject to the following conditions: ++* ++* The above copyright notice and this permission notice (including the next ++* paragraph) shall be included in all copies or substantial portions of the ++* Software. ++* ++* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++* IN THE SOFTWARE. ++* ++* @file frontend.h ++* ++* @brief Definitions for Frontend which handles vertex processing, ++* primitive assembly, clipping, binning, etc. 
++* ++******************************************************************************/ ++#pragma once ++#include "context.h" ++ ++INLINE ++__m128i fpToFixedPoint(const __m128 vIn) ++{ ++ __m128 vFixed = _mm_mul_ps(vIn, _mm_set1_ps(FIXED_POINT_SCALE)); ++ return _mm_cvtps_epi32(vFixed); ++} ++ ++INLINE ++simdscalari fpToFixedPointVertical(const simdscalar vIn) ++{ ++ simdscalar vFixed = _simd_mul_ps(vIn, _simd_set1_ps(FIXED_POINT_SCALE)); ++ return _simd_cvtps_epi32(vFixed); ++} ++ ++ ++// Calculates the A and B coefficients for the 3 edges of the triangle ++// ++// maths for edge equations: ++// standard form of a line in 2d ++// Ax + By + C = 0 ++// A = y0 - y1 ++// B = x1 - x0 ++// C = x0y1 - x1y0 ++INLINE ++void triangleSetupAB(const __m128 vX, const __m128 vY, __m128 & vA, __m128 & vB) ++{ ++ // vYsub = y1 y2 y0 dc ++ __m128 vYsub = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(3, 0, 2, 1)); ++ // vY = y0 y1 y2 dc ++ vA = _mm_sub_ps(vY, vYsub); ++ ++ // Result: ++ // A[0] = y0 - y1 ++ // A[1] = y1 - y2 ++ // A[2] = y2 - y0 ++ ++ // vXsub = x1 x2 x0 dc ++ __m128 vXsub = _mm_shuffle_ps(vX, vX, _MM_SHUFFLE(3, 0, 2, 1)); ++ // vX = x0 x1 x2 dc ++ vB = _mm_sub_ps(vXsub, vX); ++ ++ // Result: ++ // B[0] = x1 - x0 ++ // B[1] = x2 - x1 ++ // B[2] = x0 - x2 ++} ++ ++INLINE ++void triangleSetupABVertical(const simdscalar vX[3], const simdscalar vY[3], simdscalar (&vA)[3], simdscalar (&vB)[3]) ++{ ++ // generate edge equations ++ // A = y0 - y1 ++ // B = x1 - x0 ++ vA[0] = _simd_sub_ps(vY[0], vY[1]); ++ vA[1] = _simd_sub_ps(vY[1], vY[2]); ++ vA[2] = _simd_sub_ps(vY[2], vY[0]); ++ ++ vB[0] = _simd_sub_ps(vX[1], vX[0]); ++ vB[1] = _simd_sub_ps(vX[2], vX[1]); ++ vB[2] = _simd_sub_ps(vX[0], vX[2]); ++} ++ ++INLINE ++void triangleSetupABInt(const __m128i vX, const __m128i vY, __m128i & vA, __m128i & vB) ++{ ++ // generate edge equations ++ // A = y0 - y1 ++ // B = x1 - x0 ++ // C = x0y1 - x1y0 ++ __m128i vYsub = _mm_shuffle_epi32(vY, _MM_SHUFFLE(3, 0, 2, 1)); ++ vA = _mm_sub_epi32(vY, vYsub); ++ ++ __m128i vXsub = _mm_shuffle_epi32(vX, _MM_SHUFFLE(3, 0, 2, 1)); ++ vB = _mm_sub_epi32(vXsub, vX); ++} ++ ++INLINE ++void triangleSetupABIntVertical(const simdscalari vX[3], const simdscalari vY[3], simdscalari (&vA)[3], simdscalari (&vB)[3]) ++{ ++ // A = y0 - y1 ++ // B = x1 - x0 ++ vA[0] = _simd_sub_epi32(vY[0], vY[1]); ++ vA[1] = _simd_sub_epi32(vY[1], vY[2]); ++ vA[2] = _simd_sub_epi32(vY[2], vY[0]); ++ ++ vB[0] = _simd_sub_epi32(vX[1], vX[0]); ++ vB[1] = _simd_sub_epi32(vX[2], vX[1]); ++ vB[2] = _simd_sub_epi32(vX[0], vX[2]); ++} ++// Calculate the determinant of the triangle ++// 2 vectors between the 3 points: P, Q ++// Px = x0-x2, Py = y0-y2 ++// Qx = x1-x2, Qy = y1-y2 ++// |Px Qx| ++// det = | | = PxQy - PyQx ++// |Py Qy| ++// simplifies to : (x0-x2)*(y1-y2) - (y0-y2)*(x1-x2) ++// try to reuse our A & B coef's already calculated. 
factor out a -1 from Py and Qx
++// : B[2]*A[1] - (-(y2-y0))*(-(x2-x1))
++// : B[2]*A[1] - (-1)(-1)(y2-y0)*(x2-x1)
++// : B[2]*A[1] - A[2]*B[1]
++INLINE
++float calcDeterminantInt(const __m128i vA, const __m128i vB)
++{
++    // vAShuf = [A1, A0, A2, A0]
++    __m128i vAShuf = _mm_shuffle_epi32(vA, _MM_SHUFFLE(0, 2, 0, 1));
++    // vBShuf = [B2, B0, B1, B0]
++    __m128i vBShuf = _mm_shuffle_epi32(vB, _MM_SHUFFLE(0, 1, 0, 2));
++    // vMul = [A1*B2, B1*A2]
++    __m128i vMul = _mm_mul_epi32(vAShuf, vBShuf);
++
++    // shuffle upper to lower
++    // vMul2 = [B1*A2, B1*A2]
++    __m128i vMul2 = _mm_shuffle_epi32(vMul, _MM_SHUFFLE(3, 2, 3, 2));
++    // vMul = [A1*B2 - B1*A2]
++    vMul = _mm_sub_epi64(vMul, vMul2);
++
++    // According to emmintrin.h _mm_store1_pd(), address must be 16-byte aligned
++    OSALIGN(int64_t, 16) result;
++    _mm_store1_pd((double*)&result, _mm_castsi128_pd(vMul));
++
++    double fResult = (double)result;
++    fResult = fResult * (1.0 / FIXED_POINT16_SCALE);
++
++    return (float)fResult;
++}
++
++INLINE
++void calcDeterminantIntVertical(const simdscalari vA[3], const simdscalari vB[3], simdscalari *pvDet)
++{
++    // refer to calcDeterminantInt comment for calculation explanation
++    // A1*B2
++    simdscalari vA1Lo = _simd_unpacklo_epi32(vA[1], vA[1]); // 0 0 1 1 4 4 5 5
++    simdscalari vA1Hi = _simd_unpackhi_epi32(vA[1], vA[1]); // 2 2 3 3 6 6 7 7
++
++    simdscalari vB2Lo = _simd_unpacklo_epi32(vB[2], vB[2]);
++    simdscalari vB2Hi = _simd_unpackhi_epi32(vB[2], vB[2]);
++
++    simdscalari vA1B2Lo = _simd_mul_epi32(vA1Lo, vB2Lo); // 0 1 4 5
++    simdscalari vA1B2Hi = _simd_mul_epi32(vA1Hi, vB2Hi); // 2 3 6 7
++
++    // B1*A2
++    simdscalari vA2Lo = _simd_unpacklo_epi32(vA[2], vA[2]);
++    simdscalari vA2Hi = _simd_unpackhi_epi32(vA[2], vA[2]);
++
++    simdscalari vB1Lo = _simd_unpacklo_epi32(vB[1], vB[1]);
++    simdscalari vB1Hi = _simd_unpackhi_epi32(vB[1], vB[1]);
++
++    simdscalari vA2B1Lo = _simd_mul_epi32(vA2Lo, vB1Lo);
++    simdscalari vA2B1Hi = _simd_mul_epi32(vA2Hi, vB1Hi);
++
++    // A1*B2 - A2*B1
++    simdscalari detLo = _simd_sub_epi64(vA1B2Lo, vA2B1Lo);
++    simdscalari detHi = _simd_sub_epi64(vA1B2Hi, vA2B1Hi);
++
++    // shuffle 0 1 4 5 -> 0 1 2 3
++    simdscalari vResultLo = _mm256_permute2f128_si256(detLo, detHi, 0x20);
++    simdscalari vResultHi = _mm256_permute2f128_si256(detLo, detHi, 0x31);
++
++    pvDet[0] = vResultLo;
++    pvDet[1] = vResultHi;
++}
++
++INLINE
++void triangleSetupC(const __m128 vX, const __m128 vY, const __m128 vA, const __m128 &vB, __m128 &vC)
++{
++    // C = -Ax - By
++    vC = _mm_mul_ps(vA, vX);
++    __m128 vCy = _mm_mul_ps(vB, vY);
++    vC = _mm_mul_ps(vC, _mm_set1_ps(-1.0f));
++    vC = _mm_sub_ps(vC, vCy);
++}
++
++INLINE
++void viewportTransform(__m128 &vX, __m128 &vY, __m128 &vZ, const SWR_VIEWPORT_MATRIX &vpMatrix)
++{
++    vX = _mm_mul_ps(vX, _mm_set1_ps(vpMatrix.m00));
++    vX = _mm_add_ps(vX, _mm_set1_ps(vpMatrix.m30));
++
++    vY = _mm_mul_ps(vY, _mm_set1_ps(vpMatrix.m11));
++    vY = _mm_add_ps(vY, _mm_set1_ps(vpMatrix.m31));
++
++    vZ = _mm_mul_ps(vZ, _mm_set1_ps(vpMatrix.m22));
++    vZ = _mm_add_ps(vZ, _mm_set1_ps(vpMatrix.m32));
++}
++
++template <uint32_t NumVerts>
++INLINE
++void viewportTransform(simdvector *v, const SWR_VIEWPORT_MATRIX & vpMatrix)
++{
++    simdscalar m00 = _simd_load1_ps(&vpMatrix.m00);
++    simdscalar m30 = _simd_load1_ps(&vpMatrix.m30);
++    simdscalar m11 = _simd_load1_ps(&vpMatrix.m11);
++    simdscalar m31 = _simd_load1_ps(&vpMatrix.m31);
++    simdscalar m22 = _simd_load1_ps(&vpMatrix.m22);
++    simdscalar m32 = _simd_load1_ps(&vpMatrix.m32);
++
++    for (uint32_t i = 0; i < NumVerts; ++i)
++    {
++        v[i].x =
_simd_fmadd_ps(v[i].x, m00, m30); ++ v[i].y = _simd_fmadd_ps(v[i].y, m11, m31); ++ v[i].z = _simd_fmadd_ps(v[i].z, m22, m32); ++ } ++} ++ ++INLINE ++void calcBoundingBoxInt(const __m128i &vX, const __m128i &vY, BBOX &bbox) ++{ ++ // Need horizontal fp min here ++ __m128i vX1 = _mm_shuffle_epi32(vX, _MM_SHUFFLE(3, 2, 0, 1)); ++ __m128i vX2 = _mm_shuffle_epi32(vX, _MM_SHUFFLE(3, 0, 1, 2)); ++ ++ __m128i vY1 = _mm_shuffle_epi32(vY, _MM_SHUFFLE(3, 2, 0, 1)); ++ __m128i vY2 = _mm_shuffle_epi32(vY, _MM_SHUFFLE(3, 0, 1, 2)); ++ ++ ++ __m128i vMinX = _mm_min_epi32(vX, vX1); ++ vMinX = _mm_min_epi32(vMinX, vX2); ++ ++ __m128i vMaxX = _mm_max_epi32(vX, vX1); ++ vMaxX = _mm_max_epi32(vMaxX, vX2); ++ ++ __m128i vMinY = _mm_min_epi32(vY, vY1); ++ vMinY = _mm_min_epi32(vMinY, vY2); ++ ++ __m128i vMaxY = _mm_max_epi32(vY, vY1); ++ vMaxY = _mm_max_epi32(vMaxY, vY2); ++ ++ bbox.left = _mm_extract_epi32(vMinX, 0); ++ bbox.right = _mm_extract_epi32(vMaxX, 0); ++ bbox.top = _mm_extract_epi32(vMinY, 0); ++ bbox.bottom = _mm_extract_epi32(vMaxY, 0); ++ ++#if 0 ++ Jacob: A = _mm_shuffle_ps(X, Y, 0 0 0 0) ++B = _mm_shuffle_ps(Z, W, 0 0 0 0) ++A = _mm_shuffle_epi32(A, 3 0 3 0) ++A = _mm_shuffle_ps(A, B, 1 0 1 0) ++#endif ++ ++} ++ ++INLINE ++void calcBoundingBoxIntVertical(const simdscalari (&vX)[3], const simdscalari (&vY)[3], simdBBox &bbox) ++{ ++ simdscalari vMinX = vX[0]; ++ vMinX = _simd_min_epi32(vMinX, vX[1]); ++ vMinX = _simd_min_epi32(vMinX, vX[2]); ++ ++ simdscalari vMaxX = vX[0]; ++ vMaxX = _simd_max_epi32(vMaxX, vX[1]); ++ vMaxX = _simd_max_epi32(vMaxX, vX[2]); ++ ++ simdscalari vMinY = vY[0]; ++ vMinY = _simd_min_epi32(vMinY, vY[1]); ++ vMinY = _simd_min_epi32(vMinY, vY[2]); ++ ++ simdscalari vMaxY = vY[0]; ++ vMaxY = _simd_max_epi32(vMaxY, vY[1]); ++ vMaxY = _simd_max_epi32(vMaxY, vY[2]); ++ ++ bbox.left = vMinX; ++ bbox.right = vMaxX; ++ bbox.top = vMinY; ++ bbox.bottom = vMaxY; ++} ++ ++INLINE ++bool CanUseSimplePoints(DRAW_CONTEXT *pDC) ++{ ++ const API_STATE& state = GetApiState(pDC); ++ ++ return (state.rastState.pointSize == 1.0f && ++ !state.rastState.pointParam && ++ !state.rastState.pointSpriteEnable); ++} ++ ++uint32_t GetNumPrims(PRIMITIVE_TOPOLOGY mode, uint32_t numElements); ++uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVerts); ++ ++// Templated Draw front-end function. 
All combinations of template parameter values are available ++template ++void ProcessDraw(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); ++ ++void ProcessClear(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); ++void ProcessStoreTiles(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); ++void ProcessInvalidateTiles(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); ++void ProcessSync(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); ++void ProcessQueryStats(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); ++ ++struct PA_STATE_BASE; // forward decl ++void BinTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector tri[3], uint32_t primMask, simdscalari primID); ++void BinPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], uint32_t primMask, simdscalari primID); ++void BinLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], uint32_t primMask, simdscalari primID); ++ +diff --git a/src/gallium/drivers/swr/rasterizer/core/knobs.h b/src/gallium/drivers/swr/rasterizer/core/knobs.h +new file mode 100644 +index 0000000..6140790 +--- /dev/null ++++ b/src/gallium/drivers/swr/rasterizer/core/knobs.h +@@ -0,0 +1,139 @@ ++/**************************************************************************** ++* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. ++* ++* Permission is hereby granted, free of charge, to any person obtaining a ++* copy of this software and associated documentation files (the "Software"), ++* to deal in the Software without restriction, including without limitation ++* the rights to use, copy, modify, merge, publish, distribute, sublicense, ++* and/or sell copies of the Software, and to permit persons to whom the ++* Software is furnished to do so, subject to the following conditions: ++* ++* The above copyright notice and this permission notice (including the next ++* paragraph) shall be included in all copies or substantial portions of the ++* Software. ++* ++* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++* IN THE SOFTWARE. ++* ++* @file knobs.h ++* ++* @brief Static (Compile-Time) Knobs for Core. 
++* ++******************************************************************************/ ++#pragma once ++ ++#include ++#include ++ ++#define KNOB_ARCH_AVX 0 ++#define KNOB_ARCH_AVX2 1 ++#define KNOB_ARCH_AVX512 2 ++ ++/////////////////////////////////////////////////////////////////////////////// ++// Architecture validation ++/////////////////////////////////////////////////////////////////////////////// ++#if !defined(KNOB_ARCH) ++#define KNOB_ARCH KNOB_ARCH_AVX ++#endif ++ ++#if (KNOB_ARCH == KNOB_ARCH_AVX) ++#define KNOB_ARCH_ISA AVX ++#define KNOB_ARCH_STR "AVX" ++#define KNOB_SIMD_WIDTH 8 ++#elif (KNOB_ARCH == KNOB_ARCH_AVX2) ++#define KNOB_ARCH_ISA AVX2 ++#define KNOB_ARCH_STR "AVX2" ++#define KNOB_SIMD_WIDTH 8 ++#elif (KNOB_ARCH == KNOB_ARCH_AVX512) ++#define KNOB_ARCH_ISA AVX512F ++#define KNOB_ARCH_STR "AVX512" ++#define KNOB_SIMD_WIDTH 16 ++#error "AVX512 not yet supported" ++#else ++#error "Unknown architecture" ++#endif ++ ++#define MAX_KNOB_ARCH_STR_LEN sizeof("AVX512_PLUS_PADDING") ++ ++/////////////////////////////////////////////////////////////////////////////// ++// Configuration knobs ++/////////////////////////////////////////////////////////////////////////////// ++#define KNOB_MAX_NUM_THREADS 256 // Supports up to dual-HSW-Xeon. ++ ++// Maximum supported number of active vertex buffer streams ++#define KNOB_NUM_STREAMS 32 ++ ++// Maximum supported number of attributes per vertex ++#define KNOB_NUM_ATTRIBUTES 37 ++ ++// Maximum supported active viewports and scissors ++#define KNOB_NUM_VIEWPORTS_SCISSORS 16 ++ ++// Guardband range used by the clipper ++#define KNOB_GUARDBAND_WIDTH 4096.0f ++#define KNOB_GUARDBAND_HEIGHT 2048.0f ++ ++/////////////////////////////// ++// Macro tile configuration ++/////////////////////////////// ++ ++// raster tile dimensions ++#define KNOB_TILE_X_DIM 8 ++#define KNOB_TILE_X_DIM_SHIFT 3 ++#define KNOB_TILE_Y_DIM 8 ++#define KNOB_TILE_Y_DIM_SHIFT 3 ++ ++// fixed macrotile pixel dimension for now, eventually will be ++// dynamically set based on tile format and pixel size ++#define KNOB_MACROTILE_X_DIM 64 ++#define KNOB_MACROTILE_Y_DIM 64 ++#define KNOB_MACROTILE_X_DIM_FIXED (KNOB_MACROTILE_X_DIM << 8) ++#define KNOB_MACROTILE_Y_DIM_FIXED (KNOB_MACROTILE_Y_DIM << 8) ++#define KNOB_MACROTILE_X_DIM_FIXED_SHIFT 14 ++#define KNOB_MACROTILE_Y_DIM_FIXED_SHIFT 14 ++#define KNOB_MACROTILE_X_DIM_IN_TILES (KNOB_MACROTILE_X_DIM >> KNOB_TILE_X_DIM_SHIFT) ++#define KNOB_MACROTILE_Y_DIM_IN_TILES (KNOB_MACROTILE_Y_DIM >> KNOB_TILE_Y_DIM_SHIFT) ++ ++// total # of hot tiles available. 
This should be enough to ++// fully render a 16kx16k 128bpp render target ++#define KNOB_NUM_HOT_TILES_X 256 ++#define KNOB_NUM_HOT_TILES_Y 256 ++#define KNOB_COLOR_HOT_TILE_FORMAT R32G32B32A32_FLOAT ++#define KNOB_DEPTH_HOT_TILE_FORMAT R32_FLOAT ++#define KNOB_STENCIL_HOT_TILE_FORMAT R8_UINT ++ ++#if KNOB_SIMD_WIDTH==8 && KNOB_TILE_X_DIM < 4 ++#error "incompatible width/tile dimensions" ++#endif ++ ++#if KNOB_SIMD_WIDTH == 8 ++#define SIMD_TILE_X_DIM 4 ++#define SIMD_TILE_Y_DIM 2 ++#else ++#error "Invalid simd width" ++#endif ++ ++/////////////////////////////////////////////////////////////////////////////// ++// Optimization knobs ++/////////////////////////////////////////////////////////////////////////////// ++#define KNOB_USE_FAST_SRGB TRUE ++ ++// enables cut-aware primitive assembler ++#define KNOB_ENABLE_CUT_AWARE_PA TRUE ++ ++/////////////////////////////////////////////////////////////////////////////// ++// Debug knobs ++/////////////////////////////////////////////////////////////////////////////// ++//#define KNOB_ENABLE_RDTSC ++//#define KNOB_SWRC_TRACING ++ ++// Set to 1 to use the dynamic KNOB_TOSS_XXXX knobs. ++#if !defined(KNOB_ENABLE_TOSS_POINTS) ++#define KNOB_ENABLE_TOSS_POINTS 0 ++#endif ++ +diff --git a/src/gallium/drivers/swr/rasterizer/core/knobs_init.h b/src/gallium/drivers/swr/rasterizer/core/knobs_init.h +new file mode 100644 +index 0000000..3f19555 +--- /dev/null ++++ b/src/gallium/drivers/swr/rasterizer/core/knobs_init.h +@@ -0,0 +1,98 @@ ++/**************************************************************************** ++* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. ++* ++* Permission is hereby granted, free of charge, to any person obtaining a ++* copy of this software and associated documentation files (the "Software"), ++* to deal in the Software without restriction, including without limitation ++* the rights to use, copy, modify, merge, publish, distribute, sublicense, ++* and/or sell copies of the Software, and to permit persons to whom the ++* Software is furnished to do so, subject to the following conditions: ++* ++* The above copyright notice and this permission notice (including the next ++* paragraph) shall be included in all copies or substantial portions of the ++* Software. ++* ++* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++* IN THE SOFTWARE. ++* ++* @file knobs_init.h ++* ++* @brief Dynamic Knobs Initialization for Core. 
++*
++******************************************************************************/
++#pragma once
++
++#include <stdint.h>
++#include <stdio.h>
++#include <stdlib.h>
++#include <string.h>
++#include <ctype.h>
++
++// Assume the type is compatible with a 32-bit integer
++template <typename T>
++static inline void ConvertEnvToKnob(const char* pOverride, T& knobValue)
++{
++    uint32_t value = 0;
++    if (sscanf(pOverride, "%u", &value))
++    {
++        knobValue = static_cast<T>(value);
++    }
++}
++
++static inline void ConvertEnvToKnob(const char* pOverride, bool& knobValue)
++{
++    size_t len = strlen(pOverride);
++    if (len == 1)
++    {
++        auto c = tolower(pOverride[0]);
++        if (c == 'y' || c == 't' || c == '1')
++        {
++            knobValue = true;
++            return;
++        }
++        if (c == 'n' || c == 'f' || c == '0')
++        {
++            knobValue = false;
++            return;
++        }
++    }
++
++    // Try converting to a number and casting to bool
++    uint32_t value = 0;
++    if (sscanf(pOverride, "%u", &value))
++    {
++        knobValue = value != 0;
++        return;
++    }
++}
++
++static inline void ConvertEnvToKnob(const char* pOverride, float& knobValue)
++{
++    float value = knobValue;
++    if (sscanf(pOverride, "%f", &value))
++    {
++        knobValue = value;
++    }
++}
++
++template <typename T>
++static inline void InitKnob(T& knob)
++{
++
++    // TODO: read registry first
++
++    // Second, read environment variables
++    const char* pOverride = getenv(knob.Name());
++
++    if (pOverride)
++    {
++        auto knobValue = knob.Value();
++        ConvertEnvToKnob(pOverride, knobValue);
++        knob.Value(knobValue);
++    }
++}
+diff --git a/src/gallium/drivers/swr/rasterizer/core/multisample.h b/src/gallium/drivers/swr/rasterizer/core/multisample.h
+new file mode 100644
+index 0000000..f7d5263
+--- /dev/null
++++ b/src/gallium/drivers/swr/rasterizer/core/multisample.h
+@@ -0,0 +1,562 @@
++/****************************************************************************
++* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
++*
++* Permission is hereby granted, free of charge, to any person obtaining a
++* copy of this software and associated documentation files (the "Software"),
++* to deal in the Software without restriction, including without limitation
++* the rights to use, copy, modify, merge, publish, distribute, sublicense,
++* and/or sell copies of the Software, and to permit persons to whom the
++* Software is furnished to do so, subject to the following conditions:
++*
++* The above copyright notice and this permission notice (including the next
++* paragraph) shall be included in all copies or substantial portions of the
++* Software.
++*
++* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
++* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++* IN THE SOFTWARE.
++* ++* @file multisample.h ++* ++******************************************************************************/ ++ ++#pragma once ++ ++#include "context.h" ++#include "format_traits.h" ++ ++INLINE ++uint32_t GetNumSamples(SWR_MULTISAMPLE_COUNT sampleCount) ++{ ++ static const uint32_t sampleCountLUT[SWR_MULTISAMPLE_TYPE_MAX] {1, 2, 4, 8, 16}; ++ assert(sampleCount < SWR_MULTISAMPLE_TYPE_MAX); ++ return sampleCountLUT[sampleCount]; ++} ++ ++INLINE ++SWR_MULTISAMPLE_COUNT GetSampleCount(uint32_t numSamples) ++{ ++ switch(numSamples) ++ { ++ case 1: return SWR_MULTISAMPLE_1X; ++ case 2: return SWR_MULTISAMPLE_2X; ++ case 4: return SWR_MULTISAMPLE_4X; ++ case 8: return SWR_MULTISAMPLE_8X; ++ case 16: return SWR_MULTISAMPLE_16X; ++ default: assert(0); return SWR_MULTISAMPLE_1X; ++ } ++} ++ ++// hardcoded offsets based on Direct3d standard multisample positions ++// 8 x 8 pixel grid ranging from (0, 0) to (15, 15), with (0, 0) = UL pixel corner ++// coords are 0.8 fixed point offsets from (0, 0) ++template ++struct MultisampleTraits ++{ ++ INLINE static __m128i vXi(uint32_t sampleNum) = delete; ++ INLINE static __m128i vYi(uint32_t sampleNum) = delete; ++ INLINE static simdscalar vX(uint32_t sampleNum) = delete; ++ INLINE static simdscalar vY(uint32_t sampleNum) = delete; ++ INLINE static __m128i TileSampleOffsetsX() = delete; ++ INLINE static __m128i TileSampleOffsetsY() = delete; ++ INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum) = delete; ++ INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum) = delete; ++ INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum) = delete; ++ ++ static const uint32_t numSamples = 0; ++ static const uint32_t sampleMask = 0; ++}; ++ ++template<> ++struct MultisampleTraits ++{ ++ INLINE static __m128i vXi(uint32_t sampleNum) ++ { ++ static const __m128i X = _mm_set1_epi32(0x80); ++ return X; ++ } ++ ++ INLINE static __m128i vYi(uint32_t sampleNum) ++ { ++ static const __m128i Y = _mm_set1_epi32(0x80); ++ return Y; ++ } ++ ++ INLINE static simdscalar vX(uint32_t sampleNum) ++ { ++ static const simdscalar X = _simd_set1_ps(0.5f); ++ return X; ++ } ++ ++ INLINE static simdscalar vY(uint32_t sampleNum) ++ { ++ static const simdscalar Y = _simd_set1_ps(0.5f); ++ return Y; ++ } ++ ++ INLINE static __m128i TileSampleOffsetsX() ++ { ++ static const uint32_t bboxLeftEdge = 0x80; ++ static const uint32_t bboxRightEdge = 0x80; ++ // BR, BL, UR, UL ++ static const __m128i tileSampleOffsetX = _mm_set_epi32(bboxRightEdge, bboxLeftEdge, bboxRightEdge, bboxLeftEdge); ++ return tileSampleOffsetX; ++ } ++ ++ INLINE static __m128i TileSampleOffsetsY() ++ { ++ static const uint32_t bboxTopEdge = 0x80; ++ static const uint32_t bboxBottomEdge = 0x80; ++ // BR, BL, UR, UL ++ static const __m128i tileSampleOffsetY = _mm_set_epi32(bboxBottomEdge, bboxBottomEdge, bboxTopEdge, bboxTopEdge); ++ return tileSampleOffsetY; ++ } ++ ++ INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum) ++ { ++ return 0; ++ } ++ ++ INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum) ++ { ++ return 0; ++ } ++ ++ INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum) ++ { ++ return 0; ++ } ++ ++ static const uint32_t numSamples = 1; ++ static const uint32_t sampleMask = 1; ++}; ++ ++template<> ++struct MultisampleTraits ++{ ++ INLINE static __m128i vXi(uint32_t sampleNum) ++ { ++ static const __m128i X[numSamples] {_mm_set1_epi32(0xC0), _mm_set1_epi32(0x40)}; ++ SWR_ASSERT(sampleNum < numSamples); ++ return X[sampleNum]; ++ } ++ ++ 
INLINE static __m128i vYi(uint32_t sampleNum) ++ { ++ static const __m128i Y[numSamples] {_mm_set1_epi32(0xC0), _mm_set1_epi32(0x40)}; ++ SWR_ASSERT(sampleNum < numSamples); ++ return Y[sampleNum]; ++ } ++ ++ INLINE static simdscalar vX(uint32_t sampleNum) ++ { ++ static const simdscalar X[numSamples] {_simd_set1_ps(0.75f), _simd_set1_ps(0.25f)}; ++ assert(sampleNum < numSamples); ++ return X[sampleNum]; ++ } ++ ++ INLINE static simdscalar vY(uint32_t sampleNum) ++ { ++ static const simdscalar Y[numSamples] {_simd_set1_ps(0.75f), _simd_set1_ps(0.25f)}; ++ assert(sampleNum < numSamples); ++ return Y[sampleNum]; ++ } ++ ++ INLINE static __m128i TileSampleOffsetsX() ++ { ++ static const uint32_t bboxLeftEdge = 0x40; ++ static const uint32_t bboxRightEdge = 0xC0; ++ // BR, BL, UR, UL ++ static const __m128i tileSampleOffsetX = _mm_set_epi32(bboxRightEdge, bboxLeftEdge, bboxRightEdge, bboxLeftEdge); ++ return tileSampleOffsetX; ++ } ++ ++ INLINE static __m128i TileSampleOffsetsY() ++ { ++ static const uint32_t bboxTopEdge = 0x40; ++ static const uint32_t bboxBottomEdge = 0xC0; ++ // BR, BL, UR, UL ++ static const __m128i tileSampleOffsetY = _mm_set_epi32(bboxBottomEdge, bboxBottomEdge, bboxTopEdge, bboxTopEdge); ++ return tileSampleOffsetY; ++ } ++ ++ INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum) ++ { ++ static const uint32_t RasterTileColorOffsets[numSamples] ++ { 0, ++ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) ++ }; ++ assert(sampleNum < numSamples); ++ return RasterTileColorOffsets[sampleNum]; ++ } ++ ++ INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum) ++ { ++ static const uint32_t RasterTileDepthOffsets[numSamples] ++ { 0, ++ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) ++ }; ++ assert(sampleNum < numSamples); ++ return RasterTileDepthOffsets[sampleNum]; ++ } ++ ++ INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum) ++ { ++ static const uint32_t RasterTileStencilOffsets[numSamples] ++ { 0, ++ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) ++ }; ++ assert(sampleNum < numSamples); ++ return RasterTileStencilOffsets[sampleNum]; ++ } ++ ++ static const uint32_t numSamples = 2; ++ static const uint32_t sampleMask = 0x3; ++}; ++ ++template<> ++struct MultisampleTraits ++{ ++ INLINE static __m128i vXi(uint32_t sampleNum) ++ { ++ static const __m128i X[numSamples] ++ {_mm_set1_epi32(0x60), _mm_set1_epi32(0xE0), _mm_set1_epi32(0x20), _mm_set1_epi32(0xA0)}; ++ SWR_ASSERT(sampleNum < numSamples); ++ return X[sampleNum]; ++ } ++ ++ INLINE static __m128i vYi(uint32_t sampleNum) ++ { ++ static const __m128i Y[numSamples] ++ {_mm_set1_epi32(0x20), _mm_set1_epi32(0x60), _mm_set1_epi32(0xA0), _mm_set1_epi32(0xE0)}; ++ SWR_ASSERT(sampleNum < numSamples); ++ return Y[sampleNum]; ++ } ++ ++ INLINE static simdscalar vX(uint32_t sampleNum) ++ { ++ static const simdscalar X[numSamples] ++ {_simd_set1_ps(0.375f), _simd_set1_ps(0.875), _simd_set1_ps(0.125), _simd_set1_ps(0.625)}; ++ assert(sampleNum < numSamples); ++ return X[sampleNum]; ++ } ++ ++ INLINE static simdscalar vY(uint32_t sampleNum) ++ { ++ static const simdscalar Y[numSamples] ++ {_simd_set1_ps(0.125), _simd_set1_ps(0.375f), _simd_set1_ps(0.625), _simd_set1_ps(0.875)}; ++ assert(sampleNum < numSamples); ++ return Y[sampleNum]; ++ } ++ ++ INLINE static __m128i TileSampleOffsetsX() ++ { ++ static const uint32_t bboxLeftEdge = 0x20; ++ static const uint32_t bboxRightEdge = 0xE0; ++ // BR, BL, UR, UL ++ static const __m128i tileSampleOffsetX = 
_mm_set_epi32(bboxRightEdge, bboxLeftEdge, bboxRightEdge, bboxLeftEdge); ++ return tileSampleOffsetX; ++ } ++ ++ INLINE static __m128i TileSampleOffsetsY() ++ { ++ static const uint32_t bboxTopEdge = 0x20; ++ static const uint32_t bboxBottomEdge = 0xE0; ++ // BR, BL, UR, UL ++ static const __m128i tileSampleOffsetY = _mm_set_epi32(bboxBottomEdge, bboxBottomEdge, bboxTopEdge, bboxTopEdge); ++ return tileSampleOffsetY; ++ } ++ ++ INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum) ++ { ++ static const uint32_t RasterTileColorOffsets[numSamples] ++ { 0, ++ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8), ++ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 2, ++ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 3, ++ }; ++ assert(sampleNum < numSamples); ++ return RasterTileColorOffsets[sampleNum]; ++ } ++ ++ INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum) ++ { ++ static const uint32_t RasterTileDepthOffsets[numSamples] ++ { 0, ++ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8), ++ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 2, ++ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 3, ++ }; ++ assert(sampleNum < numSamples); ++ return RasterTileDepthOffsets[sampleNum]; ++ } ++ ++ INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum) ++ { ++ static const uint32_t RasterTileStencilOffsets[numSamples] ++ { 0, ++ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8), ++ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 2, ++ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 3, ++ }; ++ assert(sampleNum < numSamples); ++ return RasterTileStencilOffsets[sampleNum]; ++ } ++ ++ static const uint32_t numSamples = 4; ++ static const uint32_t sampleMask = 0xF; ++}; ++ ++template<> ++struct MultisampleTraits ++{ ++ INLINE static __m128i vXi(uint32_t sampleNum) ++ { ++ static const __m128i X[numSamples] ++ {_mm_set1_epi32(0x90), _mm_set1_epi32(0x70), _mm_set1_epi32(0xD0), _mm_set1_epi32(0x50), ++ _mm_set1_epi32(0x30), _mm_set1_epi32(0x10), _mm_set1_epi32(0xB0), _mm_set1_epi32(0xF0)}; ++ SWR_ASSERT(sampleNum < numSamples); ++ return X[sampleNum]; ++ } ++ ++ INLINE static __m128i vYi(uint32_t sampleNum) ++ { ++ static const __m128i Y[numSamples] ++ {_mm_set1_epi32(0x50), _mm_set1_epi32(0xB0), _mm_set1_epi32(0x90), _mm_set1_epi32(0x30), ++ _mm_set1_epi32(0xD0), _mm_set1_epi32(0x70), _mm_set1_epi32(0xF0), _mm_set1_epi32(0x10)}; ++ SWR_ASSERT(sampleNum < numSamples); ++ return Y[sampleNum]; ++ } ++ ++ INLINE static simdscalar vX(uint32_t sampleNum) ++ { ++ static const simdscalar X[numSamples] ++ {_simd_set1_ps(0.5625), _simd_set1_ps(0.4375), _simd_set1_ps(0.8125), _simd_set1_ps(0.3125), ++ _simd_set1_ps(0.1875), _simd_set1_ps(0.0625), _simd_set1_ps(0.6875), _simd_set1_ps(0.9375)}; ++ assert(sampleNum < numSamples); ++ return X[sampleNum]; ++ } ++ ++ INLINE static simdscalar vY(uint32_t sampleNum) ++ { ++ static const simdscalar Y[numSamples] ++ {_simd_set1_ps(0.3125), _simd_set1_ps(0.6875), _simd_set1_ps(0.5625), _simd_set1_ps(0.1875), ++ _simd_set1_ps(0.8125), _simd_set1_ps(0.4375), _simd_set1_ps(0.9375), _simd_set1_ps(0.0625)}; ++ assert(sampleNum < numSamples); ++ return Y[sampleNum]; ++ } ++ ++ INLINE static __m128i TileSampleOffsetsX() ++ { ++ static const uint32_t bboxLeftEdge = 0x10; ++ static const uint32_t bboxRightEdge = 0xF0; ++ // BR, BL, UR, UL ++ static const __m128i tileSampleOffsetX = _mm_set_epi32(bboxRightEdge, bboxLeftEdge, bboxRightEdge, 
bboxLeftEdge); ++ return tileSampleOffsetX; ++ } ++ ++ INLINE static __m128i TileSampleOffsetsY() ++ { ++ static const uint32_t bboxTopEdge = 0x10; ++ static const uint32_t bboxBottomEdge = 0xF0; ++ // BR, BL, UR, UL ++ static const __m128i tileSampleOffsetY = _mm_set_epi32(bboxBottomEdge, bboxBottomEdge, bboxTopEdge, bboxTopEdge); ++ return tileSampleOffsetY; ++ } ++ ++ INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum) ++ { ++ static const uint32_t RasterTileColorOffsets[numSamples] ++ { 0, ++ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8), ++ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 2, ++ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 3, ++ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 4, ++ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 5, ++ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 6, ++ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 7, ++ }; ++ assert(sampleNum < numSamples); ++ return RasterTileColorOffsets[sampleNum]; ++ } ++ ++ INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum) ++ { ++ static const uint32_t RasterTileDepthOffsets[numSamples] ++ { 0, ++ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8), ++ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 2, ++ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 3, ++ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 4, ++ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 5, ++ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 6, ++ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 7, ++ }; ++ assert(sampleNum < numSamples); ++ return RasterTileDepthOffsets[sampleNum]; ++ } ++ ++ INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum) ++ { ++ static const uint32_t RasterTileStencilOffsets[numSamples] ++ { 0, ++ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8), ++ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 2, ++ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 3, ++ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 4, ++ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 5, ++ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 6, ++ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 7, ++ }; ++ assert(sampleNum < numSamples); ++ return RasterTileStencilOffsets[sampleNum]; ++ } ++ ++ static const uint32_t numSamples = 8; ++ static const uint32_t sampleMask = 0xFF; ++}; ++ ++template<> ++struct MultisampleTraits ++{ ++ INLINE static __m128i vXi(uint32_t sampleNum) ++ { ++ static const __m128i X[numSamples] ++ {_mm_set1_epi32(0x90), _mm_set1_epi32(0x70), _mm_set1_epi32(0x50), _mm_set1_epi32(0xC0), ++ _mm_set1_epi32(0x30), _mm_set1_epi32(0xA0), _mm_set1_epi32(0xD0), _mm_set1_epi32(0xB0), ++ _mm_set1_epi32(0x60), _mm_set1_epi32(0x80), _mm_set1_epi32(0x40), _mm_set1_epi32(0x20), ++ _mm_set1_epi32(0x00), _mm_set1_epi32(0xF0), _mm_set1_epi32(0xE0), _mm_set1_epi32(0x10)}; ++ SWR_ASSERT(sampleNum < numSamples); ++ return X[sampleNum]; ++ } ++ ++ INLINE static __m128i vYi(uint32_t sampleNum) ++ { ++ static const __m128i Y[numSamples] ++ {_mm_set1_epi32(0x90), _mm_set1_epi32(0x50), _mm_set1_epi32(0xA0), _mm_set1_epi32(0x70), ++ _mm_set1_epi32(0x60), _mm_set1_epi32(0xD0), _mm_set1_epi32(0xB0), _mm_set1_epi32(0x30), ++ _mm_set1_epi32(0xE0), _mm_set1_epi32(0x10), _mm_set1_epi32(0x20), _mm_set1_epi32(0xC0), ++ _mm_set1_epi32(0x80), 
_mm_set1_epi32(0x40), _mm_set1_epi32(0xF0), _mm_set1_epi32(0x00)}; ++ SWR_ASSERT(sampleNum < numSamples); ++ return Y[sampleNum]; ++ } ++ ++ INLINE static simdscalar vX(uint32_t sampleNum) ++ { ++ static const simdscalar X[numSamples] ++ {_simd_set1_ps(0.5625), _simd_set1_ps(0.4375), _simd_set1_ps(0.3125), _simd_set1_ps(0.7500), ++ _simd_set1_ps(0.1875), _simd_set1_ps(0.6250), _simd_set1_ps(0.8125), _simd_set1_ps(0.6875), ++ _simd_set1_ps(0.3750), _simd_set1_ps(0.5000), _simd_set1_ps(0.2500), _simd_set1_ps(0.1250), ++ _simd_set1_ps(0.0000), _simd_set1_ps(0.9375), _simd_set1_ps(0.8750), _simd_set1_ps(0.0625)}; ++ assert(sampleNum < numSamples); ++ return X[sampleNum]; ++ } ++ ++ INLINE static simdscalar vY(uint32_t sampleNum) ++ { ++ static const simdscalar Y[numSamples] ++ {_simd_set1_ps(0.5625), _simd_set1_ps(0.3125), _simd_set1_ps(0.6250), _simd_set1_ps(0.4375), ++ _simd_set1_ps(0.3750), _simd_set1_ps(0.8125), _simd_set1_ps(0.6875), _simd_set1_ps(0.1875), ++ _simd_set1_ps(0.8750), _simd_set1_ps(0.0625), _simd_set1_ps(0.1250), _simd_set1_ps(0.7500), ++ _simd_set1_ps(0.5000), _simd_set1_ps(0.2500), _simd_set1_ps(0.9375), _simd_set1_ps(0.0000)}; ++ assert(sampleNum < numSamples); ++ return Y[sampleNum]; ++ } ++ ++ INLINE static __m128i TileSampleOffsetsX() ++ { ++ static const uint32_t bboxLeftEdge = 0x00; ++ static const uint32_t bboxRightEdge = 0xF0; ++ // BR, BL, UR, UL ++ static const __m128i tileSampleOffsetX = _mm_set_epi32(bboxRightEdge, bboxLeftEdge, bboxRightEdge, bboxLeftEdge); ++ return tileSampleOffsetX; ++ } ++ ++ INLINE static __m128i TileSampleOffsetsY() ++ { ++ static const uint32_t bboxTopEdge = 0x00; ++ static const uint32_t bboxBottomEdge = 0xF0; ++ // BR, BL, UR, UL ++ static const __m128i tileSampleOffsetY = _mm_set_epi32(bboxBottomEdge, bboxBottomEdge, bboxTopEdge, bboxTopEdge); ++ return tileSampleOffsetY; ++ } ++ ++ INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum) ++ { ++ static const uint32_t RasterTileColorOffsets[numSamples] ++ { 0, ++ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8), ++ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 2, ++ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 3, ++ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 4, ++ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 5, ++ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 6, ++ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 7, ++ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 8, ++ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 9, ++ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 10, ++ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 11, ++ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 12, ++ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 13, ++ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 14, ++ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 15, ++ }; ++ assert(sampleNum < numSamples); ++ return RasterTileColorOffsets[sampleNum]; ++ } ++ ++ INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum) ++ { ++ static const uint32_t RasterTileDepthOffsets[numSamples] ++ { 0, ++ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8), ++ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 2, ++ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 3, ++ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 4, ++ (KNOB_TILE_X_DIM 
* KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 5, ++ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 6, ++ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 7, ++ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 8, ++ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 9, ++ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 10, ++ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 11, ++ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 12, ++ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 13, ++ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 14, ++ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 15, ++ }; ++ assert(sampleNum < numSamples); ++ return RasterTileDepthOffsets[sampleNum]; ++ } ++ ++ INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum) ++ { ++ static const uint32_t RasterTileStencilOffsets[numSamples] ++ { 0, ++ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8), ++ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 2, ++ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 3, ++ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 4, ++ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 5, ++ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 6, ++ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 7, ++ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 8, ++ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 9, ++ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 10, ++ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 11, ++ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 12, ++ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 13, ++ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 14, ++ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits::bpp / 8) * 15, ++ }; ++ assert(sampleNum < numSamples); ++ return RasterTileStencilOffsets[sampleNum]; ++ } ++ ++ static const uint32_t numSamples = 16; ++ static const uint32_t sampleMask = 0xFFFF; ++}; +diff --git a/src/gallium/drivers/swr/rasterizer/core/pa.h b/src/gallium/drivers/swr/rasterizer/core/pa.h +new file mode 100644 +index 0000000..52ea820 +--- /dev/null ++++ b/src/gallium/drivers/swr/rasterizer/core/pa.h +@@ -0,0 +1,1205 @@ ++/**************************************************************************** ++* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. ++* ++* Permission is hereby granted, free of charge, to any person obtaining a ++* copy of this software and associated documentation files (the "Software"), ++* to deal in the Software without restriction, including without limitation ++* the rights to use, copy, modify, merge, publish, distribute, sublicense, ++* and/or sell copies of the Software, and to permit persons to whom the ++* Software is furnished to do so, subject to the following conditions: ++* ++* The above copyright notice and this permission notice (including the next ++* paragraph) shall be included in all copies or substantial portions of the ++* Software. ++* ++* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL ++* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++* IN THE SOFTWARE. ++* ++* @file pa.h ++* ++* @brief Definitions for primitive assembly. ++* N primitives are assembled at a time, where N is the SIMD width. ++* A state machine, that is specific for a given topology, drives the ++* assembly of vertices into triangles. ++* ++******************************************************************************/ ++#pragma once ++ ++#include "frontend.h" ++ ++struct PA_STATE ++{ ++ DRAW_CONTEXT *pDC; // draw context ++ uint8_t* pStreamBase; // vertex stream ++ uint32_t streamSizeInVerts; // total size of the input stream in verts ++ ++ // The topology the binner will use. In some cases the FE changes the topology from the api state. ++ PRIMITIVE_TOPOLOGY binTopology; ++ ++ PA_STATE() {} ++ PA_STATE(DRAW_CONTEXT *in_pDC, uint8_t* in_pStreamBase, uint32_t in_streamSizeInVerts) : ++ pDC(in_pDC), pStreamBase(in_pStreamBase), streamSizeInVerts(in_streamSizeInVerts) {} ++ ++ virtual bool HasWork() = 0; ++ virtual simdvector& GetSimdVector(uint32_t index, uint32_t slot) = 0; ++ virtual bool Assemble(uint32_t slot, simdvector verts[]) = 0; ++ virtual void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[]) = 0; ++ virtual bool NextPrim() = 0; ++ virtual simdvertex& GetNextVsOutput() = 0; ++ virtual bool GetNextStreamOutput() = 0; ++ virtual simdmask& GetNextVsIndices() = 0; ++ virtual uint32_t NumPrims() = 0; ++ virtual void Reset() = 0; ++ virtual simdscalari GetPrimID(uint32_t startID) = 0; ++}; ++ ++// The Optimized PA is a state machine that assembles triangles from vertex shader simd ++// output. Here is the sequence ++// 1. Execute FS/VS to generate a simd vertex (4 vertices for SSE simd and 8 for AVX simd). ++// 2. Execute PA function to assemble and bin triangles. ++// a. The PA function is a set of functions that collectively make up the ++// state machine for a given topology. ++// 1. We use a state index to track which PA function to call. ++// b. Often the PA function needs to 2 simd vertices in order to assemble the next triangle. ++// 1. We call this the current and previous simd vertex. ++// 2. The SSE simd is 4-wide which is not a multiple of 3 needed for triangles. In ++// order to assemble the second triangle, for a triangle list, we'll need the ++// last vertex from the previous simd and the first 2 vertices from the current simd. ++// 3. At times the PA can assemble multiple triangles from the 2 simd vertices. ++// ++// This optimized PA is not cut aware, so only should be used by non-indexed draws or draws without ++// cuts ++struct PA_STATE_OPT : public PA_STATE ++{ ++ simdvertex leadingVertex; // For tri-fan ++ uint32_t numPrims; // Total number of primitives for draw. ++ uint32_t numPrimsComplete; // Total number of complete primitives. ++ ++ uint32_t numSimdPrims; // Number of prims in current simd. ++ ++ uint32_t cur; // index to current VS output. ++ uint32_t prev; // index to prev VS output. Not really needed in the state. ++ uint32_t first; // index to first VS output. Used for trifan. 
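++
++    // e.g. with a 4-wide SIMD and a triangle list, vertex batch 0 holds verts
++    // {0..3} and batch 1 holds verts {4..7}: tri 0 = {0,1,2} assembles entirely
++    // from 'cur', but tri 1 = {3,4,5} needs vert 3 from 'prev' plus verts 4 and
++    // 5 from 'cur', which is why both indices are tracked here.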
++ ++ uint32_t counter; // state counter ++ bool reset; // reset state ++ ++ uint32_t primIDIncr; // how much to increment for each vector (typically vector / {1, 2}) ++ simdscalari primID; ++ ++ typedef bool(*PFN_PA_FUNC)(PA_STATE_OPT& state, uint32_t slot, simdvector verts[]); ++ typedef void(*PFN_PA_SINGLE_FUNC)(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]); ++ ++ PFN_PA_FUNC pfnPaFunc; // PA state machine function for assembling 4 triangles. ++ PFN_PA_SINGLE_FUNC pfnPaSingleFunc; // PA state machine function for assembling single triangle. ++ ++ // state used to advance the PA when Next is called ++ PFN_PA_FUNC pfnPaNextFunc; ++ uint32_t nextNumSimdPrims; ++ uint32_t nextNumPrimsIncrement; ++ bool nextReset; ++ bool isStreaming; ++ ++ simdmask tmpIndices; // temporary index store for unused virtual function ++ ++ PA_STATE_OPT() {} ++ PA_STATE_OPT(DRAW_CONTEXT* pDC, uint32_t numPrims, uint8_t* pStream, uint32_t streamSizeInVerts, ++ bool in_isStreaming, PRIMITIVE_TOPOLOGY topo = TOP_UNKNOWN); ++ ++ bool HasWork() ++ { ++ return (this->numPrimsComplete < this->numPrims) ? true : false; ++ } ++ ++ simdvector& GetSimdVector(uint32_t index, uint32_t slot) ++ { ++ simdvertex* pVertex = (simdvertex*)pStreamBase; ++ return pVertex[index].attrib[slot]; ++ } ++ ++ // Assembles 4 triangles. Each simdvector is a single vertex from 4 ++ // triangles (xxxx yyyy zzzz wwww) and there are 3 verts per triangle. ++ bool Assemble(uint32_t slot, simdvector verts[]) ++ { ++ return this->pfnPaFunc(*this, slot, verts); ++ } ++ ++ // Assembles 1 primitive. Each simdscalar is a vertex (xyzw). ++ void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[]) ++ { ++ return this->pfnPaSingleFunc(*this, slot, primIndex, verts); ++ } ++ ++ bool NextPrim() ++ { ++ this->pfnPaFunc = this->pfnPaNextFunc; ++ this->numSimdPrims = this->nextNumSimdPrims; ++ this->numPrimsComplete += this->nextNumPrimsIncrement; ++ this->reset = this->nextReset; ++ ++ if (this->isStreaming) ++ { ++ this->reset = false; ++ } ++ ++ bool morePrims = false; ++ ++ if (this->numSimdPrims > 0) ++ { ++ morePrims = true; ++ this->numSimdPrims--; ++ } ++ else ++ { ++ this->counter = (this->reset) ? 0 : (this->counter + 1); ++ this->reset = false; ++ } ++ ++ this->pfnPaFunc = this->pfnPaNextFunc; ++ ++ if (!HasWork()) ++ { ++ morePrims = false; // no more to do ++ } ++ ++ return morePrims; ++ } ++ ++ simdvertex& GetNextVsOutput() ++ { ++ // increment cur and prev indices ++ const uint32_t numSimdVerts = this->streamSizeInVerts / KNOB_SIMD_WIDTH; ++ this->prev = this->cur; // prev is undefined for first state. ++ this->cur = this->counter % numSimdVerts; ++ ++ simdvertex* pVertex = (simdvertex*)pStreamBase; ++ return pVertex[this->cur]; ++ } ++ ++ simdmask& GetNextVsIndices() ++ { ++ // unused in optimized PA, pass tmp buffer back ++ return tmpIndices; ++ } ++ ++ bool GetNextStreamOutput() ++ { ++ this->prev = this->cur; ++ this->cur = this->counter; ++ ++ return HasWork(); ++ } ++ ++ uint32_t NumPrims() ++ { ++ return (this->numPrimsComplete + this->nextNumPrimsIncrement > this->numPrims) ? 
++ (KNOB_SIMD_WIDTH - (this->numPrimsComplete + this->nextNumPrimsIncrement - this->numPrims)) : KNOB_SIMD_WIDTH; ++ } ++ ++ void SetNextState(PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc, ++ PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc, ++ uint32_t numSimdPrims = 0, ++ uint32_t numPrimsIncrement = 0, ++ bool reset = false) ++ { ++ this->pfnPaNextFunc = pfnPaNextFunc; ++ this->nextNumSimdPrims = numSimdPrims; ++ this->nextNumPrimsIncrement = numPrimsIncrement; ++ this->nextReset = reset; ++ ++ this->pfnPaSingleFunc = pfnPaNextSingleFunc; ++ } ++ ++ void Reset() ++ { ++ this->numPrimsComplete = 0; ++ this->numSimdPrims = 0; ++ this->cur = 0; ++ this->prev = 0; ++ this->first = 0; ++ this->counter = 0; ++ this->reset = false; ++ } ++ ++ simdscalari GetPrimID(uint32_t startID) ++ { ++ return _simd_add_epi32(this->primID, ++ _simd_set1_epi32(startID + this->primIDIncr * (this->numPrimsComplete / KNOB_SIMD_WIDTH))); ++ } ++}; ++ ++// helper C wrappers to avoid having to rewrite all the PA topology state functions ++INLINE void SetNextPaState(PA_STATE_OPT& pa, PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc, ++ PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc, ++ uint32_t numSimdPrims = 0, ++ uint32_t numPrimsIncrement = 0, ++ bool reset = false) ++{ ++ return pa.SetNextState(pfnPaNextFunc, pfnPaNextSingleFunc, numSimdPrims, numPrimsIncrement, reset); ++} ++INLINE simdvector& PaGetSimdVector(PA_STATE& pa, uint32_t index, uint32_t slot) ++{ ++ return pa.GetSimdVector(index, slot); ++} ++ ++INLINE __m128 swizzleLane0(const simdvector &a) ++{ ++ simdscalar tmp0 = _mm256_unpacklo_ps(a.x, a.z); ++ simdscalar tmp1 = _mm256_unpacklo_ps(a.y, a.w); ++ return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0); ++} ++ ++INLINE __m128 swizzleLane1(const simdvector &a) ++{ ++ simdscalar tmp0 = _mm256_unpacklo_ps(a.x, a.z); ++ simdscalar tmp1 = _mm256_unpacklo_ps(a.y, a.w); ++ return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0); ++} ++ ++INLINE __m128 swizzleLane2(const simdvector &a) ++{ ++ simdscalar tmp0 = _mm256_unpackhi_ps(a.x, a.z); ++ simdscalar tmp1 = _mm256_unpackhi_ps(a.y, a.w); ++ return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0); ++} ++ ++INLINE __m128 swizzleLane3(const simdvector &a) ++{ ++ simdscalar tmp0 = _mm256_unpackhi_ps(a.x, a.z); ++ simdscalar tmp1 = _mm256_unpackhi_ps(a.y, a.w); ++ return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0); ++} ++ ++INLINE __m128 swizzleLane4(const simdvector &a) ++{ ++ simdscalar tmp0 = _mm256_unpacklo_ps(a.x, a.z); ++ simdscalar tmp1 = _mm256_unpacklo_ps(a.y, a.w); ++ return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1); ++ ++} ++ ++INLINE __m128 swizzleLane5(const simdvector &a) ++{ ++ simdscalar tmp0 = _mm256_unpacklo_ps(a.x, a.z); ++ simdscalar tmp1 = _mm256_unpacklo_ps(a.y, a.w); ++ return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1); ++} ++ ++INLINE __m128 swizzleLane6(const simdvector &a) ++{ ++ simdscalar tmp0 = _mm256_unpackhi_ps(a.x, a.z); ++ simdscalar tmp1 = _mm256_unpackhi_ps(a.y, a.w); ++ return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1); ++} ++ ++INLINE __m128 swizzleLane7(const simdvector &a) ++{ ++ simdscalar tmp0 = _mm256_unpackhi_ps(a.x, a.z); ++ simdscalar tmp1 = _mm256_unpackhi_ps(a.y, a.w); ++ return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1); ++} ++ ++INLINE __m128 swizzleLaneN(const simdvector &a, int lane) ++{ ++ switch (lane) { ++ case 0: ++ return swizzleLane0(a); ++ case 1: ++ return swizzleLane1(a); ++ case 2: ++ return swizzleLane2(a); 
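++    // (each case here extracts one SoA lane -- eight x's, eight y's, etc. in
++    // the simdvector -- into a single AoS __m128 of {x, y, z, w}; see the
++    // unpack/extract pairs in swizzleLane0-7 above)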
++ case 3: ++ return swizzleLane3(a); ++ case 4: ++ return swizzleLane4(a); ++ case 5: ++ return swizzleLane5(a); ++ case 6: ++ return swizzleLane6(a); ++ case 7: ++ return swizzleLane7(a); ++ default: ++ return _mm_setzero_ps(); ++ } ++} ++ ++// Cut-aware primitive assembler. ++struct PA_STATE_CUT : public PA_STATE ++{ ++ simdmask* pCutIndices; // cut indices buffer, 1 bit per vertex ++ uint32_t numVerts; // number of vertices available in buffer store ++ uint32_t numAttribs; // number of attributes ++ int32_t numRemainingVerts; // number of verts remaining to be assembled ++ uint32_t numVertsToAssemble; // total number of verts to assemble for the draw ++ OSALIGNSIMD(uint32_t) indices[MAX_NUM_VERTS_PER_PRIM][KNOB_SIMD_WIDTH]; // current index buffer for gather ++ simdscalari vOffsets[MAX_NUM_VERTS_PER_PRIM]; // byte offsets for currently assembling simd ++ uint32_t numPrimsAssembled; // number of primitives that are fully assembled ++ uint32_t headVertex; // current unused vertex slot in vertex buffer store ++ uint32_t tailVertex; // beginning vertex currently assembling ++ uint32_t curVertex; // current unprocessed vertex ++ uint32_t startPrimId; // starting prim id ++ simdscalari vPrimId; // vector of prim ID ++ bool needOffsets; // need to compute gather offsets for current SIMD ++ uint32_t vertsPerPrim; ++ simdvertex tmpVertex; // temporary simdvertex for unimplemented API ++ bool processCutVerts; // vertex indices with cuts should be processed as normal, otherwise they ++ // are ignored. Fetch shader sends invalid verts on cuts that should be ignored ++ // while the GS sends valid verts for every index ++ // Topology state tracking ++ uint32_t vert[MAX_NUM_VERTS_PER_PRIM]; ++ uint32_t curIndex; ++ bool reverseWinding; // indicates reverse winding for strips ++ int32_t adjExtraVert; // extra vert uses for tristrip w/ adj ++ ++ typedef void(PA_STATE_CUT::* PFN_PA_FUNC)(uint32_t vert, bool finish); ++ PFN_PA_FUNC pfnPa; // per-topology function that processes a single vert ++ ++ PA_STATE_CUT() {} ++ PA_STATE_CUT(DRAW_CONTEXT* pDC, uint8_t* in_pStream, uint32_t in_streamSizeInVerts, simdmask* in_pIndices, uint32_t in_numVerts, ++ uint32_t in_numAttribs, PRIMITIVE_TOPOLOGY topo, bool in_processCutVerts) ++ : PA_STATE(pDC, in_pStream, in_streamSizeInVerts) ++ { ++ numVerts = in_streamSizeInVerts; ++ numAttribs = in_numAttribs; ++ binTopology = topo; ++ needOffsets = false; ++ processCutVerts = in_processCutVerts; ++ ++ numVertsToAssemble = numRemainingVerts = in_numVerts; ++ numPrimsAssembled = 0; ++ headVertex = tailVertex = curVertex = 0; ++ ++ curIndex = 0; ++ pCutIndices = in_pIndices; ++ memset(indices, 0, sizeof(indices)); ++ vPrimId = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); ++ reverseWinding = false; ++ adjExtraVert = -1; ++ ++ bool gsEnabled = pDC->pState->state.gsState.gsEnable; ++ vertsPerPrim = NumVertsPerPrim(topo, gsEnabled); ++ ++ switch (topo) ++ { ++ case TOP_TRIANGLE_LIST: pfnPa = &PA_STATE_CUT::ProcessVertTriList; break; ++ case TOP_TRI_LIST_ADJ: pfnPa = gsEnabled ? 
&PA_STATE_CUT::ProcessVertTriListAdj : &PA_STATE_CUT::ProcessVertTriListAdjNoGs; break; ++ case TOP_TRIANGLE_STRIP: pfnPa = &PA_STATE_CUT::ProcessVertTriStrip; break; ++ case TOP_TRI_STRIP_ADJ: if (gsEnabled) ++ { ++ pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj < true > ; ++ } ++ else ++ { ++ pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj < false > ; ++ } ++ break; ++ ++ case TOP_POINT_LIST: pfnPa = &PA_STATE_CUT::ProcessVertPointList; break; ++ case TOP_LINE_LIST: pfnPa = &PA_STATE_CUT::ProcessVertLineList; break; ++ case TOP_LINE_LIST_ADJ: pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertLineListAdj : &PA_STATE_CUT::ProcessVertLineListAdjNoGs; break; ++ case TOP_LINE_STRIP: pfnPa = &PA_STATE_CUT::ProcessVertLineStrip; break; ++ case TOP_LISTSTRIP_ADJ: pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertLineStripAdj : &PA_STATE_CUT::ProcessVertLineStripAdjNoGs; break; ++ default: assert(0 && "Unimplemented topology"); ++ } ++ } ++ ++ simdvertex& GetNextVsOutput() ++ { ++ uint32_t vertexIndex = this->headVertex / KNOB_SIMD_WIDTH; ++ this->headVertex = (this->headVertex + KNOB_SIMD_WIDTH) % this->numVerts; ++ this->needOffsets = true; ++ return ((simdvertex*)pStreamBase)[vertexIndex]; ++ } ++ ++ simdmask& GetNextVsIndices() ++ { ++ uint32_t vertexIndex = this->headVertex / KNOB_SIMD_WIDTH; ++ simdmask* pCurCutIndex = this->pCutIndices + vertexIndex; ++ return *pCurCutIndex; ++ } ++ ++ simdvector& GetSimdVector(uint32_t index, uint32_t slot) ++ { ++ // unused ++ SWR_ASSERT(0 && "Not implemented"); ++ return this->tmpVertex.attrib[0]; ++ } ++ ++ bool GetNextStreamOutput() ++ { ++ this->headVertex += KNOB_SIMD_WIDTH; ++ this->needOffsets = true; ++ return HasWork(); ++ } ++ ++ simdscalari GetPrimID(uint32_t startID) ++ { ++ return _simd_add_epi32(_simd_set1_epi32(startID), this->vPrimId); ++ } ++ ++ void Reset() ++ { ++ this->numRemainingVerts = this->numVertsToAssemble; ++ this->numPrimsAssembled = 0; ++ this->curIndex = 0; ++ this->curVertex = 0; ++ this->tailVertex = 0; ++ this->headVertex = 0; ++ this->reverseWinding = false; ++ this->adjExtraVert = -1; ++ this->vPrimId = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); ++ } ++ ++ bool HasWork() ++ { ++ return this->numRemainingVerts > 0 || this->adjExtraVert != -1; ++ } ++ ++ bool IsVertexStoreFull() ++ { ++ return ((this->headVertex + KNOB_SIMD_WIDTH) % this->numVerts) == this->tailVertex; ++ } ++ ++ void RestartTopology() ++ { ++ this->curIndex = 0; ++ this->reverseWinding = false; ++ this->adjExtraVert = -1; ++ } ++ ++ bool IsCutIndex(uint32_t vertex) ++ { ++ uint32_t vertexIndex = vertex / KNOB_SIMD_WIDTH; ++ uint32_t vertexOffset = vertex & (KNOB_SIMD_WIDTH - 1); ++ return _bittest((const LONG*)&this->pCutIndices[vertexIndex], vertexOffset) == 1; ++ } ++ ++ // iterates across the unprocessed verts until we hit the end or we ++ // have assembled SIMD prims ++ void ProcessVerts() ++ { ++ while (this->numPrimsAssembled != KNOB_SIMD_WIDTH && ++ this->numRemainingVerts > 0 && ++ this->curVertex != this->headVertex) ++ { ++ // if cut index, restart topology ++ if (IsCutIndex(this->curVertex)) ++ { ++ if (this->processCutVerts) ++ { ++ (this->*pfnPa)(this->curVertex, false); ++ } ++ // finish off tri strip w/ adj before restarting topo ++ if (this->adjExtraVert != -1) ++ { ++ (this->*pfnPa)(this->curVertex, true); ++ } ++ RestartTopology(); ++ } ++ else ++ { ++ (this->*pfnPa)(this->curVertex, false); ++ } ++ ++ this->curVertex = (this->curVertex + 1) % this->numVerts; ++ this->numRemainingVerts--; ++ } ++ ++ // special case last primitive for tri strip w/ 
adj ++ if (this->numPrimsAssembled != KNOB_SIMD_WIDTH && this->numRemainingVerts == 0 && this->adjExtraVert != -1) ++ { ++ (this->*pfnPa)(this->curVertex, true); ++ } ++ } ++ ++ void Advance() ++ { ++ // done with current batch ++ // advance tail to the current unsubmitted vertex ++ this->tailVertex = this->curVertex; ++ this->numPrimsAssembled = 0; ++ this->vPrimId = _simd_add_epi32(vPrimId, _simd_set1_epi32(KNOB_SIMD_WIDTH)); ++ } ++ ++ bool NextPrim() ++ { ++ // if we've assembled enough prims, we can advance to the next set of verts ++ if (this->numPrimsAssembled == KNOB_SIMD_WIDTH || this->numRemainingVerts <= 0) ++ { ++ Advance(); ++ } ++ return false; ++ } ++ ++ void ComputeOffsets() ++ { ++ for (uint32_t v = 0; v < this->vertsPerPrim; ++v) ++ { ++ simdscalari vIndices = *(simdscalari*)&this->indices[v][0]; ++ ++ // step to simdvertex batch ++ const uint32_t simdShift = 3; // @todo make knob ++ simdscalari vVertexBatch = _simd_srai_epi32(vIndices, simdShift); ++ this->vOffsets[v] = _simd_mullo_epi32(vVertexBatch, _simd_set1_epi32(sizeof(simdvertex))); ++ ++ // step to index ++ const uint32_t simdMask = 0x7; // @todo make knob ++ simdscalari vVertexIndex = _simd_and_si(vIndices, _simd_set1_epi32(simdMask)); ++ this->vOffsets[v] = _simd_add_epi32(this->vOffsets[v], _simd_mullo_epi32(vVertexIndex, _simd_set1_epi32(sizeof(float)))); ++ } ++ } ++ ++ bool Assemble(uint32_t slot, simdvector result[]) ++ { ++ // process any outstanding verts ++ ProcessVerts(); ++ ++ // return false if we don't have enough prims assembled ++ if (this->numPrimsAssembled != KNOB_SIMD_WIDTH && this->numRemainingVerts > 0) ++ { ++ return false; ++ } ++ ++ // cache off gather offsets given the current SIMD set of indices the first time we get an assemble ++ if (this->needOffsets) ++ { ++ ComputeOffsets(); ++ this->needOffsets = false; ++ } ++ ++ for (uint32_t v = 0; v < this->vertsPerPrim; ++v) ++ { ++ simdscalari offsets = this->vOffsets[v]; ++ ++ // step to attribute ++ offsets = _simd_add_epi32(offsets, _simd_set1_epi32(slot * sizeof(simdvector))); ++ ++ float* pBase = (float*)this->pStreamBase; ++ for (uint32_t c = 0; c < 4; ++c) ++ { ++ result[v].v[c] = _simd_i32gather_ps(pBase, offsets, 1); ++ ++ // move base to next component ++ pBase += KNOB_SIMD_WIDTH; ++ } ++ } ++ ++ return true; ++ } ++ ++ void AssembleSingle(uint32_t slot, uint32_t triIndex, __m128 tri[3]) ++ { ++ // move to slot ++ for (uint32_t v = 0; v < this->vertsPerPrim; ++v) ++ { ++ uint32_t* pOffset = (uint32_t*)&this->vOffsets[v]; ++ uint32_t offset = pOffset[triIndex]; ++ offset += sizeof(simdvector) * slot; ++ float* pVert = (float*)&tri[v]; ++ for (uint32_t c = 0; c < 4; ++c) ++ { ++ float* pComponent = (float*)(this->pStreamBase + offset); ++ pVert[c] = *pComponent; ++ offset += KNOB_SIMD_WIDTH * sizeof(float); ++ } ++ } ++ } ++ ++ uint32_t NumPrims() ++ { ++ return this->numPrimsAssembled; ++ } ++ ++ // Per-topology functions ++ void ProcessVertTriStrip(uint32_t index, bool finish) ++ { ++ this->vert[this->curIndex] = index; ++ this->curIndex++; ++ if (this->curIndex == 3) ++ { ++ // assembled enough verts for prim, add to gather indices ++ this->indices[0][this->numPrimsAssembled] = this->vert[0]; ++ if (reverseWinding) ++ { ++ this->indices[1][this->numPrimsAssembled] = this->vert[2]; ++ this->indices[2][this->numPrimsAssembled] = this->vert[1]; ++ } ++ else ++ { ++ this->indices[1][this->numPrimsAssembled] = this->vert[1]; ++ this->indices[2][this->numPrimsAssembled] = this->vert[2]; ++ } ++ ++ // increment numPrimsAssembled ++ 
this->numPrimsAssembled++; ++ ++ // set up next prim state ++ this->vert[0] = this->vert[1]; ++ this->vert[1] = this->vert[2]; ++ this->curIndex = 2; ++ this->reverseWinding ^= 1; ++ } ++ } ++ ++ template <bool gsEnabled> ++ void AssembleTriStripAdj() ++ { ++ if (!gsEnabled) ++ { ++ this->vert[1] = this->vert[2]; ++ this->vert[2] = this->vert[4]; ++ ++ this->indices[0][this->numPrimsAssembled] = this->vert[0]; ++ this->indices[1][this->numPrimsAssembled] = this->vert[1]; ++ this->indices[2][this->numPrimsAssembled] = this->vert[2]; ++ ++ this->vert[4] = this->vert[2]; ++ this->vert[2] = this->vert[1]; ++ } ++ else ++ { ++ this->indices[0][this->numPrimsAssembled] = this->vert[0]; ++ this->indices[1][this->numPrimsAssembled] = this->vert[1]; ++ this->indices[2][this->numPrimsAssembled] = this->vert[2]; ++ this->indices[3][this->numPrimsAssembled] = this->vert[3]; ++ this->indices[4][this->numPrimsAssembled] = this->vert[4]; ++ this->indices[5][this->numPrimsAssembled] = this->vert[5]; ++ } ++ this->numPrimsAssembled++; ++ } ++ ++ ++ template <bool gsEnabled> ++ void ProcessVertTriStripAdj(uint32_t index, bool finish) ++ { ++ // handle last primitive of tristrip ++ if (finish && this->adjExtraVert != -1) ++ { ++ this->vert[3] = this->adjExtraVert; ++ AssembleTriStripAdj<gsEnabled>(); ++ this->adjExtraVert = -1; ++ return; ++ } ++ ++ switch (this->curIndex) ++ { ++ case 0: ++ case 1: ++ case 2: ++ case 4: ++ this->vert[this->curIndex] = index; ++ this->curIndex++; ++ break; ++ case 3: ++ this->vert[5] = index; ++ this->curIndex++; ++ break; ++ case 5: ++ if (this->adjExtraVert == -1) ++ { ++ this->adjExtraVert = index; ++ } ++ else ++ { ++ this->vert[3] = index; ++ if (!gsEnabled) ++ { ++ AssembleTriStripAdj<gsEnabled>(); ++ ++ uint32_t nextTri[6]; ++ if (this->reverseWinding) ++ { ++ nextTri[0] = this->vert[4]; ++ nextTri[1] = this->vert[0]; ++ nextTri[2] = this->vert[2]; ++ nextTri[4] = this->vert[3]; ++ nextTri[5] = this->adjExtraVert; ++ } ++ else ++ { ++ nextTri[0] = this->vert[2]; ++ nextTri[1] = this->adjExtraVert; ++ nextTri[2] = this->vert[3]; ++ nextTri[4] = this->vert[4]; ++ nextTri[5] = this->vert[0]; ++ } ++ for (uint32_t i = 0; i < 6; ++i) ++ { ++ this->vert[i] = nextTri[i]; ++ } ++ ++ this->adjExtraVert = -1; ++ this->reverseWinding ^= 1; ++ } ++ else ++ { ++ this->curIndex++; ++ } ++ } ++ break; ++ case 6: ++ SWR_ASSERT(this->adjExtraVert != -1, "Algorithm failure!"); ++ AssembleTriStripAdj<gsEnabled>(); ++ ++ uint32_t nextTri[6]; ++ if (this->reverseWinding) ++ { ++ nextTri[0] = this->vert[4]; ++ nextTri[1] = this->vert[0]; ++ nextTri[2] = this->vert[2]; ++ nextTri[4] = this->vert[3]; ++ nextTri[5] = this->adjExtraVert; ++ } ++ else ++ { ++ nextTri[0] = this->vert[2]; ++ nextTri[1] = this->adjExtraVert; ++ nextTri[2] = this->vert[3]; ++ nextTri[4] = this->vert[4]; ++ nextTri[5] = this->vert[0]; ++ } ++ for (uint32_t i = 0; i < 6; ++i) ++ { ++ this->vert[i] = nextTri[i]; ++ } ++ this->reverseWinding ^= 1; ++ this->adjExtraVert = index; ++ this->curIndex--; ++ break; ++ } ++ } ++ ++ void ProcessVertTriList(uint32_t index, bool finish) ++ { ++ this->vert[this->curIndex] = index; ++ this->curIndex++; ++ if (this->curIndex == 3) ++ { ++ // assembled enough verts for prim, add to gather indices ++ this->indices[0][this->numPrimsAssembled] = this->vert[0]; ++ this->indices[1][this->numPrimsAssembled] = this->vert[1]; ++ this->indices[2][this->numPrimsAssembled] = this->vert[2]; ++ ++ // increment numPrimsAssembled ++ this->numPrimsAssembled++; ++ ++ // set up next prim state ++ this->curIndex = 0; ++ } ++ } ++ ++ void ProcessVertTriListAdj(uint32_t
index, bool finish) ++ { ++ this->vert[this->curIndex] = index; ++ this->curIndex++; ++ if (this->curIndex == 6) ++ { ++ // assembled enough verts for prim, add to gather indices ++ this->indices[0][this->numPrimsAssembled] = this->vert[0]; ++ this->indices[1][this->numPrimsAssembled] = this->vert[1]; ++ this->indices[2][this->numPrimsAssembled] = this->vert[2]; ++ this->indices[3][this->numPrimsAssembled] = this->vert[3]; ++ this->indices[4][this->numPrimsAssembled] = this->vert[4]; ++ this->indices[5][this->numPrimsAssembled] = this->vert[5]; ++ ++ // increment numPrimsAssembled ++ this->numPrimsAssembled++; ++ ++ // set up next prim state ++ this->curIndex = 0; ++ } ++ } ++ ++ void ProcessVertTriListAdjNoGs(uint32_t index, bool finish) ++ { ++ this->vert[this->curIndex] = index; ++ this->curIndex++; ++ if (this->curIndex == 6) ++ { ++ // assembled enough verts for prim, add to gather indices ++ this->indices[0][this->numPrimsAssembled] = this->vert[0]; ++ this->indices[1][this->numPrimsAssembled] = this->vert[2]; ++ this->indices[2][this->numPrimsAssembled] = this->vert[4]; ++ ++ // increment numPrimsAssembled ++ this->numPrimsAssembled++; ++ ++ // set up next prim state ++ this->curIndex = 0; ++ } ++ } ++ ++ ++ void ProcessVertLineList(uint32_t index, bool finish) ++ { ++ this->vert[this->curIndex] = index; ++ this->curIndex++; ++ if (this->curIndex == 2) ++ { ++ this->indices[0][this->numPrimsAssembled] = this->vert[0]; ++ this->indices[1][this->numPrimsAssembled] = this->vert[1]; ++ ++ this->numPrimsAssembled++; ++ this->curIndex = 0; ++ } ++ } ++ ++ void ProcessVertLineStrip(uint32_t index, bool finish) ++ { ++ this->vert[this->curIndex] = index; ++ this->curIndex++; ++ if (this->curIndex == 2) ++ { ++ // assembled enough verts for prim, add to gather indices ++ this->indices[0][this->numPrimsAssembled] = this->vert[0]; ++ this->indices[1][this->numPrimsAssembled] = this->vert[1]; ++ ++ // increment numPrimsAssembled ++ this->numPrimsAssembled++; ++ ++ // set up next prim state ++ this->vert[0] = this->vert[1]; ++ this->curIndex = 1; ++ } ++ } ++ ++ void ProcessVertLineStripAdj(uint32_t index, bool finish) ++ { ++ this->vert[this->curIndex] = index; ++ this->curIndex++; ++ if (this->curIndex == 4) ++ { ++ // assembled enough verts for prim, add to gather indices ++ this->indices[0][this->numPrimsAssembled] = this->vert[0]; ++ this->indices[1][this->numPrimsAssembled] = this->vert[1]; ++ this->indices[2][this->numPrimsAssembled] = this->vert[2]; ++ this->indices[3][this->numPrimsAssembled] = this->vert[3]; ++ ++ // increment numPrimsAssembled ++ this->numPrimsAssembled++; ++ ++ // set up next prim state ++ this->vert[0] = this->vert[1]; ++ this->vert[1] = this->vert[2]; ++ this->vert[2] = this->vert[3]; ++ this->curIndex = 3; ++ } ++ } ++ ++ void ProcessVertLineStripAdjNoGs(uint32_t index, bool finish) ++ { ++ this->vert[this->curIndex] = index; ++ this->curIndex++; ++ if (this->curIndex == 4) ++ { ++ // assembled enough verts for prim, add to gather indices ++ this->indices[0][this->numPrimsAssembled] = this->vert[1]; ++ this->indices[1][this->numPrimsAssembled] = this->vert[2]; ++ ++ // increment numPrimsAssembled ++ this->numPrimsAssembled++; ++ ++ // set up next prim state ++ this->vert[0] = this->vert[1]; ++ this->vert[1] = this->vert[2]; ++ this->vert[2] = this->vert[3]; ++ this->curIndex = 3; ++ } ++ } ++ ++ void ProcessVertLineListAdj(uint32_t index, bool finish) ++ { ++ this->vert[this->curIndex] = index; ++ this->curIndex++; ++ if (this->curIndex == 4) ++ { ++ 
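++ // a line with adjacency carries 4 buffered vertices; with a GS bound all
++ // four (adj, start, end, adj) are forwarded, while the NoGs variant below
++ // keeps only the interior pair (vert[1], vert[2])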
this->indices[0][this->numPrimsAssembled] = this->vert[0]; ++ this->indices[1][this->numPrimsAssembled] = this->vert[1]; ++ this->indices[2][this->numPrimsAssembled] = this->vert[2]; ++ this->indices[3][this->numPrimsAssembled] = this->vert[3]; ++ ++ this->numPrimsAssembled++; ++ this->curIndex = 0; ++ } ++ } ++ ++ void ProcessVertLineListAdjNoGs(uint32_t index, bool finish) ++ { ++ this->vert[this->curIndex] = index; ++ this->curIndex++; ++ if (this->curIndex == 4) ++ { ++ this->indices[0][this->numPrimsAssembled] = this->vert[1]; ++ this->indices[1][this->numPrimsAssembled] = this->vert[2]; ++ ++ this->numPrimsAssembled++; ++ this->curIndex = 0; ++ } ++ } ++ ++ void ProcessVertPointList(uint32_t index, bool finish) ++ { ++ this->vert[this->curIndex] = index; ++ this->curIndex++; ++ if (this->curIndex == 1) ++ { ++ this->indices[0][this->numPrimsAssembled] = this->vert[0]; ++ this->numPrimsAssembled++; ++ this->curIndex = 0; ++ } ++ } ++}; ++ ++// Primitive Assembly for data output from the DomainShader. ++struct PA_TESS : PA_STATE ++{ ++ PA_TESS( ++ DRAW_CONTEXT *in_pDC, ++ const simdscalar* in_pVertData, ++ uint32_t in_attributeStrideInVectors, ++ uint32_t in_numAttributes, ++ uint32_t* (&in_ppIndices)[3], ++ uint32_t in_numPrims, ++ PRIMITIVE_TOPOLOGY in_binTopology) : ++ ++ PA_STATE(in_pDC, nullptr, 0), ++ m_pVertexData(in_pVertData), ++ m_attributeStrideInVectors(in_attributeStrideInVectors), ++ m_numAttributes(in_numAttributes), ++ m_numPrims(in_numPrims) ++ { ++ m_vPrimId = _simd_setzero_si(); ++ binTopology = in_binTopology; ++ m_ppIndices[0] = in_ppIndices[0]; ++ m_ppIndices[1] = in_ppIndices[1]; ++ m_ppIndices[2] = in_ppIndices[2]; ++ ++ switch (binTopology) ++ { ++ case TOP_POINT_LIST: ++ m_numVertsPerPrim = 1; ++ break; ++ ++ case TOP_LINE_LIST: ++ m_numVertsPerPrim = 2; ++ break; ++ ++ case TOP_TRIANGLE_LIST: ++ m_numVertsPerPrim = 3; ++ break; ++ ++ default: ++ SWR_ASSERT(0, "Invalid binTopology (%d) for %s", binTopology, __FUNCTION__); ++ break; ++ } ++ } ++ ++ bool HasWork() ++ { ++ return m_numPrims != 0; ++ } ++ ++ simdvector& GetSimdVector(uint32_t index, uint32_t slot) ++ { ++ SWR_ASSERT(0, "%s NOT IMPLEMENTED", __FUNCTION__); ++ static simdvector junk = { 0 }; ++ return junk; ++ } ++ ++ static simdscalari GenPrimMask(uint32_t numPrims) ++ { ++ SWR_ASSERT(numPrims <= KNOB_SIMD_WIDTH); ++#if KNOB_SIMD_WIDTH == 8 ++ static const OSALIGN(int32_t, 64) maskGen[KNOB_SIMD_WIDTH * 2] = ++ { ++ -1, -1, -1, -1, -1, -1, -1, -1, ++ 0, 0, 0, 0, 0, 0, 0, 0 ++ }; ++#elif KNOB_SIMD_WIDTH == 16 ++ static const OSALIGN(int32_t, 128) maskGen[KNOB_SIMD_WIDTH * 2] = ++ { ++ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, ++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ++ }; ++#else ++#error "Help, help, I can't get up!" 
++#endif ++ ++ return _simd_loadu_si((const simdscalari*)&maskGen[KNOB_SIMD_WIDTH - numPrims]); ++ } ++ ++ bool Assemble(uint32_t slot, simdvector verts[]) ++ { ++ static_assert(KNOB_SIMD_WIDTH == 8, "Need to revisit this when AVX512 is implemented"); ++ SWR_ASSERT(slot < m_numAttributes); ++ ++ uint32_t numPrimsToAssemble = PA_TESS::NumPrims(); ++ if (0 == numPrimsToAssemble) ++ { ++ return false; ++ } ++ ++ simdscalari mask = GenPrimMask(numPrimsToAssemble); ++ ++ const float* pBaseAttrib = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4]; ++ for (uint32_t i = 0; i < m_numVertsPerPrim; ++i) ++ { ++ simdscalari indices = _simd_load_si((const simdscalari*)m_ppIndices[i]); ++ ++ const float* pBase = pBaseAttrib; ++ for (uint32_t c = 0; c < 4; ++c) ++ { ++ verts[i].v[c] = _simd_mask_i32gather_ps( ++ _simd_setzero_ps(), ++ pBase, ++ indices, ++ _simd_castsi_ps(mask), ++ 4 /* gcc doesn't like sizeof(float) */); ++ pBase += m_attributeStrideInVectors * KNOB_SIMD_WIDTH; ++ } ++ } ++ ++ return true; ++ } ++ ++ void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[]) ++ { ++ SWR_ASSERT(slot < m_numAttributes); ++ SWR_ASSERT(primIndex < PA_TESS::NumPrims()); ++ ++ const float* pVertDataBase = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4]; ++ for (uint32_t i = 0; i < m_numVertsPerPrim; ++i) ++ { ++ uint32_t index = m_ppIndices[i][primIndex]; ++ const float* pVertData = pVertDataBase; ++ float* pVert = (float*)&verts[i]; ++ ++ for (uint32_t c = 0; c < 4; ++c) ++ { ++ pVert[c] = pVertData[index]; ++ pVertData += m_attributeStrideInVectors * KNOB_SIMD_WIDTH; ++ } ++ } ++ } ++ ++ bool NextPrim() ++ { ++ uint32_t numPrims = PA_TESS::NumPrims(); ++ m_numPrims -= numPrims; ++ m_ppIndices[0] += numPrims; ++ m_ppIndices[1] += numPrims; ++ m_ppIndices[2] += numPrims; ++ ++ return HasWork(); ++ } ++ ++ simdvertex& GetNextVsOutput() ++ { ++ SWR_ASSERT(0, "%s", __FUNCTION__); ++ static simdvertex junk; ++ return junk; ++ } ++ ++ bool GetNextStreamOutput() ++ { ++ SWR_ASSERT(0, "%s", __FUNCTION__); ++ return false; ++ } ++ ++ simdmask& GetNextVsIndices() ++ { ++ SWR_ASSERT(0, "%s", __FUNCTION__); ++ static simdmask junk; ++ return junk; ++ } ++ ++ uint32_t NumPrims() ++ { ++ return std::min(m_numPrims, KNOB_SIMD_WIDTH); ++ } ++ ++ void Reset() { SWR_ASSERT(0); }; ++ ++ simdscalari GetPrimID(uint32_t startID) ++ { ++ return _simd_add_epi32(_simd_set1_epi32(startID), m_vPrimId); ++ } ++ ++private: ++ const simdscalar* m_pVertexData = nullptr; ++ uint32_t m_attributeStrideInVectors = 0; ++ uint32_t m_numAttributes = 0; ++ uint32_t m_numPrims = 0; ++ uint32_t* m_ppIndices[3]; ++ ++ uint32_t m_numVertsPerPrim = 0; ++ ++ simdscalari m_vPrimId; ++}; ++ ++// Primitive Assembler factory class, responsible for creating and initializing the correct assembler ++// based on state. 
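++//
++// Hedged usage sketch (illustrative only): a frontend might drive the factory
++// roughly as below. The ProcessPrim callback and the use of slot 0 for
++// position are assumptions for illustration, not part of this patch; vertex
++// shader results are written through pa.GetNextVsOutput() before assembly.
++//
++//   PA_FACTORY paFactory(pDC, isIndexed, state.topology, numVerts);
++//   PA_STATE& pa = paFactory.GetPA();
++//   simdvector prim[MAX_NUM_VERTS_PER_PRIM];
++//   while (pa.HasWork())
++//   {
++//       if (pa.Assemble(0 /* position */, prim))
++//       {
++//           ProcessPrim(pa, prim); // bin/clip KNOB_SIMD_WIDTH prims at once
++//       }
++//       pa.NextPrim();
++//   }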
++struct PA_FACTORY ++{ ++ PA_FACTORY(DRAW_CONTEXT* pDC, bool isIndexed, PRIMITIVE_TOPOLOGY in_topo, uint32_t numVerts) : topo(in_topo) ++ { ++#if KNOB_ENABLE_CUT_AWARE_PA == TRUE ++ const API_STATE& state = GetApiState(pDC); ++ if ((isIndexed && ( ++ topo == TOP_TRIANGLE_STRIP || ++ (topo == TOP_POINT_LIST && CanUseSimplePoints(pDC)) || ++ topo == TOP_LINE_LIST || topo == TOP_LINE_STRIP || ++ topo == TOP_TRIANGLE_LIST || topo == TOP_LINE_LIST_ADJ || ++ topo == TOP_LISTSTRIP_ADJ || topo == TOP_TRI_LIST_ADJ || ++ topo == TOP_TRI_STRIP_ADJ)) || ++ ++ // non-indexed draws with adjacency topologies must use cut-aware PA until we add support ++ // for them in the optimized PA ++ (!isIndexed && ( ++ topo == TOP_LINE_LIST_ADJ || topo == TOP_LISTSTRIP_ADJ || topo == TOP_TRI_LIST_ADJ || topo == TOP_TRI_STRIP_ADJ))) ++ { ++ DWORD numAttribs; ++ _BitScanReverse(&numAttribs, state.feAttribMask); ++ numAttribs++; ++ this->paCut = PA_STATE_CUT(pDC, (uint8_t*)&this->vertexStore[0], MAX_NUM_VERTS_PER_PRIM * KNOB_SIMD_WIDTH, ++ &this->indexStore[0], numVerts, numAttribs, state.topology, false); ++ cutPA = true; ++ } ++ else ++#endif ++ { ++ uint32_t numPrims = GetNumPrims(in_topo, numVerts); ++ this->paOpt = PA_STATE_OPT(pDC, numPrims, (uint8_t*)&this->vertexStore[0], MAX_NUM_VERTS_PER_PRIM * KNOB_SIMD_WIDTH, false); ++ cutPA = false; ++ } ++ ++ } ++ ++ PA_STATE& GetPA() ++ { ++#if KNOB_ENABLE_CUT_AWARE_PA == TRUE ++ if (cutPA) ++ { ++ return this->paCut; ++ } ++ else ++#endif ++ { ++ return this->paOpt; ++ } ++ } ++ ++ PA_STATE_OPT paOpt; ++ PA_STATE_CUT paCut; ++ bool cutPA; ++ ++ PRIMITIVE_TOPOLOGY topo; ++ ++ simdvertex vertexStore[MAX_NUM_VERTS_PER_PRIM]; ++ simdmask indexStore[MAX_NUM_VERTS_PER_PRIM]; ++}; +diff --git a/src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp b/src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp +new file mode 100644 +index 0000000..6dce0bb +--- /dev/null ++++ b/src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp +@@ -0,0 +1,1330 @@ ++/**************************************************************************** ++* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. ++* ++* Permission is hereby granted, free of charge, to any person obtaining a ++* copy of this software and associated documentation files (the "Software"), ++* to deal in the Software without restriction, including without limitation ++* the rights to use, copy, modify, merge, publish, distribute, sublicense, ++* and/or sell copies of the Software, and to permit persons to whom the ++* Software is furnished to do so, subject to the following conditions: ++* ++* The above copyright notice and this permission notice (including the next ++* paragraph) shall be included in all copies or substantial portions of the ++* Software. ++* ++* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++* IN THE SOFTWARE. ++* ++* @file pa_avx.cpp ++* ++* @brief AVX implementation for primitive assembly. ++* N primitives are assembled at a time, where N is the SIMD width. ++* A state machine, that is specific for a given topology, drives the ++* assembly of vertices into triangles. 
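++* For example, with KNOB_SIMD_WIDTH == 8 a TRIANGLE_LIST consumes three
++* input simdvectors (24 vertices) per assembly step and emits verts[0..2],
++* where verts[n] holds corner n of 8 triangles in struct-of-arrays form.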
++* ++******************************************************************************/ ++#include "context.h" ++#include "pa.h" ++#include "frontend.h" ++ ++#if (KNOB_SIMD_WIDTH == 8) ++ ++bool PaTriList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); ++bool PaTriList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); ++bool PaTriList2(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); ++void PaTriListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]); ++ ++bool PaTriStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); ++bool PaTriStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); ++void PaTriStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]); ++ ++bool PaTriFan0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); ++bool PaTriFan1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); ++void PaTriFanSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]); ++ ++bool PaQuadList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); ++bool PaQuadList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); ++void PaQuadListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]); ++ ++bool PaLineLoop0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); ++bool PaLineLoop1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); ++ ++bool PaLineList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); ++bool PaLineList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); ++void PaLineListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t index, __m128 verts[]); ++ ++bool PaLineStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); ++bool PaLineStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); ++void PaLineStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 lineverts[]); ++ ++bool PaTriPoints0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); ++bool PaTriPoints1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); ++void PaTriPointsSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]); ++void PaTriPointsSingle1(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]); ++ ++bool PaPoints0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); ++void PaPointsSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]); ++ ++bool PaRectList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); ++bool PaRectList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); ++bool PaRectList2(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); ++void PaRectListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]); ++ ++template <uint32_t TotalControlPoints> ++void PaPatchListSingle(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]) ++{ ++ // We have an input of KNOB_SIMD_WIDTH * TotalControlPoints and we output ++ // KNOB_SIMD_WIDTH * 1 patch. This function is called once per attribute. ++ // Each attribute has 4 components.
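++ //
++ // Worked example (KNOB_SIMD_WIDTH == 8, TotalControlPoints == 4):
++ // control point 2 of patch primIndex == 3 is input_cp = 3 * 4 + 2 = 14,
++ // which lives in simdvector 14 / 8 = 1 at lane 14 % 8 = 6 of each component.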
++ ++ /// @todo Optimize this ++ ++ float* pOutVec = (float*)verts; ++ ++ for (uint32_t cp = 0; cp < TotalControlPoints; ++cp) ++ { ++ uint32_t input_cp = primIndex * TotalControlPoints + cp; ++ uint32_t input_vec = input_cp / KNOB_SIMD_WIDTH; ++ uint32_t input_lane = input_cp % KNOB_SIMD_WIDTH; ++ ++ // Loop over all components of the attribute ++ for (uint32_t i = 0; i < 4; ++i) ++ { ++ const float* pInputVec = (const float*)(&PaGetSimdVector(pa, input_vec, slot)[i]); ++ pOutVec[cp * 4 + i] = pInputVec[input_lane]; ++ } ++ } ++} ++ ++template <uint32_t TotalControlPoints, uint32_t CurrentControlPoints = 1> ++static bool PaPatchList(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) ++{ ++ SetNextPaState( ++ pa, ++ PaPatchList<TotalControlPoints, CurrentControlPoints + 1>, ++ PaPatchListSingle<TotalControlPoints>); ++ ++ return false; ++} ++ ++template <uint32_t TotalControlPoints> ++static bool PaPatchListTerm(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) ++{ ++ // We have an input of KNOB_SIMD_WIDTH * TotalControlPoints and we output ++ // KNOB_SIMD_WIDTH * 1 patch. This function is called once per attribute. ++ // Each attribute has 4 components. ++ ++ /// @todo Optimize this ++ ++ // Loop over all components of the attribute ++ for (uint32_t i = 0; i < 4; ++i) ++ { ++ for (uint32_t cp = 0; cp < TotalControlPoints; ++cp) ++ { ++ float vec[KNOB_SIMD_WIDTH]; ++ for (uint32_t lane = 0; lane < KNOB_SIMD_WIDTH; ++lane) ++ { ++ uint32_t input_cp = lane * TotalControlPoints + cp; ++ uint32_t input_vec = input_cp / KNOB_SIMD_WIDTH; ++ uint32_t input_lane = input_cp % KNOB_SIMD_WIDTH; ++ ++ const float* pInputVec = (const float*)(&PaGetSimdVector(pa, input_vec, slot)[i]); ++ vec[lane] = pInputVec[input_lane]; ++ } ++ verts[cp][i] = _simd_loadu_ps(vec); ++ } ++ } ++ ++ SetNextPaState( ++ pa, ++ PaPatchList<TotalControlPoints>, ++ PaPatchListSingle<TotalControlPoints>, ++ 0, ++ KNOB_SIMD_WIDTH, ++ true); ++ ++ return true; ++} ++ ++#define PA_PATCH_LIST_TERMINATOR(N) \ ++ template<> bool PaPatchList<N, N>(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])\ ++ { return PaPatchListTerm<N>(pa, slot, verts); } ++PA_PATCH_LIST_TERMINATOR(1) ++PA_PATCH_LIST_TERMINATOR(2) ++PA_PATCH_LIST_TERMINATOR(3) ++PA_PATCH_LIST_TERMINATOR(4) ++PA_PATCH_LIST_TERMINATOR(5) ++PA_PATCH_LIST_TERMINATOR(6) ++PA_PATCH_LIST_TERMINATOR(7) ++PA_PATCH_LIST_TERMINATOR(8) ++PA_PATCH_LIST_TERMINATOR(9) ++PA_PATCH_LIST_TERMINATOR(10) ++PA_PATCH_LIST_TERMINATOR(11) ++PA_PATCH_LIST_TERMINATOR(12) ++PA_PATCH_LIST_TERMINATOR(13) ++PA_PATCH_LIST_TERMINATOR(14) ++PA_PATCH_LIST_TERMINATOR(15) ++PA_PATCH_LIST_TERMINATOR(16) ++PA_PATCH_LIST_TERMINATOR(17) ++PA_PATCH_LIST_TERMINATOR(18) ++PA_PATCH_LIST_TERMINATOR(19) ++PA_PATCH_LIST_TERMINATOR(20) ++PA_PATCH_LIST_TERMINATOR(21) ++PA_PATCH_LIST_TERMINATOR(22) ++PA_PATCH_LIST_TERMINATOR(23) ++PA_PATCH_LIST_TERMINATOR(24) ++PA_PATCH_LIST_TERMINATOR(25) ++PA_PATCH_LIST_TERMINATOR(26) ++PA_PATCH_LIST_TERMINATOR(27) ++PA_PATCH_LIST_TERMINATOR(28) ++PA_PATCH_LIST_TERMINATOR(29) ++PA_PATCH_LIST_TERMINATOR(30) ++PA_PATCH_LIST_TERMINATOR(31) ++PA_PATCH_LIST_TERMINATOR(32) ++#undef PA_PATCH_LIST_TERMINATOR ++ ++bool PaTriList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) ++{ ++ SetNextPaState(pa, PaTriList1, PaTriListSingle0); ++ return false; // Not enough vertices to assemble 4 or 8 triangles. ++} ++ ++bool PaTriList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) ++{ ++ SetNextPaState(pa, PaTriList2, PaTriListSingle0); ++ return false; // Not enough vertices to assemble 8 triangles.
++} ++ ++bool PaTriList2(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) ++{ ++ simdvector& a = PaGetSimdVector(pa, 0, slot); ++ simdvector& b = PaGetSimdVector(pa, 1, slot); ++ simdvector& c = PaGetSimdVector(pa, 2, slot); ++ simdscalar s; ++ ++ // Tri Pattern - provoking vertex is always v0 ++ // v0 -> 0 3 6 9 12 15 18 21 ++ // v1 -> 1 4 7 10 13 16 19 22 ++ // v2 -> 2 5 8 11 14 17 20 23 ++ ++ for(int i = 0; i < 4; ++i) ++ { ++ simdvector& v0 = verts[0]; ++ v0[i] = _simd_blend_ps(a[i], b[i], 0x92); ++ v0[i] = _simd_blend_ps(v0[i], c[i], 0x24); ++ v0[i] = _mm256_permute_ps(v0[i], 0x6C); ++ s = _mm256_permute2f128_ps(v0[i], v0[i], 0x21); ++ v0[i] = _simd_blend_ps(v0[i], s, 0x44); ++ ++ simdvector& v1 = verts[1]; ++ v1[i] = _simd_blend_ps(a[i], b[i], 0x24); ++ v1[i] = _simd_blend_ps(v1[i], c[i], 0x49); ++ v1[i] = _mm256_permute_ps(v1[i], 0xB1); ++ s = _mm256_permute2f128_ps(v1[i], v1[i], 0x21); ++ v1[i] = _simd_blend_ps(v1[i], s, 0x66); ++ ++ simdvector& v2 = verts[2]; ++ v2[i] = _simd_blend_ps(a[i], b[i], 0x49); ++ v2[i] = _simd_blend_ps(v2[i], c[i], 0x92); ++ v2[i] = _mm256_permute_ps(v2[i], 0xC6); ++ s = _mm256_permute2f128_ps(v2[i], v2[i], 0x21); ++ v2[i] = _simd_blend_ps(v2[i], s, 0x22); ++ } ++ ++ SetNextPaState(pa, PaTriList0, PaTriListSingle0, 0, KNOB_SIMD_WIDTH, true); ++ return true; ++} ++ ++void PaTriListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]) ++{ ++ // We have 12 simdscalars contained within 3 simdvectors which ++ // hold at least 8 triangles worth of data. We want to assemble a single ++ // triangle with data in horizontal form. ++ simdvector& a = PaGetSimdVector(pa, 0, slot); ++ simdvector& b = PaGetSimdVector(pa, 1, slot); ++ simdvector& c = PaGetSimdVector(pa, 2, slot); ++ ++ // Convert from vertical to horizontal. ++ // Tri Pattern - provoking vertex is always v0 ++ // v0 -> 0 3 6 9 12 15 18 21 ++ // v1 -> 1 4 7 10 13 16 19 22 ++ // v2 -> 2 5 8 11 14 17 20 23 ++ switch(primIndex) ++ { ++ case 0: ++ verts[0] = swizzleLane0(a); ++ verts[1] = swizzleLane1(a); ++ verts[2] = swizzleLane2(a); ++ break; ++ case 1: ++ verts[0] = swizzleLane3(a); ++ verts[1] = swizzleLane4(a); ++ verts[2] = swizzleLane5(a); ++ break; ++ case 2: ++ verts[0] = swizzleLane6(a); ++ verts[1] = swizzleLane7(a); ++ verts[2] = swizzleLane0(b); ++ break; ++ case 3: ++ verts[0] = swizzleLane1(b); ++ verts[1] = swizzleLane2(b); ++ verts[2] = swizzleLane3(b); ++ break; ++ case 4: ++ verts[0] = swizzleLane4(b); ++ verts[1] = swizzleLane5(b); ++ verts[2] = swizzleLane6(b); ++ break; ++ case 5: ++ verts[0] = swizzleLane7(b); ++ verts[1] = swizzleLane0(c); ++ verts[2] = swizzleLane1(c); ++ break; ++ case 6: ++ verts[0] = swizzleLane2(c); ++ verts[1] = swizzleLane3(c); ++ verts[2] = swizzleLane4(c); ++ break; ++ case 7: ++ verts[0] = swizzleLane5(c); ++ verts[1] = swizzleLane6(c); ++ verts[2] = swizzleLane7(c); ++ break; ++ }; ++} ++ ++bool PaTriStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) ++{ ++ SetNextPaState(pa, PaTriStrip1, PaTriStripSingle0); ++ return false; // Not enough vertices to assemble 8 triangles. 
++} ++ ++bool PaTriStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) ++{ ++ simdvector& a = PaGetSimdVector(pa, pa.prev, slot); ++ simdvector& b = PaGetSimdVector(pa, pa.cur, slot); ++ simdscalar s; ++ ++ for(int i = 0; i < 4; ++i) ++ { ++ simdscalar a0 = a[i]; ++ simdscalar b0 = b[i]; ++ ++ // Tri Pattern - provoking vertex is always v0 ++ // v0 -> 01234567 ++ // v1 -> 13355779 ++ // v2 -> 22446688 ++ simdvector& v0 = verts[0]; ++ v0[i] = a0; ++ ++ // s -> 4567891011 ++ s = _mm256_permute2f128_ps(a0, b0, 0x21); ++ // s -> 23456789 ++ s = _simd_shuffle_ps(a0, s, _MM_SHUFFLE(1, 0, 3, 2)); ++ ++ simdvector& v1 = verts[1]; ++ // v1 -> 13355779 ++ v1[i] = _simd_shuffle_ps(a0, s, _MM_SHUFFLE(3, 1, 3, 1)); ++ ++ simdvector& v2 = verts[2]; ++ // v2 -> 22446688 ++ v2[i] = _simd_shuffle_ps(a0, s, _MM_SHUFFLE(2, 2, 2, 2)); ++ } ++ ++ SetNextPaState(pa, PaTriStrip1, PaTriStripSingle0, 0, KNOB_SIMD_WIDTH); ++ return true; ++} ++ ++void PaTriStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]) ++{ ++ simdvector& a = PaGetSimdVector(pa, pa.prev, slot); ++ simdvector& b = PaGetSimdVector(pa, pa.cur, slot); ++ ++ // Convert from vertical to horizontal. ++ // Tri Pattern - provoking vertex is always v0 ++ // v0 -> 01234567 ++ // v1 -> 13355779 ++ // v2 -> 22446688 ++ switch(primIndex) ++ { ++ case 0: ++ verts[0] = swizzleLane0(a); ++ verts[1] = swizzleLane1(a); ++ verts[2] = swizzleLane2(a); ++ break; ++ case 1: ++ verts[0] = swizzleLane1(a); ++ verts[1] = swizzleLane3(a); ++ verts[2] = swizzleLane2(a); ++ break; ++ case 2: ++ verts[0] = swizzleLane2(a); ++ verts[1] = swizzleLane3(a); ++ verts[2] = swizzleLane4(a); ++ break; ++ case 3: ++ verts[0] = swizzleLane3(a); ++ verts[1] = swizzleLane5(a); ++ verts[2] = swizzleLane4(a); ++ break; ++ case 4: ++ verts[0] = swizzleLane4(a); ++ verts[1] = swizzleLane5(a); ++ verts[2] = swizzleLane6(a); ++ break; ++ case 5: ++ verts[0] = swizzleLane5(a); ++ verts[1] = swizzleLane7(a); ++ verts[2] = swizzleLane6(a); ++ break; ++ case 6: ++ verts[0] = swizzleLane6(a); ++ verts[1] = swizzleLane7(a); ++ verts[2] = swizzleLane0(b); ++ break; ++ case 7: ++ verts[0] = swizzleLane7(a); ++ verts[1] = swizzleLane1(b); ++ verts[2] = swizzleLane0(b); ++ break; ++ }; ++} ++ ++bool PaTriFan0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) ++{ ++ simdvector& a = PaGetSimdVector(pa, pa.cur, slot); ++ ++ // Extract vertex 0 to every lane of first vector ++ for(int i = 0; i < 4; ++i) ++ { ++ __m256 a0 = a[i]; ++ simdvector& v0 = verts[0]; ++ v0[i] = _simd_shuffle_ps(a0, a0, _MM_SHUFFLE(0, 0, 0, 0)); ++ v0[i] = _mm256_permute2f128_ps(v0[i], a0, 0x00); ++ } ++ ++ // store off leading vertex for attributes ++ simdvertex* pVertex = (simdvertex*)pa.pStreamBase; ++ pa.leadingVertex = pVertex[pa.cur]; ++ ++ SetNextPaState(pa, PaTriFan1, PaTriFanSingle0); ++ return false; // Not enough vertices to assemble 8 triangles. ++} ++ ++bool PaTriFan1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) ++{ ++ simdvector& leadVert = pa.leadingVertex.attrib[slot]; ++ simdvector& a = PaGetSimdVector(pa, pa.prev, slot); ++ simdvector& b = PaGetSimdVector(pa, pa.cur, slot); ++ simdscalar s; ++ ++ // need to fill vectors 1/2 with new verts, and v0 with anchor vert. 
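++ // fan pattern: triangle n = { anchor, v(n+1), v(n+2) }, so for 8 tris:
++ // v0 -> A A A A A A A A (anchor vert cached by PaTriFan0)
++ // v1 -> 1 2 3 4 5 6 7 8
++ // v2 -> 2 3 4 5 6 7 8 9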
++ for(int i = 0; i < 4; ++i) ++ { ++ simdscalar a0 = a[i]; ++ simdscalar b0 = b[i]; ++ ++ __m256 comp = leadVert[i]; ++ simdvector& v0 = verts[0]; ++ v0[i] = _simd_shuffle_ps(comp, comp, _MM_SHUFFLE(0, 0, 0, 0)); ++ v0[i] = _mm256_permute2f128_ps(v0[i], comp, 0x00); ++ ++ simdvector& v2 = verts[2]; ++ s = _mm256_permute2f128_ps(a0, b0, 0x21); ++ v2[i] = _simd_shuffle_ps(a0, s, _MM_SHUFFLE(1, 0, 3, 2)); ++ ++ simdvector& v1 = verts[1]; ++ v1[i] = _simd_shuffle_ps(a0, v2[i], _MM_SHUFFLE(2, 1, 2, 1)); ++ } ++ ++ SetNextPaState(pa, PaTriFan1, PaTriFanSingle0, 0, KNOB_SIMD_WIDTH); ++ return true; ++} ++ ++void PaTriFanSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]) ++{ ++ // vert 0 from leading vertex ++ simdvector& lead = pa.leadingVertex.attrib[slot]; ++ verts[0] = swizzleLane0(lead); ++ ++ simdvector& a = PaGetSimdVector(pa, pa.prev, slot); ++ simdvector& b = PaGetSimdVector(pa, pa.cur, slot); ++ ++ // vert 1 ++ if (primIndex < 7) ++ { ++ verts[1] = swizzleLaneN(a, primIndex + 1); ++ } ++ else ++ { ++ verts[1] = swizzleLane0(b); ++ } ++ ++ // vert 2 ++ if (primIndex < 6) ++ { ++ verts[2] = swizzleLaneN(a, primIndex + 2); ++ } ++ else ++ { ++ verts[2] = swizzleLaneN(b, primIndex - 6); ++ } ++} ++ ++bool PaQuadList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) ++{ ++ SetNextPaState(pa, PaQuadList1, PaQuadListSingle0); ++ return false; // Not enough vertices to assemble 8 triangles. ++} ++ ++bool PaQuadList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) ++{ ++ simdvector& a = PaGetSimdVector(pa, 0, slot); ++ simdvector& b = PaGetSimdVector(pa, 1, slot); ++ simdscalar s1, s2; ++ ++ for(int i = 0; i < 4; ++i) ++ { ++ simdscalar a0 = a[i]; ++ simdscalar b0 = b[i]; ++ ++ s1 = _mm256_permute2f128_ps(a0, b0, 0x20); ++ s2 = _mm256_permute2f128_ps(a0, b0, 0x31); ++ ++ simdvector& v0 = verts[0]; ++ v0[i] = _simd_shuffle_ps(s1, s2, _MM_SHUFFLE(0, 0, 0, 0)); ++ ++ simdvector& v1 = verts[1]; ++ v1[i] = _simd_shuffle_ps(s1, s2, _MM_SHUFFLE(2, 1, 2, 1)); ++ ++ simdvector& v2 = verts[2]; ++ v2[i] = _simd_shuffle_ps(s1, s2, _MM_SHUFFLE(3, 2, 3, 2)); ++ } ++ ++ SetNextPaState(pa, PaQuadList0, PaQuadListSingle0, 0, KNOB_SIMD_WIDTH, true); ++ return true; ++} ++ ++void PaQuadListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]) ++{ ++ simdvector& a = PaGetSimdVector(pa, 0, slot); ++ simdvector& b = PaGetSimdVector(pa, 1, slot); ++ ++ switch (primIndex) ++ { ++ case 0: ++ // triangle 0 - 0 1 2 ++ verts[0] = swizzleLane0(a); ++ verts[1] = swizzleLane1(a); ++ verts[2] = swizzleLane2(a); ++ break; ++ ++ case 1: ++ // triangle 1 - 0 2 3 ++ verts[0] = swizzleLane0(a); ++ verts[1] = swizzleLane2(a); ++ verts[2] = swizzleLane3(a); ++ break; ++ ++ case 2: ++ // triangle 2 - 4 5 6 ++ verts[0] = swizzleLane4(a); ++ verts[1] = swizzleLane5(a); ++ verts[2] = swizzleLane6(a); ++ break; ++ ++ case 3: ++ // triangle 3 - 4 6 7 ++ verts[0] = swizzleLane4(a); ++ verts[1] = swizzleLane6(a); ++ verts[2] = swizzleLane7(a); ++ break; ++ ++ case 4: ++ // triangle 4 - 8 9 10 (0 1 2) ++ verts[0] = swizzleLane0(b); ++ verts[1] = swizzleLane1(b); ++ verts[2] = swizzleLane2(b); ++ break; ++ ++ case 5: ++ // triangle 5 - 8 10 11 (0 2 3) ++ verts[0] = swizzleLane0(b); ++ verts[1] = swizzleLane2(b); ++ verts[2] = swizzleLane3(b); ++ break; ++ ++ case 6: ++ // triangle 6 - 12 13 14 (4 5 6) ++ verts[0] = swizzleLane4(b); ++ verts[1] = swizzleLane5(b); ++ verts[2] = swizzleLane6(b); ++ break; ++ ++ case 7: ++ // triangle 7 - 12 14 15 (4 6 7) ++ verts[0] = swizzleLane4(b); ++ verts[1] = swizzleLane6(b); ++
verts[2] = swizzleLane7(b); ++ break; ++ } ++} ++ ++void PaLineLoopSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t lineIndex, __m128 verts[]) ++{ ++ PaLineStripSingle0(pa, slot, lineIndex, verts); ++ ++ if (pa.numPrimsComplete + lineIndex == pa.numPrims - 1) { ++ simdvector &start = PaGetSimdVector(pa, pa.first, slot); ++ verts[1] = swizzleLane0(start); ++ } ++} ++ ++bool PaLineLoop0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) ++{ ++ SetNextPaState(pa, PaLineLoop1, PaLineLoopSingle0); ++ return false; ++} ++ ++bool PaLineLoop1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) ++{ ++ PaLineStrip1(pa, slot, verts); ++ ++ if (pa.numPrimsComplete + KNOB_SIMD_WIDTH > pa.numPrims - 1) { ++ // loop reconnect now ++ int lane = pa.numPrims - pa.numPrimsComplete - 1; ++ simdvector &start = PaGetSimdVector(pa, pa.first, slot); ++ for (int i = 0; i < 4; i++) { ++ float *startVtx = (float *)&(start[i]); ++ float *targetVtx = (float *)&(verts[1][i]); ++ targetVtx[lane] = startVtx[0]; ++ } ++ } ++ ++ SetNextPaState(pa, PaLineLoop1, PaLineLoopSingle0, 0, KNOB_SIMD_WIDTH); ++ return true; ++} ++ ++ ++bool PaLineList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) ++{ ++ SetNextPaState(pa, PaLineList1, PaLineListSingle0); ++ return false; // Not enough vertices to assemble 8 lines ++} ++ ++bool PaLineList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) ++{ ++ simdvector& a = PaGetSimdVector(pa, 0, slot); ++ simdvector& b = PaGetSimdVector(pa, 1, slot); ++ /// @todo: verify provoking vertex is correct ++ // Line list 0 1 2 3 4 5 6 7 ++ // 8 9 10 11 12 13 14 15 ++ ++ // shuffle: ++ // 0 2 4 6 8 10 12 14 ++ // 1 3 5 7 9 11 13 15 ++ ++ for (uint32_t i = 0; i < 4; ++i) ++ { ++ // 0 1 2 3 8 9 10 11 ++ __m256 vALowBLow = _mm256_permute2f128_ps(a.v[i], b.v[i], 0x20); ++ // 4 5 6 7 12 13 14 15 ++ __m256 vAHighBHigh = _mm256_permute2f128_ps(a.v[i], b.v[i], 0x31); ++ ++ // 0 2 4 6 8 10 12 14 ++ verts[0].v[i] = _mm256_shuffle_ps(vALowBLow, vAHighBHigh, _MM_SHUFFLE(2, 0, 2, 0)); ++ // 1 3 5 7 9 11 13 15 ++ verts[1].v[i] = _mm256_shuffle_ps(vALowBLow, vAHighBHigh, _MM_SHUFFLE(3, 1, 3, 1)); ++ } ++ ++ SetNextPaState(pa, PaLineList0, PaLineListSingle0, 0, KNOB_SIMD_WIDTH, true); ++ return true; ++} ++ ++void PaLineListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]) ++{ ++ simdvector &a = PaGetSimdVector(pa, pa.prev, slot); ++ simdvector &b = PaGetSimdVector(pa, pa.cur, slot); ++ ++ switch (primIndex) ++ { ++ case 0: ++ verts[0] = swizzleLane0(a); ++ verts[1] = swizzleLane1(a); ++ break; ++ case 1: ++ verts[0] = swizzleLane2(a); ++ verts[1] = swizzleLane3(a); ++ break; ++ case 2: ++ verts[0] = swizzleLane4(a); ++ verts[1] = swizzleLane5(a); ++ break; ++ case 3: ++ verts[0] = swizzleLane6(a); ++ verts[1] = swizzleLane7(a); ++ break; ++ case 4: ++ verts[0] = swizzleLane0(b); ++ verts[1] = swizzleLane1(b); ++ break; ++ case 5: ++ verts[0] = swizzleLane2(b); ++ verts[1] = swizzleLane3(b); ++ break; ++ case 6: ++ verts[0] = swizzleLane4(b); ++ verts[1] = swizzleLane5(b); ++ break; ++ case 7: ++ verts[0] = swizzleLane6(b); ++ verts[1] = swizzleLane7(b); ++ break; ++ } ++} ++ ++bool PaLineStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) ++{ ++ SetNextPaState(pa, PaLineStrip1, PaLineStripSingle0); ++ return false; // Not enough vertices to assemble 8 lines ++} ++ ++bool PaLineStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) ++{ ++ simdvector& a = PaGetSimdVector(pa, pa.prev, slot); ++ simdvector& b = PaGetSimdVector(pa, pa.cur, slot); ++ ++ /// @todo: 
verify provoking vertex is correct ++ // Line list 0 1 2 3 4 5 6 7 ++ // 8 9 10 11 12 13 14 15 ++ ++ // shuffle: ++ // 0 1 2 3 4 5 6 7 ++ // 1 2 3 4 5 6 7 8 ++ ++ verts[0] = a; ++ ++ for(uint32_t i = 0; i < 4; ++i) ++ { ++ // 1 2 3 x 5 6 7 x ++ __m256 vPermA = _mm256_permute_ps(a.v[i], 0x39); // indices hi->low 00 11 10 01 (0 3 2 1) ++ // 4 5 6 7 8 9 10 11 ++ __m256 vAHighBLow = _mm256_permute2f128_ps(a.v[i], b.v[i], 0x21); ++ ++ // x x x 4 x x x 8 ++ __m256 vPermB = _mm256_permute_ps(vAHighBLow, 0); // indices hi->low (0 0 0 0) ++ ++ verts[1].v[i] = _mm256_blend_ps(vPermA, vPermB, 0x88); ++ } ++ ++ SetNextPaState(pa, PaLineStrip1, PaLineStripSingle0, 0, KNOB_SIMD_WIDTH); ++ return true; ++} ++ ++void PaLineStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t lineIndex, __m128 verts[]) ++{ ++ simdvector& a = PaGetSimdVector(pa, pa.prev, slot); ++ simdvector& b = PaGetSimdVector(pa, pa.cur, slot); ++ ++ switch (lineIndex) ++ { ++ case 0: ++ verts[0] = swizzleLane0(a); ++ verts[1] = swizzleLane1(a); ++ break; ++ case 1: ++ verts[0] = swizzleLane1(a); ++ verts[1] = swizzleLane2(a); ++ break; ++ case 2: ++ verts[0] = swizzleLane2(a); ++ verts[1] = swizzleLane3(a); ++ break; ++ case 3: ++ verts[0] = swizzleLane3(a); ++ verts[1] = swizzleLane4(a); ++ break; ++ case 4: ++ verts[0] = swizzleLane4(a); ++ verts[1] = swizzleLane5(a); ++ break; ++ case 5: ++ verts[0] = swizzleLane5(a); ++ verts[1] = swizzleLane6(a); ++ break; ++ case 6: ++ verts[0] = swizzleLane6(a); ++ verts[1] = swizzleLane7(a); ++ break; ++ case 7: ++ verts[0] = swizzleLane7(a); ++ verts[1] = swizzleLane0(b); ++ break; ++ } ++} ++ ++bool PaPoints0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) ++{ ++ simdvector& a = PaGetSimdVector(pa, pa.cur, slot); ++ ++ verts[0] = a; // points only have 1 vertex. 
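++ // no lane swizzle is needed here: with one vertex per primitive the
++ // vertical SIMD layout already holds KNOB_SIMD_WIDTH complete points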
++ ++ SetNextPaState(pa, PaPoints0, PaPointsSingle0, 0, KNOB_SIMD_WIDTH, true); ++ return true; ++} ++ ++void PaPointsSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]) ++{ ++ simdvector &a = PaGetSimdVector(pa, pa.cur, slot); ++ switch(primIndex) ++ { ++ case 0: ++ verts[0] = swizzleLane0(a); ++ break; ++ case 1: ++ verts[0] = swizzleLane1(a); ++ break; ++ case 2: ++ verts[0] = swizzleLane2(a); ++ break; ++ case 3: ++ verts[0] = swizzleLane3(a); ++ break; ++ case 4: ++ verts[0] = swizzleLane4(a); ++ break; ++ case 5: ++ verts[0] = swizzleLane5(a); ++ break; ++ case 6: ++ verts[0] = swizzleLane6(a); ++ break; ++ case 7: ++ verts[0] = swizzleLane7(a); ++ break; ++ } ++} ++ ++// each point generates two tris ++// primitive assembly broadcasts each point to the 3 vertices of the 2 tris ++// binner will bloat each point ++// ++// input simd : p0 p1 p2 p3 p4 p5 p6 p7 == 8 points, 16 tris ++// output phase 0: ++// verts[0] : p0 p0 p1 p1 p2 p2 p3 p3 ++// verts[1] : p0 p0 p1 p1 p2 p2 p3 p3 ++// verts[2] : p0 p0 p1 p1 p2 p2 p3 p3 ++// ++// output phase 1: ++// verts[0] : p4 p4 p5 p5 p6 p6 p7 p7 ++// verts[1] : p4 p4 p5 p5 p6 p6 p7 p7 ++// verts[2] : p4 p4 p5 p5 p6 p6 p7 p7 ++ ++ ++// 0 1 2 3 4 5 6 7 ++ ++bool PaTriPoints0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) ++{ ++ simdvector& a = PaGetSimdVector(pa, pa.cur, slot); ++ ++ for (uint32_t i = 0; i < 4; ++i) ++ { ++ __m256 vLow128 = _mm256_unpacklo_ps(a.v[i], a.v[i]); // 0 0 1 1 4 4 5 5 ++ __m256 vHigh128 = _mm256_unpackhi_ps(a.v[i], a.v[i]); // 2 2 3 3 6 6 7 7 ++ __m256 vCombined = _mm256_permute2f128_ps(vLow128, vHigh128, 0x20); // 0 0 1 1 2 2 3 3 ++ ++ verts[0].v[i] = verts[1].v[i] = verts[2].v[i] = vCombined; ++ } ++ ++ SetNextPaState(pa, PaTriPoints1, PaTriPointsSingle0, 1, KNOB_SIMD_WIDTH); ++ return true; ++} ++ ++bool PaTriPoints1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) ++{ ++ simdvector& a = PaGetSimdVector(pa, pa.cur, slot); ++ ++ for (uint32_t i = 0; i < 4; ++i) ++ { ++ __m256 vLow128 = _mm256_unpacklo_ps(a.v[i], a.v[i]); // 0 0 1 1 4 4 5 5 ++ __m256 vHigh128 = _mm256_unpackhi_ps(a.v[i], a.v[i]); // 2 2 3 3 6 6 7 7 ++ __m256 vCombined = _mm256_permute2f128_ps(vLow128, vHigh128, 0x31); // 4 4 5 5 6 6 7 7 ++ ++ verts[0].v[i] = verts[1].v[i] = verts[2].v[i] = vCombined; ++ } ++ ++ SetNextPaState(pa, PaTriPoints0, PaTriPointsSingle1, 0, KNOB_SIMD_WIDTH); ++ return true; ++ ++} ++ ++static void PaTriPointsSprite(PA_STATE_OPT& pa, uint32_t primIndex, __m128 verts[]) ++{ ++ const API_STATE& state = GetApiState(pa.pDC); ++ ++ if (!state.rastState.pointSpriteTopOrigin) { ++ if (primIndex & 1) { ++ verts[0] = _mm_set_ps(1, 0, 1, 0); ++ verts[1] = _mm_set_ps(1, 0, 0, 1); ++ verts[2] = _mm_set_ps(1, 0, 1, 1); ++ } else { ++ verts[0] = _mm_set_ps(1, 0, 1, 0); ++ verts[1] = _mm_set_ps(1, 0, 0, 0); ++ verts[2] = _mm_set_ps(1, 0, 0, 1); ++ } ++ } else { ++ if (primIndex & 1) { ++ verts[0] = _mm_set_ps(1, 0, 0, 0); ++ verts[1] = _mm_set_ps(1, 0, 1, 1); ++ verts[2] = _mm_set_ps(1, 0, 0, 1); ++ } else { ++ verts[0] = _mm_set_ps(1, 0, 0, 0); ++ verts[1] = _mm_set_ps(1, 0, 1, 0); ++ verts[2] = _mm_set_ps(1, 0, 1, 1); ++ } ++ } ++} ++ ++void PaTriPointsSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]) ++{ ++ const API_STATE& state = GetApiState(pa.pDC); ++ ++ if (state.rastState.pointSpriteEnable && state.rastState.pointSpriteFESlot == slot) { ++ return PaTriPointsSprite(pa, primIndex, verts); ++ } ++ ++ simdvector& a = PaGetSimdVector(pa, pa.cur, slot); ++ ++ switch(primIndex) ++ { 
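++ // phase 0 covers input points 0-3: each point is broadcast to both tris
++ // of its pair, so primIndex pairs (0,1)..(6,7) map to lanes 0..3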
++ case 0: ++ case 1: ++ verts[0] = verts[1] = verts[2] = swizzleLane0(a); break; ++ case 2: ++ case 3: ++ verts[0] = verts[1] = verts[2] = swizzleLane1(a); break; ++ case 4: ++ case 5: ++ verts[0] = verts[1] = verts[2] = swizzleLane2(a); break; ++ case 6: ++ case 7: ++ verts[0] = verts[1] = verts[2] = swizzleLane3(a); break; ++ } ++} ++ ++void PaTriPointsSingle1(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]) ++{ ++ const API_STATE& state = GetApiState(pa.pDC); ++ ++ if (state.rastState.pointSpriteEnable && state.rastState.pointSpriteFESlot == slot) { ++ return PaTriPointsSprite(pa, primIndex, verts); ++ } ++ ++ simdvector& a = PaGetSimdVector(pa, pa.cur, slot); ++ ++ // phase 1 covers input points 4-7, mapped to lanes 4..7 ++ switch(primIndex) ++ { ++ case 0: ++ case 1: ++ verts[0] = verts[1] = verts[2] = swizzleLane4(a); break; ++ case 2: ++ case 3: ++ verts[0] = verts[1] = verts[2] = swizzleLane5(a); break; ++ case 4: ++ case 5: ++ verts[0] = verts[1] = verts[2] = swizzleLane6(a); break; ++ case 6: ++ case 7: ++ verts[0] = verts[1] = verts[2] = swizzleLane7(a); break; ++ } ++} ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief State 0 for RECT_LIST topology. ++/// Not enough vertices yet to assemble 8 triangles. ++bool PaRectList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) ++{ ++ SetNextPaState(pa, PaRectList1, PaRectListSingle0); ++ return false; ++} ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief State 1 for RECT_LIST topology. ++/// Rect lists have the following format. ++/// w x y z ++/// v2 o---o v5 o---o v8 o---o v11 o---o ++/// | \ | | \ | | \ | | \ | ++/// v1 o---o v4 o---o v7 o---o v10 o---o ++/// v0 v3 v6 v9 ++/// ++/// Only 3 vertices of the rectangle are supplied. The 4th vertex is implied. ++/// ++/// tri0 = { v0, v1, v2 } tri1 = { v0, v2, w } <-- w = v0 - v1 + v2 ++/// tri2 = { v3, v4, v5 } tri3 = { v3, v5, x } <-- x = v3 - v4 + v5 ++/// etc. ++/// ++/// PA outputs 3 simdvectors for each of the triangle vertices v0, v1, v2 ++/// where v0 contains all the first vertices for 8 triangles. ++/// ++/// Result: ++/// verts[0] = { v0, v0, v3, v3, v6, v6, v9, v9 } ++/// verts[1] = { v1, v2, v4, v5, v7, v8, v10, v11 } ++/// verts[2] = { v2, w, v5, x, v8, y, v11, z } ++/// ++/// @param pa - State for PA state machine. ++/// @param slot - Index into VS output which is either a position (slot 0) or attribute. ++/// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1, etc. ++bool PaRectList1( ++ PA_STATE_OPT& pa, ++ uint32_t slot, ++ simdvector verts[]) ++{ ++ // SIMD vectors a and b are the last two vertical outputs from the vertex shader. ++ simdvector& a = PaGetSimdVector(pa, 0, slot); // a[] = { v0, v1, v2, v3, v4, v5, v6, v7 } ++ simdvector& b = PaGetSimdVector(pa, 1, slot); // b[] = { v8, v9, v10, v11, v12, v13, v14, v15 } ++ ++ __m256 tmp0, tmp1, tmp2; ++ ++ // Loop over each component in the simdvector. ++ for(int i = 0; i < 4; ++i) ++ { ++ simdvector& v0 = verts[0]; // verts[0] needs to be { v0, v0, v3, v3, v6, v6, v9, v9 } ++ tmp0 = _mm256_permute2f128_ps(b[i], b[i], 0x01); // tmp0 = { v12, v13, v14, v15, v8, v9, v10, v11 } ++ v0[i] = _mm256_blend_ps(a[i], tmp0, 0x20); // v0 = { v0, *, *, v3, *, v9, v6, * } where * is don't care.
tmp1 = _mm256_permute_ps(v0[i], 0xF0); // tmp1 = { v0, v0, v3, v3, *, *, *, * } ++ v0[i] = _mm256_permute_ps(v0[i], 0x5A); // v0 = { *, *, *, *, v6, v6, v9, v9 } ++ v0[i] = _mm256_blend_ps(tmp1, v0[i], 0xF0); // v0 = { v0, v0, v3, v3, v6, v6, v9, v9 } ++ ++ /// NOTE This is a bit expensive due to conflicts between vertices in 'a' and 'b'. ++ /// AVX2 should make this much cheaper. ++ simdvector& v1 = verts[1]; // verts[1] needs to be { v1, v2, v4, v5, v7, v8, v10, v11 } ++ v1[i] = _mm256_permute_ps(a[i], 0x09); // v1 = { v1, v2, *, *, *, *, *, * } ++ tmp1 = _mm256_permute_ps(a[i], 0x43); // tmp1 = { *, *, *, *, v7, *, v4, v5 } ++ tmp2 = _mm256_blend_ps(v1[i], tmp1, 0xF0); // tmp2 = { v1, v2, *, *, v7, *, v4, v5 } ++ tmp1 = _mm256_permute2f128_ps(tmp2, tmp2, 0x1); // tmp1 = { v7, *, v4, v5, *, *, *, * } ++ v1[i] = _mm256_permute_ps(tmp0, 0xE0); // v1 = { *, *, *, *, *, v8, v10, v11 } ++ v1[i] = _mm256_blend_ps(tmp2, v1[i], 0xE0); // v1 = { v1, v2, *, *, v7, v8, v10, v11 } ++ v1[i] = _mm256_blend_ps(v1[i], tmp1, 0x0C); // v1 = { v1, v2, v4, v5, v7, v8, v10, v11 } ++ ++ // verts[2] = { v2, w, v5, x, v8, y, v11, z } ++ simdvector& v2 = verts[2]; // verts[2] needs to be { v2, w, v5, x, v8, y, v11, z } ++ v2[i] = _mm256_permute_ps(tmp0, 0x30); // v2 = { *, *, *, *, v8, *, v11, * } ++ tmp1 = _mm256_permute_ps(tmp2, 0x31); // tmp1 = { v2, *, v5, *, *, *, *, * } ++ v2[i] = _mm256_blend_ps(tmp1, v2[i], 0xF0); ++ ++ // Need to compute 4th implied vertex for the rectangle. ++ tmp2 = _mm256_sub_ps(v0[i], v1[i]); ++ tmp2 = _mm256_add_ps(tmp2, v2[i]); // tmp2 = { w, *, x, *, y, *, z, * } ++ tmp2 = _mm256_permute_ps(tmp2, 0xA0); // tmp2 = { *, w, *, x, *, y, *, z } ++ v2[i] = _mm256_blend_ps(v2[i], tmp2, 0xAA); // v2 = { v2, w, v5, x, v8, y, v11, z } ++ } ++ ++ SetNextPaState(pa, PaRectList1, PaRectListSingle0, 0, KNOB_SIMD_WIDTH, true); ++ return true; ++} ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief State 2 for RECT_LIST topology. ++/// Not implemented unless there is a use case for more than 8 rects. ++/// @param pa - State for PA state machine. ++/// @param slot - Index into VS output which is either a position (slot 0) or attribute. ++/// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1, etc. ++bool PaRectList2( ++ PA_STATE_OPT& pa, ++ uint32_t slot, ++ simdvector verts[]) ++{ ++ SWR_ASSERT(0); // Is rect list used for anything other than clears? ++ SetNextPaState(pa, PaRectList0, PaRectListSingle0, 0, KNOB_SIMD_WIDTH, true); ++ return true; ++} ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief This procedure is called by the Binner to assemble the attributes. ++/// Unlike position, which is stored vertically, the attributes are ++/// stored horizontally. The outputs from the VS, labeled as 'a' and ++/// 'b', are vertical. This function needs to transpose the lanes ++/// containing the vertical attribute data into horizontal form. ++/// @param pa - State for PA state machine. ++/// @param slot - Index into VS output for a given attribute. ++/// @param primIndex - Binner processes each triangle individually. ++/// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1, etc. ++void PaRectListSingle0( ++ PA_STATE_OPT& pa, ++ uint32_t slot, ++ uint32_t primIndex, ++ __m128 verts[]) ++{ ++ // We have 12 simdscalars contained within 3 simdvectors which ++ // hold at least 8 triangles worth of data.
We want to assemble a single ++ // triangle with data in horizontal form. ++ simdvector& a = PaGetSimdVector(pa, 0, slot); ++ ++ // Convert from vertical to horizontal. ++ switch(primIndex) ++ { ++ case 0: ++ verts[0] = swizzleLane0(a); ++ verts[1] = swizzleLane1(a); ++ verts[2] = swizzleLane2(a); ++ break; ++ case 1: ++ verts[0] = swizzleLane0(a); ++ verts[1] = swizzleLane2(a); ++ verts[2] = _mm_blend_ps(verts[0], verts[1], 0x2); ++ break; ++ case 2: ++ case 3: ++ case 4: ++ case 5: ++ case 6: ++ case 7: ++ SWR_ASSERT(0); ++ break; ++ }; ++} ++ ++PA_STATE_OPT::PA_STATE_OPT(DRAW_CONTEXT *in_pDC, uint32_t in_numPrims, uint8_t* pStream, uint32_t in_streamSizeInVerts, ++ bool in_isStreaming, PRIMITIVE_TOPOLOGY topo) : PA_STATE(in_pDC, pStream, in_streamSizeInVerts), numPrims(in_numPrims), numPrimsComplete(0), numSimdPrims(0), ++ cur(0), prev(0), first(0), counter(0), reset(false), pfnPaFunc(nullptr), isStreaming(in_isStreaming) ++{ ++ const API_STATE& state = GetApiState(pDC); ++ ++ this->binTopology = topo == TOP_UNKNOWN ? state.topology : topo; ++ ++ switch (this->binTopology) ++ { ++ case TOP_TRIANGLE_LIST: ++ this->pfnPaFunc = PaTriList0; ++ break; ++ case TOP_TRIANGLE_STRIP: ++ this->pfnPaFunc = PaTriStrip0; ++ break; ++ case TOP_TRIANGLE_FAN: ++ this->pfnPaFunc = PaTriFan0; ++ break; ++ case TOP_QUAD_LIST: ++ this->pfnPaFunc = PaQuadList0; ++ this->numPrims = in_numPrims * 2; // Convert quad primitives into triangles ++ break; ++ case TOP_QUAD_STRIP: ++ // quad strip pattern when decomposed into triangles is the same as tri strips ++ this->pfnPaFunc = PaTriStrip0; ++ this->numPrims = in_numPrims * 2; // Convert quad primitives into triangles ++ break; ++ case TOP_LINE_LIST: ++ this->pfnPaFunc = PaLineList0; ++ this->numPrims = in_numPrims; ++ break; ++ case TOP_LINE_STRIP: ++ this->pfnPaFunc = PaLineStrip0; ++ this->numPrims = in_numPrims; ++ break; ++ case TOP_LINE_LOOP: ++ this->pfnPaFunc = PaLineLoop0; ++ this->numPrims = in_numPrims; ++ break; ++ case TOP_POINT_LIST: ++ // use point binner and rasterizer if supported ++ if (CanUseSimplePoints(pDC)) ++ { ++ this->pfnPaFunc = PaPoints0; ++ this->numPrims = in_numPrims; ++ } ++ else ++ { ++ this->pfnPaFunc = PaTriPoints0; ++ this->numPrims = in_numPrims * 2; // 1 point generates 2 tris ++ } ++ break; ++ case TOP_RECT_LIST: ++ this->pfnPaFunc = PaRectList0; ++ this->numPrims = in_numPrims * 2; ++ break; ++ ++ case TOP_PATCHLIST_1: ++ this->pfnPaFunc = PaPatchList<1>; ++ break; ++ case TOP_PATCHLIST_2: ++ this->pfnPaFunc = PaPatchList<2>; ++ break; ++ case TOP_PATCHLIST_3: ++ this->pfnPaFunc = PaPatchList<3>; ++ break; ++ case TOP_PATCHLIST_4: ++ this->pfnPaFunc = PaPatchList<4>; ++ break; ++ case TOP_PATCHLIST_5: ++ this->pfnPaFunc = PaPatchList<5>; ++ break; ++ case TOP_PATCHLIST_6: ++ this->pfnPaFunc = PaPatchList<6>; ++ break; ++ case TOP_PATCHLIST_7: ++ this->pfnPaFunc = PaPatchList<7>; ++ break; ++ case TOP_PATCHLIST_8: ++ this->pfnPaFunc = PaPatchList<8>; ++ break; ++ case TOP_PATCHLIST_9: ++ this->pfnPaFunc = PaPatchList<9>; ++ break; ++ case TOP_PATCHLIST_10: ++ this->pfnPaFunc = PaPatchList<10>; ++ break; ++ case TOP_PATCHLIST_11: ++ this->pfnPaFunc = PaPatchList<11>; ++ break; ++ case TOP_PATCHLIST_12: ++ this->pfnPaFunc = PaPatchList<12>; ++ break; ++ case TOP_PATCHLIST_13: ++ this->pfnPaFunc = PaPatchList<13>; ++ break; ++ case TOP_PATCHLIST_14: ++ this->pfnPaFunc = PaPatchList<14>; ++ break; ++ case TOP_PATCHLIST_15: ++ this->pfnPaFunc = PaPatchList<15>; ++ break; ++ case TOP_PATCHLIST_16: ++ this->pfnPaFunc =
PaPatchList<16>; ++ break; ++ case TOP_PATCHLIST_17: ++ this->pfnPaFunc = PaPatchList<17>; ++ break; ++ case TOP_PATCHLIST_18: ++ this->pfnPaFunc = PaPatchList<18>; ++ break; ++ case TOP_PATCHLIST_19: ++ this->pfnPaFunc = PaPatchList<19>; ++ break; ++ case TOP_PATCHLIST_20: ++ this->pfnPaFunc = PaPatchList<20>; ++ break; ++ case TOP_PATCHLIST_21: ++ this->pfnPaFunc = PaPatchList<21>; ++ break; ++ case TOP_PATCHLIST_22: ++ this->pfnPaFunc = PaPatchList<22>; ++ break; ++ case TOP_PATCHLIST_23: ++ this->pfnPaFunc = PaPatchList<23>; ++ break; ++ case TOP_PATCHLIST_24: ++ this->pfnPaFunc = PaPatchList<24>; ++ break; ++ case TOP_PATCHLIST_25: ++ this->pfnPaFunc = PaPatchList<25>; ++ break; ++ case TOP_PATCHLIST_26: ++ this->pfnPaFunc = PaPatchList<26>; ++ break; ++ case TOP_PATCHLIST_27: ++ this->pfnPaFunc = PaPatchList<27>; ++ break; ++ case TOP_PATCHLIST_28: ++ this->pfnPaFunc = PaPatchList<28>; ++ break; ++ case TOP_PATCHLIST_29: ++ this->pfnPaFunc = PaPatchList<29>; ++ break; ++ case TOP_PATCHLIST_30: ++ this->pfnPaFunc = PaPatchList<30>; ++ break; ++ case TOP_PATCHLIST_31: ++ this->pfnPaFunc = PaPatchList<31>; ++ break; ++ case TOP_PATCHLIST_32: ++ this->pfnPaFunc = PaPatchList<32>; ++ break; ++ ++ default: ++ SWR_ASSERT(0); ++ break; ++ }; ++ ++ // simdscalari id8 = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); ++ // simdscalari id4 = _mm256_set_epi32(0, 0, 1, 1, 2, 2, 3, 3); ++ simdscalari id8 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); ++ simdscalari id4 = _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0); ++ ++ switch(this->binTopology) ++ { ++ case TOP_TRIANGLE_LIST: ++ case TOP_TRIANGLE_STRIP: ++ case TOP_TRIANGLE_FAN: ++ case TOP_LINE_STRIP: ++ case TOP_LINE_LIST: ++ case TOP_LINE_LOOP: ++ this->primIDIncr = 8; ++ this->primID = id8; ++ break; ++ case TOP_QUAD_LIST: ++ case TOP_QUAD_STRIP: ++ case TOP_RECT_LIST: ++ this->primIDIncr = 4; ++ this->primID = id4; ++ break; ++ case TOP_POINT_LIST: ++ if (CanUseSimplePoints(pDC)) ++ { ++ this->primIDIncr = 8; ++ this->primID = id8; ++ } ++ else ++ { ++ this->primIDIncr = 4; ++ this->primID = id4; ++ } ++ break; ++ case TOP_PATCHLIST_1: ++ case TOP_PATCHLIST_2: ++ case TOP_PATCHLIST_3: ++ case TOP_PATCHLIST_4: ++ case TOP_PATCHLIST_5: ++ case TOP_PATCHLIST_6: ++ case TOP_PATCHLIST_7: ++ case TOP_PATCHLIST_8: ++ case TOP_PATCHLIST_9: ++ case TOP_PATCHLIST_10: ++ case TOP_PATCHLIST_11: ++ case TOP_PATCHLIST_12: ++ case TOP_PATCHLIST_13: ++ case TOP_PATCHLIST_14: ++ case TOP_PATCHLIST_15: ++ case TOP_PATCHLIST_16: ++ case TOP_PATCHLIST_17: ++ case TOP_PATCHLIST_18: ++ case TOP_PATCHLIST_19: ++ case TOP_PATCHLIST_20: ++ case TOP_PATCHLIST_21: ++ case TOP_PATCHLIST_22: ++ case TOP_PATCHLIST_23: ++ case TOP_PATCHLIST_24: ++ case TOP_PATCHLIST_25: ++ case TOP_PATCHLIST_26: ++ case TOP_PATCHLIST_27: ++ case TOP_PATCHLIST_28: ++ case TOP_PATCHLIST_29: ++ case TOP_PATCHLIST_30: ++ case TOP_PATCHLIST_31: ++ case TOP_PATCHLIST_32: ++ // Always run KNOB_SIMD_WIDTH number of patches at a time. ++ this->primIDIncr = 8; ++ this->primID = id8; ++ break; ++ ++ default: ++ SWR_ASSERT(0); ++ break; ++ }; ++ ++} ++#endif +diff --git a/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp b/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp +new file mode 100644 +index 0000000..71de298 +--- /dev/null ++++ b/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp +@@ -0,0 +1,1217 @@ ++/**************************************************************************** ++* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
++*
++* Permission is hereby granted, free of charge, to any person obtaining a
++* copy of this software and associated documentation files (the "Software"),
++* to deal in the Software without restriction, including without limitation
++* the rights to use, copy, modify, merge, publish, distribute, sublicense,
++* and/or sell copies of the Software, and to permit persons to whom the
++* Software is furnished to do so, subject to the following conditions:
++*
++* The above copyright notice and this permission notice (including the next
++* paragraph) shall be included in all copies or substantial portions of the
++* Software.
++*
++* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
++* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++* IN THE SOFTWARE.
++*
++* @file rasterizer.cpp
++*
++* @brief Implementation for the rasterizer.
++*
++******************************************************************************/
++
++#include <vector>
++#include <algorithm>
++
++#include "rasterizer.h"
++#include "multisample.h"
++#include "rdtsc_core.h"
++#include "backend.h"
++#include "utils.h"
++#include "frontend.h"
++#include "tilemgr.h"
++#include "memory/tilingtraits.h"
++
++void GetRenderHotTiles(DRAW_CONTEXT *pDC, uint32_t macroID, uint32_t x, uint32_t y, RenderOutputBuffers &renderBuffers,
++                       uint32_t numSamples, uint32_t renderTargetArrayIndex);
++void StepRasterTileX(uint32_t MaxRT, RenderOutputBuffers &buffers, uint32_t colorTileStep, uint32_t depthTileStep, uint32_t stencilTileStep);
++void StepRasterTileY(uint32_t MaxRT, RenderOutputBuffers &buffers, RenderOutputBuffers &startBufferRow,
++                     uint32_t colorRowStep, uint32_t depthRowStep, uint32_t stencilRowStep);
++
++#define MASKTOVEC(i3,i2,i1,i0) {-i0,-i1,-i2,-i3}
++const __m128 gMaskToVec[] = {
++    MASKTOVEC(0,0,0,0),
++    MASKTOVEC(0,0,0,1),
++    MASKTOVEC(0,0,1,0),
++    MASKTOVEC(0,0,1,1),
++    MASKTOVEC(0,1,0,0),
++    MASKTOVEC(0,1,0,1),
++    MASKTOVEC(0,1,1,0),
++    MASKTOVEC(0,1,1,1),
++    MASKTOVEC(1,0,0,0),
++    MASKTOVEC(1,0,0,1),
++    MASKTOVEC(1,0,1,0),
++    MASKTOVEC(1,0,1,1),
++    MASKTOVEC(1,1,0,0),
++    MASKTOVEC(1,1,0,1),
++    MASKTOVEC(1,1,1,0),
++    MASKTOVEC(1,1,1,1),
++};
++
++const __m256d gMaskToVecpd[] =
++{
++    MASKTOVEC(0, 0, 0, 0),
++    MASKTOVEC(0, 0, 0, 1),
++    MASKTOVEC(0, 0, 1, 0),
++    MASKTOVEC(0, 0, 1, 1),
++    MASKTOVEC(0, 1, 0, 0),
++    MASKTOVEC(0, 1, 0, 1),
++    MASKTOVEC(0, 1, 1, 0),
++    MASKTOVEC(0, 1, 1, 1),
++    MASKTOVEC(1, 0, 0, 0),
++    MASKTOVEC(1, 0, 0, 1),
++    MASKTOVEC(1, 0, 1, 0),
++    MASKTOVEC(1, 0, 1, 1),
++    MASKTOVEC(1, 1, 0, 0),
++    MASKTOVEC(1, 1, 0, 1),
++    MASKTOVEC(1, 1, 1, 0),
++    MASKTOVEC(1, 1, 1, 1),
++};
++
++//////////////////////////////////////////////////////////////////////////
++/// @brief rasterize a raster tile partially covered by the triangle
++/// @param vEdge0-2 - edge equations evaluated at sample pos at each of the 4 corners of a raster tile
++/// @param vA, vB - A & B coefs for each edge of the triangle (Ax + By + C)
++/// @param vStepQuad0-2 - edge equations evaluated at the UL corners of the 2x2 pixel quad.
++///        Used to step between quads when sweeping over the raster tile.
++INLINE uint64_t rasterizePartialTile(DRAW_CONTEXT *pDC, __m256d vEdge0, __m256d vEdge1, __m256d vEdge2, ++ __m128i &vA, __m128i &vB, __m256d &vStepQuad0, __m256d &vStepQuad1, __m256d &vStepQuad2) ++{ ++ uint64_t coverageMask = 0; ++ ++ // Step to the pixel sample locations of the 1st quad ++ double edge0; ++ double edge1; ++ double edge2; ++ _mm_store_sd(&edge0, _mm256_castpd256_pd128(vEdge0)); ++ _mm_store_sd(&edge1, _mm256_castpd256_pd128(vEdge1)); ++ _mm_store_sd(&edge2, _mm256_castpd256_pd128(vEdge2)); ++ ++ vEdge0 = _mm256_broadcast_sd(&edge0); ++ vEdge1 = _mm256_broadcast_sd(&edge1); ++ vEdge2 = _mm256_broadcast_sd(&edge2); ++ ++ vEdge0 = _mm256_add_pd(vEdge0, vStepQuad0); ++ vEdge1 = _mm256_add_pd(vEdge1, vStepQuad1); ++ vEdge2 = _mm256_add_pd(vEdge2, vStepQuad2); ++ ++ // compute step to next quad (mul by 2 in x and y direction) ++ __m256d vAEdge0 = _mm256_cvtepi32_pd(_mm_shuffle_epi32(vA, _MM_SHUFFLE(0, 0, 0, 0))); ++ __m256d vAEdge1 = _mm256_cvtepi32_pd(_mm_shuffle_epi32(vA, _MM_SHUFFLE(1, 1, 1, 1))); ++ __m256d vAEdge2 = _mm256_cvtepi32_pd(_mm_shuffle_epi32(vA, _MM_SHUFFLE(2, 2, 2, 2))); ++ __m256d vBEdge0 = _mm256_cvtepi32_pd(_mm_shuffle_epi32(vB, _MM_SHUFFLE(0, 0, 0, 0))); ++ __m256d vBEdge1 = _mm256_cvtepi32_pd(_mm_shuffle_epi32(vB, _MM_SHUFFLE(1, 1, 1, 1))); ++ __m256d vBEdge2 = _mm256_cvtepi32_pd(_mm_shuffle_epi32(vB, _MM_SHUFFLE(2, 2, 2, 2))); ++ ++ __m256d vStep0X = _mm256_mul_pd(vAEdge0, _mm256_set1_pd(2 * FIXED_POINT_SCALE)); ++ __m256d vStep0Y = _mm256_mul_pd(vBEdge0, _mm256_set1_pd(2 * FIXED_POINT_SCALE)); ++ ++ __m256d vStep1X = _mm256_mul_pd(vAEdge1, _mm256_set1_pd(2 * FIXED_POINT_SCALE)); ++ __m256d vStep1Y = _mm256_mul_pd(vBEdge1, _mm256_set1_pd(2 * FIXED_POINT_SCALE)); ++ ++ __m256d vStep2X = _mm256_mul_pd(vAEdge2, _mm256_set1_pd(2 * FIXED_POINT_SCALE)); ++ __m256d vStep2Y = _mm256_mul_pd(vBEdge2, _mm256_set1_pd(2 * FIXED_POINT_SCALE)); ++ ++ // fast unrolled version for 8x8 tile ++#if KNOB_TILE_X_DIM == 8 && KNOB_TILE_Y_DIM == 8 ++ int mask0, mask1, mask2; ++ uint64_t mask; ++ ++ // evaluate which pixels in the quad are covered ++#define EVAL \ ++ mask0 = _mm256_movemask_pd(vEdge0);\ ++ mask1 = _mm256_movemask_pd(vEdge1);\ ++ mask2 = _mm256_movemask_pd(vEdge2); ++ ++ // update coverage mask ++#define UPDATE_MASK(bit) \ ++ mask = mask0 & mask1 & mask2;\ ++ coverageMask |= (mask << bit); ++ ++ // step in the +x direction to the next quad ++#define INCX \ ++ vEdge0 = _mm256_add_pd(vEdge0, vStep0X);\ ++ vEdge1 = _mm256_add_pd(vEdge1, vStep1X);\ ++ vEdge2 = _mm256_add_pd(vEdge2, vStep2X); ++ // step in the +y direction to the next quad ++#define INCY \ ++ vEdge0 = _mm256_add_pd(vEdge0, vStep0Y);\ ++ vEdge1 = _mm256_add_pd(vEdge1, vStep1Y);\ ++ vEdge2 = _mm256_add_pd(vEdge2, vStep2Y); ++ // step in the -x direction to the next quad ++#define DECX \ ++ vEdge0 = _mm256_sub_pd(vEdge0, vStep0X);\ ++ vEdge1 = _mm256_sub_pd(vEdge1, vStep1X);\ ++ vEdge2 = _mm256_sub_pd(vEdge2, vStep2X); ++ ++ // sweep 2x2 quad back and forth through the raster tile, ++ // computing coverage masks for the entire tile ++ ++ // raster tile ++ // 0 1 2 3 4 5 6 7 ++ // x x ++ // x x ------------------> ++ // x x | ++ // <-----------------x x V ++ // .. 
++ ++ // row 0 ++ EVAL; ++ UPDATE_MASK(0); ++ INCX; ++ EVAL; ++ UPDATE_MASK(4); ++ INCX; ++ EVAL; ++ UPDATE_MASK(8); ++ INCX; ++ EVAL; ++ UPDATE_MASK(12); ++ INCY; ++ ++ //row 1 ++ EVAL; ++ UPDATE_MASK(28); ++ DECX; ++ EVAL; ++ UPDATE_MASK(24); ++ DECX; ++ EVAL; ++ UPDATE_MASK(20); ++ DECX; ++ EVAL; ++ UPDATE_MASK(16); ++ INCY; ++ ++ // row 2 ++ EVAL; ++ UPDATE_MASK(32); ++ INCX; ++ EVAL; ++ UPDATE_MASK(36); ++ INCX; ++ EVAL; ++ UPDATE_MASK(40); ++ INCX; ++ EVAL; ++ UPDATE_MASK(44); ++ INCY; ++ ++ // row 3 ++ EVAL; ++ UPDATE_MASK(60); ++ DECX; ++ EVAL; ++ UPDATE_MASK(56); ++ DECX; ++ EVAL; ++ UPDATE_MASK(52); ++ DECX; ++ EVAL; ++ UPDATE_MASK(48); ++#else ++ uint32_t bit = 0; ++ for (uint32_t y = 0; y < KNOB_TILE_Y_DIM/2; ++y) ++ { ++ __m256d vStartOfRowEdge0 = vEdge0; ++ __m256d vStartOfRowEdge1 = vEdge1; ++ __m256d vStartOfRowEdge2 = vEdge2; ++ ++ for (uint32_t x = 0; x < KNOB_TILE_X_DIM/2; ++x) ++ { ++ int mask0 = _mm256_movemask_pd(vEdge0); ++ int mask1 = _mm256_movemask_pd(vEdge1); ++ int mask2 = _mm256_movemask_pd(vEdge2); ++ ++ uint64_t mask = mask0 & mask1 & mask2; ++ coverageMask |= (mask << bit); ++ ++ // step to the next pixel in the x ++ vEdge0 = _mm256_add_pd(vEdge0, vStep0X); ++ vEdge1 = _mm256_add_pd(vEdge1, vStep1X); ++ vEdge2 = _mm256_add_pd(vEdge2, vStep2X); ++ bit+=4; ++ } ++ ++ // step to the next row ++ vEdge0 = _mm256_add_pd(vStartOfRowEdge0, vStep0Y); ++ vEdge1 = _mm256_add_pd(vStartOfRowEdge1, vStep1Y); ++ vEdge2 = _mm256_add_pd(vStartOfRowEdge2, vStep2Y); ++ } ++#endif ++ return coverageMask; ++ ++} ++// Top left rule: ++// Top: if an edge is horizontal, and it is above other edges in tri pixel space, it is a 'top' edge ++// Left: if an edge is not horizontal, and it is on the left side of the triangle in pixel space, it is a 'left' edge ++// Top left: a sample is in if it is a top or left edge. 
++// On-edge samples (edge value == 0) are covered only on 'top' or 'left' edges;
++// the adjustment below biases those edges by one so an exactly-on-edge sample tests inside.
++INLINE __m256d adjustTopLeftRuleIntFix16(const __m128i vA, const __m128i vB, const __m256d vEdge)
++{
++    // if vA < 0, vC--
++    // if vA == 0 && vB < 0, vC--
++
++    __m256d vEdgeOut = vEdge;
++    __m256d vEdgeAdjust = _mm256_sub_pd(vEdge, _mm256_set1_pd(1.0));
++
++    // vA < 0: non-horizontal edge on the left side of the triangle (a 'left' edge)
++    int msk = _mm_movemask_ps(_mm_castsi128_ps(vA));
++
++    // vA == 0 && vB < 0: horizontal edge above the interior (a 'top' edge)
++    __m128i vCmp = _mm_cmpeq_epi32(vA, _mm_setzero_si128());
++    int msk2 = _mm_movemask_ps(_mm_castsi128_ps(vCmp));
++    msk2 &= _mm_movemask_ps(_mm_castsi128_ps(vB));
++
++    // if either of these are true and we're on the line (edge == 0), bump it outside the line
++    vEdgeOut = _mm256_blendv_pd(vEdgeOut, vEdgeAdjust, gMaskToVecpd[msk | msk2]);
++    return vEdgeOut;
++}
++
++// max(abs(dz/dx), abs(dz/dy))
++INLINE float ComputeMaxDepthSlope(const SWR_TRIANGLE_DESC* pDesc)
++{
++    /*
++    // evaluate i,j at (0,0)
++    float i00 = pDesc->I[0] * 0.0f + pDesc->I[1] * 0.0f + pDesc->I[2];
++    float j00 = pDesc->J[0] * 0.0f + pDesc->J[1] * 0.0f + pDesc->J[2];
++
++    // evaluate i,j at (1,0)
++    float i10 = pDesc->I[0] * 1.0f + pDesc->I[1] * 0.0f + pDesc->I[2];
++    float j10 = pDesc->J[0] * 1.0f + pDesc->J[1] * 0.0f + pDesc->J[2];
++
++    // compute dz/dx
++    float d00 = pDesc->Z[0] * i00 + pDesc->Z[1] * j00 + pDesc->Z[2];
++    float d10 = pDesc->Z[0] * i10 + pDesc->Z[1] * j10 + pDesc->Z[2];
++    float dzdx = abs(d10 - d00);
++
++    // evaluate i,j at (0,1)
++    float i01 = pDesc->I[0] * 0.0f + pDesc->I[1] * 1.0f + pDesc->I[2];
++    float j01 = pDesc->J[0] * 0.0f + pDesc->J[1] * 1.0f + pDesc->J[2];
++
++    float d01 = pDesc->Z[0] * i01 + pDesc->Z[1] * j01 + pDesc->Z[2];
++    float dzdy = abs(d01 - d00);
++    */
++
++    // optimized version of above
++    float dzdx = fabsf(pDesc->recipDet * (pDesc->Z[0] * pDesc->I[0] + pDesc->Z[1] * pDesc->J[0]));
++    float dzdy = fabsf(pDesc->recipDet * (pDesc->Z[0] * pDesc->I[1] + pDesc->Z[1] * pDesc->J[1]));
++
++    return std::max(dzdx, dzdy);
++}
++
++INLINE float ComputeBiasFactor(const SWR_RASTSTATE* pState, const SWR_TRIANGLE_DESC* pDesc, const float* z)
++{
++    if (pState->depthFormat == R24_UNORM_X8_TYPELESS)
++    {
++        return (1.0f / (1 << 24));
++    }
++    else if (pState->depthFormat == R16_UNORM)
++    {
++        return (1.0f / (1 << 16));
++    }
++    else
++    {
++        SWR_ASSERT(pState->depthFormat == R32_FLOAT);
++
++        // for f32 depth, factor = 2^(exponent(max(abs(z))) - 23)
++        float zMax = std::max(fabsf(z[0]), std::max(fabsf(z[1]), fabsf(z[2])));
++        uint32_t zMaxInt = *(uint32_t*)&zMax;
++        zMaxInt &= 0x7f800000;
++        zMax = *(float*)&zMaxInt;
++
++        return zMax * (1.0f / (1 << 23));
++    }
++}
++
++INLINE float ComputeDepthBias(const SWR_RASTSTATE* pState, const SWR_TRIANGLE_DESC* pTri, const float* z)
++{
++    if (pState->depthBias == 0 && pState->slopeScaledDepthBias == 0)
++    {
++        return 0.0f;
++    }
++
++    float scale = pState->slopeScaledDepthBias;
++    if (scale != 0.0f)
++    {
++        scale *= ComputeMaxDepthSlope(pTri);
++    }
++
++    float bias = pState->depthBias * ComputeBiasFactor(pState, pTri, z) + scale;
++    if (pState->depthBiasClamp > 0.0f)
++    {
++        bias = std::min(bias, pState->depthBiasClamp);
++    }
++    else if (pState->depthBiasClamp < 0.0f)
++    {
++        bias = std::max(bias, pState->depthBiasClamp);
++    }
++
++    return bias;
++}
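A scalar restatement of the fill rule implemented by adjustTopLeftRuleIntFix16, for reference (illustrative only; the vector code folds the rule into a one-unit bias instead of branching, and negative edge values mean "inside" here):

```cpp
#include <cstdint>

// Scalar model of the top-left rule above: edges with A < 0 ('left' edges)
// or A == 0 && B < 0 ('top' edges) get an inclusive test, so samples lying
// exactly on those edges are covered; samples on any other edge are not.
bool sampleCovered(int64_t edge, int32_t A, int32_t B)
{
    bool topOrLeft = (A < 0) || (A == 0 && B < 0);
    return topOrLeft ? (edge <= 0) : (edge < 0);
}
```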
++// Prevent DCE by writing coverage mask from rasterizer to volatile
++#if KNOB_ENABLE_TOSS_POINTS
++__declspec(thread) volatile uint64_t gToss;
++#endif
++
++static const uint32_t vertsPerTri = 3, componentsPerAttrib = 4;
++// try to avoid _chkstk insertions; make this thread local
++static THREAD OSALIGN(float, 16) perspAttribsTLS[vertsPerTri * KNOB_NUM_ATTRIBUTES * componentsPerAttrib];
++
++template <SWR_MULTISAMPLE_COUNT sampleCount>
++void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pDesc)
++{
++
++    const TRIANGLE_WORK_DESC &workDesc = *((TRIANGLE_WORK_DESC*)pDesc);
++#if KNOB_ENABLE_TOSS_POINTS
++    if (KNOB_TOSS_BIN_TRIS)
++    {
++        return;
++    }
++#endif
++    RDTSC_START(BERasterizeTriangle);
++
++    RDTSC_START(BETriangleSetup);
++    const API_STATE &state = GetApiState(pDC);
++    const SWR_RASTSTATE &rastState = state.rastState;
++
++    OSALIGN(SWR_TRIANGLE_DESC, 16) triDesc;
++    triDesc.pUserClipBuffer = workDesc.pUserClipBuffer;
++
++    __m128 vX, vY, vZ, vRecipW;
++
++    // pTriBuffer data layout: grouped components of the 3 triangle points and 1 don't care
++    // eg: vX = [x0 x1 x2 dc]
++    vX = _mm_load_ps(workDesc.pTriBuffer);
++    vY = _mm_load_ps(workDesc.pTriBuffer + 4);
++    vZ = _mm_load_ps(workDesc.pTriBuffer + 8);
++    vRecipW = _mm_load_ps(workDesc.pTriBuffer + 12);
++
++    // convert to fixed point
++    __m128i vXi = fpToFixedPoint(vX);
++    __m128i vYi = fpToFixedPoint(vY);
++
++    // quantize floating point position to fixed point precision
++    // to prevent attribute creep around the triangle vertices
++    vX = _mm_mul_ps(_mm_cvtepi32_ps(vXi), _mm_set1_ps(1.0f / FIXED_POINT_SCALE));
++    vY = _mm_mul_ps(_mm_cvtepi32_ps(vYi), _mm_set1_ps(1.0f / FIXED_POINT_SCALE));
++
++    // triangle setup - A and B edge equation coefs
++    __m128 vA, vB;
++    triangleSetupAB(vX, vY, vA, vB);
++
++    __m128i vAi, vBi;
++    triangleSetupABInt(vXi, vYi, vAi, vBi);
++
++    // determinant
++    float det = calcDeterminantInt(vAi, vBi);
++
++    /// @todo: This test is flipped...we have a stray '-' sign somewhere
++    // Convert CW triangles to CCW
++    if (det > 0.0)
++    {
++        vA = _mm_mul_ps(vA, _mm_set1_ps(-1));
++        vB = _mm_mul_ps(vB, _mm_set1_ps(-1));
++        vAi = _mm_mullo_epi32(vAi, _mm_set1_epi32(-1));
++        vBi = _mm_mullo_epi32(vBi, _mm_set1_epi32(-1));
++        det = -det;
++    }
++
++    __m128 vC;
++    // Finish triangle setup - C edge coef
++    triangleSetupC(vX, vY, vA, vB, vC);
++
++    // compute barycentric i and j
++    // i = (A1x + B1y + C1)/det
++    // j = (A2x + B2y + C2)/det
++    __m128 vDet = _mm_set1_ps(det);
++    __m128 vRecipDet = _mm_div_ps(_mm_set1_ps(1.0f), vDet);//_mm_rcp_ps(vDet);
++    _mm_store_ss(&triDesc.recipDet, vRecipDet);
++
++    // only extract coefs for 2 of the barycentrics; the 3rd can be
++    // determined from the barycentric equation:
++    // i + j + k = 1 <=> k = 1 - j - i
++    _MM_EXTRACT_FLOAT(triDesc.I[0], vA, 1);
++    _MM_EXTRACT_FLOAT(triDesc.I[1], vB, 1);
++    _MM_EXTRACT_FLOAT(triDesc.I[2], vC, 1);
++    _MM_EXTRACT_FLOAT(triDesc.J[0], vA, 2);
++    _MM_EXTRACT_FLOAT(triDesc.J[1], vB, 2);
++    _MM_EXTRACT_FLOAT(triDesc.J[2], vC, 2);
++
++    OSALIGN(float, 16) oneOverW[4];
++    _mm_store_ps(oneOverW, vRecipW);
++    triDesc.OneOverW[0] = oneOverW[0] - oneOverW[2];
++    triDesc.OneOverW[1] = oneOverW[1] - oneOverW[2];
++    triDesc.OneOverW[2] = oneOverW[2];
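The attribute loop that follows premultiplies each vertex attribute by that vertex's 1/w. For reference, a scalar sketch of why this yields perspective-correct interpolation downstream (illustrative names, not the patch's API):

```cpp
// Perspective-correct interpolation, scalar reference. Assumes barycentrics
// i, j with k = 1 - i - j, matching the triDesc convention above: interpolate
// attrib/w and 1/w linearly in screen space, then divide to recover the
// perspective-correct attribute at the sample.
float perspInterp(const float attribOverW[3], const float oneOverW[3],
                  float i, float j)
{
    float k = 1.0f - i - j;
    float num   = i * attribOverW[0] + j * attribOverW[1] + k * attribOverW[2];
    float denom = i * oneOverW[0]    + j * oneOverW[1]    + k * oneOverW[2];
    return num / denom;
}
```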
++    // calculate perspective correct coefs per vertex attrib
++    float* pPerspAttribs = perspAttribsTLS;
++    float* pAttribs = workDesc.pAttribs;
++    triDesc.pPerspAttribs = pPerspAttribs;
++    triDesc.pAttribs = pAttribs;
++    float *pRecipW = workDesc.pTriBuffer + 12;
++    __m128 vOneOverWV0 = _mm_broadcast_ss(pRecipW);
++    __m128 vOneOverWV1 = _mm_broadcast_ss(pRecipW+=1);
++    __m128 vOneOverWV2 = _mm_broadcast_ss(pRecipW+=1);
++    for(uint32_t i = 0; i < workDesc.numAttribs; i++)
++    {
++        __m128 attribA = _mm_load_ps(pAttribs);
++        __m128 attribB = _mm_load_ps(pAttribs+=4);
++        __m128 attribC = _mm_load_ps(pAttribs+=4);
++        pAttribs+=4;
++
++        attribA = _mm_mul_ps(attribA, vOneOverWV0);
++        attribB = _mm_mul_ps(attribB, vOneOverWV1);
++        attribC = _mm_mul_ps(attribC, vOneOverWV2);
++
++        _mm_store_ps(pPerspAttribs, attribA);
++        _mm_store_ps(pPerspAttribs+=4, attribB);
++        _mm_store_ps(pPerspAttribs+=4, attribC);
++        pPerspAttribs+=4;
++    }
++
++    // compute bary Z
++    // zInterp = zVert0 + i(zVert1-zVert0) + j (zVert2 - zVert0)
++    OSALIGN(float, 16) a[4];
++    _mm_store_ps(a, vZ);
++    triDesc.Z[0] = a[0] - a[2];
++    triDesc.Z[1] = a[1] - a[2];
++    triDesc.Z[2] = a[2];
++
++    // add depth bias
++    triDesc.Z[2] += ComputeDepthBias(&rastState, &triDesc, workDesc.pTriBuffer + 8);
++
++    // broadcast A and B coefs for each edge to all slots
++    __m128i vAEdge0h = _mm_shuffle_epi32(vAi, _MM_SHUFFLE(0,0,0,0));
++    __m128i vAEdge1h = _mm_shuffle_epi32(vAi, _MM_SHUFFLE(1,1,1,1));
++    __m128i vAEdge2h = _mm_shuffle_epi32(vAi, _MM_SHUFFLE(2,2,2,2));
++    __m128i vBEdge0h = _mm_shuffle_epi32(vBi, _MM_SHUFFLE(0,0,0,0));
++    __m128i vBEdge1h = _mm_shuffle_epi32(vBi, _MM_SHUFFLE(1,1,1,1));
++    __m128i vBEdge2h = _mm_shuffle_epi32(vBi, _MM_SHUFFLE(2,2,2,2));
++
++    __m256d vAEdge0Fix8 = _mm256_cvtepi32_pd(vAEdge0h);
++    __m256d vAEdge1Fix8 = _mm256_cvtepi32_pd(vAEdge1h);
++    __m256d vAEdge2Fix8 = _mm256_cvtepi32_pd(vAEdge2h);
++    __m256d vBEdge0Fix8 = _mm256_cvtepi32_pd(vBEdge0h);
++    __m256d vBEdge1Fix8 = _mm256_cvtepi32_pd(vBEdge1h);
++    __m256d vBEdge2Fix8 = _mm256_cvtepi32_pd(vBEdge2h);
++
++    // Precompute pixel quad step offsets
++    //   0,0 ------ 1,0
++    //    |           |
++    //    |           |
++    //   0,1 ------ 1,1
++    const __m256d vQuadOffsetsXIntFix8 = _mm256_set_pd(FIXED_POINT_SCALE, 0, FIXED_POINT_SCALE, 0);
++    const __m256d vQuadOffsetsYIntFix8 = _mm256_set_pd(FIXED_POINT_SCALE, FIXED_POINT_SCALE, 0, 0);
++
++    // Evaluate edge equations at 4 upper left corners of a 2x2 pixel quad
++    // used to step between quads while sweeping over a raster tile
++    __m256d vQuadStepX0Fix16 = _mm256_mul_pd(vAEdge0Fix8, vQuadOffsetsXIntFix8);
++    __m256d vQuadStepX1Fix16 = _mm256_mul_pd(vAEdge1Fix8, vQuadOffsetsXIntFix8);
++    __m256d vQuadStepX2Fix16 = _mm256_mul_pd(vAEdge2Fix8, vQuadOffsetsXIntFix8);
++
++    __m256d vQuadStepY0Fix16 = _mm256_mul_pd(vBEdge0Fix8, vQuadOffsetsYIntFix8);
++    __m256d vQuadStepY1Fix16 = _mm256_mul_pd(vBEdge1Fix8, vQuadOffsetsYIntFix8);
++    __m256d vQuadStepY2Fix16 = _mm256_mul_pd(vBEdge2Fix8, vQuadOffsetsYIntFix8);
++
++    // vStepQuad = A*vQuadOffsetsXInt + B*vQuadOffsetsYInt
++    __m256d vStepQuad0Fix16 = _mm256_add_pd(vQuadStepX0Fix16, vQuadStepY0Fix16);
++    __m256d vStepQuad1Fix16 = _mm256_add_pd(vQuadStepX1Fix16, vQuadStepY1Fix16);
++    __m256d vStepQuad2Fix16 = _mm256_add_pd(vQuadStepX2Fix16, vQuadStepY2Fix16);
++
++    // Precompute tile step offsets
++    //   0,0 ------ KNOB_TILE_X_DIM-1,0
++    //    |           |
++    //    |           |
++    //   0,KNOB_TILE_Y_DIM-1 ------ KNOB_TILE_X_DIM-1,KNOB_TILE_Y_DIM-1
++    const __m256d vTileOffsetsXIntFix8 = _mm256_set_pd((KNOB_TILE_X_DIM-1)*FIXED_POINT_SCALE, 0, (KNOB_TILE_X_DIM-1)*FIXED_POINT_SCALE, 0);
++    const __m256d vTileOffsetsYIntFix8 = _mm256_set_pd((KNOB_TILE_Y_DIM-1)*FIXED_POINT_SCALE, (KNOB_TILE_Y_DIM-1)*FIXED_POINT_SCALE, 0, 0);
++
++    // Calc bounding box of triangle
++    OSALIGN(BBOX, 16) bbox;
++    calcBoundingBoxInt(vXi, vYi, bbox);
++
++    // Intersect with scissor/viewport
++    bbox.left = std::max(bbox.left, state.scissorInFixedPoint.left);
++    bbox.right =
std::min(bbox.right - 1, state.scissorInFixedPoint.right); ++ bbox.top = std::max(bbox.top, state.scissorInFixedPoint.top); ++ bbox.bottom = std::min(bbox.bottom - 1, state.scissorInFixedPoint.bottom); ++ ++ triDesc.triFlags = workDesc.triFlags; ++ ++ // further constrain backend to intersecting bounding box of macro tile and scissored triangle bbox ++ uint32_t macroX, macroY; ++ MacroTileMgr::getTileIndices(macroTile, macroX, macroY); ++ int32_t macroBoxLeft = macroX * KNOB_MACROTILE_X_DIM_FIXED; ++ int32_t macroBoxRight = macroBoxLeft + KNOB_MACROTILE_X_DIM_FIXED - 1; ++ int32_t macroBoxTop = macroY * KNOB_MACROTILE_Y_DIM_FIXED; ++ int32_t macroBoxBottom = macroBoxTop + KNOB_MACROTILE_Y_DIM_FIXED - 1; ++ ++ OSALIGN(BBOX, 16) intersect; ++ intersect.left = std::max(bbox.left, macroBoxLeft); ++ intersect.top = std::max(bbox.top, macroBoxTop); ++ intersect.right = std::min(bbox.right, macroBoxRight); ++ intersect.bottom = std::min(bbox.bottom, macroBoxBottom); ++ ++ SWR_ASSERT(intersect.left <= intersect.right && intersect.top <= intersect.bottom && intersect.left >= 0 && intersect.right >= 0 && intersect.top >= 0 && intersect.bottom >= 0); ++ ++ RDTSC_STOP(BETriangleSetup, 0, pDC->drawId); ++ ++ // update triangle desc ++ uint32_t tileX = intersect.left >> (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT); ++ uint32_t tileY = intersect.top >> (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT); ++ uint32_t maxTileX = intersect.right >> (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT); ++ uint32_t maxTileY = intersect.bottom >> (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT); ++ uint32_t numTilesX = maxTileX - tileX + 1; ++ uint32_t numTilesY = maxTileY - tileY + 1; ++ ++ if (numTilesX == 0 || numTilesY == 0) ++ { ++ RDTSC_EVENT(BEEmptyTriangle, 1, 0); ++ RDTSC_STOP(BERasterizeTriangle, 1, 0); ++ return; ++ } ++ ++ RDTSC_START(BEStepSetup); ++ ++ // Step to pixel center of top-left pixel of the triangle bbox ++ // Align intersect bbox (top/left) to raster tile's (top/left). 
++    int32_t x = AlignDown(intersect.left, (FIXED_POINT_SCALE * KNOB_TILE_X_DIM));
++    int32_t y = AlignDown(intersect.top, (FIXED_POINT_SCALE * KNOB_TILE_Y_DIM));
++
++    if(sampleCount == SWR_MULTISAMPLE_1X)
++    {
++        // Add 0.5, in fixed point, to offset to pixel center
++        x += (FIXED_POINT_SCALE / 2);
++        y += (FIXED_POINT_SCALE / 2);
++    }
++
++    __m128i vTopLeftX = _mm_set1_epi32(x);
++    __m128i vTopLeftY = _mm_set1_epi32(y);
++
++    // evaluate edge equations at top-left pixel using 64bit math
++    // all other evaluations will be 32bit steps from it
++    // small triangles could skip this and do all 32bit math
++    // edge 0
++    //
++    // line = Ax + By + C
++    // solving for C:
++    // C = -Ax - By
++    // we know x0 and y0 are on the line; plug them in:
++    // C = -Ax0 - By0
++    // plug C back into line equation:
++    // line = Ax + By - Ax0 - By0
++    // line = A(x - x0) + B(y - y0)
++    // line = A(x0+dX) + B(y0+dY) + C = Ax0 + AdX + By0 + BdY + C = AdX + BdY
++
++    // edge 0 and 1
++    // edge0 = A0(x - x0) + B0(y - y0)
++    // edge1 = A1(x - x1) + B1(y - y1)
++    __m128i vDeltaX = _mm_sub_epi32(vTopLeftX, vXi);
++    __m128i vDeltaY = _mm_sub_epi32(vTopLeftY, vYi);
++
++    __m256d vEdgeFix16[3];
++
++    // evaluate A(dx) and B(dY) for all points
++    __m256d vAipd = _mm256_cvtepi32_pd(vAi);
++    __m256d vBipd = _mm256_cvtepi32_pd(vBi);
++    __m256d vDeltaXpd = _mm256_cvtepi32_pd(vDeltaX);
++    __m256d vDeltaYpd = _mm256_cvtepi32_pd(vDeltaY);
++
++    __m256d vAiDeltaXFix16 = _mm256_mul_pd(vAipd, vDeltaXpd);
++    __m256d vBiDeltaYFix16 = _mm256_mul_pd(vBipd, vDeltaYpd);
++    __m256d vEdge = _mm256_add_pd(vAiDeltaXFix16, vBiDeltaYFix16);
++
++    // adjust for top-left rule
++    vEdge = adjustTopLeftRuleIntFix16(vAi, vBi, vEdge);
++
++    // broadcast respective edge results to all lanes
++    double* pEdge = (double*)&vEdge;
++    vEdgeFix16[0] = _mm256_set1_pd(pEdge[0]);
++    vEdgeFix16[1] = _mm256_set1_pd(pEdge[1]);
++    vEdgeFix16[2] = _mm256_set1_pd(pEdge[2]);
++
++    // compute step to the next tile
++    __m256d vNextXTileFix8 = _mm256_set1_pd(KNOB_TILE_X_DIM * FIXED_POINT_SCALE);
++    __m256d vNextYTileFix8 = _mm256_set1_pd(KNOB_TILE_Y_DIM * FIXED_POINT_SCALE);
++    __m256d vTileStepX0Fix16 = _mm256_mul_pd(vAEdge0Fix8, vNextXTileFix8);
++    __m256d vTileStepY0Fix16 = _mm256_mul_pd(vBEdge0Fix8, vNextYTileFix8);
++    __m256d vTileStepX1Fix16 = _mm256_mul_pd(vAEdge1Fix8, vNextXTileFix8);
++    __m256d vTileStepY1Fix16 = _mm256_mul_pd(vBEdge1Fix8, vNextYTileFix8);
++    __m256d vTileStepX2Fix16 = _mm256_mul_pd(vAEdge2Fix8, vNextXTileFix8);
++    __m256d vTileStepY2Fix16 = _mm256_mul_pd(vBEdge2Fix8, vNextYTileFix8);
++
++    // Evaluate edge equations at sample positions of each of the 4 corners of a raster tile
++    // used for testing if the entire raster tile is inside a triangle
++    __m256d vResultAxFix16 = _mm256_mul_pd(vAEdge0Fix8, vTileOffsetsXIntFix8);
++    __m256d vResultByFix16 = _mm256_mul_pd(vBEdge0Fix8, vTileOffsetsYIntFix8);
++    vEdgeFix16[0] = _mm256_add_pd(vEdgeFix16[0], _mm256_add_pd(vResultAxFix16, vResultByFix16));
++
++    vResultAxFix16 = _mm256_mul_pd(vAEdge1Fix8, vTileOffsetsXIntFix8);
++    vResultByFix16 = _mm256_mul_pd(vBEdge1Fix8, vTileOffsetsYIntFix8);
++    vEdgeFix16[1] = _mm256_add_pd(vEdgeFix16[1], _mm256_add_pd(vResultAxFix16, vResultByFix16));
++
++    vResultAxFix16 = _mm256_mul_pd(vAEdge2Fix8, vTileOffsetsXIntFix8);
++    vResultByFix16 = _mm256_mul_pd(vBEdge2Fix8, vTileOffsetsYIntFix8);
++    vEdgeFix16[2] = _mm256_add_pd(vEdgeFix16[2], _mm256_add_pd(vResultAxFix16, vResultByFix16));
++
++    // at this point vEdge has been evaluated at the UL pixel corners of the raster tile bbox
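The comment above marks the end of the wide-math setup; from here on every edge value is advanced incrementally. A scalar model of that scheme (illustrative types and names, not the patch's own):

```cpp
#include <cstdint>

// E(x, y) = A*(x - x0) + B*(y - y0) is evaluated once with wide math at an
// anchor position; after that, moving by dx fixed-point units adds A*dx and
// moving by dy units adds B*dy, so the sweep loops use only additions.
struct Edge { int64_t A, B, value; };

void stepX(Edge &e, int64_t dxFixed) { e.value += e.A * dxFixed; }
void stepY(Edge &e, int64_t dyFixed) { e.value += e.B * dyFixed; }
```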
++    // step sample positions to the raster tile bbox of multisample points
++    // min(xSamples),min(ySamples) ------ max(xSamples),min(ySamples)
++    //    |                               |
++    //    |                               |
++    // min(xSamples),max(ySamples) ------ max(xSamples),max(ySamples)
++    __m256d vEdge0TileBbox, vEdge1TileBbox, vEdge2TileBbox;
++    if(sampleCount > SWR_MULTISAMPLE_1X)
++    {
++        __m128i vTileSampleBBoxXh = MultisampleTraits<sampleCount>::TileSampleOffsetsX();
++        __m128i vTileSampleBBoxYh = MultisampleTraits<sampleCount>::TileSampleOffsetsY();
++
++        __m256d vTileSampleBBoxXFix8 = _mm256_cvtepi32_pd(vTileSampleBBoxXh);
++        __m256d vTileSampleBBoxYFix8 = _mm256_cvtepi32_pd(vTileSampleBBoxYh);
++
++        // step edge equation tests from the tile corners to the sample bbox corners;
++        // used for testing if the entire raster tile is inside a triangle
++        vResultAxFix16 = _mm256_mul_pd(vAEdge0Fix8, vTileSampleBBoxXFix8);
++        vResultByFix16 = _mm256_mul_pd(vBEdge0Fix8, vTileSampleBBoxYFix8);
++        vEdge0TileBbox = _mm256_add_pd(vResultAxFix16, vResultByFix16);
++
++        vResultAxFix16 = _mm256_mul_pd(vAEdge1Fix8, vTileSampleBBoxXFix8);
++        vResultByFix16 = _mm256_mul_pd(vBEdge1Fix8, vTileSampleBBoxYFix8);
++        vEdge1TileBbox = _mm256_add_pd(vResultAxFix16, vResultByFix16);
++
++        vResultAxFix16 = _mm256_mul_pd(vAEdge2Fix8, vTileSampleBBoxXFix8);
++        vResultByFix16 = _mm256_mul_pd(vBEdge2Fix8, vTileSampleBBoxYFix8);
++        vEdge2TileBbox = _mm256_add_pd(vResultAxFix16, vResultByFix16);
++    }
++
++    RDTSC_STOP(BEStepSetup, 0, pDC->drawId);
++
++    uint32_t tY = tileY;
++    uint32_t tX = tileX;
++    uint32_t maxY = maxTileY;
++    uint32_t maxX = maxTileX;
++
++    triDesc.pSamplePos = pDC->pState->state.samplePos;
++
++    // compute steps between raster tiles for render output buffers
++    static const uint32_t colorRasterTileStep{(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8)) * MultisampleTraits<sampleCount>::numSamples};
++    static const uint32_t colorRasterTileRowStep{(KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM) * colorRasterTileStep};
++    static const uint32_t depthRasterTileStep{(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8)) * MultisampleTraits<sampleCount>::numSamples};
++    static const uint32_t depthRasterTileRowStep{(KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM) * depthRasterTileStep};
++    static const uint32_t stencilRasterTileStep{(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8)) * MultisampleTraits<sampleCount>::numSamples};
++    static const uint32_t stencilRasterTileRowStep{(KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM) * stencilRasterTileStep};
++    RenderOutputBuffers renderBuffers, currentRenderBufferRow;
++
++    GetRenderHotTiles(pDC, macroTile, tileX, tileY, renderBuffers, MultisampleTraits<sampleCount>::numSamples,
++                      triDesc.triFlags.renderTargetArrayIndex);
++    currentRenderBufferRow = renderBuffers;
++
++    // rasterize and generate coverage masks per sample
++    uint32_t maxSamples = MultisampleTraits<sampleCount>::numSamples;
++    for (uint32_t tileY = tY; tileY <= maxY; ++tileY)
++    {
++        __m256d vStartOfRowEdge0 = vEdgeFix16[0];
++        __m256d vStartOfRowEdge1 = vEdgeFix16[1];
++        __m256d vStartOfRowEdge2 = vEdgeFix16[2];
++
++        for (uint32_t tileX = tX; tileX <= maxX; ++tileX)
++        {
++            uint64_t anyCoveredSamples = 0;
++
++            // is the corner of the edge outside of the raster tile? (vEdge < 0)
++            int mask0, mask1, mask2;
++            if(sampleCount == SWR_MULTISAMPLE_1X)
++            {
++                // is the corner of the edge outside of the raster tile?
(vEdge < 0)
++                mask0 = _mm256_movemask_pd(vEdgeFix16[0]);
++                mask1 = _mm256_movemask_pd(vEdgeFix16[1]);
++                mask2 = _mm256_movemask_pd(vEdgeFix16[2]);
++            }
++            else
++            {
++                __m256d vSampleBboxTest0, vSampleBboxTest1, vSampleBboxTest2;
++                // evaluate edge equations at the tile multisample bounding box
++                vSampleBboxTest0 = _mm256_add_pd(vEdge0TileBbox, vEdgeFix16[0]);
++                vSampleBboxTest1 = _mm256_add_pd(vEdge1TileBbox, vEdgeFix16[1]);
++                vSampleBboxTest2 = _mm256_add_pd(vEdge2TileBbox, vEdgeFix16[2]);
++                mask0 = _mm256_movemask_pd(vSampleBboxTest0);
++                mask1 = _mm256_movemask_pd(vSampleBboxTest1);
++                mask2 = _mm256_movemask_pd(vSampleBboxTest2);
++            }
++
++            for (uint32_t sampleNum = 0; sampleNum < maxSamples; sampleNum++)
++            {
++                // trivial reject, at least one edge has all 4 corners of raster tile outside
++                bool trivialReject = !(mask0 && mask1 && mask2);
++
++                if (!trivialReject)
++                {
++                    // trivial accept mask
++                    triDesc.coverageMask[sampleNum] = 0xffffffffffffffffULL;
++                    if ((mask0 & mask1 & mask2) == 0xf)
++                    {
++                        anyCoveredSamples = triDesc.coverageMask[sampleNum];
++                        // trivial accept, all 4 corners of all 3 edges are negative
++                        // i.e. raster tile completely inside triangle
++                        RDTSC_EVENT(BETrivialAccept, 1, 0);
++                    }
++                    else
++                    {
++                        __m256d vEdge0AtSample, vEdge1AtSample, vEdge2AtSample;
++                        if(sampleCount == SWR_MULTISAMPLE_1X)
++                        {
++                            // should get optimized out for single sample case (global value numbering or copy propagation)
++                            vEdge0AtSample = vEdgeFix16[0];
++                            vEdge1AtSample = vEdgeFix16[1];
++                            vEdge2AtSample = vEdgeFix16[2];
++                        }
++                        else
++                        {
++                            __m128i vSampleOffsetXh = MultisampleTraits<sampleCount>::vXi(sampleNum);
++                            __m128i vSampleOffsetYh = MultisampleTraits<sampleCount>::vYi(sampleNum);
++                            __m256d vSampleOffsetX = _mm256_cvtepi32_pd(vSampleOffsetXh);
++                            __m256d vSampleOffsetY = _mm256_cvtepi32_pd(vSampleOffsetYh);
++
++                            // *note*: none of this needs to be vectorized as rasterizePartialTile just takes vEdge[0]
++                            // for each edge and broadcasts it before offsetting to individual pixel quads
++
++                            // step edge equation tests from UL tile corner to pixel sample position
++                            vResultAxFix16 = _mm256_mul_pd(vAEdge0Fix8, vSampleOffsetX);
++                            vResultByFix16 = _mm256_mul_pd(vBEdge0Fix8, vSampleOffsetY);
++                            vEdge0AtSample = _mm256_add_pd(vResultAxFix16, vResultByFix16);
++                            vEdge0AtSample = _mm256_add_pd(vEdgeFix16[0], vEdge0AtSample);
++
++                            vResultAxFix16 = _mm256_mul_pd(vAEdge1Fix8, vSampleOffsetX);
++                            vResultByFix16 = _mm256_mul_pd(vBEdge1Fix8, vSampleOffsetY);
++                            vEdge1AtSample = _mm256_add_pd(vResultAxFix16, vResultByFix16);
++                            vEdge1AtSample = _mm256_add_pd(vEdgeFix16[1], vEdge1AtSample);
++
++                            vResultAxFix16 = _mm256_mul_pd(vAEdge2Fix8, vSampleOffsetX);
++                            vResultByFix16 = _mm256_mul_pd(vBEdge2Fix8, vSampleOffsetY);
++                            vEdge2AtSample = _mm256_add_pd(vResultAxFix16, vResultByFix16);
++                            vEdge2AtSample = _mm256_add_pd(vEdgeFix16[2], vEdge2AtSample);
++                        }
++
++                        // not trivial accept or reject, must rasterize full tile
++                        RDTSC_START(BERasterizePartial);
++                        triDesc.coverageMask[sampleNum] = rasterizePartialTile(pDC, vEdge0AtSample, vEdge1AtSample, vEdge2AtSample,
++                                                                               vAi, vBi, vStepQuad0Fix16, vStepQuad1Fix16, vStepQuad2Fix16);
++                        RDTSC_STOP(BERasterizePartial, 0, 0);
++
++                        anyCoveredSamples |= triDesc.coverageMask[sampleNum];
++                    }
++                }
++                else
++                {
++                    if(sampleCount > SWR_MULTISAMPLE_1X)
++                    {
++                        triDesc.coverageMask[sampleNum] = 0;
++                    }
++                    RDTSC_EVENT(BETrivialReject, 1, 0);
++                }
++            }
++
++#if KNOB_ENABLE_TOSS_POINTS
++            if(KNOB_TOSS_RS)
++            {
++                gToss = triDesc.coverageMask[0];
++            }
++            else
++#endif ++ if(anyCoveredSamples) ++ { ++ RDTSC_START(BEPixelBackend); ++ pDC->pState->pfnBackend(pDC, workerId, tileX << KNOB_TILE_X_DIM_SHIFT, tileY << KNOB_TILE_Y_DIM_SHIFT, triDesc, renderBuffers); ++ RDTSC_STOP(BEPixelBackend, 0, 0); ++ } ++ ++ // step to the next tile in X ++ vEdgeFix16[0] = _mm256_add_pd(vEdgeFix16[0], vTileStepX0Fix16); ++ vEdgeFix16[1] = _mm256_add_pd(vEdgeFix16[1], vTileStepX1Fix16); ++ vEdgeFix16[2] = _mm256_add_pd(vEdgeFix16[2], vTileStepX2Fix16); ++ ++ StepRasterTileX(state.psState.maxRTSlotUsed, renderBuffers, colorRasterTileStep, depthRasterTileStep, stencilRasterTileStep); ++ } ++ ++ // step to the next tile in Y ++ vEdgeFix16[0] = _mm256_add_pd(vStartOfRowEdge0, vTileStepY0Fix16); ++ vEdgeFix16[1] = _mm256_add_pd(vStartOfRowEdge1, vTileStepY1Fix16); ++ vEdgeFix16[2] = _mm256_add_pd(vStartOfRowEdge2, vTileStepY2Fix16); ++ ++ StepRasterTileY(state.psState.maxRTSlotUsed, renderBuffers, currentRenderBufferRow, colorRasterTileRowStep, depthRasterTileRowStep, stencilRasterTileRowStep); ++ } ++ ++ RDTSC_STOP(BERasterizeTriangle, 1, 0); ++} ++ ++void RasterizePoint(DRAW_CONTEXT *pDC, uint32_t workerId, const TRIANGLE_WORK_DESC &workDesc, uint32_t macroTile) ++{ ++#if KNOB_ENABLE_TOSS_POINTS ++ if (KNOB_TOSS_BIN_TRIS) ++ { ++ return; ++ } ++#endif ++ ++ // map x,y relative offsets from start of raster tile to bit position in ++ // coverage mask for the point ++ static const uint32_t coverageMap[8][8] = { ++ { 0, 1, 4, 5, 8, 9, 12, 13 }, ++ { 2, 3, 6, 7, 10, 11, 14, 15 }, ++ { 16, 17, 20, 21, 24, 25, 28, 29 }, ++ { 18, 19, 22, 23, 26, 27, 30, 31 }, ++ { 32, 33, 36, 37, 40, 41, 44, 45 }, ++ { 34, 35, 38, 39, 42, 43, 46, 47 }, ++ { 48, 49, 52, 53, 56, 57, 60, 61 }, ++ { 50, 51, 54, 55, 58, 59, 62, 63 } ++ }; ++ ++ OSALIGN(SWR_TRIANGLE_DESC, 16) triDesc; ++ ++ // pull point information from triangle buffer ++ // @todo use structs for readability ++ uint32_t tileAlignedX = *(uint32_t*)workDesc.pTriBuffer; ++ uint32_t tileAlignedY = *(uint32_t*)(workDesc.pTriBuffer + 1); ++ float z = *(workDesc.pTriBuffer + 2); ++ ++ // construct triangle descriptor for point ++ // no interpolation, set up i,j for constant interpolation of z and attribs ++ // @todo implement an optimized backend that doesn't require triangle information ++ ++ // compute coverage mask from x,y packed into the coverageMask flag ++ // mask indices by the maximum valid index for x/y of coveragemap. ++ uint32_t tX = workDesc.triFlags.coverageMask & 0x7; ++ uint32_t tY = (workDesc.triFlags.coverageMask >> 4) & 0x7; ++ // todo: multisample points? 
++    triDesc.coverageMask[0] = 1ULL << coverageMap[tY][tX];
++
++    // no persp divide needed for points
++    triDesc.pAttribs = triDesc.pPerspAttribs = workDesc.pAttribs;
++    triDesc.triFlags = workDesc.triFlags;
++    triDesc.recipDet = 1.0f;
++    triDesc.OneOverW[0] = triDesc.OneOverW[1] = triDesc.OneOverW[2] = 1.0f;
++    triDesc.I[0] = triDesc.I[1] = triDesc.I[2] = 0.0f;
++    triDesc.J[0] = triDesc.J[1] = triDesc.J[2] = 0.0f;
++    triDesc.Z[0] = triDesc.Z[1] = triDesc.Z[2] = z;
++
++    RenderOutputBuffers renderBuffers;
++    GetRenderHotTiles(pDC, macroTile, tileAlignedX >> KNOB_TILE_X_DIM_SHIFT, tileAlignedY >> KNOB_TILE_Y_DIM_SHIFT,
++                      renderBuffers, 1, triDesc.triFlags.renderTargetArrayIndex);
++
++    RDTSC_START(BEPixelBackend);
++    pDC->pState->pfnBackend(pDC, workerId, tileAlignedX, tileAlignedY, triDesc, renderBuffers);
++    RDTSC_STOP(BEPixelBackend, 0, 0);
++}
++
++void rastPoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData)
++{
++    TRIANGLE_WORK_DESC *pDesc = (TRIANGLE_WORK_DESC*)pData;
++    RasterizePoint(pDC, workerId, *pDesc, macroTile);
++}
++
++// Get pointers to hot tile memory for color RT, depth, stencil
++void GetRenderHotTiles(DRAW_CONTEXT *pDC, uint32_t macroID, uint32_t tileX, uint32_t tileY, RenderOutputBuffers &renderBuffers,
++                       uint32_t numSamples, uint32_t renderTargetArrayIndex)
++{
++    const API_STATE& state = GetApiState(pDC);
++    SWR_CONTEXT *pContext = pDC->pContext;
++    const SWR_DEPTH_STENCIL_STATE *pDSState = &state.depthStencilState;
++    const uint32_t MaxRT = state.psState.maxRTSlotUsed;
++
++    uint32_t mx, my;
++    MacroTileMgr::getTileIndices(macroID, mx, my);
++    tileX -= KNOB_MACROTILE_X_DIM_IN_TILES * mx;
++    tileY -= KNOB_MACROTILE_Y_DIM_IN_TILES * my;
++
++    if(state.psState.pfnPixelShader != NULL)
++    {
++        // compute tile offset for active hottile buffers
++        const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8;
++        uint32_t offset = ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp> >(pitch, tileX, tileY);
++        offset*=numSamples;
++        for(uint32_t rt = 0; rt <= MaxRT; ++rt)
++        {
++            HOTTILE *pColor = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroID, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rt), true,
++                                                                numSamples, renderTargetArrayIndex);
++            pColor->state = HOTTILE_DIRTY;
++            renderBuffers.pColor[rt] = pColor->pBuffer + offset;
++        }
++    }
++    if(pDSState->depthTestEnable || pDSState->depthWriteEnable)
++    {
++        const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8;
++        uint32_t offset = ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp> >(pitch, tileX, tileY);
++        offset*=numSamples;
++        HOTTILE *pDepth = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_DEPTH, true,
++                                                            numSamples, renderTargetArrayIndex);
++        pDepth->state = HOTTILE_DIRTY;
++        SWR_ASSERT(pDepth->pBuffer != nullptr);
++        renderBuffers.pDepth = pDepth->pBuffer + offset;
++    }
++    if(pDSState->stencilTestEnable)
++    {
++        const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8;
++        uint32_t offset = ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp> >(pitch, tileX, tileY);
++        offset*=numSamples;
++        HOTTILE* pStencil = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_STENCIL, true,
++                                                              numSamples, renderTargetArrayIndex);
++        pStencil->state = HOTTILE_DIRTY;
++        SWR_ASSERT(pStencil->pBuffer != nullptr);
++        renderBuffers.pStencil = pStencil->pBuffer + offset;
++    }
++}
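GetRenderHotTiles converts a raster-tile coordinate into pointers inside per-macrotile hot tiles. A simplified, self-contained model of that addressing (illustrative names; the real code goes through ComputeTileOffset2D and the tiling traits):

```cpp
#include <cstdint>

// Each raster tile occupies a contiguous block inside the macrotile, so its
// byte offset scales with tile area, bytes per pixel, and sample count.
uint32_t tileByteOffset(uint32_t tileX, uint32_t tileY,
                        uint32_t tileDimX, uint32_t tileDimY,
                        uint32_t tilesPerRow, uint32_t bytesPerPixel,
                        uint32_t numSamples)
{
    uint32_t tileBytes = tileDimX * tileDimY * bytesPerPixel * numSamples;
    return (tileY * tilesPerRow + tileX) * tileBytes;
}
```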
++
++INLINE
++void StepRasterTileX(uint32_t MaxRT, RenderOutputBuffers &buffers, uint32_t colorTileStep, uint32_t depthTileStep, uint32_t stencilTileStep)
++{
++    for(uint32_t rt = 0; rt <= MaxRT; ++rt)
++    {
++        buffers.pColor[rt] += colorTileStep;
++    }
++
++    buffers.pDepth += depthTileStep;
++    buffers.pStencil += stencilTileStep;
++}
++
++INLINE
++void StepRasterTileY(uint32_t MaxRT, RenderOutputBuffers &buffers, RenderOutputBuffers &startBufferRow, uint32_t colorRowStep, uint32_t depthRowStep, uint32_t stencilRowStep)
++{
++    for(uint32_t rt = 0; rt <= MaxRT; ++rt)
++    {
++        startBufferRow.pColor[rt] += colorRowStep;
++        buffers.pColor[rt] = startBufferRow.pColor[rt];
++    }
++    startBufferRow.pDepth += depthRowStep;
++    buffers.pDepth = startBufferRow.pDepth;
++
++    startBufferRow.pStencil += stencilRowStep;
++    buffers.pStencil = startBufferRow.pStencil;
++}
++
++// initialize rasterizer function table
++PFN_WORK_FUNC gRasterizerTable[SWR_MULTISAMPLE_TYPE_MAX] =
++{
++    RasterizeTriangle<SWR_MULTISAMPLE_1X>,
++    RasterizeTriangle<SWR_MULTISAMPLE_2X>,
++    RasterizeTriangle<SWR_MULTISAMPLE_4X>,
++    RasterizeTriangle<SWR_MULTISAMPLE_8X>,
++    RasterizeTriangle<SWR_MULTISAMPLE_16X>
++};
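The table above instantiates the triangle rasterizer once per sample count, so every `sampleCount == ...` test inside RasterizeTriangle is a compile-time constant. A self-contained sketch of the same pattern (names are illustrative, not SWR's):

```cpp
#include <cstdint>

enum SampleCount { MS_1X, MS_2X, MS_4X, MS_8X, MS_16X, MS_MAX };

// One instantiation per sample count; inside each, N is a constant and the
// compiler folds away the per-sample branches.
template <SampleCount N>
void rasterize(uint32_t tile)
{
    (void)tile;
    // if (N == MS_1X) { ... }  // resolved at compile time
}

using PfnRaster = void (*)(uint32_t);
PfnRaster table[MS_MAX] = {
    rasterize<MS_1X>, rasterize<MS_2X>, rasterize<MS_4X>,
    rasterize<MS_8X>, rasterize<MS_16X>,
};

int main() { table[MS_4X](0); }
```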
++
++void RasterizeLine(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData)
++{
++    const TRIANGLE_WORK_DESC &workDesc = *((TRIANGLE_WORK_DESC*)pData);
++#if KNOB_ENABLE_TOSS_POINTS
++    if (KNOB_TOSS_BIN_TRIS)
++    {
++        return;
++    }
++#endif
++
++    // bloat line to two tris and call the triangle rasterizer twice
++    RDTSC_START(BERasterizeLine);
++
++    const API_STATE &state = GetApiState(pDC);
++
++    // macrotile dimensioning
++    uint32_t macroX, macroY;
++    MacroTileMgr::getTileIndices(macroTile, macroX, macroY);
++    int32_t macroBoxLeft = macroX * KNOB_MACROTILE_X_DIM_FIXED;
++    int32_t macroBoxRight = macroBoxLeft + KNOB_MACROTILE_X_DIM_FIXED - 1;
++    int32_t macroBoxTop = macroY * KNOB_MACROTILE_Y_DIM_FIXED;
++    int32_t macroBoxBottom = macroBoxTop + KNOB_MACROTILE_Y_DIM_FIXED - 1;
++
++    // create a copy of the triangle buffer to write our adjusted vertices to
++    OSALIGNSIMD(float) newTriBuffer[4 * 4];
++    TRIANGLE_WORK_DESC newWorkDesc = workDesc;
++    newWorkDesc.pTriBuffer = &newTriBuffer[0];
++
++    // create a copy of the attrib buffer to write our adjusted attribs to
++    OSALIGNSIMD(float) newAttribBuffer[4 * 3 * KNOB_NUM_ATTRIBUTES];
++    newWorkDesc.pAttribs = &newAttribBuffer[0];
++
++    const __m128 vBloat0 = _mm_set_ps(0.5f, -0.5f, -0.5f, 0.5f);
++    const __m128 vBloat1 = _mm_set_ps(0.5f, 0.5f, 0.5f, -0.5f);
++
++    __m128 vX, vY, vZ, vRecipW;
++
++    vX = _mm_load_ps(workDesc.pTriBuffer);
++    vY = _mm_load_ps(workDesc.pTriBuffer + 4);
++    vZ = _mm_load_ps(workDesc.pTriBuffer + 8);
++    vRecipW = _mm_load_ps(workDesc.pTriBuffer + 12);
++
++    // triangle 0
++    // v0,v1 -> v0,v0,v1
++    __m128 vXa = _mm_shuffle_ps(vX, vX, _MM_SHUFFLE(1, 1, 0, 0));
++    __m128 vYa = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(1, 1, 0, 0));
++    __m128 vZa = _mm_shuffle_ps(vZ, vZ, _MM_SHUFFLE(1, 1, 0, 0));
++    __m128 vRecipWa = _mm_shuffle_ps(vRecipW, vRecipW, _MM_SHUFFLE(1, 1, 0, 0));
++
++    __m128 vLineWidth = _mm_set1_ps(pDC->pState->state.rastState.lineWidth);
++    __m128 vAdjust = _mm_mul_ps(vLineWidth, vBloat0);
++    if (workDesc.triFlags.yMajor)
++    {
++        vXa = _mm_add_ps(vAdjust, vXa);
++    }
++    else
++    {
++        vYa = _mm_add_ps(vAdjust, vYa);
++    }
++
++    // Store triangle description for rasterizer
++    _mm_store_ps((float*)&newTriBuffer[0], vXa);
++    _mm_store_ps((float*)&newTriBuffer[4], vYa);
++    _mm_store_ps((float*)&newTriBuffer[8], vZa);
++    _mm_store_ps((float*)&newTriBuffer[12], vRecipWa);
++
++    // binner bins 3 edges for lines as v0, v1, v1
++    // tri0 needs v0, v0, v1
++    for (uint32_t a = 0; a < workDesc.numAttribs; ++a)
++    {
++        __m128 vAttrib0 = _mm_load_ps(&workDesc.pAttribs[a*12 + 0]);
++        __m128 vAttrib1 = _mm_load_ps(&workDesc.pAttribs[a*12 + 4]);
++
++        _mm_store_ps((float*)&newAttribBuffer[a*12 + 0], vAttrib0);
++        _mm_store_ps((float*)&newAttribBuffer[a*12 + 4], vAttrib0);
++        _mm_store_ps((float*)&newAttribBuffer[a*12 + 8], vAttrib1);
++    }
++
++    // Store user clip distances for triangle 0
++    float newClipBuffer[3 * 8];
++    uint32_t numClipDist = _mm_popcnt_u32(state.rastState.clipDistanceMask);
++    if (numClipDist)
++    {
++        newWorkDesc.pUserClipBuffer = newClipBuffer;
++
++        float* pOldBuffer = workDesc.pUserClipBuffer;
++        float* pNewBuffer = newClipBuffer;
++        for (uint32_t i = 0; i < numClipDist; ++i)
++        {
++            // read barycentric coeffs from binner
++            float a = *(pOldBuffer++);
++            float b = *(pOldBuffer++);
++
++            // reconstruct original clip distance at vertices
++            float c0 = a + b;
++            float c1 = b;
++
++            // construct triangle barycentrics
++            *(pNewBuffer++) = c0 - c1;
++            *(pNewBuffer++) = c0 - c1;
++            *(pNewBuffer++) = c1;
++        }
++    }
++
++    // make sure this macrotile intersects the triangle
++    __m128i vXai = fpToFixedPoint(vXa);
++    __m128i vYai = fpToFixedPoint(vYa);
++    OSALIGN(BBOX, 16) bboxA;
++    calcBoundingBoxInt(vXai, vYai, bboxA);
++
++    if (!(bboxA.left > macroBoxRight ||
++          bboxA.left > state.scissorInFixedPoint.right ||
++          bboxA.right - 1 < macroBoxLeft ||
++          bboxA.right - 1 < state.scissorInFixedPoint.left ||
++          bboxA.top > macroBoxBottom ||
++          bboxA.top > state.scissorInFixedPoint.bottom ||
++          bboxA.bottom - 1 < macroBoxTop ||
++          bboxA.bottom - 1 < state.scissorInFixedPoint.top)) {
++        // rasterize triangle
++        RasterizeTriangle<SWR_MULTISAMPLE_1X>(pDC, workerId, macroTile, (void*)&newWorkDesc);
++    }
++
++    // triangle 1
++    // v0,v1 -> v1,v1,v0
++    vXa = _mm_shuffle_ps(vX, vX, _MM_SHUFFLE(1, 0, 1, 1));
++    vYa = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(1, 0, 1, 1));
++    vZa = _mm_shuffle_ps(vZ, vZ, _MM_SHUFFLE(1, 0, 1, 1));
++    vRecipWa = _mm_shuffle_ps(vRecipW, vRecipW, _MM_SHUFFLE(1, 0, 1, 1));
++
++    vAdjust = _mm_mul_ps(vLineWidth, vBloat1);
++    if (workDesc.triFlags.yMajor)
++    {
++        vXa = _mm_add_ps(vAdjust, vXa);
++    }
++    else
++    {
++        vYa = _mm_add_ps(vAdjust, vYa);
++    }
++
++    // Store triangle description for rasterizer
++    _mm_store_ps((float*)&newTriBuffer[0], vXa);
++    _mm_store_ps((float*)&newTriBuffer[4], vYa);
++    _mm_store_ps((float*)&newTriBuffer[8], vZa);
++    _mm_store_ps((float*)&newTriBuffer[12], vRecipWa);
++
++    // binner bins 3 edges for lines as v0, v1, v1
++    // tri1 needs v1, v1, v0
++    for (uint32_t a = 0; a < workDesc.numAttribs; ++a)
++    {
++        __m128 vAttrib0 = _mm_load_ps(&workDesc.pAttribs[a * 12 + 0]);
++        __m128 vAttrib1 = _mm_load_ps(&workDesc.pAttribs[a * 12 + 4]);
++
++        _mm_store_ps((float*)&newAttribBuffer[a * 12 + 0], vAttrib1);
++        _mm_store_ps((float*)&newAttribBuffer[a * 12 + 4], vAttrib1);
++        _mm_store_ps((float*)&newAttribBuffer[a * 12 + 8], vAttrib0);
++    }
++
++    // store user clip distance for triangle 1
++    if (numClipDist)
++    {
++        float* pOldBuffer = workDesc.pUserClipBuffer;
++        float* pNewBuffer = newClipBuffer;
++        for (uint32_t i = 0; i < numClipDist; ++i)
++        {
++            // read barycentric coeffs from binner
++            float a = *(pOldBuffer++);
++            float b = *(pOldBuffer++);
++
++            // reconstruct original clip distance at vertices
++            float c0 = a + b;
++            float c1 = b;
++
++            // construct triangle barycentrics
++            *(pNewBuffer++) = c1 - c0;
++            *(pNewBuffer++) = c1 - c0;
++            *(pNewBuffer++) = c0;
++        }
++    }
++
++    vXai = fpToFixedPoint(vXa);
++    vYai = fpToFixedPoint(vYa);
++    calcBoundingBoxInt(vXai, vYai, bboxA);
++
++    if (!(bboxA.left > macroBoxRight ||
++          bboxA.left > state.scissorInFixedPoint.right ||
++          bboxA.right - 1 < macroBoxLeft ||
++          bboxA.right - 1 < state.scissorInFixedPoint.left ||
++          bboxA.top > macroBoxBottom ||
++          bboxA.top > state.scissorInFixedPoint.bottom ||
++          bboxA.bottom - 1 < macroBoxTop ||
++          bboxA.bottom - 1 < state.scissorInFixedPoint.top)) {
++        // rasterize triangle
++        RasterizeTriangle<SWR_MULTISAMPLE_1X>(pDC, workerId, macroTile, (void*)&newWorkDesc);
++    }
++
++    RDTSC_STOP(BERasterizeLine, 1, 0);
++}
++
+diff --git a/src/gallium/drivers/swr/rasterizer/core/rasterizer.h b/src/gallium/drivers/swr/rasterizer/core/rasterizer.h
+new file mode 100644
+index 0000000..e07d7ea
+--- /dev/null
++++ b/src/gallium/drivers/swr/rasterizer/core/rasterizer.h
+@@ -0,0 +1,34 @@
++/****************************************************************************
++* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
++*
++* Permission is hereby granted, free of charge, to any person obtaining a
++* copy of this software and associated documentation files (the "Software"),
++* to deal in the Software without restriction, including without limitation
++* the rights to use, copy, modify, merge, publish, distribute, sublicense,
++* and/or sell copies of the Software, and to permit persons to whom the
++* Software is furnished to do so, subject to the following conditions:
++*
++* The above copyright notice and this permission notice (including the next
++* paragraph) shall be included in all copies or substantial portions of the
++* Software.
++*
++* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
++* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++* IN THE SOFTWARE.
++*
++* @file rasterizer.h
++*
++* @brief Definitions for the rasterizer.
++*
++******************************************************************************/
++#pragma once
++
++#include "context.h"
++
++void rastPoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData);
++extern PFN_WORK_FUNC gRasterizerTable[SWR_MULTISAMPLE_TYPE_MAX];
++void RasterizeLine(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData);
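For intuition about RasterizeLine above: the segment is widened by half the line width perpendicular to its major axis and emitted as two triangles. A scalar sketch under that reading (illustrative; the actual code does this with SIMD shuffles and a vertex ordering chosen to match the binner's three-edge layout):

```cpp
// Widen a line segment (v0, v1) into two triangles covering a quad of
// thickness lineWidth. For y-major lines the offset is applied in x,
// otherwise in y. Purely illustrative vertex ordering.
struct Vec2 { float x, y; };

void bloatLine(Vec2 v0, Vec2 v1, float lineWidth, bool yMajor,
               Vec2 tri0[3], Vec2 tri1[3])
{
    float h = 0.5f * lineWidth;
    Vec2 a0 = v0, a1 = v0, b0 = v1, b1 = v1;
    if (yMajor) { a0.x += h; a1.x -= h; b0.x += h; b1.x -= h; }
    else        { a0.y += h; a1.y -= h; b0.y += h; b1.y -= h; }
    // two triangles covering the widened segment
    tri0[0] = a0; tri0[1] = a1; tri0[2] = b1;
    tri1[0] = a0; tri1[1] = b1; tri1[2] = b0;
}
```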
+diff --git a/src/gallium/drivers/swr/rasterizer/core/rdtsc_core.cpp b/src/gallium/drivers/swr/rasterizer/core/rdtsc_core.cpp
+new file mode 100644
+index 0000000..df96f72
+--- /dev/null
++++ b/src/gallium/drivers/swr/rasterizer/core/rdtsc_core.cpp
+@@ -0,0 +1,90 @@
++/****************************************************************************
++* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
++*
++* Permission is hereby granted, free of charge, to any person obtaining a
++* copy of this software and associated documentation files (the "Software"),
++* to deal in the Software without restriction, including without limitation
++* the rights to use, copy, modify, merge, publish, distribute, sublicense,
++* and/or sell copies of the Software, and to permit persons to whom the
++* Software is furnished to do so, subject to the following conditions:
++*
++* The above copyright notice and this permission notice (including the next
++* paragraph) shall be included in all copies or substantial portions of the
++* Software.
++*
++* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
++* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++* IN THE SOFTWARE.
++****************************************************************************/
++
++#include "rdtsc_core.h"
++#include "common/rdtsc_buckets.h"
++
++// must match CORE_BUCKETS enum order
++BUCKET_DESC gCoreBuckets[] = {
++    { "APIClearRenderTarget", "", true, 0xff0b8bea },
++    { "APIDraw", "", true, 0xff000066 },
++    { "APIDrawWakeAllThreads", "", false, 0xffffffff },
++    { "APIDrawIndexed", "", true, 0xff000066 },
++    { "APIDispatch", "", true, 0xff660000 },
++    { "APIStoreTiles", "", true, 0xff00ffff },
++    { "APIGetDrawContext", "", false, 0xffffffff },
++    { "APISync", "", true, 0xff6666ff },
++    { "FEProcessDraw", "", true, 0xff009900 },
++    { "FEProcessDrawIndexed", "", true, 0xff009900 },
++    { "FEFetchShader", "", false, 0xffffffff },
++    { "FEVertexShader", "", false, 0xffffffff },
++    { "FEHullShader", "", false, 0xffffffff },
++    { "FETessellation", "", false, 0xffffffff },
++    { "FEDomainShader", "", false, 0xffffffff },
++    { "FEGeometryShader", "", false, 0xffffffff },
++    { "FEStreamout", "", false, 0xffffffff },
++    { "FEPAAssemble", "", false, 0xffffffff },
++    { "FEBinPoints", "", false, 0xff29b854 },
++    { "FEBinLines", "", false, 0xff29b854 },
++    { "FEBinTriangles", "", false, 0xff29b854 },
++    { "FETriangleSetup", "", false, 0xffffffff },
++    { "FEViewportCull", "", false, 0xffffffff },
++    { "FEGuardbandClip", "", false, 0xffffffff },
++    { "FEClipPoints", "", false, 0xffffffff },
++    { "FEClipLines", "", false, 0xffffffff },
++    { "FEClipTriangles", "", false, 0xffffffff },
++    { "FECullZeroAreaAndBackface", "", false, 0xffffffff },
++    { "FECullBetweenCenters", "", false, 0xffffffff },
++    { "FEProcessStoreTiles", "", true, 0xff39c864 },
++    { "FEProcessInvalidateTiles", "", true, 0xffffffff },
++    { "WorkerWorkOnFifoBE", "", false, 0xff40261c },
++    { "WorkerFoundWork", "", false, 0xff573326 },
++    { "BELoadTiles", "", true, 0xffb0e2ff },
++    { "BEDispatch", "", true, 0xff00a2ff },
++    { "BEClear", "", true, 0xff00ccbb },
++    { "BERasterizeLine", "", true, 0xffb26a4e },
++    { "BERasterizeTriangle", "", true, 0xffb26a4e },
++    { "BETriangleSetup", "", false, 0xffffffff },
++    { "BEStepSetup", "", false, 0xffffffff },
++    { "BECullZeroArea", "", false, 0xffffffff },
++    { "BEEmptyTriangle", "", false, 0xffffffff },
++    { "BETrivialAccept", "", false, 0xffffffff },
++    { "BETrivialReject", "", false, 0xffffffff },
++    { "BERasterizePartial", "", false, 0xffffffff },
++    { "BEPixelBackend", "", false, 0xffffffff },
++    { "BESetup", "", false, 0xffffffff },
++    { "BEBarycentric", "", false, 0xffffffff },
++    { "BEEarlyDepthTest", "", false, 0xffffffff },
++    { "BEPixelShader", "", false, 0xffffffff },
++    { "BELateDepthTest", "", false, 0xffffffff },
++    { "BEOutputMerger", "", false, 0xffffffff },
++    { "BEStoreTiles", "", true, 0xff00cccc },
++    { "BEEndTile", "", false, 0xffffffff },
++    { "WorkerWaitForThreadEvent", "", false, 0xffffffff },
++};
++
++/// @todo bucketmanager and mapping should probably be a part of the SWR context
++std::vector<uint32_t> gBucketMap;
++BucketManager gBucketMgr(false);
++
++uint32_t gCurrentFrame = 0;
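The buckets above pair with the RDTSC_* macros declared in rdtsc_core.h, which follows. A usage sketch (hypothetical call site; the real hook points live in the core and threading code):

```cpp
// Register the calling thread once, then bracket a stage with start/stop
// using a CORE_BUCKETS id. With KNOB_ENABLE_RDTSC undefined, the macros
// expand to nothing and the instrumentation has zero cost.
void workerProfileExample(int threadId)
{
    RDTSC_INIT(threadId);                 // thread 0 also registers all buckets
    RDTSC_START(WorkerWorkOnFifoBE);
    // ... drain back-end work queue ...
    RDTSC_STOP(WorkerWorkOnFifoBE, 1, 0); // count = 1, drawId = 0
    RDTSC_ENDFRAME();                     // advances the frame capture window
}
```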
+diff --git a/src/gallium/drivers/swr/rasterizer/core/rdtsc_core.h b/src/gallium/drivers/swr/rasterizer/core/rdtsc_core.h
+new file mode 100644
+index 0000000..1e3700d
+--- /dev/null
++++ b/src/gallium/drivers/swr/rasterizer/core/rdtsc_core.h
+@@ -0,0 +1,175 @@
++/****************************************************************************
++* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
++*
++* Permission is hereby granted, free of charge, to any person obtaining a
++* copy of this software and associated documentation files (the "Software"),
++* to deal in the Software without restriction, including without limitation
++* the rights to use, copy, modify, merge, publish, distribute, sublicense,
++* and/or sell copies of the Software, and to permit persons to whom the
++* Software is furnished to do so, subject to the following conditions:
++*
++* The above copyright notice and this permission notice (including the next
++* paragraph) shall be included in all copies or substantial portions of the
++* Software.
++*
++* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
++* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++* IN THE SOFTWARE.
++****************************************************************************/
++
++#pragma once
++#include "knobs.h"
++
++#include "common/os.h"
++#include "common/rdtsc_buckets.h"
++
++#include <vector>
++
++enum CORE_BUCKETS
++{
++    APIClearRenderTarget,
++    APIDraw,
++    APIDrawWakeAllThreads,
++    APIDrawIndexed,
++    APIDispatch,
++    APIStoreTiles,
++    APIGetDrawContext,
++    APISync,
++    FEProcessDraw,
++    FEProcessDrawIndexed,
++    FEFetchShader,
++    FEVertexShader,
++    FEHullShader,
++    FETessellation,
++    FEDomainShader,
++    FEGeometryShader,
++    FEStreamout,
++    FEPAAssemble,
++    FEBinPoints,
++    FEBinLines,
++    FEBinTriangles,
++    FETriangleSetup,
++    FEViewportCull,
++    FEGuardbandClip,
++    FEClipPoints,
++    FEClipLines,
++    FEClipTriangles,
++    FECullZeroAreaAndBackface,
++    FECullBetweenCenters,
++    FEProcessStoreTiles,
++    FEProcessInvalidateTiles,
++    WorkerWorkOnFifoBE,
++    WorkerFoundWork,
++    BELoadTiles,
++    BEDispatch,
++    BEClear,
++    BERasterizeLine,
++    BERasterizeTriangle,
++    BETriangleSetup,
++    BEStepSetup,
++    BECullZeroArea,
++    BEEmptyTriangle,
++    BETrivialAccept,
++    BETrivialReject,
++    BERasterizePartial,
++    BEPixelBackend,
++    BESetup,
++    BEBarycentric,
++    BEEarlyDepthTest,
++    BEPixelShader,
++    BELateDepthTest,
++    BEOutputMerger,
++    BEStoreTiles,
++    BEEndTile,
++    WorkerWaitForThreadEvent,
++
++    NumBuckets
++};
++
++void rdtscReset();
++void rdtscInit(int threadId);
++void rdtscStart(uint32_t bucketId);
++void rdtscStop(uint32_t bucketId, uint32_t count, uint64_t drawId);
++void rdtscEvent(uint32_t bucketId, uint32_t count1, uint32_t count2);
++void rdtscEndFrame();
++
++#ifdef KNOB_ENABLE_RDTSC
++#define RDTSC_RESET() rdtscReset()
++#define RDTSC_INIT(threadId) rdtscInit(threadId)
++#define RDTSC_START(bucket) rdtscStart(bucket)
++#define RDTSC_STOP(bucket, count, draw) rdtscStop(bucket, count, draw)
++#define RDTSC_EVENT(bucket, count1, count2) rdtscEvent(bucket, count1, count2)
++#define RDTSC_ENDFRAME() rdtscEndFrame()
++#else
++#define RDTSC_RESET()
++#define RDTSC_INIT(threadId)
++#define RDTSC_START(bucket)
++#define RDTSC_STOP(bucket, count, draw)
++#define RDTSC_EVENT(bucket, count1, count2)
++#define RDTSC_ENDFRAME()
++#endif
++
++extern std::vector<uint32_t> gBucketMap;
++extern BucketManager gBucketMgr;
++extern BUCKET_DESC gCoreBuckets[];
++extern uint32_t gCurrentFrame;
++
++INLINE void rdtscReset()
++{
++    gCurrentFrame = 0;
++    gBucketMgr.ClearThreads();
++    gBucketMgr.ClearBuckets();
++}
++
++INLINE void rdtscInit(int threadId)
++{
++    // register all the buckets once
++    if (threadId == 0)
++    {
++        gBucketMap.resize(NumBuckets);
++        for (uint32_t i = 0; i < NumBuckets; ++i)
++        {
++            gBucketMap[i] = gBucketMgr.RegisterBucket(gCoreBuckets[i]);
++        }
++    }
++
++    std::string name = threadId == 0 ? "API" : "WORKER";
++    gBucketMgr.RegisterThread(name);
++}
++
++INLINE void rdtscStart(uint32_t bucketId)
++{
++    uint32_t id = gBucketMap[bucketId];
++    gBucketMgr.StartBucket(id);
++}
++
++INLINE void rdtscStop(uint32_t bucketId, uint32_t count, uint64_t drawId)
++{
++    uint32_t id = gBucketMap[bucketId];
++    gBucketMgr.StopBucket(id);
++}
++
++INLINE void rdtscEvent(uint32_t bucketId, uint32_t count1, uint32_t count2)
++{
++
++}
++
++INLINE void rdtscEndFrame()
++{
++    gCurrentFrame++;
++
++    if (gCurrentFrame == KNOB_BUCKETS_START_FRAME)
++    {
++        gBucketMgr.StartCapture();
++    }
++
++    if (gCurrentFrame == KNOB_BUCKETS_END_FRAME)
++    {
++        gBucketMgr.StopCapture();
++        gBucketMgr.PrintReport("rdtsc.txt");
++    }
++}
+diff --git a/src/gallium/drivers/swr/rasterizer/core/state.h b/src/gallium/drivers/swr/rasterizer/core/state.h
+new file mode 100644
+index 0000000..ad8b91fc
+--- /dev/null
++++ b/src/gallium/drivers/swr/rasterizer/core/state.h
+@@ -0,0 +1,918 @@
++/****************************************************************************
++* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
++*
++* Permission is hereby granted, free of charge, to any person obtaining a
++* copy of this software and associated documentation files (the "Software"),
++* to deal in the Software without restriction, including without limitation
++* the rights to use, copy, modify, merge, publish, distribute, sublicense,
++* and/or sell copies of the Software, and to permit persons to whom the
++* Software is furnished to do so, subject to the following conditions:
++*
++* The above copyright notice and this permission notice (including the next
++* paragraph) shall be included in all copies or substantial portions of the
++* Software.
++*
++* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
++* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++* IN THE SOFTWARE.
++*
++* @file state.h
++*
++* @brief Definitions for API state.
++*
++******************************************************************************/
++#pragma once
++
++#include "common/formats.h"
++#include "common/simdintrin.h"
++
++// clear flags
++#define SWR_CLEAR_NONE 0
++#define SWR_CLEAR_COLOR (1 << 0)
++#define SWR_CLEAR_DEPTH (1 << 1)
++#define SWR_CLEAR_STENCIL (1 << 2)
++
++enum DRIVER_TYPE
++{
++    DX,
++    GL
++};
++
++//////////////////////////////////////////////////////////////////////////
++/// PRIMITIVE_TOPOLOGY.
++////////////////////////////////////////////////////////////////////////// ++enum PRIMITIVE_TOPOLOGY ++{ ++ TOP_UNKNOWN = 0x0, ++ TOP_POINT_LIST = 0x1, ++ TOP_LINE_LIST = 0x2, ++ TOP_LINE_STRIP = 0x3, ++ TOP_TRIANGLE_LIST = 0x4, ++ TOP_TRIANGLE_STRIP = 0x5, ++ TOP_TRIANGLE_FAN = 0x6, ++ TOP_QUAD_LIST = 0x7, ++ TOP_QUAD_STRIP = 0x8, ++ TOP_LINE_LIST_ADJ = 0x9, ++ TOP_LISTSTRIP_ADJ = 0xA, ++ TOP_TRI_LIST_ADJ = 0xB, ++ TOP_TRI_STRIP_ADJ = 0xC, ++ TOP_TRI_STRIP_REVERSE = 0xD, ++ TOP_POLYGON = 0xE, ++ TOP_RECT_LIST = 0xF, ++ TOP_LINE_LOOP = 0x10, ++ TOP_POINT_LIST_BF = 0x11, ++ TOP_LINE_STRIP_CONT = 0x12, ++ TOP_LINE_STRIP_BF = 0x13, ++ TOP_LINE_STRIP_CONT_BF = 0x14, ++ TOP_TRIANGLE_FAN_NOSTIPPLE = 0x16, ++ TOP_TRIANGLE_DISC = 0x17, /// @todo What is this?? ++ ++ TOP_PATCHLIST_BASE = 0x1F, // Invalid topology, used to calculate num verts for a patchlist. ++ TOP_PATCHLIST_1 = 0x20, // List of 1-vertex patches ++ TOP_PATCHLIST_2 = 0x21, ++ TOP_PATCHLIST_3 = 0x22, ++ TOP_PATCHLIST_4 = 0x23, ++ TOP_PATCHLIST_5 = 0x24, ++ TOP_PATCHLIST_6 = 0x25, ++ TOP_PATCHLIST_7 = 0x26, ++ TOP_PATCHLIST_8 = 0x27, ++ TOP_PATCHLIST_9 = 0x28, ++ TOP_PATCHLIST_10 = 0x29, ++ TOP_PATCHLIST_11 = 0x2A, ++ TOP_PATCHLIST_12 = 0x2B, ++ TOP_PATCHLIST_13 = 0x2C, ++ TOP_PATCHLIST_14 = 0x2D, ++ TOP_PATCHLIST_15 = 0x2E, ++ TOP_PATCHLIST_16 = 0x2F, ++ TOP_PATCHLIST_17 = 0x30, ++ TOP_PATCHLIST_18 = 0x31, ++ TOP_PATCHLIST_19 = 0x32, ++ TOP_PATCHLIST_20 = 0x33, ++ TOP_PATCHLIST_21 = 0x34, ++ TOP_PATCHLIST_22 = 0x35, ++ TOP_PATCHLIST_23 = 0x36, ++ TOP_PATCHLIST_24 = 0x37, ++ TOP_PATCHLIST_25 = 0x38, ++ TOP_PATCHLIST_26 = 0x39, ++ TOP_PATCHLIST_27 = 0x3A, ++ TOP_PATCHLIST_28 = 0x3B, ++ TOP_PATCHLIST_29 = 0x3C, ++ TOP_PATCHLIST_30 = 0x3D, ++ TOP_PATCHLIST_31 = 0x3E, ++ TOP_PATCHLIST_32 = 0x3F, // List of 32-vertex patches ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// SWR_SHADER_TYPE ++////////////////////////////////////////////////////////////////////////// ++enum SWR_SHADER_TYPE ++{ ++ SHADER_VERTEX, ++ SHADER_GEOMETRY, ++ SHADER_DOMAIN, ++ SHADER_HULL, ++ SHADER_PIXEL, ++ SHADER_COMPUTE, ++ ++ NUM_SHADER_TYPES, ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// SWR_RENDERTARGET_ATTACHMENT ++/// @todo It's not clear what an "attachment" means. It's not a common term.
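The patch-list block above is laid out arithmetically: TOP_PATCHLIST_1 is TOP_PATCHLIST_BASE + 1, so the control-point count falls out by subtraction. A small sketch (the helper name is ours, not from the patch):

static inline uint32_t NumVertsPerPatch(PRIMITIVE_TOPOLOGY topology)
{
    // Only meaningful for TOP_PATCHLIST_1 (0x20) through TOP_PATCHLIST_32 (0x3F).
    return (uint32_t)topology - (uint32_t)TOP_PATCHLIST_BASE;
}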
++////////////////////////////////////////////////////////////////////////// ++enum SWR_RENDERTARGET_ATTACHMENT ++{ ++ SWR_ATTACHMENT_COLOR0, ++ SWR_ATTACHMENT_COLOR1, ++ SWR_ATTACHMENT_COLOR2, ++ SWR_ATTACHMENT_COLOR3, ++ SWR_ATTACHMENT_COLOR4, ++ SWR_ATTACHMENT_COLOR5, ++ SWR_ATTACHMENT_COLOR6, ++ SWR_ATTACHMENT_COLOR7, ++ SWR_ATTACHMENT_DEPTH, ++ SWR_ATTACHMENT_STENCIL, ++ ++ SWR_NUM_ATTACHMENTS ++}; ++ ++#define SWR_NUM_RENDERTARGETS 8 ++ ++#define SWR_ATTACHMENT_COLOR0_BIT 0x001 ++#define SWR_ATTACHMENT_COLOR1_BIT 0x002 ++#define SWR_ATTACHMENT_COLOR2_BIT 0x004 ++#define SWR_ATTACHMENT_COLOR3_BIT 0x008 ++#define SWR_ATTACHMENT_COLOR4_BIT 0x010 ++#define SWR_ATTACHMENT_COLOR5_BIT 0x020 ++#define SWR_ATTACHMENT_COLOR6_BIT 0x040 ++#define SWR_ATTACHMENT_COLOR7_BIT 0x080 ++#define SWR_ATTACHMENT_DEPTH_BIT 0x100 ++#define SWR_ATTACHMENT_STENCIL_BIT 0x200 ++#define SWR_ATTACHMENT_MASK_ALL 0x3ff ++#define SWR_ATTACHMENT_MASK_COLOR 0x0ff ++ ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief SWR Inner Tessellation factor ID ++/// See above GetTessFactorOutputPosition code for documentation ++enum SWR_INNER_TESSFACTOR_ID ++{ ++ SWR_QUAD_U_TRI_INSIDE, ++ SWR_QUAD_V_INSIDE, ++ ++ SWR_NUM_INNER_TESS_FACTORS, ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief SWR Outer Tessellation factor ID ++/// See above GetTessFactorOutputPosition code for documentation ++enum SWR_OUTER_TESSFACTOR_ID ++{ ++ SWR_QUAD_U_EQ0_TRI_U_LINE_DETAIL, ++ SWR_QUAD_V_EQ0_TRI_V_LINE_DENSITY, ++ SWR_QUAD_U_EQ1_TRI_W, ++ SWR_QUAD_V_EQ1, ++ ++ SWR_NUM_OUTER_TESS_FACTORS, ++}; ++ ++ ++///////////////////////////////////////////////////////////////////////// ++/// simdvertex ++/// @brief Defines a vertex element that holds all the data for SIMD vertices. 
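One property of the attachment definitions above worth recording: each *_BIT define is 1 shifted left by the matching enum value, so attachment masks can be built either from the defines or from the enum. A sketch (ours):

// Holds by construction of the two lists above.
static_assert(SWR_ATTACHMENT_COLOR3_BIT == (1u << SWR_ATTACHMENT_COLOR3), "enum/bit mismatch");
static_assert(SWR_ATTACHMENT_DEPTH_BIT == (1u << SWR_ATTACHMENT_DEPTH), "enum/bit mismatch");
// e.g. a clear touching color 0 and depth:
// uint32_t mask = SWR_ATTACHMENT_COLOR0_BIT | SWR_ATTACHMENT_DEPTH_BIT;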
++/// Contains position in clip space, hardcoded to attribute 0, ++/// space for up to 32 attributes, as well as any SGV values generated ++/// by the pipeline (to be implemented) ++///////////////////////////////////////////////////////////////////////// ++#define VERTEX_POSITION_SLOT 0 ++#define VERTEX_ATTRIB_START_SLOT 1 ++#define VERTEX_ATTRIB_END_SLOT 32 ++#define VERTEX_RTAI_SLOT 33 // GS will write RenderTargetArrayIndex here ++#define VERTEX_PRIMID_SLOT 34 // GS will write PrimId here ++#define VERTEX_CLIPCULL_DIST_LO_SLOT 35 // VS will write lower 4 clip/cull dist ++#define VERTEX_CLIPCULL_DIST_HI_SLOT 36 // VS will write upper 4 clip/cull dist ++static_assert(VERTEX_CLIPCULL_DIST_HI_SLOT < KNOB_NUM_ATTRIBUTES, "Mismatched attribute slot size"); ++ ++// SoAoSoA ++struct simdvertex ++{ ++ simdvector attrib[KNOB_NUM_ATTRIBUTES]; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// SWR_VS_CONTEXT ++/// @brief Input to vertex shader ++///////////////////////////////////////////////////////////////////////// ++struct SWR_VS_CONTEXT ++{ ++ simdvertex* pVin; // IN: SIMD input vertex data store ++ simdvertex* pVout; // OUT: SIMD output vertex data store ++ ++ uint32_t InstanceID; // IN: Instance ID, constant across all verts of the SIMD ++ simdscalari VertexID; // IN: Vertex ID ++ simdscalari mask; // IN: Active mask for shader ++}; ++ ++///////////////////////////////////////////////////////////////////////// ++/// ScalarCPoint ++/// @brief defines a control point element as passed from the output ++/// of the hull shader to the input of the domain shader ++///////////////////////////////////////////////////////////////////////// ++struct ScalarAttrib ++{ ++ float x; ++ float y; ++ float z; ++ float w; ++}; ++ ++struct ScalarCPoint ++{ ++ ScalarAttrib attrib[KNOB_NUM_ATTRIBUTES]; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// SWR_TESSELLATION_FACTORS ++/// @brief Tessellation factors structure (non-vector) ++///////////////////////////////////////////////////////////////////////// ++struct SWR_TESSELLATION_FACTORS ++{ ++ float OuterTessFactors[SWR_NUM_OUTER_TESS_FACTORS]; ++ float InnerTessFactors[SWR_NUM_INNER_TESS_FACTORS]; ++}; ++ ++#define MAX_NUM_VERTS_PER_PRIM 32 // support up to 32 control point patches ++struct ScalarPatch ++{ ++ SWR_TESSELLATION_FACTORS tessFactors; ++ ScalarCPoint cp[MAX_NUM_VERTS_PER_PRIM]; ++ ScalarCPoint patchData; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// SWR_HS_CONTEXT ++/// @brief Input to hull shader ++///////////////////////////////////////////////////////////////////////// ++struct SWR_HS_CONTEXT ++{ ++ simdvertex vert[MAX_NUM_VERTS_PER_PRIM]; // IN: (SIMD) input primitive data ++ simdscalari PrimitiveID; // IN: (SIMD) primitive ID generated from the draw call ++ ScalarPatch* pCPout; // OUT: Output control point patch ++ // SIMD-sized-array of SCALAR patches ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// SWR_DS_CONTEXT ++/// @brief Input to domain shader ++///////////////////////////////////////////////////////////////////////// ++struct SWR_DS_CONTEXT ++{ ++ uint32_t PrimitiveID; // IN: (SCALAR) PrimitiveID for the patch associated with the DS invocation ++ uint32_t vectorOffset; // IN: (SCALAR) vector index offset into SIMD data. 
++ uint32_t vectorStride; // IN: (SCALAR) stride (in vectors) of output data per attribute-component ++ ScalarPatch* pCpIn; // IN: (SCALAR) Control patch ++ simdscalar* pDomainU; // IN: (SIMD) Domain Point U coords ++ simdscalar* pDomainV; // IN: (SIMD) Domain Point V coords ++ simdscalar* pOutputData; // OUT: (SIMD) Vertex Attributes (2D array of vectors, one row per attribute-component) ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// SWR_GS_CONTEXT ++/// @brief Input to geometry shader. ++///////////////////////////////////////////////////////////////////////// ++struct SWR_GS_CONTEXT ++{ ++ simdvertex vert[MAX_NUM_VERTS_PER_PRIM]; // IN: input primitive data for SIMD prims ++ simdscalari PrimitiveID; // IN: input primitive ID generated from the draw call ++ uint32_t InstanceID; // IN: input instance ID ++ uint8_t* pStream[4]; // OUT: output streams ++ uint8_t* pCutBuffer; // OUT: cut buffer ++ simdscalari vertexCount; // OUT: num vertices emitted per SIMD lane ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// SWR_PS_CONTEXT ++/// @brief Input to pixel shader. ++///////////////////////////////////////////////////////////////////////// ++struct SWR_PS_CONTEXT ++{ ++ simdscalar vX; // IN: x location of pixels ++ simdscalar vY; // IN: y location of pixels ++ simdscalar vZ; // INOUT: z location of pixels ++ simdscalari mask; // INOUT: mask for kill ++ ++ // rasterizer generated barycentric components ++ simdscalar vI; // IN: Barycentric I component ++ simdscalar vJ; // IN: Barycentric J component ++ simdscalar vOneOverW; // IN: 1/w ++ ++ const float* pAttribs; // IN: pointer to attribute barycentric coefficients ++ const float* pPerspAttribs; // IN: pointer to attribute/w barycentric coefficients ++ const float *I; // IN: Barycentric A, B, and C coefs used to compute I ++ const float *J; // IN: Barycentric A, B, and C coefs used to compute J ++ float recipDet; // IN: 1/Det, used when barycentric interpolating attributes ++ const float* pSamplePos; // IN: array of sample positions ++ simdvector shaded[SWR_NUM_RENDERTARGETS]; // OUT: result color per rendertarget ++ ++ uint32_t frontFace; // IN: front- 1, back- 0 ++ uint32_t primID; // IN: primitive ID ++ uint32_t sampleIndex; // IN: sampleIndex ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// SWR_CS_CONTEXT ++/// @brief Input to compute shader. ++///////////////////////////////////////////////////////////////////////// ++struct SWR_CS_CONTEXT ++{ ++ // The ThreadGroupId is the current thread group index relative ++ // to all thread groups in the Dispatch call. The ThreadId, ThreadIdInGroup, ++ // and ThreadIdInGroupFlattened can be derived from ThreadGroupId in the shader. ++ ++ // Compute shader accepts the following system values. ++ // o ThreadId - Current thread id relative to all other threads in dispatch. ++ // o ThreadGroupId - Current thread group id relative to all other groups in dispatch. ++ // o ThreadIdInGroup - Current thread relative to all threads in the current thread group. ++ // o ThreadIdInGroupFlattened - Flattened linear id derived from ThreadIdInGroup. ++ // ++ // All of these system values can be computed in the shader. They will be ++ // derived from the current tile counter. The tile counter is an atomic counter that ++ // resides in the draw context and is initialized to the product of the dispatch dims. 
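A hedged reading of the SWR_PS_CONTEXT barycentric fields above (our interpretation; the patch does not spell the formula out): I[] and J[] hold the A, B, C plane coefficients, and recipDet normalizes the evaluation, roughly:

static inline float EvalBarycentricI(const SWR_PS_CONTEXT& ctx, float x, float y)
{
    // Assumption: i = (A*x + B*y + C) * (1/det); scalar form for one pixel.
    return (ctx.I[0] * x + ctx.I[1] * y + ctx.I[2]) * ctx.recipDet;
}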
++ // ++ // tileCounter = dispatchDims.x * dispatchDims.y * dispatchDims.z ++ // ++ // Each CPU worker thread will atomically decrement this counter and passes the current ++ // count into the shader. When the count reaches 0 then all thread groups in the ++ // dispatch call have been completed. ++ ++ uint32_t tileCounter; // The tile counter value for this thread group. ++ ++ // Dispatch dimensions used by shader to compute system values from the tile counter. ++ uint32_t dispatchDims[3]; ++ ++ uint8_t* pTGSM; // Thread Group Shared Memory pointer. ++}; ++ ++// enums ++enum SWR_TILE_MODE ++{ ++ SWR_TILE_NONE = 0x0, // Linear mode (no tiling) ++ SWR_TILE_MODE_WMAJOR, // W major tiling ++ SWR_TILE_MODE_XMAJOR, // X major tiling ++ SWR_TILE_MODE_YMAJOR, // Y major tiling ++ SWR_TILE_SWRZ, // SWR-Z tiling ++ ++ SWR_TILE_MODE_COUNT ++}; ++ ++enum SWR_SURFACE_TYPE ++{ ++ SURFACE_1D = 0, ++ SURFACE_2D = 1, ++ SURFACE_3D = 2, ++ SURFACE_CUBE = 3, ++ SURFACE_BUFFER = 4, ++ SURFACE_STRUCTURED_BUFFER = 5, ++ SURFACE_NULL = 7 ++}; ++ ++enum SWR_ZFUNCTION ++{ ++ ZFUNC_ALWAYS, ++ ZFUNC_NEVER, ++ ZFUNC_LT, ++ ZFUNC_EQ, ++ ZFUNC_LE, ++ ZFUNC_GT, ++ ZFUNC_NE, ++ ZFUNC_GE, ++ NUM_ZFUNC ++}; ++ ++enum SWR_STENCILOP ++{ ++ STENCILOP_KEEP, ++ STENCILOP_ZERO, ++ STENCILOP_REPLACE, ++ STENCILOP_INCRSAT, ++ STENCILOP_DECRSAT, ++ STENCILOP_INCR, ++ STENCILOP_DECR, ++ STENCILOP_INVERT ++}; ++ ++enum SWR_BLEND_FACTOR ++{ ++ BLENDFACTOR_ONE, ++ BLENDFACTOR_SRC_COLOR, ++ BLENDFACTOR_SRC_ALPHA, ++ BLENDFACTOR_DST_ALPHA, ++ BLENDFACTOR_DST_COLOR, ++ BLENDFACTOR_SRC_ALPHA_SATURATE, ++ BLENDFACTOR_CONST_COLOR, ++ BLENDFACTOR_CONST_ALPHA, ++ BLENDFACTOR_SRC1_COLOR, ++ BLENDFACTOR_SRC1_ALPHA, ++ BLENDFACTOR_ZERO, ++ BLENDFACTOR_INV_SRC_COLOR, ++ BLENDFACTOR_INV_SRC_ALPHA, ++ BLENDFACTOR_INV_DST_ALPHA, ++ BLENDFACTOR_INV_DST_COLOR, ++ BLENDFACTOR_INV_CONST_COLOR, ++ BLENDFACTOR_INV_CONST_ALPHA, ++ BLENDFACTOR_INV_SRC1_COLOR, ++ BLENDFACTOR_INV_SRC1_ALPHA ++}; ++ ++enum SWR_BLEND_OP ++{ ++ BLENDOP_ADD, ++ BLENDOP_SUBTRACT, ++ BLENDOP_REVSUBTRACT, ++ BLENDOP_MIN, ++ BLENDOP_MAX, ++}; ++ ++struct SWR_SURFACE_STATE ++{ ++ uint8_t *pBaseAddress; ++ SWR_SURFACE_TYPE type; // @llvm_enum ++ SWR_FORMAT format; // @llvm_enum ++ uint32_t width; ++ uint32_t height; ++ uint32_t depth; ++ uint32_t numSamples; ++ uint32_t pitch; ++ uint32_t qpitch; ++ uint32_t minLod; // for sampled surfaces, the most detailed LOD that can be accessed by sampler ++ uint32_t maxLod; // for sampled surfaces, the max LOD that can be accessed ++ float resourceMinLod; // for sampled surfaces, the most detailed fractional mip that can be accessed by sampler ++ uint32_t lod; // for render targets, the lod being rendered to ++ uint32_t arrayIndex; // for render targets, the array index being rendered to for arrayed surfaces ++ SWR_TILE_MODE tileMode; // @llvm_enum ++ uint32_t halign; ++ uint32_t valign; ++ ++ uint32_t lodOffsets[2][15]; // lod offsets for sampled surfaces ++ ++ uint8_t *pAuxBaseAddress; // Used for compression, append/consume counter, etc. ++}; ++ ++// vertex fetch state ++// WARNING- any changes to this struct need to be reflected ++// in the fetch shader jit ++struct SWR_VERTEX_BUFFER_STATE ++{ ++ uint32_t index; ++ uint32_t pitch; ++ const uint8_t *pData; ++ uint32_t size; ++ uint32_t numaNode; ++ uint32_t maxVertex; // size / pitch. precalculated value used by fetch shader for OOB checks ++ uint32_t partialInboundsSize; // size % pitch. 
precalculated value used by fetch shader for partially OOB vertices ++}; ++ ++struct SWR_INDEX_BUFFER_STATE ++{ ++ // Format type for indices (e.g. UINT16, UINT32, etc.) ++ SWR_FORMAT format; // @llvm_enum ++ const void *pIndices; ++ uint32_t size; ++}; ++ ++ ++////////////////////////////////////////////////////////////////////////// ++/// SWR_FETCH_CONTEXT ++/// @brief Input to fetch shader. ++/// @note WARNING - Changes to this struct need to be reflected in the ++/// fetch shader jit. ++///////////////////////////////////////////////////////////////////////// ++struct SWR_FETCH_CONTEXT ++{ ++ const SWR_VERTEX_BUFFER_STATE* pStreams; // IN: array of bound vertex buffers ++ const int32_t* pIndices; // IN: pointer to index buffer for indexed draws ++ const int32_t* pLastIndex; // IN: pointer to end of index buffer, used for bounds checking ++ uint32_t CurInstance; // IN: current instance ++ uint32_t BaseVertex; // IN: base vertex ++ uint32_t StartVertex; // IN: start vertex ++ uint32_t StartInstance; // IN: start instance ++ simdscalari VertexID; // OUT: vector of vertex IDs ++ simdscalari CutMask; // OUT: vector mask of indices which have the cut index value ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// SWR_STATS ++/// ++/// @brief All statistics generated by SWR go here. These are public ++/// to driver. ++///////////////////////////////////////////////////////////////////////// ++struct SWR_STATS ++{ ++ // Occlusion Query ++ uint64_t DepthPassCount; // Number of passing depth tests. Not exact. ++ ++ // Pipeline Stats ++ uint64_t IaVertices; // Number of Fetch Shader vertices ++ uint64_t IaPrimitives; // Number of PA primitives. ++ uint64_t VsInvocations; // Number of Vertex Shader invocations ++ uint64_t HsInvocations; // Number of Hull Shader invocations ++ uint64_t DsInvocations; // Number of Domain Shader invocations ++ uint64_t GsInvocations; // Number of Geometry Shader invocations ++ uint64_t PsInvocations; // Number of Pixel Shader invocations ++ uint64_t CsInvocations; // Number of Compute Shader invocations ++ uint64_t CInvocations; // Number of clipper invocations ++ uint64_t CPrimitives; // Number of clipper primitives. ++ uint64_t GsPrimitives; // Number of prims GS outputs. ++ ++ // Streamout Stats ++ uint32_t SoWriteOffset[4]; ++ uint64_t SoPrimStorageNeeded[4]; ++ uint64_t SoNumPrimsWritten[4]; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// STREAMOUT_BUFFERS ++///////////////////////////////////////////////////////////////////////// ++ ++#define MAX_SO_STREAMS 4 ++#define MAX_ATTRIBUTES 32 ++ ++struct SWR_STREAMOUT_BUFFER ++{ ++ bool enable; ++ ++ // Pointers to streamout buffers. ++ uint32_t* pBuffer; ++ ++ // Size of buffer in dwords. ++ uint32_t bufferSize; ++ ++ // Vertex pitch of buffer in dwords. ++ uint32_t pitch; ++ ++ // Offset into buffer in dwords. SOS will increment this offset. ++ uint32_t streamOffset; ++ ++ // Offset to the SO write offset. If not null then we update offset here. ++ uint32_t* pWriteOffset; ++ ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// STREAMOUT_STATE ++///////////////////////////////////////////////////////////////////////// ++struct SWR_STREAMOUT_STATE ++{ ++ // This disables stream output. ++ bool soEnable; ++ ++ // which streams are enabled for streamout ++ bool streamEnable[MAX_SO_STREAMS]; ++ ++ // If set then do not send any streams to the rasterizer. 
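Circling back to SWR_CS_CONTEXT: given the fields shown earlier, the system values can be reconstructed from the counter. A sketch of the derivation (ours, assuming the counter counts down from the product of the dispatch dimensions):

static inline void GroupIdFromTileCounter(const SWR_CS_CONTEXT& ctx, uint32_t id[3])
{
    uint32_t total  = ctx.dispatchDims[0] * ctx.dispatchDims[1] * ctx.dispatchDims[2];
    uint32_t linear = total - ctx.tileCounter;   // 0-based flattened group id
    id[0] = linear % ctx.dispatchDims[0];
    id[1] = (linear / ctx.dispatchDims[0]) % ctx.dispatchDims[1];
    id[2] = linear / (ctx.dispatchDims[0] * ctx.dispatchDims[1]);
}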
++ bool rasterizerDisable; ++ ++ // Specifies which stream to send to the rasterizer. ++ uint32_t streamToRasterizer; ++ ++ // The stream masks specify which attributes are sent to which streams. ++ // These masks help the FE to set up the pPrimData buffer that is passed ++ // to the Stream Output Shader (SOS) function. ++ uint32_t streamMasks[MAX_SO_STREAMS]; ++ ++ // Number of attributes, including position, per vertex that are streamed out. ++ // This should match number of bits in stream mask. ++ uint32_t streamNumEntries[MAX_SO_STREAMS]; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// STREAMOUT_CONTEXT - Passed to SOS ++///////////////////////////////////////////////////////////////////////// ++struct SWR_STREAMOUT_CONTEXT ++{ ++ uint32_t* pPrimData; ++ SWR_STREAMOUT_BUFFER* pBuffer[MAX_SO_STREAMS]; ++ ++ // Num prims written for this stream ++ uint32_t numPrimsWritten; ++ ++ // Num prims that should have been written if there were no overflow. ++ uint32_t numPrimStorageNeeded; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// SWR_GS_STATE - Geometry shader state ++///////////////////////////////////////////////////////////////////////// ++struct SWR_GS_STATE ++{ ++ bool gsEnable; ++ ++ // number of input attributes per vertex. used by the frontend to ++ // optimize assembling primitives for GS ++ uint32_t numInputAttribs; ++ ++ // output topology - can be point, tristrip, or linestrip ++ PRIMITIVE_TOPOLOGY outputTopology; // @llvm_enum ++ ++ // maximum number of verts that can be emitted by a single instance of the GS ++ uint32_t maxNumVerts; ++ ++ // instance count ++ uint32_t instanceCount; ++ ++ // geometry shader emits renderTargetArrayIndex ++ bool emitsRenderTargetArrayIndex; ++ ++ // geometry shader emits PrimitiveID ++ bool emitsPrimitiveID; ++}; ++ ++ ++////////////////////////////////////////////////////////////////////////// ++/// SWR_TS_OUTPUT_TOPOLOGY - Defines data output by the tessellator / DS ++///////////////////////////////////////////////////////////////////////// ++enum SWR_TS_OUTPUT_TOPOLOGY ++{ ++ SWR_TS_OUTPUT_POINT, ++ SWR_TS_OUTPUT_LINE, ++ SWR_TS_OUTPUT_TRI_CW, ++ SWR_TS_OUTPUT_TRI_CCW, ++ ++ SWR_TS_OUTPUT_TOPOLOGY_COUNT ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// SWR_TS_PARTITIONING - Defines tessellation algorithm ++///////////////////////////////////////////////////////////////////////// ++enum SWR_TS_PARTITIONING ++{ ++ SWR_TS_INTEGER, ++ SWR_TS_ODD_FRACTIONAL, ++ SWR_TS_EVEN_FRACTIONAL, ++ ++ SWR_TS_PARTITIONING_COUNT ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// SWR_TS_DOMAIN - Defines Tessellation Domain ++///////////////////////////////////////////////////////////////////////// ++enum SWR_TS_DOMAIN ++{ ++ SWR_TS_QUAD, ++ SWR_TS_TRI, ++ SWR_TS_ISOLINE, ++ ++ SWR_TS_DOMAIN_COUNT ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// SWR_TS_STATE - Tessellation state ++///////////////////////////////////////////////////////////////////////// ++struct SWR_TS_STATE ++{ ++ bool tsEnable; ++ SWR_TS_OUTPUT_TOPOLOGY tsOutputTopology; // @llvm_enum ++ SWR_TS_PARTITIONING partitioning; // @llvm_enum ++ SWR_TS_DOMAIN domain; // @llvm_enum ++ ++ PRIMITIVE_TOPOLOGY postDSTopology; // @llvm_enum ++ ++ uint32_t numHsInputAttribs; ++ uint32_t numHsOutputAttribs; ++ uint32_t numDsOutputAttribs; ++}; ++ ++// output merger state ++struct
SWR_RENDER_TARGET_BLEND_STATE ++{ ++ uint32_t colorBlendEnable : 1; ++ uint32_t sourceAlphaBlendFactor : 5; ++ uint32_t destAlphaBlendFactor : 5; ++ uint32_t sourceBlendFactor : 5; ++ uint32_t destBlendFactor : 5; ++ uint32_t colorBlendFunc : 3; ++ uint32_t alphaBlendFunc : 3; ++ ++ uint32_t writeDisableRed : 1; ++ uint32_t writeDisableGreen : 1; ++ uint32_t writeDisableBlue : 1; ++ uint32_t writeDisableAlpha : 1; ++}; ++static_assert(sizeof(SWR_RENDER_TARGET_BLEND_STATE) == 4, "Invalid SWR_RENDER_TARGET_BLEND_STATE size"); ++ ++struct SWR_BLEND_STATE ++{ ++ float constantColor[4]; // constant blend factor color in RGBA float ++ bool independentAlphaBlendEnable; ++ SWR_RENDER_TARGET_BLEND_STATE renderTarget[SWR_NUM_RENDERTARGETS]; ++}; ++static_assert(sizeof(SWR_BLEND_STATE) == 52, "Invalid SWR_BLEND_STATE size"); ++ ++////////////////////////////////////////////////////////////////////////// ++/// FUNCTION POINTERS FOR SHADERS ++ ++typedef void(__cdecl *PFN_FETCH_FUNC)(SWR_FETCH_CONTEXT& fetchInfo, simdvertex& out); ++typedef void(__cdecl *PFN_VERTEX_FUNC)(HANDLE hPrivateData, SWR_VS_CONTEXT* pVsContext); ++typedef void(__cdecl *PFN_HS_FUNC)(HANDLE hPrivateData, SWR_HS_CONTEXT* pHsContext); ++typedef void(__cdecl *PFN_DS_FUNC)(HANDLE hPrivateData, SWR_DS_CONTEXT* pDsContext); ++typedef void(__cdecl *PFN_GS_FUNC)(HANDLE hPrivateData, SWR_GS_CONTEXT* pGsContext); ++typedef void(__cdecl *PFN_CS_FUNC)(HANDLE hPrivateData, SWR_CS_CONTEXT* pCsContext); ++typedef void(__cdecl *PFN_SO_FUNC)(SWR_STREAMOUT_CONTEXT& soContext); ++typedef void(__cdecl *PFN_PIXEL_KERNEL)(HANDLE hPrivateData, SWR_PS_CONTEXT *pContext); ++typedef void(__cdecl *PFN_BLEND_JIT_FUNC)(const SWR_BLEND_STATE*, simdvector&, simdvector&, BYTE*, simdvector&); ++ ++////////////////////////////////////////////////////////////////////////// ++/// FRONTEND_STATE ++///////////////////////////////////////////////////////////////////////// ++struct SWR_FRONTEND_STATE ++{ ++ // skip clip test, perspective divide, and viewport transform ++ // intended for verts in screen space ++ bool vpTransformDisable; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// VIEWPORT_MATRIX ++///////////////////////////////////////////////////////////////////////// ++struct SWR_VIEWPORT_MATRIX ++{ ++ float m00; ++ float m11; ++ float m22; ++ float m30; ++ float m31; ++ float m32; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// SWR_VIEWPORT ++///////////////////////////////////////////////////////////////////////// ++struct SWR_VIEWPORT ++{ ++ float x; ++ float y; ++ float width; ++ float height; ++ float minZ; ++ float maxZ; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// SWR_CULLMODE ++////////////////////////////////////////////////////////////////////////// ++enum SWR_CULLMODE ++{ ++ SWR_CULLMODE_BOTH, ++ SWR_CULLMODE_NONE, ++ SWR_CULLMODE_FRONT, ++ SWR_CULLMODE_BACK ++}; ++ ++enum SWR_FILLMODE ++{ ++ SWR_FILLMODE_POINT, ++ SWR_FILLMODE_WIREFRAME, ++ SWR_FILLMODE_SOLID ++}; ++ ++enum SWR_FRONTWINDING ++{ ++ SWR_FRONTWINDING_CW, ++ SWR_FRONTWINDING_CCW ++}; ++ ++#define SWR_MAX_NUM_MULTISAMPLES 16 ++enum SWR_MULTISAMPLE_COUNT ++{ ++ SWR_MULTISAMPLE_1X, ++ SWR_MULTISAMPLE_2X, ++ SWR_MULTISAMPLE_4X, ++ SWR_MULTISAMPLE_8X, ++ SWR_MULTISAMPLE_16X, ++ SWR_MULTISAMPLE_TYPE_MAX ++}; ++ ++enum SWR_PIXEL_LOCATION ++{ ++ SWR_PIXEL_LOCATION_CENTER, ++ SWR_PIXEL_LOCATION_UL, ++}; ++ ++// fixed point screen space sample locations within a pixel 
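Note that SWR_MULTISAMPLE_COUNT is ordinal rather than a raw sample count; we assume the GetNumSamples() helper used later by the backend maps it as a power of two, along these lines (a sketch, not the patch's implementation):

static inline uint32_t GetNumSamplesSketch(SWR_MULTISAMPLE_COUNT count)
{
    return 1u << (uint32_t)count;   // 1X->1, 2X->2, 4X->4, 8X->8, 16X->16
}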
++struct SWR_MULTISAMPLE_POS ++{ ++ uint32_t x; ++ uint32_t y; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// SWR_RASTSTATE ++////////////////////////////////////////////////////////////////////////// ++struct SWR_RASTSTATE ++{ ++ uint32_t cullMode : 2; ++ uint32_t fillMode : 2; ++ uint32_t frontWinding : 1; ++ uint32_t scissorEnable : 1; ++ uint32_t depthClipEnable : 1; ++ float pointSize; ++ float lineWidth; ++ ++ // point size output from the VS ++ bool pointParam; ++ uint32_t pointSizeAttrib; ++ ++ // point sprite ++ bool pointSpriteEnable; ++ bool pointSpriteTopOrigin; ++ uint32_t pointSpriteFESlot; ++ ++ // depth bias ++ float depthBias; ++ float slopeScaledDepthBias; ++ float depthBiasClamp; ++ SWR_FORMAT depthFormat; // @llvm_enum ++ ++ // multisample state ++ SWR_MULTISAMPLE_COUNT sampleCount; // @llvm_enum ++ SWR_MULTISAMPLE_COUNT forcedSampleCount; // @llvm_enum ++ uint32_t pixelLocation; // UL or Center ++ uint32_t sampleMask; ++ uint8_t isSampleMasked[SWR_MAX_NUM_MULTISAMPLES]; ++ bool pixelOffset; // offset pixel positions by .5 in both the horizontal and vertical direction ++ SWR_MULTISAMPLE_POS iSamplePos[SWR_MAX_NUM_MULTISAMPLES]; ++ ++ // user clip/cull distance enables ++ uint8_t cullDistanceMask; ++ uint8_t clipDistanceMask; ++}; ++ ++// backend state ++struct SWR_BACKEND_STATE ++{ ++ uint32_t constantInterpolationMask; ++ uint8_t numAttributes; ++ uint8_t numComponents[KNOB_NUM_ATTRIBUTES]; ++}; ++ ++union SWR_DEPTH_STENCIL_STATE ++{ ++ struct ++ { ++ // dword 0 ++ uint32_t depthWriteEnable : 1; ++ uint32_t depthTestEnable : 1; ++ uint32_t stencilWriteEnable : 1; ++ uint32_t stencilTestEnable : 1; ++ uint32_t doubleSidedStencilTestEnable : 1; ++ ++ uint32_t depthTestFunc : 3; ++ uint32_t stencilTestFunc : 3; ++ ++ uint32_t backfaceStencilPassDepthPassOp : 3; ++ uint32_t backfaceStencilPassDepthFailOp : 3; ++ uint32_t backfaceStencilFailOp : 3; ++ uint32_t backfaceStencilTestFunc : 3; ++ uint32_t stencilPassDepthPassOp : 3; ++ uint32_t stencilPassDepthFailOp : 3; ++ uint32_t stencilFailOp : 3; ++ ++ // dword 1 ++ uint8_t backfaceStencilWriteMask; ++ uint8_t backfaceStencilTestMask; ++ uint8_t stencilWriteMask; ++ uint8_t stencilTestMask; ++ ++ // dword 2 ++ uint8_t backfaceStencilRefValue; ++ uint8_t stencilRefValue; ++ }; ++ uint32_t value[3]; ++}; ++ ++enum SWR_SHADING_RATE ++{ ++ SWR_SHADING_RATE_PIXEL, ++ SWR_SHADING_RATE_SAMPLE, ++ SWR_SHADING_RATE_COARSE, ++ SWR_SHADING_RATE_MAX, ++}; ++ ++// pixel shader state ++struct SWR_PS_STATE ++{ ++ // dword 0-1 ++ PFN_PIXEL_KERNEL pfnPixelShader; // @llvm_pfn ++ ++ // dword 2 ++ uint32_t killsPixel : 1; // pixel shader can kill pixels ++ uint32_t writesODepth : 1; // pixel shader writes to depth ++ uint32_t usesSourceDepth: 1; // pixel shader reads depth ++ uint32_t maxRTSlotUsed : 3; // maximum render target slot pixel shader writes to [0..7] ++ uint32_t shadingRate : 2; // shading per pixel / sample / coarse pixel ++}; +diff --git a/src/gallium/drivers/swr/rasterizer/core/tessellator.h b/src/gallium/drivers/swr/rasterizer/core/tessellator.h +new file mode 100644 +index 0000000..915ac77 +--- /dev/null ++++ b/src/gallium/drivers/swr/rasterizer/core/tessellator.h +@@ -0,0 +1,88 @@ ++/**************************************************************************** ++* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
++* ++* Permission is hereby granted, free of charge, to any person obtaining a ++* copy of this software and associated documentation files (the "Software"), ++* to deal in the Software without restriction, including without limitation ++* the rights to use, copy, modify, merge, publish, distribute, sublicense, ++* and/or sell copies of the Software, and to permit persons to whom the ++* Software is furnished to do so, subject to the following conditions: ++* ++* The above copyright notice and this permission notice (including the next ++* paragraph) shall be included in all copies or substantial portions of the ++* Software. ++* ++* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++* IN THE SOFTWARE. ++* ++* @file tessellator.h ++* ++* @brief Tessellator fixed function unit interface definition ++* ++******************************************************************************/ ++#pragma once ++ ++/// Allocate and initialize a new tessellation context ++HANDLE SWR_API TSInitCtx( ++ SWR_TS_DOMAIN tsDomain, ///< [IN] Tessellation domain (isoline, quad, triangle) ++ SWR_TS_PARTITIONING tsPartitioning, ///< [IN] Tessellation partitioning algorithm ++ SWR_TS_OUTPUT_TOPOLOGY tsOutputTopology, ///< [IN] Tessellation output topology ++ void* pContextMem, ///< [IN] Memory to use for the context ++ size_t& memSize); ///< [INOUT] In: Amount of memory in pContextMem. Out: Mem required ++ ++/// Destroy & de-allocate tessellation context ++void SWR_API TSDestroyCtx( ++ HANDLE tsCtx); ///< [IN] Tessellation context to be destroyed ++ ++struct SWR_TS_TESSELLATED_DATA ++{ ++ uint32_t NumPrimitives; ++ uint32_t NumDomainPoints; ++ ++ uint32_t* ppIndices[3]; ++ float* pDomainPointsU; ++ float* pDomainPointsV; ++ // For Tri: pDomainPointsW[i] = 1.0f - pDomainPointsU[i] - pDomainPointsV[i] ++}; ++ ++/// Perform Tessellation ++void SWR_API TSTessellate( ++ HANDLE tsCtx, ///< [IN] Tessellation Context ++ const SWR_TESSELLATION_FACTORS& tsTessFactors, ///< [IN] Tessellation Factors ++ SWR_TS_TESSELLATED_DATA& tsTessellatedData); ///< [OUT] Tessellated Data ++ ++ ++ ++/// @TODO - Implement OSS tessellator ++ ++INLINE HANDLE SWR_API TSInitCtx( ++ SWR_TS_DOMAIN tsDomain, ++ SWR_TS_PARTITIONING tsPartitioning, ++ SWR_TS_OUTPUT_TOPOLOGY tsOutputTopology, ++ void* pContextMem, ++ size_t& memSize) ++{ ++ SWR_ASSERT(0, "%s: Not Implemented", __FUNCTION__); ++ return NULL; ++} ++ ++ ++INLINE void SWR_API TSDestroyCtx(HANDLE tsCtx) ++{ ++ SWR_ASSERT(0, "%s: Not Implemented", __FUNCTION__); ++} ++ ++ ++INLINE void SWR_API TSTessellate( ++ HANDLE tsCtx, ++ const SWR_TESSELLATION_FACTORS& tsTessFactors, ++ SWR_TS_TESSELLATED_DATA& tsTessellatedData) ++{ ++ SWR_ASSERT(0, "%s: Not Implemented", __FUNCTION__); ++} ++ +diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.cpp b/src/gallium/drivers/swr/rasterizer/core/threads.cpp +new file mode 100644 +index 0000000..590bed4 +--- /dev/null ++++ b/src/gallium/drivers/swr/rasterizer/core/threads.cpp +@@ -0,0 +1,884 @@ ++/**************************************************************************** ++* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
++* ++* Permission is hereby granted, free of charge, to any person obtaining a ++* copy of this software and associated documentation files (the "Software"), ++* to deal in the Software without restriction, including without limitation ++* the rights to use, copy, modify, merge, publish, distribute, sublicense, ++* and/or sell copies of the Software, and to permit persons to whom the ++* Software is furnished to do so, subject to the following conditions: ++* ++* The above copyright notice and this permission notice (including the next ++* paragraph) shall be included in all copies or substantial portions of the ++* Software. ++* ++* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++* IN THE SOFTWARE. ++****************************************************************************/ ++ ++#include <stdio.h> ++#include <thread> ++#include <algorithm> ++#include <float.h> ++#include <vector> ++#include <utility> ++#include <fstream> ++#include <string> ++#include <sstream> ++ ++#if defined(__linux__) || defined(__gnu_linux__) ++#include <pthread.h> ++#include <sched.h> ++#include <unistd.h> ++#include <stdlib.h> ++#endif ++ ++#include "common/os.h" ++#include "context.h" ++#include "frontend.h" ++#include "backend.h" ++#include "rasterizer.h" ++#include "rdtsc_core.h" ++#include "tilemgr.h" ++#include "core/multisample.h" ++ ++// ThreadId ++struct Core ++{ ++ uint32_t procGroup = 0; ++ std::vector<uint32_t> threadIds; ++}; ++ ++struct NumaNode ++{ ++ std::vector<Core> cores; ++}; ++ ++typedef std::vector<NumaNode> CPUNumaNodes; ++ ++void CalculateProcessorTopology(CPUNumaNodes& out_nodes) ++{ ++ out_nodes.clear(); ++#if defined(_WIN32) ++ ++ SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX buffer[KNOB_MAX_NUM_THREADS]; ++ DWORD bufSize = sizeof(buffer); ++ ++ BOOL ret = GetLogicalProcessorInformationEx(RelationProcessorCore, buffer, &bufSize); ++ SWR_ASSERT(ret != FALSE, "Failed to get Processor Topology Information"); ++ ++ uint32_t count = bufSize / buffer->Size; ++ PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX pBuffer = buffer; ++ ++ for (uint32_t i = 0; i < count; ++i) ++ { ++ SWR_ASSERT(pBuffer->Relationship == RelationProcessorCore); ++ for (uint32_t g = 0; g < pBuffer->Processor.GroupCount; ++g) ++ { ++ auto& gmask = pBuffer->Processor.GroupMask[g]; ++ uint32_t threadId = 0; ++ uint32_t procGroup = gmask.Group; ++ ++ Core* pCore = nullptr; ++ ++ uint32_t numThreads = (uint32_t)_mm_popcount_sizeT(gmask.Mask); ++ ++ while (BitScanForwardSizeT((unsigned long*)&threadId, gmask.Mask)) ++ { ++ // clear mask ++ gmask.Mask &= ~(KAFFINITY(1) << threadId); ++ ++ // Find Numa Node ++ PROCESSOR_NUMBER procNum = {}; ++ procNum.Group = WORD(procGroup); ++ procNum.Number = UCHAR(threadId); ++ ++ uint32_t numaId = 0; ++ ret = GetNumaProcessorNodeEx(&procNum, (PUSHORT)&numaId); ++ SWR_ASSERT(ret); ++ ++ // Store data ++ if (out_nodes.size() <= numaId) out_nodes.resize(numaId + 1); ++ auto& numaNode = out_nodes[numaId]; ++ ++ uint32_t coreId = 0; ++ ++ if (nullptr == pCore) ++ { ++ numaNode.cores.push_back(Core()); ++ pCore = &numaNode.cores.back(); ++ pCore->procGroup = procGroup; ++#if !defined(_WIN64) ++ coreId = (uint32_t)numaNode.cores.size(); ++ if ((coreId * numThreads) >= 32) ++ { ++ // Windows doesn't return threadIds >= 32 for a processor group correctly
// when running a 32-bit application. ++ // Just save -1 as the threadId ++ threadId = uint32_t(-1); ++ } ++#endif ++ } ++ pCore->threadIds.push_back(threadId); ++ } ++ } ++ pBuffer = PtrAdd(pBuffer, pBuffer->Size); ++ } ++ ++ ++#elif defined(__linux__) || defined (__gnu_linux__) ++ ++ // Parse /proc/cpuinfo to get full topology ++ std::ifstream input("/proc/cpuinfo"); ++ std::string line; ++ char* c; ++ uint32_t threadId = uint32_t(-1); ++ uint32_t coreId = uint32_t(-1); ++ uint32_t numaId = uint32_t(-1); ++ ++ while (std::getline(input, line)) ++ { ++ if (line.find("processor") != std::string::npos) ++ { ++ if (threadId != uint32_t(-1)) ++ { ++ // Save information. ++ if (out_nodes.size() <= numaId) out_nodes.resize(numaId + 1); ++ auto& numaNode = out_nodes[numaId]; ++ if (numaNode.cores.size() <= coreId) numaNode.cores.resize(coreId + 1); ++ auto& core = numaNode.cores[coreId]; ++ ++ core.procGroup = coreId; ++ core.threadIds.push_back(threadId); ++ } ++ ++ auto data_start = line.find(": ") + 2; ++ threadId = std::strtoul(&line.c_str()[data_start], &c, 10); ++ continue; ++ } ++ if (line.find("core id") != std::string::npos) ++ { ++ auto data_start = line.find(": ") + 2; ++ coreId = std::strtoul(&line.c_str()[data_start], &c, 10); ++ continue; ++ } ++ if (line.find("physical id") != std::string::npos) ++ { ++ auto data_start = line.find(": ") + 2; ++ numaId = std::strtoul(&line.c_str()[data_start], &c, 10); ++ continue; ++ } ++ } ++ ++ if (threadId != uint32_t(-1)) ++ { ++ // Save information. ++ if (out_nodes.size() <= numaId) out_nodes.resize(numaId + 1); ++ auto& numaNode = out_nodes[numaId]; ++ if (numaNode.cores.size() <= coreId) numaNode.cores.resize(coreId + 1); ++ auto& core = numaNode.cores[coreId]; ++ ++ core.procGroup = coreId; ++ core.threadIds.push_back(threadId); ++ } ++ ++ for (uint32_t node = 0; node < out_nodes.size(); node++) { ++ auto& numaNode = out_nodes[node]; ++ auto it = numaNode.cores.begin(); ++ for ( ; it != numaNode.cores.end(); ) { ++ if (it->threadIds.size() == 0) ++ it = numaNode.cores.erase(it); ++ else ++ ++it; ++ } ++ } ++ ++#else ++ ++#error Unsupported platform ++ ++#endif ++} ++ ++ ++void bindThread(uint32_t threadId, uint32_t procGroupId = 0) ++{ ++#if defined(_WIN32) ++ { ++ GROUP_AFFINITY affinity = {}; ++ affinity.Group = procGroupId; ++ ++#if !defined(_WIN64) ++ if (threadId >= 32) ++ { ++ // In a 32-bit process on Windows it is impossible to bind ++ // to logical processors 32-63 within a processor group. ++ // In this case set the mask to 0 and let the system assign ++ // the processor. Hopefully it will make smart choices.
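Taken together, CalculateProcessorTopology() and bindThread() support a startup sequence along these lines (a usage sketch; the wrapper function is hypothetical):

void PinToFirstHWThreadExample()
{
    CPUNumaNodes nodes;
    CalculateProcessorTopology(nodes);
    // Pin the calling thread the way CreateThreadPool() reserves the API
    // thread: node 0, core 0, hardware thread 0.
    bindThread(nodes[0].cores[0].threadIds[0], nodes[0].cores[0].procGroup);
}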
++ affinity.Mask = 0; ++ } ++ else ++#endif ++ { ++ affinity.Mask = KAFFINITY(1) << threadId; ++ } ++ ++ SetThreadGroupAffinity(GetCurrentThread(), &affinity, nullptr); ++ } ++#else ++ cpu_set_t cpuset; ++ pthread_t thread = pthread_self(); ++ CPU_ZERO(&cpuset); ++ CPU_SET(threadId, &cpuset); ++ ++ pthread_setaffinity_np(thread, sizeof(cpu_set_t), &cpuset); ++#endif ++} ++ ++INLINE ++uint64_t GetEnqueuedDraw(SWR_CONTEXT *pContext) ++{ ++ //uint64_t result = _InterlockedCompareExchange64((volatile __int64*)&pContext->DrawEnqueued, 0, 0); ++ //return result; ++ return pContext->DrawEnqueued; ++} ++ ++INLINE ++DRAW_CONTEXT *GetDC(SWR_CONTEXT *pContext, uint64_t drawId) ++{ ++ return &pContext->dcRing[(drawId-1) % KNOB_MAX_DRAWS_IN_FLIGHT]; ++} ++ ++// returns true if dependency not met ++INLINE ++bool CheckDependency(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint64_t lastRetiredDraw) ++{ ++ return (pDC->dependency > lastRetiredDraw); ++} ++ ++void ClearColorHotTile(const HOTTILE* pHotTile) // clear a macro tile from float4 clear data. ++{ ++ // Load clear color into SIMD register... ++ float *pClearData = (float*)(pHotTile->clearData); ++ simdscalar valR = _simd_broadcast_ss(&pClearData[0]); ++ simdscalar valG = _simd_broadcast_ss(&pClearData[1]); ++ simdscalar valB = _simd_broadcast_ss(&pClearData[2]); ++ simdscalar valA = _simd_broadcast_ss(&pClearData[3]); ++ ++ float *pfBuf = (float*)pHotTile->pBuffer; ++ uint32_t numSamples = pHotTile->numSamples; ++ ++ for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM) ++ { ++ for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM) ++ { ++ for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM) //SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM); si++) ++ { ++ _simd_store_ps(pfBuf, valR); ++ pfBuf += KNOB_SIMD_WIDTH; ++ _simd_store_ps(pfBuf, valG); ++ pfBuf += KNOB_SIMD_WIDTH; ++ _simd_store_ps(pfBuf, valB); ++ pfBuf += KNOB_SIMD_WIDTH; ++ _simd_store_ps(pfBuf, valA); ++ pfBuf += KNOB_SIMD_WIDTH; ++ } ++ } ++ } ++} ++ ++void ClearDepthHotTile(const HOTTILE* pHotTile) // clear a macro tile from float4 clear data. ++{ ++ // Load clear color into SIMD register... ++ float *pClearData = (float*)(pHotTile->clearData); ++ simdscalar valZ = _simd_broadcast_ss(&pClearData[0]); ++ ++ float *pfBuf = (float*)pHotTile->pBuffer; ++ uint32_t numSamples = pHotTile->numSamples; ++ ++ for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM) ++ { ++ for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM) ++ { ++ for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM) ++ { ++ _simd_store_ps(pfBuf, valZ); ++ pfBuf += KNOB_SIMD_WIDTH; ++ } ++ } ++ } ++} ++ ++void ClearStencilHotTile(const HOTTILE* pHotTile) ++{ ++ // convert from F32 to U8. ++ uint8_t clearVal = (uint8_t)(pHotTile->clearData[0]); ++ //broadcast 32x into __m256i... ++ simdscalari valS = _simd_set1_epi8(clearVal); ++ ++ simdscalari* pBuf = (simdscalari*)pHotTile->pBuffer; ++ uint32_t numSamples = pHotTile->numSamples; ++ ++ for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM) ++ { ++ for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM) ++ { ++ // We're putting 4 pixels in each of the 32-bit slots, so increment 4 times as quickly. 
++ for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM * 4) ++ { ++ _simd_store_si(pBuf, valS); ++ pBuf += 1; ++ } ++ } ++ } ++} ++ ++// for draw calls, we initialize the active hot tiles and perform deferred ++// load on them if tile is in invalid state. we do this in the outer thread loop instead of inside ++// the draw routine itself mainly for performance, to avoid unnecessary setup ++// every triangle ++// @todo support deferred clear ++INLINE ++void InitializeHotTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID, const TRIANGLE_WORK_DESC* pWork) ++{ ++ const API_STATE& state = GetApiState(pDC); ++ HotTileMgr *pHotTileMgr = pContext->pHotTileMgr; ++ const SWR_PS_STATE& psState = state.psState; ++ uint32_t numRTs = psState.maxRTSlotUsed + 1; ++ ++ uint32_t x, y; ++ MacroTileMgr::getTileIndices(macroID, x, y); ++ x *= KNOB_MACROTILE_X_DIM; ++ y *= KNOB_MACROTILE_Y_DIM; ++ ++ uint32_t numSamples = GetNumSamples(state.rastState.sampleCount); ++ ++ // check RT if enabled ++ if (state.psState.pfnPixelShader != nullptr) ++ { ++ for (uint32_t rt = 0; rt < numRTs; ++rt) ++ { ++ HOTTILE* pHotTile = pHotTileMgr->GetHotTile(pContext, pDC, macroID, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rt), true, numSamples); ++ ++ if (pHotTile->state == HOTTILE_INVALID) ++ { ++ RDTSC_START(BELoadTiles); ++ // invalid hottile before draw requires a load from surface before we can draw to it ++ pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_COLOR_HOT_TILE_FORMAT, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rt), x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer); ++ pHotTile->state = HOTTILE_DIRTY; ++ RDTSC_STOP(BELoadTiles, 0, 0); ++ } ++ else if (pHotTile->state == HOTTILE_CLEAR) ++ { ++ RDTSC_START(BELoadTiles); ++ // Clear the tile. ++ ClearColorHotTile(pHotTile); ++ pHotTile->state = HOTTILE_DIRTY; ++ RDTSC_STOP(BELoadTiles, 0, 0); ++ } ++ } ++ } ++ ++ // check depth if enabled ++ if (state.depthStencilState.depthTestEnable || state.depthStencilState.depthWriteEnable) ++ { ++ HOTTILE* pHotTile = pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_DEPTH, true, numSamples); ++ if (pHotTile->state == HOTTILE_INVALID) ++ { ++ RDTSC_START(BELoadTiles); ++ // invalid hottile before draw requires a load from surface before we can draw to it ++ pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_DEPTH_HOT_TILE_FORMAT, SWR_ATTACHMENT_DEPTH, x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer); ++ pHotTile->state = HOTTILE_DIRTY; ++ RDTSC_STOP(BELoadTiles, 0, 0); ++ } ++ else if (pHotTile->state == HOTTILE_CLEAR) ++ { ++ RDTSC_START(BELoadTiles); ++ // Clear the tile. 
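The per-attachment pattern in InitializeHotTiles() is uniform across color, depth, and stencil; condensed into one helper it reads (our restatement, with a function-pointer signature of our own choosing):

static void EnsureHotTileReady(HOTTILE* pHotTile,
                               void (*pfnLoad)(HOTTILE*),
                               void (*pfnClear)(const HOTTILE*))
{
    if (pHotTile->state == HOTTILE_INVALID)
    {
        pfnLoad(pHotTile);               // deferred load from the surface
        pHotTile->state = HOTTILE_DIRTY;
    }
    else if (pHotTile->state == HOTTILE_CLEAR)
    {
        pfnClear(pHotTile);              // deferred fast clear
        pHotTile->state = HOTTILE_DIRTY;
    }
    // HOTTILE_DIRTY: already resident, nothing to do.
}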
++ ClearDepthHotTile(pHotTile); ++ pHotTile->state = HOTTILE_DIRTY; ++ RDTSC_STOP(BELoadTiles, 0, 0); ++ } ++ } ++ ++ // check stencil if enabled ++ if (state.depthStencilState.stencilTestEnable || state.depthStencilState.stencilWriteEnable) ++ { ++ HOTTILE* pHotTile = pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_STENCIL, true, numSamples); ++ if (pHotTile->state == HOTTILE_INVALID) ++ { ++ RDTSC_START(BELoadTiles); ++ // invalid hottile before draw requires a load from surface before we can draw to it ++ pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_STENCIL_HOT_TILE_FORMAT, SWR_ATTACHMENT_STENCIL, x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer); ++ pHotTile->state = HOTTILE_DIRTY; ++ RDTSC_STOP(BELoadTiles, 0, 0); ++ } ++ else if (pHotTile->state == HOTTILE_CLEAR) ++ { ++ RDTSC_START(BELoadTiles); ++ // Clear the tile. ++ ClearStencilHotTile(pHotTile); ++ pHotTile->state = HOTTILE_DIRTY; ++ RDTSC_STOP(BELoadTiles, 0, 0); ++ } ++ } ++} ++ ++INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext, volatile uint64_t& curDrawBE) ++{ ++ // increment our current draw id to the first incomplete draw ++ uint64_t drawEnqueued = GetEnqueuedDraw(pContext); ++ while (curDrawBE < drawEnqueued) ++ { ++ DRAW_CONTEXT *pDC = &pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT]; ++ ++ // If it's not compute and FE is not done then break out of loop. ++ if (!pDC->doneFE && !pDC->isCompute) break; ++ ++ bool isWorkComplete = (pDC->isCompute) ? ++ pDC->pDispatch->isWorkComplete() : pDC->pTileMgr->isWorkComplete(); ++ ++ if (isWorkComplete) ++ { ++ curDrawBE++; ++ } ++ else ++ { ++ break; ++ } ++ } ++ ++ // If there are no more incomplete draws then return false. ++ return (curDrawBE >= drawEnqueued) ? false : true; ++} ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief If there is any BE work then go work on it. ++/// @param pContext - pointer to SWR context. ++/// @param workerId - The unique worker ID that is assigned to this thread. ++/// @param curDrawBE - This tracks the draw contexts that this thread has processed. Each worker thread ++/// has its own curDrawBE counter and this ensures that each worker processes all the ++/// draws in order. ++/// @param lockedTiles - This is the set of tiles locked by other threads. Each thread maintains its ++/// own set and each time it fails to lock a macrotile, because it's already locked, ++/// then it will add that tile to the lockedTiles set. As a worker begins to work ++/// on future draws the lockedTiles ensures that it doesn't work on tiles that may ++/// still have work pending in a previous draw. Additionally, the lockedTiles is a ++/// heuristic that can steer a worker back to the same macrotile that it had been ++/// working on in a previous draw. ++void WorkOnFifoBE( ++ SWR_CONTEXT *pContext, ++ uint32_t workerId, ++ volatile uint64_t &curDrawBE, ++ std::unordered_set<uint32_t>& lockedTiles) ++{ ++ // Find the first incomplete draw that has pending work. If no such draw is found then ++ // return. FindFirstIncompleteDraw is responsible for incrementing the curDrawBE. ++ if (FindFirstIncompleteDraw(pContext, curDrawBE) == false) ++ { ++ return; ++ } ++ ++ uint64_t lastRetiredDraw = pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT].drawId - 1; ++ ++ // Reset our history for locked tiles. We'll have to re-learn which tiles are locked. ++ lockedTiles.clear(); ++ ++ // Try to work on each draw in order of the available draws in flight. ++ // 1.
If we're on curDrawBE, we can work on any macrotile that is available. ++ // 2. If we're trying to work on draws after curDrawBE, we are restricted to ++ // working on those macrotiles that are known to be complete in the prior draw to ++ // maintain order. The locked tiles provide the history to ensure this. ++ for (uint64_t i = curDrawBE; i < GetEnqueuedDraw(pContext); ++i) ++ { ++ DRAW_CONTEXT *pDC = &pContext->dcRing[i % KNOB_MAX_DRAWS_IN_FLIGHT]; ++ ++ if (pDC->isCompute) return; // We don't look at compute work. ++ ++ // First wait for FE to be finished with this draw. This keeps threading model simple ++ // but if there are lots of bubbles between draws then serializing FE and BE may ++ // need to be revisited. ++ if (!pDC->doneFE) break; ++ ++ // If this draw is dependent on a previous draw then we need to bail. ++ if (CheckDependency(pContext, pDC, lastRetiredDraw)) ++ { ++ return; ++ } ++ ++ // Grab the list of all dirty macrotiles. A tile is dirty if it has work queued to it. ++ std::vector<uint32_t> &macroTiles = pDC->pTileMgr->getDirtyTiles(); ++ ++ for (uint32_t tileID : macroTiles) ++ { ++ MacroTileQueue &tile = pDC->pTileMgr->getMacroTileQueue(tileID); ++ ++ // can only work on this draw if it's not in use by other threads ++ if (lockedTiles.find(tileID) == lockedTiles.end()) ++ { ++ if (tile.getNumQueued()) ++ { ++ if (tile.tryLock()) ++ { ++ BE_WORK *pWork; ++ ++ RDTSC_START(WorkerFoundWork); ++ ++ uint32_t numWorkItems = tile.getNumQueued(); ++ ++ if (numWorkItems != 0) ++ { ++ pWork = tile.peek(); ++ SWR_ASSERT(pWork); ++ if (pWork->type == DRAW) ++ { ++ InitializeHotTiles(pContext, pDC, tileID, (const TRIANGLE_WORK_DESC*)&pWork->desc); ++ } ++ } ++ ++ while ((pWork = tile.peek()) != nullptr) ++ { ++ pWork->pfnWork(pDC, workerId, tileID, &pWork->desc); ++ tile.dequeue(); ++ } ++ RDTSC_STOP(WorkerFoundWork, numWorkItems, pDC->drawId); ++ ++ _ReadWriteBarrier(); ++ ++ pDC->pTileMgr->markTileComplete(tileID); ++ ++ // Optimization: If the draw is complete and we're the last one to have worked on it then ++ // we can reset the locked list as we know that all previous draws before the next are guaranteed to be complete. ++ if ((curDrawBE == i) && pDC->pTileMgr->isWorkComplete()) ++ { ++ // We can increment the current BE and safely move to next draw since we know this draw is complete. ++ curDrawBE++; ++ lastRetiredDraw++; ++ ++ lockedTiles.clear(); ++ break; ++ } ++ } ++ else ++ { ++ // This tile is already locked. So let's add it to our locked tiles set. This way we don't try locking this one again.
++ lockedTiles.insert(tileID); ++ } ++ } ++ } ++ } ++} ++ ++void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, volatile uint64_t &curDrawFE, UCHAR numaNode) ++{ ++ // Try to grab the next DC from the ring ++ uint64_t drawEnqueued = GetEnqueuedDraw(pContext); ++ while (curDrawFE < drawEnqueued) ++ { ++ uint32_t dcSlot = curDrawFE % KNOB_MAX_DRAWS_IN_FLIGHT; ++ DRAW_CONTEXT *pDC = &pContext->dcRing[dcSlot]; ++ if (pDC->isCompute || pDC->doneFE || pDC->FeLock) ++ { ++ curDrawFE++; ++ } ++ else ++ { ++ break; ++ } ++ } ++ ++ uint64_t curDraw = curDrawFE; ++ while (curDraw < drawEnqueued) ++ { ++ uint32_t dcSlot = curDraw % KNOB_MAX_DRAWS_IN_FLIGHT; ++ DRAW_CONTEXT *pDC = &pContext->dcRing[dcSlot]; ++ ++ if (!pDC->isCompute && !pDC->FeLock) ++ { ++ uint32_t initial = InterlockedCompareExchange((volatile uint32_t*)&pDC->FeLock, 1, 0); ++ if (initial == 0) ++ { ++ // successfully grabbed the DC, now run the FE ++ pDC->FeWork.pfnWork(pContext, pDC, workerId, &pDC->FeWork.desc); ++ } ++ } ++ curDraw++; ++ } ++} ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief If there is any compute work then go work on it. ++/// @param pContext - pointer to SWR context. ++/// @param workerId - The unique worker ID that is assigned to this thread. ++/// @param curDrawBE - This tracks the draw contexts that this thread has processed. Each worker thread ++/// has its own curDrawBE counter and this ensures that each worker processes all the ++/// draws in order. ++void WorkOnCompute( ++ SWR_CONTEXT *pContext, ++ uint32_t workerId, ++ volatile uint64_t& curDrawBE) ++{ ++ if (FindFirstIncompleteDraw(pContext, curDrawBE) == false) ++ { ++ return; ++ } ++ ++ uint64_t lastRetiredDraw = pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT].drawId - 1; ++ ++ DRAW_CONTEXT *pDC = &pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT]; ++ if (pDC->isCompute == false) return; ++ ++ // check dependencies ++ if (CheckDependency(pContext, pDC, lastRetiredDraw)) ++ { ++ return; ++ } ++ ++ SWR_ASSERT(pDC->pDispatch != nullptr); ++ DispatchQueue& queue = *pDC->pDispatch; ++ ++ // Is there any work remaining? ++ if (queue.getNumQueued() > 0) ++ { ++ bool lastToComplete = false; ++ ++ uint32_t threadGroupId = 0; ++ while (queue.getWork(threadGroupId)) ++ { ++ ProcessComputeBE(pDC, workerId, threadGroupId); ++ ++ lastToComplete = queue.finishedWork(); ++ } ++ ++ _ReadWriteBarrier(); ++ ++ if (lastToComplete) ++ { ++ SWR_ASSERT(queue.isWorkComplete() == true); ++ pDC->doneCompute = true; ++ } ++ } ++} ++ ++DWORD workerThread(LPVOID pData) ++{ ++ THREAD_DATA *pThreadData = (THREAD_DATA*)pData; ++ SWR_CONTEXT *pContext = pThreadData->pContext; ++ uint32_t threadId = pThreadData->threadId; ++ uint32_t workerId = pThreadData->workerId; ++ ++ bindThread(threadId, pThreadData->procGroupId); ++ ++ RDTSC_INIT(threadId); ++ ++ int numaNode = (int)pThreadData->numaId; ++ ++ // flush denormals to 0 ++ _mm_setcsr(_mm_getcsr() | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON); ++ ++ // Track tiles locked by other threads. If we try to lock a macrotile and find it's already ++ // locked then we'll add it to this list so that we don't try and lock it again. ++ std::unordered_set<uint32_t> lockedTiles; ++ ++ // each worker has the ability to work on any of the queued draws as long as certain ++ // conditions are met. the data associated ++ // with a draw is guaranteed to be active as long as a worker hasn't signaled that he ++ // has moved on to the next draw when he determines there is no more work to do.
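The retirement rule this comment block describes can be stated compactly; a sketch (ours) using the SWR_CONTEXT counters referenced in the loop below:

// A dcRing slot for draw N is only recycled once every worker's BE counter
// has advanced past N (and the FE is done with it).
uint64_t oldestInFlight = UINT64_MAX;
for (uint32_t w = 0; w < pContext->NumWorkerThreads; ++w)
{
    oldestInFlight = std::min<uint64_t>(oldestInFlight, pContext->WorkerBE[w]);
}
// Draws with drawId < oldestInFlight are fully retired.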
The API ++ // thread will not increment the head of the dc ring until all workers have moved past the ++ // current head. ++ // the logic to determine what to work on is: ++ // 1- try to work on the FE any draw that is queued. For now there are no dependencies ++ // on the FE work, so any worker can grab any FE and process in parallel. Eventually ++ // we'll need dependency tracking to force serialization on FEs. The worker will try ++ // to pick an FE by atomically incrementing a counter in the swr context. he'll keep ++ // trying until he reaches the tail. ++ // 2- BE work must be done in strict order. we accomplish this today by pulling work off ++ // the oldest draw (ie the head) of the dcRing. the worker can determine if there is ++ // any work left by comparing the total # of binned work items and the total # of completed ++ // work items. If they are equal, then there is no more work to do for this draw, and ++ // the worker can safely increment its oldestDraw counter and move on to the next draw. ++ std::unique_lock<std::mutex> lock(pContext->WaitLock, std::defer_lock); ++ while (pContext->threadPool.inThreadShutdown == false) ++ { ++ uint32_t loop = 0; ++ while (loop++ < KNOB_WORKER_SPIN_LOOP_COUNT && pContext->WorkerBE[workerId] == pContext->DrawEnqueued) ++ { ++ _mm_pause(); ++ } ++ ++ if (pContext->WorkerBE[workerId] == pContext->DrawEnqueued) ++ { ++ lock.lock(); ++ ++ // check for thread idle condition again under lock ++ if (pContext->WorkerBE[workerId] != pContext->DrawEnqueued) ++ { ++ lock.unlock(); ++ continue; ++ } ++ ++ if (pContext->threadPool.inThreadShutdown) ++ { ++ lock.unlock(); ++ break; ++ } ++ ++ RDTSC_START(WorkerWaitForThreadEvent); ++ ++ pContext->FifosNotEmpty.wait(lock); ++ lock.unlock(); ++ ++ RDTSC_STOP(WorkerWaitForThreadEvent, 0, 0); ++ ++ if (pContext->threadPool.inThreadShutdown) ++ { ++ break; ++ } ++ } ++ ++ RDTSC_START(WorkerWorkOnFifoBE); ++ WorkOnFifoBE(pContext, workerId, pContext->WorkerBE[workerId], lockedTiles); ++ RDTSC_STOP(WorkerWorkOnFifoBE, 0, 0); ++ ++ WorkOnCompute(pContext, workerId, pContext->WorkerBE[workerId]); ++ ++ WorkOnFifoFE(pContext, workerId, pContext->WorkerFE[workerId], numaNode); ++ } ++ ++ return 0; ++} ++ ++void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool) ++{ ++ // Bind application thread to HW thread 0 ++ bindThread(0); ++ ++ CPUNumaNodes nodes; ++ CalculateProcessorTopology(nodes); ++ ++ uint32_t numHWNodes = (uint32_t)nodes.size(); ++ uint32_t numHWCoresPerNode = (uint32_t)nodes[0].cores.size(); ++ uint32_t numHWHyperThreads = (uint32_t)nodes[0].cores[0].threadIds.size(); ++ ++ uint32_t numNodes = numHWNodes; ++ uint32_t numCoresPerNode = numHWCoresPerNode; ++ uint32_t numHyperThreads = numHWHyperThreads; ++ ++ if (KNOB_MAX_NUMA_NODES) ++ { ++ numNodes = std::min(numNodes, KNOB_MAX_NUMA_NODES); ++ } ++ ++ if (KNOB_MAX_CORES_PER_NUMA_NODE) ++ { ++ numCoresPerNode = std::min(numCoresPerNode, KNOB_MAX_CORES_PER_NUMA_NODE); ++ } ++ ++ if (KNOB_MAX_THREADS_PER_CORE) ++ { ++ numHyperThreads = std::min(numHyperThreads, KNOB_MAX_THREADS_PER_CORE); ++ } ++ ++ // Calculate numThreads ++ uint32_t numThreads = numNodes * numCoresPerNode * numHyperThreads; ++ ++ if (numThreads > KNOB_MAX_NUM_THREADS) ++ { ++ printf("WARNING: system thread count %u exceeds max %u, " ++ "performance will be degraded\n", ++ numThreads, KNOB_MAX_NUM_THREADS); ++ } ++ ++ if (numThreads == 1) ++ { ++ // If only 1 worker thread, try to move it to an available ++ // HW thread. If that fails, use the API thread.
++ if (numCoresPerNode < numHWCoresPerNode) ++ { ++ numCoresPerNode++; ++ } ++ else if (numHyperThreads < numHWHyperThreads) ++ { ++ numHyperThreads++; ++ } ++ else if (numNodes < numHWNodes) ++ { ++ numNodes++; ++ } ++ else ++ { ++ pPool->numThreads = 0; ++ SET_KNOB(SINGLE_THREADED, true); ++ return; ++ } ++ } ++ else ++ { ++ // Save a HW thread for the API thread. ++ numThreads--; ++ } ++ ++ pPool->numThreads = numThreads; ++ pContext->NumWorkerThreads = pPool->numThreads; ++ ++ pPool->inThreadShutdown = false; ++ pPool->pThreadData = (THREAD_DATA *)malloc(pPool->numThreads * sizeof(THREAD_DATA)); ++ ++ uint32_t workerId = 0; ++ for (uint32_t n = 0; n < numNodes; ++n) ++ { ++ auto& node = nodes[n]; ++ ++ uint32_t numCores = numCoresPerNode; ++ for (uint32_t c = 0; c < numCores; ++c) ++ { ++ auto& core = node.cores[c]; ++ for (uint32_t t = 0; t < numHyperThreads; ++t) ++ { ++ if (c == 0 && n == 0 && t == 0) ++ { ++ // Skip core 0, thread0 on node 0 to reserve for API thread ++ continue; ++ } ++ ++ pPool->pThreadData[workerId].workerId = workerId; ++ pPool->pThreadData[workerId].procGroupId = core.procGroup; ++ pPool->pThreadData[workerId].threadId = core.threadIds[t]; ++ pPool->pThreadData[workerId].numaId = n; ++ pPool->pThreadData[workerId].pContext = pContext; ++ pPool->threads[workerId] = new std::thread(workerThread, &pPool->pThreadData[workerId]); ++ ++ ++workerId; ++ } ++ } ++ } ++} ++ ++void DestroyThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool) ++{ ++ if (!KNOB_SINGLE_THREADED) ++ { ++ // Inform threads to finish up ++ std::unique_lock lock(pContext->WaitLock); ++ pPool->inThreadShutdown = true; ++ _mm_mfence(); ++ pContext->FifosNotEmpty.notify_all(); ++ lock.unlock(); ++ ++ // Wait for threads to finish and destroy them ++ for (uint32_t t = 0; t < pPool->numThreads; ++t) ++ { ++ pPool->threads[t]->join(); ++ delete(pPool->threads[t]); ++ } ++ ++ // Clean up data used by threads ++ free(pPool->pThreadData); ++ } ++} +diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.h b/src/gallium/drivers/swr/rasterizer/core/threads.h +new file mode 100644 +index 0000000..0c91bf8 +--- /dev/null ++++ b/src/gallium/drivers/swr/rasterizer/core/threads.h +@@ -0,0 +1,62 @@ ++/**************************************************************************** ++* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. ++* ++* Permission is hereby granted, free of charge, to any person obtaining a ++* copy of this software and associated documentation files (the "Software"), ++* to deal in the Software without restriction, including without limitation ++* the rights to use, copy, modify, merge, publish, distribute, sublicense, ++* and/or sell copies of the Software, and to permit persons to whom the ++* Software is furnished to do so, subject to the following conditions: ++* ++* The above copyright notice and this permission notice (including the next ++* paragraph) shall be included in all copies or substantial portions of the ++* Software. ++* ++* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++* IN THE SOFTWARE. 
++*
++* @file threads.h
++*
++* @brief Definitions for SWR threading model.
++*
++******************************************************************************/
++#pragma once
++
++#include "knobs.h"
++
++#include <thread>
++#include <unordered_set>
++
++typedef std::thread* THREAD_PTR;
++
++struct SWR_CONTEXT;
++
++struct THREAD_DATA
++{
++    uint32_t procGroupId; // Will always be 0 for non-Windows OS
++    uint32_t threadId;    // within the procGroup for Windows
++    uint32_t numaId;      // NUMA node id
++    uint32_t workerId;
++    SWR_CONTEXT *pContext;
++};
++
++
++struct THREAD_POOL
++{
++    THREAD_PTR threads[KNOB_MAX_NUM_THREADS];
++    uint32_t numThreads;
++    volatile bool inThreadShutdown;
++    THREAD_DATA *pThreadData;
++};
++
++void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool);
++void DestroyThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool);
++
++// Expose FE and BE worker functions to the API thread if single threaded
++void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, volatile uint64_t &curDrawFE, UCHAR numaNode);
++void WorkOnFifoBE(SWR_CONTEXT *pContext, uint32_t workerId, volatile uint64_t &curDrawBE, std::unordered_set<uint32_t> &usedTiles);
++void WorkOnCompute(SWR_CONTEXT *pContext, uint32_t workerId, volatile uint64_t &curDrawBE);
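++// Illustrative sketch (disabled; not part of the build): a minimal model of the pool
++// lifecycle declared above. The SWR_CONTEXT setup is elided, and 'ExamplePoolLifecycle'
++// is a hypothetical helper; in the driver the context owns its THREAD_POOL.
++#if 0
++void ExamplePoolLifecycle(SWR_CONTEXT *pContext)
++{
++    THREAD_POOL pool = {};
++    CreateThreadPool(pContext, &pool);   // binds workers to spare HW threads and starts them
++
++    // ... enqueue draws; workers drain FE, BE, and compute work from the context ...
++
++    DestroyThreadPool(pContext, &pool);  // sets inThreadShutdown, wakes, joins, and frees workers
++}
++#endif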
+diff --git a/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp b/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
+new file mode 100644
+index 0000000..24b4b60
+--- /dev/null
++++ b/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
+@@ -0,0 +1,105 @@
++/****************************************************************************
++* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
++*
++* Permission is hereby granted, free of charge, to any person obtaining a
++* copy of this software and associated documentation files (the "Software"),
++* to deal in the Software without restriction, including without limitation
++* the rights to use, copy, modify, merge, publish, distribute, sublicense,
++* and/or sell copies of the Software, and to permit persons to whom the
++* Software is furnished to do so, subject to the following conditions:
++*
++* The above copyright notice and this permission notice (including the next
++* paragraph) shall be included in all copies or substantial portions of the
++* Software.
++*
++* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
++* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++* IN THE SOFTWARE.
++*
++* @file tilemgr.cpp
++*
++* @brief Implementation for Macro Tile Manager which provides the facilities
++*        for threads to work on a macro tile.
++*
++******************************************************************************/
++#include <unordered_map>
++
++#include "fifo.hpp"
++#include "tilemgr.h"
++
++#define TILE_ID(x,y) ((x << 16 | y))
++
++// override new/delete for alignment
++void *MacroTileMgr::operator new(size_t size)
++{
++    return _aligned_malloc(size, 64);
++}
++
++void MacroTileMgr::operator delete(void *p)
++{
++    _aligned_free(p);
++}
++
++void* DispatchQueue::operator new(size_t size)
++{
++    return _aligned_malloc(size, 64);
++}
++
++void DispatchQueue::operator delete(void *p)
++{
++    _aligned_free(p);
++}
++
++MacroTileMgr::MacroTileMgr()
++{
++}
++
++void MacroTileMgr::initialize()
++{
++    mWorkItemsProduced = 0;
++    mWorkItemsConsumed = 0;
++
++    mDirtyTiles.clear();
++}
++
++void MacroTileMgr::enqueue(uint32_t x, uint32_t y, BE_WORK *pWork)
++{
++    // Should not enqueue more than what we have backing for in the hot tile manager.
++    SWR_ASSERT(x < KNOB_NUM_HOT_TILES_X);
++    SWR_ASSERT(y < KNOB_NUM_HOT_TILES_Y);
++
++    uint32_t id = TILE_ID(x, y);
++
++    MacroTileQueue &tile = mTiles[id];
++    tile.mWorkItemsFE++;
++
++    if (tile.mWorkItemsFE == 1)
++    {
++        tile.clear();
++        mDirtyTiles.push_back(id);
++    }
++
++    mWorkItemsProduced++;
++    tile.enqueue_try_nosync(pWork);
++}
++
++void MacroTileMgr::markTileComplete(uint32_t id)
++{
++    SWR_ASSERT(mTiles.find(id) != mTiles.end());
++    MacroTileQueue &tile = mTiles[id];
++    uint32_t numTiles = tile.mWorkItemsFE;
++    InterlockedExchangeAdd(&mWorkItemsConsumed, numTiles);
++
++    _ReadWriteBarrier();
++    tile.mWorkItemsBE += numTiles;
++    SWR_ASSERT(tile.mWorkItemsFE == tile.mWorkItemsBE);
++
++    // clear out tile, but defer fifo clear until the next DC first queues to it.
++    // this prevents worker threads from constantly locking a completed macro tile
++    tile.mWorkItemsFE = 0;
++    tile.mWorkItemsBE = 0;
++}
+diff --git a/src/gallium/drivers/swr/rasterizer/core/tilemgr.h b/src/gallium/drivers/swr/rasterizer/core/tilemgr.h
+new file mode 100644
+index 0000000..b537730
+--- /dev/null
++++ b/src/gallium/drivers/swr/rasterizer/core/tilemgr.h
+@@ -0,0 +1,392 @@
++/****************************************************************************
++* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
++*
++* Permission is hereby granted, free of charge, to any person obtaining a
++* copy of this software and associated documentation files (the "Software"),
++* to deal in the Software without restriction, including without limitation
++* the rights to use, copy, modify, merge, publish, distribute, sublicense,
++* and/or sell copies of the Software, and to permit persons to whom the
++* Software is furnished to do so, subject to the following conditions:
++*
++* The above copyright notice and this permission notice (including the next
++* paragraph) shall be included in all copies or substantial portions of the
++* Software.
++*
++* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
++* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++* IN THE SOFTWARE.
++*
++* @file tilemgr.h
++*
++* @brief Definitions for Macro Tile Manager which provides the facilities
++*        for threads to work on a macro tile.
++*
++******************************************************************************/
++#pragma once
++
++#include <unordered_map>
++#include <vector>
++#include "common/formats.h"
++#include "fifo.hpp"
++#include "context.h"
++#include "format_traits.h"
++
++//////////////////////////////////////////////////////////////////////////
++/// MacroTile - work queue for a tile.
++//////////////////////////////////////////////////////////////////////////
++struct MacroTileQueue
++{
++    MacroTileQueue()
++    {
++        mFifo.initialize();
++    }
++
++    ~MacroTileQueue() { }
++
++    //////////////////////////////////////////////////////////////////////////
++    /// @brief Returns number of work items queued for this tile.
++    uint32_t getNumQueued()
++    {
++        return mFifo.getNumQueued();
++    }
++
++    //////////////////////////////////////////////////////////////////////////
++    /// @brief Attempt to lock the work fifo. If already locked then return false.
++    bool tryLock()
++    {
++        return mFifo.tryLock();
++    }
++
++    //////////////////////////////////////////////////////////////////////////
++    /// @brief Clear fifo and unlock it.
++    void clear()
++    {
++        mFifo.clear();
++    }
++
++    //////////////////////////////////////////////////////////////////////////
++    /// @brief Peek at work sitting at the front of the fifo.
++    BE_WORK* peek()
++    {
++        return mFifo.peek();
++    }
++
++    bool enqueue_try_nosync(const BE_WORK* entry)
++    {
++        return mFifo.enqueue_try_nosync(entry);
++    }
++
++    //////////////////////////////////////////////////////////////////////////
++    /// @brief Move to next work item
++    void dequeue()
++    {
++        mFifo.dequeue_noinc();
++    }
++
++    //////////////////////////////////////////////////////////////////////////
++    /// @brief Destroy fifo
++    void destroy()
++    {
++        mFifo.destroy();
++    }
++
++    ///@todo This will all be private.
++    uint32_t mWorkItemsFE = 0;
++    uint32_t mWorkItemsBE = 0;
++
++private:
++    QUEUE<BE_WORK> mFifo;
++};
++
++//////////////////////////////////////////////////////////////////////////
++/// MacroTileMgr - Manages macrotiles for a draw.
++//////////////////////////////////////////////////////////////////////////
++class MacroTileMgr
++{
++public:
++    MacroTileMgr();
++    ~MacroTileMgr()
++    {
++        for (auto &tile : mTiles)
++        {
++            tile.second.destroy();
++        }
++    }
++
++    void initialize();
++    INLINE std::vector<uint32_t>& getDirtyTiles() { return mDirtyTiles; }
++    INLINE MacroTileQueue& getMacroTileQueue(uint32_t id) { return mTiles[id]; }
++    void markTileComplete(uint32_t id);
++
++    INLINE bool isWorkComplete()
++    {
++        return mWorkItemsProduced == mWorkItemsConsumed;
++    }
++
++    void enqueue(uint32_t x, uint32_t y, BE_WORK *pWork);
++
++    static INLINE void getTileIndices(uint32_t tileID, uint32_t &x, uint32_t &y)
++    {
++        y = tileID & 0xffff;
++        x = (tileID >> 16) & 0xffff;
++    }
++
++    void *operator new(size_t size);
++    void operator delete (void *p);
++
++private:
++    SWR_FORMAT mFormat;
++    std::unordered_map<uint32_t, MacroTileQueue> mTiles;
++
++    // Any tile that has work queued to it is a dirty tile.
++    std::vector<uint32_t> mDirtyTiles;
++
++    OSALIGNLINE(LONG) mWorkItemsProduced;
++    OSALIGNLINE(volatile LONG) mWorkItemsConsumed;
++};
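++// Illustrative sketch (disabled; not part of the build): macrotile IDs pack the (x, y) tile
++// coordinate into one 32-bit key, x in the high 16 bits and y in the low 16 bits, which is
++// exactly what getTileIndices() above inverts. A stand-alone round trip of the encoding:
++#if 0
++#include <cassert>
++#include <cstdint>
++
++void TileIdRoundTrip()
++{
++    uint32_t x = 3, y = 5;
++    uint32_t id = (x << 16) | y;          // TILE_ID(3, 5) == 0x00030005
++
++    uint32_t ux, uy;
++    MacroTileMgr::getTileIndices(id, ux, uy);
++    assert(ux == 3 && uy == 5);
++}
++#endif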
++
++//////////////////////////////////////////////////////////////////////////
++/// DispatchQueue - work queue for dispatch
++//////////////////////////////////////////////////////////////////////////
++class DispatchQueue
++{
++public:
++    DispatchQueue() {}
++
++    //////////////////////////////////////////////////////////////////////////
++    /// @brief Setup the producer consumer counts.
++    void initialize(uint32_t totalTasks, void* pTaskData)
++    {
++        // The available and outstanding counts start with total tasks.
++        // At the start there are N tasks available and outstanding.
++        // When both the available and outstanding counts have reached 0 then all work has completed.
++        // When a worker starts on a threadgroup then it decrements the available count.
++        // When a worker completes a threadgroup then it decrements the outstanding count.
++
++        mTasksAvailable = totalTasks;
++        mTasksOutstanding = totalTasks;
++
++        mpTaskData = pTaskData;
++    }
++
++    //////////////////////////////////////////////////////////////////////////
++    /// @brief Returns number of tasks available for this dispatch.
++    uint32_t getNumQueued()
++    {
++        return (mTasksAvailable > 0) ? mTasksAvailable : 0;
++    }
++
++    //////////////////////////////////////////////////////////////////////////
++    /// @brief Atomically decrement the work available count. If the result
++    ///        is greater than or equal to 0 then we can work on the associated
++    ///        thread group. Otherwise, there is no more work to do.
++    bool getWork(uint32_t& groupId)
++    {
++        LONG result = InterlockedDecrement(&mTasksAvailable);
++
++        if (result >= 0)
++        {
++            groupId = result;
++            return true;
++        }
++
++        return false;
++    }
++
++    //////////////////////////////////////////////////////////////////////////
++    /// @brief Atomically decrement the outstanding count. A worker is notifying
++    ///        us that it just finished some work. Also, return true if we're
++    ///        the last worker to complete this dispatch.
++    bool finishedWork()
++    {
++        LONG result = InterlockedDecrement(&mTasksOutstanding);
++        SWR_ASSERT(result >= 0, "Should never oversubscribe work");
++
++        return (result == 0) ? true : false;
++    }
++
++    //////////////////////////////////////////////////////////////////////////
++    /// @brief Work is complete once both the available/outstanding counts have reached 0.
++    bool isWorkComplete()
++    {
++        return ((mTasksAvailable <= 0) &&
++                (mTasksOutstanding <= 0));
++    }
++
++    //////////////////////////////////////////////////////////////////////////
++    /// @brief Return pointer to task data.
++    const void* GetTasksData()
++    {
++        return mpTaskData;
++    }
++
++    void *operator new(size_t size);
++    void operator delete (void *p);
++
++    void* mpTaskData;   // The API thread will set this up and the callback task function will interpret this.
++
++    OSALIGNLINE(volatile LONG) mTasksAvailable{ 0 };
++    OSALIGNLINE(volatile LONG) mTasksOutstanding{ 0 };
++};
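++// Illustrative sketch (disabled; not part of the build): the two counters above implement a
++// claim/retire protocol. Workers decrement 'available' to claim a group id and 'outstanding'
++// when a group retires; the decrement that reaches zero owns completion. A stand-alone model
++// using std::atomic in place of InterlockedDecrement; 'FakeDispatch' is a hypothetical name.
++#if 0
++#include <atomic>
++#include <cstdint>
++
++struct FakeDispatch
++{
++    std::atomic<int32_t> available{ 0 };    // tasks not yet claimed
++    std::atomic<int32_t> outstanding{ 0 };  // tasks claimed but not yet retired
++};
++
++bool GetWork(FakeDispatch& q, uint32_t& groupId)
++{
++    int32_t result = --q.available;         // returns the decremented value
++    if (result >= 0)
++    {
++        groupId = (uint32_t)result;         // claim thread group 'result'
++        return true;
++    }
++    return false;                           // nothing left to claim
++}
++
++bool FinishedWork(FakeDispatch& q)
++{
++    return --q.outstanding == 0;            // true for exactly the last retiring worker
++}
++#endif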
++
++
++enum HOTTILE_STATE
++{
++    HOTTILE_INVALID,    // tile is in uninitialized state and should be loaded with surface contents before rendering
++    HOTTILE_CLEAR,      // tile should be cleared
++    HOTTILE_DIRTY,      // tile has been rendered to
++    HOTTILE_RESOLVED,   // tile has been stored to memory
++};
++
++struct HOTTILE
++{
++    BYTE *pBuffer;
++    HOTTILE_STATE state;
++    DWORD clearData[4]; // May need to change based on pfnClearTile implementation. Reorder for alignment?
++    uint32_t numSamples;
++    uint32_t renderTargetArrayIndex;    // current render target array index loaded
++};
++
++union HotTileSet
++{
++    struct
++    {
++        HOTTILE Color[SWR_NUM_RENDERTARGETS];
++        HOTTILE Depth;
++        HOTTILE Stencil;
++    };
++    HOTTILE Attachment[SWR_NUM_ATTACHMENTS];
++};
++
++class HotTileMgr
++{
++public:
++    HotTileMgr()
++    {
++        memset(&mHotTiles[0][0], 0, sizeof(mHotTiles));
++
++        // cache hottile size
++        for (uint32_t i = SWR_ATTACHMENT_COLOR0; i <= SWR_ATTACHMENT_COLOR7; ++i)
++        {
++            mHotTileSize[i] = KNOB_MACROTILE_X_DIM * KNOB_MACROTILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8;
++        }
++        mHotTileSize[SWR_ATTACHMENT_DEPTH] = KNOB_MACROTILE_X_DIM * KNOB_MACROTILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8;
++        mHotTileSize[SWR_ATTACHMENT_STENCIL] = KNOB_MACROTILE_X_DIM * KNOB_MACROTILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8;
++    }
++
++    ~HotTileMgr()
++    {
++        for (int x = 0; x < KNOB_NUM_HOT_TILES_X; ++x)
++        {
++            for (int y = 0; y < KNOB_NUM_HOT_TILES_Y; ++y)
++            {
++                for (int a = 0; a < SWR_NUM_ATTACHMENTS; ++a)
++                {
++                    if (mHotTiles[x][y].Attachment[a].pBuffer != NULL)
++                    {
++                        _aligned_free(mHotTiles[x][y].Attachment[a].pBuffer);
++                        mHotTiles[x][y].Attachment[a].pBuffer = NULL;
++                    }
++                }
++            }
++        }
++    }
++
++    HOTTILE *GetHotTile(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID, SWR_RENDERTARGET_ATTACHMENT attachment, bool create, uint32_t numSamples = 1,
++        uint32_t renderTargetArrayIndex = 0)
++    {
++        uint32_t x, y;
++        MacroTileMgr::getTileIndices(macroID, x, y);
++
++        assert(x < KNOB_NUM_HOT_TILES_X);
++        assert(y < KNOB_NUM_HOT_TILES_Y);
++
++        HotTileSet &tile = mHotTiles[x][y];
++        HOTTILE& hotTile = tile.Attachment[attachment];
++        if (hotTile.pBuffer == NULL)
++        {
++            if (create)
++            {
++                uint32_t size = numSamples * mHotTileSize[attachment];
++                hotTile.pBuffer = (BYTE*)_aligned_malloc(size, KNOB_SIMD_WIDTH * 4);
++                hotTile.state = HOTTILE_INVALID;
++                hotTile.numSamples = numSamples;
++                hotTile.renderTargetArrayIndex = renderTargetArrayIndex;
++            }
++            else
++            {
++                return NULL;
++            }
++        }
++        else
++        {
++            // free the old tile and create a new one with enough space to hold all samples
++            if (numSamples > hotTile.numSamples)
++            {
++                // tile should be either uninitialized or resolved if we're deleting and switching to a
++                // new sample count
++                assert((hotTile.state == HOTTILE_INVALID) ||
++                       (hotTile.state == HOTTILE_RESOLVED));
++                _aligned_free(hotTile.pBuffer);
++
++                uint32_t size = numSamples * mHotTileSize[attachment];
++                hotTile.pBuffer = (BYTE*)_aligned_malloc(size, KNOB_SIMD_WIDTH * 4);
++                hotTile.state = HOTTILE_INVALID;
++                hotTile.numSamples = numSamples;
++            }
++
++            // if requested render target array index isn't currently loaded, need to store out the current hottile
++            // and load the requested array slice
++            if (renderTargetArrayIndex != hotTile.renderTargetArrayIndex)
++            {
++                SWR_FORMAT format;
++                switch (attachment)
++                {
++                case SWR_ATTACHMENT_COLOR0:
++                case SWR_ATTACHMENT_COLOR1:
++                case SWR_ATTACHMENT_COLOR2:
++                case SWR_ATTACHMENT_COLOR3:
++                case SWR_ATTACHMENT_COLOR4:
++                case SWR_ATTACHMENT_COLOR5:
++                case SWR_ATTACHMENT_COLOR6:
++                case SWR_ATTACHMENT_COLOR7: format = KNOB_COLOR_HOT_TILE_FORMAT; break;
++                case SWR_ATTACHMENT_DEPTH: format = KNOB_DEPTH_HOT_TILE_FORMAT; break;
++                case SWR_ATTACHMENT_STENCIL: format = KNOB_STENCIL_HOT_TILE_FORMAT; break;
++                default: SWR_ASSERT(false, "Unknown attachment: %d", attachment); format = KNOB_COLOR_HOT_TILE_FORMAT; break;
++                }
++
++                if (hotTile.state == HOTTILE_DIRTY)
++                {
++                    pContext->pfnStoreTile(GetPrivateState(pDC), format, attachment,
++                        x * KNOB_MACROTILE_X_DIM, y * KNOB_MACROTILE_Y_DIM, hotTile.renderTargetArrayIndex, hotTile.pBuffer);
++                }
++
++                pContext->pfnLoadTile(GetPrivateState(pDC), format, attachment,
++                    x * KNOB_MACROTILE_X_DIM, y * KNOB_MACROTILE_Y_DIM, renderTargetArrayIndex, hotTile.pBuffer);
++
++                hotTile.renderTargetArrayIndex = renderTargetArrayIndex;
++                hotTile.state = HOTTILE_DIRTY;
++            }
++        }
++        return &tile.Attachment[attachment];
++    }
++
++    HotTileSet &GetHotTile(uint32_t macroID)
++    {
++        uint32_t x, y;
++        MacroTileMgr::getTileIndices(macroID, x, y);
++        assert(x < KNOB_NUM_HOT_TILES_X);
++        assert(y < KNOB_NUM_HOT_TILES_Y);
++
++        return mHotTiles[x][y];
++    }
++
++private:
++    HotTileSet mHotTiles[KNOB_NUM_HOT_TILES_X][KNOB_NUM_HOT_TILES_Y];
++    uint32_t mHotTileSize[SWR_NUM_ATTACHMENTS];
++};
++
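++// Illustrative sketch (disabled; not part of the build): the HOTTILE_STATE values above form
++// a small lifecycle -- INVALID tiles are loaded before rendering, CLEAR tiles start from a
++// clear color, rendering marks a tile DIRTY, and storing it back marks it RESOLVED.
++// 'StoreIfDirty' is a hypothetical helper mirroring the store step in GetHotTile():
++#if 0
++HOTTILE_STATE StoreIfDirty(HOTTILE& ht)
++{
++    if (ht.state == HOTTILE_DIRTY)
++    {
++        // pContext->pfnStoreTile(...) would write ht.pBuffer back to the surface here
++        ht.state = HOTTILE_RESOLVED;
++    }
++    return ht.state;
++}
++#endif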
+diff --git a/src/gallium/drivers/swr/rasterizer/core/utils.cpp b/src/gallium/drivers/swr/rasterizer/core/utils.cpp
+new file mode 100644
+index 0000000..f36452f
+--- /dev/null
++++ b/src/gallium/drivers/swr/rasterizer/core/utils.cpp
+@@ -0,0 +1,148 @@
++/****************************************************************************
++* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
++*
++* Permission is hereby granted, free of charge, to any person obtaining a
++* copy of this software and associated documentation files (the "Software"),
++* to deal in the Software without restriction, including without limitation
++* the rights to use, copy, modify, merge, publish, distribute, sublicense,
++* and/or sell copies of the Software, and to permit persons to whom the
++* Software is furnished to do so, subject to the following conditions:
++*
++* The above copyright notice and this permission notice (including the next
++* paragraph) shall be included in all copies or substantial portions of the
++* Software.
++*
++* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
++* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++* IN THE SOFTWARE.
++*
++* @file utils.cpp
++*
++* @brief Utilities used by SWR core.
++*
++******************************************************************************/
++#if defined(_WIN32)
++
++#include <Windows.h>
++#include <Gdiplus.h>
++#include <cstdint>
++#include <cstdlib>
++
++using namespace Gdiplus;
++
++int GetEncoderClsid(const WCHAR* format, CLSID* pClsid)
++{
++    uint32_t num = 0;    // number of image encoders
++    uint32_t size = 0;   // size of the image encoder array in bytes
++
++    ImageCodecInfo* pImageCodecInfo = nullptr;
++
++    GetImageEncodersSize(&num, &size);
++    if (size == 0)
++        return -1;  // Failure
++
++    pImageCodecInfo = (ImageCodecInfo*)(malloc(size));
++    if (pImageCodecInfo == nullptr)
++        return -1;  // Failure
++
++    GetImageEncoders(num, size, pImageCodecInfo);
++
++    for (uint32_t j = 0; j < num; ++j)
++    {
++        if (wcscmp(pImageCodecInfo[j].MimeType, format) == 0)
++        {
++            *pClsid = pImageCodecInfo[j].Clsid;
++            free(pImageCodecInfo);
++            return j;  // Success
++        }
++    }
++
++    free(pImageCodecInfo);
++    return -1;  // Failure
++}
++
++void SaveImageToPNGFile(
++    const WCHAR *pFilename,
++    void *pBuffer,
++    uint32_t width,
++    uint32_t height)
++{
++    // dump pixels to a png
++    // Initialize GDI+.
++ GdiplusStartupInput gdiplusStartupInput; ++ ULONG_PTR gdiplusToken; ++ GdiplusStartup(&gdiplusToken, &gdiplusStartupInput, nullptr); ++ ++ Bitmap *bitmap = new Bitmap(width, height); ++ BYTE *pBytes = (BYTE*)pBuffer; ++ static const uint32_t bytesPerPixel = 4; ++ for (uint32_t y = 0; y < height; ++y) ++ for (uint32_t x = 0; x < width; ++x) ++ { ++ uint32_t pixel = *(uint32_t*)pBytes; ++ if (pixel == 0xcdcdcdcd) ++ { ++ pixel = 0xFFFF00FF; ++ } ++ else if (pixel == 0xdddddddd) ++ { ++ pixel = 0x80FF0000; ++ } ++ else ++ { ++ pixel |= 0xFF000000; ++ } ++ Color color(pixel); ++ bitmap->SetPixel(x, y, color); ++ pBytes += bytesPerPixel; ++ } ++ ++ // Save image. ++ CLSID pngClsid; ++ GetEncoderClsid(L"image/png", &pngClsid); ++ bitmap->Save(pFilename, &pngClsid, nullptr); ++ ++ delete bitmap; ++ ++ GdiplusShutdown(gdiplusToken); ++} ++ ++void OpenBitmapFromFile( ++ const WCHAR *pFilename, ++ void **pBuffer, ++ uint32_t *width, ++ uint32_t *height) ++{ ++ GdiplusStartupInput gdiplusStartupInput; ++ ULONG_PTR gdiplusToken; ++ GdiplusStartup(&gdiplusToken, &gdiplusStartupInput, nullptr); ++ ++ Bitmap *bitmap = new Bitmap(pFilename); ++ ++ *width = bitmap->GetWidth(); ++ *height = bitmap->GetHeight(); ++ *pBuffer = new BYTE[*width * *height * 4]; // width * height * |RGBA| ++ ++ // The folder 'stb_image' contains a PNG open/close module which ++ // is far less painful than this is, yo. ++ Gdiplus::Color clr; ++ for (uint32_t y = 0, idx = 0; y < *height; ++y) ++ { ++ for (uint32_t x = 0; x < *width; ++x, idx += 4) ++ { ++ bitmap->GetPixel(x, *height - y - 1, &clr); ++ ((BYTE*)*pBuffer)[idx + 0] = clr.GetBlue(); ++ ((BYTE*)*pBuffer)[idx + 1] = clr.GetGreen(); ++ ((BYTE*)*pBuffer)[idx + 2] = clr.GetRed(); ++ ((BYTE*)*pBuffer)[idx + 3] = clr.GetAlpha(); ++ } ++ } ++ ++ delete bitmap; ++ bitmap = 0; ++} ++#endif +diff --git a/src/gallium/drivers/swr/rasterizer/core/utils.h b/src/gallium/drivers/swr/rasterizer/core/utils.h +new file mode 100644 +index 0000000..63d6ca1 +--- /dev/null ++++ b/src/gallium/drivers/swr/rasterizer/core/utils.h +@@ -0,0 +1,745 @@ ++/**************************************************************************** ++* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. ++* ++* Permission is hereby granted, free of charge, to any person obtaining a ++* copy of this software and associated documentation files (the "Software"), ++* to deal in the Software without restriction, including without limitation ++* the rights to use, copy, modify, merge, publish, distribute, sublicense, ++* and/or sell copies of the Software, and to permit persons to whom the ++* Software is furnished to do so, subject to the following conditions: ++* ++* The above copyright notice and this permission notice (including the next ++* paragraph) shall be included in all copies or substantial portions of the ++* Software. ++* ++* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++* IN THE SOFTWARE. ++* ++* @file utils.h ++* ++* @brief Utilities used by SWR core. 
++*
++******************************************************************************/
++#pragma once
++
++#include <string.h>
++#include "common/os.h"
++#include "common/simdintrin.h"
++#include "common/swr_assert.h"
++
++#if defined(_WIN32)
++void SaveImageToPNGFile(
++    const WCHAR *pFilename,
++    void *pBuffer,
++    uint32_t width,
++    uint32_t height);
++
++void OpenBitmapFromFile(
++    const WCHAR *pFilename,
++    void **pBuffer,
++    uint32_t *width,
++    uint32_t *height);
++#endif
++
++/// @todo assume linux is always 64 bit
++#if defined(_WIN64) || defined(__linux__) || defined(__gnu_linux__)
++#define _MM_INSERT_EPI64 _mm_insert_epi64
++#define _MM_EXTRACT_EPI64 _mm_extract_epi64
++#else
++INLINE INT64 _MM_EXTRACT_EPI64(__m128i a, const int32_t ndx)
++{
++    OSALIGNLINE(uint32_t) elems[4];
++    _mm_store_si128((__m128i*)elems, a);
++    if (ndx == 0)
++    {
++        uint64_t foo = elems[0];
++        foo |= (uint64_t)elems[1] << 32;
++        return foo;
++    }
++    else
++    {
++        uint64_t foo = elems[2];
++        foo |= (uint64_t)elems[3] << 32;
++        return foo;
++    }
++}
++
++INLINE __m128i _MM_INSERT_EPI64(__m128i a, INT64 b, const int32_t ndx)
++{
++    OSALIGNLINE(int64_t) elems[2];
++    _mm_store_si128((__m128i*)elems, a);
++    if (ndx == 0)
++    {
++        elems[0] = b;
++    }
++    else
++    {
++        elems[1] = b;
++    }
++    __m128i out;
++    out = _mm_load_si128((const __m128i*)elems);
++    return out;
++}
++#endif
++
++OSALIGNLINE(struct) BBOX
++{
++    int top, bottom, left, right;
++
++    BBOX() {}
++    BBOX(int t, int b, int l, int r) : top(t), bottom(b), left(l), right(r) {}
++
++    bool operator==(const BBOX& rhs)
++    {
++        return (this->top == rhs.top &&
++                this->bottom == rhs.bottom &&
++                this->left == rhs.left &&
++                this->right == rhs.right);
++    }
++
++    bool operator!=(const BBOX& rhs)
++    {
++        return !(*this == rhs);
++    }
++};
++
++struct simdBBox
++{
++    simdscalari top, bottom, left, right;
++};
++
++INLINE
++void vTranspose(__m128 &row0, __m128 &row1, __m128 &row2, __m128 &row3)
++{
++    __m128i row0i = _mm_castps_si128(row0);
++    __m128i row1i = _mm_castps_si128(row1);
++    __m128i row2i = _mm_castps_si128(row2);
++    __m128i row3i = _mm_castps_si128(row3);
++
++    __m128i vTemp = row2i;
++    row2i = _mm_unpacklo_epi32(row2i, row3i);
++    vTemp = _mm_unpackhi_epi32(vTemp, row3i);
++
++    row3i = row0i;
++    row0i = _mm_unpacklo_epi32(row0i, row1i);
++    row3i = _mm_unpackhi_epi32(row3i, row1i);
++
++    row1i = row0i;
++    row0i = _mm_unpacklo_epi64(row0i, row2i);
++    row1i = _mm_unpackhi_epi64(row1i, row2i);
++
++    row2i = row3i;
++    row2i = _mm_unpacklo_epi64(row2i, vTemp);
++    row3i = _mm_unpackhi_epi64(row3i, vTemp);
++
++    row0 = _mm_castsi128_ps(row0i);
++    row1 = _mm_castsi128_ps(row1i);
++    row2 = _mm_castsi128_ps(row2i);
++    row3 = _mm_castsi128_ps(row3i);
++}
++
++INLINE
++void vTranspose(__m128i &row0, __m128i &row1, __m128i &row2, __m128i &row3)
++{
++    __m128i vTemp = row2;
++    row2 = _mm_unpacklo_epi32(row2, row3);
++    vTemp = _mm_unpackhi_epi32(vTemp, row3);
++
++    row3 = row0;
++    row0 = _mm_unpacklo_epi32(row0, row1);
++    row3 = _mm_unpackhi_epi32(row3, row1);
++
++    row1 = row0;
++    row0 = _mm_unpacklo_epi64(row0, row2);
++    row1 = _mm_unpackhi_epi64(row1, row2);
++
++    row2 = row3;
++    row2 = _mm_unpacklo_epi64(row2, vTemp);
++    row3 = _mm_unpackhi_epi64(row3, vTemp);
++}
++
++#define GCC_VERSION (__GNUC__ * 10000 \
++                   + __GNUC_MINOR__ * 100 \
++                   + __GNUC_PATCHLEVEL__)
++
++#if defined(__GNUC__) && (GCC_VERSION < 40900)
++#define _mm_undefined_ps _mm_setzero_ps
++#define _mm_undefined_si128 _mm_setzero_si128
++#if KNOB_SIMD_WIDTH == 8
++#define _mm256_undefined_ps _mm256_setzero_ps
++#endif
++#endif
++
++#if KNOB_SIMD_WIDTH == 8
++INLINE
++void vTranspose3x8(__m128 (&vDst)[8], __m256 &vSrc0, __m256 &vSrc1, __m256 &vSrc2)
++{
++    __m256 r0r2 = _mm256_unpacklo_ps(vSrc0, vSrc2);                  //x0z0x1z1 x4z4x5z5
++    __m256 r1rx = _mm256_unpacklo_ps(vSrc1, _mm256_undefined_ps());  //y0w0y1w1 y4w4y5w5
++    __m256 r02r1xlolo = _mm256_unpacklo_ps(r0r2, r1rx);              //x0y0z0w0 x4y4z4w4
++    __m256 r02r1xlohi = _mm256_unpackhi_ps(r0r2, r1rx);              //x1y1z1w1 x5y5z5w5
++
++    r0r2 = _mm256_unpackhi_ps(vSrc0, vSrc2);                         //x2z2x3z3 x6z6x7z7
++    r1rx = _mm256_unpackhi_ps(vSrc1, _mm256_undefined_ps());         //y2w2y3w3 y6w6y7w7
++    __m256 r02r1xhilo = _mm256_unpacklo_ps(r0r2, r1rx);              //x2y2z2w2 x6y6z6w6
++    __m256 r02r1xhihi = _mm256_unpackhi_ps(r0r2, r1rx);              //x3y3z3w3 x7y7z7w7
++
++    vDst[0] = _mm256_castps256_ps128(r02r1xlolo);
++    vDst[1] = _mm256_castps256_ps128(r02r1xlohi);
++    vDst[2] = _mm256_castps256_ps128(r02r1xhilo);
++    vDst[3] = _mm256_castps256_ps128(r02r1xhihi);
++
++    vDst[4] = _mm256_extractf128_ps(r02r1xlolo, 1);
++    vDst[5] = _mm256_extractf128_ps(r02r1xlohi, 1);
++    vDst[6] = _mm256_extractf128_ps(r02r1xhilo, 1);
++    vDst[7] = _mm256_extractf128_ps(r02r1xhihi, 1);
++}
++
++INLINE
++void vTranspose4x8(__m128 (&vDst)[8], __m256 &vSrc0, __m256 &vSrc1, __m256 &vSrc2, __m256 &vSrc3)
++{
++    __m256 r0r2 = _mm256_unpacklo_ps(vSrc0, vSrc2);      //x0z0x1z1 x4z4x5z5
++    __m256 r1rx = _mm256_unpacklo_ps(vSrc1, vSrc3);      //y0w0y1w1 y4w4y5w5
++    __m256 r02r1xlolo = _mm256_unpacklo_ps(r0r2, r1rx);  //x0y0z0w0 x4y4z4w4
++    __m256 r02r1xlohi = _mm256_unpackhi_ps(r0r2, r1rx);  //x1y1z1w1 x5y5z5w5
++
++    r0r2 = _mm256_unpackhi_ps(vSrc0, vSrc2);             //x2z2x3z3 x6z6x7z7
++    r1rx = _mm256_unpackhi_ps(vSrc1, vSrc3);             //y2w2y3w3 y6w6y7w7
++    __m256 r02r1xhilo = _mm256_unpacklo_ps(r0r2, r1rx);  //x2y2z2w2 x6y6z6w6
++    __m256 r02r1xhihi = _mm256_unpackhi_ps(r0r2, r1rx);  //x3y3z3w3 x7y7z7w7
++
++    vDst[0] = _mm256_castps256_ps128(r02r1xlolo);
++    vDst[1] = _mm256_castps256_ps128(r02r1xlohi);
++    vDst[2] = _mm256_castps256_ps128(r02r1xhilo);
++    vDst[3] = _mm256_castps256_ps128(r02r1xhihi);
++
++    vDst[4] = _mm256_extractf128_ps(r02r1xlolo, 1);
++    vDst[5] = _mm256_extractf128_ps(r02r1xlohi, 1);
++    vDst[6] = _mm256_extractf128_ps(r02r1xhilo, 1);
++    vDst[7] = _mm256_extractf128_ps(r02r1xhihi, 1);
++}
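++// Illustrative sketch (disabled; not part of the build): vTranspose4x8() above converts four
++// SOA registers (all x, all y, all z, all w for 8 vertices) into eight AOS xyzw vectors.
++// A stand-alone call site with arbitrary values; 'ExampleSoaToAos' is a hypothetical name.
++#if 0
++void ExampleSoaToAos()
++{
++    __m256 xs = _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0);         // x of vertices 7..0
++    __m256 ys = _mm256_set_ps(70, 60, 50, 40, 30, 20, 10, 0);  // y of vertices 7..0
++    __m256 zs = _mm256_setzero_ps();
++    __m256 ws = _mm256_set1_ps(1.0f);
++
++    __m128 aos[8];   // after the call, aos[i] = (x_i, y_i, z_i, w_i)
++    vTranspose4x8(aos, xs, ys, zs, ws);
++}
++#endif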
++
++INLINE
++void vTranspose8x8(__m256 (&vDst)[8], const __m256 &vMask0, const __m256 &vMask1, const __m256 &vMask2, const __m256 &vMask3, const __m256 &vMask4, const __m256 &vMask5, const __m256 &vMask6, const __m256 &vMask7)
++{
++    __m256 __t0 = _mm256_unpacklo_ps(vMask0, vMask1);
++    __m256 __t1 = _mm256_unpackhi_ps(vMask0, vMask1);
++    __m256 __t2 = _mm256_unpacklo_ps(vMask2, vMask3);
++    __m256 __t3 = _mm256_unpackhi_ps(vMask2, vMask3);
++    __m256 __t4 = _mm256_unpacklo_ps(vMask4, vMask5);
++    __m256 __t5 = _mm256_unpackhi_ps(vMask4, vMask5);
++    __m256 __t6 = _mm256_unpacklo_ps(vMask6, vMask7);
++    __m256 __t7 = _mm256_unpackhi_ps(vMask6, vMask7);
++    __m256 __tt0 = _mm256_shuffle_ps(__t0, __t2, _MM_SHUFFLE(1,0,1,0));
++    __m256 __tt1 = _mm256_shuffle_ps(__t0, __t2, _MM_SHUFFLE(3,2,3,2));
++    __m256 __tt2 = _mm256_shuffle_ps(__t1, __t3, _MM_SHUFFLE(1,0,1,0));
++    __m256 __tt3 = _mm256_shuffle_ps(__t1, __t3, _MM_SHUFFLE(3,2,3,2));
++    __m256 __tt4 = _mm256_shuffle_ps(__t4, __t6, _MM_SHUFFLE(1,0,1,0));
++    __m256 __tt5 = _mm256_shuffle_ps(__t4, __t6, _MM_SHUFFLE(3,2,3,2));
++    __m256 __tt6 = _mm256_shuffle_ps(__t5, __t7, _MM_SHUFFLE(1,0,1,0));
++    __m256 __tt7 = _mm256_shuffle_ps(__t5, __t7, _MM_SHUFFLE(3,2,3,2));
++    vDst[0] = _mm256_permute2f128_ps(__tt0, __tt4, 0x20);
++    vDst[1] = _mm256_permute2f128_ps(__tt1, __tt5, 0x20);
++    vDst[2] = _mm256_permute2f128_ps(__tt2, __tt6, 0x20);
++    vDst[3] = _mm256_permute2f128_ps(__tt3, __tt7, 0x20);
++    vDst[4] = _mm256_permute2f128_ps(__tt0, __tt4, 0x31);
++    vDst[5] = _mm256_permute2f128_ps(__tt1, __tt5, 0x31);
++    vDst[6] = _mm256_permute2f128_ps(__tt2, __tt6, 0x31);
++    vDst[7] = _mm256_permute2f128_ps(__tt3, __tt7, 0x31);
++}
++
++INLINE
++void vTranspose8x8(__m256 (&vDst)[8], const __m256i &vMask0, const __m256i &vMask1, const __m256i &vMask2, const __m256i &vMask3, const __m256i &vMask4, const __m256i &vMask5, const __m256i &vMask6, const __m256i &vMask7)
++{
++    vTranspose8x8(vDst, _mm256_castsi256_ps(vMask0), _mm256_castsi256_ps(vMask1), _mm256_castsi256_ps(vMask2), _mm256_castsi256_ps(vMask3),
++        _mm256_castsi256_ps(vMask4), _mm256_castsi256_ps(vMask5), _mm256_castsi256_ps(vMask6), _mm256_castsi256_ps(vMask7));
++}
++#endif
++
++//////////////////////////////////////////////////////////////////////////
++/// TransposeSingleComponent
++//////////////////////////////////////////////////////////////////////////
++template<uint32_t bpp>
++struct TransposeSingleComponent
++{
++    //////////////////////////////////////////////////////////////////////////
++    /// @brief Pass-thru for single component.
++    /// @param pSrc - source data in SOA form
++    /// @param pDst - output data in AOS form
++    INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
++    {
++        memcpy(pDst, pSrc, (bpp * KNOB_SIMD_WIDTH) / 8);
++    }
++};
++
++//////////////////////////////////////////////////////////////////////////
++/// Transpose8_8_8_8
++//////////////////////////////////////////////////////////////////////////
++struct Transpose8_8_8_8
++{
++    //////////////////////////////////////////////////////////////////////////
++    /// @brief Performs an SOA to AOS conversion for packed 8_8_8_8 data.
++ /// @param pSrc - source data in SOA form ++ /// @param pDst - output data in AOS form ++ INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) ++ { ++ simdscalari src = _simd_load_si((const simdscalari*)pSrc); ++#if KNOB_SIMD_WIDTH == 8 ++#if KNOB_ARCH == KNOB_ARCH_AVX ++ __m128i c0c1 = _mm256_castsi256_si128(src); // rrrrrrrrgggggggg ++ __m128i c2c3 = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(src), 1)); // bbbbbbbbaaaaaaaa ++ __m128i c0c2 = _mm_unpacklo_epi64(c0c1, c2c3); // rrrrrrrrbbbbbbbb ++ __m128i c1c3 = _mm_unpackhi_epi64(c0c1, c2c3); // ggggggggaaaaaaaa ++ __m128i c01 = _mm_unpacklo_epi8(c0c2, c1c3); // rgrgrgrgrgrgrgrg ++ __m128i c23 = _mm_unpackhi_epi8(c0c2, c1c3); // babababababababa ++ __m128i c0123lo = _mm_unpacklo_epi16(c01, c23); // rgbargbargbargba ++ __m128i c0123hi = _mm_unpackhi_epi16(c01, c23); // rgbargbargbargba ++ _mm_store_si128((__m128i*)pDst, c0123lo); ++ _mm_store_si128((__m128i*)(pDst + 16), c0123hi); ++#elif KNOB_ARCH == KNOB_ARCH_AVX2 ++ simdscalari dst01 = _mm256_shuffle_epi8(src, ++ _mm256_set_epi32(0x0f078080, 0x0e068080, 0x0d058080, 0x0c048080, 0x80800b03, 0x80800a02, 0x80800901, 0x80800800)); ++ simdscalari dst23 = _mm256_permute2x128_si256(src, src, 0x01); ++ dst23 = _mm256_shuffle_epi8(dst23, ++ _mm256_set_epi32(0x80800f07, 0x80800e06, 0x80800d05, 0x80800c04, 0x0b038080, 0x0a028080, 0x09018080, 0x08008080)); ++ simdscalari dst = _mm256_or_si256(dst01, dst23); ++ _simd_store_si((simdscalari*)pDst, dst); ++#endif ++#else ++#error Unsupported vector width ++#endif ++ } ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// Transpose8_8_8 ++////////////////////////////////////////////////////////////////////////// ++struct Transpose8_8_8 ++{ ++ ////////////////////////////////////////////////////////////////////////// ++ /// @brief Performs an SOA to AOS conversion for packed 8_8_8 data. ++ /// @param pSrc - source data in SOA form ++ /// @param pDst - output data in AOS form ++ INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// Transpose8_8 ++////////////////////////////////////////////////////////////////////////// ++struct Transpose8_8 ++{ ++ ////////////////////////////////////////////////////////////////////////// ++ /// @brief Performs an SOA to AOS conversion for packed 8_8 data. ++ /// @param pSrc - source data in SOA form ++ /// @param pDst - output data in AOS form ++ INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) ++ { ++ simdscalari src = _simd_load_si((const simdscalari*)pSrc); ++ ++#if KNOB_SIMD_WIDTH == 8 ++ __m128i rg = _mm256_castsi256_si128(src); // rrrrrrrr gggggggg ++ __m128i g = _mm_unpackhi_epi64(rg, rg); // gggggggg gggggggg ++ rg = _mm_unpacklo_epi8(rg, g); ++ _mm_store_si128((__m128i*)pDst, rg); ++#else ++#error Unsupported vector width ++#endif ++ } ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// Transpose32_32_32_32 ++////////////////////////////////////////////////////////////////////////// ++struct Transpose32_32_32_32 ++{ ++ ////////////////////////////////////////////////////////////////////////// ++ /// @brief Performs an SOA to AOS conversion for packed 32_32_32_32 data. 
++ /// @param pSrc - source data in SOA form ++ /// @param pDst - output data in AOS form ++ INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) ++ { ++#if KNOB_SIMD_WIDTH == 8 ++ simdscalar src0 = _simd_load_ps((const float*)pSrc); ++ simdscalar src1 = _simd_load_ps((const float*)pSrc + 8); ++ simdscalar src2 = _simd_load_ps((const float*)pSrc + 16); ++ simdscalar src3 = _simd_load_ps((const float*)pSrc + 24); ++ ++ __m128 vDst[8]; ++ vTranspose4x8(vDst, src0, src1, src2, src3); ++ _mm_store_ps((float*)pDst, vDst[0]); ++ _mm_store_ps((float*)pDst+4, vDst[1]); ++ _mm_store_ps((float*)pDst+8, vDst[2]); ++ _mm_store_ps((float*)pDst+12, vDst[3]); ++ _mm_store_ps((float*)pDst+16, vDst[4]); ++ _mm_store_ps((float*)pDst+20, vDst[5]); ++ _mm_store_ps((float*)pDst+24, vDst[6]); ++ _mm_store_ps((float*)pDst+28, vDst[7]); ++#else ++#error Unsupported vector width ++#endif ++ } ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// Transpose32_32_32 ++////////////////////////////////////////////////////////////////////////// ++struct Transpose32_32_32 ++{ ++ ////////////////////////////////////////////////////////////////////////// ++ /// @brief Performs an SOA to AOS conversion for packed 32_32_32 data. ++ /// @param pSrc - source data in SOA form ++ /// @param pDst - output data in AOS form ++ INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) ++ { ++#if KNOB_SIMD_WIDTH == 8 ++ simdscalar src0 = _simd_load_ps((const float*)pSrc); ++ simdscalar src1 = _simd_load_ps((const float*)pSrc + 8); ++ simdscalar src2 = _simd_load_ps((const float*)pSrc + 16); ++ ++ __m128 vDst[8]; ++ vTranspose3x8(vDst, src0, src1, src2); ++ _mm_store_ps((float*)pDst, vDst[0]); ++ _mm_store_ps((float*)pDst + 4, vDst[1]); ++ _mm_store_ps((float*)pDst + 8, vDst[2]); ++ _mm_store_ps((float*)pDst + 12, vDst[3]); ++ _mm_store_ps((float*)pDst + 16, vDst[4]); ++ _mm_store_ps((float*)pDst + 20, vDst[5]); ++ _mm_store_ps((float*)pDst + 24, vDst[6]); ++ _mm_store_ps((float*)pDst + 28, vDst[7]); ++#else ++#error Unsupported vector width ++#endif ++ } ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// Transpose32_32 ++////////////////////////////////////////////////////////////////////////// ++struct Transpose32_32 ++{ ++ ////////////////////////////////////////////////////////////////////////// ++ /// @brief Performs an SOA to AOS conversion for packed 32_32 data. 
++ /// @param pSrc - source data in SOA form ++ /// @param pDst - output data in AOS form ++ INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) ++ { ++ const float* pfSrc = (const float*)pSrc; ++ __m128 src_r0 = _mm_load_ps(pfSrc + 0); ++ __m128 src_r1 = _mm_load_ps(pfSrc + 4); ++ __m128 src_g0 = _mm_load_ps(pfSrc + 8); ++ __m128 src_g1 = _mm_load_ps(pfSrc + 12); ++ ++ __m128 dst0 = _mm_unpacklo_ps(src_r0, src_g0); ++ __m128 dst1 = _mm_unpackhi_ps(src_r0, src_g0); ++ __m128 dst2 = _mm_unpacklo_ps(src_r1, src_g1); ++ __m128 dst3 = _mm_unpackhi_ps(src_r1, src_g1); ++ ++ float* pfDst = (float*)pDst; ++ _mm_store_ps(pfDst + 0, dst0); ++ _mm_store_ps(pfDst + 4, dst1); ++ _mm_store_ps(pfDst + 8, dst2); ++ _mm_store_ps(pfDst + 12, dst3); ++ } ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// Transpose16_16_16_16 ++////////////////////////////////////////////////////////////////////////// ++struct Transpose16_16_16_16 ++{ ++ ////////////////////////////////////////////////////////////////////////// ++ /// @brief Performs an SOA to AOS conversion for packed 16_16_16_16 data. ++ /// @param pSrc - source data in SOA form ++ /// @param pDst - output data in AOS form ++ INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) ++ { ++#if KNOB_SIMD_WIDTH == 8 ++ simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc); ++ simdscalari src_ba = _simd_load_si((const simdscalari*)(pSrc + sizeof(simdscalari))); ++ ++ __m128i src_r = _mm256_extractf128_si256(src_rg, 0); ++ __m128i src_g = _mm256_extractf128_si256(src_rg, 1); ++ __m128i src_b = _mm256_extractf128_si256(src_ba, 0); ++ __m128i src_a = _mm256_extractf128_si256(src_ba, 1); ++ ++ __m128i rg0 = _mm_unpacklo_epi16(src_r, src_g); ++ __m128i rg1 = _mm_unpackhi_epi16(src_r, src_g); ++ __m128i ba0 = _mm_unpacklo_epi16(src_b, src_a); ++ __m128i ba1 = _mm_unpackhi_epi16(src_b, src_a); ++ ++ __m128i dst0 = _mm_unpacklo_epi32(rg0, ba0); ++ __m128i dst1 = _mm_unpackhi_epi32(rg0, ba0); ++ __m128i dst2 = _mm_unpacklo_epi32(rg1, ba1); ++ __m128i dst3 = _mm_unpackhi_epi32(rg1, ba1); ++ ++ _mm_store_si128(((__m128i*)pDst) + 0, dst0); ++ _mm_store_si128(((__m128i*)pDst) + 1, dst1); ++ _mm_store_si128(((__m128i*)pDst) + 2, dst2); ++ _mm_store_si128(((__m128i*)pDst) + 3, dst3); ++#else ++#error Unsupported vector width ++#endif ++ } ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// Transpose16_16_16 ++////////////////////////////////////////////////////////////////////////// ++struct Transpose16_16_16 ++{ ++ ////////////////////////////////////////////////////////////////////////// ++ /// @brief Performs an SOA to AOS conversion for packed 16_16_16 data. 
++ /// @param pSrc - source data in SOA form ++ /// @param pDst - output data in AOS form ++ INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) ++ { ++#if KNOB_SIMD_WIDTH == 8 ++ simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc); ++ ++ __m128i src_r = _mm256_extractf128_si256(src_rg, 0); ++ __m128i src_g = _mm256_extractf128_si256(src_rg, 1); ++ __m128i src_b = _mm_load_si128((const __m128i*)(pSrc + sizeof(simdscalari))); ++ __m128i src_a = _mm_undefined_si128(); ++ ++ __m128i rg0 = _mm_unpacklo_epi16(src_r, src_g); ++ __m128i rg1 = _mm_unpackhi_epi16(src_r, src_g); ++ __m128i ba0 = _mm_unpacklo_epi16(src_b, src_a); ++ __m128i ba1 = _mm_unpackhi_epi16(src_b, src_a); ++ ++ __m128i dst0 = _mm_unpacklo_epi32(rg0, ba0); ++ __m128i dst1 = _mm_unpackhi_epi32(rg0, ba0); ++ __m128i dst2 = _mm_unpacklo_epi32(rg1, ba1); ++ __m128i dst3 = _mm_unpackhi_epi32(rg1, ba1); ++ ++ _mm_store_si128(((__m128i*)pDst) + 0, dst0); ++ _mm_store_si128(((__m128i*)pDst) + 1, dst1); ++ _mm_store_si128(((__m128i*)pDst) + 2, dst2); ++ _mm_store_si128(((__m128i*)pDst) + 3, dst3); ++#else ++#error Unsupported vector width ++#endif ++ } ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// Transpose16_16 ++////////////////////////////////////////////////////////////////////////// ++struct Transpose16_16 ++{ ++ ////////////////////////////////////////////////////////////////////////// ++ /// @brief Performs an SOA to AOS conversion for packed 16_16 data. ++ /// @param pSrc - source data in SOA form ++ /// @param pDst - output data in AOS form ++ INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) ++ { ++ simdscalar src = _simd_load_ps((const float*)pSrc); ++ ++#if KNOB_SIMD_WIDTH == 8 ++ __m128 comp0 = _mm256_castps256_ps128(src); ++ __m128 comp1 = _mm256_extractf128_ps(src, 1); ++ ++ __m128i comp0i = _mm_castps_si128(comp0); ++ __m128i comp1i = _mm_castps_si128(comp1); ++ ++ __m128i resLo = _mm_unpacklo_epi16(comp0i, comp1i); ++ __m128i resHi = _mm_unpackhi_epi16(comp0i, comp1i); ++ ++ _mm_store_si128((__m128i*)pDst, resLo); ++ _mm_store_si128((__m128i*)pDst + 1, resHi); ++#else ++#error Unsupported vector width ++#endif ++ } ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// Transpose4_4_4_4 ++////////////////////////////////////////////////////////////////////////// ++struct Transpose4_4_4_4 ++{ ++ ////////////////////////////////////////////////////////////////////////// ++ /// @brief Performs an SOA to AOS conversion for packed 4_4_4_4 data. ++ /// @param pSrc - source data in SOA form ++ /// @param pDst - output data in AOS form ++ static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// Transpose5_6_5 ++////////////////////////////////////////////////////////////////////////// ++struct Transpose5_6_5 ++{ ++ ////////////////////////////////////////////////////////////////////////// ++ /// @brief Performs an SOA to AOS conversion for packed 5_6_5 data. 
++ /// @param pSrc - source data in SOA form ++ /// @param pDst - output data in AOS form ++ static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// Transpose9_9_9_5 ++////////////////////////////////////////////////////////////////////////// ++struct Transpose9_9_9_5 ++{ ++ ////////////////////////////////////////////////////////////////////////// ++ /// @brief Performs an SOA to AOS conversion for packed 9_9_9_5 data. ++ /// @param pSrc - source data in SOA form ++ /// @param pDst - output data in AOS form ++ static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// Transpose5_5_5_1 ++////////////////////////////////////////////////////////////////////////// ++struct Transpose5_5_5_1 ++{ ++ ////////////////////////////////////////////////////////////////////////// ++ /// @brief Performs an SOA to AOS conversion for packed 5_5_5_1 data. ++ /// @param pSrc - source data in SOA form ++ /// @param pDst - output data in AOS form ++ static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// Transpose10_10_10_2 ++////////////////////////////////////////////////////////////////////////// ++struct Transpose10_10_10_2 ++{ ++ ////////////////////////////////////////////////////////////////////////// ++ /// @brief Performs an SOA to AOS conversion for packed 10_10_10_2 data. ++ /// @param pSrc - source data in SOA form ++ /// @param pDst - output data in AOS form ++ static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// Transpose11_11_10 ++////////////////////////////////////////////////////////////////////////// ++struct Transpose11_11_10 ++{ ++ ////////////////////////////////////////////////////////////////////////// ++ /// @brief Performs an SOA to AOS conversion for packed 11_11_10 data. 
++    /// @param pSrc - source data in SOA form
++    /// @param pDst - output data in AOS form
++    static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
++};
++
++// helper function to unroll loops
++template <int Begin, int End, int Step = 1>
++struct UnrollerL {
++    template <typename Lambda>
++    INLINE static void step(Lambda& func) {
++        func(Begin);
++        UnrollerL<Begin + Step, End, Step>::step(func);
++    }
++};
++
++template <int End, int Step>
++struct UnrollerL<End, End, Step> {
++    template <typename Lambda>
++    static void step(Lambda& func) {
++    }
++};
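++// Illustrative sketch (disabled; not part of the build): UnrollerL expands a loop at compile
++// time by recursing on Begin until it reaches End. Summing four elements; 'SumFirstFour' is
++// a hypothetical name.
++#if 0
++float SumFirstFour(const float (&v)[4])
++{
++    float sum = 0.0f;
++    auto body = [&](int i) { sum += v[i]; };
++    UnrollerL<0, 4>::step(body);   // expands to body(0); body(1); body(2); body(3);
++    return sum;
++}
++#endif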
++
++// general CRC compute
++INLINE
++uint32_t ComputeCRC(uint32_t crc, const void *pData, uint32_t size)
++{
++#if defined(_WIN64) || defined(__linux__) || defined(__gnu_linux__)
++    uint32_t sizeInQwords = size / sizeof(uint64_t);
++    uint32_t sizeRemainderBytes = size % sizeof(uint64_t);
++    uint64_t* pDataWords = (uint64_t*)pData;
++    for (uint32_t i = 0; i < sizeInQwords; ++i)
++    {
++        crc = (uint32_t)_mm_crc32_u64(crc, *pDataWords++);
++    }
++#else
++    uint32_t sizeInDwords = size / sizeof(uint32_t);
++    uint32_t sizeRemainderBytes = size % sizeof(uint32_t);
++    uint32_t* pDataWords = (uint32_t*)pData;
++    for (uint32_t i = 0; i < sizeInDwords; ++i)
++    {
++        crc = _mm_crc32_u32(crc, *pDataWords++);
++    }
++#endif
++
++    BYTE* pRemainderBytes = (BYTE*)pDataWords;
++    for (uint32_t i = 0; i < sizeRemainderBytes; ++i)
++    {
++        crc = _mm_crc32_u8(crc, *pRemainderBytes++);
++    }
++
++    return crc;
++}
++
++//////////////////////////////////////////////////////////////////////////
++/// Add byte offset to any-type pointer
++//////////////////////////////////////////////////////////////////////////
++template <typename T>
++INLINE
++static T* PtrAdd(T* p, intptr_t offset)
++{
++    intptr_t intp = reinterpret_cast<intptr_t>(p);
++    return reinterpret_cast<T*>(intp + offset);
++}
++
++//////////////////////////////////////////////////////////////////////////
++/// Is a power-of-2?
++//////////////////////////////////////////////////////////////////////////
++template <typename T>
++INLINE
++static bool IsPow2(T value)
++{
++    return value == (value & (0 - value));
++}
++
++//////////////////////////////////////////////////////////////////////////
++/// Align down to specified alignment
++/// Note: IsPow2(alignment) MUST be true
++//////////////////////////////////////////////////////////////////////////
++template <typename T1, typename T2>
++INLINE
++static T1 AlignDownPow2(T1 value, T2 alignment)
++{
++    SWR_ASSERT(IsPow2(alignment));
++    return value & ~T1(alignment - 1);
++}
++
++//////////////////////////////////////////////////////////////////////////
++/// Align up to specified alignment
++/// Note: IsPow2(alignment) MUST be true
++//////////////////////////////////////////////////////////////////////////
++template <typename T1, typename T2>
++INLINE
++static T1 AlignUpPow2(T1 value, T2 alignment)
++{
++    return AlignDownPow2(value + T1(alignment - 1), alignment);
++}
++
++//////////////////////////////////////////////////////////////////////////
++/// Align down to specified alignment
++//////////////////////////////////////////////////////////////////////////
++template <typename T1, typename T2>
++INLINE
++static T1 AlignDown(T1 value, T2 alignment)
++{
++    if (IsPow2(alignment)) { return AlignDownPow2(value, alignment); }
++    return value - T1(value % alignment);
++}
++
++//////////////////////////////////////////////////////////////////////////
++/// Align up to specified alignment
++//////////////////////////////////////////////////////////////////////////
++template <typename T1, typename T2>
++INLINE
++static T1 AlignUp(T1 value, T2 alignment)
++{
++    return AlignDown(value + T1(alignment - 1), alignment);
++}
++
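++// Illustrative sketch (disabled; not part of the build): worked values for the alignment
++// helpers above, using a power-of-2 alignment of 16 and a non-power-of-2 alignment of 12.
++#if 0
++#include <cassert>
++
++void AlignExamples()
++{
++    assert(IsPow2(16u));
++    assert(AlignDownPow2(37u, 16u) == 32u);  // 37 & ~15
++    assert(AlignUpPow2(37u, 16u) == 48u);    // (37 + 15) & ~15
++    assert(AlignDown(37u, 12u) == 36u);      // falls back to 37 - (37 % 12)
++    assert(AlignUp(37u, 12u) == 48u);        // AlignDown(37 + 11, 12)
++}
++#endif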
+diff --git a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
+new file mode 100644
+index 0000000..726b508
+--- /dev/null
++++ b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
+@@ -0,0 +1,292 @@
++/****************************************************************************
++* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
++*
++* Permission is hereby granted, free of charge, to any person obtaining a
++* copy of this software and associated documentation files (the "Software"),
++* to deal in the Software without restriction, including without limitation
++* the rights to use, copy, modify, merge, publish, distribute, sublicense,
++* and/or sell copies of the Software, and to permit persons to whom the
++* Software is furnished to do so, subject to the following conditions:
++*
++* The above copyright notice and this permission notice (including the next
++* paragraph) shall be included in all copies or substantial portions of the
++* Software.
++*
++* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
++* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++* IN THE SOFTWARE.
++*
++* @file JitManager.cpp
++*
++* @brief Implementation of the Jit Manager.
++*
++* Notes:
++*
++******************************************************************************/
++#if defined(_WIN32)
++#pragma warning(disable: 4800 4146 4244 4267 4355 4996)
++#endif
++
++#include "jit_api.h"
++#include "JitManager.h"
++#include "fetch_jit.h"
++
++#if defined(_WIN32)
++#include "llvm/ADT/Triple.h"
++#endif
++#include "llvm/IR/Function.h"
++#include "llvm/Support/DynamicLibrary.h"
++
++#include "llvm/Support/MemoryBuffer.h"
++#include "llvm/Support/SourceMgr.h"
++#include "llvm/IRReader/IRReader.h"
++
++#include "core/state.h"
++#include "common/containers.hpp"
++
++#include "state_llvm.h"
++
++#include <sstream>
++#if defined(_WIN32)
++#include <windows.h>
++#include <cstring>
++
++#define INTEL_OUTPUT_DIR "c:\\Intel"
++#define RASTY_OUTPUT_DIR INTEL_OUTPUT_DIR "\\Rasty"
++#define JITTER_OUTPUT_DIR RASTY_OUTPUT_DIR "\\Jitter"
++#endif
++
++using namespace llvm;
++
++//////////////////////////////////////////////////////////////////////////
++/// @brief Constructor for JitManager.
++/// @param simdWidth - SIMD width to be used in generated program.
++JitManager::JitManager(uint32_t simdWidth, const char *arch)
++    : mContext(), mBuilder(mContext), mIsModuleFinalized(true), mJitNumber(0), mVWidth(simdWidth), mArch(arch)
++{
++    InitializeNativeTarget();
++    InitializeNativeTargetAsmPrinter();
++    InitializeNativeTargetDisassembler();
++
++    TargetOptions tOpts;
++    tOpts.AllowFPOpFusion = FPOpFusion::Fast;
++    tOpts.NoInfsFPMath = false;
++    tOpts.NoNaNsFPMath = false;
++    tOpts.UnsafeFPMath = true;
++#if defined(_DEBUG)
++    tOpts.NoFramePointerElim = true;
++#endif
++
++    //tOpts.PrintMachineCode = true;
++
++    std::stringstream fnName("JitModule", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
++    fnName << mJitNumber++;
++    std::unique_ptr<Module> newModule(new Module(fnName.str(), mContext));
++    mpCurrentModule = newModule.get();
++
++    auto &&EB = EngineBuilder(std::move(newModule));
++    EB.setTargetOptions(tOpts);
++    EB.setOptLevel(CodeGenOpt::Aggressive);
++
++    StringRef hostCPUName;
++
++    // force JIT to use the same CPU arch as the rest of rasty
++    if (mArch.AVX512F())
++    {
++        assert(0 && "Implement AVX512 jitter");
++        hostCPUName = sys::getHostCPUName();
++        if (mVWidth == 0)
++        {
++            mVWidth = 16;
++        }
++    }
++    else if (mArch.AVX2())
++    {
++        hostCPUName = StringRef("core-avx2");
++        if (mVWidth == 0)
++        {
++            mVWidth = 8;
++        }
++    }
++    else if (mArch.AVX())
++    {
++        if (mArch.F16C())
++        {
++            hostCPUName = StringRef("core-avx-i");
++        }
++        else
++        {
++            hostCPUName = StringRef("corei7-avx");
++        }
++        if (mVWidth == 0)
++        {
++            mVWidth = 8;
++        }
++    }
++    else
++    {
++        hostCPUName = sys::getHostCPUName();
++        if (mVWidth == 0)
++        {
++            mVWidth = 8; // 4?
++        }
++    }
++
++    EB.setMCPU(hostCPUName);
++
++#if defined(_WIN32)
++    // Needed for MCJIT on windows
++    Triple hostTriple(sys::getProcessTriple());
++    hostTriple.setObjectFormat(Triple::ELF);
++    mpCurrentModule->setTargetTriple(hostTriple.getTriple());
++#endif // _WIN32
++
++    mpExec = EB.create();
++
++#if LLVM_USE_INTEL_JITEVENTS
++    JITEventListener *vTune = JITEventListener::createIntelJITEventListener();
++    mpExec->RegisterJITEventListener(vTune);
++#endif
++
++    mFP32Ty = Type::getFloatTy(mContext);   // float type
++    mInt8Ty = Type::getInt8Ty(mContext);
++    mInt32Ty = Type::getInt32Ty(mContext);  // int type
++    mInt64Ty = Type::getInt64Ty(mContext);  // int type
++    mV4FP32Ty = StructType::get(mContext, std::vector<Type*>(4, mFP32Ty), false);   // vector4 float type (represented as structure)
++    mV4Int32Ty = StructType::get(mContext, std::vector<Type*>(4, mInt32Ty), false); // vector4 int type
++
++    // fetch function signature
++    // typedef void(__cdecl *PFN_FETCH_FUNC)(SWR_FETCH_CONTEXT& fetchInfo, simdvertex& out);
++    std::vector<Type*> fsArgs;
++    fsArgs.push_back(PointerType::get(Gen_SWR_FETCH_CONTEXT(this), 0));
++    fsArgs.push_back(PointerType::get(Gen_simdvertex(this), 0));
++
++    mFetchShaderTy = FunctionType::get(Type::getVoidTy(mContext), fsArgs, false);
++
++    mSimtFP32Ty = VectorType::get(mFP32Ty, mVWidth);
++    mSimtInt32Ty = VectorType::get(mInt32Ty, mVWidth);
++
++    mSimdVectorTy = StructType::get(mContext, std::vector<Type*>(4, mSimtFP32Ty), false);
++    mSimdVectorInt32Ty = StructType::get(mContext, std::vector<Type*>(4, mSimtInt32Ty), false);
++
++#if defined(_WIN32)
++    // explicitly instantiate used symbols from potentially statically linked libs
++    sys::DynamicLibrary::AddSymbol("exp2f", &exp2f);
++    sys::DynamicLibrary::AddSymbol("log2f", &log2f);
++    sys::DynamicLibrary::AddSymbol("sinf", &sinf);
++    sys::DynamicLibrary::AddSymbol("cosf", &cosf);
++    sys::DynamicLibrary::AddSymbol("powf", &powf);
++#endif
++
++#if defined(_WIN32)
++    if (KNOB_DUMP_SHADER_IR)
++    {
++        CreateDirectory(INTEL_OUTPUT_DIR, NULL);
++        CreateDirectory(RASTY_OUTPUT_DIR, NULL);
++        CreateDirectory(JITTER_OUTPUT_DIR, NULL);
++    }
++#endif
++}
++        }
++    }
++
++    EB.setMCPU(hostCPUName);
++
++#if defined(_WIN32)
++    // Needed for MCJIT on windows
++    Triple hostTriple(sys::getProcessTriple());
++    hostTriple.setObjectFormat(Triple::ELF);
++    mpCurrentModule->setTargetTriple(hostTriple.getTriple());
++#endif // _WIN32
++
++    mpExec = EB.create();
++
++#if LLVM_USE_INTEL_JITEVENTS
++    JITEventListener *vTune = JITEventListener::createIntelJITEventListener();
++    mpExec->RegisterJITEventListener(vTune);
++#endif
++
++    mFP32Ty = Type::getFloatTy(mContext);  // float type
++    mInt8Ty = Type::getInt8Ty(mContext);
++    mInt32Ty = Type::getInt32Ty(mContext); // int type
++    mInt64Ty = Type::getInt64Ty(mContext); // int type
++    mV4FP32Ty = StructType::get(mContext, std::vector<Type*>(4, mFP32Ty), false);  // vector4 float type (represented as structure)
++    mV4Int32Ty = StructType::get(mContext, std::vector<Type*>(4, mInt32Ty), false); // vector4 int type
++
++    // fetch function signature
++    // typedef void(__cdecl *PFN_FETCH_FUNC)(SWR_FETCH_CONTEXT& fetchInfo, simdvertex& out);
++    std::vector<Type*> fsArgs;
++    fsArgs.push_back(PointerType::get(Gen_SWR_FETCH_CONTEXT(this), 0));
++    fsArgs.push_back(PointerType::get(Gen_simdvertex(this), 0));
++
++    mFetchShaderTy = FunctionType::get(Type::getVoidTy(mContext), fsArgs, false);
++
++    mSimtFP32Ty = VectorType::get(mFP32Ty, mVWidth);
++    mSimtInt32Ty = VectorType::get(mInt32Ty, mVWidth);
++
++    mSimdVectorTy = StructType::get(mContext, std::vector<Type*>(4, mSimtFP32Ty), false);
++    mSimdVectorInt32Ty = StructType::get(mContext, std::vector<Type*>(4, mSimtInt32Ty), false);
++
++#if defined(_WIN32)
++    // explicitly instantiate used symbols from potentially statically linked libs
++    sys::DynamicLibrary::AddSymbol("exp2f", &exp2f);
++    sys::DynamicLibrary::AddSymbol("log2f", &log2f);
++    sys::DynamicLibrary::AddSymbol("sinf", &sinf);
++    sys::DynamicLibrary::AddSymbol("cosf", &cosf);
++    sys::DynamicLibrary::AddSymbol("powf", &powf);
++#endif
++
++#if defined(_WIN32)
++    if (KNOB_DUMP_SHADER_IR)
++    {
++        CreateDirectory(INTEL_OUTPUT_DIR, NULL);
++        CreateDirectory(RASTY_OUTPUT_DIR, NULL);
++        CreateDirectory(JITTER_OUTPUT_DIR, NULL);
++    }
++#endif
++}
++
++//////////////////////////////////////////////////////////////////////////
++/// @brief Create new LLVM module.
++void JitManager::SetupNewModule()
++{
++    SWR_ASSERT(mIsModuleFinalized == true && "Current module is not finalized!");
++
++    std::stringstream fnName("JitModule", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
++    fnName << mJitNumber++;
++    std::unique_ptr<Module> newModule(new Module(fnName.str(), mContext));
++    mpCurrentModule = newModule.get();
++#if defined(_WIN32)
++    // Needed for MCJIT on windows
++    Triple hostTriple(sys::getProcessTriple());
++    hostTriple.setObjectFormat(Triple::ELF);
++    newModule->setTargetTriple(hostTriple.getTriple());
++#endif // _WIN32
++
++    mpExec->addModule(std::move(newModule));
++    mIsModuleFinalized = false;
++}
++
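++// Sketch of the intended module lifecycle (as implied by the assert above
++// and the finalization comments later in this patch; all names are defined
++// in this file):
++//
++//     SetupNewModule();                    // fresh, unfinalized module
++//     /* build IR into mpCurrentModule */
++//     mpExec->getFunctionAddress(name);    // MCJIT finalizes the module here
++//     mIsModuleFinalized = true;           // no new IR until SetupNewModule()
++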
++//////////////////////////////////////////////////////////////////////////
++/// @brief Create new LLVM module from IR.
++bool JitManager::SetupModuleFromIR(const uint8_t *pIR)
++{
++    std::unique_ptr<MemoryBuffer> pMem = MemoryBuffer::getMemBuffer(StringRef((const char*)pIR), "");
++
++    SMDiagnostic Err;
++    std::unique_ptr<Module> newModule = parseIR(pMem.get()->getMemBufferRef(), Err, mContext);
++
++    if (newModule == nullptr)
++    {
++        SWR_ASSERT(0, "Parse failed! Check Err for details.");
++        return false;
++    }
++
++    mpCurrentModule = newModule.get();
++#if defined(_WIN32)
++    // Needed for MCJIT on windows
++    Triple hostTriple(sys::getProcessTriple());
++    hostTriple.setObjectFormat(Triple::ELF);
++    newModule->setTargetTriple(hostTriple.getTriple());
++#endif // _WIN32
++
++    mpExec->addModule(std::move(newModule));
++    mIsModuleFinalized = false;
++
++    return true;
++}
++
++//////////////////////////////////////////////////////////////////////////
++/// @brief Dump function to file.
++void JitManager::DumpToFile(Function *f, const char *fileName)
++{
++    if (KNOB_DUMP_SHADER_IR)
++    {
++#if defined(_WIN32)
++        DWORD pid = GetCurrentProcessId();
++        TCHAR procname[MAX_PATH];
++        GetModuleFileName(NULL, procname, MAX_PATH);
++        const char* pBaseName = strrchr(procname, '\\');
++        std::stringstream outDir;
++        outDir << JITTER_OUTPUT_DIR << pBaseName << "_" << pid << std::ends;
++        CreateDirectory(outDir.str().c_str(), NULL);
++#endif
++
++        std::error_code EC;
++        const char *funcName = f->getName().data();
++        char fName[256];
++#if defined(_WIN32)
++        sprintf(fName, "%s\\%s.%s.ll", outDir.str().c_str(), funcName, fileName);
++#else
++        sprintf(fName, "%s.%s.ll", funcName, fileName);
++#endif
++        raw_fd_ostream fd(fName, EC, llvm::sys::fs::F_None);
++        Module* pModule = f->getParent();
++        pModule->print(fd, nullptr);
++        fd.flush();
++    }
++}
++
++extern "C"
++{
++    //////////////////////////////////////////////////////////////////////////
++    /// @brief Create JIT context.
++    /// @param simdWidth - SIMD width to be used in generated program.
++    HANDLE JITCALL JitCreateContext(uint32_t targetSimdWidth, const char* arch)
++    {
++        return new JitManager(targetSimdWidth, arch);
++    }
++
++    //////////////////////////////////////////////////////////////////////////
++    /// @brief Destroy JIT context.
++    void JITCALL JitDestroyContext(HANDLE hJitContext)
++    {
++        delete reinterpret_cast<JitManager*>(hJitContext);
++    }
++}
+diff --git a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h
+new file mode 100644
+index 0000000..e0e8ec4
+--- /dev/null
++++ b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h
+@@ -0,0 +1,182 @@
++/****************************************************************************
++* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
++*
++* Permission is hereby granted, free of charge, to any person obtaining a
++* copy of this software and associated documentation files (the "Software"),
++* to deal in the Software without restriction, including without limitation
++* the rights to use, copy, modify, merge, publish, distribute, sublicense,
++* and/or sell copies of the Software, and to permit persons to whom the
++* Software is furnished to do so, subject to the following conditions:
++*
++* The above copyright notice and this permission notice (including the next
++* paragraph) shall be included in all copies or substantial portions of the
++* Software.
++*
++* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
++* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++* IN THE SOFTWARE.
++* ++* @file JitManager.h ++* ++* @brief JitManager contains the LLVM data structures used for JIT generation ++* ++* Notes: ++* ++******************************************************************************/ ++#pragma once ++ ++#include "common/os.h" ++#include "common/isa.hpp" ++ ++#if defined(_WIN32) ++#pragma warning(disable : 4146 4244 4267 4800 4996) ++#endif ++ ++#include "llvm/IR/DataLayout.h" ++#include "llvm/IR/Instructions.h" ++#include "llvm/IR/LLVMContext.h" ++#include "llvm/IR/Module.h" ++#include "llvm/IR/Type.h" ++#include "llvm/IR/IRBuilder.h" ++#include "llvm/IR/IntrinsicInst.h" ++ ++#include "llvm/Config/llvm-config.h" ++#ifndef LLVM_VERSION_MAJOR ++#include "llvm/Config/config.h" ++#endif ++ ++#include "llvm/IR/Verifier.h" ++#include "llvm/ExecutionEngine/MCJIT.h" ++#include "llvm/Support/FileSystem.h" ++#define LLVM_F_NONE sys::fs::F_None ++ ++#include "llvm/Analysis/Passes.h" ++#include "llvm/PassManager.h" ++#include "llvm/CodeGen/Passes.h" ++#include "llvm/ExecutionEngine/ExecutionEngine.h" ++#include "llvm/Support/raw_ostream.h" ++#include "llvm/Support/TargetSelect.h" ++#include "llvm/Transforms/IPO.h" ++#include "llvm/Transforms/Scalar.h" ++#include "llvm/Support/Host.h" ++ ++ ++using namespace llvm; ++////////////////////////////////////////////////////////////////////////// ++/// JitInstructionSet ++/// @brief Subclass of InstructionSet that allows users to override ++/// the reporting of support for certain ISA features. This allows capping ++/// the jitted code to a certain feature level, e.g. jit AVX level code on ++/// a platform that supports AVX2. ++////////////////////////////////////////////////////////////////////////// ++class JitInstructionSet : public InstructionSet ++{ ++public: ++ JitInstructionSet(const char* requestedIsa) : isaRequest(requestedIsa) ++ { ++ if (isaRequest == "") ++ { ++ // Check for an environment variable ++ const char* pIsaEnv = getenv("RASTY_KNOB_ARCH_STR"); ++ if (pIsaEnv) ++ { ++ isaRequest = pIsaEnv; ++ } ++ } ++ std::transform(isaRequest.begin(), isaRequest.end(), isaRequest.begin(), ::tolower); ++ ++ if(isaRequest == "avx") ++ { ++ bForceAVX = true; ++ bForceAVX2 = false; ++ bForceAVX512 = false; ++ } ++ else if(isaRequest == "avx2") ++ { ++ bForceAVX = false; ++ bForceAVX2 = true; ++ bForceAVX512 = false; ++ } ++ #if 0 ++ else if(isaRequest == "avx512") ++ { ++ bForceAVX = false; ++ bForceAVX2 = false; ++ bForceAVX512 = true; ++ } ++ #endif ++ }; ++ ++ bool AVX2(void) { return bForceAVX ? 0 : InstructionSet::AVX2(); } ++ bool AVX512F(void) { return (bForceAVX | bForceAVX2) ? 0 : InstructionSet::AVX512F(); } ++ bool BMI2(void) { return bForceAVX ? 0 : InstructionSet::BMI2(); } ++ ++private: ++ bool bForceAVX = false; ++ bool bForceAVX2 = false; ++ bool bForceAVX512 = false; ++ std::string isaRequest; ++}; ++ ++ ++ ++struct JitLLVMContext : LLVMContext ++{ ++}; ++ ++ ++////////////////////////////////////////////////////////////////////////// ++/// JitManager ++////////////////////////////////////////////////////////////////////////// ++struct JitManager ++{ ++ JitManager(uint32_t w, const char *arch); ++ ~JitManager(){}; ++ ++ JitLLVMContext mContext; ///< LLVM compiler ++ IRBuilder<> mBuilder; ///< LLVM IR Builder ++ ExecutionEngine* mpExec; ++ ++ // Need to be rebuilt after a JIT and before building new IR ++ Module* mpCurrentModule; ++ bool mIsModuleFinalized; ++ uint32_t mJitNumber; ++ ++ uint32_t mVWidth; ++ ++ // Built in types. 
++ Type* mInt8Ty; ++ Type* mInt32Ty; ++ Type* mInt64Ty; ++ Type* mFP32Ty; ++ StructType* mV4FP32Ty; ++ StructType* mV4Int32Ty; ++ ++ // helper scalar function types ++ FunctionType* mUnaryFPTy; ++ FunctionType* mBinaryFPTy; ++ FunctionType* mTrinaryFPTy; ++ FunctionType* mUnaryIntTy; ++ FunctionType* mBinaryIntTy; ++ FunctionType* mTrinaryIntTy; ++ ++ Type* mSimtFP32Ty; ++ Type* mSimtInt32Ty; ++ ++ Type* mSimdVectorInt32Ty; ++ Type* mSimdVectorTy; ++ ++ // fetch shader types ++ FunctionType* mFetchShaderTy; ++ ++ JitInstructionSet mArch; ++ ++ void SetupNewModule(); ++ bool SetupModuleFromIR(const uint8_t *pIR); ++ ++ static void DumpToFile(Function *f, const char *fileName); ++}; +diff --git a/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp +new file mode 100644 +index 0000000..5e8e5f4 +--- /dev/null ++++ b/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp +@@ -0,0 +1,473 @@ ++/**************************************************************************** ++* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. ++* ++* Permission is hereby granted, free of charge, to any person obtaining a ++* copy of this software and associated documentation files (the "Software"), ++* to deal in the Software without restriction, including without limitation ++* the rights to use, copy, modify, merge, publish, distribute, sublicense, ++* and/or sell copies of the Software, and to permit persons to whom the ++* Software is furnished to do so, subject to the following conditions: ++* ++* The above copyright notice and this permission notice (including the next ++* paragraph) shall be included in all copies or substantial portions of the ++* Software. ++* ++* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++* IN THE SOFTWARE. 
++*
++* @file blend_jit.cpp
++*
++* @brief Implementation of the blend jitter
++*
++* Notes:
++*
++******************************************************************************/
++#include "jit_api.h"
++#include "blend_jit.h"
++#include "builder.h"
++#include "state_llvm.h"
++#include "common/containers.hpp"
++#include "llvm/IR/DataLayout.h"
++
++#include <sstream>
++
++// components with bit-widths <= the QUANTIZE_THRESHOLD will be quantized
++#define QUANTIZE_THRESHOLD 2
++
++//////////////////////////////////////////////////////////////////////////
++/// Interface to Jitting a blend shader
++//////////////////////////////////////////////////////////////////////////
++struct BlendJit : public Builder
++{
++    BlendJit(JitManager* pJitMgr) : Builder(pJitMgr){};
++
++    template <bool Color, bool Alpha>
++    void GenerateBlendFactor(SWR_BLEND_FACTOR factor, Value* constColor[4], Value* src[4], Value* src1[4], Value* dst[4], Value* result[4])
++    {
++        Value* out[4];
++
++        switch (factor)
++        {
++        case BLENDFACTOR_ONE:
++            out[0] = out[1] = out[2] = out[3] = VIMMED1(1.0f);
++            break;
++        case BLENDFACTOR_SRC_COLOR:
++            out[0] = src[0];
++            out[1] = src[1];
++            out[2] = src[2];
++            out[3] = src[3];
++            break;
++        case BLENDFACTOR_SRC_ALPHA:
++            out[0] = out[1] = out[2] = out[3] = src[3];
++            break;
++        case BLENDFACTOR_DST_ALPHA:
++            out[0] = out[1] = out[2] = out[3] = dst[3];
++            break;
++        case BLENDFACTOR_DST_COLOR:
++            out[0] = dst[0];
++            out[1] = dst[1];
++            out[2] = dst[2];
++            out[3] = dst[3];
++            break;
++        case BLENDFACTOR_SRC_ALPHA_SATURATE:
++            out[0] = out[1] = out[2] = VMINPS(src[3], FSUB(VIMMED1(1.0f), dst[3]));
++            out[3] = VIMMED1(1.0f);
++            break;
++        case BLENDFACTOR_CONST_COLOR:
++            out[0] = constColor[0];
++            out[1] = constColor[1];
++            out[2] = constColor[2];
++            out[3] = constColor[3];
++            break;
++        case BLENDFACTOR_CONST_ALPHA:
++            out[0] = out[1] = out[2] = out[3] = constColor[3];
++            break;
++        case BLENDFACTOR_SRC1_COLOR:
++            out[0] = src1[0];
++            out[1] = src1[1];
++            out[2] = src1[2];
++            out[3] = src1[3];
++            break;
++        case BLENDFACTOR_SRC1_ALPHA:
++            out[0] = out[1] = out[2] = out[3] = src1[3];
++            break;
++        case BLENDFACTOR_ZERO:
++            out[0] = out[1] = out[2] = out[3] = VIMMED1(0.0f);
++            break;
++        case BLENDFACTOR_INV_SRC_COLOR:
++            out[0] = FSUB(VIMMED1(1.0f), src[0]);
++            out[1] = FSUB(VIMMED1(1.0f), src[1]);
++            out[2] = FSUB(VIMMED1(1.0f), src[2]);
++            out[3] = FSUB(VIMMED1(1.0f), src[3]);
++            break;
++        case BLENDFACTOR_INV_SRC_ALPHA:
++            out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), src[3]);
++            break;
++        case BLENDFACTOR_INV_DST_ALPHA:
++            out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), dst[3]);
++            break;
++        case BLENDFACTOR_INV_DST_COLOR:
++            out[0] = FSUB(VIMMED1(1.0f), dst[0]);
++            out[1] = FSUB(VIMMED1(1.0f), dst[1]);
++            out[2] = FSUB(VIMMED1(1.0f), dst[2]);
++            out[3] = FSUB(VIMMED1(1.0f), dst[3]);
++            break;
++        case BLENDFACTOR_INV_CONST_COLOR:
++            out[0] = FSUB(VIMMED1(1.0f), constColor[0]);
++            out[1] = FSUB(VIMMED1(1.0f), constColor[1]);
++            out[2] = FSUB(VIMMED1(1.0f), constColor[2]);
++            out[3] = FSUB(VIMMED1(1.0f), constColor[3]);
++            break;
++        case BLENDFACTOR_INV_CONST_ALPHA:
++            out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), constColor[3]);
++            break;
++        case BLENDFACTOR_INV_SRC1_COLOR:
++            out[0] = FSUB(VIMMED1(1.0f), src1[0]);
++            out[1] = FSUB(VIMMED1(1.0f), src1[1]);
++            out[2] = FSUB(VIMMED1(1.0f), src1[2]);
++            out[3] = FSUB(VIMMED1(1.0f), src1[3]);
++            break;
++        case BLENDFACTOR_INV_SRC1_ALPHA:
++            out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), src1[3]);
++            break;
++        default:
++            SWR_ASSERT(false, "Unsupported blend factor: %d", factor);
++            out[0] = out[1] = out[2] = out[3] = VIMMED1(0.0f);
++            break;
++        }
++
++        if (Color)
++        {
++            result[0] = out[0];
++            result[1] = out[1];
++            result[2] = out[2];
++        }
++
++        if (Alpha)
++        {
++            result[3] = out[3];
++        }
++    }
++
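++    // For reference: the factors produced above feed the standard blend
++    // equation implemented by BlendFunc() below,
++    //
++    //     result = src * srcFactor  op  dst * dstFactor
++    //
++    // e.g. classic alpha blending is srcFactor = SRC_ALPHA,
++    // dstFactor = INV_SRC_ALPHA, op = ADD:  result = src*a + dst*(1 - a).
++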
"Unsupported blend factor: %d", factor); ++ out[0] = out[1] = out[2] = out[3] = VIMMED1(0.0f); ++ break; ++ } ++ ++ if (Color) ++ { ++ result[0] = out[0]; ++ result[1] = out[1]; ++ result[2] = out[2]; ++ } ++ ++ if (Alpha) ++ { ++ result[3] = out[3]; ++ } ++ } ++ ++ void Clamp(SWR_FORMAT format, Value* src[4]) ++ { ++ const SWR_FORMAT_INFO& info = GetFormatInfo(format); ++ SWR_TYPE type = info.type[0]; ++ ++ switch (type) ++ { ++ case SWR_TYPE_FLOAT: ++ break; ++ ++ case SWR_TYPE_UNORM: ++ src[0] = VMINPS(VMAXPS(src[0], VIMMED1(0.0f)), VIMMED1(1.0f)); ++ src[1] = VMINPS(VMAXPS(src[1], VIMMED1(0.0f)), VIMMED1(1.0f)); ++ src[2] = VMINPS(VMAXPS(src[2], VIMMED1(0.0f)), VIMMED1(1.0f)); ++ src[3] = VMINPS(VMAXPS(src[3], VIMMED1(0.0f)), VIMMED1(1.0f)); ++ break; ++ ++ case SWR_TYPE_SNORM: ++ src[0] = VMINPS(VMAXPS(src[0], VIMMED1(-1.0f)), VIMMED1(1.0f)); ++ src[1] = VMINPS(VMAXPS(src[1], VIMMED1(-1.0f)), VIMMED1(1.0f)); ++ src[2] = VMINPS(VMAXPS(src[2], VIMMED1(-1.0f)), VIMMED1(1.0f)); ++ src[3] = VMINPS(VMAXPS(src[3], VIMMED1(-1.0f)), VIMMED1(1.0f)); ++ break; ++ ++ default: SWR_ASSERT(false, "Unsupport format type: %d", type); ++ } ++ } ++ ++ void ApplyDefaults(SWR_FORMAT format, Value* src[4]) ++ { ++ const SWR_FORMAT_INFO& info = GetFormatInfo(format); ++ ++ bool valid[] = { false, false, false, false }; ++ for (uint32_t c = 0; c < info.numComps; ++c) ++ { ++ valid[info.swizzle[c]] = true; ++ } ++ ++ for (uint32_t c = 0; c < 4; ++c) ++ { ++ if (!valid[c]) ++ { ++ src[c] = BITCAST(VIMMED1((int)info.defaults[c]), mSimdFP32Ty); ++ } ++ } ++ } ++ ++ void ApplyUnusedDefaults(SWR_FORMAT format, Value* src[4]) ++ { ++ const SWR_FORMAT_INFO& info = GetFormatInfo(format); ++ ++ for (uint32_t c = 0; c < info.numComps; ++c) ++ { ++ if (info.type[c] == SWR_TYPE_UNUSED) ++ { ++ src[info.swizzle[c]] = BITCAST(VIMMED1((int)info.defaults[info.swizzle[c]]), mSimdFP32Ty); ++ } ++ } ++ } ++ ++ void Quantize(SWR_FORMAT format, Value* src[4]) ++ { ++ const SWR_FORMAT_INFO& info = GetFormatInfo(format); ++ for (uint32_t c = 0; c < info.numComps; ++c) ++ { ++ if (info.bpc[c] <= QUANTIZE_THRESHOLD) ++ { ++ uint32_t swizComp = info.swizzle[c]; ++ float factor = (float)((1 << info.bpc[c]) - 1); ++ switch (info.type[c]) ++ { ++ case SWR_TYPE_UNORM: ++ src[swizComp] = FADD(FMUL(src[swizComp], VIMMED1(factor)), VIMMED1(0.5f)); ++ src[swizComp] = VROUND(src[swizComp], C(_MM_FROUND_TO_ZERO)); ++ src[swizComp] = FMUL(src[swizComp], VIMMED1(1.0f /factor)); ++ break; ++ default: SWR_ASSERT(false, "Unsupported format type: %d", info.type[c]); ++ } ++ } ++ } ++ } ++ ++ template ++ void BlendFunc(SWR_BLEND_OP blendOp, Value* src[4], Value* srcFactor[4], Value* dst[4], Value* dstFactor[4], Value* result[4]) ++ { ++ Value* out[4]; ++ Value* srcBlend[4]; ++ Value* dstBlend[4]; ++ for (uint32_t i = 0; i < 4; ++i) ++ { ++ srcBlend[i] = FMUL(src[i], srcFactor[i]); ++ dstBlend[i] = FMUL(dst[i], dstFactor[i]); ++ } ++ ++ switch (blendOp) ++ { ++ case BLENDOP_ADD: ++ out[0] = FADD(srcBlend[0], dstBlend[0]); ++ out[1] = FADD(srcBlend[1], dstBlend[1]); ++ out[2] = FADD(srcBlend[2], dstBlend[2]); ++ out[3] = FADD(srcBlend[3], dstBlend[3]); ++ break; ++ ++ case BLENDOP_SUBTRACT: ++ out[0] = FSUB(srcBlend[0], dstBlend[0]); ++ out[1] = FSUB(srcBlend[1], dstBlend[1]); ++ out[2] = FSUB(srcBlend[2], dstBlend[2]); ++ out[3] = FSUB(srcBlend[3], dstBlend[3]); ++ break; ++ ++ case BLENDOP_REVSUBTRACT: ++ out[0] = FSUB(dstBlend[0], srcBlend[0]); ++ out[1] = FSUB(dstBlend[1], srcBlend[1]); ++ out[2] = FSUB(dstBlend[2], srcBlend[2]); ++ out[3] = 
++    template <bool Color, bool Alpha>
++    void BlendFunc(SWR_BLEND_OP blendOp, Value* src[4], Value* srcFactor[4], Value* dst[4], Value* dstFactor[4], Value* result[4])
++    {
++        Value* out[4];
++        Value* srcBlend[4];
++        Value* dstBlend[4];
++        for (uint32_t i = 0; i < 4; ++i)
++        {
++            srcBlend[i] = FMUL(src[i], srcFactor[i]);
++            dstBlend[i] = FMUL(dst[i], dstFactor[i]);
++        }
++
++        switch (blendOp)
++        {
++        case BLENDOP_ADD:
++            out[0] = FADD(srcBlend[0], dstBlend[0]);
++            out[1] = FADD(srcBlend[1], dstBlend[1]);
++            out[2] = FADD(srcBlend[2], dstBlend[2]);
++            out[3] = FADD(srcBlend[3], dstBlend[3]);
++            break;
++
++        case BLENDOP_SUBTRACT:
++            out[0] = FSUB(srcBlend[0], dstBlend[0]);
++            out[1] = FSUB(srcBlend[1], dstBlend[1]);
++            out[2] = FSUB(srcBlend[2], dstBlend[2]);
++            out[3] = FSUB(srcBlend[3], dstBlend[3]);
++            break;
++
++        case BLENDOP_REVSUBTRACT:
++            out[0] = FSUB(dstBlend[0], srcBlend[0]);
++            out[1] = FSUB(dstBlend[1], srcBlend[1]);
++            out[2] = FSUB(dstBlend[2], srcBlend[2]);
++            out[3] = FSUB(dstBlend[3], srcBlend[3]);
++            break;
++
++        case BLENDOP_MIN:
++            out[0] = VMINPS(src[0], dst[0]);
++            out[1] = VMINPS(src[1], dst[1]);
++            out[2] = VMINPS(src[2], dst[2]);
++            out[3] = VMINPS(src[3], dst[3]);
++            break;
++
++        case BLENDOP_MAX:
++            out[0] = VMAXPS(src[0], dst[0]);
++            out[1] = VMAXPS(src[1], dst[1]);
++            out[2] = VMAXPS(src[2], dst[2]);
++            out[3] = VMAXPS(src[3], dst[3]);
++            break;
++
++        default:
++            SWR_ASSERT(false, "Unsupported blend operation: %d", blendOp);
++            out[0] = out[1] = out[2] = out[3] = VIMMED1(0.0f);
++            break;
++        }
++
++        if (Color)
++        {
++            result[0] = out[0];
++            result[1] = out[1];
++            result[2] = out[2];
++        }
++
++        if (Alpha)
++        {
++            result[3] = out[3];
++        }
++    }
++
++    Function* Create(const BLEND_COMPILE_STATE& state)
++    {
++        static std::size_t jitNum = 0;
++
++        std::stringstream fnName("BlendShader", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
++        fnName << jitNum++;
++
++        // blend function signature
++        // typedef void(*PFN_BLEND_JIT_FUNC)(SWR_BLEND_STATE*, simdvector&, simdvector&, uint8_t*, simdvector&);
++
++        std::vector<Type*> args{
++            PointerType::get(Gen_SWR_BLEND_STATE(JM()), 0), // SWR_BLEND_STATE*
++            PointerType::get(mSimdFP32Ty, 0),               // simdvector& src
++            PointerType::get(mSimdFP32Ty, 0),               // simdvector& src1
++            PointerType::get(mSimdFP32Ty, 0),               // uint8_t* pDst
++            PointerType::get(mSimdFP32Ty, 0),               // simdvector& result
++        };
++
++        FunctionType* fTy = FunctionType::get(IRB()->getVoidTy(), args, false);
++        Function* blendFunc = Function::Create(fTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
++
++        BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", blendFunc);
++
++        IRB()->SetInsertPoint(entry);
++
++        // arguments
++        auto argitr = blendFunc->getArgumentList().begin();
++        Value* pBlendState = argitr++;
++        pBlendState->setName("pBlendState");
++        Value* pSrc = argitr++;
++        pSrc->setName("src");
++        Value* pSrc1 = argitr++;
++        pSrc1->setName("src1");
++        Value* pDst = argitr++;
++        pDst->setName("pDst");
++        Value* pResult = argitr++;
++        pResult->setName("result");
++
++        static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, "Unsupported hot tile format");
++        Value* dst[4];
++        Value* constantColor[4];
++        Value* src[4];
++        Value* src1[4];
++        Value* result[4];
++        for (uint32_t i = 0; i < 4; ++i)
++        {
++            // load hot tile
++            dst[i] = LOAD(pDst, { i });
++
++            // load constant color
++            constantColor[i] = VBROADCAST(LOAD(pBlendState, { 0, SWR_BLEND_STATE_constantColor, i }));
++
++            // load src
++            src[i] = LOAD(pSrc, { i });
++
++            // load src1
++            src1[i] = LOAD(pSrc1, { i });
++        }
++
++        // clamp sources
++        Clamp(state.format, src);
++        Clamp(state.format, src1);
++        Clamp(state.format, dst);
++        Clamp(state.format, constantColor);
++
++        // apply defaults to hottile contents to take into account missing components
++        ApplyDefaults(state.format, dst);
++
++        // Force defaults for unused 'X' components
++        ApplyUnusedDefaults(state.format, dst);
++
++        // Quantize low precision components
++        Quantize(state.format, dst);
++
++        // special case clamping for R11G11B10_float which has no sign bit
++        if (state.format == R11G11B10_FLOAT)
++        {
++            dst[0] = VMAXPS(dst[0], VIMMED1(0.0f));
++            dst[1] = VMAXPS(dst[1], VIMMED1(0.0f));
++            dst[2] = VMAXPS(dst[2], VIMMED1(0.0f));
++            dst[3] = VMAXPS(dst[3], VIMMED1(0.0f));
++        }
++
++        Value* srcFactor[4];
++        Value* dstFactor[4];
++        if (state.independentAlphaBlendEnable)
++        {
++            GenerateBlendFactor<true, false>((SWR_BLEND_FACTOR)state.blendState.sourceBlendFactor, constantColor, src, src1, dst, srcFactor);
++            GenerateBlendFactor<false, true>((SWR_BLEND_FACTOR)state.blendState.sourceAlphaBlendFactor, constantColor, src, src1, dst, srcFactor);
++
++            GenerateBlendFactor<true, false>((SWR_BLEND_FACTOR)state.blendState.destBlendFactor, constantColor, src, src1, dst, dstFactor);
++            GenerateBlendFactor<false, true>((SWR_BLEND_FACTOR)state.blendState.destAlphaBlendFactor, constantColor, src, src1, dst, dstFactor);
++
++            BlendFunc<true, false>((SWR_BLEND_OP)state.blendState.colorBlendFunc, src, srcFactor, dst, dstFactor, result);
++            BlendFunc<false, true>((SWR_BLEND_OP)state.blendState.alphaBlendFunc, src, srcFactor, dst, dstFactor, result);
++        }
++        else
++        {
++            GenerateBlendFactor<true, true>((SWR_BLEND_FACTOR)state.blendState.sourceBlendFactor, constantColor, src, src1, dst, srcFactor);
++            GenerateBlendFactor<true, true>((SWR_BLEND_FACTOR)state.blendState.destBlendFactor, constantColor, src, src1, dst, dstFactor);
++
++            BlendFunc<true, true>((SWR_BLEND_OP)state.blendState.colorBlendFunc, src, srcFactor, dst, dstFactor, result);
++        }
++
++        // store results out
++        for (uint32_t i = 0; i < 4; ++i)
++        {
++            STORE(result[i], pResult, { i });
++        }
++
++        RET_VOID();
++
++        JitManager::DumpToFile(blendFunc, "");
++
++        FunctionPassManager passes(JM()->mpCurrentModule);
++        passes.add(createBreakCriticalEdgesPass());
++        passes.add(createCFGSimplificationPass());
++        passes.add(createEarlyCSEPass());
++        passes.add(createPromoteMemoryToRegisterPass());
++        passes.add(createCFGSimplificationPass());
++        passes.add(createEarlyCSEPass());
++        passes.add(createInstructionCombiningPass());
++        passes.add(createInstructionSimplifierPass());
++        passes.add(createConstantPropagationPass());
++        passes.add(createSCCPPass());
++        passes.add(createAggressiveDCEPass());
++
++        passes.run(*blendFunc);
++
++        JitManager::DumpToFile(blendFunc, "optimized");
++
++        return blendFunc;
++    }
++};
++
++//////////////////////////////////////////////////////////////////////////
++/// @brief JITs the blend shader IR and returns a callable pointer
++/// @param hJitMgr - JitManager handle
++/// @param hFunc - LLVM function IR
++/// @return PFN_BLEND_JIT_FUNC - pointer to blend function
++PFN_BLEND_JIT_FUNC JitBlendFunc(HANDLE hJitMgr, const HANDLE hFunc)
++{
++    const llvm::Function *func = (const llvm::Function*)hFunc;
++    JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
++    PFN_BLEND_JIT_FUNC pfnBlend;
++    pfnBlend = (PFN_BLEND_JIT_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));
++    // MCJIT finalizes modules the first time you JIT code from them. After finalized, you cannot add new IR to the module
++    pJitMgr->mIsModuleFinalized = true;
++
++    return pfnBlend;
++}
++
++//////////////////////////////////////////////////////////////////////////
++/// @brief JIT compiles blend shader
++/// @param hJitMgr - JitManager handle
++/// @param state - blend state to build function from
++extern "C" PFN_BLEND_JIT_FUNC JITCALL JitCompileBlend(HANDLE hJitMgr, const BLEND_COMPILE_STATE& state)
++{
++    JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
++
++    pJitMgr->SetupNewModule();
++
++    BlendJit theJit(pJitMgr);
++    HANDLE hFunc = theJit.Create(state);
++
++    return JitBlendFunc(hJitMgr, hFunc);
++}
+diff --git a/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.h b/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.h
+new file mode 100644
+index 0000000..80c4c03
+--- /dev/null
++++ b/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.h
+@@ -0,0 +1,49 @@
++/****************************************************************************
++* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
++* ++* Permission is hereby granted, free of charge, to any person obtaining a ++* copy of this software and associated documentation files (the "Software"), ++* to deal in the Software without restriction, including without limitation ++* the rights to use, copy, modify, merge, publish, distribute, sublicense, ++* and/or sell copies of the Software, and to permit persons to whom the ++* Software is furnished to do so, subject to the following conditions: ++* ++* The above copyright notice and this permission notice (including the next ++* paragraph) shall be included in all copies or substantial portions of the ++* Software. ++* ++* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++* IN THE SOFTWARE. ++* ++* @file blend_jit.h ++* ++* @brief Definition of the blend jitter ++* ++* Notes: ++* ++******************************************************************************/ ++#pragma once ++ ++#include "common/formats.h" ++#include "core/context.h" ++#include "core/state.h" ++ ++////////////////////////////////////////////////////////////////////////// ++/// State required for blend jit ++////////////////////////////////////////////////////////////////////////// ++struct BLEND_COMPILE_STATE ++{ ++ SWR_FORMAT format; // format of render target being blended ++ bool independentAlphaBlendEnable; ++ SWR_RENDER_TARGET_BLEND_STATE blendState; ++ ++ bool operator==(const BLEND_COMPILE_STATE& other) const ++ { ++ return memcmp(this, &other, sizeof(BLEND_COMPILE_STATE)) == 0; ++ } ++}; +diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp +new file mode 100644 +index 0000000..b971791 +--- /dev/null ++++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp +@@ -0,0 +1,56 @@ ++/**************************************************************************** ++* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. ++* ++* Permission is hereby granted, free of charge, to any person obtaining a ++* copy of this software and associated documentation files (the "Software"), ++* to deal in the Software without restriction, including without limitation ++* the rights to use, copy, modify, merge, publish, distribute, sublicense, ++* and/or sell copies of the Software, and to permit persons to whom the ++* Software is furnished to do so, subject to the following conditions: ++* ++* The above copyright notice and this permission notice (including the next ++* paragraph) shall be included in all copies or substantial portions of the ++* Software. ++* ++* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++* IN THE SOFTWARE. 
++*
++* @file builder.cpp
++*
++* @brief Implementation of the builder
++*
++* Notes:
++*
++******************************************************************************/
++
++#include "builder.h"
++
++using namespace llvm;
++
++//////////////////////////////////////////////////////////////////////////
++/// @brief Constructor for Builder.
++/// @param pJitMgr - JitManager which contains modules, function passes, etc.
++Builder::Builder(JitManager *pJitMgr)
++    : mpJitMgr(pJitMgr)
++{
++    mpIRBuilder = &pJitMgr->mBuilder;
++
++    mFP16Ty = Type::getHalfTy(pJitMgr->mContext);
++    mFP32Ty = Type::getFloatTy(pJitMgr->mContext);
++    mInt8Ty = Type::getInt8Ty(pJitMgr->mContext);
++    mInt16Ty = Type::getInt16Ty(pJitMgr->mContext);
++    mInt32Ty = Type::getInt32Ty(pJitMgr->mContext);
++    mInt64Ty = Type::getInt64Ty(pJitMgr->mContext);
++    mV4FP32Ty = StructType::get(pJitMgr->mContext, std::vector<Type*>(4, mFP32Ty), false);  // vector4 float type (represented as structure)
++    mV4Int32Ty = StructType::get(pJitMgr->mContext, std::vector<Type*>(4, mInt32Ty), false); // vector4 int type
++    mSimdInt16Ty = VectorType::get(mInt16Ty, mpJitMgr->mVWidth);
++    mSimdInt32Ty = VectorType::get(mInt32Ty, mpJitMgr->mVWidth);
++    mSimdInt64Ty = VectorType::get(mInt64Ty, mpJitMgr->mVWidth);
++    mSimdFP16Ty = VectorType::get(mFP16Ty, mpJitMgr->mVWidth);
++    mSimdFP32Ty = VectorType::get(mFP32Ty, mpJitMgr->mVWidth);
++}
+diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.h b/src/gallium/drivers/swr/rasterizer/jitter/builder.h
+new file mode 100644
+index 0000000..1342f28
+--- /dev/null
++++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.h
+@@ -0,0 +1,66 @@
++/****************************************************************************
++* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
++*
++* Permission is hereby granted, free of charge, to any person obtaining a
++* copy of this software and associated documentation files (the "Software"),
++* to deal in the Software without restriction, including without limitation
++* the rights to use, copy, modify, merge, publish, distribute, sublicense,
++* and/or sell copies of the Software, and to permit persons to whom the
++* Software is furnished to do so, subject to the following conditions:
++*
++* The above copyright notice and this permission notice (including the next
++* paragraph) shall be included in all copies or substantial portions of the
++* Software.
++*
++* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
++* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++* IN THE SOFTWARE.
++*
++* @file builder.h
++*
++* @brief Includes all the builder related functionality
++*
++* Notes:
++*
++******************************************************************************/
++#pragma once
++
++#include "JitManager.h"
++#include "common/formats.h"
++
++using namespace llvm;
++
++struct Builder
++{
++    Builder(JitManager *pJitMgr);
++    IRBuilder<>* IRB() { return mpIRBuilder; };
++    JitManager* JM() { return mpJitMgr; }
++
++    JitManager* mpJitMgr;
++    IRBuilder<>* mpIRBuilder;
++
++    // Built in types.
++ Type* mInt8Ty; ++ Type* mInt16Ty; ++ Type* mInt32Ty; ++ Type* mInt64Ty; ++ Type* mFP16Ty; ++ Type* mFP32Ty; ++ Type* mSimdFP16Ty; ++ Type* mSimdFP32Ty; ++ Type* mSimdInt16Ty; ++ Type* mSimdInt32Ty; ++ Type* mSimdInt64Ty; ++ StructType* mV4FP32Ty; ++ StructType* mV4Int32Ty; ++ ++#include "builder_gen.h" ++#include "builder_x86.h" ++#include "builder_misc.h" ++#include "builder_math.h" ++ ++}; +diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_gen.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_gen.cpp +new file mode 100644 +index 0000000..7b5ef20 +--- /dev/null ++++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_gen.cpp +@@ -0,0 +1,1052 @@ ++/**************************************************************************** ++* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. ++* ++* Permission is hereby granted, free of charge, to any person obtaining a ++* copy of this software and associated documentation files (the "Software"), ++* to deal in the Software without restriction, including without limitation ++* the rights to use, copy, modify, merge, publish, distribute, sublicense, ++* and/or sell copies of the Software, and to permit persons to whom the ++* Software is furnished to do so, subject to the following conditions: ++* ++* The above copyright notice and this permission notice (including the next ++* paragraph) shall be included in all copies or substantial portions of the ++* Software. ++* ++* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++* IN THE SOFTWARE. 
++* ++* @file builder_gen.cpp ++* ++* @brief auto-generated file ++* ++* DO NOT EDIT ++* ++******************************************************************************/ ++ ++#include "builder.h" ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::GLOBAL_STRING(StringRef Str, const Twine &Name) ++{ ++ return IRB()->CreateGlobalString(Str, Name); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++CallInst *Builder::MEMSET(Value *Ptr, Value *Val, uint64_t Size, unsigned Align, bool isVolatile, MDNode *TBAATag, MDNode *ScopeTag, MDNode *NoAliasTag) ++{ ++ return IRB()->CreateMemSet(Ptr, Val, Size, Align, isVolatile, TBAATag, ScopeTag, NoAliasTag); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++CallInst *Builder::MEMSET(Value *Ptr, Value *Val, Value *Size, unsigned Align, bool isVolatile, MDNode *TBAATag, MDNode *ScopeTag, MDNode *NoAliasTag) ++{ ++ return IRB()->CreateMemSet(Ptr, Val, Size, Align, isVolatile, TBAATag, ScopeTag, NoAliasTag); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++CallInst *Builder::MEMCPY(Value *Dst, Value *Src, uint64_t Size, unsigned Align, bool isVolatile, MDNode *TBAATag, MDNode *TBAAStructTag, MDNode *ScopeTag, MDNode *NoAliasTag) ++{ ++ return IRB()->CreateMemCpy(Dst, Src, Size, Align, isVolatile, TBAATag, TBAAStructTag, ScopeTag, NoAliasTag); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++CallInst *Builder::MEMCPY(Value *Dst, Value *Src, Value *Size, unsigned Align, bool isVolatile, MDNode *TBAATag, MDNode *TBAAStructTag, MDNode *ScopeTag, MDNode *NoAliasTag) ++{ ++ return IRB()->CreateMemCpy(Dst, Src, Size, Align, isVolatile, TBAATag, TBAAStructTag, ScopeTag, NoAliasTag); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++CallInst *Builder::MEMMOVE(Value *Dst, Value *Src, uint64_t Size, unsigned Align, bool isVolatile, MDNode *TBAATag, MDNode *ScopeTag, MDNode *NoAliasTag) ++{ ++ return IRB()->CreateMemMove(Dst, Src, Size, Align, isVolatile, TBAATag, ScopeTag, NoAliasTag); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++CallInst *Builder::MEMMOVE(Value *Dst, Value *Src, Value *Size, unsigned Align, bool isVolatile, MDNode *TBAATag, MDNode *ScopeTag, MDNode *NoAliasTag) ++{ ++ return IRB()->CreateMemMove(Dst, Src, Size, Align, isVolatile, TBAATag, ScopeTag, NoAliasTag); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++CallInst *Builder::LIFETIME_START(Value *Ptr, ConstantInt *Size) ++{ ++ return IRB()->CreateLifetimeStart(Ptr, Size); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++CallInst *Builder::LIFETIME_END(Value *Ptr, ConstantInt *Size) ++{ ++ return IRB()->CreateLifetimeEnd(Ptr, Size); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++CallInst *Builder::MASKED_LOAD(Value *Ptr, unsigned Align, Value *Mask, Value *PassThru, const Twine &Name) ++{ ++ return IRB()->CreateMaskedLoad(Ptr, Align, Mask, PassThru, Name); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++CallInst *Builder::MASKED_STORE(Value *Val, Value *Ptr, unsigned Align, Value *Mask) ++{ ++ return IRB()->CreateMaskedStore(Val, Ptr, Align, Mask); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++CallInst *Builder::ASSUMPTION(Value *Cond) ++{ ++ 
return IRB()->CreateAssumption(Cond);
++}
++
++//////////////////////////////////////////////////////////////////////////
++CallInst *Builder::GC_STATEPOINT(Value *ActualCallee, ArrayRef<Value*> CallArgs, ArrayRef<Value*> DeoptArgs, ArrayRef<Value*> GCArgs, const Twine &Name)
++{
++    return IRB()->CreateGCStatepoint(ActualCallee, CallArgs, DeoptArgs, GCArgs, Name);
++}
++
++//////////////////////////////////////////////////////////////////////////
++CallInst *Builder::GC_RESULT(Instruction *Statepoint, Type *ResultType, const Twine &Name)
++{
++    return IRB()->CreateGCResult(Statepoint, ResultType, Name);
++}
++
++//////////////////////////////////////////////////////////////////////////
++CallInst *Builder::GC_RELOCATE(Instruction *Statepoint, int BaseOffset, int DerivedOffset, Type *ResultType, const Twine &Name)
++{
++    return IRB()->CreateGCRelocate(Statepoint, BaseOffset, DerivedOffset, ResultType, Name);
++}
++
++//////////////////////////////////////////////////////////////////////////
++ReturnInst *Builder::RET_VOID()
++{
++    return IRB()->CreateRetVoid();
++}
++
++//////////////////////////////////////////////////////////////////////////
++ReturnInst *Builder::RET(Value *V)
++{
++    return IRB()->CreateRet(V);
++}
++
++//////////////////////////////////////////////////////////////////////////
++ReturnInst *Builder::AGGREGATE_RET(Value *const *retVals, unsigned N)
++{
++    return IRB()->CreateAggregateRet(retVals, N);
++}
++
++//////////////////////////////////////////////////////////////////////////
++BranchInst *Builder::BR(BasicBlock *Dest)
++{
++    return IRB()->CreateBr(Dest);
++}
++
++//////////////////////////////////////////////////////////////////////////
++BranchInst *Builder::COND_BR(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights)
++{
++    return IRB()->CreateCondBr(Cond, True, False, BranchWeights);
++}
++
++//////////////////////////////////////////////////////////////////////////
++SwitchInst *Builder::SWITCH(Value *V, BasicBlock *Dest, unsigned NumCases, MDNode *BranchWeights)
++{
++    return IRB()->CreateSwitch(V, Dest, NumCases, BranchWeights);
++}
++
++//////////////////////////////////////////////////////////////////////////
++IndirectBrInst *Builder::INDIRECT_BR(Value *Addr, unsigned NumDests)
++{
++    return IRB()->CreateIndirectBr(Addr, NumDests);
++}
++
++//////////////////////////////////////////////////////////////////////////
++InvokeInst *Builder::INVOKE(Value *Callee, BasicBlock *NormalDest, BasicBlock *UnwindDest, const Twine &Name)
++{
++    return IRB()->CreateInvoke(Callee, NormalDest, UnwindDest, Name);
++}
++
++//////////////////////////////////////////////////////////////////////////
++InvokeInst *Builder::INVOKE(Value *Callee, BasicBlock *NormalDest, BasicBlock *UnwindDest, Value *Arg1, const Twine &Name)
++{
++    return IRB()->CreateInvoke(Callee, NormalDest, UnwindDest, Arg1, Name);
++}
++
++//////////////////////////////////////////////////////////////////////////
++InvokeInst *Builder::INVOKE3(Value *Callee, BasicBlock *NormalDest, BasicBlock *UnwindDest, Value *Arg1, Value *Arg2, Value *Arg3, const Twine &Name)
++{
++    return IRB()->CreateInvoke3(Callee, NormalDest, UnwindDest, Arg1, Arg2, Arg3, Name);
++}
++
++//////////////////////////////////////////////////////////////////////////
++InvokeInst *Builder::INVOKE(Value *Callee, BasicBlock *NormalDest, BasicBlock *UnwindDest, ArrayRef<Value*> Args, const Twine &Name)
++{
++    return IRB()->CreateInvoke(Callee, NormalDest, UnwindDest, Args, Name);
++}
++
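++// The wrappers in this file are thin pass-throughs to the embedded
++// IRBuilder; they compose directly. Hypothetical example (a, b, c are
++// Value* of a common float type):
++//
++//     Value* mad = FADD(FMUL(a, b), c);   // CreateFAdd(CreateFMul(a, b), c)
++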
++////////////////////////////////////////////////////////////////////////// ++ResumeInst *Builder::RESUME(Value *Exn) ++{ ++ return IRB()->CreateResume(Exn); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++UnreachableInst *Builder::UNREACHABLE() ++{ ++ return IRB()->CreateUnreachable(); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::ADD(Value *LHS, Value *RHS, const Twine &Name, bool HasNUW, bool HasNSW) ++{ ++ return IRB()->CreateAdd(LHS, RHS, Name, HasNUW, HasNSW); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::NSW_ADD(Value *LHS, Value *RHS, const Twine &Name) ++{ ++ return IRB()->CreateNSWAdd(LHS, RHS, Name); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::NUW_ADD(Value *LHS, Value *RHS, const Twine &Name) ++{ ++ return IRB()->CreateNUWAdd(LHS, RHS, Name); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::FADD(Value *LHS, Value *RHS, const Twine &Name, MDNode *FPMathTag) ++{ ++ return IRB()->CreateFAdd(LHS, RHS, Name, FPMathTag); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::SUB(Value *LHS, Value *RHS, const Twine &Name, bool HasNUW, bool HasNSW) ++{ ++ return IRB()->CreateSub(LHS, RHS, Name, HasNUW, HasNSW); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::NSW_SUB(Value *LHS, Value *RHS, const Twine &Name) ++{ ++ return IRB()->CreateNSWSub(LHS, RHS, Name); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::NUW_SUB(Value *LHS, Value *RHS, const Twine &Name) ++{ ++ return IRB()->CreateNUWSub(LHS, RHS, Name); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::FSUB(Value *LHS, Value *RHS, const Twine &Name, MDNode *FPMathTag) ++{ ++ return IRB()->CreateFSub(LHS, RHS, Name, FPMathTag); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::MUL(Value *LHS, Value *RHS, const Twine &Name, bool HasNUW, bool HasNSW) ++{ ++ return IRB()->CreateMul(LHS, RHS, Name, HasNUW, HasNSW); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::NSW_MUL(Value *LHS, Value *RHS, const Twine &Name) ++{ ++ return IRB()->CreateNSWMul(LHS, RHS, Name); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::NUW_MUL(Value *LHS, Value *RHS, const Twine &Name) ++{ ++ return IRB()->CreateNUWMul(LHS, RHS, Name); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::FMUL(Value *LHS, Value *RHS, const Twine &Name, MDNode *FPMathTag) ++{ ++ return IRB()->CreateFMul(LHS, RHS, Name, FPMathTag); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::UDIV(Value *LHS, Value *RHS, const Twine &Name, bool isExact) ++{ ++ return IRB()->CreateUDiv(LHS, RHS, Name, isExact); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::EXACT_U_DIV(Value *LHS, Value *RHS, const Twine &Name) ++{ ++ return IRB()->CreateExactUDiv(LHS, RHS, Name); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::SDIV(Value *LHS, Value *RHS, const Twine &Name, bool isExact) 
++{ ++ return IRB()->CreateSDiv(LHS, RHS, Name, isExact); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::EXACT_S_DIV(Value *LHS, Value *RHS, const Twine &Name) ++{ ++ return IRB()->CreateExactSDiv(LHS, RHS, Name); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::FDIV(Value *LHS, Value *RHS, const Twine &Name, MDNode *FPMathTag) ++{ ++ return IRB()->CreateFDiv(LHS, RHS, Name, FPMathTag); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::UREM(Value *LHS, Value *RHS, const Twine &Name) ++{ ++ return IRB()->CreateURem(LHS, RHS, Name); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::SREM(Value *LHS, Value *RHS, const Twine &Name) ++{ ++ return IRB()->CreateSRem(LHS, RHS, Name); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::FREM(Value *LHS, Value *RHS, const Twine &Name, MDNode *FPMathTag) ++{ ++ return IRB()->CreateFRem(LHS, RHS, Name, FPMathTag); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::SHL(Value *LHS, Value *RHS, const Twine &Name, bool HasNUW, bool HasNSW) ++{ ++ return IRB()->CreateShl(LHS, RHS, Name, HasNUW, HasNSW); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::SHL(Value *LHS, const APInt &RHS, const Twine &Name, bool HasNUW, bool HasNSW) ++{ ++ return IRB()->CreateShl(LHS, RHS, Name, HasNUW, HasNSW); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::SHL(Value *LHS, uint64_t RHS, const Twine &Name, bool HasNUW, bool HasNSW) ++{ ++ return IRB()->CreateShl(LHS, RHS, Name, HasNUW, HasNSW); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::LSHR(Value *LHS, Value *RHS, const Twine &Name, bool isExact) ++{ ++ return IRB()->CreateLShr(LHS, RHS, Name, isExact); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::LSHR(Value *LHS, const APInt &RHS, const Twine &Name, bool isExact) ++{ ++ return IRB()->CreateLShr(LHS, RHS, Name, isExact); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::LSHR(Value *LHS, uint64_t RHS, const Twine &Name, bool isExact) ++{ ++ return IRB()->CreateLShr(LHS, RHS, Name, isExact); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::ASHR(Value *LHS, Value *RHS, const Twine &Name, bool isExact) ++{ ++ return IRB()->CreateAShr(LHS, RHS, Name, isExact); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::ASHR(Value *LHS, const APInt &RHS, const Twine &Name, bool isExact) ++{ ++ return IRB()->CreateAShr(LHS, RHS, Name, isExact); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::ASHR(Value *LHS, uint64_t RHS, const Twine &Name, bool isExact) ++{ ++ return IRB()->CreateAShr(LHS, RHS, Name, isExact); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::AND(Value *LHS, Value *RHS, const Twine &Name) ++{ ++ return IRB()->CreateAnd(LHS, RHS, Name); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::AND(Value *LHS, const APInt &RHS, const Twine &Name) ++{ 
++ return IRB()->CreateAnd(LHS, RHS, Name); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::AND(Value *LHS, uint64_t RHS, const Twine &Name) ++{ ++ return IRB()->CreateAnd(LHS, RHS, Name); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::OR(Value *LHS, Value *RHS, const Twine &Name) ++{ ++ return IRB()->CreateOr(LHS, RHS, Name); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::OR(Value *LHS, const APInt &RHS, const Twine &Name) ++{ ++ return IRB()->CreateOr(LHS, RHS, Name); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::OR(Value *LHS, uint64_t RHS, const Twine &Name) ++{ ++ return IRB()->CreateOr(LHS, RHS, Name); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::XOR(Value *LHS, Value *RHS, const Twine &Name) ++{ ++ return IRB()->CreateXor(LHS, RHS, Name); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::XOR(Value *LHS, const APInt &RHS, const Twine &Name) ++{ ++ return IRB()->CreateXor(LHS, RHS, Name); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::XOR(Value *LHS, uint64_t RHS, const Twine &Name) ++{ ++ return IRB()->CreateXor(LHS, RHS, Name); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::BINOP(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name, MDNode *FPMathTag) ++{ ++ return IRB()->CreateBinOp(Opc, LHS, RHS, Name, FPMathTag); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::NEG(Value *V, const Twine &Name, bool HasNUW, bool HasNSW) ++{ ++ return IRB()->CreateNeg(V, Name, HasNUW, HasNSW); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::NSW_NEG(Value *V, const Twine &Name) ++{ ++ return IRB()->CreateNSWNeg(V, Name); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::NUW_NEG(Value *V, const Twine &Name) ++{ ++ return IRB()->CreateNUWNeg(V, Name); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::FNEG(Value *V, const Twine &Name, MDNode *FPMathTag) ++{ ++ return IRB()->CreateFNeg(V, Name, FPMathTag); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::NOT(Value *V, const Twine &Name) ++{ ++ return IRB()->CreateNot(V, Name); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++AllocaInst *Builder::ALLOCA(Type *Ty, Value *ArraySize, const Twine &Name) ++{ ++ return IRB()->CreateAlloca(Ty, ArraySize, Name); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++LoadInst *Builder::LOAD(Value *Ptr, const char *Name) ++{ ++ return IRB()->CreateLoad(Ptr, Name); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++LoadInst *Builder::LOAD(Value *Ptr, const Twine &Name) ++{ ++ return IRB()->CreateLoad(Ptr, Name); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++LoadInst *Builder::LOAD(Value *Ptr, bool isVolatile, const Twine &Name) ++{ ++ return IRB()->CreateLoad(Ptr, isVolatile, Name); ++} ++ ++////////////////////////////////////////////////////////////////////////// 
++StoreInst *Builder::STORE(Value *Val, Value *Ptr, bool isVolatile)
++{
++    return IRB()->CreateStore(Val, Ptr, isVolatile);
++}
++
++//////////////////////////////////////////////////////////////////////////
++LoadInst *Builder::ALIGNED_LOAD(Value *Ptr, unsigned Align, const char *Name)
++{
++    return IRB()->CreateAlignedLoad(Ptr, Align, Name);
++}
++
++//////////////////////////////////////////////////////////////////////////
++LoadInst *Builder::ALIGNED_LOAD(Value *Ptr, unsigned Align, const Twine &Name)
++{
++    return IRB()->CreateAlignedLoad(Ptr, Align, Name);
++}
++
++//////////////////////////////////////////////////////////////////////////
++LoadInst *Builder::ALIGNED_LOAD(Value *Ptr, unsigned Align, bool isVolatile, const Twine &Name)
++{
++    return IRB()->CreateAlignedLoad(Ptr, Align, isVolatile, Name);
++}
++
++//////////////////////////////////////////////////////////////////////////
++StoreInst *Builder::ALIGNED_STORE(Value *Val, Value *Ptr, unsigned Align, bool isVolatile)
++{
++    return IRB()->CreateAlignedStore(Val, Ptr, Align, isVolatile);
++}
++
++//////////////////////////////////////////////////////////////////////////
++FenceInst *Builder::FENCE(AtomicOrdering Ordering, SynchronizationScope SynchScope, const Twine &Name)
++{
++    return IRB()->CreateFence(Ordering, SynchScope, Name);
++}
++
++//////////////////////////////////////////////////////////////////////////
++AtomicCmpXchgInst *Builder::ATOMIC_CMP_XCHG(Value *Ptr, Value *Cmp, Value *New, AtomicOrdering SuccessOrdering, AtomicOrdering FailureOrdering, SynchronizationScope SynchScope)
++{
++    return IRB()->CreateAtomicCmpXchg(Ptr, Cmp, New, SuccessOrdering, FailureOrdering, SynchScope);
++}
++
++//////////////////////////////////////////////////////////////////////////
++AtomicRMWInst *Builder::ATOMIC_RMW(AtomicRMWInst::BinOp Op, Value *Ptr, Value *Val, AtomicOrdering Ordering, SynchronizationScope SynchScope)
++{
++    return IRB()->CreateAtomicRMW(Op, Ptr, Val, Ordering, SynchScope);
++}
++
++//////////////////////////////////////////////////////////////////////////
++Value *Builder::GEPA(Value *Ptr, ArrayRef<Value*> IdxList, const Twine &Name)
++{
++    return IRB()->CreateGEP(Ptr, IdxList, Name);
++}
++
++//////////////////////////////////////////////////////////////////////////
++Value *Builder::IN_BOUNDS_GEP(Value *Ptr, ArrayRef<Value*> IdxList, const Twine &Name)
++{
++    return IRB()->CreateInBoundsGEP(Ptr, IdxList, Name);
++}
++
++//////////////////////////////////////////////////////////////////////////
++Value *Builder::GEP(Value *Ptr, Value *Idx, const Twine &Name)
++{
++    return IRB()->CreateGEP(Ptr, Idx, Name);
++}
++
++//////////////////////////////////////////////////////////////////////////
++Value *Builder::IN_BOUNDS_GEP(Value *Ptr, Value *Idx, const Twine &Name)
++{
++    return IRB()->CreateInBoundsGEP(Ptr, Idx, Name);
++}
++
++//////////////////////////////////////////////////////////////////////////
++Value *Builder::CONST_GEP1_32(Value *Ptr, unsigned Idx0, const Twine &Name)
++{
++    return IRB()->CreateConstGEP1_32(Ptr, Idx0, Name);
++}
++
++//////////////////////////////////////////////////////////////////////////
++Value *Builder::CONST_IN_BOUNDS_GEP1_32(Value *Ptr, unsigned Idx0, const Twine &Name)
++{
++    return IRB()->CreateConstInBoundsGEP1_32(Ptr, Idx0, Name);
++}
++
++//////////////////////////////////////////////////////////////////////////
++Value *Builder::CONST_GEP2_32(Value *Ptr, unsigned Idx0, unsigned Idx1, const Twine &Name)
++{
++    return IRB()->CreateConstGEP2_32(Ptr, Idx0, Idx1, Name);
++}
++
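++// Reminder (illustrative): the GEP family computes addresses only and never
++// touches memory; e.g. for p of LLVM type {float, i32}*,
++//
++//     CONST_GEP2_32(p, 0, 1);   // address of the i32 member; pair with
++//                               // LOAD/STORE to actually access it
++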
++////////////////////////////////////////////////////////////////////////// ++Value *Builder::CONST_IN_BOUNDS_GEP2_32(Value *Ptr, unsigned Idx0, unsigned Idx1, const Twine &Name) ++{ ++ return IRB()->CreateConstInBoundsGEP2_32(Ptr, Idx0, Idx1, Name); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::CONST_GEP1_64(Value *Ptr, uint64_t Idx0, const Twine &Name) ++{ ++ return IRB()->CreateConstGEP1_64(Ptr, Idx0, Name); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::CONST_IN_BOUNDS_GEP1_64(Value *Ptr, uint64_t Idx0, const Twine &Name) ++{ ++ return IRB()->CreateConstInBoundsGEP1_64(Ptr, Idx0, Name); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::CONST_GEP2_64(Value *Ptr, uint64_t Idx0, uint64_t Idx1, const Twine &Name) ++{ ++ return IRB()->CreateConstGEP2_64(Ptr, Idx0, Idx1, Name); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::CONST_IN_BOUNDS_GEP2_64(Value *Ptr, uint64_t Idx0, uint64_t Idx1, const Twine &Name) ++{ ++ return IRB()->CreateConstInBoundsGEP2_64(Ptr, Idx0, Idx1, Name); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::STRUCT_GEP(Value *Ptr, unsigned Idx, const Twine &Name) ++{ ++ return IRB()->CreateStructGEP(Ptr, Idx, Name); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::GLOBAL_STRING_PTR(StringRef Str, const Twine &Name) ++{ ++ return IRB()->CreateGlobalStringPtr(Str, Name); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::TRUNC(Value *V, Type *DestTy, const Twine &Name) ++{ ++ return IRB()->CreateTrunc(V, DestTy, Name); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::Z_EXT(Value *V, Type *DestTy, const Twine &Name) ++{ ++ return IRB()->CreateZExt(V, DestTy, Name); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::S_EXT(Value *V, Type *DestTy, const Twine &Name) ++{ ++ return IRB()->CreateSExt(V, DestTy, Name); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::Z_EXT_OR_TRUNC(Value *V, Type *DestTy, const Twine &Name) ++{ ++ return IRB()->CreateZExtOrTrunc(V, DestTy, Name); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::S_EXT_OR_TRUNC(Value *V, Type *DestTy, const Twine &Name) ++{ ++ return IRB()->CreateSExtOrTrunc(V, DestTy, Name); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::FP_TO_UI(Value *V, Type *DestTy, const Twine &Name) ++{ ++ return IRB()->CreateFPToUI(V, DestTy, Name); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::FP_TO_SI(Value *V, Type *DestTy, const Twine &Name) ++{ ++ return IRB()->CreateFPToSI(V, DestTy, Name); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::UI_TO_FP(Value *V, Type *DestTy, const Twine &Name) ++{ ++ return IRB()->CreateUIToFP(V, DestTy, Name); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::SI_TO_FP(Value *V, Type *DestTy, const Twine &Name) ++{ ++ return IRB()->CreateSIToFP(V, DestTy, Name); ++} ++ 
++//////////////////////////////////////////////////////////////////////////
++Value *Builder::FP_TRUNC(Value *V, Type *DestTy, const Twine &Name)
++{
++ return IRB()->CreateFPTrunc(V, DestTy, Name);
++}
++
++//////////////////////////////////////////////////////////////////////////
++Value *Builder::FP_EXT(Value *V, Type *DestTy, const Twine &Name)
++{
++ return IRB()->CreateFPExt(V, DestTy, Name);
++}
++
++//////////////////////////////////////////////////////////////////////////
++Value *Builder::PTR_TO_INT(Value *V, Type *DestTy, const Twine &Name)
++{
++ return IRB()->CreatePtrToInt(V, DestTy, Name);
++}
++
++//////////////////////////////////////////////////////////////////////////
++Value *Builder::INT_TO_PTR(Value *V, Type *DestTy, const Twine &Name)
++{
++ return IRB()->CreateIntToPtr(V, DestTy, Name);
++}
++
++//////////////////////////////////////////////////////////////////////////
++Value *Builder::BITCAST(Value *V, Type *DestTy, const Twine &Name)
++{
++ return IRB()->CreateBitCast(V, DestTy, Name);
++}
++
++//////////////////////////////////////////////////////////////////////////
++Value *Builder::ADDR_SPACE_CAST(Value *V, Type *DestTy, const Twine &Name)
++{
++ return IRB()->CreateAddrSpaceCast(V, DestTy, Name);
++}
++
++//////////////////////////////////////////////////////////////////////////
++Value *Builder::Z_EXT_OR_BIT_CAST(Value *V, Type *DestTy, const Twine &Name)
++{
++ return IRB()->CreateZExtOrBitCast(V, DestTy, Name);
++}
++
++//////////////////////////////////////////////////////////////////////////
++Value *Builder::S_EXT_OR_BIT_CAST(Value *V, Type *DestTy, const Twine &Name)
++{
++ return IRB()->CreateSExtOrBitCast(V, DestTy, Name);
++}
++
++//////////////////////////////////////////////////////////////////////////
++Value *Builder::TRUNC_OR_BIT_CAST(Value *V, Type *DestTy, const Twine &Name)
++{
++ return IRB()->CreateTruncOrBitCast(V, DestTy, Name);
++}
++
++//////////////////////////////////////////////////////////////////////////
++Value *Builder::CAST(Instruction::CastOps Op, Value *V, Type *DestTy, const Twine &Name)
++{
++ return IRB()->CreateCast(Op, V, DestTy, Name);
++}
++
++//////////////////////////////////////////////////////////////////////////
++Value *Builder::POINTER_CAST(Value *V, Type *DestTy, const Twine &Name)
++{
++ return IRB()->CreatePointerCast(V, DestTy, Name);
++}
++
++//////////////////////////////////////////////////////////////////////////
++Value *Builder::POINTER_BIT_CAST_OR_ADDR_SPACE_CAST(Value *V, Type *DestTy, const Twine &Name)
++{
++ return IRB()->CreatePointerBitCastOrAddrSpaceCast(V, DestTy, Name);
++}
++
++//////////////////////////////////////////////////////////////////////////
++Value *Builder::INT_CAST(Value *V, Type *DestTy, bool isSigned, const Twine &Name)
++{
++ return IRB()->CreateIntCast(V, DestTy, isSigned, Name);
++}
++
++//////////////////////////////////////////////////////////////////////////
++Value *Builder::BIT_OR_POINTER_CAST(Value *V, Type *DestTy, const Twine &Name)
++{
++ return IRB()->CreateBitOrPointerCast(V, DestTy, Name);
++}
++
++//////////////////////////////////////////////////////////////////////////
++Value *Builder::FP_CAST(Value *V, Type *DestTy, const Twine &Name)
++{
++ return IRB()->CreateFPCast(V, DestTy, Name);
++}
++
++//////////////////////////////////////////////////////////////////////////
++Value *Builder::ICMP_EQ(Value *LHS, Value *RHS, const Twine &Name)
++{
++ return IRB()->CreateICmpEQ(LHS, RHS, Name);
++}
++
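Note the distinction between BITCAST, which reinterprets a value's bits in a new type, and the conversion casts such as FP_TO_SI, which compute a new value. A standalone C++ model of the two on the same float (illustrative only, not part of this patch):

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    int main() {
        float f = 1.0f;
        uint32_t bits;
        std::memcpy(&bits, &f, sizeof bits);  // BITCAST:  raw IEEE-754 bits, 0x3F800000
        int32_t val = (int32_t)f;             // FP_TO_SI: numeric conversion, 1
        printf("bitcast=0x%08X fptosi=%d\n", bits, val);
        return 0;
    }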
++//////////////////////////////////////////////////////////////////////////
++Value *Builder::ICMP_NE(Value *LHS, Value *RHS, const Twine &Name)
++{
++ return IRB()->CreateICmpNE(LHS, RHS, Name);
++}
++
++//////////////////////////////////////////////////////////////////////////
++Value *Builder::ICMP_UGT(Value *LHS, Value *RHS, const Twine &Name)
++{
++ return IRB()->CreateICmpUGT(LHS, RHS, Name);
++}
++
++//////////////////////////////////////////////////////////////////////////
++Value *Builder::ICMP_UGE(Value *LHS, Value *RHS, const Twine &Name)
++{
++ return IRB()->CreateICmpUGE(LHS, RHS, Name);
++}
++
++//////////////////////////////////////////////////////////////////////////
++Value *Builder::ICMP_ULT(Value *LHS, Value *RHS, const Twine &Name)
++{
++ return IRB()->CreateICmpULT(LHS, RHS, Name);
++}
++
++//////////////////////////////////////////////////////////////////////////
++Value *Builder::ICMP_ULE(Value *LHS, Value *RHS, const Twine &Name)
++{
++ return IRB()->CreateICmpULE(LHS, RHS, Name);
++}
++
++//////////////////////////////////////////////////////////////////////////
++Value *Builder::ICMP_SGT(Value *LHS, Value *RHS, const Twine &Name)
++{
++ return IRB()->CreateICmpSGT(LHS, RHS, Name);
++}
++
++//////////////////////////////////////////////////////////////////////////
++Value *Builder::ICMP_SGE(Value *LHS, Value *RHS, const Twine &Name)
++{
++ return IRB()->CreateICmpSGE(LHS, RHS, Name);
++}
++
++//////////////////////////////////////////////////////////////////////////
++Value *Builder::ICMP_SLT(Value *LHS, Value *RHS, const Twine &Name)
++{
++ return IRB()->CreateICmpSLT(LHS, RHS, Name);
++}
++
++//////////////////////////////////////////////////////////////////////////
++Value *Builder::ICMP_SLE(Value *LHS, Value *RHS, const Twine &Name)
++{
++ return IRB()->CreateICmpSLE(LHS, RHS, Name);
++}
++
++//////////////////////////////////////////////////////////////////////////
++Value *Builder::FCMP_OEQ(Value *LHS, Value *RHS, const Twine &Name)
++{
++ return IRB()->CreateFCmpOEQ(LHS, RHS, Name);
++}
++
++//////////////////////////////////////////////////////////////////////////
++Value *Builder::FCMP_OGT(Value *LHS, Value *RHS, const Twine &Name)
++{
++ return IRB()->CreateFCmpOGT(LHS, RHS, Name);
++}
++
++//////////////////////////////////////////////////////////////////////////
++Value *Builder::FCMP_OGE(Value *LHS, Value *RHS, const Twine &Name)
++{
++ return IRB()->CreateFCmpOGE(LHS, RHS, Name);
++}
++
++//////////////////////////////////////////////////////////////////////////
++Value *Builder::FCMP_OLT(Value *LHS, Value *RHS, const Twine &Name)
++{
++ return IRB()->CreateFCmpOLT(LHS, RHS, Name);
++}
++
++//////////////////////////////////////////////////////////////////////////
++Value *Builder::FCMP_OLE(Value *LHS, Value *RHS, const Twine &Name)
++{
++ return IRB()->CreateFCmpOLE(LHS, RHS, Name);
++}
++
++//////////////////////////////////////////////////////////////////////////
++Value *Builder::FCMP_ONE(Value *LHS, Value *RHS, const Twine &Name)
++{
++ return IRB()->CreateFCmpONE(LHS, RHS, Name);
++}
++
++//////////////////////////////////////////////////////////////////////////
++Value *Builder::FCMP_ORD(Value *LHS, Value *RHS, const Twine &Name)
++{
++ return IRB()->CreateFCmpORD(LHS, RHS, Name);
++}
++
++//////////////////////////////////////////////////////////////////////////
++Value *Builder::FCMP_UNO(Value *LHS, Value *RHS, const Twine &Name)
++{
++ return IRB()->CreateFCmpUNO(LHS, RHS, Name);
++}
++
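FCMP_ORD and FCMP_UNO above mark the boundary between the ordered and unordered predicate families: an O* compare is false whenever either operand is NaN, while the matching U* compare is true in that case. A standalone C++ model of FCMP_OLT versus FCMP_ULT (illustrative only, not part of this patch):

    #include <cmath>
    #include <cstdio>

    int main() {
        float a = NAN, b = 1.0f;
        bool olt = (a < b);                                    // FCMP_OLT: false when NaN present
        bool ult = (a < b) || std::isnan(a) || std::isnan(b);  // FCMP_ULT: true when NaN present
        printf("olt=%d ult=%d\n", olt, ult);
        return 0;
    }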
++//////////////////////////////////////////////////////////////////////////
++Value *Builder::FCMP_UEQ(Value *LHS, Value *RHS, const Twine &Name)
++{
++ return IRB()->CreateFCmpUEQ(LHS, RHS, Name);
++}
++
++//////////////////////////////////////////////////////////////////////////
++Value *Builder::FCMP_UGT(Value *LHS, Value *RHS, const Twine &Name)
++{
++ return IRB()->CreateFCmpUGT(LHS, RHS, Name);
++}
++
++//////////////////////////////////////////////////////////////////////////
++Value *Builder::FCMP_UGE(Value *LHS, Value *RHS, const Twine &Name)
++{
++ return IRB()->CreateFCmpUGE(LHS, RHS, Name);
++}
++
++//////////////////////////////////////////////////////////////////////////
++Value *Builder::FCMP_ULT(Value *LHS, Value *RHS, const Twine &Name)
++{
++ return IRB()->CreateFCmpULT(LHS, RHS, Name);
++}
++
++//////////////////////////////////////////////////////////////////////////
++Value *Builder::FCMP_ULE(Value *LHS, Value *RHS, const Twine &Name)
++{
++ return IRB()->CreateFCmpULE(LHS, RHS, Name);
++}
++
++//////////////////////////////////////////////////////////////////////////
++Value *Builder::FCMP_UNE(Value *LHS, Value *RHS, const Twine &Name)
++{
++ return IRB()->CreateFCmpUNE(LHS, RHS, Name);
++}
++
++//////////////////////////////////////////////////////////////////////////
++Value *Builder::ICMP(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name)
++{
++ return IRB()->CreateICmp(P, LHS, RHS, Name);
++}
++
++//////////////////////////////////////////////////////////////////////////
++Value *Builder::FCMP(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name)
++{
++ return IRB()->CreateFCmp(P, LHS, RHS, Name);
++}
++
++//////////////////////////////////////////////////////////////////////////
++PHINode *Builder::PHI(Type *Ty, unsigned NumReservedValues, const Twine &Name)
++{
++ return IRB()->CreatePHI(Ty, NumReservedValues, Name);
++}
++
++//////////////////////////////////////////////////////////////////////////
++CallInst *Builder::CALL(Value *Callee, const Twine &Name)
++{
++ return IRB()->CreateCall(Callee, Name);
++}
++
++//////////////////////////////////////////////////////////////////////////
++CallInst *Builder::CALL(Value *Callee, Value *Arg, const Twine &Name)
++{
++ return IRB()->CreateCall(Callee, Arg, Name);
++}
++
++//////////////////////////////////////////////////////////////////////////
++CallInst *Builder::CALL2(Value *Callee, Value *Arg1, Value *Arg2, const Twine &Name)
++{
++ return IRB()->CreateCall2(Callee, Arg1, Arg2, Name);
++}
++
++//////////////////////////////////////////////////////////////////////////
++CallInst *Builder::CALL3(Value *Callee, Value *Arg1, Value *Arg2, Value *Arg3, const Twine &Name)
++{
++ return IRB()->CreateCall3(Callee, Arg1, Arg2, Arg3, Name);
++}
++
++//////////////////////////////////////////////////////////////////////////
++CallInst *Builder::CALL4(Value *Callee, Value *Arg1, Value *Arg2, Value *Arg3, Value *Arg4, const Twine &Name)
++{
++ return IRB()->CreateCall4(Callee, Arg1, Arg2, Arg3, Arg4, Name);
++}
++
++//////////////////////////////////////////////////////////////////////////
++CallInst *Builder::CALL5(Value *Callee, Value *Arg1, Value *Arg2, Value *Arg3, Value *Arg4, Value *Arg5, const Twine &Name)
++{
++ return IRB()->CreateCall5(Callee, Arg1, Arg2, Arg3, Arg4, Arg5, Name);
++}
++
++//////////////////////////////////////////////////////////////////////////
++CallInst *Builder::CALLA(Value *Callee, ArrayRef<Value *> Args, const Twine &Name)
++{
++ return IRB()->CreateCall(Callee, Args, Name);
++}
++
++//////////////////////////////////////////////////////////////////////////
++Value *Builder::SELECT(Value *C, Value *True, Value *False, const Twine &Name)
++{
++ return IRB()->CreateSelect(C, True, False, Name);
++}
++
++//////////////////////////////////////////////////////////////////////////
++VAArgInst *Builder::VA_ARG(Value *List, Type *Ty, const Twine &Name)
++{
++ return IRB()->CreateVAArg(List, Ty, Name);
++}
++
++//////////////////////////////////////////////////////////////////////////
++Value *Builder::VEXTRACT(Value *Vec, Value *Idx, const Twine &Name)
++{
++ return IRB()->CreateExtractElement(Vec, Idx, Name);
++}
++
++//////////////////////////////////////////////////////////////////////////
++Value *Builder::VINSERT(Value *Vec, Value *NewElt, Value *Idx, const Twine &Name)
++{
++ return IRB()->CreateInsertElement(Vec, NewElt, Idx, Name);
++}
++
++//////////////////////////////////////////////////////////////////////////
++Value *Builder::VSHUFFLE(Value *V1, Value *V2, Value *Mask, const Twine &Name)
++{
++ return IRB()->CreateShuffleVector(V1, V2, Mask, Name);
++}
++
++//////////////////////////////////////////////////////////////////////////
++Value *Builder::EXTRACT_VALUE(Value *Agg, ArrayRef<unsigned> Idxs, const Twine &Name)
++{
++ return IRB()->CreateExtractValue(Agg, Idxs, Name);
++}
++
++//////////////////////////////////////////////////////////////////////////
++Value *Builder::INSERT_VALUE(Value *Agg, Value *Val, ArrayRef<unsigned> Idxs, const Twine &Name)
++{
++ return IRB()->CreateInsertValue(Agg, Val, Idxs, Name);
++}
++
++//////////////////////////////////////////////////////////////////////////
++LandingPadInst *Builder::LANDING_PAD(Type *Ty, Value *PersFn, unsigned NumClauses, const Twine &Name)
++{
++ return IRB()->CreateLandingPad(Ty, PersFn, NumClauses, Name);
++}
++
++//////////////////////////////////////////////////////////////////////////
++Value *Builder::IS_NULL(Value *Arg, const Twine &Name)
++{
++ return IRB()->CreateIsNull(Arg, Name);
++}
++
++//////////////////////////////////////////////////////////////////////////
++Value *Builder::IS_NOT_NULL(Value *Arg, const Twine &Name)
++{
++ return IRB()->CreateIsNotNull(Arg, Name);
++}
++
++//////////////////////////////////////////////////////////////////////////
++Value *Builder::PTR_DIFF(Value *LHS, Value *RHS, const Twine &Name)
++{
++ return IRB()->CreatePtrDiff(LHS, RHS, Name);
++}
++
++//////////////////////////////////////////////////////////////////////////
++Value *Builder::VECTOR_SPLAT(unsigned NumElts, Value *V, const Twine &Name)
++{
++ return IRB()->CreateVectorSplat(NumElts, V, Name);
++}
++
++//////////////////////////////////////////////////////////////////////////
++Value *Builder::EXTRACT_INTEGER(const DataLayout &DL, Value *From, IntegerType *ExtractedTy, uint64_t Offset, const Twine &Name)
++{
++ return IRB()->CreateExtractInteger(DL, From, ExtractedTy, Offset, Name);
++}
++
++//////////////////////////////////////////////////////////////////////////
++CallInst *Builder::ALIGNMENT_ASSUMPTION(const DataLayout &DL, Value *PtrValue, unsigned Alignment, Value *OffsetValue)
++{
++ return IRB()->CreateAlignmentAssumption(DL, PtrValue, Alignment, OffsetValue);
++}
++
+diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_gen.h b/src/gallium/drivers/swr/rasterizer/jitter/builder_gen.h
+new file mode 100644
+index 0000000..c39077c
+--- /dev/null
++++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_gen.h
+@@ -0,0 +1,205 @@
++/****************************************************************************
++* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
++*
++* Permission is hereby granted, free of charge, to any person obtaining a
++* copy of this software and associated documentation files (the "Software"),
++* to deal in the Software without restriction, including without limitation
++* the rights to use, copy, modify, merge, publish, distribute, sublicense,
++* and/or sell copies of the Software, and to permit persons to whom the
++* Software is furnished to do so, subject to the following conditions:
++*
++* The above copyright notice and this permission notice (including the next
++* paragraph) shall be included in all copies or substantial portions of the
++* Software.
++*
++* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
++* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++* IN THE SOFTWARE.
++*
++* @file builder_gen.h
++*
++* @brief auto-generated file
++*
++* DO NOT EDIT
++*
++******************************************************************************/
++
++#pragma once
++
++//////////////////////////////////////////////////////////////////////////
++/// Auto-generated Builder IR declarations
++//////////////////////////////////////////////////////////////////////////
++Value *GLOBAL_STRING(StringRef Str, const Twine &Name = "");
++CallInst *MEMSET(Value *Ptr, Value *Val, uint64_t Size, unsigned Align, bool isVolatile = false, MDNode *TBAATag = nullptr, MDNode *ScopeTag = nullptr, MDNode *NoAliasTag = nullptr);
++CallInst *MEMSET(Value *Ptr, Value *Val, Value *Size, unsigned Align, bool isVolatile = false, MDNode *TBAATag = nullptr, MDNode *ScopeTag = nullptr, MDNode *NoAliasTag = nullptr);
++CallInst *MEMCPY(Value *Dst, Value *Src, uint64_t Size, unsigned Align, bool isVolatile = false, MDNode *TBAATag = nullptr, MDNode *TBAAStructTag = nullptr, MDNode *ScopeTag = nullptr, MDNode *NoAliasTag = nullptr);
++CallInst *MEMCPY(Value *Dst, Value *Src, Value *Size, unsigned Align, bool isVolatile = false, MDNode *TBAATag = nullptr, MDNode *TBAAStructTag = nullptr, MDNode *ScopeTag = nullptr, MDNode *NoAliasTag = nullptr);
++CallInst *MEMMOVE(Value *Dst, Value *Src, uint64_t Size, unsigned Align, bool isVolatile = false, MDNode *TBAATag = nullptr, MDNode *ScopeTag = nullptr, MDNode *NoAliasTag = nullptr);
++CallInst *MEMMOVE(Value *Dst, Value *Src, Value *Size, unsigned Align, bool isVolatile = false, MDNode *TBAATag = nullptr, MDNode *ScopeTag = nullptr, MDNode *NoAliasTag = nullptr);
++CallInst *LIFETIME_START(Value *Ptr, ConstantInt *Size = nullptr);
++CallInst *LIFETIME_END(Value *Ptr, ConstantInt *Size = nullptr);
++CallInst *MASKED_LOAD(Value *Ptr, unsigned Align, Value *Mask, Value *PassThru = 0, const Twine &Name = "");
++CallInst *MASKED_STORE(Value *Val, Value *Ptr, unsigned Align, Value *Mask);
++CallInst *ASSUMPTION(Value *Cond);
++CallInst *GC_STATEPOINT(Value *ActualCallee, ArrayRef<Value *> CallArgs, ArrayRef<Value *> DeoptArgs, ArrayRef<Value *> GCArgs, const Twine &Name = "");
++CallInst *GC_RESULT(Instruction *Statepoint, Type *ResultType, const Twine &Name = "");
++CallInst *GC_RELOCATE(Instruction *Statepoint, int BaseOffset, int DerivedOffset, Type *ResultType, const Twine &Name = "");
++ReturnInst *RET_VOID();
++ReturnInst *RET(Value *V);
++ReturnInst *AGGREGATE_RET(Value *const *retVals, unsigned N);
++BranchInst *BR(BasicBlock *Dest);
++BranchInst *COND_BR(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights = nullptr);
++SwitchInst *SWITCH(Value *V, BasicBlock *Dest, unsigned NumCases = 10, MDNode *BranchWeights = nullptr);
++IndirectBrInst *INDIRECT_BR(Value *Addr, unsigned NumDests = 10);
++InvokeInst *INVOKE(Value *Callee, BasicBlock *NormalDest, BasicBlock *UnwindDest, const Twine &Name = "");
++InvokeInst *INVOKE(Value *Callee, BasicBlock *NormalDest, BasicBlock *UnwindDest, Value *Arg1, const Twine &Name = "");
++InvokeInst *INVOKE3(Value *Callee, BasicBlock *NormalDest, BasicBlock *UnwindDest, Value *Arg1, Value *Arg2, Value *Arg3, const Twine &Name = "");
++InvokeInst *INVOKE(Value *Callee, BasicBlock *NormalDest, BasicBlock *UnwindDest, ArrayRef<Value *> Args, const Twine &Name = "");
++ResumeInst *RESUME(Value *Exn);
++UnreachableInst *UNREACHABLE();
++Value *ADD(Value *LHS, Value *RHS, const Twine &Name = "", bool HasNUW = false, bool HasNSW = false);
++Value *NSW_ADD(Value *LHS, Value *RHS, const Twine &Name = "");
++Value *NUW_ADD(Value *LHS, Value *RHS, const Twine &Name = "");
++Value *FADD(Value *LHS, Value *RHS, const Twine &Name = "", MDNode *FPMathTag = nullptr);
++Value *SUB(Value *LHS, Value *RHS, const Twine &Name = "", bool HasNUW = false, bool HasNSW = false);
++Value *NSW_SUB(Value *LHS, Value *RHS, const Twine &Name = "");
++Value *NUW_SUB(Value *LHS, Value *RHS, const Twine &Name = "");
++Value *FSUB(Value *LHS, Value *RHS, const Twine &Name = "", MDNode *FPMathTag = nullptr);
++Value *MUL(Value *LHS, Value *RHS, const Twine &Name = "", bool HasNUW = false, bool HasNSW = false);
++Value *NSW_MUL(Value *LHS, Value *RHS, const Twine &Name = "");
++Value *NUW_MUL(Value *LHS, Value *RHS, const Twine &Name = "");
++Value *FMUL(Value *LHS, Value *RHS, const Twine &Name = "", MDNode *FPMathTag = nullptr);
++Value *UDIV(Value *LHS, Value *RHS, const Twine &Name = "", bool isExact = false);
++Value *EXACT_U_DIV(Value *LHS, Value *RHS, const Twine &Name = "");
++Value *SDIV(Value *LHS, Value *RHS, const Twine &Name = "", bool isExact = false);
++Value *EXACT_S_DIV(Value *LHS, Value *RHS, const Twine &Name = "");
++Value *FDIV(Value *LHS, Value *RHS, const Twine &Name = "", MDNode *FPMathTag = nullptr);
++Value *UREM(Value *LHS, Value *RHS, const Twine &Name = "");
++Value *SREM(Value *LHS, Value *RHS, const Twine &Name = "");
++Value *FREM(Value *LHS, Value *RHS, const Twine &Name = "", MDNode *FPMathTag = nullptr);
++Value *SHL(Value *LHS, Value *RHS, const Twine &Name = "", bool HasNUW = false, bool HasNSW = false);
++Value *SHL(Value *LHS, const APInt &RHS, const Twine &Name = "", bool HasNUW = false, bool HasNSW = false);
++Value *SHL(Value *LHS, uint64_t RHS, const Twine &Name = "", bool HasNUW = false, bool HasNSW = false);
++Value *LSHR(Value *LHS, Value *RHS, const Twine &Name = "", bool isExact = false);
++Value *LSHR(Value *LHS, const APInt &RHS, const Twine &Name = "", bool isExact = false);
++Value *LSHR(Value *LHS, uint64_t RHS, const Twine &Name = "", bool isExact = false);
++Value *ASHR(Value *LHS, Value *RHS, const Twine &Name = "", bool isExact = false);
++Value *ASHR(Value *LHS, const APInt &RHS, const Twine &Name = "", bool isExact = false);
++Value *ASHR(Value *LHS, uint64_t RHS, const Twine &Name = "", bool isExact = false);
++Value *AND(Value *LHS, Value *RHS, const Twine &Name = "");
++Value *AND(Value *LHS, const APInt &RHS, const Twine &Name = "");
++Value *AND(Value *LHS, uint64_t RHS, const Twine &Name = "");
++Value *OR(Value *LHS, Value *RHS, const Twine &Name = "");
++Value *OR(Value *LHS, const APInt &RHS, const Twine &Name = "");
++Value *OR(Value *LHS, uint64_t RHS, const Twine &Name = "");
++Value *XOR(Value *LHS, Value *RHS, const Twine &Name = "");
++Value *XOR(Value *LHS, const APInt &RHS, const Twine &Name = "");
++Value *XOR(Value *LHS, uint64_t RHS, const Twine &Name = "");
++Value *BINOP(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name = "", MDNode *FPMathTag = nullptr);
++Value *NEG(Value *V, const Twine &Name = "", bool HasNUW = false, bool HasNSW = false);
++Value *NSW_NEG(Value *V, const Twine &Name = "");
++Value *NUW_NEG(Value *V, const Twine &Name = "");
++Value *FNEG(Value *V, const Twine &Name = "", MDNode *FPMathTag = nullptr);
++Value *NOT(Value *V, const Twine &Name = "");
++AllocaInst *ALLOCA(Type *Ty, Value *ArraySize = nullptr, const Twine &Name = "");
++LoadInst *LOAD(Value *Ptr, const char *Name);
++LoadInst *LOAD(Value *Ptr, const Twine &Name = "");
++LoadInst *LOAD(Value *Ptr, bool isVolatile, const Twine &Name = "");
++StoreInst *STORE(Value *Val, Value *Ptr, bool isVolatile = false);
++LoadInst *ALIGNED_LOAD(Value *Ptr, unsigned Align, const char *Name);
++LoadInst *ALIGNED_LOAD(Value *Ptr, unsigned Align, const Twine &Name = "");
++LoadInst *ALIGNED_LOAD(Value *Ptr, unsigned Align, bool isVolatile, const Twine &Name = "");
++StoreInst *ALIGNED_STORE(Value *Val, Value *Ptr, unsigned Align, bool isVolatile = false);
++FenceInst *FENCE(AtomicOrdering Ordering, SynchronizationScope SynchScope = CrossThread, const Twine &Name = "");
++AtomicCmpXchgInst *ATOMIC_CMP_XCHG(Value *Ptr, Value *Cmp, Value *New, AtomicOrdering SuccessOrdering, AtomicOrdering FailureOrdering, SynchronizationScope SynchScope = CrossThread);
++AtomicRMWInst *ATOMIC_RMW(AtomicRMWInst::BinOp Op, Value *Ptr, Value *Val, AtomicOrdering Ordering, SynchronizationScope SynchScope = CrossThread);
++Value *GEPA(Value *Ptr, ArrayRef<Value *> IdxList, const Twine &Name = "");
++Value *IN_BOUNDS_GEP(Value *Ptr, ArrayRef<Value *> IdxList, const Twine &Name = "");
++Value *GEP(Value *Ptr, Value *Idx, const Twine &Name = "");
++Value *IN_BOUNDS_GEP(Value *Ptr, Value *Idx, const Twine &Name = "");
++Value *CONST_GEP1_32(Value *Ptr, unsigned Idx0, const Twine &Name = "");
++Value *CONST_IN_BOUNDS_GEP1_32(Value *Ptr, unsigned Idx0, const Twine &Name = "");
++Value *CONST_GEP2_32(Value *Ptr, unsigned Idx0, unsigned Idx1, const Twine &Name = "");
++Value *CONST_IN_BOUNDS_GEP2_32(Value *Ptr, unsigned Idx0, unsigned Idx1, const Twine &Name = "");
++Value *CONST_GEP1_64(Value *Ptr, uint64_t Idx0, const Twine &Name = "");
++Value *CONST_IN_BOUNDS_GEP1_64(Value *Ptr, uint64_t Idx0, const Twine &Name = "");
++Value *CONST_GEP2_64(Value *Ptr, uint64_t Idx0, uint64_t Idx1, const Twine &Name = "");
++Value *CONST_IN_BOUNDS_GEP2_64(Value *Ptr, uint64_t Idx0, uint64_t Idx1, const Twine &Name = "");
++Value *STRUCT_GEP(Value *Ptr, unsigned Idx, const Twine &Name = "");
++Value *GLOBAL_STRING_PTR(StringRef Str, const Twine &Name = "");
++Value *TRUNC(Value *V, Type *DestTy, const Twine &Name = "");
++Value *Z_EXT(Value *V, Type *DestTy, const Twine &Name = "");
++Value *S_EXT(Value *V, Type *DestTy, const Twine &Name = "");
++Value *Z_EXT_OR_TRUNC(Value *V, Type *DestTy, const Twine &Name = "");
++Value *S_EXT_OR_TRUNC(Value *V, Type *DestTy, const Twine &Name = "");
++Value *FP_TO_UI(Value *V, Type *DestTy, const Twine &Name = "");
++Value *FP_TO_SI(Value *V, Type *DestTy, const Twine &Name = "");
++Value *UI_TO_FP(Value *V, Type *DestTy, const Twine &Name = "");
++Value *SI_TO_FP(Value *V, Type *DestTy, const Twine &Name = "");
++Value *FP_TRUNC(Value *V, Type *DestTy, const Twine &Name = "");
++Value *FP_EXT(Value *V, Type *DestTy, const Twine &Name = "");
++Value *PTR_TO_INT(Value *V, Type *DestTy, const Twine &Name = "");
++Value *INT_TO_PTR(Value *V, Type *DestTy, const Twine &Name = "");
++Value *BITCAST(Value *V, Type *DestTy, const Twine &Name = "");
++Value *ADDR_SPACE_CAST(Value *V, Type *DestTy, const Twine &Name = "");
++Value *Z_EXT_OR_BIT_CAST(Value *V, Type *DestTy, const Twine &Name = "");
++Value *S_EXT_OR_BIT_CAST(Value *V, Type *DestTy, const Twine &Name = "");
++Value *TRUNC_OR_BIT_CAST(Value *V, Type *DestTy, const Twine &Name = "");
++Value *CAST(Instruction::CastOps Op, Value *V, Type *DestTy, const Twine &Name = "");
++Value *POINTER_CAST(Value *V, Type *DestTy, const Twine &Name = "");
++Value *POINTER_BIT_CAST_OR_ADDR_SPACE_CAST(Value *V, Type *DestTy, const Twine &Name = "");
++Value *INT_CAST(Value *V, Type *DestTy, bool isSigned, const Twine &Name = "");
++Value *BIT_OR_POINTER_CAST(Value *V, Type *DestTy, const Twine &Name = "");
++Value *FP_CAST(Value *V, Type *DestTy, const Twine &Name = "");
++Value *ICMP_EQ(Value *LHS, Value *RHS, const Twine &Name = "");
++Value *ICMP_NE(Value *LHS, Value *RHS, const Twine &Name = "");
++Value *ICMP_UGT(Value *LHS, Value *RHS, const Twine &Name = "");
++Value *ICMP_UGE(Value *LHS, Value *RHS, const Twine &Name = "");
++Value *ICMP_ULT(Value *LHS, Value *RHS, const Twine &Name = "");
++Value *ICMP_ULE(Value *LHS, Value *RHS, const Twine &Name = "");
++Value *ICMP_SGT(Value *LHS, Value *RHS, const Twine &Name = "");
++Value *ICMP_SGE(Value *LHS, Value *RHS, const Twine &Name = "");
++Value *ICMP_SLT(Value *LHS, Value *RHS, const Twine &Name = "");
++Value *ICMP_SLE(Value *LHS, Value *RHS, const Twine &Name = "");
++Value *FCMP_OEQ(Value *LHS, Value *RHS, const Twine &Name = "");
++Value *FCMP_OGT(Value *LHS, Value *RHS, const Twine &Name = "");
++Value *FCMP_OGE(Value *LHS, Value *RHS, const Twine &Name = "");
++Value *FCMP_OLT(Value *LHS, Value *RHS, const Twine &Name = "");
++Value *FCMP_OLE(Value *LHS, Value *RHS, const Twine &Name = "");
++Value *FCMP_ONE(Value *LHS, Value *RHS, const Twine &Name = "");
++Value *FCMP_ORD(Value *LHS, Value *RHS, const Twine &Name = "");
++Value *FCMP_UNO(Value *LHS, Value *RHS, const Twine &Name = "");
++Value *FCMP_UEQ(Value *LHS, Value *RHS, const Twine &Name = "");
++Value *FCMP_UGT(Value *LHS, Value *RHS, const Twine &Name = "");
++Value *FCMP_UGE(Value *LHS, Value *RHS, const Twine &Name = "");
++Value *FCMP_ULT(Value *LHS, Value *RHS, const Twine &Name = "");
++Value *FCMP_ULE(Value *LHS, Value *RHS, const Twine &Name = "");
++Value *FCMP_UNE(Value *LHS, Value *RHS, const Twine &Name = "");
++Value *ICMP(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name = "");
++Value *FCMP(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name = "");
++PHINode *PHI(Type *Ty, unsigned NumReservedValues, const Twine &Name = "");
++CallInst *CALL(Value *Callee, const Twine &Name = "");
++CallInst *CALL(Value *Callee, Value *Arg, const Twine &Name = "");
++CallInst *CALL2(Value *Callee, Value *Arg1, Value *Arg2, const Twine &Name = "");
++CallInst *CALL3(Value *Callee, Value *Arg1, Value *Arg2, Value *Arg3, const Twine &Name = "");
++CallInst *CALL4(Value *Callee, Value *Arg1, Value *Arg2, Value *Arg3, Value *Arg4, const Twine &Name = "");
++CallInst *CALL5(Value *Callee, Value *Arg1, Value *Arg2, Value *Arg3, Value *Arg4, Value *Arg5, const Twine &Name = "");
++CallInst *CALLA(Value *Callee, ArrayRef<Value *> Args, const Twine &Name = "");
++Value *SELECT(Value *C, Value *True, Value *False, const Twine &Name = "");
++VAArgInst *VA_ARG(Value *List, Type *Ty, const Twine &Name = "");
++Value *VEXTRACT(Value *Vec, Value *Idx, const Twine &Name = "");
++Value *VINSERT(Value *Vec, Value *NewElt, Value *Idx, const Twine &Name = "");
++Value *VSHUFFLE(Value *V1, Value *V2, Value *Mask, const Twine &Name = "");
++Value *EXTRACT_VALUE(Value *Agg, ArrayRef<unsigned> Idxs, const Twine &Name = "");
++Value *INSERT_VALUE(Value *Agg, Value *Val, ArrayRef<unsigned> Idxs, const Twine &Name = "");
++LandingPadInst *LANDING_PAD(Type *Ty, Value *PersFn, unsigned NumClauses, const Twine &Name = "");
++Value *IS_NULL(Value *Arg, const Twine &Name = "");
++Value *IS_NOT_NULL(Value *Arg, const Twine &Name = "");
++Value *PTR_DIFF(Value *LHS, Value *RHS, const Twine &Name = "");
++Value *VECTOR_SPLAT(unsigned NumElts, Value *V, const Twine &Name = "");
++Value *EXTRACT_INTEGER(const DataLayout &DL, Value *From, IntegerType *ExtractedTy, uint64_t Offset, const Twine &Name);
++CallInst *ALIGNMENT_ASSUMPTION(const DataLayout &DL, Value *PtrValue, unsigned Alignment, Value *OffsetValue = nullptr);
+diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_math.h b/src/gallium/drivers/swr/rasterizer/jitter/builder_math.h
+new file mode 100644
+index 0000000..92867ec
+--- /dev/null
++++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_math.h
+@@ -0,0 +1,34 @@
++/****************************************************************************
++* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
++*
++* Permission is hereby granted, free of charge, to any person obtaining a
++* copy of this software and associated documentation files (the "Software"),
++* to deal in the Software without restriction, including without limitation
++* the rights to use, copy, modify, merge, publish, distribute, sublicense,
++* and/or sell copies of the Software, and to permit persons to whom the
++* Software is furnished to do so, subject to the following conditions:
++*
++* The above copyright notice and this permission notice (including the next
++* paragraph) shall be included in all copies or substantial portions of the
++* Software.
++*
++* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
++* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++* IN THE SOFTWARE.
++*
++* @file builder_math.h
++*
++* @brief math/alu builder functions
++*
++* Notes:
++*
++******************************************************************************/
++#pragma once
++
++Value* VLOG2PS(Value* src);
++Value* VPOW24PS(Value* src);
++Value* VEXP2PS(Value* src);
+diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
+new file mode 100644
+index 0000000..5897121
+--- /dev/null
++++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
+@@ -0,0 +1,1195 @@
++/****************************************************************************
++* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
++*
++* Permission is hereby granted, free of charge, to any person obtaining a
++* copy of this software and associated documentation files (the "Software"),
++* to deal in the Software without restriction, including without limitation
++* the rights to use, copy, modify, merge, publish, distribute, sublicense,
++* and/or sell copies of the Software, and to permit persons to whom the
++* Software is furnished to do so, subject to the following conditions:
++*
++* The above copyright notice and this permission notice (including the next
++* paragraph) shall be included in all copies or substantial portions of the
++* Software.
++*
++* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
++* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++* IN THE SOFTWARE.
++*
++* @file builder_misc.cpp
++*
++* @brief Implementation for miscellaneous builder functions
++*
++* Notes:
++*
++******************************************************************************/
++#include "builder.h"
++#include "llvm/Support/DynamicLibrary.h"
++
++void __cdecl CallPrint(const char* fmt, ...);
++
++Constant *Builder::C(bool i)
++{
++ return ConstantInt::get(IRB()->getInt1Ty(), (i ? 1 : 0));
++}
++
++Constant *Builder::C(char i)
++{
++ return ConstantInt::get(IRB()->getInt8Ty(), i);
++}
++
++Constant *Builder::C(uint8_t i)
++{
++ return ConstantInt::get(IRB()->getInt8Ty(), i);
++}
++
++Constant *Builder::C(int i)
++{
++ return ConstantInt::get(IRB()->getInt32Ty(), i);
++}
++
++Constant *Builder::C(int64_t i)
++{
++ return ConstantInt::get(IRB()->getInt64Ty(), i);
++}
++
++Constant *Builder::C(UINT16 i)
++{
++ return ConstantInt::get(mInt16Ty,i);
++}
++
++Constant *Builder::C(uint32_t i)
++{
++ return ConstantInt::get(IRB()->getInt32Ty(), i);
++}
++
++Constant *Builder::C(float i)
++{
++ return ConstantFP::get(IRB()->getFloatTy(), i);
++}
++
++Constant *Builder::PRED(bool pred)
++{
++ return ConstantInt::get(IRB()->getInt1Ty(), (pred ? 1 : 0));
++}
++
++Value *Builder::VIMMED1(int i)
++{
++ return ConstantVector::getSplat(JM()->mVWidth, cast<ConstantInt>(C(i)));
++}
++
++Value *Builder::VIMMED1(uint32_t i)
++{
++ return ConstantVector::getSplat(JM()->mVWidth, cast<ConstantInt>(C(i)));
++}
++
++Value *Builder::VIMMED1(float i)
++{
++ return ConstantVector::getSplat(JM()->mVWidth, cast<ConstantFP>(C(i)));
++}
++
++Value *Builder::VIMMED1(bool i)
++{
++ return ConstantVector::getSplat(JM()->mVWidth, cast<ConstantInt>(C(i)));
++}
++
++Value *Builder::VUNDEF_IPTR()
++{
++ return UndefValue::get(VectorType::get(PointerType::get(mInt32Ty, 0),JM()->mVWidth));
++}
++
++Value *Builder::VUNDEF_I()
++{
++ return UndefValue::get(VectorType::get(mInt32Ty, JM()->mVWidth));
++}
++
++Value *Builder::VUNDEF(Type *ty, uint32_t size)
++{
++ return UndefValue::get(VectorType::get(ty, size));
++}
++
++Value *Builder::VUNDEF_F()
++{
++ return UndefValue::get(VectorType::get(mFP32Ty, JM()->mVWidth));
++}
++
++Value *Builder::VUNDEF(Type* t)
++{
++ return UndefValue::get(VectorType::get(t, JM()->mVWidth));
++}
++
++Value *Builder::VINSERT(Value *vec, Value *val, int index)
++{
++ return VINSERT(vec, val, C(index));
++}
++
++Value *Builder::VBROADCAST(Value *src)
++{
++ // check if src is already a vector
++ if (src->getType()->isVectorTy())
++ {
++ return src;
++ }
++
++ Value *vecRet = VUNDEF(src->getType());
++ vecRet = VINSERT(vecRet, src, 0);
++ vecRet = VSHUFFLE(vecRet, vecRet, VIMMED1(0));
++
++ return vecRet;
++}
++
++uint32_t Builder::IMMED(Value* v)
++{
++ SWR_ASSERT(isa<ConstantInt>(v));
++ ConstantInt *pValConst = cast<ConstantInt>(v);
++ return pValConst->getZExtValue();
++}
++
++Value *Builder::GEP(Value* ptr, const std::initializer_list<Value*> &indexList)
++{
++ std::vector<Value*> indices;
++ for (auto i : indexList)
++ indices.push_back(i);
++ return GEPA(ptr, indices);
++}
++
++Value *Builder::GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList)
++{
++ std::vector<Value*> indices;
++ for (auto i : indexList)
++ indices.push_back(C(i));
++ return GEPA(ptr, indices);
++}
++
++LoadInst *Builder::LOAD(Value *basePtr, const std::initializer_list<uint32_t> &indices, const llvm::Twine& name)
++{
++ std::vector<Value*> valIndices;
++ for (auto i : indices)
++ valIndices.push_back(C(i));
++ return LOAD(GEPA(basePtr, valIndices), name);
++}
++
++LoadInst *Builder::LOADV(Value *basePtr, const std::initializer_list<Value*> &indices, const llvm::Twine& name)
++{
++ std::vector<Value*> valIndices;
++ for (auto i : indices)
++ valIndices.push_back(i);
++ return LOAD(GEPA(basePtr, valIndices), name);
++}
++
++StoreInst *Builder::STORE(Value *val, Value *basePtr, const std::initializer_list<uint32_t> &indices)
++{
++ std::vector<Value*> valIndices;
++ for (auto i : indices)
++ valIndices.push_back(C(i));
++ return STORE(val, GEPA(basePtr, valIndices));
++}
++
++StoreInst *Builder::STOREV(Value *val, Value *basePtr, const std::initializer_list<Value*> &indices)
++{
++ std::vector<Value*> valIndices;
++ for (auto i : indices)
++ valIndices.push_back(i);
++ return STORE(val, GEPA(basePtr, valIndices));
++}
++
++CallInst *Builder::CALL(Value *Callee, const std::initializer_list<Value*> &argsList)
++{
++ std::vector<Value*> args;
++ for (auto arg : argsList)
++ args.push_back(arg);
++ return CALLA(Callee, args);
++}
++
++Value *Builder::VRCP(Value *va)
++{
++ return FDIV(VIMMED1(1.0f), va); // 1 / a
++}
++
++Value *Builder::VPLANEPS(Value* vA, Value* vB, Value* vC, Value* &vX, Value* &vY)
++{
++ Value* vOut = FMADDPS(vA, vX, vC);
++ vOut = FMADDPS(vB, vY, vOut);
++ return vOut;
++}
++
++//////////////////////////////////////////////////////////////////////////
++/// @brief Generate an i32 masked load operation in LLVM IR. If not
++/// supported on the underlying platform, emulate it with float masked load
++/// @param src - base address pointer for the load
++/// @param vMask - SIMD wide mask that controls whether to access memory or load 0
++Value *Builder::MASKLOADD(Value* src,Value* mask)
++{
++ Value* vResult;
++ // use avx2 maskload instruction if available
++ if(JM()->mArch.AVX2())
++ {
++ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_maskload_d_256);
++ vResult = CALL2(func,src,mask);
++ }
++ else
++ {
++ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule,Intrinsic::x86_avx_maskload_ps_256);
++ Value* fMask = BITCAST(mask,VectorType::get(mFP32Ty,JM()->mVWidth));
++ vResult = BITCAST(CALL2(func,src,fMask), VectorType::get(mInt32Ty,JM()->mVWidth));
++ }
++ return vResult;
++}
++
++//////////////////////////////////////////////////////////////////////////
++/// @brief insert a JIT call to CallPrint
++/// - outputs formatted string to both stdout and VS output window
++/// - DEBUG builds only
++/// Usage example:
++/// PRINT("index %d = 0x%p\n",{C(lane), pIndex});
++/// where C(lane) creates a constant value to print, and pIndex is the Value*
++/// result from a GEP, printing out the pointer to memory
++/// @param printStr - constant string to print, which includes format specifiers
++/// @param printArgs - initializer list of Value*'s to print to std out
++CallInst *Builder::PRINT(const std::string &printStr,const std::initializer_list<Value*> &printArgs)
++{
++#if defined( DEBUG ) || defined( _DEBUG )
++ // push the arguments to CallPrint into a vector
++ std::vector<Value*> printCallArgs;
++ // save room for the format string. we still need to modify it for vectors
++ printCallArgs.resize(1);
++
++ // search through the format string for special processing
++ size_t pos = 0;
++ std::string tempStr(printStr);
++ pos = tempStr.find('%', pos);
++ auto v = printArgs.begin();
++ // printing is slow. now it's slower...
++ while((pos != std::string::npos) && (v != printArgs.end()))
++ {
++ // for %f we need to cast float Values to doubles so that they print out correctly
++ if((tempStr[pos+1]=='f') && ((*v)->getType()->isFloatTy()))
++ {
++ printCallArgs.push_back(FP_EXT(*v, Type::getDoubleTy(JM()->mContext)));
++ pos++;
++ }
++ // add special handling for %f and %d format specifiers to make printing llvm vector types easier
++ else if((*v)->getType()->isVectorTy())
++ {
++ if((tempStr[pos+1]=='f') && ((*v)->getType()->getContainedType(0)->isFloatTy()))
++ {
++ uint32_t i = 0;
++ for( ; i < ((*v)->getType()->getVectorNumElements())-1; i++)
++ {
++ tempStr.insert(pos, std::string("%f "));
++ pos+=3;
++ printCallArgs.push_back(FP_EXT(VEXTRACT(*v, C(i)), Type::getDoubleTy(JM()->mContext)));
++ }
++ printCallArgs.push_back(FP_EXT(VEXTRACT(*v,C(i)),Type::getDoubleTy(JM()->mContext)));
++ }
++ else if((tempStr[pos+1]=='d') && ((*v)->getType()->getContainedType(0)->isIntegerTy()))
++ {
++ uint32_t i = 0;
++ for( ; i < ((*v)->getType()->getVectorNumElements())-1; i++)
++ {
++ tempStr.insert(pos,std::string("%d "));
++ pos += 3;
++ printCallArgs.push_back(VEXTRACT(*v,C(i)));
++ }
++ printCallArgs.push_back(VEXTRACT(*v,C(i)));
++ }
++ else
++ {
++ /// not a supported vector to print
++ /// @todo pointer types too
++ SWR_ASSERT(0);
++ }
++ }
++ else
++ {
++ printCallArgs.push_back(*v);
++ }
++
++ // advance to the next argument
++ v++;
++ pos = tempStr.find('%', ++pos);
++ }
++
++ // create global variable constant string
++ Constant *constString = ConstantDataArray::getString(JM()->mContext,tempStr,true);
++ GlobalVariable *gvPtr = new GlobalVariable(constString->getType(),true,GlobalValue::InternalLinkage,constString,"printStr");
++ JM()->mpCurrentModule->getGlobalList().push_back(gvPtr);
++
++ // get a pointer to the first character in the constant string array
++ std::vector<Constant*> geplist{C(0),C(0)};
++ Constant *strGEP = ConstantExpr::getGetElementPtr(gvPtr,geplist,false);
++
++ // insert the pointer to the format string in the argument vector
++ printCallArgs[0] = strGEP;
++
++ // get pointer to CallPrint function and insert decl into the module if needed
++ std::vector<Type*> args;
++ args.push_back(PointerType::get(mInt8Ty,0));
++ FunctionType* callPrintTy = FunctionType::get(Type::getVoidTy(JM()->mContext),args,true);
++ Function *callPrintFn = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("CallPrint", callPrintTy));
++
++ // if we haven't yet added the symbol to the symbol table
++ if((sys::DynamicLibrary::SearchForAddressOfSymbol("CallPrint")) == nullptr)
++ {
++ sys::DynamicLibrary::AddSymbol("CallPrint", (void *)&CallPrint);
++ }
++
++ // insert a call to CallPrint
++ return CALLA(callPrintFn,printCallArgs);
++#else // #if defined( DEBUG ) || defined( _DEBUG )
++ return nullptr;
++#endif
++}
++
++//////////////////////////////////////////////////////////////////////////
++/// @brief Generate a masked gather operation in LLVM IR. If not
++/// supported on the underlying platform, emulate it with loads
++/// @param vSrc - SIMD wide value that will be loaded if mask is invalid
++/// @param pBase - Int8* base VB address pointer value
++/// @param vIndices - SIMD wide value of VB byte offsets
++/// @param vMask - SIMD wide mask that controls whether to access memory or the src values
++/// @param scale - value to scale indices by
++Value *Builder::GATHERPS(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, Value* scale)
++{
++ Value* vGather;
++
++ // use avx2 gather instruction if available
++ if(JM()->mArch.AVX2())
++ {
++ // force mask to <N x float>, required by vgather
++ vMask = BITCAST(vMask, mSimdFP32Ty);
++ vGather = VGATHERPS(vSrc,pBase,vIndices,vMask,scale);
++ }
++ else
++ {
++ Value* pStack = STACKSAVE();
++
++ // store vSrc on the stack. this way we can select between a valid load address and the vSrc address
++ Value* vSrcPtr = ALLOCA(vSrc->getType());
++ STORE(vSrc, vSrcPtr);
++
++ vGather = VUNDEF_F();
++ Value *vScaleVec = VBROADCAST(Z_EXT(scale,mInt32Ty));
++ Value *vOffsets = MUL(vIndices,vScaleVec);
++ Value *mask = MASK(vMask);
++ for(uint32_t i = 0; i < JM()->mVWidth; ++i)
++ {
++ // single component byte index
++ Value *offset = VEXTRACT(vOffsets,C(i));
++ // byte pointer to component
++ Value *loadAddress = GEP(pBase,offset);
++ loadAddress = BITCAST(loadAddress,PointerType::get(mFP32Ty,0));
++ // pointer to the value to load if we're masking off a component
++ Value *maskLoadAddress = GEP(vSrcPtr,{C(0), C(i)});
++ Value *selMask = VEXTRACT(mask,C(i));
++ // switch in a safe address to load if we're trying to access a vertex
++ Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
++ Value *val = LOAD(validAddress);
++ vGather = VINSERT(vGather,val,C(i));
++ }
++ STACKRESTORE(pStack);
++ }
++
++ return vGather;
++}
++
++//////////////////////////////////////////////////////////////////////////
++/// @brief Generate a masked gather operation in LLVM IR. If not
++/// supported on the underlying platform, emulate it with loads
++/// @param vSrc - SIMD wide value that will be loaded if mask is invalid
++/// @param pBase - Int8* base VB address pointer value
++/// @param vIndices - SIMD wide value of VB byte offsets
++/// @param vMask - SIMD wide mask that controls whether to access memory or the src values
++/// @param scale - value to scale indices by
++Value *Builder::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, Value* scale)
++{
++ Value* vGather;
++
++ // use avx2 gather instruction if available
++ if(JM()->mArch.AVX2())
++ {
++ vGather = VGATHERDD(vSrc, pBase, vIndices, vMask, scale);
++ }
++ else
++ {
++ Value* pStack = STACKSAVE();
++
++ // store vSrc on the stack. this way we can select between a valid load address and the vSrc address
++ Value* vSrcPtr = ALLOCA(vSrc->getType());
++ STORE(vSrc, vSrcPtr);
++
++ vGather = VUNDEF_I();
++ Value *vScaleVec = VBROADCAST(Z_EXT(scale, mInt32Ty));
++ Value *vOffsets = MUL(vIndices, vScaleVec);
++ Value *mask = MASK(vMask);
++ for(uint32_t i = 0; i < JM()->mVWidth; ++i)
++ {
++ // single component byte index
++ Value *offset = VEXTRACT(vOffsets, C(i));
++ // byte pointer to component
++ Value *loadAddress = GEP(pBase, offset);
++ loadAddress = BITCAST(loadAddress, PointerType::get(mInt32Ty, 0));
++ // pointer to the value to load if we're masking off a component
++ Value *maskLoadAddress = GEP(vSrcPtr, {C(0), C(i)});
++ Value *selMask = VEXTRACT(mask, C(i));
++ // switch in a safe address to load if we're trying to access a vertex
++ Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
++ Value *val = LOAD(validAddress, C(0));
++ vGather = VINSERT(vGather, val, C(i));
++ }
++
++ STACKRESTORE(pStack);
++ }
++ return vGather;
++}
++
++//////////////////////////////////////////////////////////////////////////
++/// @brief convert x86 mask to llvm mask
++Value* Builder::MASK(Value* vmask)
++{
++ Value* src = BITCAST(vmask, mSimdInt32Ty);
++ return ICMP_SLT(src, VIMMED1(0));
++}
++
++//////////////////////////////////////////////////////////////////////////
++/// @brief convert llvm mask to x86 mask
++Value* Builder::VMASK(Value* mask)
++{
++ return S_EXT(mask, mSimdInt32Ty);
++}
++
++//////////////////////////////////////////////////////////////////////////
++/// @brief Generate a VPSHUFB operation in LLVM IR. If not
++/// supported on the underlying platform, emulate it
++/// @param a - 256bit SIMD(32x8bit) of 8bit integer values
++/// @param b - 256bit SIMD(32x8bit) of 8bit integer mask values
++/// Byte masks in lower 128 lane of b selects 8 bit values from lower
++/// 128bits of a, and vice versa for the upper lanes. If the mask
++/// value is negative, '0' is inserted.
++Value *Builder::PSHUFB(Value* a, Value* b)
++{
++ Value* res;
++ // use avx2 pshufb instruction if available
++ if(JM()->mArch.AVX2())
++ {
++ res = VPSHUFB(a, b);
++ }
++ else
++ {
++ Constant* cB = dyn_cast<Constant>(b);
++ // number of 8 bit elements in b
++ uint32_t numElms = cast<VectorType>(cB->getType())->getNumElements();
++ // output vector
++ Value* vShuf = UndefValue::get(VectorType::get(mInt8Ty, numElms));
++
++ // insert an 8 bit value from the high and low lanes of a per loop iteration
++ numElms /= 2;
++ for(uint32_t i = 0; i < numElms; i++)
++ {
++ ConstantInt* cLow128b = cast<ConstantInt>(cB->getAggregateElement(i));
++ ConstantInt* cHigh128b = cast<ConstantInt>(cB->getAggregateElement(i + numElms));
++
++ // extract values from constant mask
++ char valLow128bLane = (char)(cLow128b->getSExtValue());
++ char valHigh128bLane = (char)(cHigh128b->getSExtValue());
++
++ Value* insertValLow128b;
++ Value* insertValHigh128b;
++
++ // if the mask value is negative, insert a '0' in the respective output position
++ // otherwise, lookup the value at mask position (bits 3..0 of the respective mask byte) in a and insert in output vector
++ insertValLow128b = (valLow128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valLow128bLane & 0xF)));
++ insertValHigh128b = (valHigh128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valHigh128bLane & 0xF) + numElms));
++
++ vShuf = VINSERT(vShuf, insertValLow128b, i);
++ vShuf = VINSERT(vShuf, insertValHigh128b, (i + numElms));
++ }
++ res = vShuf;
++ }
++ return res;
++}
++
++//////////////////////////////////////////////////////////////////////////
++/// @brief Generate a VPMOVSXBD operation (sign extend 8 8bit values to 32
++/// bits) in LLVM IR. If not supported on the underlying platform, emulate it
++/// @param a - 128bit SIMD lane(16x8bit) of 8bit integer values. Only
++/// lower 8 values are used.
++Value *Builder::PMOVSXBD(Value* a)
++{
++ Value* res;
++ // use avx2 byte sign extend instruction if available
++ if(JM()->mArch.AVX2())
++ {
++ res = VPMOVSXBD(a);
++ }
++ else
++ {
++ // VPMOVSXBD output type
++ Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
++ // Extract 8 values from 128bit lane and sign extend
++ res = S_EXT(VSHUFFLE(a, a, C({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
++ }
++ return res;
++}
++
++//////////////////////////////////////////////////////////////////////////
++/// @brief Generate a VPMOVSXWD operation (sign extend 8 16bit values to 32
++/// bits) in LLVM IR. If not supported on the underlying platform, emulate it
++/// @param a - 128bit SIMD lane(8x16bit) of 16bit integer values.
++Value *Builder::PMOVSXWD(Value* a)
++{
++ Value* res;
++ // use avx2 word sign extend if available
++ if(JM()->mArch.AVX2())
++ {
++ res = VPMOVSXWD(a);
++ }
++ else
++ {
++ // VPMOVSXWD output type
++ Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
++ // Extract 8 values from 128bit lane and sign extend
++ res = S_EXT(VSHUFFLE(a, a, C({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
++ }
++ return res;
++}
++
++//////////////////////////////////////////////////////////////////////////
++/// @brief Generate a VPERMD operation (shuffle 32 bit integer values
++/// across 128 bit lanes) in LLVM IR. If not supported on the underlying
++/// platform, emulate it
++/// @param a - 256bit SIMD lane(8x32bit) of integer values.
++/// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
++Value *Builder::PERMD(Value* a, Value* idx)
++{
++ Value* res;
++ // use avx2 permute instruction if available
++ if(JM()->mArch.AVX2())
++ {
++ // llvm 3.6.0 swapped the order of the args to vpermd
++ res = VPERMD(idx, a);
++ }
++ else
++ {
++ res = VSHUFFLE(a, a, idx);
++ }
++ return res;
++}
++
++//////////////////////////////////////////////////////////////////////////
++/// @brief Generate a VCVTPH2PS operation (float16->float32 conversion)
++/// in LLVM IR. If not supported on the underlying platform, emulate it
++/// @param a - 128bit SIMD lane(8x16bit) of float16 in int16 format.
++Value *Builder::CVTPH2PS(Value* a) ++{ ++ if (JM()->mArch.F16C()) ++ { ++ return VCVTPH2PS(a); ++ } ++ else ++ { ++ Value* vExt = S_EXT(a, mSimdInt32Ty); ++ Value* sign = AND(vExt,0x80000000); ++ ++ // normal case ++ Value* mantissa = SHL(AND(vExt, 0x03ff), 13); ++ Value* exponent = AND(vExt, 0x7c00); ++ exponent = ADD(exponent, VIMMED1(0x1c000)); ++ exponent = SHL(exponent, 13); ++ ++ Value* result = OR(OR(sign, mantissa), exponent); ++ ++ // handle 0 ++ Value* zeroMask = ICMP_EQ(AND(vExt, 0x7fff), VIMMED1(0)); ++ result = SELECT(zeroMask, sign, result); ++ ++ // handle infinity ++ Value* infMask = ICMP_EQ(AND(vExt, 0x7c00), VIMMED1(0x7c00)); ++ Value* signedInf = OR(VIMMED1(0x7f800000), sign); ++ result = SELECT(infMask, signedInf, result); ++ ++ // @todo handle subnormal ++ ++ // cast to f32 ++ result = BITCAST(result, mSimdFP32Ty); ++ return result; ++ } ++} ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief Generate a VCVTPS2PH operation (float32->float16 conversion) ++/// in LLVM IR. If not supported on the underlying platform, emulate it ++/// @param a - 128bit SIMD lane(8x16bit) of float16 in int16 format. ++Value *Builder::CVTPS2PH(Value* a, Value* rounding) ++{ ++ if (JM()->mArch.F16C()) ++ { ++ return VCVTPS2PH(a, rounding); ++ } ++ else ++ { ++ SWR_ASSERT(false, "Emulation of VCVTPH2PS unimplemented."); ++ return nullptr; ++ } ++} ++ ++Value *Builder::PMAXSD(Value* a, Value* b) ++{ ++ if (JM()->mArch.AVX2()) ++ { ++ return VPMAXSD(a, b); ++ } ++ else ++ { ++ // use 4-wide sse max intrinsic on lower/upper halves of 8-wide sources ++ Function* pmaxsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_sse41_pmaxsd); ++ ++ // low 128 ++ Value* aLo = VEXTRACTI128(a, C((uint8_t)0)); ++ Value* bLo = VEXTRACTI128(b, C((uint8_t)0)); ++ Value* resLo = CALL2(pmaxsd, aLo, bLo); ++ ++ // high 128 ++ Value* aHi = VEXTRACTI128(a, C((uint8_t)1)); ++ Value* bHi = VEXTRACTI128(b, C((uint8_t)1)); ++ Value* resHi = CALL2(pmaxsd, aHi, bHi); ++ ++ // combine ++ Value* result = VINSERTI128(VUNDEF_I(), resLo, C((uint8_t)0)); ++ result = VINSERTI128(result, resHi, C((uint8_t)1)); ++ ++ return result; ++ } ++} ++ ++Value *Builder::PMINSD(Value* a, Value* b) ++{ ++ if (JM()->mArch.AVX2()) ++ { ++ return VPMINSD(a, b); ++ } ++ else ++ { ++ // use 4-wide sse max intrinsic on lower/upper halves of 8-wide sources ++ Function* pminsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_sse41_pminsd); ++ ++ // low 128 ++ Value* aLo = VEXTRACTI128(a, C((uint8_t)0)); ++ Value* bLo = VEXTRACTI128(b, C((uint8_t)0)); ++ Value* resLo = CALL2(pminsd, aLo, bLo); ++ ++ // high 128 ++ Value* aHi = VEXTRACTI128(a, C((uint8_t)1)); ++ Value* bHi = VEXTRACTI128(b, C((uint8_t)1)); ++ Value* resHi = CALL2(pminsd, aHi, bHi); ++ ++ // combine ++ Value* result = VINSERTI128(VUNDEF_I(), resLo, C((uint8_t)0)); ++ result = VINSERTI128(result, resHi, C((uint8_t)1)); ++ ++ return result; ++ } ++} ++ ++void Builder::Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets, ++ Value* mask, Value* vGatherComponents[], bool bPackedOutput) ++{ ++ const SWR_FORMAT_INFO &info = GetFormatInfo(format); ++ if(info.type[0] == SWR_TYPE_FLOAT && info.bpc[0] == 32) ++ { ++ // ensure our mask is the correct type ++ mask = BITCAST(mask, mSimdFP32Ty); ++ GATHER4PS(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput); ++ } ++ else ++ { ++ // ensure our mask is the correct type ++ mask = BITCAST(mask, mSimdInt32Ty); ++ GATHER4DD(info, pSrcBase, 
byteOffsets, mask, vGatherComponents, bPackedOutput); ++ } ++} ++ ++void Builder::GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets, ++ Value* mask, Value* vGatherComponents[], bool bPackedOutput) ++{ ++ switch(info.bpp / info.numComps) ++ { ++ case 16: ++ { ++ Value* vGatherResult[2]; ++ Value *vMask; ++ ++ // TODO: vGatherMaskedVal ++ Value* vGatherMaskedVal = VIMMED1((float)0); ++ ++ // always have at least one component out of x or y to fetch ++ ++ // save mask as it is zero'd out after each gather ++ vMask = mask; ++ ++ vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1)); ++ // e.g. result of first 8x32bit integer gather for 16bit components ++ // 256i - 0 1 2 3 4 5 6 7 ++ // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy ++ // ++ ++ // if we have at least one component out of x or y to fetch ++ if(info.numComps > 2) ++ { ++ // offset base to the next components(zw) in the vertex to gather ++ pSrcBase = GEP(pSrcBase, C((char)4)); ++ vMask = mask; ++ ++ vGatherResult[1] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1)); ++ // e.g. result of second 8x32bit integer gather for 16bit components ++ // 256i - 0 1 2 3 4 5 6 7 ++ // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw ++ // ++ } ++ else ++ { ++ vGatherResult[1] = vGatherMaskedVal; ++ } ++ ++ // Shuffle gathered components into place, each row is a component ++ Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput); ++ } ++ break; ++ case 32: ++ { ++ // apply defaults ++ for (uint32_t i = 0; i < 4; ++i) ++ { ++ vGatherComponents[i] = VIMMED1(*(float*)&info.defaults[i]); ++ } ++ ++ for(uint32_t i = 0; i < info.numComps; i++) ++ { ++ uint32_t swizzleIndex = info.swizzle[i]; ++ ++ // save mask as it is zero'd out after each gather ++ Value *vMask = mask; ++ ++ // Gather a SIMD of components ++ vGatherComponents[swizzleIndex] = GATHERPS(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, C((char)1)); ++ ++ // offset base to the next component to gather ++ pSrcBase = GEP(pSrcBase, C((char)4)); ++ } ++ } ++ break; ++ default: ++ SWR_ASSERT(0, "Invalid float format"); ++ break; ++ } ++} ++ ++void Builder::GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets, ++ Value* mask, Value* vGatherComponents[], bool bPackedOutput) ++{ ++ switch (info.bpp / info.numComps) ++ { ++ case 8: ++ { ++ Value* vGatherMaskedVal = VIMMED1((int32_t)0); ++ Value* vGatherResult = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, mask, C((char)1)); ++ // e.g. result of an 8x32bit integer gather for 8bit components ++ // 256i - 0 1 2 3 4 5 6 7 ++ // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw ++ ++ Shuffle8bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput); ++ } ++ break; ++ case 16: ++ { ++ Value* vGatherResult[2]; ++ Value *vMask; ++ ++ // TODO: vGatherMaskedVal ++ Value* vGatherMaskedVal = VIMMED1((int32_t)0); ++ ++ // always have at least one component out of x or y to fetch ++ ++ // save mask as it is zero'd out after each gather ++ vMask = mask; ++ ++ vGatherResult[0] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1)); ++ // e.g. 
result of first 8x32bit integer gather for 16bit components ++ // 256i - 0 1 2 3 4 5 6 7 ++ // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy ++ // ++ ++ // if we have at least one component out of z or w to fetch ++ if(info.numComps > 2) ++ { ++ // offset base to the next components(zw) in the vertex to gather ++ pSrcBase = GEP(pSrcBase, C((char)4)); ++ vMask = mask; ++ ++ vGatherResult[1] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1)); ++ // e.g. result of second 8x32bit integer gather for 16bit components ++ // 256i - 0 1 2 3 4 5 6 7 ++ // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw ++ // ++ } ++ else ++ { ++ vGatherResult[1] = vGatherMaskedVal; ++ } ++ ++ // Shuffle gathered components into place, each row is a component ++ Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput); ++ ++ } ++ break; ++ case 32: ++ { ++ // apply defaults ++ for (uint32_t i = 0; i < 4; ++i) ++ { ++ vGatherComponents[i] = VIMMED1((int)info.defaults[i]); ++ } ++ ++ for(uint32_t i = 0; i < info.numComps; i++) ++ { ++ uint32_t swizzleIndex = info.swizzle[i]; ++ ++ // save mask as it is zero'd out after each gather ++ Value *vMask = mask; ++ ++ // Gather a SIMD of components ++ vGatherComponents[swizzleIndex] = GATHERDD(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, C((char)1)); ++ ++ // offset base to the next component to gather ++ pSrcBase = GEP(pSrcBase, C((char)4)); ++ } ++ } ++ break; ++ default: ++ SWR_ASSERT(0, "unsupported format"); ++ break; ++ } ++} ++ ++void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput[2], Value* vGatherOutput[4], bool bPackedOutput) ++{ ++ // cast types ++ Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), JM()->mVWidth); ++ Type* v32x8Ty = VectorType::get(mInt8Ty, JM()->mVWidth * 4); // vwidth is units of 32 bits ++ ++ // input could either be float or int vector; do shuffle work in int ++ vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty); ++ vGatherInput[1] = BITCAST(vGatherInput[1], mSimdInt32Ty); ++ ++ if(bPackedOutput) ++ { ++ Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), JM()->mVWidth / 4); // vwidth is units of 32 bits ++ ++ // shuffle mask ++ Value* vConstMask = C({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, ++ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15}); ++ Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[0], v32x8Ty), vConstMask), vGatherTy); ++ // after pshufb: group components together in each 128bit lane ++ // 256i - 0 1 2 3 4 5 6 7 ++ // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy ++ ++ Value* vi128XY = BITCAST(PERMD(vShufResult, C({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy); ++ // after PERMD: move and pack xy components into each 128bit lane ++ // 256i - 0 1 2 3 4 5 6 7 ++ // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy ++ ++ // do the same for zw components ++ Value* vi128ZW = nullptr; ++ if(info.numComps > 2) ++ { ++ Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[1], v32x8Ty), vConstMask), vGatherTy); ++ vi128ZW = BITCAST(PERMD(vShufResult, C({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy); ++ } ++ ++ for(uint32_t i = 0; i < 4; i++) ++ { ++ uint32_t swizzleIndex = info.swizzle[i]; ++ // todo: fix for packed ++ Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i])); ++ if(i >= info.numComps) ++ { ++ // set the default component val ++ vGatherOutput[swizzleIndex] = vGatherMaskedVal; ++ continue; ++ } ++ ++ // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1 ++ uint32_t
lane = ((i == 0) || (i == 2)) ? 0 : 1; ++ // if x or y, use vi128XY permute result, else use vi128ZW ++ Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW; ++ ++ // extract packed component 128 bit lanes ++ vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane)); ++ } ++ ++ } ++ else ++ { ++ // pshufb masks for each component ++ Value* vConstMask[2]; ++ // x/z shuffle mask ++ vConstMask[0] = C({0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, ++ 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, }); ++ ++ // y/w shuffle mask ++ vConstMask[1] = C({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1, ++ 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1}); ++ ++ ++ // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits ++ // apply defaults ++ for (uint32_t i = 0; i < 4; ++i) ++ { ++ vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]); ++ } ++ ++ for(uint32_t i = 0; i < info.numComps; i++) ++ { ++ uint32_t swizzleIndex = info.swizzle[i]; ++ ++ // select correct constMask for x/z or y/w pshufb ++ uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1; ++ // if x or y, use the first gather result, else use the second (zw) gather result ++ uint32_t selectedGather = (i < 2) ? 0 : 1; ++ ++ vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy); ++ // after pshufb mask for x channel; z uses the same shuffle from the second gather ++ // 256i - 0 1 2 3 4 5 6 7 ++ // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00 ++ } ++ } ++} ++ ++void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput) ++{ ++ // cast types ++ Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), JM()->mVWidth); ++ Type* v32x8Ty = VectorType::get(mInt8Ty, JM()->mVWidth * 4 ); // vwidth is units of 32 bits ++ ++ if(bPackedOutput) ++ { ++ Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), JM()->mVWidth / 4); // vwidth is units of 32 bits ++ // shuffle mask ++ Value* vConstMask = C({0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, ++ 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}); ++ Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy); ++ // after pshufb: group components together in each 128bit lane ++ // 256i - 0 1 2 3 4 5 6 7 ++ // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww ++ ++ Value* vi128XY = BITCAST(PERMD(vShufResult, C({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty); ++ // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane ++ // 256i - 0 1 2 3 4 5 6 7 ++ // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care) ++ ++ // do the same for zw components ++ Value* vi128ZW = nullptr; ++ if(info.numComps > 2) ++ { ++ vi128ZW = BITCAST(PERMD(vShufResult, C({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty); ++ } ++ ++ // move each enabled component's packed 128bit lane into place; unused components get their default values ++ for(uint32_t i = 0; i < 4; i++) ++ { ++ uint32_t swizzleIndex = info.swizzle[i]; ++ // todo: fix for packed ++ Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i])); ++ if(i >= info.numComps) ++ { ++ // set the default component val ++ vGatherOutput[swizzleIndex] = vGatherMaskedVal; ++ continue; ++ } ++ ++ // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1 ++ uint32_t lane = ((i == 0) || (i == 2)) ?
0 : 1; ++ // if x or y, use vi128XY permute result, else use vi128ZW ++ Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW; ++ ++ // extract the selected 128bit lane ++ vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane)); ++ } ++ } ++ // else zero extend ++ else{ ++ // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits ++ // apply defaults ++ for (uint32_t i = 0; i < 4; ++i) ++ { ++ vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]); ++ } ++ ++ for(uint32_t i = 0; i < info.numComps; i++){ ++ uint32_t swizzleIndex = info.swizzle[i]; ++ ++ // pshufb masks for each component ++ Value* vConstMask; ++ switch(i) ++ { ++ case 0: ++ // x shuffle mask ++ vConstMask = C({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1, ++ 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1}); ++ break; ++ case 1: ++ // y shuffle mask ++ vConstMask = C({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1, ++ 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1}); ++ break; ++ case 2: ++ // z shuffle mask ++ vConstMask = C({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1, ++ 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1}); ++ break; ++ case 3: ++ // w shuffle mask ++ vConstMask = C({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1, ++ 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1}); ++ break; ++ default: ++ vConstMask = nullptr; ++ break; ++ } ++ ++ vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy); ++ // after pshufb for x channel ++ // 256i - 0 1 2 3 4 5 6 7 ++ // x000 x000 x000 x000 x000 x000 x000 x000 ++ } ++ } ++} ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief emulates a scatter operation.
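++/// Note: AVX2 provides gathers but no scatter instruction, so the operation
++/// is emulated with one scalar store per lane; masked-off lanes are
++/// redirected to a throwaway stack slot rather than branched around.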
++/// @param pDst - pointer to destination ++/// @param vSrc - vector of src data to scatter ++/// @param vOffsets - vector of byte offsets from pDst ++/// @param vMask - mask of valid lanes ++void Builder::SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask) ++{ ++ Value* pStack = STACKSAVE(); ++ ++ // allocate tmp stack for masked off lanes ++ Value* vTmpPtr = ALLOCA(vSrc->getType()->getVectorElementType()); ++ ++ Value *mask = MASK(vMask); ++ for (uint32_t i = 0; i < JM()->mVWidth; ++i) ++ { ++ Value *offset = VEXTRACT(vOffsets, C(i)); ++ // byte pointer to component ++ Value *storeAddress = GEP(pDst, offset); ++ storeAddress = BITCAST(storeAddress, PointerType::get(mFP32Ty, 0)); ++ Value *selMask = VEXTRACT(mask, C(i)); ++ Value *srcElem = VEXTRACT(vSrc, C(i)); ++ // if the lane is masked off, store to the temporary stack slot instead of the destination ++ Value *validAddress = SELECT(selMask, storeAddress, vTmpPtr); ++ STORE(srcElem, validAddress); ++ } ++ ++ STACKRESTORE(pStack); ++} ++ ++Value* Builder::VABSPS(Value* a) ++{ ++ Value* asInt = BITCAST(a, mSimdInt32Ty); ++ Value* result = BITCAST(AND(asInt, VIMMED1(0x7fffffff)), mSimdFP32Ty); ++ return result; ++} ++ ++Value *Builder::ICLAMP(Value* src, Value* low, Value* high) ++{ ++ Value *lowCmp = ICMP_SLT(src, low); ++ Value *ret = SELECT(lowCmp, low, src); ++ ++ Value *highCmp = ICMP_SGT(ret, high); ++ ret = SELECT(highCmp, high, ret); ++ ++ return ret; ++} ++ ++Value *Builder::FCLAMP(Value* src, Value* low, Value* high) ++{ ++ Value *lowCmp = FCMP_OLT(src, low); ++ Value *ret = SELECT(lowCmp, low, src); ++ ++ Value *highCmp = FCMP_OGT(ret, high); ++ ret = SELECT(highCmp, high, ret); ++ ++ return ret; ++} ++ ++Value *Builder::FCLAMP(Value* src, float low, float high) ++{ ++ Value* result = VMAXPS(src, VIMMED1(low)); ++ result = VMINPS(result, VIMMED1(high)); ++ ++ return result; ++} ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief save/restore stack, providing ability to push/pop the stack and ++/// reduce overall stack requirements for temporary stack use ++Value* Builder::STACKSAVE() ++{ ++ Function* pfnStackSave = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stacksave); ++ return CALL(pfnStackSave); ++} ++ ++void Builder::STACKRESTORE(Value* pSaved) ++{ ++ Function* pfnStackRestore = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stackrestore); ++ CALL(pfnStackRestore, pSaved); ++} ++ ++Value *Builder::FMADDPS(Value* a, Value* b, Value* c) ++{ ++ Value* vOut; ++ // use FMADs if available ++ if(JM()->mArch.AVX2()) ++ { ++ vOut = VFMADDPS(a, b, c); ++ } ++ else ++ { ++ vOut = FADD(FMUL(a, b), c); ++ } ++ return vOut; ++} ++ ++Value* Builder::POPCNT(Value* a) ++{ ++ Function* pCtPop = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::ctpop, { a->getType() }); ++ return CALL(pCtPop, a); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief C functions called by LLVM IR ++////////////////////////////////////////////////////////////////////////// ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief called in JIT code, inserted by PRINT ++/// output to both stdout and visual studio debug console ++void __cdecl CallPrint(const char* fmt, ...)
++{ ++#if defined( DEBUG ) || defined( _DEBUG ) ++ va_list args; ++ va_start(args, fmt); ++ vprintf(fmt, args); ++ va_end(args); ++ ++#if defined( _WIN32 ) ++ char strBuf[1024]; ++ va_start(args, fmt); ++ vsnprintf_s(strBuf, _TRUNCATE, fmt, args); ++ va_end(args); ++ OutputDebugString(strBuf); ++#endif ++#endif // #if defined( DEBUG ) || defined( _DEBUG ) ++} +diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h +new file mode 100644 +index 0000000..8a32c6a +--- /dev/null ++++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h +@@ -0,0 +1,141 @@ ++/**************************************************************************** ++* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. ++* ++* Permission is hereby granted, free of charge, to any person obtaining a ++* copy of this software and associated documentation files (the "Software"), ++* to deal in the Software without restriction, including without limitation ++* the rights to use, copy, modify, merge, publish, distribute, sublicense, ++* and/or sell copies of the Software, and to permit persons to whom the ++* Software is furnished to do so, subject to the following conditions: ++* ++* The above copyright notice and this permission notice (including the next ++* paragraph) shall be included in all copies or substantial portions of the ++* Software. ++* ++* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++* IN THE SOFTWARE.
++* ++* @file builder_misc.h ++* ++* @brief miscellaneous builder functions ++* ++* Notes: ++* ++******************************************************************************/ ++#pragma once ++ ++Constant *C(bool i); ++Constant *C(char i); ++Constant *C(uint8_t i); ++Constant *C(int i); ++Constant *C(int64_t i); ++Constant *C(UINT16 i); ++Constant *C(uint32_t i); ++Constant *C(float i); ++ ++template <typename Ty> ++Constant *C(const std::initializer_list<Ty> &constList) ++{ ++ std::vector<Constant*> vConsts; ++ for(auto i : constList) { ++ vConsts.push_back(C((Ty)i)); ++ } ++ return ConstantVector::get(vConsts); ++} ++ ++Constant *PRED(bool pred); ++Value *VIMMED1(int i); ++Value *VIMMED1(uint32_t i); ++Value *VIMMED1(float i); ++Value *VIMMED1(bool i); ++Value *VUNDEF(Type* t); ++Value *VUNDEF_F(); ++Value *VUNDEF_I(); ++Value *VUNDEF(Type* ty, uint32_t size); ++Value *VUNDEF_IPTR(); ++Value *VINSERT(Value *vec, Value *val, int index); ++Value *VBROADCAST(Value *src); ++Value *VRCP(Value *va); ++Value *VPLANEPS(Value* vA, Value* vB, Value* vC, Value* &vX, Value* &vY); ++ ++uint32_t IMMED(Value* i); ++ ++Value *GEP(Value* ptr, const std::initializer_list<Value*> &indexList); ++Value *GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList); ++CallInst *CALL(Value *Callee, const std::initializer_list<Value*> &args); ++ ++LoadInst *LOAD(Value *BasePtr, const std::initializer_list<uint32_t> &offset, const llvm::Twine& name = ""); ++LoadInst *LOADV(Value *BasePtr, const std::initializer_list<Value*> &offset, const llvm::Twine& name = ""); ++StoreInst *STORE(Value *Val, Value *BasePtr, const std::initializer_list<uint32_t> &offset); ++StoreInst *STOREV(Value *Val, Value *BasePtr, const std::initializer_list<Value*> &offset); ++ ++Value *VCMPPS_EQ(Value* a, Value* b) { return VCMPPS(a, b, C((uint8_t)_CMP_EQ_OQ)); } ++Value *VCMPPS_LT(Value* a, Value* b) { return VCMPPS(a, b, C((uint8_t)_CMP_LT_OQ)); } ++Value *VCMPPS_LE(Value* a, Value* b) { return VCMPPS(a, b, C((uint8_t)_CMP_LE_OQ)); } ++Value *VCMPPS_ISNAN(Value* a, Value* b) { return VCMPPS(a, b, C((uint8_t)_CMP_UNORD_Q)); } ++Value *VCMPPS_NEQ(Value* a, Value* b) { return VCMPPS(a, b, C((uint8_t)_CMP_NEQ_OQ)); } ++Value *VCMPPS_GE(Value* a, Value* b) { return VCMPPS(a, b, C((uint8_t)_CMP_GE_OQ)); } ++Value *VCMPPS_GT(Value* a, Value* b) { return VCMPPS(a, b, C((uint8_t)_CMP_GT_OQ)); } ++Value *VCMPPS_NOTNAN(Value* a, Value* b){ return VCMPPS(a, b, C((uint8_t)_CMP_ORD_Q)); } ++ ++Value *MASK(Value* vmask); ++Value *VMASK(Value* mask); ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief functions that build IR to call x86 intrinsics directly, or ++/// emulate them with other instructions if not available on the host ++////////////////////////////////////////////////////////////////////////// ++Value *MASKLOADD(Value* src, Value* mask); ++ ++void Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets, ++ Value* mask, Value* vGatherComponents[], bool bPackedOutput); ++ ++Value *GATHERPS(Value* src, Value* pBase, Value* indices, Value* mask, Value* scale); ++void GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets, ++ Value* mask, Value* vGatherComponents[], bool bPackedOutput); ++ ++Value *GATHERDD(Value* src, Value* pBase, Value* indices, Value* mask, Value* scale); ++void GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets, ++ Value* mask, Value* vGatherComponents[], bool bPackedOutput); ++ ++void SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask); ++ ++void Shuffle8bpcGather4(const SWR_FORMAT_INFO
&info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput); ++void Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput[], Value* vGatherOutput[], bool bPackedOutput); ++ ++Value *PSHUFB(Value* a, Value* b); ++Value *PMOVSXBD(Value* a); ++Value *PMOVSXWD(Value* a); ++Value *PERMD(Value* a, Value* idx); ++Value *CVTPH2PS(Value* a); ++Value *CVTPS2PH(Value* a, Value* rounding); ++Value *PMAXSD(Value* a, Value* b); ++Value *PMINSD(Value* a, Value* b); ++Value *VABSPS(Value* a); ++Value *FMADDPS(Value* a, Value* b, Value* c); ++ ++// LLVM removed VPCMPGTD x86 intrinsic. This emulates that behavior ++Value *VPCMPGTD(Value* a, Value* b) ++{ ++ Value* vIndexMask = ICMP_UGT(a,b); ++ ++ // need to set the high bit for x86 intrinsic masks ++ return S_EXT(vIndexMask,VectorType::get(mInt32Ty,JM()->mVWidth)); ++} ++ ++Value *ICLAMP(Value* src, Value* low, Value* high); ++Value *FCLAMP(Value* src, Value* low, Value* high); ++Value *FCLAMP(Value* src, float low, float high); ++ ++CallInst *PRINT(const std::string &printStr, const std::initializer_list<Value*> &printArgs); ++Value* STACKSAVE(); ++void STACKRESTORE(Value* pSaved); ++ ++Value* POPCNT(Value* a); ++ +diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_x86.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_x86.cpp +new file mode 100644 +index 0000000..b4ae075 +--- /dev/null ++++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_x86.cpp +@@ -0,0 +1,242 @@ ++/**************************************************************************** ++* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. ++* ++* Permission is hereby granted, free of charge, to any person obtaining a ++* copy of this software and associated documentation files (the "Software"), ++* to deal in the Software without restriction, including without limitation ++* the rights to use, copy, modify, merge, publish, distribute, sublicense, ++* and/or sell copies of the Software, and to permit persons to whom the ++* Software is furnished to do so, subject to the following conditions: ++* ++* The above copyright notice and this permission notice (including the next ++* paragraph) shall be included in all copies or substantial portions of the ++* Software. ++* ++* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++* IN THE SOFTWARE.
++* ++* @file builder_x86.cpp ++* ++* @brief auto-generated file ++* ++* DO NOT EDIT ++* ++******************************************************************************/ ++ ++#include "builder.h" ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::VGATHERPS(Value* src, Value* pBase, Value* indices, Value* mask, Value* scale) ++{ ++ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_gather_d_ps_256); ++ return IRB()->CreateCall5(func, src, pBase, indices, mask, scale); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::VGATHERDD(Value* src, Value* pBase, Value* indices, Value* mask, Value* scale) ++{ ++ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_gather_d_d_256); ++ return IRB()->CreateCall5(func, src, pBase, indices, mask, scale); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::VSQRTPS(Value* a) ++{ ++ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_sqrt_ps_256); ++ return IRB()->CreateCall(func, a); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::VRSQRTPS(Value* a) ++{ ++ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_rsqrt_ps_256); ++ return IRB()->CreateCall(func, a); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::VRCPPS(Value* a) ++{ ++ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_rcp_ps_256); ++ return IRB()->CreateCall(func, a); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::VMINPS(Value* a, Value* b) ++{ ++ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_min_ps_256); ++ return IRB()->CreateCall2(func, a, b); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::VMAXPS(Value* a, Value* b) ++{ ++ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_max_ps_256); ++ return IRB()->CreateCall2(func, a, b); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::VPMINSD(Value* a, Value* b) ++{ ++ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmins_d); ++ return IRB()->CreateCall2(func, a, b); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::VPMAXSD(Value* a, Value* b) ++{ ++ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmaxs_d); ++ return IRB()->CreateCall2(func, a, b); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::VROUND(Value* a, Value* rounding) ++{ ++ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_round_ps_256); ++ return IRB()->CreateCall2(func, a, rounding); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::VCMPPS(Value* a, Value* b, Value* cmpop) ++{ ++ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_cmp_ps_256); ++ return IRB()->CreateCall3(func, a, b, cmpop); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::VBLENDVPS(Value* a, Value* b, Value* mask) ++{ ++ 
Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_blendv_ps_256); ++ return IRB()->CreateCall3(func, a, b, mask); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::BEXTR_32(Value* src, Value* control) ++{ ++ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_bmi_bextr_32); ++ return IRB()->CreateCall2(func, src, control); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::VMASKLOADD(Value* src, Value* mask) ++{ ++ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_maskload_d_256); ++ return IRB()->CreateCall2(func, src, mask); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::VMASKMOVPS(Value* src, Value* mask) ++{ ++ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_maskload_ps_256); ++ return IRB()->CreateCall2(func, src, mask); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::VPSHUFB(Value* a, Value* b) ++{ ++ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pshuf_b); ++ return IRB()->CreateCall2(func, a, b); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::VPMOVSXBD(Value* a) ++{ ++ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmovsxbd); ++ return IRB()->CreateCall(func, a); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::VPMOVSXWD(Value* a) ++{ ++ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmovsxwd); ++ return IRB()->CreateCall(func, a); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::VPERMD(Value* idx, Value* a) ++{ ++ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_permd); ++ return IRB()->CreateCall2(func, idx, a); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::VCVTPH2PS(Value* a) ++{ ++ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_vcvtph2ps_256); ++ return IRB()->CreateCall(func, a); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::VCVTPS2PH(Value* a, Value* round) ++{ ++ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_vcvtps2ph_256); ++ return IRB()->CreateCall2(func, a, round); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::VEXTRACTF128(Value* a, Value* imm8) ++{ ++ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_vextractf128_ps_256); ++ return IRB()->CreateCall2(func, a, imm8); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::VEXTRACTI128(Value* a, Value* imm8) ++{ ++ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_vextractf128_si_256); ++ return IRB()->CreateCall2(func, a, imm8); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::VINSERTF128(Value* a, Value* b, Value* imm8) ++{ ++ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_vinsertf128_ps_256); ++ return IRB()->CreateCall3(func, a, b, 
imm8); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::VINSERTI128(Value* a, Value* b, Value* imm8) ++{ ++ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_vinsertf128_si_256); ++ return IRB()->CreateCall3(func, a, b, imm8); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::VHSUBPS(Value* a, Value* b) ++{ ++ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_hsub_ps_256); ++ return IRB()->CreateCall2(func, a, b); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::VPTESTC(Value* a, Value* b) ++{ ++ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_ptestc_256); ++ return IRB()->CreateCall2(func, a, b); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::VFMADDPS(Value* a, Value* b, Value* c) ++{ ++ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_fma_vfmadd_ps_256); ++ return IRB()->CreateCall3(func, a, b, c); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::VCVTTPS2DQ(Value* a) ++{ ++ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_cvtt_ps2dq_256); ++ return IRB()->CreateCall(func, a); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++Value *Builder::VMOVMSKPS(Value* a) ++{ ++ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_movmsk_ps_256); ++ return IRB()->CreateCall(func, a); ++} ++ +diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_x86.h b/src/gallium/drivers/swr/rasterizer/jitter/builder_x86.h +new file mode 100644 +index 0000000..bdaabca +--- /dev/null ++++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_x86.h +@@ -0,0 +1,65 @@ ++/**************************************************************************** ++* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. ++* ++* Permission is hereby granted, free of charge, to any person obtaining a ++* copy of this software and associated documentation files (the "Software"), ++* to deal in the Software without restriction, including without limitation ++* the rights to use, copy, modify, merge, publish, distribute, sublicense, ++* and/or sell copies of the Software, and to permit persons to whom the ++* Software is furnished to do so, subject to the following conditions: ++* ++* The above copyright notice and this permission notice (including the next ++* paragraph) shall be included in all copies or substantial portions of the ++* Software. ++* ++* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++* IN THE SOFTWARE. 
++* ++* @file builder_x86.h ++* ++* @brief auto-generated file ++* ++* DO NOT EDIT ++* ++******************************************************************************/ ++ ++#pragma once ++ ++////////////////////////////////////////////////////////////////////////// ++/// Auto-generated x86 intrinsics ++////////////////////////////////////////////////////////////////////////// ++Value *VGATHERPS(Value* src, Value* pBase, Value* indices, Value* mask, Value* scale); ++Value *VGATHERDD(Value* src, Value* pBase, Value* indices, Value* mask, Value* scale); ++Value *VSQRTPS(Value* a); ++Value *VRSQRTPS(Value* a); ++Value *VRCPPS(Value* a); ++Value *VMINPS(Value* a, Value* b); ++Value *VMAXPS(Value* a, Value* b); ++Value *VPMINSD(Value* a, Value* b); ++Value *VPMAXSD(Value* a, Value* b); ++Value *VROUND(Value* a, Value* rounding); ++Value *VCMPPS(Value* a, Value* b, Value* cmpop); ++Value *VBLENDVPS(Value* a, Value* b, Value* mask); ++Value *BEXTR_32(Value* src, Value* control); ++Value *VMASKLOADD(Value* src, Value* mask); ++Value *VMASKMOVPS(Value* src, Value* mask); ++Value *VPSHUFB(Value* a, Value* b); ++Value *VPMOVSXBD(Value* a); ++Value *VPMOVSXWD(Value* a); ++Value *VPERMD(Value* idx, Value* a); ++Value *VCVTPH2PS(Value* a); ++Value *VCVTPS2PH(Value* a, Value* round); ++Value *VEXTRACTF128(Value* a, Value* imm8); ++Value *VEXTRACTI128(Value* a, Value* imm8); ++Value *VINSERTF128(Value* a, Value* b, Value* imm8); ++Value *VINSERTI128(Value* a, Value* b, Value* imm8); ++Value *VHSUBPS(Value* a, Value* b); ++Value *VPTESTC(Value* a, Value* b); ++Value *VFMADDPS(Value* a, Value* b, Value* c); ++Value *VCVTTPS2DQ(Value* a); ++Value *VMOVMSKPS(Value* a); +diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp +new file mode 100644 +index 0000000..1b87769 +--- /dev/null ++++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp +@@ -0,0 +1,1450 @@ ++/**************************************************************************** ++* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. ++* ++* Permission is hereby granted, free of charge, to any person obtaining a ++* copy of this software and associated documentation files (the "Software"), ++* to deal in the Software without restriction, including without limitation ++* the rights to use, copy, modify, merge, publish, distribute, sublicense, ++* and/or sell copies of the Software, and to permit persons to whom the ++* Software is furnished to do so, subject to the following conditions: ++* ++* The above copyright notice and this permission notice (including the next ++* paragraph) shall be included in all copies or substantial portions of the ++* Software. ++* ++* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++* IN THE SOFTWARE. 
++* ++* @file fetch_jit.cpp ++* ++* @brief Implementation of the fetch jitter ++* ++* Notes: ++* ++******************************************************************************/ ++#include "jit_api.h" ++#include "fetch_jit.h" ++#include "builder.h" ++#include "state_llvm.h" ++#include "common/containers.hpp" ++#include "llvm/IR/DataLayout.h" ++#include <sstream> ++#include <tuple> ++ ++//#define FETCH_DUMP_VERTEX 1 ++ ++bool isComponentEnabled(ComponentEnable enableMask, uint8_t component); ++ ++enum ConversionType ++{ ++ CONVERT_NONE, ++ CONVERT_NORMALIZED, ++ CONVERT_USCALED, ++ CONVERT_SSCALED, ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// Interface to Jitting a fetch shader ++////////////////////////////////////////////////////////////////////////// ++struct FetchJit : public Builder ++{ ++ FetchJit(JitManager* pJitMgr) : Builder(pJitMgr){}; ++ ++ Function* Create(const FETCH_COMPILE_STATE& fetchState); ++ Value* GetSimdValid32bitIndices(Value* vIndices, Value* pLastIndex); ++ Value* GetSimdValid16bitIndices(Value* vIndices, Value* pLastIndex); ++ Value* GetSimdValid8bitIndices(Value* vIndices, Value* pLastIndex); ++ ++ // package up Shuffle*bpcGatherd args into a tuple for convenience ++ typedef std::tuple<Value*&, Value*, const Instruction::CastOps, const ConversionType, ++ uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4], ++ const uint32_t(&)[4]> Shuffle8bpcArgs; ++ void Shuffle8bpcGatherd(Shuffle8bpcArgs &args); ++ ++ typedef std::tuple<Value*(&)[2], Value*, const Instruction::CastOps, const ConversionType, ++ uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4]> Shuffle16bpcArgs; ++ void Shuffle16bpcGather(Shuffle16bpcArgs &args); ++ ++ void StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]); ++ ++ Value* GenerateCompCtrlVector(const ComponentControl ctrl); ++ ++ void JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* fetchInfo, Value* streams, Value* vIndices, Value* pVtxOut); ++ void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* fetchInfo, Value* streams, Value* vIndices, Value* pVtxOut); ++}; ++ ++Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState) ++{ ++ static std::size_t fetchNum = 0; ++ ++ std::stringstream fnName("FetchShader", std::ios_base::in | std::ios_base::out | std::ios_base::ate); ++ fnName << fetchNum++; ++ ++ Function* fetch = Function::Create(JM()->mFetchShaderTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule); ++ BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", fetch); ++ ++ IRB()->SetInsertPoint(entry); ++ ++ auto argitr = fetch->getArgumentList().begin(); ++ ++ // Fetch shader arguments ++ Value* fetchInfo = argitr; ++argitr; ++ fetchInfo->setName("fetchInfo"); ++ Value* pVtxOut = argitr; ++ pVtxOut->setName("vtxOutput"); ++ // this is just shorthand to tell LLVM to get a pointer to the base address of simdvertex ++ // index 0 (just the pointer to the simdvertex structure) ++ // index 1 (which element of the simdvertex structure to offset to, in this case 0) ++ // so the indices being i32's doesn't matter ++ // TODO: generate this GEP with a VECTOR structure type so this makes sense ++ std::vector<Value*> vtxInputIndices(2, C(0)); ++ // GEP ++ pVtxOut = GEP(pVtxOut, C(0)); ++ pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, JM()->mVWidth), 0)); ++ ++ // SWR_FETCH_CONTEXT::pStreams ++ Value* streams = LOAD(fetchInfo,{0, SWR_FETCH_CONTEXT_pStreams}); ++ streams->setName("pStreams"); ++ ++ // SWR_FETCH_CONTEXT::pIndices ++ Value* indices = LOAD(fetchInfo,{0, SWR_FETCH_CONTEXT_pIndices}); ++ indices->setName("pIndices"); ++ ++ // SWR_FETCH_CONTEXT::pLastIndex ++ Value* pLastIndex = LOAD(fetchInfo,{0, SWR_FETCH_CONTEXT_pLastIndex}); ++
pLastIndex->setName("pLastIndex"); ++ ++ ++ Value* vIndices; ++ switch(fetchState.indexType) ++ { ++ case R8_UINT: ++ indices = BITCAST(indices, Type::getInt8PtrTy(JM()->mContext, 0)); ++ if(fetchState.bDisableIndexOOBCheck){ ++ vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt8Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0}); ++ vIndices = Z_EXT(vIndices, mSimdInt32Ty); ++ } ++ else{ ++ pLastIndex = BITCAST(pLastIndex, Type::getInt8PtrTy(JM()->mContext, 0)); ++ vIndices = GetSimdValid8bitIndices(indices, pLastIndex); ++ } ++ break; ++ case R16_UINT: ++ indices = BITCAST(indices, Type::getInt16PtrTy(JM()->mContext, 0)); ++ if(fetchState.bDisableIndexOOBCheck){ ++ vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt16Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0}); ++ vIndices = Z_EXT(vIndices, mSimdInt32Ty); ++ } ++ else{ ++ pLastIndex = BITCAST(pLastIndex, Type::getInt16PtrTy(JM()->mContext, 0)); ++ vIndices = GetSimdValid16bitIndices(indices, pLastIndex); ++ } ++ break; ++ case R32_UINT: ++ (fetchState.bDisableIndexOOBCheck) ? vIndices = LOAD(BITCAST(indices, PointerType::get(mSimdInt32Ty,0)),{(uint32_t)0}) ++ : vIndices = GetSimdValid32bitIndices(indices, pLastIndex); ++ break; // incoming type is already 32bit int ++ default: SWR_ASSERT(0, "Unsupported index type"); vIndices = nullptr; break; ++ } ++ ++ // store out vertex IDs ++ STORE(vIndices, GEP(fetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })); ++ ++ // store out cut mask if enabled ++ if (fetchState.bEnableCutIndex) ++ { ++ Value* vCutIndex = VIMMED1(fetchState.cutIndex); ++ Value* cutMask = VMASK(ICMP_EQ(vIndices, vCutIndex)); ++ STORE(cutMask, GEP(fetchInfo, { 0, SWR_FETCH_CONTEXT_CutMask })); ++ } ++ ++ // Fetch attributes from memory and output to a simdvertex struct ++ // since VGATHER has a perf penalty on HSW vs BDW, allow client to choose which fetch method to use ++ (fetchState.bDisableVGATHER) ? JitLoadVertices(fetchState, fetchInfo, streams, vIndices, pVtxOut) ++ : JitGatherVertices(fetchState, fetchInfo, streams, vIndices, pVtxOut); ++ ++ RET_VOID(); ++ ++ //#define KNOB_SWRC_TRACING ++ ++#if defined(KNOB_SWRC_TRACING) ++ std::string err; ++ char fName[1024]; ++ const char *funcName = fetch->getName().data(); ++ sprintf(fName, "%s.ll", funcName); ++ raw_fd_ostream fetchFD(fName, err, LLVM_F_NONE); ++ fetch->print(fetchFD); ++ fetchFD.flush(); ++#endif ++ verifyFunction(*fetch); ++ ++ FunctionPassManager setupPasses(JM()->mpCurrentModule); ++ ++ ///@todo We don't need the CFG passes for fetch. (e.g. BreakCriticalEdges and CFGSimplification) ++ setupPasses.add(createBreakCriticalEdgesPass()); ++ setupPasses.add(createCFGSimplificationPass()); ++ setupPasses.add(createEarlyCSEPass()); ++ setupPasses.add(createPromoteMemoryToRegisterPass()); ++ ++ setupPasses.run(*fetch); ++ ++#if defined(KNOB_SWRC_TRACING) ++ sprintf(fName, "%s.se.ll", funcName); ++ raw_fd_ostream seFetchFD(fName, err, LLVM_F_NONE); ++ fetch->print(seFetchFD); ++ seFetchFD.flush(); ++#endif ++ ++ FunctionPassManager optPasses(JM()->mpCurrentModule); ++ ++ ///@todo Haven't touched these either. Need to remove some of these and add others. 
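++ // (instcombine and CSE tend to expose additional dead code, which is
++ // presumably why this pass manager is run twice on the function below)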
++ optPasses.add(createCFGSimplificationPass()); ++ optPasses.add(createEarlyCSEPass()); ++ optPasses.add(createInstructionCombiningPass()); ++ optPasses.add(createInstructionSimplifierPass()); ++ optPasses.add(createConstantPropagationPass()); ++ optPasses.add(createSCCPPass()); ++ optPasses.add(createAggressiveDCEPass()); ++ ++ optPasses.run(*fetch); ++ optPasses.run(*fetch); ++ ++#if defined(KNOB_SWRC_TRACING) ++ sprintf(fName, "%s.opt.ll", funcName); ++ raw_fd_ostream optFetchFD(fName, err, LLVM_F_NONE); ++ fetch->print(optFetchFD); ++ optFetchFD.flush(); ++#endif ++ ++ return fetch; ++} ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief Loads attributes from memory using LOADs, shuffling the ++/// components into SOA form. ++/// *Note* currently does not support component control, ++/// component packing, or instancing ++/// @param fetchState - info about attributes to be fetched from memory ++/// @param streams - value pointer to the current vertex stream ++/// @param vIndices - vector value of indices to load ++/// @param pVtxOut - value pointer to output simdvertex struct ++void FetchJit::JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* fetchInfo, Value* streams, Value* vIndices, Value* pVtxOut) ++{ ++ // Zack shuffles; a variant of the Charleston. ++ ++ SWRL::UncheckedFixedVector<Value*, 64> vectors; ++ ++ std::vector<Constant*> pMask(JM()->mVWidth); ++ for(uint32_t i = 0; i < JM()->mVWidth; ++i) ++ { ++ pMask[i] = (C(i < 4 ? i : 4)); ++ } ++ Constant* promoteMask = ConstantVector::get(pMask); ++ Constant* uwvec = UndefValue::get(VectorType::get(mFP32Ty, 4)); ++ ++ Value* startVertex = LOAD(fetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex}); ++ ++ for(uint32_t nelt = 0; nelt < fetchState.numAttribs; ++nelt) ++ { ++ Value* elements[4] = {0}; ++ const INPUT_ELEMENT_DESC& ied = fetchState.layout[nelt]; ++ const SWR_FORMAT_INFO &info = GetFormatInfo((SWR_FORMAT)ied.Format); ++ uint32_t numComponents = info.numComps; ++ uint32_t bpc = info.bpp / info.numComps; ///@todo Code below assumes all components are same size. Need to fix. ++ ++ vectors.clear(); ++ ++ // load SWR_VERTEX_BUFFER_STATE::pData ++ Value *stream = LOAD(streams, {ied.StreamIndex, 2}); ++ ++ // load SWR_VERTEX_BUFFER_STATE::pitch ++ Value *stride = LOAD(streams, {ied.StreamIndex, 1}); ++ stride = Z_EXT(stride, mInt64Ty); ++ ++ // load SWR_VERTEX_BUFFER_STATE::size ++ Value *size = LOAD(streams, {ied.StreamIndex, 3}); ++ size = Z_EXT(size, mInt64Ty); ++ ++ Value* startVertexOffset = MUL(Z_EXT(startVertex, mInt64Ty), stride); ++ ++ // Load from the stream. ++ for(uint32_t lane = 0; lane < JM()->mVWidth; ++lane) ++ { ++ // Get index ++ Value* index = VEXTRACT(vIndices, C(lane)); ++ index = Z_EXT(index, mInt64Ty); ++ ++ Value* offset = MUL(index, stride); ++ offset = ADD(offset, C((int64_t)ied.AlignedByteOffset)); ++ offset = ADD(offset, startVertexOffset); ++ ++ if (!fetchState.bDisableIndexOOBCheck) { ++ // check for out of bound access, including partial OOB, and mask them to 0 ++ Value *endOffset = ADD(offset, C((int64_t)info.Bpp)); ++ Value *inBounds = ICMP_ULE(endOffset, size); ++ offset = SELECT(inBounds, offset, ConstantInt::get(mInt64Ty, 0)); ++ } ++ ++ Value* pointer = GEP(stream, offset); ++ // We use a full-lane, but don't actually care.
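++ // Per-lane sketch: each iteration loads one vertex's 4 components with a
++ // single unaligned 4-wide load (e.g. 16 bytes [x y z w] when bpc == 32);
++ // the switch below only picks the matching 4-wide pointer type.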
++ Value* vptr = 0; ++ ++ // get a pointer to a 4 component attrib in default address space ++ switch(bpc) ++ { ++ case 8: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mInt8Ty, 4), 0)); break; ++ case 16: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mInt16Ty, 4), 0)); break; ++ case 32: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mFP32Ty, 4), 0)); break; ++ default: SWR_ASSERT(false, "Unsupported underlying bpp!"); ++ } ++ ++ // load 4 components of attribute ++ Value* vec = ALIGNED_LOAD(vptr, 1, false); ++ ++ // Convert To FP32 internally ++ switch(info.type[0]) ++ { ++ case SWR_TYPE_UNORM: ++ switch(bpc) ++ { ++ case 8: ++ vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4)); ++ vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 255.0)))); ++ break; ++ case 16: ++ vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4)); ++ vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 65535.0)))); ++ break; ++ default: ++ SWR_ASSERT(false, "Unsupported underlying type!"); ++ break; ++ } ++ break; ++ case SWR_TYPE_SNORM: ++ switch(bpc) ++ { ++ case 8: ++ vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4)); ++ vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 128.0)))); ++ break; ++ case 16: ++ vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4)); ++ vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 32768.0)))); ++ break; ++ default: ++ SWR_ASSERT(false, "Unsupported underlying type!"); ++ break; ++ } ++ break; ++ case SWR_TYPE_UINT: ++ // Zero extend uint32_t types. ++ switch(bpc) ++ { ++ case 8: ++ case 16: ++ vec = Z_EXT(vec, VectorType::get(mInt32Ty, 4)); ++ vec = BITCAST(vec, VectorType::get(mFP32Ty, 4)); ++ break; ++ case 32: ++ break; // Pass through unchanged. ++ default: ++ SWR_ASSERT(false, "Unsupported underlying type!"); ++ break; ++ } ++ break; ++ case SWR_TYPE_SINT: ++ // Sign extend SINT types. ++ switch(bpc) ++ { ++ case 8: ++ case 16: ++ vec = S_EXT(vec, VectorType::get(mInt32Ty, 4)); ++ vec = BITCAST(vec, VectorType::get(mFP32Ty, 4)); ++ break; ++ case 32: ++ break; // Pass through unchanged. ++ default: ++ SWR_ASSERT(false, "Unsupported underlying type!"); ++ break; ++ } ++ break; ++ case SWR_TYPE_FLOAT: ++ switch(bpc) ++ { ++ case 32: ++ break; // Pass through unchanged. ++ default: ++ SWR_ASSERT(false, "Unsupported underlying type!"); ++ } ++ break; ++ case SWR_TYPE_USCALED: ++ vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4)); ++ break; ++ case SWR_TYPE_SSCALED: ++ vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4)); ++ break; ++ case SWR_TYPE_UNKNOWN: ++ case SWR_TYPE_UNUSED: ++ SWR_ASSERT(false, "Unsupported type %d!", info.type[0]); ++ } ++ ++ // promote mask: sse(0,1,2,3) | avx(0,1,2,3,4,4,4,4) ++ // uwvec: 4 x F32, undef value ++ Value* wvec = VSHUFFLE(vec, uwvec, promoteMask); ++ vectors.push_back(wvec); ++ } ++ ++ std::vector<Constant*> v01Mask(JM()->mVWidth); ++ std::vector<Constant*> v23Mask(JM()->mVWidth); ++ std::vector<Constant*> v02Mask(JM()->mVWidth); ++ std::vector<Constant*> v13Mask(JM()->mVWidth); ++ ++ // Concatenate the vectors together.
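++ // A sketch of what the shuffles below compute, for the 8-wide case:
++ // the insert loop packs one vertex per register, split across halves:
++ // elements[0] = x0 y0 z0 w0 x4 y4 z4 w4
++ // elements[1] = x1 y1 z1 w1 x5 y5 z5 w5 (likewise elements[2], [3])
++ // the v01/v23 shuffles then pair components across neighboring vertices:
++ // x0y0x1y1 = x0 y0 x1 y1 x4 y4 x5 y5
++ // and the v02/v13 shuffles finish the AOS->SOA transpose, one row per component:
++ // elements[0] = x0 x1 x2 x3 x4 x5 x6 x7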
++ elements[0] = VUNDEF_F(); ++ elements[1] = VUNDEF_F(); ++ elements[2] = VUNDEF_F(); ++ elements[3] = VUNDEF_F(); ++ for(uint32_t b = 0, num4Wide = JM()->mVWidth / 4; b < num4Wide; ++b) ++ { ++ v01Mask[4 * b + 0] = C(0 + 4 * b); ++ v01Mask[4 * b + 1] = C(1 + 4 * b); ++ v01Mask[4 * b + 2] = C(0 + 4 * b + JM()->mVWidth); ++ v01Mask[4 * b + 3] = C(1 + 4 * b + JM()->mVWidth); ++ ++ v23Mask[4 * b + 0] = C(2 + 4 * b); ++ v23Mask[4 * b + 1] = C(3 + 4 * b); ++ v23Mask[4 * b + 2] = C(2 + 4 * b + JM()->mVWidth); ++ v23Mask[4 * b + 3] = C(3 + 4 * b + JM()->mVWidth); ++ ++ v02Mask[4 * b + 0] = C(0 + 4 * b); ++ v02Mask[4 * b + 1] = C(2 + 4 * b); ++ v02Mask[4 * b + 2] = C(0 + 4 * b + JM()->mVWidth); ++ v02Mask[4 * b + 3] = C(2 + 4 * b + JM()->mVWidth); ++ ++ v13Mask[4 * b + 0] = C(1 + 4 * b); ++ v13Mask[4 * b + 1] = C(3 + 4 * b); ++ v13Mask[4 * b + 2] = C(1 + 4 * b + JM()->mVWidth); ++ v13Mask[4 * b + 3] = C(3 + 4 * b + JM()->mVWidth); ++ ++ std::vector<Constant*> iMask(JM()->mVWidth); ++ for(uint32_t i = 0; i < JM()->mVWidth; ++i) ++ { ++ if(((4 * b) <= i) && (i < (4 * (b + 1)))) ++ { ++ iMask[i] = C(i % 4 + JM()->mVWidth); ++ } ++ else ++ { ++ iMask[i] = C(i); ++ } ++ } ++ Constant* insertMask = ConstantVector::get(iMask); ++ elements[0] = VSHUFFLE(elements[0], vectors[4 * b + 0], insertMask); ++ elements[1] = VSHUFFLE(elements[1], vectors[4 * b + 1], insertMask); ++ elements[2] = VSHUFFLE(elements[2], vectors[4 * b + 2], insertMask); ++ elements[3] = VSHUFFLE(elements[3], vectors[4 * b + 3], insertMask); ++ } ++ ++ Value* x0y0x1y1 = VSHUFFLE(elements[0], elements[1], ConstantVector::get(v01Mask)); ++ Value* x2y2x3y3 = VSHUFFLE(elements[2], elements[3], ConstantVector::get(v01Mask)); ++ Value* z0w0z1w1 = VSHUFFLE(elements[0], elements[1], ConstantVector::get(v23Mask)); ++ Value* z2w2z3w3 = VSHUFFLE(elements[2], elements[3], ConstantVector::get(v23Mask)); ++ elements[0] = VSHUFFLE(x0y0x1y1, x2y2x3y3, ConstantVector::get(v02Mask)); ++ elements[1] = VSHUFFLE(x0y0x1y1, x2y2x3y3, ConstantVector::get(v13Mask)); ++ elements[2] = VSHUFFLE(z0w0z1w1, z2w2z3w3, ConstantVector::get(v02Mask)); ++ elements[3] = VSHUFFLE(z0w0z1w1, z2w2z3w3, ConstantVector::get(v13Mask)); ++ ++ // fill components not present in the fetched format with defaults (w defaults to 1.0f); ++ // cases intentionally fall through ++ switch(numComponents + 1) ++ { ++ case 1: elements[0] = VIMMED1(0.0f); ++ case 2: elements[1] = VIMMED1(0.0f); ++ case 3: elements[2] = VIMMED1(0.0f); ++ case 4: elements[3] = VIMMED1(1.0f); ++ } ++ ++ for(uint32_t c = 0; c < 4; ++c) ++ { ++ Value* dest = GEP(pVtxOut, C(nelt * 4 + c), "destGEP"); ++ STORE(elements[c], dest); ++ } ++ } ++} ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief Loads attributes from memory using AVX2 GATHER(s) ++/// @param fetchState - info about attributes to be fetched from memory ++/// @param fetchInfo - first argument passed to fetch shader ++/// @param streams - value pointer to the current vertex stream ++/// @param vIndices - vector value of indices to gather ++/// @param pVtxOut - value pointer to output simdvertex struct ++void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* fetchInfo, ++ Value* streams, Value* vIndices, Value* pVtxOut) ++{ ++ uint32_t currentVertexElement = 0; ++ uint32_t outputElt = 0; ++ Value* vVertexElements[4]; ++ ++ Value* startVertex = LOAD(fetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex}); ++ Value* startInstance = LOAD(fetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance}); ++ Value* curInstance = LOAD(fetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance}); ++ Value* vBaseVertex = VBROADCAST(LOAD(fetchInfo, {0,
SWR_FETCH_CONTEXT_BaseVertex})); ++ curInstance->setName("curInstance"); ++ ++ for(uint32_t nInputElt = 0; nInputElt < fetchState.numAttribs; ++nInputElt) ++ { ++ const INPUT_ELEMENT_DESC& ied = fetchState.layout[nInputElt]; ++ const SWR_FORMAT_INFO &info = GetFormatInfo((SWR_FORMAT)ied.Format); ++ uint32_t bpc = info.bpp / info.numComps; ///@todo Code below assumes all components are same size. Need to fix. ++ ++ Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pData}); ++ ++ // VGATHER* takes an *i8 src pointer ++ Value* pStreamBase = BITCAST(stream, PointerType::get(mInt8Ty, 0)); ++ ++ Value *stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch}); ++ Value *vStride = VBROADCAST(stride); ++ ++ // max vertex index that is fully in bounds ++ Value *maxVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_maxVertex)}); ++ maxVertex = LOAD(maxVertex); ++ ++ Value *vCurIndices; ++ Value *startOffset; ++ if(ied.InstanceEnable) ++ { ++ Value* stepRate = C(ied.InstanceDataStepRate); ++ ++ // prevent a div by 0 for 0 step rate ++ Value* isNonZeroStep = ICMP_UGT(stepRate, C(0)); ++ stepRate = SELECT(isNonZeroStep, stepRate, C(1)); ++ ++ // calc the current offset into instanced data buffer ++ Value* calcInstance = UDIV(curInstance, stepRate); ++ ++ // if step rate is 0, every instance gets instance 0 ++ calcInstance = SELECT(isNonZeroStep, calcInstance, C(0)); ++ ++ vCurIndices = VBROADCAST(calcInstance); ++ ++ startOffset = startInstance; ++ } ++ else ++ { ++ // offset indices by baseVertex ++ vCurIndices = ADD(vIndices, vBaseVertex); ++ ++ startOffset = startVertex; ++ } ++ ++ // All of the OOB calculations are in vertices, not VB offsets, to prevent having to ++ // do 64bit address offset calculations. ++ ++ // calculate byte offset to the start of the VB ++ Value* baseOffset = MUL(Z_EXT(startOffset, mInt64Ty), Z_EXT(stride, mInt64Ty)); ++ pStreamBase = GEP(pStreamBase, baseOffset); ++ ++ // if we have a start offset, subtract from max vertex. Used for OOB check ++ maxVertex = SUB(Z_EXT(maxVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty)); ++ Value* neg = ICMP_SLT(maxVertex, C((int64_t)0)); ++ // if we have a negative value, we're already OOB. clamp at 0. ++ maxVertex = SELECT(neg, C(0), TRUNC(maxVertex, mInt32Ty)); ++ ++ // Load the in bounds size of a partially valid vertex ++ Value *partialInboundsSize = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_partialInboundsSize)}); ++ partialInboundsSize = LOAD(partialInboundsSize); ++ Value* vPartialVertexSize = VBROADCAST(partialInboundsSize); ++ Value* vBpp = VBROADCAST(C(info.Bpp)); ++ ++ // is the element <= the partially valid size? ++ Value* vElementInBoundsMask = ICMP_ULE(vBpp, vPartialVertexSize); ++ ++ // are vertices partially OOB? ++ Value* vMaxVertex = VBROADCAST(maxVertex); ++ Value* vPartialOOBMask = ICMP_EQ(vCurIndices, vMaxVertex); ++ ++ // are the vertices fully in bounds?
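++ // per lane this resolves to three cases: index < maxVertex gathers
++ // normally; index == maxVertex gathers only if the element fits within
++ // partialInboundsSize; index > maxVertex stays masked off entirely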
++ Value* vGatherMask = ICMP_ULT(vCurIndices, vMaxVertex); ++ ++ // blend in any partially OOB indices that have valid elements ++ vGatherMask = SELECT(vPartialOOBMask, vElementInBoundsMask, vGatherMask); ++ vGatherMask = VMASK(vGatherMask); ++ ++ // calculate the actual offsets into the VB ++ Value* vOffsets = MUL(vCurIndices, vStride); ++ Value* vAlignmentOffsets = VBROADCAST(C(ied.AlignedByteOffset)); ++ vOffsets = ADD(vOffsets, vAlignmentOffsets); ++ ++ // Packing and component control ++ ComponentEnable compMask = (ComponentEnable)ied.ComponentPacking; ++ const ComponentControl compCtrl[4] { (ComponentControl)ied.ComponentControl0, (ComponentControl)ied.ComponentControl1, ++ (ComponentControl)ied.ComponentControl2, (ComponentControl)ied.ComponentControl3}; ++ ++ if(info.type[0] == SWR_TYPE_FLOAT) ++ { ++ ///@todo: support 64 bit vb accesses ++ Value* gatherSrc = VIMMED1(0.0f); ++ ++ // Gather components from memory to store in a simdvertex structure ++ switch(bpc) ++ { ++ case 16: ++ { ++ Value* vGatherResult[2]; ++ Value *vMask; ++ ++ // if we have at least one component out of x or y to fetch ++ if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){ ++ // save mask as it is zero'd out after each gather ++ vMask = vGatherMask; ++ ++ vGatherResult[0] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1)); ++ // e.g. result of first 8x32bit integer gather for 16bit components ++ // 256i - 0 1 2 3 4 5 6 7 ++ // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy ++ // ++ } ++ ++ // if we have at least one component out of z or w to fetch ++ if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){ ++ // offset base to the next components(zw) in the vertex to gather ++ pStreamBase = GEP(pStreamBase, C((char)4)); ++ vMask = vGatherMask; ++ ++ vGatherResult[1] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1)); ++ // e.g. 
result of second 8x32bit integer gather for 16bit components ++ // 256i - 0 1 2 3 4 5 6 7 ++ // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw ++ // ++ } ++ ++ // if we have at least one component to shuffle into place ++ if(compMask){ ++ Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, Instruction::CastOps::FPExt, CONVERT_NONE, ++ currentVertexElement, outputElt, compMask, compCtrl, vVertexElements); ++ // Shuffle gathered components into place in simdvertex struct ++ Shuffle16bpcGather(args); // outputs to vVertexElements ref ++ } ++ } ++ break; ++ case 32: ++ { ++ for(uint32_t i = 0; i < 4; i++) ++ { ++ if(!isComponentEnabled(compMask, i)){ ++ // offset base to the next component in the vertex to gather ++ pStreamBase = GEP(pStreamBase, C((char)4)); ++ continue; ++ } ++ ++ // if we need to gather the component ++ if(compCtrl[i] == StoreSrc){ ++ // save mask as it is zero'd out after each gather ++ Value *vMask = vGatherMask; ++ ++ // Gather a SIMD of vertices ++ vVertexElements[currentVertexElement++] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1)); ++ } ++ else{ ++ vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]); ++ } ++ ++ if(currentVertexElement > 3){ ++ StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements); ++ // reset to the next vVertexElement to output ++ currentVertexElement = 0; ++ } ++ ++ // offset base to the next component in the vertex to gather ++ pStreamBase = GEP(pStreamBase, C((char)4)); ++ } ++ } ++ break; ++ default: ++ SWR_ASSERT(0, "Tried to fetch invalid FP format"); ++ break; ++ } ++ } ++ else ++ { ++ Instruction::CastOps extendCastType = Instruction::CastOps::CastOpsEnd; ++ ConversionType conversionType = CONVERT_NONE; ++ ++ switch(info.type[0]) ++ { ++ case SWR_TYPE_UNORM: ++ conversionType = CONVERT_NORMALIZED; ++ case SWR_TYPE_UINT: ++ extendCastType = Instruction::CastOps::ZExt; ++ break; ++ case SWR_TYPE_SNORM: ++ conversionType = CONVERT_NORMALIZED; ++ case SWR_TYPE_SINT: ++ extendCastType = Instruction::CastOps::SExt; ++ break; ++ case SWR_TYPE_USCALED: ++ conversionType = CONVERT_USCALED; ++ extendCastType = Instruction::CastOps::UIToFP; ++ break; ++ case SWR_TYPE_SSCALED: ++ conversionType = CONVERT_SSCALED; ++ extendCastType = Instruction::CastOps::SIToFP; ++ break; ++ default: ++ break; ++ } ++ ++ // value substituted when component of gather is masked ++ Value* gatherSrc = VIMMED1(0); ++ ++ // Gather components from memory to store in a simdvertex structure ++ switch (bpc) ++ { ++ case 8: ++ { ++ // if we have at least one component to fetch ++ if(compMask){ ++ Value* vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask, C((char)1)); ++ // e.g. 
result of an 8x32bit integer gather for 8bit components
++            // 256i - 0    1    2    3    4    5    6    7
++            //        xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
++
++            Shuffle8bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
++                currentVertexElement, outputElt, compMask, compCtrl, vVertexElements, info.swizzle);
++            // Shuffle gathered components into place in simdvertex struct
++            Shuffle8bpcGatherd(args); // outputs to vVertexElements ref
++        }
++    }
++    break;
++    case 16:
++    {
++        Value* vGatherResult[2];
++        Value* vMask;
++
++        // if we have at least one component out of x or y to fetch
++        if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
++            // save mask as it is zero'd out after each gather
++            vMask = vGatherMask;
++
++            vGatherResult[0] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
++            // e.g. result of first 8x32bit integer gather for 16bit components
++            // 256i - 0    1    2    3    4    5    6    7
++            //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
++            //
++        }
++
++        // if we have at least one component out of z or w to fetch
++        if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
++            // offset base to the next components(zw) in the vertex to gather
++            pStreamBase = GEP(pStreamBase, C((char)4));
++            vMask = vGatherMask;
++
++            vGatherResult[1] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
++            // e.g. result of second 8x32bit integer gather for 16bit components
++            // 256i - 0    1    2    3    4    5    6    7
++            //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
++            //
++        }
++
++        // if we have at least one component to shuffle into place
++        if(compMask){
++            Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
++                currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
++            // Shuffle gathered components into place in simdvertex struct
++            Shuffle16bpcGather(args); // outputs to vVertexElements ref
++        }
++    }
++    break;
++    case 32:
++    {
++        SWR_ASSERT(conversionType == CONVERT_NONE);
++
++        // Gather components into place in simdvertex struct
++        for(uint32_t i = 0; i < 4; i++)
++        {
++            if(!isComponentEnabled(compMask, i)){
++                // offset base to the next component in the vertex to gather
++                pStreamBase = GEP(pStreamBase, C((char)4));
++                continue;
++            }
++
++            // if we need to gather the component
++            if(compCtrl[i] == StoreSrc){
++                // save mask as it is zero'd out after each gather
++                Value* vMask = vGatherMask;
++
++                vVertexElements[currentVertexElement++] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
++
++                // e.g. result of a single 8x32bit integer gather for 32bit components
++                // 256i - 0    1    2    3    4    5    6    7
++                //        xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx
++            }
++            else{
++                vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
++            }
++
++            if(currentVertexElement > 3){
++                StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
++                // reset to the next vVertexElement to output
++                currentVertexElement = 0;
++            }
++
++            // offset base to the next component in the vertex to gather
++            pStreamBase = GEP(pStreamBase, C((char)4));
++        }
++    }
++    break;
++    }
++    }
++    }
++
++    // if we have a partially filled vVertexElement struct, output it
++    if(currentVertexElement > 0){
++        StoreVertexElements(pVtxOut, outputElt++, currentVertexElement, vVertexElements);
++    }
++}
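++
++// For reference, the gather path above computes, per SIMD lane, the scalar
++// equivalent below (illustrative sketch only, not part of the jitted code;
++// 'kSimdWidth', 'index' and 'component' are made-up names):
++//
++//     for (uint32_t lane = 0; lane < kSimdWidth; ++lane)
++//     {
++//         uint32_t offset = index[lane] * stride + ied.AlignedByteOffset;
++//         component[lane] = gatherMask[lane] ? *(const float*)(pStreamBase + offset)
++//                                            : 0.0f; // masked lanes keep gatherSrc
++//     }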
++
++//////////////////////////////////////////////////////////////////////////
++/// @brief Loads a simd of valid indices. OOB indices are set to 0
++/// *Note* have to do 8bit index checking in scalar until we have AVX-512
++/// support
++/// @param pIndices - pointer to 8 bit indices
++/// @param pLastIndex - pointer to last valid index
++Value* FetchJit::GetSimdValid8bitIndices(Value* pIndices, Value* pLastIndex)
++{
++    // can fit 4 8 bit integers per vWidth lane
++    Value* vIndices = VUNDEF_I();
++
++    // store 0 index on stack to be used to conditionally load from if index address is OOB
++    Value* pZeroIndex = ALLOCA(mInt8Ty);
++    STORE(C((uint8_t)0), pZeroIndex);
++
++    // Load a SIMD of index pointers
++    for(int64_t lane = 0; lane < JM()->mVWidth; lane++)
++    {
++        // Calculate the address of the requested index
++        Value* pIndex = GEP(pIndices, C(lane));
++
++        // check if the address is less than the max index
++        Value* mask = ICMP_ULT(pIndex, pLastIndex);
++
++        // if valid, load the index. if not, load 0 from the stack
++        Value* pValid = SELECT(mask, pIndex, pZeroIndex);
++        Value* index = LOAD(pValid, "valid index");
++
++        // zero extend the index to 32 bits and insert into the correct simd lane
++        index = Z_EXT(index, mInt32Ty);
++        vIndices = VINSERT(vIndices, index, lane);
++    }
++    return vIndices;
++}
++
++//////////////////////////////////////////////////////////////////////////
++/// @brief Loads a simd of valid indices. OOB indices are set to 0
++/// *Note* have to do 16bit index checking in scalar until we have AVX-512
++/// support
++/// @param pIndices - pointer to 16 bit indices
++/// @param pLastIndex - pointer to last valid index
++Value* FetchJit::GetSimdValid16bitIndices(Value* pIndices, Value* pLastIndex)
++{
++    // can fit 2 16 bit integers per vWidth lane
++    Value* vIndices = VUNDEF_I();
++
++    // store 0 index on stack to be used to conditionally load from if index address is OOB
++    Value* pZeroIndex = ALLOCA(mInt16Ty);
++    STORE(C((uint16_t)0), pZeroIndex);
++
++    // Load a SIMD of index pointers
++    for(int64_t lane = 0; lane < JM()->mVWidth; lane++)
++    {
++        // Calculate the address of the requested index
++        Value* pIndex = GEP(pIndices, C(lane));
++
++        // check if the address is less than the max index
++        Value* mask = ICMP_ULT(pIndex, pLastIndex);
++
++        // if valid, load the index. if not, load 0 from the stack
++        Value* pValid = SELECT(mask, pIndex, pZeroIndex);
++        Value* index = LOAD(pValid, "valid index");
++
++        // zero extend the index to 32 bits and insert into the correct simd lane
++        index = Z_EXT(index, mInt32Ty);
++        vIndices = VINSERT(vIndices, index, lane);
++    }
++    return vIndices;
++}
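++
++// Note: the two scalar loops above exist because AVX2 gathers (VPGATHERDD)
++// only fetch 32-bit elements, so 8/16-bit indices are widened one lane at a
++// time. A rough intrinsic-level equivalent of one lane (illustrative only):
++//
++//     uint16_t idx = (pIndex < pLastIndex) ? *pIndex : 0; // OOB reads index 0
++//     vIndices = _mm256_insert_epi32(vIndices, (int)idx, lane);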
++
++//////////////////////////////////////////////////////////////////////////
++/// @brief Loads a simd of valid indices. OOB indices are set to 0
++/// @param pIndices - pointer to 32 bit indices
++/// @param pLastIndex - pointer to last valid index
++Value* FetchJit::GetSimdValid32bitIndices(Value* pIndices, Value* pLastIndex)
++{
++    DataLayout dL(JM()->mpCurrentModule);
++    unsigned int ptrSize = dL.getPointerSize() * 8; // ptr size in bits
++    Value* iLastIndex = PTR_TO_INT(pLastIndex, Type::getIntNTy(JM()->mContext, ptrSize));
++    Value* iIndices = PTR_TO_INT(pIndices, Type::getIntNTy(JM()->mContext, ptrSize));
++
++    // get the number of indices left in the buffer (endPtr - curPtr) / sizeof(index)
++    Value* numIndicesLeft = SUB(iLastIndex, iIndices);
++    numIndicesLeft = TRUNC(numIndicesLeft, mInt32Ty);
++    numIndicesLeft = SDIV(numIndicesLeft, C(4));
++
++    // create a vector of index counts from the base index ptr passed into the fetch
++    const std::vector<Constant*> vecIndices {C(0), C(1), C(2), C(3), C(4), C(5), C(6), C(7)};
++    Constant* vIndexOffsets = ConstantVector::get(vecIndices);
++
++    // compare index count to the max valid index
++    // e.g. vMaxIndex      4 4 4 4 4 4 4 4 : 4 indices left to load
++    //      vIndexOffsets  0 1 2 3 4 5 6 7
++    //      ------------------------------
++    //      vIndexMask    -1-1-1-1 0 0 0 0 : offsets < max pass
++    //      vLoadedIndices 0 1 2 3 0 0 0 0 : offsets >= max masked to 0
++    Value* vMaxIndex = VBROADCAST(numIndicesLeft);
++    Value* vIndexMask = VPCMPGTD(vMaxIndex, vIndexOffsets);
++
++    // VMASKLOAD takes an *i8 src pointer
++    pIndices = BITCAST(pIndices, PointerType::get(mInt8Ty, 0));
++
++    // Load the indices; OOB loads 0
++    return MASKLOADD(pIndices, vIndexMask);
++}
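++
++// The masked load above maps to AVX2 VPMASKMOVD: lanes whose mask sign bit is
++// set are loaded from memory and the rest are zeroed, so trailing indices past
++// pLastIndex come back as index 0 without faulting. Intrinsic-level equivalent
++// (illustrative only):
++//
++//     __m256i vIndices = _mm256_maskload_epi32((const int*)pIndices, vIndexMask);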
++
++//////////////////////////////////////////////////////////////////////////
++/// @brief Takes a SIMD of gathered 8bpc verts, zero or sign extends,
++/// denormalizes if needed, converts to F32 if needed, and positions in
++/// the proper SIMD rows to be output to the simdvertex structure
++/// @param args: (tuple of args, listed below)
++/// @param vGatherResult - 8 gathered 8bpc vertices
++/// @param pVtxOut - base pointer to output simdvertex struct
++/// @param extendType - sign extend or zero extend
++/// @param conversionType - conversion to apply (none, normalized, scaled)
++/// @param currentVertexElement - reference to the current vVertexElement
++/// @param outputElt - reference to the current offset from simdvertex we're outputting to
++/// @param compMask - component packing mask
++/// @param compCtrl - component control val
++/// @param vVertexElements[4] - vertex components to output
++/// @param swizzle[4] - component swizzle location
++void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args)
++{
++    // Unpack tuple args
++    Value*& vGatherResult = std::get<0>(args);
++    Value* pVtxOut = std::get<1>(args);
++    const Instruction::CastOps extendType = std::get<2>(args);
++    const ConversionType conversionType = std::get<3>(args);
++    uint32_t &currentVertexElement = std::get<4>(args);
++    uint32_t &outputElt = std::get<5>(args);
++    const ComponentEnable compMask = std::get<6>(args);
++    const ComponentControl (&compCtrl)[4] = std::get<7>(args);
++    Value* (&vVertexElements)[4] = std::get<8>(args);
++    const uint32_t (&swizzle)[4] = std::get<9>(args);
++
++    // cast types
++    Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), JM()->mVWidth);
++    Type* v32x8Ty = VectorType::get(mInt8Ty, JM()->mVWidth * 4); // vwidth is units of 32 bits
++
++    // have to do extra work for sign extending
++    if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)){
++        Type* v16x8Ty = VectorType::get(mInt8Ty, JM()->mVWidth * 2); // 8x16bit ints in a 128bit lane
++        Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), JM()->mVWidth / 4); // vwidth is units of 32 bits
++
++        // shuffle mask, including any swizzling
++        const char x = (char)swizzle[0]; const char y = (char)swizzle[1];
++        const char z = (char)swizzle[2]; const char w = (char)swizzle[3];
++        Value* vConstMask = C({char(x), char(x+4), char(x+8), char(x+12),
++                               char(y), char(y+4), char(y+8), char(y+12),
++                               char(z), char(z+4), char(z+8), char(z+12),
++                               char(w), char(w+4), char(w+8), char(w+12),
++                               char(x), char(x+4), char(x+8), char(x+12),
++                               char(y), char(y+4), char(y+8), char(y+12),
++                               char(z), char(z+4), char(z+8), char(z+12),
++                               char(w), char(w+4), char(w+8), char(w+12)});
++
++        Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult, v32x8Ty), vConstMask), vGatherTy);
++        // after pshufb: group components together in each 128bit lane
++        // 256i - 0    1    2    3    4    5    6    7
++        //        xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
++
++        Value* vi128XY = nullptr;
++        if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
++            vi128XY = BITCAST(PERMD(vShufResult, C({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty);
++            // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
++            // 256i - 0    1    2    3    4    5    6    7
++            //        xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
++        }
++
++        // do the same for zw components
++        Value* vi128ZW = nullptr;
++        if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
++            vi128ZW = BITCAST(PERMD(vShufResult, C({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty);
++        }
++
++        // init denormalize variables if needed
++        Instruction::CastOps fpCast;
++        Value* conversionFactor;
++
++        switch (conversionType)
++        {
++        case CONVERT_NORMALIZED:
++            fpCast = Instruction::CastOps::SIToFP;
++            conversionFactor = VIMMED1((float)(1.0 / 127.0));
++            break;
++        case CONVERT_SSCALED:
++            fpCast = Instruction::CastOps::SIToFP;
++            conversionFactor = VIMMED1((float)(1.0));
++            break;
++        case CONVERT_USCALED:
++            SWR_ASSERT(0, "Type should not be sign extended!");
++            conversionFactor = nullptr;
++            break;
++        default:
++            SWR_ASSERT(conversionType == CONVERT_NONE);
++            conversionFactor = nullptr;
++            break;
++        }
++
++        // sign extend all enabled components. If we have a full vVertexElements, output to current simdvertex
++        for(uint32_t i = 0; i < 4; i++){
++            if(!isComponentEnabled(compMask, i)){
++                continue;
++            }
++
++            if(compCtrl[i] == ComponentControl::StoreSrc){
++                // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
++                uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
++                // if x or y, use vi128XY permute result, else use vi128ZW
++                Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
++
++                // sign extend
++                vVertexElements[currentVertexElement] = PMOVSXBD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v16x8Ty));
++
++                // denormalize if needed
++                if(conversionType != CONVERT_NONE){
++                    vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
++                }
++                currentVertexElement++;
++            }
++            else{
++                vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
++            }
++
++            if(currentVertexElement > 3){
++                StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
++                // reset to the next vVertexElement to output
++                currentVertexElement = 0;
++            }
++        }
++    }
++    // else zero extend
++    else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
++    {
++        // init denormalize variables if needed
++        Instruction::CastOps fpCast;
++        Value* conversionFactor;
++
++        switch (conversionType)
++        {
++        case CONVERT_NORMALIZED:
++            fpCast = Instruction::CastOps::UIToFP;
++            conversionFactor = VIMMED1((float)(1.0 / 255.0));
++            break;
++        case CONVERT_USCALED:
++            fpCast = Instruction::CastOps::UIToFP;
++            conversionFactor = VIMMED1((float)(1.0));
++            break;
++        case CONVERT_SSCALED:
++            SWR_ASSERT(0, "Type should not be zero extended!");
++            conversionFactor = nullptr;
++            break;
++        default:
++            SWR_ASSERT(conversionType == CONVERT_NONE);
++            conversionFactor = nullptr;
++            break;
++        }
++
++        // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
++        for(uint32_t i = 0; i < 4; i++){
++            if(!isComponentEnabled(compMask, i)){
++                continue;
++            }
++
++            if(compCtrl[i] == ComponentControl::StoreSrc){
++                // pshufb masks for each component
++                Value* vConstMask;
++                switch(swizzle[i]){
++                case 0:
++                    // x shuffle mask
++                    vConstMask = C({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
++                                    0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1});
++                    break;
++                case 1:
++                    // y shuffle mask
++                    vConstMask = C({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
++                                    1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1});
++                    break;
++                case 2:
++                    // z shuffle mask
++                    vConstMask = C({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
++                                    2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1});
++                    break;
++                case 3:
++                    // w shuffle mask
++                    vConstMask = C({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
++                                    3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1});
++                    break;
++                default:
++                    vConstMask = nullptr;
++                    break;
++                }
++
++                vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult, v32x8Ty), vConstMask), vGatherTy);
++                // after pshufb for x channel
++                // 256i - 0    1    2    3    4    5    6    7
++                //        x000 x000 x000 x000 x000 x000 x000 x000
++
++                // denormalize if needed
++                if (conversionType != CONVERT_NONE){
++                    vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
++                }
++                currentVertexElement++;
++            }
++            else{
++                vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
++            }
++
++            if(currentVertexElement > 3){
++                StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
++                // reset to the next vVertexElement to output
++                currentVertexElement = 0;
++            }
++        }
++    }
++    else
++    {
++        SWR_ASSERT(0, "Unsupported conversion type");
++    }
++}
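++
++// The conversion factors in the two switches above follow the usual UNORM/SNORM
++// rules: an n-bit unsigned value c maps to c / (2^n - 1), and a signed value to
++// c / (2^(n-1) - 1), hence 1/255 and 1/127 for the 8bpc paths here and 1/65535
++// and 1/32767 for the 16bpc paths below. Scalar sketch (illustrative only):
++//
++//     float unorm8_to_float(uint8_t c) { return c * (1.0f / 255.0f); }
++//     float snorm8_to_float(int8_t c)  { return c * (1.0f / 127.0f); }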
++
++//////////////////////////////////////////////////////////////////////////
++/// @brief Takes a SIMD of gathered 16bpc verts, zero or sign extends,
++/// denormalizes if needed, converts to F32 if needed, and positions in
++/// the proper SIMD rows to be output to the simdvertex structure
++/// @param args: (tuple of args, listed below)
++/// @param vGatherResult[2] - array of gathered 16bpc vertices, 4 per index
++/// @param pVtxOut - base pointer to output simdvertex struct
++/// @param extendType - sign extend or zero extend
++/// @param conversionType - conversion to apply (none, normalized, scaled)
++/// @param currentVertexElement - reference to the current vVertexElement
++/// @param outputElt - reference to the current offset from simdvertex we're outputting to
++/// @param compMask - component packing mask
++/// @param compCtrl - component control val
++/// @param vVertexElements[4] - vertex components to output
++void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args)
++{
++    // Unpack tuple args
++    Value* (&vGatherResult)[2] = std::get<0>(args);
++    Value* pVtxOut = std::get<1>(args);
++    const Instruction::CastOps extendType = std::get<2>(args);
++    const ConversionType conversionType = std::get<3>(args);
++    uint32_t &currentVertexElement = std::get<4>(args);
++    uint32_t &outputElt = std::get<5>(args);
++    const ComponentEnable compMask = std::get<6>(args);
++    const ComponentControl(&compCtrl)[4] = std::get<7>(args);
++    Value* (&vVertexElements)[4] = std::get<8>(args);
++
++    // cast types
++    Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), JM()->mVWidth);
++    Type* v32x8Ty = VectorType::get(mInt8Ty, JM()->mVWidth * 4); // vwidth is units of 32 bits
++
++    // have to do extra work for sign extending
++    if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP) ||
++        (extendType == Instruction::CastOps::FPExt))
++    {
++        // is this half-precision (FP16) float data that only needs an FPExt?
++        bool bFP = (extendType == Instruction::CastOps::FPExt);
++
++        Type* v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
++        Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), JM()->mVWidth / 4); // vwidth is units of 32 bits
++
++        // shuffle mask
++        Value* vConstMask = C({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
++                               0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
++        Value* vi128XY = nullptr;
++        if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
++            Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[0], v32x8Ty), vConstMask), vGatherTy);
++            // after pshufb: group components together in each 128bit lane
++            // 256i - 0    1    2    3    4    5    6    7
++            //        xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
++
++            vi128XY = BITCAST(PERMD(vShufResult, C({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
++            // after PERMD: move and pack xy components into each 128bit lane
++            // 256i - 0    1    2    3    4    5    6    7
++            //        xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
++        }
++
++        // do the same for zw components
++        Value* vi128ZW = nullptr;
++        if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
++            Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[1], v32x8Ty), vConstMask), vGatherTy);
++            vi128ZW = BITCAST(PERMD(vShufResult, C({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
++        }
++
++        // init denormalize variables if needed
++        Instruction::CastOps IntToFpCast;
++        Value* conversionFactor;
++
++        switch (conversionType)
++        {
++        case CONVERT_NORMALIZED:
++            IntToFpCast = Instruction::CastOps::SIToFP;
++            conversionFactor = VIMMED1((float)(1.0 / 32767.0));
++            break;
++        case CONVERT_SSCALED:
++            IntToFpCast = Instruction::CastOps::SIToFP;
++            conversionFactor = VIMMED1((float)(1.0));
++            break;
++        case CONVERT_USCALED:
++            SWR_ASSERT(0, "Type should not be sign extended!");
++            conversionFactor = nullptr;
++            break;
++        default:
++            SWR_ASSERT(conversionType == CONVERT_NONE);
++            conversionFactor = nullptr;
++            break;
++        }
++
++        // sign extend all enabled components. If we have a full vVertexElements, output to current simdvertex
++        for(uint32_t i = 0; i < 4; i++){
++            if(!isComponentEnabled(compMask, i)){
++                continue;
++            }
++
++            if(compCtrl[i] == ComponentControl::StoreSrc){
++                // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
++                uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
++                // if x or y, use vi128XY permute result, else use vi128ZW
++                Value* selectedPermute = (i < 2) ?
vi128XY : vi128ZW; ++ ++ if(bFP) { ++ // extract 128 bit lanes to sign extend each component ++ vVertexElements[currentVertexElement] = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty)); ++ } ++ else { ++ // extract 128 bit lanes to sign extend each component ++ vVertexElements[currentVertexElement] = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty)); ++ ++ // denormalize if needed ++ if(conversionType != CONVERT_NONE){ ++ vVertexElements[currentVertexElement] = FMUL(CAST(IntToFpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor); ++ } ++ } ++ currentVertexElement++; ++ } ++ else{ ++ vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]); ++ } ++ ++ if(currentVertexElement > 3){ ++ StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements); ++ // reset to the next vVertexElement to output ++ currentVertexElement = 0; ++ } ++ } ++ ++ } ++ // else zero extend ++ else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP)) ++ { ++ // pshufb masks for each component ++ Value* vConstMask[2]; ++ if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2)){ ++ // x/z shuffle mask ++ vConstMask[0] = C({0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, ++ 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, }); ++ } ++ ++ if(isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3)){ ++ // y/w shuffle mask ++ vConstMask[1] = C({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1, ++ 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1}); ++ } ++ ++ // init denormalize variables if needed ++ Instruction::CastOps fpCast; ++ Value* conversionFactor; ++ ++ switch (conversionType) ++ { ++ case CONVERT_NORMALIZED: ++ fpCast = Instruction::CastOps::UIToFP; ++ conversionFactor = VIMMED1((float)(1.0 / 65535.0)); ++ break; ++ case CONVERT_USCALED: ++ fpCast = Instruction::CastOps::UIToFP; ++ conversionFactor = VIMMED1((float)(1.0f)); ++ break; ++ case CONVERT_SSCALED: ++ SWR_ASSERT(0, "Type should not be zero extended!"); ++ conversionFactor = nullptr; ++ break; ++ default: ++ SWR_ASSERT(conversionType == CONVERT_NONE); ++ conversionFactor = nullptr; ++ break; ++ } ++ ++ // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits ++ for(uint32_t i = 0; i < 4; i++){ ++ if(!isComponentEnabled(compMask, i)){ ++ continue; ++ } ++ ++ if(compCtrl[i] == ComponentControl::StoreSrc){ ++ // select correct constMask for x/z or y/w pshufb ++ uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1; ++ // if x or y, use vi128XY permute result, else use vi128ZW ++ uint32_t selectedGather = (i < 2) ? 
0 : 1; ++ ++ vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy); ++ // after pshufb mask for x channel; z uses the same shuffle from the second gather ++ // 256i - 0 1 2 3 4 5 6 7 ++ // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00 ++ ++ // denormalize if needed ++ if(conversionType != CONVERT_NONE){ ++ vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor); ++ } ++ currentVertexElement++; ++ } ++ else{ ++ vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]); ++ } ++ ++ if(currentVertexElement > 3){ ++ StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements); ++ // reset to the next vVertexElement to output ++ currentVertexElement = 0; ++ } ++ } ++ } ++ else ++ { ++ SWR_ASSERT(0, "Unsupported conversion type"); ++ } ++} ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief Output a simdvertex worth of elements to the current outputElt ++/// @param pVtxOut - base address of VIN output struct ++/// @param outputElt - simdvertex offset in VIN to write to ++/// @param numEltsToStore - number of simdvertex rows to write out ++/// @param vVertexElements - LLVM Value*[] simdvertex to write out ++void FetchJit::StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]) ++{ ++ for(uint32_t c = 0; c < numEltsToStore; ++c) ++ { ++ // STORE expects FP32 x vWidth type, just bitcast if needed ++ if(!vVertexElements[c]->getType()->getScalarType()->isFloatTy()){ ++#if FETCH_DUMP_VERTEX ++ PRINT("vVertexElements[%d]: 0x%x\n", {C(c), vVertexElements[c]}); ++#endif ++ vVertexElements[c] = BITCAST(vVertexElements[c], mSimdFP32Ty); ++ } ++#if FETCH_DUMP_VERTEX ++ else ++ { ++ PRINT("vVertexElements[%d]: %f\n", {C(c), vVertexElements[c]}); ++ } ++#endif ++ // outputElt * 4 = offsetting by the size of a simdvertex ++ // + c offsets to a 32bit x vWidth row within the current vertex ++ Value* dest = GEP(pVtxOut, C(outputElt * 4 + c), "destGEP"); ++ STORE(vVertexElements[c], dest); ++ } ++} ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief Generates a constant vector of values based on the ++/// ComponentControl value ++/// @param ctrl - ComponentControl value ++Value* FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl) ++{ ++ switch(ctrl) ++ { ++ case NoStore: return VUNDEF_I(); ++ case Store0: return VIMMED1(0); ++ case Store1Fp: return VIMMED1(1.0f); ++ case Store1Int: return VIMMED1(1); ++ case StoreSrc: ++ default: SWR_ASSERT(0, "Invalid component control"); return VUNDEF_I(); ++ } ++} ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief Returns the enable mask for the specified component. ++/// @param enableMask - enable bits ++/// @param component - component to check if enabled. 
++bool isComponentEnabled(ComponentEnable enableMask, uint8_t component)
++{
++    switch (component)
++    {
++    // X
++    case 0: return (enableMask & ComponentEnable::X);
++    // Y
++    case 1: return (enableMask & ComponentEnable::Y);
++    // Z
++    case 2: return (enableMask & ComponentEnable::Z);
++    // W
++    case 3: return (enableMask & ComponentEnable::W);
++
++    default: return false;
++    }
++}
++
++
++//////////////////////////////////////////////////////////////////////////
++/// @brief JITs from fetch shader IR
++/// @param hJitMgr - JitManager handle
++/// @param func - LLVM function IR
++/// @return PFN_FETCH_FUNC - pointer to fetch code
++PFN_FETCH_FUNC JitFetchFunc(HANDLE hJitMgr, const HANDLE hFunc)
++{
++    const llvm::Function* func = (const llvm::Function*)hFunc;
++    JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
++    PFN_FETCH_FUNC pfnFetch;
++
++    pfnFetch = (PFN_FETCH_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));
++    // MCJIT finalizes modules the first time you JIT code from them. After finalized, you cannot add new IR to the module
++    pJitMgr->mIsModuleFinalized = true;
++
++#if defined(KNOB_SWRC_TRACING)
++    char fName[1024];
++    const char *funcName = func->getName().data();
++    sprintf(fName, "%s.bin", funcName);
++    FILE *fd = fopen(fName, "wb");
++    fwrite((void *)pfnFetch, 1, 2048, fd);
++    fclose(fd);
++#endif
++
++    return pfnFetch;
++}
++
++//////////////////////////////////////////////////////////////////////////
++/// @brief JIT compiles fetch shader
++/// @param hJitMgr - JitManager handle
++/// @param state - fetch state to build function from
++extern "C" PFN_FETCH_FUNC JITCALL JitCompileFetch(HANDLE hJitMgr, const FETCH_COMPILE_STATE& state)
++{
++    JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
++
++    pJitMgr->SetupNewModule();
++
++    FetchJit theJit(pJitMgr);
++    HANDLE hFunc = theJit.Create(state);
++
++    return JitFetchFunc(hJitMgr, hFunc);
++}
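++
++// Example of driving the fetch jitter end to end (illustrative sketch; the
++// knob and SWR_FORMAT values below are assumptions, only JitCreateContext and
++// JitCompileFetch are part of this interface):
++//
++//     HANDLE hJit = JitCreateContext(KNOB_SIMD_WIDTH, KNOB_ARCH_STR);
++//     FETCH_COMPILE_STATE fs;
++//     fs.numAttribs = 1;
++//     fs.indexType = R32_UINT;
++//     fs.layout[0].AlignedByteOffset = 0;
++//     fs.layout[0].Format = R32G32B32A32_FLOAT;
++//     fs.layout[0].ComponentPacking = XYZW;
++//     PFN_FETCH_FUNC pfnFetch = JitCompileFetch(hJit, fs);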
+diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.h b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.h
+new file mode 100644
+index 0000000..ea3625d
+--- /dev/null
++++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.h
+@@ -0,0 +1,128 @@
++/****************************************************************************
++* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
++*
++* Permission is hereby granted, free of charge, to any person obtaining a
++* copy of this software and associated documentation files (the "Software"),
++* to deal in the Software without restriction, including without limitation
++* the rights to use, copy, modify, merge, publish, distribute, sublicense,
++* and/or sell copies of the Software, and to permit persons to whom the
++* Software is furnished to do so, subject to the following conditions:
++*
++* The above copyright notice and this permission notice (including the next
++* paragraph) shall be included in all copies or substantial portions of the
++* Software.
++*
++* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
++* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++* IN THE SOFTWARE.
++*
++* @file fetch_jit.h
++*
++* @brief Definition of the fetch jitter
++*
++* Notes:
++*
++******************************************************************************/
++#pragma once
++
++#include "common/formats.h"
++#include "core/state.h"
++
++//////////////////////////////////////////////////////////////////////////
++/// INPUT_ELEMENT_DESC
++//////////////////////////////////////////////////////////////////////////
++struct INPUT_ELEMENT_DESC
++{
++    union
++    {
++        struct
++        {
++            uint32_t AlignedByteOffset : 12;
++            uint32_t Format : 10;
++            uint32_t StreamIndex : 6;
++            uint32_t InstanceEnable : 1;
++            uint32_t ComponentControl0 : 3;
++            uint32_t ComponentControl1 : 3;
++            uint32_t ComponentControl2 : 3;
++            uint32_t ComponentControl3 : 3;
++            uint32_t ComponentPacking : 4;
++            uint32_t _reserved : 19;
++        };
++        uint64_t bits;
++    };
++    uint32_t InstanceDataStepRate;
++};
++
++// used to set ComponentPacking
++enum ComponentEnable
++{
++    NONE = 0x0,
++    X = 0x1,
++    Y = 0x2,
++    XY = 0x3,
++    Z = 0x4,
++    XZ = 0x5,
++    YZ = 0x6,
++    XYZ = 0x7,
++    W = 0x8,
++    XW = 0x9,
++    YW = 0xA,
++    XYW = 0xB,
++    ZW = 0xC,
++    XZW = 0xD,
++    YZW = 0xE,
++    XYZW = 0xF,
++};
++
++enum ComponentControl
++{
++    NoStore = 0,
++    StoreSrc = 1,
++    Store0 = 2,
++    Store1Fp = 3,
++    Store1Int = 4,
++};
++
++//////////////////////////////////////////////////////////////////////////
++/// State required for fetch shader jit compile.
++//////////////////////////////////////////////////////////////////////////
++struct FETCH_COMPILE_STATE
++{
++    uint32_t numAttribs;
++    INPUT_ELEMENT_DESC layout[KNOB_NUM_ATTRIBUTES];
++    SWR_FORMAT indexType;
++    uint32_t cutIndex{ 0xffffffff };
++
++    // Options that affect the JIT'd code
++    bool bDisableVGATHER;           // if enabled, FetchJit will generate loads/shuffles instead of VGATHERs
++    bool bDisableIndexOOBCheck;     // if enabled, FetchJit will exclude index OOB check
++    bool bEnableCutIndex{ false };  // compares indices with the cut index and returns a cut mask
++
++    FETCH_COMPILE_STATE(bool disableVGATHER = false, bool disableIndexOOBCheck = false) :
++        bDisableVGATHER(disableVGATHER), bDisableIndexOOBCheck(disableIndexOOBCheck){};
++
++    bool operator==(const FETCH_COMPILE_STATE &other) const
++    {
++        if (numAttribs != other.numAttribs) return false;
++        if (indexType != other.indexType) return false;
++        if (bDisableVGATHER != other.bDisableVGATHER) return false;
++        if (bDisableIndexOOBCheck != other.bDisableIndexOOBCheck) return false;
++        if (bEnableCutIndex != other.bEnableCutIndex) return false;
++        if (cutIndex != other.cutIndex) return false;
++
++        for(uint32_t i = 0; i < numAttribs; ++i)
++        {
++            if((layout[i].bits != other.layout[i].bits) ||
++               ((layout[i].InstanceEnable == 1) &&
++                (layout[i].InstanceDataStepRate != other.layout[i].InstanceDataStepRate))){
++                return false;
++            }
++        }
++
++        return true;
++    }
++};
+diff --git a/src/gallium/drivers/swr/rasterizer/jitter/jit_api.h b/src/gallium/drivers/swr/rasterizer/jitter/jit_api.h
+new file mode 100644
+index 0000000..afa33bb
+--- /dev/null
++++ b/src/gallium/drivers/swr/rasterizer/jitter/jit_api.h
+@@ -0,0 +1,105 @@
++/****************************************************************************
++* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
++* ++* Permission is hereby granted, free of charge, to any person obtaining a ++* copy of this software and associated documentation files (the "Software"), ++* to deal in the Software without restriction, including without limitation ++* the rights to use, copy, modify, merge, publish, distribute, sublicense, ++* and/or sell copies of the Software, and to permit persons to whom the ++* Software is furnished to do so, subject to the following conditions: ++* ++* The above copyright notice and this permission notice (including the next ++* paragraph) shall be included in all copies or substantial portions of the ++* Software. ++* ++* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++* IN THE SOFTWARE. ++* ++* @file jit_api.h ++* ++* @brief Platform independent JIT interface ++* ++* Notes: ++* ++******************************************************************************/ ++#pragma once ++#include "common/os.h" ++ ++#include "fetch_jit.h" ++#include "streamout_jit.h" ++#include "blend_jit.h" ++ ++#if defined(_WIN32) ++#define JITCALL __stdcall ++#else ++#define JITCALL ++#endif ++ ++extern "C" ++{ ++ ++struct ShaderInfo; ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief Create JIT context. ++HANDLE JITCALL JitCreateContext(uint32_t targetSimdWidth, const char* arch); ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief Destroy JIT context. ++void JITCALL JitDestroyContext(HANDLE hJitContext); ++ ++////////////////////////////////////////////////////////////////////////// ++/// Jit Compile Info Input ++////////////////////////////////////////////////////////////////////////// ++struct JIT_COMPILE_INPUT ++{ ++ SWR_SHADER_TYPE type; ++ ++ const void* pIR; ///< Pointer to LLVM IR text. ++ ++ bool enableJitSampler; ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief JIT compile shader. ++/// @param hJitContext - Jit Context ++/// @param input - Input containing LLVM IR and other information ++/// @param output - Output containing information about JIT shader ++/// @return HANDLE - pointer to shader object. ++HANDLE JITCALL JitCompileShader( ++ HANDLE hJitContext, ++ const JIT_COMPILE_INPUT& input, ++ ShaderInfo& output); ///@todo Move ShaderInfo into Jitter. ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief JIT destroy shader. ++/// @param hJitContext - Jit Context ++/// @param hShader - pointer to shader object. 
++void JITCALL JitDestroyShader( ++ HANDLE hJitContext, ++ HANDLE hShader); ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief JIT compiles fetch shader ++/// @param hJitContext - Jit Context ++/// @param state - Fetch state to build function from ++PFN_FETCH_FUNC JITCALL JitCompileFetch(HANDLE hJitContext, const FETCH_COMPILE_STATE& state); ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief JIT compiles streamout shader ++/// @param hJitContext - Jit Context ++/// @param state - SO state to build function from ++PFN_SO_FUNC JITCALL JitCompileStreamout(HANDLE hJitContext, const STREAMOUT_COMPILE_STATE& state); ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief JIT compiles blend shader ++/// @param hJitContext - Jit Context ++/// @param state - blend state to build function from ++PFN_BLEND_JIT_FUNC JITCALL JitCompileBlend(HANDLE hJitContext, const BLEND_COMPILE_STATE& state); ++ ++}; +diff --git a/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_types.py b/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_types.py +new file mode 100644 +index 0000000..268871b +--- /dev/null ++++ b/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_types.py +@@ -0,0 +1,334 @@ ++# Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. ++# ++# Permission is hereby granted, free of charge, to any person obtaining a ++# copy of this software and associated documentation files (the "Software"), ++# to deal in the Software without restriction, including without limitation ++# the rights to use, copy, modify, merge, publish, distribute, sublicense, ++# and/or sell copies of the Software, and to permit persons to whom the ++# Software is furnished to do so, subject to the following conditions: ++# ++# The above copyright notice and this permission notice (including the next ++# paragraph) shall be included in all copies or substantial portions of the ++# Software. ++# ++# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++# IN THE SOFTWARE. ++ ++#!deps/python32/python.exe ++ ++import os, sys, re ++import argparse ++import json as JSON ++import operator ++ ++header = r""" ++/**************************************************************************** ++* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. ++* ++* Permission is hereby granted, free of charge, to any person obtaining a ++* copy of this software and associated documentation files (the "Software"), ++* to deal in the Software without restriction, including without limitation ++* the rights to use, copy, modify, merge, publish, distribute, sublicense, ++* and/or sell copies of the Software, and to permit persons to whom the ++* Software is furnished to do so, subject to the following conditions: ++* ++* The above copyright notice and this permission notice (including the next ++* paragraph) shall be included in all copies or substantial portions of the ++* Software. 
++* ++* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++* IN THE SOFTWARE. ++* ++* @file %s ++* ++* @brief auto-generated file ++* ++* DO NOT EDIT ++* ++******************************************************************************/ ++ ++#pragma once ++ ++""" ++ ++""" ++""" ++def gen_file_header(filename): ++ global header ++ headerStr = header % filename ++ return headerStr.splitlines() ++ ++""" ++""" ++def gen_llvm_type(type, name, postfix_name, is_pointer, is_pointer_pointer, is_array, is_array_array, array_count, array_count1, is_llvm_struct, is_llvm_enum, is_llvm_pfn, output_file): ++ ++ llvm_type = '' ++ ++ if is_llvm_struct: ++ if is_pointer or is_pointer_pointer: ++ llvm_type = 'Type::getInt32Ty(ctx)' ++ else: ++ llvm_type = 'ArrayType::get(Type::getInt8Ty(ctx), sizeof(%s))' % type ++ elif is_llvm_enum: ++ llvm_type = 'Type::getInt32Ty(ctx)' ++ elif is_llvm_pfn: ++ llvm_type = 'PointerType::get(Type::getInt8Ty(ctx), 0)' ++ else: ++ if type == "BYTE" or type == "char" or type == "uint8_t" or type == "int8_t" or type == 'bool': ++ llvm_type = 'Type::getInt8Ty(ctx)' ++ elif type == 'UINT64' or type == 'INT64' or type == 'uint64_t' or type == 'int64_t': ++ llvm_type = 'Type::getInt64Ty(ctx)' ++ elif type == 'UINT16' or type == 'int16_t' or type == 'uint16_t': ++ llvm_type = 'Type::getInt16Ty(ctx)' ++ elif type == 'UINT' or type == 'INT' or type == 'int' or type == 'BOOL' or type == 'uint32_t' or type == 'int32_t': ++ llvm_type = 'Type::getInt32Ty(ctx)' ++ elif type == 'float' or type == 'FLOAT': ++ llvm_type = 'Type::getFloatTy(ctx)' ++ elif type == 'double' or type == 'DOUBLE': ++ llvm_type = 'Type::getDoubleTy(ctx)' ++ elif type == 'void' or type == 'VOID': ++ llvm_type = 'Type::getInt32Ty(ctx)' ++ elif type == 'HANDLE': ++ llvm_type = 'PointerType::get(Type::getInt32Ty(ctx), 0)' ++ elif type == 'simdscalar': ++ llvm_type = 'VectorType::get(Type::getFloatTy(ctx), pJitMgr->mVWidth)' ++ elif type == 'simdscalari': ++ llvm_type = 'VectorType::get(Type::getInt32Ty(ctx), pJitMgr->mVWidth)' ++ elif type == 'simdvector': ++ llvm_type = 'ArrayType::get(VectorType::get(Type::getFloatTy(ctx), pJitMgr->mVWidth), 4)' ++ else: ++ llvm_type = 'Gen_%s%s(pJitMgr)' % (type, postfix_name) ++ ++ if is_pointer: ++ llvm_type = 'PointerType::get(%s, 0)' % llvm_type ++ ++ if is_pointer_pointer: ++ llvm_type = 'PointerType::get(%s, 0)' % llvm_type ++ ++ if is_array_array: ++ llvm_type = 'ArrayType::get(ArrayType::get(%s, %s), %s)' % (llvm_type, array_count1, array_count) ++ elif is_array: ++ llvm_type = 'ArrayType::get(%s, %s)' % (llvm_type, array_count) ++ ++ return [' members.push_back( %s ); // %s' % (llvm_type, name)] ++ ++""" ++""" ++def gen_llvm_types(input_file, output_file): ++ ++ output_lines = gen_file_header(os.path.basename(output_file.name)) ++ ++ lines = input_file.readlines() ++ ++ postfix_name = "" ++ ++ for idx in range(len(lines)): ++ line = lines[idx].rstrip() ++ ++ match = re.match(r"(\s*)struct(\s*)(\w+)", line) ++ if match: ++ llvm_args = [] ++ ++ # Detect start of structure ++ is_fwd_decl = re.search(r";", line) ++ ++ if not is_fwd_decl: ++ ++ # Extract the command name ++ 
struct_name = match.group(3).strip() ++ ++ output_lines += [ ++ '//////////////////////////////////////////////////////////////////////////', ++ '/// Generate LLVM type information for %s' % struct_name, ++ 'INLINE static StructType *Gen_%s%s(JitManager* pJitMgr)' % (struct_name, postfix_name), ++ '{', ++ ' LLVMContext& ctx = pJitMgr->mContext;', ++ ' std::vector members;', ++ '', ++ ] ++ ++ end_of_struct = False ++ ++ while not end_of_struct and idx < len(lines)-1: ++ idx += 1 ++ line = lines[idx].rstrip() ++ ++ ########################################### ++ # Is field a llvm struct? Tells script to treat type as array of bytes that is size of structure. ++ is_llvm_struct = re.search(r"@llvm_struct", line) ++ ++ if is_llvm_struct is not None: ++ is_llvm_struct = True ++ else: ++ is_llvm_struct = False ++ ++ ########################################### ++ # Is field a llvm enum? Tells script to treat type as an enum and replaced with uint32 type. ++ is_llvm_enum = re.search(r"@llvm_enum", line) ++ ++ if is_llvm_enum is not None: ++ is_llvm_enum = True ++ else: ++ is_llvm_enum = False ++ ++ ########################################### ++ # Is field a llvm function pointer? Tells script to treat type as an enum and replaced with uint32 type. ++ is_llvm_pfn = re.search(r"@llvm_pfn", line) ++ ++ if is_llvm_pfn is not None: ++ is_llvm_pfn = True ++ else: ++ is_llvm_pfn = False ++ ++ ########################################### ++ # Is field const? ++ is_const = re.search(r"\s+const\s+", line) ++ ++ if is_const is not None: ++ is_const = True ++ else: ++ is_const = False ++ ++ ########################################### ++ # Is field a pointer? ++ is_pointer_pointer = re.search("\*\*", line) ++ ++ if is_pointer_pointer is not None: ++ is_pointer_pointer = True ++ else: ++ is_pointer_pointer = False ++ ++ ########################################### ++ # Is field a pointer? ++ is_pointer = re.search("\*", line) ++ ++ if is_pointer is not None: ++ is_pointer = True ++ else: ++ is_pointer = False ++ ++ ########################################### ++ # Is field an array of arrays? ++ # TODO: Can add this to a list. ++ is_array_array = re.search("\[(\w*)\]\[(\w*)\]", line) ++ array_count = '0' ++ array_count1 = '0' ++ ++ if is_array_array is not None: ++ array_count = is_array_array.group(1) ++ array_count1 = is_array_array.group(2) ++ is_array_array = True ++ else: ++ is_array_array = False ++ ++ ########################################### ++ # Is field an array? 
++ is_array = re.search("\[(\w*)\]", line) ++ ++ if is_array is not None: ++ array_count = is_array.group(1) ++ is_array = True ++ else: ++ is_array = False ++ ++ is_scoped = re.search("::", line) ++ ++ if is_scoped is not None: ++ is_scoped = True ++ else: ++ is_scoped = False ++ ++ type = None ++ name = None ++ if is_const and is_pointer: ++ ++ if is_scoped: ++ field_match = re.match(r"(\s*)(\w+\<*\w*\>*)(\s+)(\w+::)(\w+)(\s*\**\s*)(\w+)", line) ++ ++ type = "%s%s" % (field_match.group(4), field_match.group(5)) ++ name = field_match.group(7) ++ else: ++ field_match = re.match(r"(\s*)(\w+\<*\w*\>*)(\s+)(\w+)(\s*\**\s*)(\w+)", line) ++ ++ type = field_match.group(4) ++ name = field_match.group(6) ++ ++ elif is_pointer: ++ field_match = re.match(r"(\s*)(\s+)(\w+\<*\w*\>*)(\s*\**\s*)(\w+)", line) ++ ++ if field_match: ++ type = field_match.group(3) ++ name = field_match.group(5) ++ elif is_const: ++ field_match = re.match(r"(\s*)(\w+\<*\w*\>*)(\s+)(\w+)(\s*)(\w+)", line) ++ ++ if field_match: ++ type = field_match.group(4) ++ name = field_match.group(6) ++ else: ++ if is_scoped: ++ field_match = re.match(r"\s*(\w+\<*\w*\>*)\s*::\s*(\w+\<*\w*\>*)\s+(\w+)", line) ++ ++ if field_match: ++ type = field_match.group(1) + '::' + field_match.group(2) ++ name = field_match.group(3) ++ else: ++ field_match = re.match(r"(\s*)(\w+\<*\w*\>*)(\s+)(\w+)", line) ++ ++ if field_match: ++ type = field_match.group(2) ++ name = field_match.group(4) ++ ++ if type is not None: ++ output_lines += gen_llvm_type(type, name, postfix_name, is_pointer, is_pointer_pointer, is_array, is_array_array, array_count, array_count1, is_llvm_struct, is_llvm_enum, is_llvm_pfn, output_file) ++ llvm_args.append(name) ++ ++ # Detect end of structure ++ end_of_struct = re.match(r"(\s*)};", line) ++ ++ if (end_of_struct): ++ output_lines += [ ++ '', ++ ' return StructType::get(ctx, members, false);', ++ '}', ++ '', ++ ] ++ ++ for i in range(len(llvm_args)): ++ output_lines.append('static const uint32_t %s%s_%s = %s;' % (struct_name, postfix_name, llvm_args[i], i)) ++ ++ output_lines.append('') ++ ++ output_file.write('\n'.join(output_lines) + '\n') ++ ++""" ++ Function which is invoked when this script is started from a command line. ++ Will present and consume a set of arguments which will tell this script how ++ to behave ++""" ++def main(): ++ ++ # Parse args... ++ parser = argparse.ArgumentParser() ++ parser.add_argument("--input", "-i", type=argparse.FileType('r'), ++ help="Path to input file containing structs", required=True) ++ parser.add_argument("--output", "-o", type=argparse.FileType('w'), ++ help="Path to output file", required=True) ++ parser.add_argument("--scalar", "-scalar", help="Generates scalar files with all enums", action="store_true", default=False) ++ args = parser.parse_args() ++ ++ gen_llvm_types(args.input, args.output) ++ ++if __name__ == '__main__': ++ main() ++# END OF FILE +diff --git a/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp +new file mode 100644 +index 0000000..6a64a1c +--- /dev/null ++++ b/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp +@@ -0,0 +1,348 @@ ++/**************************************************************************** ++* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
++*
++* Permission is hereby granted, free of charge, to any person obtaining a
++* copy of this software and associated documentation files (the "Software"),
++* to deal in the Software without restriction, including without limitation
++* the rights to use, copy, modify, merge, publish, distribute, sublicense,
++* and/or sell copies of the Software, and to permit persons to whom the
++* Software is furnished to do so, subject to the following conditions:
++*
++* The above copyright notice and this permission notice (including the next
++* paragraph) shall be included in all copies or substantial portions of the
++* Software.
++*
++* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
++* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++* IN THE SOFTWARE.
++*
++* @file streamout_jit.cpp
++*
++* @brief Implementation of the streamout jitter
++*
++* Notes:
++*
++******************************************************************************/
++#include "jit_api.h"
++#include "streamout_jit.h"
++#include "builder.h"
++#include "state_llvm.h"
++#include "common/containers.hpp"
++#include "llvm/IR/DataLayout.h"
++
++#include <sstream>
++#include <unordered_set>
++
++//////////////////////////////////////////////////////////////////////////
++/// Interface to Jitting a streamout shader
++//////////////////////////////////////////////////////////////////////////
++struct StreamOutJit : public Builder
++{
++    StreamOutJit(JitManager* pJitMgr) : Builder(pJitMgr){};
++
++    // returns pointer to SWR_STREAMOUT_BUFFER
++    Value* getSOBuffer(Value* pSoCtx, uint32_t buffer)
++    {
++        return LOAD(pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_pBuffer, buffer });
++    }
++
++
++    //////////////////////////////////////////////////////////////////////////
++    // @brief checks if streamout buffer is oob
++    // @return true/false
++    Value* oob(const STREAMOUT_COMPILE_STATE& state, Value* pSoCtx, uint32_t buffer)
++    {
++        Value* returnMask = C(false);
++
++        Value* pBuf = getSOBuffer(pSoCtx, buffer);
++
++        // load enable
++        // @todo bool data types should generate llvm type
++        Value* enabled = TRUNC(LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_enable }), IRB()->getInt1Ty());
++
++        // load buffer size
++        Value* bufferSize = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_bufferSize });
++
++        // load current streamOffset
++        Value* streamOffset = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_streamOffset });
++
++        // load buffer pitch
++        Value* pitch = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_pitch });
++
++        // buffer is considered oob if in use in a decl but not enabled
++        returnMask = OR(returnMask, NOT(enabled));
++
++        // buffer is oob if cannot fit a prims worth of verts
++        Value* newOffset = ADD(streamOffset, MUL(pitch, C(state.numVertsPerPrim)));
++        returnMask = OR(returnMask, ICMP_SGT(newOffset, bufferSize));
++
++        return returnMask;
++    }
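++
++    // Worked example of the oob() predicate: with pitch = 16, numVertsPerPrim = 3
++    // and streamOffset = 250 into a bufferSize of 256, newOffset = 250 + 16*3 = 298,
++    // and 298 > 256 flags the buffer OOB, so buildStream() drops the whole
++    // primitive instead of writing it partially.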
++
++    //////////////////////////////////////////////////////////////////////////
++    // @brief converts scalar bitmask to <4 x i32> suitable for shuffle vector,
++    //        packing the active mask bits
++    //        ex. bitmask 0011 -> (0, 1, 0, 0)
++    //            bitmask 1000 -> (3, 0, 0, 0)
++    //            bitmask 1100 -> (2, 3, 0, 0)
++    Value* PackMask(uint32_t bitmask)
++    {
++        std::vector<Constant*> indices(4, C(0));
++        DWORD index;
++        uint32_t elem = 0;
++        while (_BitScanForward(&index, bitmask))
++        {
++            indices[elem++] = C((int)index);
++            bitmask &= ~(1 << index);
++        }
++
++        return ConstantVector::get(indices);
++    }
++
++    //////////////////////////////////////////////////////////////////////////
++    // @brief convert scalar bitmask to <4xfloat> bitmask
++    Value* ToMask(uint32_t bitmask)
++    {
++        std::vector<Constant*> indices;
++        for (uint32_t i = 0; i < 4; ++i)
++        {
++            if (bitmask & (1 << i))
++            {
++                indices.push_back(C(-1.0f));
++            }
++            else
++            {
++                indices.push_back(C(0.0f));
++            }
++        }
++        return ConstantVector::get(indices);
++    }
++
++    //////////////////////////////////////////////////////////////////////////
++    // @brief processes a single decl from the streamout stream. Reads 4 components from the input
++    //        stream and writes N components to the output buffer given the componentMask, or if
++    //        a hole, just increments the buffer pointer
++    // @param pStream - pointer to current attribute
++    // @param pOutBuffers - pointers to the current location of each output buffer
++    // @param decl - input decl
++    void buildDecl(Value* pStream, Value* pOutBuffers[4], const STREAMOUT_DECL& decl)
++    {
++        // @todo add this to x86 macros
++        Function* maskStore = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_maskstore_ps);
++
++        uint32_t numComponents = _mm_popcnt_u32(decl.componentMask);
++        uint32_t packedMask = (1 << numComponents) - 1;
++        if (!decl.hole)
++        {
++            // increment stream pointer to correct slot
++            Value* pAttrib = GEP(pStream, C(4 * decl.attribSlot));
++
++            // load 4 components from stream
++            Type* simd4Ty = VectorType::get(IRB()->getFloatTy(), 4);
++            Type* simd4PtrTy = PointerType::get(simd4Ty, 0);
++            pAttrib = BITCAST(pAttrib, simd4PtrTy);
++            Value* vattrib = LOAD(pAttrib);
++
++            // shuffle/pack enabled components
++            Value* vpackedAttrib = VSHUFFLE(vattrib, vattrib, PackMask(decl.componentMask));
++
++            // store to output buffer
++            // cast SO buffer to i8*, needed by maskstore
++            Value* pOut = BITCAST(pOutBuffers[decl.bufferIndex], PointerType::get(mInt8Ty, 0));
++
++            // cast input to <4xfloat>
++            Value* src = BITCAST(vpackedAttrib, simd4Ty);
++            CALL3(maskStore, pOut, ToMask(packedMask), src);
++        }
++
++        // increment SO buffer
++        pOutBuffers[decl.bufferIndex] = GEP(pOutBuffers[decl.bufferIndex], C(numComponents));
++    }
++
++    //////////////////////////////////////////////////////////////////////////
++    // @brief builds a single vertex worth of data for the given stream
++    // @param streamState - state for this stream
++    // @param pCurVertex - pointer to src stream vertex data
++    // @param pOutBuffer - pointers to up to 4 SO buffers
++    void buildVertex(const STREAMOUT_STREAM& streamState, Value* pCurVertex, Value* pOutBuffer[4])
++    {
++        for (uint32_t d = 0; d < streamState.numDecls; ++d)
++        {
++            const STREAMOUT_DECL& decl = streamState.decl[d];
++            buildDecl(pCurVertex, pOutBuffer, decl);
++        }
++    }
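++
++    // PackMask/ToMask in action for componentMask 0b1010 (y and w enabled):
++    // PackMask yields the shuffle vector (1, 3, 0, 0), which packs y and w into
++    // the low two slots, and ToMask(packedMask) with packedMask = 0b11 yields
++    // (-1, -1, 0, 0), so the maskstore writes exactly numComponents = 2 floats.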
++
++    void buildStream(const STREAMOUT_COMPILE_STATE& state, const STREAMOUT_STREAM& streamState, Value* pSoCtx, BasicBlock* returnBB, Function* soFunc)
++    {
++        // get list of active SO buffers
++        std::unordered_set<uint32_t> activeSOBuffers;
++        for (uint32_t d = 0; d < streamState.numDecls; ++d)
++        {
++            const STREAMOUT_DECL& decl = streamState.decl[d];
++            activeSOBuffers.insert(decl.bufferIndex);
++        }
++
++        // always increment numPrimStorageNeeded
++        Value* numPrimStorageNeeded = LOAD(pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_numPrimStorageNeeded });
++        numPrimStorageNeeded = ADD(numPrimStorageNeeded, C(1));
++        STORE(numPrimStorageNeeded, pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_numPrimStorageNeeded });
++
++        // check OOB on active SO buffers. If any buffer is out of bounds, don't write
++        // the primitive to any buffer
++        Value* oobMask = C(false);
++        for (uint32_t buffer : activeSOBuffers)
++        {
++            oobMask = OR(oobMask, oob(state, pSoCtx, buffer));
++        }
++
++        BasicBlock* validBB = BasicBlock::Create(JM()->mContext, "valid", soFunc);
++
++        // early out if OOB
++        COND_BR(oobMask, returnBB, validBB);
++
++        IRB()->SetInsertPoint(validBB);
++
++        Value* numPrimsWritten = LOAD(pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_numPrimsWritten });
++        numPrimsWritten = ADD(numPrimsWritten, C(1));
++        STORE(numPrimsWritten, pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_numPrimsWritten });
++
++        // compute start pointer for each output buffer
++        Value* pOutBuffer[4];
++        Value* pOutBufferStartVertex[4];
++        Value* outBufferPitch[4];
++        for (uint32_t b : activeSOBuffers)
++        {
++            Value* pBuf = getSOBuffer(pSoCtx, b);
++            Value* pData = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_pBuffer });
++            Value* streamOffset = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_streamOffset });
++            pOutBuffer[b] = GEP(pData, streamOffset);
++            pOutBufferStartVertex[b] = pOutBuffer[b];
++
++            outBufferPitch[b] = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_pitch });
++        }
++
++        // loop over the vertices of the prim
++        Value* pStreamData = LOAD(pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_pPrimData });
++        for (uint32_t v = 0; v < state.numVertsPerPrim; ++v)
++        {
++            buildVertex(streamState, pStreamData, pOutBuffer);
++
++            // increment stream and output buffer pointers
++            // stream verts are always 32*4 dwords apart
++            pStreamData = GEP(pStreamData, C(KNOB_NUM_ATTRIBUTES * 4));
++
++            // output buffers offset using pitch in buffer state
++            for (uint32_t b : activeSOBuffers)
++            {
++                pOutBufferStartVertex[b] = GEP(pOutBufferStartVertex[b], outBufferPitch[b]);
++                pOutBuffer[b] = pOutBufferStartVertex[b];
++            }
++        }
++
++        // update each active buffer's streamOffset
++        for (uint32_t b : activeSOBuffers)
++        {
++            Value* pBuf = getSOBuffer(pSoCtx, b);
++            Value* streamOffset = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_streamOffset });
++            streamOffset = ADD(streamOffset, MUL(C(state.numVertsPerPrim), outBufferPitch[b]));
++            STORE(streamOffset, pBuf, { 0, SWR_STREAMOUT_BUFFER_streamOffset });
++        }
++    }
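++
++    // Note on the pointer stepping above: within a vertex, buildDecl() advances
++    // pOutBuffer[b] by each decl's numComponents; across vertices, the pointer is
++    // re-based from pOutBufferStartVertex[b] plus the buffer pitch. In dwords,
++    // the write position of decl d in vertex v is (sketch):
++    //
++    //     pos(b, v, d) = streamOffset + v * pitch[b] + sum(numComponents of decls before d)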
++
++    Function* Create(const STREAMOUT_COMPILE_STATE& state)
++    {
++        static std::size_t soNum = 0;
++
++        std::stringstream fnName("SOShader", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
++        fnName << soNum++;
++
++        // SO function signature
++        // typedef void(__cdecl *PFN_SO_FUNC)(SWR_STREAMOUT_CONTEXT*)
++
++        std::vector<Type*> args{
++            PointerType::get(Gen_SWR_STREAMOUT_CONTEXT(JM()), 0), // SWR_STREAMOUT_CONTEXT*
++        };
++
++        FunctionType* fTy = FunctionType::get(IRB()->getVoidTy(), args, false);
++        Function* soFunc = Function::Create(fTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
++
++        // create return basic block
++        BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", soFunc);
++        BasicBlock* returnBB = BasicBlock::Create(JM()->mContext, "return", soFunc);
++
++        IRB()->SetInsertPoint(entry);
++
++        // arguments
++        auto argitr = soFunc->getArgumentList().begin();
++        Value* pSoCtx = argitr++;
++        pSoCtx->setName("pSoCtx");
++
++        const STREAMOUT_STREAM& streamState = state.stream;
++        buildStream(state, streamState, pSoCtx, returnBB, soFunc);
++
++        BR(returnBB);
++
++        IRB()->SetInsertPoint(returnBB);
++        RET_VOID();
++
++        JitManager::DumpToFile(soFunc, "SoFunc");
++
++        FunctionPassManager passes(JM()->mpCurrentModule);
++        passes.add(createBreakCriticalEdgesPass());
++        passes.add(createCFGSimplificationPass());
++        passes.add(createEarlyCSEPass());
++        passes.add(createPromoteMemoryToRegisterPass());
++        passes.add(createCFGSimplificationPass());
++        passes.add(createEarlyCSEPass());
++        passes.add(createInstructionCombiningPass());
++        passes.add(createInstructionSimplifierPass());
++        passes.add(createConstantPropagationPass());
++        passes.add(createSCCPPass());
++        passes.add(createAggressiveDCEPass());
++
++        passes.run(*soFunc);
++
++        JitManager::DumpToFile(soFunc, "SoFunc_optimized");
++
++        return soFunc;
++    }
++};
++
++//////////////////////////////////////////////////////////////////////////
++/// @brief JITs from streamout shader IR
++/// @param hJitMgr - JitManager handle
++/// @param func - LLVM function IR
++/// @return PFN_SO_FUNC - pointer to SOS function
++PFN_SO_FUNC JitStreamoutFunc(HANDLE hJitMgr, const HANDLE hFunc)
++{
++    const llvm::Function *func = (const llvm::Function*)hFunc;
++    JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
++    PFN_SO_FUNC pfnStreamOut;
++    pfnStreamOut = (PFN_SO_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));
++    // MCJIT finalizes modules the first time you JIT code from them. Once finalized, you cannot add new IR to the module
++    pJitMgr->mIsModuleFinalized = true;
++
++    return pfnStreamOut;
++}
++
++//////////////////////////////////////////////////////////////////////////
++/// @brief JIT compiles streamout shader
++/// @param hJitMgr - JitManager handle
++/// @param state - SO state to build function from
++extern "C" PFN_SO_FUNC JITCALL JitCompileStreamout(HANDLE hJitMgr, const STREAMOUT_COMPILE_STATE& state)
++{
++    JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
++
++    pJitMgr->SetupNewModule();
++
++    StreamOutJit theJit(pJitMgr);
++    HANDLE hFunc = theJit.Create(state);
++
++    return JitStreamoutFunc(hJitMgr, hFunc);
++}
+diff --git a/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.h b/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.h
+new file mode 100644
+index 0000000..4372a9d
+--- /dev/null
++++ b/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.h
+@@ -0,0 +1,91 @@
++/****************************************************************************
++* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
++*
++* Permission is hereby granted, free of charge, to any person obtaining a
++* copy of this software and associated documentation files (the "Software"),
++* to deal in the Software without restriction, including without limitation
++* the rights to use, copy, modify, merge, publish, distribute, sublicense,
++* and/or sell copies of the Software, and to permit persons to whom the
++* Software is furnished to do so, subject to the following conditions:
++*
++* The above copyright notice and this permission notice (including the next
++* paragraph) shall be included in all copies or substantial portions of the
++* Software.
++*
++* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
++* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++* IN THE SOFTWARE.
++*
++* @file streamout_jit.h
++*
++* @brief Definition of the streamout jitter
++*
++* Notes:
++*
++******************************************************************************/
++#pragma once
++
++#include "common/formats.h"
++#include "core/state.h"
++
++//////////////////////////////////////////////////////////////////////////
++/// STREAMOUT_DECL - Stream decl
++//////////////////////////////////////////////////////////////////////////
++struct STREAMOUT_DECL
++{
++    // Buffer that stream maps to.
++    DWORD bufferIndex;
++
++    // attribute to stream
++    uint32_t attribSlot;
++
++    // attribute component mask
++    uint32_t componentMask;
++
++    // indicates this decl is a hole
++    bool hole;
++};
++
++//////////////////////////////////////////////////////////////////////////
++/// STREAMOUT_STREAM - Stream decls
++//////////////////////////////////////////////////////////////////////////
++struct STREAMOUT_STREAM
++{
++    // number of decls for this stream
++    uint32_t numDecls;
++
++    // array of numDecls decls
++    STREAMOUT_DECL decl[128];
++};
++
++//////////////////////////////////////////////////////////////////////////
++/// State required for streamout jit
++//////////////////////////////////////////////////////////////////////////
++struct STREAMOUT_COMPILE_STATE
++{
++    // number of verts per primitive
++    uint32_t numVertsPerPrim;
++
++    // stream decls
++    STREAMOUT_STREAM stream;
++
++    bool operator==(const STREAMOUT_COMPILE_STATE &other) const
++    {
++        if (numVertsPerPrim != other.numVertsPerPrim) return false;
++        if (stream.numDecls != other.stream.numDecls) return false;
++
++        for (uint32_t i = 0; i < stream.numDecls; ++i)
++        {
++            if (stream.decl[i].bufferIndex != other.stream.decl[i].bufferIndex) return false;
++            if (stream.decl[i].attribSlot != other.stream.decl[i].attribSlot) return false;
++            if (stream.decl[i].componentMask != other.stream.decl[i].componentMask) return false;
++            if (stream.decl[i].hole != other.stream.decl[i].hole) return false;
++        }
++
++        return true;
++    }
++};
+diff --git a/src/gallium/drivers/swr/rasterizer/memory/ClearTile.cpp b/src/gallium/drivers/swr/rasterizer/memory/ClearTile.cpp
+new file mode 100644
+index 0000000..ad73cd8
+--- /dev/null
++++ b/src/gallium/drivers/swr/rasterizer/memory/ClearTile.cpp
+@@ -0,0 +1,287 @@
++/****************************************************************************
++* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
++*
++* Permission is hereby granted, free of charge, to any person obtaining a
++* copy of this software and associated documentation files (the "Software"),
++* to deal in the Software without restriction, including without limitation
++* the rights to use, copy, modify, merge, publish, distribute, sublicense,
++* and/or sell copies of the Software, and to permit persons to whom the
++* Software is furnished to do so, subject to the following conditions:
++*
++* The above copyright notice and this permission notice (including the next
++* paragraph) shall be included in all copies or substantial portions of the
++* Software.
++*
++* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
++* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++* IN THE SOFTWARE.
++*
++* @file ClearTile.cpp
++*
++* @brief Functionality for ClearTile. StoreHotTileClear clears a single macro
++*        tile in the destination.
++*
++******************************************************************************/
++#include "common/os.h"
++#include "core/context.h"
++#include "common/formats.h"
++#include "memory/TilingFunctions.h"
++#include "memory/tilingtraits.h"
++#include "memory/Convert.h"
++
++typedef void(*PFN_STORE_TILES_CLEAR)(const FLOAT*, SWR_SURFACE_STATE*, UINT, UINT);
++
++//////////////////////////////////////////////////////////////////////////
++/// Clear Raster Tile Function Tables.
++//////////////////////////////////////////////////////////////////////////
++static PFN_STORE_TILES_CLEAR sStoreTilesClearColorTable[NUM_SWR_FORMATS];
++
++static PFN_STORE_TILES_CLEAR sStoreTilesClearDepthTable[NUM_SWR_FORMATS];
++
++//////////////////////////////////////////////////////////////////////////
++/// StoreRasterTileClear
++//////////////////////////////////////////////////////////////////////////
++template<SWR_FORMAT DstFormat>
++struct StoreRasterTileClear
++{
++    //////////////////////////////////////////////////////////////////////////
++    /// @brief Stores an 8x8 raster tile to the destination surface.
++    /// @param pColor - Pointer to clear color.
++    /// @param pDstSurface - Destination surface state
++    /// @param x, y - Coordinates to raster tile.
++    INLINE static void StoreClear(
++        const BYTE* dstFormattedColor,
++        UINT dstBytesPerPixel,
++        SWR_SURFACE_STATE* pDstSurface,
++        UINT x, UINT y) // (x, y) pixel coordinate to start of raster tile.
++    {
++        // Compute destination address for raster tile.
++        BYTE* pDstTile = (BYTE*)pDstSurface->pBaseAddress +
++            (y * pDstSurface->pitch) + (x * dstBytesPerPixel);
++
++        // start of first row
++        BYTE* pDst = pDstTile;
++        UINT dstBytesPerRow = 0;
++
++        // For each raster tile pixel in row 0 (rx, 0)
++        for (UINT rx = 0; (rx < KNOB_TILE_X_DIM) && ((x + rx) < pDstSurface->width); ++rx)
++        {
++            memcpy(pDst, dstFormattedColor, dstBytesPerPixel);
++
++            // Increment pointer to next pixel in row.
++            pDst += dstBytesPerPixel;
++            dstBytesPerRow += dstBytesPerPixel;
++        }
++
++        // start of second row
++        pDst = pDstTile + pDstSurface->pitch;
++
++        // For each remaining row in the rest of the raster tile
++        for (UINT ry = 1; (ry < KNOB_TILE_Y_DIM) && ((y + ry) < pDstSurface->height); ++ry)
++        {
++            // copy row
++            memcpy(pDst, pDstTile, dstBytesPerRow);
++
++            // Increment pointer to first pixel in next row.
++            pDst += pDstSurface->pitch;
++        }
++    }
++};
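The raster-tile clear converts the clear color once, writes it across the first row pixel by pixel, and then replicates that row with a single memcpy per remaining row. The same pattern in a standalone sketch (hypothetical helper for a linear surface, not an SWR entry point):

    #include <cstdint>
    #include <cstring>

    // Clear a width x height pixel region: fill row 0 pixel by pixel, then
    // replicate row 0 with one memcpy per remaining row.
    static void clear_region(uint8_t* dst, uint32_t pitch, const uint8_t* pixel,
                             uint32_t bytesPerPixel, uint32_t width, uint32_t height)
    {
        for (uint32_t x = 0; x < width; ++x)
            std::memcpy(dst + x * bytesPerPixel, pixel, bytesPerPixel);

        for (uint32_t y = 1; y < height; ++y)
            std::memcpy(dst + y * pitch, dst, width * bytesPerPixel);
    }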
++
++//////////////////////////////////////////////////////////////////////////
++/// StoreMacroTileClear - Stores a macro tile clear to its raster tiles.
++//////////////////////////////////////////////////////////////////////////
++template<SWR_FORMAT DstFormat>
++struct StoreMacroTileClear
++{
++    //////////////////////////////////////////////////////////////////////////
++    /// @brief Stores a macrotile to the destination surface.
++    /// @param pColor - Pointer to color to write to pixels.
++    /// @param pDstSurface - Destination surface state
++    /// @param x, y - Coordinates to macro tile
++    static void StoreClear(
++        const FLOAT *pColor,
++        SWR_SURFACE_STATE* pDstSurface,
++        UINT x, UINT y)
++    {
++        UINT dstBytesPerPixel = (FormatTraits<DstFormat>::bpp / 8);
++
++        BYTE dstFormattedColor[16]; // max bpp is 128, so 16 is all we need here for one pixel
++
++        FLOAT srcColor[4];
++
++        for (UINT comp = 0; comp < FormatTraits<DstFormat>::numComps; ++comp)
++        {
++            srcColor[comp] = pColor[FormatTraits<DstFormat>::swizzle(comp)];
++        }
++
++        // using this helper function, but the Tiling Traits is unused inside it so just using a dummy value
++        ConvertPixelFromFloat<DstFormat>(dstFormattedColor, srcColor);
++
++        // Store each raster tile from the hot tile to the destination surface.
++        // TODO: Put in check for partial coverage on x/y -- SWR_ASSERT if it happens.
++        // Intent is for this function to only handle full tiles.
++        for (UINT row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
++        {
++            for (UINT col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
++            {
++                StoreRasterTileClear<DstFormat>::StoreClear(dstFormattedColor, dstBytesPerPixel, pDstSurface, (x + col), (y + row));
++            }
++        }
++    }
++};
++
++//////////////////////////////////////////////////////////////////////////
++/// @brief Writes clear color to every pixel of a render surface
++/// @param pDstSurface - Destination surface state
++/// @param renderTargetIndex - Index to destination render target
++/// @param x, y - Coordinates to raster tile.
++/// @param pClearColor - Pointer to clear color
++void StoreHotTileClear(
++    SWR_SURFACE_STATE *pDstSurface,
++    SWR_RENDERTARGET_ATTACHMENT renderTargetIndex,
++    UINT x,
++    UINT y,
++    const float* pClearColor)
++{
++    PFN_STORE_TILES_CLEAR pfnStoreTilesClear = NULL;
++
++    SWR_ASSERT(renderTargetIndex != SWR_ATTACHMENT_STENCIL); ///@todo Not supported yet.
++
++    if (renderTargetIndex != SWR_ATTACHMENT_DEPTH)
++    {
++        pfnStoreTilesClear = sStoreTilesClearColorTable[pDstSurface->format];
++    }
++    else
++    {
++        pfnStoreTilesClear = sStoreTilesClearDepthTable[pDstSurface->format];
++    }
++
++    SWR_ASSERT(pfnStoreTilesClear != NULL);
++
++    // Store a macro tile.
++    /// @todo Once all formats are supported then if check can go away. This is to help us near term to make progress.
++    if (pfnStoreTilesClear != NULL)
++    {
++        pfnStoreTilesClear(pClearColor, pDstSurface, x, y);
++    }
++}
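StoreHotTileClear dispatches through a per-format function-pointer table, with one StoreMacroTileClear instantiation stamped out per format, as the INIT macros below populate. Reduced to its essentials, the idiom looks like this (illustrative sketch; the Format enum and Clearer struct are stand-ins, not SWR names):

    #include <cstdint>

    enum Format { FMT_R32_FLOAT, FMT_R8_UNORM, NUM_FORMATS };

    typedef void (*ClearFn)(uint8_t* dst);

    // One instantiation per format; the compiler folds in the per-format details.
    template <Format F>
    struct Clearer
    {
        static void StoreClear(uint8_t* dst) { (void)dst; /* format-specific fill */ }
    };

    static ClearFn sClearTable[NUM_FORMATS];

    static void init_table()
    {
        sClearTable[FMT_R32_FLOAT] = Clearer<FMT_R32_FLOAT>::StoreClear;
        sClearTable[FMT_R8_UNORM]  = Clearer<FMT_R8_UNORM>::StoreClear;
    }

    // Runtime dispatch is then a single indexed call:
    //     init_table(); sClearTable[FMT_R8_UNORM](dst);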
++
++//////////////////////////////////////////////////////////////////////////
++/// INIT_STORE_TILES_TABLE - Helper macro for setting up the tables.
++#define INIT_STORE_TILES_CLEAR_COLOR_TABLE() \
++    memset(sStoreTilesClearColorTable, 0, sizeof(sStoreTilesClearColorTable)); \
++    \
++    sStoreTilesClearColorTable[R32G32B32A32_FLOAT] = StoreMacroTileClear<R32G32B32A32_FLOAT>::StoreClear; \
++    sStoreTilesClearColorTable[R32G32B32A32_SINT] = StoreMacroTileClear<R32G32B32A32_SINT>::StoreClear; \
++    sStoreTilesClearColorTable[R32G32B32A32_UINT] = StoreMacroTileClear<R32G32B32A32_UINT>::StoreClear; \
++    sStoreTilesClearColorTable[R32G32B32X32_FLOAT] = StoreMacroTileClear<R32G32B32X32_FLOAT>::StoreClear; \
++    sStoreTilesClearColorTable[R32G32B32_FLOAT] = StoreMacroTileClear<R32G32B32_FLOAT>::StoreClear; \
++    sStoreTilesClearColorTable[R32G32B32_SINT] = StoreMacroTileClear<R32G32B32_SINT>::StoreClear; \
++    sStoreTilesClearColorTable[R32G32B32_UINT] = StoreMacroTileClear<R32G32B32_UINT>::StoreClear; \
++    sStoreTilesClearColorTable[R16G16B16A16_UNORM] = StoreMacroTileClear<R16G16B16A16_UNORM>::StoreClear; \
++    sStoreTilesClearColorTable[R16G16B16A16_SNORM] = StoreMacroTileClear<R16G16B16A16_SNORM>::StoreClear; \
++    sStoreTilesClearColorTable[R16G16B16A16_SINT] = StoreMacroTileClear<R16G16B16A16_SINT>::StoreClear; \
++    sStoreTilesClearColorTable[R16G16B16A16_UINT] = StoreMacroTileClear<R16G16B16A16_UINT>::StoreClear; \
++    sStoreTilesClearColorTable[R16G16B16A16_FLOAT] = StoreMacroTileClear<R16G16B16A16_FLOAT>::StoreClear; \
++    sStoreTilesClearColorTable[R32G32_FLOAT] = StoreMacroTileClear<R32G32_FLOAT>::StoreClear; \
++    sStoreTilesClearColorTable[R32G32_SINT] = StoreMacroTileClear<R32G32_SINT>::StoreClear; \
++    sStoreTilesClearColorTable[R32G32_UINT] = StoreMacroTileClear<R32G32_UINT>::StoreClear; \
++    sStoreTilesClearColorTable[R16G16B16X16_UNORM] = StoreMacroTileClear<R16G16B16X16_UNORM>::StoreClear; \
++    sStoreTilesClearColorTable[R16G16B16X16_FLOAT] = StoreMacroTileClear<R16G16B16X16_FLOAT>::StoreClear; \
++    sStoreTilesClearColorTable[B8G8R8A8_UNORM] = StoreMacroTileClear<B8G8R8A8_UNORM>::StoreClear; \
++    sStoreTilesClearColorTable[B8G8R8A8_UNORM_SRGB] = StoreMacroTileClear<B8G8R8A8_UNORM_SRGB>::StoreClear; \
++    sStoreTilesClearColorTable[R10G10B10A2_UNORM] = StoreMacroTileClear<R10G10B10A2_UNORM>::StoreClear; \
++    sStoreTilesClearColorTable[R10G10B10A2_UNORM_SRGB] = StoreMacroTileClear<R10G10B10A2_UNORM_SRGB>::StoreClear; \
++    sStoreTilesClearColorTable[R10G10B10A2_UINT] = StoreMacroTileClear<R10G10B10A2_UINT>::StoreClear; \
++    sStoreTilesClearColorTable[R8G8B8A8_UNORM] = StoreMacroTileClear<R8G8B8A8_UNORM>::StoreClear; \
++    sStoreTilesClearColorTable[R8G8B8A8_UNORM_SRGB] = StoreMacroTileClear<R8G8B8A8_UNORM_SRGB>::StoreClear; \
++    sStoreTilesClearColorTable[R8G8B8A8_SNORM] = StoreMacroTileClear<R8G8B8A8_SNORM>::StoreClear; \
++    sStoreTilesClearColorTable[R8G8B8A8_SINT] = StoreMacroTileClear<R8G8B8A8_SINT>::StoreClear; \
++    sStoreTilesClearColorTable[R8G8B8A8_UINT] = StoreMacroTileClear<R8G8B8A8_UINT>::StoreClear; \
++    sStoreTilesClearColorTable[R16G16_UNORM] = StoreMacroTileClear<R16G16_UNORM>::StoreClear; \
++    sStoreTilesClearColorTable[R16G16_SNORM] = StoreMacroTileClear<R16G16_SNORM>::StoreClear; \
++    sStoreTilesClearColorTable[R16G16_SINT] = StoreMacroTileClear<R16G16_SINT>::StoreClear; \
++    sStoreTilesClearColorTable[R16G16_UINT] = StoreMacroTileClear<R16G16_UINT>::StoreClear; \
++    sStoreTilesClearColorTable[R16G16_FLOAT] = StoreMacroTileClear<R16G16_FLOAT>::StoreClear; \
++    sStoreTilesClearColorTable[B10G10R10A2_UNORM] = StoreMacroTileClear<B10G10R10A2_UNORM>::StoreClear; \
++    sStoreTilesClearColorTable[B10G10R10A2_UNORM_SRGB] = StoreMacroTileClear<B10G10R10A2_UNORM_SRGB>::StoreClear; \
++    sStoreTilesClearColorTable[R11G11B10_FLOAT] = StoreMacroTileClear<R11G11B10_FLOAT>::StoreClear; \
++    sStoreTilesClearColorTable[R32_SINT] = StoreMacroTileClear<R32_SINT>::StoreClear; \
++    sStoreTilesClearColorTable[R32_UINT] = StoreMacroTileClear<R32_UINT>::StoreClear; \
++    sStoreTilesClearColorTable[R32_FLOAT] = StoreMacroTileClear<R32_FLOAT>::StoreClear; \
++    sStoreTilesClearColorTable[A32_FLOAT] = StoreMacroTileClear<A32_FLOAT>::StoreClear; \
++    sStoreTilesClearColorTable[B8G8R8X8_UNORM] = StoreMacroTileClear<B8G8R8X8_UNORM>::StoreClear; \
++    sStoreTilesClearColorTable[B8G8R8X8_UNORM_SRGB] = StoreMacroTileClear<B8G8R8X8_UNORM_SRGB>::StoreClear; \
++    sStoreTilesClearColorTable[R8G8B8X8_UNORM] = StoreMacroTileClear<R8G8B8X8_UNORM>::StoreClear; \
++    sStoreTilesClearColorTable[R8G8B8X8_UNORM_SRGB] = StoreMacroTileClear<R8G8B8X8_UNORM_SRGB>::StoreClear; \
++    sStoreTilesClearColorTable[B10G10R10X2_UNORM] = StoreMacroTileClear<B10G10R10X2_UNORM>::StoreClear; \
++    sStoreTilesClearColorTable[B5G6R5_UNORM] = StoreMacroTileClear<B5G6R5_UNORM>::StoreClear; \
++    sStoreTilesClearColorTable[B5G6R5_UNORM_SRGB] = StoreMacroTileClear<B5G6R5_UNORM_SRGB>::StoreClear; \
++    sStoreTilesClearColorTable[B5G5R5A1_UNORM] = StoreMacroTileClear<B5G5R5A1_UNORM>::StoreClear; \
++    sStoreTilesClearColorTable[B5G5R5A1_UNORM_SRGB] = StoreMacroTileClear<B5G5R5A1_UNORM_SRGB>::StoreClear; \
++    sStoreTilesClearColorTable[B4G4R4A4_UNORM] = StoreMacroTileClear<B4G4R4A4_UNORM>::StoreClear; \
++    sStoreTilesClearColorTable[B4G4R4A4_UNORM_SRGB] = StoreMacroTileClear<B4G4R4A4_UNORM_SRGB>::StoreClear; \
++    sStoreTilesClearColorTable[R8G8_UNORM] = StoreMacroTileClear<R8G8_UNORM>::StoreClear; \
++    sStoreTilesClearColorTable[R8G8_SNORM] = StoreMacroTileClear<R8G8_SNORM>::StoreClear; \
++    sStoreTilesClearColorTable[R8G8_SINT] = StoreMacroTileClear<R8G8_SINT>::StoreClear; \
++    sStoreTilesClearColorTable[R8G8_UINT] = StoreMacroTileClear<R8G8_UINT>::StoreClear; \
++    sStoreTilesClearColorTable[R16_UNORM] = StoreMacroTileClear<R16_UNORM>::StoreClear; \
++    sStoreTilesClearColorTable[R16_SNORM] = StoreMacroTileClear<R16_SNORM>::StoreClear; \
++    sStoreTilesClearColorTable[R16_SINT] = StoreMacroTileClear<R16_SINT>::StoreClear; \
++    sStoreTilesClearColorTable[R16_UINT] = StoreMacroTileClear<R16_UINT>::StoreClear; \
++    sStoreTilesClearColorTable[R16_FLOAT] = StoreMacroTileClear<R16_FLOAT>::StoreClear; \
++    sStoreTilesClearColorTable[A16_UNORM] = StoreMacroTileClear<A16_UNORM>::StoreClear; \
++    sStoreTilesClearColorTable[A16_FLOAT] = StoreMacroTileClear<A16_FLOAT>::StoreClear; \
++    sStoreTilesClearColorTable[B5G5R5X1_UNORM] = StoreMacroTileClear<B5G5R5X1_UNORM>::StoreClear; \
++    sStoreTilesClearColorTable[B5G5R5X1_UNORM_SRGB] = StoreMacroTileClear<B5G5R5X1_UNORM_SRGB>::StoreClear; \
++    sStoreTilesClearColorTable[R8_UNORM] = StoreMacroTileClear<R8_UNORM>::StoreClear; \
++    sStoreTilesClearColorTable[R8_SNORM] = StoreMacroTileClear<R8_SNORM>::StoreClear; \
++    sStoreTilesClearColorTable[R8_SINT] = StoreMacroTileClear<R8_SINT>::StoreClear; \
++    sStoreTilesClearColorTable[R8_UINT] = StoreMacroTileClear<R8_UINT>::StoreClear; \
++    sStoreTilesClearColorTable[A8_UNORM] = StoreMacroTileClear<A8_UNORM>::StoreClear; \
++    sStoreTilesClearColorTable[BC1_UNORM] = StoreMacroTileClear<BC1_UNORM>::StoreClear; \
++    sStoreTilesClearColorTable[BC2_UNORM] = StoreMacroTileClear<BC2_UNORM>::StoreClear; \
++    sStoreTilesClearColorTable[BC3_UNORM] = StoreMacroTileClear<BC3_UNORM>::StoreClear; \
++    sStoreTilesClearColorTable[BC4_UNORM] = StoreMacroTileClear<BC4_UNORM>::StoreClear; \
++    sStoreTilesClearColorTable[BC5_UNORM] = StoreMacroTileClear<BC5_UNORM>::StoreClear; \
++    sStoreTilesClearColorTable[BC1_UNORM_SRGB] = StoreMacroTileClear<BC1_UNORM_SRGB>::StoreClear; \
++    sStoreTilesClearColorTable[BC2_UNORM_SRGB] = StoreMacroTileClear<BC2_UNORM_SRGB>::StoreClear; \
++    sStoreTilesClearColorTable[BC3_UNORM_SRGB] = StoreMacroTileClear<BC3_UNORM_SRGB>::StoreClear; \
++    sStoreTilesClearColorTable[R8G8B8_UNORM] = StoreMacroTileClear<R8G8B8_UNORM>::StoreClear; \
++    sStoreTilesClearColorTable[R8G8B8_SNORM] = StoreMacroTileClear<R8G8B8_SNORM>::StoreClear; \
++    sStoreTilesClearColorTable[BC4_SNORM] = StoreMacroTileClear<BC4_SNORM>::StoreClear; \
++    sStoreTilesClearColorTable[BC5_SNORM] = StoreMacroTileClear<BC5_SNORM>::StoreClear; \
++    sStoreTilesClearColorTable[R16G16B16_FLOAT] = StoreMacroTileClear<R16G16B16_FLOAT>::StoreClear; \
++    sStoreTilesClearColorTable[R16G16B16_UNORM] = StoreMacroTileClear<R16G16B16_UNORM>::StoreClear; \
++    sStoreTilesClearColorTable[R16G16B16_SNORM] = StoreMacroTileClear<R16G16B16_SNORM>::StoreClear; \
++    sStoreTilesClearColorTable[R8G8B8_UNORM_SRGB] = StoreMacroTileClear<R8G8B8_UNORM_SRGB>::StoreClear; \
++    sStoreTilesClearColorTable[R16G16B16_UINT] = StoreMacroTileClear<R16G16B16_UINT>::StoreClear; \
++    sStoreTilesClearColorTable[R16G16B16_SINT] = StoreMacroTileClear<R16G16B16_SINT>::StoreClear; \
++    sStoreTilesClearColorTable[R10G10B10A2_SNORM] = StoreMacroTileClear<R10G10B10A2_SNORM>::StoreClear; \
++    sStoreTilesClearColorTable[R10G10B10A2_SINT] = StoreMacroTileClear<R10G10B10A2_SINT>::StoreClear; \
++    sStoreTilesClearColorTable[B10G10R10A2_SNORM] = StoreMacroTileClear<B10G10R10A2_SNORM>::StoreClear; \
++    sStoreTilesClearColorTable[B10G10R10A2_UINT] = StoreMacroTileClear<B10G10R10A2_UINT>::StoreClear; \
++    sStoreTilesClearColorTable[B10G10R10A2_SINT] = StoreMacroTileClear<B10G10R10A2_SINT>::StoreClear; \
++    sStoreTilesClearColorTable[R8G8B8_UINT] = StoreMacroTileClear<R8G8B8_UINT>::StoreClear; \
++    sStoreTilesClearColorTable[R8G8B8_SINT] = StoreMacroTileClear<R8G8B8_SINT>::StoreClear; \
++
++//////////////////////////////////////////////////////////////////////////
++/// INIT_STORE_TILES_TABLE - Helper macro for setting up the tables.
++#define INIT_STORE_TILES_CLEAR_DEPTH_TABLE() \
++    memset(sStoreTilesClearDepthTable, 0, sizeof(sStoreTilesClearDepthTable)); \
++    \
++    sStoreTilesClearDepthTable[R32_FLOAT] = StoreMacroTileClear<R32_FLOAT>::StoreClear; \
++    sStoreTilesClearDepthTable[R24_UNORM_X8_TYPELESS] = StoreMacroTileClear<R24_UNORM_X8_TYPELESS>::StoreClear; \
++
++//////////////////////////////////////////////////////////////////////////
++/// @brief Sets up tables for ClearTile
++void InitSimClearTilesTable()
++{
++    INIT_STORE_TILES_CLEAR_COLOR_TABLE();
++    INIT_STORE_TILES_CLEAR_DEPTH_TABLE();
++}
+diff --git a/src/gallium/drivers/swr/rasterizer/memory/Convert.h b/src/gallium/drivers/swr/rasterizer/memory/Convert.h
+new file mode 100644
+index 0000000..0f9e0ad
+--- /dev/null
++++ b/src/gallium/drivers/swr/rasterizer/memory/Convert.h
+@@ -0,0 +1,698 @@
++/****************************************************************************
++* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
++*
++* Permission is hereby granted, free of charge, to any person obtaining a
++* copy of this software and associated documentation files (the "Software"),
++* to deal in the Software without restriction, including without limitation
++* the rights to use, copy, modify, merge, publish, distribute, sublicense,
++* and/or sell copies of the Software, and to permit persons to whom the
++* Software is furnished to do so, subject to the following conditions:
++*
++* The above copyright notice and this permission notice (including the next
++* paragraph) shall be included in all copies or substantial portions of the
++* Software.
++*
++* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
++* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++* IN THE SOFTWARE.
++*
++* @file Convert.h
++*
++* @brief Conversion utility functions
++*
++******************************************************************************/
++#pragma once
++
++#if defined(_WIN32)
++// disable "potential divide by 0"
++#pragma warning(disable: 4723)
++#endif
++
++//////////////////////////////////////////////////////////////////////////
++/// @brief Convert an IEEE 754 16-bit float to a 32-bit single precision
++///        float
++/// @param val - 16-bit float
++/// @todo Maybe move this outside of this file into a header?
++static float ConvertSmallFloatTo32(UINT val)
++{
++    UINT result;
++    if ((val & 0x7fff) == 0)
++    {
++        result = ((uint32_t)(val & 0x8000)) << 16;
++    }
++    else if ((val & 0x7c00) == 0x7c00)
++    {
++        result = ((val & 0x3ff) == 0) ? 0x7f800000 : 0x7fc00000;
++        result |= ((uint32_t)val & 0x8000) << 16;
++    }
++    else
++    {
++        uint32_t sign = (val & 0x8000) << 16;
++        uint32_t mant = (val & 0x3ff) << 13;
++        uint32_t exp = (val >> 10) & 0x1f;
++        if ((exp == 0) && (mant != 0)) // Adjust exponent and mantissa for denormals
++        {
++            mant <<= 1;
++            while (mant < (0x400 << 13))
++            {
++                exp--;
++                mant <<= 1;
++            }
++            mant &= (0x3ff << 13);
++        }
++        exp = ((exp - 15 + 127) & 0xff) << 23;
++        result = sign | exp | mant;
++    }
++
++    return *(float*)&result;
++}
++
++//////////////////////////////////////////////////////////////////////////
++/// @brief Convert an IEEE 754 32-bit single precision float to an
++///        unsigned small float with 5 exponent bits and a variable
++///        number of mantissa bits.
++/// @param val - 32-bit float
++/// @todo Maybe move this outside of this file into a header?
++template<UINT numMantissaBits>
++static UINT Convert32ToSmallFloat(float val)
++{
++    uint32_t sign, exp, mant;
++    uint32_t roundBits;
++
++    // Extract the sign, exponent, and mantissa
++    UINT uf = *(UINT*)&val;
++
++    sign = (uf & 0x80000000) >> 31;
++    exp = (uf & 0x7F800000) >> 23;
++    mant = uf & 0x007FFFFF;
++
++    // 10/11 bit floats are unsigned.  Negative values are clamped to 0.
++    if (sign != 0)
++    {
++        exp = mant = 0;
++    }
++    // Check for out of range
++    else if ((exp == 0xFF) && (mant != 0)) // NaN
++    {
++        exp = 0x1F;
++        mant = 1 << numMantissaBits;
++    }
++    else if ((exp == 0xFF) && (mant == 0)) // INF
++    {
++        exp = 0x1F;
++        mant = 0;
++    }
++    else if (exp > (0x70 + 0x1E)) // Too big to represent
++    {
++        exp = 0x1Eu;
++        mant = (1 << numMantissaBits) - 1;  // 0x3F for 6 bit mantissa.
++    }
++    else if ((exp <= 0x70) && (exp >= 0x66)) // It's a denorm
++    {
++        mant |= 0x00800000;
++        for (; exp <= 0x70; mant >>= 1, exp++)
++            ;
++        exp = 0;
++        mant = mant >> (23 - numMantissaBits);
++    }
++    else if (exp < 0x66) // Too small to represent -> Zero
++    {
++        exp = 0;
++        mant = 0;
++    }
++    else
++    {
++        // Saves bits that will be shifted off for rounding
++        roundBits = mant & 0x1FFFu;
++        // convert exponent and mantissa to 16 bit format
++        exp = exp - 0x70u;
++        mant = mant >> (23 - numMantissaBits);
++
++        // Essentially RTZ, but round up if off by only 1 lsb
++        if (roundBits == 0x1FFFu)
++        {
++            mant++;
++            // check for overflow
++            if ((mant & (0x3 << numMantissaBits)) != 0) // 0x60 = 0x3 << (num Mantissa Bits)
++                exp++;
++            // make sure only the needed bits are used
++            mant &= (1 << numMantissaBits) - 1;
++        }
++    }
++
++    UINT tmpVal = (exp << numMantissaBits) | mant;
++    return tmpVal;
++}
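Both small-float encoders here rebias the exponent by 0x70 (the difference between the 32-bit bias of 127 and the 5-bit-exponent bias of 15) and truncate the mantissa to the target width. A self-contained check of that arithmetic for 1.0f, whose half-precision bit pattern is 0x3C00:

    #include <cassert>
    #include <cstdint>

    int main()
    {
        // 1.0f is 0x3F800000: sign 0, biased exponent 0x7F, mantissa 0.
        uint32_t exp32 = (0x3F800000u >> 23) & 0xFFu;

        // Rebias to 5 exponent bits (subtract 0x70) and keep 10 mantissa bits.
        uint16_t half = (uint16_t)(((exp32 - 0x70u) & 0x1Fu) << 10);

        assert(half == 0x3C00);
        return 0;
    }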
++
++#if KNOB_ARCH == KNOB_ARCH_AVX
++//////////////////////////////////////////////////////////////////////////
++/// @brief Convert an IEEE 754 32-bit single precision float to a
++///        16-bit float with 5 exponent bits and 10 mantissa bits.
++/// @param val - 32-bit float
++/// @todo Maybe move this outside of this file into a header?
++static uint16_t Convert32To16Float(float val)
++{
++    uint32_t sign, exp, mant;
++    uint32_t roundBits;
++
++    // Extract the sign, exponent, and mantissa
++    uint32_t uf = *(uint32_t*)&val;
++    sign = (uf & 0x80000000) >> 31;
++    exp = (uf & 0x7F800000) >> 23;
++    mant = uf & 0x007FFFFF;
++
++    // Check for out of range
++    if (std::isnan(val))
++    {
++        exp = 0x1F;
++        mant = 0x200;
++        sign = 1; // set the sign bit for NANs
++    }
++    else if (std::isinf(val))
++    {
++        exp = 0x1f;
++        mant = 0x0;
++    }
++    else if (exp > (0x70 + 0x1E)) // Too big to represent -> max representable value
++    {
++        exp = 0x1E;
++        mant = 0x3FF;
++    }
++    else if ((exp <= 0x70) && (exp >= 0x66)) // It's a denorm
++    {
++        mant |= 0x00800000;
++        for (; exp <= 0x70; mant >>= 1, exp++)
++            ;
++        exp = 0;
++        mant = mant >> 13;
++    }
++    else if (exp < 0x66) // Too small to represent -> Zero
++    {
++        exp = 0;
++        mant = 0;
++    }
++    else
++    {
++        // Saves bits that will be shifted off for rounding
++        roundBits = mant & 0x1FFFu;
++        // convert exponent and mantissa to 16 bit format
++        exp = exp - 0x70;
++        mant = mant >> 13;
++
++        // Essentially RTZ, but round up if off by only 1 lsb
++        if (roundBits == 0x1FFFu)
++        {
++            mant++;
++            // check for overflow
++            if ((mant & 0xC00u) != 0)
++                exp++;
++            // make sure only the needed bits are used
++            mant &= 0x3FF;
++        }
++    }
++
++    uint32_t tmpVal = (sign << 15) | (exp << 10) | mant;
++    return (uint16_t)tmpVal;
++}
++#endif
++
++//////////////////////////////////////////////////////////////////////////
++/// @brief Convert color from the hot tile source, which is always float,
++///        to the destination format.
++/// @param pDstPixel - Pointer to destination pixel.
++/// @param srcPixel - Pointer to source pixel (pre-swizzled according to dest).
++template<SWR_FORMAT DstFormat>
++static void ConvertPixelFromFloat(
++    BYTE* pDstPixel,
++    const float srcPixel[4])
++{
++    UINT outColor[4]; // typeless bits
++
++    // Store component
++    for (UINT comp = 0; comp < FormatTraits<DstFormat>::numComps; ++comp)
++    {
++        SWR_TYPE type = FormatTraits<DstFormat>::GetType(comp);
++
++        float src = srcPixel[comp];
++
++        switch (type)
++        {
++        case SWR_TYPE_UNORM:
++        {
++            // Force NaN to 0. IEEE standard, comparisons involving NaN always evaluate to false.
++            src = (src != src) ? 0.0f : src;
++
++            // Clamp [0, 1]
++            src = std::max(src, 0.0f);
++            src = std::min(src, 1.0f);
++
++            // SRGB
++            if (FormatTraits<DstFormat>::isSRGB && comp != 3)
++            {
++                src = (src <= 0.0031308f) ? (12.92f * src) : (1.055f * powf(src, (1.0f / 2.4f)) - 0.055f);
++            }
++
++            // Float scale to integer scale.
++            UINT scale = (1 << FormatTraits<DstFormat>::GetBPC(comp)) - 1;
++            src = (float)scale * src;
++            src = roundf(src);
++            outColor[comp] = (UINT)src; // Drop fractional part.
++            break;
++        }
++        case SWR_TYPE_SNORM:
++        {
++            SWR_ASSERT(!FormatTraits<DstFormat>::isSRGB);
++
++            // Force NaN to 0. IEEE standard, comparisons involving NaN always evaluate to false.
++            src = (src != src) ? 0.0f : src;
++
++            // Clamp [-1, 1]
++            src = std::max(src, -1.0f);
++            src = std::min(src, 1.0f);
++
++            // Float scale to integer scale.
++            UINT scale = (1 << (FormatTraits<DstFormat>::GetBPC(comp) - 1)) - 1;
++            src = (float)scale * src;
++
++            // Round
++            src += (src >= 0) ? 0.5f : -0.5f;
++
++            INT out = (INT)src;
++
++            outColor[comp] = *(UINT*)&out;
++
++            break;
++        }
++        case SWR_TYPE_UINT:
++        {
++            ///@note The *(UINT*)& is currently necessary as the hot tile appears to always be float.
++            //       However, the number in the hot tile should be unsigned integer. So doing this
++            //       to preserve bits instead of doing a float -> integer conversion.
++            if (FormatTraits<DstFormat>::GetBPC(comp) == 32)
++            {
++                outColor[comp] = *(UINT*)&src;
++            }
++            else
++            {
++                outColor[comp] = *(UINT*)&src;
++                UINT max = (1 << FormatTraits<DstFormat>::GetBPC(comp)) - 1; // 2^numBits - 1
++
++                outColor[comp] = std::min(max, outColor[comp]);
++            }
++            break;
++        }
++        case SWR_TYPE_SINT:
++        {
++            if (FormatTraits<DstFormat>::GetBPC(comp) == 32)
++            {
++                outColor[comp] = *(UINT*)&src;
++            }
++            else
++            {
++                INT out = *(INT*)&src; // Hot tile format is SINT?
++                INT max = (1 << (FormatTraits<DstFormat>::GetBPC(comp) - 1)) - 1;
++                INT min = -1 - max;
++
++                ///@note The output is unsigned integer (bag of bits) and so performing
++                //       the clamping here based on range of output component. Also, manually adding
++                //       the sign bit in the appropriate spot. Maybe a better way?
++                out = std::max(out, min);
++                out = std::min(out, max);
++
++                outColor[comp] = *(UINT*)&out;
++            }
++            break;
++        }
++        case SWR_TYPE_FLOAT:
++        {
++            if (FormatTraits<DstFormat>::GetBPC(comp) == 16)
++            {
++                // Convert from 32-bit float to 16-bit float using _mm_cvtps_ph
++                // @todo 16bit float instruction support is orthogonal to avx support. need to
++                // add check for F16C support instead.
++#if KNOB_ARCH == KNOB_ARCH_AVX2
++                __m128 src128 = _mm_set1_ps(src);
++                __m128i srci128 = _mm_cvtps_ph(src128, _MM_FROUND_TRUNC);
++                UINT value = _mm_extract_epi16(srci128, 0);
++#else
++                UINT value = Convert32To16Float(src);
++#endif
++
++                outColor[comp] = value;
++            }
++            else if (FormatTraits<DstFormat>::GetBPC(comp) == 11)
++            {
++                outColor[comp] = Convert32ToSmallFloat<6>(src);
++            }
++            else if (FormatTraits<DstFormat>::GetBPC(comp) == 10)
++            {
++                outColor[comp] = Convert32ToSmallFloat<5>(src);
++            }
++            else
++            {
++                outColor[comp] = *(UINT*)&src;
++            }
++
++            break;
++        }
++        default:
++            SWR_ASSERT(0);
++            break;
++        }
++    }
++
++    typename FormatTraits<DstFormat>::FormatT* pPixel = (typename FormatTraits<DstFormat>::FormatT*)pDstPixel;
++
++    // deliberate fall-through: a pixel with N components writes components N-1..0
++    switch (FormatTraits<DstFormat>::numComps)
++    {
++    case 4:
++        pPixel->a = outColor[3];
++    case 3:
++        pPixel->b = outColor[2];
++    case 2:
++        pPixel->g = outColor[1];
++    case 1:
++        pPixel->r = outColor[0];
++        break;
++    default:
++        SWR_ASSERT(0);
++    }
++}
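For UNORM components the path above is: squash NaN, clamp to [0, 1], optionally sRGB-encode everything but alpha, then scale by 2^bits - 1 and round. The same math as a standalone scalar helper for an 8-bit sRGB component (illustrative, not an SWR entry point):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Float -> 8-bit sRGB UNORM, following the same steps as the case above.
    static uint8_t float_to_srgb_unorm8(float src)
    {
        src = (src != src) ? 0.0f : src;                 // NaN -> 0
        src = std::min(std::max(src, 0.0f), 1.0f);       // clamp [0, 1]
        src = (src <= 0.0031308f) ? (12.92f * src)
                                  : (1.055f * std::pow(src, 1.0f / 2.4f) - 0.055f);
        return (uint8_t)std::round(src * 255.0f);        // scale = 2^8 - 1
    }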
++
++//////////////////////////////////////////////////////////////////////////
++/// @brief Convert pixel in any format to float32
++/// @param dstPixel - Destination pixel as float32.
++/// @param pSrc - Pointer to source pixel
++template<SWR_FORMAT SrcFormat>
++INLINE static void ConvertPixelToFloat(
++    float dstPixel[4],
++    const BYTE* pSrc)
++{
++    UINT srcColor[4]; // typeless bits
++
++    // unpack src pixel
++    typename FormatTraits<SrcFormat>::FormatT* pPixel = (typename FormatTraits<SrcFormat>::FormatT*)pSrc;
++
++    // apply format defaults
++    for (uint32_t comp = 0; comp < 4; ++comp)
++    {
++        uint32_t def = FormatTraits<SrcFormat>::GetDefault(comp);
++        dstPixel[comp] = *(float*)&def;
++    }
++
++    // load format data; deliberate fall-through reads components N-1..0
++    switch (FormatTraits<SrcFormat>::numComps)
++    {
++    case 4:
++        srcColor[3] = pPixel->a;
++    case 3:
++        srcColor[2] = pPixel->b;
++    case 2:
++        srcColor[1] = pPixel->g;
++    case 1:
++        srcColor[0] = pPixel->r;
++        break;
++    default:
++        SWR_ASSERT(0);
++    }
++
++    // Convert components
++    for (UINT comp = 0; comp < FormatTraits<SrcFormat>::numComps; ++comp)
++    {
++        SWR_TYPE type = FormatTraits<SrcFormat>::GetType(comp);
++
++        UINT src = srcColor[comp];
++
++        switch (type)
++        {
++        case SWR_TYPE_UNORM:
++        {
++            float dst;
++            if (FormatTraits<SrcFormat>::isSRGB && comp != 3)
++            {
++                dst = *(float*)&srgb8Table[src];
++            }
++            else
++            {
++                // component sizes > 16 must use fp divide to maintain ulp requirements
++                if (FormatTraits<SrcFormat>::GetBPC(comp) > 16)
++                {
++                    dst = (float)src / (float)((1 << FormatTraits<SrcFormat>::GetBPC(comp)) - 1);
++                }
++                else
++                {
++                    const float scale = (1.0f / (float)((1 << FormatTraits<SrcFormat>::GetBPC(comp)) - 1));
++                    dst = (float)src * scale;
++                }
++            }
++            dstPixel[FormatTraits<SrcFormat>::swizzle(comp)] = dst;
++            break;
++        }
++        case SWR_TYPE_SNORM:
++        {
++            SWR_ASSERT(!FormatTraits<SrcFormat>::isSRGB);
++
++            float dst;
++            if (src == 0x10)
++            {
++                dst = -1.0f;
++            }
++            else
++            {
++                switch (FormatTraits<SrcFormat>::GetBPC(comp))
++                {
++                case 8:
++                    dst = (float)((int8_t)src);
++                    break;
++                case 16:
++                    dst = (float)((int16_t)src);
++                    break;
++                case 32:
++                    dst = (float)((int32_t)src);
++                    break;
++                default:
++                    assert(0 && "attempted to load from SNORM with unsupported bpc");
++                    dst = 0.0f;
++                    break;
++                }
++                dst = dst * (1.0f / ((1 << (FormatTraits<SrcFormat>::GetBPC(comp) - 1)) - 1));
++            }
++            dstPixel[FormatTraits<SrcFormat>::swizzle(comp)] = dst;
++            break;
++        }
++        case SWR_TYPE_UINT:
++        {
++            UINT dst = (UINT)src;
++            dstPixel[FormatTraits<SrcFormat>::swizzle(comp)] = *(float*)&dst;
++            break;
++        }
++        case SWR_TYPE_SINT:
++        {
++            int dst;
++            switch (FormatTraits<SrcFormat>::GetBPC(comp))
++            {
++            case 8:
++                dst = (int8_t)src;
++                break;
++            case 16:
++                dst = (int16_t)src;
++                break;
++            case 32:
++                dst = (int32_t)src;
++                break;
++            default:
++                assert(0 && "attempted to load from SINT with unsupported bpc");
++                dst = 0;
++                break;
++            }
++            dstPixel[FormatTraits<SrcFormat>::swizzle(comp)] = *(float*)&dst;
++            break;
++        }
++        case SWR_TYPE_FLOAT:
++        {
++            float dst;
++            if (FormatTraits<SrcFormat>::GetBPC(comp) == 16)
++            {
++#if KNOB_ARCH == KNOB_ARCH_AVX2
++                // Convert from 16-bit float to 32-bit float using _mm_cvtph_ps
++                // @todo 16bit float instruction support is orthogonal to avx support. need to
++                // add check for F16C support instead.
++                __m128i src128 = _mm_set1_epi32(src);
++                __m128 res = _mm_cvtph_ps(src128);
++                _mm_store_ss(&dst, res);
++#else
++                dst = ConvertSmallFloatTo32(src);
++#endif
++            }
++            else if (FormatTraits<SrcFormat>::GetBPC(comp) == 11)
++            {
++                dst = ConvertSmallFloatTo32(src << 4);
++            }
++            else if (FormatTraits<SrcFormat>::GetBPC(comp) == 10)
++            {
++                dst = ConvertSmallFloatTo32(src << 5);
++            }
++            else
++            {
++                dst = *(float*)&src;
++            }
++
++            dstPixel[FormatTraits<SrcFormat>::swizzle(comp)] = *(float*)&dst;
++            break;
++        }
++        default:
++            SWR_ASSERT(0);
++            break;
++        }
++    }
++}
++
++// non-templated version of conversion functions
++INLINE static void ConvertPixelFromFloat(
++    SWR_FORMAT format,
++    uint8_t* pDst,
++    const float srcPixel[4])
++{
++    switch (format)
++    {
++    case R32G32B32A32_FLOAT: ConvertPixelFromFloat<R32G32B32A32_FLOAT>(pDst, srcPixel); break;
++    case R32G32B32A32_SINT: ConvertPixelFromFloat<R32G32B32A32_SINT>(pDst, srcPixel); break;
++    case R32G32B32A32_UINT: ConvertPixelFromFloat<R32G32B32A32_UINT>(pDst, srcPixel); break;
++    case R32G32B32X32_FLOAT: ConvertPixelFromFloat<R32G32B32X32_FLOAT>(pDst, srcPixel); break;
++    case R32G32B32A32_SSCALED: ConvertPixelFromFloat<R32G32B32A32_SSCALED>(pDst, srcPixel); break;
++    case R32G32B32A32_USCALED: ConvertPixelFromFloat<R32G32B32A32_USCALED>(pDst, srcPixel); break;
++    case R32G32B32_FLOAT: ConvertPixelFromFloat<R32G32B32_FLOAT>(pDst, srcPixel); break;
++    case R32G32B32_SINT: ConvertPixelFromFloat<R32G32B32_SINT>(pDst, srcPixel); break;
++    case R32G32B32_UINT: ConvertPixelFromFloat<R32G32B32_UINT>(pDst, srcPixel); break;
++    case R32G32B32_SSCALED: ConvertPixelFromFloat<R32G32B32_SSCALED>(pDst, srcPixel); break;
++    case R32G32B32_USCALED: ConvertPixelFromFloat<R32G32B32_USCALED>(pDst, srcPixel); break;
++    case R16G16B16A16_UNORM: ConvertPixelFromFloat<R16G16B16A16_UNORM>(pDst, srcPixel); break;
++    case R16G16B16A16_SNORM: ConvertPixelFromFloat<R16G16B16A16_SNORM>(pDst, srcPixel); break;
++    case R16G16B16A16_SINT: ConvertPixelFromFloat<R16G16B16A16_SINT>(pDst, srcPixel); break;
++    case R16G16B16A16_UINT: ConvertPixelFromFloat<R16G16B16A16_UINT>(pDst, srcPixel); break;
++    case R16G16B16A16_FLOAT: ConvertPixelFromFloat<R16G16B16A16_FLOAT>(pDst, srcPixel); break;
++    case R32G32_FLOAT: ConvertPixelFromFloat<R32G32_FLOAT>(pDst, srcPixel); break;
++    case R32G32_SINT: ConvertPixelFromFloat<R32G32_SINT>(pDst, srcPixel); break;
++    case R32G32_UINT: ConvertPixelFromFloat<R32G32_UINT>(pDst, srcPixel); break;
++    case R32_FLOAT_X8X24_TYPELESS: ConvertPixelFromFloat<R32_FLOAT_X8X24_TYPELESS>(pDst, srcPixel); break;
++    case R16G16B16X16_UNORM: ConvertPixelFromFloat<R16G16B16X16_UNORM>(pDst, srcPixel); break;
++    case R16G16B16X16_FLOAT: ConvertPixelFromFloat<R16G16B16X16_FLOAT>(pDst, srcPixel); break;
++    case R16G16B16A16_SSCALED: ConvertPixelFromFloat<R16G16B16A16_SSCALED>(pDst, srcPixel); break;
++    case R16G16B16A16_USCALED: ConvertPixelFromFloat<R16G16B16A16_USCALED>(pDst, srcPixel); break;
++    case R32G32_SSCALED: ConvertPixelFromFloat<R32G32_SSCALED>(pDst, srcPixel); break;
++    case R32G32_USCALED: ConvertPixelFromFloat<R32G32_USCALED>(pDst, srcPixel); break;
++    case R32_FLOAT_X8X24_TYPELESS_LD: ConvertPixelFromFloat<R32_FLOAT_X8X24_TYPELESS_LD>(pDst, srcPixel); break;
++    case B8G8R8A8_UNORM: ConvertPixelFromFloat<B8G8R8A8_UNORM>(pDst, srcPixel); break;
++    case B8G8R8A8_UNORM_SRGB: ConvertPixelFromFloat<B8G8R8A8_UNORM_SRGB>(pDst, srcPixel); break;
++    case R10G10B10A2_UNORM: ConvertPixelFromFloat<R10G10B10A2_UNORM>(pDst, srcPixel); break;
++    case R10G10B10A2_UNORM_SRGB: ConvertPixelFromFloat<R10G10B10A2_UNORM_SRGB>(pDst, srcPixel); break;
++    case R10G10B10A2_UINT: ConvertPixelFromFloat<R10G10B10A2_UINT>(pDst, srcPixel); break;
++    case R8G8B8A8_UNORM: ConvertPixelFromFloat<R8G8B8A8_UNORM>(pDst, srcPixel); break;
++    case R8G8B8A8_UNORM_SRGB: ConvertPixelFromFloat<R8G8B8A8_UNORM_SRGB>(pDst, srcPixel); break;
++    case R8G8B8A8_SNORM: ConvertPixelFromFloat<R8G8B8A8_SNORM>(pDst, srcPixel); break;
++    case R8G8B8A8_SINT: ConvertPixelFromFloat<R8G8B8A8_SINT>(pDst, srcPixel); break;
++    case R8G8B8A8_UINT: ConvertPixelFromFloat<R8G8B8A8_UINT>(pDst, srcPixel); break;
++    case R16G16_UNORM: ConvertPixelFromFloat<R16G16_UNORM>(pDst, srcPixel); break;
++    case R16G16_SNORM: ConvertPixelFromFloat<R16G16_SNORM>(pDst, srcPixel); break;
++    case R16G16_SINT: ConvertPixelFromFloat<R16G16_SINT>(pDst, srcPixel); break;
++    case R16G16_UINT: ConvertPixelFromFloat<R16G16_UINT>(pDst, srcPixel); break;
++    case R16G16_FLOAT: ConvertPixelFromFloat<R16G16_FLOAT>(pDst, srcPixel); break;
++    case B10G10R10A2_UNORM: ConvertPixelFromFloat<B10G10R10A2_UNORM>(pDst, srcPixel); break;
++    case B10G10R10A2_UNORM_SRGB: ConvertPixelFromFloat<B10G10R10A2_UNORM_SRGB>(pDst, srcPixel); break;
++    case R11G11B10_FLOAT: ConvertPixelFromFloat<R11G11B10_FLOAT>(pDst, srcPixel); break;
++    case R32_SINT: ConvertPixelFromFloat<R32_SINT>(pDst, srcPixel); break;
++    case R32_UINT: ConvertPixelFromFloat<R32_UINT>(pDst, srcPixel); break;
++    case R32_FLOAT: ConvertPixelFromFloat<R32_FLOAT>(pDst, srcPixel); break;
++    case R24_UNORM_X8_TYPELESS: ConvertPixelFromFloat<R24_UNORM_X8_TYPELESS>(pDst, srcPixel); break;
++    case R24_UNORM_X8_TYPELESS_LD: ConvertPixelFromFloat<R24_UNORM_X8_TYPELESS_LD>(pDst, srcPixel); break;
++    case A32_FLOAT: ConvertPixelFromFloat<A32_FLOAT>(pDst, srcPixel); break;
++    case B8G8R8X8_UNORM: ConvertPixelFromFloat<B8G8R8X8_UNORM>(pDst, srcPixel); break;
++    case B8G8R8X8_UNORM_SRGB: ConvertPixelFromFloat<B8G8R8X8_UNORM_SRGB>(pDst, srcPixel); break;
++    case R8G8B8X8_UNORM: ConvertPixelFromFloat<R8G8B8X8_UNORM>(pDst, srcPixel); break;
++    case R8G8B8X8_UNORM_SRGB: ConvertPixelFromFloat<R8G8B8X8_UNORM_SRGB>(pDst, srcPixel); break;
++    case R9G9B9E5_SHAREDEXP: ConvertPixelFromFloat<R9G9B9E5_SHAREDEXP>(pDst, srcPixel); break;
++    case B10G10R10X2_UNORM: ConvertPixelFromFloat<B10G10R10X2_UNORM>(pDst, srcPixel); break;
++    case R10G10B10X2_USCALED: ConvertPixelFromFloat<R10G10B10X2_USCALED>(pDst, srcPixel); break;
++    case R8G8B8A8_SSCALED: ConvertPixelFromFloat<R8G8B8A8_SSCALED>(pDst, srcPixel); break;
++    case R8G8B8A8_USCALED: ConvertPixelFromFloat<R8G8B8A8_USCALED>(pDst, srcPixel); break;
++    case R16G16_SSCALED: ConvertPixelFromFloat<R16G16_SSCALED>(pDst, srcPixel); break;
++    case R16G16_USCALED: ConvertPixelFromFloat<R16G16_USCALED>(pDst, srcPixel); break;
++    case R32_SSCALED: ConvertPixelFromFloat<R32_SSCALED>(pDst, srcPixel); break;
++    case R32_USCALED: ConvertPixelFromFloat<R32_USCALED>(pDst, srcPixel); break;
++    case B5G6R5_UNORM: ConvertPixelFromFloat<B5G6R5_UNORM>(pDst, srcPixel); break;
++    case B5G6R5_UNORM_SRGB: ConvertPixelFromFloat<B5G6R5_UNORM_SRGB>(pDst, srcPixel); break;
++    case B5G5R5A1_UNORM: ConvertPixelFromFloat<B5G5R5A1_UNORM>(pDst, srcPixel); break;
++    case B5G5R5A1_UNORM_SRGB: ConvertPixelFromFloat<B5G5R5A1_UNORM_SRGB>(pDst, srcPixel); break;
++    case B4G4R4A4_UNORM: ConvertPixelFromFloat<B4G4R4A4_UNORM>(pDst, srcPixel); break;
++    case B4G4R4A4_UNORM_SRGB: ConvertPixelFromFloat<B4G4R4A4_UNORM_SRGB>(pDst, srcPixel); break;
++    case R8G8_UNORM: ConvertPixelFromFloat<R8G8_UNORM>(pDst, srcPixel); break;
++    case R8G8_SNORM: ConvertPixelFromFloat<R8G8_SNORM>(pDst, srcPixel); break;
++    case R8G8_SINT: ConvertPixelFromFloat<R8G8_SINT>(pDst, srcPixel); break;
++    case R8G8_UINT: ConvertPixelFromFloat<R8G8_UINT>(pDst, srcPixel); break;
++    case R16_UNORM: ConvertPixelFromFloat<R16_UNORM>(pDst, srcPixel); break;
++    case R16_SNORM: ConvertPixelFromFloat<R16_SNORM>(pDst, srcPixel); break;
++    case R16_SINT: ConvertPixelFromFloat<R16_SINT>(pDst, srcPixel); break;
++    case R16_UINT: ConvertPixelFromFloat<R16_UINT>(pDst, srcPixel); break;
++    case R16_FLOAT: ConvertPixelFromFloat<R16_FLOAT>(pDst, srcPixel); break;
++    case A16_UNORM: ConvertPixelFromFloat<A16_UNORM>(pDst, srcPixel); break;
++    case A16_FLOAT: ConvertPixelFromFloat<A16_FLOAT>(pDst, srcPixel); break;
++    case B5G5R5X1_UNORM: ConvertPixelFromFloat<B5G5R5X1_UNORM>(pDst, srcPixel); break;
++    case B5G5R5X1_UNORM_SRGB: ConvertPixelFromFloat<B5G5R5X1_UNORM_SRGB>(pDst, srcPixel); break;
++    case R8G8_SSCALED: ConvertPixelFromFloat<R8G8_SSCALED>(pDst, srcPixel); break;
++    case R8G8_USCALED: ConvertPixelFromFloat<R8G8_USCALED>(pDst, srcPixel); break;
++    case R16_SSCALED: ConvertPixelFromFloat<R16_SSCALED>(pDst, srcPixel); break;
++    case R16_USCALED: ConvertPixelFromFloat<R16_USCALED>(pDst, srcPixel); break;
++    case R8_UNORM: ConvertPixelFromFloat<R8_UNORM>(pDst, srcPixel); break;
++    case R8_SNORM: ConvertPixelFromFloat<R8_SNORM>(pDst, srcPixel); break;
++    case R8_SINT: ConvertPixelFromFloat<R8_SINT>(pDst, srcPixel); break;
++    case R8_UINT: ConvertPixelFromFloat<R8_UINT>(pDst, srcPixel); break;
++    case A8_UNORM: ConvertPixelFromFloat<A8_UNORM>(pDst, srcPixel); break;
++    case R8_SSCALED: ConvertPixelFromFloat<R8_SSCALED>(pDst, srcPixel); break;
++    case R8_USCALED: ConvertPixelFromFloat<R8_USCALED>(pDst, srcPixel); break;
++    case YCRCB_SWAPUVY: ConvertPixelFromFloat<YCRCB_SWAPUVY>(pDst, srcPixel); break;
++    case BC1_UNORM: ConvertPixelFromFloat<BC1_UNORM>(pDst, srcPixel); break;
++    case BC2_UNORM: ConvertPixelFromFloat<BC2_UNORM>(pDst, srcPixel); break;
++    case BC3_UNORM: ConvertPixelFromFloat<BC3_UNORM>(pDst, srcPixel); break;
++    case BC4_UNORM: ConvertPixelFromFloat<BC4_UNORM>(pDst, srcPixel); break;
++    case BC5_UNORM: ConvertPixelFromFloat<BC5_UNORM>(pDst, srcPixel); break;
++    case BC1_UNORM_SRGB: ConvertPixelFromFloat<BC1_UNORM_SRGB>(pDst, srcPixel); break;
++    case BC2_UNORM_SRGB: ConvertPixelFromFloat<BC2_UNORM_SRGB>(pDst, srcPixel); break;
++    case BC3_UNORM_SRGB: ConvertPixelFromFloat<BC3_UNORM_SRGB>(pDst, srcPixel); break;
++    case YCRCB_SWAPUV: ConvertPixelFromFloat<YCRCB_SWAPUV>(pDst, srcPixel); break;
++    case R8G8B8_UNORM: ConvertPixelFromFloat<R8G8B8_UNORM>(pDst, srcPixel); break;
++    case R8G8B8_SNORM: ConvertPixelFromFloat<R8G8B8_SNORM>(pDst, srcPixel); break;
++    case R8G8B8_SSCALED: ConvertPixelFromFloat<R8G8B8_SSCALED>(pDst, srcPixel); break;
++    case R8G8B8_USCALED: ConvertPixelFromFloat<R8G8B8_USCALED>(pDst, srcPixel); break;
++    case BC4_SNORM: ConvertPixelFromFloat<BC4_SNORM>(pDst, srcPixel); break;
++    case BC5_SNORM: ConvertPixelFromFloat<BC5_SNORM>(pDst, srcPixel); break;
++    case R16G16B16_FLOAT: ConvertPixelFromFloat<R16G16B16_FLOAT>(pDst, srcPixel); break;
++    case R16G16B16_UNORM: ConvertPixelFromFloat<R16G16B16_UNORM>(pDst, srcPixel); break;
++    case R16G16B16_SNORM: ConvertPixelFromFloat<R16G16B16_SNORM>(pDst, srcPixel); break;
++    case R16G16B16_SSCALED: ConvertPixelFromFloat<R16G16B16_SSCALED>(pDst, srcPixel); break;
++    case R16G16B16_USCALED: ConvertPixelFromFloat<R16G16B16_USCALED>(pDst, srcPixel); break;
++    case BC7_UNORM: ConvertPixelFromFloat<BC7_UNORM>(pDst, srcPixel); break;
++    case BC7_UNORM_SRGB: ConvertPixelFromFloat<BC7_UNORM_SRGB>(pDst, srcPixel); break;
++    case R8G8B8_UNORM_SRGB: ConvertPixelFromFloat<R8G8B8_UNORM_SRGB>(pDst, srcPixel); break;
++    case R16G16B16_UINT: ConvertPixelFromFloat<R16G16B16_UINT>(pDst, srcPixel); break;
++    case R16G16B16_SINT: ConvertPixelFromFloat<R16G16B16_SINT>(pDst, srcPixel); break;
++    case R10G10B10A2_SNORM: ConvertPixelFromFloat<R10G10B10A2_SNORM>(pDst, srcPixel); break;
++    case R10G10B10A2_USCALED: ConvertPixelFromFloat<R10G10B10A2_USCALED>(pDst, srcPixel); break;
++    case R10G10B10A2_SSCALED: ConvertPixelFromFloat<R10G10B10A2_SSCALED>(pDst, srcPixel); break;
++    case R10G10B10A2_SINT: ConvertPixelFromFloat<R10G10B10A2_SINT>(pDst, srcPixel); break;
++    case B10G10R10A2_SNORM: ConvertPixelFromFloat<B10G10R10A2_SNORM>(pDst, srcPixel); break;
++    case B10G10R10A2_USCALED: ConvertPixelFromFloat<B10G10R10A2_USCALED>(pDst, srcPixel); break;
++    case B10G10R10A2_SSCALED: ConvertPixelFromFloat<B10G10R10A2_SSCALED>(pDst, srcPixel); break;
++    case B10G10R10A2_UINT: ConvertPixelFromFloat<B10G10R10A2_UINT>(pDst, srcPixel); break;
++    case B10G10R10A2_SINT: ConvertPixelFromFloat<B10G10R10A2_SINT>(pDst, srcPixel); break;
++    case R8G8B8_UINT: ConvertPixelFromFloat<R8G8B8_UINT>(pDst, srcPixel); break;
++    case R8G8B8_SINT: ConvertPixelFromFloat<R8G8B8_SINT>(pDst, srcPixel); break;
++    default:
++        break;
++    }
++}
++
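Convert.h thus gives both directions of the hot-tile boundary: ConvertPixelFromFloat packs the float hot-tile color into the surface format, and ConvertPixelToFloat unpacks it again. For an 8-bit UNORM component the two directions should round-trip to within half a quantization step, which a standalone check can confirm (illustrative test, not part of the patch):

    #include <cassert>
    #include <cmath>
    #include <cstdint>

    int main()
    {
        for (int i = 0; i <= 100; ++i)
        {
            float   x = (float)i / 100.0f;
            uint8_t e = (uint8_t)std::round(x * 255.0f); // encode: scale and round
            float   d = (float)e * (1.0f / 255.0f);      // decode: reciprocal scale
            assert(std::fabs(d - x) <= 0.5f / 255.0f + 1e-6f);
        }
        return 0;
    }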
+diff --git a/src/gallium/drivers/swr/rasterizer/memory/LoadTile.cpp b/src/gallium/drivers/swr/rasterizer/memory/LoadTile.cpp
+new file mode 100644
+index 0000000..49893e8
+--- /dev/null
++++ b/src/gallium/drivers/swr/rasterizer/memory/LoadTile.cpp
+@@ -0,0 +1,382 @@
++/****************************************************************************
++* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
++*
++* Permission is hereby granted, free of charge, to any person obtaining a
++* copy of this software and associated documentation files (the "Software"),
++* to deal in the Software without restriction, including without limitation
++* the rights to use, copy, modify, merge, publish, distribute, sublicense,
++* and/or sell copies of the Software, and to permit persons to whom the
++* Software is furnished to do so, subject to the following conditions:
++*
++* The above copyright notice and this permission notice (including the next
++* paragraph) shall be included in all copies or substantial portions of the
++* Software.
++*
++* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
++* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++* IN THE SOFTWARE.
++*
++* @file LoadTile.cpp
++*
++* @brief Functionality for Load
++*
++******************************************************************************/
++#include "common/os.h"
++#include "common/formats.h"
++#include "core/context.h"
++#include "core/rdtsc_core.h"
++#include "memory/TilingFunctions.h"
++#include "memory/tilingtraits.h"
++#include "memory/Convert.h"
++
++typedef void(*PFN_LOAD_TILES)(SWR_SURFACE_STATE*, uint8_t*, uint32_t, uint32_t, uint32_t);
++
++//////////////////////////////////////////////////////////////////////////
++/// Load Raster Tile Function Tables.
++//////////////////////////////////////////////////////////////////////////
++static PFN_LOAD_TILES sLoadTilesColorTable_SWR_TILE_NONE[NUM_SWR_FORMATS];
++static PFN_LOAD_TILES sLoadTilesDepthTable_SWR_TILE_NONE[NUM_SWR_FORMATS];
++
++static PFN_LOAD_TILES sLoadTilesColorTable_SWR_TILE_MODE_YMAJOR[NUM_SWR_FORMATS];
++static PFN_LOAD_TILES sLoadTilesColorTable_SWR_TILE_MODE_XMAJOR[NUM_SWR_FORMATS];
++
++static PFN_LOAD_TILES sLoadTilesDepthTable_SWR_TILE_MODE_YMAJOR[NUM_SWR_FORMATS];
++
++//////////////////////////////////////////////////////////////////////////
++/// LoadRasterTile
++//////////////////////////////////////////////////////////////////////////
++template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
++struct LoadRasterTile
++{
++    //////////////////////////////////////////////////////////////////////////
++    /// @brief Stores a float color into the swizzled hot tile, which is always float.
++    /// @param srcColor - input color
++    /// @param x, y - Coordinates within the raster tile.
++    /// @param pDst - Pointer to destination hot tile
++    INLINE static void SetSwizzledDstColor(
++        const float srcColor[4],
++        uint32_t x, uint32_t y,
++        uint8_t* pDst)
++    {
++        typedef SimdTile<DstFormat, SrcFormat> SimdT;
++
++        SimdT* pDstSimdTiles = (SimdT*)pDst;
++
++        // Compute which simd tile we're accessing within 8x8 tile.
++        //   i.e. Compute linear simd tile coordinate given (x, y) in pixel coordinates.
++        uint32_t simdIndex = (y / SIMD_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD_TILE_X_DIM) + (x / SIMD_TILE_X_DIM);
++
++        SimdT* pSimdTile = &pDstSimdTiles[simdIndex];
++
++        uint32_t simdOffset = (y % SIMD_TILE_Y_DIM) * SIMD_TILE_X_DIM + (x % SIMD_TILE_X_DIM);
++
++        pSimdTile->SetSwizzledColor(simdOffset, srcColor);
++    }
++
++    //////////////////////////////////////////////////////////////////////////
++    /// @brief Loads an 8x8 raster tile from the src surface.
++    /// @param pSrcSurface - Src surface state
++    /// @param pDst - Destination hot tile pointer
++    /// @param x, y - Coordinates to raster tile.
++    INLINE static void Load(
++        SWR_SURFACE_STATE* pSrcSurface,
++        uint8_t* pDst,
++        uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) // (x, y) pixel coordinate to start of raster tile.
++    {
++        uint32_t lodWidth = (pSrcSurface->width == 1) ? 1 : pSrcSurface->width >> pSrcSurface->lod;
++        uint32_t lodHeight = (pSrcSurface->height == 1) ? 1 : pSrcSurface->height >> pSrcSurface->lod;
++
++        // For each raster tile pixel (rx, ry)
++        for (uint32_t ry = 0; ry < KNOB_TILE_Y_DIM; ++ry)
++        {
++            for (uint32_t rx = 0; rx < KNOB_TILE_X_DIM; ++rx)
++            {
++                if (((x + rx) < lodWidth) &&
++                    ((y + ry) < lodHeight))
++                {
++                    uint8_t* pSrc = (uint8_t*)ComputeSurfaceAddress(x + rx, y + ry, pSrcSurface->arrayIndex + renderTargetArrayIndex,
++                        pSrcSurface->arrayIndex + renderTargetArrayIndex, sampleNum,
++                        pSrcSurface->lod, pSrcSurface);
++
++                    float srcColor[4];
++                    ConvertPixelToFloat<SrcFormat>(srcColor, pSrc);
++
++                    // store pixel to hottile
++                    SetSwizzledDstColor(srcColor, rx, ry, pDst);
++                }
++            }
++        }
++    }
++};
++
++//////////////////////////////////////////////////////////////////////////
++/// LoadMacroTile - Loads a macro tile which consists of raster tiles.
++//////////////////////////////////////////////////////////////////////////
++template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
++struct LoadMacroTile
++{
++    //////////////////////////////////////////////////////////////////////////
++    /// @brief Loads a macrotile from the source surface into the hot tile.
++    /// @param pSrcSurface - Source surface state
++    /// @param pDstHotTile - Destination hot tile pointer
++    /// @param x, y - Coordinates to macro tile
++    static void Load(
++        SWR_SURFACE_STATE* pSrcSurface,
++        uint8_t *pDstHotTile,
++        uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex)
++    {
++        // Load each raster tile from the source surface into the hot tile.
++        for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
++        {
++            for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
++            {
++                for (uint32_t sampleNum = 0; sampleNum < pSrcSurface->numSamples; sampleNum++)
++                {
++                    LoadRasterTile<TTraits, SrcFormat, DstFormat>::Load(pSrcSurface, pDstHotTile,
++                        (x + col), (y + row), sampleNum, renderTargetArrayIndex);
++                    pDstHotTile += KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<DstFormat>::bpp / 8);
++                }
++            }
++        }
++    }
++};
++
++static void BUCKETS_START(UINT id)
++{
++#ifdef KNOB_ENABLE_RDTSC
++    gBucketMgr.StartBucket(id);
++#endif
++}
++
++static void BUCKETS_STOP(UINT id)
++{
++#ifdef KNOB_ENABLE_RDTSC
++    gBucketMgr.StopBucket(id);
++#endif
++}
++
++// on demand buckets for load tiles
++static std::vector<int32_t> sBuckets(NUM_SWR_FORMATS, -1);
++static std::mutex sBucketMutex;
++
++//////////////////////////////////////////////////////////////////////////
++/// @brief Loads a full hottile from a render surface
++/// @param pSrcSurface - Source surface state
++/// @param dstFormat - Format for hot tile.
++/// @param renderTargetIndex - Index to src render target
++/// @param x, y - Coordinates to raster tile.
++/// @param pDstHotTile - Pointer to Hot Tile
++void LoadHotTile(
++    SWR_SURFACE_STATE *pSrcSurface,
++    SWR_FORMAT dstFormat,
++    SWR_RENDERTARGET_ATTACHMENT renderTargetIndex,
++    uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex,
++    uint8_t *pDstHotTile)
++{
++    PFN_LOAD_TILES pfnLoadTiles = NULL;
++
++    // don't need to load null surfaces
++    if (pSrcSurface->type == SURFACE_NULL)
++    {
++        return;
++    }
++
++    if (renderTargetIndex < SWR_ATTACHMENT_DEPTH)
++    {
++        switch (pSrcSurface->tileMode)
++        {
++        case SWR_TILE_NONE:
++            pfnLoadTiles = sLoadTilesColorTable_SWR_TILE_NONE[pSrcSurface->format];
++            break;
++        case SWR_TILE_MODE_YMAJOR:
++            pfnLoadTiles = sLoadTilesColorTable_SWR_TILE_MODE_YMAJOR[pSrcSurface->format];
++            break;
++        case SWR_TILE_MODE_XMAJOR:
++            pfnLoadTiles = sLoadTilesColorTable_SWR_TILE_MODE_XMAJOR[pSrcSurface->format];
++            break;
++        default:
++            SWR_ASSERT(0, "Unsupported tiling mode");
++            break;
++        }
++    }
++    else if (renderTargetIndex == SWR_ATTACHMENT_DEPTH)
++    {
++        // Currently depth can map to linear and tile-y.
++        switch (pSrcSurface->tileMode)
++        {
++        case SWR_TILE_NONE:
++            pfnLoadTiles = sLoadTilesDepthTable_SWR_TILE_NONE[pSrcSurface->format];
++            break;
++        case SWR_TILE_MODE_YMAJOR:
++            pfnLoadTiles = sLoadTilesDepthTable_SWR_TILE_MODE_YMAJOR[pSrcSurface->format];
++            break;
++        default:
++            SWR_ASSERT(0, "Unsupported tiling mode");
++            break;
++        }
++    }
++    else
++    {
++        SWR_ASSERT(renderTargetIndex == SWR_ATTACHMENT_STENCIL);
++        SWR_ASSERT(pSrcSurface->format == R8_UINT);
++        switch (pSrcSurface->tileMode)
++        {
++        case SWR_TILE_NONE:
++            pfnLoadTiles = LoadMacroTile<TilingTraits<SWR_TILE_NONE, 8>, R8_UINT, R8_UINT>::Load;
++            break;
++        case SWR_TILE_MODE_WMAJOR:
++            pfnLoadTiles = LoadMacroTile<TilingTraits<SWR_TILE_MODE_WMAJOR, 8>, R8_UINT, R8_UINT>::Load;
++            break;
++        default:
++            SWR_ASSERT(0, "Unsupported tiling mode");
++            break;
++        }
++    }
++
++    SWR_ASSERT(pfnLoadTiles != NULL);
++
++    // Load a macro tile.
++#ifdef KNOB_ENABLE_RDTSC
++    if (sBuckets[pSrcSurface->format] == -1)
++    {
++        // guard sBuckets update since load tiles is called by multiple threads
++        sBucketMutex.lock();
++        if (sBuckets[pSrcSurface->format] == -1)
++        {
++            const SWR_FORMAT_INFO& info = GetFormatInfo(pSrcSurface->format);
++            BUCKET_DESC desc{ info.name, "", false, 0xffffffff };
++            sBuckets[pSrcSurface->format] = gBucketMgr.RegisterBucket(desc);
++        }
++        sBucketMutex.unlock();
++    }
++#endif
++
++    BUCKETS_START(sBuckets[pSrcSurface->format]);
++    pfnLoadTiles(pSrcSurface, pDstHotTile, x, y, renderTargetArrayIndex);
++    BUCKETS_STOP(sBuckets[pSrcSurface->format]);
++}
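The lazy bucket registration above is a double-checked lock: an unlocked read filters the common case, and the check is repeated under the mutex before registering. In isolation the idiom looks like this (simplified sketch; the unlocked first read is only benign where aligned int loads are atomic, and std::call_once would be the strictly portable alternative):

    #include <mutex>
    #include <vector>

    static std::vector<int> sIds(64, -1);
    static std::mutex sIdMutex;

    static int get_or_register(int key)
    {
        if (sIds[key] == -1)                      // unlocked fast path
        {
            std::lock_guard<std::mutex> lock(sIdMutex);
            if (sIds[key] == -1)                  // re-check under the lock
                sIds[key] = key;                  // stand-in for RegisterBucket()
        }
        return sIds[key];
    }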
++#define INIT_LOAD_TILES_COLOR_TABLE(tilemode) \
++    memset(sLoadTilesColorTable_##tilemode, 0, sizeof(sLoadTilesColorTable_##tilemode)); \
++    \
++    sLoadTilesColorTable_##tilemode[R32G32B32A32_FLOAT] = LoadMacroTile<TilingTraits<tilemode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[R32G32B32A32_SINT] = LoadMacroTile<TilingTraits<tilemode, 128>, R32G32B32A32_SINT, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[R32G32B32A32_UINT] = LoadMacroTile<TilingTraits<tilemode, 128>, R32G32B32A32_UINT, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[R32G32B32X32_FLOAT] = LoadMacroTile<TilingTraits<tilemode, 128>, R32G32B32X32_FLOAT, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[R32G32B32_FLOAT] = LoadMacroTile<TilingTraits<tilemode, 96>, R32G32B32_FLOAT, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[R32G32B32_SINT] = LoadMacroTile<TilingTraits<tilemode, 96>, R32G32B32_SINT, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[R32G32B32_UINT] = LoadMacroTile<TilingTraits<tilemode, 96>, R32G32B32_UINT, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[R16G16B16A16_UNORM] = LoadMacroTile<TilingTraits<tilemode, 64>, R16G16B16A16_UNORM, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[R16G16B16A16_SNORM] = LoadMacroTile<TilingTraits<tilemode, 64>, R16G16B16A16_SNORM, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[R16G16B16A16_SINT] = LoadMacroTile<TilingTraits<tilemode, 64>, R16G16B16A16_SINT, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[R16G16B16A16_UINT] = LoadMacroTile<TilingTraits<tilemode, 64>, R16G16B16A16_UINT, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[R16G16B16A16_FLOAT] = LoadMacroTile<TilingTraits<tilemode, 64>, R16G16B16A16_FLOAT, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[R32G32_FLOAT] = LoadMacroTile<TilingTraits<tilemode, 64>, R32G32_FLOAT, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[R32G32_SINT] = LoadMacroTile<TilingTraits<tilemode, 64>, R32G32_SINT, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[R32G32_UINT] = LoadMacroTile<TilingTraits<tilemode, 64>, R32G32_UINT, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[R16G16B16X16_UNORM] = LoadMacroTile<TilingTraits<tilemode, 64>, R16G16B16X16_UNORM, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[R16G16B16X16_FLOAT] = LoadMacroTile<TilingTraits<tilemode, 64>, R16G16B16X16_FLOAT, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[B8G8R8A8_UNORM] = LoadMacroTile<TilingTraits<tilemode, 32>, B8G8R8A8_UNORM, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[B8G8R8A8_UNORM_SRGB] = LoadMacroTile<TilingTraits<tilemode, 32>, B8G8R8A8_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[R10G10B10A2_UNORM] = LoadMacroTile<TilingTraits<tilemode, 32>, R10G10B10A2_UNORM, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[R10G10B10A2_UNORM_SRGB] = LoadMacroTile<TilingTraits<tilemode, 32>, R10G10B10A2_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[R10G10B10A2_UINT] = LoadMacroTile<TilingTraits<tilemode, 32>, R10G10B10A2_UINT, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[R8G8B8A8_UNORM] = LoadMacroTile<TilingTraits<tilemode, 32>, R8G8B8A8_UNORM, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[R8G8B8A8_UNORM_SRGB] = LoadMacroTile<TilingTraits<tilemode, 32>, R8G8B8A8_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[R8G8B8A8_SNORM] = LoadMacroTile<TilingTraits<tilemode, 32>, R8G8B8A8_SNORM, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[R8G8B8A8_SINT] = LoadMacroTile<TilingTraits<tilemode, 32>, R8G8B8A8_SINT, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[R8G8B8A8_UINT] = LoadMacroTile<TilingTraits<tilemode, 32>, R8G8B8A8_UINT, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[R16G16_UNORM] = LoadMacroTile<TilingTraits<tilemode, 32>, R16G16_UNORM, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[R16G16_SNORM] = LoadMacroTile<TilingTraits<tilemode, 32>, R16G16_SNORM, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[R16G16_SINT] = LoadMacroTile<TilingTraits<tilemode, 32>, R16G16_SINT, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[R16G16_UINT] = LoadMacroTile<TilingTraits<tilemode, 32>, R16G16_UINT, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[R16G16_FLOAT] = LoadMacroTile<TilingTraits<tilemode, 32>, R16G16_FLOAT, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[B10G10R10A2_UNORM] = LoadMacroTile<TilingTraits<tilemode, 32>, B10G10R10A2_UNORM, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[B10G10R10A2_UNORM_SRGB] = LoadMacroTile<TilingTraits<tilemode, 32>, B10G10R10A2_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[R11G11B10_FLOAT] = LoadMacroTile<TilingTraits<tilemode, 32>, R11G11B10_FLOAT, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[R32_SINT] = LoadMacroTile<TilingTraits<tilemode, 32>, R32_SINT, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[R32_UINT] = LoadMacroTile<TilingTraits<tilemode, 32>, R32_UINT, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[R32_FLOAT] = LoadMacroTile<TilingTraits<tilemode, 32>, R32_FLOAT, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[A32_FLOAT] = LoadMacroTile<TilingTraits<tilemode, 32>, A32_FLOAT, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[B8G8R8X8_UNORM] = LoadMacroTile<TilingTraits<tilemode, 32>, B8G8R8X8_UNORM, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[B8G8R8X8_UNORM_SRGB] = LoadMacroTile<TilingTraits<tilemode, 32>, B8G8R8X8_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[R8G8B8X8_UNORM] = LoadMacroTile<TilingTraits<tilemode, 32>, R8G8B8X8_UNORM, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[R8G8B8X8_UNORM_SRGB] = LoadMacroTile<TilingTraits<tilemode, 32>, R8G8B8X8_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[B10G10R10X2_UNORM] = LoadMacroTile<TilingTraits<tilemode, 32>, B10G10R10X2_UNORM, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[B5G6R5_UNORM] = LoadMacroTile<TilingTraits<tilemode, 16>, B5G6R5_UNORM, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[B5G6R5_UNORM_SRGB] = LoadMacroTile<TilingTraits<tilemode, 16>, B5G6R5_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[B5G5R5A1_UNORM] = LoadMacroTile<TilingTraits<tilemode, 16>, B5G5R5A1_UNORM, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[B5G5R5A1_UNORM_SRGB] = LoadMacroTile<TilingTraits<tilemode, 16>, B5G5R5A1_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[B4G4R4A4_UNORM] = LoadMacroTile<TilingTraits<tilemode, 16>, B4G4R4A4_UNORM, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[B4G4R4A4_UNORM_SRGB] = LoadMacroTile<TilingTraits<tilemode, 16>, B4G4R4A4_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[R8G8_UNORM] = LoadMacroTile<TilingTraits<tilemode, 16>, R8G8_UNORM, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[R8G8_SNORM] = LoadMacroTile<TilingTraits<tilemode, 16>, R8G8_SNORM, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[R8G8_SINT] = LoadMacroTile<TilingTraits<tilemode, 16>, R8G8_SINT, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[R8G8_UINT] = LoadMacroTile<TilingTraits<tilemode, 16>, R8G8_UINT, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[R16_UNORM] = LoadMacroTile<TilingTraits<tilemode, 16>, R16_UNORM, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[R16_SNORM] = LoadMacroTile<TilingTraits<tilemode, 16>, R16_SNORM, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[R16_SINT] = LoadMacroTile<TilingTraits<tilemode, 16>, R16_SINT, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[R16_UINT] = LoadMacroTile<TilingTraits<tilemode, 16>, R16_UINT, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[R16_FLOAT] = LoadMacroTile<TilingTraits<tilemode, 16>, R16_FLOAT, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[A16_UNORM] = LoadMacroTile<TilingTraits<tilemode, 16>, A16_UNORM, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[A16_FLOAT] = LoadMacroTile<TilingTraits<tilemode, 16>, A16_FLOAT, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[B5G5R5X1_UNORM] = LoadMacroTile<TilingTraits<tilemode, 16>, B5G5R5X1_UNORM, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[B5G5R5X1_UNORM_SRGB] = LoadMacroTile<TilingTraits<tilemode, 16>, B5G5R5X1_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[R8_UNORM] = LoadMacroTile<TilingTraits<tilemode, 8>, R8_UNORM, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[R8_SNORM] = LoadMacroTile<TilingTraits<tilemode, 8>, R8_SNORM, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[R8_SINT] = LoadMacroTile<TilingTraits<tilemode, 8>, R8_SINT, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[R8_UINT] = LoadMacroTile<TilingTraits<tilemode, 8>, R8_UINT, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[A8_UNORM] = LoadMacroTile<TilingTraits<tilemode, 8>, A8_UNORM, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[BC1_UNORM] = LoadMacroTile<TilingTraits<tilemode, 64>, BC1_UNORM, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[BC2_UNORM] = LoadMacroTile<TilingTraits<tilemode, 128>, BC2_UNORM, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[BC3_UNORM] = LoadMacroTile<TilingTraits<tilemode, 128>, BC3_UNORM, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[BC4_UNORM] = LoadMacroTile<TilingTraits<tilemode, 64>, BC4_UNORM, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[BC5_UNORM] = LoadMacroTile<TilingTraits<tilemode, 128>, BC5_UNORM, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[BC1_UNORM_SRGB] = LoadMacroTile<TilingTraits<tilemode, 64>, BC1_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[BC2_UNORM_SRGB] = LoadMacroTile<TilingTraits<tilemode, 128>, BC2_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[BC3_UNORM_SRGB] = LoadMacroTile<TilingTraits<tilemode, 128>, BC3_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[R8G8B8_UNORM] = LoadMacroTile<TilingTraits<tilemode, 24>, R8G8B8_UNORM, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[R8G8B8_SNORM] = LoadMacroTile<TilingTraits<tilemode, 24>, R8G8B8_SNORM, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[BC4_SNORM] = LoadMacroTile<TilingTraits<tilemode, 64>, BC4_SNORM, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[BC5_SNORM] = LoadMacroTile<TilingTraits<tilemode, 128>, BC5_SNORM, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[R16G16B16_FLOAT] = LoadMacroTile<TilingTraits<tilemode, 48>, R16G16B16_FLOAT, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[R16G16B16_UNORM] = LoadMacroTile<TilingTraits<tilemode, 48>, R16G16B16_UNORM, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[R16G16B16_SNORM] = LoadMacroTile<TilingTraits<tilemode, 48>, R16G16B16_SNORM, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[R8G8B8_UNORM_SRGB] = LoadMacroTile<TilingTraits<tilemode, 24>, R8G8B8_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[R16G16B16_UINT] = LoadMacroTile<TilingTraits<tilemode, 48>, R16G16B16_UINT, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[R16G16B16_SINT] = LoadMacroTile<TilingTraits<tilemode, 48>, R16G16B16_SINT, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[R10G10B10A2_SNORM] = LoadMacroTile<TilingTraits<tilemode, 32>, R10G10B10A2_SNORM, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[R10G10B10A2_SINT] = LoadMacroTile<TilingTraits<tilemode, 32>, R10G10B10A2_SINT, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[B10G10R10A2_SNORM] = LoadMacroTile<TilingTraits<tilemode, 32>, B10G10R10A2_SNORM, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[B10G10R10A2_UINT] = LoadMacroTile<TilingTraits<tilemode, 32>, B10G10R10A2_UINT, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[B10G10R10A2_SINT] = LoadMacroTile<TilingTraits<tilemode, 32>, B10G10R10A2_SINT, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[R8G8B8_UINT] = LoadMacroTile<TilingTraits<tilemode, 24>, R8G8B8_UINT, R32G32B32A32_FLOAT>::Load; \
++    sLoadTilesColorTable_##tilemode[R8G8B8_SINT] = LoadMacroTile<TilingTraits<tilemode, 24>, R8G8B8_SINT, R32G32B32A32_FLOAT>::Load;
++
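++//////////////////////////////////////////////////////////////////////////
++/// Example expansion (illustrative): INIT_LOAD_TILES_COLOR_TABLE(tilemode)
++/// fills one table slot per surface format, instantiating LoadMacroTile for
++/// that (tiling mode, bpp, format) triple. For SWR_TILE_NONE and
++/// R8G8B8A8_UNORM (32bpp) the generated entry is:
++///
++///     sLoadTilesColorTable_SWR_TILE_NONE[R8G8B8A8_UNORM] =
++///         LoadMacroTile<TilingTraits<SWR_TILE_NONE, 32>,
++///                       R8G8B8A8_UNORM, R32G32B32A32_FLOAT>::Load;
++///
++/// so LoadHotTile above reduces to a single table lookup per
++/// (tiling mode, surface format) pair instead of a format switch.
++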
++//////////////////////////////////////////////////////////////////////////
++/// INIT_LOAD_TILES_DEPTH_TABLE - Helper macro for setting up the tables.
++#define INIT_LOAD_TILES_DEPTH_TABLE(tilemode) \
++    memset(sLoadTilesDepthTable_##tilemode, 0, sizeof(sLoadTilesDepthTable_##tilemode)); \
++    \
++    sLoadTilesDepthTable_##tilemode[R16_UNORM] = LoadMacroTile<TilingTraits<tilemode, 16>, R16_UNORM, R32_FLOAT>::Load; \
++    sLoadTilesDepthTable_##tilemode[R32_FLOAT] = LoadMacroTile<TilingTraits<tilemode, 32>, R32_FLOAT, R32_FLOAT>::Load; \
++    sLoadTilesDepthTable_##tilemode[R24_UNORM_X8_TYPELESS] = LoadMacroTile<TilingTraits<tilemode, 32>, R24_UNORM_X8_TYPELESS, R32_FLOAT>::Load;
++
++//////////////////////////////////////////////////////////////////////////
++/// @brief Sets up tables for LoadTile
++void InitSimLoadTilesTable()
++{
++    INIT_LOAD_TILES_COLOR_TABLE(SWR_TILE_NONE);
++    INIT_LOAD_TILES_DEPTH_TABLE(SWR_TILE_NONE);
++
++    INIT_LOAD_TILES_COLOR_TABLE(SWR_TILE_MODE_YMAJOR);
++    INIT_LOAD_TILES_COLOR_TABLE(SWR_TILE_MODE_XMAJOR);
++
++    INIT_LOAD_TILES_DEPTH_TABLE(SWR_TILE_MODE_YMAJOR);
++}
+diff --git a/src/gallium/drivers/swr/rasterizer/memory/StoreTile.cpp b/src/gallium/drivers/swr/rasterizer/memory/StoreTile.cpp
+new file mode 100644
+index 0000000..fbd76a3
+--- /dev/null
++++ b/src/gallium/drivers/swr/rasterizer/memory/StoreTile.cpp
+@@ -0,0 +1,1645 @@
++/****************************************************************************
++* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
++*
++* Permission is hereby granted, free of charge, to any person obtaining a
++* copy of this software and associated documentation files (the "Software"),
++* to deal in the Software without restriction, including without limitation
++* the rights to use, copy, modify, merge, publish, distribute, sublicense,
++* and/or sell copies of the Software, and to permit persons to whom the
++* Software is furnished to do so, subject to the following conditions:
++*
++* The above copyright notice and this permission notice (including the next
++* paragraph) shall be included in all copies or substantial portions of the
++* Software.
++*
++* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
++* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++* IN THE SOFTWARE.
++*
++* @file StoreTile.cpp
++*
++* @brief Functionality for Store.
++*
++******************************************************************************/
++#include "common/os.h"
++#include "common/formats.h"
++#include "core/context.h"
++#include "core/rdtsc_core.h"
++#include "core/format_conversion.h"
++
++#include "memory/TilingFunctions.h"
++#include "memory/tilingtraits.h"
++#include "memory/Convert.h"
++#include "core/multisample.h"
++
++#include <algorithm>
++#include <mutex>
++
++typedef void(*PFN_STORE_TILES)(uint8_t*, SWR_SURFACE_STATE*, uint32_t, uint32_t, uint32_t);
++
++//////////////////////////////////////////////////////////////////////////
++/// Store Raster Tile Function Tables.
++//////////////////////////////////////////////////////////////////////////
++static PFN_STORE_TILES sStoreTilesTableColor[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS] = {};
++static PFN_STORE_TILES sStoreTilesTableDepth[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS] = {};
++static PFN_STORE_TILES sStoreTilesTableStencil[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS] = {};
++
++//////////////////////////////////////////////////////////////////////////
++/// StorePixels
++/// @brief Stores a 4x2 (AVX) raster-tile to two rows.
++/// @param pSrc - Pointer to source raster tile in SWRZ pixel order
++/// @param ppDsts - Array of destination pointers. Each pointer is
++///        to a single row of at most 16B.
++/// @tparam NumDests - Number of destination pointers. Each pair of
++///        pointers is for a 16-byte column of two rows.
++//////////////////////////////////////////////////////////////////////////
++template <size_t PixelSize, size_t NumDests>
++struct StorePixels
++{
++    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) = delete;
++};
++
++//////////////////////////////////////////////////////////////////////////
++/// StorePixels (8-bit pixel specialization)
++/// @brief Stores a 4x2 (AVX) raster-tile to two rows.
++/// @param pSrc - Pointer to source raster tile in SWRZ pixel order
++/// @param ppDsts - Array of destination pointers. Each pointer is
++///        to a single row of at most 16B.
++/// @tparam NumDests - Number of destination pointers. Each pair of
++///        pointers is for a 16-byte column of two rows.
++//////////////////////////////////////////////////////////////////////////
++template <>
++struct StorePixels<8, 2>
++{
++    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2])
++    {
++        // Each 4-pixel row is 4 bytes.
++        const uint16_t* pPixSrc = (const uint16_t*)pSrc;
++
++        // Unswizzle from SWR-Z order
++        uint16_t* pRow = (uint16_t*)ppDsts[0];
++        pRow[0] = pPixSrc[0];
++        pRow[1] = pPixSrc[2];
++
++        pRow = (uint16_t*)ppDsts[1];
++        pRow[0] = pPixSrc[1];
++        pRow[1] = pPixSrc[3];
++    }
++};
++
++//////////////////////////////////////////////////////////////////////////
++/// StorePixels (16-bit pixel specialization)
++/// @brief Stores a 4x2 (AVX) raster-tile to two rows.
++/// @param pSrc - Pointer to source raster tile in SWRZ pixel order
++/// @param ppDsts - Array of destination pointers. Each pointer is
++///        to a single row of at most 16B.
++/// @tparam NumDests - Number of destination pointers. Each pair of
++///        pointers is for a 16-byte column of two rows.
++//////////////////////////////////////////////////////////////////////////
++template <>
++struct StorePixels<16, 2>
++{
++    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2])
++    {
++        // Each 4-pixel row is 8 bytes.
++        const uint32_t* pPixSrc = (const uint32_t*)pSrc;
++
++        // Unswizzle from SWR-Z order
++        uint32_t* pRow = (uint32_t*)ppDsts[0];
++        pRow[0] = pPixSrc[0];
++        pRow[1] = pPixSrc[2];
++
++        pRow = (uint32_t*)ppDsts[1];
++        pRow[0] = pPixSrc[1];
++        pRow[1] = pPixSrc[3];
++    }
++};
++
++//////////////////////////////////////////////////////////////////////////
++/// StorePixels (32-bit pixel specialization)
++/// @brief Stores a 4x2 (AVX) raster-tile to two rows.
++/// @param pSrc - Pointer to source raster tile in SWRZ pixel order
++/// @param ppDsts - Array of destination pointers. Each pointer is
++///        to a single row of at most 16B.
++/// @tparam NumDests - Number of destination pointers. Each pair of
++///        pointers is for a 16-byte column of two rows.
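++///
++/// For the 32-bit specialization below, the source raster tile holds two
++/// 2x2 quads back to back in SWR-Z order, [p0 p1 p2 p3][p4 p5 p6 p7],
++/// which land on screen as:
++///
++///     row 0: p0 p1 p4 p5
++///     row 1: p2 p3 p6 p7
++///
++/// The unpacklo/unpackhi pair in Store() recovers the two destination rows
++/// from that order.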
++//////////////////////////////////////////////////////////////////////////
++template <>
++struct StorePixels<32, 2>
++{
++    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2])
++    {
++        // Each 4-pixel row is 16-bytes
++        __m128i *pZRow01 = (__m128i*)pSrc;
++        __m128i vQuad00 = _mm_load_si128(pZRow01);
++        __m128i vQuad01 = _mm_load_si128(pZRow01 + 1);
++
++        __m128i vRow00 = _mm_unpacklo_epi64(vQuad00, vQuad01);
++        __m128i vRow10 = _mm_unpackhi_epi64(vQuad00, vQuad01);
++
++        _mm_storeu_si128((__m128i*)ppDsts[0], vRow00);
++        _mm_storeu_si128((__m128i*)ppDsts[1], vRow10);
++    }
++};
++
++//////////////////////////////////////////////////////////////////////////
++/// StorePixels (64-bit pixel specialization)
++/// @brief Stores a 4x2 (AVX) raster-tile to two rows.
++/// @param pSrc - Pointer to source raster tile in SWRZ pixel order
++/// @param ppDsts - Array of destination pointers. Each pointer is
++///        to a single row of at most 16B.
++/// @tparam NumDests - Number of destination pointers. Each pair of
++///        pointers is for a 16-byte column of two rows.
++//////////////////////////////////////////////////////////////////////////
++template <>
++struct StorePixels<64, 4>
++{
++    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
++    {
++        // Each 4-pixel row is 32 bytes.
++        const __m128i* pPixSrc = (const __m128i*)pSrc;
++
++        // order of pointers matches SWR-Z layout
++        __m128i** pvDsts = (__m128i**)&ppDsts[0];
++        *pvDsts[0] = pPixSrc[0];
++        *pvDsts[1] = pPixSrc[1];
++        *pvDsts[2] = pPixSrc[2];
++        *pvDsts[3] = pPixSrc[3];
++    }
++};
++
++//////////////////////////////////////////////////////////////////////////
++/// StorePixels (128-bit pixel specialization)
++/// @brief Stores a 4x2 (AVX) raster-tile to two rows.
++/// @param pSrc - Pointer to source raster tile in SWRZ pixel order
++/// @param ppDsts - Array of destination pointers. Each pointer is
++///        to a single row of at most 16B.
++/// @tparam NumDests - Number of destination pointers. Each pair of
++///        pointers is for a 16-byte column of two rows.
++//////////////////////////////////////////////////////////////////////////
++template <>
++struct StorePixels<128, 8>
++{
++    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[8])
++    {
++        // Each 4-pixel row is 64 bytes.
++        const __m128i* pPixSrc = (const __m128i*)pSrc;
++
++        // Unswizzle from SWR-Z order
++        __m128i** pvDsts = (__m128i**)&ppDsts[0];
++        *pvDsts[0] = pPixSrc[0];
++        *pvDsts[1] = pPixSrc[2];
++        *pvDsts[2] = pPixSrc[1];
++        *pvDsts[3] = pPixSrc[3];
++        *pvDsts[4] = pPixSrc[4];
++        *pvDsts[5] = pPixSrc[6];
++        *pvDsts[6] = pPixSrc[5];
++        *pvDsts[7] = pPixSrc[7];
++    }
++};
++
++//////////////////////////////////////////////////////////////////////////
++/// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2)
++//////////////////////////////////////////////////////////////////////////
++template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
++struct ConvertPixelsSOAtoAOS
++{
++    //////////////////////////////////////////////////////////////////////////
++    /// @brief Converts a SIMD from the Hot Tile to the destination format
++    ///        and converts from SOA to AOS.
++    /// @param pSrc - Pointer to raster tile.
++    /// @param pDst - Pointer to destination surface or deswizzling buffer.
++    template <size_t NumDests>
++    INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
++    {
++        static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel
++
++        OSALIGNSIMD(uint8_t) soaTile[MAX_RASTER_TILE_BYTES];
++        OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
++
++        // Convert from SrcFormat --> DstFormat
++        simdvector src;
++        LoadSOA<SrcFormat>(pSrc, src);
++        StoreSOA<DstFormat>(src, soaTile);
++
++        // Convert from SOA --> AOS
++        FormatTraits<DstFormat>::TransposeT::Transpose(soaTile, aosTile);
++
++        // Store data into destination
++        StorePixels<FormatTraits<DstFormat>::bpp, NumDests>::Store(aosTile, ppDsts);
++    }
++};
++
++//////////////////////////////////////////////////////////////////////////
++/// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2)
++/// Specialization for no format conversion
++//////////////////////////////////////////////////////////////////////////
++template<SWR_FORMAT Format>
++struct ConvertPixelsSOAtoAOS<Format, Format>
++{
++    //////////////////////////////////////////////////////////////////////////
++    /// @brief Converts a SIMD from the Hot Tile to the destination format
++    ///        and converts from SOA to AOS.
++    /// @param pSrc - Pointer to raster tile.
++    /// @param pDst - Pointer to destination surface or deswizzling buffer.
++    template <size_t NumDests>
++    INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
++    {
++        static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel
++
++        OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
++
++        // Convert from SOA --> AOS
++        FormatTraits<Format>::TransposeT::Transpose(pSrc, aosTile);
++
++        // Store data into destination
++        StorePixels<FormatTraits<Format>::bpp, NumDests>::Store(aosTile, ppDsts);
++    }
++};
++
++//////////////////////////////////////////////////////////////////////////
++/// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2)
++//////////////////////////////////////////////////////////////////////////
++template<>
++struct ConvertPixelsSOAtoAOS<R32_FLOAT, R24_UNORM_X8_TYPELESS>
++{
++    static const SWR_FORMAT SrcFormat = R32_FLOAT;
++    static const SWR_FORMAT DstFormat = R24_UNORM_X8_TYPELESS;
++
++    //////////////////////////////////////////////////////////////////////////
++    /// @brief Converts a SIMD from the Hot Tile to the destination format
++    ///        and converts from SOA to AOS.
++    /// @param pSrc - Pointer to raster tile.
++    /// @param pDst - Pointer to destination surface or deswizzling buffer.
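++    ///
++    /// Note: R24_UNORM_X8_TYPELESS packs 24 depth bits and 8 typeless bits
++    /// in each 32-bit word, so the store below is a read-modify-write that
++    /// must leave the destination's X8 byte intact, i.e. per pixel:
++    ///
++    ///     dst = (dst & 0xFF000000) | (depth24 & 0x00FFFFFF);
++    ///
++    /// The vMask/andnot/or sequence in Convert() computes this for a full
++    /// SIMD of pixels at a time.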
++    template <size_t NumDests>
++    INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
++    {
++        static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel
++
++        OSALIGNSIMD(uint8_t) soaTile[MAX_RASTER_TILE_BYTES];
++        OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
++
++        // Convert from SrcFormat --> DstFormat
++        simdvector src;
++        LoadSOA<SrcFormat>(pSrc, src);
++        StoreSOA<DstFormat>(src, soaTile);
++
++        // Convert from SOA --> AOS
++        FormatTraits<DstFormat>::TransposeT::Transpose(soaTile, aosTile);
++
++        // Store data into destination but don't overwrite the X8 bits
++        // Each 4-pixel row is 16-bytes
++        __m128i *pZRow01 = (__m128i*)aosTile;
++        __m128i vQuad00 = _mm_load_si128(pZRow01);
++        __m128i vQuad01 = _mm_load_si128(pZRow01 + 1);
++
++        __m128i vRow00 = _mm_unpacklo_epi64(vQuad00, vQuad01);
++        __m128i vRow10 = _mm_unpackhi_epi64(vQuad00, vQuad01);
++
++        __m128i vDst0 = _mm_loadu_si128((const __m128i*)ppDsts[0]);
++        __m128i vDst1 = _mm_loadu_si128((const __m128i*)ppDsts[1]);
++
++        __m128i vMask = _mm_set1_epi32(0xFFFFFF);
++
++        vDst0 = _mm_andnot_si128(vMask, vDst0);
++        vDst0 = _mm_or_si128(vDst0, _mm_and_si128(vRow00, vMask));
++        vDst1 = _mm_andnot_si128(vMask, vDst1);
++        vDst1 = _mm_or_si128(vDst1, _mm_and_si128(vRow10, vMask));
++
++        _mm_storeu_si128((__m128i*)ppDsts[0], vDst0);
++        _mm_storeu_si128((__m128i*)ppDsts[1], vDst1);
++    }
++};
++
++template<SWR_FORMAT DstFormat>
++INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst1)
++{
++    static const uint32_t offset = sizeof(simdscalar);
++
++    // swizzle rgba -> bgra while we load
++    simdscalar vComp0 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(0))*offset)); // float32 rrrrrrrr
++    simdscalar vComp1 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(1))*offset)); // float32 gggggggg
++    simdscalar vComp2 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(2))*offset)); // float32 bbbbbbbb
++    simdscalar vComp3 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(3))*offset)); // float32 aaaaaaaa
++
++    // clamp
++    vComp0 = _simd_max_ps(vComp0, _simd_setzero_ps());
++    vComp0 = _simd_min_ps(vComp0, _simd_set1_ps(1.0f));
++
++    vComp1 = _simd_max_ps(vComp1, _simd_setzero_ps());
++    vComp1 = _simd_min_ps(vComp1, _simd_set1_ps(1.0f));
++
++    vComp2 = _simd_max_ps(vComp2, _simd_setzero_ps());
++    vComp2 = _simd_min_ps(vComp2, _simd_set1_ps(1.0f));
++
++    vComp3 = _simd_max_ps(vComp3, _simd_setzero_ps());
++    vComp3 = _simd_min_ps(vComp3, _simd_set1_ps(1.0f));
++
++    if (FormatTraits<DstFormat>::isSRGB)
++    {
++        // Gamma-correct only rgb
++        vComp0 = FormatTraits<DstFormat>::convertSrgb(0, vComp0);
++        vComp1 = FormatTraits<DstFormat>::convertSrgb(1, vComp1);
++        vComp2 = FormatTraits<DstFormat>::convertSrgb(2, vComp2);
++    }
++
++    // convert float components from 0.0f .. 1.0f to correct scale for 0 .. 255 dest format
++    vComp0 = _simd_mul_ps(vComp0, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
++    vComp1 = _simd_mul_ps(vComp1, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(1)));
++    vComp2 = _simd_mul_ps(vComp2, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(2)));
++    vComp3 = _simd_mul_ps(vComp3, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(3)));
++
++    // moving to 8 wide integer vector types
++    __m256i src0 = _simd_cvtps_epi32(vComp0); // padded byte rrrrrrrr
++    __m256i src1 = _simd_cvtps_epi32(vComp1); // padded byte gggggggg
++    __m256i src2 = _simd_cvtps_epi32(vComp2); // padded byte bbbbbbbb
++    __m256i src3 = _simd_cvtps_epi32(vComp3); // padded byte aaaaaaaa
++
++#if KNOB_ARCH == KNOB_ARCH_AVX
++
++    // splitting into two sets of 4 wide integer vector types
++    // because AVX doesn't have instructions to support this operation at 8 wide
++    __m128i srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r
++    __m128i srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g
++    __m128i srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b
++    __m128i srcLo3 = _mm256_castsi256_si128(src3); // 000a000a000a000a
++
++    __m128i srcHi0 = _mm256_extractf128_si256(src0, 1); // 000r000r000r000r
++    __m128i srcHi1 = _mm256_extractf128_si256(src1, 1); // 000g000g000g000g
++    __m128i srcHi2 = _mm256_extractf128_si256(src2, 1); // 000b000b000b000b
++    __m128i srcHi3 = _mm256_extractf128_si256(src3, 1); // 000a000a000a000a
++
++    srcLo1 = _mm_slli_si128(srcLo1, 1); // 00g000g000g000g0
++    srcHi1 = _mm_slli_si128(srcHi1, 1); // 00g000g000g000g0
++    srcLo2 = _mm_slli_si128(srcLo2, 2); // 0b000b000b000b00
++    srcHi2 = _mm_slli_si128(srcHi2, 2); // 0b000b000b000b00
++    srcLo3 = _mm_slli_si128(srcLo3, 3); // a000a000a000a000
++    srcHi3 = _mm_slli_si128(srcHi3, 3); // a000a000a000a000
++
++    srcLo0 = _mm_or_si128(srcLo0, srcLo1); // 00gr00gr00gr00gr
++    srcLo2 = _mm_or_si128(srcLo2, srcLo3); // ab00ab00ab00ab00
++
++    srcHi0 = _mm_or_si128(srcHi0, srcHi1); // 00gr00gr00gr00gr
++    srcHi2 = _mm_or_si128(srcHi2, srcHi3); // ab00ab00ab00ab00
++
++    srcLo0 = _mm_or_si128(srcLo0, srcLo2); // abgrabgrabgrabgr
++    srcHi0 = _mm_or_si128(srcHi0, srcHi2); // abgrabgrabgrabgr
++
++    // unpack into rows that get the tiling order correct
++    __m128i vRow00 = _mm_unpacklo_epi64(srcLo0, srcHi0); // abgrabgrabgrabgrabgrabgrabgrabgr
++    __m128i vRow10 = _mm_unpackhi_epi64(srcLo0, srcHi0);
++
++    __m256i final = _mm256_castsi128_si256(vRow00);
++    final = _mm256_insertf128_si256(final, vRow10, 1);
++
++#elif KNOB_ARCH == KNOB_ARCH_AVX2
++
++    // logic is as above, only wider
++    src1 = _mm256_slli_si256(src1, 1);
++    src2 = _mm256_slli_si256(src2, 2);
++    src3 = _mm256_slli_si256(src3, 3);
++
++    src0 = _mm256_or_si256(src0, src1);
++    src2 = _mm256_or_si256(src2, src3);
++
++    __m256i final = _mm256_or_si256(src0, src2);
++
++    // adjust the data to get the tiling order correct 0 1 2 3 -> 0 2 1 3
++    final = _mm256_permute4x64_epi64(final, 0xD8);
++
++#endif
++
++    _mm256_storeu2_m128i((__m128i*)pDst1, (__m128i*)pDst, final);
++}
++
++template<SWR_FORMAT DstFormat>
++INLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst1)
++{
++    static const uint32_t offset = sizeof(simdscalar);
++
++    // swizzle rgba -> bgra while we load
++    simdscalar vComp0 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(0))*offset)); // float32 rrrrrrrr
++    simdscalar vComp1 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(1))*offset)); // float32 gggggggg
++    simdscalar vComp2 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(2))*offset)); // float32 bbbbbbbb
++
++    // clamp
++    vComp0 = _simd_max_ps(vComp0, _simd_setzero_ps());
++    vComp0 = _simd_min_ps(vComp0, _simd_set1_ps(1.0f));
++
++    vComp1 = _simd_max_ps(vComp1, _simd_setzero_ps());
++    vComp1 = _simd_min_ps(vComp1, _simd_set1_ps(1.0f));
++
++    vComp2 = _simd_max_ps(vComp2, _simd_setzero_ps());
++    vComp2 = _simd_min_ps(vComp2, _simd_set1_ps(1.0f));
++
++    if (FormatTraits<DstFormat>::isSRGB)
++    {
++        // Gamma-correct only rgb
++        vComp0 = FormatTraits<DstFormat>::convertSrgb(0, vComp0);
++        vComp1 = FormatTraits<DstFormat>::convertSrgb(1, vComp1);
++        vComp2 = FormatTraits<DstFormat>::convertSrgb(2, vComp2);
++    }
++
++    // convert float components from 0.0f .. 1.0f to correct scale for 0 .. 255 dest format
++    vComp0 = _simd_mul_ps(vComp0, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
++    vComp1 = _simd_mul_ps(vComp1, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(1)));
++    vComp2 = _simd_mul_ps(vComp2, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(2)));
++
++    // moving to 8 wide integer vector types
++    __m256i src0 = _simd_cvtps_epi32(vComp0); // padded byte rrrrrrrr
++    __m256i src1 = _simd_cvtps_epi32(vComp1); // padded byte gggggggg
++    __m256i src2 = _simd_cvtps_epi32(vComp2); // padded byte bbbbbbbb
++
++#if KNOB_ARCH == KNOB_ARCH_AVX
++
++    // splitting into two sets of 4 wide integer vector types
++    // because AVX doesn't have instructions to support this operation at 8 wide
++    __m128i srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r
++    __m128i srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g
++    __m128i srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b
++
++    __m128i srcHi0 = _mm256_extractf128_si256(src0, 1); // 000r000r000r000r
++    __m128i srcHi1 = _mm256_extractf128_si256(src1, 1); // 000g000g000g000g
++    __m128i srcHi2 = _mm256_extractf128_si256(src2, 1); // 000b000b000b000b
++
++    srcLo1 = _mm_slli_si128(srcLo1, 1); // 00g000g000g000g0
++    srcHi1 = _mm_slli_si128(srcHi1, 1); // 00g000g000g000g0
++    srcLo2 = _mm_slli_si128(srcLo2, 2); // 0b000b000b000b00
++    srcHi2 = _mm_slli_si128(srcHi2, 2); // 0b000b000b000b00
++
++    srcLo0 = _mm_or_si128(srcLo0, srcLo1); // 00gr00gr00gr00gr
++
++    srcHi0 = _mm_or_si128(srcHi0, srcHi1); // 00gr00gr00gr00gr
++
++    srcLo0 = _mm_or_si128(srcLo0, srcLo2); // 0bgr0bgr0bgr0bgr
++    srcHi0 = _mm_or_si128(srcHi0, srcHi2); // 0bgr0bgr0bgr0bgr
++
++    // unpack into rows that get the tiling order correct
++    __m128i vRow00 = _mm_unpacklo_epi64(srcLo0, srcHi0); // 0bgr0bgr0bgr0bgr0bgr0bgr0bgr0bgr
++    __m128i vRow10 = _mm_unpackhi_epi64(srcLo0, srcHi0);
++
++    __m256i final = _mm256_castsi128_si256(vRow00);
++    final = _mm256_insertf128_si256(final, vRow10, 1);
++
++#elif KNOB_ARCH == KNOB_ARCH_AVX2
++
++    // logic is as above, only wider
++    src1 = _mm256_slli_si256(src1, 1);
++    src2 = _mm256_slli_si256(src2, 2);
++
++    src0 = _mm256_or_si256(src0, src1);
++
++    __m256i final = _mm256_or_si256(src0, src2);
++
++    // adjust the data to get the tiling order correct 0 1 2 3 -> 0 2 1 3
++    final = _mm256_permute4x64_epi64(final, 0xD8);
++
++#endif
++
++    _mm256_storeu2_m128i((__m128i*)pDst1, (__m128i*)pDst, final);
++}
++
++template<>
++struct ConvertPixelsSOAtoAOS<R32G32B32A32_FLOAT, B8G8R8A8_UNORM>
++{
++    template <size_t NumDests>
++    INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
++    {
++        FlatConvert<B8G8R8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1]);
++    }
++};
++
++template<>
++struct ConvertPixelsSOAtoAOS<R32G32B32A32_FLOAT, B8G8R8X8_UNORM>
++{
++    template <size_t NumDests>
++    INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
++    {
++        FlatConvertNoAlpha<B8G8R8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1]);
++    }
++};
++
++template<>
++struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B8G8R8A8_UNORM_SRGB >
++{
++    template <size_t NumDests>
++    INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
++    {
++        FlatConvert<B8G8R8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]);
++    }
++};
++
++template<>
++struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B8G8R8X8_UNORM_SRGB >
++{
++    template <size_t NumDests>
++    INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
++    {
++        FlatConvertNoAlpha<B8G8R8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]);
++    }
++};
++
++template<>
++struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8A8_UNORM >
++{
++    template <size_t NumDests>
++    INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
++    {
++        FlatConvert<R8G8B8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1]);
++    }
++};
++
++template<>
++struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8X8_UNORM >
++{
++    template <size_t NumDests>
++    INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
++    {
++        FlatConvertNoAlpha<R8G8B8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1]);
++    }
++};
++
++template<>
++struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8A8_UNORM_SRGB >
++{
++    template <size_t NumDests>
++    INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
++    {
++        FlatConvert<R8G8B8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]);
++    }
++};
++
++template<>
++struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8X8_UNORM_SRGB >
++{
++    template <size_t NumDests>
++    INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
++    {
++        FlatConvertNoAlpha<R8G8B8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]);
++    }
++};
++
++//////////////////////////////////////////////////////////////////////////
++/// StoreRasterTile
++//////////////////////////////////////////////////////////////////////////
++template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
++struct StoreRasterTile
++{
++    //////////////////////////////////////////////////////////////////////////
++    /// @brief Retrieve color from hot tile source which is always float.
++    /// @param pSrc - Pointer to raster tile.
++    /// @param x, y - Coordinates to raster tile.
++    /// @param output - output color
++    INLINE static void GetSwizzledSrcColor(
++        uint8_t* pSrc,
++        uint32_t x, uint32_t y,
++        float outputColor[4])
++    {
++        typedef SimdTile<SrcFormat, DstFormat> SimdT;
++
++        SimdT* pSrcSimdTiles = (SimdT*)pSrc;
++
++        // Compute which simd tile we're accessing within 8x8 tile.
++        //   i.e. Compute linear simd tile coordinate given (x, y) in pixel coordinates.
++        uint32_t simdIndex = (y / SIMD_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD_TILE_X_DIM) + (x / SIMD_TILE_X_DIM);
++
++        SimdT* pSimdTile = &pSrcSimdTiles[simdIndex];
++
++        uint32_t simdOffset = (y % SIMD_TILE_Y_DIM) * SIMD_TILE_X_DIM + (x % SIMD_TILE_X_DIM);
++
++        pSimdTile->GetSwizzledColor(simdOffset, outputColor);
++    }
++
++    //////////////////////////////////////////////////////////////////////////
++    /// @brief Stores an 8x8 raster tile to the destination surface.
++    /// @param pSrc - Pointer to raster tile.
++    /// @param pDstSurface - Destination surface state
++    /// @param x, y - Coordinates to raster tile.
++    INLINE static void Store(
++        uint8_t *pSrc,
++        SWR_SURFACE_STATE* pDstSurface,
++        uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) // (x, y) pixel coordinate to start of raster tile.
++    {
++        uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
++        uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
++
++        // For each raster tile pixel (rx, ry)
++        for (uint32_t ry = 0; ry < KNOB_TILE_Y_DIM; ++ry)
++        {
++            for (uint32_t rx = 0; rx < KNOB_TILE_X_DIM; ++rx)
++            {
++                // Perform bounds checking.
++ if (((x + rx) < lodWidth) && ++ ((y + ry) < lodHeight)) ++ { ++ float srcColor[4]; ++ GetSwizzledSrcColor(pSrc, rx, ry, srcColor); ++ ++ uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress((x + rx), (y + ry), ++ pDstSurface->arrayIndex + renderTargetArrayIndex, pDstSurface->arrayIndex + renderTargetArrayIndex, ++ sampleNum, pDstSurface->lod, pDstSurface); ++ ConvertPixelFromFloat(pDst, srcColor); ++ } ++ } ++ } ++ } ++}; ++ ++template ++struct OptStoreRasterTile : StoreRasterTile ++{}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 8bpp ++////////////////////////////////////////////////////////////////////////// ++template ++struct OptStoreRasterTile< TilingTraits, SrcFormat, DstFormat > ++{ ++ typedef StoreRasterTile, SrcFormat, DstFormat> GenericStoreTile; ++ static const size_t SRC_BYTES_PER_PIXEL = FormatTraits::bpp / 8; ++ static const size_t DST_BYTES_PER_PIXEL = FormatTraits::bpp / 8; ++ ++ ////////////////////////////////////////////////////////////////////////// ++ /// @brief Stores an 8x8 raster tile to the destination surface. ++ /// @param pSrc - Pointer to raster tile. ++ /// @param pDstSurface - Destination surface state ++ /// @param x, y - Coordinates to raster tile. ++ INLINE static void Store( ++ uint8_t *pSrc, ++ SWR_SURFACE_STATE* pDstSurface, ++ uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) ++ { ++ // Punt non-full tiles to generic store ++ uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); ++ uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); ++ if (x + KNOB_TILE_X_DIM > lodWidth || ++ y + KNOB_TILE_Y_DIM > lodHeight) ++ { ++ return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); ++ } ++ ++ uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, ++ pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); ++ uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch }; ++ ++ for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row) ++ { ++ uint8_t* ppStartRows[] = { ppRows[0], ppRows[1] }; ++ ++ for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col) ++ { ++ // Format conversion and convert from SOA to AOS, and store the rows. ++ ConvertPixelsSOAtoAOS::Convert(pSrc, ppRows); ++ ++ ppRows[0] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2; ++ ppRows[1] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;; ++ pSrc += SRC_BYTES_PER_PIXEL * KNOB_SIMD_WIDTH; ++ } ++ ++ ppRows[0] = ppStartRows[0] + 2 * pDstSurface->pitch; ++ ppRows[1] = ppStartRows[1] + 2 * pDstSurface->pitch; ++ } ++ } ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 16bpp ++////////////////////////////////////////////////////////////////////////// ++template ++struct OptStoreRasterTile< TilingTraits, SrcFormat, DstFormat > ++{ ++ typedef StoreRasterTile, SrcFormat, DstFormat> GenericStoreTile; ++ static const size_t SRC_BYTES_PER_PIXEL = FormatTraits::bpp / 8; ++ static const size_t DST_BYTES_PER_PIXEL = FormatTraits::bpp / 8; ++ ++ ////////////////////////////////////////////////////////////////////////// ++ /// @brief Stores an 8x8 raster tile to the destination surface. ++ /// @param pSrc - Pointer to raster tile. 
++ /// @param pDstSurface - Destination surface state ++ /// @param x, y - Coordinates to raster tile. ++ INLINE static void Store( ++ uint8_t *pSrc, ++ SWR_SURFACE_STATE* pDstSurface, ++ uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) ++ { ++ // Punt non-full tiles to generic store ++ uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); ++ uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); ++ if (x + KNOB_TILE_X_DIM > lodWidth || ++ y + KNOB_TILE_Y_DIM > lodHeight) ++ { ++ return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); ++ } ++ ++ uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, ++ pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); ++ uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch }; ++ ++ for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row) ++ { ++ uint8_t* ppStartRows[] = { ppRows[0], ppRows[1] }; ++ ++ for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col) ++ { ++ // Format conversion and convert from SOA to AOS, and store the rows. ++ ConvertPixelsSOAtoAOS::Convert(pSrc, ppRows); ++ ++ ppRows[0] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2; ++ ppRows[1] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;; ++ pSrc += SRC_BYTES_PER_PIXEL * KNOB_SIMD_WIDTH; ++ } ++ ++ ppRows[0] = ppStartRows[0] + 2 * pDstSurface->pitch; ++ ppRows[1] = ppStartRows[1] + 2 * pDstSurface->pitch; ++ } ++ } ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 32bpp ++////////////////////////////////////////////////////////////////////////// ++template ++struct OptStoreRasterTile< TilingTraits, SrcFormat, DstFormat > ++{ ++ typedef StoreRasterTile, SrcFormat, DstFormat> GenericStoreTile; ++ static const size_t SRC_BYTES_PER_PIXEL = FormatTraits::bpp / 8; ++ static const size_t DST_BYTES_PER_PIXEL = FormatTraits::bpp / 8; ++ ++ ////////////////////////////////////////////////////////////////////////// ++ /// @brief Stores an 8x8 raster tile to the destination surface. ++ /// @param pSrc - Pointer to raster tile. ++ /// @param pDstSurface - Destination surface state ++ /// @param x, y - Coordinates to raster tile. ++ INLINE static void Store( ++ uint8_t *pSrc, ++ SWR_SURFACE_STATE* pDstSurface, ++ uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) ++ { ++ // Punt non-full tiles to generic store ++ uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); ++ uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); ++ if (x + KNOB_TILE_X_DIM > lodWidth || ++ y + KNOB_TILE_Y_DIM > lodHeight) ++ { ++ return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); ++ } ++ ++ uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, ++ pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); ++ uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch }; ++ ++ for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row) ++ { ++ uint8_t* ppStartRows[] = { ppRows[0], ppRows[1] }; ++ ++ for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col) ++ { ++ // Format conversion and convert from SOA to AOS, and store the rows. 
++ ConvertPixelsSOAtoAOS::Convert(pSrc, ppRows); ++ ++ ppRows[0] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2; ++ ppRows[1] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;; ++ pSrc += SRC_BYTES_PER_PIXEL * KNOB_SIMD_WIDTH; ++ } ++ ++ ppRows[0] = ppStartRows[0] + 2 * pDstSurface->pitch; ++ ppRows[1] = ppStartRows[1] + 2 * pDstSurface->pitch; ++ } ++ } ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 64bpp ++////////////////////////////////////////////////////////////////////////// ++template ++struct OptStoreRasterTile< TilingTraits, SrcFormat, DstFormat > ++{ ++ typedef StoreRasterTile, SrcFormat, DstFormat> GenericStoreTile; ++ static const size_t DST_BYTES_PER_PIXEL = FormatTraits::bpp / 8; ++ static const size_t SRC_BYTES_PER_PIXEL = FormatTraits::bpp / 8; ++ static const size_t MAX_DST_COLUMN_BYTES = 16; ++ static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL; ++ static const size_t DST_COLUMN_BYTES_PER_SRC = KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2; ++ ++ ////////////////////////////////////////////////////////////////////////// ++ /// @brief Stores an 8x8 raster tile to the destination surface. ++ /// @param pSrc - Pointer to raster tile. ++ /// @param pDstSurface - Destination surface state ++ /// @param x, y - Coordinates to raster tile. ++ INLINE static void Store( ++ uint8_t *pSrc, ++ SWR_SURFACE_STATE* pDstSurface, ++ uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) ++ { ++ // Punt non-full tiles to generic store ++ uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); ++ uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); ++ if (x + KNOB_TILE_X_DIM > lodWidth || ++ y + KNOB_TILE_Y_DIM > lodHeight) ++ { ++ return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); ++ } ++ ++ uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, ++ pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); ++ uint8_t* ppDsts[] = ++ { ++ pDst, // row 0, col 0 ++ pDst + pDstSurface->pitch, // row 1, col 0 ++ pDst + MAX_DST_COLUMN_BYTES, // row 0, col 1 ++ pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES, // row 1, col 1 ++ }; ++ ++ for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row) ++ { ++ uint8_t* ppStartRows[] = ++ { ++ ppDsts[0], ++ ppDsts[1], ++ ppDsts[2], ++ ppDsts[3], ++ }; ++ ++ for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col) ++ { ++ // Format conversion and convert from SOA to AOS, and store the rows. 
++ ConvertPixelsSOAtoAOS::Convert(pSrc, ppDsts); ++ ++ ppDsts[0] += DST_COLUMN_BYTES_PER_SRC; ++ ppDsts[1] += DST_COLUMN_BYTES_PER_SRC; ++ ppDsts[2] += DST_COLUMN_BYTES_PER_SRC; ++ ppDsts[3] += DST_COLUMN_BYTES_PER_SRC; ++ pSrc += SRC_COLUMN_BYTES; ++ } ++ ++ ppDsts[0] = ppStartRows[0] + 2 * pDstSurface->pitch; ++ ppDsts[1] = ppStartRows[1] + 2 * pDstSurface->pitch; ++ ppDsts[2] = ppStartRows[2] + 2 * pDstSurface->pitch; ++ ppDsts[3] = ppStartRows[3] + 2 * pDstSurface->pitch; ++ } ++ } ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 128bpp ++////////////////////////////////////////////////////////////////////////// ++template ++struct OptStoreRasterTile< TilingTraits, SrcFormat, DstFormat > ++{ ++ typedef StoreRasterTile, SrcFormat, DstFormat> GenericStoreTile; ++ static const size_t DST_BYTES_PER_PIXEL = FormatTraits::bpp / 8; ++ static const size_t SRC_BYTES_PER_PIXEL = FormatTraits::bpp / 8; ++ static const size_t MAX_DST_COLUMN_BYTES = 16; ++ static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL; ++ static const size_t DST_COLUMN_BYTES_PER_SRC = KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2; ++ ++ ////////////////////////////////////////////////////////////////////////// ++ /// @brief Stores an 8x8 raster tile to the destination surface. ++ /// @param pSrc - Pointer to raster tile. ++ /// @param pDstSurface - Destination surface state ++ /// @param x, y - Coordinates to raster tile. ++ INLINE static void Store( ++ uint8_t *pSrc, ++ SWR_SURFACE_STATE* pDstSurface, ++ uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) ++ { ++ // Punt non-full tiles to generic store ++ uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); ++ uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); ++ if (x + KNOB_TILE_X_DIM > lodWidth || ++ y + KNOB_TILE_Y_DIM > lodHeight) ++ { ++ return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); ++ } ++ ++ uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, ++ pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); ++ struct DstPtrs ++ { ++ uint8_t* ppDsts[8]; ++ } ptrs; ++ ++ // Need 8 pointers, 4 columns of 2 rows each ++ for (uint32_t y = 0; y < 2; ++y) ++ { ++ for (uint32_t x = 0; x < 4; ++x) ++ { ++ ptrs.ppDsts[x * 2 + y] = pDst + y * pDstSurface->pitch + x * MAX_DST_COLUMN_BYTES; ++ } ++ } ++ ++ for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row) ++ { ++ DstPtrs startPtrs = ptrs; ++ ++ for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col) ++ { ++ // Format conversion and convert from SOA to AOS, and store the rows. 
++ ConvertPixelsSOAtoAOS::Convert(pSrc, ptrs.ppDsts); ++ ++ ptrs.ppDsts[0] += DST_COLUMN_BYTES_PER_SRC; ++ ptrs.ppDsts[1] += DST_COLUMN_BYTES_PER_SRC; ++ ptrs.ppDsts[2] += DST_COLUMN_BYTES_PER_SRC; ++ ptrs.ppDsts[3] += DST_COLUMN_BYTES_PER_SRC; ++ ptrs.ppDsts[4] += DST_COLUMN_BYTES_PER_SRC; ++ ptrs.ppDsts[5] += DST_COLUMN_BYTES_PER_SRC; ++ ptrs.ppDsts[6] += DST_COLUMN_BYTES_PER_SRC; ++ ptrs.ppDsts[7] += DST_COLUMN_BYTES_PER_SRC; ++ pSrc += SRC_COLUMN_BYTES; ++ } ++ ++ ptrs.ppDsts[0] = startPtrs.ppDsts[0] + 2 * pDstSurface->pitch; ++ ptrs.ppDsts[1] = startPtrs.ppDsts[1] + 2 * pDstSurface->pitch; ++ ptrs.ppDsts[2] = startPtrs.ppDsts[2] + 2 * pDstSurface->pitch; ++ ptrs.ppDsts[3] = startPtrs.ppDsts[3] + 2 * pDstSurface->pitch; ++ ptrs.ppDsts[4] = startPtrs.ppDsts[4] + 2 * pDstSurface->pitch; ++ ptrs.ppDsts[5] = startPtrs.ppDsts[5] + 2 * pDstSurface->pitch; ++ ptrs.ppDsts[6] = startPtrs.ppDsts[6] + 2 * pDstSurface->pitch; ++ ptrs.ppDsts[7] = startPtrs.ppDsts[7] + 2 * pDstSurface->pitch; ++ } ++ } ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 8bpp ++////////////////////////////////////////////////////////////////////////// ++template ++struct OptStoreRasterTile< TilingTraits, SrcFormat, DstFormat > ++{ ++ typedef StoreRasterTile, SrcFormat, DstFormat> GenericStoreTile; ++ ++ ////////////////////////////////////////////////////////////////////////// ++ /// @brief Stores an 8x8 raster tile to the destination surface. ++ /// @param pSrc - Pointer to raster tile. ++ /// @param pDstSurface - Destination surface state ++ /// @param x, y - Coordinates to raster tile. ++ INLINE static void Store( ++ uint8_t *pSrc, ++ SWR_SURFACE_STATE* pDstSurface, ++ uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) ++ { ++ static const uint32_t DestRowWidthBytes = 16; // 16B rows ++ ++ // Punt non-full tiles to generic store ++ uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); ++ uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); ++ if (x + KNOB_TILE_X_DIM > lodWidth || ++ y + KNOB_TILE_Y_DIM > lodHeight) ++ { ++ return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); ++ } ++ ++ // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows. ++ // We can compute the offsets to each column within the raster tile once and increment from these. ++ // There will be 2 x 4-wide columns in an 8x8 raster tile. ++ uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, ++ pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); ++ ++ // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE. ++ uint32_t pSrcInc = (FormatTraits::bpp * KNOB_SIMD_WIDTH) / 8; ++ ++ // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern. 
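++        // Address math for this 8bpp case: stepping down one pixel row
++        // advances DestRowWidthBytes (16B) within the column, and the 8
++        // pixels of a raster-tile row occupy only 8 bytes of a 16B row,
++        // so the second SIMD tile of each pass lands DestRowWidthBytes/4
++        // (4 bytes) further into the same two rows.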
++ for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM) ++ { ++ uint32_t rowOffset = row * DestRowWidthBytes; ++ ++ uint8_t* pRow = pCol0 + rowOffset; ++ uint8_t* ppDsts[] = { pRow, pRow + DestRowWidthBytes }; ++ ++ ConvertPixelsSOAtoAOS::Convert(pSrc, ppDsts); ++ pSrc += pSrcInc; ++ ++ ppDsts[0] += DestRowWidthBytes / 4; ++ ppDsts[1] += DestRowWidthBytes / 4; ++ ++ ConvertPixelsSOAtoAOS::Convert(pSrc, ppDsts); ++ pSrc += pSrcInc; ++ } ++ } ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 16bpp ++////////////////////////////////////////////////////////////////////////// ++template ++struct OptStoreRasterTile< TilingTraits, SrcFormat, DstFormat > ++{ ++ typedef StoreRasterTile, SrcFormat, DstFormat> GenericStoreTile; ++ ++ ////////////////////////////////////////////////////////////////////////// ++ /// @brief Stores an 8x8 raster tile to the destination surface. ++ /// @param pSrc - Pointer to raster tile. ++ /// @param pDstSurface - Destination surface state ++ /// @param x, y - Coordinates to raster tile. ++ INLINE static void Store( ++ uint8_t *pSrc, ++ SWR_SURFACE_STATE* pDstSurface, ++ uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) ++ { ++ static const uint32_t DestRowWidthBytes = 16; // 16B rows ++ ++ // Punt non-full tiles to generic store ++ uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); ++ uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); ++ if (x + KNOB_TILE_X_DIM > lodWidth || ++ y + KNOB_TILE_Y_DIM > lodHeight) ++ { ++ return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); ++ } ++ ++ // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows. ++ // We can compute the offsets to each column within the raster tile once and increment from these. ++ // There will be 2 x 4-wide columns in an 8x8 raster tile. ++ uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, ++ pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); ++ ++ // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE. ++ uint32_t pSrcInc = (FormatTraits::bpp * KNOB_SIMD_WIDTH) / 8; ++ ++ // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern. ++ for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM) ++ { ++ uint32_t rowOffset = row * DestRowWidthBytes; ++ ++ uint8_t* pRow = pCol0 + rowOffset; ++ uint8_t* ppDsts[] = { pRow, pRow + DestRowWidthBytes }; ++ ++ ConvertPixelsSOAtoAOS::Convert(pSrc, ppDsts); ++ pSrc += pSrcInc; ++ ++ ppDsts[0] += DestRowWidthBytes / 2; ++ ppDsts[1] += DestRowWidthBytes / 2; ++ ++ ConvertPixelsSOAtoAOS::Convert(pSrc, ppDsts); ++ pSrc += pSrcInc; ++ } ++ } ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// OptStoreRasterTile - TILE_MODE_XMAJOR specialization for 32bpp ++////////////////////////////////////////////////////////////////////////// ++template ++struct OptStoreRasterTile< TilingTraits, SrcFormat, DstFormat > ++{ ++ typedef StoreRasterTile, SrcFormat, DstFormat> GenericStoreTile; ++ ++ ////////////////////////////////////////////////////////////////////////// ++ /// @brief Stores an 8x8 raster tile to the destination surface. ++ /// @param pSrc - Pointer to raster tile. 
++ /// @param pDstSurface - Destination surface state ++ /// @param x, y - Coordinates to raster tile. ++ INLINE static void Store( ++ uint8_t *pSrc, ++ SWR_SURFACE_STATE* pDstSurface, ++ uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) ++ { ++ static const uint32_t DestRowWidthBytes = 512; // 512B rows ++ ++ // Punt non-full tiles to generic store ++ uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); ++ uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); ++ if (x + KNOB_TILE_X_DIM > lodWidth || ++ y + KNOB_TILE_Y_DIM > lodHeight) ++ { ++ return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); ++ } ++ ++ // TileX is a row-major tiling mode where each 4KB tile consist of 8 x 512B rows. ++ // We can compute the offsets to each column within the raster tile once and increment from these. ++ uint8_t *pRow0 = (uint8_t*)ComputeSurfaceAddress(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, ++ pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); ++ uint8_t* pRow1 = pRow0 + DestRowWidthBytes; ++ ++ for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM) ++ { ++ for (uint32_t col = 0; col < KNOB_TILE_X_DIM; col += SIMD_TILE_X_DIM) ++ { ++ uint32_t xRowOffset = col * (FormatTraits::bpp / 8); ++ ++ uint8_t* ppDsts[] = { pRow0 + xRowOffset, pRow1 + xRowOffset }; ++ ConvertPixelsSOAtoAOS::Convert(pSrc, ppDsts); ++ ++ // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE. ++ pSrc += (FormatTraits::bpp * KNOB_SIMD_WIDTH) / 8; ++ } ++ ++ pRow0 += (DestRowWidthBytes * 2); ++ pRow1 += (DestRowWidthBytes * 2); ++ } ++ } ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 32bpp ++////////////////////////////////////////////////////////////////////////// ++template ++struct OptStoreRasterTile< TilingTraits, SrcFormat, DstFormat > ++{ ++ typedef StoreRasterTile, SrcFormat, DstFormat> GenericStoreTile; ++ ++ ////////////////////////////////////////////////////////////////////////// ++ /// @brief Stores an 8x8 raster tile to the destination surface. ++ /// @param pSrc - Pointer to raster tile. ++ /// @param pDstSurface - Destination surface state ++ /// @param x, y - Coordinates to raster tile. ++ INLINE static void Store( ++ uint8_t *pSrc, ++ SWR_SURFACE_STATE* pDstSurface, ++ uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) ++ { ++ static const uint32_t DestRowWidthBytes = 16; // 16B rows ++ static const uint32_t DestColumnBytes = DestRowWidthBytes * 32; // 16B x 32 rows. ++ ++ // Punt non-full tiles to generic store ++ uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); ++ uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); ++ if (x + KNOB_TILE_X_DIM > lodWidth || ++ y + KNOB_TILE_Y_DIM > lodHeight) ++ { ++ return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); ++ } ++ ++ // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows. ++ // We can compute the offsets to each column within the raster tile once and increment from these. ++ // There will be 2 x 4-wide columns in an 8x8 raster tile. 
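++        // Column math for this 32bpp case: a 16B TileY row holds 4 pixels,
++        // so pixels x=0..3 of the raster tile live in one column and
++        // x=4..7 in the next, DestColumnBytes (512B) away; hence the
++        // += DestColumnBytes step between the two conversions below.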
++ uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, ++ pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); ++ ++ // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE. ++ uint32_t pSrcInc = (FormatTraits::bpp * KNOB_SIMD_WIDTH) / 8; ++ ++ // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern. ++ for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM) ++ { ++ uint32_t rowOffset = row * DestRowWidthBytes; ++ ++ uint8_t* pRow = pCol0 + rowOffset; ++ uint8_t* ppDsts[] = { pRow, pRow + DestRowWidthBytes }; ++ ++ ConvertPixelsSOAtoAOS::Convert(pSrc, ppDsts); ++ pSrc += pSrcInc; ++ ++ ppDsts[0] += DestColumnBytes; ++ ppDsts[1] += DestColumnBytes; ++ ++ ConvertPixelsSOAtoAOS::Convert(pSrc, ppDsts); ++ pSrc += pSrcInc; ++ } ++ } ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 64bpp ++////////////////////////////////////////////////////////////////////////// ++template ++struct OptStoreRasterTile< TilingTraits, SrcFormat, DstFormat > ++{ ++ typedef StoreRasterTile, SrcFormat, DstFormat> GenericStoreTile; ++ ++ ////////////////////////////////////////////////////////////////////////// ++ /// @brief Stores an 8x8 raster tile to the destination surface. ++ /// @param pSrc - Pointer to raster tile. ++ /// @param pDstSurface - Destination surface state ++ /// @param x, y - Coordinates to raster tile. ++ INLINE static void Store( ++ uint8_t *pSrc, ++ SWR_SURFACE_STATE* pDstSurface, ++ uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) ++ { ++ static const uint32_t DestRowWidthBytes = 16; // 16B rows ++ static const uint32_t DestColumnBytes = DestRowWidthBytes * 32; // 16B x 32 rows. ++ ++ // Punt non-full tiles to generic store ++ uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); ++ uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); ++ if (x + KNOB_TILE_X_DIM > lodWidth || ++ y + KNOB_TILE_Y_DIM > lodHeight) ++ { ++ return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); ++ } ++ ++ // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows. ++ // We can compute the offsets to each column within the raster tile once and increment from these. ++ // There will be 2 x 4-wide columns in an 8x8 raster tile. ++ uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, ++ pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); ++ uint8_t* pCol1 = pCol0 + DestColumnBytes; ++ ++ // There are 4 columns, each 2 pixels wide when we have 64bpp pixels. ++ // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE. ++ uint32_t pSrcInc = (FormatTraits::bpp * KNOB_SIMD_WIDTH) / 8; ++ ++ // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern. 
++ for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM) ++ { ++ uint32_t rowOffset = row * DestRowWidthBytes; ++ uint8_t* ppDsts[] = ++ { ++ pCol0 + rowOffset, ++ pCol0 + rowOffset + DestRowWidthBytes, ++ pCol1 + rowOffset, ++ pCol1 + rowOffset + DestRowWidthBytes, ++ }; ++ ++ ConvertPixelsSOAtoAOS::Convert(pSrc, ppDsts); ++ pSrc += pSrcInc; ++ ++ ppDsts[0] += DestColumnBytes * 2; ++ ppDsts[1] += DestColumnBytes * 2; ++ ppDsts[2] += DestColumnBytes * 2; ++ ppDsts[3] += DestColumnBytes * 2; ++ ++ ConvertPixelsSOAtoAOS::Convert(pSrc, ppDsts); ++ pSrc += pSrcInc; ++ } ++ } ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// OptStoreRasterTile - SWR_TILE_MODE_YMAJOR specialization for 128bpp ++////////////////////////////////////////////////////////////////////////// ++template ++struct OptStoreRasterTile< TilingTraits, SrcFormat, DstFormat > ++{ ++ typedef StoreRasterTile, SrcFormat, DstFormat> GenericStoreTile; ++ ++ static const size_t TILE_Y_COL_WIDTH_BYTES = 16; ++ static const size_t TILE_Y_ROWS = 32; ++ static const size_t TILE_Y_COL_BYTES = TILE_Y_ROWS * TILE_Y_COL_WIDTH_BYTES; ++ ++ static const size_t DST_BYTES_PER_PIXEL = FormatTraits::bpp / 8; ++ static const size_t SRC_BYTES_PER_PIXEL = FormatTraits::bpp / 8; ++ static const size_t MAX_DST_COLUMN_BYTES = 16; ++ ++ static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL; ++ static const size_t DST_COLUMN_BYTES_PER_SRC = TILE_Y_COL_BYTES * 4; ++ ++ ////////////////////////////////////////////////////////////////////////// ++ /// @brief Stores an 8x8 raster tile to the destination surface. ++ /// @param pSrc - Pointer to raster tile. ++ /// @param pDstSurface - Destination surface state ++ /// @param x, y - Coordinates to raster tile. ++ INLINE static void Store( ++ uint8_t *pSrc, ++ SWR_SURFACE_STATE* pDstSurface, ++ uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) ++ { ++ // Punt non-full tiles to generic store ++ uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); ++ uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); ++ if (x + KNOB_TILE_X_DIM > lodWidth || ++ y + KNOB_TILE_Y_DIM > lodHeight) ++ { ++ return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); ++ } ++ ++ uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, ++ pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); ++ struct DstPtrs ++ { ++ uint8_t* ppDsts[8]; ++ } ptrs; ++ ++ // Need 8 pointers, 4 columns of 2 rows each ++ for (uint32_t y = 0; y < 2; ++y) ++ { ++ for (uint32_t x = 0; x < 4; ++x) ++ { ++ ptrs.ppDsts[x * 2 + y] = pDst + y * TILE_Y_COL_WIDTH_BYTES + x * TILE_Y_COL_BYTES; ++ } ++ } ++ ++ for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row) ++ { ++ DstPtrs startPtrs = ptrs; ++ ++ for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col) ++ { ++ // Format conversion and convert from SOA to AOS, and store the rows. 
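// What the SOA->AOS step below amounts to, sketched for one 8-wide SIMD of
// RGBA pixels (illustrative only; the real ConvertPixelsSOAtoAOS also performs
// the SrcFormat -> DstFormat conversion in the same pass using SIMD shuffles):
//
//   hot tile (SOA):  rrrrrrrr gggggggg bbbbbbbb aaaaaaaa
//   surface  (AOS):  rgba rgba rgba rgba rgba rgba rgba rgba
//
// i.e. a 4x8 -> 8x4 transpose: aos[pixel][channel] = soa[channel][pixel],
// with the lane order inside the 8 further quad-swizzled (see
// SimdTile::GetSwizzledColor in TilingFunctions.h).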
++ ConvertPixelsSOAtoAOS::Convert(pSrc, ptrs.ppDsts); ++ ++ ptrs.ppDsts[0] += DST_COLUMN_BYTES_PER_SRC; ++ ptrs.ppDsts[1] += DST_COLUMN_BYTES_PER_SRC; ++ ptrs.ppDsts[2] += DST_COLUMN_BYTES_PER_SRC; ++ ptrs.ppDsts[3] += DST_COLUMN_BYTES_PER_SRC; ++ ptrs.ppDsts[4] += DST_COLUMN_BYTES_PER_SRC; ++ ptrs.ppDsts[5] += DST_COLUMN_BYTES_PER_SRC; ++ ptrs.ppDsts[6] += DST_COLUMN_BYTES_PER_SRC; ++ ptrs.ppDsts[7] += DST_COLUMN_BYTES_PER_SRC; ++ pSrc += SRC_COLUMN_BYTES; ++ } ++ ++ ptrs.ppDsts[0] = startPtrs.ppDsts[0] + 2 * TILE_Y_COL_WIDTH_BYTES; ++ ptrs.ppDsts[1] = startPtrs.ppDsts[1] + 2 * TILE_Y_COL_WIDTH_BYTES; ++ ptrs.ppDsts[2] = startPtrs.ppDsts[2] + 2 * TILE_Y_COL_WIDTH_BYTES; ++ ptrs.ppDsts[3] = startPtrs.ppDsts[3] + 2 * TILE_Y_COL_WIDTH_BYTES; ++ ptrs.ppDsts[4] = startPtrs.ppDsts[4] + 2 * TILE_Y_COL_WIDTH_BYTES; ++ ptrs.ppDsts[5] = startPtrs.ppDsts[5] + 2 * TILE_Y_COL_WIDTH_BYTES; ++ ptrs.ppDsts[6] = startPtrs.ppDsts[6] + 2 * TILE_Y_COL_WIDTH_BYTES; ++ ptrs.ppDsts[7] = startPtrs.ppDsts[7] + 2 * TILE_Y_COL_WIDTH_BYTES; ++ } ++ } ++}; ++ ++////////////////////////////////////////////////////////////////////////// ++/// StoreMacroTile - Stores a macro tile which consists of raster tiles. ++////////////////////////////////////////////////////////////////////////// ++template ++struct StoreMacroTile ++{ ++ ////////////////////////////////////////////////////////////////////////// ++ /// @brief Stores a macrotile to the destination surface using safe implementation. ++ /// @param pSrc - Pointer to macro tile. ++ /// @param pDstSurface - Destination surface state ++ /// @param x, y - Coordinates to macro tile ++ static void StoreGeneric( ++ uint8_t *pSrcHotTile, ++ SWR_SURFACE_STATE* pDstSurface, ++ uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex) ++ { ++ // Store each raster tile from the hot tile to the destination surface. ++ for(uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM) ++ { ++ for(uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM) ++ { ++ for(uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++) ++ { ++ StoreRasterTile::Store (pSrcHotTile, pDstSurface, (x + col), (y + row), sampleNum, ++ renderTargetArrayIndex); ++ pSrcHotTile += KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits::bpp / 8); ++ } ++ } ++ } ++ } ++ ++ typedef void(*PFN_STORE_TILES_INTERNAL)(uint8_t*, SWR_SURFACE_STATE*, uint32_t, uint32_t, uint32_t, uint32_t); ++ ////////////////////////////////////////////////////////////////////////// ++ /// @brief Stores a macrotile to the destination surface. ++ /// @param pSrc - Pointer to macro tile. 
++    /// @param pDstSurface - Destination surface state
++    /// @param x, y - Coordinates to macro tile
++    static void Store(
++        uint8_t *pSrcHotTile,
++        SWR_SURFACE_STATE* pDstSurface,
++        uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex)
++    {
++        PFN_STORE_TILES_INTERNAL pfnStore[SWR_MAX_NUM_MULTISAMPLES];
++        for(uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++)
++        {
++            size_t dstSurfAddress = (size_t)ComputeSurfaceAddress<false>(
++                0,
++                0,
++                pDstSurface->arrayIndex + renderTargetArrayIndex, // z for 3D surfaces
++                pDstSurface->arrayIndex + renderTargetArrayIndex, // array index for 2D arrays
++                sampleNum,
++                pDstSurface->lod,
++                pDstSurface);
++
++            // Force the generic (safe) store-tile path when a tiled lod surface
++            // does not start on a 4KB page boundary.
++            bool bForceGeneric = (pDstSurface->tileMode != SWR_TILE_NONE) && (0 != (dstSurfAddress & 0xfff));
++
++            pfnStore[sampleNum] = (bForceGeneric || KNOB_USE_GENERIC_STORETILE) ? StoreRasterTile<TTraits, SrcFormat, DstFormat>::Store : OptStoreRasterTile<TTraits, SrcFormat, DstFormat>::Store;
++        }
++
++        // Store each raster tile from the hot tile to the destination surface.
++        for(uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
++        {
++            for(uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
++            {
++                for(uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++)
++                {
++                    pfnStore[sampleNum](pSrcHotTile, pDstSurface, (x + col), (y + row), sampleNum, renderTargetArrayIndex);
++                    pSrcHotTile += KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<SrcFormat>::bpp / 8);
++                }
++            }
++        }
++    }
++};
++
++static void BUCKETS_START(UINT id)
++{
++#ifdef KNOB_ENABLE_RDTSC
++    gBucketMgr.StartBucket(id);
++#endif
++}
++
++static void BUCKETS_STOP(UINT id)
++{
++#ifdef KNOB_ENABLE_RDTSC
++    gBucketMgr.StopBucket(id);
++#endif
++}
++
++// on demand buckets for store tiles
++static std::mutex sBucketMutex;
++static std::vector<int> sBuckets(NUM_SWR_FORMATS, -1);
++
++//////////////////////////////////////////////////////////////////////////
++/// @brief Deswizzles and stores a full hottile to a render surface
++/// @param pDstSurface - Destination surface state
++/// @param srcFormat - Format for hot tile.
++/// @param renderTargetIndex - Index to destination render target
++/// @param x, y - Coordinates to raster tile.
++/// @param pSrcHotTile - Pointer to Hot Tile ++void StoreHotTile( ++ SWR_SURFACE_STATE *pDstSurface, ++ SWR_FORMAT srcFormat, ++ SWR_RENDERTARGET_ATTACHMENT renderTargetIndex, ++ uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex, ++ uint8_t *pSrcHotTile) ++{ ++ // shouldn't ever see a null surface come through StoreTiles ++ SWR_ASSERT(pDstSurface->type != SURFACE_NULL); ++ ++ PFN_STORE_TILES pfnStoreTiles = nullptr; ++ if(renderTargetIndex <= SWR_ATTACHMENT_COLOR7) ++ { ++ pfnStoreTiles = sStoreTilesTableColor[pDstSurface->tileMode][pDstSurface->format]; ++ } ++ else if(renderTargetIndex == SWR_ATTACHMENT_DEPTH) ++ { ++ pfnStoreTiles = sStoreTilesTableDepth[pDstSurface->tileMode][pDstSurface->format]; ++ } ++ else ++ { ++ pfnStoreTiles = sStoreTilesTableStencil[pDstSurface->tileMode][pDstSurface->format]; ++ } ++ ++ if(nullptr == pfnStoreTiles) ++ { ++ SWR_ASSERT(false, "Invalid pixel format / tile mode for store tiles"); ++ } ++ ++ // Store a macro tile ++#ifdef KNOB_ENABLE_RDTSC ++ if (sBuckets[pDstSurface->format] == -1) ++ { ++ // guard sBuckets update since storetiles is called by multiple threads ++ sBucketMutex.lock(); ++ if (sBuckets[pDstSurface->format] == -1) ++ { ++ const SWR_FORMAT_INFO& info = GetFormatInfo(pDstSurface->format); ++ BUCKET_DESC desc{info.name, "", false, 0xffffffff}; ++ sBuckets[pDstSurface->format] = gBucketMgr.RegisterBucket(desc); ++ } ++ sBucketMutex.unlock(); ++ } ++#endif ++ ++ BUCKETS_START(sBuckets[pDstSurface->format]); ++ pfnStoreTiles(pSrcHotTile, pDstSurface, x, y, renderTargetArrayIndex); ++ BUCKETS_STOP(sBuckets[pDstSurface->format]); ++} ++ ++////////////////////////////////////////////////////////////////////////// ++/// InitStoreTilesTable - Helper for setting up the tables. ++template ++void InitStoreTilesTableColor( ++ PFN_STORE_TILES (&table)[NumTileModesT][ArraySizeT]) ++{ ++ table[TileModeT][R32G32B32A32_FLOAT] = StoreMacroTile, R32G32B32A32_FLOAT, R32G32B32A32_FLOAT>::Store; ++ table[TileModeT][R32G32B32A32_SINT] = StoreMacroTile, R32G32B32A32_FLOAT, R32G32B32A32_SINT>::Store; ++ table[TileModeT][R32G32B32A32_UINT] = StoreMacroTile, R32G32B32A32_FLOAT, R32G32B32A32_UINT>::Store; ++ table[TileModeT][R32G32B32X32_FLOAT] = StoreMacroTile, R32G32B32A32_FLOAT, R32G32B32X32_FLOAT>::Store; ++ table[TileModeT][R32G32B32_FLOAT] = StoreMacroTile, R32G32B32A32_FLOAT, R32G32B32_FLOAT>::Store; ++ table[TileModeT][R32G32B32_SINT] = StoreMacroTile, R32G32B32A32_FLOAT, R32G32B32_SINT>::Store; ++ table[TileModeT][R32G32B32_UINT] = StoreMacroTile, R32G32B32A32_FLOAT, R32G32B32_UINT>::Store; ++ table[TileModeT][R16G16B16A16_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R16G16B16A16_UNORM>::Store; ++ table[TileModeT][R16G16B16A16_SNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R16G16B16A16_SNORM>::Store; ++ table[TileModeT][R16G16B16A16_SINT] = StoreMacroTile, R32G32B32A32_FLOAT, R16G16B16A16_SINT>::Store; ++ table[TileModeT][R16G16B16A16_UINT] = StoreMacroTile, R32G32B32A32_FLOAT, R16G16B16A16_UINT>::Store; ++ table[TileModeT][R16G16B16A16_FLOAT] = StoreMacroTile, R32G32B32A32_FLOAT, R16G16B16A16_FLOAT>::Store; ++ table[TileModeT][R32G32_FLOAT] = StoreMacroTile, R32G32B32A32_FLOAT, R32G32_FLOAT>::Store; ++ table[TileModeT][R32G32_SINT] = StoreMacroTile, R32G32B32A32_FLOAT, R32G32_SINT>::Store; ++ table[TileModeT][R32G32_UINT] = StoreMacroTile, R32G32B32A32_FLOAT, R32G32_UINT>::Store; ++ table[TileModeT][R16G16B16X16_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R16G16B16X16_UNORM>::Store; ++ table[TileModeT][R16G16B16X16_FLOAT] = StoreMacroTile, 
R32G32B32A32_FLOAT, R16G16B16X16_FLOAT>::Store; ++ table[TileModeT][B8G8R8A8_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, B8G8R8A8_UNORM>::Store; ++ table[TileModeT][B8G8R8A8_UNORM_SRGB] = StoreMacroTile, R32G32B32A32_FLOAT, B8G8R8A8_UNORM_SRGB>::Store; ++ ++ // 101010_2, 565, 555_1, and 444_4 formats force generic store tile for now ++ table[TileModeT][R10G10B10A2_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R10G10B10A2_UNORM>::StoreGeneric; ++ table[TileModeT][R10G10B10A2_UNORM_SRGB] = StoreMacroTile, R32G32B32A32_FLOAT, R10G10B10A2_UNORM_SRGB>::StoreGeneric; ++ table[TileModeT][R10G10B10A2_UINT] = StoreMacroTile, R32G32B32A32_FLOAT, R10G10B10A2_UINT>::StoreGeneric; ++ ++ table[TileModeT][R8G8B8A8_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R8G8B8A8_UNORM>::Store; ++ table[TileModeT][R8G8B8A8_UNORM_SRGB] = StoreMacroTile, R32G32B32A32_FLOAT, R8G8B8A8_UNORM_SRGB>::Store; ++ table[TileModeT][R8G8B8A8_SNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R8G8B8A8_SNORM>::Store; ++ table[TileModeT][R8G8B8A8_SINT] = StoreMacroTile, R32G32B32A32_FLOAT, R8G8B8A8_SINT>::Store; ++ table[TileModeT][R8G8B8A8_UINT] = StoreMacroTile, R32G32B32A32_FLOAT, R8G8B8A8_UINT>::Store; ++ table[TileModeT][R16G16_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R16G16_UNORM>::Store; ++ table[TileModeT][R16G16_SNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R16G16_SNORM>::Store; ++ table[TileModeT][R16G16_SINT] = StoreMacroTile, R32G32B32A32_FLOAT, R16G16_SINT>::Store; ++ table[TileModeT][R16G16_UINT] = StoreMacroTile, R32G32B32A32_FLOAT, R16G16_UINT>::Store; ++ table[TileModeT][R16G16_FLOAT] = StoreMacroTile, R32G32B32A32_FLOAT, R16G16_FLOAT>::Store; ++ ++ // 101010_2, 565, 555_1, and 444_4 formats force generic store tile for now ++ table[TileModeT][B10G10R10A2_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, B10G10R10A2_UNORM>::StoreGeneric; ++ table[TileModeT][B10G10R10A2_UNORM_SRGB] = StoreMacroTile, R32G32B32A32_FLOAT, B10G10R10A2_UNORM_SRGB>::StoreGeneric; ++ table[TileModeT][R11G11B10_FLOAT] = StoreMacroTile, R32G32B32A32_FLOAT, R11G11B10_FLOAT>::StoreGeneric; ++ ++ table[TileModeT][R32_SINT] = StoreMacroTile, R32G32B32A32_FLOAT, R32_SINT>::Store; ++ table[TileModeT][R32_UINT] = StoreMacroTile, R32G32B32A32_FLOAT, R32_UINT>::Store; ++ table[TileModeT][R32_FLOAT] = StoreMacroTile, R32G32B32A32_FLOAT, R32_FLOAT>::Store; ++ table[TileModeT][A32_FLOAT] = StoreMacroTile, R32G32B32A32_FLOAT, A32_FLOAT>::Store; ++ table[TileModeT][B8G8R8X8_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, B8G8R8X8_UNORM>::Store; ++ table[TileModeT][B8G8R8X8_UNORM_SRGB] = StoreMacroTile, R32G32B32A32_FLOAT, B8G8R8X8_UNORM_SRGB>::Store; ++ table[TileModeT][R8G8B8X8_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R8G8B8X8_UNORM>::Store; ++ table[TileModeT][R8G8B8X8_UNORM_SRGB] = StoreMacroTile, R32G32B32A32_FLOAT, R8G8B8X8_UNORM_SRGB>::Store; ++ ++ // 101010_2, 565, 555_1, and 444_4 formats force generic store tile for now ++ table[TileModeT][B10G10R10X2_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, B10G10R10X2_UNORM>::StoreGeneric; ++ table[TileModeT][B5G6R5_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, B5G6R5_UNORM>::StoreGeneric; ++ table[TileModeT][B5G6R5_UNORM_SRGB] = StoreMacroTile, R32G32B32A32_FLOAT, B5G6R5_UNORM_SRGB>::StoreGeneric; ++ table[TileModeT][B5G5R5A1_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, B5G5R5A1_UNORM>::StoreGeneric; ++ table[TileModeT][B5G5R5A1_UNORM_SRGB] = StoreMacroTile, R32G32B32A32_FLOAT, B5G5R5A1_UNORM_SRGB>::StoreGeneric; ++ table[TileModeT][B4G4R4A4_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, 
B4G4R4A4_UNORM>::StoreGeneric; ++ table[TileModeT][B4G4R4A4_UNORM_SRGB] = StoreMacroTile, R32G32B32A32_FLOAT, B4G4R4A4_UNORM_SRGB>::StoreGeneric; ++ ++ table[TileModeT][R8G8_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R8G8_UNORM>::Store; ++ table[TileModeT][R8G8_SNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R8G8_SNORM>::Store; ++ table[TileModeT][R8G8_SINT] = StoreMacroTile, R32G32B32A32_FLOAT, R8G8_SINT>::Store; ++ table[TileModeT][R8G8_UINT] = StoreMacroTile, R32G32B32A32_FLOAT, R8G8_UINT>::Store; ++ table[TileModeT][R16_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R16_UNORM>::Store; ++ table[TileModeT][R16_SNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R16_SNORM>::Store; ++ table[TileModeT][R16_SINT] = StoreMacroTile, R32G32B32A32_FLOAT, R16_SINT>::Store; ++ table[TileModeT][R16_UINT] = StoreMacroTile, R32G32B32A32_FLOAT, R16_UINT>::Store; ++ table[TileModeT][R16_FLOAT] = StoreMacroTile, R32G32B32A32_FLOAT, R16_FLOAT>::Store; ++ table[TileModeT][A16_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, A16_UNORM>::Store; ++ table[TileModeT][A16_FLOAT] = StoreMacroTile, R32G32B32A32_FLOAT, A16_FLOAT>::Store; ++ ++ // 101010_2, 565, 555_1, and 444_4 formats force generic store tile for now ++ table[TileModeT][B5G5R5X1_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, B5G5R5X1_UNORM>::StoreGeneric; ++ table[TileModeT][B5G5R5X1_UNORM_SRGB] = StoreMacroTile, R32G32B32A32_FLOAT, B5G5R5X1_UNORM_SRGB>::StoreGeneric; ++ ++ table[TileModeT][R8_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R8_UNORM>::Store; ++ table[TileModeT][R8_SNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R8_SNORM>::Store; ++ table[TileModeT][R8_SINT] = StoreMacroTile, R32G32B32A32_FLOAT, R8_SINT>::Store; ++ table[TileModeT][R8_UINT] = StoreMacroTile, R32G32B32A32_FLOAT, R8_UINT>::Store; ++ table[TileModeT][A8_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, A8_UNORM>::Store; ++ table[TileModeT][BC1_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, BC1_UNORM>::Store; ++ table[TileModeT][BC2_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, BC2_UNORM>::Store; ++ table[TileModeT][BC3_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, BC3_UNORM>::Store; ++ table[TileModeT][BC4_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, BC4_UNORM>::Store; ++ table[TileModeT][BC5_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, BC5_UNORM>::Store; ++ table[TileModeT][BC1_UNORM_SRGB] = StoreMacroTile, R32G32B32A32_FLOAT, BC1_UNORM_SRGB>::Store; ++ table[TileModeT][BC2_UNORM_SRGB] = StoreMacroTile, R32G32B32A32_FLOAT, BC2_UNORM_SRGB>::Store; ++ table[TileModeT][BC3_UNORM_SRGB] = StoreMacroTile, R32G32B32A32_FLOAT, BC3_UNORM_SRGB>::Store; ++ table[TileModeT][R8G8B8_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R8G8B8_UNORM>::Store; ++ table[TileModeT][R8G8B8_SNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R8G8B8_SNORM>::Store; ++ table[TileModeT][BC4_SNORM] = StoreMacroTile, R32G32B32A32_FLOAT, BC4_SNORM>::Store; ++ table[TileModeT][BC5_SNORM] = StoreMacroTile, R32G32B32A32_FLOAT, BC5_SNORM>::Store; ++ table[TileModeT][R16G16B16_FLOAT] = StoreMacroTile, R32G32B32A32_FLOAT, R16G16B16_FLOAT>::Store; ++ table[TileModeT][R16G16B16_UNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R16G16B16_UNORM>::Store; ++ table[TileModeT][R16G16B16_SNORM] = StoreMacroTile, R32G32B32A32_FLOAT, R16G16B16_SNORM>::Store; ++ table[TileModeT][R8G8B8_UNORM_SRGB] = StoreMacroTile, R32G32B32A32_FLOAT, R8G8B8_UNORM_SRGB>::Store; ++ table[TileModeT][R16G16B16_UINT] = StoreMacroTile, R32G32B32A32_FLOAT, R16G16B16_UINT>::Store; ++ table[TileModeT][R16G16B16_SINT] = StoreMacroTile, R32G32B32A32_FLOAT, R16G16B16_SINT>::Store; ++ ++ // 
101010_2, 565, 555_1, and 444_4 formats force generic store tile for now
++    table[TileModeT][R10G10B10A2_SNORM] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R10G10B10A2_SNORM>::StoreGeneric;
++    table[TileModeT][R10G10B10A2_SINT] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R10G10B10A2_SINT>::StoreGeneric;
++    table[TileModeT][B10G10R10A2_SNORM] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, B10G10R10A2_SNORM>::StoreGeneric;
++    table[TileModeT][B10G10R10A2_UINT] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, B10G10R10A2_UINT>::StoreGeneric;
++    table[TileModeT][B10G10R10A2_SINT] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, B10G10R10A2_SINT>::StoreGeneric;
++
++    table[TileModeT][R8G8B8_UINT] = StoreMacroTile<TilingTraits<TileModeT, 24>, R32G32B32A32_FLOAT, R8G8B8_UINT>::Store;
++    table[TileModeT][R8G8B8_SINT] = StoreMacroTile<TilingTraits<TileModeT, 24>, R32G32B32A32_FLOAT, R8G8B8_SINT>::Store;
++}
++
++//////////////////////////////////////////////////////////////////////////
++/// InitStoreTilesTableDepth - Helper function for setting up the depth tables.
++template <SWR_TILE_MODE TileModeT, size_t NumTileModes, size_t ArraySizeT>
++void InitStoreTilesTableDepth(
++    PFN_STORE_TILES(&table)[NumTileModes][ArraySizeT])
++{
++    table[TileModeT][R32_FLOAT] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32_FLOAT, R32_FLOAT>::Store;
++    table[TileModeT][R24_UNORM_X8_TYPELESS] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32_FLOAT, R24_UNORM_X8_TYPELESS>::Store;
++    table[TileModeT][R16_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 16>, R32_FLOAT, R16_UNORM>::Store;
++}
++
++template <SWR_TILE_MODE TileModeT, size_t NumTileModes, size_t ArraySizeT>
++void InitStoreTilesTableStencil(
++    PFN_STORE_TILES(&table)[NumTileModes][ArraySizeT])
++{
++    table[TileModeT][R8_UINT] = StoreMacroTile<TilingTraits<TileModeT, 8>, R8_UINT, R8_UINT>::Store;
++}
++
++//////////////////////////////////////////////////////////////////////////
++/// @brief Sets up tables for StoreTile
++void InitSimStoreTilesTable()
++{
++    InitStoreTilesTableColor<SWR_TILE_NONE>(sStoreTilesTableColor);
++    InitStoreTilesTableDepth<SWR_TILE_NONE>(sStoreTilesTableDepth);
++    InitStoreTilesTableStencil<SWR_TILE_NONE>(sStoreTilesTableStencil);
++
++    InitStoreTilesTableColor<SWR_TILE_MODE_YMAJOR>(sStoreTilesTableColor);
++    InitStoreTilesTableColor<SWR_TILE_MODE_XMAJOR>(sStoreTilesTableColor);
++
++    InitStoreTilesTableDepth<SWR_TILE_MODE_YMAJOR>(sStoreTilesTableDepth);
++    InitStoreTilesTableStencil<SWR_TILE_MODE_WMAJOR>(sStoreTilesTableStencil);
++}
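// Net effect of the tables built above (an editorial sketch): tile mode and
// format are runtime values, but each table slot holds a fully specialized,
// statically compiled store routine, so StoreHotTile's per-macrotile work
// reduces to one 2D table lookup plus an indirect call:
//
//   pfnStoreTiles = sStoreTilesTableColor[pDstSurface->tileMode][pDstSurface->format];
//   pfnStoreTiles(pSrcHotTile, pDstSurface, x, y, renderTargetArrayIndex);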
+diff --git a/src/gallium/drivers/swr/rasterizer/memory/TilingFunctions.h b/src/gallium/drivers/swr/rasterizer/memory/TilingFunctions.h
+new file mode 100644
+index 0000000..78f54f8
+--- /dev/null
++++ b/src/gallium/drivers/swr/rasterizer/memory/TilingFunctions.h
+@@ -0,0 +1,518 @@
++/****************************************************************************
++* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
++*
++* Permission is hereby granted, free of charge, to any person obtaining a
++* copy of this software and associated documentation files (the "Software"),
++* to deal in the Software without restriction, including without limitation
++* the rights to use, copy, modify, merge, publish, distribute, sublicense,
++* and/or sell copies of the Software, and to permit persons to whom the
++* Software is furnished to do so, subject to the following conditions:
++*
++* The above copyright notice and this permission notice (including the next
++* paragraph) shall be included in all copies or substantial portions of the
++* Software.
++*
++* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
++* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++* IN THE SOFTWARE.
++*
++* @file TilingFunctions.h
++*
++* @brief Tiling functions.
++*
++******************************************************************************/
++#pragma once
++
++#include "core/state.h"
++#include "core/format_traits.h"
++#include "memory/tilingtraits.h"
++
++#include <algorithm>
++
++#define MAX_NUM_LOD 15
++
++#define GFX_ALIGN(x, a) (((x) + ((a) - 1)) - (((x) + ((a) - 1)) & ((a) - 1))) // Alt implementation with bitwise not (~) has issue with uint32 align used with 64-bit value, since ~'ed value will remain 32-bit.
++
++//////////////////////////////////////////////////////////////////////////
++/// SimdTile SSE(2x2), AVX(4x2), or AVX-512(4x4?)
++//////////////////////////////////////////////////////////////////////////
++template<SWR_FORMAT Format>
++struct SimdTile
++{
++    // SimdTile is SOA (e.g. rrrrrrrr gggggggg bbbbbbbb aaaaaaaa )
++    float color[FormatTraits<Format>::numComps][KNOB_SIMD_WIDTH];
++
++    //////////////////////////////////////////////////////////////////////////
++    /// @brief Retrieve color from simd.
++    /// @param index - linear index to color within simd.
++    /// @param outputColor - output color
++    INLINE void GetSwizzledColor(
++        uint32_t index,
++        float outputColor[4])
++    {
++        // SOA pattern for 2x2 is a subset of 4x2.
++        //   0 1 4 5
++        //   2 3 6 7
++        // The offset converts pattern to linear
++#if (SIMD_TILE_X_DIM == 4)
++        static const uint32_t offset[] = { 0, 1, 4, 5, 2, 3, 6, 7 };
++#elif (SIMD_TILE_X_DIM == 2)
++        static const uint32_t offset[] = { 0, 1, 2, 3 };
++#endif
++
++        for (uint32_t i = 0; i < FormatTraits<Format>::numComps; ++i)
++        {
++            outputColor[i] = this->color[FormatTraits<Format>::swizzle(i)][offset[index]];
++        }
++    }
++
++    //////////////////////////////////////////////////////////////////////////
++    /// @brief Set color within simd.
++    /// @param index - linear index to color within simd.
++    /// @param src - input color
++    INLINE void SetSwizzledColor(
++        uint32_t index,
++        const float src[4])
++    {
++        // SOA pattern for 2x2 is a subset of 4x2.
++        //   0 1 4 5
++        //   2 3 6 7
++        // The offset converts pattern to linear
++#if (SIMD_TILE_X_DIM == 4)
++        static const uint32_t offset[] = { 0, 1, 4, 5, 2, 3, 6, 7 };
++#elif (SIMD_TILE_X_DIM == 2)
++        static const uint32_t offset[] = { 0, 1, 2, 3 };
++#endif
++
++        // Only loop over the components needed for destination.
++        for (uint32_t i = 0; i < FormatTraits<Format>::numComps; ++i)
++        {
++            this->color[i][offset[index]] = src[i];
++        }
++    }
++};
++
++template<>
++struct SimdTile<R8_UINT>
++{
++    // SimdTile is SOA (e.g. rrrrrrrr gggggggg bbbbbbbb aaaaaaaa )
++    uint8_t color[FormatTraits<R8_UINT>::numComps][KNOB_SIMD_WIDTH];
++
++    //////////////////////////////////////////////////////////////////////////
++    /// @brief Retrieve color from simd.
++    /// @param index - linear index to color within simd.
++    /// @param outputColor - output color
++    INLINE void GetSwizzledColor(
++        uint32_t index,
++        float outputColor[4])
++    {
++        // SOA pattern for 2x2 is a subset of 4x2.
++        //   0 1 4 5
++        //   2 3 6 7
++        // The offset converts pattern to linear
++#if (SIMD_TILE_X_DIM == 4)
++        static const uint32_t offset[] = { 0, 1, 4, 5, 2, 3, 6, 7 };
++#elif (SIMD_TILE_X_DIM == 2)
++        static const uint32_t offset[] = { 0, 1, 2, 3 };
++#endif
++
++        for (uint32_t i = 0; i < FormatTraits<R8_UINT>::numComps; ++i)
++        {
++            uint32_t src = this->color[FormatTraits<R8_UINT>::swizzle(i)][offset[index]];
++            outputColor[i] = *(float*)&src;
++        }
++    }
++
++    //////////////////////////////////////////////////////////////////////////
++    /// @brief Set color within simd.
++    /// @param index - linear index to color within simd.
++    /// @param src - input color
++    INLINE void SetSwizzledColor(
++        uint32_t index,
++        const float src[4])
++    {
++        // SOA pattern for 2x2 is a subset of 4x2.
++        //   0 1 4 5
++        //   2 3 6 7
++        // The offset converts pattern to linear
++#if (SIMD_TILE_X_DIM == 4)
++        static const uint32_t offset[] = { 0, 1, 4, 5, 2, 3, 6, 7 };
++#elif (SIMD_TILE_X_DIM == 2)
++        static const uint32_t offset[] = { 0, 1, 2, 3 };
++#endif
++
++        // Only loop over the components needed for destination.
++        for (uint32_t i = 0; i < FormatTraits<R8_UINT>::numComps; ++i)
++        {
++            this->color[i][offset[index]] = *(uint8_t*)&src[i];
++        }
++    }
++};
++
++//////////////////////////////////////////////////////////////////////////
++/// @brief Computes lod offset for 1D surface at specified lod.
++/// @param baseWidth - width of basemip (mip 0).
++/// @param hAlign - horizontal alignment per mip, in texels
++/// @param lod - lod index
++/// @param offset - output offset.
++INLINE void ComputeLODOffset1D(
++    const SWR_FORMAT_INFO& info,
++    uint32_t baseWidth,
++    uint32_t hAlign,
++    uint32_t lod,
++    uint32_t &offset)
++{
++    if (lod == 0)
++    {
++        offset = 0;
++    }
++    else
++    {
++        uint32_t curWidth = baseWidth;
++        // translate mip width from pixels to blocks for block compressed formats
++        // @note hAlign is already in blocks for compressed formats so no need to convert
++        if (info.isBC) curWidth /= info.bcWidth;
++
++        offset = GFX_ALIGN(curWidth, hAlign);
++        for (uint32_t l = 1; l < lod; ++l)
++        {
++            curWidth = GFX_ALIGN(std::max(curWidth >> 1, 1U), hAlign);
++            offset += curWidth;
++        }
++
++        if (info.isSubsampled)
++        {
++            offset /= info.bcWidth;
++        }
++    }
++}
++
++//////////////////////////////////////////////////////////////////////////
++/// @brief Computes x lod offset for 2D surface at specified lod.
++/// @param baseWidth - width of basemip (mip 0).
++/// @param hAlign - horizontal alignment per mip, in texels
++/// @param lod - lod index
++/// @param offset - output offset.
++INLINE void ComputeLODOffsetX(
++    const SWR_FORMAT_INFO& info,
++    uint32_t baseWidth,
++    uint32_t hAlign,
++    uint32_t lod,
++    uint32_t &offset)
++{
++    if (lod < 2)
++    {
++        offset = 0;
++    }
++    else
++    {
++        uint32_t curWidth = baseWidth;
++        // convert mip width from pixels to blocks for block compressed formats
++        // @note hAlign is already in blocks for compressed formats so no need to convert
++        if (info.isBC) curWidth /= info.bcWidth;
++
++        curWidth = std::max(curWidth >> 1, 1U);
++        curWidth = GFX_ALIGN(curWidth, hAlign);
++
++        if (info.isSubsampled)
++        {
++            curWidth /= info.bcWidth;
++        }
++
++        offset = curWidth;
++    }
++}
++
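// A worked example of the mip layout these helpers assume (illustrative
// numbers): mips are stacked vertically under the base, except mip 2, which
// sits to the right of mip 1 -- ComputeLODOffsetX above is nonzero only for
// lod >= 2, and ComputeLODOffsetY below skips mip 2's height via its
// `l != 2` special case. For baseHeight = 64 and vAlign = 4:
//
//   lod 1: yOffset = align(64,4)                    = 64
//   lod 2: yOffset = align(64,4) + 0                = 64   (placed beside mip 1)
//   lod 3: yOffset = align(64,4) + 0 + align(16,4)  = 80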
++//////////////////////////////////////////////////////////////////////////
++/// @brief Computes y lod offset for 2D surface at specified lod.
++/// @param baseHeight - height of basemip (mip 0).
++/// @param vAlign - vertical alignment per mip, in rows
++/// @param lod - lod index
++/// @param offset - output offset.
++INLINE void ComputeLODOffsetY(
++    const SWR_FORMAT_INFO& info,
++    uint32_t baseHeight,
++    uint32_t vAlign,
++    uint32_t lod,
++    uint32_t &offset)
++{
++    if (lod == 0)
++    {
++        offset = 0;
++    }
++    else
++    {
++        offset = 0;
++        uint32_t mipHeight = baseHeight;
++
++        // translate mip height from pixels to blocks for block compressed formats
++        // @note VAlign is already in blocks for compressed formats so no need to convert
++        if (info.isBC) mipHeight /= info.bcHeight;
++
++        for (uint32_t l = 1; l <= lod; ++l)
++        {
++            uint32_t alignedMipHeight = GFX_ALIGN(mipHeight, vAlign);
++            offset += ((l != 2) ? alignedMipHeight : 0);
++            mipHeight = std::max(mipHeight >> 1, 1U);
++        }
++    }
++}
++
++//////////////////////////////////////////////////////////////////////////
++/// @brief Computes 1D surface offset
++/// @param x - offset from start of array slice at given lod.
++/// @param array - array slice index
++/// @param lod - lod index
++/// @param pState - surface state
++/// @param xOffsetBytes - output offset in bytes.
++template <bool UseCachedOffsets>
++INLINE void ComputeSurfaceOffset1D(
++    uint32_t x,
++    uint32_t array,
++    uint32_t lod,
++    const SWR_SURFACE_STATE *pState,
++    uint32_t &xOffsetBytes)
++{
++    const SWR_FORMAT_INFO &info = GetFormatInfo(pState->format);
++    uint32_t lodOffset;
++
++    if (UseCachedOffsets)
++    {
++        lodOffset = pState->lodOffsets[0][lod];
++    }
++    else
++    {
++        ComputeLODOffset1D(info, pState->width, pState->halign, lod, lodOffset);
++    }
++
++    xOffsetBytes = (array * pState->qpitch + lodOffset + x) * info.Bpp;
++}
++
++//////////////////////////////////////////////////////////////////////////
++/// @brief Adjusts the array slice for legacy TileY MSAA
++/// @param pState - surface state
++/// @param arrayIndex - array slice index
++/// @param sampleNum - requested sample
++INLINE uint32_t AdjustArrayIndexForMSAA(const SWR_SURFACE_STATE *pState, uint32_t arrayIndex, uint32_t sampleNum)
++{
++    uint32_t sampleSlice;
++    /// @todo: might want to templatize adjusting for sample slices when we support tileYS/tileYF.
++    if(pState->tileMode == SWR_TILE_MODE_YMAJOR ||
++       pState->tileMode == SWR_TILE_NONE)
++    {
++        uint32_t sampleShift;
++        switch(pState->numSamples)
++        {
++        case 1:
++            assert(sampleNum == 0);
++            sampleShift = 0;
++            break;
++        case 2:
++            assert(pState->type == SURFACE_2D);
++            sampleShift = 1;
++            break;
++        case 4:
++            assert(pState->type == SURFACE_2D);
++            sampleShift = 2;
++            break;
++        case 8:
++            assert(pState->type == SURFACE_2D);
++            sampleShift = 3;
++            break;
++        case 16:
++            assert(pState->type == SURFACE_2D);
++            sampleShift = 4;
++            break;
++        default:
++            assert(0 && "Unsupported sample count");
++            sampleShift = 0;
++            break;
++        }
++        sampleSlice = (arrayIndex << sampleShift) | sampleNum;
++    }
++    else
++    {
++        sampleSlice = arrayIndex;
++    }
++    return sampleSlice;
++}
++
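// The packing above, restated as a sketch (helper name is ours, for
// illustration only): for TileY and linear MSAA surfaces each array slice
// fans out into numSamples consecutive slices, so with 4x MSAA, array slice 2
// / sample 1 lands on packed slice (2 << 2) | 1 = 9. ComputeSurfaceOffset2D
// below then treats the packed value as an ordinary array index.
static inline uint32_t ExamplePackedSampleSlice(uint32_t arrayIndex,
                                                uint32_t log2Samples,
                                                uint32_t sampleNum)
{
    return (arrayIndex << log2Samples) | sampleNum;
}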
++//////////////////////////////////////////////////////////////////////////
++/// @brief Computes 2D surface offset
++/// @param x - horizontal offset from start of array slice and lod.
++/// @param y - vertical offset from start of array slice and lod.
++/// @param array - array slice index
++/// @param sampleNum - requested sample index
++/// @param lod - lod index
++/// @param pState - surface state
++/// @param xOffsetBytes - output x offset in bytes.
++/// @param yOffsetRows - output y offset in rows.
++template <bool UseCachedOffsets>
++INLINE void ComputeSurfaceOffset2D(uint32_t x, uint32_t y, uint32_t array, uint32_t sampleNum, uint32_t lod, const SWR_SURFACE_STATE *pState, uint32_t &xOffsetBytes, uint32_t &yOffsetRows)
++{
++    const SWR_FORMAT_INFO &info = GetFormatInfo(pState->format);
++    uint32_t lodOffsetX, lodOffsetY;
++
++    if (UseCachedOffsets)
++    {
++        lodOffsetX = pState->lodOffsets[0][lod];
++        lodOffsetY = pState->lodOffsets[1][lod];
++    }
++    else
++    {
++        ComputeLODOffsetX(info, pState->width, pState->halign, lod, lodOffsetX);
++        ComputeLODOffsetY(info, pState->height, pState->valign, lod, lodOffsetY);
++    }
++
++    uint32_t arrayIndex = AdjustArrayIndexForMSAA(pState, array, sampleNum);
++    xOffsetBytes = (x + lodOffsetX) * info.Bpp;
++    yOffsetRows = (arrayIndex * pState->qpitch) + lodOffsetY + y;
++}
++
++//////////////////////////////////////////////////////////////////////////
++/// @brief Computes 3D surface offset
++/// @param x - horizontal offset from start of array slice and lod.
++/// @param y - vertical offset from start of array slice and lod.
++/// @param z - depth offset from start of array slice and lod.
++/// @param lod - lod index
++/// @param pState - surface state
++/// @param xOffsetBytes - output x offset in bytes.
++/// @param yOffsetRows - output y offset in rows.
++/// @param zOffsetSlices - output z offset in slices.
++template <bool UseCachedOffsets>
++INLINE void ComputeSurfaceOffset3D(uint32_t x, uint32_t y, uint32_t z, uint32_t lod, const SWR_SURFACE_STATE *pState, uint32_t &xOffsetBytes, uint32_t &yOffsetRows, uint32_t &zOffsetSlices)
++{
++    const SWR_FORMAT_INFO &info = GetFormatInfo(pState->format);
++    uint32_t lodOffsetX, lodOffsetY;
++
++    if (UseCachedOffsets)
++    {
++        lodOffsetX = pState->lodOffsets[0][lod];
++        lodOffsetY = pState->lodOffsets[1][lod];
++    }
++    else
++    {
++        ComputeLODOffsetX(info, pState->width, pState->halign, lod, lodOffsetX);
++        ComputeLODOffsetY(info, pState->height, pState->valign, lod, lodOffsetY);
++    }
++
++    xOffsetBytes = (x + lodOffsetX) * info.Bpp;
++    yOffsetRows = lodOffsetY + y;
++    zOffsetSlices = z;
++}
++
++//////////////////////////////////////////////////////////////////////////
++/// @brief Swizzles the linear x,y offsets depending on surface tiling mode
++/// and returns final surface address
++/// @param xOffsetBytes - x offset from base of surface in bytes
++/// @param yOffsetRows - y offset from base of surface in rows
++/// @param pState - pointer to the surface state
++template<typename TTraits>
++INLINE uint32_t ComputeTileSwizzle2D(uint32_t xOffsetBytes, uint32_t yOffsetRows, const SWR_SURFACE_STATE *pState)
++{
++    return ComputeOffset2D<TTraits>(pState->pitch, xOffsetBytes, yOffsetRows);
++}
++
++//////////////////////////////////////////////////////////////////////////
++/// @brief Swizzles the linear x,y,z offsets depending on surface tiling mode
++/// and returns final surface address
++/// @param xOffsetBytes - x offset from base of surface in bytes
++/// @param yOffsetRows - y offset from base of surface in rows
++/// @param zOffsetSlices - z offset from base of surface in slices
++/// @param pState - pointer to the surface state
++template<typename TTraits>
++INLINE uint32_t ComputeTileSwizzle3D(uint32_t xOffsetBytes, uint32_t yOffsetRows, uint32_t zOffsetSlices, const SWR_SURFACE_STATE *pState)
++{
++    return ComputeOffset3D<TTraits>(pState->qpitch, pState->pitch, xOffsetBytes, yOffsetRows, zOffsetSlices);
++}
++
++//////////////////////////////////////////////////////////////////////////
++/// @brief Swizzles the linear x,y offsets depending on surface tiling mode
++/// and returns final surface address
++/// @param xOffsetBytes - x offset from base of surface in bytes
++/// @param yOffsetRows - y offset from base of surface in rows ++/// @param pState - pointer to the surface state ++INLINE ++uint32_t TileSwizzle2D(uint32_t xOffsetBytes, uint32_t yOffsetRows, const SWR_SURFACE_STATE *pState) ++{ ++ switch (pState->tileMode) ++ { ++ case SWR_TILE_NONE: return ComputeTileSwizzle2D >(xOffsetBytes, yOffsetRows, pState); ++ case SWR_TILE_SWRZ: return ComputeTileSwizzle2D >(xOffsetBytes, yOffsetRows, pState); ++ case SWR_TILE_MODE_XMAJOR: return ComputeTileSwizzle2D >(xOffsetBytes, yOffsetRows, pState); ++ case SWR_TILE_MODE_YMAJOR: return ComputeTileSwizzle2D >(xOffsetBytes, yOffsetRows, pState); ++ case SWR_TILE_MODE_WMAJOR: return ComputeTileSwizzle2D >(xOffsetBytes, yOffsetRows, pState); ++ default: SWR_ASSERT(0, "Unsupported tiling mode"); ++ } ++ return (uint32_t) NULL; ++} ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief Swizzles the linear x,y,z offsets depending on surface tiling mode ++/// and returns final surface address ++/// @param xOffsetBytes - x offset from base of surface in bytes ++/// @param yOffsetRows - y offset from base of surface in rows ++/// @param zOffsetSlices - z offset from base of surface in slices ++/// @param pState - pointer to the surface state ++INLINE ++uint32_t TileSwizzle3D(uint32_t xOffsetBytes, uint32_t yOffsetRows, uint32_t zOffsetSlices, const SWR_SURFACE_STATE *pState) ++{ ++ switch (pState->tileMode) ++ { ++ case SWR_TILE_NONE: return ComputeTileSwizzle3D >(xOffsetBytes, yOffsetRows, zOffsetSlices, pState); ++ case SWR_TILE_SWRZ: return ComputeTileSwizzle3D >(xOffsetBytes, yOffsetRows, zOffsetSlices, pState); ++ case SWR_TILE_MODE_YMAJOR: return ComputeTileSwizzle3D >(xOffsetBytes, yOffsetRows, zOffsetSlices, pState); ++ default: SWR_ASSERT(0, "Unsupported tiling mode"); ++ } ++ return (uint32_t) NULL; ++} ++ ++template ++INLINE ++uint32_t ComputeSurfaceOffset(uint32_t x, uint32_t y, uint32_t z, uint32_t array, uint32_t sampleNum, uint32_t lod, const SWR_SURFACE_STATE *pState) ++{ ++ uint32_t offsetX = 0, offsetY = 0, offsetZ = 0; ++ switch (pState->type) ++ { ++ case SURFACE_BUFFER: ++ case SURFACE_STRUCTURED_BUFFER: ++ offsetX = x * pState->pitch; ++ return offsetX; ++ break; ++ case SURFACE_1D: ++ ComputeSurfaceOffset1D(x, array, lod, pState, offsetX); ++ return TileSwizzle2D(offsetX, 0, pState); ++ break; ++ case SURFACE_2D: ++ ComputeSurfaceOffset2D(x, y, array, sampleNum, lod, pState, offsetX, offsetY); ++ return TileSwizzle2D(offsetX, offsetY, pState); ++ case SURFACE_3D: ++ ComputeSurfaceOffset3D(x, y, z, lod, pState, offsetX, offsetY, offsetZ); ++ return TileSwizzle3D(offsetX, offsetY, offsetZ, pState); ++ break; ++ case SURFACE_CUBE: ++ ComputeSurfaceOffset2D(x, y, array, sampleNum, lod, pState, offsetX, offsetY); ++ return TileSwizzle2D(offsetX, offsetY, pState); ++ break; ++ default: SWR_ASSERT(0, "Unsupported format"); ++ } ++ ++ return (uint32_t) NULL; ++} ++ ++////////////////////////////////////////////////////////////////////////// ++/// @brief Computes surface address at the given location and lod ++/// @param x - x location in pixels ++/// @param y - y location in rows ++/// @param z - z location for 3D surfaces ++/// @param array - array slice for 1D and 2D surfaces ++/// @param lod - level of detail ++/// @param pState - pointer to the surface state ++template ++INLINE ++void* ComputeSurfaceAddress(uint32_t x, uint32_t y, uint32_t z, uint32_t array, uint32_t sampleNum, uint32_t lod, const SWR_SURFACE_STATE *pState) ++{ ++ return pState->pBaseAddress 
+ ComputeSurfaceOffset(x, y, z, array, sampleNum, lod, pState); ++} +diff --git a/src/gallium/drivers/swr/rasterizer/memory/tilingtraits.h b/src/gallium/drivers/swr/rasterizer/memory/tilingtraits.h +new file mode 100644 +index 0000000..9dd4cd2 +--- /dev/null ++++ b/src/gallium/drivers/swr/rasterizer/memory/tilingtraits.h +@@ -0,0 +1,239 @@ ++/**************************************************************************** ++* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. ++* ++* Permission is hereby granted, free of charge, to any person obtaining a ++* copy of this software and associated documentation files (the "Software"), ++* to deal in the Software without restriction, including without limitation ++* the rights to use, copy, modify, merge, publish, distribute, sublicense, ++* and/or sell copies of the Software, and to permit persons to whom the ++* Software is furnished to do so, subject to the following conditions: ++* ++* The above copyright notice and this permission notice (including the next ++* paragraph) shall be included in all copies or substantial portions of the ++* Software. ++* ++* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++* IN THE SOFTWARE. ++* ++* @file tilingtraits.h ++* ++* @brief Tiling traits. ++* ++******************************************************************************/ ++#pragma once ++ ++#include "core/state.h" ++ ++template ++struct TilingTraits ++{ ++ static const SWR_TILE_MODE TileMode{ mode }; ++ static UINT GetCu() { SWR_ASSERT(0); return 0; } ++ static UINT GetCv() { SWR_ASSERT(0); return 0; } ++ static UINT GetCr() { SWR_ASSERT(0); return 0; } ++ static UINT GetTileIDShift() { SWR_ASSERT(0); return 0; } ++ ++ /// @todo correct pdep shifts for all rastertile dims. Unused for now ++ static UINT GetPdepX() { SWR_ASSERT(0); return 0x37; } ++ static UINT GetPdepY() { SWR_ASSERT(0); return 0xC8; } ++}; ++ ++template struct TilingTraits ++{ ++ static const SWR_TILE_MODE TileMode{ SWR_TILE_NONE }; ++ static UINT GetCu() { return 0; } ++ static UINT GetCv() { return 0; } ++ static UINT GetCr() { return 0; } ++ static UINT GetTileIDShift() { return 0; } ++ static UINT GetPdepX() { return 0x00; } ++ static UINT GetPdepY() { return 0x00; } ++}; ++ ++template<> struct TilingTraits ++{ ++ static const SWR_TILE_MODE TileMode{ SWR_TILE_SWRZ }; ++ static UINT GetCu() { return KNOB_TILE_X_DIM_SHIFT; } ++ static UINT GetCv() { return KNOB_TILE_Y_DIM_SHIFT; } ++ static UINT GetCr() { return 0; } ++ static UINT GetTileIDShift() { return KNOB_TILE_X_DIM_SHIFT + KNOB_TILE_Y_DIM_SHIFT; } ++ ++ /// @todo correct pdep shifts for all rastertile dims. 
Unused for now
++    static UINT GetPdepX() { SWR_ASSERT(0); return 0x00; }
++    static UINT GetPdepY() { SWR_ASSERT(0); return 0x00; }
++};
++
++template<> struct TilingTraits<SWR_TILE_SWRZ, 32>
++{
++    static const SWR_TILE_MODE TileMode{ SWR_TILE_SWRZ };
++    static UINT GetCu() { return KNOB_TILE_X_DIM_SHIFT + 2; }
++    static UINT GetCv() { return KNOB_TILE_Y_DIM_SHIFT; }
++    static UINT GetCr() { return 0; }
++    static UINT GetTileIDShift() { return KNOB_TILE_X_DIM_SHIFT + KNOB_TILE_Y_DIM_SHIFT + 2; }
++
++    static UINT GetPdepX() { return 0x37; }
++    static UINT GetPdepY() { return 0xC8; }
++};
++
++template<> struct TilingTraits<SWR_TILE_SWRZ, 128>
++{
++    static const SWR_TILE_MODE TileMode{ SWR_TILE_SWRZ };
++    static UINT GetCu() { return KNOB_TILE_X_DIM_SHIFT + 4; }
++    static UINT GetCv() { return KNOB_TILE_Y_DIM_SHIFT; }
++    static UINT GetCr() { return 0; }
++    static UINT GetTileIDShift() { return KNOB_TILE_X_DIM_SHIFT + KNOB_TILE_Y_DIM_SHIFT + 4; }
++
++    /// @todo correct pdep shifts for all rastertile dims. Unused for now
++    static UINT GetPdepX() { SWR_ASSERT(0); return 0x37; }
++    static UINT GetPdepY() { SWR_ASSERT(0); return 0xC8; }
++};
++
++// y-major tiling layout unaffected by element size
++template<int ElementSizeT> struct TilingTraits<SWR_TILE_MODE_YMAJOR, ElementSizeT>
++{
++    static const SWR_TILE_MODE TileMode{ SWR_TILE_MODE_YMAJOR };
++    static UINT GetCu() { return 7; }
++    static UINT GetCv() { return 5; }
++    static UINT GetCr() { return 0; }
++    static UINT GetTileIDShift() { return 12; }
++
++    static UINT GetPdepX() { return 0xe0f; }
++    static UINT GetPdepY() { return 0x1f0; }
++};
++
++// x-major tiling layout unaffected by element size
++template<int ElementSizeT> struct TilingTraits<SWR_TILE_MODE_XMAJOR, ElementSizeT>
++{
++    static const SWR_TILE_MODE TileMode{ SWR_TILE_MODE_XMAJOR };
++    static UINT GetCu() { return 9; }
++    static UINT GetCv() { return 3; }
++    static UINT GetCr() { return 0; }
++    static UINT GetTileIDShift() { return 12; }
++
++    static UINT GetPdepX() { return 0x1ff; }
++    static UINT GetPdepY() { return 0xe00; }
++};
++
++template<int ElementSizeT> struct TilingTraits<SWR_TILE_MODE_WMAJOR, ElementSizeT>
++{
++    static const SWR_TILE_MODE TileMode{ SWR_TILE_MODE_WMAJOR };
++    static UINT GetCu() { return 6; }
++    static UINT GetCv() { return 6; }
++    static UINT GetCr() { return 0; }
++    static UINT GetTileIDShift() { return 12; }
++
++    static UINT GetPdepX() { return 0xe15; }
++    static UINT GetPdepY() { return 0x1ea; }
++};
++
++INLINE
++UINT pdep_u32(UINT a, UINT mask)
++{
++#if KNOB_ARCH==KNOB_ARCH_AVX2
++    return _pdep_u32(a, mask);
++#else
++    UINT result = 0;
++
++    // copied from http://wm.ite.pl/articles/pdep-soft-emu.html
++    // using bsf instead of funky loop
++    DWORD maskIndex;
++    while (_BitScanForward(&maskIndex, mask))
++    {
++        // 1. isolate lowest set bit of mask
++        const UINT lowest = 1 << maskIndex;
++
++        // 2. populate LSB from src
++        const UINT LSB = (UINT)((int)(a << 31) >> 31);
++
++        // 3. copy bit from mask
++        result |= LSB & lowest;
++
++        // 4. clear lowest bit
++        mask &= ~lowest;
++
++        // 5. prepare for next iteration
++        a >>= 1;
++    }
++
++    return result;
++#endif
++}
++
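// The deposit operation traced once, for reference (illustrative): pdep_u32
// scatters the low bits of `a` into the set bit positions of `mask`, lowest
// first. With the TileY x-mask 0xe0f (bits 0-3 and 9-11) and a = 0x35
// (0b0110101):
//
//   bit0 <- 1, bit1 <- 0, bit2 <- 1, bit3 <- 0, bit9 <- 1, bit10 <- 1, bit11 <- 0
//   pdep_u32(0x35, 0xe0f) == 0x605
//
// In ComputeOffset2D below this interleaves the low x/y bits into the
// intra-tile swizzle, while the high bits select the 4KB tile via the tileID.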
++//////////////////////////////////////////////////////////////////////////
++/// @brief Computes the tileID for 2D tiled surfaces
++/// @param pitch - surface pitch in bytes
++/// @param tileX - x offset in tiles
++/// @param tileY - y offset in tiles
++template<typename TTraits>
++INLINE UINT ComputeTileOffset2D(UINT pitch, UINT tileX, UINT tileY)
++{
++    UINT tileID = tileY * (pitch >> TTraits::GetCu()) + tileX;
++    return tileID << TTraits::GetTileIDShift();
++}
++
++//////////////////////////////////////////////////////////////////////////
++/// @brief Computes the tileID for 3D tiled surfaces
++/// @param qpitch - surface qpitch in rows
++/// @param pitch - surface pitch in bytes
++/// @param tileX - x offset in tiles
++/// @param tileY - y offset in tiles
++/// @param tileZ - z offset in tiles
++template<typename TTraits>
++INLINE UINT ComputeTileOffset3D(UINT qpitch, UINT pitch, UINT tileX, UINT tileY, UINT tileZ)
++{
++    UINT tileID = (tileZ * (qpitch >> TTraits::GetCv()) + tileY) * (pitch >> TTraits::GetCu()) + tileX;
++    return tileID << TTraits::GetTileIDShift();
++}
++
++//////////////////////////////////////////////////////////////////////////
++/// @brief Computes the byte offset for 2D tiled surfaces
++/// @param pitch - surface pitch in bytes
++/// @param x - x offset in bytes
++/// @param y - y offset in rows
++template<typename TTraits>
++INLINE UINT ComputeOffset2D(UINT pitch, UINT x, UINT y)
++{
++    UINT tileID = ComputeTileOffset2D<TTraits>(pitch, x >> TTraits::GetCu(), y >> TTraits::GetCv());
++    UINT xSwizzle = pdep_u32(x, TTraits::GetPdepX());
++    UINT ySwizzle = pdep_u32(y, TTraits::GetPdepY());
++    return (tileID | xSwizzle | ySwizzle);
++}
++
++#if KNOB_ARCH <= KNOB_ARCH_AVX
++//////////////////////////////////////////////////////////////////////////
++/// @brief Computes the byte offset for 2D tiled surfaces. Specialization
++/// for tile-y surfaces that uses bit twiddling instead of pdep emulation.
++/// @param pitch - surface pitch in bytes
++/// @param x - x offset in bytes
++/// @param y - y offset in rows
++template<>
++INLINE UINT ComputeOffset2D<TilingTraits<SWR_TILE_MODE_YMAJOR, 32> >(UINT pitch, UINT x, UINT y)
++{
++    typedef TilingTraits<SWR_TILE_MODE_YMAJOR, 32> TTraits;
++
++    UINT tileID = ComputeTileOffset2D<TTraits>(pitch, x >> TTraits::GetCu(), y >> TTraits::GetCv());
++    UINT xSwizzle = ((x << 5) & 0xe00) | (x & 0xf);
++    UINT ySwizzle = (y << 4) & 0x1f0;
++    return (tileID | xSwizzle | ySwizzle);
++}
++#endif
++
++//////////////////////////////////////////////////////////////////////////
++/// @brief Computes the byte offset for 3D tiled surfaces
++/// @param qpitch - depth pitch in rows
++/// @param pitch - surface pitch in bytes
++/// @param x - x offset in bytes
++/// @param y - y offset in rows
++/// @param z - z offset in slices
++template<typename TTraits>
++INLINE UINT ComputeOffset3D(UINT qpitch, UINT pitch, UINT x, UINT y, UINT z)
++{
++    UINT tileID = ComputeTileOffset3D<TTraits>(qpitch, pitch, x >> TTraits::GetCu(), y >> TTraits::GetCv(), z >> TTraits::GetCr());
++    UINT xSwizzle = pdep_u32(x, TTraits::GetPdepX());
++    UINT ySwizzle = pdep_u32(y, TTraits::GetPdepY());
++    return (tileID | xSwizzle | ySwizzle);
++}
+diff --git a/src/gallium/drivers/swr/rasterizer/scripts/gen_knobs.py b/src/gallium/drivers/swr/rasterizer/scripts/gen_knobs.py
+new file mode 100644
+index 0000000..a6aa81b
+--- /dev/null
++++ b/src/gallium/drivers/swr/rasterizer/scripts/gen_knobs.py
+@@ -0,0 +1,79 @@
++# Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
++# ++# Permission is hereby granted, free of charge, to any person obtaining a ++# copy of this software and associated documentation files (the "Software"), ++# to deal in the Software without restriction, including without limitation ++# the rights to use, copy, modify, merge, publish, distribute, sublicense, ++# and/or sell copies of the Software, and to permit persons to whom the ++# Software is furnished to do so, subject to the following conditions: ++# ++# The above copyright notice and this permission notice (including the next ++# paragraph) shall be included in all copies or substantial portions of the ++# Software. ++# ++# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++# IN THE SOFTWARE. ++ ++# Python source ++from __future__ import print_function ++import os ++import sys ++import knob_defs ++from mako.template import Template ++from mako.exceptions import RichTraceback ++ ++def write_template_to_string(template_filename, **kwargs): ++ try: ++ template = Template(filename=template_filename) ++ # Split + Join fixes line-endings for whatever platform you are using ++ return '\n'.join(template.render(**kwargs).splitlines()) ++ except: ++ traceback = RichTraceback() ++ for (filename, lineno, function, line) in traceback.traceback: ++ print("File %s, line %s, in %s" % (filename, lineno, function)) ++ print(line, "\n") ++ print("%s: %s" % (str(traceback.error.__class__.__name__), traceback.error)) ++ ++def write_template_to_file(template_filename, output_filename, **kwargs): ++ with open(output_filename, "w") as outfile: ++ print(write_template_to_string(template_filename, **kwargs), file=outfile) ++ ++def main(args=sys.argv[1:]): ++ if len(args) != 1: ++ print('Usage:', sys.argv[0], '', file=sys.stderr) ++ return 1 ++ ++ output_dir = args[0] ++ if not os.path.isdir(output_dir): ++ if os.path.exists(output_dir): ++ print('ERROR: Invalid output directory:', output_dir, file=sys.stderr) ++ return 1 ++ ++ try: ++ os.makedirs(output_dir) ++ except: ++ print('ERROR: Could not create output directory:', output_dir, file=sys.stderr) ++ return 1 ++ ++ # Output path exists, now just run the template ++ template_file = os.sep.join([sys.path[0], 'templates', 'knobs.template']) ++ output_file = os.sep.join([output_dir, 'gen_knobs.cpp']) ++ output_header = os.sep.join([output_dir, 'gen_knobs.h']) ++ ++ for f in [output_header, output_file]: ++ write_template_to_file(template_file, f, ++ filename='gen_knobs', ++ knobs=knob_defs.KNOBS, ++ includes=['core/knobs_init.h'], ++ gen_header=True if f == output_header else False) ++ ++ return 0 ++ ++if __name__ == '__main__': ++ sys.exit(main()) ++ +diff --git a/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py b/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py +new file mode 100644 +index 0000000..0a64953 +--- /dev/null ++++ b/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py +@@ -0,0 +1,212 @@ ++# Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
++# ++# Permission is hereby granted, free of charge, to any person obtaining a ++# copy of this software and associated documentation files (the "Software"), ++# to deal in the Software without restriction, including without limitation ++# the rights to use, copy, modify, merge, publish, distribute, sublicense, ++# and/or sell copies of the Software, and to permit persons to whom the ++# Software is furnished to do so, subject to the following conditions: ++# ++# The above copyright notice and this permission notice (including the next ++# paragraph) shall be included in all copies or substantial portions of the ++# Software. ++# ++# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++# IN THE SOFTWARE. ++ ++# Python source ++KNOBS = [ ++ ['ENABLE_ASSERT_DIALOGS', { ++ 'type' : 'bool', ++ 'default' : 'true', ++ 'desc' : ['Use dialogs when asserts fire.', ++ 'Asserts are only enabled in debug builds'], ++ }], ++ ++ ['USE_GENERIC_STORETILE', { ++ 'type' : 'bool', ++ 'default' : 'false', ++ 'desc' : ['Always use generic function for performing StoreTile.', ++ 'Will be slightly slower than using optimized (jitted) path'], ++ }], ++ ++ ['SINGLE_THREADED', { ++ 'type' : 'bool', ++ 'default' : 'false', ++ 'desc' : ['If enabled will perform all rendering on the API thread.', ++ 'This is useful mainly for debugging purposes.'], ++ }], ++ ++ ['FAST_CLEAR', { ++ 'type' : 'bool', ++ 'default' : 'true', ++ 'desc' : ['Replace 3D primitive execute with a SWRClearRT operation and', ++ 'defer clear execution to first backend op on hottile, or hottile store'], ++ }], ++ ++ ['MAX_NUMA_NODES', { ++ 'type' : 'uint32_t', ++ 'default' : '0', ++ 'desc' : ['Maximum # of NUMA-nodes per system used for worker threads', ++ ' 0 == ALL NUMA-nodes in the system', ++ ' N == Use at most N NUMA-nodes for rendering'], ++ }], ++ ++ ['MAX_CORES_PER_NUMA_NODE', { ++ 'type' : 'uint32_t', ++ 'default' : '0', ++ 'desc' : ['Maximum # of cores per NUMA-node used for worker threads.', ++ ' 0 == ALL non-API thread cores per NUMA-node', ++ ' N == Use at most N cores per NUMA-node'], ++ }], ++ ++ ['MAX_THREADS_PER_CORE', { ++ 'type' : 'uint32_t', ++ 'default' : '1', ++ 'desc' : ['Maximum # of (hyper)threads per physical core used for worker threads.', ++ ' 0 == ALL hyper-threads per core', ++ ' N == Use at most N hyper-threads per physical core'], ++ }], ++ ++ ['BUCKETS_START_FRAME', { ++ 'type' : 'uint32_t', ++ 'default' : '1200', ++ 'desc' : ['Frame from when to start saving buckets data.', ++ '', ++ 'NOTE: KNOB_ENABLE_RDTSC must be enabled in core/knobs.h', ++ 'for this to have an effect.'], ++ }], ++ ++ ['BUCKETS_END_FRAME', { ++ 'type' : 'uint32_t', ++ 'default' : '1400', ++ 'desc' : ['Frame at which to stop saving buckets data.', ++ '', ++ 'NOTE: KNOB_ENABLE_RDTSC must be enabled in core/knobs.h', ++ 'for this to have an effect.'], ++ }], ++ ++ ['TOSS_DRAW', { ++ 'type' : 'bool', ++ 'default' : 'false', ++ 'desc' : ['Disable per-draw execution', ++ '', ++ 'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'], ++ }], ++ ++ ['TOSS_QUEUE_FE', { ++ 'type' : 'bool', ++ 'default' : 'false', ++ 'desc' : ['Stop 
per-draw execution at worker FE', ++ '', ++ 'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'], ++ }], ++ ++ ['TOSS_FETCH', { ++ 'type' : 'bool', ++ 'default' : 'false', ++ 'desc' : ['Stop per-draw execution at vertex fetch', ++ '', ++ 'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'], ++ }], ++ ++ ['TOSS_IA', { ++ 'type' : 'bool', ++ 'default' : 'false', ++ 'desc' : ['Stop per-draw execution at input assembler', ++ '', ++ 'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'], ++ }], ++ ++ ['TOSS_VS', { ++ 'type' : 'bool', ++ 'default' : 'false', ++ 'desc' : ['Stop per-draw execution at vertex shader', ++ '', ++ 'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'], ++ }], ++ ++ ['TOSS_SETUP_TRIS', { ++ 'type' : 'bool', ++ 'default' : 'false', ++ 'desc' : ['Stop per-draw execution at primitive setup', ++ '', ++ 'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'], ++ }], ++ ++ ['TOSS_BIN_TRIS', { ++ 'type' : 'bool', ++ 'default' : 'false', ++ 'desc' : ['Stop per-draw execution at primitive binning', ++ '', ++ 'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'], ++ }], ++ ++ ['TOSS_RS', { ++ 'type' : 'bool', ++ 'default' : 'false', ++ 'desc' : ['Stop per-draw execution at rasterizer', ++ '', ++ 'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'], ++ }], ++ ++ ['WORKER_SPIN_LOOP_COUNT', { ++ 'type' : 'uint32_t', ++ 'default' : '5000', ++ 'desc' : ['Number of spin-loop iterations worker threads will perform', ++ 'before going to sleep when waiting for work'], ++ }], ++ ++ ['MAX_DRAWS_IN_FLIGHT', { ++ 'type' : 'uint32_t', ++ 'default' : '160', ++ 'desc' : ['Maximum number of draws outstanding before API thread blocks.'], ++ }], ++ ++ ['MAX_PRIMS_PER_DRAW', { ++ 'type' : 'uint32_t', ++ 'default' : '2040', ++ 'desc' : ['Maximum primitives in a single Draw().', ++ 'Larger primitives are split into smaller Draw calls.', ++ 'Should be a multiple of (3 * vectorWidth).'], ++ }], ++ ++ ['MAX_TESS_PRIMS_PER_DRAW', { ++ 'type' : 'uint32_t', ++ 'default' : '16', ++ 'desc' : ['Maximum primitives in a single Draw() with tessellation enabled.', ++ 'Larger primitives are split into smaller Draw calls.', ++ 'Should be a multiple of (vectorWidth).'], ++ }], ++ ++ ['MAX_FRAC_ODD_TESS_FACTOR', { ++ 'type' : 'float', ++ 'default' : '63.0f', ++ 'desc' : ['(DEBUG) Maximum tessellation factor for fractional-odd partitioning.'], ++ }], ++ ++ ['MAX_FRAC_EVEN_TESS_FACTOR', { ++ 'type' : 'float', ++ 'default' : '64.0f', ++ 'desc' : ['(DEBUG) Maximum tessellation factor for fractional-even partitioning.'], ++ }], ++ ++ ['MAX_INTEGER_TESS_FACTOR', { ++ 'type' : 'uint32_t', ++ 'default' : '64', ++ 'desc' : ['(DEBUG) Maximum tessellation factor for integer partitioning.'], ++ }], ++ ++ ['DUMP_SHADER_IR', { ++ 'type' : 'bool', ++ 'default' : 'false', ++ 'desc' : ['Dumps shader LLVM IR at various stages of jit compilation.'], ++ }], ++ ++ ++] +diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/__init__.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/__init__.py +new file mode 100644 +index 0000000..d963848 +--- /dev/null ++++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/__init__.py +@@ -0,0 +1,8 @@ ++# mako/__init__.py ++# Copyright (C) 2006-2015 the Mako authors and contributors ++# ++# This module is part of Mako and is released under ++# the MIT License: http://www.opensource.org/licenses/mit-license.php ++ ++ ++__version__ = '1.0.1' +diff --git 
a/src/gallium/drivers/swr/rasterizer/scripts/mako/_ast_util.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/_ast_util.py +new file mode 100644 +index 0000000..efbc4fc +--- /dev/null ++++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/_ast_util.py +@@ -0,0 +1,845 @@ ++# mako/_ast_util.py ++# Copyright (C) 2006-2015 the Mako authors and contributors ++# ++# This module is part of Mako and is released under ++# the MIT License: http://www.opensource.org/licenses/mit-license.php ++ ++""" ++ ast ++ ~~~ ++ ++ The `ast` module helps Python applications to process trees of the Python ++ abstract syntax grammar. The abstract syntax itself might change with ++ each Python release; this module helps to find out programmatically what ++ the current grammar looks like and allows modifications of it. ++ ++ An abstract syntax tree can be generated by passing `ast.PyCF_ONLY_AST` as ++ a flag to the `compile()` builtin function or by using the `parse()` ++ function from this module. The result will be a tree of objects whose ++ classes all inherit from `ast.AST`. ++ ++ A modified abstract syntax tree can be compiled into a Python code object ++ using the built-in `compile()` function. ++ ++ Additionally various helper functions are provided that make working with ++ the trees simpler. The main intention of the helper functions and this ++ module in general is to provide an easy to use interface for libraries ++ that work tightly with the python syntax (template engines for example). ++ ++ ++ :copyright: Copyright 2008 by Armin Ronacher. ++ :license: Python License. ++""" ++from _ast import * ++from mako.compat import arg_stringname ++ ++BOOLOP_SYMBOLS = { ++ And: 'and', ++ Or: 'or' ++} ++ ++BINOP_SYMBOLS = { ++ Add: '+', ++ Sub: '-', ++ Mult: '*', ++ Div: '/', ++ FloorDiv: '//', ++ Mod: '%', ++ LShift: '<<', ++ RShift: '>>', ++ BitOr: '|', ++ BitAnd: '&', ++ BitXor: '^' ++} ++ ++CMPOP_SYMBOLS = { ++ Eq: '==', ++ Gt: '>', ++ GtE: '>=', ++ In: 'in', ++ Is: 'is', ++ IsNot: 'is not', ++ Lt: '<', ++ LtE: '<=', ++ NotEq: '!=', ++ NotIn: 'not in' ++} ++ ++UNARYOP_SYMBOLS = { ++ Invert: '~', ++ Not: 'not', ++ UAdd: '+', ++ USub: '-' ++} ++ ++ALL_SYMBOLS = {} ++ALL_SYMBOLS.update(BOOLOP_SYMBOLS) ++ALL_SYMBOLS.update(BINOP_SYMBOLS) ++ALL_SYMBOLS.update(CMPOP_SYMBOLS) ++ALL_SYMBOLS.update(UNARYOP_SYMBOLS) ++ ++ ++def parse(expr, filename='', mode='exec'): ++ """Parse an expression into an AST node.""" ++ return compile(expr, filename, mode, PyCF_ONLY_AST) ++ ++ ++def to_source(node, indent_with=' ' * 4): ++ """ ++ This function can convert a node tree back into python sourcecode. This ++ is useful for debugging purposes, especially if you're dealing with custom ++ asts not generated by python itself. ++ ++ It could be that the sourcecode is evaluable when the AST itself is not ++ compilable / evaluable. The reason for this is that the AST contains some ++ more data than regular sourcecode does, which is dropped during ++ conversion. ++ ++ Each level of indentation is replaced with `indent_with`. Per default this ++ parameter is equal to four spaces as suggested by PEP 8, but it might be ++ adjusted to match the application's styleguide. ++ """ ++ generator = SourceGenerator(indent_with) ++ generator.visit(node) ++ return ''.join(generator.result) ++ ++ ++def dump(node): ++ """ ++ A very verbose representation of the node passed. This is useful for ++ debugging purposes. 
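++
++    For example (illustrative), ``dump(parse('x'))`` returns the string
++    ``"Module(body=[Expr(value=Name(id='x', ctx=Load()))])"``.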
++ """ ++ def _format(node): ++ if isinstance(node, AST): ++ return '%s(%s)' % (node.__class__.__name__, ++ ', '.join('%s=%s' % (a, _format(b)) ++ for a, b in iter_fields(node))) ++ elif isinstance(node, list): ++ return '[%s]' % ', '.join(_format(x) for x in node) ++ return repr(node) ++ if not isinstance(node, AST): ++ raise TypeError('expected AST, got %r' % node.__class__.__name__) ++ return _format(node) ++ ++ ++def copy_location(new_node, old_node): ++ """ ++ Copy the source location hint (`lineno` and `col_offset`) from the ++ old to the new node if possible and return the new one. ++ """ ++ for attr in 'lineno', 'col_offset': ++ if attr in old_node._attributes and attr in new_node._attributes \ ++ and hasattr(old_node, attr): ++ setattr(new_node, attr, getattr(old_node, attr)) ++ return new_node ++ ++ ++def fix_missing_locations(node): ++ """ ++ Some nodes require a line number and the column offset. Without that ++ information the compiler will abort the compilation. Because it can be ++ a dull task to add appropriate line numbers and column offsets when ++ adding new nodes this function can help. It copies the line number and ++ column offset of the parent node to the child nodes without this ++ information. ++ ++ Unlike `copy_location` this works recursive and won't touch nodes that ++ already have a location information. ++ """ ++ def _fix(node, lineno, col_offset): ++ if 'lineno' in node._attributes: ++ if not hasattr(node, 'lineno'): ++ node.lineno = lineno ++ else: ++ lineno = node.lineno ++ if 'col_offset' in node._attributes: ++ if not hasattr(node, 'col_offset'): ++ node.col_offset = col_offset ++ else: ++ col_offset = node.col_offset ++ for child in iter_child_nodes(node): ++ _fix(child, lineno, col_offset) ++ _fix(node, 1, 0) ++ return node ++ ++ ++def increment_lineno(node, n=1): ++ """ ++ Increment the line numbers of all nodes by `n` if they have line number ++ attributes. This is useful to "move code" to a different location in a ++ file. ++ """ ++ for node in zip((node,), walk(node)): ++ if 'lineno' in node._attributes: ++ node.lineno = getattr(node, 'lineno', 0) + n ++ ++ ++def iter_fields(node): ++ """Iterate over all fields of a node, only yielding existing fields.""" ++ # CPython 2.5 compat ++ if not hasattr(node, '_fields') or not node._fields: ++ return ++ for field in node._fields: ++ try: ++ yield field, getattr(node, field) ++ except AttributeError: ++ pass ++ ++ ++def get_fields(node): ++ """Like `iter_fiels` but returns a dict.""" ++ return dict(iter_fields(node)) ++ ++ ++def iter_child_nodes(node): ++ """Iterate over all child nodes or a node.""" ++ for name, field in iter_fields(node): ++ if isinstance(field, AST): ++ yield field ++ elif isinstance(field, list): ++ for item in field: ++ if isinstance(item, AST): ++ yield item ++ ++ ++def get_child_nodes(node): ++ """Like `iter_child_nodes` but returns a list.""" ++ return list(iter_child_nodes(node)) ++ ++ ++def get_compile_mode(node): ++ """ ++ Get the mode for `compile` of a given node. If the node is not a `mod` ++ node (`Expression`, `Module` etc.) a `TypeError` is thrown. ++ """ ++ if not isinstance(node, mod): ++ raise TypeError('expected mod node, got %r' % node.__class__.__name__) ++ return { ++ Expression: 'eval', ++ Interactive: 'single' ++ }.get(node.__class__, 'expr') ++ ++ ++def get_docstring(node): ++ """ ++ Return the docstring for the given node or `None` if no docstring can be ++ found. If the node provided does not accept docstrings a `TypeError` ++ will be raised. 
++ """ ++ if not isinstance(node, (FunctionDef, ClassDef, Module)): ++ raise TypeError("%r can't have docstrings" % node.__class__.__name__) ++ if node.body and isinstance(node.body[0], Str): ++ return node.body[0].s ++ ++ ++def walk(node): ++ """ ++ Iterate over all nodes. This is useful if you only want to modify nodes in ++ place and don't care about the context or the order the nodes are returned. ++ """ ++ from collections import deque ++ todo = deque([node]) ++ while todo: ++ node = todo.popleft() ++ todo.extend(iter_child_nodes(node)) ++ yield node ++ ++ ++class NodeVisitor(object): ++ """ ++ Walks the abstract syntax tree and call visitor functions for every node ++ found. The visitor functions may return values which will be forwarded ++ by the `visit` method. ++ ++ Per default the visitor functions for the nodes are ``'visit_'`` + ++ class name of the node. So a `TryFinally` node visit function would ++ be `visit_TryFinally`. This behavior can be changed by overriding ++ the `get_visitor` function. If no visitor function exists for a node ++ (return value `None`) the `generic_visit` visitor is used instead. ++ ++ Don't use the `NodeVisitor` if you want to apply changes to nodes during ++ traversing. For this a special visitor exists (`NodeTransformer`) that ++ allows modifications. ++ """ ++ ++ def get_visitor(self, node): ++ """ ++ Return the visitor function for this node or `None` if no visitor ++ exists for this node. In that case the generic visit function is ++ used instead. ++ """ ++ method = 'visit_' + node.__class__.__name__ ++ return getattr(self, method, None) ++ ++ def visit(self, node): ++ """Visit a node.""" ++ f = self.get_visitor(node) ++ if f is not None: ++ return f(node) ++ return self.generic_visit(node) ++ ++ def generic_visit(self, node): ++ """Called if no explicit visitor function exists for a node.""" ++ for field, value in iter_fields(node): ++ if isinstance(value, list): ++ for item in value: ++ if isinstance(item, AST): ++ self.visit(item) ++ elif isinstance(value, AST): ++ self.visit(value) ++ ++ ++class NodeTransformer(NodeVisitor): ++ """ ++ Walks the abstract syntax tree and allows modifications of nodes. ++ ++ The `NodeTransformer` will walk the AST and use the return value of the ++ visitor functions to replace or remove the old node. If the return ++ value of the visitor function is `None` the node will be removed ++ from the previous location otherwise it's replaced with the return ++ value. The return value may be the original node in which case no ++ replacement takes place. ++ ++ Here an example transformer that rewrites all `foo` to `data['foo']`:: ++ ++ class RewriteName(NodeTransformer): ++ ++ def visit_Name(self, node): ++ return copy_location(Subscript( ++ value=Name(id='data', ctx=Load()), ++ slice=Index(value=Str(s=node.id)), ++ ctx=node.ctx ++ ), node) ++ ++ Keep in mind that if the node you're operating on has child nodes ++ you must either transform the child nodes yourself or call the generic ++ visit function for the node first. ++ ++ Nodes that were part of a collection of statements (that applies to ++ all statement nodes) may also return a list of nodes rather than just ++ a single node. 
++ ++ Usually you use the transformer like this:: ++ ++ node = YourTransformer().visit(node) ++ """ ++ ++ def generic_visit(self, node): ++ for field, old_value in iter_fields(node): ++ old_value = getattr(node, field, None) ++ if isinstance(old_value, list): ++ new_values = [] ++ for value in old_value: ++ if isinstance(value, AST): ++ value = self.visit(value) ++ if value is None: ++ continue ++ elif not isinstance(value, AST): ++ new_values.extend(value) ++ continue ++ new_values.append(value) ++ old_value[:] = new_values ++ elif isinstance(old_value, AST): ++ new_node = self.visit(old_value) ++ if new_node is None: ++ delattr(node, field) ++ else: ++ setattr(node, field, new_node) ++ return node ++ ++ ++class SourceGenerator(NodeVisitor): ++ """ ++ This visitor is able to transform a well formed syntax tree into python ++ sourcecode. For more details have a look at the docstring of the ++ `node_to_source` function. ++ """ ++ ++ def __init__(self, indent_with): ++ self.result = [] ++ self.indent_with = indent_with ++ self.indentation = 0 ++ self.new_lines = 0 ++ ++ def write(self, x): ++ if self.new_lines: ++ if self.result: ++ self.result.append('\n' * self.new_lines) ++ self.result.append(self.indent_with * self.indentation) ++ self.new_lines = 0 ++ self.result.append(x) ++ ++ def newline(self, n=1): ++ self.new_lines = max(self.new_lines, n) ++ ++ def body(self, statements): ++ self.new_line = True ++ self.indentation += 1 ++ for stmt in statements: ++ self.visit(stmt) ++ self.indentation -= 1 ++ ++ def body_or_else(self, node): ++ self.body(node.body) ++ if node.orelse: ++ self.newline() ++ self.write('else:') ++ self.body(node.orelse) ++ ++ def signature(self, node): ++ want_comma = [] ++ def write_comma(): ++ if want_comma: ++ self.write(', ') ++ else: ++ want_comma.append(True) ++ ++ padding = [None] * (len(node.args) - len(node.defaults)) ++ for arg, default in zip(node.args, padding + node.defaults): ++ write_comma() ++ self.visit(arg) ++ if default is not None: ++ self.write('=') ++ self.visit(default) ++ if node.vararg is not None: ++ write_comma() ++ self.write('*' + arg_stringname(node.vararg)) ++ if node.kwarg is not None: ++ write_comma() ++ self.write('**' + arg_stringname(node.kwarg)) ++ ++ def decorators(self, node): ++ for decorator in node.decorator_list: ++ self.newline() ++ self.write('@') ++ self.visit(decorator) ++ ++ # Statements ++ ++ def visit_Assign(self, node): ++ self.newline() ++ for idx, target in enumerate(node.targets): ++ if idx: ++ self.write(', ') ++ self.visit(target) ++ self.write(' = ') ++ self.visit(node.value) ++ ++ def visit_AugAssign(self, node): ++ self.newline() ++ self.visit(node.target) ++ self.write(BINOP_SYMBOLS[type(node.op)] + '=') ++ self.visit(node.value) ++ ++ def visit_ImportFrom(self, node): ++ self.newline() ++ self.write('from %s%s import ' % ('.' 
* node.level, node.module)) ++ for idx, item in enumerate(node.names): ++ if idx: ++ self.write(', ') ++ self.write(item) ++ ++ def visit_Import(self, node): ++ self.newline() ++ for item in node.names: ++ self.write('import ') ++ self.visit(item) ++ ++ def visit_Expr(self, node): ++ self.newline() ++ self.generic_visit(node) ++ ++ def visit_FunctionDef(self, node): ++ self.newline(n=2) ++ self.decorators(node) ++ self.newline() ++ self.write('def %s(' % node.name) ++ self.signature(node.args) ++ self.write('):') ++ self.body(node.body) ++ ++ def visit_ClassDef(self, node): ++ have_args = [] ++ def paren_or_comma(): ++ if have_args: ++ self.write(', ') ++ else: ++ have_args.append(True) ++ self.write('(') ++ ++ self.newline(n=3) ++ self.decorators(node) ++ self.newline() ++ self.write('class %s' % node.name) ++ for base in node.bases: ++ paren_or_comma() ++ self.visit(base) ++ # XXX: the if here is used to keep this module compatible ++ # with python 2.6. ++ if hasattr(node, 'keywords'): ++ for keyword in node.keywords: ++ paren_or_comma() ++ self.write(keyword.arg + '=') ++ self.visit(keyword.value) ++ if node.starargs is not None: ++ paren_or_comma() ++ self.write('*') ++ self.visit(node.starargs) ++ if node.kwargs is not None: ++ paren_or_comma() ++ self.write('**') ++ self.visit(node.kwargs) ++ self.write(have_args and '):' or ':') ++ self.body(node.body) ++ ++ def visit_If(self, node): ++ self.newline() ++ self.write('if ') ++ self.visit(node.test) ++ self.write(':') ++ self.body(node.body) ++ while True: ++ else_ = node.orelse ++ if len(else_) == 1 and isinstance(else_[0], If): ++ node = else_[0] ++ self.newline() ++ self.write('elif ') ++ self.visit(node.test) ++ self.write(':') ++ self.body(node.body) ++ else: ++ self.newline() ++ self.write('else:') ++ self.body(else_) ++ break ++ ++ def visit_For(self, node): ++ self.newline() ++ self.write('for ') ++ self.visit(node.target) ++ self.write(' in ') ++ self.visit(node.iter) ++ self.write(':') ++ self.body_or_else(node) ++ ++ def visit_While(self, node): ++ self.newline() ++ self.write('while ') ++ self.visit(node.test) ++ self.write(':') ++ self.body_or_else(node) ++ ++ def visit_With(self, node): ++ self.newline() ++ self.write('with ') ++ self.visit(node.context_expr) ++ if node.optional_vars is not None: ++ self.write(' as ') ++ self.visit(node.optional_vars) ++ self.write(':') ++ self.body(node.body) ++ ++ def visit_Pass(self, node): ++ self.newline() ++ self.write('pass') ++ ++ def visit_Print(self, node): ++ # XXX: python 2.6 only ++ self.newline() ++ self.write('print ') ++ want_comma = False ++ if node.dest is not None: ++ self.write(' >> ') ++ self.visit(node.dest) ++ want_comma = True ++ for value in node.values: ++ if want_comma: ++ self.write(', ') ++ self.visit(value) ++ want_comma = True ++ if not node.nl: ++ self.write(',') ++ ++ def visit_Delete(self, node): ++ self.newline() ++ self.write('del ') ++ for idx, target in enumerate(node): ++ if idx: ++ self.write(', ') ++ self.visit(target) ++ ++ def visit_TryExcept(self, node): ++ self.newline() ++ self.write('try:') ++ self.body(node.body) ++ for handler in node.handlers: ++ self.visit(handler) ++ ++ def visit_TryFinally(self, node): ++ self.newline() ++ self.write('try:') ++ self.body(node.body) ++ self.newline() ++ self.write('finally:') ++ self.body(node.finalbody) ++ ++ def visit_Global(self, node): ++ self.newline() ++ self.write('global ' + ', '.join(node.names)) ++ ++ def visit_Nonlocal(self, node): ++ self.newline() ++ self.write('nonlocal ' + ', 
'.join(node.names)) ++ ++ def visit_Return(self, node): ++ self.newline() ++ self.write('return ') ++ self.visit(node.value) ++ ++ def visit_Break(self, node): ++ self.newline() ++ self.write('break') ++ ++ def visit_Continue(self, node): ++ self.newline() ++ self.write('continue') ++ ++ def visit_Raise(self, node): ++ # XXX: Python 2.6 / 3.0 compatibility ++ self.newline() ++ self.write('raise') ++ if hasattr(node, 'exc') and node.exc is not None: ++ self.write(' ') ++ self.visit(node.exc) ++ if node.cause is not None: ++ self.write(' from ') ++ self.visit(node.cause) ++ elif hasattr(node, 'type') and node.type is not None: ++ self.visit(node.type) ++ if node.inst is not None: ++ self.write(', ') ++ self.visit(node.inst) ++ if node.tback is not None: ++ self.write(', ') ++ self.visit(node.tback) ++ ++ # Expressions ++ ++ def visit_Attribute(self, node): ++ self.visit(node.value) ++ self.write('.' + node.attr) ++ ++ def visit_Call(self, node): ++ want_comma = [] ++ def write_comma(): ++ if want_comma: ++ self.write(', ') ++ else: ++ want_comma.append(True) ++ ++ self.visit(node.func) ++ self.write('(') ++ for arg in node.args: ++ write_comma() ++ self.visit(arg) ++ for keyword in node.keywords: ++ write_comma() ++ self.write(keyword.arg + '=') ++ self.visit(keyword.value) ++ if node.starargs is not None: ++ write_comma() ++ self.write('*') ++ self.visit(node.starargs) ++ if node.kwargs is not None: ++ write_comma() ++ self.write('**') ++ self.visit(node.kwargs) ++ self.write(')') ++ ++ def visit_Name(self, node): ++ self.write(node.id) ++ ++ def visit_NameConstant(self, node): ++ self.write(str(node.value)) ++ ++ def visit_arg(self, node): ++ self.write(node.arg) ++ ++ def visit_Str(self, node): ++ self.write(repr(node.s)) ++ ++ def visit_Bytes(self, node): ++ self.write(repr(node.s)) ++ ++ def visit_Num(self, node): ++ self.write(repr(node.n)) ++ ++ def visit_Tuple(self, node): ++ self.write('(') ++ idx = -1 ++ for idx, item in enumerate(node.elts): ++ if idx: ++ self.write(', ') ++ self.visit(item) ++ self.write(idx and ')' or ',)') ++ ++ def sequence_visit(left, right): ++ def visit(self, node): ++ self.write(left) ++ for idx, item in enumerate(node.elts): ++ if idx: ++ self.write(', ') ++ self.visit(item) ++ self.write(right) ++ return visit ++ ++ visit_List = sequence_visit('[', ']') ++ visit_Set = sequence_visit('{', '}') ++ del sequence_visit ++ ++ def visit_Dict(self, node): ++ self.write('{') ++ for idx, (key, value) in enumerate(zip(node.keys, node.values)): ++ if idx: ++ self.write(', ') ++ self.visit(key) ++ self.write(': ') ++ self.visit(value) ++ self.write('}') ++ ++ def visit_BinOp(self, node): ++ self.write('(') ++ self.visit(node.left) ++ self.write(' %s ' % BINOP_SYMBOLS[type(node.op)]) ++ self.visit(node.right) ++ self.write(')') ++ ++ def visit_BoolOp(self, node): ++ self.write('(') ++ for idx, value in enumerate(node.values): ++ if idx: ++ self.write(' %s ' % BOOLOP_SYMBOLS[type(node.op)]) ++ self.visit(value) ++ self.write(')') ++ ++ def visit_Compare(self, node): ++ self.write('(') ++ self.visit(node.left) ++ for op, right in zip(node.ops, node.comparators): ++ self.write(' %s ' % CMPOP_SYMBOLS[type(op)]) ++ self.visit(right) ++ self.write(')') ++ ++ def visit_UnaryOp(self, node): ++ self.write('(') ++ op = UNARYOP_SYMBOLS[type(node.op)] ++ self.write(op) ++ if op == 'not': ++ self.write(' ') ++ self.visit(node.operand) ++ self.write(')') ++ ++ def visit_Subscript(self, node): ++ self.visit(node.value) ++ self.write('[') ++ self.visit(node.slice) ++ self.write(']') 
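++
++    # Example round trip (illustrative): parse a snippet, rewrite a name
++    # with a NodeTransformer subclass, then regenerate source; RenameFoo
++    # is a made-up transformer, not part of this module:
++    #
++    #     tree = parse("foo + 1")
++    #     class RenameFoo(NodeTransformer):
++    #         def visit_Name(self, node):
++    #             if node.id == 'foo':
++    #                 node.id = 'bar'   # mutate in place, keep the node
++    #             return node
++    #     to_source(RenameFoo().visit(tree))   # -> '(bar + 1)'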
++ ++ def visit_Slice(self, node): ++ if node.lower is not None: ++ self.visit(node.lower) ++ self.write(':') ++ if node.upper is not None: ++ self.visit(node.upper) ++ if node.step is not None: ++ self.write(':') ++ if not (isinstance(node.step, Name) and node.step.id == 'None'): ++ self.visit(node.step) ++ ++ def visit_ExtSlice(self, node): ++ for idx, item in node.dims: ++ if idx: ++ self.write(', ') ++ self.visit(item) ++ ++ def visit_Yield(self, node): ++ self.write('yield ') ++ self.visit(node.value) ++ ++ def visit_Lambda(self, node): ++ self.write('lambda ') ++ self.signature(node.args) ++ self.write(': ') ++ self.visit(node.body) ++ ++ def visit_Ellipsis(self, node): ++ self.write('Ellipsis') ++ ++ def generator_visit(left, right): ++ def visit(self, node): ++ self.write(left) ++ self.visit(node.elt) ++ for comprehension in node.generators: ++ self.visit(comprehension) ++ self.write(right) ++ return visit ++ ++ visit_ListComp = generator_visit('[', ']') ++ visit_GeneratorExp = generator_visit('(', ')') ++ visit_SetComp = generator_visit('{', '}') ++ del generator_visit ++ ++ def visit_DictComp(self, node): ++ self.write('{') ++ self.visit(node.key) ++ self.write(': ') ++ self.visit(node.value) ++ for comprehension in node.generators: ++ self.visit(comprehension) ++ self.write('}') ++ ++ def visit_IfExp(self, node): ++ self.visit(node.body) ++ self.write(' if ') ++ self.visit(node.test) ++ self.write(' else ') ++ self.visit(node.orelse) ++ ++ def visit_Starred(self, node): ++ self.write('*') ++ self.visit(node.value) ++ ++ def visit_Repr(self, node): ++ # XXX: python 2.6 only ++ self.write('`') ++ self.visit(node.value) ++ self.write('`') ++ ++ # Helper Nodes ++ ++ def visit_alias(self, node): ++ self.write(node.name) ++ if node.asname is not None: ++ self.write(' as ' + node.asname) ++ ++ def visit_comprehension(self, node): ++ self.write(' for ') ++ self.visit(node.target) ++ self.write(' in ') ++ self.visit(node.iter) ++ if node.ifs: ++ for if_ in node.ifs: ++ self.write(' if ') ++ self.visit(if_) ++ ++ def visit_excepthandler(self, node): ++ self.newline() ++ self.write('except') ++ if node.type is not None: ++ self.write(' ') ++ self.visit(node.type) ++ if node.name is not None: ++ self.write(' as ') ++ self.visit(node.name) ++ self.write(':') ++ self.body(node.body) +diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/ast.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/ast.py +new file mode 100644 +index 0000000..65fd84d +--- /dev/null ++++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/ast.py +@@ -0,0 +1,178 @@ ++# mako/ast.py ++# Copyright (C) 2006-2015 the Mako authors and contributors ++# ++# This module is part of Mako and is released under ++# the MIT License: http://www.opensource.org/licenses/mit-license.php ++ ++"""utilities for analyzing expressions and blocks of Python ++code, as well as generating Python from AST nodes""" ++ ++from mako import exceptions, pyparser, compat ++import re ++ ++class PythonCode(object): ++ """represents information about a string containing Python code""" ++ def __init__(self, code, **exception_kwargs): ++ self.code = code ++ ++ # represents all identifiers which are assigned to at some point in ++ # the code ++ self.declared_identifiers = set() ++ ++ # represents all identifiers which are referenced before their ++ # assignment, if any ++ self.undeclared_identifiers = set() ++ ++ # note that an identifier can be in both the undeclared and declared ++ # lists. 
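++        # (e.g. for the code "x = x + 1", 'x' is assigned in this block
++        # but also read before that assignment, so it appears in both sets)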
++ ++ # using AST to parse instead of using code.co_varnames, ++ # code.co_names has several advantages: ++ # - we can locate an identifier as "undeclared" even if ++ # its declared later in the same block of code ++ # - AST is less likely to break with version changes ++ # (for example, the behavior of co_names changed a little bit ++ # in python version 2.5) ++ if isinstance(code, compat.string_types): ++ expr = pyparser.parse(code.lstrip(), "exec", **exception_kwargs) ++ else: ++ expr = code ++ ++ f = pyparser.FindIdentifiers(self, **exception_kwargs) ++ f.visit(expr) ++ ++class ArgumentList(object): ++ """parses a fragment of code as a comma-separated list of expressions""" ++ def __init__(self, code, **exception_kwargs): ++ self.codeargs = [] ++ self.args = [] ++ self.declared_identifiers = set() ++ self.undeclared_identifiers = set() ++ if isinstance(code, compat.string_types): ++ if re.match(r"\S", code) and not re.match(r",\s*$", code): ++ # if theres text and no trailing comma, insure its parsed ++ # as a tuple by adding a trailing comma ++ code += "," ++ expr = pyparser.parse(code, "exec", **exception_kwargs) ++ else: ++ expr = code ++ ++ f = pyparser.FindTuple(self, PythonCode, **exception_kwargs) ++ f.visit(expr) ++ ++class PythonFragment(PythonCode): ++ """extends PythonCode to provide identifier lookups in partial control ++ statements ++ ++ e.g. ++ for x in 5: ++ elif y==9: ++ except (MyException, e): ++ etc. ++ """ ++ def __init__(self, code, **exception_kwargs): ++ m = re.match(r'^(\w+)(?:\s+(.*?))?:\s*(#|$)', code.strip(), re.S) ++ if not m: ++ raise exceptions.CompileException( ++ "Fragment '%s' is not a partial control statement" % ++ code, **exception_kwargs) ++ if m.group(3): ++ code = code[:m.start(3)] ++ (keyword, expr) = m.group(1,2) ++ if keyword in ['for','if', 'while']: ++ code = code + "pass" ++ elif keyword == 'try': ++ code = code + "pass\nexcept:pass" ++ elif keyword == 'elif' or keyword == 'else': ++ code = "if False:pass\n" + code + "pass" ++ elif keyword == 'except': ++ code = "try:pass\n" + code + "pass" ++ elif keyword == 'with': ++ code = code + "pass" ++ else: ++ raise exceptions.CompileException( ++ "Unsupported control keyword: '%s'" % ++ keyword, **exception_kwargs) ++ super(PythonFragment, self).__init__(code, **exception_kwargs) ++ ++ ++class FunctionDecl(object): ++ """function declaration""" ++ def __init__(self, code, allow_kwargs=True, **exception_kwargs): ++ self.code = code ++ expr = pyparser.parse(code, "exec", **exception_kwargs) ++ ++ f = pyparser.ParseFunc(self, **exception_kwargs) ++ f.visit(expr) ++ if not hasattr(self, 'funcname'): ++ raise exceptions.CompileException( ++ "Code '%s' is not a function declaration" % code, ++ **exception_kwargs) ++ if not allow_kwargs and self.kwargs: ++ raise exceptions.CompileException( ++ "'**%s' keyword argument not allowed here" % ++ self.kwargnames[-1], **exception_kwargs) ++ ++ def get_argument_expressions(self, as_call=False): ++ """Return the argument declarations of this FunctionDecl as a printable ++ list. ++ ++ By default the return value is appropriate for writing in a ``def``; ++ set `as_call` to true to build arguments to be passed to the function ++ instead (assuming locals with the same names as the arguments exist). 
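++
++        For example (illustrative), for ``def foo(a, b=1, **kw)`` this
++        returns ``['a', 'b=1', '**kw']`` by default, and
++        ``['a', 'b', '**kw']`` with ``as_call=True``.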
++ """ ++ ++ namedecls = [] ++ ++ # Build in reverse order, since defaults and slurpy args come last ++ argnames = self.argnames[::-1] ++ kwargnames = self.kwargnames[::-1] ++ defaults = self.defaults[::-1] ++ kwdefaults = self.kwdefaults[::-1] ++ ++ # Named arguments ++ if self.kwargs: ++ namedecls.append("**" + kwargnames.pop(0)) ++ ++ for name in kwargnames: ++ # Keyword-only arguments must always be used by name, so even if ++ # this is a call, print out `foo=foo` ++ if as_call: ++ namedecls.append("%s=%s" % (name, name)) ++ elif kwdefaults: ++ default = kwdefaults.pop(0) ++ if default is None: ++ # The AST always gives kwargs a default, since you can do ++ # `def foo(*, a=1, b, c=3)` ++ namedecls.append(name) ++ else: ++ namedecls.append("%s=%s" % ( ++ name, pyparser.ExpressionGenerator(default).value())) ++ else: ++ namedecls.append(name) ++ ++ # Positional arguments ++ if self.varargs: ++ namedecls.append("*" + argnames.pop(0)) ++ ++ for name in argnames: ++ if as_call or not defaults: ++ namedecls.append(name) ++ else: ++ default = defaults.pop(0) ++ namedecls.append("%s=%s" % ( ++ name, pyparser.ExpressionGenerator(default).value())) ++ ++ namedecls.reverse() ++ return namedecls ++ ++ @property ++ def allargnames(self): ++ return tuple(self.argnames) + tuple(self.kwargnames) ++ ++class FunctionArgs(FunctionDecl): ++ """the argument portion of a function declaration""" ++ ++ def __init__(self, code, **kwargs): ++ super(FunctionArgs, self).__init__("def ANON(%s):pass" % code, ++ **kwargs) +diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/cache.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/cache.py +new file mode 100644 +index 0000000..c405c51 +--- /dev/null ++++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/cache.py +@@ -0,0 +1,238 @@ ++# mako/cache.py ++# Copyright (C) 2006-2015 the Mako authors and contributors ++# ++# This module is part of Mako and is released under ++# the MIT License: http://www.opensource.org/licenses/mit-license.php ++ ++from mako import compat, util ++ ++_cache_plugins = util.PluginLoader("mako.cache") ++ ++register_plugin = _cache_plugins.register ++register_plugin("beaker", "mako.ext.beaker_cache", "BeakerCacheImpl") ++ ++ ++class Cache(object): ++ """Represents a data content cache made available to the module ++ space of a specific :class:`.Template` object. ++ ++ .. versionadded:: 0.6 ++ :class:`.Cache` by itself is mostly a ++ container for a :class:`.CacheImpl` object, which implements ++ a fixed API to provide caching services; specific subclasses exist to ++ implement different ++ caching strategies. Mako includes a backend that works with ++ the Beaker caching system. Beaker itself then supports ++ a number of backends (i.e. file, memory, memcached, etc.) ++ ++ The construction of a :class:`.Cache` is part of the mechanics ++ of a :class:`.Template`, and programmatic access to this ++ cache is typically via the :attr:`.Template.cache` attribute. ++ ++ """ ++ ++ impl = None ++ """Provide the :class:`.CacheImpl` in use by this :class:`.Cache`. ++ ++ This accessor allows a :class:`.CacheImpl` with additional ++ methods beyond that of :class:`.Cache` to be used programmatically. ++ ++ """ ++ ++ id = None ++ """Return the 'id' that identifies this cache. ++ ++ This is a value that should be globally unique to the ++ :class:`.Template` associated with this cache, and can ++ be used by a caching system to name a local container ++ for data specific to this template. 
++ ++ """ ++ ++ starttime = None ++ """Epochal time value for when the owning :class:`.Template` was ++ first compiled. ++ ++ A cache implementation may wish to invalidate data earlier than ++ this timestamp; this has the effect of the cache for a specific ++ :class:`.Template` starting clean any time the :class:`.Template` ++ is recompiled, such as when the original template file changed on ++ the filesystem. ++ ++ """ ++ ++ def __init__(self, template, *args): ++ # check for a stale template calling the ++ # constructor ++ if isinstance(template, compat.string_types) and args: ++ return ++ self.template = template ++ self.id = template.module.__name__ ++ self.starttime = template.module._modified_time ++ self._def_regions = {} ++ self.impl = self._load_impl(self.template.cache_impl) ++ ++ def _load_impl(self, name): ++ return _cache_plugins.load(name)(self) ++ ++ def get_or_create(self, key, creation_function, **kw): ++ """Retrieve a value from the cache, using the given creation function ++ to generate a new value.""" ++ ++ return self._ctx_get_or_create(key, creation_function, None, **kw) ++ ++ def _ctx_get_or_create(self, key, creation_function, context, **kw): ++ """Retrieve a value from the cache, using the given creation function ++ to generate a new value.""" ++ ++ if not self.template.cache_enabled: ++ return creation_function() ++ ++ return self.impl.get_or_create( ++ key, ++ creation_function, ++ **self._get_cache_kw(kw, context)) ++ ++ def set(self, key, value, **kw): ++ """Place a value in the cache. ++ ++ :param key: the value's key. ++ :param value: the value. ++ :param \**kw: cache configuration arguments. ++ ++ """ ++ ++ self.impl.set(key, value, **self._get_cache_kw(kw, None)) ++ ++ put = set ++ """A synonym for :meth:`.Cache.set`. ++ ++ This is here for backwards compatibility. ++ ++ """ ++ ++ def get(self, key, **kw): ++ """Retrieve a value from the cache. ++ ++ :param key: the value's key. ++ :param \**kw: cache configuration arguments. The ++ backend is configured using these arguments upon first request. ++ Subsequent requests that use the same series of configuration ++ values will use that same backend. ++ ++ """ ++ return self.impl.get(key, **self._get_cache_kw(kw, None)) ++ ++ def invalidate(self, key, **kw): ++ """Invalidate a value in the cache. ++ ++ :param key: the value's key. ++ :param \**kw: cache configuration arguments. The ++ backend is configured using these arguments upon first request. ++ Subsequent requests that use the same series of configuration ++ values will use that same backend. ++ ++ """ ++ self.impl.invalidate(key, **self._get_cache_kw(kw, None)) ++ ++ def invalidate_body(self): ++ """Invalidate the cached content of the "body" method for this ++ template. ++ ++ """ ++ self.invalidate('render_body', __M_defname='render_body') ++ ++ def invalidate_def(self, name): ++ """Invalidate the cached content of a particular ``<%def>`` within this ++ template. ++ ++ """ ++ ++ self.invalidate('render_%s' % name, __M_defname='render_%s' % name) ++ ++ def invalidate_closure(self, name): ++ """Invalidate a nested ``<%def>`` within this template. ++ ++ Caching of nested defs is a blunt tool as there is no ++ management of scope -- nested defs that use cache tags ++ need to have names unique of all other nested defs in the ++ template, else their content will be overwritten by ++ each other. 
++ ++ """ ++ ++ self.invalidate(name, __M_defname=name) ++ ++ def _get_cache_kw(self, kw, context): ++ defname = kw.pop('__M_defname', None) ++ if not defname: ++ tmpl_kw = self.template.cache_args.copy() ++ tmpl_kw.update(kw) ++ elif defname in self._def_regions: ++ tmpl_kw = self._def_regions[defname] ++ else: ++ tmpl_kw = self.template.cache_args.copy() ++ tmpl_kw.update(kw) ++ self._def_regions[defname] = tmpl_kw ++ if context and self.impl.pass_context: ++ tmpl_kw = tmpl_kw.copy() ++ tmpl_kw.setdefault('context', context) ++ return tmpl_kw ++ ++ ++class CacheImpl(object): ++ """Provide a cache implementation for use by :class:`.Cache`.""" ++ ++ def __init__(self, cache): ++ self.cache = cache ++ ++ pass_context = False ++ """If ``True``, the :class:`.Context` will be passed to ++ :meth:`get_or_create <.CacheImpl.get_or_create>` as the name ``'context'``. ++ """ ++ ++ def get_or_create(self, key, creation_function, **kw): ++ """Retrieve a value from the cache, using the given creation function ++ to generate a new value. ++ ++ This function *must* return a value, either from ++ the cache, or via the given creation function. ++ If the creation function is called, the newly ++ created value should be populated into the cache ++ under the given key before being returned. ++ ++ :param key: the value's key. ++ :param creation_function: function that when called generates ++ a new value. ++ :param \**kw: cache configuration arguments. ++ ++ """ ++ raise NotImplementedError() ++ ++ def set(self, key, value, **kw): ++ """Place a value in the cache. ++ ++ :param key: the value's key. ++ :param value: the value. ++ :param \**kw: cache configuration arguments. ++ ++ """ ++ raise NotImplementedError() ++ ++ def get(self, key, **kw): ++ """Retrieve a value from the cache. ++ ++ :param key: the value's key. ++ :param \**kw: cache configuration arguments. ++ ++ """ ++ raise NotImplementedError() ++ ++ def invalidate(self, key, **kw): ++ """Invalidate a value in the cache. ++ ++ :param key: the value's key. ++ :param \**kw: cache configuration arguments. ++ ++ """ ++ raise NotImplementedError() +diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/cmd.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/cmd.py +new file mode 100644 +index 0000000..1a9ca56 +--- /dev/null ++++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/cmd.py +@@ -0,0 +1,62 @@ ++# mako/cmd.py ++# Copyright (C) 2006-2015 the Mako authors and contributors ++# ++# This module is part of Mako and is released under ++# the MIT License: http://www.opensource.org/licenses/mit-license.php ++from argparse import ArgumentParser ++from os.path import isfile, dirname ++import sys ++from mako.template import Template ++from mako.lookup import TemplateLookup ++from mako import exceptions ++ ++def varsplit(var): ++ if "=" not in var: ++ return (var, "") ++ return var.split("=", 1) ++ ++def _exit(): ++ sys.stderr.write(exceptions.text_error_template().render()) ++ sys.exit(1) ++ ++def cmdline(argv=None): ++ ++ parser = ArgumentParser("usage: %prog [FILENAME]") ++ parser.add_argument("--var", default=[], action="append", ++ help="variable (can be used multiple times, use name=value)") ++ parser.add_argument("--template-dir", default=[], action="append", ++ help="Directory to use for template lookup (multiple " ++ "directories may be provided). 
If not given then if the " ++ "template is read from stdin, the value defaults to be " ++ "the current directory, otherwise it defaults to be the " ++ "parent directory of the file provided.") ++ parser.add_argument('input', nargs='?', default='-') ++ ++ options = parser.parse_args(argv) ++ if options.input == '-': ++ lookup_dirs = options.template_dir or ["."] ++ lookup = TemplateLookup(lookup_dirs) ++ try: ++ template = Template(sys.stdin.read(), lookup=lookup) ++ except: ++ _exit() ++ else: ++ filename = options.input ++ if not isfile(filename): ++ raise SystemExit("error: can't find %s" % filename) ++ lookup_dirs = options.template_dir or [dirname(filename)] ++ lookup = TemplateLookup(lookup_dirs) ++ try: ++ template = Template(filename=filename, lookup=lookup) ++ except: ++ _exit() ++ ++ kw = dict([varsplit(var) for var in options.var]) ++ try: ++ print(template.render(**kw)) ++ except: ++ _exit() ++ ++ ++if __name__ == "__main__": ++ cmdline() +diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/codegen.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/codegen.py +new file mode 100644 +index 0000000..4b0bda8 +--- /dev/null ++++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/codegen.py +@@ -0,0 +1,1237 @@ ++# mako/codegen.py ++# Copyright (C) 2006-2015 the Mako authors and contributors ++# ++# This module is part of Mako and is released under ++# the MIT License: http://www.opensource.org/licenses/mit-license.php ++ ++"""provides functionality for rendering a parsetree constructing into module ++source code.""" ++ ++import time ++import re ++from mako.pygen import PythonPrinter ++from mako import util, ast, parsetree, filters, exceptions ++from mako import compat ++ ++ ++MAGIC_NUMBER = 10 ++ ++# names which are hardwired into the ++# template and are not accessed via the ++# context itself ++RESERVED_NAMES = set(['context', 'loop', 'UNDEFINED']) ++ ++def compile(node, ++ uri, ++ filename=None, ++ default_filters=None, ++ buffer_filters=None, ++ imports=None, ++ future_imports=None, ++ source_encoding=None, ++ generate_magic_comment=True, ++ disable_unicode=False, ++ strict_undefined=False, ++ enable_loop=True, ++ reserved_names=frozenset()): ++ ++ """Generate module source code given a parsetree node, ++ uri, and optional source filename""" ++ ++ # if on Py2K, push the "source_encoding" string to be ++ # a bytestring itself, as we will be embedding it into ++ # the generated source and we don't want to coerce the ++ # result into a unicode object, in "disable_unicode" mode ++ if not compat.py3k and isinstance(source_encoding, compat.text_type): ++ source_encoding = source_encoding.encode(source_encoding) ++ ++ ++ buf = util.FastEncodingBuffer() ++ ++ printer = PythonPrinter(buf) ++ _GenerateRenderMethod(printer, ++ _CompileContext(uri, ++ filename, ++ default_filters, ++ buffer_filters, ++ imports, ++ future_imports, ++ source_encoding, ++ generate_magic_comment, ++ disable_unicode, ++ strict_undefined, ++ enable_loop, ++ reserved_names), ++ node) ++ return buf.getvalue() ++ ++class _CompileContext(object): ++ def __init__(self, ++ uri, ++ filename, ++ default_filters, ++ buffer_filters, ++ imports, ++ future_imports, ++ source_encoding, ++ generate_magic_comment, ++ disable_unicode, ++ strict_undefined, ++ enable_loop, ++ reserved_names): ++ self.uri = uri ++ self.filename = filename ++ self.default_filters = default_filters ++ self.buffer_filters = buffer_filters ++ self.imports = imports ++ self.future_imports = future_imports ++ self.source_encoding = 
source_encoding ++ self.generate_magic_comment = generate_magic_comment ++ self.disable_unicode = disable_unicode ++ self.strict_undefined = strict_undefined ++ self.enable_loop = enable_loop ++ self.reserved_names = reserved_names ++ ++class _GenerateRenderMethod(object): ++ """A template visitor object which generates the ++ full module source for a template. ++ ++ """ ++ def __init__(self, printer, compiler, node): ++ self.printer = printer ++ self.compiler = compiler ++ self.node = node ++ self.identifier_stack = [None] ++ self.in_def = isinstance(node, (parsetree.DefTag, parsetree.BlockTag)) ++ ++ if self.in_def: ++ name = "render_%s" % node.funcname ++ args = node.get_argument_expressions() ++ filtered = len(node.filter_args.args) > 0 ++ buffered = eval(node.attributes.get('buffered', 'False')) ++ cached = eval(node.attributes.get('cached', 'False')) ++ defs = None ++ pagetag = None ++ if node.is_block and not node.is_anonymous: ++ args += ['**pageargs'] ++ else: ++ defs = self.write_toplevel() ++ pagetag = self.compiler.pagetag ++ name = "render_body" ++ if pagetag is not None: ++ args = pagetag.body_decl.get_argument_expressions() ++ if not pagetag.body_decl.kwargs: ++ args += ['**pageargs'] ++ cached = eval(pagetag.attributes.get('cached', 'False')) ++ self.compiler.enable_loop = self.compiler.enable_loop or eval( ++ pagetag.attributes.get( ++ 'enable_loop', 'False') ++ ) ++ else: ++ args = ['**pageargs'] ++ cached = False ++ buffered = filtered = False ++ if args is None: ++ args = ['context'] ++ else: ++ args = [a for a in ['context'] + args] ++ ++ self.write_render_callable( ++ pagetag or node, ++ name, args, ++ buffered, filtered, cached) ++ ++ if defs is not None: ++ for node in defs: ++ _GenerateRenderMethod(printer, compiler, node) ++ ++ if not self.in_def: ++ self.write_metadata_struct() ++ ++ def write_metadata_struct(self): ++ self.printer.source_map[self.printer.lineno] = \ ++ max(self.printer.source_map) ++ struct = { ++ "filename": self.compiler.filename, ++ "uri": self.compiler.uri, ++ "source_encoding": self.compiler.source_encoding, ++ "line_map": self.printer.source_map, ++ } ++ self.printer.writelines( ++ '"""', ++ '__M_BEGIN_METADATA', ++ compat.json.dumps(struct), ++ '__M_END_METADATA\n' ++ '"""' ++ ) ++ ++ @property ++ def identifiers(self): ++ return self.identifier_stack[-1] ++ ++ def write_toplevel(self): ++ """Traverse a template structure for module-level directives and ++ generate the start of module-level code. 
++ ++ """ ++ inherit = [] ++ namespaces = {} ++ module_code = [] ++ ++ self.compiler.pagetag = None ++ ++ class FindTopLevel(object): ++ def visitInheritTag(s, node): ++ inherit.append(node) ++ def visitNamespaceTag(s, node): ++ namespaces[node.name] = node ++ def visitPageTag(s, node): ++ self.compiler.pagetag = node ++ def visitCode(s, node): ++ if node.ismodule: ++ module_code.append(node) ++ ++ f = FindTopLevel() ++ for n in self.node.nodes: ++ n.accept_visitor(f) ++ ++ self.compiler.namespaces = namespaces ++ ++ module_ident = set() ++ for n in module_code: ++ module_ident = module_ident.union(n.declared_identifiers()) ++ ++ module_identifiers = _Identifiers(self.compiler) ++ module_identifiers.declared = module_ident ++ ++ # module-level names, python code ++ if self.compiler.generate_magic_comment and \ ++ self.compiler.source_encoding: ++ self.printer.writeline("# -*- coding:%s -*-" % ++ self.compiler.source_encoding) ++ ++ if self.compiler.future_imports: ++ self.printer.writeline("from __future__ import %s" % ++ (", ".join(self.compiler.future_imports),)) ++ self.printer.writeline("from mako import runtime, filters, cache") ++ self.printer.writeline("UNDEFINED = runtime.UNDEFINED") ++ self.printer.writeline("__M_dict_builtin = dict") ++ self.printer.writeline("__M_locals_builtin = locals") ++ self.printer.writeline("_magic_number = %r" % MAGIC_NUMBER) ++ self.printer.writeline("_modified_time = %r" % time.time()) ++ self.printer.writeline("_enable_loop = %r" % self.compiler.enable_loop) ++ self.printer.writeline( ++ "_template_filename = %r" % self.compiler.filename) ++ self.printer.writeline("_template_uri = %r" % self.compiler.uri) ++ self.printer.writeline( ++ "_source_encoding = %r" % self.compiler.source_encoding) ++ if self.compiler.imports: ++ buf = '' ++ for imp in self.compiler.imports: ++ buf += imp + "\n" ++ self.printer.writeline(imp) ++ impcode = ast.PythonCode( ++ buf, ++ source='', lineno=0, ++ pos=0, ++ filename='template defined imports') ++ else: ++ impcode = None ++ ++ main_identifiers = module_identifiers.branch(self.node) ++ module_identifiers.topleveldefs = \ ++ module_identifiers.topleveldefs.\ ++ union(main_identifiers.topleveldefs) ++ module_identifiers.declared.add("UNDEFINED") ++ if impcode: ++ module_identifiers.declared.update(impcode.declared_identifiers) ++ ++ self.compiler.identifiers = module_identifiers ++ self.printer.writeline("_exports = %r" % ++ [n.name for n in ++ main_identifiers.topleveldefs.values()] ++ ) ++ self.printer.write_blanks(2) ++ ++ if len(module_code): ++ self.write_module_code(module_code) ++ ++ if len(inherit): ++ self.write_namespaces(namespaces) ++ self.write_inherit(inherit[-1]) ++ elif len(namespaces): ++ self.write_namespaces(namespaces) ++ ++ return list(main_identifiers.topleveldefs.values()) ++ ++ def write_render_callable(self, node, name, args, buffered, filtered, ++ cached): ++ """write a top-level render callable. 
++ ++ this could be the main render() method or that of a top-level def.""" ++ ++ if self.in_def: ++ decorator = node.decorator ++ if decorator: ++ self.printer.writeline( ++ "@runtime._decorate_toplevel(%s)" % decorator) ++ ++ self.printer.start_source(node.lineno) ++ self.printer.writelines( ++ "def %s(%s):" % (name, ','.join(args)), ++ # push new frame, assign current frame to __M_caller ++ "__M_caller = context.caller_stack._push_frame()", ++ "try:" ++ ) ++ if buffered or filtered or cached: ++ self.printer.writeline("context._push_buffer()") ++ ++ self.identifier_stack.append( ++ self.compiler.identifiers.branch(self.node)) ++ if (not self.in_def or self.node.is_block) and '**pageargs' in args: ++ self.identifier_stack[-1].argument_declared.add('pageargs') ++ ++ if not self.in_def and ( ++ len(self.identifiers.locally_assigned) > 0 or ++ len(self.identifiers.argument_declared) > 0 ++ ): ++ self.printer.writeline("__M_locals = __M_dict_builtin(%s)" % ++ ','.join([ ++ "%s=%s" % (x, x) for x in ++ self.identifiers.argument_declared ++ ])) ++ ++ self.write_variable_declares(self.identifiers, toplevel=True) ++ ++ for n in self.node.nodes: ++ n.accept_visitor(self) ++ ++ self.write_def_finish(self.node, buffered, filtered, cached) ++ self.printer.writeline(None) ++ self.printer.write_blanks(2) ++ if cached: ++ self.write_cache_decorator( ++ node, name, ++ args, buffered, ++ self.identifiers, toplevel=True) ++ ++ def write_module_code(self, module_code): ++ """write module-level template code, i.e. that which ++ is enclosed in <%! %> tags in the template.""" ++ for n in module_code: ++ self.printer.start_source(n.lineno) ++ self.printer.write_indented_block(n.text) ++ ++ def write_inherit(self, node): ++ """write the module-level inheritance-determination callable.""" ++ ++ self.printer.writelines( ++ "def _mako_inherit(template, context):", ++ "_mako_generate_namespaces(context)", ++ "return runtime._inherit_from(context, %s, _template_uri)" % ++ (node.parsed_attributes['file']), ++ None ++ ) ++ ++ def write_namespaces(self, namespaces): ++ """write the module-level namespace-generating callable.""" ++ self.printer.writelines( ++ "def _mako_get_namespace(context, name):", ++ "try:", ++ "return context.namespaces[(__name__, name)]", ++ "except KeyError:", ++ "_mako_generate_namespaces(context)", ++ "return context.namespaces[(__name__, name)]", ++ None, None ++ ) ++ self.printer.writeline("def _mako_generate_namespaces(context):") ++ ++ ++ for node in namespaces.values(): ++ if 'import' in node.attributes: ++ self.compiler.has_ns_imports = True ++ self.printer.start_source(node.lineno) ++ if len(node.nodes): ++ self.printer.writeline("def make_namespace():") ++ export = [] ++ identifiers = self.compiler.identifiers.branch(node) ++ self.in_def = True ++ class NSDefVisitor(object): ++ def visitDefTag(s, node): ++ s.visitDefOrBase(node) ++ ++ def visitBlockTag(s, node): ++ s.visitDefOrBase(node) ++ ++ def visitDefOrBase(s, node): ++ if node.is_anonymous: ++ raise exceptions.CompileException( ++ "Can't put anonymous blocks inside " ++ "<%namespace>", ++ **node.exception_kwargs ++ ) ++ self.write_inline_def(node, identifiers, nested=False) ++ export.append(node.funcname) ++ vis = NSDefVisitor() ++ for n in node.nodes: ++ n.accept_visitor(vis) ++ self.printer.writeline("return [%s]" % (','.join(export))) ++ self.printer.writeline(None) ++ self.in_def = False ++ callable_name = "make_namespace()" ++ else: ++ callable_name = "None" ++ ++ if 'file' in node.parsed_attributes: ++ 
self.printer.writeline( ++ "ns = runtime.TemplateNamespace(%r," ++ " context._clean_inheritance_tokens()," ++ " templateuri=%s, callables=%s, " ++ " calling_uri=_template_uri)" % ++ ( ++ node.name, ++ node.parsed_attributes.get('file', 'None'), ++ callable_name, ++ ) ++ ) ++ elif 'module' in node.parsed_attributes: ++ self.printer.writeline( ++ "ns = runtime.ModuleNamespace(%r," ++ " context._clean_inheritance_tokens()," ++ " callables=%s, calling_uri=_template_uri," ++ " module=%s)" % ++ ( ++ node.name, ++ callable_name, ++ node.parsed_attributes.get( ++ 'module', 'None') ++ ) ++ ) ++ else: ++ self.printer.writeline( ++ "ns = runtime.Namespace(%r," ++ " context._clean_inheritance_tokens()," ++ " callables=%s, calling_uri=_template_uri)" % ++ ( ++ node.name, ++ callable_name, ++ ) ++ ) ++ if eval(node.attributes.get('inheritable', "False")): ++ self.printer.writeline("context['self'].%s = ns" % (node.name)) ++ ++ self.printer.writeline( ++ "context.namespaces[(__name__, %s)] = ns" % repr(node.name)) ++ self.printer.write_blanks(1) ++ if not len(namespaces): ++ self.printer.writeline("pass") ++ self.printer.writeline(None) ++ ++ def write_variable_declares(self, identifiers, toplevel=False, limit=None): ++ """write variable declarations at the top of a function. ++ ++ the variable declarations are in the form of callable ++ definitions for defs and/or name lookup within the ++ function's context argument. the names declared are based ++ on the names that are referenced in the function body, ++ which don't otherwise have any explicit assignment ++ operation. names that are assigned within the body are ++ assumed to be locally-scoped variables and are not ++ separately declared. ++ ++ for def callable definitions, if the def is a top-level ++ callable then a 'stub' callable is generated which wraps ++ the current Context into a closure. if the def is not ++ top-level, it is fully rendered as a local closure. ++ ++ """ ++ ++ # collection of all defs available to us in this scope ++ comp_idents = dict([(c.funcname, c) for c in identifiers.defs]) ++ to_write = set() ++ ++ # write "context.get()" for all variables we are going to ++ # need that arent in the namespace yet ++ to_write = to_write.union(identifiers.undeclared) ++ ++ # write closure functions for closures that we define ++ # right here ++ to_write = to_write.union( ++ [c.funcname for c in identifiers.closuredefs.values()]) ++ ++ # remove identifiers that are declared in the argument ++ # signature of the callable ++ to_write = to_write.difference(identifiers.argument_declared) ++ ++ # remove identifiers that we are going to assign to. ++ # in this way we mimic Python's behavior, ++ # i.e. assignment to a variable within a block ++ # means that variable is now a "locally declared" var, ++ # which cannot be referenced beforehand. ++ to_write = to_write.difference(identifiers.locally_declared) ++ ++ if self.compiler.enable_loop: ++ has_loop = "loop" in to_write ++ to_write.discard("loop") ++ else: ++ has_loop = False ++ ++ # if a limiting set was sent, constraint to those items in that list ++ # (this is used for the caching decorator) ++ if limit is not None: ++ to_write = to_write.intersection(limit) ++ ++ if toplevel and getattr(self.compiler, 'has_ns_imports', False): ++ self.printer.writeline("_import_ns = {}") ++ self.compiler.has_imports = True ++ for ident, ns in self.compiler.namespaces.items(): ++ if 'import' in ns.attributes: ++ self.printer.writeline( ++ "_mako_get_namespace(context, %r)." 
++ "_populate(_import_ns, %r)" % ++ ( ++ ident, ++ re.split(r'\s*,\s*', ns.attributes['import']) ++ )) ++ ++ if has_loop: ++ self.printer.writeline( ++ 'loop = __M_loop = runtime.LoopStack()' ++ ) ++ ++ for ident in to_write: ++ if ident in comp_idents: ++ comp = comp_idents[ident] ++ if comp.is_block: ++ if not comp.is_anonymous: ++ self.write_def_decl(comp, identifiers) ++ else: ++ self.write_inline_def(comp, identifiers, nested=True) ++ else: ++ if comp.is_root(): ++ self.write_def_decl(comp, identifiers) ++ else: ++ self.write_inline_def(comp, identifiers, nested=True) ++ ++ elif ident in self.compiler.namespaces: ++ self.printer.writeline( ++ "%s = _mako_get_namespace(context, %r)" % ++ (ident, ident) ++ ) ++ else: ++ if getattr(self.compiler, 'has_ns_imports', False): ++ if self.compiler.strict_undefined: ++ self.printer.writelines( ++ "%s = _import_ns.get(%r, UNDEFINED)" % ++ (ident, ident), ++ "if %s is UNDEFINED:" % ident, ++ "try:", ++ "%s = context[%r]" % (ident, ident), ++ "except KeyError:", ++ "raise NameError(\"'%s' is not defined\")" % ++ ident, ++ None, None ++ ) ++ else: ++ self.printer.writeline( ++ "%s = _import_ns.get(%r, context.get(%r, UNDEFINED))" % ++ (ident, ident, ident)) ++ else: ++ if self.compiler.strict_undefined: ++ self.printer.writelines( ++ "try:", ++ "%s = context[%r]" % (ident, ident), ++ "except KeyError:", ++ "raise NameError(\"'%s' is not defined\")" % ++ ident, ++ None ++ ) ++ else: ++ self.printer.writeline( ++ "%s = context.get(%r, UNDEFINED)" % (ident, ident) ++ ) ++ ++ self.printer.writeline("__M_writer = context.writer()") ++ ++ def write_def_decl(self, node, identifiers): ++ """write a locally-available callable referencing a top-level def""" ++ funcname = node.funcname ++ namedecls = node.get_argument_expressions() ++ nameargs = node.get_argument_expressions(as_call=True) ++ ++ if not self.in_def and ( ++ len(self.identifiers.locally_assigned) > 0 or ++ len(self.identifiers.argument_declared) > 0): ++ nameargs.insert(0, 'context._locals(__M_locals)') ++ else: ++ nameargs.insert(0, 'context') ++ self.printer.writeline("def %s(%s):" % (funcname, ",".join(namedecls))) ++ self.printer.writeline( ++ "return render_%s(%s)" % (funcname, ",".join(nameargs))) ++ self.printer.writeline(None) ++ ++ def write_inline_def(self, node, identifiers, nested): ++ """write a locally-available def callable inside an enclosing def.""" ++ ++ namedecls = node.get_argument_expressions() ++ ++ decorator = node.decorator ++ if decorator: ++ self.printer.writeline( ++ "@runtime._decorate_inline(context, %s)" % decorator) ++ self.printer.writeline( ++ "def %s(%s):" % (node.funcname, ",".join(namedecls))) ++ filtered = len(node.filter_args.args) > 0 ++ buffered = eval(node.attributes.get('buffered', 'False')) ++ cached = eval(node.attributes.get('cached', 'False')) ++ self.printer.writelines( ++ # push new frame, assign current frame to __M_caller ++ "__M_caller = context.caller_stack._push_frame()", ++ "try:" ++ ) ++ if buffered or filtered or cached: ++ self.printer.writelines( ++ "context._push_buffer()", ++ ) ++ ++ identifiers = identifiers.branch(node, nested=nested) ++ ++ self.write_variable_declares(identifiers) ++ ++ self.identifier_stack.append(identifiers) ++ for n in node.nodes: ++ n.accept_visitor(self) ++ self.identifier_stack.pop() ++ ++ self.write_def_finish(node, buffered, filtered, cached) ++ self.printer.writeline(None) ++ if cached: ++ self.write_cache_decorator(node, node.funcname, ++ namedecls, False, identifiers, ++ inline=True, toplevel=False) ++ ++ 
def write_def_finish(self, node, buffered, filtered, cached, ++ callstack=True): ++ """write the end section of a rendering function, either outermost or ++ inline. ++ ++ this takes into account if the rendering function was filtered, ++ buffered, etc. and closes the corresponding try: block if any, and ++ writes code to retrieve captured content, apply filters, send proper ++ return value.""" ++ ++ if not buffered and not cached and not filtered: ++ self.printer.writeline("return ''") ++ if callstack: ++ self.printer.writelines( ++ "finally:", ++ "context.caller_stack._pop_frame()", ++ None ++ ) ++ ++ if buffered or filtered or cached: ++ if buffered or cached: ++ # in a caching scenario, don't try to get a writer ++ # from the context after popping; assume the caching ++ # implemenation might be using a context with no ++ # extra buffers ++ self.printer.writelines( ++ "finally:", ++ "__M_buf = context._pop_buffer()" ++ ) ++ else: ++ self.printer.writelines( ++ "finally:", ++ "__M_buf, __M_writer = context._pop_buffer_and_writer()" ++ ) ++ ++ if callstack: ++ self.printer.writeline("context.caller_stack._pop_frame()") ++ ++ s = "__M_buf.getvalue()" ++ if filtered: ++ s = self.create_filter_callable(node.filter_args.args, s, ++ False) ++ self.printer.writeline(None) ++ if buffered and not cached: ++ s = self.create_filter_callable(self.compiler.buffer_filters, ++ s, False) ++ if buffered or cached: ++ self.printer.writeline("return %s" % s) ++ else: ++ self.printer.writelines( ++ "__M_writer(%s)" % s, ++ "return ''" ++ ) ++ ++ def write_cache_decorator(self, node_or_pagetag, name, ++ args, buffered, identifiers, ++ inline=False, toplevel=False): ++ """write a post-function decorator to replace a rendering ++ callable with a cached version of itself.""" ++ ++ self.printer.writeline("__M_%s = %s" % (name, name)) ++ cachekey = node_or_pagetag.parsed_attributes.get('cache_key', ++ repr(name)) ++ ++ cache_args = {} ++ if self.compiler.pagetag is not None: ++ cache_args.update( ++ ( ++ pa[6:], ++ self.compiler.pagetag.parsed_attributes[pa] ++ ) ++ for pa in self.compiler.pagetag.parsed_attributes ++ if pa.startswith('cache_') and pa != 'cache_key' ++ ) ++ cache_args.update( ++ ( ++ pa[6:], ++ node_or_pagetag.parsed_attributes[pa] ++ ) for pa in node_or_pagetag.parsed_attributes ++ if pa.startswith('cache_') and pa != 'cache_key' ++ ) ++ if 'timeout' in cache_args: ++ cache_args['timeout'] = int(eval(cache_args['timeout'])) ++ ++ self.printer.writeline("def %s(%s):" % (name, ','.join(args))) ++ ++ # form "arg1, arg2, arg3=arg3, arg4=arg4", etc. ++ pass_args = [ ++ "%s=%s" % ((a.split('=')[0],) * 2) if '=' in a else a ++ for a in args ++ ] ++ ++ self.write_variable_declares( ++ identifiers, ++ toplevel=toplevel, ++ limit=node_or_pagetag.undeclared_identifiers() ++ ) ++ if buffered: ++ s = "context.get('local')."\ ++ "cache._ctx_get_or_create("\ ++ "%s, lambda:__M_%s(%s), context, %s__M_defname=%r)" % ( ++ cachekey, name, ','.join(pass_args), ++ ''.join(["%s=%s, " % (k, v) ++ for k, v in cache_args.items()]), ++ name ++ ) ++ # apply buffer_filters ++ s = self.create_filter_callable(self.compiler.buffer_filters, s, ++ False) ++ self.printer.writelines("return " + s, None) ++ else: ++ self.printer.writelines( ++ "__M_writer(context.get('local')." 
++ "cache._ctx_get_or_create(" ++ "%s, lambda:__M_%s(%s), context, %s__M_defname=%r))" % ++ ( ++ cachekey, name, ','.join(pass_args), ++ ''.join(["%s=%s, " % (k, v) ++ for k, v in cache_args.items()]), ++ name, ++ ), ++ "return ''", ++ None ++ ) ++ ++ def create_filter_callable(self, args, target, is_expression): ++ """write a filter-applying expression based on the filters ++ present in the given filter names, adjusting for the global ++ 'default' filter aliases as needed.""" ++ ++ def locate_encode(name): ++ if re.match(r'decode\..+', name): ++ return "filters." + name ++ elif self.compiler.disable_unicode: ++ return filters.NON_UNICODE_ESCAPES.get(name, name) ++ else: ++ return filters.DEFAULT_ESCAPES.get(name, name) ++ ++ if 'n' not in args: ++ if is_expression: ++ if self.compiler.pagetag: ++ args = self.compiler.pagetag.filter_args.args + args ++ if self.compiler.default_filters: ++ args = self.compiler.default_filters + args ++ for e in args: ++ # if filter given as a function, get just the identifier portion ++ if e == 'n': ++ continue ++ m = re.match(r'(.+?)(\(.*\))', e) ++ if m: ++ ident, fargs = m.group(1, 2) ++ f = locate_encode(ident) ++ e = f + fargs ++ else: ++ e = locate_encode(e) ++ assert e is not None ++ target = "%s(%s)" % (e, target) ++ return target ++ ++ def visitExpression(self, node): ++ self.printer.start_source(node.lineno) ++ if len(node.escapes) or \ ++ ( ++ self.compiler.pagetag is not None and ++ len(self.compiler.pagetag.filter_args.args) ++ ) or \ ++ len(self.compiler.default_filters): ++ ++ s = self.create_filter_callable(node.escapes_code.args, ++ "%s" % node.text, True) ++ self.printer.writeline("__M_writer(%s)" % s) ++ else: ++ self.printer.writeline("__M_writer(%s)" % node.text) ++ ++ def visitControlLine(self, node): ++ if node.isend: ++ self.printer.writeline(None) ++ if node.has_loop_context: ++ self.printer.writeline('finally:') ++ self.printer.writeline("loop = __M_loop._exit()") ++ self.printer.writeline(None) ++ else: ++ self.printer.start_source(node.lineno) ++ if self.compiler.enable_loop and node.keyword == 'for': ++ text = mangle_mako_loop(node, self.printer) ++ else: ++ text = node.text ++ self.printer.writeline(text) ++ children = node.get_children() ++ # this covers the three situations where we want to insert a pass: ++ # 1) a ternary control line with no children, ++ # 2) a primary control line with nothing but its own ternary ++ # and end control lines, and ++ # 3) any control line with no content other than comments ++ if not children or ( ++ compat.all(isinstance(c, (parsetree.Comment, ++ parsetree.ControlLine)) ++ for c in children) and ++ compat.all((node.is_ternary(c.keyword) or c.isend) ++ for c in children ++ if isinstance(c, parsetree.ControlLine))): ++ self.printer.writeline("pass") ++ ++ def visitText(self, node): ++ self.printer.start_source(node.lineno) ++ self.printer.writeline("__M_writer(%s)" % repr(node.content)) ++ ++ def visitTextTag(self, node): ++ filtered = len(node.filter_args.args) > 0 ++ if filtered: ++ self.printer.writelines( ++ "__M_writer = context._push_writer()", ++ "try:", ++ ) ++ for n in node.nodes: ++ n.accept_visitor(self) ++ if filtered: ++ self.printer.writelines( ++ "finally:", ++ "__M_buf, __M_writer = context._pop_buffer_and_writer()", ++ "__M_writer(%s)" % ++ self.create_filter_callable( ++ node.filter_args.args, ++ "__M_buf.getvalue()", ++ False), ++ None ++ ) ++ ++ def visitCode(self, node): ++ if not node.ismodule: ++ self.printer.start_source(node.lineno) ++ 
self.printer.write_indented_block(node.text) ++ ++ if not self.in_def and len(self.identifiers.locally_assigned) > 0: ++ # if we are the "template" def, fudge locally ++ # declared/modified variables into the "__M_locals" dictionary, ++ # which is used for def calls within the same template, ++ # to simulate "enclosing scope" ++ self.printer.writeline( ++ '__M_locals_builtin_stored = __M_locals_builtin()') ++ self.printer.writeline( ++ '__M_locals.update(__M_dict_builtin([(__M_key,' ++ ' __M_locals_builtin_stored[__M_key]) for __M_key in' ++ ' [%s] if __M_key in __M_locals_builtin_stored]))' % ++ ','.join([repr(x) for x in node.declared_identifiers()])) ++ ++ def visitIncludeTag(self, node): ++ self.printer.start_source(node.lineno) ++ args = node.attributes.get('args') ++ if args: ++ self.printer.writeline( ++ "runtime._include_file(context, %s, _template_uri, %s)" % ++ (node.parsed_attributes['file'], args)) ++ else: ++ self.printer.writeline( ++ "runtime._include_file(context, %s, _template_uri)" % ++ (node.parsed_attributes['file'])) ++ ++ def visitNamespaceTag(self, node): ++ pass ++ ++ def visitDefTag(self, node): ++ pass ++ ++ def visitBlockTag(self, node): ++ if node.is_anonymous: ++ self.printer.writeline("%s()" % node.funcname) ++ else: ++ nameargs = node.get_argument_expressions(as_call=True) ++ nameargs += ['**pageargs'] ++ self.printer.writeline("if 'parent' not in context._data or " ++ "not hasattr(context._data['parent'], '%s'):" ++ % node.funcname) ++ self.printer.writeline( ++ "context['self'].%s(%s)" % (node.funcname, ",".join(nameargs))) ++ self.printer.writeline("\n") ++ ++ def visitCallNamespaceTag(self, node): ++ # TODO: we can put namespace-specific checks here, such ++ # as ensure the given namespace will be imported, ++ # pre-import the namespace, etc. 
++ self.visitCallTag(node) ++ ++ def visitCallTag(self, node): ++ self.printer.writeline("def ccall(caller):") ++ export = ['body'] ++ callable_identifiers = self.identifiers.branch(node, nested=True) ++ body_identifiers = callable_identifiers.branch(node, nested=False) ++ # we want the 'caller' passed to ccall to be used ++ # for the body() function, but for other non-body() ++ # <%def>s within <%call> we want the current caller ++ # off the call stack (if any) ++ body_identifiers.add_declared('caller') ++ ++ self.identifier_stack.append(body_identifiers) ++ class DefVisitor(object): ++ def visitDefTag(s, node): ++ s.visitDefOrBase(node) ++ ++ def visitBlockTag(s, node): ++ s.visitDefOrBase(node) ++ ++ def visitDefOrBase(s, node): ++ self.write_inline_def(node, callable_identifiers, nested=False) ++ if not node.is_anonymous: ++ export.append(node.funcname) ++ # remove defs that are within the <%call> from the ++ # "closuredefs" defined in the body, so they dont render twice ++ if node.funcname in body_identifiers.closuredefs: ++ del body_identifiers.closuredefs[node.funcname] ++ ++ vis = DefVisitor() ++ for n in node.nodes: ++ n.accept_visitor(vis) ++ self.identifier_stack.pop() ++ ++ bodyargs = node.body_decl.get_argument_expressions() ++ self.printer.writeline("def body(%s):" % ','.join(bodyargs)) ++ ++ # TODO: figure out best way to specify ++ # buffering/nonbuffering (at call time would be better) ++ buffered = False ++ if buffered: ++ self.printer.writelines( ++ "context._push_buffer()", ++ "try:" ++ ) ++ self.write_variable_declares(body_identifiers) ++ self.identifier_stack.append(body_identifiers) ++ ++ for n in node.nodes: ++ n.accept_visitor(self) ++ self.identifier_stack.pop() ++ ++ self.write_def_finish(node, buffered, False, False, callstack=False) ++ self.printer.writelines( ++ None, ++ "return [%s]" % (','.join(export)), ++ None ++ ) ++ ++ self.printer.writelines( ++ # push on caller for nested call ++ "context.caller_stack.nextcaller = " ++ "runtime.Namespace('caller', context, " ++ "callables=ccall(__M_caller))", ++ "try:") ++ self.printer.start_source(node.lineno) ++ self.printer.writelines( ++ "__M_writer(%s)" % self.create_filter_callable( ++ [], node.expression, True), ++ "finally:", ++ "context.caller_stack.nextcaller = None", ++ None ++ ) ++ ++class _Identifiers(object): ++ """tracks the status of identifier names as template code is rendered.""" ++ ++ def __init__(self, compiler, node=None, parent=None, nested=False): ++ if parent is not None: ++ # if we are the branch created in write_namespaces(), ++ # we don't share any context from the main body(). ++ if isinstance(node, parsetree.NamespaceTag): ++ self.declared = set() ++ self.topleveldefs = util.SetLikeDict() ++ else: ++ # things that have already been declared ++ # in an enclosing namespace (i.e. names we can just use) ++ self.declared = set(parent.declared).\ ++ union([c.name for c in parent.closuredefs.values()]).\ ++ union(parent.locally_declared).\ ++ union(parent.argument_declared) ++ ++ # if these identifiers correspond to a "nested" ++ # scope, it means whatever the parent identifiers ++ # had as undeclared will have been declared by that parent, ++ # and therefore we have them in our scope. 
++ if nested: ++ self.declared = self.declared.union(parent.undeclared) ++ ++ # top level defs that are available ++ self.topleveldefs = util.SetLikeDict(**parent.topleveldefs) ++ else: ++ self.declared = set() ++ self.topleveldefs = util.SetLikeDict() ++ ++ self.compiler = compiler ++ ++ # things within this level that are referenced before they ++ # are declared (e.g. assigned to) ++ self.undeclared = set() ++ ++ # things that are declared locally. some of these things ++ # could be in the "undeclared" list as well if they are ++ # referenced before declared ++ self.locally_declared = set() ++ ++ # assignments made in explicit python blocks. ++ # these will be propagated to ++ # the context of local def calls. ++ self.locally_assigned = set() ++ ++ # things that are declared in the argument ++ # signature of the def callable ++ self.argument_declared = set() ++ ++ # closure defs that are defined in this level ++ self.closuredefs = util.SetLikeDict() ++ ++ self.node = node ++ ++ if node is not None: ++ node.accept_visitor(self) ++ ++ illegal_names = self.compiler.reserved_names.intersection( ++ self.locally_declared) ++ if illegal_names: ++ raise exceptions.NameConflictError( ++ "Reserved words declared in template: %s" % ++ ", ".join(illegal_names)) ++ ++ ++ def branch(self, node, **kwargs): ++ """create a new Identifiers for a new Node, with ++ this Identifiers as the parent.""" ++ ++ return _Identifiers(self.compiler, node, self, **kwargs) ++ ++ @property ++ def defs(self): ++ return set(self.topleveldefs.union(self.closuredefs).values()) ++ ++ def __repr__(self): ++ return "Identifiers(declared=%r, locally_declared=%r, "\ ++ "undeclared=%r, topleveldefs=%r, closuredefs=%r, "\ ++ "argumentdeclared=%r)" %\ ++ ( ++ list(self.declared), ++ list(self.locally_declared), ++ list(self.undeclared), ++ [c.name for c in self.topleveldefs.values()], ++ [c.name for c in self.closuredefs.values()], ++ self.argument_declared) ++ ++ def check_declared(self, node): ++ """update the state of this Identifiers with the undeclared ++ and declared identifiers of the given node.""" ++ ++ for ident in node.undeclared_identifiers(): ++ if ident != 'context' and\ ++ ident not in self.declared.union(self.locally_declared): ++ self.undeclared.add(ident) ++ for ident in node.declared_identifiers(): ++ self.locally_declared.add(ident) ++ ++ def add_declared(self, ident): ++ self.declared.add(ident) ++ if ident in self.undeclared: ++ self.undeclared.remove(ident) ++ ++ def visitExpression(self, node): ++ self.check_declared(node) ++ ++ def visitControlLine(self, node): ++ self.check_declared(node) ++ ++ def visitCode(self, node): ++ if not node.ismodule: ++ self.check_declared(node) ++ self.locally_assigned = self.locally_assigned.union( ++ node.declared_identifiers()) ++ ++ def visitNamespaceTag(self, node): ++ # only traverse into the sub-elements of a ++ # <%namespace> tag if we are the branch created in ++ # write_namespaces() ++ if self.node is node: ++ for n in node.nodes: ++ n.accept_visitor(self) ++ ++ def _check_name_exists(self, collection, node): ++ existing = collection.get(node.funcname) ++ collection[node.funcname] = node ++ if existing is not None and \ ++ existing is not node and \ ++ (node.is_block or existing.is_block): ++ raise exceptions.CompileException( ++ "%%def or %%block named '%s' already " ++ "exists in this template." 
% ++ node.funcname, **node.exception_kwargs) ++ ++ def visitDefTag(self, node): ++ if node.is_root() and not node.is_anonymous: ++ self._check_name_exists(self.topleveldefs, node) ++ elif node is not self.node: ++ self._check_name_exists(self.closuredefs, node) ++ ++ for ident in node.undeclared_identifiers(): ++ if ident != 'context' and \ ++ ident not in self.declared.union(self.locally_declared): ++ self.undeclared.add(ident) ++ ++ # visit defs only one level deep ++ if node is self.node: ++ for ident in node.declared_identifiers(): ++ self.argument_declared.add(ident) ++ ++ for n in node.nodes: ++ n.accept_visitor(self) ++ ++ def visitBlockTag(self, node): ++ if node is not self.node and not node.is_anonymous: ++ ++ if isinstance(self.node, parsetree.DefTag): ++ raise exceptions.CompileException( ++ "Named block '%s' not allowed inside of def '%s'" ++ % (node.name, self.node.name), **node.exception_kwargs) ++ elif isinstance(self.node, ++ (parsetree.CallTag, parsetree.CallNamespaceTag)): ++ raise exceptions.CompileException( ++ "Named block '%s' not allowed inside of <%%call> tag" ++ % (node.name, ), **node.exception_kwargs) ++ ++ for ident in node.undeclared_identifiers(): ++ if ident != 'context' and \ ++ ident not in self.declared.union(self.locally_declared): ++ self.undeclared.add(ident) ++ ++ if not node.is_anonymous: ++ self._check_name_exists(self.topleveldefs, node) ++ self.undeclared.add(node.funcname) ++ elif node is not self.node: ++ self._check_name_exists(self.closuredefs, node) ++ for ident in node.declared_identifiers(): ++ self.argument_declared.add(ident) ++ for n in node.nodes: ++ n.accept_visitor(self) ++ ++ def visitTextTag(self, node): ++ for ident in node.undeclared_identifiers(): ++ if ident != 'context' and \ ++ ident not in self.declared.union(self.locally_declared): ++ self.undeclared.add(ident) ++ ++ def visitIncludeTag(self, node): ++ self.check_declared(node) ++ ++ def visitPageTag(self, node): ++ for ident in node.declared_identifiers(): ++ self.argument_declared.add(ident) ++ self.check_declared(node) ++ ++ def visitCallNamespaceTag(self, node): ++ self.visitCallTag(node) ++ ++ def visitCallTag(self, node): ++ if node is self.node: ++ for ident in node.undeclared_identifiers(): ++ if ident != 'context' and \ ++ ident not in self.declared.union( ++ self.locally_declared): ++ self.undeclared.add(ident) ++ for ident in node.declared_identifiers(): ++ self.argument_declared.add(ident) ++ for n in node.nodes: ++ n.accept_visitor(self) ++ else: ++ for ident in node.undeclared_identifiers(): ++ if ident != 'context' and \ ++ ident not in self.declared.union( ++ self.locally_declared): ++ self.undeclared.add(ident) ++ ++ ++_FOR_LOOP = re.compile( ++ r'^for\s+((?:\(?)\s*[A-Za-z_][A-Za-z_0-9]*' ++ r'(?:\s*,\s*(?:[A-Za-z_][A-Za-z0-9_]*),??)*\s*(?:\)?))\s+in\s+(.*):' ++) ++ ++def mangle_mako_loop(node, printer): ++ """converts a for loop into a context manager wrapped around a for loop ++ when access to the `loop` variable has been detected in the for loop body ++ """ ++ loop_variable = LoopVariable() ++ node.accept_visitor(loop_variable) ++ if loop_variable.detected: ++ node.nodes[-1].has_loop_context = True ++ match = _FOR_LOOP.match(node.text) ++ if match: ++ printer.writelines( ++ 'loop = __M_loop._enter(%s)' % match.group(2), ++ 'try:' ++ #'with __M_loop(%s) as loop:' % match.group(2) ++ ) ++ text = 'for %s in loop:' % match.group(1) ++ else: ++ raise SyntaxError("Couldn't apply loop context: %s" % node.text) ++ else: ++ text = node.text ++ return text ++ ++ 
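mangle_mako_loop() rewrites only the opening "for" line; the matching "finally: loop = __M_loop._exit()" is emitted by visitControlLine() when the corresponding "% endfor" is reached, and the LoopVariable visitor just below decides whether the rewrite is needed at all. A short demonstration of the combined effect, assuming only an installed mako package (enable_loop defaults to True); the template content is illustrative:

    from mako.template import Template

    # The body references 'loop', so LoopVariable detects it and the
    # 'for' line gets the __M_loop._enter()/_exit() wrapping.
    t = Template("""\
    % for item in ('a', 'b', 'c'):
    ${loop.index}: ${item}
    % endfor
    """)

    # Prints "0: a", "1: b", "2: c", one per line.
    print(t.render())

    # The generated code is roughly:
    #     loop = __M_loop._enter(('a', 'b', 'c'))
    #     try:
    #         for item in loop:
    #             ...
    #     finally:
    #         loop = __M_loop._exit()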
++class LoopVariable(object): ++ """A node visitor which looks for the name 'loop' within undeclared ++ identifiers.""" ++ ++ def __init__(self): ++ self.detected = False ++ ++ def _loop_reference_detected(self, node): ++ if 'loop' in node.undeclared_identifiers(): ++ self.detected = True ++ else: ++ for n in node.get_children(): ++ n.accept_visitor(self) ++ ++ def visitControlLine(self, node): ++ self._loop_reference_detected(node) ++ ++ def visitCode(self, node): ++ self._loop_reference_detected(node) ++ ++ def visitExpression(self, node): ++ self._loop_reference_detected(node) +diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/compat.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/compat.py +new file mode 100644 +index 0000000..fe277bb +--- /dev/null ++++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/compat.py +@@ -0,0 +1,174 @@ ++import sys ++import time ++ ++py3k = sys.version_info >= (3, 0) ++py33 = sys.version_info >= (3, 3) ++py2k = sys.version_info < (3,) ++py26 = sys.version_info >= (2, 6) ++jython = sys.platform.startswith('java') ++win32 = sys.platform.startswith('win') ++pypy = hasattr(sys, 'pypy_version_info') ++ ++if py3k: ++ from io import StringIO ++ import builtins as compat_builtins ++ from urllib.parse import quote_plus, unquote_plus ++ from html.entities import codepoint2name, name2codepoint ++ string_types = str, ++ binary_type = bytes ++ text_type = str ++ ++ from io import BytesIO as byte_buffer ++ ++ def u(s): ++ return s ++ ++ def b(s): ++ return s.encode("latin-1") ++ ++ def octal(lit): ++ return eval("0o" + lit) ++ ++else: ++ import __builtin__ as compat_builtins ++ try: ++ from cStringIO import StringIO ++ except: ++ from StringIO import StringIO ++ ++ byte_buffer = StringIO ++ ++ from urllib import quote_plus, unquote_plus ++ from htmlentitydefs import codepoint2name, name2codepoint ++ string_types = basestring, ++ binary_type = str ++ text_type = unicode ++ ++ def u(s): ++ return unicode(s, "utf-8") ++ ++ def b(s): ++ return s ++ ++ def octal(lit): ++ return eval("0" + lit) ++ ++ ++if py33: ++ from importlib import machinery ++ def load_module(module_id, path): ++ return machinery.SourceFileLoader(module_id, path).load_module() ++else: ++ import imp ++ def load_module(module_id, path): ++ fp = open(path, 'rb') ++ try: ++ return imp.load_source(module_id, path, fp) ++ finally: ++ fp.close() ++ ++ ++if py3k: ++ def reraise(tp, value, tb=None, cause=None): ++ if cause is not None: ++ value.__cause__ = cause ++ if value.__traceback__ is not tb: ++ raise value.with_traceback(tb) ++ raise value ++else: ++ exec("def reraise(tp, value, tb=None, cause=None):\n" ++ " raise tp, value, tb\n") ++ ++ ++def exception_as(): ++ return sys.exc_info()[1] ++ ++try: ++ import threading ++ if py3k: ++ import _thread as thread ++ else: ++ import thread ++except ImportError: ++ import dummy_threading as threading ++ if py3k: ++ import _dummy_thread as thread ++ else: ++ import dummy_thread as thread ++ ++if win32 or jython: ++ time_func = time.clock ++else: ++ time_func = time.time ++ ++try: ++ from functools import partial ++except: ++ def partial(func, *args, **keywords): ++ def newfunc(*fargs, **fkeywords): ++ newkeywords = keywords.copy() ++ newkeywords.update(fkeywords) ++ return func(*(args + fargs), **newkeywords) ++ return newfunc ++ ++ ++all = all ++import json ++ ++def exception_name(exc): ++ return exc.__class__.__name__ ++ ++try: ++ from inspect import CO_VARKEYWORDS, CO_VARARGS ++ def inspect_func_args(fn): ++ if py3k: ++ co = fn.__code__ ++ else: ++ 
co = fn.func_code ++ ++ nargs = co.co_argcount ++ names = co.co_varnames ++ args = list(names[:nargs]) ++ ++ varargs = None ++ if co.co_flags & CO_VARARGS: ++ varargs = co.co_varnames[nargs] ++ nargs = nargs + 1 ++ varkw = None ++ if co.co_flags & CO_VARKEYWORDS: ++ varkw = co.co_varnames[nargs] ++ ++ if py3k: ++ return args, varargs, varkw, fn.__defaults__ ++ else: ++ return args, varargs, varkw, fn.func_defaults ++except ImportError: ++ import inspect ++ def inspect_func_args(fn): ++ return inspect.getargspec(fn) ++ ++if py3k: ++ def callable(fn): ++ return hasattr(fn, '__call__') ++else: ++ callable = callable ++ ++ ++################################################ ++# cross-compatible metaclass implementation ++# Copyright (c) 2010-2012 Benjamin Peterson ++def with_metaclass(meta, base=object): ++ """Create a base class with a metaclass.""" ++ return meta("%sBase" % meta.__name__, (base,), {}) ++################################################ ++ ++ ++def arg_stringname(func_arg): ++ """Gets the string name of a kwarg or vararg ++ In Python3.4 a function's args are ++ of _ast.arg type not _ast.name ++ """ ++ if hasattr(func_arg, 'arg'): ++ return func_arg.arg ++ else: ++ return str(func_arg) +diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/exceptions.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/exceptions.py +new file mode 100644 +index 0000000..c531f21 +--- /dev/null ++++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/exceptions.py +@@ -0,0 +1,373 @@ ++# mako/exceptions.py ++# Copyright (C) 2006-2015 the Mako authors and contributors ++# ++# This module is part of Mako and is released under ++# the MIT License: http://www.opensource.org/licenses/mit-license.php ++ ++"""exception classes""" ++ ++import traceback ++import sys ++from mako import util, compat ++ ++class MakoException(Exception): ++ pass ++ ++class RuntimeException(MakoException): ++ pass ++ ++def _format_filepos(lineno, pos, filename): ++ if filename is None: ++ return " at line: %d char: %d" % (lineno, pos) ++ else: ++ return " in file '%s' at line: %d char: %d" % (filename, lineno, pos) ++ ++ ++class CompileException(MakoException): ++ def __init__(self, message, source, lineno, pos, filename): ++ MakoException.__init__(self, ++ message + _format_filepos(lineno, pos, filename)) ++ self.lineno = lineno ++ self.pos = pos ++ self.filename = filename ++ self.source = source ++ ++class SyntaxException(MakoException): ++ def __init__(self, message, source, lineno, pos, filename): ++ MakoException.__init__(self, ++ message + _format_filepos(lineno, pos, filename)) ++ self.lineno = lineno ++ self.pos = pos ++ self.filename = filename ++ self.source = source ++ ++class UnsupportedError(MakoException): ++ """raised when a retired feature is used.""" ++ ++class NameConflictError(MakoException): ++ """raised when a reserved word is used inappropriately""" ++ ++class TemplateLookupException(MakoException): ++ pass ++ ++class TopLevelLookupException(TemplateLookupException): ++ pass ++ ++class RichTraceback(object): ++ """Pull the current exception from the ``sys`` traceback and extracts ++ Mako-specific template information. ++ ++ See the usage examples in :ref:`handling_exceptions`. 
++ ++ """ ++ def __init__(self, error=None, traceback=None): ++ self.source, self.lineno = "", 0 ++ ++ if error is None or traceback is None: ++ t, value, tback = sys.exc_info() ++ ++ if error is None: ++ error = value or t ++ ++ if traceback is None: ++ traceback = tback ++ ++ self.error = error ++ self.records = self._init(traceback) ++ ++ if isinstance(self.error, (CompileException, SyntaxException)): ++ self.source = self.error.source ++ self.lineno = self.error.lineno ++ self._has_source = True ++ ++ self._init_message() ++ ++ @property ++ def errorname(self): ++ return compat.exception_name(self.error) ++ ++ def _init_message(self): ++ """Find a unicode representation of self.error""" ++ try: ++ self.message = compat.text_type(self.error) ++ except UnicodeError: ++ try: ++ self.message = str(self.error) ++ except UnicodeEncodeError: ++ # Fallback to args as neither unicode nor ++ # str(Exception(u'\xe6')) work in Python < 2.6 ++ self.message = self.error.args[0] ++ if not isinstance(self.message, compat.text_type): ++ self.message = compat.text_type(self.message, 'ascii', 'replace') ++ ++ def _get_reformatted_records(self, records): ++ for rec in records: ++ if rec[6] is not None: ++ yield (rec[4], rec[5], rec[2], rec[6]) ++ else: ++ yield tuple(rec[0:4]) ++ ++ @property ++ def traceback(self): ++ """Return a list of 4-tuple traceback records (i.e. normal python ++ format) with template-corresponding lines remapped to the originating ++ template. ++ ++ """ ++ return list(self._get_reformatted_records(self.records)) ++ ++ @property ++ def reverse_records(self): ++ return reversed(self.records) ++ ++ @property ++ def reverse_traceback(self): ++ """Return the same data as traceback, except in reverse order. ++ """ ++ ++ return list(self._get_reformatted_records(self.reverse_records)) ++ ++ def _init(self, trcback): ++ """format a traceback from sys.exc_info() into 7-item tuples, ++ containing the regular four traceback tuple items, plus the original ++ template filename, the line number adjusted relative to the template ++ source, and code line from that line number of the template.""" ++ ++ import mako.template ++ mods = {} ++ rawrecords = traceback.extract_tb(trcback) ++ new_trcback = [] ++ for filename, lineno, function, line in rawrecords: ++ if not line: ++ line = '' ++ try: ++ (line_map, template_lines) = mods[filename] ++ except KeyError: ++ try: ++ info = mako.template._get_module_info(filename) ++ module_source = info.code ++ template_source = info.source ++ template_filename = info.template_filename or filename ++ except KeyError: ++ # A normal .py file (not a Template) ++ if not compat.py3k: ++ try: ++ fp = open(filename, 'rb') ++ encoding = util.parse_encoding(fp) ++ fp.close() ++ except IOError: ++ encoding = None ++ if encoding: ++ line = line.decode(encoding) ++ else: ++ line = line.decode('ascii', 'replace') ++ new_trcback.append((filename, lineno, function, line, ++ None, None, None, None)) ++ continue ++ ++ template_ln = 1 ++ ++ source_map = mako.template.ModuleInfo.\ ++ get_module_source_metadata( ++ module_source, full_line_map=True) ++ line_map = source_map['full_line_map'] ++ ++ template_lines = [line for line in ++ template_source.split("\n")] ++ mods[filename] = (line_map, template_lines) ++ ++ template_ln = line_map[lineno - 1] ++ ++ if template_ln <= len(template_lines): ++ template_line = template_lines[template_ln - 1] ++ else: ++ template_line = None ++ new_trcback.append((filename, lineno, function, ++ line, template_filename, template_ln, ++ template_line, 
template_source)) ++ if not self.source: ++ for l in range(len(new_trcback) - 1, 0, -1): ++ if new_trcback[l][5]: ++ self.source = new_trcback[l][7] ++ self.lineno = new_trcback[l][5] ++ break ++ else: ++ if new_trcback: ++ try: ++ # A normal .py file (not a Template) ++ fp = open(new_trcback[-1][0], 'rb') ++ encoding = util.parse_encoding(fp) ++ fp.seek(0) ++ self.source = fp.read() ++ fp.close() ++ if encoding: ++ self.source = self.source.decode(encoding) ++ except IOError: ++ self.source = '' ++ self.lineno = new_trcback[-1][1] ++ return new_trcback ++ ++ ++def text_error_template(lookup=None): ++ """Provides a template that renders a stack trace in a similar format to ++ the Python interpreter, substituting source template filenames, line ++ numbers and code for that of the originating source template, as ++ applicable. ++ ++ """ ++ import mako.template ++ return mako.template.Template(r""" ++<%page args="error=None, traceback=None"/> ++<%! ++ from mako.exceptions import RichTraceback ++%>\ ++<% ++ tback = RichTraceback(error=error, traceback=traceback) ++%>\ ++Traceback (most recent call last): ++% for (filename, lineno, function, line) in tback.traceback: ++ File "${filename}", line ${lineno}, in ${function or '?'} ++ ${line | trim} ++% endfor ++${tback.errorname}: ${tback.message} ++""") ++ ++ ++def _install_pygments(): ++ global syntax_highlight, pygments_html_formatter ++ from mako.ext.pygmentplugin import syntax_highlight,\ ++ pygments_html_formatter ++ ++def _install_fallback(): ++ global syntax_highlight, pygments_html_formatter ++ from mako.filters import html_escape ++ pygments_html_formatter = None ++ def syntax_highlight(filename='', language=None): ++ return html_escape ++ ++def _install_highlighting(): ++ try: ++ _install_pygments() ++ except ImportError: ++ _install_fallback() ++_install_highlighting() ++ ++def html_error_template(): ++ """Provides a template that renders a stack trace in an HTML format, ++ providing an excerpt of code as well as substituting source template ++ filenames, line numbers and code for that of the originating source ++ template, as applicable. ++ ++ The template's default ``encoding_errors`` value is ++ ``'htmlentityreplace'``. The template has two options. With the ++ ``full`` option disabled, only a section of an HTML document is ++ returned. With the ``css`` option disabled, the default stylesheet ++ won't be included. ++ ++ """ ++ import mako.template ++ return mako.template.Template(r""" ++<%! ++ from mako.exceptions import RichTraceback, syntax_highlight,\ ++ pygments_html_formatter ++%> ++<%page args="full=True, css=True, error=None, traceback=None"/> ++% if full: ++ ++ ++ Mako Runtime Error ++% endif ++% if css: ++ ++% endif ++% if full: ++ ++ ++% endif ++ ++

++<h2>Error !</h2>
++<%
++    tback = RichTraceback(error=error, traceback=traceback)
++    src = tback.source
++    line = tback.lineno
++    if src:
++        lines = src.split('\n')
++    else:
++        lines = None
++%>
++<h3>${tback.errorname}: ${tback.message|h}</h3>
++
++% if lines:
++    <div class="sample">
++    <div class="nonhighlight">
++% for index in range(max(0, line-4),min(len(lines), line+5)):
++    <%
++       if pygments_html_formatter:
++           pygments_html_formatter.linenostart = index + 1
++    %>
++    % if index + 1 == line:
++    <%
++       if pygments_html_formatter:
++           old_cssclass = pygments_html_formatter.cssclass
++           pygments_html_formatter.cssclass = 'error ' + old_cssclass
++    %>
++        ${lines[index] | syntax_highlight(language='mako')}
++    <%
++       if pygments_html_formatter:
++           pygments_html_formatter.cssclass = old_cssclass
++    %>
++    % else:
++        ${lines[index] | syntax_highlight(language='mako')}
++    % endif
++% endfor
++    </div>
++    </div>
++% endif
++
++<div class="stacktrace">
++% for (filename, lineno, function, line) in tback.reverse_traceback:
++    <div class="location">${filename}, line ${lineno}:</div>
++    <div class="nonhighlight">
++    <%
++       if pygments_html_formatter:
++           pygments_html_formatter.linenostart = lineno
++    %>
++      <div class="sourceline">${line | syntax_highlight(filename)}</div>
++    </div>
++% endfor
++</div>
++ ++% if full: ++ ++ ++% endif ++""", output_encoding=sys.getdefaultencoding(), ++ encoding_errors='htmlentityreplace') +diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/filters.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/filters.py +new file mode 100644 +index 0000000..d79ce23 +--- /dev/null ++++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/filters.py +@@ -0,0 +1,201 @@ ++# mako/filters.py ++# Copyright (C) 2006-2015 the Mako authors and contributors ++# ++# This module is part of Mako and is released under ++# the MIT License: http://www.opensource.org/licenses/mit-license.php ++ ++ ++import re ++import codecs ++ ++from mako.compat import quote_plus, unquote_plus, codepoint2name, \ ++ name2codepoint ++ ++from mako import compat ++ ++xml_escapes = { ++ '&': '&', ++ '>': '>', ++ '<': '<', ++ '"': '"', # also " in html-only ++ "'": ''' # also ' in html-only ++} ++ ++# XXX: " is valid in HTML and XML ++# ' is not valid HTML, but is valid XML ++ ++def legacy_html_escape(s): ++ """legacy HTML escape for non-unicode mode.""" ++ s = s.replace("&", "&") ++ s = s.replace(">", ">") ++ s = s.replace("<", "<") ++ s = s.replace('"', """) ++ s = s.replace("'", "'") ++ return s ++ ++ ++try: ++ import markupsafe ++ html_escape = markupsafe.escape ++except ImportError: ++ html_escape = legacy_html_escape ++ ++def xml_escape(string): ++ return re.sub(r'([&<"\'>])', lambda m: xml_escapes[m.group()], string) ++ ++def url_escape(string): ++ # convert into a list of octets ++ string = string.encode("utf8") ++ return quote_plus(string) ++ ++def legacy_url_escape(string): ++ # convert into a list of octets ++ return quote_plus(string) ++ ++def url_unescape(string): ++ text = unquote_plus(string) ++ if not is_ascii_str(text): ++ text = text.decode("utf8") ++ return text ++ ++def trim(string): ++ return string.strip() ++ ++ ++class Decode(object): ++ def __getattr__(self, key): ++ def decode(x): ++ if isinstance(x, compat.text_type): ++ return x ++ elif not isinstance(x, compat.binary_type): ++ return decode(str(x)) ++ else: ++ return compat.text_type(x, encoding=key) ++ return decode ++decode = Decode() ++ ++ ++_ASCII_re = re.compile(r'\A[\x00-\x7f]*\Z') ++ ++def is_ascii_str(text): ++ return isinstance(text, str) and _ASCII_re.match(text) ++ ++################################################################ ++ ++class XMLEntityEscaper(object): ++ def __init__(self, codepoint2name, name2codepoint): ++ self.codepoint2entity = dict([(c, compat.text_type('&%s;' % n)) ++ for c, n in codepoint2name.items()]) ++ self.name2codepoint = name2codepoint ++ ++ def escape_entities(self, text): ++ """Replace characters with their character entity references. ++ ++ Only characters corresponding to a named entity are replaced. ++ """ ++ return compat.text_type(text).translate(self.codepoint2entity) ++ ++ def __escape(self, m): ++ codepoint = ord(m.group()) ++ try: ++ return self.codepoint2entity[codepoint] ++ except (KeyError, IndexError): ++ return '&#x%X;' % codepoint ++ ++ ++ __escapable = re.compile(r'["&<>]|[^\x00-\x7f]') ++ ++ def escape(self, text): ++ """Replace characters with their character references. ++ ++ Replace characters by their named entity references. ++ Non-ASCII characters, if they do not have a named entity reference, ++ are replaced by numerical character references. ++ ++ The return value is guaranteed to be ASCII. 
++ """ ++ return self.__escapable.sub(self.__escape, compat.text_type(text) ++ ).encode('ascii') ++ ++ # XXX: This regexp will not match all valid XML entity names__. ++ # (It punts on details involving involving CombiningChars and Extenders.) ++ # ++ # .. __: http://www.w3.org/TR/2000/REC-xml-20001006#NT-EntityRef ++ __characterrefs = re.compile(r'''& (?: ++ \#(\d+) ++ | \#x([\da-f]+) ++ | ( (?!\d) [:\w] [-.:\w]+ ) ++ ) ;''', ++ re.X | re.UNICODE) ++ ++ def __unescape(self, m): ++ dval, hval, name = m.groups() ++ if dval: ++ codepoint = int(dval) ++ elif hval: ++ codepoint = int(hval, 16) ++ else: ++ codepoint = self.name2codepoint.get(name, 0xfffd) ++ # U+FFFD = "REPLACEMENT CHARACTER" ++ if codepoint < 128: ++ return chr(codepoint) ++ return chr(codepoint) ++ ++ def unescape(self, text): ++ """Unescape character references. ++ ++ All character references (both entity references and numerical ++ character references) are unescaped. ++ """ ++ return self.__characterrefs.sub(self.__unescape, text) ++ ++ ++_html_entities_escaper = XMLEntityEscaper(codepoint2name, name2codepoint) ++ ++html_entities_escape = _html_entities_escaper.escape_entities ++html_entities_unescape = _html_entities_escaper.unescape ++ ++ ++def htmlentityreplace_errors(ex): ++ """An encoding error handler. ++ ++ This python `codecs`_ error handler replaces unencodable ++ characters with HTML entities, or, if no HTML entity exists for ++ the character, XML character references. ++ ++ >>> u'The cost was \u20ac12.'.encode('latin1', 'htmlentityreplace') ++ 'The cost was €12.' ++ """ ++ if isinstance(ex, UnicodeEncodeError): ++ # Handle encoding errors ++ bad_text = ex.object[ex.start:ex.end] ++ text = _html_entities_escaper.escape(bad_text) ++ return (compat.text_type(text), ex.end) ++ raise ex ++ ++codecs.register_error('htmlentityreplace', htmlentityreplace_errors) ++ ++ ++# TODO: options to make this dynamic per-compilation will be added in a later ++# release ++DEFAULT_ESCAPES = { ++ 'x': 'filters.xml_escape', ++ 'h': 'filters.html_escape', ++ 'u': 'filters.url_escape', ++ 'trim': 'filters.trim', ++ 'entity': 'filters.html_entities_escape', ++ 'unicode': 'unicode', ++ 'decode': 'decode', ++ 'str': 'str', ++ 'n': 'n' ++} ++ ++if compat.py3k: ++ DEFAULT_ESCAPES.update({ ++ 'unicode': 'str' ++ }) ++ ++NON_UNICODE_ESCAPES = DEFAULT_ESCAPES.copy() ++NON_UNICODE_ESCAPES['h'] = 'filters.legacy_html_escape' ++NON_UNICODE_ESCAPES['u'] = 'filters.legacy_url_escape' ++ +diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/lexer.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/lexer.py +new file mode 100644 +index 0000000..1dda398 +--- /dev/null ++++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/lexer.py +@@ -0,0 +1,441 @@ ++# mako/lexer.py ++# Copyright (C) 2006-2015 the Mako authors and contributors ++# ++# This module is part of Mako and is released under ++# the MIT License: http://www.opensource.org/licenses/mit-license.php ++ ++"""provides the Lexer class for parsing template strings into parse trees.""" ++ ++import re ++import codecs ++from mako import parsetree, exceptions, compat ++from mako.pygen import adjust_whitespace ++ ++_regexp_cache = {} ++ ++class Lexer(object): ++ def __init__(self, text, filename=None, ++ disable_unicode=False, ++ input_encoding=None, preprocessor=None): ++ self.text = text ++ self.filename = filename ++ self.template = parsetree.TemplateNode(self.filename) ++ self.matched_lineno = 1 ++ self.matched_charpos = 0 ++ self.lineno = 1 ++ self.match_position = 0 ++ self.tag = [] ++ 
self.control_line = [] ++ self.ternary_stack = [] ++ self.disable_unicode = disable_unicode ++ self.encoding = input_encoding ++ ++ if compat.py3k and disable_unicode: ++ raise exceptions.UnsupportedError( ++ "Mako for Python 3 does not " ++ "support disabling Unicode") ++ ++ if preprocessor is None: ++ self.preprocessor = [] ++ elif not hasattr(preprocessor, '__iter__'): ++ self.preprocessor = [preprocessor] ++ else: ++ self.preprocessor = preprocessor ++ ++ @property ++ def exception_kwargs(self): ++ return {'source': self.text, ++ 'lineno': self.matched_lineno, ++ 'pos': self.matched_charpos, ++ 'filename': self.filename} ++ ++ def match(self, regexp, flags=None): ++ """compile the given regexp, cache the reg, and call match_reg().""" ++ ++ try: ++ reg = _regexp_cache[(regexp, flags)] ++ except KeyError: ++ if flags: ++ reg = re.compile(regexp, flags) ++ else: ++ reg = re.compile(regexp) ++ _regexp_cache[(regexp, flags)] = reg ++ ++ return self.match_reg(reg) ++ ++ def match_reg(self, reg): ++ """match the given regular expression object to the current text ++ position. ++ ++ if a match occurs, update the current text and line position. ++ ++ """ ++ ++ mp = self.match_position ++ ++ match = reg.match(self.text, self.match_position) ++ if match: ++ (start, end) = match.span() ++ if end == start: ++ self.match_position = end + 1 ++ else: ++ self.match_position = end ++ self.matched_lineno = self.lineno ++ lines = re.findall(r"\n", self.text[mp:self.match_position]) ++ cp = mp - 1 ++ while (cp >= 0 and cp < self.textlength and self.text[cp] != '\n'): ++ cp -= 1 ++ self.matched_charpos = mp - cp ++ self.lineno += len(lines) ++ #print "MATCHED:", match.group(0), "LINE START:", ++ # self.matched_lineno, "LINE END:", self.lineno ++ #print "MATCH:", regexp, "\n", self.text[mp : mp + 15], \ ++ # (match and "TRUE" or "FALSE") ++ return match ++ ++ def parse_until_text(self, *text): ++ startpos = self.match_position ++ text_re = r'|'.join(text) ++ brace_level = 0 ++ while True: ++ match = self.match(r'#.*\n') ++ if match: ++ continue ++ match = self.match(r'(\"\"\"|\'\'\'|\"|\')((? 
<!\\)\\\1|.)*?\1', re.S)
++            if match:
++                continue
++            match = self.match(r'(%s)' % text_re)
++            if match:
++                if match.group(1) == '{':
++                    brace_level += 1
++                    continue
++                if match.group(1) == '}' and brace_level > 0:
++                    brace_level -= 1
++                    continue
++                return \
++                    self.text[startpos:
++                              self.match_position - len(match.group(1))],\
++                    match.group(1)
++            match = self.match(r"(.*?)(?=\"|\'|#|%s)" % text_re, re.S)
++            if match:
++                brace_level += match.group(1).count('{')
++                brace_level -= match.group(1).count('}')
++                continue
++            raise exceptions.SyntaxException(
++                "Expected: %s" %
++                ','.join(text),
++                **self.exception_kwargs)
++
++    def append_node(self, nodecls, *args, **kwargs):
++        kwargs.setdefault('source', self.text)
++        kwargs.setdefault('lineno', self.matched_lineno)
++        kwargs.setdefault('pos', self.matched_charpos)
++        kwargs['filename'] = self.filename
++        node = nodecls(*args, **kwargs)
++        if len(self.tag):
++            self.tag[-1].nodes.append(node)
++        else:
++            self.template.nodes.append(node)
++        # build a set of child nodes for the control line
++        # (used for loop variable detection)
++        # also build a set of child nodes on ternary control lines
++        # (used for determining if a pass needs to be auto-inserted)
++        if self.control_line:
++            control_frame = self.control_line[-1]
++            control_frame.nodes.append(node)
++            if not (isinstance(node, parsetree.ControlLine) and
++                    control_frame.is_ternary(node.keyword)):
++                if self.ternary_stack and self.ternary_stack[-1]:
++                    self.ternary_stack[-1][-1].nodes.append(node)
++        if isinstance(node, parsetree.Tag):
++            if len(self.tag):
++                node.parent = self.tag[-1]
++            self.tag.append(node)
++        elif isinstance(node, parsetree.ControlLine):
++            if node.isend:
++                self.control_line.pop()
++                self.ternary_stack.pop()
++            elif node.is_primary:
++                self.control_line.append(node)
++                self.ternary_stack.append([])
++            elif self.control_line and \
++                    self.control_line[-1].is_ternary(node.keyword):
++                self.ternary_stack[-1].append(node)
++            elif self.control_line and \
++                    not self.control_line[-1].is_ternary(node.keyword):
++                raise exceptions.SyntaxException(
++                    "Keyword '%s' not a legal ternary for keyword '%s'" %
++                    (node.keyword, self.control_line[-1].keyword),
++                    **self.exception_kwargs)
++
++    _coding_re = re.compile(r'#.*coding[:=]\s*([-\w.]+).*\r?\n')
++
++    def decode_raw_stream(self, text, decode_raw, known_encoding, filename):
++        """given string/unicode or bytes/string, determine encoding
++           from magic encoding comment, return body as unicode
++           or raw if decode_raw=False
++
++        """
++        if isinstance(text, compat.text_type):
++            m = self._coding_re.match(text)
++            encoding = m and m.group(1) or known_encoding or 'ascii'
++            return encoding, text
++
++        if text.startswith(codecs.BOM_UTF8):
++            text = text[len(codecs.BOM_UTF8):]
++            parsed_encoding = 'utf-8'
++            m = self._coding_re.match(text.decode('utf-8', 'ignore'))
++            if m is not None and m.group(1) != 'utf-8':
++                raise exceptions.CompileException(
++                    "Found utf-8 BOM in file, with conflicting "
++                    "magic encoding comment of '%s'" % m.group(1),
++                    text.decode('utf-8', 'ignore'),
++                    0, 0, filename)
++        else:
++            m = self._coding_re.match(text.decode('utf-8', 'ignore'))
++            if m:
++                parsed_encoding = m.group(1)
++            else:
++                parsed_encoding = known_encoding or 'ascii'
++
++        if decode_raw:
++            try:
++                text = text.decode(parsed_encoding)
++            except UnicodeDecodeError:
++                raise exceptions.CompileException(
++                    "Unicode decode operation of encoding '%s' failed" %
++                    parsed_encoding,
++                    text.decode('utf-8', 'ignore'),
++                    0, 0, filename)
++
++        return parsed_encoding, text
++
++    def parse(self):
++        self.encoding, self.text = self.decode_raw_stream(self.text,
++                                                          not self.disable_unicode,
++                                                          self.encoding,
++                                                          self.filename,)
++
++        for preproc in self.preprocessor:
++            self.text = preproc(self.text)
++
++        # push the match marker past the
++        # encoding comment.
++        self.match_reg(self._coding_re)
++
++        self.textlength = len(self.text)
++
++        while (True):
++            if self.match_position > self.textlength:
++                break
++
++            if self.match_end():
++                break
++            if self.match_expression():
++                continue
++            if self.match_control_line():
++                continue
++            if self.match_comment():
++                continue
++            if self.match_tag_start():
++                continue
++            if self.match_tag_end():
++                continue
++            if self.match_python_block():
++                continue
++            if self.match_text():
++                continue
++
++            if self.match_position > self.textlength:
++                break
++            raise exceptions.CompileException("assertion failed")
++
++        if len(self.tag):
++            raise exceptions.SyntaxException("Unclosed tag: <%%%s>" %
++                                             self.tag[-1].keyword,
++                                             **self.exception_kwargs)
++        if len(self.control_line):
++            raise exceptions.SyntaxException(
++                "Unterminated control keyword: '%s'" %
++                self.control_line[-1].keyword,
++                self.text,
++                self.control_line[-1].lineno,
++                self.control_line[-1].pos, self.filename)
++        return self.template
++
++    def match_tag_start(self):
++        match = self.match(r'''
++            \<%     # opening tag
++
++            ([\w\.\:]+)   # keyword
++
++            ((?:\s+\w+|\s*=\s*|".*?"|'.*?')*)  # attrname, = \
++                                               #        sign, string expression
++
++            \s*     # more whitespace
++
++            (/)?>   # closing
++
++            ''',
++
++            re.I | re.S | re.X)
++
++        if match:
++            keyword, attr, isend = match.groups()
++            self.keyword = keyword
++            attributes = {}
++            if attr:
++                for att in re.findall(
++                        r"\s*(\w+)\s*=\s*(?:'([^']*)'|\"([^\"]*)\")", attr):
++                    key, val1, val2 = att
++                    text = val1 or val2
++                    text = text.replace('\r\n', '\n')
++                    attributes[key] = text
++            self.append_node(parsetree.Tag, keyword, attributes)
++            if isend:
++                self.tag.pop()
++            else:
++                if keyword == 'text':
++                    match = self.match(r'(.*?)(?=\</%text>)', re.S)
++                    if not match:
++                        raise exceptions.SyntaxException(
++                            "Unclosed tag: <%%%s>" %
++                            self.tag[-1].keyword,
++                            **self.exception_kwargs)
++                    self.append_node(parsetree.Text, match.group(1))
++                    return self.match_tag_end()
++            return True
++        else:
++            return False
++
++    def match_tag_end(self):
++        match = self.match(r'\</%[\t ]*(.+?)[\t ]*>')
++        if match:
++            if not len(self.tag):
++                raise exceptions.SyntaxException(
++                    "Closing tag without opening tag: </%%%s>" %
++                    match.group(1),
++                    **self.exception_kwargs)
++            elif self.tag[-1].keyword != match.group(1):
++                raise exceptions.SyntaxException(
++                    "Closing tag </%%%s> does not match tag: <%%%s>" %
++                    (match.group(1), self.tag[-1].keyword),
++                    **self.exception_kwargs)
++            self.tag.pop()
++            return True
++        else:
++            return False
++
++    def match_end(self):
++        match = self.match(r'\Z', re.S)
++        if match:
++            string = match.group()
++            if string:
++                return string
++            else:
++                return True
++        else:
++            return False
++
++    def match_text(self):
++        match = self.match(r"""
++                (.*?)         # anything, followed by:
++                (
++                 (?<=\n)(?=[ \t]*(?=%|\#\#)) # an eval or line-based
++                                             # comment preceded by a
++                                             # consumed newline and whitespace
++                 |
++                 (?=\${)      # an expression
++                 |
++                 (?=</?[%&])  # a substitution or block or call start or end,
++                              # or a client-side comment
++                 |
++                 (\\\r?\n)    # an escaped newline  - throw away
++                 |
++                 \Z           # end of string
++                )""", re.X | re.S)
++
++        if match:
++            text = match.group(1)
++            if text:
++                self.append_node(parsetree.Text, text)
++            return True
++        else:
++            return False
++
++    def match_python_block(self):
++        match = self.match(r"<%(!)?")
++        if match:
++            line, pos = self.matched_lineno, self.matched_charpos
++            text, end = self.parse_until_text(r'%>')
++            # the trailing newline helps
++            # compiler.parse() not complain about indentation
++            text = adjust_whitespace(text) + "\n"
++            self.append_node(
++                parsetree.Code,
++                text,
++                match.group(1) == '!', lineno=line, pos=pos)
++            return True
++        else:
++            return False
++
++    def match_expression(self):
++        match = self.match(r"\${")
++        if match:
++            line, pos = self.matched_lineno, self.matched_charpos
++            text, end = self.parse_until_text(r'\|', r'}')
++            if end == '|':
++                escapes, end = self.parse_until_text(r'}')
++            else:
++                escapes = ""
++            text = text.replace('\r\n', '\n')
++            self.append_node(
++                parsetree.Expression,
++                text, escapes.strip(),
++                lineno=line, pos=pos)
++            return True
++        else:
++            return False
++
++    def match_control_line(self):
++        match = self.match(
++            r"(?<=^)[\t ]*(%(?!%)|##)[\t ]*((?:(?:\\r?\n)|[^\r\n])*)"
++            r"(?:\r?\n|\Z)", re.M)
++        if match:
++            operator = match.group(1)
++            text = match.group(2)
++            if operator == '%':
++                m2 = re.match(r'(end)?(\w+)\s*(.*)', text)
++                if not m2:
++                    raise exceptions.SyntaxException(
++                        "Invalid control line: '%s'" %
++                        text,
++                        **self.exception_kwargs)
++                isend, keyword = m2.group(1, 2)
++                isend = (isend is not None)
++
++                if isend:
++                    if not len(self.control_line):
++                        raise exceptions.SyntaxException(
++                            "No starting keyword '%s' for '%s'" %
++                            (keyword, text),
++                            **self.exception_kwargs)
++                    elif self.control_line[-1].keyword != keyword:
++                        raise exceptions.SyntaxException(
++                            "Keyword '%s' doesn't match keyword '%s'" %
++                            (text, self.control_line[-1].keyword),
++                            **self.exception_kwargs)
++                self.append_node(parsetree.ControlLine, keyword, isend, text)
++            else:
++                self.append_node(parsetree.Comment, text)
++            return True
++        else:
++            return False
++
++    def match_comment(self):
++        """matches the multiline version of a comment"""
++        match = self.match(r"<%doc>(.*?)</%doc>", re.S)
++        if match:
++            self.append_node(parsetree.Comment, match.group(1))
++            return True
++        else:
++            return False
++
+diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/lookup.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/lookup.py
+new file mode 100644
+index 0000000..2af5411
+--- /dev/null
++++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/lookup.py
+@@ -0,0 +1,359 @@
++# mako/lookup.py
++# Copyright (C) 2006-2015 the Mako authors and contributors
++#
++# This module is part of Mako and is released under
++# the MIT License: http://www.opensource.org/licenses/mit-license.php
++
++import os, stat, posixpath, re
++from mako import exceptions, util
++from mako.template import Template
++
++try:
++    import threading
++except:
++    import dummy_threading as threading
++
++class TemplateCollection(object):
++    """Represent a collection of :class:`.Template` objects,
++    identifiable via URI.
++
++    A :class:`.TemplateCollection` is linked to the usage of
++    all template tags that address other templates, such
++    as ``<%include>``, ``<%namespace>``, and ``<%inherit>``.
++    The ``file`` attribute of each of those tags refers
++    to a string URI that is passed to that :class:`.Template`
++    object's :class:`.TemplateCollection` for resolution.
++
++    :class:`.TemplateCollection` is an abstract class,
++    with the usual default implementation being :class:`.TemplateLookup`.
++ ++ """ ++ ++ def has_template(self, uri): ++ """Return ``True`` if this :class:`.TemplateLookup` is ++ capable of returning a :class:`.Template` object for the ++ given ``uri``. ++ ++ :param uri: String URI of the template to be resolved. ++ ++ """ ++ try: ++ self.get_template(uri) ++ return True ++ except exceptions.TemplateLookupException: ++ return False ++ ++ def get_template(self, uri, relativeto=None): ++ """Return a :class:`.Template` object corresponding to the given ++ ``uri``. ++ ++ The default implementation raises ++ :class:`.NotImplementedError`. Implementations should ++ raise :class:`.TemplateLookupException` if the given ``uri`` ++ cannot be resolved. ++ ++ :param uri: String URI of the template to be resolved. ++ :param relativeto: if present, the given ``uri`` is assumed to ++ be relative to this URI. ++ ++ """ ++ raise NotImplementedError() ++ ++ def filename_to_uri(self, uri, filename): ++ """Convert the given ``filename`` to a URI relative to ++ this :class:`.TemplateCollection`.""" ++ ++ return uri ++ ++ def adjust_uri(self, uri, filename): ++ """Adjust the given ``uri`` based on the calling ``filename``. ++ ++ When this method is called from the runtime, the ++ ``filename`` parameter is taken directly to the ``filename`` ++ attribute of the calling template. Therefore a custom ++ :class:`.TemplateCollection` subclass can place any string ++ identifier desired in the ``filename`` parameter of the ++ :class:`.Template` objects it constructs and have them come back ++ here. ++ ++ """ ++ return uri ++ ++class TemplateLookup(TemplateCollection): ++ """Represent a collection of templates that locates template source files ++ from the local filesystem. ++ ++ The primary argument is the ``directories`` argument, the list of ++ directories to search: ++ ++ .. sourcecode:: python ++ ++ lookup = TemplateLookup(["/path/to/templates"]) ++ some_template = lookup.get_template("/index.html") ++ ++ The :class:`.TemplateLookup` can also be given :class:`.Template` objects ++ programatically using :meth:`.put_string` or :meth:`.put_template`: ++ ++ .. sourcecode:: python ++ ++ lookup = TemplateLookup() ++ lookup.put_string("base.html", ''' ++ ${self.next()} ++ ''') ++ lookup.put_string("hello.html", ''' ++ <%include file='base.html'/> ++ ++ Hello, world ! ++ ''') ++ ++ ++ :param directories: A list of directory names which will be ++ searched for a particular template URI. The URI is appended ++ to each directory and the filesystem checked. ++ ++ :param collection_size: Approximate size of the collection used ++ to store templates. If left at its default of ``-1``, the size ++ is unbounded, and a plain Python dictionary is used to ++ relate URI strings to :class:`.Template` instances. ++ Otherwise, a least-recently-used cache object is used which ++ will maintain the size of the collection approximately to ++ the number given. ++ ++ :param filesystem_checks: When at its default value of ``True``, ++ each call to :meth:`.TemplateLookup.get_template()` will ++ compare the filesystem last modified time to the time in ++ which an existing :class:`.Template` object was created. ++ This allows the :class:`.TemplateLookup` to regenerate a ++ new :class:`.Template` whenever the original source has ++ been updated. Set this to ``False`` for a very minor ++ performance increase. ++ ++ :param modulename_callable: A callable which, when present, ++ is passed the path of the source file as well as the ++ requested URI, and then returns the full path of the ++ generated Python module file. 
This is used to inject ++ alternate schemes for Python module location. If left at ++ its default of ``None``, the built in system of generation ++ based on ``module_directory`` plus ``uri`` is used. ++ ++ All other keyword parameters available for ++ :class:`.Template` are mirrored here. When new ++ :class:`.Template` objects are created, the keywords ++ established with this :class:`.TemplateLookup` are passed on ++ to each new :class:`.Template`. ++ ++ """ ++ ++ def __init__(self, ++ directories=None, ++ module_directory=None, ++ filesystem_checks=True, ++ collection_size=-1, ++ format_exceptions=False, ++ error_handler=None, ++ disable_unicode=False, ++ bytestring_passthrough=False, ++ output_encoding=None, ++ encoding_errors='strict', ++ ++ cache_args=None, ++ cache_impl='beaker', ++ cache_enabled=True, ++ cache_type=None, ++ cache_dir=None, ++ cache_url=None, ++ ++ modulename_callable=None, ++ module_writer=None, ++ default_filters=None, ++ buffer_filters=(), ++ strict_undefined=False, ++ imports=None, ++ future_imports=None, ++ enable_loop=True, ++ input_encoding=None, ++ preprocessor=None, ++ lexer_cls=None): ++ ++ self.directories = [posixpath.normpath(d) for d in ++ util.to_list(directories, ()) ++ ] ++ self.module_directory = module_directory ++ self.modulename_callable = modulename_callable ++ self.filesystem_checks = filesystem_checks ++ self.collection_size = collection_size ++ ++ if cache_args is None: ++ cache_args = {} ++ # transfer deprecated cache_* args ++ if cache_dir: ++ cache_args.setdefault('dir', cache_dir) ++ if cache_url: ++ cache_args.setdefault('url', cache_url) ++ if cache_type: ++ cache_args.setdefault('type', cache_type) ++ ++ self.template_args = { ++ 'format_exceptions':format_exceptions, ++ 'error_handler':error_handler, ++ 'disable_unicode':disable_unicode, ++ 'bytestring_passthrough':bytestring_passthrough, ++ 'output_encoding':output_encoding, ++ 'cache_impl':cache_impl, ++ 'encoding_errors':encoding_errors, ++ 'input_encoding':input_encoding, ++ 'module_directory':module_directory, ++ 'module_writer':module_writer, ++ 'cache_args':cache_args, ++ 'cache_enabled':cache_enabled, ++ 'default_filters':default_filters, ++ 'buffer_filters':buffer_filters, ++ 'strict_undefined':strict_undefined, ++ 'imports':imports, ++ 'future_imports':future_imports, ++ 'enable_loop':enable_loop, ++ 'preprocessor':preprocessor, ++ 'lexer_cls':lexer_cls ++ } ++ ++ if collection_size == -1: ++ self._collection = {} ++ self._uri_cache = {} ++ else: ++ self._collection = util.LRUCache(collection_size) ++ self._uri_cache = util.LRUCache(collection_size) ++ self._mutex = threading.Lock() ++ ++ def get_template(self, uri): ++ """Return a :class:`.Template` object corresponding to the given ++ ``uri``. ++ ++ .. note:: The ``relativeto`` argument is not supported here at the moment. 
++ ++ """ ++ ++ try: ++ if self.filesystem_checks: ++ return self._check(uri, self._collection[uri]) ++ else: ++ return self._collection[uri] ++ except KeyError: ++ u = re.sub(r'^\/+', '', uri) ++ for dir in self.directories: ++ srcfile = posixpath.normpath(posixpath.join(dir, u)) ++ if os.path.isfile(srcfile): ++ return self._load(srcfile, uri) ++ else: ++ raise exceptions.TopLevelLookupException( ++ "Cant locate template for uri %r" % uri) ++ ++ def adjust_uri(self, uri, relativeto): ++ """Adjust the given ``uri`` based on the given relative URI.""" ++ ++ key = (uri, relativeto) ++ if key in self._uri_cache: ++ return self._uri_cache[key] ++ ++ if uri[0] != '/': ++ if relativeto is not None: ++ v = self._uri_cache[key] = posixpath.join( ++ posixpath.dirname(relativeto), uri) ++ else: ++ v = self._uri_cache[key] = '/' + uri ++ else: ++ v = self._uri_cache[key] = uri ++ return v ++ ++ ++ def filename_to_uri(self, filename): ++ """Convert the given ``filename`` to a URI relative to ++ this :class:`.TemplateCollection`.""" ++ ++ try: ++ return self._uri_cache[filename] ++ except KeyError: ++ value = self._relativeize(filename) ++ self._uri_cache[filename] = value ++ return value ++ ++ def _relativeize(self, filename): ++ """Return the portion of a filename that is 'relative' ++ to the directories in this lookup. ++ ++ """ ++ ++ filename = posixpath.normpath(filename) ++ for dir in self.directories: ++ if filename[0:len(dir)] == dir: ++ return filename[len(dir):] ++ else: ++ return None ++ ++ def _load(self, filename, uri): ++ self._mutex.acquire() ++ try: ++ try: ++ # try returning from collection one ++ # more time in case concurrent thread already loaded ++ return self._collection[uri] ++ except KeyError: ++ pass ++ try: ++ if self.modulename_callable is not None: ++ module_filename = self.modulename_callable(filename, uri) ++ else: ++ module_filename = None ++ self._collection[uri] = template = Template( ++ uri=uri, ++ filename=posixpath.normpath(filename), ++ lookup=self, ++ module_filename=module_filename, ++ **self.template_args) ++ return template ++ except: ++ # if compilation fails etc, ensure ++ # template is removed from collection, ++ # re-raise ++ self._collection.pop(uri, None) ++ raise ++ finally: ++ self._mutex.release() ++ ++ def _check(self, uri, template): ++ if template.filename is None: ++ return template ++ ++ try: ++ template_stat = os.stat(template.filename) ++ if template.module._modified_time < \ ++ template_stat[stat.ST_MTIME]: ++ self._collection.pop(uri, None) ++ return self._load(template.filename, uri) ++ else: ++ return template ++ except OSError: ++ self._collection.pop(uri, None) ++ raise exceptions.TemplateLookupException( ++ "Cant locate template for uri %r" % uri) ++ ++ ++ def put_string(self, uri, text): ++ """Place a new :class:`.Template` object into this ++ :class:`.TemplateLookup`, based on the given string of ++ ``text``. ++ ++ """ ++ self._collection[uri] = Template( ++ text, ++ lookup=self, ++ uri=uri, ++ **self.template_args) ++ ++ def put_template(self, uri, template): ++ """Place a new :class:`.Template` object into this ++ :class:`.TemplateLookup`, based on the given ++ :class:`.Template` object. 
++ ++ """ ++ self._collection[uri] = template ++ +diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/parsetree.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/parsetree.py +new file mode 100644 +index 0000000..49ec4e0 +--- /dev/null ++++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/parsetree.py +@@ -0,0 +1,594 @@ ++# mako/parsetree.py ++# Copyright (C) 2006-2015 the Mako authors and contributors ++# ++# This module is part of Mako and is released under ++# the MIT License: http://www.opensource.org/licenses/mit-license.php ++ ++"""defines the parse tree components for Mako templates.""" ++ ++from mako import exceptions, ast, util, filters, compat ++import re ++ ++class Node(object): ++ """base class for a Node in the parse tree.""" ++ ++ def __init__(self, source, lineno, pos, filename): ++ self.source = source ++ self.lineno = lineno ++ self.pos = pos ++ self.filename = filename ++ ++ @property ++ def exception_kwargs(self): ++ return {'source': self.source, 'lineno': self.lineno, ++ 'pos': self.pos, 'filename': self.filename} ++ ++ def get_children(self): ++ return [] ++ ++ def accept_visitor(self, visitor): ++ def traverse(node): ++ for n in node.get_children(): ++ n.accept_visitor(visitor) ++ ++ method = getattr(visitor, "visit" + self.__class__.__name__, traverse) ++ method(self) ++ ++class TemplateNode(Node): ++ """a 'container' node that stores the overall collection of nodes.""" ++ ++ def __init__(self, filename): ++ super(TemplateNode, self).__init__('', 0, 0, filename) ++ self.nodes = [] ++ self.page_attributes = {} ++ ++ def get_children(self): ++ return self.nodes ++ ++ def __repr__(self): ++ return "TemplateNode(%s, %r)" % ( ++ util.sorted_dict_repr(self.page_attributes), ++ self.nodes) ++ ++class ControlLine(Node): ++ """defines a control line, a line-oriented python line or end tag. ++ ++ e.g.:: ++ ++ % if foo: ++ (markup) ++ % endif ++ ++ """ ++ ++ has_loop_context = False ++ ++ def __init__(self, keyword, isend, text, **kwargs): ++ super(ControlLine, self).__init__(**kwargs) ++ self.text = text ++ self.keyword = keyword ++ self.isend = isend ++ self.is_primary = keyword in ['for', 'if', 'while', 'try', 'with'] ++ self.nodes = [] ++ if self.isend: ++ self._declared_identifiers = [] ++ self._undeclared_identifiers = [] ++ else: ++ code = ast.PythonFragment(text, **self.exception_kwargs) ++ self._declared_identifiers = code.declared_identifiers ++ self._undeclared_identifiers = code.undeclared_identifiers ++ ++ def get_children(self): ++ return self.nodes ++ ++ def declared_identifiers(self): ++ return self._declared_identifiers ++ ++ def undeclared_identifiers(self): ++ return self._undeclared_identifiers ++ ++ def is_ternary(self, keyword): ++ """return true if the given keyword is a ternary keyword ++ for this ControlLine""" ++ ++ return keyword in { ++ 'if':set(['else', 'elif']), ++ 'try':set(['except', 'finally']), ++ 'for':set(['else']) ++ }.get(self.keyword, []) ++ ++ def __repr__(self): ++ return "ControlLine(%r, %r, %r, %r)" % ( ++ self.keyword, ++ self.text, ++ self.isend, ++ (self.lineno, self.pos) ++ ) ++ ++class Text(Node): ++ """defines plain text in the template.""" ++ ++ def __init__(self, content, **kwargs): ++ super(Text, self).__init__(**kwargs) ++ self.content = content ++ ++ def __repr__(self): ++ return "Text(%r, %r)" % (self.content, (self.lineno, self.pos)) ++ ++class Code(Node): ++ """defines a Python code block, either inline or module level. ++ ++ e.g.:: ++ ++ inline: ++ <% ++ x = 12 ++ %> ++ ++ module level: ++ <%! 
++ import logger ++ %> ++ ++ """ ++ ++ def __init__(self, text, ismodule, **kwargs): ++ super(Code, self).__init__(**kwargs) ++ self.text = text ++ self.ismodule = ismodule ++ self.code = ast.PythonCode(text, **self.exception_kwargs) ++ ++ def declared_identifiers(self): ++ return self.code.declared_identifiers ++ ++ def undeclared_identifiers(self): ++ return self.code.undeclared_identifiers ++ ++ def __repr__(self): ++ return "Code(%r, %r, %r)" % ( ++ self.text, ++ self.ismodule, ++ (self.lineno, self.pos) ++ ) ++ ++class Comment(Node): ++ """defines a comment line. ++ ++ # this is a comment ++ ++ """ ++ ++ def __init__(self, text, **kwargs): ++ super(Comment, self).__init__(**kwargs) ++ self.text = text ++ ++ def __repr__(self): ++ return "Comment(%r, %r)" % (self.text, (self.lineno, self.pos)) ++ ++class Expression(Node): ++ """defines an inline expression. ++ ++ ${x+y} ++ ++ """ ++ ++ def __init__(self, text, escapes, **kwargs): ++ super(Expression, self).__init__(**kwargs) ++ self.text = text ++ self.escapes = escapes ++ self.escapes_code = ast.ArgumentList(escapes, **self.exception_kwargs) ++ self.code = ast.PythonCode(text, **self.exception_kwargs) ++ ++ def declared_identifiers(self): ++ return [] ++ ++ def undeclared_identifiers(self): ++ # TODO: make the "filter" shortcut list configurable at parse/gen time ++ return self.code.undeclared_identifiers.union( ++ self.escapes_code.undeclared_identifiers.difference( ++ set(filters.DEFAULT_ESCAPES.keys()) ++ ) ++ ).difference(self.code.declared_identifiers) ++ ++ def __repr__(self): ++ return "Expression(%r, %r, %r)" % ( ++ self.text, ++ self.escapes_code.args, ++ (self.lineno, self.pos) ++ ) ++ ++class _TagMeta(type): ++ """metaclass to allow Tag to produce a subclass according to ++ its keyword""" ++ ++ _classmap = {} ++ ++ def __init__(cls, clsname, bases, dict): ++ if getattr(cls, '__keyword__', None) is not None: ++ cls._classmap[cls.__keyword__] = cls ++ super(_TagMeta, cls).__init__(clsname, bases, dict) ++ ++ def __call__(cls, keyword, attributes, **kwargs): ++ if ":" in keyword: ++ ns, defname = keyword.split(':') ++ return type.__call__(CallNamespaceTag, ns, defname, ++ attributes, **kwargs) ++ ++ try: ++ cls = _TagMeta._classmap[keyword] ++ except KeyError: ++ raise exceptions.CompileException( ++ "No such tag: '%s'" % keyword, ++ source=kwargs['source'], ++ lineno=kwargs['lineno'], ++ pos=kwargs['pos'], ++ filename=kwargs['filename'] ++ ) ++ return type.__call__(cls, keyword, attributes, **kwargs) ++ ++class Tag(compat.with_metaclass(_TagMeta, Node)): ++ """abstract base class for tags. ++ ++ <%sometag/> ++ ++ <%someothertag> ++ stuff ++ </%someothertag> ++ ++ """ ++ __keyword__ = None ++ ++ def __init__(self, keyword, attributes, expressions, ++ nonexpressions, required, **kwargs): ++ """construct a new Tag instance. ++ ++ this constructor is not called directly, and is only called ++ by subclasses.
++ ++ :param keyword: the tag keyword ++ ++ :param attributes: raw dictionary of attribute key/value pairs ++ ++ :param expressions: a set of identifiers that are legal attributes, ++ which can also contain embedded expressions ++ ++ :param nonexpressions: a set of identifiers that are legal ++ attributes, which cannot contain embedded expressions ++ ++ :param \**kwargs: ++ other arguments passed to the Node superclass (lineno, pos) ++ ++ """ ++ super(Tag, self).__init__(**kwargs) ++ self.keyword = keyword ++ self.attributes = attributes ++ self._parse_attributes(expressions, nonexpressions) ++ missing = [r for r in required if r not in self.parsed_attributes] ++ if len(missing): ++ raise exceptions.CompileException( ++ "Missing attribute(s): %s" % ++ ",".join([repr(m) for m in missing]), ++ **self.exception_kwargs) ++ self.parent = None ++ self.nodes = [] ++ ++ def is_root(self): ++ return self.parent is None ++ ++ def get_children(self): ++ return self.nodes ++ ++ def _parse_attributes(self, expressions, nonexpressions): ++ undeclared_identifiers = set() ++ self.parsed_attributes = {} ++ for key in self.attributes: ++ if key in expressions: ++ expr = [] ++ for x in re.compile(r'(\${.+?})', ++ re.S).split(self.attributes[key]): ++ m = re.compile(r'^\${(.+?)}$', re.S).match(x) ++ if m: ++ code = ast.PythonCode(m.group(1).rstrip(), ++ **self.exception_kwargs) ++ # we aren't discarding "declared_identifiers" here, ++ # which we do so that list comprehension-declared ++ # variables aren't counted. As yet can't find a ++ # condition that requires it here. ++ undeclared_identifiers = \ ++ undeclared_identifiers.union( ++ code.undeclared_identifiers) ++ expr.append('(%s)' % m.group(1)) ++ else: ++ if x: ++ expr.append(repr(x)) ++ self.parsed_attributes[key] = " + ".join(expr) or repr('') ++ elif key in nonexpressions: ++ if re.search(r'\${.+?}', self.attributes[key]): ++ raise exceptions.CompileException( ++ "Attribute '%s' in tag '%s' does not allow embedded " ++ "expressions" % (key, self.keyword), ++ **self.exception_kwargs) ++ self.parsed_attributes[key] = repr(self.attributes[key]) ++ else: ++ raise exceptions.CompileException( ++ "Invalid attribute for tag '%s': '%s'" % ++ (self.keyword, key), ++ **self.exception_kwargs) ++ self.expression_undeclared_identifiers = undeclared_identifiers ++ ++ def declared_identifiers(self): ++ return [] ++ ++ def undeclared_identifiers(self): ++ return self.expression_undeclared_identifiers ++ ++ def __repr__(self): ++ return "%s(%r, %s, %r, %r)" % (self.__class__.__name__, ++ self.keyword, ++ util.sorted_dict_repr(self.attributes), ++ (self.lineno, self.pos), ++ self.nodes ++ ) ++ ++class IncludeTag(Tag): ++ __keyword__ = 'include' ++ ++ def __init__(self, keyword, attributes, **kwargs): ++ super(IncludeTag, self).__init__( ++ keyword, ++ attributes, ++ ('file', 'import', 'args'), ++ (), ('file',), **kwargs) ++ self.page_args = ast.PythonCode( ++ "__DUMMY(%s)" % attributes.get('args', ''), ++ **self.exception_kwargs) ++ ++ def declared_identifiers(self): ++ return [] ++ ++ def undeclared_identifiers(self): ++ identifiers = self.page_args.undeclared_identifiers.\ ++ difference(set(["__DUMMY"])).\ ++ difference(self.page_args.declared_identifiers) ++ return identifiers.union(super(IncludeTag, self).
++ undeclared_identifiers()) ++ ++class NamespaceTag(Tag): ++ __keyword__ = 'namespace' ++ ++ def __init__(self, keyword, attributes, **kwargs): ++ super(NamespaceTag, self).__init__( ++ keyword, attributes, ++ ('file',), ++ ('name','inheritable', ++ 'import','module'), ++ (), **kwargs) ++ ++ self.name = attributes.get('name', '__anon_%s' % hex(abs(id(self)))) ++ if not 'name' in attributes and not 'import' in attributes: ++ raise exceptions.CompileException( ++ "'name' and/or 'import' attributes are required " ++ "for <%namespace>", ++ **self.exception_kwargs) ++ if 'file' in attributes and 'module' in attributes: ++ raise exceptions.CompileException( ++ "<%namespace> may only have one of 'file' or 'module'", ++ **self.exception_kwargs ++ ) ++ ++ def declared_identifiers(self): ++ return [] ++ ++class TextTag(Tag): ++ __keyword__ = 'text' ++ ++ def __init__(self, keyword, attributes, **kwargs): ++ super(TextTag, self).__init__( ++ keyword, ++ attributes, (), ++ ('filter'), (), **kwargs) ++ self.filter_args = ast.ArgumentList( ++ attributes.get('filter', ''), ++ **self.exception_kwargs) ++ ++ def undeclared_identifiers(self): ++ return self.filter_args.\ ++ undeclared_identifiers.\ ++ difference(filters.DEFAULT_ESCAPES.keys()).union( ++ self.expression_undeclared_identifiers ++ ) ++ ++class DefTag(Tag): ++ __keyword__ = 'def' ++ ++ def __init__(self, keyword, attributes, **kwargs): ++ expressions = ['buffered', 'cached'] + [ ++ c for c in attributes if c.startswith('cache_')] ++ ++ ++ super(DefTag, self).__init__( ++ keyword, ++ attributes, ++ expressions, ++ ('name', 'filter', 'decorator'), ++ ('name',), ++ **kwargs) ++ name = attributes['name'] ++ if re.match(r'^[\w_]+$', name): ++ raise exceptions.CompileException( ++ "Missing parenthesis in %def", ++ **self.exception_kwargs) ++ self.function_decl = ast.FunctionDecl("def " + name + ":pass", ++ **self.exception_kwargs) ++ self.name = self.function_decl.funcname ++ self.decorator = attributes.get('decorator', '') ++ self.filter_args = ast.ArgumentList( ++ attributes.get('filter', ''), ++ **self.exception_kwargs) ++ ++ is_anonymous = False ++ is_block = False ++ ++ @property ++ def funcname(self): ++ return self.function_decl.funcname ++ ++ def get_argument_expressions(self, **kw): ++ return self.function_decl.get_argument_expressions(**kw) ++ ++ def declared_identifiers(self): ++ return self.function_decl.allargnames ++ ++ def undeclared_identifiers(self): ++ res = [] ++ for c in self.function_decl.defaults: ++ res += list(ast.PythonCode(c, **self.exception_kwargs). 
++ undeclared_identifiers) ++ return set(res).union( ++ self.filter_args.\ ++ undeclared_identifiers.\ ++ difference(filters.DEFAULT_ESCAPES.keys()) ++ ).union( ++ self.expression_undeclared_identifiers ++ ).difference( ++ self.function_decl.allargnames ++ ) ++ ++class BlockTag(Tag): ++ __keyword__ = 'block' ++ ++ def __init__(self, keyword, attributes, **kwargs): ++ expressions = ['buffered', 'cached', 'args'] + [ ++ c for c in attributes if c.startswith('cache_')] ++ ++ super(BlockTag, self).__init__( ++ keyword, ++ attributes, ++ expressions, ++ ('name','filter', 'decorator'), ++ (), ++ **kwargs) ++ name = attributes.get('name') ++ if name and not re.match(r'^[\w_]+$',name): ++ raise exceptions.CompileException( ++ "%block may not specify an argument signature", ++ **self.exception_kwargs) ++ if not name and attributes.get('args', None): ++ raise exceptions.CompileException( ++ "Only named %blocks may specify args", ++ **self.exception_kwargs ++ ) ++ self.body_decl = ast.FunctionArgs(attributes.get('args', ''), ++ **self.exception_kwargs) ++ ++ self.name = name ++ self.decorator = attributes.get('decorator', '') ++ self.filter_args = ast.ArgumentList( ++ attributes.get('filter', ''), ++ **self.exception_kwargs) ++ ++ ++ is_block = True ++ ++ @property ++ def is_anonymous(self): ++ return self.name is None ++ ++ @property ++ def funcname(self): ++ return self.name or "__M_anon_%d" % (self.lineno, ) ++ ++ def get_argument_expressions(self, **kw): ++ return self.body_decl.get_argument_expressions(**kw) ++ ++ def declared_identifiers(self): ++ return self.body_decl.allargnames ++ ++ def undeclared_identifiers(self): ++ return (self.filter_args.\ ++ undeclared_identifiers.\ ++ difference(filters.DEFAULT_ESCAPES.keys()) ++ ).union(self.expression_undeclared_identifiers) ++ ++ ++ ++class CallTag(Tag): ++ __keyword__ = 'call' ++ ++ def __init__(self, keyword, attributes, **kwargs): ++ super(CallTag, self).__init__(keyword, attributes, ++ ('args'), ('expr',), ('expr',), **kwargs) ++ self.expression = attributes['expr'] ++ self.code = ast.PythonCode(self.expression, **self.exception_kwargs) ++ self.body_decl = ast.FunctionArgs(attributes.get('args', ''), ++ **self.exception_kwargs) ++ ++ def declared_identifiers(self): ++ return self.code.declared_identifiers.union(self.body_decl.allargnames) ++ ++ def undeclared_identifiers(self): ++ return self.code.undeclared_identifiers.\ ++ difference(self.code.declared_identifiers) ++ ++class CallNamespaceTag(Tag): ++ ++ def __init__(self, namespace, defname, attributes, **kwargs): ++ super(CallNamespaceTag, self).__init__( ++ namespace + ":" + defname, ++ attributes, ++ tuple(attributes.keys()) + ('args', ), ++ (), ++ (), ++ **kwargs) ++ ++ self.expression = "%s.%s(%s)" % ( ++ namespace, ++ defname, ++ ",".join(["%s=%s" % (k, v) for k, v in ++ self.parsed_attributes.items() ++ if k != 'args']) ++ ) ++ self.code = ast.PythonCode(self.expression, **self.exception_kwargs) ++ self.body_decl = ast.FunctionArgs( ++ attributes.get('args', ''), ++ **self.exception_kwargs) ++ ++ def declared_identifiers(self): ++ return self.code.declared_identifiers.union(self.body_decl.allargnames) ++ ++ def undeclared_identifiers(self): ++ return self.code.undeclared_identifiers.\ ++ difference(self.code.declared_identifiers) ++ ++class InheritTag(Tag): ++ __keyword__ = 'inherit' ++ ++ def __init__(self, keyword, attributes, **kwargs): ++ super(InheritTag, self).__init__( ++ keyword, attributes, ++ ('file',), (), ('file',), **kwargs) ++ ++class PageTag(Tag): ++ __keyword__ = 
'page' ++ ++ def __init__(self, keyword, attributes, **kwargs): ++ expressions = ['cached', 'args', 'expression_filter', 'enable_loop'] + [ ++ c for c in attributes if c.startswith('cache_')] ++ ++ super(PageTag, self).__init__( ++ keyword, ++ attributes, ++ expressions, ++ (), ++ (), ++ **kwargs) ++ self.body_decl = ast.FunctionArgs(attributes.get('args', ''), ++ **self.exception_kwargs) ++ self.filter_args = ast.ArgumentList( ++ attributes.get('expression_filter', ''), ++ **self.exception_kwargs) ++ ++ def declared_identifiers(self): ++ return self.body_decl.allargnames ++ ++ +diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/pygen.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/pygen.py +new file mode 100644 +index 0000000..5ba5125 +--- /dev/null ++++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/pygen.py +@@ -0,0 +1,299 @@ ++# mako/pygen.py ++# Copyright (C) 2006-2015 the Mako authors and contributors ++# ++# This module is part of Mako and is released under ++# the MIT License: http://www.opensource.org/licenses/mit-license.php ++ ++"""utilities for generating and formatting literal Python code.""" ++ ++import re ++from mako import exceptions ++ ++class PythonPrinter(object): ++ def __init__(self, stream): ++ # indentation counter ++ self.indent = 0 ++ ++ # a stack storing information about why we incremented ++ # the indentation counter, to help us determine if we ++ # should decrement it ++ self.indent_detail = [] ++ ++ # the string of whitespace multiplied by the indent ++ # counter to produce a line ++ self.indentstring = " " ++ ++ # the stream we are writing to ++ self.stream = stream ++ ++ # current line number ++ self.lineno = 1 ++ ++ # a list of lines that represents a buffered "block" of code, ++ # which can be later printed relative to an indent level ++ self.line_buffer = [] ++ ++ self.in_indent_lines = False ++ ++ self._reset_multi_line_flags() ++ ++ # mapping of generated python lines to template ++ # source lines ++ self.source_map = {} ++ ++ def _update_lineno(self, num): ++ self.lineno += num ++ ++ def start_source(self, lineno): ++ if self.lineno not in self.source_map: ++ self.source_map[self.lineno] = lineno ++ ++ def write_blanks(self, num): ++ self.stream.write("\n" * num) ++ self._update_lineno(num) ++ ++ def write_indented_block(self, block): ++ """print a line or lines of python which already contain indentation. ++ ++ The indentation of the total block of lines will be adjusted to that of ++ the current indent level.""" ++ self.in_indent_lines = False ++ for l in re.split(r'\r?\n', block): ++ self.line_buffer.append(l) ++ self._update_lineno(1) ++ ++ def writelines(self, *lines): ++ """print a series of lines of python.""" ++ for line in lines: ++ self.writeline(line) ++ ++ def writeline(self, line): ++ """print a line of python, indenting it according to the current ++ indent level. ++ ++ this also adjusts the indentation counter according to the ++ content of the line. 
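++
++        A minimal sketch of the behavior; ``buf`` is assumed to be any
++        writable stream such as a ``StringIO``::
++
++            printer = PythonPrinter(buf)
++            printer.writeline("if x:")   # trailing colon raises indent to 1
++            printer.writeline("pass")    # emitted one indent level deep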
++ ++ """ ++ ++ if not self.in_indent_lines: ++ self._flush_adjusted_lines() ++ self.in_indent_lines = True ++ ++ if (line is None or ++ re.match(r"^\s*#",line) or ++ re.match(r"^\s*$", line) ++ ): ++ hastext = False ++ else: ++ hastext = True ++ ++ is_comment = line and len(line) and line[0] == '#' ++ ++ # see if this line should decrease the indentation level ++ if (not is_comment and ++ (not hastext or self._is_unindentor(line)) ++ ): ++ ++ if self.indent > 0: ++ self.indent -= 1 ++ # if the indent_detail stack is empty, the user ++ # probably put extra closures - the resulting ++ # module wont compile. ++ if len(self.indent_detail) == 0: ++ raise exceptions.SyntaxException( ++ "Too many whitespace closures") ++ self.indent_detail.pop() ++ ++ if line is None: ++ return ++ ++ # write the line ++ self.stream.write(self._indent_line(line) + "\n") ++ self._update_lineno(len(line.split("\n"))) ++ ++ # see if this line should increase the indentation level. ++ # note that a line can both decrase (before printing) and ++ # then increase (after printing) the indentation level. ++ ++ if re.search(r":[ \t]*(?:#.*)?$", line): ++ # increment indentation count, and also ++ # keep track of what the keyword was that indented us, ++ # if it is a python compound statement keyword ++ # where we might have to look for an "unindent" keyword ++ match = re.match(r"^\s*(if|try|elif|while|for|with)", line) ++ if match: ++ # its a "compound" keyword, so we will check for "unindentors" ++ indentor = match.group(1) ++ self.indent += 1 ++ self.indent_detail.append(indentor) ++ else: ++ indentor = None ++ # its not a "compound" keyword. but lets also ++ # test for valid Python keywords that might be indenting us, ++ # else assume its a non-indenting line ++ m2 = re.match(r"^\s*(def|class|else|elif|except|finally)", ++ line) ++ if m2: ++ self.indent += 1 ++ self.indent_detail.append(indentor) ++ ++ def close(self): ++ """close this printer, flushing any remaining lines.""" ++ self._flush_adjusted_lines() ++ ++ def _is_unindentor(self, line): ++ """return true if the given line is an 'unindentor', ++ relative to the last 'indent' event received. ++ ++ """ ++ ++ # no indentation detail has been pushed on; return False ++ if len(self.indent_detail) == 0: ++ return False ++ ++ indentor = self.indent_detail[-1] ++ ++ # the last indent keyword we grabbed is not a ++ # compound statement keyword; return False ++ if indentor is None: ++ return False ++ ++ # if the current line doesnt have one of the "unindentor" keywords, ++ # return False ++ match = re.match(r"^\s*(else|elif|except|finally).*\:", line) ++ if not match: ++ return False ++ ++ # whitespace matches up, we have a compound indentor, ++ # and this line has an unindentor, this ++ # is probably good enough ++ return True ++ ++ # should we decide that its not good enough, heres ++ # more stuff to check. ++ #keyword = match.group(1) ++ ++ # match the original indent keyword ++ #for crit in [ ++ # (r'if|elif', r'else|elif'), ++ # (r'try', r'except|finally|else'), ++ # (r'while|for', r'else'), ++ #]: ++ # if re.match(crit[0], indentor) and re.match(crit[1], keyword): ++ # return True ++ ++ #return False ++ ++ def _indent_line(self, line, stripspace=''): ++ """indent the given line according to the current indent level. 
++ ++ stripspace is a string of space that will be truncated from the ++ start of the line before indenting.""" ++ ++ return re.sub(r"^%s" % stripspace, self.indentstring ++ * self.indent, line) ++ ++ def _reset_multi_line_flags(self): ++ """reset the flags which would indicate we are in a backslashed ++ or triple-quoted section.""" ++ ++ self.backslashed, self.triplequoted = False, False ++ ++ def _in_multi_line(self, line): ++ """return true if the given line is part of a multi-line block, ++ via backslash or triple-quote.""" ++ ++ # we are only looking for explicitly joined lines here, not ++ # implicit ones (i.e. brackets, braces etc.). this is just to ++ # guard against the possibility of modifying the space inside of ++ # a literal multiline string with unfortunately placed ++ # whitespace ++ ++ current_state = (self.backslashed or self.triplequoted) ++ ++ if re.search(r"\\$", line): ++ self.backslashed = True ++ else: ++ self.backslashed = False ++ ++ triples = len(re.findall(r"\"\"\"|\'\'\'", line)) ++ if triples == 1 or triples % 2 != 0: ++ self.triplequoted = not self.triplequoted ++ ++ return current_state ++ ++ def _flush_adjusted_lines(self): ++ stripspace = None ++ self._reset_multi_line_flags() ++ ++ for entry in self.line_buffer: ++ if self._in_multi_line(entry): ++ self.stream.write(entry + "\n") ++ else: ++ entry = entry.expandtabs() ++ if stripspace is None and re.search(r"^[ \t]*[^# \t]", entry): ++ stripspace = re.match(r"^([ \t]*)", entry).group(1) ++ self.stream.write(self._indent_line(entry, stripspace) + "\n") ++ ++ self.line_buffer = [] ++ self._reset_multi_line_flags() ++ ++ ++def adjust_whitespace(text): ++ """remove the left-whitespace margin of a block of Python code.""" ++ ++ state = [False, False] ++ (backslashed, triplequoted) = (0, 1) ++ ++ def in_multi_line(line): ++ start_state = (state[backslashed] or state[triplequoted]) ++ ++ if re.search(r"\\$", line): ++ state[backslashed] = True ++ else: ++ state[backslashed] = False ++ ++ def match(reg, t): ++ m = re.match(reg, t) ++ if m: ++ return m, t[len(m.group(0)):] ++ else: ++ return None, t ++ ++ while line: ++ if state[triplequoted]: ++ m, line = match(r"%s" % state[triplequoted], line) ++ if m: ++ state[triplequoted] = False ++ else: ++ m, line = match(r".*?(?=%s|$)" % state[triplequoted], line) ++ else: ++ m, line = match(r'#', line) ++ if m: ++ return start_state ++ ++ m, line = match(r"\"\"\"|\'\'\'", line) ++ if m: ++ state[triplequoted] = m.group(0) ++ continue ++ ++ m, line = match(r".*?(?=\"\"\"|\'\'\'|#|$)", line) ++ ++ return start_state ++ ++ def _indent_line(line, stripspace=''): ++ return re.sub(r"^%s" % stripspace, '', line) ++ ++ lines = [] ++ stripspace = None ++ ++ for line in re.split(r'\r?\n', text): ++ if in_multi_line(line): ++ lines.append(line) ++ else: ++ line = line.expandtabs() ++ if stripspace is None and re.search(r"^[ \t]*[^# \t]", line): ++ stripspace = re.match(r"^([ \t]*)", line).group(1) ++ lines.append(_indent_line(line, stripspace)) ++ return "\n".join(lines) +diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/pyparser.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/pyparser.py +new file mode 100644 +index 0000000..bfa46a9 +--- /dev/null ++++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/pyparser.py +@@ -0,0 +1,232 @@ ++# mako/pyparser.py ++# Copyright (C) 2006-2015 the Mako authors and contributors ++# ++# This module is part of Mako and is released under ++# the MIT License: http://www.opensource.org/licenses/mit-license.php ++ ++"""Handles parsing 
of Python code. ++ ++Parsing to AST is done via _ast on Python > 2.5, otherwise the compiler ++module is used. ++""" ++ ++from mako import exceptions, util, compat ++from mako.compat import arg_stringname ++import operator ++ ++if compat.py3k: ++ # words that cannot be assigned to (notably ++ # smaller than the total keys in __builtins__) ++ reserved = set(['True', 'False', 'None', 'print']) ++ ++ # the "id" attribute on a function node ++ arg_id = operator.attrgetter('arg') ++else: ++ # words that cannot be assigned to (notably ++ # smaller than the total keys in __builtins__) ++ reserved = set(['True', 'False', 'None']) ++ ++ # the "id" attribute on a function node ++ arg_id = operator.attrgetter('id') ++ ++import _ast ++util.restore__ast(_ast) ++from mako import _ast_util ++ ++ ++def parse(code, mode='exec', **exception_kwargs): ++ """Parse an expression into AST""" ++ ++ try: ++ return _ast_util.parse(code, '', mode) ++ except Exception: ++ raise exceptions.SyntaxException( ++ "(%s) %s (%r)" % ( ++ compat.exception_as().__class__.__name__, ++ compat.exception_as(), ++ code[0:50] ++ ), **exception_kwargs) ++ ++ ++class FindIdentifiers(_ast_util.NodeVisitor): ++ ++ def __init__(self, listener, **exception_kwargs): ++ self.in_function = False ++ self.in_assign_targets = False ++ self.local_ident_stack = set() ++ self.listener = listener ++ self.exception_kwargs = exception_kwargs ++ ++ def _add_declared(self, name): ++ if not self.in_function: ++ self.listener.declared_identifiers.add(name) ++ else: ++ self.local_ident_stack.add(name) ++ ++ def visit_ClassDef(self, node): ++ self._add_declared(node.name) ++ ++ def visit_Assign(self, node): ++ ++ # flip around the visiting of Assign so the expression gets ++ # evaluated first, in the case of a clause like "x=x+5" (x ++ # is undeclared) ++ ++ self.visit(node.value) ++ in_a = self.in_assign_targets ++ self.in_assign_targets = True ++ for n in node.targets: ++ self.visit(n) ++ self.in_assign_targets = in_a ++ ++ if compat.py3k: ++ ++ # ExceptHandler is in Python 2, but this block only works in ++ # Python 3 (and is required there) ++ ++ def visit_ExceptHandler(self, node): ++ if node.name is not None: ++ self._add_declared(node.name) ++ if node.type is not None: ++ self.visit(node.type) ++ for statement in node.body: ++ self.visit(statement) ++ ++ def visit_Lambda(self, node, *args): ++ self._visit_function(node, True) ++ ++ def visit_FunctionDef(self, node): ++ self._add_declared(node.name) ++ self._visit_function(node, False) ++ ++ def _expand_tuples(self, args): ++ for arg in args: ++ if isinstance(arg, _ast.Tuple): ++ for n in arg.elts: ++ yield n ++ else: ++ yield arg ++ ++ def _visit_function(self, node, islambda): ++ ++ # push function state onto stack. dont log any more ++ # identifiers as "declared" until outside of the function, ++ # but keep logging identifiers as "undeclared". 
track ++ # argument names in each function header so they aren't ++ # counted as "undeclared" ++ ++ inf = self.in_function ++ self.in_function = True ++ ++ local_ident_stack = self.local_ident_stack ++ self.local_ident_stack = local_ident_stack.union([ ++ arg_id(arg) for arg in self._expand_tuples(node.args.args) ++ ]) ++ if islambda: ++ self.visit(node.body) ++ else: ++ for n in node.body: ++ self.visit(n) ++ self.in_function = inf ++ self.local_ident_stack = local_ident_stack ++ ++ def visit_For(self, node): ++ ++ # flip around visit ++ ++ self.visit(node.iter) ++ self.visit(node.target) ++ for statement in node.body: ++ self.visit(statement) ++ for statement in node.orelse: ++ self.visit(statement) ++ ++ def visit_Name(self, node): ++ if isinstance(node.ctx, _ast.Store): ++ # this is equivalent to visit_AssName in ++ # compiler ++ self._add_declared(node.id) ++ elif node.id not in reserved and node.id \ ++ not in self.listener.declared_identifiers and node.id \ ++ not in self.local_ident_stack: ++ self.listener.undeclared_identifiers.add(node.id) ++ ++ def visit_Import(self, node): ++ for name in node.names: ++ if name.asname is not None: ++ self._add_declared(name.asname) ++ else: ++ self._add_declared(name.name.split('.')[0]) ++ ++ def visit_ImportFrom(self, node): ++ for name in node.names: ++ if name.asname is not None: ++ self._add_declared(name.asname) ++ else: ++ if name.name == '*': ++ raise exceptions.CompileException( ++ "'import *' is not supported, since all identifier " ++ "names must be explicitly declared. Please use the " ++ "form 'from <modulename> import <name1>, <name2>, " ++ "...' instead.", **self.exception_kwargs) ++ self._add_declared(name.name) ++ ++ ++class FindTuple(_ast_util.NodeVisitor): ++ ++ def __init__(self, listener, code_factory, **exception_kwargs): ++ self.listener = listener ++ self.exception_kwargs = exception_kwargs ++ self.code_factory = code_factory ++ ++ def visit_Tuple(self, node): ++ for n in node.elts: ++ p = self.code_factory(n, **self.exception_kwargs) ++ self.listener.codeargs.append(p) ++ self.listener.args.append(ExpressionGenerator(n).value()) ++ self.listener.declared_identifiers = \ ++ self.listener.declared_identifiers.union( ++ p.declared_identifiers) ++ self.listener.undeclared_identifiers = \ ++ self.listener.undeclared_identifiers.union( ++ p.undeclared_identifiers) ++ ++ ++class ParseFunc(_ast_util.NodeVisitor): ++ ++ def __init__(self, listener, **exception_kwargs): ++ self.listener = listener ++ self.exception_kwargs = exception_kwargs ++ ++ def visit_FunctionDef(self, node): ++ self.listener.funcname = node.name ++ ++ argnames = [arg_id(arg) for arg in node.args.args] ++ if node.args.vararg: ++ argnames.append(arg_stringname(node.args.vararg)) ++ ++ if compat.py2k: ++ # kw-only args don't exist in Python 2 ++ kwargnames = [] ++ else: ++ kwargnames = [arg_id(arg) for arg in node.args.kwonlyargs] ++ if node.args.kwarg: ++ kwargnames.append(arg_stringname(node.args.kwarg)) ++ self.listener.argnames = argnames ++ self.listener.defaults = node.args.defaults # ast ++ self.listener.kwargnames = kwargnames ++ if compat.py2k: ++ self.listener.kwdefaults = [] ++ else: ++ self.listener.kwdefaults = node.args.kw_defaults ++ self.listener.varargs = node.args.vararg ++ self.listener.kwargs = node.args.kwarg ++ ++class ExpressionGenerator(object): ++ ++ def __init__(self, astnode): ++ self.generator = _ast_util.SourceGenerator(' ' * 4) ++ self.generator.visit(astnode) ++ ++ def value(self): ++ return ''.join(self.generator.result) +diff --git
a/src/gallium/drivers/swr/rasterizer/scripts/mako/runtime.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/runtime.py +new file mode 100644 +index 0000000..6b6a35a +--- /dev/null ++++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/runtime.py +@@ -0,0 +1,878 @@ ++# mako/runtime.py ++# Copyright (C) 2006-2015 the Mako authors and contributors ++# ++# This module is part of Mako and is released under ++# the MIT License: http://www.opensource.org/licenses/mit-license.php ++ ++"""provides runtime services for templates, including Context, ++Namespace, and various helper functions.""" ++ ++from mako import exceptions, util, compat ++from mako.compat import compat_builtins ++import sys ++ ++ ++class Context(object): ++ """Provides runtime namespace, output buffer, and various ++ callstacks for templates. ++ ++ See :ref:`runtime_toplevel` for detail on the usage of ++ :class:`.Context`. ++ ++ """ ++ ++ def __init__(self, buffer, **data): ++ self._buffer_stack = [buffer] ++ ++ self._data = data ++ ++ self._kwargs = data.copy() ++ self._with_template = None ++ self._outputting_as_unicode = None ++ self.namespaces = {} ++ ++ # "capture" function which proxies to the ++ # generic "capture" function ++ self._data['capture'] = compat.partial(capture, self) ++ ++ # "caller" stack used by def calls with content ++ self.caller_stack = self._data['caller'] = CallerStack() ++ ++ def _set_with_template(self, t): ++ self._with_template = t ++ illegal_names = t.reserved_names.intersection(self._data) ++ if illegal_names: ++ raise exceptions.NameConflictError( ++ "Reserved words passed to render(): %s" % ++ ", ".join(illegal_names)) ++ ++ @property ++ def lookup(self): ++ """Return the :class:`.TemplateLookup` associated ++ with this :class:`.Context`. ++ ++ """ ++ return self._with_template.lookup ++ ++ @property ++ def kwargs(self): ++ """Return the dictionary of top level keyword arguments associated ++ with this :class:`.Context`. ++ ++ This dictionary only includes the top-level arguments passed to ++ :meth:`.Template.render`. It does not include names produced within ++ the template execution such as local variable names or special names ++ such as ``self``, ``next``, etc. ++ ++ The purpose of this dictionary is primarily for the case that ++ a :class:`.Template` accepts arguments via its ``<%page>`` tag, ++ which are normally expected to be passed via :meth:`.Template.render`, ++ except the template is being called in an inheritance context, ++ using the ``body()`` method. 
:attr:`.Context.kwargs` can then be ++ used to propagate these arguments to the inheriting template:: ++ ++ ${next.body(**context.kwargs)} ++ ++ """ ++ return self._kwargs.copy() ++ ++ def push_caller(self, caller): ++ """Push a ``caller`` callable onto the callstack for ++ this :class:`.Context`.""" ++ ++ ++ self.caller_stack.append(caller) ++ ++ def pop_caller(self): ++ """Pop a ``caller`` callable onto the callstack for this ++ :class:`.Context`.""" ++ ++ del self.caller_stack[-1] ++ ++ def keys(self): ++ """Return a list of all names established in this :class:`.Context`.""" ++ ++ return list(self._data.keys()) ++ ++ def __getitem__(self, key): ++ if key in self._data: ++ return self._data[key] ++ else: ++ return compat_builtins.__dict__[key] ++ ++ def _push_writer(self): ++ """push a capturing buffer onto this Context and return ++ the new writer function.""" ++ ++ buf = util.FastEncodingBuffer() ++ self._buffer_stack.append(buf) ++ return buf.write ++ ++ def _pop_buffer_and_writer(self): ++ """pop the most recent capturing buffer from this Context ++ and return the current writer after the pop. ++ ++ """ ++ ++ buf = self._buffer_stack.pop() ++ return buf, self._buffer_stack[-1].write ++ ++ def _push_buffer(self): ++ """push a capturing buffer onto this Context.""" ++ ++ self._push_writer() ++ ++ def _pop_buffer(self): ++ """pop the most recent capturing buffer from this Context.""" ++ ++ return self._buffer_stack.pop() ++ ++ def get(self, key, default=None): ++ """Return a value from this :class:`.Context`.""" ++ ++ return self._data.get(key, compat_builtins.__dict__.get(key, default)) ++ ++ def write(self, string): ++ """Write a string to this :class:`.Context` object's ++ underlying output buffer.""" ++ ++ self._buffer_stack[-1].write(string) ++ ++ def writer(self): ++ """Return the current writer function.""" ++ ++ return self._buffer_stack[-1].write ++ ++ def _copy(self): ++ c = Context.__new__(Context) ++ c._buffer_stack = self._buffer_stack ++ c._data = self._data.copy() ++ c._kwargs = self._kwargs ++ c._with_template = self._with_template ++ c._outputting_as_unicode = self._outputting_as_unicode ++ c.namespaces = self.namespaces ++ c.caller_stack = self.caller_stack ++ return c ++ ++ def _locals(self, d): ++ """Create a new :class:`.Context` with a copy of this ++ :class:`.Context`'s current state, ++ updated with the given dictionary. ++ ++ The :attr:`.Context.kwargs` collection remains ++ unaffected. ++ ++ ++ """ ++ ++ if not d: ++ return self ++ c = self._copy() ++ c._data.update(d) ++ return c ++ ++ def _clean_inheritance_tokens(self): ++ """create a new copy of this :class:`.Context`. with ++ tokens related to inheritance state removed.""" ++ ++ c = self._copy() ++ x = c._data ++ x.pop('self', None) ++ x.pop('parent', None) ++ x.pop('next', None) ++ return c ++ ++class CallerStack(list): ++ def __init__(self): ++ self.nextcaller = None ++ ++ def __nonzero__(self): ++ return self.__bool__() ++ ++ def __bool__(self): ++ return len(self) and self._get_caller() and True or False ++ ++ def _get_caller(self): ++ # this method can be removed once ++ # codegen MAGIC_NUMBER moves past 7 ++ return self[-1] ++ ++ def __getattr__(self, key): ++ return getattr(self._get_caller(), key) ++ ++ def _push_frame(self): ++ frame = self.nextcaller or None ++ self.append(frame) ++ self.nextcaller = None ++ return frame ++ ++ def _pop_frame(self): ++ self.nextcaller = self.pop() ++ ++ ++class Undefined(object): ++ """Represents an undefined value in a template. 
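++
++    Truth-testing an :class:`.Undefined` returns ``False`` and
++    rendering one raises ``NameError``. An illustrative sketch,
++    comparing an unsupplied name against ``UNDEFINED`` (described
++    below)::
++
++        from mako.template import Template
++
++        # 'name' was never supplied, so it resolves to UNDEFINED
++        print(Template("${name is UNDEFINED}").render())  # prints "True"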
++ ++ All template modules have a constant value ++ ``UNDEFINED`` present which is an instance of this ++ object. ++ ++ """ ++ def __str__(self): ++ raise NameError("Undefined") ++ ++ def __nonzero__(self): ++ return self.__bool__() ++ ++ def __bool__(self): ++ return False ++ ++UNDEFINED = Undefined() ++ ++class LoopStack(object): ++ """a stack for LoopContexts that implements the context manager protocol ++ to automatically pop off the top of the stack on context exit ++ """ ++ ++ def __init__(self): ++ self.stack = [] ++ ++ def _enter(self, iterable): ++ self._push(iterable) ++ return self._top ++ ++ def _exit(self): ++ self._pop() ++ return self._top ++ ++ @property ++ def _top(self): ++ if self.stack: ++ return self.stack[-1] ++ else: ++ return self ++ ++ def _pop(self): ++ return self.stack.pop() ++ ++ def _push(self, iterable): ++ new = LoopContext(iterable) ++ if self.stack: ++ new.parent = self.stack[-1] ++ return self.stack.append(new) ++ ++ def __getattr__(self, key): ++ raise exceptions.RuntimeException("No loop context is established") ++ ++ def __iter__(self): ++ return iter(self._top) ++ ++ ++class LoopContext(object): ++ """A magic loop variable. ++ Automatically accessible in any ``% for`` block. ++ ++ See the section :ref:`loop_context` for usage ++ notes. ++ ++ :attr:`parent` -> :class:`.LoopContext` or ``None`` ++ The parent loop, if one exists. ++ :attr:`index` -> `int` ++ The 0-based iteration count. ++ :attr:`reverse_index` -> `int` ++ The number of iterations remaining. ++ :attr:`first` -> `bool` ++ ``True`` on the first iteration, ``False`` otherwise. ++ :attr:`last` -> `bool` ++ ``True`` on the last iteration, ``False`` otherwise. ++ :attr:`even` -> `bool` ++ ``True`` when ``index`` is even. ++ :attr:`odd` -> `bool` ++ ``True`` when ``index`` is odd. ++ """ ++ ++ def __init__(self, iterable): ++ self._iterable = iterable ++ self.index = 0 ++ self.parent = None ++ ++ def __iter__(self): ++ for i in self._iterable: ++ yield i ++ self.index += 1 ++ ++ @util.memoized_instancemethod ++ def __len__(self): ++ return len(self._iterable) ++ ++ @property ++ def reverse_index(self): ++ return len(self) - self.index - 1 ++ ++ @property ++ def first(self): ++ return self.index == 0 ++ ++ @property ++ def last(self): ++ return self.index == len(self) - 1 ++ ++ @property ++ def even(self): ++ return not self.odd ++ ++ @property ++ def odd(self): ++ return bool(self.index % 2) ++ ++ def cycle(self, *values): ++ """Cycle through values as the loop progresses. ++ """ ++ if not values: ++ raise ValueError("You must provide values to cycle through") ++ return values[self.index % len(values)] ++ ++ ++class _NSAttr(object): ++ def __init__(self, parent): ++ self.__parent = parent ++ def __getattr__(self, key): ++ ns = self.__parent ++ while ns: ++ if hasattr(ns.module, key): ++ return getattr(ns.module, key) ++ else: ++ ns = ns.inherits ++ raise AttributeError(key) ++ ++class Namespace(object): ++ """Provides access to collections of rendering methods, which ++ can be local, from other templates, or from imported modules. ++ ++ To access a particular rendering method referenced by a ++ :class:`.Namespace`, use plain attribute access: ++ ++ .. sourcecode:: mako ++ ++ ${some_namespace.foo(x, y, z)} ++ ++ :class:`.Namespace` also contains several built-in attributes ++ described here. 
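++
++    For example, a :class:`.Namespace` is most commonly established
++    with the ``<%namespace>`` tag and its defs invoked via attribute
++    access; the file and def names below are hypothetical:
++
++    .. sourcecode:: mako
++
++        <%namespace name="helpers" file="helpers.html"/>
++
++        ${helpers.render_header()}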
++ ++ """ ++ ++ def __init__(self, name, context, ++ callables=None, inherits=None, ++ populate_self=True, calling_uri=None): ++ self.name = name ++ self.context = context ++ self.inherits = inherits ++ if callables is not None: ++ self.callables = dict([(c.__name__, c) for c in callables]) ++ ++ callables = () ++ ++ module = None ++ """The Python module referenced by this :class:`.Namespace`. ++ ++ If the namespace references a :class:`.Template`, then ++ this module is the equivalent of ``template.module``, ++ i.e. the generated module for the template. ++ ++ """ ++ ++ template = None ++ """The :class:`.Template` object referenced by this ++ :class:`.Namespace`, if any. ++ ++ """ ++ ++ context = None ++ """The :class:`.Context` object for this :class:`.Namespace`. ++ ++ Namespaces are often created with copies of contexts that ++ contain slightly different data, particularly in inheritance ++ scenarios. Using the :class:`.Context` off of a :class:`.Namespace` one ++ can traverse an entire chain of templates that inherit from ++ one-another. ++ ++ """ ++ ++ filename = None ++ """The path of the filesystem file used for this ++ :class:`.Namespace`'s module or template. ++ ++ If this is a pure module-based ++ :class:`.Namespace`, this evaluates to ``module.__file__``. If a ++ template-based namespace, it evaluates to the original ++ template file location. ++ ++ """ ++ ++ uri = None ++ """The URI for this :class:`.Namespace`'s template. ++ ++ I.e. whatever was sent to :meth:`.TemplateLookup.get_template()`. ++ ++ This is the equivalent of :attr:`.Template.uri`. ++ ++ """ ++ ++ _templateuri = None ++ ++ @util.memoized_property ++ def attr(self): ++ """Access module level attributes by name. ++ ++ This accessor allows templates to supply "scalar" ++ attributes which are particularly handy in inheritance ++ relationships. ++ ++ .. seealso:: ++ ++ :ref:`inheritance_attr` ++ ++ :ref:`namespace_attr_for_includes` ++ ++ """ ++ return _NSAttr(self) ++ ++ def get_namespace(self, uri): ++ """Return a :class:`.Namespace` corresponding to the given ``uri``. ++ ++ If the given ``uri`` is a relative URI (i.e. it does not ++ contain a leading slash ``/``), the ``uri`` is adjusted to ++ be relative to the ``uri`` of the namespace itself. This ++ method is therefore mostly useful off of the built-in ++ ``local`` namespace, described in :ref:`namespace_local`. ++ ++ In ++ most cases, a template wouldn't need this function, and ++ should instead use the ``<%namespace>`` tag to load ++ namespaces. However, since all ``<%namespace>`` tags are ++ evaluated before the body of a template ever runs, ++ this method can be used to locate namespaces using ++ expressions that were generated within the body code of ++ the template, or to conditionally use a particular ++ namespace. ++ ++ """ ++ key = (self, uri) ++ if key in self.context.namespaces: ++ return self.context.namespaces[key] ++ else: ++ ns = TemplateNamespace(uri, self.context._copy(), ++ templateuri=uri, ++ calling_uri=self._templateuri) ++ self.context.namespaces[key] = ns ++ return ns ++ ++ def get_template(self, uri): ++ """Return a :class:`.Template` from the given ``uri``. ++ ++ The ``uri`` resolution is relative to the ``uri`` of this ++ :class:`.Namespace` object's :class:`.Template`. ++ ++ """ ++ return _lookup_template(self.context, uri, self._templateuri) ++ ++ def get_cached(self, key, **kwargs): ++ """Return a value from the :class:`.Cache` referenced by this ++ :class:`.Namespace` object's :class:`.Template`. 
++ ++ The advantage to this method versus direct access to the ++ :class:`.Cache` is that the configuration parameters ++ declared in ``<%page>`` take effect here, thereby calling ++ up the same configured backend as that configured ++ by ``<%page>``. ++ ++ """ ++ ++ return self.cache.get(key, **kwargs) ++ ++ @property ++ def cache(self): ++ """Return the :class:`.Cache` object referenced ++ by this :class:`.Namespace` object's ++ :class:`.Template`. ++ ++ """ ++ return self.template.cache ++ ++ def include_file(self, uri, **kwargs): ++ """Include a file at the given ``uri``.""" ++ ++ _include_file(self.context, uri, self._templateuri, **kwargs) ++ ++ def _populate(self, d, l): ++ for ident in l: ++ if ident == '*': ++ for (k, v) in self._get_star(): ++ d[k] = v ++ else: ++ d[ident] = getattr(self, ident) ++ ++ def _get_star(self): ++ if self.callables: ++ for key in self.callables: ++ yield (key, self.callables[key]) ++ ++ def __getattr__(self, key): ++ if key in self.callables: ++ val = self.callables[key] ++ elif self.inherits: ++ val = getattr(self.inherits, key) ++ else: ++ raise AttributeError( ++ "Namespace '%s' has no member '%s'" % ++ (self.name, key)) ++ setattr(self, key, val) ++ return val ++ ++class TemplateNamespace(Namespace): ++ """A :class:`.Namespace` specific to a :class:`.Template` instance.""" ++ ++ def __init__(self, name, context, template=None, templateuri=None, ++ callables=None, inherits=None, ++ populate_self=True, calling_uri=None): ++ self.name = name ++ self.context = context ++ self.inherits = inherits ++ if callables is not None: ++ self.callables = dict([(c.__name__, c) for c in callables]) ++ ++ if templateuri is not None: ++ self.template = _lookup_template(context, templateuri, ++ calling_uri) ++ self._templateuri = self.template.module._template_uri ++ elif template is not None: ++ self.template = template ++ self._templateuri = template.module._template_uri ++ else: ++ raise TypeError("'template' argument is required.") ++ ++ if populate_self: ++ lclcallable, lclcontext = \ ++ _populate_self_namespace(context, self.template, ++ self_ns=self) ++ ++ @property ++ def module(self): ++ """The Python module referenced by this :class:`.Namespace`. ++ ++ If the namespace references a :class:`.Template`, then ++ this module is the equivalent of ``template.module``, ++ i.e. the generated module for the template. ++ ++ """ ++ return self.template.module ++ ++ @property ++ def filename(self): ++ """The path of the filesystem file used for this ++ :class:`.Namespace`'s module or template. ++ """ ++ return self.template.filename ++ ++ @property ++ def uri(self): ++ """The URI for this :class:`.Namespace`'s template. ++ ++ I.e. whatever was sent to :meth:`.TemplateLookup.get_template()`. ++ ++ This is the equivalent of :attr:`.Template.uri`. 
++ ++ """ ++ return self.template.uri ++ ++ def _get_star(self): ++ if self.callables: ++ for key in self.callables: ++ yield (key, self.callables[key]) ++ def get(key): ++ callable_ = self.template._get_def_callable(key) ++ return compat.partial(callable_, self.context) ++ for k in self.template.module._exports: ++ yield (k, get(k)) ++ ++ def __getattr__(self, key): ++ if key in self.callables: ++ val = self.callables[key] ++ elif self.template.has_def(key): ++ callable_ = self.template._get_def_callable(key) ++ val = compat.partial(callable_, self.context) ++ elif self.inherits: ++ val = getattr(self.inherits, key) ++ ++ else: ++ raise AttributeError( ++ "Namespace '%s' has no member '%s'" % ++ (self.name, key)) ++ setattr(self, key, val) ++ return val ++ ++class ModuleNamespace(Namespace): ++ """A :class:`.Namespace` specific to a Python module instance.""" ++ ++ def __init__(self, name, context, module, ++ callables=None, inherits=None, ++ populate_self=True, calling_uri=None): ++ self.name = name ++ self.context = context ++ self.inherits = inherits ++ if callables is not None: ++ self.callables = dict([(c.__name__, c) for c in callables]) ++ ++ mod = __import__(module) ++ for token in module.split('.')[1:]: ++ mod = getattr(mod, token) ++ self.module = mod ++ ++ @property ++ def filename(self): ++ """The path of the filesystem file used for this ++ :class:`.Namespace`'s module or template. ++ """ ++ return self.module.__file__ ++ ++ def _get_star(self): ++ if self.callables: ++ for key in self.callables: ++ yield (key, self.callables[key]) ++ for key in dir(self.module): ++ if key[0] != '_': ++ callable_ = getattr(self.module, key) ++ if compat.callable(callable_): ++ yield key, compat.partial(callable_, self.context) ++ ++ ++ def __getattr__(self, key): ++ if key in self.callables: ++ val = self.callables[key] ++ elif hasattr(self.module, key): ++ callable_ = getattr(self.module, key) ++ val = compat.partial(callable_, self.context) ++ elif self.inherits: ++ val = getattr(self.inherits, key) ++ else: ++ raise AttributeError( ++ "Namespace '%s' has no member '%s'" % ++ (self.name, key)) ++ setattr(self, key, val) ++ return val ++ ++def supports_caller(func): ++ """Apply a caller_stack compatibility decorator to a plain ++ Python function. ++ ++ See the example in :ref:`namespaces_python_modules`. ++ ++ """ ++ ++ def wrap_stackframe(context, *args, **kwargs): ++ context.caller_stack._push_frame() ++ try: ++ return func(context, *args, **kwargs) ++ finally: ++ context.caller_stack._pop_frame() ++ return wrap_stackframe ++ ++def capture(context, callable_, *args, **kwargs): ++ """Execute the given template def, capturing the output into ++ a buffer. ++ ++ See the example in :ref:`namespaces_python_modules`. ++ ++ """ ++ ++ if not compat.callable(callable_): ++ raise exceptions.RuntimeException( ++ "capture() function expects a callable as " ++ "its argument (i.e. 
capture(func, *args, **kwargs))" ++ ) ++ context._push_buffer() ++ try: ++ callable_(*args, **kwargs) ++ finally: ++ buf = context._pop_buffer() ++ return buf.getvalue() ++ ++def _decorate_toplevel(fn): ++ def decorate_render(render_fn): ++ def go(context, *args, **kw): ++ def y(*args, **kw): ++ return render_fn(context, *args, **kw) ++ try: ++ y.__name__ = render_fn.__name__[7:] ++ except TypeError: ++ # < Python 2.4 ++ pass ++ return fn(y)(context, *args, **kw) ++ return go ++ return decorate_render ++ ++def _decorate_inline(context, fn): ++ def decorate_render(render_fn): ++ dec = fn(render_fn) ++ def go(*args, **kw): ++ return dec(context, *args, **kw) ++ return go ++ return decorate_render ++ ++def _include_file(context, uri, calling_uri, **kwargs): ++ """locate the template from the given uri and include it in ++ the current output.""" ++ ++ template = _lookup_template(context, uri, calling_uri) ++ (callable_, ctx) = _populate_self_namespace( ++ context._clean_inheritance_tokens(), ++ template) ++ callable_(ctx, **_kwargs_for_include(callable_, context._data, **kwargs)) ++ ++def _inherit_from(context, uri, calling_uri): ++ """called by the _inherit method in template modules to set ++ up the inheritance chain at the start of a template's ++ execution.""" ++ ++ if uri is None: ++ return None ++ template = _lookup_template(context, uri, calling_uri) ++ self_ns = context['self'] ++ ih = self_ns ++ while ih.inherits is not None: ++ ih = ih.inherits ++ lclcontext = context._locals({'next': ih}) ++ ih.inherits = TemplateNamespace("self:%s" % template.uri, ++ lclcontext, ++ template=template, ++ populate_self=False) ++ context._data['parent'] = lclcontext._data['local'] = ih.inherits ++ callable_ = getattr(template.module, '_mako_inherit', None) ++ if callable_ is not None: ++ ret = callable_(template, lclcontext) ++ if ret: ++ return ret ++ ++ gen_ns = getattr(template.module, '_mako_generate_namespaces', None) ++ if gen_ns is not None: ++ gen_ns(context) ++ return (template.callable_, lclcontext) ++ ++def _lookup_template(context, uri, relativeto): ++ lookup = context._with_template.lookup ++ if lookup is None: ++ raise exceptions.TemplateLookupException( ++ "Template '%s' has no TemplateLookup associated" % ++ context._with_template.uri) ++ uri = lookup.adjust_uri(uri, relativeto) ++ try: ++ return lookup.get_template(uri) ++ except exceptions.TopLevelLookupException: ++ raise exceptions.TemplateLookupException(str(compat.exception_as())) ++ ++def _populate_self_namespace(context, template, self_ns=None): ++ if self_ns is None: ++ self_ns = TemplateNamespace('self:%s' % template.uri, ++ context, template=template, ++ populate_self=False) ++ context._data['self'] = context._data['local'] = self_ns ++ if hasattr(template.module, '_mako_inherit'): ++ ret = template.module._mako_inherit(template, context) ++ if ret: ++ return ret ++ return (template.callable_, context) ++ ++def _render(template, callable_, args, data, as_unicode=False): ++ """create a Context and return the string ++ output of the given template and template callable.""" ++ ++ if as_unicode: ++ buf = util.FastEncodingBuffer(as_unicode=True) ++ elif template.bytestring_passthrough: ++ buf = compat.StringIO() ++ else: ++ buf = util.FastEncodingBuffer( ++ as_unicode=as_unicode, ++ encoding=template.output_encoding, ++ errors=template.encoding_errors) ++ context = Context(buf, **data) ++ context._outputting_as_unicode = as_unicode ++ context._set_with_template(template) ++ ++ _render_context(template, callable_, context, *args, 
++ **_kwargs_for_callable(callable_, data)) ++ return context._pop_buffer().getvalue() ++ ++def _kwargs_for_callable(callable_, data): ++ argspec = compat.inspect_func_args(callable_) ++ # for normal pages, **pageargs is usually present ++ if argspec[2]: ++ return data ++ ++ # for rendering defs from the top level, figure out the args ++ namedargs = argspec[0] + [v for v in argspec[1:3] if v is not None] ++ kwargs = {} ++ for arg in namedargs: ++ if arg != 'context' and arg in data and arg not in kwargs: ++ kwargs[arg] = data[arg] ++ return kwargs ++ ++def _kwargs_for_include(callable_, data, **kwargs): ++ argspec = compat.inspect_func_args(callable_) ++ namedargs = argspec[0] + [v for v in argspec[1:3] if v is not None] ++ for arg in namedargs: ++ if arg != 'context' and arg in data and arg not in kwargs: ++ kwargs[arg] = data[arg] ++ return kwargs ++ ++def _render_context(tmpl, callable_, context, *args, **kwargs): ++ import mako.template as template ++ # create polymorphic 'self' namespace for this ++ # template with possibly updated context ++ if not isinstance(tmpl, template.DefTemplate): ++ # if main render method, call from the base of the inheritance stack ++ (inherit, lclcontext) = _populate_self_namespace(context, tmpl) ++ _exec_template(inherit, lclcontext, args=args, kwargs=kwargs) ++ else: ++ # otherwise, call the actual rendering method specified ++ (inherit, lclcontext) = _populate_self_namespace(context, tmpl.parent) ++ _exec_template(callable_, context, args=args, kwargs=kwargs) ++ ++def _exec_template(callable_, context, args=None, kwargs=None): ++ """execute a rendering callable given the callable, a ++ Context, and optional explicit arguments ++ ++ the contextual Template will be located if it exists, and ++ the error handling options specified on that Template will ++ be interpreted here. 
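++
++    For example, if the owning :class:`.Template` was constructed with
++    ``format_exceptions=True``, a render-time error raised by
++    ``callable_`` is intercepted here and the rendered result becomes
++    an HTML error page rather than a propagated exception
++    (illustrative)::
++
++        Template("${foo}", format_exceptions=True).render()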
++ """ ++ template = context._with_template ++ if template is not None and \ ++ (template.format_exceptions or template.error_handler): ++ try: ++ callable_(context, *args, **kwargs) ++ except Exception: ++ _render_error(template, context, compat.exception_as()) ++ except: ++ e = sys.exc_info()[0] ++ _render_error(template, context, e) ++ else: ++ callable_(context, *args, **kwargs) ++ ++def _render_error(template, context, error): ++ if template.error_handler: ++ result = template.error_handler(context, error) ++ if not result: ++ compat.reraise(*sys.exc_info()) ++ else: ++ error_template = exceptions.html_error_template() ++ if context._outputting_as_unicode: ++ context._buffer_stack[:] = [ ++ util.FastEncodingBuffer(as_unicode=True)] ++ else: ++ context._buffer_stack[:] = [util.FastEncodingBuffer( ++ error_template.output_encoding, ++ error_template.encoding_errors)] ++ ++ context._set_with_template(error_template) ++ error_template.render_context(context, error=error) +diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/template.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/template.py +new file mode 100644 +index 0000000..fb61062 +--- /dev/null ++++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/template.py +@@ -0,0 +1,705 @@ ++# mako/template.py ++# Copyright (C) 2006-2015 the Mako authors and contributors ++# ++# This module is part of Mako and is released under ++# the MIT License: http://www.opensource.org/licenses/mit-license.php ++ ++"""Provides the Template class, a facade for parsing, generating and executing ++template strings, as well as template runtime operations.""" ++ ++from mako.lexer import Lexer ++from mako import runtime, util, exceptions, codegen, cache, compat ++import os ++import re ++import shutil ++import stat ++import sys ++import tempfile ++import types ++import weakref ++ ++ ++class Template(object): ++ """Represents a compiled template. ++ ++ :class:`.Template` includes a reference to the original ++ template source (via the :attr:`.source` attribute) ++ as well as the source code of the ++ generated Python module (i.e. the :attr:`.code` attribute), ++ as well as a reference to an actual Python module. ++ ++ :class:`.Template` is constructed using either a literal string ++ representing the template text, or a filename representing a filesystem ++ path to a source file. ++ ++ :param text: textual template source. This argument is mutually ++ exclusive versus the ``filename`` parameter. ++ ++ :param filename: filename of the source template. This argument is ++ mutually exclusive versus the ``text`` parameter. ++ ++ :param buffer_filters: string list of filters to be applied ++ to the output of ``%def``\ s which are buffered, cached, or otherwise ++ filtered, after all filters ++ defined with the ``%def`` itself have been applied. Allows the ++ creation of default expression filters that let the output ++ of return-valued ``%def``\ s "opt out" of that filtering via ++ passing special attributes or objects. ++ ++ :param bytestring_passthrough: When ``True``, and ``output_encoding`` is ++ set to ``None``, and :meth:`.Template.render` is used to render, ++ the `StringIO` or `cStringIO` buffer will be used instead of the ++ default "fast" buffer. This allows raw bytestrings in the ++ output stream, such as in expressions, to pass straight ++ through to the buffer. This flag is forced ++ to ``True`` if ``disable_unicode`` is also configured. ++ ++ .. versionadded:: 0.4 ++ Added to provide the same behavior as that of the previous series. 
++ ++ :param cache_args: Dictionary of cache configuration arguments that ++ will be passed to the :class:`.CacheImpl`. See :ref:`caching_toplevel`. ++ ++ :param cache_dir: ++ ++ .. deprecated:: 0.6 ++ Use the ``'dir'`` argument in the ``cache_args`` dictionary. ++ See :ref:`caching_toplevel`. ++ ++ :param cache_enabled: Boolean flag which enables caching of this ++ template. See :ref:`caching_toplevel`. ++ ++ :param cache_impl: String name of a :class:`.CacheImpl` caching ++ implementation to use. Defaults to ``'beaker'``. ++ ++ :param cache_type: ++ ++ .. deprecated:: 0.6 ++ Use the ``'type'`` argument in the ``cache_args`` dictionary. ++ See :ref:`caching_toplevel`. ++ ++ :param cache_url: ++ ++ .. deprecated:: 0.6 ++ Use the ``'url'`` argument in the ``cache_args`` dictionary. ++ See :ref:`caching_toplevel`. ++ ++ :param default_filters: List of string filter names that will ++ be applied to all expressions. See :ref:`filtering_default_filters`. ++ ++ :param disable_unicode: Disables all awareness of Python Unicode ++ objects. See :ref:`unicode_disabled`. ++ ++ :param enable_loop: When ``True``, enable the ``loop`` context variable. ++ This can be set to ``False`` to support templates that may ++ be making use of the name "``loop``". Individual templates can ++ re-enable the "loop" context by placing the directive ++ ``enable_loop="True"`` inside the ``<%page>`` tag -- see ++ :ref:`migrating_loop`. ++ ++ :param encoding_errors: Error parameter passed to ``encode()`` when ++ string encoding is performed. See :ref:`usage_unicode`. ++ ++ :param error_handler: Python callable which is called whenever ++ compile or runtime exceptions occur. The callable is passed ++ the current context as well as the exception. If the ++ callable returns ``True``, the exception is considered to ++ be handled, else it is re-raised after the function ++ completes. It is used to provide custom error-rendering ++ functions. ++ ++ :param format_exceptions: if ``True``, exceptions which occur during ++ the render phase of this template will be caught and ++ formatted into an HTML error page, which then becomes the ++ rendered result of the :meth:`.render` call. Otherwise, ++ runtime exceptions are propagated outwards. ++ ++ :param imports: String list of Python statements, typically individual ++ "import" lines, which will be placed into the module level ++ preamble of all generated Python modules. See the example ++ in :ref:`filtering_default_filters`. ++ ++ :param future_imports: String list of names to import from `__future__`. ++ These will be concatenated into a comma-separated string and inserted ++ into the beginning of the template, e.g. ``future_imports=['FOO', ++ 'BAR']`` results in ``from __future__ import FOO, BAR``. If you're ++ interested in using features like the new division operator, you must ++ use future_imports to convey that to the renderer, as otherwise the ++ import will not appear as the first executed statement in the generated ++ code and will therefore not have the desired effect. ++ ++ :param input_encoding: Encoding of the template's source code. Can ++ be used in lieu of the coding comment. See ++ :ref:`usage_unicode` as well as :ref:`unicode_toplevel` for ++ details on source encoding. ++ ++ :param lookup: a :class:`.TemplateLookup` instance that will be used ++ for all file lookups via the ``<%namespace>``, ++ ``<%include>``, and ``<%inherit>`` tags. See ++ :ref:`usage_templatelookup`.
++ ++ :param module_directory: Filesystem location where generated ++ Python module files will be placed. ++ ++ :param module_filename: Overrides the filename of the generated ++ Python module file. For advanced usage only. ++ ++ :param module_writer: A callable which overrides how the Python ++ module is written entirely. The callable is passed the ++ encoded source content of the module and the destination ++ path to be written to. The default behavior of module writing ++ uses a tempfile in conjunction with a file move in order ++ to make the operation atomic. So a user-defined module ++ writing function that mimics the default behavior would be: ++ ++ .. sourcecode:: python ++ ++ import tempfile ++ import os ++ import shutil ++ ++ def module_writer(source, outputpath): ++ (dest, name) = \\ ++ tempfile.mkstemp( ++ dir=os.path.dirname(outputpath) ++ ) ++ ++ os.write(dest, source) ++ os.close(dest) ++ shutil.move(name, outputpath) ++ ++ from mako.template import Template ++ mytemplate = Template( ++ filename="index.html", ++ module_directory="/path/to/modules", ++ module_writer=module_writer ++ ) ++ ++ The function is provided for unusual configurations where ++ certain platform-specific permissions or other special ++ steps are needed. ++ ++ :param output_encoding: The encoding to use when :meth:`.render` ++ is called. ++ See :ref:`usage_unicode` as well as :ref:`unicode_toplevel`. ++ ++ :param preprocessor: Python callable which will be passed ++ the full template source before it is parsed. The return ++ result of the callable will be used as the template source ++ code. ++ ++ :param lexer_cls: A :class:`.Lexer` class used to parse ++ the template. The :class:`.Lexer` class is used by ++ default. ++ ++ .. versionadded:: 0.7.4 ++ ++ :param strict_undefined: Replaces the automatic usage of ++ ``UNDEFINED`` for any undeclared variables not located in ++ the :class:`.Context` with an immediate raise of ++ ``NameError``. The advantage is immediate reporting of ++ missing variables which include the name. ++ ++ .. versionadded:: 0.3.6 ++ ++ :param uri: string URI or other identifier for this template. ++ If not provided, the ``uri`` is generated from the filesystem ++ path, or from the in-memory identity of a non-file-based ++ template. The primary usage of the ``uri`` is to provide a key ++ within :class:`.TemplateLookup`, as well as to generate the ++ file path of the generated Python module file, if ++ ``module_directory`` is specified. 
++ ++ """ ++ ++ lexer_cls = Lexer ++ ++ def __init__(self, ++ text=None, ++ filename=None, ++ uri=None, ++ format_exceptions=False, ++ error_handler=None, ++ lookup=None, ++ output_encoding=None, ++ encoding_errors='strict', ++ module_directory=None, ++ cache_args=None, ++ cache_impl='beaker', ++ cache_enabled=True, ++ cache_type=None, ++ cache_dir=None, ++ cache_url=None, ++ module_filename=None, ++ input_encoding=None, ++ disable_unicode=False, ++ module_writer=None, ++ bytestring_passthrough=False, ++ default_filters=None, ++ buffer_filters=(), ++ strict_undefined=False, ++ imports=None, ++ future_imports=None, ++ enable_loop=True, ++ preprocessor=None, ++ lexer_cls=None): ++ if uri: ++ self.module_id = re.sub(r'\W', "_", uri) ++ self.uri = uri ++ elif filename: ++ self.module_id = re.sub(r'\W', "_", filename) ++ drive, path = os.path.splitdrive(filename) ++ path = os.path.normpath(path).replace(os.path.sep, "/") ++ self.uri = path ++ else: ++ self.module_id = "memory:" + hex(id(self)) ++ self.uri = self.module_id ++ ++ u_norm = self.uri ++ if u_norm.startswith("/"): ++ u_norm = u_norm[1:] ++ u_norm = os.path.normpath(u_norm) ++ if u_norm.startswith(".."): ++ raise exceptions.TemplateLookupException( ++ "Template uri \"%s\" is invalid - " ++ "it cannot be relative outside " ++ "of the root path." % self.uri) ++ ++ self.input_encoding = input_encoding ++ self.output_encoding = output_encoding ++ self.encoding_errors = encoding_errors ++ self.disable_unicode = disable_unicode ++ self.bytestring_passthrough = bytestring_passthrough or disable_unicode ++ self.enable_loop = enable_loop ++ self.strict_undefined = strict_undefined ++ self.module_writer = module_writer ++ ++ if compat.py3k and disable_unicode: ++ raise exceptions.UnsupportedError( ++ "Mako for Python 3 does not " ++ "support disabling Unicode") ++ elif output_encoding and disable_unicode: ++ raise exceptions.UnsupportedError( ++ "output_encoding must be set to " ++ "None when disable_unicode is used.") ++ if default_filters is None: ++ if compat.py3k or self.disable_unicode: ++ self.default_filters = ['str'] ++ else: ++ self.default_filters = ['unicode'] ++ else: ++ self.default_filters = default_filters ++ self.buffer_filters = buffer_filters ++ ++ self.imports = imports ++ self.future_imports = future_imports ++ self.preprocessor = preprocessor ++ ++ if lexer_cls is not None: ++ self.lexer_cls = lexer_cls ++ ++ # if plain text, compile code in memory only ++ if text is not None: ++ (code, module) = _compile_text(self, text, filename) ++ self._code = code ++ self._source = text ++ ModuleInfo(module, None, self, filename, code, text) ++ elif filename is not None: ++ # if template filename and a module directory, load ++ # a filesystem-based module file, generating if needed ++ if module_filename is not None: ++ path = module_filename ++ elif module_directory is not None: ++ path = os.path.abspath( ++ os.path.join( ++ os.path.normpath(module_directory), ++ u_norm + ".py" ++ ) ++ ) ++ else: ++ path = None ++ module = self._compile_from_file(path, filename) ++ else: ++ raise exceptions.RuntimeException( ++ "Template requires text or filename") ++ ++ self.module = module ++ self.filename = filename ++ self.callable_ = self.module.render_body ++ self.format_exceptions = format_exceptions ++ self.error_handler = error_handler ++ self.lookup = lookup ++ ++ self.module_directory = module_directory ++ ++ self._setup_cache_args( ++ cache_impl, cache_enabled, cache_args, ++ cache_type, cache_dir, cache_url ++ ) ++ ++ ++ 
@util.memoized_property ++ def reserved_names(self): ++ if self.enable_loop: ++ return codegen.RESERVED_NAMES ++ else: ++ return codegen.RESERVED_NAMES.difference(['loop']) ++ ++ def _setup_cache_args(self, ++ cache_impl, cache_enabled, cache_args, ++ cache_type, cache_dir, cache_url): ++ self.cache_impl = cache_impl ++ self.cache_enabled = cache_enabled ++ if cache_args: ++ self.cache_args = cache_args ++ else: ++ self.cache_args = {} ++ ++ # transfer deprecated cache_* args ++ if cache_type: ++ self.cache_args['type'] = cache_type ++ if cache_dir: ++ self.cache_args['dir'] = cache_dir ++ if cache_url: ++ self.cache_args['url'] = cache_url ++ ++ def _compile_from_file(self, path, filename): ++ if path is not None: ++ util.verify_directory(os.path.dirname(path)) ++ filemtime = os.stat(filename)[stat.ST_MTIME] ++ if not os.path.exists(path) or \ ++ os.stat(path)[stat.ST_MTIME] < filemtime: ++ data = util.read_file(filename) ++ _compile_module_file( ++ self, ++ data, ++ filename, ++ path, ++ self.module_writer) ++ module = compat.load_module(self.module_id, path) ++ del sys.modules[self.module_id] ++ if module._magic_number != codegen.MAGIC_NUMBER: ++ data = util.read_file(filename) ++ _compile_module_file( ++ self, ++ data, ++ filename, ++ path, ++ self.module_writer) ++ module = compat.load_module(self.module_id, path) ++ del sys.modules[self.module_id] ++ ModuleInfo(module, path, self, filename, None, None) ++ else: ++ # template filename and no module directory, compile code ++ # in memory ++ data = util.read_file(filename) ++ code, module = _compile_text( ++ self, ++ data, ++ filename) ++ self._source = None ++ self._code = code ++ ModuleInfo(module, None, self, filename, code, None) ++ return module ++ ++ @property ++ def source(self): ++ """Return the template source code for this :class:`.Template`.""" ++ ++ return _get_module_info_from_callable(self.callable_).source ++ ++ @property ++ def code(self): ++ """Return the module source code for this :class:`.Template`.""" ++ ++ return _get_module_info_from_callable(self.callable_).code ++ ++ @util.memoized_property ++ def cache(self): ++ return cache.Cache(self) ++ ++ @property ++ def cache_dir(self): ++ return self.cache_args['dir'] ++ @property ++ def cache_url(self): ++ return self.cache_args['url'] ++ @property ++ def cache_type(self): ++ return self.cache_args['type'] ++ ++ def render(self, *args, **data): ++ """Render the output of this template as a string. ++ ++ If the template specifies an output encoding, the string ++ will be encoded accordingly, else the output is raw (raw ++ output uses `cStringIO` and can't handle multibyte ++ characters). A :class:`.Context` object is created corresponding ++ to the given data. Arguments that are explicitly declared ++ by this template's internal rendering method are also ++ pulled from the given ``*args``, ``**data`` members. ++ ++ """ ++ return runtime._render(self, self.callable_, args, data) ++ ++ def render_unicode(self, *args, **data): ++ """Render the output of this template as a unicode object.""" ++ ++ return runtime._render(self, ++ self.callable_, ++ args, ++ data, ++ as_unicode=True) ++ ++ def render_context(self, context, *args, **kwargs): ++ """Render this :class:`.Template` with the given context. ++ ++ The data is written to the context's buffer. 
++ ++ """ ++ if getattr(context, '_with_template', None) is None: ++ context._set_with_template(self) ++ runtime._render_context(self, ++ self.callable_, ++ context, ++ *args, ++ **kwargs) ++ ++ def has_def(self, name): ++ return hasattr(self.module, "render_%s" % name) ++ ++ def get_def(self, name): ++ """Return a def of this template as a :class:`.DefTemplate`.""" ++ ++ return DefTemplate(self, getattr(self.module, "render_%s" % name)) ++ ++ def _get_def_callable(self, name): ++ return getattr(self.module, "render_%s" % name) ++ ++ @property ++ def last_modified(self): ++ return self.module._modified_time ++ ++class ModuleTemplate(Template): ++ """A Template which is constructed given an existing Python module. ++ ++ e.g.:: ++ ++ t = Template("this is a template") ++ f = file("mymodule.py", "w") ++ f.write(t.code) ++ f.close() ++ ++ import mymodule ++ ++ t = ModuleTemplate(mymodule) ++ print t.render() ++ ++ """ ++ ++ def __init__(self, module, ++ module_filename=None, ++ template=None, ++ template_filename=None, ++ module_source=None, ++ template_source=None, ++ output_encoding=None, ++ encoding_errors='strict', ++ disable_unicode=False, ++ bytestring_passthrough=False, ++ format_exceptions=False, ++ error_handler=None, ++ lookup=None, ++ cache_args=None, ++ cache_impl='beaker', ++ cache_enabled=True, ++ cache_type=None, ++ cache_dir=None, ++ cache_url=None, ++ ): ++ self.module_id = re.sub(r'\W', "_", module._template_uri) ++ self.uri = module._template_uri ++ self.input_encoding = module._source_encoding ++ self.output_encoding = output_encoding ++ self.encoding_errors = encoding_errors ++ self.disable_unicode = disable_unicode ++ self.bytestring_passthrough = bytestring_passthrough or disable_unicode ++ self.enable_loop = module._enable_loop ++ ++ if compat.py3k and disable_unicode: ++ raise exceptions.UnsupportedError( ++ "Mako for Python 3 does not " ++ "support disabling Unicode") ++ elif output_encoding and disable_unicode: ++ raise exceptions.UnsupportedError( ++ "output_encoding must be set to " ++ "None when disable_unicode is used.") ++ ++ self.module = module ++ self.filename = template_filename ++ ModuleInfo(module, ++ module_filename, ++ self, ++ template_filename, ++ module_source, ++ template_source) ++ ++ self.callable_ = self.module.render_body ++ self.format_exceptions = format_exceptions ++ self.error_handler = error_handler ++ self.lookup = lookup ++ self._setup_cache_args( ++ cache_impl, cache_enabled, cache_args, ++ cache_type, cache_dir, cache_url ++ ) ++ ++class DefTemplate(Template): ++ """A :class:`.Template` which represents a callable def in a parent ++ template.""" ++ ++ def __init__(self, parent, callable_): ++ self.parent = parent ++ self.callable_ = callable_ ++ self.output_encoding = parent.output_encoding ++ self.module = parent.module ++ self.encoding_errors = parent.encoding_errors ++ self.format_exceptions = parent.format_exceptions ++ self.error_handler = parent.error_handler ++ self.enable_loop = parent.enable_loop ++ self.lookup = parent.lookup ++ self.bytestring_passthrough = parent.bytestring_passthrough ++ ++ def get_def(self, name): ++ return self.parent.get_def(name) ++ ++class ModuleInfo(object): ++ """Stores information about a module currently loaded into ++ memory, provides reverse lookups of template source, module ++ source code based on a module's identifier. 
++ ++ """ ++ _modules = weakref.WeakValueDictionary() ++ ++ def __init__(self, ++ module, ++ module_filename, ++ template, ++ template_filename, ++ module_source, ++ template_source): ++ self.module = module ++ self.module_filename = module_filename ++ self.template_filename = template_filename ++ self.module_source = module_source ++ self.template_source = template_source ++ self._modules[module.__name__] = template._mmarker = self ++ if module_filename: ++ self._modules[module_filename] = self ++ ++ @classmethod ++ def get_module_source_metadata(cls, module_source, full_line_map=False): ++ source_map = re.search( ++ r"__M_BEGIN_METADATA(.+?)__M_END_METADATA", ++ module_source, re.S).group(1) ++ source_map = compat.json.loads(source_map) ++ source_map['line_map'] = dict((int(k), int(v)) ++ for k, v in source_map['line_map'].items()) ++ if full_line_map: ++ f_line_map = source_map['full_line_map'] = [] ++ line_map = source_map['line_map'] ++ ++ curr_templ_line = 1 ++ for mod_line in range(1, max(line_map)): ++ if mod_line in line_map: ++ curr_templ_line = line_map[mod_line] ++ f_line_map.append(curr_templ_line) ++ return source_map ++ ++ @property ++ def code(self): ++ if self.module_source is not None: ++ return self.module_source ++ else: ++ return util.read_python_file(self.module_filename) ++ ++ @property ++ def source(self): ++ if self.template_source is not None: ++ if self.module._source_encoding and \ ++ not isinstance(self.template_source, compat.text_type): ++ return self.template_source.decode( ++ self.module._source_encoding) ++ else: ++ return self.template_source ++ else: ++ data = util.read_file(self.template_filename) ++ if self.module._source_encoding: ++ return data.decode(self.module._source_encoding) ++ else: ++ return data ++ ++def _compile(template, text, filename, generate_magic_comment): ++ lexer = template.lexer_cls(text, ++ filename, ++ disable_unicode=template.disable_unicode, ++ input_encoding=template.input_encoding, ++ preprocessor=template.preprocessor) ++ node = lexer.parse() ++ source = codegen.compile(node, ++ template.uri, ++ filename, ++ default_filters=template.default_filters, ++ buffer_filters=template.buffer_filters, ++ imports=template.imports, ++ future_imports=template.future_imports, ++ source_encoding=lexer.encoding, ++ generate_magic_comment=generate_magic_comment, ++ disable_unicode=template.disable_unicode, ++ strict_undefined=template.strict_undefined, ++ enable_loop=template.enable_loop, ++ reserved_names=template.reserved_names) ++ return source, lexer ++ ++def _compile_text(template, text, filename): ++ identifier = template.module_id ++ source, lexer = _compile(template, text, filename, ++ generate_magic_comment=template.disable_unicode) ++ ++ cid = identifier ++ if not compat.py3k and isinstance(cid, compat.text_type): ++ cid = cid.encode() ++ module = types.ModuleType(cid) ++ code = compile(source, cid, 'exec') ++ ++ # this exec() works for 2.4->3.3. ++ exec(code, module.__dict__, module.__dict__) ++ return (source, module) ++ ++def _compile_module_file(template, text, filename, outputpath, module_writer): ++ source, lexer = _compile(template, text, filename, ++ generate_magic_comment=True) ++ ++ if isinstance(source, compat.text_type): ++ source = source.encode(lexer.encoding or 'ascii') ++ ++ if module_writer: ++ module_writer(source, outputpath) ++ else: ++ # make tempfiles in the same location as the ultimate ++ # location. this ensures they're on the same filesystem, ++ # avoiding synchronization issues. 
++ (dest, name) = tempfile.mkstemp(dir=os.path.dirname(outputpath)) ++ ++ os.write(dest, source) ++ os.close(dest) ++ shutil.move(name, outputpath) ++ ++def _get_module_info_from_callable(callable_): ++ if compat.py3k: ++ return _get_module_info(callable_.__globals__['__name__']) ++ else: ++ return _get_module_info(callable_.func_globals['__name__']) ++ ++def _get_module_info(filename): ++ return ModuleInfo._modules[filename] ++ +diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/util.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/util.py +new file mode 100644 +index 0000000..cba2ab7 +--- /dev/null ++++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/util.py +@@ -0,0 +1,360 @@ ++# mako/util.py ++# Copyright (C) 2006-2015 the Mako authors and contributors ++# ++# This module is part of Mako and is released under ++# the MIT License: http://www.opensource.org/licenses/mit-license.php ++ ++import re ++import collections ++import codecs ++import os ++from mako import compat ++import operator ++ ++def update_wrapper(decorated, fn): ++ decorated.__wrapped__ = fn ++ decorated.__name__ = fn.__name__ ++ return decorated ++ ++ ++class PluginLoader(object): ++ def __init__(self, group): ++ self.group = group ++ self.impls = {} ++ ++ def load(self, name): ++ if name in self.impls: ++ return self.impls[name]() ++ else: ++ import pkg_resources ++ for impl in pkg_resources.iter_entry_points( ++ self.group, ++ name): ++ self.impls[name] = impl.load ++ return impl.load() ++ else: ++ from mako import exceptions ++ raise exceptions.RuntimeException( ++ "Can't load plugin %s %s" % ++ (self.group, name)) ++ ++ def register(self, name, modulepath, objname): ++ def load(): ++ mod = __import__(modulepath) ++ for token in modulepath.split(".")[1:]: ++ mod = getattr(mod, token) ++ return getattr(mod, objname) ++ self.impls[name] = load ++ ++def verify_directory(dir): ++ """create and/or verify a filesystem directory.""" ++ ++ tries = 0 ++ ++ while not os.path.exists(dir): ++ try: ++ tries += 1 ++ os.makedirs(dir, compat.octal("0775")) ++ except: ++ if tries > 5: ++ raise ++ ++def to_list(x, default=None): ++ if x is None: ++ return default ++ if not isinstance(x, (list, tuple)): ++ return [x] ++ else: ++ return x ++ ++ ++class memoized_property(object): ++ """A read-only @property that is only evaluated once.""" ++ def __init__(self, fget, doc=None): ++ self.fget = fget ++ self.__doc__ = doc or fget.__doc__ ++ self.__name__ = fget.__name__ ++ ++ def __get__(self, obj, cls): ++ if obj is None: ++ return self ++ obj.__dict__[self.__name__] = result = self.fget(obj) ++ return result ++ ++class memoized_instancemethod(object): ++ """Decorate a method to memoize its return value. ++ ++ Best applied to no-arg methods: memoization is not sensitive to ++ argument values, and will always return the same value even when ++ called with different arguments.
++ ++ """ ++ def __init__(self, fget, doc=None): ++ self.fget = fget ++ self.__doc__ = doc or fget.__doc__ ++ self.__name__ = fget.__name__ ++ ++ def __get__(self, obj, cls): ++ if obj is None: ++ return self ++ def oneshot(*args, **kw): ++ result = self.fget(obj, *args, **kw) ++ memo = lambda *a, **kw: result ++ memo.__name__ = self.__name__ ++ memo.__doc__ = self.__doc__ ++ obj.__dict__[self.__name__] = memo ++ return result ++ oneshot.__name__ = self.__name__ ++ oneshot.__doc__ = self.__doc__ ++ return oneshot ++ ++class SetLikeDict(dict): ++ """a dictionary that has some setlike methods on it""" ++ def union(self, other): ++ """produce a 'union' of this dict and another (at the key level). ++ ++ values in the second dict take precedence over that of the first""" ++ x = SetLikeDict(**self) ++ x.update(other) ++ return x ++ ++class FastEncodingBuffer(object): ++ """a very rudimentary buffer that is faster than StringIO, ++ but doesn't crash on unicode data like cStringIO.""" ++ ++ def __init__(self, encoding=None, errors='strict', as_unicode=False): ++ self.data = collections.deque() ++ self.encoding = encoding ++ if as_unicode: ++ self.delim = compat.u('') ++ else: ++ self.delim = '' ++ self.as_unicode = as_unicode ++ self.errors = errors ++ self.write = self.data.append ++ ++ def truncate(self): ++ self.data = collections.deque() ++ self.write = self.data.append ++ ++ def getvalue(self): ++ if self.encoding: ++ return self.delim.join(self.data).encode(self.encoding, ++ self.errors) ++ else: ++ return self.delim.join(self.data) ++ ++class LRUCache(dict): ++ """A dictionary-like object that stores a limited number of items, ++ discarding lesser used items periodically. ++ ++ this is a rewrite of LRUCache from Myghty to use a periodic timestamp-based ++ paradigm so that synchronization is not really needed. the size management ++ is inexact. ++ """ ++ ++ class _Item(object): ++ def __init__(self, key, value): ++ self.key = key ++ self.value = value ++ self.timestamp = compat.time_func() ++ def __repr__(self): ++ return repr(self.value) ++ ++ def __init__(self, capacity, threshold=.5): ++ self.capacity = capacity ++ self.threshold = threshold ++ ++ def __getitem__(self, key): ++ item = dict.__getitem__(self, key) ++ item.timestamp = compat.time_func() ++ return item.value ++ ++ def values(self): ++ return [i.value for i in dict.values(self)] ++ ++ def setdefault(self, key, value): ++ if key in self: ++ return self[key] ++ else: ++ self[key] = value ++ return value ++ ++ def __setitem__(self, key, value): ++ item = dict.get(self, key) ++ if item is None: ++ item = self._Item(key, value) ++ dict.__setitem__(self, key, item) ++ else: ++ item.value = value ++ self._manage_size() ++ ++ def _manage_size(self): ++ while len(self) > self.capacity + self.capacity * self.threshold: ++ bytime = sorted(dict.values(self), ++ key=operator.attrgetter('timestamp'), reverse=True) ++ for item in bytime[self.capacity:]: ++ try: ++ del self[item.key] ++ except KeyError: ++ # if we couldn't find a key, most likely some other thread ++ # broke in on us. loop around and try again ++ break ++ ++# Regexp to match python magic encoding line ++_PYTHON_MAGIC_COMMENT_re = re.compile( ++ r'[ \t\f]* \# .* coding[=:][ \t]*([-\w.]+)', ++ re.VERBOSE) ++ ++def parse_encoding(fp): ++ """Deduce the encoding of a Python source file (binary mode) from magic ++ comment. ++ ++ It does this in the same way as the `Python interpreter`__ ++ ++ .. 
__: http://docs.python.org/ref/encodings.html ++ ++ The ``fp`` argument should be a seekable file object in binary mode. ++ """ ++ pos = fp.tell() ++ fp.seek(0) ++ try: ++ line1 = fp.readline() ++ has_bom = line1.startswith(codecs.BOM_UTF8) ++ if has_bom: ++ line1 = line1[len(codecs.BOM_UTF8):] ++ ++ m = _PYTHON_MAGIC_COMMENT_re.match(line1.decode('ascii', 'ignore')) ++ if not m: ++ try: ++ import parser ++ parser.suite(line1.decode('ascii', 'ignore')) ++ except (ImportError, SyntaxError): ++ # Either it's a real syntax error, in which case the source ++ # is not valid python source, or line2 is a continuation of ++ # line1, in which case we don't want to scan line2 for a magic ++ # comment. ++ pass ++ else: ++ line2 = fp.readline() ++ m = _PYTHON_MAGIC_COMMENT_re.match( ++ line2.decode('ascii', 'ignore')) ++ ++ if has_bom: ++ if m: ++ raise SyntaxError("python refuses to compile code with both a UTF8" \ ++ " byte-order-mark and a magic encoding comment") ++ return 'utf_8' ++ elif m: ++ return m.group(1) ++ else: ++ return None ++ finally: ++ fp.seek(pos) ++ ++def sorted_dict_repr(d): ++ """repr() a dictionary with the keys in order. ++ ++ Used by the lexer unit test to compare parse trees based on strings. ++ ++ """ ++ keys = list(d.keys()) ++ keys.sort() ++ return "{" + ", ".join(["%r: %r" % (k, d[k]) for k in keys]) + "}" ++ ++def restore__ast(_ast): ++ """Attempt to restore the required classes to the _ast module if it ++ appears to be missing them ++ """ ++ if hasattr(_ast, 'AST'): ++ return ++ _ast.PyCF_ONLY_AST = 2 << 9 ++ m = compile("""\ ++def foo(): pass ++class Bar(object): pass ++if False: pass ++baz = 'mako' ++1 + 2 - 3 * 4 / 5 ++6 // 7 % 8 << 9 >> 10 ++11 & 12 ^ 13 | 14 ++15 and 16 or 17 ++-baz + (not +18) - ~17 ++baz and 'foo' or 'bar' ++(mako is baz == baz) is not baz != mako ++mako > baz < mako >= baz <= mako ++mako in baz not in mako""", '', 'exec', _ast.PyCF_ONLY_AST) ++ _ast.Module = type(m) ++ ++ for cls in _ast.Module.__mro__: ++ if cls.__name__ == 'mod': ++ _ast.mod = cls ++ elif cls.__name__ == 'AST': ++ _ast.AST = cls ++ ++ _ast.FunctionDef = type(m.body[0]) ++ _ast.ClassDef = type(m.body[1]) ++ _ast.If = type(m.body[2]) ++ ++ _ast.Name = type(m.body[3].targets[0]) ++ _ast.Store = type(m.body[3].targets[0].ctx) ++ _ast.Str = type(m.body[3].value) ++ ++ _ast.Sub = type(m.body[4].value.op) ++ _ast.Add = type(m.body[4].value.left.op) ++ _ast.Div = type(m.body[4].value.right.op) ++ _ast.Mult = type(m.body[4].value.right.left.op) ++ ++ _ast.RShift = type(m.body[5].value.op) ++ _ast.LShift = type(m.body[5].value.left.op) ++ _ast.Mod = type(m.body[5].value.left.left.op) ++ _ast.FloorDiv = type(m.body[5].value.left.left.left.op) ++ ++ _ast.BitOr = type(m.body[6].value.op) ++ _ast.BitXor = type(m.body[6].value.left.op) ++ _ast.BitAnd = type(m.body[6].value.left.left.op) ++ ++ _ast.Or = type(m.body[7].value.op) ++ _ast.And = type(m.body[7].value.values[0].op) ++ ++ _ast.Invert = type(m.body[8].value.right.op) ++ _ast.Not = type(m.body[8].value.left.right.op) ++ _ast.UAdd = type(m.body[8].value.left.right.operand.op) ++ _ast.USub = type(m.body[8].value.left.left.op) ++ ++ _ast.Or = type(m.body[9].value.op) ++ _ast.And = type(m.body[9].value.values[0].op) ++ ++ _ast.IsNot = type(m.body[10].value.ops[0]) ++ _ast.NotEq = type(m.body[10].value.ops[1]) ++ _ast.Is = type(m.body[10].value.left.ops[0]) ++ _ast.Eq = type(m.body[10].value.left.ops[1]) ++ ++ _ast.Gt = type(m.body[11].value.ops[0]) ++ _ast.Lt = type(m.body[11].value.ops[1]) ++ _ast.GtE = type(m.body[11].value.ops[2]) ++ 
_ast.LtE = type(m.body[11].value.ops[3]) ++ ++ _ast.In = type(m.body[12].value.ops[0]) ++ _ast.NotIn = type(m.body[12].value.ops[1]) ++ ++ ++ ++def read_file(path, mode='rb'): ++ fp = open(path, mode) ++ try: ++ data = fp.read() ++ return data ++ finally: ++ fp.close() ++ ++def read_python_file(path): ++ fp = open(path, "rb") ++ try: ++ encoding = parse_encoding(fp) ++ data = fp.read() ++ if encoding: ++ data = data.decode(encoding) ++ return data ++ finally: ++ fp.close() ++ +diff --git a/src/gallium/drivers/swr/rasterizer/scripts/templates/knobs.template b/src/gallium/drivers/swr/rasterizer/scripts/templates/knobs.template +new file mode 100644 +index 0000000..5fbba17 +--- /dev/null ++++ b/src/gallium/drivers/swr/rasterizer/scripts/templates/knobs.template +@@ -0,0 +1,106 @@ ++/****************************************************************************** ++* ++* Copyright 2015 ++* Intel Corporation ++* ++* Licensed under the Apache License, Version 2.0 (the "License"); ++* you may not use this file except in compliance with the License. ++* You may obtain a copy of the License at ++* ++* http://www.apache.org/licenses/LICENSE-2.0 ++* ++* Unless required by applicable law or agreed to in writing, software ++* distributed under the License is distributed on an "AS IS" BASIS, ++* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++* See the License for the specific language governing permissions and ++* limitations under the License. ++* ++% if gen_header: ++* @file ${filename}.h ++% else: ++* @file ${filename}.cpp ++% endif ++* ++* @brief Dynamic Knobs for Core. ++* ++* ======================= AUTO GENERATED: DO NOT EDIT !!! ==================== ++* ++******************************************************************************/ ++%if gen_header: ++#pragma once ++ ++template <typename T> ++struct Knob ++{ ++ const T& Value() const { return m_Value; } ++ const T& Value(const T& newValue) { m_Value = newValue; return Value(); } ++ ++private: ++ T m_Value; ++}; ++ ++#define DEFINE_KNOB(_name, _type, _default) \\ ++ ++ struct Knob_##_name : Knob<_type> \\ ++ ++ { Knob_##_name() { Value(_default); } \\ ++ ++ const char* Name() const { return "KNOB_" #_name; } \\ ++ ++ } _name; ++ ++#define GET_KNOB(_name) g_GlobalKnobs._name.Value() ++#define SET_KNOB(_name, _newValue) g_GlobalKnobs._name.Value(_newValue) ++ ++struct GlobalKnobs ++{ ++ % for knob in knobs: ++ //----------------------------------------------------------- ++ // KNOB_${knob[0]} ++ // ++ % for line in knob[1]['desc']: ++ // ${line} ++ % endfor ++ DEFINE_KNOB(${knob[0]}, ${knob[1]['type']}, ${knob[1]['default']}); ++ ++ % endfor ++ GlobalKnobs(); ++}; ++extern GlobalKnobs g_GlobalKnobs; ++ ++<% ++ max_len = 0 ++ for knob in knobs: ++ if len(knob[0]) > max_len: max_len = len(knob[0]) ++ max_len += len('KNOB_ ') ++ if max_len % 4: max_len += 4 - (max_len % 4) ++ ++ def space_knob(knob): ++ knob_len = len('KNOB_' + knob) ++ return ' '*(max_len - knob_len) ++%> ++% for knob in knobs: ++#define KNOB_${knob[0]}${space_knob(knob[0])}GET_KNOB(${knob[0]}) ++% endfor ++ ++% else: ++% for inc in includes: ++#include <${inc}> ++% endfor ++ ++//======================================================== ++// Static Data Members ++//======================================================== ++GlobalKnobs g_GlobalKnobs; ++ ++//======================================================== ++// Knob Initialization ++//======================================================== ++GlobalKnobs::GlobalKnobs() ++{ ++ % for knob in knobs: ++
InitKnob(${knob[0]}); ++ % endfor ++} ++ ++% endif +-- +2.6.2 + diff --git a/0003-gallium-swr-add-flags-parameter-to-pipe_screen-conte.patch b/0003-gallium-swr-add-flags-parameter-to-pipe_screen-conte.patch new file mode 100644 index 0000000..239130f --- /dev/null +++ b/0003-gallium-swr-add-flags-parameter-to-pipe_screen-conte.patch @@ -0,0 +1,42 @@ +From fe9e5f557953d3c4b9c3cac6be0ff29d97c3f2c7 Mon Sep 17 00:00:00 2001 +From: Igor Gnatenko +Date: Thu, 22 Oct 2015 17:08:04 +0200 +Subject: [PATCH 3/3] gallium/swr: add flags parameter to + pipe_screen::context_create + +Signed-off-by: Igor Gnatenko +--- + src/gallium/drivers/swr/swr_context.cpp | 3 ++- + src/gallium/drivers/swr/swr_context.h | 2 +- + 2 files changed, 3 insertions(+), 2 deletions(-) + +diff --git a/src/gallium/drivers/swr/swr_context.cpp b/src/gallium/drivers/swr/swr_context.cpp +index 6269cd0..2dd3443 100644 +--- a/src/gallium/drivers/swr/swr_context.cpp ++++ b/src/gallium/drivers/swr/swr_context.cpp +@@ -336,7 +336,8 @@ swr_render_condition(struct pipe_context *pipe, + + + struct pipe_context * +-swr_create_context(struct pipe_screen *screen, void *priv) ++swr_create_context(struct pipe_screen *screen, void *priv, ++ unsigned flags) + { + struct swr_context *ctx = CALLOC_STRUCT(swr_context); + ctx->blendJIT = +diff --git a/src/gallium/drivers/swr/swr_context.h b/src/gallium/drivers/swr/swr_context.h +index 9d93a6d..5271eac 100644 +--- a/src/gallium/drivers/swr/swr_context.h ++++ b/src/gallium/drivers/swr/swr_context.h +@@ -160,7 +160,7 @@ swr_context(struct pipe_context *pipe) + return (struct swr_context *)pipe; + } + +-struct pipe_context *swr_create_context(struct pipe_screen *, void *priv); ++struct pipe_context *swr_create_context(struct pipe_screen *, void *priv, unsigned flags); + + void swr_state_init(struct pipe_context *pipe); + +-- +2.6.2 + diff --git a/mesa.spec b/mesa.spec index 65137e0..674ff8e 100644 --- a/mesa.spec +++ b/mesa.spec @@ -17,6 +17,7 @@ %define min_wayland_version 1.0 %if 0%{?with_llvm} %define with_radeonsi 1 +%define with_swr 1 %endif %ifarch s390 s390x ppc @@ -55,7 +56,7 @@ Summary: Mesa graphics libraries Name: mesa Version: 11.1.0 -Release: 0.devel.8.%{git}%{?dist} +Release: 0.devel.9.%{git}%{?dist} License: MIT Group: System Environment/Libraries URL: http://www.mesa3d.org @@ -74,6 +75,10 @@ Patch15: mesa-9.2-hardware-float.patch Patch20: mesa-10.2-evergreen-big-endian.patch Patch30: mesa-10.3-bigendian-assert.patch +Patch101: 0001-Initial-public-Mesa-SWR.patch +Patch102: 0002-swr-484541-Initial-public-SWR.patch +Patch103: 0003-gallium-swr-add-flags-parameter-to-pipe_screen-conte.patch + # To have sha info in glxinfo BuildRequires: git-core @@ -348,6 +353,10 @@ grep -q ^/ src/gallium/auxiliary/vl/vl_decoder.c && exit 1 %patch20 -p1 -b .egbe %patch30 -p1 -b .beassert +%patch101 -p1 +%patch102 -p1 +%patch103 -p1 + %if 0%{with_private_llvm} sed -i 's/llvm-config/mesa-private-llvm-config-%{__isa_bits}/g' configure.ac sed -i 's/`$LLVM_CONFIG --version`/&-mesa/' configure.ac @@ -395,7 +404,8 @@ export CXXFLAGS="$RPM_OPT_FLAGS %{?with_opencl:-frtti -fexceptions} %{!?with_ope %if %{with_hardware} %{?with_xa:--enable-xa} \ %{?with_nine:--enable-nine} \ - --with-gallium-drivers=%{?with_vmware:svga,}%{?with_radeonsi:radeonsi,}%{?with_llvm:swrast,r600,}%{?with_freedreno:freedreno,}%{?with_vc4:vc4,}%{?with_ilo:ilo,}r300,nouveau \ + 
--with-gallium-drivers=%{?with_vmware:svga,}%{?with_radeonsi:radeonsi,}%{?with_llvm:swrast,r600,}%{?with_freedreno:freedreno,}%{?with_vc4:vc4,}%{?with_ilo:ilo,}%{?with_swr:swr,}r300,nouveau \ + %{?with_swr:--enable-swr-native} \ %else --with-gallium-drivers=%{?with_llvm:swrast} \ %endif @@ -675,6 +685,9 @@ rm -rf $RPM_BUILD_ROOT %endif %changelog +* Wed Oct 21 2015 Igor Gnatenko - 11.1.0-0.devel.9.4a168ad +- Enable experimental SWR rasterizer + * Wed Oct 14 2015 Igor Gnatenko - 11.1.0-0.devel.8.4a168ad - 4a168ad
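
Note on the vendored Mako scripts in the patch above: the rasterizer's code generators drive templates such as knobs.template through the bundled mako package. A minimal sketch of rendering that template by hand, assuming the vendored package is importable (for example, when invoked from src/gallium/drivers/swr/rasterizer/scripts/) and using a hypothetical knob entry matching the (name, {'desc', 'type', 'default'}) shape the template iterates over; the real knob definitions and driver script are not part of this excerpt:

    from mako.template import Template

    # Hypothetical knob entry: knobs.template reads knob[0] as the knob name
    # and knob[1]['desc'] (list of comment lines), knob[1]['type'], and
    # knob[1]['default'] as its metadata.
    knobs = [
        ("SINGLE_THREADED",
         {"desc": ["Run the pipeline single threaded."],
          "type": "bool",
          "default": "false"}),
    ]

    tpl = Template(filename="templates/knobs.template")

    # gen_header=True takes the header branch (Knob structs, KNOB_/GET_KNOB
    # macros); 'includes' is only read in the .cpp branch and may be omitted.
    header = tpl.render(gen_header=True, filename="gen_knobs", knobs=knobs)

    # gen_header=False emits the implementation side (GlobalKnobs definition).
    source = tpl.render(gen_header=False, filename="gen_knobs",
                        knobs=knobs, includes=["gen_knobs.h"])

render() here is the facade documented in the vendored template.py above: keyword arguments become context variables, and the result is returned as a string, encoded per output_encoding when one is set.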