Backport some fixes needed by mesa and rust

Tom Stellard 2018-08-06 21:53:23 +00:00
parent 8c9aac9934
commit 93d2074b7b
4 changed files with 1334 additions and 1 deletion

0001-Don-t-run-BV-DAG-Combine-before-legalization-if-it-a.patch

@@ -0,0 +1,47 @@
From 2eb830fed5b813c5624e770c244eec61dacb04d7 Mon Sep 17 00:00:00 2001
From: Tom Stellard <tstellar@redhat.com>
Date: Mon, 9 Jul 2018 10:35:30 -0700
Subject: [PATCH] Don't run BV DAG Combine before legalization if it assumes
legal types
---
lib/Target/PowerPC/PPCISelLowering.cpp | 13 ++++++++++---
1 file changed, 10 insertions(+), 3 deletions(-)
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index 26e9f13..f622b05 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -11790,10 +11790,15 @@ static SDValue combineBVOfVecSExt(SDNode *N, SelectionDAG &DAG) {
auto isSExtOfVecExtract = [&](SDValue Op) -> bool {
if (!Op)
return false;
- if (Op.getOpcode() != ISD::SIGN_EXTEND)
+ if (Op.getOpcode() != ISD::SIGN_EXTEND &&
+ Op.getOpcode() != ISD::SIGN_EXTEND_INREG)
return false;
+ // A SIGN_EXTEND_INREG might be fed by an ANY_EXTEND to produce a value
+ // of the right width.
SDValue Extract = Op.getOperand(0);
+ if (Extract.getOpcode() == ISD::ANY_EXTEND)
+ Extract = Extract.getOperand(0);
if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
return false;
@@ -11881,8 +11886,10 @@ SDValue PPCTargetLowering::DAGCombineBuildVector(SDNode *N,
return Reduced;
// If we're building a vector out of extended elements from another vector
- // we have P9 vector integer extend instructions.
- if (Subtarget.hasP9Altivec()) {
+ // we have P9 vector integer extend instructions. The code assumes legal
+ // input types (i.e. it can't handle things like v4i16) so do not run before
+ // legalization.
+ if (Subtarget.hasP9Altivec() && !DCI.isBeforeLegalize()) {
Reduced = combineBVOfVecSExt(N, DAG);
if (Reduced)
return Reduced;
--
1.8.3.1
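
As an illustration of what the patch above guards against, here is a minimal C
sketch (an assumed shape, not the actual mesa or rust reproducer) that forms a
BUILD_VECTOR of sign-extended extracts whose source type, v4i16, is illegal on
PowerPC until type legalization widens it:

/* Hypothetical reduced example; compile with
   clang -target powerpc64le-linux-gnu -mcpu=pwr9 -O2 */
typedef short v4i16 __attribute__((vector_size(8)));
typedef int   v4i32 __attribute__((vector_size(16)));

/* Each element is extracted from the (pre-legalization illegal) v4i16,
   sign-extended to i32, and rebuilt into a v4i32 -- the
   sext(extract_vector_elt) BUILD_VECTOR shape that combineBVOfVecSExt
   matches but cannot handle before the input type is legalized. */
v4i32 widen(v4i16 v) {
  return (v4i32){ v[0], v[1], v[2], v[3] };
}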

0001-PowerPC-Do-not-round-values-prior-to-converting-to-i.patch

@@ -0,0 +1,919 @@
From 88ad713b81c2f51dd8405b251f9825b0bca6e57d Mon Sep 17 00:00:00 2001
From: Nemanja Ivanovic <nemanja.i.ibm@gmail.com>
Date: Thu, 2 Aug 2018 00:03:22 +0000
Subject: [PATCH] [PowerPC] Do not round values prior to converting to integer
Adding the FP_ROUND nodes when combining FP_TO_[SU]INT of elements
feeding a BUILD_VECTOR into an FP_TO_[SU]INT of the built vector
loses precision. This patch removes the code that adds these nodes
to true f64 operands. It also adds patterns required to ensure
the code is still vectorized rather than converting individual
elements and inserting into a vector.
Fixes https://bugs.llvm.org/show_bug.cgi?id=38342
Differential Revision: https://reviews.llvm.org/D50121
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@338658 91177308-0d34-0410-b5e6-96231b3b80d8
---
lib/Target/PowerPC/PPCISelLowering.cpp | 22 +-
lib/Target/PowerPC/PPCInstrVSX.td | 86 +++++++
test/CodeGen/PowerPC/build-vector-tests.ll | 357 +++++++++++++----------------
3 files changed, 258 insertions(+), 207 deletions(-)
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index f622b05..527ec5a 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -11560,6 +11560,14 @@ SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N,
ShiftCst);
}
+// Is this an extending load from an f32 to an f64?
+static bool isFPExtLoad(SDValue Op) {
+ if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Op.getNode()))
+ return LD->getExtensionType() == ISD::EXTLOAD &&
+ Op.getValueType() == MVT::f64;
+ return false;
+}
+
/// \brief Reduces the number of fp-to-int conversion when building a vector.
///
/// If this vector is built out of floating to integer conversions,
@@ -11594,11 +11602,18 @@ combineElementTruncationToVectorTruncation(SDNode *N,
SmallVector<SDValue, 4> Ops;
EVT TargetVT = N->getValueType(0);
for (int i = 0, e = N->getNumOperands(); i < e; ++i) {
- if (N->getOperand(i).getOpcode() != PPCISD::MFVSR)
+ SDValue NextOp = N->getOperand(i);
+ if (NextOp.getOpcode() != PPCISD::MFVSR)
return SDValue();
- unsigned NextConversion = N->getOperand(i).getOperand(0).getOpcode();
+ unsigned NextConversion = NextOp.getOperand(0).getOpcode();
if (NextConversion != FirstConversion)
return SDValue();
+ // If we are converting to 32-bit integers, we need to add an FP_ROUND.
+ // This is not valid if the input was originally double precision. It is
+ // also not profitable to do unless this is an extending load in which
+ // case doing this combine will allow us to combine consecutive loads.
+ if (Is32Bit && !isFPExtLoad(NextOp.getOperand(0).getOperand(0)))
+ return SDValue();
if (N->getOperand(i) != FirstInput)
IsSplat = false;
}
@@ -11612,8 +11627,9 @@ combineElementTruncationToVectorTruncation(SDNode *N,
// Now that we know we have the right type of node, get its operands
for (int i = 0, e = N->getNumOperands(); i < e; ++i) {
SDValue In = N->getOperand(i).getOperand(0);
- // For 32-bit values, we need to add an FP_ROUND node.
if (Is32Bit) {
+ // For 32-bit values, we need to add an FP_ROUND node (if we made it
+ // here, we know that all inputs are extending loads so this is safe).
if (In.isUndef())
Ops.push_back(DAG.getUNDEF(SrcVT));
else {
diff --git a/lib/Target/PowerPC/PPCInstrVSX.td b/lib/Target/PowerPC/PPCInstrVSX.td
index 6f71978..1f48473 100644
--- a/lib/Target/PowerPC/PPCInstrVSX.td
+++ b/lib/Target/PowerPC/PPCInstrVSX.td
@@ -3100,6 +3100,17 @@ def DblToFlt {
dag B1 = (f32 (fpround (f64 (extractelt v2f64:$B, 1))));
}
+def ExtDbl {
+ dag A0S = (i32 (PPCmfvsr (f64 (PPCfctiwz (f64 (extractelt v2f64:$A, 0))))));
+ dag A1S = (i32 (PPCmfvsr (f64 (PPCfctiwz (f64 (extractelt v2f64:$A, 1))))));
+ dag B0S = (i32 (PPCmfvsr (f64 (PPCfctiwz (f64 (extractelt v2f64:$B, 0))))));
+ dag B1S = (i32 (PPCmfvsr (f64 (PPCfctiwz (f64 (extractelt v2f64:$B, 1))))));
+ dag A0U = (i32 (PPCmfvsr (f64 (PPCfctiwuz (f64 (extractelt v2f64:$A, 0))))));
+ dag A1U = (i32 (PPCmfvsr (f64 (PPCfctiwuz (f64 (extractelt v2f64:$A, 1))))));
+ dag B0U = (i32 (PPCmfvsr (f64 (PPCfctiwuz (f64 (extractelt v2f64:$B, 0))))));
+ dag B1U = (i32 (PPCmfvsr (f64 (PPCfctiwuz (f64 (extractelt v2f64:$B, 1))))));
+}
+
def ByteToWord {
dag LE_A0 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 0)), i8));
dag LE_A1 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 4)), i8));
@@ -3177,9 +3188,15 @@ def FltToULong {
}
def DblToInt {
dag A = (i32 (PPCmfvsr (f64 (PPCfctiwz f64:$A))));
+ dag B = (i32 (PPCmfvsr (f64 (PPCfctiwz f64:$B))));
+ dag C = (i32 (PPCmfvsr (f64 (PPCfctiwz f64:$C))));
+ dag D = (i32 (PPCmfvsr (f64 (PPCfctiwz f64:$D))));
}
def DblToUInt {
dag A = (i32 (PPCmfvsr (f64 (PPCfctiwuz f64:$A))));
+ dag B = (i32 (PPCmfvsr (f64 (PPCfctiwuz f64:$B))));
+ dag C = (i32 (PPCmfvsr (f64 (PPCfctiwuz f64:$C))));
+ dag D = (i32 (PPCmfvsr (f64 (PPCfctiwuz f64:$D))));
}
def DblToLong {
dag A = (i64 (PPCmfvsr (f64 (PPCfctidz f64:$A))));
@@ -3218,6 +3235,47 @@ def MrgFP {
dag BAlToFlt = (XVCVDPSP (XXPERMDI $B, $A, 3));
}
+// Word-element merge dags - conversions from f64 to i32 merged into vectors.
+def MrgWords {
+ // For big endian, we merge low and hi doublewords (A, B).
+ dag A0B0 = (v2f64 (XXPERMDI v2f64:$A, v2f64:$B, 0));
+ dag A1B1 = (v2f64 (XXPERMDI v2f64:$A, v2f64:$B, 3));
+ dag CVA1B1S = (v4i32 (XVCVDPSXWS A1B1));
+ dag CVA0B0S = (v4i32 (XVCVDPSXWS A0B0));
+ dag CVA1B1U = (v4i32 (XVCVDPUXWS A1B1));
+ dag CVA0B0U = (v4i32 (XVCVDPUXWS A0B0));
+
+ // For little endian, we merge low and hi doublewords (B, A).
+ dag B1A1 = (v2f64 (XXPERMDI v2f64:$B, v2f64:$A, 0));
+ dag B0A0 = (v2f64 (XXPERMDI v2f64:$B, v2f64:$A, 3));
+ dag CVB1A1S = (v4i32 (XVCVDPSXWS B1A1));
+ dag CVB0A0S = (v4i32 (XVCVDPSXWS B0A0));
+ dag CVB1A1U = (v4i32 (XVCVDPUXWS B1A1));
+ dag CVB0A0U = (v4i32 (XVCVDPUXWS B0A0));
+
+ // For big endian, we merge hi doublewords of (A, C) and (B, D), convert
+ // then merge.
+ dag AC = (v2f64 (XXPERMDI (COPY_TO_REGCLASS f64:$A, VSRC),
+ (COPY_TO_REGCLASS f64:$C, VSRC), 0));
+ dag BD = (v2f64 (XXPERMDI (COPY_TO_REGCLASS f64:$B, VSRC),
+ (COPY_TO_REGCLASS f64:$D, VSRC), 0));
+ dag CVACS = (v4i32 (XVCVDPSXWS AC));
+ dag CVBDS = (v4i32 (XVCVDPSXWS BD));
+ dag CVACU = (v4i32 (XVCVDPUXWS AC));
+ dag CVBDU = (v4i32 (XVCVDPUXWS BD));
+
+ // For little endian, we merge hi doublewords of (D, B) and (C, A), convert
+ // then merge.
+ dag DB = (v2f64 (XXPERMDI (COPY_TO_REGCLASS f64:$D, VSRC),
+ (COPY_TO_REGCLASS f64:$B, VSRC), 0));
+ dag CA = (v2f64 (XXPERMDI (COPY_TO_REGCLASS f64:$C, VSRC),
+ (COPY_TO_REGCLASS f64:$A, VSRC), 0));
+ dag CVDBS = (v4i32 (XVCVDPSXWS DB));
+ dag CVCAS = (v4i32 (XVCVDPSXWS CA));
+ dag CVDBU = (v4i32 (XVCVDPUXWS DB));
+ dag CVCAU = (v4i32 (XVCVDPUXWS CA));
+}
+
// Patterns for BUILD_VECTOR nodes.
def NoP9Vector : Predicate<"!PPCSubTarget->hasP9Vector()">;
let AddedComplexity = 400 in {
@@ -3286,6 +3344,20 @@ let AddedComplexity = 400 in {
def : Pat<(v4f32 (build_vector DblToFlt.A0, DblToFlt.A1,
DblToFlt.B0, DblToFlt.B1)),
(v4f32 (VMRGEW MrgFP.ABhToFlt, MrgFP.ABlToFlt))>;
+
+ // Convert 4 doubles to a vector of ints.
+ def : Pat<(v4i32 (build_vector DblToInt.A, DblToInt.B,
+ DblToInt.C, DblToInt.D)),
+ (v4i32 (VMRGEW MrgWords.CVACS, MrgWords.CVBDS))>;
+ def : Pat<(v4i32 (build_vector DblToUInt.A, DblToUInt.B,
+ DblToUInt.C, DblToUInt.D)),
+ (v4i32 (VMRGEW MrgWords.CVACU, MrgWords.CVBDU))>;
+ def : Pat<(v4i32 (build_vector ExtDbl.A0S, ExtDbl.A1S,
+ ExtDbl.B0S, ExtDbl.B1S)),
+ (v4i32 (VMRGEW MrgWords.CVA0B0S, MrgWords.CVA1B1S))>;
+ def : Pat<(v4i32 (build_vector ExtDbl.A0U, ExtDbl.A1U,
+ ExtDbl.B0U, ExtDbl.B1U)),
+ (v4i32 (VMRGEW MrgWords.CVA0B0U, MrgWords.CVA1B1U))>;
}
let Predicates = [IsLittleEndian, HasVSX] in {
@@ -3300,6 +3372,20 @@ let AddedComplexity = 400 in {
def : Pat<(v4f32 (build_vector DblToFlt.A0, DblToFlt.A1,
DblToFlt.B0, DblToFlt.B1)),
(v4f32 (VMRGEW MrgFP.BAhToFlt, MrgFP.BAlToFlt))>;
+
+ // Convert 4 doubles to a vector of ints.
+ def : Pat<(v4i32 (build_vector DblToInt.A, DblToInt.B,
+ DblToInt.C, DblToInt.D)),
+ (v4i32 (VMRGEW MrgWords.CVDBS, MrgWords.CVCAS))>;
+ def : Pat<(v4i32 (build_vector DblToUInt.A, DblToUInt.B,
+ DblToUInt.C, DblToUInt.D)),
+ (v4i32 (VMRGEW MrgWords.CVDBU, MrgWords.CVCAU))>;
+ def : Pat<(v4i32 (build_vector ExtDbl.A0S, ExtDbl.A1S,
+ ExtDbl.B0S, ExtDbl.B1S)),
+ (v4i32 (VMRGEW MrgWords.CVB1A1S, MrgWords.CVB0A0S))>;
+ def : Pat<(v4i32 (build_vector ExtDbl.A0U, ExtDbl.A1U,
+ ExtDbl.B0U, ExtDbl.B1U)),
+ (v4i32 (VMRGEW MrgWords.CVB1A1U, MrgWords.CVB0A0U))>;
}
let Predicates = [HasDirectMove] in {
diff --git a/test/CodeGen/PowerPC/build-vector-tests.ll b/test/CodeGen/PowerPC/build-vector-tests.ll
index 16b562b..3785b2a 100644
--- a/test/CodeGen/PowerPC/build-vector-tests.ll
+++ b/test/CodeGen/PowerPC/build-vector-tests.ll
@@ -119,8 +119,8 @@
;vector int spltCnstConvftoi() { //
; return (vector int) 4.74f; //
;} //
-;// P8: 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspsxws //
-;// P9: 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvdpsxws //
+;// P8: 2 x xxmrghd, 2 x xvcvspsxws, vmrgew //
+;// P9: 2 x xxmrghd, 2 x xvcvspsxws, vmrgew //
;vector int fromRegsConvftoi(float a, float b, float c, float d) { //
; return (vector int) { a, b, c, d }; //
;} //
@@ -139,15 +139,15 @@
;vector int fromDiffMemConsDConvftoi(float *ptr) { //
; return (vector int) { ptr[3], ptr[2], ptr[1], ptr[0] }; //
;} //
-;// P8: 4 x lxsspx, 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspsxws //
-;// P9: 4 x lxssp, 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspsxws //
+;// P8: 4 x lxsspx, 2 x xxmrghd, 2 x xvcvspsxws, vmrgew //
+;// P9: 4 x lxssp, 2 x xxmrghd, 2 x xvcvspsxws, vmrgew //
;// Note: if the consecutive loads learns to handle pre-inc, this can be: //
;// sldi 2, load, xvcvspuxws //
;vector int fromDiffMemVarAConvftoi(float *arr, int elem) { //
; return (vector int) { arr[elem], arr[elem+1], arr[elem+2], arr[elem+3] }; //
;} //
-;// P8: 4 x lxsspx, 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspsxws //
-;// P9: 4 x lxssp, 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspsxws //
+;// P8: 4 x lxsspx, 2 x xxmrghd, 2 x xvcvspsxws, vmrgew //
+;// P9: 4 x lxssp, 2 x xxmrghd, 2 x xvcvspsxws, vmrgew //
;// Note: if the consecutive loads learns to handle pre-inc, this can be: //
;// sldi 2, 2 x load, vperm, xvcvspuxws //
;vector int fromDiffMemVarDConvftoi(float *arr, int elem) { //
@@ -168,8 +168,8 @@
;vector int spltCnstConvdtoi() { //
; return (vector int) 4.74; //
;} //
-;// P8: 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspsxws //
-;// P9: 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspsxws //
+;// P8: 2 x xxmrghd, 2 x xvcvspsxws, vmrgew //
+;// P9: 2 x xxmrghd, 2 x xvcvspsxws, vmrgew //
;vector int fromRegsConvdtoi(double a, double b, double c, double d) { //
; return (vector int) { a, b, c, d }; //
;} //
@@ -178,25 +178,23 @@
;vector int fromDiffConstsConvdtoi() { //
; return (vector int) { 24.46, 234., 988.19, 422.39 }; //
;} //
-;// P8: 2 x lxvd2x, 2 x xxswapd, xxmrgld, xxmrghd, 2 x xvcvdpsp, vmrgew, //
-;// xvcvspsxws //
-;// P9: 2 x lxvx, 2 x xxswapd, xxmrgld, xxmrghd, 2 x xvcvdpsp, vmrgew, //
-;// xvcvspsxws //
+;// P8: 2 x lxvd2x, 2 x xxswapd, xxmrgld, xxmrghd, 2 x xvcvspsxws, vmrgew //
+;// P9: 2 x lxvx, 2 x xxswapd, xxmrgld, xxmrghd, 2 x xvcvspsxws, vmrgew //
;vector int fromDiffMemConsAConvdtoi(double *ptr) { //
; return (vector int) { ptr[0], ptr[1], ptr[2], ptr[3] }; //
;} //
-;// P8: 4 x lxsdx, 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspsxws //
-;// P9: 4 x lfd, 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspsxws //
+;// P8: 4 x lxsdx, 2 x xxmrghd, 2 x xvcvspsxws, vmrgew //
+;// P9: 4 x lfd, 2 x xxmrghd, 2 x xvcvspsxws, vmrgew //
;vector int fromDiffMemConsDConvdtoi(double *ptr) { //
; return (vector int) { ptr[3], ptr[2], ptr[1], ptr[0] }; //
;} //
-;// P8: lfdux, 3 x lxsdx, 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspsxws //
-;// P9: lfdux, 3 x lfd, 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspsxws //
+;// P8: lfdux, 3 x lxsdx, 2 x xxmrghd, 2 x xvcvspsxws, vmrgew //
+;// P9: lfdux, 3 x lfd, 2 x xxmrghd, 2 x xvcvspsxws, vmrgew //
;vector int fromDiffMemVarAConvdtoi(double *arr, int elem) { //
; return (vector int) { arr[elem], arr[elem+1], arr[elem+2], arr[elem+3] }; //
;} //
-;// P8: lfdux, 3 x lxsdx, 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspsxws //
-;// P9: lfdux, 3 x lfd, 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspsxws //
+;// P8: lfdux, 3 x lxsdx, 2 x xxmrghd, 2 x xvcvspsxws, vmrgew //
+;// P9: lfdux, 3 x lfd, 2 x xxmrghd, 2 x xvcvspsxws, vmrgew //
;vector int fromDiffMemVarDConvdtoi(double *arr, int elem) { //
; return (vector int) { arr[elem], arr[elem-1], arr[elem-2], arr[elem-3] }; //
;} //
@@ -296,8 +294,8 @@
;vector unsigned int spltCnstConvftoui() { //
; return (vector unsigned int) 4.74f; //
;} //
-;// P8: 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspuxws //
-;// P9: 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspuxws //
+;// P8: 2 x xxmrghd, 2 x xvcvspuxws, vmrgew //
+;// P9: 2 x xxmrghd, 2 x xvcvspuxws, vmrgew //
;vector unsigned int fromRegsConvftoui(float a, float b, float c, float d) { //
; return (vector unsigned int) { a, b, c, d }; //
;} //
@@ -316,16 +314,16 @@
;vector unsigned int fromDiffMemConsDConvftoui(float *ptr) { //
; return (vector unsigned int) { ptr[3], ptr[2], ptr[1], ptr[0] }; //
;} //
-;// P8: lfsux, 3 x lxsspx, 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspuxws //
-;// P9: lfsux, 3 x lfs, 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspuxws //
+;// P8: lfsux, 3 x lxsspx, 2 x xxmrghd, 2 x xvcvspuxws, vmrgew //
+;// P9: lfsux, 3 x lfs, 2 x xxmrghd, 2 x xvcvspuxws, vmrgew //
;// Note: if the consecutive loads learns to handle pre-inc, this can be: //
;// sldi 2, load, xvcvspuxws //
;vector unsigned int fromDiffMemVarAConvftoui(float *arr, int elem) { //
; return (vector unsigned int) { arr[elem], arr[elem+1], //
; arr[elem+2], arr[elem+3] }; //
;} //
-;// P8: lfsux, 3 x lxsspx, 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspuxws //
-;// P9: lfsux, 3 x lfs, 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspuxws //
+;// P8: lfsux, 3 x lxsspx, 2 x xxmrghd, 2 x xvcvspuxws, vmrgew //
+;// P9: lfsux, 3 x lfs, 2 x xxmrghd, 2 x xvcvspuxws, vmrgew //
;// Note: if the consecutive loads learns to handle pre-inc, this can be: //
;// sldi 2, 2 x load, vperm, xvcvspuxws //
;vector unsigned int fromDiffMemVarDConvftoui(float *arr, int elem) { //
@@ -347,8 +345,8 @@
;vector unsigned int spltCnstConvdtoui() { //
; return (vector unsigned int) 4.74; //
;} //
-;// P8: 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspuxws //
-;// P9: 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspuxws //
+;// P8: 2 x xxmrghd, 2 x xvcvspuxws, vmrgew //
+;// P9: 2 x xxmrghd, 2 x xvcvspuxws, vmrgew //
;vector unsigned int fromRegsConvdtoui(double a, double b, //
; double c, double d) { //
; return (vector unsigned int) { a, b, c, d }; //
@@ -358,25 +356,24 @@
;vector unsigned int fromDiffConstsConvdtoui() { //
; return (vector unsigned int) { 24.46, 234., 988.19, 422.39 }; //
;} //
-;// P8: 2 x lxvd2x, 2 x xxswapd, xxmrgld, xxmrghd, 2 x xvcvdpsp, vmrgew, //
-;// xvcvspuxws //
-;// P9: 2 x lxvx, xxmrgld, xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspuxws //
+;// P8: 2 x lxvd2x, 2 x xxswapd, xxmrgld, xxmrghd, 2 x xvcvspuxws, vmrgew //
+;// P9: 2 x lxvx, xxmrgld, xxmrghd, 2 x xvcvspuxws, vmrgew //
;vector unsigned int fromDiffMemConsAConvdtoui(double *ptr) { //
; return (vector unsigned int) { ptr[0], ptr[1], ptr[2], ptr[3] }; //
;} //
-;// P8: 4 x lxsdx, 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspuxws //
-;// P9: 4 x lfd, 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspuxws //
+;// P8: 4 x lxsdx, 2 x xxmrghd, 2 x xvcvspuxws, vmrgew //
+;// P9: 4 x lfd, 2 x xxmrghd, 2 x xvcvspuxws, vmrgew //
;vector unsigned int fromDiffMemConsDConvdtoui(double *ptr) { //
; return (vector unsigned int) { ptr[3], ptr[2], ptr[1], ptr[0] }; //
;} //
-;// P8: lfdux, 3 x lxsdx, 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspuxws //
-;// P9: lfdux, 3 x lfd, 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspuxws //
+;// P8: lfdux, 3 x lxsdx, 2 x xxmrghd, 2 x xvcvspuxws, vmrgew //
+;// P9: lfdux, 3 x lfd, 2 x xxmrghd, 2 x xvcvspuxws, vmrgew //
;vector unsigned int fromDiffMemVarAConvdtoui(double *arr, int elem) { //
; return (vector unsigned int) { arr[elem], arr[elem+1], //
; arr[elem+2], arr[elem+3] }; //
;} //
-;// P8: lfdux, 3 x lxsdx, 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspuxws //
-;// P9: lfdux, 3 x lfd, 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspuxws //
+;// P8: lfdux, 3 x lxsdx, 2 x xxmrghd, 2 x xvcvspuxws, vmrgew //
+;// P9: lfdux, 3 x lfd, 2 x xxmrghd, 2 x xvcvspuxws, vmrgew //
;vector unsigned int fromDiffMemVarDConvdtoui(double *arr, int elem) { //
; return (vector unsigned int) { arr[elem], arr[elem-1], //
; arr[elem-2], arr[elem-3] }; //
@@ -1253,28 +1250,24 @@ entry:
; P8LE-LABEL: fromRegsConvftoi
; P9BE-DAG: xxmrghd {{[vs]+}}[[REG1:[0-9]+]], vs1, vs3
; P9BE-DAG: xxmrghd {{[vs]+}}[[REG2:[0-9]+]], vs2, vs4
-; P9BE-DAG: xvcvdpsp [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]]
-; P9BE-DAG: xvcvdpsp [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]]
+; P9BE-DAG: xvcvdpsxws [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]]
+; P9BE-DAG: xvcvdpsxws [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]]
; P9BE: vmrgew v2, [[REG3]], [[REG4]]
-; P9BE: xvcvspsxws v2, v2
; P9LE-DAG: xxmrghd {{[vs]+}}[[REG1:[0-9]+]], vs3, vs1
; P9LE-DAG: xxmrghd {{[vs]+}}[[REG2:[0-9]+]], vs4, vs2
-; P9LE-DAG: xvcvdpsp [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]]
-; P9LE-DAG: xvcvdpsp [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]]
+; P9LE-DAG: xvcvdpsxws [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]]
+; P9LE-DAG: xvcvdpsxws [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]]
; P9LE: vmrgew v2, [[REG4]], [[REG3]]
-; P9LE: xvcvspsxws v2, v2
; P8BE-DAG: xxmrghd {{[vs]+}}[[REG1:[0-9]+]], vs1, vs3
; P8BE-DAG: xxmrghd {{[vs]+}}[[REG2:[0-9]+]], vs2, vs4
-; P8BE-DAG: xvcvdpsp [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]]
-; P8BE-DAG: xvcvdpsp [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]]
+; P8BE-DAG: xvcvdpsxws [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]]
+; P8BE-DAG: xvcvdpsxws [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]]
; P8BE: vmrgew v2, [[REG3]], [[REG4]]
-; P8BE: xvcvspsxws v2, v2
; P8LE-DAG: xxmrghd {{[vs]+}}[[REG1:[0-9]+]], vs3, vs1
; P8LE-DAG: xxmrghd {{[vs]+}}[[REG2:[0-9]+]], vs4, vs2
-; P8LE-DAG: xvcvdpsp [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]]
-; P8LE-DAG: xvcvdpsp [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]]
+; P8LE-DAG: xvcvdpsxws [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]]
+; P8LE-DAG: xvcvdpsxws [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]]
; P8LE: vmrgew v2, [[REG4]], [[REG3]]
-; P8LE: xvcvspsxws v2, v2
}
; Function Attrs: norecurse nounwind readnone
@@ -1529,28 +1522,24 @@ entry:
; P8LE-LABEL: fromRegsConvdtoi
; P9BE-DAG: xxmrghd {{[vs]+}}[[REG1:[0-9]+]], vs1, vs3
; P9BE-DAG: xxmrghd {{[vs]+}}[[REG2:[0-9]+]], vs2, vs4
-; P9BE-DAG: xvcvdpsp [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]]
-; P9BE-DAG: xvcvdpsp [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]]
+; P9BE-DAG: xvcvdpsxws [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]]
+; P9BE-DAG: xvcvdpsxws [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]]
; P9BE: vmrgew v2, [[REG3]], [[REG4]]
-; P9BE: xvcvspsxws v2, v2
; P9LE-DAG: xxmrghd {{[vs]+}}[[REG1:[0-9]+]], vs3, vs1
; P9LE-DAG: xxmrghd {{[vs]+}}[[REG2:[0-9]+]], vs4, vs2
-; P9LE-DAG: xvcvdpsp [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]]
-; P9LE-DAG: xvcvdpsp [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]]
+; P9LE-DAG: xvcvdpsxws [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]]
+; P9LE-DAG: xvcvdpsxws [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]]
; P9LE: vmrgew v2, [[REG4]], [[REG3]]
-; P9LE: xvcvspsxws v2, v2
; P8BE-DAG: xxmrghd {{[vs]+}}[[REG1:[0-9]+]], vs1, vs3
; P8BE-DAG: xxmrghd {{[vs]+}}[[REG2:[0-9]+]], vs2, vs4
-; P8BE-DAG: xvcvdpsp [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]]
-; P8BE-DAG: xvcvdpsp [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]]
+; P8BE-DAG: xvcvdpsxws [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]]
+; P8BE-DAG: xvcvdpsxws [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]]
; P8BE: vmrgew v2, [[REG3]], [[REG4]]
-; P8BE: xvcvspsxws v2, v2
; P8LE-DAG: xxmrghd {{[vs]+}}[[REG1:[0-9]+]], vs3, vs1
; P8LE-DAG: xxmrghd {{[vs]+}}[[REG2:[0-9]+]], vs4, vs2
-; P8LE-DAG: xvcvdpsp [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]]
-; P8LE-DAG: xvcvdpsp [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]]
+; P8LE-DAG: xvcvdpsxws [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]]
+; P8LE-DAG: xvcvdpsxws [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]]
; P8LE: vmrgew v2, [[REG4]], [[REG3]]
-; P8LE: xvcvspsxws v2, v2
}
; Function Attrs: norecurse nounwind readnone
@@ -1592,36 +1581,32 @@ entry:
; P9BE-DAG: lxv [[REG2:[vs0-9]+]], 16(r3)
; P9BE-DAG: xxmrgld [[REG3:[vs0-9]+]], [[REG1]], [[REG2]]
; P9BE-DAG: xxmrghd [[REG4:[vs0-9]+]], [[REG1]], [[REG2]]
-; P9BE-DAG: xvcvdpsp [[REG5:[vs0-9]+]], [[REG3]]
-; P9BE-DAG: xvcvdpsp [[REG6:[vs0-9]+]], [[REG4]]
+; P9BE-DAG: xvcvdpsxws [[REG5:[vs0-9]+]], [[REG3]]
+; P9BE-DAG: xvcvdpsxws [[REG6:[vs0-9]+]], [[REG4]]
; P9BE: vmrgew v2, [[REG6]], [[REG5]]
-; P9BE: xvcvspsxws v2, v2
; P9LE-DAG: lxv [[REG1:[vs0-9]+]], 0(r3)
; P9LE-DAG: lxv [[REG2:[vs0-9]+]], 16(r3)
; P9LE-DAG: xxmrgld [[REG3:[vs0-9]+]], [[REG2]], [[REG1]]
; P9LE-DAG: xxmrghd [[REG4:[vs0-9]+]], [[REG2]], [[REG1]]
-; P9LE-DAG: xvcvdpsp [[REG5:[vs0-9]+]], [[REG3]]
-; P9LE-DAG: xvcvdpsp [[REG6:[vs0-9]+]], [[REG4]]
+; P9LE-DAG: xvcvdpsxws [[REG5:[vs0-9]+]], [[REG3]]
+; P9LE-DAG: xvcvdpsxws [[REG6:[vs0-9]+]], [[REG4]]
; P9LE: vmrgew v2, [[REG6]], [[REG5]]
-; P9LE: xvcvspsxws v2, v2
; P8BE: lxvd2x [[REG1:[vs0-9]+]], 0, r3
; P8BE: lxvd2x [[REG2:[vs0-9]+]], r3, r4
; P8BE-DAG: xxmrgld [[REG3:[vs0-9]+]], [[REG1]], [[REG2]]
; P8BE-DAG: xxmrghd [[REG4:[vs0-9]+]], [[REG1]], [[REG2]]
-; P8BE-DAG: xvcvdpsp [[REG5:[vs0-9]+]], [[REG3]]
-; P8BE-DAG: xvcvdpsp [[REG6:[vs0-9]+]], [[REG4]]
+; P8BE-DAG: xvcvdpsxws [[REG5:[vs0-9]+]], [[REG3]]
+; P8BE-DAG: xvcvdpsxws [[REG6:[vs0-9]+]], [[REG4]]
; P8BE: vmrgew v2, [[REG6]], [[REG5]]
-; P8BE: xvcvspsxws v2, v2
; P8LE: lxvd2x [[REG1:[vs0-9]+]], 0, r3
; P8LE: lxvd2x [[REG2:[vs0-9]+]], r3, r4
; P8LE-DAG: xxswapd [[REG3:[vs0-9]+]], [[REG1]]
; P8LE-DAG: xxswapd [[REG4:[vs0-9]+]], [[REG2]]
; P8LE-DAG: xxmrgld [[REG5:[vs0-9]+]], [[REG4]], [[REG3]]
; P8LE-DAG: xxmrghd [[REG6:[vs0-9]+]], [[REG4]], [[REG3]]
-; P8LE-DAG: xvcvdpsp [[REG7:[vs0-9]+]], [[REG5]]
-; P8LE-DAG: xvcvdpsp [[REG8:[vs0-9]+]], [[REG6]]
+; P8LE-DAG: xvcvdpsxws [[REG7:[vs0-9]+]], [[REG5]]
+; P8LE-DAG: xvcvdpsxws [[REG8:[vs0-9]+]], [[REG6]]
; P8LE: vmrgew v2, [[REG8]], [[REG7]]
-; P8LE: xvcvspsxws v2, v2
}
; Function Attrs: norecurse nounwind readonly
@@ -1653,40 +1638,36 @@ entry:
; P9BE: lfd
; P9BE: xxmrghd
; P9BE: xxmrghd
-; P9BE: xvcvdpsp
-; P9BE: xvcvdpsp
-; P9BE: vmrgew
-; P9BE: xvcvspsxws v2
+; P9BE: xvcvdpsxws
+; P9BE: xvcvdpsxws
+; P9BE: vmrgew v2
; P9LE: lfd
; P9LE: lfd
; P9LE: lfd
; P9LE: lfd
; P9LE: xxmrghd
; P9LE: xxmrghd
-; P9LE: xvcvdpsp
-; P9LE: xvcvdpsp
-; P9LE: vmrgew
-; P9LE: xvcvspsxws v2
+; P9LE: xvcvdpsxws
+; P9LE: xvcvdpsxws
+; P9LE: vmrgew v2
; P8BE: lxsdx
; P8BE: lxsdx
; P8BE: lxsdx
; P8BE: lxsdx
; P8BE: xxmrghd
; P8BE: xxmrghd
-; P8BE: xvcvdpsp
-; P8BE: xvcvdpsp
-; P8BE: vmrgew
-; P8BE: xvcvspsxws v2
+; P8BE: xvcvdpsxws
+; P8BE: xvcvdpsxws
+; P8BE: vmrgew v2
; P8LE: lxsdx
; P8LE: lxsdx
; P8LE: lxsdx
; P8LE: lxsdx
; P8LE: xxmrghd
; P8LE: xxmrghd
-; P8LE: xvcvdpsp
-; P8LE: xvcvdpsp
-; P8LE: vmrgew
-; P8LE: xvcvspsxws v2
+; P8LE: xvcvdpsxws
+; P8LE: xvcvdpsxws
+; P8LE: vmrgew v2
}
; Function Attrs: norecurse nounwind readonly
@@ -1726,40 +1707,36 @@ entry:
; P9BE: lfd
; P9BE: xxmrghd
; P9BE: xxmrghd
-; P9BE: xvcvdpsp
-; P9BE: xvcvdpsp
-; P9BE: vmrgew
-; P9BE: xvcvspsxws v2
+; P9BE: xvcvdpsxws
+; P9BE: xvcvdpsxws
+; P9BE: vmrgew v2
; P9LE: lfdux
; P9LE: lfd
; P9LE: lfd
; P9LE: lfd
; P9LE: xxmrghd
; P9LE: xxmrghd
-; P9LE: xvcvdpsp
-; P9LE: xvcvdpsp
-; P9LE: vmrgew
-; P9LE: xvcvspsxws v2
+; P9LE: xvcvdpsxws
+; P9LE: xvcvdpsxws
+; P9LE: vmrgew v2
; P8BE: lfdux
; P8BE: lxsdx
; P8BE: lxsdx
; P8BE: lxsdx
; P8BE: xxmrghd
; P8BE: xxmrghd
-; P8BE: xvcvdpsp
-; P8BE: xvcvdpsp
-; P8BE: vmrgew
-; P8BE: xvcvspsxws v2
+; P8BE: xvcvdpsxws
+; P8BE: xvcvdpsxws
+; P8BE: vmrgew v2
; P8LE: lfdux
; P8LE: lxsdx
; P8LE: lxsdx
; P8LE: lxsdx
; P8LE: xxmrghd
; P8LE: xxmrghd
-; P8LE: xvcvdpsp
-; P8LE: xvcvdpsp
-; P8LE: vmrgew
-; P8LE: xvcvspsxws v2
+; P8LE: xvcvdpsxws
+; P8LE: xvcvdpsxws
+; P8LE: vmrgew v2
}
; Function Attrs: norecurse nounwind readonly
@@ -1799,40 +1776,36 @@ entry:
; P9BE: lfd
; P9BE: xxmrghd
; P9BE: xxmrghd
-; P9BE: xvcvdpsp
-; P9BE: xvcvdpsp
-; P9BE: vmrgew
-; P9BE: xvcvspsxws v2
+; P9BE: xvcvdpsxws
+; P9BE: xvcvdpsxws
+; P9BE: vmrgew v2
; P9LE: lfdux
; P9LE: lfd
; P9LE: lfd
; P9LE: lfd
; P9LE: xxmrghd
; P9LE: xxmrghd
-; P9LE: xvcvdpsp
-; P9LE: xvcvdpsp
-; P9LE: vmrgew
-; P9LE: xvcvspsxws v2
+; P9LE: xvcvdpsxws
+; P9LE: xvcvdpsxws
+; P9LE: vmrgew v2
; P8BE: lfdux
; P8BE: lxsdx
; P8BE: lxsdx
; P8BE: lxsdx
; P8BE: xxmrghd
; P8BE: xxmrghd
-; P8BE: xvcvdpsp
-; P8BE: xvcvdpsp
-; P8BE: vmrgew
-; P8BE: xvcvspsxws v2
+; P8BE: xvcvdpsxws
+; P8BE: xvcvdpsxws
+; P8BE: vmrgew v2
; P8LE: lfdux
; P8LE: lxsdx
; P8LE: lxsdx
; P8LE: lxsdx
; P8LE: xxmrghd
; P8LE: xxmrghd
-; P8LE: xvcvdpsp
-; P8LE: xvcvdpsp
-; P8LE: vmrgew
-; P8LE: xvcvspsxws v2
+; P8LE: xvcvdpsxws
+; P8LE: xvcvdpsxws
+; P8LE: vmrgew v2
}
; Function Attrs: norecurse nounwind readnone
@@ -2413,28 +2386,24 @@ entry:
; P8LE-LABEL: fromRegsConvftoui
; P9BE-DAG: xxmrghd {{[vs]+}}[[REG1:[0-9]+]], vs1, vs3
; P9BE-DAG: xxmrghd {{[vs]+}}[[REG2:[0-9]+]], vs2, vs4
-; P9BE-DAG: xvcvdpsp [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]]
-; P9BE-DAG: xvcvdpsp [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]]
+; P9BE-DAG: xvcvdpuxws [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]]
+; P9BE-DAG: xvcvdpuxws [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]]
; P9BE: vmrgew v2, [[REG3]], [[REG4]]
-; P9BE: xvcvspuxws v2, v2
; P9LE-DAG: xxmrghd {{[vs]+}}[[REG1:[0-9]+]], vs3, vs1
; P9LE-DAG: xxmrghd {{[vs]+}}[[REG2:[0-9]+]], vs4, vs2
-; P9LE-DAG: xvcvdpsp [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]]
-; P9LE-DAG: xvcvdpsp [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]]
+; P9LE-DAG: xvcvdpuxws [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]]
+; P9LE-DAG: xvcvdpuxws [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]]
; P9LE: vmrgew v2, [[REG4]], [[REG3]]
-; P9LE: xvcvspuxws v2, v2
; P8BE-DAG: xxmrghd {{[vs]+}}[[REG1:[0-9]+]], vs1, vs3
; P8BE-DAG: xxmrghd {{[vs]+}}[[REG2:[0-9]+]], vs2, vs4
-; P8BE-DAG: xvcvdpsp [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]]
-; P8BE-DAG: xvcvdpsp [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]]
+; P8BE-DAG: xvcvdpuxws [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]]
+; P8BE-DAG: xvcvdpuxws [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]]
; P8BE: vmrgew v2, [[REG3]], [[REG4]]
-; P8BE: xvcvspuxws v2, v2
; P8LE-DAG: xxmrghd {{[vs]+}}[[REG1:[0-9]+]], vs3, vs1
; P8LE-DAG: xxmrghd {{[vs]+}}[[REG2:[0-9]+]], vs4, vs2
-; P8LE-DAG: xvcvdpsp [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]]
-; P8LE-DAG: xvcvdpsp [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]]
+; P8LE-DAG: xvcvdpuxws [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]]
+; P8LE-DAG: xvcvdpuxws [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]]
; P8LE: vmrgew v2, [[REG4]], [[REG3]]
-; P8LE: xvcvspuxws v2, v2
}
; Function Attrs: norecurse nounwind readnone
@@ -2689,28 +2658,24 @@ entry:
; P8LE-LABEL: fromRegsConvdtoui
; P9BE-DAG: xxmrghd {{[vs]+}}[[REG1:[0-9]+]], vs1, vs3
; P9BE-DAG: xxmrghd {{[vs]+}}[[REG2:[0-9]+]], vs2, vs4
-; P9BE-DAG: xvcvdpsp [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]]
-; P9BE-DAG: xvcvdpsp [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]]
+; P9BE-DAG: xvcvdpuxws [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]]
+; P9BE-DAG: xvcvdpuxws [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]]
; P9BE: vmrgew v2, [[REG3]], [[REG4]]
-; P9BE: xvcvspuxws v2, v2
; P9LE-DAG: xxmrghd {{[vs]+}}[[REG1:[0-9]+]], vs3, vs1
; P9LE-DAG: xxmrghd {{[vs]+}}[[REG2:[0-9]+]], vs4, vs2
-; P9LE-DAG: xvcvdpsp [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]]
-; P9LE-DAG: xvcvdpsp [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]]
+; P9LE-DAG: xvcvdpuxws [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]]
+; P9LE-DAG: xvcvdpuxws [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]]
; P9LE: vmrgew v2, [[REG4]], [[REG3]]
-; P9LE: xvcvspuxws v2, v2
; P8BE-DAG: xxmrghd {{[vs]+}}[[REG1:[0-9]+]], vs1, vs3
; P8BE-DAG: xxmrghd {{[vs]+}}[[REG2:[0-9]+]], vs2, vs4
-; P8BE-DAG: xvcvdpsp [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]]
-; P8BE-DAG: xvcvdpsp [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]]
+; P8BE-DAG: xvcvdpuxws [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]]
+; P8BE-DAG: xvcvdpuxws [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]]
; P8BE: vmrgew v2, [[REG3]], [[REG4]]
-; P8BE: xvcvspuxws v2, v2
; P8LE-DAG: xxmrghd {{[vs]+}}[[REG1:[0-9]+]], vs3, vs1
; P8LE-DAG: xxmrghd {{[vs]+}}[[REG2:[0-9]+]], vs4, vs2
-; P8LE-DAG: xvcvdpsp [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]]
-; P8LE-DAG: xvcvdpsp [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]]
+; P8LE-DAG: xvcvdpuxws [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]]
+; P8LE-DAG: xvcvdpuxws [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]]
; P8LE: vmrgew v2, [[REG4]], [[REG3]]
-; P8LE: xvcvspuxws v2, v2
}
; Function Attrs: norecurse nounwind readnone
@@ -2752,36 +2717,32 @@ entry:
; P9BE-DAG: lxv [[REG2:[vs0-9]+]], 16(r3)
; P9BE-DAG: xxmrgld [[REG3:[vs0-9]+]], [[REG1]], [[REG2]]
; P9BE-DAG: xxmrghd [[REG4:[vs0-9]+]], [[REG1]], [[REG2]]
-; P9BE-DAG: xvcvdpsp [[REG5:[vs0-9]+]], [[REG3]]
-; P9BE-DAG: xvcvdpsp [[REG6:[vs0-9]+]], [[REG4]]
+; P9BE-DAG: xvcvdpuxws [[REG5:[vs0-9]+]], [[REG3]]
+; P9BE-DAG: xvcvdpuxws [[REG6:[vs0-9]+]], [[REG4]]
; P9BE: vmrgew v2, [[REG6]], [[REG5]]
-; P9BE: xvcvspuxws v2, v2
; P9LE-DAG: lxv [[REG1:[vs0-9]+]], 0(r3)
; P9LE-DAG: lxv [[REG2:[vs0-9]+]], 16(r3)
-; P9LE-DAG: xxmrgld [[REG3:[vs0-9]+]], [[REG2]], [[REG1]]
; P9LE-DAG: xxmrghd [[REG4:[vs0-9]+]], [[REG2]], [[REG1]]
-; P9LE-DAG: xvcvdpsp [[REG5:[vs0-9]+]], [[REG3]]
-; P9LE-DAG: xvcvdpsp [[REG6:[vs0-9]+]], [[REG4]]
+; P9LE-DAG: xxmrgld [[REG3:[vs0-9]+]], [[REG2]], [[REG1]]
+; P9LE-DAG: xvcvdpuxws [[REG5:[vs0-9]+]], [[REG3]]
+; P9LE-DAG: xvcvdpuxws [[REG6:[vs0-9]+]], [[REG4]]
; P9LE: vmrgew v2, [[REG6]], [[REG5]]
-; P9LE: xvcvspuxws v2, v2
; P8BE: lxvd2x [[REG1:[vs0-9]+]], 0, r3
; P8BE: lxvd2x [[REG2:[vs0-9]+]], r3, r4
; P8BE-DAG: xxmrgld [[REG3:[vs0-9]+]], [[REG1]], [[REG2]]
; P8BE-DAG: xxmrghd [[REG4:[vs0-9]+]], [[REG1]], [[REG2]]
-; P8BE-DAG: xvcvdpsp [[REG5:[vs0-9]+]], [[REG3]]
-; P8BE-DAG: xvcvdpsp [[REG6:[vs0-9]+]], [[REG4]]
+; P8BE-DAG: xvcvdpuxws [[REG5:[vs0-9]+]], [[REG3]]
+; P8BE-DAG: xvcvdpuxws [[REG6:[vs0-9]+]], [[REG4]]
; P8BE: vmrgew v2, [[REG6]], [[REG5]]
-; P8BE: xvcvspuxws v2, v2
; P8LE: lxvd2x [[REG1:[vs0-9]+]], 0, r3
; P8LE: lxvd2x [[REG2:[vs0-9]+]], r3, r4
; P8LE-DAG: xxswapd [[REG3:[vs0-9]+]], [[REG1]]
; P8LE-DAG: xxswapd [[REG4:[vs0-9]+]], [[REG2]]
; P8LE-DAG: xxmrgld [[REG5:[vs0-9]+]], [[REG4]], [[REG3]]
; P8LE-DAG: xxmrghd [[REG6:[vs0-9]+]], [[REG4]], [[REG3]]
-; P8LE-DAG: xvcvdpsp [[REG7:[vs0-9]+]], [[REG5]]
-; P8LE-DAG: xvcvdpsp [[REG8:[vs0-9]+]], [[REG6]]
+; P8LE-DAG: xvcvdpuxws [[REG7:[vs0-9]+]], [[REG5]]
+; P8LE-DAG: xvcvdpuxws [[REG8:[vs0-9]+]], [[REG6]]
; P8LE: vmrgew v2, [[REG8]], [[REG7]]
-; P8LE: xvcvspuxws v2, v2
}
; Function Attrs: norecurse nounwind readonly
@@ -2813,40 +2774,36 @@ entry:
; P9BE: lfd
; P9BE: xxmrghd
; P9BE: xxmrghd
-; P9BE: xvcvdpsp
-; P9BE: xvcvdpsp
-; P9BE: vmrgew
-; P9BE: xvcvspuxws v2
+; P9BE: xvcvdpuxws
+; P9BE: xvcvdpuxws
+; P9BE: vmrgew v2
; P9LE: lfd
; P9LE: lfd
; P9LE: lfd
; P9LE: lfd
; P9LE: xxmrghd
; P9LE: xxmrghd
-; P9LE: xvcvdpsp
-; P9LE: xvcvdpsp
-; P9LE: vmrgew
-; P9LE: xvcvspuxws v2
+; P9LE: xvcvdpuxws
+; P9LE: xvcvdpuxws
+; P9LE: vmrgew v2
; P8BE: lxsdx
; P8BE: lxsdx
; P8BE: lxsdx
; P8BE: lxsdx
; P8BE: xxmrghd
; P8BE: xxmrghd
-; P8BE: xvcvdpsp
-; P8BE: xvcvdpsp
-; P8BE: vmrgew
-; P8BE: xvcvspuxws v2
+; P8BE: xvcvdpuxws
+; P8BE: xvcvdpuxws
+; P8BE: vmrgew v2
; P8LE: lxsdx
; P8LE: lxsdx
; P8LE: lxsdx
; P8LE: lxsdx
; P8LE: xxmrghd
; P8LE: xxmrghd
-; P8LE: xvcvdpsp
-; P8LE: xvcvdpsp
-; P8LE: vmrgew
-; P8LE: xvcvspuxws v2
+; P8LE: xvcvdpuxws
+; P8LE: xvcvdpuxws
+; P8LE: vmrgew v2
}
; Function Attrs: norecurse nounwind readonly
@@ -2886,40 +2843,36 @@ entry:
; P9BE: lfd
; P9BE: xxmrghd
; P9BE: xxmrghd
-; P9BE: xvcvdpsp
-; P9BE: xvcvdpsp
-; P9BE: vmrgew
-; P9BE: xvcvspuxws v2
+; P9BE: xvcvdpuxws
+; P9BE: xvcvdpuxws
+; P9BE: vmrgew v2
; P9LE: lfdux
; P9LE: lfd
; P9LE: lfd
; P9LE: lfd
; P9LE: xxmrghd
; P9LE: xxmrghd
-; P9LE: xvcvdpsp
-; P9LE: xvcvdpsp
-; P9LE: vmrgew
-; P9LE: xvcvspuxws v2
+; P9LE: xvcvdpuxws
+; P9LE: xvcvdpuxws
+; P9LE: vmrgew v2
; P8BE: lfdux
; P8BE: lxsdx
; P8BE: lxsdx
; P8BE: lxsdx
; P8BE: xxmrghd
; P8BE: xxmrghd
-; P8BE: xvcvdpsp
-; P8BE: xvcvdpsp
-; P8BE: vmrgew
-; P8BE: xvcvspuxws v2
+; P8BE: xvcvdpuxws
+; P8BE: xvcvdpuxws
+; P8BE: vmrgew v2
; P8LE: lfdux
; P8LE: lxsdx
; P8LE: lxsdx
; P8LE: lxsdx
; P8LE: xxmrghd
; P8LE: xxmrghd
-; P8LE: xvcvdpsp
-; P8LE: xvcvdpsp
-; P8LE: vmrgew
-; P8LE: xvcvspuxws v2
+; P8LE: xvcvdpuxws
+; P8LE: xvcvdpuxws
+; P8LE: vmrgew v2
}
; Function Attrs: norecurse nounwind readonly
@@ -2959,40 +2912,36 @@ entry:
; P9BE: lfd
; P9BE: xxmrghd
; P9BE: xxmrghd
-; P9BE: xvcvdpsp
-; P9BE: xvcvdpsp
-; P9BE: vmrgew
-; P9BE: xvcvspuxws v2
+; P9BE: xvcvdpuxws
+; P9BE: xvcvdpuxws
+; P9BE: vmrgew v2
; P9LE: lfdux
; P9LE: lfd
; P9LE: lfd
; P9LE: lfd
; P9LE: xxmrghd
; P9LE: xxmrghd
-; P9LE: xvcvdpsp
-; P9LE: xvcvdpsp
-; P9LE: vmrgew
-; P9LE: xvcvspuxws v2
+; P9LE: xvcvdpuxws
+; P9LE: xvcvdpuxws
+; P9LE: vmrgew v2
; P8BE: lfdux
; P8BE: lxsdx
; P8BE: lxsdx
; P8BE: lxsdx
; P8BE: xxmrghd
; P8BE: xxmrghd
-; P8BE: xvcvdpsp
-; P8BE: xvcvdpsp
-; P8BE: vmrgew
-; P8BE: xvcvspuxws v2
+; P8BE: xvcvdpuxws
+; P8BE: xvcvdpuxws
+; P8BE: vmrgew v2
; P8LE: lfdux
; P8LE: lxsdx
; P8LE: lxsdx
; P8LE: lxsdx
; P8LE: xxmrghd
; P8LE: xxmrghd
-; P8LE: xvcvdpsp
-; P8LE: xvcvdpsp
-; P8LE: vmrgew
-; P8LE: xvcvspuxws v2
+; P8LE: xvcvdpuxws
+; P8LE: xvcvdpuxws
+; P8LE: vmrgew v2
}
; Function Attrs: norecurse nounwind readnone
--
1.8.3.1
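
The invalid transformation the patch above removes is easy to reproduce in
plain C: rounding a true f64 value to f32 before the fp-to-int conversion can
change the integer result, which is why the FP_ROUND is now only added when
every input is an extending f32 load. A small self-contained illustration (not
the PR38342 test case itself):

#include <stdio.h>

int main(void) {
  double d = 1.9999999999;      /* truncates to 1 as an integer */
  int direct  = (int)d;         /* 1: convert the double directly */
  int rounded = (int)(float)d;  /* 2: the nearest float to d is 2.0f, so
                                   rounding to f32 first loses precision */
  printf("direct=%d rounded=%d\n", direct, rounded);
  return 0;
}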

0001-SystemZ-TableGen-Fix-shift-count-handling.patch

@@ -0,0 +1,360 @@
From 2ac90db51fc323d183aabe744e57f4feca6d3008 Mon Sep 17 00:00:00 2001
From: Ulrich Weigand <ulrich.weigand@de.ibm.com>
Date: Wed, 1 Aug 2018 11:57:58 +0000
Subject: [PATCH] [SystemZ, TableGen] Fix shift count handling
*Backport of this patch from trunk, with the TableGen fix modified to work
with LLVM 6.0's TableGen.*
The DAG combiner logic to simplify AND masks in shift counts is invalid.
While it is true that the SystemZ shift instructions ignore all but the
low 6 bits of the shift count, it is still invalid to simplify the AND
masks while the DAG still uses the standard shift operators (which are
*not* defined to match the SystemZ instruction behavior).
Instead, this patch performs equivalent operations during instruction
selection. For completely removing the AND, this now happens via
additional DAG match patterns implemented by a multi-alternative
PatFrags. For simplifying a 32-bit AND to a 16-bit AND, the existing DAG
patterns were already mostly OK, they just needed an output XForm to
actually truncate the immediate value.
Unfortunately, the latter change also exposed a bug in TableGen: it
seems XForms are currently only handled correctly for direct operands of
the outermost operation node. This patch also fixes that bug by simply
recursing through the whole pattern. This should be NFC for all other
targets.
Differential Revision: https://reviews.llvm.org/D50096
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@338521 91177308-0d34-0410-b5e6-96231b3b80d8
---
lib/Target/SystemZ/SystemZISelLowering.cpp | 78 ------------------------------
lib/Target/SystemZ/SystemZISelLowering.h | 1 -
lib/Target/SystemZ/SystemZInstrInfo.td | 49 +++++++++++++------
lib/Target/SystemZ/SystemZOperands.td | 1 +
lib/Target/SystemZ/SystemZOperators.td | 6 +++
test/CodeGen/SystemZ/shift-12.ll | 12 +++++
utils/TableGen/CodeGenDAGPatterns.cpp | 39 ++++++++-------
7 files changed, 71 insertions(+), 115 deletions(-)
diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp
index adf3683..505b143 100644
--- a/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -522,10 +522,6 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
setTargetDAGCombine(ISD::FP_ROUND);
setTargetDAGCombine(ISD::BSWAP);
- setTargetDAGCombine(ISD::SHL);
- setTargetDAGCombine(ISD::SRA);
- setTargetDAGCombine(ISD::SRL);
- setTargetDAGCombine(ISD::ROTL);
// Handle intrinsics.
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
@@ -5405,76 +5401,6 @@ SDValue SystemZTargetLowering::combineBSWAP(
return SDValue();
}
-SDValue SystemZTargetLowering::combineSHIFTROT(
- SDNode *N, DAGCombinerInfo &DCI) const {
-
- SelectionDAG &DAG = DCI.DAG;
-
- // Shift/rotate instructions only use the last 6 bits of the second operand
- // register. If the second operand is the result of an AND with an immediate
- // value that has its last 6 bits set, we can safely remove the AND operation.
- //
- // If the AND operation doesn't have the last 6 bits set, we can't remove it
- // entirely, but we can still truncate it to a 16-bit value. This prevents
- // us from ending up with a NILL with a signed operand, which will cause the
- // instruction printer to abort.
- SDValue N1 = N->getOperand(1);
- if (N1.getOpcode() == ISD::AND) {
- SDValue AndMaskOp = N1->getOperand(1);
- auto *AndMask = dyn_cast<ConstantSDNode>(AndMaskOp);
-
- // The AND mask is constant
- if (AndMask) {
- auto AmtVal = AndMask->getZExtValue();
-
- // Bottom 6 bits are set
- if ((AmtVal & 0x3f) == 0x3f) {
- SDValue AndOp = N1->getOperand(0);
-
- // This is the only use, so remove the node
- if (N1.hasOneUse()) {
- // Combine the AND away
- DCI.CombineTo(N1.getNode(), AndOp);
-
- // Return N so it isn't rechecked
- return SDValue(N, 0);
-
- // The node will be reused, so create a new node for this one use
- } else {
- SDValue Replace = DAG.getNode(N->getOpcode(), SDLoc(N),
- N->getValueType(0), N->getOperand(0),
- AndOp);
- DCI.AddToWorklist(Replace.getNode());
-
- return Replace;
- }
-
- // We can't remove the AND, but we can use NILL here (normally we would
- // use NILF). Only keep the last 16 bits of the mask. The actual
- // transformation will be handled by .td definitions.
- } else if (AmtVal >> 16 != 0) {
- SDValue AndOp = N1->getOperand(0);
-
- auto NewMask = DAG.getConstant(AndMask->getZExtValue() & 0x0000ffff,
- SDLoc(AndMaskOp),
- AndMaskOp.getValueType());
-
- auto NewAnd = DAG.getNode(N1.getOpcode(), SDLoc(N1), N1.getValueType(),
- AndOp, NewMask);
-
- SDValue Replace = DAG.getNode(N->getOpcode(), SDLoc(N),
- N->getValueType(0), N->getOperand(0),
- NewAnd);
- DCI.AddToWorklist(Replace.getNode());
-
- return Replace;
- }
- }
- }
-
- return SDValue();
-}
-
SDValue SystemZTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
switch(N->getOpcode()) {
@@ -5487,10 +5413,6 @@ SDValue SystemZTargetLowering::PerformDAGCombine(SDNode *N,
case SystemZISD::JOIN_DWORDS: return combineJOIN_DWORDS(N, DCI);
case ISD::FP_ROUND: return combineFP_ROUND(N, DCI);
case ISD::BSWAP: return combineBSWAP(N, DCI);
- case ISD::SHL:
- case ISD::SRA:
- case ISD::SRL:
- case ISD::ROTL: return combineSHIFTROT(N, DCI);
}
return SDValue();
diff --git a/lib/Target/SystemZ/SystemZISelLowering.h b/lib/Target/SystemZ/SystemZISelLowering.h
index 2cdc88d..1918d45 100644
--- a/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/lib/Target/SystemZ/SystemZISelLowering.h
@@ -570,7 +570,6 @@ private:
SDValue combineJOIN_DWORDS(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineFP_ROUND(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineBSWAP(SDNode *N, DAGCombinerInfo &DCI) const;
- SDValue combineSHIFTROT(SDNode *N, DAGCombinerInfo &DCI) const;
// If the last instruction before MBBI in MBB was some form of COMPARE,
// try to replace it with a COMPARE AND BRANCH just before MBBI.
diff --git a/lib/Target/SystemZ/SystemZInstrInfo.td b/lib/Target/SystemZ/SystemZInstrInfo.td
index abb8045..fb40cb4 100644
--- a/lib/Target/SystemZ/SystemZInstrInfo.td
+++ b/lib/Target/SystemZ/SystemZInstrInfo.td
@@ -1318,9 +1318,20 @@ def : Pat<(z_udivrem GR64:$src1, (i64 (load bdxaddr20only:$src2))),
// Shifts
//===----------------------------------------------------------------------===//
+// Complexity is 8 so we match it before the NILL patterns below.
+let AddedComplexity = 8 in {
+
+class ShiftAndPat <SDNode node, Instruction inst, ValueType vt> : Pat <
+ (node vt:$val, (and i32:$count, imm32bottom6set)),
+ (inst vt:$val, i32:$count, 0)
+>;
+}
+
// Logical shift left.
defm SLL : BinaryRSAndK<"sll", 0x89, 0xEBDF, shl, GR32>;
+def : ShiftAndPat <shl, SLL, i32>;
def SLLG : BinaryRSY<"sllg", 0xEB0D, shl, GR64>;
+def : ShiftAndPat <shl, SLLG, i64>;
def SLDL : BinaryRS<"sldl", 0x8D, null_frag, GR128>;
// Arithmetic shift left.
@@ -1332,7 +1343,9 @@ let Defs = [CC] in {
// Logical shift right.
defm SRL : BinaryRSAndK<"srl", 0x88, 0xEBDE, srl, GR32>;
+def : ShiftAndPat <srl, SRL, i32>;
def SRLG : BinaryRSY<"srlg", 0xEB0C, srl, GR64>;
+def : ShiftAndPat <srl, SRLG, i64>;
def SRDL : BinaryRS<"srdl", 0x8C, null_frag, GR128>;
// Arithmetic shift right.
@@ -1341,10 +1354,14 @@ let Defs = [CC], CCValues = 0xE, CompareZeroCCMask = 0xE in {
def SRAG : BinaryRSY<"srag", 0xEB0A, sra, GR64>;
def SRDA : BinaryRS<"srda", 0x8E, null_frag, GR128>;
}
+def : ShiftAndPat <sra, SRA, i32>;
+def : ShiftAndPat <sra, SRAG, i64>;
// Rotate left.
def RLL : BinaryRSY<"rll", 0xEB1D, rotl, GR32>;
+def : ShiftAndPat <rotl, RLL, i32>;
def RLLG : BinaryRSY<"rllg", 0xEB1C, rotl, GR64>;
+def : ShiftAndPat <rotl, RLLG, i64>;
// Rotate second operand left and insert selected bits into first operand.
// These can act like 32-bit operands provided that the constant start and
@@ -2154,29 +2171,29 @@ def : Pat<(and (xor GR64:$x, (i64 -1)), GR64:$y),
// Complexity is added so that we match this before we match NILF on the AND
// operation alone.
let AddedComplexity = 4 in {
- def : Pat<(shl GR32:$val, (and GR32:$shift, uimm32:$imm)),
- (SLL GR32:$val, (NILL GR32:$shift, uimm32:$imm), 0)>;
+ def : Pat<(shl GR32:$val, (and GR32:$shift, imm32zx16trunc:$imm)),
+ (SLL GR32:$val, (NILL GR32:$shift, imm32zx16trunc:$imm), 0)>;
- def : Pat<(sra GR32:$val, (and GR32:$shift, uimm32:$imm)),
- (SRA GR32:$val, (NILL GR32:$shift, uimm32:$imm), 0)>;
+ def : Pat<(sra GR32:$val, (and GR32:$shift, imm32zx16trunc:$imm)),
+ (SRA GR32:$val, (NILL GR32:$shift, imm32zx16trunc:$imm), 0)>;
- def : Pat<(srl GR32:$val, (and GR32:$shift, uimm32:$imm)),
- (SRL GR32:$val, (NILL GR32:$shift, uimm32:$imm), 0)>;
+ def : Pat<(srl GR32:$val, (and GR32:$shift, imm32zx16trunc:$imm)),
+ (SRL GR32:$val, (NILL GR32:$shift, imm32zx16trunc:$imm), 0)>;
- def : Pat<(shl GR64:$val, (and GR32:$shift, uimm32:$imm)),
- (SLLG GR64:$val, (NILL GR32:$shift, uimm32:$imm), 0)>;
+ def : Pat<(shl GR64:$val, (and GR32:$shift, imm32zx16trunc:$imm)),
+ (SLLG GR64:$val, (NILL GR32:$shift, imm32zx16trunc:$imm), 0)>;
- def : Pat<(sra GR64:$val, (and GR32:$shift, uimm32:$imm)),
- (SRAG GR64:$val, (NILL GR32:$shift, uimm32:$imm), 0)>;
+ def : Pat<(sra GR64:$val, (and GR32:$shift, imm32zx16trunc:$imm)),
+ (SRAG GR64:$val, (NILL GR32:$shift, imm32zx16trunc:$imm), 0)>;
- def : Pat<(srl GR64:$val, (and GR32:$shift, uimm32:$imm)),
- (SRLG GR64:$val, (NILL GR32:$shift, uimm32:$imm), 0)>;
+ def : Pat<(srl GR64:$val, (and GR32:$shift, imm32zx16trunc:$imm)),
+ (SRLG GR64:$val, (NILL GR32:$shift, imm32zx16trunc:$imm), 0)>;
- def : Pat<(rotl GR32:$val, (and GR32:$shift, uimm32:$imm)),
- (RLL GR32:$val, (NILL GR32:$shift, uimm32:$imm), 0)>;
+ def : Pat<(rotl GR32:$val, (and GR32:$shift, imm32zx16trunc:$imm)),
+ (RLL GR32:$val, (NILL GR32:$shift, imm32zx16trunc:$imm), 0)>;
- def : Pat<(rotl GR64:$val, (and GR32:$shift, uimm32:$imm)),
- (RLLG GR64:$val, (NILL GR32:$shift, uimm32:$imm), 0)>;
+ def : Pat<(rotl GR64:$val, (and GR32:$shift, imm32zx16trunc:$imm)),
+ (RLLG GR64:$val, (NILL GR32:$shift, imm32zx16trunc:$imm), 0)>;
}
// Peepholes for turning scalar operations into block operations.
diff --git a/lib/Target/SystemZ/SystemZOperands.td b/lib/Target/SystemZ/SystemZOperands.td
index 7136121..61a1124 100644
--- a/lib/Target/SystemZ/SystemZOperands.td
+++ b/lib/Target/SystemZ/SystemZOperands.td
@@ -341,6 +341,7 @@ def imm32zx16 : Immediate<i32, [{
}], UIMM16, "U16Imm">;
def imm32sx16trunc : Immediate<i32, [{}], SIMM16, "S16Imm">;
+def imm32zx16trunc : Immediate<i32, [{}], UIMM16, "U16Imm">;
// Full 32-bit immediates. We need both signed and unsigned versions
// because the assembler is picky. E.g. AFI requires signed operands
diff --git a/lib/Target/SystemZ/SystemZOperators.td b/lib/Target/SystemZ/SystemZOperators.td
index d067f33..269c3d0 100644
--- a/lib/Target/SystemZ/SystemZOperators.td
+++ b/lib/Target/SystemZ/SystemZOperators.td
@@ -611,6 +611,12 @@ class storei<SDPatternOperator operator, SDPatternOperator store = store>
: PatFrag<(ops node:$addr),
(store (operator), node:$addr)>;
+// Create a shift operator that optionally ignores an AND of the
+// shift count with an immediate if the bottom 6 bits are all set.
+def imm32bottom6set : PatLeaf<(i32 imm), [{
+ return (N->getZExtValue() & 0x3f) == 0x3f;
+}]>;
+
// Vector representation of all-zeros and all-ones.
def z_vzero : PatFrag<(ops), (bitconvert (v16i8 (z_byte_mask (i32 0))))>;
def z_vones : PatFrag<(ops), (bitconvert (v16i8 (z_byte_mask (i32 65535))))>;
diff --git a/test/CodeGen/SystemZ/shift-12.ll b/test/CodeGen/SystemZ/shift-12.ll
index 4ebc42b..53d3d53 100644
--- a/test/CodeGen/SystemZ/shift-12.ll
+++ b/test/CodeGen/SystemZ/shift-12.ll
@@ -104,3 +104,15 @@ define i32 @f10(i32 %a, i32 %sh) {
%reuse = add i32 %and, %shift
ret i32 %reuse
}
+
+; Test that AND is not removed for i128 (which calls __ashlti3)
+define i128 @f11(i128 %a, i32 %sh) {
+; CHECK-LABEL: f11:
+; CHECK: risbg %r4, %r4, 57, 191, 0
+; CHECK: brasl %r14, __ashlti3@PLT
+ %and = and i32 %sh, 127
+ %ext = zext i32 %and to i128
+ %shift = shl i128 %a, %ext
+ ret i128 %shift
+}
+
diff --git a/utils/TableGen/CodeGenDAGPatterns.cpp b/utils/TableGen/CodeGenDAGPatterns.cpp
index 493066e..74af62b 100644
--- a/utils/TableGen/CodeGenDAGPatterns.cpp
+++ b/utils/TableGen/CodeGenDAGPatterns.cpp
@@ -3919,6 +3919,24 @@ static bool ForceArbitraryInstResultType(TreePatternNode *N, TreePattern &TP) {
return false;
}
+// Promote xform function to be an explicit node wherever set.
+static TreePatternNode* PromoteXForms(TreePatternNode* N) {
+ if (Record *Xform = N->getTransformFn()) {
+ N->setTransformFn(nullptr);
+ std::vector<TreePatternNode*> Children;
+ Children.push_back(PromoteXForms(N));
+ return new TreePatternNode(Xform, std::move(Children),
+ N->getNumTypes());
+ }
+
+ if (!N->isLeaf())
+ for (unsigned i = 0, e = N->getNumChildren(); i != e; ++i) {
+ TreePatternNode* Child = N->getChild(i);
+ N->setChild(i, std::move(PromoteXForms(Child)));
+ }
+ return N;
+}
+
void CodeGenDAGPatterns::ParsePatterns() {
std::vector<Record*> Patterns = Records.getAllDerivedDefinitions("Pattern");
@@ -4009,26 +4027,7 @@ void CodeGenDAGPatterns::ParsePatterns() {
InstImpResults);
// Promote the xform function to be an explicit node if set.
- TreePatternNode *DstPattern = Result.getOnlyTree();
- std::vector<TreePatternNode*> ResultNodeOperands;
- for (unsigned ii = 0, ee = DstPattern->getNumChildren(); ii != ee; ++ii) {
- TreePatternNode *OpNode = DstPattern->getChild(ii);
- if (Record *Xform = OpNode->getTransformFn()) {
- OpNode->setTransformFn(nullptr);
- std::vector<TreePatternNode*> Children;
- Children.push_back(OpNode);
- OpNode = new TreePatternNode(Xform, Children, OpNode->getNumTypes());
- }
- ResultNodeOperands.push_back(OpNode);
- }
- DstPattern = Result.getOnlyTree();
- if (!DstPattern->isLeaf())
- DstPattern = new TreePatternNode(DstPattern->getOperator(),
- ResultNodeOperands,
- DstPattern->getNumTypes());
-
- for (unsigned i = 0, e = Result.getOnlyTree()->getNumTypes(); i != e; ++i)
- DstPattern->setType(i, Result.getOnlyTree()->getExtType(i));
+ TreePatternNode* DstPattern = PromoteXForms(Result.getOnlyTree());
TreePattern Temp(Result.getRecord(), DstPattern, false, *this);
Temp.InferAllTypes();
--
1.8.3.1
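
In C terms, the distinction the patch above enforces looks like the sketch
below (a hand-written illustration mirroring the new shift-12.ll test, not
code from the patch). For 64-bit shifts the AND of the count is redundant
because the SystemZ shift instructions read only the low 6 bits, so it can be
folded away safely during instruction selection; for an i128 shift, which is
lowered to a library call, the AND is semantically required, which is why
removing it in the generic DAG combiner was unsound:

unsigned long shl64(unsigned long x, unsigned int n) {
  /* SLLG ignores all but the low 6 bits of the count; with this patch
     the AND is matched away during instruction selection (ShiftAndPat)
     instead of by the removed, invalid generic combineSHIFTROT. */
  return x << (n & 63);
}

unsigned __int128 shl128(unsigned __int128 x, unsigned int n) {
  /* An i128 shift is lowered to a call to __ashlti3, which uses all
     7 bits of the count: here the AND is semantically required and
     must survive (see the f11 test above). */
  return x << (n & 127);
}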

llvm.spec

@@ -30,7 +30,7 @@
Name: %{pkg_name}
Version: %{maj_ver}.%{min_ver}.%{patch_ver}
-Release: 5%{?dist}
+Release: 6%{?dist}
Summary: The Low Level Virtual Machine
License: NCSA
@@ -43,6 +43,10 @@
Patch3: 0001-CMake-Split-static-library-exports-into-their-own-ex.patch
Patch7: 0001-Filter-out-cxxflags-not-supported-by-clang.patch
Patch9: 0001-Export-LLVM_DYLIB_COMPONENTS-in-LLVMConfig.cmake.patch
+Patch10: 0001-Don-t-run-BV-DAG-Combine-before-legalization-if-it-a.patch
+Patch11: 0001-PowerPC-Do-not-round-values-prior-to-converting-to-i.patch
+Patch12: 0001-SystemZ-TableGen-Fix-shift-count-handling.patch
+
BuildRequires: gcc
BuildRequires: gcc-c++
BuildRequires: cmake
@@ -310,6 +314,9 @@
fi
%endif
%changelog
+* Mon Aug 06 2018 Tom Stellard <tstellar@redhat.com> - 6.0.1-6
+- Backport some fixes needed by mesa and rust
+
* Thu Jul 26 2018 Tom Stellard <tstellar@redhat.com> - 6.0.1-5
- Move libLLVM-6.0.so to llvm6.0-libs.