diff --git a/0001-Don-t-run-BV-DAG-Combine-before-legalization-if-it-a.patch b/0001-Don-t-run-BV-DAG-Combine-before-legalization-if-it-a.patch new file mode 100644 index 0000000..31d2b44 --- /dev/null +++ b/0001-Don-t-run-BV-DAG-Combine-before-legalization-if-it-a.patch @@ -0,0 +1,47 @@ +From 2eb830fed5b813c5624e770c244eec61dacb04d7 Mon Sep 17 00:00:00 2001 +From: Tom Stellard +Date: Mon, 9 Jul 2018 10:35:30 -0700 +Subject: [PATCH] Don't run BV DAG Combine before legalization if it assumes + legal types + +--- + lib/Target/PowerPC/PPCISelLowering.cpp | 13 ++++++++++--- + 1 file changed, 10 insertions(+), 3 deletions(-) + +diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp +index 26e9f13..f622b05 100644 +--- a/lib/Target/PowerPC/PPCISelLowering.cpp ++++ b/lib/Target/PowerPC/PPCISelLowering.cpp +@@ -11790,10 +11790,15 @@ static SDValue combineBVOfVecSExt(SDNode *N, SelectionDAG &DAG) { + auto isSExtOfVecExtract = [&](SDValue Op) -> bool { + if (!Op) + return false; +- if (Op.getOpcode() != ISD::SIGN_EXTEND) ++ if (Op.getOpcode() != ISD::SIGN_EXTEND && ++ Op.getOpcode() != ISD::SIGN_EXTEND_INREG) + return false; + ++ // A SIGN_EXTEND_INREG might be fed by an ANY_EXTEND to produce a value ++ // of the right width. + SDValue Extract = Op.getOperand(0); ++ if (Extract.getOpcode() == ISD::ANY_EXTEND) ++ Extract = Extract.getOperand(0); + if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT) + return false; + +@@ -11881,8 +11886,10 @@ SDValue PPCTargetLowering::DAGCombineBuildVector(SDNode *N, + return Reduced; + + // If we're building a vector out of extended elements from another vector +- // we have P9 vector integer extend instructions. +- if (Subtarget.hasP9Altivec()) { ++ // we have P9 vector integer extend instructions. The code assumes legal ++ // input types (i.e. it can't handle things like v4i16) so do not run before ++ // legalization. 
++ if (Subtarget.hasP9Altivec() && !DCI.isBeforeLegalize()) { + Reduced = combineBVOfVecSExt(N, DAG); + if (Reduced) + return Reduced; +-- +1.8.3.1 + diff --git a/0001-PowerPC-Do-not-round-values-prior-to-converting-to-i.patch b/0001-PowerPC-Do-not-round-values-prior-to-converting-to-i.patch new file mode 100644 index 0000000..4c97cc6 --- /dev/null +++ b/0001-PowerPC-Do-not-round-values-prior-to-converting-to-i.patch @@ -0,0 +1,919 @@ +From 88ad713b81c2f51dd8405b251f9825b0bca6e57d Mon Sep 17 00:00:00 2001 +From: Nemanja Ivanovic +Date: Thu, 2 Aug 2018 00:03:22 +0000 +Subject: [PATCH] [PowerPC] Do not round values prior to converting to integer + +Adding the FP_ROUND nodes when combining FP_TO_[SU]INT of elements +feeding a BUILD_VECTOR into an FP_TO_[SU]INT of the built vector +loses precision. This patch removes the code that adds these nodes +to true f64 operands. It also adds patterns required to ensure +the code is still vectorized rather than converting individual +elements and inserting into a vector. + +Fixes https://bugs.llvm.org/show_bug.cgi?id=38342 + +Differential Revision: https://reviews.llvm.org/D50121 + +git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@338658 91177308-0d34-0410-b5e6-96231b3b80d8 +--- + lib/Target/PowerPC/PPCISelLowering.cpp | 22 +- + lib/Target/PowerPC/PPCInstrVSX.td | 86 +++++++ + test/CodeGen/PowerPC/build-vector-tests.ll | 357 +++++++++++++---------------- + 3 files changed, 258 insertions(+), 207 deletions(-) + +diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp +index f622b05..527ec5a 100644 +--- a/lib/Target/PowerPC/PPCISelLowering.cpp ++++ b/lib/Target/PowerPC/PPCISelLowering.cpp +@@ -11560,6 +11560,14 @@ SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N, + ShiftCst); + } + ++// Is this an extending load from an f32 to an f64? 
++static bool isFPExtLoad(SDValue Op) { ++ if (LoadSDNode *LD = dyn_cast(Op.getNode())) ++ return LD->getExtensionType() == ISD::EXTLOAD && ++ Op.getValueType() == MVT::f64; ++ return false; ++} ++ + /// \brief Reduces the number of fp-to-int conversion when building a vector. + /// + /// If this vector is built out of floating to integer conversions, +@@ -11594,11 +11602,18 @@ combineElementTruncationToVectorTruncation(SDNode *N, + SmallVector Ops; + EVT TargetVT = N->getValueType(0); + for (int i = 0, e = N->getNumOperands(); i < e; ++i) { +- if (N->getOperand(i).getOpcode() != PPCISD::MFVSR) ++ SDValue NextOp = N->getOperand(i); ++ if (NextOp.getOpcode() != PPCISD::MFVSR) + return SDValue(); +- unsigned NextConversion = N->getOperand(i).getOperand(0).getOpcode(); ++ unsigned NextConversion = NextOp.getOperand(0).getOpcode(); + if (NextConversion != FirstConversion) + return SDValue(); ++ // If we are converting to 32-bit integers, we need to add an FP_ROUND. ++ // This is not valid if the input was originally double precision. It is ++ // also not profitable to do unless this is an extending load in which ++ // case doing this combine will allow us to combine consecutive loads. ++ if (Is32Bit && !isFPExtLoad(NextOp.getOperand(0).getOperand(0))) ++ return SDValue(); + if (N->getOperand(i) != FirstInput) + IsSplat = false; + } +@@ -11612,8 +11627,9 @@ combineElementTruncationToVectorTruncation(SDNode *N, + // Now that we know we have the right type of node, get its operands + for (int i = 0, e = N->getNumOperands(); i < e; ++i) { + SDValue In = N->getOperand(i).getOperand(0); +- // For 32-bit values, we need to add an FP_ROUND node. + if (Is32Bit) { ++ // For 32-bit values, we need to add an FP_ROUND node (if we made it ++ // here, we know that all inputs are extending loads so this is safe). 
+ if (In.isUndef()) + Ops.push_back(DAG.getUNDEF(SrcVT)); + else { +diff --git a/lib/Target/PowerPC/PPCInstrVSX.td b/lib/Target/PowerPC/PPCInstrVSX.td +index 6f71978..1f48473 100644 +--- a/lib/Target/PowerPC/PPCInstrVSX.td ++++ b/lib/Target/PowerPC/PPCInstrVSX.td +@@ -3100,6 +3100,17 @@ def DblToFlt { + dag B1 = (f32 (fpround (f64 (extractelt v2f64:$B, 1)))); + } + ++def ExtDbl { ++ dag A0S = (i32 (PPCmfvsr (f64 (PPCfctiwz (f64 (extractelt v2f64:$A, 0)))))); ++ dag A1S = (i32 (PPCmfvsr (f64 (PPCfctiwz (f64 (extractelt v2f64:$A, 1)))))); ++ dag B0S = (i32 (PPCmfvsr (f64 (PPCfctiwz (f64 (extractelt v2f64:$B, 0)))))); ++ dag B1S = (i32 (PPCmfvsr (f64 (PPCfctiwz (f64 (extractelt v2f64:$B, 1)))))); ++ dag A0U = (i32 (PPCmfvsr (f64 (PPCfctiwuz (f64 (extractelt v2f64:$A, 0)))))); ++ dag A1U = (i32 (PPCmfvsr (f64 (PPCfctiwuz (f64 (extractelt v2f64:$A, 1)))))); ++ dag B0U = (i32 (PPCmfvsr (f64 (PPCfctiwuz (f64 (extractelt v2f64:$B, 0)))))); ++ dag B1U = (i32 (PPCmfvsr (f64 (PPCfctiwuz (f64 (extractelt v2f64:$B, 1)))))); ++} ++ + def ByteToWord { + dag LE_A0 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 0)), i8)); + dag LE_A1 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 4)), i8)); +@@ -3177,9 +3188,15 @@ def FltToULong { + } + def DblToInt { + dag A = (i32 (PPCmfvsr (f64 (PPCfctiwz f64:$A)))); ++ dag B = (i32 (PPCmfvsr (f64 (PPCfctiwz f64:$B)))); ++ dag C = (i32 (PPCmfvsr (f64 (PPCfctiwz f64:$C)))); ++ dag D = (i32 (PPCmfvsr (f64 (PPCfctiwz f64:$D)))); + } + def DblToUInt { + dag A = (i32 (PPCmfvsr (f64 (PPCfctiwuz f64:$A)))); ++ dag B = (i32 (PPCmfvsr (f64 (PPCfctiwuz f64:$B)))); ++ dag C = (i32 (PPCmfvsr (f64 (PPCfctiwuz f64:$C)))); ++ dag D = (i32 (PPCmfvsr (f64 (PPCfctiwuz f64:$D)))); + } + def DblToLong { + dag A = (i64 (PPCmfvsr (f64 (PPCfctidz f64:$A)))); +@@ -3218,6 +3235,47 @@ def MrgFP { + dag BAlToFlt = (XVCVDPSP (XXPERMDI $B, $A, 3)); + } + ++// Word-element merge dags - conversions from f64 to i32 merged into vectors. 
++def MrgWords { ++ // For big endian, we merge low and hi doublewords (A, B). ++ dag A0B0 = (v2f64 (XXPERMDI v2f64:$A, v2f64:$B, 0)); ++ dag A1B1 = (v2f64 (XXPERMDI v2f64:$A, v2f64:$B, 3)); ++ dag CVA1B1S = (v4i32 (XVCVDPSXWS A1B1)); ++ dag CVA0B0S = (v4i32 (XVCVDPSXWS A0B0)); ++ dag CVA1B1U = (v4i32 (XVCVDPUXWS A1B1)); ++ dag CVA0B0U = (v4i32 (XVCVDPUXWS A0B0)); ++ ++ // For little endian, we merge low and hi doublewords (B, A). ++ dag B1A1 = (v2f64 (XXPERMDI v2f64:$B, v2f64:$A, 0)); ++ dag B0A0 = (v2f64 (XXPERMDI v2f64:$B, v2f64:$A, 3)); ++ dag CVB1A1S = (v4i32 (XVCVDPSXWS B1A1)); ++ dag CVB0A0S = (v4i32 (XVCVDPSXWS B0A0)); ++ dag CVB1A1U = (v4i32 (XVCVDPUXWS B1A1)); ++ dag CVB0A0U = (v4i32 (XVCVDPUXWS B0A0)); ++ ++ // For big endian, we merge hi doublewords of (A, C) and (B, D), convert ++ // then merge. ++ dag AC = (v2f64 (XXPERMDI (COPY_TO_REGCLASS f64:$A, VSRC), ++ (COPY_TO_REGCLASS f64:$C, VSRC), 0)); ++ dag BD = (v2f64 (XXPERMDI (COPY_TO_REGCLASS f64:$B, VSRC), ++ (COPY_TO_REGCLASS f64:$D, VSRC), 0)); ++ dag CVACS = (v4i32 (XVCVDPSXWS AC)); ++ dag CVBDS = (v4i32 (XVCVDPSXWS BD)); ++ dag CVACU = (v4i32 (XVCVDPUXWS AC)); ++ dag CVBDU = (v4i32 (XVCVDPUXWS BD)); ++ ++ // For little endian, we merge hi doublewords of (D, B) and (C, A), convert ++ // then merge. ++ dag DB = (v2f64 (XXPERMDI (COPY_TO_REGCLASS f64:$D, VSRC), ++ (COPY_TO_REGCLASS f64:$B, VSRC), 0)); ++ dag CA = (v2f64 (XXPERMDI (COPY_TO_REGCLASS f64:$C, VSRC), ++ (COPY_TO_REGCLASS f64:$A, VSRC), 0)); ++ dag CVDBS = (v4i32 (XVCVDPSXWS DB)); ++ dag CVCAS = (v4i32 (XVCVDPSXWS CA)); ++ dag CVDBU = (v4i32 (XVCVDPUXWS DB)); ++ dag CVCAU = (v4i32 (XVCVDPUXWS CA)); ++} ++ + // Patterns for BUILD_VECTOR nodes. 
+ def NoP9Vector : Predicate<"!PPCSubTarget->hasP9Vector()">; + let AddedComplexity = 400 in { +@@ -3286,6 +3344,20 @@ let AddedComplexity = 400 in { + def : Pat<(v4f32 (build_vector DblToFlt.A0, DblToFlt.A1, + DblToFlt.B0, DblToFlt.B1)), + (v4f32 (VMRGEW MrgFP.ABhToFlt, MrgFP.ABlToFlt))>; ++ ++ // Convert 4 doubles to a vector of ints. ++ def : Pat<(v4i32 (build_vector DblToInt.A, DblToInt.B, ++ DblToInt.C, DblToInt.D)), ++ (v4i32 (VMRGEW MrgWords.CVACS, MrgWords.CVBDS))>; ++ def : Pat<(v4i32 (build_vector DblToUInt.A, DblToUInt.B, ++ DblToUInt.C, DblToUInt.D)), ++ (v4i32 (VMRGEW MrgWords.CVACU, MrgWords.CVBDU))>; ++ def : Pat<(v4i32 (build_vector ExtDbl.A0S, ExtDbl.A1S, ++ ExtDbl.B0S, ExtDbl.B1S)), ++ (v4i32 (VMRGEW MrgWords.CVA0B0S, MrgWords.CVA1B1S))>; ++ def : Pat<(v4i32 (build_vector ExtDbl.A0U, ExtDbl.A1U, ++ ExtDbl.B0U, ExtDbl.B1U)), ++ (v4i32 (VMRGEW MrgWords.CVA0B0U, MrgWords.CVA1B1U))>; + } + + let Predicates = [IsLittleEndian, HasVSX] in { +@@ -3300,6 +3372,20 @@ let AddedComplexity = 400 in { + def : Pat<(v4f32 (build_vector DblToFlt.A0, DblToFlt.A1, + DblToFlt.B0, DblToFlt.B1)), + (v4f32 (VMRGEW MrgFP.BAhToFlt, MrgFP.BAlToFlt))>; ++ ++ // Convert 4 doubles to a vector of ints. 
++ def : Pat<(v4i32 (build_vector DblToInt.A, DblToInt.B, ++ DblToInt.C, DblToInt.D)), ++ (v4i32 (VMRGEW MrgWords.CVDBS, MrgWords.CVCAS))>; ++ def : Pat<(v4i32 (build_vector DblToUInt.A, DblToUInt.B, ++ DblToUInt.C, DblToUInt.D)), ++ (v4i32 (VMRGEW MrgWords.CVDBU, MrgWords.CVCAU))>; ++ def : Pat<(v4i32 (build_vector ExtDbl.A0S, ExtDbl.A1S, ++ ExtDbl.B0S, ExtDbl.B1S)), ++ (v4i32 (VMRGEW MrgWords.CVB1A1S, MrgWords.CVB0A0S))>; ++ def : Pat<(v4i32 (build_vector ExtDbl.A0U, ExtDbl.A1U, ++ ExtDbl.B0U, ExtDbl.B1U)), ++ (v4i32 (VMRGEW MrgWords.CVB1A1U, MrgWords.CVB0A0U))>; + } + + let Predicates = [HasDirectMove] in { +diff --git a/test/CodeGen/PowerPC/build-vector-tests.ll b/test/CodeGen/PowerPC/build-vector-tests.ll +index 16b562b..3785b2a 100644 +--- a/test/CodeGen/PowerPC/build-vector-tests.ll ++++ b/test/CodeGen/PowerPC/build-vector-tests.ll +@@ -119,8 +119,8 @@ + ;vector int spltCnstConvftoi() { // + ; return (vector int) 4.74f; // + ;} // +-;// P8: 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspsxws // +-;// P9: 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvdpsxws // ++;// P8: 2 x xxmrghd, 2 x xvcvspsxws, vmrgew // ++;// P9: 2 x xxmrghd, 2 x xvcvspsxws, vmrgew // + ;vector int fromRegsConvftoi(float a, float b, float c, float d) { // + ; return (vector int) { a, b, c, d }; // + ;} // +@@ -139,15 +139,15 @@ + ;vector int fromDiffMemConsDConvftoi(float *ptr) { // + ; return (vector int) { ptr[3], ptr[2], ptr[1], ptr[0] }; // + ;} // +-;// P8: 4 x lxsspx, 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspsxws // +-;// P9: 4 x lxssp, 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspsxws // ++;// P8: 4 x lxsspx, 2 x xxmrghd, 2 x xvcvspsxws, vmrgew // ++;// P9: 4 x lxssp, 2 x xxmrghd, 2 x xvcvspsxws, vmrgew // + ;// Note: if the consecutive loads learns to handle pre-inc, this can be: // + ;// sldi 2, load, xvcvspuxws // + ;vector int fromDiffMemVarAConvftoi(float *arr, int elem) { // + ; return (vector int) { arr[elem], arr[elem+1], arr[elem+2], arr[elem+3] }; // + ;} // +-;// P8: 4 x lxsspx, 2 x 
xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspsxws // +-;// P9: 4 x lxssp, 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspsxws // ++;// P8: 4 x lxsspx, 2 x xxmrghd, 2 x xvcvspsxws, vmrgew // ++;// P9: 4 x lxssp, 2 x xxmrghd, 2 x xvcvspsxws, vmrgew // + ;// Note: if the consecutive loads learns to handle pre-inc, this can be: // + ;// sldi 2, 2 x load, vperm, xvcvspuxws // + ;vector int fromDiffMemVarDConvftoi(float *arr, int elem) { // +@@ -168,8 +168,8 @@ + ;vector int spltCnstConvdtoi() { // + ; return (vector int) 4.74; // + ;} // +-;// P8: 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspsxws // +-;// P9: 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspsxws // ++;// P8: 2 x xxmrghd, 2 x xvcvspsxws, vmrgew // ++;// P9: 2 x xxmrghd, 2 x xvcvspsxws, vmrgew // + ;vector int fromRegsConvdtoi(double a, double b, double c, double d) { // + ; return (vector int) { a, b, c, d }; // + ;} // +@@ -178,25 +178,23 @@ + ;vector int fromDiffConstsConvdtoi() { // + ; return (vector int) { 24.46, 234., 988.19, 422.39 }; // + ;} // +-;// P8: 2 x lxvd2x, 2 x xxswapd, xxmrgld, xxmrghd, 2 x xvcvdpsp, vmrgew, // +-;// xvcvspsxws // +-;// P9: 2 x lxvx, 2 x xxswapd, xxmrgld, xxmrghd, 2 x xvcvdpsp, vmrgew, // +-;// xvcvspsxws // ++;// P8: 2 x lxvd2x, 2 x xxswapd, xxmrgld, xxmrghd, 2 x xvcvspsxws, vmrgew // ++;// P9: 2 x lxvx, 2 x xxswapd, xxmrgld, xxmrghd, 2 x xvcvspsxws, vmrgew // + ;vector int fromDiffMemConsAConvdtoi(double *ptr) { // + ; return (vector int) { ptr[0], ptr[1], ptr[2], ptr[3] }; // + ;} // +-;// P8: 4 x lxsdx, 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspsxws // +-;// P9: 4 x lfd, 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspsxws // ++;// P8: 4 x lxsdx, 2 x xxmrghd, 2 x xvcvspsxws, vmrgew // ++;// P9: 4 x lfd, 2 x xxmrghd, 2 x xvcvspsxws, vmrgew // + ;vector int fromDiffMemConsDConvdtoi(double *ptr) { // + ; return (vector int) { ptr[3], ptr[2], ptr[1], ptr[0] }; // + ;} // +-;// P8: lfdux, 3 x lxsdx, 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspsxws // +-;// P9: lfdux, 3 x lfd, 2 x xxmrghd, 2 x xvcvdpsp, 
vmrgew, xvcvspsxws // ++;// P8: lfdux, 3 x lxsdx, 2 x xxmrghd, 2 x xvcvspsxws, vmrgew // ++;// P9: lfdux, 3 x lfd, 2 x xxmrghd, 2 x xvcvspsxws, vmrgew // + ;vector int fromDiffMemVarAConvdtoi(double *arr, int elem) { // + ; return (vector int) { arr[elem], arr[elem+1], arr[elem+2], arr[elem+3] }; // + ;} // +-;// P8: lfdux, 3 x lxsdx, 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspsxws // +-;// P9: lfdux, 3 x lfd, 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspsxws // ++;// P8: lfdux, 3 x lxsdx, 2 x xxmrghd, 2 x xvcvspsxws, vmrgew // ++;// P9: lfdux, 3 x lfd, 2 x xxmrghd, 2 x xvcvspsxws, vmrgew // + ;vector int fromDiffMemVarDConvdtoi(double *arr, int elem) { // + ; return (vector int) { arr[elem], arr[elem-1], arr[elem-2], arr[elem-3] }; // + ;} // +@@ -296,8 +294,8 @@ + ;vector unsigned int spltCnstConvftoui() { // + ; return (vector unsigned int) 4.74f; // + ;} // +-;// P8: 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspuxws // +-;// P9: 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspuxws // ++;// P8: 2 x xxmrghd, 2 x xvcvspuxws, vmrgew // ++;// P9: 2 x xxmrghd, 2 x xvcvspuxws, vmrgew // + ;vector unsigned int fromRegsConvftoui(float a, float b, float c, float d) { // + ; return (vector unsigned int) { a, b, c, d }; // + ;} // +@@ -316,16 +314,16 @@ + ;vector unsigned int fromDiffMemConsDConvftoui(float *ptr) { // + ; return (vector unsigned int) { ptr[3], ptr[2], ptr[1], ptr[0] }; // + ;} // +-;// P8: lfsux, 3 x lxsspx, 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspuxws // +-;// P9: lfsux, 3 x lfs, 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspuxws // ++;// P8: lfsux, 3 x lxsspx, 2 x xxmrghd, 2 x xvcvspuxws, vmrgew // ++;// P9: lfsux, 3 x lfs, 2 x xxmrghd, 2 x xvcvspuxws, vmrgew // + ;// Note: if the consecutive loads learns to handle pre-inc, this can be: // + ;// sldi 2, load, xvcvspuxws // + ;vector unsigned int fromDiffMemVarAConvftoui(float *arr, int elem) { // + ; return (vector unsigned int) { arr[elem], arr[elem+1], // + ; arr[elem+2], arr[elem+3] }; // + ;} // +-;// P8: lfsux, 3 x 
lxsspx, 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspuxws // +-;// P9: lfsux, 3 x lfs, 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspuxws // ++;// P8: lfsux, 3 x lxsspx, 2 x xxmrghd, 2 x xvcvspuxws, vmrgew // ++;// P9: lfsux, 3 x lfs, 2 x xxmrghd, 2 x xvcvspuxws, vmrgew // + ;// Note: if the consecutive loads learns to handle pre-inc, this can be: // + ;// sldi 2, 2 x load, vperm, xvcvspuxws // + ;vector unsigned int fromDiffMemVarDConvftoui(float *arr, int elem) { // +@@ -347,8 +345,8 @@ + ;vector unsigned int spltCnstConvdtoui() { // + ; return (vector unsigned int) 4.74; // + ;} // +-;// P8: 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspuxws // +-;// P9: 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspuxws // ++;// P8: 2 x xxmrghd, 2 x xvcvspuxws, vmrgew // ++;// P9: 2 x xxmrghd, 2 x xvcvspuxws, vmrgew // + ;vector unsigned int fromRegsConvdtoui(double a, double b, // + ; double c, double d) { // + ; return (vector unsigned int) { a, b, c, d }; // +@@ -358,25 +356,24 @@ + ;vector unsigned int fromDiffConstsConvdtoui() { // + ; return (vector unsigned int) { 24.46, 234., 988.19, 422.39 }; // + ;} // +-;// P8: 2 x lxvd2x, 2 x xxswapd, xxmrgld, xxmrghd, 2 x xvcvdpsp, vmrgew, // +-;// xvcvspuxws // +-;// P9: 2 x lxvx, xxmrgld, xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspuxws // ++;// P8: 2 x lxvd2x, 2 x xxswapd, xxmrgld, xxmrghd, 2 x xvcvspuxws, vmrgew // ++;// P9: 2 x lxvx, xxmrgld, xxmrghd, 2 x xvcvspuxws, vmrgew // + ;vector unsigned int fromDiffMemConsAConvdtoui(double *ptr) { // + ; return (vector unsigned int) { ptr[0], ptr[1], ptr[2], ptr[3] }; // + ;} // +-;// P8: 4 x lxsdx, 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspuxws // +-;// P9: 4 x lfd, 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspuxws // ++;// P8: 4 x lxsdx, 2 x xxmrghd, 2 x xvcvspuxws, vmrgew // ++;// P9: 4 x lfd, 2 x xxmrghd, 2 x xvcvspuxws, vmrgew // + ;vector unsigned int fromDiffMemConsDConvdtoui(double *ptr) { // + ; return (vector unsigned int) { ptr[3], ptr[2], ptr[1], ptr[0] }; // + ;} // +-;// P8: lfdux, 3 x lxsdx, 2 x 
xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspuxws // +-;// P9: lfdux, 3 x lfd, 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspuxws // ++;// P8: lfdux, 3 x lxsdx, 2 x xxmrghd, 2 x xvcvspuxws, vmrgew // ++;// P9: lfdux, 3 x lfd, 2 x xxmrghd, 2 x xvcvspuxws, vmrgew // + ;vector unsigned int fromDiffMemVarAConvdtoui(double *arr, int elem) { // + ; return (vector unsigned int) { arr[elem], arr[elem+1], // + ; arr[elem+2], arr[elem+3] }; // + ;} // +-;// P8: lfdux, 3 x lxsdx, 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspuxws // +-;// P9: lfdux, 3 x lfd, 2 x xxmrghd, 2 x xvcvdpsp, vmrgew, xvcvspuxws // ++;// P8: lfdux, 3 x lxsdx, 2 x xxmrghd, 2 x xvcvspuxws, vmrgew // ++;// P9: lfdux, 3 x lfd, 2 x xxmrghd, 2 x xvcvspuxws, vmrgew // + ;vector unsigned int fromDiffMemVarDConvdtoui(double *arr, int elem) { // + ; return (vector unsigned int) { arr[elem], arr[elem-1], // + ; arr[elem-2], arr[elem-3] }; // +@@ -1253,28 +1250,24 @@ entry: + ; P8LE-LABEL: fromRegsConvftoi + ; P9BE-DAG: xxmrghd {{[vs]+}}[[REG1:[0-9]+]], vs1, vs3 + ; P9BE-DAG: xxmrghd {{[vs]+}}[[REG2:[0-9]+]], vs2, vs4 +-; P9BE-DAG: xvcvdpsp [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]] +-; P9BE-DAG: xvcvdpsp [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]] ++; P9BE-DAG: xvcvdpsxws [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]] ++; P9BE-DAG: xvcvdpsxws [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]] + ; P9BE: vmrgew v2, [[REG3]], [[REG4]] +-; P9BE: xvcvspsxws v2, v2 + ; P9LE-DAG: xxmrghd {{[vs]+}}[[REG1:[0-9]+]], vs3, vs1 + ; P9LE-DAG: xxmrghd {{[vs]+}}[[REG2:[0-9]+]], vs4, vs2 +-; P9LE-DAG: xvcvdpsp [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]] +-; P9LE-DAG: xvcvdpsp [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]] ++; P9LE-DAG: xvcvdpsxws [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]] ++; P9LE-DAG: xvcvdpsxws [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]] + ; P9LE: vmrgew v2, [[REG4]], [[REG3]] +-; P9LE: xvcvspsxws v2, v2 + ; P8BE-DAG: xxmrghd {{[vs]+}}[[REG1:[0-9]+]], vs1, vs3 + ; P8BE-DAG: xxmrghd {{[vs]+}}[[REG2:[0-9]+]], vs2, vs4 +-; P8BE-DAG: xvcvdpsp [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]] +-; P8BE-DAG: xvcvdpsp 
[[REG4:v[0-9]+]], {{[vs]+}}[[REG2]] ++; P8BE-DAG: xvcvdpsxws [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]] ++; P8BE-DAG: xvcvdpsxws [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]] + ; P8BE: vmrgew v2, [[REG3]], [[REG4]] +-; P8BE: xvcvspsxws v2, v2 + ; P8LE-DAG: xxmrghd {{[vs]+}}[[REG1:[0-9]+]], vs3, vs1 + ; P8LE-DAG: xxmrghd {{[vs]+}}[[REG2:[0-9]+]], vs4, vs2 +-; P8LE-DAG: xvcvdpsp [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]] +-; P8LE-DAG: xvcvdpsp [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]] ++; P8LE-DAG: xvcvdpsxws [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]] ++; P8LE-DAG: xvcvdpsxws [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]] + ; P8LE: vmrgew v2, [[REG4]], [[REG3]] +-; P8LE: xvcvspsxws v2, v2 + } + + ; Function Attrs: norecurse nounwind readnone +@@ -1529,28 +1522,24 @@ entry: + ; P8LE-LABEL: fromRegsConvdtoi + ; P9BE-DAG: xxmrghd {{[vs]+}}[[REG1:[0-9]+]], vs1, vs3 + ; P9BE-DAG: xxmrghd {{[vs]+}}[[REG2:[0-9]+]], vs2, vs4 +-; P9BE-DAG: xvcvdpsp [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]] +-; P9BE-DAG: xvcvdpsp [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]] ++; P9BE-DAG: xvcvdpsxws [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]] ++; P9BE-DAG: xvcvdpsxws [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]] + ; P9BE: vmrgew v2, [[REG3]], [[REG4]] +-; P9BE: xvcvspsxws v2, v2 + ; P9LE-DAG: xxmrghd {{[vs]+}}[[REG1:[0-9]+]], vs3, vs1 + ; P9LE-DAG: xxmrghd {{[vs]+}}[[REG2:[0-9]+]], vs4, vs2 +-; P9LE-DAG: xvcvdpsp [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]] +-; P9LE-DAG: xvcvdpsp [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]] ++; P9LE-DAG: xvcvdpsxws [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]] ++; P9LE-DAG: xvcvdpsxws [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]] + ; P9LE: vmrgew v2, [[REG4]], [[REG3]] +-; P9LE: xvcvspsxws v2, v2 + ; P8BE-DAG: xxmrghd {{[vs]+}}[[REG1:[0-9]+]], vs1, vs3 + ; P8BE-DAG: xxmrghd {{[vs]+}}[[REG2:[0-9]+]], vs2, vs4 +-; P8BE-DAG: xvcvdpsp [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]] +-; P8BE-DAG: xvcvdpsp [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]] ++; P8BE-DAG: xvcvdpsxws [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]] ++; P8BE-DAG: xvcvdpsxws [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]] + ; P8BE: vmrgew v2, [[REG3]], 
[[REG4]] +-; P8BE: xvcvspsxws v2, v2 + ; P8LE-DAG: xxmrghd {{[vs]+}}[[REG1:[0-9]+]], vs3, vs1 + ; P8LE-DAG: xxmrghd {{[vs]+}}[[REG2:[0-9]+]], vs4, vs2 +-; P8LE-DAG: xvcvdpsp [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]] +-; P8LE-DAG: xvcvdpsp [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]] ++; P8LE-DAG: xvcvdpsxws [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]] ++; P8LE-DAG: xvcvdpsxws [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]] + ; P8LE: vmrgew v2, [[REG4]], [[REG3]] +-; P8LE: xvcvspsxws v2, v2 + } + + ; Function Attrs: norecurse nounwind readnone +@@ -1592,36 +1581,32 @@ entry: + ; P9BE-DAG: lxv [[REG2:[vs0-9]+]], 16(r3) + ; P9BE-DAG: xxmrgld [[REG3:[vs0-9]+]], [[REG1]], [[REG2]] + ; P9BE-DAG: xxmrghd [[REG4:[vs0-9]+]], [[REG1]], [[REG2]] +-; P9BE-DAG: xvcvdpsp [[REG5:[vs0-9]+]], [[REG3]] +-; P9BE-DAG: xvcvdpsp [[REG6:[vs0-9]+]], [[REG4]] ++; P9BE-DAG: xvcvdpsxws [[REG5:[vs0-9]+]], [[REG3]] ++; P9BE-DAG: xvcvdpsxws [[REG6:[vs0-9]+]], [[REG4]] + ; P9BE: vmrgew v2, [[REG6]], [[REG5]] +-; P9BE: xvcvspsxws v2, v2 + ; P9LE-DAG: lxv [[REG1:[vs0-9]+]], 0(r3) + ; P9LE-DAG: lxv [[REG2:[vs0-9]+]], 16(r3) + ; P9LE-DAG: xxmrgld [[REG3:[vs0-9]+]], [[REG2]], [[REG1]] + ; P9LE-DAG: xxmrghd [[REG4:[vs0-9]+]], [[REG2]], [[REG1]] +-; P9LE-DAG: xvcvdpsp [[REG5:[vs0-9]+]], [[REG3]] +-; P9LE-DAG: xvcvdpsp [[REG6:[vs0-9]+]], [[REG4]] ++; P9LE-DAG: xvcvdpsxws [[REG5:[vs0-9]+]], [[REG3]] ++; P9LE-DAG: xvcvdpsxws [[REG6:[vs0-9]+]], [[REG4]] + ; P9LE: vmrgew v2, [[REG6]], [[REG5]] +-; P9LE: xvcvspsxws v2, v2 + ; P8BE: lxvd2x [[REG1:[vs0-9]+]], 0, r3 + ; P8BE: lxvd2x [[REG2:[vs0-9]+]], r3, r4 + ; P8BE-DAG: xxmrgld [[REG3:[vs0-9]+]], [[REG1]], [[REG2]] + ; P8BE-DAG: xxmrghd [[REG4:[vs0-9]+]], [[REG1]], [[REG2]] +-; P8BE-DAG: xvcvdpsp [[REG5:[vs0-9]+]], [[REG3]] +-; P8BE-DAG: xvcvdpsp [[REG6:[vs0-9]+]], [[REG4]] ++; P8BE-DAG: xvcvdpsxws [[REG5:[vs0-9]+]], [[REG3]] ++; P8BE-DAG: xvcvdpsxws [[REG6:[vs0-9]+]], [[REG4]] + ; P8BE: vmrgew v2, [[REG6]], [[REG5]] +-; P8BE: xvcvspsxws v2, v2 + ; P8LE: lxvd2x [[REG1:[vs0-9]+]], 0, r3 + ; 
P8LE: lxvd2x [[REG2:[vs0-9]+]], r3, r4 + ; P8LE-DAG: xxswapd [[REG3:[vs0-9]+]], [[REG1]] + ; P8LE-DAG: xxswapd [[REG4:[vs0-9]+]], [[REG2]] + ; P8LE-DAG: xxmrgld [[REG5:[vs0-9]+]], [[REG4]], [[REG3]] + ; P8LE-DAG: xxmrghd [[REG6:[vs0-9]+]], [[REG4]], [[REG3]] +-; P8LE-DAG: xvcvdpsp [[REG7:[vs0-9]+]], [[REG5]] +-; P8LE-DAG: xvcvdpsp [[REG8:[vs0-9]+]], [[REG6]] ++; P8LE-DAG: xvcvdpsxws [[REG7:[vs0-9]+]], [[REG5]] ++; P8LE-DAG: xvcvdpsxws [[REG8:[vs0-9]+]], [[REG6]] + ; P8LE: vmrgew v2, [[REG8]], [[REG7]] +-; P8LE: xvcvspsxws v2, v2 + } + + ; Function Attrs: norecurse nounwind readonly +@@ -1653,40 +1638,36 @@ entry: + ; P9BE: lfd + ; P9BE: xxmrghd + ; P9BE: xxmrghd +-; P9BE: xvcvdpsp +-; P9BE: xvcvdpsp +-; P9BE: vmrgew +-; P9BE: xvcvspsxws v2 ++; P9BE: xvcvdpsxws ++; P9BE: xvcvdpsxws ++; P9BE: vmrgew v2 + ; P9LE: lfd + ; P9LE: lfd + ; P9LE: lfd + ; P9LE: lfd + ; P9LE: xxmrghd + ; P9LE: xxmrghd +-; P9LE: xvcvdpsp +-; P9LE: xvcvdpsp +-; P9LE: vmrgew +-; P9LE: xvcvspsxws v2 ++; P9LE: xvcvdpsxws ++; P9LE: xvcvdpsxws ++; P9LE: vmrgew v2 + ; P8BE: lxsdx + ; P8BE: lxsdx + ; P8BE: lxsdx + ; P8BE: lxsdx + ; P8BE: xxmrghd + ; P8BE: xxmrghd +-; P8BE: xvcvdpsp +-; P8BE: xvcvdpsp +-; P8BE: vmrgew +-; P8BE: xvcvspsxws v2 ++; P8BE: xvcvdpsxws ++; P8BE: xvcvdpsxws ++; P8BE: vmrgew v2 + ; P8LE: lxsdx + ; P8LE: lxsdx + ; P8LE: lxsdx + ; P8LE: lxsdx + ; P8LE: xxmrghd + ; P8LE: xxmrghd +-; P8LE: xvcvdpsp +-; P8LE: xvcvdpsp +-; P8LE: vmrgew +-; P8LE: xvcvspsxws v2 ++; P8LE: xvcvdpsxws ++; P8LE: xvcvdpsxws ++; P8LE: vmrgew v2 + } + + ; Function Attrs: norecurse nounwind readonly +@@ -1726,40 +1707,36 @@ entry: + ; P9BE: lfd + ; P9BE: xxmrghd + ; P9BE: xxmrghd +-; P9BE: xvcvdpsp +-; P9BE: xvcvdpsp +-; P9BE: vmrgew +-; P9BE: xvcvspsxws v2 ++; P9BE: xvcvdpsxws ++; P9BE: xvcvdpsxws ++; P9BE: vmrgew v2 + ; P9LE: lfdux + ; P9LE: lfd + ; P9LE: lfd + ; P9LE: lfd + ; P9LE: xxmrghd + ; P9LE: xxmrghd +-; P9LE: xvcvdpsp +-; P9LE: xvcvdpsp +-; P9LE: vmrgew +-; P9LE: xvcvspsxws v2 ++; P9LE: xvcvdpsxws 
++; P9LE: xvcvdpsxws ++; P9LE: vmrgew v2 + ; P8BE: lfdux + ; P8BE: lxsdx + ; P8BE: lxsdx + ; P8BE: lxsdx + ; P8BE: xxmrghd + ; P8BE: xxmrghd +-; P8BE: xvcvdpsp +-; P8BE: xvcvdpsp +-; P8BE: vmrgew +-; P8BE: xvcvspsxws v2 ++; P8BE: xvcvdpsxws ++; P8BE: xvcvdpsxws ++; P8BE: vmrgew v2 + ; P8LE: lfdux + ; P8LE: lxsdx + ; P8LE: lxsdx + ; P8LE: lxsdx + ; P8LE: xxmrghd + ; P8LE: xxmrghd +-; P8LE: xvcvdpsp +-; P8LE: xvcvdpsp +-; P8LE: vmrgew +-; P8LE: xvcvspsxws v2 ++; P8LE: xvcvdpsxws ++; P8LE: xvcvdpsxws ++; P8LE: vmrgew v2 + } + + ; Function Attrs: norecurse nounwind readonly +@@ -1799,40 +1776,36 @@ entry: + ; P9BE: lfd + ; P9BE: xxmrghd + ; P9BE: xxmrghd +-; P9BE: xvcvdpsp +-; P9BE: xvcvdpsp +-; P9BE: vmrgew +-; P9BE: xvcvspsxws v2 ++; P9BE: xvcvdpsxws ++; P9BE: xvcvdpsxws ++; P9BE: vmrgew v2 + ; P9LE: lfdux + ; P9LE: lfd + ; P9LE: lfd + ; P9LE: lfd + ; P9LE: xxmrghd + ; P9LE: xxmrghd +-; P9LE: xvcvdpsp +-; P9LE: xvcvdpsp +-; P9LE: vmrgew +-; P9LE: xvcvspsxws v2 ++; P9LE: xvcvdpsxws ++; P9LE: xvcvdpsxws ++; P9LE: vmrgew v2 + ; P8BE: lfdux + ; P8BE: lxsdx + ; P8BE: lxsdx + ; P8BE: lxsdx + ; P8BE: xxmrghd + ; P8BE: xxmrghd +-; P8BE: xvcvdpsp +-; P8BE: xvcvdpsp +-; P8BE: vmrgew +-; P8BE: xvcvspsxws v2 ++; P8BE: xvcvdpsxws ++; P8BE: xvcvdpsxws ++; P8BE: vmrgew v2 + ; P8LE: lfdux + ; P8LE: lxsdx + ; P8LE: lxsdx + ; P8LE: lxsdx + ; P8LE: xxmrghd + ; P8LE: xxmrghd +-; P8LE: xvcvdpsp +-; P8LE: xvcvdpsp +-; P8LE: vmrgew +-; P8LE: xvcvspsxws v2 ++; P8LE: xvcvdpsxws ++; P8LE: xvcvdpsxws ++; P8LE: vmrgew v2 + } + + ; Function Attrs: norecurse nounwind readnone +@@ -2413,28 +2386,24 @@ entry: + ; P8LE-LABEL: fromRegsConvftoui + ; P9BE-DAG: xxmrghd {{[vs]+}}[[REG1:[0-9]+]], vs1, vs3 + ; P9BE-DAG: xxmrghd {{[vs]+}}[[REG2:[0-9]+]], vs2, vs4 +-; P9BE-DAG: xvcvdpsp [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]] +-; P9BE-DAG: xvcvdpsp [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]] ++; P9BE-DAG: xvcvdpuxws [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]] ++; P9BE-DAG: xvcvdpuxws [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]] + ; 
P9BE: vmrgew v2, [[REG3]], [[REG4]] +-; P9BE: xvcvspuxws v2, v2 + ; P9LE-DAG: xxmrghd {{[vs]+}}[[REG1:[0-9]+]], vs3, vs1 + ; P9LE-DAG: xxmrghd {{[vs]+}}[[REG2:[0-9]+]], vs4, vs2 +-; P9LE-DAG: xvcvdpsp [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]] +-; P9LE-DAG: xvcvdpsp [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]] ++; P9LE-DAG: xvcvdpuxws [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]] ++; P9LE-DAG: xvcvdpuxws [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]] + ; P9LE: vmrgew v2, [[REG4]], [[REG3]] +-; P9LE: xvcvspuxws v2, v2 + ; P8BE-DAG: xxmrghd {{[vs]+}}[[REG1:[0-9]+]], vs1, vs3 + ; P8BE-DAG: xxmrghd {{[vs]+}}[[REG2:[0-9]+]], vs2, vs4 +-; P8BE-DAG: xvcvdpsp [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]] +-; P8BE-DAG: xvcvdpsp [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]] ++; P8BE-DAG: xvcvdpuxws [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]] ++; P8BE-DAG: xvcvdpuxws [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]] + ; P8BE: vmrgew v2, [[REG3]], [[REG4]] +-; P8BE: xvcvspuxws v2, v2 + ; P8LE-DAG: xxmrghd {{[vs]+}}[[REG1:[0-9]+]], vs3, vs1 + ; P8LE-DAG: xxmrghd {{[vs]+}}[[REG2:[0-9]+]], vs4, vs2 +-; P8LE-DAG: xvcvdpsp [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]] +-; P8LE-DAG: xvcvdpsp [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]] ++; P8LE-DAG: xvcvdpuxws [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]] ++; P8LE-DAG: xvcvdpuxws [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]] + ; P8LE: vmrgew v2, [[REG4]], [[REG3]] +-; P8LE: xvcvspuxws v2, v2 + } + + ; Function Attrs: norecurse nounwind readnone +@@ -2689,28 +2658,24 @@ entry: + ; P8LE-LABEL: fromRegsConvdtoui + ; P9BE-DAG: xxmrghd {{[vs]+}}[[REG1:[0-9]+]], vs1, vs3 + ; P9BE-DAG: xxmrghd {{[vs]+}}[[REG2:[0-9]+]], vs2, vs4 +-; P9BE-DAG: xvcvdpsp [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]] +-; P9BE-DAG: xvcvdpsp [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]] ++; P9BE-DAG: xvcvdpuxws [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]] ++; P9BE-DAG: xvcvdpuxws [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]] + ; P9BE: vmrgew v2, [[REG3]], [[REG4]] +-; P9BE: xvcvspuxws v2, v2 + ; P9LE-DAG: xxmrghd {{[vs]+}}[[REG1:[0-9]+]], vs3, vs1 + ; P9LE-DAG: xxmrghd {{[vs]+}}[[REG2:[0-9]+]], vs4, vs2 +-; P9LE-DAG: 
xvcvdpsp [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]] +-; P9LE-DAG: xvcvdpsp [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]] ++; P9LE-DAG: xvcvdpuxws [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]] ++; P9LE-DAG: xvcvdpuxws [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]] + ; P9LE: vmrgew v2, [[REG4]], [[REG3]] +-; P9LE: xvcvspuxws v2, v2 + ; P8BE-DAG: xxmrghd {{[vs]+}}[[REG1:[0-9]+]], vs1, vs3 + ; P8BE-DAG: xxmrghd {{[vs]+}}[[REG2:[0-9]+]], vs2, vs4 +-; P8BE-DAG: xvcvdpsp [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]] +-; P8BE-DAG: xvcvdpsp [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]] ++; P8BE-DAG: xvcvdpuxws [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]] ++; P8BE-DAG: xvcvdpuxws [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]] + ; P8BE: vmrgew v2, [[REG3]], [[REG4]] +-; P8BE: xvcvspuxws v2, v2 + ; P8LE-DAG: xxmrghd {{[vs]+}}[[REG1:[0-9]+]], vs3, vs1 + ; P8LE-DAG: xxmrghd {{[vs]+}}[[REG2:[0-9]+]], vs4, vs2 +-; P8LE-DAG: xvcvdpsp [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]] +-; P8LE-DAG: xvcvdpsp [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]] ++; P8LE-DAG: xvcvdpuxws [[REG3:v[0-9]+]], {{[vs]+}}[[REG1]] ++; P8LE-DAG: xvcvdpuxws [[REG4:v[0-9]+]], {{[vs]+}}[[REG2]] + ; P8LE: vmrgew v2, [[REG4]], [[REG3]] +-; P8LE: xvcvspuxws v2, v2 + } + + ; Function Attrs: norecurse nounwind readnone +@@ -2752,36 +2717,32 @@ entry: + ; P9BE-DAG: lxv [[REG2:[vs0-9]+]], 16(r3) + ; P9BE-DAG: xxmrgld [[REG3:[vs0-9]+]], [[REG1]], [[REG2]] + ; P9BE-DAG: xxmrghd [[REG4:[vs0-9]+]], [[REG1]], [[REG2]] +-; P9BE-DAG: xvcvdpsp [[REG5:[vs0-9]+]], [[REG3]] +-; P9BE-DAG: xvcvdpsp [[REG6:[vs0-9]+]], [[REG4]] ++; P9BE-DAG: xvcvdpuxws [[REG5:[vs0-9]+]], [[REG3]] ++; P9BE-DAG: xvcvdpuxws [[REG6:[vs0-9]+]], [[REG4]] + ; P9BE: vmrgew v2, [[REG6]], [[REG5]] +-; P9BE: xvcvspuxws v2, v2 + ; P9LE-DAG: lxv [[REG1:[vs0-9]+]], 0(r3) + ; P9LE-DAG: lxv [[REG2:[vs0-9]+]], 16(r3) +-; P9LE-DAG: xxmrgld [[REG3:[vs0-9]+]], [[REG2]], [[REG1]] + ; P9LE-DAG: xxmrghd [[REG4:[vs0-9]+]], [[REG2]], [[REG1]] +-; P9LE-DAG: xvcvdpsp [[REG5:[vs0-9]+]], [[REG3]] +-; P9LE-DAG: xvcvdpsp [[REG6:[vs0-9]+]], [[REG4]] ++; P9LE-DAG: xxmrgld 
[[REG3:[vs0-9]+]], [[REG2]], [[REG1]] ++; P9LE-DAG: xvcvdpuxws [[REG5:[vs0-9]+]], [[REG3]] ++; P9LE-DAG: xvcvdpuxws [[REG6:[vs0-9]+]], [[REG4]] + ; P9LE: vmrgew v2, [[REG6]], [[REG5]] +-; P9LE: xvcvspuxws v2, v2 + ; P8BE: lxvd2x [[REG1:[vs0-9]+]], 0, r3 + ; P8BE: lxvd2x [[REG2:[vs0-9]+]], r3, r4 + ; P8BE-DAG: xxmrgld [[REG3:[vs0-9]+]], [[REG1]], [[REG2]] + ; P8BE-DAG: xxmrghd [[REG4:[vs0-9]+]], [[REG1]], [[REG2]] +-; P8BE-DAG: xvcvdpsp [[REG5:[vs0-9]+]], [[REG3]] +-; P8BE-DAG: xvcvdpsp [[REG6:[vs0-9]+]], [[REG4]] ++; P8BE-DAG: xvcvdpuxws [[REG5:[vs0-9]+]], [[REG3]] ++; P8BE-DAG: xvcvdpuxws [[REG6:[vs0-9]+]], [[REG4]] + ; P8BE: vmrgew v2, [[REG6]], [[REG5]] +-; P8BE: xvcvspuxws v2, v2 + ; P8LE: lxvd2x [[REG1:[vs0-9]+]], 0, r3 + ; P8LE: lxvd2x [[REG2:[vs0-9]+]], r3, r4 + ; P8LE-DAG: xxswapd [[REG3:[vs0-9]+]], [[REG1]] + ; P8LE-DAG: xxswapd [[REG4:[vs0-9]+]], [[REG2]] + ; P8LE-DAG: xxmrgld [[REG5:[vs0-9]+]], [[REG4]], [[REG3]] + ; P8LE-DAG: xxmrghd [[REG6:[vs0-9]+]], [[REG4]], [[REG3]] +-; P8LE-DAG: xvcvdpsp [[REG7:[vs0-9]+]], [[REG5]] +-; P8LE-DAG: xvcvdpsp [[REG8:[vs0-9]+]], [[REG6]] ++; P8LE-DAG: xvcvdpuxws [[REG7:[vs0-9]+]], [[REG5]] ++; P8LE-DAG: xvcvdpuxws [[REG8:[vs0-9]+]], [[REG6]] + ; P8LE: vmrgew v2, [[REG8]], [[REG7]] +-; P8LE: xvcvspuxws v2, v2 + } + + ; Function Attrs: norecurse nounwind readonly +@@ -2813,40 +2774,36 @@ entry: + ; P9BE: lfd + ; P9BE: xxmrghd + ; P9BE: xxmrghd +-; P9BE: xvcvdpsp +-; P9BE: xvcvdpsp +-; P9BE: vmrgew +-; P9BE: xvcvspuxws v2 ++; P9BE: xvcvdpuxws ++; P9BE: xvcvdpuxws ++; P9BE: vmrgew v2 + ; P9LE: lfd + ; P9LE: lfd + ; P9LE: lfd + ; P9LE: lfd + ; P9LE: xxmrghd + ; P9LE: xxmrghd +-; P9LE: xvcvdpsp +-; P9LE: xvcvdpsp +-; P9LE: vmrgew +-; P9LE: xvcvspuxws v2 ++; P9LE: xvcvdpuxws ++; P9LE: xvcvdpuxws ++; P9LE: vmrgew v2 + ; P8BE: lxsdx + ; P8BE: lxsdx + ; P8BE: lxsdx + ; P8BE: lxsdx + ; P8BE: xxmrghd + ; P8BE: xxmrghd +-; P8BE: xvcvdpsp +-; P8BE: xvcvdpsp +-; P8BE: vmrgew +-; P8BE: xvcvspuxws v2 ++; P8BE: xvcvdpuxws ++; P8BE: 
xvcvdpuxws ++; P8BE: vmrgew v2 + ; P8LE: lxsdx + ; P8LE: lxsdx + ; P8LE: lxsdx + ; P8LE: lxsdx + ; P8LE: xxmrghd + ; P8LE: xxmrghd +-; P8LE: xvcvdpsp +-; P8LE: xvcvdpsp +-; P8LE: vmrgew +-; P8LE: xvcvspuxws v2 ++; P8LE: xvcvdpuxws ++; P8LE: xvcvdpuxws ++; P8LE: vmrgew v2 + } + + ; Function Attrs: norecurse nounwind readonly +@@ -2886,40 +2843,36 @@ entry: + ; P9BE: lfd + ; P9BE: xxmrghd + ; P9BE: xxmrghd +-; P9BE: xvcvdpsp +-; P9BE: xvcvdpsp +-; P9BE: vmrgew +-; P9BE: xvcvspuxws v2 ++; P9BE: xvcvdpuxws ++; P9BE: xvcvdpuxws ++; P9BE: vmrgew v2 + ; P9LE: lfdux + ; P9LE: lfd + ; P9LE: lfd + ; P9LE: lfd + ; P9LE: xxmrghd + ; P9LE: xxmrghd +-; P9LE: xvcvdpsp +-; P9LE: xvcvdpsp +-; P9LE: vmrgew +-; P9LE: xvcvspuxws v2 ++; P9LE: xvcvdpuxws ++; P9LE: xvcvdpuxws ++; P9LE: vmrgew v2 + ; P8BE: lfdux + ; P8BE: lxsdx + ; P8BE: lxsdx + ; P8BE: lxsdx + ; P8BE: xxmrghd + ; P8BE: xxmrghd +-; P8BE: xvcvdpsp +-; P8BE: xvcvdpsp +-; P8BE: vmrgew +-; P8BE: xvcvspuxws v2 ++; P8BE: xvcvdpuxws ++; P8BE: xvcvdpuxws ++; P8BE: vmrgew v2 + ; P8LE: lfdux + ; P8LE: lxsdx + ; P8LE: lxsdx + ; P8LE: lxsdx + ; P8LE: xxmrghd + ; P8LE: xxmrghd +-; P8LE: xvcvdpsp +-; P8LE: xvcvdpsp +-; P8LE: vmrgew +-; P8LE: xvcvspuxws v2 ++; P8LE: xvcvdpuxws ++; P8LE: xvcvdpuxws ++; P8LE: vmrgew v2 + } + + ; Function Attrs: norecurse nounwind readonly +@@ -2959,40 +2912,36 @@ entry: + ; P9BE: lfd + ; P9BE: xxmrghd + ; P9BE: xxmrghd +-; P9BE: xvcvdpsp +-; P9BE: xvcvdpsp +-; P9BE: vmrgew +-; P9BE: xvcvspuxws v2 ++; P9BE: xvcvdpuxws ++; P9BE: xvcvdpuxws ++; P9BE: vmrgew v2 + ; P9LE: lfdux + ; P9LE: lfd + ; P9LE: lfd + ; P9LE: lfd + ; P9LE: xxmrghd + ; P9LE: xxmrghd +-; P9LE: xvcvdpsp +-; P9LE: xvcvdpsp +-; P9LE: vmrgew +-; P9LE: xvcvspuxws v2 ++; P9LE: xvcvdpuxws ++; P9LE: xvcvdpuxws ++; P9LE: vmrgew v2 + ; P8BE: lfdux + ; P8BE: lxsdx + ; P8BE: lxsdx + ; P8BE: lxsdx + ; P8BE: xxmrghd + ; P8BE: xxmrghd +-; P8BE: xvcvdpsp +-; P8BE: xvcvdpsp +-; P8BE: vmrgew +-; P8BE: xvcvspuxws v2 ++; P8BE: xvcvdpuxws ++; P8BE: xvcvdpuxws 
++; P8BE: vmrgew v2 + ; P8LE: lfdux + ; P8LE: lxsdx + ; P8LE: lxsdx + ; P8LE: lxsdx + ; P8LE: xxmrghd + ; P8LE: xxmrghd +-; P8LE: xvcvdpsp +-; P8LE: xvcvdpsp +-; P8LE: vmrgew +-; P8LE: xvcvspuxws v2 ++; P8LE: xvcvdpuxws ++; P8LE: xvcvdpuxws ++; P8LE: vmrgew v2 + } + + ; Function Attrs: norecurse nounwind readnone +-- +1.8.3.1 + diff --git a/0001-SystemZ-TableGen-Fix-shift-count-handling.patch b/0001-SystemZ-TableGen-Fix-shift-count-handling.patch new file mode 100644 index 0000000..5777e3d --- /dev/null +++ b/0001-SystemZ-TableGen-Fix-shift-count-handling.patch @@ -0,0 +1,360 @@ +From 2ac90db51fc323d183aabe744e57f4feca6d3008 Mon Sep 17 00:00:00 2001 +From: Ulrich Weigand +Date: Wed, 1 Aug 2018 11:57:58 +0000 +Subject: [PATCH] [SystemZ, TableGen] Fix shift count handling + +*Backport of this patch from trunk without the TableGen fix and modified +to work with LLVM 6.0 TableGen. * + +The DAG combiner logic to simplify AND masks in shift counts is invalid. +While it is true that the SystemZ shift instructions ignore all but the +low 6 bits of the shift count, it is still invalid to simplify the AND +masks while the DAG still uses the standard shift operators (which are +*not* defined to match the SystemZ instruction behavior). + +Instead, this patch performs equivalent operations during instruction +selection. For completely removing the AND, this now happens via +additional DAG match patterns implemented by a multi-alternative +PatFrags. For simplifying a 32-bit AND to a 16-bit AND, the existing DAG +patterns were already mostly OK, they just needed an output XForm to +actually truncate the immediate value. + +Unfortunately, the latter change also exposed a bug in TableGen: it +seems XForms are currently only handled correctly for direct operands of +the outermost operation node. This patch also fixes that bug by simply +recurring through the whole pattern. This should be NFC for all other +targets. 
+ +Differential Revision: https://reviews.llvm.org/D50096 + +git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@338521 91177308-0d34-0410-b5e6-96231b3b80d8 +--- + lib/Target/SystemZ/SystemZISelLowering.cpp | 78 ------------------------------ + lib/Target/SystemZ/SystemZISelLowering.h | 1 - + lib/Target/SystemZ/SystemZInstrInfo.td | 49 +++++++++++++------ + lib/Target/SystemZ/SystemZOperands.td | 1 + + lib/Target/SystemZ/SystemZOperators.td | 6 +++ + test/CodeGen/SystemZ/shift-12.ll | 12 +++++ + utils/TableGen/CodeGenDAGPatterns.cpp | 39 ++++++++------- + 7 files changed, 71 insertions(+), 115 deletions(-) + +diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp +index adf3683..505b143 100644 +--- a/lib/Target/SystemZ/SystemZISelLowering.cpp ++++ b/lib/Target/SystemZ/SystemZISelLowering.cpp +@@ -522,10 +522,6 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM, + setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); + setTargetDAGCombine(ISD::FP_ROUND); + setTargetDAGCombine(ISD::BSWAP); +- setTargetDAGCombine(ISD::SHL); +- setTargetDAGCombine(ISD::SRA); +- setTargetDAGCombine(ISD::SRL); +- setTargetDAGCombine(ISD::ROTL); + + // Handle intrinsics. + setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); +@@ -5405,76 +5401,6 @@ SDValue SystemZTargetLowering::combineBSWAP( + return SDValue(); + } + +-SDValue SystemZTargetLowering::combineSHIFTROT( +- SDNode *N, DAGCombinerInfo &DCI) const { +- +- SelectionDAG &DAG = DCI.DAG; +- +- // Shift/rotate instructions only use the last 6 bits of the second operand +- // register. If the second operand is the result of an AND with an immediate +- // value that has its last 6 bits set, we can safely remove the AND operation. +- // +- // If the AND operation doesn't have the last 6 bits set, we can't remove it +- // entirely, but we can still truncate it to a 16-bit value. 
This prevents +- // us from ending up with a NILL with a signed operand, which will cause the +- // instruction printer to abort. +- SDValue N1 = N->getOperand(1); +- if (N1.getOpcode() == ISD::AND) { +- SDValue AndMaskOp = N1->getOperand(1); +- auto *AndMask = dyn_cast(AndMaskOp); +- +- // The AND mask is constant +- if (AndMask) { +- auto AmtVal = AndMask->getZExtValue(); +- +- // Bottom 6 bits are set +- if ((AmtVal & 0x3f) == 0x3f) { +- SDValue AndOp = N1->getOperand(0); +- +- // This is the only use, so remove the node +- if (N1.hasOneUse()) { +- // Combine the AND away +- DCI.CombineTo(N1.getNode(), AndOp); +- +- // Return N so it isn't rechecked +- return SDValue(N, 0); +- +- // The node will be reused, so create a new node for this one use +- } else { +- SDValue Replace = DAG.getNode(N->getOpcode(), SDLoc(N), +- N->getValueType(0), N->getOperand(0), +- AndOp); +- DCI.AddToWorklist(Replace.getNode()); +- +- return Replace; +- } +- +- // We can't remove the AND, but we can use NILL here (normally we would +- // use NILF). Only keep the last 16 bits of the mask. The actual +- // transformation will be handled by .td definitions. 
+- } else if (AmtVal >> 16 != 0) { +- SDValue AndOp = N1->getOperand(0); +- +- auto NewMask = DAG.getConstant(AndMask->getZExtValue() & 0x0000ffff, +- SDLoc(AndMaskOp), +- AndMaskOp.getValueType()); +- +- auto NewAnd = DAG.getNode(N1.getOpcode(), SDLoc(N1), N1.getValueType(), +- AndOp, NewMask); +- +- SDValue Replace = DAG.getNode(N->getOpcode(), SDLoc(N), +- N->getValueType(0), N->getOperand(0), +- NewAnd); +- DCI.AddToWorklist(Replace.getNode()); +- +- return Replace; +- } +- } +- } +- +- return SDValue(); +-} +- + SDValue SystemZTargetLowering::PerformDAGCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + switch(N->getOpcode()) { +@@ -5487,10 +5413,6 @@ SDValue SystemZTargetLowering::PerformDAGCombine(SDNode *N, + case SystemZISD::JOIN_DWORDS: return combineJOIN_DWORDS(N, DCI); + case ISD::FP_ROUND: return combineFP_ROUND(N, DCI); + case ISD::BSWAP: return combineBSWAP(N, DCI); +- case ISD::SHL: +- case ISD::SRA: +- case ISD::SRL: +- case ISD::ROTL: return combineSHIFTROT(N, DCI); + } + + return SDValue(); +diff --git a/lib/Target/SystemZ/SystemZISelLowering.h b/lib/Target/SystemZ/SystemZISelLowering.h +index 2cdc88d..1918d45 100644 +--- a/lib/Target/SystemZ/SystemZISelLowering.h ++++ b/lib/Target/SystemZ/SystemZISelLowering.h +@@ -570,7 +570,6 @@ private: + SDValue combineJOIN_DWORDS(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue combineFP_ROUND(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue combineBSWAP(SDNode *N, DAGCombinerInfo &DCI) const; +- SDValue combineSHIFTROT(SDNode *N, DAGCombinerInfo &DCI) const; + + // If the last instruction before MBBI in MBB was some form of COMPARE, + // try to replace it with a COMPARE AND BRANCH just before MBBI. 
+diff --git a/lib/Target/SystemZ/SystemZInstrInfo.td b/lib/Target/SystemZ/SystemZInstrInfo.td +index abb8045..fb40cb4 100644 +--- a/lib/Target/SystemZ/SystemZInstrInfo.td ++++ b/lib/Target/SystemZ/SystemZInstrInfo.td +@@ -1318,9 +1318,20 @@ def : Pat<(z_udivrem GR64:$src1, (i64 (load bdxaddr20only:$src2))), + // Shifts + //===----------------------------------------------------------------------===// + ++// Complexity is 8 so we match it before the NILL patterns below. ++let AddedComplexity = 8 in { ++ ++class ShiftAndPat : Pat < ++ (node vt:$val, (and i32:$count, imm32bottom6set)), ++ (inst vt:$val, i32:$count, 0) ++>; ++} ++ + // Logical shift left. + defm SLL : BinaryRSAndK<"sll", 0x89, 0xEBDF, shl, GR32>; ++def : ShiftAndPat ; + def SLLG : BinaryRSY<"sllg", 0xEB0D, shl, GR64>; ++def : ShiftAndPat ; + def SLDL : BinaryRS<"sldl", 0x8D, null_frag, GR128>; + + // Arithmetic shift left. +@@ -1332,7 +1343,9 @@ let Defs = [CC] in { + + // Logical shift right. + defm SRL : BinaryRSAndK<"srl", 0x88, 0xEBDE, srl, GR32>; ++def : ShiftAndPat ; + def SRLG : BinaryRSY<"srlg", 0xEB0C, srl, GR64>; ++def : ShiftAndPat ; + def SRDL : BinaryRS<"srdl", 0x8C, null_frag, GR128>; + + // Arithmetic shift right. +@@ -1341,10 +1354,14 @@ let Defs = [CC], CCValues = 0xE, CompareZeroCCMask = 0xE in { + def SRAG : BinaryRSY<"srag", 0xEB0A, sra, GR64>; + def SRDA : BinaryRS<"srda", 0x8E, null_frag, GR128>; + } ++def : ShiftAndPat ; ++def : ShiftAndPat ; + + // Rotate left. + def RLL : BinaryRSY<"rll", 0xEB1D, rotl, GR32>; ++def : ShiftAndPat ; + def RLLG : BinaryRSY<"rllg", 0xEB1C, rotl, GR64>; ++def : ShiftAndPat ; + + // Rotate second operand left and inserted selected bits into first operand. + // These can act like 32-bit operands provided that the constant start and +@@ -2154,29 +2171,29 @@ def : Pat<(and (xor GR64:$x, (i64 -1)), GR64:$y), + // Complexity is added so that we match this before we match NILF on the AND + // operation alone.
+ let AddedComplexity = 4 in { +- def : Pat<(shl GR32:$val, (and GR32:$shift, uimm32:$imm)), +- (SLL GR32:$val, (NILL GR32:$shift, uimm32:$imm), 0)>; ++ def : Pat<(shl GR32:$val, (and GR32:$shift, imm32zx16trunc:$imm)), ++ (SLL GR32:$val, (NILL GR32:$shift, imm32zx16trunc:$imm), 0)>; + +- def : Pat<(sra GR32:$val, (and GR32:$shift, uimm32:$imm)), +- (SRA GR32:$val, (NILL GR32:$shift, uimm32:$imm), 0)>; ++ def : Pat<(sra GR32:$val, (and GR32:$shift, imm32zx16trunc:$imm)), ++ (SRA GR32:$val, (NILL GR32:$shift, imm32zx16trunc:$imm), 0)>; + +- def : Pat<(srl GR32:$val, (and GR32:$shift, uimm32:$imm)), +- (SRL GR32:$val, (NILL GR32:$shift, uimm32:$imm), 0)>; ++ def : Pat<(srl GR32:$val, (and GR32:$shift, imm32zx16trunc:$imm)), ++ (SRL GR32:$val, (NILL GR32:$shift, imm32zx16trunc:$imm), 0)>; + +- def : Pat<(shl GR64:$val, (and GR32:$shift, uimm32:$imm)), +- (SLLG GR64:$val, (NILL GR32:$shift, uimm32:$imm), 0)>; ++ def : Pat<(shl GR64:$val, (and GR32:$shift, imm32zx16trunc:$imm)), ++ (SLLG GR64:$val, (NILL GR32:$shift, imm32zx16trunc:$imm), 0)>; + +- def : Pat<(sra GR64:$val, (and GR32:$shift, uimm32:$imm)), +- (SRAG GR64:$val, (NILL GR32:$shift, uimm32:$imm), 0)>; ++ def : Pat<(sra GR64:$val, (and GR32:$shift, imm32zx16trunc:$imm)), ++ (SRAG GR64:$val, (NILL GR32:$shift, imm32zx16trunc:$imm), 0)>; + +- def : Pat<(srl GR64:$val, (and GR32:$shift, uimm32:$imm)), +- (SRLG GR64:$val, (NILL GR32:$shift, uimm32:$imm), 0)>; ++ def : Pat<(srl GR64:$val, (and GR32:$shift, imm32zx16trunc:$imm)), ++ (SRLG GR64:$val, (NILL GR32:$shift, imm32zx16trunc:$imm), 0)>; + +- def : Pat<(rotl GR32:$val, (and GR32:$shift, uimm32:$imm)), +- (RLL GR32:$val, (NILL GR32:$shift, uimm32:$imm), 0)>; ++ def : Pat<(rotl GR32:$val, (and GR32:$shift, imm32zx16trunc:$imm)), ++ (RLL GR32:$val, (NILL GR32:$shift, imm32zx16trunc:$imm), 0)>; + +- def : Pat<(rotl GR64:$val, (and GR32:$shift, uimm32:$imm)), +- (RLLG GR64:$val, (NILL GR32:$shift, uimm32:$imm), 0)>; ++ def : Pat<(rotl GR64:$val, (and GR32:$shift, 
imm32zx16trunc:$imm)), ++ (RLLG GR64:$val, (NILL GR32:$shift, imm32zx16trunc:$imm), 0)>; + } + + // Peepholes for turning scalar operations into block operations. +diff --git a/lib/Target/SystemZ/SystemZOperands.td b/lib/Target/SystemZ/SystemZOperands.td +index 7136121..61a1124 100644 +--- a/lib/Target/SystemZ/SystemZOperands.td ++++ b/lib/Target/SystemZ/SystemZOperands.td +@@ -341,6 +341,7 @@ def imm32zx16 : Immediate; + + def imm32sx16trunc : Immediate; ++def imm32zx16trunc : Immediate; + + // Full 32-bit immediates. we need both signed and unsigned versions + // because the assembler is picky. E.g. AFI requires signed operands +diff --git a/lib/Target/SystemZ/SystemZOperators.td b/lib/Target/SystemZ/SystemZOperators.td +index d067f33..269c3d0 100644 +--- a/lib/Target/SystemZ/SystemZOperators.td ++++ b/lib/Target/SystemZ/SystemZOperators.td +@@ -611,6 +611,12 @@ class storei + : PatFrag<(ops node:$addr), + (store (operator), node:$addr)>; + ++// Create a shift operator that optionally ignores an AND of the ++// shift count with an immediate if the bottom 6 bits are all set. ++def imm32bottom6set : PatLeaf<(i32 imm), [{ ++ return (N->getZExtValue() & 0x3f) == 0x3f; ++}]>; ++ + // Vector representation of all-zeros and all-ones. 
+ def z_vzero : PatFrag<(ops), (bitconvert (v16i8 (z_byte_mask (i32 0))))>; + def z_vones : PatFrag<(ops), (bitconvert (v16i8 (z_byte_mask (i32 65535))))>; +diff --git a/test/CodeGen/SystemZ/shift-12.ll b/test/CodeGen/SystemZ/shift-12.ll +index 4ebc42b..53d3d53 100644 +--- a/test/CodeGen/SystemZ/shift-12.ll ++++ b/test/CodeGen/SystemZ/shift-12.ll +@@ -104,3 +104,15 @@ define i32 @f10(i32 %a, i32 %sh) { + %reuse = add i32 %and, %shift + ret i32 %reuse + } ++ ++; Test that AND is not removed for i128 (which calls __ashlti3) ++define i128 @f11(i128 %a, i32 %sh) { ++; CHECK-LABEL: f11: ++; CHECK: risbg %r4, %r4, 57, 191, 0 ++; CHECK: brasl %r14, __ashlti3@PLT ++ %and = and i32 %sh, 127 ++ %ext = zext i32 %and to i128 ++ %shift = shl i128 %a, %ext ++ ret i128 %shift ++} ++ +diff --git a/utils/TableGen/CodeGenDAGPatterns.cpp b/utils/TableGen/CodeGenDAGPatterns.cpp +index 493066e..74af62b 100644 +--- a/utils/TableGen/CodeGenDAGPatterns.cpp ++++ b/utils/TableGen/CodeGenDAGPatterns.cpp +@@ -3919,6 +3919,24 @@ static bool ForceArbitraryInstResultType(TreePatternNode *N, TreePattern &TP) { + return false; + } + ++// Promote xform function to be an explicit node wherever set. ++static TreePatternNode* PromoteXForms(TreePatternNode* N) { ++ if (Record *Xform = N->getTransformFn()) { ++ N->setTransformFn(nullptr); ++ std::vector Children; ++ Children.push_back(PromoteXForms(N)); ++ return new TreePatternNode(Xform, std::move(Children), ++ N->getNumTypes()); ++ } ++ ++ if (!N->isLeaf()) ++ for (unsigned i = 0, e = N->getNumChildren(); i != e; ++i) { ++ TreePatternNode* Child = N->getChild(i); ++ N->setChild(i, std::move(PromoteXForms(Child))); ++ } ++ return N; ++} ++ + void CodeGenDAGPatterns::ParsePatterns() { + std::vector Patterns = Records.getAllDerivedDefinitions("Pattern"); + +@@ -4009,26 +4027,7 @@ void CodeGenDAGPatterns::ParsePatterns() { + InstImpResults); + + // Promote the xform function to be an explicit node if set. 
+- TreePatternNode *DstPattern = Result.getOnlyTree(); +- std::vector ResultNodeOperands; +- for (unsigned ii = 0, ee = DstPattern->getNumChildren(); ii != ee; ++ii) { +- TreePatternNode *OpNode = DstPattern->getChild(ii); +- if (Record *Xform = OpNode->getTransformFn()) { +- OpNode->setTransformFn(nullptr); +- std::vector Children; +- Children.push_back(OpNode); +- OpNode = new TreePatternNode(Xform, Children, OpNode->getNumTypes()); +- } +- ResultNodeOperands.push_back(OpNode); +- } +- DstPattern = Result.getOnlyTree(); +- if (!DstPattern->isLeaf()) +- DstPattern = new TreePatternNode(DstPattern->getOperator(), +- ResultNodeOperands, +- DstPattern->getNumTypes()); +- +- for (unsigned i = 0, e = Result.getOnlyTree()->getNumTypes(); i != e; ++i) +- DstPattern->setType(i, Result.getOnlyTree()->getExtType(i)); ++ TreePatternNode* DstPattern = PromoteXForms(Result.getOnlyTree()); + + TreePattern Temp(Result.getRecord(), DstPattern, false, *this); + Temp.InferAllTypes(); +-- +1.8.3.1 + diff --git a/llvm.spec b/llvm.spec index ce858f2..f100956 100644 --- a/llvm.spec +++ b/llvm.spec @@ -30,7 +30,7 @@ Name: %{pkg_name} Version: %{maj_ver}.%{min_ver}.%{patch_ver} -Release: 5%{?dist} +Release: 6%{?dist} Summary: The Low Level Virtual Machine License: NCSA @@ -43,6 +43,10 @@ Patch3: 0001-CMake-Split-static-library-exports-into-their-own-ex.patch Patch7: 0001-Filter-out-cxxflags-not-supported-by-clang.patch Patch9: 0001-Export-LLVM_DYLIB_COMPONENTS-in-LLVMConfig.cmake.patch +Patch10: 0001-Don-t-run-BV-DAG-Combine-before-legalization-if-it-a.patch +Patch11: 0001-PowerPC-Do-not-round-values-prior-to-converting-to-i.patch +Patch12: 0001-SystemZ-TableGen-Fix-shift-count-handling.patch + BuildRequires: gcc BuildRequires: gcc-c++ BuildRequires: cmake @@ -310,6 +314,9 @@ fi %endif %changelog +* Mon Aug 06 2018 Tom Stellard - 6.0.1-6 +- Backport some fixes needed by mesa and rust + * Thu Jul 26 2018 Tom Stellard - 6.0.1-5 - Move libLLVM-6.0.so to llvm6.0-libs.