Revert "Backport two RISCV [target] fixes from trunk"

This reverts commit 108ae1dd96f64e2f046264d02e13c39ab650b53b.

Signed-off-by: David Abdurachmanov <davidlt@rivosinc.com>
This commit is contained in:
David Abdurachmanov 2025-02-02 06:37:01 +02:00
parent 108ae1dd96
commit fd1ce7e338
Signed by: davidlt
GPG Key ID: 7A5F42FAF91FACC3
3 changed files with 1 additions and 457 deletions

View File

@ -143,7 +143,7 @@
Summary: Various compilers (C, C++, Objective-C, ...)
Name: gcc
Version: %{gcc_version}
Release: %{gcc_release}.4.0.riscv64%{?dist}
Release: %{gcc_release}.4%{?dist}
# License notes for some of the less obvious ones:
# gcc/doc/cppinternals.texi: Linux-man-pages-copyleft-2-para
# isl: MIT, BSD-2-Clause
@ -302,10 +302,6 @@ Patch12: gcc15-pr118206.patch
Patch13: gcc15-d-deps.patch
Patch14: gcc15-pr117231.patch
# RISCV
Patch20: gcc15-pr116256.patch
Patch21: gcc15-pr118103.patch
Patch50: isl-rh2155127.patch
Patch100: gcc15-fortran-fdec-duplicates.patch
@ -923,10 +919,6 @@ so that there cannot be any synchronization problems.
%patch -P13 -p0 -b .d-deps~
%patch -P14 -p0 -b .pr117231~
# RISCV
%patch -P20 -p0 -b .pr116256~
%patch -P21 -p0 -b .pr118103~
%patch -P50 -p0 -b .rh2155127~
touch -r isl-0.24/m4/ax_prog_cxx_for_build.m4 isl-0.24/m4/ax_prog_cc_for_build.m4
@ -3685,10 +3677,6 @@ end
%endif
%changelog
* Sun Jan 26 2025 David Abdurachmanov <davidlt@rivosinc.com> 15.0.1-0.4.0.riscv64
- Pull riscv fixes from trunk
- PRs target/116256, target/118103
* Sat Jan 25 2025 Jakub Jelinek <jakub@redhat.com> 15.0.1-0.4
- update from trunk
- PRs c/118639, c++/105440, c++/107522, c++/107741, c++/115769, c++/116417,

View File

@ -1,227 +0,0 @@
From e5990a6ce611f522b8f48c2b469983da19d39777 Mon Sep 17 00:00:00 2001
From: Jeff Law <jlaw@ventanamicro.com>
Date: Sat, 25 Jan 2025 09:42:19 -0700
Subject: [PATCH] [RISC-V][PR target/116256] Improve handling of single bit
constants
So under the umbrella of pr116256 (P3 regression) I've been exploring removal
of the mvconst_internal pattern. Not surprisingly, that's going to cause all
kinds of undesirable fallout. While I can kind of see a path forward for that
work, it's going to require some combine work that I don't think we want to
tackle in the context of gcc-15.
Essentially without mvconst_internal we'll have fully exposed constant
synthesis prior to combine. Remember that combine has limits on what
combinations it will perform based on how many instructions are in the source
sequence. If we need 2+ instructions to synthesize the constant, those eat
into our budget.
In a world without mvconst_internal we'd need to either improve combine to
handle 5 insns cases (which do show up in the testsuite) or we need to
significantly improve how combine handles REG_EQUAL notes. 5 insn combinations
sound like insanity to me. So I'd tend to lean towards the latter, though
that's going to need some refactoring and diving into note redistribution
(ugh!).
In the mean time we can start limiting mvconst_internal. For the remaining
case in pr116256 we have this code in combine:
> (insn 8 5 10 2 (set (reg:V2048HF 138 [ _5 ])
> (vec_duplicate:V2048HF (reg:HF 142 [ x ]))) "j.c":152:11 3712 {*vec_duplicatev2048hf}
> (expr_list:REG_DEAD (reg:HF 142 [ x ])
> (nil)))
> (insn 10 8 11 2 (set (reg:DI 139)
> (const_int 2048 [0x800])) "j.c":152:11 275 {*mvconst_internal}
> (nil)) (insn 11 10 0 2 (set (mem:V2048HF (reg/f:DI 141 [ in ]) [1 MEM <vector(2048) _Float16> [(_Float16 *)in_7(D)]+0 S4096 A128])
> (if_then_else:V2048HF (unspec:V2048BI [
> (const_vector:V2048BI [
> (const_int 1 [0x1]) repeated x2048
> ])
> (reg:DI 139)
> (const_int 2 [0x2]) repeated x3
> (reg:SI 66 vl)
> (reg:SI 67 vtype)
> ] UNSPEC_VPREDICATE)
> (reg:V2048HF 138 [ _5 ])
> (unspec:V2048HF [
> (reg:DI 0 zero)
> ] UNSPEC_VUNDEF))) "j.c":152:11 3843 {*pred_movv2048hf}
> (expr_list:REG_DEAD (reg/f:DI 141 [ in ])
> (expr_list:REG_DEAD (reg:DI 0 zero)
> (expr_list:REG_DEAD (reg:SI 66 vl)
> (expr_list:REG_DEAD (reg:SI 67 vtype)
> (expr_list:REG_DEAD (reg:V2048HF 138 [ _5 ])
> (expr_list:REG_DEAD (reg:DI 139)
> (nil))))))))
Note a couple things. First insn 8 will be split shortly after combine and
will need the constant 2048. But that's obviously exposed late. Second (of
course) is the mvconst_internal pattern at insn 10. After split1 we'll have:
> (insn 16 5 17 2 (set (reg:DI 144) (const_int 4096 [0x1000])) "j.c":152:11 -1
> (nil))
> (insn 17 16 18 2 (set (reg:DI 143)
> (plus:DI (reg:DI 144)
> (const_int -2048 [0xfffffffffffff800]))) "j.c":152:11 -1
> (expr_list:REG_EQUAL (const_int 2048 [0x800])
> (nil)))
> (insn 18 17 19 2 (set (reg:V2048HF 138 [ _5 ])
> (if_then_else:V2048HF (unspec:V2048BI [ (const_vector:V2048BI [
> (const_int 1 [0x1]) repeated x2048
> ])
> (reg:DI 143)
> (const_int 2 [0x2]) repeated x3
> (reg:SI 66 vl)
> (reg:SI 67 vtype)
> ] UNSPEC_VPREDICATE)
> (vec_duplicate:V2048HF (reg:HF 142 [ x ]))
> (unspec:V2048HF [ (reg:DI 0 zero)
> ] UNSPEC_VUNDEF))) "j.c":152:11 -1
> (nil))
> (insn 19 18 20 2 (set (reg:DI 145)
> (const_int 4096 [0x1000])) "j.c":152:11 -1
> (nil))
> (insn 20 19 11 2 (set (reg:DI 139)
> (plus:DI (reg:DI 145)
> (const_int -2048 [0xfffffffffffff800]))) "j.c":152:11 -1
> (expr_list:REG_EQUAL (const_int 2048 [0x800])
> (nil)))
> (insn 11 20 0 2 (set (mem:V2048HF (reg/f:DI 141 [ in ]) [1 MEM <vector(2048) _Float16> [(_Float16 *)in_7(D)]+0 S4096 A128])
> (if_then_else:V2048HF (unspec:V2048BI [
> (const_vector:V2048BI [
> (const_int 1 [0x1]) repeated x2048
> ])
> (reg:DI 139) (const_int 2 [0x2]) repeated x3
> (reg:SI 66 vl)
> (reg:SI 67 vtype)
> ] UNSPEC_VPREDICATE)
> (reg:V2048HF 138 [ _5 ])
> (unspec:V2048HF [ (reg:DI 0 zero)
> ] UNSPEC_VUNDEF))) "j.c":152:11 3843 {*pred_movv2048hf}
> (expr_list:REG_DEAD (reg/f:DI 141 [ in ])
> (expr_list:REG_DEAD (reg:DI 0 zero) (expr_list:REG_DEAD (reg:SI 66 vl)
> (expr_list:REG_DEAD (reg:SI 67 vtype)
> (expr_list:REG_DEAD (reg:V2048HF 138 [ _5 ])
> (expr_list:REG_DEAD (reg:DI 139)
> (nil))))))))
Note the synthesis of 2048 appears twice. I seriously considered adding a
local cprop pass at this point. That could be done with a bit of work. It
didn't look too bad -- the biggest problem is cprop isn't designed to run once
we've left cfglayout. But we could probably finesse that by not allowing it to
change jumps if we've left cfglayout or converting it to do the more complex
jump fixups.
You might ask why the post-reload optimizers don't help since this at least
looks like a case where they could. After LRA the RTL looks like:
> (insn 26 5 25 2 (set (reg:DI 15 a5 [144])
> (const_int 4096 [0x1000])) "/home/jlaw/test/gcc/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/dup-1.c":152:11 277 {*movdi_64bit} (expr_list:REG_EQUIV (const_int 4096 [0x1000])
> (nil)))
> (insn 25 26 19 2 (set (reg:DI 15 a5 [143])
> (plus:DI (reg:DI 15 a5 [144])
> (const_int -2048 [0xfffffffffffff800]))) "/home/jlaw/test/gcc/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/dup-1.c":152:11 5 {adddi3}
> (expr_list:REG_EQUIV (const_int 2048 [0x800])
> (nil)))
> (insn 19 25 20 2 (set (reg:V2048QI 100 v4 [orig:138 _11 ] [138])
> (if_then_else:V2048QI (unspec:V2048BI [
> (const_vector:V2048BI [
> (const_int 1 [0x1]) repeated x2048
> ])
> (reg:DI 15 a5 [143])
> (const_int 2 [0x2]) repeated x3
> (reg:SI 66 vl)
> (reg:SI 67 vtype)
> ] UNSPEC_VPREDICATE)
> (vec_duplicate:V2048QI (reg:QI 12 a2 [145]))
> (unspec:V2048QI [ (reg:DI 0 zero)
> ] UNSPEC_VUNDEF))) "/home/jlaw/test/gcc/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/dup-1.c":152:11 4172 {*pred_broadcastv2048qi}
> (nil)) (insn 20 19 21 2 (set (reg:DI 15 a5 [146])
> (const_int 4096 [0x1000])) "/home/jlaw/test/gcc/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/dup-1.c":152:11 277 {*movdi_64bit} (expr_list:REG_EQUIV (const_int 4096 [0x1000])
> (nil)))
> (insn 21 20 11 2 (set (reg:DI 15 a5 [139])
> (plus:DI (reg:DI 15 a5 [146])
> (const_int -2048 [0xfffffffffffff800]))) "/home/jlaw/test/gcc/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/dup-1.c":152:11 5 {adddi3}
> (expr_list:REG_EQUIV (const_int 2048 [0x800])
> (nil)))
Note the re-use of a5 for the constant synthesis steps. That's going to spoil
any chance of reload_cse saving us. That re-use also gets in the way of vsetvl
elimination and we ultimately get this code:
> foo10:
> li a5,4096
> addi a5,a5,-2048
> vsetvli zero,a5,e16,m8,ta,ma
> vfmv.v.f v8,fa0
> li a5,4096
> addi a5,a5,-2048
> vsetvli zero,a5,e16,m8,ta,ma
> vse16.v v8,0(a0)
> ret
The regression is we have the obviously redundant vsetvl. The additional copy
of the synthesis is undesirable as well.
If we filter out single bit constants from mvconst_internal we trivially fix
that regression. The only fallout is a class of saturation tests which want to
test against 0x80000000. Under the hood this is a minor codegen issue
interacting badly with combine's deliberate rejection of simplification of
extensions of constants. Rather than constructing the SImode constant, then
zero extending the result we can just generate the constant we actually want
directly in DImode.
The net is we fix the regression, don't introduce any obvious new regressions
and slightly reduce our dependence on mvconst_internal. All good in my book.
Obviously I'll wait for pre-commit CI to render a verdict.
PR target/116256
gcc/
* config/riscv/riscv.md (mvconst_internal): Reject single bit
constants.
* config/riscv/riscv.cc (riscv_gen_zero_extend_rtx): Improve
handling constants.
---
gcc/config/riscv/riscv.cc | 12 +++++++++---
gcc/config/riscv/riscv.md | 3 ++-
2 files changed, 11 insertions(+), 4 deletions(-)
diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index 5a3a05041773..4652454b8fec 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -12684,10 +12684,16 @@ riscv_gen_zero_extend_rtx (rtx x, machine_mode mode)
emit_move_insn (xmode_reg, x);
else
{
- rtx reg_x = gen_reg_rtx (mode);
+ /* Combine deliberately does not simplify extensions of constants
+ (long story). So try to generate the zero extended constant
+ efficiently.
- emit_move_insn (reg_x, x);
- riscv_emit_unary (ZERO_EXTEND, xmode_reg, reg_x);
+ First extract the constant and mask off all the bits not in MODE. */
+ HOST_WIDE_INT val = INTVAL (x);
+ val &= GET_MODE_MASK (mode);
+
+ /* X may need synthesis, so do not blindly copy it. */
+ xmode_reg = force_reg (Xmode, gen_int_mode (val, Xmode));
}
return xmode_reg;
diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
index e4123c912dcb..09053df1eb9b 100644
--- a/gcc/config/riscv/riscv.md
+++ b/gcc/config/riscv/riscv.md
@@ -2470,7 +2470,8 @@
(match_operand:GPR 1 "splittable_const_int_operand" "i"))]
"!ira_in_progress
&& !(p2m1_shift_operand (operands[1], <MODE>mode)
- || high_mask_shift_operand (operands[1], <MODE>mode))"
+ || high_mask_shift_operand (operands[1], <MODE>mode)
+ || exact_log2 (INTVAL (operands[1])) >= 0)"
"#"
"&& 1"
[(const_int 0)]
--
2.43.5

View File

@ -1,217 +0,0 @@
From 55d288d4ff5360c572f2a017ba9385840ac5134e Mon Sep 17 00:00:00 2001
From: Pan Li <pan2.li@intel.com>
Date: Sat, 25 Jan 2025 15:45:10 +0800
Subject: [PATCH] RISC-V: Make FRM as global register [PR118103]
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit
After we enabled the labe-combine pass after the mode-switching pass, it
will try to combine below insn patterns into op. Aka:
(insn 40 5 41 2 (set (reg:SI 11 a1 [151])
(reg:SI 69 frm)) "pr118103-simple.c":67:15 2712 {frrmsi}
(nil))
(insn 41 40 7 2 (set (reg:SI 69 frm)
(const_int 2 [0x2])) "pr118103-simple.c":69:8 2710 {fsrmsi_restore}
(nil))
(insn 42 10 11 2 (set (reg:SI 69 frm)
(reg:SI 11 a1 [151])) "pr118103-simple.c":70:8 2710 {fsrmsi_restore}
(nil))
trying to combine definition of r11 in:
40: a1:SI=frm:SI
into:
42: frm:SI=a1:SI
instruction becomes a no-op:
(set (reg:SI 69 frm)
(reg:SI 69 frm))
original cost = 4 + 4 (weighted: 8.000000), replacement cost =
2147483647; keeping replacement
rescanning insn with uid = 42.
updating insn 42 in-place
verify found no changes in insn with uid = 42.
deleting insn 40
For example we have code as blow:
9 │ int test_exampe () {
10 │ test ();
11 │
12 │ size_t vl = 4;
13 │ vfloat16m1_t va = __riscv_vle16_v_f16m1(a, vl);
14 │ va = __riscv_vfnmadd_vv_f16m1_rm(va, va, va, __RISCV_FRM_RDN, vl);
15 │ va = __riscv_vfmsac_vv_f16m1(va, va, va, vl);
16 │
17 │ __riscv_vse16_v_f16m1(b, va, vl);
18 │
19 │ return 0;
20 │ }
it will be compiled to:
53 │ main:
54 │ addi sp,sp,-16
55 │ sd ra,8(sp)
56 │ call initialize
57 │ lui a6,%hi(b)
58 │ lui a2,%hi(a)
59 │ addi a3,a6,%lo(b)
60 │ addi a2,a2,%lo(a)
61 │ li a4,4
62 │ .L8:
63 │ fsrmi 2
64 │ vsetvli a5,a4,e16,m1,ta,ma
65 │ vle16.v v1,0(a2)
66 │ slli a1,a5,1
67 │ subw a4,a4,a5
68 │ add a2,a2,a1
69 │ vfnmadd.vv v1,v1,v1
>> The fsrm a0 insn is deleted by late-combine <<
70 │ vfmsub.vv v1,v1,v1
71 │ vse16.v v1,0(a3)
72 │ add a3,a3,a1
73 │ bgt a4,zero,.L8
74 │ lh a4,%lo(b)(a6)
75 │ li a5,-20480
76 │ addi a5,a5,-1382
77 │ bne a4,a5,.L14
78 │ ld ra,8(sp)
79 │ li a0,0
80 │ addi sp,sp,16
81 │ jr ra
This patch would like to add the FRM register to the global_regs as it
is a cooperatively-managed global register. And then the fsrm insn will
not be eliminated by late-combine. The related spec17 cam4 failure may
also caused by this issue too.
The below test suites are passed for this patch.
* The rv64gcv fully regression test.
PR target/118103
gcc/ChangeLog:
* config/riscv/riscv.cc (riscv_conditional_register_usage): Add
the FRM as the global_regs.
gcc/testsuite/ChangeLog:
* gcc.target/riscv/rvv/base/pr118103-1.c: New test.
* gcc.target/riscv/rvv/base/pr118103-run-1.c: New test.
Signed-off-by: Pan Li <pan2.li@intel.com>
---
gcc/config/riscv/riscv.cc | 4 +-
.../gcc.target/riscv/rvv/base/pr118103-1.c | 27 ++++++++++
.../riscv/rvv/base/pr118103-run-1.c | 50 +++++++++++++++++++
3 files changed, 80 insertions(+), 1 deletion(-)
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/pr118103-1.c
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/pr118103-run-1.c
diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index 4652454b8fec..dd50fe4eddfb 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -10885,7 +10885,9 @@ riscv_conditional_register_usage (void)
call_used_regs[r] = 1;
}
- if (!TARGET_HARD_FLOAT)
+ if (TARGET_HARD_FLOAT)
+ global_regs[FRM_REGNUM] = 1;
+ else
{
for (int regno = FP_REG_FIRST; regno <= FP_REG_LAST; regno++)
fixed_regs[regno] = call_used_regs[regno] = 1;
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/pr118103-1.c b/gcc/testsuite/gcc.target/riscv/rvv/base/pr118103-1.c
new file mode 100644
index 000000000000..1afa5d3afb50
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/base/pr118103-1.c
@@ -0,0 +1,27 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=rv64gcv_zvfh -mabi=lp64d" } */
+
+#include "riscv_vector.h"
+
+#define N 4
+typedef _Float16 float16_t;
+float16_t a[N]; float16_t b[N];
+
+extern void test ();
+
+int test_exampe () {
+ test ();
+
+ size_t vl = N;
+ vfloat16m1_t va = __riscv_vle16_v_f16m1(a, vl);
+ va = __riscv_vfnmadd_vv_f16m1_rm(va, va, va, __RISCV_FRM_RDN, vl);
+ va = __riscv_vfmsac_vv_f16m1(va, va, va, vl);
+
+ __riscv_vse16_v_f16m1(b, va, vl);
+
+ return 0;
+}
+
+/* { dg-final { scan-assembler-times {frrm\s+[axs][0-9]+} 1 } } */
+/* { dg-final { scan-assembler-times {fsrmi\s+[01234]} 1 } } */
+/* { dg-final { scan-assembler-times {fsrm\s+[axs][0-9]+} 1 } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/pr118103-run-1.c b/gcc/testsuite/gcc.target/riscv/rvv/base/pr118103-run-1.c
new file mode 100644
index 000000000000..62375c63ee86
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/base/pr118103-run-1.c
@@ -0,0 +1,50 @@
+/* { dg-do run { target { riscv_zvfh } } } */
+/* { dg-options "-O3 -fno-strict-aliasing" } */
+
+#include "riscv_vector.h"
+#define N 4
+typedef _Float16 float16_t;
+float16_t a[N]; float16_t b[N];
+
+void initialize () {
+ uint16_t tmp_0[N] = {43883, 3213, 238, 275, };
+
+ for (int i = 0; i < N; ++i)
+ {
+ union { float16_t f16; uint16_t u16; } converter;
+ converter.u16 = tmp_0[i];
+ a[i] = converter.f16;
+ }
+
+ for (int i = 0; i < N; ++i)
+ b[i] = 0;
+}
+
+void compute ()
+{
+ int avl = N;
+ float16_t* ptr_a = a; float16_t* ptr_b = b;
+
+ for (size_t vl; avl > 0; avl -= vl)
+ {
+ vl = __riscv_vsetvl_e16m1(avl);
+ vfloat16m1_t va = __riscv_vle16_v_f16m1(ptr_a, vl);
+ va = __riscv_vfnmadd_vv_f16m1_rm(va, va, va, __RISCV_FRM_RDN, vl);
+ va = __riscv_vfmsac_vv_f16m1(va, va, va, vl);
+ __riscv_vse16_v_f16m1(ptr_b, va, vl);
+ ptr_a += vl; ptr_b += vl;
+ }
+}
+
+int main ()
+{
+ initialize();
+ compute();
+
+ short *tmp = (short *)b;
+
+ if (*tmp != -21862)
+ __builtin_abort ();
+
+ return 0;
+}
--
2.43.5