2009-03-27 Jakub Jelinek PR rtl-optimization/39543 * fwprop.c (forward_propagate_asm): New function. (forward_propagate_and_simplify): Propagate also into __asm, if it doesn't increase the number of referenced registers. * gcc.target/i386/pr39543-1.c: New test. * gcc.target/i386/pr39543-2.c: New test. * gcc.target/i386/pr39543-3.c: New test. --- gcc/fwprop.c.jj 2009-03-27 07:55:33.000000000 +0100 +++ gcc/fwprop.c 2009-03-27 10:00:48.000000000 +0100 @@ -1,5 +1,5 @@ /* RTL-based forward propagation pass for GNU compiler. - Copyright (C) 2005, 2006, 2007, 2008 Free Software Foundation, Inc. + Copyright (C) 2005, 2006, 2007, 2008, 2009 Free Software Foundation, Inc. Contributed by Paolo Bonzini and Steven Bosscher. This file is part of GCC. @@ -852,6 +852,73 @@ forward_propagate_subreg (df_ref use, rt return false; } +/* Try to replace USE with SRC (defined in DEF_INSN) in __asm. */ + +static bool +forward_propagate_asm (df_ref use, rtx def_insn, rtx def_set, rtx reg) +{ + rtx use_insn = DF_REF_INSN (use), src, use_pat, asm_operands, new_rtx, *loc; + int speed_p, i; + df_ref *use_vec; + + gcc_assert ((DF_REF_FLAGS (use) & DF_REF_IN_NOTE) == 0); + + src = SET_SRC (def_set); + use_pat = PATTERN (use_insn); + + /* In __asm don't replace if src might need more registers than + reg, as that could increase register pressure on the __asm. */ + use_vec = DF_INSN_USES (def_insn); + if (use_vec[0] && use_vec[1]) + return false; + + speed_p = optimize_bb_for_speed_p (BLOCK_FOR_INSN (use_insn)); + asm_operands = NULL_RTX; + switch (GET_CODE (use_pat)) + { + case ASM_OPERANDS: + asm_operands = use_pat; + break; + case SET: + loc = &SET_DEST (use_pat); + new_rtx = propagate_rtx (*loc, GET_MODE (*loc), reg, src, speed_p); + if (new_rtx) + validate_unshare_change (use_insn, loc, new_rtx, true); + asm_operands = SET_SRC (use_pat); + break; + case PARALLEL: + for (i = 0; i < XVECLEN (use_pat, 0); i++) + if (GET_CODE (XVECEXP (use_pat, 0, i)) == SET) + { + loc = &SET_DEST (XVECEXP (use_pat, 0, i)); + new_rtx = propagate_rtx (*loc, GET_MODE (*loc), reg, src, speed_p); + if (new_rtx) + validate_unshare_change (use_insn, loc, new_rtx, true); + asm_operands = SET_SRC (XVECEXP (use_pat, 0, i)); + } + else if (GET_CODE (XVECEXP (use_pat, 0, i)) == ASM_OPERANDS) + asm_operands = XVECEXP (use_pat, 0, i); + break; + default: + gcc_unreachable (); + } + + gcc_assert (asm_operands && GET_CODE (asm_operands) == ASM_OPERANDS); + for (i = 0; i < ASM_OPERANDS_INPUT_LENGTH (asm_operands); i++) + { + loc = &ASM_OPERANDS_INPUT (asm_operands, i); + new_rtx = propagate_rtx (*loc, GET_MODE (*loc), reg, src, speed_p); + if (new_rtx) + validate_unshare_change (use_insn, loc, new_rtx, true); + } + + if (num_changes_pending () == 0 || !apply_change_group ()) + return false; + + num_changes++; + return true; +} + /* Try to replace USE with SRC (defined in DEF_INSN) and simplify the result. */ @@ -863,12 +930,16 @@ forward_propagate_and_simplify (df_ref u rtx src, reg, new_rtx, *loc; bool set_reg_equal; enum machine_mode mode; + int asm_use = -1; + + if (INSN_CODE (use_insn) < 0) + asm_use = asm_noperands (PATTERN (use_insn)); - if (!use_set) + if (!use_set && asm_use < 0) return false; /* Do not propagate into PC, CC0, etc. */ - if (GET_MODE (SET_DEST (use_set)) == VOIDmode) + if (use_set && GET_MODE (SET_DEST (use_set)) == VOIDmode) return false; /* If def and use are subreg, check if they match. */ @@ -900,7 +971,7 @@ forward_propagate_and_simplify (df_ref u if (MEM_P (src) && MEM_READONLY_P (src)) { rtx x = avoid_constant_pool_reference (src); - if (x != src) + if (x != src && use_set) { rtx note = find_reg_note (use_insn, REG_EQUAL, NULL_RTX); rtx old_rtx = note ? XEXP (note, 0) : SET_SRC (use_set); @@ -911,6 +982,9 @@ forward_propagate_and_simplify (df_ref u return false; } + if (asm_use >= 0) + return forward_propagate_asm (use, def_insn, def_set, reg); + /* Else try simplifying. */ if (DF_REF_TYPE (use) == DF_REF_REG_MEM_STORE) --- gcc/testsuite/gcc.target/i386/pr39543-1.c.jj 2009-03-25 16:40:18.000000000 +0100 +++ gcc/testsuite/gcc.target/i386/pr39543-1.c 2009-03-25 16:40:50.000000000 +0100 @@ -0,0 +1,52 @@ +/* PR rtl-optimization/39543 */ +/* { dg-do compile } */ +/* { dg-options "-O3 -fomit-frame-pointer" } */ + +float __attribute__ ((aligned (16))) s0[128]; +const float s1 = 0.707; +float s2[8] __attribute__ ((aligned (16))); +float s3[8] __attribute__ ((aligned (16))); +float s4[16] __attribute__ ((aligned (16))); +float s5[16] __attribute__ ((aligned (16))); + +void +foo (int k, float *x, float *y, const float *d, const float *z) +{ + float *a, *b, *c, *e; + + a = x + 2 * k; + b = a + 2 * k; + c = b + 2 * k; + e = y + 2 * k; + __asm__ volatile ("" + : "=m" (x[0]), "=m" (b[0]), "=m" (a[0]), "=m" (c[0]) + : "m" (y[0]), "m" (y[k * 2]), "m" (x[0]), "m" (a[0]) + : "memory"); + for (;;) + { + __asm__ volatile ("" + : + : "m" (y[2]), "m" (d[2]), "m" (e[2]), "m" (z[2]) + : "memory"); + if (!--k) + break; + } + __asm__ volatile ("" + : "=m" (x[2]), "=m" (x[10]), "=m" (x[6]), "=m" (x[14]) + : "m" (y[2]), "m" (y[6]), "m" (x[2]), "m" (x[6]), + "m" (y[18]), "m" (s1) + : "memory"); +} + +void +bar (float *a) +{ + foo (4, a, a + 16, s2, s3); + foo (8, a, a + 32, s4, s5); +} + +void +baz (void) +{ + bar (s0); +} --- gcc/testsuite/gcc.target/i386/pr39543-2.c.jj 2009-03-25 16:40:18.000000000 +0100 +++ gcc/testsuite/gcc.target/i386/pr39543-2.c 2009-03-25 16:40:38.000000000 +0100 @@ -0,0 +1,51 @@ +/* PR rtl-optimization/39543 */ +/* { dg-do compile } */ +/* { dg-options "-O3" } */ + +float __attribute__ ((aligned (16))) s0[128]; +const float s1 = 0.707; +float s2[8] __attribute__ ((aligned (16))); +float s3[8] __attribute__ ((aligned (16))); +float s4[16] __attribute__ ((aligned (16))); +float s5[16] __attribute__ ((aligned (16))); + +void +foo (int k, float *x, float *y, const float *d, const float *z) +{ + float *a, *b, *c, *e; + + a = x + 2 * k; + b = a + 2 * k; + c = b + 2 * k; + e = y + 2 * k; + __asm__ volatile ("" + : "=m" (x[0]), "=m" (b[0]), "=m" (a[0]), "=m" (c[0]) + : "m" (y[0]), "m" (y[k * 2]), "m" (x[0]), "m" (a[0]) + : "memory"); + for (;;) + { + __asm__ volatile ("" + : + : "m" (y[2]), "m" (d[2]), "m" (e[2]), "m" (z[2]) + : "memory"); + if (!--k) + break; + } + __asm__ volatile ("" + : "=m" (x[2]), "=m" (x[10]), "=m" (x[6]), "=m" (x[14]) + : "m" (y[2]), "m" (y[6]), "m" (x[2]), "m" (x[6]), "m" (s1) + : "memory"); +} + +void +bar (float *a) +{ + foo (4, a, a + 16, s2, s3); + foo (8, a, a + 32, s4, s5); +} + +void +baz (void) +{ + bar (s0); +} --- gcc/testsuite/gcc.target/i386/pr39543-3.c.jj 2009-03-25 16:41:29.000000000 +0100 +++ gcc/testsuite/gcc.target/i386/pr39543-3.c 2009-03-25 16:41:19.000000000 +0100 @@ -0,0 +1,42 @@ +/* PR rtl-optimization/39543 */ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ + +int s[128]; + +void +f1 (void) +{ + int i; + asm volatile ("# %0 %1 %2 %3 %4 %5 %6 %7 %8 %9 %10 %11 %12 %13 %14 %15 %16 %17" + : "=r" (i) + : "m" (s[0]), "m" (s[2]), "m" (s[4]), "m" (s[6]), "m" (s[8]), + "m" (s[10]), "m" (s[12]), "m" (s[14]), "m" (s[16]), "m" (s[18]), + "m" (s[20]), "m" (s[22]), "m" (s[24]), "m" (s[26]), "m" (s[28]), + "m" (s[30]), "m" (s[32])); + asm volatile ("# %0 %1 %2 %3 %4 %5 %6 %7 %8 %9 %10 %11 %12 %13 %14 %15 %16 %17" + : "=r" (i) + : "m" (s[0]), "m" (s[2]), "m" (s[4]), "m" (s[6]), "m" (s[8]), + "m" (s[10]), "m" (s[12]), "m" (s[14]), "m" (s[16]), "m" (s[18]), + "m" (s[20]), "m" (s[22]), "m" (s[24]), "m" (s[26]), "m" (s[28]), + "m" (s[30]), "m" (s[32])); +} + +void +f2 (int *q) +{ + int i; + int *p = q + 32; + asm volatile ("# %0 %1 %2 %3 %4 %5 %6 %7 %8 %9 %10 %11 %12 %13 %14 %15 %16 %17" + : "=r" (i) + : "m" (p[0]), "m" (p[2]), "m" (p[4]), "m" (p[6]), "m" (p[8]), + "m" (p[10]), "m" (p[12]), "m" (p[14]), "m" (p[16]), "m" (p[18]), + "m" (p[20]), "m" (p[22]), "m" (p[24]), "m" (p[26]), "m" (p[28]), + "m" (p[30]), "m" (p[32])); + asm volatile ("# %0 %1 %2 %3 %4 %5 %6 %7 %8 %9 %10 %11 %12 %13 %14 %15 %16 %17" + : "=r" (i) + : "m" (p[0]), "m" (p[2]), "m" (p[4]), "m" (p[6]), "m" (p[8]), + "m" (p[10]), "m" (p[12]), "m" (p[14]), "m" (p[16]), "m" (p[18]), + "m" (p[20]), "m" (p[22]), "m" (p[24]), "m" (p[26]), "m" (p[28]), + "m" (p[30]), "m" (p[32])); +}