gcc/gcc44-atom.patch

2833 lines
95 KiB
Diff
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

2009-02-05 Joey Ye <joey.ye@intel.com>
Xuepeng Guo <xuepeng.guo@intel.com>
H.J. Lu <hongjiu.lu@intel.com>
Atom pipeline model, tuning and insn selection.
* rtlanal.c (reg_mentioned_by_mem_p_1): New function.
(reg_mentioned_by_mem_p): New function.
(reg_dep_by_addr_p): New function.
* rtl.h (reg_mentioned_by_mem_p): Declare new function.
(reg_dep_by_addr_p): Likewise.
* config.gcc (atom): Add atom config options and target.
* config/i386/i386.h (TARGET_ATOM): New target macro.
(X86_TUNE_OPT_AGU): New tuning flag.
(TARGET_OPT_AGU): New target option.
(TARGET_CPU_DEFAULT_atom): New CPU default.
(PROCESSOR_ATOM): New processor.
* config/i386/i386-c.c (ix86_target_macros_internal): New case
PROCESSOR_ATOM.
(ix86_target_macros_internal): Likewise.
* config/i386/i386-protos.h (ix86_lea_for_add_ok): Declare new
function.
(ix86_dep_by_shift_count): Likewise.
(ix86_agi_dependent): Likewise.
* config/i386/i386.c (atom_cost): New cost.
(m_ATOM): New macro flag.
(initial_ix86_tune_fe): Set m_ATOM.
(x86_accumulate_outgoing_args): Likewise.
(x86_arch_always_fancy_math_387): Likewise.
(processor_target): Add Atom cost.
(cpu_names): Add Atom cpu name.
(override_options): Set Atom ISA.
(LEA_SEARCH_THRESHOLD): New macro.
(distance_non_agu_define): New function.
(distance_agu_use): Likewise.
(ix86_lea_for_add_ok): Likewise.
(ix86_dep_by_shift_count): Likewise.
(ix86_agi_dependent): Make it global.
(ix86_issue_rate): New case PROCESSOR_ATOM.
(ix86_adjust_cost): Likewise.
* config/i386/i386.md (cpu): Add new value "atom".
(atom.md): Include atom.md.
(use_carry, movu): New attr.
(adddi3_carry_rex64): Set attr "use_carry".
(addqi3_carry): Likewise.
(addhi3_carry): Likewise.
(addsi3_carry): Likewise.
(*addsi3_carry_zext): Likewise.
(subdi3_carry_rex64): Likewise.
(subqi3_carry): Likewise.
(subhi3_carry): Likewise.
(subsi3_carry): Likewise.
(x86_movdicc_0_m1_rex64): Likewise.
(*x86_movdicc_0_m1_se): Likewise.
(x86_movsicc_0_m1): Likewise.
(*x86_movsicc_0_m1_se): Likewise.
(*adddi_1_rex64): Emit add insn as much as possible.
(*addsi_1): Likewise.
(return_internal): Set atom_unit.
(return_internal_long): Likewise.
(return_pop_internal): Likewise.
(*rcpsf2_sse): Set atom_sse_attr attr.
(*sqrt<mode>2_sse): Likewise.
(*prefetch_sse): Likewise.
* config/i386/sse.md (cpu): Set attr "atom_sse_attr".
(*prefetch_sse_rex): Likewise.
(sse_rcpv4sf2): Likewise.
(sse_vmrcpv4sf2): Likewise.
(sse_sqrtv4sf2): Likewise.
(<sse>_vmsqrt<mode>2): Likewise.
(sse_ldmxcsr): Likewise.
(sse_stmxcsr): Likewise.
(*sse_sfence): Likewise.
(sse2_clflush): Likewise.
(*sse2_mfence): Likewise.
(*sse2_lfence): Likewise.
(avx_movup<avxmodesuffixf2c><avxmodesuffix>): Set attr "movu".
(<sse>_movup<ssemodesuffixf2c>): Likewise.
(avx_movdqu<avxmodesuffix>): Likewise.
(avx_lddqu<avxmodesuffix>): Likewise.
(sse2_movntv2di): Change attr "type" to "ssemov".
(sse2_movntsi): Likewise.
(rsqrtv8sf2): Change attr "type" to "sseadd".
(sse3_addsubv2df3): Set attr "atom_unit".
(sse3_h<plusminus_insn>v4sf3): Likewise.
(*sse2_pmaddwd): Likewise.
(*vec_extractv2di_1_rex64): Likewise.
(*vec_extractv2di_1_avx): Likewise.
(sse2_psadbw): Likewise.
(ssse3_phaddwv8hi3): Likewise.
(ssse3_phaddwv4hi3): Likewise.
(ssse3_phadddv4si3): Likewise.
(ssse3_phadddv2si3): Likewise.
(ssse3_phaddswv8hi3): Likewise.
(ssse3_phaddswv4hi3): Likewise.
(ssse3_phsubwv8hi3): Likewise.
(ssse3_phsubwv4hi3): Likewise.
(ssse3_phsubdv4si3): Likewise.
(ssse3_phsubdv2si3): Likewise.
(ssse3_phsubswv8hi3): Likewise.
(ssse3_phsubswv4hi3): Likewise.
(ssse3_pmaddubsw128): Likewise.
(ssse3_pmaddubsw): Likewise.
(ssse3_palignrti): Likewise.
(ssse3_palignrdi): Likewise.
* config/i386/atom.md: New.
2009-02-05 H.J. Lu <hongjiu.lu@intel.com>
* config/i386/i386.c (ix86_agi_dependent): Remove the third
argument. Swap the first 2 arguments.
(ix86_adjust_cost): Updated.
2009-01-30 Vladimir Makarov <vmakarov@redhat.com>
* genautomata.c: Add a new year to the copyright. Add a new
reference.
(struct insn_reserv_decl): Add comments for member bypass_list.
(find_bypass): Remove.
(insert_bypass): New.
(process_decls): Use insert_bypass.
(output_internal_insn_latency_func): Output all bypasses with the
same input insn in one switch case.
* rtl.def (define_bypass): Describe bypass choice.
* doc/md.texi (define_bypass): Ditto.
--- gcc/doc/md.texi (.../trunk) (revision 144460)
+++ gcc/doc/md.texi (.../branches/ix86/atom) (revision 144601)
@@ -7506,6 +7506,11 @@ be ignored for this case. The additiona
recognize complicated bypasses, e.g.@: when the consumer is only an address
of insn @samp{store} (not a stored value).
+If there is more than one bypass with the same output and input insns, the
+chosen bypass is the first bypass with a guard in description whose
+guard function returns nonzero. If there is no such bypass, then
+bypass without the guard function is chosen.
+
@findex exclusion_set
@findex presence_set
@findex final_presence_set
--- gcc/rtlanal.c (.../trunk) (revision 144460)
+++ gcc/rtlanal.c (.../branches/ix86/atom) (revision 144601)
@@ -728,6 +728,129 @@ reg_mentioned_p (const_rtx reg, const_rt
}
return 0;
}
+
+static int
+reg_mentioned_by_mem_p_1 (const_rtx reg, const_rtx in,
+ bool *mem_p)
+{
+ const char *fmt;
+ int i;
+ enum rtx_code code;
+
+ if (in == 0)
+ return 0;
+
+ if (reg == in)
+ return 1;
+
+ if (GET_CODE (in) == LABEL_REF)
+ return reg == XEXP (in, 0);
+
+ code = GET_CODE (in);
+
+ switch (code)
+ {
+ /* Compare registers by number. */
+ case REG:
+ return REG_P (reg) && REGNO (in) == REGNO (reg);
+
+ /* These codes have no constituent expressions
+ and are unique. */
+ case SCRATCH:
+ case CC0:
+ case PC:
+ return 0;
+
+ case CONST_INT:
+ case CONST_VECTOR:
+ case CONST_DOUBLE:
+ case CONST_FIXED:
+ /* These are kept unique for a given value. */
+ return 0;
+
+ default:
+ break;
+ }
+
+ if (GET_CODE (reg) == code && rtx_equal_p (reg, in))
+ return 1;
+
+ fmt = GET_RTX_FORMAT (code);
+
+ for (i = GET_RTX_LENGTH (code) - 1; i >= 0; i--)
+ {
+ if (fmt[i] == 'E')
+ {
+ int j;
+ for (j = XVECLEN (in, i) - 1; j >= 0; j--)
+ if (reg_mentioned_by_mem_p_1 (reg, XVECEXP (in, i, j), mem_p))
+ {
+ if (code == MEM)
+ *mem_p = true;
+
+ return 1;
+ }
+ }
+ else if (fmt[i] == 'e'
+ && reg_mentioned_by_mem_p_1 (reg, XEXP (in, i), mem_p))
+ {
+ if (code == MEM)
+ *mem_p = true;
+
+ return 1;
+ }
+ }
+ return 0;
+}
+
+/* Similar to the function reg_mentioned_p, return true only when
+ register REG appears in a MEM container of RTX IN. */
+
+bool
+reg_mentioned_by_mem_p (const_rtx reg, const_rtx in)
+{
+ bool mem = false;
+
+ reg_mentioned_by_mem_p_1 (reg, in, &mem);
+ return mem;
+}
+
+/* Return true if dest register in set_insn is used in use_insn as
+ address calculation.
+ For example, returns true if
+ set_insn: reg_a = reg_b
+ use_insn: reg_c = (reg_a) # reg_a used in addr calculation
+ False if
+ set_insn: reg_a = reg_b
+ use_insn: (reg_c) = reg_a # reg_a is used, but not as addr. */
+
+bool
+reg_dep_by_addr_p (const_rtx set_insn, const_rtx use_insn)
+{
+ rtx pattern = PATTERN (set_insn);
+ rtx set_dest = NULL;
+
+ switch (GET_CODE (pattern))
+ {
+ case SET:
+ set_dest = SET_DEST (pattern);
+ break;
+ case PARALLEL:
+ {
+ rtx pattern2 = XVECEXP (PATTERN (set_insn), 0,0);
+ if (GET_CODE (pattern2) == SET)
+ set_dest = SET_DEST (pattern2);
+ break;
+ }
+ default:
+ set_dest = NULL;
+ }
+
+ /* True if destination of set is reg and used as address. */
+ return set_dest && REG_P (set_dest)
+ && reg_mentioned_by_mem_p (set_dest, use_insn);
+}
+
/* Return 1 if in between BEG and END, exclusive of BEG and END, there is
no CODE_LABEL insn. */
--- gcc/genautomata.c (.../trunk) (revision 144460)
+++ gcc/genautomata.c (.../branches/ix86/atom) (revision 144601)
@@ -22,21 +22,25 @@ along with GCC; see the file COPYING3.
/* References:
- 1. Detecting pipeline structural hazards quickly. T. Proebsting,
+ 1. The finite state automaton based pipeline hazard recognizer and
+ instruction scheduler in GCC. V. Makarov. Proceedings of GCC
+ summit, 2003.
+
+ 2. Detecting pipeline structural hazards quickly. T. Proebsting,
C. Fraser. Proceedings of ACM SIGPLAN-SIGACT Symposium on
Principles of Programming Languages, pages 280--286, 1994.
This article is a good start point to understand usage of finite
state automata for pipeline hazard recognizers. But I'd
- recommend the 2nd article for more deep understanding.
+ recommend the 1st and 3rd articles for a deeper understanding.
- 2. Efficient Instruction Scheduling Using Finite State Automata:
+ 3. Efficient Instruction Scheduling Using Finite State Automata:
V. Bala and N. Rubin, Proceedings of MICRO-28. This is the best
article about usage of finite state automata for pipeline hazard
recognizers.
- The current implementation is different from the 2nd article in the
- following:
+ The current implementation is described in the 1st article and it
+ is different from the 3rd article in the following:
1. New operator `|' (alternative) is permitted in functional unit
reservation which can be treated deterministically and
@@ -463,7 +467,10 @@ struct insn_reserv_decl
insn. */
int insn_num;
/* The following field value is list of bypasses in which given insn
- is output insn. */
+ is output insn. Bypasses with the same input insn stay one after
+ another in the list in the same order as their occurrences in the
+ description but the bypass without a guard stays always the last
+ in a row of bypasses with the same input insn. */
struct bypass_decl *bypass_list;
/* The following fields are defined by automaton generator. */
@@ -2367,18 +2374,67 @@ add_presence_absence (unit_set_el_t dest
}
-/* The function searches for bypass with given IN_INSN_RESERV in given
- BYPASS_LIST. */
-static struct bypass_decl *
-find_bypass (struct bypass_decl *bypass_list,
- struct insn_reserv_decl *in_insn_reserv)
-{
- struct bypass_decl *bypass;
-
- for (bypass = bypass_list; bypass != NULL; bypass = bypass->next)
- if (bypass->in_insn_reserv == in_insn_reserv)
- break;
- return bypass;
+/* The function inserts BYPASS in the list of bypasses of the
+ corresponding output insn. The order of bypasses in the list is
+ described in a comment for member `bypass_list' (see above). If
+ there is already the same bypass in the list the function reports
+ this and does nothing. */
+static void
+insert_bypass (struct bypass_decl *bypass)
+{
+ struct bypass_decl *curr, *last;
+ struct insn_reserv_decl *out_insn_reserv = bypass->out_insn_reserv;
+ struct insn_reserv_decl *in_insn_reserv = bypass->in_insn_reserv;
+
+ for (curr = out_insn_reserv->bypass_list, last = NULL;
+ curr != NULL;
+ last = curr, curr = curr->next)
+ if (curr->in_insn_reserv == in_insn_reserv)
+ {
+ if ((bypass->bypass_guard_name != NULL
+ && curr->bypass_guard_name != NULL
+ && ! strcmp (bypass->bypass_guard_name, curr->bypass_guard_name))
+ || bypass->bypass_guard_name == curr->bypass_guard_name)
+ {
+ if (bypass->bypass_guard_name == NULL)
+ {
+ if (!w_flag)
+ error ("the same bypass `%s - %s' is already defined",
+ bypass->out_insn_name, bypass->in_insn_name);
+ else
+ warning (0, "the same bypass `%s - %s' is already defined",
+ bypass->out_insn_name, bypass->in_insn_name);
+ }
+ else if (!w_flag)
+ error ("the same bypass `%s - %s' (guard %s) is already defined",
+ bypass->out_insn_name, bypass->in_insn_name,
+ bypass->bypass_guard_name);
+ else
+ warning
+ (0, "the same bypass `%s - %s' (guard %s) is already defined",
+ bypass->out_insn_name, bypass->in_insn_name,
+ bypass->bypass_guard_name);
+ return;
+ }
+ if (curr->bypass_guard_name == NULL)
+ break;
+ if (curr->next == NULL || curr->next->in_insn_reserv != in_insn_reserv)
+ {
+ last = curr;
+ break;
+ }
+
+ }
+ if (last == NULL)
+ {
+ bypass->next = out_insn_reserv->bypass_list;
+ out_insn_reserv->bypass_list = bypass;
+ }
+ else
+ {
+ bypass->next = last->next;
+ last->next = bypass;
+ }
}
/* The function processes pipeline description declarations, checks
@@ -2391,7 +2447,6 @@ process_decls (void)
decl_t decl_in_table;
decl_t out_insn_reserv;
decl_t in_insn_reserv;
- struct bypass_decl *bypass;
int automaton_presence;
int i;
@@ -2514,36 +2569,7 @@ process_decls (void)
= DECL_INSN_RESERV (out_insn_reserv);
DECL_BYPASS (decl)->in_insn_reserv
= DECL_INSN_RESERV (in_insn_reserv);
- bypass
- = find_bypass (DECL_INSN_RESERV (out_insn_reserv)->bypass_list,
- DECL_BYPASS (decl)->in_insn_reserv);
- if (bypass != NULL)
- {
- if (DECL_BYPASS (decl)->latency == bypass->latency)
- {
- if (!w_flag)
- error
- ("the same bypass `%s - %s' is already defined",
- DECL_BYPASS (decl)->out_insn_name,
- DECL_BYPASS (decl)->in_insn_name);
- else
- warning
- (0, "the same bypass `%s - %s' is already defined",
- DECL_BYPASS (decl)->out_insn_name,
- DECL_BYPASS (decl)->in_insn_name);
- }
- else
- error ("bypass `%s - %s' is already defined",
- DECL_BYPASS (decl)->out_insn_name,
- DECL_BYPASS (decl)->in_insn_name);
- }
- else
- {
- DECL_BYPASS (decl)->next
- = DECL_INSN_RESERV (out_insn_reserv)->bypass_list;
- DECL_INSN_RESERV (out_insn_reserv)->bypass_list
- = DECL_BYPASS (decl);
- }
+ insert_bypass (DECL_BYPASS (decl));
}
}
}
@@ -8159,19 +8185,32 @@ output_internal_insn_latency_func (void)
(advance_cycle_insn_decl)->insn_num));
fprintf (output_file, " case %d:\n",
bypass->in_insn_reserv->insn_num);
- if (bypass->bypass_guard_name == NULL)
- fprintf (output_file, " return %d;\n",
- bypass->latency);
- else
+ for (;;)
{
- fprintf (output_file,
- " if (%s (%s, %s))\n",
- bypass->bypass_guard_name, INSN_PARAMETER_NAME,
- INSN2_PARAMETER_NAME);
- fprintf (output_file,
- " return %d;\n break;\n",
- bypass->latency);
+ if (bypass->bypass_guard_name == NULL)
+ {
+ gcc_assert (bypass->next == NULL
+ || (bypass->in_insn_reserv
+ != bypass->next->in_insn_reserv));
+ fprintf (output_file, " return %d;\n",
+ bypass->latency);
+ }
+ else
+ {
+ fprintf (output_file,
+ " if (%s (%s, %s))\n",
+ bypass->bypass_guard_name, INSN_PARAMETER_NAME,
+ INSN2_PARAMETER_NAME);
+ fprintf (output_file, " return %d;\n",
+ bypass->latency);
+ }
+ if (bypass->next == NULL
+ || bypass->in_insn_reserv != bypass->next->in_insn_reserv)
+ break;
+ bypass = bypass->next;
}
+ if (bypass->bypass_guard_name != NULL)
+ fprintf (output_file, " break;\n");
}
fputs (" }\n break;\n", output_file);
}
--- gcc/rtl.def (.../trunk) (revision 144460)
+++ gcc/rtl.def (.../branches/ix86/atom) (revision 144601)
@@ -1088,7 +1088,11 @@ DEF_RTL_EXPR(FINAL_ABSENCE_SET, "final_a
guard for the bypass. The function will get the two insns as
parameters. If the function returns zero the bypass will be
ignored for this case. Additional guard is necessary to recognize
- complicated bypasses, e.g. when consumer is load address. */
+ complicated bypasses, e.g. when consumer is load address. If there
+ is more than one bypass with the same output and input insns, the
+ chosen bypass is the first bypass with a guard in description whose
+ guard function returns nonzero. If there is no such bypass, then
+ bypass without the guard function is chosen. */
DEF_RTL_EXPR(DEFINE_BYPASS, "define_bypass", "issS", RTX_EXTRA)
/* (define_automaton string) describes names of automata generated and
--- gcc/rtl.h (.../trunk) (revision 144460)
+++ gcc/rtl.h (.../branches/ix86/atom) (revision 144601)
@@ -1731,6 +1731,8 @@ extern rtx get_related_value (const_rtx)
extern bool offset_within_block_p (const_rtx, HOST_WIDE_INT);
extern void split_const (rtx, rtx *, rtx *);
extern int reg_mentioned_p (const_rtx, const_rtx);
+extern bool reg_mentioned_by_mem_p (const_rtx, const_rtx);
+extern bool reg_dep_by_addr_p (const_rtx, const_rtx);
extern int count_occurrences (const_rtx, const_rtx, int);
extern int reg_referenced_p (const_rtx, const_rtx);
extern int reg_used_between_p (const_rtx, const_rtx, const_rtx);
--- gcc/config.gcc (.../trunk) (revision 144460)
+++ gcc/config.gcc (.../branches/ix86/atom) (revision 144601)
@@ -1087,7 +1087,7 @@ i[34567]86-*-linux* | i[34567]86-*-kfree
tmake_file="${tmake_file} i386/t-linux64"
need_64bit_hwint=yes
case X"${with_cpu}" in
- Xgeneric|Xcore2|Xnocona|Xx86-64|Xamdfam10|Xbarcelona|Xk8|Xopteron|Xathlon64|Xathlon-fx)
+ Xgeneric|Xatom|Xcore2|Xnocona|Xx86-64|Xamdfam10|Xbarcelona|Xk8|Xopteron|Xathlon64|Xathlon-fx)
;;
X)
if test x$with_cpu_64 = x; then
@@ -1096,7 +1096,7 @@ i[34567]86-*-linux* | i[34567]86-*-kfree
;;
*)
echo "Unsupported CPU used in --with-cpu=$with_cpu, supported values:" 1>&2
- echo "generic core2 nocona x86-64 amdfam10 barcelona k8 opteron athlon64 athlon-fx" 1>&2
+ echo "generic atom core2 nocona x86-64 amdfam10 barcelona k8 opteron athlon64 athlon-fx" 1>&2
exit 1
;;
esac
@@ -1201,7 +1201,7 @@ i[34567]86-*-solaris2*)
# libgcc/configure.ac instead.
need_64bit_hwint=yes
case X"${with_cpu}" in
- Xgeneric|Xcore2|Xnocona|Xx86-64|Xamdfam10|Xbarcelona|Xk8|Xopteron|Xathlon64|Xathlon-fx)
+ Xgeneric|Xatom|Xcore2|Xnocona|Xx86-64|Xamdfam10|Xbarcelona|Xk8|Xopteron|Xathlon64|Xathlon-fx)
;;
X)
if test x$with_cpu_64 = x; then
@@ -1210,7 +1210,7 @@ i[34567]86-*-solaris2*)
;;
*)
echo "Unsupported CPU used in --with-cpu=$with_cpu, supported values:" 1>&2
- echo "generic core2 nocona x86-64 amdfam10 barcelona k8 opteron athlon64 athlon-fx" 1>&2
+ echo "generic atom core2 nocona x86-64 amdfam10 barcelona k8 opteron athlon64 athlon-fx" 1>&2
exit 1
;;
esac
@@ -2803,7 +2803,7 @@ case "${target}" in
esac
# OK
;;
- "" | amdfam10 | barcelona | k8 | opteron | athlon64 | athlon-fx | nocona | core2 | generic)
+ "" | amdfam10 | barcelona | k8 | opteron | athlon64 | athlon-fx | nocona | core2 | atom | generic)
# OK
;;
*)
--- gcc/config/i386/i386.h (.../trunk) (revision 144460)
+++ gcc/config/i386/i386.h (.../branches/ix86/atom) (revision 144601)
@@ -231,6 +231,7 @@ extern const struct processor_costs ix86
#define TARGET_GENERIC64 (ix86_tune == PROCESSOR_GENERIC64)
#define TARGET_GENERIC (TARGET_GENERIC32 || TARGET_GENERIC64)
#define TARGET_AMDFAM10 (ix86_tune == PROCESSOR_AMDFAM10)
+#define TARGET_ATOM (ix86_tune == PROCESSOR_ATOM)
/* Feature tests against the various tunings. */
enum ix86_tune_indices {
@@ -295,6 +296,7 @@ enum ix86_tune_indices {
X86_TUNE_USE_VECTOR_FP_CONVERTS,
X86_TUNE_USE_VECTOR_CONVERTS,
X86_TUNE_FUSE_CMP_AND_BRANCH,
+ X86_TUNE_OPT_AGU,
X86_TUNE_LAST
};
@@ -382,6 +384,7 @@ extern unsigned char ix86_tune_features[
ix86_tune_features[X86_TUNE_USE_VECTOR_CONVERTS]
#define TARGET_FUSE_CMP_AND_BRANCH \
ix86_tune_features[X86_TUNE_FUSE_CMP_AND_BRANCH]
+#define TARGET_OPT_AGU ix86_tune_features[X86_TUNE_OPT_AGU]
/* Feature tests against the various architecture variations. */
enum ix86_arch_indices {
@@ -564,6 +567,7 @@ enum target_cpu_default
TARGET_CPU_DEFAULT_prescott,
TARGET_CPU_DEFAULT_nocona,
TARGET_CPU_DEFAULT_core2,
+ TARGET_CPU_DEFAULT_atom,
TARGET_CPU_DEFAULT_geode,
TARGET_CPU_DEFAULT_k6,
@@ -2256,6 +2260,7 @@ enum processor_type
PROCESSOR_GENERIC32,
PROCESSOR_GENERIC64,
PROCESSOR_AMDFAM10,
+ PROCESSOR_ATOM,
PROCESSOR_max
};
--- gcc/config/i386/i386.md (.../trunk) (revision 144460)
+++ gcc/config/i386/i386.md (.../branches/ix86/atom) (revision 144601)
@@ -298,7 +298,7 @@ (define_constants
;; Processor type.
-(define_attr "cpu" "none,pentium,pentiumpro,geode,k6,athlon,k8,core2,
+(define_attr "cpu" "none,pentium,pentiumpro,geode,k6,athlon,k8,core2,atom,
generic64,amdfam10"
(const (symbol_ref "ix86_schedule")))
@@ -594,6 +594,12 @@ (define_attr "fp_int_src" "false,true"
(define_attr "i387_cw" "trunc,floor,ceil,mask_pm,uninitialized,any"
(const_string "any"))
+;; Define attribute to classify add/sub insns that consume the carry flag (CF)
+(define_attr "use_carry" "0,1" (const_string "0"))
+
+;; Define attribute to indicate unaligned ssemov insns
+(define_attr "movu" "0,1" (const_string "0"))
+
;; Describe a user's asm statement.
(define_asm_attributes
[(set_attr "length" "128")
@@ -709,6 +715,7 @@ (define_mode_iterator P [(SI "Pmode == S
(include "k6.md")
(include "athlon.md")
(include "geode.md")
+(include "atom.md")
;; Operand and operator predicates and constraints
@@ -5776,6 +5783,7 @@ (define_insn "adddi3_carry_rex64"
"TARGET_64BIT && ix86_binary_operator_ok (PLUS, DImode, operands)"
"adc{q}\t{%2, %0|%0, %2}"
[(set_attr "type" "alu")
+ (set_attr "use_carry" "1")
(set_attr "pent_pair" "pu")
(set_attr "mode" "DI")])
@@ -5850,6 +5858,7 @@ (define_insn "addqi3_carry"
"ix86_binary_operator_ok (PLUS, QImode, operands)"
"adc{b}\t{%2, %0|%0, %2}"
[(set_attr "type" "alu")
+ (set_attr "use_carry" "1")
(set_attr "pent_pair" "pu")
(set_attr "mode" "QI")])
@@ -5862,6 +5871,7 @@ (define_insn "addhi3_carry"
"ix86_binary_operator_ok (PLUS, HImode, operands)"
"adc{w}\t{%2, %0|%0, %2}"
[(set_attr "type" "alu")
+ (set_attr "use_carry" "1")
(set_attr "pent_pair" "pu")
(set_attr "mode" "HI")])
@@ -5874,6 +5884,7 @@ (define_insn "addsi3_carry"
"ix86_binary_operator_ok (PLUS, SImode, operands)"
"adc{l}\t{%2, %0|%0, %2}"
[(set_attr "type" "alu")
+ (set_attr "use_carry" "1")
(set_attr "pent_pair" "pu")
(set_attr "mode" "SI")])
@@ -5887,6 +5898,7 @@ (define_insn "*addsi3_carry_zext"
"TARGET_64BIT && ix86_binary_operator_ok (PLUS, SImode, operands)"
"adc{l}\t{%2, %k0|%k0, %2}"
[(set_attr "type" "alu")
+ (set_attr "use_carry" "1")
(set_attr "pent_pair" "pu")
(set_attr "mode" "SI")])
@@ -6116,9 +6128,9 @@ (define_insn_and_split "*lea_general_3_z
(set_attr "mode" "SI")])
(define_insn "*adddi_1_rex64"
- [(set (match_operand:DI 0 "nonimmediate_operand" "=r,rm,r")
- (plus:DI (match_operand:DI 1 "nonimmediate_operand" "%0,0,r")
- (match_operand:DI 2 "x86_64_general_operand" "rme,re,le")))
+ [(set (match_operand:DI 0 "nonimmediate_operand" "=r,rm,r,r")
+ (plus:DI (match_operand:DI 1 "nonimmediate_operand" "%0,0,r,r")
+ (match_operand:DI 2 "x86_64_general_operand" "rme,re,0,le")))
(clobber (reg:CC FLAGS_REG))]
"TARGET_64BIT && ix86_binary_operator_ok (PLUS, DImode, operands)"
{
@@ -6139,6 +6151,10 @@ (define_insn "*adddi_1_rex64"
}
default:
+ /* Use add as much as possible to replace lea for AGU optimization. */
+ if (which_alternative == 2 && TARGET_OPT_AGU)
+ return "add{q}\t{%1, %0|%0, %1}";
+
gcc_assert (rtx_equal_p (operands[0], operands[1]));
/* Make things pretty and `subl $4,%eax' rather than `addl $-4, %eax'.
@@ -6157,8 +6173,11 @@ (define_insn "*adddi_1_rex64"
}
}
[(set (attr "type")
- (cond [(eq_attr "alternative" "2")
+ (cond [(and (eq_attr "alternative" "2")
+ (eq (symbol_ref "TARGET_OPT_AGU") (const_int 0)))
(const_string "lea")
+ (eq_attr "alternative" "3")
+ (const_string "lea")
; Current assemblers are broken and do not allow @GOTOFF in
; ought but a memory context.
(match_operand:DI 2 "pic_symbolic_operand" "")
@@ -6175,8 +6194,8 @@ (define_split
(plus:DI (match_operand:DI 1 "register_operand" "")
(match_operand:DI 2 "x86_64_nonmemory_operand" "")))
(clobber (reg:CC FLAGS_REG))]
- "TARGET_64BIT && reload_completed
- && true_regnum (operands[0]) != true_regnum (operands[1])"
+ "TARGET_64BIT && reload_completed
+ && ix86_lea_for_add_ok (PLUS, insn, operands)"
[(set (match_dup 0)
(plus:DI (match_dup 1)
(match_dup 2)))]
@@ -6380,9 +6399,9 @@ (define_insn "*adddi_5_rex64"
(define_insn "*addsi_1"
- [(set (match_operand:SI 0 "nonimmediate_operand" "=r,rm,r")
- (plus:SI (match_operand:SI 1 "nonimmediate_operand" "%0,0,r")
- (match_operand:SI 2 "general_operand" "g,ri,li")))
+ [(set (match_operand:SI 0 "nonimmediate_operand" "=r,rm,r,r")
+ (plus:SI (match_operand:SI 1 "nonimmediate_operand" "%0,0,r,r")
+ (match_operand:SI 2 "general_operand" "g,ri,0,li")))
(clobber (reg:CC FLAGS_REG))]
"ix86_binary_operator_ok (PLUS, SImode, operands)"
{
@@ -6403,6 +6422,10 @@ (define_insn "*addsi_1"
}
default:
+ /* Use add as much as possible to replace lea for AGU optimization. */
+ if (which_alternative == 2 && TARGET_OPT_AGU)
+ return "add{l}\t{%1, %0|%0, %1}";
+
gcc_assert (rtx_equal_p (operands[0], operands[1]));
/* Make things pretty and `subl $4,%eax' rather than `addl $-4, %eax'.
@@ -6419,7 +6442,10 @@ (define_insn "*addsi_1"
}
}
[(set (attr "type")
- (cond [(eq_attr "alternative" "2")
+ (cond [(and (eq_attr "alternative" "2")
+ (eq (symbol_ref "TARGET_OPT_AGU") (const_int 0)))
+ (const_string "lea")
+ (eq_attr "alternative" "3")
(const_string "lea")
; Current assemblers are broken and do not allow @GOTOFF in
; ought but a memory context.
@@ -6437,8 +6463,7 @@ (define_split
(plus (match_operand 1 "register_operand" "")
(match_operand 2 "nonmemory_operand" "")))
(clobber (reg:CC FLAGS_REG))]
- "reload_completed
- && true_regnum (operands[0]) != true_regnum (operands[1])"
+ "reload_completed && ix86_lea_for_add_ok (PLUS, insn, operands)"
[(const_int 0)]
{
rtx pat;
@@ -7539,6 +7564,7 @@ (define_insn "subdi3_carry_rex64"
"TARGET_64BIT && ix86_binary_operator_ok (MINUS, DImode, operands)"
"sbb{q}\t{%2, %0|%0, %2}"
[(set_attr "type" "alu")
+ (set_attr "use_carry" "1")
(set_attr "pent_pair" "pu")
(set_attr "mode" "DI")])
@@ -7587,6 +7613,7 @@ (define_insn "subqi3_carry"
"ix86_binary_operator_ok (MINUS, QImode, operands)"
"sbb{b}\t{%2, %0|%0, %2}"
[(set_attr "type" "alu")
+ (set_attr "use_carry" "1")
(set_attr "pent_pair" "pu")
(set_attr "mode" "QI")])
@@ -7599,6 +7626,7 @@ (define_insn "subhi3_carry"
"ix86_binary_operator_ok (MINUS, HImode, operands)"
"sbb{w}\t{%2, %0|%0, %2}"
[(set_attr "type" "alu")
+ (set_attr "use_carry" "1")
(set_attr "pent_pair" "pu")
(set_attr "mode" "HI")])
@@ -7611,6 +7639,7 @@ (define_insn "subsi3_carry"
"ix86_binary_operator_ok (MINUS, SImode, operands)"
"sbb{l}\t{%2, %0|%0, %2}"
[(set_attr "type" "alu")
+ (set_attr "use_carry" "1")
(set_attr "pent_pair" "pu")
(set_attr "mode" "SI")])
@@ -15224,6 +15253,7 @@ (define_insn "return_internal"
"reload_completed"
"ret"
[(set_attr "length" "1")
+ (set_attr "atom_unit" "jeu")
(set_attr "length_immediate" "0")
(set_attr "modrm" "0")])
@@ -15236,6 +15266,7 @@ (define_insn "return_internal_long"
"reload_completed"
"rep\;ret"
[(set_attr "length" "1")
+ (set_attr "atom_unit" "jeu")
(set_attr "length_immediate" "0")
(set_attr "prefix_rep" "1")
(set_attr "modrm" "0")])
@@ -15246,6 +15277,7 @@ (define_insn "return_pop_internal"
"reload_completed"
"ret\t%0"
[(set_attr "length" "3")
+ (set_attr "atom_unit" "jeu")
(set_attr "length_immediate" "2")
(set_attr "modrm" "0")])
@@ -16367,6 +16399,7 @@ (define_insn "*rcpsf2_sse"
"TARGET_SSE_MATH"
"%vrcpss\t{%1, %d0|%d0, %1}"
[(set_attr "type" "sse")
+ (set_attr "atom_sse_attr" "rcp")
(set_attr "prefix" "maybe_vex")
(set_attr "mode" "SF")])
@@ -16718,6 +16751,7 @@ (define_insn "*rsqrtsf2_sse"
"TARGET_SSE_MATH"
"%vrsqrtss\t{%1, %d0|%d0, %1}"
[(set_attr "type" "sse")
+ (set_attr "atom_sse_attr" "rcp")
(set_attr "prefix" "maybe_vex")
(set_attr "mode" "SF")])
@@ -16738,6 +16772,7 @@ (define_insn "*sqrt<mode>2_sse"
"SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH"
"%vsqrts<ssemodefsuffix>\t{%1, %d0|%d0, %1}"
[(set_attr "type" "sse")
+ (set_attr "atom_sse_attr" "sqrt")
(set_attr "prefix" "maybe_vex")
(set_attr "mode" "<MODE>")
(set_attr "athlon_decode" "*")
@@ -19791,6 +19826,7 @@ (define_insn "x86_movdicc_0_m1_rex64"
; Since we don't have the proper number of operands for an alu insn,
; fill in all the blanks.
[(set_attr "type" "alu")
+ (set_attr "use_carry" "1")
(set_attr "pent_pair" "pu")
(set_attr "memory" "none")
(set_attr "imm_disp" "false")
@@ -19806,6 +19842,7 @@ (define_insn "*x86_movdicc_0_m1_se"
""
"sbb{q}\t%0, %0"
[(set_attr "type" "alu")
+ (set_attr "use_carry" "1")
(set_attr "pent_pair" "pu")
(set_attr "memory" "none")
(set_attr "imm_disp" "false")
@@ -19849,6 +19886,7 @@ (define_insn "x86_movsicc_0_m1"
; Since we don't have the proper number of operands for an alu insn,
; fill in all the blanks.
[(set_attr "type" "alu")
+ (set_attr "use_carry" "1")
(set_attr "pent_pair" "pu")
(set_attr "memory" "none")
(set_attr "imm_disp" "false")
@@ -19864,6 +19902,7 @@ (define_insn "*x86_movsicc_0_m1_se"
""
"sbb{l}\t%0, %0"
[(set_attr "type" "alu")
+ (set_attr "use_carry" "1")
(set_attr "pent_pair" "pu")
(set_attr "memory" "none")
(set_attr "imm_disp" "false")
@@ -20196,7 +20235,8 @@ (define_insn "pro_epilogue_adjust_stack_
}
}
[(set (attr "type")
- (cond [(eq_attr "alternative" "0")
+ (cond [(and (eq_attr "alternative" "0")
+ (eq (symbol_ref "TARGET_OPT_AGU") (const_int 0)))
(const_string "alu")
(match_operand:SI 2 "const0_operand" "")
(const_string "imov")
@@ -20239,7 +20279,8 @@ (define_insn "pro_epilogue_adjust_stack_
}
}
[(set (attr "type")
- (cond [(eq_attr "alternative" "0")
+ (cond [(and (eq_attr "alternative" "0")
+ (eq (symbol_ref "TARGET_OPT_AGU") (const_int 0)))
(const_string "alu")
(match_operand:DI 2 "const0_operand" "")
(const_string "imov")
@@ -21731,6 +21772,7 @@ (define_insn "*prefetch_sse"
return patterns[locality];
}
[(set_attr "type" "sse")
+ (set_attr "atom_sse_attr" "prefetch")
(set_attr "memory" "none")])
(define_insn "*prefetch_sse_rex"
@@ -21749,6 +21791,7 @@ (define_insn "*prefetch_sse_rex"
return patterns[locality];
}
[(set_attr "type" "sse")
+ (set_attr "atom_sse_attr" "prefetch")
(set_attr "memory" "none")])
(define_insn "*prefetch_3dnow"
--- gcc/config/i386/atom.md (.../trunk) (revision 0)
+++ gcc/config/i386/atom.md (.../branches/ix86/atom) (revision 144601)
@@ -0,0 +1,796 @@
+;; Atom Scheduling
+;; Copyright (C) 2009 Free Software Foundation, Inc.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify
+;; it under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 2, or (at your option)
+;; any later version.
+;;
+;; GCC is distributed in the hope that it will be useful,
+;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;; GNU General Public License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING. If not, write to
+;; the Free Software Foundation, 51 Franklin Street, Fifth Floor,
+;; Boston, MA 02110-1301, USA.
+;;
+;; Atom is an in-order core with two integer pipelines.
+
+
+(define_attr "atom_unit" "sishuf,simul,jeu,complex,other"
+ (const_string "other"))
+
+(define_attr "atom_sse_attr" "rcp,movdup,lfence,fence,prefetch,sqrt,mxcsr,other"
+ (const_string "other"))
+
+(define_automaton "atom")
+
+;; Atom has two ports: port 0 and port 1 connecting to all execution units
+(define_cpu_unit "atom-port-0,atom-port-1" "atom")
+
+;; EU: Execution Unit
+;; Atom EUs are connected by port 0 or port 1.
+
+(define_cpu_unit "atom-eu-0, atom-eu-1,
+ atom-imul-1, atom-imul-2, atom-imul-3, atom-imul-4"
+ "atom")
+
+;; Some EUs have duplicated copies and can be accessed via either
+;; port 0 or port 1
+;; (define_reservation "atom-port-either" "(atom-port-0 | atom-port-1)")
+
+;;; Some instructions are dual-pipe execution and need both ports.
+;;; Complex multi-op macro-instructions need both ports and all EUs.
+(define_reservation "atom-port-dual" "(atom-port-0 + atom-port-1)")
+(define_reservation "atom-all-eu" "(atom-eu-0 + atom-eu-1 +
+ atom-imul-1 + atom-imul-2 + atom-imul-3 +
+ atom-imul-4)")
+
+;;; Most simple instructions have 1 cycle latency. Some of them
+;;; issue in port 0, some in port 1 and some in either port.
+(define_reservation "atom-simple-0" "(atom-port-0 + atom-eu-0)")
+(define_reservation "atom-simple-1" "(atom-port-1 + atom-eu-1)")
+(define_reservation "atom-simple-either" "(atom-simple-0 | atom-simple-1)")
+
+;;; Some insns issue in port 0 with 3 cycles latency and 1 cycle throughput
+(define_reservation "atom-eu-0-3-1" "(atom-port-0 + atom-eu-0, nothing*2)")
+
+;;; fmul insn can have 4 or 5 cycles latency
+(define_reservation "atom-fmul-5c" "(atom-port-0 + atom-eu-0), nothing*4")
+(define_reservation "atom-fmul-4c" "(atom-port-0 + atom-eu-0), nothing*3")
+
+;;; fadd can have 5 cycles latency depending on instruction forms
+(define_reservation "atom-fadd-5c" "(atom-port-1 + atom-eu-1), nothing*5")
+
+;;; imul insn has 5 cycles latency
+(define_reservation "atom-imul-32"
+ "atom-imul-1, atom-imul-2, atom-imul-3, atom-imul-4,
+ atom-port-0")
+;;; imul instruction excludes other non-FP instructions.
+(exclusion_set "atom-eu-0, atom-eu-1"
+ "atom-imul-1, atom-imul-2, atom-imul-3, atom-imul-4")
+
+;;; dual-execution instructions can have 1,2,4,5 cycles latency depending on
+;;; instruction forms
+(define_reservation "atom-dual-1c" "(atom-port-dual + atom-eu-0 + atom-eu-1)")
+(define_reservation "atom-dual-2c"
+ "(atom-port-dual + atom-eu-0 + atom-eu-1, nothing)")
+(define_reservation "atom-dual-5c"
+ "(atom-port-dual + atom-eu-0 + atom-eu-1, nothing*4)")
+
+;;; Complex macro-instructions have varying latencies and use both ports.
+(define_reservation "atom-complex" "(atom-port-dual + atom-all-eu)")
+
+(define_insn_reservation "atom_other" 9
+ (and (eq_attr "cpu" "atom")
+ (and (eq_attr "type" "other")
+ (eq_attr "atom_unit" "!jeu")))
+ "atom-complex, atom-all-eu*8")
+
+;; return has type "other" with atom_unit "jeu"
+(define_insn_reservation "atom_other_2" 1
+ (and (eq_attr "cpu" "atom")
+ (and (eq_attr "type" "other")
+ (eq_attr "atom_unit" "jeu")))
+ "atom-dual-1c")
+
+(define_insn_reservation "atom_multi" 9
+ (and (eq_attr "cpu" "atom")
+ (eq_attr "type" "multi"))
+ "atom-complex, atom-all-eu*8")
+
+;; Normal alu insns without carry
+(define_insn_reservation "atom_alu" 1
+ (and (eq_attr "cpu" "atom")
+ (and (eq_attr "type" "alu")
+ (and (eq_attr "memory" "none")
+ (eq_attr "use_carry" "0"))))
+ "atom-simple-either")
+
+;; Normal alu insns without carry, with memory operand
+(define_insn_reservation "atom_alu_mem" 1
+ (and (eq_attr "cpu" "atom")
+ (and (eq_attr "type" "alu")
+ (and (eq_attr "memory" "!none")
+ (eq_attr "use_carry" "0"))))
+ "atom-simple-either")
+
+;; Alu insn consuming CF, such as adc/sbb
+(define_insn_reservation "atom_alu_carry" 1
+ (and (eq_attr "cpu" "atom")
+ (and (eq_attr "type" "alu")
+ (and (eq_attr "memory" "none")
+ (eq_attr "use_carry" "1"))))
+ "atom-simple-either")
+
+;; Alu insn consuming CF, such as adc/sbb, with memory operand
+(define_insn_reservation "atom_alu_carry_mem" 1
+ (and (eq_attr "cpu" "atom")
+ (and (eq_attr "type" "alu")
+ (and (eq_attr "memory" "!none")
+ (eq_attr "use_carry" "1"))))
+ "atom-simple-either")
+
+(define_insn_reservation "atom_alu1" 1
+ (and (eq_attr "cpu" "atom")
+ (and (eq_attr "type" "alu1")
+ (eq_attr "memory" "none")))
+ "atom-simple-either")
+
+(define_insn_reservation "atom_alu1_mem" 1
+ (and (eq_attr "cpu" "atom")
+ (and (eq_attr "type" "alu1")
+ (eq_attr "memory" "!none")))
+ "atom-simple-either")
+
+(define_insn_reservation "atom_negnot" 1
+ (and (eq_attr "cpu" "atom")
+ (and (eq_attr "type" "negnot")
+ (eq_attr "memory" "none")))
+ "atom-simple-either")
+
+(define_insn_reservation "atom_negnot_mem" 1
+ (and (eq_attr "cpu" "atom")
+ (and (eq_attr "type" "negnot")
+ (eq_attr "memory" "!none")))
+ "atom-simple-either")
+
+(define_insn_reservation "atom_imov" 1
+ (and (eq_attr "cpu" "atom")
+ (and (eq_attr "type" "imov")
+ (eq_attr "memory" "none")))
+ "atom-simple-either")
+
+(define_insn_reservation "atom_imov_mem" 1
+ (and (eq_attr "cpu" "atom")
+ (and (eq_attr "type" "imov")
+ (eq_attr "memory" "!none")))
+ "atom-simple-either")
+
+;; 16<-16, 32<-32
+(define_insn_reservation "atom_imovx" 1
+ (and (eq_attr "cpu" "atom")
+ (and (eq_attr "type" "imovx")
+ (and (eq_attr "memory" "none")
+ (ior (and (match_operand:HI 0 "register_operand")
+ (match_operand:HI 1 "general_operand"))
+ (and (match_operand:SI 0 "register_operand")
+ (match_operand:SI 1 "general_operand"))))))
+ "atom-simple-either")
+
+;; 16<-16, 32<-32, mem
+(define_insn_reservation "atom_imovx_mem" 1
+ (and (eq_attr "cpu" "atom")
+ (and (eq_attr "type" "imovx")
+ (and (eq_attr "memory" "!none")
+ (ior (and (match_operand:HI 0 "register_operand")
+ (match_operand:HI 1 "general_operand"))
+ (and (match_operand:SI 0 "register_operand")
+ (match_operand:SI 1 "general_operand"))))))
+ "atom-simple-either")
+
+;; 32<-16, 32<-8, 64<-16, 64<-8, 64<-32, 8<-8
+(define_insn_reservation "atom_imovx_2" 1
+ (and (eq_attr "cpu" "atom")
+ (and (eq_attr "type" "imovx")
+ (and (eq_attr "memory" "none")
+ (ior (match_operand:QI 0 "register_operand")
+ (ior (and (match_operand:SI 0 "register_operand")
+ (not (match_operand:SI 1 "general_operand")))
+ (match_operand:DI 0 "register_operand"))))))
+ "atom-simple-0")
+
+;; 32<-16, 32<-8, 64<-16, 64<-8, 64<-32, 8<-8, mem
+(define_insn_reservation "atom_imovx_2_mem" 1
+ (and (eq_attr "cpu" "atom")
+ (and (eq_attr "type" "imovx")
+ (and (eq_attr "memory" "!none")
+ (ior (match_operand:QI 0 "register_operand")
+ (ior (and (match_operand:SI 0 "register_operand")
+ (not (match_operand:SI 1 "general_operand")))
+ (match_operand:DI 0 "register_operand"))))))
+ "atom-simple-0")
+
+;; 16<-8
+(define_insn_reservation "atom_imovx_3" 3
+ (and (eq_attr "cpu" "atom")
+ (and (eq_attr "type" "imovx")
+ (and (match_operand:HI 0 "register_operand")
+ (match_operand:QI 1 "general_operand"))))
+ "atom-complex, atom-all-eu*2")
+
+(define_insn_reservation "atom_lea" 1
+ (and (eq_attr "cpu" "atom")
+ (and (eq_attr "type" "lea")
+ (eq_attr "mode" "!HI")))
+ "atom-simple-either")
+
+;; lea 16bit address is complex insn
+(define_insn_reservation "atom_lea_2" 2
+ (and (eq_attr "cpu" "atom")
+ (and (eq_attr "type" "lea")
+ (eq_attr "mode" "HI")))
+ "atom-complex, atom-all-eu")
+
+(define_insn_reservation "atom_incdec" 1
+ (and (eq_attr "cpu" "atom")
+ (and (eq_attr "type" "incdec")
+ (eq_attr "memory" "none")))
+ "atom-simple-either")
+
+(define_insn_reservation "atom_incdec_mem" 1
+ (and (eq_attr "cpu" "atom")
+ (and (eq_attr "type" "incdec")
+ (eq_attr "memory" "!none")))
+ "atom-simple-either")
+
+;; simple shift instructions use the SHIFT EU, no memory operand
+(define_insn_reservation "atom_ishift" 1
+ (and (eq_attr "cpu" "atom")
+ (and (eq_attr "type" "ishift")
+ (and (eq_attr "memory" "none") (eq_attr "prefix_0f" "0"))))
+ "atom-simple-0")
+
+;; simple shift instructions use the SHIFT EU, with memory operand
+(define_insn_reservation "atom_ishift_mem" 1
+ (and (eq_attr "cpu" "atom")
+ (and (eq_attr "type" "ishift")
+ (and (eq_attr "memory" "!none") (eq_attr "prefix_0f" "0"))))
+ "atom-simple-0")
+
+;; DF shift (prefixed with 0f) is complex insn with latency of 7 cycles
+(define_insn_reservation "atom_ishift_3" 7
+ (and (eq_attr "cpu" "atom")
+ (and (eq_attr "type" "ishift")
+ (eq_attr "prefix_0f" "1")))
+ "atom-complex, atom-all-eu*6")
+
+(define_insn_reservation "atom_ishift1" 1
+ (and (eq_attr "cpu" "atom")
+ (and (eq_attr "type" "ishift1")
+ (eq_attr "memory" "none")))
+ "atom-simple-0")
+
+(define_insn_reservation "atom_ishift1_mem" 1
+ (and (eq_attr "cpu" "atom")
+ (and (eq_attr "type" "ishift1")
+ (eq_attr "memory" "!none")))
+ "atom-simple-0")
+
+(define_insn_reservation "atom_rotate" 1
+ (and (eq_attr "cpu" "atom")
+ (and (eq_attr "type" "rotate")
+ (eq_attr "memory" "none")))
+ "atom-simple-0")
+
+(define_insn_reservation "atom_rotate_mem" 1
+ (and (eq_attr "cpu" "atom")
+ (and (eq_attr "type" "rotate")
+ (eq_attr "memory" "!none")))
+ "atom-simple-0")
+
+(define_insn_reservation "atom_rotate1" 1
+ (and (eq_attr "cpu" "atom")
+ (and (eq_attr "type" "rotate1")
+ (eq_attr "memory" "none")))
+ "atom-simple-0")
+
+(define_insn_reservation "atom_rotate1_mem" 1
+ (and (eq_attr "cpu" "atom")
+ (and (eq_attr "type" "rotate1")
+ (eq_attr "memory" "!none")))
+ "atom-simple-0")
+
+(define_insn_reservation "atom_imul" 5
+ (and (eq_attr "cpu" "atom")
+ (and (eq_attr "type" "imul")
+ (and (eq_attr "memory" "none") (eq_attr "mode" "SI"))))
+ "atom-imul-32")
+
+(define_insn_reservation "atom_imul_mem" 5
+ (and (eq_attr "cpu" "atom")
+ (and (eq_attr "type" "imul")
+ (and (eq_attr "memory" "!none") (eq_attr "mode" "SI"))))
+ "atom-imul-32")
+
+;; latency set to 10 as common 64x64 imul
+(define_insn_reservation "atom_imul_3" 10
+ (and (eq_attr "cpu" "atom")
+ (and (eq_attr "type" "imul")
+ (eq_attr "mode" "!SI")))
+ "atom-complex, atom-all-eu*9")
+
+(define_insn_reservation "atom_idiv" 65
+ (and (eq_attr "cpu" "atom")
+ (eq_attr "type" "idiv"))
+ "atom-complex, atom-all-eu*32, nothing*32")
+
+(define_insn_reservation "atom_icmp" 1
+ (and (eq_attr "cpu" "atom")
+ (and (eq_attr "type" "icmp")
+ (eq_attr "memory" "none")))
+ "atom-simple-either")
+
+(define_insn_reservation "atom_icmp_mem" 1
+ (and (eq_attr "cpu" "atom")
+ (and (eq_attr "type" "icmp")
+ (eq_attr "memory" "!none")))
+ "atom-simple-either")
+
+(define_insn_reservation "atom_test" 1
+ (and (eq_attr "cpu" "atom")
+ (and (eq_attr "type" "test")
+ (eq_attr "memory" "none")))
+ "atom-simple-either")
+
+(define_insn_reservation "atom_test_mem" 1
+ (and (eq_attr "cpu" "atom")
+ (and (eq_attr "type" "test")
+ (eq_attr "memory" "!none")))
+ "atom-simple-either")
+
+(define_insn_reservation "atom_ibr" 1
+ (and (eq_attr "cpu" "atom")
+ (and (eq_attr "type" "ibr")
+ (eq_attr "memory" "!load")))
+ "atom-simple-1")
+
+;; complex if jump target is from address
+(define_insn_reservation "atom_ibr_2" 2
+ (and (eq_attr "cpu" "atom")
+ (and (eq_attr "type" "ibr")
+ (eq_attr "memory" "load")))
+ "atom-complex, atom-all-eu")
+
+(define_insn_reservation "atom_setcc" 1
+ (and (eq_attr "cpu" "atom")
+ (and (eq_attr "type" "setcc")
+ (eq_attr "memory" "!store")))
+ "atom-simple-either")
+
+;; 2 cycles complex if target is in memory
+(define_insn_reservation "atom_setcc_2" 2
+ (and (eq_attr "cpu" "atom")
+ (and (eq_attr "type" "setcc")
+ (eq_attr "memory" "store")))
+ "atom-complex, atom-all-eu")
+
+(define_insn_reservation "atom_icmov" 1
+ (and (eq_attr "cpu" "atom")
+ (and (eq_attr "type" "icmov")
+ (eq_attr "memory" "none")))
+ "atom-simple-either")
+
+(define_insn_reservation "atom_icmov_mem" 1
+ (and (eq_attr "cpu" "atom")
+ (and (eq_attr "type" "icmov")
+ (eq_attr "memory" "!none")))
+ "atom-simple-either")
+
+;; UCODE if segreg, ignored
+(define_insn_reservation "atom_push" 2
+ (and (eq_attr "cpu" "atom")
+ (eq_attr "type" "push"))
+ "atom-dual-2c")
+
+;; pop r64 is 1 cycle. UCODE if segreg, ignored
+(define_insn_reservation "atom_pop" 1
+ (and (eq_attr "cpu" "atom")
+ (and (eq_attr "type" "pop")
+ (eq_attr "mode" "DI")))
+ "atom-dual-1c")
+
+;; pop non-r64 is 2 cycles. UCODE if segreg, ignored
+(define_insn_reservation "atom_pop_2" 2
+ (and (eq_attr "cpu" "atom")
+ (and (eq_attr "type" "pop")
+ (eq_attr "mode" "!DI")))
+ "atom-dual-2c")
+
+;; UCODE if segreg, ignored
+(define_insn_reservation "atom_call" 1
+ (and (eq_attr "cpu" "atom")
+ (eq_attr "type" "call"))
+ "atom-dual-1c")
+
+(define_insn_reservation "atom_callv" 1
+ (and (eq_attr "cpu" "atom")
+ (eq_attr "type" "callv"))
+ "atom-dual-1c")
+
+(define_insn_reservation "atom_leave" 3
+ (and (eq_attr "cpu" "atom")
+ (eq_attr "type" "leave"))
+ "atom-complex, atom-all-eu*2")
+
+(define_insn_reservation "atom_str" 3
+ (and (eq_attr "cpu" "atom")
+ (eq_attr "type" "str"))
+ "atom-complex, atom-all-eu*2")
+
+(define_insn_reservation "atom_sselog" 1
+ (and (eq_attr "cpu" "atom")
+ (and (eq_attr "type" "sselog")
+ (eq_attr "memory" "none")))
+ "atom-simple-either")
+
+(define_insn_reservation "atom_sselog_mem" 1
+ (and (eq_attr "cpu" "atom")
+ (and (eq_attr "type" "sselog")
+ (eq_attr "memory" "!none")))
+ "atom-simple-either")
+
+(define_insn_reservation "atom_sselog1" 1
+ (and (eq_attr "cpu" "atom")
+ (and (eq_attr "type" "sselog1")
+ (eq_attr "memory" "none")))
+ "atom-simple-0")
+
+(define_insn_reservation "atom_sselog1_mem" 1
+ (and (eq_attr "cpu" "atom")
+ (and (eq_attr "type" "sselog1")
+ (eq_attr "memory" "!none")))
+ "atom-simple-0")
+
+;; not pmaddwd, not psadbw
+(define_insn_reservation "atom_sseiadd" 1
+ (and (eq_attr "cpu" "atom")
+ (and (eq_attr "type" "sseiadd")
+ (and (not (match_operand:V2DI 0 "register_operand"))
+ (and (eq_attr "atom_unit" "!simul")
+ (eq_attr "atom_unit" "!complex")))))
+ "atom-simple-either")
+
+;; pmaddwd, psadbw with 64-bit (DI) mode
+(define_insn_reservation "atom_sseiadd_2" 4
+ (and (eq_attr "cpu" "atom")
+ (and (eq_attr "type" "sseiadd")
+ (and (not (match_operand:V2DI 0 "register_operand"))
+ (and (eq_attr "atom_unit" "simul" )
+ (eq_attr "mode" "DI")))))
+ "atom-fmul-4c")
+
+;; pmaddwd, psadbw with 128-bit (TI) mode
+(define_insn_reservation "atom_sseiadd_3" 5
+ (and (eq_attr "cpu" "atom")
+ (and (eq_attr "type" "sseiadd")
+ (and (not (match_operand:V2DI 0 "register_operand"))
+ (and (eq_attr "atom_unit" "simul" )
+ (eq_attr "mode" "TI")))))
+ "atom-fmul-5c")
+
+;; if paddq(64 bit op), phadd/phsub
+(define_insn_reservation "atom_sseiadd_4" 6
+ (and (eq_attr "cpu" "atom")
+ (and (eq_attr "type" "sseiadd")
+ (ior (match_operand:V2DI 0 "register_operand")
+ (eq_attr "atom_unit" "complex"))))
+ "atom-complex, atom-all-eu*5")
+
+;; if immediate op.
+(define_insn_reservation "atom_sseishft" 1
+ (and (eq_attr "cpu" "atom")
+ (and (eq_attr "type" "sseishft")
+ (and (eq_attr "atom_unit" "!sishuf")
+ (match_operand 2 "immediate_operand"))))
+ "atom-simple-either")
+
+;; if palignr or psrldq
+(define_insn_reservation "atom_sseishft_2" 1
+ (and (eq_attr "cpu" "atom")
+ (and (eq_attr "type" "sseishft")
+ (and (eq_attr "atom_unit" "sishuf")
+ (match_operand 2 "immediate_operand"))))
+ "atom-simple-0")
+
+;; if reg/mem op
+(define_insn_reservation "atom_sseishft_3" 2
+ (and (eq_attr "cpu" "atom")
+ (and (eq_attr "type" "sseishft")
+ (not (match_operand 2 "immediate_operand"))))
+ "atom-complex, atom-all-eu")
+
+(define_insn_reservation "atom_sseimul" 1
+ (and (eq_attr "cpu" "atom")
+ (eq_attr "type" "sseimul"))
+ "atom-simple-0")
+
+;; rcpss or rsqrtss
+(define_insn_reservation "atom_sse" 4
+ (and (eq_attr "cpu" "atom")
+ (and (eq_attr "type" "sse")
+ (and (eq_attr "atom_sse_attr" "rcp") (eq_attr "mode" "SF"))))
+ "atom-fmul-4c")
+
+;; movshdup, movsldup. Suggest to type sseishft
+(define_insn_reservation "atom_sse_2" 1
+ (and (eq_attr "cpu" "atom")
+ (and (eq_attr "type" "sse")
+ (eq_attr "atom_sse_attr" "movdup")))
+ "atom-simple-0")
+
+;; lfence
+(define_insn_reservation "atom_sse_3" 1
+ (and (eq_attr "cpu" "atom")
+ (and (eq_attr "type" "sse")
+ (eq_attr "atom_sse_attr" "lfence")))
+ "atom-simple-either")
+
+;; sfence,clflush,mfence, prefetch
+(define_insn_reservation "atom_sse_4" 1
+ (and (eq_attr "cpu" "atom")
+ (and (eq_attr "type" "sse")
+ (ior (eq_attr "atom_sse_attr" "fence")
+ (eq_attr "atom_sse_attr" "prefetch"))))
+ "atom-simple-0")
+
+;; rcpps, rsqrtps, sqrt, ldmxcsr/stmxcsr
+(define_insn_reservation "atom_sse_5" 7
+ (and (eq_attr "cpu" "atom")
+ (and (eq_attr "type" "sse")
+ (ior (ior (eq_attr "atom_sse_attr" "sqrt")
+ (eq_attr "atom_sse_attr" "mxcsr"))
+ (and (eq_attr "atom_sse_attr" "rcp")
+ (eq_attr "mode" "V4SF")))))
+ "atom-complex, atom-all-eu*6")
+
+;; xmm->xmm
+(define_insn_reservation "atom_ssemov" 1
+ (and (eq_attr "cpu" "atom")
+ (and (eq_attr "type" "ssemov")
+ (and (match_operand 0 "register_operand" "xy") (match_operand 1 "register_operand" "xy"))))
+ "atom-simple-either")
+
+;; reg->xmm
+(define_insn_reservation "atom_ssemov_2" 1
+ (and (eq_attr "cpu" "atom")
+ (and (eq_attr "type" "ssemov")
+ (and (match_operand 0 "register_operand" "xy") (match_operand 1 "register_operand" "r"))))
+ "atom-simple-0")
+
+;; xmm->reg
+(define_insn_reservation "atom_ssemov_3" 3
+ (and (eq_attr "cpu" "atom")
+ (and (eq_attr "type" "ssemov")
+ (and (match_operand 0 "register_operand" "r") (match_operand 1 "register_operand" "xy"))))
+ "atom-eu-0-3-1")
+
+;; mov mem
+(define_insn_reservation "atom_ssemov_4" 1
+ (and (eq_attr "cpu" "atom")
+ (and (eq_attr "type" "ssemov")
+ (and (eq_attr "movu" "0") (eq_attr "memory" "!none"))))
+ "atom-simple-0")
+
+;; movu (any form) or mem
+(define_insn_reservation "atom_ssemov_5" 2
+ (and (eq_attr "cpu" "atom")
+ (and (eq_attr "type" "ssemov")
+ (ior (eq_attr "movu" "1") (eq_attr "memory" "!none"))))
+ "atom-complex, atom-all-eu")
+
+;; no memory simple
+(define_insn_reservation "atom_sseadd" 5
+ (and (eq_attr "cpu" "atom")
+ (and (eq_attr "type" "sseadd")
+ (and (eq_attr "memory" "none")
+ (and (eq_attr "mode" "!V2DF")
+ (eq_attr "atom_unit" "!complex")))))
+ "atom-fadd-5c")
+
+;; memory simple
+(define_insn_reservation "atom_sseadd_mem" 5
+ (and (eq_attr "cpu" "atom")
+ (and (eq_attr "type" "sseadd")
+ (and (eq_attr "memory" "!none")
+ (and (eq_attr "mode" "!V2DF")
+ (eq_attr "atom_unit" "!complex")))))
+ "atom-dual-5c")
+
+;; maxps, minps, *pd, hadd, hsub
+(define_insn_reservation "atom_sseadd_3" 8
+ (and (eq_attr "cpu" "atom")
+ (and (eq_attr "type" "sseadd")
+ (ior (eq_attr "mode" "V2DF") (eq_attr "atom_unit" "complex"))))
+ "atom-complex, atom-all-eu*7")
+
+;; Except dppd/dpps
+(define_insn_reservation "atom_ssemul" 5
+ (and (eq_attr "cpu" "atom")
+ (and (eq_attr "type" "ssemul")
+ (eq_attr "mode" "!SF")))
+ "atom-fmul-5c")
+
+;; Except dppd/dpps, 4 cycle if mulss
+(define_insn_reservation "atom_ssemul_2" 4
+ (and (eq_attr "cpu" "atom")
+ (and (eq_attr "type" "ssemul")
+ (eq_attr "mode" "SF")))
+ "atom-fmul-4c")
+
+(define_insn_reservation "atom_ssecmp" 1
+ (and (eq_attr "cpu" "atom")
+ (eq_attr "type" "ssecmp"))
+ "atom-simple-either")
+
+(define_insn_reservation "atom_ssecomi" 10
+ (and (eq_attr "cpu" "atom")
+ (eq_attr "type" "ssecomi"))
+ "atom-complex, atom-all-eu*9")
+
+;; no memory and cvtpi2ps, cvtps2pi, cvttps2pi
+(define_insn_reservation "atom_ssecvt" 5
+ (and (eq_attr "cpu" "atom")
+ (and (eq_attr "type" "ssecvt")
+ (ior (and (match_operand:V2SI 0 "register_operand")
+ (match_operand:V4SF 1 "register_operand"))
+ (and (match_operand:V4SF 0 "register_operand")
+ (match_operand:V2SI 1 "register_operand")))))
+ "atom-fadd-5c")
+
+;; memory and cvtpi2ps, cvtps2pi, cvttps2pi
+(define_insn_reservation "atom_ssecvt_2" 5
+ (and (eq_attr "cpu" "atom")
+ (and (eq_attr "type" "ssecvt")
+ (ior (and (match_operand:V2SI 0 "register_operand")
+ (match_operand:V4SF 1 "memory_operand"))
+ (and (match_operand:V4SF 0 "register_operand")
+ (match_operand:V2SI 1 "memory_operand")))))
+ "atom-dual-5c")
+
+;; otherwise. 7 cycles average for cvtss2sd
+(define_insn_reservation "atom_ssecvt_3" 7
+ (and (eq_attr "cpu" "atom")
+ (and (eq_attr "type" "ssecvt")
+ (not (ior (and (match_operand:V2SI 0 "register_operand")
+ (match_operand:V4SF 1 "nonimmediate_operand"))
+ (and (match_operand:V4SF 0 "register_operand")
+ (match_operand:V2SI 1 "nonimmediate_operand"))))))
+ "atom-complex, atom-all-eu*6")
+
+;; memory and cvtsi2sd
+(define_insn_reservation "atom_sseicvt" 5
+ (and (eq_attr "cpu" "atom")
+ (and (eq_attr "type" "sseicvt")
+ (and (match_operand:V2DF 0 "register_operand")
+ (match_operand:SI 1 "memory_operand"))))
+ "atom-dual-5c")
+
+;; otherwise. 8 cycles average for cvtsd2si
+(define_insn_reservation "atom_sseicvt_2" 8
+ (and (eq_attr "cpu" "atom")
+ (and (eq_attr "type" "sseicvt")
+ (not (and (match_operand:V2DF 0 "register_operand")
+ (match_operand:SI 1 "memory_operand")))))
+ "atom-complex, atom-all-eu*7")
+
+(define_insn_reservation "atom_ssediv" 62
+ (and (eq_attr "cpu" "atom")
+ (eq_attr "type" "ssediv"))
+ "atom-complex, atom-all-eu*12, nothing*49")
+
+;; simple for fmov
+(define_insn_reservation "atom_fmov" 1
+ (and (eq_attr "cpu" "atom")
+ (and (eq_attr "type" "fmov")
+ (eq_attr "memory" "none")))
+ "atom-simple-either")
+
+;; simple for fmov
+(define_insn_reservation "atom_fmov_mem" 1
+ (and (eq_attr "cpu" "atom")
+ (and (eq_attr "type" "fmov")
+ (eq_attr "memory" "!none")))
+ "atom-simple-either")
+
+;; Define bypass here
+
+;; There will be no stall from lea to non-mem EX insns
+(define_bypass 0 "atom_lea"
+ "atom_alu_carry,
+ atom_alu,atom_alu1,atom_negnot,atom_imov,atom_imovx,
+ atom_incdec, atom_setcc, atom_icmov, atom_pop")
+
+(define_bypass 0 "atom_lea"
+ "atom_alu_mem, atom_alu_carry_mem, atom_alu1_mem,
+ atom_imovx_mem, atom_imovx_2_mem,
+ atom_imov_mem, atom_icmov_mem, atom_fmov_mem"
+ "!ix86_agi_dependent")
+
+;; There will be 3 cycles stall from EX insns to AGAN insns LEA
+(define_bypass 4 "atom_alu_carry,
+ atom_alu,atom_alu1,atom_negnot,atom_imov,atom_imovx,
+ atom_incdec,atom_ishift,atom_ishift1,atom_rotate,
+ atom_rotate1, atom_setcc, atom_icmov, atom_pop,
+ atom_alu_mem, atom_alu_carry_mem, atom_alu1_mem,
+ atom_imovx_mem, atom_imovx_2_mem,
+ atom_imov_mem, atom_icmov_mem, atom_fmov_mem"
+ "atom_lea")
+
+;; There will be 3 cycles stall from EX insns to insns that need addr calculation
+(define_bypass 4 "atom_alu_carry,
+ atom_alu,atom_alu1,atom_negnot,atom_imov,atom_imovx,
+ atom_incdec,atom_ishift,atom_ishift1,atom_rotate,
+ atom_rotate1, atom_setcc, atom_icmov, atom_pop,
+ atom_imovx_mem, atom_imovx_2_mem,
+ atom_alu_mem, atom_alu_carry_mem, atom_alu1_mem,
+ atom_imov_mem, atom_icmov_mem, atom_fmov_mem"
+ "atom_alu_mem, atom_alu_carry_mem, atom_alu1_mem,
+ atom_negnot_mem, atom_imov_mem, atom_incdec_mem,
+ atom_imovx_mem, atom_imovx_2_mem,
+ atom_imul_mem, atom_icmp_mem,
+ atom_test_mem, atom_icmov_mem, atom_sselog_mem,
+ atom_sselog1_mem, atom_fmov_mem, atom_sseadd_mem,
+ atom_ishift_mem, atom_ishift1_mem,
+ atom_rotate_mem, atom_rotate1_mem"
+ "ix86_agi_dependent")
+
+;; Stall from imul to lea is 8 cycles.
+(define_bypass 9 "atom_imul, atom_imul_mem" "atom_lea")
+
+;; Stall from imul to memory address is 8 cycles.
+(define_bypass 9 "atom_imul, atom_imul_mem"
+ "atom_alu_mem, atom_alu_carry_mem, atom_alu1_mem,
+ atom_negnot_mem, atom_imov_mem, atom_incdec_mem,
+ atom_ishift_mem, atom_ishift1_mem, atom_rotate_mem,
+ atom_rotate1_mem, atom_imul_mem, atom_icmp_mem,
+ atom_test_mem, atom_icmov_mem, atom_sselog_mem,
+ atom_sselog1_mem, atom_fmov_mem, atom_sseadd_mem"
+ "ix86_agi_dependent")
+
+;; There will be 0 cycle stall from cmp/test to jcc
+
+;; There will be 1 cycle stall from flag producer to cmov and adc/sbb
+(define_bypass 2 "atom_icmp, atom_test, atom_alu, atom_alu_carry,
+ atom_alu1, atom_negnot, atom_incdec, atom_ishift,
+ atom_ishift1, atom_rotate, atom_rotate1"
+ "atom_icmov, atom_alu_carry")
+
+;; lea to shift count stall is 2 cycles
+(define_bypass 3 "atom_lea"
+ "atom_ishift, atom_ishift1, atom_rotate, atom_rotate1,
+ atom_ishift_mem, atom_ishift1_mem,
+ atom_rotate_mem, atom_rotate1_mem"
+ "ix86_dep_by_shift_count")
+
+;; lea to shift source stall is 1 cycle
+(define_bypass 2 "atom_lea"
+ "atom_ishift, atom_ishift1, atom_rotate, atom_rotate1"
+ "!ix86_dep_by_shift_count")
+
+;; non-lea to shift count stall is 1 cycle
+(define_bypass 2 "atom_alu_carry,
+ atom_alu,atom_alu1,atom_negnot,atom_imov,atom_imovx,
+ atom_incdec,atom_ishift,atom_ishift1,atom_rotate,
+ atom_rotate1, atom_setcc, atom_icmov, atom_pop,
+ atom_alu_mem, atom_alu_carry_mem, atom_alu1_mem,
+ atom_imovx_mem, atom_imovx_2_mem,
+ atom_imov_mem, atom_icmov_mem, atom_fmov_mem"
+ "atom_ishift, atom_ishift1, atom_rotate, atom_rotate1,
+ atom_ishift_mem, atom_ishift1_mem,
+ atom_rotate_mem, atom_rotate1_mem"
+ "ix86_dep_by_shift_count")
--- gcc/config/i386/sse.md (.../trunk) (revision 144460)
+++ gcc/config/i386/sse.md (.../branches/ix86/atom) (revision 144601)
@@ -338,6 +338,7 @@ (define_insn "avx_movup<avxmodesuffixf2c
&& !(MEM_P (operands[0]) && MEM_P (operands[1]))"
"vmovup<avxmodesuffixf2c>\t{%1, %0|%0, %1}"
[(set_attr "type" "ssemov")
+ (set_attr "movu" "1")
(set_attr "prefix" "vex")
(set_attr "mode" "<MODE>")])
@@ -363,6 +364,7 @@ (define_insn "<sse>_movup<ssemodesuffixf
&& !(MEM_P (operands[0]) && MEM_P (operands[1]))"
"movup<ssemodesuffixf2c>\t{%1, %0|%0, %1}"
[(set_attr "type" "ssemov")
+ (set_attr "movu" "1")
(set_attr "mode" "<MODE>")])
(define_insn "avx_movdqu<avxmodesuffix>"
@@ -373,6 +375,7 @@ (define_insn "avx_movdqu<avxmodesuffix>"
"TARGET_AVX && !(MEM_P (operands[0]) && MEM_P (operands[1]))"
"vmovdqu\t{%1, %0|%0, %1}"
[(set_attr "type" "ssemov")
+ (set_attr "movu" "1")
(set_attr "prefix" "vex")
(set_attr "mode" "<avxvecmode>")])
@@ -383,6 +386,7 @@ (define_insn "sse2_movdqu"
"TARGET_SSE2 && !(MEM_P (operands[0]) && MEM_P (operands[1]))"
"movdqu\t{%1, %0|%0, %1}"
[(set_attr "type" "ssemov")
+ (set_attr "movu" "1")
(set_attr "prefix_data16" "1")
(set_attr "mode" "TI")])
@@ -424,7 +428,7 @@ (define_insn "sse2_movntv2di"
UNSPEC_MOVNT))]
"TARGET_SSE2"
"movntdq\t{%1, %0|%0, %1}"
- [(set_attr "type" "ssecvt")
+ [(set_attr "type" "ssemov")
(set_attr "prefix_data16" "1")
(set_attr "mode" "TI")])
@@ -434,7 +438,7 @@ (define_insn "sse2_movntsi"
UNSPEC_MOVNT))]
"TARGET_SSE2"
"movnti\t{%1, %0|%0, %1}"
- [(set_attr "type" "ssecvt")
+ [(set_attr "type" "ssemov")
(set_attr "mode" "V2DF")])
(define_insn "avx_lddqu<avxmodesuffix>"
@@ -445,6 +449,7 @@ (define_insn "avx_lddqu<avxmodesuffix>"
"TARGET_AVX"
"vlddqu\t{%1, %0|%0, %1}"
[(set_attr "type" "ssecvt")
+ (set_attr "movu" "1")
(set_attr "prefix" "vex")
(set_attr "mode" "<avxvecmode>")])
@@ -454,7 +459,8 @@ (define_insn "sse3_lddqu"
UNSPEC_LDDQU))]
"TARGET_SSE3"
"lddqu\t{%1, %0|%0, %1}"
- [(set_attr "type" "ssecvt")
+ [(set_attr "type" "ssemov")
+ (set_attr "movu" "1")
(set_attr "prefix_rep" "1")
(set_attr "mode" "TI")])
@@ -761,6 +767,7 @@ (define_insn "sse_rcpv4sf2"
"TARGET_SSE"
"%vrcpps\t{%1, %0|%0, %1}"
[(set_attr "type" "sse")
+ (set_attr "atom_sse_attr" "rcp")
(set_attr "prefix" "maybe_vex")
(set_attr "mode" "V4SF")])
@@ -787,6 +794,7 @@ (define_insn "sse_vmrcpv4sf2"
"TARGET_SSE"
"rcpss\t{%1, %0|%0, %1}"
[(set_attr "type" "sse")
+ (set_attr "atom_sse_attr" "rcp")
(set_attr "mode" "SF")])
(define_expand "sqrtv8sf2"
@@ -832,6 +840,7 @@ (define_insn "sse_sqrtv4sf2"
"TARGET_SSE"
"%vsqrtps\t{%1, %0|%0, %1}"
[(set_attr "type" "sse")
+ (set_attr "atom_sse_attr" "sqrt")
(set_attr "prefix" "maybe_vex")
(set_attr "mode" "V4SF")])
@@ -876,6 +885,7 @@ (define_insn "<sse>_vmsqrt<mode>2"
"SSE_VEC_FLOAT_MODE_P (<MODE>mode)"
"sqrts<ssemodesuffixf2c>\t{%1, %0|%0, %1}"
[(set_attr "type" "sse")
+ (set_attr "atom_sse_attr" "sqrt")
(set_attr "mode" "<ssescalarmode>")])
(define_expand "rsqrtv8sf2"
@@ -1039,7 +1049,7 @@ (define_insn "<sse>_vm<code><mode>3"
(const_int 1)))]
"SSE_VEC_FLOAT_MODE_P (<MODE>mode)"
"<maxminfprefix>s<ssemodesuffixf2c>\t{%2, %0|%0, %2}"
- [(set_attr "type" "sse")
+ [(set_attr "type" "sseadd")
(set_attr "mode" "<ssescalarmode>")])
;; These versions of the min/max patterns implement exactly the operations
@@ -1175,6 +1185,7 @@ (define_insn "sse3_addsubv2df3"
"TARGET_SSE3"
"addsubpd\t{%2, %0|%0, %2}"
[(set_attr "type" "sseadd")
+ (set_attr "atom_unit" "complex")
(set_attr "mode" "V2DF")])
(define_insn "avx_h<plusminus_insn>v4df3"
@@ -1298,6 +1309,7 @@ (define_insn "sse3_h<plusminus_insn>v4sf
"TARGET_SSE3"
"h<plusminus_mnemonic>ps\t{%2, %0|%0, %2}"
[(set_attr "type" "sseadd")
+ (set_attr "atom_unit" "complex")
(set_attr "prefix_rep" "1")
(set_attr "mode" "V4SF")])
@@ -5066,6 +5078,7 @@ (define_insn "*sse2_pmaddwd"
"TARGET_SSE2 && ix86_binary_operator_ok (MULT, V8HImode, operands)"
"pmaddwd\t{%2, %0|%0, %2}"
[(set_attr "type" "sseiadd")
+ (set_attr "atom_unit" "simul")
(set_attr "prefix_data16" "1")
(set_attr "mode" "TI")])
@@ -7025,6 +7038,7 @@ (define_insn "*vec_extractv2di_1_rex64"
movq\t{%H1, %0|%0, %H1}
mov{q}\t{%H1, %0|%0, %H1}"
[(set_attr "type" "ssemov,sseishft,ssemov,imov")
+ (set_attr "atom_unit" "*,sishuf,*,*")
(set_attr "memory" "*,none,*,*")
(set_attr "mode" "V2SF,TI,TI,DI")])
@@ -7057,6 +7071,7 @@ (define_insn "*vec_extractv2di_1_sse2"
psrldq\t{$8, %0|%0, 8}
movq\t{%H1, %0|%0, %H1}"
[(set_attr "type" "ssemov,sseishft,ssemov")
+ (set_attr "atom_unit" "*,sishuf,*")
(set_attr "memory" "*,none,*")
(set_attr "mode" "V2SF,TI,TI")])
@@ -7614,6 +7629,7 @@ (define_insn "sse2_psadbw"
"TARGET_SSE2"
"psadbw\t{%2, %0|%0, %2}"
[(set_attr "type" "sseiadd")
+ (set_attr "atom_unit" "simul")
(set_attr "prefix_data16" "1")
(set_attr "mode" "TI")])
@@ -7635,7 +7651,7 @@ (define_insn "<sse>_movmskp<ssemodesuffi
UNSPEC_MOVMSK))]
"SSE_VEC_FLOAT_MODE_P (<MODE>mode)"
"%vmovmskp<ssemodesuffixf2c>\t{%1, %0|%0, %1}"
- [(set_attr "type" "ssecvt")
+ [(set_attr "type" "ssemov")
(set_attr "prefix" "maybe_vex")
(set_attr "mode" "<MODE>")])
@@ -7645,7 +7661,7 @@ (define_insn "sse2_pmovmskb"
UNSPEC_MOVMSK))]
"TARGET_SSE2"
"%vpmovmskb\t{%1, %0|%0, %1}"
- [(set_attr "type" "ssecvt")
+ [(set_attr "type" "ssemov")
(set_attr "prefix_data16" "1")
(set_attr "prefix" "maybe_vex")
(set_attr "mode" "SI")])
@@ -7668,7 +7684,7 @@ (define_insn "*sse2_maskmovdqu"
"TARGET_SSE2 && !TARGET_64BIT"
;; @@@ check ordering of operands in intel/nonintel syntax
"%vmaskmovdqu\t{%2, %1|%1, %2}"
- [(set_attr "type" "ssecvt")
+ [(set_attr "type" "ssemov")
(set_attr "prefix_data16" "1")
(set_attr "prefix" "maybe_vex")
(set_attr "mode" "TI")])
@@ -7682,7 +7698,7 @@ (define_insn "*sse2_maskmovdqu_rex64"
"TARGET_SSE2 && TARGET_64BIT"
;; @@@ check ordering of operands in intel/nonintel syntax
"%vmaskmovdqu\t{%2, %1|%1, %2}"
- [(set_attr "type" "ssecvt")
+ [(set_attr "type" "ssemov")
(set_attr "prefix_data16" "1")
(set_attr "prefix" "maybe_vex")
(set_attr "mode" "TI")])
@@ -7693,6 +7709,7 @@ (define_insn "sse_ldmxcsr"
"TARGET_SSE"
"%vldmxcsr\t%0"
[(set_attr "type" "sse")
+ (set_attr "atom_sse_attr" "mxcsr")
(set_attr "prefix" "maybe_vex")
(set_attr "memory" "load")])
@@ -7702,6 +7719,7 @@ (define_insn "sse_stmxcsr"
"TARGET_SSE"
"%vstmxcsr\t%0"
[(set_attr "type" "sse")
+ (set_attr "atom_sse_attr" "mxcsr")
(set_attr "prefix" "maybe_vex")
(set_attr "memory" "store")])
@@ -7720,6 +7738,7 @@ (define_insn "*sse_sfence"
"TARGET_SSE || TARGET_3DNOW_A"
"sfence"
[(set_attr "type" "sse")
+ (set_attr "atom_sse_attr" "fence")
(set_attr "memory" "unknown")])
(define_insn "sse2_clflush"
@@ -7728,6 +7747,7 @@ (define_insn "sse2_clflush"
"TARGET_SSE2"
"clflush\t%a0"
[(set_attr "type" "sse")
+ (set_attr "atom_sse_attr" "fence")
(set_attr "memory" "unknown")])
(define_expand "sse2_mfence"
@@ -7745,6 +7765,7 @@ (define_insn "*sse2_mfence"
"TARGET_64BIT || TARGET_SSE2"
"mfence"
[(set_attr "type" "sse")
+ (set_attr "atom_sse_attr" "fence")
(set_attr "memory" "unknown")])
(define_expand "sse2_lfence"
@@ -7762,6 +7783,7 @@ (define_insn "*sse2_lfence"
"TARGET_SSE2"
"lfence"
[(set_attr "type" "sse")
+ (set_attr "atom_sse_attr" "lfence")
(set_attr "memory" "unknown")])
(define_insn "sse3_mwait"
@@ -7885,6 +7907,7 @@ (define_insn "ssse3_phaddwv8hi3"
"TARGET_SSSE3"
"phaddw\t{%2, %0|%0, %2}"
[(set_attr "type" "sseiadd")
+ (set_attr "atom_unit" "complex")
(set_attr "prefix_data16" "1")
(set_attr "prefix_extra" "1")
(set_attr "mode" "TI")])
@@ -7913,6 +7936,7 @@ (define_insn "ssse3_phaddwv4hi3"
"TARGET_SSSE3"
"phaddw\t{%2, %0|%0, %2}"
[(set_attr "type" "sseiadd")
+ (set_attr "atom_unit" "complex")
(set_attr "prefix_extra" "1")
(set_attr "mode" "DI")])
@@ -7967,6 +7991,7 @@ (define_insn "ssse3_phadddv4si3"
"TARGET_SSSE3"
"phaddd\t{%2, %0|%0, %2}"
[(set_attr "type" "sseiadd")
+ (set_attr "atom_unit" "complex")
(set_attr "prefix_data16" "1")
(set_attr "prefix_extra" "1")
(set_attr "mode" "TI")])
@@ -7987,6 +8012,7 @@ (define_insn "ssse3_phadddv2si3"
"TARGET_SSSE3"
"phaddd\t{%2, %0|%0, %2}"
[(set_attr "type" "sseiadd")
+ (set_attr "atom_unit" "complex")
(set_attr "prefix_extra" "1")
(set_attr "mode" "DI")])
@@ -8073,6 +8099,7 @@ (define_insn "ssse3_phaddswv8hi3"
"TARGET_SSSE3"
"phaddsw\t{%2, %0|%0, %2}"
[(set_attr "type" "sseiadd")
+ (set_attr "atom_unit" "complex")
(set_attr "prefix_data16" "1")
(set_attr "prefix_extra" "1")
(set_attr "mode" "TI")])
@@ -8101,6 +8128,7 @@ (define_insn "ssse3_phaddswv4hi3"
"TARGET_SSSE3"
"phaddsw\t{%2, %0|%0, %2}"
[(set_attr "type" "sseiadd")
+ (set_attr "atom_unit" "complex")
(set_attr "prefix_extra" "1")
(set_attr "mode" "DI")])
@@ -8187,6 +8215,7 @@ (define_insn "ssse3_phsubwv8hi3"
"TARGET_SSSE3"
"phsubw\t{%2, %0|%0, %2}"
[(set_attr "type" "sseiadd")
+ (set_attr "atom_unit" "complex")
(set_attr "prefix_data16" "1")
(set_attr "prefix_extra" "1")
(set_attr "mode" "TI")])
@@ -8215,6 +8244,7 @@ (define_insn "ssse3_phsubwv4hi3"
"TARGET_SSSE3"
"phsubw\t{%2, %0|%0, %2}"
[(set_attr "type" "sseiadd")
+ (set_attr "atom_unit" "complex")
(set_attr "prefix_extra" "1")
(set_attr "mode" "DI")])
@@ -8269,6 +8299,7 @@ (define_insn "ssse3_phsubdv4si3"
"TARGET_SSSE3"
"phsubd\t{%2, %0|%0, %2}"
[(set_attr "type" "sseiadd")
+ (set_attr "atom_unit" "complex")
(set_attr "prefix_data16" "1")
(set_attr "prefix_extra" "1")
(set_attr "mode" "TI")])
@@ -8289,6 +8320,7 @@ (define_insn "ssse3_phsubdv2si3"
"TARGET_SSSE3"
"phsubd\t{%2, %0|%0, %2}"
[(set_attr "type" "sseiadd")
+ (set_attr "atom_unit" "complex")
(set_attr "prefix_extra" "1")
(set_attr "mode" "DI")])
@@ -8375,6 +8407,7 @@ (define_insn "ssse3_phsubswv8hi3"
"TARGET_SSSE3"
"phsubsw\t{%2, %0|%0, %2}"
[(set_attr "type" "sseiadd")
+ (set_attr "atom_unit" "complex")
(set_attr "prefix_data16" "1")
(set_attr "prefix_extra" "1")
(set_attr "mode" "TI")])
@@ -8403,6 +8436,7 @@ (define_insn "ssse3_phsubswv4hi3"
"TARGET_SSSE3"
"phsubsw\t{%2, %0|%0, %2}"
[(set_attr "type" "sseiadd")
+ (set_attr "atom_unit" "complex")
(set_attr "prefix_extra" "1")
(set_attr "mode" "DI")])
@@ -8509,6 +8543,7 @@ (define_insn "ssse3_pmaddubsw128"
"TARGET_SSSE3"
"pmaddubsw\t{%2, %0|%0, %2}"
[(set_attr "type" "sseiadd")
+ (set_attr "atom_unit" "simul")
(set_attr "prefix_data16" "1")
(set_attr "prefix_extra" "1")
(set_attr "mode" "TI")])
@@ -8547,6 +8582,7 @@ (define_insn "ssse3_pmaddubsw"
"TARGET_SSSE3"
"pmaddubsw\t{%2, %0|%0, %2}"
[(set_attr "type" "sseiadd")
+ (set_attr "atom_unit" "simul")
(set_attr "prefix_extra" "1")
(set_attr "mode" "DI")])
@@ -8754,6 +8790,7 @@ (define_insn "ssse3_palignrti"
return "palignr\t{%3, %2, %0|%0, %2, %3}";
}
[(set_attr "type" "sseishft")
+ (set_attr "atom_unit" "sishuf")
(set_attr "prefix_data16" "1")
(set_attr "prefix_extra" "1")
(set_attr "mode" "TI")])
@@ -8770,6 +8807,7 @@ (define_insn "ssse3_palignrdi"
return "palignr\t{%3, %2, %0|%0, %2, %3}";
}
[(set_attr "type" "sseishft")
+ (set_attr "atom_unit" "sishuf")
(set_attr "prefix_extra" "1")
(set_attr "mode" "DI")])
@@ -8956,7 +8994,7 @@ (define_insn "sse4_1_movntdqa"
UNSPEC_MOVNTDQA))]
"TARGET_SSE4_1"
"%vmovntdqa\t{%1, %0|%0, %1}"
- [(set_attr "type" "ssecvt")
+ [(set_attr "type" "ssemov")
(set_attr "prefix_extra" "1")
(set_attr "prefix" "maybe_vex")
(set_attr "mode" "TI")])
--- gcc/config/i386/i386-c.c (.../trunk) (revision 144460)
+++ gcc/config/i386/i386-c.c (.../branches/ix86/atom) (revision 144601)
@@ -119,6 +119,10 @@ ix86_target_macros_internal (int isa_fla
def_or_undef (parse_in, "__core2");
def_or_undef (parse_in, "__core2__");
break;
+ case PROCESSOR_ATOM:
+ def_or_undef (parse_in, "__atom");
+ def_or_undef (parse_in, "__atom__");
+ break;
/* use PROCESSOR_max to not set/unset the arch macro. */
case PROCESSOR_max:
break;
@@ -187,6 +191,9 @@ ix86_target_macros_internal (int isa_fla
case PROCESSOR_CORE2:
def_or_undef (parse_in, "__tune_core2__");
break;
+ case PROCESSOR_ATOM:
+ def_or_undef (parse_in, "__tune_atom__");
+ break;
case PROCESSOR_GENERIC32:
case PROCESSOR_GENERIC64:
break;
--- gcc/config/i386/i386-protos.h (.../trunk) (revision 144460)
+++ gcc/config/i386/i386-protos.h (.../branches/ix86/atom) (revision 144601)
@@ -85,6 +85,9 @@ extern void ix86_fixup_binary_operands_n
extern void ix86_expand_binary_operator (enum rtx_code,
enum machine_mode, rtx[]);
extern int ix86_binary_operator_ok (enum rtx_code, enum machine_mode, rtx[]);
+extern bool ix86_lea_for_add_ok (enum rtx_code, rtx, rtx[]);
+extern bool ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn);
+extern bool ix86_agi_dependent (rtx set_insn, rtx use_insn);
extern void ix86_expand_unary_operator (enum rtx_code, enum machine_mode,
rtx[]);
extern rtx ix86_build_const_vector (enum machine_mode, bool, rtx);
--- gcc/config/i386/i386.c (.../trunk) (revision 144460)
+++ gcc/config/i386/i386.c (.../branches/ix86/atom) (revision 144601)
@@ -1036,6 +1036,79 @@ struct processor_costs core2_cost = {
1, /* cond_not_taken_branch_cost. */
};
+static const
+struct processor_costs atom_cost = { /* Costs used when tuning for Atom. */
+ COSTS_N_INSNS (1), /* cost of an add instruction */
+ COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
+ COSTS_N_INSNS (1), /* variable shift costs */
+ COSTS_N_INSNS (1), /* constant shift costs */
+ {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
+ COSTS_N_INSNS (4), /* HI */
+ COSTS_N_INSNS (3), /* SI */
+ COSTS_N_INSNS (4), /* DI */
+ COSTS_N_INSNS (2)}, /* other */
+ 0, /* cost of multiply per each bit set */
+ {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
+ COSTS_N_INSNS (26), /* HI */
+ COSTS_N_INSNS (42), /* SI */
+ COSTS_N_INSNS (74), /* DI */
+ COSTS_N_INSNS (74)}, /* other */
+ COSTS_N_INSNS (1), /* cost of movsx */
+ COSTS_N_INSNS (1), /* cost of movzx */
+ 8, /* "large" insn */
+ 17, /* MOVE_RATIO */
+ 2, /* cost for loading QImode using movzbl */
+ {4, 4, 4}, /* cost of loading integer registers
+ in QImode, HImode and SImode.
+ Relative to reg-reg move (2). */
+ {4, 4, 4}, /* cost of storing integer registers */
+ 4, /* cost of reg,reg fld/fst */
+ {12, 12, 12}, /* cost of loading fp registers
+ in SFmode, DFmode and XFmode */
+ {6, 6, 8}, /* cost of storing fp registers
+ in SFmode, DFmode and XFmode */
+ 2, /* cost of moving MMX register */
+ {8, 8}, /* cost of loading MMX registers
+ in SImode and DImode */
+ {8, 8}, /* cost of storing MMX registers
+ in SImode and DImode */
+ 2, /* cost of moving SSE register */
+ {8, 8, 8}, /* cost of loading SSE registers
+ in SImode, DImode and TImode */
+ {8, 8, 8}, /* cost of storing SSE registers
+ in SImode, DImode and TImode */
+ 5, /* MMX or SSE register to integer */
+ 32, /* size of l1 cache. */
+ 256, /* size of l2 cache. */
+ 64, /* size of prefetch block */
+ 6, /* number of parallel prefetches */
+ 3, /* Branch cost */
+ COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
+ COSTS_N_INSNS (8), /* cost of FMUL instruction. */
+ COSTS_N_INSNS (20), /* cost of FDIV instruction. */
+ COSTS_N_INSNS (8), /* cost of FABS instruction. */
+ COSTS_N_INSNS (8), /* cost of FCHS instruction. */
+ COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
+ {{libcall, {{11, loop}, {-1, rep_prefix_4_byte}}}, /* NOTE(review): presumably the memcpy strategies -- confirm. */
+ {libcall, {{32, loop}, {64, rep_prefix_4_byte},
+ {8192, rep_prefix_8_byte}, {-1, libcall}}}},
+ {{libcall, {{8, loop}, {15, unrolled_loop}, /* NOTE(review): presumably the memset strategies -- confirm. */
+ {2048, rep_prefix_4_byte}, {-1, libcall}}},
+ {libcall, {{24, loop}, {32, unrolled_loop},
+ {8192, rep_prefix_8_byte}, {-1, libcall}}}},
+ 1, /* scalar_stmt_cost. */
+ 1, /* scalar_load_cost. */
+ 1, /* scalar_store_cost. */
+ 1, /* vec_stmt_cost. */
+ 1, /* vec_to_scalar_cost. */
+ 1, /* scalar_to_vec_cost. */
+ 1, /* vec_align_load_cost. */
+ 2, /* vec_unalign_load_cost. */
+ 1, /* vec_store_cost. */
+ 3, /* cond_taken_branch_cost. */
+ 1, /* cond_not_taken_branch_cost. */
+};
+
/* Generic64 should produce code tuned for Nocona and K8. */
static const
struct processor_costs generic64_cost = {
@@ -1194,6 +1267,7 @@ const struct processor_costs *ix86_cost
#define m_PENT4 (1<<PROCESSOR_PENTIUM4)
#define m_NOCONA (1<<PROCESSOR_NOCONA)
#define m_CORE2 (1<<PROCESSOR_CORE2)
+#define m_ATOM (1<<PROCESSOR_ATOM)
#define m_GEODE (1<<PROCESSOR_GEODE)
#define m_K6 (1<<PROCESSOR_K6)
@@ -1231,10 +1305,11 @@ static unsigned int initial_ix86_tune_fe
m_486 | m_PENT,
/* X86_TUNE_UNROLL_STRLEN */
- m_486 | m_PENT | m_PPRO | m_AMD_MULTIPLE | m_K6 | m_CORE2 | m_GENERIC,
+ m_486 | m_PENT | m_ATOM | m_PPRO | m_AMD_MULTIPLE | m_K6
+ | m_CORE2 | m_GENERIC,
/* X86_TUNE_DEEP_BRANCH_PREDICTION */
- m_PPRO | m_K6_GEODE | m_AMD_MULTIPLE | m_PENT4 | m_GENERIC,
+ m_ATOM | m_PPRO | m_K6_GEODE | m_AMD_MULTIPLE | m_PENT4 | m_GENERIC,
/* X86_TUNE_BRANCH_PREDICTION_HINTS: Branch hints were put in P4 based
on simulation result. But after P4 was made, no performance benefit
@@ -1246,12 +1321,12 @@ static unsigned int initial_ix86_tune_fe
~m_386,
/* X86_TUNE_USE_SAHF */
- m_PPRO | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_PENT4
+ m_ATOM | m_PPRO | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_PENT4
| m_NOCONA | m_CORE2 | m_GENERIC,
/* X86_TUNE_MOVX: Enable to zero extend integer registers to avoid
partial dependencies. */
- m_AMD_MULTIPLE | m_PPRO | m_PENT4 | m_NOCONA
+ m_AMD_MULTIPLE | m_ATOM | m_PPRO | m_PENT4 | m_NOCONA
| m_CORE2 | m_GENERIC | m_GEODE /* m_386 | m_K6 */,
/* X86_TUNE_PARTIAL_REG_STALL: We probably ought to watch for partial
@@ -1271,13 +1346,13 @@ static unsigned int initial_ix86_tune_fe
m_386 | m_486 | m_K6_GEODE,
/* X86_TUNE_USE_SIMODE_FIOP */
- ~(m_PPRO | m_AMD_MULTIPLE | m_PENT | m_CORE2 | m_GENERIC),
+ ~(m_PPRO | m_AMD_MULTIPLE | m_PENT | m_ATOM | m_CORE2 | m_GENERIC),
/* X86_TUNE_USE_MOV0 */
m_K6,
/* X86_TUNE_USE_CLTD */
- ~(m_PENT | m_K6 | m_CORE2 | m_GENERIC),
+ ~(m_PENT | m_ATOM | m_K6 | m_CORE2 | m_GENERIC),
/* X86_TUNE_USE_XCHGB: Use xchgb %rh,%rl instead of rolw/rorw $8,rx. */
m_PENT4,
@@ -1292,8 +1367,8 @@ static unsigned int initial_ix86_tune_fe
~(m_PENT | m_PPRO),
/* X86_TUNE_PROMOTE_QIMODE */
- m_K6_GEODE | m_PENT | m_386 | m_486 | m_AMD_MULTIPLE | m_CORE2
- | m_GENERIC /* | m_PENT4 ? */,
+ m_K6_GEODE | m_PENT | m_ATOM | m_386 | m_486 | m_AMD_MULTIPLE
+ | m_CORE2 | m_GENERIC /* | m_PENT4 ? */,
/* X86_TUNE_FAST_PREFIX */
~(m_PENT | m_486 | m_386),
@@ -1317,26 +1392,28 @@ static unsigned int initial_ix86_tune_fe
m_PPRO,
/* X86_TUNE_ADD_ESP_4: Enable if add/sub is preferred over 1/2 push/pop. */
- m_AMD_MULTIPLE | m_K6_GEODE | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC,
+ m_ATOM | m_AMD_MULTIPLE | m_K6_GEODE | m_PENT4 | m_NOCONA
+ | m_CORE2 | m_GENERIC,
/* X86_TUNE_ADD_ESP_8 */
- m_AMD_MULTIPLE | m_PPRO | m_K6_GEODE | m_386
+ m_AMD_MULTIPLE | m_ATOM | m_PPRO | m_K6_GEODE | m_386
| m_486 | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC,
/* X86_TUNE_SUB_ESP_4 */
- m_AMD_MULTIPLE | m_PPRO | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC,
+ m_AMD_MULTIPLE | m_ATOM | m_PPRO | m_PENT4 | m_NOCONA | m_CORE2
+ | m_GENERIC,
/* X86_TUNE_SUB_ESP_8 */
- m_AMD_MULTIPLE | m_PPRO | m_386 | m_486
+ m_AMD_MULTIPLE | m_ATOM | m_PPRO | m_386 | m_486
| m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC,
/* X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred
for DFmode copies */
- ~(m_AMD_MULTIPLE | m_PENT4 | m_NOCONA | m_PPRO | m_CORE2
+ ~(m_AMD_MULTIPLE | m_ATOM | m_PENT4 | m_NOCONA | m_PPRO | m_CORE2
| m_GENERIC | m_GEODE),
/* X86_TUNE_PARTIAL_REG_DEPENDENCY */
- m_AMD_MULTIPLE | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC,
+ m_AMD_MULTIPLE | m_ATOM | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC,
/* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: In the Generic model we have a
conflict here in between PPro/Pentium4 based chips that thread 128bit
@@ -1347,7 +1424,8 @@ static unsigned int initial_ix86_tune_fe
shows that disabling this option on P4 brings over 20% SPECfp regression,
while enabling it on K8 brings roughly 2.4% regression that can be partly
masked by careful scheduling of moves. */
- m_PENT4 | m_NOCONA | m_PPRO | m_CORE2 | m_GENERIC | m_AMDFAM10,
+ m_ATOM | m_PENT4 | m_NOCONA | m_PPRO | m_CORE2 | m_GENERIC
+ | m_AMDFAM10,
/* X86_TUNE_SSE_UNALIGNED_MOVE_OPTIMAL */
m_AMDFAM10,
@@ -1365,13 +1443,13 @@ static unsigned int initial_ix86_tune_fe
m_PPRO | m_PENT4 | m_NOCONA,
/* X86_TUNE_MEMORY_MISMATCH_STALL */
- m_AMD_MULTIPLE | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC,
+ m_AMD_MULTIPLE | m_ATOM | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC,
/* X86_TUNE_PROLOGUE_USING_MOVE */
- m_ATHLON_K8 | m_PPRO | m_CORE2 | m_GENERIC,
+ m_ATHLON_K8 | m_ATOM | m_PPRO | m_CORE2 | m_GENERIC,
/* X86_TUNE_EPILOGUE_USING_MOVE */
- m_ATHLON_K8 | m_PPRO | m_CORE2 | m_GENERIC,
+ m_ATHLON_K8 | m_ATOM | m_PPRO | m_CORE2 | m_GENERIC,
/* X86_TUNE_SHIFT1 */
~m_486,
@@ -1380,29 +1458,32 @@ static unsigned int initial_ix86_tune_fe
m_AMD_MULTIPLE,
/* X86_TUNE_INTER_UNIT_MOVES */
- ~(m_AMD_MULTIPLE | m_GENERIC),
+ ~(m_AMD_MULTIPLE | m_ATOM | m_GENERIC),
/* X86_TUNE_INTER_UNIT_CONVERSIONS */
~(m_AMDFAM10),
/* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
than 4 branch instructions in the 16 byte window. */
- m_PPRO | m_AMD_MULTIPLE | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC,
+ m_ATOM | m_PPRO | m_AMD_MULTIPLE | m_PENT4 | m_NOCONA | m_CORE2
+ | m_GENERIC,
/* X86_TUNE_SCHEDULE */
- m_PPRO | m_AMD_MULTIPLE | m_K6_GEODE | m_PENT | m_CORE2 | m_GENERIC,
+ m_PPRO | m_AMD_MULTIPLE | m_K6_GEODE | m_PENT | m_ATOM | m_CORE2
+ | m_GENERIC,
/* X86_TUNE_USE_BT */
- m_AMD_MULTIPLE | m_CORE2 | m_GENERIC,
+ m_AMD_MULTIPLE | m_ATOM | m_CORE2 | m_GENERIC,
/* X86_TUNE_USE_INCDEC */
- ~(m_PENT4 | m_NOCONA | m_GENERIC),
+ ~(m_PENT4 | m_NOCONA | m_GENERIC | m_ATOM),
/* X86_TUNE_PAD_RETURNS */
m_AMD_MULTIPLE | m_CORE2 | m_GENERIC,
/* X86_TUNE_EXT_80387_CONSTANTS */
- m_K6_GEODE | m_ATHLON_K8 | m_PENT4 | m_NOCONA | m_PPRO | m_CORE2 | m_GENERIC,
+ m_K6_GEODE | m_ATHLON_K8 | m_ATOM | m_PENT4 | m_NOCONA | m_PPRO
+ | m_CORE2 | m_GENERIC,
/* X86_TUNE_SHORTEN_X87_SSE */
~m_K8,
@@ -1447,6 +1528,10 @@ static unsigned int initial_ix86_tune_fe
with a subsequent conditional jump instruction into a single
compare-and-branch uop. */
m_CORE2,
+
+ /* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag
+ will impact LEA instruction selection. */
+ m_ATOM,
};
/* Feature tests against the various architecture variations. */
@@ -1472,10 +1557,11 @@ static unsigned int initial_ix86_arch_fe
};
static const unsigned int x86_accumulate_outgoing_args
- = m_AMD_MULTIPLE | m_PENT4 | m_NOCONA | m_PPRO | m_CORE2 | m_GENERIC;
+ = m_AMD_MULTIPLE | m_ATOM | m_PENT4 | m_NOCONA | m_PPRO | m_CORE2
+ | m_GENERIC;
static const unsigned int x86_arch_always_fancy_math_387
- = m_PENT | m_PPRO | m_AMD_MULTIPLE | m_PENT4
+ = m_PENT | m_ATOM | m_PPRO | m_AMD_MULTIPLE | m_PENT4
| m_NOCONA | m_CORE2 | m_GENERIC;
static enum stringop_alg stringop_alg = no_stringop;
@@ -1953,7 +2039,8 @@ static const struct ptt processor_target
{&core2_cost, 16, 10, 16, 10, 16},
{&generic32_cost, 16, 7, 16, 7, 16},
{&generic64_cost, 16, 10, 16, 10, 16},
- {&amdfam10_cost, 32, 24, 32, 7, 32}
+ {&amdfam10_cost, 32, 24, 32, 7, 32},
+ {&atom_cost, 16, 7, 16, 7, 16}
};
static const char *const cpu_names[TARGET_CPU_DEFAULT_max] =
@@ -1971,6 +2058,7 @@ static const char *const cpu_names[TARGE
"prescott",
"nocona",
"core2",
+ "atom",
"geode",
"k6",
"k6-2",
@@ -2529,6 +2617,9 @@ override_options (bool main_args_p)
{"core2", PROCESSOR_CORE2, CPU_CORE2,
PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
| PTA_SSSE3 | PTA_CX16},
+ {"atom", PROCESSOR_ATOM, CPU_ATOM,
+ PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
+ | PTA_SSSE3 | PTA_CX16},
{"geode", PROCESSOR_GEODE, CPU_GEODE,
PTA_MMX | PTA_3DNOW | PTA_3DNOW_A |PTA_PREFETCH_SSE},
{"k6", PROCESSOR_K6, CPU_K6, PTA_MMX},
@@ -12903,6 +12994,263 @@ ix86_expand_unary_operator (enum rtx_cod
emit_move_insn (operands[0], dst);
}
+#define LEA_SEARCH_THRESHOLD 12 /* Max insns scanned per search. */
+
+/* Search backward from INSN, within its basic block, for an insn that
+ sets OP1 or OP2 (only operands that are REGs are considered). The scan
+ stops after LEA_SEARCH_THRESHOLD insns or at the head of the block; if
+ the block branches to itself, it continues backward from the block end.
+ Return the distance in insns to the definition found, or -1 if none.
+ TODO: Currently we have no way to distinguish if definition insn is a LEA.
+ We just assume all definitions are non-lea. */
+static int
+distance_non_agu_define (rtx op1, rtx op2, rtx insn)
+{
+ rtx reg_op1 = REG_P (op1) ? op1 : NULL;
+ rtx reg_op2 = REG_P (op2) ? op2 : NULL;
+ basic_block bb = BLOCK_FOR_INSN (insn);
+ int distance = 0;
+
+ if (insn != BB_HEAD (bb))
+ {
+ /* Scan backward from INSN up to the head of the block. */
+ rtx prev = PREV_INSN (insn);
+ while (prev && distance < LEA_SEARCH_THRESHOLD)
+ {
+ if (INSN_P (prev))
+ {
+ distance++; /* Only real insns count toward the distance. */
+ if ((reg_op1 && reg_set_p (reg_op1, prev))
+ || (reg_op2 && reg_set_p (reg_op2, prev)))
+ return distance ;
+ }
+ if (prev == BB_HEAD (bb))
+ break;
+ prev = PREV_INSN (prev);
+ }
+ }
+
+ if (distance < LEA_SEARCH_THRESHOLD)
+ {
+ edge e;
+ edge_iterator ei;
+ bool simple_loop = false;
+ /* The block is a simple loop if it is its own predecessor. */
+ FOR_EACH_EDGE (e, ei, bb->preds)
+ if (e->src == bb)
+ {
+ simple_loop = true;
+ break;
+ }
+ /* Wrap around: keep scanning from the block end back toward INSN. */
+ if (simple_loop)
+ {
+ rtx prev = BB_END (bb);
+ while (prev
+ && prev != insn
+ && distance < LEA_SEARCH_THRESHOLD)
+ {
+ if (INSN_P (prev))
+ {
+ distance++;
+ if ((reg_op1 && reg_set_p (reg_op1, prev))
+ || (reg_op2 && reg_set_p (reg_op2, prev)))
+ return distance;
+ }
+ prev = PREV_INSN (prev);
+ }
+ }
+ }
+
+ return -1;
+}
+
+/* Return the distance from INSN to the next insn, within LEA_SEARCH_THRESHOLD
+ insns of its basic block (wrapping to the head of a self-looping block), that
+ uses OP0 as a memory address. Return -1 if OP0 is set first or no use found. */
+static int
+distance_agu_use (rtx op0, rtx insn)
+{
+ basic_block bb = BLOCK_FOR_INSN (insn);
+ int distance = 0;
+
+ if (insn != BB_END(bb))
+ {
+ rtx next = NEXT_INSN (insn);
+ /* Scan forward from INSN up to the end of the block. */
+ while (next && distance < LEA_SEARCH_THRESHOLD)
+ {
+ if (INSN_P (next))
+ {
+ distance++; /* Only real insns count toward the distance. */
+ if (reg_mentioned_by_mem_p (op0, next))
+ return distance;
+ if (reg_set_p (op0, next))
+ return -1; /* OP0 is overwritten before any address use. */
+ }
+ if (next == BB_END (bb))
+ break;
+ next = NEXT_INSN (next);
+ }
+ }
+
+ if (distance < LEA_SEARCH_THRESHOLD)
+ {
+ edge e;
+ edge_iterator ei;
+ bool simple_loop = false;
+ /* The block is a simple loop if it is its own successor. */
+ FOR_EACH_EDGE (e, ei, bb->succs)
+ if (e->dest == bb)
+ {
+ simple_loop = true;
+ break;
+ }
+ /* Wrap around: keep scanning from the block head up to INSN. */
+ if (simple_loop)
+ {
+ rtx next = BB_HEAD (bb);
+ while (next && distance < LEA_SEARCH_THRESHOLD)
+ {
+ if (next == insn)
+ break;
+ if (INSN_P (next))
+ {
+ distance++;
+ if (reg_mentioned_by_mem_p (op0, next))
+ return distance;
+ if (reg_set_p (op0, next))
+ return -1;
+ }
+ next = NEXT_INSN (next);
+ }
+ }
+ }
+
+ return -1;
+}
+
+/* Define this macro to tune LEA priority vs ADD; it takes effect when
+ there is a dilemma of choosing LEA or ADD.
+ Negative value: ADD is more preferred than LEA
+ Zero: Neutral
+ Positive value: LEA is more preferred than ADD. */
+#define IX86_LEA_PRIORITY 2
+
+/* Return true if it is ok to turn the ADD of OPERANDS[1] and OPERANDS[2]
+ into OPERANDS[0] (a REG) at INSN into an LEA, which avoids flag register
+ consumption. For the processors like ATOM, if the destination register
+ of LEA holds an actual address which will be used soon, LEA is better
+ and otherwise ADD is better. */
+
+bool
+ix86_lea_for_add_ok (enum rtx_code code ATTRIBUTE_UNUSED,
+ rtx insn,
+ rtx operands[])
+{
+ gcc_assert (REG_P (operands[0]));
+ gcc_assert (operands[1] && operands[2]);
+ /* Not tuning for AGU, or optimizing for size: LEA only if dest != src1. */
+ if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
+ {
+ if (true_regnum (operands[0]) != true_regnum (operands[1]))
+ return true;
+ else
+ return false;
+ }
+
+ /* If a = b + c, (a!=b && a!=c), must use lea form. */
+ if (true_regnum (operands[0]) != true_regnum (operands[1])
+ && true_regnum (operands[0]) != true_regnum (operands[2]))
+ return true;
+ else
+ {
+ int dist_define, dist_use;
+ dist_define = distance_non_agu_define (operands[1],
+ operands[2], insn);
+ if (dist_define <= 0)
+ return true; /* No definition of the sources was found nearby. */
+
+ /* If this insn has both backward non-agu dependence and forward
+ agu dependence, the one with short distance takes effect. */
+ dist_use = distance_agu_use (operands[0], insn);
+ if (dist_use <= 0
+ || (dist_define + IX86_LEA_PRIORITY) < dist_use)
+ return false;
+
+ return true;
+ }
+}
+
+/* Return true if the REG set by SET_INSN is the shift or rotate count
+ of USE_INSN. Only the first element of a PARALLEL is examined. */
+
+bool
+ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn)
+{
+ rtx set_pattern = PATTERN (set_insn);
+ rtx set_dest;
+ rtx shift_rtx;
+ rtx use_pattern;
+
+ /* Retrieve destination of set_insn. */
+ switch (GET_CODE (set_pattern))
+ {
+ case SET:
+ set_dest = SET_DEST (set_pattern);
+ break;
+ case PARALLEL:
+ set_pattern = XVECEXP (set_pattern, 0, 0);
+ if (GET_CODE (set_pattern ) == SET)
+ {
+ set_dest = SET_DEST (set_pattern);
+ break;
+ } /* Fall through when the first element is not a SET. */
+ default:
+ set_dest = NULL;
+ break;
+ }
+ if (!set_dest || !REG_P (set_dest))
+ return false;
+
+ /* Retrieve the source expression of use_insn, a candidate shift. */
+ use_pattern = PATTERN (use_insn);
+ switch (GET_CODE (use_pattern))
+ {
+ case SET:
+ shift_rtx = XEXP (use_pattern, 1);
+ break;
+ case PARALLEL:
+ set_pattern = XVECEXP (use_pattern, 0, 0);
+ if (GET_CODE (set_pattern) == SET)
+ {
+ shift_rtx = XEXP (set_pattern, 1);
+ break;
+ } /* Fall through when the first element is not a SET. */
+ default:
+ shift_rtx = NULL;
+ break;
+ }
+
+ if (shift_rtx
+ && (GET_CODE (shift_rtx) == ASHIFT
+ || GET_CODE (shift_rtx) == LSHIFTRT
+ || GET_CODE (shift_rtx) == ASHIFTRT
+ || GET_CODE (shift_rtx) == ROTATE
+ || GET_CODE (shift_rtx) == ROTATERT))
+ {
+ rtx shift_count = XEXP (shift_rtx, 1);
+ gcc_assert (shift_count);
+
+ /* Return true if shift count is dest of set_insn. */
+ if (REG_P (shift_count)
+ && true_regnum (set_dest) == true_regnum (shift_count))
+ return true;
+ }
+
+ return false;
+}
+
/* Return TRUE or FALSE depending on whether the unary operator meets the
appropriate constraints. */
@@ -19022,6 +19370,7 @@ ix86_issue_rate (void)
switch (ix86_tune)
{
case PROCESSOR_PENTIUM:
+ case PROCESSOR_ATOM:
case PROCESSOR_K6:
return 2;
@@ -19088,41 +19437,21 @@ ix86_flags_dependent (rtx insn, rtx dep_
return 1;
}
-/* A subroutine of ix86_adjust_cost -- return true iff INSN has a memory
- address with operands set by DEP_INSN. */
+/* Return true iff the address of the highest-numbered MEM operand of
+ USE_INSN is modified by SET_INSN; false if USE_INSN has no MEM operand. */
-static int
-ix86_agi_dependent (rtx insn, rtx dep_insn, enum attr_type insn_type)
+bool
+ix86_agi_dependent (rtx set_insn, rtx use_insn)
{
- rtx addr;
-
- if (insn_type == TYPE_LEA
- && TARGET_PENTIUM)
- {
- addr = PATTERN (insn);
-
- if (GET_CODE (addr) == PARALLEL)
- addr = XVECEXP (addr, 0, 0);
-
- gcc_assert (GET_CODE (addr) == SET);
-
- addr = SET_SRC (addr);
- }
- else
- {
- int i;
- extract_insn_cached (insn);
- for (i = recog_data.n_operands - 1; i >= 0; --i)
- if (MEM_P (recog_data.operand[i]))
- {
- addr = XEXP (recog_data.operand[i], 0);
- goto found;
- }
- return 0;
- found:;
- }
-
- return modified_in_p (addr, dep_insn);
+ int i;
+ extract_insn_cached (use_insn);
+ for (i = recog_data.n_operands - 1; i >= 0; --i)
+ if (MEM_P (recog_data.operand[i]))
+ {
+ rtx addr = XEXP (recog_data.operand[i], 0);
+ return modified_in_p (addr, set_insn) != 0;
+ }
+ return false;
}
static int
@@ -19150,8 +19479,19 @@ ix86_adjust_cost (rtx insn, rtx link, rt
{
case PROCESSOR_PENTIUM:
/* Address Generation Interlock adds a cycle of latency. */
- if (ix86_agi_dependent (insn, dep_insn, insn_type))
- cost += 1;
+ if (insn_type == TYPE_LEA)
+ {
+ rtx addr = PATTERN (insn);
+
+ if (GET_CODE (addr) == PARALLEL)
+ addr = XVECEXP (addr, 0, 0);
+
+ gcc_assert (GET_CODE (addr) == SET);
+
+ addr = SET_SRC (addr);
+ if (modified_in_p (addr, dep_insn))
+ cost += 1;
+ }
/* ??? Compares pair with jump/setcc. */
if (ix86_flags_dependent (insn, dep_insn, insn_type))
@@ -19160,7 +19500,7 @@ ix86_adjust_cost (rtx insn, rtx link, rt
/* Floating point stores require value to be ready one cycle earlier. */
if (insn_type == TYPE_FMOV
&& get_attr_memory (insn) == MEMORY_STORE
- && !ix86_agi_dependent (insn, dep_insn, insn_type))
+ && !ix86_agi_dependent (dep_insn, insn))
cost += 1;
break;
@@ -19183,7 +19523,7 @@ ix86_adjust_cost (rtx insn, rtx link, rt
in parallel with previous instruction in case
previous instruction is not needed to compute the address. */
if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
- && !ix86_agi_dependent (insn, dep_insn, insn_type))
+ && !ix86_agi_dependent (dep_insn, insn))
{
/* Claim moves to take one cycle, as core can issue one load
at time and the next load can start cycle later. */
@@ -19212,7 +19552,7 @@ ix86_adjust_cost (rtx insn, rtx link, rt
in parallel with previous instruction in case
previous instruction is not needed to compute the address. */
if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
- && !ix86_agi_dependent (insn, dep_insn, insn_type))
+ && !ix86_agi_dependent (dep_insn, insn))
{
/* Claim moves to take one cycle, as core can issue one load
at time and the next load can start cycle later. */
@@ -19229,6 +19569,7 @@ ix86_adjust_cost (rtx insn, rtx link, rt
case PROCESSOR_ATHLON:
case PROCESSOR_K8:
case PROCESSOR_AMDFAM10:
+ case PROCESSOR_ATOM:
case PROCESSOR_GENERIC32:
case PROCESSOR_GENERIC64:
memory = get_attr_memory (insn);
@@ -19237,7 +19578,7 @@ ix86_adjust_cost (rtx insn, rtx link, rt
in parallel with previous instruction in case
previous instruction is not needed to compute the address. */
if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
- && !ix86_agi_dependent (insn, dep_insn, insn_type))
+ && !ix86_agi_dependent (dep_insn, insn))
{
enum attr_unit unit = get_attr_unit (insn);
int loadcost = 3;