diff --git a/.cvsignore b/.cvsignore index 612cc08..f09d384 100644 --- a/.cvsignore +++ b/.cvsignore @@ -1 +1 @@ -gcc-4.1.1-20070202.tar.bz2 +gcc-4.1.1-20070209.tar.bz2 diff --git a/gcc41-amdfam10.patch b/gcc41-amdfam10.patch new file mode 100644 index 0000000..4a7938b --- /dev/null +++ b/gcc41-amdfam10.patch @@ -0,0 +1,3614 @@ +2007-02-08 Harsha Jagasia + + * config/i386/xmmintrin.h: Make inclusion of emmintrin.h + conditional to __SSE2__. + * config/i386/emmintrin.h: Generate #error if __SSE2__ is not + defined. + * config/i386/pmmintrin.h: Generate #error if __SSE3__ is not + defined. + * config/i386/tmmintrin.h: Generate #error if __SSSE3__ is not + defined. + +2007-02-05 Harsha Jagasia + + * config/i386/athlon.md (athlon_fldxf_k8, athlon_fld_k8, + athlon_fstxf_k8, athlon_fst_k8, athlon_fist, athlon_fmov, + athlon_fadd_load, athlon_fadd_load_k8, athlon_fadd, athlon_fmul, + athlon_fmul_load, athlon_fmul_load_k8, athlon_fsgn, + athlon_fdiv_load, athlon_fdiv_load_k8, athlon_fdiv_k8, + athlon_fpspc_load, athlon_fpspc, athlon_fcmov_load, + athlon_fcmov_load_k8, athlon_fcmov_k8, athlon_fcomi_load_k8, + athlon_fcomi, athlon_fcom_load_k8, athlon_fcom): Added amdfam10. 
+ + * config/i386/i386.md (x86_sahf_1, cmpfp_i_mixed, cmpfp_i_sse, + cmpfp_i_i387, cmpfp_iu_mixed, cmpfp_iu_sse, cmpfp_iu_387, + swapsi, swaphi_1, swapqi_1, swapdi_rex64, fix_truncsfdi_sse, + fix_truncdfdi_sse, fix_truncsfsi_sse, fix_truncdfsi_sse, + x86_fldcw_1, floatsisf2_mixed, floatsisf2_sse, floatdisf2_mixed, + floatdisf2_sse, floatsidf2_mixed, floatsidf2_sse, + floatdidf2_mixed, floatdidf2_sse, muldi3_1_rex64, mulsi3_1, + mulsi3_1_zext, mulhi3_1, mulqi3_1, umulqihi3_1, mulqihi3_insn, + umulditi3_insn, umulsidi3_insn, mulditi3_insn, mulsidi3_insn, + umuldi3_highpart_rex64, umulsi3_highpart_insn, + umulsi3_highpart_zext, smuldi3_highpart_rex64, + smulsi3_highpart_insn, smulsi3_highpart_zext, x86_64_shld, + x86_shld_1, x86_64_shrd, sqrtsf2_mixed, sqrtsf2_sse, + sqrtsf2_i387, sqrtdf2_mixed, sqrtdf2_sse, sqrtdf2_i387, + sqrtextendsfdf2_i387, sqrtxf2, sqrtextendsfxf2_i387, + sqrtextenddfxf2_i387): Added amdfam10_decode. + + * config/i386/athlon.md (athlon_idirect_amdfam10, + athlon_ivector_amdfam10, athlon_idirect_load_amdfam10, + athlon_ivector_load_amdfam10, athlon_idirect_both_amdfam10, + athlon_ivector_both_amdfam10, athlon_idirect_store_amdfam10, + athlon_ivector_store_amdfam10): New define_insn_reservation. + (athlon_idirect_loadmov, athlon_idirect_movstore): Added + amdfam10. + + * config/i386/athlon.md (athlon_call_amdfam10, + athlon_pop_amdfam10, athlon_lea_amdfam10): New + define_insn_reservation. + (athlon_branch, athlon_push, athlon_leave_k8, athlon_imul_k8, + athlon_imul_k8_DI, athlon_imul_mem_k8, athlon_imul_mem_k8_DI, + athlon_idiv, athlon_idiv_mem, athlon_str): Added amdfam10. + + * config/i386/athlon.md (athlon_sseld_amdfam10, + athlon_mmxld_amdfam10, athlon_ssest_amdfam10, + athlon_mmxssest_short_amdfam10): New define_insn_reservation. + + * config/i386/athlon.md (athlon_sseins_amdfam10): New + define_insn_reservation. + * config/i386/i386.md (sseins): Added sseins to define_attr type + and define_attr unit. 
+ * config/i386/sse.md: Set type attribute to sseins for insertq + and insertqi. + + * config/i386/athlon.md (sselog_load_amdfam10, sselog_amdfam10, + ssecmpvector_load_amdfam10, ssecmpvector_amdfam10, + ssecomi_load_amdfam10, ssecomi_amdfam10, + sseaddvector_load_amdfam10, sseaddvector_amdfam10): New + define_insn_reservation. + (ssecmp_load_k8, ssecmp, sseadd_load_k8, sseadd): Added amdfam10. + + * config/i386/athlon.md (cvtss2sd_load_amdfam10, + cvtss2sd_amdfam10, cvtps2pd_load_amdfam10, cvtps2pd_amdfam10, + cvtsi2sd_load_amdfam10, cvtsi2ss_load_amdfam10, + cvtsi2sd_amdfam10, cvtsi2ss_amdfam10, cvtsd2ss_load_amdfam10, + cvtsd2ss_amdfam10, cvtpd2ps_load_amdfam10, cvtpd2ps_amdfam10, + cvtsX2si_load_amdfam10, cvtsX2si_amdfam10): New + define_insn_reservation. + + * config/i386/sse.md (cvtsi2ss, cvtsi2ssq, cvtss2si, + cvtss2siq, cvttss2si, cvttss2siq, cvtsi2sd, cvtsi2sdq, + cvtsd2si, cvtsd2siq, cvttsd2si, cvttsd2siq, + cvtpd2dq, cvttpd2dq, cvtsd2ss, cvtss2sd, + cvtpd2ps, cvtps2pd): Added amdfam10_decode attribute. + + * config/i386/athlon.md (athlon_ssedivvector_amdfam10, + athlon_ssedivvector_load_amdfam10, athlon_ssemulvector_amdfam10, + athlon_ssemulvector_load_amdfam10): New define_insn_reservation. + (athlon_ssediv, athlon_ssediv_load_k8, athlon_ssemul, + athlon_ssemul_load_k8): Added amdfam10. + + * config/i386/i386.h (TARGET_SSE_UNALIGNED_MOVE_OPTIMAL): New macro. + (x86_sse_unaligned_move_optimal): New variable. + + * config/i386/i386.c (x86_sse_unaligned_move_optimal): Enable for + m_AMDFAM10. + (ix86_expand_vector_move_misalign): Add code to generate movupd/movups + for unaligned vector SSE double/single precision loads for AMDFAM10. + + * config/i386/i386.h (TARGET_AMDFAM10): New macro. + (TARGET_CPU_CPP_BUILTINS): Add code for amdfam10. + Define TARGET_CPU_DEFAULT_amdfam10. + (TARGET_CPU_DEFAULT_NAMES): Add amdfam10. + (processor_type): Add PROCESSOR_AMDFAM10. 
+ + * config/i386/i386.md: Add amdfam10 as a new cpu attribute to match + processor_type in config/i386/i386.h. + Enable imul peepholes for TARGET_AMDFAM10. + + * config.gcc: Add support for --with-cpu option for amdfam10. + + * config/i386/i386.c (amdfam10_cost): New variable. + (m_AMDFAM10): New macro. + (m_ATHLON_K8_AMDFAM10): New macro. + (x86_use_leave, x86_push_memory, x86_movx, x86_unroll_strlen, + x86_cmove, x86_3dnow_a, x86_deep_branch, x86_use_simode_fiop, + x86_promote_QImode, x86_integer_DFmode_moves, + x86_partial_reg_dependency, x86_memory_mismatch_stall, + x86_accumulate_outgoing_args, x86_arch_always_fancy_math_387, + x86_sse_partial_reg_dependency, x86_sse_typeless_stores, + x86_use_ffreep, x86_use_incdec, x86_four_jump_limit, + x86_schedule, x86_use_bt, x86_cmpxchg16b, x86_pad_returns): + Enable/disable for amdfam10. + (override_options): Add amdfam10_cost to processor_target_table. + Set up PROCESSOR_AMDFAM10 for amdfam10 entry in + processor_alias_table. + (ix86_issue_rate): Add PROCESSOR_AMDFAM10. + (ix86_adjust_cost): Add code for amdfam10. + + * config/i386/i386.opt: Add new Advanced Bit Manipulation (-mabm) + instruction set feature flag. Add new (-mpopcnt) flag for popcnt + instruction. Add new SSE4A (-msse4a) instruction set feature flag. + * config/i386/i386.h: Add builtin definition for SSE4A. + * config/i386/i386.md: Add support for ABM instructions + (popcnt and lzcnt). + * config/i386/sse.md: Add support for SSE4A instructions + (movntss, movntsd, extrq, insertq). + * config/i386/i386.c: Add support for ABM and SSE4A builtins. + Add -march=amdfam10 flag. + * config/i386/ammintrin.h: Add support for SSE4A intrinsics. + * doc/invoke.texi: Add documentation on flags for sse4a, abm, popcnt + and amdfam10. + * doc/extend.texi: Add documentation for SSE4A builtins. + +2007-02-05 Dwarakanath Rajagopal + + * gcc.dg/i386-cpuid.h: Test whether SSE4A is supported + for running tests. + * gcc.target/i386/sse4a-extract.c: New test. 
+ * gcc.target/i386/sse4a-insert.c: New test. + * gcc.target/i386/sse4a-montsd.c: New test. + * gcc.target/i386/sse4a-montss.c: New test. + +2006-12-15 H.J. Lu + + * gcc.dg/i386-cpuid.h (bit_SSSE3): New. + +2006-11-30 H.J. Lu + + * gcc.dg/i386-cpuid.h (bit_SSE3): New. + (i386_get_cpuid): New function. + (i386_cpuid_ecx): Likewise. + (i386_cpuid_edx): Likewise. + (i386_cpuid): Updated to call i386_cpuid_edx. + +--- gcc/doc/extend.texi.jj 2007-02-09 16:18:25.000000000 +0100 ++++ gcc/doc/extend.texi 2007-02-09 21:26:06.000000000 +0100 +@@ -6931,6 +6931,23 @@ v4si __builtin_ia32_pabsd128 (v4si) + v8hi __builtin_ia32_pabsw128 (v8hi) + @end smallexample + ++The following built-in functions are available when @option{-msse4a} is used. ++ ++@smallexample ++void _mm_stream_sd (double*,__m128d); ++Generates the @code{movntsd} machine instruction. ++void _mm_stream_ss (float*,__m128); ++Generates the @code{movntss} machine instruction. ++__m128i _mm_extract_si64 (__m128i, __m128i); ++Generates the @code{extrq} machine instruction with only SSE register operands. ++__m128i _mm_extracti_si64 (__m128i, int, int); ++Generates the @code{extrq} machine instruction with SSE register and immediate operands. ++__m128i _mm_insert_si64 (__m128i, __m128i); ++Generates the @code{insertq} machine instruction with only SSE register operands. ++__m128i _mm_inserti_si64 (__m128i, __m128i, int, int); ++Generates the @code{insertq} machine instruction with SSE register and immediate operands. ++@end smallexample ++ + The following built-in functions are available when @option{-m3dnow} is used. + All of them generate the machine instruction that is part of the name. + +--- gcc/doc/invoke.texi.jj 2007-02-09 16:18:25.000000000 +0100 ++++ gcc/doc/invoke.texi 2007-02-09 21:56:44.000000000 +0100 +@@ -522,7 +522,7 @@ Objective-C and Objective-C++ Dialects}. 
+ -mno-fp-ret-in-387 -msoft-float -msvr3-shlib @gol + -mno-wide-multiply -mrtd -malign-double @gol + -mpreferred-stack-boundary=@var{num} @gol +--mmmx -msse -msse2 -msse3 -mssse3 -m3dnow @gol ++-mmmx -msse -msse2 -msse3 -mssse3 -msse4a -m3dnow -mpopcnt -mabm @gol + -mthreads -mno-align-stringops -minline-all-stringops @gol + -mpush-args -maccumulate-outgoing-args -m128bit-long-double @gol + -m96bit-long-double -mregparm=@var{num} -msseregparm @gol +@@ -9062,6 +9062,10 @@ instruction set support. + @item k8, opteron, athlon64, athlon-fx + AMD K8 core based CPUs with x86-64 instruction set support. (This supersets + MMX, SSE, SSE2, 3dNOW!, enhanced 3dNOW! and 64-bit instruction set extensions.) ++@item amdfam10 ++AMD Family 10 core based CPUs with x86-64 instruction set support. (This ++supersets MMX, SSE, SSE2, SSE3, SSE4A, 3dNOW!, enhanced 3dNOW!, ABM and 64-bit ++instruction set extensions.) + @item winchip-c6 + IDT Winchip C6 CPU, dealt in same way as i486 with additional MMX instruction + set support. +@@ -9339,8 +9343,14 @@ preferred alignment to @option{-mpreferr + @itemx -mno-sse3 + @item -mssse3 + @itemx -mno-ssse3 ++@item -msse4a ++@itemx -mno-sse4a + @item -m3dnow + @itemx -mno-3dnow ++@item -mpopcnt ++@itemx -mno-popcnt ++@item -mabm ++@itemx -mno-abm + @opindex mmmx + @opindex mno-mmx + @opindex msse +--- gcc/testsuite/gcc.target/i386/sse4a-insert.c.jj 2007-02-09 21:26:06.000000000 +0100 ++++ gcc/testsuite/gcc.target/i386/sse4a-insert.c 2007-02-09 21:26:06.000000000 +0100 +@@ -0,0 +1,110 @@ ++/* { dg-do run { target i?86-*-* x86_64-*-* } } */ ++/* { dg-options "-O2 -msse4a" } */ ++#include ++#include ++#include "../../gcc.dg/i386-cpuid.h" ++ ++static void sse4a_test (void); ++ ++typedef union ++{ ++ long long i[2]; ++ __m128i vec; ++} LI; ++ ++int ++main () ++{ ++ unsigned long cpu_facilities; ++ ++ cpu_facilities = i386_extended_cpuid_ecx (); ++ ++ /* Run SSE4a test only if host has SSE4a support. 
*/ ++ if ((cpu_facilities & bit_SSE4a)) ++ sse4a_test (); ++ ++ exit (0); ++} ++ ++static long long ++sse4a_test_insert (long long in1, long long in2) ++{ ++ __m128i v1,v2; ++ long long index_length, pad; ++ LI v_out; ++ index_length = 0x0000000000000810; ++ pad = 0x0; ++ v1 = _mm_set_epi64x (pad, in1); ++ v2 = _mm_set_epi64x (index_length, in2); ++ v_out.vec = _mm_insert_si64 (v1, v2); ++ return (v_out.i[0]); ++} ++ ++static long long ++sse4a_test_inserti (long long in1, long long in2) ++{ ++ __m128i v1,v2; ++ long long pad = 0x0; ++ LI v_out; ++ v1 = _mm_set_epi64x (pad, in1); ++ v2 = _mm_set_epi64x (pad, in2); ++ v_out.vec = _mm_inserti_si64 (v1, v2, (unsigned int) 0x10, (unsigned int) 0x08); ++ return (v_out.i[0]); ++} ++ ++static chk (long long i1, long long i2) ++{ ++ int n_fails =0; ++ if (i1 != i2) ++ n_fails +=1; ++ return n_fails; ++} ++ ++long long vals_in1[5] = ++ { ++ 0x1234567887654321, ++ 0x1456782093002490, ++ 0x2340909123990390, ++ 0x9595959599595999, ++ 0x9099038798000029 ++ }; ++ ++long long vals_in2[5] = ++ { ++ 0x9ABCDEF00FEDCBA9, ++ 0x234567097289672A, ++ 0x45476453097BD342, ++ 0x23569012AE586FF0, ++ 0x432567ABCDEF765D ++ }; ++ ++long long vals_out[5] = ++ { ++ 0x1234567887CBA921, ++ 0x1456782093672A90, ++ 0x2340909123D34290, ++ 0x95959595996FF099, ++ 0x9099038798765D29 ++ }; ++ ++static void ++sse4a_test (void) ++{ ++ int i; ++ int fail = 0; ++ long long out; ++ ++ for (i = 0; i < 5; i += 1) ++ { ++ out = sse4a_test_insert (vals_in1[i], vals_in2[i]); ++ fail += chk(out, vals_out[i]); ++ ++ out = sse4a_test_inserti (vals_in1[i], vals_in2[i]); ++ fail += chk(out, vals_out[i]); ++ } ++ ++ if (fail != 0) ++ abort (); ++ ++ exit (0); ++} +--- gcc/testsuite/gcc.target/i386/sse4a-extract.c.jj 2007-02-09 21:26:06.000000000 +0100 ++++ gcc/testsuite/gcc.target/i386/sse4a-extract.c 2007-02-09 21:26:06.000000000 +0100 +@@ -0,0 +1,100 @@ ++/* { dg-do run { target i?86-*-* x86_64-*-* } } */ ++/* { dg-options "-O2 -msse4a" } */ ++#include ++#include 
++#include "../../gcc.dg/i386-cpuid.h" ++ ++static void sse4a_test (void); ++ ++typedef union ++{ ++ long long i[2]; ++ __m128i vec; ++} LI; ++ ++int ++main () ++{ ++ unsigned long cpu_facilities; ++ ++ cpu_facilities = i386_extended_cpuid_ecx (); ++ ++ /* Run SSE4a test only if host has SSE4a support. */ ++ if ((cpu_facilities & bit_SSE4a)) ++ sse4a_test (); ++ ++ exit (0); ++} ++ ++static long long ++sse4a_test_extrq (long long in) ++{ ++ __m128i v1, v2; ++ long long index_length, pad; ++ LI v_out; ++ index_length = 0x0000000000000810; ++ pad = 0x0; ++ v1 = _mm_set_epi64x (pad, in); ++ v2 = _mm_set_epi64x (pad, index_length); ++ v_out.vec = _mm_extract_si64 (v1, v2); ++ return (v_out.i[0]); ++} ++ ++static long long ++sse4a_test_extrqi (long long in) ++{ ++ __m128i v1; ++ long long pad =0x0; ++ LI v_out; ++ v1 = _mm_set_epi64x (pad, in); ++ v_out.vec = _mm_extracti_si64 (v1, (unsigned int) 0x10,(unsigned int) 0x08); ++ return (v_out.i[0]); ++} ++ ++static chk (long long i1, long long i2) ++{ ++ int n_fails =0; ++ if (i1 != i2) ++ n_fails +=1; ++ return n_fails; ++} ++ ++long long vals_in[5] = ++ { ++ 0x1234567887654321, ++ 0x1456782093002490, ++ 0x2340909123990390, ++ 0x9595959599595999, ++ 0x9099038798000029 ++ }; ++ ++long long vals_out[5] = ++ { ++ 0x0000000000006543, ++ 0x0000000000000024, ++ 0x0000000000009903, ++ 0x0000000000005959, ++ 0x0000000000000000 ++ }; ++ ++static void ++sse4a_test (void) ++{ ++ int i; ++ int fail = 0; ++ long long out; ++ ++ for (i = 0; i < 5; i += 1) ++ { ++ out = sse4a_test_extrq (vals_in[i]); ++ fail += chk(out, vals_out[i]); ++ ++ out = sse4a_test_extrqi (vals_in[i]); ++ fail += chk(out, vals_out[i]); ++ } ++ ++ if (fail != 0) ++ abort (); ++ ++ exit (0); ++} +--- gcc/testsuite/gcc.target/i386/sse4a-montss.c.jj 2007-02-09 21:26:06.000000000 +0100 ++++ gcc/testsuite/gcc.target/i386/sse4a-montss.c 2007-02-09 21:26:06.000000000 +0100 +@@ -0,0 +1,64 @@ ++/* { dg-do run { target i?86-*-* x86_64-*-* } } */ ++/* { dg-options "-O2 
-msse4a" } */ ++#include ++#include ++#include "../../gcc.dg/i386-cpuid.h" ++ ++static void sse4a_test (void); ++ ++int ++main () ++{ ++ unsigned long cpu_facilities; ++ ++ cpu_facilities = i386_extended_cpuid_ecx (); ++ ++ /* Run SSE4a test only if host has SSE4a support. */ ++ if ((cpu_facilities & bit_SSE4a)) ++ sse4a_test (); ++ ++ exit (0); ++} ++ ++static void ++sse4a_test_movntss (float *out, float *in) ++{ ++ __m128 in_v4sf = _mm_load_ss (in); ++ _mm_stream_ss (out, in_v4sf); ++} ++ ++static int ++chk_ss (float *v1, float *v2) ++{ ++ int n_fails = 0; ++ if (v1[0] != v2[0]) ++ n_fails += 1; ++ return n_fails; ++} ++ ++float vals[10] = ++ { ++ 100.0, 200.0, 300.0, 400.0, 5.0, ++ -1.0, .345, -21.5, 9.32, 8.41 ++ }; ++ ++static void ++sse4a_test (void) ++{ ++ int i; ++ int fail = 0; ++ float *out; ++ ++ out = (float *) malloc (sizeof (float)); ++ for (i = 0; i < 10; i += 1) ++ { ++ sse4a_test_movntss (out, &vals[i]); ++ ++ fail += chk_ss (out, &vals[i]); ++ } ++ ++ if (fail != 0) ++ abort (); ++ ++ exit (0); ++} +--- gcc/testsuite/gcc.target/i386/sse4a-montsd.c.jj 2007-02-09 21:26:06.000000000 +0100 ++++ gcc/testsuite/gcc.target/i386/sse4a-montsd.c 2007-02-09 21:26:06.000000000 +0100 +@@ -0,0 +1,64 @@ ++/* { dg-do run { target i?86-*-* x86_64-*-* } } */ ++/* { dg-options "-O2 -msse4a" } */ ++#include ++#include ++#include "../../gcc.dg/i386-cpuid.h" ++ ++static void sse4a_test (void); ++ ++int ++main () ++{ ++ unsigned long cpu_facilities; ++ ++ cpu_facilities = i386_extended_cpuid_ecx (); ++ ++ /* Run SSE4a test only if host has SSE4a support. 
*/ ++ if ((cpu_facilities & bit_SSE4a)) ++ sse4a_test (); ++ ++ exit (0); ++} ++ ++static void ++sse4a_test_movntsd (double *out, double *in) ++{ ++ __m128d in_v2df = _mm_load_sd (in); ++ _mm_stream_sd (out, in_v2df); ++} ++ ++static int ++chk_sd (double *v1, double *v2) ++{ ++ int n_fails = 0; ++ if (v1[0] != v2[0]) ++ n_fails += 1; ++ return n_fails; ++} ++ ++double vals[10] = ++ { ++ 100.0, 200.0, 300.0, 400.0, 5.0, ++ -1.0, .345, -21.5, 9.32, 8.41 ++ }; ++ ++static void ++sse4a_test (void) ++{ ++ int i; ++ int fail = 0; ++ double *out; ++ ++ out = (double *) malloc (sizeof (double)); ++ for (i = 0; i < 10; i += 1) ++ { ++ sse4a_test_movntsd (out, &vals[i]); ++ ++ fail += chk_sd (out, &vals[i]); ++ } ++ ++ if (fail != 0) ++ abort (); ++ ++ exit (0); ++} +--- gcc/testsuite/gcc.dg/i386-cpuid.h.jj 2006-10-05 00:26:53.000000000 +0200 ++++ gcc/testsuite/gcc.dg/i386-cpuid.h 2007-02-07 13:07:08.000000000 +0100 +@@ -2,23 +2,32 @@ + Used by 20020523-2.c and i386-sse-6.c, and possibly others. */ + /* Plagarized from 20020523-2.c. */ + ++/* %ecx */ ++#define bit_SSE3 (1 << 0) ++#define bit_SSSE3 (1 << 9) ++ ++/* %edx */ + #define bit_CMOV (1 << 15) + #define bit_MMX (1 << 23) + #define bit_SSE (1 << 25) + #define bit_SSE2 (1 << 26) + ++/* Extended Features */ ++/* %ecx */ ++#define bit_SSE4a (1 << 6) ++ + #ifndef NOINLINE + #define NOINLINE __attribute__ ((noinline)) + #endif + +-unsigned int i386_cpuid (void) NOINLINE; +- +-unsigned int NOINLINE +-i386_cpuid (void) ++static inline unsigned int ++i386_get_cpuid (unsigned int *ecx, unsigned int *edx) + { +- int fl1, fl2; ++ int fl1; + + #ifndef __x86_64__ ++ int fl2; ++ + /* See if we can use cpuid. On AMD64 we always can. 
*/ + __asm__ ("pushfl; pushfl; popl %0; movl %0,%1; xorl %2,%0;" + "pushl %0; popfl; pushfl; popl %0; popfl" +@@ -42,15 +51,99 @@ i386_cpuid (void) + if (fl1 == 0) + return (0); + +- /* Invoke CPUID(1), return %edx; caller can examine bits to ++ /* Invoke CPUID(1), return %ecx and %edx; caller can examine bits to + determine what's supported. */ + #ifdef __x86_64__ +- __asm__ ("pushq %%rcx; pushq %%rbx; cpuid; popq %%rbx; popq %%rcx" +- : "=d" (fl2), "=a" (fl1) : "1" (1) : "cc"); ++ __asm__ ("pushq %%rbx; cpuid; popq %%rbx" ++ : "=c" (*ecx), "=d" (*edx), "=a" (fl1) : "2" (1) : "cc"); + #else +- __asm__ ("pushl %%ecx; pushl %%ebx; cpuid; popl %%ebx; popl %%ecx" +- : "=d" (fl2), "=a" (fl1) : "1" (1) : "cc"); ++ __asm__ ("pushl %%ebx; cpuid; popl %%ebx" ++ : "=c" (*ecx), "=d" (*edx), "=a" (fl1) : "2" (1) : "cc"); ++#endif ++ ++ return 1; ++} ++ ++static inline unsigned int ++i386_get_extended_cpuid (unsigned int *ecx, unsigned int *edx) ++{ ++ int fl1; ++ if (!(i386_get_cpuid (ecx, edx))) ++ return 0; ++ ++ /* Invoke CPUID(0x80000000) to get the highest supported extended function ++ number */ ++#ifdef __x86_64__ ++ __asm__ ("cpuid" ++ : "=a" (fl1) : "0" (0x80000000) : "edx", "ecx", "ebx"); ++#else ++ __asm__ ("pushl %%ebx; cpuid; popl %%ebx" ++ : "=a" (fl1) : "0" (0x80000000) : "edx", "ecx"); ++#endif ++ /* Check if highest supported extended function used below are supported */ ++ if (fl1 < 0x80000001) ++ return 0; ++ ++ /* Invoke CPUID(0x80000001), return %ecx and %edx; caller can examine bits to ++ determine what's supported. 
*/ ++#ifdef __x86_64__ ++ __asm__ ("cpuid" ++ : "=c" (*ecx), "=d" (*edx), "=a" (fl1) : "2" (0x80000001) : "ebx"); ++#else ++ __asm__ ("pushl %%ebx; cpuid; popl %%ebx" ++ : "=c" (*ecx), "=d" (*edx), "=a" (fl1) : "2" (0x80000001)); + #endif ++ return 1; ++} ++ ++ ++unsigned int i386_cpuid_ecx (void) NOINLINE; ++unsigned int i386_cpuid_edx (void) NOINLINE; ++unsigned int i386_extended_cpuid_ecx (void) NOINLINE; ++unsigned int i386_extended_cpuid_edx (void) NOINLINE; ++ ++unsigned int NOINLINE ++i386_cpuid_ecx (void) ++{ ++ unsigned int ecx, edx; ++ if (i386_get_cpuid (&ecx, &edx)) ++ return ecx; ++ else ++ return 0; ++} ++ ++unsigned int NOINLINE ++i386_cpuid_edx (void) ++{ ++ unsigned int ecx, edx; ++ if (i386_get_cpuid (&ecx, &edx)) ++ return edx; ++ else ++ return 0; ++} + +- return fl2; ++unsigned int NOINLINE ++i386_extended_cpuid_ecx (void) ++{ ++ unsigned int ecx, edx; ++ if (i386_get_extended_cpuid (&ecx, &edx)) ++ return ecx; ++ else ++ return 0; ++} ++ ++unsigned int NOINLINE ++i386_extended_cpuid_edx (void) ++{ ++ unsigned int ecx, edx; ++ if (i386_get_extended_cpuid (&ecx, &edx)) ++ return edx; ++ else ++ return 0; ++} ++ ++static inline unsigned int ++i386_cpuid (void) ++{ ++ return i386_cpuid_edx (); + } +--- gcc/config.gcc.jj 2007-02-09 16:18:25.000000000 +0100 ++++ gcc/config.gcc 2007-02-09 21:26:06.000000000 +0100 +@@ -264,12 +264,12 @@ xscale-*-*) + i[34567]86-*-*) + cpu_type=i386 + extra_headers="mmintrin.h mm3dnow.h xmmintrin.h emmintrin.h +- pmmintrin.h tmmintrin.h" ++ pmmintrin.h tmmintrin.h ammintrin.h" + ;; + x86_64-*-*) + cpu_type=i386 + extra_headers="mmintrin.h mm3dnow.h xmmintrin.h emmintrin.h +- pmmintrin.h tmmintrin.h" ++ pmmintrin.h tmmintrin.h ammintrin.h" + need_64bit_hwint=yes + ;; + ia64-*-*) +@@ -2396,6 +2396,9 @@ if test x$with_cpu = x ; then + ;; + i686-*-* | i786-*-*) + case ${target_noncanonical} in ++ amdfam10-*) ++ with_cpu=amdfam10 ++ ;; + k8-*|opteron-*|athlon_64-*) + with_cpu=k8 + ;; +@@ -2436,6 +2439,9 @@ if test 
x$with_cpu = x ; then + ;; + x86_64-*-*) + case ${target_noncanonical} in ++ amdfam10-*) ++ with_cpu=amdfam10 ++ ;; + k8-*|opteron-*|athlon_64-*) + with_cpu=k8 + ;; +@@ -2668,7 +2674,7 @@ case "${target}" in + esac + # OK + ;; +- "" | k8 | opteron | athlon64 | athlon-fx | nocona | core2 | generic) ++ "" | amdfam10 | k8 | opteron | athlon64 | athlon-fx | nocona | core2 | generic) + # OK + ;; + *) +--- gcc/config/i386/i386.h.jj 2007-02-09 16:18:25.000000000 +0100 ++++ gcc/config/i386/i386.h 2007-02-09 21:29:00.000000000 +0100 +@@ -141,6 +141,7 @@ extern const struct processor_costs *ix8 + #define TARGET_GENERIC32 (ix86_tune == PROCESSOR_GENERIC32) + #define TARGET_GENERIC64 (ix86_tune == PROCESSOR_GENERIC64) + #define TARGET_GENERIC (TARGET_GENERIC32 || TARGET_GENERIC64) ++#define TARGET_AMDFAM10 (ix86_tune == PROCESSOR_AMDFAM10) + + #define TUNEMASK (1 << ix86_tune) + extern const int x86_use_leave, x86_push_memory, x86_zero_extend_with_and; +@@ -159,6 +160,7 @@ extern const int x86_accumulate_outgoing + extern const int x86_epilogue_using_move, x86_decompose_lea; + extern const int x86_arch_always_fancy_math_387, x86_shift1; + extern const int x86_sse_partial_reg_dependency, x86_sse_split_regs; ++extern const int x86_sse_unaligned_move_optimal; + extern const int x86_sse_typeless_stores, x86_sse_load0_by_pxor; + extern const int x86_use_ffreep; + extern const int x86_inter_unit_moves, x86_schedule; +@@ -208,6 +210,8 @@ extern int x86_prefetch_sse, x86_cmpxchg + #define TARGET_PARTIAL_REG_DEPENDENCY (x86_partial_reg_dependency & TUNEMASK) + #define TARGET_SSE_PARTIAL_REG_DEPENDENCY \ + (x86_sse_partial_reg_dependency & TUNEMASK) ++#define TARGET_SSE_UNALIGNED_MOVE_OPTIMAL \ ++ (x86_sse_unaligned_move_optimal & TUNEMASK) + #define TARGET_SSE_SPLIT_REGS (x86_sse_split_regs & TUNEMASK) + #define TARGET_SSE_TYPELESS_STORES (x86_sse_typeless_stores & TUNEMASK) + #define TARGET_SSE_LOAD0_BY_PXOR (x86_sse_load0_by_pxor & TUNEMASK) +@@ -376,6 +380,8 @@ extern int 
x86_prefetch_sse, x86_cmpxchg + } \ + else if (TARGET_K8) \ + builtin_define ("__tune_k8__"); \ ++ else if (TARGET_AMDFAM10) \ ++ builtin_define ("__tune_amdfam10__"); \ + else if (TARGET_PENTIUM4) \ + builtin_define ("__tune_pentium4__"); \ + else if (TARGET_NOCONA) \ +@@ -400,6 +406,8 @@ extern int x86_prefetch_sse, x86_cmpxchg + builtin_define ("__SSSE3__"); \ + builtin_define ("__MNI__"); \ + } \ ++ if (TARGET_SSE4A) \ ++ builtin_define ("__SSE4A__"); \ + if (TARGET_SSE_MATH && TARGET_SSE) \ + builtin_define ("__SSE_MATH__"); \ + if (TARGET_SSE_MATH && TARGET_SSE2) \ +@@ -455,6 +463,11 @@ extern int x86_prefetch_sse, x86_cmpxchg + builtin_define ("__k8"); \ + builtin_define ("__k8__"); \ + } \ ++ else if (ix86_arch == PROCESSOR_AMDFAM10) \ ++ { \ ++ builtin_define ("__amdfam10"); \ ++ builtin_define ("__amdfam10__"); \ ++ } \ + else if (ix86_arch == PROCESSOR_PENTIUM4) \ + { \ + builtin_define ("__pentium4"); \ +@@ -493,13 +506,14 @@ extern int x86_prefetch_sse, x86_cmpxchg + #define TARGET_CPU_DEFAULT_nocona 17 + #define TARGET_CPU_DEFAULT_core2 18 + #define TARGET_CPU_DEFAULT_generic 19 ++#define TARGET_CPU_DEFAULT_amdfam10 20 + + #define TARGET_CPU_DEFAULT_NAMES {"i386", "i486", "pentium", "pentium-mmx",\ + "pentiumpro", "pentium2", "pentium3", \ + "pentium4", "geode", "k6", "k6-2", "k6-3", \ + "athlon", "athlon-4", "k8", \ + "pentium-m", "prescott", "nocona", \ +- "core2", "generic"} ++ "core2", "generic", "amdfam10"} + + #ifndef CC1_SPEC + #define CC1_SPEC "%(cc1_cpu) " +@@ -2162,6 +2176,7 @@ enum processor_type + PROCESSOR_CORE2, + PROCESSOR_GENERIC32, + PROCESSOR_GENERIC64, ++ PROCESSOR_AMDFAM10, + PROCESSOR_max + }; + +--- gcc/config/i386/i386.md.jj 2007-02-09 16:18:25.000000000 +0100 ++++ gcc/config/i386/i386.md 2007-02-10 19:33:43.000000000 +0100 +@@ -151,6 +151,12 @@ + (UNSPEC_PSHUFB 120) + (UNSPEC_PSIGN 121) + (UNSPEC_PALIGNR 122) ++ ++ ; For SSE4A support ++ (UNSPEC_EXTRQI 130) ++ (UNSPEC_EXTRQ 131) ++ (UNSPEC_INSERTQI 132) ++ (UNSPEC_INSERTQ 133) 
+ ]) + + (define_constants +@@ -190,7 +196,8 @@ + + ;; Processor type. This attribute must exactly match the processor_type + ;; enumeration in i386.h. +-(define_attr "cpu" "i386,i486,pentium,pentiumpro,geode,k6,athlon,pentium4,k8,nocona,core2,generic32,generic64" ++(define_attr "cpu" "i386,i486,pentium,pentiumpro,geode,k6,athlon,pentium4,k8, ++ nocona,core2,generic32,generic64,amdfam10" + (const (symbol_ref "ix86_tune"))) + + ;; A basic instruction type. Refinements due to arguments to be +@@ -201,10 +208,10 @@ + incdec,ishift,ishift1,rotate,rotate1,imul,idiv, + icmp,test,ibr,setcc,icmov, + push,pop,call,callv,leave, +- str,cld, ++ str,bitmanip,cld, + fmov,fop,fsgn,fmul,fdiv,fpspc,fcmov,fcmp,fxch,fistp,fisttp,frndint, + sselog,sselog1,sseiadd,sseishft,sseimul, +- sse,ssemov,sseadd,ssemul,ssecmp,ssecomi,ssecvt,sseicvt,ssediv, ++ sse,ssemov,sseadd,ssemul,ssecmp,ssecomi,ssecvt,sseicvt,ssediv,sseins, + mmx,mmxmov,mmxadd,mmxmul,mmxcmp,mmxcvt,mmxshft" + (const_string "other")) + +@@ -218,7 +225,7 @@ + (cond [(eq_attr "type" "fmov,fop,fsgn,fmul,fdiv,fpspc,fcmov,fcmp,fxch,fistp,fisttp,frndint") + (const_string "i387") + (eq_attr "type" "sselog,sselog1,sseiadd,sseishft,sseimul, +- sse,ssemov,sseadd,ssemul,ssecmp,ssecomi,ssecvt,sseicvt,ssediv") ++ sse,ssemov,sseadd,ssemul,ssecmp,ssecomi,ssecvt,sseicvt,ssediv,sseins") + (const_string "sse") + (eq_attr "type" "mmx,mmxmov,mmxadd,mmxmul,mmxcmp,mmxcvt,mmxshft") + (const_string "mmx") +@@ -228,7 +235,8 @@ + + ;; The (bounding maximum) length of an instruction immediate. + (define_attr "length_immediate" "" +- (cond [(eq_attr "type" "incdec,setcc,icmov,str,cld,lea,other,multi,idiv,leave") ++ (cond [(eq_attr "type" "incdec,setcc,icmov,str,cld,lea,other,multi,idiv,leave, ++ bitmanip") + (const_int 0) + (eq_attr "unit" "i387,sse,mmx") + (const_int 0) +@@ -282,7 +290,7 @@ + ;; Set when 0f opcode prefix is used. 
+ (define_attr "prefix_0f" "" + (if_then_else +- (ior (eq_attr "type" "imovx,setcc,icmov") ++ (ior (eq_attr "type" "imovx,setcc,icmov,bitmanip") + (eq_attr "unit" "sse,mmx")) + (const_int 1) + (const_int 0))) +@@ -407,7 +415,7 @@ + (const_string "load") + (and (eq_attr "type" + "!alu1,negnot,ishift1, +- imov,imovx,icmp,test, ++ imov,imovx,icmp,test,bitmanip, + fmov,fcmp,fsgn, + sse,ssemov,ssecmp,ssecomi,ssecvt,sseicvt,sselog1, + mmx,mmxmov,mmxcmp,mmxcvt") +@@ -961,10 +969,11 @@ + "sahf" + [(set_attr "length" "1") + (set_attr "athlon_decode" "vector") ++ (set_attr "amdfam10_decode" "direct") + (set_attr "mode" "SI")]) + + ;; Pentium Pro can do steps 1 through 3 in one go. +- ++;; comi*, ucomi*, fcomi*, ficomi*,fucomi* (i387 instructions set condition codes) + (define_insn "*cmpfp_i_mixed" + [(set (reg:CCFP FLAGS_REG) + (compare:CCFP (match_operand 0 "register_operand" "f#x,x#f") +@@ -978,7 +987,8 @@ + (if_then_else (match_operand:SF 1 "" "") + (const_string "SF") + (const_string "DF"))) +- (set_attr "athlon_decode" "vector")]) ++ (set_attr "athlon_decode" "vector") ++ (set_attr "amdfam10_decode" "direct")]) + + (define_insn "*cmpfp_i_sse" + [(set (reg:CCFP FLAGS_REG) +@@ -993,7 +1003,8 @@ + (if_then_else (match_operand:SF 1 "" "") + (const_string "SF") + (const_string "DF"))) +- (set_attr "athlon_decode" "vector")]) ++ (set_attr "athlon_decode" "vector") ++ (set_attr "amdfam10_decode" "direct")]) + + (define_insn "*cmpfp_i_i387" + [(set (reg:CCFP FLAGS_REG) +@@ -1012,7 +1023,8 @@ + (const_string "DF") + ] + (const_string "XF"))) +- (set_attr "athlon_decode" "vector")]) ++ (set_attr "athlon_decode" "vector") ++ (set_attr "amdfam10_decode" "direct")]) + + (define_insn "*cmpfp_iu_mixed" + [(set (reg:CCFPU FLAGS_REG) +@@ -1027,7 +1039,8 @@ + (if_then_else (match_operand:SF 1 "" "") + (const_string "SF") + (const_string "DF"))) +- (set_attr "athlon_decode" "vector")]) ++ (set_attr "athlon_decode" "vector") ++ (set_attr "amdfam10_decode" "direct")]) + + (define_insn 
"*cmpfp_iu_sse" + [(set (reg:CCFPU FLAGS_REG) +@@ -1042,7 +1055,8 @@ + (if_then_else (match_operand:SF 1 "" "") + (const_string "SF") + (const_string "DF"))) +- (set_attr "athlon_decode" "vector")]) ++ (set_attr "athlon_decode" "vector") ++ (set_attr "amdfam10_decode" "direct")]) + + (define_insn "*cmpfp_iu_387" + [(set (reg:CCFPU FLAGS_REG) +@@ -1061,7 +1075,8 @@ + (const_string "DF") + ] + (const_string "XF"))) +- (set_attr "athlon_decode" "vector")]) ++ (set_attr "athlon_decode" "vector") ++ (set_attr "amdfam10_decode" "direct")]) + + ;; Move instructions. + +@@ -1267,7 +1282,8 @@ + [(set_attr "type" "imov") + (set_attr "mode" "SI") + (set_attr "pent_pair" "np") +- (set_attr "athlon_decode" "vector")]) ++ (set_attr "athlon_decode" "vector") ++ (set_attr "amdfam10_decode" "double")]) + + (define_expand "movhi" + [(set (match_operand:HI 0 "nonimmediate_operand" "") +@@ -1384,8 +1400,10 @@ + [(set_attr "type" "imov") + (set_attr "mode" "SI") + (set_attr "pent_pair" "np") +- (set_attr "athlon_decode" "vector")]) ++ (set_attr "athlon_decode" "vector") ++ (set_attr "amdfam10_decode" "double")]) + ++;; Not added amdfam10_decode since TARGET_PARTIAL_REG_STALL is disabled for AMDFAM10 + (define_insn "*swaphi_2" + [(set (match_operand:HI 0 "register_operand" "+r") + (match_operand:HI 1 "register_operand" "+r")) +@@ -1558,8 +1576,10 @@ + [(set_attr "type" "imov") + (set_attr "mode" "SI") + (set_attr "pent_pair" "np") +- (set_attr "athlon_decode" "vector")]) ++ (set_attr "athlon_decode" "vector") ++ (set_attr "amdfam10_decode" "vector")]) + ++;; Not added amdfam10_decode since TARGET_PARTIAL_REG_STALL is disabled for AMDFAM10 + (define_insn "*swapqi_2" + [(set (match_operand:QI 0 "register_operand" "+q") + (match_operand:QI 1 "register_operand" "+q")) +@@ -2113,7 +2133,8 @@ + [(set_attr "type" "imov") + (set_attr "mode" "DI") + (set_attr "pent_pair" "np") +- (set_attr "athlon_decode" "vector")]) ++ (set_attr "athlon_decode" "vector") ++ (set_attr "amdfam10_decode" 
"double")]) + + (define_expand "movti" + [(set (match_operand:TI 0 "nonimmediate_operand" "") +@@ -4122,7 +4143,8 @@ + "cvttss2si{q}\t{%1, %0|%0, %1}" + [(set_attr "type" "sseicvt") + (set_attr "mode" "SF") +- (set_attr "athlon_decode" "double,vector")]) ++ (set_attr "athlon_decode" "double,vector") ++ (set_attr "amdfam10_decode" "double,double")]) + + (define_insn "fix_truncdfdi_sse" + [(set (match_operand:DI 0 "register_operand" "=r,r") +@@ -4131,7 +4153,8 @@ + "cvttsd2si{q}\t{%1, %0|%0, %1}" + [(set_attr "type" "sseicvt") + (set_attr "mode" "DF") +- (set_attr "athlon_decode" "double,vector")]) ++ (set_attr "athlon_decode" "double,vector") ++ (set_attr "amdfam10_decode" "double,double")]) + + (define_insn "fix_truncsfsi_sse" + [(set (match_operand:SI 0 "register_operand" "=r,r") +@@ -4140,7 +4163,8 @@ + "cvttss2si\t{%1, %0|%0, %1}" + [(set_attr "type" "sseicvt") + (set_attr "mode" "DF") +- (set_attr "athlon_decode" "double,vector")]) ++ (set_attr "athlon_decode" "double,vector") ++ (set_attr "amdfam10_decode" "double,double")]) + + (define_insn "fix_truncdfsi_sse" + [(set (match_operand:SI 0 "register_operand" "=r,r") +@@ -4149,7 +4173,8 @@ + "cvttsd2si\t{%1, %0|%0, %1}" + [(set_attr "type" "sseicvt") + (set_attr "mode" "DF") +- (set_attr "athlon_decode" "double,vector")]) ++ (set_attr "athlon_decode" "double,vector") ++ (set_attr "amdfam10_decode" "double,double")]) + + ;; Avoid vector decoded forms of the instruction. + (define_peephole2 +@@ -4410,7 +4435,8 @@ + [(set_attr "length" "2") + (set_attr "mode" "HI") + (set_attr "unit" "i387") +- (set_attr "athlon_decode" "vector")]) ++ (set_attr "athlon_decode" "vector") ++ (set_attr "amdfam10_decode" "vector")]) + + ;; Conversion between fixed point and floating point. 
+ +@@ -4461,6 +4487,7 @@ + (set_attr "mode" "SF") + (set_attr "unit" "*,i387,*,*") + (set_attr "athlon_decode" "*,*,vector,double") ++ (set_attr "amdfam10_decode" "*,*,vector,double") + (set_attr "fp_int_src" "true")]) + + (define_insn "*floatsisf2_sse" +@@ -4471,6 +4498,7 @@ + [(set_attr "type" "sseicvt") + (set_attr "mode" "SF") + (set_attr "athlon_decode" "vector,double") ++ (set_attr "amdfam10_decode" "vector,double") + (set_attr "fp_int_src" "true")]) + + (define_insn "*floatsisf2_i387" +@@ -4504,6 +4532,7 @@ + (set_attr "mode" "SF") + (set_attr "unit" "*,i387,*,*") + (set_attr "athlon_decode" "*,*,vector,double") ++ (set_attr "amdfam10_decode" "*,*,vector,double") + (set_attr "fp_int_src" "true")]) + + (define_insn "*floatdisf2_sse" +@@ -4514,6 +4543,7 @@ + [(set_attr "type" "sseicvt") + (set_attr "mode" "SF") + (set_attr "athlon_decode" "vector,double") ++ (set_attr "amdfam10_decode" "vector,double") + (set_attr "fp_int_src" "true")]) + + (define_insn "*floatdisf2_i387" +@@ -4572,6 +4602,7 @@ + (set_attr "mode" "DF") + (set_attr "unit" "*,i387,*,*") + (set_attr "athlon_decode" "*,*,double,direct") ++ (set_attr "amdfam10_decode" "*,*,vector,double") + (set_attr "fp_int_src" "true")]) + + (define_insn "*floatsidf2_sse" +@@ -4582,6 +4613,7 @@ + [(set_attr "type" "sseicvt") + (set_attr "mode" "DF") + (set_attr "athlon_decode" "double,direct") ++ (set_attr "amdfam10_decode" "vector,double") + (set_attr "fp_int_src" "true")]) + + (define_insn "*floatsidf2_i387" +@@ -4615,6 +4647,7 @@ + (set_attr "mode" "DF") + (set_attr "unit" "*,i387,*,*") + (set_attr "athlon_decode" "*,*,double,direct") ++ (set_attr "amdfam10_decode" "*,*,vector,double") + (set_attr "fp_int_src" "true")]) + + (define_insn "*floatdidf2_sse" +@@ -4625,6 +4658,7 @@ + [(set_attr "type" "sseicvt") + (set_attr "mode" "DF") + (set_attr "athlon_decode" "double,direct") ++ (set_attr "amdfam10_decode" "vector,double") + (set_attr "fp_int_src" "true")]) + + (define_insn "*floatdidf2_i387" +@@ -6832,6 
+6866,14 @@ + "TARGET_64BIT" + "") + ++;; On AMDFAM10 ++;; IMUL reg64, reg64, imm8 Direct ++;; IMUL reg64, mem64, imm8 VectorPath ++;; IMUL reg64, reg64, imm32 Direct ++;; IMUL reg64, mem64, imm32 VectorPath ++;; IMUL reg64, reg64 Direct ++;; IMUL reg64, mem64 Direct ++ + (define_insn "*muldi3_1_rex64" + [(set (match_operand:DI 0 "register_operand" "=r,r,r") + (mult:DI (match_operand:DI 1 "nonimmediate_operand" "%rm,rm,0") +@@ -6854,6 +6896,11 @@ + (match_operand 1 "memory_operand" "")) + (const_string "vector")] + (const_string "direct"))) ++ (set (attr "amdfam10_decode") ++ (cond [(and (eq_attr "alternative" "0,1") ++ (match_operand 1 "memory_operand" "")) ++ (const_string "vector")] ++ (const_string "direct"))) + (set_attr "mode" "DI")]) + + (define_expand "mulsi3" +@@ -6864,6 +6911,14 @@ + "" + "") + ++;; On AMDFAM10 ++;; IMUL reg32, reg32, imm8 Direct ++;; IMUL reg32, mem32, imm8 VectorPath ++;; IMUL reg32, reg32, imm32 Direct ++;; IMUL reg32, mem32, imm32 VectorPath ++;; IMUL reg32, reg32 Direct ++;; IMUL reg32, mem32 Direct ++ + (define_insn "*mulsi3_1" + [(set (match_operand:SI 0 "register_operand" "=r,r,r") + (mult:SI (match_operand:SI 1 "nonimmediate_operand" "%rm,rm,0") +@@ -6885,6 +6940,11 @@ + (match_operand 1 "memory_operand" "")) + (const_string "vector")] + (const_string "direct"))) ++ (set (attr "amdfam10_decode") ++ (cond [(and (eq_attr "alternative" "0,1") ++ (match_operand 1 "memory_operand" "")) ++ (const_string "vector")] ++ (const_string "direct"))) + (set_attr "mode" "SI")]) + + (define_insn "*mulsi3_1_zext" +@@ -6910,6 +6970,11 @@ + (match_operand 1 "memory_operand" "")) + (const_string "vector")] + (const_string "direct"))) ++ (set (attr "amdfam10_decode") ++ (cond [(and (eq_attr "alternative" "0,1") ++ (match_operand 1 "memory_operand" "")) ++ (const_string "vector")] ++ (const_string "direct"))) + (set_attr "mode" "SI")]) + + (define_expand "mulhi3" +@@ -6920,6 +6985,13 @@ + "TARGET_HIMODE_MATH" + "") + ++;; On AMDFAM10 ++;; IMUL reg16, 
reg16, imm8 VectorPath ++;; IMUL reg16, mem16, imm8 VectorPath ++;; IMUL reg16, reg16, imm16 VectorPath ++;; IMUL reg16, mem16, imm16 VectorPath ++;; IMUL reg16, reg16 Direct ++;; IMUL reg16, mem16 Direct + (define_insn "*mulhi3_1" + [(set (match_operand:HI 0 "register_operand" "=r,r,r") + (mult:HI (match_operand:HI 1 "nonimmediate_operand" "%rm,rm,0") +@@ -6938,6 +7010,10 @@ + (eq_attr "alternative" "1,2") + (const_string "vector")] + (const_string "direct"))) ++ (set (attr "amdfam10_decode") ++ (cond [(eq_attr "alternative" "0,1") ++ (const_string "vector")] ++ (const_string "direct"))) + (set_attr "mode" "HI")]) + + (define_expand "mulqi3" +@@ -6948,6 +7024,10 @@ + "TARGET_QIMODE_MATH" + "") + ++;;On AMDFAM10 ++;; MUL reg8 Direct ++;; MUL mem8 Direct ++ + (define_insn "*mulqi3_1" + [(set (match_operand:QI 0 "register_operand" "=a") + (mult:QI (match_operand:QI 1 "nonimmediate_operand" "%0") +@@ -6962,6 +7042,7 @@ + (if_then_else (eq_attr "cpu" "athlon") + (const_string "vector") + (const_string "direct"))) ++ (set_attr "amdfam10_decode" "direct") + (set_attr "mode" "QI")]) + + (define_expand "umulqihi3" +@@ -6988,6 +7069,7 @@ + (if_then_else (eq_attr "cpu" "athlon") + (const_string "vector") + (const_string "direct"))) ++ (set_attr "amdfam10_decode" "direct") + (set_attr "mode" "QI")]) + + (define_expand "mulqihi3" +@@ -7012,6 +7094,7 @@ + (if_then_else (eq_attr "cpu" "athlon") + (const_string "vector") + (const_string "direct"))) ++ (set_attr "amdfam10_decode" "direct") + (set_attr "mode" "QI")]) + + (define_expand "umulditi3" +@@ -7038,6 +7121,7 @@ + (if_then_else (eq_attr "cpu" "athlon") + (const_string "vector") + (const_string "double"))) ++ (set_attr "amdfam10_decode" "double") + (set_attr "mode" "DI")]) + + ;; We can't use this pattern in 64bit mode, since it results in two separate 32bit registers +@@ -7065,6 +7149,7 @@ + (if_then_else (eq_attr "cpu" "athlon") + (const_string "vector") + (const_string "double"))) ++ (set_attr "amdfam10_decode" "double") 
+ (set_attr "mode" "SI")]) + + (define_expand "mulditi3" +@@ -7091,6 +7176,7 @@ + (if_then_else (eq_attr "cpu" "athlon") + (const_string "vector") + (const_string "double"))) ++ (set_attr "amdfam10_decode" "double") + (set_attr "mode" "DI")]) + + (define_expand "mulsidi3" +@@ -7117,6 +7203,7 @@ + (if_then_else (eq_attr "cpu" "athlon") + (const_string "vector") + (const_string "double"))) ++ (set_attr "amdfam10_decode" "double") + (set_attr "mode" "SI")]) + + (define_expand "umuldi3_highpart" +@@ -7153,6 +7240,7 @@ + (if_then_else (eq_attr "cpu" "athlon") + (const_string "vector") + (const_string "double"))) ++ (set_attr "amdfam10_decode" "double") + (set_attr "mode" "DI")]) + + (define_expand "umulsi3_highpart" +@@ -7188,6 +7276,7 @@ + (if_then_else (eq_attr "cpu" "athlon") + (const_string "vector") + (const_string "double"))) ++ (set_attr "amdfam10_decode" "double") + (set_attr "mode" "SI")]) + + (define_insn "*umulsi3_highpart_zext" +@@ -7210,6 +7299,7 @@ + (if_then_else (eq_attr "cpu" "athlon") + (const_string "vector") + (const_string "double"))) ++ (set_attr "amdfam10_decode" "double") + (set_attr "mode" "SI")]) + + (define_expand "smuldi3_highpart" +@@ -7245,6 +7335,7 @@ + (if_then_else (eq_attr "cpu" "athlon") + (const_string "vector") + (const_string "double"))) ++ (set_attr "amdfam10_decode" "double") + (set_attr "mode" "DI")]) + + (define_expand "smulsi3_highpart" +@@ -7279,6 +7370,7 @@ + (if_then_else (eq_attr "cpu" "athlon") + (const_string "vector") + (const_string "double"))) ++ (set_attr "amdfam10_decode" "double") + (set_attr "mode" "SI")]) + + (define_insn "*smulsi3_highpart_zext" +@@ -7300,6 +7392,7 @@ + (if_then_else (eq_attr "cpu" "athlon") + (const_string "vector") + (const_string "double"))) ++ (set_attr "amdfam10_decode" "double") + (set_attr "mode" "SI")]) + + ;; The patterns that match these are at the end of this file. 
+@@ -10281,7 +10374,8 @@ + [(set_attr "type" "ishift") + (set_attr "prefix_0f" "1") + (set_attr "mode" "DI") +- (set_attr "athlon_decode" "vector")]) ++ (set_attr "athlon_decode" "vector") ++ (set_attr "amdfam10_decode" "vector")]) + + (define_expand "x86_64_shift_adj" + [(set (reg:CCZ FLAGS_REG) +@@ -10496,7 +10590,8 @@ + (set_attr "prefix_0f" "1") + (set_attr "mode" "SI") + (set_attr "pent_pair" "np") +- (set_attr "athlon_decode" "vector")]) ++ (set_attr "athlon_decode" "vector") ++ (set_attr "amdfam10_decode" "vector")]) + + (define_expand "x86_shift_adj_1" + [(set (reg:CCZ FLAGS_REG) +@@ -11256,7 +11351,8 @@ + [(set_attr "type" "ishift") + (set_attr "prefix_0f" "1") + (set_attr "mode" "DI") +- (set_attr "athlon_decode" "vector")]) ++ (set_attr "athlon_decode" "vector") ++ (set_attr "amdfam10_decode" "vector")]) + + (define_expand "ashrdi3" + [(set (match_operand:DI 0 "shiftdi_operand" "") +@@ -14520,7 +14616,23 @@ + [(set (match_dup 0) (xor:SI (match_dup 0) (const_int 31))) + (clobber (reg:CC FLAGS_REG))])] + "" +- "") ++{ ++ if (TARGET_ABM) ++ { ++ emit_insn (gen_clzsi2_abm (operands[0], operands[1])); ++ DONE; ++ } ++}) ++ ++(define_insn "clzsi2_abm" ++ [(set (match_operand:SI 0 "register_operand" "=r") ++ (clz:SI (match_operand:SI 1 "nonimmediate_operand" "rm"))) ++ (clobber (reg:CC FLAGS_REG))] ++ "TARGET_ABM" ++ "lzcnt{l}\t{%1, %0|%0, %1}" ++ [(set_attr "prefix_rep" "1") ++ (set_attr "type" "bitmanip") ++ (set_attr "mode" "SI")]) + + (define_insn "*bsr" + [(set (match_operand:SI 0 "register_operand" "=r") +@@ -14529,7 +14641,44 @@ + (clobber (reg:CC FLAGS_REG))] + "" + "bsr{l}\t{%1, %0|%0, %1}" +- [(set_attr "prefix_0f" "1")]) ++ [(set_attr "prefix_0f" "1") ++ (set_attr "mode" "SI")]) ++ ++(define_insn "popcountsi2" ++ [(set (match_operand:SI 0 "register_operand" "=r") ++ (popcount:SI (match_operand:SI 1 "nonimmediate_operand" "rm"))) ++ (clobber (reg:CC FLAGS_REG))] ++ "TARGET_POPCNT" ++ "popcnt{l}\t{%1, %0|%0, %1}" ++ [(set_attr "prefix_rep" "1") ++
(set_attr "type" "bitmanip") ++ (set_attr "mode" "SI")]) ++ ++(define_insn "*popcountsi2_cmp" ++ [(set (reg FLAGS_REG) ++ (compare ++ (popcount:SI (match_operand:SI 1 "nonimmediate_operand" "rm")) ++ (const_int 0))) ++ (set (match_operand:SI 0 "register_operand" "=r") ++ (popcount:SI (match_dup 1)))] ++ "TARGET_POPCNT && ix86_match_ccmode (insn, CCZmode)" ++ "popcnt{l}\t{%1, %0|%0, %1}" ++ [(set_attr "prefix_rep" "1") ++ (set_attr "type" "bitmanip") ++ (set_attr "mode" "SI")]) ++ ++(define_insn "*popcountsi2_cmp_zext" ++ [(set (reg FLAGS_REG) ++ (compare ++ (popcount:SI (match_operand:SI 1 "nonimmediate_operand" "rm")) ++ (const_int 0))) ++ (set (match_operand:DI 0 "register_operand" "=r") ++ (zero_extend:DI(popcount:SI (match_dup 1))))] ++ "TARGET_64BIT && TARGET_POPCNT && ix86_match_ccmode (insn, CCZmode)" ++ "popcnt{l}\t{%1, %0|%0, %1}" ++ [(set_attr "prefix_rep" "1") ++ (set_attr "type" "bitmanip") ++ (set_attr "mode" "SI")]) + + (define_expand "clzdi2" + [(parallel +@@ -14541,7 +14690,23 @@ + [(set (match_dup 0) (xor:DI (match_dup 0) (const_int 63))) + (clobber (reg:CC FLAGS_REG))])] + "TARGET_64BIT" +- "") ++{ ++ if (TARGET_ABM) ++ { ++ emit_insn (gen_clzdi2_abm (operands[0], operands[1])); ++ DONE; ++ } ++}) ++ ++(define_insn "clzdi2_abm" ++ [(set (match_operand:DI 0 "register_operand" "=r") ++ (clz:DI (match_operand:DI 1 "nonimmediate_operand" "rm"))) ++ (clobber (reg:CC FLAGS_REG))] ++ "TARGET_64BIT && TARGET_ABM" ++ "lzcnt{q}\t{%1, %0|%0, %1}" ++ [(set_attr "prefix_rep" "1") ++ (set_attr "type" "bitmanip") ++ (set_attr "mode" "DI")]) + + (define_insn "*bsr_rex64" + [(set (match_operand:DI 0 "register_operand" "=r") +@@ -14550,7 +14715,92 @@ + (clobber (reg:CC FLAGS_REG))] + "TARGET_64BIT" + "bsr{q}\t{%1, %0|%0, %1}" +- [(set_attr "prefix_0f" "1")]) ++ [(set_attr "prefix_0f" "1") ++ (set_attr "mode" "DI")]) ++ ++(define_insn "popcountdi2" ++ [(set (match_operand:DI 0 "register_operand" "=r") ++ (popcount:DI (match_operand:DI 1 "nonimmediate_operand" "rm"))) ++
(clobber (reg:CC FLAGS_REG))] ++ "TARGET_64BIT && TARGET_POPCNT" ++ "popcnt{q}\t{%1, %0|%0, %1}" ++ [(set_attr "prefix_rep" "1") ++ (set_attr "type" "bitmanip") ++ (set_attr "mode" "DI")]) ++ ++(define_insn "*popcountdi2_cmp" ++ [(set (reg FLAGS_REG) ++ (compare ++ (popcount:DI (match_operand:DI 1 "nonimmediate_operand" "rm")) ++ (const_int 0))) ++ (set (match_operand:DI 0 "register_operand" "=r") ++ (popcount:DI (match_dup 1)))] ++ "TARGET_64BIT && TARGET_POPCNT && ix86_match_ccmode (insn, CCZmode)" ++ "popcnt{q}\t{%1, %0|%0, %1}" ++ [(set_attr "prefix_rep" "1") ++ (set_attr "type" "bitmanip") ++ (set_attr "mode" "DI")]) ++ ++(define_expand "clzhi2" ++ [(parallel ++ [(set (match_operand:HI 0 "register_operand" "") ++ (minus:HI (const_int 15) ++ (clz:HI (match_operand:HI 1 "nonimmediate_operand" "")))) ++ (clobber (reg:CC FLAGS_REG))]) ++ (parallel ++ [(set (match_dup 0) (xor:HI (match_dup 0) (const_int 15))) ++ (clobber (reg:CC FLAGS_REG))])] ++ "" ++{ ++ if (TARGET_ABM) ++ { ++ emit_insn (gen_clzhi2_abm (operands[0], operands[1])); ++ DONE; ++ } ++}) ++ ++(define_insn "clzhi2_abm" ++ [(set (match_operand:HI 0 "register_operand" "=r") ++ (clz:HI (match_operand:HI 1 "nonimmediate_operand" "rm"))) ++ (clobber (reg:CC FLAGS_REG))] ++ "TARGET_ABM" ++ "lzcnt{w}\t{%1, %0|%0, %1}" ++ [(set_attr "prefix_rep" "1") ++ (set_attr "type" "bitmanip") ++ (set_attr "mode" "HI")]) ++ ++(define_insn "*bsrhi" ++ [(set (match_operand:HI 0 "register_operand" "=r") ++ (minus:HI (const_int 15) ++ (clz:HI (match_operand:HI 1 "nonimmediate_operand" "rm")))) ++ (clobber (reg:CC FLAGS_REG))] ++ "" ++ "bsr{w}\t{%1, %0|%0, %1}" ++ [(set_attr "prefix_0f" "1") ++ (set_attr "mode" "HI")]) ++ ++(define_insn "popcounthi2" ++ [(set (match_operand:HI 0 "register_operand" "=r") ++ (popcount:HI (match_operand:HI 1 "nonimmediate_operand" "rm"))) ++ (clobber (reg:CC FLAGS_REG))] ++ "TARGET_POPCNT" ++ "popcnt{w}\t{%1, %0|%0, %1}" ++ [(set_attr "prefix_rep" "1") ++ (set_attr "type" "bitmanip") ++ (set_attr
"mode" "HI")]) ++ ++(define_insn "*popcounthi2_cmp" ++ [(set (reg FLAGS_REG) ++ (compare ++ (popcount:HI (match_operand:HI 1 "nonimmediate_operand" "rm")) ++ (const_int 0))) ++ (set (match_operand:HI 0 "register_operand" "=r") ++ (popcount:HI (match_dup 1)))] ++ "TARGET_POPCNT && ix86_match_ccmode (insn, CCZmode)" ++ "popcnt{w}\t{%1, %0|%0, %1}" ++ [(set_attr "prefix_rep" "1") ++ (set_attr "type" "bitmanip") ++ (set_attr "mode" "HI")]) + + ;; Thread-local storage patterns for ELF. + ;; +@@ -15302,7 +15552,8 @@ + sqrtss\t{%1, %0|%0, %1}" + [(set_attr "type" "fpspc,sse") + (set_attr "mode" "SF,SF") +- (set_attr "athlon_decode" "direct,*")]) ++ (set_attr "athlon_decode" "direct,*") ++ (set_attr "amdfam10_decode" "direct,*")]) + + (define_insn "*sqrtsf2_sse" + [(set (match_operand:SF 0 "register_operand" "=x") +@@ -15311,7 +15562,8 @@ + "sqrtss\t{%1, %0|%0, %1}" + [(set_attr "type" "sse") + (set_attr "mode" "SF") +- (set_attr "athlon_decode" "*")]) ++ (set_attr "athlon_decode" "*") ++ (set_attr "amdfam10_decode" "*")]) + + (define_insn "*sqrtsf2_i387" + [(set (match_operand:SF 0 "register_operand" "=f") +@@ -15320,7 +15572,8 @@ + "fsqrt" + [(set_attr "type" "fpspc") + (set_attr "mode" "SF") +- (set_attr "athlon_decode" "direct")]) ++ (set_attr "athlon_decode" "direct") ++ (set_attr "amdfam10_decode" "direct")]) + + (define_expand "sqrtdf2" + [(set (match_operand:DF 0 "register_operand" "") +@@ -15399,7 +15652,8 @@ + "fsqrt" + [(set_attr "type" "fpspc") + (set_attr "mode" "XF") +- (set_attr "athlon_decode" "direct")]) ++ (set_attr "athlon_decode" "direct") ++ (set_attr "amdfam10_decode" "direct")]) + + (define_insn "fpremxf4" + [(set (match_operand:XF 0 "register_operand" "=f") +@@ -20186,7 +20440,7 @@ + (mult:DI (match_operand:DI 1 "memory_operand" "") + (match_operand:DI 2 "immediate_operand" ""))) + (clobber (reg:CC FLAGS_REG))])] +- "(TARGET_K8 || TARGET_GENERIC64) && !optimize_size ++ "(TARGET_K8 || TARGET_GENERIC64 || TARGET_AMDFAM10) && !optimize_size + && 
(GET_CODE (operands[2]) != CONST_INT + || !CONST_OK_FOR_LETTER_P (INTVAL (operands[2]), 'K'))" + [(set (match_dup 3) (match_dup 1)) +@@ -20200,7 +20454,7 @@ + (mult:SI (match_operand:SI 1 "memory_operand" "") + (match_operand:SI 2 "immediate_operand" ""))) + (clobber (reg:CC FLAGS_REG))])] +- "(TARGET_K8 || TARGET_GENERIC64) && !optimize_size ++ "(TARGET_K8 || TARGET_GENERIC64 || TARGET_AMDFAM10) && !optimize_size + && (GET_CODE (operands[2]) != CONST_INT + || !CONST_OK_FOR_LETTER_P (INTVAL (operands[2]), 'K'))" + [(set (match_dup 3) (match_dup 1)) +@@ -20215,7 +20469,7 @@ + (mult:SI (match_operand:SI 1 "memory_operand" "") + (match_operand:SI 2 "immediate_operand" "")))) + (clobber (reg:CC FLAGS_REG))])] +- "(TARGET_K8 || TARGET_GENERIC64) && !optimize_size ++ "(TARGET_K8 || TARGET_GENERIC64 || TARGET_AMDFAM10) && !optimize_size + && (GET_CODE (operands[2]) != CONST_INT + || !CONST_OK_FOR_LETTER_P (INTVAL (operands[2]), 'K'))" + [(set (match_dup 3) (match_dup 1)) +@@ -20233,7 +20487,7 @@ + (match_operand:DI 2 "const_int_operand" ""))) + (clobber (reg:CC FLAGS_REG))]) + (match_scratch:DI 3 "r")] +- "(TARGET_K8 || TARGET_GENERIC64) && !optimize_size ++ "(TARGET_K8 || TARGET_GENERIC64 || TARGET_AMDFAM10) && !optimize_size + && CONST_OK_FOR_LETTER_P (INTVAL (operands[2]), 'K')" + [(set (match_dup 3) (match_dup 2)) + (parallel [(set (match_dup 0) (mult:DI (match_dup 0) (match_dup 3))) +@@ -20249,7 +20503,7 @@ + (match_operand:SI 2 "const_int_operand" ""))) + (clobber (reg:CC FLAGS_REG))]) + (match_scratch:SI 3 "r")] +- "(TARGET_K8 || TARGET_GENERIC64) && !optimize_size ++ "(TARGET_K8 || TARGET_GENERIC64 || TARGET_AMDFAM10) && !optimize_size + && CONST_OK_FOR_LETTER_P (INTVAL (operands[2]), 'K')" + [(set (match_dup 3) (match_dup 2)) + (parallel [(set (match_dup 0) (mult:SI (match_dup 0) (match_dup 3))) +@@ -20265,7 +20519,7 @@ + (match_operand:HI 2 "immediate_operand" ""))) + (clobber (reg:CC FLAGS_REG))]) + (match_scratch:HI 3 "r")] +- "(TARGET_K8 || TARGET_GENERIC64) 
&& !optimize_size" ++ "(TARGET_K8 || TARGET_GENERIC64 || TARGET_AMDFAM10) && !optimize_size" + [(set (match_dup 3) (match_dup 2)) + (parallel [(set (match_dup 0) (mult:HI (match_dup 0) (match_dup 3))) + (clobber (reg:CC FLAGS_REG))])] +--- gcc/config/i386/athlon.md.jj 2006-10-29 20:56:45.000000000 +0100 ++++ gcc/config/i386/athlon.md 2007-02-09 21:26:06.000000000 +0100 +@@ -29,6 +29,8 @@ + (const_string "vector")] + (const_string "direct"))) + ++(define_attr "amdfam10_decode" "direct,vector,double" ++ (const_string "direct")) + ;; + ;; decode0 decode1 decode2 + ;; \ | / +@@ -131,18 +133,22 @@ + + ;; Jump instructions are executed in the branch unit completely transparent to us + (define_insn_reservation "athlon_branch" 0 +- (and (eq_attr "cpu" "athlon,k8,generic64") ++ (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") + (eq_attr "type" "ibr")) + "athlon-direct,athlon-ieu") + (define_insn_reservation "athlon_call" 0 + (and (eq_attr "cpu" "athlon,k8,generic64") + (eq_attr "type" "call,callv")) + "athlon-vector,athlon-ieu") ++(define_insn_reservation "athlon_call_amdfam10" 0 ++ (and (eq_attr "cpu" "amdfam10") ++ (eq_attr "type" "call,callv")) ++ "athlon-double,athlon-ieu") + + ;; Latency of push operation is 3 cycles, but ESP value is available + ;; earlier + (define_insn_reservation "athlon_push" 2 +- (and (eq_attr "cpu" "athlon,k8,generic64") ++ (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") + (eq_attr "type" "push")) + "athlon-direct,athlon-agu,athlon-store") + (define_insn_reservation "athlon_pop" 4 +@@ -153,12 +159,16 @@ + (and (eq_attr "cpu" "k8,generic64") + (eq_attr "type" "pop")) + "athlon-double,(athlon-ieu+athlon-load)") ++(define_insn_reservation "athlon_pop_amdfam10" 3 ++ (and (eq_attr "cpu" "amdfam10") ++ (eq_attr "type" "pop")) ++ "athlon-direct,(athlon-ieu+athlon-load)") + (define_insn_reservation "athlon_leave" 3 + (and (eq_attr "cpu" "athlon") + (eq_attr "type" "leave")) + "athlon-vector,(athlon-ieu+athlon-load)") + (define_insn_reservation 
"athlon_leave_k8" 3 +- (and (eq_attr "cpu" "k8,generic64") ++ (and (eq_attr "cpu" "k8,generic64,amdfam10") + (eq_attr "type" "leave")) + "athlon-double,(athlon-ieu+athlon-load)") + +@@ -167,6 +177,11 @@ + (and (eq_attr "cpu" "athlon,k8,generic64") + (eq_attr "type" "lea")) + "athlon-direct,athlon-agu,nothing") ++;; Lea executes in AGU unit with 1 cycle latency on AMDFAM10 ++(define_insn_reservation "athlon_lea_amdfam10" 1 ++ (and (eq_attr "cpu" "amdfam10") ++ (eq_attr "type" "lea")) ++ "athlon-direct,athlon-agu,nothing") + + ;; Mul executes in special multiplier unit attached to IEU0 + (define_insn_reservation "athlon_imul" 5 +@@ -176,29 +191,35 @@ + "athlon-vector,athlon-ieu0,athlon-mult,nothing,nothing,athlon-ieu0") + ;; ??? Widening multiply is vector or double. + (define_insn_reservation "athlon_imul_k8_DI" 4 +- (and (eq_attr "cpu" "k8,generic64") ++ (and (eq_attr "cpu" "k8,generic64,amdfam10") + (and (eq_attr "type" "imul") + (and (eq_attr "mode" "DI") + (eq_attr "memory" "none,unknown")))) + "athlon-direct0,athlon-ieu0,athlon-mult,nothing,athlon-ieu0") + (define_insn_reservation "athlon_imul_k8" 3 +- (and (eq_attr "cpu" "k8,generic64") ++ (and (eq_attr "cpu" "k8,generic64,amdfam10") + (and (eq_attr "type" "imul") + (eq_attr "memory" "none,unknown"))) + "athlon-direct0,athlon-ieu0,athlon-mult,athlon-ieu0") ++(define_insn_reservation "athlon_imul_amdfam10_HI" 4 ++ (and (eq_attr "cpu" "amdfam10") ++ (and (eq_attr "type" "imul") ++ (and (eq_attr "mode" "HI") ++ (eq_attr "memory" "none,unknown")))) ++ "athlon-vector,athlon-ieu0,athlon-mult,nothing,athlon-ieu0") + (define_insn_reservation "athlon_imul_mem" 8 + (and (eq_attr "cpu" "athlon") + (and (eq_attr "type" "imul") + (eq_attr "memory" "load,both"))) + "athlon-vector,athlon-load,athlon-ieu,athlon-mult,nothing,nothing,athlon-ieu") + (define_insn_reservation "athlon_imul_mem_k8_DI" 7 +- (and (eq_attr "cpu" "k8,generic64") ++ (and (eq_attr "cpu" "k8,generic64,amdfam10") + (and (eq_attr "type" "imul") + (and 
(eq_attr "mode" "DI") + (eq_attr "memory" "load,both")))) + "athlon-vector,athlon-load,athlon-ieu,athlon-mult,nothing,athlon-ieu") + (define_insn_reservation "athlon_imul_mem_k8" 6 +- (and (eq_attr "cpu" "k8,generic64") ++ (and (eq_attr "cpu" "k8,generic64,amdfam10") + (and (eq_attr "type" "imul") + (eq_attr "memory" "load,both"))) + "athlon-vector,athlon-load,athlon-ieu,athlon-mult,athlon-ieu") +@@ -209,21 +230,23 @@ + ;; other instructions. + ;; ??? Experiments show that the idiv can overlap with roughly 6 cycles + ;; of the other code ++;; Using the same heuristics for amdfam10 as K8 with idiv + + (define_insn_reservation "athlon_idiv" 6 +- (and (eq_attr "cpu" "athlon,k8,generic64") ++ (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") + (and (eq_attr "type" "idiv") + (eq_attr "memory" "none,unknown"))) + "athlon-vector,(athlon-ieu0*6+(athlon-fpsched,athlon-fvector))") + (define_insn_reservation "athlon_idiv_mem" 9 +- (and (eq_attr "cpu" "athlon,k8,generic64") ++ (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") + (and (eq_attr "type" "idiv") + (eq_attr "memory" "load,both"))) + "athlon-vector,((athlon-load,athlon-ieu0*6)+(athlon-fpsched,athlon-fvector))") + ;; The parallelism of string instructions is not documented. Model it same way + ;; as idiv to create smaller automata. This probably does not matter much. 
++;; Using the same heuristics for amdfam10 as K8 with idiv + (define_insn_reservation "athlon_str" 6 +- (and (eq_attr "cpu" "athlon,k8,generic64") ++ (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") + (and (eq_attr "type" "str") + (eq_attr "memory" "load,both,store"))) + "athlon-vector,athlon-load,athlon-ieu0*6") +@@ -234,34 +257,62 @@ + (and (eq_attr "unit" "integer,unknown") + (eq_attr "memory" "none,unknown")))) + "athlon-direct,athlon-ieu") ++(define_insn_reservation "athlon_idirect_amdfam10" 1 ++ (and (eq_attr "cpu" "amdfam10") ++ (and (eq_attr "amdfam10_decode" "direct") ++ (and (eq_attr "unit" "integer,unknown") ++ (eq_attr "memory" "none,unknown")))) ++ "athlon-direct,athlon-ieu") + (define_insn_reservation "athlon_ivector" 2 + (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "athlon_decode" "vector") + (and (eq_attr "unit" "integer,unknown") + (eq_attr "memory" "none,unknown")))) + "athlon-vector,athlon-ieu,athlon-ieu") ++(define_insn_reservation "athlon_ivector_amdfam10" 2 ++ (and (eq_attr "cpu" "amdfam10") ++ (and (eq_attr "amdfam10_decode" "vector") ++ (and (eq_attr "unit" "integer,unknown") ++ (eq_attr "memory" "none,unknown")))) ++ "athlon-vector,athlon-ieu,athlon-ieu") ++ + (define_insn_reservation "athlon_idirect_loadmov" 3 +- (and (eq_attr "cpu" "athlon,k8,generic64") ++ (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") + (and (eq_attr "type" "imov") + (eq_attr "memory" "load"))) + "athlon-direct,athlon-load") ++ + (define_insn_reservation "athlon_idirect_load" 4 + (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "athlon_decode" "direct") + (and (eq_attr "unit" "integer,unknown") + (eq_attr "memory" "load")))) + "athlon-direct,athlon-load,athlon-ieu") ++(define_insn_reservation "athlon_idirect_load_amdfam10" 4 ++ (and (eq_attr "cpu" "amdfam10") ++ (and (eq_attr "amdfam10_decode" "direct") ++ (and (eq_attr "unit" "integer,unknown") ++ (eq_attr "memory" "load")))) ++ "athlon-direct,athlon-load,athlon-ieu") + 
(define_insn_reservation "athlon_ivector_load" 6 + (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "athlon_decode" "vector") + (and (eq_attr "unit" "integer,unknown") + (eq_attr "memory" "load")))) + "athlon-vector,athlon-load,athlon-ieu,athlon-ieu") ++(define_insn_reservation "athlon_ivector_load_amdfam10" 6 ++ (and (eq_attr "cpu" "amdfam10") ++ (and (eq_attr "amdfam10_decode" "vector") ++ (and (eq_attr "unit" "integer,unknown") ++ (eq_attr "memory" "load")))) ++ "athlon-vector,athlon-load,athlon-ieu,athlon-ieu") ++ + (define_insn_reservation "athlon_idirect_movstore" 1 +- (and (eq_attr "cpu" "athlon,k8,generic64") ++ (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") + (and (eq_attr "type" "imov") + (eq_attr "memory" "store"))) + "athlon-direct,athlon-agu,athlon-store") ++ + (define_insn_reservation "athlon_idirect_both" 4 + (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "athlon_decode" "direct") +@@ -270,6 +321,15 @@ + "athlon-direct,athlon-load, + athlon-ieu,athlon-store, + athlon-store") ++(define_insn_reservation "athlon_idirect_both_amdfam10" 4 ++ (and (eq_attr "cpu" "amdfam10") ++ (and (eq_attr "amdfam10_decode" "direct") ++ (and (eq_attr "unit" "integer,unknown") ++ (eq_attr "memory" "both")))) ++ "athlon-direct,athlon-load, ++ athlon-ieu,athlon-store, ++ athlon-store") ++ + (define_insn_reservation "athlon_ivector_both" 6 + (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "athlon_decode" "vector") +@@ -279,6 +339,16 @@ + athlon-ieu, + athlon-ieu, + athlon-store") ++(define_insn_reservation "athlon_ivector_both_amdfam10" 6 ++ (and (eq_attr "cpu" "amdfam10") ++ (and (eq_attr "amdfam10_decode" "vector") ++ (and (eq_attr "unit" "integer,unknown") ++ (eq_attr "memory" "both")))) ++ "athlon-vector,athlon-load, ++ athlon-ieu, ++ athlon-ieu, ++ athlon-store") ++ + (define_insn_reservation "athlon_idirect_store" 1 + (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "athlon_decode" "direct") +@@ -286,6 +356,14 @@ + 
(eq_attr "memory" "store")))) + "athlon-direct,(athlon-ieu+athlon-agu), + athlon-store") ++(define_insn_reservation "athlon_idirect_store_amdfam10" 1 ++ (and (eq_attr "cpu" "amdfam10") ++ (and (eq_attr "amdfam10_decode" "direct") ++ (and (eq_attr "unit" "integer,unknown") ++ (eq_attr "memory" "store")))) ++ "athlon-direct,(athlon-ieu+athlon-agu), ++ athlon-store") ++ + (define_insn_reservation "athlon_ivector_store" 2 + (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "athlon_decode" "vector") +@@ -293,6 +371,13 @@ + (eq_attr "memory" "store")))) + "athlon-vector,(athlon-ieu+athlon-agu),athlon-ieu, + athlon-store") ++(define_insn_reservation "athlon_ivector_store_amdfam10" 2 ++ (and (eq_attr "cpu" "amdfam10") ++ (and (eq_attr "amdfam10_decode" "vector") ++ (and (eq_attr "unit" "integer,unknown") ++ (eq_attr "memory" "store")))) ++ "athlon-vector,(athlon-ieu+athlon-agu),athlon-ieu, ++ athlon-store") + + ;; Athlon floatin point unit + (define_insn_reservation "athlon_fldxf" 12 +@@ -302,7 +387,7 @@ + (eq_attr "mode" "XF")))) + "athlon-vector,athlon-fpload2,athlon-fvector*9") + (define_insn_reservation "athlon_fldxf_k8" 13 +- (and (eq_attr "cpu" "k8,generic64") ++ (and (eq_attr "cpu" "k8,generic64,amdfam10") + (and (eq_attr "type" "fmov") + (and (eq_attr "memory" "load") + (eq_attr "mode" "XF")))) +@@ -314,7 +399,7 @@ + (eq_attr "memory" "load"))) + "athlon-direct,athlon-fpload,athlon-fany") + (define_insn_reservation "athlon_fld_k8" 2 +- (and (eq_attr "cpu" "k8,generic64") ++ (and (eq_attr "cpu" "k8,generic64,amdfam10") + (and (eq_attr "type" "fmov") + (eq_attr "memory" "load"))) + "athlon-direct,athlon-fploadk8,athlon-fstore") +@@ -326,7 +411,7 @@ + (eq_attr "mode" "XF")))) + "athlon-vector,(athlon-fpsched+athlon-agu),(athlon-store2+(athlon-fvector*7))") + (define_insn_reservation "athlon_fstxf_k8" 8 +- (and (eq_attr "cpu" "k8,generic64") ++ (and (eq_attr "cpu" "k8,generic64,amdfam10") + (and (eq_attr "type" "fmov") + (and (eq_attr "memory" "store,both") + 
(eq_attr "mode" "XF")))) +@@ -337,16 +422,16 @@ + (eq_attr "memory" "store,both"))) + "athlon-direct,(athlon-fpsched+athlon-agu),(athlon-fstore+athlon-store)") + (define_insn_reservation "athlon_fst_k8" 2 +- (and (eq_attr "cpu" "k8,generic64") ++ (and (eq_attr "cpu" "k8,generic64,amdfam10") + (and (eq_attr "type" "fmov") + (eq_attr "memory" "store,both"))) + "athlon-direct,(athlon-fpsched+athlon-agu),(athlon-fstore+athlon-store)") + (define_insn_reservation "athlon_fist" 4 +- (and (eq_attr "cpu" "athlon,k8,generic64") ++ (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") + (eq_attr "type" "fistp,fisttp")) + "athlon-direct,(athlon-fpsched+athlon-agu),(athlon-fstore+athlon-store)") + (define_insn_reservation "athlon_fmov" 2 +- (and (eq_attr "cpu" "athlon,k8,generic64") ++ (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") + (eq_attr "type" "fmov")) + "athlon-direct,athlon-fpsched,athlon-faddmul") + (define_insn_reservation "athlon_fadd_load" 4 +@@ -355,12 +440,12 @@ + (eq_attr "memory" "load"))) + "athlon-direct,athlon-fpload,athlon-fadd") + (define_insn_reservation "athlon_fadd_load_k8" 6 +- (and (eq_attr "cpu" "k8,generic64") ++ (and (eq_attr "cpu" "k8,generic64,amdfam10") + (and (eq_attr "type" "fop") + (eq_attr "memory" "load"))) + "athlon-direct,athlon-fploadk8,athlon-fadd") + (define_insn_reservation "athlon_fadd" 4 +- (and (eq_attr "cpu" "athlon,k8,generic64") ++ (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") + (eq_attr "type" "fop")) + "athlon-direct,athlon-fpsched,athlon-fadd") + (define_insn_reservation "athlon_fmul_load" 4 +@@ -369,16 +454,16 @@ + (eq_attr "memory" "load"))) + "athlon-direct,athlon-fpload,athlon-fmul") + (define_insn_reservation "athlon_fmul_load_k8" 6 +- (and (eq_attr "cpu" "k8,generic64") ++ (and (eq_attr "cpu" "k8,generic64,amdfam10") + (and (eq_attr "type" "fmul") + (eq_attr "memory" "load"))) + "athlon-direct,athlon-fploadk8,athlon-fmul") + (define_insn_reservation "athlon_fmul" 4 +- (and (eq_attr "cpu" "athlon,k8,generic64") 
++ (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") + (eq_attr "type" "fmul")) + "athlon-direct,athlon-fpsched,athlon-fmul") + (define_insn_reservation "athlon_fsgn" 2 +- (and (eq_attr "cpu" "athlon,k8,generic64") ++ (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") + (eq_attr "type" "fsgn")) + "athlon-direct,athlon-fpsched,athlon-fmul") + (define_insn_reservation "athlon_fdiv_load" 24 +@@ -387,7 +472,7 @@ + (eq_attr "memory" "load"))) + "athlon-direct,athlon-fpload,athlon-fmul") + (define_insn_reservation "athlon_fdiv_load_k8" 13 +- (and (eq_attr "cpu" "k8,generic64") ++ (and (eq_attr "cpu" "k8,generic64,amdfam10") + (and (eq_attr "type" "fdiv") + (eq_attr "memory" "load"))) + "athlon-direct,athlon-fploadk8,athlon-fmul") +@@ -396,16 +481,16 @@ + (eq_attr "type" "fdiv")) + "athlon-direct,athlon-fpsched,athlon-fmul") + (define_insn_reservation "athlon_fdiv_k8" 11 +- (and (eq_attr "cpu" "k8,generic64") ++ (and (eq_attr "cpu" "k8,generic64,amdfam10") + (eq_attr "type" "fdiv")) + "athlon-direct,athlon-fpsched,athlon-fmul") + (define_insn_reservation "athlon_fpspc_load" 103 +- (and (eq_attr "cpu" "athlon,k8,generic64") ++ (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") + (and (eq_attr "type" "fpspc") + (eq_attr "memory" "load"))) + "athlon-vector,athlon-fpload,athlon-fvector") + (define_insn_reservation "athlon_fpspc" 100 +- (and (eq_attr "cpu" "athlon,k8,generic64") ++ (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") + (eq_attr "type" "fpspc")) + "athlon-vector,athlon-fpsched,athlon-fvector") + (define_insn_reservation "athlon_fcmov_load" 7 +@@ -418,12 +503,12 @@ + (eq_attr "type" "fcmov")) + "athlon-vector,athlon-fpsched,athlon-fvector") + (define_insn_reservation "athlon_fcmov_load_k8" 17 +- (and (eq_attr "cpu" "k8,generic64") ++ (and (eq_attr "cpu" "k8,generic64,amdfam10") + (and (eq_attr "type" "fcmov") + (eq_attr "memory" "load"))) + "athlon-vector,athlon-fploadk8,athlon-fvector") + (define_insn_reservation "athlon_fcmov_k8" 15 +- (and (eq_attr "cpu" 
"k8,generic64") ++ (and (eq_attr "cpu" "k8,generic64,amdfam10") + (eq_attr "type" "fcmov")) + "athlon-vector,athlon-fpsched,athlon-fvector") + ;; fcomi is vector decoded by uses only one pipe. +@@ -434,13 +519,13 @@ + (eq_attr "memory" "load")))) + "athlon-vector,athlon-fpload,athlon-fadd") + (define_insn_reservation "athlon_fcomi_load_k8" 5 +- (and (eq_attr "cpu" "k8,generic64") ++ (and (eq_attr "cpu" "k8,generic64,amdfam10") + (and (eq_attr "type" "fcmp") + (and (eq_attr "athlon_decode" "vector") + (eq_attr "memory" "load")))) + "athlon-vector,athlon-fploadk8,athlon-fadd") + (define_insn_reservation "athlon_fcomi" 3 +- (and (eq_attr "cpu" "athlon,k8,generic64") ++ (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") + (and (eq_attr "athlon_decode" "vector") + (eq_attr "type" "fcmp"))) + "athlon-vector,athlon-fpsched,athlon-fadd") +@@ -450,18 +535,18 @@ + (eq_attr "memory" "load"))) + "athlon-direct,athlon-fpload,athlon-fadd") + (define_insn_reservation "athlon_fcom_load_k8" 4 +- (and (eq_attr "cpu" "k8,generic64") ++ (and (eq_attr "cpu" "k8,generic64,amdfam10") + (and (eq_attr "type" "fcmp") + (eq_attr "memory" "load"))) + "athlon-direct,athlon-fploadk8,athlon-fadd") + (define_insn_reservation "athlon_fcom" 2 +- (and (eq_attr "cpu" "athlon,k8,generic64") ++ (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") + (eq_attr "type" "fcmp")) + "athlon-direct,athlon-fpsched,athlon-fadd") + ;; Never seen by the scheduler because we still don't do post reg-stack + ;; scheduling. 
+ ;(define_insn_reservation "athlon_fxch" 2 +-; (and (eq_attr "cpu" "athlon,k8,generic64") ++; (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") + ; (eq_attr "type" "fxch")) + ; "athlon-direct,athlon-fpsched,athlon-fany") + +@@ -516,6 +601,23 @@ + (and (eq_attr "type" "mmxmov,ssemov") + (eq_attr "memory" "load"))) + "athlon-direct,athlon-fploadk8,athlon-fstore") ++;; On AMDFAM10 all double, single and integer packed and scalar SSEx data ++;; loads generated are direct path, latency of 2 and do not use any FP ++;; execution units. No separate entries for movlpx/movhpx loads, which ++;; are direct path, latency of 4 and use the FADD/FMUL FP execution units, ++;; as they will not be generated. ++(define_insn_reservation "athlon_sseld_amdfam10" 2 ++ (and (eq_attr "cpu" "amdfam10") ++ (and (eq_attr "type" "ssemov") ++ (eq_attr "memory" "load"))) ++ "athlon-direct,athlon-fploadk8") ++;; On AMDFAM10 MMX data loads generated are direct path, latency of 4 ++;; and can use any FP execution units ++(define_insn_reservation "athlon_mmxld_amdfam10" 4 ++ (and (eq_attr "cpu" "amdfam10") ++ (and (eq_attr "type" "mmxmov") ++ (eq_attr "memory" "load"))) ++ "athlon-direct,athlon-fploadk8, athlon-fany") + (define_insn_reservation "athlon_mmxssest" 3 + (and (eq_attr "cpu" "k8,generic64") + (and (eq_attr "type" "mmxmov,ssemov") +@@ -533,6 +635,25 @@ + (and (eq_attr "type" "mmxmov,ssemov") + (eq_attr "memory" "store,both"))) + "athlon-direct,(athlon-fpsched+athlon-agu),(athlon-fstore+athlon-store)") ++;; On AMDFAM10 all double, single and integer packed SSEx data stores ++;; generated are all double path, latency of 2 and use the FSTORE FP ++;; execution unit. No separate entries for movupx/movdqu, which are ++;; vector path, latency of 3 and use the FSTORE*2 FP execution unit, ++;; as they will not be generated.
++(define_insn_reservation "athlon_ssest_amdfam10" 2 ++ (and (eq_attr "cpu" "amdfam10") ++ (and (eq_attr "type" "ssemov") ++ (and (eq_attr "mode" "V4SF,V2DF,TI") ++ (eq_attr "memory" "store,both")))) ++ "athlon-double,(athlon-fpsched+athlon-agu),((athlon-fstore+athlon-store)*2)") ++;; On AMDFAM10 all double, single and integer scalar SSEx and MMX ++;; data stores generated are all direct path, latency of 2 and use ++;; the FSTORE FP execution unit ++(define_insn_reservation "athlon_mmxssest_short_amdfam10" 2 ++ (and (eq_attr "cpu" "amdfam10") ++ (and (eq_attr "type" "mmxmov,ssemov") ++ (eq_attr "memory" "store,both"))) ++ "athlon-direct,(athlon-fpsched+athlon-agu),(athlon-fstore+athlon-store)") + (define_insn_reservation "athlon_movaps_k8" 2 + (and (eq_attr "cpu" "k8,generic64") + (and (eq_attr "type" "ssemov") +@@ -578,6 +699,11 @@ + (and (eq_attr "type" "sselog,sselog1") + (eq_attr "memory" "load"))) + "athlon-double,athlon-fpload2k8,(athlon-fmul*2)") ++(define_insn_reservation "athlon_sselog_load_amdfam10" 4 ++ (and (eq_attr "cpu" "amdfam10") ++ (and (eq_attr "type" "sselog,sselog1") ++ (eq_attr "memory" "load"))) ++ "athlon-direct,athlon-fploadk8,(athlon-fadd|athlon-fmul)") + (define_insn_reservation "athlon_sselog" 3 + (and (eq_attr "cpu" "athlon") + (eq_attr "type" "sselog,sselog1")) +@@ -586,6 +712,11 @@ + (and (eq_attr "cpu" "k8,generic64") + (eq_attr "type" "sselog,sselog1")) + "athlon-double,athlon-fpsched,athlon-fmul") ++(define_insn_reservation "athlon_sselog_amdfam10" 2 ++ (and (eq_attr "cpu" "amdfam10") ++ (eq_attr "type" "sselog,sselog1")) ++ "athlon-direct,athlon-fpsched,(athlon-fadd|athlon-fmul)") ++ + ;; ??? pcmp executes in addmul, probably not worthwhile to bother about that. 
+ (define_insn_reservation "athlon_ssecmp_load" 2 + (and (eq_attr "cpu" "athlon") +@@ -594,13 +725,13 @@ + (eq_attr "memory" "load")))) + "athlon-direct,athlon-fpload,athlon-fadd") + (define_insn_reservation "athlon_ssecmp_load_k8" 4 +- (and (eq_attr "cpu" "k8,generic64") ++ (and (eq_attr "cpu" "k8,generic64,amdfam10") + (and (eq_attr "type" "ssecmp") + (and (eq_attr "mode" "SF,DF,DI,TI") + (eq_attr "memory" "load")))) + "athlon-direct,athlon-fploadk8,athlon-fadd") + (define_insn_reservation "athlon_ssecmp" 2 +- (and (eq_attr "cpu" "athlon,k8,generic64") ++ (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") + (and (eq_attr "type" "ssecmp") + (eq_attr "mode" "SF,DF,DI,TI"))) + "athlon-direct,athlon-fpsched,athlon-fadd") +@@ -614,6 +745,11 @@ + (and (eq_attr "type" "ssecmp") + (eq_attr "memory" "load"))) + "athlon-double,athlon-fpload2k8,(athlon-fadd*2)") ++(define_insn_reservation "athlon_ssecmpvector_load_amdfam10" 4 ++ (and (eq_attr "cpu" "amdfam10") ++ (and (eq_attr "type" "ssecmp") ++ (eq_attr "memory" "load"))) ++ "athlon-direct,athlon-fploadk8,athlon-fadd") + (define_insn_reservation "athlon_ssecmpvector" 3 + (and (eq_attr "cpu" "athlon") + (eq_attr "type" "ssecmp")) +@@ -622,6 +758,10 @@ + (and (eq_attr "cpu" "k8,generic64") + (eq_attr "type" "ssecmp")) + "athlon-double,athlon-fpsched,(athlon-fadd*2)") ++(define_insn_reservation "athlon_ssecmpvector_amdfam10" 2 ++ (and (eq_attr "cpu" "amdfam10") ++ (eq_attr "type" "ssecmp")) ++ "athlon-direct,athlon-fpsched,athlon-fadd") + (define_insn_reservation "athlon_ssecomi_load" 4 + (and (eq_attr "cpu" "athlon") + (and (eq_attr "type" "ssecomi") +@@ -632,10 +772,20 @@ + (and (eq_attr "type" "ssecomi") + (eq_attr "memory" "load"))) + "athlon-vector,athlon-fploadk8,athlon-fadd") ++(define_insn_reservation "athlon_ssecomi_load_amdfam10" 5 ++ (and (eq_attr "cpu" "amdfam10") ++ (and (eq_attr "type" "ssecomi") ++ (eq_attr "memory" "load"))) ++ "athlon-direct,athlon-fploadk8,athlon-fadd") + (define_insn_reservation 
"athlon_ssecomi" 4 + (and (eq_attr "cpu" "athlon,k8,generic64") + (eq_attr "type" "ssecmp")) + "athlon-vector,athlon-fpsched,athlon-fadd") ++(define_insn_reservation "athlon_ssecomi_amdfam10" 3 ++ (and (eq_attr "cpu" "amdfam10") ++;; It seems athlon_ssecomi has a bug in the attr_type, fixed for amdfam10 ++ (eq_attr "type" "ssecomi")) ++ "athlon-direct,athlon-fpsched,athlon-fadd") + (define_insn_reservation "athlon_sseadd_load" 4 + (and (eq_attr "cpu" "athlon") + (and (eq_attr "type" "sseadd") +@@ -643,13 +793,13 @@ + (eq_attr "memory" "load")))) + "athlon-direct,athlon-fpload,athlon-fadd") + (define_insn_reservation "athlon_sseadd_load_k8" 6 +- (and (eq_attr "cpu" "k8,generic64") ++ (and (eq_attr "cpu" "k8,generic64,amdfam10") + (and (eq_attr "type" "sseadd") + (and (eq_attr "mode" "SF,DF,DI") + (eq_attr "memory" "load")))) + "athlon-direct,athlon-fploadk8,athlon-fadd") + (define_insn_reservation "athlon_sseadd" 4 +- (and (eq_attr "cpu" "athlon,k8,generic64") ++ (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") + (and (eq_attr "type" "sseadd") + (eq_attr "mode" "SF,DF,DI"))) + "athlon-direct,athlon-fpsched,athlon-fadd") +@@ -663,6 +813,11 @@ + (and (eq_attr "type" "sseadd") + (eq_attr "memory" "load"))) + "athlon-double,athlon-fpload2k8,(athlon-fadd*2)") ++(define_insn_reservation "athlon_sseaddvector_load_amdfam10" 6 ++ (and (eq_attr "cpu" "amdfam10") ++ (and (eq_attr "type" "sseadd") ++ (eq_attr "memory" "load"))) ++ "athlon-direct,athlon-fploadk8,athlon-fadd") + (define_insn_reservation "athlon_sseaddvector" 5 + (and (eq_attr "cpu" "athlon") + (eq_attr "type" "sseadd")) +@@ -671,6 +826,10 @@ + (and (eq_attr "cpu" "k8,generic64") + (eq_attr "type" "sseadd")) + "athlon-double,athlon-fpsched,(athlon-fadd*2)") ++(define_insn_reservation "athlon_sseaddvector_amdfam10" 4 ++ (and (eq_attr "cpu" "amdfam10") ++ (eq_attr "type" "sseadd")) ++ "athlon-direct,athlon-fpsched,athlon-fadd") + + ;; Conversions behaves very irregularly and the scheduling is critical here. 
+ ;; Take each instruction separately. Assume that the mode is always set to the +@@ -684,12 +843,25 @@ + (and (eq_attr "mode" "DF") + (eq_attr "memory" "load"))))) + "athlon-direct,athlon-fploadk8,athlon-fstore") ++(define_insn_reservation "athlon_ssecvt_cvtss2sd_load_amdfam10" 7 ++ (and (eq_attr "cpu" "amdfam10") ++ (and (eq_attr "type" "ssecvt") ++ (and (eq_attr "amdfam10_decode" "double") ++ (and (eq_attr "mode" "DF") ++ (eq_attr "memory" "load"))))) ++ "athlon-double,athlon-fploadk8,(athlon-faddmul+athlon-fstore)") + (define_insn_reservation "athlon_ssecvt_cvtss2sd" 2 + (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "type" "ssecvt") + (and (eq_attr "athlon_decode" "direct") + (eq_attr "mode" "DF")))) + "athlon-direct,athlon-fpsched,athlon-fstore") ++(define_insn_reservation "athlon_ssecvt_cvtss2sd_amdfam10" 7 ++ (and (eq_attr "cpu" "amdfam10") ++ (and (eq_attr "type" "ssecvt") ++ (and (eq_attr "amdfam10_decode" "vector") ++ (eq_attr "mode" "DF")))) ++ "athlon-vector,athlon-fpsched,athlon-faddmul,(athlon-fstore*2)") + ;; cvtps2pd. Model same way the other double decoded FP conversions. 
+ (define_insn_reservation "athlon_ssecvt_cvtps2pd_load_k8" 5 + (and (eq_attr "cpu" "k8,athlon,generic64") +@@ -698,12 +870,25 @@ + (and (eq_attr "mode" "V2DF,V4SF,TI") + (eq_attr "memory" "load"))))) + "athlon-double,athlon-fpload2k8,(athlon-fstore*2)") ++(define_insn_reservation "athlon_ssecvt_cvtps2pd_load_amdfam10" 4 ++ (and (eq_attr "cpu" "amdfam10") ++ (and (eq_attr "type" "ssecvt") ++ (and (eq_attr "amdfam10_decode" "direct") ++ (and (eq_attr "mode" "V2DF,V4SF,TI") ++ (eq_attr "memory" "load"))))) ++ "athlon-direct,athlon-fploadk8,athlon-fstore") + (define_insn_reservation "athlon_ssecvt_cvtps2pd_k8" 3 + (and (eq_attr "cpu" "k8,athlon,generic64") + (and (eq_attr "type" "ssecvt") + (and (eq_attr "athlon_decode" "double") + (eq_attr "mode" "V2DF,V4SF,TI")))) + "athlon-double,athlon-fpsched,athlon-fstore,athlon-fstore") ++(define_insn_reservation "athlon_ssecvt_cvtps2pd_amdfam10" 2 ++ (and (eq_attr "cpu" "amdfam10") ++ (and (eq_attr "type" "ssecvt") ++ (and (eq_attr "amdfam10_decode" "direct") ++ (eq_attr "mode" "V2DF,V4SF,TI")))) ++ "athlon-direct,athlon-fpsched,athlon-fstore") + ;; cvtsi2sd mem,reg is directpath path (cvtsi2sd reg,reg is doublepath) + ;; cvtsi2sd has troughput 1 and is executed in store unit with latency of 6 + (define_insn_reservation "athlon_sseicvt_cvtsi2sd_load" 6 +@@ -713,6 +898,13 @@ + (and (eq_attr "mode" "SF,DF") + (eq_attr "memory" "load"))))) + "athlon-direct,athlon-fploadk8,athlon-fstore") ++(define_insn_reservation "athlon_sseicvt_cvtsi2sd_load_amdfam10" 9 ++ (and (eq_attr "cpu" "amdfam10") ++ (and (eq_attr "type" "sseicvt") ++ (and (eq_attr "amdfam10_decode" "double") ++ (and (eq_attr "mode" "SF,DF") ++ (eq_attr "memory" "load"))))) ++ "athlon-double,athlon-fploadk8,(athlon-faddmul+athlon-fstore)") + ;; cvtsi2ss mem, reg is doublepath + (define_insn_reservation "athlon_sseicvt_cvtsi2ss_load" 9 + (and (eq_attr "cpu" "athlon") +@@ -728,6 +920,13 @@ + (and (eq_attr "mode" "SF,DF") + (eq_attr "memory" "load"))))) + 
"athlon-double,athlon-fploadk8,(athlon-fstore*2)") ++(define_insn_reservation "athlon_sseicvt_cvtsi2ss_load_amdfam10" 9 ++ (and (eq_attr "cpu" "amdfam10") ++ (and (eq_attr "type" "sseicvt") ++ (and (eq_attr "amdfam10_decode" "double") ++ (and (eq_attr "mode" "SF,DF") ++ (eq_attr "memory" "load"))))) ++ "athlon-double,athlon-fploadk8,(athlon-faddmul+athlon-fstore)") + ;; cvtsi2sd reg,reg is double decoded (vector on Athlon) + (define_insn_reservation "athlon_sseicvt_cvtsi2sd_k8" 11 + (and (eq_attr "cpu" "k8,athlon,generic64") +@@ -736,6 +935,13 @@ + (and (eq_attr "mode" "SF,DF") + (eq_attr "memory" "none"))))) + "athlon-double,athlon-fploadk8,athlon-fstore") ++(define_insn_reservation "athlon_sseicvt_cvtsi2sd_amdfam10" 14 ++ (and (eq_attr "cpu" "amdfam10") ++ (and (eq_attr "type" "sseicvt") ++ (and (eq_attr "amdfam10_decode" "vector") ++ (and (eq_attr "mode" "SF,DF") ++ (eq_attr "memory" "none"))))) ++ "athlon-vector,athlon-fploadk8,(athlon-faddmul+athlon-fstore)") + ;; cvtsi2ss reg, reg is doublepath + (define_insn_reservation "athlon_sseicvt_cvtsi2ss" 14 + (and (eq_attr "cpu" "athlon,k8,generic64") +@@ -744,6 +950,13 @@ + (and (eq_attr "mode" "SF,DF") + (eq_attr "memory" "none"))))) + "athlon-vector,athlon-fploadk8,(athlon-fvector*2)") ++(define_insn_reservation "athlon_sseicvt_cvtsi2ss_amdfam10" 14 ++ (and (eq_attr "cpu" "amdfam10") ++ (and (eq_attr "type" "sseicvt") ++ (and (eq_attr "amdfam10_decode" "vector") ++ (and (eq_attr "mode" "SF,DF") ++ (eq_attr "memory" "none"))))) ++ "athlon-vector,athlon-fploadk8,(athlon-faddmul+athlon-fstore)") + ;; cvtsd2ss mem,reg is doublepath, troughput unknown, latency 9 + (define_insn_reservation "athlon_ssecvt_cvtsd2ss_load_k8" 9 + (and (eq_attr "cpu" "k8,athlon,generic64") +@@ -752,6 +965,13 @@ + (and (eq_attr "mode" "SF") + (eq_attr "memory" "load"))))) + "athlon-double,athlon-fploadk8,(athlon-fstore*3)") ++(define_insn_reservation "athlon_ssecvt_cvtsd2ss_load_amdfam10" 9 ++ (and (eq_attr "cpu" "amdfam10") ++ (and (eq_attr 
"type" "ssecvt") ++ (and (eq_attr "amdfam10_decode" "double") ++ (and (eq_attr "mode" "SF") ++ (eq_attr "memory" "load"))))) ++ "athlon-double,athlon-fploadk8,(athlon-faddmul+athlon-fstore)") + ;; cvtsd2ss reg,reg is vectorpath, troughput unknown, latency 12 + (define_insn_reservation "athlon_ssecvt_cvtsd2ss" 12 + (and (eq_attr "cpu" "athlon,k8,generic64") +@@ -760,6 +980,13 @@ + (and (eq_attr "mode" "SF") + (eq_attr "memory" "none"))))) + "athlon-vector,athlon-fpsched,(athlon-fvector*3)") ++(define_insn_reservation "athlon_ssecvt_cvtsd2ss_amdfam10" 8 ++ (and (eq_attr "cpu" "amdfam10") ++ (and (eq_attr "type" "ssecvt") ++ (and (eq_attr "amdfam10_decode" "vector") ++ (and (eq_attr "mode" "SF") ++ (eq_attr "memory" "none"))))) ++ "athlon-vector,athlon-fpsched,athlon-faddmul,(athlon-fstore*2)") + (define_insn_reservation "athlon_ssecvt_cvtpd2ps_load_k8" 8 + (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "type" "ssecvt") +@@ -767,6 +994,13 @@ + (and (eq_attr "mode" "V4SF,V2DF,TI") + (eq_attr "memory" "load"))))) + "athlon-double,athlon-fpload2k8,(athlon-fstore*3)") ++(define_insn_reservation "athlon_ssecvt_cvtpd2ps_load_amdfam10" 9 ++ (and (eq_attr "cpu" "amdfam10") ++ (and (eq_attr "type" "ssecvt") ++ (and (eq_attr "amdfam10_decode" "double") ++ (and (eq_attr "mode" "V4SF,V2DF,TI") ++ (eq_attr "memory" "load"))))) ++ "athlon-double,athlon-fploadk8,(athlon-faddmul+athlon-fstore)") + ;; cvtpd2ps mem,reg is vectorpath, troughput unknown, latency 10 + ;; ??? Why it is fater than cvtsd2ss? 
+ (define_insn_reservation "athlon_ssecvt_cvtpd2ps" 8 +@@ -776,6 +1010,13 @@ + (and (eq_attr "mode" "V4SF,V2DF,TI") + (eq_attr "memory" "none"))))) + "athlon-vector,athlon-fpsched,athlon-fvector*2") ++(define_insn_reservation "athlon_ssecvt_cvtpd2ps_amdfam10" 7 ++ (and (eq_attr "cpu" "amdfam10") ++ (and (eq_attr "type" "ssecvt") ++ (and (eq_attr "amdfam10_decode" "double") ++ (and (eq_attr "mode" "V4SF,V2DF,TI") ++ (eq_attr "memory" "none"))))) ++ "athlon-double,athlon-fpsched,(athlon-faddmul+athlon-fstore)") + ;; cvtsd2si mem,reg is doublepath, troughput 1, latency 9 + (define_insn_reservation "athlon_secvt_cvtsX2si_load" 9 + (and (eq_attr "cpu" "athlon,k8,generic64") +@@ -784,6 +1025,13 @@ + (and (eq_attr "mode" "SI,DI") + (eq_attr "memory" "load"))))) + "athlon-vector,athlon-fploadk8,athlon-fvector") ++(define_insn_reservation "athlon_secvt_cvtsX2si_load_amdfam10" 10 ++ (and (eq_attr "cpu" "amdfam10") ++ (and (eq_attr "type" "sseicvt") ++ (and (eq_attr "amdfam10_decode" "double") ++ (and (eq_attr "mode" "SI,DI") ++ (eq_attr "memory" "load"))))) ++ "athlon-double,athlon-fploadk8,(athlon-fadd+athlon-fstore)") + ;; cvtsd2si reg,reg is doublepath, troughput 1, latency 9 + (define_insn_reservation "athlon_ssecvt_cvtsX2si" 9 + (and (eq_attr "cpu" "athlon") +@@ -799,6 +1047,29 @@ + (and (eq_attr "mode" "SI,DI") + (eq_attr "memory" "none"))))) + "athlon-double,athlon-fpsched,athlon-fstore") ++(define_insn_reservation "athlon_ssecvt_cvtsX2si_amdfam10" 8 ++ (and (eq_attr "cpu" "amdfam10") ++ (and (eq_attr "type" "sseicvt") ++ (and (eq_attr "amdfam10_decode" "double") ++ (and (eq_attr "mode" "SI,DI") ++ (eq_attr "memory" "none"))))) ++ "athlon-double,athlon-fpsched,(athlon-fadd+athlon-fstore)") ++;; cvtpd2dq reg,mem is doublepath, throughput 1, latency 9 on amdfam10 ++(define_insn_reservation "athlon_sseicvt_cvtpd2dq_load_amdfam10" 9 ++ (and (eq_attr "cpu" "amdfam10") ++ (and (eq_attr "type" "sseicvt") ++ (and (eq_attr "amdfam10_decode" "double") ++ (and (eq_attr "mode" 
"TI") ++ (eq_attr "memory" "load"))))) ++ "athlon-double,athlon-fploadk8,(athlon-faddmul+athlon-fstore)") ++;; cvtpd2dq reg,reg is doublepath, throughput 1, latency 7 on amdfam10 ++(define_insn_reservation "athlon_sseicvt_cvtpd2dq_amdfam10" 7 ++ (and (eq_attr "cpu" "amdfam10") ++ (and (eq_attr "type" "sseicvt") ++ (and (eq_attr "amdfam10_decode" "double") ++ (and (eq_attr "mode" "TI") ++ (eq_attr "memory" "none"))))) ++ "athlon-double,athlon-fpsched,(athlon-faddmul+athlon-fstore)") + + + (define_insn_reservation "athlon_ssemul_load" 4 +@@ -808,13 +1079,13 @@ + (eq_attr "memory" "load")))) + "athlon-direct,athlon-fpload,athlon-fmul") + (define_insn_reservation "athlon_ssemul_load_k8" 6 +- (and (eq_attr "cpu" "k8,generic64") ++ (and (eq_attr "cpu" "k8,generic64,amdfam10") + (and (eq_attr "type" "ssemul") + (and (eq_attr "mode" "SF,DF") + (eq_attr "memory" "load")))) + "athlon-direct,athlon-fploadk8,athlon-fmul") + (define_insn_reservation "athlon_ssemul" 4 +- (and (eq_attr "cpu" "athlon,k8,generic64") ++ (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") + (and (eq_attr "type" "ssemul") + (eq_attr "mode" "SF,DF"))) + "athlon-direct,athlon-fpsched,athlon-fmul") +@@ -828,6 +1099,11 @@ + (and (eq_attr "type" "ssemul") + (eq_attr "memory" "load"))) + "athlon-double,athlon-fpload2k8,(athlon-fmul*2)") ++(define_insn_reservation "athlon_ssemulvector_load_amdfam10" 6 ++ (and (eq_attr "cpu" "amdfam10") ++ (and (eq_attr "type" "ssemul") ++ (eq_attr "memory" "load"))) ++ "athlon-direct,athlon-fploadk8,athlon-fmul") + (define_insn_reservation "athlon_ssemulvector" 5 + (and (eq_attr "cpu" "athlon") + (eq_attr "type" "ssemul")) +@@ -836,6 +1112,10 @@ + (and (eq_attr "cpu" "k8,generic64") + (eq_attr "type" "ssemul")) + "athlon-double,athlon-fpsched,(athlon-fmul*2)") ++(define_insn_reservation "athlon_ssemulvector_amdfam10" 4 ++ (and (eq_attr "cpu" "amdfam10") ++ (eq_attr "type" "ssemul")) ++ "athlon-direct,athlon-fpsched,athlon-fmul") + ;; divsd timings.
divss is faster + (define_insn_reservation "athlon_ssediv_load" 20 + (and (eq_attr "cpu" "athlon") +@@ -844,13 +1124,13 @@ + (eq_attr "memory" "load")))) + "athlon-direct,athlon-fpload,athlon-fmul*17") + (define_insn_reservation "athlon_ssediv_load_k8" 22 +- (and (eq_attr "cpu" "k8,generic64") ++ (and (eq_attr "cpu" "k8,generic64,amdfam10") + (and (eq_attr "type" "ssediv") + (and (eq_attr "mode" "SF,DF") + (eq_attr "memory" "load")))) + "athlon-direct,athlon-fploadk8,athlon-fmul*17") + (define_insn_reservation "athlon_ssediv" 20 +- (and (eq_attr "cpu" "athlon,k8,generic64") ++ (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") + (and (eq_attr "type" "ssediv") + (eq_attr "mode" "SF,DF"))) + "athlon-direct,athlon-fpsched,athlon-fmul*17") +@@ -864,6 +1144,11 @@ + (and (eq_attr "type" "ssediv") + (eq_attr "memory" "load"))) + "athlon-double,athlon-fpload2k8,athlon-fmul*34") ++(define_insn_reservation "athlon_ssedivvector_load_amdfam10" 22 ++ (and (eq_attr "cpu" "amdfam10") ++ (and (eq_attr "type" "ssediv") ++ (eq_attr "memory" "load"))) ++ "athlon-direct,athlon-fploadk8,athlon-fmul*17") + (define_insn_reservation "athlon_ssedivvector" 39 + (and (eq_attr "cpu" "athlon") + (eq_attr "type" "ssediv")) +@@ -872,3 +1157,12 @@ + (and (eq_attr "cpu" "k8,generic64") + (eq_attr "type" "ssediv")) + "athlon-double,athlon-fmul*34") ++(define_insn_reservation "athlon_ssedivvector_amdfam10" 20 ++ (and (eq_attr "cpu" "amdfam10") ++ (eq_attr "type" "ssediv")) ++ "athlon-direct,athlon-fmul*17") ++(define_insn_reservation "athlon_sseins_amdfam10" 5 ++ (and (eq_attr "cpu" "amdfam10") ++ (and (eq_attr "type" "sseins") ++ (eq_attr "mode" "TI"))) ++ "athlon-vector,athlon-fpsched,athlon-faddmul") +--- gcc/config/i386/pmmintrin.h.jj 2006-10-05 00:29:29.000000000 +0200 ++++ gcc/config/i386/pmmintrin.h 2007-02-09 21:26:06.000000000 +0100 +@@ -30,7 +30,11 @@ + #ifndef _PMMINTRIN_H_INCLUDED + #define _PMMINTRIN_H_INCLUDED + +-#ifdef __SSE3__ ++#ifndef __SSE3__ ++# error "SSE3 instruction set not 
enabled" ++#else ++ ++/* We need definitions from the SSE2 and SSE header files*/ + #include + #include + +--- gcc/config/i386/tmmintrin.h.jj 2007-02-09 16:18:25.000000000 +0100 ++++ gcc/config/i386/tmmintrin.h 2007-02-09 21:26:06.000000000 +0100 +@@ -30,7 +30,11 @@ + #ifndef _TMMINTRIN_H_INCLUDED + #define _TMMINTRIN_H_INCLUDED + +-#ifdef __SSSE3__ ++#ifndef __SSSE3__ ++# error "SSSE3 instruction set not enabled" ++#else ++ ++/* We need definitions from the SSE3, SSE2 and SSE header files*/ + #include + + static __inline __m128i +--- gcc/config/i386/sse.md.jj 2007-02-09 16:18:25.000000000 +0100 ++++ gcc/config/i386/sse.md 2007-02-09 21:26:06.000000000 +0100 +@@ -963,6 +963,7 @@ + "cvtsi2ss\t{%2, %0|%0, %2}" + [(set_attr "type" "sseicvt") + (set_attr "athlon_decode" "vector,double") ++ (set_attr "amdfam10_decode" "vector,double") + (set_attr "mode" "SF")]) + + (define_insn "sse_cvtsi2ssq" +@@ -976,6 +977,7 @@ + "cvtsi2ssq\t{%2, %0|%0, %2}" + [(set_attr "type" "sseicvt") + (set_attr "athlon_decode" "vector,double") ++ (set_attr "amdfam10_decode" "vector,double") + (set_attr "mode" "SF")]) + + (define_insn "sse_cvtss2si" +@@ -989,6 +991,7 @@ + "cvtss2si\t{%1, %0|%0, %1}" + [(set_attr "type" "sseicvt") + (set_attr "athlon_decode" "double,vector") ++ (set_attr "amdfam10_decode" "double,double") + (set_attr "mode" "SI")]) + + (define_insn "sse_cvtss2siq" +@@ -1002,6 +1005,7 @@ + "cvtss2siq\t{%1, %0|%0, %1}" + [(set_attr "type" "sseicvt") + (set_attr "athlon_decode" "double,vector") ++ (set_attr "amdfam10_decode" "double,double") + (set_attr "mode" "DI")]) + + (define_insn "sse_cvttss2si" +@@ -1014,6 +1018,7 @@ + "cvttss2si\t{%1, %0|%0, %1}" + [(set_attr "type" "sseicvt") + (set_attr "athlon_decode" "double,vector") ++ (set_attr "amdfam10_decode" "double,double") + (set_attr "mode" "SI")]) + + (define_insn "sse_cvttss2siq" +@@ -1026,6 +1031,7 @@ + "cvttss2siq\t{%1, %0|%0, %1}" + [(set_attr "type" "sseicvt") + (set_attr "athlon_decode" "double,vector") ++ (set_attr 
"amdfam10_decode" "double,double") + (set_attr "mode" "DI")]) + + (define_insn "sse2_cvtdq2ps" +@@ -1921,7 +1927,8 @@ + "cvtsi2sd\t{%2, %0|%0, %2}" + [(set_attr "type" "sseicvt") + (set_attr "mode" "DF") +- (set_attr "athlon_decode" "double,direct")]) ++ (set_attr "athlon_decode" "double,direct") ++ (set_attr "amdfam10_decode" "vector,double")]) + + (define_insn "sse2_cvtsi2sdq" + [(set (match_operand:V2DF 0 "register_operand" "=x,x") +@@ -1934,7 +1941,8 @@ + "cvtsi2sdq\t{%2, %0|%0, %2}" + [(set_attr "type" "sseicvt") + (set_attr "mode" "DF") +- (set_attr "athlon_decode" "double,direct")]) ++ (set_attr "athlon_decode" "double,direct") ++ (set_attr "amdfam10_decode" "vector,double")]) + + (define_insn "sse2_cvtsd2si" + [(set (match_operand:SI 0 "register_operand" "=r,r") +@@ -1947,6 +1955,7 @@ + "cvtsd2si\t{%1, %0|%0, %1}" + [(set_attr "type" "sseicvt") + (set_attr "athlon_decode" "double,vector") ++ (set_attr "amdfam10_decode" "double,double") + (set_attr "mode" "SI")]) + + (define_insn "sse2_cvtsd2siq" +@@ -1960,6 +1969,7 @@ + "cvtsd2siq\t{%1, %0|%0, %1}" + [(set_attr "type" "sseicvt") + (set_attr "athlon_decode" "double,vector") ++ (set_attr "amdfam10_decode" "double,double") + (set_attr "mode" "DI")]) + + (define_insn "sse2_cvttsd2si" +@@ -1972,7 +1982,8 @@ + "cvttsd2si\t{%1, %0|%0, %1}" + [(set_attr "type" "sseicvt") + (set_attr "mode" "SI") +- (set_attr "athlon_decode" "double,vector")]) ++ (set_attr "athlon_decode" "double,vector") ++ (set_attr "amdfam10_decode" "double,double")]) + + (define_insn "sse2_cvttsd2siq" + [(set (match_operand:DI 0 "register_operand" "=r,r") +@@ -1984,7 +1995,8 @@ + "cvttsd2siq\t{%1, %0|%0, %1}" + [(set_attr "type" "sseicvt") + (set_attr "mode" "DI") +- (set_attr "athlon_decode" "double,vector")]) ++ (set_attr "athlon_decode" "double,vector") ++ (set_attr "amdfam10_decode" "double,double")]) + + (define_insn "sse2_cvtdq2pd" + [(set (match_operand:V2DF 0 "register_operand" "=x") +@@ -2015,7 +2027,8 @@ + "TARGET_SSE2" + 
"cvtpd2dq\t{%1, %0|%0, %1}" + [(set_attr "type" "ssecvt") +- (set_attr "mode" "TI")]) ++ (set_attr "mode" "TI") ++ (set_attr "amdfam10_decode" "double")]) + + (define_expand "sse2_cvttpd2dq" + [(set (match_operand:V4SI 0 "register_operand" "") +@@ -2033,7 +2046,8 @@ + "TARGET_SSE2" + "cvttpd2dq\t{%1, %0|%0, %1}" + [(set_attr "type" "ssecvt") +- (set_attr "mode" "TI")]) ++ (set_attr "mode" "TI") ++ (set_attr "amdfam10_decode" "double")]) + + (define_insn "sse2_cvtsd2ss" + [(set (match_operand:V4SF 0 "register_operand" "=x,x") +@@ -2047,20 +2061,22 @@ + "cvtsd2ss\t{%2, %0|%0, %2}" + [(set_attr "type" "ssecvt") + (set_attr "athlon_decode" "vector,double") ++ (set_attr "amdfam10_decode" "vector,double") + (set_attr "mode" "SF")]) + + (define_insn "sse2_cvtss2sd" +- [(set (match_operand:V2DF 0 "register_operand" "=x") ++ [(set (match_operand:V2DF 0 "register_operand" "=x,x") + (vec_merge:V2DF + (float_extend:V2DF + (vec_select:V2SF +- (match_operand:V4SF 2 "nonimmediate_operand" "xm") ++ (match_operand:V4SF 2 "nonimmediate_operand" "x,m") + (parallel [(const_int 0) (const_int 1)]))) +- (match_operand:V2DF 1 "register_operand" "0") ++ (match_operand:V2DF 1 "register_operand" "0,0") + (const_int 1)))] + "TARGET_SSE2" + "cvtss2sd\t{%2, %0|%0, %2}" + [(set_attr "type" "ssecvt") ++ (set_attr "amdfam10_decode" "vector,double") + (set_attr "mode" "DF")]) + + (define_expand "sse2_cvtpd2ps" +@@ -2081,7 +2097,8 @@ + "TARGET_SSE2" + "cvtpd2ps\t{%1, %0|%0, %1}" + [(set_attr "type" "ssecvt") +- (set_attr "mode" "V4SF")]) ++ (set_attr "mode" "V4SF") ++ (set_attr "amdfam10_decode" "double")]) + + (define_insn "sse2_cvtps2pd" + [(set (match_operand:V2DF 0 "register_operand" "=x") +@@ -2092,7 +2109,8 @@ + "TARGET_SSE2" + "cvtps2pd\t{%1, %0|%0, %1}" + [(set_attr "type" "ssecvt") +- (set_attr "mode" "V2DF")]) ++ (set_attr "mode" "V2DF") ++ (set_attr "amdfam10_decode" "direct")]) + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; +@@ -4550,3 +4568,92 @@ + 
"pabs\t{%1, %0|%0, %1}"; + [(set_attr "type" "sselog1") + (set_attr "mode" "DI")]) ++ ++;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++;; ++;; AMD SSE4A instructions ++;; ++;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ++ ++(define_insn "sse4a_vmmovntv2df" ++ [(set (match_operand:DF 0 "memory_operand" "=m") ++ (unspec:DF [(vec_select:DF ++ (match_operand:V2DF 1 "register_operand" "x") ++ (parallel [(const_int 0)]))] ++ UNSPEC_MOVNT))] ++ "TARGET_SSE4A" ++ "movntsd\t{%1, %0|%0, %1}" ++ [(set_attr "type" "ssemov") ++ (set_attr "mode" "DF")]) ++ ++(define_insn "sse4a_movntdf" ++ [(set (match_operand:DF 0 "memory_operand" "=m") ++ (unspec:DF [(match_operand:DF 1 "register_operand" "x")] ++ UNSPEC_MOVNT))] ++ "TARGET_SSE4A" ++ "movntsd\t{%1, %0|%0, %1}" ++ [(set_attr "type" "ssemov") ++ (set_attr "mode" "DF")]) ++ ++(define_insn "sse4a_vmmovntv4sf" ++ [(set (match_operand:SF 0 "memory_operand" "=m") ++ (unspec:SF [(vec_select:SF ++ (match_operand:V4SF 1 "register_operand" "x") ++ (parallel [(const_int 0)]))] ++ UNSPEC_MOVNT))] ++ "TARGET_SSE4A" ++ "movntss\t{%1, %0|%0, %1}" ++ [(set_attr "type" "ssemov") ++ (set_attr "mode" "SF")]) ++ ++(define_insn "sse4a_movntsf" ++ [(set (match_operand:SF 0 "memory_operand" "=m") ++ (unspec:SF [(match_operand:SF 1 "register_operand" "x")] ++ UNSPEC_MOVNT))] ++ "TARGET_SSE4A" ++ "movntss\t{%1, %0|%0, %1}" ++ [(set_attr "type" "ssemov") ++ (set_attr "mode" "SF")]) ++ ++(define_insn "sse4a_extrqi" ++ [(set (match_operand:V2DI 0 "register_operand" "=x") ++ (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0") ++ (match_operand 2 "const_int_operand" "") ++ (match_operand 3 "const_int_operand" "")] ++ UNSPEC_EXTRQI))] ++ "TARGET_SSE4A" ++ "extrq\t{%3, %2, %0|%0, %2, %3}" ++ [(set_attr "type" "sse") ++ (set_attr "mode" "TI")]) ++ ++(define_insn "sse4a_extrq" ++ [(set (match_operand:V2DI 0 "register_operand" "=x") ++ (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0") ++ 
(match_operand:V16QI 2 "register_operand" "x")] ++ UNSPEC_EXTRQ))] ++ "TARGET_SSE4A" ++ "extrq\t{%2, %0|%0, %2}" ++ [(set_attr "type" "sse") ++ (set_attr "mode" "TI")]) ++ ++(define_insn "sse4a_insertqi" ++ [(set (match_operand:V2DI 0 "register_operand" "=x") ++ (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0") ++ (match_operand:V2DI 2 "register_operand" "x") ++ (match_operand 3 "const_int_operand" "") ++ (match_operand 4 "const_int_operand" "")] ++ UNSPEC_INSERTQI))] ++ "TARGET_SSE4A" ++ "insertq\t{%4, %3, %2, %0|%0, %2, %3, %4}" ++ [(set_attr "type" "sseins") ++ (set_attr "mode" "TI")]) ++ ++(define_insn "sse4a_insertq" ++ [(set (match_operand:V2DI 0 "register_operand" "=x") ++ (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0") ++ (match_operand:V2DI 2 "register_operand" "x")] ++ UNSPEC_INSERTQ))] ++ "TARGET_SSE4A" ++ "insertq\t{%2, %0|%0, %2}" ++ [(set_attr "type" "sseins") ++ (set_attr "mode" "TI")]) +--- gcc/config/i386/i386.opt.jj 2007-02-09 16:18:25.000000000 +0100 ++++ gcc/config/i386/i386.opt 2007-02-09 21:26:06.000000000 +0100 +@@ -205,6 +205,22 @@ mmni + Target Undocumented Mask(SSSE3) MaskExists + Support MMX, SSE, SSE2, SSE3 and SSSE3 built-in functions and code generation + ++msse4a ++Target Report Mask(SSE4A) ++Support MMX, SSE, SSE2, SSE3 and SSE4A built-in functions and code generation ++ ++mpopcnt ++Target Report Mask(POPCNT) ++Support code generation of popcount instruction for popcount built-ins ++namely __builtin_popcount, __builtin_popcountl and __builtin_popcountll ++ ++mabm ++Target Report Mask(ABM) ++Support code generation of Advanced Bit Manipulation (ABM) instructions, ++which include popcnt and lzcnt instructions, for popcount and clz built-ins ++namely __builtin_popcount, __builtin_popcountl, __builtin_popcountll and ++__builtin_clz, __builtin_clzl, __builtin_clzll ++ + msseregparm + Target RejectNegative Mask(SSEREGPARM) + Use SSE register passing conventions for SF and DF mode +--- gcc/config/i386/ammintrin.h.jj 
2007-02-09 21:26:06.000000000 +0100 ++++ gcc/config/i386/ammintrin.h 2007-02-09 21:26:06.000000000 +0100 +@@ -0,0 +1,73 @@ ++/* Copyright (C) 2007 Free Software Foundation, Inc. ++ ++ This file is part of GCC. ++ ++ GCC is free software; you can redistribute it and/or modify ++ it under the terms of the GNU General Public License as published by ++ the Free Software Foundation; either version 2, or (at your option) ++ any later version. ++ ++ GCC is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ GNU General Public License for more details. ++ ++ You should have received a copy of the GNU General Public License ++ along with GCC; see the file COPYING. If not, write to ++ the Free Software Foundation, 51 Franklin Street, Fifth Floor, ++ Boston, MA 02110-1301, USA. */ ++ ++/* As a special exception, if you include this header file into source ++ files compiled by GCC, this header file does not by itself cause ++ the resulting executable to be covered by the GNU General Public ++ License. This exception does not however invalidate any other ++ reasons why the executable file might be covered by the GNU General ++ Public License. 
*/ ++ ++/* Implemented from the specification included in the AMD Programmers ++ Manual Update, version 2.x */ ++ ++#ifndef _AMMINTRIN_H_INCLUDED ++#define _AMMINTRIN_H_INCLUDED ++ ++#ifndef __SSE4A__ ++# error "SSE4A instruction set not enabled" ++#else ++ ++/* We need definitions from the SSE3, SSE2 and SSE header files*/ ++#include <pmmintrin.h> ++ ++static __inline void __attribute__((__always_inline__)) ++_mm_stream_sd (double * __P, __m128d __Y) ++{ ++ __builtin_ia32_movntsd (__P, (__v2df) __Y); ++} ++ ++static __inline void __attribute__((__always_inline__)) ++_mm_stream_ss (float * __P, __m128 __Y) ++{ ++ __builtin_ia32_movntss (__P, (__v4sf) __Y); ++} ++ ++static __inline __m128i __attribute__((__always_inline__)) ++_mm_extract_si64 (__m128i __X, __m128i __Y) ++{ ++ return (__m128i) __builtin_ia32_extrq ((__v2di) __X, (__v16qi) __Y); ++} ++ ++#define _mm_extracti_si64(X, I, L) \ ++((__m128i) __builtin_ia32_extrqi ((__v2di)(X), I, L)) ++ ++static __inline __m128i __attribute__((__always_inline__)) ++_mm_insert_si64 (__m128i __X,__m128i __Y) ++{ ++ return (__m128i) __builtin_ia32_insertq ((__v2di)__X, (__v2di)__Y); ++} ++ ++#define _mm_inserti_si64(X, Y, I, L) \ ++((__m128i) __builtin_ia32_insertqi ((__v2di)(X), (__v2di)(Y), I, L)) ++ ++ ++#endif /* __SSE4A__ */ ++ ++#endif /* _AMMINTRIN_H_INCLUDED */ +--- gcc/config/i386/emmintrin.h.jj 2006-10-05 00:29:29.000000000 +0200 ++++ gcc/config/i386/emmintrin.h 2007-02-09 21:26:06.000000000 +0100 +@@ -30,7 +30,11 @@ + #ifndef _EMMINTRIN_H_INCLUDED + #define _EMMINTRIN_H_INCLUDED + +-#ifdef __SSE2__ ++#ifndef __SSE2__ ++# error "SSE2 instruction set not enabled" ++#else ++ ++/* We need definitions from the SSE header files*/ + #include <xmmintrin.h> + + /* SSE2 */ +--- gcc/config/i386/i386.c.jj 2007-02-09 16:24:00.000000000 +0100 ++++ gcc/config/i386/i386.c 2007-02-10 19:47:05.000000000 +0100 +@@ -534,6 +534,71 @@ struct processor_costs k8_cost = { + COSTS_N_INSNS (35), /* cost of FSQRT instruction.
*/ + }; + ++struct processor_costs amdfam10_cost = { ++ COSTS_N_INSNS (1), /* cost of an add instruction */ ++ COSTS_N_INSNS (2), /* cost of a lea instruction */ ++ COSTS_N_INSNS (1), /* variable shift costs */ ++ COSTS_N_INSNS (1), /* constant shift costs */ ++ {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ ++ COSTS_N_INSNS (4), /* HI */ ++ COSTS_N_INSNS (3), /* SI */ ++ COSTS_N_INSNS (4), /* DI */ ++ COSTS_N_INSNS (5)}, /* other */ ++ 0, /* cost of multiply per each bit set */ ++ {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */ ++ COSTS_N_INSNS (35), /* HI */ ++ COSTS_N_INSNS (51), /* SI */ ++ COSTS_N_INSNS (83), /* DI */ ++ COSTS_N_INSNS (83)}, /* other */ ++ COSTS_N_INSNS (1), /* cost of movsx */ ++ COSTS_N_INSNS (1), /* cost of movzx */ ++ 8, /* "large" insn */ ++ 9, /* MOVE_RATIO */ ++ 4, /* cost for loading QImode using movzbl */ ++ {3, 4, 3}, /* cost of loading integer registers ++ in QImode, HImode and SImode. ++ Relative to reg-reg move (2). */ ++ {3, 4, 3}, /* cost of storing integer registers */ ++ 4, /* cost of reg,reg fld/fst */ ++ {4, 4, 12}, /* cost of loading fp registers ++ in SFmode, DFmode and XFmode */ ++ {6, 6, 8}, /* cost of storing fp registers ++ in SFmode, DFmode and XFmode */ ++ 2, /* cost of moving MMX register */ ++ {3, 3}, /* cost of loading MMX registers ++ in SImode and DImode */ ++ {4, 4}, /* cost of storing MMX registers ++ in SImode and DImode */ ++ 2, /* cost of moving SSE register */ ++ {4, 4, 3}, /* cost of loading SSE registers ++ in SImode, DImode and TImode */ ++ {4, 4, 5}, /* cost of storing SSE registers ++ in SImode, DImode and TImode */ ++ 3, /* MMX or SSE register to integer */ ++ /* On K8 ++ MOVD reg64, xmmreg Double FSTORE 4 ++ MOVD reg32, xmmreg Double FSTORE 4 ++ On AMDFAM10 ++ MOVD reg64, xmmreg Double FADD 3 ++ 1/1 1/1 ++ MOVD reg32, xmmreg Double FADD 3 ++ 1/1 1/1 */ ++ 64, /* size of prefetch block */ ++ /* New AMD processors never drop prefetches; if they cannot be performed ++ immediately, 
they are queued. We set number of simultaneous prefetches ++ to a large constant to reflect this (it probably is not a good idea not ++ to limit number of prefetches at all, as their execution also takes some ++ time). */ ++ 100, /* number of parallel prefetches */ ++ 5, /* Branch cost */ ++ COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */ ++ COSTS_N_INSNS (4), /* cost of FMUL instruction. */ ++ COSTS_N_INSNS (19), /* cost of FDIV instruction. */ ++ COSTS_N_INSNS (2), /* cost of FABS instruction. */ ++ COSTS_N_INSNS (2), /* cost of FCHS instruction. */ ++ COSTS_N_INSNS (35), /* cost of FSQRT instruction. */ ++}; ++ + static const + struct processor_costs pentium4_cost = { + COSTS_N_INSNS (1), /* cost of an add instruction */ +@@ -816,11 +881,13 @@ const struct processor_costs *ix86_cost + #define m_PENT4 (1< ++#ifdef __SSE2__ ++# include <emmintrin.h> ++#endif + + #endif /* __SSE__ */ + #endif /* _XMMINTRIN_H_INCLUDED */ diff --git a/gcc41.spec b/gcc41.spec index 1af6a15..294870a 100644 --- a/gcc41.spec +++ b/gcc41.spec @@ -1,10 +1,10 @@ -%define DATE 20070202 +%define DATE 20070209 %define gcc_version 4.1.1 -%define gcc_release 55 +%define gcc_release 56 %define _unpackaged_files_terminate_build 0 %define multilib_64_archs sparc64 ppc64 s390x x86_64 %define include_gappletviewer 1 -%ifarch %{ix86} x86_64 ia64 ppc +%ifarch %{ix86} x86_64 ia64 ppc alpha %define build_ada 1 %else %define build_ada 0 @@ -116,7 +116,7 @@ Patch6: gcc41-ada-pr18302.patch Patch7: gcc41-ada-tweaks.patch Patch8: gcc41-java-slow_pthread_self.patch Patch9: gcc41-ppc32-retaddr.patch -Patch10: gcc41-i386-tune-core2.patch +Patch10: gcc41-amdfam10.patch Patch11: gcc41-dsohandle.patch Patch12: gcc41-rh184446.patch Patch13: gcc41-pr20297-test.patch @@ -124,7 +124,7 @@ Patch14: gcc41-objc-rh185398.patch Patch15: gcc41-tests.patch Patch16: gcc41-pr25874.patch Patch17: gcc41-pr30189.patch -Patch18: gcc41-ssse3.patch +Patch18: gcc41-rh227983.patch Patch19: gcc41-hash-style-gnu.patch Patch20: gcc41-pr30001.patch
Patch21: gcc41-java-libdotdotlib.patch @@ -429,7 +429,7 @@ which are required to run programs compiled with the GNAT. %patch7 -p0 -b .ada-tweaks~ %patch8 -p0 -b .java-slow_pthread_self~ %patch9 -p0 -b .ppc32-retaddr~ -%patch10 -p0 -b .i386-tune-core2~ +%patch10 -p0 -b .amdfam10~ %patch11 -p0 -b .dsohandle~ %patch12 -p0 -b .rh184446~ %patch13 -p0 -E -b .pr20297-test~ @@ -437,7 +437,7 @@ which are required to run programs compiled with the GNAT. %patch15 -p0 -b .tests~ %patch16 -p0 -b .pr25874~ %patch17 -p0 -b .pr30189~ -%patch18 -p0 -b .ssse3~ +%patch18 -p0 -b .rh227983~ %patch19 -p0 -b .hash-style-gnu~ %patch20 -p0 -b .pr30001~ %patch21 -p0 -b .java-libdotdotlib~ @@ -1529,6 +1529,15 @@ fi %doc rpm.doc/changelogs/libmudflap/ChangeLog* %changelog +* Sat Feb 10 2007 Jakub Jelinek 4.1.1-56 +- update from gcc-4_1-branch (-r121479:121738) + - PRs c++/29487, target/29487, target/30370 +- merge gomp fixes from gcc-4_2-branch (-r121689:121690) + PR c++/30703 +- add AMDfam10 support (Harsha Jagasia, #222897) +- set build_ada to 1 on alpha (#224247) +- regenerate libjava.util.TimeZone data from tzdata2007a (#227888) + * Fri Feb 2 2007 Jakub Jelinek 4.1.1-55 - update from gcc-4_1-branch (-r121069:121479) - PRs c++/28988, fortran/30278, libstdc++/30586, middle-end/29683, diff --git a/sources b/sources index 4a137e7..2da9fe9 100644 --- a/sources +++ b/sources @@ -1 +1 @@ -da5ebc8e1045dab9142f7a0a9ce47304 gcc-4.1.1-20070202.tar.bz2 +900d1b9e7c3edfa4d855cd99d23f200f gcc-4.1.1-20070209.tar.bz2