2007-02-10 H.J. Lu * gcc.target/i386/sse4a-extract.c: Add "LL" to 64bit constants. * gcc.target/i386/sse4a-insert.c: Likewise. 2007-02-08 Harsha Jagasia * config/i386/xmmintrin.h: Make inclusion of emmintrin.h conditional to __SSE2__. * config/i386/emmintrin.h: Generate #error if __SSE2__ is not defined. * config/i386/pmmintrin.h: Generate #error if __SSE3__ is not defined. * config/i386/tmmintrin.h: Generate #error if __SSSE3__ is not defined. 2007-02-05 Harsha Jagasia * config/i386/athlon.md (athlon_fldxf_k8, athlon_fld_k8, athlon_fstxf_k8, athlon_fst_k8, athlon_fist, athlon_fmov, athlon_fadd_load, athlon_fadd_load_k8, athlon_fadd, athlon_fmul, athlon_fmul_load, athlon_fmul_load_k8, athlon_fsgn, athlon_fdiv_load, athlon_fdiv_load_k8, athlon_fdiv_k8, athlon_fpspc_load, athlon_fpspc, athlon_fcmov_load, athlon_fcmov_load_k8, athlon_fcmov_k8, athlon_fcomi_load_k8, athlon_fcomi, athlon_fcom_load_k8, athlon_fcom): Added amdfam10. * config/i386/i386.md (x86_sahf_1, cmpfp_i_mixed, cmpfp_i_sse, cmpfp_i_i387, cmpfp_iu_mixed, cmpfp_iu_sse, cmpfp_iu_387, swapsi, swaphi_1, swapqi_1, swapdi_rex64, fix_truncsfdi_sse, fix_truncdfdi_sse, fix_truncsfsi_sse, fix_truncdfsi_sse, x86_fldcw_1, floatsisf2_mixed, floatsisf2_sse, floatdisf2_mixed, floatdisf2_sse, floatsidf2_mixed, floatsidf2_sse, floatdidf2_mixed, floatdidf2_sse, muldi3_1_rex64, mulsi3_1, mulsi3_1_zext, mulhi3_1, mulqi3_1, umulqihi3_1, mulqihi3_insn, umulditi3_insn, umulsidi3_insn, mulditi3_insn, mulsidi3_insn, umuldi3_highpart_rex64, umulsi3_highpart_insn, umulsi3_highpart_zext, smuldi3_highpart_rex64, smulsi3_highpart_insn, smulsi3_highpart_zext, x86_64_shld, x86_shld_1, x86_64_shrd, sqrtsf2_mixed, sqrtsf2_sse, sqrtsf2_i387, sqrtdf2_mixed, sqrtdf2_sse, sqrtdf2_i387, sqrtextendsfdf2_i387, sqrtxf2, sqrtextendsfxf2_i387, sqrtextenddfxf2_i387): Added amdfam10_decode. * config/i386/athlon.md (athlon_idirect_amdfam10, athlon_ivector_amdfam10, athlon_idirect_load_amdfam10, athlon_ivector_load_amdfam10, athlon_idirect_both_amdfam10, athlon_ivector_both_amdfam10, athlon_idirect_store_amdfam10, athlon_ivector_store_amdfam10): New define_insn_reservation. (athlon_idirect_loadmov, athlon_idirect_movstore): Added amdfam10. * config/i386/athlon.md (athlon_call_amdfam10, athlon_pop_amdfam10, athlon_lea_amdfam10): New define_insn_reservation. (athlon_branch, athlon_push, athlon_leave_k8, athlon_imul_k8, athlon_imul_k8_DI, athlon_imul_mem_k8, athlon_imul_mem_k8_DI, athlon_idiv, athlon_idiv_mem, athlon_str): Added amdfam10. * config/i386/athlon.md (athlon_sseld_amdfam10, athlon_mmxld_amdfam10, athlon_ssest_amdfam10, athlon_mmxssest_short_amdfam10): New define_insn_reservation. * config/i386/athlon.md (athlon_sseins_amdfam10): New define_insn_reservation. * config/i386/i386.md (sseins): Added sseins to define_attr type and define_attr unit. * config/i386/sse.md: Set type attribute to sseins for insertq and insertqi. * config/i386/athlon.md (sselog_load_amdfam10, sselog_amdfam10, ssecmpvector_load_amdfam10, ssecmpvector_amdfam10, ssecomi_load_amdfam10, ssecomi_amdfam10, sseaddvector_load_amdfam10, sseaddvector_amdfam10): New define_insn_reservation. (ssecmp_load_k8, ssecmp, sseadd_load_k8, seadd): Added amdfam10. * config/i386/athlon.md (cvtss2sd_load_amdfam10, cvtss2sd_amdfam10, cvtps2pd_load_amdfam10, cvtps2pd_amdfam10, cvtsi2sd_load_amdfam10, cvtsi2ss_load_amdfam10, cvtsi2sd_amdfam10, cvtsi2ss_amdfam10, cvtsd2ss_load_amdfam10, cvtsd2ss_amdfam10, cvtpd2ps_load_amdfam10, cvtpd2ps_amdfam10, cvtsX2si_load_amdfam10, cvtsX2si_amdfam10): New define_insn_reservation. * config/i386/sse.md (cvtsi2ss, cvtsi2ssq, cvtss2si, cvtss2siq, cvttss2si, cvttss2siq, cvtsi2sd, cvtsi2sdq, cvtsd2si, cvtsd2siq, cvttsd2si, cvttsd2siq, cvtpd2dq, cvttpd2dq, cvtsd2ss, cvtss2sd, cvtpd2ps, cvtps2pd): Added amdfam10_decode attribute. * config/i386/athlon.md (athlon_ssedivvector_amdfam10, athlon_ssedivvector_load_amdfam10, athlon_ssemulvector_amdfam10, athlon_ssemulvector_load_amdfam10): New define_insn_reservation. (athlon_ssediv, athlon_ssediv_load_k8, athlon_ssemul, athlon_ssemul_load_k8): Added amdfam10. * config/i386/i386.h (TARGET_SSE_UNALIGNED_MOVE_OPTIMAL): New macro. (x86_sse_unaligned_move_optimal): New variable. * config/i386/i386.c (x86_sse_unaligned_move_optimal): Enable for m_AMDFAM10. (ix86_expand_vector_move_misalign): Add code to generate movupd/movups for unaligned vector SSE double/single precision loads for AMDFAM10. * config/i386/i386.h (TARGET_AMDFAM10): New macro. (TARGET_CPU_CPP_BUILTINS): Add code for amdfam10. Define TARGET_CPU_DEFAULT_amdfam10. (TARGET_CPU_DEFAULT_NAMES): Add amdfam10. (processor_type): Add PROCESSOR_AMDFAM10. * config/i386/i386.md: Add amdfam10 as a new cpu attribute to match processor_type in config/i386/i386.h. Enable imul peepholes for TARGET_AMDFAM10. * config.gcc: Add support for --with-cpu option for amdfam10. * config/i386/i386.c (amdfam10_cost): New variable. (m_AMDFAM10): New macro. (m_ATHLON_K8_AMDFAM10): New macro. (x86_use_leave, x86_push_memory, x86_movx, x86_unroll_strlen, x86_cmove, x86_3dnow_a, x86_deep_branch, x86_use_simode_fiop, x86_promote_QImode, x86_integer_DFmode_moves, x86_partial_reg_dependency, x86_memory_mismatch_stall, x86_accumulate_outgoing_args, x86_arch_always_fancy_math_387, x86_sse_partial_reg_dependency, x86_sse_typeless_stores, x86_use_ffreep, x86_use_incdec, x86_four_jump_limit, x86_schedule, x86_use_bt, x86_cmpxchg16b, x86_pad_returns): Enable/disable for amdfam10. (override_options): Add amdfam10_cost to processor_target_table. Set up PROCESSOR_AMDFAM10 for amdfam10 entry in processor_alias_table. (ix86_issue_rate): Add PROCESSOR_AMDFAM10. (ix86_adjust_cost): Add code for amdfam10. * config/i386/i386.opt: Add new Advanced Bit Manipulation (-mabm) instruction set feature flag. Add new (-mpopcnt) flag for popcnt instruction. Add new SSE4A (-msse4a) instruction set feature flag. * config/i386/i386.h: Add builtin definition for SSE4A. * config/i386/i386.md: Add support for ABM instructions (popcnt and lzcnt). * config/i386/sse.md: Add support for SSE4A instructions (movntss, movntsd, extrq, insertq). * config/i386/i386.c: Add support for ABM and SSE4A builtins. Add -march=amdfam10 flag. * config/i386/ammintrin.h: Add support for SSE4A intrinsics. * doc/invoke.texi: Add documentation on flags for sse4a, abm, popcnt and amdfam10. * doc/extend.texi: Add documentation for SSE4A builtins. 2007-02-05 Dwarakanath Rajagopal * gcc.dg/i386-cpuid.h: Test whether SSE4A is supported for running tests. * gcc.target/i386/sse4a-extract.c: New test. * gcc.target/i386/sse4a-insert.c: New test. * gcc.target/i386/sse4a-montsd.c: New test. * gcc.target/i386/sse4a-montss.c: New test. 2006-12-15 H.J. Lu * gcc.dg/i386-cpuid.h (bit_SSSE3): New. 2006-11-30 H.J. Lu * gcc.dg/i386-cpuid.h (bit_SSE3): New. (i386_get_cpuid): New function. (i386_cpuid_ecx): Likewise. (i386_cpuid_edx): Likewise. (i386_cpuid): Updated to call i386_cpuid_edx. --- gcc/doc/extend.texi.jj 2007-02-09 16:18:25.000000000 +0100 +++ gcc/doc/extend.texi 2007-02-09 21:26:06.000000000 +0100 @@ -6931,6 +6931,23 @@ v4si __builtin_ia32_pabsd128 (v4si) v8hi __builtin_ia32_pabsw128 (v8hi) @end smallexample +The following built-in functions are available when @option{-msse4a} is used. + +@smallexample +void _mm_stream_sd (double*,__m128d); +Generates the @code{movntsd} machine instruction. +void _mm_stream_ss (float*,__m128); +Generates the @code{movntss} machine instruction. +__m128i _mm_extract_si64 (__m128i, __m128i); +Generates the @code{extrq} machine instruction with only SSE register operands. +__m128i _mm_extracti_si64 (__m128i, int, int); +Generates the @code{extrq} machine instruction with SSE register and immediate operands. +__m128i _mm_insert_si64 (__m128i, __m128i); +Generates the @code{insertq} machine instruction with only SSE register operands. +__m128i _mm_inserti_si64 (__m128i, __m128i, int, int); +Generates the @code{insertq} machine instruction with SSE register and immediate operands. +@end smallexample + The following built-in functions are available when @option{-m3dnow} is used. All of them generate the machine instruction that is part of the name. --- gcc/doc/invoke.texi.jj 2007-02-09 16:18:25.000000000 +0100 +++ gcc/doc/invoke.texi 2007-02-09 21:56:44.000000000 +0100 @@ -522,7 +522,7 @@ Objective-C and Objective-C++ Dialects}. -mno-fp-ret-in-387 -msoft-float -msvr3-shlib @gol -mno-wide-multiply -mrtd -malign-double @gol -mpreferred-stack-boundary=@var{num} @gol --mmmx -msse -msse2 -msse3 -mssse3 -m3dnow @gol +-mmmx -msse -msse2 -msse3 -mssse3 -msse4a -m3dnow -mpopcnt -mabm @gol -mthreads -mno-align-stringops -minline-all-stringops @gol -mpush-args -maccumulate-outgoing-args -m128bit-long-double @gol -m96bit-long-double -mregparm=@var{num} -msseregparm @gol @@ -9062,6 +9062,10 @@ instruction set support. @item k8, opteron, athlon64, athlon-fx AMD K8 core based CPUs with x86-64 instruction set support. (This supersets MMX, SSE, SSE2, 3dNOW!, enhanced 3dNOW! and 64-bit instruction set extensions.) +@item amdfam10 +AMD Family 10 core based CPUs with x86-64 instruction set support. (This +supersets MMX, SSE, SSE2, SSE3, SSE4A, 3dNOW!, enhanced 3dNOW!, ABM and 64-bit +instruction set extensions.) @item winchip-c6 IDT Winchip C6 CPU, dealt in same way as i486 with additional MMX instruction set support. @@ -9339,8 +9343,14 @@ preferred alignment to @option{-mpreferr @itemx -mno-sse3 @item -mssse3 @itemx -mno-ssse3 +@item -msse4a +@item -mno-sse4a @item -m3dnow @itemx -mno-3dnow +@item -mpopcnt +@itemx -mno-popcnt +@item -mabm +@itemx -mno-abm @opindex mmmx @opindex mno-mmx @opindex msse --- gcc/testsuite/gcc.target/i386/sse4a-insert.c.jj 2007-02-09 21:26:06.000000000 +0100 +++ gcc/testsuite/gcc.target/i386/sse4a-insert.c 2007-02-09 21:26:06.000000000 +0100 @@ -0,0 +1,110 @@ +/* { dg-do run { target i?86-*-* x86_64-*-* } } */ +/* { dg-options "-O2 -msse4a" } */ +#include +#include +#include "../../gcc.dg/i386-cpuid.h" + +static void sse4a_test (void); + +typedef union +{ + long long i[2]; + __m128i vec; +} LI; + +int +main () +{ + unsigned long cpu_facilities; + + cpu_facilities = i386_extended_cpuid_ecx (); + + /* Run SSE4a test only if host has SSE4a support. */ + if ((cpu_facilities & bit_SSE4a)) + sse4a_test (); + + exit (0); +} + +static long long +sse4a_test_insert (long long in1, long long in2) +{ + __m128i v1,v2; + long long index_length, pad; + LI v_out; + index_length = 0x0000000000000810LL; + pad = 0x0; + v1 = _mm_set_epi64x (pad, in1); + v2 = _mm_set_epi64x (index_length, in2); + v_out.vec = _mm_insert_si64 (v1, v2); + return (v_out.i[0]); +} + +static long long +sse4a_test_inserti (long long in1, long long in2) +{ + __m128i v1,v2; + long long pad = 0x0; + LI v_out; + v1 = _mm_set_epi64x (pad, in1); + v2 = _mm_set_epi64x (pad, in2); + v_out.vec = _mm_inserti_si64 (v1, v2, (unsigned int) 0x10, (unsigned int) 0x08); + return (v_out.i[0]); +} + +static chk (long long i1, long long i2) +{ + int n_fails =0; + if (i1 != i2) + n_fails +=1; + return n_fails; +} + +long long vals_in1[5] = + { + 0x1234567887654321LL, + 0x1456782093002490LL, + 0x2340909123990390LL, + 0x9595959599595999LL, + 0x9099038798000029LL + }; + +long long vals_in2[5] = + { + 0x9ABCDEF00FEDCBA9LL, + 0x234567097289672ALL, + 0x45476453097BD342LL, + 0x23569012AE586FF0LL, + 0x432567ABCDEF765DLL + }; + +long long vals_out[5] = + { + 0x1234567887CBA921LL, + 0x1456782093672A90LL, + 0x2340909123D34290LL, + 0x95959595996FF099LL, + 0x9099038798765D29LL + }; + +static void +sse4a_test (void) +{ + int i; + int fail = 0; + long long out; + + for (i = 0; i < 5; i += 1) + { + out = sse4a_test_insert (vals_in1[i], vals_in2[i]); + fail += chk(out, vals_out[i]); + + out = sse4a_test_inserti (vals_in1[i], vals_in2[i]); + fail += chk(out, vals_out[i]); + } + + if (fail != 0) + abort (); + + exit (0); +} --- gcc/testsuite/gcc.target/i386/sse4a-extract.c.jj 2007-02-09 21:26:06.000000000 +0100 +++ gcc/testsuite/gcc.target/i386/sse4a-extract.c 2007-02-09 21:26:06.000000000 +0100 @@ -0,0 +1,100 @@ +/* { dg-do run { target i?86-*-* x86_64-*-* } } */ +/* { dg-options "-O2 -msse4a" } */ +#include +#include +#include "../../gcc.dg/i386-cpuid.h" + +static void sse4a_test (void); + +typedef union +{ + long long i[2]; + __m128i vec; +} LI; + +int +main () +{ + unsigned long cpu_facilities; + + cpu_facilities = i386_extended_cpuid_ecx (); + + /* Run SSE4a test only if host has SSE4a support. */ + if ((cpu_facilities & bit_SSE4a)) + sse4a_test (); + + exit (0); +} + +static long long +sse4a_test_extrq (long long in) +{ + __m128i v1, v2; + long long index_length, pad; + LI v_out; + index_length = 0x0000000000000810LL; + pad = 0x0; + v1 = _mm_set_epi64x (pad, in); + v2 = _mm_set_epi64x (pad, index_length); + v_out.vec = _mm_extract_si64 (v1, v2); + return (v_out.i[0]); +} + +static long long +sse4a_test_extrqi (long long in) +{ + __m128i v1; + long long pad =0x0; + LI v_out; + v1 = _mm_set_epi64x (pad, in); + v_out.vec = _mm_extracti_si64 (v1, (unsigned int) 0x10,(unsigned int) 0x08); + return (v_out.i[0]); +} + +static chk (long long i1, long long i2) +{ + int n_fails =0; + if (i1 != i2) + n_fails +=1; + return n_fails; +} + +long long vals_in[5] = + { + 0x1234567887654321LL, + 0x1456782093002490LL, + 0x2340909123990390LL, + 0x9595959599595999LL, + 0x9099038798000029LL + }; + +long long vals_out[5] = + { + 0x0000000000006543LL, + 0x0000000000000024LL, + 0x0000000000009903LL, + 0x0000000000005959LL, + 0x0000000000000000LL + }; + +static void +sse4a_test (void) +{ + int i; + int fail = 0; + long long out; + + for (i = 0; i < 5; i += 1) + { + out = sse4a_test_extrq (vals_in[i]); + fail += chk(out, vals_out[i]); + + out = sse4a_test_extrqi (vals_in[i]); + fail += chk(out, vals_out[i]); + } + + if (fail != 0) + abort (); + + exit (0); +} --- gcc/testsuite/gcc.target/i386/sse4a-montss.c.jj 2007-02-09 21:26:06.000000000 +0100 +++ gcc/testsuite/gcc.target/i386/sse4a-montss.c 2007-02-09 21:26:06.000000000 +0100 @@ -0,0 +1,64 @@ +/* { dg-do run { target i?86-*-* x86_64-*-* } } */ +/* { dg-options "-O2 -msse4a" } */ +#include +#include +#include "../../gcc.dg/i386-cpuid.h" + +static void sse4a_test (void); + +int +main () +{ + unsigned long cpu_facilities; + + cpu_facilities = i386_extended_cpuid_ecx (); + + /* Run SSE4a test only if host has SSE4a support. */ + if ((cpu_facilities & bit_SSE4a)) + sse4a_test (); + + exit (0); +} + +static void +sse4a_test_movntss (float *out, float *in) +{ + __m128 in_v4sf = _mm_load_ss (in); + _mm_stream_ss (out, in_v4sf); +} + +static int +chk_ss (float *v1, float *v2) +{ + int n_fails = 0; + if (v1[0] != v2[0]) + n_fails += 1; + return n_fails; +} + +float vals[10] = + { + 100.0, 200.0, 300.0, 400.0, 5.0, + -1.0, .345, -21.5, 9.32, 8.41 + }; + +static void +sse4a_test (void) +{ + int i; + int fail = 0; + float *out; + + out = (float *) malloc (sizeof (float)); + for (i = 0; i < 10; i += 1) + { + sse4a_test_movntss (out, &vals[i]); + + fail += chk_ss (out, &vals[i]); + } + + if (fail != 0) + abort (); + + exit (0); +} --- gcc/testsuite/gcc.target/i386/sse4a-montsd.c.jj 2007-02-09 21:26:06.000000000 +0100 +++ gcc/testsuite/gcc.target/i386/sse4a-montsd.c 2007-02-09 21:26:06.000000000 +0100 @@ -0,0 +1,64 @@ +/* { dg-do run { target i?86-*-* x86_64-*-* } } */ +/* { dg-options "-O2 -msse4a" } */ +#include +#include +#include "../../gcc.dg/i386-cpuid.h" + +static void sse4a_test (void); + +int +main () +{ + unsigned long cpu_facilities; + + cpu_facilities = i386_extended_cpuid_ecx (); + + /* Run SSE4a test only if host has SSE4a support. */ + if ((cpu_facilities & bit_SSE4a)) + sse4a_test (); + + exit (0); +} + +static void +sse4a_test_movntsd (double *out, double *in) +{ + __m128d in_v2df = _mm_load_sd (in); + _mm_stream_sd (out, in_v2df); +} + +static int +chk_sd (double *v1, double *v2) +{ + int n_fails = 0; + if (v1[0] != v2[0]) + n_fails += 1; + return n_fails; +} + +double vals[10] = + { + 100.0, 200.0, 300.0, 400.0, 5.0, + -1.0, .345, -21.5, 9.32, 8.41 + }; + +static void +sse4a_test (void) +{ + int i; + int fail = 0; + double *out; + + out = (double *) malloc (sizeof (double)); + for (i = 0; i < 10; i += 1) + { + sse4a_test_movntsd (out, &vals[i]); + + fail += chk_sd (out, &vals[i]); + } + + if (fail != 0) + abort (); + + exit (0); +} --- gcc/testsuite/gcc.dg/i386-cpuid.h.jj 2006-10-05 00:26:53.000000000 +0200 +++ gcc/testsuite/gcc.dg/i386-cpuid.h 2007-02-07 13:07:08.000000000 +0100 @@ -2,23 +2,32 @@ Used by 20020523-2.c and i386-sse-6.c, and possibly others. */ /* Plagarized from 20020523-2.c. */ +/* %ecx */ +#define bit_SSE3 (1 << 0) +#define bit_SSSE3 (1 << 9) + +/* %edx */ #define bit_CMOV (1 << 15) #define bit_MMX (1 << 23) #define bit_SSE (1 << 25) #define bit_SSE2 (1 << 26) +/* Extended Features */ +/* %ecx */ +#define bit_SSE4a (1 << 6) + #ifndef NOINLINE #define NOINLINE __attribute__ ((noinline)) #endif -unsigned int i386_cpuid (void) NOINLINE; - -unsigned int NOINLINE -i386_cpuid (void) +static inline unsigned int +i386_get_cpuid (unsigned int *ecx, unsigned int *edx) { - int fl1, fl2; + int fl1; #ifndef __x86_64__ + int fl2; + /* See if we can use cpuid. On AMD64 we always can. */ __asm__ ("pushfl; pushfl; popl %0; movl %0,%1; xorl %2,%0;" "pushl %0; popfl; pushfl; popl %0; popfl" @@ -42,15 +51,99 @@ i386_cpuid (void) if (fl1 == 0) return (0); - /* Invoke CPUID(1), return %edx; caller can examine bits to + /* Invoke CPUID(1), return %ecx and %edx; caller can examine bits to determine what's supported. */ #ifdef __x86_64__ - __asm__ ("pushq %%rcx; pushq %%rbx; cpuid; popq %%rbx; popq %%rcx" - : "=d" (fl2), "=a" (fl1) : "1" (1) : "cc"); + __asm__ ("pushq %%rbx; cpuid; popq %%rbx" + : "=c" (*ecx), "=d" (*edx), "=a" (fl1) : "2" (1) : "cc"); #else - __asm__ ("pushl %%ecx; pushl %%ebx; cpuid; popl %%ebx; popl %%ecx" - : "=d" (fl2), "=a" (fl1) : "1" (1) : "cc"); + __asm__ ("pushl %%ebx; cpuid; popl %%ebx" + : "=c" (*ecx), "=d" (*edx), "=a" (fl1) : "2" (1) : "cc"); +#endif + + return 1; +} + +static inline unsigned int +i386_get_extended_cpuid (unsigned int *ecx, unsigned int *edx) +{ + int fl1; + if (!(i386_get_cpuid (ecx, edx))) + return 0; + + /* Invoke CPUID(0x80000000) to get the highest supported extended function + number */ +#ifdef __x86_64__ + __asm__ ("cpuid" + : "=a" (fl1) : "0" (0x80000000) : "edx", "ecx", "ebx"); +#else + __asm__ ("pushl %%ebx; cpuid; popl %%ebx" + : "=a" (fl1) : "0" (0x80000000) : "edx", "ecx"); +#endif + /* Check if highest supported extended function used below are supported */ + if (fl1 < 0x80000001) + return 0; + + /* Invoke CPUID(0x80000001), return %ecx and %edx; caller can examine bits to + determine what's supported. */ +#ifdef __x86_64__ + __asm__ ("cpuid" + : "=c" (*ecx), "=d" (*edx), "=a" (fl1) : "2" (0x80000001) : "ebx"); +#else + __asm__ ("pushl %%ebx; cpuid; popl %%ebx" + : "=c" (*ecx), "=d" (*edx), "=a" (fl1) : "2" (0x80000001)); #endif + return 1; +} + + +unsigned int i386_cpuid_ecx (void) NOINLINE; +unsigned int i386_cpuid_edx (void) NOINLINE; +unsigned int i386_extended_cpuid_ecx (void) NOINLINE; +unsigned int i386_extended_cpuid_edx (void) NOINLINE; + +unsigned int NOINLINE +i386_cpuid_ecx (void) +{ + unsigned int ecx, edx; + if (i386_get_cpuid (&ecx, &edx)) + return ecx; + else + return 0; +} + +unsigned int NOINLINE +i386_cpuid_edx (void) +{ + unsigned int ecx, edx; + if (i386_get_cpuid (&ecx, &edx)) + return edx; + else + return 0; +} - return fl2; +unsigned int NOINLINE +i386_extended_cpuid_ecx (void) +{ + unsigned int ecx, edx; + if (i386_get_extended_cpuid (&ecx, &edx)) + return ecx; + else + return 0; +} + +unsigned int NOINLINE +i386_extended_cpuid_edx (void) +{ + unsigned int ecx, edx; + if (i386_get_extended_cpuid (&ecx, &edx)) + return edx; + else + return 0; +} + +static inline unsigned int +i386_cpuid (void) +{ + return i386_cpuid_edx (); } --- gcc/config.gcc.jj 2007-02-09 16:18:25.000000000 +0100 +++ gcc/config.gcc 2007-02-09 21:26:06.000000000 +0100 @@ -264,12 +264,12 @@ xscale-*-*) i[34567]86-*-*) cpu_type=i386 extra_headers="mmintrin.h mm3dnow.h xmmintrin.h emmintrin.h - pmmintrin.h tmmintrin.h" + pmmintrin.h tmmintrin.h ammintrin.h" ;; x86_64-*-*) cpu_type=i386 extra_headers="mmintrin.h mm3dnow.h xmmintrin.h emmintrin.h - pmmintrin.h tmmintrin.h" + pmmintrin.h tmmintrin.h ammintrin.h" need_64bit_hwint=yes ;; ia64-*-*) @@ -2396,6 +2396,9 @@ if test x$with_cpu = x ; then ;; i686-*-* | i786-*-*) case ${target_noncanonical} in + amdfam10-*) + with_cpu=amdfam10 + ;; k8-*|opteron-*|athlon_64-*) with_cpu=k8 ;; @@ -2436,6 +2439,9 @@ if test x$with_cpu = x ; then ;; x86_64-*-*) case ${target_noncanonical} in + amdfam10-*) + with_cpu=amdfam10 + ;; k8-*|opteron-*|athlon_64-*) with_cpu=k8 ;; @@ -2668,7 +2674,7 @@ case "${target}" in esac # OK ;; - "" | k8 | opteron | athlon64 | athlon-fx | nocona | core2 | generic) + "" | amdfam10 | k8 | opteron | athlon64 | athlon-fx | nocona | core2 | generic) # OK ;; *) --- gcc/config/i386/i386.h.jj 2007-02-09 16:18:25.000000000 +0100 +++ gcc/config/i386/i386.h 2007-02-09 21:29:00.000000000 +0100 @@ -141,6 +141,7 @@ extern const struct processor_costs *ix8 #define TARGET_GENERIC32 (ix86_tune == PROCESSOR_GENERIC32) #define TARGET_GENERIC64 (ix86_tune == PROCESSOR_GENERIC64) #define TARGET_GENERIC (TARGET_GENERIC32 || TARGET_GENERIC64) +#define TARGET_AMDFAM10 (ix86_tune == PROCESSOR_AMDFAM10) #define TUNEMASK (1 << ix86_tune) extern const int x86_use_leave, x86_push_memory, x86_zero_extend_with_and; @@ -159,6 +160,7 @@ extern const int x86_accumulate_outgoing extern const int x86_epilogue_using_move, x86_decompose_lea; extern const int x86_arch_always_fancy_math_387, x86_shift1; extern const int x86_sse_partial_reg_dependency, x86_sse_split_regs; +extern const int x86_sse_unaligned_move_optimal; extern const int x86_sse_typeless_stores, x86_sse_load0_by_pxor; extern const int x86_use_ffreep; extern const int x86_inter_unit_moves, x86_schedule; @@ -208,6 +210,8 @@ extern int x86_prefetch_sse, x86_cmpxchg #define TARGET_PARTIAL_REG_DEPENDENCY (x86_partial_reg_dependency & TUNEMASK) #define TARGET_SSE_PARTIAL_REG_DEPENDENCY \ (x86_sse_partial_reg_dependency & TUNEMASK) +#define TARGET_SSE_UNALIGNED_MOVE_OPTIMAL \ + (x86_sse_unaligned_move_optimal & TUNEMASK) #define TARGET_SSE_SPLIT_REGS (x86_sse_split_regs & TUNEMASK) #define TARGET_SSE_TYPELESS_STORES (x86_sse_typeless_stores & TUNEMASK) #define TARGET_SSE_LOAD0_BY_PXOR (x86_sse_load0_by_pxor & TUNEMASK) @@ -376,6 +380,8 @@ extern int x86_prefetch_sse, x86_cmpxchg } \ else if (TARGET_K8) \ builtin_define ("__tune_k8__"); \ + else if (TARGET_AMDFAM10) \ + builtin_define ("__tune_amdfam10__"); \ else if (TARGET_PENTIUM4) \ builtin_define ("__tune_pentium4__"); \ else if (TARGET_NOCONA) \ @@ -400,6 +406,8 @@ extern int x86_prefetch_sse, x86_cmpxchg builtin_define ("__SSSE3__"); \ builtin_define ("__MNI__"); \ } \ + if (TARGET_SSE4A) \ + builtin_define ("__SSE4A__"); \ if (TARGET_SSE_MATH && TARGET_SSE) \ builtin_define ("__SSE_MATH__"); \ if (TARGET_SSE_MATH && TARGET_SSE2) \ @@ -455,6 +463,11 @@ extern int x86_prefetch_sse, x86_cmpxchg builtin_define ("__k8"); \ builtin_define ("__k8__"); \ } \ + else if (ix86_arch == PROCESSOR_AMDFAM10) \ + { \ + builtin_define ("__amdfam10"); \ + builtin_define ("__amdfam10__"); \ + } \ else if (ix86_arch == PROCESSOR_PENTIUM4) \ { \ builtin_define ("__pentium4"); \ @@ -493,13 +506,14 @@ extern int x86_prefetch_sse, x86_cmpxchg #define TARGET_CPU_DEFAULT_nocona 17 #define TARGET_CPU_DEFAULT_core2 18 #define TARGET_CPU_DEFAULT_generic 19 +#define TARGET_CPU_DEFAULT_amdfam10 20 #define TARGET_CPU_DEFAULT_NAMES {"i386", "i486", "pentium", "pentium-mmx",\ "pentiumpro", "pentium2", "pentium3", \ "pentium4", "geode", "k6", "k6-2", "k6-3", \ "athlon", "athlon-4", "k8", \ "pentium-m", "prescott", "nocona", \ - "core2", "generic"} + "core2", "generic", "amdfam10"} #ifndef CC1_SPEC #define CC1_SPEC "%(cc1_cpu) " @@ -2162,6 +2176,7 @@ enum processor_type PROCESSOR_CORE2, PROCESSOR_GENERIC32, PROCESSOR_GENERIC64, + PROCESSOR_AMDFAM10, PROCESSOR_max }; --- gcc/config/i386/i386.md.jj 2007-02-09 16:18:25.000000000 +0100 +++ gcc/config/i386/i386.md 2007-02-10 19:33:43.000000000 +0100 @@ -151,6 +151,12 @@ (UNSPEC_PSHUFB 120) (UNSPEC_PSIGN 121) (UNSPEC_PALIGNR 122) + + ; For SSE4A support + (UNSPEC_EXTRQI 130) + (UNSPEC_EXTRQ 131) + (UNSPEC_INSERTQI 132) + (UNSPEC_INSERTQ 133) ]) (define_constants @@ -190,7 +196,8 @@ ;; Processor type. This attribute must exactly match the processor_type ;; enumeration in i386.h. -(define_attr "cpu" "i386,i486,pentium,pentiumpro,geode,k6,athlon,pentium4,k8,nocona,core2,generic32,generic64" +(define_attr "cpu" "i386,i486,pentium,pentiumpro,geode,k6,athlon,pentium4,k8, + nocona,core2,generic32,generic64,amdfam10" (const (symbol_ref "ix86_tune"))) ;; A basic instruction type. Refinements due to arguments to be @@ -201,10 +208,10 @@ incdec,ishift,ishift1,rotate,rotate1,imul,idiv, icmp,test,ibr,setcc,icmov, push,pop,call,callv,leave, - str,cld, + str,bitmanip,cld, fmov,fop,fsgn,fmul,fdiv,fpspc,fcmov,fcmp,fxch,fistp,fisttp,frndint, sselog,sselog1,sseiadd,sseishft,sseimul, - sse,ssemov,sseadd,ssemul,ssecmp,ssecomi,ssecvt,sseicvt,ssediv, + sse,ssemov,sseadd,ssemul,ssecmp,ssecomi,ssecvt,sseicvt,ssediv,sseins, mmx,mmxmov,mmxadd,mmxmul,mmxcmp,mmxcvt,mmxshft" (const_string "other")) @@ -218,7 +225,7 @@ (cond [(eq_attr "type" "fmov,fop,fsgn,fmul,fdiv,fpspc,fcmov,fcmp,fxch,fistp,fisttp,frndint") (const_string "i387") (eq_attr "type" "sselog,sselog1,sseiadd,sseishft,sseimul, - sse,ssemov,sseadd,ssemul,ssecmp,ssecomi,ssecvt,sseicvt,ssediv") + sse,ssemov,sseadd,ssemul,ssecmp,ssecomi,ssecvt,sseicvt,ssediv,sseins") (const_string "sse") (eq_attr "type" "mmx,mmxmov,mmxadd,mmxmul,mmxcmp,mmxcvt,mmxshft") (const_string "mmx") @@ -228,7 +235,8 @@ ;; The (bounding maximum) length of an instruction immediate. (define_attr "length_immediate" "" - (cond [(eq_attr "type" "incdec,setcc,icmov,str,cld,lea,other,multi,idiv,leave") + (cond [(eq_attr "type" "incdec,setcc,icmov,str,cld,lea,other,multi,idiv,leave, + bitmanip") (const_int 0) (eq_attr "unit" "i387,sse,mmx") (const_int 0) @@ -282,7 +290,7 @@ ;; Set when 0f opcode prefix is used. (define_attr "prefix_0f" "" (if_then_else - (ior (eq_attr "type" "imovx,setcc,icmov") + (ior (eq_attr "type" "imovx,setcc,icmov,bitmanip") (eq_attr "unit" "sse,mmx")) (const_int 1) (const_int 0))) @@ -407,7 +415,7 @@ (const_string "load") (and (eq_attr "type" "!alu1,negnot,ishift1, - imov,imovx,icmp,test, + imov,imovx,icmp,test,bitmanip, fmov,fcmp,fsgn, sse,ssemov,ssecmp,ssecomi,ssecvt,sseicvt,sselog1, mmx,mmxmov,mmxcmp,mmxcvt") @@ -961,10 +969,11 @@ "sahf" [(set_attr "length" "1") (set_attr "athlon_decode" "vector") + (set_attr "amdfam10_decode" "direct") (set_attr "mode" "SI")]) ;; Pentium Pro can do steps 1 through 3 in one go. - +;; comi*, ucomi*, fcomi*, ficomi*,fucomi* (i387 instructions set condition codes) (define_insn "*cmpfp_i_mixed" [(set (reg:CCFP FLAGS_REG) (compare:CCFP (match_operand 0 "register_operand" "f#x,x#f") @@ -978,7 +987,8 @@ (if_then_else (match_operand:SF 1 "" "") (const_string "SF") (const_string "DF"))) - (set_attr "athlon_decode" "vector")]) + (set_attr "athlon_decode" "vector") + (set_attr "amdfam10_decode" "direct")]) (define_insn "*cmpfp_i_sse" [(set (reg:CCFP FLAGS_REG) @@ -993,7 +1003,8 @@ (if_then_else (match_operand:SF 1 "" "") (const_string "SF") (const_string "DF"))) - (set_attr "athlon_decode" "vector")]) + (set_attr "athlon_decode" "vector") + (set_attr "amdfam10_decode" "direct")]) (define_insn "*cmpfp_i_i387" [(set (reg:CCFP FLAGS_REG) @@ -1012,7 +1023,8 @@ (const_string "DF") ] (const_string "XF"))) - (set_attr "athlon_decode" "vector")]) + (set_attr "athlon_decode" "vector") + (set_attr "amdfam10_decode" "direct")]) (define_insn "*cmpfp_iu_mixed" [(set (reg:CCFPU FLAGS_REG) @@ -1027,7 +1039,8 @@ (if_then_else (match_operand:SF 1 "" "") (const_string "SF") (const_string "DF"))) - (set_attr "athlon_decode" "vector")]) + (set_attr "athlon_decode" "vector") + (set_attr "amdfam10_decode" "direct")]) (define_insn "*cmpfp_iu_sse" [(set (reg:CCFPU FLAGS_REG) @@ -1042,7 +1055,8 @@ (if_then_else (match_operand:SF 1 "" "") (const_string "SF") (const_string "DF"))) - (set_attr "athlon_decode" "vector")]) + (set_attr "athlon_decode" "vector") + (set_attr "amdfam10_decode" "direct")]) (define_insn "*cmpfp_iu_387" [(set (reg:CCFPU FLAGS_REG) @@ -1061,7 +1075,8 @@ (const_string "DF") ] (const_string "XF"))) - (set_attr "athlon_decode" "vector")]) + (set_attr "athlon_decode" "vector") + (set_attr "amdfam10_decode" "direct")]) ;; Move instructions. @@ -1267,7 +1282,8 @@ [(set_attr "type" "imov") (set_attr "mode" "SI") (set_attr "pent_pair" "np") - (set_attr "athlon_decode" "vector")]) + (set_attr "athlon_decode" "vector") + (set_attr "amdfam10_decode" "double")]) (define_expand "movhi" [(set (match_operand:HI 0 "nonimmediate_operand" "") @@ -1384,8 +1400,10 @@ [(set_attr "type" "imov") (set_attr "mode" "SI") (set_attr "pent_pair" "np") - (set_attr "athlon_decode" "vector")]) + (set_attr "athlon_decode" "vector") + (set_attr "amdfam10_decode" "double")]) +;; Not added amdfam10_decode since TARGET_PARTIAL_REG_STALL is disabled for AMDFAM10 (define_insn "*swaphi_2" [(set (match_operand:HI 0 "register_operand" "+r") (match_operand:HI 1 "register_operand" "+r")) @@ -1558,8 +1576,10 @@ [(set_attr "type" "imov") (set_attr "mode" "SI") (set_attr "pent_pair" "np") - (set_attr "athlon_decode" "vector")]) + (set_attr "athlon_decode" "vector") + (set_attr "amdfam10_decode" "vector")]) +;; Not added amdfam10_decode since TARGET_PARTIAL_REG_STALL is disabled for AMDFAM10 (define_insn "*swapqi_2" [(set (match_operand:QI 0 "register_operand" "+q") (match_operand:QI 1 "register_operand" "+q")) @@ -2113,7 +2133,8 @@ [(set_attr "type" "imov") (set_attr "mode" "DI") (set_attr "pent_pair" "np") - (set_attr "athlon_decode" "vector")]) + (set_attr "athlon_decode" "vector") + (set_attr "amdfam10_decode" "double")]) (define_expand "movti" [(set (match_operand:TI 0 "nonimmediate_operand" "") @@ -4122,7 +4143,8 @@ "cvttss2si{q}\t{%1, %0|%0, %1}" [(set_attr "type" "sseicvt") (set_attr "mode" "SF") - (set_attr "athlon_decode" "double,vector")]) + (set_attr "athlon_decode" "double,vector") + (set_attr "amdfam10_decode" "double,double")]) (define_insn "fix_truncdfdi_sse" [(set (match_operand:DI 0 "register_operand" "=r,r") @@ -4131,7 +4153,8 @@ "cvttsd2si{q}\t{%1, %0|%0, %1}" [(set_attr "type" "sseicvt") (set_attr "mode" "DF") - (set_attr "athlon_decode" "double,vector")]) + (set_attr "athlon_decode" "double,vector") + (set_attr "amdfam10_decode" "double,double")]) (define_insn "fix_truncsfsi_sse" [(set (match_operand:SI 0 "register_operand" "=r,r") @@ -4140,7 +4163,8 @@ "cvttss2si\t{%1, %0|%0, %1}" [(set_attr "type" "sseicvt") (set_attr "mode" "DF") - (set_attr "athlon_decode" "double,vector")]) + (set_attr "athlon_decode" "double,vector") + (set_attr "amdfam10_decode" "double,double")]) (define_insn "fix_truncdfsi_sse" [(set (match_operand:SI 0 "register_operand" "=r,r") @@ -4149,7 +4173,8 @@ "cvttsd2si\t{%1, %0|%0, %1}" [(set_attr "type" "sseicvt") (set_attr "mode" "DF") - (set_attr "athlon_decode" "double,vector")]) + (set_attr "athlon_decode" "double,vector") + (set_attr "amdfam10_decode" "double,double")]) ;; Avoid vector decoded forms of the instruction. (define_peephole2 @@ -4410,7 +4435,8 @@ [(set_attr "length" "2") (set_attr "mode" "HI") (set_attr "unit" "i387") - (set_attr "athlon_decode" "vector")]) + (set_attr "athlon_decode" "vector") + (set_attr "amdfam10_decode" "vector")]) ;; Conversion between fixed point and floating point. @@ -4461,6 +4487,7 @@ (set_attr "mode" "SF") (set_attr "unit" "*,i387,*,*") (set_attr "athlon_decode" "*,*,vector,double") + (set_attr "amdfam10_decode" "*,*,vector,double") (set_attr "fp_int_src" "true")]) (define_insn "*floatsisf2_sse" @@ -4471,6 +4498,7 @@ [(set_attr "type" "sseicvt") (set_attr "mode" "SF") (set_attr "athlon_decode" "vector,double") + (set_attr "amdfam10_decode" "vector,double") (set_attr "fp_int_src" "true")]) (define_insn "*floatsisf2_i387" @@ -4504,6 +4532,7 @@ (set_attr "mode" "SF") (set_attr "unit" "*,i387,*,*") (set_attr "athlon_decode" "*,*,vector,double") + (set_attr "amdfam10_decode" "*,*,vector,double") (set_attr "fp_int_src" "true")]) (define_insn "*floatdisf2_sse" @@ -4514,6 +4543,7 @@ [(set_attr "type" "sseicvt") (set_attr "mode" "SF") (set_attr "athlon_decode" "vector,double") + (set_attr "amdfam10_decode" "vector,double") (set_attr "fp_int_src" "true")]) (define_insn "*floatdisf2_i387" @@ -4572,6 +4602,7 @@ (set_attr "mode" "DF") (set_attr "unit" "*,i387,*,*") (set_attr "athlon_decode" "*,*,double,direct") + (set_attr "amdfam10_decode" "*,*,vector,double") (set_attr "fp_int_src" "true")]) (define_insn "*floatsidf2_sse" @@ -4582,6 +4613,7 @@ [(set_attr "type" "sseicvt") (set_attr "mode" "DF") (set_attr "athlon_decode" "double,direct") + (set_attr "amdfam10_decode" "vector,double") (set_attr "fp_int_src" "true")]) (define_insn "*floatsidf2_i387" @@ -4615,6 +4647,7 @@ (set_attr "mode" "DF") (set_attr "unit" "*,i387,*,*") (set_attr "athlon_decode" "*,*,double,direct") + (set_attr "amdfam10_decode" "*,*,vector,double") (set_attr "fp_int_src" "true")]) (define_insn "*floatdidf2_sse" @@ -4625,6 +4658,7 @@ [(set_attr "type" "sseicvt") (set_attr "mode" "DF") (set_attr "athlon_decode" "double,direct") + (set_attr "amdfam10_decode" "vector,double") (set_attr "fp_int_src" "true")]) (define_insn "*floatdidf2_i387" @@ -6832,6 +6866,14 @@ "TARGET_64BIT" "") +;; On AMDFAM10 +;; IMUL reg64, reg64, imm8 Direct +;; IMUL reg64, mem64, imm8 VectorPath +;; IMUL reg64, reg64, imm32 Direct +;; IMUL reg64, mem64, imm32 VectorPath +;; IMUL reg64, reg64 Direct +;; IMUL reg64, mem64 Direct + (define_insn "*muldi3_1_rex64" [(set (match_operand:DI 0 "register_operand" "=r,r,r") (mult:DI (match_operand:DI 1 "nonimmediate_operand" "%rm,rm,0") @@ -6854,6 +6896,11 @@ (match_operand 1 "memory_operand" "")) (const_string "vector")] (const_string "direct"))) + (set (attr "amdfam10_decode") + (cond [(and (eq_attr "alternative" "0,1") + (match_operand 1 "memory_operand" "")) + (const_string "vector")] + (const_string "direct"))) (set_attr "mode" "DI")]) (define_expand "mulsi3" @@ -6864,6 +6911,14 @@ "" "") +;; On AMDFAM10 +;; IMUL reg32, reg32, imm8 Direct +;; IMUL reg32, mem32, imm8 VectorPath +;; IMUL reg32, reg32, imm32 Direct +;; IMUL reg32, mem32, imm32 VectorPath +;; IMUL reg32, reg32 Direct +;; IMUL reg32, mem32 Direct + (define_insn "*mulsi3_1" [(set (match_operand:SI 0 "register_operand" "=r,r,r") (mult:SI (match_operand:SI 1 "nonimmediate_operand" "%rm,rm,0") @@ -6885,6 +6940,11 @@ (match_operand 1 "memory_operand" "")) (const_string "vector")] (const_string "direct"))) + (set (attr "amdfam10_decode") + (cond [(and (eq_attr "alternative" "0,1") + (match_operand 1 "memory_operand" "")) + (const_string "vector")] + (const_string "direct"))) (set_attr "mode" "SI")]) (define_insn "*mulsi3_1_zext" @@ -6910,6 +6970,11 @@ (match_operand 1 "memory_operand" "")) (const_string "vector")] (const_string "direct"))) + (set (attr "amdfam10_decode") + (cond [(and (eq_attr "alternative" "0,1") + (match_operand 1 "memory_operand" "")) + (const_string "vector")] + (const_string "direct"))) (set_attr "mode" "SI")]) (define_expand "mulhi3" @@ -6920,6 +6985,13 @@ "TARGET_HIMODE_MATH" "") +;; On AMDFAM10 +;; IMUL reg16, reg16, imm8 VectorPath +;; IMUL reg16, mem16, imm8 VectorPath +;; IMUL reg16, reg16, imm16 VectorPath +;; IMUL reg16, mem16, imm16 VectorPath +;; IMUL reg16, reg16 Direct +;; IMUL reg16, mem16 Direct (define_insn "*mulhi3_1" [(set (match_operand:HI 0 "register_operand" "=r,r,r") (mult:HI (match_operand:HI 1 "nonimmediate_operand" "%rm,rm,0") @@ -6938,6 +7010,10 @@ (eq_attr "alternative" "1,2") (const_string "vector")] (const_string "direct"))) + (set (attr "amdfam10_decode") + (cond [(eq_attr "alternative" "0,1") + (const_string "vector")] + (const_string "direct"))) (set_attr "mode" "HI")]) (define_expand "mulqi3" @@ -6948,6 +7024,10 @@ "TARGET_QIMODE_MATH" "") +;;On AMDFAM10 +;; MUL reg8 Direct +;; MUL mem8 Direct + (define_insn "*mulqi3_1" [(set (match_operand:QI 0 "register_operand" "=a") (mult:QI (match_operand:QI 1 "nonimmediate_operand" "%0") @@ -6962,6 +7042,7 @@ (if_then_else (eq_attr "cpu" "athlon") (const_string "vector") (const_string "direct"))) + (set_attr "amdfam10_decode" "direct") (set_attr "mode" "QI")]) (define_expand "umulqihi3" @@ -6988,6 +7069,7 @@ (if_then_else (eq_attr "cpu" "athlon") (const_string "vector") (const_string "direct"))) + (set_attr "amdfam10_decode" "direct") (set_attr "mode" "QI")]) (define_expand "mulqihi3" @@ -7012,6 +7094,7 @@ (if_then_else (eq_attr "cpu" "athlon") (const_string "vector") (const_string "direct"))) + (set_attr "amdfam10_decode" "direct") (set_attr "mode" "QI")]) (define_expand "umulditi3" @@ -7038,6 +7121,7 @@ (if_then_else (eq_attr "cpu" "athlon") (const_string "vector") (const_string "double"))) + (set_attr "amdfam10_decode" "double") (set_attr "mode" "DI")]) ;; We can't use this pattern in 64bit mode, since it results in two separate 32bit registers @@ -7065,6 +7149,7 @@ (if_then_else (eq_attr "cpu" "athlon") (const_string "vector") (const_string "double"))) + (set_attr "amdfam10_decode" "double") (set_attr "mode" "SI")]) (define_expand "mulditi3" @@ -7091,6 +7176,7 @@ (if_then_else (eq_attr "cpu" "athlon") (const_string "vector") (const_string "double"))) + (set_attr "amdfam10_decode" "double") (set_attr "mode" "DI")]) (define_expand "mulsidi3" @@ -7117,6 +7203,7 @@ (if_then_else (eq_attr "cpu" "athlon") (const_string "vector") (const_string "double"))) + (set_attr "amdfam10_decode" "double") (set_attr "mode" "SI")]) (define_expand "umuldi3_highpart" @@ -7153,6 +7240,7 @@ (if_then_else (eq_attr "cpu" "athlon") (const_string "vector") (const_string "double"))) + (set_attr "amdfam10_decode" "double") (set_attr "mode" "DI")]) (define_expand "umulsi3_highpart" @@ -7188,6 +7276,7 @@ (if_then_else (eq_attr "cpu" "athlon") (const_string "vector") (const_string "double"))) + (set_attr "amdfam10_decode" "double") (set_attr "mode" "SI")]) (define_insn "*umulsi3_highpart_zext" @@ -7210,6 +7299,7 @@ (if_then_else (eq_attr "cpu" "athlon") (const_string "vector") (const_string "double"))) + (set_attr "amdfam10_decode" "double") (set_attr "mode" "SI")]) (define_expand "smuldi3_highpart" @@ -7245,6 +7335,7 @@ (if_then_else (eq_attr "cpu" "athlon") (const_string "vector") (const_string "double"))) + (set_attr "amdfam10_decode" "double") (set_attr "mode" "DI")]) (define_expand "smulsi3_highpart" @@ -7279,6 +7370,7 @@ (if_then_else (eq_attr "cpu" "athlon") (const_string "vector") (const_string "double"))) + (set_attr "amdfam10_decode" "double") (set_attr "mode" "SI")]) (define_insn "*smulsi3_highpart_zext" @@ -7300,6 +7392,7 @@ (if_then_else (eq_attr "cpu" "athlon") (const_string "vector") (const_string "double"))) + (set_attr "amdfam10_decode" "double") (set_attr "mode" "SI")]) ;; The patterns that match these are at the end of this file. @@ -10281,7 +10374,8 @@ [(set_attr "type" "ishift") (set_attr "prefix_0f" "1") (set_attr "mode" "DI") - (set_attr "athlon_decode" "vector")]) + (set_attr "athlon_decode" "vector") + (set_attr "amdfam10_decode" "vector")]) (define_expand "x86_64_shift_adj" [(set (reg:CCZ FLAGS_REG) @@ -10496,7 +10590,8 @@ (set_attr "prefix_0f" "1") (set_attr "mode" "SI") (set_attr "pent_pair" "np") - (set_attr "athlon_decode" "vector")]) + (set_attr "athlon_decode" "vector") + (set_attr "amdfam10_decode" "vector")]) (define_expand "x86_shift_adj_1" [(set (reg:CCZ FLAGS_REG) @@ -11256,7 +11351,8 @@ [(set_attr "type" "ishift") (set_attr "prefix_0f" "1") (set_attr "mode" "DI") - (set_attr "athlon_decode" "vector")]) + (set_attr "athlon_decode" "vector") + (set_attr "amdfam10_decode" "vector")]) (define_expand "ashrdi3" [(set (match_operand:DI 0 "shiftdi_operand" "") @@ -14520,7 +14616,23 @@ [(set (match_dup 0) (xor:SI (match_dup 0) (const_int 31))) (clobber (reg:CC FLAGS_REG))])] "" - "") +{ + if (TARGET_ABM) + { + emit_insn (gen_clzsi2_abm (operands[0], operands[1])); + DONE; + } +}) + +(define_insn "clzsi2_abm" + [(set (match_operand:SI 0 "register_operand" "=r") + (clz:SI (match_operand:SI 1 "nonimmediate_operand" ""))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_ABM" + "lzcnt{l}\t{%1, %0|%0, %1}" + [(set_attr "prefix_rep" "1") + (set_attr "type" "bitmanip") + (set_attr "mode" "SI")]) (define_insn "*bsr" [(set (match_operand:SI 0 "register_operand" "=r") @@ -14529,7 +14641,44 @@ (clobber (reg:CC FLAGS_REG))] "" "bsr{l}\t{%1, %0|%0, %1}" - [(set_attr "prefix_0f" "1")]) + [(set_attr "prefix_0f" "1") + (set_attr "mode" "SI")]) + +(define_insn "popcountsi2" + [(set (match_operand:SI 0 "register_operand" "=r") + (popcount:SI (match_operand:SI 1 "nonimmediate_operand" ""))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_POPCNT" + "popcnt{l}\t{%1, %0|%0, %1}" + [(set_attr "prefix_rep" "1") + (set_attr "type" "bitmanip") + (set_attr "mode" "SI")]) + +(define_insn "*popcountsi2_cmp" + [(set (reg FLAGS_REG) + (compare + (popcount:SI (match_operand:SI 1 "nonimmediate_operand" "rm")) + (const_int 0))) + (set (match_operand:SI 0 "register_operand" "=r") + (popcount:SI (match_dup 1)))] + "TARGET_POPCNT && ix86_match_ccmode (insn, CCZmode)" + "popcnt{l}\t{%1, %0|%0, %1}" + [(set_attr "prefix_rep" "1") + (set_attr "type" "bitmanip") + (set_attr "mode" "SI")]) + +(define_insn "*popcountsi2_cmp_zext" + [(set (reg FLAGS_REG) + (compare + (popcount:SI (match_operand:SI 1 "nonimmediate_operand" "rm")) + (const_int 0))) + (set (match_operand:DI 0 "register_operand" "=r") + (zero_extend:DI(popcount:SI (match_dup 1))))] + "TARGET_64BIT && TARGET_POPCNT && ix86_match_ccmode (insn, CCZmode)" + "popcnt{l}\t{%1, %0|%0, %1}" + [(set_attr "prefix_rep" "1") + (set_attr "type" "bitmanip") + (set_attr "mode" "SI")]) (define_expand "clzdi2" [(parallel @@ -14541,7 +14690,23 @@ [(set (match_dup 0) (xor:DI (match_dup 0) (const_int 63))) (clobber (reg:CC FLAGS_REG))])] "TARGET_64BIT" - "") +{ + if (TARGET_ABM) + { + emit_insn (gen_clzdi2_abm (operands[0], operands[1])); + DONE; + } +}) + +(define_insn "clzdi2_abm" + [(set (match_operand:DI 0 "register_operand" "=r") + (clz:DI (match_operand:DI 1 "nonimmediate_operand" ""))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_64BIT && TARGET_ABM" + "lzcnt{q}\t{%1, %0|%0, %1}" + [(set_attr "prefix_rep" "1") + (set_attr "type" "bitmanip") + (set_attr "mode" "DI")]) (define_insn "*bsr_rex64" [(set (match_operand:DI 0 "register_operand" "=r") @@ -14550,7 +14715,92 @@ (clobber (reg:CC FLAGS_REG))] "TARGET_64BIT" "bsr{q}\t{%1, %0|%0, %1}" - [(set_attr "prefix_0f" "1")]) + [(set_attr "prefix_0f" "1") + (set_attr "mode" "DI")]) + +(define_insn "popcountdi2" + [(set (match_operand:DI 0 "register_operand" "=r") + (popcount:DI (match_operand:DI 1 "nonimmediate_operand" ""))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_64BIT && TARGET_POPCNT" + "popcnt{q}\t{%1, %0|%0, %1}" + [(set_attr "prefix_rep" "1") + (set_attr "type" "bitmanip") + (set_attr "mode" "DI")]) + +(define_insn "*popcountdi2_cmp" + [(set (reg FLAGS_REG) + (compare + (popcount:DI (match_operand:DI 1 "nonimmediate_operand" "rm")) + (const_int 0))) + (set (match_operand:DI 0 "register_operand" "=r") + (popcount:DI (match_dup 1)))] + "TARGET_64BIT && TARGET_POPCNT && ix86_match_ccmode (insn, CCZmode)" + "popcnt{q}\t{%1, %0|%0, %1}" + [(set_attr "prefix_rep" "1") + (set_attr "type" "bitmanip") + (set_attr "mode" "DI")]) + +(define_expand "clzhi2" + [(parallel + [(set (match_operand:HI 0 "register_operand" "") + (minus:HI (const_int 15) + (clz:HI (match_operand:HI 1 "nonimmediate_operand" "")))) + (clobber (reg:CC FLAGS_REG))]) + (parallel + [(set (match_dup 0) (xor:HI (match_dup 0) (const_int 15))) + (clobber (reg:CC FLAGS_REG))])] + "" +{ + if (TARGET_ABM) + { + emit_insn (gen_clzhi2_abm (operands[0], operands[1])); + DONE; + } +}) + +(define_insn "clzhi2_abm" + [(set (match_operand:HI 0 "register_operand" "=r") + (clz:HI (match_operand:HI 1 "nonimmediate_operand" ""))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_ABM" + "lzcnt{w}\t{%1, %0|%0, %1}" + [(set_attr "prefix_rep" "1") + (set_attr "type" "bitmanip") + (set_attr "mode" "HI")]) + +(define_insn "*bsrhi" + [(set (match_operand:HI 0 "register_operand" "=r") + (minus:HI (const_int 15) + (clz:HI (match_operand:HI 1 "nonimmediate_operand" "rm")))) + (clobber (reg:CC FLAGS_REG))] + "" + "bsr{w}\t{%1, %0|%0, %1}" + [(set_attr "prefix_0f" "1") + (set_attr "mode" "HI")]) + +(define_insn "popcounthi2" + [(set (match_operand:HI 0 "register_operand" "=r") + (popcount:HI (match_operand:HI 1 "nonimmediate_operand" ""))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_POPCNT" + "popcnt{w}\t{%1, %0|%0, %1}" + [(set_attr "prefix_rep" "1") + (set_attr "type" "bitmanip") + (set_attr "mode" "HI")]) + +(define_insn "*popcounthi2_cmp" + [(set (reg FLAGS_REG) + (compare + (popcount:HI (match_operand:HI 1 "nonimmediate_operand" "rm")) + (const_int 0))) + (set (match_operand:HI 0 "register_operand" "=r") + (popcount:HI (match_dup 1)))] + "TARGET_POPCNT && ix86_match_ccmode (insn, CCZmode)" + "popcnt{w}\t{%1, %0|%0, %1}" + [(set_attr "prefix_rep" "1") + (set_attr "type" "bitmanip") + (set_attr "mode" "HI")]) ;; Thread-local storage patterns for ELF. ;; @@ -15302,7 +15552,8 @@ sqrtss\t{%1, %0|%0, %1}" [(set_attr "type" "fpspc,sse") (set_attr "mode" "SF,SF") - (set_attr "athlon_decode" "direct,*")]) + (set_attr "athlon_decode" "direct,*") + (set_attr "amdfam10_decode" "direct,*")]) (define_insn "*sqrtsf2_sse" [(set (match_operand:SF 0 "register_operand" "=x") @@ -15311,7 +15562,8 @@ "sqrtss\t{%1, %0|%0, %1}" [(set_attr "type" "sse") (set_attr "mode" "SF") - (set_attr "athlon_decode" "*")]) + (set_attr "athlon_decode" "*") + (set_attr "amdfam10_decode" "*")]) (define_insn "*sqrtsf2_i387" [(set (match_operand:SF 0 "register_operand" "=f") @@ -15320,7 +15572,8 @@ "fsqrt" [(set_attr "type" "fpspc") (set_attr "mode" "SF") - (set_attr "athlon_decode" "direct")]) + (set_attr "athlon_decode" "direct") + (set_attr "amdfam10_decode" "direct")]) (define_expand "sqrtdf2" [(set (match_operand:DF 0 "register_operand" "") @@ -15399,7 +15652,8 @@ "fsqrt" [(set_attr "type" "fpspc") (set_attr "mode" "XF") - (set_attr "athlon_decode" "direct")]) + (set_attr "athlon_decode" "direct") + (set_attr "amdfam10_decode" "direct")]) (define_insn "fpremxf4" [(set (match_operand:XF 0 "register_operand" "=f") @@ -20186,7 +20440,7 @@ (mult:DI (match_operand:DI 1 "memory_operand" "") (match_operand:DI 2 "immediate_operand" ""))) (clobber (reg:CC FLAGS_REG))])] - "(TARGET_K8 || TARGET_GENERIC64) && !optimize_size + "(TARGET_K8 || TARGET_GENERIC64 || TARGET_AMDFAM10) && !optimize_size && (GET_CODE (operands[2]) != CONST_INT || !CONST_OK_FOR_LETTER_P (INTVAL (operands[2]), 'K'))" [(set (match_dup 3) (match_dup 1)) @@ -20200,7 +20454,7 @@ (mult:SI (match_operand:SI 1 "memory_operand" "") (match_operand:SI 2 "immediate_operand" ""))) (clobber (reg:CC FLAGS_REG))])] - "(TARGET_K8 || TARGET_GENERIC64) && !optimize_size + "(TARGET_K8 || TARGET_GENERIC64 || TARGET_AMDFAM10) && !optimize_size && (GET_CODE (operands[2]) != CONST_INT || !CONST_OK_FOR_LETTER_P (INTVAL (operands[2]), 'K'))" [(set (match_dup 3) (match_dup 1)) @@ -20215,7 +20469,7 @@ (mult:SI (match_operand:SI 1 "memory_operand" "") (match_operand:SI 2 "immediate_operand" "")))) (clobber (reg:CC FLAGS_REG))])] - "(TARGET_K8 || TARGET_GENERIC64) && !optimize_size + "(TARGET_K8 || TARGET_GENERIC64 || TARGET_AMDFAM10) && !optimize_size && (GET_CODE (operands[2]) != CONST_INT || !CONST_OK_FOR_LETTER_P (INTVAL (operands[2]), 'K'))" [(set (match_dup 3) (match_dup 1)) @@ -20233,7 +20487,7 @@ (match_operand:DI 2 "const_int_operand" ""))) (clobber (reg:CC FLAGS_REG))]) (match_scratch:DI 3 "r")] - "(TARGET_K8 || TARGET_GENERIC64) && !optimize_size + "(TARGET_K8 || TARGET_GENERIC64 || TARGET_AMDFAM10) && !optimize_size && CONST_OK_FOR_LETTER_P (INTVAL (operands[2]), 'K')" [(set (match_dup 3) (match_dup 2)) (parallel [(set (match_dup 0) (mult:DI (match_dup 0) (match_dup 3))) @@ -20249,7 +20503,7 @@ (match_operand:SI 2 "const_int_operand" ""))) (clobber (reg:CC FLAGS_REG))]) (match_scratch:SI 3 "r")] - "(TARGET_K8 || TARGET_GENERIC64) && !optimize_size + "(TARGET_K8 || TARGET_GENERIC64 || TARGET_AMDFAM10) && !optimize_size && CONST_OK_FOR_LETTER_P (INTVAL (operands[2]), 'K')" [(set (match_dup 3) (match_dup 2)) (parallel [(set (match_dup 0) (mult:SI (match_dup 0) (match_dup 3))) @@ -20265,7 +20519,7 @@ (match_operand:HI 2 "immediate_operand" ""))) (clobber (reg:CC FLAGS_REG))]) (match_scratch:HI 3 "r")] - "(TARGET_K8 || TARGET_GENERIC64) && !optimize_size" + "(TARGET_K8 || TARGET_GENERIC64 || TARGET_AMDFAM10) && !optimize_size" [(set (match_dup 3) (match_dup 2)) (parallel [(set (match_dup 0) (mult:HI (match_dup 0) (match_dup 3))) (clobber (reg:CC FLAGS_REG))])] --- gcc/config/i386/athlon.md.jj 2006-10-29 20:56:45.000000000 +0100 +++ gcc/config/i386/athlon.md 2007-02-09 21:26:06.000000000 +0100 @@ -29,6 +29,8 @@ (const_string "vector")] (const_string "direct"))) +(define_attr "amdfam10_decode" "direct,vector,double" + (const_string "direct")) ;; ;; decode0 decode1 decode2 ;; \ | / @@ -131,18 +133,22 @@ ;; Jump instructions are executed in the branch unit completely transparent to us (define_insn_reservation "athlon_branch" 0 - (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") (eq_attr "type" "ibr")) "athlon-direct,athlon-ieu") (define_insn_reservation "athlon_call" 0 (and (eq_attr "cpu" "athlon,k8,generic64") (eq_attr "type" "call,callv")) "athlon-vector,athlon-ieu") +(define_insn_reservation "athlon_call_amdfam10" 0 + (and (eq_attr "cpu" "amdfam10") + (eq_attr "type" "call,callv")) + "athlon-double,athlon-ieu") ;; Latency of push operation is 3 cycles, but ESP value is available ;; earlier (define_insn_reservation "athlon_push" 2 - (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") (eq_attr "type" "push")) "athlon-direct,athlon-agu,athlon-store") (define_insn_reservation "athlon_pop" 4 @@ -153,12 +159,16 @@ (and (eq_attr "cpu" "k8,generic64") (eq_attr "type" "pop")) "athlon-double,(athlon-ieu+athlon-load)") +(define_insn_reservation "athlon_pop_amdfam10" 3 + (and (eq_attr "cpu" "amdfam10") + (eq_attr "type" "pop")) + "athlon-direct,(athlon-ieu+athlon-load)") (define_insn_reservation "athlon_leave" 3 (and (eq_attr "cpu" "athlon") (eq_attr "type" "leave")) "athlon-vector,(athlon-ieu+athlon-load)") (define_insn_reservation "athlon_leave_k8" 3 - (and (eq_attr "cpu" "k8,generic64") + (and (eq_attr "cpu" "k8,generic64,amdfam10") (eq_attr "type" "leave")) "athlon-double,(athlon-ieu+athlon-load)") @@ -167,6 +177,11 @@ (and (eq_attr "cpu" "athlon,k8,generic64") (eq_attr "type" "lea")) "athlon-direct,athlon-agu,nothing") +;; Lea executes in AGU unit with 1 cycle latency on AMDFAM10 +(define_insn_reservation "athlon_lea_amdfam10" 1 + (and (eq_attr "cpu" "amdfam10") + (eq_attr "type" "lea")) + "athlon-direct,athlon-agu,nothing") ;; Mul executes in special multiplier unit attached to IEU0 (define_insn_reservation "athlon_imul" 5 @@ -176,29 +191,35 @@ "athlon-vector,athlon-ieu0,athlon-mult,nothing,nothing,athlon-ieu0") ;; ??? Widening multiply is vector or double. (define_insn_reservation "athlon_imul_k8_DI" 4 - (and (eq_attr "cpu" "k8,generic64") + (and (eq_attr "cpu" "k8,generic64,amdfam10") (and (eq_attr "type" "imul") (and (eq_attr "mode" "DI") (eq_attr "memory" "none,unknown")))) "athlon-direct0,athlon-ieu0,athlon-mult,nothing,athlon-ieu0") (define_insn_reservation "athlon_imul_k8" 3 - (and (eq_attr "cpu" "k8,generic64") + (and (eq_attr "cpu" "k8,generic64,amdfam10") (and (eq_attr "type" "imul") (eq_attr "memory" "none,unknown"))) "athlon-direct0,athlon-ieu0,athlon-mult,athlon-ieu0") +(define_insn_reservation "athlon_imul_amdfam10_HI" 4 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "imul") + (and (eq_attr "mode" "HI") + (eq_attr "memory" "none,unknown")))) + "athlon-vector,athlon-ieu0,athlon-mult,nothing,athlon-ieu0") (define_insn_reservation "athlon_imul_mem" 8 (and (eq_attr "cpu" "athlon") (and (eq_attr "type" "imul") (eq_attr "memory" "load,both"))) "athlon-vector,athlon-load,athlon-ieu,athlon-mult,nothing,nothing,athlon-ieu") (define_insn_reservation "athlon_imul_mem_k8_DI" 7 - (and (eq_attr "cpu" "k8,generic64") + (and (eq_attr "cpu" "k8,generic64,amdfam10") (and (eq_attr "type" "imul") (and (eq_attr "mode" "DI") (eq_attr "memory" "load,both")))) "athlon-vector,athlon-load,athlon-ieu,athlon-mult,nothing,athlon-ieu") (define_insn_reservation "athlon_imul_mem_k8" 6 - (and (eq_attr "cpu" "k8,generic64") + (and (eq_attr "cpu" "k8,generic64,amdfam10") (and (eq_attr "type" "imul") (eq_attr "memory" "load,both"))) "athlon-vector,athlon-load,athlon-ieu,athlon-mult,athlon-ieu") @@ -209,21 +230,23 @@ ;; other instructions. ;; ??? Experiments show that the idiv can overlap with roughly 6 cycles ;; of the other code +;; Using the same heuristics for amdfam10 as K8 with idiv (define_insn_reservation "athlon_idiv" 6 - (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") (and (eq_attr "type" "idiv") (eq_attr "memory" "none,unknown"))) "athlon-vector,(athlon-ieu0*6+(athlon-fpsched,athlon-fvector))") (define_insn_reservation "athlon_idiv_mem" 9 - (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") (and (eq_attr "type" "idiv") (eq_attr "memory" "load,both"))) "athlon-vector,((athlon-load,athlon-ieu0*6)+(athlon-fpsched,athlon-fvector))") ;; The parallelism of string instructions is not documented. Model it same way ;; as idiv to create smaller automata. This probably does not matter much. +;; Using the same heuristics for amdfam10 as K8 with idiv (define_insn_reservation "athlon_str" 6 - (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") (and (eq_attr "type" "str") (eq_attr "memory" "load,both,store"))) "athlon-vector,athlon-load,athlon-ieu0*6") @@ -234,34 +257,62 @@ (and (eq_attr "unit" "integer,unknown") (eq_attr "memory" "none,unknown")))) "athlon-direct,athlon-ieu") +(define_insn_reservation "athlon_idirect_amdfam10" 1 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "amdfam10_decode" "direct") + (and (eq_attr "unit" "integer,unknown") + (eq_attr "memory" "none,unknown")))) + "athlon-direct,athlon-ieu") (define_insn_reservation "athlon_ivector" 2 (and (eq_attr "cpu" "athlon,k8,generic64") (and (eq_attr "athlon_decode" "vector") (and (eq_attr "unit" "integer,unknown") (eq_attr "memory" "none,unknown")))) "athlon-vector,athlon-ieu,athlon-ieu") +(define_insn_reservation "athlon_ivector_amdfam10" 2 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "amdfam10_decode" "vector") + (and (eq_attr "unit" "integer,unknown") + (eq_attr "memory" "none,unknown")))) + "athlon-vector,athlon-ieu,athlon-ieu") + (define_insn_reservation "athlon_idirect_loadmov" 3 - (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") (and (eq_attr "type" "imov") (eq_attr "memory" "load"))) "athlon-direct,athlon-load") + (define_insn_reservation "athlon_idirect_load" 4 (and (eq_attr "cpu" "athlon,k8,generic64") (and (eq_attr "athlon_decode" "direct") (and (eq_attr "unit" "integer,unknown") (eq_attr "memory" "load")))) "athlon-direct,athlon-load,athlon-ieu") +(define_insn_reservation "athlon_idirect_load_amdfam10" 4 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "amdfam10_decode" "direct") + (and (eq_attr "unit" "integer,unknown") + (eq_attr "memory" "load")))) + "athlon-direct,athlon-load,athlon-ieu") (define_insn_reservation "athlon_ivector_load" 6 (and (eq_attr "cpu" "athlon,k8,generic64") (and (eq_attr "athlon_decode" "vector") (and (eq_attr "unit" "integer,unknown") (eq_attr "memory" "load")))) "athlon-vector,athlon-load,athlon-ieu,athlon-ieu") +(define_insn_reservation "athlon_ivector_load_amdfam10" 6 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "amdfam10_decode" "vector") + (and (eq_attr "unit" "integer,unknown") + (eq_attr "memory" "load")))) + "athlon-vector,athlon-load,athlon-ieu,athlon-ieu") + (define_insn_reservation "athlon_idirect_movstore" 1 - (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") (and (eq_attr "type" "imov") (eq_attr "memory" "store"))) "athlon-direct,athlon-agu,athlon-store") + (define_insn_reservation "athlon_idirect_both" 4 (and (eq_attr "cpu" "athlon,k8,generic64") (and (eq_attr "athlon_decode" "direct") @@ -270,6 +321,15 @@ "athlon-direct,athlon-load, athlon-ieu,athlon-store, athlon-store") +(define_insn_reservation "athlon_idirect_both_amdfam10" 4 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "amdfam10_decode" "direct") + (and (eq_attr "unit" "integer,unknown") + (eq_attr "memory" "both")))) + "athlon-direct,athlon-load, + athlon-ieu,athlon-store, + athlon-store") + (define_insn_reservation "athlon_ivector_both" 6 (and (eq_attr "cpu" "athlon,k8,generic64") (and (eq_attr "athlon_decode" "vector") @@ -279,6 +339,16 @@ athlon-ieu, athlon-ieu, athlon-store") +(define_insn_reservation "athlon_ivector_both_amdfam10" 6 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "amdfam10_decode" "vector") + (and (eq_attr "unit" "integer,unknown") + (eq_attr "memory" "both")))) + "athlon-vector,athlon-load, + athlon-ieu, + athlon-ieu, + athlon-store") + (define_insn_reservation "athlon_idirect_store" 1 (and (eq_attr "cpu" "athlon,k8,generic64") (and (eq_attr "athlon_decode" "direct") @@ -286,6 +356,14 @@ (eq_attr "memory" "store")))) "athlon-direct,(athlon-ieu+athlon-agu), athlon-store") +(define_insn_reservation "athlon_idirect_store_amdfam10" 1 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "amdfam10_decode" "direct") + (and (eq_attr "unit" "integer,unknown") + (eq_attr "memory" "store")))) + "athlon-direct,(athlon-ieu+athlon-agu), + athlon-store") + (define_insn_reservation "athlon_ivector_store" 2 (and (eq_attr "cpu" "athlon,k8,generic64") (and (eq_attr "athlon_decode" "vector") @@ -293,6 +371,13 @@ (eq_attr "memory" "store")))) "athlon-vector,(athlon-ieu+athlon-agu),athlon-ieu, athlon-store") +(define_insn_reservation "athlon_ivector_store_amdfam10" 2 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "amdfam10_decode" "vector") + (and (eq_attr "unit" "integer,unknown") + (eq_attr "memory" "store")))) + "athlon-vector,(athlon-ieu+athlon-agu),athlon-ieu, + athlon-store") ;; Athlon floatin point unit (define_insn_reservation "athlon_fldxf" 12 @@ -302,7 +387,7 @@ (eq_attr "mode" "XF")))) "athlon-vector,athlon-fpload2,athlon-fvector*9") (define_insn_reservation "athlon_fldxf_k8" 13 - (and (eq_attr "cpu" "k8,generic64") + (and (eq_attr "cpu" "k8,generic64,amdfam10") (and (eq_attr "type" "fmov") (and (eq_attr "memory" "load") (eq_attr "mode" "XF")))) @@ -314,7 +399,7 @@ (eq_attr "memory" "load"))) "athlon-direct,athlon-fpload,athlon-fany") (define_insn_reservation "athlon_fld_k8" 2 - (and (eq_attr "cpu" "k8,generic64") + (and (eq_attr "cpu" "k8,generic64,amdfam10") (and (eq_attr "type" "fmov") (eq_attr "memory" "load"))) "athlon-direct,athlon-fploadk8,athlon-fstore") @@ -326,7 +411,7 @@ (eq_attr "mode" "XF")))) "athlon-vector,(athlon-fpsched+athlon-agu),(athlon-store2+(athlon-fvector*7))") (define_insn_reservation "athlon_fstxf_k8" 8 - (and (eq_attr "cpu" "k8,generic64") + (and (eq_attr "cpu" "k8,generic64,amdfam10") (and (eq_attr "type" "fmov") (and (eq_attr "memory" "store,both") (eq_attr "mode" "XF")))) @@ -337,16 +422,16 @@ (eq_attr "memory" "store,both"))) "athlon-direct,(athlon-fpsched+athlon-agu),(athlon-fstore+athlon-store)") (define_insn_reservation "athlon_fst_k8" 2 - (and (eq_attr "cpu" "k8,generic64") + (and (eq_attr "cpu" "k8,generic64,amdfam10") (and (eq_attr "type" "fmov") (eq_attr "memory" "store,both"))) "athlon-direct,(athlon-fpsched+athlon-agu),(athlon-fstore+athlon-store)") (define_insn_reservation "athlon_fist" 4 - (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") (eq_attr "type" "fistp,fisttp")) "athlon-direct,(athlon-fpsched+athlon-agu),(athlon-fstore+athlon-store)") (define_insn_reservation "athlon_fmov" 2 - (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") (eq_attr "type" "fmov")) "athlon-direct,athlon-fpsched,athlon-faddmul") (define_insn_reservation "athlon_fadd_load" 4 @@ -355,12 +440,12 @@ (eq_attr "memory" "load"))) "athlon-direct,athlon-fpload,athlon-fadd") (define_insn_reservation "athlon_fadd_load_k8" 6 - (and (eq_attr "cpu" "k8,generic64") + (and (eq_attr "cpu" "k8,generic64,amdfam10") (and (eq_attr "type" "fop") (eq_attr "memory" "load"))) "athlon-direct,athlon-fploadk8,athlon-fadd") (define_insn_reservation "athlon_fadd" 4 - (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") (eq_attr "type" "fop")) "athlon-direct,athlon-fpsched,athlon-fadd") (define_insn_reservation "athlon_fmul_load" 4 @@ -369,16 +454,16 @@ (eq_attr "memory" "load"))) "athlon-direct,athlon-fpload,athlon-fmul") (define_insn_reservation "athlon_fmul_load_k8" 6 - (and (eq_attr "cpu" "k8,generic64") + (and (eq_attr "cpu" "k8,generic64,amdfam10") (and (eq_attr "type" "fmul") (eq_attr "memory" "load"))) "athlon-direct,athlon-fploadk8,athlon-fmul") (define_insn_reservation "athlon_fmul" 4 - (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") (eq_attr "type" "fmul")) "athlon-direct,athlon-fpsched,athlon-fmul") (define_insn_reservation "athlon_fsgn" 2 - (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") (eq_attr "type" "fsgn")) "athlon-direct,athlon-fpsched,athlon-fmul") (define_insn_reservation "athlon_fdiv_load" 24 @@ -387,7 +472,7 @@ (eq_attr "memory" "load"))) "athlon-direct,athlon-fpload,athlon-fmul") (define_insn_reservation "athlon_fdiv_load_k8" 13 - (and (eq_attr "cpu" "k8,generic64") + (and (eq_attr "cpu" "k8,generic64,amdfam10") (and (eq_attr "type" "fdiv") (eq_attr "memory" "load"))) "athlon-direct,athlon-fploadk8,athlon-fmul") @@ -396,16 +481,16 @@ (eq_attr "type" "fdiv")) "athlon-direct,athlon-fpsched,athlon-fmul") (define_insn_reservation "athlon_fdiv_k8" 11 - (and (eq_attr "cpu" "k8,generic64") + (and (eq_attr "cpu" "k8,generic64,amdfam10") (eq_attr "type" "fdiv")) "athlon-direct,athlon-fpsched,athlon-fmul") (define_insn_reservation "athlon_fpspc_load" 103 - (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") (and (eq_attr "type" "fpspc") (eq_attr "memory" "load"))) "athlon-vector,athlon-fpload,athlon-fvector") (define_insn_reservation "athlon_fpspc" 100 - (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") (eq_attr "type" "fpspc")) "athlon-vector,athlon-fpsched,athlon-fvector") (define_insn_reservation "athlon_fcmov_load" 7 @@ -418,12 +503,12 @@ (eq_attr "type" "fcmov")) "athlon-vector,athlon-fpsched,athlon-fvector") (define_insn_reservation "athlon_fcmov_load_k8" 17 - (and (eq_attr "cpu" "k8,generic64") + (and (eq_attr "cpu" "k8,generic64,amdfam10") (and (eq_attr "type" "fcmov") (eq_attr "memory" "load"))) "athlon-vector,athlon-fploadk8,athlon-fvector") (define_insn_reservation "athlon_fcmov_k8" 15 - (and (eq_attr "cpu" "k8,generic64") + (and (eq_attr "cpu" "k8,generic64,amdfam10") (eq_attr "type" "fcmov")) "athlon-vector,athlon-fpsched,athlon-fvector") ;; fcomi is vector decoded by uses only one pipe. @@ -434,13 +519,13 @@ (eq_attr "memory" "load")))) "athlon-vector,athlon-fpload,athlon-fadd") (define_insn_reservation "athlon_fcomi_load_k8" 5 - (and (eq_attr "cpu" "k8,generic64") + (and (eq_attr "cpu" "k8,generic64,amdfam10") (and (eq_attr "type" "fcmp") (and (eq_attr "athlon_decode" "vector") (eq_attr "memory" "load")))) "athlon-vector,athlon-fploadk8,athlon-fadd") (define_insn_reservation "athlon_fcomi" 3 - (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") (and (eq_attr "athlon_decode" "vector") (eq_attr "type" "fcmp"))) "athlon-vector,athlon-fpsched,athlon-fadd") @@ -450,18 +535,18 @@ (eq_attr "memory" "load"))) "athlon-direct,athlon-fpload,athlon-fadd") (define_insn_reservation "athlon_fcom_load_k8" 4 - (and (eq_attr "cpu" "k8,generic64") + (and (eq_attr "cpu" "k8,generic64,amdfam10") (and (eq_attr "type" "fcmp") (eq_attr "memory" "load"))) "athlon-direct,athlon-fploadk8,athlon-fadd") (define_insn_reservation "athlon_fcom" 2 - (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") (eq_attr "type" "fcmp")) "athlon-direct,athlon-fpsched,athlon-fadd") ;; Never seen by the scheduler because we still don't do post reg-stack ;; scheduling. ;(define_insn_reservation "athlon_fxch" 2 -; (and (eq_attr "cpu" "athlon,k8,generic64") +; (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") ; (eq_attr "type" "fxch")) ; "athlon-direct,athlon-fpsched,athlon-fany") @@ -516,6 +601,23 @@ (and (eq_attr "type" "mmxmov,ssemov") (eq_attr "memory" "load"))) "athlon-direct,athlon-fploadk8,athlon-fstore") +;; On AMDFAM10 all double, single and integer packed and scalar SSEx data +;; loads generated are direct path, latency of 2 and do not use any FP +;; executions units. No seperate entries for movlpx/movhpx loads, which +;; are direct path, latency of 4 and use the FADD/FMUL FP execution units, +;; as they will not be generated. +(define_insn_reservation "athlon_sseld_amdfam10" 2 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "ssemov") + (eq_attr "memory" "load"))) + "athlon-direct,athlon-fploadk8") +;; On AMDFAM10 MMX data loads generated are direct path, latency of 4 +;; and can use any FP executions units +(define_insn_reservation "athlon_mmxld_amdfam10" 4 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "mmxmov") + (eq_attr "memory" "load"))) + "athlon-direct,athlon-fploadk8, athlon-fany") (define_insn_reservation "athlon_mmxssest" 3 (and (eq_attr "cpu" "k8,generic64") (and (eq_attr "type" "mmxmov,ssemov") @@ -533,6 +635,25 @@ (and (eq_attr "type" "mmxmov,ssemov") (eq_attr "memory" "store,both"))) "athlon-direct,(athlon-fpsched+athlon-agu),(athlon-fstore+athlon-store)") +;; On AMDFAM10 all double, single and integer packed SSEx data stores +;; generated are all double path, latency of 2 and use the FSTORE FP +;; execution unit. No entries seperate for movupx/movdqu, which are +;; vector path, latency of 3 and use the FSTORE*2 FP execution unit, +;; as they will not be generated. +(define_insn_reservation "athlon_ssest_amdfam10" 2 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "ssemov") + (and (eq_attr "mode" "V4SF,V2DF,TI") + (eq_attr "memory" "store,both")))) + "athlon-double,(athlon-fpsched+athlon-agu),((athlon-fstore+athlon-store)*2)") +;; On AMDFAM10 all double, single and integer scalar SSEx and MMX +;; data stores generated are all direct path, latency of 2 and use +;; the FSTORE FP execution unit +(define_insn_reservation "athlon_mmxssest_short_amdfam10" 2 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "mmxmov,ssemov") + (eq_attr "memory" "store,both"))) + "athlon-direct,(athlon-fpsched+athlon-agu),(athlon-fstore+athlon-store)") (define_insn_reservation "athlon_movaps_k8" 2 (and (eq_attr "cpu" "k8,generic64") (and (eq_attr "type" "ssemov") @@ -578,6 +699,11 @@ (and (eq_attr "type" "sselog,sselog1") (eq_attr "memory" "load"))) "athlon-double,athlon-fpload2k8,(athlon-fmul*2)") +(define_insn_reservation "athlon_sselog_load_amdfam10" 4 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "sselog,sselog1") + (eq_attr "memory" "load"))) + "athlon-direct,athlon-fploadk8,(athlon-fadd|athlon-fmul)") (define_insn_reservation "athlon_sselog" 3 (and (eq_attr "cpu" "athlon") (eq_attr "type" "sselog,sselog1")) @@ -586,6 +712,11 @@ (and (eq_attr "cpu" "k8,generic64") (eq_attr "type" "sselog,sselog1")) "athlon-double,athlon-fpsched,athlon-fmul") +(define_insn_reservation "athlon_sselog_amdfam10" 2 + (and (eq_attr "cpu" "amdfam10") + (eq_attr "type" "sselog,sselog1")) + "athlon-direct,athlon-fpsched,(athlon-fadd|athlon-fmul)") + ;; ??? pcmp executes in addmul, probably not worthwhile to bother about that. (define_insn_reservation "athlon_ssecmp_load" 2 (and (eq_attr "cpu" "athlon") @@ -594,13 +725,13 @@ (eq_attr "memory" "load")))) "athlon-direct,athlon-fpload,athlon-fadd") (define_insn_reservation "athlon_ssecmp_load_k8" 4 - (and (eq_attr "cpu" "k8,generic64") + (and (eq_attr "cpu" "k8,generic64,amdfam10") (and (eq_attr "type" "ssecmp") (and (eq_attr "mode" "SF,DF,DI,TI") (eq_attr "memory" "load")))) "athlon-direct,athlon-fploadk8,athlon-fadd") (define_insn_reservation "athlon_ssecmp" 2 - (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") (and (eq_attr "type" "ssecmp") (eq_attr "mode" "SF,DF,DI,TI"))) "athlon-direct,athlon-fpsched,athlon-fadd") @@ -614,6 +745,11 @@ (and (eq_attr "type" "ssecmp") (eq_attr "memory" "load"))) "athlon-double,athlon-fpload2k8,(athlon-fadd*2)") +(define_insn_reservation "athlon_ssecmpvector_load_amdfam10" 4 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "ssecmp") + (eq_attr "memory" "load"))) + "athlon-direct,athlon-fploadk8,athlon-fadd") (define_insn_reservation "athlon_ssecmpvector" 3 (and (eq_attr "cpu" "athlon") (eq_attr "type" "ssecmp")) @@ -622,6 +758,10 @@ (and (eq_attr "cpu" "k8,generic64") (eq_attr "type" "ssecmp")) "athlon-double,athlon-fpsched,(athlon-fadd*2)") +(define_insn_reservation "athlon_ssecmpvector_amdfam10" 2 + (and (eq_attr "cpu" "amdfam10") + (eq_attr "type" "ssecmp")) + "athlon-direct,athlon-fpsched,athlon-fadd") (define_insn_reservation "athlon_ssecomi_load" 4 (and (eq_attr "cpu" "athlon") (and (eq_attr "type" "ssecomi") @@ -632,10 +772,20 @@ (and (eq_attr "type" "ssecomi") (eq_attr "memory" "load"))) "athlon-vector,athlon-fploadk8,athlon-fadd") +(define_insn_reservation "athlon_ssecomi_load_amdfam10" 5 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "ssecomi") + (eq_attr "memory" "load"))) + "athlon-direct,athlon-fploadk8,athlon-fadd") (define_insn_reservation "athlon_ssecomi" 4 (and (eq_attr "cpu" "athlon,k8,generic64") (eq_attr "type" "ssecmp")) "athlon-vector,athlon-fpsched,athlon-fadd") +(define_insn_reservation "athlon_ssecomi_amdfam10" 3 + (and (eq_attr "cpu" "amdfam10") +;; It seems athlon_ssecomi has a bug in the attr_type, fixed for amdfam10 + (eq_attr "type" "ssecomi")) + "athlon-direct,athlon-fpsched,athlon-fadd") (define_insn_reservation "athlon_sseadd_load" 4 (and (eq_attr "cpu" "athlon") (and (eq_attr "type" "sseadd") @@ -643,13 +793,13 @@ (eq_attr "memory" "load")))) "athlon-direct,athlon-fpload,athlon-fadd") (define_insn_reservation "athlon_sseadd_load_k8" 6 - (and (eq_attr "cpu" "k8,generic64") + (and (eq_attr "cpu" "k8,generic64,amdfam10") (and (eq_attr "type" "sseadd") (and (eq_attr "mode" "SF,DF,DI") (eq_attr "memory" "load")))) "athlon-direct,athlon-fploadk8,athlon-fadd") (define_insn_reservation "athlon_sseadd" 4 - (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") (and (eq_attr "type" "sseadd") (eq_attr "mode" "SF,DF,DI"))) "athlon-direct,athlon-fpsched,athlon-fadd") @@ -663,6 +813,11 @@ (and (eq_attr "type" "sseadd") (eq_attr "memory" "load"))) "athlon-double,athlon-fpload2k8,(athlon-fadd*2)") +(define_insn_reservation "athlon_sseaddvector_load_amdfam10" 6 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "sseadd") + (eq_attr "memory" "load"))) + "athlon-direct,athlon-fploadk8,athlon-fadd") (define_insn_reservation "athlon_sseaddvector" 5 (and (eq_attr "cpu" "athlon") (eq_attr "type" "sseadd")) @@ -671,6 +826,10 @@ (and (eq_attr "cpu" "k8,generic64") (eq_attr "type" "sseadd")) "athlon-double,athlon-fpsched,(athlon-fadd*2)") +(define_insn_reservation "athlon_sseaddvector_amdfam10" 4 + (and (eq_attr "cpu" "amdfam10") + (eq_attr "type" "sseadd")) + "athlon-direct,athlon-fpsched,athlon-fadd") ;; Conversions behaves very irregularly and the scheduling is critical here. ;; Take each instruction separately. Assume that the mode is always set to the @@ -684,12 +843,25 @@ (and (eq_attr "mode" "DF") (eq_attr "memory" "load"))))) "athlon-direct,athlon-fploadk8,athlon-fstore") +(define_insn_reservation "athlon_ssecvt_cvtss2sd_load_amdfam10" 7 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "ssecvt") + (and (eq_attr "amdfam10_decode" "double") + (and (eq_attr "mode" "DF") + (eq_attr "memory" "load"))))) + "athlon-double,athlon-fploadk8,(athlon-faddmul+athlon-fstore)") (define_insn_reservation "athlon_ssecvt_cvtss2sd" 2 (and (eq_attr "cpu" "athlon,k8,generic64") (and (eq_attr "type" "ssecvt") (and (eq_attr "athlon_decode" "direct") (eq_attr "mode" "DF")))) "athlon-direct,athlon-fpsched,athlon-fstore") +(define_insn_reservation "athlon_ssecvt_cvtss2sd_amdfam10" 7 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "ssecvt") + (and (eq_attr "amdfam10_decode" "vector") + (eq_attr "mode" "DF")))) + "athlon-vector,athlon-fpsched,athlon-faddmul,(athlon-fstore*2)") ;; cvtps2pd. Model same way the other double decoded FP conversions. (define_insn_reservation "athlon_ssecvt_cvtps2pd_load_k8" 5 (and (eq_attr "cpu" "k8,athlon,generic64") @@ -698,12 +870,25 @@ (and (eq_attr "mode" "V2DF,V4SF,TI") (eq_attr "memory" "load"))))) "athlon-double,athlon-fpload2k8,(athlon-fstore*2)") +(define_insn_reservation "athlon_ssecvt_cvtps2pd_load_amdfam10" 4 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "ssecvt") + (and (eq_attr "amdfam10_decode" "direct") + (and (eq_attr "mode" "V2DF,V4SF,TI") + (eq_attr "memory" "load"))))) + "athlon-direct,athlon-fploadk8,athlon-fstore") (define_insn_reservation "athlon_ssecvt_cvtps2pd_k8" 3 (and (eq_attr "cpu" "k8,athlon,generic64") (and (eq_attr "type" "ssecvt") (and (eq_attr "athlon_decode" "double") (eq_attr "mode" "V2DF,V4SF,TI")))) "athlon-double,athlon-fpsched,athlon-fstore,athlon-fstore") +(define_insn_reservation "athlon_ssecvt_cvtps2pd_amdfam10" 2 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "ssecvt") + (and (eq_attr "amdfam10_decode" "direct") + (eq_attr "mode" "V2DF,V4SF,TI")))) + "athlon-direct,athlon-fpsched,athlon-fstore") ;; cvtsi2sd mem,reg is directpath path (cvtsi2sd reg,reg is doublepath) ;; cvtsi2sd has troughput 1 and is executed in store unit with latency of 6 (define_insn_reservation "athlon_sseicvt_cvtsi2sd_load" 6 @@ -713,6 +898,13 @@ (and (eq_attr "mode" "SF,DF") (eq_attr "memory" "load"))))) "athlon-direct,athlon-fploadk8,athlon-fstore") +(define_insn_reservation "athlon_sseicvt_cvtsi2sd_load_amdfam10" 9 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "sseicvt") + (and (eq_attr "amdfam10_decode" "double") + (and (eq_attr "mode" "SF,DF") + (eq_attr "memory" "load"))))) + "athlon-double,athlon-fploadk8,(athlon-faddmul+athlon-fstore)") ;; cvtsi2ss mem, reg is doublepath (define_insn_reservation "athlon_sseicvt_cvtsi2ss_load" 9 (and (eq_attr "cpu" "athlon") @@ -728,6 +920,13 @@ (and (eq_attr "mode" "SF,DF") (eq_attr "memory" "load"))))) "athlon-double,athlon-fploadk8,(athlon-fstore*2)") +(define_insn_reservation "athlon_sseicvt_cvtsi2ss_load_amdfam10" 9 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "sseicvt") + (and (eq_attr "amdfam10_decode" "double") + (and (eq_attr "mode" "SF,DF") + (eq_attr "memory" "load"))))) + "athlon-double,athlon-fploadk8,(athlon-faddmul+athlon-fstore)") ;; cvtsi2sd reg,reg is double decoded (vector on Athlon) (define_insn_reservation "athlon_sseicvt_cvtsi2sd_k8" 11 (and (eq_attr "cpu" "k8,athlon,generic64") @@ -736,6 +935,13 @@ (and (eq_attr "mode" "SF,DF") (eq_attr "memory" "none"))))) "athlon-double,athlon-fploadk8,athlon-fstore") +(define_insn_reservation "athlon_sseicvt_cvtsi2sd_amdfam10" 14 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "sseicvt") + (and (eq_attr "amdfam10_decode" "vector") + (and (eq_attr "mode" "SF,DF") + (eq_attr "memory" "none"))))) + "athlon-vector,athlon-fploadk8,(athlon-faddmul+athlon-fstore)") ;; cvtsi2ss reg, reg is doublepath (define_insn_reservation "athlon_sseicvt_cvtsi2ss" 14 (and (eq_attr "cpu" "athlon,k8,generic64") @@ -744,6 +950,13 @@ (and (eq_attr "mode" "SF,DF") (eq_attr "memory" "none"))))) "athlon-vector,athlon-fploadk8,(athlon-fvector*2)") +(define_insn_reservation "athlon_sseicvt_cvtsi2ss_amdfam10" 14 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "sseicvt") + (and (eq_attr "amdfam10_decode" "vector") + (and (eq_attr "mode" "SF,DF") + (eq_attr "memory" "none"))))) + "athlon-vector,athlon-fploadk8,(athlon-faddmul+athlon-fstore)") ;; cvtsd2ss mem,reg is doublepath, troughput unknown, latency 9 (define_insn_reservation "athlon_ssecvt_cvtsd2ss_load_k8" 9 (and (eq_attr "cpu" "k8,athlon,generic64") @@ -752,6 +965,13 @@ (and (eq_attr "mode" "SF") (eq_attr "memory" "load"))))) "athlon-double,athlon-fploadk8,(athlon-fstore*3)") +(define_insn_reservation "athlon_ssecvt_cvtsd2ss_load_amdfam10" 9 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "ssecvt") + (and (eq_attr "amdfam10_decode" "double") + (and (eq_attr "mode" "SF") + (eq_attr "memory" "load"))))) + "athlon-double,athlon-fploadk8,(athlon-faddmul+athlon-fstore)") ;; cvtsd2ss reg,reg is vectorpath, troughput unknown, latency 12 (define_insn_reservation "athlon_ssecvt_cvtsd2ss" 12 (and (eq_attr "cpu" "athlon,k8,generic64") @@ -760,6 +980,13 @@ (and (eq_attr "mode" "SF") (eq_attr "memory" "none"))))) "athlon-vector,athlon-fpsched,(athlon-fvector*3)") +(define_insn_reservation "athlon_ssecvt_cvtsd2ss_amdfam10" 8 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "ssecvt") + (and (eq_attr "amdfam10_decode" "vector") + (and (eq_attr "mode" "SF") + (eq_attr "memory" "none"))))) + "athlon-vector,athlon-fpsched,athlon-faddmul,(athlon-fstore*2)") (define_insn_reservation "athlon_ssecvt_cvtpd2ps_load_k8" 8 (and (eq_attr "cpu" "athlon,k8,generic64") (and (eq_attr "type" "ssecvt") @@ -767,6 +994,13 @@ (and (eq_attr "mode" "V4SF,V2DF,TI") (eq_attr "memory" "load"))))) "athlon-double,athlon-fpload2k8,(athlon-fstore*3)") +(define_insn_reservation "athlon_ssecvt_cvtpd2ps_load_amdfam10" 9 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "ssecvt") + (and (eq_attr "amdfam10_decode" "double") + (and (eq_attr "mode" "V4SF,V2DF,TI") + (eq_attr "memory" "load"))))) + "athlon-double,athlon-fploadk8,(athlon-faddmul+athlon-fstore)") ;; cvtpd2ps mem,reg is vectorpath, troughput unknown, latency 10 ;; ??? Why it is fater than cvtsd2ss? (define_insn_reservation "athlon_ssecvt_cvtpd2ps" 8 @@ -776,6 +1010,13 @@ (and (eq_attr "mode" "V4SF,V2DF,TI") (eq_attr "memory" "none"))))) "athlon-vector,athlon-fpsched,athlon-fvector*2") +(define_insn_reservation "athlon_ssecvt_cvtpd2ps_amdfam10" 7 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "ssecvt") + (and (eq_attr "amdfam10_decode" "double") + (and (eq_attr "mode" "V4SF,V2DF,TI") + (eq_attr "memory" "none"))))) + "athlon-double,athlon-fpsched,(athlon-faddmul+athlon-fstore)") ;; cvtsd2si mem,reg is doublepath, troughput 1, latency 9 (define_insn_reservation "athlon_secvt_cvtsX2si_load" 9 (and (eq_attr "cpu" "athlon,k8,generic64") @@ -784,6 +1025,13 @@ (and (eq_attr "mode" "SI,DI") (eq_attr "memory" "load"))))) "athlon-vector,athlon-fploadk8,athlon-fvector") +(define_insn_reservation "athlon_secvt_cvtsX2si_load_amdfam10" 10 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "sseicvt") + (and (eq_attr "amdfam10_decode" "double") + (and (eq_attr "mode" "SI,DI") + (eq_attr "memory" "load"))))) + "athlon-double,athlon-fploadk8,(athlon-fadd+athlon-fstore)") ;; cvtsd2si reg,reg is doublepath, troughput 1, latency 9 (define_insn_reservation "athlon_ssecvt_cvtsX2si" 9 (and (eq_attr "cpu" "athlon") @@ -799,6 +1047,29 @@ (and (eq_attr "mode" "SI,DI") (eq_attr "memory" "none"))))) "athlon-double,athlon-fpsched,athlon-fstore") +(define_insn_reservation "athlon_ssecvt_cvtsX2si_amdfam10" 8 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "sseicvt") + (and (eq_attr "amdfam10_decode" "double") + (and (eq_attr "mode" "SI,DI") + (eq_attr "memory" "none"))))) + "athlon-double,athlon-fpsched,(athlon-fadd+athlon-fstore)") +;; cvtpd2dq reg,mem is doublepath, troughput 1, latency 9 on amdfam10 +(define_insn_reservation "athlon_sseicvt_cvtpd2dq_load_amdfam10" 9 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "sseicvt") + (and (eq_attr "amdfam10_decode" "double") + (and (eq_attr "mode" "TI") + (eq_attr "memory" "load"))))) + "athlon-double,athlon-fploadk8,(athlon-faddmul+athlon-fstore)") +;; cvtpd2dq reg,mem is doublepath, troughput 1, latency 7 on amdfam10 +(define_insn_reservation "athlon_sseicvt_cvtpd2dq_amdfam10" 7 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "sseicvt") + (and (eq_attr "amdfam10_decode" "double") + (and (eq_attr "mode" "TI") + (eq_attr "memory" "none"))))) + "athlon-double,athlon-fpsched,(athlon-faddmul+athlon-fstore)") (define_insn_reservation "athlon_ssemul_load" 4 @@ -808,13 +1079,13 @@ (eq_attr "memory" "load")))) "athlon-direct,athlon-fpload,athlon-fmul") (define_insn_reservation "athlon_ssemul_load_k8" 6 - (and (eq_attr "cpu" "k8,generic64") + (and (eq_attr "cpu" "k8,generic64,amdfam10") (and (eq_attr "type" "ssemul") (and (eq_attr "mode" "SF,DF") (eq_attr "memory" "load")))) "athlon-direct,athlon-fploadk8,athlon-fmul") (define_insn_reservation "athlon_ssemul" 4 - (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") (and (eq_attr "type" "ssemul") (eq_attr "mode" "SF,DF"))) "athlon-direct,athlon-fpsched,athlon-fmul") @@ -828,6 +1099,11 @@ (and (eq_attr "type" "ssemul") (eq_attr "memory" "load"))) "athlon-double,athlon-fpload2k8,(athlon-fmul*2)") +(define_insn_reservation "athlon_ssemulvector_load_amdfam10" 6 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "ssemul") + (eq_attr "memory" "load"))) + "athlon-direct,athlon-fploadk8,athlon-fmul") (define_insn_reservation "athlon_ssemulvector" 5 (and (eq_attr "cpu" "athlon") (eq_attr "type" "ssemul")) @@ -836,6 +1112,10 @@ (and (eq_attr "cpu" "k8,generic64") (eq_attr "type" "ssemul")) "athlon-double,athlon-fpsched,(athlon-fmul*2)") +(define_insn_reservation "athlon_ssemulvector_amdfam10" 4 + (and (eq_attr "cpu" "amdfam10") + (eq_attr "type" "ssemul")) + "athlon-direct,athlon-fpsched,athlon-fmul") ;; divsd timings. divss is faster (define_insn_reservation "athlon_ssediv_load" 20 (and (eq_attr "cpu" "athlon") @@ -844,13 +1124,13 @@ (eq_attr "memory" "load")))) "athlon-direct,athlon-fpload,athlon-fmul*17") (define_insn_reservation "athlon_ssediv_load_k8" 22 - (and (eq_attr "cpu" "k8,generic64") + (and (eq_attr "cpu" "k8,generic64,amdfam10") (and (eq_attr "type" "ssediv") (and (eq_attr "mode" "SF,DF") (eq_attr "memory" "load")))) "athlon-direct,athlon-fploadk8,athlon-fmul*17") (define_insn_reservation "athlon_ssediv" 20 - (and (eq_attr "cpu" "athlon,k8,generic64") + (and (eq_attr "cpu" "athlon,k8,generic64,amdfam10") (and (eq_attr "type" "ssediv") (eq_attr "mode" "SF,DF"))) "athlon-direct,athlon-fpsched,athlon-fmul*17") @@ -864,6 +1144,11 @@ (and (eq_attr "type" "ssediv") (eq_attr "memory" "load"))) "athlon-double,athlon-fpload2k8,athlon-fmul*34") +(define_insn_reservation "athlon_ssedivvector_load_amdfam10" 22 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "ssediv") + (eq_attr "memory" "load"))) + "athlon-direct,athlon-fploadk8,athlon-fmul*17") (define_insn_reservation "athlon_ssedivvector" 39 (and (eq_attr "cpu" "athlon") (eq_attr "type" "ssediv")) @@ -872,3 +1157,12 @@ (and (eq_attr "cpu" "k8,generic64") (eq_attr "type" "ssediv")) "athlon-double,athlon-fmul*34") +(define_insn_reservation "athlon_ssedivvector_amdfam10" 20 + (and (eq_attr "cpu" "amdfam10") + (eq_attr "type" "ssediv")) + "athlon-direct,athlon-fmul*17") +(define_insn_reservation "athlon_sseins_amdfam10" 5 + (and (eq_attr "cpu" "amdfam10") + (and (eq_attr "type" "sseins") + (eq_attr "mode" "TI"))) + "athlon-vector,athlon-fpsched,athlon-faddmul") --- gcc/config/i386/pmmintrin.h.jj 2006-10-05 00:29:29.000000000 +0200 +++ gcc/config/i386/pmmintrin.h 2007-02-09 21:26:06.000000000 +0100 @@ -30,7 +30,11 @@ #ifndef _PMMINTRIN_H_INCLUDED #define _PMMINTRIN_H_INCLUDED -#ifdef __SSE3__ +#ifndef __SSE3__ +# error "SSE3 instruction set not enabled" +#else + +/* We need definitions from the SSE2 and SSE header files*/ #include #include --- gcc/config/i386/tmmintrin.h.jj 2007-02-09 16:18:25.000000000 +0100 +++ gcc/config/i386/tmmintrin.h 2007-02-09 21:26:06.000000000 +0100 @@ -30,7 +30,11 @@ #ifndef _TMMINTRIN_H_INCLUDED #define _TMMINTRIN_H_INCLUDED -#ifdef __SSSE3__ +#ifndef __SSSE3__ +# error "SSSE3 instruction set not enabled" +#else + +/* We need definitions from the SSE3, SSE2 and SSE header files*/ #include static __inline __m128i --- gcc/config/i386/sse.md.jj 2007-02-09 16:18:25.000000000 +0100 +++ gcc/config/i386/sse.md 2007-02-09 21:26:06.000000000 +0100 @@ -963,6 +963,7 @@ "cvtsi2ss\t{%2, %0|%0, %2}" [(set_attr "type" "sseicvt") (set_attr "athlon_decode" "vector,double") + (set_attr "amdfam10_decode" "vector,double") (set_attr "mode" "SF")]) (define_insn "sse_cvtsi2ssq" @@ -976,6 +977,7 @@ "cvtsi2ssq\t{%2, %0|%0, %2}" [(set_attr "type" "sseicvt") (set_attr "athlon_decode" "vector,double") + (set_attr "amdfam10_decode" "vector,double") (set_attr "mode" "SF")]) (define_insn "sse_cvtss2si" @@ -989,6 +991,7 @@ "cvtss2si\t{%1, %0|%0, %1}" [(set_attr "type" "sseicvt") (set_attr "athlon_decode" "double,vector") + (set_attr "amdfam10_decode" "double,double") (set_attr "mode" "SI")]) (define_insn "sse_cvtss2siq" @@ -1002,6 +1005,7 @@ "cvtss2siq\t{%1, %0|%0, %1}" [(set_attr "type" "sseicvt") (set_attr "athlon_decode" "double,vector") + (set_attr "amdfam10_decode" "double,double") (set_attr "mode" "DI")]) (define_insn "sse_cvttss2si" @@ -1014,6 +1018,7 @@ "cvttss2si\t{%1, %0|%0, %1}" [(set_attr "type" "sseicvt") (set_attr "athlon_decode" "double,vector") + (set_attr "amdfam10_decode" "double,double") (set_attr "mode" "SI")]) (define_insn "sse_cvttss2siq" @@ -1026,6 +1031,7 @@ "cvttss2siq\t{%1, %0|%0, %1}" [(set_attr "type" "sseicvt") (set_attr "athlon_decode" "double,vector") + (set_attr "amdfam10_decode" "double,double") (set_attr "mode" "DI")]) (define_insn "sse2_cvtdq2ps" @@ -1921,7 +1927,8 @@ "cvtsi2sd\t{%2, %0|%0, %2}" [(set_attr "type" "sseicvt") (set_attr "mode" "DF") - (set_attr "athlon_decode" "double,direct")]) + (set_attr "athlon_decode" "double,direct") + (set_attr "amdfam10_decode" "vector,double")]) (define_insn "sse2_cvtsi2sdq" [(set (match_operand:V2DF 0 "register_operand" "=x,x") @@ -1934,7 +1941,8 @@ "cvtsi2sdq\t{%2, %0|%0, %2}" [(set_attr "type" "sseicvt") (set_attr "mode" "DF") - (set_attr "athlon_decode" "double,direct")]) + (set_attr "athlon_decode" "double,direct") + (set_attr "amdfam10_decode" "vector,double")]) (define_insn "sse2_cvtsd2si" [(set (match_operand:SI 0 "register_operand" "=r,r") @@ -1947,6 +1955,7 @@ "cvtsd2si\t{%1, %0|%0, %1}" [(set_attr "type" "sseicvt") (set_attr "athlon_decode" "double,vector") + (set_attr "amdfam10_decode" "double,double") (set_attr "mode" "SI")]) (define_insn "sse2_cvtsd2siq" @@ -1960,6 +1969,7 @@ "cvtsd2siq\t{%1, %0|%0, %1}" [(set_attr "type" "sseicvt") (set_attr "athlon_decode" "double,vector") + (set_attr "amdfam10_decode" "double,double") (set_attr "mode" "DI")]) (define_insn "sse2_cvttsd2si" @@ -1972,7 +1982,8 @@ "cvttsd2si\t{%1, %0|%0, %1}" [(set_attr "type" "sseicvt") (set_attr "mode" "SI") - (set_attr "athlon_decode" "double,vector")]) + (set_attr "athlon_decode" "double,vector") + (set_attr "amdfam10_decode" "double,double")]) (define_insn "sse2_cvttsd2siq" [(set (match_operand:DI 0 "register_operand" "=r,r") @@ -1984,7 +1995,8 @@ "cvttsd2siq\t{%1, %0|%0, %1}" [(set_attr "type" "sseicvt") (set_attr "mode" "DI") - (set_attr "athlon_decode" "double,vector")]) + (set_attr "athlon_decode" "double,vector") + (set_attr "amdfam10_decode" "double,double")]) (define_insn "sse2_cvtdq2pd" [(set (match_operand:V2DF 0 "register_operand" "=x") @@ -2015,7 +2027,8 @@ "TARGET_SSE2" "cvtpd2dq\t{%1, %0|%0, %1}" [(set_attr "type" "ssecvt") - (set_attr "mode" "TI")]) + (set_attr "mode" "TI") + (set_attr "amdfam10_decode" "double")]) (define_expand "sse2_cvttpd2dq" [(set (match_operand:V4SI 0 "register_operand" "") @@ -2033,7 +2046,8 @@ "TARGET_SSE2" "cvttpd2dq\t{%1, %0|%0, %1}" [(set_attr "type" "ssecvt") - (set_attr "mode" "TI")]) + (set_attr "mode" "TI") + (set_attr "amdfam10_decode" "double")]) (define_insn "sse2_cvtsd2ss" [(set (match_operand:V4SF 0 "register_operand" "=x,x") @@ -2047,20 +2061,22 @@ "cvtsd2ss\t{%2, %0|%0, %2}" [(set_attr "type" "ssecvt") (set_attr "athlon_decode" "vector,double") + (set_attr "amdfam10_decode" "vector,double") (set_attr "mode" "SF")]) (define_insn "sse2_cvtss2sd" - [(set (match_operand:V2DF 0 "register_operand" "=x") + [(set (match_operand:V2DF 0 "register_operand" "=x,x") (vec_merge:V2DF (float_extend:V2DF (vec_select:V2SF - (match_operand:V4SF 2 "nonimmediate_operand" "xm") + (match_operand:V4SF 2 "nonimmediate_operand" "x,m") (parallel [(const_int 0) (const_int 1)]))) - (match_operand:V2DF 1 "register_operand" "0") + (match_operand:V2DF 1 "register_operand" "0,0") (const_int 1)))] "TARGET_SSE2" "cvtss2sd\t{%2, %0|%0, %2}" [(set_attr "type" "ssecvt") + (set_attr "amdfam10_decode" "vector,double") (set_attr "mode" "DF")]) (define_expand "sse2_cvtpd2ps" @@ -2081,7 +2097,8 @@ "TARGET_SSE2" "cvtpd2ps\t{%1, %0|%0, %1}" [(set_attr "type" "ssecvt") - (set_attr "mode" "V4SF")]) + (set_attr "mode" "V4SF") + (set_attr "amdfam10_decode" "double")]) (define_insn "sse2_cvtps2pd" [(set (match_operand:V2DF 0 "register_operand" "=x") @@ -2092,7 +2109,8 @@ "TARGET_SSE2" "cvtps2pd\t{%1, %0|%0, %1}" [(set_attr "type" "ssecvt") - (set_attr "mode" "V2DF")]) + (set_attr "mode" "V2DF") + (set_attr "amdfam10_decode" "direct")]) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; @@ -4550,3 +4568,92 @@ "pabs\t{%1, %0|%0, %1}"; [(set_attr "type" "sselog1") (set_attr "mode" "DI")]) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; +;; AMD SSE4A instructions +;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(define_insn "sse4a_vmmovntv2df" + [(set (match_operand:DF 0 "memory_operand" "=m") + (unspec:DF [(vec_select:DF + (match_operand:V2DF 1 "register_operand" "x") + (parallel [(const_int 0)]))] + UNSPEC_MOVNT))] + "TARGET_SSE4A" + "movntsd\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "mode" "DF")]) + +(define_insn "sse4a_movntdf" + [(set (match_operand:DF 0 "memory_operand" "=m") + (unspec:DF [(match_operand:DF 1 "register_operand" "x")] + UNSPEC_MOVNT))] + "TARGET_SSE4A" + "movntsd\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "mode" "DF")]) + +(define_insn "sse4a_vmmovntv4sf" + [(set (match_operand:SF 0 "memory_operand" "=m") + (unspec:SF [(vec_select:SF + (match_operand:V4SF 1 "register_operand" "x") + (parallel [(const_int 0)]))] + UNSPEC_MOVNT))] + "TARGET_SSE4A" + "movntss\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "mode" "SF")]) + +(define_insn "sse4a_movntsf" + [(set (match_operand:SF 0 "memory_operand" "=m") + (unspec:SF [(match_operand:SF 1 "register_operand" "x")] + UNSPEC_MOVNT))] + "TARGET_SSE4A" + "movntss\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "mode" "SF")]) + +(define_insn "sse4a_extrqi" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0") + (match_operand 2 "const_int_operand" "") + (match_operand 3 "const_int_operand" "")] + UNSPEC_EXTRQI))] + "TARGET_SSE4A" + "extrq\t{%3, %2, %0|%0, %2, %3}" + [(set_attr "type" "sse") + (set_attr "mode" "TI")]) + +(define_insn "sse4a_extrq" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0") + (match_operand:V16QI 2 "register_operand" "x")] + UNSPEC_EXTRQ))] + "TARGET_SSE4A" + "extrq\t{%2, %0|%0, %2}" + [(set_attr "type" "sse") + (set_attr "mode" "TI")]) + +(define_insn "sse4a_insertqi" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0") + (match_operand:V2DI 2 "register_operand" "x") + (match_operand 3 "const_int_operand" "") + (match_operand 4 "const_int_operand" "")] + UNSPEC_INSERTQI))] + "TARGET_SSE4A" + "insertq\t{%4, %3, %2, %0|%0, %2, %3, %4}" + [(set_attr "type" "sseins") + (set_attr "mode" "TI")]) + +(define_insn "sse4a_insertq" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0") + (match_operand:V2DI 2 "register_operand" "x")] + UNSPEC_INSERTQ))] + "TARGET_SSE4A" + "insertq\t{%2, %0|%0, %2}" + [(set_attr "type" "sseins") + (set_attr "mode" "TI")]) --- gcc/config/i386/i386.opt.jj 2007-02-09 16:18:25.000000000 +0100 +++ gcc/config/i386/i386.opt 2007-02-09 21:26:06.000000000 +0100 @@ -205,6 +205,22 @@ mmni Target Undocumented Mask(SSSE3) MaskExists Support MMX, SSE, SSE2, SSE3 and SSSE3 built-in functions and code generation +msse4a +Target Report Mask(SSE4A) +Support MMX, SSE, SSE2, SSE3 and SSE4A built-in functions and code generation + +mpopcnt +Target Report Mask(POPCNT) +Support code generation of popcount instruction for popcount built-ins +namely __builtin_popcount, __builtin_popcountl and __builtin_popcountll + +mabm +Target Report Mask(ABM) +Support code generation of Advanced Bit Manipulation (ABM) instructions, +which include popcnt and lzcnt instructions, for popcount and clz built-ins +namely __builtin_popcount, __builtin_popcountl, __builtin_popcountll and +__builtin_clz, __builtin_clzl, __builtin_clzll + msseregparm Target RejectNegative Mask(SSEREGPARM) Use SSE register passing conventions for SF and DF mode --- gcc/config/i386/ammintrin.h.jj 2007-02-09 21:26:06.000000000 +0100 +++ gcc/config/i386/ammintrin.h 2007-02-09 21:26:06.000000000 +0100 @@ -0,0 +1,73 @@ +/* Copyright (C) 2007 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING. If not, write to + the Free Software Foundation, 51 Franklin Street, Fifth Floor, + Boston, MA 02110-1301, USA. */ + +/* As a special exception, if you include this header file into source + files compiled by GCC, this header file does not by itself cause + the resulting executable to be covered by the GNU General Public + License. This exception does not however invalidate any other + reasons why the executable file might be covered by the GNU General + Public License. */ + +/* Implemented from the specification included in the AMD Programmers + Manual Update, version 2.x */ + +#ifndef _AMMINTRIN_H_INCLUDED +#define _AMMINTRIN_H_INCLUDED + +#ifndef __SSE4A__ +# error "SSE4A instruction set not enabled" +#else + +/* We need definitions from the SSE3, SSE2 and SSE header files*/ +#include + +static __inline void __attribute__((__always_inline__)) +_mm_stream_sd (double * __P, __m128d __Y) +{ + __builtin_ia32_movntsd (__P, (__v2df) __Y); +} + +static __inline void __attribute__((__always_inline__)) +_mm_stream_ss (float * __P, __m128 __Y) +{ + __builtin_ia32_movntss (__P, (__v4sf) __Y); +} + +static __inline __m128i __attribute__((__always_inline__)) +_mm_extract_si64 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_extrq ((__v2di) __X, (__v16qi) __Y); +} + +#define _mm_extracti_si64(X, I, L) \ +((__m128i) __builtin_ia32_extrqi ((__v2di)(X), I, L)) + +static __inline __m128i __attribute__((__always_inline__)) +_mm_insert_si64 (__m128i __X,__m128i __Y) +{ + return (__m128i) __builtin_ia32_insertq ((__v2di)__X, (__v2di)__Y); +} + +#define _mm_inserti_si64(X, Y, I, L) \ +((__m128i) __builtin_ia32_insertqi ((__v2di)(X), (__v2di)(Y), I, L)) + + +#endif /* __SSE4A__ */ + +#endif /* _AMMINTRIN_H_INCLUDED */ --- gcc/config/i386/emmintrin.h.jj 2006-10-05 00:29:29.000000000 +0200 +++ gcc/config/i386/emmintrin.h 2007-02-09 21:26:06.000000000 +0100 @@ -30,7 +30,11 @@ #ifndef _EMMINTRIN_H_INCLUDED #define _EMMINTRIN_H_INCLUDED -#ifdef __SSE2__ +#ifndef __SSE2__ +# error "SSE2 instruction set not enabled" +#else + +/* We need definitions from the SSE header files*/ #include /* SSE2 */ --- gcc/config/i386/i386.c.jj 2007-02-09 16:24:00.000000000 +0100 +++ gcc/config/i386/i386.c 2007-02-10 19:47:05.000000000 +0100 @@ -534,6 +534,71 @@ struct processor_costs k8_cost = { COSTS_N_INSNS (35), /* cost of FSQRT instruction. */ }; +struct processor_costs amdfam10_cost = { + COSTS_N_INSNS (1), /* cost of an add instruction */ + COSTS_N_INSNS (2), /* cost of a lea instruction */ + COSTS_N_INSNS (1), /* variable shift costs */ + COSTS_N_INSNS (1), /* constant shift costs */ + {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ + COSTS_N_INSNS (4), /* HI */ + COSTS_N_INSNS (3), /* SI */ + COSTS_N_INSNS (4), /* DI */ + COSTS_N_INSNS (5)}, /* other */ + 0, /* cost of multiply per each bit set */ + {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */ + COSTS_N_INSNS (35), /* HI */ + COSTS_N_INSNS (51), /* SI */ + COSTS_N_INSNS (83), /* DI */ + COSTS_N_INSNS (83)}, /* other */ + COSTS_N_INSNS (1), /* cost of movsx */ + COSTS_N_INSNS (1), /* cost of movzx */ + 8, /* "large" insn */ + 9, /* MOVE_RATIO */ + 4, /* cost for loading QImode using movzbl */ + {3, 4, 3}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). */ + {3, 4, 3}, /* cost of storing integer registers */ + 4, /* cost of reg,reg fld/fst */ + {4, 4, 12}, /* cost of loading fp registers + in SFmode, DFmode and XFmode */ + {6, 6, 8}, /* cost of storing fp registers + in SFmode, DFmode and XFmode */ + 2, /* cost of moving MMX register */ + {3, 3}, /* cost of loading MMX registers + in SImode and DImode */ + {4, 4}, /* cost of storing MMX registers + in SImode and DImode */ + 2, /* cost of moving SSE register */ + {4, 4, 3}, /* cost of loading SSE registers + in SImode, DImode and TImode */ + {4, 4, 5}, /* cost of storing SSE registers + in SImode, DImode and TImode */ + 3, /* MMX or SSE register to integer */ + /* On K8 + MOVD reg64, xmmreg Double FSTORE 4 + MOVD reg32, xmmreg Double FSTORE 4 + On AMDFAM10 + MOVD reg64, xmmreg Double FADD 3 + 1/1 1/1 + MOVD reg32, xmmreg Double FADD 3 + 1/1 1/1 */ + 64, /* size of prefetch block */ + /* New AMD processors never drop prefetches; if they cannot be performed + immediately, they are queued. We set number of simultaneous prefetches + to a large constant to reflect this (it probably is not a good idea not + to limit number of prefetches at all, as their execution also takes some + time). */ + 100, /* number of parallel prefetches */ + 5, /* Branch cost */ + COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */ + COSTS_N_INSNS (4), /* cost of FMUL instruction. */ + COSTS_N_INSNS (19), /* cost of FDIV instruction. */ + COSTS_N_INSNS (2), /* cost of FABS instruction. */ + COSTS_N_INSNS (2), /* cost of FCHS instruction. */ + COSTS_N_INSNS (35), /* cost of FSQRT instruction. */ +}; + static const struct processor_costs pentium4_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ @@ -816,11 +881,13 @@ const struct processor_costs *ix86_cost #define m_PENT4 (1< +#ifdef __SSE2__ +# include +#endif #endif /* __SSE__ */ #endif /* _XMMINTRIN_H_INCLUDED */