commit dafbd9e0c347cde6d8192f374787f5dbee4aed89
Author: Jeff Law
Date:   Wed Nov 22 12:01:56 2017 -0700

    aarch64 support

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 75c12f1de65..d7100c36664 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -2747,7 +2747,14 @@ aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
   output_asm_insn ("sub\t%0, %0, %1", xops);
 
   /* Probe at TEST_ADDR.  */
-  output_asm_insn ("str\txzr, [%0]", xops);
+  if (flag_stack_clash_protection)
+    {
+      gcc_assert (xops[0] == stack_pointer_rtx);
+      xops[1] = GEN_INT (PROBE_INTERVAL - 8);
+      output_asm_insn ("str\txzr, [%0, %1]", xops);
+    }
+  else
+    output_asm_insn ("str\txzr, [%0]", xops);
 
   /* Test if TEST_ADDR == LAST_ADDR.  */
   xops[1] = reg2;
@@ -3511,6 +3518,125 @@ aarch64_set_handled_components (sbitmap components)
     cfun->machine->reg_is_wrapped_separately[regno] = true;
 }
 
+/* Allocate SIZE bytes of stack space using SCRATCH_REG as a scratch
+   register.  */
+
+static void
+aarch64_allocate_and_probe_stack_space (int scratchreg, HOST_WIDE_INT size)
+{
+  HOST_WIDE_INT probe_interval
+    = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL);
+  HOST_WIDE_INT guard_size
+    = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
+  HOST_WIDE_INT guard_used_by_caller = 1024;
+
+  /* SIZE should be large enough to require probing here.  ie, it
+     must be larger than GUARD_SIZE - GUARD_USED_BY_CALLER.
+
+     We can allocate GUARD_SIZE - GUARD_USED_BY_CALLER as a single chunk
+     without any probing.  */
+  gcc_assert (size >= guard_size - guard_used_by_caller);
+  aarch64_sub_sp (scratchreg, guard_size - guard_used_by_caller, true);
+  HOST_WIDE_INT orig_size = size;
+  size -= (guard_size - guard_used_by_caller);
+
+  HOST_WIDE_INT rounded_size = size & -probe_interval;
+  HOST_WIDE_INT residual = size - rounded_size;
+
+  /* We can handle a small number of allocations/probes inline.  Otherwise
+     punt to a loop.  */
+  if (rounded_size && rounded_size <= 4 * probe_interval)
+    {
+      /* We don't use aarch64_sub_sp here because we don't want to
+         repeatedly load SCRATCHREG.  */
+      rtx scratch_rtx = gen_rtx_REG (Pmode, scratchreg);
+      if (probe_interval > ARITH_FACTOR)
+        emit_move_insn (scratch_rtx, GEN_INT (-probe_interval));
+      else
+        scratch_rtx = GEN_INT (-probe_interval);
+
+      for (HOST_WIDE_INT i = 0; i < rounded_size; i += probe_interval)
+        {
+          rtx_insn *insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
+                                                     scratch_rtx));
+          add_reg_note (insn, REG_STACK_CHECK, const0_rtx);
+
+          if (probe_interval > ARITH_FACTOR)
+            {
+              RTX_FRAME_RELATED_P (insn) = 1;
+              rtx adj = plus_constant (Pmode, stack_pointer_rtx, -probe_interval);
+              add_reg_note (insn, REG_CFA_ADJUST_CFA,
+                            gen_rtx_SET (stack_pointer_rtx, adj));
+            }
+
+          emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
+                                           (probe_interval
+                                            - GET_MODE_SIZE (word_mode))));
+          emit_insn (gen_blockage ());
+        }
+      dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
+    }
+  else if (rounded_size)
+    {
+      /* Compute the ending address.  */
+      rtx temp = gen_rtx_REG (word_mode, scratchreg);
+      emit_move_insn (temp, GEN_INT (-rounded_size));
+      rtx_insn *insn
+        = emit_insn (gen_add3_insn (temp, stack_pointer_rtx, temp));
+
+      /* For the initial allocation, we don't have a frame pointer
+         set up, so we always need CFI notes.  If we're doing the
+         final allocation, then we may have a frame pointer, in which
+         case it is the CFA, otherwise we need CFI notes.
+
+         We can determine which allocation we are doing by looking at
+         the temporary register.
+         IP0 is the initial allocation, IP1 is the final allocation.  */
+      if (scratchreg == IP0_REGNUM || !frame_pointer_needed)
+        {
+          /* We want the CFA independent of the stack pointer for the
+             duration of the loop.  */
+          add_reg_note (insn, REG_CFA_DEF_CFA,
+                        plus_constant (Pmode, temp,
+                                       (rounded_size + (orig_size - size))));
+          RTX_FRAME_RELATED_P (insn) = 1;
+        }
+      /* This allocates and probes the stack.
+
+         It also probes at a 4k interval regardless of the value of
+         PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL.  */
+      insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
+                                               stack_pointer_rtx, temp));
+      /* Now reset the CFA register if needed.  */
+      if (scratchreg == IP0_REGNUM || !frame_pointer_needed)
+        {
+          add_reg_note (insn, REG_CFA_DEF_CFA,
+                        plus_constant (Pmode, stack_pointer_rtx,
+                                       (rounded_size + (orig_size - size))));
+          RTX_FRAME_RELATED_P (insn) = 1;
+        }
+
+      emit_insn (gen_blockage ());
+      dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
+    }
+  else
+    dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
+
+  /* Handle any residuals.
+     Note that any residual must be probed.  */
+  if (residual)
+    {
+      aarch64_sub_sp (scratchreg, residual, true);
+      add_reg_note (get_last_insn (), REG_STACK_CHECK, const0_rtx);
+      emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
+                                       (residual - GET_MODE_SIZE (word_mode))));
+      emit_insn (gen_blockage ());
+    }
+  return;
+}
+
 /* AArch64 stack frames generated by this compiler look like:
 
 	+-------------------------------+
@@ -3592,7 +3718,54 @@ aarch64_expand_prologue (void)
 	aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
     }
 
-  aarch64_sub_sp (IP0_REGNUM, initial_adjust, true);
+  /* We do not fully protect aarch64 against stack clash style attacks
+     as doing so would be prohibitively expensive with less utility over
+     time as newer compilers are deployed.
+
+     We assume the guard is at least 64k.  Furthermore, we assume that
+     the caller has not pushed the stack pointer more than 1k into
+     the guard.  A caller that pushes the stack pointer more than 1k into
+     the guard is considered invalid.
+
+     Note that the caller's ability to push the stack pointer into the
+     guard is a function of the number and size of outgoing arguments and/or
+     dynamic stack allocations due to the mandatory save of the link register
+     in the caller's frame.
+
+     With those assumptions the callee can allocate up to 63k of stack
+     space without probing.
+
+     When probing is needed, we emit a probe at the start of the prologue
+     and every PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL bytes thereafter.
+
+     We have to track how much space has been allocated, but we do not
+     track stores into the stack as implicit probes except for the
+     fp/lr store.  */
+  HOST_WIDE_INT guard_size
+    = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
+  HOST_WIDE_INT guard_used_by_caller = 1024;
+  if (flag_stack_clash_protection)
+    {
+      if (frame_size == 0)
+        dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
+      else if (initial_adjust < guard_size - guard_used_by_caller
+               && final_adjust < guard_size - guard_used_by_caller)
+        dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
+    }
+
+  /* In theory we should never have both an initial adjustment
+     and a callee save adjustment.  Verify that is the case since the
+     code below does not handle it for -fstack-clash-protection.  */
+  gcc_assert (initial_adjust == 0 || callee_adjust == 0);
+
+  /* Only probe if the initial adjustment is larger than the guard
+     less the amount of the guard reserved for use by the caller's
+     outgoing args.  */
+  if (flag_stack_clash_protection
+      && initial_adjust >= guard_size - guard_used_by_caller)
+    aarch64_allocate_and_probe_stack_space (IP0_REGNUM, initial_adjust);
+  else
+    aarch64_sub_sp (IP0_REGNUM, initial_adjust, true);
 
   if (callee_adjust != 0)
     aarch64_push_regs (reg1, reg2, callee_adjust);
@@ -3613,7 +3786,30 @@ aarch64_expand_prologue (void)
 			     callee_adjust != 0 || frame_pointer_needed);
   aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
 			     callee_adjust != 0 || frame_pointer_needed);
-  aarch64_sub_sp (IP1_REGNUM, final_adjust, !frame_pointer_needed);
+
+  /* We may need to probe the final adjustment as well.  */
+  if (flag_stack_clash_protection && final_adjust != 0)
+    {
+      /* First probe if the final adjustment is larger than the guard size
+         less the amount of the guard reserved for use by the caller's
+         outgoing args.  */
+      if (final_adjust >= guard_size - guard_used_by_caller)
+        aarch64_allocate_and_probe_stack_space (IP1_REGNUM, final_adjust);
+      else
+        aarch64_sub_sp (IP1_REGNUM, final_adjust, !frame_pointer_needed);
+
+      /* We must also probe if the final adjustment is larger than the guard
+         that is assumed used by the caller.  This may be sub-optimal.  */
+      if (final_adjust >= guard_used_by_caller)
+        {
+          if (dump_file)
+            fprintf (dump_file,
+                     "Stack clash aarch64 large outgoing arg, probing\n");
+          emit_stack_probe (stack_pointer_rtx);
+        }
+    }
+  else
+    aarch64_sub_sp (IP1_REGNUM, final_adjust, !frame_pointer_needed);
 }
 
 /* Return TRUE if we can use a simple_return insn.
@@ -3679,7 +3875,11 @@ aarch64_expand_epilogue (bool for_sibcall)
       RTX_FRAME_RELATED_P (insn) = callee_adjust == 0;
     }
   else
-    aarch64_add_sp (IP1_REGNUM, final_adjust, df_regs_ever_live_p (IP1_REGNUM));
+    aarch64_add_sp (IP1_REGNUM, final_adjust,
+                    /* A stack clash protection prologue may not have
+                       left IP1_REGNUM in a usable state.  */
+                    (flag_stack_clash_protection
+                     || df_regs_ever_live_p (IP1_REGNUM)));
 
   aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
 				callee_adjust != 0, &cfi_ops);
@@ -3702,7 +3902,11 @@ aarch64_expand_epilogue (bool for_sibcall)
       cfi_ops = NULL;
     }
 
-  aarch64_add_sp (IP0_REGNUM, initial_adjust, df_regs_ever_live_p (IP0_REGNUM));
+  /* A stack clash protection prologue may not have left IP0_REGNUM
+     in a usable state.  */
+  aarch64_add_sp (IP0_REGNUM, initial_adjust,
+                  (flag_stack_clash_protection
+                   || df_regs_ever_live_p (IP0_REGNUM)));
 
   if (cfi_ops)
     {
@@ -8696,6 +8900,12 @@ aarch64_override_options_internal (struct gcc_options *opts)
 			 opts->x_param_values,
 			 global_options_set.x_param_values);
 
+  /* We assume the guard page is 64k.  */
+  maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE,
+			 16,
+			 opts->x_param_values,
+			 global_options_set.x_param_values);
+
   aarch64_override_options_after_change_1 (opts);
 }
 
@@ -14726,6 +14936,28 @@ aarch64_sched_can_speculate_insn (rtx_insn *insn)
     }
 }
 
+/* It has been decided to allow up to 1kb of outgoing argument
+   space to be allocated w/o probing.  If more than 1kb of outgoing
+   argument space is allocated, then it must be probed and the last
+   probe must occur no more than 1kbyte away from the end of the
+   allocated space.
+
+   This implies that the residual part of an alloca allocation may
+   need probing in cases where the generic code might not otherwise
+   think a probe is needed.
+
+   This target hook returns TRUE when allocating RESIDUAL bytes of
+   alloca space requires an additional probe, otherwise FALSE is
+   returned.  */
+
+static bool
+aarch64_stack_clash_protection_final_dynamic_probe (rtx residual)
+{
+  return (residual == CONST0_RTX (Pmode)
+          || GET_CODE (residual) != CONST_INT
+          || INTVAL (residual) >= 1024);
+}
+
 /* Target-specific selftests.  */
 
 #if CHECKING_P
@@ -15154,6 +15386,10 @@ aarch64_libgcc_floating_mode_supported_p
 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
 
+#undef TARGET_STACK_CLASH_PROTECTION_FINAL_DYNAMIC_PROBE
+#define TARGET_STACK_CLASH_PROTECTION_FINAL_DYNAMIC_PROBE \
+  aarch64_stack_clash_protection_final_dynamic_probe
+
 #if CHECKING_P
 #undef TARGET_RUN_TARGET_SELFTESTS
 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 51368e29f2d..09b353d7470 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -5413,7 +5413,7 @@
 )
 
 (define_insn "probe_stack_range"
-  [(set (match_operand:DI 0 "register_operand" "=r")
+  [(set (match_operand:DI 0 "register_operand" "=rk")
 	(unspec_volatile:DI [(match_operand:DI 1 "register_operand" "0")
 			    (match_operand:DI 2 "register_operand" "r")]
 			     UNSPECV_PROBE_STACK_RANGE))]
diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-12.c b/gcc/testsuite/gcc.target/aarch64/stack-check-12.c
new file mode 100644
index 00000000000..2ce38483b6b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/stack-check-12.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fstack-clash-protection --param stack-clash-protection-guard-size=12" } */
+/* { dg-require-effective-target supports_stack_clash_protection } */
+
+extern void arf (unsigned long int *, unsigned long int *);
+void
+frob ()
+{
+  unsigned long int num[1000];
+  unsigned long int den[1000];
+  arf (den, num);
+}
+
+/* This verifies that the scheduler did not break the dependencies
+   by adjusting the offsets within the probe and that the scheduler
+   did not reorder around the stack probes.  */
+/* { dg-final { scan-assembler-times "sub\\tsp, sp, #4096\\n\\tstr\\txzr, .sp, 4088." 3 } } */
+
+
+
diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-13.c b/gcc/testsuite/gcc.target/aarch64/stack-check-13.c
new file mode 100644
index 00000000000..d8886835989
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/stack-check-13.c
@@ -0,0 +1,28 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fstack-clash-protection --param stack-clash-protection-guard-size=12" } */
+/* { dg-require-effective-target supports_stack_clash_protection } */
+
+#define ARG32(X) X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X,X
+#define ARG192(X) ARG32(X),ARG32(X),ARG32(X),ARG32(X),ARG32(X),ARG32(X)
+void out1(ARG192(__int128));
+int t1(int);
+
+int t3(int x)
+{
+  if (x < 1000)
+    return t1 (x) + 1;
+
+  out1 (ARG192(1));
+  return 0;
+}
+
+
+
+/* This test creates a large (> 1k) outgoing argument area that needs
+   to be probed.  We don't test the exact size of the space or the
+   exact offset to make the test a little less sensitive to trivial
+   output changes.  */
+/* { dg-final { scan-assembler-times "sub\\tsp, sp, #....\\n\\tstr\\txzr, \\\[sp" 1 } } */
+
+
+
diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-14.c b/gcc/testsuite/gcc.target/aarch64/stack-check-14.c
new file mode 100644
index 00000000000..59ffe01376d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/stack-check-14.c
@@ -0,0 +1,25 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fstack-clash-protection --param stack-clash-protection-guard-size=12" } */
+/* { dg-require-effective-target supports_stack_clash_protection } */
+
+int t1(int);
+
+int t2(int x)
+{
+  char *p = __builtin_alloca (4050);
+  x = t1 (x);
+  return p[x];
+}
+
+
+/* This test has a constant sized alloca that is smaller than the
+   probe interval.  But it actually requires two probes instead
+   of one because of the optimistic assumptions we made in the
+   aarch64 prologue code WRT probing state.
+
+   The form can change quite a bit so we just check for two
+   probes without looking at the actual address.  */
+/* { dg-final { scan-assembler-times "str\\txzr," 2 } } */
+
+
+
diff --git a/gcc/testsuite/gcc.target/aarch64/stack-check-15.c b/gcc/testsuite/gcc.target/aarch64/stack-check-15.c
new file mode 100644
index 00000000000..e06db6dc2f0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/stack-check-15.c
@@ -0,0 +1,24 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fstack-clash-protection --param stack-clash-protection-guard-size=12" } */
+/* { dg-require-effective-target supports_stack_clash_protection } */
+
+int t1(int);
+
+int t2(int x)
+{
+  char *p = __builtin_alloca (x);
+  x = t1 (x);
+  return p[x];
+}
+
+
+/* This test has a variable sized alloca.  It requires 3 probes: one
+   in the loop, one for the residual, and one at the end of the
+   alloca area.
+
+   The form can change quite a bit so we just check for three
+   probes without looking at the actual address.  */
+/* { dg-final { scan-assembler-times "str\\txzr," 3 } } */
+
+
+
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index 02eb2066393..5431c236aa1 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -8384,14 +8384,9 @@ proc check_effective_target_arm_coproc4_ok { } {
 #
 proc check_effective_target_supports_stack_clash_protection { } {
 
-    # Temporary until the target bits are fully ACK'd.
-#    if { [istarget aarch*-*-*] } {
-#	return 1
-#    }
-
     if { [istarget x86_64-*-*] || [istarget i?86-*-*]
 	  || [istarget powerpc*-*-*] || [istarget rs6000*-*-*]
-	  || [istarget s390*-*-*] } {
+	  || [istarget aarch64*-*-*] || [istarget s390*-*-*] } {
 	return 1
     }
     return 0
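
Illustrative addendum, not part of the commit: the inline path in
aarch64_allocate_and_probe_stack_space above allocates the frame in
PROBE_INTERVAL-sized chunks and probes each chunk 8 bytes above its low end
(PROBE_INTERVAL - GET_MODE_SIZE (word_mode)), which is the
"sub sp, sp, #4096" / "str xzr, [sp, 4088]" pair that stack-check-12.c scans
for.  The sketch below is a hypothetical standalone example (the function
names and the 16k frame size are invented) that should exercise that path
when built with the same options the new tests use.

/* Hypothetical example, not part of the patch.  Compile with:
     gcc -O2 -S -fstack-clash-protection \
         --param stack-clash-protection-guard-size=12 big_frame.c

   With a 4k guard and 1k assumed already used by the caller, a frame of
   roughly 16k should be allocated as several 4k chunks, each followed by
   a probe 8 bytes above its low end:

       sub     sp, sp, #4096
       str     xzr, [sp, 4088]  */

extern void consume (unsigned long *);

void
big_frame (void)
{
  unsigned long buf[2048];  /* ~16k of locals; size chosen for illustration.  */
  consume (buf);
}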