From 9af9dcf11bda3e2c0e24c1acaacb8685ad974e93 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 24 Jun 2021 11:41:00 +0200 Subject: [PATCH 01/41] x86/xen: Mark cpu_bringup_and_idle() as dead_end_function The asm_cpu_bringup_and_idle() function is required to push the return value on the stack in order to make ORC happy, but the only reason objtool doesn't complain is because of a happy accident. The thing is that asm_cpu_bringup_and_idle() doesn't return, so validate_branch() never terminates and falls through to the next function, which in the normal case is the hypercall_page. And that, as it happens, is 4095 NOPs and a RET. Make asm_cpu_bringup_and_idle() terminate on it's own, by making the function it calls as a dead-end. This way we no longer rely on what code happens to come after. Fixes: c3881eb58d56 ("x86/xen: Make the secondary CPU idle tasks reliable") Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Juergen Gross Reviewed-by: Miroslav Benes Link: https://lore.kernel.org/r/20210624095147.693801717@infradead.org --- tools/objtool/check.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/objtool/check.c b/tools/objtool/check.c index e5947fbb9e7a..0e3981d91afc 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -173,6 +173,7 @@ static bool __dead_end_function(struct objtool_file *file, struct symbol *func, "rewind_stack_do_exit", "kunit_try_catch_throw", "xen_start_kernel", + "cpu_bringup_and_idle", }; if (!func) From b7b205c3a0bc2b51f83cb793178ccbc12addf275 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 20 Aug 2021 12:31:07 -0700 Subject: [PATCH 02/41] x86/xen: Move hypercall_page to top of the file Because hypercall_page is page-aligned, the assembler inexplicably adds an unreachable jump from after the end of the previous code to the beginning of hypercall_page. That confuses objtool, understandably. It also creates significant text fragmentation. As a result, much of the object file is wasted text (nops). Move hypercall_page to the beginning of the file to both prevent the text fragmentation and avoid the dead jump instruction. 
$ size /tmp/head_64.before.o /tmp/head_64.after.o text data bss dec hex filename 10924 307252 4096 322272 4eae0 /tmp/head_64.before.o 6823 307252 4096 318171 4dadb /tmp/head_64.after.o Signed-off-by: Josh Poimboeuf Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Juergen Gross Link: https://lkml.kernel.org/r/20210820193107.omvshmsqbpxufzkc@treble --- arch/x86/xen/xen-head.S | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index cb6538ae2fe0..488944d6d430 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -20,6 +20,23 @@ #include #include +.pushsection .text + .balign PAGE_SIZE +SYM_CODE_START(hypercall_page) + .rept (PAGE_SIZE / 32) + UNWIND_HINT_FUNC + .skip 31, 0x90 + ret + .endr + +#define HYPERCALL(n) \ + .equ xen_hypercall_##n, hypercall_page + __HYPERVISOR_##n * 32; \ + .type xen_hypercall_##n, @function; .size xen_hypercall_##n, 32 +#include +#undef HYPERCALL +SYM_CODE_END(hypercall_page) +.popsection + #ifdef CONFIG_XEN_PV __INIT SYM_CODE_START(startup_xen) @@ -64,23 +81,6 @@ SYM_CODE_END(asm_cpu_bringup_and_idle) #endif #endif -.pushsection .text - .balign PAGE_SIZE -SYM_CODE_START(hypercall_page) - .rept (PAGE_SIZE / 32) - UNWIND_HINT_FUNC - .skip 31, 0x90 - ret - .endr - -#define HYPERCALL(n) \ - .equ xen_hypercall_##n, hypercall_page + __HYPERVISOR_##n * 32; \ - .type xen_hypercall_##n, @function; .size xen_hypercall_##n, 32 -#include -#undef HYPERCALL -SYM_CODE_END(hypercall_page) -.popsection - ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux") ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6") ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0") From 8b946cc38e063f0f7bb67789478c38f6d7d457c9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 24 Jun 2021 11:41:01 +0200 Subject: [PATCH 03/41] objtool: Introduce CFI hash Andi reported that objtool on vmlinux.o consumes more memory than his system has, leading to horrific performance. This is in part because we keep a struct instruction for every instruction in the file in-memory. Shrink struct instruction by removing the CFI state (which includes full register state) from it and demand allocating it. Given most instructions don't actually change CFI state, there's lots of repetition there, so add a hash table to find previous CFI instances. 
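For reference, the pattern here is plain hash-consing: hash the CFI payload (skipping the list linkage placed at the front of the struct), look the state up in a bucket array, and only allocate a new copy when no identical state already exists. A minimal standalone sketch of that pattern follows — it is illustrative only, not the objtool code (which uses jhash() and hlist helpers, shown in the diff below); the struct fields and the FNV-1a hash are stand-ins.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define NR_BUCKETS 1024

struct state {                       /* stand-in for struct cfi_state   */
	struct state *next;          /* linkage: excluded from hash/cmp */
	int cfa_base, cfa_offset;    /* the (highly repetitive) payload */
};

static struct state *buckets[NR_BUCKETS];

static size_t payload_size(void)
{
	return sizeof(struct state) - offsetof(struct state, cfa_base);
}

static uint32_t bucket_of(const struct state *s)
{
	const unsigned char *p = (const unsigned char *)&s->cfa_base;
	uint32_t h = 2166136261u;    /* FNV-1a over the payload only */

	for (size_t i = 0; i < payload_size(); i++)
		h = (h ^ p[i]) * 16777619u;
	return h % NR_BUCKETS;
}

/* Return the canonical copy of *tmp, allocating only when it is new. */
static struct state *find_or_add(const struct state *tmp)
{
	uint32_t b = bucket_of(tmp);
	struct state *s;

	for (s = buckets[b]; s; s = s->next)
		if (!memcmp(&s->cfa_base, &tmp->cfa_base, payload_size()))
			return s;    /* reuse: the common case */

	s = malloc(sizeof(*s));
	if (!s)
		abort();
	*s = *tmp;
	s->next = buckets[b];
	buckets[b] = s;
	return s;
}

int main(void)
{
	struct state a = { .cfa_base = 7, .cfa_offset = 8 };
	struct state b = { .cfa_base = 7, .cfa_offset = 8 };

	/* identical payloads end up sharing a single allocation */
	printf("%d\n", find_or_add(&a) == find_or_add(&b));
	return 0;
}

Because most instructions leave the CFI state unchanged, find-or-add almost always returns an existing object, which is where the memory saving quoted below comes from.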
Reduces memory consumption (and runtime) for processing an x86_64-allyesconfig: pre: 4:40.84 real, 143.99 user, 44.18 sys, 30624988 mem post: 2:14.61 real, 108.58 user, 25.04 sys, 16396184 mem Suggested-by: Andi Kleen Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20210624095147.756759107@infradead.org --- tools/objtool/arch/x86/decode.c | 20 ++-- tools/objtool/check.c | 154 +++++++++++++++++++++++--- tools/objtool/include/objtool/arch.h | 2 +- tools/objtool/include/objtool/cfi.h | 2 + tools/objtool/include/objtool/check.h | 2 +- tools/objtool/orc_gen.c | 15 ++- 6 files changed, 160 insertions(+), 35 deletions(-) diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c index bc821056aba9..3435a32afbd1 100644 --- a/tools/objtool/arch/x86/decode.c +++ b/tools/objtool/arch/x86/decode.c @@ -779,34 +779,32 @@ int arch_rewrite_retpolines(struct objtool_file *file) return 0; } -int arch_decode_hint_reg(struct instruction *insn, u8 sp_reg) +int arch_decode_hint_reg(u8 sp_reg, int *base) { - struct cfi_reg *cfa = &insn->cfi.cfa; - switch (sp_reg) { case ORC_REG_UNDEFINED: - cfa->base = CFI_UNDEFINED; + *base = CFI_UNDEFINED; break; case ORC_REG_SP: - cfa->base = CFI_SP; + *base = CFI_SP; break; case ORC_REG_BP: - cfa->base = CFI_BP; + *base = CFI_BP; break; case ORC_REG_SP_INDIRECT: - cfa->base = CFI_SP_INDIRECT; + *base = CFI_SP_INDIRECT; break; case ORC_REG_R10: - cfa->base = CFI_R10; + *base = CFI_R10; break; case ORC_REG_R13: - cfa->base = CFI_R13; + *base = CFI_R13; break; case ORC_REG_DI: - cfa->base = CFI_DI; + *base = CFI_DI; break; case ORC_REG_DX: - cfa->base = CFI_DX; + *base = CFI_DX; break; default: return -1; diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 0e3981d91afc..d9f32739b433 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -5,6 +5,7 @@ #include #include +#include #include #include @@ -26,7 +27,11 @@ struct alternative { bool skip_orig; }; -struct cfi_init_state initial_func_cfi; +static unsigned long nr_cfi, nr_cfi_reused, nr_cfi_cache; + +static struct cfi_init_state initial_func_cfi; +static struct cfi_state init_cfi; +static struct cfi_state func_cfi; struct instruction *find_insn(struct objtool_file *file, struct section *sec, unsigned long offset) @@ -266,6 +271,78 @@ static void init_insn_state(struct insn_state *state, struct section *sec) state->noinstr = sec->noinstr; } +static struct cfi_state *cfi_alloc(void) +{ + struct cfi_state *cfi = calloc(sizeof(struct cfi_state), 1); + if (!cfi) { + WARN("calloc failed"); + exit(1); + } + nr_cfi++; + return cfi; +} + +static int cfi_bits; +static struct hlist_head *cfi_hash; + +static inline bool cficmp(struct cfi_state *cfi1, struct cfi_state *cfi2) +{ + return memcmp((void *)cfi1 + sizeof(cfi1->hash), + (void *)cfi2 + sizeof(cfi2->hash), + sizeof(struct cfi_state) - sizeof(struct hlist_node)); +} + +static inline u32 cfi_key(struct cfi_state *cfi) +{ + return jhash((void *)cfi + sizeof(cfi->hash), + sizeof(*cfi) - sizeof(cfi->hash), 0); +} + +static struct cfi_state *cfi_hash_find_or_add(struct cfi_state *cfi) +{ + struct hlist_head *head = &cfi_hash[hash_min(cfi_key(cfi), cfi_bits)]; + struct cfi_state *obj; + + hlist_for_each_entry(obj, head, hash) { + if (!cficmp(cfi, obj)) { + nr_cfi_cache++; + return obj; + } + } + + obj = cfi_alloc(); + *obj = *cfi; + hlist_add_head(&obj->hash, head); + + return obj; +} + +static void cfi_hash_add(struct cfi_state *cfi) +{ + struct hlist_head *head = &cfi_hash[hash_min(cfi_key(cfi), cfi_bits)]; + + 
hlist_add_head(&cfi->hash, head); +} + +static void *cfi_hash_alloc(unsigned long size) +{ + cfi_bits = max(10, ilog2(size)); + cfi_hash = mmap(NULL, sizeof(struct hlist_head) << cfi_bits, + PROT_READ|PROT_WRITE, + MAP_PRIVATE|MAP_ANON, -1, 0); + if (cfi_hash == (void *)-1L) { + WARN("mmap fail cfi_hash"); + cfi_hash = NULL; + } else if (stats) { + printf("cfi_bits: %d\n", cfi_bits); + } + + return cfi_hash; +} + +static unsigned long nr_insns; +static unsigned long nr_insns_visited; + /* * Call the arch-specific instruction decoder for all the instructions and add * them to the global instruction list. @@ -276,7 +353,6 @@ static int decode_instructions(struct objtool_file *file) struct symbol *func; unsigned long offset; struct instruction *insn; - unsigned long nr_insns = 0; int ret; for_each_sec(file, sec) { @@ -302,7 +378,6 @@ static int decode_instructions(struct objtool_file *file) memset(insn, 0, sizeof(*insn)); INIT_LIST_HEAD(&insn->alts); INIT_LIST_HEAD(&insn->stack_ops); - init_cfi_state(&insn->cfi); insn->sec = sec; insn->offset = offset; @@ -1137,7 +1212,6 @@ static int handle_group_alt(struct objtool_file *file, memset(nop, 0, sizeof(*nop)); INIT_LIST_HEAD(&nop->alts); INIT_LIST_HEAD(&nop->stack_ops); - init_cfi_state(&nop->cfi); nop->sec = special_alt->new_sec; nop->offset = special_alt->new_off + special_alt->new_len; @@ -1546,10 +1620,11 @@ static void set_func_state(struct cfi_state *state) static int read_unwind_hints(struct objtool_file *file) { + struct cfi_state cfi = init_cfi; struct section *sec, *relocsec; - struct reloc *reloc; struct unwind_hint *hint; struct instruction *insn; + struct reloc *reloc; int i; sec = find_section_by_name(file->elf, ".discard.unwind_hints"); @@ -1587,19 +1662,24 @@ static int read_unwind_hints(struct objtool_file *file) insn->hint = true; if (hint->type == UNWIND_HINT_TYPE_FUNC) { - set_func_state(&insn->cfi); + insn->cfi = &func_cfi; continue; } - if (arch_decode_hint_reg(insn, hint->sp_reg)) { + if (insn->cfi) + cfi = *(insn->cfi); + + if (arch_decode_hint_reg(hint->sp_reg, &cfi.cfa.base)) { WARN_FUNC("unsupported unwind_hint sp base reg %d", insn->sec, insn->offset, hint->sp_reg); return -1; } - insn->cfi.cfa.offset = bswap_if_needed(hint->sp_offset); - insn->cfi.type = hint->type; - insn->cfi.end = hint->end; + cfi.cfa.offset = bswap_if_needed(hint->sp_offset); + cfi.type = hint->type; + cfi.end = hint->end; + + insn->cfi = cfi_hash_find_or_add(&cfi); } return 0; @@ -2453,13 +2533,18 @@ static int propagate_alt_cfi(struct objtool_file *file, struct instruction *insn if (!insn->alt_group) return 0; + if (!insn->cfi) { + WARN("CFI missing"); + return -1; + } + alt_cfi = insn->alt_group->cfi; group_off = insn->offset - insn->alt_group->first_insn->offset; if (!alt_cfi[group_off]) { - alt_cfi[group_off] = &insn->cfi; + alt_cfi[group_off] = insn->cfi; } else { - if (memcmp(alt_cfi[group_off], &insn->cfi, sizeof(struct cfi_state))) { + if (cficmp(alt_cfi[group_off], insn->cfi)) { WARN_FUNC("stack layout conflict in alternatives", insn->sec, insn->offset); return -1; @@ -2510,9 +2595,14 @@ static int handle_insn_ops(struct instruction *insn, static bool insn_cfi_match(struct instruction *insn, struct cfi_state *cfi2) { - struct cfi_state *cfi1 = &insn->cfi; + struct cfi_state *cfi1 = insn->cfi; int i; + if (!cfi1) { + WARN("CFI missing"); + return false; + } + if (memcmp(&cfi1->cfa, &cfi2->cfa, sizeof(cfi1->cfa))) { WARN_FUNC("stack state mismatch: cfa1=%d%+d cfa2=%d%+d", @@ -2697,7 +2787,7 @@ static int validate_branch(struct 
objtool_file *file, struct symbol *func, struct instruction *insn, struct insn_state state) { struct alternative *alt; - struct instruction *next_insn; + struct instruction *next_insn, *prev_insn = NULL; struct section *sec; u8 visited; int ret; @@ -2726,15 +2816,25 @@ static int validate_branch(struct objtool_file *file, struct symbol *func, if (insn->visited & visited) return 0; + } else { + nr_insns_visited++; } if (state.noinstr) state.instr += insn->instr; - if (insn->hint) - state.cfi = insn->cfi; - else - insn->cfi = state.cfi; + if (insn->hint) { + state.cfi = *insn->cfi; + } else { + /* XXX track if we actually changed state.cfi */ + + if (prev_insn && !cficmp(prev_insn->cfi, &state.cfi)) { + insn->cfi = prev_insn->cfi; + nr_cfi_reused++; + } else { + insn->cfi = cfi_hash_find_or_add(&state.cfi); + } + } insn->visited |= visited; @@ -2884,6 +2984,7 @@ static int validate_branch(struct objtool_file *file, struct symbol *func, return 1; } + prev_insn = insn; insn = next_insn; } @@ -3139,10 +3240,20 @@ int check(struct objtool_file *file) int ret, warnings = 0; arch_initial_func_cfi_state(&initial_func_cfi); + init_cfi_state(&init_cfi); + init_cfi_state(&func_cfi); + set_func_state(&func_cfi); + + if (!cfi_hash_alloc(1UL << (file->elf->symbol_bits - 3))) + goto out; + + cfi_hash_add(&init_cfi); + cfi_hash_add(&func_cfi); ret = decode_sections(file); if (ret < 0) goto out; + warnings += ret; if (list_empty(&file->insn_list)) @@ -3193,6 +3304,13 @@ int check(struct objtool_file *file) warnings += ret; } + if (stats) { + printf("nr_insns_visited: %ld\n", nr_insns_visited); + printf("nr_cfi: %ld\n", nr_cfi); + printf("nr_cfi_reused: %ld\n", nr_cfi_reused); + printf("nr_cfi_cache: %ld\n", nr_cfi_cache); + } + out: /* * For now, don't fail the kernel build on fatal warnings. 
These diff --git a/tools/objtool/include/objtool/arch.h b/tools/objtool/include/objtool/arch.h index 062bb6e9b865..a5ab6829511f 100644 --- a/tools/objtool/include/objtool/arch.h +++ b/tools/objtool/include/objtool/arch.h @@ -83,7 +83,7 @@ unsigned long arch_dest_reloc_offset(int addend); const char *arch_nop_insn(int len); -int arch_decode_hint_reg(struct instruction *insn, u8 sp_reg); +int arch_decode_hint_reg(u8 sp_reg, int *base); bool arch_is_retpoline(struct symbol *sym); diff --git a/tools/objtool/include/objtool/cfi.h b/tools/objtool/include/objtool/cfi.h index fd5cb0bed9bf..f11d1ac1dadf 100644 --- a/tools/objtool/include/objtool/cfi.h +++ b/tools/objtool/include/objtool/cfi.h @@ -7,6 +7,7 @@ #define _OBJTOOL_CFI_H #include +#include #define CFI_UNDEFINED -1 #define CFI_CFA -2 @@ -24,6 +25,7 @@ struct cfi_init_state { }; struct cfi_state { + struct hlist_node hash; /* must be first, cficmp() */ struct cfi_reg regs[CFI_NUM_REGS]; struct cfi_reg vals[CFI_NUM_REGS]; struct cfi_reg cfa; diff --git a/tools/objtool/include/objtool/check.h b/tools/objtool/include/objtool/check.h index 56d50bc50c10..07e99c25c7ac 100644 --- a/tools/objtool/include/objtool/check.h +++ b/tools/objtool/include/objtool/check.h @@ -60,7 +60,7 @@ struct instruction { struct list_head alts; struct symbol *func; struct list_head stack_ops; - struct cfi_state cfi; + struct cfi_state *cfi; }; static inline bool is_static_jump(struct instruction *insn) diff --git a/tools/objtool/orc_gen.c b/tools/objtool/orc_gen.c index dc9b7dd314b0..ddacb4215748 100644 --- a/tools/objtool/orc_gen.c +++ b/tools/objtool/orc_gen.c @@ -13,13 +13,19 @@ #include #include -static int init_orc_entry(struct orc_entry *orc, struct cfi_state *cfi) +static int init_orc_entry(struct orc_entry *orc, struct cfi_state *cfi, + struct instruction *insn) { - struct instruction *insn = container_of(cfi, struct instruction, cfi); struct cfi_reg *bp = &cfi->regs[CFI_BP]; memset(orc, 0, sizeof(*orc)); + if (!cfi) { + orc->end = 0; + orc->sp_reg = ORC_REG_UNDEFINED; + return 0; + } + orc->end = cfi->end; if (cfi->cfa.base == CFI_UNDEFINED) { @@ -162,7 +168,7 @@ int orc_create(struct objtool_file *file) int i; if (!alt_group) { - if (init_orc_entry(&orc, &insn->cfi)) + if (init_orc_entry(&orc, insn->cfi, insn)) return -1; if (!memcmp(&prev_orc, &orc, sizeof(orc))) continue; @@ -186,7 +192,8 @@ int orc_create(struct objtool_file *file) struct cfi_state *cfi = alt_group->cfi[i]; if (!cfi) continue; - if (init_orc_entry(&orc, cfi)) + /* errors are reported on the original insn */ + if (init_orc_entry(&orc, cfi, insn)) return -1; if (!memcmp(&prev_orc, &orc, sizeof(orc))) continue; From f56dae88a81fded66adf2bea9922d1d98d1da14f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 24 Jun 2021 11:41:02 +0200 Subject: [PATCH 04/41] objtool: Handle __sanitize_cov*() tail calls Turns out the compilers also generate tail calls to __sanitize_cov*(), make sure to also patch those out in noinstr code. 
Fixes: 0f1441b44e82 ("objtool: Fix noinstr vs KCOV") Signed-off-by: Peter Zijlstra (Intel) Acked-by: Marco Elver Link: https://lore.kernel.org/r/20210624095147.818783799@infradead.org --- tools/objtool/arch/x86/decode.c | 20 ++++ tools/objtool/check.c | 158 ++++++++++++++------------- tools/objtool/include/objtool/arch.h | 1 + 3 files changed, 105 insertions(+), 74 deletions(-) diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c index 3435a32afbd1..340a3dce94a0 100644 --- a/tools/objtool/arch/x86/decode.c +++ b/tools/objtool/arch/x86/decode.c @@ -659,6 +659,26 @@ const char *arch_nop_insn(int len) return nops[len-1]; } +#define BYTE_RET 0xC3 + +const char *arch_ret_insn(int len) +{ + static const char ret[5][5] = { + { BYTE_RET }, + { BYTE_RET, BYTES_NOP1 }, + { BYTE_RET, BYTES_NOP2 }, + { BYTE_RET, BYTES_NOP3 }, + { BYTE_RET, BYTES_NOP4 }, + }; + + if (len < 1 || len > 5) { + WARN("invalid RET size: %d\n", len); + return NULL; + } + + return ret[len-1]; +} + /* asm/alternative.h ? */ #define ALTINSTR_FLAG_INV (1 << 15) diff --git a/tools/objtool/check.c b/tools/objtool/check.c index d9f32739b433..c6f206fee8ff 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -904,6 +904,79 @@ static struct reloc *insn_reloc(struct objtool_file *file, struct instruction *i return insn->reloc; } +static void remove_insn_ops(struct instruction *insn) +{ + struct stack_op *op, *tmp; + + list_for_each_entry_safe(op, tmp, &insn->stack_ops, list) { + list_del(&op->list); + free(op); + } +} + +static void add_call_dest(struct objtool_file *file, struct instruction *insn, + struct symbol *dest, bool sibling) +{ + struct reloc *reloc = insn_reloc(file, insn); + + insn->call_dest = dest; + if (!dest) + return; + + if (insn->call_dest->static_call_tramp) { + list_add_tail(&insn->call_node, + &file->static_call_list); + } + + /* + * Many compilers cannot disable KCOV with a function attribute + * so they need a little help, NOP out any KCOV calls from noinstr + * text. + */ + if (insn->sec->noinstr && + !strncmp(insn->call_dest->name, "__sanitizer_cov_", 16)) { + if (reloc) { + reloc->type = R_NONE; + elf_write_reloc(file->elf, reloc); + } + + elf_write_insn(file->elf, insn->sec, + insn->offset, insn->len, + sibling ? arch_ret_insn(insn->len) + : arch_nop_insn(insn->len)); + + insn->type = sibling ? INSN_RETURN : INSN_NOP; + } + + if (mcount && !strcmp(insn->call_dest->name, "__fentry__")) { + if (sibling) + WARN_FUNC("Tail call to __fentry__ !?!?", insn->sec, insn->offset); + + if (reloc) { + reloc->type = R_NONE; + elf_write_reloc(file->elf, reloc); + } + + elf_write_insn(file->elf, insn->sec, + insn->offset, insn->len, + arch_nop_insn(insn->len)); + + insn->type = INSN_NOP; + + list_add_tail(&insn->mcount_loc_node, + &file->mcount_loc_list); + } + + /* + * Whatever stack impact regular CALLs have, should be undone + * by the RETURN of the called function. + * + * Annotated intra-function calls retain the stack_ops but + * are converted to JUMP, see read_intra_function_calls(). + */ + remove_insn_ops(insn); +} + /* * Find the destination instructions for all jumps. 
*/ @@ -942,11 +1015,7 @@ static int add_jump_destinations(struct objtool_file *file) continue; } else if (insn->func) { /* internal or external sibling call (with reloc) */ - insn->call_dest = reloc->sym; - if (insn->call_dest->static_call_tramp) { - list_add_tail(&insn->call_node, - &file->static_call_list); - } + add_call_dest(file, insn, reloc->sym, true); continue; } else if (reloc->sym->sec->idx) { dest_sec = reloc->sym->sec; @@ -1002,13 +1071,8 @@ static int add_jump_destinations(struct objtool_file *file) } else if (insn->jump_dest->func->pfunc != insn->func->pfunc && insn->jump_dest->offset == insn->jump_dest->func->offset) { - /* internal sibling call (without reloc) */ - insn->call_dest = insn->jump_dest->func; - if (insn->call_dest->static_call_tramp) { - list_add_tail(&insn->call_node, - &file->static_call_list); - } + add_call_dest(file, insn, insn->jump_dest->func, true); } } } @@ -1016,16 +1080,6 @@ static int add_jump_destinations(struct objtool_file *file) return 0; } -static void remove_insn_ops(struct instruction *insn) -{ - struct stack_op *op, *tmp; - - list_for_each_entry_safe(op, tmp, &insn->stack_ops, list) { - list_del(&op->list); - free(op); - } -} - static struct symbol *find_call_destination(struct section *sec, unsigned long offset) { struct symbol *call_dest; @@ -1044,6 +1098,7 @@ static int add_call_destinations(struct objtool_file *file) { struct instruction *insn; unsigned long dest_off; + struct symbol *dest; struct reloc *reloc; for_each_insn(file, insn) { @@ -1053,7 +1108,9 @@ static int add_call_destinations(struct objtool_file *file) reloc = insn_reloc(file, insn); if (!reloc) { dest_off = arch_jump_destination(insn); - insn->call_dest = find_call_destination(insn->sec, dest_off); + dest = find_call_destination(insn->sec, dest_off); + + add_call_dest(file, insn, dest, false); if (insn->ignore) continue; @@ -1071,9 +1128,8 @@ static int add_call_destinations(struct objtool_file *file) } else if (reloc->sym->type == STT_SECTION) { dest_off = arch_dest_reloc_offset(reloc->addend); - insn->call_dest = find_call_destination(reloc->sym->sec, - dest_off); - if (!insn->call_dest) { + dest = find_call_destination(reloc->sym->sec, dest_off); + if (!dest) { WARN_FUNC("can't find call dest symbol at %s+0x%lx", insn->sec, insn->offset, reloc->sym->sec->name, @@ -1081,6 +1137,8 @@ static int add_call_destinations(struct objtool_file *file) return -1; } + add_call_dest(file, insn, dest, false); + } else if (arch_is_retpoline(reloc->sym)) { /* * Retpoline calls are really dynamic calls in @@ -1096,55 +1154,7 @@ static int add_call_destinations(struct objtool_file *file) continue; } else - insn->call_dest = reloc->sym; - - if (insn->call_dest && insn->call_dest->static_call_tramp) { - list_add_tail(&insn->call_node, - &file->static_call_list); - } - - /* - * Many compilers cannot disable KCOV with a function attribute - * so they need a little help, NOP out any KCOV calls from noinstr - * text. 
- */ - if (insn->sec->noinstr && - !strncmp(insn->call_dest->name, "__sanitizer_cov_", 16)) { - if (reloc) { - reloc->type = R_NONE; - elf_write_reloc(file->elf, reloc); - } - - elf_write_insn(file->elf, insn->sec, - insn->offset, insn->len, - arch_nop_insn(insn->len)); - insn->type = INSN_NOP; - } - - if (mcount && !strcmp(insn->call_dest->name, "__fentry__")) { - if (reloc) { - reloc->type = R_NONE; - elf_write_reloc(file->elf, reloc); - } - - elf_write_insn(file->elf, insn->sec, - insn->offset, insn->len, - arch_nop_insn(insn->len)); - - insn->type = INSN_NOP; - - list_add_tail(&insn->mcount_loc_node, - &file->mcount_loc_list); - } - - /* - * Whatever stack impact regular CALLs have, should be undone - * by the RETURN of the called function. - * - * Annotated intra-function calls retain the stack_ops but - * are converted to JUMP, see read_intra_function_calls(). - */ - remove_insn_ops(insn); + add_call_dest(file, insn, reloc->sym, false); } return 0; diff --git a/tools/objtool/include/objtool/arch.h b/tools/objtool/include/objtool/arch.h index a5ab6829511f..6f482ae2d7d8 100644 --- a/tools/objtool/include/objtool/arch.h +++ b/tools/objtool/include/objtool/arch.h @@ -82,6 +82,7 @@ unsigned long arch_jump_destination(struct instruction *insn); unsigned long arch_dest_reloc_offset(int addend); const char *arch_nop_insn(int len); +const char *arch_ret_insn(int len); int arch_decode_hint_reg(u8 sp_reg, int *base); From 2b2f72d4d81936bc08c18c426f40b7df70e2f8e7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 24 Jun 2021 11:41:03 +0200 Subject: [PATCH 05/41] x86/kvm: Always inline sev_*guest() vmlinux.o: warning: objtool: svm_vcpu_enter_exit()+0x4d: call to sev_es_guest() leaves .noinstr.text section vmlinux.o: warning: objtool: svm_vcpu_enter_exit()+0x50: call to sev_guest() leaves .noinstr.text section Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20210624095147.880513802@infradead.org --- arch/x86/kvm/svm/svm.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 524d943f3efc..408031a312c9 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -218,12 +218,12 @@ DECLARE_PER_CPU(struct svm_cpu_data *, svm_data); void recalc_intercepts(struct vcpu_svm *svm); -static inline struct kvm_svm *to_kvm_svm(struct kvm *kvm) +static __always_inline struct kvm_svm *to_kvm_svm(struct kvm *kvm) { return container_of(kvm, struct kvm_svm, kvm); } -static inline bool sev_guest(struct kvm *kvm) +static __always_inline bool sev_guest(struct kvm *kvm) { #ifdef CONFIG_KVM_AMD_SEV struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; @@ -234,7 +234,7 @@ static inline bool sev_guest(struct kvm *kvm) #endif } -static inline bool sev_es_guest(struct kvm *kvm) +static __always_inline bool sev_es_guest(struct kvm *kvm) { #ifdef CONFIG_KVM_AMD_SEV struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; From a168233a440d01d60ca65ea41e876661466f108b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 24 Jun 2021 11:41:04 +0200 Subject: [PATCH 06/41] x86/kvm: Always inline vmload() / vmsave() vmlinux.o: warning: objtool: svm_vcpu_enter_exit()+0xea: call to vmload() leaves .noinstr.text section vmlinux.o: warning: objtool: svm_vcpu_enter_exit()+0x133: call to vmsave() leaves .noinstr.text section Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20210624095147.942250748@infradead.org --- arch/x86/kvm/svm/svm_ops.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/arch/x86/kvm/svm/svm_ops.h b/arch/x86/kvm/svm/svm_ops.h index 22e2b019de37..9430d6437c9f 100644 --- a/arch/x86/kvm/svm/svm_ops.h +++ b/arch/x86/kvm/svm/svm_ops.h @@ -56,12 +56,12 @@ static inline void invlpga(unsigned long addr, u32 asid) * VMSAVE, VMLOAD, etc... is still controlled by the effective address size, * hence 'unsigned long' instead of 'hpa_t'. */ -static inline void vmsave(unsigned long pa) +static __always_inline void vmsave(unsigned long pa) { svm_asm1(vmsave, "a" (pa), "memory"); } -static inline void vmload(unsigned long pa) +static __always_inline void vmload(unsigned long pa) { svm_asm1(vmload, "a" (pa), "memory"); } From e25b694bf1d9ef4a3f36c0b85348f8e780f22139 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 24 Jun 2021 11:41:05 +0200 Subject: [PATCH 07/41] x86: Always inline context_tracking_guest_enter() Yes, it really did out-of-line this.... vmlinux.o: warning: objtool: vmx_vcpu_enter_exit()+0x31: call to context_tracking_guest_enter() leaves .noinstr.text section 000000000019f660 : 19f660: e8 00 00 00 00 callq 19f665 19f661: R_X86_64_PLT32 __sanitizer_cov_trace_pc-0x4 19f665: 31 c0 xor %eax,%eax 19f667: c3 retq Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20210624095148.003928226@infradead.org --- include/linux/context_tracking.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h index 4d7fced3a39f..7a14807c9d1a 100644 --- a/include/linux/context_tracking.h +++ b/include/linux/context_tracking.h @@ -105,7 +105,7 @@ static inline void user_exit_irqoff(void) { } static inline enum ctx_state exception_enter(void) { return 0; } static inline void exception_exit(enum ctx_state prev_ctx) { } static inline enum ctx_state ct_state(void) { return CONTEXT_DISABLED; } -static inline bool context_tracking_guest_enter(void) { return false; } +static __always_inline bool context_tracking_guest_enter(void) { return false; } static inline void context_tracking_guest_exit(void) { } #endif /* !CONFIG_CONTEXT_TRACKING */ From aee045ed0a6b22100f4d5945ee2deb75db6a0dd5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 24 Jun 2021 11:41:06 +0200 Subject: [PATCH 08/41] x86/kvm: Always inline to_svm() vmlinux.o: warning: objtool: svm_vcpu_enter_exit()+0x13: call to to_svm() leaves .noinstr.text section Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20210624095148.066347165@infradead.org --- arch/x86/kvm/svm/svm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 408031a312c9..38f12a656d9c 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -271,7 +271,7 @@ static inline bool vmcb_is_dirty(struct vmcb *vmcb, int bit) return !test_bit(bit, (unsigned long *)&vmcb->control.clean); } -static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) +static __always_inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) { return container_of(vcpu, struct vcpu_svm, vcpu); } From 010050a86393703f43859a4704d2193be49126d6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 24 Jun 2021 11:41:07 +0200 Subject: [PATCH 09/41] x86/kvm: Always inline evmcs_write64() vmlinux.o: warning: objtool: vmx_update_host_rsp()+0x64: call to evmcs_write64() leaves .noinstr.text section Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20210624095148.126956644@infradead.org --- arch/x86/kvm/vmx/evmcs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff 
--git a/arch/x86/kvm/vmx/evmcs.h b/arch/x86/kvm/vmx/evmcs.h index 152ab0aa82cf..16731d2cf231 100644 --- a/arch/x86/kvm/vmx/evmcs.h +++ b/arch/x86/kvm/vmx/evmcs.h @@ -93,7 +93,7 @@ static __always_inline int get_evmcs_offset(unsigned long field, return evmcs_field->offset; } -static inline void evmcs_write64(unsigned long field, u64 value) +static __always_inline void evmcs_write64(unsigned long field, u64 value) { u16 clean_field; int offset = get_evmcs_offset(field, &clean_field); @@ -183,7 +183,7 @@ static inline void evmcs_load(u64 phys_addr) __init void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf); #else /* !IS_ENABLED(CONFIG_HYPERV) */ -static inline void evmcs_write64(unsigned long field, u64 value) {} +static __always_inline void evmcs_write64(unsigned long field, u64 value) {} static inline void evmcs_write32(unsigned long field, u32 value) {} static inline void evmcs_write16(unsigned long field, u16 value) {} static inline u64 evmcs_read64(unsigned long field) { return 0; } From c6b01dace2cd7f6b3e9174d4d1411755608486f1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 24 Jun 2021 11:41:08 +0200 Subject: [PATCH 10/41] x86: Always inline ip_within_syscall_gap() vmlinux.o: warning: objtool: vc_switch_off_ist()+0x20: call to ip_within_syscall_gap.isra.0() leaves .noinstr.text section Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20210624095148.188166492@infradead.org --- arch/x86/include/asm/ptrace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index b94f615600d5..703663175a5a 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -181,7 +181,7 @@ static inline bool any_64bit_mode(struct pt_regs *regs) #define current_user_stack_pointer() current_pt_regs()->sp #define compat_user_stack_pointer() current_pt_regs()->sp -static inline bool ip_within_syscall_gap(struct pt_regs *regs) +static __always_inline bool ip_within_syscall_gap(struct pt_regs *regs) { bool ret = (regs->ip >= (unsigned long)entry_SYSCALL_64 && regs->ip < (unsigned long)entry_SYSCALL_64_safe_stack); From 2c36d87be49355931da5b29ef7621505e0e46ce9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 24 Jun 2021 11:41:09 +0200 Subject: [PATCH 11/41] x86/sev: Fix noinstr for vc_ghcb_invalidate() vmlinux.o: warning: objtool: __sev_put_ghcb()+0x88: call to __memset() leaves .noinstr.text section vmlinux.o: warning: objtool: __sev_es_nmi_complete()+0x39: call to __memset() leaves .noinstr.text section Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20210624095148.250770465@infradead.org --- arch/x86/kernel/sev-shared.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c index 9f90f460a28c..34f20e08dc46 100644 --- a/arch/x86/kernel/sev-shared.c +++ b/arch/x86/kernel/sev-shared.c @@ -64,7 +64,7 @@ static bool sev_es_negotiate_protocol(void) static __always_inline void vc_ghcb_invalidate(struct ghcb *ghcb) { ghcb->save.sw_exit_code = 0; - memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap)); + __builtin_memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap)); } static bool vc_decoding_needed(unsigned long exit_code) From ce0b9c805dd66d5e49fd53ec5415ae398f4c56e6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 24 Jun 2021 11:41:10 +0200 Subject: [PATCH 12/41] locking/lockdep: Avoid RCU-induced noinstr fail vmlinux.o: warning: objtool: 
look_up_lock_class()+0xc7: call to rcu_read_lock_any_held() leaves .noinstr.text section Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20210624095148.311980536@infradead.org --- kernel/locking/lockdep.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index bf1c00c881e4..8a509672a4cc 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -888,7 +888,7 @@ look_up_lock_class(const struct lockdep_map *lock, unsigned int subclass) if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return NULL; - hlist_for_each_entry_rcu(class, hash_head, hash_entry) { + hlist_for_each_entry_rcu_notrace(class, hash_head, hash_entry) { if (class->key == key) { /* * Huh! same key, different name? Did someone trample From e9382440de18718fb6f878986c0844c30abc6f99 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 24 Jun 2021 11:41:11 +0200 Subject: [PATCH 13/41] x86/paravirt: Mark arch_local_irq_*() __always_inline vmlinux.o: warning: objtool: lockdep_hardirqs_on()+0x72: call to arch_local_save_flags() leaves .noinstr.text section vmlinux.o: warning: objtool: lockdep_hardirqs_off()+0x73: call to arch_local_save_flags() leaves .noinstr.text section vmlinux.o: warning: objtool: match_held_lock()+0x11f: call to arch_local_save_flags() leaves .noinstr.text section vmlinux.o: warning: objtool: lock_is_held_type()+0x4e: call to arch_local_irq_save() leaves .noinstr.text section vmlinux.o: warning: objtool: lock_is_held_type()+0x65: call to arch_local_irq_disable() leaves .noinstr.text section vmlinux.o: warning: objtool: lock_is_held_type()+0xfe: call to arch_local_irq_enable() leaves .noinstr.text section It makes no sense to not inline these things. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Juergen Gross Link: https://lore.kernel.org/r/20210624095148.373073648@infradead.org --- arch/x86/include/asm/paravirt.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index da3a1ac82be5..89a53227f210 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -678,23 +678,23 @@ bool __raw_callee_save___native_vcpu_is_preempted(long cpu); ((struct paravirt_callee_save) { func }) #ifdef CONFIG_PARAVIRT_XXL -static inline notrace unsigned long arch_local_save_flags(void) +static __always_inline unsigned long arch_local_save_flags(void) { return PVOP_ALT_CALLEE0(unsigned long, irq.save_fl, "pushf; pop %%rax;", ALT_NOT(X86_FEATURE_XENPV)); } -static inline notrace void arch_local_irq_disable(void) +static __always_inline void arch_local_irq_disable(void) { PVOP_ALT_VCALLEE0(irq.irq_disable, "cli;", ALT_NOT(X86_FEATURE_XENPV)); } -static inline notrace void arch_local_irq_enable(void) +static __always_inline void arch_local_irq_enable(void) { PVOP_ALT_VCALLEE0(irq.irq_enable, "sti;", ALT_NOT(X86_FEATURE_XENPV)); } -static inline notrace unsigned long arch_local_irq_save(void) +static __always_inline unsigned long arch_local_irq_save(void) { unsigned long f; From eac46b323b28215ad19d53390737df4aa336ac14 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 24 Jun 2021 11:41:12 +0200 Subject: [PATCH 14/41] x86/paravirt: Use PVOP_* for paravirt calls Doing unconditional indirect calls through the pv_ops vector is weird. 
Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Juergen Gross Link: https://lore.kernel.org/r/20210624095148.437720419@infradead.org --- arch/x86/include/asm/paravirt.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 89a53227f210..a13a9a346539 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -52,11 +52,11 @@ void __init paravirt_set_cap(void); /* The paravirtualized I/O functions */ static inline void slow_down_io(void) { - pv_ops.cpu.io_delay(); + PVOP_VCALL0(cpu.io_delay); #ifdef REALLY_SLOW_IO - pv_ops.cpu.io_delay(); - pv_ops.cpu.io_delay(); - pv_ops.cpu.io_delay(); + PVOP_VCALL0(cpu.io_delay); + PVOP_VCALL0(cpu.io_delay); + PVOP_VCALL0(cpu.io_delay); #endif } From 0a53c9acf4da51a75392b0b543ce5eaae78a567f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 24 Jun 2021 11:41:13 +0200 Subject: [PATCH 15/41] x86/xen: Make read_cr2() noinstr vmlinux.o: warning: objtool: pv_ops[41]: native_read_cr2 vmlinux.o: warning: objtool: pv_ops[41]: xen_read_cr2 vmlinux.o: warning: objtool: pv_ops[41]: xen_read_cr2_direct vmlinux.o: warning: objtool: exc_double_fault()+0x15: call to pv_ops[41]() leaves .noinstr.text section Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Juergen Gross Link: https://lore.kernel.org/r/20210624095148.500331616@infradead.org --- arch/x86/include/asm/paravirt.h | 2 +- arch/x86/kernel/paravirt.c | 7 ++++++- arch/x86/xen/xen-asm.S | 2 ++ 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index a13a9a346539..8878065107a2 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -133,7 +133,7 @@ static inline void write_cr0(unsigned long x) PVOP_VCALL1(cpu.write_cr0, x); } -static inline unsigned long read_cr2(void) +static __always_inline unsigned long read_cr2(void) { return PVOP_ALT_CALLEE0(unsigned long, mmu.read_cr2, "mov %%cr2, %%rax;", diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 04cafc057bed..e351014fd62e 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -218,6 +218,11 @@ void paravirt_end_context_switch(struct task_struct *next) if (test_and_clear_ti_thread_flag(task_thread_info(next), TIF_LAZY_MMU_UPDATES)) arch_enter_lazy_mmu_mode(); } + +static noinstr unsigned long pv_native_read_cr2(void) +{ + return native_read_cr2(); +} #endif enum paravirt_lazy_mode paravirt_get_lazy_mode(void) @@ -298,7 +303,7 @@ struct paravirt_patch_template pv_ops = { .mmu.exit_mmap = paravirt_nop, #ifdef CONFIG_PARAVIRT_XXL - .mmu.read_cr2 = __PV_IS_CALLEE_SAVE(native_read_cr2), + .mmu.read_cr2 = __PV_IS_CALLEE_SAVE(pv_native_read_cr2), .mmu.write_cr2 = native_write_cr2, .mmu.read_cr3 = __native_read_cr3, .mmu.write_cr3 = native_write_cr3, diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S index 1e626444712b..aef4a1e8f33f 100644 --- a/arch/x86/xen/xen-asm.S +++ b/arch/x86/xen/xen-asm.S @@ -102,6 +102,7 @@ SYM_FUNC_START(check_events) ret SYM_FUNC_END(check_events) +.pushsection .noinstr.text, "ax" SYM_FUNC_START(xen_read_cr2) FRAME_BEGIN _ASM_MOV PER_CPU_VAR(xen_vcpu), %_ASM_AX @@ -116,6 +117,7 @@ SYM_FUNC_START(xen_read_cr2_direct) FRAME_END ret SYM_FUNC_END(xen_read_cr2_direct); +.popsection .macro xen_pv_trap name SYM_CODE_START(xen_\name) From 209cfd0cbb6722d3461e4f928dc150e4c3811948 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 24 Jun 2021 11:41:14 +0200 
Subject: [PATCH 16/41] x86/xen: Make write_cr2() noinstr vmlinux.o: warning: objtool: pv_ops[42]: native_write_cr2 vmlinux.o: warning: objtool: pv_ops[42]: xen_write_cr2 vmlinux.o: warning: objtool: exc_nmi()+0x127: call to pv_ops[42]() leaves .noinstr.text section Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Juergen Gross Link: https://lore.kernel.org/r/20210624095148.563524913@infradead.org --- arch/x86/include/asm/paravirt.h | 2 +- arch/x86/kernel/paravirt.c | 7 ++++++- arch/x86/xen/mmu_pv.c | 3 ++- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 8878065107a2..be82b5217958 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -140,7 +140,7 @@ static __always_inline unsigned long read_cr2(void) ALT_NOT(X86_FEATURE_XENPV)); } -static inline void write_cr2(unsigned long x) +static __always_inline void write_cr2(unsigned long x) { PVOP_VCALL1(mmu.write_cr2, x); } diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index e351014fd62e..fc2cf2b6cdba 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -223,6 +223,11 @@ static noinstr unsigned long pv_native_read_cr2(void) { return native_read_cr2(); } + +static noinstr void pv_native_write_cr2(unsigned long val) +{ + native_write_cr2(val); +} #endif enum paravirt_lazy_mode paravirt_get_lazy_mode(void) @@ -304,7 +309,7 @@ struct paravirt_patch_template pv_ops = { #ifdef CONFIG_PARAVIRT_XXL .mmu.read_cr2 = __PV_IS_CALLEE_SAVE(pv_native_read_cr2), - .mmu.write_cr2 = native_write_cr2, + .mmu.write_cr2 = pv_native_write_cr2, .mmu.read_cr3 = __native_read_cr3, .mmu.write_cr3 = native_write_cr3, diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index 1df5f01529e5..f3cafe56fbe5 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -1204,7 +1204,8 @@ static void __init xen_pagetable_init(void) xen_remap_memory(); xen_setup_mfn_list_list(); } -static void xen_write_cr2(unsigned long cr2) + +static noinstr void xen_write_cr2(unsigned long cr2) { this_cpu_read(xen_vcpu)->arch.cr2 = cr2; } From f4afb713e5c3a4419ba7aaecc31a8c8bd91d13fb Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 24 Jun 2021 11:41:15 +0200 Subject: [PATCH 17/41] x86/xen: Make get_debugreg() noinstr vmlinux.o: warning: objtool: pv_ops[1]: xen_get_debugreg vmlinux.o: warning: objtool: pv_ops[1]: native_get_debugreg vmlinux.o: warning: objtool: exc_debug()+0x25: call to pv_ops[1]() leaves .noinstr.text section Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Juergen Gross Link: https://lore.kernel.org/r/20210624095148.625523645@infradead.org --- arch/x86/include/asm/paravirt.h | 2 +- arch/x86/include/asm/xen/hypercall.h | 2 +- arch/x86/kernel/paravirt.c | 8 ++++++-- arch/x86/xen/enlighten_pv.c | 2 +- 4 files changed, 9 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index be82b5217958..f48465c66c31 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -113,7 +113,7 @@ static inline void __cpuid(unsigned int *eax, unsigned int *ebx, /* * These special macros can be used to get or set a debugging register */ -static inline unsigned long paravirt_get_debugreg(int reg) +static __always_inline unsigned long paravirt_get_debugreg(int reg) { return PVOP_CALL1(unsigned long, cpu.get_debugreg, reg); } diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index 
454b20815f35..af9220258d82 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -314,7 +314,7 @@ HYPERVISOR_set_debugreg(int reg, unsigned long value) return _hypercall2(int, set_debugreg, reg, value); } -static inline unsigned long +static __always_inline unsigned long HYPERVISOR_get_debugreg(int reg) { return _hypercall1(unsigned long, get_debugreg, reg); diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index fc2cf2b6cdba..8af526c41b34 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -228,6 +228,11 @@ static noinstr void pv_native_write_cr2(unsigned long val) { native_write_cr2(val); } + +static noinstr unsigned long pv_native_get_debugreg(int regno) +{ + return native_get_debugreg(regno); +} #endif enum paravirt_lazy_mode paravirt_get_lazy_mode(void) @@ -254,7 +259,7 @@ struct paravirt_patch_template pv_ops = { #ifdef CONFIG_PARAVIRT_XXL .cpu.cpuid = native_cpuid, - .cpu.get_debugreg = native_get_debugreg, + .cpu.get_debugreg = pv_native_get_debugreg, .cpu.set_debugreg = native_set_debugreg, .cpu.read_cr0 = native_read_cr0, .cpu.write_cr0 = native_write_cr0, @@ -382,7 +387,6 @@ struct paravirt_patch_template pv_ops = { #ifdef CONFIG_PARAVIRT_XXL /* At this point, native_get/set_debugreg has real function entries */ -NOKPROBE_SYMBOL(native_get_debugreg); NOKPROBE_SYMBOL(native_set_debugreg); NOKPROBE_SYMBOL(native_load_idt); diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 753f63734c13..273e1fa8537c 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -316,7 +316,7 @@ static void xen_set_debugreg(int reg, unsigned long val) HYPERVISOR_set_debugreg(reg, val); } -static unsigned long xen_get_debugreg(int reg) +static noinstr unsigned long xen_get_debugreg(int reg) { return HYPERVISOR_get_debugreg(reg); } From 7361fac0465ba96ec8f7559459e3c70818ba6c78 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 24 Jun 2021 11:41:16 +0200 Subject: [PATCH 18/41] x86/xen: Make set_debugreg() noinstr vmlinux.o: warning: objtool: pv_ops[2]: xen_set_debugreg vmlinux.o: warning: objtool: pv_ops[2]: native_set_debugreg vmlinux.o: warning: objtool: exc_debug()+0x3b: call to pv_ops[2]() leaves .noinstr.text section Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Juergen Gross Link: https://lore.kernel.org/r/20210624095148.687755639@infradead.org --- arch/x86/include/asm/paravirt.h | 2 +- arch/x86/include/asm/xen/hypercall.h | 2 +- arch/x86/kernel/paravirt.c | 9 ++++++--- arch/x86/xen/enlighten_pv.c | 2 +- 4 files changed, 9 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index f48465c66c31..34da790ac429 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -118,7 +118,7 @@ static __always_inline unsigned long paravirt_get_debugreg(int reg) return PVOP_CALL1(unsigned long, cpu.get_debugreg, reg); } #define get_debugreg(var, reg) var = paravirt_get_debugreg(reg) -static inline void set_debugreg(unsigned long val, int reg) +static __always_inline void set_debugreg(unsigned long val, int reg) { PVOP_VCALL2(cpu.set_debugreg, reg, val); } diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index af9220258d82..990b8aa179c8 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -308,7 +308,7 @@ HYPERVISOR_platform_op(struct xen_platform_op *op) return _hypercall1(int, platform_op, op); } -static inline 
int +static __always_inline int HYPERVISOR_set_debugreg(int reg, unsigned long value) { return _hypercall2(int, set_debugreg, reg, value); diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 8af526c41b34..cdaf8624ea1b 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -233,6 +233,11 @@ static noinstr unsigned long pv_native_get_debugreg(int regno) { return native_get_debugreg(regno); } + +static noinstr void pv_native_set_debugreg(int regno, unsigned long val) +{ + native_set_debugreg(regno, val); +} #endif enum paravirt_lazy_mode paravirt_get_lazy_mode(void) @@ -260,7 +265,7 @@ struct paravirt_patch_template pv_ops = { #ifdef CONFIG_PARAVIRT_XXL .cpu.cpuid = native_cpuid, .cpu.get_debugreg = pv_native_get_debugreg, - .cpu.set_debugreg = native_set_debugreg, + .cpu.set_debugreg = pv_native_set_debugreg, .cpu.read_cr0 = native_read_cr0, .cpu.write_cr0 = native_write_cr0, .cpu.write_cr4 = native_write_cr4, @@ -386,8 +391,6 @@ struct paravirt_patch_template pv_ops = { }; #ifdef CONFIG_PARAVIRT_XXL -/* At this point, native_get/set_debugreg has real function entries */ -NOKPROBE_SYMBOL(native_set_debugreg); NOKPROBE_SYMBOL(native_load_idt); void (*paravirt_iret)(void) = native_iret; diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 273e1fa8537c..2b1a8ba71629 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -311,7 +311,7 @@ static void __init xen_init_capabilities(void) } } -static void xen_set_debugreg(int reg, unsigned long val) +static noinstr void xen_set_debugreg(int reg, unsigned long val) { HYPERVISOR_set_debugreg(reg, val); } From 20125c872a3f129cef7fdec2b7681da98502a55d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 24 Jun 2021 11:41:17 +0200 Subject: [PATCH 19/41] x86/xen: Make save_fl() noinstr vmlinux.o: warning: objtool: pv_ops[30]: native_save_fl vmlinux.o: warning: objtool: pv_ops[30]: __raw_callee_save_xen_save_fl vmlinux.o: warning: objtool: pv_ops[30]: xen_save_fl_direct vmlinux.o: warning: objtool: lockdep_hardirqs_off()+0x73: call to pv_ops[30]() leaves .noinstr.text section Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Juergen Gross Link: https://lore.kernel.org/r/20210624095148.749712274@infradead.org --- arch/x86/include/asm/paravirt.h | 7 +++++-- arch/x86/kernel/irqflags.S | 2 ++ arch/x86/xen/irq.c | 4 ++-- arch/x86/xen/xen-asm.S | 32 ++++++++++++++++---------------- 4 files changed, 25 insertions(+), 20 deletions(-) diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 34da790ac429..cebec95a7124 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -653,10 +653,10 @@ bool __raw_callee_save___native_vcpu_is_preempted(long cpu); * functions. 
*/ #define PV_THUNK_NAME(func) "__raw_callee_save_" #func -#define PV_CALLEE_SAVE_REGS_THUNK(func) \ +#define __PV_CALLEE_SAVE_REGS_THUNK(func, section) \ extern typeof(func) __raw_callee_save_##func; \ \ - asm(".pushsection .text;" \ + asm(".pushsection " section ", \"ax\";" \ ".globl " PV_THUNK_NAME(func) ";" \ ".type " PV_THUNK_NAME(func) ", @function;" \ PV_THUNK_NAME(func) ":" \ @@ -669,6 +669,9 @@ bool __raw_callee_save___native_vcpu_is_preempted(long cpu); ".size " PV_THUNK_NAME(func) ", .-" PV_THUNK_NAME(func) ";" \ ".popsection") +#define PV_CALLEE_SAVE_REGS_THUNK(func) \ + __PV_CALLEE_SAVE_REGS_THUNK(func, ".text") + /* Get a reference to a callee-save function */ #define PV_CALLEE_SAVE(func) \ ((struct paravirt_callee_save) { __raw_callee_save_##func }) diff --git a/arch/x86/kernel/irqflags.S b/arch/x86/kernel/irqflags.S index 8ef35063964b..760e1f293093 100644 --- a/arch/x86/kernel/irqflags.S +++ b/arch/x86/kernel/irqflags.S @@ -7,9 +7,11 @@ /* * unsigned long native_save_fl(void) */ +.pushsection .noinstr.text, "ax" SYM_FUNC_START(native_save_fl) pushf pop %_ASM_AX ret SYM_FUNC_END(native_save_fl) +.popsection EXPORT_SYMBOL(native_save_fl) diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c index dfa091d79c2e..9c71f43ba303 100644 --- a/arch/x86/xen/irq.c +++ b/arch/x86/xen/irq.c @@ -24,7 +24,7 @@ void xen_force_evtchn_callback(void) (void)HYPERVISOR_xen_version(0, NULL); } -asmlinkage __visible unsigned long xen_save_fl(void) +asmlinkage __visible noinstr unsigned long xen_save_fl(void) { struct vcpu_info *vcpu; unsigned long flags; @@ -40,7 +40,7 @@ asmlinkage __visible unsigned long xen_save_fl(void) */ return (-flags) & X86_EFLAGS_IF; } -PV_CALLEE_SAVE_REGS_THUNK(xen_save_fl); +__PV_CALLEE_SAVE_REGS_THUNK(xen_save_fl, ".noinstr.text"); asmlinkage __visible void xen_irq_disable(void) { diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S index aef4a1e8f33f..0883e39fee2e 100644 --- a/arch/x86/xen/xen-asm.S +++ b/arch/x86/xen/xen-asm.S @@ -57,22 +57,6 @@ SYM_FUNC_START(xen_irq_disable_direct) ret SYM_FUNC_END(xen_irq_disable_direct) -/* - * (xen_)save_fl is used to get the current interrupt enable status. - * Callers expect the status to be in X86_EFLAGS_IF, and other bits - * may be set in the return value. We take advantage of this by - * making sure that X86_EFLAGS_IF has the right value (and other bits - * in that byte are 0), but other bits in the return value are - * undefined. We need to toggle the state of the bit, because Xen and - * x86 use opposite senses (mask vs enable). - */ -SYM_FUNC_START(xen_save_fl_direct) - testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask - setz %ah - addb %ah, %ah - ret -SYM_FUNC_END(xen_save_fl_direct) - /* * Force an event check by making a hypercall, but preserve regs * before making the call. @@ -103,6 +87,22 @@ SYM_FUNC_START(check_events) SYM_FUNC_END(check_events) .pushsection .noinstr.text, "ax" +/* + * (xen_)save_fl is used to get the current interrupt enable status. + * Callers expect the status to be in X86_EFLAGS_IF, and other bits + * may be set in the return value. We take advantage of this by + * making sure that X86_EFLAGS_IF has the right value (and other bits + * in that byte are 0), but other bits in the return value are + * undefined. We need to toggle the state of the bit, because Xen and + * x86 use opposite senses (mask vs enable). 
+ */ +SYM_FUNC_START(xen_save_fl_direct) + testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask + setz %ah + addb %ah, %ah + ret +SYM_FUNC_END(xen_save_fl_direct) + SYM_FUNC_START(xen_read_cr2) FRAME_BEGIN _ASM_MOV PER_CPU_VAR(xen_vcpu), %_ASM_AX From 74ea805b79d2b6eb472daa2540ed35ccb4ed23e7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 24 Jun 2021 11:41:18 +0200 Subject: [PATCH 20/41] x86/xen: Make hypercall_page noinstr vmlinux.o: warning: objtool: xen_set_debugreg()+0x3: call to hypercall_page() leaves .noinstr.text section vmlinux.o: warning: objtool: xen_get_debugreg()+0x3: call to hypercall_page() leaves .noinstr.text section vmlinux.o: warning: objtool: xen_irq_enable()+0x24: call to hypercall_page() leaves .noinstr.text section Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Juergen Gross Link: https://lore.kernel.org/r/20210624095148.810950584@infradead.org --- arch/x86/xen/xen-head.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index 488944d6d430..9e27b86a0c31 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -20,7 +20,7 @@ #include #include -.pushsection .text +.pushsection .noinstr.text, "ax" .balign PAGE_SIZE SYM_CODE_START(hypercall_page) .rept (PAGE_SIZE / 32) From d7bfc7d57cbe13382fd3eb739667fd0e2f74122b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 24 Jun 2021 11:41:19 +0200 Subject: [PATCH 21/41] x86/xen: Make irq_enable() noinstr vmlinux.o: warning: objtool: pv_ops[32]: native_irq_enable vmlinux.o: warning: objtool: pv_ops[32]: __raw_callee_save_xen_irq_enable vmlinux.o: warning: objtool: pv_ops[32]: xen_irq_enable_direct vmlinux.o: warning: objtool: lock_is_held_type()+0xfe: call to pv_ops[32]() leaves .noinstr.text section Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Juergen Gross Link: https://lore.kernel.org/r/20210624095148.872254932@infradead.org --- arch/x86/kernel/paravirt.c | 7 ++++- arch/x86/xen/irq.c | 4 +-- arch/x86/xen/xen-asm.S | 56 +++++++++++++++++++------------------- 3 files changed, 36 insertions(+), 31 deletions(-) diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index cdaf8624ea1b..75f0d241752b 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -238,6 +238,11 @@ static noinstr void pv_native_set_debugreg(int regno, unsigned long val) { native_set_debugreg(regno, val); } + +static noinstr void pv_native_irq_enable(void) +{ + native_irq_enable(); +} #endif enum paravirt_lazy_mode paravirt_get_lazy_mode(void) @@ -302,7 +307,7 @@ struct paravirt_patch_template pv_ops = { /* Irq ops. 
*/ .irq.save_fl = __PV_IS_CALLEE_SAVE(native_save_fl), .irq.irq_disable = __PV_IS_CALLEE_SAVE(native_irq_disable), - .irq.irq_enable = __PV_IS_CALLEE_SAVE(native_irq_enable), + .irq.irq_enable = __PV_IS_CALLEE_SAVE(pv_native_irq_enable), .irq.safe_halt = native_safe_halt, .irq.halt = native_halt, #endif /* CONFIG_PARAVIRT_XXL */ diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c index 9c71f43ba303..7fb4cf28879e 100644 --- a/arch/x86/xen/irq.c +++ b/arch/x86/xen/irq.c @@ -53,7 +53,7 @@ asmlinkage __visible void xen_irq_disable(void) } PV_CALLEE_SAVE_REGS_THUNK(xen_irq_disable); -asmlinkage __visible void xen_irq_enable(void) +asmlinkage __visible noinstr void xen_irq_enable(void) { struct vcpu_info *vcpu; @@ -76,7 +76,7 @@ asmlinkage __visible void xen_irq_enable(void) preempt_enable(); } -PV_CALLEE_SAVE_REGS_THUNK(xen_irq_enable); +__PV_CALLEE_SAVE_REGS_THUNK(xen_irq_enable, ".noinstr.text"); static void xen_safe_halt(void) { diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S index 0883e39fee2e..222519528890 100644 --- a/arch/x86/xen/xen-asm.S +++ b/arch/x86/xen/xen-asm.S @@ -21,33 +21,6 @@ #include #include -/* - * Enable events. This clears the event mask and tests the pending - * event status with one and operation. If there are pending events, - * then enter the hypervisor to get them handled. - */ -SYM_FUNC_START(xen_irq_enable_direct) - FRAME_BEGIN - /* Unmask events */ - movb $0, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask - - /* - * Preempt here doesn't matter because that will deal with any - * pending interrupts. The pending check may end up being run - * on the wrong CPU, but that doesn't hurt. - */ - - /* Test for pending */ - testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending - jz 1f - - call check_events -1: - FRAME_END - ret -SYM_FUNC_END(xen_irq_enable_direct) - - /* * Disabling events is simply a matter of making the event mask * non-zero. @@ -57,6 +30,8 @@ SYM_FUNC_START(xen_irq_disable_direct) ret SYM_FUNC_END(xen_irq_disable_direct) +.pushsection .noinstr.text, "ax" + /* * Force an event check by making a hypercall, but preserve regs * before making the call. @@ -86,7 +61,32 @@ SYM_FUNC_START(check_events) ret SYM_FUNC_END(check_events) -.pushsection .noinstr.text, "ax" +/* + * Enable events. This clears the event mask and tests the pending + * event status with one and operation. If there are pending events, + * then enter the hypervisor to get them handled. + */ +SYM_FUNC_START(xen_irq_enable_direct) + FRAME_BEGIN + /* Unmask events */ + movb $0, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask + + /* + * Preempt here doesn't matter because that will deal with any + * pending interrupts. The pending check may end up being run + * on the wrong CPU, but that doesn't hurt. + */ + + /* Test for pending */ + testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending + jz 1f + + call check_events +1: + FRAME_END + ret +SYM_FUNC_END(xen_irq_enable_direct) + /* * (xen_)save_fl is used to get the current interrupt enable status. 
* Callers expect the status to be in X86_EFLAGS_IF, and other bits From 09c413071e2de71d1f28813c560ae0c06b344520 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 24 Jun 2021 11:41:20 +0200 Subject: [PATCH 22/41] x86/xen: Make irq_disable() noinstr vmlinux.o: warning: objtool: pv_ops[31]: native_irq_disable vmlinux.o: warning: objtool: pv_ops[31]: __raw_callee_save_xen_irq_disable vmlinux.o: warning: objtool: pv_ops[31]: xen_irq_disable_direct vmlinux.o: warning: objtool: lock_is_held_type()+0x5b: call to pv_ops[31]() leaves .noinstr.text section Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Juergen Gross Link: https://lore.kernel.org/r/20210624095148.933869441@infradead.org --- arch/x86/kernel/paravirt.c | 7 ++++++- arch/x86/xen/irq.c | 4 ++-- arch/x86/xen/xen-asm.S | 3 +-- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 75f0d241752b..ebc45360ffd4 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -243,6 +243,11 @@ static noinstr void pv_native_irq_enable(void) { native_irq_enable(); } + +static noinstr void pv_native_irq_disable(void) +{ + native_irq_disable(); +} #endif enum paravirt_lazy_mode paravirt_get_lazy_mode(void) @@ -306,7 +311,7 @@ struct paravirt_patch_template pv_ops = { /* Irq ops. */ .irq.save_fl = __PV_IS_CALLEE_SAVE(native_save_fl), - .irq.irq_disable = __PV_IS_CALLEE_SAVE(native_irq_disable), + .irq.irq_disable = __PV_IS_CALLEE_SAVE(pv_native_irq_disable), .irq.irq_enable = __PV_IS_CALLEE_SAVE(pv_native_irq_enable), .irq.safe_halt = native_safe_halt, .irq.halt = native_halt, diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c index 7fb4cf28879e..f52b60df4e0c 100644 --- a/arch/x86/xen/irq.c +++ b/arch/x86/xen/irq.c @@ -42,7 +42,7 @@ asmlinkage __visible noinstr unsigned long xen_save_fl(void) } __PV_CALLEE_SAVE_REGS_THUNK(xen_save_fl, ".noinstr.text"); -asmlinkage __visible void xen_irq_disable(void) +asmlinkage __visible noinstr void xen_irq_disable(void) { /* There's a one instruction preempt window here. We need to make sure we're don't switch CPUs between getting the vcpu @@ -51,7 +51,7 @@ asmlinkage __visible void xen_irq_disable(void) this_cpu_read(xen_vcpu)->evtchn_upcall_mask = 1; preempt_enable_no_resched(); } -PV_CALLEE_SAVE_REGS_THUNK(xen_irq_disable); +__PV_CALLEE_SAVE_REGS_THUNK(xen_irq_disable, ".noinstr.text"); asmlinkage __visible noinstr void xen_irq_enable(void) { diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S index 222519528890..220dd9678494 100644 --- a/arch/x86/xen/xen-asm.S +++ b/arch/x86/xen/xen-asm.S @@ -21,6 +21,7 @@ #include #include +.pushsection .noinstr.text, "ax" /* * Disabling events is simply a matter of making the event mask * non-zero. @@ -30,8 +31,6 @@ SYM_FUNC_START(xen_irq_disable_direct) ret SYM_FUNC_END(xen_irq_disable_direct) -.pushsection .noinstr.text, "ax" - /* * Force an event check by making a hypercall, but preserve regs * before making the call. 
From 847d9317b2b9c7ecc14b953e6ecf9c12bcdb42e9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 24 Jun 2021 11:41:21 +0200 Subject: [PATCH 23/41] x86/xen: Mark xen_force_evtchn_callback() noinstr vmlinux.o: warning: objtool: check_events()+0xd: call to xen_force_evtchn_callback() leaves .noinstr.text section Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Juergen Gross Link: https://lore.kernel.org/r/20210624095148.996055323@infradead.org --- arch/x86/include/asm/xen/hypercall.h | 2 +- arch/x86/xen/irq.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index 990b8aa179c8..4a7ff8b0db20 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -358,7 +358,7 @@ HYPERVISOR_event_channel_op(int cmd, void *arg) return _hypercall2(int, event_channel_op, cmd, arg); } -static inline int +static __always_inline int HYPERVISOR_xen_version(int cmd, void *arg) { return _hypercall2(int, xen_version, cmd, arg); } diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c index f52b60df4e0c..2f695b5125f8 100644 --- a/arch/x86/xen/irq.c +++ b/arch/x86/xen/irq.c @@ -19,7 +19,7 @@ * callback mask. We do this in a very simple manner, by making a call * down into Xen. The pending flag will be checked by Xen on return. */ -void xen_force_evtchn_callback(void) +noinstr void xen_force_evtchn_callback(void) { (void)HYPERVISOR_xen_version(0, NULL); } From 1462eb381b4c27576a3e818bc9f918765d327fdf Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 24 Jun 2021 11:41:22 +0200 Subject: [PATCH 24/41] x86/xen: Rework the xen_{cpu,irq,mmu}_ops[] arrays In order to allow objtool to make sense of all the various paravirt functions, it needs to either parse whole pv_ops[] tables, or observe individual assignments in the form: bf87: 48 c7 05 00 00 00 00 00 00 00 00 movq $0x0,0x0(%rip) bf92 bf8a: R_X86_64_PC32 pv_ops+0x268 As is, xen_cpu_ops[] is at offset +0 in pv_ops[] and could thus be parsed as a 'normal' pv_ops[] table, however xen_irq_ops[] and xen_mmu_ops[] are not. Worse, both the latter two are compiled into the individual assignment form by current GCC, but that's not something one can rely on. Therefore, convert all three into full pv_ops[] tables. This has the benefit of not needing to teach objtool about the offsets and resulting in more conservative code-gen.
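[ Editorial illustration, not part of the patch: the conversion pattern wraps each category-specific table in a full pv_ops-typed initializer and copies out only the relevant member at init time, e.g. for the irq ops:

static const typeof(pv_ops) xen_irq_ops __initconst = {
	.irq = {
		.save_fl	= PV_CALLEE_SAVE(xen_save_fl),
		.irq_disable	= PV_CALLEE_SAVE(xen_irq_disable),
		.irq_enable	= PV_CALLEE_SAVE(xen_irq_enable),
		.safe_halt	= xen_safe_halt,
		.halt		= xen_halt,
	},
};

void __init xen_init_irq_ops(void)
{
	/* Only the .irq sub-struct is copied into the global pv_ops. */
	pv_ops.irq = xen_irq_ops.irq;
	x86_init.irqs.intr_init = xen_init_IRQ;
}

Because the whole object now has the layout of pv_ops, objtool can parse it like any other pv_ops[] table. ]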
Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Juergen Gross Link: https://lore.kernel.org/r/20210624095149.057262522@infradead.org --- arch/x86/xen/enlighten_pv.c | 66 ++++++++++++++------------- arch/x86/xen/irq.c | 17 ++++--- arch/x86/xen/mmu_pv.c | 90 +++++++++++++++++++------------------ 3 files changed, 90 insertions(+), 83 deletions(-) diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 2b1a8ba71629..6ed0af756950 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -1050,52 +1050,54 @@ static const struct pv_info xen_info __initconst = { .name = "Xen", }; -static const struct pv_cpu_ops xen_cpu_ops __initconst = { - .cpuid = xen_cpuid, +static const typeof(pv_ops) xen_cpu_ops __initconst = { + .cpu = { + .cpuid = xen_cpuid, - .set_debugreg = xen_set_debugreg, - .get_debugreg = xen_get_debugreg, + .set_debugreg = xen_set_debugreg, + .get_debugreg = xen_get_debugreg, - .read_cr0 = xen_read_cr0, - .write_cr0 = xen_write_cr0, + .read_cr0 = xen_read_cr0, + .write_cr0 = xen_write_cr0, - .write_cr4 = xen_write_cr4, + .write_cr4 = xen_write_cr4, - .wbinvd = native_wbinvd, + .wbinvd = native_wbinvd, - .read_msr = xen_read_msr, - .write_msr = xen_write_msr, + .read_msr = xen_read_msr, + .write_msr = xen_write_msr, - .read_msr_safe = xen_read_msr_safe, - .write_msr_safe = xen_write_msr_safe, + .read_msr_safe = xen_read_msr_safe, + .write_msr_safe = xen_write_msr_safe, - .read_pmc = xen_read_pmc, + .read_pmc = xen_read_pmc, - .load_tr_desc = paravirt_nop, - .set_ldt = xen_set_ldt, - .load_gdt = xen_load_gdt, - .load_idt = xen_load_idt, - .load_tls = xen_load_tls, - .load_gs_index = xen_load_gs_index, + .load_tr_desc = paravirt_nop, + .set_ldt = xen_set_ldt, + .load_gdt = xen_load_gdt, + .load_idt = xen_load_idt, + .load_tls = xen_load_tls, + .load_gs_index = xen_load_gs_index, - .alloc_ldt = xen_alloc_ldt, - .free_ldt = xen_free_ldt, + .alloc_ldt = xen_alloc_ldt, + .free_ldt = xen_free_ldt, - .store_tr = xen_store_tr, + .store_tr = xen_store_tr, - .write_ldt_entry = xen_write_ldt_entry, - .write_gdt_entry = xen_write_gdt_entry, - .write_idt_entry = xen_write_idt_entry, - .load_sp0 = xen_load_sp0, + .write_ldt_entry = xen_write_ldt_entry, + .write_gdt_entry = xen_write_gdt_entry, + .write_idt_entry = xen_write_idt_entry, + .load_sp0 = xen_load_sp0, #ifdef CONFIG_X86_IOPL_IOPERM - .invalidate_io_bitmap = xen_invalidate_io_bitmap, - .update_io_bitmap = xen_update_io_bitmap, + .invalidate_io_bitmap = xen_invalidate_io_bitmap, + .update_io_bitmap = xen_update_io_bitmap, #endif - .io_delay = xen_io_delay, + .io_delay = xen_io_delay, - .start_context_switch = paravirt_start_context_switch, - .end_context_switch = xen_end_context_switch, + .start_context_switch = paravirt_start_context_switch, + .end_context_switch = xen_end_context_switch, + }, }; static void xen_restart(char *msg) @@ -1231,7 +1233,7 @@ asmlinkage __visible void __init xen_start_kernel(void) /* Install Xen paravirt ops */ pv_info = xen_info; - pv_ops.cpu = xen_cpu_ops; + pv_ops.cpu = xen_cpu_ops.cpu; paravirt_iret = xen_iret; xen_init_irq_ops(); diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c index 2f695b5125f8..4fe387e520af 100644 --- a/arch/x86/xen/irq.c +++ b/arch/x86/xen/irq.c @@ -94,17 +94,20 @@ static void xen_halt(void) xen_safe_halt(); } -static const struct pv_irq_ops xen_irq_ops __initconst = { - .save_fl = PV_CALLEE_SAVE(xen_save_fl), - .irq_disable = PV_CALLEE_SAVE(xen_irq_disable), - .irq_enable = PV_CALLEE_SAVE(xen_irq_enable), +static const typeof(pv_ops) 
xen_irq_ops __initconst = { + .irq = { - .safe_halt = xen_safe_halt, - .halt = xen_halt, + .save_fl = PV_CALLEE_SAVE(xen_save_fl), + .irq_disable = PV_CALLEE_SAVE(xen_irq_disable), + .irq_enable = PV_CALLEE_SAVE(xen_irq_enable), + + .safe_halt = xen_safe_halt, + .halt = xen_halt, + }, }; void __init xen_init_irq_ops(void) { - pv_ops.irq = xen_irq_ops; + pv_ops.irq = xen_irq_ops.irq; x86_init.irqs.intr_init = xen_init_IRQ; } diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index f3cafe56fbe5..b9a4f797ec1a 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -2076,67 +2076,69 @@ static void xen_leave_lazy_mmu(void) preempt_enable(); } -static const struct pv_mmu_ops xen_mmu_ops __initconst = { - .read_cr2 = __PV_IS_CALLEE_SAVE(xen_read_cr2), - .write_cr2 = xen_write_cr2, +static const typeof(pv_ops) xen_mmu_ops __initconst = { + .mmu = { + .read_cr2 = __PV_IS_CALLEE_SAVE(xen_read_cr2), + .write_cr2 = xen_write_cr2, - .read_cr3 = xen_read_cr3, - .write_cr3 = xen_write_cr3_init, + .read_cr3 = xen_read_cr3, + .write_cr3 = xen_write_cr3_init, - .flush_tlb_user = xen_flush_tlb, - .flush_tlb_kernel = xen_flush_tlb, - .flush_tlb_one_user = xen_flush_tlb_one_user, - .flush_tlb_multi = xen_flush_tlb_multi, - .tlb_remove_table = tlb_remove_table, + .flush_tlb_user = xen_flush_tlb, + .flush_tlb_kernel = xen_flush_tlb, + .flush_tlb_one_user = xen_flush_tlb_one_user, + .flush_tlb_multi = xen_flush_tlb_multi, + .tlb_remove_table = tlb_remove_table, - .pgd_alloc = xen_pgd_alloc, - .pgd_free = xen_pgd_free, + .pgd_alloc = xen_pgd_alloc, + .pgd_free = xen_pgd_free, - .alloc_pte = xen_alloc_pte_init, - .release_pte = xen_release_pte_init, - .alloc_pmd = xen_alloc_pmd_init, - .release_pmd = xen_release_pmd_init, + .alloc_pte = xen_alloc_pte_init, + .release_pte = xen_release_pte_init, + .alloc_pmd = xen_alloc_pmd_init, + .release_pmd = xen_release_pmd_init, - .set_pte = xen_set_pte_init, - .set_pmd = xen_set_pmd_hyper, + .set_pte = xen_set_pte_init, + .set_pmd = xen_set_pmd_hyper, - .ptep_modify_prot_start = xen_ptep_modify_prot_start, - .ptep_modify_prot_commit = xen_ptep_modify_prot_commit, + .ptep_modify_prot_start = xen_ptep_modify_prot_start, + .ptep_modify_prot_commit = xen_ptep_modify_prot_commit, - .pte_val = PV_CALLEE_SAVE(xen_pte_val), - .pgd_val = PV_CALLEE_SAVE(xen_pgd_val), + .pte_val = PV_CALLEE_SAVE(xen_pte_val), + .pgd_val = PV_CALLEE_SAVE(xen_pgd_val), - .make_pte = PV_CALLEE_SAVE(xen_make_pte_init), - .make_pgd = PV_CALLEE_SAVE(xen_make_pgd), + .make_pte = PV_CALLEE_SAVE(xen_make_pte_init), + .make_pgd = PV_CALLEE_SAVE(xen_make_pgd), - .set_pud = xen_set_pud_hyper, + .set_pud = xen_set_pud_hyper, - .make_pmd = PV_CALLEE_SAVE(xen_make_pmd), - .pmd_val = PV_CALLEE_SAVE(xen_pmd_val), + .make_pmd = PV_CALLEE_SAVE(xen_make_pmd), + .pmd_val = PV_CALLEE_SAVE(xen_pmd_val), - .pud_val = PV_CALLEE_SAVE(xen_pud_val), - .make_pud = PV_CALLEE_SAVE(xen_make_pud), - .set_p4d = xen_set_p4d_hyper, + .pud_val = PV_CALLEE_SAVE(xen_pud_val), + .make_pud = PV_CALLEE_SAVE(xen_make_pud), + .set_p4d = xen_set_p4d_hyper, - .alloc_pud = xen_alloc_pmd_init, - .release_pud = xen_release_pmd_init, + .alloc_pud = xen_alloc_pmd_init, + .release_pud = xen_release_pmd_init, #if CONFIG_PGTABLE_LEVELS >= 5 - .p4d_val = PV_CALLEE_SAVE(xen_p4d_val), - .make_p4d = PV_CALLEE_SAVE(xen_make_p4d), + .p4d_val = PV_CALLEE_SAVE(xen_p4d_val), + .make_p4d = PV_CALLEE_SAVE(xen_make_p4d), #endif - .activate_mm = xen_activate_mm, - .dup_mmap = xen_dup_mmap, - .exit_mmap = xen_exit_mmap, + .activate_mm = 
xen_activate_mm, + .dup_mmap = xen_dup_mmap, + .exit_mmap = xen_exit_mmap, - .lazy_mode = { - .enter = paravirt_enter_lazy_mmu, - .leave = xen_leave_lazy_mmu, - .flush = paravirt_flush_lazy_mmu, + .lazy_mode = { + .enter = paravirt_enter_lazy_mmu, + .leave = xen_leave_lazy_mmu, + .flush = paravirt_flush_lazy_mmu, + }, + + .set_fixmap = xen_set_fixmap, }, - - .set_fixmap = xen_set_fixmap, }; void __init xen_init_mmu_ops(void) @@ -2144,7 +2146,7 @@ void __init xen_init_mmu_ops(void) x86_init.paging.pagetable_init = xen_pagetable_init; x86_init.hyper.init_after_bootmem = xen_after_bootmem; - pv_ops.mmu = xen_mmu_ops; + pv_ops.mmu = xen_mmu_ops.mmu; memset(dummy_mapping, 0xff, PAGE_SIZE); } From db2b0c5d7b6f19b3c2cab08c531b65342eb5252b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 24 Jun 2021 11:41:23 +0200 Subject: [PATCH 25/41] objtool: Support pv_ops[] indirect calls for noinstr Normally objtool will not follow indirect calls; there is no need. However, this becomes a problem with noinstr validation; if there's an indirect call from noinstr code, we very much need to know it is to another noinstr function. Luckily there aren't many indirect calls in entry code with the obvious exception of paravirt. As such, noinstr validation didn't work with paravirt kernels. In order to track pv_ops[] call targets, objtool reads the static pv_ops[] tables as well as direct assignments to the pv_ops[] array, provided the compiler makes them a single instruction like: bf87: 48 c7 05 00 00 00 00 00 00 00 00 movq $0x0,0x0(%rip) bf92 bf8a: R_X86_64_PC32 pv_ops+0x268 There are, as of yet, no warnings for when this goes wrong :/ Using the functions found with the above means, all pv_ops[] calls are now subject to noinstr validation. Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20210624095149.118815755@infradead.org --- lib/Kconfig.debug | 2 +- tools/objtool/arch/x86/decode.c | 34 +++++- tools/objtool/check.c | 151 ++++++++++++++++++++++-- tools/objtool/include/objtool/arch.h | 2 +- tools/objtool/include/objtool/elf.h | 1 + tools/objtool/include/objtool/objtool.h | 9 ++ tools/objtool/objtool.c | 22 ++++ 7 files changed, 208 insertions(+), 13 deletions(-) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index ed4a31e34098..63a4735cef66 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -458,7 +458,7 @@ config STACK_VALIDATION config VMLINUX_VALIDATION bool - depends on STACK_VALIDATION && DEBUG_ENTRY && !PARAVIRT + depends on STACK_VALIDATION && DEBUG_ENTRY default y config VMLINUX_MAP diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c index 340a3dce94a0..3172983bf808 100644 --- a/tools/objtool/arch/x86/decode.c +++ b/tools/objtool/arch/x86/decode.c @@ -20,6 +20,7 @@ #include #include #include +#include #include static int is_x86_64(const struct elf *elf) @@ -102,12 +103,13 @@ unsigned long arch_jump_destination(struct instruction *insn) #define rm_is_mem(reg) (mod_is_mem() && !is_RIP() && rm_is(reg)) #define rm_is_reg(reg) (mod_is_reg() && modrm_rm == (reg)) -int arch_decode_instruction(const struct elf *elf, const struct section *sec, +int arch_decode_instruction(struct objtool_file *file, const struct section *sec, unsigned long offset, unsigned int maxlen, unsigned int *len, enum insn_type *type, unsigned long *immediate, struct list_head *ops_list) { + const struct elf *elf = file->elf; struct insn insn; int x86_64, ret; unsigned char op1, op2, @@ -544,6 +546,36 @@ int arch_decode_instruction(const struct elf *elf, const struct section 
*sec, *type = INSN_RETURN; break; + case 0xc7: /* mov imm, r/m */ + if (!noinstr) + break; + + if (insn.length == 3+4+4 && !strncmp(sec->name, ".init.text", 10)) { + struct reloc *immr, *disp; + struct symbol *func; + int idx; + + immr = find_reloc_by_dest(elf, (void *)sec, offset+3); + disp = find_reloc_by_dest(elf, (void *)sec, offset+7); + + if (!immr || strcmp(immr->sym->name, "pv_ops")) + break; + + idx = (immr->addend + 8) / sizeof(void *); + + func = disp->sym; + if (disp->sym->type == STT_SECTION) + func = find_symbol_by_offset(disp->sym->sec, disp->addend); + if (!func) { + WARN("no func for pv_ops[]"); + return -1; + } + + objtool_pv_add(file, idx, func); + } + + break; + case 0xcf: /* iret */ /* * Handle sync_core(), which has an IRET to self. diff --git a/tools/objtool/check.c b/tools/objtool/check.c index c6f206fee8ff..84e59a97bab6 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -382,7 +382,7 @@ static int decode_instructions(struct objtool_file *file) insn->sec = sec; insn->offset = offset; - ret = arch_decode_instruction(file->elf, sec, offset, + ret = arch_decode_instruction(file, sec, offset, sec->len - offset, &insn->len, &insn->type, &insn->immediate, @@ -420,6 +420,82 @@ err: return ret; } +/* + * Read the pv_ops[] .data table to find the static initialized values. + */ +static int add_pv_ops(struct objtool_file *file, const char *symname) +{ + struct symbol *sym, *func; + unsigned long off, end; + struct reloc *rel; + int idx; + + sym = find_symbol_by_name(file->elf, symname); + if (!sym) + return 0; + + off = sym->offset; + end = off + sym->len; + for (;;) { + rel = find_reloc_by_dest_range(file->elf, sym->sec, off, end - off); + if (!rel) + break; + + func = rel->sym; + if (func->type == STT_SECTION) + func = find_symbol_by_offset(rel->sym->sec, rel->addend); + + idx = (rel->offset - sym->offset) / sizeof(unsigned long); + + objtool_pv_add(file, idx, func); + + off = rel->offset + 1; + if (off > end) + break; + } + + return 0; +} + +/* + * Allocate and initialize file->pv_ops[]. 
+ */ +static int init_pv_ops(struct objtool_file *file) +{ + static const char *pv_ops_tables[] = { + "pv_ops", + "xen_cpu_ops", + "xen_irq_ops", + "xen_mmu_ops", + NULL, + }; + const char *pv_ops; + struct symbol *sym; + int idx, nr; + + if (!noinstr) + return 0; + + file->pv_ops = NULL; + + sym = find_symbol_by_name(file->elf, "pv_ops"); + if (!sym) + return 0; + + nr = sym->len / sizeof(unsigned long); + file->pv_ops = calloc(sizeof(struct pv_state), nr); + if (!file->pv_ops) + return -1; + + for (idx = 0; idx < nr; idx++) + INIT_LIST_HEAD(&file->pv_ops[idx].targets); + + for (idx = 0; (pv_ops = pv_ops_tables[idx]); idx++) + add_pv_ops(file, pv_ops); + + return 0; +} + static struct instruction *find_last_insn(struct objtool_file *file, struct section *sec) { @@ -893,6 +969,9 @@ static struct reloc *insn_reloc(struct objtool_file *file, struct instruction *i return NULL; if (!insn->reloc) { + if (!file) + return NULL; + insn->reloc = find_reloc_by_dest_range(file->elf, insn->sec, insn->offset, insn->len); if (!insn->reloc) { @@ -1882,6 +1961,10 @@ static int decode_sections(struct objtool_file *file) mark_rodata(file); + ret = init_pv_ops(file); + if (ret) + return ret; + ret = decode_instructions(file); if (ret) return ret; @@ -2663,20 +2746,64 @@ static inline bool func_uaccess_safe(struct symbol *func) static inline const char *call_dest_name(struct instruction *insn) { + static char pvname[16]; + struct reloc *rel; + int idx; + if (insn->call_dest) return insn->call_dest->name; + rel = insn_reloc(NULL, insn); + if (rel && !strcmp(rel->sym->name, "pv_ops")) { + idx = (rel->addend / sizeof(void *)); + snprintf(pvname, sizeof(pvname), "pv_ops[%d]", idx); + return pvname; + } + return "{dynamic}"; } -static inline bool noinstr_call_dest(struct symbol *func) +static bool pv_call_dest(struct objtool_file *file, struct instruction *insn) +{ + struct symbol *target; + struct reloc *rel; + int idx; + + rel = insn_reloc(file, insn); + if (!rel || strcmp(rel->sym->name, "pv_ops")) + return false; + + idx = (arch_dest_reloc_offset(rel->addend) / sizeof(void *)); + + if (file->pv_ops[idx].clean) + return true; + + file->pv_ops[idx].clean = true; + + list_for_each_entry(target, &file->pv_ops[idx].targets, pv_target) { + if (!target->sec->noinstr) { + WARN("pv_ops[%d]: %s", idx, target->name); + file->pv_ops[idx].clean = false; + } + } + + return file->pv_ops[idx].clean; +} + +static inline bool noinstr_call_dest(struct objtool_file *file, + struct instruction *insn, + struct symbol *func) { /* * We can't deal with indirect function calls at present; * assume they're instrumented. */ - if (!func) + if (!func) { + if (file->pv_ops) + return pv_call_dest(file, insn); + return false; + } /* * If the symbol is from a noinstr section; we good. 
@@ -2695,10 +2822,12 @@ static inline bool noinstr_call_dest(struct symbol *func) return false; } -static int validate_call(struct instruction *insn, struct insn_state *state) +static int validate_call(struct objtool_file *file, + struct instruction *insn, + struct insn_state *state) { if (state->noinstr && state->instr <= 0 && - !noinstr_call_dest(insn->call_dest)) { + !noinstr_call_dest(file, insn, insn->call_dest)) { WARN_FUNC("call to %s() leaves .noinstr.text section", insn->sec, insn->offset, call_dest_name(insn)); return 1; @@ -2719,7 +2848,9 @@ static int validate_call(struct instruction *insn, struct insn_state *state) return 0; } -static int validate_sibling_call(struct instruction *insn, struct insn_state *state) +static int validate_sibling_call(struct objtool_file *file, + struct instruction *insn, + struct insn_state *state) { if (has_modified_stack_frame(insn, state)) { WARN_FUNC("sibling call from callable instruction with modified stack frame", @@ -2727,7 +2858,7 @@ static int validate_sibling_call(struct instruction *insn, struct insn_state *st return 1; } - return validate_call(insn, state); + return validate_call(file, insn, state); } static int validate_return(struct symbol *func, struct instruction *insn, struct insn_state *state) @@ -2880,7 +3011,7 @@ static int validate_branch(struct objtool_file *file, struct symbol *func, case INSN_CALL: case INSN_CALL_DYNAMIC: - ret = validate_call(insn, &state); + ret = validate_call(file, insn, &state); if (ret) return ret; @@ -2899,7 +3030,7 @@ static int validate_branch(struct objtool_file *file, struct symbol *func, case INSN_JUMP_CONDITIONAL: case INSN_JUMP_UNCONDITIONAL: if (is_sibling_call(insn)) { - ret = validate_sibling_call(insn, &state); + ret = validate_sibling_call(file, insn, &state); if (ret) return ret; @@ -2921,7 +3052,7 @@ static int validate_branch(struct objtool_file *file, struct symbol *func, case INSN_JUMP_DYNAMIC: case INSN_JUMP_DYNAMIC_CONDITIONAL: if (is_sibling_call(insn)) { - ret = validate_sibling_call(insn, &state); + ret = validate_sibling_call(file, insn, &state); if (ret) return ret; } diff --git a/tools/objtool/include/objtool/arch.h b/tools/objtool/include/objtool/arch.h index 6f482ae2d7d8..589ff58426ab 100644 --- a/tools/objtool/include/objtool/arch.h +++ b/tools/objtool/include/objtool/arch.h @@ -69,7 +69,7 @@ struct instruction; void arch_initial_func_cfi_state(struct cfi_init_state *state); -int arch_decode_instruction(const struct elf *elf, const struct section *sec, +int arch_decode_instruction(struct objtool_file *file, const struct section *sec, unsigned long offset, unsigned int maxlen, unsigned int *len, enum insn_type *type, unsigned long *immediate, diff --git a/tools/objtool/include/objtool/elf.h b/tools/objtool/include/objtool/elf.h index e34395047530..c3857fadee7a 100644 --- a/tools/objtool/include/objtool/elf.h +++ b/tools/objtool/include/objtool/elf.h @@ -57,6 +57,7 @@ struct symbol { struct symbol *pfunc, *cfunc, *alias; bool uaccess_safe; bool static_call_tramp; + struct list_head pv_target; }; struct reloc { diff --git a/tools/objtool/include/objtool/objtool.h b/tools/objtool/include/objtool/objtool.h index 24fa83634de4..f99fbc6078d5 100644 --- a/tools/objtool/include/objtool/objtool.h +++ b/tools/objtool/include/objtool/objtool.h @@ -14,6 +14,11 @@ #define __weak __attribute__((weak)) +struct pv_state { + bool clean; + struct list_head targets; +}; + struct objtool_file { struct elf *elf; struct list_head insn_list; @@ -25,10 +30,14 @@ struct objtool_file { unsigned long 
jl_short, jl_long; unsigned long jl_nop_short, jl_nop_long; + + struct pv_state *pv_ops; }; struct objtool_file *objtool_open_read(const char *_objname); +void objtool_pv_add(struct objtool_file *file, int idx, struct symbol *func); + int check(struct objtool_file *file); int orc_dump(const char *objname); int orc_create(struct objtool_file *file); diff --git a/tools/objtool/objtool.c b/tools/objtool/objtool.c index e21db8bce493..c90c7084e45a 100644 --- a/tools/objtool/objtool.c +++ b/tools/objtool/objtool.c @@ -135,6 +135,28 @@ struct objtool_file *objtool_open_read(const char *_objname) return &file; } +void objtool_pv_add(struct objtool_file *f, int idx, struct symbol *func) +{ + if (!noinstr) + return; + + if (!f->pv_ops) { + WARN("paravirt confusion"); + return; + } + + /* + * These functions will be patched into native code, + * see paravirt_patch(). + */ + if (!strcmp(func->name, "_paravirt_nop") || + !strcmp(func->name, "_paravirt_ident_64")) + return; + + list_add(&func->pv_target, &f->pv_ops[idx].targets); + f->pv_ops[idx].clean = false; +} + static void cmd_usage(void) { unsigned int i, longest = 0; From 1739c66eb7bd5f27f1b69a5a26e10e8327d1e136 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 Oct 2021 14:01:33 +0200 Subject: [PATCH 26/41] objtool: Classify symbols In order to avoid calling str*cmp() on symbol names, over and over, do them all once upfront and store the result. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Borislav Petkov Acked-by: Josh Poimboeuf Tested-by: Alexei Starovoitov Link: https://lore.kernel.org/r/20211026120309.658539311@infradead.org --- tools/objtool/check.c | 34 +++++++++++++++++++---------- tools/objtool/include/objtool/elf.h | 7 ++++-- 2 files changed, 27 insertions(+), 14 deletions(-) diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 7c865a10372a..fdbc6d2c5597 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -1012,8 +1012,7 @@ static void add_call_dest(struct objtool_file *file, struct instruction *insn, * so they need a little help, NOP out any KCOV calls from noinstr * text. */ - if (insn->sec->noinstr && - !strncmp(insn->call_dest->name, "__sanitizer_cov_", 16)) { + if (insn->sec->noinstr && insn->call_dest->kcov) { if (reloc) { reloc->type = R_NONE; elf_write_reloc(file->elf, reloc); @@ -1027,7 +1026,7 @@ static void add_call_dest(struct objtool_file *file, struct instruction *insn, insn->type = sibling ? INSN_RETURN : INSN_NOP; } - if (mcount && !strcmp(insn->call_dest->name, "__fentry__")) { + if (mcount && insn->call_dest->fentry) { if (sibling) WARN_FUNC("Tail call to __fentry__ !?!?", insn->sec, insn->offset); @@ -1077,7 +1076,7 @@ static int add_jump_destinations(struct objtool_file *file) } else if (reloc->sym->type == STT_SECTION) { dest_sec = reloc->sym->sec; dest_off = arch_dest_reloc_offset(reloc->addend); - } else if (arch_is_retpoline(reloc->sym)) { + } else if (reloc->sym->retpoline_thunk) { /* * Retpoline jumps are really dynamic jumps in * disguise, so convert them accordingly. @@ -1218,7 +1217,7 @@ static int add_call_destinations(struct objtool_file *file) add_call_dest(file, insn, dest, false); - } else if (arch_is_retpoline(reloc->sym)) { + } else if (reloc->sym->retpoline_thunk) { /* * Retpoline calls are really dynamic calls in * disguise, so convert them accordingly. 
@@ -1907,17 +1906,28 @@ static int read_intra_function_calls(struct objtool_file *file) return 0; } -static int read_static_call_tramps(struct objtool_file *file) +static int classify_symbols(struct objtool_file *file) { struct section *sec; struct symbol *func; for_each_sec(file, sec) { list_for_each_entry(func, &sec->symbol_list, list) { - if (func->bind == STB_GLOBAL && - !strncmp(func->name, STATIC_CALL_TRAMP_PREFIX_STR, + if (func->bind != STB_GLOBAL) + continue; + + if (!strncmp(func->name, STATIC_CALL_TRAMP_PREFIX_STR, strlen(STATIC_CALL_TRAMP_PREFIX_STR))) func->static_call_tramp = true; + + if (arch_is_retpoline(func)) + func->retpoline_thunk = true; + + if (!strcmp(func->name, "__fentry__")) + func->fentry = true; + + if (!strncmp(func->name, "__sanitizer_cov_", 16)) + func->kcov = true; } } @@ -1983,7 +1993,7 @@ static int decode_sections(struct objtool_file *file) /* * Must be before add_{jump_call}_destination. */ - ret = read_static_call_tramps(file); + ret = classify_symbols(file); if (ret) return ret; @@ -2041,9 +2051,9 @@ static int decode_sections(struct objtool_file *file) static bool is_fentry_call(struct instruction *insn) { - if (insn->type == INSN_CALL && insn->call_dest && - insn->call_dest->type == STT_NOTYPE && - !strcmp(insn->call_dest->name, "__fentry__")) + if (insn->type == INSN_CALL && + insn->call_dest && + insn->call_dest->fentry) return true; return false; diff --git a/tools/objtool/include/objtool/elf.h b/tools/objtool/include/objtool/elf.h index c48c1067797d..c2dbf53528c5 100644 --- a/tools/objtool/include/objtool/elf.h +++ b/tools/objtool/include/objtool/elf.h @@ -54,8 +54,11 @@ struct symbol { unsigned long offset; unsigned int len; struct symbol *pfunc, *cfunc, *alias; - bool uaccess_safe; - bool static_call_tramp; + u8 uaccess_safe : 1; + u8 static_call_tramp : 1; + u8 retpoline_thunk : 1; + u8 fentry : 1; + u8 kcov : 1; struct list_head pv_target; }; From dd003edeffa3cb87bc9862582004f405d77d7670 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 Oct 2021 14:01:34 +0200 Subject: [PATCH 27/41] objtool: Explicitly avoid self modifying code in .altinstr_replacement Assume ALTERNATIVE()s know what they're doing and do not change, or cause to change, instructions in .altinstr_replacement sections. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Borislav Petkov Acked-by: Josh Poimboeuf Tested-by: Alexei Starovoitov Link: https://lore.kernel.org/r/20211026120309.722511775@infradead.org --- tools/objtool/check.c | 42 +++++++++++++++++++++++++++++++----------- 1 file changed, 31 insertions(+), 11 deletions(-) diff --git a/tools/objtool/check.c b/tools/objtool/check.c index fdbc6d2c5597..8ab6f24f8753 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -993,18 +993,27 @@ static void remove_insn_ops(struct instruction *insn) } } -static void add_call_dest(struct objtool_file *file, struct instruction *insn, - struct symbol *dest, bool sibling) +static void annotate_call_site(struct objtool_file *file, + struct instruction *insn, bool sibling) { struct reloc *reloc = insn_reloc(file, insn); + struct symbol *sym = insn->call_dest; - insn->call_dest = dest; - if (!dest) + if (!sym) + sym = reloc->sym; + + /* + * Alternative replacement code is just template code which is + * sometimes copied to the original instruction. For now, don't + * annotate it. (In the future we might consider annotating the + * original instruction if/when it ever makes sense to do so.) 
+ */ + if (!strcmp(insn->sec->name, ".altinstr_replacement")) return; - if (insn->call_dest->static_call_tramp) { - list_add_tail(&insn->call_node, - &file->static_call_list); + if (sym->static_call_tramp) { + list_add_tail(&insn->call_node, &file->static_call_list); + return; } /* @@ -1012,7 +1021,7 @@ static void add_call_dest(struct objtool_file *file, struct instruction *insn, * so they need a little help, NOP out any KCOV calls from noinstr * text. */ - if (insn->sec->noinstr && insn->call_dest->kcov) { + if (insn->sec->noinstr && sym->kcov) { if (reloc) { reloc->type = R_NONE; elf_write_reloc(file->elf, reloc); @@ -1024,9 +1033,10 @@ static void add_call_dest(struct objtool_file *file, struct instruction *insn, : arch_nop_insn(insn->len)); insn->type = sibling ? INSN_RETURN : INSN_NOP; + return; } - if (mcount && insn->call_dest->fentry) { + if (mcount && sym->fentry) { if (sibling) WARN_FUNC("Tail call to __fentry__ !?!?", insn->sec, insn->offset); @@ -1041,9 +1051,17 @@ static void add_call_dest(struct objtool_file *file, struct instruction *insn, insn->type = INSN_NOP; - list_add_tail(&insn->mcount_loc_node, - &file->mcount_loc_list); + list_add_tail(&insn->mcount_loc_node, &file->mcount_loc_list); + return; } +} + +static void add_call_dest(struct objtool_file *file, struct instruction *insn, + struct symbol *dest, bool sibling) +{ + insn->call_dest = dest; + if (!dest) + return; /* * Whatever stack impact regular CALLs have, should be undone @@ -1053,6 +1071,8 @@ static void add_call_dest(struct objtool_file *file, struct instruction *insn, * are converted to JUMP, see read_intra_function_calls(). */ remove_insn_ops(insn); + + annotate_call_site(file, insn, sibling); } /* From c509331b41b7365e17396c246e8c5797bccc8074 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 Oct 2021 14:01:35 +0200 Subject: [PATCH 28/41] objtool: Shrink struct instruction Any one instruction can only ever call a single function, therefore insn->mcount_loc_node is superfluous and can use insn->call_node. This shrinks struct instruction, which is by far the most numerous structure objtool creates. 
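[ Illustrative sketch, not part of the diff: an instruction sits on at most one of these lists, so the pre-existing call_node linkage can carry either use:

	/* retpoline call site: */
	list_add_tail(&insn->call_node, &file->retpoline_call_list);

	/* ...or an __fentry__ call site, reusing the same list_head: */
	list_add_tail(&insn->call_node, &file->mcount_loc_list);
]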
Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Borislav Petkov Acked-by: Josh Poimboeuf Tested-by: Alexei Starovoitov Link: https://lore.kernel.org/r/20211026120309.785456706@infradead.org --- tools/objtool/check.c | 6 +++--- tools/objtool/include/objtool/check.h | 1 - 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 8ab6f24f8753..ce3c25fec2a6 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -701,7 +701,7 @@ static int create_mcount_loc_sections(struct objtool_file *file) return 0; idx = 0; - list_for_each_entry(insn, &file->mcount_loc_list, mcount_loc_node) + list_for_each_entry(insn, &file->mcount_loc_list, call_node) idx++; sec = elf_create_section(file->elf, "__mcount_loc", 0, sizeof(unsigned long), idx); @@ -709,7 +709,7 @@ static int create_mcount_loc_sections(struct objtool_file *file) return -1; idx = 0; - list_for_each_entry(insn, &file->mcount_loc_list, mcount_loc_node) { + list_for_each_entry(insn, &file->mcount_loc_list, call_node) { loc = (unsigned long *)sec->data->d_buf + idx; memset(loc, 0, sizeof(unsigned long)); @@ -1051,7 +1051,7 @@ static void annotate_call_site(struct objtool_file *file, insn->type = INSN_NOP; - list_add_tail(&insn->mcount_loc_node, &file->mcount_loc_list); + list_add_tail(&insn->call_node, &file->mcount_loc_list); return; } } diff --git a/tools/objtool/include/objtool/check.h b/tools/objtool/include/objtool/check.h index 07e99c25c7ac..6cfff078897f 100644 --- a/tools/objtool/include/objtool/check.h +++ b/tools/objtool/include/objtool/check.h @@ -40,7 +40,6 @@ struct instruction { struct list_head list; struct hlist_node hash; struct list_head call_node; - struct list_head mcount_loc_node; struct section *sec; unsigned long offset; unsigned int len; From 134ab5bd1883312d7a4b3033b05c6b5a1bb8889b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 Oct 2021 14:01:36 +0200 Subject: [PATCH 29/41] objtool,x86: Replace alternatives with .retpoline_sites Instead of writing complete alternatives, simply provide a list of all the retpoline thunk calls. Then the kernel is free to do with them as it pleases. Simpler code all-round. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Borislav Petkov Acked-by: Josh Poimboeuf Tested-by: Alexei Starovoitov Link: https://lore.kernel.org/r/20211026120309.850007165@infradead.org --- arch/x86/kernel/vmlinux.lds.S | 14 +++ tools/objtool/arch/x86/decode.c | 120 ------------------------- tools/objtool/check.c | 132 ++++++++++++++++++++-------- tools/objtool/elf.c | 84 ------------------ tools/objtool/include/objtool/elf.h | 1 - tools/objtool/special.c | 8 -- 6 files changed, 107 insertions(+), 252 deletions(-) diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index efd9e9ea17f2..3d6dc12d198f 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -272,6 +272,20 @@ SECTIONS __parainstructions_end = .; } +#ifdef CONFIG_RETPOLINE + /* + * List of instructions that call/jmp/jcc to retpoline thunks + * __x86_indirect_thunk_*(). These instructions can be patched along + * with alternatives, after which the section can be freed. + */ + . = ALIGN(8); + .retpoline_sites : AT(ADDR(.retpoline_sites) - LOAD_OFFSET) { + __retpoline_sites = .; + *(.retpoline_sites) + __retpoline_sites_end = .; + } +#endif + /* * struct alt_inst entries. 
From the header (alternative.h): * "Alternative instructions for different CPU types or capabilities" diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c index 1f2ae708b223..4d6d7fc13255 100644 --- a/tools/objtool/arch/x86/decode.c +++ b/tools/objtool/arch/x86/decode.c @@ -711,126 +711,6 @@ const char *arch_ret_insn(int len) return ret[len-1]; } -/* asm/alternative.h ? */ - -#define ALTINSTR_FLAG_INV (1 << 15) -#define ALT_NOT(feat) ((feat) | ALTINSTR_FLAG_INV) - -struct alt_instr { - s32 instr_offset; /* original instruction */ - s32 repl_offset; /* offset to replacement instruction */ - u16 cpuid; /* cpuid bit set for replacement */ - u8 instrlen; /* length of original instruction */ - u8 replacementlen; /* length of new instruction */ -} __packed; - -static int elf_add_alternative(struct elf *elf, - struct instruction *orig, struct symbol *sym, - int cpuid, u8 orig_len, u8 repl_len) -{ - const int size = sizeof(struct alt_instr); - struct alt_instr *alt; - struct section *sec; - Elf_Scn *s; - - sec = find_section_by_name(elf, ".altinstructions"); - if (!sec) { - sec = elf_create_section(elf, ".altinstructions", - SHF_ALLOC, 0, 0); - - if (!sec) { - WARN_ELF("elf_create_section"); - return -1; - } - } - - s = elf_getscn(elf->elf, sec->idx); - if (!s) { - WARN_ELF("elf_getscn"); - return -1; - } - - sec->data = elf_newdata(s); - if (!sec->data) { - WARN_ELF("elf_newdata"); - return -1; - } - - sec->data->d_size = size; - sec->data->d_align = 1; - - alt = sec->data->d_buf = malloc(size); - if (!sec->data->d_buf) { - perror("malloc"); - return -1; - } - memset(sec->data->d_buf, 0, size); - - if (elf_add_reloc_to_insn(elf, sec, sec->sh.sh_size, - R_X86_64_PC32, orig->sec, orig->offset)) { - WARN("elf_create_reloc: alt_instr::instr_offset"); - return -1; - } - - if (elf_add_reloc(elf, sec, sec->sh.sh_size + 4, - R_X86_64_PC32, sym, 0)) { - WARN("elf_create_reloc: alt_instr::repl_offset"); - return -1; - } - - alt->cpuid = bswap_if_needed(cpuid); - alt->instrlen = orig_len; - alt->replacementlen = repl_len; - - sec->sh.sh_size += size; - sec->changed = true; - - return 0; -} - -#define X86_FEATURE_RETPOLINE ( 7*32+12) - -int arch_rewrite_retpolines(struct objtool_file *file) -{ - struct instruction *insn; - struct reloc *reloc; - struct symbol *sym; - char name[32] = ""; - - list_for_each_entry(insn, &file->retpoline_call_list, call_node) { - - if (insn->type != INSN_JUMP_DYNAMIC && - insn->type != INSN_CALL_DYNAMIC) - continue; - - if (!strcmp(insn->sec->name, ".text.__x86.indirect_thunk")) - continue; - - reloc = insn->reloc; - - sprintf(name, "__x86_indirect_alt_%s_%s", - insn->type == INSN_JUMP_DYNAMIC ? 
"jmp" : "call", - reloc->sym->name + 21); - - sym = find_symbol_by_name(file->elf, name); - if (!sym) { - sym = elf_create_undef_symbol(file->elf, name); - if (!sym) { - WARN("elf_create_undef_symbol"); - return -1; - } - } - - if (elf_add_alternative(file->elf, insn, sym, - ALT_NOT(X86_FEATURE_RETPOLINE), 5, 5)) { - WARN("elf_add_alternative"); - return -1; - } - } - - return 0; -} - int arch_decode_hint_reg(u8 sp_reg, int *base) { switch (sp_reg) { diff --git a/tools/objtool/check.c b/tools/objtool/check.c index ce3c25fec2a6..fb3f251ea021 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -683,6 +683,52 @@ static int create_static_call_sections(struct objtool_file *file) return 0; } +static int create_retpoline_sites_sections(struct objtool_file *file) +{ + struct instruction *insn; + struct section *sec; + int idx; + + sec = find_section_by_name(file->elf, ".retpoline_sites"); + if (sec) { + WARN("file already has .retpoline_sites, skipping"); + return 0; + } + + idx = 0; + list_for_each_entry(insn, &file->retpoline_call_list, call_node) + idx++; + + if (!idx) + return 0; + + sec = elf_create_section(file->elf, ".retpoline_sites", 0, + sizeof(int), idx); + if (!sec) { + WARN("elf_create_section: .retpoline_sites"); + return -1; + } + + idx = 0; + list_for_each_entry(insn, &file->retpoline_call_list, call_node) { + + int *site = (int *)sec->data->d_buf + idx; + *site = 0; + + if (elf_add_reloc_to_insn(file->elf, sec, + idx * sizeof(int), + R_X86_64_PC32, + insn->sec, insn->offset)) { + WARN("elf_add_reloc_to_insn: .retpoline_sites"); + return -1; + } + + idx++; + } + + return 0; +} + static int create_mcount_loc_sections(struct objtool_file *file) { struct section *sec; @@ -1016,6 +1062,11 @@ static void annotate_call_site(struct objtool_file *file, return; } + if (sym->retpoline_thunk) { + list_add_tail(&insn->call_node, &file->retpoline_call_list); + return; + } + /* * Many compilers cannot disable KCOV with a function attribute * so they need a little help, NOP out any KCOV calls from noinstr @@ -1075,6 +1126,39 @@ static void add_call_dest(struct objtool_file *file, struct instruction *insn, annotate_call_site(file, insn, sibling); } +static void add_retpoline_call(struct objtool_file *file, struct instruction *insn) +{ + /* + * Retpoline calls/jumps are really dynamic calls/jumps in disguise, + * so convert them accordingly. + */ + switch (insn->type) { + case INSN_CALL: + insn->type = INSN_CALL_DYNAMIC; + break; + case INSN_JUMP_UNCONDITIONAL: + insn->type = INSN_JUMP_DYNAMIC; + break; + case INSN_JUMP_CONDITIONAL: + insn->type = INSN_JUMP_DYNAMIC_CONDITIONAL; + break; + default: + return; + } + + insn->retpoline_safe = true; + + /* + * Whatever stack impact regular CALLs have, should be undone + * by the RETURN of the called function. + * + * Annotated intra-function calls retain the stack_ops but + * are converted to JUMP, see read_intra_function_calls(). + */ + remove_insn_ops(insn); + + annotate_call_site(file, insn, false); +} /* * Find the destination instructions for all jumps. */ @@ -1097,19 +1181,7 @@ static int add_jump_destinations(struct objtool_file *file) dest_sec = reloc->sym->sec; dest_off = arch_dest_reloc_offset(reloc->addend); } else if (reloc->sym->retpoline_thunk) { - /* - * Retpoline jumps are really dynamic jumps in - * disguise, so convert them accordingly. 
- */ - if (insn->type == INSN_JUMP_UNCONDITIONAL) - insn->type = INSN_JUMP_DYNAMIC; - else - insn->type = INSN_JUMP_DYNAMIC_CONDITIONAL; - - list_add_tail(&insn->call_node, - &file->retpoline_call_list); - - insn->retpoline_safe = true; + add_retpoline_call(file, insn); continue; } else if (insn->func) { /* internal or external sibling call (with reloc) */ @@ -1238,18 +1310,7 @@ static int add_call_destinations(struct objtool_file *file) add_call_dest(file, insn, dest, false); } else if (reloc->sym->retpoline_thunk) { - /* - * Retpoline calls are really dynamic calls in - * disguise, so convert them accordingly. - */ - insn->type = INSN_CALL_DYNAMIC; - insn->retpoline_safe = true; - - list_add_tail(&insn->call_node, - &file->retpoline_call_list); - - remove_insn_ops(insn); - continue; + add_retpoline_call(file, insn); } else add_call_dest(file, insn, reloc->sym, false); @@ -1980,11 +2041,6 @@ static void mark_rodata(struct objtool_file *file) file->rodata = found; } -__weak int arch_rewrite_retpolines(struct objtool_file *file) -{ - return 0; -} - static int decode_sections(struct objtool_file *file) { int ret; @@ -2057,15 +2113,6 @@ static int decode_sections(struct objtool_file *file) if (ret) return ret; - /* - * Must be after add_special_section_alts(), since this will emit - * alternatives. Must be after add_{jump,call}_destination(), since - * those create the call insn lists. - */ - ret = arch_rewrite_retpolines(file); - if (ret) - return ret; - return 0; } @@ -3468,6 +3515,13 @@ int check(struct objtool_file *file) goto out; warnings += ret; + if (retpoline) { + ret = create_retpoline_sites_sections(file); + if (ret < 0) + goto out; + warnings += ret; + } + if (mcount) { ret = create_mcount_loc_sections(file); if (ret < 0) diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c index b18f0055b50b..5c029355c38c 100644 --- a/tools/objtool/elf.c +++ b/tools/objtool/elf.c @@ -740,90 +740,6 @@ static int elf_add_string(struct elf *elf, struct section *strtab, char *str) return len; } -struct symbol *elf_create_undef_symbol(struct elf *elf, const char *name) -{ - struct section *symtab, *symtab_shndx; - struct symbol *sym; - Elf_Data *data; - Elf_Scn *s; - - sym = malloc(sizeof(*sym)); - if (!sym) { - perror("malloc"); - return NULL; - } - memset(sym, 0, sizeof(*sym)); - - sym->name = strdup(name); - - sym->sym.st_name = elf_add_string(elf, NULL, sym->name); - if (sym->sym.st_name == -1) - return NULL; - - sym->sym.st_info = GELF_ST_INFO(STB_GLOBAL, STT_NOTYPE); - // st_other 0 - // st_shndx 0 - // st_value 0 - // st_size 0 - - symtab = find_section_by_name(elf, ".symtab"); - if (!symtab) { - WARN("can't find .symtab"); - return NULL; - } - - s = elf_getscn(elf->elf, symtab->idx); - if (!s) { - WARN_ELF("elf_getscn"); - return NULL; - } - - data = elf_newdata(s); - if (!data) { - WARN_ELF("elf_newdata"); - return NULL; - } - - data->d_buf = &sym->sym; - data->d_size = sizeof(sym->sym); - data->d_align = 1; - data->d_type = ELF_T_SYM; - - sym->idx = symtab->sh.sh_size / sizeof(sym->sym); - - symtab->sh.sh_size += data->d_size; - symtab->changed = true; - - symtab_shndx = find_section_by_name(elf, ".symtab_shndx"); - if (symtab_shndx) { - s = elf_getscn(elf->elf, symtab_shndx->idx); - if (!s) { - WARN_ELF("elf_getscn"); - return NULL; - } - - data = elf_newdata(s); - if (!data) { - WARN_ELF("elf_newdata"); - return NULL; - } - - data->d_buf = &sym->sym.st_size; /* conveniently 0 */ - data->d_size = sizeof(Elf32_Word); - data->d_align = 4; - data->d_type = ELF_T_WORD; - - 
symtab_shndx->sh.sh_size += 4; - symtab_shndx->changed = true; - } - - sym->sec = find_section_by_index(elf, 0); - - elf_add_symbol(elf, sym); - - return sym; -} - struct section *elf_create_section(struct elf *elf, const char *name, unsigned int sh_flags, size_t entsize, int nr) { diff --git a/tools/objtool/include/objtool/elf.h b/tools/objtool/include/objtool/elf.h index c2dbf53528c5..cdc739fa9a6f 100644 --- a/tools/objtool/include/objtool/elf.h +++ b/tools/objtool/include/objtool/elf.h @@ -144,7 +144,6 @@ int elf_write_insn(struct elf *elf, struct section *sec, unsigned long offset, unsigned int len, const char *insn); int elf_write_reloc(struct elf *elf, struct reloc *reloc); -struct symbol *elf_create_undef_symbol(struct elf *elf, const char *name); int elf_write(struct elf *elf); void elf_close(struct elf *elf); diff --git a/tools/objtool/special.c b/tools/objtool/special.c index 06c3eacab3d5..e2223dd91c37 100644 --- a/tools/objtool/special.c +++ b/tools/objtool/special.c @@ -109,14 +109,6 @@ static int get_alt_entry(struct elf *elf, struct special_entry *entry, return -1; } - /* - * Skip retpoline .altinstr_replacement... we already rewrite the - * instructions for retpolines anyway, see arch_is_retpoline() - * usage in add_{call,jump}_destinations(). - */ - if (arch_is_retpoline(new_reloc->sym)) - return 1; - reloc_to_sec_off(new_reloc, &alt->new_sec, &alt->new_off); /* _ASM_EXTABLE_EX hack */ From 4fe79e710d9574a14993f8b4e16b7252da72d5e8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 Oct 2021 14:01:37 +0200 Subject: [PATCH 30/41] x86/retpoline: Remove unused replacement symbols Now that objtool no longer creates alternatives, these replacement symbols are no longer needed, remove them. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Borislav Petkov Acked-by: Josh Poimboeuf Tested-by: Alexei Starovoitov Link: https://lore.kernel.org/r/20211026120309.915051744@infradead.org --- arch/x86/include/asm/asm-prototypes.h | 10 ------- arch/x86/lib/retpoline.S | 42 --------------------------- 2 files changed, 52 deletions(-) diff --git a/arch/x86/include/asm/asm-prototypes.h b/arch/x86/include/asm/asm-prototypes.h index 4cb726c71ed8..a28c5cab893d 100644 --- a/arch/x86/include/asm/asm-prototypes.h +++ b/arch/x86/include/asm/asm-prototypes.h @@ -24,14 +24,4 @@ extern void cmpxchg8b_emu(void); extern asmlinkage void __x86_indirect_thunk_ ## reg (void); #include -#undef GEN -#define GEN(reg) \ - extern asmlinkage void __x86_indirect_alt_call_ ## reg (void); -#include - -#undef GEN -#define GEN(reg) \ - extern asmlinkage void __x86_indirect_alt_jmp_ ## reg (void); -#include - #endif /* CONFIG_RETPOLINE */ diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index ec9922cba30a..a91e0dc0a8cc 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -40,36 +40,6 @@ SYM_FUNC_END(__x86_indirect_thunk_\reg) .endm -/* - * This generates .altinstr_replacement symbols for use by objtool. They, - * however, must not actually live in .altinstr_replacement since that will be - * discarded after init, but module alternatives will also reference these - * symbols. - * - * Their names matches the "__x86_indirect_" prefix to mark them as retpolines. 
- */ -.macro ALT_THUNK reg - - .align 1 - -SYM_FUNC_START_NOALIGN(__x86_indirect_alt_call_\reg) - ANNOTATE_RETPOLINE_SAFE -1: call *%\reg -2: .skip 5-(2b-1b), 0x90 -SYM_FUNC_END(__x86_indirect_alt_call_\reg) - -STACK_FRAME_NON_STANDARD(__x86_indirect_alt_call_\reg) - -SYM_FUNC_START_NOALIGN(__x86_indirect_alt_jmp_\reg) - ANNOTATE_RETPOLINE_SAFE -1: jmp *%\reg -2: .skip 5-(2b-1b), 0x90 -SYM_FUNC_END(__x86_indirect_alt_jmp_\reg) - -STACK_FRAME_NON_STANDARD(__x86_indirect_alt_jmp_\reg) - -.endm - /* * Despite being an assembler file we can't just use .irp here * because __KSYM_DEPS__ only uses the C preprocessor and would @@ -92,15 +62,3 @@ STACK_FRAME_NON_STANDARD(__x86_indirect_alt_jmp_\reg) #undef GEN #define GEN(reg) EXPORT_THUNK(reg) #include - -#undef GEN -#define GEN(reg) ALT_THUNK reg -#include - -#undef GEN -#define GEN(reg) __EXPORT_THUNK(__x86_indirect_alt_call_ ## reg) -#include - -#undef GEN -#define GEN(reg) __EXPORT_THUNK(__x86_indirect_alt_jmp_ ## reg) -#include From a92ede2d584a2e070def59c7e47e6b6f6341c55c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 Oct 2021 14:01:38 +0200 Subject: [PATCH 31/41] x86/asm: Fix register order Ensure the register order is correct; this allows for easy translation between register number and trampoline and vice-versa. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Borislav Petkov Acked-by: Josh Poimboeuf Tested-by: Alexei Starovoitov Link: https://lore.kernel.org/r/20211026120309.978573921@infradead.org --- arch/x86/include/asm/GEN-for-each-reg.h | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/GEN-for-each-reg.h b/arch/x86/include/asm/GEN-for-each-reg.h index 1b07fb102c4e..07949102a08d 100644 --- a/arch/x86/include/asm/GEN-for-each-reg.h +++ b/arch/x86/include/asm/GEN-for-each-reg.h @@ -1,11 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * These are in machine order; things rely on that. + */ #ifdef CONFIG_64BIT GEN(rax) -GEN(rbx) GEN(rcx) GEN(rdx) +GEN(rbx) +GEN(rsp) +GEN(rbp) GEN(rsi) GEN(rdi) -GEN(rbp) GEN(r8) GEN(r9) GEN(r10) @@ -16,10 +21,11 @@ GEN(r14) GEN(r15) #else GEN(eax) -GEN(ebx) GEN(ecx) GEN(edx) +GEN(ebx) +GEN(esp) +GEN(ebp) GEN(esi) GEN(edi) -GEN(ebp) #endif From b6d3d9944bd7c9e8c06994ead3c9952f673f2a66 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 Oct 2021 14:01:39 +0200 Subject: [PATCH 32/41] x86/asm: Fixup odd GEN-for-each-reg.h usage Currently GEN-for-each-reg.h usage leaves GEN defined, relying on any subsequent usage to start with #undef, which is rude. 
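[ Editorial sketch of the intended pattern (the header path is taken from the diff below): each user defines GEN, includes the per-register header, and undefines GEN again right away:

#define GEN(reg) \
	extern asmlinkage void __x86_indirect_thunk_ ## reg (void);
#include <asm/GEN-for-each-reg.h>
#undef GEN
]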
Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Borislav Petkov Acked-by: Josh Poimboeuf Tested-by: Alexei Starovoitov Link: https://lore.kernel.org/r/20211026120310.041792350@infradead.org --- arch/x86/include/asm/asm-prototypes.h | 2 +- arch/x86/lib/retpoline.S | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/asm-prototypes.h b/arch/x86/include/asm/asm-prototypes.h index a28c5cab893d..a2bed09d3c11 100644 --- a/arch/x86/include/asm/asm-prototypes.h +++ b/arch/x86/include/asm/asm-prototypes.h @@ -19,9 +19,9 @@ extern void cmpxchg8b_emu(void); #ifdef CONFIG_RETPOLINE -#undef GEN #define GEN(reg) \ extern asmlinkage void __x86_indirect_thunk_ ## reg (void); #include +#undef GEN #endif /* CONFIG_RETPOLINE */ diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index a91e0dc0a8cc..4c910fa92086 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -55,10 +55,10 @@ SYM_FUNC_END(__x86_indirect_thunk_\reg) #define __EXPORT_THUNK(sym) _ASM_NOKPROBE(sym); EXPORT_SYMBOL(sym) #define EXPORT_THUNK(reg) __EXPORT_THUNK(__x86_indirect_thunk_ ## reg) -#undef GEN #define GEN(reg) THUNK reg #include - #undef GEN + #define GEN(reg) EXPORT_THUNK(reg) #include +#undef GEN From 6fda8a38865607db739be3e567a2387376222dbd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 Oct 2021 14:01:40 +0200 Subject: [PATCH 33/41] x86/retpoline: Move the retpoline thunk declarations to nospec-branch.h Because it makes no sense to split the retpoline gunk over multiple headers. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Borislav Petkov Acked-by: Josh Poimboeuf Tested-by: Alexei Starovoitov Link: https://lore.kernel.org/r/20211026120310.106290934@infradead.org --- arch/x86/include/asm/asm-prototypes.h | 8 -------- arch/x86/include/asm/nospec-branch.h | 7 +++++++ arch/x86/net/bpf_jit_comp.c | 1 - 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/arch/x86/include/asm/asm-prototypes.h b/arch/x86/include/asm/asm-prototypes.h index a2bed09d3c11..8f80de627c60 100644 --- a/arch/x86/include/asm/asm-prototypes.h +++ b/arch/x86/include/asm/asm-prototypes.h @@ -17,11 +17,3 @@ extern void cmpxchg8b_emu(void); #endif -#ifdef CONFIG_RETPOLINE - -#define GEN(reg) \ - extern asmlinkage void __x86_indirect_thunk_ ## reg (void); -#include -#undef GEN - -#endif /* CONFIG_RETPOLINE */ diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index ec2d5c8c6694..14053cd314b9 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -5,6 +5,7 @@ #include #include +#include #include #include @@ -118,6 +119,12 @@ ".popsection\n\t" #ifdef CONFIG_RETPOLINE + +#define GEN(reg) \ + extern asmlinkage void __x86_indirect_thunk_ ## reg (void); +#include +#undef GEN + #ifdef CONFIG_X86_64 /* diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 9ea57389c554..7c03af65a7de 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -15,7 +15,6 @@ #include #include #include -#include static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len) { From 1a6f74429c42a3854980359a758e222005712aee Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 Oct 2021 14:01:41 +0200 Subject: [PATCH 34/41] x86/retpoline: Create a retpoline thunk array Stick all the retpolines in a single symbol and have the individual thunks as inner labels, this should guarantee thunk order and layout. 
Previously there were 16 (or rather 15 without rsp) separate symbols and a toolchain might reasonably expect it could displace them however it liked, with disregard for their relative position. However, now they're part of a larger symbol. Any change to their relative position would disrupt this larger _array symbol and thus not be sound. This is the same reasoning used for data symbols. On their own there is no guarantee about their relative position wrt one another, but we're still able to do arrays because an array as a whole is a single larger symbol. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Borislav Petkov Acked-by: Josh Poimboeuf Tested-by: Alexei Starovoitov Link: https://lore.kernel.org/r/20211026120310.169659320@infradead.org --- arch/x86/include/asm/nospec-branch.h | 8 +++++++- arch/x86/lib/retpoline.S | 14 +++++++++----- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 14053cd314b9..e22aedbeb668 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -12,6 +12,8 @@ #include #include +#define RETPOLINE_THUNK_SIZE 32 + /* * Fill the CPU return stack buffer. * @@ -120,11 +122,15 @@ #ifdef CONFIG_RETPOLINE +typedef u8 retpoline_thunk_t[RETPOLINE_THUNK_SIZE]; + #define GEN(reg) \ - extern asmlinkage void __x86_indirect_thunk_ ## reg (void); + extern retpoline_thunk_t __x86_indirect_thunk_ ## reg; #include #undef GEN +extern retpoline_thunk_t __x86_indirect_thunk_array[]; + #ifdef CONFIG_X86_64 /* diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index 4c910fa92086..cf0b39f97adc 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -28,16 +28,14 @@ .macro THUNK reg - .align 32 - -SYM_FUNC_START(__x86_indirect_thunk_\reg) + .align RETPOLINE_THUNK_SIZE +SYM_INNER_LABEL(__x86_indirect_thunk_\reg, SYM_L_GLOBAL) + UNWIND_HINT_EMPTY ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), \ __stringify(RETPOLINE \reg), X86_FEATURE_RETPOLINE, \ __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), X86_FEATURE_RETPOLINE_AMD -SYM_FUNC_END(__x86_indirect_thunk_\reg) - .endm /* @@ -55,10 +53,16 @@ SYM_FUNC_END(__x86_indirect_thunk_\reg) #define __EXPORT_THUNK(sym) _ASM_NOKPROBE(sym); EXPORT_SYMBOL(sym) #define EXPORT_THUNK(reg) __EXPORT_THUNK(__x86_indirect_thunk_ ## reg) + .align RETPOLINE_THUNK_SIZE +SYM_CODE_START(__x86_indirect_thunk_array) + #define GEN(reg) THUNK reg #include #undef GEN + .align RETPOLINE_THUNK_SIZE +SYM_CODE_END(__x86_indirect_thunk_array) + #define GEN(reg) EXPORT_THUNK(reg) #include #undef GEN From 7508500900814d14e2e085cdc4e28142721abbdf Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 Oct 2021 14:01:42 +0200 Subject: [PATCH 35/41] x86/alternative: Implement .retpoline_sites support Rewrite retpoline thunk call sites to be indirect calls for spectre_v2=off. This ensures spectre_v2=off is as near to a RETPOLINE=n build as possible. This is the replacement for objtool writing alternative entries to ensure the same and achieves feature-parity with the previous approach. One noteworthy feature is that it relies on the thunks to be in machine order to compute the register index. Specifically, this does not yet address the Jcc __x86_indirect_thunk_* calls generated by clang; a future patch will add this.
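[ Editorial illustration, not part of the patch; bytes assume a call through %rax and spectre_v2=off (X86_FEATURE_RETPOLINE not set):

  e8 xx xx xx xx	call __x86_indirect_thunk_rax	# 5-byte compiler output
  ff d0			call *%rax			# rewritten by patch_retpoline()
  90 90 90		# NOP padding to the original length, merged by optimize_nops()

The register index follows from the thunk layout: %rax is entry 0 of __x86_indirect_thunk_array, so emit_indirect() produces opcode 0xff with ModRM 0xd0. ]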
Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Borislav Petkov Acked-by: Josh Poimboeuf Tested-by: Alexei Starovoitov Link: https://lore.kernel.org/r/20211026120310.232495794@infradead.org --- arch/um/kernel/um_arch.c | 4 + arch/x86/include/asm/alternative.h | 1 + arch/x86/kernel/alternative.c | 141 ++++++++++++++++++++++++++++- arch/x86/kernel/module.c | 9 +- 4 files changed, 150 insertions(+), 5 deletions(-) diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c index a149a5e9a16a..54447690de11 100644 --- a/arch/um/kernel/um_arch.c +++ b/arch/um/kernel/um_arch.c @@ -421,6 +421,10 @@ void __init check_bugs(void) os_check_bugs(); } +void apply_retpolines(s32 *start, s32 *end) +{ +} + void apply_alternatives(struct alt_instr *start, struct alt_instr *end) { } diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index a3c2315aca12..58eee6402832 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -75,6 +75,7 @@ extern int alternatives_patched; extern void alternative_instructions(void); extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end); +extern void apply_retpolines(s32 *start, s32 *end); struct module; diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index e9da3dc71254..5df403450359 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -29,6 +29,7 @@ #include #include #include +#include int __read_mostly alternatives_patched; @@ -113,6 +114,7 @@ static void __init_or_module add_nops(void *insns, unsigned int len) } } +extern s32 __retpoline_sites[], __retpoline_sites_end[]; extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; extern s32 __smp_locks[], __smp_locks_end[]; void text_poke_early(void *addr, const void *opcode, size_t len); @@ -221,7 +223,7 @@ static __always_inline int optimize_nops_range(u8 *instr, u8 instrlen, int off) * "noinline" to cause control flow change and thus invalidate I$ and * cause refetch after modification. */ -static void __init_or_module noinline optimize_nops(struct alt_instr *a, u8 *instr) +static void __init_or_module noinline optimize_nops(u8 *instr, size_t len) { struct insn insn; int i = 0; @@ -239,11 +241,11 @@ static void __init_or_module noinline optimize_nops(struct alt_instr *a, u8 *ins * optimized. */ if (insn.length == 1 && insn.opcode.bytes[0] == 0x90) - i += optimize_nops_range(instr, a->instrlen, i); + i += optimize_nops_range(instr, len, i); else i += insn.length; - if (i >= a->instrlen) + if (i >= len) return; } } @@ -331,10 +333,135 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, text_poke_early(instr, insn_buff, insn_buff_sz); next: - optimize_nops(a, instr); + optimize_nops(instr, a->instrlen); } } +#if defined(CONFIG_RETPOLINE) && defined(CONFIG_STACK_VALIDATION) + +/* + * CALL/JMP *%\reg + */ +static int emit_indirect(int op, int reg, u8 *bytes) +{ + int i = 0; + u8 modrm; + + switch (op) { + case CALL_INSN_OPCODE: + modrm = 0x10; /* Reg = 2; CALL r/m */ + break; + + case JMP32_INSN_OPCODE: + modrm = 0x20; /* Reg = 4; JMP r/m */ + break; + + default: + WARN_ON_ONCE(1); + return -1; + } + + if (reg >= 8) { + bytes[i++] = 0x41; /* REX.B prefix */ + reg -= 8; + } + + modrm |= 0xc0; /* Mod = 3 */ + modrm += reg; + + bytes[i++] = 0xff; /* opcode */ + bytes[i++] = modrm; + + return i; +} + +/* + * Rewrite the compiler generated retpoline thunk calls. 
+ * + * For spectre_v2=off (!X86_FEATURE_RETPOLINE), rewrite them into immediate + * indirect instructions, avoiding the extra indirection. + * + * For example, convert: + * + * CALL __x86_indirect_thunk_\reg + * + * into: + * + * CALL *%\reg + * + */ +static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes) +{ + retpoline_thunk_t *target; + int reg, i = 0; + + target = addr + insn->length + insn->immediate.value; + reg = target - __x86_indirect_thunk_array; + + if (WARN_ON_ONCE(reg & ~0xf)) + return -1; + + /* If anyone ever does: CALL/JMP *%rsp, we're in deep trouble. */ + BUG_ON(reg == 4); + + if (cpu_feature_enabled(X86_FEATURE_RETPOLINE)) + return -1; + + i = emit_indirect(insn->opcode.bytes[0], reg, bytes); + if (i < 0) + return i; + + for (; i < insn->length;) + bytes[i++] = BYTES_NOP1; + + return i; +} + +/* + * Generated by 'objtool --retpoline'. + */ +void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) +{ + s32 *s; + + for (s = start; s < end; s++) { + void *addr = (void *)s + *s; + struct insn insn; + int len, ret; + u8 bytes[16]; + u8 op1, op2; + + ret = insn_decode_kernel(&insn, addr); + if (WARN_ON_ONCE(ret < 0)) + continue; + + op1 = insn.opcode.bytes[0]; + op2 = insn.opcode.bytes[1]; + + switch (op1) { + case CALL_INSN_OPCODE: + case JMP32_INSN_OPCODE: + break; + + default: + WARN_ON_ONCE(1); + continue; + } + + len = patch_retpoline(addr, &insn, bytes); + if (len == insn.length) { + optimize_nops(bytes, len); + text_poke_early(addr, bytes, len); + } + } +} + +#else /* !RETPOLINES || !CONFIG_STACK_VALIDATION */ + +void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) { } + +#endif /* CONFIG_RETPOLINE && CONFIG_STACK_VALIDATION */ + #ifdef CONFIG_SMP static void alternatives_smp_lock(const s32 *start, const s32 *end, u8 *text, u8 *text_end) @@ -642,6 +769,12 @@ void __init alternative_instructions(void) */ apply_paravirt(__parainstructions, __parainstructions_end); + /* + * Rewrite the retpolines, must be done before alternatives since + * those can rewrite the retpoline thunks. + */ + apply_retpolines(__retpoline_sites, __retpoline_sites_end); + /* * Then patch alternatives, such that those paravirt calls that are in * alternatives can be overwritten by their immediate fragments. 
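As an aside on how the .retpoline_sites entries are addressed (a stand-alone editorial sketch; insn_a, insn_b, sites and site_addr are made-up stand-ins): each 32-bit entry stores the distance from its own address to the call site it describes, which is why apply_retpolines() above, and the module path below, only need to add *s to the entry address:

  /* Sketch of a self-relative s32 site table, as objtool emits at build
   * time; here the offsets are computed at run time just to show the math. */
  #include <stdio.h>
  #include <stdint.h>

  static unsigned char insn_a[5];	/* stand-ins for two call sites */
  static unsigned char insn_b[5];

  static int32_t sites[2];

  static void *site_addr(int32_t *s)
  {
  	return (void *)((intptr_t)s + *s);	/* addr = (void *)s + *s */
  }

  int main(void)
  {
  	sites[0] = (int32_t)((intptr_t)insn_a - (intptr_t)&sites[0]);
  	sites[1] = (int32_t)((intptr_t)insn_b - (intptr_t)&sites[1]);

  	printf("entry 0 -> %p, insn_a at %p\n", site_addr(&sites[0]), (void *)insn_a);
  	printf("entry 1 -> %p, insn_b at %p\n", site_addr(&sites[1]), (void *)insn_b);
  	return 0;
  }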
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 5e9a34b5bd74..169fb6f4cd2e 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -251,7 +251,8 @@ int module_finalize(const Elf_Ehdr *hdr, struct module *me) { const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL, - *para = NULL, *orc = NULL, *orc_ip = NULL; + *para = NULL, *orc = NULL, *orc_ip = NULL, + *retpolines = NULL; char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { @@ -267,8 +268,14 @@ int module_finalize(const Elf_Ehdr *hdr, orc = s; if (!strcmp(".orc_unwind_ip", secstrings + s->sh_name)) orc_ip = s; + if (!strcmp(".retpoline_sites", secstrings + s->sh_name)) + retpolines = s; } + if (retpolines) { + void *rseg = (void *)retpolines->sh_addr; + apply_retpolines(rseg, rseg + retpolines->sh_size); + } if (alt) { /* patch .altinstructions */ void *aseg = (void *)alt->sh_addr; From 2f0cbb2a8e5bbf101e9de118fc0eb168111a5e1e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 Oct 2021 14:01:43 +0200 Subject: [PATCH 36/41] x86/alternative: Handle Jcc __x86_indirect_thunk_\reg Handle the rare cases where the compiler (clang) does an indirect conditional tail-call using: Jcc __x86_indirect_thunk_\reg For the !RETPOLINE case this can be rewritten to fit the original (6 byte) instruction like: Jncc.d8 1f JMP *%\reg NOP 1: Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Borislav Petkov Acked-by: Josh Poimboeuf Tested-by: Alexei Starovoitov Link: https://lore.kernel.org/r/20211026120310.296470217@infradead.org --- arch/x86/kernel/alternative.c | 40 +++++++++++++++++++++++++++++++---- 1 file changed, 36 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 5df403450359..1dea2f6c5546 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -393,7 +393,8 @@ static int emit_indirect(int op, int reg, u8 *bytes) static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes) { retpoline_thunk_t *target; - int reg, i = 0; + int reg, ret, i = 0; + u8 op, cc; target = addr + insn->length + insn->immediate.value; reg = target - __x86_indirect_thunk_array; @@ -407,9 +408,36 @@ static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes) if (cpu_feature_enabled(X86_FEATURE_RETPOLINE)) return -1; - i = emit_indirect(insn->opcode.bytes[0], reg, bytes); - if (i < 0) - return i; + op = insn->opcode.bytes[0]; + + /* + * Convert: + * + * Jcc.d32 __x86_indirect_thunk_\reg + * + * into: + * + * Jncc.d8 1f + * JMP *%\reg + * NOP + * 1: + */ + /* Jcc.d32 second opcode byte is in the range: 0x80-0x8f */ + if (op == 0x0f && (insn->opcode.bytes[1] & 0xf0) == 0x80) { + cc = insn->opcode.bytes[1] & 0xf; + cc ^= 1; /* invert condition */ + + bytes[i++] = 0x70 + cc; /* Jcc.d8 */ + bytes[i++] = insn->length - 2; /* sizeof(Jcc.d8) == 2 */ + + /* Continue as if: JMP.d32 __x86_indirect_thunk_\reg */ + op = JMP32_INSN_OPCODE; + } + + ret = emit_indirect(op, reg, bytes + i); + if (ret < 0) + return ret; + i += ret; for (; i < insn->length;) bytes[i++] = BYTES_NOP1; @@ -443,6 +471,10 @@ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) case JMP32_INSN_OPCODE: break; + case 0x0f: /* escape */ + if (op2 >= 0x80 && op2 <= 0x8f) + break; + fallthrough; default: WARN_ON_ONCE(1); continue; From bbe2df3f6b6da7848398d55b1311d58a16ec21e4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 Oct 2021 14:01:44 +0200 Subject: [PATCH 37/41] 
x86/alternative: Try inline spectre_v2=retpoline,amd Try and replace retpoline thunk calls with: LFENCE CALL *%\reg for spectre_v2=retpoline,amd. Specifically, the sequence above is 5 bytes for the low 8 registers, but 6 bytes for the high 8 registers. This means that unless the compilers prefix stuff the call with higher registers this replacement will fail. Luckily GCC strongly favours RAX for the indirect calls and most (95%+ for defconfig-x86_64) will be converted. OTOH clang strongly favours R11 and almost nothing gets converted. Note: it will also generate a correct replacement for the Jcc.d32 case, except unless the compilers start to prefix stuff that, it'll never fit. Specifically: Jncc.d8 1f LFENCE JMP *%\reg 1: is 7-8 bytes long, where the original instruction in unpadded form is only 6 bytes. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Borislav Petkov Acked-by: Josh Poimboeuf Tested-by: Alexei Starovoitov Link: https://lore.kernel.org/r/20211026120310.359986601@infradead.org --- arch/x86/kernel/alternative.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 1dea2f6c5546..c89824c69ff7 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -389,6 +389,7 @@ static int emit_indirect(int op, int reg, u8 *bytes) * * CALL *%\reg * + * It also tries to inline spectre_v2=retpoline,amd when size permits. */ static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes) { @@ -405,7 +406,8 @@ static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes) /* If anyone ever does: CALL/JMP *%rsp, we're in deep trouble. */ BUG_ON(reg == 4); - if (cpu_feature_enabled(X86_FEATURE_RETPOLINE)) + if (cpu_feature_enabled(X86_FEATURE_RETPOLINE) && + !cpu_feature_enabled(X86_FEATURE_RETPOLINE_AMD)) return -1; op = insn->opcode.bytes[0]; @@ -418,8 +420,9 @@ static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes) * into: * * Jncc.d8 1f + * [ LFENCE ] * JMP *%\reg - * NOP + * [ NOP ] * 1: */ /* Jcc.d32 second opcode byte is in the range: 0x80-0x8f */ @@ -434,6 +437,15 @@ static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes) op = JMP32_INSN_OPCODE; } + /* + * For RETPOLINE_AMD: prepend the indirect CALL/JMP with an LFENCE. + */ + if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_AMD)) { + bytes[i++] = 0x0f; + bytes[i++] = 0xae; + bytes[i++] = 0xe8; /* LFENCE */ + } + ret = emit_indirect(op, reg, bytes + i); if (ret < 0) return ret; From d4b5a5c993009ffeb5febe3b701da3faab6adb96 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 Oct 2021 14:01:45 +0200 Subject: [PATCH 38/41] x86/alternative: Add debug prints to apply_retpolines() Make sure we can see the text changes when booting with 'debug-alternative'. 
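To help decode such dumps, here is a stand-alone sketch (editorial, not part of this patch) of the Jcc variant of the rewrite introduced two patches earlier, assuming the plain spectre_v2=off shape and a low register; rewrite_jcc is a made-up name:

  /* Sketch: rewrite a 6-byte "0f 8x rel32" Jcc to a retpoline thunk into
   * an inverted short Jcc over a direct "jmp *%reg", plus NOP padding. */
  #include <stdio.h>
  #include <stdint.h>

  static int rewrite_jcc(const uint8_t *insn, int len, int reg, uint8_t *out)
  {
  	int i = 0;

  	/* only the simple case: Jcc.d32 (0f 80..8f) and a low register */
  	if (insn[0] != 0x0f || (insn[1] & 0xf0) != 0x80 || reg >= 8)
  		return -1;

  	out[i++] = 0x70 + ((insn[1] & 0x0f) ^ 1);	/* inverted Jcc.d8 */
  	out[i++] = len - 2;	/* hop over the rest of the original slot */
  	out[i++] = 0xff;	/* jmp *%reg */
  	out[i++] = 0xe0 + reg;
  	while (i < len)
  		out[i++] = 0x90;			/* NOP1 padding */
  	return i;
  }

  int main(void)
  {
  	/* "je __x86_indirect_thunk_rax" as clang might emit it: 0f 84 rel32 */
  	const uint8_t insn[6] = { 0x0f, 0x84, 0, 0, 0, 0 };
  	uint8_t out[6];
  	int n = rewrite_jcc(insn, sizeof(insn), 0 /* rax */, out);

  	for (int i = 0; i < n; i++)
  		printf("%02x ", out[i]);
  	printf("\n");	/* 75 04 ff e0 90 90: jne over 4 bytes; jmp *%rax; NOPs */
  	return 0;
  }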
Example output: [ ] SMP alternatives: retpoline at: __traceiter_initcall_level+0x1f/0x30 (ffffffff8100066f) len: 5 to: __x86_indirect_thunk_rax+0x0/0x20 [ ] SMP alternatives: ffffffff82603e58: [2:5) optimized NOPs: ff d0 0f 1f 00 [ ] SMP alternatives: ffffffff8100066f: orig: e8 cc 30 00 01 [ ] SMP alternatives: ffffffff8100066f: repl: ff d0 0f 1f 00 Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Borislav Petkov Acked-by: Josh Poimboeuf Tested-by: Alexei Starovoitov Link: https://lore.kernel.org/r/20211026120310.422273830@infradead.org --- arch/x86/kernel/alternative.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index c89824c69ff7..23fb4d51a5da 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -492,9 +492,15 @@ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) continue; } + DPRINTK("retpoline at: %pS (%px) len: %d to: %pS", + addr, addr, insn.length, + addr + insn.length + insn.immediate.value); + len = patch_retpoline(addr, &insn, bytes); if (len == insn.length) { optimize_nops(bytes, len); + DUMP_BYTES(((u8*)addr), len, "%px: orig: ", addr); + DUMP_BYTES(((u8*)bytes), len, "%px: repl: ", addr); text_poke_early(addr, bytes, len); } } From f8a66d608a3e471e1202778c2a36cbdc96bae73b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 Oct 2021 14:01:46 +0200 Subject: [PATCH 39/41] x86,bugs: Unconditionally allow spectre_v2=retpoline,amd Currently Linux prevents usage of retpoline,amd on !AMD hardware, this is unfriendly and gets in the way of testing. Remove this restriction. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Borislav Petkov Acked-by: Josh Poimboeuf Tested-by: Alexei Starovoitov Link: https://lore.kernel.org/r/20211026120310.487348118@infradead.org --- arch/x86/kernel/cpu/bugs.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index ecfca3bbcd96..ba43597f1027 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -882,13 +882,6 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) return SPECTRE_V2_CMD_AUTO; } - if (cmd == SPECTRE_V2_CMD_RETPOLINE_AMD && - boot_cpu_data.x86_vendor != X86_VENDOR_HYGON && - boot_cpu_data.x86_vendor != X86_VENDOR_AMD) { - pr_err("retpoline,amd selected but CPU is not AMD. Switching to AUTO select\n"); - return SPECTRE_V2_CMD_AUTO; - } - spec_v2_print_cond(mitigation_options[i].option, mitigation_options[i].secure); return cmd; From dceba0817ca329868a15e2e1dd46eb6340b69206 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 Oct 2021 14:01:47 +0200 Subject: [PATCH 40/41] bpf,x86: Simplify computing label offsets Take an idea from the 32bit JIT, which uses the multi-pass nature of the JIT to compute the instruction offsets on a prior pass in order to compute the relative jump offsets on a later pass. Application to the x86_64 JIT is slightly more involved because the offsets depend on program variables (such as callee_regs_used and stack_depth) and hence the computed offsets need to be kept in the context of the JIT. This removes, IMO quite fragile, code that hard-codes the offsets and tries to compute the length of variable parts of it. Convert both emit_bpf_tail_call_*() functions which have an out: label at the end. 
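The mechanism is easier to see in isolation; below is a stand-alone sketch (editorial; struct ctx and emit() are made-up miniatures of the real JIT context and emitter) showing how the first pass emits with a stale label position but records where the label really lands, so that the second pass, which the JIT performs anyway, produces the correct forward offsets:

  /* Sketch of the two-pass label trick: record the "out:" position on one
   * pass, use it for the forward branches on the next. */
  #include <stdio.h>

  struct ctx {
  	int out_label;		/* like ctx->tail_call_indirect_label */
  };

  static int emit(struct ctx *ctx, int *branch)
  {
  	int pos = 0;

  	branch[0] = ctx->out_label - (pos + 2);	/* jbe out (rel8 from insn end) */
  	pos += 2;
  	pos += 7;				/* ...some body... */

  	branch[1] = ctx->out_label - (pos + 2);	/* ja out */
  	pos += 2;
  	pos += 11;				/* ...more body... */

  	ctx->out_label = pos;			/* out: record where it landed */
  	return pos;
  }

  int main(void)
  {
  	struct ctx ctx = { 0 };
  	int branch[2];

  	for (int pass = 1; pass <= 2; pass++) {
  		int len = emit(&ctx, branch);
  		printf("pass %d: len=%d jbe=%+d ja=%+d\n",
  		       pass, len, branch[0], branch[1]);
  	}
  	/* pass 1 prints bogus offsets; pass 2 prints jbe=+20 ja=+11,
  	 * both of which land exactly on out: */
  	return 0;
  }

The recorded positions are what the real patch keeps in ctx->tail_call_direct_label and ctx->tail_call_indirect_label.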
Additionally emit_bpt_tail_call_direct() also has a poke table entry, for which it computes the offset from the end (and thus already relies on the previous pass to have computed addrs[i]), also convert this to be a forward based offset. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Borislav Petkov Acked-by: Alexei Starovoitov Acked-by: Josh Poimboeuf Tested-by: Alexei Starovoitov Link: https://lore.kernel.org/r/20211026120310.552304864@infradead.org --- arch/x86/net/bpf_jit_comp.c | 123 ++++++++++++------------------------ 1 file changed, 42 insertions(+), 81 deletions(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 7c03af65a7de..b5c5fb42fd59 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -224,6 +224,14 @@ static void jit_fill_hole(void *area, unsigned int size) struct jit_context { int cleanup_addr; /* Epilogue code offset */ + + /* + * Program specific offsets of labels in the code; these rely on the + * JIT doing at least 2 passes, recording the position on the first + * pass, only to generate the correct offset on the second pass. + */ + int tail_call_direct_label; + int tail_call_indirect_label; }; /* Maximum number of bytes emitted while JITing one eBPF insn */ @@ -379,22 +387,6 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, return __bpf_arch_text_poke(ip, t, old_addr, new_addr, true); } -static int get_pop_bytes(bool *callee_regs_used) -{ - int bytes = 0; - - if (callee_regs_used[3]) - bytes += 2; - if (callee_regs_used[2]) - bytes += 2; - if (callee_regs_used[1]) - bytes += 2; - if (callee_regs_used[0]) - bytes += 1; - - return bytes; -} - /* * Generate the following code: * @@ -410,29 +402,12 @@ static int get_pop_bytes(bool *callee_regs_used) * out: */ static void emit_bpf_tail_call_indirect(u8 **pprog, bool *callee_regs_used, - u32 stack_depth) + u32 stack_depth, u8 *ip, + struct jit_context *ctx) { int tcc_off = -4 - round_up(stack_depth, 8); - u8 *prog = *pprog; - int pop_bytes = 0; - int off1 = 42; - int off2 = 31; - int off3 = 9; - - /* count the additional bytes used for popping callee regs from stack - * that need to be taken into account for each of the offsets that - * are used for bailing out of the tail call - */ - pop_bytes = get_pop_bytes(callee_regs_used); - off1 += pop_bytes; - off2 += pop_bytes; - off3 += pop_bytes; - - if (stack_depth) { - off1 += 7; - off2 += 7; - off3 += 7; - } + u8 *prog = *pprog, *start = *pprog; + int offset; /* * rdi - pointer to ctx @@ -447,8 +422,9 @@ static void emit_bpf_tail_call_indirect(u8 **pprog, bool *callee_regs_used, EMIT2(0x89, 0xD2); /* mov edx, edx */ EMIT3(0x39, 0x56, /* cmp dword ptr [rsi + 16], edx */ offsetof(struct bpf_array, map.max_entries)); -#define OFFSET1 (off1 + RETPOLINE_RCX_BPF_JIT_SIZE) /* Number of bytes to jump */ - EMIT2(X86_JBE, OFFSET1); /* jbe out */ + + offset = ctx->tail_call_indirect_label - (prog + 2 - start); + EMIT2(X86_JBE, offset); /* jbe out */ /* * if (tail_call_cnt > MAX_TAIL_CALL_CNT) @@ -456,8 +432,9 @@ static void emit_bpf_tail_call_indirect(u8 **pprog, bool *callee_regs_used, */ EMIT2_off32(0x8B, 0x85, tcc_off); /* mov eax, dword ptr [rbp - tcc_off] */ EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT); /* cmp eax, MAX_TAIL_CALL_CNT */ -#define OFFSET2 (off2 + RETPOLINE_RCX_BPF_JIT_SIZE) - EMIT2(X86_JA, OFFSET2); /* ja out */ + + offset = ctx->tail_call_indirect_label - (prog + 2 - start); + EMIT2(X86_JA, offset); /* ja out */ EMIT3(0x83, 0xC0, 0x01); /* add eax, 1 */ EMIT2_off32(0x89, 0x85, tcc_off); /* mov dword ptr 
[rbp - tcc_off], eax */ @@ -470,12 +447,11 @@ static void emit_bpf_tail_call_indirect(u8 **pprog, bool *callee_regs_used, * goto out; */ EMIT3(0x48, 0x85, 0xC9); /* test rcx,rcx */ -#define OFFSET3 (off3 + RETPOLINE_RCX_BPF_JIT_SIZE) - EMIT2(X86_JE, OFFSET3); /* je out */ - *pprog = prog; - pop_callee_regs(pprog, callee_regs_used); - prog = *pprog; + offset = ctx->tail_call_indirect_label - (prog + 2 - start); + EMIT2(X86_JE, offset); /* je out */ + + pop_callee_regs(&prog, callee_regs_used); EMIT1(0x58); /* pop rax */ if (stack_depth) @@ -495,38 +471,18 @@ static void emit_bpf_tail_call_indirect(u8 **pprog, bool *callee_regs_used, RETPOLINE_RCX_BPF_JIT(); /* out: */ + ctx->tail_call_indirect_label = prog - start; *pprog = prog; } static void emit_bpf_tail_call_direct(struct bpf_jit_poke_descriptor *poke, - u8 **pprog, int addr, u8 *image, - bool *callee_regs_used, u32 stack_depth) + u8 **pprog, u8 *ip, + bool *callee_regs_used, u32 stack_depth, + struct jit_context *ctx) { int tcc_off = -4 - round_up(stack_depth, 8); - u8 *prog = *pprog; - int pop_bytes = 0; - int off1 = 20; - int poke_off; - - /* count the additional bytes used for popping callee regs to stack - * that need to be taken into account for jump offset that is used for - * bailing out from of the tail call when limit is reached - */ - pop_bytes = get_pop_bytes(callee_regs_used); - off1 += pop_bytes; - - /* - * total bytes for: - * - nop5/ jmpq $off - * - pop callee regs - * - sub rsp, $val if depth > 0 - * - pop rax - */ - poke_off = X86_PATCH_SIZE + pop_bytes + 1; - if (stack_depth) { - poke_off += 7; - off1 += 7; - } + u8 *prog = *pprog, *start = *pprog; + int offset; /* * if (tail_call_cnt > MAX_TAIL_CALL_CNT) @@ -534,28 +490,30 @@ static void emit_bpf_tail_call_direct(struct bpf_jit_poke_descriptor *poke, */ EMIT2_off32(0x8B, 0x85, tcc_off); /* mov eax, dword ptr [rbp - tcc_off] */ EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT); /* cmp eax, MAX_TAIL_CALL_CNT */ - EMIT2(X86_JA, off1); /* ja out */ + + offset = ctx->tail_call_direct_label - (prog + 2 - start); + EMIT2(X86_JA, offset); /* ja out */ EMIT3(0x83, 0xC0, 0x01); /* add eax, 1 */ EMIT2_off32(0x89, 0x85, tcc_off); /* mov dword ptr [rbp - tcc_off], eax */ - poke->tailcall_bypass = image + (addr - poke_off - X86_PATCH_SIZE); + poke->tailcall_bypass = ip + (prog - start); poke->adj_off = X86_TAIL_CALL_OFFSET; - poke->tailcall_target = image + (addr - X86_PATCH_SIZE); + poke->tailcall_target = ip + ctx->tail_call_direct_label - X86_PATCH_SIZE; poke->bypass_addr = (u8 *)poke->tailcall_target + X86_PATCH_SIZE; emit_jump(&prog, (u8 *)poke->tailcall_target + X86_PATCH_SIZE, poke->tailcall_bypass); - *pprog = prog; - pop_callee_regs(pprog, callee_regs_used); - prog = *pprog; + pop_callee_regs(&prog, callee_regs_used); EMIT1(0x58); /* pop rax */ if (stack_depth) EMIT3_off32(0x48, 0x81, 0xC4, round_up(stack_depth, 8)); memcpy(prog, x86_nops[5], X86_PATCH_SIZE); prog += X86_PATCH_SIZE; + /* out: */ + ctx->tail_call_direct_label = prog - start; *pprog = prog; } @@ -1411,13 +1369,16 @@ st: if (is_imm8(insn->off)) case BPF_JMP | BPF_TAIL_CALL: if (imm32) emit_bpf_tail_call_direct(&bpf_prog->aux->poke_tab[imm32 - 1], - &prog, addrs[i], image, + &prog, image + addrs[i - 1], callee_regs_used, - bpf_prog->aux->stack_depth); + bpf_prog->aux->stack_depth, + ctx); else emit_bpf_tail_call_indirect(&prog, callee_regs_used, - bpf_prog->aux->stack_depth); + bpf_prog->aux->stack_depth, + image + addrs[i - 1], + ctx); break; /* cond jump */ From 87c87ecd00c54ecd677798cb49ef27329e0fab41 Mon Sep 17 
00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 Oct 2021 14:01:48 +0200 Subject: [PATCH 41/41] bpf,x86: Respect X86_FEATURE_RETPOLINE* Current BPF codegen doesn't respect X86_FEATURE_RETPOLINE* flags and unconditionally emits a thunk call, this is sub-optimal and doesn't match the regular, compiler generated, code. Update the i386 JIT to emit code equal to what the compiler emits for the regular kernel text (IOW. a plain THUNK call). Update the x86_64 JIT to emit code similar to the result of compiler and kernel rewrites as according to X86_FEATURE_RETPOLINE* flags. Inlining RETPOLINE_AMD (lfence; jmp *%reg) and !RETPOLINE (jmp *%reg), while doing a THUNK call for RETPOLINE. This removes the hard-coded retpoline thunks and shrinks the generated code. Leaving a single retpoline thunk definition in the kernel. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Borislav Petkov Acked-by: Alexei Starovoitov Acked-by: Josh Poimboeuf Tested-by: Alexei Starovoitov Link: https://lore.kernel.org/r/20211026120310.614772675@infradead.org --- arch/x86/include/asm/nospec-branch.h | 59 ---------------------------- arch/x86/net/bpf_jit_comp.c | 46 +++++++++++----------- arch/x86/net/bpf_jit_comp32.c | 22 +++++++++-- 3 files changed, 41 insertions(+), 86 deletions(-) diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index e22aedbeb668..cc74dc584836 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -316,63 +316,4 @@ static inline void mds_idle_clear_cpu_buffers(void) #endif /* __ASSEMBLY__ */ -/* - * Below is used in the eBPF JIT compiler and emits the byte sequence - * for the following assembly: - * - * With retpolines configured: - * - * callq do_rop - * spec_trap: - * pause - * lfence - * jmp spec_trap - * do_rop: - * mov %rcx,(%rsp) for x86_64 - * mov %edx,(%esp) for x86_32 - * retq - * - * Without retpolines configured: - * - * jmp *%rcx for x86_64 - * jmp *%edx for x86_32 - */ -#ifdef CONFIG_RETPOLINE -# ifdef CONFIG_X86_64 -# define RETPOLINE_RCX_BPF_JIT_SIZE 17 -# define RETPOLINE_RCX_BPF_JIT() \ -do { \ - EMIT1_off32(0xE8, 7); /* callq do_rop */ \ - /* spec_trap: */ \ - EMIT2(0xF3, 0x90); /* pause */ \ - EMIT3(0x0F, 0xAE, 0xE8); /* lfence */ \ - EMIT2(0xEB, 0xF9); /* jmp spec_trap */ \ - /* do_rop: */ \ - EMIT4(0x48, 0x89, 0x0C, 0x24); /* mov %rcx,(%rsp) */ \ - EMIT1(0xC3); /* retq */ \ -} while (0) -# else /* !CONFIG_X86_64 */ -# define RETPOLINE_EDX_BPF_JIT() \ -do { \ - EMIT1_off32(0xE8, 7); /* call do_rop */ \ - /* spec_trap: */ \ - EMIT2(0xF3, 0x90); /* pause */ \ - EMIT3(0x0F, 0xAE, 0xE8); /* lfence */ \ - EMIT2(0xEB, 0xF9); /* jmp spec_trap */ \ - /* do_rop: */ \ - EMIT3(0x89, 0x14, 0x24); /* mov %edx,(%esp) */ \ - EMIT1(0xC3); /* ret */ \ -} while (0) -# endif -#else /* !CONFIG_RETPOLINE */ -# ifdef CONFIG_X86_64 -# define RETPOLINE_RCX_BPF_JIT_SIZE 2 -# define RETPOLINE_RCX_BPF_JIT() \ - EMIT2(0xFF, 0xE1); /* jmp *%rcx */ -# else /* !CONFIG_X86_64 */ -# define RETPOLINE_EDX_BPF_JIT() \ - EMIT2(0xFF, 0xE2) /* jmp *%edx */ -# endif -#endif - #endif /* _ASM_X86_NOSPEC_BRANCH_H_ */ diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index b5c5fb42fd59..39c802525fce 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -387,6 +387,25 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, return __bpf_arch_text_poke(ip, t, old_addr, new_addr, true); } +#define EMIT_LFENCE() EMIT3(0x0F, 0xAE, 0xE8) + +static void emit_indirect_jump(u8 **pprog, int reg, u8 *ip) 
+{ + u8 *prog = *pprog; + +#ifdef CONFIG_RETPOLINE + if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_AMD)) { + EMIT_LFENCE(); + EMIT2(0xFF, 0xE0 + reg); + } else if (cpu_feature_enabled(X86_FEATURE_RETPOLINE)) { + emit_jump(&prog, &__x86_indirect_thunk_array[reg], ip); + } else +#endif + EMIT2(0xFF, 0xE0 + reg); + + *pprog = prog; +} + /* * Generate the following code: * @@ -468,7 +487,7 @@ static void emit_bpf_tail_call_indirect(u8 **pprog, bool *callee_regs_used, * rdi == ctx (1st arg) * rcx == prog->bpf_func + X86_TAIL_CALL_OFFSET */ - RETPOLINE_RCX_BPF_JIT(); + emit_indirect_jump(&prog, 1 /* rcx */, ip + (prog - start)); /* out: */ ctx->tail_call_indirect_label = prog - start; @@ -1179,8 +1198,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, /* speculation barrier */ case BPF_ST | BPF_NOSPEC: if (boot_cpu_has(X86_FEATURE_XMM2)) - /* Emit 'lfence' */ - EMIT3(0x0F, 0xAE, 0xE8); + EMIT_LFENCE(); break; /* ST: *(u8*)(dst_reg + off) = imm */ @@ -2084,24 +2102,6 @@ cleanup: return ret; } -static int emit_fallback_jump(u8 **pprog) -{ - u8 *prog = *pprog; - int err = 0; - -#ifdef CONFIG_RETPOLINE - /* Note that this assumes the the compiler uses external - * thunks for indirect calls. Both clang and GCC use the same - * naming convention for external thunks. - */ - err = emit_jump(&prog, __x86_indirect_thunk_rdx, prog); -#else - EMIT2(0xFF, 0xE2); /* jmp rdx */ -#endif - *pprog = prog; - return err; -} - static int emit_bpf_dispatcher(u8 **pprog, int a, int b, s64 *progs) { u8 *jg_reloc, *prog = *pprog; @@ -2123,9 +2123,7 @@ static int emit_bpf_dispatcher(u8 **pprog, int a, int b, s64 *progs) if (err) return err; - err = emit_fallback_jump(&prog); /* jmp thunk/indirect */ - if (err) - return err; + emit_indirect_jump(&prog, 2 /* rdx */, prog); *pprog = prog; return 0; diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/bpf_jit_comp32.c index 3bfda5f502cb..da9b7cfa4632 100644 --- a/arch/x86/net/bpf_jit_comp32.c +++ b/arch/x86/net/bpf_jit_comp32.c @@ -15,6 +15,7 @@ #include #include #include +#include #include /* @@ -1267,6 +1268,21 @@ static void emit_epilogue(u8 **pprog, u32 stack_depth) *pprog = prog; } +static int emit_jmp_edx(u8 **pprog, u8 *ip) +{ + u8 *prog = *pprog; + int cnt = 0; + +#ifdef CONFIG_RETPOLINE + EMIT1_off32(0xE9, (u8 *)__x86_indirect_thunk_edx - (ip + 5)); +#else + EMIT2(0xFF, 0xE2); +#endif + *pprog = prog; + + return cnt; +} + /* * Generate the following code: * ... bpf_tail_call(void *ctx, struct bpf_array *array, u64 index) ... @@ -1280,7 +1296,7 @@ static void emit_epilogue(u8 **pprog, u32 stack_depth) * goto *(prog->bpf_func + prologue_size); * out: */ -static void emit_bpf_tail_call(u8 **pprog) +static void emit_bpf_tail_call(u8 **pprog, u8 *ip) { u8 *prog = *pprog; int cnt = 0; @@ -1362,7 +1378,7 @@ static void emit_bpf_tail_call(u8 **pprog) * eax == ctx (1st arg) * edx == prog->bpf_func + prologue_size */ - RETPOLINE_EDX_BPF_JIT(); + cnt += emit_jmp_edx(&prog, ip + cnt); if (jmp_label1 == -1) jmp_label1 = cnt; @@ -2122,7 +2138,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, break; } case BPF_JMP | BPF_TAIL_CALL: - emit_bpf_tail_call(&prog); + emit_bpf_tail_call(&prog, image + addrs[i - 1]); break; /* cond jump */