Merge branch 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 mm updates from Ingo Molnar: "The changes in here are: - text_poke() fixes and an extensive set of executability lockdowns, to (hopefully) eliminate the last residual circumstances under which we are using W|X mappings even temporarily on x86 kernels. This required a broad range of surgery in text patching facilities, module loading, trampoline handling and other bits. - tweak page fault messages to be more informative and more structured. - remove DISCONTIGMEM support on x86-32 and make SPARSEMEM the default. - reduce KASLR granularity on 5-level paging kernels from 512 GB to 1 GB. - misc other changes and updates" * 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (36 commits) x86/mm: Initialize PGD cache during mm initialization x86/alternatives: Add comment about module removal races x86/kprobes: Use vmalloc special flag x86/ftrace: Use vmalloc special flag bpf: Use vmalloc special flag modules: Use vmalloc special flag mm/vmalloc: Add flag for freeing of special permsissions mm/hibernation: Make hibernation handle unmapped pages x86/mm/cpa: Add set_direct_map_*() functions x86/alternatives: Remove the return value of text_poke_*() x86/jump-label: Remove support for custom text poker x86/modules: Avoid breaking W^X while loading modules x86/kprobes: Set instruction page as executable x86/ftrace: Set trampoline pages as executable x86/kgdb: Avoid redundant comparison of patched code x86/alternatives: Use temporary mm for text poking x86/alternatives: Initialize temporary mm for patching fork: Provide a function for copying init_mm uprobes: Initialize uprobes earlier x86/mm: Save debug registers when loading a temporary mm ...
This commit is contained in:
commit
0bc40e549a
@ -72,7 +72,7 @@ Complete virtual memory map with 5-level page tables
|
||||
Notes:
|
||||
|
||||
- With 56-bit addresses, user-space memory gets expanded by a factor of 512x,
|
||||
from 0.125 PB to 64 PB. All kernel mappings shift down to the -64 PT starting
|
||||
from 0.125 PB to 64 PB. All kernel mappings shift down to the -64 PB starting
|
||||
offset and many of the regions expand to support the much larger physical
|
||||
memory supported.
|
||||
|
||||
@ -83,7 +83,7 @@ Notes:
|
||||
0000000000000000 | 0 | 00ffffffffffffff | 64 PB | user-space virtual memory, different per mm
|
||||
__________________|____________|__________________|_________|___________________________________________________________
|
||||
| | | |
|
||||
0000800000000000 | +64 PB | ffff7fffffffffff | ~16K PB | ... huge, still almost 64 bits wide hole of non-canonical
|
||||
0100000000000000 | +64 PB | feffffffffffffff | ~16K PB | ... huge, still almost 64 bits wide hole of non-canonical
|
||||
| | | | virtual memory addresses up to the -64 PB
|
||||
| | | | starting offset of kernel mappings.
|
||||
__________________|____________|__________________|_________|___________________________________________________________
|
||||
@ -99,7 +99,7 @@ ____________________________________________________________|___________________
|
||||
ffd2000000000000 | -11.5 PB | ffd3ffffffffffff | 0.5 PB | ... unused hole
|
||||
ffd4000000000000 | -11 PB | ffd5ffffffffffff | 0.5 PB | virtual memory map (vmemmap_base)
|
||||
ffd6000000000000 | -10.5 PB | ffdeffffffffffff | 2.25 PB | ... unused hole
|
||||
ffdf000000000000 | -8.25 PB | fffffdffffffffff | ~8 PB | KASAN shadow memory
|
||||
ffdf000000000000 | -8.25 PB | fffffbffffffffff | ~8 PB | KASAN shadow memory
|
||||
__________________|____________|__________________|_________|____________________________________________________________
|
||||
|
|
||||
| Identical layout to the 47-bit one from here on:
|
||||
|
@ -249,6 +249,10 @@ config ARCH_HAS_FORTIFY_SOURCE
|
||||
config ARCH_HAS_SET_MEMORY
|
||||
bool
|
||||
|
||||
# Select if arch has all set_direct_map_invalid/default() functions
|
||||
config ARCH_HAS_SET_DIRECT_MAP
|
||||
bool
|
||||
|
||||
# Select if arch init_task must go in the __init_task_data section
|
||||
config ARCH_TASK_STRUCT_ON_STACK
|
||||
bool
|
||||
|
@ -65,6 +65,7 @@ config X86
|
||||
select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64
|
||||
select ARCH_HAS_UACCESS_MCSAFE if X86_64 && X86_MCE
|
||||
select ARCH_HAS_SET_MEMORY
|
||||
select ARCH_HAS_SET_DIRECT_MAP
|
||||
select ARCH_HAS_STRICT_KERNEL_RWX
|
||||
select ARCH_HAS_STRICT_MODULE_RWX
|
||||
select ARCH_HAS_SYNC_CORE_BEFORE_USERMODE
|
||||
@ -1592,12 +1593,9 @@ config ARCH_FLATMEM_ENABLE
|
||||
depends on X86_32 && !NUMA
|
||||
|
||||
config ARCH_DISCONTIGMEM_ENABLE
|
||||
def_bool y
|
||||
depends on NUMA && X86_32
|
||||
|
||||
config ARCH_DISCONTIGMEM_DEFAULT
|
||||
def_bool y
|
||||
def_bool n
|
||||
depends on NUMA && X86_32
|
||||
depends on BROKEN
|
||||
|
||||
config ARCH_SPARSEMEM_ENABLE
|
||||
def_bool y
|
||||
@ -1606,8 +1604,7 @@ config ARCH_SPARSEMEM_ENABLE
|
||||
select SPARSEMEM_VMEMMAP_ENABLE if X86_64
|
||||
|
||||
config ARCH_SPARSEMEM_DEFAULT
|
||||
def_bool y
|
||||
depends on X86_64
|
||||
def_bool X86_64 || (NUMA && X86_32)
|
||||
|
||||
config ARCH_SELECT_MEMORY_MODEL
|
||||
def_bool y
|
||||
|
@ -103,8 +103,6 @@ enum fixed_addresses {
|
||||
#ifdef CONFIG_PARAVIRT
|
||||
FIX_PARAVIRT_BOOTMAP,
|
||||
#endif
|
||||
FIX_TEXT_POKE1, /* reserve 2 pages for text_poke() */
|
||||
FIX_TEXT_POKE0, /* first page is last, because allocation is backward */
|
||||
#ifdef CONFIG_X86_INTEL_MID
|
||||
FIX_LNW_VRTC,
|
||||
#endif
|
||||
|
@ -13,6 +13,7 @@
|
||||
#include <asm/tlbflush.h>
|
||||
#include <asm/paravirt.h>
|
||||
#include <asm/mpx.h>
|
||||
#include <asm/debugreg.h>
|
||||
|
||||
extern atomic64_t last_mm_ctx_id;
|
||||
|
||||
@ -356,4 +357,59 @@ static inline unsigned long __get_current_cr3_fast(void)
|
||||
return cr3;
|
||||
}
|
||||
|
||||
typedef struct {
|
||||
struct mm_struct *mm;
|
||||
} temp_mm_state_t;
|
||||
|
||||
/*
|
||||
* Using a temporary mm allows to set temporary mappings that are not accessible
|
||||
* by other CPUs. Such mappings are needed to perform sensitive memory writes
|
||||
* that override the kernel memory protections (e.g., W^X), without exposing the
|
||||
* temporary page-table mappings that are required for these write operations to
|
||||
* other CPUs. Using a temporary mm also allows to avoid TLB shootdowns when the
|
||||
* mapping is torn down.
|
||||
*
|
||||
* Context: The temporary mm needs to be used exclusively by a single core. To
|
||||
* harden security IRQs must be disabled while the temporary mm is
|
||||
* loaded, thereby preventing interrupt handler bugs from overriding
|
||||
* the kernel memory protection.
|
||||
*/
|
||||
static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm)
|
||||
{
|
||||
temp_mm_state_t temp_state;
|
||||
|
||||
lockdep_assert_irqs_disabled();
|
||||
temp_state.mm = this_cpu_read(cpu_tlbstate.loaded_mm);
|
||||
switch_mm_irqs_off(NULL, mm, current);
|
||||
|
||||
/*
|
||||
* If breakpoints are enabled, disable them while the temporary mm is
|
||||
* used. Userspace might set up watchpoints on addresses that are used
|
||||
* in the temporary mm, which would lead to wrong signals being sent or
|
||||
* crashes.
|
||||
*
|
||||
* Note that breakpoints are not disabled selectively, which also causes
|
||||
* kernel breakpoints (e.g., perf's) to be disabled. This might be
|
||||
* undesirable, but still seems reasonable as the code that runs in the
|
||||
* temporary mm should be short.
|
||||
*/
|
||||
if (hw_breakpoint_active())
|
||||
hw_breakpoint_disable();
|
||||
|
||||
return temp_state;
|
||||
}
|
||||
|
||||
static inline void unuse_temporary_mm(temp_mm_state_t prev_state)
|
||||
{
|
||||
lockdep_assert_irqs_disabled();
|
||||
switch_mm_irqs_off(NULL, prev_state.mm, current);
|
||||
|
||||
/*
|
||||
* Restore the breakpoints if they were disabled before the temporary mm
|
||||
* was loaded.
|
||||
*/
|
||||
if (hw_breakpoint_active())
|
||||
hw_breakpoint_restore();
|
||||
}
|
||||
|
||||
#endif /* _ASM_X86_MMU_CONTEXT_H */
|
||||
|
@ -1021,6 +1021,9 @@ static inline void __meminit init_trampoline_default(void)
|
||||
/* Default trampoline pgd value */
|
||||
trampoline_pgd_entry = init_top_pgt[pgd_index(__PAGE_OFFSET)];
|
||||
}
|
||||
|
||||
void __init poking_init(void);
|
||||
|
||||
# ifdef CONFIG_RANDOMIZE_MEMORY
|
||||
void __meminit init_trampoline(void);
|
||||
# else
|
||||
|
@ -85,6 +85,9 @@ int set_pages_nx(struct page *page, int numpages);
|
||||
int set_pages_ro(struct page *page, int numpages);
|
||||
int set_pages_rw(struct page *page, int numpages);
|
||||
|
||||
int set_direct_map_invalid_noflush(struct page *page);
|
||||
int set_direct_map_default_noflush(struct page *page);
|
||||
|
||||
extern int kernel_set_to_readonly;
|
||||
void set_kernel_text_rw(void);
|
||||
void set_kernel_text_ro(void);
|
||||
|
@ -18,7 +18,7 @@ static inline void apply_paravirt(struct paravirt_patch_site *start,
|
||||
#define __parainstructions_end NULL
|
||||
#endif
|
||||
|
||||
extern void *text_poke_early(void *addr, const void *opcode, size_t len);
|
||||
extern void text_poke_early(void *addr, const void *opcode, size_t len);
|
||||
|
||||
/*
|
||||
* Clear and restore the kernel write-protection flag on the local CPU.
|
||||
@ -35,8 +35,11 @@ extern void *text_poke_early(void *addr, const void *opcode, size_t len);
|
||||
* inconsistent instruction while you patch.
|
||||
*/
|
||||
extern void *text_poke(void *addr, const void *opcode, size_t len);
|
||||
extern void *text_poke_kgdb(void *addr, const void *opcode, size_t len);
|
||||
extern int poke_int3_handler(struct pt_regs *regs);
|
||||
extern void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler);
|
||||
extern void text_poke_bp(void *addr, const void *opcode, size_t len, void *handler);
|
||||
extern int after_bootmem;
|
||||
extern __ro_after_init struct mm_struct *poking_mm;
|
||||
extern __ro_after_init unsigned long poking_addr;
|
||||
|
||||
#endif /* _ASM_X86_TEXT_PATCHING_H */
|
||||
|
@ -274,6 +274,8 @@ static inline bool nmi_uaccess_okay(void)
|
||||
return true;
|
||||
}
|
||||
|
||||
#define nmi_uaccess_okay nmi_uaccess_okay
|
||||
|
||||
/* Initialize cr4 shadow for this CPU. */
|
||||
static inline void cr4_init_shadow(void)
|
||||
{
|
||||
|
@ -12,6 +12,7 @@
|
||||
#include <linux/slab.h>
|
||||
#include <linux/kdebug.h>
|
||||
#include <linux/kprobes.h>
|
||||
#include <linux/mmu_context.h>
|
||||
#include <asm/text-patching.h>
|
||||
#include <asm/alternative.h>
|
||||
#include <asm/sections.h>
|
||||
@ -264,7 +265,7 @@ static void __init_or_module add_nops(void *insns, unsigned int len)
|
||||
|
||||
extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
|
||||
extern s32 __smp_locks[], __smp_locks_end[];
|
||||
void *text_poke_early(void *addr, const void *opcode, size_t len);
|
||||
void text_poke_early(void *addr, const void *opcode, size_t len);
|
||||
|
||||
/*
|
||||
* Are we looking at a near JMP with a 1 or 4-byte displacement.
|
||||
@ -666,16 +667,136 @@ void __init alternative_instructions(void)
|
||||
* instructions. And on the local CPU you need to be protected against NMI or MCE
|
||||
* handlers seeing an inconsistent instruction while you patch.
|
||||
*/
|
||||
void *__init_or_module text_poke_early(void *addr, const void *opcode,
|
||||
size_t len)
|
||||
void __init_or_module text_poke_early(void *addr, const void *opcode,
|
||||
size_t len)
|
||||
{
|
||||
unsigned long flags;
|
||||
|
||||
if (boot_cpu_has(X86_FEATURE_NX) &&
|
||||
is_module_text_address((unsigned long)addr)) {
|
||||
/*
|
||||
* Modules text is marked initially as non-executable, so the
|
||||
* code cannot be running and speculative code-fetches are
|
||||
* prevented. Just change the code.
|
||||
*/
|
||||
memcpy(addr, opcode, len);
|
||||
} else {
|
||||
local_irq_save(flags);
|
||||
memcpy(addr, opcode, len);
|
||||
local_irq_restore(flags);
|
||||
sync_core();
|
||||
|
||||
/*
|
||||
* Could also do a CLFLUSH here to speed up CPU recovery; but
|
||||
* that causes hangs on some VIA CPUs.
|
||||
*/
|
||||
}
|
||||
}
|
||||
|
||||
__ro_after_init struct mm_struct *poking_mm;
|
||||
__ro_after_init unsigned long poking_addr;
|
||||
|
||||
static void *__text_poke(void *addr, const void *opcode, size_t len)
|
||||
{
|
||||
bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE;
|
||||
struct page *pages[2] = {NULL};
|
||||
temp_mm_state_t prev;
|
||||
unsigned long flags;
|
||||
pte_t pte, *ptep;
|
||||
spinlock_t *ptl;
|
||||
pgprot_t pgprot;
|
||||
|
||||
/*
|
||||
* While boot memory allocator is running we cannot use struct pages as
|
||||
* they are not yet initialized. There is no way to recover.
|
||||
*/
|
||||
BUG_ON(!after_bootmem);
|
||||
|
||||
if (!core_kernel_text((unsigned long)addr)) {
|
||||
pages[0] = vmalloc_to_page(addr);
|
||||
if (cross_page_boundary)
|
||||
pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
|
||||
} else {
|
||||
pages[0] = virt_to_page(addr);
|
||||
WARN_ON(!PageReserved(pages[0]));
|
||||
if (cross_page_boundary)
|
||||
pages[1] = virt_to_page(addr + PAGE_SIZE);
|
||||
}
|
||||
/*
|
||||
* If something went wrong, crash and burn since recovery paths are not
|
||||
* implemented.
|
||||
*/
|
||||
BUG_ON(!pages[0] || (cross_page_boundary && !pages[1]));
|
||||
|
||||
local_irq_save(flags);
|
||||
memcpy(addr, opcode, len);
|
||||
|
||||
/*
|
||||
* Map the page without the global bit, as TLB flushing is done with
|
||||
* flush_tlb_mm_range(), which is intended for non-global PTEs.
|
||||
*/
|
||||
pgprot = __pgprot(pgprot_val(PAGE_KERNEL) & ~_PAGE_GLOBAL);
|
||||
|
||||
/*
|
||||
* The lock is not really needed, but this allows to avoid open-coding.
|
||||
*/
|
||||
ptep = get_locked_pte(poking_mm, poking_addr, &ptl);
|
||||
|
||||
/*
|
||||
* This must not fail; preallocated in poking_init().
|
||||
*/
|
||||
VM_BUG_ON(!ptep);
|
||||
|
||||
pte = mk_pte(pages[0], pgprot);
|
||||
set_pte_at(poking_mm, poking_addr, ptep, pte);
|
||||
|
||||
if (cross_page_boundary) {
|
||||
pte = mk_pte(pages[1], pgprot);
|
||||
set_pte_at(poking_mm, poking_addr + PAGE_SIZE, ptep + 1, pte);
|
||||
}
|
||||
|
||||
/*
|
||||
* Loading the temporary mm behaves as a compiler barrier, which
|
||||
* guarantees that the PTE will be set at the time memcpy() is done.
|
||||
*/
|
||||
prev = use_temporary_mm(poking_mm);
|
||||
|
||||
kasan_disable_current();
|
||||
memcpy((u8 *)poking_addr + offset_in_page(addr), opcode, len);
|
||||
kasan_enable_current();
|
||||
|
||||
/*
|
||||
* Ensure that the PTE is only cleared after the instructions of memcpy
|
||||
* were issued by using a compiler barrier.
|
||||
*/
|
||||
barrier();
|
||||
|
||||
pte_clear(poking_mm, poking_addr, ptep);
|
||||
if (cross_page_boundary)
|
||||
pte_clear(poking_mm, poking_addr + PAGE_SIZE, ptep + 1);
|
||||
|
||||
/*
|
||||
* Loading the previous page-table hierarchy requires a serializing
|
||||
* instruction that already allows the core to see the updated version.
|
||||
* Xen-PV is assumed to serialize execution in a similar manner.
|
||||
*/
|
||||
unuse_temporary_mm(prev);
|
||||
|
||||
/*
|
||||
* Flushing the TLB might involve IPIs, which would require enabled
|
||||
* IRQs, but not if the mm is not used, as it is in this point.
|
||||
*/
|
||||
flush_tlb_mm_range(poking_mm, poking_addr, poking_addr +
|
||||
(cross_page_boundary ? 2 : 1) * PAGE_SIZE,
|
||||
PAGE_SHIFT, false);
|
||||
|
||||
/*
|
||||
* If the text does not match what we just wrote then something is
|
||||
* fundamentally screwy; there's nothing we can really do about that.
|
||||
*/
|
||||
BUG_ON(memcmp(addr, opcode, len));
|
||||
|
||||
pte_unmap_unlock(ptep, ptl);
|
||||
local_irq_restore(flags);
|
||||
sync_core();
|
||||
/* Could also do a CLFLUSH here to speed up CPU recovery; but
|
||||
that causes hangs on some VIA CPUs. */
|
||||
return addr;
|
||||
}
|
||||
|
||||
@ -689,48 +810,36 @@ void *__init_or_module text_poke_early(void *addr, const void *opcode,
|
||||
* It means the size must be writable atomically and the address must be aligned
|
||||
* in a way that permits an atomic write. It also makes sure we fit on a single
|
||||
* page.
|
||||
*
|
||||
* Note that the caller must ensure that if the modified code is part of a
|
||||
* module, the module would not be removed during poking. This can be achieved
|
||||
* by registering a module notifier, and ordering module removal and patching
|
||||
* through a mutex.
|
||||
*/
|
||||
void *text_poke(void *addr, const void *opcode, size_t len)
|
||||
{
|
||||
unsigned long flags;
|
||||
char *vaddr;
|
||||
struct page *pages[2];
|
||||
int i;
|
||||
|
||||
/*
|
||||
* While boot memory allocator is runnig we cannot use struct
|
||||
* pages as they are not yet initialized.
|
||||
*/
|
||||
BUG_ON(!after_bootmem);
|
||||
|
||||
lockdep_assert_held(&text_mutex);
|
||||
|
||||
if (!core_kernel_text((unsigned long)addr)) {
|
||||
pages[0] = vmalloc_to_page(addr);
|
||||
pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
|
||||
} else {
|
||||
pages[0] = virt_to_page(addr);
|
||||
WARN_ON(!PageReserved(pages[0]));
|
||||
pages[1] = virt_to_page(addr + PAGE_SIZE);
|
||||
}
|
||||
BUG_ON(!pages[0]);
|
||||
local_irq_save(flags);
|
||||
set_fixmap(FIX_TEXT_POKE0, page_to_phys(pages[0]));
|
||||
if (pages[1])
|
||||
set_fixmap(FIX_TEXT_POKE1, page_to_phys(pages[1]));
|
||||
vaddr = (char *)fix_to_virt(FIX_TEXT_POKE0);
|
||||
memcpy(&vaddr[(unsigned long)addr & ~PAGE_MASK], opcode, len);
|
||||
clear_fixmap(FIX_TEXT_POKE0);
|
||||
if (pages[1])
|
||||
clear_fixmap(FIX_TEXT_POKE1);
|
||||
local_flush_tlb();
|
||||
sync_core();
|
||||
/* Could also do a CLFLUSH here to speed up CPU recovery; but
|
||||
that causes hangs on some VIA CPUs. */
|
||||
for (i = 0; i < len; i++)
|
||||
BUG_ON(((char *)addr)[i] != ((char *)opcode)[i]);
|
||||
local_irq_restore(flags);
|
||||
return addr;
|
||||
return __text_poke(addr, opcode, len);
|
||||
}
|
||||
|
||||
/**
|
||||
* text_poke_kgdb - Update instructions on a live kernel by kgdb
|
||||
* @addr: address to modify
|
||||
* @opcode: source of the copy
|
||||
* @len: length to copy
|
||||
*
|
||||
* Only atomic text poke/set should be allowed when not doing early patching.
|
||||
* It means the size must be writable atomically and the address must be aligned
|
||||
* in a way that permits an atomic write. It also makes sure we fit on a single
|
||||
* page.
|
||||
*
|
||||
* Context: should only be used by kgdb, which ensures no other core is running,
|
||||
* despite the fact it does not hold the text_mutex.
|
||||
*/
|
||||
void *text_poke_kgdb(void *addr, const void *opcode, size_t len)
|
||||
{
|
||||
return __text_poke(addr, opcode, len);
|
||||
}
|
||||
|
||||
static void do_sync_core(void *info)
|
||||
@ -788,7 +897,7 @@ NOKPROBE_SYMBOL(poke_int3_handler);
|
||||
* replacing opcode
|
||||
* - sync cores
|
||||
*/
|
||||
void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
|
||||
void text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
|
||||
{
|
||||
unsigned char int3 = 0xcc;
|
||||
|
||||
@ -830,7 +939,5 @@ void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
|
||||
* the writing of the new instruction.
|
||||
*/
|
||||
bp_patching_in_progress = false;
|
||||
|
||||
return addr;
|
||||
}
|
||||
|
||||
|
@ -678,12 +678,8 @@ static inline void *alloc_tramp(unsigned long size)
|
||||
{
|
||||
return module_alloc(size);
|
||||
}
|
||||
static inline void tramp_free(void *tramp, int size)
|
||||
static inline void tramp_free(void *tramp)
|
||||
{
|
||||
int npages = PAGE_ALIGN(size) >> PAGE_SHIFT;
|
||||
|
||||
set_memory_nx((unsigned long)tramp, npages);
|
||||
set_memory_rw((unsigned long)tramp, npages);
|
||||
module_memfree(tramp);
|
||||
}
|
||||
#else
|
||||
@ -692,7 +688,7 @@ static inline void *alloc_tramp(unsigned long size)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
static inline void tramp_free(void *tramp, int size) { }
|
||||
static inline void tramp_free(void *tramp) { }
|
||||
#endif
|
||||
|
||||
/* Defined as markers to the end of the ftrace default trampolines */
|
||||
@ -730,6 +726,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
|
||||
unsigned long end_offset;
|
||||
unsigned long op_offset;
|
||||
unsigned long offset;
|
||||
unsigned long npages;
|
||||
unsigned long size;
|
||||
unsigned long retq;
|
||||
unsigned long *ptr;
|
||||
@ -762,6 +759,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
|
||||
return 0;
|
||||
|
||||
*tramp_size = size + RET_SIZE + sizeof(void *);
|
||||
npages = DIV_ROUND_UP(*tramp_size, PAGE_SIZE);
|
||||
|
||||
/* Copy ftrace_caller onto the trampoline memory */
|
||||
ret = probe_kernel_read(trampoline, (void *)start_offset, size);
|
||||
@ -806,9 +804,17 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
|
||||
/* ALLOC_TRAMP flags lets us know we created it */
|
||||
ops->flags |= FTRACE_OPS_FL_ALLOC_TRAMP;
|
||||
|
||||
set_vm_flush_reset_perms(trampoline);
|
||||
|
||||
/*
|
||||
* Module allocation needs to be completed by making the page
|
||||
* executable. The page is still writable, which is a security hazard,
|
||||
* but anyhow ftrace breaks W^X completely.
|
||||
*/
|
||||
set_memory_x((unsigned long)trampoline, npages);
|
||||
return (unsigned long)trampoline;
|
||||
fail:
|
||||
tramp_free(trampoline, *tramp_size);
|
||||
tramp_free(trampoline);
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -939,7 +945,7 @@ void arch_ftrace_trampoline_free(struct ftrace_ops *ops)
|
||||
if (!ops || !(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP))
|
||||
return;
|
||||
|
||||
tramp_free((void *)ops->trampoline, ops->trampoline_size);
|
||||
tramp_free((void *)ops->trampoline);
|
||||
ops->trampoline = 0;
|
||||
}
|
||||
|
||||
|
@ -37,7 +37,6 @@ static void bug_at(unsigned char *ip, int line)
|
||||
|
||||
static void __ref __jump_label_transform(struct jump_entry *entry,
|
||||
enum jump_label_type type,
|
||||
void *(*poker)(void *, const void *, size_t),
|
||||
int init)
|
||||
{
|
||||
union jump_code_union jmp;
|
||||
@ -50,9 +49,6 @@ static void __ref __jump_label_transform(struct jump_entry *entry,
|
||||
jmp.offset = jump_entry_target(entry) -
|
||||
(jump_entry_code(entry) + JUMP_LABEL_NOP_SIZE);
|
||||
|
||||
if (early_boot_irqs_disabled)
|
||||
poker = text_poke_early;
|
||||
|
||||
if (type == JUMP_LABEL_JMP) {
|
||||
if (init) {
|
||||
expect = default_nop; line = __LINE__;
|
||||
@ -75,16 +71,19 @@ static void __ref __jump_label_transform(struct jump_entry *entry,
|
||||
bug_at((void *)jump_entry_code(entry), line);
|
||||
|
||||
/*
|
||||
* Make text_poke_bp() a default fallback poker.
|
||||
* As long as only a single processor is running and the code is still
|
||||
* not marked as RO, text_poke_early() can be used; Checking that
|
||||
* system_state is SYSTEM_BOOTING guarantees it. It will be set to
|
||||
* SYSTEM_SCHEDULING before other cores are awakened and before the
|
||||
* code is write-protected.
|
||||
*
|
||||
* At the time the change is being done, just ignore whether we
|
||||
* are doing nop -> jump or jump -> nop transition, and assume
|
||||
* always nop being the 'currently valid' instruction
|
||||
*
|
||||
*/
|
||||
if (poker) {
|
||||
(*poker)((void *)jump_entry_code(entry), code,
|
||||
JUMP_LABEL_NOP_SIZE);
|
||||
if (init || system_state == SYSTEM_BOOTING) {
|
||||
text_poke_early((void *)jump_entry_code(entry), code,
|
||||
JUMP_LABEL_NOP_SIZE);
|
||||
return;
|
||||
}
|
||||
|
||||
@ -96,7 +95,7 @@ void arch_jump_label_transform(struct jump_entry *entry,
|
||||
enum jump_label_type type)
|
||||
{
|
||||
mutex_lock(&text_mutex);
|
||||
__jump_label_transform(entry, type, NULL, 0);
|
||||
__jump_label_transform(entry, type, 0);
|
||||
mutex_unlock(&text_mutex);
|
||||
}
|
||||
|
||||
@ -126,5 +125,5 @@ __init_or_module void arch_jump_label_transform_static(struct jump_entry *entry,
|
||||
jlstate = JL_STATE_NO_UPDATE;
|
||||
}
|
||||
if (jlstate == JL_STATE_UPDATE)
|
||||
__jump_label_transform(entry, type, text_poke_early, 1);
|
||||
__jump_label_transform(entry, type, 1);
|
||||
}
|
||||
|
@ -747,7 +747,6 @@ void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long ip)
|
||||
int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt)
|
||||
{
|
||||
int err;
|
||||
char opc[BREAK_INSTR_SIZE];
|
||||
|
||||
bpt->type = BP_BREAKPOINT;
|
||||
err = probe_kernel_read(bpt->saved_instr, (char *)bpt->bpt_addr,
|
||||
@ -759,18 +758,13 @@ int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt)
|
||||
if (!err)
|
||||
return err;
|
||||
/*
|
||||
* It is safe to call text_poke() because normal kernel execution
|
||||
* It is safe to call text_poke_kgdb() because normal kernel execution
|
||||
* is stopped on all cores, so long as the text_mutex is not locked.
|
||||
*/
|
||||
if (mutex_is_locked(&text_mutex))
|
||||
return -EBUSY;
|
||||
text_poke((void *)bpt->bpt_addr, arch_kgdb_ops.gdb_bpt_instr,
|
||||
BREAK_INSTR_SIZE);
|
||||
err = probe_kernel_read(opc, (char *)bpt->bpt_addr, BREAK_INSTR_SIZE);
|
||||
if (err)
|
||||
return err;
|
||||
if (memcmp(opc, arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE))
|
||||
return -EINVAL;
|
||||
text_poke_kgdb((void *)bpt->bpt_addr, arch_kgdb_ops.gdb_bpt_instr,
|
||||
BREAK_INSTR_SIZE);
|
||||
bpt->type = BP_POKE_BREAKPOINT;
|
||||
|
||||
return err;
|
||||
@ -778,22 +772,17 @@ int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt)
|
||||
|
||||
int kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt)
|
||||
{
|
||||
int err;
|
||||
char opc[BREAK_INSTR_SIZE];
|
||||
|
||||
if (bpt->type != BP_POKE_BREAKPOINT)
|
||||
goto knl_write;
|
||||
/*
|
||||
* It is safe to call text_poke() because normal kernel execution
|
||||
* It is safe to call text_poke_kgdb() because normal kernel execution
|
||||
* is stopped on all cores, so long as the text_mutex is not locked.
|
||||
*/
|
||||
if (mutex_is_locked(&text_mutex))
|
||||
goto knl_write;
|
||||
text_poke((void *)bpt->bpt_addr, bpt->saved_instr, BREAK_INSTR_SIZE);
|
||||
err = probe_kernel_read(opc, (char *)bpt->bpt_addr, BREAK_INSTR_SIZE);
|
||||
if (err || memcmp(opc, bpt->saved_instr, BREAK_INSTR_SIZE))
|
||||
goto knl_write;
|
||||
return err;
|
||||
text_poke_kgdb((void *)bpt->bpt_addr, bpt->saved_instr,
|
||||
BREAK_INSTR_SIZE);
|
||||
return 0;
|
||||
|
||||
knl_write:
|
||||
return probe_kernel_write((char *)bpt->bpt_addr,
|
||||
|
@ -431,8 +431,21 @@ void *alloc_insn_page(void)
|
||||
void *page;
|
||||
|
||||
page = module_alloc(PAGE_SIZE);
|
||||
if (page)
|
||||
set_memory_ro((unsigned long)page & PAGE_MASK, 1);
|
||||
if (!page)
|
||||
return NULL;
|
||||
|
||||
set_vm_flush_reset_perms(page);
|
||||
/*
|
||||
* First make the page read-only, and only then make it executable to
|
||||
* prevent it from being W+X in between.
|
||||
*/
|
||||
set_memory_ro((unsigned long)page, 1);
|
||||
|
||||
/*
|
||||
* TODO: Once additional kernel code protection mechanisms are set, ensure
|
||||
* that the page was not maliciously altered and it is still zeroed.
|
||||
*/
|
||||
set_memory_x((unsigned long)page, 1);
|
||||
|
||||
return page;
|
||||
}
|
||||
@ -440,8 +453,6 @@ void *alloc_insn_page(void)
|
||||
/* Recover page to RW mode before releasing it */
|
||||
void free_insn_page(void *page)
|
||||
{
|
||||
set_memory_nx((unsigned long)page & PAGE_MASK, 1);
|
||||
set_memory_rw((unsigned long)page & PAGE_MASK, 1);
|
||||
module_memfree(page);
|
||||
}
|
||||
|
||||
|
@ -87,7 +87,7 @@ void *module_alloc(unsigned long size)
|
||||
p = __vmalloc_node_range(size, MODULE_ALIGN,
|
||||
MODULES_VADDR + get_module_load_offset(),
|
||||
MODULES_END, GFP_KERNEL,
|
||||
PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
|
||||
PAGE_KERNEL, 0, NUMA_NO_NODE,
|
||||
__builtin_return_address(0));
|
||||
if (p && (kasan_module_alloc(p, size) < 0)) {
|
||||
vfree(p);
|
||||
|
@ -141,11 +141,11 @@ SECTIONS
|
||||
*(.text.__x86.indirect_thunk)
|
||||
__indirect_thunk_end = .;
|
||||
#endif
|
||||
|
||||
/* End of text section */
|
||||
_etext = .;
|
||||
} :text = 0x9090
|
||||
|
||||
/* End of text section */
|
||||
_etext = .;
|
||||
|
||||
NOTES :text :note
|
||||
|
||||
EXCEPTION_TABLE(16) :text = 0x9090
|
||||
|
@ -360,8 +360,6 @@ static noinline int vmalloc_fault(unsigned long address)
|
||||
if (!(address >= VMALLOC_START && address < VMALLOC_END))
|
||||
return -1;
|
||||
|
||||
WARN_ON_ONCE(in_nmi());
|
||||
|
||||
/*
|
||||
* Copy kernel mappings over when needed. This can also
|
||||
* happen within a race in page table update. In the latter
|
||||
@ -604,24 +602,9 @@ static void show_ldttss(const struct desc_ptr *gdt, const char *name, u16 index)
|
||||
name, index, addr, (desc.limit0 | (desc.limit1 << 16)));
|
||||
}
|
||||
|
||||
/*
|
||||
* This helper function transforms the #PF error_code bits into
|
||||
* "[PROT] [USER]" type of descriptive, almost human-readable error strings:
|
||||
*/
|
||||
static void err_str_append(unsigned long error_code, char *buf, unsigned long mask, const char *txt)
|
||||
{
|
||||
if (error_code & mask) {
|
||||
if (buf[0])
|
||||
strcat(buf, " ");
|
||||
strcat(buf, txt);
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long address)
|
||||
{
|
||||
char err_txt[64];
|
||||
|
||||
if (!oops_may_print())
|
||||
return;
|
||||
|
||||
@ -645,31 +628,29 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long ad
|
||||
from_kuid(&init_user_ns, current_uid()));
|
||||
}
|
||||
|
||||
pr_alert("BUG: unable to handle kernel %s at %px\n",
|
||||
address < PAGE_SIZE ? "NULL pointer dereference" : "paging request",
|
||||
(void *)address);
|
||||
if (address < PAGE_SIZE && !user_mode(regs))
|
||||
pr_alert("BUG: kernel NULL pointer dereference, address: %px\n",
|
||||
(void *)address);
|
||||
else
|
||||
pr_alert("BUG: unable to handle page fault for address: %px\n",
|
||||
(void *)address);
|
||||
|
||||
err_txt[0] = 0;
|
||||
|
||||
/*
|
||||
* Note: length of these appended strings including the separation space and the
|
||||
* zero delimiter must fit into err_txt[].
|
||||
*/
|
||||
err_str_append(error_code, err_txt, X86_PF_PROT, "[PROT]" );
|
||||
err_str_append(error_code, err_txt, X86_PF_WRITE, "[WRITE]");
|
||||
err_str_append(error_code, err_txt, X86_PF_USER, "[USER]" );
|
||||
err_str_append(error_code, err_txt, X86_PF_RSVD, "[RSVD]" );
|
||||
err_str_append(error_code, err_txt, X86_PF_INSTR, "[INSTR]");
|
||||
err_str_append(error_code, err_txt, X86_PF_PK, "[PK]" );
|
||||
|
||||
pr_alert("#PF error: %s\n", error_code ? err_txt : "[normal kernel read fault]");
|
||||
pr_alert("#PF: %s %s in %s mode\n",
|
||||
(error_code & X86_PF_USER) ? "user" : "supervisor",
|
||||
(error_code & X86_PF_INSTR) ? "instruction fetch" :
|
||||
(error_code & X86_PF_WRITE) ? "write access" :
|
||||
"read access",
|
||||
user_mode(regs) ? "user" : "kernel");
|
||||
pr_alert("#PF: error_code(0x%04lx) - %s\n", error_code,
|
||||
!(error_code & X86_PF_PROT) ? "not-present page" :
|
||||
(error_code & X86_PF_RSVD) ? "reserved bit violation" :
|
||||
(error_code & X86_PF_PK) ? "protection keys violation" :
|
||||
"permissions violation");
|
||||
|
||||
if (!(error_code & X86_PF_USER) && user_mode(regs)) {
|
||||
struct desc_ptr idt, gdt;
|
||||
u16 ldtr, tr;
|
||||
|
||||
pr_alert("This was a system access from user code\n");
|
||||
|
||||
/*
|
||||
* This can happen for quite a few reasons. The more obvious
|
||||
* ones are faults accessing the GDT, or LDT. Perhaps
|
||||
|
@ -6,6 +6,7 @@
|
||||
#include <linux/swapfile.h>
|
||||
#include <linux/swapops.h>
|
||||
#include <linux/kmemleak.h>
|
||||
#include <linux/sched/task.h>
|
||||
|
||||
#include <asm/set_memory.h>
|
||||
#include <asm/e820/api.h>
|
||||
@ -23,6 +24,7 @@
|
||||
#include <asm/hypervisor.h>
|
||||
#include <asm/cpufeature.h>
|
||||
#include <asm/pti.h>
|
||||
#include <asm/text-patching.h>
|
||||
|
||||
/*
|
||||
* We need to define the tracepoints somewhere, and tlb.c
|
||||
@ -701,6 +703,41 @@ void __init init_mem_mapping(void)
|
||||
early_memtest(0, max_pfn_mapped << PAGE_SHIFT);
|
||||
}
|
||||
|
||||
/*
|
||||
* Initialize an mm_struct to be used during poking and a pointer to be used
|
||||
* during patching.
|
||||
*/
|
||||
void __init poking_init(void)
|
||||
{
|
||||
spinlock_t *ptl;
|
||||
pte_t *ptep;
|
||||
|
||||
poking_mm = copy_init_mm();
|
||||
BUG_ON(!poking_mm);
|
||||
|
||||
/*
|
||||
* Randomize the poking address, but make sure that the following page
|
||||
* will be mapped at the same PMD. We need 2 pages, so find space for 3,
|
||||
* and adjust the address if the PMD ends after the first one.
|
||||
*/
|
||||
poking_addr = TASK_UNMAPPED_BASE;
|
||||
if (IS_ENABLED(CONFIG_RANDOMIZE_BASE))
|
||||
poking_addr += (kaslr_get_random_long("Poking") & PAGE_MASK) %
|
||||
(TASK_SIZE - TASK_UNMAPPED_BASE - 3 * PAGE_SIZE);
|
||||
|
||||
if (((poking_addr + PAGE_SIZE) & ~PMD_MASK) == 0)
|
||||
poking_addr += PAGE_SIZE;
|
||||
|
||||
/*
|
||||
* We need to trigger the allocation of the page-tables that will be
|
||||
* needed for poking now. Later, poking may be performed in an atomic
|
||||
* section, which might cause allocation to fail.
|
||||
*/
|
||||
ptep = get_locked_pte(poking_mm, poking_addr, &ptl);
|
||||
BUG_ON(!ptep);
|
||||
pte_unmap_unlock(ptep, ptl);
|
||||
}
|
||||
|
||||
/*
|
||||
* devmem_is_allowed() checks to see if /dev/mem access to a certain address
|
||||
* is valid. The argument is a physical page number.
|
||||
|
@ -125,10 +125,7 @@ void __init kernel_randomize_memory(void)
|
||||
*/
|
||||
entropy = remain_entropy / (ARRAY_SIZE(kaslr_regions) - i);
|
||||
prandom_bytes_state(&rand_state, &rand, sizeof(rand));
|
||||
if (pgtable_l5_enabled())
|
||||
entropy = (rand % (entropy + 1)) & P4D_MASK;
|
||||
else
|
||||
entropy = (rand % (entropy + 1)) & PUD_MASK;
|
||||
entropy = (rand % (entropy + 1)) & PUD_MASK;
|
||||
vaddr += entropy;
|
||||
*kaslr_regions[i].base = vaddr;
|
||||
|
||||
@ -137,84 +134,71 @@ void __init kernel_randomize_memory(void)
|
||||
* randomization alignment.
|
||||
*/
|
||||
vaddr += get_padding(&kaslr_regions[i]);
|
||||
if (pgtable_l5_enabled())
|
||||
vaddr = round_up(vaddr + 1, P4D_SIZE);
|
||||
else
|
||||
vaddr = round_up(vaddr + 1, PUD_SIZE);
|
||||
vaddr = round_up(vaddr + 1, PUD_SIZE);
|
||||
remain_entropy -= entropy;
|
||||
}
|
||||
}
|
||||
|
||||
static void __meminit init_trampoline_pud(void)
|
||||
{
|
||||
unsigned long paddr, paddr_next;
|
||||
pud_t *pud_page_tramp, *pud, *pud_tramp;
|
||||
p4d_t *p4d_page_tramp, *p4d, *p4d_tramp;
|
||||
unsigned long paddr, vaddr;
|
||||
pgd_t *pgd;
|
||||
pud_t *pud_page, *pud_page_tramp;
|
||||
int i;
|
||||
|
||||
pud_page_tramp = alloc_low_page();
|
||||
|
||||
/*
|
||||
* There are two mappings for the low 1MB area, the direct mapping
|
||||
* and the 1:1 mapping for the real mode trampoline:
|
||||
*
|
||||
* Direct mapping: virt_addr = phys_addr + PAGE_OFFSET
|
||||
* 1:1 mapping: virt_addr = phys_addr
|
||||
*/
|
||||
paddr = 0;
|
||||
pgd = pgd_offset_k((unsigned long)__va(paddr));
|
||||
pud_page = (pud_t *) pgd_page_vaddr(*pgd);
|
||||
vaddr = (unsigned long)__va(paddr);
|
||||
pgd = pgd_offset_k(vaddr);
|
||||
|
||||
for (i = pud_index(paddr); i < PTRS_PER_PUD; i++, paddr = paddr_next) {
|
||||
pud_t *pud, *pud_tramp;
|
||||
unsigned long vaddr = (unsigned long)__va(paddr);
|
||||
p4d = p4d_offset(pgd, vaddr);
|
||||
pud = pud_offset(p4d, vaddr);
|
||||
|
||||
pud_tramp = pud_page_tramp + pud_index(paddr);
|
||||
pud = pud_page + pud_index(vaddr);
|
||||
paddr_next = (paddr & PUD_MASK) + PUD_SIZE;
|
||||
pud_tramp = pud_page_tramp + pud_index(paddr);
|
||||
*pud_tramp = *pud;
|
||||
|
||||
*pud_tramp = *pud;
|
||||
}
|
||||
|
||||
set_pgd(&trampoline_pgd_entry,
|
||||
__pgd(_KERNPG_TABLE | __pa(pud_page_tramp)));
|
||||
}
|
||||
|
||||
static void __meminit init_trampoline_p4d(void)
|
||||
{
|
||||
unsigned long paddr, paddr_next;
|
||||
pgd_t *pgd;
|
||||
p4d_t *p4d_page, *p4d_page_tramp;
|
||||
int i;
|
||||
|
||||
p4d_page_tramp = alloc_low_page();
|
||||
|
||||
paddr = 0;
|
||||
pgd = pgd_offset_k((unsigned long)__va(paddr));
|
||||
p4d_page = (p4d_t *) pgd_page_vaddr(*pgd);
|
||||
|
||||
for (i = p4d_index(paddr); i < PTRS_PER_P4D; i++, paddr = paddr_next) {
|
||||
p4d_t *p4d, *p4d_tramp;
|
||||
unsigned long vaddr = (unsigned long)__va(paddr);
|
||||
if (pgtable_l5_enabled()) {
|
||||
p4d_page_tramp = alloc_low_page();
|
||||
|
||||
p4d_tramp = p4d_page_tramp + p4d_index(paddr);
|
||||
p4d = p4d_page + p4d_index(vaddr);
|
||||
paddr_next = (paddr & P4D_MASK) + P4D_SIZE;
|
||||
|
||||
*p4d_tramp = *p4d;
|
||||
set_p4d(p4d_tramp,
|
||||
__p4d(_KERNPG_TABLE | __pa(pud_page_tramp)));
|
||||
|
||||
set_pgd(&trampoline_pgd_entry,
|
||||
__pgd(_KERNPG_TABLE | __pa(p4d_page_tramp)));
|
||||
} else {
|
||||
set_pgd(&trampoline_pgd_entry,
|
||||
__pgd(_KERNPG_TABLE | __pa(pud_page_tramp)));
|
||||
}
|
||||
|
||||
set_pgd(&trampoline_pgd_entry,
|
||||
__pgd(_KERNPG_TABLE | __pa(p4d_page_tramp)));
|
||||
}
|
||||
|
||||
/*
|
||||
* Create PGD aligned trampoline table to allow real mode initialization
|
||||
* of additional CPUs. Consume only 1 low memory page.
|
||||
* The real mode trampoline, which is required for bootstrapping CPUs
|
||||
* occupies only a small area under the low 1MB. See reserve_real_mode()
|
||||
* for details.
|
||||
*
|
||||
* If KASLR is disabled the first PGD entry of the direct mapping is copied
|
||||
* to map the real mode trampoline.
|
||||
*
|
||||
* If KASLR is enabled, copy only the PUD which covers the low 1MB
|
||||
* area. This limits the randomization granularity to 1GB for both 4-level
|
||||
* and 5-level paging.
|
||||
*/
|
||||
void __meminit init_trampoline(void)
|
||||
{
|
||||
|
||||
if (!kaslr_memory_enabled()) {
|
||||
init_trampoline_default();
|
||||
return;
|
||||
}
|
||||
|
||||
if (pgtable_l5_enabled())
|
||||
init_trampoline_p4d();
|
||||
else
|
||||
init_trampoline_pud();
|
||||
init_trampoline_pud();
|
||||
}
|
||||
|
@ -2209,8 +2209,6 @@ int set_pages_rw(struct page *page, int numpages)
|
||||
return set_memory_rw(addr, numpages);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_DEBUG_PAGEALLOC
|
||||
|
||||
static int __set_pages_p(struct page *page, int numpages)
|
||||
{
|
||||
unsigned long tempaddr = (unsigned long) page_address(page);
|
||||
@ -2249,6 +2247,16 @@ static int __set_pages_np(struct page *page, int numpages)
|
||||
return __change_page_attr_set_clr(&cpa, 0);
|
||||
}
|
||||
|
||||
int set_direct_map_invalid_noflush(struct page *page)
|
||||
{
|
||||
return __set_pages_np(page, 1);
|
||||
}
|
||||
|
||||
int set_direct_map_default_noflush(struct page *page)
|
||||
{
|
||||
return __set_pages_p(page, 1);
|
||||
}
|
||||
|
||||
void __kernel_map_pages(struct page *page, int numpages, int enable)
|
||||
{
|
||||
if (PageHighMem(page))
|
||||
@ -2282,7 +2290,6 @@ void __kernel_map_pages(struct page *page, int numpages, int enable)
|
||||
}
|
||||
|
||||
#ifdef CONFIG_HIBERNATION
|
||||
|
||||
bool kernel_page_present(struct page *page)
|
||||
{
|
||||
unsigned int level;
|
||||
@ -2294,11 +2301,8 @@ bool kernel_page_present(struct page *page)
|
||||
pte = lookup_address((unsigned long)page_address(page), &level);
|
||||
return (pte_val(*pte) & _PAGE_PRESENT);
|
||||
}
|
||||
|
||||
#endif /* CONFIG_HIBERNATION */
|
||||
|
||||
#endif /* CONFIG_DEBUG_PAGEALLOC */
|
||||
|
||||
int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address,
|
||||
unsigned numpages, unsigned long page_flags)
|
||||
{
|
||||
|
@ -373,14 +373,14 @@ static void pgd_prepopulate_user_pmd(struct mm_struct *mm,
|
||||
|
||||
static struct kmem_cache *pgd_cache;
|
||||
|
||||
static int __init pgd_cache_init(void)
|
||||
void __init pgd_cache_init(void)
|
||||
{
|
||||
/*
|
||||
* When PAE kernel is running as a Xen domain, it does not use
|
||||
* shared kernel pmd. And this requires a whole page for pgd.
|
||||
*/
|
||||
if (!SHARED_KERNEL_PMD)
|
||||
return 0;
|
||||
return;
|
||||
|
||||
/*
|
||||
* when PAE kernel is not running as a Xen domain, it uses
|
||||
@ -390,9 +390,7 @@ static int __init pgd_cache_init(void)
|
||||
*/
|
||||
pgd_cache = kmem_cache_create("pgd_cache", PGD_SIZE, PGD_ALIGN,
|
||||
SLAB_PANIC, NULL);
|
||||
return 0;
|
||||
}
|
||||
core_initcall(pgd_cache_init);
|
||||
|
||||
static inline pgd_t *_pgd_alloc(void)
|
||||
{
|
||||
@ -420,6 +418,10 @@ static inline void _pgd_free(pgd_t *pgd)
|
||||
}
|
||||
#else
|
||||
|
||||
void __init pgd_cache_init(void)
|
||||
{
|
||||
}
|
||||
|
||||
static inline pgd_t *_pgd_alloc(void)
|
||||
{
|
||||
return (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ALLOCATION_ORDER);
|
||||
|
@ -634,7 +634,7 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
|
||||
this_cpu_write(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen, mm_tlb_gen);
|
||||
}
|
||||
|
||||
static void flush_tlb_func_local(void *info, enum tlb_flush_reason reason)
|
||||
static void flush_tlb_func_local(const void *info, enum tlb_flush_reason reason)
|
||||
{
|
||||
const struct flush_tlb_info *f = info;
|
||||
|
||||
@ -722,43 +722,81 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
|
||||
*/
|
||||
unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
|
||||
|
||||
static DEFINE_PER_CPU_SHARED_ALIGNED(struct flush_tlb_info, flush_tlb_info);
|
||||
|
||||
#ifdef CONFIG_DEBUG_VM
|
||||
static DEFINE_PER_CPU(unsigned int, flush_tlb_info_idx);
|
||||
#endif
|
||||
|
||||
static inline struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm,
|
||||
unsigned long start, unsigned long end,
|
||||
unsigned int stride_shift, bool freed_tables,
|
||||
u64 new_tlb_gen)
|
||||
{
|
||||
struct flush_tlb_info *info = this_cpu_ptr(&flush_tlb_info);
|
||||
|
||||
#ifdef CONFIG_DEBUG_VM
|
||||
/*
|
||||
* Ensure that the following code is non-reentrant and flush_tlb_info
|
||||
* is not overwritten. This means no TLB flushing is initiated by
|
||||
* interrupt handlers and machine-check exception handlers.
|
||||
*/
|
||||
BUG_ON(this_cpu_inc_return(flush_tlb_info_idx) != 1);
|
||||
#endif
|
||||
|
||||
info->start = start;
|
||||
info->end = end;
|
||||
info->mm = mm;
|
||||
info->stride_shift = stride_shift;
|
||||
info->freed_tables = freed_tables;
|
||||
info->new_tlb_gen = new_tlb_gen;
|
||||
|
||||
return info;
|
||||
}
|
||||
|
||||
static inline void put_flush_tlb_info(void)
|
||||
{
|
||||
#ifdef CONFIG_DEBUG_VM
|
||||
/* Complete reentrancy prevention checks */
|
||||
barrier();
|
||||
this_cpu_dec(flush_tlb_info_idx);
|
||||
#endif
|
||||
}
|
||||
|
||||
void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
|
||||
unsigned long end, unsigned int stride_shift,
|
||||
bool freed_tables)
|
||||
{
|
||||
struct flush_tlb_info *info;
|
||||
u64 new_tlb_gen;
|
||||
int cpu;
|
||||
|
||||
struct flush_tlb_info info = {
|
||||
.mm = mm,
|
||||
.stride_shift = stride_shift,
|
||||
.freed_tables = freed_tables,
|
||||
};
|
||||
|
||||
cpu = get_cpu();
|
||||
|
||||
/* This is also a barrier that synchronizes with switch_mm(). */
|
||||
info.new_tlb_gen = inc_mm_tlb_gen(mm);
|
||||
|
||||
/* Should we flush just the requested range? */
|
||||
if ((end != TLB_FLUSH_ALL) &&
|
||||
((end - start) >> stride_shift) <= tlb_single_page_flush_ceiling) {
|
||||
info.start = start;
|
||||
info.end = end;
|
||||
} else {
|
||||
info.start = 0UL;
|
||||
info.end = TLB_FLUSH_ALL;
|
||||
if ((end == TLB_FLUSH_ALL) ||
|
||||
((end - start) >> stride_shift) > tlb_single_page_flush_ceiling) {
|
||||
start = 0;
|
||||
end = TLB_FLUSH_ALL;
|
||||
}
|
||||
|
||||
/* This is also a barrier that synchronizes with switch_mm(). */
|
||||
new_tlb_gen = inc_mm_tlb_gen(mm);
|
||||
|
||||
info = get_flush_tlb_info(mm, start, end, stride_shift, freed_tables,
|
||||
new_tlb_gen);
|
||||
|
||||
if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
|
||||
VM_WARN_ON(irqs_disabled());
|
||||
lockdep_assert_irqs_enabled();
|
||||
local_irq_disable();
|
||||
flush_tlb_func_local(&info, TLB_LOCAL_MM_SHOOTDOWN);
|
||||
flush_tlb_func_local(info, TLB_LOCAL_MM_SHOOTDOWN);
|
||||
local_irq_enable();
|
||||
}
|
||||
|
||||
if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids)
|
||||
flush_tlb_others(mm_cpumask(mm), &info);
|
||||
flush_tlb_others(mm_cpumask(mm), info);
|
||||
|
||||
put_flush_tlb_info();
|
||||
put_cpu();
|
||||
}
|
||||
|
||||
@ -787,38 +825,48 @@ static void do_kernel_range_flush(void *info)
|
||||
|
||||
void flush_tlb_kernel_range(unsigned long start, unsigned long end)
|
||||
{
|
||||
|
||||
/* Balance as user space task's flush, a bit conservative */
|
||||
if (end == TLB_FLUSH_ALL ||
|
||||
(end - start) > tlb_single_page_flush_ceiling << PAGE_SHIFT) {
|
||||
on_each_cpu(do_flush_tlb_all, NULL, 1);
|
||||
} else {
|
||||
struct flush_tlb_info info;
|
||||
info.start = start;
|
||||
info.end = end;
|
||||
on_each_cpu(do_kernel_range_flush, &info, 1);
|
||||
struct flush_tlb_info *info;
|
||||
|
||||
preempt_disable();
|
||||
info = get_flush_tlb_info(NULL, start, end, 0, false, 0);
|
||||
|
||||
on_each_cpu(do_kernel_range_flush, info, 1);
|
||||
|
||||
put_flush_tlb_info();
|
||||
preempt_enable();
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* arch_tlbbatch_flush() performs a full TLB flush regardless of the active mm.
|
||||
* This means that the 'struct flush_tlb_info' that describes which mappings to
|
||||
* flush is actually fixed. We therefore set a single fixed struct and use it in
|
||||
* arch_tlbbatch_flush().
|
||||
*/
|
||||
static const struct flush_tlb_info full_flush_tlb_info = {
|
||||
.mm = NULL,
|
||||
.start = 0,
|
||||
.end = TLB_FLUSH_ALL,
|
||||
};
|
||||
|
||||
void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
|
||||
{
|
||||
struct flush_tlb_info info = {
|
||||
.mm = NULL,
|
||||
.start = 0UL,
|
||||
.end = TLB_FLUSH_ALL,
|
||||
};
|
||||
|
||||
int cpu = get_cpu();
|
||||
|
||||
if (cpumask_test_cpu(cpu, &batch->cpumask)) {
|
||||
VM_WARN_ON(irqs_disabled());
|
||||
lockdep_assert_irqs_enabled();
|
||||
local_irq_disable();
|
||||
flush_tlb_func_local(&info, TLB_LOCAL_SHOOTDOWN);
|
||||
flush_tlb_func_local(&full_flush_tlb_info, TLB_LOCAL_SHOOTDOWN);
|
||||
local_irq_enable();
|
||||
}
|
||||
|
||||
if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids)
|
||||
flush_tlb_others(&batch->cpumask, &info);
|
||||
flush_tlb_others(&batch->cpumask, &full_flush_tlb_info);
|
||||
|
||||
cpumask_clear(&batch->cpumask);
|
||||
|
||||
|
@ -2318,8 +2318,6 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
|
||||
#elif defined(CONFIG_X86_VSYSCALL_EMULATION)
|
||||
case VSYSCALL_PAGE:
|
||||
#endif
|
||||
case FIX_TEXT_POKE0:
|
||||
case FIX_TEXT_POKE1:
|
||||
/* All local page mappings */
|
||||
pte = pfn_pte(phys, prot);
|
||||
break;
|
||||
|
@ -1126,6 +1126,8 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
|
||||
static inline void init_espfix_bsp(void) { }
|
||||
#endif
|
||||
|
||||
extern void __init pgd_cache_init(void);
|
||||
|
||||
#ifndef __HAVE_ARCH_PFN_MODIFY_ALLOWED
|
||||
static inline bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot)
|
||||
{
|
||||
|
@ -21,6 +21,15 @@
|
||||
#include <asm/tlbflush.h>
|
||||
#include <asm/cacheflush.h>
|
||||
|
||||
/*
|
||||
* Blindly accessing user memory from NMI context can be dangerous
|
||||
* if we're in the middle of switching the current user task or switching
|
||||
* the loaded mm.
|
||||
*/
|
||||
#ifndef nmi_uaccess_okay
|
||||
# define nmi_uaccess_okay() true
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_MMU
|
||||
|
||||
/*
|
||||
|
@ -20,6 +20,7 @@
|
||||
#include <linux/set_memory.h>
|
||||
#include <linux/kallsyms.h>
|
||||
#include <linux/if_vlan.h>
|
||||
#include <linux/vmalloc.h>
|
||||
|
||||
#include <net/sch_generic.h>
|
||||
|
||||
@ -503,7 +504,6 @@ struct bpf_prog {
|
||||
u16 pages; /* Number of allocated pages */
|
||||
u16 jited:1, /* Is our filter JIT'ed? */
|
||||
jit_requested:1,/* archs need to JIT the prog */
|
||||
undo_set_mem:1, /* Passed set_memory_ro() checkpoint */
|
||||
gpl_compatible:1, /* Is filter GPL compatible? */
|
||||
cb_access:1, /* Is control block accessed? */
|
||||
dst_needed:1, /* Do we need dst entry? */
|
||||
@ -733,24 +733,15 @@ bpf_ctx_narrow_access_ok(u32 off, u32 size, u32 size_default)
|
||||
|
||||
static inline void bpf_prog_lock_ro(struct bpf_prog *fp)
|
||||
{
|
||||
fp->undo_set_mem = 1;
|
||||
set_vm_flush_reset_perms(fp);
|
||||
set_memory_ro((unsigned long)fp, fp->pages);
|
||||
}
|
||||
|
||||
static inline void bpf_prog_unlock_ro(struct bpf_prog *fp)
|
||||
{
|
||||
if (fp->undo_set_mem)
|
||||
set_memory_rw((unsigned long)fp, fp->pages);
|
||||
}
|
||||
|
||||
static inline void bpf_jit_binary_lock_ro(struct bpf_binary_header *hdr)
|
||||
{
|
||||
set_vm_flush_reset_perms(hdr);
|
||||
set_memory_ro((unsigned long)hdr, hdr->pages);
|
||||
}
|
||||
|
||||
static inline void bpf_jit_binary_unlock_ro(struct bpf_binary_header *hdr)
|
||||
{
|
||||
set_memory_rw((unsigned long)hdr, hdr->pages);
|
||||
set_memory_x((unsigned long)hdr, hdr->pages);
|
||||
}
|
||||
|
||||
static inline struct bpf_binary_header *
|
||||
@ -788,7 +779,6 @@ void __bpf_prog_free(struct bpf_prog *fp);
|
||||
|
||||
static inline void bpf_prog_unlock_free(struct bpf_prog *fp)
|
||||
{
|
||||
bpf_prog_unlock_ro(fp);
|
||||
__bpf_prog_free(fp);
|
||||
}
|
||||
|
||||
|
@ -2610,37 +2610,31 @@ static inline void kernel_poison_pages(struct page *page, int numpages,
|
||||
int enable) { }
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_DEBUG_PAGEALLOC
|
||||
extern bool _debug_pagealloc_enabled;
|
||||
extern void __kernel_map_pages(struct page *page, int numpages, int enable);
|
||||
|
||||
static inline bool debug_pagealloc_enabled(void)
|
||||
{
|
||||
return _debug_pagealloc_enabled;
|
||||
return IS_ENABLED(CONFIG_DEBUG_PAGEALLOC) && _debug_pagealloc_enabled;
|
||||
}
|
||||
|
||||
#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_ARCH_HAS_SET_DIRECT_MAP)
|
||||
extern void __kernel_map_pages(struct page *page, int numpages, int enable);
|
||||
|
||||
static inline void
|
||||
kernel_map_pages(struct page *page, int numpages, int enable)
|
||||
{
|
||||
if (!debug_pagealloc_enabled())
|
||||
return;
|
||||
|
||||
__kernel_map_pages(page, numpages, enable);
|
||||
}
|
||||
#ifdef CONFIG_HIBERNATION
|
||||
extern bool kernel_page_present(struct page *page);
|
||||
#endif /* CONFIG_HIBERNATION */
|
||||
#else /* CONFIG_DEBUG_PAGEALLOC */
|
||||
#else /* CONFIG_DEBUG_PAGEALLOC || CONFIG_ARCH_HAS_SET_DIRECT_MAP */
|
||||
static inline void
|
||||
kernel_map_pages(struct page *page, int numpages, int enable) {}
|
||||
#ifdef CONFIG_HIBERNATION
|
||||
static inline bool kernel_page_present(struct page *page) { return true; }
|
||||
#endif /* CONFIG_HIBERNATION */
|
||||
static inline bool debug_pagealloc_enabled(void)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
#endif /* CONFIG_DEBUG_PAGEALLOC */
|
||||
#endif /* CONFIG_DEBUG_PAGEALLOC || CONFIG_ARCH_HAS_SET_DIRECT_MAP */
|
||||
|
||||
#ifdef __HAVE_ARCH_GATE_AREA
|
||||
extern struct vm_area_struct *get_gate_vma(struct mm_struct *mm);
|
||||
|
@ -76,6 +76,7 @@ extern void exit_itimers(struct signal_struct *);
|
||||
extern long _do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *, unsigned long);
|
||||
extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *);
|
||||
struct task_struct *fork_idle(int);
|
||||
struct mm_struct *copy_init_mm(void);
|
||||
extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
|
||||
extern long kernel_wait4(pid_t, int __user *, int, struct rusage *);
|
||||
|
||||
|
@ -17,6 +17,17 @@ static inline int set_memory_x(unsigned long addr, int numpages) { return 0; }
|
||||
static inline int set_memory_nx(unsigned long addr, int numpages) { return 0; }
|
||||
#endif
|
||||
|
||||
#ifndef CONFIG_ARCH_HAS_SET_DIRECT_MAP
|
||||
static inline int set_direct_map_invalid_noflush(struct page *page)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
static inline int set_direct_map_default_noflush(struct page *page)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifndef set_mce_nospec
|
||||
static inline int set_mce_nospec(unsigned long pfn)
|
||||
{
|
||||
|
@ -115,6 +115,7 @@ struct uprobes_state {
|
||||
struct xol_area *xol_area;
|
||||
};
|
||||
|
||||
extern void __init uprobes_init(void);
|
||||
extern int set_swbp(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr);
|
||||
extern int set_orig_insn(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr);
|
||||
extern bool is_swbp_insn(uprobe_opcode_t *insn);
|
||||
@ -154,6 +155,10 @@ extern void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr,
|
||||
struct uprobes_state {
|
||||
};
|
||||
|
||||
static inline void uprobes_init(void)
|
||||
{
|
||||
}
|
||||
|
||||
#define uprobe_get_trap_addr(regs) instruction_pointer(regs)
|
||||
|
||||
static inline int
|
||||
|
@ -21,6 +21,11 @@ struct notifier_block; /* in notifier.h */
|
||||
#define VM_UNINITIALIZED 0x00000020 /* vm_struct is not fully initialized */
|
||||
#define VM_NO_GUARD 0x00000040 /* don't add guard page */
|
||||
#define VM_KASAN 0x00000080 /* has allocated kasan shadow memory */
|
||||
/*
|
||||
* Memory with VM_FLUSH_RESET_PERMS cannot be freed in an interrupt or with
|
||||
* vfree_atomic().
|
||||
*/
|
||||
#define VM_FLUSH_RESET_PERMS 0x00000100 /* Reset direct map and flush TLB on unmap */
|
||||
/* bits [20..32] reserved for arch specific ioremap internals */
|
||||
|
||||
/*
|
||||
@ -142,6 +147,13 @@ extern int map_kernel_range_noflush(unsigned long start, unsigned long size,
|
||||
pgprot_t prot, struct page **pages);
|
||||
extern void unmap_kernel_range_noflush(unsigned long addr, unsigned long size);
|
||||
extern void unmap_kernel_range(unsigned long addr, unsigned long size);
|
||||
static inline void set_vm_flush_reset_perms(void *addr)
|
||||
{
|
||||
struct vm_struct *vm = find_vm_area(addr);
|
||||
|
||||
if (vm)
|
||||
vm->flags |= VM_FLUSH_RESET_PERMS;
|
||||
}
|
||||
#else
|
||||
static inline int
|
||||
map_kernel_range_noflush(unsigned long start, unsigned long size,
|
||||
@ -157,6 +169,9 @@ static inline void
|
||||
unmap_kernel_range(unsigned long addr, unsigned long size)
|
||||
{
|
||||
}
|
||||
static inline void set_vm_flush_reset_perms(void *addr)
|
||||
{
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Allocate/destroy a 'vmalloc' VM area. */
|
||||
|
@ -504,6 +504,10 @@ void __init __weak thread_stack_cache_init(void)
|
||||
|
||||
void __init __weak mem_encrypt_init(void) { }
|
||||
|
||||
void __init __weak poking_init(void) { }
|
||||
|
||||
void __init __weak pgd_cache_init(void) { }
|
||||
|
||||
bool initcall_debug;
|
||||
core_param(initcall_debug, initcall_debug, bool, 0644);
|
||||
|
||||
@ -535,6 +539,7 @@ static void __init mm_init(void)
|
||||
init_espfix_bsp();
|
||||
/* Should be run after espfix64 is set up. */
|
||||
pti_init();
|
||||
pgd_cache_init();
|
||||
}
|
||||
|
||||
void __init __weak arch_call_rest_init(void)
|
||||
@ -737,6 +742,7 @@ asmlinkage __visible void __init start_kernel(void)
|
||||
taskstats_init_early();
|
||||
delayacct_init();
|
||||
|
||||
poking_init();
|
||||
check_bugs();
|
||||
|
||||
acpi_subsystem_init();
|
||||
|
@ -848,7 +848,6 @@ void __weak bpf_jit_free(struct bpf_prog *fp)
|
||||
if (fp->jited) {
|
||||
struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp);
|
||||
|
||||
bpf_jit_binary_unlock_ro(hdr);
|
||||
bpf_jit_binary_free(hdr);
|
||||
|
||||
WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(fp));
|
||||
|
@ -2294,16 +2294,14 @@ static struct notifier_block uprobe_exception_nb = {
|
||||
.priority = INT_MAX-1, /* notified after kprobes, kgdb */
|
||||
};
|
||||
|
||||
static int __init init_uprobes(void)
|
||||
void __init uprobes_init(void)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < UPROBES_HASH_SZ; i++)
|
||||
mutex_init(&uprobes_mmap_mutex[i]);
|
||||
|
||||
if (percpu_init_rwsem(&dup_mmap_sem))
|
||||
return -ENOMEM;
|
||||
BUG_ON(percpu_init_rwsem(&dup_mmap_sem));
|
||||
|
||||
return register_die_notifier(&uprobe_exception_nb);
|
||||
BUG_ON(register_die_notifier(&uprobe_exception_nb));
|
||||
}
|
||||
__initcall(init_uprobes);
|
||||
|
@ -815,6 +815,7 @@ void __init fork_init(void)
|
||||
#endif
|
||||
|
||||
lockdep_init_task(&init_task);
|
||||
uprobes_init();
|
||||
}
|
||||
|
||||
int __weak arch_dup_task_struct(struct task_struct *dst,
|
||||
@ -1298,13 +1299,20 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
|
||||
complete_vfork_done(tsk);
|
||||
}
|
||||
|
||||
/*
|
||||
* Allocate a new mm structure and copy contents from the
|
||||
* mm structure of the passed in task structure.
|
||||
/**
|
||||
* dup_mm() - duplicates an existing mm structure
|
||||
* @tsk: the task_struct with which the new mm will be associated.
|
||||
* @oldmm: the mm to duplicate.
|
||||
*
|
||||
* Allocates a new mm structure and duplicates the provided @oldmm structure
|
||||
* content into it.
|
||||
*
|
||||
* Return: the duplicated mm or NULL on failure.
|
||||
*/
|
||||
static struct mm_struct *dup_mm(struct task_struct *tsk)
|
||||
static struct mm_struct *dup_mm(struct task_struct *tsk,
|
||||
struct mm_struct *oldmm)
|
||||
{
|
||||
struct mm_struct *mm, *oldmm = current->mm;
|
||||
struct mm_struct *mm;
|
||||
int err;
|
||||
|
||||
mm = allocate_mm();
|
||||
@ -1371,7 +1379,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
|
||||
}
|
||||
|
||||
retval = -ENOMEM;
|
||||
mm = dup_mm(tsk);
|
||||
mm = dup_mm(tsk, current->mm);
|
||||
if (!mm)
|
||||
goto fail_nomem;
|
||||
|
||||
@ -2186,6 +2194,11 @@ struct task_struct *fork_idle(int cpu)
|
||||
return task;
|
||||
}
|
||||
|
||||
struct mm_struct *copy_init_mm(void)
|
||||
{
|
||||
return dup_mm(NULL, &init_mm);
|
||||
}
|
||||
|
||||
/*
|
||||
* Ok, this is the main fork-routine.
|
||||
*
|
||||
|
@ -98,6 +98,10 @@ DEFINE_MUTEX(module_mutex);
|
||||
EXPORT_SYMBOL_GPL(module_mutex);
|
||||
static LIST_HEAD(modules);
|
||||
|
||||
/* Work queue for freeing init sections in success case */
|
||||
static struct work_struct init_free_wq;
|
||||
static struct llist_head init_free_list;
|
||||
|
||||
#ifdef CONFIG_MODULES_TREE_LOOKUP
|
||||
|
||||
/*
|
||||
@ -1949,9 +1953,16 @@ void module_enable_ro(const struct module *mod, bool after_init)
|
||||
if (!rodata_enabled)
|
||||
return;
|
||||
|
||||
set_vm_flush_reset_perms(mod->core_layout.base);
|
||||
set_vm_flush_reset_perms(mod->init_layout.base);
|
||||
frob_text(&mod->core_layout, set_memory_ro);
|
||||
frob_text(&mod->core_layout, set_memory_x);
|
||||
|
||||
frob_rodata(&mod->core_layout, set_memory_ro);
|
||||
|
||||
frob_text(&mod->init_layout, set_memory_ro);
|
||||
frob_text(&mod->init_layout, set_memory_x);
|
||||
|
||||
frob_rodata(&mod->init_layout, set_memory_ro);
|
||||
|
||||
if (after_init)
|
||||
@ -1967,15 +1978,6 @@ static void module_enable_nx(const struct module *mod)
|
||||
frob_writable_data(&mod->init_layout, set_memory_nx);
|
||||
}
|
||||
|
||||
static void module_disable_nx(const struct module *mod)
|
||||
{
|
||||
frob_rodata(&mod->core_layout, set_memory_x);
|
||||
frob_ro_after_init(&mod->core_layout, set_memory_x);
|
||||
frob_writable_data(&mod->core_layout, set_memory_x);
|
||||
frob_rodata(&mod->init_layout, set_memory_x);
|
||||
frob_writable_data(&mod->init_layout, set_memory_x);
|
||||
}
|
||||
|
||||
/* Iterate through all modules and set each module's text as RW */
|
||||
void set_all_modules_text_rw(void)
|
||||
{
|
||||
@ -2019,23 +2021,8 @@ void set_all_modules_text_ro(void)
|
||||
}
|
||||
mutex_unlock(&module_mutex);
|
||||
}
|
||||
|
||||
static void disable_ro_nx(const struct module_layout *layout)
|
||||
{
|
||||
if (rodata_enabled) {
|
||||
frob_text(layout, set_memory_rw);
|
||||
frob_rodata(layout, set_memory_rw);
|
||||
frob_ro_after_init(layout, set_memory_rw);
|
||||
}
|
||||
frob_rodata(layout, set_memory_x);
|
||||
frob_ro_after_init(layout, set_memory_x);
|
||||
frob_writable_data(layout, set_memory_x);
|
||||
}
|
||||
|
||||
#else
|
||||
static void disable_ro_nx(const struct module_layout *layout) { }
|
||||
static void module_enable_nx(const struct module *mod) { }
|
||||
static void module_disable_nx(const struct module *mod) { }
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_LIVEPATCH
|
||||
@ -2115,6 +2102,11 @@ static void free_module_elf(struct module *mod)
|
||||
|
||||
void __weak module_memfree(void *module_region)
|
||||
{
|
||||
/*
|
||||
* This memory may be RO, and freeing RO memory in an interrupt is not
|
||||
* supported by vmalloc.
|
||||
*/
|
||||
WARN_ON(in_interrupt());
|
||||
vfree(module_region);
|
||||
}
|
||||
|
||||
@ -2166,7 +2158,6 @@ static void free_module(struct module *mod)
|
||||
mutex_unlock(&module_mutex);
|
||||
|
||||
/* This may be empty, but that's OK */
|
||||
disable_ro_nx(&mod->init_layout);
|
||||
module_arch_freeing_init(mod);
|
||||
module_memfree(mod->init_layout.base);
|
||||
kfree(mod->args);
|
||||
@ -2176,7 +2167,6 @@ static void free_module(struct module *mod)
|
||||
lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size);
|
||||
|
||||
/* Finally, free the core (containing the module structure) */
|
||||
disable_ro_nx(&mod->core_layout);
|
||||
module_memfree(mod->core_layout.base);
|
||||
}
|
||||
|
||||
@ -3415,17 +3405,34 @@ static void do_mod_ctors(struct module *mod)
|
||||
|
||||
/* For freeing module_init on success, in case kallsyms traversing */
|
||||
struct mod_initfree {
|
||||
struct rcu_head rcu;
|
||||
struct llist_node node;
|
||||
void *module_init;
|
||||
};
|
||||
|
||||
static void do_free_init(struct rcu_head *head)
|
||||
static void do_free_init(struct work_struct *w)
|
||||
{
|
||||
struct mod_initfree *m = container_of(head, struct mod_initfree, rcu);
|
||||
module_memfree(m->module_init);
|
||||
kfree(m);
|
||||
struct llist_node *pos, *n, *list;
|
||||
struct mod_initfree *initfree;
|
||||
|
||||
list = llist_del_all(&init_free_list);
|
||||
|
||||
synchronize_rcu();
|
||||
|
||||
llist_for_each_safe(pos, n, list) {
|
||||
initfree = container_of(pos, struct mod_initfree, node);
|
||||
module_memfree(initfree->module_init);
|
||||
kfree(initfree);
|
||||
}
|
||||
}
|
||||
|
||||
static int __init modules_wq_init(void)
|
||||
{
|
||||
INIT_WORK(&init_free_wq, do_free_init);
|
||||
init_llist_head(&init_free_list);
|
||||
return 0;
|
||||
}
|
||||
module_init(modules_wq_init);
|
||||
|
||||
/*
|
||||
* This is where the real work happens.
|
||||
*
|
||||
@ -3502,7 +3509,6 @@ static noinline int do_init_module(struct module *mod)
|
||||
#endif
|
||||
module_enable_ro(mod, true);
|
||||
mod_tree_remove_init(mod);
|
||||
disable_ro_nx(&mod->init_layout);
|
||||
module_arch_freeing_init(mod);
|
||||
mod->init_layout.base = NULL;
|
||||
mod->init_layout.size = 0;
|
||||
@ -3513,14 +3519,18 @@ static noinline int do_init_module(struct module *mod)
|
||||
* We want to free module_init, but be aware that kallsyms may be
|
||||
* walking this with preempt disabled. In all the failure paths, we
|
||||
* call synchronize_rcu(), but we don't want to slow down the success
|
||||
* path, so use actual RCU here.
|
||||
* path. module_memfree() cannot be called in an interrupt, so do the
|
||||
* work and call synchronize_rcu() in a work queue.
|
||||
*
|
||||
* Note that module_alloc() on most architectures creates W+X page
|
||||
* mappings which won't be cleaned up until do_free_init() runs. Any
|
||||
* code such as mark_rodata_ro() which depends on those mappings to
|
||||
* be cleaned up needs to sync with the queued work - ie
|
||||
* rcu_barrier()
|
||||
*/
|
||||
call_rcu(&freeinit->rcu, do_free_init);
|
||||
if (llist_add(&freeinit->node, &init_free_list))
|
||||
schedule_work(&init_free_wq);
|
||||
|
||||
mutex_unlock(&module_mutex);
|
||||
wake_up_all(&module_wq);
|
||||
|
||||
@ -3817,10 +3827,6 @@ static int load_module(struct load_info *info, const char __user *uargs,
|
||||
module_bug_cleanup(mod);
|
||||
mutex_unlock(&module_mutex);
|
||||
|
||||
/* we can't deallocate the module until we clear memory protection */
|
||||
module_disable_ro(mod);
|
||||
module_disable_nx(mod);
|
||||
|
||||
ddebug_cleanup:
|
||||
ftrace_release_mod(mod);
|
||||
dynamic_debug_remove(mod, info->debug);
|
||||
|
@ -1342,8 +1342,9 @@ static inline void do_copy_page(long *dst, long *src)
|
||||
* safe_copy_page - Copy a page in a safe way.
|
||||
*
|
||||
* Check if the page we are going to copy is marked as present in the kernel
|
||||
* page tables (this always is the case if CONFIG_DEBUG_PAGEALLOC is not set
|
||||
* and in that case kernel_page_present() always returns 'true').
|
||||
* page tables. This always is the case if CONFIG_DEBUG_PAGEALLOC or
|
||||
* CONFIG_ARCH_HAS_SET_DIRECT_MAP is not set. In that case kernel_page_present()
|
||||
* always returns 'true'.
|
||||
*/
|
||||
static void safe_copy_page(void *dst, struct page *s_page)
|
||||
{
|
||||
|
@ -14,6 +14,8 @@
|
||||
#include <linux/syscalls.h>
|
||||
#include <linux/error-injection.h>
|
||||
|
||||
#include <asm/tlb.h>
|
||||
|
||||
#include "trace_probe.h"
|
||||
#include "trace.h"
|
||||
|
||||
@ -163,6 +165,10 @@ BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src,
|
||||
* access_ok() should prevent writing to non-user memory, but in
|
||||
* some situations (nommu, temporary switch, etc) access_ok() does
|
||||
* not provide enough validation, hence the check on KERNEL_DS.
|
||||
*
|
||||
* nmi_uaccess_okay() ensures the probe is not run in an interim
|
||||
* state, when the task or mm are switched. This is specifically
|
||||
* required to prevent the use of temporary mm.
|
||||
*/
|
||||
|
||||
if (unlikely(in_interrupt() ||
|
||||
@ -170,6 +176,8 @@ BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src,
|
||||
return -EPERM;
|
||||
if (unlikely(uaccess_kernel()))
|
||||
return -EPERM;
|
||||
if (unlikely(!nmi_uaccess_okay()))
|
||||
return -EPERM;
|
||||
if (!access_ok(unsafe_ptr, size))
|
||||
return -EPERM;
|
||||
|
||||
|
@ -1144,7 +1144,9 @@ static __always_inline bool free_pages_prepare(struct page *page,
|
||||
}
|
||||
arch_free_page(page, order);
|
||||
kernel_poison_pages(page, 1 << order, 0);
|
||||
kernel_map_pages(page, 1 << order, 0);
|
||||
if (debug_pagealloc_enabled())
|
||||
kernel_map_pages(page, 1 << order, 0);
|
||||
|
||||
kasan_free_nondeferred_pages(page, order);
|
||||
|
||||
return true;
|
||||
@ -2014,7 +2016,8 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
|
||||
set_page_refcounted(page);
|
||||
|
||||
arch_alloc_page(page, order);
|
||||
kernel_map_pages(page, 1 << order, 1);
|
||||
if (debug_pagealloc_enabled())
|
||||
kernel_map_pages(page, 1 << order, 1);
|
||||
kasan_alloc_pages(page, order);
|
||||
kernel_poison_pages(page, 1 << order, 1);
|
||||
set_page_owner(page, order, gfp_flags);
|
||||
|
113
mm/vmalloc.c
113
mm/vmalloc.c
@ -18,6 +18,7 @@
|
||||
#include <linux/interrupt.h>
|
||||
#include <linux/proc_fs.h>
|
||||
#include <linux/seq_file.h>
|
||||
#include <linux/set_memory.h>
|
||||
#include <linux/debugobjects.h>
|
||||
#include <linux/kallsyms.h>
|
||||
#include <linux/list.h>
|
||||
@ -1059,24 +1060,9 @@ static void vb_free(const void *addr, unsigned long size)
|
||||
spin_unlock(&vb->lock);
|
||||
}
|
||||
|
||||
/**
|
||||
* vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer
|
||||
*
|
||||
* The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily
|
||||
* to amortize TLB flushing overheads. What this means is that any page you
|
||||
* have now, may, in a former life, have been mapped into kernel virtual
|
||||
* address by the vmap layer and so there might be some CPUs with TLB entries
|
||||
* still referencing that page (additional to the regular 1:1 kernel mapping).
|
||||
*
|
||||
* vm_unmap_aliases flushes all such lazy mappings. After it returns, we can
|
||||
* be sure that none of the pages we have control over will have any aliases
|
||||
* from the vmap layer.
|
||||
*/
|
||||
void vm_unmap_aliases(void)
|
||||
static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush)
|
||||
{
|
||||
unsigned long start = ULONG_MAX, end = 0;
|
||||
int cpu;
|
||||
int flush = 0;
|
||||
|
||||
if (unlikely(!vmap_initialized))
|
||||
return;
|
||||
@ -1113,6 +1099,27 @@ void vm_unmap_aliases(void)
|
||||
flush_tlb_kernel_range(start, end);
|
||||
mutex_unlock(&vmap_purge_lock);
|
||||
}
|
||||
|
||||
/**
|
||||
* vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer
|
||||
*
|
||||
* The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily
|
||||
* to amortize TLB flushing overheads. What this means is that any page you
|
||||
* have now, may, in a former life, have been mapped into kernel virtual
|
||||
* address by the vmap layer and so there might be some CPUs with TLB entries
|
||||
* still referencing that page (additional to the regular 1:1 kernel mapping).
|
||||
*
|
||||
* vm_unmap_aliases flushes all such lazy mappings. After it returns, we can
|
||||
* be sure that none of the pages we have control over will have any aliases
|
||||
* from the vmap layer.
|
||||
*/
|
||||
void vm_unmap_aliases(void)
|
||||
{
|
||||
unsigned long start = ULONG_MAX, end = 0;
|
||||
int flush = 0;
|
||||
|
||||
_vm_unmap_aliases(start, end, flush);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(vm_unmap_aliases);
|
||||
|
||||
/**
|
||||
@ -1505,6 +1512,72 @@ struct vm_struct *remove_vm_area(const void *addr)
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static inline void set_area_direct_map(const struct vm_struct *area,
|
||||
int (*set_direct_map)(struct page *page))
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < area->nr_pages; i++)
|
||||
if (page_address(area->pages[i]))
|
||||
set_direct_map(area->pages[i]);
|
||||
}
|
||||
|
||||
/* Handle removing and resetting vm mappings related to the vm_struct. */
|
||||
static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages)
|
||||
{
|
||||
unsigned long addr = (unsigned long)area->addr;
|
||||
unsigned long start = ULONG_MAX, end = 0;
|
||||
int flush_reset = area->flags & VM_FLUSH_RESET_PERMS;
|
||||
int i;
|
||||
|
||||
/*
|
||||
* The below block can be removed when all architectures that have
|
||||
* direct map permissions also have set_direct_map_() implementations.
|
||||
* This is concerned with resetting the direct map any an vm alias with
|
||||
* execute permissions, without leaving a RW+X window.
|
||||
*/
|
||||
if (flush_reset && !IS_ENABLED(CONFIG_ARCH_HAS_SET_DIRECT_MAP)) {
|
||||
set_memory_nx(addr, area->nr_pages);
|
||||
set_memory_rw(addr, area->nr_pages);
|
||||
}
|
||||
|
||||
remove_vm_area(area->addr);
|
||||
|
||||
/* If this is not VM_FLUSH_RESET_PERMS memory, no need for the below. */
|
||||
if (!flush_reset)
|
||||
return;
|
||||
|
||||
/*
|
||||
* If not deallocating pages, just do the flush of the VM area and
|
||||
* return.
|
||||
*/
|
||||
if (!deallocate_pages) {
|
||||
vm_unmap_aliases();
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* If execution gets here, flush the vm mapping and reset the direct
|
||||
* map. Find the start and end range of the direct mappings to make sure
|
||||
* the vm_unmap_aliases() flush includes the direct map.
|
||||
*/
|
||||
for (i = 0; i < area->nr_pages; i++) {
|
||||
if (page_address(area->pages[i])) {
|
||||
start = min(addr, start);
|
||||
end = max(addr, end);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Set direct map to something invalid so that it won't be cached if
|
||||
* there are any accesses after the TLB flush, then flush the TLB and
|
||||
* reset the direct map permissions to the default.
|
||||
*/
|
||||
set_area_direct_map(area, set_direct_map_invalid_noflush);
|
||||
_vm_unmap_aliases(start, end, 1);
|
||||
set_area_direct_map(area, set_direct_map_default_noflush);
|
||||
}
|
||||
|
||||
static void __vunmap(const void *addr, int deallocate_pages)
|
||||
{
|
||||
struct vm_struct *area;
|
||||
@ -1526,7 +1599,8 @@ static void __vunmap(const void *addr, int deallocate_pages)
|
||||
debug_check_no_locks_freed(area->addr, get_vm_area_size(area));
|
||||
debug_check_no_obj_freed(area->addr, get_vm_area_size(area));
|
||||
|
||||
remove_vm_area(addr);
|
||||
vm_remove_mappings(area, deallocate_pages);
|
||||
|
||||
if (deallocate_pages) {
|
||||
int i;
|
||||
|
||||
@ -1961,8 +2035,9 @@ EXPORT_SYMBOL(vzalloc_node);
|
||||
*/
|
||||
void *vmalloc_exec(unsigned long size)
|
||||
{
|
||||
return __vmalloc_node(size, 1, GFP_KERNEL, PAGE_KERNEL_EXEC,
|
||||
NUMA_NO_NODE, __builtin_return_address(0));
|
||||
return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
|
||||
GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS,
|
||||
NUMA_NO_NODE, __builtin_return_address(0));
|
||||
}
|
||||
|
||||
#if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32)
|
||||
|
Loading…
Reference in New Issue
Block a user