diff --git a/include/linux/mm.h b/include/linux/mm.h
index 450fc977ed02..ed6407d1b7b5 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1132,6 +1132,8 @@ struct zap_details {
 	struct address_space *check_mapping;	/* Check page->mapping if set */
 	pgoff_t	first_index;			/* Lowest page->index to unmap */
 	pgoff_t	last_index;			/* Highest page->index to unmap */
+	bool ignore_dirty;			/* Ignore dirty pages */
+	bool check_swap_entries;		/* Check also swap entries */
 };
 
 struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
diff --git a/mm/internal.h b/mm/internal.h
index 7449392c6faa..b79abb6721cf 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -38,6 +38,11 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
 		unsigned long floor, unsigned long ceiling);
 
+void unmap_page_range(struct mmu_gather *tlb,
+			struct vm_area_struct *vma,
+			unsigned long addr, unsigned long end,
+			struct zap_details *details);
+
 extern int __do_page_cache_readahead(struct address_space *mapping,
 		struct file *filp, pgoff_t offset, unsigned long nr_to_read,
 		unsigned long lookahead_size);
diff --git a/mm/memory.c b/mm/memory.c
index 81dca0083fcd..098f00d05461 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1102,6 +1102,12 @@ again:
 			if (!PageAnon(page)) {
 				if (pte_dirty(ptent)) {
+					/*
+					 * oom_reaper cannot tear down dirty
+					 * pages
+					 */
+					if (unlikely(details && details->ignore_dirty))
+						continue;
 					force_flush = 1;
 					set_page_dirty(page);
 				}
@@ -1120,8 +1126,8 @@ again:
 			}
 			continue;
 		}
-		/* If details->check_mapping, we leave swap entries. */
-		if (unlikely(details))
+		/* only check swap_entries if explicitly asked for in details */
+		if (unlikely(details && !details->check_swap_entries))
 			continue;
 
 		entry = pte_to_swp_entry(ptent);
@@ -1226,7 +1232,7 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
 	return addr;
 }
 
-static void unmap_page_range(struct mmu_gather *tlb,
+void unmap_page_range(struct mmu_gather *tlb,
 			struct vm_area_struct *vma,
 			unsigned long addr, unsigned long end,
 			struct zap_details *details)
@@ -1234,9 +1240,6 @@ static void unmap_page_range(struct mmu_gather *tlb,
 	pgd_t *pgd;
 	unsigned long next;
 
-	if (details && !details->check_mapping)
-		details = NULL;
-
 	BUG_ON(addr >= end);
 	tlb_start_vma(tlb, vma);
 	pgd = pgd_offset(vma->vm_mm, addr);
@@ -2432,7 +2435,7 @@ static inline void unmap_mapping_range_tree(struct rb_root *root,
 void unmap_mapping_range(struct address_space *mapping,
 		loff_t const holebegin, loff_t const holelen, int even_cows)
 {
-	struct zap_details details;
+	struct zap_details details = { };
 	pgoff_t hba = holebegin >> PAGE_SHIFT;
 	pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
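For context, the two flags added to struct zap_details change zap_pte_range() as follows: ignore_dirty skips dirty file-backed pages that would otherwise need writeback (which the OOM reaper cannot afford), and check_swap_entries opts in to freeing swap entries even when a non-NULL details is supplied. A minimal sketch of a caller, assuming the kernel context of this patch; the helper name reap_one_vma is hypothetical (the real caller is __oom_reap_vmas() in the mm/oom_kill.c hunk below):

    /* Hypothetical caller, illustration only: ask unmap_page_range() to
     * tear down whatever it safely can, mirroring the designated
     * initializer used by the OOM reaper below. */
    static void reap_one_vma(struct mmu_gather *tlb, struct vm_area_struct *vma)
    {
    	struct zap_details details = {
    		.check_swap_entries = true,	/* also drop swap entries */
    		.ignore_dirty = true,		/* skip pages needing writeback */
    	};

    	unmap_page_range(tlb, vma, vma->vm_start, vma->vm_end, &details);
    }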
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 06f7e1707847..f7ed6ece0719 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -35,6 +35,11 @@
 #include <linux/freezer.h>
 #include <linux/ftrace.h>
 #include <linux/ratelimit.h>
+#include <linux/kthread.h>
+#include <linux/init.h>
+
+#include <asm/tlb.h>
+#include "internal.h"
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/oom.h>
@@ -405,6 +410,133 @@ static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait);
 
 bool oom_killer_disabled __read_mostly;
 
+#ifdef CONFIG_MMU
+/*
+ * OOM Reaper kernel thread which tries to reap the memory used by the OOM
+ * victim (if that is possible) to help the OOM killer to move on.
+ */
+static struct task_struct *oom_reaper_th;
+static struct mm_struct *mm_to_reap;
+static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
+
+static bool __oom_reap_vmas(struct mm_struct *mm)
+{
+	struct mmu_gather tlb;
+	struct vm_area_struct *vma;
+	struct zap_details details = {.check_swap_entries = true,
+				      .ignore_dirty = true};
+	bool ret = true;
+
+	/* We might have raced with exit path */
+	if (!atomic_inc_not_zero(&mm->mm_users))
+		return true;
+
+	if (!down_read_trylock(&mm->mmap_sem)) {
+		ret = false;
+		goto out;
+	}
+
+	tlb_gather_mmu(&tlb, mm, 0, -1);
+	for (vma = mm->mmap ; vma; vma = vma->vm_next) {
+		if (is_vm_hugetlb_page(vma))
+			continue;
+
+		/*
+		 * mlocked VMAs require explicit munlocking before unmap.
+		 * Let's keep it simple here and skip such VMAs.
+		 */
+		if (vma->vm_flags & VM_LOCKED)
+			continue;
+
+		/*
+		 * Only anonymous pages have a good chance to be dropped
+		 * without additional steps which we cannot afford as we
+		 * are OOM already.
+		 *
+		 * We do not even care about fs backed pages because all
+		 * which are reclaimable have already been reclaimed and
+		 * we do not want to block exit_mmap by keeping mm ref
+		 * count elevated without a good reason.
+		 */
+		if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED))
+			unmap_page_range(&tlb, vma, vma->vm_start, vma->vm_end,
+					 &details);
+	}
+	tlb_finish_mmu(&tlb, 0, -1);
+	up_read(&mm->mmap_sem);
+out:
+	mmput(mm);
+	return ret;
+}
+
+static void oom_reap_vmas(struct mm_struct *mm)
+{
+	int attempts = 0;
+
+	/* Retry the down_read_trylock(mmap_sem) a few times */
+	while (attempts++ < 10 && !__oom_reap_vmas(mm))
+		schedule_timeout_idle(HZ/10);
+
+	/* Drop a reference taken by wake_oom_reaper */
+	mmdrop(mm);
+}
+
+static int oom_reaper(void *unused)
+{
+	while (true) {
+		struct mm_struct *mm;
+
+		wait_event_freezable(oom_reaper_wait,
+				     (mm = READ_ONCE(mm_to_reap)));
+		oom_reap_vmas(mm);
+		WRITE_ONCE(mm_to_reap, NULL);
+	}
+
+	return 0;
+}
+
+static void wake_oom_reaper(struct mm_struct *mm)
+{
+	struct mm_struct *old_mm;
+
+	if (!oom_reaper_th)
+		return;
+
+	/*
+	 * Pin the given mm. Use mm_count instead of mm_users because
+	 * we do not want to delay the address space tear down.
+	 */
+	atomic_inc(&mm->mm_count);
+
+	/*
+	 * Make sure that only a single mm is ever queued for the reaper
+	 * because multiple are not necessary and the operation might be
+	 * disruptive so better reduce it to the bare minimum.
+	 */
+	old_mm = cmpxchg(&mm_to_reap, NULL, mm);
+	if (!old_mm)
+		wake_up(&oom_reaper_wait);
+	else
+		mmdrop(mm);
+}
+
+static int __init oom_init(void)
+{
+	oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper");
+	if (IS_ERR(oom_reaper_th)) {
+		pr_err("Unable to start OOM reaper %ld. Continuing regardless\n",
+				PTR_ERR(oom_reaper_th));
+		oom_reaper_th = NULL;
+	}
+	return 0;
+}
+subsys_initcall(oom_init)
+#else
+static void wake_oom_reaper(struct mm_struct *mm)
+{
+}
+#endif
+
 /**
  * mark_oom_victim - mark the given task as OOM victim
  * @tsk: task to mark
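The mm_to_reap handoff above is deliberately a single-slot queue rather than a list: cmpxchg() publishes at most one mm, the reaper clears the slot only after it has finished, and a waker that finds the slot occupied simply drops its pinning reference with mmdrop(). A self-contained C11 analogue of the same pattern, offered as a userspace sketch only (the kernel waits on a waitqueue where this sketch polls, and the item type and names are made up):

    #include <stdatomic.h>
    #include <stdio.h>
    #include <threads.h>
    #include <time.h>

    static _Atomic(int *) slot;		/* plays the role of mm_to_reap */

    static int reaper(void *unused)	/* plays the role of oom_reaper() */
    {
    	int *item;

    	(void)unused;
    	for (;;) {
    		while (!(item = atomic_load(&slot)))
    			thrd_yield();	/* kernel: wait_event_freezable() */
    		printf("reaping item %d\n", *item);
    		atomic_store(&slot, NULL);	/* reopen the slot when done */
    	}
    	return 0;
    }

    static void wake_reaper(int *item)	/* plays the role of wake_oom_reaper() */
    {
    	int *expected = NULL;

    	/* succeeds only if empty, like cmpxchg(&mm_to_reap, NULL, mm) */
    	if (!atomic_compare_exchange_strong(&slot, &expected, item))
    		fprintf(stderr, "slot busy, dropping item\n"); /* kernel: mmdrop() */
    }

    int main(void)
    {
    	thrd_t th;
    	int v1 = 1, v2 = 2;

    	thrd_create(&th, reaper, NULL);
    	wake_reaper(&v1);
    	wake_reaper(&v2);	/* likely dropped unless v1 was already reaped */
    	thrd_sleep(&(struct timespec){ .tv_sec = 1 }, NULL);
    	return 0;
    }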
@@ -510,6 +642,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
 	unsigned int victim_points = 0;
 	static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
 					      DEFAULT_RATELIMIT_BURST);
+	bool can_oom_reap = true;
 
 	/*
 	 * If the task is already exiting, don't alarm the sysadmin or kill
@@ -600,17 +733,23 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
 			continue;
 		if (same_thread_group(p, victim))
 			continue;
-		if (unlikely(p->flags & PF_KTHREAD))
+		if (unlikely(p->flags & PF_KTHREAD) || is_global_init(p) ||
+		    p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
+			/*
+			 * We cannot use oom_reaper for the mm shared by this
+			 * process because it wouldn't get killed and so the
+			 * memory might be still used.
+			 */
+			can_oom_reap = false;
 			continue;
-		if (is_global_init(p))
-			continue;
-		if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
-			continue;
-
+		}
 		do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true);
 	}
 	rcu_read_unlock();
 
+	if (can_oom_reap)
+		wake_oom_reaper(mm);
+
 	mmdrop(mm);
 	put_task_struct(victim);
 }
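A final note on the reference counting above: wake_oom_reaper() pins the mm with mm_count, which only guarantees that struct mm_struct stays allocated, so exit_mmap() remains free to tear the address space down in parallel; __oom_reap_vmas() then tries to upgrade to a real user reference with atomic_inc_not_zero(&mm->mm_users) and backs off if the exit path has already dropped the last user. A generic userspace sketch of that try-to-upgrade step, with made-up names rather than kernel API:

    #include <stdatomic.h>
    #include <stdbool.h>

    struct obj {
    	atomic_int users;	/* like mm->mm_users: contents still live */
    	atomic_int count;	/* like mm->mm_count: struct stays allocated */
    };

    /* analogue of atomic_inc_not_zero(&mm->mm_users) in __oom_reap_vmas() */
    static bool obj_tryget_users(struct obj *o)
    {
    	int v = atomic_load(&o->users);

    	while (v != 0)
    		if (atomic_compare_exchange_weak(&o->users, &v, v + 1))
    			return true;	/* safe to walk; pair with a put */
    	return false;			/* raced with the exit path: hands off */
    }

    int main(void)
    {
    	struct obj o = { .users = 1, .count = 1 };

    	return obj_tryget_users(&o) ? 0 : 1;
    }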