447 lines
16 KiB
Diff
447 lines
16 KiB
Diff
|
In some cases it may happen that pmd_none_or_clear_bad() is called
|
||
|
with the mmap_sem hold in read mode. In those cases the huge page
|
||
|
faults can allocate hugepmds under pmd_none_or_clear_bad() and that
|
||
|
can trigger a false positive from pmd_bad() that will not like to see
|
||
|
a pmd materializing as trans huge.
|
||
|
|
||
|
It's not khugepaged the problem, khugepaged holds the mmap_sem in
|
||
|
write mode (and all those sites must hold the mmap_sem in read mode to
|
||
|
prevent pagetables to go away from under them, during code review it
|
||
|
seems vm86 mode on 32bit kernels requires that too unless it's
|
||
|
restricted to 1 thread per process or UP builds). The race is only
|
||
|
with the huge pagefaults that can convert a pmd_none() into a
|
||
|
pmd_trans_huge().
|
||
|
|
||
|
Effectively all these pmd_none_or_clear_bad() sites running with
|
||
|
mmap_sem in read mode are somewhat speculative with the page faults,
|
||
|
and the result is always undefined when they run simultaneously. This
|
||
|
is probably why it wasn't common to run into this. For example if the
|
||
|
madvise(MADV_DONTNEED) runs zap_page_range() shortly before the page
|
||
|
fault, the hugepage will not be zapped, if the page fault runs first
|
||
|
it will be zapped.
|
||
|
|
||
|
Altering pmd_bad() not to error out if it finds hugepmds won't be
|
||
|
enough to fix this, because zap_pmd_range would then proceed to call
|
||
|
zap_pte_range (which would be incorrect if the pmd become a
|
||
|
pmd_trans_huge()).
|
||
|
|
||
|
The simplest way to fix this is to read the pmd in the local stack
|
||
|
(regardless of what we read, no need of actual CPU barriers, only
|
||
|
compiler barrier needed), and be sure it is not changing under the
|
||
|
code that computes its value. Even if the real pmd is changing under
|
||
|
the value we hold on the stack, we don't care. If we actually end up
|
||
|
in zap_pte_range it means the pmd was not none already and it was not
|
||
|
huge, and it can't become huge from under us (khugepaged locking
|
||
|
explained above).
|
||
|
|
||
|
All we need is to enforce that there is no way anymore that in a code
|
||
|
path like below, pmd_trans_huge can be false, but
|
||
|
pmd_none_or_clear_bad can run into a hugepmd. The overhead of a
|
||
|
barrier() is just a compiler tweak and should not be measurable (I
|
||
|
only added it for THP builds). I don't exclude different compiler
|
||
|
versions may have prevented the race too by caching the value of *pmd
|
||
|
on the stack (that hasn't been verified, but it wouldn't be impossible
|
||
|
considering pmd_none_or_clear_bad, pmd_bad, pmd_trans_huge, pmd_none
|
||
|
are all inlines and there's no external function called in between
|
||
|
pmd_trans_huge and pmd_none_or_clear_bad).
|
||
|
|
||
|
if (pmd_trans_huge(*pmd)) {
|
||
|
if (next-addr != HPAGE_PMD_SIZE) {
|
||
|
VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem));
|
||
|
split_huge_page_pmd(vma->vm_mm, pmd);
|
||
|
} else if (zap_huge_pmd(tlb, vma, pmd, addr))
|
||
|
continue;
|
||
|
/* fall through */
|
||
|
}
|
||
|
if (pmd_none_or_clear_bad(pmd))
|
||
|
|
||
|
Because this race condition could be exercised without special
|
||
|
privileges this was reported in CVE-2012-1179.
|
||
|
|
||
|
The race was identified and fully explained by Ulrich who debugged it.
|
||
|
I'm quoting his accurate explanation below, for reference.
|
||
|
|
||
|
====== start quote =======
|
||
|
mapcount 0 page_mapcount 1
|
||
|
kernel BUG at mm/huge_memory.c:1384!
|
||
|
|
||
|
At some point prior to the panic, a "bad pmd ..." message similar to the
|
||
|
following is logged on the console:
|
||
|
|
||
|
mm/memory.c:145: bad pmd ffff8800376e1f98(80000000314000e7).
|
||
|
|
||
|
The "bad pmd ..." message is logged by pmd_clear_bad() before it clears
|
||
|
the page's PMD table entry.
|
||
|
|
||
|
143 void pmd_clear_bad(pmd_t *pmd)
|
||
|
144 {
|
||
|
-> 145 pmd_ERROR(*pmd);
|
||
|
146 pmd_clear(pmd);
|
||
|
147 }
|
||
|
|
||
|
After the PMD table entry has been cleared, there is an inconsistency
|
||
|
between the actual number of PMD table entries that are mapping the page
|
||
|
and the page's map count (_mapcount field in struct page). When the page
|
||
|
is subsequently reclaimed, __split_huge_page() detects this inconsistency.
|
||
|
|
||
|
1381 if (mapcount != page_mapcount(page))
|
||
|
1382 printk(KERN_ERR "mapcount %d page_mapcount %d\n",
|
||
|
1383 mapcount, page_mapcount(page));
|
||
|
-> 1384 BUG_ON(mapcount != page_mapcount(page));
|
||
|
|
||
|
The root cause of the problem is a race of two threads in a multithreaded
|
||
|
process. Thread B incurs a page fault on a virtual address that has never
|
||
|
been accessed (PMD entry is zero) while Thread A is executing an madvise()
|
||
|
system call on a virtual address within the same 2 MB (huge page) range.
|
||
|
|
||
|
virtual address space
|
||
|
.---------------------.
|
||
|
| |
|
||
|
| |
|
||
|
.-|---------------------|
|
||
|
| | |
|
||
|
| | |<-- B(fault)
|
||
|
| | |
|
||
|
2 MB | |/////////////////////|-.
|
||
|
huge < |/////////////////////| > A(range)
|
||
|
page | |/////////////////////|-'
|
||
|
| | |
|
||
|
| | |
|
||
|
'-|---------------------|
|
||
|
| |
|
||
|
| |
|
||
|
'---------------------'
|
||
|
|
||
|
- Thread A is executing an madvise(..., MADV_DONTNEED) system call
|
||
|
on the virtual address range "A(range)" shown in the picture.
|
||
|
|
||
|
sys_madvise
|
||
|
// Acquire the semaphore in shared mode.
|
||
|
down_read(¤t->mm->mmap_sem)
|
||
|
...
|
||
|
madvise_vma
|
||
|
switch (behavior)
|
||
|
case MADV_DONTNEED:
|
||
|
madvise_dontneed
|
||
|
zap_page_range
|
||
|
unmap_vmas
|
||
|
unmap_page_range
|
||
|
zap_pud_range
|
||
|
zap_pmd_range
|
||
|
//
|
||
|
// Assume that this huge page has never been accessed.
|
||
|
// I.e. content of the PMD entry is zero (not mapped).
|
||
|
//
|
||
|
if (pmd_trans_huge(*pmd)) {
|
||
|
// We don't get here due to the above assumption.
|
||
|
}
|
||
|
//
|
||
|
// Assume that Thread B incurred a page fault and
|
||
|
.---------> // sneaks in here as shown below.
|
||
|
| //
|
||
|
| if (pmd_none_or_clear_bad(pmd))
|
||
|
| {
|
||
|
| if (unlikely(pmd_bad(*pmd)))
|
||
|
| pmd_clear_bad
|
||
|
| {
|
||
|
| pmd_ERROR
|
||
|
| // Log "bad pmd ..." message here.
|
||
|
| pmd_clear
|
||
|
| // Clear the page's PMD entry.
|
||
|
| // Thread B incremented the map count
|
||
|
| // in page_add_new_anon_rmap(), but
|
||
|
| // now the page is no longer mapped
|
||
|
| // by a PMD entry (-> inconsistency).
|
||
|
| }
|
||
|
| }
|
||
|
|
|
||
|
v
|
||
|
- Thread B is handling a page fault on virtual address "B(fault)" shown
|
||
|
in the picture.
|
||
|
|
||
|
...
|
||
|
do_page_fault
|
||
|
__do_page_fault
|
||
|
// Acquire the semaphore in shared mode.
|
||
|
down_read_trylock(&mm->mmap_sem)
|
||
|
...
|
||
|
handle_mm_fault
|
||
|
if (pmd_none(*pmd) && transparent_hugepage_enabled(vma))
|
||
|
// We get here due to the above assumption (PMD entry is zero).
|
||
|
do_huge_pmd_anonymous_page
|
||
|
alloc_hugepage_vma
|
||
|
// Allocate a new transparent huge page here.
|
||
|
...
|
||
|
__do_huge_pmd_anonymous_page
|
||
|
...
|
||
|
spin_lock(&mm->page_table_lock)
|
||
|
...
|
||
|
page_add_new_anon_rmap
|
||
|
// Here we increment the page's map count (starts at -1).
|
||
|
atomic_set(&page->_mapcount, 0)
|
||
|
set_pmd_at
|
||
|
// Here we set the page's PMD entry which will be cleared
|
||
|
// when Thread A calls pmd_clear_bad().
|
||
|
...
|
||
|
spin_unlock(&mm->page_table_lock)
|
||
|
|
||
|
The mmap_sem does not prevent the race because both threads are acquiring
|
||
|
it in shared mode (down_read). Thread B holds the page_table_lock while
|
||
|
the page's map count and PMD table entry are updated. However, Thread A
|
||
|
does not synchronize on that lock.
|
||
|
====== end quote =======
|
||
|
|
||
|
Reported-by: Ulrich Obergfell <uobergfe@redhat.com>
|
||
|
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
|
||
|
---
|
||
|
arch/x86/kernel/vm86_32.c | 2 +
|
||
|
fs/proc/task_mmu.c | 9 ++++++
|
||
|
include/asm-generic/pgtable.h | 57 +++++++++++++++++++++++++++++++++++++++++
|
||
|
mm/memcontrol.c | 4 +++
|
||
|
mm/memory.c | 14 ++++++++--
|
||
|
mm/mempolicy.c | 2 +-
|
||
|
mm/mincore.c | 2 +-
|
||
|
mm/pagewalk.c | 2 +-
|
||
|
mm/swapfile.c | 4 +--
|
||
|
9 files changed, 87 insertions(+), 9 deletions(-)
|
||
|
|
||
|
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
|
||
|
index b466cab..328cb37 100644
|
||
|
--- a/arch/x86/kernel/vm86_32.c
|
||
|
+++ b/arch/x86/kernel/vm86_32.c
|
||
|
@@ -172,6 +172,7 @@ static void mark_screen_rdonly(struct mm_struct *mm)
|
||
|
spinlock_t *ptl;
|
||
|
int i;
|
||
|
|
||
|
+ down_write(&mm->mmap_sem);
|
||
|
pgd = pgd_offset(mm, 0xA0000);
|
||
|
if (pgd_none_or_clear_bad(pgd))
|
||
|
goto out;
|
||
|
@@ -190,6 +191,7 @@ static void mark_screen_rdonly(struct mm_struct *mm)
|
||
|
}
|
||
|
pte_unmap_unlock(pte, ptl);
|
||
|
out:
|
||
|
+ up_write(&mm->mmap_sem);
|
||
|
flush_tlb();
|
||
|
}
|
||
|
|
||
|
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
|
||
|
index 7dcd2a2..3efa725 100644
|
||
|
--- a/fs/proc/task_mmu.c
|
||
|
+++ b/fs/proc/task_mmu.c
|
||
|
@@ -409,6 +409,9 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
|
||
|
} else {
|
||
|
spin_unlock(&walk->mm->page_table_lock);
|
||
|
}
|
||
|
+
|
||
|
+ if (pmd_trans_unstable(pmd))
|
||
|
+ return 0;
|
||
|
/*
|
||
|
* The mmap_sem held all the way back in m_start() is what
|
||
|
* keeps khugepaged out of here and from collapsing things
|
||
|
@@ -507,6 +510,8 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
|
||
|
struct page *page;
|
||
|
|
||
|
split_huge_page_pmd(walk->mm, pmd);
|
||
|
+ if (pmd_trans_unstable(pmd))
|
||
|
+ return 0;
|
||
|
|
||
|
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
|
||
|
for (; addr != end; pte++, addr += PAGE_SIZE) {
|
||
|
@@ -670,6 +675,8 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
|
||
|
int err = 0;
|
||
|
|
||
|
split_huge_page_pmd(walk->mm, pmd);
|
||
|
+ if (pmd_trans_unstable(pmd))
|
||
|
+ return 0;
|
||
|
|
||
|
/* find the first VMA at or above 'addr' */
|
||
|
vma = find_vma(walk->mm, addr);
|
||
|
@@ -961,6 +968,8 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
|
||
|
spin_unlock(&walk->mm->page_table_lock);
|
||
|
}
|
||
|
|
||
|
+ if (pmd_trans_unstable(pmd))
|
||
|
+ return 0;
|
||
|
orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
|
||
|
do {
|
||
|
struct page *page = can_gather_numa_stats(*pte, md->vma, addr);
|
||
|
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
|
||
|
index 76bff2b..10f8291 100644
|
||
|
--- a/include/asm-generic/pgtable.h
|
||
|
+++ b/include/asm-generic/pgtable.h
|
||
|
@@ -443,6 +443,63 @@ static inline int pmd_write(pmd_t pmd)
|
||
|
#endif /* __HAVE_ARCH_PMD_WRITE */
|
||
|
#endif
|
||
|
|
||
|
+/*
|
||
|
+ * This function is meant to be used by sites walking pagetables with
|
||
|
+ * the mmap_sem hold in read mode to protect against MADV_DONTNEED and
|
||
|
+ * transhuge page faults. MADV_DONTNEED can convert a transhuge pmd
|
||
|
+ * into a null pmd and the transhuge page fault can convert a null pmd
|
||
|
+ * into an hugepmd or into a regular pmd (if the hugepage allocation
|
||
|
+ * fails). While holding the mmap_sem in read mode the pmd becomes
|
||
|
+ * stable and stops changing under us only if it's not null and not a
|
||
|
+ * transhuge pmd. When those races occurs and this function makes a
|
||
|
+ * difference vs the standard pmd_none_or_clear_bad, the result is
|
||
|
+ * undefined so behaving like if the pmd was none is safe (because it
|
||
|
+ * can return none anyway). The compiler level barrier() is critically
|
||
|
+ * important to compute the two checks atomically on the same pmdval.
|
||
|
+ */
|
||
|
+static inline int pmd_none_or_trans_huge_or_clear_bad(pmd_t *pmd)
|
||
|
+{
|
||
|
+ /* depend on compiler for an atomic pmd read */
|
||
|
+ pmd_t pmdval = *pmd;
|
||
|
+ /*
|
||
|
+ * The barrier will stabilize the pmdval in a register or on
|
||
|
+ * the stack so that it will stop changing under the code.
|
||
|
+ */
|
||
|
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
|
||
|
+ barrier();
|
||
|
+#endif
|
||
|
+ if (pmd_none(pmdval))
|
||
|
+ return 1;
|
||
|
+ if (unlikely(pmd_bad(pmdval))) {
|
||
|
+ if (!pmd_trans_huge(pmdval))
|
||
|
+ pmd_clear_bad(pmd);
|
||
|
+ return 1;
|
||
|
+ }
|
||
|
+ return 0;
|
||
|
+}
|
||
|
+
|
||
|
+/*
|
||
|
+ * This is a noop if Transparent Hugepage Support is not built into
|
||
|
+ * the kernel. Otherwise it is equivalent to
|
||
|
+ * pmd_none_or_trans_huge_or_clear_bad(), and shall only be called in
|
||
|
+ * places that already verified the pmd is not none and they want to
|
||
|
+ * walk ptes while holding the mmap sem in read mode (write mode don't
|
||
|
+ * need this). If THP is not enabled, the pmd can't go away under the
|
||
|
+ * code even if MADV_DONTNEED runs, but if THP is enabled we need to
|
||
|
+ * run a pmd_trans_unstable before walking the ptes after
|
||
|
+ * split_huge_page_pmd returns (because it may have run when the pmd
|
||
|
+ * become null, but then a page fault can map in a THP and not a
|
||
|
+ * regular page).
|
||
|
+ */
|
||
|
+static inline int pmd_trans_unstable(pmd_t *pmd)
|
||
|
+{
|
||
|
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
|
||
|
+ return pmd_none_or_trans_huge_or_clear_bad(pmd);
|
||
|
+#else
|
||
|
+ return 0;
|
||
|
+#endif
|
||
|
+}
|
||
|
+
|
||
|
#endif /* !__ASSEMBLY__ */
|
||
|
|
||
|
#endif /* _ASM_GENERIC_PGTABLE_H */
|
||
|
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
|
||
|
index d0e57a3..67b0578 100644
|
||
|
--- a/mm/memcontrol.c
|
||
|
+++ b/mm/memcontrol.c
|
||
|
@@ -5193,6 +5193,8 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
|
||
|
spinlock_t *ptl;
|
||
|
|
||
|
split_huge_page_pmd(walk->mm, pmd);
|
||
|
+ if (pmd_trans_unstable(pmd))
|
||
|
+ return 0;
|
||
|
|
||
|
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
|
||
|
for (; addr != end; pte++, addr += PAGE_SIZE)
|
||
|
@@ -5355,6 +5357,8 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
|
||
|
spinlock_t *ptl;
|
||
|
|
||
|
split_huge_page_pmd(walk->mm, pmd);
|
||
|
+ if (pmd_trans_unstable(pmd))
|
||
|
+ return 0;
|
||
|
retry:
|
||
|
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
|
||
|
for (; addr != end; addr += PAGE_SIZE) {
|
||
|
diff --git a/mm/memory.c b/mm/memory.c
|
||
|
index fa2f04e..e3090fc 100644
|
||
|
--- a/mm/memory.c
|
||
|
+++ b/mm/memory.c
|
||
|
@@ -1251,12 +1251,20 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
|
||
|
VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem));
|
||
|
split_huge_page_pmd(vma->vm_mm, pmd);
|
||
|
} else if (zap_huge_pmd(tlb, vma, pmd, addr))
|
||
|
- continue;
|
||
|
+ goto next;
|
||
|
/* fall through */
|
||
|
}
|
||
|
- if (pmd_none_or_clear_bad(pmd))
|
||
|
- continue;
|
||
|
+ /*
|
||
|
+ * Here there can be other concurrent MADV_DONTNEED or
|
||
|
+ * trans huge page faults running, and if the pmd is
|
||
|
+ * none or trans huge it can change under us. This is
|
||
|
+ * because MADV_DONTNEED holds the mmap_sem in read
|
||
|
+ * mode.
|
||
|
+ */
|
||
|
+ if (pmd_none_or_trans_huge_or_clear_bad(pmd))
|
||
|
+ goto next;
|
||
|
next = zap_pte_range(tlb, vma, pmd, addr, next, details);
|
||
|
+ next:
|
||
|
cond_resched();
|
||
|
} while (pmd++, addr = next, addr != end);
|
||
|
|
||
|
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
|
||
|
index 47296fe..0a37570 100644
|
||
|
--- a/mm/mempolicy.c
|
||
|
+++ b/mm/mempolicy.c
|
||
|
@@ -512,7 +512,7 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
|
||
|
do {
|
||
|
next = pmd_addr_end(addr, end);
|
||
|
split_huge_page_pmd(vma->vm_mm, pmd);
|
||
|
- if (pmd_none_or_clear_bad(pmd))
|
||
|
+ if (pmd_none_or_trans_huge_or_clear_bad(pmd))
|
||
|
continue;
|
||
|
if (check_pte_range(vma, pmd, addr, next, nodes,
|
||
|
flags, private))
|
||
|
diff --git a/mm/mincore.c b/mm/mincore.c
|
||
|
index 636a868..936b4ce 100644
|
||
|
--- a/mm/mincore.c
|
||
|
+++ b/mm/mincore.c
|
||
|
@@ -164,7 +164,7 @@ static void mincore_pmd_range(struct vm_area_struct *vma, pud_t *pud,
|
||
|
}
|
||
|
/* fall through */
|
||
|
}
|
||
|
- if (pmd_none_or_clear_bad(pmd))
|
||
|
+ if (pmd_none_or_trans_huge_or_clear_bad(pmd))
|
||
|
mincore_unmapped_range(vma, addr, next, vec);
|
||
|
else
|
||
|
mincore_pte_range(vma, pmd, addr, next, vec);
|
||
|
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
|
||
|
index 2f5cf10..aa9701e 100644
|
||
|
--- a/mm/pagewalk.c
|
||
|
+++ b/mm/pagewalk.c
|
||
|
@@ -59,7 +59,7 @@ again:
|
||
|
continue;
|
||
|
|
||
|
split_huge_page_pmd(walk->mm, pmd);
|
||
|
- if (pmd_none_or_clear_bad(pmd))
|
||
|
+ if (pmd_none_or_trans_huge_or_clear_bad(pmd))
|
||
|
goto again;
|
||
|
err = walk_pte_range(pmd, addr, next, walk);
|
||
|
if (err)
|
||
|
diff --git a/mm/swapfile.c b/mm/swapfile.c
|
||
|
index d999f09..f31b29d 100644
|
||
|
--- a/mm/swapfile.c
|
||
|
+++ b/mm/swapfile.c
|
||
|
@@ -932,9 +932,7 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
|
||
|
pmd = pmd_offset(pud, addr);
|
||
|
do {
|
||
|
next = pmd_addr_end(addr, end);
|
||
|
- if (unlikely(pmd_trans_huge(*pmd)))
|
||
|
- continue;
|
||
|
- if (pmd_none_or_clear_bad(pmd))
|
||
|
+ if (pmd_none_or_trans_huge_or_clear_bad(pmd))
|
||
|
continue;
|
||
|
ret = unuse_pte_range(vma, pmd, addr, next, entry, page);
|
||
|
if (ret)
|
||
|
|
||
|
--
|
||
|
To unsubscribe, send a message with 'unsubscribe linux-mm' in
|
||
|
the body to majordomo@kvack.org. For more info on Linux MM,
|
||
|
see: http://www.linux-mm.org/ .
|
||
|
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
|
||
|
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
|