1d798ca3f1
Hugh has pointed out that the compound_head() call can be unsafe in some contexts. Here's one example: CPU0 CPU1 isolate_migratepages_block() page_count() compound_head() !!PageTail() == true put_page() tail->first_page = NULL head = tail->first_page alloc_pages(__GFP_COMP) prep_compound_page() tail->first_page = head __SetPageTail(p); !!PageTail() == true <head == NULL dereferencing> The race is purely theoretical. I don't think it's possible to trigger it in practice. But who knows. We can fix the race by changing how we encode PageTail() and compound_head() within struct page so that they can be updated in one shot. The patch introduces page->compound_head into the third double word block, in front of compound_dtor and compound_order. Bit 0 encodes PageTail() and the remaining bits are a pointer to the head page if bit zero is set. The patch moves page->pmd_huge_pte out of the word, just in case an architecture defines pgtable_t as something that can have bit 0 set. hugetlb_cgroup uses page->lru.next in the second tail page to store a pointer to struct hugetlb_cgroup. The patch switches it to use page->private in the second tail page instead. The space is free since ->first_page is removed from the union. The patch also opens the possibility of removing the HUGETLB_CGROUP_MIN_ORDER limitation, since there's now space in the first tail page to store the struct hugetlb_cgroup pointer. But that's out of scope for the patch. That means page->compound_head shares storage space with: - page->lru.next; - page->next; - page->rcu_head.next; That's too long a list to be absolutely sure, but it looks like nobody uses bit 0 of the word. page->rcu_head.next is guaranteed[1] to have bit 0 clean as long as we use call_rcu(), call_rcu_bh(), call_rcu_sched(), or call_srcu(). But a future call_rcu_lazy() is not allowed, as it makes use of the bit and we could get a false positive PageTail(). [1] http://lkml.kernel.org/g/20150827163634.GD4029@linux.vnet.ibm.com Signed-off-by: Kirill A. 
Shutemov <kirill.shutemov@linux.intel.com> Acked-by: Michal Hocko <mhocko@suse.com> Reviewed-by: Andrea Arcangeli <aarcange@redhat.com> Cc: Hugh Dickins <hughd@google.com> Cc: David Rientjes <rientjes@google.com> Cc: Vlastimil Babka <vbabka@suse.cz> Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com> Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
241 lines
6.6 KiB
C
241 lines
6.6 KiB
C
/*
 * mm/debug.c
 *
 * mm/ specific debug routines.
 *
 */
|
|
|
|
#include <linux/kernel.h>
|
|
#include <linux/mm.h>
|
|
#include <linux/trace_events.h>
|
|
#include <linux/memcontrol.h>
|
|
|
|
static const struct trace_print_flags pageflag_names[] = {
|
|
{1UL << PG_locked, "locked" },
|
|
{1UL << PG_error, "error" },
|
|
{1UL << PG_referenced, "referenced" },
|
|
{1UL << PG_uptodate, "uptodate" },
|
|
{1UL << PG_dirty, "dirty" },
|
|
{1UL << PG_lru, "lru" },
|
|
{1UL << PG_active, "active" },
|
|
{1UL << PG_slab, "slab" },
|
|
{1UL << PG_owner_priv_1, "owner_priv_1" },
|
|
{1UL << PG_arch_1, "arch_1" },
|
|
{1UL << PG_reserved, "reserved" },
|
|
{1UL << PG_private, "private" },
|
|
{1UL << PG_private_2, "private_2" },
|
|
{1UL << PG_writeback, "writeback" },
|
|
{1UL << PG_head, "head" },
|
|
{1UL << PG_swapcache, "swapcache" },
|
|
{1UL << PG_mappedtodisk, "mappedtodisk" },
|
|
{1UL << PG_reclaim, "reclaim" },
|
|
{1UL << PG_swapbacked, "swapbacked" },
|
|
{1UL << PG_unevictable, "unevictable" },
|
|
#ifdef CONFIG_MMU
|
|
{1UL << PG_mlocked, "mlocked" },
|
|
#endif
|
|
#ifdef CONFIG_ARCH_USES_PG_UNCACHED
|
|
{1UL << PG_uncached, "uncached" },
|
|
#endif
|
|
#ifdef CONFIG_MEMORY_FAILURE
|
|
{1UL << PG_hwpoison, "hwpoison" },
|
|
#endif
|
|
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
|
|
{1UL << PG_compound_lock, "compound_lock" },
|
|
#endif
|
|
#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT)
|
|
{1UL << PG_young, "young" },
|
|
{1UL << PG_idle, "idle" },
|
|
#endif
|
|
};
|
|
|
|
static void dump_flags(unsigned long flags,
|
|
const struct trace_print_flags *names, int count)
|
|
{
|
|
const char *delim = "";
|
|
unsigned long mask;
|
|
int i;
|
|
|
|
pr_emerg("flags: %#lx(", flags);
|
|
|
|
/* remove zone id */
|
|
flags &= (1UL << NR_PAGEFLAGS) - 1;
|
|
|
|
for (i = 0; i < count && flags; i++) {
|
|
|
|
mask = names[i].mask;
|
|
if ((flags & mask) != mask)
|
|
continue;
|
|
|
|
flags &= ~mask;
|
|
pr_cont("%s%s", delim, names[i].name);
|
|
delim = "|";
|
|
}
|
|
|
|
/* check for left over flags */
|
|
if (flags)
|
|
pr_cont("%s%#lx", delim, flags);
|
|
|
|
pr_cont(")\n");
|
|
}
|
|
|
|
void dump_page_badflags(struct page *page, const char *reason,
|
|
unsigned long badflags)
|
|
{
|
|
pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
|
|
page, atomic_read(&page->_count), page_mapcount(page),
|
|
page->mapping, page->index);
|
|
BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS);
|
|
dump_flags(page->flags, pageflag_names, ARRAY_SIZE(pageflag_names));
|
|
if (reason)
|
|
pr_alert("page dumped because: %s\n", reason);
|
|
if (page->flags & badflags) {
|
|
pr_alert("bad because of flags:\n");
|
|
dump_flags(page->flags & badflags,
|
|
pageflag_names, ARRAY_SIZE(pageflag_names));
|
|
}
|
|
#ifdef CONFIG_MEMCG
|
|
if (page->mem_cgroup)
|
|
pr_alert("page->mem_cgroup:%p\n", page->mem_cgroup);
|
|
#endif
|
|
}
|
|
|
|
/*
 * dump_page - print a description of a page with no flags singled out.
 * @page: page to describe
 * @reason: text handed through unchanged to dump_page_badflags()
 *
 * Same as calling dump_page_badflags() with an empty bad-flags mask.
 */
void dump_page(struct page *page, const char *reason)
{
	const unsigned long no_badflags = 0;

	dump_page_badflags(page, reason, no_badflags);
}
EXPORT_SYMBOL(dump_page);
|
|
|
|
#ifdef CONFIG_DEBUG_VM
|
|
|
|
static const struct trace_print_flags vmaflags_names[] = {
|
|
{VM_READ, "read" },
|
|
{VM_WRITE, "write" },
|
|
{VM_EXEC, "exec" },
|
|
{VM_SHARED, "shared" },
|
|
{VM_MAYREAD, "mayread" },
|
|
{VM_MAYWRITE, "maywrite" },
|
|
{VM_MAYEXEC, "mayexec" },
|
|
{VM_MAYSHARE, "mayshare" },
|
|
{VM_GROWSDOWN, "growsdown" },
|
|
{VM_PFNMAP, "pfnmap" },
|
|
{VM_DENYWRITE, "denywrite" },
|
|
{VM_LOCKONFAULT, "lockonfault" },
|
|
{VM_LOCKED, "locked" },
|
|
{VM_IO, "io" },
|
|
{VM_SEQ_READ, "seqread" },
|
|
{VM_RAND_READ, "randread" },
|
|
{VM_DONTCOPY, "dontcopy" },
|
|
{VM_DONTEXPAND, "dontexpand" },
|
|
{VM_ACCOUNT, "account" },
|
|
{VM_NORESERVE, "noreserve" },
|
|
{VM_HUGETLB, "hugetlb" },
|
|
#if defined(CONFIG_X86)
|
|
{VM_PAT, "pat" },
|
|
#elif defined(CONFIG_PPC)
|
|
{VM_SAO, "sao" },
|
|
#elif defined(CONFIG_PARISC) || defined(CONFIG_METAG) || defined(CONFIG_IA64)
|
|
{VM_GROWSUP, "growsup" },
|
|
#elif !defined(CONFIG_MMU)
|
|
{VM_MAPPED_COPY, "mappedcopy" },
|
|
#else
|
|
{VM_ARCH_1, "arch_1" },
|
|
#endif
|
|
{VM_DONTDUMP, "dontdump" },
|
|
#ifdef CONFIG_MEM_SOFT_DIRTY
|
|
{VM_SOFTDIRTY, "softdirty" },
|
|
#endif
|
|
{VM_MIXEDMAP, "mixedmap" },
|
|
{VM_HUGEPAGE, "hugepage" },
|
|
{VM_NOHUGEPAGE, "nohugepage" },
|
|
{VM_MERGEABLE, "mergeable" },
|
|
};
|
|
|
|
void dump_vma(const struct vm_area_struct *vma)
|
|
{
|
|
pr_emerg("vma %p start %p end %p\n"
|
|
"next %p prev %p mm %p\n"
|
|
"prot %lx anon_vma %p vm_ops %p\n"
|
|
"pgoff %lx file %p private_data %p\n",
|
|
vma, (void *)vma->vm_start, (void *)vma->vm_end, vma->vm_next,
|
|
vma->vm_prev, vma->vm_mm,
|
|
(unsigned long)pgprot_val(vma->vm_page_prot),
|
|
vma->anon_vma, vma->vm_ops, vma->vm_pgoff,
|
|
vma->vm_file, vma->vm_private_data);
|
|
dump_flags(vma->vm_flags, vmaflags_names, ARRAY_SIZE(vmaflags_names));
|
|
}
|
|
EXPORT_SYMBOL(dump_vma);
|
|
|
|
void dump_mm(const struct mm_struct *mm)
|
|
{
|
|
pr_emerg("mm %p mmap %p seqnum %d task_size %lu\n"
|
|
#ifdef CONFIG_MMU
|
|
"get_unmapped_area %p\n"
|
|
#endif
|
|
"mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n"
|
|
"pgd %p mm_users %d mm_count %d nr_ptes %lu nr_pmds %lu map_count %d\n"
|
|
"hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n"
|
|
"pinned_vm %lx shared_vm %lx exec_vm %lx stack_vm %lx\n"
|
|
"start_code %lx end_code %lx start_data %lx end_data %lx\n"
|
|
"start_brk %lx brk %lx start_stack %lx\n"
|
|
"arg_start %lx arg_end %lx env_start %lx env_end %lx\n"
|
|
"binfmt %p flags %lx core_state %p\n"
|
|
#ifdef CONFIG_AIO
|
|
"ioctx_table %p\n"
|
|
#endif
|
|
#ifdef CONFIG_MEMCG
|
|
"owner %p "
|
|
#endif
|
|
"exe_file %p\n"
|
|
#ifdef CONFIG_MMU_NOTIFIER
|
|
"mmu_notifier_mm %p\n"
|
|
#endif
|
|
#ifdef CONFIG_NUMA_BALANCING
|
|
"numa_next_scan %lu numa_scan_offset %lu numa_scan_seq %d\n"
|
|
#endif
|
|
#if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION)
|
|
"tlb_flush_pending %d\n"
|
|
#endif
|
|
"%s", /* This is here to hold the comma */
|
|
|
|
mm, mm->mmap, mm->vmacache_seqnum, mm->task_size,
|
|
#ifdef CONFIG_MMU
|
|
mm->get_unmapped_area,
|
|
#endif
|
|
mm->mmap_base, mm->mmap_legacy_base, mm->highest_vm_end,
|
|
mm->pgd, atomic_read(&mm->mm_users),
|
|
atomic_read(&mm->mm_count),
|
|
atomic_long_read((atomic_long_t *)&mm->nr_ptes),
|
|
mm_nr_pmds((struct mm_struct *)mm),
|
|
mm->map_count,
|
|
mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm,
|
|
mm->pinned_vm, mm->shared_vm, mm->exec_vm, mm->stack_vm,
|
|
mm->start_code, mm->end_code, mm->start_data, mm->end_data,
|
|
mm->start_brk, mm->brk, mm->start_stack,
|
|
mm->arg_start, mm->arg_end, mm->env_start, mm->env_end,
|
|
mm->binfmt, mm->flags, mm->core_state,
|
|
#ifdef CONFIG_AIO
|
|
mm->ioctx_table,
|
|
#endif
|
|
#ifdef CONFIG_MEMCG
|
|
mm->owner,
|
|
#endif
|
|
mm->exe_file,
|
|
#ifdef CONFIG_MMU_NOTIFIER
|
|
mm->mmu_notifier_mm,
|
|
#endif
|
|
#ifdef CONFIG_NUMA_BALANCING
|
|
mm->numa_next_scan, mm->numa_scan_offset, mm->numa_scan_seq,
|
|
#endif
|
|
#if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION)
|
|
mm->tlb_flush_pending,
|
|
#endif
|
|
"" /* This is here to not have a comma! */
|
|
);
|
|
|
|
dump_flags(mm->def_flags, vmaflags_names,
|
|
ARRAY_SIZE(vmaflags_names));
|
|
}
|
|
|
|
#endif /* CONFIG_DEBUG_VM */
|