kernel-ark/include/linux/memblock.h
Yinghai Lu 20e6926dcb x86, ACPI, mm: Revert movablemem_map support
Tim found:

  WARNING: at arch/x86/kernel/smpboot.c:324 topology_sane.isra.2+0x6f/0x80()
  Hardware name: S2600CP
  sched: CPU #1's llc-sibling CPU #0 is not on the same node! [node: 1 != 0]. Ignoring dependency.
  smpboot: Booting Node   1, Processors  #1
  Modules linked in:
  Pid: 0, comm: swapper/1 Not tainted 3.9.0-0-generic #1
  Call Trace:
    set_cpu_sibling_map+0x279/0x449
    start_secondary+0x11d/0x1e5

Don Morris reproduced on a HP z620 workstation, and bisected it to
commit e8d1955258 ("acpi, memory-hotplug: parse SRAT before memblock
is ready")

It turns out movable_map has some problems, and it breaks several things

1. numa_init is called several times, NOT just for srat. so those
	nodes_clear(numa_nodes_parsed)
	memset(&numa_meminfo, 0, sizeof(numa_meminfo))
   can not be just removed.  Need to consider sequence is: numaq, srat, amd, dummy.
   and make fall back path working.

2. simply split acpi_numa_init to early_parse_srat.
   a. that early_parse_srat is NOT called for ia64, so you break ia64.
   b.  for (i = 0; i < MAX_LOCAL_APIC; i++)
	     set_apicid_to_node(i, NUMA_NO_NODE)
     still left in numa_init. So it will just clear result from early_parse_srat.
     it should be moved before that....
   c.  it breaks ACPI_TABLE_OVERIDE...as the acpi table scan is moved
       early before override from INITRD is settled.

3. that patch TITLE is total misleading, there is NO x86 in the title,
   but it changes critical x86 code. It caused x86 guys did not
   pay attention to find the problem early. Those patches really should
   be routed via tip/x86/mm.

4. after that commit, following range can not use movable ram:
  a. real_mode code.... well..funny, legacy Node0 [0,1M) could be hot-removed?
  b. initrd... it will be freed after booting, so it could be on movable...
  c. crashkernel for kdump...: looks like we can not put kdump kernel above 4G
	anymore.
  d. init_mem_mapping: can not put page table high anymore.
  e. initmem_init: vmemmap can not be high local node anymore. That is
     not good.

If node is hotplugable, the mem related range like page table and
vmemmap could be on the that node without problem and should be on that
node.

We have workaround patch that could fix some problems, but some can not
be fixed.

So just remove that offending commit and related ones including:

 f7210e6c4a ("mm/memblock.c: use CONFIG_HAVE_MEMBLOCK_NODE_MAP to
    protect movablecore_map in memblock_overlaps_region().")

 01a178a94e ("acpi, memory-hotplug: support getting hotplug info from
    SRAT")

 27168d38fa ("acpi, memory-hotplug: extend movablemem_map ranges to
    the end of node")

 e8d1955258 ("acpi, memory-hotplug: parse SRAT before memblock is
    ready")

 fb06bc8e5f ("page_alloc: bootmem limit with movablecore_map")

 42f47e27e7 ("page_alloc: make movablemem_map have higher priority")

 6981ec3114 ("page_alloc: introduce zone_movable_limit[] to keep
    movable limit for nodes")

 34b71f1e04 ("page_alloc: add movable_memmap kernel parameter")

 4d59a75125 ("x86: get pg_data_t's memory from other node")

Later we should have patches that will make sure kernel put page table
and vmemmap on local node ram instead of push them down to node0.  Also
need to find way to put other kernel used ram to local node ram.

Reported-by: Tim Gardner <tim.gardner@canonical.com>
Reported-by: Don Morris <don.morris@hp.com>
Bisected-by: Don Morris <don.morris@hp.com>
Tested-by: Don Morris <don.morris@hp.com>
Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Thomas Renninger <trenn@suse.de>
Cc: Tejun Heo <tj@kernel.org>
Cc: Tang Chen <tangchen@cn.fujitsu.com>
Cc: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-03-02 09:34:39 -08:00

253 lines
8.0 KiB
C

#ifndef _LINUX_MEMBLOCK_H
#define _LINUX_MEMBLOCK_H
#ifdef __KERNEL__
#ifdef CONFIG_HAVE_MEMBLOCK
/*
* Logical memory blocks.
*
* Copyright (C) 2001 Peter Bergner, IBM Corp.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#include <linux/init.h>
#include <linux/mm.h>
#define INIT_MEMBLOCK_REGIONS 128
struct memblock_region {
phys_addr_t base;
phys_addr_t size;
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
int nid;
#endif
};
struct memblock_type {
unsigned long cnt; /* number of regions */
unsigned long max; /* size of the allocated array */
phys_addr_t total_size; /* size of all regions */
struct memblock_region *regions;
};
struct memblock {
phys_addr_t current_limit;
struct memblock_type memory;
struct memblock_type reserved;
};
extern struct memblock memblock;
extern int memblock_debug;
#define memblock_dbg(fmt, ...) \
if (memblock_debug) printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__)
phys_addr_t memblock_find_in_range_node(phys_addr_t start, phys_addr_t end,
phys_addr_t size, phys_addr_t align, int nid);
phys_addr_t memblock_find_in_range(phys_addr_t start, phys_addr_t end,
phys_addr_t size, phys_addr_t align);
phys_addr_t get_allocated_memblock_reserved_regions_info(phys_addr_t *addr);
void memblock_allow_resize(void);
int memblock_add_node(phys_addr_t base, phys_addr_t size, int nid);
int memblock_add(phys_addr_t base, phys_addr_t size);
int memblock_remove(phys_addr_t base, phys_addr_t size);
int memblock_free(phys_addr_t base, phys_addr_t size);
int memblock_reserve(phys_addr_t base, phys_addr_t size);
void memblock_trim_memory(phys_addr_t align);
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn,
unsigned long *out_end_pfn, int *out_nid);
/**
* for_each_mem_pfn_range - early memory pfn range iterator
* @i: an integer used as loop variable
* @nid: node selector, %MAX_NUMNODES for all nodes
* @p_start: ptr to ulong for start pfn of the range, can be %NULL
* @p_end: ptr to ulong for end pfn of the range, can be %NULL
* @p_nid: ptr to int for nid of the range, can be %NULL
*
* Walks over configured memory ranges.
*/
#define for_each_mem_pfn_range(i, nid, p_start, p_end, p_nid) \
for (i = -1, __next_mem_pfn_range(&i, nid, p_start, p_end, p_nid); \
i >= 0; __next_mem_pfn_range(&i, nid, p_start, p_end, p_nid))
#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
void __next_free_mem_range(u64 *idx, int nid, phys_addr_t *out_start,
phys_addr_t *out_end, int *out_nid);
/**
* for_each_free_mem_range - iterate through free memblock areas
* @i: u64 used as loop variable
* @nid: node selector, %MAX_NUMNODES for all nodes
* @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
* @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
* @p_nid: ptr to int for nid of the range, can be %NULL
*
* Walks over free (memory && !reserved) areas of memblock. Available as
* soon as memblock is initialized.
*/
#define for_each_free_mem_range(i, nid, p_start, p_end, p_nid) \
for (i = 0, \
__next_free_mem_range(&i, nid, p_start, p_end, p_nid); \
i != (u64)ULLONG_MAX; \
__next_free_mem_range(&i, nid, p_start, p_end, p_nid))
void __next_free_mem_range_rev(u64 *idx, int nid, phys_addr_t *out_start,
phys_addr_t *out_end, int *out_nid);
/**
* for_each_free_mem_range_reverse - rev-iterate through free memblock areas
* @i: u64 used as loop variable
* @nid: node selector, %MAX_NUMNODES for all nodes
* @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
* @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
* @p_nid: ptr to int for nid of the range, can be %NULL
*
* Walks over free (memory && !reserved) areas of memblock in reverse
* order. Available as soon as memblock is initialized.
*/
#define for_each_free_mem_range_reverse(i, nid, p_start, p_end, p_nid) \
for (i = (u64)ULLONG_MAX, \
__next_free_mem_range_rev(&i, nid, p_start, p_end, p_nid); \
i != (u64)ULLONG_MAX; \
__next_free_mem_range_rev(&i, nid, p_start, p_end, p_nid))
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
int memblock_set_node(phys_addr_t base, phys_addr_t size, int nid);
static inline void memblock_set_region_node(struct memblock_region *r, int nid)
{
r->nid = nid;
}
static inline int memblock_get_region_node(const struct memblock_region *r)
{
return r->nid;
}
#else
static inline void memblock_set_region_node(struct memblock_region *r, int nid)
{
}
static inline int memblock_get_region_node(const struct memblock_region *r)
{
return 0;
}
#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
phys_addr_t memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid);
phys_addr_t memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, int nid);
phys_addr_t memblock_alloc(phys_addr_t size, phys_addr_t align);
/* Flags for memblock_alloc_base() amd __memblock_alloc_base() */
#define MEMBLOCK_ALLOC_ANYWHERE (~(phys_addr_t)0)
#define MEMBLOCK_ALLOC_ACCESSIBLE 0
phys_addr_t memblock_alloc_base(phys_addr_t size, phys_addr_t align,
phys_addr_t max_addr);
phys_addr_t __memblock_alloc_base(phys_addr_t size, phys_addr_t align,
phys_addr_t max_addr);
phys_addr_t memblock_phys_mem_size(void);
phys_addr_t memblock_mem_size(unsigned long limit_pfn);
phys_addr_t memblock_start_of_DRAM(void);
phys_addr_t memblock_end_of_DRAM(void);
void memblock_enforce_memory_limit(phys_addr_t memory_limit);
int memblock_is_memory(phys_addr_t addr);
int memblock_is_region_memory(phys_addr_t base, phys_addr_t size);
int memblock_is_reserved(phys_addr_t addr);
int memblock_is_region_reserved(phys_addr_t base, phys_addr_t size);
extern void __memblock_dump_all(void);
static inline void memblock_dump_all(void)
{
if (memblock_debug)
__memblock_dump_all();
}
/**
* memblock_set_current_limit - Set the current allocation limit to allow
* limiting allocations to what is currently
* accessible during boot
* @limit: New limit value (physical address)
*/
void memblock_set_current_limit(phys_addr_t limit);
/*
* pfn conversion functions
*
* While the memory MEMBLOCKs should always be page aligned, the reserved
* MEMBLOCKs may not be. This accessor attempt to provide a very clear
* idea of what they return for such non aligned MEMBLOCKs.
*/
/**
* memblock_region_memory_base_pfn - Return the lowest pfn intersecting with the memory region
* @reg: memblock_region structure
*/
static inline unsigned long memblock_region_memory_base_pfn(const struct memblock_region *reg)
{
return PFN_UP(reg->base);
}
/**
* memblock_region_memory_end_pfn - Return the end_pfn this region
* @reg: memblock_region structure
*/
static inline unsigned long memblock_region_memory_end_pfn(const struct memblock_region *reg)
{
return PFN_DOWN(reg->base + reg->size);
}
/**
* memblock_region_reserved_base_pfn - Return the lowest pfn intersecting with the reserved region
* @reg: memblock_region structure
*/
static inline unsigned long memblock_region_reserved_base_pfn(const struct memblock_region *reg)
{
return PFN_DOWN(reg->base);
}
/**
* memblock_region_reserved_end_pfn - Return the end_pfn this region
* @reg: memblock_region structure
*/
static inline unsigned long memblock_region_reserved_end_pfn(const struct memblock_region *reg)
{
return PFN_UP(reg->base + reg->size);
}
#define for_each_memblock(memblock_type, region) \
for (region = memblock.memblock_type.regions; \
region < (memblock.memblock_type.regions + memblock.memblock_type.cnt); \
region++)
#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
#define __init_memblock __meminit
#define __initdata_memblock __meminitdata
#else
#define __init_memblock
#define __initdata_memblock
#endif
#else
static inline phys_addr_t memblock_alloc(phys_addr_t size, phys_addr_t align)
{
return 0;
}
#endif /* CONFIG_HAVE_MEMBLOCK */
#endif /* __KERNEL__ */
#endif /* _LINUX_MEMBLOCK_H */