kernel-ark/mm/migrate.c
Christoph Lameter d75a0fcda2 [PATCH] Swapless page migration: rip out swap based logic
Rip the page migration logic out.

Remove all code that has to do with swapping during page migration.

This also guts the ability to migrate pages to swap.  No one used that so lets
let it go for good.

Page migration should be a bit broken after this patch.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-23 07:42:50 -07:00

693 lines
16 KiB
C

/*
* Memory Migration functionality - linux/mm/migration.c
*
* Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter
*
* Page migration was first developed in the context of the memory hotplug
* project. The main authors of the migration code are:
*
* IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
* Hirokazu Takahashi <taka@valinux.co.jp>
* Dave Hansen <haveblue@us.ibm.com>
* Christoph Lameter <clameter@sgi.com>
*/
#include <linux/migrate.h>
#include <linux/module.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/pagemap.h>
#include <linux/buffer_head.h>
#include <linux/mm_inline.h>
#include <linux/pagevec.h>
#include <linux/rmap.h>
#include <linux/topology.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include "internal.h"
/* The maximum number of pages to take off the LRU for migration */
#define MIGRATE_CHUNK_SIZE 256
#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
/*
* Isolate one page from the LRU lists. If successful put it onto
* the indicated list with elevated page count.
*
* Result:
* -EBUSY: page not on LRU list
* 0: page removed from LRU list and added to the specified list.
*/
int isolate_lru_page(struct page *page, struct list_head *pagelist)
{
int ret = -EBUSY;
if (PageLRU(page)) {
struct zone *zone = page_zone(page);
spin_lock_irq(&zone->lru_lock);
if (PageLRU(page)) {
ret = 0;
get_page(page);
ClearPageLRU(page);
if (PageActive(page))
del_page_from_active_list(zone, page);
else
del_page_from_inactive_list(zone, page);
list_add_tail(&page->lru, pagelist);
}
spin_unlock_irq(&zone->lru_lock);
}
return ret;
}
/*
* migrate_prep() needs to be called after we have compiled the list of pages
* to be migrated using isolate_lru_page() but before we begin a series of calls
* to migrate_pages().
*/
int migrate_prep(void)
{
/*
* Clear the LRU lists so pages can be isolated.
* Note that pages may be moved off the LRU after we have
* drained them. Those pages will fail to migrate like other
* pages that may be busy.
*/
lru_add_drain_all();
return 0;
}
static inline void move_to_lru(struct page *page)
{
list_del(&page->lru);
if (PageActive(page)) {
/*
* lru_cache_add_active checks that
* the PG_active bit is off.
*/
ClearPageActive(page);
lru_cache_add_active(page);
} else {
lru_cache_add(page);
}
put_page(page);
}
/*
* Add isolated pages on the list back to the LRU.
*
* returns the number of pages put back.
*/
int putback_lru_pages(struct list_head *l)
{
struct page *page;
struct page *page2;
int count = 0;
list_for_each_entry_safe(page, page2, l, lru) {
move_to_lru(page);
count++;
}
return count;
}
static inline int is_swap_pte(pte_t pte)
{
return !pte_none(pte) && !pte_present(pte) && !pte_file(pte);
}
/*
* Restore a potential migration pte to a working pte entry
*/
static void remove_migration_pte(struct vm_area_struct *vma, unsigned long addr,
struct page *old, struct page *new)
{
struct mm_struct *mm = vma->vm_mm;
swp_entry_t entry;
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
pte_t *ptep, pte;
spinlock_t *ptl;
pgd = pgd_offset(mm, addr);
if (!pgd_present(*pgd))
return;
pud = pud_offset(pgd, addr);
if (!pud_present(*pud))
return;
pmd = pmd_offset(pud, addr);
if (!pmd_present(*pmd))
return;
ptep = pte_offset_map(pmd, addr);
if (!is_swap_pte(*ptep)) {
pte_unmap(ptep);
return;
}
ptl = pte_lockptr(mm, pmd);
spin_lock(ptl);
pte = *ptep;
if (!is_swap_pte(pte))
goto out;
entry = pte_to_swp_entry(pte);
if (!is_migration_entry(entry) || migration_entry_to_page(entry) != old)
goto out;
inc_mm_counter(mm, anon_rss);
get_page(new);
pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
if (is_write_migration_entry(entry))
pte = pte_mkwrite(pte);
set_pte_at(mm, addr, ptep, pte);
page_add_anon_rmap(new, vma, addr);
out:
pte_unmap_unlock(ptep, ptl);
}
/*
* Get rid of all migration entries and replace them by
* references to the indicated page.
*
* Must hold mmap_sem lock on at least one of the vmas containing
* the page so that the anon_vma cannot vanish.
*/
static void remove_migration_ptes(struct page *old, struct page *new)
{
struct anon_vma *anon_vma;
struct vm_area_struct *vma;
unsigned long mapping;
mapping = (unsigned long)new->mapping;
if (!mapping || (mapping & PAGE_MAPPING_ANON) == 0)
return;
/*
* We hold the mmap_sem lock. So no need to call page_lock_anon_vma.
*/
anon_vma = (struct anon_vma *) (mapping - PAGE_MAPPING_ANON);
spin_lock(&anon_vma->lock);
list_for_each_entry(vma, &anon_vma->head, anon_vma_node)
remove_migration_pte(vma, page_address_in_vma(new, vma),
old, new);
spin_unlock(&anon_vma->lock);
}
/*
* Something used the pte of a page under migration. We need to
* get to the page and wait until migration is finished.
* When we return from this function the fault will be retried.
*
* This function is called from do_swap_page().
*/
void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
unsigned long address)
{
pte_t *ptep, pte;
spinlock_t *ptl;
swp_entry_t entry;
struct page *page;
ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
pte = *ptep;
if (!is_swap_pte(pte))
goto out;
entry = pte_to_swp_entry(pte);
if (!is_migration_entry(entry))
goto out;
page = migration_entry_to_page(entry);
get_page(page);
pte_unmap_unlock(ptep, ptl);
wait_on_page_locked(page);
put_page(page);
return;
out:
pte_unmap_unlock(ptep, ptl);
}
/*
* Replace the page in the mapping.
*
* The number of remaining references must be:
* 1 for anonymous pages without a mapping
* 2 for pages with a mapping
* 3 for pages with a mapping and PagePrivate set.
*/
static int migrate_page_move_mapping(struct address_space *mapping,
struct page *newpage, struct page *page)
{
struct page **radix_pointer;
write_lock_irq(&mapping->tree_lock);
radix_pointer = (struct page **)radix_tree_lookup_slot(
&mapping->page_tree,
page_index(page));
if (!page_mapping(page) ||
page_count(page) != 2 + !!PagePrivate(page) ||
*radix_pointer != page) {
write_unlock_irq(&mapping->tree_lock);
return -EAGAIN;
}
/*
* Now we know that no one else is looking at the page.
*/
get_page(newpage);
if (PageSwapCache(page)) {
SetPageSwapCache(newpage);
set_page_private(newpage, page_private(page));
}
*radix_pointer = newpage;
__put_page(page);
write_unlock_irq(&mapping->tree_lock);
return 0;
}
/*
* Copy the page to its new location
*/
static void migrate_page_copy(struct page *newpage, struct page *page)
{
copy_highpage(newpage, page);
if (PageError(page))
SetPageError(newpage);
if (PageReferenced(page))
SetPageReferenced(newpage);
if (PageUptodate(page))
SetPageUptodate(newpage);
if (PageActive(page))
SetPageActive(newpage);
if (PageChecked(page))
SetPageChecked(newpage);
if (PageMappedToDisk(page))
SetPageMappedToDisk(newpage);
if (PageDirty(page)) {
clear_page_dirty_for_io(page);
set_page_dirty(newpage);
}
ClearPageSwapCache(page);
ClearPageActive(page);
ClearPagePrivate(page);
set_page_private(page, 0);
page->mapping = NULL;
/*
* If any waiters have accumulated on the new page then
* wake them up.
*/
if (PageWriteback(newpage))
end_page_writeback(newpage);
}
/************************************************************
* Migration functions
***********************************************************/
/* Always fail migration. Used for mappings that are not movable */
int fail_migrate_page(struct address_space *mapping,
struct page *newpage, struct page *page)
{
return -EIO;
}
EXPORT_SYMBOL(fail_migrate_page);
/*
* Common logic to directly migrate a single page suitable for
* pages that do not use PagePrivate.
*
* Pages are locked upon entry and exit.
*/
int migrate_page(struct address_space *mapping,
struct page *newpage, struct page *page)
{
int rc;
BUG_ON(PageWriteback(page)); /* Writeback must be complete */
rc = migrate_page_move_mapping(mapping, newpage, page);
if (rc)
return rc;
migrate_page_copy(newpage, page);
/*
* Remove auxiliary swap entries and replace
* them with real ptes.
*
* Note that a real pte entry will allow processes that are not
* waiting on the page lock to use the new page via the page tables
* before the new page is unlocked.
*/
remove_from_swap(newpage);
return 0;
}
EXPORT_SYMBOL(migrate_page);
/*
* Migration function for pages with buffers. This function can only be used
* if the underlying filesystem guarantees that no other references to "page"
* exist.
*/
int buffer_migrate_page(struct address_space *mapping,
struct page *newpage, struct page *page)
{
struct buffer_head *bh, *head;
int rc;
if (!page_has_buffers(page))
return migrate_page(mapping, newpage, page);
head = page_buffers(page);
rc = migrate_page_move_mapping(mapping, newpage, page);
if (rc)
return rc;
bh = head;
do {
get_bh(bh);
lock_buffer(bh);
bh = bh->b_this_page;
} while (bh != head);
ClearPagePrivate(page);
set_page_private(newpage, page_private(page));
set_page_private(page, 0);
put_page(page);
get_page(newpage);
bh = head;
do {
set_bh_page(bh, newpage, bh_offset(bh));
bh = bh->b_this_page;
} while (bh != head);
SetPagePrivate(newpage);
migrate_page_copy(newpage, page);
bh = head;
do {
unlock_buffer(bh);
put_bh(bh);
bh = bh->b_this_page;
} while (bh != head);
return 0;
}
EXPORT_SYMBOL(buffer_migrate_page);
static int fallback_migrate_page(struct address_space *mapping,
struct page *newpage, struct page *page)
{
/*
* Default handling if a filesystem does not provide
* a migration function. We can only migrate clean
* pages so try to write out any dirty pages first.
*/
if (PageDirty(page)) {
switch (pageout(page, mapping)) {
case PAGE_KEEP:
case PAGE_ACTIVATE:
return -EAGAIN;
case PAGE_SUCCESS:
/* Relock since we lost the lock */
lock_page(page);
/* Must retry since page state may have changed */
return -EAGAIN;
case PAGE_CLEAN:
; /* try to migrate the page below */
}
}
/*
* Buffers may be managed in a filesystem specific way.
* We must have no buffers or drop them.
*/
if (page_has_buffers(page) &&
!try_to_release_page(page, GFP_KERNEL))
return -EAGAIN;
return migrate_page(mapping, newpage, page);
}
/*
* migrate_pages
*
* Two lists are passed to this function. The first list
* contains the pages isolated from the LRU to be migrated.
* The second list contains new pages that the pages isolated
* can be moved to.
*
* The function returns after 10 attempts or if no pages
* are movable anymore because to has become empty
* or no retryable pages exist anymore.
*
* Return: Number of pages not migrated when "to" ran empty.
*/
int migrate_pages(struct list_head *from, struct list_head *to,
struct list_head *moved, struct list_head *failed)
{
int retry;
int nr_failed = 0;
int pass = 0;
struct page *page;
struct page *page2;
int swapwrite = current->flags & PF_SWAPWRITE;
int rc;
if (!swapwrite)
current->flags |= PF_SWAPWRITE;
redo:
retry = 0;
list_for_each_entry_safe(page, page2, from, lru) {
struct page *newpage = NULL;
struct address_space *mapping;
cond_resched();
rc = 0;
if (page_count(page) == 1)
/* page was freed from under us. So we are done. */
goto next;
if (to && list_empty(to))
break;
/*
* Skip locked pages during the first two passes to give the
* functions holding the lock time to release the page. Later we
* use lock_page() to have a higher chance of acquiring the
* lock.
*/
rc = -EAGAIN;
if (pass > 2)
lock_page(page);
else
if (TestSetPageLocked(page))
goto next;
/*
* Only wait on writeback if we have already done a pass where
* we we may have triggered writeouts for lots of pages.
*/
if (pass > 0)
wait_on_page_writeback(page);
else
if (PageWriteback(page))
goto unlock_page;
/*
* Establish swap ptes for anonymous pages or destroy pte
* maps for files.
*
* In order to reestablish file backed mappings the fault handlers
* will take the radix tree_lock which may then be used to stop
* processses from accessing this page until the new page is ready.
*
* A process accessing via a swap pte (an anonymous page) will take a
* page_lock on the old page which will block the process until the
* migration attempt is complete. At that time the PageSwapCache bit
* will be examined. If the page was migrated then the PageSwapCache
* bit will be clear and the operation to retrieve the page will be
* retried which will find the new page in the radix tree. Then a new
* direct mapping may be generated based on the radix tree contents.
*
* If the page was not migrated then the PageSwapCache bit
* is still set and the operation may continue.
*/
rc = -EPERM;
if (try_to_unmap(page, 1) == SWAP_FAIL)
/* A vma has VM_LOCKED set -> permanent failure */
goto unlock_page;
rc = -EAGAIN;
if (page_mapped(page))
goto unlock_page;
newpage = lru_to_page(to);
lock_page(newpage);
/* Prepare mapping for the new page.*/
newpage->index = page->index;
newpage->mapping = page->mapping;
/*
* Pages are properly locked and writeback is complete.
* Try to migrate the page.
*/
mapping = page_mapping(page);
if (!mapping)
goto unlock_both;
if (mapping->a_ops->migratepage)
/*
* Most pages have a mapping and most filesystems
* should provide a migration function. Anonymous
* pages are part of swap space which also has its
* own migration function. This is the most common
* path for page migration.
*/
rc = mapping->a_ops->migratepage(mapping,
newpage, page);
else
rc = fallback_migrate_page(mapping, newpage, page);
unlock_both:
unlock_page(newpage);
unlock_page:
unlock_page(page);
next:
if (rc) {
if (newpage)
newpage->mapping = NULL;
if (rc == -EAGAIN)
retry++;
else {
/* Permanent failure */
list_move(&page->lru, failed);
nr_failed++;
}
} else {
if (newpage) {
/* Successful migration. Return page to LRU */
move_to_lru(newpage);
}
list_move(&page->lru, moved);
}
}
if (retry && pass++ < 10)
goto redo;
if (!swapwrite)
current->flags &= ~PF_SWAPWRITE;
return nr_failed + retry;
}
/*
* Migrate the list 'pagelist' of pages to a certain destination.
*
* Specify destination with either non-NULL vma or dest_node >= 0
* Return the number of pages not migrated or error code
*/
int migrate_pages_to(struct list_head *pagelist,
struct vm_area_struct *vma, int dest)
{
LIST_HEAD(newlist);
LIST_HEAD(moved);
LIST_HEAD(failed);
int err = 0;
unsigned long offset = 0;
int nr_pages;
struct page *page;
struct list_head *p;
redo:
nr_pages = 0;
list_for_each(p, pagelist) {
if (vma) {
/*
* The address passed to alloc_page_vma is used to
* generate the proper interleave behavior. We fake
* the address here by an increasing offset in order
* to get the proper distribution of pages.
*
* No decision has been made as to which page
* a certain old page is moved to so we cannot
* specify the correct address.
*/
page = alloc_page_vma(GFP_HIGHUSER, vma,
offset + vma->vm_start);
offset += PAGE_SIZE;
}
else
page = alloc_pages_node(dest, GFP_HIGHUSER, 0);
if (!page) {
err = -ENOMEM;
goto out;
}
list_add_tail(&page->lru, &newlist);
nr_pages++;
if (nr_pages > MIGRATE_CHUNK_SIZE)
break;
}
err = migrate_pages(pagelist, &newlist, &moved, &failed);
putback_lru_pages(&moved); /* Call release pages instead ?? */
if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist))
goto redo;
out:
/* Return leftover allocated pages */
while (!list_empty(&newlist)) {
page = list_entry(newlist.next, struct page, lru);
list_del(&page->lru);
__free_page(page);
}
list_splice(&failed, pagelist);
if (err < 0)
return err;
/* Calculate number of leftover pages */
nr_pages = 0;
list_for_each(p, pagelist)
nr_pages++;
return nr_pages;
}