kernel-ark/kernel/power/snapshot.c
Rafael J. Wysocki 4a3b98a422 [PATCH] swsusp: prevent possible image corruption on resume
The function free_pagedir() used by swsusp for freeing its internal data
structures clears the PG_nosave and PG_nosave_free flags for each page
being freed.

However, during resume PG_nosave_free set means that the page in
question is "unsafe" (ie.  it will be overwritten in the process of
restoring the saved system state from the image), so it should not be
used for the image data.

Therefore free_pagedir() should not clear PG_nosave_free if it's called
during resume (otherwise "unsafe" pages freed by it may be used for
storing the image data and the data may get corrupted later on).

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-04-19 09:13:49 -07:00

831 lines
20 KiB
C

/*
* linux/kernel/power/snapshot.c
*
* This file provide system snapshot/restore functionality.
*
* Copyright (C) 1998-2005 Pavel Machek <pavel@suse.cz>
*
* This file is released under the GPLv2, and is based on swsusp.c.
*
*/
#include <linux/version.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/suspend.h>
#include <linux/smp_lock.h>
#include <linux/delay.h>
#include <linux/bitops.h>
#include <linux/spinlock.h>
#include <linux/kernel.h>
#include <linux/pm.h>
#include <linux/device.h>
#include <linux/bootmem.h>
#include <linux/syscalls.h>
#include <linux/console.h>
#include <linux/highmem.h>
#include <asm/uaccess.h>
#include <asm/mmu_context.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/io.h>
#include "power.h"
struct pbe *pagedir_nosave;
static unsigned int nr_copy_pages;
static unsigned int nr_meta_pages;
static unsigned long *buffer;
#ifdef CONFIG_HIGHMEM
unsigned int count_highmem_pages(void)
{
struct zone *zone;
unsigned long zone_pfn;
unsigned int n = 0;
for_each_zone (zone)
if (is_highmem(zone)) {
mark_free_pages(zone);
for (zone_pfn = 0; zone_pfn < zone->spanned_pages; zone_pfn++) {
struct page *page;
unsigned long pfn = zone_pfn + zone->zone_start_pfn;
if (!pfn_valid(pfn))
continue;
page = pfn_to_page(pfn);
if (PageReserved(page))
continue;
if (PageNosaveFree(page))
continue;
n++;
}
}
return n;
}
struct highmem_page {
char *data;
struct page *page;
struct highmem_page *next;
};
static struct highmem_page *highmem_copy;
static int save_highmem_zone(struct zone *zone)
{
unsigned long zone_pfn;
mark_free_pages(zone);
for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) {
struct page *page;
struct highmem_page *save;
void *kaddr;
unsigned long pfn = zone_pfn + zone->zone_start_pfn;
if (!(pfn%10000))
printk(".");
if (!pfn_valid(pfn))
continue;
page = pfn_to_page(pfn);
/*
* This condition results from rvmalloc() sans vmalloc_32()
* and architectural memory reservations. This should be
* corrected eventually when the cases giving rise to this
* are better understood.
*/
if (PageReserved(page))
continue;
BUG_ON(PageNosave(page));
if (PageNosaveFree(page))
continue;
save = kmalloc(sizeof(struct highmem_page), GFP_ATOMIC);
if (!save)
return -ENOMEM;
save->next = highmem_copy;
save->page = page;
save->data = (void *) get_zeroed_page(GFP_ATOMIC);
if (!save->data) {
kfree(save);
return -ENOMEM;
}
kaddr = kmap_atomic(page, KM_USER0);
memcpy(save->data, kaddr, PAGE_SIZE);
kunmap_atomic(kaddr, KM_USER0);
highmem_copy = save;
}
return 0;
}
int save_highmem(void)
{
struct zone *zone;
int res = 0;
pr_debug("swsusp: Saving Highmem");
drain_local_pages();
for_each_zone (zone) {
if (is_highmem(zone))
res = save_highmem_zone(zone);
if (res)
return res;
}
printk("\n");
return 0;
}
int restore_highmem(void)
{
printk("swsusp: Restoring Highmem\n");
while (highmem_copy) {
struct highmem_page *save = highmem_copy;
void *kaddr;
highmem_copy = save->next;
kaddr = kmap_atomic(save->page, KM_USER0);
memcpy(kaddr, save->data, PAGE_SIZE);
kunmap_atomic(kaddr, KM_USER0);
free_page((long) save->data);
kfree(save);
}
return 0;
}
#endif
static int pfn_is_nosave(unsigned long pfn)
{
unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT;
unsigned long nosave_end_pfn = PAGE_ALIGN(__pa(&__nosave_end)) >> PAGE_SHIFT;
return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn);
}
/**
* saveable - Determine whether a page should be cloned or not.
* @pfn: The page
*
* We save a page if it's Reserved, and not in the range of pages
* statically defined as 'unsaveable', or if it isn't reserved, and
* isn't part of a free chunk of pages.
*/
static int saveable(struct zone *zone, unsigned long *zone_pfn)
{
unsigned long pfn = *zone_pfn + zone->zone_start_pfn;
struct page *page;
if (!pfn_valid(pfn))
return 0;
page = pfn_to_page(pfn);
BUG_ON(PageReserved(page) && PageNosave(page));
if (PageNosave(page))
return 0;
if (PageReserved(page) && pfn_is_nosave(pfn))
return 0;
if (PageNosaveFree(page))
return 0;
return 1;
}
unsigned int count_data_pages(void)
{
struct zone *zone;
unsigned long zone_pfn;
unsigned int n = 0;
for_each_zone (zone) {
if (is_highmem(zone))
continue;
mark_free_pages(zone);
for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
n += saveable(zone, &zone_pfn);
}
return n;
}
static void copy_data_pages(struct pbe *pblist)
{
struct zone *zone;
unsigned long zone_pfn;
struct pbe *pbe, *p;
pbe = pblist;
for_each_zone (zone) {
if (is_highmem(zone))
continue;
mark_free_pages(zone);
/* This is necessary for swsusp_free() */
for_each_pb_page (p, pblist)
SetPageNosaveFree(virt_to_page(p));
for_each_pbe (p, pblist)
SetPageNosaveFree(virt_to_page(p->address));
for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) {
if (saveable(zone, &zone_pfn)) {
struct page *page;
page = pfn_to_page(zone_pfn + zone->zone_start_pfn);
BUG_ON(!pbe);
pbe->orig_address = (unsigned long)page_address(page);
/* copy_page is not usable for copying task structs. */
memcpy((void *)pbe->address, (void *)pbe->orig_address, PAGE_SIZE);
pbe = pbe->next;
}
}
}
BUG_ON(pbe);
}
/**
* free_pagedir - free pages allocated with alloc_pagedir()
*/
static void free_pagedir(struct pbe *pblist, int clear_nosave_free)
{
struct pbe *pbe;
while (pblist) {
pbe = (pblist + PB_PAGE_SKIP)->next;
ClearPageNosave(virt_to_page(pblist));
if (clear_nosave_free)
ClearPageNosaveFree(virt_to_page(pblist));
free_page((unsigned long)pblist);
pblist = pbe;
}
}
/**
* fill_pb_page - Create a list of PBEs on a given memory page
*/
static inline void fill_pb_page(struct pbe *pbpage)
{
struct pbe *p;
p = pbpage;
pbpage += PB_PAGE_SKIP;
do
p->next = p + 1;
while (++p < pbpage);
}
/**
* create_pbe_list - Create a list of PBEs on top of a given chain
* of memory pages allocated with alloc_pagedir()
*/
static inline void create_pbe_list(struct pbe *pblist, unsigned int nr_pages)
{
struct pbe *pbpage, *p;
unsigned int num = PBES_PER_PAGE;
for_each_pb_page (pbpage, pblist) {
if (num >= nr_pages)
break;
fill_pb_page(pbpage);
num += PBES_PER_PAGE;
}
if (pbpage) {
for (num -= PBES_PER_PAGE - 1, p = pbpage; num < nr_pages; p++, num++)
p->next = p + 1;
p->next = NULL;
}
}
/**
* On resume it is necessary to trace and eventually free the unsafe
* pages that have been allocated, because they are needed for I/O
* (on x86-64 we likely will "eat" these pages once again while
* creating the temporary page translation tables)
*/
struct eaten_page {
struct eaten_page *next;
char padding[PAGE_SIZE - sizeof(void *)];
};
static struct eaten_page *eaten_pages = NULL;
static void release_eaten_pages(void)
{
struct eaten_page *p, *q;
p = eaten_pages;
while (p) {
q = p->next;
/* We don't want swsusp_free() to free this page again */
ClearPageNosave(virt_to_page(p));
free_page((unsigned long)p);
p = q;
}
eaten_pages = NULL;
}
/**
* @safe_needed - on resume, for storing the PBE list and the image,
* we can only use memory pages that do not conflict with the pages
* which had been used before suspend.
*
* The unsafe pages are marked with the PG_nosave_free flag
*
* Allocated but unusable (ie eaten) memory pages should be marked
* so that swsusp_free() can release them
*/
static inline void *alloc_image_page(gfp_t gfp_mask, int safe_needed)
{
void *res;
if (safe_needed)
do {
res = (void *)get_zeroed_page(gfp_mask);
if (res && PageNosaveFree(virt_to_page(res))) {
/* This is for swsusp_free() */
SetPageNosave(virt_to_page(res));
((struct eaten_page *)res)->next = eaten_pages;
eaten_pages = res;
}
} while (res && PageNosaveFree(virt_to_page(res)));
else
res = (void *)get_zeroed_page(gfp_mask);
if (res) {
SetPageNosave(virt_to_page(res));
SetPageNosaveFree(virt_to_page(res));
}
return res;
}
unsigned long get_safe_page(gfp_t gfp_mask)
{
return (unsigned long)alloc_image_page(gfp_mask, 1);
}
/**
* alloc_pagedir - Allocate the page directory.
*
* First, determine exactly how many pages we need and
* allocate them.
*
* We arrange the pages in a chain: each page is an array of PBES_PER_PAGE
* struct pbe elements (pbes) and the last element in the page points
* to the next page.
*
* On each page we set up a list of struct_pbe elements.
*/
struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask, int safe_needed)
{
unsigned int num;
struct pbe *pblist, *pbe;
if (!nr_pages)
return NULL;
pblist = alloc_image_page(gfp_mask, safe_needed);
/* FIXME: rewrite this ugly loop */
for (pbe = pblist, num = PBES_PER_PAGE; pbe && num < nr_pages;
pbe = pbe->next, num += PBES_PER_PAGE) {
pbe += PB_PAGE_SKIP;
pbe->next = alloc_image_page(gfp_mask, safe_needed);
}
if (!pbe) { /* get_zeroed_page() failed */
free_pagedir(pblist, 1);
pblist = NULL;
} else
create_pbe_list(pblist, nr_pages);
return pblist;
}
/**
* Free pages we allocated for suspend. Suspend pages are alocated
* before atomic copy, so we need to free them after resume.
*/
void swsusp_free(void)
{
struct zone *zone;
unsigned long zone_pfn;
for_each_zone(zone) {
for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
if (pfn_valid(zone_pfn + zone->zone_start_pfn)) {
struct page *page;
page = pfn_to_page(zone_pfn + zone->zone_start_pfn);
if (PageNosave(page) && PageNosaveFree(page)) {
ClearPageNosave(page);
ClearPageNosaveFree(page);
free_page((long) page_address(page));
}
}
}
nr_copy_pages = 0;
nr_meta_pages = 0;
pagedir_nosave = NULL;
buffer = NULL;
}
/**
* enough_free_mem - Make sure we enough free memory to snapshot.
*
* Returns TRUE or FALSE after checking the number of available
* free pages.
*/
static int enough_free_mem(unsigned int nr_pages)
{
struct zone *zone;
unsigned int n = 0;
for_each_zone (zone)
if (!is_highmem(zone))
n += zone->free_pages;
pr_debug("swsusp: available memory: %u pages\n", n);
return n > (nr_pages + PAGES_FOR_IO +
(nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE);
}
static int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed)
{
struct pbe *p;
for_each_pbe (p, pblist) {
p->address = (unsigned long)alloc_image_page(gfp_mask, safe_needed);
if (!p->address)
return -ENOMEM;
}
return 0;
}
static struct pbe *swsusp_alloc(unsigned int nr_pages)
{
struct pbe *pblist;
if (!(pblist = alloc_pagedir(nr_pages, GFP_ATOMIC | __GFP_COLD, 0))) {
printk(KERN_ERR "suspend: Allocating pagedir failed.\n");
return NULL;
}
if (alloc_data_pages(pblist, GFP_ATOMIC | __GFP_COLD, 0)) {
printk(KERN_ERR "suspend: Allocating image pages failed.\n");
swsusp_free();
return NULL;
}
return pblist;
}
asmlinkage int swsusp_save(void)
{
unsigned int nr_pages;
pr_debug("swsusp: critical section: \n");
drain_local_pages();
nr_pages = count_data_pages();
printk("swsusp: Need to copy %u pages\n", nr_pages);
pr_debug("swsusp: pages needed: %u + %lu + %u, free: %u\n",
nr_pages,
(nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE,
PAGES_FOR_IO, nr_free_pages());
if (!enough_free_mem(nr_pages)) {
printk(KERN_ERR "swsusp: Not enough free memory\n");
return -ENOMEM;
}
pagedir_nosave = swsusp_alloc(nr_pages);
if (!pagedir_nosave)
return -ENOMEM;
/* During allocating of suspend pagedir, new cold pages may appear.
* Kill them.
*/
drain_local_pages();
copy_data_pages(pagedir_nosave);
/*
* End of critical section. From now on, we can write to memory,
* but we should not touch disk. This specially means we must _not_
* touch swap space! Except we must write out our image of course.
*/
nr_copy_pages = nr_pages;
nr_meta_pages = (nr_pages * sizeof(long) + PAGE_SIZE - 1) >> PAGE_SHIFT;
printk("swsusp: critical section/: done (%d pages copied)\n", nr_pages);
return 0;
}
static void init_header(struct swsusp_info *info)
{
memset(info, 0, sizeof(struct swsusp_info));
info->version_code = LINUX_VERSION_CODE;
info->num_physpages = num_physpages;
memcpy(&info->uts, &system_utsname, sizeof(system_utsname));
info->cpus = num_online_cpus();
info->image_pages = nr_copy_pages;
info->pages = nr_copy_pages + nr_meta_pages + 1;
info->size = info->pages;
info->size <<= PAGE_SHIFT;
}
/**
* pack_orig_addresses - the .orig_address fields of the PBEs from the
* list starting at @pbe are stored in the array @buf[] (1 page)
*/
static inline struct pbe *pack_orig_addresses(unsigned long *buf, struct pbe *pbe)
{
int j;
for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) {
buf[j] = pbe->orig_address;
pbe = pbe->next;
}
if (!pbe)
for (; j < PAGE_SIZE / sizeof(long); j++)
buf[j] = 0;
return pbe;
}
/**
* snapshot_read_next - used for reading the system memory snapshot.
*
* On the first call to it @handle should point to a zeroed
* snapshot_handle structure. The structure gets updated and a pointer
* to it should be passed to this function every next time.
*
* The @count parameter should contain the number of bytes the caller
* wants to read from the snapshot. It must not be zero.
*
* On success the function returns a positive number. Then, the caller
* is allowed to read up to the returned number of bytes from the memory
* location computed by the data_of() macro. The number returned
* may be smaller than @count, but this only happens if the read would
* cross a page boundary otherwise.
*
* The function returns 0 to indicate the end of data stream condition,
* and a negative number is returned on error. In such cases the
* structure pointed to by @handle is not updated and should not be used
* any more.
*/
int snapshot_read_next(struct snapshot_handle *handle, size_t count)
{
if (handle->page > nr_meta_pages + nr_copy_pages)
return 0;
if (!buffer) {
/* This makes the buffer be freed by swsusp_free() */
buffer = alloc_image_page(GFP_ATOMIC, 0);
if (!buffer)
return -ENOMEM;
}
if (!handle->offset) {
init_header((struct swsusp_info *)buffer);
handle->buffer = buffer;
handle->pbe = pagedir_nosave;
}
if (handle->prev < handle->page) {
if (handle->page <= nr_meta_pages) {
handle->pbe = pack_orig_addresses(buffer, handle->pbe);
if (!handle->pbe)
handle->pbe = pagedir_nosave;
} else {
handle->buffer = (void *)handle->pbe->address;
handle->pbe = handle->pbe->next;
}
handle->prev = handle->page;
}
handle->buf_offset = handle->page_offset;
if (handle->page_offset + count >= PAGE_SIZE) {
count = PAGE_SIZE - handle->page_offset;
handle->page_offset = 0;
handle->page++;
} else {
handle->page_offset += count;
}
handle->offset += count;
return count;
}
/**
* mark_unsafe_pages - mark the pages that cannot be used for storing
* the image during resume, because they conflict with the pages that
* had been used before suspend
*/
static int mark_unsafe_pages(struct pbe *pblist)
{
struct zone *zone;
unsigned long zone_pfn;
struct pbe *p;
if (!pblist) /* a sanity check */
return -EINVAL;
/* Clear page flags */
for_each_zone (zone) {
for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
if (pfn_valid(zone_pfn + zone->zone_start_pfn))
ClearPageNosaveFree(pfn_to_page(zone_pfn +
zone->zone_start_pfn));
}
/* Mark orig addresses */
for_each_pbe (p, pblist) {
if (virt_addr_valid(p->orig_address))
SetPageNosaveFree(virt_to_page(p->orig_address));
else
return -EFAULT;
}
return 0;
}
static void copy_page_backup_list(struct pbe *dst, struct pbe *src)
{
/* We assume both lists contain the same number of elements */
while (src) {
dst->orig_address = src->orig_address;
dst = dst->next;
src = src->next;
}
}
static int check_header(struct swsusp_info *info)
{
char *reason = NULL;
if (info->version_code != LINUX_VERSION_CODE)
reason = "kernel version";
if (info->num_physpages != num_physpages)
reason = "memory size";
if (strcmp(info->uts.sysname,system_utsname.sysname))
reason = "system type";
if (strcmp(info->uts.release,system_utsname.release))
reason = "kernel release";
if (strcmp(info->uts.version,system_utsname.version))
reason = "version";
if (strcmp(info->uts.machine,system_utsname.machine))
reason = "machine";
if (reason) {
printk(KERN_ERR "swsusp: Resume mismatch: %s\n", reason);
return -EPERM;
}
return 0;
}
/**
* load header - check the image header and copy data from it
*/
static int load_header(struct snapshot_handle *handle,
struct swsusp_info *info)
{
int error;
struct pbe *pblist;
error = check_header(info);
if (!error) {
pblist = alloc_pagedir(info->image_pages, GFP_ATOMIC, 0);
if (!pblist)
return -ENOMEM;
pagedir_nosave = pblist;
handle->pbe = pblist;
nr_copy_pages = info->image_pages;
nr_meta_pages = info->pages - info->image_pages - 1;
}
return error;
}
/**
* unpack_orig_addresses - copy the elements of @buf[] (1 page) to
* the PBEs in the list starting at @pbe
*/
static inline struct pbe *unpack_orig_addresses(unsigned long *buf,
struct pbe *pbe)
{
int j;
for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) {
pbe->orig_address = buf[j];
pbe = pbe->next;
}
return pbe;
}
/**
* create_image - use metadata contained in the PBE list
* pointed to by pagedir_nosave to mark the pages that will
* be overwritten in the process of restoring the system
* memory state from the image and allocate memory for
* the image avoiding these pages
*/
static int create_image(struct snapshot_handle *handle)
{
int error = 0;
struct pbe *p, *pblist;
p = pagedir_nosave;
error = mark_unsafe_pages(p);
if (!error) {
pblist = alloc_pagedir(nr_copy_pages, GFP_ATOMIC, 1);
if (pblist)
copy_page_backup_list(pblist, p);
free_pagedir(p, 0);
if (!pblist)
error = -ENOMEM;
}
if (!error)
error = alloc_data_pages(pblist, GFP_ATOMIC, 1);
if (!error) {
release_eaten_pages();
pagedir_nosave = pblist;
} else {
pagedir_nosave = NULL;
handle->pbe = NULL;
nr_copy_pages = 0;
nr_meta_pages = 0;
}
return error;
}
/**
* snapshot_write_next - used for writing the system memory snapshot.
*
* On the first call to it @handle should point to a zeroed
* snapshot_handle structure. The structure gets updated and a pointer
* to it should be passed to this function every next time.
*
* The @count parameter should contain the number of bytes the caller
* wants to write to the image. It must not be zero.
*
* On success the function returns a positive number. Then, the caller
* is allowed to write up to the returned number of bytes to the memory
* location computed by the data_of() macro. The number returned
* may be smaller than @count, but this only happens if the write would
* cross a page boundary otherwise.
*
* The function returns 0 to indicate the "end of file" condition,
* and a negative number is returned on error. In such cases the
* structure pointed to by @handle is not updated and should not be used
* any more.
*/
int snapshot_write_next(struct snapshot_handle *handle, size_t count)
{
int error = 0;
if (handle->prev && handle->page > nr_meta_pages + nr_copy_pages)
return 0;
if (!buffer) {
/* This makes the buffer be freed by swsusp_free() */
buffer = alloc_image_page(GFP_ATOMIC, 0);
if (!buffer)
return -ENOMEM;
}
if (!handle->offset)
handle->buffer = buffer;
if (handle->prev < handle->page) {
if (!handle->prev) {
error = load_header(handle, (struct swsusp_info *)buffer);
if (error)
return error;
} else if (handle->prev <= nr_meta_pages) {
handle->pbe = unpack_orig_addresses(buffer, handle->pbe);
if (!handle->pbe) {
error = create_image(handle);
if (error)
return error;
handle->pbe = pagedir_nosave;
handle->buffer = (void *)handle->pbe->address;
}
} else {
handle->pbe = handle->pbe->next;
handle->buffer = (void *)handle->pbe->address;
}
handle->prev = handle->page;
}
handle->buf_offset = handle->page_offset;
if (handle->page_offset + count >= PAGE_SIZE) {
count = PAGE_SIZE - handle->page_offset;
handle->page_offset = 0;
handle->page++;
} else {
handle->page_offset += count;
}
handle->offset += count;
return count;
}
int snapshot_image_loaded(struct snapshot_handle *handle)
{
return !(!handle->pbe || handle->pbe->next || !nr_copy_pages ||
handle->page <= nr_meta_pages + nr_copy_pages);
}