Merge branch 'rawhide/user/kyle/kernel-git' into rawhide/user/myoung/xendom0

Conflicts:
	kernel.spec
Author: Michael Young <m.a.young@durham.ac.uk>
Date:   2010-12-08 11:19:52 +00:00
Commit: ed706b65af

12 files changed, 639 insertions(+), 1843 deletions(-)

.gitignore

@@ -3,5 +3,4 @@ patch-*.bz2
 clog
 *.rpm
 kernel-2.6.*/
-/patch-2.6.37-rc4.bz2
-/patch-2.6.37-rc4-git1.bz2
+/patch-2.6.37-rc5.bz2

config-generic

@@ -3529,6 +3529,7 @@ CONFIG_CIFS_UPCALL=y
 CONFIG_CIFS_XATTR=y
 CONFIG_CIFS_POSIX=y
 CONFIG_CIFS_FSCACHE=y
+CONFIG_CIFS_ACL=y
 CONFIG_CIFS_WEAK_PW_HASH=y
 # CONFIG_CIFS_DEBUG2 is not set
 CONFIG_CIFS_DFS_UPCALL=y

config-x86_64-generic

@@ -438,3 +438,5 @@ CONFIG_PCH_PHUB=m
 CONFIG_VIDEO_VIA_CAMERA=m
+CONFIG_JUMP_LABEL=y
+CONFIG_HP_ILO=m

drm-fixes.patch (deleted; diff too large to display)

drm-intel-edp-fixes.patch (new file)

@@ -0,0 +1,44 @@
diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index f737960..b1f8164 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -509,6 +509,8 @@ i915_pci_remove(struct pci_dev *pdev)
{
struct drm_device *dev = pci_get_drvdata(pdev);
+ pci_disable_device(pdev); /* core did previous enable */
+
drm_put_dev(dev);
}
diff --git a/drivers/gpu/drm/i915/intel_dp.c b/drivers/gpu/drm/i915/intel_dp.c
index 300f64b..2e3db37 100644
--- a/drivers/gpu/drm/i915/intel_dp.c
+++ b/drivers/gpu/drm/i915/intel_dp.c
@@ -795,7 +795,8 @@ static bool ironlake_edp_panel_on (struct intel_dp *intel_dp)
{
struct drm_device *dev = intel_dp->base.base.dev;
struct drm_i915_private *dev_priv = dev->dev_private;
- u32 pp, idle_on_mask = PP_ON | PP_SEQUENCE_STATE_ON_IDLE;
+ u32 pp, idle_on = PP_ON | PP_SEQUENCE_STATE_ON_IDLE;
+ u32 idle_on_mask = PP_ON | PP_SEQUENCE_STATE_MASK;
if (I915_READ(PCH_PP_STATUS) & PP_ON)
return true;
@@ -816,7 +817,7 @@ static bool ironlake_edp_panel_on (struct intel_dp *intel_dp)
*/
msleep(300);
- if (wait_for((I915_READ(PCH_PP_STATUS) & idle_on_mask) == idle_on_mask,
+ if (wait_for((I915_READ(PCH_PP_STATUS) & idle_on_mask) == idle_on,
5000))
DRM_ERROR("panel on wait timed out: 0x%08x\n",
I915_READ(PCH_PP_STATUS));
@@ -922,6 +923,7 @@ static void intel_dp_prepare(struct drm_encoder *encoder)
if (is_edp(intel_dp)) {
ironlake_edp_backlight_off(dev);
+ ironlake_edp_panel_off(dev);
ironlake_edp_panel_on(intel_dp);
if (!is_pch_edp(intel_dp))
ironlake_edp_pll_on(encoder);
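
The wait_for() change above is the subtle part: the original code compared the masked status against the mask itself, which is only correct when every masked bit must be 1, while PP_SEQUENCE_STATE is a multi-bit field. After the fix, idle_on_mask selects the bits to examine and idle_on holds the value those bits must equal. A compilable sketch of the pattern, using made-up register bits rather than the real i915 layout:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Made-up register layout for illustration -- not the real i915 bits. */
#define PP_ON                     (1u << 31)
#define PP_SEQUENCE_STATE_MASK    0x0000000fu
#define PP_SEQUENCE_STATE_ON_IDLE 0x00000008u

static bool panel_on_and_idle(uint32_t status)
{
	uint32_t idle_on      = PP_ON | PP_SEQUENCE_STATE_ON_IDLE;
	uint32_t idle_on_mask = PP_ON | PP_SEQUENCE_STATE_MASK;

	/* Mask first, then compare with the expected value. Comparing
	 * against the mask itself would demand that *all* state bits be
	 * set, which is not what "state == ON_IDLE" means. */
	return (status & idle_on_mask) == idle_on;
}

int main(void)
{
	printf("%d\n", panel_on_and_idle(PP_ON | PP_SEQUENCE_STATE_ON_IDLE)); /* 1 */
	printf("%d\n", panel_on_and_idle(PP_ON | PP_SEQUENCE_STATE_MASK));    /* 0 */
	return 0;
}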

kernel.spec

@@ -83,9 +83,9 @@ Summary: The Linux kernel
 # The next upstream release sublevel (base_sublevel+1)
 %define upstream_sublevel %(echo $((%{base_sublevel} + 1)))
 # The rc snapshot level
-%define rcrev 4
+%define rcrev 5
 # The git snapshot level
-%define gitrev 1
+%define gitrev 0
 # Set rpm version accordingly
 %define rpmversion 2.6.%{upstream_sublevel}
 %endif
@@ -650,13 +650,14 @@ Patch1555: fix_xen_guest_on_old_EC2.patch
 # DRM
 # nouveau + drm fixes
-Patch1801: drm-fixes.patch
 Patch1810: drm-nouveau-updates.patch
 Patch1819: drm-intel-big-hammer.patch
 # intel drm is all merged upstream
 Patch1824: drm-intel-next.patch
 # make sure the lvds comes back on lid open
 Patch1825: drm-intel-make-lvds-work.patch
+Patch1826: drm-intel-edp-fixes.patch
 Patch1900: linux-2.6-intel-iommu-igfx.patch
 
 # linux1394 git patches
@@ -702,10 +703,10 @@ Patch12205: runtime_pm_fixups.patch
 Patch12303: dmar-disable-when-ricoh-multifunction.patch
-Patch12400: tty-dont-allow-reopen-when-ldisc-is-changing.patch
 Patch12401: debug-tty-print-dev-name.patch
-Patch12402: tty-ldisc-fix-open-flag-handling.patch
-Patch12403: tty-open-hangup-race-fixup.patch
+Patch12410: mm-page-allocator-adjust-the-per-cpu-counter-threshold-when-memory-is-low.patch
+Patch12411: mm-vmstat-use-a-single-setter-function-and-callback-for-adjusting-percpu-thresholds.patch
 
 # Xen patches
 # git://git.kernel.org/pub/scm/linux/kernel/git/jeremy/xen.git branches
@@ -1254,7 +1255,6 @@ ApplyPatch linux-2.6-e1000-ich9-montevina.patch
 ApplyPatch fix_xen_guest_on_old_EC2.patch
 
 # DRM core
-ApplyPatch drm-fixes.patch
 
 # Nouveau DRM
 ApplyOptionalPatch drm-nouveau-updates.patch
@@ -1264,6 +1264,7 @@ ApplyOptionalPatch drm-intel-next.patch
 ApplyPatch drm-intel-big-hammer.patch
 ApplyPatch drm-intel-make-lvds-work.patch
 ApplyPatch linux-2.6-intel-iommu-igfx.patch
+ApplyPatch drm-intel-edp-fixes.patch
 
 # linux1394 git patches
 #ApplyPatch linux-2.6-firewire-git-update.patch
@@ -1306,10 +1307,11 @@ ApplyPatch runtime_pm_fixups.patch
 ApplyPatch dmar-disable-when-ricoh-multifunction.patch
 
 # rhbz#630464
-ApplyPatch tty-dont-allow-reopen-when-ldisc-is-changing.patch
 ApplyPatch debug-tty-print-dev-name.patch
-ApplyPatch tty-ldisc-fix-open-flag-handling.patch
-ApplyPatch tty-open-hangup-race-fixup.patch
+
+# backport some fixes for kswapd from mmotm, rhbz#649694
+ApplyPatch mm-page-allocator-adjust-the-per-cpu-counter-threshold-when-memory-is-low.patch
+ApplyPatch mm-vmstat-use-a-single-setter-function-and-callback-for-adjusting-percpu-thresholds.patch
 
 # Xen patches
 ApplyPatch xen.next-2.6.37.patch
@@ -1590,6 +1592,9 @@ BuildKernel() {
 mkdir -p $RPM_BUILD_ROOT/usr/src/kernels
 mv $RPM_BUILD_ROOT/lib/modules/$KernelVer/build $RPM_BUILD_ROOT/$DevelDir
 ln -sf ../../..$DevelDir $RPM_BUILD_ROOT/lib/modules/$KernelVer/build
+
+# prune junk from kernel-devel
+find $RPM_BUILD_ROOT/usr/src/kernels -name ".*.cmd" -exec rm -f {} \;
 }
 
 ###
@@ -1624,7 +1629,7 @@ BuildKernel %make_target %kernel_image smp
 %endif
 
 %global perf_make \
-  make %{?_smp_mflags} -C tools/perf -s V=1 NO_DEMANGLE=1 prefix=%{_prefix}
+  make %{?_smp_mflags} -C tools/perf -s V=1 HAVE_CPLUS_DEMANGLE=1 prefix=%{_prefix}
 %if %{with_perf}
 %{perf_make} all
 %{perf_make} man || %{doc_build_fail}
@@ -1708,8 +1713,6 @@ find $RPM_BUILD_ROOT/usr/include \
 	\( -name .install -o -name .check -o \
 	-name ..install.cmd -o -name ..check.cmd \) | xargs rm -f
 
-find $RPM_BUILD_ROOT/usr/src/kernels -name ".*.cmd" -exec rm -f {} \;
-
 # glibc provides scsi headers for itself, for now
 rm -rf $RPM_BUILD_ROOT/usr/include/scsi
 rm -f $RPM_BUILD_ROOT/usr/include/asm*/atomic.h
@@ -1929,6 +1932,24 @@ fi
 #                                      ||----w |
 #                                      ||     ||
 %changelog
+* Tue Dec 07 2010 Kyle McMartin <kyle@redhat.com> 2.6.37-0.rc5.git0.1
+- Linux 2.6.37-rc5
+
+* Sat Dec 04 2010 Kyle McMartin <kyle@redhat.com>
+- Enable C++ symbol demangling with perf by linking against libiberty.a,
+  which is LGPL2.
+
+* Fri Dec 03 2010 Kyle McMartin <kyle@redhat.com>
+- Linux 2.6.37-rc4-git3
+- Enable HP ILO on x86_64 for (#571329)
+- Drop merged drm-fixes.patch, split out edp-fixes.
+- tty-dont-allow-reopen-when-ldisc-is-changing.patch: upstream.
+- tty-ldisc-fix-open-flag-handling.patch: upstream.
+- Enable CIFS_ACL.
+
+* Thu Dec 02 2010 Kyle McMartin <kyle@redhat.com>
+- Grab some of Mel's fixes from -mmotm to hopefully sort out #649694.
+
+* Thu Dec 02 2010 Michael Young <m.a.young@durham.ac.uk>
+- Update the xen/next-2.6.37 patch and rebuild for rc4-git1
+- xen-pcifront-fixes patch is now upstream

mm-page-allocator-adjust-the-per-cpu-counter-threshold-when-memory-is-low.patch (new file)

@@ -0,0 +1,389 @@
From df43fae25437d7bc7dfff72599c1e825038b67cf Mon Sep 17 00:00:00 2001
From: Mel Gorman <mel@csn.ul.ie>
Date: Wed, 24 Nov 2010 22:18:23 -0500
Subject: [PATCH 1/2] mm: page allocator: Adjust the per-cpu counter threshold when memory is low

Commit aa45484 ("calculate a better estimate of NR_FREE_PAGES when memory
is low") noted that watermarks were based on the vmstat NR_FREE_PAGES. To
avoid synchronization overhead, these counters are maintained on a per-cpu
basis and drained both periodically and when the per-cpu delta rises above
a threshold. On large CPU systems, the difference between the estimate and
the real value of NR_FREE_PAGES can be very high, and the system can get
into a state where pages are allocated far below the min watermark,
potentially causing livelock. That commit solved the problem by taking a
better reading of NR_FREE_PAGES when memory was low.

Unfortunately, as reported by Shaohua Li, this accurate reading can consume
a large amount of CPU time on systems with many sockets due to cache line
bouncing. This patch takes a different approach. On large machines where
counter drift might be unsafe, the per-cpu thresholds for the target pgdat
are reduced while kswapd is awake, limiting the level of drift to what
should be a safe level. This incurs a performance penalty under heavy
memory pressure, by a factor that depends on the workload and the machine,
but the machine should function correctly without accidentally exhausting
all memory on a node. There is an additional cost when kswapd wakes and
sleeps, but the event is not expected to be frequent; in Shaohua's test
case, at least, only one sleep-and-wake event was recorded.

To ensure that kswapd wakes up, a safe version of zone_watermark_ok() is
introduced that takes a more accurate reading of NR_FREE_PAGES when called
from wakeup_kswapd, when deciding whether it is really safe to go back to
sleep in sleeping_prematurely(), and when deciding if a zone is really
balanced or not in balance_pgdat(). We are still using an expensive
function, but we limit how often it is called.

When the test case is reproduced, the time spent in the watermark functions
drops. The following report shows the percentage of time cumulatively spent
in the functions zone_nr_free_pages(), zone_watermark_ok(),
__zone_watermark_ok(), zone_watermark_ok_safe(),
zone_page_state_snapshot() and zone_page_state():

vanilla            11.6615%
disable-threshold   0.2584%

Reported-by: Shaohua Li <shaohua.li@intel.com>
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Reviewed-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
[[http://userweb.kernel.org/~akpm/mmotm/broken-out/mm-page-allocator-adjust-the-per-cpu-counter-threshold-when-memory-is-low.patch]]
---
include/linux/mmzone.h | 10 ++-----
include/linux/vmstat.h | 5 +++
mm/mmzone.c | 21 ---------------
mm/page_alloc.c | 35 +++++++++++++++++++-----
mm/vmscan.c | 23 +++++++++-------
mm/vmstat.c | 68 +++++++++++++++++++++++++++++++++++++++++++++++-
6 files changed, 115 insertions(+), 47 deletions(-)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 3984c4e..8d789d7 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -448,12 +448,6 @@ static inline int zone_is_oom_locked(const struct zone *zone)
return test_bit(ZONE_OOM_LOCKED, &zone->flags);
}
-#ifdef CONFIG_SMP
-unsigned long zone_nr_free_pages(struct zone *zone);
-#else
-#define zone_nr_free_pages(zone) zone_page_state(zone, NR_FREE_PAGES)
-#endif /* CONFIG_SMP */
-
/*
* The "priority" of VM scanning is how much of the queues we will scan in one
* go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the
@@ -651,7 +645,9 @@ typedef struct pglist_data {
extern struct mutex zonelists_mutex;
void build_all_zonelists(void *data);
void wakeup_kswapd(struct zone *zone, int order);
-int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+ int classzone_idx, int alloc_flags);
+bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
int classzone_idx, int alloc_flags);
enum memmap_context {
MEMMAP_EARLY,
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index eaaea37..e4cc21c 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -254,6 +254,8 @@ extern void dec_zone_state(struct zone *, enum zone_stat_item);
extern void __dec_zone_state(struct zone *, enum zone_stat_item);
void refresh_cpu_vm_stats(int);
+void reduce_pgdat_percpu_threshold(pg_data_t *pgdat);
+void restore_pgdat_percpu_threshold(pg_data_t *pgdat);
#else /* CONFIG_SMP */
/*
@@ -298,6 +300,9 @@ static inline void __dec_zone_page_state(struct page *page,
#define dec_zone_page_state __dec_zone_page_state
#define mod_zone_page_state __mod_zone_page_state
+static inline void reduce_pgdat_percpu_threshold(pg_data_t *pgdat) { }
+static inline void restore_pgdat_percpu_threshold(pg_data_t *pgdat) { }
+
static inline void refresh_cpu_vm_stats(int cpu) { }
#endif
diff --git a/mm/mmzone.c b/mm/mmzone.c
index e35bfb8..f5b7d17 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -87,24 +87,3 @@ int memmap_valid_within(unsigned long pfn,
return 1;
}
#endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
-
-#ifdef CONFIG_SMP
-/* Called when a more accurate view of NR_FREE_PAGES is needed */
-unsigned long zone_nr_free_pages(struct zone *zone)
-{
- unsigned long nr_free_pages = zone_page_state(zone, NR_FREE_PAGES);
-
- /*
- * While kswapd is awake, it is considered the zone is under some
- * memory pressure. Under pressure, there is a risk that
- * per-cpu-counter-drift will allow the min watermark to be breached
- * potentially causing a live-lock. While kswapd is awake and
- * free pages are low, get a better estimate for free pages
- */
- if (nr_free_pages < zone->percpu_drift_mark &&
- !waitqueue_active(&zone->zone_pgdat->kswapd_wait))
- return zone_page_state_snapshot(zone, NR_FREE_PAGES);
-
- return nr_free_pages;
-}
-#endif /* CONFIG_SMP */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f12ad18..0286150 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1454,24 +1454,24 @@ static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
#endif /* CONFIG_FAIL_PAGE_ALLOC */
/*
- * Return 1 if free pages are above 'mark'. This takes into account the order
+ * Return true if free pages are above 'mark'. This takes into account the order
* of the allocation.
*/
-int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
- int classzone_idx, int alloc_flags)
+static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+ int classzone_idx, int alloc_flags, long free_pages)
{
/* free_pages my go negative - that's OK */
long min = mark;
- long free_pages = zone_nr_free_pages(z) - (1 << order) + 1;
int o;
+ free_pages -= (1 << order) - 1;
if (alloc_flags & ALLOC_HIGH)
min -= min / 2;
if (alloc_flags & ALLOC_HARDER)
min -= min / 4;
if (free_pages <= min + z->lowmem_reserve[classzone_idx])
- return 0;
+ return false;
for (o = 0; o < order; o++) {
/* At the next order, this order's pages become unavailable */
free_pages -= z->free_area[o].nr_free << o;
@@ -1480,9 +1480,28 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
min >>= 1;
if (free_pages <= min)
- return 0;
+ return false;
}
- return 1;
+ return true;
+}
+
+bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+ int classzone_idx, int alloc_flags)
+{
+ return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
+ zone_page_state(z, NR_FREE_PAGES));
+}
+
+bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
+ int classzone_idx, int alloc_flags)
+{
+ long free_pages = zone_page_state(z, NR_FREE_PAGES);
+
+ if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
+ free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
+
+ return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
+ free_pages);
}
#ifdef CONFIG_NUMA
@@ -2436,7 +2455,7 @@ void show_free_areas(void)
" all_unreclaimable? %s"
"\n",
zone->name,
- K(zone_nr_free_pages(zone)),
+ K(zone_page_state(zone, NR_FREE_PAGES)),
K(min_wmark_pages(zone)),
K(low_wmark_pages(zone)),
K(high_wmark_pages(zone)),
diff --git a/mm/vmscan.c b/mm/vmscan.c
index c5dfabf..3e71cb1 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2082,7 +2082,7 @@ static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining)
if (zone->all_unreclaimable)
continue;
- if (!zone_watermark_ok(zone, order, high_wmark_pages(zone),
+ if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone),
0, 0))
return 1;
}
@@ -2169,7 +2169,7 @@ loop_again:
shrink_active_list(SWAP_CLUSTER_MAX, zone,
&sc, priority, 0);
- if (!zone_watermark_ok(zone, order,
+ if (!zone_watermark_ok_safe(zone, order,
high_wmark_pages(zone), 0, 0)) {
end_zone = i;
break;
@@ -2215,7 +2215,7 @@ loop_again:
* We put equal pressure on every zone, unless one
* zone has way too many pages free already.
*/
- if (!zone_watermark_ok(zone, order,
+ if (!zone_watermark_ok_safe(zone, order,
8*high_wmark_pages(zone), end_zone, 0))
shrink_zone(priority, zone, &sc);
reclaim_state->reclaimed_slab = 0;
@@ -2236,7 +2236,7 @@ loop_again:
total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
sc.may_writepage = 1;
- if (!zone_watermark_ok(zone, order,
+ if (!zone_watermark_ok_safe(zone, order,
high_wmark_pages(zone), end_zone, 0)) {
all_zones_ok = 0;
/*
@@ -2244,7 +2244,7 @@ loop_again:
* means that we have a GFP_ATOMIC allocation
* failure risk. Hurry up!
*/
- if (!zone_watermark_ok(zone, order,
+ if (!zone_watermark_ok_safe(zone, order,
min_wmark_pages(zone), end_zone, 0))
has_under_min_watermark_zone = 1;
}
@@ -2378,7 +2378,9 @@ static int kswapd(void *p)
*/
if (!sleeping_prematurely(pgdat, order, remaining)) {
trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
+ restore_pgdat_percpu_threshold(pgdat);
schedule();
+ reduce_pgdat_percpu_threshold(pgdat);
} else {
if (remaining)
count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
@@ -2417,16 +2419,17 @@ void wakeup_kswapd(struct zone *zone, int order)
if (!populated_zone(zone))
return;
- pgdat = zone->zone_pgdat;
- if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0))
+ if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
return;
+ pgdat = zone->zone_pgdat;
if (pgdat->kswapd_max_order < order)
pgdat->kswapd_max_order = order;
- trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
- if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
- return;
if (!waitqueue_active(&pgdat->kswapd_wait))
return;
+ if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0))
+ return;
+
+ trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
wake_up_interruptible(&pgdat->kswapd_wait);
}
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 355a9e6..4d7faeb 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -81,6 +81,30 @@ EXPORT_SYMBOL(vm_stat);
#ifdef CONFIG_SMP
+static int calculate_pressure_threshold(struct zone *zone)
+{
+ int threshold;
+ int watermark_distance;
+
+ /*
+ * As vmstats are not up to date, there is drift between the estimated
+ * and real values. For high thresholds and a high number of CPUs, it
+ * is possible for the min watermark to be breached while the estimated
+ * value looks fine. The pressure threshold is a reduced value such
+ * that even the maximum amount of drift will not accidentally breach
+ * the min watermark
+ */
+ watermark_distance = low_wmark_pages(zone) - min_wmark_pages(zone);
+ threshold = max(1, (int)(watermark_distance / num_online_cpus()));
+
+ /*
+ * Maximum threshold is 125
+ */
+ threshold = min(125, threshold);
+
+ return threshold;
+}
+
static int calculate_threshold(struct zone *zone)
{
int threshold;
@@ -159,6 +183,48 @@ static void refresh_zone_stat_thresholds(void)
}
}
+void reduce_pgdat_percpu_threshold(pg_data_t *pgdat)
+{
+ struct zone *zone;
+ int cpu;
+ int threshold;
+ int i;
+
+ get_online_cpus();
+ for (i = 0; i < pgdat->nr_zones; i++) {
+ zone = &pgdat->node_zones[i];
+ if (!zone->percpu_drift_mark)
+ continue;
+
+ threshold = calculate_pressure_threshold(zone);
+ for_each_online_cpu(cpu)
+ per_cpu_ptr(zone->pageset, cpu)->stat_threshold
+ = threshold;
+ }
+ put_online_cpus();
+}
+
+void restore_pgdat_percpu_threshold(pg_data_t *pgdat)
+{
+ struct zone *zone;
+ int cpu;
+ int threshold;
+ int i;
+
+ get_online_cpus();
+ for (i = 0; i < pgdat->nr_zones; i++) {
+ zone = &pgdat->node_zones[i];
+ if (!zone->percpu_drift_mark)
+ continue;
+
+ threshold = calculate_threshold(zone);
+ for_each_online_cpu(cpu)
+ per_cpu_ptr(zone->pageset, cpu)->stat_threshold
+ = threshold;
+ }
+ put_online_cpus();
+}
+
/*
* For use when we know that interrupts are disabled.
*/
@@ -826,7 +892,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
"\n scanned %lu"
"\n spanned %lu"
"\n present %lu",
- zone_nr_free_pages(zone),
+ zone_page_state(zone, NR_FREE_PAGES),
min_wmark_pages(zone),
low_wmark_pages(zone),
high_wmark_pages(zone),
--
1.7.3.2
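
The core arithmetic of the patch is calculate_pressure_threshold(): divide the distance between the low and min watermarks across the online CPUs and clamp the result to [1, 125], so that even worst-case drift (roughly num_cpus * threshold) cannot silently breach the min watermark. A standalone sketch with invented watermark values:

#include <stdio.h>

/* Mirrors the patch's calculate_pressure_threshold(): worst-case drift
 * is about num_cpus * threshold, so the threshold is sized to fit the
 * gap between the low and min watermarks, clamped to [1, 125]. */
static int pressure_threshold(long low_wmark, long min_wmark, int num_cpus)
{
	long distance = low_wmark - min_wmark;
	int threshold = (int)(distance / num_cpus);

	if (threshold < 1)
		threshold = 1;
	if (threshold > 125)
		threshold = 125;
	return threshold;
}

int main(void)
{
	/* Invented zone: low watermark 5120 pages, min watermark 4096. */
	printf("4 cpus:  %d\n", pressure_threshold(5120, 4096, 4));  /* 125 (capped) */
	printf("64 cpus: %d\n", pressure_threshold(5120, 4096, 64)); /* 16 */
	return 0;
}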

mm-vmstat-use-a-single-setter-function-and-callback-for-adjusting-percpu-thresholds.patch (new file)

@@ -0,0 +1,167 @@
From 82e3d4969144377d13da97d511e849e8cf3e6dcc Mon Sep 17 00:00:00 2001
From: Mel Gorman <mel@csn.ul.ie>
Date: Wed, 24 Nov 2010 22:24:24 -0500
Subject: [PATCH 2/2] mm: vmstat: Use a single setter function and callback for adjusting percpu thresholds

reduce_pgdat_percpu_threshold() and restore_pgdat_percpu_threshold() exist
to adjust the per-cpu vmstat thresholds while kswapd is awake to avoid
errors due to counter drift. The functions duplicate some code, so this
patch replaces them with a single set_pgdat_percpu_threshold() that takes
a callback function to calculate the desired threshold as a parameter.

Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Reviewed-by: Christoph Lameter <cl@linux.com>
Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
[the various mmotm patches updating this were rolled up. --kyle]
[[http://userweb.kernel.org/~akpm/mmotm/broken-out/mm-vmstat-use-a-single-setter-function-and-callback-for-adjusting-percpu-thresholds-fix-set_pgdat_percpu_threshold-dont-use-for_each_online_cpu.patch]]
---
include/linux/vmstat.h | 10 ++++++----
mm/vmscan.c | 19 +++++++++++++++++--
mm/vmstat.c | 36 +++++++-----------------------------
3 files changed, 30 insertions(+), 35 deletions(-)
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index e4cc21c..833e676 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -254,8 +254,11 @@ extern void dec_zone_state(struct zone *, enum zone_stat_item);
extern void __dec_zone_state(struct zone *, enum zone_stat_item);
void refresh_cpu_vm_stats(int);
-void reduce_pgdat_percpu_threshold(pg_data_t *pgdat);
-void restore_pgdat_percpu_threshold(pg_data_t *pgdat);
+
+int calculate_pressure_threshold(struct zone *zone);
+int calculate_normal_threshold(struct zone *zone);
+void set_pgdat_percpu_threshold(pg_data_t *pgdat,
+ int (*calculate_pressure)(struct zone *));
#else /* CONFIG_SMP */
/*
@@ -300,8 +303,7 @@ static inline void __dec_zone_page_state(struct page *page,
#define dec_zone_page_state __dec_zone_page_state
#define mod_zone_page_state __mod_zone_page_state
-static inline void reduce_pgdat_percpu_threshold(pg_data_t *pgdat) { }
-static inline void restore_pgdat_percpu_threshold(pg_data_t *pgdat) { }
+#define set_pgdat_percpu_threshold(pgdat, callback) { }
static inline void refresh_cpu_vm_stats(int cpu) { }
#endif
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 3e71cb1..ba39948 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2378,9 +2378,24 @@ static int kswapd(void *p)
*/
if (!sleeping_prematurely(pgdat, order, remaining)) {
trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
- restore_pgdat_percpu_threshold(pgdat);
+
+ /*
+ * vmstat counters are not perfectly
+ * accurate and the estimated value
+ * for counters such as NR_FREE_PAGES
+ * can deviate from the true value by
+ * nr_online_cpus * threshold. To
+ * avoid the zone watermarks being
+ * breached while under pressure, we
+ * reduce the per-cpu vmstat threshold
+ * while kswapd is awake and restore
+ * them before going back to sleep.
+ */
+ set_pgdat_percpu_threshold(pgdat,
+ calculate_normal_threshold);
schedule();
- reduce_pgdat_percpu_threshold(pgdat);
+ set_pgdat_percpu_threshold(pgdat,
+ calculate_pressure_threshold);
} else {
if (remaining)
count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 4d7faeb..511c2c0 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -81,7 +81,7 @@ EXPORT_SYMBOL(vm_stat);
#ifdef CONFIG_SMP
-static int calculate_pressure_threshold(struct zone *zone)
+int calculate_pressure_threshold(struct zone *zone)
{
int threshold;
int watermark_distance;
@@ -105,7 +105,7 @@ static int calculate_pressure_threshold(struct zone *zone)
return threshold;
}
-static int calculate_threshold(struct zone *zone)
+int calculate_normal_threshold(struct zone *zone)
{
int threshold;
int mem; /* memory in 128 MB units */
@@ -164,7 +164,7 @@ static void refresh_zone_stat_thresholds(void)
for_each_populated_zone(zone) {
unsigned long max_drift, tolerate_drift;
- threshold = calculate_threshold(zone);
+ threshold = calculate_normal_threshold(zone);
for_each_online_cpu(cpu)
per_cpu_ptr(zone->pageset, cpu)->stat_threshold
@@ -183,46 +183,24 @@ static void refresh_zone_stat_thresholds(void)
}
}
-void reduce_pgdat_percpu_threshold(pg_data_t *pgdat)
+void set_pgdat_percpu_threshold(pg_data_t *pgdat,
+ int (*calculate_pressure)(struct zone *))
{
struct zone *zone;
int cpu;
int threshold;
int i;
- get_online_cpus();
- for (i = 0; i < pgdat->nr_zones; i++) {
- zone = &pgdat->node_zones[i];
- if (!zone->percpu_drift_mark)
- continue;
-
- threshold = calculate_pressure_threshold(zone);
- for_each_online_cpu(cpu)
- per_cpu_ptr(zone->pageset, cpu)->stat_threshold
- = threshold;
- }
- put_online_cpus();
-}
-
-void restore_pgdat_percpu_threshold(pg_data_t *pgdat)
-{
- struct zone *zone;
- int cpu;
- int threshold;
- int i;
-
- get_online_cpus();
for (i = 0; i < pgdat->nr_zones; i++) {
zone = &pgdat->node_zones[i];
if (!zone->percpu_drift_mark)
continue;
- threshold = calculate_threshold(zone);
- for_each_online_cpu(cpu)
+ threshold = (*calculate_pressure)(zone);
+ for_each_possible_cpu(cpu)
per_cpu_ptr(zone->pageset, cpu)->stat_threshold
= threshold;
}
- put_online_cpus();
}
/*
--
1.7.3.2
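
The refactor itself is a plain C function-pointer callback: both call sites now pass the threshold formula into one setter instead of duplicating the loop. A minimal sketch of the shape, with stub types standing in for the kernel's pg_data_t and struct zone:

#include <stdio.h>

struct zone { long low_wmark, min_wmark; int stat_threshold; };
struct pgdat { struct zone zones[2]; int nr_zones; };

static int calc_pressure(struct zone *z)
{
	return 1 + (int)((z->low_wmark - z->min_wmark) / 64);
}

static int calc_normal(struct zone *z)
{
	(void)z;    /* stub: the kernel derives this from zone size */
	return 125;
}

/* One setter, with the policy passed in -- the shape the patch uses to
 * replace reduce_/restore_pgdat_percpu_threshold(). */
static void set_pgdat_threshold(struct pgdat *pgdat, int (*calc)(struct zone *))
{
	for (int i = 0; i < pgdat->nr_zones; i++)
		pgdat->zones[i].stat_threshold = calc(&pgdat->zones[i]);
}

int main(void)
{
	struct pgdat pd = { .zones = { {5120, 4096, 0}, {256, 128, 0} }, .nr_zones = 2 };

	set_pgdat_threshold(&pd, calc_pressure); /* kswapd awake: tighten */
	printf("%d %d\n", pd.zones[0].stat_threshold, pd.zones[1].stat_threshold);
	set_pgdat_threshold(&pd, calc_normal);   /* kswapd asleep: relax */
	printf("%d %d\n", pd.zones[0].stat_threshold, pd.zones[1].stat_threshold);
	return 0;
}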

sources

@@ -1,3 +1,2 @@
 61f3739a73afb6914cb007f37fb09b62 linux-2.6.36.tar.bz2
-854ca0c7eca8930a71a6382a7dabbf65 patch-2.6.37-rc4.bz2
-c3146fe28bb10e77d8388bc26e16483c patch-2.6.37-rc4-git1.bz2
+a84cf559615b5168ec1d5591841601ed patch-2.6.37-rc5.bz2

tty-dont-allow-reopen-when-ldisc-is-changing.patch (deleted)

@@ -1,84 +0,0 @@
From jirislaby@gmail.com Thu Nov 25 12:16:42 2010
From: Jiri Slaby <jslaby@suse.cz>
Subject: [PATCH 1/1] TTY: don't allow reopen when ldisc is changing
Date: Thu, 25 Nov 2010 18:16:23 +0100

There are many WARNINGs like the following being reported nowadays:

WARNING: at drivers/tty/tty_io.c:1331 tty_open+0x2a2/0x49a()
Hardware name: Latitude E6500
Modules linked in:
Pid: 1207, comm: plymouthd Not tainted 2.6.37-rc3-mmotm1123 #3
Call Trace:
 [<ffffffff8103b189>] warn_slowpath_common+0x80/0x98
 [<ffffffff8103b1b6>] warn_slowpath_null+0x15/0x17
 [<ffffffff8128a3ab>] tty_open+0x2a2/0x49a
 [<ffffffff810fd53f>] chrdev_open+0x11d/0x146
 ...

This means tty_reopen is called without TTY_LDISC set. For further
consideration, note that tty_lock is held in tty_open. TTY_LDISC is
cleared in:

1) __tty_hangup, from tty_ldisc_hangup to tty_ldisc_enable. During this
   section tty_lock is held.

2) tty_release, via tty_ldisc_release until the end of the tty's
   existence. If tty->count <= 1, tty_lock is taken, the TTY_CLOSING bit
   is set, and then tty_ldisc_release is called. tty_reopen checks
   TTY_CLOSING before checking TTY_LDISC.

3) tty_set_ldisc, from tty_ldisc_halt to tty_ldisc_enable. We:
   * take tty_lock, set TTY_LDISC_CHANGING, put tty_lock
   * call tty_ldisc_halt (clearing TTY_LDISC); tty_lock is _not_ held
   * do some other work
   * take tty_lock, call tty_ldisc_enable (setting TTY_LDISC), put
     tty_lock

So the only window I see where the warning can trigger is 3). The fix is
to check TTY_LDISC_CHANGING along with TTY_CLOSING in tty_reopen.

Nicely reproducible with two processes:

while (1) {
	fd = open("/dev/ttyS1", O_RDWR);
	if (fd < 0) {
		warn("open");
		continue;
	}
	close(fd);
}

--------

while (1) {
	fd = open("/dev/ttyS1", O_RDWR);
	ld1 = 0; ld2 = 2;
	while (1) {
		ioctl(fd, TIOCSETD, &ld1);
		ioctl(fd, TIOCSETD, &ld2);
	}
	close(fd);
}

Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Reported-by: <Valdis.Kletnieks@vt.edu>
Cc: Kyle McMartin <kyle@mcmartin.ca>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
---
drivers/tty/tty_io.c | 3 ++-
1 files changed, 2 insertions(+), 1 deletions(-)
diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c
index c05c5af..878f6d6 100644
--- a/drivers/tty/tty_io.c
+++ b/drivers/tty/tty_io.c
@@ -1310,7 +1310,8 @@ static int tty_reopen(struct tty_struct *tty)
{
struct tty_driver *driver = tty->driver;
- if (test_bit(TTY_CLOSING, &tty->flags))
+ if (test_bit(TTY_CLOSING, &tty->flags) ||
+ test_bit(TTY_LDISC_CHANGING, &tty->flags))
return -EIO;
if (driver->type == TTY_DRIVER_TYPE_PTY &&
--
1.7.3.1
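
For convenience, here is a self-contained version of the reproducer from the description above. The device path and ldisc numbers are taken from the email; the includes and the fork() scaffolding are additions here so it compiles as one program:

#include <err.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <unistd.h>

/* Process A: open/close the tty in a tight loop. */
static void opener(void)
{
	for (;;) {
		int fd = open("/dev/ttyS1", O_RDWR);
		if (fd < 0) {
			warn("open");
			continue;
		}
		close(fd);
	}
}

/* Process B: flip the line discipline back and forth, which clears
 * TTY_LDISC for a window while tty_lock is dropped. */
static void flipper(void)
{
	int fd = open("/dev/ttyS1", O_RDWR);
	int ld1 = 0, ld2 = 2; /* N_TTY and N_MOUSE */

	if (fd < 0)
		err(1, "open");
	for (;;) {
		ioctl(fd, TIOCSETD, &ld1);
		ioctl(fd, TIOCSETD, &ld2);
	}
}

int main(void)
{
	if (fork() == 0)
		opener();
	flipper();
	return 0;
}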

tty-ldisc-fix-open-flag-handling.patch (deleted)

@@ -1,54 +0,0 @@
From linux-kernel-owner@vger.kernel.org Wed Nov 24 18:28:11 2010
From: Jiri Slaby <jslaby@suse.cz>
Subject: [PATCH 1/2] TTY: ldisc, fix open flag handling
Date: Thu, 25 Nov 2010 00:27:54 +0100

When a concrete ldisc open fails in tty_ldisc_open, we forget to clear
TTY_LDISC_OPEN. This causes a false warning on the next ldisc open:

WARNING: at drivers/char/tty_ldisc.c:445 tty_ldisc_open+0x26/0x38()
Hardware name: System Product Name
Modules linked in: ...
Pid: 5251, comm: a.out Tainted: G W 2.6.32-5-686 #1
Call Trace:
 [<c1030321>] ? warn_slowpath_common+0x5e/0x8a
 [<c1030357>] ? warn_slowpath_null+0xa/0xc
 [<c119311c>] ? tty_ldisc_open+0x26/0x38
 [<c11936c5>] ? tty_set_ldisc+0x218/0x304
 ...

So clear the bit when failing. This was introduced in c65c9bc3efa ("tty:
rewrite the ldisc locking") back in 2.6.31-rc1.
Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Cc: Alan Cox <alan@linux.intel.com>
Reported-by: Sergey Lapin <slapin@ossfans.org>
Tested-by: Sergey Lapin <slapin@ossfans.org>
---
drivers/tty/tty_ldisc.c | 2 ++
1 files changed, 2 insertions(+), 0 deletions(-)
diff --git a/drivers/tty/tty_ldisc.c b/drivers/tty/tty_ldisc.c
index d8e96b0..4214d58 100644
--- a/drivers/tty/tty_ldisc.c
+++ b/drivers/tty/tty_ldisc.c
@@ -454,6 +454,8 @@ static int tty_ldisc_open(struct tty_struct *tty, struct tty_ldisc *ld)
/* BTM here locks versus a hangup event */
WARN_ON(!tty_locked());
ret = ld->ops->open(tty);
+ if (ret)
+ clear_bit(TTY_LDISC_OPEN, &tty->flags);
return ret;
}
return 0;
--
1.7.3.1
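The bug follows a classic pattern: a state bit is set before an operation that can fail, and the error path forgets to roll it back, so the next attempt trips a false warning. A generic illustration (the types and names here are invented, not the kernel's):

#include <stdio.h>

enum { FLAG_OPEN = 1u };

struct dev {
	unsigned flags;
	int (*do_open)(struct dev *);
};

static int dev_open(struct dev *d)
{
	int ret;

	d->flags |= FLAG_OPEN;          /* mark open before trying */
	ret = d->do_open(d);
	if (ret)
		d->flags &= ~FLAG_OPEN; /* the missing step: roll back on failure */
	return ret;
}

static int failing_open(struct dev *d)
{
	(void)d;
	return -1;
}

int main(void)
{
	struct dev d = { 0, failing_open };

	dev_open(&d);
	/* Without the rollback, a second open would find FLAG_OPEN still
	 * set -- the source of the false WARNING quoted above. */
	printf("flags after failed open: %u\n", d.flags); /* 0 */
	return 0;
}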

tty-open-hangup-race-fixup.patch (deleted)

@@ -1,76 +0,0 @@
From 9e88e8b9915b5e067507a087437d80e6a133d612 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Sat, 27 Nov 2010 16:06:46 +0100
Subject: [PATCH 1/1] TTY: open/hangup race fixup
Signed-off-by: Jiri Slaby <jslaby@suse.cz>
---
drivers/tty/tty_io.c | 10 +++++++++-
include/linux/tty.h | 1 +
2 files changed, 10 insertions(+), 1 deletions(-)
diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c
index 878f6d6..35480dd 100644
--- a/drivers/tty/tty_io.c
+++ b/drivers/tty/tty_io.c
@@ -559,6 +559,9 @@ void __tty_hangup(struct tty_struct *tty)
tty_lock();
+ /* some functions below drop BTM, so we need this bit */
+ set_bit(TTY_HUPPING, &tty->flags);
+
/* inuse_filps is protected by the single tty lock,
this really needs to change if we want to flush the
workqueue with the lock held */
@@ -578,6 +581,10 @@ void __tty_hangup(struct tty_struct *tty)
}
spin_unlock(&tty_files_lock);
+ /*
+ * it drops BTM and thus races with reopen
+ * we protect the race by TTY_HUPPING
+ */
tty_ldisc_hangup(tty);
read_lock(&tasklist_lock);
@@ -615,7 +622,6 @@ void __tty_hangup(struct tty_struct *tty)
tty->session = NULL;
tty->pgrp = NULL;
tty->ctrl_status = 0;
- set_bit(TTY_HUPPED, &tty->flags);
spin_unlock_irqrestore(&tty->ctrl_lock, flags);
/* Account for the p->signal references we killed */
@@ -641,6 +647,7 @@ void __tty_hangup(struct tty_struct *tty)
* can't yet guarantee all that.
*/
set_bit(TTY_HUPPED, &tty->flags);
+ clear_bit(TTY_HUPPING, &tty->flags);
tty_ldisc_enable(tty);
tty_unlock();
@@ -1311,6 +1318,7 @@ static int tty_reopen(struct tty_struct *tty)
struct tty_driver *driver = tty->driver;
if (test_bit(TTY_CLOSING, &tty->flags) ||
+ test_bit(TTY_HUPPING, &tty->flags) ||
test_bit(TTY_LDISC_CHANGING, &tty->flags))
return -EIO;
diff --git a/include/linux/tty.h b/include/linux/tty.h
index 032d79f..54e4eaa 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -366,6 +366,7 @@ struct tty_file_private {
#define TTY_HUPPED 18 /* Post driver->hangup() */
#define TTY_FLUSHING 19 /* Flushing to ldisc in progress */
#define TTY_FLUSHPENDING 20 /* Queued buffer flush pending */
+#define TTY_HUPPING 21 /* ->hangup() in progress */
#define TTY_WRITE_FLUSH(tty) tty_write_flush((tty))
--
1.7.3.1
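
Together with the two patches above, tty_reopen now bails out while any of three transitional states is in flight: close, hangup, or ldisc change. A compact illustration of the combined guard, with invented bit values (the kernel's are in include/linux/tty.h):

#include <stdbool.h>
#include <stdio.h>

/* Illustrative bit positions only. */
enum {
	TTY_CLOSING        = 1u << 0,
	TTY_LDISC_CHANGING = 1u << 1,
	TTY_HUPPING        = 1u << 2,
};

/* Mirrors the reopen guard after both fixes: any in-flight teardown,
 * hangup or ldisc change makes a concurrent reopen fail with -EIO. */
static bool reopen_allowed(unsigned flags)
{
	return !(flags & (TTY_CLOSING | TTY_HUPPING | TTY_LDISC_CHANGING));
}

int main(void)
{
	printf("%d\n", reopen_allowed(0));           /* 1: idle tty */
	printf("%d\n", reopen_allowed(TTY_HUPPING)); /* 0: hangup in progress */
	return 0;
}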