Merge branch 'rawhide/user/kyle/kernel-git' into rawhide/user/myoung/xendom0

Conflicts:
	kernel.spec
Author: Michael Young <m.a.young@durham.ac.uk>
Date:   2010-12-08 11:19:52 +00:00
Commit: ed706b65af

12 files changed, 639 insertions(+), 1843 deletions(-)

.gitignore

@@ -3,5 +3,4 @@ patch-*.bz2
 clog
 *.rpm
 kernel-2.6.*/
-/patch-2.6.37-rc4.bz2
-/patch-2.6.37-rc4-git1.bz2
+/patch-2.6.37-rc5.bz2

config-generic

@@ -3529,6 +3529,7 @@ CONFIG_CIFS_UPCALL=y
 CONFIG_CIFS_XATTR=y
 CONFIG_CIFS_POSIX=y
 CONFIG_CIFS_FSCACHE=y
+CONFIG_CIFS_ACL=y
 CONFIG_CIFS_WEAK_PW_HASH=y
 # CONFIG_CIFS_DEBUG2 is not set
 CONFIG_CIFS_DFS_UPCALL=y

config-x86_64-generic

@@ -438,3 +438,5 @@ CONFIG_PCH_PHUB=m
 CONFIG_VIDEO_VIA_CAMERA=m
+CONFIG_JUMP_LABEL=y
+CONFIG_HP_ILO=m

drm-fixes.patch (deleted; diff too large to display)

drm-intel-edp-fixes.patch (new file)

@@ -0,0 +1,44 @@
diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index f737960..b1f8164 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -509,6 +509,8 @@ i915_pci_remove(struct pci_dev *pdev)
{
struct drm_device *dev = pci_get_drvdata(pdev);
+ pci_disable_device(pdev); /* core did previous enable */
+
drm_put_dev(dev);
}
diff --git a/drivers/gpu/drm/i915/intel_dp.c b/drivers/gpu/drm/i915/intel_dp.c
index 300f64b..2e3db37 100644
--- a/drivers/gpu/drm/i915/intel_dp.c
+++ b/drivers/gpu/drm/i915/intel_dp.c
@@ -795,7 +795,8 @@ static bool ironlake_edp_panel_on (struct intel_dp *intel_dp)
{
struct drm_device *dev = intel_dp->base.base.dev;
struct drm_i915_private *dev_priv = dev->dev_private;
- u32 pp, idle_on_mask = PP_ON | PP_SEQUENCE_STATE_ON_IDLE;
+ u32 pp, idle_on = PP_ON | PP_SEQUENCE_STATE_ON_IDLE;
+ u32 idle_on_mask = PP_ON | PP_SEQUENCE_STATE_MASK;
if (I915_READ(PCH_PP_STATUS) & PP_ON)
return true;
@@ -816,7 +817,7 @@ static bool ironlake_edp_panel_on (struct intel_dp *intel_dp)
*/
msleep(300);
- if (wait_for((I915_READ(PCH_PP_STATUS) & idle_on_mask) == idle_on_mask,
+ if (wait_for((I915_READ(PCH_PP_STATUS) & idle_on_mask) == idle_on,
5000))
DRM_ERROR("panel on wait timed out: 0x%08x\n",
I915_READ(PCH_PP_STATUS));
@@ -922,6 +923,7 @@ static void intel_dp_prepare(struct drm_encoder *encoder)
if (is_edp(intel_dp)) {
ironlake_edp_backlight_off(dev);
+ ironlake_edp_panel_off(dev);
ironlake_edp_panel_on(intel_dp);
if (!is_pch_edp(intel_dp))
ironlake_edp_pll_on(encoder);
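
The wait_for() change above is the subtle part: the original code compared the masked status against the mask itself, which is only correct when every masked bit must be 1, while PP_SEQUENCE_STATE is a multi-bit field. After the fix, idle_on_mask selects the bits to examine and idle_on holds the value those bits must equal. A compilable sketch of the pattern, using made-up register bits rather than the real i915 layout:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Made-up register layout for illustration -- not the real i915 bits. */
#define PP_ON                     (1u << 31)
#define PP_SEQUENCE_STATE_MASK    0x0000000fu
#define PP_SEQUENCE_STATE_ON_IDLE 0x00000008u

static bool panel_on_and_idle(uint32_t status)
{
	uint32_t idle_on      = PP_ON | PP_SEQUENCE_STATE_ON_IDLE;
	uint32_t idle_on_mask = PP_ON | PP_SEQUENCE_STATE_MASK;

	/* Mask first, then compare with the expected value. Comparing
	 * against the mask itself would demand that *all* state bits be
	 * set, which is not what "state == ON_IDLE" means. */
	return (status & idle_on_mask) == idle_on;
}

int main(void)
{
	printf("%d\n", panel_on_and_idle(PP_ON | PP_SEQUENCE_STATE_ON_IDLE)); /* 1 */
	printf("%d\n", panel_on_and_idle(PP_ON | PP_SEQUENCE_STATE_MASK));    /* 0 */
	return 0;
}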

kernel.spec

@@ -83,9 +83,9 @@ Summary: The Linux kernel
 # The next upstream release sublevel (base_sublevel+1)
 %define upstream_sublevel %(echo $((%{base_sublevel} + 1)))
 # The rc snapshot level
-%define rcrev 4
+%define rcrev 5
 # The git snapshot level
-%define gitrev 1
+%define gitrev 0
 # Set rpm version accordingly
 %define rpmversion 2.6.%{upstream_sublevel}
 %endif
@@ -650,13 +650,14 @@ Patch1555: fix_xen_guest_on_old_EC2.patch
 # DRM
 # nouveau + drm fixes
-Patch1801: drm-fixes.patch
 Patch1810: drm-nouveau-updates.patch
 Patch1819: drm-intel-big-hammer.patch
 # intel drm is all merged upstream
 Patch1824: drm-intel-next.patch
 # make sure the lvds comes back on lid open
 Patch1825: drm-intel-make-lvds-work.patch
+Patch1826: drm-intel-edp-fixes.patch
 Patch1900: linux-2.6-intel-iommu-igfx.patch
 
 # linux1394 git patches
@@ -702,10 +703,10 @@ Patch12205: runtime_pm_fixups.patch
 Patch12303: dmar-disable-when-ricoh-multifunction.patch
-Patch12400: tty-dont-allow-reopen-when-ldisc-is-changing.patch
 Patch12401: debug-tty-print-dev-name.patch
-Patch12402: tty-ldisc-fix-open-flag-handling.patch
-Patch12403: tty-open-hangup-race-fixup.patch
+Patch12410: mm-page-allocator-adjust-the-per-cpu-counter-threshold-when-memory-is-low.patch
+Patch12411: mm-vmstat-use-a-single-setter-function-and-callback-for-adjusting-percpu-thresholds.patch
 
 # Xen patches
 # git://git.kernel.org/pub/scm/linux/kernel/git/jeremy/xen.git branches
@@ -1254,7 +1255,6 @@ ApplyPatch linux-2.6-e1000-ich9-montevina.patch
 ApplyPatch fix_xen_guest_on_old_EC2.patch
 
 # DRM core
-ApplyPatch drm-fixes.patch
 
 # Nouveau DRM
 ApplyOptionalPatch drm-nouveau-updates.patch
@@ -1264,6 +1264,7 @@ ApplyOptionalPatch drm-intel-next.patch
 ApplyPatch drm-intel-big-hammer.patch
 ApplyPatch drm-intel-make-lvds-work.patch
 ApplyPatch linux-2.6-intel-iommu-igfx.patch
+ApplyPatch drm-intel-edp-fixes.patch
 
 # linux1394 git patches
 #ApplyPatch linux-2.6-firewire-git-update.patch
@@ -1306,10 +1307,11 @@ ApplyPatch runtime_pm_fixups.patch
 ApplyPatch dmar-disable-when-ricoh-multifunction.patch
 
 # rhbz#630464
-ApplyPatch tty-dont-allow-reopen-when-ldisc-is-changing.patch
 ApplyPatch debug-tty-print-dev-name.patch
-ApplyPatch tty-ldisc-fix-open-flag-handling.patch
-ApplyPatch tty-open-hangup-race-fixup.patch
+
+# backport some fixes for kswapd from mmotm, rhbz#649694
+ApplyPatch mm-page-allocator-adjust-the-per-cpu-counter-threshold-when-memory-is-low.patch
+ApplyPatch mm-vmstat-use-a-single-setter-function-and-callback-for-adjusting-percpu-thresholds.patch
 
 # Xen patches
 ApplyPatch xen.next-2.6.37.patch
@@ -1590,6 +1592,9 @@ BuildKernel() {
 mkdir -p $RPM_BUILD_ROOT/usr/src/kernels
 mv $RPM_BUILD_ROOT/lib/modules/$KernelVer/build $RPM_BUILD_ROOT/$DevelDir
 ln -sf ../../..$DevelDir $RPM_BUILD_ROOT/lib/modules/$KernelVer/build
+
+# prune junk from kernel-devel
+find $RPM_BUILD_ROOT/usr/src/kernels -name ".*.cmd" -exec rm -f {} \;
 }
 
 ###
@@ -1624,7 +1629,7 @@ BuildKernel %make_target %kernel_image smp
 %endif
 
 %global perf_make \
-  make %{?_smp_mflags} -C tools/perf -s V=1 NO_DEMANGLE=1 prefix=%{_prefix}
+  make %{?_smp_mflags} -C tools/perf -s V=1 HAVE_CPLUS_DEMANGLE=1 prefix=%{_prefix}
 %if %{with_perf}
 %{perf_make} all
 %{perf_make} man || %{doc_build_fail}
@@ -1708,8 +1713,6 @@ find $RPM_BUILD_ROOT/usr/include \
 	\( -name .install -o -name .check -o \
 	-name ..install.cmd -o -name ..check.cmd \) | xargs rm -f
 
-find $RPM_BUILD_ROOT/usr/src/kernels -name ".*.cmd" -exec rm -f {} \;
-
 # glibc provides scsi headers for itself, for now
 rm -rf $RPM_BUILD_ROOT/usr/include/scsi
 rm -f $RPM_BUILD_ROOT/usr/include/asm*/atomic.h
@@ -1929,6 +1932,24 @@ fi
 #                                      ||----w |
 #                                      ||     ||
 %changelog
+* Tue Dec 07 2010 Kyle McMartin <kyle@redhat.com> 2.6.37-0.rc5.git0.1
+- Linux 2.6.37-rc5
+
+* Sat Dec 04 2010 Kyle McMartin <kyle@redhat.com>
+- Enable C++ symbol demangling with perf by linking against libiberty.a,
+  which is LGPL2.
+
+* Fri Dec 03 2010 Kyle McMartin <kyle@redhat.com>
+- Linux 2.6.37-rc4-git3
+- Enable HP ILO on x86_64 for (#571329)
+- Drop merged drm-fixes.patch, split out edp-fixes.
+- tty-dont-allow-reopen-when-ldisc-is-changing.patch: upstream.
+- tty-ldisc-fix-open-flag-handling.patch: upstream.
+- Enable CIFS_ACL.
+
+* Thu Dec 02 2010 Kyle McMartin <kyle@redhat.com>
+- Grab some of Mel's fixes from -mmotm to hopefully sort out #649694.
+
+* Thu Dec 02 2010 Michael Young <m.a.young@durham.ac.uk>
+- Update the xen/next-2.6.37 patch and rebuild for rc4-git1
+- xen-pcifront-fixes patch is now upstream

mm-page-allocator-adjust-the-per-cpu-counter-threshold-when-memory-is-low.patch (new file)

@@ -0,0 +1,389 @@
From df43fae25437d7bc7dfff72599c1e825038b67cf Mon Sep 17 00:00:00 2001
From: Mel Gorman <mel@csn.ul.ie>
Date: Wed, 24 Nov 2010 22:18:23 -0500
Subject: [PATCH 1/2] mm: page allocator: Adjust the per-cpu counter threshold when memory is low

Commit aa45484 ("calculate a better estimate of NR_FREE_PAGES when memory
is low") noted that watermarks were based on the vmstat NR_FREE_PAGES. To
avoid synchronization overhead, these counters are maintained on a per-cpu
basis and drained both periodically and when the per-cpu delta rises above
a threshold. On large CPU systems, the difference between the estimate and
the real value of NR_FREE_PAGES can be very high, and the system can get
into a state where pages are allocated far below the min watermark,
potentially causing livelock. That commit solved the problem by taking a
better reading of NR_FREE_PAGES when memory was low.

Unfortunately, as reported by Shaohua Li, this accurate reading can consume
a large amount of CPU time on systems with many sockets due to cache line
bouncing. This patch takes a different approach. On large machines where
counter drift might be unsafe, the per-cpu thresholds for the target pgdat
are reduced while kswapd is awake, limiting the level of drift to what
should be a safe level. This incurs a performance penalty under heavy
memory pressure, by a factor that depends on the workload and the machine,
but the machine should function correctly without accidentally exhausting
all memory on a node. There is an additional cost when kswapd wakes and
sleeps, but the event is not expected to be frequent; in Shaohua's test
case, at least, only one sleep-and-wake event was recorded.

To ensure that kswapd wakes up, a safe version of zone_watermark_ok() is
introduced that takes a more accurate reading of NR_FREE_PAGES when called
from wakeup_kswapd, when deciding whether it is really safe to go back to
sleep in sleeping_prematurely(), and when deciding if a zone is really
balanced or not in balance_pgdat(). We are still using an expensive
function, but we limit how often it is called.

When the test case is reproduced, the time spent in the watermark functions
drops. The following report shows the percentage of time cumulatively spent
in the functions zone_nr_free_pages(), zone_watermark_ok(),
__zone_watermark_ok(), zone_watermark_ok_safe(),
zone_page_state_snapshot() and zone_page_state():

vanilla            11.6615%
disable-threshold   0.2584%

Reported-by: Shaohua Li <shaohua.li@intel.com>
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Reviewed-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
[[http://userweb.kernel.org/~akpm/mmotm/broken-out/mm-page-allocator-adjust-the-per-cpu-counter-threshold-when-memory-is-low.patch]]
---
include/linux/mmzone.h | 10 ++-----
include/linux/vmstat.h | 5 +++
mm/mmzone.c | 21 ---------------
mm/page_alloc.c | 35 +++++++++++++++++++-----
mm/vmscan.c | 23 +++++++++-------
mm/vmstat.c | 68 +++++++++++++++++++++++++++++++++++++++++++++++-
6 files changed, 115 insertions(+), 47 deletions(-)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 3984c4e..8d789d7 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -448,12 +448,6 @@ static inline int zone_is_oom_locked(const struct zone *zone)
return test_bit(ZONE_OOM_LOCKED, &zone->flags);
}
-#ifdef CONFIG_SMP
-unsigned long zone_nr_free_pages(struct zone *zone);
-#else
-#define zone_nr_free_pages(zone) zone_page_state(zone, NR_FREE_PAGES)
-#endif /* CONFIG_SMP */
-
/*
* The "priority" of VM scanning is how much of the queues we will scan in one
* go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the
@@ -651,7 +645,9 @@ typedef struct pglist_data {
extern struct mutex zonelists_mutex;
void build_all_zonelists(void *data);
void wakeup_kswapd(struct zone *zone, int order);
-int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+ int classzone_idx, int alloc_flags);
+bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
int classzone_idx, int alloc_flags);
enum memmap_context {
MEMMAP_EARLY,
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index eaaea37..e4cc21c 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -254,6 +254,8 @@ extern void dec_zone_state(struct zone *, enum zone_stat_item);
extern void __dec_zone_state(struct zone *, enum zone_stat_item);
void refresh_cpu_vm_stats(int);
+void reduce_pgdat_percpu_threshold(pg_data_t *pgdat);
+void restore_pgdat_percpu_threshold(pg_data_t *pgdat);
#else /* CONFIG_SMP */
/*
@@ -298,6 +300,9 @@ static inline void __dec_zone_page_state(struct page *page,
#define dec_zone_page_state __dec_zone_page_state
#define mod_zone_page_state __mod_zone_page_state
+static inline void reduce_pgdat_percpu_threshold(pg_data_t *pgdat) { }
+static inline void restore_pgdat_percpu_threshold(pg_data_t *pgdat) { }
+
static inline void refresh_cpu_vm_stats(int cpu) { }
#endif
diff --git a/mm/mmzone.c b/mm/mmzone.c
index e35bfb8..f5b7d17 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -87,24 +87,3 @@ int memmap_valid_within(unsigned long pfn,
return 1;
}
#endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
-
-#ifdef CONFIG_SMP
-/* Called when a more accurate view of NR_FREE_PAGES is needed */
-unsigned long zone_nr_free_pages(struct zone *zone)
-{
- unsigned long nr_free_pages = zone_page_state(zone, NR_FREE_PAGES);
-
- /*
- * While kswapd is awake, it is considered the zone is under some
- * memory pressure. Under pressure, there is a risk that
- * per-cpu-counter-drift will allow the min watermark to be breached
- * potentially causing a live-lock. While kswapd is awake and
- * free pages are low, get a better estimate for free pages
- */
- if (nr_free_pages < zone->percpu_drift_mark &&
- !waitqueue_active(&zone->zone_pgdat->kswapd_wait))
- return zone_page_state_snapshot(zone, NR_FREE_PAGES);
-
- return nr_free_pages;
-}
-#endif /* CONFIG_SMP */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f12ad18..0286150 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1454,24 +1454,24 @@ static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
#endif /* CONFIG_FAIL_PAGE_ALLOC */
/*
- * Return 1 if free pages are above 'mark'. This takes into account the order
+ * Return true if free pages are above 'mark'. This takes into account the order
* of the allocation.
*/
-int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
- int classzone_idx, int alloc_flags)
+static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+ int classzone_idx, int alloc_flags, long free_pages)
{
/* free_pages my go negative - that's OK */
long min = mark;
- long free_pages = zone_nr_free_pages(z) - (1 << order) + 1;
int o;
+ free_pages -= (1 << order) - 1;
if (alloc_flags & ALLOC_HIGH)
min -= min / 2;
if (alloc_flags & ALLOC_HARDER)
min -= min / 4;
if (free_pages <= min + z->lowmem_reserve[classzone_idx])
- return 0;
+ return false;
for (o = 0; o < order; o++) {
/* At the next order, this order's pages become unavailable */
free_pages -= z->free_area[o].nr_free << o;
@@ -1480,9 +1480,28 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
min >>= 1;
if (free_pages <= min)
- return 0;
+ return false;
}
- return 1;
+ return true;
+}
+
+bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+ int classzone_idx, int alloc_flags)
+{
+ return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
+ zone_page_state(z, NR_FREE_PAGES));
+}
+
+bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
+ int classzone_idx, int alloc_flags)
+{
+ long free_pages = zone_page_state(z, NR_FREE_PAGES);
+
+ if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
+ free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
+
+ return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
+ free_pages);
}
#ifdef CONFIG_NUMA
@@ -2436,7 +2455,7 @@ void show_free_areas(void)
" all_unreclaimable? %s"
"\n",
zone->name,
- K(zone_nr_free_pages(zone)),
+ K(zone_page_state(zone, NR_FREE_PAGES)),
K(min_wmark_pages(zone)),
K(low_wmark_pages(zone)),
K(high_wmark_pages(zone)),
diff --git a/mm/vmscan.c b/mm/vmscan.c
index c5dfabf..3e71cb1 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2082,7 +2082,7 @@ static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining)
if (zone->all_unreclaimable)
continue;
- if (!zone_watermark_ok(zone, order, high_wmark_pages(zone),
+ if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone),
0, 0))
return 1;
}
@@ -2169,7 +2169,7 @@ loop_again:
shrink_active_list(SWAP_CLUSTER_MAX, zone,
&sc, priority, 0);
- if (!zone_watermark_ok(zone, order,
+ if (!zone_watermark_ok_safe(zone, order,
high_wmark_pages(zone), 0, 0)) {
end_zone = i;
break;
@@ -2215,7 +2215,7 @@ loop_again:
* We put equal pressure on every zone, unless one
* zone has way too many pages free already.
*/
- if (!zone_watermark_ok(zone, order,
+ if (!zone_watermark_ok_safe(zone, order,
8*high_wmark_pages(zone), end_zone, 0))
shrink_zone(priority, zone, &sc);
reclaim_state->reclaimed_slab = 0;
@@ -2236,7 +2236,7 @@ loop_again:
total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
sc.may_writepage = 1;
- if (!zone_watermark_ok(zone, order,
+ if (!zone_watermark_ok_safe(zone, order,
high_wmark_pages(zone), end_zone, 0)) {
all_zones_ok = 0;
/*
@@ -2244,7 +2244,7 @@ loop_again:
* means that we have a GFP_ATOMIC allocation
* failure risk. Hurry up!
*/
- if (!zone_watermark_ok(zone, order,
+ if (!zone_watermark_ok_safe(zone, order,
min_wmark_pages(zone), end_zone, 0))
has_under_min_watermark_zone = 1;
}
@@ -2378,7 +2378,9 @@ static int kswapd(void *p)
*/
if (!sleeping_prematurely(pgdat, order, remaining)) {
trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
+ restore_pgdat_percpu_threshold(pgdat);
schedule();
+ reduce_pgdat_percpu_threshold(pgdat);
} else {
if (remaining)
count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
@@ -2417,16 +2419,17 @@ void wakeup_kswapd(struct zone *zone, int order)
if (!populated_zone(zone))
return;
- pgdat = zone->zone_pgdat;
- if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0))
+ if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
return;
+ pgdat = zone->zone_pgdat;
if (pgdat->kswapd_max_order < order)
pgdat->kswapd_max_order = order;
- trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
- if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
- return;
if (!waitqueue_active(&pgdat->kswapd_wait))
return;
+ if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0))
+ return;
+
+ trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
wake_up_interruptible(&pgdat->kswapd_wait);
}
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 355a9e6..4d7faeb 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -81,6 +81,30 @@ EXPORT_SYMBOL(vm_stat);
#ifdef CONFIG_SMP
+static int calculate_pressure_threshold(struct zone *zone)
+{
+ int threshold;
+ int watermark_distance;
+
+ /*
+ * As vmstats are not up to date, there is drift between the estimated
+ * and real values. For high thresholds and a high number of CPUs, it
+ * is possible for the min watermark to be breached while the estimated
+ * value looks fine. The pressure threshold is a reduced value such
+ * that even the maximum amount of drift will not accidentally breach
+ * the min watermark
+ */
+ watermark_distance = low_wmark_pages(zone) - min_wmark_pages(zone);
+ threshold = max(1, (int)(watermark_distance / num_online_cpus()));
+
+ /*
+ * Maximum threshold is 125
+ */
+ threshold = min(125, threshold);
+
+ return threshold;
+}
+
static int calculate_threshold(struct zone *zone)
{
int threshold;
@@ -159,6 +183,48 @@ static void refresh_zone_stat_thresholds(void)
}
}
+void reduce_pgdat_percpu_threshold(pg_data_t *pgdat)
+{
+ struct zone *zone;
+ int cpu;
+ int threshold;
+ int i;
+
+ get_online_cpus();
+ for (i = 0; i < pgdat->nr_zones; i++) {
+ zone = &pgdat->node_zones[i];
+ if (!zone->percpu_drift_mark)
+ continue;
+
+ threshold = calculate_pressure_threshold(zone);
+ for_each_online_cpu(cpu)
+ per_cpu_ptr(zone->pageset, cpu)->stat_threshold
+ = threshold;
+ }
+ put_online_cpus();
+}
+
+void restore_pgdat_percpu_threshold(pg_data_t *pgdat)
+{
+ struct zone *zone;
+ int cpu;
+ int threshold;
+ int i;
+
+ get_online_cpus();
+ for (i = 0; i < pgdat->nr_zones; i++) {
+ zone = &pgdat->node_zones[i];
+ if (!zone->percpu_drift_mark)
+ continue;
+
+ threshold = calculate_threshold(zone);
+ for_each_online_cpu(cpu)
+ per_cpu_ptr(zone->pageset, cpu)->stat_threshold
+ = threshold;
+ }
+ put_online_cpus();
+}
+
/*
* For use when we know that interrupts are disabled.
*/
@@ -826,7 +892,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
"\n scanned %lu"
"\n spanned %lu"
"\n present %lu",
- zone_nr_free_pages(zone),
+ zone_page_state(zone, NR_FREE_PAGES),
min_wmark_pages(zone),
low_wmark_pages(zone),
high_wmark_pages(zone),
--
1.7.3.2
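
The core arithmetic of the patch is calculate_pressure_threshold(): divide the distance between the low and min watermarks across the online CPUs and clamp the result to [1, 125], so that even worst-case drift (roughly num_cpus * threshold) cannot silently breach the min watermark. A standalone sketch with invented watermark values:

#include <stdio.h>

/* Mirrors the patch's calculate_pressure_threshold(): worst-case drift
 * is about num_cpus * threshold, so the threshold is sized to fit the
 * gap between the low and min watermarks, clamped to [1, 125]. */
static int pressure_threshold(long low_wmark, long min_wmark, int num_cpus)
{
	long distance = low_wmark - min_wmark;
	int threshold = (int)(distance / num_cpus);

	if (threshold < 1)
		threshold = 1;
	if (threshold > 125)
		threshold = 125;
	return threshold;
}

int main(void)
{
	/* Invented zone: low watermark 5120 pages, min watermark 4096. */
	printf("4 cpus:  %d\n", pressure_threshold(5120, 4096, 4));  /* 125 (capped) */
	printf("64 cpus: %d\n", pressure_threshold(5120, 4096, 64)); /* 16 */
	return 0;
}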

mm-vmstat-use-a-single-setter-function-and-callback-for-adjusting-percpu-thresholds.patch (new file)

@@ -0,0 +1,167 @@
From 82e3d4969144377d13da97d511e849e8cf3e6dcc Mon Sep 17 00:00:00 2001
From: Mel Gorman <mel@csn.ul.ie>
Date: Wed, 24 Nov 2010 22:24:24 -0500
Subject: [PATCH 2/2] mm: vmstat: Use a single setter function and callback for adjusting percpu thresholds

reduce_pgdat_percpu_threshold() and restore_pgdat_percpu_threshold() exist
to adjust the per-cpu vmstat thresholds while kswapd is awake to avoid
errors due to counter drift. The functions duplicate some code, so this
patch replaces them with a single set_pgdat_percpu_threshold() that takes
a callback function to calculate the desired threshold as a parameter.

Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Reviewed-by: Christoph Lameter <cl@linux.com>
Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
[the various mmotm patches updating this were rolled up. --kyle]
[[http://userweb.kernel.org/~akpm/mmotm/broken-out/mm-vmstat-use-a-single-setter-function-and-callback-for-adjusting-percpu-thresholds-fix-set_pgdat_percpu_threshold-dont-use-for_each_online_cpu.patch]]
---
include/linux/vmstat.h | 10 ++++++----
mm/vmscan.c | 19 +++++++++++++++++--
mm/vmstat.c | 36 +++++++-----------------------------
3 files changed, 30 insertions(+), 35 deletions(-)
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index e4cc21c..833e676 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -254,8 +254,11 @@ extern void dec_zone_state(struct zone *, enum zone_stat_item);
extern void __dec_zone_state(struct zone *, enum zone_stat_item);
void refresh_cpu_vm_stats(int);
-void reduce_pgdat_percpu_threshold(pg_data_t *pgdat);
-void restore_pgdat_percpu_threshold(pg_data_t *pgdat);
+
+int calculate_pressure_threshold(struct zone *zone);
+int calculate_normal_threshold(struct zone *zone);
+void set_pgdat_percpu_threshold(pg_data_t *pgdat,
+ int (*calculate_pressure)(struct zone *));
#else /* CONFIG_SMP */
/*
@@ -300,8 +303,7 @@ static inline void __dec_zone_page_state(struct page *page,
#define dec_zone_page_state __dec_zone_page_state
#define mod_zone_page_state __mod_zone_page_state
-static inline void reduce_pgdat_percpu_threshold(pg_data_t *pgdat) { }
-static inline void restore_pgdat_percpu_threshold(pg_data_t *pgdat) { }
+#define set_pgdat_percpu_threshold(pgdat, callback) { }
static inline void refresh_cpu_vm_stats(int cpu) { }
#endif
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 3e71cb1..ba39948 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2378,9 +2378,24 @@ static int kswapd(void *p)
*/
if (!sleeping_prematurely(pgdat, order, remaining)) {
trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
- restore_pgdat_percpu_threshold(pgdat);
+
+ /*
+ * vmstat counters are not perfectly
+ * accurate and the estimated value
+ * for counters such as NR_FREE_PAGES
+ * can deviate from the true value by
+ * nr_online_cpus * threshold. To
+ * avoid the zone watermarks being
+ * breached while under pressure, we
+ * reduce the per-cpu vmstat threshold
+ * while kswapd is awake and restore
+ * them before going back to sleep.
+ */
+ set_pgdat_percpu_threshold(pgdat,
+ calculate_normal_threshold);
schedule();
- reduce_pgdat_percpu_threshold(pgdat);
+ set_pgdat_percpu_threshold(pgdat,
+ calculate_pressure_threshold);
} else {
if (remaining)
count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 4d7faeb..511c2c0 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -81,7 +81,7 @@ EXPORT_SYMBOL(vm_stat);
#ifdef CONFIG_SMP
-static int calculate_pressure_threshold(struct zone *zone)
+int calculate_pressure_threshold(struct zone *zone)
{
int threshold;
int watermark_distance;
@@ -105,7 +105,7 @@ static int calculate_pressure_threshold(struct zone *zone)
return threshold;
}
-static int calculate_threshold(struct zone *zone)
+int calculate_normal_threshold(struct zone *zone)
{
int threshold;
int mem; /* memory in 128 MB units */
@@ -164,7 +164,7 @@ static void refresh_zone_stat_thresholds(void)
for_each_populated_zone(zone) {
unsigned long max_drift, tolerate_drift;
- threshold = calculate_threshold(zone);
+ threshold = calculate_normal_threshold(zone);
for_each_online_cpu(cpu)
per_cpu_ptr(zone->pageset, cpu)->stat_threshold
@@ -183,46 +183,24 @@ static void refresh_zone_stat_thresholds(void)
}
}
-void reduce_pgdat_percpu_threshold(pg_data_t *pgdat)
+void set_pgdat_percpu_threshold(pg_data_t *pgdat,
+ int (*calculate_pressure)(struct zone *))
{
struct zone *zone;
int cpu;
int threshold;
int i;
- get_online_cpus();
- for (i = 0; i < pgdat->nr_zones; i++) {
- zone = &pgdat->node_zones[i];
- if (!zone->percpu_drift_mark)
- continue;
-
- threshold = calculate_pressure_threshold(zone);
- for_each_online_cpu(cpu)
- per_cpu_ptr(zone->pageset, cpu)->stat_threshold
- = threshold;
- }
- put_online_cpus();
-}
-
-void restore_pgdat_percpu_threshold(pg_data_t *pgdat)
-{
- struct zone *zone;
- int cpu;
- int threshold;
- int i;
-
- get_online_cpus();
for (i = 0; i < pgdat->nr_zones; i++) {
zone = &pgdat->node_zones[i];
if (!zone->percpu_drift_mark)
continue;
- threshold = calculate_threshold(zone);
- for_each_online_cpu(cpu)
+ threshold = (*calculate_pressure)(zone);
+ for_each_possible_cpu(cpu)
per_cpu_ptr(zone->pageset, cpu)->stat_threshold
= threshold;
}
- put_online_cpus();
}
/*
--
1.7.3.2
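
The refactor itself is a plain C function-pointer callback: both call sites now pass the threshold formula into one setter instead of duplicating the loop. A minimal sketch of the shape, with stub types standing in for the kernel's pg_data_t and struct zone:

#include <stdio.h>

struct zone { long low_wmark, min_wmark; int stat_threshold; };
struct pgdat { struct zone zones[2]; int nr_zones; };

static int calc_pressure(struct zone *z)
{
	return 1 + (int)((z->low_wmark - z->min_wmark) / 64);
}

static int calc_normal(struct zone *z)
{
	(void)z;    /* stub: the kernel derives this from zone size */
	return 125;
}

/* One setter, with the policy passed in -- the shape the patch uses to
 * replace reduce_/restore_pgdat_percpu_threshold(). */
static void set_pgdat_threshold(struct pgdat *pgdat, int (*calc)(struct zone *))
{
	for (int i = 0; i < pgdat->nr_zones; i++)
		pgdat->zones[i].stat_threshold = calc(&pgdat->zones[i]);
}

int main(void)
{
	struct pgdat pd = { .zones = { {5120, 4096, 0}, {256, 128, 0} }, .nr_zones = 2 };

	set_pgdat_threshold(&pd, calc_pressure); /* kswapd awake: tighten */
	printf("%d %d\n", pd.zones[0].stat_threshold, pd.zones[1].stat_threshold);
	set_pgdat_threshold(&pd, calc_normal);   /* kswapd asleep: relax */
	printf("%d %d\n", pd.zones[0].stat_threshold, pd.zones[1].stat_threshold);
	return 0;
}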

sources

@@ -1,3 +1,2 @@
 61f3739a73afb6914cb007f37fb09b62 linux-2.6.36.tar.bz2
-854ca0c7eca8930a71a6382a7dabbf65 patch-2.6.37-rc4.bz2
-c3146fe28bb10e77d8388bc26e16483c patch-2.6.37-rc4-git1.bz2
+a84cf559615b5168ec1d5591841601ed patch-2.6.37-rc5.bz2

tty-dont-allow-reopen-when-ldisc-is-changing.patch (deleted)

@@ -1,84 +0,0 @@
From jirislaby@gmail.com Thu Nov 25 12:16:42 2010
From: Jiri Slaby <jslaby@suse.cz>
Subject: [PATCH 1/1] TTY: don't allow reopen when ldisc is changing
Date: Thu, 25 Nov 2010 18:16:23 +0100

There are many WARNINGs like the following being reported nowadays:

WARNING: at drivers/tty/tty_io.c:1331 tty_open+0x2a2/0x49a()
Hardware name: Latitude E6500
Modules linked in:
Pid: 1207, comm: plymouthd Not tainted 2.6.37-rc3-mmotm1123 #3
Call Trace:
 [<ffffffff8103b189>] warn_slowpath_common+0x80/0x98
 [<ffffffff8103b1b6>] warn_slowpath_null+0x15/0x17
 [<ffffffff8128a3ab>] tty_open+0x2a2/0x49a
 [<ffffffff810fd53f>] chrdev_open+0x11d/0x146
 ...

This means tty_reopen is called without TTY_LDISC set. For further
consideration, note that tty_lock is held in tty_open. TTY_LDISC is
cleared in:

1) __tty_hangup, from tty_ldisc_hangup to tty_ldisc_enable. During this
   section tty_lock is held.

2) tty_release, via tty_ldisc_release until the end of the tty's
   existence. If tty->count <= 1, tty_lock is taken, the TTY_CLOSING bit
   is set, and then tty_ldisc_release is called. tty_reopen checks
   TTY_CLOSING before checking TTY_LDISC.

3) tty_set_ldisc, from tty_ldisc_halt to tty_ldisc_enable. We:
   * take tty_lock, set TTY_LDISC_CHANGING, put tty_lock
   * call tty_ldisc_halt (clearing TTY_LDISC); tty_lock is _not_ held
   * do some other work
   * take tty_lock, call tty_ldisc_enable (setting TTY_LDISC), put
     tty_lock

So the only window I see where the warning can trigger is 3). The fix is
to check TTY_LDISC_CHANGING along with TTY_CLOSING in tty_reopen.

Nicely reproducible with two processes:

while (1) {
	fd = open("/dev/ttyS1", O_RDWR);
	if (fd < 0) {
		warn("open");
		continue;
	}
	close(fd);
}

--------

while (1) {
	fd = open("/dev/ttyS1", O_RDWR);
	ld1 = 0; ld2 = 2;
	while (1) {
		ioctl(fd, TIOCSETD, &ld1);
		ioctl(fd, TIOCSETD, &ld2);
	}
	close(fd);
}

Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Reported-by: <Valdis.Kletnieks@vt.edu>
Cc: Kyle McMartin <kyle@mcmartin.ca>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
---
drivers/tty/tty_io.c | 3 ++-
1 files changed, 2 insertions(+), 1 deletions(-)
diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c
index c05c5af..878f6d6 100644
--- a/drivers/tty/tty_io.c
+++ b/drivers/tty/tty_io.c
@@ -1310,7 +1310,8 @@ static int tty_reopen(struct tty_struct *tty)
{
struct tty_driver *driver = tty->driver;
- if (test_bit(TTY_CLOSING, &tty->flags))
+ if (test_bit(TTY_CLOSING, &tty->flags) ||
+ test_bit(TTY_LDISC_CHANGING, &tty->flags))
return -EIO;
if (driver->type == TTY_DRIVER_TYPE_PTY &&
--
1.7.3.1
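
For convenience, here is a self-contained version of the reproducer from the description above. The device path and ldisc numbers are taken from the email; the includes and the fork() scaffolding are additions here so it compiles as one program:

#include <err.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <unistd.h>

/* Process A: open/close the tty in a tight loop. */
static void opener(void)
{
	for (;;) {
		int fd = open("/dev/ttyS1", O_RDWR);
		if (fd < 0) {
			warn("open");
			continue;
		}
		close(fd);
	}
}

/* Process B: flip the line discipline back and forth, which clears
 * TTY_LDISC for a window while tty_lock is dropped. */
static void flipper(void)
{
	int fd = open("/dev/ttyS1", O_RDWR);
	int ld1 = 0, ld2 = 2; /* N_TTY and N_MOUSE */

	if (fd < 0)
		err(1, "open");
	for (;;) {
		ioctl(fd, TIOCSETD, &ld1);
		ioctl(fd, TIOCSETD, &ld2);
	}
}

int main(void)
{
	if (fork() == 0)
		opener();
	flipper();
	return 0;
}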

tty-ldisc-fix-open-flag-handling.patch (deleted)

@@ -1,54 +0,0 @@
From linux-kernel-owner@vger.kernel.org Wed Nov 24 18:28:11 2010
From: Jiri Slaby <jslaby@suse.cz>
Subject: [PATCH 1/2] TTY: ldisc, fix open flag handling
Date: Thu, 25 Nov 2010 00:27:54 +0100

When a concrete ldisc open fails in tty_ldisc_open, we forget to clear
TTY_LDISC_OPEN. This causes a false warning on the next ldisc open:

WARNING: at drivers/char/tty_ldisc.c:445 tty_ldisc_open+0x26/0x38()
Hardware name: System Product Name
Modules linked in: ...
Pid: 5251, comm: a.out Tainted: G W 2.6.32-5-686 #1
Call Trace:
 [<c1030321>] ? warn_slowpath_common+0x5e/0x8a
 [<c1030357>] ? warn_slowpath_null+0xa/0xc
 [<c119311c>] ? tty_ldisc_open+0x26/0x38
 [<c11936c5>] ? tty_set_ldisc+0x218/0x304
 ...

So clear the bit when failing. This was introduced in c65c9bc3efa ("tty:
rewrite the ldisc locking") back in 2.6.31-rc1.
Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Cc: Alan Cox <alan@linux.intel.com>
Reported-by: Sergey Lapin <slapin@ossfans.org>
Tested-by: Sergey Lapin <slapin@ossfans.org>
---
drivers/tty/tty_ldisc.c | 2 ++
1 files changed, 2 insertions(+), 0 deletions(-)
diff --git a/drivers/tty/tty_ldisc.c b/drivers/tty/tty_ldisc.c
index d8e96b0..4214d58 100644
--- a/drivers/tty/tty_ldisc.c
+++ b/drivers/tty/tty_ldisc.c
@@ -454,6 +454,8 @@ static int tty_ldisc_open(struct tty_struct *tty, struct tty_ldisc *ld)
/* BTM here locks versus a hangup event */
WARN_ON(!tty_locked());
ret = ld->ops->open(tty);
+ if (ret)
+ clear_bit(TTY_LDISC_OPEN, &tty->flags);
return ret;
}
return 0;
--
1.7.3.1
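The bug follows a classic pattern: a state bit is set before an operation that can fail, and the error path forgets to roll it back, so the next attempt trips a false warning. A generic illustration (the types and names here are invented, not the kernel's):

#include <stdio.h>

enum { FLAG_OPEN = 1u };

struct dev {
	unsigned flags;
	int (*do_open)(struct dev *);
};

static int dev_open(struct dev *d)
{
	int ret;

	d->flags |= FLAG_OPEN;          /* mark open before trying */
	ret = d->do_open(d);
	if (ret)
		d->flags &= ~FLAG_OPEN; /* the missing step: roll back on failure */
	return ret;
}

static int failing_open(struct dev *d)
{
	(void)d;
	return -1;
}

int main(void)
{
	struct dev d = { 0, failing_open };

	dev_open(&d);
	/* Without the rollback, a second open would find FLAG_OPEN still
	 * set -- the source of the false WARNING quoted above. */
	printf("flags after failed open: %u\n", d.flags); /* 0 */
	return 0;
}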

tty-open-hangup-race-fixup.patch (deleted)

@@ -1,76 +0,0 @@
From 9e88e8b9915b5e067507a087437d80e6a133d612 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Sat, 27 Nov 2010 16:06:46 +0100
Subject: [PATCH 1/1] TTY: open/hangup race fixup
Signed-off-by: Jiri Slaby <jslaby@suse.cz>
---
drivers/tty/tty_io.c | 10 +++++++++-
include/linux/tty.h | 1 +
2 files changed, 10 insertions(+), 1 deletions(-)
diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c
index 878f6d6..35480dd 100644
--- a/drivers/tty/tty_io.c
+++ b/drivers/tty/tty_io.c
@@ -559,6 +559,9 @@ void __tty_hangup(struct tty_struct *tty)
tty_lock();
+ /* some functions below drop BTM, so we need this bit */
+ set_bit(TTY_HUPPING, &tty->flags);
+
/* inuse_filps is protected by the single tty lock,
this really needs to change if we want to flush the
workqueue with the lock held */
@@ -578,6 +581,10 @@ void __tty_hangup(struct tty_struct *tty)
}
spin_unlock(&tty_files_lock);
+ /*
+ * it drops BTM and thus races with reopen
+ * we protect the race by TTY_HUPPING
+ */
tty_ldisc_hangup(tty);
read_lock(&tasklist_lock);
@@ -615,7 +622,6 @@ void __tty_hangup(struct tty_struct *tty)
tty->session = NULL;
tty->pgrp = NULL;
tty->ctrl_status = 0;
- set_bit(TTY_HUPPED, &tty->flags);
spin_unlock_irqrestore(&tty->ctrl_lock, flags);
/* Account for the p->signal references we killed */
@@ -641,6 +647,7 @@ void __tty_hangup(struct tty_struct *tty)
* can't yet guarantee all that.
*/
set_bit(TTY_HUPPED, &tty->flags);
+ clear_bit(TTY_HUPPING, &tty->flags);
tty_ldisc_enable(tty);
tty_unlock();
@@ -1311,6 +1318,7 @@ static int tty_reopen(struct tty_struct *tty)
struct tty_driver *driver = tty->driver;
if (test_bit(TTY_CLOSING, &tty->flags) ||
+ test_bit(TTY_HUPPING, &tty->flags) ||
test_bit(TTY_LDISC_CHANGING, &tty->flags))
return -EIO;
diff --git a/include/linux/tty.h b/include/linux/tty.h
index 032d79f..54e4eaa 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -366,6 +366,7 @@ struct tty_file_private {
#define TTY_HUPPED 18 /* Post driver->hangup() */
#define TTY_FLUSHING 19 /* Flushing to ldisc in progress */
#define TTY_FLUSHPENDING 20 /* Queued buffer flush pending */
+#define TTY_HUPPING 21 /* ->hangup() in progress */
#define TTY_WRITE_FLUSH(tty) tty_write_flush((tty))
--
1.7.3.1
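
Together with the two patches above, tty_reopen now bails out while any of three transitional states is in flight: close, hangup, or ldisc change. A compact illustration of the combined guard, with invented bit values (the kernel's are in include/linux/tty.h):

#include <stdbool.h>
#include <stdio.h>

/* Illustrative bit positions only. */
enum {
	TTY_CLOSING        = 1u << 0,
	TTY_LDISC_CHANGING = 1u << 1,
	TTY_HUPPING        = 1u << 2,
};

/* Mirrors the reopen guard after both fixes: any in-flight teardown,
 * hangup or ldisc change makes a concurrent reopen fail with -EIO. */
static bool reopen_allowed(unsigned flags)
{
	return !(flags & (TTY_CLOSING | TTY_HUPPING | TTY_LDISC_CHANGING));
}

int main(void)
{
	printf("%d\n", reopen_allowed(0));           /* 1: idle tty */
	printf("%d\n", reopen_allowed(TTY_HUPPING)); /* 0: hangup in progress */
	return 0;
}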