parent 7a68cb0b55
commit b218718b2b

kernel.spec | 20
@@ -742,6 +742,13 @@ Patch12540: irda-correctly-clean-up-self-ias_obj-on-irda_bind-failure.patch
Patch12550: keys-fix-bug-in-keyctl_session_to_parent-if-parent-has-no-session-keyring.patch
Patch12551: keys-fix-rcu-no-lock-warning-in-keyctl_session_to_parent.patch

Patch12560: sched-00-fix-user-time-incorrectly-accounted-as-system-time-on-32-bit.patch
Patch12565: sched-05-avoid-side-effect-of-tickless-idle-on-update_cpu_load.patch
Patch12570: sched-10-change-nohz-idle-load-balancing-logic-to-push-model.patch
Patch12575: sched-15-update-rq-clock-for-nohz-balanced-cpus.patch
Patch12580: sched-20-fix-rq-clock-synchronization-when-migrating-tasks.patch
Patch12585: sched-25-move-sched_avg_update-to-update_cpu_load.patch

%endif

BuildRoot: %{_tmppath}/kernel-%{KVERREL}-root

@@ -1371,6 +1378,14 @@ ApplyPatch irda-correctly-clean-up-self-ias_obj-on-irda_bind-failure.patch
ApplyPatch keys-fix-bug-in-keyctl_session_to_parent-if-parent-has-no-session-keyring.patch
ApplyPatch keys-fix-rcu-no-lock-warning-in-keyctl_session_to_parent.patch

# Scheduler fixes (#635813 and #633037)
ApplyPatch sched-00-fix-user-time-incorrectly-accounted-as-system-time-on-32-bit.patch
ApplyPatch sched-05-avoid-side-effect-of-tickless-idle-on-update_cpu_load.patch
ApplyPatch sched-10-change-nohz-idle-load-balancing-logic-to-push-model.patch
ApplyPatch sched-15-update-rq-clock-for-nohz-balanced-cpus.patch
ApplyPatch sched-20-fix-rq-clock-synchronization-when-migrating-tasks.patch
ApplyPatch sched-25-move-sched_avg_update-to-update_cpu_load.patch

# END OF PATCH APPLICATIONS

%endif

@@ -1957,7 +1972,10 @@ fi
# and build.

%changelog
* Mon Sep 20 2010 Chuck Ebbert <cebbert@redhat.com> 2.6.35.5-29
* Tue Sep 21 2010 Chuck Ebbert <cebbert@redhat.com> 2.6.35.5-29
- Scheduler fixes for Bugzilla #635813 and #633037

* Mon Sep 20 2010 Chuck Ebbert <cebbert@redhat.com>
- Linux 2.6.35.5
- Drop merged patches:
  01-compat-make-compat_alloc_user_space-incorporate-the-access_ok-check.patch
sched-00-fix-user-time-incorrectly-accounted-as-system-time-on-32-bit.patch
@@ -0,0 +1,55 @@
From: Stanislaw Gruszka <sgruszka@redhat.com>
Date: Tue, 14 Sep 2010 14:35:14 +0000 (+0200)
Subject: sched: Fix user time incorrectly accounted as system time on 32-bit
X-Git-Url: http://git.kernel.org/?p=linux%2Fkernel%2Fgit%2Fx86%2Flinux-2.6-tip.git;a=commitdiff_plain;h=e75e863dd5c7d96b91ebbd241da5328fc38a78cc

sched: Fix user time incorrectly accounted as system time on 32-bit

We have a 32-bit variable overflow possibility when multiplying in the
task_times() and thread_group_times() functions. When the
overflow happens, the scaled utime value becomes erroneously
small and the scaled stime becomes erroneously big.
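For illustration only (not part of the patch), a minimal user-space sketch of
the overflow with made-up tick counts: the old code multiplied two 32-bit
cputime_t values and only then widened to u64, while the fix widens first
(temp = rtime; temp *= utime;).

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* cputime_t is an unsigned long, i.e. 32 bits on a 32-bit kernel. */
	uint32_t rtime = 80000;   /* hypothetical sum_exec_runtime in ticks */
	uint32_t utime = 70000;   /* hypothetical user time in ticks        */
	uint32_t total = 100000;  /* utime + stime                          */

	/* Old code: the multiply wraps at 2^32, the cast comes too late. */
	uint64_t before = (uint64_t)(rtime * utime);
	/* New code: widen first, so the multiply happens in 64 bits. */
	uint64_t after = (uint64_t)rtime * utime;

	printf("scaled utime, old: %llu\n", (unsigned long long)(before / total));
	printf("scaled utime, new: %llu\n", (unsigned long long)(after / total));
	return 0;
}

With these made-up values the old computation reports 13050 instead of 56000,
i.e. utime erroneously small, with the remainder accounted as stime.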
Reported here:

https://bugzilla.redhat.com/show_bug.cgi?id=633037
https://bugzilla.kernel.org/show_bug.cgi?id=16559

Reported-by: Michael Chapman <redhat-bugzilla@very.puzzling.org>
Reported-by: Ciriaco Garcia de Celis <sysman@etherpilot.com>
Signed-off-by: Stanislaw Gruszka <sgruszka@redhat.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Cc: <stable@kernel.org> # 2.6.32.19+ (partially) and 2.6.33+
LKML-Reference: <20100914143513.GB8415@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---

diff --git a/kernel/sched.c b/kernel/sched.c
index ed09d4f..dc85ceb 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3513,9 +3513,9 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
rtime = nsecs_to_cputime(p->se.sum_exec_runtime);

if (total) {
- u64 temp;
+ u64 temp = rtime;

- temp = (u64)(rtime * utime);
+ temp *= utime;
do_div(temp, total);
utime = (cputime_t)temp;
} else
@@ -3546,9 +3546,9 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
rtime = nsecs_to_cputime(cputime.sum_exec_runtime);

if (total) {
- u64 temp;
+ u64 temp = rtime;

- temp = (u64)(rtime * cputime.utime);
+ temp *= cputime.utime;
do_div(temp, total);
utime = (cputime_t)temp;
} else
sched-05-avoid-side-effect-of-tickless-idle-on-update_cpu_load.patch
@@ -0,0 +1,276 @@
From: Venkatesh Pallipadi <venki@google.com>
Date: Tue, 18 May 2010 01:14:43 +0000 (-0700)
Subject: sched: Avoid side-effect of tickless idle on update_cpu_load
X-Git-Tag: v2.6.36-rc1~531^2~22
X-Git-Url: http://git.kernel.org/?p=linux%2Fkernel%2Fgit%2Ftorvalds%2Flinux-2.6.git;a=commitdiff_plain;h=fdf3e95d3916f18bf8703fb065499fdbc4dfe34c

sched: Avoid side-effect of tickless idle on update_cpu_load

Tickless idle has a negative side effect on update_cpu_load(), which
in turn can affect load balancing behavior.

update_cpu_load() is supposed to be called every tick, to keep track
of various load indices. With tickless idle, there are no scheduler
ticks called on the idle CPUs. Idle CPUs may still do load balancing
(with the idle_load_balance CPU) using the stale cpu_load. It will also
cause problems when all CPUs go idle for a while and become active
again. In this case loads would not degrade as expected.

This is how the rq->nr_load_updates change looks under different
conditions:

<cpu_num> <nr_load_updates change>
All CPUs idle for 10 seconds (HZ=1000)
0 1621
10 496
11 139
12 875
13 1672
14 12
15 21
1 1472
2 2426
3 1161
4 2108
5 1525
6 701
7 249
8 766
9 1967

One CPU busy, rest idle for 10 seconds
0 10003
10 601
11 95
12 966
13 1597
14 114
15 98
1 3457
2 93
3 6679
4 1425
5 1479
6 595
7 193
8 633
9 1687

All CPUs busy for 10 seconds
0 10026
10 10026
11 10026
12 10026
13 10025
14 10025
15 10025
1 10026
2 10026
3 10026
4 10026
5 10026
6 10026
7 10026
8 10026
9 10026

That is, update_cpu_load() works properly only when all CPUs are busy.
If all are idle, all the CPUs get far fewer updates. And when a few
CPUs are busy and the rest are idle, only the busy and ilb CPUs do proper
updates and the rest of the idle CPUs get fewer updates.

The patch keeps track of when the last update was done and fixes up
the load averages based on the current time.

On one of my test systems, SPECjbb with warehouse 1..numcpus, the patch
improves throughput numbers by ~1% (average of 6 runs). On another
test system (with a different domain hierarchy) there is no noticeable
change in perf.
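As an aside (not part of the patch), here is a self-contained user-space
sketch of the decay trick the patch introduces below in decay_load_missed():
the factor table is the one from the patch, and the 1-bits of the number of
missed ticks select precomputed powers of the per-tick factor, so n missed
ticks cost at most log2(n) multiply/shift steps instead of n. The kernel
version additionally short-circuits idx 1 and fully decayed loads.

#include <stdio.h>

#define DEGRADE_SHIFT 7

/* Table from the patch: one row per cpu_load index, one column per
 * power-of-two number of missed ticks, on a 128-point scale. */
static const unsigned char degrade_factor[5][DEGRADE_SHIFT + 1] = {
	{0, 0, 0, 0, 0, 0, 0, 0},
	{64, 32, 8, 0, 0, 0, 0, 0},
	{96, 72, 40, 12, 1, 0, 0},
	{112, 98, 75, 43, 15, 1, 0},
	{120, 112, 98, 76, 45, 16, 2},
};

/* Decay `load` as if `missed` idle ticks went by at cpu_load index `idx`,
 * multiplying only for the 1-bits of `missed`. */
static unsigned long decay_missed(unsigned long load, unsigned long missed, int idx)
{
	int j = 0;

	while (missed) {
		if (missed & 1)
			load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
		missed >>= 1;
		j++;
	}
	return load;
}

int main(void)
{
	unsigned long naive = 1024;
	int i, idx = 2;		/* per-tick factor 96/128 = 3/4 at this index */

	for (i = 0; i < 5; i++)	/* exact per-tick decay: five mult/shifts */
		naive = (naive * degrade_factor[idx][0]) >> DEGRADE_SHIFT;

	/* 5 = 0b101, so the fast path needs only two mult/shifts */
	printf("naive: %lu, fast: %lu\n", naive, decay_missed(1024, 5, idx));
	return 0;
}

With these numbers the exact loop yields 243 and the approximation 240,
i.e. the same load to within the 128-point rounding.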
Signed-off-by: Venkatesh Pallipadi <venki@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <AANLkTilLtDWQsAUrIxJ6s04WTgmw9GuOODc5AOrYsaR5@mail.gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---

diff --git a/kernel/sched.c b/kernel/sched.c
index f37a961..a757f6b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -457,6 +457,7 @@ struct rq {
unsigned long nr_running;
#define CPU_LOAD_IDX_MAX 5
unsigned long cpu_load[CPU_LOAD_IDX_MAX];
+ unsigned long last_load_update_tick;
#ifdef CONFIG_NO_HZ
u64 nohz_stamp;
unsigned char in_nohz_recently;
@@ -1803,6 +1804,7 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
static void calc_load_account_idle(struct rq *this_rq);
static void update_sysctl(void);
static int get_update_sysctl_factor(void);
+static void update_cpu_load(struct rq *this_rq);

static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
{
@@ -3050,23 +3052,102 @@ static void calc_load_account_active(struct rq *this_rq)
}

/*
+ * The exact cpuload at various idx values, calculated at every tick would be
+ * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
+ *
+ * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
+ * on nth tick when cpu may be busy, then we have:
+ * load = ((2^idx - 1) / 2^idx)^(n-1) * load
+ * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
+ *
+ * decay_load_missed() below does efficient calculation of
+ * load = ((2^idx - 1) / 2^idx)^(n-1) * load
+ * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
+ *
+ * The calculation is approximated on a 128 point scale.
+ * degrade_zero_ticks is the number of ticks after which load at any
+ * particular idx is approximated to be zero.
+ * degrade_factor is a precomputed table, a row for each load idx.
+ * Each column corresponds to degradation factor for a power of two ticks,
+ * based on 128 point scale.
+ * Example:
+ * row 2, col 3 (=12) says that the degradation at load idx 2 after
+ * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
+ *
+ * With this power of 2 load factors, we can degrade the load n times
+ * by looking at 1 bits in n and doing as many mult/shift instead of
+ * n mult/shifts needed by the exact degradation.
+ */
+#define DEGRADE_SHIFT 7
+static const unsigned char
+ degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
+static const unsigned char
+ degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
+ {0, 0, 0, 0, 0, 0, 0, 0},
+ {64, 32, 8, 0, 0, 0, 0, 0},
+ {96, 72, 40, 12, 1, 0, 0},
+ {112, 98, 75, 43, 15, 1, 0},
+ {120, 112, 98, 76, 45, 16, 2} };
+
+/*
+ * Update cpu_load for any missed ticks, due to tickless idle. The backlog
+ * would be when CPU is idle and so we just decay the old load without
+ * adding any new load.
+ */
+static unsigned long
+decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
+{
+ int j = 0;
+
+ if (!missed_updates)
+ return load;
+
+ if (missed_updates >= degrade_zero_ticks[idx])
+ return 0;
+
+ if (idx == 1)
+ return load >> missed_updates;
+
+ while (missed_updates) {
+ if (missed_updates % 2)
+ load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
+
+ missed_updates >>= 1;
+ j++;
+ }
+ return load;
+}
+
+/*
* Update rq->cpu_load[] statistics. This function is usually called every
- * scheduler tick (TICK_NSEC).
+ * scheduler tick (TICK_NSEC). With tickless idle this will not be called
+ * every tick. We fix it up based on jiffies.
*/
static void update_cpu_load(struct rq *this_rq)
{
unsigned long this_load = this_rq->load.weight;
+ unsigned long curr_jiffies = jiffies;
+ unsigned long pending_updates;
int i, scale;

this_rq->nr_load_updates++;

+ /* Avoid repeated calls on same jiffy, when moving in and out of idle */
+ if (curr_jiffies == this_rq->last_load_update_tick)
+ return;
+
+ pending_updates = curr_jiffies - this_rq->last_load_update_tick;
+ this_rq->last_load_update_tick = curr_jiffies;
+
/* Update our load: */
- for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
+ this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
+ for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
unsigned long old_load, new_load;

/* scale is effectively 1 << i now, and >> i divides by scale */

old_load = this_rq->cpu_load[i];
+ old_load = decay_load_missed(old_load, pending_updates - 1, i);
new_load = this_load;
/*
* Round up the averaging division if load is increasing. This
@@ -3074,9 +3155,15 @@ static void update_cpu_load(struct rq *this_rq)
* example.
*/
if (new_load > old_load)
- new_load += scale-1;
- this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
+ new_load += scale - 1;
+
+ this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
}
+}
+
+static void update_cpu_load_active(struct rq *this_rq)
+{
+ update_cpu_load(this_rq);

calc_load_account_active(this_rq);
}
@@ -3464,7 +3551,7 @@ void scheduler_tick(void)

raw_spin_lock(&rq->lock);
update_rq_clock(rq);
- update_cpu_load(rq);
+ update_cpu_load_active(rq);
curr->sched_class->task_tick(rq, curr, 0);
raw_spin_unlock(&rq->lock);

@@ -7688,6 +7775,9 @@ void __init sched_init(void)

for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
rq->cpu_load[j] = 0;
+
+ rq->last_load_update_tick = jiffies;
+
#ifdef CONFIG_SMP
rq->sd = NULL;
rq->rd = NULL;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index eed35ed..22b8b4f 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -3420,9 +3420,12 @@ static void run_rebalance_domains(struct softirq_action *h)
if (need_resched())
break;

+ rq = cpu_rq(balance_cpu);
+ raw_spin_lock_irq(&rq->lock);
+ update_cpu_load(rq);
+ raw_spin_unlock_irq(&rq->lock);
rebalance_domains(balance_cpu, CPU_IDLE);

- rq = cpu_rq(balance_cpu);
if (time_after(this_rq->next_balance, rq->next_balance))
this_rq->next_balance = rq->next_balance;
}
sched-10-change-nohz-idle-load-balancing-logic-to-push-model.patch
@@ -0,0 +1,651 @@
From: Venkatesh Pallipadi <venki@google.com>
Date: Sat, 22 May 2010 00:09:41 +0000 (-0700)
Subject: sched: Change nohz idle load balancing logic to push model
X-Git-Tag: v2.6.36-rc1~531^2~21
X-Git-Url: http://git.kernel.org/?p=linux%2Fkernel%2Fgit%2Ftorvalds%2Flinux-2.6.git;a=commitdiff_plain;h=83cd4fe27ad8446619b2e030b171b858501de87d

sched: Change nohz idle load balancing logic to push model

In the new push model, all idle CPUs indeed go into nohz mode. There is
still the concept of an idle load balancer (performing the load balancing
on behalf of all the idle cpus in the system). A busy CPU kicks the nohz
balancer when any of the nohz CPUs need idle load balancing.
The kickee CPU does the idle load balancing on behalf of all idle CPUs
instead of the normal idle balance.

This addresses the below two problems with the current nohz ilb logic:
* the idle load balancer continued to have periodic ticks during idle and
  woke up frequently, even though it did not have any rebalancing to do on
  behalf of any of the idle CPUs.
* On x86 and CPUs that have APIC timer stoppage on idle CPUs, this
  periodic wakeup can result in a periodic additional interrupt on a CPU
  doing the timer broadcast.

Also, currently we are migrating the unpinned timers from an idle cpu to the cpu
doing idle load balancing (when all the cpus in the system are idle,
there is no idle load balancing cpu and timers get added to the same idle cpu
where the request was made. So the existing optimization works only on a semi-idle
system).

And in a semi-idle system, we no longer have periodic ticks on the idle load
balancer CPU. Using that cpu will add more delays to the timers than intended
(as that cpu's timer base may not be up to date wrt jiffies etc). This was
causing mysterious slowdowns during boot etc.

For now, in the semi-idle case, use the nearest busy cpu for migrating timers
from an idle cpu. This is good for power-savings anyway.
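As an aside (not part of the patch), here is a stand-alone user-space sketch
of the kick heuristic this patch adds in nohz_kick_needed(), using C11 atomics
in place of the kernel's atomic_t; NR_CPUS plays the role of the nr_cpu_ids
"no owner" sentinel, and the jiffies/next_balance early-outs are omitted:

#include <stdio.h>
#include <stdatomic.h>

#define NR_CPUS 4			/* stand-in for nr_cpu_ids */

static atomic_int first_pick = NR_CPUS;	/* NR_CPUS means "unclaimed" */
static atomic_int second_pick = NR_CPUS;

/* Return 1 when a busy cpu should kick the idle load balancer. */
static int kick_needed(int cpu, int nr_running)
{
	int old;

	if (!nr_running)
		return 0;

	old = NR_CPUS;
	if (atomic_compare_exchange_strong(&first_pick, &old, cpu) || old == cpu) {
		/* First busy cpu: drop any stale second_pick claim of ours
		 * and kick only once more than one task is runnable. */
		old = cpu;
		atomic_compare_exchange_strong(&second_pick, &old, NR_CPUS);
		return nr_running > 1;
	}

	old = NR_CPUS;
	if (atomic_compare_exchange_strong(&second_pick, &old, cpu) || old == cpu)
		return 1;	/* a second busy cpu kicks with any load */

	return 0;
}

int main(void)
{
	printf("cpu0, 1 task : %d\n", kick_needed(0, 1));	/* 0: no kick */
	printf("cpu0, 2 tasks: %d\n", kick_needed(0, 2));	/* 1: overloaded */
	printf("cpu1, 1 task : %d\n", kick_needed(1, 1));	/* 1: second busy cpu */
	return 0;
}

The first busy CPU only kicks once it is overloaded, so a single running task
in the system never triggers idle load balancing; any additional busy CPU
kicks immediately so sibling-level active_load_balance can still happen.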
Signed-off-by: Venkatesh Pallipadi <venki@google.com>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <1274486981.2840.46.camel@sbs-t61.sc.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---

[ backported for 2.6.35 ]

diff --git a/include/linux/sched.h b/include/linux/sched.h
index c2d4316..a3e5b1c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -271,13 +271,10 @@ extern int runqueue_is_locked(int cpu);

extern cpumask_var_t nohz_cpu_mask;
#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ)
-extern int select_nohz_load_balancer(int cpu);
-extern int get_nohz_load_balancer(void);
+extern void select_nohz_load_balancer(int stop_tick);
+extern int get_nohz_timer_target(void);
#else
-static inline int select_nohz_load_balancer(int cpu)
-{
- return 0;
-}
+static inline void select_nohz_load_balancer(int stop_tick) { }
#endif

/*
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 5c69e99..e934339 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -144,12 +144,8 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
static int hrtimer_get_target(int this_cpu, int pinned)
{
#ifdef CONFIG_NO_HZ
- if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) {
- int preferred_cpu = get_nohz_load_balancer();
-
- if (preferred_cpu >= 0)
- return preferred_cpu;
- }
+ if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu))
+ return get_nohz_timer_target();
#endif
return this_cpu;
}
diff --git a/kernel/sched.c b/kernel/sched.c
index a757f6b..132950b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -460,7 +460,7 @@ struct rq {
unsigned long last_load_update_tick;
#ifdef CONFIG_NO_HZ
u64 nohz_stamp;
- unsigned char in_nohz_recently;
+ unsigned char nohz_balance_kick;
#endif
unsigned int skip_clock_update;

@@ -1195,6 +1195,27 @@ static void resched_cpu(int cpu)

#ifdef CONFIG_NO_HZ
/*
+ * In the semi idle case, use the nearest busy cpu for migrating timers
+ * from an idle cpu. This is good for power-savings.
+ *
+ * We don't do similar optimization for completely idle system, as
+ * selecting an idle cpu will add more delays to the timers than intended
+ * (as that cpu's timer base may not be uptodate wrt jiffies etc).
+ */
+int get_nohz_timer_target(void)
+{
+ int cpu = smp_processor_id();
+ int i;
+ struct sched_domain *sd;
+
+ for_each_domain(cpu, sd) {
+ for_each_cpu(i, sched_domain_span(sd))
+ if (!idle_cpu(i))
+ return i;
+ }
+ return cpu;
+}
+/*
* When add_timer_on() enqueues a timer into the timer wheel of an
* idle CPU then this timer might expire before the next timer event
* which is scheduled to wake up that CPU. In case of a completely
@@ -7791,6 +7812,10 @@ void __init sched_init(void)
rq->idle_stamp = 0;
rq->avg_idle = 2*sysctl_sched_migration_cost;
rq_attach_root(rq, &def_root_domain);
+#ifdef CONFIG_NO_HZ
+ rq->nohz_balance_kick = 0;
+ init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i));
+#endif
#endif
init_rq_hrtick(rq);
atomic_set(&rq->nr_iowait, 0);
@@ -7835,8 +7860,11 @@ void __init sched_init(void)
zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
#ifdef CONFIG_SMP
#ifdef CONFIG_NO_HZ
- zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT);
- alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT);
+ zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
+ alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
+ atomic_set(&nohz.load_balancer, nr_cpu_ids);
+ atomic_set(&nohz.first_pick_cpu, nr_cpu_ids);
+ atomic_set(&nohz.second_pick_cpu, nr_cpu_ids);
#endif
/* May be allocated at isolcpus cmdline parse time */
if (cpu_isolated_map == NULL)
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 22b8b4f..6ee2e0a 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -3091,13 +3091,40 @@ out_unlock:
}

#ifdef CONFIG_NO_HZ
+
+static DEFINE_PER_CPU(struct call_single_data, remote_sched_softirq_cb);
+
+static void trigger_sched_softirq(void *data)
+{
+ raise_softirq_irqoff(SCHED_SOFTIRQ);
+}
+
+static inline void init_sched_softirq_csd(struct call_single_data *csd)
+{
+ csd->func = trigger_sched_softirq;
+ csd->info = NULL;
+ csd->flags = 0;
+ csd->priv = 0;
+}
+
+/*
+ * idle load balancing details
+ * - One of the idle CPUs nominates itself as idle load_balancer, while
+ * entering idle.
+ * - This idle load balancer CPU will also go into tickless mode when
+ * it is idle, just like all other idle CPUs
+ * - When one of the busy CPUs notice that there may be an idle rebalancing
+ * needed, they will kick the idle load balancer, which then does idle
+ * load balancing for all the idle CPUs.
+ */
static struct {
atomic_t load_balancer;
- cpumask_var_t cpu_mask;
- cpumask_var_t ilb_grp_nohz_mask;
-} nohz ____cacheline_aligned = {
- .load_balancer = ATOMIC_INIT(-1),
-};
+ atomic_t first_pick_cpu;
+ atomic_t second_pick_cpu;
+ cpumask_var_t idle_cpus_mask;
+ cpumask_var_t grp_idle_mask;
+ unsigned long next_balance; /* in jiffy units */
+} nohz ____cacheline_aligned;

int get_nohz_load_balancer(void)
{
@@ -3151,17 +3178,17 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
*/
static inline int is_semi_idle_group(struct sched_group *ilb_group)
{
- cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
+ cpumask_and(nohz.grp_idle_mask, nohz.idle_cpus_mask,
sched_group_cpus(ilb_group));

/*
* A sched_group is semi-idle when it has atleast one busy cpu
* and atleast one idle cpu.
*/
- if (cpumask_empty(nohz.ilb_grp_nohz_mask))
+ if (cpumask_empty(nohz.grp_idle_mask))
return 0;

- if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
+ if (cpumask_equal(nohz.grp_idle_mask, sched_group_cpus(ilb_group)))
return 0;

return 1;
@@ -3194,7 +3221,7 @@ static int find_new_ilb(int cpu)
* Optimize for the case when we have no idle CPUs or only one
* idle CPU. Don't walk the sched_domain hierarchy in such cases
*/
- if (cpumask_weight(nohz.cpu_mask) < 2)
+ if (cpumask_weight(nohz.idle_cpus_mask) < 2)
goto out_done;

for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
@@ -3202,7 +3229,7 @@ static int find_new_ilb(int cpu)

do {
if (is_semi_idle_group(ilb_group))
- return cpumask_first(nohz.ilb_grp_nohz_mask);
+ return cpumask_first(nohz.grp_idle_mask);

ilb_group = ilb_group->next;

@@ -3210,98 +3237,116 @@ static int find_new_ilb(int cpu)
}

out_done:
- return cpumask_first(nohz.cpu_mask);
+ return nr_cpu_ids;
}
#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
static inline int find_new_ilb(int call_cpu)
{
- return cpumask_first(nohz.cpu_mask);
+ return nr_cpu_ids;
}
#endif

/*
+ * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
+ * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
+ * CPU (if there is one).
+ */
+static void nohz_balancer_kick(int cpu)
+{
+ int ilb_cpu;
+
+ nohz.next_balance++;
+
+ ilb_cpu = get_nohz_load_balancer();
+
+ if (ilb_cpu >= nr_cpu_ids) {
+ ilb_cpu = cpumask_first(nohz.idle_cpus_mask);
+ if (ilb_cpu >= nr_cpu_ids)
+ return;
+ }
+
+ if (!cpu_rq(ilb_cpu)->nohz_balance_kick) {
+ struct call_single_data *cp;
+
+ cpu_rq(ilb_cpu)->nohz_balance_kick = 1;
+ cp = &per_cpu(remote_sched_softirq_cb, cpu);
+ __smp_call_function_single(ilb_cpu, cp, 0);
+ }
+ return;
+}
+
+/*
* This routine will try to nominate the ilb (idle load balancing)
* owner among the cpus whose ticks are stopped. ilb owner will do the idle
- * load balancing on behalf of all those cpus. If all the cpus in the system
- * go into this tickless mode, then there will be no ilb owner (as there is
- * no need for one) and all the cpus will sleep till the next wakeup event
- * arrives...
- *
- * For the ilb owner, tick is not stopped. And this tick will be used
- * for idle load balancing. ilb owner will still be part of
- * nohz.cpu_mask..
+ * load balancing on behalf of all those cpus.
*
- * While stopping the tick, this cpu will become the ilb owner if there
- * is no other owner. And will be the owner till that cpu becomes busy
- * or if all cpus in the system stop their ticks at which point
- * there is no need for ilb owner.
+ * When the ilb owner becomes busy, we will not have new ilb owner until some
+ * idle CPU wakes up and goes back to idle or some busy CPU tries to kick
+ * idle load balancing by kicking one of the idle CPUs.
*
- * When the ilb owner becomes busy, it nominates another owner, during the
- * next busy scheduler_tick()
+ * Ticks are stopped for the ilb owner as well, with busy CPU kicking this
+ * ilb owner CPU in future (when there is a need for idle load balancing on
+ * behalf of all idle CPUs).
*/
-int select_nohz_load_balancer(int stop_tick)
+void select_nohz_load_balancer(int stop_tick)
{
int cpu = smp_processor_id();

if (stop_tick) {
- cpu_rq(cpu)->in_nohz_recently = 1;
-
if (!cpu_active(cpu)) {
if (atomic_read(&nohz.load_balancer) != cpu)
- return 0;
+ return;

/*
* If we are going offline and still the leader,
* give up!
*/
- if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
+ if (atomic_cmpxchg(&nohz.load_balancer, cpu,
+ nr_cpu_ids) != cpu)
BUG();

- return 0;
+ return;
}

- cpumask_set_cpu(cpu, nohz.cpu_mask);
+ cpumask_set_cpu(cpu, nohz.idle_cpus_mask);

- /* time for ilb owner also to sleep */
- if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) {
- if (atomic_read(&nohz.load_balancer) == cpu)
- atomic_set(&nohz.load_balancer, -1);
- return 0;
- }
+ if (atomic_read(&nohz.first_pick_cpu) == cpu)
+ atomic_cmpxchg(&nohz.first_pick_cpu, cpu, nr_cpu_ids);
+ if (atomic_read(&nohz.second_pick_cpu) == cpu)
+ atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids);

- if (atomic_read(&nohz.load_balancer) == -1) {
- /* make me the ilb owner */
- if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
- return 1;
- } else if (atomic_read(&nohz.load_balancer) == cpu) {
+ if (atomic_read(&nohz.load_balancer) >= nr_cpu_ids) {
int new_ilb;

- if (!(sched_smt_power_savings ||
- sched_mc_power_savings))
- return 1;
+ /* make me the ilb owner */
+ if (atomic_cmpxchg(&nohz.load_balancer, nr_cpu_ids,
+ cpu) != nr_cpu_ids)
+ return;
+
/*
* Check to see if there is a more power-efficient
* ilb.
*/
new_ilb = find_new_ilb(cpu);
if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
- atomic_set(&nohz.load_balancer, -1);
+ atomic_set(&nohz.load_balancer, nr_cpu_ids);
resched_cpu(new_ilb);
- return 0;
+ return;
}
- return 1;
+ return;
}
} else {
- if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
- return 0;
+ if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
+ return;

- cpumask_clear_cpu(cpu, nohz.cpu_mask);
+ cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);

if (atomic_read(&nohz.load_balancer) == cpu)
- if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
+ if (atomic_cmpxchg(&nohz.load_balancer, cpu,
+ nr_cpu_ids) != cpu)
BUG();
}
- return 0;
+ return;
}
#endif

@@ -3383,11 +3428,101 @@ out:
rq->next_balance = next_balance;
}

+#ifdef CONFIG_NO_HZ
/*
- * run_rebalance_domains is triggered when needed from the scheduler tick.
- * In CONFIG_NO_HZ case, the idle load balance owner will do the
+ * In CONFIG_NO_HZ case, the idle balance kickee will do the
* rebalancing for all the cpus for whom scheduler ticks are stopped.
*/
+static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
+{
+ struct rq *this_rq = cpu_rq(this_cpu);
+ struct rq *rq;
+ int balance_cpu;
+
+ if (idle != CPU_IDLE || !this_rq->nohz_balance_kick)
+ return;
+
+ for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
+ if (balance_cpu == this_cpu)
+ continue;
+
+ /*
+ * If this cpu gets work to do, stop the load balancing
+ * work being done for other cpus. Next load
+ * balancing owner will pick it up.
+ */
+ if (need_resched()) {
+ this_rq->nohz_balance_kick = 0;
+ break;
+ }
+
+ raw_spin_lock_irq(&this_rq->lock);
+ update_cpu_load(this_rq);
+ raw_spin_unlock_irq(&this_rq->lock);
+
+ rebalance_domains(balance_cpu, CPU_IDLE);
+
+ rq = cpu_rq(balance_cpu);
+ if (time_after(this_rq->next_balance, rq->next_balance))
+ this_rq->next_balance = rq->next_balance;
+ }
+ nohz.next_balance = this_rq->next_balance;
+ this_rq->nohz_balance_kick = 0;
+}
+
+/*
+ * Current heuristic for kicking the idle load balancer
+ * - first_pick_cpu is the one of the busy CPUs. It will kick
+ * idle load balancer when it has more than one process active. This
+ * eliminates the need for idle load balancing altogether when we have
+ * only one running process in the system (common case).
+ * - If there are more than one busy CPU, idle load balancer may have
+ * to run for active_load_balance to happen (i.e., two busy CPUs are
+ * SMT or core siblings and can run better if they move to different
+ * physical CPUs). So, second_pick_cpu is the second of the busy CPUs
+ * which will kick idle load balancer as soon as it has any load.
+ */
+static inline int nohz_kick_needed(struct rq *rq, int cpu)
+{
+ unsigned long now = jiffies;
+ int ret;
+ int first_pick_cpu, second_pick_cpu;
+
+ if (time_before(now, nohz.next_balance))
+ return 0;
+
+ if (!rq->nr_running)
+ return 0;
+
+ first_pick_cpu = atomic_read(&nohz.first_pick_cpu);
+ second_pick_cpu = atomic_read(&nohz.second_pick_cpu);
+
+ if (first_pick_cpu < nr_cpu_ids && first_pick_cpu != cpu &&
+ second_pick_cpu < nr_cpu_ids && second_pick_cpu != cpu)
+ return 0;
+
+ ret = atomic_cmpxchg(&nohz.first_pick_cpu, nr_cpu_ids, cpu);
+ if (ret == nr_cpu_ids || ret == cpu) {
+ atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids);
+ if (rq->nr_running > 1)
+ return 1;
+ } else {
+ ret = atomic_cmpxchg(&nohz.second_pick_cpu, nr_cpu_ids, cpu);
+ if (ret == nr_cpu_ids || ret == cpu) {
+ if (rq->nr_running)
+ return 1;
+ }
+ }
+ return 0;
+}
+#else
+static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
+#endif
+
+/*
+ * run_rebalance_domains is triggered when needed from the scheduler tick.
+ * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
+ */
static void run_rebalance_domains(struct softirq_action *h)
{
int this_cpu = smp_processor_id();
@@ -3397,40 +3532,12 @@ static void run_rebalance_domains(struct softirq_action *h)

rebalance_domains(this_cpu, idle);

-#ifdef CONFIG_NO_HZ
/*
- * If this cpu is the owner for idle load balancing, then do the
+ * If this cpu has a pending nohz_balance_kick, then do the
* balancing on behalf of the other idle cpus whose ticks are
* stopped.
*/
- if (this_rq->idle_at_tick &&
- atomic_read(&nohz.load_balancer) == this_cpu) {
- struct rq *rq;
- int balance_cpu;
-
- for_each_cpu(balance_cpu, nohz.cpu_mask) {
- if (balance_cpu == this_cpu)
- continue;
-
- /*
- * If this cpu gets work to do, stop the load balancing
- * work being done for other cpus. Next load
- * balancing owner will pick it up.
- */
- if (need_resched())
- break;
-
- rq = cpu_rq(balance_cpu);
- raw_spin_lock_irq(&rq->lock);
- update_cpu_load(rq);
- raw_spin_unlock_irq(&rq->lock);
- rebalance_domains(balance_cpu, CPU_IDLE);
-
- if (time_after(this_rq->next_balance, rq->next_balance))
- this_rq->next_balance = rq->next_balance;
- }
- }
-#endif
+ nohz_idle_balance(this_cpu, idle);
}

static inline int on_null_domain(int cpu)
@@ -3440,57 +3547,17 @@ static inline int on_null_domain(int cpu)

/*
* Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
- *
- * In case of CONFIG_NO_HZ, this is the place where we nominate a new
- * idle load balancing owner or decide to stop the periodic load balancing,
- * if the whole system is idle.
*/
static inline void trigger_load_balance(struct rq *rq, int cpu)
{
-#ifdef CONFIG_NO_HZ
- /*
- * If we were in the nohz mode recently and busy at the current
- * scheduler tick, then check if we need to nominate new idle
- * load balancer.
- */
- if (rq->in_nohz_recently && !rq->idle_at_tick) {
- rq->in_nohz_recently = 0;
-
- if (atomic_read(&nohz.load_balancer) == cpu) {
- cpumask_clear_cpu(cpu, nohz.cpu_mask);
- atomic_set(&nohz.load_balancer, -1);
- }
-
- if (atomic_read(&nohz.load_balancer) == -1) {
- int ilb = find_new_ilb(cpu);
-
- if (ilb < nr_cpu_ids)
- resched_cpu(ilb);
- }
- }
-
- /*
- * If this cpu is idle and doing idle load balancing for all the
- * cpus with ticks stopped, is it time for that to stop?
- */
- if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
- cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
- resched_cpu(cpu);
- return;
- }
-
- /*
- * If this cpu is idle and the idle load balancing is done by
- * someone else, then no need raise the SCHED_SOFTIRQ
- */
- if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
- cpumask_test_cpu(cpu, nohz.cpu_mask))
- return;
-#endif
/* Don't need to rebalance while attached to NULL domain */
if (time_after_eq(jiffies, rq->next_balance) &&
likely(!on_null_domain(cpu)))
raise_softirq(SCHED_SOFTIRQ);
+#ifdef CONFIG_NO_HZ
+ else if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
+ nohz_balancer_kick(cpu);
+#endif
}

static void rq_online_fair(struct rq *rq)
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 1d7b9bc..5f171f0 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -408,13 +408,7 @@ void tick_nohz_stop_sched_tick(int inidle)
* the scheduler tick in nohz_restart_sched_tick.
*/
if (!ts->tick_stopped) {
- if (select_nohz_load_balancer(1)) {
- /*
- * sched tick not stopped!
- */
- cpumask_clear_cpu(cpu, nohz_cpu_mask);
- goto out;
- }
+ select_nohz_load_balancer(1);

ts->idle_tick = hrtimer_get_expires(&ts->sched_timer);
ts->tick_stopped = 1;
diff --git a/kernel/timer.c b/kernel/timer.c
index ee305c8..48d6aec 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -679,12 +679,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
cpu = smp_processor_id();

#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
- if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) {
- int preferred_cpu = get_nohz_load_balancer();
-
- if (preferred_cpu >= 0)
- cpu = preferred_cpu;
- }
+ if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu))
+ cpu = get_nohz_timer_target();
#endif
new_base = per_cpu(tvec_bases, cpu);
sched-15-update-rq-clock-for-nohz-balanced-cpus.patch
@@ -0,0 +1,28 @@
From: Suresh Siddha <suresh.b.siddha@intel.com>
Date: Fri, 9 Jul 2010 13:19:54 +0000 (+0200)
Subject: sched: Update rq->clock for nohz balanced cpus
X-Git-Tag: v2.6.36-rc1~531^2~5
X-Git-Url: http://git.kernel.org/?p=linux%2Fkernel%2Fgit%2Ftorvalds%2Flinux-2.6.git;a=commitdiff_plain;h=5343bdb8fd076f16edc9d113a9e35e2a1d1f4966

sched: Update rq->clock for nohz balanced cpus

Suresh spotted that we don't update the rq->clock in the nohz
load-balancer path.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1278626014.2834.74.camel@sbs-t61.sc.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index b4da534..e44a591 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -3596,6 +3596,7 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
}

raw_spin_lock_irq(&this_rq->lock);
+ update_rq_clock(this_rq);
update_cpu_load(this_rq);
raw_spin_unlock_irq(&this_rq->lock);
sched-20-fix-rq-clock-synchronization-when-migrating-tasks.patch
@@ -0,0 +1,38 @@
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 19 Aug 2010 11:31:43 +0000 (+0200)
Subject: sched: Fix rq->clock synchronization when migrating tasks
X-Git-Tag: v2.6.36-rc3~25^2~1
X-Git-Url: http://git.kernel.org/?p=linux%2Fkernel%2Fgit%2Ftorvalds%2Flinux-2.6.git;a=commitdiff_plain;h=861d034ee814917a83bd5de4b26e3b8336ddeeb8

sched: Fix rq->clock synchronization when migrating tasks

sched_fork() -- we do task placement in ->task_fork_fair(), so ensure we
update_rq_clock() and work with current time. We leave the vruntime
in relative state, so the time delay until wake_up_new_task() doesn't
matter.

wake_up_new_task() -- Since task_fork_fair() left p->vruntime in
relative state we can safely migrate; the activate_task() on the
remote rq will call update_rq_clock() and cause the clock to be
synced (enough).

Tested-by: Jack Daniel <wanders.thirst@gmail.com>
Tested-by: Philby John <pjohn@mvista.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1281002322.1923.1708.camel@laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 806d1b2..ab661eb 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -3752,6 +3752,8 @@ static void task_fork_fair(struct task_struct *p)

raw_spin_lock_irqsave(&rq->lock, flags);

+ update_rq_clock(rq);
+
if (unlikely(task_cpu(p) != this_cpu))
__set_task_cpu(p, this_cpu);
sched-25-move-sched_avg_update-to-update_cpu_load.patch
@@ -0,0 +1,58 @@
From: Suresh Siddha <suresh.b.siddha@intel.com>
Date: Mon, 23 Aug 2010 20:42:51 +0000 (-0700)
Subject: sched: Move sched_avg_update() to update_cpu_load()
X-Git-Tag: v2.6.36-rc4~8^2~1
X-Git-Url: http://git.kernel.org/?p=linux%2Fkernel%2Fgit%2Ftorvalds%2Flinux-2.6.git;a=commitdiff_plain;h=da2b71edd8a7db44fe1746261410a981f3e03632

sched: Move sched_avg_update() to update_cpu_load()

Currently sched_avg_update() (which updates the rt_avg stats in the rq)
is getting called from scale_rt_power() (in the load balance context),
which doesn't take rq->lock.

Fix it by moving the sched_avg_update() to the more appropriate
update_cpu_load(), where the CFS load gets updated as well.
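As an aside (not part of the patch), a user-space sketch of the class of
problem being avoided: an unlocked read-modify-write of a shared counter
(a toy stand-in for rq->rt_avg) loses updates under concurrency, while doing
it under a lock (a pthread mutex standing in for rq->lock) does not.

#include <stdio.h>
#include <pthread.h>

static unsigned long rt_avg;	/* toy stand-in for rq->rt_avg */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;	/* "rq->lock" */
static int locked;

static void *updater(void *arg)
{
	for (int i = 0; i < 1000000; i++) {
		if (locked) pthread_mutex_lock(&lock);
		rt_avg++;	/* non-atomic read-modify-write */
		if (locked) pthread_mutex_unlock(&lock);
	}
	return NULL;
}

static unsigned long run(int use_lock)
{
	pthread_t a, b;

	rt_avg = 0;
	locked = use_lock;
	pthread_create(&a, NULL, updater, NULL);
	pthread_create(&b, NULL, updater, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	return rt_avg;
}

int main(void)
{
	printf("without lock: %lu (updates lost)\n", run(0));
	printf("with lock:    %lu (expected 2000000)\n", run(1));
	return 0;
}

Build with cc -pthread; the unlocked run races deliberately to show the lost
updates that moving the call under rq->lock (held in update_cpu_load()) rules out.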
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1282596171.2694.3.camel@sbsiddha-MOBL3>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---

diff --git a/kernel/sched.c b/kernel/sched.c
index 09b574e..ed09d4f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1294,6 +1294,10 @@ static void resched_task(struct task_struct *p)
static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
{
}
+
+static void sched_avg_update(struct rq *rq)
+{
+}
#endif /* CONFIG_SMP */

#if BITS_PER_LONG == 32
@@ -3182,6 +3186,8 @@ static void update_cpu_load(struct rq *this_rq)

this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
}
+
+ sched_avg_update(this_rq);
}

static void update_cpu_load_active(struct rq *this_rq)
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index ab661eb..f53ec75 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -2268,8 +2268,6 @@ unsigned long scale_rt_power(int cpu)
struct rq *rq = cpu_rq(cpu);
u64 total, available;

- sched_avg_update(rq);
-
total = sched_avg_period() + (rq->clock - rq->age_stamp);
available = total - rq->rt_avg;