parent 7a68cb0b55
commit b218718b2b

kernel.spec | 20
@@ -742,6 +742,13 @@ Patch12540: irda-correctly-clean-up-self-ias_obj-on-irda_bind-failure.patch
Patch12550: keys-fix-bug-in-keyctl_session_to_parent-if-parent-has-no-session-keyring.patch
Patch12551: keys-fix-rcu-no-lock-warning-in-keyctl_session_to_parent.patch

Patch12560: sched-00-fix-user-time-incorrectly-accounted-as-system-time-on-32-bit.patch
Patch12565: sched-05-avoid-side-effect-of-tickless-idle-on-update_cpu_load.patch
Patch12570: sched-10-change-nohz-idle-load-balancing-logic-to-push-model.patch
Patch12575: sched-15-update-rq-clock-for-nohz-balanced-cpus.patch
Patch12580: sched-20-fix-rq-clock-synchronization-when-migrating-tasks.patch
Patch12585: sched-25-move-sched_avg_update-to-update_cpu_load.patch

%endif

BuildRoot: %{_tmppath}/kernel-%{KVERREL}-root

@@ -1371,6 +1378,14 @@ ApplyPatch irda-correctly-clean-up-self-ias_obj-on-irda_bind-failure.patch
ApplyPatch keys-fix-bug-in-keyctl_session_to_parent-if-parent-has-no-session-keyring.patch
ApplyPatch keys-fix-rcu-no-lock-warning-in-keyctl_session_to_parent.patch

# Scheduler fixes (#635813 and #633037)
ApplyPatch sched-00-fix-user-time-incorrectly-accounted-as-system-time-on-32-bit.patch
ApplyPatch sched-05-avoid-side-effect-of-tickless-idle-on-update_cpu_load.patch
ApplyPatch sched-10-change-nohz-idle-load-balancing-logic-to-push-model.patch
ApplyPatch sched-15-update-rq-clock-for-nohz-balanced-cpus.patch
ApplyPatch sched-20-fix-rq-clock-synchronization-when-migrating-tasks.patch
ApplyPatch sched-25-move-sched_avg_update-to-update_cpu_load.patch

# END OF PATCH APPLICATIONS

%endif

@@ -1957,7 +1972,10 @@ fi
# and build.

%changelog
* Mon Sep 20 2010 Chuck Ebbert <cebbert@redhat.com> 2.6.35.5-29
* Tue Sep 21 2010 Chuck Ebbert <cebbert@redhat.com> 2.6.35.5-29
- Scheduler fixes for Bugzilla #635813 and #633037

* Mon Sep 20 2010 Chuck Ebbert <cebbert@redhat.com>
- Linux 2.6.35.5
- Drop merged patches:
  01-compat-make-compat_alloc_user_space-incorporate-the-access_ok-check.patch
sched-00-fix-user-time-incorrectly-accounted-as-system-time-on-32-bit.patch
@@ -0,0 +1,55 @@
From: Stanislaw Gruszka <sgruszka@redhat.com>
Date: Tue, 14 Sep 2010 14:35:14 +0000 (+0200)
Subject: sched: Fix user time incorrectly accounted as system time on 32-bit
X-Git-Url: http://git.kernel.org/?p=linux%2Fkernel%2Fgit%2Fx86%2Flinux-2.6-tip.git;a=commitdiff_plain;h=e75e863dd5c7d96b91ebbd241da5328fc38a78cc

sched: Fix user time incorrectly accounted as system time on 32-bit

We have a 32-bit variable overflow possibility when multiplying in the
task_times() and thread_group_times() functions. When the
overflow happens, the scaled utime value becomes erroneously
small and the scaled stime becomes erroneously big.
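For illustration only (not part of the patch), a minimal user-space sketch of
the overflow with made-up tick counts: the old code multiplied two 32-bit
cputime_t values and only then widened to u64, while the fix widens first
(temp = rtime; temp *= utime;).

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* cputime_t is an unsigned long, i.e. 32 bits on a 32-bit kernel. */
	uint32_t rtime = 80000;   /* hypothetical sum_exec_runtime in ticks */
	uint32_t utime = 70000;   /* hypothetical user time in ticks        */
	uint32_t total = 100000;  /* utime + stime                          */

	/* Old code: the multiply wraps at 2^32, the cast comes too late. */
	uint64_t before = (uint64_t)(rtime * utime);
	/* New code: widen first, so the multiply happens in 64 bits. */
	uint64_t after = (uint64_t)rtime * utime;

	printf("scaled utime, old: %llu\n", (unsigned long long)(before / total));
	printf("scaled utime, new: %llu\n", (unsigned long long)(after / total));
	return 0;
}

With these made-up values the old computation reports 13050 instead of 56000,
i.e. utime erroneously small, with the remainder accounted as stime.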
Reported here:

https://bugzilla.redhat.com/show_bug.cgi?id=633037
https://bugzilla.kernel.org/show_bug.cgi?id=16559

Reported-by: Michael Chapman <redhat-bugzilla@very.puzzling.org>
Reported-by: Ciriaco Garcia de Celis <sysman@etherpilot.com>
Signed-off-by: Stanislaw Gruszka <sgruszka@redhat.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Cc: <stable@kernel.org> # 2.6.32.19+ (partially) and 2.6.33+
LKML-Reference: <20100914143513.GB8415@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---

diff --git a/kernel/sched.c b/kernel/sched.c
index ed09d4f..dc85ceb 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3513,9 +3513,9 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
rtime = nsecs_to_cputime(p->se.sum_exec_runtime);

if (total) {
- u64 temp;
+ u64 temp = rtime;

- temp = (u64)(rtime * utime);
+ temp *= utime;
do_div(temp, total);
utime = (cputime_t)temp;
} else
@@ -3546,9 +3546,9 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
rtime = nsecs_to_cputime(cputime.sum_exec_runtime);

if (total) {
- u64 temp;
+ u64 temp = rtime;

- temp = (u64)(rtime * cputime.utime);
+ temp *= cputime.utime;
do_div(temp, total);
utime = (cputime_t)temp;
} else
sched-05-avoid-side-effect-of-tickless-idle-on-update_cpu_load.patch
@@ -0,0 +1,276 @@
From: Venkatesh Pallipadi <venki@google.com>
Date: Tue, 18 May 2010 01:14:43 +0000 (-0700)
Subject: sched: Avoid side-effect of tickless idle on update_cpu_load
X-Git-Tag: v2.6.36-rc1~531^2~22
X-Git-Url: http://git.kernel.org/?p=linux%2Fkernel%2Fgit%2Ftorvalds%2Flinux-2.6.git;a=commitdiff_plain;h=fdf3e95d3916f18bf8703fb065499fdbc4dfe34c

sched: Avoid side-effect of tickless idle on update_cpu_load

Tickless idle has a negative side effect on update_cpu_load(), which
in turn can affect load balancing behavior.

update_cpu_load() is supposed to be called every tick, to keep track
of various load indices. With tickless idle, there are no scheduler
ticks called on the idle CPUs. Idle CPUs may still do load balancing
(with the idle_load_balance CPU) using the stale cpu_load. It will also
cause problems when all CPUs go idle for a while and become active
again. In this case loads would not degrade as expected.

This is how the rq->nr_load_updates change looks under different
conditions:

<cpu_num> <nr_load_updates change>
All CPUs idle for 10 seconds (HZ=1000)
0 1621
10 496
11 139
12 875
13 1672
14 12
15 21
1 1472
2 2426
3 1161
4 2108
5 1525
6 701
7 249
8 766
9 1967

One CPU busy, rest idle for 10 seconds
0 10003
10 601
11 95
12 966
13 1597
14 114
15 98
1 3457
2 93
3 6679
4 1425
5 1479
6 595
7 193
8 633
9 1687

All CPUs busy for 10 seconds
0 10026
10 10026
11 10026
12 10026
13 10025
14 10025
15 10025
1 10026
2 10026
3 10026
4 10026
5 10026
6 10026
7 10026
8 10026
9 10026

That is, update_cpu_load() works properly only when all CPUs are busy.
If all are idle, all the CPUs get far fewer updates. And when a few
CPUs are busy and the rest are idle, only the busy and ilb CPUs do proper
updates and the rest of the idle CPUs get fewer updates.

The patch keeps track of when the last update was done and fixes up
the load averages based on the current time.

On one of my test systems, SPECjbb with warehouse 1..numcpus, the patch
improves throughput numbers by ~1% (average of 6 runs). On another
test system (with a different domain hierarchy) there is no noticeable
change in perf.
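As an aside (not part of the patch), here is a self-contained user-space
sketch of the decay trick the patch introduces below in decay_load_missed():
the factor table is the one from the patch, and the 1-bits of the number of
missed ticks select precomputed powers of the per-tick factor, so n missed
ticks cost at most log2(n) multiply/shift steps instead of n. The kernel
version additionally short-circuits idx 1 and fully decayed loads.

#include <stdio.h>

#define DEGRADE_SHIFT 7

/* Table from the patch: one row per cpu_load index, one column per
 * power-of-two number of missed ticks, on a 128-point scale. */
static const unsigned char degrade_factor[5][DEGRADE_SHIFT + 1] = {
	{0, 0, 0, 0, 0, 0, 0, 0},
	{64, 32, 8, 0, 0, 0, 0, 0},
	{96, 72, 40, 12, 1, 0, 0},
	{112, 98, 75, 43, 15, 1, 0},
	{120, 112, 98, 76, 45, 16, 2},
};

/* Decay `load` as if `missed` idle ticks went by at cpu_load index `idx`,
 * multiplying only for the 1-bits of `missed`. */
static unsigned long decay_missed(unsigned long load, unsigned long missed, int idx)
{
	int j = 0;

	while (missed) {
		if (missed & 1)
			load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
		missed >>= 1;
		j++;
	}
	return load;
}

int main(void)
{
	unsigned long naive = 1024;
	int i, idx = 2;		/* per-tick factor 96/128 = 3/4 at this index */

	for (i = 0; i < 5; i++)	/* exact per-tick decay: five mult/shifts */
		naive = (naive * degrade_factor[idx][0]) >> DEGRADE_SHIFT;

	/* 5 = 0b101, so the fast path needs only two mult/shifts */
	printf("naive: %lu, fast: %lu\n", naive, decay_missed(1024, 5, idx));
	return 0;
}

With these numbers the exact loop yields 243 and the approximation 240,
i.e. the same load to within the 128-point rounding.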
Signed-off-by: Venkatesh Pallipadi <venki@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <AANLkTilLtDWQsAUrIxJ6s04WTgmw9GuOODc5AOrYsaR5@mail.gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---

diff --git a/kernel/sched.c b/kernel/sched.c
index f37a961..a757f6b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -457,6 +457,7 @@ struct rq {
unsigned long nr_running;
#define CPU_LOAD_IDX_MAX 5
unsigned long cpu_load[CPU_LOAD_IDX_MAX];
+ unsigned long last_load_update_tick;
#ifdef CONFIG_NO_HZ
u64 nohz_stamp;
unsigned char in_nohz_recently;
@@ -1803,6 +1804,7 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
static void calc_load_account_idle(struct rq *this_rq);
static void update_sysctl(void);
static int get_update_sysctl_factor(void);
+static void update_cpu_load(struct rq *this_rq);

static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
{
@@ -3050,23 +3052,102 @@ static void calc_load_account_active(struct rq *this_rq)
}

/*
+ * The exact cpuload at various idx values, calculated at every tick would be
+ * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
+ *
+ * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
+ * on nth tick when cpu may be busy, then we have:
+ * load = ((2^idx - 1) / 2^idx)^(n-1) * load
+ * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
+ *
+ * decay_load_missed() below does efficient calculation of
+ * load = ((2^idx - 1) / 2^idx)^(n-1) * load
+ * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
+ *
+ * The calculation is approximated on a 128 point scale.
+ * degrade_zero_ticks is the number of ticks after which load at any
+ * particular idx is approximated to be zero.
+ * degrade_factor is a precomputed table, a row for each load idx.
+ * Each column corresponds to degradation factor for a power of two ticks,
+ * based on 128 point scale.
+ * Example:
+ * row 2, col 3 (=12) says that the degradation at load idx 2 after
+ * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
+ *
+ * With this power of 2 load factors, we can degrade the load n times
+ * by looking at 1 bits in n and doing as many mult/shift instead of
+ * n mult/shifts needed by the exact degradation.
+ */
+#define DEGRADE_SHIFT 7
+static const unsigned char
+ degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
+static const unsigned char
+ degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
+ {0, 0, 0, 0, 0, 0, 0, 0},
+ {64, 32, 8, 0, 0, 0, 0, 0},
+ {96, 72, 40, 12, 1, 0, 0},
+ {112, 98, 75, 43, 15, 1, 0},
+ {120, 112, 98, 76, 45, 16, 2} };
+
+/*
+ * Update cpu_load for any missed ticks, due to tickless idle. The backlog
+ * would be when CPU is idle and so we just decay the old load without
+ * adding any new load.
+ */
+static unsigned long
+decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
+{
+ int j = 0;
+
+ if (!missed_updates)
+ return load;
+
+ if (missed_updates >= degrade_zero_ticks[idx])
+ return 0;
+
+ if (idx == 1)
+ return load >> missed_updates;
+
+ while (missed_updates) {
+ if (missed_updates % 2)
+ load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
+
+ missed_updates >>= 1;
+ j++;
+ }
+ return load;
+}
+
+/*
* Update rq->cpu_load[] statistics. This function is usually called every
- * scheduler tick (TICK_NSEC).
+ * scheduler tick (TICK_NSEC). With tickless idle this will not be called
+ * every tick. We fix it up based on jiffies.
*/
static void update_cpu_load(struct rq *this_rq)
{
unsigned long this_load = this_rq->load.weight;
+ unsigned long curr_jiffies = jiffies;
+ unsigned long pending_updates;
int i, scale;

this_rq->nr_load_updates++;

+ /* Avoid repeated calls on same jiffy, when moving in and out of idle */
+ if (curr_jiffies == this_rq->last_load_update_tick)
+ return;
+
+ pending_updates = curr_jiffies - this_rq->last_load_update_tick;
+ this_rq->last_load_update_tick = curr_jiffies;
+
/* Update our load: */
- for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
+ this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
+ for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
unsigned long old_load, new_load;

/* scale is effectively 1 << i now, and >> i divides by scale */

old_load = this_rq->cpu_load[i];
+ old_load = decay_load_missed(old_load, pending_updates - 1, i);
new_load = this_load;
/*
* Round up the averaging division if load is increasing. This
@@ -3074,9 +3155,15 @@ static void update_cpu_load(struct rq *this_rq)
* example.
*/
if (new_load > old_load)
- new_load += scale-1;
- this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
+ new_load += scale - 1;
+
+ this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
}
+}
+
+static void update_cpu_load_active(struct rq *this_rq)
+{
+ update_cpu_load(this_rq);

calc_load_account_active(this_rq);
}
@@ -3464,7 +3551,7 @@ void scheduler_tick(void)

raw_spin_lock(&rq->lock);
update_rq_clock(rq);
- update_cpu_load(rq);
+ update_cpu_load_active(rq);
curr->sched_class->task_tick(rq, curr, 0);
raw_spin_unlock(&rq->lock);

@@ -7688,6 +7775,9 @@ void __init sched_init(void)

for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
rq->cpu_load[j] = 0;
+
+ rq->last_load_update_tick = jiffies;
+
#ifdef CONFIG_SMP
rq->sd = NULL;
rq->rd = NULL;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index eed35ed..22b8b4f 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -3420,9 +3420,12 @@ static void run_rebalance_domains(struct softirq_action *h)
if (need_resched())
break;

+ rq = cpu_rq(balance_cpu);
+ raw_spin_lock_irq(&rq->lock);
+ update_cpu_load(rq);
+ raw_spin_unlock_irq(&rq->lock);
rebalance_domains(balance_cpu, CPU_IDLE);

- rq = cpu_rq(balance_cpu);
if (time_after(this_rq->next_balance, rq->next_balance))
this_rq->next_balance = rq->next_balance;
}
sched-10-change-nohz-idle-load-balancing-logic-to-push-model.patch
@@ -0,0 +1,651 @@
From: Venkatesh Pallipadi <venki@google.com>
Date: Sat, 22 May 2010 00:09:41 +0000 (-0700)
Subject: sched: Change nohz idle load balancing logic to push model
X-Git-Tag: v2.6.36-rc1~531^2~21
X-Git-Url: http://git.kernel.org/?p=linux%2Fkernel%2Fgit%2Ftorvalds%2Flinux-2.6.git;a=commitdiff_plain;h=83cd4fe27ad8446619b2e030b171b858501de87d

sched: Change nohz idle load balancing logic to push model

In the new push model, all idle CPUs indeed go into nohz mode. There is
still the concept of an idle load balancer (performing the load balancing
on behalf of all the idle cpus in the system). A busy CPU kicks the nohz
balancer when any of the nohz CPUs need idle load balancing.
The kickee CPU does the idle load balancing on behalf of all idle CPUs
instead of the normal idle balance.

This addresses the below two problems with the current nohz ilb logic:
* the idle load balancer continued to have periodic ticks during idle and
  woke up frequently, even though it did not have any rebalancing to do on
  behalf of any of the idle CPUs.
* On x86 and CPUs that have APIC timer stoppage on idle CPUs, this
  periodic wakeup can result in a periodic additional interrupt on a CPU
  doing the timer broadcast.

Also, currently we are migrating the unpinned timers from an idle cpu to the cpu
doing idle load balancing (when all the cpus in the system are idle,
there is no idle load balancing cpu and timers get added to the same idle cpu
where the request was made. So the existing optimization works only on a semi-idle
system).

And in a semi-idle system, we no longer have periodic ticks on the idle load
balancer CPU. Using that cpu will add more delays to the timers than intended
(as that cpu's timer base may not be up to date wrt jiffies etc). This was
causing mysterious slowdowns during boot etc.

For now, in the semi-idle case, use the nearest busy cpu for migrating timers
from an idle cpu. This is good for power-savings anyway.
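As an aside (not part of the patch), here is a stand-alone user-space sketch
of the kick heuristic this patch adds in nohz_kick_needed(), using C11 atomics
in place of the kernel's atomic_t; NR_CPUS plays the role of the nr_cpu_ids
"no owner" sentinel, and the jiffies/next_balance early-outs are omitted:

#include <stdio.h>
#include <stdatomic.h>

#define NR_CPUS 4			/* stand-in for nr_cpu_ids */

static atomic_int first_pick = NR_CPUS;	/* NR_CPUS means "unclaimed" */
static atomic_int second_pick = NR_CPUS;

/* Return 1 when a busy cpu should kick the idle load balancer. */
static int kick_needed(int cpu, int nr_running)
{
	int old;

	if (!nr_running)
		return 0;

	old = NR_CPUS;
	if (atomic_compare_exchange_strong(&first_pick, &old, cpu) || old == cpu) {
		/* First busy cpu: drop any stale second_pick claim of ours
		 * and kick only once more than one task is runnable. */
		old = cpu;
		atomic_compare_exchange_strong(&second_pick, &old, NR_CPUS);
		return nr_running > 1;
	}

	old = NR_CPUS;
	if (atomic_compare_exchange_strong(&second_pick, &old, cpu) || old == cpu)
		return 1;	/* a second busy cpu kicks with any load */

	return 0;
}

int main(void)
{
	printf("cpu0, 1 task : %d\n", kick_needed(0, 1));	/* 0: no kick */
	printf("cpu0, 2 tasks: %d\n", kick_needed(0, 2));	/* 1: overloaded */
	printf("cpu1, 1 task : %d\n", kick_needed(1, 1));	/* 1: second busy cpu */
	return 0;
}

The first busy CPU only kicks once it is overloaded, so a single running task
in the system never triggers idle load balancing; any additional busy CPU
kicks immediately so sibling-level active_load_balance can still happen.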
Signed-off-by: Venkatesh Pallipadi <venki@google.com>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <1274486981.2840.46.camel@sbs-t61.sc.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---

[ backported for 2.6.35 ]

diff --git a/include/linux/sched.h b/include/linux/sched.h
index c2d4316..a3e5b1c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -271,13 +271,10 @@ extern int runqueue_is_locked(int cpu);

extern cpumask_var_t nohz_cpu_mask;
#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ)
-extern int select_nohz_load_balancer(int cpu);
-extern int get_nohz_load_balancer(void);
+extern void select_nohz_load_balancer(int stop_tick);
+extern int get_nohz_timer_target(void);
#else
-static inline int select_nohz_load_balancer(int cpu)
-{
- return 0;
-}
+static inline void select_nohz_load_balancer(int stop_tick) { }
#endif

/*
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 5c69e99..e934339 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -144,12 +144,8 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
static int hrtimer_get_target(int this_cpu, int pinned)
{
#ifdef CONFIG_NO_HZ
- if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) {
- int preferred_cpu = get_nohz_load_balancer();
-
- if (preferred_cpu >= 0)
- return preferred_cpu;
- }
+ if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu))
+ return get_nohz_timer_target();
#endif
return this_cpu;
}
diff --git a/kernel/sched.c b/kernel/sched.c
index a757f6b..132950b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -460,7 +460,7 @@ struct rq {
unsigned long last_load_update_tick;
#ifdef CONFIG_NO_HZ
u64 nohz_stamp;
- unsigned char in_nohz_recently;
+ unsigned char nohz_balance_kick;
#endif
unsigned int skip_clock_update;

@@ -1195,6 +1195,27 @@ static void resched_cpu(int cpu)

#ifdef CONFIG_NO_HZ
/*
+ * In the semi idle case, use the nearest busy cpu for migrating timers
+ * from an idle cpu. This is good for power-savings.
+ *
+ * We don't do similar optimization for completely idle system, as
+ * selecting an idle cpu will add more delays to the timers than intended
+ * (as that cpu's timer base may not be uptodate wrt jiffies etc).
+ */
+int get_nohz_timer_target(void)
+{
+ int cpu = smp_processor_id();
+ int i;
+ struct sched_domain *sd;
+
+ for_each_domain(cpu, sd) {
+ for_each_cpu(i, sched_domain_span(sd))
+ if (!idle_cpu(i))
+ return i;
+ }
+ return cpu;
+}
+/*
* When add_timer_on() enqueues a timer into the timer wheel of an
* idle CPU then this timer might expire before the next timer event
* which is scheduled to wake up that CPU. In case of a completely
@@ -7791,6 +7812,10 @@ void __init sched_init(void)
rq->idle_stamp = 0;
rq->avg_idle = 2*sysctl_sched_migration_cost;
rq_attach_root(rq, &def_root_domain);
+#ifdef CONFIG_NO_HZ
+ rq->nohz_balance_kick = 0;
+ init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i));
+#endif
#endif
init_rq_hrtick(rq);
atomic_set(&rq->nr_iowait, 0);
@@ -7835,8 +7860,11 @@ void __init sched_init(void)
zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
#ifdef CONFIG_SMP
#ifdef CONFIG_NO_HZ
- zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT);
- alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT);
+ zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
+ alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
+ atomic_set(&nohz.load_balancer, nr_cpu_ids);
+ atomic_set(&nohz.first_pick_cpu, nr_cpu_ids);
+ atomic_set(&nohz.second_pick_cpu, nr_cpu_ids);
#endif
/* May be allocated at isolcpus cmdline parse time */
if (cpu_isolated_map == NULL)
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 22b8b4f..6ee2e0a 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -3091,13 +3091,40 @@ out_unlock:
}

#ifdef CONFIG_NO_HZ
+
+static DEFINE_PER_CPU(struct call_single_data, remote_sched_softirq_cb);
+
+static void trigger_sched_softirq(void *data)
+{
+ raise_softirq_irqoff(SCHED_SOFTIRQ);
+}
+
+static inline void init_sched_softirq_csd(struct call_single_data *csd)
+{
+ csd->func = trigger_sched_softirq;
+ csd->info = NULL;
+ csd->flags = 0;
+ csd->priv = 0;
+}
+
+/*
+ * idle load balancing details
+ * - One of the idle CPUs nominates itself as idle load_balancer, while
+ * entering idle.
+ * - This idle load balancer CPU will also go into tickless mode when
+ * it is idle, just like all other idle CPUs
+ * - When one of the busy CPUs notice that there may be an idle rebalancing
+ * needed, they will kick the idle load balancer, which then does idle
+ * load balancing for all the idle CPUs.
+ */
static struct {
atomic_t load_balancer;
- cpumask_var_t cpu_mask;
- cpumask_var_t ilb_grp_nohz_mask;
-} nohz ____cacheline_aligned = {
- .load_balancer = ATOMIC_INIT(-1),
-};
+ atomic_t first_pick_cpu;
+ atomic_t second_pick_cpu;
+ cpumask_var_t idle_cpus_mask;
+ cpumask_var_t grp_idle_mask;
+ unsigned long next_balance; /* in jiffy units */
+} nohz ____cacheline_aligned;

int get_nohz_load_balancer(void)
{
@@ -3151,17 +3178,17 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
*/
static inline int is_semi_idle_group(struct sched_group *ilb_group)
{
- cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
+ cpumask_and(nohz.grp_idle_mask, nohz.idle_cpus_mask,
sched_group_cpus(ilb_group));

/*
* A sched_group is semi-idle when it has atleast one busy cpu
* and atleast one idle cpu.
*/
- if (cpumask_empty(nohz.ilb_grp_nohz_mask))
+ if (cpumask_empty(nohz.grp_idle_mask))
return 0;

- if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
+ if (cpumask_equal(nohz.grp_idle_mask, sched_group_cpus(ilb_group)))
return 0;

return 1;
@@ -3194,7 +3221,7 @@ static int find_new_ilb(int cpu)
* Optimize for the case when we have no idle CPUs or only one
* idle CPU. Don't walk the sched_domain hierarchy in such cases
*/
- if (cpumask_weight(nohz.cpu_mask) < 2)
+ if (cpumask_weight(nohz.idle_cpus_mask) < 2)
goto out_done;

for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
@@ -3202,7 +3229,7 @@ static int find_new_ilb(int cpu)

do {
if (is_semi_idle_group(ilb_group))
- return cpumask_first(nohz.ilb_grp_nohz_mask);
+ return cpumask_first(nohz.grp_idle_mask);

ilb_group = ilb_group->next;

@@ -3210,98 +3237,116 @@ static int find_new_ilb(int cpu)
}

out_done:
- return cpumask_first(nohz.cpu_mask);
+ return nr_cpu_ids;
}
#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
static inline int find_new_ilb(int call_cpu)
{
- return cpumask_first(nohz.cpu_mask);
+ return nr_cpu_ids;
}
#endif

/*
+ * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
+ * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
+ * CPU (if there is one).
+ */
+static void nohz_balancer_kick(int cpu)
+{
+ int ilb_cpu;
+
+ nohz.next_balance++;
+
+ ilb_cpu = get_nohz_load_balancer();
+
+ if (ilb_cpu >= nr_cpu_ids) {
+ ilb_cpu = cpumask_first(nohz.idle_cpus_mask);
+ if (ilb_cpu >= nr_cpu_ids)
+ return;
+ }
+
+ if (!cpu_rq(ilb_cpu)->nohz_balance_kick) {
+ struct call_single_data *cp;
+
+ cpu_rq(ilb_cpu)->nohz_balance_kick = 1;
+ cp = &per_cpu(remote_sched_softirq_cb, cpu);
+ __smp_call_function_single(ilb_cpu, cp, 0);
+ }
+ return;
+}
+
+/*
* This routine will try to nominate the ilb (idle load balancing)
* owner among the cpus whose ticks are stopped. ilb owner will do the idle
- * load balancing on behalf of all those cpus. If all the cpus in the system
- * go into this tickless mode, then there will be no ilb owner (as there is
- * no need for one) and all the cpus will sleep till the next wakeup event
- * arrives...
- *
- * For the ilb owner, tick is not stopped. And this tick will be used
- * for idle load balancing. ilb owner will still be part of
- * nohz.cpu_mask..
+ * load balancing on behalf of all those cpus.
*
- * While stopping the tick, this cpu will become the ilb owner if there
- * is no other owner. And will be the owner till that cpu becomes busy
- * or if all cpus in the system stop their ticks at which point
- * there is no need for ilb owner.
+ * When the ilb owner becomes busy, we will not have new ilb owner until some
+ * idle CPU wakes up and goes back to idle or some busy CPU tries to kick
+ * idle load balancing by kicking one of the idle CPUs.
*
- * When the ilb owner becomes busy, it nominates another owner, during the
- * next busy scheduler_tick()
+ * Ticks are stopped for the ilb owner as well, with busy CPU kicking this
+ * ilb owner CPU in future (when there is a need for idle load balancing on
+ * behalf of all idle CPUs).
*/
-int select_nohz_load_balancer(int stop_tick)
+void select_nohz_load_balancer(int stop_tick)
{
int cpu = smp_processor_id();

if (stop_tick) {
- cpu_rq(cpu)->in_nohz_recently = 1;
-
if (!cpu_active(cpu)) {
if (atomic_read(&nohz.load_balancer) != cpu)
- return 0;
+ return;

/*
* If we are going offline and still the leader,
* give up!
*/
- if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
+ if (atomic_cmpxchg(&nohz.load_balancer, cpu,
+ nr_cpu_ids) != cpu)
BUG();

- return 0;
+ return;
}

- cpumask_set_cpu(cpu, nohz.cpu_mask);
+ cpumask_set_cpu(cpu, nohz.idle_cpus_mask);

- /* time for ilb owner also to sleep */
- if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) {
- if (atomic_read(&nohz.load_balancer) == cpu)
- atomic_set(&nohz.load_balancer, -1);
- return 0;
- }
+ if (atomic_read(&nohz.first_pick_cpu) == cpu)
+ atomic_cmpxchg(&nohz.first_pick_cpu, cpu, nr_cpu_ids);
+ if (atomic_read(&nohz.second_pick_cpu) == cpu)
+ atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids);

- if (atomic_read(&nohz.load_balancer) == -1) {
- /* make me the ilb owner */
- if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
- return 1;
- } else if (atomic_read(&nohz.load_balancer) == cpu) {
+ if (atomic_read(&nohz.load_balancer) >= nr_cpu_ids) {
int new_ilb;

- if (!(sched_smt_power_savings ||
- sched_mc_power_savings))
- return 1;
+ /* make me the ilb owner */
+ if (atomic_cmpxchg(&nohz.load_balancer, nr_cpu_ids,
+ cpu) != nr_cpu_ids)
+ return;
+
/*
* Check to see if there is a more power-efficient
* ilb.
*/
new_ilb = find_new_ilb(cpu);
if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
- atomic_set(&nohz.load_balancer, -1);
+ atomic_set(&nohz.load_balancer, nr_cpu_ids);
resched_cpu(new_ilb);
- return 0;
+ return;
}
- return 1;
+ return;
}
} else {
- if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
- return 0;
+ if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
+ return;

- cpumask_clear_cpu(cpu, nohz.cpu_mask);
+ cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);

if (atomic_read(&nohz.load_balancer) == cpu)
- if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
+ if (atomic_cmpxchg(&nohz.load_balancer, cpu,
+ nr_cpu_ids) != cpu)
BUG();
}
- return 0;
+ return;
}
#endif

@@ -3383,11 +3428,101 @@ out:
rq->next_balance = next_balance;
}

+#ifdef CONFIG_NO_HZ
/*
- * run_rebalance_domains is triggered when needed from the scheduler tick.
- * In CONFIG_NO_HZ case, the idle load balance owner will do the
+ * In CONFIG_NO_HZ case, the idle balance kickee will do the
* rebalancing for all the cpus for whom scheduler ticks are stopped.
*/
+static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
+{
+ struct rq *this_rq = cpu_rq(this_cpu);
+ struct rq *rq;
+ int balance_cpu;
+
+ if (idle != CPU_IDLE || !this_rq->nohz_balance_kick)
+ return;
+
+ for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
+ if (balance_cpu == this_cpu)
+ continue;
+
+ /*
+ * If this cpu gets work to do, stop the load balancing
+ * work being done for other cpus. Next load
+ * balancing owner will pick it up.
+ */
+ if (need_resched()) {
+ this_rq->nohz_balance_kick = 0;
+ break;
+ }
+
+ raw_spin_lock_irq(&this_rq->lock);
+ update_cpu_load(this_rq);
+ raw_spin_unlock_irq(&this_rq->lock);
+
+ rebalance_domains(balance_cpu, CPU_IDLE);
+
+ rq = cpu_rq(balance_cpu);
+ if (time_after(this_rq->next_balance, rq->next_balance))
+ this_rq->next_balance = rq->next_balance;
+ }
+ nohz.next_balance = this_rq->next_balance;
+ this_rq->nohz_balance_kick = 0;
+}
+
+/*
+ * Current heuristic for kicking the idle load balancer
+ * - first_pick_cpu is the one of the busy CPUs. It will kick
+ * idle load balancer when it has more than one process active. This
+ * eliminates the need for idle load balancing altogether when we have
+ * only one running process in the system (common case).
+ * - If there are more than one busy CPU, idle load balancer may have
+ * to run for active_load_balance to happen (i.e., two busy CPUs are
+ * SMT or core siblings and can run better if they move to different
+ * physical CPUs). So, second_pick_cpu is the second of the busy CPUs
+ * which will kick idle load balancer as soon as it has any load.
+ */
+static inline int nohz_kick_needed(struct rq *rq, int cpu)
+{
+ unsigned long now = jiffies;
+ int ret;
+ int first_pick_cpu, second_pick_cpu;
+
+ if (time_before(now, nohz.next_balance))
+ return 0;
+
+ if (!rq->nr_running)
+ return 0;
+
+ first_pick_cpu = atomic_read(&nohz.first_pick_cpu);
+ second_pick_cpu = atomic_read(&nohz.second_pick_cpu);
+
+ if (first_pick_cpu < nr_cpu_ids && first_pick_cpu != cpu &&
+ second_pick_cpu < nr_cpu_ids && second_pick_cpu != cpu)
+ return 0;
+
+ ret = atomic_cmpxchg(&nohz.first_pick_cpu, nr_cpu_ids, cpu);
+ if (ret == nr_cpu_ids || ret == cpu) {
+ atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids);
+ if (rq->nr_running > 1)
+ return 1;
+ } else {
+ ret = atomic_cmpxchg(&nohz.second_pick_cpu, nr_cpu_ids, cpu);
+ if (ret == nr_cpu_ids || ret == cpu) {
+ if (rq->nr_running)
+ return 1;
+ }
+ }
+ return 0;
+}
+#else
+static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
+#endif
+
+/*
+ * run_rebalance_domains is triggered when needed from the scheduler tick.
+ * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
+ */
static void run_rebalance_domains(struct softirq_action *h)
{
int this_cpu = smp_processor_id();
@@ -3397,40 +3532,12 @@ static void run_rebalance_domains(struct softirq_action *h)

rebalance_domains(this_cpu, idle);

-#ifdef CONFIG_NO_HZ
/*
- * If this cpu is the owner for idle load balancing, then do the
+ * If this cpu has a pending nohz_balance_kick, then do the
* balancing on behalf of the other idle cpus whose ticks are
* stopped.
*/
- if (this_rq->idle_at_tick &&
- atomic_read(&nohz.load_balancer) == this_cpu) {
- struct rq *rq;
- int balance_cpu;
-
- for_each_cpu(balance_cpu, nohz.cpu_mask) {
- if (balance_cpu == this_cpu)
- continue;
-
- /*
- * If this cpu gets work to do, stop the load balancing
- * work being done for other cpus. Next load
- * balancing owner will pick it up.
- */
- if (need_resched())
- break;
-
- rq = cpu_rq(balance_cpu);
- raw_spin_lock_irq(&rq->lock);
- update_cpu_load(rq);
- raw_spin_unlock_irq(&rq->lock);
- rebalance_domains(balance_cpu, CPU_IDLE);
-
- if (time_after(this_rq->next_balance, rq->next_balance))
- this_rq->next_balance = rq->next_balance;
- }
- }
-#endif
+ nohz_idle_balance(this_cpu, idle);
}

static inline int on_null_domain(int cpu)
@@ -3440,57 +3547,17 @@ static inline int on_null_domain(int cpu)

/*
* Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
- *
- * In case of CONFIG_NO_HZ, this is the place where we nominate a new
- * idle load balancing owner or decide to stop the periodic load balancing,
- * if the whole system is idle.
*/
static inline void trigger_load_balance(struct rq *rq, int cpu)
{
-#ifdef CONFIG_NO_HZ
- /*
- * If we were in the nohz mode recently and busy at the current
- * scheduler tick, then check if we need to nominate new idle
- * load balancer.
- */
- if (rq->in_nohz_recently && !rq->idle_at_tick) {
- rq->in_nohz_recently = 0;
-
- if (atomic_read(&nohz.load_balancer) == cpu) {
- cpumask_clear_cpu(cpu, nohz.cpu_mask);
- atomic_set(&nohz.load_balancer, -1);
- }
-
- if (atomic_read(&nohz.load_balancer) == -1) {
- int ilb = find_new_ilb(cpu);
-
- if (ilb < nr_cpu_ids)
- resched_cpu(ilb);
- }
- }
-
- /*
- * If this cpu is idle and doing idle load balancing for all the
- * cpus with ticks stopped, is it time for that to stop?
- */
- if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
- cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
- resched_cpu(cpu);
- return;
- }
-
- /*
- * If this cpu is idle and the idle load balancing is done by
- * someone else, then no need raise the SCHED_SOFTIRQ
- */
- if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
- cpumask_test_cpu(cpu, nohz.cpu_mask))
- return;
-#endif
/* Don't need to rebalance while attached to NULL domain */
if (time_after_eq(jiffies, rq->next_balance) &&
likely(!on_null_domain(cpu)))
raise_softirq(SCHED_SOFTIRQ);
+#ifdef CONFIG_NO_HZ
+ else if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
+ nohz_balancer_kick(cpu);
+#endif
}

static void rq_online_fair(struct rq *rq)
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 1d7b9bc..5f171f0 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -408,13 +408,7 @@ void tick_nohz_stop_sched_tick(int inidle)
* the scheduler tick in nohz_restart_sched_tick.
*/
if (!ts->tick_stopped) {
- if (select_nohz_load_balancer(1)) {
- /*
- * sched tick not stopped!
- */
- cpumask_clear_cpu(cpu, nohz_cpu_mask);
- goto out;
- }
+ select_nohz_load_balancer(1);

ts->idle_tick = hrtimer_get_expires(&ts->sched_timer);
ts->tick_stopped = 1;
diff --git a/kernel/timer.c b/kernel/timer.c
index ee305c8..48d6aec 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -679,12 +679,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
cpu = smp_processor_id();

#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
- if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) {
- int preferred_cpu = get_nohz_load_balancer();
-
- if (preferred_cpu >= 0)
- cpu = preferred_cpu;
- }
+ if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu))
+ cpu = get_nohz_timer_target();
#endif
new_base = per_cpu(tvec_bases, cpu);
sched-15-update-rq-clock-for-nohz-balanced-cpus.patch
@@ -0,0 +1,28 @@
From: Suresh Siddha <suresh.b.siddha@intel.com>
Date: Fri, 9 Jul 2010 13:19:54 +0000 (+0200)
Subject: sched: Update rq->clock for nohz balanced cpus
X-Git-Tag: v2.6.36-rc1~531^2~5
X-Git-Url: http://git.kernel.org/?p=linux%2Fkernel%2Fgit%2Ftorvalds%2Flinux-2.6.git;a=commitdiff_plain;h=5343bdb8fd076f16edc9d113a9e35e2a1d1f4966

sched: Update rq->clock for nohz balanced cpus

Suresh spotted that we don't update the rq->clock in the nohz
load-balancer path.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1278626014.2834.74.camel@sbs-t61.sc.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index b4da534..e44a591 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -3596,6 +3596,7 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
}

raw_spin_lock_irq(&this_rq->lock);
+ update_rq_clock(this_rq);
update_cpu_load(this_rq);
raw_spin_unlock_irq(&this_rq->lock);
sched-20-fix-rq-clock-synchronization-when-migrating-tasks.patch
@@ -0,0 +1,38 @@
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 19 Aug 2010 11:31:43 +0000 (+0200)
Subject: sched: Fix rq->clock synchronization when migrating tasks
X-Git-Tag: v2.6.36-rc3~25^2~1
X-Git-Url: http://git.kernel.org/?p=linux%2Fkernel%2Fgit%2Ftorvalds%2Flinux-2.6.git;a=commitdiff_plain;h=861d034ee814917a83bd5de4b26e3b8336ddeeb8

sched: Fix rq->clock synchronization when migrating tasks

sched_fork() -- we do task placement in ->task_fork_fair(), so ensure we
update_rq_clock() and work with current time. We leave the vruntime
in relative state, so the time delay until wake_up_new_task() doesn't
matter.

wake_up_new_task() -- Since task_fork_fair() left p->vruntime in
relative state we can safely migrate; the activate_task() on the
remote rq will call update_rq_clock() and cause the clock to be
synced (enough).

Tested-by: Jack Daniel <wanders.thirst@gmail.com>
Tested-by: Philby John <pjohn@mvista.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1281002322.1923.1708.camel@laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 806d1b2..ab661eb 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -3752,6 +3752,8 @@ static void task_fork_fair(struct task_struct *p)

raw_spin_lock_irqsave(&rq->lock, flags);

+ update_rq_clock(rq);
+
if (unlikely(task_cpu(p) != this_cpu))
__set_task_cpu(p, this_cpu);
sched-25-move-sched_avg_update-to-update_cpu_load.patch
@@ -0,0 +1,58 @@
From: Suresh Siddha <suresh.b.siddha@intel.com>
Date: Mon, 23 Aug 2010 20:42:51 +0000 (-0700)
Subject: sched: Move sched_avg_update() to update_cpu_load()
X-Git-Tag: v2.6.36-rc4~8^2~1
X-Git-Url: http://git.kernel.org/?p=linux%2Fkernel%2Fgit%2Ftorvalds%2Flinux-2.6.git;a=commitdiff_plain;h=da2b71edd8a7db44fe1746261410a981f3e03632

sched: Move sched_avg_update() to update_cpu_load()

Currently sched_avg_update() (which updates the rt_avg stats in the rq)
is getting called from scale_rt_power() (in the load balance context),
which doesn't take rq->lock.

Fix it by moving the sched_avg_update() to the more appropriate
update_cpu_load(), where the CFS load gets updated as well.
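As an aside (not part of the patch), a user-space sketch of the class of
problem being avoided: an unlocked read-modify-write of a shared counter
(a toy stand-in for rq->rt_avg) loses updates under concurrency, while doing
it under a lock (a pthread mutex standing in for rq->lock) does not.

#include <stdio.h>
#include <pthread.h>

static unsigned long rt_avg;	/* toy stand-in for rq->rt_avg */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;	/* "rq->lock" */
static int locked;

static void *updater(void *arg)
{
	for (int i = 0; i < 1000000; i++) {
		if (locked) pthread_mutex_lock(&lock);
		rt_avg++;	/* non-atomic read-modify-write */
		if (locked) pthread_mutex_unlock(&lock);
	}
	return NULL;
}

static unsigned long run(int use_lock)
{
	pthread_t a, b;

	rt_avg = 0;
	locked = use_lock;
	pthread_create(&a, NULL, updater, NULL);
	pthread_create(&b, NULL, updater, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	return rt_avg;
}

int main(void)
{
	printf("without lock: %lu (updates lost)\n", run(0));
	printf("with lock:    %lu (expected 2000000)\n", run(1));
	return 0;
}

Build with cc -pthread; the unlocked run races deliberately to show the lost
updates that moving the call under rq->lock (held in update_cpu_load()) rules out.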
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1282596171.2694.3.camel@sbsiddha-MOBL3>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---

diff --git a/kernel/sched.c b/kernel/sched.c
index 09b574e..ed09d4f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1294,6 +1294,10 @@ static void resched_task(struct task_struct *p)
static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
{
}
+
+static void sched_avg_update(struct rq *rq)
+{
+}
#endif /* CONFIG_SMP */

#if BITS_PER_LONG == 32
@@ -3182,6 +3186,8 @@ static void update_cpu_load(struct rq *this_rq)

this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
}
+
+ sched_avg_update(this_rq);
}

static void update_cpu_load_active(struct rq *this_rq)
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index ab661eb..f53ec75 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -2268,8 +2268,6 @@ unsigned long scale_rt_power(int cpu)
struct rq *rq = cpu_rq(cpu);
u64 total, available;

- sched_avg_update(rq);
-
total = sched_avg_period() + (rq->clock - rq->age_stamp);
available = total - rq->rt_avg;