kernel-ark/kernel/sched_cpupri.c
Steven Rostedt d473750b40 sched/cpupri: Fix memory barriers for vec updates to always be in order
[ This patch actually compiles. Thanks to Mike Galbraith for pointing
that out. I compiled and booted this patch with no issues. ]

Re-examining the cpupri patch, I see there's a possible race because the
update of the two priorities vec->counts are not protected by a memory
barrier.

When a RT runqueue is overloaded and wants to push an RT task to another
runqueue, it scans the RT priority vectors in a loop from lowest
priority to highest.

When we queue or dequeue an RT task that changes a runqueue's highest
priority task, we update the vectors to show that a runqueue is rated at
a different priority. To do this, we first set the new priority mask,
and increment the vec->count, and then set the old priority mask by
decrementing the vec->count.

If we are lowering the runqueue's RT priority rating, it will trigger a
RT pull, and we do not care if we miss pushing to this runqueue or not.

But if we raise the priority, but the priority is still lower than an RT
task that is looking to be pushed, we must make sure that this runqueue
is still seen by the push algorithm (the loop).

Because the loop reads from lowest to highest, and the new priority is
set before the old one is cleared, we will either see the new or old
priority set and the vector will be checked.

But! Since there's no memory barrier between the updates of the two, the
old count may be decremented first before the new count is incremented.
This means the loop may see the old count of zero and skip it, and also
the new count of zero before it was updated. A possible runqueue that
the RT task could move to could be missed.

A conditional memory barrier is placed between the vec->count updates
and is only called when both updates are done.

The smp_wmb() has also been changed to smp_mb__before_atomic_inc/dec(),
as they are not needed by archs that already synchronize
atomic_inc/dec().

The smp_rmb() has been moved to be called at every iteration of the loop
so that the race between seeing the two updates is visible by each
iteration of the loop, as an arch is free to optimize the reading of
memory of the counters in the loop.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Nick Piggin <npiggin@kernel.dk>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/1312547269.18583.194.camel@gandalf.stny.rr.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2011-08-14 12:01:07 +02:00

243 lines
6.8 KiB
C

/*
* kernel/sched_cpupri.c
*
* CPU priority management
*
* Copyright (C) 2007-2008 Novell
*
* Author: Gregory Haskins <ghaskins@novell.com>
*
* This code tracks the priority of each CPU so that global migration
* decisions are easy to calculate. Each CPU can be in a state as follows:
*
* (INVALID), IDLE, NORMAL, RT1, ... RT99
*
* going from the lowest priority to the highest. CPUs in the INVALID state
* are not eligible for routing. The system maintains this state with
* a 2 dimensional bitmap (the first for priority class, the second for cpus
* in that class). Therefore a typical application without affinity
* restrictions can find a suitable CPU with O(1) complexity (e.g. two bit
* searches). For tasks with affinity restrictions, the algorithm has a
* worst case complexity of O(min(102, nr_domcpus)), though the scenario that
* yields the worst case search is fairly contrived.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; version 2
* of the License.
*/
#include <linux/gfp.h>
#include "sched_cpupri.h"
/* Convert between a 140 based task->prio, and our 102 based cpupri */
static int convert_prio(int prio)
{
int cpupri;
if (prio == CPUPRI_INVALID)
cpupri = CPUPRI_INVALID;
else if (prio == MAX_PRIO)
cpupri = CPUPRI_IDLE;
else if (prio >= MAX_RT_PRIO)
cpupri = CPUPRI_NORMAL;
else
cpupri = MAX_RT_PRIO - prio + 1;
return cpupri;
}
/**
* cpupri_find - find the best (lowest-pri) CPU in the system
* @cp: The cpupri context
* @p: The task
* @lowest_mask: A mask to fill in with selected CPUs (or NULL)
*
* Note: This function returns the recommended CPUs as calculated during the
* current invocation. By the time the call returns, the CPUs may have in
* fact changed priorities any number of times. While not ideal, it is not
* an issue of correctness since the normal rebalancer logic will correct
* any discrepancies created by racing against the uncertainty of the current
* priority configuration.
*
* Returns: (int)bool - CPUs were found
*/
int cpupri_find(struct cpupri *cp, struct task_struct *p,
struct cpumask *lowest_mask)
{
int idx = 0;
int task_pri = convert_prio(p->prio);
if (task_pri >= MAX_RT_PRIO)
return 0;
for (idx = 0; idx < task_pri; idx++) {
struct cpupri_vec *vec = &cp->pri_to_cpu[idx];
int skip = 0;
if (!atomic_read(&(vec)->count))
skip = 1;
/*
* When looking at the vector, we need to read the counter,
* do a memory barrier, then read the mask.
*
* Note: This is still all racey, but we can deal with it.
* Ideally, we only want to look at masks that are set.
*
* If a mask is not set, then the only thing wrong is that we
* did a little more work than necessary.
*
* If we read a zero count but the mask is set, because of the
* memory barriers, that can only happen when the highest prio
* task for a run queue has left the run queue, in which case,
* it will be followed by a pull. If the task we are processing
* fails to find a proper place to go, that pull request will
* pull this task if the run queue is running at a lower
* priority.
*/
smp_rmb();
/* Need to do the rmb for every iteration */
if (skip)
continue;
if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
continue;
if (lowest_mask) {
cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
/*
* We have to ensure that we have at least one bit
* still set in the array, since the map could have
* been concurrently emptied between the first and
* second reads of vec->mask. If we hit this
* condition, simply act as though we never hit this
* priority level and continue on.
*/
if (cpumask_any(lowest_mask) >= nr_cpu_ids)
continue;
}
return 1;
}
return 0;
}
/**
* cpupri_set - update the cpu priority setting
* @cp: The cpupri context
* @cpu: The target cpu
* @pri: The priority (INVALID-RT99) to assign to this CPU
*
* Note: Assumes cpu_rq(cpu)->lock is locked
*
* Returns: (void)
*/
void cpupri_set(struct cpupri *cp, int cpu, int newpri)
{
int *currpri = &cp->cpu_to_pri[cpu];
int oldpri = *currpri;
int do_mb = 0;
newpri = convert_prio(newpri);
BUG_ON(newpri >= CPUPRI_NR_PRIORITIES);
if (newpri == oldpri)
return;
/*
* If the cpu was currently mapped to a different value, we
* need to map it to the new value then remove the old value.
* Note, we must add the new value first, otherwise we risk the
* cpu being cleared from pri_active, and this cpu could be
* missed for a push or pull.
*/
if (likely(newpri != CPUPRI_INVALID)) {
struct cpupri_vec *vec = &cp->pri_to_cpu[newpri];
cpumask_set_cpu(cpu, vec->mask);
/*
* When adding a new vector, we update the mask first,
* do a write memory barrier, and then update the count, to
* make sure the vector is visible when count is set.
*/
smp_mb__before_atomic_inc();
atomic_inc(&(vec)->count);
do_mb = 1;
}
if (likely(oldpri != CPUPRI_INVALID)) {
struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri];
/*
* Because the order of modification of the vec->count
* is important, we must make sure that the update
* of the new prio is seen before we decrement the
* old prio. This makes sure that the loop sees
* one or the other when we raise the priority of
* the run queue. We don't care about when we lower the
* priority, as that will trigger an rt pull anyway.
*
* We only need to do a memory barrier if we updated
* the new priority vec.
*/
if (do_mb)
smp_mb__after_atomic_inc();
/*
* When removing from the vector, we decrement the counter first
* do a memory barrier and then clear the mask.
*/
atomic_dec(&(vec)->count);
smp_mb__after_atomic_inc();
cpumask_clear_cpu(cpu, vec->mask);
}
*currpri = newpri;
}
/**
* cpupri_init - initialize the cpupri structure
* @cp: The cpupri context
* @bootmem: true if allocations need to use bootmem
*
* Returns: -ENOMEM if memory fails.
*/
int cpupri_init(struct cpupri *cp)
{
int i;
memset(cp, 0, sizeof(*cp));
for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
struct cpupri_vec *vec = &cp->pri_to_cpu[i];
atomic_set(&vec->count, 0);
if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL))
goto cleanup;
}
for_each_possible_cpu(i)
cp->cpu_to_pri[i] = CPUPRI_INVALID;
return 0;
cleanup:
for (i--; i >= 0; i--)
free_cpumask_var(cp->pri_to_cpu[i].mask);
return -ENOMEM;
}
/**
* cpupri_cleanup - clean up the cpupri structure
* @cp: The cpupri context
*/
void cpupri_cleanup(struct cpupri *cp)
{
int i;
for (i = 0; i < CPUPRI_NR_PRIORITIES; i++)
free_cpumask_var(cp->pri_to_cpu[i].mask);
}