kernel-ark/arch/x86/kernel/process.c

714 lines
17 KiB
C
Raw Normal View History

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/prctl.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/module.h>
#include <linux/pm.h>
#include <linux/clockchips.h>
#include <linux/random.h>
#include <linux/user-return-notifier.h>
#include <linux/dmi.h>
#include <linux/utsname.h>
#include <linux/stackprotector.h>
#include <linux/tick.h>
#include <linux/cpuidle.h>
#include <trace/events/power.h>
hw-breakpoints: Rewrite the hw-breakpoints layer on top of perf events This patch rebase the implementation of the breakpoints API on top of perf events instances. Each breakpoints are now perf events that handle the register scheduling, thread/cpu attachment, etc.. The new layering is now made as follows: ptrace kgdb ftrace perf syscall \ | / / \ | / / / Core breakpoint API / / | / | / Breakpoints perf events | | Breakpoints PMU ---- Debug Register constraints handling (Part of core breakpoint API) | | Hardware debug registers Reasons of this rewrite: - Use the centralized/optimized pmu registers scheduling, implying an easier arch integration - More powerful register handling: perf attributes (pinned/flexible events, exclusive/non-exclusive, tunable period, etc...) Impact: - New perf ABI: the hardware breakpoints counters - Ptrace breakpoints setting remains tricky and still needs some per thread breakpoints references. Todo (in the order): - Support breakpoints perf counter events for perf tools (ie: implement perf_bpcounter_event()) - Support from perf tools Changes in v2: - Follow the perf "event " rename - The ptrace regression have been fixed (ptrace breakpoint perf events weren't released when a task ended) - Drop the struct hw_breakpoint and store generic fields in perf_event_attr. - Separate core and arch specific headers, drop asm-generic/hw_breakpoint.h and create linux/hw_breakpoint.h - Use new generic len/type for breakpoint - Handle off case: when breakpoints api is not supported by an arch Changes in v3: - Fix broken CONFIG_KVM, we need to propagate the breakpoint api changes to kvm when we exit the guest and restore the bp registers to the host. Changes in v4: - Drop the hw_breakpoint_restore() stub as it is only used by KVM - EXPORT_SYMBOL_GPL hw_breakpoint_restore() as KVM can be built as a module - Restore the breakpoints unconditionally on kvm guest exit: TIF_DEBUG_THREAD doesn't anymore cover every cases of running breakpoints and vcpu->arch.switch_db_regs might not always be set when the guest used debug registers. (Waiting for a reliable optimization) Changes in v5: - Split-up the asm-generic/hw-breakpoint.h moving to linux/hw_breakpoint.h into a separate patch - Optimize the breakpoints restoring while switching from kvm guest to host. We only want to restore the state if we have active breakpoints to the host, otherwise we don't care about messed-up address registers. - Add asm/hw_breakpoint.h to Kbuild - Fix bad breakpoint type in trace_selftest.c Changes in v6: - Fix wrong header inclusion in trace.h (triggered a build error with CONFIG_FTRACE_SELFTEST Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com> Cc: Prasad <prasad@linux.vnet.ibm.com> Cc: Alan Stern <stern@rowland.harvard.edu> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Ingo Molnar <mingo@elte.hu> Cc: Jan Kiszka <jan.kiszka@web.de> Cc: Jiri Slaby <jirislaby@gmail.com> Cc: Li Zefan <lizf@cn.fujitsu.com> Cc: Avi Kivity <avi@redhat.com> Cc: Paul Mackerras <paulus@samba.org> Cc: Mike Galbraith <efault@gmx.de> Cc: Masami Hiramatsu <mhiramat@redhat.com> Cc: Paul Mundt <lethal@linux-sh.org>
2009-09-09 17:22:48 +00:00
#include <linux/hw_breakpoint.h>
#include <asm/cpu.h>
#include <asm/apic.h>
#include <asm/syscalls.h>
#include <asm/idle.h>
#include <asm/uaccess.h>
#include <asm/i387.h>
#include <asm/fpu-internal.h>
#include <asm/debugreg.h>
#include <asm/nmi.h>
/*
* per-CPU TSS segments. Threads are completely 'soft' on Linux,
* no more per-task TSS's. The TSS size is kept cacheline-aligned
* so they are allowed to end up in the .data..cacheline_aligned
* section. Since TSS's are completely CPU-local, we want them
* on exact cacheline boundaries, to eliminate cacheline ping-pong.
*/
DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS;
#ifdef CONFIG_X86_64
static DEFINE_PER_CPU(unsigned char, is_idle);
static ATOMIC_NOTIFIER_HEAD(idle_notifier);
void idle_notifier_register(struct notifier_block *n)
{
atomic_notifier_chain_register(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_register);
void idle_notifier_unregister(struct notifier_block *n)
{
atomic_notifier_chain_unregister(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_unregister);
#endif
struct kmem_cache *task_xstate_cachep;
EXPORT_SYMBOL_GPL(task_xstate_cachep);
fork: move the real prepare_to_copy() users to arch_dup_task_struct() Historical prepare_to_copy() is mostly a no-op, duplicated for majority of the architectures and the rest following the x86 model of flushing the extended register state like fpu there. Remove it and use the arch_dup_task_struct() instead. Suggested-by: Oleg Nesterov <oleg@redhat.com> Suggested-by: Linus Torvalds <torvalds@linux-foundation.org> Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com> Link: http://lkml.kernel.org/r/1336692811-30576-1-git-send-email-suresh.b.siddha@intel.com Acked-by: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: David Howells <dhowells@redhat.com> Cc: Koichi Yasutake <yasutake.koichi@jp.panasonic.com> Cc: Paul Mackerras <paulus@samba.org> Cc: Paul Mundt <lethal@linux-sh.org> Cc: Chris Zankel <chris@zankel.net> Cc: Richard Henderson <rth@twiddle.net> Cc: Russell King <linux@arm.linux.org.uk> Cc: Haavard Skinnemoen <hskinnemoen@gmail.com> Cc: Mike Frysinger <vapier@gentoo.org> Cc: Mark Salter <msalter@redhat.com> Cc: Aurelien Jacquiot <a-jacquiot@ti.com> Cc: Mikael Starvik <starvik@axis.com> Cc: Yoshinori Sato <ysato@users.sourceforge.jp> Cc: Richard Kuo <rkuo@codeaurora.org> Cc: Tony Luck <tony.luck@intel.com> Cc: Michal Simek <monstr@monstr.eu> Cc: Ralf Baechle <ralf@linux-mips.org> Cc: Jonas Bonn <jonas@southpole.se> Cc: James E.J. Bottomley <jejb@parisc-linux.org> Cc: Helge Deller <deller@gmx.de> Cc: Martin Schwidefsky <schwidefsky@de.ibm.com> Cc: Heiko Carstens <heiko.carstens@de.ibm.com> Cc: Chen Liqin <liqin.chen@sunplusct.com> Cc: Lennox Wu <lennox.wu@gmail.com> Cc: David S. Miller <davem@davemloft.net> Cc: Chris Metcalf <cmetcalf@tilera.com> Cc: Jeff Dike <jdike@addtoit.com> Cc: Richard Weinberger <richard@nod.at> Cc: Guan Xuetao <gxt@mprc.pku.edu.cn> Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
2012-05-16 22:03:51 +00:00
/*
* this gets called so that we can store lazy state into memory and copy the
* current task into the new thread.
*/
int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
{
int ret;
fork: move the real prepare_to_copy() users to arch_dup_task_struct() Historical prepare_to_copy() is mostly a no-op, duplicated for majority of the architectures and the rest following the x86 model of flushing the extended register state like fpu there. Remove it and use the arch_dup_task_struct() instead. Suggested-by: Oleg Nesterov <oleg@redhat.com> Suggested-by: Linus Torvalds <torvalds@linux-foundation.org> Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com> Link: http://lkml.kernel.org/r/1336692811-30576-1-git-send-email-suresh.b.siddha@intel.com Acked-by: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: David Howells <dhowells@redhat.com> Cc: Koichi Yasutake <yasutake.koichi@jp.panasonic.com> Cc: Paul Mackerras <paulus@samba.org> Cc: Paul Mundt <lethal@linux-sh.org> Cc: Chris Zankel <chris@zankel.net> Cc: Richard Henderson <rth@twiddle.net> Cc: Russell King <linux@arm.linux.org.uk> Cc: Haavard Skinnemoen <hskinnemoen@gmail.com> Cc: Mike Frysinger <vapier@gentoo.org> Cc: Mark Salter <msalter@redhat.com> Cc: Aurelien Jacquiot <a-jacquiot@ti.com> Cc: Mikael Starvik <starvik@axis.com> Cc: Yoshinori Sato <ysato@users.sourceforge.jp> Cc: Richard Kuo <rkuo@codeaurora.org> Cc: Tony Luck <tony.luck@intel.com> Cc: Michal Simek <monstr@monstr.eu> Cc: Ralf Baechle <ralf@linux-mips.org> Cc: Jonas Bonn <jonas@southpole.se> Cc: James E.J. Bottomley <jejb@parisc-linux.org> Cc: Helge Deller <deller@gmx.de> Cc: Martin Schwidefsky <schwidefsky@de.ibm.com> Cc: Heiko Carstens <heiko.carstens@de.ibm.com> Cc: Chen Liqin <liqin.chen@sunplusct.com> Cc: Lennox Wu <lennox.wu@gmail.com> Cc: David S. Miller <davem@davemloft.net> Cc: Chris Metcalf <cmetcalf@tilera.com> Cc: Jeff Dike <jdike@addtoit.com> Cc: Richard Weinberger <richard@nod.at> Cc: Guan Xuetao <gxt@mprc.pku.edu.cn> Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
2012-05-16 22:03:51 +00:00
unlazy_fpu(src);
*dst = *src;
if (fpu_allocated(&src->thread.fpu)) {
memset(&dst->thread.fpu, 0, sizeof(dst->thread.fpu));
ret = fpu_alloc(&dst->thread.fpu);
if (ret)
return ret;
fpu_copy(&dst->thread.fpu, &src->thread.fpu);
}
return 0;
}
void free_thread_xstate(struct task_struct *tsk)
{
fpu_free(&tsk->thread.fpu);
}
void arch_release_task_struct(struct task_struct *tsk)
{
free_thread_xstate(tsk);
}
void arch_task_cache_init(void)
{
task_xstate_cachep =
kmem_cache_create("task_xstate", xstate_size,
__alignof__(union thread_xstate),
2008-05-31 13:56:17 +00:00
SLAB_PANIC | SLAB_NOTRACK, NULL);
}
static inline void drop_fpu(struct task_struct *tsk)
{
/*
* Forget coprocessor state..
*/
tsk->fpu_counter = 0;
clear_fpu(tsk);
clear_used_math();
}
/*
* Free current thread data structures etc..
*/
void exit_thread(void)
{
struct task_struct *me = current;
struct thread_struct *t = &me->thread;
unsigned long *bp = t->io_bitmap_ptr;
if (bp) {
struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
t->io_bitmap_ptr = NULL;
clear_thread_flag(TIF_IO_BITMAP);
/*
* Careful, clear this in the TSS too:
*/
memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
t->io_bitmap_max = 0;
put_cpu();
kfree(bp);
}
drop_fpu(me);
}
void show_regs_common(void)
{
const char *vendor, *product, *board;
vendor = dmi_get_system_info(DMI_SYS_VENDOR);
if (!vendor)
vendor = "";
product = dmi_get_system_info(DMI_PRODUCT_NAME);
if (!product)
product = "";
/* Board Name is optional */
board = dmi_get_system_info(DMI_BOARD_NAME);
printk(KERN_DEFAULT "Pid: %d, comm: %.20s %s %s %.*s %s %s%s%s\n",
current->pid, current->comm, print_tainted(),
init_utsname()->release,
(int)strcspn(init_utsname()->version, " "),
init_utsname()->version,
vendor, product,
board ? "/" : "",
board ? board : "");
}
void flush_thread(void)
{
struct task_struct *tsk = current;
hw-breakpoints: Rewrite the hw-breakpoints layer on top of perf events This patch rebase the implementation of the breakpoints API on top of perf events instances. Each breakpoints are now perf events that handle the register scheduling, thread/cpu attachment, etc.. The new layering is now made as follows: ptrace kgdb ftrace perf syscall \ | / / \ | / / / Core breakpoint API / / | / | / Breakpoints perf events | | Breakpoints PMU ---- Debug Register constraints handling (Part of core breakpoint API) | | Hardware debug registers Reasons of this rewrite: - Use the centralized/optimized pmu registers scheduling, implying an easier arch integration - More powerful register handling: perf attributes (pinned/flexible events, exclusive/non-exclusive, tunable period, etc...) Impact: - New perf ABI: the hardware breakpoints counters - Ptrace breakpoints setting remains tricky and still needs some per thread breakpoints references. Todo (in the order): - Support breakpoints perf counter events for perf tools (ie: implement perf_bpcounter_event()) - Support from perf tools Changes in v2: - Follow the perf "event " rename - The ptrace regression have been fixed (ptrace breakpoint perf events weren't released when a task ended) - Drop the struct hw_breakpoint and store generic fields in perf_event_attr. - Separate core and arch specific headers, drop asm-generic/hw_breakpoint.h and create linux/hw_breakpoint.h - Use new generic len/type for breakpoint - Handle off case: when breakpoints api is not supported by an arch Changes in v3: - Fix broken CONFIG_KVM, we need to propagate the breakpoint api changes to kvm when we exit the guest and restore the bp registers to the host. Changes in v4: - Drop the hw_breakpoint_restore() stub as it is only used by KVM - EXPORT_SYMBOL_GPL hw_breakpoint_restore() as KVM can be built as a module - Restore the breakpoints unconditionally on kvm guest exit: TIF_DEBUG_THREAD doesn't anymore cover every cases of running breakpoints and vcpu->arch.switch_db_regs might not always be set when the guest used debug registers. (Waiting for a reliable optimization) Changes in v5: - Split-up the asm-generic/hw-breakpoint.h moving to linux/hw_breakpoint.h into a separate patch - Optimize the breakpoints restoring while switching from kvm guest to host. We only want to restore the state if we have active breakpoints to the host, otherwise we don't care about messed-up address registers. - Add asm/hw_breakpoint.h to Kbuild - Fix bad breakpoint type in trace_selftest.c Changes in v6: - Fix wrong header inclusion in trace.h (triggered a build error with CONFIG_FTRACE_SELFTEST Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com> Cc: Prasad <prasad@linux.vnet.ibm.com> Cc: Alan Stern <stern@rowland.harvard.edu> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Ingo Molnar <mingo@elte.hu> Cc: Jan Kiszka <jan.kiszka@web.de> Cc: Jiri Slaby <jirislaby@gmail.com> Cc: Li Zefan <lizf@cn.fujitsu.com> Cc: Avi Kivity <avi@redhat.com> Cc: Paul Mackerras <paulus@samba.org> Cc: Mike Galbraith <efault@gmx.de> Cc: Masami Hiramatsu <mhiramat@redhat.com> Cc: Paul Mundt <lethal@linux-sh.org>
2009-09-09 17:22:48 +00:00
flush_ptrace_hw_breakpoint(tsk);
memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
drop_fpu(tsk);
}
static void hard_disable_TSC(void)
{
write_cr4(read_cr4() | X86_CR4_TSD);
}
void disable_TSC(void)
{
preempt_disable();
if (!test_and_set_thread_flag(TIF_NOTSC))
/*
* Must flip the CPU state synchronously with
* TIF_NOTSC in the current running context.
*/
hard_disable_TSC();
preempt_enable();
}
static void hard_enable_TSC(void)
{
write_cr4(read_cr4() & ~X86_CR4_TSD);
}
static void enable_TSC(void)
{
preempt_disable();
if (test_and_clear_thread_flag(TIF_NOTSC))
/*
* Must flip the CPU state synchronously with
* TIF_NOTSC in the current running context.
*/
hard_enable_TSC();
preempt_enable();
}
int get_tsc_mode(unsigned long adr)
{
unsigned int val;
if (test_thread_flag(TIF_NOTSC))
val = PR_TSC_SIGSEGV;
else
val = PR_TSC_ENABLE;
return put_user(val, (unsigned int __user *)adr);
}
int set_tsc_mode(unsigned int val)
{
if (val == PR_TSC_SIGSEGV)
disable_TSC();
else if (val == PR_TSC_ENABLE)
enable_TSC();
else
return -EINVAL;
return 0;
}
void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
struct tss_struct *tss)
{
struct thread_struct *prev, *next;
prev = &prev_p->thread;
next = &next_p->thread;
if (test_tsk_thread_flag(prev_p, TIF_BLOCKSTEP) ^
test_tsk_thread_flag(next_p, TIF_BLOCKSTEP)) {
unsigned long debugctl = get_debugctlmsr();
debugctl &= ~DEBUGCTLMSR_BTF;
if (test_tsk_thread_flag(next_p, TIF_BLOCKSTEP))
debugctl |= DEBUGCTLMSR_BTF;
update_debugctlmsr(debugctl);
}
if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
test_tsk_thread_flag(next_p, TIF_NOTSC)) {
/* prev and next are different */
if (test_tsk_thread_flag(next_p, TIF_NOTSC))
hard_disable_TSC();
else
hard_enable_TSC();
}
if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
/*
* Copy the relevant range of the IO bitmap.
* Normally this is 128 bytes or less:
*/
memcpy(tss->io_bitmap, next->io_bitmap_ptr,
max(prev->io_bitmap_max, next->io_bitmap_max));
} else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
/*
* Clear any possible leftover bits:
*/
memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
}
propagate_user_return_notify(prev_p, next_p);
}
int sys_fork(struct pt_regs *regs)
{
return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL);
}
/*
* This is trivial, and on the face of it looks like it
* could equally well be done in user mode.
*
* Not so, for quite unobvious reasons - register pressure.
* In user mode vfork() cannot have a stack frame, and if
* done by calling the "clone()" system call directly, you
* do not have enough call-clobbered registers to hold all
* the information you need.
*/
int sys_vfork(struct pt_regs *regs)
{
return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0,
NULL, NULL);
}
long
sys_clone(unsigned long clone_flags, unsigned long newsp,
void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
{
if (!newsp)
newsp = regs->sp;
return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
}
/*
* sys_execve() executes a new program.
*/
long sys_execve(const char __user *name,
const char __user *const __user *argv,
const char __user *const __user *envp, struct pt_regs *regs)
{
long error;
char *filename;
filename = getname(name);
error = PTR_ERR(filename);
if (IS_ERR(filename))
return error;
error = do_execve(filename, argv, envp, regs);
putname(filename);
return error;
}
/*
* Idle related variables and functions
*/
unsigned long boot_option_idle_override = IDLE_NO_OVERRIDE;
EXPORT_SYMBOL(boot_option_idle_override);
/*
* Powermanagement idle function, if any..
*/
void (*pm_idle)(void);
#ifdef CONFIG_APM_MODULE
EXPORT_SYMBOL(pm_idle);
#endif
static inline int hlt_use_halt(void)
{
return 1;
}
#ifndef CONFIG_SMP
static inline void play_dead(void)
{
BUG();
}
#endif
#ifdef CONFIG_X86_64
void enter_idle(void)
{
this_cpu_write(is_idle, 1);
atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}
static void __exit_idle(void)
{
if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
return;
atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}
/* Called from interrupts to signify idle end */
void exit_idle(void)
{
/* idle loop has pid 0 */
if (current->pid)
return;
__exit_idle();
}
#endif
/*
* The idle thread. There's no useful work to be
* done, so just try to conserve power and have a
* low exit latency (ie sit in a loop waiting for
* somebody to say that they'd like to reschedule)
*/
void cpu_idle(void)
{
/*
* If we're the non-boot CPU, nothing set the stack canary up
* for us. CPU0 already has it initialized but no harm in
* doing it again. This is a good place for updating it, as
* we wont ever return from this function (so the invalid
* canaries already on the stack wont ever trigger).
*/
boot_init_stack_canary();
current_thread_info()->status |= TS_POLLING;
while (1) {
tick_nohz_idle_enter();
while (!need_resched()) {
rmb();
if (cpu_is_offline(smp_processor_id()))
play_dead();
/*
* Idle routines should keep interrupts disabled
* from here on, until they go to idle.
* Otherwise, idle callbacks can misfire.
*/
local_touch_nmi();
local_irq_disable();
enter_idle();
/* Don't trace irqs off for idle */
stop_critical_timings();
/* enter_idle() needs rcu for notifiers */
rcu_idle_enter();
if (cpuidle_idle_call())
pm_idle();
rcu_idle_exit();
start_critical_timings();
/* In many cases the interrupt that ended idle
has already called exit_idle. But some idle
loops can be woken up without interrupt. */
__exit_idle();
}
tick_nohz_idle_exit();
preempt_enable_no_resched();
schedule();
preempt_disable();
}
}
/*
* We use this if we don't have any better
* idle routine..
*/
void default_idle(void)
{
if (hlt_use_halt()) {
trace_power_start_rcuidle(POWER_CSTATE, 1, smp_processor_id());
trace_cpu_idle_rcuidle(1, smp_processor_id());
current_thread_info()->status &= ~TS_POLLING;
/*
* TS_POLLING-cleared state must be visible before we
* test NEED_RESCHED:
*/
smp_mb();
if (!need_resched())
safe_halt(); /* enables interrupts racelessly */
else
local_irq_enable();
current_thread_info()->status |= TS_POLLING;
trace_power_end_rcuidle(smp_processor_id());
trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
} else {
local_irq_enable();
/* loop is done by the caller */
cpu_relax();
}
}
#ifdef CONFIG_APM_MODULE
EXPORT_SYMBOL(default_idle);
#endif
xen/pm_idle: Make pm_idle be default_idle under Xen. The idea behind commit d91ee5863b71 ("cpuidle: replace xen access to x86 pm_idle and default_idle") was to have one call - disable_cpuidle() which would make pm_idle not be molested by other code. It disallows cpuidle_idle_call to be set to pm_idle (which is excellent). But in the select_idle_routine() and idle_setup(), the pm_idle can still be set to either: amd_e400_idle, mwait_idle or default_idle. This depends on some CPU flags (MWAIT) and in AMD case on the type of CPU. In case of mwait_idle we can hit some instances where the hypervisor (Amazon EC2 specifically) sets the MWAIT and we get: Brought up 2 CPUs invalid opcode: 0000 [#1] SMP Pid: 0, comm: swapper Not tainted 3.1.0-0.rc6.git0.3.fc16.x86_64 #1 RIP: e030:[<ffffffff81015d1d>] [<ffffffff81015d1d>] mwait_idle+0x6f/0xb4 ... Call Trace: [<ffffffff8100e2ed>] cpu_idle+0xae/0xe8 [<ffffffff8149ee78>] cpu_bringup_and_idle+0xe/0x10 RIP [<ffffffff81015d1d>] mwait_idle+0x6f/0xb4 RSP <ffff8801d28ddf10> In the case of amd_e400_idle we don't get so spectacular crashes, but we do end up making an MSR which is trapped in the hypervisor, and then follow it up with a yield hypercall. Meaning we end up going to hypervisor twice instead of just once. The previous behavior before v3.0 was that pm_idle was set to default_idle regardless of select_idle_routine/idle_setup. We want to do that, but only for one specific case: Xen. This patch does that. Fixes RH BZ #739499 and Ubuntu #881076 Reported-by: Stefan Bader <stefan.bader@canonical.com> Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2011-11-21 23:02:02 +00:00
bool set_pm_idle_to_default(void)
{
bool ret = !!pm_idle;
pm_idle = default_idle;
return ret;
}
void stop_this_cpu(void *dummy)
{
local_irq_disable();
/*
* Remove this CPU:
*/
set_cpu_online(smp_processor_id(), false);
disable_local_APIC();
for (;;) {
if (hlt_works(smp_processor_id()))
halt();
}
}
/* Default MONITOR/MWAIT with no hints, used for default C1 state */
static void mwait_idle(void)
{
if (!need_resched()) {
trace_power_start_rcuidle(POWER_CSTATE, 1, smp_processor_id());
trace_cpu_idle_rcuidle(1, smp_processor_id());
if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR))
clflush((void *)&current_thread_info()->flags);
__monitor((void *)&current_thread_info()->flags, 0, 0);
smp_mb();
if (!need_resched())
__sti_mwait(0, 0);
else
local_irq_enable();
trace_power_end_rcuidle(smp_processor_id());
trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
} else
local_irq_enable();
}
/*
* On SMP it's slightly faster (but much more power-consuming!)
* to poll the ->work.need_resched flag instead of waiting for the
* cross-CPU IPI to arrive. Use this option with caution.
*/
static void poll_idle(void)
{
trace_power_start_rcuidle(POWER_CSTATE, 0, smp_processor_id());
trace_cpu_idle_rcuidle(0, smp_processor_id());
local_irq_enable();
while (!need_resched())
cpu_relax();
trace_power_end_rcuidle(smp_processor_id());
trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
}
/*
* mwait selection logic:
*
* It depends on the CPU. For AMD CPUs that support MWAIT this is
* wrong. Family 0x10 and 0x11 CPUs will enter C1 on HLT. Powersavings
* then depend on a clock divisor and current Pstate of the core. If
* all cores of a processor are in halt state (C1) the processor can
* enter the C1E (C1 enhanced) state. If mwait is used this will never
* happen.
*
* idle=mwait overrides this decision and forces the usage of mwait.
*/
#define MWAIT_INFO 0x05
#define MWAIT_ECX_EXTENDED_INFO 0x01
#define MWAIT_EDX_C1 0xf0
int mwait_usable(const struct cpuinfo_x86 *c)
{
u32 eax, ebx, ecx, edx;
x86/sched: Make mwait_usable() heed to "idle=" kernel parameters properly The checks that exist in mwait_usable() for "idle=" kernel parameters are insufficient. As a result, mwait_usable() can return 1 even if "idle=nomwait" or "idle=poll" or "idle=halt" parameters are passed. Of these cases, incorrect handling of idle=nomwait is a universal problem since mwait can get used for usual CPU idling. However the rest of the cases are problematic only during CPU Hotplug (offline) because, in the CPU offline path, the function mwait_play_dead() is called, which might result in mwait being used in the offline CPUs, if mwait_usable() happens to return 1. Fix these issues by checking for the boot time "idle=" kernel parameter properly in mwait_usable(). The first issue (usual cpu idling) is demonstrated below: Before applying the patch (dmesg snippet): [ 0.000000] Command line: [...] idle=nomwait [ 0.000000] Kernel command line: [...] idle=nomwait [ 0.000000] RCU dyntick-idle grace-period acceleration is enabled. [ 0.140606] using mwait in idle threads. <======= mwait being used [ 4.303986] cpuidle: using governor ladder [ 4.308232] cpuidle: using governor menu After applying the patch: [ 0.000000] Command line: [...] idle=nomwait [ 0.000000] Kernel command line: [...] idle=nomwait [ 0.000000] RCU dyntick-idle grace-period acceleration is enabled. [ 4.264100] cpuidle: using governor ladder [ 4.268342] cpuidle: using governor menu Signed-off-by: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com> Acked-by: Deepthi Dharwar <deepthi@linux.vnet.ibm.com> Acked-by: Thomas Gleixner <tglx@linutronix.de> Cc: venki@google.com Cc: suresh.b.siddha@intel.com Cc: Borislav Petkov <bp@amd64.org> Cc: lenb@kernel.org Cc: Rafael J. Wysocki <rjw@sisk.pl> Link: http://lkml.kernel.org/r/4F9E37B8.30001@linux.vnet.ibm.com Signed-off-by: Ingo Molnar <mingo@kernel.org>
2012-04-30 06:56:56 +00:00
/* Use mwait if idle=mwait boot option is given */
if (boot_option_idle_override == IDLE_FORCE_MWAIT)
return 1;
x86/sched: Make mwait_usable() heed to "idle=" kernel parameters properly The checks that exist in mwait_usable() for "idle=" kernel parameters are insufficient. As a result, mwait_usable() can return 1 even if "idle=nomwait" or "idle=poll" or "idle=halt" parameters are passed. Of these cases, incorrect handling of idle=nomwait is a universal problem since mwait can get used for usual CPU idling. However the rest of the cases are problematic only during CPU Hotplug (offline) because, in the CPU offline path, the function mwait_play_dead() is called, which might result in mwait being used in the offline CPUs, if mwait_usable() happens to return 1. Fix these issues by checking for the boot time "idle=" kernel parameter properly in mwait_usable(). The first issue (usual cpu idling) is demonstrated below: Before applying the patch (dmesg snippet): [ 0.000000] Command line: [...] idle=nomwait [ 0.000000] Kernel command line: [...] idle=nomwait [ 0.000000] RCU dyntick-idle grace-period acceleration is enabled. [ 0.140606] using mwait in idle threads. <======= mwait being used [ 4.303986] cpuidle: using governor ladder [ 4.308232] cpuidle: using governor menu After applying the patch: [ 0.000000] Command line: [...] idle=nomwait [ 0.000000] Kernel command line: [...] idle=nomwait [ 0.000000] RCU dyntick-idle grace-period acceleration is enabled. [ 4.264100] cpuidle: using governor ladder [ 4.268342] cpuidle: using governor menu Signed-off-by: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com> Acked-by: Deepthi Dharwar <deepthi@linux.vnet.ibm.com> Acked-by: Thomas Gleixner <tglx@linutronix.de> Cc: venki@google.com Cc: suresh.b.siddha@intel.com Cc: Borislav Petkov <bp@amd64.org> Cc: lenb@kernel.org Cc: Rafael J. Wysocki <rjw@sisk.pl> Link: http://lkml.kernel.org/r/4F9E37B8.30001@linux.vnet.ibm.com Signed-off-by: Ingo Molnar <mingo@kernel.org>
2012-04-30 06:56:56 +00:00
/*
* Any idle= boot option other than idle=mwait means that we must not
* use mwait. Eg: idle=halt or idle=poll or idle=nomwait
*/
if (boot_option_idle_override != IDLE_NO_OVERRIDE)
return 0;
if (c->cpuid_level < MWAIT_INFO)
return 0;
cpuid(MWAIT_INFO, &eax, &ebx, &ecx, &edx);
/* Check, whether EDX has extended info about MWAIT */
if (!(ecx & MWAIT_ECX_EXTENDED_INFO))
return 1;
/*
* edx enumeratios MONITOR/MWAIT extensions. Check, whether
* C1 supports MWAIT
*/
return (edx & MWAIT_EDX_C1);
}
bool amd_e400_c1e_detected;
EXPORT_SYMBOL(amd_e400_c1e_detected);
static cpumask_var_t amd_e400_c1e_mask;
void amd_e400_remove_cpu(int cpu)
{
if (amd_e400_c1e_mask != NULL)
cpumask_clear_cpu(cpu, amd_e400_c1e_mask);
}
/*
* AMD Erratum 400 aware idle routine. We check for C1E active in the interrupt
* pending message MSR. If we detect C1E, then we handle it the same
* way as C3 power states (local apic timer and TSC stop)
*/
static void amd_e400_idle(void)
{
if (need_resched())
return;
if (!amd_e400_c1e_detected) {
u32 lo, hi;
rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
if (lo & K8_INTP_C1E_ACTIVE_MASK) {
amd_e400_c1e_detected = true;
if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
mark_tsc_unstable("TSC halt in AMD C1E");
pr_info("System has AMD C1E enabled\n");
}
}
if (amd_e400_c1e_detected) {
int cpu = smp_processor_id();
if (!cpumask_test_cpu(cpu, amd_e400_c1e_mask)) {
cpumask_set_cpu(cpu, amd_e400_c1e_mask);
/*
2009-08-17 21:34:59 +00:00
* Force broadcast so ACPI can not interfere.
*/
clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE,
&cpu);
pr_info("Switch to broadcast mode on CPU%d\n", cpu);
}
clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);
default_idle();
/*
* The switch back from broadcast mode needs to be
* called with interrupts disabled.
*/
local_irq_disable();
clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);
local_irq_enable();
} else
default_idle();
}
void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
{
#ifdef CONFIG_SMP
if (pm_idle == poll_idle && smp_num_siblings > 1) {
pr_warn_once("WARNING: polling idle and HT enabled, performance may degrade\n");
}
#endif
if (pm_idle)
return;
if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) {
/*
* One CPU supports mwait => All CPUs supports mwait
*/
pr_info("using mwait in idle threads\n");
pm_idle = mwait_idle;
} else if (cpu_has_amd_erratum(amd_erratum_400)) {
/* E400: APIC timer interrupt does not wake up CPU from C1e */
pr_info("using AMD E400 aware idle routine\n");
pm_idle = amd_e400_idle;
} else
pm_idle = default_idle;
}
void __init init_amd_e400_c1e_mask(void)
{
/* If we're using amd_e400_idle, we need to allocate amd_e400_c1e_mask. */
if (pm_idle == amd_e400_idle)
zalloc_cpumask_var(&amd_e400_c1e_mask, GFP_KERNEL);
}
static int __init idle_setup(char *str)
{
if (!str)
return -EINVAL;
if (!strcmp(str, "poll")) {
pr_info("using polling idle threads\n");
pm_idle = poll_idle;
boot_option_idle_override = IDLE_POLL;
} else if (!strcmp(str, "mwait")) {
boot_option_idle_override = IDLE_FORCE_MWAIT;
WARN_ONCE(1, "\"idle=mwait\" will be removed in 2012\n");
} else if (!strcmp(str, "halt")) {
/*
* When the boot option of idle=halt is added, halt is
* forced to be used for CPU idle. In such case CPU C2/C3
* won't be used again.
* To continue to load the CPU idle driver, don't touch
* the boot_option_idle_override.
*/
pm_idle = default_idle;
boot_option_idle_override = IDLE_HALT;
} else if (!strcmp(str, "nomwait")) {
/*
* If the boot option of "idle=nomwait" is added,
* it means that mwait will be disabled for CPU C2/C3
* states. In such case it won't touch the variable
* of boot_option_idle_override.
*/
boot_option_idle_override = IDLE_NOMWAIT;
} else
return -1;
return 0;
}
early_param("idle", idle_setup);
unsigned long arch_align_stack(unsigned long sp)
{
if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
sp -= get_random_int() % 8192;
return sp & ~0xf;
}
unsigned long arch_randomize_brk(struct mm_struct *mm)
{
unsigned long range_end = mm->brk + 0x02000000;
return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
}