From 62a03defeabd58f74e07ca030d6c21e069d4d88e Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Thu, 20 Oct 2016 16:14:52 +0800 Subject: [PATCH 01/12] PM / hibernate: Verify the consistent of e820 memory map by md5 digest On some platforms, there is occasional panic triggered when trying to resume from hibernation, a typical panic looks like: "BUG: unable to handle kernel paging request at ffff880085894000 IP: [] load_image_lzo+0x8c2/0xe70" Investigation carried out by Lee Chun-Yi shows that this is because e820 map has been changed by BIOS across hibernation, and one of the page frames from suspend kernel is right located in restore kernel's unmapped region, so panic comes out when accessing unmapped kernel address. In order to expose this issue earlier, the md5 hash of e820 map is passed from suspend kernel to restore kernel, and the restore kernel will terminate the resume process once it finds the md5 hash are not the same. As the format of image header has been modified, the magic number should also be adjusted as kernels with the same RESTORE_MAGIC have to use the same header format and interpret all of the fields in it in the same way. If the suspend kernel is built without md5 support, and the restore kernel has md5 support, then the latter will bypass the check process. Vice versa the restore kernel will bypass the check if it does not support md5 operation. Note: 1. Without this patch applied, it is possible that BIOS has provided an inconsistent memory map, but the resume kernel is still able to restore the image anyway(e.g, E820_RAM region is the superset of the previous one), although the system might be unstable. So this patch tries to treat any inconsistent e820 as illegal. 2. Another case is, this patch replies on comparing the e820_saved, but currently the e820_save might not be strictly the same across hibernation, even if BIOS has provided consistent e820 map - In theory mptable might modify the BIOS-provided e820_saved dynamically in early_reserve_e820_mpc_new, which would allocate a buffer from E820_RAM, and marks it from E820_RAM to E820_RESERVED). This is a potential and rare case we need to deal with in OS in the future. Suggested-by: Pavel Machek Suggested-by: Rafael J. Wysocki Signed-off-by: Chen Yu Reviewed-by: Lee, Chun-Yi Acked-by: Pavel Machek Signed-off-by: Rafael J. Wysocki --- arch/x86/power/hibernate_64.c | 94 ++++++++++++++++++++++++++++++++++- 1 file changed, 92 insertions(+), 2 deletions(-) diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c index 9634557a5444..ded2e8272382 100644 --- a/arch/x86/power/hibernate_64.c +++ b/arch/x86/power/hibernate_64.c @@ -11,6 +11,10 @@ #include #include #include +#include +#include + +#include #include #include @@ -177,14 +181,86 @@ int pfn_is_nosave(unsigned long pfn) return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn); } +#define MD5_DIGEST_SIZE 16 + struct restore_data_record { unsigned long jump_address; unsigned long jump_address_phys; unsigned long cr3; unsigned long magic; + u8 e820_digest[MD5_DIGEST_SIZE]; }; -#define RESTORE_MAGIC 0x123456789ABCDEF0UL +#define RESTORE_MAGIC 0x23456789ABCDEF01UL + +#if IS_BUILTIN(CONFIG_CRYPTO_MD5) +/** + * get_e820_md5 - calculate md5 according to given e820 map + * + * @map: the e820 map to be calculated + * @buf: the md5 result to be stored to + */ +static int get_e820_md5(struct e820map *map, void *buf) +{ + struct scatterlist sg; + struct crypto_ahash *tfm; + int size; + int ret = 0; + + tfm = crypto_alloc_ahash("md5", 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(tfm)) + return -ENOMEM; + + { + AHASH_REQUEST_ON_STACK(req, tfm); + size = offsetof(struct e820map, map) + + sizeof(struct e820entry) * map->nr_map; + ahash_request_set_tfm(req, tfm); + sg_init_one(&sg, (u8 *)map, size); + ahash_request_set_callback(req, 0, NULL, NULL); + ahash_request_set_crypt(req, &sg, buf, size); + + if (crypto_ahash_digest(req)) + ret = -EINVAL; + ahash_request_zero(req); + } + crypto_free_ahash(tfm); + + return ret; +} + +static void hibernation_e820_save(void *buf) +{ + get_e820_md5(e820_saved, buf); +} + +static bool hibernation_e820_mismatch(void *buf) +{ + int ret; + u8 result[MD5_DIGEST_SIZE]; + + memset(result, 0, MD5_DIGEST_SIZE); + /* If there is no digest in suspend kernel, let it go. */ + if (!memcmp(result, buf, MD5_DIGEST_SIZE)) + return false; + + ret = get_e820_md5(e820_saved, result); + if (ret) + return true; + + return memcmp(result, buf, MD5_DIGEST_SIZE) ? true : false; +} +#else +static void hibernation_e820_save(void *buf) +{ +} + +static bool hibernation_e820_mismatch(void *buf) +{ + /* If md5 is not builtin for restore kernel, let it go. */ + return false; +} +#endif /** * arch_hibernation_header_save - populate the architecture specific part @@ -201,6 +277,9 @@ int arch_hibernation_header_save(void *addr, unsigned int max_size) rdr->jump_address_phys = __pa_symbol(&restore_registers); rdr->cr3 = restore_cr3; rdr->magic = RESTORE_MAGIC; + + hibernation_e820_save(rdr->e820_digest); + return 0; } @@ -216,5 +295,16 @@ int arch_hibernation_header_restore(void *addr) restore_jump_address = rdr->jump_address; jump_address_phys = rdr->jump_address_phys; restore_cr3 = rdr->cr3; - return (rdr->magic == RESTORE_MAGIC) ? 0 : -EINVAL; + + if (rdr->magic != RESTORE_MAGIC) { + pr_crit("Unrecognized hibernate image header format!\n"); + return -EINVAL; + } + + if (hibernation_e820_mismatch(rdr->e820_digest)) { + pr_crit("Hibernate inconsistent memory map detected!\n"); + return -ENODEV; + } + + return 0; } From 406e79385f3223d82272cf2be86bc95cd000a258 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 21 Nov 2016 22:45:40 +0100 Subject: [PATCH 02/12] PM / sleep: System sleep state selection interface rework There are systems in which the platform doesn't support any special sleep states, so suspend-to-idle (PM_SUSPEND_FREEZE) is the only available system sleep state. However, some user space frameworks only use the "mem" and (sometimes) "standby" sleep state labels, so the users of those systems need to modify user space in order to be able to use system suspend at all and that may be a pain in practice. Commit 0399d4db3edf (PM / sleep: Introduce command line argument for sleep state enumeration) attempted to address this problem by adding a command line argument to change the meaning of the "mem" string in /sys/power/state to make it trigger suspend-to-idle (instead of suspend-to-RAM). However, there also are systems in which the platform does support special sleep states, but suspend-to-idle is the preferred one anyway (it even may save more energy than the platform-provided sleep states in some cases) and the above commit doesn't help in those cases. For this reason, rework the system sleep state selection interface again (but preserve backwards compatibiliby). Namely, add a new sysfs file, /sys/power/mem_sleep, that will control the system suspend mode triggered by writing "mem" to /sys/power/state (in analogy with what /sys/power/disk does for hibernation). Make it select suspend-to-RAM ("deep" sleep) by default (if supported) and fall back to suspend-to-idle ("s2idle") otherwise and add a new command line argument, mem_sleep_default, allowing that default to be overridden if need be. At the same time, drop the relative_sleep_states command line argument that doesn't make sense any more. Signed-off-by: Rafael J. Wysocki Tested-by: Mario Limonciello --- Documentation/ABI/testing/sysfs-power | 43 +++++++------ Documentation/kernel-parameters.txt | 13 ++-- Documentation/power/states.txt | 54 ++++++++++------ kernel/power/main.c | 88 ++++++++++++++++++++++++++- kernel/power/power.h | 6 +- kernel/power/suspend.c | 69 +++++++++++++-------- 6 files changed, 197 insertions(+), 76 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-power b/Documentation/ABI/testing/sysfs-power index 50b368d490b5..f523e5a3ac33 100644 --- a/Documentation/ABI/testing/sysfs-power +++ b/Documentation/ABI/testing/sysfs-power @@ -7,30 +7,35 @@ Description: subsystem. What: /sys/power/state -Date: May 2014 +Date: November 2016 Contact: Rafael J. Wysocki Description: The /sys/power/state file controls system sleep states. Reading from this file returns the available sleep state - labels, which may be "mem", "standby", "freeze" and "disk" - (hibernation). The meanings of the first three labels depend on - the relative_sleep_states command line argument as follows: - 1) relative_sleep_states = 1 - "mem", "standby", "freeze" represent non-hibernation sleep - states from the deepest ("mem", always present) to the - shallowest ("freeze"). "standby" and "freeze" may or may - not be present depending on the capabilities of the - platform. "freeze" can only be present if "standby" is - present. - 2) relative_sleep_states = 0 (default) - "mem" - "suspend-to-RAM", present if supported. - "standby" - "power-on suspend", present if supported. - "freeze" - "suspend-to-idle", always present. + labels, which may be "mem" (suspend), "standby" (power-on + suspend), "freeze" (suspend-to-idle) and "disk" (hibernation). - Writing to this file one of these strings causes the system to - transition into the corresponding state, if available. See - Documentation/power/states.txt for a description of what - "suspend-to-RAM", "power-on suspend" and "suspend-to-idle" mean. + Writing one of the above strings to this file causes the system + to transition into the corresponding state, if available. + + See Documentation/power/states.txt for more information. + +What: /sys/power/mem_sleep +Date: November 2016 +Contact: Rafael J. Wysocki +Description: + The /sys/power/mem_sleep file controls the operating mode of + system suspend. Reading from it returns the available modes + as "s2idle" (always present), "shallow" and "deep" (present if + supported). The mode that will be used on subsequent attempts + to suspend the system (by writing "mem" to the /sys/power/state + file described above) is enclosed in square brackets. + + Writing one of the above strings to this file causes the mode + represented by it to be used on subsequent attempts to suspend + the system. + + See Documentation/power/states.txt for more information. What: /sys/power/disk Date: September 2006 diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 37babf91f2cb..4131e169f97a 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2325,6 +2325,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted. memory contents and reserves bad memory regions that are detected. + mem_sleep_default= [SUSPEND] Default system suspend mode: + s2idle - Suspend-To-Idle + shallow - Power-On Suspend or equivalent (if supported) + deep - Suspend-To-RAM or equivalent (if supported) + See Documentation/power/states.txt. + meye.*= [HW] Set MotionEye Camera parameters See Documentation/video4linux/meye.txt. @@ -3668,13 +3674,6 @@ bytes respectively. Such letter suffixes can also be entirely omitted. [KNL, SMP] Set scheduler's default relax_domain_level. See Documentation/cgroup-v1/cpusets.txt. - relative_sleep_states= - [SUSPEND] Use sleep state labeling where the deepest - state available other than hibernation is always "mem". - Format: { "0" | "1" } - 0 -- Traditional sleep state labels. - 1 -- Relative sleep state labels. - reserve= [KNL,BUGS] Force the kernel to ignore some iomem area reservetop= [X86-32] diff --git a/Documentation/power/states.txt b/Documentation/power/states.txt index 50f3ef9177c1..008ecb588317 100644 --- a/Documentation/power/states.txt +++ b/Documentation/power/states.txt @@ -8,25 +8,41 @@ for each state. The states are represented by strings that can be read or written to the /sys/power/state file. Those strings may be "mem", "standby", "freeze" and -"disk", where the last one always represents hibernation (Suspend-To-Disk) and -the meaning of the remaining ones depends on the relative_sleep_states command -line argument. +"disk", where the last three always represent Power-On Suspend (if supported), +Suspend-To-Idle and hibernation (Suspend-To-Disk), respectively. -For relative_sleep_states=1, the strings "mem", "standby" and "freeze" label the -available non-hibernation sleep states from the deepest to the shallowest, -respectively. In that case, "mem" is always present in /sys/power/state, -because there is at least one non-hibernation sleep state in every system. If -the given system supports two non-hibernation sleep states, "standby" is present -in /sys/power/state in addition to "mem". If the system supports three -non-hibernation sleep states, "freeze" will be present in /sys/power/state in -addition to "mem" and "standby". +The meaning of the "mem" string is controlled by the /sys/power/mem_sleep file. +It contains strings representing the available modes of system suspend that may +be triggered by writing "mem" to /sys/power/state. These modes are "s2idle" +(Suspend-To-Idle), "shallow" (Power-On Suspend) and "deep" (Suspend-To-RAM). +The "s2idle" mode is always available, while the other ones are only available +if supported by the platform (if not supported, the strings representing them +are not present in /sys/power/mem_sleep). The string representing the suspend +mode to be used subsequently is enclosed in square brackets. Writing one of +the other strings present in /sys/power/mem_sleep to it causes the suspend mode +to be used subsequently to change to the one represented by that string. -For relative_sleep_states=0, which is the default, the following descriptions -apply. +Consequently, there are two ways to cause the system to go into the +Suspend-To-Idle sleep state. The first one is to write "freeze" directly to +/sys/power/state. The second one is to write "s2idle" to /sys/power/mem_sleep +and then to wrtie "mem" to /sys/power/state. Similarly, there are two ways +to cause the system to go into the Power-On Suspend sleep state (the strings to +write to the control files in that case are "standby" or "shallow" and "mem", +respectively) if that state is supported by the platform. In turn, there is +only one way to cause the system to go into the Suspend-To-RAM state (write +"deep" into /sys/power/mem_sleep and "mem" into /sys/power/state). -state: Suspend-To-Idle +The default suspend mode (ie. the one to be used without writing anything into +/sys/power/mem_sleep) is either "deep" (if Suspend-To-RAM is supported) or +"s2idle", but it can be overridden by the value of the "mem_sleep_default" +parameter in the kernel command line. + +The properties of all of the sleep states are described below. + + +State: Suspend-To-Idle ACPI state: S0 -Label: "freeze" +Label: "s2idle" ("freeze") This state is a generic, pure software, light-weight, system sleep state. It allows more energy to be saved relative to runtime idle by freezing user @@ -35,13 +51,13 @@ lower-power than available at run time), such that the processors can spend more time in their idle states. This state can be used for platforms without Power-On Suspend/Suspend-to-RAM -support, or it can be used in addition to Suspend-to-RAM (memory sleep) -to provide reduced resume latency. It is always supported. +support, or it can be used in addition to Suspend-to-RAM to provide reduced +resume latency. It is always supported. State: Standby / Power-On Suspend ACPI State: S1 -Label: "standby" +Label: "shallow" ("standby") This state, if supported, offers moderate, though real, power savings, while providing a relatively low-latency transition back to a working system. No @@ -58,7 +74,7 @@ state. State: Suspend-to-RAM ACPI State: S3 -Label: "mem" +Label: "deep" This state, if supported, offers significant power savings as everything in the system is put into a low-power state, except for memory, which should be placed diff --git a/kernel/power/main.c b/kernel/power/main.c index 281a697fd458..d401c21136d1 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -78,6 +78,78 @@ static ssize_t pm_async_store(struct kobject *kobj, struct kobj_attribute *attr, power_attr(pm_async); +#ifdef CONFIG_SUSPEND +static ssize_t mem_sleep_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + char *s = buf; + suspend_state_t i; + + for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) + if (mem_sleep_states[i]) { + const char *label = mem_sleep_states[i]; + + if (mem_sleep_current == i) + s += sprintf(s, "[%s] ", label); + else + s += sprintf(s, "%s ", label); + } + + /* Convert the last space to a newline if needed. */ + if (s != buf) + *(s-1) = '\n'; + + return (s - buf); +} + +static suspend_state_t decode_suspend_state(const char *buf, size_t n) +{ + suspend_state_t state; + char *p; + int len; + + p = memchr(buf, '\n', n); + len = p ? p - buf : n; + + for (state = PM_SUSPEND_MIN; state < PM_SUSPEND_MAX; state++) { + const char *label = mem_sleep_states[state]; + + if (label && len == strlen(label) && !strncmp(buf, label, len)) + return state; + } + + return PM_SUSPEND_ON; +} + +static ssize_t mem_sleep_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t n) +{ + suspend_state_t state; + int error; + + error = pm_autosleep_lock(); + if (error) + return error; + + if (pm_autosleep_state() > PM_SUSPEND_ON) { + error = -EBUSY; + goto out; + } + + state = decode_suspend_state(buf, n); + if (state < PM_SUSPEND_MAX && state > PM_SUSPEND_ON) + mem_sleep_current = state; + else + error = -EINVAL; + + out: + pm_autosleep_unlock(); + return error ? error : n; +} + +power_attr(mem_sleep); +#endif /* CONFIG_SUSPEND */ + #ifdef CONFIG_PM_DEBUG int pm_test_level = TEST_NONE; @@ -368,12 +440,16 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, } state = decode_state(buf, n); - if (state < PM_SUSPEND_MAX) + if (state < PM_SUSPEND_MAX) { + if (state == PM_SUSPEND_MEM) + state = mem_sleep_current; + error = pm_suspend(state); - else if (state == PM_SUSPEND_MAX) + } else if (state == PM_SUSPEND_MAX) { error = hibernate(); - else + } else { error = -EINVAL; + } out: pm_autosleep_unlock(); @@ -485,6 +561,9 @@ static ssize_t autosleep_store(struct kobject *kobj, && strcmp(buf, "off") && strcmp(buf, "off\n")) return -EINVAL; + if (state == PM_SUSPEND_MEM) + state = mem_sleep_current; + error = pm_autosleep_set_state(state); return error ? error : n; } @@ -602,6 +681,9 @@ static struct attribute * g[] = { #ifdef CONFIG_PM_SLEEP &pm_async_attr.attr, &wakeup_count_attr.attr, +#ifdef CONFIG_SUSPEND + &mem_sleep_attr.attr, +#endif #ifdef CONFIG_PM_AUTOSLEEP &autosleep_attr.attr, #endif diff --git a/kernel/power/power.h b/kernel/power/power.h index 56d1d0dedf76..1dfa0da827d3 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -189,11 +189,15 @@ extern void swsusp_show_speed(ktime_t, ktime_t, unsigned int, char *); #ifdef CONFIG_SUSPEND /* kernel/power/suspend.c */ -extern const char *pm_labels[]; +extern const char * const pm_labels[]; extern const char *pm_states[]; +extern const char *mem_sleep_states[]; +extern suspend_state_t mem_sleep_current; extern int suspend_devices_and_enter(suspend_state_t state); #else /* !CONFIG_SUSPEND */ +#define mem_sleep_current PM_SUSPEND_ON + static inline int suspend_devices_and_enter(suspend_state_t state) { return -ENOSYS; diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 6ccb08f57fcb..15e6baef5c73 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -32,8 +32,21 @@ #include "power.h" -const char *pm_labels[] = { "mem", "standby", "freeze", NULL }; +const char * const pm_labels[] = { + [PM_SUSPEND_FREEZE] = "freeze", + [PM_SUSPEND_STANDBY] = "standby", + [PM_SUSPEND_MEM] = "mem", +}; const char *pm_states[PM_SUSPEND_MAX]; +static const char * const mem_sleep_labels[] = { + [PM_SUSPEND_FREEZE] = "s2idle", + [PM_SUSPEND_STANDBY] = "shallow", + [PM_SUSPEND_MEM] = "deep", +}; +const char *mem_sleep_states[PM_SUSPEND_MAX]; + +suspend_state_t mem_sleep_current = PM_SUSPEND_FREEZE; +static suspend_state_t mem_sleep_default = PM_SUSPEND_MEM; unsigned int pm_suspend_global_flags; EXPORT_SYMBOL_GPL(pm_suspend_global_flags); @@ -110,30 +123,32 @@ static bool valid_state(suspend_state_t state) return suspend_ops && suspend_ops->valid && suspend_ops->valid(state); } -/* - * If this is set, the "mem" label always corresponds to the deepest sleep state - * available, the "standby" label corresponds to the second deepest sleep state - * available (if any), and the "freeze" label corresponds to the remaining - * available sleep state (if there is one). - */ -static bool relative_states; - void __init pm_states_init(void) { + /* "mem" and "freeze" are always present in /sys/power/state. */ + pm_states[PM_SUSPEND_MEM] = pm_labels[PM_SUSPEND_MEM]; + pm_states[PM_SUSPEND_FREEZE] = pm_labels[PM_SUSPEND_FREEZE]; /* - * freeze state should be supported even without any suspend_ops, - * initialize pm_states accordingly here + * Suspend-to-idle should be supported even without any suspend_ops, + * initialize mem_sleep_states[] accordingly here. */ - pm_states[PM_SUSPEND_FREEZE] = pm_labels[relative_states ? 0 : 2]; + mem_sleep_states[PM_SUSPEND_FREEZE] = mem_sleep_labels[PM_SUSPEND_FREEZE]; } -static int __init sleep_states_setup(char *str) +static int __init mem_sleep_default_setup(char *str) { - relative_states = !strncmp(str, "1", 1); + suspend_state_t state; + + for (state = PM_SUSPEND_FREEZE; state <= PM_SUSPEND_MEM; state++) + if (mem_sleep_labels[state] && + !strcmp(str, mem_sleep_labels[state])) { + mem_sleep_default = state; + break; + } + return 1; } - -__setup("relative_sleep_states=", sleep_states_setup); +__setup("mem_sleep_default=", mem_sleep_default_setup); /** * suspend_set_ops - Set the global suspend method table. @@ -141,21 +156,21 @@ __setup("relative_sleep_states=", sleep_states_setup); */ void suspend_set_ops(const struct platform_suspend_ops *ops) { - suspend_state_t i; - int j = 0; - lock_system_sleep(); suspend_ops = ops; - for (i = PM_SUSPEND_MEM; i >= PM_SUSPEND_STANDBY; i--) - if (valid_state(i)) { - pm_states[i] = pm_labels[j++]; - } else if (!relative_states) { - pm_states[i] = NULL; - j++; - } - pm_states[PM_SUSPEND_FREEZE] = pm_labels[j]; + if (valid_state(PM_SUSPEND_STANDBY)) { + mem_sleep_states[PM_SUSPEND_STANDBY] = mem_sleep_labels[PM_SUSPEND_STANDBY]; + pm_states[PM_SUSPEND_STANDBY] = pm_labels[PM_SUSPEND_STANDBY]; + if (mem_sleep_default == PM_SUSPEND_STANDBY) + mem_sleep_current = PM_SUSPEND_STANDBY; + } + if (valid_state(PM_SUSPEND_MEM)) { + mem_sleep_states[PM_SUSPEND_MEM] = mem_sleep_labels[PM_SUSPEND_MEM]; + if (mem_sleep_default == PM_SUSPEND_MEM) + mem_sleep_current = PM_SUSPEND_MEM; + } unlock_system_sleep(); } From 08b98d3291652bdcd1029a059e39fbcae5ad93e2 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 17 Nov 2016 03:28:53 +0100 Subject: [PATCH 03/12] PM / sleep / ACPI: Use the ACPI_FADT_LOW_POWER_S0 flag Modify the ACPI system sleep support setup code to select suspend-to-idle as the default system sleep state if the ACPI_FADT_LOW_POWER_S0 flag is set in the FADT and the default sleep state was not selected from the kernel command line. Signed-off-by: Rafael J. Wysocki Tested-by: Mario Limonciello --- Documentation/power/states.txt | 4 +++- drivers/acpi/sleep.c | 8 ++++++++ include/linux/suspend.h | 2 ++ kernel/power/suspend.c | 4 ++-- 4 files changed, 15 insertions(+), 3 deletions(-) diff --git a/Documentation/power/states.txt b/Documentation/power/states.txt index 008ecb588317..8a39ce45d8a0 100644 --- a/Documentation/power/states.txt +++ b/Documentation/power/states.txt @@ -35,7 +35,9 @@ only one way to cause the system to go into the Suspend-To-RAM state (write The default suspend mode (ie. the one to be used without writing anything into /sys/power/mem_sleep) is either "deep" (if Suspend-To-RAM is supported) or "s2idle", but it can be overridden by the value of the "mem_sleep_default" -parameter in the kernel command line. +parameter in the kernel command line. On some ACPI-based systems, depending on +the information in the FADT, the default may be "s2idle" even if Suspend-To-RAM +is supported. The properties of all of the sleep states are described below. diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c index deb0ff78eba8..ce1855fd584b 100644 --- a/drivers/acpi/sleep.c +++ b/drivers/acpi/sleep.c @@ -691,6 +691,14 @@ static void acpi_sleep_suspend_setup(void) if (acpi_sleep_state_supported(i)) sleep_states[i] = 1; + /* + * Use suspend-to-idle by default if ACPI_FADT_LOW_POWER_S0 is set and + * the default suspend mode was not selected from the command line. + */ + if (acpi_gbl_FADT.flags & ACPI_FADT_LOW_POWER_S0 && + mem_sleep_default > PM_SUSPEND_MEM) + mem_sleep_default = PM_SUSPEND_FREEZE; + suspend_set_ops(old_suspend_ordering ? &acpi_suspend_ops_old : &acpi_suspend_ops); freeze_set_ops(&acpi_freeze_ops); diff --git a/include/linux/suspend.h b/include/linux/suspend.h index d9718378a8be..0c729c3c8549 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -194,6 +194,8 @@ struct platform_freeze_ops { }; #ifdef CONFIG_SUSPEND +extern suspend_state_t mem_sleep_default; + /** * suspend_set_ops - set platform dependent suspend operations * @ops: The new suspend operations to set. diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 15e6baef5c73..f67ceb7768b8 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -46,7 +46,7 @@ static const char * const mem_sleep_labels[] = { const char *mem_sleep_states[PM_SUSPEND_MAX]; suspend_state_t mem_sleep_current = PM_SUSPEND_FREEZE; -static suspend_state_t mem_sleep_default = PM_SUSPEND_MEM; +suspend_state_t mem_sleep_default = PM_SUSPEND_MAX; unsigned int pm_suspend_global_flags; EXPORT_SYMBOL_GPL(pm_suspend_global_flags); @@ -168,7 +168,7 @@ void suspend_set_ops(const struct platform_suspend_ops *ops) } if (valid_state(PM_SUSPEND_MEM)) { mem_sleep_states[PM_SUSPEND_MEM] = mem_sleep_labels[PM_SUSPEND_MEM]; - if (mem_sleep_default == PM_SUSPEND_MEM) + if (mem_sleep_default >= PM_SUSPEND_MEM) mem_sleep_current = PM_SUSPEND_MEM; } From bed5ab6375cd556d93661bbcea0d18c109c50df1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 22 Nov 2016 21:15:58 +0000 Subject: [PATCH 04/12] powercap/intel_rapl: Add missing domain data update on hotplug The domain data of packages is only updated at init time, but new packages created by hotplug miss that treatment. Add it there and remove the global update at init time, because it's now obsolete. The more interesting question is why rapl_update_domain_data() exists at all as nothing ever uses that data. Signed-off-by: Thomas Gleixner Tested-by: Jacob Pan Signed-off-by: Rafael J. Wysocki --- drivers/powercap/intel_rapl.c | 42 +++++++++++++++-------------------- 1 file changed, 18 insertions(+), 24 deletions(-) diff --git a/drivers/powercap/intel_rapl.c b/drivers/powercap/intel_rapl.c index 243b233ff31b..94b9901d192f 100644 --- a/drivers/powercap/intel_rapl.c +++ b/drivers/powercap/intel_rapl.c @@ -1164,24 +1164,20 @@ static const struct x86_cpu_id rapl_ids[] __initconst = { }; MODULE_DEVICE_TABLE(x86cpu, rapl_ids); -/* read once for all raw primitive data for all packages, domains */ -static void rapl_update_domain_data(void) +/* Read once for all raw primitive data for domains */ +static void rapl_update_domain_data(struct rapl_package *rp) { int dmn, prim; u64 val; - struct rapl_package *rp; - list_for_each_entry(rp, &rapl_packages, plist) { - for (dmn = 0; dmn < rp->nr_domains; dmn++) { - pr_debug("update package %d domain %s data\n", rp->id, - rp->domains[dmn].name); - /* exclude non-raw primitives */ - for (prim = 0; prim < NR_RAW_PRIMITIVES; prim++) - if (!rapl_read_data_raw(&rp->domains[dmn], prim, - rpi[prim].unit, - &val)) - rp->domains[dmn].rdd.primitives[prim] = - val; + for (dmn = 0; dmn < rp->nr_domains; dmn++) { + pr_debug("update package %d domain %s data\n", rp->id, + rp->domains[dmn].name); + /* exclude non-raw primitives */ + for (prim = 0; prim < NR_RAW_PRIMITIVES; prim++) { + if (!rapl_read_data_raw(&rp->domains[dmn], prim, + rpi[prim].unit, &val)) + rp->domains[dmn].rdd.primitives[prim] = val; } } @@ -1234,10 +1230,12 @@ static int rapl_unregister_powercap(void) static int rapl_package_register_powercap(struct rapl_package *rp) { struct rapl_domain *rd; - int ret = 0; char dev_name[17]; /* max domain name = 7 + 1 + 8 for int + 1 for null*/ struct powercap_zone *power_zone = NULL; - int nr_pl; + int nr_pl, ret;; + + /* Update the domain data of the new package */ + rapl_update_domain_data(rp); /* first we register package domain as the parent zone*/ for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) { @@ -1257,8 +1255,7 @@ static int rapl_package_register_powercap(struct rapl_package *rp) if (IS_ERR(power_zone)) { pr_debug("failed to register package, %d\n", rp->id); - ret = PTR_ERR(power_zone); - goto exit_package; + return PTR_ERR(power_zone); } /* track parent zone in per package/socket data */ rp->power_zone = power_zone; @@ -1268,8 +1265,7 @@ static int rapl_package_register_powercap(struct rapl_package *rp) } if (!power_zone) { pr_err("no package domain found, unknown topology!\n"); - ret = -ENODEV; - goto exit_package; + return -ENODEV; } /* now register domains as children of the socket/package*/ for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) { @@ -1290,9 +1286,8 @@ static int rapl_package_register_powercap(struct rapl_package *rp) goto err_cleanup; } } + return 0; -exit_package: - return ret; err_cleanup: /* clean up previously initialized domains within the package if we * failed after the first domain setup. @@ -1357,8 +1352,7 @@ static int rapl_register_powercap(void) pr_debug("failed to register powercap control_type.\n"); return PTR_ERR(control_type); } - /* read the initial data */ - rapl_update_domain_data(); + list_for_each_entry(rp, &rapl_packages, plist) if (rapl_package_register_powercap(rp)) goto err_cleanup_package; From a74f43675790ae55b908940652b4b04b236d8f18 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 22 Nov 2016 21:15:59 +0000 Subject: [PATCH 05/12] powercap/intel_rapl: Propagate error code when registration fails If rapl_package_register_powercap() fails in rapl_add_package() the function happily returns 0. Capture the error code and propagate it. Signed-off-by: Thomas Gleixner Tested-by: Jacob Pan Signed-off-by: Rafael J. Wysocki --- drivers/powercap/intel_rapl.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/powercap/intel_rapl.c b/drivers/powercap/intel_rapl.c index 94b9901d192f..79b1e041f2ec 100644 --- a/drivers/powercap/intel_rapl.c +++ b/drivers/powercap/intel_rapl.c @@ -1563,9 +1563,8 @@ static void rapl_remove_package(struct rapl_package *rp) /* called from CPU hotplug notifier, hotplug lock held */ static int rapl_add_package(int cpu) { - int ret = 0; - int phy_package_id; struct rapl_package *rp; + int ret, phy_package_id; phy_package_id = topology_physical_package_id(cpu); rp = kzalloc(sizeof(struct rapl_package), GFP_KERNEL); @@ -1583,10 +1582,11 @@ static int rapl_add_package(int cpu) ret = -ENODEV; goto err_free_package; } - if (!rapl_package_register_powercap(rp)) { + ret = rapl_package_register_powercap(rp); + if (!ret) { INIT_LIST_HEAD(&rp->plist); list_add(&rp->plist, &rapl_packages); - return ret; + return 0; } err_free_package: From 5e4dc7915979a5a1ca9551659f0fdcd116a25b8f Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 22 Nov 2016 21:16:00 +0000 Subject: [PATCH 06/12] powercap/intel rapl: Convert to hotplug state machine Install the callbacks via the state machine as a first step. The init/exit code is a duplicate of the hotplug code. This is cleaned up in a consecutive patch. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Tested-by: Jacob Pan Signed-off-by: Rafael J. Wysocki --- drivers/powercap/intel_rapl.c | 94 ++++++++++++++++++----------------- 1 file changed, 49 insertions(+), 45 deletions(-) diff --git a/drivers/powercap/intel_rapl.c b/drivers/powercap/intel_rapl.c index 79b1e041f2ec..de5cf22371f5 100644 --- a/drivers/powercap/intel_rapl.c +++ b/drivers/powercap/intel_rapl.c @@ -1603,50 +1603,48 @@ err_free_package: * associated domains. Cooling devices are handled accordingly at * per-domain level. */ -static int rapl_cpu_callback(struct notifier_block *nfb, - unsigned long action, void *hcpu) +static int rapl_cpu_online(unsigned int cpu) +{ + struct rapl_package *rp; + int phy_package_id; + + phy_package_id = topology_physical_package_id(cpu); + + rp = find_package_by_id(phy_package_id); + if (rp) + rp->nr_cpus++; + else + rapl_add_package(cpu); + return 0; +} + +static int rapl_cpu_down_prep(unsigned int cpu) { - unsigned long cpu = (unsigned long)hcpu; int phy_package_id; struct rapl_package *rp; int lead_cpu; phy_package_id = topology_physical_package_id(cpu); - switch (action) { - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - case CPU_DOWN_FAILED: - case CPU_DOWN_FAILED_FROZEN: - rp = find_package_by_id(phy_package_id); - if (rp) - ++rp->nr_cpus; - else - rapl_add_package(cpu); - break; - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - rp = find_package_by_id(phy_package_id); - if (!rp) - break; - if (--rp->nr_cpus == 0) - rapl_remove_package(rp); - else if (cpu == rp->lead_cpu) { - /* choose another active cpu in the package */ - lead_cpu = cpumask_any_but(topology_core_cpumask(cpu), cpu); - if (lead_cpu < nr_cpu_ids) - rp->lead_cpu = lead_cpu; - else /* should never go here */ - pr_err("no active cpu available for package %d\n", - phy_package_id); + rp = find_package_by_id(phy_package_id); + if (!rp) + return 0; + if (--rp->nr_cpus == 0) { + rapl_remove_package(rp); + } else if (cpu == rp->lead_cpu) { + /* choose another active cpu in the package */ + lead_cpu = cpumask_any_but(topology_core_cpumask(cpu), cpu); + if (lead_cpu < nr_cpu_ids) { + rp->lead_cpu = lead_cpu; + } else { + /* should never go here */ + pr_err("no active cpu available for package %d\n", + phy_package_id); } } - - return NOTIFY_OK; + return 0; } -static struct notifier_block rapl_cpu_notifier = { - .notifier_call = rapl_cpu_callback, -}; +static enum cpuhp_state pcap_rapl_online; static int __init rapl_init(void) { @@ -1663,36 +1661,42 @@ static int __init rapl_init(void) rapl_defaults = (struct rapl_defaults *)id->driver_data; - cpu_notifier_register_begin(); - /* prevent CPU hotplug during detection */ get_online_cpus(); ret = rapl_detect_topology(); if (ret) - goto done; + goto err; if (rapl_register_powercap()) { - rapl_cleanup_data(); ret = -ENODEV; - goto done; + goto err_cleanup; } - __register_hotcpu_notifier(&rapl_cpu_notifier); -done: + ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, + "powercap/rapl:online", + rapl_cpu_online, rapl_cpu_down_prep); + if (ret < 0) + goto err_unreg; + pcap_rapl_online = ret; put_online_cpus(); - cpu_notifier_register_done(); + return 0; +err_unreg: + rapl_unregister_powercap(); + +err_cleanup: + rapl_cleanup_data(); +err: + put_online_cpus(); return ret; } static void __exit rapl_exit(void) { - cpu_notifier_register_begin(); get_online_cpus(); - __unregister_hotcpu_notifier(&rapl_cpu_notifier); + cpuhp_remove_state(pcap_rapl_online); rapl_unregister_powercap(); rapl_cleanup_data(); put_online_cpus(); - cpu_notifier_register_done(); } module_init(rapl_init); From 58705069204cc8afa0d7b759e81b61147e973de9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 22 Nov 2016 21:16:02 +0000 Subject: [PATCH 07/12] powercap/intel_rapl: Cleanup duplicated init code The whole init/exit code is a duplicate of the cpuhotplug code. So we can just let the hotplug code do the actual work of setting up and tearing down the domains. This also restores the package hardware when a package is removed during hotplug instead of leaving it in the last configured state and only reset it when the driver is removed. Signed-off-by: Thomas Gleixner Tested-by: Jacob Pan Signed-off-by: Rafael J. Wysocki --- drivers/powercap/intel_rapl.c | 234 +++++++--------------------------- 1 file changed, 46 insertions(+), 188 deletions(-) diff --git a/drivers/powercap/intel_rapl.c b/drivers/powercap/intel_rapl.c index de5cf22371f5..76a00696a5ce 100644 --- a/drivers/powercap/intel_rapl.c +++ b/drivers/powercap/intel_rapl.c @@ -275,18 +275,6 @@ static struct rapl_package *find_package_by_id(int id) return NULL; } -/* caller must hold cpu hotplug lock */ -static void rapl_cleanup_data(void) -{ - struct rapl_package *p, *tmp; - - list_for_each_entry_safe(p, tmp, &rapl_packages, plist) { - kfree(p->domains); - list_del(&p->plist); - kfree(p); - } -} - static int get_energy_counter(struct powercap_zone *power_zone, u64 *energy_raw) { struct rapl_domain *rd; @@ -976,10 +964,20 @@ static void package_power_limit_irq_save(struct rapl_package *rp) smp_call_function_single(rp->lead_cpu, power_limit_irq_save_cpu, rp, 1); } -static void power_limit_irq_restore_cpu(void *info) +/* + * Restore per package power limit interrupt enable state. Called from cpu + * hotplug code on package removal. + */ +static void package_power_limit_irq_restore(struct rapl_package *rp) { - u32 l, h = 0; - struct rapl_package *rp = (struct rapl_package *)info; + u32 l, h; + + if (!boot_cpu_has(X86_FEATURE_PTS) || !boot_cpu_has(X86_FEATURE_PLN)) + return; + + /* irq enable state not saved, nothing to restore */ + if (!(rp->power_limit_irq & PACKAGE_PLN_INT_SAVED)) + return; rdmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, &l, &h); @@ -991,19 +989,6 @@ static void power_limit_irq_restore_cpu(void *info) wrmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h); } -/* restore per package power limit interrupt enable state */ -static void package_power_limit_irq_restore(struct rapl_package *rp) -{ - if (!boot_cpu_has(X86_FEATURE_PTS) || !boot_cpu_has(X86_FEATURE_PLN)) - return; - - /* irq enable state not saved, nothing to restore */ - if (!(rp->power_limit_irq & PACKAGE_PLN_INT_SAVED)) - return; - - smp_call_function_single(rp->lead_cpu, power_limit_irq_restore_cpu, rp, 1); -} - static void set_floor_freq_default(struct rapl_domain *rd, bool mode) { int nr_powerlimit = find_nr_power_limit(rd); @@ -1183,48 +1168,14 @@ static void rapl_update_domain_data(struct rapl_package *rp) } -static int rapl_unregister_powercap(void) +static void rapl_unregister_powercap(void) { - struct rapl_package *rp; - struct rapl_domain *rd, *rd_package = NULL; - - /* unregister all active rapl packages from the powercap layer, - * hotplug lock held - */ - list_for_each_entry(rp, &rapl_packages, plist) { - package_power_limit_irq_restore(rp); - - for (rd = rp->domains; rd < rp->domains + rp->nr_domains; - rd++) { - pr_debug("remove package, undo power limit on %d: %s\n", - rp->id, rd->name); - rapl_write_data_raw(rd, PL1_ENABLE, 0); - rapl_write_data_raw(rd, PL1_CLAMP, 0); - if (find_nr_power_limit(rd) > 1) { - rapl_write_data_raw(rd, PL2_ENABLE, 0); - rapl_write_data_raw(rd, PL2_CLAMP, 0); - } - if (rd->id == RAPL_DOMAIN_PACKAGE) { - rd_package = rd; - continue; - } - powercap_unregister_zone(control_type, &rd->power_zone); - } - /* do the package zone last */ - if (rd_package) - powercap_unregister_zone(control_type, - &rd_package->power_zone); - } - if (platform_rapl_domain) { powercap_unregister_zone(control_type, &platform_rapl_domain->power_zone); kfree(platform_rapl_domain); } - powercap_unregister_control_type(control_type); - - return 0; } static int rapl_package_register_powercap(struct rapl_package *rp) @@ -1289,7 +1240,8 @@ static int rapl_package_register_powercap(struct rapl_package *rp) return 0; err_cleanup: - /* clean up previously initialized domains within the package if we + /* + * Clean up previously initialized domains within the package if we * failed after the first domain setup. */ while (--rd >= rp->domains) { @@ -1300,7 +1252,7 @@ err_cleanup: return ret; } -static int rapl_register_psys(void) +static int __init rapl_register_psys(void) { struct rapl_domain *rd; struct powercap_zone *power_zone; @@ -1341,39 +1293,14 @@ static int rapl_register_psys(void) return 0; } -static int rapl_register_powercap(void) +static int __init rapl_register_powercap(void) { - struct rapl_domain *rd; - struct rapl_package *rp; - int ret = 0; - control_type = powercap_register_control_type(NULL, "intel-rapl", NULL); if (IS_ERR(control_type)) { pr_debug("failed to register powercap control_type.\n"); return PTR_ERR(control_type); } - - list_for_each_entry(rp, &rapl_packages, plist) - if (rapl_package_register_powercap(rp)) - goto err_cleanup_package; - - /* Don't bail out if PSys is not supported */ - rapl_register_psys(); - - return ret; - -err_cleanup_package: - /* clean up previously initialized packages */ - list_for_each_entry_continue_reverse(rp, &rapl_packages, plist) { - for (rd = rp->domains; rd < rp->domains + rp->nr_domains; - rd++) { - pr_debug("unregister zone/package %d, %s domain\n", - rp->id, rd->name); - powercap_unregister_zone(control_type, &rd->power_zone); - } - } - - return ret; + return 0; } static int rapl_check_domain(int cpu, int domain) @@ -1446,9 +1373,8 @@ static void rapl_detect_powerlimit(struct rapl_domain *rd) */ static int rapl_detect_domains(struct rapl_package *rp, int cpu) { - int i; - int ret = 0; struct rapl_domain *rd; + int i; for (i = 0; i < RAPL_DOMAIN_MAX; i++) { /* use physical package id to read counters */ @@ -1460,84 +1386,20 @@ static int rapl_detect_domains(struct rapl_package *rp, int cpu) rp->nr_domains = bitmap_weight(&rp->domain_map, RAPL_DOMAIN_MAX); if (!rp->nr_domains) { pr_debug("no valid rapl domains found in package %d\n", rp->id); - ret = -ENODEV; - goto done; + return -ENODEV; } pr_debug("found %d domains on package %d\n", rp->nr_domains, rp->id); rp->domains = kcalloc(rp->nr_domains + 1, sizeof(struct rapl_domain), GFP_KERNEL); - if (!rp->domains) { - ret = -ENOMEM; - goto done; - } + if (!rp->domains) + return -ENOMEM; + rapl_init_domains(rp); for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) rapl_detect_powerlimit(rd); - - -done: - return ret; -} - -static bool is_package_new(int package) -{ - struct rapl_package *rp; - - /* caller prevents cpu hotplug, there will be no new packages added - * or deleted while traversing the package list, no need for locking. - */ - list_for_each_entry(rp, &rapl_packages, plist) - if (package == rp->id) - return false; - - return true; -} - -/* RAPL interface can be made of a two-level hierarchy: package level and domain - * level. We first detect the number of packages then domains of each package. - * We have to consider the possiblity of CPU online/offline due to hotplug and - * other scenarios. - */ -static int rapl_detect_topology(void) -{ - int i; - int phy_package_id; - struct rapl_package *new_package, *rp; - - for_each_online_cpu(i) { - phy_package_id = topology_physical_package_id(i); - if (is_package_new(phy_package_id)) { - new_package = kzalloc(sizeof(*rp), GFP_KERNEL); - if (!new_package) { - rapl_cleanup_data(); - return -ENOMEM; - } - /* add the new package to the list */ - new_package->id = phy_package_id; - new_package->nr_cpus = 1; - /* use the first active cpu of the package to access */ - new_package->lead_cpu = i; - /* check if the package contains valid domains */ - if (rapl_detect_domains(new_package, i) || - rapl_defaults->check_unit(new_package, i)) { - kfree(new_package->domains); - kfree(new_package); - /* free up the packages already initialized */ - rapl_cleanup_data(); - return -ENODEV; - } - INIT_LIST_HEAD(&new_package->plist); - list_add(&new_package->plist, &rapl_packages); - } else { - rp = find_package_by_id(phy_package_id); - if (rp) - ++rp->nr_cpus; - } - } - return 0; } @@ -1546,12 +1408,21 @@ static void rapl_remove_package(struct rapl_package *rp) { struct rapl_domain *rd, *rd_package = NULL; + package_power_limit_irq_restore(rp); + for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) { + rapl_write_data_raw(rd, PL1_ENABLE, 0); + rapl_write_data_raw(rd, PL1_CLAMP, 0); + if (find_nr_power_limit(rd) > 1) { + rapl_write_data_raw(rd, PL2_ENABLE, 0); + rapl_write_data_raw(rd, PL2_CLAMP, 0); + } if (rd->id == RAPL_DOMAIN_PACKAGE) { rd_package = rd; continue; } - pr_debug("remove package %d, %s domain\n", rp->id, rd->name); + pr_debug("remove package, undo power limit on %d: %s\n", + rp->id, rd->name); powercap_unregister_zone(control_type, &rd->power_zone); } /* do parent zone last */ @@ -1611,11 +1482,11 @@ static int rapl_cpu_online(unsigned int cpu) phy_package_id = topology_physical_package_id(cpu); rp = find_package_by_id(phy_package_id); - if (rp) + if (rp) { rp->nr_cpus++; - else - rapl_add_package(cpu); - return 0; + return 0; + } + return rapl_add_package(cpu); } static int rapl_cpu_down_prep(unsigned int cpu) @@ -1648,8 +1519,8 @@ static enum cpuhp_state pcap_rapl_online; static int __init rapl_init(void) { - int ret = 0; const struct x86_cpu_id *id; + int ret; id = x86_match_cpu(rapl_ids); if (!id) { @@ -1661,42 +1532,29 @@ static int __init rapl_init(void) rapl_defaults = (struct rapl_defaults *)id->driver_data; - /* prevent CPU hotplug during detection */ - get_online_cpus(); - ret = rapl_detect_topology(); + ret = rapl_register_powercap(); if (ret) - goto err; + return ret; - if (rapl_register_powercap()) { - ret = -ENODEV; - goto err_cleanup; - } - ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, - "powercap/rapl:online", - rapl_cpu_online, rapl_cpu_down_prep); + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "powercap/rapl:online", + rapl_cpu_online, rapl_cpu_down_prep); if (ret < 0) goto err_unreg; pcap_rapl_online = ret; - put_online_cpus(); + + /* Don't bail out if PSys is not supported */ + rapl_register_psys(); return 0; err_unreg: rapl_unregister_powercap(); - -err_cleanup: - rapl_cleanup_data(); -err: - put_online_cpus(); return ret; } static void __exit rapl_exit(void) { - get_online_cpus(); cpuhp_remove_state(pcap_rapl_online); rapl_unregister_powercap(); - rapl_cleanup_data(); - put_online_cpus(); } module_init(rapl_init); From b4005e9278a4e62819fb16ba4bc3430ca650d0ab Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 22 Nov 2016 21:16:05 +0000 Subject: [PATCH 08/12] powercap/intel_rapl: Track active CPUs internally The ability of the CPU hotplug code to stop online/offline at each step makes it necessary to track the activated CPUs in a package directly, because outerwise a CPU offline callback can find CPUs which have already executed the offline callback, but are not yet marked offline in the topology mask. That could make such a CPU the package leader and in case that CPU goes fully offline leave the package lead orphaned. Signed-off-by: Thomas Gleixner Tested-by: Jacob Pan Signed-off-by: Rafael J. Wysocki --- drivers/powercap/intel_rapl.c | 59 ++++++++++++++--------------------- 1 file changed, 24 insertions(+), 35 deletions(-) diff --git a/drivers/powercap/intel_rapl.c b/drivers/powercap/intel_rapl.c index 76a00696a5ce..65ed88a28030 100644 --- a/drivers/powercap/intel_rapl.c +++ b/drivers/powercap/intel_rapl.c @@ -189,14 +189,13 @@ struct rapl_package { unsigned int time_unit; struct rapl_domain *domains; /* array of domains, sized at runtime */ struct powercap_zone *power_zone; /* keep track of parent zone */ - int nr_cpus; /* active cpus on the package, topology info is lost during - * cpu hotplug. so we have to track ourselves. - */ unsigned long power_limit_irq; /* keep track of package power limit * notify interrupt enable status. */ struct list_head plist; int lead_cpu; /* one active cpu per package for access */ + /* Track active cpus */ + struct cpumask cpumask; }; struct rapl_defaults { @@ -1432,19 +1431,17 @@ static void rapl_remove_package(struct rapl_package *rp) } /* called from CPU hotplug notifier, hotplug lock held */ -static int rapl_add_package(int cpu) +static struct rapl_package *rapl_add_package(int cpu, int pkgid) { struct rapl_package *rp; - int ret, phy_package_id; + int ret; - phy_package_id = topology_physical_package_id(cpu); rp = kzalloc(sizeof(struct rapl_package), GFP_KERNEL); if (!rp) - return -ENOMEM; + return ERR_PTR(-ENOMEM); /* add the new package to the list */ - rp->id = phy_package_id; - rp->nr_cpus = 1; + rp->id = pkgid; rp->lead_cpu = cpu; /* check if the package contains valid domains */ @@ -1457,14 +1454,13 @@ static int rapl_add_package(int cpu) if (!ret) { INIT_LIST_HEAD(&rp->plist); list_add(&rp->plist, &rapl_packages); - return 0; + return rp; } err_free_package: kfree(rp->domains); kfree(rp); - - return ret; + return ERR_PTR(ret); } /* Handles CPU hotplug on multi-socket systems. @@ -1476,42 +1472,35 @@ err_free_package: */ static int rapl_cpu_online(unsigned int cpu) { + int pkgid = topology_physical_package_id(cpu); struct rapl_package *rp; - int phy_package_id; - phy_package_id = topology_physical_package_id(cpu); - - rp = find_package_by_id(phy_package_id); - if (rp) { - rp->nr_cpus++; - return 0; + rp = find_package_by_id(pkgid); + if (!rp) { + rp = rapl_add_package(cpu, pkgid); + if (IS_ERR(rp)) + return PTR_ERR(rp); } - return rapl_add_package(cpu); + cpumask_set_cpu(cpu, &rp->cpumask); + return 0; } static int rapl_cpu_down_prep(unsigned int cpu) { - int phy_package_id; + int pkgid = topology_physical_package_id(cpu); struct rapl_package *rp; int lead_cpu; - phy_package_id = topology_physical_package_id(cpu); - rp = find_package_by_id(phy_package_id); + rp = find_package_by_id(pkgid); if (!rp) return 0; - if (--rp->nr_cpus == 0) { + + cpumask_clear_cpu(cpu, &rp->cpumask); + lead_cpu = cpumask_first(&rp->cpumask); + if (lead_cpu >= nr_cpu_ids) rapl_remove_package(rp); - } else if (cpu == rp->lead_cpu) { - /* choose another active cpu in the package */ - lead_cpu = cpumask_any_but(topology_core_cpumask(cpu), cpu); - if (lead_cpu < nr_cpu_ids) { - rp->lead_cpu = lead_cpu; - } else { - /* should never go here */ - pr_err("no active cpu available for package %d\n", - phy_package_id); - } - } + else if (rp->lead_cpu == cpu) + rp->lead_cpu = lead_cpu; return 0; } From cb43f81b8489dcb87555e16c17453f0a9fa690f2 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Mon, 28 Nov 2016 13:53:11 -0800 Subject: [PATCH 09/12] powercap/intel_rapl: fix and tidy up error handling Commit e1399ba20eee ("powercap / RAPL: handle missing MSRs") added contraint_to_pl() function to return index into an array. But it can potentially return -EINVAL if powercap layer sends an out of range constraint ID. This patch adds sanity check. Unnecessary RAPL domain pointer check is removed since it must be initialized before calling rapl_unit_xlate(). Fixes: e1399ba20eee ("powercap / RAPL: handle missing MSRs") Reported-by: Odzioba, Lukasz Reported-by: Koss, Marcin Signed-off-by: Jacob Pan Signed-off-by: Rafael J. Wysocki --- drivers/powercap/intel_rapl.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/drivers/powercap/intel_rapl.c b/drivers/powercap/intel_rapl.c index 65ed88a28030..f6937d0b742e 100644 --- a/drivers/powercap/intel_rapl.c +++ b/drivers/powercap/intel_rapl.c @@ -429,6 +429,7 @@ static int contraint_to_pl(struct rapl_domain *rd, int cid) return i; } } + pr_err("Cannot find matching power limit for constraint %d\n", cid); return -EINVAL; } @@ -444,6 +445,10 @@ static int set_power_limit(struct powercap_zone *power_zone, int cid, get_online_cpus(); rd = power_zone_to_rapl_domain(power_zone); id = contraint_to_pl(rd, cid); + if (id < 0) { + ret = id; + goto set_exit; + } rp = rd->rp; @@ -483,6 +488,11 @@ static int get_current_power_limit(struct powercap_zone *power_zone, int cid, get_online_cpus(); rd = power_zone_to_rapl_domain(power_zone); id = contraint_to_pl(rd, cid); + if (id < 0) { + ret = id; + goto get_exit; + } + switch (rd->rpl[id].prim_id) { case PL1_ENABLE: prim = POWER_LIMIT1; @@ -499,6 +509,7 @@ static int get_current_power_limit(struct powercap_zone *power_zone, int cid, else *data = val; +get_exit: put_online_cpus(); return ret; @@ -514,6 +525,10 @@ static int set_time_window(struct powercap_zone *power_zone, int cid, get_online_cpus(); rd = power_zone_to_rapl_domain(power_zone); id = contraint_to_pl(rd, cid); + if (id < 0) { + ret = id; + goto set_time_exit; + } switch (rd->rpl[id].prim_id) { case PL1_ENABLE: @@ -525,6 +540,8 @@ static int set_time_window(struct powercap_zone *power_zone, int cid, default: ret = -EINVAL; } + +set_time_exit: put_online_cpus(); return ret; } @@ -539,6 +556,10 @@ static int get_time_window(struct powercap_zone *power_zone, int cid, u64 *data) get_online_cpus(); rd = power_zone_to_rapl_domain(power_zone); id = contraint_to_pl(rd, cid); + if (id < 0) { + ret = id; + goto get_time_exit; + } switch (rd->rpl[id].prim_id) { case PL1_ENABLE: @@ -553,6 +574,8 @@ static int get_time_window(struct powercap_zone *power_zone, int cid, u64 *data) } if (!ret) *data = val; + +get_time_exit: put_online_cpus(); return ret; @@ -694,7 +717,7 @@ static u64 rapl_unit_xlate(struct rapl_domain *rd, enum unit_type type, case ENERGY_UNIT: scale = ENERGY_UNIT_SCALE; /* per domain unit takes precedence */ - if (rd && rd->domain_energy_unit) + if (rd->domain_energy_unit) units = rd->domain_energy_unit; else units = rp->energy_unit; From bd9089162d8002666f734dd18552c7297118fec3 Mon Sep 17 00:00:00 2001 From: Piotr Luc Date: Thu, 13 Oct 2016 17:31:04 +0200 Subject: [PATCH 10/12] powercap / RAPL: Add Knights Mill CPUID Add Knights Mill (KNM) to the list of CPUIDs supported by intel_rapl Signed-off-by: Piotr Luc Reviewed-by: Dave Hansen Signed-off-by: Rafael J. Wysocki --- drivers/powercap/intel_rapl.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/powercap/intel_rapl.c b/drivers/powercap/intel_rapl.c index f6937d0b742e..9a25110c4a46 100644 --- a/drivers/powercap/intel_rapl.c +++ b/drivers/powercap/intel_rapl.c @@ -1167,6 +1167,7 @@ static const struct x86_cpu_id rapl_ids[] __initconst = { RAPL_CPU(INTEL_FAM6_ATOM_DENVERTON, rapl_defaults_core), RAPL_CPU(INTEL_FAM6_XEON_PHI_KNL, rapl_defaults_hsw_server), + RAPL_CPU(INTEL_FAM6_XEON_PHI_KNM, rapl_defaults_hsw_server), {} }; MODULE_DEVICE_TABLE(x86cpu, rapl_ids); From b53f40db59b27b62bc294c30506b02a0cae47e0b Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 2 Dec 2016 11:42:21 -0600 Subject: [PATCH 11/12] x86/suspend: fix false positive KASAN warning on suspend/resume Resuming from a suspend operation is showing a KASAN false positive warning: BUG: KASAN: stack-out-of-bounds in unwind_get_return_address+0x11d/0x130 at addr ffff8803867d7878 Read of size 8 by task pm-suspend/7774 page:ffffea000e19f5c0 count:0 mapcount:0 mapping: (null) index:0x0 flags: 0x2ffff0000000000() page dumped because: kasan: bad access detected CPU: 0 PID: 7774 Comm: pm-suspend Tainted: G B 4.9.0-rc7+ #8 Hardware name: Gigabyte Technology Co., Ltd. Z170X-UD5/Z170X-UD5-CF, BIOS F5 03/07/2016 Call Trace: dump_stack+0x63/0x82 kasan_report_error+0x4b4/0x4e0 ? acpi_hw_read_port+0xd0/0x1ea ? kfree_const+0x22/0x30 ? acpi_hw_validate_io_request+0x1a6/0x1a6 __asan_report_load8_noabort+0x61/0x70 ? unwind_get_return_address+0x11d/0x130 unwind_get_return_address+0x11d/0x130 ? unwind_next_frame+0x97/0xf0 __save_stack_trace+0x92/0x100 save_stack_trace+0x1b/0x20 save_stack+0x46/0xd0 ? save_stack_trace+0x1b/0x20 ? save_stack+0x46/0xd0 ? kasan_kmalloc+0xad/0xe0 ? kasan_slab_alloc+0x12/0x20 ? acpi_hw_read+0x2b6/0x3aa ? acpi_hw_validate_register+0x20b/0x20b ? acpi_hw_write_port+0x72/0xc7 ? acpi_hw_write+0x11f/0x15f ? acpi_hw_read_multiple+0x19f/0x19f ? memcpy+0x45/0x50 ? acpi_hw_write_port+0x72/0xc7 ? acpi_hw_write+0x11f/0x15f ? acpi_hw_read_multiple+0x19f/0x19f ? kasan_unpoison_shadow+0x36/0x50 kasan_kmalloc+0xad/0xe0 kasan_slab_alloc+0x12/0x20 kmem_cache_alloc_trace+0xbc/0x1e0 ? acpi_get_sleep_type_data+0x9a/0x578 acpi_get_sleep_type_data+0x9a/0x578 acpi_hw_legacy_wake_prep+0x88/0x22c ? acpi_hw_legacy_sleep+0x3c7/0x3c7 ? acpi_write_bit_register+0x28d/0x2d3 ? acpi_read_bit_register+0x19b/0x19b acpi_hw_sleep_dispatch+0xb5/0xba acpi_leave_sleep_state_prep+0x17/0x19 acpi_suspend_enter+0x154/0x1e0 ? trace_suspend_resume+0xe8/0xe8 suspend_devices_and_enter+0xb09/0xdb0 ? printk+0xa8/0xd8 ? arch_suspend_enable_irqs+0x20/0x20 ? try_to_freeze_tasks+0x295/0x600 pm_suspend+0x6c9/0x780 ? finish_wait+0x1f0/0x1f0 ? suspend_devices_and_enter+0xdb0/0xdb0 state_store+0xa2/0x120 ? kobj_attr_show+0x60/0x60 kobj_attr_store+0x36/0x70 sysfs_kf_write+0x131/0x200 kernfs_fop_write+0x295/0x3f0 __vfs_write+0xef/0x760 ? handle_mm_fault+0x1346/0x35e0 ? do_iter_readv_writev+0x660/0x660 ? __pmd_alloc+0x310/0x310 ? do_lock_file_wait+0x1e0/0x1e0 ? apparmor_file_permission+0x18/0x20 ? security_file_permission+0x73/0x1c0 ? rw_verify_area+0xbd/0x2b0 vfs_write+0x149/0x4a0 SyS_write+0xd9/0x1c0 ? SyS_read+0x1c0/0x1c0 entry_SYSCALL_64_fastpath+0x1e/0xad Memory state around the buggy address: ffff8803867d7700: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ffff8803867d7780: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 >ffff8803867d7800: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 f4 ^ ffff8803867d7880: f3 f3 f3 f3 00 00 00 00 00 00 00 00 00 00 00 00 ffff8803867d7900: 00 00 00 f1 f1 f1 f1 04 f4 f4 f4 f3 f3 f3 f3 00 KASAN instrumentation poisons the stack when entering a function and unpoisons it when exiting the function. However, in the suspend path, some functions never return, so their stack never gets unpoisoned, resulting in stale KASAN shadow data which can cause later false positive warnings like the one above. Reported-by: Scott Bauer Signed-off-by: Josh Poimboeuf Acked-by: Andrey Ryabinin Acked-by: Pavel Machek Signed-off-by: Rafael J. Wysocki --- arch/x86/kernel/acpi/wakeup_64.S | 9 +++++++++ mm/kasan/kasan.c | 9 ++++++++- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S index 169963f471bb..50b8ed0317a3 100644 --- a/arch/x86/kernel/acpi/wakeup_64.S +++ b/arch/x86/kernel/acpi/wakeup_64.S @@ -109,6 +109,15 @@ ENTRY(do_suspend_lowlevel) movq pt_regs_r14(%rax), %r14 movq pt_regs_r15(%rax), %r15 +#ifdef CONFIG_KASAN + /* + * The suspend path may have poisoned some areas deeper in the stack, + * which we now need to unpoison. + */ + movq %rsp, %rdi + call kasan_unpoison_task_stack_below +#endif + xorl %eax, %eax addq $8, %rsp FRAME_END diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index 70c009741aab..f7dd9595ef43 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -80,7 +80,14 @@ void kasan_unpoison_task_stack(struct task_struct *task) /* Unpoison the stack for the current task beyond a watermark sp value. */ asmlinkage void kasan_unpoison_task_stack_below(const void *watermark) { - __kasan_unpoison_stack(current, watermark); + /* + * Calculate the task stack base address. Avoid using 'current' + * because this function is called by early resume code which hasn't + * yet set up the percpu register (%gs). + */ + void *base = (void *)((unsigned long)watermark & ~(THREAD_SIZE - 1)); + + kasan_unpoison_shadow(base, watermark - base); } /* From 9320f95c0b8f1d074f570385e6855d4554f693e4 Mon Sep 17 00:00:00 2001 From: xing wei Date: Wed, 7 Dec 2016 19:31:16 +0800 Subject: [PATCH 12/12] PM / sleep: Print active wakeup sources when blocking on wakeup_count reads If there are any wakeup events being processed, read operation on /sys/power/wakeup_count will be blocked, so print the names of all active wakeup sources to help to find out who is preventing system suspend from triggering. While at it change pr_info() in pm_print_active_wakeup_sources() to pr_debug() to avoid excessive log noise. Signed-off-by: xing wei [ rjw: Subject & changelog ] Signed-off-by: Rafael J. Wysocki --- drivers/base/power/wakeup.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c index 62e4de2aa8d1..bf9ba26981a5 100644 --- a/drivers/base/power/wakeup.c +++ b/drivers/base/power/wakeup.c @@ -811,7 +811,7 @@ void pm_print_active_wakeup_sources(void) rcu_read_lock(); list_for_each_entry_rcu(ws, &wakeup_sources, entry) { if (ws->active) { - pr_info("active wakeup source: %s\n", ws->name); + pr_debug("active wakeup source: %s\n", ws->name); active = 1; } else if (!active && (!last_activity_ws || @@ -822,7 +822,7 @@ void pm_print_active_wakeup_sources(void) } if (!active && last_activity_ws) - pr_info("last active wakeup source: %s\n", + pr_debug("last active wakeup source: %s\n", last_activity_ws->name); rcu_read_unlock(); } @@ -905,7 +905,7 @@ bool pm_get_wakeup_count(unsigned int *count, bool block) split_counters(&cnt, &inpr); if (inpr == 0 || signal_pending(current)) break; - + pm_print_active_wakeup_sources(); schedule(); } finish_wait(&wakeup_count_wait_queue, &wait);