diff --git a/drivers/cpufreq/cpufreq-dt.c b/drivers/cpufreq/cpufreq-dt.c index fef3c2160691..d83ab94d041a 100644 --- a/drivers/cpufreq/cpufreq-dt.c +++ b/drivers/cpufreq/cpufreq-dt.c @@ -274,6 +274,7 @@ static int cpufreq_init(struct cpufreq_policy *policy) transition_latency = CPUFREQ_ETERNAL; policy->cpuinfo.transition_latency = transition_latency; + policy->dvfs_possible_from_any_cpu = true; return 0; diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index c7ae67d6886d..ea43b147a7fe 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1843,9 +1843,10 @@ EXPORT_SYMBOL(cpufreq_unregister_notifier); * twice in parallel for the same policy and that it will never be called in * parallel with either ->target() or ->target_index() for the same policy. * - * If CPUFREQ_ENTRY_INVALID is returned by the driver's ->fast_switch() - * callback to indicate an error condition, the hardware configuration must be - * preserved. + * Returns the actual frequency set for the CPU. + * + * If 0 is returned by the driver's ->fast_switch() callback to indicate an + * error condition, the hardware configuration must be preserved. */ unsigned int cpufreq_driver_fast_switch(struct cpufreq_policy *policy, unsigned int target_freq) diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c index eed069ecfd5e..58d4f4e1ad6a 100644 --- a/drivers/cpufreq/cpufreq_governor.c +++ b/drivers/cpufreq/cpufreq_governor.c @@ -272,6 +272,9 @@ static void dbs_update_util_handler(struct update_util_data *data, u64 time, struct policy_dbs_info *policy_dbs = cdbs->policy_dbs; u64 delta_ns, lst; + if (!cpufreq_can_do_remote_dvfs(policy_dbs->policy)) + return; + /* * The work may not be allowed to be queued up right now. 
* Possible reasons: diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 04dd5f46803d..0c50637e6bda 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -1746,6 +1746,10 @@ static void intel_pstate_update_util_pid(struct update_util_data *data, struct cpudata *cpu = container_of(data, struct cpudata, update_util); u64 delta_ns = time - cpu->sample.time; + /* Don't allow remote callbacks */ + if (smp_processor_id() != cpu->cpu) + return; + if ((s64)delta_ns < pid_params.sample_rate_ns) return; @@ -1763,6 +1767,10 @@ static void intel_pstate_update_util(struct update_util_data *data, u64 time, struct cpudata *cpu = container_of(data, struct cpudata, update_util); u64 delta_ns; + /* Don't allow remote callbacks */ + if (smp_processor_id() != cpu->cpu) + return; + if (flags & SCHED_CPUFREQ_IOWAIT) { cpu->iowait_boost = int_tofp(1); } else if (cpu->iowait_boost) { diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 5f40522ec98c..537ff842ff73 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -127,6 +127,15 @@ struct cpufreq_policy { */ unsigned int transition_delay_us; + /* + * Remote DVFS flag (Not added to the driver structure as we don't want + * to access another structure from scheduler hotpath). + * + * Should be set if CPUs can do DVFS on behalf of other CPUs from + * different cpufreq policies. + */ + bool dvfs_possible_from_any_cpu; + /* Cached frequency lookup from cpufreq_driver_resolve_freq. 
*/ unsigned int cached_target_freq; int cached_resolved_idx; @@ -562,6 +571,17 @@ struct governor_attr { size_t count); }; +static inline bool cpufreq_can_do_remote_dvfs(struct cpufreq_policy *policy) +{ + /* + * Allow remote callbacks if: + * - dvfs_possible_from_any_cpu flag is set + * - the local and remote CPUs share cpufreq policy + */ + return policy->dvfs_possible_from_any_cpu || + cpumask_test_cpu(smp_processor_id(), policy->cpus); +} + /********************************************************************* * FREQUENCY TABLE HELPERS * *********************************************************************/ diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 45fcf21ad685..9209d83ecdcf 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -52,9 +52,11 @@ struct sugov_policy { struct sugov_cpu { struct update_util_data update_util; struct sugov_policy *sg_policy; + unsigned int cpu; - unsigned long iowait_boost; - unsigned long iowait_boost_max; + bool iowait_boost_pending; + unsigned int iowait_boost; + unsigned int iowait_boost_max; u64 last_update; /* The fields below are only needed when sharing a policy. */ @@ -76,6 +78,26 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time) { s64 delta_ns; + /* + * Since cpufreq_update_util() is called with rq->lock held for + * the @target_cpu, our per-cpu data is fully serialized. + * + * However, drivers cannot in general deal with cross-cpu + * requests, so while get_next_freq() will work, our + * sugov_update_commit() call may not for the fast switching platforms. + * + * Hence stop here for remote requests if they aren't supported + * by the hardware, as calculating the frequency is pointless if + * we cannot in fact act on it. + * + * For the slow switching platforms, the kthread is always scheduled on + * the right set of CPUs and any CPU can find the next frequency and + * schedule the kthread. 
+ */ + if (sg_policy->policy->fast_switch_enabled && + !cpufreq_can_do_remote_dvfs(sg_policy->policy)) + return false; + if (sg_policy->work_in_progress) return false; @@ -106,7 +128,7 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time, if (policy->fast_switch_enabled) { next_freq = cpufreq_driver_fast_switch(policy, next_freq); - if (next_freq == CPUFREQ_ENTRY_INVALID) + if (!next_freq) return; policy->cur = next_freq; @@ -154,12 +176,12 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, return cpufreq_driver_resolve_freq(policy, freq); } -static void sugov_get_util(unsigned long *util, unsigned long *max) +static void sugov_get_util(unsigned long *util, unsigned long *max, int cpu) { - struct rq *rq = this_rq(); + struct rq *rq = cpu_rq(cpu); unsigned long cfs_max; - cfs_max = arch_scale_cpu_capacity(NULL, smp_processor_id()); + cfs_max = arch_scale_cpu_capacity(NULL, cpu); *util = min(rq->cfs.avg.util_avg, cfs_max); *max = cfs_max; @@ -169,30 +191,54 @@ static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, unsigned int flags) { if (flags & SCHED_CPUFREQ_IOWAIT) { - sg_cpu->iowait_boost = sg_cpu->iowait_boost_max; + if (sg_cpu->iowait_boost_pending) + return; + + sg_cpu->iowait_boost_pending = true; + + if (sg_cpu->iowait_boost) { + sg_cpu->iowait_boost <<= 1; + if (sg_cpu->iowait_boost > sg_cpu->iowait_boost_max) + sg_cpu->iowait_boost = sg_cpu->iowait_boost_max; + } else { + sg_cpu->iowait_boost = sg_cpu->sg_policy->policy->min; + } } else if (sg_cpu->iowait_boost) { s64 delta_ns = time - sg_cpu->last_update; /* Clear iowait_boost if the CPU appears to have been idle. 
*/ - if (delta_ns > TICK_NSEC) + if (delta_ns > TICK_NSEC) { sg_cpu->iowait_boost = 0; + sg_cpu->iowait_boost_pending = false; + } } } static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util, unsigned long *max) { - unsigned long boost_util = sg_cpu->iowait_boost; - unsigned long boost_max = sg_cpu->iowait_boost_max; + unsigned int boost_util, boost_max; - if (!boost_util) + if (!sg_cpu->iowait_boost) return; + if (sg_cpu->iowait_boost_pending) { + sg_cpu->iowait_boost_pending = false; + } else { + sg_cpu->iowait_boost >>= 1; + if (sg_cpu->iowait_boost < sg_cpu->sg_policy->policy->min) { + sg_cpu->iowait_boost = 0; + return; + } + } + + boost_util = sg_cpu->iowait_boost; + boost_max = sg_cpu->iowait_boost_max; + if (*util * boost_max < *max * boost_util) { *util = boost_util; *max = boost_max; } - sg_cpu->iowait_boost >>= 1; } #ifdef CONFIG_NO_HZ_COMMON @@ -229,7 +275,7 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, if (flags & SCHED_CPUFREQ_RT_DL) { next_f = policy->cpuinfo.max_freq; } else { - sugov_get_util(&util, &max); + sugov_get_util(&util, &max, sg_cpu->cpu); sugov_iowait_boost(sg_cpu, &util, &max); next_f = get_next_freq(sg_policy, util, max); /* @@ -264,6 +310,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) delta_ns = time - j_sg_cpu->last_update; if (delta_ns > TICK_NSEC) { j_sg_cpu->iowait_boost = 0; + j_sg_cpu->iowait_boost_pending = false; continue; } if (j_sg_cpu->flags & SCHED_CPUFREQ_RT_DL) @@ -290,7 +337,7 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time, unsigned long util, max; unsigned int next_f; - sugov_get_util(&util, &max); + sugov_get_util(&util, &max, sg_cpu->cpu); raw_spin_lock(&sg_policy->update_lock); @@ -445,7 +492,11 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) } sg_policy->thread = thread; - kthread_bind_mask(thread, policy->related_cpus); + + /* Kthread is bound to all CPUs by default */ + if 
(!policy->dvfs_possible_from_any_cpu) + kthread_bind_mask(thread, policy->related_cpus); + init_irq_work(&sg_policy->irq_work, sugov_irq_work); mutex_init(&sg_policy->work_lock); @@ -663,6 +714,11 @@ struct cpufreq_governor *cpufreq_default_governor(void) static int __init sugov_register(void) { + int cpu; + + for_each_possible_cpu(cpu) + per_cpu(sugov_cpu, cpu).cpu = cpu; + return cpufreq_register_governor(&schedutil_gov); } fs_initcall(sugov_register); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 755bd3f1a1a9..5c3bf4bd0327 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1136,7 +1136,7 @@ static void update_curr_dl(struct rq *rq) } /* kick cpufreq (see the comment in kernel/sched/sched.h). */ - cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_DL); + cpufreq_update_util(rq, SCHED_CPUFREQ_DL); schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec)); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c95880e216f6..d378d02fdfcb 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3278,7 +3278,9 @@ static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {} static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) { - if (&this_rq()->cfs == cfs_rq) { + struct rq *rq = rq_of(cfs_rq); + + if (&rq->cfs == cfs_rq) { /* * There are a few boundary cases this might miss but it should * get called often enough that that should (hopefully) not be @@ -3295,7 +3297,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) * * See cpu_util(). */ - cpufreq_update_util(rq_of(cfs_rq), 0); + cpufreq_update_util(rq, 0); } } @@ -4875,7 +4877,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) * passed. 
*/ if (p->in_iowait) - cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_IOWAIT); + cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT); for_each_sched_entity(se) { if (se->on_rq) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 45caf937ef90..0af5ca9e3e3f 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -970,7 +970,7 @@ static void update_curr_rt(struct rq *rq) return; /* Kick cpufreq (see the comment in kernel/sched/sched.h). */ - cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_RT); + cpufreq_update_util(rq, SCHED_CPUFREQ_RT); schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec)); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index eeef1a3086d1..aa9d5b87b4f8 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2070,19 +2070,13 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) { struct update_util_data *data; - data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data)); + data = rcu_dereference_sched(*per_cpu_ptr(&cpufreq_update_util_data, + cpu_of(rq))); if (data) data->func(data, rq_clock(rq), flags); } - -static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags) -{ - if (cpu_of(rq) == smp_processor_id()) - cpufreq_update_util(rq, flags); -} #else static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} -static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags) {} #endif /* CONFIG_CPU_FREQ */ #ifdef arch_scale_freq_capacity