sched: provide per cpu-cgroup option to notify on migrations
On systems where CPUs may run asynchronously, task migrations
between CPUs running at grossly different speeds can cause
problems; for example, a task migrated from a fast CPU to a slow
one may see a sudden drop in performance until the destination
CPU's frequency catches up with its load.
This change provides a mechanism to notify interested kernel
subsystems when a task in a particular cgroup migrates to a
different CPU. Subsystems such as cpufreq can register with this
notifier and take appropriate action whenever such a task is
migrated.
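
As a rough sketch of how a client could hook into the chain (the
names below are made up for the example and are not part of this
patch): the notifier is invoked with the destination CPU as the
notifier value and the source CPU cast into the data pointer, and
since migration_notifier_head is not exported the listener is
assumed to be built-in code registering from an initcall.

  #include <linux/init.h>
  #include <linux/notifier.h>
  #include <linux/printk.h>
  #include <linux/sched.h>	/* migration_notifier_head */

  /*
   * Example built-in listener. The chain is called with the
   * destination CPU as the notifier value and the source CPU cast
   * into the data pointer, matching the atomic_notifier_call_chain()
   * calls added by this patch.
   */
  static int example_migration_notify(struct notifier_block *nb,
  				      unsigned long dest_cpu, void *data)
  {
  	int src_cpu = (int)(long)data;

  	pr_debug("notify_on_migrate: CPU%d -> CPU%lu\n",
  		 src_cpu, dest_cpu);
  	/* a cpufreq governor could boost dest_cpu here */

  	return NOTIFY_OK;
  }

  static struct notifier_block example_migration_nb = {
  	.notifier_call = example_migration_notify,
  };

  static int __init example_migration_notifier_init(void)
  {
  	return atomic_notifier_chain_register(&migration_notifier_head,
  					      &example_migration_nb);
  }
  late_initcall(example_migration_notifier_init);
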
The cgroup attribute that controls this behavior is
"notify_on_migrate".
Change-Id: Ie1868249e53ef901b89c837fdc33b0ad0c0a4590
Signed-off-by: Steve Muckle <smuckle@codeaurora.org>
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 7ddd804..bd56b4b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2734,6 +2734,8 @@
#endif /* CONFIG_SMP */
+extern struct atomic_notifier_head migration_notifier_head;
+
extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask);
extern long sched_getaffinity(pid_t pid, struct cpumask *mask);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index cfa614e..971b76e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -87,6 +87,8 @@
#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>
+ATOMIC_NOTIFIER_HEAD(migration_notifier_head);
+
void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
{
unsigned long delta;
@@ -1581,15 +1583,17 @@
try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
{
unsigned long flags;
- int cpu, success = 0;
+ int cpu, src_cpu, success = 0;
smp_wmb();
raw_spin_lock_irqsave(&p->pi_lock, flags);
+ src_cpu = task_cpu(p);
+ cpu = src_cpu;
+
if (!(p->state & state))
goto out;
success = 1; /* we're going to change ->state */
- cpu = task_cpu(p);
if (p->on_rq && ttwu_remote(p, wake_flags))
goto stat;
@@ -1626,7 +1630,7 @@
p->sched_class->task_waking(p);
cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
- if (task_cpu(p) != cpu) {
+ if (src_cpu != cpu) {
wake_flags |= WF_MIGRATED;
set_task_cpu(p, cpu);
}
@@ -1638,6 +1642,9 @@
out:
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ if (src_cpu != cpu && task_notify_on_migrate(p))
+ atomic_notifier_call_chain(&migration_notifier_head,
+ cpu, (void *)src_cpu);
return success;
}
@@ -5196,6 +5203,7 @@
static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
{
struct rq *rq_dest, *rq_src;
+ bool moved = false;
int ret = 0;
if (unlikely(!cpu_active(dest_cpu)))
@@ -5222,12 +5230,16 @@
set_task_cpu(p, dest_cpu);
enqueue_task(rq_dest, p, 0);
check_preempt_curr(rq_dest, p, 0);
+ moved = true;
}
done:
ret = 1;
fail:
double_rq_unlock(rq_src, rq_dest);
raw_spin_unlock(&p->pi_lock);
+ if (moved && task_notify_on_migrate(p))
+ atomic_notifier_call_chain(&migration_notifier_head,
+ dest_cpu, (void *)src_cpu);
return ret;
}
@@ -7857,6 +7869,24 @@
sched_move_task(task);
}
+static u64 cpu_notify_on_migrate_read_u64(struct cgroup *cgrp,
+ struct cftype *cft)
+{
+ struct task_group *tg = cgroup_tg(cgrp);
+
+ return tg->notify_on_migrate;
+}
+
+static int cpu_notify_on_migrate_write_u64(struct cgroup *cgrp,
+ struct cftype *cft, u64 notify)
+{
+ struct task_group *tg = cgroup_tg(cgrp);
+
+ tg->notify_on_migrate = (notify > 0);
+
+ return 0;
+}
+
#ifdef CONFIG_FAIR_GROUP_SCHED
static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
u64 shareval)
@@ -8128,6 +8158,11 @@
#endif /* CONFIG_RT_GROUP_SCHED */
static struct cftype cpu_files[] = {
+ {
+ .name = "notify_on_migrate",
+ .read_u64 = cpu_notify_on_migrate_read_u64,
+ .write_u64 = cpu_notify_on_migrate_write_u64,
+ },
#ifdef CONFIG_FAIR_GROUP_SCHED
{
.name = "shares",
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b175073..d71e6e9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3103,6 +3103,8 @@
unsigned int loop_max;
};
+static DEFINE_PER_CPU(bool, dbs_boost_needed);
+
/*
* move_task - move a task from one runqueue to another runqueue.
* Both runqueues must be locked.
@@ -3113,6 +3115,8 @@
set_task_cpu(p, env->dst_cpu);
activate_task(env->dst_rq, p, 0);
check_preempt_curr(env->dst_rq, p, 0);
+ if (task_notify_on_migrate(p))
+ per_cpu(dbs_boost_needed, env->dst_cpu) = true;
}
/*
@@ -4543,9 +4547,15 @@
*/
sd->nr_balance_failed = sd->cache_nice_tries+1;
}
- } else
+ } else {
sd->nr_balance_failed = 0;
-
+ if (per_cpu(dbs_boost_needed, this_cpu)) {
+ per_cpu(dbs_boost_needed, this_cpu) = false;
+ atomic_notifier_call_chain(&migration_notifier_head,
+ this_cpu,
+ (void *)cpu_of(busiest));
+ }
+ }
if (likely(!active_balance)) {
/* We were unbalanced, so reset the balancing interval */
sd->balance_interval = sd->min_interval;
@@ -4700,6 +4710,12 @@
out_unlock:
busiest_rq->active_balance = 0;
raw_spin_unlock_irq(&busiest_rq->lock);
+ if (per_cpu(dbs_boost_needed, target_cpu)) {
+ per_cpu(dbs_boost_needed, target_cpu) = false;
+ atomic_notifier_call_chain(&migration_notifier_head,
+ target_cpu,
+ (void *)cpu_of(busiest_rq));
+ }
return 0;
}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 8f32475..f8317df 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1604,6 +1604,7 @@
struct task_struct *next_task;
struct rq *lowest_rq;
int ret = 0;
+ bool moved = false;
if (!rq->rt.overloaded)
return 0;
@@ -1673,6 +1674,7 @@
deactivate_task(rq, next_task, 0);
set_task_cpu(next_task, lowest_rq->cpu);
+ moved = true;
activate_task(lowest_rq, next_task, 0);
ret = 1;
@@ -1683,6 +1685,11 @@
out:
put_task_struct(next_task);
+ if (moved && task_notify_on_migrate(next_task))
+ atomic_notifier_call_chain(&migration_notifier_head,
+ cpu_of(lowest_rq),
+ (void *)cpu_of(rq));
+
return ret;
}
@@ -1696,8 +1703,10 @@
static int pull_rt_task(struct rq *this_rq)
{
int this_cpu = this_rq->cpu, ret = 0, cpu;
- struct task_struct *p;
+ struct task_struct *p = NULL;
struct rq *src_rq;
+ bool moved = false;
+ int src_cpu = 0;
if (likely(!rt_overloaded(this_rq)))
return 0;
@@ -1758,6 +1767,10 @@
deactivate_task(src_rq, p, 0);
set_task_cpu(p, this_cpu);
activate_task(this_rq, p, 0);
+
+ moved = true;
+ src_cpu = cpu_of(src_rq);
+
/*
* We continue with the search, just in
* case there's an even higher prio task
@@ -1769,6 +1782,11 @@
double_unlock_balance(this_rq, src_rq);
}
+ if (moved && task_notify_on_migrate(p))
+ atomic_notifier_call_chain(&migration_notifier_head,
+ this_cpu,
+ (void *)src_cpu);
+
return ret;
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 74f45e1..bec78f5 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -104,6 +104,8 @@
struct task_group {
struct cgroup_subsys_state css;
+ bool notify_on_migrate;
+
#ifdef CONFIG_FAIR_GROUP_SCHED
/* schedulable entities of this group on each cpu */
struct sched_entity **se;
@@ -554,6 +556,11 @@
return autogroup_task_group(p, tg);
}
+static inline bool task_notify_on_migrate(struct task_struct *p)
+{
+ return task_group(p)->notify_on_migrate;
+}
+
/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
{
@@ -579,7 +586,10 @@
{
return NULL;
}
-
+static inline bool task_notify_on_migrate(struct task_struct *p)
+{
+ return false;
+}
#endif /* CONFIG_CGROUP_SCHED */
static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)