Blame - kernel/sched/fair.c - android_kernel_oneplus_msm8996

blob: 35661b8afb4e042b16ad79fc9b718df74089c94d [file] [log] [blame]

Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1	/*
				2	* Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
				3	*
				4	* Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
				5	*
				6	* Interactivity improvements by Mike Galbraith
				7	* (C) 2007 Mike Galbraith <efault@gmx.de>
				8	*
				9	* Various enhancements by Dmitry Adamushko.
				10	* (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
				11	*
				12	* Group scheduling enhancements by Srivatsa Vaddagiri
				13	* Copyright IBM Corporation, 2007
				14	* Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
				15	*
				16	* Scaled math optimizations by Thomas Gleixner
				17	* Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
Peter Zijlstra	2180508	2007-08-25 18:41:53 +0200	[diff] [blame]	18	*
				19	* Adaptive scheduling granularity, math enhancements by Peter Zijlstra
				20	* Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	21	*/
				22
Arjan van de Ven	9745512	2008-01-25 21:08:34 +0100	[diff] [blame]	23	#include <linux/latencytop.h>
Christian Ehrhardt	1983a92	2009-11-30 12:16:47 +0100	[diff] [blame]	24	#include <linux/sched.h>
Sisir Koppaka	3436ae1	2011-03-26 18:22:55 +0530	[diff] [blame]	25	#include <linux/cpumask.h>
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	26	#include <linux/slab.h>
				27	#include <linux/profile.h>
				28	#include <linux/interrupt.h>
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	29	#include <linux/mempolicy.h>
Mel Gorman	e14808b	2012-11-19 10:59:15 +0000	[diff] [blame]	30	#include <linux/migrate.h>
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	31	#include <linux/task_work.h>
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	32
				33	#include <trace/events/sched.h>
				34
				35	#include "sched.h"
Arjan van de Ven	9745512	2008-01-25 21:08:34 +0100	[diff] [blame]	36
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	37	/*
Peter Zijlstra	2180508	2007-08-25 18:41:53 +0200	[diff] [blame]	38	* Targeted preemption latency for CPU-bound tasks:
Takuya Yoshikawa	864616e	2010-10-14 16:09:13 +0900	[diff] [blame]	39	* (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	40	*
Peter Zijlstra	2180508	2007-08-25 18:41:53 +0200	[diff] [blame]	41	* NOTE: this latency value is not the same as the concept of
Ingo Molnar	d274a4c	2007-10-15 17:00:14 +0200	[diff] [blame]	42	* 'timeslice length' - timeslices in CFS are of variable length
				43	* and have no persistent notion like in traditional, time-slice
				44	* based scheduling concepts.
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	45	*
Ingo Molnar	d274a4c	2007-10-15 17:00:14 +0200	[diff] [blame]	46	* (to see the precise effective timeslice length of your workload,
				47	* run vmstat and monitor the context-switches (cs) field)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	48	*/
/* Value in effect, and the unscaled base it is derived from in update_sysctl(). */
unsigned int sysctl_sched_latency = 6000000U;
unsigned int normalized_sysctl_sched_latency = 6000000U;
Ingo Molnar	2bd8e6d	2007-10-15 17:00:02 +0200	[diff] [blame]	51
				52	/*
Christian Ehrhardt	1983a92	2009-11-30 12:16:47 +0100	[diff] [blame]	53	* The initial- and re-scaling of tunables is configurable
				54	* (default: SCHED_TUNABLESCALING_LOG, i.e. *(1+ilog(ncpus)))
				55	*
				56	* Options are:
				57	* SCHED_TUNABLESCALING_NONE - unscaled, always *1
				58	* SCHED_TUNABLESCALING_LOG - scaled logarithmically, *(1+ilog(ncpus))
				59	* SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
				60	*/
				61	enum sched_tunable_scaling sysctl_sched_tunable_scaling
				62	= SCHED_TUNABLESCALING_LOG;
				63
				64	/*
Peter Zijlstra	b2be5e9	2007-11-09 22:39:37 +0100	[diff] [blame]	65	* Minimal preemption granularity for CPU-bound tasks:
Takuya Yoshikawa	864616e	2010-10-14 16:09:13 +0900	[diff] [blame]	66	* (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
Peter Zijlstra	b2be5e9	2007-11-09 22:39:37 +0100	[diff] [blame]	67	*/
/* Value in effect, and the unscaled base it is derived from in update_sysctl(). */
unsigned int sysctl_sched_min_granularity = 750000U;
unsigned int normalized_sysctl_sched_min_granularity = 750000U;
Peter Zijlstra	b2be5e9	2007-11-09 22:39:37 +0100	[diff] [blame]	70
				71	/*
				72	* is kept at sysctl_sched_latency / sysctl_sched_min_granularity
				73	*/
Ingo Molnar	0bf377b	2010-09-12 08:14:52 +0200	[diff] [blame]	74	static unsigned int sched_nr_latency = 8;
Peter Zijlstra	b2be5e9	2007-11-09 22:39:37 +0100	[diff] [blame]	75
				76	/*
Mike Galbraith	2bba22c	2009-09-09 15:41:37 +0200	[diff] [blame]	77	* After fork, child runs first. If set to 0 (default) then
Ingo Molnar	2bd8e6d	2007-10-15 17:00:02 +0200	[diff] [blame]	78	* parent will (try to) run first.
				79	*/
Mike Galbraith	2bba22c	2009-09-09 15:41:37 +0200	[diff] [blame]	80	unsigned int sysctl_sched_child_runs_first __read_mostly;
Peter Zijlstra	2180508	2007-08-25 18:41:53 +0200	[diff] [blame]	81
				82	/*
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	83	* SCHED_OTHER wake-up granularity.
Mike Galbraith	172e082	2009-09-09 15:41:37 +0200	[diff] [blame]	84	* (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	85	*
				86	* This option delays the preemption effects of decoupled workloads
				87	* and reduces their over-scheduling. Synchronous workloads will still
				88	* have immediate wakeup/sleep latencies.
				89	*/
/* Value in effect, and the unscaled base it is derived from in update_sysctl(). */
unsigned int sysctl_sched_wakeup_granularity = 1000000U;
unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000U;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	92
Ingo Molnar	da84d96	2007-10-15 17:00:18 +0200	[diff] [blame]	93	const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
				94
Paul Turner	a7a4f8a	2010-11-15 15:47:06 -0800	[diff] [blame]	95	/*
				96	* The exponential sliding window over which load is averaged for shares
				97	* distribution.
				98	* (default: 10msec)
				99	*/
				100	unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
				101
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	102	#ifdef CONFIG_CFS_BANDWIDTH
				103	/*
				104	* Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
				105	* each time a cfs_rq requests quota.
				106	*
				107	* Note: in the case that the slice exceeds the runtime remaining (either due
				108	* to consumption or the quota being specified to be smaller than the slice)
				109	* we will always only issue the remaining available time.
				110	*
				111	* default: 5 msec, units: microseconds
				112	*/
				113	unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
				114	#endif
				115
Paul Gortmaker	8527632	2013-04-19 15:10:50 -0400	[diff] [blame]	116	static inline void update_load_add(struct load_weight *lw, unsigned long inc)
				117	{
				118	lw->weight += inc;
				119	lw->inv_weight = 0;
				120	}
				121
				122	static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
				123	{
				124	lw->weight -= dec;
				125	lw->inv_weight = 0;
				126	}
				127
				128	static inline void update_load_set(struct load_weight *lw, unsigned long w)
				129	{
				130	lw->weight = w;
				131	lw->inv_weight = 0;
				132	}
				133
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	134	/*
				135	* Increase the granularity value when there are more CPUs,
				136	* because with more CPUs the 'effective latency' as visible
				137	* to users decreases. But the relationship is not linear,
				138	* so pick a second-best guess by going with the log2 of the
				139	* number of CPUs.
				140	*
				141	* This idea comes from the SD scheduler of Con Kolivas:
				142	*/
				143	static int get_update_sysctl_factor(void)
				144	{
				145	unsigned int cpus = min_t(int, num_online_cpus(), 8);
				146	unsigned int factor;
				147
				148	switch (sysctl_sched_tunable_scaling) {
				149	case SCHED_TUNABLESCALING_NONE:
				150	factor = 1;
				151	break;
				152	case SCHED_TUNABLESCALING_LINEAR:
				153	factor = cpus;
				154	break;
				155	case SCHED_TUNABLESCALING_LOG:
				156	default:
				157	factor = 1 + ilog2(cpus);
				158	break;
				159	}
				160
				161	return factor;
				162	}
				163
				164	static void update_sysctl(void)
				165	{
				166	unsigned int factor = get_update_sysctl_factor();
				167
				168	#define SET_SYSCTL(name) \
				169	(sysctl_##name = (factor) * normalized_sysctl_##name)
				170	SET_SYSCTL(sched_min_granularity);
				171	SET_SYSCTL(sched_latency);
				172	SET_SYSCTL(sched_wakeup_granularity);
				173	#undef SET_SYSCTL
				174	}
				175
				176	void sched_init_granularity(void)
				177	{
				178	update_sysctl();
				179	}
				180
				181	#if BITS_PER_LONG == 32
				182	# define WMULT_CONST (~0UL)
				183	#else
				184	# define WMULT_CONST (1UL << 32)
				185	#endif
				186
				187	#define WMULT_SHIFT 32
				188
				189	/*
				190	* Shift right and round:
				191	*/
				192	#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
				193
				194	/*
				195	* delta *= weight / lw
				196	*/
				197	static unsigned long
				198	calc_delta_mine(unsigned long delta_exec, unsigned long weight,
				199	struct load_weight *lw)
				200	{
				201	u64 tmp;
				202
				203	/*
				204	* weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
				205	* entities since MIN_SHARES = 2. Treat weight as 1 if less than
				206	* 2^SCHED_LOAD_RESOLUTION.
				207	*/
				208	if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
				209	tmp = (u64)delta_exec * scale_load_down(weight);
				210	else
				211	tmp = (u64)delta_exec;
				212
				213	if (!lw->inv_weight) {
				214	unsigned long w = scale_load_down(lw->weight);
				215
				216	if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
				217	lw->inv_weight = 1;
				218	else if (unlikely(!w))
				219	lw->inv_weight = WMULT_CONST;
				220	else
				221	lw->inv_weight = WMULT_CONST / w;
				222	}
				223
				224	/*
				225	* Check whether we'd overflow the 64-bit multiplication:
				226	*/
				227	if (unlikely(tmp > WMULT_CONST))
				228	tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
				229	WMULT_SHIFT/2);
				230	else
				231	tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
				232
				233	return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
				234	}
				235
				236
				237	const struct sched_class fair_sched_class;
Peter Zijlstra	a4c2f00	2008-10-17 19:27:03 +0200	[diff] [blame]	238
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	239	/**************************************************************
				240	* CFS operations on generic schedulable entities:
				241	*/
				242
				243	#ifdef CONFIG_FAIR_GROUP_SCHED
				244
				245	/* cpu runqueue to which this cfs_rq is attached */
				246	static inline struct rq rq_of(struct cfs_rq cfs_rq)
				247	{
				248	return cfs_rq->rq;
				249	}
				250
/*
 * An entity is a task if it doesn't "own" a runqueue.
 * The argument is parenthesized so any pointer expression may be passed.
 */
#define entity_is_task(se)	(!(se)->my_q)
				253
Peter Zijlstra	8f48894	2009-07-24 12:25:30 +0200	[diff] [blame]	254	static inline struct task_struct task_of(struct sched_entity se)
				255	{
				256	#ifdef CONFIG_SCHED_DEBUG
				257	WARN_ON_ONCE(!entity_is_task(se));
				258	#endif
				259	return container_of(se, struct task_struct, se);
				260	}
				261
Peter Zijlstra	b758149	2008-04-19 19:45:00 +0200	[diff] [blame]	262	/* Walk up scheduling entities hierarchy */
				263	#define for_each_sched_entity(se) \
				264	for (; se; se = se->parent)
				265
				266	static inline struct cfs_rq task_cfs_rq(struct task_struct p)
				267	{
				268	return p->se.cfs_rq;
				269	}
				270
				271	/* runqueue on which this entity is (to be) queued */
				272	static inline struct cfs_rq cfs_rq_of(struct sched_entity se)
				273	{
				274	return se->cfs_rq;
				275	}
				276
				277	/* runqueue "owned" by this group */
				278	static inline struct cfs_rq group_cfs_rq(struct sched_entity grp)
				279	{
				280	return grp->my_q;
				281	}
				282
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	283	static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
				284	int force_update);
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	285
Peter Zijlstra	3d4b47b	2010-11-15 15:47:01 -0800	[diff] [blame]	286	static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
				287	{
				288	if (!cfs_rq->on_list) {
Paul Turner	67e8625	2010-11-15 15:47:05 -0800	[diff] [blame]	289	/*
				290	* Ensure we either appear before our parent (if already
				291	* enqueued) or force our parent to appear after us when it is
				292	* enqueued. The fact that we always enqueue bottom-up
				293	* reduces this to two cases.
				294	*/
				295	if (cfs_rq->tg->parent &&
				296	cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
				297	list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
Peter Zijlstra	3d4b47b	2010-11-15 15:47:01 -0800	[diff] [blame]	298	&rq_of(cfs_rq)->leaf_cfs_rq_list);
Paul Turner	67e8625	2010-11-15 15:47:05 -0800	[diff] [blame]	299	} else {
				300	list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
				301	&rq_of(cfs_rq)->leaf_cfs_rq_list);
				302	}
Peter Zijlstra	3d4b47b	2010-11-15 15:47:01 -0800	[diff] [blame]	303
				304	cfs_rq->on_list = 1;
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	305	/* We should have no load, but we need to update last_decay. */
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	306	update_cfs_rq_blocked_load(cfs_rq, 0);
Peter Zijlstra	3d4b47b	2010-11-15 15:47:01 -0800	[diff] [blame]	307	}
				308	}
				309
				310	static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
				311	{
				312	if (cfs_rq->on_list) {
				313	list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
				314	cfs_rq->on_list = 0;
				315	}
				316	}
				317
/* Iterate through all leaf cfs_rq's on a runqueue */
#define for_each_leaf_cfs_rq(rq, cfs_rq) \
	list_for_each_entry_rcu(cfs_rq, &(rq)->leaf_cfs_rq_list, leaf_cfs_rq_list)
				321
				322	/* Do the two (enqueued) entities belong to the same group ? */
				323	static inline int
				324	is_same_group(struct sched_entity se, struct sched_entity pse)
				325	{
				326	if (se->cfs_rq == pse->cfs_rq)
				327	return 1;
				328
				329	return 0;
				330	}
				331
				332	static inline struct sched_entity parent_entity(struct sched_entity se)
				333	{
				334	return se->parent;
				335	}
				336
Peter Zijlstra	464b752	2008-10-24 11:06:15 +0200	[diff] [blame]	337	/* return depth at which a sched entity is present in the hierarchy */
				338	static inline int depth_se(struct sched_entity *se)
				339	{
				340	int depth = 0;
				341
				342	for_each_sched_entity(se)
				343	depth++;
				344
				345	return depth;
				346	}
				347
				348	static void
				349	find_matching_se(struct sched_entity se, struct sched_entity pse)
				350	{
				351	int se_depth, pse_depth;
				352
				353	/*
				354	* preemption test can be made between sibling entities who are in the
				355	* same cfs_rq i.e who have a common parent. Walk up the hierarchy of
				356	* both tasks until we find their ancestors who are siblings of common
				357	* parent.
				358	*/
				359
				360	/* First walk up until both entities are at same depth */
				361	se_depth = depth_se(*se);
				362	pse_depth = depth_se(*pse);
				363
				364	while (se_depth > pse_depth) {
				365	se_depth--;
				366	se = parent_entity(se);
				367	}
				368
				369	while (pse_depth > se_depth) {
				370	pse_depth--;
				371	pse = parent_entity(pse);
				372	}
				373
				374	while (!is_same_group(se, pse)) {
				375	se = parent_entity(se);
				376	pse = parent_entity(pse);
				377	}
				378	}
				379
Peter Zijlstra	8f48894	2009-07-24 12:25:30 +0200	[diff] [blame]	380	#else /* !CONFIG_FAIR_GROUP_SCHED */
				381
				382	static inline struct task_struct task_of(struct sched_entity se)
				383	{
				384	return container_of(se, struct task_struct, se);
				385	}
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	386
				387	static inline struct rq rq_of(struct cfs_rq cfs_rq)
				388	{
				389	return container_of(cfs_rq, struct rq, cfs);
				390	}
				391
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	392	#define entity_is_task(se) 1
				393
Peter Zijlstra	b758149	2008-04-19 19:45:00 +0200	[diff] [blame]	394	#define for_each_sched_entity(se) \
				395	for (; se; se = NULL)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	396
Peter Zijlstra	b758149	2008-04-19 19:45:00 +0200	[diff] [blame]	397	static inline struct cfs_rq task_cfs_rq(struct task_struct p)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	398	{
Peter Zijlstra	b758149	2008-04-19 19:45:00 +0200	[diff] [blame]	399	return &task_rq(p)->cfs;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	400	}
				401
Peter Zijlstra	b758149	2008-04-19 19:45:00 +0200	[diff] [blame]	402	static inline struct cfs_rq cfs_rq_of(struct sched_entity se)
				403	{
				404	struct task_struct *p = task_of(se);
				405	struct rq *rq = task_rq(p);
				406
				407	return &rq->cfs;
				408	}
				409
				410	/* runqueue "owned" by this group */
				411	static inline struct cfs_rq group_cfs_rq(struct sched_entity grp)
				412	{
				413	return NULL;
				414	}
				415
/* Nothing to do when CONFIG_FAIR_GROUP_SCHED is off. */
static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) { }
				419
				420	static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
				421	{
				422	}
				423
/* Visit the single cfs_rq embedded in @rq exactly once. */
#define for_each_leaf_cfs_rq(rq, cfs_rq) \
		for (cfs_rq = &(rq)->cfs; cfs_rq; cfs_rq = NULL)
				426
				427	static inline int
				428	is_same_group(struct sched_entity se, struct sched_entity pse)
				429	{
				430	return 1;
				431	}
				432
				433	static inline struct sched_entity parent_entity(struct sched_entity se)
				434	{
				435	return NULL;
				436	}
				437
/* No-op when CONFIG_FAIR_GROUP_SCHED is off: *se and *pse are left as-is. */
static inline void
find_matching_se(struct sched_entity **se, struct sched_entity **pse)
{
}
				442
Peter Zijlstra	b758149	2008-04-19 19:45:00 +0200	[diff] [blame]	443	#endif /* CONFIG_FAIR_GROUP_SCHED */
				444
Peter Zijlstra	6c16a6d	2012-03-21 13:07:16 -0700	[diff] [blame]	445	static __always_inline
				446	void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	447
				448	/**************************************************************
				449	* Scheduling class tree data structure manipulation methods:
				450	*/
				451
Andrei Epure	1bf0823	2013-03-12 21:12:24 +0200	[diff] [blame]	452	static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
Peter Zijlstra	02e0431	2007-10-15 17:00:07 +0200	[diff] [blame]	453	{
Andrei Epure	1bf0823	2013-03-12 21:12:24 +0200	[diff] [blame]	454	s64 delta = (s64)(vruntime - max_vruntime);
Peter Zijlstra	368059a	2007-10-15 17:00:11 +0200	[diff] [blame]	455	if (delta > 0)
Andrei Epure	1bf0823	2013-03-12 21:12:24 +0200	[diff] [blame]	456	max_vruntime = vruntime;
Peter Zijlstra	02e0431	2007-10-15 17:00:07 +0200	[diff] [blame]	457
Andrei Epure	1bf0823	2013-03-12 21:12:24 +0200	[diff] [blame]	458	return max_vruntime;
Peter Zijlstra	02e0431	2007-10-15 17:00:07 +0200	[diff] [blame]	459	}
				460
Ingo Molnar	0702e3e	2007-10-15 17:00:14 +0200	[diff] [blame]	461	static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
Peter Zijlstra	b0ffd24	2007-10-15 17:00:12 +0200	[diff] [blame]	462	{
				463	s64 delta = (s64)(vruntime - min_vruntime);
				464	if (delta < 0)
				465	min_vruntime = vruntime;
				466
				467	return min_vruntime;
				468	}
				469
Fabio Checconi	54fdc58	2009-07-16 12:32:27 +0200	[diff] [blame]	470	static inline int entity_before(struct sched_entity *a,
				471	struct sched_entity *b)
				472	{
				473	return (s64)(a->vruntime - b->vruntime) < 0;
				474	}
				475
Peter Zijlstra	1af5f73	2008-10-24 11:06:13 +0200	[diff] [blame]	476	static void update_min_vruntime(struct cfs_rq *cfs_rq)
				477	{
				478	u64 vruntime = cfs_rq->min_vruntime;
				479
				480	if (cfs_rq->curr)
				481	vruntime = cfs_rq->curr->vruntime;
				482
				483	if (cfs_rq->rb_leftmost) {
				484	struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
				485	struct sched_entity,
				486	run_node);
				487
Peter Zijlstra	e17036d	2009-01-15 14:53:39 +0100	[diff] [blame]	488	if (!cfs_rq->curr)
Peter Zijlstra	1af5f73	2008-10-24 11:06:13 +0200	[diff] [blame]	489	vruntime = se->vruntime;
				490	else
				491	vruntime = min_vruntime(vruntime, se->vruntime);
				492	}
				493
Andrei Epure	1bf0823	2013-03-12 21:12:24 +0200	[diff] [blame]	494	/* ensure we never gain time by being placed backwards. */
Peter Zijlstra	1af5f73	2008-10-24 11:06:13 +0200	[diff] [blame]	495	cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
Peter Zijlstra	3fe1698	2011-04-05 17:23:48 +0200	[diff] [blame]	496	#ifndef CONFIG_64BIT
				497	smp_wmb();
				498	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
				499	#endif
Peter Zijlstra	1af5f73	2008-10-24 11:06:13 +0200	[diff] [blame]	500	}
				501
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	502	/*
				503	* Enqueue an entity into the rb-tree:
				504	*/
Ingo Molnar	0702e3e	2007-10-15 17:00:14 +0200	[diff] [blame]	505	static void __enqueue_entity(struct cfs_rq cfs_rq, struct sched_entity se)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	506	{
				507	struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
				508	struct rb_node *parent = NULL;
				509	struct sched_entity *entry;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	510	int leftmost = 1;
				511
				512	/*
				513	* Find the right place in the rbtree:
				514	*/
				515	while (*link) {
				516	parent = *link;
				517	entry = rb_entry(parent, struct sched_entity, run_node);
				518	/*
				519	* We dont care about collisions. Nodes with
				520	* the same key stay together.
				521	*/
Stephan Baerwolf	2bd2d6f	2011-07-20 14:46:59 +0200	[diff] [blame]	522	if (entity_before(se, entry)) {
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	523	link = &parent->rb_left;
				524	} else {
				525	link = &parent->rb_right;
				526	leftmost = 0;
				527	}
				528	}
				529
				530	/*
				531	* Maintain a cache of leftmost tree entries (it is frequently
				532	* used):
				533	*/
Peter Zijlstra	1af5f73	2008-10-24 11:06:13 +0200	[diff] [blame]	534	if (leftmost)
Ingo Molnar	57cb499	2007-10-15 17:00:11 +0200	[diff] [blame]	535	cfs_rq->rb_leftmost = &se->run_node;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	536
				537	rb_link_node(&se->run_node, parent, link);
				538	rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	539	}
				540
Ingo Molnar	0702e3e	2007-10-15 17:00:14 +0200	[diff] [blame]	541	static void __dequeue_entity(struct cfs_rq cfs_rq, struct sched_entity se)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	542	{
Peter Zijlstra	3fe6974	2008-03-14 20:55:51 +0100	[diff] [blame]	543	if (cfs_rq->rb_leftmost == &se->run_node) {
				544	struct rb_node *next_node;
Peter Zijlstra	3fe6974	2008-03-14 20:55:51 +0100	[diff] [blame]	545
				546	next_node = rb_next(&se->run_node);
				547	cfs_rq->rb_leftmost = next_node;
Peter Zijlstra	3fe6974	2008-03-14 20:55:51 +0100	[diff] [blame]	548	}
Ingo Molnar	e9acbff	2007-10-15 17:00:04 +0200	[diff] [blame]	549
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	550	rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	551	}
				552
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	553	struct sched_entity __pick_first_entity(struct cfs_rq cfs_rq)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	554	{
Peter Zijlstra	f4b6755	2008-11-04 21:25:07 +0100	[diff] [blame]	555	struct rb_node *left = cfs_rq->rb_leftmost;
				556
				557	if (!left)
				558	return NULL;
				559
				560	return rb_entry(left, struct sched_entity, run_node);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	561	}
				562
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	563	static struct sched_entity __pick_next_entity(struct sched_entity se)
				564	{
				565	struct rb_node *next = rb_next(&se->run_node);
				566
				567	if (!next)
				568	return NULL;
				569
				570	return rb_entry(next, struct sched_entity, run_node);
				571	}
				572
				573	#ifdef CONFIG_SCHED_DEBUG
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	574	struct sched_entity __pick_last_entity(struct cfs_rq cfs_rq)
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	575	{
Ingo Molnar	7eee3e6	2008-02-22 10:32:21 +0100	[diff] [blame]	576	struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	577
Balbir Singh	70eee74	2008-02-22 13:25:53 +0530	[diff] [blame]	578	if (!last)
				579	return NULL;
Ingo Molnar	7eee3e6	2008-02-22 10:32:21 +0100	[diff] [blame]	580
				581	return rb_entry(last, struct sched_entity, run_node);
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	582	}
				583
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	584	/**************************************************************
				585	* Scheduling class statistics methods:
				586	*/
				587
Christian Ehrhardt	acb4a84	2009-11-30 12:16:48 +0100	[diff] [blame]	588	int sched_proc_update_handler(struct ctl_table *table, int write,
Alexey Dobriyan	8d65af7	2009-09-23 15:57:19 -0700	[diff] [blame]	589	void __user buffer, size_t lenp,
Peter Zijlstra	b2be5e9	2007-11-09 22:39:37 +0100	[diff] [blame]	590	loff_t *ppos)
				591	{
Alexey Dobriyan	8d65af7	2009-09-23 15:57:19 -0700	[diff] [blame]	592	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
Christian Ehrhardt	acb4a84	2009-11-30 12:16:48 +0100	[diff] [blame]	593	int factor = get_update_sysctl_factor();
Peter Zijlstra	b2be5e9	2007-11-09 22:39:37 +0100	[diff] [blame]	594
				595	if (ret \|\| !write)
				596	return ret;
				597
				598	sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
				599	sysctl_sched_min_granularity);
				600
Christian Ehrhardt	acb4a84	2009-11-30 12:16:48 +0100	[diff] [blame]	601	#define WRT_SYSCTL(name) \
				602	(normalized_sysctl_##name = sysctl_##name / (factor))
				603	WRT_SYSCTL(sched_min_granularity);
				604	WRT_SYSCTL(sched_latency);
				605	WRT_SYSCTL(sched_wakeup_granularity);
Christian Ehrhardt	acb4a84	2009-11-30 12:16:48 +0100	[diff] [blame]	606	#undef WRT_SYSCTL
				607
Peter Zijlstra	b2be5e9	2007-11-09 22:39:37 +0100	[diff] [blame]	608	return 0;
				609	}
				610	#endif
Ingo Molnar	647e7ca	2007-10-15 17:00:13 +0200	[diff] [blame]	611
				612	/*
Peter Zijlstra	f9c0b09	2008-10-17 19:27:04 +0200	[diff] [blame]	613	* delta /= w
Peter Zijlstra	a7be37a	2008-06-27 13:41:11 +0200	[diff] [blame]	614	*/
				615	static inline unsigned long
				616	calc_delta_fair(unsigned long delta, struct sched_entity *se)
				617	{
Peter Zijlstra	f9c0b09	2008-10-17 19:27:04 +0200	[diff] [blame]	618	if (unlikely(se->load.weight != NICE_0_LOAD))
				619	delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load);
Peter Zijlstra	a7be37a	2008-06-27 13:41:11 +0200	[diff] [blame]	620
				621	return delta;
				622	}
				623
				624	/*
Ingo Molnar	647e7ca	2007-10-15 17:00:13 +0200	[diff] [blame]	625	* The idea is to set a period in which each task runs once.
				626	*
Borislav Petkov	532b185	2012-08-08 16:16:04 +0200	[diff] [blame]	627	* When there are too many tasks (sched_nr_latency) we have to stretch
Ingo Molnar	647e7ca	2007-10-15 17:00:13 +0200	[diff] [blame]	628	* this period because otherwise the slices get too small.
				629	*
				630	* p = (nr <= nl) ? l : l*nr/nl
				631	*/
Peter Zijlstra	4d78e7b	2007-10-15 17:00:04 +0200	[diff] [blame]	632	static u64 __sched_period(unsigned long nr_running)
				633	{
				634	u64 period = sysctl_sched_latency;
Peter Zijlstra	b2be5e9	2007-11-09 22:39:37 +0100	[diff] [blame]	635	unsigned long nr_latency = sched_nr_latency;
Peter Zijlstra	4d78e7b	2007-10-15 17:00:04 +0200	[diff] [blame]	636
				637	if (unlikely(nr_running > nr_latency)) {
Peter Zijlstra	4bf0b77	2008-01-25 21:08:21 +0100	[diff] [blame]	638	period = sysctl_sched_min_granularity;
Peter Zijlstra	4d78e7b	2007-10-15 17:00:04 +0200	[diff] [blame]	639	period *= nr_running;
Peter Zijlstra	4d78e7b	2007-10-15 17:00:04 +0200	[diff] [blame]	640	}
				641
				642	return period;
				643	}
				644
Ingo Molnar	647e7ca	2007-10-15 17:00:13 +0200	[diff] [blame]	645	/*
				646	* We calculate the wall-time slice from the period by taking a part
				647	* proportional to the weight.
				648	*
Peter Zijlstra	f9c0b09	2008-10-17 19:27:04 +0200	[diff] [blame]	649	* s = p*P[w/rw]
Ingo Molnar	647e7ca	2007-10-15 17:00:13 +0200	[diff] [blame]	650	*/
Peter Zijlstra	6d0f0eb	2007-10-15 17:00:05 +0200	[diff] [blame]	651	static u64 sched_slice(struct cfs_rq cfs_rq, struct sched_entity se)
Peter Zijlstra	2180508	2007-08-25 18:41:53 +0200	[diff] [blame]	652	{
Mike Galbraith	0a58244	2009-01-02 12:16:42 +0100	[diff] [blame]	653	u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
Peter Zijlstra	f9c0b09	2008-10-17 19:27:04 +0200	[diff] [blame]	654
Mike Galbraith	0a58244	2009-01-02 12:16:42 +0100	[diff] [blame]	655	for_each_sched_entity(se) {
Lin Ming	6272d68	2009-01-15 17:17:15 +0100	[diff] [blame]	656	struct load_weight *load;
Christian Engelmayer	3104bf0	2009-06-16 10:35:12 +0200	[diff] [blame]	657	struct load_weight lw;
Lin Ming	6272d68	2009-01-15 17:17:15 +0100	[diff] [blame]	658
				659	cfs_rq = cfs_rq_of(se);
				660	load = &cfs_rq->load;
Peter Zijlstra	f9c0b09	2008-10-17 19:27:04 +0200	[diff] [blame]	661
Mike Galbraith	0a58244	2009-01-02 12:16:42 +0100	[diff] [blame]	662	if (unlikely(!se->on_rq)) {
Christian Engelmayer	3104bf0	2009-06-16 10:35:12 +0200	[diff] [blame]	663	lw = cfs_rq->load;
Mike Galbraith	0a58244	2009-01-02 12:16:42 +0100	[diff] [blame]	664
				665	update_load_add(&lw, se->load.weight);
				666	load = &lw;
				667	}
				668	slice = calc_delta_mine(slice, se->load.weight, load);
				669	}
				670	return slice;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	671	}
				672
Ingo Molnar	647e7ca	2007-10-15 17:00:13 +0200	[diff] [blame]	673	/*
Andrei Epure	660cc00	2013-03-11 12:03:20 +0200	[diff] [blame]	674	* We calculate the vruntime slice of a to-be-inserted task.
Ingo Molnar	647e7ca	2007-10-15 17:00:13 +0200	[diff] [blame]	675	*
Peter Zijlstra	f9c0b09	2008-10-17 19:27:04 +0200	[diff] [blame]	676	* vs = s/w
Ingo Molnar	647e7ca	2007-10-15 17:00:13 +0200	[diff] [blame]	677	*/
Peter Zijlstra	f9c0b09	2008-10-17 19:27:04 +0200	[diff] [blame]	678	static u64 sched_vslice(struct cfs_rq cfs_rq, struct sched_entity se)
Ingo Molnar	647e7ca	2007-10-15 17:00:13 +0200	[diff] [blame]	679	{
Peter Zijlstra	f9c0b09	2008-10-17 19:27:04 +0200	[diff] [blame]	680	return calc_delta_fair(sched_slice(cfs_rq, se), se);
Peter Zijlstra	a7be37a	2008-06-27 13:41:11 +0200	[diff] [blame]	681	}
				682
#ifdef CONFIG_SMP
static unsigned long task_h_load(struct task_struct *p);

static inline void __update_task_entity_contrib(struct sched_entity *se);

/*
 * Seed the runnable average of a new task.
 *
 * runnable_avg_sum and runnable_avg_period are both set to the task's
 * sched_slice() value shifted right by 10, and the entity's contribution
 * is then recomputed, so that the task starts life with a heavy load.
 */
void init_task_runnable_average(struct task_struct *p)
{
	struct sched_entity *se = &p->se;
	u32 seed;

	se->avg.decay_count = 0;

	seed = sched_slice(task_cfs_rq(p), se) >> 10;
	se->avg.runnable_avg_sum = seed;
	se->avg.runnable_avg_period = seed;

	__update_task_entity_contrib(se);
}
#else
/* Without CONFIG_SMP there is nothing to initialise. */
void init_task_runnable_average(struct task_struct *p)
{
}
#endif
				704
Peter Zijlstra	a7be37a	2008-06-27 13:41:11 +0200	[diff] [blame]	705	/*
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	706	* Update the current task's runtime statistics. Skip current tasks that
				707	* are not in our scheduling class.
				708	*/
				709	static inline void
Ingo Molnar	8ebc91d	2007-10-15 17:00:03 +0200	[diff] [blame]	710	__update_curr(struct cfs_rq cfs_rq, struct sched_entity curr,
				711	unsigned long delta_exec)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	712	{
Ingo Molnar	bbdba7c	2007-10-15 17:00:06 +0200	[diff] [blame]	713	unsigned long delta_exec_weighted;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	714
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	715	schedstat_set(curr->statistics.exec_max,
				716	max((u64)delta_exec, curr->statistics.exec_max));
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	717
				718	curr->sum_exec_runtime += delta_exec;
Ingo Molnar	7a62eab	2007-10-15 17:00:06 +0200	[diff] [blame]	719	schedstat_add(cfs_rq, exec_clock, delta_exec);
Peter Zijlstra	a7be37a	2008-06-27 13:41:11 +0200	[diff] [blame]	720	delta_exec_weighted = calc_delta_fair(delta_exec, curr);
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	721
Ingo Molnar	e9acbff	2007-10-15 17:00:04 +0200	[diff] [blame]	722	curr->vruntime += delta_exec_weighted;
Peter Zijlstra	1af5f73	2008-10-24 11:06:13 +0200	[diff] [blame]	723	update_min_vruntime(cfs_rq);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	724	}
				725
Ingo Molnar	b7cc089	2007-08-09 11:16:47 +0200	[diff] [blame]	726	static void update_curr(struct cfs_rq *cfs_rq)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	727	{
Ingo Molnar	429d43b	2007-10-15 17:00:03 +0200	[diff] [blame]	728	struct sched_entity *curr = cfs_rq->curr;
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	729	u64 now = rq_clock_task(rq_of(cfs_rq));
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	730	unsigned long delta_exec;
				731
				732	if (unlikely(!curr))
				733	return;
				734
				735	/*
				736	* Get the amount of time the current task was running
				737	* since the last time we changed load (this cannot
				738	* overflow on 32 bits):
				739	*/
Ingo Molnar	8ebc91d	2007-10-15 17:00:03 +0200	[diff] [blame]	740	delta_exec = (unsigned long)(now - curr->exec_start);
Peter Zijlstra	34f28ec	2008-12-16 08:45:31 +0100	[diff] [blame]	741	if (!delta_exec)
				742	return;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	743
Ingo Molnar	8ebc91d	2007-10-15 17:00:03 +0200	[diff] [blame]	744	__update_curr(cfs_rq, curr, delta_exec);
				745	curr->exec_start = now;
Srivatsa Vaddagiri	d842de8	2007-12-02 20:04:49 +0100	[diff] [blame]	746
				747	if (entity_is_task(curr)) {
				748	struct task_struct *curtask = task_of(curr);
				749
Ingo Molnar	f977bb4	2009-09-13 18:15:54 +0200	[diff] [blame]	750	trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
Srivatsa Vaddagiri	d842de8	2007-12-02 20:04:49 +0100	[diff] [blame]	751	cpuacct_charge(curtask, delta_exec);
Frank Mayhar	f06febc	2008-09-12 09:54:39 -0700	[diff] [blame]	752	account_group_exec_runtime(curtask, delta_exec);
Srivatsa Vaddagiri	d842de8	2007-12-02 20:04:49 +0100	[diff] [blame]	753	}
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	754
				755	account_cfs_rq_runtime(cfs_rq, delta_exec);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	756	}
				757
				758	static inline void
Ingo Molnar	5870db5	2007-08-09 11:16:47 +0200	[diff] [blame]	759	update_stats_wait_start(struct cfs_rq cfs_rq, struct sched_entity se)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	760	{
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	761	schedstat_set(se->statistics.wait_start, rq_clock(rq_of(cfs_rq)));
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	762	}
				763
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	764	/*
				765	* Task is being enqueued - update stats:
				766	*/
Ingo Molnar	d2417e5	2007-08-09 11:16:47 +0200	[diff] [blame]	767	static void update_stats_enqueue(struct cfs_rq cfs_rq, struct sched_entity se)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	768	{
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	769	/*
				770	* Are we enqueueing a waiting task? (for current tasks
				771	* a dequeue/enqueue event is a NOP)
				772	*/
Ingo Molnar	429d43b	2007-10-15 17:00:03 +0200	[diff] [blame]	773	if (se != cfs_rq->curr)
Ingo Molnar	5870db5	2007-08-09 11:16:47 +0200	[diff] [blame]	774	update_stats_wait_start(cfs_rq, se);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	775	}
				776
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	777	static void
Ingo Molnar	9ef0a96	2007-08-09 11:16:47 +0200	[diff] [blame]	778	update_stats_wait_end(struct cfs_rq cfs_rq, struct sched_entity se)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	779	{
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	780	schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	781	rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start));
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	782	schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
				783	schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	784	rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
Peter Zijlstra	768d0c2	2009-07-23 20:13:26 +0200	[diff] [blame]	785	#ifdef CONFIG_SCHEDSTATS
				786	if (entity_is_task(se)) {
				787	trace_sched_stat_wait(task_of(se),
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	788	rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
Peter Zijlstra	768d0c2	2009-07-23 20:13:26 +0200	[diff] [blame]	789	}
				790	#endif
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	791	schedstat_set(se->statistics.wait_start, 0);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	792	}
				793
				794	static inline void
Ingo Molnar	19b6a2e	2007-08-09 11:16:48 +0200	[diff] [blame]	795	update_stats_dequeue(struct cfs_rq cfs_rq, struct sched_entity se)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	796	{
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	797	/*
				798	* Mark the end of the wait period if dequeueing a
				799	* waiting task:
				800	*/
Ingo Molnar	429d43b	2007-10-15 17:00:03 +0200	[diff] [blame]	801	if (se != cfs_rq->curr)
Ingo Molnar	9ef0a96	2007-08-09 11:16:47 +0200	[diff] [blame]	802	update_stats_wait_end(cfs_rq, se);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	803	}
				804
				805	/*
				806	* We are picking a new current task - update its stats:
				807	*/
				808	static inline void
Ingo Molnar	79303e9	2007-08-09 11:16:47 +0200	[diff] [blame]	809	update_stats_curr_start(struct cfs_rq cfs_rq, struct sched_entity se)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	810	{
				811	/*
				812	* We are starting a new run period:
				813	*/
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	814	se->exec_start = rq_clock_task(rq_of(cfs_rq));
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	815	}
				816
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	817	/**************************************************
				818	* Scheduling class queueing methods:
				819	*/
				820
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	821	#ifdef CONFIG_NUMA_BALANCING
				822	/*
Mel Gorman	598f0ec	2013-10-07 11:28:55 +0100	[diff] [blame]	823	* Approximate time to scan a full NUMA task in ms. The task scan period is
				824	* calculated based on the tasks virtual memory size and
				825	* numa_balancing_scan_size.
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	826	*/
Mel Gorman	598f0ec	2013-10-07 11:28:55 +0100	[diff] [blame]	827	unsigned int sysctl_numa_balancing_scan_period_min = 1000;
				828	unsigned int sysctl_numa_balancing_scan_period_max = 60000;
				829	unsigned int sysctl_numa_balancing_scan_period_reset = 60000;
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	830
				831	/* Portion of address space to scan in MB */
				832	unsigned int sysctl_numa_balancing_scan_size = 256;
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	833
Peter Zijlstra	4b96a29	2012-10-25 14:16:47 +0200	[diff] [blame]	834	/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
				835	unsigned int sysctl_numa_balancing_scan_delay = 1000;
				836
Mel Gorman	598f0ec	2013-10-07 11:28:55 +0100	[diff] [blame]	837	static unsigned int task_nr_scan_windows(struct task_struct *p)
				838	{
				839	unsigned long rss = 0;
				840	unsigned long nr_scan_pages;
				841
				842	/*
				843	* Calculations based on RSS as non-present and empty pages are skipped
				844	* by the PTE scanner and NUMA hinting faults should be trapped based
				845	* on resident pages
				846	*/
				847	nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
				848	rss = get_mm_rss(p->mm);
				849	if (!rss)
				850	rss = nr_scan_pages;
				851
				852	rss = round_up(rss, nr_scan_pages);
				853	return rss / nr_scan_pages;
				854	}
				855
				856	/* For sanitys sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
				857	#define MAX_SCAN_WINDOW 2560
				858
				859	static unsigned int task_scan_min(struct task_struct *p)
				860	{
				861	unsigned int scan, floor;
				862	unsigned int windows = 1;
				863
				864	if (sysctl_numa_balancing_scan_size < MAX_SCAN_WINDOW)
				865	windows = MAX_SCAN_WINDOW / sysctl_numa_balancing_scan_size;
				866	floor = 1000 / windows;
				867
				868	scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
				869	return max_t(unsigned int, floor, scan);
				870	}
				871
				872	static unsigned int task_scan_max(struct task_struct *p)
				873	{
				874	unsigned int smin = task_scan_min(p);
				875	unsigned int smax;
				876
				877	/* Watch for min being lower than max due to floor calculations */
				878	smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
				879	return max(smin, smax);
				880	}
				881
Mel Gorman	3a7053b	2013-10-07 11:29:00 +0100	[diff] [blame]	882	/*
				883	* Once a preferred node is selected the scheduler balancer will prefer moving
				884	* a task to that node for sysctl_numa_balancing_settle_count number of PTE
				885	* scans. This will give the process the chance to accumulate more faults on
				886	* the preferred node but still allow the scheduler to move the task again if
				887	* the nodes CPUs are overloaded.
				888	*/
Rik van Riel	6fe6b2d	2013-10-07 11:29:08 +0100	[diff] [blame]	889	unsigned int sysctl_numa_balancing_settle_count __read_mostly = 4;
Mel Gorman	3a7053b	2013-10-07 11:29:00 +0100	[diff] [blame]	890
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	891	struct numa_group {
				892	atomic_t refcount;
				893
				894	spinlock_t lock; /* nr_tasks, tasks */
				895	int nr_tasks;
Mel Gorman	e29cf08	2013-10-07 11:29:22 +0100	[diff] [blame]	896	pid_t gid;
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	897	struct list_head task_list;
				898
				899	struct rcu_head rcu;
				900	atomic_long_t faults[0];
				901	};
				902
Mel Gorman	e29cf08	2013-10-07 11:29:22 +0100	[diff] [blame]	903	pid_t task_numa_group_id(struct task_struct *p)
				904	{
				905	return p->numa_group ? p->numa_group->gid : 0;
				906	}
				907
/*
 * Index into a per-task or per-group faults array: every node owns two
 * consecutive slots, selected by @priv (0 or 1).
 */
static inline int task_faults_idx(int nid, int priv)
{
	int node_base = nid * 2;

	return node_base + priv;
}
				912
				913	static inline unsigned long task_faults(struct task_struct *p, int nid)
				914	{
				915	if (!p->numa_faults)
				916	return 0;
				917
				918	return p->numa_faults[task_faults_idx(nid, 0)] +
				919	p->numa_faults[task_faults_idx(nid, 1)];
				920	}
				921
Mel Gorman	e6628d5	2013-10-07 11:29:02 +0100	[diff] [blame]	922	static unsigned long weighted_cpuload(const int cpu);
Mel Gorman	58d081b	2013-10-07 11:29:10 +0100	[diff] [blame]	923	static unsigned long source_load(int cpu, int type);
				924	static unsigned long target_load(int cpu, int type);
				925	static unsigned long power_of(int cpu);
				926	static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
Mel Gorman	e6628d5	2013-10-07 11:29:02 +0100	[diff] [blame]	927
/* Statistics summed over all CPUs of one node; see update_numa_stats(). */
struct numa_stats {
	/* Sum of rq->nr_running over the node's CPUs */
	unsigned long nr_running;
	/* Sum of weighted_cpuload(), later scaled by the node's power */
	unsigned long load;

	/* Total compute capacity of the node's CPUs */
	unsigned long power;

	/* Approximate capacity of the node, counted in runnable tasks */
	unsigned long capacity;
	/* Non-zero when nr_running is below capacity */
	int has_capacity;
};
Mel Gorman	e6628d5	2013-10-07 11:29:02 +0100	[diff] [blame]	940
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	941	/*
				942	* XXX borrowed from update_sg_lb_stats
				943	*/
				944	static void update_numa_stats(struct numa_stats *ns, int nid)
				945	{
				946	int cpu;
				947
				948	memset(ns, 0, sizeof(*ns));
				949	for_each_cpu(cpu, cpumask_of_node(nid)) {
				950	struct rq *rq = cpu_rq(cpu);
				951
				952	ns->nr_running += rq->nr_running;
				953	ns->load += weighted_cpuload(cpu);
				954	ns->power += power_of(cpu);
				955	}
				956
				957	ns->load = (ns->load * SCHED_POWER_SCALE) / ns->power;
				958	ns->capacity = DIV_ROUND_CLOSEST(ns->power, SCHED_POWER_SCALE);
				959	ns->has_capacity = (ns->nr_running < ns->capacity);
				960	}
				961
Mel Gorman	58d081b	2013-10-07 11:29:10 +0100	[diff] [blame]	962	struct task_numa_env {
				963	struct task_struct *p;
				964
				965	int src_cpu, src_nid;
				966	int dst_cpu, dst_nid;
				967
				968	struct numa_stats src_stats, dst_stats;
				969
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	970	int imbalance_pct, idx;
				971
				972	struct task_struct *best_task;
				973	long best_imp;
Mel Gorman	58d081b	2013-10-07 11:29:10 +0100	[diff] [blame]	974	int best_cpu;
				975	};
				976
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	977	static void task_numa_assign(struct task_numa_env *env,
				978	struct task_struct *p, long imp)
				979	{
				980	if (env->best_task)
				981	put_task_struct(env->best_task);
				982	if (p)
				983	get_task_struct(p);
				984
				985	env->best_task = p;
				986	env->best_imp = imp;
				987	env->best_cpu = env->dst_cpu;
				988	}
				989
				990	/*
				991	* This checks if the overall compute and NUMA accesses of the system would
				992	* be improved if the source tasks was migrated to the target dst_cpu taking
				993	* into account that it might be best if task running on the dst_cpu should
				994	* be exchanged with the source task
				995	*/
				996	static void task_numa_compare(struct task_numa_env *env, long imp)
				997	{
				998	struct rq *src_rq = cpu_rq(env->src_cpu);
				999	struct rq *dst_rq = cpu_rq(env->dst_cpu);
				1000	struct task_struct *cur;
				1001	long dst_load, src_load;
				1002	long load;
				1003
				1004	rcu_read_lock();
				1005	cur = ACCESS_ONCE(dst_rq->curr);
				1006	if (cur->pid == 0) /* idle */
				1007	cur = NULL;
				1008
				1009	/*
				1010	* "imp" is the fault differential for the source task between the
				1011	* source and destination node. Calculate the total differential for
				1012	* the source task and potential destination task. The more negative
				1013	* the value is, the more rmeote accesses that would be expected to
				1014	* be incurred if the tasks were swapped.
				1015	*/
				1016	if (cur) {
				1017	/* Skip this swap candidate if cannot move to the source cpu */
				1018	if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur)))
				1019	goto unlock;
				1020
				1021	imp += task_faults(cur, env->src_nid) -
				1022	task_faults(cur, env->dst_nid);
				1023	}
				1024
				1025	if (imp < env->best_imp)
				1026	goto unlock;
				1027
				1028	if (!cur) {
				1029	/* Is there capacity at our destination? */
				1030	if (env->src_stats.has_capacity &&
				1031	!env->dst_stats.has_capacity)
				1032	goto unlock;
				1033
				1034	goto balance;
				1035	}
				1036
				1037	/* Balance doesn't matter much if we're running a task per cpu */
				1038	if (src_rq->nr_running == 1 && dst_rq->nr_running == 1)
				1039	goto assign;
				1040
				1041	/*
				1042	* In the overloaded case, try and keep the load balanced.
				1043	*/
				1044	balance:
				1045	dst_load = env->dst_stats.load;
				1046	src_load = env->src_stats.load;
				1047
				1048	/* XXX missing power terms */
				1049	load = task_h_load(env->p);
				1050	dst_load += load;
				1051	src_load -= load;
				1052
				1053	if (cur) {
				1054	load = task_h_load(cur);
				1055	dst_load -= load;
				1056	src_load += load;
				1057	}
				1058
				1059	/* make src_load the smaller */
				1060	if (dst_load < src_load)
				1061	swap(dst_load, src_load);
				1062
				1063	if (src_load * env->imbalance_pct < dst_load * 100)
				1064	goto unlock;
				1065
				1066	assign:
				1067	task_numa_assign(env, cur, imp);
				1068	unlock:
				1069	rcu_read_unlock();
				1070	}
				1071
Mel Gorman	2c8a50a	2013-10-07 11:29:18 +0100	[diff] [blame]	1072	static void task_numa_find_cpu(struct task_numa_env *env, long imp)
				1073	{
				1074	int cpu;
				1075
				1076	for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
				1077	/* Skip this CPU if the source task cannot migrate */
				1078	if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(env->p)))
				1079	continue;
				1080
				1081	env->dst_cpu = cpu;
				1082	task_numa_compare(env, imp);
				1083	}
				1084	}
				1085
Mel Gorman	58d081b	2013-10-07 11:29:10 +0100	[diff] [blame]	1086	static int task_numa_migrate(struct task_struct *p)
Mel Gorman	e6628d5	2013-10-07 11:29:02 +0100	[diff] [blame]	1087	{
Mel Gorman	58d081b	2013-10-07 11:29:10 +0100	[diff] [blame]	1088	struct task_numa_env env = {
				1089	.p = p,
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1090
Mel Gorman	58d081b	2013-10-07 11:29:10 +0100	[diff] [blame]	1091	.src_cpu = task_cpu(p),
				1092	.src_nid = cpu_to_node(task_cpu(p)),
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1093
				1094	.imbalance_pct = 112,
				1095
				1096	.best_task = NULL,
				1097	.best_imp = 0,
				1098	.best_cpu = -1
Mel Gorman	58d081b	2013-10-07 11:29:10 +0100	[diff] [blame]	1099	};
				1100	struct sched_domain *sd;
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1101	unsigned long faults;
Mel Gorman	2c8a50a	2013-10-07 11:29:18 +0100	[diff] [blame]	1102	int nid, ret;
				1103	long imp;
Mel Gorman	e6628d5	2013-10-07 11:29:02 +0100	[diff] [blame]	1104
Mel Gorman	58d081b	2013-10-07 11:29:10 +0100	[diff] [blame]	1105	/*
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1106	* Pick the lowest SD_NUMA domain, as that would have the smallest
				1107	* imbalance and would be the first to start moving tasks about.
				1108	*
				1109	* And we want to avoid any moving of tasks about, as that would create
				1110	* random movement of tasks -- counter the numa conditions we're trying
				1111	* to satisfy here.
Mel Gorman	58d081b	2013-10-07 11:29:10 +0100	[diff] [blame]	1112	*/
Mel Gorman	e6628d5	2013-10-07 11:29:02 +0100	[diff] [blame]	1113	rcu_read_lock();
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1114	sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
				1115	env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
Mel Gorman	e6628d5	2013-10-07 11:29:02 +0100	[diff] [blame]	1116	rcu_read_unlock();
				1117
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1118	faults = task_faults(p, env.src_nid);
				1119	update_numa_stats(&env.src_stats, env.src_nid);
Mel Gorman	2c8a50a	2013-10-07 11:29:18 +0100	[diff] [blame]	1120	env.dst_nid = p->numa_preferred_nid;
				1121	imp = task_faults(env.p, env.dst_nid) - faults;
				1122	update_numa_stats(&env.dst_stats, env.dst_nid);
Mel Gorman	58d081b	2013-10-07 11:29:10 +0100	[diff] [blame]	1123
Rik van Riel	e1dda8a	2013-10-07 11:29:19 +0100	[diff] [blame]	1124	/* If the preferred nid has capacity, try to use it. */
				1125	if (env.dst_stats.has_capacity)
Mel Gorman	2c8a50a	2013-10-07 11:29:18 +0100	[diff] [blame]	1126	task_numa_find_cpu(&env, imp);
Rik van Riel	e1dda8a	2013-10-07 11:29:19 +0100	[diff] [blame]	1127
				1128	/* No space available on the preferred nid. Look elsewhere. */
				1129	if (env.best_cpu == -1) {
Mel Gorman	2c8a50a	2013-10-07 11:29:18 +0100	[diff] [blame]	1130	for_each_online_node(nid) {
				1131	if (nid == env.src_nid \|\| nid == p->numa_preferred_nid)
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1132	continue;
				1133
Mel Gorman	2c8a50a	2013-10-07 11:29:18 +0100	[diff] [blame]	1134	/* Only consider nodes that recorded more faults */
				1135	imp = task_faults(env.p, nid) - faults;
				1136	if (imp < 0)
				1137	continue;
				1138
				1139	env.dst_nid = nid;
				1140	update_numa_stats(&env.dst_stats, env.dst_nid);
				1141	task_numa_find_cpu(&env, imp);
Mel Gorman	58d081b	2013-10-07 11:29:10 +0100	[diff] [blame]	1142	}
				1143	}
				1144
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1145	/* No better CPU than the current one was found. */
				1146	if (env.best_cpu == -1)
				1147	return -EAGAIN;
				1148
				1149	if (env.best_task == NULL) {
				1150	int ret = migrate_task_to(p, env.best_cpu);
				1151	return ret;
				1152	}
				1153
				1154	ret = migrate_swap(p, env.best_task);
				1155	put_task_struct(env.best_task);
				1156	return ret;
Mel Gorman	e6628d5	2013-10-07 11:29:02 +0100	[diff] [blame]	1157	}
				1158
Mel Gorman	6b9a746	2013-10-07 11:29:11 +0100	[diff] [blame]	1159	/* Attempt to migrate a task to a CPU on the preferred node. */
				1160	static void numa_migrate_preferred(struct task_struct *p)
				1161	{
				1162	/* Success if task is already running on preferred CPU */
				1163	p->numa_migrate_retry = 0;
Rik van Riel	06ea5e0	2013-10-07 11:29:12 +0100	[diff] [blame]	1164	if (cpu_to_node(task_cpu(p)) == p->numa_preferred_nid) {
				1165	/*
				1166	* If migration is temporarily disabled due to a task migration
				1167	* then re-enable it now as the task is running on its
				1168	* preferred node and memory should migrate locally
				1169	*/
				1170	if (!p->numa_migrate_seq)
				1171	p->numa_migrate_seq++;
Mel Gorman	6b9a746	2013-10-07 11:29:11 +0100	[diff] [blame]	1172	return;
Rik van Riel	06ea5e0	2013-10-07 11:29:12 +0100	[diff] [blame]	1173	}
Mel Gorman	6b9a746	2013-10-07 11:29:11 +0100	[diff] [blame]	1174
				1175	/* This task has no NUMA fault statistics yet */
				1176	if (unlikely(p->numa_preferred_nid == -1))
				1177	return;
				1178
				1179	/* Otherwise, try migrate to a CPU on the preferred node */
				1180	if (task_numa_migrate(p) != 0)
				1181	p->numa_migrate_retry = jiffies + HZ*5;
				1182	}
				1183
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	1184	static void task_numa_placement(struct task_struct *p)
				1185	{
Mel Gorman	688b758	2013-10-07 11:28:58 +0100	[diff] [blame]	1186	int seq, nid, max_nid = -1;
				1187	unsigned long max_faults = 0;
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	1188
Hugh Dickins	2832bc1	2012-12-19 17:42:16 -0800	[diff] [blame]	1189	seq = ACCESS_ONCE(p->mm->numa_scan_seq);
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	1190	if (p->numa_scan_seq == seq)
				1191	return;
				1192	p->numa_scan_seq = seq;
Mel Gorman	3a7053b	2013-10-07 11:29:00 +0100	[diff] [blame]	1193	p->numa_migrate_seq++;
Mel Gorman	598f0ec	2013-10-07 11:28:55 +0100	[diff] [blame]	1194	p->numa_scan_period_max = task_scan_max(p);
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	1195
Mel Gorman	688b758	2013-10-07 11:28:58 +0100	[diff] [blame]	1196	/* Find the node with the highest number of faults */
				1197	for_each_online_node(nid) {
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1198	unsigned long faults = 0;
Mel Gorman	ac8e895	2013-10-07 11:29:03 +0100	[diff] [blame]	1199	int priv, i;
Mel Gorman	745d614	2013-10-07 11:28:59 +0100	[diff] [blame]	1200
Mel Gorman	ac8e895	2013-10-07 11:29:03 +0100	[diff] [blame]	1201	for (priv = 0; priv < 2; priv++) {
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	1202	long diff;
				1203
Mel Gorman	ac8e895	2013-10-07 11:29:03 +0100	[diff] [blame]	1204	i = task_faults_idx(nid, priv);
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	1205	diff = -p->numa_faults[i];
Mel Gorman	745d614	2013-10-07 11:28:59 +0100	[diff] [blame]	1206
Mel Gorman	ac8e895	2013-10-07 11:29:03 +0100	[diff] [blame]	1207	/* Decay existing window, copy faults since last scan */
				1208	p->numa_faults[i] >>= 1;
				1209	p->numa_faults[i] += p->numa_faults_buffer[i];
				1210	p->numa_faults_buffer[i] = 0;
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1211
				1212	faults += p->numa_faults[i];
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	1213	diff += p->numa_faults[i];
				1214	if (p->numa_group) {
				1215	/* safe because we can only change our own group */
				1216	atomic_long_add(diff, &p->numa_group->faults[i]);
				1217	}
Mel Gorman	ac8e895	2013-10-07 11:29:03 +0100	[diff] [blame]	1218	}
				1219
Mel Gorman	688b758	2013-10-07 11:28:58 +0100	[diff] [blame]	1220	if (faults > max_faults) {
				1221	max_faults = faults;
				1222	max_nid = nid;
				1223	}
				1224	}
				1225
Mel Gorman	6b9a746	2013-10-07 11:29:11 +0100	[diff] [blame]	1226	/* Preferred node as the node with the most faults */
Mel Gorman	3a7053b	2013-10-07 11:29:00 +0100	[diff] [blame]	1227	if (max_faults && max_nid != p->numa_preferred_nid) {
Mel Gorman	e6628d5	2013-10-07 11:29:02 +0100	[diff] [blame]	1228	/* Update the preferred nid and migrate task if possible */
Mel Gorman	688b758	2013-10-07 11:28:58 +0100	[diff] [blame]	1229	p->numa_preferred_nid = max_nid;
Rik van Riel	6fe6b2d	2013-10-07 11:29:08 +0100	[diff] [blame]	1230	p->numa_migrate_seq = 1;
Mel Gorman	6b9a746	2013-10-07 11:29:11 +0100	[diff] [blame]	1231	numa_migrate_preferred(p);
Mel Gorman	3a7053b	2013-10-07 11:29:00 +0100	[diff] [blame]	1232	}
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	1233	}
				1234
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	1235	static inline int get_numa_group(struct numa_group *grp)
				1236	{
				1237	return atomic_inc_not_zero(&grp->refcount);
				1238	}
				1239
				1240	static inline void put_numa_group(struct numa_group *grp)
				1241	{
				1242	if (atomic_dec_and_test(&grp->refcount))
				1243	kfree_rcu(grp, rcu);
				1244	}
				1245
				1246	static void double_lock(spinlock_t l1, spinlock_t l2)
				1247	{
				1248	if (l1 > l2)
				1249	swap(l1, l2);
				1250
				1251	spin_lock(l1);
				1252	spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
				1253	}
				1254
				1255	static void task_numa_group(struct task_struct *p, int cpupid)
				1256	{
				1257	struct numa_group grp, my_grp;
				1258	struct task_struct *tsk;
				1259	bool join = false;
				1260	int cpu = cpupid_to_cpu(cpupid);
				1261	int i;
				1262
				1263	if (unlikely(!p->numa_group)) {
				1264	unsigned int size = sizeof(struct numa_group) +
				1265	2nr_node_idssizeof(atomic_long_t);
				1266
				1267	grp = kzalloc(size, GFP_KERNEL \| __GFP_NOWARN);
				1268	if (!grp)
				1269	return;
				1270
				1271	atomic_set(&grp->refcount, 1);
				1272	spin_lock_init(&grp->lock);
				1273	INIT_LIST_HEAD(&grp->task_list);
Mel Gorman	e29cf08	2013-10-07 11:29:22 +0100	[diff] [blame]	1274	grp->gid = p->pid;
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	1275
				1276	for (i = 0; i < 2*nr_node_ids; i++)
				1277	atomic_long_set(&grp->faults[i], p->numa_faults[i]);
				1278
				1279	list_add(&p->numa_entry, &grp->task_list);
				1280	grp->nr_tasks++;
				1281	rcu_assign_pointer(p->numa_group, grp);
				1282	}
				1283
				1284	rcu_read_lock();
				1285	tsk = ACCESS_ONCE(cpu_rq(cpu)->curr);
				1286
				1287	if (!cpupid_match_pid(tsk, cpupid))
				1288	goto unlock;
				1289
				1290	grp = rcu_dereference(tsk->numa_group);
				1291	if (!grp)
				1292	goto unlock;
				1293
				1294	my_grp = p->numa_group;
				1295	if (grp == my_grp)
				1296	goto unlock;
				1297
				1298	/*
				1299	* Only join the other group if its bigger; if we're the bigger group,
				1300	* the other task will join us.
				1301	*/
				1302	if (my_grp->nr_tasks > grp->nr_tasks)
				1303	goto unlock;
				1304
				1305	/*
				1306	* Tie-break on the grp address.
				1307	*/
				1308	if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
				1309	goto unlock;
				1310
				1311	if (!get_numa_group(grp))
				1312	goto unlock;
				1313
				1314	join = true;
				1315
				1316	unlock:
				1317	rcu_read_unlock();
				1318
				1319	if (!join)
				1320	return;
				1321
				1322	for (i = 0; i < 2*nr_node_ids; i++) {
				1323	atomic_long_sub(p->numa_faults[i], &my_grp->faults[i]);
				1324	atomic_long_add(p->numa_faults[i], &grp->faults[i]);
				1325	}
				1326
				1327	double_lock(&my_grp->lock, &grp->lock);
				1328
				1329	list_move(&p->numa_entry, &grp->task_list);
				1330	my_grp->nr_tasks--;
				1331	grp->nr_tasks++;
				1332
				1333	spin_unlock(&my_grp->lock);
				1334	spin_unlock(&grp->lock);
				1335
				1336	rcu_assign_pointer(p->numa_group, grp);
				1337
				1338	put_numa_group(my_grp);
				1339	}
				1340
				1341	void task_numa_free(struct task_struct *p)
				1342	{
				1343	struct numa_group *grp = p->numa_group;
				1344	int i;
				1345
				1346	if (grp) {
				1347	for (i = 0; i < 2*nr_node_ids; i++)
				1348	atomic_long_sub(p->numa_faults[i], &grp->faults[i]);
				1349
				1350	spin_lock(&grp->lock);
				1351	list_del(&p->numa_entry);
				1352	grp->nr_tasks--;
				1353	spin_unlock(&grp->lock);
				1354	rcu_assign_pointer(p->numa_group, NULL);
				1355	put_numa_group(grp);
				1356	}
				1357
				1358	kfree(p->numa_faults);
				1359	}
				1360
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	1361	/*
				1362	* Got a PROT_NONE fault for a page on @node.
				1363	*/
Peter Zijlstra	6688cc0	2013-10-07 11:29:24 +0100	[diff] [blame^]	1364	void task_numa_fault(int last_cpupid, int node, int pages, int flags)
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	1365	{
				1366	struct task_struct *p = current;
Peter Zijlstra	6688cc0	2013-10-07 11:29:24 +0100	[diff] [blame^]	1367	bool migrated = flags & TNF_MIGRATED;
Mel Gorman	ac8e895	2013-10-07 11:29:03 +0100	[diff] [blame]	1368	int priv;
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	1369
Dave Kleikamp	10e84b9	2013-07-31 13:53:35 -0700	[diff] [blame]	1370	if (!numabalancing_enabled)
Mel Gorman	1a687c2	2012-11-22 11:16:36 +0000	[diff] [blame]	1371	return;
				1372
Mel Gorman	9ff1d9f	2013-10-07 11:29:04 +0100	[diff] [blame]	1373	/* for example, ksmd faulting in a user's mm */
				1374	if (!p->mm)
				1375	return;
				1376
Mel Gorman	f809ca9	2013-10-07 11:28:57 +0100	[diff] [blame]	1377	/* Allocate buffer to track faults on a per-node basis */
				1378	if (unlikely(!p->numa_faults)) {
Mel Gorman	ac8e895	2013-10-07 11:29:03 +0100	[diff] [blame]	1379	int size = sizeof(p->numa_faults) 2 * nr_node_ids;
Mel Gorman	f809ca9	2013-10-07 11:28:57 +0100	[diff] [blame]	1380
Mel Gorman	745d614	2013-10-07 11:28:59 +0100	[diff] [blame]	1381	/* numa_faults and numa_faults_buffer share the allocation */
				1382	p->numa_faults = kzalloc(size * 2, GFP_KERNEL\|__GFP_NOWARN);
Mel Gorman	f809ca9	2013-10-07 11:28:57 +0100	[diff] [blame]	1383	if (!p->numa_faults)
				1384	return;
Mel Gorman	745d614	2013-10-07 11:28:59 +0100	[diff] [blame]	1385
				1386	BUG_ON(p->numa_faults_buffer);
Mel Gorman	ac8e895	2013-10-07 11:29:03 +0100	[diff] [blame]	1387	p->numa_faults_buffer = p->numa_faults + (2 * nr_node_ids);
Mel Gorman	f809ca9	2013-10-07 11:28:57 +0100	[diff] [blame]	1388	}
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	1389
Mel Gorman	fb003b8	2012-11-15 09:01:14 +0000	[diff] [blame]	1390	/*
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	1391	* First accesses are treated as private, otherwise consider accesses
				1392	* to be private if the accessing pid has not changed
				1393	*/
				1394	if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
				1395	priv = 1;
				1396	} else {
				1397	priv = cpupid_match_pid(p, last_cpupid);
Peter Zijlstra	6688cc0	2013-10-07 11:29:24 +0100	[diff] [blame^]	1398	if (!priv && !(flags & TNF_NO_GROUP))
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	1399	task_numa_group(p, last_cpupid);
				1400	}
				1401
				1402	/*
Mel Gorman	b8593bf	2012-11-21 01:18:23 +0000	[diff] [blame]	1403	* If pages are properly placed (did not migrate) then scan slower.
				1404	* This is reset periodically in case of phase changes
Mel Gorman	fb003b8	2012-11-15 09:01:14 +0000	[diff] [blame]	1405	*/
Mel Gorman	598f0ec	2013-10-07 11:28:55 +0100	[diff] [blame]	1406	if (!migrated) {
				1407	/* Initialise if necessary */
				1408	if (!p->numa_scan_period_max)
				1409	p->numa_scan_period_max = task_scan_max(p);
				1410
				1411	p->numa_scan_period = min(p->numa_scan_period_max,
				1412	p->numa_scan_period + 10);
				1413	}
Mel Gorman	fb003b8	2012-11-15 09:01:14 +0000	[diff] [blame]	1414
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	1415	task_numa_placement(p);
Mel Gorman	f809ca9	2013-10-07 11:28:57 +0100	[diff] [blame]	1416
Mel Gorman	6b9a746	2013-10-07 11:29:11 +0100	[diff] [blame]	1417	/* Retry task to preferred node migration if it previously failed */
				1418	if (p->numa_migrate_retry && time_after(jiffies, p->numa_migrate_retry))
				1419	numa_migrate_preferred(p);
				1420
Mel Gorman	ac8e895	2013-10-07 11:29:03 +0100	[diff] [blame]	1421	p->numa_faults_buffer[task_faults_idx(node, priv)] += pages;
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	1422	}
				1423
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	1424	static void reset_ptenuma_scan(struct task_struct *p)
				1425	{
				1426	ACCESS_ONCE(p->mm->numa_scan_seq)++;
				1427	p->mm->numa_scan_offset = 0;
				1428	}
				1429
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	1430	/*
				1431	* The expensive part of numa migration is done from task_work context.
				1432	* Triggered from task_tick_numa().
				1433	*/
				1434	void task_numa_work(struct callback_head *work)
				1435	{
				1436	unsigned long migrate, next_scan, now = jiffies;
				1437	struct task_struct *p = current;
				1438	struct mm_struct *mm = p->mm;
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	1439	struct vm_area_struct *vma;
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	1440	unsigned long start, end;
Mel Gorman	598f0ec	2013-10-07 11:28:55 +0100	[diff] [blame]	1441	unsigned long nr_pte_updates = 0;
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	1442	long pages;
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	1443
				1444	WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
				1445
				1446	work->next = work; /* protect against double add */
				1447	/*
				1448	* Who cares about NUMA placement when they're dying.
				1449	*
				1450	* NOTE: make sure not to dereference p->mm before this check,
				1451	* exit_task_work() happens _after_ exit_mm() so we could be called
				1452	* without p->mm even though we still had it when we enqueued this
				1453	* work.
				1454	*/
				1455	if (p->flags & PF_EXITING)
				1456	return;
				1457
Mel Gorman	7e8d16b	2013-10-07 11:28:54 +0100	[diff] [blame]	1458	if (!mm->numa_next_reset \|\| !mm->numa_next_scan) {
				1459	mm->numa_next_scan = now +
				1460	msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
				1461	mm->numa_next_reset = now +
				1462	msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset);
				1463	}
				1464
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	1465	/*
Mel Gorman	b8593bf	2012-11-21 01:18:23 +0000	[diff] [blame]	1466	* Reset the scan period if enough time has gone by. Objective is that
				1467	* scanning will be reduced if pages are properly placed. As tasks
				1468	* can enter different phases this needs to be re-examined. Lacking
				1469	* proper tracking of reference behaviour, this blunt hammer is used.
				1470	*/
				1471	migrate = mm->numa_next_reset;
				1472	if (time_after(now, migrate)) {
Mel Gorman	598f0ec	2013-10-07 11:28:55 +0100	[diff] [blame]	1473	p->numa_scan_period = task_scan_min(p);
Mel Gorman	b8593bf	2012-11-21 01:18:23 +0000	[diff] [blame]	1474	next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset);
				1475	xchg(&mm->numa_next_reset, next_scan);
				1476	}
				1477
				1478	/*
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	1479	* Enforce maximal scan/migration frequency..
				1480	*/
				1481	migrate = mm->numa_next_scan;
				1482	if (time_before(now, migrate))
				1483	return;
				1484
Mel Gorman	598f0ec	2013-10-07 11:28:55 +0100	[diff] [blame]	1485	if (p->numa_scan_period == 0) {
				1486	p->numa_scan_period_max = task_scan_max(p);
				1487	p->numa_scan_period = task_scan_min(p);
				1488	}
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	1489
Mel Gorman	fb003b8	2012-11-15 09:01:14 +0000	[diff] [blame]	1490	next_scan = now + msecs_to_jiffies(p->numa_scan_period);
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	1491	if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
				1492	return;
				1493
Mel Gorman	e14808b	2012-11-19 10:59:15 +0000	[diff] [blame]	1494	/*
Peter Zijlstra	19a78d1	2013-10-07 11:28:51 +0100	[diff] [blame]	1495	* Delay this task enough that another task of this mm will likely win
				1496	* the next time around.
				1497	*/
				1498	p->node_stamp += 2 * TICK_NSEC;
				1499
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	1500	start = mm->numa_scan_offset;
				1501	pages = sysctl_numa_balancing_scan_size;
				1502	pages <<= 20 - PAGE_SHIFT; /* MB in pages */
				1503	if (!pages)
				1504	return;
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	1505
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	1506	down_read(&mm->mmap_sem);
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	1507	vma = find_vma(mm, start);
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	1508	if (!vma) {
				1509	reset_ptenuma_scan(p);
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	1510	start = 0;
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	1511	vma = mm->mmap;
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	1512	}
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	1513	for (; vma; vma = vma->vm_next) {
Mel Gorman	fc314724	2013-10-07 11:29:09 +0100	[diff] [blame]	1514	if (!vma_migratable(vma) \|\| !vma_policy_mof(p, vma))
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	1515	continue;
				1516
Mel Gorman	4591ce4f	2013-10-07 11:29:13 +0100	[diff] [blame]	1517	/*
				1518	* Shared library pages mapped by multiple processes are not
				1519	* migrated as it is expected they are cache replicated. Avoid
				1520	* hinting faults in read-only file-backed mappings or the vdso
				1521	* as migrating the pages will be of marginal benefit.
				1522	*/
				1523	if (!vma->vm_mm \|\|
				1524	(vma->vm_file && (vma->vm_flags & (VM_READ\|VM_WRITE)) == (VM_READ)))
				1525	continue;
				1526
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	1527	do {
				1528	start = max(start, vma->vm_start);
				1529	end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
				1530	end = min(end, vma->vm_end);
Mel Gorman	598f0ec	2013-10-07 11:28:55 +0100	[diff] [blame]	1531	nr_pte_updates += change_prot_numa(vma, start, end);
				1532
				1533	/*
				1534	* Scan sysctl_numa_balancing_scan_size but ensure that
				1535	* at least one PTE is updated so that unused virtual
				1536	* address space is quickly skipped.
				1537	*/
				1538	if (nr_pte_updates)
				1539	pages -= (end - start) >> PAGE_SHIFT;
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	1540
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	1541	start = end;
				1542	if (pages <= 0)
				1543	goto out;
				1544	} while (end != vma->vm_end);
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	1545	}
				1546
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	1547	out:
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	1548	/*
Mel Gorman	f307cd1	2013-10-07 11:28:56 +0100	[diff] [blame]	1549	* If the whole process was scanned without updates then no NUMA
				1550	* hinting faults are being recorded and scan rate should be lower.
				1551	*/
				1552	if (mm->numa_scan_offset == 0 && !nr_pte_updates) {
				1553	p->numa_scan_period = min(p->numa_scan_period_max,
				1554	p->numa_scan_period << 1);
				1555
				1556	next_scan = now + msecs_to_jiffies(p->numa_scan_period);
				1557	mm->numa_next_scan = next_scan;
				1558	}
				1559
				1560	/*
Peter Zijlstra	c69307d	2013-10-07 11:28:41 +0100	[diff] [blame]	1561	* It is possible to reach the end of the VMA list but the last few
				1562	* VMAs are not guaranteed to the vma_migratable. If they are not, we
				1563	* would find the !migratable VMA on the next scan but not reset the
				1564	* scanner to the start so check it now.
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	1565	*/
				1566	if (vma)
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	1567	mm->numa_scan_offset = start;
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	1568	else
				1569	reset_ptenuma_scan(p);
				1570	up_read(&mm->mmap_sem);
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	1571	}
				1572
				1573	/*
				1574	* Drive the periodic memory faults..
				1575	*/
				1576	void task_tick_numa(struct rq rq, struct task_struct curr)
				1577	{
				1578	struct callback_head *work = &curr->numa_work;
				1579	u64 period, now;
				1580
				1581	/*
				1582	* We don't care about NUMA placement if we don't have memory.
				1583	*/
				1584	if (!curr->mm \|\| (curr->flags & PF_EXITING) \|\| work->next != work)
				1585	return;
				1586
				1587	/*
				1588	* Using runtime rather than walltime has the dual advantage that
				1589	* we (mostly) drive the selection from busy threads and that the
				1590	* task needs to have done some actual work before we bother with
				1591	* NUMA placement.
				1592	*/
				1593	now = curr->se.sum_exec_runtime;
				1594	period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
				1595
				1596	if (now - curr->node_stamp > period) {
Peter Zijlstra	4b96a29	2012-10-25 14:16:47 +0200	[diff] [blame]	1597	if (!curr->node_stamp)
Mel Gorman	598f0ec	2013-10-07 11:28:55 +0100	[diff] [blame]	1598	curr->numa_scan_period = task_scan_min(curr);
Peter Zijlstra	19a78d1	2013-10-07 11:28:51 +0100	[diff] [blame]	1599	curr->node_stamp += period;
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	1600
				1601	if (!time_before(jiffies, curr->mm->numa_next_scan)) {
				1602	init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
				1603	task_work_add(curr, work, true);
				1604	}
				1605	}
				1606	}
				1607	#else
				/* NUMA balancing is compiled out: the tick hook does nothing. */
				static void task_tick_numa(struct rq *rq, struct task_struct *curr)
				{
				}
				1611	#endif /* CONFIG_NUMA_BALANCING */
				1612
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	1613	static void
				1614	account_entity_enqueue(struct cfs_rq cfs_rq, struct sched_entity se)
				1615	{
				1616	update_load_add(&cfs_rq->load, se->load.weight);
Peter Zijlstra	c09595f	2008-06-27 13:41:14 +0200	[diff] [blame]	1617	if (!parent_entity(se))
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	1618	update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	1619	#ifdef CONFIG_SMP
				1620	if (entity_is_task(se))
Peter Zijlstra	eb95308	2012-04-17 13:38:40 +0200	[diff] [blame]	1621	list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks);
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	1622	#endif
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	1623	cfs_rq->nr_running++;
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	1624	}
				1625
				1626	static void
				1627	account_entity_dequeue(struct cfs_rq cfs_rq, struct sched_entity se)
				1628	{
				1629	update_load_sub(&cfs_rq->load, se->load.weight);
Peter Zijlstra	c09595f	2008-06-27 13:41:14 +0200	[diff] [blame]	1630	if (!parent_entity(se))
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	1631	update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	1632	if (entity_is_task(se))
Bharata B Rao	b87f172	2008-09-25 09:53:54 +0530	[diff] [blame]	1633	list_del_init(&se->group_node);
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	1634	cfs_rq->nr_running--;
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	1635	}
				1636
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	1637	#ifdef CONFIG_FAIR_GROUP_SCHED
				1638	# ifdef CONFIG_SMP
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	1639	static inline long calc_tg_weight(struct task_group tg, struct cfs_rq cfs_rq)
				1640	{
				1641	long tg_weight;
				1642
				1643	/*
				1644	* Use this CPU's actual weight instead of the last load_contribution
				1645	* to gain a more accurate current total weight. See
				1646	* update_cfs_rq_load_contribution().
				1647	*/
Alex Shi	bf5b986	2013-06-20 10:18:54 +0800	[diff] [blame]	1648	tg_weight = atomic_long_read(&tg->load_avg);
Paul Turner	8295836	2012-10-04 13:18:31 +0200	[diff] [blame]	1649	tg_weight -= cfs_rq->tg_load_contrib;
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	1650	tg_weight += cfs_rq->load.weight;
				1651
				1652	return tg_weight;
				1653	}
				1654
Paul Turner	6d5ab29	2011-01-21 20:45:01 -0800	[diff] [blame]	1655	static long calc_cfs_shares(struct cfs_rq cfs_rq, struct task_group tg)
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	1656	{
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	1657	long tg_weight, load, shares;
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	1658
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	1659	tg_weight = calc_tg_weight(tg, cfs_rq);
Paul Turner	6d5ab29	2011-01-21 20:45:01 -0800	[diff] [blame]	1660	load = cfs_rq->load.weight;
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	1661
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	1662	shares = (tg->shares * load);
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	1663	if (tg_weight)
				1664	shares /= tg_weight;
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	1665
				1666	if (shares < MIN_SHARES)
				1667	shares = MIN_SHARES;
				1668	if (shares > tg->shares)
				1669	shares = tg->shares;
				1670
				1671	return shares;
				1672	}
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	1673	# else /* CONFIG_SMP */
Paul Turner	6d5ab29	2011-01-21 20:45:01 -0800	[diff] [blame]	1674	static inline long calc_cfs_shares(struct cfs_rq cfs_rq, struct task_group tg)
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	1675	{
				1676	return tg->shares;
				1677	}
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	1678	# endif /* CONFIG_SMP */
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	1679	static void reweight_entity(struct cfs_rq cfs_rq, struct sched_entity se,
				1680	unsigned long weight)
				1681	{
Paul Turner	19e5eeb	2010-12-15 19:10:18 -0800	[diff] [blame]	1682	if (se->on_rq) {
				1683	/* commit outstanding execution time */
				1684	if (cfs_rq->curr == se)
				1685	update_curr(cfs_rq);
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	1686	account_entity_dequeue(cfs_rq, se);
Paul Turner	19e5eeb	2010-12-15 19:10:18 -0800	[diff] [blame]	1687	}
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	1688
				1689	update_load_set(&se->load, weight);
				1690
				1691	if (se->on_rq)
				1692	account_entity_enqueue(cfs_rq, se);
				1693	}
				1694
Paul Turner	8295836	2012-10-04 13:18:31 +0200	[diff] [blame]	1695	static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
				1696
Paul Turner	6d5ab29	2011-01-21 20:45:01 -0800	[diff] [blame]	1697	static void update_cfs_shares(struct cfs_rq *cfs_rq)
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	1698	{
				1699	struct task_group *tg;
				1700	struct sched_entity *se;
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	1701	long shares;
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	1702
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	1703	tg = cfs_rq->tg;
				1704	se = tg->se[cpu_of(rq_of(cfs_rq))];
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	1705	if (!se \|\| throttled_hierarchy(cfs_rq))
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	1706	return;
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	1707	#ifndef CONFIG_SMP
				1708	if (likely(se->load.weight == tg->shares))
				1709	return;
				1710	#endif
Paul Turner	6d5ab29	2011-01-21 20:45:01 -0800	[diff] [blame]	1711	shares = calc_cfs_shares(cfs_rq, tg);
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	1712
				1713	reweight_entity(cfs_rq_of(se), se, shares);
				1714	}
				1715	#else /* CONFIG_FAIR_GROUP_SCHED */
/* Without CONFIG_FAIR_GROUP_SCHED there are no group shares to update. */
static inline void update_cfs_shares(struct cfs_rq *cfs_rq) {}
				1719	#endif /* CONFIG_FAIR_GROUP_SCHED */
				1720
Alex Shi	141965c	2013-06-26 13:05:39 +0800	[diff] [blame]	1721	#ifdef CONFIG_SMP
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	1722	/*
Paul Turner	5b51f2f	2012-10-04 13:18:32 +0200	[diff] [blame]	1723	* We choose a half-life close to 1 scheduling period.
				1724	* Note: The tables below are dependent on this value.
				1725	*/
				1726	#define LOAD_AVG_PERIOD 32
				1727	#define LOAD_AVG_MAX 47742 /* maximum possible load avg */
				1728	#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_MAX_AVG */
				1729
				1730	/* Precomputed fixed inverse multiplies for multiplication by y^n */
				1731	static const u32 runnable_avg_yN_inv[] = {
				1732	0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
				1733	0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
				1734	0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
				1735	0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
				1736	0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
				1737	0x85aac367, 0x82cd8698,
				1738	};
				1739
				1740	/*
				1741	* Precomputed \Sum y^k { 1<=k<=n }. These are floor(true_value) to prevent
				1742	* over-estimates when re-combining.
				1743	*/
				1744	static const u32 runnable_avg_yN_sum[] = {
				1745	0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
				1746	9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
				1747	17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
				1748	};
				1749
				1750	/*
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	1751	* Approximate:
				1752	* val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
				1753	*/
				1754	static __always_inline u64 decay_load(u64 val, u64 n)
				1755	{
Paul Turner	5b51f2f	2012-10-04 13:18:32 +0200	[diff] [blame]	1756	unsigned int local_n;
				1757
				1758	if (!n)
				1759	return val;
				1760	else if (unlikely(n > LOAD_AVG_PERIOD * 63))
				1761	return 0;
				1762
				1763	/* after bounds checking we can collapse to 32-bit */
				1764	local_n = n;
				1765
				1766	/*
				1767	* As y^PERIOD = 1/2, we can combine
				1768	* y^n = 1/2^(n/PERIOD) * k^(n%PERIOD)
				1769	* With a look-up table which covers k^n (n<PERIOD)
				1770	*
				1771	* To achieve constant time decay_load.
				1772	*/
				1773	if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
				1774	val >>= local_n / LOAD_AVG_PERIOD;
				1775	local_n %= LOAD_AVG_PERIOD;
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	1776	}
				1777
Paul Turner	5b51f2f	2012-10-04 13:18:32 +0200	[diff] [blame]	1778	val *= runnable_avg_yN_inv[local_n];
				1779	/* We don't use SRR here since we always want to round down. */
				1780	return val >> 32;
				1781	}
				1782
				1783	/*
				1784	* For updates fully spanning n periods, the contribution to runnable
				1785	* average will be: \Sum 1024*y^n
				1786	*
				1787	* We can compute this reasonably efficiently by combining:
				1788	* y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD}
				1789	*/
				1790	static u32 __compute_runnable_contrib(u64 n)
				1791	{
				1792	u32 contrib = 0;
				1793
				1794	if (likely(n <= LOAD_AVG_PERIOD))
				1795	return runnable_avg_yN_sum[n];
				1796	else if (unlikely(n >= LOAD_AVG_MAX_N))
				1797	return LOAD_AVG_MAX;
				1798
				1799	/* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */
				1800	do {
				1801	contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */
				1802	contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD];
				1803
				1804	n -= LOAD_AVG_PERIOD;
				1805	} while (n > LOAD_AVG_PERIOD);
				1806
				1807	contrib = decay_load(contrib, n);
				1808	return contrib + runnable_avg_yN_sum[n];
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	1809	}
				1810
				1811	/*
				1812	* We can represent the historical contribution to runnable average as the
				1813	* coefficients of a geometric series. To do this we sub-divide our runnable
				1814	* history into segments of approximately 1ms (1024us); label the segment that
				1815	* occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
				1816	*
				1817	* [<- 1024us ->\|<- 1024us ->\|<- 1024us ->\| ...
				1818	* p0 p1 p2
				1819	* (now) (~1ms ago) (~2ms ago)
				1820	*
				1821	* Let u_i denote the fraction of p_i that the entity was runnable.
				1822	*
				1823	* We then designate the fractions u_i as our co-efficients, yielding the
				1824	* following representation of historical load:
				1825	* u_0 + u_1y + u_2y^2 + u_3*y^3 + ...
				1826	*
				1827	* We choose y based on the with of a reasonably scheduling period, fixing:
				1828	* y^32 = 0.5
				1829	*
				1830	* This means that the contribution to load ~32ms ago (u_32) will be weighted
				1831	* approximately half as much as the contribution to load within the last ms
				1832	* (u_0).
				1833	*
				1834	* When a period "rolls over" and we have new u_0`, multiplying the previous
				1835	* sum again by y is sufficient to update:
				1836	* load_avg = u_0` + y(u_0 + u_1y + u_2*y^2 + ... )
				1837	* = u_0 + u_1y + u_2y^2 + ... [re-labeling u_i --> u_{i+1}]
				1838	*/
				1839	static __always_inline int __update_entity_runnable_avg(u64 now,
				1840	struct sched_avg *sa,
				1841	int runnable)
				1842	{
Paul Turner	5b51f2f	2012-10-04 13:18:32 +0200	[diff] [blame]	1843	u64 delta, periods;
				1844	u32 runnable_contrib;
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	1845	int delta_w, decayed = 0;
				1846
				1847	delta = now - sa->last_runnable_update;
				1848	/*
				1849	* This should only happen when time goes backwards, which it
				1850	* unfortunately does during sched clock init when we swap over to TSC.
				1851	*/
				1852	if ((s64)delta < 0) {
				1853	sa->last_runnable_update = now;
				1854	return 0;
				1855	}
				1856
				1857	/*
				1858	* Use 1024ns as the unit of measurement since it's a reasonable
				1859	* approximation of 1us and fast to compute.
				1860	*/
				1861	delta >>= 10;
				1862	if (!delta)
				1863	return 0;
				1864	sa->last_runnable_update = now;
				1865
				1866	/* delta_w is the amount already accumulated against our next period */
				1867	delta_w = sa->runnable_avg_period % 1024;
				1868	if (delta + delta_w >= 1024) {
				1869	/* period roll-over */
				1870	decayed = 1;
				1871
				1872	/*
				1873	* Now that we know we're crossing a period boundary, figure
				1874	* out how much from delta we need to complete the current
				1875	* period and accrue it.
				1876	*/
				1877	delta_w = 1024 - delta_w;
Paul Turner	5b51f2f	2012-10-04 13:18:32 +0200	[diff] [blame]	1878	if (runnable)
				1879	sa->runnable_avg_sum += delta_w;
				1880	sa->runnable_avg_period += delta_w;
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	1881
Paul Turner	5b51f2f	2012-10-04 13:18:32 +0200	[diff] [blame]	1882	delta -= delta_w;
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	1883
Paul Turner	5b51f2f	2012-10-04 13:18:32 +0200	[diff] [blame]	1884	/* Figure out how many additional periods this update spans */
				1885	periods = delta / 1024;
				1886	delta %= 1024;
				1887
				1888	sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum,
				1889	periods + 1);
				1890	sa->runnable_avg_period = decay_load(sa->runnable_avg_period,
				1891	periods + 1);
				1892
				1893	/* Efficiently calculate \sum (1..n_period) 1024y^i /
				1894	runnable_contrib = __compute_runnable_contrib(periods);
				1895	if (runnable)
				1896	sa->runnable_avg_sum += runnable_contrib;
				1897	sa->runnable_avg_period += runnable_contrib;
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	1898	}
				1899
				1900	/* Remainder of delta accrued against u_0` */
				1901	if (runnable)
				1902	sa->runnable_avg_sum += delta;
				1903	sa->runnable_avg_period += delta;
				1904
				1905	return decayed;
				1906	}
				1907
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1908	/* Synchronize an entity's decay with its parenting cfs_rq.*/
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	1909	static inline u64 __synchronize_entity_decay(struct sched_entity *se)
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1910	{
				1911	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				1912	u64 decays = atomic64_read(&cfs_rq->decay_counter);
				1913
				1914	decays -= se->avg.decay_count;
				1915	if (!decays)
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	1916	return 0;
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1917
				1918	se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
				1919	se->avg.decay_count = 0;
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	1920
				1921	return decays;
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1922	}
				1923
Paul Turner	c566e8e	2012-10-04 13:18:30 +0200	[diff] [blame]	1924	#ifdef CONFIG_FAIR_GROUP_SCHED
				1925	static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
				1926	int force_update)
				1927	{
				1928	struct task_group *tg = cfs_rq->tg;
Alex Shi	bf5b986	2013-06-20 10:18:54 +0800	[diff] [blame]	1929	long tg_contrib;
Paul Turner	c566e8e	2012-10-04 13:18:30 +0200	[diff] [blame]	1930
				1931	tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;
				1932	tg_contrib -= cfs_rq->tg_load_contrib;
				1933
Alex Shi	bf5b986	2013-06-20 10:18:54 +0800	[diff] [blame]	1934	if (force_update \|\| abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
				1935	atomic_long_add(tg_contrib, &tg->load_avg);
Paul Turner	c566e8e	2012-10-04 13:18:30 +0200	[diff] [blame]	1936	cfs_rq->tg_load_contrib += tg_contrib;
				1937	}
				1938	}
Paul Turner	8165e14	2012-10-04 13:18:31 +0200	[diff] [blame]	1939
Paul Turner	bb17f65	2012-10-04 13:18:31 +0200	[diff] [blame]	1940	/*
				1941	* Aggregate cfs_rq runnable averages into an equivalent task_group
				1942	* representation for computing load contributions.
				1943	*/
				1944	static inline void __update_tg_runnable_avg(struct sched_avg *sa,
				1945	struct cfs_rq *cfs_rq)
				1946	{
				1947	struct task_group *tg = cfs_rq->tg;
				1948	long contrib;
				1949
				1950	/* The fraction of a cpu used by this cfs_rq */
				1951	contrib = div_u64(sa->runnable_avg_sum << NICE_0_SHIFT,
				1952	sa->runnable_avg_period + 1);
				1953	contrib -= cfs_rq->tg_runnable_contrib;
				1954
				1955	if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) {
				1956	atomic_add(contrib, &tg->runnable_avg);
				1957	cfs_rq->tg_runnable_contrib += contrib;
				1958	}
				1959	}
				1960
Paul Turner	8165e14	2012-10-04 13:18:31 +0200	[diff] [blame]	1961	static inline void __update_group_entity_contrib(struct sched_entity *se)
				1962	{
				1963	struct cfs_rq *cfs_rq = group_cfs_rq(se);
				1964	struct task_group *tg = cfs_rq->tg;
Paul Turner	bb17f65	2012-10-04 13:18:31 +0200	[diff] [blame]	1965	int runnable_avg;
				1966
Paul Turner	8165e14	2012-10-04 13:18:31 +0200	[diff] [blame]	1967	u64 contrib;
				1968
				1969	contrib = cfs_rq->tg_load_contrib * tg->shares;
Alex Shi	bf5b986	2013-06-20 10:18:54 +0800	[diff] [blame]	1970	se->avg.load_avg_contrib = div_u64(contrib,
				1971	atomic_long_read(&tg->load_avg) + 1);
Paul Turner	bb17f65	2012-10-04 13:18:31 +0200	[diff] [blame]	1972
				1973	/*
				1974	* For group entities we need to compute a correction term in the case
				1975	* that they are consuming <1 cpu so that we would contribute the same
				1976	* load as a task of equal weight.
				1977	*
				1978	* Explicitly co-ordinating this measurement would be expensive, but
				1979	* fortunately the sum of each cpus contribution forms a usable
				1980	* lower-bound on the true value.
				1981	*
				1982	* Consider the aggregate of 2 contributions. Either they are disjoint
				1983	* (and the sum represents true value) or they are disjoint and we are
				1984	* understating by the aggregate of their overlap.
				1985	*
				1986	* Extending this to N cpus, for a given overlap, the maximum amount we
				1987	* understand is then n_i(n_i+1)/2 * w_i where n_i is the number of
				1988	* cpus that overlap for this interval and w_i is the interval width.
				1989	*
				1990	* On a small machine; the first term is well-bounded which bounds the
				1991	* total error since w_i is a subset of the period. Whereas on a
				1992	* larger machine, while this first term can be larger, if w_i is the
				1993	* of consequential size guaranteed to see n_i*w_i quickly converge to
				1994	* our upper bound of 1-cpu.
				1995	*/
				1996	runnable_avg = atomic_read(&tg->runnable_avg);
				1997	if (runnable_avg < NICE_0_LOAD) {
				1998	se->avg.load_avg_contrib *= runnable_avg;
				1999	se->avg.load_avg_contrib >>= NICE_0_SHIFT;
				2000	}
Paul Turner	8165e14	2012-10-04 13:18:31 +0200	[diff] [blame]	2001	}
Paul Turner	c566e8e	2012-10-04 13:18:30 +0200	[diff] [blame]	2002	#else
				/* Without CONFIG_FAIR_GROUP_SCHED there is no task group state to update. */
				static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
										   int force_update)
				{
				}

				static inline void __update_tg_runnable_avg(struct sched_avg *sa,
									    struct cfs_rq *cfs_rq)
				{
				}

				static inline void __update_group_entity_contrib(struct sched_entity *se)
				{
				}
Paul Turner	c566e8e	2012-10-04 13:18:30 +0200	[diff] [blame]	2008	#endif
				2009
Paul Turner	8165e14	2012-10-04 13:18:31 +0200	[diff] [blame]	2010	static inline void __update_task_entity_contrib(struct sched_entity *se)
				2011	{
					/*
					 * Recompute se->avg.load_avg_contrib for a task entity as
					 *   weight * runnable_avg_sum / (runnable_avg_period + 1)
					 * The weight is scaled down before the multiply and the quotient
					 * is scaled back up when stored.
					 */
				2012	u32 contrib;
				2013
				2014	/* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
				2015	contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
					/* NOTE(review): the +1 presumably guards against a zero period */
				2016	contrib /= (se->avg.runnable_avg_period + 1);
				2017	se->avg.load_avg_contrib = scale_load(contrib);
				2018	}
				2019
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	2020	/* Compute the current contribution to load_avg by se, return any delta.
					 *
					 * Task entities are refreshed by __update_task_entity_contrib(). Group
					 * entities first call __update_tg_runnable_avg() on the cfs_rq they own
					 * and are then refreshed by __update_group_entity_contrib().
					 *
					 * Returns the new se->avg.load_avg_contrib minus the value it held on
					 * entry (may be negative).
					 */
				2021	static long __update_entity_load_avg_contrib(struct sched_entity *se)
				2022	{
				2023	long old_contrib = se->avg.load_avg_contrib;
				2024
Paul Turner	8165e14	2012-10-04 13:18:31 +0200	[diff] [blame]	2025	if (entity_is_task(se)) {
				2026	__update_task_entity_contrib(se);
				2027	} else {
Paul Turner	bb17f65	2012-10-04 13:18:31 +0200	[diff] [blame]	2028	__update_tg_runnable_avg(&se->avg, group_cfs_rq(se));
Paul Turner	8165e14	2012-10-04 13:18:31 +0200	[diff] [blame]	2029	__update_group_entity_contrib(se);
				2030	}
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	2031
				2032	return se->avg.load_avg_contrib - old_contrib;
				2033	}
				2034
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	2035	static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq,
				2036	long load_contrib)
				2037	{
					/*
					 * Remove load_contrib from cfs_rq->blocked_load_avg. If the amount
					 * to remove is not strictly smaller than the current value, the
					 * result is clamped to zero instead of being subtracted.
					 */
				2038	if (likely(load_contrib < cfs_rq->blocked_load_avg))
				2039	cfs_rq->blocked_load_avg -= load_contrib;
				2040	else
				2041	cfs_rq->blocked_load_avg = 0;
				2042	}
				2043
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	2044	static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
				2045
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	2046	/* Update a sched_entity's runnable average.
					 *
					 * @se:            entity whose averages are brought up to date
					 * @update_cfs_rq: when non-zero, the resulting change in
					 *                 se->avg.load_avg_contrib is also applied to the
					 *                 cfs_rq that se is queued on: to runnable_load_avg
					 *                 if se->on_rq, otherwise to blocked_load_avg.
					 *
					 * Nothing further happens when __update_entity_runnable_avg() returns 0.
					 */
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	2047	static inline void update_entity_load_avg(struct sched_entity *se,
				2048	int update_cfs_rq)
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	2049	{
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	2050	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				2051	long contrib_delta;
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	2052	u64 now;
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	2053
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	2054	/*
				2055	* For a group entity we need to use their owned cfs_rq_clock_task() in
				2056	* case they are the parent of a throttled hierarchy.
				2057	*/
				2058	if (entity_is_task(se))
				2059	now = cfs_rq_clock_task(cfs_rq);
				2060	else
				2061	now = cfs_rq_clock_task(group_cfs_rq(se));
				2062
				2063	if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq))
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	2064	return;
				2065
				2066	contrib_delta = __update_entity_load_avg_contrib(se);
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	2067
				2068	if (!update_cfs_rq)
				2069	return;
				2070
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	2071	if (se->on_rq)
				2072	cfs_rq->runnable_load_avg += contrib_delta;
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	2073	else
					/* the helper takes the amount to remove, hence the negation */
				2074	subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
				2075	}
				2076
				2077	/*
				2078	* Decay the load contributed by all blocked children and account this so that
				2079	* their contribution may be appropriately discounted when they wake up.
					*
					* @force_update: when non-zero, the removed-load fold and the
					* __update_cfs_rq_tg_load_contrib() call are performed even if no
					* whole decay period has elapsed since cfs_rq->last_decay.
				2080	*/
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	2081	static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	2082	{
					/* decay periods are counted in units of 2^20 clock ticks */
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	2083	u64 now = cfs_rq_clock_task(cfs_rq) >> 20;
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	2084	u64 decays;
				2085
				2086	decays = now - cfs_rq->last_decay;
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	2087	if (!decays && !force_update)
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	2088	return;
				2089
					/* atomically take and reset the pending removed load, then drop it */
Alex Shi	2509940	2013-06-20 10:18:55 +0800	[diff] [blame]	2090	if (atomic_long_read(&cfs_rq->removed_load)) {
				2091	unsigned long removed_load;
				2092	removed_load = atomic_long_xchg(&cfs_rq->removed_load, 0);
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	2093	subtract_blocked_load_contrib(cfs_rq, removed_load);
				2094	}
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	2095
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	2096	if (decays) {
				2097	cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg,
				2098	decays);
					/* decay_counter is what entities snapshot into avg.decay_count */
				2099	atomic64_add(decays, &cfs_rq->decay_counter);
				2100	cfs_rq->last_decay = now;
				2101	}
Paul Turner	c566e8e	2012-10-04 13:18:30 +0200	[diff] [blame]	2102
				2103	__update_cfs_rq_tg_load_contrib(cfs_rq, force_update);
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	2104	}
Ben Segall	18bf280	2012-10-04 12:51:20 +0200	[diff] [blame]	2105
				2106	static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
				2107	{
					/*
					 * Bring rq->avg up to rq_clock_task(rq), passing @runnable through
					 * to __update_entity_runnable_avg(), then hand the result to
					 * __update_tg_runnable_avg() together with the rq's own cfs_rq.
					 */
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	2108	__update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable);
Paul Turner	bb17f65	2012-10-04 13:18:31 +0200	[diff] [blame]	2109	__update_tg_runnable_avg(&rq->avg, &rq->cfs);
Ben Segall	18bf280	2012-10-04 12:51:20 +0200	[diff] [blame]	2110	}
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	2111
				2112	/* Add the load generated by se into cfs_rq's child load-average.
					 *
					 * @wakeup: non-zero when se is enqueued because of a wake-up. It is
					 * cleared internally when se->avg.decay_count <= 0 (treated as a
					 * migration); in that case no blocked load is subtracted and the
					 * cfs_rq blocked-load update below is forced.
					 */
				2113	static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	2114	struct sched_entity *se,
				2115	int wakeup)
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	2116	{
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	2117	/*
				2118	* We track migrations using entity decay_count <= 0, on a wake-up
				2119	* migration we use a negative decay count to track the remote decays
				2120	* accumulated while sleeping.
Alex Shi	a75cdaa	2013-06-20 10:18:47 +0800	[diff] [blame]	2121	*
				2122	* Newly forked tasks are enqueued with se->avg.decay_count == 0, they
				2123	* are seen by enqueue_entity_load_avg() as a migration with an already
				2124	* constructed load_avg_contrib.
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	2125	*/
				2126	if (unlikely(se->avg.decay_count <= 0)) {
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	2127	se->avg.last_runnable_update = rq_clock_task(rq_of(cfs_rq));
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	2128	if (se->avg.decay_count) {
				2129	/*
				2130	* In a wake-up migration we have to approximate the
				2131	* time sleeping. This is because we can't synchronize
				2132	* clock_task between the two cpus, and it is not
				2133	* guaranteed to be read-safe. Instead, we can
				2134	* approximate this using our carried decays, which are
				2135	* explicitly atomically readable.
				2136	*/
				2137	se->avg.last_runnable_update -= (-se->avg.decay_count)
				2138	<< 20;
				2139	update_entity_load_avg(se, 0);
				2140	/* Indicate that we're now synchronized and on-rq */
				2141	se->avg.decay_count = 0;
				2142	}
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	2143	wakeup = 0;
				2144	} else {
Alex Shi	282cf49	2013-06-20 10:18:48 +0800	[diff] [blame]	2145	/*
				2146	* Task re-woke on same cpu (or else migrate_task_rq_fair()
				2147	* would have made count negative); we must be careful to avoid
				2148	* double-accounting blocked time after synchronizing decays.
				2149	*/
				2150	se->avg.last_runnable_update += __synchronize_entity_decay(se)
				2151	<< 20;
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	2152	}
				2153
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	2154	/* migrated tasks did not contribute to our blocked load */
				2155	if (wakeup) {
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	2156	subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	2157	update_entity_load_avg(se, 0);
				2158	}
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	2159
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	2160	cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	2161	/* we force update consideration on load-balancer moves */
				2162	update_cfs_rq_blocked_load(cfs_rq, !wakeup);
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	2163	}
				2164
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	2165	/*
				2166	* Remove se's load from this cfs_rq child load-average, if the entity is
				2167	* transitioning to a blocked state we track its projected decay using
				2168	* blocked_load_avg.
					*
					* @sleep: non-zero when se is going to sleep; its contribution is then
					* moved to blocked_load_avg and the current decay_counter is recorded
					* in se->avg.decay_count. Zero forces the cfs_rq blocked-load update.
				2169	*/
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	2170	static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	2171	struct sched_entity *se,
				2172	int sleep)
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	2173	{
					/* refresh se and let the delta flow into cfs_rq before removing it */
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	2174	update_entity_load_avg(se, 1);
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	2175	/* we force update consideration on load-balancer moves */
				2176	update_cfs_rq_blocked_load(cfs_rq, !sleep);
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	2177
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	2178	cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	2179	if (sleep) {
				2180	cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
				2181	se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
				2182	} /* migrations, e.g. sleep=0 leave decay_count == 0 */
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	2183	}
Vincent Guittot	642dbc3	2013-04-18 18:34:26 +0200	[diff] [blame]	2184
				2185	/*
				2186	* Update the rq's load with the elapsed running time before entering
				2187	* idle. If the last scheduled task is not a CFS task, idle_enter will
				2188	* be the only way to update the runnable statistic.
					*
					* The elapsed time is accounted as runnable (runnable == 1).
				2189	*/
				2190	void idle_enter_fair(struct rq *this_rq)
				2191	{
				2192	update_rq_runnable_avg(this_rq, 1);
				2193	}
				2194
				2195	/*
				2196	* Update the rq's load with the elapsed idle time before a task is
				2197	* scheduled. If the newly scheduled task is not a CFS task, idle_exit will
				2198	* be the only way to update the runnable statistic.
					*
					* The elapsed time is accounted as not runnable (runnable == 0).
				2199	*/
				2200	void idle_exit_fair(struct rq *this_rq)
				2201	{
				2202	update_rq_runnable_avg(this_rq, 0);
				2203	}
				2204
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	2205	#else
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	2206	static inline void update_entity_load_avg(struct sched_entity *se,
				2207	int update_cfs_rq) {} /* empty fallback for the #else configuration */
Ben Segall	18bf280	2012-10-04 12:51:20 +0200	[diff] [blame]	2208	static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {} /* empty fallback */
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	2209	static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	2210	struct sched_entity *se,
				2211	int wakeup) {} /* empty fallback; enqueue_entity() calls it unconditionally */
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	2212	static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	2213	struct sched_entity *se,
				2214	int sleep) {} /* empty fallback; dequeue_entity() calls it unconditionally */
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	2215	static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
				2216	int force_update) {} /* empty fallback */
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	2217	#endif
				2218
Ingo Molnar	2396af6	2007-08-09 11:16:48 +0200	[diff] [blame]	2219	static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2220	{
					/*
					 * Account the time se spent sleeping (sleep_start) and/or blocked
					 * (block_start) now that it is enqueued again, then clear the
					 * start stamps. Both parameters are pointers: the body dereferences
					 * them with ->. Without CONFIG_SCHEDSTATS the body is empty.
					 */
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2221	#ifdef CONFIG_SCHEDSTATS
Peter Zijlstra	e414314	2009-07-23 20:13:26 +0200	[diff] [blame]	2222	struct task_struct *tsk = NULL;
				2223
					/* group entities have no task; per-task accounting is skipped */
				2224	if (entity_is_task(se))
				2225	tsk = task_of(se);
				2226
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	2227	if (se->statistics.sleep_start) {
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	2228	u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2229
					/* clamp a clock that appears to have gone backwards */
				2230	if ((s64)delta < 0)
				2231	delta = 0;
				2232
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	2233	if (unlikely(delta > se->statistics.sleep_max))
				2234	se->statistics.sleep_max = delta;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2235
Peter Zijlstra	8c79a04	2012-01-30 14:51:37 +0100	[diff] [blame]	2236	se->statistics.sleep_start = 0;
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	2237	se->statistics.sum_sleep_runtime += delta;
Arjan van de Ven	9745512	2008-01-25 21:08:34 +0100	[diff] [blame]	2238
Peter Zijlstra	768d0c2	2009-07-23 20:13:26 +0200	[diff] [blame]	2239	if (tsk) {
Peter Zijlstra	e414314	2009-07-23 20:13:26 +0200	[diff] [blame]	2240	account_scheduler_latency(tsk, delta >> 10, 1);
Peter Zijlstra	768d0c2	2009-07-23 20:13:26 +0200	[diff] [blame]	2241	trace_sched_stat_sleep(tsk, delta);
				2242	}
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2243	}
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	2244	if (se->statistics.block_start) {
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	2245	u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2246
				2247	if ((s64)delta < 0)
				2248	delta = 0;
				2249
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	2250	if (unlikely(delta > se->statistics.block_max))
				2251	se->statistics.block_max = delta;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2252
Peter Zijlstra	8c79a04	2012-01-30 14:51:37 +0100	[diff] [blame]	2253	se->statistics.block_start = 0;
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	2254	se->statistics.sum_sleep_runtime += delta;
Ingo Molnar	30084fb	2007-10-02 14:13:08 +0200	[diff] [blame]	2255
Peter Zijlstra	e414314	2009-07-23 20:13:26 +0200	[diff] [blame]	2256	if (tsk) {
Arjan van de Ven	8f0dfc3	2009-07-20 11:26:58 -0700	[diff] [blame]	2257	if (tsk->in_iowait) {
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	2258	se->statistics.iowait_sum += delta;
				2259	se->statistics.iowait_count++;
Peter Zijlstra	768d0c2	2009-07-23 20:13:26 +0200	[diff] [blame]	2260	trace_sched_stat_iowait(tsk, delta);
Arjan van de Ven	8f0dfc3	2009-07-20 11:26:58 -0700	[diff] [blame]	2261	}
				2262
Andrew Vagin	b781a60	2011-11-28 12:03:35 +0300	[diff] [blame]	2263	trace_sched_stat_blocked(tsk, delta);
				2264
Peter Zijlstra	e414314	2009-07-23 20:13:26 +0200	[diff] [blame]	2265	/*
				2266	* Blocking time is in units of nanosecs, so shift by
				2267	* 20 to get a milliseconds-range estimation of the
				2268	* amount of time that the task spent sleeping:
				2269	*/
				2270	if (unlikely(prof_on == SLEEP_PROFILING)) {
				2271	profile_hits(SLEEP_PROFILING,
				2272	(void *)get_wchan(tsk),
				2273	delta >> 20);
				2274	}
				2275	account_scheduler_latency(tsk, delta >> 10, 0);
Ingo Molnar	30084fb	2007-10-02 14:13:08 +0200	[diff] [blame]	2276	}
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2277	}
				2278	#endif
				2279	}
				2280
Peter Zijlstra	ddc9729	2007-10-15 17:00:10 +0200	[diff] [blame]	2281	static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
				2282	{
					/*
					 * Debug aid: bump the nr_spread_over schedstat when se->vruntime is
					 * more than 3 * sysctl_sched_latency away from cfs_rq->min_vruntime
					 * in either direction. Empty without CONFIG_SCHED_DEBUG.
					 */
				2283	#ifdef CONFIG_SCHED_DEBUG
				2284	s64 d = se->vruntime - cfs_rq->min_vruntime;
				2285
					/* only the magnitude of the distance matters */
				2286	if (d < 0)
				2287	d = -d;
				2288
				2289	if (d > 3*sysctl_sched_latency)
				2290	schedstat_inc(cfs_rq, nr_spread_over);
				2291	#endif
				2292	}
				2293
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2294	static void
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	2295	place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
				2296	{
					/*
					 * Pick a vruntime for se relative to cfs_rq->min_vruntime.
					 *
					 * @initial: non-zero applies the START_DEBIT placement (one vslice
					 * past min_vruntime when that feature is on); zero applies the
					 * sleeper credit computed below. se->vruntime is never moved
					 * backwards by this function.
					 */
Peter Zijlstra	1af5f73	2008-10-24 11:06:13 +0200	[diff] [blame]	2297	u64 vruntime = cfs_rq->min_vruntime;
Peter Zijlstra	94dfb5e	2007-10-15 17:00:05 +0200	[diff] [blame]	2298
Peter Zijlstra	2cb8600	2007-11-09 22:39:37 +0100	[diff] [blame]	2299	/*
				2300	* The 'current' period is already promised to the current tasks,
				2301	* however the extra weight of the new task will slow them down a
				2302	* little, place the new task so that it fits in the slot that
				2303	* stays open at the end.
				2304	*/
Peter Zijlstra	94dfb5e	2007-10-15 17:00:05 +0200	[diff] [blame]	2305	if (initial && sched_feat(START_DEBIT))
Peter Zijlstra	f9c0b09	2008-10-17 19:27:04 +0200	[diff] [blame]	2306	vruntime += sched_vslice(cfs_rq, se);
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	2307
Mike Galbraith	a2e7a7e	2009-09-18 09:19:25 +0200	[diff] [blame]	2308	/* sleeps up to a single latency don't count. */
Mike Galbraith	5ca9880	2010-03-11 17:17:17 +0100	[diff] [blame]	2309	if (!initial) {
Mike Galbraith	a2e7a7e	2009-09-18 09:19:25 +0200	[diff] [blame]	2310	unsigned long thresh = sysctl_sched_latency;
Peter Zijlstra	a7be37a	2008-06-27 13:41:11 +0200	[diff] [blame]	2311
Mike Galbraith	a2e7a7e	2009-09-18 09:19:25 +0200	[diff] [blame]	2312	/*
Mike Galbraith	a2e7a7e	2009-09-18 09:19:25 +0200	[diff] [blame]	2313	* Halve their sleep time's effect, to allow
				2314	* for a gentler effect of sleepers:
				2315	*/
				2316	if (sched_feat(GENTLE_FAIR_SLEEPERS))
				2317	thresh >>= 1;
Ingo Molnar	51e0304	2009-09-16 08:54:45 +0200	[diff] [blame]	2318
Mike Galbraith	a2e7a7e	2009-09-18 09:19:25 +0200	[diff] [blame]	2319	vruntime -= thresh;
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	2320	}
				2321
Mike Galbraith	b5d9d73	2009-09-08 11:12:28 +0200	[diff] [blame]	2322	/* ensure we never gain time by being placed backwards. */
Viresh Kumar	16c8f1c	2012-11-08 13:33:46 +0530	[diff] [blame]	2323	se->vruntime = max_vruntime(se->vruntime, vruntime);
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	2324	}
				2325
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	2326	static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
				2327
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	2328	static void
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	2329	enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2330	{
					/*
					 * Put se on cfs_rq: de-normalize its vruntime where needed, update
					 * load tracking and accounting, apply wake-up placement and sleep
					 * statistics, and insert it into the tree unless it is ->curr.
					 *
					 * @flags: ENQUEUE_* bits; ENQUEUE_WAKEUP and ENQUEUE_WAKING are the
					 * ones examined here.
					 */
				2331	/*
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	2332	* Update the normalized vruntime before updating min_vruntime
Kamalesh Babulal	0fc576d	2013-06-27 11:24:18 +0530	[diff] [blame]	2333	* through calling update_curr().
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	2334	*/
Peter Zijlstra	371fd7e	2010-03-24 16:38:48 +0100	[diff] [blame]	2335	if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	2336	se->vruntime += cfs_rq->min_vruntime;
				2337
				2338	/*
Dmitry Adamushko	a2a2d68	2007-10-15 17:00:13 +0200	[diff] [blame]	2339	* Update run-time statistics of the 'current'.
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2340	*/
Ingo Molnar	b7cc089	2007-08-09 11:16:47 +0200	[diff] [blame]	2341	update_curr(cfs_rq);
Paul Turner	f269ae0	2012-10-04 13:18:31 +0200	[diff] [blame]	2342	enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
Linus Torvalds	17bc14b	2012-12-14 07:20:43 -0800	[diff] [blame]	2343	account_entity_enqueue(cfs_rq, se);
				2344	update_cfs_shares(cfs_rq);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2345
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	2346	if (flags & ENQUEUE_WAKEUP) {
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	2347	place_entity(cfs_rq, se, 0);
Ingo Molnar	2396af6	2007-08-09 11:16:48 +0200	[diff] [blame]	2348	enqueue_sleeper(cfs_rq, se);
Ingo Molnar	e9acbff	2007-10-15 17:00:04 +0200	[diff] [blame]	2349	}
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2350
Ingo Molnar	d2417e5	2007-08-09 11:16:47 +0200	[diff] [blame]	2351	update_stats_enqueue(cfs_rq, se);
Peter Zijlstra	ddc9729	2007-10-15 17:00:10 +0200	[diff] [blame]	2352	check_spread(cfs_rq, se);
					/* the running entity is kept out of the tree */
Srivatsa Vaddagiri	83b699e	2007-10-15 17:00:08 +0200	[diff] [blame]	2353	if (se != cfs_rq->curr)
				2354	__enqueue_entity(cfs_rq, se);
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	2355	se->on_rq = 1;
Peter Zijlstra	3d4b47b	2010-11-15 15:47:01 -0800	[diff] [blame]	2356
					/* first runnable entity on this cfs_rq */
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	2357	if (cfs_rq->nr_running == 1) {
Peter Zijlstra	3d4b47b	2010-11-15 15:47:01 -0800	[diff] [blame]	2358	list_add_leaf_cfs_rq(cfs_rq);
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	2359	check_enqueue_throttle(cfs_rq);
				2360	}
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2361	}
				2362
Rik van Riel	2c13c919	2011-02-01 09:48:37 -0500	[diff] [blame]	2363	static void __clear_buddies_last(struct sched_entity *se)
Peter Zijlstra	2002c69	2008-11-11 11:52:33 +0100	[diff] [blame]	2364	{
					/*
					 * Walk from se via for_each_sched_entity(), clearing the 'last'
					 * buddy of each visited entity's cfs_rq while it points at that
					 * entity; stop at the first level where it does not.
					 */
Rik van Riel	2c13c919	2011-02-01 09:48:37 -0500	[diff] [blame]	2365	for_each_sched_entity(se) {
				2366	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				2367	if (cfs_rq->last == se)
				2368	cfs_rq->last = NULL;
				2369	else
				2370	break;
				2371	}
				2372	}
Peter Zijlstra	2002c69	2008-11-11 11:52:33 +0100	[diff] [blame]	2373
Rik van Riel	2c13c919	2011-02-01 09:48:37 -0500	[diff] [blame]	2374	static void __clear_buddies_next(struct sched_entity *se)
				2375	{
					/*
					 * Walk from se via for_each_sched_entity(), clearing the 'next'
					 * buddy of each visited entity's cfs_rq while it points at that
					 * entity; stop at the first level where it does not.
					 */
				2376	for_each_sched_entity(se) {
				2377	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				2378	if (cfs_rq->next == se)
				2379	cfs_rq->next = NULL;
				2380	else
				2381	break;
				2382	}
Peter Zijlstra	2002c69	2008-11-11 11:52:33 +0100	[diff] [blame]	2383	}
				2384
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	2385	static void __clear_buddies_skip(struct sched_entity *se)
				2386	{
					/*
					 * Walk from se via for_each_sched_entity(), clearing the 'skip'
					 * buddy of each visited entity's cfs_rq while it points at that
					 * entity; stop at the first level where it does not.
					 */
				2387	for_each_sched_entity(se) {
				2388	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				2389	if (cfs_rq->skip == se)
				2390	cfs_rq->skip = NULL;
				2391	else
				2392	break;
				2393	}
				2394	}
				2395
Peter Zijlstra	a571bbe	2009-01-28 14:51:40 +0100	[diff] [blame]	2396	static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
				2397	{
					/*
					 * Drop every buddy hint (last, next, skip) on cfs_rq that currently
					 * points at se; each helper then continues through
					 * for_each_sched_entity() while the hint keeps matching.
					 */
Rik van Riel	2c13c919	2011-02-01 09:48:37 -0500	[diff] [blame]	2398	if (cfs_rq->last == se)
				2399	__clear_buddies_last(se);
				2400
				2401	if (cfs_rq->next == se)
				2402	__clear_buddies_next(se);
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	2403
				2404	if (cfs_rq->skip == se)
				2405	__clear_buddies_skip(se);
Peter Zijlstra	a571bbe	2009-01-28 14:51:40 +0100	[diff] [blame]	2406	}
				2407
Peter Zijlstra	6c16a6d	2012-03-21 13:07:16 -0700	[diff] [blame]	2408	static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
Paul Turner	d8b4986	2011-07-21 09:43:41 -0700	[diff] [blame]	2409
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2410	static void
Peter Zijlstra	371fd7e	2010-03-24 16:38:48 +0100	[diff] [blame]	2411	dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2412	{
					/*
					 * Take se off cfs_rq: update load tracking and statistics, record
					 * sleep/block start stamps when going to sleep, drop buddy hints,
					 * remove it from the tree unless it is ->curr, and normalize its
					 * vruntime when this is not a sleep.
					 *
					 * @flags: DEQUEUE_* bits; only DEQUEUE_SLEEP is examined here.
					 */
Dmitry Adamushko	a2a2d68	2007-10-15 17:00:13 +0200	[diff] [blame]	2413	/*
				2414	* Update run-time statistics of the 'current'.
				2415	*/
				2416	update_curr(cfs_rq);
Linus Torvalds	17bc14b	2012-12-14 07:20:43 -0800	[diff] [blame]	2417	dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP);
Dmitry Adamushko	a2a2d68	2007-10-15 17:00:13 +0200	[diff] [blame]	2418
Ingo Molnar	19b6a2e	2007-08-09 11:16:48 +0200	[diff] [blame]	2419	update_stats_dequeue(cfs_rq, se);
Peter Zijlstra	371fd7e	2010-03-24 16:38:48 +0100	[diff] [blame]	2420	if (flags & DEQUEUE_SLEEP) {
Peter Zijlstra	67e9fb2	2007-10-15 17:00:10 +0200	[diff] [blame]	2421	#ifdef CONFIG_SCHEDSTATS
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2422	if (entity_is_task(se)) {
				2423	struct task_struct *tsk = task_of(se);
				2424
					/* these stamps are consumed by enqueue_sleeper() on wake-up */
				2425	if (tsk->state & TASK_INTERRUPTIBLE)
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	2426	se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2427	if (tsk->state & TASK_UNINTERRUPTIBLE)
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	2428	se->statistics.block_start = rq_clock(rq_of(cfs_rq));
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2429	}
Dmitry Adamushko	db36cc7	2007-10-15 17:00:06 +0200	[diff] [blame]	2430	#endif
Peter Zijlstra	67e9fb2	2007-10-15 17:00:10 +0200	[diff] [blame]	2431	}
				2432
Peter Zijlstra	2002c69	2008-11-11 11:52:33 +0100	[diff] [blame]	2433	clear_buddies(cfs_rq, se);
Peter Zijlstra	4793241	2008-11-04 21:25:09 +0100	[diff] [blame]	2434
					/* the running entity is not in the tree */
Srivatsa Vaddagiri	83b699e	2007-10-15 17:00:08 +0200	[diff] [blame]	2435	if (se != cfs_rq->curr)
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	2436	__dequeue_entity(cfs_rq, se);
Linus Torvalds	17bc14b	2012-12-14 07:20:43 -0800	[diff] [blame]	2437	se->on_rq = 0;
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	2438	account_entity_dequeue(cfs_rq, se);
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	2439
				2440	/*
				2441	* Normalize the entity after updating the min_vruntime because the
				2442	* update can refer to the ->curr item and we need to reflect this
				2443	* movement in our normalized position.
				2444	*/
Peter Zijlstra	371fd7e	2010-03-24 16:38:48 +0100	[diff] [blame]	2445	if (!(flags & DEQUEUE_SLEEP))
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	2446	se->vruntime -= cfs_rq->min_vruntime;
Peter Zijlstra	1e87623	2011-05-17 16:21:10 -0700	[diff] [blame]	2447
Paul Turner	d8b4986	2011-07-21 09:43:41 -0700	[diff] [blame]	2448	/* return excess runtime on last dequeue */
				2449	return_cfs_rq_runtime(cfs_rq);
				2450
Peter Zijlstra	1e87623	2011-05-17 16:21:10 -0700	[diff] [blame]	2451	update_min_vruntime(cfs_rq);
Linus Torvalds	17bc14b	2012-12-14 07:20:43 -0800	[diff] [blame]	2452	update_cfs_shares(cfs_rq);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2453	}
				2454
				2455	/*
				2456	* Preempt the current task with a newly woken task if needed:
					*
					* A reschedule is requested when curr has run longer than its ideal
					* slice, or when, after running at least sysctl_sched_min_granularity,
					* its vruntime leads the first entity in the tree by more than the
					* ideal slice.
				2457	*/
Peter Zijlstra	7c92e54	2007-09-05 14:32:49 +0200	[diff] [blame]	2458	static void
Ingo Molnar	2e09bf5	2007-10-15 17:00:05 +0200	[diff] [blame]	2459	check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2460	{
Peter Zijlstra	1169783	2007-09-05 14:32:49 +0200	[diff] [blame]	2461	unsigned long ideal_runtime, delta_exec;
Wang Xingchao	f4cfb33	2011-09-16 13:35:52 -0400	[diff] [blame]	2462	struct sched_entity *se;
				2463	s64 delta;
Peter Zijlstra	1169783	2007-09-05 14:32:49 +0200	[diff] [blame]	2464
Peter Zijlstra	6d0f0eb	2007-10-15 17:00:05 +0200	[diff] [blame]	2465	ideal_runtime = sched_slice(cfs_rq, curr);
					/* time run since set_next_entity() last sampled prev_sum_exec_runtime */
Peter Zijlstra	1169783	2007-09-05 14:32:49 +0200	[diff] [blame]	2466	delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
Mike Galbraith	a9f3e2b	2009-01-28 14:51:39 +0100	[diff] [blame]	2467	if (delta_exec > ideal_runtime) {
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2468	resched_task(rq_of(cfs_rq)->curr);
Mike Galbraith	a9f3e2b	2009-01-28 14:51:39 +0100	[diff] [blame]	2469	/*
				2470	* The current task ran long enough, ensure it doesn't get
				2471	* re-elected due to buddy favours.
				2472	*/
				2473	clear_buddies(cfs_rq, curr);
Mike Galbraith	f685cea	2009-10-23 23:09:22 +0200	[diff] [blame]	2474	return;
				2475	}
				2476
				2477	/*
				2478	* Ensure that a task that missed wakeup preemption by a
				2479	* narrow margin doesn't have to wait for a full slice.
				2480	* This also mitigates buddy induced latencies under load.
				2481	*/
Mike Galbraith	f685cea	2009-10-23 23:09:22 +0200	[diff] [blame]	2482	if (delta_exec < sysctl_sched_min_granularity)
				2483	return;
				2484
Wang Xingchao	f4cfb33	2011-09-16 13:35:52 -0400	[diff] [blame]	2485	se = __pick_first_entity(cfs_rq);
				2486	delta = curr->vruntime - se->vruntime;
Mike Galbraith	f685cea	2009-10-23 23:09:22 +0200	[diff] [blame]	2487
					/* curr is still behind the first queued entity: keep running */
Wang Xingchao	f4cfb33	2011-09-16 13:35:52 -0400	[diff] [blame]	2488	if (delta < 0)
				2489	return;
Mike Galbraith	d7d8294	2011-01-05 05:41:17 +0100	[diff] [blame]	2490
Wang Xingchao	f4cfb33	2011-09-16 13:35:52 -0400	[diff] [blame]	2491	if (delta > ideal_runtime)
				2492	resched_task(rq_of(cfs_rq)->curr);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2493	}
				2494
Srivatsa Vaddagiri	83b699e	2007-10-15 17:00:08 +0200	[diff] [blame]	2495	static void
Ingo Molnar	8494f41	2007-08-09 11:16:48 +0200	[diff] [blame]	2496	set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2497	{
					/*
					 * Make se the running entity of cfs_rq: take it out of the tree if
					 * it is queued, start its run statistics, publish it as ->curr and
					 * snapshot sum_exec_runtime into prev_sum_exec_runtime.
					 */
Srivatsa Vaddagiri	83b699e	2007-10-15 17:00:08 +0200	[diff] [blame]	2498	/* 'current' is not kept within the tree. */
				2499	if (se->on_rq) {
				2500	/*
				2501	* Any task has to be enqueued before it gets to execute on
				2502	* a CPU. So account for the time it spent waiting on the
				2503	* runqueue.
				2504	*/
				2505	update_stats_wait_end(cfs_rq, se);
				2506	__dequeue_entity(cfs_rq, se);
				2507	}
				2508
Ingo Molnar	79303e9	2007-08-09 11:16:47 +0200	[diff] [blame]	2509	update_stats_curr_start(cfs_rq, se);
Ingo Molnar	429d43b	2007-10-15 17:00:03 +0200	[diff] [blame]	2510	cfs_rq->curr = se;
Ingo Molnar	eba1ed4	2007-10-15 17:00:02 +0200	[diff] [blame]	2511	#ifdef CONFIG_SCHEDSTATS
				2512	/*
				2513	* Track our maximum slice length, if the CPU's load is at
				2514	* least twice that of our own weight (i.e. don't track it
				2515	* when there are only lesser-weight tasks around):
				2516	*/
Dmitry Adamushko	495eca4	2007-10-15 17:00:06 +0200	[diff] [blame]	2517	if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	2518	se->statistics.slice_max = max(se->statistics.slice_max,
Ingo Molnar	eba1ed4	2007-10-15 17:00:02 +0200	[diff] [blame]	2519	se->sum_exec_runtime - se->prev_sum_exec_runtime);
				2520	}
				2521	#endif
					/* baseline against which check_preempt_tick() computes delta_exec */
Peter Zijlstra	4a55b45	2007-09-05 14:32:49 +0200	[diff] [blame]	2522	se->prev_sum_exec_runtime = se->sum_exec_runtime;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2523	}
				2524
Peter Zijlstra	3f3a490	2008-10-24 11:06:16 +0200	[diff] [blame]	2525	static int
				2526	wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); /* forward declaration; pick_next_entity() passes entity pointers */
				2527
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	2528	/*
				2529	* Pick the next process, keeping these things in mind, in this order:
				2530	* 1) keep things fair between processes/task groups
				2531	* 2) pick the "next" process, since someone really wants that to run
				2532	* 3) pick the "last" process, for cache locality
				2533	* 4) do not run the "skip" process, if something else is available
					*
					* Returns a pointer to the chosen entity; any buddy hints pointing at
					* it are cleared before returning. The checks below run in the order
					* skip, last, next, so a later match overrides an earlier one.
				2534	*/
Peter Zijlstra	f4b6755	2008-11-04 21:25:07 +0100	[diff] [blame]	2535	static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
Peter Zijlstra	aa2ac25	2008-03-14 21:12:12 +0100	[diff] [blame]	2536	{
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	2537	struct sched_entity *se = __pick_first_entity(cfs_rq);
					/* 'left' stays the fairness reference for every buddy comparison */
Mike Galbraith	f685cea	2009-10-23 23:09:22 +0200	[diff] [blame]	2538	struct sched_entity *left = se;
Peter Zijlstra	f4b6755	2008-11-04 21:25:07 +0100	[diff] [blame]	2539
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	2540	/*
				2541	* Avoid running the skip buddy, if running something else can
				2542	* be done without getting too unfair.
				2543	*/
				2544	if (cfs_rq->skip == se) {
				2545	struct sched_entity *second = __pick_next_entity(se);
				2546	if (second && wakeup_preempt_entity(second, left) < 1)
				2547	se = second;
				2548	}
Peter Zijlstra	aa2ac25	2008-03-14 21:12:12 +0100	[diff] [blame]	2549
Mike Galbraith	f685cea	2009-10-23 23:09:22 +0200	[diff] [blame]	2550	/*
				2551	* Prefer last buddy, try to return the CPU to a preempted task.
				2552	*/
				2553	if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
				2554	se = cfs_rq->last;
				2555
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	2556	/*
				2557	* Someone really wants this to run. If it's not unfair, run it.
				2558	*/
				2559	if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
				2560	se = cfs_rq->next;
				2561
Mike Galbraith	f685cea	2009-10-23 23:09:22 +0200	[diff] [blame]	2562	clear_buddies(cfs_rq, se);
Peter Zijlstra	4793241	2008-11-04 21:25:09 +0100	[diff] [blame]	2563
				2564	return se;
Peter Zijlstra	aa2ac25	2008-03-14 21:12:12 +0100	[diff] [blame]	2565	}
				2566
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	2567	static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
				2568
Ingo Molnar	ab6cde2	2007-08-09 11:16:48 +0200	[diff] [blame]	2569	static void put_prev_entity(struct cfs_rq cfs_rq, struct sched_entity prev)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2570	{
				2571	/*
				2572	* If still on the runqueue then deactivate_task()
				2573	* was not called and update_curr() has to be done:
				2574	*/
				2575	if (prev->on_rq)
Ingo Molnar	b7cc089	2007-08-09 11:16:47 +0200	[diff] [blame]	2576	update_curr(cfs_rq);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2577
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	2578	/* throttle cfs_rqs exceeding runtime */
				2579	check_cfs_rq_runtime(cfs_rq);
				2580
Peter Zijlstra	ddc9729	2007-10-15 17:00:10 +0200	[diff] [blame]	2581	check_spread(cfs_rq, prev);
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	2582	if (prev->on_rq) {
Ingo Molnar	5870db5	2007-08-09 11:16:47 +0200	[diff] [blame]	2583	update_stats_wait_start(cfs_rq, prev);
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	2584	/* Put 'current' back into the tree. */
				2585	__enqueue_entity(cfs_rq, prev);
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	2586	/* in !on_rq case, update occurred at dequeue */
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	2587	update_entity_load_avg(prev, 1);
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	2588	}
Ingo Molnar	429d43b	2007-10-15 17:00:03 +0200	[diff] [blame]	2589	cfs_rq->curr = NULL;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2590	}
				2591
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	2592	static void
				2593	entity_tick(struct cfs_rq cfs_rq, struct sched_entity curr, int queued)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2594	{
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2595	/*
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	2596	* Update run-time statistics of the 'current'.
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2597	*/
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	2598	update_curr(cfs_rq);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2599
Paul Turner	43365bd	2010-12-15 19:10:17 -0800	[diff] [blame]	2600	/*
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	2601	* Ensure that runnable average is periodically updated.
				2602	*/
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	2603	update_entity_load_avg(curr, 1);
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	2604	update_cfs_rq_blocked_load(cfs_rq, 1);
Peter Zijlstra	bf0bd94	2013-07-26 23:48:42 +0200	[diff] [blame]	2605	update_cfs_shares(cfs_rq);
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	2606
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	2607	#ifdef CONFIG_SCHED_HRTICK
				2608	/*
				2609	* queued ticks are scheduled to match the slice, so don't bother
				2610	* validating it and just reschedule.
				2611	*/
Harvey Harrison	983ed7a	2008-04-24 18:17:55 -0700	[diff] [blame]	2612	if (queued) {
				2613	resched_task(rq_of(cfs_rq)->curr);
				2614	return;
				2615	}
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	2616	/*
				2617	* don't let the period tick interfere with the hrtick preemption
				2618	*/
				2619	if (!sched_feat(DOUBLE_TICK) &&
				2620	hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
				2621	return;
				2622	#endif
				2623
Yong Zhang	2c2efae	2011-07-29 16:20:33 +0800	[diff] [blame]	2624	if (cfs_rq->nr_running > 1)
Ingo Molnar	2e09bf5	2007-10-15 17:00:05 +0200	[diff] [blame]	2625	check_preempt_tick(cfs_rq, curr);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2626	}
				2627
Paul Turner	ab84d31	2011-07-21 09:43:28 -0700	[diff] [blame]	2628
				2629	/**************************************************
				2630	* CFS bandwidth control machinery
				2631	*/
				2632
				2633	#ifdef CONFIG_CFS_BANDWIDTH
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	2634
				2635	#ifdef HAVE_JUMP_LABEL
Ingo Molnar	c5905af	2012-02-24 08:31:31 +0100	[diff] [blame]	2636	static struct static_key __cfs_bandwidth_used;
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	2637
				2638	static inline bool cfs_bandwidth_used(void)
				2639	{
Ingo Molnar	c5905af	2012-02-24 08:31:31 +0100	[diff] [blame]	2640	return static_key_false(&__cfs_bandwidth_used);
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	2641	}
				2642
				2643	void account_cfs_bandwidth_used(int enabled, int was_enabled)
				2644	{
				2645	/* only need to count groups transitioning between enabled/!enabled */
				2646	if (enabled && !was_enabled)
Ingo Molnar	c5905af	2012-02-24 08:31:31 +0100	[diff] [blame]	2647	static_key_slow_inc(&__cfs_bandwidth_used);
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	2648	else if (!enabled && was_enabled)
Ingo Molnar	c5905af	2012-02-24 08:31:31 +0100	[diff] [blame]	2649	static_key_slow_dec(&__cfs_bandwidth_used);
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	2650	}
				2651	#else /* HAVE_JUMP_LABEL */
				2652	static bool cfs_bandwidth_used(void)
				2653	{
				2654	return true;
				2655	}
				2656
				2657	void account_cfs_bandwidth_used(int enabled, int was_enabled) {}
				2658	#endif /* HAVE_JUMP_LABEL */
				2659
Paul Turner	ab84d31	2011-07-21 09:43:28 -0700	[diff] [blame]	2660	/*
				2661	* default period for cfs group bandwidth.
				2662	* default: 0.1s, units: nanoseconds
				2663	*/
				2664	static inline u64 default_cfs_period(void)
				2665	{
				2666	return 100000000ULL;
				2667	}
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	2668
				2669	static inline u64 sched_cfs_bandwidth_slice(void)
				2670	{
				2671	return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
				2672	}
				2673
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	2674	/*
				2675	* Replenish runtime according to assigned quota and update expiration time.
				2676	* We use sched_clock_cpu directly instead of rq->clock to avoid adding
				2677	* additional synchronization around rq->lock.
				2678	*
				2679	* requires cfs_b->lock
				2680	*/
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	2681	void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	2682	{
				2683	u64 now;
				2684
				2685	if (cfs_b->quota == RUNTIME_INF)
				2686	return;
				2687
				2688	now = sched_clock_cpu(smp_processor_id());
				2689	cfs_b->runtime = cfs_b->quota;
				2690	cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
				2691	}
				2692
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	2693	static inline struct cfs_bandwidth tg_cfs_bandwidth(struct task_group tg)
				2694	{
				2695	return &tg->cfs_bandwidth;
				2696	}
				2697
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	2698	/* rq->task_clock normalized against any time this cfs_rq has spent throttled */
				2699	static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
				2700	{
				2701	if (unlikely(cfs_rq->throttle_count))
				2702	return cfs_rq->throttled_clock_task;
				2703
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	2704	return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	2705	}
				2706
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	2707	/* returns 0 on failure to allocate runtime */
				2708	static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	2709	{
				2710	struct task_group *tg = cfs_rq->tg;
				2711	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	2712	u64 amount = 0, min_amount, expires;
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	2713
				2714	/* note: this is a positive sum as runtime_remaining <= 0 */
				2715	min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
				2716
				2717	raw_spin_lock(&cfs_b->lock);
				2718	if (cfs_b->quota == RUNTIME_INF)
				2719	amount = min_amount;
Paul Turner	58088ad	2011-07-21 09:43:31 -0700	[diff] [blame]	2720	else {
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	2721	/*
				2722	* If the bandwidth pool has become inactive, then at least one
				2723	* period must have elapsed since the last consumption.
				2724	* Refresh the global state and ensure bandwidth timer becomes
				2725	* active.
				2726	*/
				2727	if (!cfs_b->timer_active) {
				2728	__refill_cfs_bandwidth_runtime(cfs_b);
Paul Turner	58088ad	2011-07-21 09:43:31 -0700	[diff] [blame]	2729	__start_cfs_bandwidth(cfs_b);
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	2730	}
Paul Turner	58088ad	2011-07-21 09:43:31 -0700	[diff] [blame]	2731
				2732	if (cfs_b->runtime > 0) {
				2733	amount = min(cfs_b->runtime, min_amount);
				2734	cfs_b->runtime -= amount;
				2735	cfs_b->idle = 0;
				2736	}
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	2737	}
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	2738	expires = cfs_b->runtime_expires;
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	2739	raw_spin_unlock(&cfs_b->lock);
				2740
				2741	cfs_rq->runtime_remaining += amount;
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	2742	/*
				2743	* we may have advanced our local expiration to account for allowed
				2744	* spread between our sched_clock and the one on which runtime was
				2745	* issued.
				2746	*/
				2747	if ((s64)(expires - cfs_rq->runtime_expires) > 0)
				2748	cfs_rq->runtime_expires = expires;
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	2749
				2750	return cfs_rq->runtime_remaining > 0;
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	2751	}
				2752
				2753	/*
				2754	* Note: This depends on the synchronization provided by sched_clock and the
				2755	* fact that rq->clock snapshots this value.
				2756	*/
				2757	static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
				2758	{
				2759	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	2760
				2761	/* if the deadline is ahead of our clock, nothing to do */
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	2762	if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0))
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	2763	return;
				2764
				2765	if (cfs_rq->runtime_remaining < 0)
				2766	return;
				2767
				2768	/*
				2769	* If the local deadline has passed we have to consider the
				2770	* possibility that our sched_clock is 'fast' and the global deadline
				2771	* has not truly expired.
				2772	*
				2773	* Fortunately we can check determine whether this the case by checking
				2774	* whether the global deadline has advanced.
				2775	*/
				2776
				2777	if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) {
				2778	/* extend local deadline, drift is bounded above by 2 ticks */
				2779	cfs_rq->runtime_expires += TICK_NSEC;
				2780	} else {
				2781	/* global deadline is ahead, expiration has passed */
				2782	cfs_rq->runtime_remaining = 0;
				2783	}
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	2784	}
				2785
				2786	static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
				2787	unsigned long delta_exec)
				2788	{
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	2789	/* dock delta_exec before expiring quota (as it could span periods) */
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	2790	cfs_rq->runtime_remaining -= delta_exec;
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	2791	expire_cfs_rq_runtime(cfs_rq);
				2792
				2793	if (likely(cfs_rq->runtime_remaining > 0))
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	2794	return;
				2795
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	2796	/*
				2797	* if we're unable to extend our runtime we resched so that the active
				2798	* hierarchy can be throttled
				2799	*/
				2800	if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
				2801	resched_task(rq_of(cfs_rq)->curr);
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	2802	}
				2803
Peter Zijlstra	6c16a6d	2012-03-21 13:07:16 -0700	[diff] [blame]	2804	static __always_inline
				2805	void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec)
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	2806	{
Paul Turner	56f570e	2011-11-07 20:26:33 -0800	[diff] [blame]	2807	if (!cfs_bandwidth_used() \|\| !cfs_rq->runtime_enabled)
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	2808	return;
				2809
				2810	__account_cfs_rq_runtime(cfs_rq, delta_exec);
				2811	}
				2812
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	2813	static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
				2814	{
Paul Turner	56f570e	2011-11-07 20:26:33 -0800	[diff] [blame]	2815	return cfs_bandwidth_used() && cfs_rq->throttled;
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	2816	}
				2817
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	2818	/* check whether cfs_rq, or any parent, is throttled */
				2819	static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
				2820	{
Paul Turner	56f570e	2011-11-07 20:26:33 -0800	[diff] [blame]	2821	return cfs_bandwidth_used() && cfs_rq->throttle_count;
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	2822	}
				2823
				2824	/*
				2825	* Ensure that neither of the group entities corresponding to src_cpu or
				2826	* dest_cpu are members of a throttled hierarchy when performing group
				2827	* load-balance operations.
				2828	*/
				2829	static inline int throttled_lb_pair(struct task_group *tg,
				2830	int src_cpu, int dest_cpu)
				2831	{
				2832	struct cfs_rq src_cfs_rq, dest_cfs_rq;
				2833
				2834	src_cfs_rq = tg->cfs_rq[src_cpu];
				2835	dest_cfs_rq = tg->cfs_rq[dest_cpu];
				2836
				2837	return throttled_hierarchy(src_cfs_rq) \|\|
				2838	throttled_hierarchy(dest_cfs_rq);
				2839	}
				2840
				2841	/* updated child weight may affect parent so we have to do this bottom up */
				2842	static int tg_unthrottle_up(struct task_group tg, void data)
				2843	{
				2844	struct rq *rq = data;
				2845	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
				2846
				2847	cfs_rq->throttle_count--;
				2848	#ifdef CONFIG_SMP
				2849	if (!cfs_rq->throttle_count) {
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	2850	/* adjust cfs_rq_clock_task() */
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	2851	cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	2852	cfs_rq->throttled_clock_task;
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	2853	}
				2854	#endif
				2855
				2856	return 0;
				2857	}
				2858
				2859	static int tg_throttle_down(struct task_group tg, void data)
				2860	{
				2861	struct rq *rq = data;
				2862	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
				2863
Paul Turner	8295836	2012-10-04 13:18:31 +0200	[diff] [blame]	2864	/* group is entering throttled state, stop time */
				2865	if (!cfs_rq->throttle_count)
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	2866	cfs_rq->throttled_clock_task = rq_clock_task(rq);
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	2867	cfs_rq->throttle_count++;
				2868
				2869	return 0;
				2870	}
				2871
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	2872	static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	2873	{
				2874	struct rq *rq = rq_of(cfs_rq);
				2875	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
				2876	struct sched_entity *se;
				2877	long task_delta, dequeue = 1;
				2878
				2879	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
				2880
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	2881	/* freeze hierarchy runnable averages while throttled */
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	2882	rcu_read_lock();
				2883	walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
				2884	rcu_read_unlock();
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	2885
				2886	task_delta = cfs_rq->h_nr_running;
				2887	for_each_sched_entity(se) {
				2888	struct cfs_rq *qcfs_rq = cfs_rq_of(se);
				2889	/* throttled entity or throttle-on-deactivate */
				2890	if (!se->on_rq)
				2891	break;
				2892
				2893	if (dequeue)
				2894	dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
				2895	qcfs_rq->h_nr_running -= task_delta;
				2896
				2897	if (qcfs_rq->load.weight)
				2898	dequeue = 0;
				2899	}
				2900
				2901	if (!se)
				2902	rq->nr_running -= task_delta;
				2903
				2904	cfs_rq->throttled = 1;
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	2905	cfs_rq->throttled_clock = rq_clock(rq);
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	2906	raw_spin_lock(&cfs_b->lock);
				2907	list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
				2908	raw_spin_unlock(&cfs_b->lock);
				2909	}
				2910
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	2911	void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	2912	{
				2913	struct rq *rq = rq_of(cfs_rq);
				2914	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
				2915	struct sched_entity *se;
				2916	int enqueue = 1;
				2917	long task_delta;
				2918
Michael Wang	22b958d	2013-06-04 14:23:39 +0800	[diff] [blame]	2919	se = cfs_rq->tg->se[cpu_of(rq)];
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	2920
				2921	cfs_rq->throttled = 0;
Frederic Weisbecker	1a55af2	2013-04-12 01:51:01 +0200	[diff] [blame]	2922
				2923	update_rq_clock(rq);
				2924
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	2925	raw_spin_lock(&cfs_b->lock);
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	2926	cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	2927	list_del_rcu(&cfs_rq->throttled_list);
				2928	raw_spin_unlock(&cfs_b->lock);
				2929
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	2930	/* update hierarchical throttle state */
				2931	walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
				2932
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	2933	if (!cfs_rq->load.weight)
				2934	return;
				2935
				2936	task_delta = cfs_rq->h_nr_running;
				2937	for_each_sched_entity(se) {
				2938	if (se->on_rq)
				2939	enqueue = 0;
				2940
				2941	cfs_rq = cfs_rq_of(se);
				2942	if (enqueue)
				2943	enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
				2944	cfs_rq->h_nr_running += task_delta;
				2945
				2946	if (cfs_rq_throttled(cfs_rq))
				2947	break;
				2948	}
				2949
				2950	if (!se)
				2951	rq->nr_running += task_delta;
				2952
				2953	/* determine whether we need to wake up potentially idle cpu */
				2954	if (rq->curr == rq->idle && rq->cfs.nr_running)
				2955	resched_task(rq->curr);
				2956	}
				2957
				2958	static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
				2959	u64 remaining, u64 expires)
				2960	{
				2961	struct cfs_rq *cfs_rq;
				2962	u64 runtime = remaining;
				2963
				2964	rcu_read_lock();
				2965	list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
				2966	throttled_list) {
				2967	struct rq *rq = rq_of(cfs_rq);
				2968
				2969	raw_spin_lock(&rq->lock);
				2970	if (!cfs_rq_throttled(cfs_rq))
				2971	goto next;
				2972
				2973	runtime = -cfs_rq->runtime_remaining + 1;
				2974	if (runtime > remaining)
				2975	runtime = remaining;
				2976	remaining -= runtime;
				2977
				2978	cfs_rq->runtime_remaining += runtime;
				2979	cfs_rq->runtime_expires = expires;
				2980
				2981	/* we check whether we're throttled above */
				2982	if (cfs_rq->runtime_remaining > 0)
				2983	unthrottle_cfs_rq(cfs_rq);
				2984
				2985	next:
				2986	raw_spin_unlock(&rq->lock);
				2987
				2988	if (!remaining)
				2989	break;
				2990	}
				2991	rcu_read_unlock();
				2992
				2993	return remaining;
				2994	}
				2995
Paul Turner	58088ad	2011-07-21 09:43:31 -0700	[diff] [blame]	2996	/*
				2997	* Responsible for refilling a task_group's bandwidth and unthrottling its
				2998	* cfs_rqs as appropriate. If there has been no activity within the last
				2999	* period the timer is deactivated until scheduling resumes; cfs_b->idle is
				3000	* used to track this state.
				3001	*/
				3002	static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
				3003	{
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	3004	u64 runtime, runtime_expires;
				3005	int idle = 1, throttled;
Paul Turner	58088ad	2011-07-21 09:43:31 -0700	[diff] [blame]	3006
				3007	raw_spin_lock(&cfs_b->lock);
				3008	/* no need to continue the timer with no bandwidth constraint */
				3009	if (cfs_b->quota == RUNTIME_INF)
				3010	goto out_unlock;
				3011
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	3012	throttled = !list_empty(&cfs_b->throttled_cfs_rq);
				3013	/* idle depends on !throttled (for the case of a large deficit) */
				3014	idle = cfs_b->idle && !throttled;
Nikhil Rao	e8da1b1	2011-07-21 09:43:40 -0700	[diff] [blame]	3015	cfs_b->nr_periods += overrun;
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	3016
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	3017	/* if we're going inactive then everything else can be deferred */
				3018	if (idle)
				3019	goto out_unlock;
				3020
				3021	__refill_cfs_bandwidth_runtime(cfs_b);
				3022
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	3023	if (!throttled) {
				3024	/* mark as potentially idle for the upcoming period */
				3025	cfs_b->idle = 1;
				3026	goto out_unlock;
				3027	}
Paul Turner	58088ad	2011-07-21 09:43:31 -0700	[diff] [blame]	3028
Nikhil Rao	e8da1b1	2011-07-21 09:43:40 -0700	[diff] [blame]	3029	/* account preceding periods in which throttling occurred */
				3030	cfs_b->nr_throttled += overrun;
				3031
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	3032	/*
				3033	* There are throttled entities so we must first use the new bandwidth
				3034	* to unthrottle them before making it generally available. This
				3035	* ensures that all existing debts will be paid before a new cfs_rq is
				3036	* allowed to run.
				3037	*/
				3038	runtime = cfs_b->runtime;
				3039	runtime_expires = cfs_b->runtime_expires;
				3040	cfs_b->runtime = 0;
				3041
				3042	/*
				3043	* This check is repeated as we are holding onto the new bandwidth
				3044	* while we unthrottle. This can potentially race with an unthrottled
				3045	* group trying to acquire new bandwidth from the global pool.
				3046	*/
				3047	while (throttled && runtime > 0) {
				3048	raw_spin_unlock(&cfs_b->lock);
				3049	/* we can't nest cfs_b->lock while distributing bandwidth */
				3050	runtime = distribute_cfs_runtime(cfs_b, runtime,
				3051	runtime_expires);
				3052	raw_spin_lock(&cfs_b->lock);
				3053
				3054	throttled = !list_empty(&cfs_b->throttled_cfs_rq);
				3055	}
				3056
				3057	/* return (any) remaining runtime */
				3058	cfs_b->runtime = runtime;
				3059	/*
				3060	* While we are ensured activity in the period following an
				3061	* unthrottle, this also covers the case in which the new bandwidth is
				3062	* insufficient to cover the existing bandwidth deficit. (Forcing the
				3063	* timer to remain active while there are any throttled entities.)
				3064	*/
				3065	cfs_b->idle = 0;
Paul Turner	58088ad	2011-07-21 09:43:31 -0700	[diff] [blame]	3066	out_unlock:
				3067	if (idle)
				3068	cfs_b->timer_active = 0;
				3069	raw_spin_unlock(&cfs_b->lock);
				3070
				3071	return idle;
				3072	}
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	3073
Paul Turner	d8b4986	2011-07-21 09:43:41 -0700	[diff] [blame]	3074	/* a cfs_rq won't donate quota below this amount */
				3075	static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
				3076	/* minimum remaining period time to redistribute slack quota */
				3077	static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
				3078	/* how long we wait to gather additional slack before distributing */
				3079	static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
				3080
				3081	/* are we near the end of the current quota period? */
				3082	static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
				3083	{
				3084	struct hrtimer *refresh_timer = &cfs_b->period_timer;
				3085	u64 remaining;
				3086
				3087	/* if the call-back is running a quota refresh is already occurring */
				3088	if (hrtimer_callback_running(refresh_timer))
				3089	return 1;
				3090
				3091	/* is a quota refresh about to occur? */
				3092	remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
				3093	if (remaining < min_expire)
				3094	return 1;
				3095
				3096	return 0;
				3097	}
				3098
				3099	static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
				3100	{
				3101	u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
				3102
				3103	/* if there's a quota refresh soon don't bother with slack */
				3104	if (runtime_refresh_within(cfs_b, min_left))
				3105	return;
				3106
				3107	start_bandwidth_timer(&cfs_b->slack_timer,
				3108	ns_to_ktime(cfs_bandwidth_slack_period));
				3109	}
				3110
				3111	/* we know any runtime found here is valid as update_curr() precedes return */
				3112	static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
				3113	{
				3114	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
				3115	s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
				3116
				3117	if (slack_runtime <= 0)
				3118	return;
				3119
				3120	raw_spin_lock(&cfs_b->lock);
				3121	if (cfs_b->quota != RUNTIME_INF &&
				3122	cfs_rq->runtime_expires == cfs_b->runtime_expires) {
				3123	cfs_b->runtime += slack_runtime;
				3124
				3125	/* we are under rq->lock, defer unthrottling using a timer */
				3126	if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
				3127	!list_empty(&cfs_b->throttled_cfs_rq))
				3128	start_cfs_slack_bandwidth(cfs_b);
				3129	}
				3130	raw_spin_unlock(&cfs_b->lock);
				3131
				3132	/* even if it's not valid for return we don't want to try again */
				3133	cfs_rq->runtime_remaining -= slack_runtime;
				3134	}
				3135
				3136	static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
				3137	{
Paul Turner	56f570e	2011-11-07 20:26:33 -0800	[diff] [blame]	3138	if (!cfs_bandwidth_used())
				3139	return;
				3140
Paul Turner	fccfdc6	2011-11-07 20:26:34 -0800	[diff] [blame]	3141	if (!cfs_rq->runtime_enabled \|\| cfs_rq->nr_running)
Paul Turner	d8b4986	2011-07-21 09:43:41 -0700	[diff] [blame]	3142	return;
				3143
				3144	__return_cfs_rq_runtime(cfs_rq);
				3145	}
				3146
				3147	/*
				3148	* This is done with a timer (instead of inline with bandwidth return) since
				3149	* it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
				3150	*/
				3151	static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
				3152	{
				3153	u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
				3154	u64 expires;
				3155
				3156	/* confirm we're still not at a refresh boundary */
				3157	if (runtime_refresh_within(cfs_b, min_bandwidth_expiration))
				3158	return;
				3159
				3160	raw_spin_lock(&cfs_b->lock);
				3161	if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
				3162	runtime = cfs_b->runtime;
				3163	cfs_b->runtime = 0;
				3164	}
				3165	expires = cfs_b->runtime_expires;
				3166	raw_spin_unlock(&cfs_b->lock);
				3167
				3168	if (!runtime)
				3169	return;
				3170
				3171	runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
				3172
				3173	raw_spin_lock(&cfs_b->lock);
				3174	if (expires == cfs_b->runtime_expires)
				3175	cfs_b->runtime = runtime;
				3176	raw_spin_unlock(&cfs_b->lock);
				3177	}
				3178
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	3179	/*
				3180	* When a group wakes up we want to make sure that its quota is not already
				3181	* expired/exceeded, otherwise it may be allowed to steal additional ticks of
				3182	* runtime as update_curr() throttling can not not trigger until it's on-rq.
				3183	*/
				3184	static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
				3185	{
Paul Turner	56f570e	2011-11-07 20:26:33 -0800	[diff] [blame]	3186	if (!cfs_bandwidth_used())
				3187	return;
				3188
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	3189	/* an active group must be handled by the update_curr()->put() path */
				3190	if (!cfs_rq->runtime_enabled \|\| cfs_rq->curr)
				3191	return;
				3192
				3193	/* ensure the group is not already throttled */
				3194	if (cfs_rq_throttled(cfs_rq))
				3195	return;
				3196
				3197	/* update runtime allocation */
				3198	account_cfs_rq_runtime(cfs_rq, 0);
				3199	if (cfs_rq->runtime_remaining <= 0)
				3200	throttle_cfs_rq(cfs_rq);
				3201	}
				3202
				3203	/* conditionally throttle active cfs_rq's from put_prev_entity() */
				3204	static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
				3205	{
Paul Turner	56f570e	2011-11-07 20:26:33 -0800	[diff] [blame]	3206	if (!cfs_bandwidth_used())
				3207	return;
				3208
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	3209	if (likely(!cfs_rq->runtime_enabled \|\| cfs_rq->runtime_remaining > 0))
				3210	return;
				3211
				3212	/*
				3213	* it's possible for a throttled entity to be forced into a running
				3214	* state (e.g. set_curr_task), in this case we're finished.
				3215	*/
				3216	if (cfs_rq_throttled(cfs_rq))
				3217	return;
				3218
				3219	throttle_cfs_rq(cfs_rq);
				3220	}
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	3221
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	3222	static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
				3223	{
				3224	struct cfs_bandwidth *cfs_b =
				3225	container_of(timer, struct cfs_bandwidth, slack_timer);
				3226	do_sched_cfs_slack_timer(cfs_b);
				3227
				3228	return HRTIMER_NORESTART;
				3229	}
				3230
				3231	static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
				3232	{
				3233	struct cfs_bandwidth *cfs_b =
				3234	container_of(timer, struct cfs_bandwidth, period_timer);
				3235	ktime_t now;
				3236	int overrun;
				3237	int idle = 0;
				3238
				3239	for (;;) {
				3240	now = hrtimer_cb_get_time(timer);
				3241	overrun = hrtimer_forward(timer, now, cfs_b->period);
				3242
				3243	if (!overrun)
				3244	break;
				3245
				3246	idle = do_sched_cfs_period_timer(cfs_b, overrun);
				3247	}
				3248
				3249	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
				3250	}
				3251
				3252	void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
				3253	{
				3254	raw_spin_lock_init(&cfs_b->lock);
				3255	cfs_b->runtime = 0;
				3256	cfs_b->quota = RUNTIME_INF;
				3257	cfs_b->period = ns_to_ktime(default_cfs_period());
				3258
				3259	INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
				3260	hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
				3261	cfs_b->period_timer.function = sched_cfs_period_timer;
				3262	hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
				3263	cfs_b->slack_timer.function = sched_cfs_slack_timer;
				3264	}
				3265
				3266	static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
				3267	{
				3268	cfs_rq->runtime_enabled = 0;
				3269	INIT_LIST_HEAD(&cfs_rq->throttled_list);
				3270	}
				3271
				3272	/* requires cfs_b->lock, may release to reprogram timer */
				3273	void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
				3274	{
				3275	/*
				3276	* The timer may be active because we're trying to set a new bandwidth
				3277	* period or because we're racing with the tear-down path
				3278	* (timer_active==0 becomes visible before the hrtimer call-back
				3279	* terminates). In either case we ensure that it's re-programmed
				3280	*/
				3281	while (unlikely(hrtimer_active(&cfs_b->period_timer))) {
				3282	raw_spin_unlock(&cfs_b->lock);
				3283	/* ensure cfs_b->lock is available while we wait */
				3284	hrtimer_cancel(&cfs_b->period_timer);
				3285
				3286	raw_spin_lock(&cfs_b->lock);
				3287	/* if someone else restarted the timer then we're done */
				3288	if (cfs_b->timer_active)
				3289	return;
				3290	}
				3291
				3292	cfs_b->timer_active = 1;
				3293	start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
				3294	}
				3295
				3296	static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
				3297	{
				3298	hrtimer_cancel(&cfs_b->period_timer);
				3299	hrtimer_cancel(&cfs_b->slack_timer);
				3300	}
				3301
Arnd Bergmann	38dc334	2013-01-25 14:14:22 +0000	[diff] [blame]	3302	static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	3303	{
				3304	struct cfs_rq *cfs_rq;
				3305
				3306	for_each_leaf_cfs_rq(rq, cfs_rq) {
				3307	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
				3308
				3309	if (!cfs_rq->runtime_enabled)
				3310	continue;
				3311
				3312	/*
				3313	* clock_task is not advancing so we just need to make sure
				3314	* there's some valid quota amount
				3315	*/
				3316	cfs_rq->runtime_remaining = cfs_b->quota;
				3317	if (cfs_rq_throttled(cfs_rq))
				3318	unthrottle_cfs_rq(cfs_rq);
				3319	}
				3320	}
				3321
				3322	#else /* CONFIG_CFS_BANDWIDTH */
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	3323	static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
				3324	{
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	3325	return rq_clock_task(rq_of(cfs_rq));
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	3326	}
				3327
				3328	static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
				3329	unsigned long delta_exec) {}
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	3330	static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
				3331	static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
Peter Zijlstra	6c16a6d	2012-03-21 13:07:16 -0700	[diff] [blame]	3332	static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	3333
				/* !CONFIG_CFS_BANDWIDTH: always reports "not throttled" (0). */
				static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
				{
					return 0;
				}
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	3338
				/* !CONFIG_CFS_BANDWIDTH: always returns 0. */
				static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
				{
					return 0;
				}
				3343
				/* !CONFIG_CFS_BANDWIDTH: always returns 0 for any (src_cpu, dest_cpu). */
				static inline int
				throttled_lb_pair(struct task_group *tg, int src_cpu, int dest_cpu)
				{
					return 0;
				}
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	3349
				/* !CONFIG_CFS_BANDWIDTH: there is no bandwidth state to set up. */
				void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
				{
				}

				#ifdef CONFIG_FAIR_GROUP_SCHED
				/* Only provided with group scheduling; nothing to initialise. */
				static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
				{
				}
				#endif
				3355
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	3356	static inline struct cfs_bandwidth tg_cfs_bandwidth(struct task_group tg)
				3357	{
				3358	return NULL;
				3359	}
				/* !CONFIG_CFS_BANDWIDTH: tear-down and offline unthrottling are no-ops. */
				static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
				{
				}

				static inline void unthrottle_offline_cfs_rqs(struct rq *rq)
				{
				}
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	3362
				3363	#endif /* CONFIG_CFS_BANDWIDTH */
				3364
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3365	/**************************************************
				3366	* CFS operations on tasks:
				3367	*/
				3368
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	3369	#ifdef CONFIG_SCHED_HRTICK
				3370	static void hrtick_start_fair(struct rq rq, struct task_struct p)
				3371	{
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	3372	struct sched_entity *se = &p->se;
				3373	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				3374
				3375	WARN_ON(task_rq(p) != rq);
				3376
Mike Galbraith	b39e66e	2011-11-22 15:20:07 +0100	[diff] [blame]	3377	if (cfs_rq->nr_running > 1) {
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	3378	u64 slice = sched_slice(cfs_rq, se);
				3379	u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
				3380	s64 delta = slice - ran;
				3381
				3382	if (delta < 0) {
				3383	if (rq->curr == p)
				3384	resched_task(p);
				3385	return;
				3386	}
				3387
				3388	/*
				3389	* Don't schedule slices shorter than 10000ns, that just
				3390	* doesn't make sense. Rely on vruntime for fairness.
				3391	*/
Peter Zijlstra	3165651	2008-07-18 18:01:23 +0200	[diff] [blame]	3392	if (rq->curr != p)
Peter Zijlstra	157124c	2008-07-28 11:53:11 +0200	[diff] [blame]	3393	delta = max_t(s64, 10000LL, delta);
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	3394
Peter Zijlstra	3165651	2008-07-18 18:01:23 +0200	[diff] [blame]	3395	hrtick_start(rq, delta);
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	3396	}
				3397	}
Peter Zijlstra	a4c2f00	2008-10-17 19:27:03 +0200	[diff] [blame]	3398
				3399	/*
				3400	* called from enqueue/dequeue and updates the hrtick when the
				3401	* current task is from our class and nr_running is low enough
				3402	* to matter.
				3403	*/
				3404	static void hrtick_update(struct rq *rq)
				3405	{
				3406	struct task_struct *curr = rq->curr;
				3407
Mike Galbraith	b39e66e	2011-11-22 15:20:07 +0100	[diff] [blame]	3408	if (!hrtick_enabled(rq) \|\| curr->sched_class != &fair_sched_class)
Peter Zijlstra	a4c2f00	2008-10-17 19:27:03 +0200	[diff] [blame]	3409	return;
				3410
				3411	if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
				3412	hrtick_start_fair(rq, curr);
				3413	}
Dhaval Giani	55e12e5	2008-06-24 23:39:43 +0530	[diff] [blame]	3414	#else /* !CONFIG_SCHED_HRTICK */
/* !CONFIG_SCHED_HRTICK: there is no high-resolution tick to program. */
static inline void
hrtick_start_fair(struct rq *rq, struct task_struct *p)
{
}
Peter Zijlstra	a4c2f00	2008-10-17 19:27:03 +0200	[diff] [blame]	3419
				/* !CONFIG_SCHED_HRTICK: nothing to update. */
				static inline void hrtick_update(struct rq *rq) { }
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	3423	#endif
				3424
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3425	/*
				3426	* The enqueue_task method is called before nr_running is
				3427	* increased. Here we update the fair scheduling stats and
				3428	* then put the task into the rbtree:
				3429	*/
Thomas Gleixner	ea87bb7	2010-01-20 20:58:57 +0000	[diff] [blame]	3430	static void
Peter Zijlstra	371fd7e	2010-03-24 16:38:48 +0100	[diff] [blame]	3431	enqueue_task_fair(struct rq rq, struct task_struct p, int flags)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3432	{
				3433	struct cfs_rq *cfs_rq;
Peter Zijlstra	62fb185	2008-02-25 17:34:02 +0100	[diff] [blame]	3434	struct sched_entity *se = &p->se;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3435
				3436	for_each_sched_entity(se) {
Peter Zijlstra	62fb185	2008-02-25 17:34:02 +0100	[diff] [blame]	3437	if (se->on_rq)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3438	break;
				3439	cfs_rq = cfs_rq_of(se);
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	3440	enqueue_entity(cfs_rq, se, flags);
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	3441
				3442	/*
				3443	* end evaluation on encountering a throttled cfs_rq
				3444	*
				3445	* note: in the case of encountering a throttled cfs_rq we will
				3446	* post the final h_nr_running increment below.
				3447	*/
				3448	if (cfs_rq_throttled(cfs_rq))
				3449	break;
Paul Turner	953bfcd	2011-07-21 09:43:27 -0700	[diff] [blame]	3450	cfs_rq->h_nr_running++;
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	3451
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	3452	flags = ENQUEUE_WAKEUP;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3453	}
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	3454
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	3455	for_each_sched_entity(se) {
Lin Ming	0f31714	2011-07-22 09:14:31 +0800	[diff] [blame]	3456	cfs_rq = cfs_rq_of(se);
Paul Turner	953bfcd	2011-07-21 09:43:27 -0700	[diff] [blame]	3457	cfs_rq->h_nr_running++;
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	3458
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	3459	if (cfs_rq_throttled(cfs_rq))
				3460	break;
				3461
Linus Torvalds	17bc14b	2012-12-14 07:20:43 -0800	[diff] [blame]	3462	update_cfs_shares(cfs_rq);
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	3463	update_entity_load_avg(se, 1);
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	3464	}
				3465
Ben Segall	18bf280	2012-10-04 12:51:20 +0200	[diff] [blame]	3466	if (!se) {
				3467	update_rq_runnable_avg(rq, rq->nr_running);
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	3468	inc_nr_running(rq);
Ben Segall	18bf280	2012-10-04 12:51:20 +0200	[diff] [blame]	3469	}
Peter Zijlstra	a4c2f00	2008-10-17 19:27:03 +0200	[diff] [blame]	3470	hrtick_update(rq);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3471	}
				3472
Venkatesh Pallipadi	2f36825	2011-04-14 10:30:53 -0700	[diff] [blame]	3473	static void set_next_buddy(struct sched_entity *se);
				3474
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3475	/*
				3476	* The dequeue_task method is called before nr_running is
				3477	* decreased. We remove the task from the rbtree and
				3478	* update the fair scheduling stats:
				3479	*/
Peter Zijlstra	371fd7e	2010-03-24 16:38:48 +0100	[diff] [blame]	3480	static void dequeue_task_fair(struct rq rq, struct task_struct p, int flags)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3481	{
				3482	struct cfs_rq *cfs_rq;
Peter Zijlstra	62fb185	2008-02-25 17:34:02 +0100	[diff] [blame]	3483	struct sched_entity *se = &p->se;
Venkatesh Pallipadi	2f36825	2011-04-14 10:30:53 -0700	[diff] [blame]	3484	int task_sleep = flags & DEQUEUE_SLEEP;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3485
				3486	for_each_sched_entity(se) {
				3487	cfs_rq = cfs_rq_of(se);
Peter Zijlstra	371fd7e	2010-03-24 16:38:48 +0100	[diff] [blame]	3488	dequeue_entity(cfs_rq, se, flags);
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	3489
				3490	/*
				3491	* end evaluation on encountering a throttled cfs_rq
				3492	*
				3493	* note: in the case of encountering a throttled cfs_rq we will
				3494	* post the final h_nr_running decrement below.
				3495	*/
				3496	if (cfs_rq_throttled(cfs_rq))
				3497	break;
Paul Turner	953bfcd	2011-07-21 09:43:27 -0700	[diff] [blame]	3498	cfs_rq->h_nr_running--;
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	3499
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3500	/* Don't dequeue parent if it has other entities besides us */
Venkatesh Pallipadi	2f36825	2011-04-14 10:30:53 -0700	[diff] [blame]	3501	if (cfs_rq->load.weight) {
				3502	/*
				3503	* Bias pick_next to pick a task from this cfs_rq, as
				3504	* p is sleeping when it is within its sched_slice.
				3505	*/
				3506	if (task_sleep && parent_entity(se))
				3507	set_next_buddy(parent_entity(se));
Paul Turner	9598c82	2011-07-06 22:30:37 -0700	[diff] [blame]	3508
				3509	/* avoid re-evaluating load for this entity */
				3510	se = parent_entity(se);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3511	break;
Venkatesh Pallipadi	2f36825	2011-04-14 10:30:53 -0700	[diff] [blame]	3512	}
Peter Zijlstra	371fd7e	2010-03-24 16:38:48 +0100	[diff] [blame]	3513	flags \|= DEQUEUE_SLEEP;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3514	}
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	3515
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	3516	for_each_sched_entity(se) {
Lin Ming	0f31714	2011-07-22 09:14:31 +0800	[diff] [blame]	3517	cfs_rq = cfs_rq_of(se);
Paul Turner	953bfcd	2011-07-21 09:43:27 -0700	[diff] [blame]	3518	cfs_rq->h_nr_running--;
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	3519
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	3520	if (cfs_rq_throttled(cfs_rq))
				3521	break;
				3522
Linus Torvalds	17bc14b	2012-12-14 07:20:43 -0800	[diff] [blame]	3523	update_cfs_shares(cfs_rq);
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	3524	update_entity_load_avg(se, 1);
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	3525	}
				3526
Ben Segall	18bf280	2012-10-04 12:51:20 +0200	[diff] [blame]	3527	if (!se) {
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	3528	dec_nr_running(rq);
Ben Segall	18bf280	2012-10-04 12:51:20 +0200	[diff] [blame]	3529	update_rq_runnable_avg(rq, 1);
				3530	}
Peter Zijlstra	a4c2f00	2008-10-17 19:27:03 +0200	[diff] [blame]	3531	hrtick_update(rq);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3532	}
				3533
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	3534	#ifdef CONFIG_SMP
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	3535	/* Used instead of source_load when we know the type == 0 */
				3536	static unsigned long weighted_cpuload(const int cpu)
				3537	{
Alex Shi	b92486c	2013-06-20 10:18:50 +0800	[diff] [blame]	3538	return cpu_rq(cpu)->cfs.runnable_load_avg;
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	3539	}
				3540
				3541	/*
				3542	* Return a low guess at the load of a migration-source cpu weighted
				3543	* according to the scheduling class and "nice" value.
				3544	*
				3545	* We want to under-estimate the load of migration sources, to
				3546	* balance conservatively.
				3547	*/
				3548	static unsigned long source_load(int cpu, int type)
				3549	{
				3550	struct rq *rq = cpu_rq(cpu);
				3551	unsigned long total = weighted_cpuload(cpu);
				3552
				3553	if (type == 0 \|\| !sched_feat(LB_BIAS))
				3554	return total;
				3555
				3556	return min(rq->cpu_load[type-1], total);
				3557	}
				3558
				3559	/*
				3560	* Return a high guess at the load of a migration-target cpu weighted
				3561	* according to the scheduling class and "nice" value.
				3562	*/
				3563	static unsigned long target_load(int cpu, int type)
				3564	{
				3565	struct rq *rq = cpu_rq(cpu);
				3566	unsigned long total = weighted_cpuload(cpu);
				3567
				3568	if (type == 0 \|\| !sched_feat(LB_BIAS))
				3569	return total;
				3570
				3571	return max(rq->cpu_load[type-1], total);
				3572	}
				3573
				3574	static unsigned long power_of(int cpu)
				3575	{
				3576	return cpu_rq(cpu)->cpu_power;
				3577	}
				3578
				3579	static unsigned long cpu_avg_load_per_task(int cpu)
				3580	{
				3581	struct rq *rq = cpu_rq(cpu);
				3582	unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
Alex Shi	b92486c	2013-06-20 10:18:50 +0800	[diff] [blame]	3583	unsigned long load_avg = rq->cfs.runnable_load_avg;
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	3584
				3585	if (nr_running)
Alex Shi	b92486c	2013-06-20 10:18:50 +0800	[diff] [blame]	3586	return load_avg / nr_running;
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	3587
				3588	return 0;
				3589	}
				3590
Michael Wang	6247041	2013-07-04 12:55:51 +0800	[diff] [blame]	3591	static void record_wakee(struct task_struct *p)
				3592	{
				3593	/*
				3594	* Rough decay (wiping) for cost saving, don't worry
				3595	* about the boundary, really active task won't care
				3596	* about the loss.
				3597	*/
				3598	if (jiffies > current->wakee_flip_decay_ts + HZ) {
				3599	current->wakee_flips = 0;
				3600	current->wakee_flip_decay_ts = jiffies;
				3601	}
				3602
				3603	if (current->last_wakee != p) {
				3604	current->last_wakee = p;
				3605	current->wakee_flips++;
				3606	}
				3607	}
Ingo Molnar	098fb9d	2008-03-16 20:36:10 +0100	[diff] [blame]	3608
Peter Zijlstra	74f8e4b	2011-04-05 17:23:47 +0200	[diff] [blame]	3609	static void task_waking_fair(struct task_struct *p)
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	3610	{
				3611	struct sched_entity *se = &p->se;
				3612	struct cfs_rq *cfs_rq = cfs_rq_of(se);
Peter Zijlstra	3fe1698	2011-04-05 17:23:48 +0200	[diff] [blame]	3613	u64 min_vruntime;
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	3614
Peter Zijlstra	3fe1698	2011-04-05 17:23:48 +0200	[diff] [blame]	3615	#ifndef CONFIG_64BIT
				3616	u64 min_vruntime_copy;
Peter Zijlstra	74f8e4b	2011-04-05 17:23:47 +0200	[diff] [blame]	3617
Peter Zijlstra	3fe1698	2011-04-05 17:23:48 +0200	[diff] [blame]	3618	do {
				3619	min_vruntime_copy = cfs_rq->min_vruntime_copy;
				3620	smp_rmb();
				3621	min_vruntime = cfs_rq->min_vruntime;
				3622	} while (min_vruntime != min_vruntime_copy);
				3623	#else
				3624	min_vruntime = cfs_rq->min_vruntime;
				3625	#endif
				3626
				3627	se->vruntime -= min_vruntime;
Michael Wang	6247041	2013-07-04 12:55:51 +0800	[diff] [blame]	3628	record_wakee(p);
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	3629	}
				3630
Peter Zijlstra	bb3469a	2008-06-27 13:41:27 +0200	[diff] [blame]	3631	#ifdef CONFIG_FAIR_GROUP_SCHED
Peter Zijlstra	f5bfb7d	2008-06-27 13:41:39 +0200	[diff] [blame]	3632	/*
				3633	* effective_load() calculates the load change as seen from the root_task_group
				3634	*
				3635	* Adding load to a group doesn't make a group heavier, but can cause movement
				3636	* of group shares between cpus. Assuming the shares were perfectly aligned one
				3637	* can calculate the shift in shares.
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	3638	*
				3639	* Calculate the effective load difference if @wl is added (subtracted) to @tg
				3640	* on this @cpu and results in a total addition (subtraction) of @wg to the
				3641	* total group weight.
				3642	*
				3643	* Given a runqueue weight distribution (rw_i) we can compute a shares
				3644	* distribution (s_i) using:
				3645	*
				3646	* s_i = rw_i / \Sum rw_j (1)
				3647	*
				3648	* Suppose we have 4 CPUs and our @tg is a direct child of the root group and
				3649	* has 7 equal weight tasks, distributed as below (rw_i), with the resulting
				3650	* shares distribution (s_i):
				3651	*
				3652	* rw_i = { 2, 4, 1, 0 }
				3653	* s_i = { 2/7, 4/7, 1/7, 0 }
				3654	*
				3655	* As per wake_affine() we're interested in the load of two CPUs (the CPU the
				3656	* task used to run on and the CPU the waker is running on), we need to
				3657	* compute the effect of waking a task on either CPU and, in case of a sync
				3658	* wakeup, compute the effect of the current task going to sleep.
				3659	*
				3660	* So for a change of @wl to the local @cpu with an overall group weight change
				3661	* of @wl we can compute the new shares distribution (s'_i) using:
				3662	*
				3663	* s'_i = (rw_i + @wl) / (@wg + \Sum rw_j) (2)
				3664	*
				3665	* Suppose we're interested in CPUs 0 and 1, and want to compute the load
				3666	* differences in waking a task to CPU 0. The additional task changes the
				3667	* weight and shares distributions like:
				3668	*
				3669	* rw'_i = { 3, 4, 1, 0 }
				3670	* s'_i = { 3/8, 4/8, 1/8, 0 }
				3671	*
				3672	* We can then compute the difference in effective weight by using:
				3673	*
				3674	* dw_i = S * (s'_i - s_i) (3)
				3675	*
				3676	* Where 'S' is the group weight as seen by its parent.
				3677	*
				3678	* Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
				3679	* times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
				3680	* 4/7) times the weight of the group.
Peter Zijlstra	f5bfb7d	2008-06-27 13:41:39 +0200	[diff] [blame]	3681	*/
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	3682	static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
Peter Zijlstra	bb3469a	2008-06-27 13:41:27 +0200	[diff] [blame]	3683	{
Peter Zijlstra	4be9daa	2008-06-27 13:41:30 +0200	[diff] [blame]	3684	struct sched_entity *se = tg->se[cpu];
Peter Zijlstra	f1d239f	2008-06-27 13:41:38 +0200	[diff] [blame]	3685
Mel Gorman	58d081b	2013-10-07 11:29:10 +0100	[diff] [blame]	3686	if (!tg->parent \|\| !wl) /* the trivial, non-cgroup case */
Peter Zijlstra	f1d239f	2008-06-27 13:41:38 +0200	[diff] [blame]	3687	return wl;
				3688
Peter Zijlstra	4be9daa	2008-06-27 13:41:30 +0200	[diff] [blame]	3689	for_each_sched_entity(se) {
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	3690	long w, W;
Peter Zijlstra	bb3469a	2008-06-27 13:41:27 +0200	[diff] [blame]	3691
Paul Turner	977dda7	2011-01-14 17:57:50 -0800	[diff] [blame]	3692	tg = se->my_q->tg;
Peter Zijlstra	4be9daa	2008-06-27 13:41:30 +0200	[diff] [blame]	3693
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	3694	/*
				3695	* W = @wg + \Sum rw_j
				3696	*/
				3697	W = wg + calc_tg_weight(tg, se->my_q);
Peter Zijlstra	4be9daa	2008-06-27 13:41:30 +0200	[diff] [blame]	3698
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	3699	/*
				3700	* w = rw_i + @wl
				3701	*/
				3702	w = se->my_q->load.weight + wl;
Peter Zijlstra	940959e	2008-09-23 15:33:42 +0200	[diff] [blame]	3703
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	3704	/*
				3705	* wl = S * s'_i; see (2)
				3706	*/
				3707	if (W > 0 && w < W)
				3708	wl = (w * tg->shares) / W;
Paul Turner	977dda7	2011-01-14 17:57:50 -0800	[diff] [blame]	3709	else
				3710	wl = tg->shares;
Peter Zijlstra	940959e	2008-09-23 15:33:42 +0200	[diff] [blame]	3711
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	3712	/*
				3713	* Per the above, wl is the new se->load.weight value; since
				3714	* those are clipped to [MIN_SHARES, ...) do so now. See
				3715	* calc_cfs_shares().
				3716	*/
Paul Turner	977dda7	2011-01-14 17:57:50 -0800	[diff] [blame]	3717	if (wl < MIN_SHARES)
				3718	wl = MIN_SHARES;
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	3719
				3720	/*
				3721	* wl = dw_i = S * (s'_i - s_i); see (3)
				3722	*/
Paul Turner	977dda7	2011-01-14 17:57:50 -0800	[diff] [blame]	3723	wl -= se->load.weight;
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	3724
				3725	/*
				3726	* Recursively apply this logic to all parent groups to compute
				3727	* the final effective load change on the root group. Since
				3728	* only the @tg group gets extra weight, all parent groups can
				3729	* only redistribute existing shares. @wl is the shift in shares
				3730	* resulting from this level per the above.
				3731	*/
Peter Zijlstra	4be9daa	2008-06-27 13:41:30 +0200	[diff] [blame]	3732	wg = 0;
Peter Zijlstra	4be9daa	2008-06-27 13:41:30 +0200	[diff] [blame]	3733	}
				3734
				3735	return wl;
Peter Zijlstra	bb3469a	2008-06-27 13:41:27 +0200	[diff] [blame]	3736	}
				3737	#else
Peter Zijlstra	4be9daa	2008-06-27 13:41:30 +0200	[diff] [blame]	3738
/*
 * !CONFIG_FAIR_GROUP_SCHED: without task groups the load change is exactly
 * the weight delta @wl; @tg, @cpu and @wg are ignored.
 */
static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
{
	long delta = wl;

	return delta;
}
Peter Zijlstra	4be9daa	2008-06-27 13:41:30 +0200	[diff] [blame]	3743
Peter Zijlstra	bb3469a	2008-06-27 13:41:27 +0200	[diff] [blame]	3744	#endif
				3745
Michael Wang	6247041	2013-07-04 12:55:51 +0800	[diff] [blame]	3746	static int wake_wide(struct task_struct *p)
				3747	{
Peter Zijlstra	7d9ffa8	2013-07-04 12:56:46 +0800	[diff] [blame]	3748	int factor = this_cpu_read(sd_llc_size);
Michael Wang	6247041	2013-07-04 12:55:51 +0800	[diff] [blame]	3749
				3750	/*
				3751	* Yeah, it's the switching-frequency, could means many wakee or
				3752	* rapidly switch, use factor here will just help to automatically
				3753	* adjust the loose-degree, so bigger node will lead to more pull.
				3754	*/
				3755	if (p->wakee_flips > factor) {
				3756	/*
				3757	* wakee is somewhat hot, it needs certain amount of cpu
				3758	* resource, so if waker is far more hot, prefer to leave
				3759	* it alone.
				3760	*/
				3761	if (current->wakee_flips > (factor * p->wakee_flips))
				3762	return 1;
				3763	}
				3764
				3765	return 0;
				3766	}
				3767
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	3768	static int wake_affine(struct sched_domain sd, struct task_struct p, int sync)
Ingo Molnar	098fb9d	2008-03-16 20:36:10 +0100	[diff] [blame]	3769	{
Paul Turner	e37b6a7	2011-01-21 20:44:59 -0800	[diff] [blame]	3770	s64 this_load, load;
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	3771	int idx, this_cpu, prev_cpu;
Ingo Molnar	098fb9d	2008-03-16 20:36:10 +0100	[diff] [blame]	3772	unsigned long tl_per_task;
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	3773	struct task_group *tg;
Peter Zijlstra	8337826	2008-06-27 13:41:37 +0200	[diff] [blame]	3774	unsigned long weight;
Mike Galbraith	b3137bc	2008-05-29 11:11:41 +0200	[diff] [blame]	3775	int balanced;
Ingo Molnar	098fb9d	2008-03-16 20:36:10 +0100	[diff] [blame]	3776
Michael Wang	6247041	2013-07-04 12:55:51 +0800	[diff] [blame]	3777	/*
				3778	* If we wake multiple tasks be careful to not bounce
				3779	* ourselves around too much.
				3780	*/
				3781	if (wake_wide(p))
				3782	return 0;
				3783
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	3784	idx = sd->wake_idx;
				3785	this_cpu = smp_processor_id();
				3786	prev_cpu = task_cpu(p);
				3787	load = source_load(prev_cpu, idx);
				3788	this_load = target_load(this_cpu, idx);
Ingo Molnar	098fb9d	2008-03-16 20:36:10 +0100	[diff] [blame]	3789
				3790	/*
Ingo Molnar	098fb9d	2008-03-16 20:36:10 +0100	[diff] [blame]	3791	* If sync wakeup then subtract the (maximum possible)
				3792	* effect of the currently running task from the load
				3793	* of the current CPU:
				3794	*/
Peter Zijlstra	8337826	2008-06-27 13:41:37 +0200	[diff] [blame]	3795	if (sync) {
				3796	tg = task_group(current);
				3797	weight = current->se.load.weight;
Ingo Molnar	098fb9d	2008-03-16 20:36:10 +0100	[diff] [blame]	3798
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	3799	this_load += effective_load(tg, this_cpu, -weight, -weight);
Peter Zijlstra	8337826	2008-06-27 13:41:37 +0200	[diff] [blame]	3800	load += effective_load(tg, prev_cpu, 0, -weight);
				3801	}
				3802
				3803	tg = task_group(p);
				3804	weight = p->se.load.weight;
				3805
Peter Zijlstra	71a29aa	2009-09-07 18:28:05 +0200	[diff] [blame]	3806	/*
				3807	* In low-load situations, where prev_cpu is idle and this_cpu is idle
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	3808	* due to the sync cause above having dropped this_load to 0, we'll
				3809	* always have an imbalance, but there's really nothing you can do
				3810	* about that, so that's good too.
Peter Zijlstra	71a29aa	2009-09-07 18:28:05 +0200	[diff] [blame]	3811	*
				3812	* Otherwise check if either cpus are near enough in load to allow this
				3813	* task to be woken on this_cpu.
				3814	*/
Paul Turner	e37b6a7	2011-01-21 20:44:59 -0800	[diff] [blame]	3815	if (this_load > 0) {
				3816	s64 this_eff_load, prev_eff_load;
Peter Zijlstra	e51fd5e	2010-05-31 12:37:30 +0200	[diff] [blame]	3817
				3818	this_eff_load = 100;
				3819	this_eff_load *= power_of(prev_cpu);
				3820	this_eff_load *= this_load +
				3821	effective_load(tg, this_cpu, weight, weight);
				3822
				3823	prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
				3824	prev_eff_load *= power_of(this_cpu);
				3825	prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
				3826
				3827	balanced = this_eff_load <= prev_eff_load;
				3828	} else
				3829	balanced = true;
Mike Galbraith	b3137bc	2008-05-29 11:11:41 +0200	[diff] [blame]	3830
				3831	/*
				3832	* If the currently running task will sleep within
				3833	* a reasonable amount of time then attract this newly
				3834	* woken task:
				3835	*/
Peter Zijlstra	2fb7635	2008-10-08 09:16:04 +0200	[diff] [blame]	3836	if (sync && balanced)
				3837	return 1;
Mike Galbraith	b3137bc	2008-05-29 11:11:41 +0200	[diff] [blame]	3838
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	3839	schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
Mike Galbraith	b3137bc	2008-05-29 11:11:41 +0200	[diff] [blame]	3840	tl_per_task = cpu_avg_load_per_task(this_cpu);
				3841
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	3842	if (balanced \|\|
				3843	(this_load <= load &&
				3844	this_load + target_load(prev_cpu, idx) <= tl_per_task)) {
Ingo Molnar	098fb9d	2008-03-16 20:36:10 +0100	[diff] [blame]	3845	/*
				3846	* This domain has SD_WAKE_AFFINE and
				3847	* p is cache cold in this domain, and
				3848	* there is no bad imbalance.
				3849	*/
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	3850	schedstat_inc(sd, ttwu_move_affine);
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	3851	schedstat_inc(p, se.statistics.nr_wakeups_affine);
Ingo Molnar	098fb9d	2008-03-16 20:36:10 +0100	[diff] [blame]	3852
				3853	return 1;
				3854	}
				3855	return 0;
				3856	}
				3857
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3858	/*
				3859	* find_idlest_group finds and returns the least busy CPU group within the
				3860	* domain.
				3861	*/
				3862	static struct sched_group *
Peter Zijlstra	78e7ed5	2009-09-03 13:16:51 +0200	[diff] [blame]	3863	find_idlest_group(struct sched_domain sd, struct task_struct p,
Peter Zijlstra	5158f4e	2009-09-16 13:46:59 +0200	[diff] [blame]	3864	int this_cpu, int load_idx)
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	3865	{
Andi Kleen	b3bd3de	2010-08-10 14:17:51 -0700	[diff] [blame]	3866	struct sched_group idlest = NULL, group = sd->groups;
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3867	unsigned long min_load = ULONG_MAX, this_load = 0;
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3868	int imbalance = 100 + (sd->imbalance_pct-100)/2;
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	3869
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3870	do {
				3871	unsigned long load, avg_load;
				3872	int local_group;
				3873	int i;
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	3874
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3875	/* Skip over this group if it has no CPUs allowed */
				3876	if (!cpumask_intersects(sched_group_cpus(group),
Peter Zijlstra	fa17b50	2011-06-16 12:23:22 +0200	[diff] [blame]	3877	tsk_cpus_allowed(p)))
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3878	continue;
				3879
				3880	local_group = cpumask_test_cpu(this_cpu,
				3881	sched_group_cpus(group));
				3882
				3883	/* Tally up the load of all CPUs in the group */
				3884	avg_load = 0;
				3885
				3886	for_each_cpu(i, sched_group_cpus(group)) {
				3887	/* Bias balancing toward cpus of our domain */
				3888	if (local_group)
				3889	load = source_load(i, load_idx);
				3890	else
				3891	load = target_load(i, load_idx);
				3892
				3893	avg_load += load;
				3894	}
				3895
				3896	/* Adjust by relative CPU power of the group */
Peter Zijlstra	9c3f75c	2011-07-14 13:00:06 +0200	[diff] [blame]	3897	avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power;
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3898
				3899	if (local_group) {
				3900	this_load = avg_load;
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3901	} else if (avg_load < min_load) {
				3902	min_load = avg_load;
				3903	idlest = group;
				3904	}
				3905	} while (group = group->next, group != sd->groups);
				3906
				3907	if (!idlest \|\| 100this_load < imbalancemin_load)
				3908	return NULL;
				3909	return idlest;
				3910	}
				3911
				3912	/*
				3913	* find_idlest_cpu - find the idlest cpu among the cpus in group.
				3914	*/
				3915	static int
				3916	find_idlest_cpu(struct sched_group group, struct task_struct p, int this_cpu)
				3917	{
				3918	unsigned long load, min_load = ULONG_MAX;
				3919	int idlest = -1;
				3920	int i;
				3921
				3922	/* Traverse only the allowed CPUs */
Peter Zijlstra	fa17b50	2011-06-16 12:23:22 +0200	[diff] [blame]	3923	for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3924	load = weighted_cpuload(i);
				3925
				3926	if (load < min_load \|\| (load == min_load && i == this_cpu)) {
				3927	min_load = load;
				3928	idlest = i;
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	3929	}
				3930	}
				3931
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3932	return idlest;
				3933	}
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	3934
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3935	/*
Peter Zijlstra	a50bde5	2009-11-12 15:55:28 +0100	[diff] [blame]	3936	* Try and locate an idle CPU in the sched_domain.
				3937	*/
Suresh Siddha	99bd5e2	2010-03-31 16:47:45 -0700	[diff] [blame]	3938	static int select_idle_sibling(struct task_struct *p, int target)
Peter Zijlstra	a50bde5	2009-11-12 15:55:28 +0100	[diff] [blame]	3939	{
Suresh Siddha	99bd5e2	2010-03-31 16:47:45 -0700	[diff] [blame]	3940	struct sched_domain *sd;
Linus Torvalds	37407ea	2012-09-16 12:29:43 -0700	[diff] [blame]	3941	struct sched_group *sg;
Mike Galbraith	e0a79f5	2013-01-28 12:19:25 +0100	[diff] [blame]	3942	int i = task_cpu(p);
				3943
				3944	if (idle_cpu(target))
				3945	return target;
Peter Zijlstra	a50bde5	2009-11-12 15:55:28 +0100	[diff] [blame]	3946
				3947	/*
Mike Galbraith	e0a79f5	2013-01-28 12:19:25 +0100	[diff] [blame]	3948	* If the prevous cpu is cache affine and idle, don't be stupid.
Peter Zijlstra	a50bde5	2009-11-12 15:55:28 +0100	[diff] [blame]	3949	*/
Mike Galbraith	e0a79f5	2013-01-28 12:19:25 +0100	[diff] [blame]	3950	if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
				3951	return i;
Peter Zijlstra	a50bde5	2009-11-12 15:55:28 +0100	[diff] [blame]	3952
				3953	/*
Linus Torvalds	37407ea	2012-09-16 12:29:43 -0700	[diff] [blame]	3954	* Otherwise, iterate the domains and find an elegible idle cpu.
Peter Zijlstra	a50bde5	2009-11-12 15:55:28 +0100	[diff] [blame]	3955	*/
Peter Zijlstra	518cd62	2011-12-07 15:07:31 +0100	[diff] [blame]	3956	sd = rcu_dereference(per_cpu(sd_llc, target));
Suresh Siddha	77e8136	2011-11-17 11:08:23 -0800	[diff] [blame]	3957	for_each_lower_domain(sd) {
Linus Torvalds	37407ea	2012-09-16 12:29:43 -0700	[diff] [blame]	3958	sg = sd->groups;
				3959	do {
				3960	if (!cpumask_intersects(sched_group_cpus(sg),
				3961	tsk_cpus_allowed(p)))
				3962	goto next;
Mike Galbraith	970e178	2012-06-12 05:18:32 +0200	[diff] [blame]	3963
Linus Torvalds	37407ea	2012-09-16 12:29:43 -0700	[diff] [blame]	3964	for_each_cpu(i, sched_group_cpus(sg)) {
Mike Galbraith	e0a79f5	2013-01-28 12:19:25 +0100	[diff] [blame]	3965	if (i == target \|\| !idle_cpu(i))
Linus Torvalds	37407ea	2012-09-16 12:29:43 -0700	[diff] [blame]	3966	goto next;
				3967	}
				3968
				3969	target = cpumask_first_and(sched_group_cpus(sg),
				3970	tsk_cpus_allowed(p));
				3971	goto done;
				3972	next:
				3973	sg = sg->next;
				3974	} while (sg != sd->groups);
				3975	}
				3976	done:
Peter Zijlstra	a50bde5	2009-11-12 15:55:28 +0100	[diff] [blame]	3977	return target;
				3978	}
				3979
				3980	/*
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3981	* sched_balance_self: balance the current task (running on cpu) in domains
				3982	* that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
				3983	* SD_BALANCE_EXEC.
				3984	*
				3985	* Balance, ie. select the least loaded group.
				3986	*
				3987	* Returns the target CPU number, or the same CPU if no balancing is needed.
				3988	*
				3989	* preempt must be disabled.
				3990	*/
Peter Zijlstra	0017d73	2010-03-24 18:34:10 +0100	[diff] [blame]	3991	static int
Peter Zijlstra	ac66f54	2013-10-07 11:29:16 +0100	[diff] [blame]	3992	select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3993	{
Peter Zijlstra	29cd8ba	2009-09-17 09:01:14 +0200	[diff] [blame]	3994	struct sched_domain tmp, affine_sd = NULL, *sd = NULL;
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	3995	int cpu = smp_processor_id();
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	3996	int new_cpu = cpu;
Suresh Siddha	99bd5e2	2010-03-31 16:47:45 -0700	[diff] [blame]	3997	int want_affine = 0;
Peter Zijlstra	5158f4e	2009-09-16 13:46:59 +0200	[diff] [blame]	3998	int sync = wake_flags & WF_SYNC;
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	3999
Peter Zijlstra	29baa74	2012-04-23 12:11:21 +0200	[diff] [blame]	4000	if (p->nr_cpus_allowed == 1)
Mike Galbraith	76854c7	2011-11-22 15:18:24 +0100	[diff] [blame]	4001	return prev_cpu;
				4002
Peter Zijlstra	0763a66	2009-09-14 19:37:39 +0200	[diff] [blame]	4003	if (sd_flag & SD_BALANCE_WAKE) {
Peter Zijlstra	fa17b50	2011-06-16 12:23:22 +0200	[diff] [blame]	4004	if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	4005	want_affine = 1;
				4006	new_cpu = prev_cpu;
				4007	}
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	4008
Peter Zijlstra	dce840a	2011-04-07 14:09:50 +0200	[diff] [blame]	4009	rcu_read_lock();
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	4010	for_each_domain(cpu, tmp) {
Peter Zijlstra	e4f4288	2009-12-16 18:04:34 +0100	[diff] [blame]	4011	if (!(tmp->flags & SD_LOAD_BALANCE))
				4012	continue;
				4013
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	4014	/*
Suresh Siddha	99bd5e2	2010-03-31 16:47:45 -0700	[diff] [blame]	4015	* If both cpu and prev_cpu are part of this domain,
				4016	* cpu is a valid SD_WAKE_AFFINE target.
Peter Zijlstra	fe3bcfe	2009-11-12 15:55:29 +0100	[diff] [blame]	4017	*/
Suresh Siddha	99bd5e2	2010-03-31 16:47:45 -0700	[diff] [blame]	4018	if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
				4019	cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
				4020	affine_sd = tmp;
Alex Shi	f03542a	2012-07-26 08:55:34 +0800	[diff] [blame]	4021	break;
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	4022	}
				4023
Alex Shi	f03542a	2012-07-26 08:55:34 +0800	[diff] [blame]	4024	if (tmp->flags & sd_flag)
Peter Zijlstra	29cd8ba	2009-09-17 09:01:14 +0200	[diff] [blame]	4025	sd = tmp;
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	4026	}
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	4027
Mike Galbraith	8b911ac	2010-03-11 17:17:16 +0100	[diff] [blame]	4028	if (affine_sd) {
Alex Shi	f03542a	2012-07-26 08:55:34 +0800	[diff] [blame]	4029	if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
Peter Zijlstra	dce840a	2011-04-07 14:09:50 +0200	[diff] [blame]	4030	prev_cpu = cpu;
				4031
				4032	new_cpu = select_idle_sibling(p, prev_cpu);
				4033	goto unlock;
Mike Galbraith	8b911ac	2010-03-11 17:17:16 +0100	[diff] [blame]	4034	}
Peter Zijlstra	3b64089	2009-09-16 13:44:33 +0200	[diff] [blame]	4035
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	4036	while (sd) {
Peter Zijlstra	5158f4e	2009-09-16 13:46:59 +0200	[diff] [blame]	4037	int load_idx = sd->forkexec_idx;
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	4038	struct sched_group *group;
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	4039	int weight;
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	4040
Peter Zijlstra	0763a66	2009-09-14 19:37:39 +0200	[diff] [blame]	4041	if (!(sd->flags & sd_flag)) {
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	4042	sd = sd->child;
				4043	continue;
				4044	}
				4045
Peter Zijlstra	5158f4e	2009-09-16 13:46:59 +0200	[diff] [blame]	4046	if (sd_flag & SD_BALANCE_WAKE)
				4047	load_idx = sd->wake_idx;
				4048
				4049	group = find_idlest_group(sd, p, cpu, load_idx);
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	4050	if (!group) {
				4051	sd = sd->child;
				4052	continue;
				4053	}
				4054
Peter Zijlstra	d7c33c4	2009-09-11 12:45:38 +0200	[diff] [blame]	4055	new_cpu = find_idlest_cpu(group, p, cpu);
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	4056	if (new_cpu == -1 \|\| new_cpu == cpu) {
				4057	/* Now try balancing at a lower domain level of cpu */
				4058	sd = sd->child;
				4059	continue;
				4060	}
				4061
				4062	/* Now try balancing at a lower domain level of new_cpu */
				4063	cpu = new_cpu;
Peter Zijlstra	669c55e	2010-04-16 14:59:29 +0200	[diff] [blame]	4064	weight = sd->span_weight;
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	4065	sd = NULL;
				4066	for_each_domain(cpu, tmp) {
Peter Zijlstra	669c55e	2010-04-16 14:59:29 +0200	[diff] [blame]	4067	if (weight <= tmp->span_weight)
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	4068	break;
Peter Zijlstra	0763a66	2009-09-14 19:37:39 +0200	[diff] [blame]	4069	if (tmp->flags & sd_flag)
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	4070	sd = tmp;
				4071	}
				4072	/* while loop will break here if sd == NULL */
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	4073	}
Peter Zijlstra	dce840a	2011-04-07 14:09:50 +0200	[diff] [blame]	4074	unlock:
				4075	rcu_read_unlock();
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	4076
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	4077	return new_cpu;
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	4078	}
Paul Turner	0a74bef	2012-10-04 13:18:30 +0200	[diff] [blame]	4079
				4080	/*
				4081	* Called immediately before a task is migrated to a new cpu; task_cpu(p) and
				4082	* cfs_rq_of(p) references at time of call are still valid and identify the
				4083	* previous cpu. However, the caller only guarantees p->pi_lock is held; no
				4084	* other assumptions, including the state of rq->lock, should be made.
				4085	*/
				4086	static void
				4087	migrate_task_rq_fair(struct task_struct *p, int next_cpu)
				4088	{
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	4089	struct sched_entity *se = &p->se;
				4090	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				4091
				4092	/*
				4093	* Load tracking: accumulate removed load so that it can be processed
				4094	* when we next update owning cfs_rq under rq->lock. Tasks contribute
				4095	* to blocked load iff they have a positive decay-count. It can never
				4096	* be negative here since on-rq tasks have decay-count == 0.
				4097	*/
				4098	if (se->avg.decay_count) {
				4099	se->avg.decay_count = -__synchronize_entity_decay(se);
Alex Shi	2509940	2013-06-20 10:18:55 +0800	[diff] [blame]	4100	atomic_long_add(se->avg.load_avg_contrib,
				4101	&cfs_rq->removed_load);
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	4102	}
Paul Turner	0a74bef	2012-10-04 13:18:30 +0200	[diff] [blame]	4103	}
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	4104	#endif /* CONFIG_SMP */
				4105
Peter Zijlstra	e52fb7c	2009-01-14 12:39:19 +0100	[diff] [blame]	4106	static unsigned long
				4107	wakeup_gran(struct sched_entity curr, struct sched_entity se)
Peter Zijlstra	0bbd333	2008-04-19 19:44:57 +0200	[diff] [blame]	4108	{
				4109	unsigned long gran = sysctl_sched_wakeup_granularity;
				4110
				4111	/*
Peter Zijlstra	e52fb7c	2009-01-14 12:39:19 +0100	[diff] [blame]	4112	* Since its curr running now, convert the gran from real-time
				4113	* to virtual-time in his units.
Mike Galbraith	13814d4	2010-03-11 17:17:04 +0100	[diff] [blame]	4114	*
				4115	* By using 'se' instead of 'curr' we penalize light tasks, so
				4116	* they get preempted easier. That is, if 'se' < 'curr' then
				4117	* the resulting gran will be larger, therefore penalizing the
				4118	* lighter, if otoh 'se' > 'curr' then the resulting gran will
				4119	* be smaller, again penalizing the lighter task.
				4120	*
				4121	* This is especially important for buddies when the leftmost
				4122	* task is higher priority than the buddy.
Peter Zijlstra	0bbd333	2008-04-19 19:44:57 +0200	[diff] [blame]	4123	*/
Shaohua Li	f4ad9bd	2011-04-08 12:53:09 +0800	[diff] [blame]	4124	return calc_delta_fair(gran, se);
Peter Zijlstra	0bbd333	2008-04-19 19:44:57 +0200	[diff] [blame]	4125	}
				4126
				4127	/*
Peter Zijlstra	464b752	2008-10-24 11:06:15 +0200	[diff] [blame]	4128	* Should 'se' preempt 'curr'.
				4129	*
				4130	* \|s1
				4131	* \|s2
				4132	* \|s3
				4133	* g
				4134	* \|<--->\|c
				4135	*
				4136	* w(c, s1) = -1
				4137	* w(c, s2) = 0
				4138	* w(c, s3) = 1
				4139	*
				4140	*/
				4141	static int
				4142	wakeup_preempt_entity(struct sched_entity curr, struct sched_entity se)
				4143	{
				4144	s64 gran, vdiff = curr->vruntime - se->vruntime;
				4145
				4146	if (vdiff <= 0)
				4147	return -1;
				4148
Peter Zijlstra	e52fb7c	2009-01-14 12:39:19 +0100	[diff] [blame]	4149	gran = wakeup_gran(curr, se);
Peter Zijlstra	464b752	2008-10-24 11:06:15 +0200	[diff] [blame]	4150	if (vdiff > gran)
				4151	return 1;
				4152
				4153	return 0;
				4154	}
				4155
Peter Zijlstra	0247909	2008-11-04 21:25:10 +0100	[diff] [blame]	4156	static void set_last_buddy(struct sched_entity *se)
				4157	{
Venkatesh Pallipadi	69c80f3	2011-04-13 18:21:09 -0700	[diff] [blame]	4158	if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
				4159	return;
				4160
				4161	for_each_sched_entity(se)
				4162	cfs_rq_of(se)->last = se;
Peter Zijlstra	0247909	2008-11-04 21:25:10 +0100	[diff] [blame]	4163	}
				4164
				4165	static void set_next_buddy(struct sched_entity *se)
				4166	{
Venkatesh Pallipadi	69c80f3	2011-04-13 18:21:09 -0700	[diff] [blame]	4167	if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
				4168	return;
				4169
				4170	for_each_sched_entity(se)
				4171	cfs_rq_of(se)->next = se;
Peter Zijlstra	0247909	2008-11-04 21:25:10 +0100	[diff] [blame]	4172	}
				4173
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	4174	static void set_skip_buddy(struct sched_entity *se)
				4175	{
Venkatesh Pallipadi	69c80f3	2011-04-13 18:21:09 -0700	[diff] [blame]	4176	for_each_sched_entity(se)
				4177	cfs_rq_of(se)->skip = se;
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	4178	}
				4179
Peter Zijlstra	464b752	2008-10-24 11:06:15 +0200	[diff] [blame]	4180	/*
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	4181	* Preempt the current task with a newly woken task if needed:
				4182	*/
Peter Zijlstra	5a9b86f	2009-09-16 13:47:58 +0200	[diff] [blame]	4183	static void check_preempt_wakeup(struct rq rq, struct task_struct p, int wake_flags)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	4184	{
				4185	struct task_struct *curr = rq->curr;
Srivatsa Vaddagiri	8651a86	2007-10-15 17:00:12 +0200	[diff] [blame]	4186	struct sched_entity se = &curr->se, pse = &p->se;
Mike Galbraith	03e89e4	2008-12-16 08:45:30 +0100	[diff] [blame]	4187	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
Mike Galbraith	f685cea	2009-10-23 23:09:22 +0200	[diff] [blame]	4188	int scale = cfs_rq->nr_running >= sched_nr_latency;
Venkatesh Pallipadi	2f36825	2011-04-14 10:30:53 -0700	[diff] [blame]	4189	int next_buddy_marked = 0;
Mike Galbraith	03e89e4	2008-12-16 08:45:30 +0100	[diff] [blame]	4190
Ingo Molnar	4ae7d5c	2008-03-19 01:42:00 +0100	[diff] [blame]	4191	if (unlikely(se == pse))
				4192	return;
				4193
Paul Turner	5238cdd	2011-07-21 09:43:37 -0700	[diff] [blame]	4194	/*
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	4195	* This is possible from callers such as move_task(), in which we
Paul Turner	5238cdd	2011-07-21 09:43:37 -0700	[diff] [blame]	4196	* unconditionally check_prempt_curr() after an enqueue (which may have
				4197	* lead to a throttle). This both saves work and prevents false
				4198	* next-buddy nomination below.
				4199	*/
				4200	if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
				4201	return;
				4202
Venkatesh Pallipadi	2f36825	2011-04-14 10:30:53 -0700	[diff] [blame]	4203	if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
Mike Galbraith	3cb63d5	2009-09-11 12:01:17 +0200	[diff] [blame]	4204	set_next_buddy(pse);
Venkatesh Pallipadi	2f36825	2011-04-14 10:30:53 -0700	[diff] [blame]	4205	next_buddy_marked = 1;
				4206	}
Peter Zijlstra	57fdc26	2008-09-23 15:33:45 +0200	[diff] [blame]	4207
Bharata B Rao	aec0a51	2008-08-28 14:42:49 +0530	[diff] [blame]	4208	/*
				4209	* We can come here with TIF_NEED_RESCHED already set from new task
				4210	* wake up path.
Paul Turner	5238cdd	2011-07-21 09:43:37 -0700	[diff] [blame]	4211	*
				4212	* Note: this also catches the edge-case of curr being in a throttled
				4213	* group (e.g. via set_curr_task), since update_curr() (in the
				4214	* enqueue of curr) will have resulted in resched being set. This
				4215	* prevents us from potentially nominating it as a false LAST_BUDDY
				4216	* below.
Bharata B Rao	aec0a51	2008-08-28 14:42:49 +0530	[diff] [blame]	4217	*/
				4218	if (test_tsk_need_resched(curr))
				4219	return;
				4220
Darren Hart	a2f5c9a	2011-02-22 13:04:33 -0800	[diff] [blame]	4221	/* Idle tasks are by definition preempted by non-idle tasks. */
				4222	if (unlikely(curr->policy == SCHED_IDLE) &&
				4223	likely(p->policy != SCHED_IDLE))
				4224	goto preempt;
				4225
Ingo Molnar	91c234b	2007-10-15 17:00:18 +0200	[diff] [blame]	4226	/*
Darren Hart	a2f5c9a	2011-02-22 13:04:33 -0800	[diff] [blame]	4227	* Batch and idle tasks do not preempt non-idle tasks (their preemption
				4228	* is driven by the tick):
Ingo Molnar	91c234b	2007-10-15 17:00:18 +0200	[diff] [blame]	4229	*/
Ingo Molnar	8ed92e5	2012-10-14 14:28:50 +0200	[diff] [blame]	4230	if (unlikely(p->policy != SCHED_NORMAL) \|\| !sched_feat(WAKEUP_PREEMPTION))
Ingo Molnar	91c234b	2007-10-15 17:00:18 +0200	[diff] [blame]	4231	return;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	4232
Peter Zijlstra	3a7e73a	2009-11-28 18:51:02 +0100	[diff] [blame]	4233	find_matching_se(&se, &pse);
Paul Turner	9bbd737	2011-07-05 19:07:21 -0700	[diff] [blame]	4234	update_curr(cfs_rq_of(se));
Peter Zijlstra	3a7e73a	2009-11-28 18:51:02 +0100	[diff] [blame]	4235	BUG_ON(!pse);
Venkatesh Pallipadi	2f36825	2011-04-14 10:30:53 -0700	[diff] [blame]	4236	if (wakeup_preempt_entity(se, pse) == 1) {
				4237	/*
				4238	* Bias pick_next to pick the sched entity that is
				4239	* triggering this preemption.
				4240	*/
				4241	if (!next_buddy_marked)
				4242	set_next_buddy(pse);
Peter Zijlstra	3a7e73a	2009-11-28 18:51:02 +0100	[diff] [blame]	4243	goto preempt;
Venkatesh Pallipadi	2f36825	2011-04-14 10:30:53 -0700	[diff] [blame]	4244	}
Jupyung Lee	a65ac74	2009-11-17 18:51:40 +0900	[diff] [blame]	4245
Peter Zijlstra	3a7e73a	2009-11-28 18:51:02 +0100	[diff] [blame]	4246	return;
				4247
				4248	preempt:
				4249	resched_task(curr);
				4250	/*
				4251	* Only set the backward buddy when the current task is still
				4252	* on the rq. This can happen when a wakeup gets interleaved
				4253	* with schedule on the ->pre_schedule() or idle_balance()
				4254	* point, either of which can * drop the rq lock.
				4255	*
				4256	* Also, during early boot the idle thread is in the fair class,
				4257	* for obvious reasons its a bad idea to schedule back to it.
				4258	*/
				4259	if (unlikely(!se->on_rq \|\| curr == rq->idle))
				4260	return;
				4261
				4262	if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
				4263	set_last_buddy(se);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	4264	}
				4265
Ingo Molnar	fb8d472	2007-08-09 11:16:48 +0200	[diff] [blame]	4266	static struct task_struct pick_next_task_fair(struct rq rq)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	4267	{
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	4268	struct task_struct *p;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	4269	struct cfs_rq *cfs_rq = &rq->cfs;
				4270	struct sched_entity *se;
				4271
Tim Blechmann	36ace27	2009-11-24 11:55:45 +0100	[diff] [blame]	4272	if (!cfs_rq->nr_running)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	4273	return NULL;
				4274
				4275	do {
Ingo Molnar	9948f4b	2007-08-09 11:16:48 +0200	[diff] [blame]	4276	se = pick_next_entity(cfs_rq);
Peter Zijlstra	f4b6755	2008-11-04 21:25:07 +0100	[diff] [blame]	4277	set_next_entity(cfs_rq, se);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	4278	cfs_rq = group_cfs_rq(se);
				4279	} while (cfs_rq);
				4280
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	4281	p = task_of(se);
Mike Galbraith	b39e66e	2011-11-22 15:20:07 +0100	[diff] [blame]	4282	if (hrtick_enabled(rq))
				4283	hrtick_start_fair(rq, p);
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	4284
				4285	return p;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	4286	}
				4287
				4288	/*
				4289	* Account for a descheduled task:
				4290	*/
Ingo Molnar	31ee529	2007-08-09 11:16:49 +0200	[diff] [blame]	4291	static void put_prev_task_fair(struct rq rq, struct task_struct prev)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	4292	{
				4293	struct sched_entity *se = &prev->se;
				4294	struct cfs_rq *cfs_rq;
				4295
				4296	for_each_sched_entity(se) {
				4297	cfs_rq = cfs_rq_of(se);
Ingo Molnar	ab6cde2	2007-08-09 11:16:48 +0200	[diff] [blame]	4298	put_prev_entity(cfs_rq, se);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	4299	}
				4300	}
				4301
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	4302	/*
				4303	* sched_yield() is very simple
				4304	*
				4305	* The magic of dealing with the ->skip buddy is in pick_next_entity.
				4306	*/
				4307	static void yield_task_fair(struct rq *rq)
				4308	{
				4309	struct task_struct *curr = rq->curr;
				4310	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
				4311	struct sched_entity *se = &curr->se;
				4312
				4313	/*
				4314	* Are we the only task in the tree?
				4315	*/
				4316	if (unlikely(rq->nr_running == 1))
				4317	return;
				4318
				4319	clear_buddies(cfs_rq, se);
				4320
				4321	if (curr->policy != SCHED_BATCH) {
				4322	update_rq_clock(rq);
				4323	/*
				4324	* Update run-time statistics of the 'current'.
				4325	*/
				4326	update_curr(cfs_rq);
Mike Galbraith	916671c	2011-11-22 15:21:26 +0100	[diff] [blame]	4327	/*
				4328	* Tell update_rq_clock() that we've just updated,
				4329	* so we don't do microscopic update in schedule()
				4330	* and double the fastpath cost.
				4331	*/
				4332	rq->skip_clock_update = 1;
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	4333	}
				4334
				4335	set_skip_buddy(se);
				4336	}
				4337
Mike Galbraith	d95f412	2011-02-01 09:50:51 -0500	[diff] [blame]	4338	static bool yield_to_task_fair(struct rq rq, struct task_struct p, bool preempt)
				4339	{
				4340	struct sched_entity *se = &p->se;
				4341
Paul Turner	5238cdd	2011-07-21 09:43:37 -0700	[diff] [blame]	4342	/* throttled hierarchies are not runnable */
				4343	if (!se->on_rq \|\| throttled_hierarchy(cfs_rq_of(se)))
Mike Galbraith	d95f412	2011-02-01 09:50:51 -0500	[diff] [blame]	4344	return false;
				4345
				4346	/* Tell the scheduler that we'd really like pse to run next. */
				4347	set_next_buddy(se);
				4348
Mike Galbraith	d95f412	2011-02-01 09:50:51 -0500	[diff] [blame]	4349	yield_task_fair(rq);
				4350
				4351	return true;
				4352	}
				4353
Peter Williams	681f3e6	2007-10-24 18:23:51 +0200	[diff] [blame]	4354	#ifdef CONFIG_SMP
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	4355	/**************************************************
Peter Zijlstra	e9c84cb	2012-07-03 13:53:26 +0200	[diff] [blame]	4356	* Fair scheduling class load-balancing methods.
				4357	*
				4358	* BASICS
				4359	*
				4360	* The purpose of load-balancing is to achieve the same basic fairness the
				4361	* per-cpu scheduler provides, namely provide a proportional amount of compute
				4362	* time to each task. This is expressed in the following equation:
				4363	*
				4364	* W_i,n/P_i == W_j,n/P_j for all i,j (1)
				4365	*
				4366	* Where W_i,n is the n-th weight average for cpu i. The instantaneous weight
				4367	* W_i,0 is defined as:
				4368	*
				4369	* W_i,0 = \Sum_j w_i,j (2)
				4370	*
				4371	* Where w_i,j is the weight of the j-th runnable task on cpu i. This weight
				4372	* is derived from the nice value as per prio_to_weight[].
				4373	*
				4374	* The weight average is an exponential decay average of the instantaneous
				4375	* weight:
				4376	*
				4377	* W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3)
				4378	*
				4379	* P_i is the cpu power (or compute capacity) of cpu i, typically it is the
				4380	* fraction of 'recent' time available for SCHED_OTHER task execution. But it
				4381	* can also include other factors [XXX].
				4382	*
				4383	* To achieve this balance we define a measure of imbalance which follows
				4384	* directly from (1):
				4385	*
				4386	* imb_i,j = max{ avg(W/P), W_i/P_i } - min{ avg(W/P), W_j/P_j } (4)
				4387	*
				4388	* We then move tasks around to minimize the imbalance. In the continuous
				4389	* function space it is obvious this converges, in the discrete case we get
				4390	* a few fun cases generally called infeasible weight scenarios.
				4391	*
				4392	* [XXX expand on:
				4393	* - infeasible weights;
				4394	* - local vs global optima in the discrete case. ]
				4395	*
				4396	*
				4397	* SCHED DOMAINS
				4398	*
				4399	* In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
				4400	* for all i,j solution, we create a tree of cpus that follows the hardware
				4401	* topology where each level pairs two lower groups (or better). This results
				4402	* in O(log n) layers. Furthermore we reduce the number of cpus going up the
				4403	* tree to only the first of the previous level and we decrease the frequency
				4404	* of load-balance at each level inv. proportional to the number of cpus in
				4405	* the groups.
				4406	*
				4407	* This yields:
				4408	*
				4409	*     log_2 n     1     n
				4410	*   \Sum       { --- * --- * 2^i } = O(n)                            (5)
				4411	*     i = 0      2^i   2^i
				4412	*                               `- size of each group
				4413	*         |         |     `- number of cpus doing load-balance
				4414	*         |         `- freq
				4415	*         `- sum over all levels
				4416	*
				4417	* Coupled with a limit on how many tasks we can migrate every balance pass,
				4418	* this makes (5) the runtime complexity of the balancer.
				4419	*
				4420	* An important property here is that each CPU is still (indirectly) connected
				4421	* to every other cpu in at most O(log n) steps:
				4422	*
				4423	* The adjacency matrix of the resulting graph is given by:
				4424	*
				4425	*             log_2 n
				4426	*   A_i,j = \Union     (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1)  (6)
				4427	*             k = 0
				4428	*
				4429	* And you'll find that:
				4430	*
				4431	* A^(log_2 n)_i,j != 0 for all i,j (7)
				4432	*
				4433	* Showing there's indeed a path between every cpu in at most O(log n) steps.
				4434	* The task movement gives a factor of O(m), giving a convergence complexity
				4435	* of:
				4436	*
				4437	* O(nm log n), n := nr_cpus, m := nr_tasks (8)
				4438	*
				4439	*
				4440	* WORK CONSERVING
				4441	*
				4442	* In order to avoid CPUs going idle while there's still work to do, new idle
				4443	* balancing is more aggressive and has the newly idle cpu iterate up the domain
				4444	* tree itself instead of relying on other CPUs to bring it work.
				4445	*
				4446	* This adds some complexity to both (5) and (8) but it reduces the total idle
				4447	* time.
				4448	*
				4449	* [XXX more?]
				4450	*
				4451	*
				4452	* CGROUPS
				4453	*
				4454	* Cgroups make a horror show out of (2), instead of a simple sum we get:
				4455	*
				4456	*                                s_k,i
				4457	*   W_i,0 = \Sum_j \Prod_k w_k * -----                               (9)
				4458	*                                 S_k
				4459	*
				4460	* Where
				4461	*
				4462	* s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10)
				4463	*
				4464	* w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i.
				4465	*
				4466	* The big problem is S_k, it's a global sum needed to compute a local (W_i)
				4467	* property.
				4468	*
				4469	* [XXX write more on how we solve this.. _after_ merging pjt's patches that
				4470	* rewrite all of this once again.]
				4471	*/
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	4472
Hiroshi Shimamoto	ed387b7	2012-01-31 11:40:32 +0900	[diff] [blame]	4473	static unsigned long __read_mostly max_load_balance_interval = HZ/10;
				4474
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	4475	#define LBF_ALL_PINNED 0x01
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4476	#define LBF_NEED_BREAK 0x02
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	4477	#define LBF_DST_PINNED 0x04
				4478	#define LBF_SOME_PINNED 0x08
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	4479
				4480	struct lb_env {
				4481	struct sched_domain *sd;
				4482
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	4483	struct rq *src_rq;
Prashanth Nageshappa	85c1e7d	2012-06-19 17:47:34 +0530	[diff] [blame]	4484	int src_cpu;
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	4485
				4486	int dst_cpu;
				4487	struct rq *dst_rq;
				4488
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	4489	struct cpumask *dst_grpmask;
				4490	int new_dst_cpu;
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	4491	enum cpu_idle_type idle;
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4492	long imbalance;
Michael Wang	b940313	2012-07-12 16:10:13 +0800	[diff] [blame]	4493	/* The set of CPUs under consideration for load-balancing */
				4494	struct cpumask *cpus;
				4495
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	4496	unsigned int flags;
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4497
				4498	unsigned int loop;
				4499	unsigned int loop_break;
				4500	unsigned int loop_max;
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	4501	};
				4502
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4503	/*
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	4504	* move_task - move a task from one runqueue to another runqueue.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4505	* Both runqueues must be locked.
				4506	*/
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	4507	static void move_task(struct task_struct p, struct lb_env env)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4508	{
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	4509	deactivate_task(env->src_rq, p, 0);
				4510	set_task_cpu(p, env->dst_cpu);
				4511	activate_task(env->dst_rq, p, 0);
				4512	check_preempt_curr(env->dst_rq, p, 0);
Rik van Riel	6fe6b2d	2013-10-07 11:29:08 +0100	[diff] [blame]	4513	#ifdef CONFIG_NUMA_BALANCING
				4514	if (p->numa_preferred_nid != -1) {
				4515	int src_nid = cpu_to_node(env->src_cpu);
				4516	int dst_nid = cpu_to_node(env->dst_cpu);
				4517
				4518	/*
				4519	* If the load balancer has moved the task then limit
				4520	* migrations from taking place in the short term in
				4521	* case this is a short-lived migration.
				4522	*/
				4523	if (src_nid != dst_nid && dst_nid != p->numa_preferred_nid)
				4524	p->numa_migrate_seq = 0;
				4525	}
				4526	#endif
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4527	}
				4528
				4529	/*
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	4530	* Is this task likely cache-hot:
				4531	*/
				4532	static int
				4533	task_hot(struct task_struct p, u64 now, struct sched_domain sd)
				4534	{
				4535	s64 delta;
				4536
				4537	if (p->sched_class != &fair_sched_class)
				4538	return 0;
				4539
				4540	if (unlikely(p->policy == SCHED_IDLE))
				4541	return 0;
				4542
				4543	/*
				4544	* Buddy candidates are cache hot:
				4545	*/
				4546	if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
				4547	(&p->se == cfs_rq_of(&p->se)->next \|\|
				4548	&p->se == cfs_rq_of(&p->se)->last))
				4549	return 1;
				4550
				4551	if (sysctl_sched_migration_cost == -1)
				4552	return 1;
				4553	if (sysctl_sched_migration_cost == 0)
				4554	return 0;
				4555
				4556	delta = now - p->se.exec_start;
				4557
				4558	return delta < (s64)sysctl_sched_migration_cost;
				4559	}
				4560
Mel Gorman	3a7053b	2013-10-07 11:29:00 +0100	[diff] [blame]	4561	#ifdef CONFIG_NUMA_BALANCING
				4562	/* Returns true if the destination node has incurred more faults */
				4563	static bool migrate_improves_locality(struct task_struct p, struct lb_env env)
				4564	{
				4565	int src_nid, dst_nid;
				4566
				4567	if (!sched_feat(NUMA_FAVOUR_HIGHER) \|\| !p->numa_faults \|\|
				4568	!(env->sd->flags & SD_NUMA)) {
				4569	return false;
				4570	}
				4571
				4572	src_nid = cpu_to_node(env->src_cpu);
				4573	dst_nid = cpu_to_node(env->dst_cpu);
				4574
				4575	if (src_nid == dst_nid \|\|
				4576	p->numa_migrate_seq >= sysctl_numa_balancing_settle_count)
				4577	return false;
				4578
				4579	if (dst_nid == p->numa_preferred_nid \|\|
Mel Gorman	ac8e895	2013-10-07 11:29:03 +0100	[diff] [blame]	4580	task_faults(p, dst_nid) > task_faults(p, src_nid))
Mel Gorman	3a7053b	2013-10-07 11:29:00 +0100	[diff] [blame]	4581	return true;
				4582
				4583	return false;
				4584	}
Mel Gorman	7a0f308	2013-10-07 11:29:01 +0100	[diff] [blame]	4585
				4586
				4587	static bool migrate_degrades_locality(struct task_struct p, struct lb_env env)
				4588	{
				4589	int src_nid, dst_nid;
				4590
				4591	if (!sched_feat(NUMA) \|\| !sched_feat(NUMA_RESIST_LOWER))
				4592	return false;
				4593
				4594	if (!p->numa_faults \|\| !(env->sd->flags & SD_NUMA))
				4595	return false;
				4596
				4597	src_nid = cpu_to_node(env->src_cpu);
				4598	dst_nid = cpu_to_node(env->dst_cpu);
				4599
				4600	if (src_nid == dst_nid \|\|
				4601	p->numa_migrate_seq >= sysctl_numa_balancing_settle_count)
				4602	return false;
				4603
Mel Gorman	ac8e895	2013-10-07 11:29:03 +0100	[diff] [blame]	4604	if (task_faults(p, dst_nid) < task_faults(p, src_nid))
Mel Gorman	7a0f308	2013-10-07 11:29:01 +0100	[diff] [blame]	4605	return true;
				4606
				4607	return false;
				4608	}
				4609
Mel Gorman	3a7053b	2013-10-07 11:29:00 +0100	[diff] [blame]	4610	#else
				4611	static inline bool migrate_improves_locality(struct task_struct *p,
				4612	struct lb_env *env)
				4613	{
				4614	return false;
				4615	}
Mel Gorman	7a0f308	2013-10-07 11:29:01 +0100	[diff] [blame]	4616
				4617	static inline bool migrate_degrades_locality(struct task_struct *p,
				4618	struct lb_env *env)
				4619	{
				4620	return false;
				4621	}
Mel Gorman	3a7053b	2013-10-07 11:29:00 +0100	[diff] [blame]	4622	#endif
				4623
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	4624	/*
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4625	* can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
				4626	*/
				4627	static
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	4628	int can_migrate_task(struct task_struct p, struct lb_env env)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4629	{
				4630	int tsk_cache_hot = 0;
				4631	/*
				4632	* We do not migrate tasks that are:
Joonsoo Kim	d319808	2013-04-23 17:27:40 +0900	[diff] [blame]	4633	* 1) throttled_lb_pair, or
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4634	* 2) cannot be migrated to this CPU due to cpus_allowed, or
Joonsoo Kim	d319808	2013-04-23 17:27:40 +0900	[diff] [blame]	4635	* 3) running (obviously), or
				4636	* 4) are cache-hot on their current CPU.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4637	*/
Joonsoo Kim	d319808	2013-04-23 17:27:40 +0900	[diff] [blame]	4638	if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
				4639	return 0;
				4640
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	4641	if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
Joonsoo Kim	e02e60c	2013-04-23 17:27:42 +0900	[diff] [blame]	4642	int cpu;
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	4643
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	4644	schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	4645
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	4646	env->flags \|= LBF_SOME_PINNED;
				4647
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	4648	/*
				4649	* Remember if this task can be migrated to any other cpu in
				4650	* our sched_group. We may want to revisit it if we couldn't
				4651	* meet load balance goals by pulling other tasks on src_cpu.
				4652	*
				4653	* Also avoid computing new_dst_cpu if we have already computed
				4654	* one in current iteration.
				4655	*/
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	4656	if (!env->dst_grpmask \|\| (env->flags & LBF_DST_PINNED))
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	4657	return 0;
				4658
Joonsoo Kim	e02e60c	2013-04-23 17:27:42 +0900	[diff] [blame]	4659	/* Prevent to re-select dst_cpu via env's cpus */
				4660	for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
				4661	if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	4662	env->flags \|= LBF_DST_PINNED;
Joonsoo Kim	e02e60c	2013-04-23 17:27:42 +0900	[diff] [blame]	4663	env->new_dst_cpu = cpu;
				4664	break;
				4665	}
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	4666	}
Joonsoo Kim	e02e60c	2013-04-23 17:27:42 +0900	[diff] [blame]	4667
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4668	return 0;
				4669	}
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	4670
				4671	/* Record that we found atleast one task that could run on dst_cpu */
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	4672	env->flags &= ~LBF_ALL_PINNED;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4673
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	4674	if (task_running(env->src_rq, p)) {
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	4675	schedstat_inc(p, se.statistics.nr_failed_migrations_running);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4676	return 0;
				4677	}
				4678
				4679	/*
				4680	* Aggressive migration if:
Mel Gorman	3a7053b	2013-10-07 11:29:00 +0100	[diff] [blame]	4681	* 1) destination numa is preferred
				4682	* 2) task is cache cold, or
				4683	* 3) too many balance attempts have failed.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4684	*/
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	4685	tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd);
Mel Gorman	7a0f308	2013-10-07 11:29:01 +0100	[diff] [blame]	4686	if (!tsk_cache_hot)
				4687	tsk_cache_hot = migrate_degrades_locality(p, env);
Mel Gorman	3a7053b	2013-10-07 11:29:00 +0100	[diff] [blame]	4688
				4689	if (migrate_improves_locality(p, env)) {
				4690	#ifdef CONFIG_SCHEDSTATS
				4691	if (tsk_cache_hot) {
				4692	schedstat_inc(env->sd, lb_hot_gained[env->idle]);
				4693	schedstat_inc(p, se.statistics.nr_forced_migrations);
				4694	}
				4695	#endif
				4696	return 1;
				4697	}
				4698
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4699	if (!tsk_cache_hot \|\|
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	4700	env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
Zhang Hang	4e2dcb7	2013-04-10 14:04:55 +0800	[diff] [blame]	4701
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4702	if (tsk_cache_hot) {
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	4703	schedstat_inc(env->sd, lb_hot_gained[env->idle]);
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	4704	schedstat_inc(p, se.statistics.nr_forced_migrations);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4705	}
Zhang Hang	4e2dcb7	2013-04-10 14:04:55 +0800	[diff] [blame]	4706
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4707	return 1;
				4708	}
				4709
Zhang Hang	4e2dcb7	2013-04-10 14:04:55 +0800	[diff] [blame]	4710	schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
				4711	return 0;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4712	}
				4713
Peter Zijlstra	897c395	2009-12-17 17:45:42 +0100	[diff] [blame]	4714	/*
				4715	* move_one_task tries to move exactly one task from busiest to this_rq, as
				4716	* part of active balancing operations within "domain".
				4717	* Returns 1 if successful and 0 otherwise.
				4718	*
				4719	* Called with both runqueues locked.
				4720	*/
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	4721	static int move_one_task(struct lb_env *env)
Peter Zijlstra	897c395	2009-12-17 17:45:42 +0100	[diff] [blame]	4722	{
				4723	struct task_struct p, n;
Peter Zijlstra	897c395	2009-12-17 17:45:42 +0100	[diff] [blame]	4724
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4725	list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4726	if (!can_migrate_task(p, env))
				4727	continue;
Peter Zijlstra	897c395	2009-12-17 17:45:42 +0100	[diff] [blame]	4728
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4729	move_task(p, env);
				4730	/*
				4731	* Right now, this is only the second place move_task()
				4732	* is called, so we can safely collect move_task()
				4733	* stats here rather than inside move_task().
				4734	*/
				4735	schedstat_inc(env->sd, lb_gained[env->idle]);
				4736	return 1;
Peter Zijlstra	897c395	2009-12-17 17:45:42 +0100	[diff] [blame]	4737	}
Peter Zijlstra	897c395	2009-12-17 17:45:42 +0100	[diff] [blame]	4738	return 0;
				4739	}
				4740
Peter Zijlstra	eb95308	2012-04-17 13:38:40 +0200	[diff] [blame]	4741	static const unsigned int sched_nr_migrate_break = 32;
				4742
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	4743	/*
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4744	* move_tasks tries to move up to imbalance weighted load from busiest to
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	4745	* this_rq, as part of a balancing operation within domain "sd".
				4746	* Returns 1 if successful and 0 otherwise.
				4747	*
				4748	* Called with both runqueues locked.
				4749	*/
				4750	static int move_tasks(struct lb_env *env)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4751	{
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	4752	struct list_head *tasks = &env->src_rq->cfs_tasks;
				4753	struct task_struct *p;
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4754	unsigned long load;
				4755	int pulled = 0;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4756
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4757	if (env->imbalance <= 0)
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	4758	return 0;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4759
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	4760	while (!list_empty(tasks)) {
				4761	p = list_first_entry(tasks, struct task_struct, se.group_node);
				4762
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4763	env->loop++;
				4764	/* We've more or less seen every task there is, call it quits */
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	4765	if (env->loop > env->loop_max)
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4766	break;
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	4767
				4768	/* take a breather every nr_migrate tasks */
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4769	if (env->loop > env->loop_break) {
Peter Zijlstra	eb95308	2012-04-17 13:38:40 +0200	[diff] [blame]	4770	env->loop_break += sched_nr_migrate_break;
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	4771	env->flags \|= LBF_NEED_BREAK;
Peter Zijlstra	ee00e66	2009-12-17 17:25:20 +0100	[diff] [blame]	4772	break;
Peter Zijlstra	a195f00	2011-09-22 15:30:18 +0200	[diff] [blame]	4773	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4774
Joonsoo Kim	d319808	2013-04-23 17:27:40 +0900	[diff] [blame]	4775	if (!can_migrate_task(p, env))
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4776	goto next;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4777
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4778	load = task_h_load(p);
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	4779
Peter Zijlstra	eb95308	2012-04-17 13:38:40 +0200	[diff] [blame]	4780	if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4781	goto next;
				4782
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4783	if ((load / 2) > env->imbalance)
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4784	goto next;
				4785
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	4786	move_task(p, env);
Peter Zijlstra	ee00e66	2009-12-17 17:25:20 +0100	[diff] [blame]	4787	pulled++;
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4788	env->imbalance -= load;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4789
				4790	#ifdef CONFIG_PREEMPT
Peter Zijlstra	ee00e66	2009-12-17 17:25:20 +0100	[diff] [blame]	4791	/*
				4792	* NEWIDLE balancing is a source of latency, so preemptible
				4793	* kernels will stop after the first task is pulled to minimize
				4794	* the critical section.
				4795	*/
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	4796	if (env->idle == CPU_NEWLY_IDLE)
Peter Zijlstra	ee00e66	2009-12-17 17:25:20 +0100	[diff] [blame]	4797	break;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4798	#endif
				4799
Peter Zijlstra	ee00e66	2009-12-17 17:25:20 +0100	[diff] [blame]	4800	/*
				4801	* We only want to steal up to the prescribed amount of
				4802	* weighted load.
				4803	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4804	if (env->imbalance <= 0)
Peter Zijlstra	ee00e66	2009-12-17 17:25:20 +0100	[diff] [blame]	4805	break;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4806
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4807	continue;
				4808	next:
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	4809	list_move_tail(&p->se.group_node, tasks);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4810	}
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	4811
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4812	/*
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	4813	* Right now, this is one of only two places move_task() is called,
				4814	* so we can safely collect move_task() stats here rather than
				4815	* inside move_task().
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4816	*/
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	4817	schedstat_add(env->sd, lb_gained[env->idle], pulled);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4818
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	4819	return pulled;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4820	}
				4821
Peter Zijlstra	230059de	2009-12-17 17:47:12 +0100	[diff] [blame]	4822	#ifdef CONFIG_FAIR_GROUP_SCHED
Peter Zijlstra	9e3081c	2010-11-15 15:47:02 -0800	[diff] [blame]	4823	/*
				4824	* update tg->load_weight by folding this cpu's load_avg
				4825	*/
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	4826	static void __update_blocked_averages_cpu(struct task_group *tg, int cpu)
Peter Zijlstra	9e3081c	2010-11-15 15:47:02 -0800	[diff] [blame]	4827	{
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	4828	struct sched_entity *se = tg->se[cpu];
				4829	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
Peter Zijlstra	9e3081c	2010-11-15 15:47:02 -0800	[diff] [blame]	4830
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	4831	/* throttled entities do not contribute to load */
				4832	if (throttled_hierarchy(cfs_rq))
				4833	return;
Peter Zijlstra	9e3081c	2010-11-15 15:47:02 -0800	[diff] [blame]	4834
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	4835	update_cfs_rq_blocked_load(cfs_rq, 1);
Peter Zijlstra	9e3081c	2010-11-15 15:47:02 -0800	[diff] [blame]	4836
Paul Turner	8295836	2012-10-04 13:18:31 +0200	[diff] [blame]	4837	if (se) {
				4838	update_entity_load_avg(se, 1);
				4839	/*
				4840	* We pivot on our runnable average having decayed to zero for
				4841	* list removal. This generally implies that all our children
				4842	* have also been removed (modulo rounding error or bandwidth
				4843	* control); however, such cases are rare and we can fix these
				4844	* at enqueue.
				4845	*
				4846	* TODO: fix up out-of-order children on enqueue.
				4847	*/
				4848	if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running)
				4849	list_del_leaf_cfs_rq(cfs_rq);
				4850	} else {
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	4851	struct rq *rq = rq_of(cfs_rq);
Paul Turner	8295836	2012-10-04 13:18:31 +0200	[diff] [blame]	4852	update_rq_runnable_avg(rq, rq->nr_running);
				4853	}
Peter Zijlstra	9e3081c	2010-11-15 15:47:02 -0800	[diff] [blame]	4854	}
				4855
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	4856	static void update_blocked_averages(int cpu)
Peter Zijlstra	9e3081c	2010-11-15 15:47:02 -0800	[diff] [blame]	4857	{
Peter Zijlstra	9e3081c	2010-11-15 15:47:02 -0800	[diff] [blame]	4858	struct rq *rq = cpu_rq(cpu);
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	4859	struct cfs_rq *cfs_rq;
				4860	unsigned long flags;
Peter Zijlstra	9e3081c	2010-11-15 15:47:02 -0800	[diff] [blame]	4861
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	4862	raw_spin_lock_irqsave(&rq->lock, flags);
				4863	update_rq_clock(rq);
Peter Zijlstra	9763b67	2011-07-13 13:09:25 +0200	[diff] [blame]	4864	/*
				4865	* Iterates the task_group tree in a bottom up fashion, see
				4866	* list_add_leaf_cfs_rq() for details.
				4867	*/
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	4868	for_each_leaf_cfs_rq(rq, cfs_rq) {
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	4869	/*
				4870	* Note: We may want to consider periodically releasing
				4871	* rq->lock about these updates so that creating many task
				4872	* groups does not result in continually extending hold time.
				4873	*/
				4874	__update_blocked_averages_cpu(cfs_rq->tg, rq->cpu);
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	4875	}
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	4876
				4877	raw_spin_unlock_irqrestore(&rq->lock, flags);
Peter Zijlstra	9e3081c	2010-11-15 15:47:02 -0800	[diff] [blame]	4878	}
				4879
Peter Zijlstra	9763b67	2011-07-13 13:09:25 +0200	[diff] [blame]	4880	/*
Vladimir Davydov	6852079	2013-07-15 17:49:19 +0400	[diff] [blame]	4881	* Compute the hierarchical load factor for cfs_rq and all its ascendants.
Peter Zijlstra	9763b67	2011-07-13 13:09:25 +0200	[diff] [blame]	4882	* This needs to be done in a top-down fashion because the load of a child
				4883	* group is a fraction of its parents load.
				4884	*/
Vladimir Davydov	6852079	2013-07-15 17:49:19 +0400	[diff] [blame]	4885	static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
Peter Zijlstra	9763b67	2011-07-13 13:09:25 +0200	[diff] [blame]	4886	{
Vladimir Davydov	6852079	2013-07-15 17:49:19 +0400	[diff] [blame]	4887	struct rq *rq = rq_of(cfs_rq);
				4888	struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
Peter Zijlstra	a35b646	2012-08-08 21:46:40 +0200	[diff] [blame]	4889	unsigned long now = jiffies;
Vladimir Davydov	6852079	2013-07-15 17:49:19 +0400	[diff] [blame]	4890	unsigned long load;
Peter Zijlstra	a35b646	2012-08-08 21:46:40 +0200	[diff] [blame]	4891
Vladimir Davydov	6852079	2013-07-15 17:49:19 +0400	[diff] [blame]	4892	if (cfs_rq->last_h_load_update == now)
Peter Zijlstra	a35b646	2012-08-08 21:46:40 +0200	[diff] [blame]	4893	return;
				4894
Vladimir Davydov	6852079	2013-07-15 17:49:19 +0400	[diff] [blame]	4895	cfs_rq->h_load_next = NULL;
				4896	for_each_sched_entity(se) {
				4897	cfs_rq = cfs_rq_of(se);
				4898	cfs_rq->h_load_next = se;
				4899	if (cfs_rq->last_h_load_update == now)
				4900	break;
				4901	}
Peter Zijlstra	a35b646	2012-08-08 21:46:40 +0200	[diff] [blame]	4902
Vladimir Davydov	6852079	2013-07-15 17:49:19 +0400	[diff] [blame]	4903	if (!se) {
Vladimir Davydov	7e3115e	2013-09-14 19:39:46 +0400	[diff] [blame]	4904	cfs_rq->h_load = cfs_rq->runnable_load_avg;
Vladimir Davydov	6852079	2013-07-15 17:49:19 +0400	[diff] [blame]	4905	cfs_rq->last_h_load_update = now;
				4906	}
				4907
				4908	while ((se = cfs_rq->h_load_next) != NULL) {
				4909	load = cfs_rq->h_load;
				4910	load = div64_ul(load * se->avg.load_avg_contrib,
				4911	cfs_rq->runnable_load_avg + 1);
				4912	cfs_rq = group_cfs_rq(se);
				4913	cfs_rq->h_load = load;
				4914	cfs_rq->last_h_load_update = now;
				4915	}
Peter Zijlstra	9763b67	2011-07-13 13:09:25 +0200	[diff] [blame]	4916	}
				4917
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4918	static unsigned long task_h_load(struct task_struct *p)
Peter Zijlstra	230059de	2009-12-17 17:47:12 +0100	[diff] [blame]	4919	{
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4920	struct cfs_rq *cfs_rq = task_cfs_rq(p);
Peter Zijlstra	230059de	2009-12-17 17:47:12 +0100	[diff] [blame]	4921
Vladimir Davydov	6852079	2013-07-15 17:49:19 +0400	[diff] [blame]	4922	update_cfs_rq_h_load(cfs_rq);
Alex Shi	a003a25	2013-06-20 10:18:51 +0800	[diff] [blame]	4923	return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load,
				4924	cfs_rq->runnable_load_avg + 1);
Peter Zijlstra	230059de	2009-12-17 17:47:12 +0100	[diff] [blame]	4925	}
				4926	#else
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	4927	static inline void update_blocked_averages(int cpu)
Peter Zijlstra	9e3081c	2010-11-15 15:47:02 -0800	[diff] [blame]	4928	{
				4929	}
				4930
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4931	static unsigned long task_h_load(struct task_struct *p)
				4932	{
Alex Shi	a003a25	2013-06-20 10:18:51 +0800	[diff] [blame]	4933	return p->se.avg.load_avg_contrib;
Peter Zijlstra	230059de	2009-12-17 17:47:12 +0100	[diff] [blame]	4934	}
				4935	#endif
				4936
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4937	/******** Helpers for find_busiest_group **********************/
				4938	/*
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4939	* sg_lb_stats - stats of a sched_group required for load_balancing
				4940	*/
				4941	struct sg_lb_stats {
				4942	unsigned long avg_load; /Avg load across the CPUs of the group /
				4943	unsigned long group_load; /* Total load over the CPUs of the group */
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4944	unsigned long sum_weighted_load; /* Weighted load of group's tasks */
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	4945	unsigned long load_per_task;
Peter Zijlstra	3ae11c9	2013-08-15 20:37:48 +0200	[diff] [blame]	4946	unsigned long group_power;
Peter Zijlstra	147c5fc	2013-08-19 15:22:57 +0200	[diff] [blame]	4947	unsigned int sum_nr_running; /* Nr tasks running in the group */
				4948	unsigned int group_capacity;
				4949	unsigned int idle_cpus;
				4950	unsigned int group_weight;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4951	int group_imb; /* Is there an imbalance in the group ? */
Nikhil Rao	fab4762	2010-10-15 13:12:29 -0700	[diff] [blame]	4952	int group_has_capacity; /* Is there extra capacity in the group? */
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4953	};
				4954
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	4955	/*
				4956	* sd_lb_stats - Structure to store the statistics of a sched_domain
				4957	* during load balancing.
				4958	*/
				4959	struct sd_lb_stats {
				4960	struct sched_group busiest; / Busiest group in this sd */
				4961	struct sched_group local; / Local group in this sd */
				4962	unsigned long total_load; /* Total load of all groups in sd */
				4963	unsigned long total_pwr; /* Total power of all groups in sd */
				4964	unsigned long avg_load; /* Average load across all groups in sd */
				4965
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	4966	struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
Peter Zijlstra	147c5fc	2013-08-19 15:22:57 +0200	[diff] [blame]	4967	struct sg_lb_stats local_stat; /* Statistics of the local group */
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	4968	};
				4969
Peter Zijlstra	147c5fc	2013-08-19 15:22:57 +0200	[diff] [blame]	4970	static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
				4971	{
				4972	/*
				4973	* Skimp on the clearing to avoid duplicate work. We can avoid clearing
				4974	* local_stat because update_sg_lb_stats() does a full clear/assignment.
				4975	* We must however clear busiest_stat::avg_load because
				4976	* update_sd_pick_busiest() reads this before assignment.
				4977	*/
				4978	*sds = (struct sd_lb_stats){
				4979	.busiest = NULL,
				4980	.local = NULL,
				4981	.total_load = 0UL,
				4982	.total_pwr = 0UL,
				4983	.busiest_stat = {
				4984	.avg_load = 0UL,
				4985	},
				4986	};
				4987	}
				4988
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4989	/**
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4990	* get_sd_load_idx - Obtain the load index for a given sched domain.
				4991	* @sd: The sched_domain whose load_idx is to be obtained.
				4992	* @idle: The Idle status of the CPU for whose sd load_icx is obtained.
Yacine Belkadi	e69f618	2013-07-12 20:45:47 +0200	[diff] [blame]	4993	*
				4994	* Return: The load index.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4995	*/
				4996	static inline int get_sd_load_idx(struct sched_domain *sd,
				4997	enum cpu_idle_type idle)
				4998	{
				4999	int load_idx;
				5000
				5001	switch (idle) {
				5002	case CPU_NOT_IDLE:
				5003	load_idx = sd->busy_idx;
				5004	break;
				5005
				5006	case CPU_NEWLY_IDLE:
				5007	load_idx = sd->newidle_idx;
				5008	break;
				5009	default:
				5010	load_idx = sd->idle_idx;
				5011	break;
				5012	}
				5013
				5014	return load_idx;
				5015	}
				5016
Li Zefan	15f803c	2013-03-05 16:07:11 +0800	[diff] [blame]	5017	static unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5018	{
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	5019	return SCHED_POWER_SCALE;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5020	}
				5021
				5022	unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
				5023	{
				5024	return default_scale_freq_power(sd, cpu);
				5025	}
				5026
Li Zefan	15f803c	2013-03-05 16:07:11 +0800	[diff] [blame]	5027	static unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5028	{
Peter Zijlstra	669c55e	2010-04-16 14:59:29 +0200	[diff] [blame]	5029	unsigned long weight = sd->span_weight;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5030	unsigned long smt_gain = sd->smt_gain;
				5031
				5032	smt_gain /= weight;
				5033
				5034	return smt_gain;
				5035	}
				5036
				5037	unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
				5038	{
				5039	return default_scale_smt_power(sd, cpu);
				5040	}
				5041
Li Zefan	15f803c	2013-03-05 16:07:11 +0800	[diff] [blame]	5042	static unsigned long scale_rt_power(int cpu)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5043	{
				5044	struct rq *rq = cpu_rq(cpu);
Peter Zijlstra	b654f7d	2012-05-22 14:04:28 +0200	[diff] [blame]	5045	u64 total, available, age_stamp, avg;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5046
Peter Zijlstra	b654f7d	2012-05-22 14:04:28 +0200	[diff] [blame]	5047	/*
				5048	* Since we're reading these variables without serialization make sure
				5049	* we read them once before doing sanity checks on them.
				5050	*/
				5051	age_stamp = ACCESS_ONCE(rq->age_stamp);
				5052	avg = ACCESS_ONCE(rq->rt_avg);
Venkatesh Pallipadi	aa48380	2010-10-04 17:03:22 -0700	[diff] [blame]	5053
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	5054	total = sched_avg_period() + (rq_clock(rq) - age_stamp);
Peter Zijlstra	b654f7d	2012-05-22 14:04:28 +0200	[diff] [blame]	5055
				5056	if (unlikely(total < avg)) {
Venkatesh Pallipadi	aa48380	2010-10-04 17:03:22 -0700	[diff] [blame]	5057	/* Ensures that power won't end up being negative */
				5058	available = 0;
				5059	} else {
Peter Zijlstra	b654f7d	2012-05-22 14:04:28 +0200	[diff] [blame]	5060	available = total - avg;
Venkatesh Pallipadi	aa48380	2010-10-04 17:03:22 -0700	[diff] [blame]	5061	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5062
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	5063	if (unlikely((s64)total < SCHED_POWER_SCALE))
				5064	total = SCHED_POWER_SCALE;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5065
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	5066	total >>= SCHED_POWER_SHIFT;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5067
				5068	return div_u64(available, total);
				5069	}
				5070
				5071	static void update_cpu_power(struct sched_domain *sd, int cpu)
				5072	{
Peter Zijlstra	669c55e	2010-04-16 14:59:29 +0200	[diff] [blame]	5073	unsigned long weight = sd->span_weight;
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	5074	unsigned long power = SCHED_POWER_SCALE;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5075	struct sched_group *sdg = sd->groups;
				5076
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5077	if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
				5078	if (sched_feat(ARCH_POWER))
				5079	power *= arch_scale_smt_power(sd, cpu);
				5080	else
				5081	power *= default_scale_smt_power(sd, cpu);
				5082
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	5083	power >>= SCHED_POWER_SHIFT;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5084	}
				5085
Peter Zijlstra	9c3f75c	2011-07-14 13:00:06 +0200	[diff] [blame]	5086	sdg->sgp->power_orig = power;
Srivatsa Vaddagiri	9d5efe0	2010-06-08 14:57:02 +1000	[diff] [blame]	5087
				5088	if (sched_feat(ARCH_POWER))
				5089	power *= arch_scale_freq_power(sd, cpu);
				5090	else
				5091	power *= default_scale_freq_power(sd, cpu);
				5092
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	5093	power >>= SCHED_POWER_SHIFT;
Srivatsa Vaddagiri	9d5efe0	2010-06-08 14:57:02 +1000	[diff] [blame]	5094
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5095	power *= scale_rt_power(cpu);
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	5096	power >>= SCHED_POWER_SHIFT;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5097
				5098	if (!power)
				5099	power = 1;
				5100
Peter Zijlstra	e51fd5e	2010-05-31 12:37:30 +0200	[diff] [blame]	5101	cpu_rq(cpu)->cpu_power = power;
Peter Zijlstra	9c3f75c	2011-07-14 13:00:06 +0200	[diff] [blame]	5102	sdg->sgp->power = power;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5103	}
				5104
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	5105	void update_group_power(struct sched_domain *sd, int cpu)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5106	{
				5107	struct sched_domain *child = sd->child;
				5108	struct sched_group group, sdg = sd->groups;
Peter Zijlstra	863bffc	2013-08-28 11:44:39 +0200	[diff] [blame]	5109	unsigned long power, power_orig;
Vincent Guittot	4ec4412	2011-12-12 20:21:08 +0100	[diff] [blame]	5110	unsigned long interval;
				5111
				5112	interval = msecs_to_jiffies(sd->balance_interval);
				5113	interval = clamp(interval, 1UL, max_load_balance_interval);
				5114	sdg->sgp->next_update = jiffies + interval;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5115
				5116	if (!child) {
				5117	update_cpu_power(sd, cpu);
				5118	return;
				5119	}
				5120
Peter Zijlstra	863bffc	2013-08-28 11:44:39 +0200	[diff] [blame]	5121	power_orig = power = 0;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5122
Peter Zijlstra	74a5ce2	2012-05-23 18:00:43 +0200	[diff] [blame]	5123	if (child->flags & SD_OVERLAP) {
				5124	/*
				5125	* SD_OVERLAP domains cannot assume that child groups
				5126	* span the current group.
				5127	*/
				5128
Peter Zijlstra	863bffc	2013-08-28 11:44:39 +0200	[diff] [blame]	5129	for_each_cpu(cpu, sched_group_cpus(sdg)) {
				5130	struct sched_group *sg = cpu_rq(cpu)->sd->groups;
				5131
				5132	power_orig += sg->sgp->power_orig;
				5133	power += sg->sgp->power;
				5134	}
Peter Zijlstra	74a5ce2	2012-05-23 18:00:43 +0200	[diff] [blame]	5135	} else {
				5136	/*
				5137	* !SD_OVERLAP domains can assume that child groups
				5138	* span the current group.
				5139	*/
				5140
				5141	group = child->groups;
				5142	do {
Peter Zijlstra	863bffc	2013-08-28 11:44:39 +0200	[diff] [blame]	5143	power_orig += group->sgp->power_orig;
Peter Zijlstra	74a5ce2	2012-05-23 18:00:43 +0200	[diff] [blame]	5144	power += group->sgp->power;
				5145	group = group->next;
				5146	} while (group != child->groups);
				5147	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5148
Peter Zijlstra	863bffc	2013-08-28 11:44:39 +0200	[diff] [blame]	5149	sdg->sgp->power_orig = power_orig;
				5150	sdg->sgp->power = power;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5151	}
				5152
Srivatsa Vaddagiri	9d5efe0	2010-06-08 14:57:02 +1000	[diff] [blame]	5153	/*
				5154	* Try and fix up capacity for tiny siblings, this is needed when
				5155	* things like SD_ASYM_PACKING need f_b_g to select another sibling
				5156	* which on its own isn't powerful enough.
				5157	*
				5158	* See update_sd_pick_busiest() and check_asym_packing().
				5159	*/
				5160	static inline int
				5161	fix_small_capacity(struct sched_domain sd, struct sched_group group)
				5162	{
				5163	/*
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	5164	* Only siblings can have significantly less than SCHED_POWER_SCALE
Srivatsa Vaddagiri	9d5efe0	2010-06-08 14:57:02 +1000	[diff] [blame]	5165	*/
Peter Zijlstra	a6c75f2	2011-04-07 14:09:52 +0200	[diff] [blame]	5166	if (!(sd->flags & SD_SHARE_CPUPOWER))
Srivatsa Vaddagiri	9d5efe0	2010-06-08 14:57:02 +1000	[diff] [blame]	5167	return 0;
				5168
				5169	/*
				5170	* If ~90% of the cpu_power is still there, we're good.
				5171	*/
Peter Zijlstra	9c3f75c	2011-07-14 13:00:06 +0200	[diff] [blame]	5172	if (group->sgp->power * 32 > group->sgp->power_orig * 29)
Srivatsa Vaddagiri	9d5efe0	2010-06-08 14:57:02 +1000	[diff] [blame]	5173	return 1;
				5174
				5175	return 0;
				5176	}
				5177
Peter Zijlstra	30ce5da	2013-08-15 20:29:29 +0200	[diff] [blame]	5178	/*
				5179	* Group imbalance indicates (and tries to solve) the problem where balancing
				5180	* groups is inadequate due to tsk_cpus_allowed() constraints.
				5181	*
				5182	* Imagine a situation of two groups of 4 cpus each and 4 tasks each with a
				5183	* cpumask covering 1 cpu of the first group and 3 cpus of the second group.
				5184	* Something like:
				5185	*
				5186	* { 0 1 2 3 } { 4 5 6 7 }
				5187	* * * * *
				5188	*
				5189	* If we were to balance group-wise we'd place two tasks in the first group and
				5190	* two tasks in the second group. Clearly this is undesired as it will overload
				5191	* cpu 3 and leave one of the cpus in the second group unused.
				5192	*
				5193	* The current solution to this issue is detecting the skew in the first group
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	5194	* by noticing the lower domain failed to reach balance and had difficulty
				5195	* moving tasks due to affinity constraints.
Peter Zijlstra	30ce5da	2013-08-15 20:29:29 +0200	[diff] [blame]	5196	*
				5197	* When this is so detected; this group becomes a candidate for busiest; see
				5198	* update_sd_pick_busiest(). And calculcate_imbalance() and
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	5199	* find_busiest_group() avoid some of the usual balance conditions to allow it
Peter Zijlstra	30ce5da	2013-08-15 20:29:29 +0200	[diff] [blame]	5200	* to create an effective group imbalance.
				5201	*
				5202	* This is a somewhat tricky proposition since the next run might not find the
				5203	* group imbalance and decide the groups need to be balanced again. A most
				5204	* subtle and fragile situation.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5205	*/
Peter Zijlstra	30ce5da	2013-08-15 20:29:29 +0200	[diff] [blame]	5206
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	5207	static inline int sg_imbalanced(struct sched_group *group)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5208	{
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	5209	return group->sgp->imbalance;
Peter Zijlstra	30ce5da	2013-08-15 20:29:29 +0200	[diff] [blame]	5210	}
				5211
Peter Zijlstra	b37d931	2013-08-28 11:50:34 +0200	[diff] [blame]	5212	/*
				5213	* Compute the group capacity.
				5214	*
Peter Zijlstra	c61037e	2013-08-28 12:40:38 +0200	[diff] [blame]	5215	* Avoid the issue where N*frac(smt_power) >= 1 creates 'phantom' cores by
				5216	* first dividing out the smt factor and computing the actual number of cores
				5217	* and limit power unit capacity with that.
Peter Zijlstra	b37d931	2013-08-28 11:50:34 +0200	[diff] [blame]	5218	*/
				5219	static inline int sg_capacity(struct lb_env env, struct sched_group group)
				5220	{
Peter Zijlstra	c61037e	2013-08-28 12:40:38 +0200	[diff] [blame]	5221	unsigned int capacity, smt, cpus;
				5222	unsigned int power, power_orig;
Peter Zijlstra	b37d931	2013-08-28 11:50:34 +0200	[diff] [blame]	5223
Peter Zijlstra	c61037e	2013-08-28 12:40:38 +0200	[diff] [blame]	5224	power = group->sgp->power;
				5225	power_orig = group->sgp->power_orig;
				5226	cpus = group->group_weight;
Peter Zijlstra	b37d931	2013-08-28 11:50:34 +0200	[diff] [blame]	5227
Peter Zijlstra	c61037e	2013-08-28 12:40:38 +0200	[diff] [blame]	5228	/* smt := ceil(cpus / power), assumes: 1 < smt_power < 2 */
				5229	smt = DIV_ROUND_UP(SCHED_POWER_SCALE * cpus, power_orig);
				5230	capacity = cpus / smt; /* cores */
				5231
				5232	capacity = min_t(unsigned, capacity, DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE));
Peter Zijlstra	b37d931	2013-08-28 11:50:34 +0200	[diff] [blame]	5233	if (!capacity)
				5234	capacity = fix_small_capacity(env->sd, group);
				5235
				5236	return capacity;
				5237	}
				5238
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5239	/**
				5240	* update_sg_lb_stats - Update sched_group's statistics for load balancing.
				5241	* @env: The load balancing environment.
				5242	* @group: sched_group whose statistics are to be updated.
				5243	* @load_idx: Load index of sched_domain of this_cpu for load calc.
				5244	* @local_group: Does group contain this_cpu.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5245	* @sgs: variable to hold the statistics for this group.
				5246	*/
				5247	static inline void update_sg_lb_stats(struct lb_env *env,
				5248	struct sched_group *group, int load_idx,
Joonsoo Kim	23f0d20	2013-08-06 17:36:42 +0900	[diff] [blame]	5249	int local_group, struct sg_lb_stats *sgs)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5250	{
Peter Zijlstra	30ce5da	2013-08-15 20:29:29 +0200	[diff] [blame]	5251	unsigned long nr_running;
				5252	unsigned long load;
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	5253	int i;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5254
Peter Zijlstra	b72ff13	2013-08-28 10:32:32 +0200	[diff] [blame]	5255	memset(sgs, 0, sizeof(*sgs));
				5256
Michael Wang	b940313	2012-07-12 16:10:13 +0800	[diff] [blame]	5257	for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5258	struct rq *rq = cpu_rq(i);
				5259
Peter Zijlstra	e44bc5c	2012-05-11 00:22:12 +0200	[diff] [blame]	5260	nr_running = rq->nr_running;
				5261
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5262	/* Bias balancing toward cpus of our domain */
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	5263	if (local_group)
Peter Zijlstra	04f733b	2012-05-11 00:12:02 +0200	[diff] [blame]	5264	load = target_load(i, load_idx);
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	5265	else
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5266	load = source_load(i, load_idx);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5267
				5268	sgs->group_load += load;
Peter Zijlstra	e44bc5c	2012-05-11 00:22:12 +0200	[diff] [blame]	5269	sgs->sum_nr_running += nr_running;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5270	sgs->sum_weighted_load += weighted_cpuload(i);
Suresh Siddha	aae6d3d	2010-09-17 15:02:32 -0700	[diff] [blame]	5271	if (idle_cpu(i))
				5272	sgs->idle_cpus++;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5273	}
				5274
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5275	/* Adjust by relative CPU power of the group */
Peter Zijlstra	3ae11c9	2013-08-15 20:37:48 +0200	[diff] [blame]	5276	sgs->group_power = group->sgp->power;
				5277	sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5278
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	5279	if (sgs->sum_nr_running)
Peter Zijlstra	38d0f77	2013-08-15 19:47:56 +0200	[diff] [blame]	5280	sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5281
Suresh Siddha	aae6d3d	2010-09-17 15:02:32 -0700	[diff] [blame]	5282	sgs->group_weight = group->group_weight;
Nikhil Rao	fab4762	2010-10-15 13:12:29 -0700	[diff] [blame]	5283
Peter Zijlstra	b37d931	2013-08-28 11:50:34 +0200	[diff] [blame]	5284	sgs->group_imb = sg_imbalanced(group);
				5285	sgs->group_capacity = sg_capacity(env, group);
				5286
Nikhil Rao	fab4762	2010-10-15 13:12:29 -0700	[diff] [blame]	5287	if (sgs->group_capacity > sgs->sum_nr_running)
				5288	sgs->group_has_capacity = 1;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5289	}
				5290
				5291	/**
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	5292	* update_sd_pick_busiest - return 1 on busiest group
Randy Dunlap	cd96891	2012-06-08 13:18:33 -0700	[diff] [blame]	5293	* @env: The load balancing environment.
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	5294	* @sds: sched_domain statistics
				5295	* @sg: sched_group candidate to be checked for being the busiest
Michael Neuling	b6b1229	2010-06-10 12:06:21 +1000	[diff] [blame]	5296	* @sgs: sched_group statistics
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	5297	*
				5298	* Determine if @sg is a busier group than the previously selected
				5299	* busiest group.
Yacine Belkadi	e69f618	2013-07-12 20:45:47 +0200	[diff] [blame]	5300	*
				5301	* Return: %true if @sg is a busier group than the previously selected
				5302	* busiest group. %false otherwise.
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	5303	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	5304	static bool update_sd_pick_busiest(struct lb_env *env,
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	5305	struct sd_lb_stats *sds,
				5306	struct sched_group *sg,
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	5307	struct sg_lb_stats *sgs)
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	5308	{
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	5309	if (sgs->avg_load <= sds->busiest_stat.avg_load)
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	5310	return false;
				5311
				5312	if (sgs->sum_nr_running > sgs->group_capacity)
				5313	return true;
				5314
				5315	if (sgs->group_imb)
				5316	return true;
				5317
				5318	/*
				5319	* ASYM_PACKING needs to move all the work to the lowest
				5320	* numbered CPUs in the group, therefore mark all groups
				5321	* higher than ourself as busy.
				5322	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	5323	if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
				5324	env->dst_cpu < group_first_cpu(sg)) {
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	5325	if (!sds->busiest)
				5326	return true;
				5327
				5328	if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
				5329	return true;
				5330	}
				5331
				5332	return false;
				5333	}
				5334
				5335	/**
Hui Kang	461819a	2011-10-11 23:00:59 -0400	[diff] [blame]	5336	* update_sd_lb_stats - Update sched_domain's statistics for load balancing.
Randy Dunlap	cd96891	2012-06-08 13:18:33 -0700	[diff] [blame]	5337	* @env: The load balancing environment.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5338	* @balance: Should we balance.
				5339	* @sds: variable to hold the statistics for this sched_domain.
				5340	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	5341	static inline void update_sd_lb_stats(struct lb_env *env,
Joonsoo Kim	23f0d20	2013-08-06 17:36:42 +0900	[diff] [blame]	5342	struct sd_lb_stats *sds)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5343	{
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	5344	struct sched_domain *child = env->sd->child;
				5345	struct sched_group *sg = env->sd->groups;
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	5346	struct sg_lb_stats tmp_sgs;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5347	int load_idx, prefer_sibling = 0;
				5348
				5349	if (child && child->flags & SD_PREFER_SIBLING)
				5350	prefer_sibling = 1;
				5351
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	5352	load_idx = get_sd_load_idx(env->sd, env->idle);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5353
				5354	do {
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	5355	struct sg_lb_stats *sgs = &tmp_sgs;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5356	int local_group;
				5357
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	5358	local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	5359	if (local_group) {
				5360	sds->local = sg;
				5361	sgs = &sds->local_stat;
Peter Zijlstra	b72ff13	2013-08-28 10:32:32 +0200	[diff] [blame]	5362
				5363	if (env->idle != CPU_NEWLY_IDLE \|\|
				5364	time_after_eq(jiffies, sg->sgp->next_update))
				5365	update_group_power(env->sd, env->dst_cpu);
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	5366	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5367
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	5368	update_sg_lb_stats(env, sg, load_idx, local_group, sgs);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5369
Peter Zijlstra	b72ff13	2013-08-28 10:32:32 +0200	[diff] [blame]	5370	if (local_group)
				5371	goto next_group;
				5372
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5373	/*
				5374	* In case the child domain prefers tasks go to siblings
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	5375	* first, lower the sg capacity to one so that we'll try
Nikhil Rao	75dd321	2010-10-15 13:12:30 -0700	[diff] [blame]	5376	* and move all the excess tasks away. We lower the capacity
				5377	* of a group only if the local group has the capacity to fit
				5378	* these excess tasks, i.e. nr_running < group_capacity. The
				5379	* extra check prevents the case where you always pull from the
				5380	* heaviest group when it is already under-utilized (possible
				5381	* with a large weight task outweighs the tasks on the system).
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5382	*/
Peter Zijlstra	b72ff13	2013-08-28 10:32:32 +0200	[diff] [blame]	5383	if (prefer_sibling && sds->local &&
				5384	sds->local_stat.group_has_capacity)
Peter Zijlstra	147c5fc	2013-08-19 15:22:57 +0200	[diff] [blame]	5385	sgs->group_capacity = min(sgs->group_capacity, 1U);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5386
Peter Zijlstra	b72ff13	2013-08-28 10:32:32 +0200	[diff] [blame]	5387	if (update_sd_pick_busiest(env, sds, sg, sgs)) {
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	5388	sds->busiest = sg;
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	5389	sds->busiest_stat = *sgs;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5390	}
				5391
Peter Zijlstra	b72ff13	2013-08-28 10:32:32 +0200	[diff] [blame]	5392	next_group:
				5393	/* Now, start updating sd_lb_stats */
				5394	sds->total_load += sgs->group_load;
				5395	sds->total_pwr += sgs->group_power;
				5396
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	5397	sg = sg->next;
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	5398	} while (sg != env->sd->groups);
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	5399	}
				5400
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	5401	/**
				5402	* check_asym_packing - Check to see if the group is packed into the
				5403	* sched doman.
				5404	*
				5405	* This is primarily intended to used at the sibling level. Some
				5406	* cores like POWER7 prefer to use lower numbered SMT threads. In the
				5407	* case of POWER7, it can move to lower SMT modes only when higher
				5408	* threads are idle. When in lower SMT modes, the threads will
				5409	* perform better since they share less core resources. Hence when we
				5410	* have idle threads, we want them to be the higher ones.
				5411	*
				5412	* This packing function is run on idle threads. It checks to see if
				5413	* the busiest CPU in this domain (core in the P7 case) has a higher
				5414	* CPU number than the packing function is being run on. Here we are
				5415	* assuming lower CPU number will be equivalent to lower a SMT thread
				5416	* number.
				5417	*
Yacine Belkadi	e69f618	2013-07-12 20:45:47 +0200	[diff] [blame]	5418	* Return: 1 when packing is required and a task should be moved to
Michael Neuling	b6b1229	2010-06-10 12:06:21 +1000	[diff] [blame]	5419	* this CPU. The amount of the imbalance is returned in *imbalance.
				5420	*
Randy Dunlap	cd96891	2012-06-08 13:18:33 -0700	[diff] [blame]	5421	* @env: The load balancing environment.
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	5422	* @sds: Statistics of the sched_domain which is to be packed
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	5423	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	5424	static int check_asym_packing(struct lb_env env, struct sd_lb_stats sds)
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	5425	{
				5426	int busiest_cpu;
				5427
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	5428	if (!(env->sd->flags & SD_ASYM_PACKING))
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	5429	return 0;
				5430
				5431	if (!sds->busiest)
				5432	return 0;
				5433
				5434	busiest_cpu = group_first_cpu(sds->busiest);
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	5435	if (env->dst_cpu > busiest_cpu)
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	5436	return 0;
				5437
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	5438	env->imbalance = DIV_ROUND_CLOSEST(
Peter Zijlstra	3ae11c9	2013-08-15 20:37:48 +0200	[diff] [blame]	5439	sds->busiest_stat.avg_load * sds->busiest_stat.group_power,
				5440	SCHED_POWER_SCALE);
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	5441
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	5442	return 1;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5443	}
				5444
				5445	/**
				5446	* fix_small_imbalance - Calculate the minor imbalance that exists
				5447	* amongst the groups of a sched_domain, during
				5448	* load balancing.
Randy Dunlap	cd96891	2012-06-08 13:18:33 -0700	[diff] [blame]	5449	* @env: The load balancing environment.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5450	* @sds: Statistics of the sched_domain whose imbalance is to be calculated.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5451	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	5452	static inline
				5453	void fix_small_imbalance(struct lb_env env, struct sd_lb_stats sds)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5454	{
				5455	unsigned long tmp, pwr_now = 0, pwr_move = 0;
				5456	unsigned int imbn = 2;
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	5457	unsigned long scaled_busy_load_per_task;
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	5458	struct sg_lb_stats local, busiest;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5459
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	5460	local = &sds->local_stat;
				5461	busiest = &sds->busiest_stat;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5462
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	5463	if (!local->sum_nr_running)
				5464	local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);
				5465	else if (busiest->load_per_task > local->load_per_task)
				5466	imbn = 1;
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	5467
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	5468	scaled_busy_load_per_task =
				5469	(busiest->load_per_task * SCHED_POWER_SCALE) /
Peter Zijlstra	3ae11c9	2013-08-15 20:37:48 +0200	[diff] [blame]	5470	busiest->group_power;
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	5471
Vladimir Davydov	3029ede	2013-09-15 17:49:14 +0400	[diff] [blame]	5472	if (busiest->avg_load + scaled_busy_load_per_task >=
				5473	local->avg_load + (scaled_busy_load_per_task * imbn)) {
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	5474	env->imbalance = busiest->load_per_task;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5475	return;
				5476	}
				5477
				5478	/*
				5479	* OK, we don't have enough imbalance to justify moving tasks,
				5480	* however we may be able to increase total CPU power used by
				5481	* moving them.
				5482	*/
				5483
Peter Zijlstra	3ae11c9	2013-08-15 20:37:48 +0200	[diff] [blame]	5484	pwr_now += busiest->group_power *
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	5485	min(busiest->load_per_task, busiest->avg_load);
Peter Zijlstra	3ae11c9	2013-08-15 20:37:48 +0200	[diff] [blame]	5486	pwr_now += local->group_power *
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	5487	min(local->load_per_task, local->avg_load);
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	5488	pwr_now /= SCHED_POWER_SCALE;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5489
				5490	/* Amount of load we'd subtract */
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	5491	tmp = (busiest->load_per_task * SCHED_POWER_SCALE) /
Peter Zijlstra	3ae11c9	2013-08-15 20:37:48 +0200	[diff] [blame]	5492	busiest->group_power;
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	5493	if (busiest->avg_load > tmp) {
Peter Zijlstra	3ae11c9	2013-08-15 20:37:48 +0200	[diff] [blame]	5494	pwr_move += busiest->group_power *
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	5495	min(busiest->load_per_task,
				5496	busiest->avg_load - tmp);
				5497	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5498
				5499	/* Amount of load we'd add */
Peter Zijlstra	3ae11c9	2013-08-15 20:37:48 +0200	[diff] [blame]	5500	if (busiest->avg_load * busiest->group_power <
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	5501	busiest->load_per_task * SCHED_POWER_SCALE) {
Peter Zijlstra	3ae11c9	2013-08-15 20:37:48 +0200	[diff] [blame]	5502	tmp = (busiest->avg_load * busiest->group_power) /
				5503	local->group_power;
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	5504	} else {
				5505	tmp = (busiest->load_per_task * SCHED_POWER_SCALE) /
Peter Zijlstra	3ae11c9	2013-08-15 20:37:48 +0200	[diff] [blame]	5506	local->group_power;
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	5507	}
Peter Zijlstra	3ae11c9	2013-08-15 20:37:48 +0200	[diff] [blame]	5508	pwr_move += local->group_power *
				5509	min(local->load_per_task, local->avg_load + tmp);
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	5510	pwr_move /= SCHED_POWER_SCALE;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5511
				5512	/* Move if we gain throughput */
				5513	if (pwr_move > pwr_now)
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	5514	env->imbalance = busiest->load_per_task;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5515	}
				5516
				5517	/**
				5518	* calculate_imbalance - Calculate the amount of imbalance present within the
				5519	* groups of a given sched_domain during load balance.
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	5520	* @env: load balance environment
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5521	* @sds: statistics of the sched_domain whose imbalance is to be calculated.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5522	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	5523	static inline void calculate_imbalance(struct lb_env env, struct sd_lb_stats sds)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5524	{
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	5525	unsigned long max_pull, load_above_capacity = ~0UL;
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	5526	struct sg_lb_stats local, busiest;
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	5527
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	5528	local = &sds->local_stat;
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	5529	busiest = &sds->busiest_stat;
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	5530
				5531	if (busiest->group_imb) {
Peter Zijlstra	30ce5da	2013-08-15 20:29:29 +0200	[diff] [blame]	5532	/*
				5533	* In the group_imb case we cannot rely on group-wide averages
				5534	* to ensure cpu-load equilibrium, look at wider averages. XXX
				5535	*/
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	5536	busiest->load_per_task =
				5537	min(busiest->load_per_task, sds->avg_load);
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	5538	}
				5539
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5540	/*
				5541	* In the presence of smp nice balancing, certain scenarios can have
				5542	* max load less than avg load(as we skip the groups at or below
				5543	* its cpu_power, while calculating max_load..)
				5544	*/
Vladimir Davydov	b188555	2013-09-15 17:49:13 +0400	[diff] [blame]	5545	if (busiest->avg_load <= sds->avg_load \|\|
				5546	local->avg_load >= sds->avg_load) {
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	5547	env->imbalance = 0;
				5548	return fix_small_imbalance(env, sds);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5549	}
				5550
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	5551	if (!busiest->group_imb) {
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	5552	/*
				5553	* Don't want to pull so many tasks that a group would go idle.
Peter Zijlstra	30ce5da	2013-08-15 20:29:29 +0200	[diff] [blame]	5554	* Except of course for the group_imb case, since then we might
				5555	* have to drop below capacity to reach cpu-load equilibrium.
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	5556	*/
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	5557	load_above_capacity =
				5558	(busiest->sum_nr_running - busiest->group_capacity);
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	5559
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	5560	load_above_capacity = (SCHED_LOAD_SCALE SCHED_POWER_SCALE);
Peter Zijlstra	3ae11c9	2013-08-15 20:37:48 +0200	[diff] [blame]	5561	load_above_capacity /= busiest->group_power;
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	5562	}
				5563
				5564	/*
				5565	* We're trying to get all the cpus to the average_load, so we don't
				5566	* want to push ourselves above the average load, nor do we wish to
				5567	* reduce the max loaded cpu below the average load. At the same time,
				5568	* we also don't want to reduce the group load below the group capacity
				5569	* (so that we can implement power-savings policies etc). Thus we look
				5570	* for the minimum possible imbalance.
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	5571	*/
Peter Zijlstra	30ce5da	2013-08-15 20:29:29 +0200	[diff] [blame]	5572	max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5573
				5574	/* How much load to actually move to equalise the imbalance */
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	5575	env->imbalance = min(
Peter Zijlstra	3ae11c9	2013-08-15 20:37:48 +0200	[diff] [blame]	5576	max_pull * busiest->group_power,
				5577	(sds->avg_load - local->avg_load) * local->group_power
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	5578	) / SCHED_POWER_SCALE;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5579
				5580	/*
				5581	* if *imbalance is less than the average load per runnable task
Lucas De Marchi	25985ed	2011-03-30 22:57:33 -0300	[diff] [blame]	5582	* there is no guarantee that any tasks will be moved so we'll have
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5583	* a think about bumping its value to force at least one task to be
				5584	* moved
				5585	*/
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	5586	if (env->imbalance < busiest->load_per_task)
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	5587	return fix_small_imbalance(env, sds);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5588	}
Nikhil Rao	fab4762	2010-10-15 13:12:29 -0700	[diff] [blame]	5589
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5590	/***** find_busiest_group() helpers end here *******************/
				5591
				5592	/**
				5593	* find_busiest_group - Returns the busiest group within the sched_domain
				5594	* if there is an imbalance. If there isn't an imbalance, and
				5595	* the user has opted for power-savings, it returns a group whose
				5596	* CPUs can be put to idle by rebalancing those tasks elsewhere, if
				5597	* such a group exists.
				5598	*
				5599	* Also calculates the amount of weighted load which should be moved
				5600	* to restore balance.
				5601	*
Randy Dunlap	cd96891	2012-06-08 13:18:33 -0700	[diff] [blame]	5602	* @env: The load balancing environment.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5603	*
Yacine Belkadi	e69f618	2013-07-12 20:45:47 +0200	[diff] [blame]	5604	* Return: - The busiest group if imbalance exists.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5605	* - If no imbalance and user has opted for power-savings balance,
				5606	* return the least loaded group whose CPUs can be
				5607	* put to idle by rebalancing its tasks onto our group.
				5608	*/
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	5609	static struct sched_group find_busiest_group(struct lb_env env)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5610	{
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	5611	struct sg_lb_stats local, busiest;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5612	struct sd_lb_stats sds;
				5613
Peter Zijlstra	147c5fc	2013-08-19 15:22:57 +0200	[diff] [blame]	5614	init_sd_lb_stats(&sds);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5615
				5616	/*
				5617	* Compute the various statistics relavent for load balancing at
				5618	* this level.
				5619	*/
Joonsoo Kim	23f0d20	2013-08-06 17:36:42 +0900	[diff] [blame]	5620	update_sd_lb_stats(env, &sds);
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	5621	local = &sds.local_stat;
				5622	busiest = &sds.busiest_stat;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5623
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	5624	if ((env->idle == CPU_IDLE \|\| env->idle == CPU_NEWLY_IDLE) &&
				5625	check_asym_packing(env, &sds))
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	5626	return sds.busiest;
				5627
Peter Zijlstra	cc57aa8	2011-02-21 18:55:32 +0100	[diff] [blame]	5628	/* There is no busy sibling group to pull tasks from */
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	5629	if (!sds.busiest \|\| busiest->sum_nr_running == 0)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5630	goto out_balanced;
				5631
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	5632	sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr;
Ken Chen	b0432d8	2011-04-07 17:23:22 -0700	[diff] [blame]	5633
Peter Zijlstra	866ab43	2011-02-21 18:56:47 +0100	[diff] [blame]	5634	/*
				5635	* If the busiest group is imbalanced the below checks don't
Peter Zijlstra	30ce5da	2013-08-15 20:29:29 +0200	[diff] [blame]	5636	* work because they assume all things are equal, which typically
Peter Zijlstra	866ab43	2011-02-21 18:56:47 +0100	[diff] [blame]	5637	* isn't true due to cpus_allowed constraints and the like.
				5638	*/
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	5639	if (busiest->group_imb)
Peter Zijlstra	866ab43	2011-02-21 18:56:47 +0100	[diff] [blame]	5640	goto force_balance;
				5641
Peter Zijlstra	cc57aa8	2011-02-21 18:55:32 +0100	[diff] [blame]	5642	/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	5643	if (env->idle == CPU_NEWLY_IDLE && local->group_has_capacity &&
				5644	!busiest->group_has_capacity)
Nikhil Rao	fab4762	2010-10-15 13:12:29 -0700	[diff] [blame]	5645	goto force_balance;
				5646
Peter Zijlstra	cc57aa8	2011-02-21 18:55:32 +0100	[diff] [blame]	5647	/*
				5648	* If the local group is more busy than the selected busiest group
				5649	* don't try and pull any tasks.
				5650	*/
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	5651	if (local->avg_load >= busiest->avg_load)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5652	goto out_balanced;
				5653
Peter Zijlstra	cc57aa8	2011-02-21 18:55:32 +0100	[diff] [blame]	5654	/*
				5655	* Don't pull any tasks if this group is already above the domain
				5656	* average load.
				5657	*/
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	5658	if (local->avg_load >= sds.avg_load)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5659	goto out_balanced;
				5660
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	5661	if (env->idle == CPU_IDLE) {
Suresh Siddha	aae6d3d	2010-09-17 15:02:32 -0700	[diff] [blame]	5662	/*
				5663	* This cpu is idle. If the busiest group load doesn't
				5664	* have more tasks than the number of available cpu's and
				5665	* there is no imbalance between this and busiest group
				5666	* wrt to idle cpu's, it is balanced.
				5667	*/
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	5668	if ((local->idle_cpus < busiest->idle_cpus) &&
				5669	busiest->sum_nr_running <= busiest->group_weight)
Suresh Siddha	aae6d3d	2010-09-17 15:02:32 -0700	[diff] [blame]	5670	goto out_balanced;
Peter Zijlstra	c186faf	2011-02-21 18:52:53 +0100	[diff] [blame]	5671	} else {
				5672	/*
				5673	* In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
				5674	* imbalance_pct to be conservative.
				5675	*/
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	5676	if (100 * busiest->avg_load <=
				5677	env->sd->imbalance_pct * local->avg_load)
Peter Zijlstra	c186faf	2011-02-21 18:52:53 +0100	[diff] [blame]	5678	goto out_balanced;
Suresh Siddha	aae6d3d	2010-09-17 15:02:32 -0700	[diff] [blame]	5679	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5680
Nikhil Rao	fab4762	2010-10-15 13:12:29 -0700	[diff] [blame]	5681	force_balance:
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5682	/* Looks like there is an imbalance. Compute it */
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	5683	calculate_imbalance(env, &sds);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5684	return sds.busiest;
				5685
				5686	out_balanced:
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	5687	env->imbalance = 0;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5688	return NULL;
				5689	}
				5690
				5691	/*
				5692	* find_busiest_queue - find the busiest runqueue among the cpus in group.
				5693	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	5694	static struct rq find_busiest_queue(struct lb_env env,
Michael Wang	b940313	2012-07-12 16:10:13 +0800	[diff] [blame]	5695	struct sched_group *group)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5696	{
				5697	struct rq busiest = NULL, rq;
Joonsoo Kim	95a79b8	2013-08-06 17:36:41 +0900	[diff] [blame]	5698	unsigned long busiest_load = 0, busiest_power = 1;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5699	int i;
				5700
Peter Zijlstra	6906a40	2013-08-19 15:20:21 +0200	[diff] [blame]	5701	for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5702	unsigned long power = power_of(i);
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	5703	unsigned long capacity = DIV_ROUND_CLOSEST(power,
				5704	SCHED_POWER_SCALE);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5705	unsigned long wl;
				5706
Srivatsa Vaddagiri	9d5efe0	2010-06-08 14:57:02 +1000	[diff] [blame]	5707	if (!capacity)
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	5708	capacity = fix_small_capacity(env->sd, group);
Srivatsa Vaddagiri	9d5efe0	2010-06-08 14:57:02 +1000	[diff] [blame]	5709
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5710	rq = cpu_rq(i);
Thomas Gleixner	6e40f5b	2010-02-16 16:48:56 +0100	[diff] [blame]	5711	wl = weighted_cpuload(i);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5712
Thomas Gleixner	6e40f5b	2010-02-16 16:48:56 +0100	[diff] [blame]	5713	/*
				5714	* When comparing with imbalance, use weighted_cpuload()
				5715	* which is not scaled with the cpu power.
				5716	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	5717	if (capacity && rq->nr_running == 1 && wl > env->imbalance)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5718	continue;
				5719
Thomas Gleixner	6e40f5b	2010-02-16 16:48:56 +0100	[diff] [blame]	5720	/*
				5721	* For the load comparisons with the other cpu's, consider
				5722	* the weighted_cpuload() scaled with the cpu power, so that
				5723	* the load can be moved away from the cpu that is potentially
				5724	* running at a lower capacity.
Joonsoo Kim	95a79b8	2013-08-06 17:36:41 +0900	[diff] [blame]	5725	*
				5726	* Thus we're looking for max(wl_i / power_i), crosswise
				5727	* multiplication to rid ourselves of the division works out
				5728	* to: wl_i * power_j > wl_j * power_i; where j is our
				5729	* previous maximum.
Thomas Gleixner	6e40f5b	2010-02-16 16:48:56 +0100	[diff] [blame]	5730	*/
Joonsoo Kim	95a79b8	2013-08-06 17:36:41 +0900	[diff] [blame]	5731	if (wl * busiest_power > busiest_load * power) {
				5732	busiest_load = wl;
				5733	busiest_power = power;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5734	busiest = rq;
				5735	}
				5736	}
				5737
				5738	return busiest;
				5739	}
				5740
				5741	/*
				5742	* Max backoff if we encounter pinned tasks. The exact value is fairly
				5743	* arbitrary; it only needs to be large enough.
				5744	*/
				5745	#define MAX_PINNED_INTERVAL 512
				5746
				5747	/* Working cpumask for load_balance (seeded there from cpu_active_mask) and load_balance_newidle. */
Joonsoo Kim	e6252c3	2013-04-23 17:27:41 +0900	[diff] [blame]	5748	DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5749
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	5750	static int need_active_balance(struct lb_env *env)
Peter Zijlstra	1af3ed3	2009-12-23 15:10:31 +0100	[diff] [blame]	5751	{
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	5752	struct sched_domain *sd = env->sd;
				5753
				5754	if (env->idle == CPU_NEWLY_IDLE) {
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	5755
				5756	/*
				5757	* ASYM_PACKING needs to force migrate tasks from busy but
				5758	* higher numbered CPUs in order to pack all tasks in the
				5759	* lowest numbered CPUs.
				5760	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	5761	if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	5762	return 1;
Peter Zijlstra	1af3ed3	2009-12-23 15:10:31 +0100	[diff] [blame]	5763	}
				5764
				5765	return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
				5766	}
				5767
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	5768	static int active_load_balance_cpu_stop(void *data);
				5769
Joonsoo Kim	23f0d20	2013-08-06 17:36:42 +0900	[diff] [blame]	5770	static int should_we_balance(struct lb_env *env)
				5771	{
				5772	struct sched_group *sg = env->sd->groups;
				5773	struct cpumask sg_cpus, sg_mask;
				5774	int cpu, balance_cpu = -1;
				5775
				5776	/*
				5777	* In the newly idle case, we will allow all the cpu's
				5778	* to do the newly idle load balance.
				5779	*/
				5780	if (env->idle == CPU_NEWLY_IDLE)
				5781	return 1;
				5782
				5783	sg_cpus = sched_group_cpus(sg);
				5784	sg_mask = sched_group_mask(sg);
				5785	/* Try to find first idle cpu */
				5786	for_each_cpu_and(cpu, sg_cpus, env->cpus) {
				5787	if (!cpumask_test_cpu(cpu, sg_mask) \|\| !idle_cpu(cpu))
				5788	continue;
				5789
				5790	balance_cpu = cpu;
				5791	break;
				5792	}
				5793
				5794	if (balance_cpu == -1)
				5795	balance_cpu = group_balance_cpu(sg);
				5796
				5797	/*
				5798	* First idle cpu or the first cpu(busiest) in this sched group
				5799	* is eligible for doing load balancing at this and above domains.
				5800	*/
Joonsoo Kim	b0cff9d	2013-09-10 15:54:49 +0900	[diff] [blame]	5801	return balance_cpu == env->dst_cpu;
Joonsoo Kim	23f0d20	2013-08-06 17:36:42 +0900	[diff] [blame]	5802	}
				5803
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5804	/*
				5805	* Check this_cpu to ensure it is balanced within domain. Attempt to move
				5806	* tasks if there is an imbalance.
				5807	*/
				5808	static int load_balance(int this_cpu, struct rq *this_rq,
				5809	struct sched_domain *sd, enum cpu_idle_type idle,
Joonsoo Kim	23f0d20	2013-08-06 17:36:42 +0900	[diff] [blame]	5810	int *continue_balancing)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5811	{
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	5812	int ld_moved, cur_ld_moved, active_balance = 0;
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	5813	struct sched_domain *sd_parent = sd->parent;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5814	struct sched_group *group;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5815	struct rq *busiest;
				5816	unsigned long flags;
Joonsoo Kim	e6252c3	2013-04-23 17:27:41 +0900	[diff] [blame]	5817	struct cpumask *cpus = __get_cpu_var(load_balance_mask);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5818
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	5819	struct lb_env env = {
				5820	.sd = sd,
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	5821	.dst_cpu = this_cpu,
				5822	.dst_rq = this_rq,
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	5823	.dst_grpmask = sched_group_cpus(sd->groups),
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	5824	.idle = idle,
Peter Zijlstra	eb95308	2012-04-17 13:38:40 +0200	[diff] [blame]	5825	.loop_break = sched_nr_migrate_break,
Michael Wang	b940313	2012-07-12 16:10:13 +0800	[diff] [blame]	5826	.cpus = cpus,
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	5827	};
				5828
Joonsoo Kim	cfc0311	2013-04-23 17:27:39 +0900	[diff] [blame]	5829	/*
				5830	* For NEWLY_IDLE load_balancing, we don't need to consider
				5831	* other cpus in our group
				5832	*/
Joonsoo Kim	e02e60c	2013-04-23 17:27:42 +0900	[diff] [blame]	5833	if (idle == CPU_NEWLY_IDLE)
Joonsoo Kim	cfc0311	2013-04-23 17:27:39 +0900	[diff] [blame]	5834	env.dst_grpmask = NULL;
Joonsoo Kim	cfc0311	2013-04-23 17:27:39 +0900	[diff] [blame]	5835
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5836	cpumask_copy(cpus, cpu_active_mask);
				5837
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5838	schedstat_inc(sd, lb_count[idle]);
				5839
				5840	redo:
Joonsoo Kim	23f0d20	2013-08-06 17:36:42 +0900	[diff] [blame]	5841	if (!should_we_balance(&env)) {
				5842	*continue_balancing = 0;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5843	goto out_balanced;
Joonsoo Kim	23f0d20	2013-08-06 17:36:42 +0900	[diff] [blame]	5844	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5845
Joonsoo Kim	23f0d20	2013-08-06 17:36:42 +0900	[diff] [blame]	5846	group = find_busiest_group(&env);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5847	if (!group) {
				5848	schedstat_inc(sd, lb_nobusyg[idle]);
				5849	goto out_balanced;
				5850	}
				5851
Michael Wang	b940313	2012-07-12 16:10:13 +0800	[diff] [blame]	5852	busiest = find_busiest_queue(&env, group);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5853	if (!busiest) {
				5854	schedstat_inc(sd, lb_nobusyq[idle]);
				5855	goto out_balanced;
				5856	}
				5857
Michael Wang	78feefc	2012-08-06 16:41:59 +0800	[diff] [blame]	5858	BUG_ON(busiest == env.dst_rq);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5859
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	5860	schedstat_add(sd, lb_imbalance[idle], env.imbalance);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5861
				5862	ld_moved = 0;
				5863	if (busiest->nr_running > 1) {
				5864	/*
				5865	* Attempt to move tasks. If find_busiest_group has found
				5866	* an imbalance but busiest->nr_running <= 1, the group is
				5867	* still unbalanced. ld_moved simply stays zero, so it is
				5868	* correctly treated as an imbalance.
				5869	*/
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	5870	env.flags \|= LBF_ALL_PINNED;
Peter Zijlstra	c82513e	2012-04-26 13:12:27 +0200	[diff] [blame]	5871	env.src_cpu = busiest->cpu;
				5872	env.src_rq = busiest;
				5873	env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	5874
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	5875	more_balance:
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5876	local_irq_save(flags);
Michael Wang	78feefc	2012-08-06 16:41:59 +0800	[diff] [blame]	5877	double_rq_lock(env.dst_rq, busiest);
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	5878
				5879	/*
				5880	* cur_ld_moved - load moved in current iteration
				5881	* ld_moved - cumulative load moved across iterations
				5882	*/
				5883	cur_ld_moved = move_tasks(&env);
				5884	ld_moved += cur_ld_moved;
Michael Wang	78feefc	2012-08-06 16:41:59 +0800	[diff] [blame]	5885	double_rq_unlock(env.dst_rq, busiest);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5886	local_irq_restore(flags);
				5887
				5888	/*
				5889	* some other cpu did the load balance for us.
				5890	*/
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	5891	if (cur_ld_moved && env.dst_cpu != smp_processor_id())
				5892	resched_cpu(env.dst_cpu);
				5893
Joonsoo Kim	f1cd085	2013-04-23 17:27:37 +0900	[diff] [blame]	5894	if (env.flags & LBF_NEED_BREAK) {
				5895	env.flags &= ~LBF_NEED_BREAK;
				5896	goto more_balance;
				5897	}
				5898
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	5899	/*
				5900	* Revisit (affine) tasks on src_cpu that couldn't be moved to
				5901	* us and move them to an alternate dst_cpu in our sched_group
				5902	* where they can run. The upper limit on how many times we
				5903	* iterate on same src_cpu is dependent on number of cpus in our
				5904	* sched_group.
				5905	*
				5906	* This changes load balance semantics a bit on who can move
				5907	* load to a given_cpu. In addition to the given_cpu itself
				5908	* (or a ilb_cpu acting on its behalf where given_cpu is
				5909	* nohz-idle), we now have balance_cpu in a position to move
				5910	* load to given_cpu. In rare situations, this may cause
				5911	* conflicts (balance_cpu and given_cpu/ilb_cpu deciding
				5912	* _independently_ and at _same_ time to move some load to
				5913	* given_cpu) causing exceess load to be moved to given_cpu.
				5914	* This however should not happen so much in practice and
				5915	* moreover subsequent load balance cycles should correct the
				5916	* excess load moved.
				5917	*/
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	5918	if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	5919
Vladimir Davydov	7aff2e3	2013-09-15 21:30:13 +0400	[diff] [blame]	5920	/* Prevent to re-select dst_cpu via env's cpus */
				5921	cpumask_clear_cpu(env.dst_cpu, env.cpus);
				5922
Michael Wang	78feefc	2012-08-06 16:41:59 +0800	[diff] [blame]	5923	env.dst_rq = cpu_rq(env.new_dst_cpu);
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	5924	env.dst_cpu = env.new_dst_cpu;
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	5925	env.flags &= ~LBF_DST_PINNED;
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	5926	env.loop = 0;
				5927	env.loop_break = sched_nr_migrate_break;
Joonsoo Kim	e02e60c	2013-04-23 17:27:42 +0900	[diff] [blame]	5928
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	5929	/*
				5930	* Go back to "more_balance" rather than "redo" since we
				5931	* need to continue with same src_cpu.
				5932	*/
				5933	goto more_balance;
				5934	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5935
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	5936	/*
				5937	* We failed to reach balance because of affinity.
				5938	*/
				5939	if (sd_parent) {
				5940	int *group_imbalance = &sd_parent->groups->sgp->imbalance;
				5941
				5942	if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) {
				5943	*group_imbalance = 1;
				5944	} else if (*group_imbalance)
				5945	*group_imbalance = 0;
				5946	}
				5947
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5948	/* All tasks on this runqueue were pinned by CPU affinity */
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	5949	if (unlikely(env.flags & LBF_ALL_PINNED)) {
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5950	cpumask_clear_cpu(cpu_of(busiest), cpus);
Prashanth Nageshappa	bbf18b1	2012-06-19 17:52:07 +0530	[diff] [blame]	5951	if (!cpumask_empty(cpus)) {
				5952	env.loop = 0;
				5953	env.loop_break = sched_nr_migrate_break;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5954	goto redo;
Prashanth Nageshappa	bbf18b1	2012-06-19 17:52:07 +0530	[diff] [blame]	5955	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5956	goto out_balanced;
				5957	}
				5958	}
				5959
				5960	if (!ld_moved) {
				5961	schedstat_inc(sd, lb_failed[idle]);
Venkatesh Pallipadi	58b26c4	2010-09-10 18:19:17 -0700	[diff] [blame]	5962	/*
				5963	* Increment the failure counter only on periodic balance.
				5964	* We do not want newidle balance, which can be very
				5965	* frequent, pollute the failure counter causing
				5966	* excessive cache_hot migrations and active balances.
				5967	*/
				5968	if (idle != CPU_NEWLY_IDLE)
				5969	sd->nr_balance_failed++;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5970
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	5971	if (need_active_balance(&env)) {
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5972	raw_spin_lock_irqsave(&busiest->lock, flags);
				5973
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	5974	/* don't kick the active_load_balance_cpu_stop,
				5975	* if the curr task on busiest cpu can't be
				5976	* moved to this_cpu
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5977	*/
				5978	if (!cpumask_test_cpu(this_cpu,
Peter Zijlstra	fa17b50	2011-06-16 12:23:22 +0200	[diff] [blame]	5979	tsk_cpus_allowed(busiest->curr))) {
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5980	raw_spin_unlock_irqrestore(&busiest->lock,
				5981	flags);
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	5982	env.flags \|= LBF_ALL_PINNED;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5983	goto out_one_pinned;
				5984	}
				5985
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	5986	/*
				5987	* ->active_balance synchronizes accesses to
				5988	* ->active_balance_work. Once set, it's cleared
				5989	* only after active load balance is finished.
				5990	*/
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5991	if (!busiest->active_balance) {
				5992	busiest->active_balance = 1;
				5993	busiest->push_cpu = this_cpu;
				5994	active_balance = 1;
				5995	}
				5996	raw_spin_unlock_irqrestore(&busiest->lock, flags);
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	5997
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	5998	if (active_balance) {
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	5999	stop_one_cpu_nowait(cpu_of(busiest),
				6000	active_load_balance_cpu_stop, busiest,
				6001	&busiest->active_balance_work);
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	6002	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6003
				6004	/*
				6005	* We've kicked active balancing, reset the failure
				6006	* counter.
				6007	*/
				6008	sd->nr_balance_failed = sd->cache_nice_tries+1;
				6009	}
				6010	} else
				6011	sd->nr_balance_failed = 0;
				6012
				6013	if (likely(!active_balance)) {
				6014	/* We were unbalanced, so reset the balancing interval */
				6015	sd->balance_interval = sd->min_interval;
				6016	} else {
				6017	/*
				6018	* If we've begun active balancing, start to back off. This
				6019	* case may not be covered by the all_pinned logic if there
				6020	* is only 1 task on the busy runqueue (because we don't call
				6021	* move_tasks).
				6022	*/
				6023	if (sd->balance_interval < sd->max_interval)
				6024	sd->balance_interval *= 2;
				6025	}
				6026
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6027	goto out;
				6028
				6029	out_balanced:
				6030	schedstat_inc(sd, lb_balanced[idle]);
				6031
				6032	sd->nr_balance_failed = 0;
				6033
				6034	out_one_pinned:
				6035	/* tune up the balancing interval */
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	6036	if (((env.flags & LBF_ALL_PINNED) &&
Peter Zijlstra	5b54b56	2011-09-22 15:23:13 +0200	[diff] [blame]	6037	sd->balance_interval < MAX_PINNED_INTERVAL) \|\|
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6038	(sd->balance_interval < sd->max_interval))
				6039	sd->balance_interval *= 2;
				6040
Venkatesh Pallipadi	46e49b3	2011-02-14 14:38:50 -0800	[diff] [blame]	6041	ld_moved = 0;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6042	out:
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6043	return ld_moved;
				6044	}
				6045
				6046	/*
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6047	* idle_balance is called by schedule() if this_cpu is about to become
				6048	* idle. Attempts to pull tasks from other CPUs.
				6049	*/
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	6050	void idle_balance(int this_cpu, struct rq *this_rq)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6051	{
				6052	struct sched_domain *sd;
				6053	int pulled_task = 0;
				6054	unsigned long next_balance = jiffies + HZ;
Jason Low	9bd721c	2013-09-13 11:26:52 -0700	[diff] [blame]	6055	u64 curr_cost = 0;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6056
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	6057	this_rq->idle_stamp = rq_clock(this_rq);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6058
				6059	if (this_rq->avg_idle < sysctl_sched_migration_cost)
				6060	return;
				6061
Peter Zijlstra	f492e12	2009-12-23 15:29:42 +0100	[diff] [blame]	6062	/*
				6063	* Drop the rq->lock, but keep IRQ/preempt disabled.
				6064	*/
				6065	raw_spin_unlock(&this_rq->lock);
				6066
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	6067	update_blocked_averages(this_cpu);
Peter Zijlstra	dce840a	2011-04-07 14:09:50 +0200	[diff] [blame]	6068	rcu_read_lock();
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6069	for_each_domain(this_cpu, sd) {
				6070	unsigned long interval;
Joonsoo Kim	23f0d20	2013-08-06 17:36:42 +0900	[diff] [blame]	6071	int continue_balancing = 1;
Jason Low	9bd721c	2013-09-13 11:26:52 -0700	[diff] [blame]	6072	u64 t0, domain_cost;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6073
				6074	if (!(sd->flags & SD_LOAD_BALANCE))
				6075	continue;
				6076
Jason Low	9bd721c	2013-09-13 11:26:52 -0700	[diff] [blame]	6077	if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost)
				6078	break;
				6079
Peter Zijlstra	f492e12	2009-12-23 15:29:42 +0100	[diff] [blame]	6080	if (sd->flags & SD_BALANCE_NEWIDLE) {
Jason Low	9bd721c	2013-09-13 11:26:52 -0700	[diff] [blame]	6081	t0 = sched_clock_cpu(this_cpu);
				6082
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6083	/* If we've pulled tasks over stop searching: */
Peter Zijlstra	f492e12	2009-12-23 15:29:42 +0100	[diff] [blame]	6084	pulled_task = load_balance(this_cpu, this_rq,
Joonsoo Kim	23f0d20	2013-08-06 17:36:42 +0900	[diff] [blame]	6085	sd, CPU_NEWLY_IDLE,
				6086	&continue_balancing);
Jason Low	9bd721c	2013-09-13 11:26:52 -0700	[diff] [blame]	6087
				6088	domain_cost = sched_clock_cpu(this_cpu) - t0;
				6089	if (domain_cost > sd->max_newidle_lb_cost)
				6090	sd->max_newidle_lb_cost = domain_cost;
				6091
				6092	curr_cost += domain_cost;
Peter Zijlstra	f492e12	2009-12-23 15:29:42 +0100	[diff] [blame]	6093	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6094
				6095	interval = msecs_to_jiffies(sd->balance_interval);
				6096	if (time_after(next_balance, sd->last_balance + interval))
				6097	next_balance = sd->last_balance + interval;
Nikhil Rao	d5ad140	2010-11-17 11:42:04 -0800	[diff] [blame]	6098	if (pulled_task) {
				6099	this_rq->idle_stamp = 0;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6100	break;
Nikhil Rao	d5ad140	2010-11-17 11:42:04 -0800	[diff] [blame]	6101	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6102	}
Peter Zijlstra	dce840a	2011-04-07 14:09:50 +0200	[diff] [blame]	6103	rcu_read_unlock();
Peter Zijlstra	f492e12	2009-12-23 15:29:42 +0100	[diff] [blame]	6104
				6105	raw_spin_lock(&this_rq->lock);
				6106
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6107	if (pulled_task \|\| time_after(jiffies, this_rq->next_balance)) {
				6108	/*
				6109	* We are going idle. next_balance may be set based on
				6110	* a busy processor. So reset next_balance.
				6111	*/
				6112	this_rq->next_balance = next_balance;
				6113	}
Jason Low	9bd721c	2013-09-13 11:26:52 -0700	[diff] [blame]	6114
				6115	if (curr_cost > this_rq->max_idle_balance_cost)
				6116	this_rq->max_idle_balance_cost = curr_cost;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6117	}
				6118
				6119	/*
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	6120	* active_load_balance_cpu_stop is run by cpu stopper. It pushes
				6121	* running tasks off the busiest CPU onto idle CPUs. It requires at
				6122	* least 1 task to be running on each physical CPU where possible, and
				6123	* avoids physical / logical imbalances.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6124	*/
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	6125	static int active_load_balance_cpu_stop(void *data)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6126	{
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	6127	struct rq *busiest_rq = data;
				6128	int busiest_cpu = cpu_of(busiest_rq);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6129	int target_cpu = busiest_rq->push_cpu;
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	6130	struct rq *target_rq = cpu_rq(target_cpu);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6131	struct sched_domain *sd;
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	6132
				6133	raw_spin_lock_irq(&busiest_rq->lock);
				6134
				6135	/* make sure the requested cpu hasn't gone down in the meantime */
				6136	if (unlikely(busiest_cpu != smp_processor_id() \|\|
				6137	!busiest_rq->active_balance))
				6138	goto out_unlock;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6139
				6140	/* Is there any task to move? */
				6141	if (busiest_rq->nr_running <= 1)
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	6142	goto out_unlock;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6143
				6144	/*
				6145	* This condition is "impossible", if it occurs
				6146	* we need to fix it. Originally reported by
				6147	* Bjorn Helgaas on a 128-cpu setup.
				6148	*/
				6149	BUG_ON(busiest_rq == target_rq);
				6150
				6151	/* move a task from busiest_rq to target_rq */
				6152	double_lock_balance(busiest_rq, target_rq);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6153
				6154	/* Search for an sd spanning us and the target CPU. */
Peter Zijlstra	dce840a	2011-04-07 14:09:50 +0200	[diff] [blame]	6155	rcu_read_lock();
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6156	for_each_domain(target_cpu, sd) {
				6157	if ((sd->flags & SD_LOAD_BALANCE) &&
				6158	cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
				6159	break;
				6160	}
				6161
				6162	if (likely(sd)) {
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	6163	struct lb_env env = {
				6164	.sd = sd,
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	6165	.dst_cpu = target_cpu,
				6166	.dst_rq = target_rq,
				6167	.src_cpu = busiest_rq->cpu,
				6168	.src_rq = busiest_rq,
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	6169	.idle = CPU_IDLE,
				6170	};
				6171
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6172	schedstat_inc(sd, alb_count);
				6173
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	6174	if (move_one_task(&env))
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6175	schedstat_inc(sd, alb_pushed);
				6176	else
				6177	schedstat_inc(sd, alb_failed);
				6178	}
Peter Zijlstra	dce840a	2011-04-07 14:09:50 +0200	[diff] [blame]	6179	rcu_read_unlock();
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6180	double_unlock_balance(busiest_rq, target_rq);
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	6181	out_unlock:
				6182	busiest_rq->active_balance = 0;
				6183	raw_spin_unlock_irq(&busiest_rq->lock);
				6184	return 0;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6185	}
				6186
Frederic Weisbecker	3451d02	2011-08-10 23:21:01 +0200	[diff] [blame]	6187	#ifdef CONFIG_NO_HZ_COMMON
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	6188	/*
				6189	* idle load balancing details
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	6190	* - When one of the busy CPUs notice that there may be an idle rebalancing
				6191	* needed, they will kick the idle load balancer, which then does idle
				6192	* load balancing for all the idle CPUs.
				6193	*/
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6194	static struct {
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	6195	cpumask_var_t idle_cpus_mask;
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	6196	atomic_t nr_cpus;
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	6197	unsigned long next_balance; /* in jiffy units */
				6198	} nohz ____cacheline_aligned;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6199
Peter Zijlstra	8e7fbcb	2012-01-09 11:28:35 +0100	[diff] [blame]	6200	static inline int find_new_ilb(int call_cpu)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6201	{
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	6202	int ilb = cpumask_first(nohz.idle_cpus_mask);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6203
Suresh Siddha	786d6dc	2011-12-01 17:07:35 -0800	[diff] [blame]	6204	if (ilb < nr_cpu_ids && idle_cpu(ilb))
				6205	return ilb;
				6206
				6207	return nr_cpu_ids;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6208	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6209
				6210	/*
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	6211	* Kick a CPU to do the nohz balancing, if it is time for it. We pick the
				6212	* nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
				6213	* CPU (if there is one).
				6214	*/
				6215	static void nohz_balancer_kick(int cpu)
				6216	{
				6217	int ilb_cpu;
				6218
				6219	nohz.next_balance++;
				6220
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	6221	ilb_cpu = find_new_ilb(cpu);
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	6222
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	6223	if (ilb_cpu >= nr_cpu_ids)
				6224	return;
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	6225
Suresh Siddha	cd490c5	2011-12-06 11:26:34 -0800	[diff] [blame]	6226	if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
Suresh Siddha	1c792db	2011-12-01 17:07:32 -0800	[diff] [blame]	6227	return;
				6228	/*
				6229	* Use smp_send_reschedule() instead of resched_cpu().
				6230	* This way we generate a sched IPI on the target cpu which
				6231	* is idle. And the softirq performing nohz idle load balance
				6232	* will be run before returning from the IPI.
				6233	*/
				6234	smp_send_reschedule(ilb_cpu);
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	6235	return;
				6236	}
				6237
Alex Shi	c1cc017	2012-09-10 15:10:58 +0800	[diff] [blame]	6238	static inline void nohz_balance_exit_idle(int cpu)
Suresh Siddha	7132596	2012-01-19 18:28:57 -0800	[diff] [blame]	6239	{
				6240	if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
				6241	cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
				6242	atomic_dec(&nohz.nr_cpus);
				6243	clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
				6244	}
				6245	}
				6246
Suresh Siddha	69e1e81	2011-12-01 17:07:33 -0800	[diff] [blame]	6247	static inline void set_cpu_sd_state_busy(void)
				6248	{
				6249	struct sched_domain *sd;
Suresh Siddha	69e1e81	2011-12-01 17:07:33 -0800	[diff] [blame]	6250
Suresh Siddha	69e1e81	2011-12-01 17:07:33 -0800	[diff] [blame]	6251	rcu_read_lock();
Nathan Zimmer	424c93f	2013-05-09 11:24:03 -0500	[diff] [blame]	6252	sd = rcu_dereference_check_sched_domain(this_rq()->sd);
Vincent Guittot	25f55d9	2013-04-23 16:59:02 +0200	[diff] [blame]	6253
				6254	if (!sd \|\| !sd->nohz_idle)
				6255	goto unlock;
				6256	sd->nohz_idle = 0;
				6257
				6258	for (; sd; sd = sd->parent)
Suresh Siddha	69e1e81	2011-12-01 17:07:33 -0800	[diff] [blame]	6259	atomic_inc(&sd->groups->sgp->nr_busy_cpus);
Vincent Guittot	25f55d9	2013-04-23 16:59:02 +0200	[diff] [blame]	6260	unlock:
Suresh Siddha	69e1e81	2011-12-01 17:07:33 -0800	[diff] [blame]	6261	rcu_read_unlock();
				6262	}
				6263
				6264	void set_cpu_sd_state_idle(void)
				6265	{
				6266	struct sched_domain *sd;
Suresh Siddha	69e1e81	2011-12-01 17:07:33 -0800	[diff] [blame]	6267
Suresh Siddha	69e1e81	2011-12-01 17:07:33 -0800	[diff] [blame]	6268	rcu_read_lock();
Nathan Zimmer	424c93f	2013-05-09 11:24:03 -0500	[diff] [blame]	6269	sd = rcu_dereference_check_sched_domain(this_rq()->sd);
Vincent Guittot	25f55d9	2013-04-23 16:59:02 +0200	[diff] [blame]	6270
				6271	if (!sd \|\| sd->nohz_idle)
				6272	goto unlock;
				6273	sd->nohz_idle = 1;
				6274
				6275	for (; sd; sd = sd->parent)
Suresh Siddha	69e1e81	2011-12-01 17:07:33 -0800	[diff] [blame]	6276	atomic_dec(&sd->groups->sgp->nr_busy_cpus);
Vincent Guittot	25f55d9	2013-04-23 16:59:02 +0200	[diff] [blame]	6277	unlock:
Suresh Siddha	69e1e81	2011-12-01 17:07:33 -0800	[diff] [blame]	6278	rcu_read_unlock();
				6279	}
				6280
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	6281	/*
Alex Shi	c1cc017	2012-09-10 15:10:58 +0800	[diff] [blame]	6282	* This routine will record that the cpu is going idle with tick stopped.
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	6283	* This info will be used in performing idle load balancing in the future.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6284	*/
Alex Shi	c1cc017	2012-09-10 15:10:58 +0800	[diff] [blame]	6285	void nohz_balance_enter_idle(int cpu)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6286	{
Suresh Siddha	7132596	2012-01-19 18:28:57 -0800	[diff] [blame]	6287	/*
				6288	* If this cpu is going down, then nothing needs to be done.
				6289	*/
				6290	if (!cpu_active(cpu))
				6291	return;
				6292
Alex Shi	c1cc017	2012-09-10 15:10:58 +0800	[diff] [blame]	6293	if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
				6294	return;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6295
Alex Shi	c1cc017	2012-09-10 15:10:58 +0800	[diff] [blame]	6296	cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
				6297	atomic_inc(&nohz.nr_cpus);
				6298	set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6299	}
Suresh Siddha	7132596	2012-01-19 18:28:57 -0800	[diff] [blame]	6300
Paul Gortmaker	0db0628	2013-06-19 14:53:51 -0400	[diff] [blame]	6301	static int sched_ilb_notifier(struct notifier_block *nfb,
Suresh Siddha	7132596	2012-01-19 18:28:57 -0800	[diff] [blame]	6302	unsigned long action, void *hcpu)
				6303	{
				6304	switch (action & ~CPU_TASKS_FROZEN) {
				6305	case CPU_DYING:
Alex Shi	c1cc017	2012-09-10 15:10:58 +0800	[diff] [blame]	6306	nohz_balance_exit_idle(smp_processor_id());
Suresh Siddha	7132596	2012-01-19 18:28:57 -0800	[diff] [blame]	6307	return NOTIFY_OK;
				6308	default:
				6309	return NOTIFY_DONE;
				6310	}
				6311	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6312	#endif
				6313
				6314	static DEFINE_SPINLOCK(balancing);
				6315
Peter Zijlstra	49c022e	2011-04-05 10:14:25 +0200	[diff] [blame]	6316	/*
				6317	* Scale the max load_balance interval with the number of CPUs in the system.
				6318	* This trades load-balance latency on larger machines for less cross talk.
				6319	*/
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	6320	void update_max_interval(void)
Peter Zijlstra	49c022e	2011-04-05 10:14:25 +0200	[diff] [blame]	6321	{
				6322	max_load_balance_interval = HZ*num_online_cpus()/10;
				6323	}
				6324
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6325	/*
				6326	* It checks each scheduling domain to see if it is due to be balanced,
				6327	* and initiates a balancing operation if so.
				6328	*
Libin	b9b0853	2013-04-01 19:14:01 +0800	[diff] [blame]	6329	* Balancing parameters are set up in init_sched_domains.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6330	*/
				6331	static void rebalance_domains(int cpu, enum cpu_idle_type idle)
				6332	{
Joonsoo Kim	23f0d20	2013-08-06 17:36:42 +0900	[diff] [blame]	6333	int continue_balancing = 1;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6334	struct rq *rq = cpu_rq(cpu);
				6335	unsigned long interval;
Peter Zijlstra	04f733b	2012-05-11 00:12:02 +0200	[diff] [blame]	6336	struct sched_domain *sd;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6337	/* Earliest time when we have to do rebalance again */
				6338	unsigned long next_balance = jiffies + 60*HZ;
				6339	int update_next_balance = 0;
Jason Low	f48627e	2013-09-13 11:26:53 -0700	[diff] [blame]	6340	int need_serialize, need_decay = 0;
				6341	u64 max_cost = 0;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6342
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	6343	update_blocked_averages(cpu);
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	6344
Peter Zijlstra	dce840a	2011-04-07 14:09:50 +0200	[diff] [blame]	6345	rcu_read_lock();
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6346	for_each_domain(cpu, sd) {
Jason Low	f48627e	2013-09-13 11:26:53 -0700	[diff] [blame]	6347	/*
				6348	* Decay the newidle max times here because this is a regular
				6349	* visit to all the domains. Decay ~1% per second.
				6350	*/
				6351	if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
				6352	sd->max_newidle_lb_cost =
				6353	(sd->max_newidle_lb_cost * 253) / 256;
				6354	sd->next_decay_max_lb_cost = jiffies + HZ;
				6355	need_decay = 1;
				6356	}
				6357	max_cost += sd->max_newidle_lb_cost;
				6358
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6359	if (!(sd->flags & SD_LOAD_BALANCE))
				6360	continue;
				6361
Jason Low	f48627e	2013-09-13 11:26:53 -0700	[diff] [blame]	6362	/*
				6363	* Stop the load balance at this level. There is another
				6364	* CPU in our sched group which is doing load balancing more
				6365	* actively.
				6366	*/
				6367	if (!continue_balancing) {
				6368	if (need_decay)
				6369	continue;
				6370	break;
				6371	}
				6372
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6373	interval = sd->balance_interval;
				6374	if (idle != CPU_IDLE)
				6375	interval *= sd->busy_factor;
				6376
				6377	/* scale ms to jiffies */
				6378	interval = msecs_to_jiffies(interval);
Peter Zijlstra	49c022e	2011-04-05 10:14:25 +0200	[diff] [blame]	6379	interval = clamp(interval, 1UL, max_load_balance_interval);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6380
				6381	need_serialize = sd->flags & SD_SERIALIZE;
				6382
				6383	if (need_serialize) {
				6384	if (!spin_trylock(&balancing))
				6385	goto out;
				6386	}
				6387
				6388	if (time_after_eq(jiffies, sd->last_balance + interval)) {
Joonsoo Kim	23f0d20	2013-08-06 17:36:42 +0900	[diff] [blame]	6389	if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6390	/*
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	6391	* The LBF_DST_PINNED logic could have changed
Joonsoo Kim	de5eb2d	2013-04-23 17:27:38 +0900	[diff] [blame]	6392	* env->dst_cpu, so we can't know our idle
				6393	* state even if we migrated tasks. Update it.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6394	*/
Joonsoo Kim	de5eb2d	2013-04-23 17:27:38 +0900	[diff] [blame]	6395	idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6396	}
				6397	sd->last_balance = jiffies;
				6398	}
				6399	if (need_serialize)
				6400	spin_unlock(&balancing);
				6401	out:
				6402	if (time_after(next_balance, sd->last_balance + interval)) {
				6403	next_balance = sd->last_balance + interval;
				6404	update_next_balance = 1;
				6405	}
Jason Low	f48627e	2013-09-13 11:26:53 -0700	[diff] [blame]	6406	}
				6407	if (need_decay) {
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6408	/*
Jason Low	f48627e	2013-09-13 11:26:53 -0700	[diff] [blame]	6409	* Ensure the rq-wide value also decays but keep it at a
				6410	* reasonable floor to avoid funnies with rq->avg_idle.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6411	*/
Jason Low	f48627e	2013-09-13 11:26:53 -0700	[diff] [blame]	6412	rq->max_idle_balance_cost =
				6413	max((u64)sysctl_sched_migration_cost, max_cost);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6414	}
Peter Zijlstra	dce840a	2011-04-07 14:09:50 +0200	[diff] [blame]	6415	rcu_read_unlock();
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6416
				6417	/*
				6418	* next_balance will be updated only when there is a need.
				6419	* When the cpu is attached to null domain for ex, it will not be
				6420	* updated.
				6421	*/
				6422	if (likely(update_next_balance))
				6423	rq->next_balance = next_balance;
				6424	}
				6425
Frederic Weisbecker	3451d02	2011-08-10 23:21:01 +0200	[diff] [blame]	6426	#ifdef CONFIG_NO_HZ_COMMON
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	6427	/*
Frederic Weisbecker	3451d02	2011-08-10 23:21:01 +0200	[diff] [blame]	6428	* In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	6429	* rebalancing for all the cpus for whom scheduler ticks are stopped.
				6430	*/
				6431	static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
				6432	{
				6433	struct rq *this_rq = cpu_rq(this_cpu);
				6434	struct rq *rq;
				6435	int balance_cpu;
				6436
Suresh Siddha	1c792db	2011-12-01 17:07:32 -0800	[diff] [blame]	6437	if (idle != CPU_IDLE \|\|
				6438	!test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
				6439	goto end;
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	6440
				6441	for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
Suresh Siddha	8a6d42d	2011-12-06 11:19:37 -0800	[diff] [blame]	6442	if (balance_cpu == this_cpu \|\| !idle_cpu(balance_cpu))
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	6443	continue;
				6444
				6445	/*
				6446	* If this cpu gets work to do, stop the load balancing
				6447	* work being done for other cpus. Next load
				6448	* balancing owner will pick it up.
				6449	*/
Suresh Siddha	1c792db	2011-12-01 17:07:32 -0800	[diff] [blame]	6450	if (need_resched())
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	6451	break;
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	6452
Vincent Guittot	5ed4f1d	2012-09-13 06:11:26 +0200	[diff] [blame]	6453	rq = cpu_rq(balance_cpu);
				6454
				6455	raw_spin_lock_irq(&rq->lock);
				6456	update_rq_clock(rq);
				6457	update_idle_cpu_load(rq);
				6458	raw_spin_unlock_irq(&rq->lock);
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	6459
				6460	rebalance_domains(balance_cpu, CPU_IDLE);
				6461
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	6462	if (time_after(this_rq->next_balance, rq->next_balance))
				6463	this_rq->next_balance = rq->next_balance;
				6464	}
				6465	nohz.next_balance = this_rq->next_balance;
Suresh Siddha	1c792db	2011-12-01 17:07:32 -0800	[diff] [blame]	6466	end:
				6467	clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	6468	}
				6469
				6470	/*
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	6471	* Current heuristic for kicking the idle load balancer in the presence
				6472	* of an idle cpu is the system.
				6473	* - This rq has more than one task.
				6474	* - At any scheduler domain level, this cpu's scheduler group has multiple
				6475	* busy cpu's exceeding the group's power.
				6476	* - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
				6477	* domain span are idle.
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	6478	*/
				6479	static inline int nohz_kick_needed(struct rq *rq, int cpu)
				6480	{
				6481	unsigned long now = jiffies;
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	6482	struct sched_domain *sd;
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	6483
Suresh Siddha	1c792db	2011-12-01 17:07:32 -0800	[diff] [blame]	6484	if (unlikely(idle_cpu(cpu)))
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	6485	return 0;
				6486
Suresh Siddha	1c792db	2011-12-01 17:07:32 -0800	[diff] [blame]	6487	/*
				6488	* We may be recently in ticked or tickless idle mode. At the first
				6489	* busy tick after returning from idle, we will update the busy stats.
				6490	*/
Suresh Siddha	69e1e81	2011-12-01 17:07:33 -0800	[diff] [blame]	6491	set_cpu_sd_state_busy();
Alex Shi	c1cc017	2012-09-10 15:10:58 +0800	[diff] [blame]	6492	nohz_balance_exit_idle(cpu);
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	6493
				6494	/*
				6495	* None are in tickless mode and hence no need for NOHZ idle load
				6496	* balancing.
				6497	*/
				6498	if (likely(!atomic_read(&nohz.nr_cpus)))
				6499	return 0;
Suresh Siddha	1c792db	2011-12-01 17:07:32 -0800	[diff] [blame]	6500
				6501	if (time_before(now, nohz.next_balance))
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	6502	return 0;
				6503
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	6504	if (rq->nr_running >= 2)
				6505	goto need_kick;
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	6506
Peter Zijlstra	067491b	2011-12-07 14:32:08 +0100	[diff] [blame]	6507	rcu_read_lock();
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	6508	for_each_domain(cpu, sd) {
				6509	struct sched_group *sg = sd->groups;
				6510	struct sched_group_power *sgp = sg->sgp;
				6511	int nr_busy = atomic_read(&sgp->nr_busy_cpus);
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	6512
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	6513	if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1)
Peter Zijlstra	067491b	2011-12-07 14:32:08 +0100	[diff] [blame]	6514	goto need_kick_unlock;
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	6515
				6516	if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight
				6517	&& (cpumask_first_and(nohz.idle_cpus_mask,
				6518	sched_domain_span(sd)) < cpu))
Peter Zijlstra	067491b	2011-12-07 14:32:08 +0100	[diff] [blame]	6519	goto need_kick_unlock;
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	6520
				6521	if (!(sd->flags & (SD_SHARE_PKG_RESOURCES \| SD_ASYM_PACKING)))
				6522	break;
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	6523	}
Peter Zijlstra	067491b	2011-12-07 14:32:08 +0100	[diff] [blame]	6524	rcu_read_unlock();
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	6525	return 0;
Peter Zijlstra	067491b	2011-12-07 14:32:08 +0100	[diff] [blame]	6526
				6527	need_kick_unlock:
				6528	rcu_read_unlock();
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	6529	need_kick:
				6530	return 1;
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	6531	}
				6532	#else
				6533	static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
				6534	#endif
				6535
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6536	/*
				6537	* run_rebalance_domains is triggered when needed from the scheduler tick.
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	6538	* Also triggered for nohz idle balancing (with nohz_balancing_kick set).
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6539	*/
				6540	static void run_rebalance_domains(struct softirq_action *h)
				6541	{
				6542	int this_cpu = smp_processor_id();
				6543	struct rq *this_rq = cpu_rq(this_cpu);
Suresh Siddha	6eb57e0	2011-10-03 15:09:01 -0700	[diff] [blame]	6544	enum cpu_idle_type idle = this_rq->idle_balance ?
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6545	CPU_IDLE : CPU_NOT_IDLE;
				6546
				6547	rebalance_domains(this_cpu, idle);
				6548
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6549	/*
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	6550	* If this cpu has a pending nohz_balance_kick, then do the
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6551	* balancing on behalf of the other idle cpus whose ticks are
				6552	* stopped.
				6553	*/
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	6554	nohz_idle_balance(this_cpu, idle);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6555	}
				6556
				6557	static inline int on_null_domain(int cpu)
				6558	{
Paul E. McKenney	90a6501	2010-02-28 08:32:18 -0800	[diff] [blame]	6559	return !rcu_dereference_sched(cpu_rq(cpu)->sd);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6560	}
				6561
				6562	/*
				6563	* Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6564	*/
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	6565	void trigger_load_balance(struct rq *rq, int cpu)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6566	{
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6567	/* Don't need to rebalance while attached to NULL domain */
				6568	if (time_after_eq(jiffies, rq->next_balance) &&
				6569	likely(!on_null_domain(cpu)))
				6570	raise_softirq(SCHED_SOFTIRQ);
Frederic Weisbecker	3451d02	2011-08-10 23:21:01 +0200	[diff] [blame]	6571	#ifdef CONFIG_NO_HZ_COMMON
Suresh Siddha	1c792db	2011-12-01 17:07:32 -0800	[diff] [blame]	6572	if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	6573	nohz_balancer_kick(cpu);
				6574	#endif
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6575	}
				6576
/* A runqueue came online: refresh the sysctl-derived values. @rq is unused. */
static void rq_online_fair(struct rq *rq)
{
	update_sysctl();
}
				6581
				6582	static void rq_offline_fair(struct rq *rq)
				6583	{
				6584	update_sysctl();
Peter Boonstoppel	a4c96ae	2012-08-09 15:34:47 -0700	[diff] [blame]	6585
				6586	/* Ensure any throttled groups are reachable by pick_next_task */
				6587	unthrottle_offline_cfs_rqs(rq);
Christian Ehrhardt	0bcdcf2	2009-11-30 12:16:46 +0100	[diff] [blame]	6588	}
				6589
Dhaval Giani	55e12e5	2008-06-24 23:39:43 +0530	[diff] [blame]	6590	#endif /* CONFIG_SMP */
Peter Williams	e1d1484	2007-10-24 18:23:51 +0200	[diff] [blame]	6591
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6592	/*
				6593	* scheduler tick hitting a task of our scheduling class:
				6594	*/
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	6595	static void task_tick_fair(struct rq rq, struct task_struct curr, int queued)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6596	{
				6597	struct cfs_rq *cfs_rq;
				6598	struct sched_entity *se = &curr->se;
				6599
				6600	for_each_sched_entity(se) {
				6601	cfs_rq = cfs_rq_of(se);
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	6602	entity_tick(cfs_rq, se, queued);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6603	}
Ben Segall	18bf280	2012-10-04 12:51:20 +0200	[diff] [blame]	6604
Dave Kleikamp	10e84b9	2013-07-31 13:53:35 -0700	[diff] [blame]	6605	if (numabalancing_enabled)
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	6606	task_tick_numa(rq, curr);
Linus Torvalds	3d59eeb	2012-12-16 14:33:25 -0800	[diff] [blame]	6607
Ben Segall	18bf280	2012-10-04 12:51:20 +0200	[diff] [blame]	6608	update_rq_runnable_avg(rq, 1);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6609	}
				6610
				6611	/*
Peter Zijlstra	cd29fe6	2009-11-27 17:32:46 +0100	[diff] [blame]	6612	* called on fork with the child task as argument from the parent's context
				6613	* - child not yet on the tasklist
				6614	* - preemption disabled
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6615	*/
Peter Zijlstra	cd29fe6	2009-11-27 17:32:46 +0100	[diff] [blame]	6616	static void task_fork_fair(struct task_struct *p)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6617	{
Daisuke Nishimura	4fc420c	2011-12-15 14:36:55 +0900	[diff] [blame]	6618	struct cfs_rq *cfs_rq;
				6619	struct sched_entity se = &p->se, curr;
Ingo Molnar	00bf7bf	2007-10-15 17:00:14 +0200	[diff] [blame]	6620	int this_cpu = smp_processor_id();
Peter Zijlstra	cd29fe6	2009-11-27 17:32:46 +0100	[diff] [blame]	6621	struct rq *rq = this_rq();
				6622	unsigned long flags;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6623
Thomas Gleixner	05fa785	2009-11-17 14:28:38 +0100	[diff] [blame]	6624	raw_spin_lock_irqsave(&rq->lock, flags);
Peter Zijlstra	cd29fe6	2009-11-27 17:32:46 +0100	[diff] [blame]	6625
Peter Zijlstra	861d034	2010-08-19 13:31:43 +0200	[diff] [blame]	6626	update_rq_clock(rq);
				6627
Daisuke Nishimura	4fc420c	2011-12-15 14:36:55 +0900	[diff] [blame]	6628	cfs_rq = task_cfs_rq(current);
				6629	curr = cfs_rq->curr;
				6630
Daisuke Nishimura	6c9a27f	2013-09-10 18:16:36 +0900	[diff] [blame]	6631	/*
				6632	* Not only the cpu but also the task_group of the parent might have
				6633	* been changed after parent->se.parent,cfs_rq were copied to
				6634	* child->se.parent,cfs_rq. So call __set_task_cpu() to make those
				6635	* of child point to valid ones.
				6636	*/
				6637	rcu_read_lock();
				6638	__set_task_cpu(p, this_cpu);
				6639	rcu_read_unlock();
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6640
Ting Yang	7109c44	2007-08-28 12:53:24 +0200	[diff] [blame]	6641	update_curr(cfs_rq);
Peter Zijlstra	cd29fe6	2009-11-27 17:32:46 +0100	[diff] [blame]	6642
Mike Galbraith	b5d9d73	2009-09-08 11:12:28 +0200	[diff] [blame]	6643	if (curr)
				6644	se->vruntime = curr->vruntime;
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	6645	place_entity(cfs_rq, se, 1);
Peter Zijlstra	4d78e7b	2007-10-15 17:00:04 +0200	[diff] [blame]	6646
Peter Zijlstra	cd29fe6	2009-11-27 17:32:46 +0100	[diff] [blame]	6647	if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
Dmitry Adamushko	87fefa3	2007-10-15 17:00:08 +0200	[diff] [blame]	6648	/*
Ingo Molnar	edcb60a	2007-10-15 17:00:08 +0200	[diff] [blame]	6649	* Upon rescheduling, sched_class::put_prev_task() will place
				6650	* 'current' within the tree based on its new key value.
				6651	*/
Peter Zijlstra	4d78e7b	2007-10-15 17:00:04 +0200	[diff] [blame]	6652	swap(curr->vruntime, se->vruntime);
Bharata B Rao	aec0a51	2008-08-28 14:42:49 +0530	[diff] [blame]	6653	resched_task(rq->curr);
Peter Zijlstra	4d78e7b	2007-10-15 17:00:04 +0200	[diff] [blame]	6654	}
				6655
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	6656	se->vruntime -= cfs_rq->min_vruntime;
				6657
Thomas Gleixner	05fa785	2009-11-17 14:28:38 +0100	[diff] [blame]	6658	raw_spin_unlock_irqrestore(&rq->lock, flags);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6659	}
				6660
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	6661	/*
				6662	* Priority of the task has changed. Check to see if we preempt
				6663	* the current task.
				6664	*/
Peter Zijlstra	da7a735	2011-01-17 17:03:27 +0100	[diff] [blame]	6665	static void
				6666	prio_changed_fair(struct rq rq, struct task_struct p, int oldprio)
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	6667	{
Peter Zijlstra	da7a735	2011-01-17 17:03:27 +0100	[diff] [blame]	6668	if (!p->se.on_rq)
				6669	return;
				6670
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	6671	/*
				6672	* Reschedule if we are currently running on this runqueue and
				6673	* our priority decreased, or if we are not currently running on
				6674	* this runqueue and our priority is higher than the current's
				6675	*/
Peter Zijlstra	da7a735	2011-01-17 17:03:27 +0100	[diff] [blame]	6676	if (rq->curr == p) {
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	6677	if (p->prio > oldprio)
				6678	resched_task(rq->curr);
				6679	} else
Peter Zijlstra	15afe09	2008-09-20 23:38:02 +0200	[diff] [blame]	6680	check_preempt_curr(rq, p, 0);
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	6681	}
				6682
Peter Zijlstra	da7a735	2011-01-17 17:03:27 +0100	[diff] [blame]	6683	static void switched_from_fair(struct rq rq, struct task_struct p)
				6684	{
				6685	struct sched_entity *se = &p->se;
				6686	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				6687
				6688	/*
				6689	* Ensure the task's vruntime is normalized, so that when its
				6690	* switched back to the fair class the enqueue_entity(.flags=0) will
				6691	* do the right thing.
				6692	*
				6693	* If it was on_rq, then the dequeue_entity(.flags=0) will already
				6694	* have normalized the vruntime, if it was !on_rq, then only when
				6695	* the task is sleeping will it still have non-normalized vruntime.
				6696	*/
				6697	if (!se->on_rq && p->state != TASK_RUNNING) {
				6698	/*
				6699	* Fix up our vruntime so that the current sleep doesn't
				6700	* cause 'unlimited' sleep bonus.
				6701	*/
				6702	place_entity(cfs_rq, se, 0);
				6703	se->vruntime -= cfs_rq->min_vruntime;
				6704	}
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	6705
Alex Shi	141965c	2013-06-26 13:05:39 +0800	[diff] [blame]	6706	#ifdef CONFIG_SMP
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	6707	/*
				6708	* Remove our load from contribution when we leave sched_fair
				6709	* and ensure we don't carry in an old decay_count if we
				6710	* switch back.
				6711	*/
Kirill Tkhai	87e3c8a	2013-07-21 04:32:07 +0400	[diff] [blame]	6712	if (se->avg.decay_count) {
				6713	__synchronize_entity_decay(se);
				6714	subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	6715	}
				6716	#endif
Peter Zijlstra	da7a735	2011-01-17 17:03:27 +0100	[diff] [blame]	6717	}
				6718
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	6719	/*
				6720	* We switched to the sched_fair class.
				6721	*/
Peter Zijlstra	da7a735	2011-01-17 17:03:27 +0100	[diff] [blame]	6722	static void switched_to_fair(struct rq rq, struct task_struct p)
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	6723	{
Peter Zijlstra	da7a735	2011-01-17 17:03:27 +0100	[diff] [blame]	6724	if (!p->se.on_rq)
				6725	return;
				6726
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	6727	/*
				6728	* We were most likely switched from sched_rt, so
				6729	* kick off the schedule if running, otherwise just see
				6730	* if we can still preempt the current task.
				6731	*/
Peter Zijlstra	da7a735	2011-01-17 17:03:27 +0100	[diff] [blame]	6732	if (rq->curr == p)
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	6733	resched_task(rq->curr);
				6734	else
Peter Zijlstra	15afe09	2008-09-20 23:38:02 +0200	[diff] [blame]	6735	check_preempt_curr(rq, p, 0);
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	6736	}
				6737
Srivatsa Vaddagiri	83b699e	2007-10-15 17:00:08 +0200	[diff] [blame]	6738	/* Account for a task changing its policy or group.
				6739	*
				6740	* This routine is mostly called to set cfs_rq->curr field when a task
				6741	* migrates between groups/classes.
				6742	*/
				6743	static void set_curr_task_fair(struct rq *rq)
				6744	{
				6745	struct sched_entity *se = &rq->curr->se;
				6746
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	6747	for_each_sched_entity(se) {
				6748	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				6749
				6750	set_next_entity(cfs_rq, se);
				6751	/* ensure bandwidth has been allocated on our new cfs_rq */
				6752	account_cfs_rq_runtime(cfs_rq, 0);
				6753	}
Srivatsa Vaddagiri	83b699e	2007-10-15 17:00:08 +0200	[diff] [blame]	6754	}
				6755
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	6756	void init_cfs_rq(struct cfs_rq *cfs_rq)
				6757	{
				6758	cfs_rq->tasks_timeline = RB_ROOT;
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	6759	cfs_rq->min_vruntime = (u64)(-(1LL << 20));
				6760	#ifndef CONFIG_64BIT
				6761	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
				6762	#endif
Alex Shi	141965c	2013-06-26 13:05:39 +0800	[diff] [blame]	6763	#ifdef CONFIG_SMP
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	6764	atomic64_set(&cfs_rq->decay_counter, 1);
Alex Shi	2509940	2013-06-20 10:18:55 +0800	[diff] [blame]	6765	atomic_long_set(&cfs_rq->removed_load, 0);
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	6766	#endif
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	6767	}
				6768
Peter Zijlstra	810b381	2008-02-29 15:21:01 -0500	[diff] [blame]	6769	#ifdef CONFIG_FAIR_GROUP_SCHED
Peter Zijlstra	b2b5ce0	2010-10-15 15:24:15 +0200	[diff] [blame]	6770	static void task_move_group_fair(struct task_struct *p, int on_rq)
Peter Zijlstra	810b381	2008-02-29 15:21:01 -0500	[diff] [blame]	6771	{
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	6772	struct cfs_rq *cfs_rq;
Peter Zijlstra	b2b5ce0	2010-10-15 15:24:15 +0200	[diff] [blame]	6773	/*
				6774	* If the task was not on the rq at the time of this cgroup movement
				6775	* it must have been asleep, sleeping tasks keep their ->vruntime
				6776	* absolute on their old rq until wakeup (needed for the fair sleeper
				6777	* bonus in place_entity()).
				6778	*
				6779	* If it was on the rq, we've just 'preempted' it, which does convert
				6780	* ->vruntime to a relative base.
				6781	*
				6782	* Make sure both cases convert their relative position when migrating
				6783	* to another cgroup's rq. This does somewhat interfere with the
				6784	* fair sleeper stuff for the first placement, but who cares.
				6785	*/
Daisuke Nishimura	7ceff01	2011-12-15 14:36:07 +0900	[diff] [blame]	6786	/*
				6787	* When !on_rq, vruntime of the task has usually NOT been normalized.
				6788	* But there are some cases where it has already been normalized:
				6789	*
				6790	* - Moving a forked child which is waiting for being woken up by
				6791	* wake_up_new_task().
Daisuke Nishimura	62af378	2011-12-15 14:37:41 +0900	[diff] [blame]	6792	* - Moving a task which has been woken up by try_to_wake_up() and
				6793	* waiting for actually being woken up by sched_ttwu_pending().
Daisuke Nishimura	7ceff01	2011-12-15 14:36:07 +0900	[diff] [blame]	6794	*
				6795	* To prevent boost or penalty in the new cfs_rq caused by delta
				6796	* min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
				6797	*/
Daisuke Nishimura	62af378	2011-12-15 14:37:41 +0900	[diff] [blame]	6798	if (!on_rq && (!p->se.sum_exec_runtime \|\| p->state == TASK_WAKING))
Daisuke Nishimura	7ceff01	2011-12-15 14:36:07 +0900	[diff] [blame]	6799	on_rq = 1;
				6800
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	6801	if (!on_rq)
Peter Zijlstra	b2b5ce0	2010-10-15 15:24:15 +0200	[diff] [blame]	6802	p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
				6803	set_task_rq(p, task_cpu(p));
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	6804	if (!on_rq) {
				6805	cfs_rq = cfs_rq_of(&p->se);
				6806	p->se.vruntime += cfs_rq->min_vruntime;
				6807	#ifdef CONFIG_SMP
				6808	/*
				6809	* migrate_task_rq_fair() will have removed our previous
				6810	* contribution, but we must synchronize for ongoing future
				6811	* decay.
				6812	*/
				6813	p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
				6814	cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib;
				6815	#endif
				6816	}
Peter Zijlstra	810b381	2008-02-29 15:21:01 -0500	[diff] [blame]	6817	}
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	6818
				6819	void free_fair_sched_group(struct task_group *tg)
				6820	{
				6821	int i;
				6822
				6823	destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
				6824
				6825	for_each_possible_cpu(i) {
				6826	if (tg->cfs_rq)
				6827	kfree(tg->cfs_rq[i]);
				6828	if (tg->se)
				6829	kfree(tg->se[i]);
				6830	}
				6831
				6832	kfree(tg->cfs_rq);
				6833	kfree(tg->se);
				6834	}
				6835
				6836	int alloc_fair_sched_group(struct task_group tg, struct task_group parent)
				6837	{
				6838	struct cfs_rq *cfs_rq;
				6839	struct sched_entity *se;
				6840	int i;
				6841
				6842	tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
				6843	if (!tg->cfs_rq)
				6844	goto err;
				6845	tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
				6846	if (!tg->se)
				6847	goto err;
				6848
				6849	tg->shares = NICE_0_LOAD;
				6850
				6851	init_cfs_bandwidth(tg_cfs_bandwidth(tg));
				6852
				6853	for_each_possible_cpu(i) {
				6854	cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
				6855	GFP_KERNEL, cpu_to_node(i));
				6856	if (!cfs_rq)
				6857	goto err;
				6858
				6859	se = kzalloc_node(sizeof(struct sched_entity),
				6860	GFP_KERNEL, cpu_to_node(i));
				6861	if (!se)
				6862	goto err_free_rq;
				6863
				6864	init_cfs_rq(cfs_rq);
				6865	init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
				6866	}
				6867
				6868	return 1;
				6869
				6870	err_free_rq:
				6871	kfree(cfs_rq);
				6872	err:
				6873	return 0;
				6874	}
				6875
				6876	void unregister_fair_sched_group(struct task_group *tg, int cpu)
				6877	{
				6878	struct rq *rq = cpu_rq(cpu);
				6879	unsigned long flags;
				6880
				6881	/*
				6882	* Only empty task groups can be destroyed; so we can speculatively
				6883	* check on_list without danger of it being re-added.
				6884	*/
				6885	if (!tg->cfs_rq[cpu]->on_list)
				6886	return;
				6887
				6888	raw_spin_lock_irqsave(&rq->lock, flags);
				6889	list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
				6890	raw_spin_unlock_irqrestore(&rq->lock, flags);
				6891	}
				6892
				6893	void init_tg_cfs_entry(struct task_group tg, struct cfs_rq cfs_rq,
				6894	struct sched_entity *se, int cpu,
				6895	struct sched_entity *parent)
				6896	{
				6897	struct rq *rq = cpu_rq(cpu);
				6898
				6899	cfs_rq->tg = tg;
				6900	cfs_rq->rq = rq;
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	6901	init_cfs_rq_runtime(cfs_rq);
				6902
				6903	tg->cfs_rq[cpu] = cfs_rq;
				6904	tg->se[cpu] = se;
				6905
				6906	/* se could be NULL for root_task_group */
				6907	if (!se)
				6908	return;
				6909
				6910	if (!parent)
				6911	se->cfs_rq = &rq->cfs;
				6912	else
				6913	se->cfs_rq = parent->my_q;
				6914
				6915	se->my_q = cfs_rq;
				6916	update_load_set(&se->load, 0);
				6917	se->parent = parent;
				6918	}
				6919
				6920	static DEFINE_MUTEX(shares_mutex);
				6921
				6922	int sched_group_set_shares(struct task_group *tg, unsigned long shares)
				6923	{
				6924	int i;
				6925	unsigned long flags;
				6926
				6927	/*
				6928	* We can't change the weight of the root cgroup.
				6929	*/
				6930	if (!tg->se[0])
				6931	return -EINVAL;
				6932
				6933	shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
				6934
				6935	mutex_lock(&shares_mutex);
				6936	if (tg->shares == shares)
				6937	goto done;
				6938
				6939	tg->shares = shares;
				6940	for_each_possible_cpu(i) {
				6941	struct rq *rq = cpu_rq(i);
				6942	struct sched_entity *se;
				6943
				6944	se = tg->se[i];
				6945	/* Propagate contribution to hierarchy */
				6946	raw_spin_lock_irqsave(&rq->lock, flags);
Frederic Weisbecker	71b1da4	2013-04-12 01:50:59 +0200	[diff] [blame]	6947
				6948	/* Possible calls to update_curr() need rq clock */
				6949	update_rq_clock(rq);
Linus Torvalds	17bc14b	2012-12-14 07:20:43 -0800	[diff] [blame]	6950	for_each_sched_entity(se)
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	6951	update_cfs_shares(group_cfs_rq(se));
				6952	raw_spin_unlock_irqrestore(&rq->lock, flags);
				6953	}
				6954
				6955	done:
				6956	mutex_unlock(&shares_mutex);
				6957	return 0;
				6958	}
				6959	#else /* CONFIG_FAIR_GROUP_SCHED */
				6960
				6961	void free_fair_sched_group(struct task_group *tg) { }
				6962
				6963	int alloc_fair_sched_group(struct task_group tg, struct task_group parent)
				6964	{
				6965	return 1;
				6966	}
				6967
				6968	void unregister_fair_sched_group(struct task_group *tg, int cpu) { }
				6969
				6970	#endif /* CONFIG_FAIR_GROUP_SCHED */
				6971
Peter Zijlstra	810b381	2008-02-29 15:21:01 -0500	[diff] [blame]	6972
H Hartley Sweeten	6d686f4	2010-01-13 20:21:52 -0700	[diff] [blame]	6973	static unsigned int get_rr_interval_fair(struct rq rq, struct task_struct task)
Peter Williams	0d721ce	2009-09-21 01:31:53 +0000	[diff] [blame]	6974	{
				6975	struct sched_entity *se = &task->se;
Peter Williams	0d721ce	2009-09-21 01:31:53 +0000	[diff] [blame]	6976	unsigned int rr_interval = 0;
				6977
				6978	/*
				6979	* Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
				6980	* idle runqueue:
				6981	*/
Peter Williams	0d721ce	2009-09-21 01:31:53 +0000	[diff] [blame]	6982	if (rq->cfs.load.weight)
Zhu Yanhai	a59f4e0	2013-01-08 12:56:52 +0800	[diff] [blame]	6983	rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
Peter Williams	0d721ce	2009-09-21 01:31:53 +0000	[diff] [blame]	6984
				6985	return rr_interval;
				6986	}
				6987
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6988	/*
				6989	* All the scheduling class methods:
				6990	*/
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	6991	const struct sched_class fair_sched_class = {
Ingo Molnar	5522d5d	2007-10-15 17:00:12 +0200	[diff] [blame]	6992	.next = &idle_sched_class,
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6993	.enqueue_task = enqueue_task_fair,
				6994	.dequeue_task = dequeue_task_fair,
				6995	.yield_task = yield_task_fair,
Mike Galbraith	d95f412	2011-02-01 09:50:51 -0500	[diff] [blame]	6996	.yield_to_task = yield_to_task_fair,
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6997
Ingo Molnar	2e09bf5	2007-10-15 17:00:05 +0200	[diff] [blame]	6998	.check_preempt_curr = check_preempt_wakeup,
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6999
				7000	.pick_next_task = pick_next_task_fair,
				7001	.put_prev_task = put_prev_task_fair,
				7002
Peter Williams	681f3e6	2007-10-24 18:23:51 +0200	[diff] [blame]	7003	#ifdef CONFIG_SMP
Li Zefan	4ce72a2	2008-10-22 15:25:26 +0800	[diff] [blame]	7004	.select_task_rq = select_task_rq_fair,
Paul Turner	0a74bef	2012-10-04 13:18:30 +0200	[diff] [blame]	7005	.migrate_task_rq = migrate_task_rq_fair,
Alex Shi	141965c	2013-06-26 13:05:39 +0800	[diff] [blame]	7006
Christian Ehrhardt	0bcdcf2	2009-11-30 12:16:46 +0100	[diff] [blame]	7007	.rq_online = rq_online_fair,
				7008	.rq_offline = rq_offline_fair,
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	7009
				7010	.task_waking = task_waking_fair,
Peter Williams	681f3e6	2007-10-24 18:23:51 +0200	[diff] [blame]	7011	#endif
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	7012
Srivatsa Vaddagiri	83b699e	2007-10-15 17:00:08 +0200	[diff] [blame]	7013	.set_curr_task = set_curr_task_fair,
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	7014	.task_tick = task_tick_fair,
Peter Zijlstra	cd29fe6	2009-11-27 17:32:46 +0100	[diff] [blame]	7015	.task_fork = task_fork_fair,
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	7016
				7017	.prio_changed = prio_changed_fair,
Peter Zijlstra	da7a735	2011-01-17 17:03:27 +0100	[diff] [blame]	7018	.switched_from = switched_from_fair,
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	7019	.switched_to = switched_to_fair,
Peter Zijlstra	810b381	2008-02-29 15:21:01 -0500	[diff] [blame]	7020
Peter Williams	0d721ce	2009-09-21 01:31:53 +0000	[diff] [blame]	7021	.get_rr_interval = get_rr_interval_fair,
				7022
Peter Zijlstra	810b381	2008-02-29 15:21:01 -0500	[diff] [blame]	7023	#ifdef CONFIG_FAIR_GROUP_SCHED
Peter Zijlstra	b2b5ce0	2010-10-15 15:24:15 +0200	[diff] [blame]	7024	.task_move_group = task_move_group_fair,
Peter Zijlstra	810b381	2008-02-29 15:21:01 -0500	[diff] [blame]	7025	#endif
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	7026	};
				7027
				7028	#ifdef CONFIG_SCHED_DEBUG
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	7029	void print_cfs_stats(struct seq_file *m, int cpu)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	7030	{
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	7031	struct cfs_rq *cfs_rq;
				7032
Peter Zijlstra	5973e5b	2008-01-25 21:08:34 +0100	[diff] [blame]	7033	rcu_read_lock();
Ingo Molnar	c3b64f1	2007-08-09 11:16:51 +0200	[diff] [blame]	7034	for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
Ingo Molnar	5cef9ec	2007-08-09 11:16:47 +0200	[diff] [blame]	7035	print_cfs_rq(m, cpu, cfs_rq);
Peter Zijlstra	5973e5b	2008-01-25 21:08:34 +0100	[diff] [blame]	7036	rcu_read_unlock();
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	7037	}
				7038	#endif
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	7039
				7040	__init void init_sched_fair_class(void)
				7041	{
				7042	#ifdef CONFIG_SMP
				7043	open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
				7044
Frederic Weisbecker	3451d02	2011-08-10 23:21:01 +0200	[diff] [blame]	7045	#ifdef CONFIG_NO_HZ_COMMON
Diwakar Tundlam	554ceca	2012-03-07 14:44:26 -0800	[diff] [blame]	7046	nohz.next_balance = jiffies;
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	7047	zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
Suresh Siddha	7132596	2012-01-19 18:28:57 -0800	[diff] [blame]	7048	cpu_notifier(sched_ilb_notifier, 0);
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	7049	#endif
				7050	#endif /* SMP */
				7051
				7052	}