/*
 * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
 *
 * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
 *
 * Interactivity improvements by Mike Galbraith
 * (C) 2007 Mike Galbraith <efault@gmx.de>
 *
 * Various enhancements by Dmitry Adamushko.
 * (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
 *
 * Group scheduling enhancements by Srivatsa Vaddagiri
 * Copyright IBM Corporation, 2007
 * Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
 *
 * Scaled math optimizations by Thomas Gleixner
 * Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
 *
 * Adaptive scheduling granularity, math enhancements by Peter Zijlstra
 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
 */

#include <linux/latencytop.h>
#include <linux/sched.h>
#include <linux/cpumask.h>
#include <linux/slab.h>
#include <linux/profile.h>
#include <linux/interrupt.h>
#include <linux/mempolicy.h>
#include <linux/migrate.h>
#include <linux/task_work.h>

#include <trace/events/sched.h>

#include "sched.h"

/*
 * Targeted preemption latency for CPU-bound tasks:
 * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
 *
 * NOTE: this latency value is not the same as the concept of
 * 'timeslice length' - timeslices in CFS are of variable length
 * and have no persistent notion like in traditional, time-slice
 * based scheduling concepts.
 *
 * (to see the precise effective timeslice length of your workload,
 *  run vmstat and monitor the context-switches (cs) field)
 */
unsigned int sysctl_sched_latency = 6000000ULL;
unsigned int normalized_sysctl_sched_latency = 6000000ULL;

/*
 * The initial- and re-scaling of tunables is configurable
 * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)))
 *
 * Options are:
 * SCHED_TUNABLESCALING_NONE - unscaled, always *1
 * SCHED_TUNABLESCALING_LOG - scaled logarithmically, *1+ilog(ncpus)
 * SCHED_TUNABLESCALING_LINEAR - scaled linearly, *ncpus
 */
enum sched_tunable_scaling sysctl_sched_tunable_scaling
	= SCHED_TUNABLESCALING_LOG;

/*
 * Minimal preemption granularity for CPU-bound tasks:
 * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
 */
unsigned int sysctl_sched_min_granularity = 750000ULL;
unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;

/*
 * sched_nr_latency is kept at sysctl_sched_latency / sysctl_sched_min_granularity
 */
static unsigned int sched_nr_latency = 8;

/*
 * After fork, child runs first. If set to 0 (default) then
 * parent will (try to) run first.
 */
unsigned int sysctl_sched_child_runs_first __read_mostly;

/*
 * SCHED_OTHER wake-up granularity.
 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
 *
 * This option delays the preemption effects of decoupled workloads
 * and reduces their over-scheduling. Synchronous workloads will still
 * have immediate wakeup/sleep latencies.
 */
unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;

const_debug unsigned int sysctl_sched_migration_cost = 500000UL;

/*
 * The exponential sliding window over which load is averaged for shares
 * distribution.
 * (default: 10msec)
 */
unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;

#ifdef CONFIG_CFS_BANDWIDTH
/*
 * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
 * each time a cfs_rq requests quota.
 *
 * Note: in the case that the slice exceeds the runtime remaining (either due
 * to consumption or the quota being specified to be smaller than the slice)
 * we will always only issue the remaining available time.
 *
 * default: 5 msec, units: microseconds
 */
unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
#endif

/*
 * Increase the granularity value when there are more CPUs,
 * because with more CPUs the 'effective latency' as visible
 * to users decreases. But the relationship is not linear,
 * so pick a second-best guess by going with the log2 of the
 * number of CPUs.
 *
 * This idea comes from the SD scheduler of Con Kolivas:
 */
static int get_update_sysctl_factor(void)
{
	unsigned int cpus = min_t(int, num_online_cpus(), 8);
	unsigned int factor;

	switch (sysctl_sched_tunable_scaling) {
	case SCHED_TUNABLESCALING_NONE:
		factor = 1;
		break;
	case SCHED_TUNABLESCALING_LINEAR:
		factor = cpus;
		break;
	case SCHED_TUNABLESCALING_LOG:
	default:
		factor = 1 + ilog2(cpus);
		break;
	}

	return factor;
}
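
/*
 * Worked example (illustrative): on a 4-CPU box with the default
 * SCHED_TUNABLESCALING_LOG, factor = 1 + ilog2(4) = 3, so update_sysctl()
 * below sets sysctl_sched_latency to 3 * 6ms = 18ms and
 * sysctl_sched_min_granularity to 3 * 0.75ms = 2.25ms. The CPU count is
 * capped at 8 above, bounding the LOG factor at 4.
 */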

static void update_sysctl(void)
{
	unsigned int factor = get_update_sysctl_factor();

#define SET_SYSCTL(name) \
	(sysctl_##name = (factor) * normalized_sysctl_##name)
	SET_SYSCTL(sched_min_granularity);
	SET_SYSCTL(sched_latency);
	SET_SYSCTL(sched_wakeup_granularity);
#undef SET_SYSCTL
}

void sched_init_granularity(void)
{
	update_sysctl();
}

#if BITS_PER_LONG == 32
# define WMULT_CONST	(~0UL)
#else
# define WMULT_CONST	(1UL << 32)
#endif

#define WMULT_SHIFT	32

/*
 * Shift right and round:
 */
#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))

/*
 * delta *= weight / lw
 */
static unsigned long
calc_delta_mine(unsigned long delta_exec, unsigned long weight,
		struct load_weight *lw)
{
	u64 tmp;

	/*
	 * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
	 * entities since MIN_SHARES = 2. Treat weight as 1 if less than
	 * 2^SCHED_LOAD_RESOLUTION.
	 */
	if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
		tmp = (u64)delta_exec * scale_load_down(weight);
	else
		tmp = (u64)delta_exec;

	if (!lw->inv_weight) {
		unsigned long w = scale_load_down(lw->weight);

		if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
			lw->inv_weight = 1;
		else if (unlikely(!w))
			lw->inv_weight = WMULT_CONST;
		else
			lw->inv_weight = WMULT_CONST / w;
	}

	/*
	 * Check whether we'd overflow the 64-bit multiplication:
	 */
	if (unlikely(tmp > WMULT_CONST))
		tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
			WMULT_SHIFT/2);
	else
		tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);

	return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
}
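
/*
 * Worked example (illustrative, assuming SCHED_LOAD_RESOLUTION == 0): for
 * delta_exec = 1000000ns, weight = 1024 (nice-0) and lw->weight = 2048,
 * inv_weight = 2^32 / 2048, so tmp = 10^6 * 1024 stays below WMULT_CONST
 * and SRR(tmp * inv_weight, 32) returns ~500000ns - half the delta, as
 * expected for an entity owning half of the queue's weight.
 */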


const struct sched_class fair_sched_class;

/**************************************************************
 * CFS operations on generic schedulable entities:
 */

#ifdef CONFIG_FAIR_GROUP_SCHED

/* cpu runqueue to which this cfs_rq is attached */
static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
{
	return cfs_rq->rq;
}

/* An entity is a task if it doesn't "own" a runqueue */
#define entity_is_task(se)	(!se->my_q)

static inline struct task_struct *task_of(struct sched_entity *se)
{
#ifdef CONFIG_SCHED_DEBUG
	WARN_ON_ONCE(!entity_is_task(se));
#endif
	return container_of(se, struct task_struct, se);
}

/* Walk up scheduling entities hierarchy */
#define for_each_sched_entity(se) \
		for (; se; se = se->parent)

static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
{
	return p->se.cfs_rq;
}

/* runqueue on which this entity is (to be) queued */
static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
{
	return se->cfs_rq;
}

/* runqueue "owned" by this group */
static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
{
	return grp->my_q;
}

static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
				       int force_update);

static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
	if (!cfs_rq->on_list) {
		/*
		 * Ensure we either appear before our parent (if already
		 * enqueued) or force our parent to appear after us when it is
		 * enqueued. The fact that we always enqueue bottom-up
		 * reduces this to two cases.
		 */
		if (cfs_rq->tg->parent &&
		    cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
			list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
				&rq_of(cfs_rq)->leaf_cfs_rq_list);
		} else {
			list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
				&rq_of(cfs_rq)->leaf_cfs_rq_list);
		}

		cfs_rq->on_list = 1;
		/* We should have no load, but we need to update last_decay. */
		update_cfs_rq_blocked_load(cfs_rq, 0);
	}
}

static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
	if (cfs_rq->on_list) {
		list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
		cfs_rq->on_list = 0;
	}
}

/* Iterate through all leaf cfs_rqs on a runqueue */
#define for_each_leaf_cfs_rq(rq, cfs_rq) \
	list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)

/* Do the two (enqueued) entities belong to the same group? */
static inline int
is_same_group(struct sched_entity *se, struct sched_entity *pse)
{
	if (se->cfs_rq == pse->cfs_rq)
		return 1;

	return 0;
}

static inline struct sched_entity *parent_entity(struct sched_entity *se)
{
	return se->parent;
}

/* return depth at which a sched entity is present in the hierarchy */
static inline int depth_se(struct sched_entity *se)
{
	int depth = 0;

	for_each_sched_entity(se)
		depth++;

	return depth;
}

static void
find_matching_se(struct sched_entity **se, struct sched_entity **pse)
{
	int se_depth, pse_depth;

	/*
	 * A preemption test can only be made between sibling entities that
	 * are in the same cfs_rq, i.e. that have a common parent. Walk up
	 * the hierarchy of both tasks until we find ancestors that are
	 * siblings under a common parent.
	 */

	/* First walk up until both entities are at same depth */
	se_depth = depth_se(*se);
	pse_depth = depth_se(*pse);

	while (se_depth > pse_depth) {
		se_depth--;
		*se = parent_entity(*se);
	}

	while (pse_depth > se_depth) {
		pse_depth--;
		*pse = parent_entity(*pse);
	}

	while (!is_same_group(*se, *pse)) {
		*se = parent_entity(*se);
		*pse = parent_entity(*pse);
	}
}

#else	/* !CONFIG_FAIR_GROUP_SCHED */

static inline struct task_struct *task_of(struct sched_entity *se)
{
	return container_of(se, struct task_struct, se);
}

static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
{
	return container_of(cfs_rq, struct rq, cfs);
}

#define entity_is_task(se)	1

#define for_each_sched_entity(se) \
		for (; se; se = NULL)

static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
{
	return &task_rq(p)->cfs;
}

static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
{
	struct task_struct *p = task_of(se);
	struct rq *rq = task_rq(p);

	return &rq->cfs;
}

/* runqueue "owned" by this group */
static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
{
	return NULL;
}

static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
}

static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
}

#define for_each_leaf_cfs_rq(rq, cfs_rq) \
		for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)

static inline int
is_same_group(struct sched_entity *se, struct sched_entity *pse)
{
	return 1;
}

static inline struct sched_entity *parent_entity(struct sched_entity *se)
{
	return NULL;
}

static inline void
find_matching_se(struct sched_entity **se, struct sched_entity **pse)
{
}

#endif	/* CONFIG_FAIR_GROUP_SCHED */

static __always_inline
void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec);

/**************************************************************
 * Scheduling class tree data structure manipulation methods:
 */

static inline u64 max_vruntime(u64 min_vruntime, u64 vruntime)
{
	s64 delta = (s64)(vruntime - min_vruntime);
	if (delta > 0)
		min_vruntime = vruntime;

	return min_vruntime;
}

static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
{
	s64 delta = (s64)(vruntime - min_vruntime);
	if (delta < 0)
		min_vruntime = vruntime;

	return min_vruntime;
}
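
/*
 * Example (illustrative): the signed cast makes these helpers robust across
 * u64 wrap. If min_vruntime sits just below 2^64, say 2^64 - 10, and
 * vruntime has wrapped around to 5, then (s64)(5 - (2^64 - 10)) evaluates
 * to +15, so max_vruntime() correctly treats the wrapped value as newer.
 */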

static inline int entity_before(struct sched_entity *a,
				struct sched_entity *b)
{
	return (s64)(a->vruntime - b->vruntime) < 0;
}

static void update_min_vruntime(struct cfs_rq *cfs_rq)
{
	u64 vruntime = cfs_rq->min_vruntime;

	if (cfs_rq->curr)
		vruntime = cfs_rq->curr->vruntime;

	if (cfs_rq->rb_leftmost) {
		struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
						   struct sched_entity,
						   run_node);

		if (!cfs_rq->curr)
			vruntime = se->vruntime;
		else
			vruntime = min_vruntime(vruntime, se->vruntime);
	}

	cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
#ifndef CONFIG_64BIT
	smp_wmb();
	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
#endif
}

/*
 * Enqueue an entity into the rb-tree:
 */
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
	struct rb_node *parent = NULL;
	struct sched_entity *entry;
	int leftmost = 1;

	/*
	 * Find the right place in the rbtree:
	 */
	while (*link) {
		parent = *link;
		entry = rb_entry(parent, struct sched_entity, run_node);
		/*
		 * We don't care about collisions. Nodes with
		 * the same key stay together.
		 */
		if (entity_before(se, entry)) {
			link = &parent->rb_left;
		} else {
			link = &parent->rb_right;
			leftmost = 0;
		}
	}

	/*
	 * Maintain a cache of leftmost tree entries (it is frequently
	 * used):
	 */
	if (leftmost)
		cfs_rq->rb_leftmost = &se->run_node;

	rb_link_node(&se->run_node, parent, link);
	rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
}

static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	if (cfs_rq->rb_leftmost == &se->run_node) {
		struct rb_node *next_node;

		next_node = rb_next(&se->run_node);
		cfs_rq->rb_leftmost = next_node;
	}

	rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
}

struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
{
	struct rb_node *left = cfs_rq->rb_leftmost;

	if (!left)
		return NULL;

	return rb_entry(left, struct sched_entity, run_node);
}

static struct sched_entity *__pick_next_entity(struct sched_entity *se)
{
	struct rb_node *next = rb_next(&se->run_node);

	if (!next)
		return NULL;

	return rb_entry(next, struct sched_entity, run_node);
}

#ifdef CONFIG_SCHED_DEBUG
struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
{
	struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);

	if (!last)
		return NULL;

	return rb_entry(last, struct sched_entity, run_node);
}

/**************************************************************
 * Scheduling class statistics methods:
 */

int sched_proc_update_handler(struct ctl_table *table, int write,
		void __user *buffer, size_t *lenp,
		loff_t *ppos)
{
	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
	int factor = get_update_sysctl_factor();

	if (ret || !write)
		return ret;

	sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
					sysctl_sched_min_granularity);

#define WRT_SYSCTL(name) \
	(normalized_sysctl_##name = sysctl_##name / (factor))
	WRT_SYSCTL(sched_min_granularity);
	WRT_SYSCTL(sched_latency);
	WRT_SYSCTL(sched_wakeup_granularity);
#undef WRT_SYSCTL

	return 0;
}
#endif

/*
 * delta /= w
 */
static inline unsigned long
calc_delta_fair(unsigned long delta, struct sched_entity *se)
{
	if (unlikely(se->load.weight != NICE_0_LOAD))
		delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load);

	return delta;
}

/*
 * The idea is to set a period in which each task runs once.
 *
 * When there are more tasks than sched_nr_latency we have to stretch
 * this period because otherwise the slices get too small.
 *
 * p = (nr <= nl) ? l : l*nr/nl
 */
static u64 __sched_period(unsigned long nr_running)
{
	u64 period = sysctl_sched_latency;
	unsigned long nr_latency = sched_nr_latency;

	if (unlikely(nr_running > nr_latency)) {
		period = sysctl_sched_min_granularity;
		period *= nr_running;
	}

	return period;
}
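
/*
 * Worked example (illustrative, using the unscaled defaults above:
 * l = 6ms, nl = 8, min_granularity = 0.75ms): with up to 8 runnable
 * tasks the period stays 6ms; with 16 tasks it stretches to
 * 0.75ms * 16 = 12ms, keeping each slice at least one granule wide.
 */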

/*
 * We calculate the wall-time slice from the period by taking a part
 * proportional to the weight.
 *
 * s = p*P[w/rw]
 */
static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);

	for_each_sched_entity(se) {
		struct load_weight *load;
		struct load_weight lw;

		cfs_rq = cfs_rq_of(se);
		load = &cfs_rq->load;

		if (unlikely(!se->on_rq)) {
			lw = cfs_rq->load;

			update_load_add(&lw, se->load.weight);
			load = &lw;
		}
		slice = calc_delta_mine(slice, se->load.weight, load);
	}
	return slice;
}
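
/*
 * Worked example (illustrative): three runnable nice-0 tasks share a 6ms
 * period, so each gets a 2ms wall-time slice. If one task instead has
 * twice the weight of the other two (w=2048 vs. 1024, rw=4096), its slice
 * is 6ms * 2048/4096 = 3ms and the lighter tasks get 1.5ms each.
 */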

/*
 * We calculate the vruntime slice of a to-be-inserted task.
 *
 * vs = s/w
 */
static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	return calc_delta_fair(sched_slice(cfs_rq, se), se);
}

/*
 * Update the current task's runtime statistics. Skip current tasks that
 * are not in our scheduling class.
 */
static inline void
__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
	      unsigned long delta_exec)
{
	unsigned long delta_exec_weighted;

	schedstat_set(curr->statistics.exec_max,
		      max((u64)delta_exec, curr->statistics.exec_max));

	curr->sum_exec_runtime += delta_exec;
	schedstat_add(cfs_rq, exec_clock, delta_exec);
	delta_exec_weighted = calc_delta_fair(delta_exec, curr);

	curr->vruntime += delta_exec_weighted;
	update_min_vruntime(cfs_rq);
}

static void update_curr(struct cfs_rq *cfs_rq)
{
	struct sched_entity *curr = cfs_rq->curr;
	u64 now = rq_of(cfs_rq)->clock_task;
	unsigned long delta_exec;

	if (unlikely(!curr))
		return;

	/*
	 * Get the amount of time the current task was running
	 * since the last time we changed load (this cannot
	 * overflow on 32 bits):
	 */
	delta_exec = (unsigned long)(now - curr->exec_start);
	if (!delta_exec)
		return;

	__update_curr(cfs_rq, curr, delta_exec);
	curr->exec_start = now;

	if (entity_is_task(curr)) {
		struct task_struct *curtask = task_of(curr);

		trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
		cpuacct_charge(curtask, delta_exec);
		account_group_exec_runtime(curtask, delta_exec);
	}

	account_cfs_rq_runtime(cfs_rq, delta_exec);
}

static inline void
update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	schedstat_set(se->statistics.wait_start, rq_of(cfs_rq)->clock);
}

/*
 * Task is being enqueued - update stats:
 */
static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	/*
	 * Are we enqueueing a waiting task? (for current tasks
	 * a dequeue/enqueue event is a NOP)
	 */
	if (se != cfs_rq->curr)
		update_stats_wait_start(cfs_rq, se);
}

static void
update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
			rq_of(cfs_rq)->clock - se->statistics.wait_start));
	schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
	schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
			rq_of(cfs_rq)->clock - se->statistics.wait_start);
#ifdef CONFIG_SCHEDSTATS
	if (entity_is_task(se)) {
		trace_sched_stat_wait(task_of(se),
			rq_of(cfs_rq)->clock - se->statistics.wait_start);
	}
#endif
	schedstat_set(se->statistics.wait_start, 0);
}

static inline void
update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	/*
	 * Mark the end of the wait period if dequeueing a
	 * waiting task:
	 */
	if (se != cfs_rq->curr)
		update_stats_wait_end(cfs_rq, se);
}

/*
 * We are picking a new current task - update its stats:
 */
static inline void
update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	/*
	 * We are starting a new run period:
	 */
	se->exec_start = rq_of(cfs_rq)->clock_task;
}

/**************************************************
 * Scheduling class queueing methods:
 */

#ifdef CONFIG_NUMA_BALANCING
/*
 * NUMA task sample period in ms
 */
unsigned int sysctl_numa_balancing_scan_period_min = 100;
unsigned int sysctl_numa_balancing_scan_period_max = 100*50;
unsigned int sysctl_numa_balancing_scan_period_reset = 100*600;

/* Portion of address space to scan in MB */
unsigned int sysctl_numa_balancing_scan_size = 256;

/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
unsigned int sysctl_numa_balancing_scan_delay = 1000;

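/*
 * Illustration of the defaults above: a newly forked task's address space
 * is left alone for scan_delay = 1000ms. After that, up to scan_size =
 * 256MB of its VMAs are marked for NUMA hinting faults roughly every
 * scan_period, which starts at scan_period_min = 100ms, backs off towards
 * scan_period_max = 5000ms while faults show pages already well placed
 * (see task_numa_fault() below), and is pulled back to the minimum every
 * scan_period_reset = 60000ms in case the workload changed phase.
 */
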
static void task_numa_placement(struct task_struct *p)
{
	int seq = ACCESS_ONCE(p->mm->numa_scan_seq);

	if (p->numa_scan_seq == seq)
		return;
	p->numa_scan_seq = seq;

	/* FIXME: Scheduling placement policy hints go here */
}

/*
 * Got a PROT_NONE fault for a page on @node.
 */
void task_numa_fault(int node, int pages, bool migrated)
{
	struct task_struct *p = current;

	if (!sched_feat_numa(NUMA))
		return;

	/* FIXME: Allocate task-specific structure for placement policy here */

	/*
	 * If pages are properly placed (did not migrate) then scan slower.
	 * This is reset periodically in case of phase changes.
	 */
	if (!migrated)
		p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max,
			p->numa_scan_period + jiffies_to_msecs(10));

	task_numa_placement(p);
}

static void reset_ptenuma_scan(struct task_struct *p)
{
	ACCESS_ONCE(p->mm->numa_scan_seq)++;
	p->mm->numa_scan_offset = 0;
}

/*
 * The expensive part of numa migration is done from task_work context.
 * Triggered from task_tick_numa().
 */
void task_numa_work(struct callback_head *work)
{
	unsigned long migrate, next_scan, now = jiffies;
	struct task_struct *p = current;
	struct mm_struct *mm = p->mm;
	struct vm_area_struct *vma;
	unsigned long start, end;
	long pages;

	WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));

	work->next = work; /* protect against double add */
	/*
	 * Who cares about NUMA placement when they're dying.
	 *
	 * NOTE: make sure not to dereference p->mm before this check,
	 * exit_task_work() happens _after_ exit_mm() so we could be called
	 * without p->mm even though we still had it when we enqueued this
	 * work.
	 */
	if (p->flags & PF_EXITING)
		return;

	/*
	 * We do not care about task placement until a task runs on a node
	 * other than the first one used by the address space. This is
	 * largely because migrations are driven by what CPU the task
	 * is running on. If it's never scheduled on another node, it'll
	 * not migrate so why bother trapping the fault.
	 */
	if (mm->first_nid == NUMA_PTE_SCAN_INIT)
		mm->first_nid = numa_node_id();
	if (mm->first_nid != NUMA_PTE_SCAN_ACTIVE) {
		/* Are we running on a new node yet? */
		if (numa_node_id() == mm->first_nid &&
		    !sched_feat_numa(NUMA_FORCE))
			return;

		mm->first_nid = NUMA_PTE_SCAN_ACTIVE;
	}

	/*
	 * Reset the scan period if enough time has gone by. Objective is that
	 * scanning will be reduced if pages are properly placed. As tasks
	 * can enter different phases this needs to be re-examined. Lacking
	 * proper tracking of reference behaviour, this blunt hammer is used.
	 */
	migrate = mm->numa_next_reset;
	if (time_after(now, migrate)) {
		p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
		next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset);
		xchg(&mm->numa_next_reset, next_scan);
	}

	/*
	 * Enforce maximal scan/migration frequency.
	 */
	migrate = mm->numa_next_scan;
	if (time_before(now, migrate))
		return;

	if (p->numa_scan_period == 0)
		p->numa_scan_period = sysctl_numa_balancing_scan_period_min;

	next_scan = now + msecs_to_jiffies(p->numa_scan_period);
	if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
		return;

	/*
	 * Do not set pte_numa if the current running node is rate-limited.
	 * This loses statistics on the fault but if we are unwilling to
	 * migrate to this node, it is less likely we can do useful work.
	 */
	if (migrate_ratelimited(numa_node_id()))
		return;

	start = mm->numa_scan_offset;
	pages = sysctl_numa_balancing_scan_size;
	pages <<= 20 - PAGE_SHIFT; /* MB in pages */
	if (!pages)
		return;

	down_read(&mm->mmap_sem);
	vma = find_vma(mm, start);
	if (!vma) {
		reset_ptenuma_scan(p);
		start = 0;
		vma = mm->mmap;
	}
	for (; vma; vma = vma->vm_next) {
		if (!vma_migratable(vma))
			continue;

		/* Skip small VMAs. They are not likely to be of relevance */
		if (vma->vm_end - vma->vm_start < HPAGE_SIZE)
			continue;

		do {
			start = max(start, vma->vm_start);
			end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
			end = min(end, vma->vm_end);
			pages -= change_prot_numa(vma, start, end);

			start = end;
			if (pages <= 0)
				goto out;
		} while (end != vma->vm_end);
	}

out:
	/*
	 * It is possible to reach the end of the VMA list but the last few
	 * VMAs are not guaranteed to be vma_migratable. If they are not, we
	 * would find the !migratable VMA on the next scan but not reset the
	 * scanner to the start so check it now.
	 */
	if (vma)
		mm->numa_scan_offset = start;
	else
		reset_ptenuma_scan(p);
	up_read(&mm->mmap_sem);
}

/*
 * Drive the periodic memory faults.
 */
void task_tick_numa(struct rq *rq, struct task_struct *curr)
{
	struct callback_head *work = &curr->numa_work;
	u64 period, now;

	/*
	 * We don't care about NUMA placement if we don't have memory.
	 */
	if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
		return;

	/*
	 * Using runtime rather than walltime has the dual advantage that
	 * we (mostly) drive the selection from busy threads and that the
	 * task needs to have done some actual work before we bother with
	 * NUMA placement.
	 */
	now = curr->se.sum_exec_runtime;
	period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;

	if (now - curr->node_stamp > period) {
		if (!curr->node_stamp)
			curr->numa_scan_period = sysctl_numa_balancing_scan_period_min;
		curr->node_stamp = now;

		if (!time_before(jiffies, curr->mm->numa_next_scan)) {
			init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
			task_work_add(curr, work, true);
		}
	}
}
#else
static void task_tick_numa(struct rq *rq, struct task_struct *curr)
{
}
#endif /* CONFIG_NUMA_BALANCING */

static void
account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	update_load_add(&cfs_rq->load, se->load.weight);
	if (!parent_entity(se))
		update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
#ifdef CONFIG_SMP
	if (entity_is_task(se))
		list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks);
#endif
	cfs_rq->nr_running++;
}

static void
account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	update_load_sub(&cfs_rq->load, se->load.weight);
	if (!parent_entity(se))
		update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
	if (entity_is_task(se))
		list_del_init(&se->group_node);
	cfs_rq->nr_running--;
}

#ifdef CONFIG_FAIR_GROUP_SCHED
# ifdef CONFIG_SMP
static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
{
	long tg_weight;

	/*
	 * Use this CPU's actual weight instead of the last load_contribution
	 * to gain a more accurate current total weight. See
	 * update_cfs_rq_load_contribution().
	 */
	tg_weight = atomic64_read(&tg->load_avg);
	tg_weight -= cfs_rq->tg_load_contrib;
	tg_weight += cfs_rq->load.weight;

	return tg_weight;
}

static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
{
	long tg_weight, load, shares;

	tg_weight = calc_tg_weight(tg, cfs_rq);
	load = cfs_rq->load.weight;

	shares = (tg->shares * load);
	if (tg_weight)
		shares /= tg_weight;

	if (shares < MIN_SHARES)
		shares = MIN_SHARES;
	if (shares > tg->shares)
		shares = tg->shares;

	return shares;
}
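
/*
 * Worked example (illustrative): a group with tg->shares = 1024 whose
 * total weight across all CPUs is 4096, of which this CPU's cfs_rq
 * carries 1024, gets 1024 * 1024 / 4096 = 256 shares here, i.e. this
 * CPU's group entity receives a quarter of the group's weight.
 */
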
# else /* CONFIG_SMP */
static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
{
	return tg->shares;
}
# endif /* CONFIG_SMP */
static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
			    unsigned long weight)
{
	if (se->on_rq) {
		/* commit outstanding execution time */
		if (cfs_rq->curr == se)
			update_curr(cfs_rq);
		account_entity_dequeue(cfs_rq, se);
	}

	update_load_set(&se->load, weight);

	if (se->on_rq)
		account_entity_enqueue(cfs_rq, se);
}

static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);

static void update_cfs_shares(struct cfs_rq *cfs_rq)
{
	struct task_group *tg;
	struct sched_entity *se;
	long shares;

	tg = cfs_rq->tg;
	se = tg->se[cpu_of(rq_of(cfs_rq))];
	if (!se || throttled_hierarchy(cfs_rq))
		return;
#ifndef CONFIG_SMP
	if (likely(se->load.weight == tg->shares))
		return;
#endif
	shares = calc_cfs_shares(cfs_rq, tg);

	reweight_entity(cfs_rq_of(se), se, shares);
}
#else /* CONFIG_FAIR_GROUP_SCHED */
static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
{
}
#endif /* CONFIG_FAIR_GROUP_SCHED */

/* Only depends on SMP, FAIR_GROUP_SCHED may be removed when useful in lb */
#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
/*
 * We choose a half-life close to 1 scheduling period.
 * Note: The tables below are dependent on this value.
 */
#define LOAD_AVG_PERIOD 32
#define LOAD_AVG_MAX 47742	/* maximum possible load avg */
#define LOAD_AVG_MAX_N 345	/* number of full periods to produce LOAD_AVG_MAX */

/* Precomputed fixed inverse multiplies for multiplication by y^n */
static const u32 runnable_avg_yN_inv[] = {
	0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
	0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
	0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
	0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
	0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
	0x85aac367, 0x82cd8698,
};

/*
 * Precomputed \Sum y^k { 1<=k<=n }. These are floor(true_value) to prevent
 * over-estimates when re-combining.
 */
static const u32 runnable_avg_yN_sum[] = {
	    0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
	 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
	17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
};

/*
 * Approximate:
 *   val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
 */
static __always_inline u64 decay_load(u64 val, u64 n)
{
	unsigned int local_n;

	if (!n)
		return val;
	else if (unlikely(n > LOAD_AVG_PERIOD * 63))
		return 0;

	/* after bounds checking we can collapse to 32-bit */
	local_n = n;

	/*
	 * As y^PERIOD = 1/2, we can combine
	 *    y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)
	 * with a look-up table which covers y^n (n<PERIOD)
	 *
	 * to achieve constant time decay_load.
	 */
	if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
		val >>= local_n / LOAD_AVG_PERIOD;
		local_n %= LOAD_AVG_PERIOD;
	}

	val *= runnable_avg_yN_inv[local_n];
	/* We don't use SRR here since we always want to round down. */
	return val >> 32;
}
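
/*
 * Worked example (illustrative): decay_load(1024, 16) multiplies by
 * runnable_avg_yN_inv[16] = 0xb504f333 (~0.7071 in fixed point), giving
 * ~724, i.e. 1024 * 2^(-16/32). decay_load(1024, 32) shifts right once
 * and returns 511 (one below the ideal 512 because we round down).
 */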

/*
 * For updates fully spanning n periods, the contribution to runnable
 * average will be: \Sum 1024*y^n
 *
 * We can compute this reasonably efficiently by combining:
 *   y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD}
 */
static u32 __compute_runnable_contrib(u64 n)
{
	u32 contrib = 0;

	if (likely(n <= LOAD_AVG_PERIOD))
		return runnable_avg_yN_sum[n];
	else if (unlikely(n >= LOAD_AVG_MAX_N))
		return LOAD_AVG_MAX;

	/* Compute \Sum y^n combining precomputed values for y^i, \Sum y^j */
	do {
		contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */
		contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD];

		n -= LOAD_AVG_PERIOD;
	} while (n > LOAD_AVG_PERIOD);

	contrib = decay_load(contrib, n);
	return contrib + runnable_avg_yN_sum[n];
}
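
/*
 * Worked example (illustrative): for n = 1 the table lookup yields
 * 1002 ~= 1024*y with y = 2^(-1/32); for n = 32 it yields 23371, one
 * full half-life of contribution. For n >= LOAD_AVG_MAX_N (345) the
 * geometric series has effectively saturated at LOAD_AVG_MAX (47742).
 */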

/*
 * We can represent the historical contribution to runnable average as the
 * coefficients of a geometric series. To do this we sub-divide our runnable
 * history into segments of approximately 1ms (1024us); label the segment that
 * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
 *
 * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
 *       p0            p1           p2
 *      (now)       (~1ms ago)  (~2ms ago)
 *
 * Let u_i denote the fraction of p_i that the entity was runnable.
 *
 * We then designate the fractions u_i as our co-efficients, yielding the
 * following representation of historical load:
 *   u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
 *
 * We choose y based on the width of a reasonable scheduling period, fixing:
 *   y^32 = 0.5
 *
 * This means that the contribution to load ~32ms ago (u_32) will be weighted
 * approximately half as much as the contribution to load within the last ms
 * (u_0).
 *
 * When a period "rolls over" and we have new u_0`, multiplying the previous
 * sum again by y is sufficient to update:
 *   load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
 *            = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
 */
static __always_inline int __update_entity_runnable_avg(u64 now,
							struct sched_avg *sa,
							int runnable)
{
	u64 delta, periods;
	u32 runnable_contrib;
	int delta_w, decayed = 0;

	delta = now - sa->last_runnable_update;
	/*
	 * This should only happen when time goes backwards, which it
	 * unfortunately does during sched clock init when we swap over to TSC.
	 */
	if ((s64)delta < 0) {
		sa->last_runnable_update = now;
		return 0;
	}

	/*
	 * Use 1024ns as the unit of measurement since it's a reasonable
	 * approximation of 1us and fast to compute.
	 */
	delta >>= 10;
	if (!delta)
		return 0;
	sa->last_runnable_update = now;

	/* delta_w is the amount already accumulated against our next period */
	delta_w = sa->runnable_avg_period % 1024;
	if (delta + delta_w >= 1024) {
		/* period roll-over */
		decayed = 1;

		/*
		 * Now that we know we're crossing a period boundary, figure
		 * out how much from delta we need to complete the current
		 * period and accrue it.
		 */
		delta_w = 1024 - delta_w;
		if (runnable)
			sa->runnable_avg_sum += delta_w;
		sa->runnable_avg_period += delta_w;

		delta -= delta_w;

		/* Figure out how many additional periods this update spans */
		periods = delta / 1024;
		delta %= 1024;

		sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum,
						  periods + 1);
		sa->runnable_avg_period = decay_load(sa->runnable_avg_period,
						     periods + 1);

		/* Efficiently calculate \Sum (1..n_period) 1024*y^i */
		runnable_contrib = __compute_runnable_contrib(periods);
		if (runnable)
			sa->runnable_avg_sum += runnable_contrib;
		sa->runnable_avg_period += runnable_contrib;
	}

	/* Remainder of delta accrued against u_0` */
	if (runnable)
		sa->runnable_avg_sum += delta;
	sa->runnable_avg_period += delta;

	return decayed;
}
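
/*
 * Worked example (illustrative): suppose 256us of the current 1024us
 * window were already accumulated (delta_w = 256) and 3000us have now
 * elapsed. We first accrue 1024 - 256 = 768us to close that window; the
 * remaining 2232us span periods = 2 full windows with 184us left over.
 * Both sums are decayed by y^3, the two full windows contribute
 * __compute_runnable_contrib(2), and the 184us remainder accrues to u_0`.
 */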

/* Synchronize an entity's decay with its parenting cfs_rq. */
static inline u64 __synchronize_entity_decay(struct sched_entity *se)
{
	struct cfs_rq *cfs_rq = cfs_rq_of(se);
	u64 decays = atomic64_read(&cfs_rq->decay_counter);

	decays -= se->avg.decay_count;
	if (!decays)
		return 0;

	se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
	se->avg.decay_count = 0;

	return decays;
}

#ifdef CONFIG_FAIR_GROUP_SCHED
static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
						 int force_update)
{
	struct task_group *tg = cfs_rq->tg;
	s64 tg_contrib;

	tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;
	tg_contrib -= cfs_rq->tg_load_contrib;

	if (force_update || abs64(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
		atomic64_add(tg_contrib, &tg->load_avg);
		cfs_rq->tg_load_contrib += tg_contrib;
	}
}

/*
 * Aggregate cfs_rq runnable averages into an equivalent task_group
 * representation for computing load contributions.
 */
static inline void __update_tg_runnable_avg(struct sched_avg *sa,
					    struct cfs_rq *cfs_rq)
{
	struct task_group *tg = cfs_rq->tg;
	long contrib;

	/* The fraction of a cpu used by this cfs_rq */
	contrib = div_u64(sa->runnable_avg_sum << NICE_0_SHIFT,
			  sa->runnable_avg_period + 1);
	contrib -= cfs_rq->tg_runnable_contrib;

	if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) {
		atomic_add(contrib, &tg->runnable_avg);
		cfs_rq->tg_runnable_contrib += contrib;
	}
}

static inline void __update_group_entity_contrib(struct sched_entity *se)
{
	struct cfs_rq *cfs_rq = group_cfs_rq(se);
	struct task_group *tg = cfs_rq->tg;
	int runnable_avg;

	u64 contrib;

	contrib = cfs_rq->tg_load_contrib * tg->shares;
	se->avg.load_avg_contrib = div64_u64(contrib,
				     atomic64_read(&tg->load_avg) + 1);

	/*
	 * For group entities we need to compute a correction term in the case
	 * that they are consuming <1 cpu so that we would contribute the same
	 * load as a task of equal weight.
	 *
	 * Explicitly co-ordinating this measurement would be expensive, but
	 * fortunately the sum of each cpus contribution forms a usable
	 * lower-bound on the true value.
	 *
	 * Consider the aggregate of 2 contributions. Either they are disjoint
	 * (and the sum represents the true value) or they overlap and we are
	 * understating by the aggregate of their overlap.
	 *
	 * Extending this to N cpus, for a given overlap, the maximum amount we
	 * understate is then n_i(n_i+1)/2 * w_i where n_i is the number of
	 * cpus that overlap for this interval and w_i is the interval width.
	 *
	 * On a small machine; the first term is well-bounded which bounds the
	 * total error since w_i is a subset of the period. Whereas on a
	 * larger machine, while this first term can be larger, if w_i is of
	 * consequential size it is guaranteed to see n_i*w_i quickly converge
	 * to our upper bound of 1-cpu.
	 */
	runnable_avg = atomic_read(&tg->runnable_avg);
	if (runnable_avg < NICE_0_LOAD) {
		se->avg.load_avg_contrib *= runnable_avg;
		se->avg.load_avg_contrib >>= NICE_0_SHIFT;
	}
}
Paul Turnerc566e8e2012-10-04 13:18:30 +02001391#else
1392static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
1393 int force_update) {}
Paul Turnerbb17f652012-10-04 13:18:31 +02001394static inline void __update_tg_runnable_avg(struct sched_avg *sa,
1395 struct cfs_rq *cfs_rq) {}
Paul Turner8165e142012-10-04 13:18:31 +02001396static inline void __update_group_entity_contrib(struct sched_entity *se) {}
Paul Turnerc566e8e2012-10-04 13:18:30 +02001397#endif
1398
Paul Turner8165e142012-10-04 13:18:31 +02001399static inline void __update_task_entity_contrib(struct sched_entity *se)
1400{
1401 u32 contrib;
1402
1403 /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
1404 contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
1405 contrib /= (se->avg.runnable_avg_period + 1);
1406 se->avg.load_avg_contrib = scale_load(contrib);
1407}
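/*
 * Worked example (assumed numbers): a nice-0 task (weight 1024) with
 * runnable_avg_sum == runnable_avg_period contributes its full weight,
 * while one runnable a quarter of the time contributes ~256.
 */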
1408
Paul Turner2dac7542012-10-04 13:18:30 +02001409/* Compute the current contribution to load_avg by se, return any delta */
1410static long __update_entity_load_avg_contrib(struct sched_entity *se)
1411{
1412 long old_contrib = se->avg.load_avg_contrib;
1413
Paul Turner8165e142012-10-04 13:18:31 +02001414 if (entity_is_task(se)) {
1415 __update_task_entity_contrib(se);
1416 } else {
Paul Turnerbb17f652012-10-04 13:18:31 +02001417 __update_tg_runnable_avg(&se->avg, group_cfs_rq(se));
Paul Turner8165e142012-10-04 13:18:31 +02001418 __update_group_entity_contrib(se);
1419 }
Paul Turner2dac7542012-10-04 13:18:30 +02001420
1421 return se->avg.load_avg_contrib - old_contrib;
1422}
1423
Paul Turner9ee474f2012-10-04 13:18:30 +02001424static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq,
1425 long load_contrib)
1426{
1427 if (likely(load_contrib < cfs_rq->blocked_load_avg))
1428 cfs_rq->blocked_load_avg -= load_contrib;
1429 else
1430 cfs_rq->blocked_load_avg = 0;
1431}
1432
Paul Turnerf1b17282012-10-04 13:18:31 +02001433static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
1434
Paul Turner9d85f212012-10-04 13:18:29 +02001435/* Update a sched_entity's runnable average */
Paul Turner9ee474f2012-10-04 13:18:30 +02001436static inline void update_entity_load_avg(struct sched_entity *se,
1437 int update_cfs_rq)
Paul Turner9d85f212012-10-04 13:18:29 +02001438{
Paul Turner2dac7542012-10-04 13:18:30 +02001439 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1440 long contrib_delta;
Paul Turnerf1b17282012-10-04 13:18:31 +02001441 u64 now;
Paul Turner2dac7542012-10-04 13:18:30 +02001442
Paul Turnerf1b17282012-10-04 13:18:31 +02001443 /*
 1444	 * For a group entity we need to use its owned cfs_rq_clock_task() in
 1445	 * case it is the parent of a throttled hierarchy.
1446 */
1447 if (entity_is_task(se))
1448 now = cfs_rq_clock_task(cfs_rq);
1449 else
1450 now = cfs_rq_clock_task(group_cfs_rq(se));
1451
1452 if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq))
Paul Turner2dac7542012-10-04 13:18:30 +02001453 return;
1454
1455 contrib_delta = __update_entity_load_avg_contrib(se);
Paul Turner9ee474f2012-10-04 13:18:30 +02001456
1457 if (!update_cfs_rq)
1458 return;
1459
Paul Turner2dac7542012-10-04 13:18:30 +02001460 if (se->on_rq)
1461 cfs_rq->runnable_load_avg += contrib_delta;
Paul Turner9ee474f2012-10-04 13:18:30 +02001462 else
1463 subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
1464}
1465
1466/*
1467 * Decay the load contributed by all blocked children and account this so that
 1468	 * their contribution may be appropriately discounted when they wake up.
1469 */
Paul Turneraff3e492012-10-04 13:18:30 +02001470static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
Paul Turner9ee474f2012-10-04 13:18:30 +02001471{
Paul Turnerf1b17282012-10-04 13:18:31 +02001472 u64 now = cfs_rq_clock_task(cfs_rq) >> 20;
Paul Turner9ee474f2012-10-04 13:18:30 +02001473 u64 decays;
1474
1475 decays = now - cfs_rq->last_decay;
Paul Turneraff3e492012-10-04 13:18:30 +02001476 if (!decays && !force_update)
Paul Turner9ee474f2012-10-04 13:18:30 +02001477 return;
1478
Paul Turneraff3e492012-10-04 13:18:30 +02001479 if (atomic64_read(&cfs_rq->removed_load)) {
1480 u64 removed_load = atomic64_xchg(&cfs_rq->removed_load, 0);
1481 subtract_blocked_load_contrib(cfs_rq, removed_load);
1482 }
Paul Turner9ee474f2012-10-04 13:18:30 +02001483
Paul Turneraff3e492012-10-04 13:18:30 +02001484 if (decays) {
1485 cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg,
1486 decays);
1487 atomic64_add(decays, &cfs_rq->decay_counter);
1488 cfs_rq->last_decay = now;
1489 }
Paul Turnerc566e8e2012-10-04 13:18:30 +02001490
1491 __update_cfs_rq_tg_load_contrib(cfs_rq, force_update);
Paul Turner9d85f212012-10-04 13:18:29 +02001492}
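/*
 * Sketch of the decay step, using the constants from the per-entity
 * tracking code: 'now' is in ~1ms units (clock >> 20), so 3ms since
 * last_decay gives decays == 3 and blocked_load_avg is scaled by y^3,
 * where y is chosen such that y^32 == 1/2.
 */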
Ben Segall18bf2802012-10-04 12:51:20 +02001493
1494static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
1495{
1496 __update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable);
Paul Turnerbb17f652012-10-04 13:18:31 +02001497 __update_tg_runnable_avg(&rq->avg, &rq->cfs);
Ben Segall18bf2802012-10-04 12:51:20 +02001498}
Paul Turner2dac7542012-10-04 13:18:30 +02001499
1500/* Add the load generated by se into cfs_rq's child load-average */
1501static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
Paul Turner9ee474f2012-10-04 13:18:30 +02001502 struct sched_entity *se,
1503 int wakeup)
Paul Turner2dac7542012-10-04 13:18:30 +02001504{
Paul Turneraff3e492012-10-04 13:18:30 +02001505 /*
 1506	 * We track migrations using entity decay_count <= 0; on a wake-up
1507 * migration we use a negative decay count to track the remote decays
1508 * accumulated while sleeping.
1509 */
1510 if (unlikely(se->avg.decay_count <= 0)) {
Paul Turner9ee474f2012-10-04 13:18:30 +02001511 se->avg.last_runnable_update = rq_of(cfs_rq)->clock_task;
Paul Turneraff3e492012-10-04 13:18:30 +02001512 if (se->avg.decay_count) {
1513 /*
1514 * In a wake-up migration we have to approximate the
1515 * time sleeping. This is because we can't synchronize
1516 * clock_task between the two cpus, and it is not
1517 * guaranteed to be read-safe. Instead, we can
1518 * approximate this using our carried decays, which are
1519 * explicitly atomically readable.
1520 */
1521 se->avg.last_runnable_update -= (-se->avg.decay_count)
1522 << 20;
1523 update_entity_load_avg(se, 0);
1524 /* Indicate that we're now synchronized and on-rq */
1525 se->avg.decay_count = 0;
1526 }
Paul Turner9ee474f2012-10-04 13:18:30 +02001527 wakeup = 0;
1528 } else {
1529 __synchronize_entity_decay(se);
1530 }
1531
Paul Turneraff3e492012-10-04 13:18:30 +02001532 /* migrated tasks did not contribute to our blocked load */
1533 if (wakeup) {
Paul Turner9ee474f2012-10-04 13:18:30 +02001534 subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
Paul Turneraff3e492012-10-04 13:18:30 +02001535 update_entity_load_avg(se, 0);
1536 }
Paul Turner9ee474f2012-10-04 13:18:30 +02001537
Paul Turner2dac7542012-10-04 13:18:30 +02001538 cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
Paul Turneraff3e492012-10-04 13:18:30 +02001539 /* we force update consideration on load-balancer moves */
1540 update_cfs_rq_blocked_load(cfs_rq, !wakeup);
Paul Turner2dac7542012-10-04 13:18:30 +02001541}
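/*
 * Example of the wake-up migration path (assumed numbers): an entity
 * arriving with decay_count == -3 slept through ~3 decay periods on
 * the remote cpu, so last_runnable_update is rewound by 3 << 20 ns and
 * update_entity_load_avg() then charges that window as decay.
 */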
1542
Paul Turner9ee474f2012-10-04 13:18:30 +02001543/*
 1544	 * Remove se's load from this cfs_rq child load-average; if the entity is
1545 * transitioning to a blocked state we track its projected decay using
1546 * blocked_load_avg.
1547 */
Paul Turner2dac7542012-10-04 13:18:30 +02001548static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
Paul Turner9ee474f2012-10-04 13:18:30 +02001549 struct sched_entity *se,
1550 int sleep)
Paul Turner2dac7542012-10-04 13:18:30 +02001551{
Paul Turner9ee474f2012-10-04 13:18:30 +02001552 update_entity_load_avg(se, 1);
Paul Turneraff3e492012-10-04 13:18:30 +02001553 /* we force update consideration on load-balancer moves */
1554 update_cfs_rq_blocked_load(cfs_rq, !sleep);
Paul Turner9ee474f2012-10-04 13:18:30 +02001555
Paul Turner2dac7542012-10-04 13:18:30 +02001556 cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
Paul Turner9ee474f2012-10-04 13:18:30 +02001557 if (sleep) {
1558 cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
1559 se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
1560 } /* migrations, e.g. sleep=0 leave decay_count == 0 */
Paul Turner2dac7542012-10-04 13:18:30 +02001561}
Paul Turner9d85f212012-10-04 13:18:29 +02001562#else
Paul Turner9ee474f2012-10-04 13:18:30 +02001563static inline void update_entity_load_avg(struct sched_entity *se,
1564 int update_cfs_rq) {}
Ben Segall18bf2802012-10-04 12:51:20 +02001565static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
Paul Turner2dac7542012-10-04 13:18:30 +02001566static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
Paul Turner9ee474f2012-10-04 13:18:30 +02001567 struct sched_entity *se,
1568 int wakeup) {}
Paul Turner2dac7542012-10-04 13:18:30 +02001569static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
Paul Turner9ee474f2012-10-04 13:18:30 +02001570 struct sched_entity *se,
1571 int sleep) {}
Paul Turneraff3e492012-10-04 13:18:30 +02001572static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
1573 int force_update) {}
Paul Turner9d85f212012-10-04 13:18:29 +02001574#endif
1575
Ingo Molnar2396af62007-08-09 11:16:48 +02001576static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001577{
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001578#ifdef CONFIG_SCHEDSTATS
Peter Zijlstrae4143142009-07-23 20:13:26 +02001579 struct task_struct *tsk = NULL;
1580
1581 if (entity_is_task(se))
1582 tsk = task_of(se);
1583
Lucas De Marchi41acab82010-03-10 23:37:45 -03001584 if (se->statistics.sleep_start) {
1585 u64 delta = rq_of(cfs_rq)->clock - se->statistics.sleep_start;
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001586
1587 if ((s64)delta < 0)
1588 delta = 0;
1589
Lucas De Marchi41acab82010-03-10 23:37:45 -03001590 if (unlikely(delta > se->statistics.sleep_max))
1591 se->statistics.sleep_max = delta;
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001592
Peter Zijlstra8c79a042012-01-30 14:51:37 +01001593 se->statistics.sleep_start = 0;
Lucas De Marchi41acab82010-03-10 23:37:45 -03001594 se->statistics.sum_sleep_runtime += delta;
Arjan van de Ven97455122008-01-25 21:08:34 +01001595
Peter Zijlstra768d0c22009-07-23 20:13:26 +02001596 if (tsk) {
Peter Zijlstrae4143142009-07-23 20:13:26 +02001597 account_scheduler_latency(tsk, delta >> 10, 1);
Peter Zijlstra768d0c22009-07-23 20:13:26 +02001598 trace_sched_stat_sleep(tsk, delta);
1599 }
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001600 }
Lucas De Marchi41acab82010-03-10 23:37:45 -03001601 if (se->statistics.block_start) {
1602 u64 delta = rq_of(cfs_rq)->clock - se->statistics.block_start;
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001603
1604 if ((s64)delta < 0)
1605 delta = 0;
1606
Lucas De Marchi41acab82010-03-10 23:37:45 -03001607 if (unlikely(delta > se->statistics.block_max))
1608 se->statistics.block_max = delta;
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001609
Peter Zijlstra8c79a042012-01-30 14:51:37 +01001610 se->statistics.block_start = 0;
Lucas De Marchi41acab82010-03-10 23:37:45 -03001611 se->statistics.sum_sleep_runtime += delta;
Ingo Molnar30084fb2007-10-02 14:13:08 +02001612
Peter Zijlstrae4143142009-07-23 20:13:26 +02001613 if (tsk) {
Arjan van de Ven8f0dfc32009-07-20 11:26:58 -07001614 if (tsk->in_iowait) {
Lucas De Marchi41acab82010-03-10 23:37:45 -03001615 se->statistics.iowait_sum += delta;
1616 se->statistics.iowait_count++;
Peter Zijlstra768d0c22009-07-23 20:13:26 +02001617 trace_sched_stat_iowait(tsk, delta);
Arjan van de Ven8f0dfc32009-07-20 11:26:58 -07001618 }
1619
Andrew Vaginb781a602011-11-28 12:03:35 +03001620 trace_sched_stat_blocked(tsk, delta);
1621
Peter Zijlstrae4143142009-07-23 20:13:26 +02001622 /*
1623 * Blocking time is in units of nanosecs, so shift by
1624 * 20 to get a milliseconds-range estimation of the
1625 * amount of time that the task spent sleeping:
1626 */
1627 if (unlikely(prof_on == SLEEP_PROFILING)) {
1628 profile_hits(SLEEP_PROFILING,
1629 (void *)get_wchan(tsk),
1630 delta >> 20);
1631 }
1632 account_scheduler_latency(tsk, delta >> 10, 0);
Ingo Molnar30084fb2007-10-02 14:13:08 +02001633 }
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001634 }
1635#endif
1636}
1637
Peter Zijlstraddc97292007-10-15 17:00:10 +02001638static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
1639{
1640#ifdef CONFIG_SCHED_DEBUG
1641 s64 d = se->vruntime - cfs_rq->min_vruntime;
1642
1643 if (d < 0)
1644 d = -d;
1645
1646 if (d > 3*sysctl_sched_latency)
1647 schedstat_inc(cfs_rq, nr_spread_over);
1648#endif
1649}
1650
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001651static void
Peter Zijlstraaeb73b02007-10-15 17:00:05 +02001652place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
1653{
Peter Zijlstra1af5f732008-10-24 11:06:13 +02001654 u64 vruntime = cfs_rq->min_vruntime;
Peter Zijlstra94dfb5e2007-10-15 17:00:05 +02001655
Peter Zijlstra2cb86002007-11-09 22:39:37 +01001656 /*
1657 * The 'current' period is already promised to the current tasks,
1658 * however the extra weight of the new task will slow them down a
1659 * little, place the new task so that it fits in the slot that
1660 * stays open at the end.
1661 */
Peter Zijlstra94dfb5e2007-10-15 17:00:05 +02001662 if (initial && sched_feat(START_DEBIT))
Peter Zijlstraf9c0b092008-10-17 19:27:04 +02001663 vruntime += sched_vslice(cfs_rq, se);
Peter Zijlstraaeb73b02007-10-15 17:00:05 +02001664
Mike Galbraitha2e7a7e2009-09-18 09:19:25 +02001665 /* sleeps up to a single latency don't count. */
Mike Galbraith5ca98802010-03-11 17:17:17 +01001666 if (!initial) {
Mike Galbraitha2e7a7e2009-09-18 09:19:25 +02001667 unsigned long thresh = sysctl_sched_latency;
Peter Zijlstraa7be37a2008-06-27 13:41:11 +02001668
Mike Galbraitha2e7a7e2009-09-18 09:19:25 +02001669 /*
Mike Galbraitha2e7a7e2009-09-18 09:19:25 +02001670 * Halve their sleep time's effect, to allow
1671 * for a gentler effect of sleepers:
1672 */
1673 if (sched_feat(GENTLE_FAIR_SLEEPERS))
1674 thresh >>= 1;
Ingo Molnar51e03042009-09-16 08:54:45 +02001675
Mike Galbraitha2e7a7e2009-09-18 09:19:25 +02001676 vruntime -= thresh;
Peter Zijlstraaeb73b02007-10-15 17:00:05 +02001677 }
1678
Mike Galbraithb5d9d732009-09-08 11:12:28 +02001679 /* ensure we never gain time by being placed backwards. */
1680 vruntime = max_vruntime(se->vruntime, vruntime);
1681
Peter Zijlstra67e9fb22007-10-15 17:00:10 +02001682 se->vruntime = vruntime;
Peter Zijlstraaeb73b02007-10-15 17:00:05 +02001683}
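/*
 * Example, assuming the default 6ms sched_latency with
 * GENTLE_FAIR_SLEEPERS set: a waking task is placed at
 * min_vruntime - 3ms, but never behind its own old vruntime, so a
 * sleeper receives at most half a latency period of credit.
 */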
1684
Paul Turnerd3d9dc32011-07-21 09:43:39 -07001685static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
1686
Peter Zijlstraaeb73b02007-10-15 17:00:05 +02001687static void
Peter Zijlstra88ec22d2009-12-16 18:04:41 +01001688enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001689{
1690 /*
Peter Zijlstra88ec22d2009-12-16 18:04:41 +01001691 * Update the normalized vruntime before updating min_vruntime
 1692	 * through calling update_curr().
1693 */
Peter Zijlstra371fd7e2010-03-24 16:38:48 +01001694 if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
Peter Zijlstra88ec22d2009-12-16 18:04:41 +01001695 se->vruntime += cfs_rq->min_vruntime;
1696
1697 /*
Dmitry Adamushkoa2a2d682007-10-15 17:00:13 +02001698 * Update run-time statistics of the 'current'.
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001699 */
Ingo Molnarb7cc0892007-08-09 11:16:47 +02001700 update_curr(cfs_rq);
Paul Turnerf269ae02012-10-04 13:18:31 +02001701 enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
Linus Torvalds17bc14b2012-12-14 07:20:43 -08001702 account_entity_enqueue(cfs_rq, se);
1703 update_cfs_shares(cfs_rq);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001704
Peter Zijlstra88ec22d2009-12-16 18:04:41 +01001705 if (flags & ENQUEUE_WAKEUP) {
Peter Zijlstraaeb73b02007-10-15 17:00:05 +02001706 place_entity(cfs_rq, se, 0);
Ingo Molnar2396af62007-08-09 11:16:48 +02001707 enqueue_sleeper(cfs_rq, se);
Ingo Molnare9acbff2007-10-15 17:00:04 +02001708 }
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001709
Ingo Molnard2417e52007-08-09 11:16:47 +02001710 update_stats_enqueue(cfs_rq, se);
Peter Zijlstraddc97292007-10-15 17:00:10 +02001711 check_spread(cfs_rq, se);
Srivatsa Vaddagiri83b699e2007-10-15 17:00:08 +02001712 if (se != cfs_rq->curr)
1713 __enqueue_entity(cfs_rq, se);
Peter Zijlstra2069dd72010-11-15 15:47:00 -08001714 se->on_rq = 1;
Peter Zijlstra3d4b47b2010-11-15 15:47:01 -08001715
Paul Turnerd3d9dc32011-07-21 09:43:39 -07001716 if (cfs_rq->nr_running == 1) {
Peter Zijlstra3d4b47b2010-11-15 15:47:01 -08001717 list_add_leaf_cfs_rq(cfs_rq);
Paul Turnerd3d9dc32011-07-21 09:43:39 -07001718 check_enqueue_throttle(cfs_rq);
1719 }
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001720}
1721
Rik van Riel2c13c9192011-02-01 09:48:37 -05001722static void __clear_buddies_last(struct sched_entity *se)
Peter Zijlstra2002c692008-11-11 11:52:33 +01001723{
Rik van Riel2c13c9192011-02-01 09:48:37 -05001724 for_each_sched_entity(se) {
1725 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1726 if (cfs_rq->last == se)
1727 cfs_rq->last = NULL;
1728 else
1729 break;
1730 }
1731}
Peter Zijlstra2002c692008-11-11 11:52:33 +01001732
Rik van Riel2c13c9192011-02-01 09:48:37 -05001733static void __clear_buddies_next(struct sched_entity *se)
1734{
1735 for_each_sched_entity(se) {
1736 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1737 if (cfs_rq->next == se)
1738 cfs_rq->next = NULL;
1739 else
1740 break;
1741 }
Peter Zijlstra2002c692008-11-11 11:52:33 +01001742}
1743
Rik van Rielac53db52011-02-01 09:51:03 -05001744static void __clear_buddies_skip(struct sched_entity *se)
1745{
1746 for_each_sched_entity(se) {
1747 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1748 if (cfs_rq->skip == se)
1749 cfs_rq->skip = NULL;
1750 else
1751 break;
1752 }
1753}
1754
Peter Zijlstraa571bbe2009-01-28 14:51:40 +01001755static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
1756{
Rik van Riel2c13c9192011-02-01 09:48:37 -05001757 if (cfs_rq->last == se)
1758 __clear_buddies_last(se);
1759
1760 if (cfs_rq->next == se)
1761 __clear_buddies_next(se);
Rik van Rielac53db52011-02-01 09:51:03 -05001762
1763 if (cfs_rq->skip == se)
1764 __clear_buddies_skip(se);
Peter Zijlstraa571bbe2009-01-28 14:51:40 +01001765}
1766
Peter Zijlstra6c16a6d2012-03-21 13:07:16 -07001767static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
Paul Turnerd8b49862011-07-21 09:43:41 -07001768
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001769static void
Peter Zijlstra371fd7e2010-03-24 16:38:48 +01001770dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001771{
Dmitry Adamushkoa2a2d682007-10-15 17:00:13 +02001772 /*
1773 * Update run-time statistics of the 'current'.
1774 */
1775 update_curr(cfs_rq);
Linus Torvalds17bc14b2012-12-14 07:20:43 -08001776 dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP);
Dmitry Adamushkoa2a2d682007-10-15 17:00:13 +02001777
Ingo Molnar19b6a2e2007-08-09 11:16:48 +02001778 update_stats_dequeue(cfs_rq, se);
Peter Zijlstra371fd7e2010-03-24 16:38:48 +01001779 if (flags & DEQUEUE_SLEEP) {
Peter Zijlstra67e9fb22007-10-15 17:00:10 +02001780#ifdef CONFIG_SCHEDSTATS
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001781 if (entity_is_task(se)) {
1782 struct task_struct *tsk = task_of(se);
1783
1784 if (tsk->state & TASK_INTERRUPTIBLE)
Lucas De Marchi41acab82010-03-10 23:37:45 -03001785 se->statistics.sleep_start = rq_of(cfs_rq)->clock;
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001786 if (tsk->state & TASK_UNINTERRUPTIBLE)
Lucas De Marchi41acab82010-03-10 23:37:45 -03001787 se->statistics.block_start = rq_of(cfs_rq)->clock;
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001788 }
Dmitry Adamushkodb36cc72007-10-15 17:00:06 +02001789#endif
Peter Zijlstra67e9fb22007-10-15 17:00:10 +02001790 }
1791
Peter Zijlstra2002c692008-11-11 11:52:33 +01001792 clear_buddies(cfs_rq, se);
Peter Zijlstra47932412008-11-04 21:25:09 +01001793
Srivatsa Vaddagiri83b699e2007-10-15 17:00:08 +02001794 if (se != cfs_rq->curr)
Dmitry Adamushko30cfdcf2007-10-15 17:00:07 +02001795 __dequeue_entity(cfs_rq, se);
Linus Torvalds17bc14b2012-12-14 07:20:43 -08001796 se->on_rq = 0;
Dmitry Adamushko30cfdcf2007-10-15 17:00:07 +02001797 account_entity_dequeue(cfs_rq, se);
Peter Zijlstra88ec22d2009-12-16 18:04:41 +01001798
1799 /*
1800 * Normalize the entity after updating the min_vruntime because the
1801 * update can refer to the ->curr item and we need to reflect this
1802 * movement in our normalized position.
1803 */
Peter Zijlstra371fd7e2010-03-24 16:38:48 +01001804 if (!(flags & DEQUEUE_SLEEP))
Peter Zijlstra88ec22d2009-12-16 18:04:41 +01001805 se->vruntime -= cfs_rq->min_vruntime;
Peter Zijlstra1e876232011-05-17 16:21:10 -07001806
Paul Turnerd8b49862011-07-21 09:43:41 -07001807 /* return excess runtime on last dequeue */
1808 return_cfs_rq_runtime(cfs_rq);
1809
Peter Zijlstra1e876232011-05-17 16:21:10 -07001810 update_min_vruntime(cfs_rq);
Linus Torvalds17bc14b2012-12-14 07:20:43 -08001811 update_cfs_shares(cfs_rq);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001812}
1813
1814/*
1815 * Preempt the current task with a newly woken task if needed:
1816 */
Peter Zijlstra7c92e542007-09-05 14:32:49 +02001817static void
Ingo Molnar2e09bf52007-10-15 17:00:05 +02001818check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001819{
Peter Zijlstra11697832007-09-05 14:32:49 +02001820 unsigned long ideal_runtime, delta_exec;
Wang Xingchaof4cfb332011-09-16 13:35:52 -04001821 struct sched_entity *se;
1822 s64 delta;
Peter Zijlstra11697832007-09-05 14:32:49 +02001823
Peter Zijlstra6d0f0eb2007-10-15 17:00:05 +02001824 ideal_runtime = sched_slice(cfs_rq, curr);
Peter Zijlstra11697832007-09-05 14:32:49 +02001825 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
Mike Galbraitha9f3e2b2009-01-28 14:51:39 +01001826 if (delta_exec > ideal_runtime) {
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001827 resched_task(rq_of(cfs_rq)->curr);
Mike Galbraitha9f3e2b2009-01-28 14:51:39 +01001828 /*
1829 * The current task ran long enough, ensure it doesn't get
1830 * re-elected due to buddy favours.
1831 */
1832 clear_buddies(cfs_rq, curr);
Mike Galbraithf685cea2009-10-23 23:09:22 +02001833 return;
1834 }
1835
1836 /*
1837 * Ensure that a task that missed wakeup preemption by a
1838 * narrow margin doesn't have to wait for a full slice.
1839 * This also mitigates buddy induced latencies under load.
1840 */
Mike Galbraithf685cea2009-10-23 23:09:22 +02001841 if (delta_exec < sysctl_sched_min_granularity)
1842 return;
1843
Wang Xingchaof4cfb332011-09-16 13:35:52 -04001844 se = __pick_first_entity(cfs_rq);
1845 delta = curr->vruntime - se->vruntime;
Mike Galbraithf685cea2009-10-23 23:09:22 +02001846
Wang Xingchaof4cfb332011-09-16 13:35:52 -04001847 if (delta < 0)
1848 return;
Mike Galbraithd7d82942011-01-05 05:41:17 +01001849
Wang Xingchaof4cfb332011-09-16 13:35:52 -04001850 if (delta > ideal_runtime)
1851 resched_task(rq_of(cfs_rq)->curr);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001852}
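/*
 * Example (assumed numbers): with ideal_runtime == 4ms, a task that
 * has run 5ms since it was last picked is resched'ed outright; one
 * that has run only 2ms is preempted early only if its vruntime leads
 * the leftmost entity's by more than 4ms (and at least
 * sysctl_sched_min_granularity has elapsed).
 */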
1853
Srivatsa Vaddagiri83b699e2007-10-15 17:00:08 +02001854static void
Ingo Molnar8494f412007-08-09 11:16:48 +02001855set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001856{
Srivatsa Vaddagiri83b699e2007-10-15 17:00:08 +02001857 /* 'current' is not kept within the tree. */
1858 if (se->on_rq) {
1859 /*
1860 * Any task has to be enqueued before it get to execute on
1861 * a CPU. So account for the time it spent waiting on the
1862 * runqueue.
1863 */
1864 update_stats_wait_end(cfs_rq, se);
1865 __dequeue_entity(cfs_rq, se);
1866 }
1867
Ingo Molnar79303e92007-08-09 11:16:47 +02001868 update_stats_curr_start(cfs_rq, se);
Ingo Molnar429d43b2007-10-15 17:00:03 +02001869 cfs_rq->curr = se;
Ingo Molnareba1ed42007-10-15 17:00:02 +02001870#ifdef CONFIG_SCHEDSTATS
1871 /*
1872 * Track our maximum slice length, if the CPU's load is at
 1873	 * least twice that of our own weight (i.e. don't track it
1874 * when there are only lesser-weight tasks around):
1875 */
Dmitry Adamushko495eca42007-10-15 17:00:06 +02001876 if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
Lucas De Marchi41acab82010-03-10 23:37:45 -03001877 se->statistics.slice_max = max(se->statistics.slice_max,
Ingo Molnareba1ed42007-10-15 17:00:02 +02001878 se->sum_exec_runtime - se->prev_sum_exec_runtime);
1879 }
1880#endif
Peter Zijlstra4a55b452007-09-05 14:32:49 +02001881 se->prev_sum_exec_runtime = se->sum_exec_runtime;
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001882}
1883
Peter Zijlstra3f3a4902008-10-24 11:06:16 +02001884static int
1885wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
1886
Rik van Rielac53db52011-02-01 09:51:03 -05001887/*
1888 * Pick the next process, keeping these things in mind, in this order:
1889 * 1) keep things fair between processes/task groups
1890 * 2) pick the "next" process, since someone really wants that to run
1891 * 3) pick the "last" process, for cache locality
1892 * 4) do not run the "skip" process, if something else is available
1893 */
Peter Zijlstraf4b67552008-11-04 21:25:07 +01001894static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
Peter Zijlstraaa2ac252008-03-14 21:12:12 +01001895{
Rik van Rielac53db52011-02-01 09:51:03 -05001896 struct sched_entity *se = __pick_first_entity(cfs_rq);
Mike Galbraithf685cea2009-10-23 23:09:22 +02001897 struct sched_entity *left = se;
Peter Zijlstraf4b67552008-11-04 21:25:07 +01001898
Rik van Rielac53db52011-02-01 09:51:03 -05001899 /*
1900 * Avoid running the skip buddy, if running something else can
1901 * be done without getting too unfair.
1902 */
1903 if (cfs_rq->skip == se) {
1904 struct sched_entity *second = __pick_next_entity(se);
1905 if (second && wakeup_preempt_entity(second, left) < 1)
1906 se = second;
1907 }
Peter Zijlstraaa2ac252008-03-14 21:12:12 +01001908
Mike Galbraithf685cea2009-10-23 23:09:22 +02001909 /*
1910 * Prefer last buddy, try to return the CPU to a preempted task.
1911 */
1912 if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
1913 se = cfs_rq->last;
1914
Rik van Rielac53db52011-02-01 09:51:03 -05001915 /*
1916 * Someone really wants this to run. If it's not unfair, run it.
1917 */
1918 if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
1919 se = cfs_rq->next;
1920
Mike Galbraithf685cea2009-10-23 23:09:22 +02001921 clear_buddies(cfs_rq, se);
Peter Zijlstra47932412008-11-04 21:25:09 +01001922
1923 return se;
Peter Zijlstraaa2ac252008-03-14 21:12:12 +01001924}
1925
Paul Turnerd3d9dc32011-07-21 09:43:39 -07001926static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
1927
Ingo Molnarab6cde22007-08-09 11:16:48 +02001928static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001929{
1930 /*
1931 * If still on the runqueue then deactivate_task()
1932 * was not called and update_curr() has to be done:
1933 */
1934 if (prev->on_rq)
Ingo Molnarb7cc0892007-08-09 11:16:47 +02001935 update_curr(cfs_rq);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001936
Paul Turnerd3d9dc32011-07-21 09:43:39 -07001937 /* throttle cfs_rqs exceeding runtime */
1938 check_cfs_rq_runtime(cfs_rq);
1939
Peter Zijlstraddc97292007-10-15 17:00:10 +02001940 check_spread(cfs_rq, prev);
Dmitry Adamushko30cfdcf2007-10-15 17:00:07 +02001941 if (prev->on_rq) {
Ingo Molnar5870db52007-08-09 11:16:47 +02001942 update_stats_wait_start(cfs_rq, prev);
Dmitry Adamushko30cfdcf2007-10-15 17:00:07 +02001943 /* Put 'current' back into the tree. */
1944 __enqueue_entity(cfs_rq, prev);
Paul Turner9d85f212012-10-04 13:18:29 +02001945 /* in !on_rq case, update occurred at dequeue */
Paul Turner9ee474f2012-10-04 13:18:30 +02001946 update_entity_load_avg(prev, 1);
Dmitry Adamushko30cfdcf2007-10-15 17:00:07 +02001947 }
Ingo Molnar429d43b2007-10-15 17:00:03 +02001948 cfs_rq->curr = NULL;
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001949}
1950
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01001951static void
1952entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001953{
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001954 /*
Dmitry Adamushko30cfdcf2007-10-15 17:00:07 +02001955 * Update run-time statistics of the 'current'.
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001956 */
Dmitry Adamushko30cfdcf2007-10-15 17:00:07 +02001957 update_curr(cfs_rq);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001958
Paul Turner43365bd2010-12-15 19:10:17 -08001959 /*
Paul Turner9d85f212012-10-04 13:18:29 +02001960 * Ensure that runnable average is periodically updated.
1961 */
Paul Turner9ee474f2012-10-04 13:18:30 +02001962 update_entity_load_avg(curr, 1);
Paul Turneraff3e492012-10-04 13:18:30 +02001963 update_cfs_rq_blocked_load(cfs_rq, 1);
Paul Turner9d85f212012-10-04 13:18:29 +02001964
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01001965#ifdef CONFIG_SCHED_HRTICK
1966 /*
1967 * queued ticks are scheduled to match the slice, so don't bother
1968 * validating it and just reschedule.
1969 */
Harvey Harrison983ed7a2008-04-24 18:17:55 -07001970 if (queued) {
1971 resched_task(rq_of(cfs_rq)->curr);
1972 return;
1973 }
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01001974 /*
1975 * don't let the period tick interfere with the hrtick preemption
1976 */
1977 if (!sched_feat(DOUBLE_TICK) &&
1978 hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
1979 return;
1980#endif
1981
Yong Zhang2c2efae2011-07-29 16:20:33 +08001982 if (cfs_rq->nr_running > 1)
Ingo Molnar2e09bf52007-10-15 17:00:05 +02001983 check_preempt_tick(cfs_rq, curr);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001984}
1985
Paul Turnerab84d312011-07-21 09:43:28 -07001986
1987/**************************************************
1988 * CFS bandwidth control machinery
1989 */
1990
1991#ifdef CONFIG_CFS_BANDWIDTH
Peter Zijlstra029632f2011-10-25 10:00:11 +02001992
1993#ifdef HAVE_JUMP_LABEL
Ingo Molnarc5905af2012-02-24 08:31:31 +01001994static struct static_key __cfs_bandwidth_used;
Peter Zijlstra029632f2011-10-25 10:00:11 +02001995
1996static inline bool cfs_bandwidth_used(void)
1997{
Ingo Molnarc5905af2012-02-24 08:31:31 +01001998 return static_key_false(&__cfs_bandwidth_used);
Peter Zijlstra029632f2011-10-25 10:00:11 +02001999}
2000
2001void account_cfs_bandwidth_used(int enabled, int was_enabled)
2002{
2003 /* only need to count groups transitioning between enabled/!enabled */
2004 if (enabled && !was_enabled)
Ingo Molnarc5905af2012-02-24 08:31:31 +01002005 static_key_slow_inc(&__cfs_bandwidth_used);
Peter Zijlstra029632f2011-10-25 10:00:11 +02002006 else if (!enabled && was_enabled)
Ingo Molnarc5905af2012-02-24 08:31:31 +01002007 static_key_slow_dec(&__cfs_bandwidth_used);
Peter Zijlstra029632f2011-10-25 10:00:11 +02002008}
2009#else /* HAVE_JUMP_LABEL */
2010static bool cfs_bandwidth_used(void)
2011{
2012 return true;
2013}
2014
2015void account_cfs_bandwidth_used(int enabled, int was_enabled) {}
2016#endif /* HAVE_JUMP_LABEL */
2017
Paul Turnerab84d312011-07-21 09:43:28 -07002018/*
2019 * default period for cfs group bandwidth.
2020 * default: 0.1s, units: nanoseconds
2021 */
2022static inline u64 default_cfs_period(void)
2023{
2024 return 100000000ULL;
2025}
Paul Turnerec12cb72011-07-21 09:43:30 -07002026
2027static inline u64 sched_cfs_bandwidth_slice(void)
2028{
2029 return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
2030}
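/*
 * Example, assuming the default 5ms sysctl_sched_cfs_bandwidth_slice:
 * a group with quota == 50ms per 100ms period hands runtime to its
 * cfs_rqs in 5ms chunks, so the global pool sustains at most ten such
 * grants per period before throttling kicks in.
 */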
2031
Paul Turnera9cf55b2011-07-21 09:43:32 -07002032/*
2033 * Replenish runtime according to assigned quota and update expiration time.
2034 * We use sched_clock_cpu directly instead of rq->clock to avoid adding
2035 * additional synchronization around rq->lock.
2036 *
2037 * requires cfs_b->lock
2038 */
Peter Zijlstra029632f2011-10-25 10:00:11 +02002039void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
Paul Turnera9cf55b2011-07-21 09:43:32 -07002040{
2041 u64 now;
2042
2043 if (cfs_b->quota == RUNTIME_INF)
2044 return;
2045
2046 now = sched_clock_cpu(smp_processor_id());
2047 cfs_b->runtime = cfs_b->quota;
2048 cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
2049}
2050
Peter Zijlstra029632f2011-10-25 10:00:11 +02002051static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
2052{
2053 return &tg->cfs_bandwidth;
2054}
2055
Paul Turnerf1b17282012-10-04 13:18:31 +02002056/* rq->clock_task normalized against any time this cfs_rq has spent throttled */
2057static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
2058{
2059 if (unlikely(cfs_rq->throttle_count))
2060 return cfs_rq->throttled_clock_task;
2061
2062 return rq_of(cfs_rq)->clock_task - cfs_rq->throttled_clock_task_time;
2063}
2064
Paul Turner85dac902011-07-21 09:43:33 -07002065/* returns 0 on failure to allocate runtime */
2066static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
Paul Turnerec12cb72011-07-21 09:43:30 -07002067{
2068 struct task_group *tg = cfs_rq->tg;
2069 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
Paul Turnera9cf55b2011-07-21 09:43:32 -07002070 u64 amount = 0, min_amount, expires;
Paul Turnerec12cb72011-07-21 09:43:30 -07002071
2072 /* note: this is a positive sum as runtime_remaining <= 0 */
2073 min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
2074
2075 raw_spin_lock(&cfs_b->lock);
2076 if (cfs_b->quota == RUNTIME_INF)
2077 amount = min_amount;
Paul Turner58088ad2011-07-21 09:43:31 -07002078 else {
Paul Turnera9cf55b2011-07-21 09:43:32 -07002079 /*
2080 * If the bandwidth pool has become inactive, then at least one
2081 * period must have elapsed since the last consumption.
2082 * Refresh the global state and ensure bandwidth timer becomes
2083 * active.
2084 */
2085 if (!cfs_b->timer_active) {
2086 __refill_cfs_bandwidth_runtime(cfs_b);
Paul Turner58088ad2011-07-21 09:43:31 -07002087 __start_cfs_bandwidth(cfs_b);
Paul Turnera9cf55b2011-07-21 09:43:32 -07002088 }
Paul Turner58088ad2011-07-21 09:43:31 -07002089
2090 if (cfs_b->runtime > 0) {
2091 amount = min(cfs_b->runtime, min_amount);
2092 cfs_b->runtime -= amount;
2093 cfs_b->idle = 0;
2094 }
Paul Turnerec12cb72011-07-21 09:43:30 -07002095 }
Paul Turnera9cf55b2011-07-21 09:43:32 -07002096 expires = cfs_b->runtime_expires;
Paul Turnerec12cb72011-07-21 09:43:30 -07002097 raw_spin_unlock(&cfs_b->lock);
2098
2099 cfs_rq->runtime_remaining += amount;
Paul Turnera9cf55b2011-07-21 09:43:32 -07002100 /*
2101 * we may have advanced our local expiration to account for allowed
2102 * spread between our sched_clock and the one on which runtime was
2103 * issued.
2104 */
2105 if ((s64)(expires - cfs_rq->runtime_expires) > 0)
2106 cfs_rq->runtime_expires = expires;
Paul Turner85dac902011-07-21 09:43:33 -07002107
2108 return cfs_rq->runtime_remaining > 0;
Paul Turnera9cf55b2011-07-21 09:43:32 -07002109}
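/*
 * Example walk-through (assumed numbers, default 5ms slice): with
 * runtime_remaining == -2ms we request min_amount == 7ms. If the
 * global pool holds only 4ms we take all of it, leaving
 * runtime_remaining == 2ms and returning 1; with an empty pool we
 * return 0 and the caller throttles.
 */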
2110
2111/*
2112 * Note: This depends on the synchronization provided by sched_clock and the
2113 * fact that rq->clock snapshots this value.
2114 */
2115static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2116{
2117 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2118 struct rq *rq = rq_of(cfs_rq);
2119
2120 /* if the deadline is ahead of our clock, nothing to do */
2121 if (likely((s64)(rq->clock - cfs_rq->runtime_expires) < 0))
2122 return;
2123
2124 if (cfs_rq->runtime_remaining < 0)
2125 return;
2126
2127 /*
2128 * If the local deadline has passed we have to consider the
2129 * possibility that our sched_clock is 'fast' and the global deadline
2130 * has not truly expired.
2131 *
 2132	 * Fortunately we can determine whether this is the case by checking
2133 * whether the global deadline has advanced.
2134 */
2135
2136 if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) {
2137 /* extend local deadline, drift is bounded above by 2 ticks */
2138 cfs_rq->runtime_expires += TICK_NSEC;
2139 } else {
2140 /* global deadline is ahead, expiration has passed */
2141 cfs_rq->runtime_remaining = 0;
2142 }
Paul Turnerec12cb72011-07-21 09:43:30 -07002143}
2144
2145static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
2146 unsigned long delta_exec)
2147{
Paul Turnera9cf55b2011-07-21 09:43:32 -07002148 /* dock delta_exec before expiring quota (as it could span periods) */
Paul Turnerec12cb72011-07-21 09:43:30 -07002149 cfs_rq->runtime_remaining -= delta_exec;
Paul Turnera9cf55b2011-07-21 09:43:32 -07002150 expire_cfs_rq_runtime(cfs_rq);
2151
2152 if (likely(cfs_rq->runtime_remaining > 0))
Paul Turnerec12cb72011-07-21 09:43:30 -07002153 return;
2154
Paul Turner85dac902011-07-21 09:43:33 -07002155 /*
2156 * if we're unable to extend our runtime we resched so that the active
2157 * hierarchy can be throttled
2158 */
2159 if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
2160 resched_task(rq_of(cfs_rq)->curr);
Paul Turnerec12cb72011-07-21 09:43:30 -07002161}
2162
Peter Zijlstra6c16a6d2012-03-21 13:07:16 -07002163static __always_inline
2164void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec)
Paul Turnerec12cb72011-07-21 09:43:30 -07002165{
Paul Turner56f570e2011-11-07 20:26:33 -08002166 if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
Paul Turnerec12cb72011-07-21 09:43:30 -07002167 return;
2168
2169 __account_cfs_rq_runtime(cfs_rq, delta_exec);
2170}
2171
Paul Turner85dac902011-07-21 09:43:33 -07002172static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
2173{
Paul Turner56f570e2011-11-07 20:26:33 -08002174 return cfs_bandwidth_used() && cfs_rq->throttled;
Paul Turner85dac902011-07-21 09:43:33 -07002175}
2176
Paul Turner64660c82011-07-21 09:43:36 -07002177/* check whether cfs_rq, or any parent, is throttled */
2178static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
2179{
Paul Turner56f570e2011-11-07 20:26:33 -08002180 return cfs_bandwidth_used() && cfs_rq->throttle_count;
Paul Turner64660c82011-07-21 09:43:36 -07002181}
2182
2183/*
2184 * Ensure that neither of the group entities corresponding to src_cpu or
2185 * dest_cpu are members of a throttled hierarchy when performing group
2186 * load-balance operations.
2187 */
2188static inline int throttled_lb_pair(struct task_group *tg,
2189 int src_cpu, int dest_cpu)
2190{
2191 struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
2192
2193 src_cfs_rq = tg->cfs_rq[src_cpu];
2194 dest_cfs_rq = tg->cfs_rq[dest_cpu];
2195
2196 return throttled_hierarchy(src_cfs_rq) ||
2197 throttled_hierarchy(dest_cfs_rq);
2198}
2199
2200/* updated child weight may affect parent so we have to do this bottom up */
2201static int tg_unthrottle_up(struct task_group *tg, void *data)
2202{
2203 struct rq *rq = data;
2204 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
2205
2206 cfs_rq->throttle_count--;
2207#ifdef CONFIG_SMP
2208 if (!cfs_rq->throttle_count) {
Paul Turnerf1b17282012-10-04 13:18:31 +02002209 /* adjust cfs_rq_clock_task() */
2210 cfs_rq->throttled_clock_task_time += rq->clock_task -
2211 cfs_rq->throttled_clock_task;
Paul Turner64660c82011-07-21 09:43:36 -07002212 }
2213#endif
2214
2215 return 0;
2216}
2217
2218static int tg_throttle_down(struct task_group *tg, void *data)
2219{
2220 struct rq *rq = data;
2221 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
2222
Paul Turner82958362012-10-04 13:18:31 +02002223 /* group is entering throttled state, stop time */
2224 if (!cfs_rq->throttle_count)
Paul Turnerf1b17282012-10-04 13:18:31 +02002225 cfs_rq->throttled_clock_task = rq->clock_task;
Paul Turner64660c82011-07-21 09:43:36 -07002226 cfs_rq->throttle_count++;
2227
2228 return 0;
2229}
2230
Paul Turnerd3d9dc32011-07-21 09:43:39 -07002231static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
Paul Turner85dac902011-07-21 09:43:33 -07002232{
2233 struct rq *rq = rq_of(cfs_rq);
2234 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2235 struct sched_entity *se;
2236 long task_delta, dequeue = 1;
2237
2238 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
2239
Paul Turnerf1b17282012-10-04 13:18:31 +02002240 /* freeze hierarchy runnable averages while throttled */
Paul Turner64660c82011-07-21 09:43:36 -07002241 rcu_read_lock();
2242 walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
2243 rcu_read_unlock();
Paul Turner85dac902011-07-21 09:43:33 -07002244
2245 task_delta = cfs_rq->h_nr_running;
2246 for_each_sched_entity(se) {
2247 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
2248 /* throttled entity or throttle-on-deactivate */
2249 if (!se->on_rq)
2250 break;
2251
2252 if (dequeue)
2253 dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
2254 qcfs_rq->h_nr_running -= task_delta;
2255
2256 if (qcfs_rq->load.weight)
2257 dequeue = 0;
2258 }
2259
2260 if (!se)
2261 rq->nr_running -= task_delta;
2262
2263 cfs_rq->throttled = 1;
Paul Turnerf1b17282012-10-04 13:18:31 +02002264 cfs_rq->throttled_clock = rq->clock;
Paul Turner85dac902011-07-21 09:43:33 -07002265 raw_spin_lock(&cfs_b->lock);
2266 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
2267 raw_spin_unlock(&cfs_b->lock);
2268}
2269
Peter Zijlstra029632f2011-10-25 10:00:11 +02002270void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
Paul Turner671fd9d2011-07-21 09:43:34 -07002271{
2272 struct rq *rq = rq_of(cfs_rq);
2273 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2274 struct sched_entity *se;
2275 int enqueue = 1;
2276 long task_delta;
2277
2278 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
2279
2280 cfs_rq->throttled = 0;
2281 raw_spin_lock(&cfs_b->lock);
Paul Turnerf1b17282012-10-04 13:18:31 +02002282 cfs_b->throttled_time += rq->clock - cfs_rq->throttled_clock;
Paul Turner671fd9d2011-07-21 09:43:34 -07002283 list_del_rcu(&cfs_rq->throttled_list);
2284 raw_spin_unlock(&cfs_b->lock);
2285
Paul Turner64660c82011-07-21 09:43:36 -07002286 update_rq_clock(rq);
2287 /* update hierarchical throttle state */
2288 walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
2289
Paul Turner671fd9d2011-07-21 09:43:34 -07002290 if (!cfs_rq->load.weight)
2291 return;
2292
2293 task_delta = cfs_rq->h_nr_running;
2294 for_each_sched_entity(se) {
2295 if (se->on_rq)
2296 enqueue = 0;
2297
2298 cfs_rq = cfs_rq_of(se);
2299 if (enqueue)
2300 enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
2301 cfs_rq->h_nr_running += task_delta;
2302
2303 if (cfs_rq_throttled(cfs_rq))
2304 break;
2305 }
2306
2307 if (!se)
2308 rq->nr_running += task_delta;
2309
2310 /* determine whether we need to wake up potentially idle cpu */
2311 if (rq->curr == rq->idle && rq->cfs.nr_running)
2312 resched_task(rq->curr);
2313}
2314
2315static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
2316 u64 remaining, u64 expires)
2317{
2318 struct cfs_rq *cfs_rq;
2319 u64 runtime = remaining;
2320
2321 rcu_read_lock();
2322 list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
2323 throttled_list) {
2324 struct rq *rq = rq_of(cfs_rq);
2325
2326 raw_spin_lock(&rq->lock);
2327 if (!cfs_rq_throttled(cfs_rq))
2328 goto next;
2329
2330 runtime = -cfs_rq->runtime_remaining + 1;
2331 if (runtime > remaining)
2332 runtime = remaining;
2333 remaining -= runtime;
2334
2335 cfs_rq->runtime_remaining += runtime;
2336 cfs_rq->runtime_expires = expires;
2337
2338 /* we check whether we're throttled above */
2339 if (cfs_rq->runtime_remaining > 0)
2340 unthrottle_cfs_rq(cfs_rq);
2341
2342next:
2343 raw_spin_unlock(&rq->lock);
2344
2345 if (!remaining)
2346 break;
2347 }
2348 rcu_read_unlock();
2349
2350 return remaining;
2351}
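/*
 * Example (assumed numbers): two throttled cfs_rqs with
 * runtime_remaining of -3ms and -1ms, and 5ms to distribute: the first
 * gets 3ms + 1ns and is unthrottled, the second gets 1ms + 1ns, and
 * roughly 1ms is handed back to the global pool.
 */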
2352
Paul Turner58088ad2011-07-21 09:43:31 -07002353/*
2354 * Responsible for refilling a task_group's bandwidth and unthrottling its
2355 * cfs_rqs as appropriate. If there has been no activity within the last
2356 * period the timer is deactivated until scheduling resumes; cfs_b->idle is
2357 * used to track this state.
2358 */
2359static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
2360{
Paul Turner671fd9d2011-07-21 09:43:34 -07002361 u64 runtime, runtime_expires;
2362 int idle = 1, throttled;
Paul Turner58088ad2011-07-21 09:43:31 -07002363
2364 raw_spin_lock(&cfs_b->lock);
2365 /* no need to continue the timer with no bandwidth constraint */
2366 if (cfs_b->quota == RUNTIME_INF)
2367 goto out_unlock;
2368
Paul Turner671fd9d2011-07-21 09:43:34 -07002369 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
2370 /* idle depends on !throttled (for the case of a large deficit) */
2371 idle = cfs_b->idle && !throttled;
Nikhil Raoe8da1b12011-07-21 09:43:40 -07002372 cfs_b->nr_periods += overrun;
Paul Turner671fd9d2011-07-21 09:43:34 -07002373
Paul Turnera9cf55b2011-07-21 09:43:32 -07002374 /* if we're going inactive then everything else can be deferred */
2375 if (idle)
2376 goto out_unlock;
2377
2378 __refill_cfs_bandwidth_runtime(cfs_b);
2379
Paul Turner671fd9d2011-07-21 09:43:34 -07002380 if (!throttled) {
2381 /* mark as potentially idle for the upcoming period */
2382 cfs_b->idle = 1;
2383 goto out_unlock;
2384 }
Paul Turner58088ad2011-07-21 09:43:31 -07002385
Nikhil Raoe8da1b12011-07-21 09:43:40 -07002386 /* account preceding periods in which throttling occurred */
2387 cfs_b->nr_throttled += overrun;
2388
Paul Turner671fd9d2011-07-21 09:43:34 -07002389 /*
2390 * There are throttled entities so we must first use the new bandwidth
2391 * to unthrottle them before making it generally available. This
2392 * ensures that all existing debts will be paid before a new cfs_rq is
2393 * allowed to run.
2394 */
2395 runtime = cfs_b->runtime;
2396 runtime_expires = cfs_b->runtime_expires;
2397 cfs_b->runtime = 0;
2398
2399 /*
2400 * This check is repeated as we are holding onto the new bandwidth
2401 * while we unthrottle. This can potentially race with an unthrottled
2402 * group trying to acquire new bandwidth from the global pool.
2403 */
2404 while (throttled && runtime > 0) {
2405 raw_spin_unlock(&cfs_b->lock);
2406 /* we can't nest cfs_b->lock while distributing bandwidth */
2407 runtime = distribute_cfs_runtime(cfs_b, runtime,
2408 runtime_expires);
2409 raw_spin_lock(&cfs_b->lock);
2410
2411 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
2412 }
2413
2414 /* return (any) remaining runtime */
2415 cfs_b->runtime = runtime;
2416 /*
2417 * While we are ensured activity in the period following an
2418 * unthrottle, this also covers the case in which the new bandwidth is
2419 * insufficient to cover the existing bandwidth deficit. (Forcing the
2420 * timer to remain active while there are any throttled entities.)
2421 */
2422 cfs_b->idle = 0;
Paul Turner58088ad2011-07-21 09:43:31 -07002423out_unlock:
2424 if (idle)
2425 cfs_b->timer_active = 0;
2426 raw_spin_unlock(&cfs_b->lock);
2427
2428 return idle;
2429}
Paul Turnerd3d9dc32011-07-21 09:43:39 -07002430
Paul Turnerd8b49862011-07-21 09:43:41 -07002431/* a cfs_rq won't donate quota below this amount */
2432static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
2433/* minimum remaining period time to redistribute slack quota */
2434static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
2435/* how long we wait to gather additional slack before distributing */
2436static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
2437
2438/* are we near the end of the current quota period? */
2439static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
2440{
2441 struct hrtimer *refresh_timer = &cfs_b->period_timer;
2442 u64 remaining;
2443
2444 /* if the call-back is running a quota refresh is already occurring */
2445 if (hrtimer_callback_running(refresh_timer))
2446 return 1;
2447
2448 /* is a quota refresh about to occur? */
2449 remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
2450 if (remaining < min_expire)
2451 return 1;
2452
2453 return 0;
2454}
2455
2456static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
2457{
2458 u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
2459
2460 /* if there's a quota refresh soon don't bother with slack */
2461 if (runtime_refresh_within(cfs_b, min_left))
2462 return;
2463
2464 start_bandwidth_timer(&cfs_b->slack_timer,
2465 ns_to_ktime(cfs_bandwidth_slack_period));
2466}
2467
2468/* we know any runtime found here is valid as update_curr() precedes return */
2469static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2470{
2471 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2472 s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
2473
2474 if (slack_runtime <= 0)
2475 return;
2476
2477 raw_spin_lock(&cfs_b->lock);
2478 if (cfs_b->quota != RUNTIME_INF &&
2479 cfs_rq->runtime_expires == cfs_b->runtime_expires) {
2480 cfs_b->runtime += slack_runtime;
2481
2482 /* we are under rq->lock, defer unthrottling using a timer */
2483 if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
2484 !list_empty(&cfs_b->throttled_cfs_rq))
2485 start_cfs_slack_bandwidth(cfs_b);
2486 }
2487 raw_spin_unlock(&cfs_b->lock);
2488
2489 /* even if it's not valid for return we don't want to try again */
2490 cfs_rq->runtime_remaining -= slack_runtime;
2491}
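/*
 * Example using the constants above: a cfs_rq going idle with 4ms of
 * local runtime donates 3ms back, keeping min_cfs_rq_runtime == 1ms
 * for itself; if anyone is throttled and more than a slice has pooled,
 * the slack timer fires ~5ms later to redistribute the slack.
 */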
2492
2493static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2494{
Paul Turner56f570e2011-11-07 20:26:33 -08002495 if (!cfs_bandwidth_used())
2496 return;
2497
Paul Turnerfccfdc62011-11-07 20:26:34 -08002498 if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
Paul Turnerd8b49862011-07-21 09:43:41 -07002499 return;
2500
2501 __return_cfs_rq_runtime(cfs_rq);
2502}
2503
2504/*
2505 * This is done with a timer (instead of inline with bandwidth return) since
2506 * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
2507 */
2508static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
2509{
2510 u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
2511 u64 expires;
2512
2513 /* confirm we're still not at a refresh boundary */
2514 if (runtime_refresh_within(cfs_b, min_bandwidth_expiration))
2515 return;
2516
2517 raw_spin_lock(&cfs_b->lock);
2518 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
2519 runtime = cfs_b->runtime;
2520 cfs_b->runtime = 0;
2521 }
2522 expires = cfs_b->runtime_expires;
2523 raw_spin_unlock(&cfs_b->lock);
2524
2525 if (!runtime)
2526 return;
2527
2528 runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
2529
2530 raw_spin_lock(&cfs_b->lock);
2531 if (expires == cfs_b->runtime_expires)
2532 cfs_b->runtime = runtime;
2533 raw_spin_unlock(&cfs_b->lock);
2534}
2535
Paul Turnerd3d9dc32011-07-21 09:43:39 -07002536/*
2537 * When a group wakes up we want to make sure that its quota is not already
2538 * expired/exceeded, otherwise it may be allowed to steal additional ticks of
 2539	 * runtime as update_curr() throttling cannot trigger until it's on-rq.
2540 */
2541static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
2542{
Paul Turner56f570e2011-11-07 20:26:33 -08002543 if (!cfs_bandwidth_used())
2544 return;
2545
Paul Turnerd3d9dc32011-07-21 09:43:39 -07002546 /* an active group must be handled by the update_curr()->put() path */
2547 if (!cfs_rq->runtime_enabled || cfs_rq->curr)
2548 return;
2549
2550 /* ensure the group is not already throttled */
2551 if (cfs_rq_throttled(cfs_rq))
2552 return;
2553
2554 /* update runtime allocation */
2555 account_cfs_rq_runtime(cfs_rq, 0);
2556 if (cfs_rq->runtime_remaining <= 0)
2557 throttle_cfs_rq(cfs_rq);
2558}
2559
2560/* conditionally throttle active cfs_rq's from put_prev_entity() */
2561static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2562{
Paul Turner56f570e2011-11-07 20:26:33 -08002563 if (!cfs_bandwidth_used())
2564 return;
2565
Paul Turnerd3d9dc32011-07-21 09:43:39 -07002566 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
2567 return;
2568
2569 /*
2570 * it's possible for a throttled entity to be forced into a running
2571 * state (e.g. set_curr_task), in this case we're finished.
2572 */
2573 if (cfs_rq_throttled(cfs_rq))
2574 return;
2575
2576 throttle_cfs_rq(cfs_rq);
2577}
Peter Zijlstra029632f2011-10-25 10:00:11 +02002578
2579static inline u64 default_cfs_period(void);
2580static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
2581static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);
2582
2583static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
2584{
2585 struct cfs_bandwidth *cfs_b =
2586 container_of(timer, struct cfs_bandwidth, slack_timer);
2587 do_sched_cfs_slack_timer(cfs_b);
2588
2589 return HRTIMER_NORESTART;
2590}
2591
2592static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
2593{
2594 struct cfs_bandwidth *cfs_b =
2595 container_of(timer, struct cfs_bandwidth, period_timer);
2596 ktime_t now;
2597 int overrun;
2598 int idle = 0;
2599
2600 for (;;) {
2601 now = hrtimer_cb_get_time(timer);
2602 overrun = hrtimer_forward(timer, now, cfs_b->period);
2603
2604 if (!overrun)
2605 break;
2606
2607 idle = do_sched_cfs_period_timer(cfs_b, overrun);
2608 }
2609
2610 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
2611}
2612
2613void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2614{
2615 raw_spin_lock_init(&cfs_b->lock);
2616 cfs_b->runtime = 0;
2617 cfs_b->quota = RUNTIME_INF;
2618 cfs_b->period = ns_to_ktime(default_cfs_period());
2619
2620 INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
2621 hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2622 cfs_b->period_timer.function = sched_cfs_period_timer;
2623 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2624 cfs_b->slack_timer.function = sched_cfs_slack_timer;
2625}
2626
2627static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2628{
2629 cfs_rq->runtime_enabled = 0;
2630 INIT_LIST_HEAD(&cfs_rq->throttled_list);
2631}
2632
2633/* requires cfs_b->lock, may release to reprogram timer */
2634void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2635{
2636 /*
2637 * The timer may be active because we're trying to set a new bandwidth
2638 * period or because we're racing with the tear-down path
2639 * (timer_active==0 becomes visible before the hrtimer call-back
2640 * terminates). In either case we ensure that it's re-programmed
2641 */
2642 while (unlikely(hrtimer_active(&cfs_b->period_timer))) {
2643 raw_spin_unlock(&cfs_b->lock);
2644 /* ensure cfs_b->lock is available while we wait */
2645 hrtimer_cancel(&cfs_b->period_timer);
2646
2647 raw_spin_lock(&cfs_b->lock);
2648 /* if someone else restarted the timer then we're done */
2649 if (cfs_b->timer_active)
2650 return;
2651 }
2652
2653 cfs_b->timer_active = 1;
2654 start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
2655}

static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
{
	hrtimer_cancel(&cfs_b->period_timer);
	hrtimer_cancel(&cfs_b->slack_timer);
}

static void unthrottle_offline_cfs_rqs(struct rq *rq)
{
	struct cfs_rq *cfs_rq;

	for_each_leaf_cfs_rq(rq, cfs_rq) {
		struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);

		if (!cfs_rq->runtime_enabled)
			continue;

		/*
		 * clock_task is not advancing so we just need to make sure
		 * there's some valid quota amount
		 */
		cfs_rq->runtime_remaining = cfs_b->quota;
		if (cfs_rq_throttled(cfs_rq))
			unthrottle_cfs_rq(cfs_rq);
	}
}

#else /* CONFIG_CFS_BANDWIDTH */
static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
{
	return rq_of(cfs_rq)->clock_task;
}

static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
				     unsigned long delta_exec) {}
static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}

static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
{
	return 0;
}

static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
{
	return 0;
}

static inline int throttled_lb_pair(struct task_group *tg,
				    int src_cpu, int dest_cpu)
{
	return 0;
}

void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}

#ifdef CONFIG_FAIR_GROUP_SCHED
static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
#endif

static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
{
	return NULL;
}
static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}

#endif /* CONFIG_CFS_BANDWIDTH */

/**************************************************
 * CFS operations on tasks:
 */

#ifdef CONFIG_SCHED_HRTICK
static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
{
	struct sched_entity *se = &p->se;
	struct cfs_rq *cfs_rq = cfs_rq_of(se);

	WARN_ON(task_rq(p) != rq);

	if (cfs_rq->nr_running > 1) {
		u64 slice = sched_slice(cfs_rq, se);
		u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
		s64 delta = slice - ran;

		if (delta < 0) {
			if (rq->curr == p)
				resched_task(p);
			return;
		}

		/*
		 * Don't schedule slices shorter than 10000ns, that just
		 * doesn't make sense. Rely on vruntime for fairness.
		 */
		if (rq->curr != p)
			delta = max_t(s64, 10000LL, delta);

		hrtick_start(rq, delta);
	}
}
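
/*
 * Worked example (values assumed): with sched_slice() returning 4ms and the
 * entity having consumed 1.5ms of it, delta = 4000000 - 1500000 = 2500000ns,
 * so the hrtick is programmed to fire 2.5ms from now to re-evaluate
 * preemption. Had the entity overrun its slice (ran > slice, delta < 0), we
 * reschedule immediately rather than arm a timer in the past; when curr is
 * not p the delay is additionally clamped to the 10us minimum above.
 */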

/*
 * called from enqueue/dequeue and updates the hrtick when the
 * current task is from our class and nr_running is low enough
 * to matter.
 */
static void hrtick_update(struct rq *rq)
{
	struct task_struct *curr = rq->curr;

	if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
		return;

	if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
		hrtick_start_fair(rq, curr);
}
#else /* !CONFIG_SCHED_HRTICK */
static inline void
hrtick_start_fair(struct rq *rq, struct task_struct *p)
{
}

static inline void hrtick_update(struct rq *rq)
{
}
#endif

/*
 * The enqueue_task method is called before nr_running is
 * increased. Here we update the fair scheduling stats and
 * then put the task into the rbtree:
 */
static void
enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
	struct cfs_rq *cfs_rq;
	struct sched_entity *se = &p->se;

	for_each_sched_entity(se) {
		if (se->on_rq)
			break;
		cfs_rq = cfs_rq_of(se);
		enqueue_entity(cfs_rq, se, flags);

		/*
		 * end evaluation on encountering a throttled cfs_rq
		 *
		 * note: in the case of encountering a throttled cfs_rq we will
		 * post the final h_nr_running increment below.
		 */
		if (cfs_rq_throttled(cfs_rq))
			break;
		cfs_rq->h_nr_running++;

		flags = ENQUEUE_WAKEUP;
	}

	for_each_sched_entity(se) {
		cfs_rq = cfs_rq_of(se);
		cfs_rq->h_nr_running++;

		if (cfs_rq_throttled(cfs_rq))
			break;

		update_cfs_shares(cfs_rq);
		update_entity_load_avg(se, 1);
	}

	if (!se) {
		update_rq_runnable_avg(rq, rq->nr_running);
		inc_nr_running(rq);
	}
	hrtick_update(rq);
}

static void set_next_buddy(struct sched_entity *se);

/*
 * The dequeue_task method is called before nr_running is
 * decreased. We remove the task from the rbtree and
 * update the fair scheduling stats:
 */
static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
	struct cfs_rq *cfs_rq;
	struct sched_entity *se = &p->se;
	int task_sleep = flags & DEQUEUE_SLEEP;

	for_each_sched_entity(se) {
		cfs_rq = cfs_rq_of(se);
		dequeue_entity(cfs_rq, se, flags);

		/*
		 * end evaluation on encountering a throttled cfs_rq
		 *
		 * note: in the case of encountering a throttled cfs_rq we will
		 * post the final h_nr_running decrement below.
		 */
		if (cfs_rq_throttled(cfs_rq))
			break;
		cfs_rq->h_nr_running--;

		/* Don't dequeue parent if it has other entities besides us */
		if (cfs_rq->load.weight) {
			/*
			 * Bias pick_next to pick a task from this cfs_rq, as
			 * p is sleeping when it is within its sched_slice.
			 */
			if (task_sleep && parent_entity(se))
				set_next_buddy(parent_entity(se));

			/* avoid re-evaluating load for this entity */
			se = parent_entity(se);
			break;
		}
		flags |= DEQUEUE_SLEEP;
	}

	for_each_sched_entity(se) {
		cfs_rq = cfs_rq_of(se);
		cfs_rq->h_nr_running--;

		if (cfs_rq_throttled(cfs_rq))
			break;

		update_cfs_shares(cfs_rq);
		update_entity_load_avg(se, 1);
	}

	if (!se) {
		dec_nr_running(rq);
		update_rq_runnable_avg(rq, 1);
	}
	hrtick_update(rq);
}

#ifdef CONFIG_SMP
/* Used instead of source_load when we know the type == 0 */
static unsigned long weighted_cpuload(const int cpu)
{
	return cpu_rq(cpu)->load.weight;
}

/*
 * Return a low guess at the load of a migration-source cpu weighted
 * according to the scheduling class and "nice" value.
 *
 * We want to under-estimate the load of migration sources, to
 * balance conservatively.
 */
static unsigned long source_load(int cpu, int type)
{
	struct rq *rq = cpu_rq(cpu);
	unsigned long total = weighted_cpuload(cpu);

	if (type == 0 || !sched_feat(LB_BIAS))
		return total;

	return min(rq->cpu_load[type-1], total);
}

/*
 * Return a high guess at the load of a migration-target cpu weighted
 * according to the scheduling class and "nice" value.
 */
static unsigned long target_load(int cpu, int type)
{
	struct rq *rq = cpu_rq(cpu);
	unsigned long total = weighted_cpuload(cpu);

	if (type == 0 || !sched_feat(LB_BIAS))
		return total;

	return max(rq->cpu_load[type-1], total);
}
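
/*
 * Illustrative numbers (assumed, not measured): if weighted_cpuload(cpu) is
 * 2048 right now but the decayed cpu_load[type-1] is 1024, source_load()
 * reports min(1024, 2048) = 1024 (an optimistic view of the cpu we would
 * pull from) while target_load() reports max(1024, 2048) = 2048 (a
 * pessimistic view of the cpu we would push to). Both biases make the
 * balancer reluctant to move tasks on transient load spikes.
 */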

static unsigned long power_of(int cpu)
{
	return cpu_rq(cpu)->cpu_power;
}

static unsigned long cpu_avg_load_per_task(int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	unsigned long nr_running = ACCESS_ONCE(rq->nr_running);

	if (nr_running)
		return rq->load.weight / nr_running;

	return 0;
}


static void task_waking_fair(struct task_struct *p)
{
	struct sched_entity *se = &p->se;
	struct cfs_rq *cfs_rq = cfs_rq_of(se);
	u64 min_vruntime;

#ifndef CONFIG_64BIT
	u64 min_vruntime_copy;

	do {
		min_vruntime_copy = cfs_rq->min_vruntime_copy;
		smp_rmb();
		min_vruntime = cfs_rq->min_vruntime;
	} while (min_vruntime != min_vruntime_copy);
#else
	min_vruntime = cfs_rq->min_vruntime;
#endif

	se->vruntime -= min_vruntime;
}
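
/*
 * Sketch (not built) of the retry pattern above in isolation: on 32-bit, a
 * u64 load can tear, so the writer publishes a second copy and the reader
 * retries until both halves agree. The structure below is hypothetical; the
 * kernel pairs this read side with an smp_wmb() between the two stores on
 * the update side.
 */
#if 0
struct sample {
	u64 value;		/* written under the owner's lock */
	u64 value_copy;		/* written after smp_wmb() */
};

static u64 read_u64_tearfree(struct sample *s)
{
	u64 v, copy;

	do {
		copy = s->value_copy;
		smp_rmb();		/* order copy load before value load */
		v = s->value;
	} while (v != copy);		/* retry if we raced with the writer */

	return v;
}
#endif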

#ifdef CONFIG_FAIR_GROUP_SCHED
/*
 * effective_load() calculates the load change as seen from the root_task_group
 *
 * Adding load to a group doesn't make a group heavier, but can cause movement
 * of group shares between cpus. Assuming the shares were perfectly aligned one
 * can calculate the shift in shares.
 *
 * Calculate the effective load difference if @wl is added (subtracted) to @tg
 * on this @cpu and results in a total addition (subtraction) of @wg to the
 * total group weight.
 *
 * Given a runqueue weight distribution (rw_i) we can compute a shares
 * distribution (s_i) using:
 *
 *   s_i = rw_i / \Sum rw_j						(1)
 *
 * Suppose we have 4 CPUs and our @tg is a direct child of the root group and
 * has 7 equal weight tasks, distributed as below (rw_i), with the resulting
 * shares distribution (s_i):
 *
 *   rw_i = {   2,   4,   1,   0 }
 *   s_i  = { 2/7, 4/7, 1/7,   0 }
 *
 * As per wake_affine() we're interested in the load of two CPUs (the CPU the
 * task used to run on and the CPU the waker is running on), we need to
 * compute the effect of waking a task on either CPU and, in case of a sync
 * wakeup, compute the effect of the current task going to sleep.
 *
 * So for a change of @wl to the local @cpu with an overall group weight change
 * of @wl we can compute the new shares distribution (s'_i) using:
 *
 *   s'_i = (rw_i + @wl) / (@wg + \Sum rw_j)				(2)
 *
 * Suppose we're interested in CPUs 0 and 1, and want to compute the load
 * differences in waking a task to CPU 0. The additional task changes the
 * weight and shares distributions like:
 *
 *   rw'_i = {   3,   4,   1,   0 }
 *   s'_i  = { 3/8, 4/8, 1/8,   0 }
 *
 * We can then compute the difference in effective weight by using:
 *
 *   dw_i = S * (s'_i - s_i)						(3)
 *
 * Where 'S' is the group weight as seen by its parent.
 *
 * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
 * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
 * 4/7) times the weight of the group.
 */
static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
{
	struct sched_entity *se = tg->se[cpu];

	if (!tg->parent)	/* the trivial, non-cgroup case */
		return wl;

	for_each_sched_entity(se) {
		long w, W;

		tg = se->my_q->tg;

		/*
		 * W = @wg + \Sum rw_j
		 */
		W = wg + calc_tg_weight(tg, se->my_q);

		/*
		 * w = rw_i + @wl
		 */
		w = se->my_q->load.weight + wl;

		/*
		 * wl = S * s'_i; see (2)
		 */
		if (W > 0 && w < W)
			wl = (w * tg->shares) / W;
		else
			wl = tg->shares;

		/*
		 * Per the above, wl is the new se->load.weight value; since
		 * those are clipped to [MIN_SHARES, ...) do so now. See
		 * calc_cfs_shares().
		 */
		if (wl < MIN_SHARES)
			wl = MIN_SHARES;

		/*
		 * wl = dw_i = S * (s'_i - s_i); see (3)
		 */
		wl -= se->load.weight;

		/*
		 * Recursively apply this logic to all parent groups to compute
		 * the final effective load change on the root group. Since
		 * only the @tg group gets extra weight, all parent groups can
		 * only redistribute existing shares. @wl is the shift in shares
		 * resulting from this level per the above.
		 */
		wg = 0;
	}

	return wl;
}
#else

static inline unsigned long effective_load(struct task_group *tg, int cpu,
		unsigned long wl, unsigned long wg)
{
	return wl;
}

#endif
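
/*
 * Numeric sketch (not built) of equations (1)-(3), using the example from
 * the effective_load() comment: rw = { 2, 4, 1, 0 }, one task of weight 1
 * added on CPU 0, and the group weight S chosen as 56 so that the fractions
 * come out integral.
 */
#if 0
static long effective_load_demo(void)
{
	long rw[4] = { 2, 4, 1, 0 };	/* per-cpu runqueue weights rw_i */
	long S = 56;			/* group weight seen by the parent */
	long sum = 2 + 4 + 1 + 0;	/* \Sum rw_j = 7 */

	/*
	 * dw_0 = S * ((rw_0 + 1) / (sum + 1) - rw_0 / sum)
	 *      = 56 * (3/8 - 2/7) = 56 * 5/56 = 5
	 */
	long dw0 = S * (rw[0] + 1) / (sum + 1) - S * rw[0] / sum;

	return dw0;			/* 21 - 16 = 5 */
}
#endif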

static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
{
	s64 this_load, load;
	int idx, this_cpu, prev_cpu;
	unsigned long tl_per_task;
	struct task_group *tg;
	unsigned long weight;
	int balanced;

	idx = sd->wake_idx;
	this_cpu = smp_processor_id();
	prev_cpu = task_cpu(p);
	load = source_load(prev_cpu, idx);
	this_load = target_load(this_cpu, idx);

	/*
	 * If sync wakeup then subtract the (maximum possible)
	 * effect of the currently running task from the load
	 * of the current CPU:
	 */
	if (sync) {
		tg = task_group(current);
		weight = current->se.load.weight;

		this_load += effective_load(tg, this_cpu, -weight, -weight);
		load += effective_load(tg, prev_cpu, 0, -weight);
	}

	tg = task_group(p);
	weight = p->se.load.weight;

	/*
	 * In low-load situations, where prev_cpu is idle and this_cpu is idle
	 * due to the sync cause above having dropped this_load to 0, we'll
	 * always have an imbalance, but there's really nothing you can do
	 * about that, so that's good too.
	 *
	 * Otherwise check if the two cpus are near enough in load to allow
	 * this task to be woken on this_cpu.
	 */
	if (this_load > 0) {
		s64 this_eff_load, prev_eff_load;

		this_eff_load = 100;
		this_eff_load *= power_of(prev_cpu);
		this_eff_load *= this_load +
			effective_load(tg, this_cpu, weight, weight);

		prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
		prev_eff_load *= power_of(this_cpu);
		prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);

		balanced = this_eff_load <= prev_eff_load;
	} else
		balanced = true;

	/*
	 * If the currently running task will sleep within
	 * a reasonable amount of time then attract this newly
	 * woken task:
	 */
	if (sync && balanced)
		return 1;

	schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
	tl_per_task = cpu_avg_load_per_task(this_cpu);

	if (balanced ||
	    (this_load <= load &&
	     this_load + target_load(prev_cpu, idx) <= tl_per_task)) {
		/*
		 * This domain has SD_WAKE_AFFINE and
		 * p is cache cold in this domain, and
		 * there is no bad imbalance.
		 */
		schedstat_inc(sd, ttwu_move_affine);
		schedstat_inc(p, se.statistics.nr_wakeups_affine);

		return 1;
	}
	return 0;
}
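
/*
 * Worked example (values assumed): with imbalance_pct = 125 and equal
 * cpu_power = 1024 on both cpus (so the power factors cancel), the wakeup
 * is pulled to this_cpu when
 *
 *   100 * (this_load + wl_this) <= 112 * (prev_load + wl_prev)
 *
 * since 100 + (125 - 100)/2 = 112. That is, the waking cpu may be up to
 * ~12% busier (after accounting for the task's own weight via
 * effective_load()) and still win, which damps needless migrations away
 * from a marginally busier waker.
 */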

/*
 * find_idlest_group finds and returns the least busy CPU group within the
 * domain.
 */
static struct sched_group *
find_idlest_group(struct sched_domain *sd, struct task_struct *p,
		  int this_cpu, int load_idx)
{
	struct sched_group *idlest = NULL, *group = sd->groups;
	unsigned long min_load = ULONG_MAX, this_load = 0;
	int imbalance = 100 + (sd->imbalance_pct-100)/2;

	do {
		unsigned long load, avg_load;
		int local_group;
		int i;

		/* Skip over this group if it has no CPUs allowed */
		if (!cpumask_intersects(sched_group_cpus(group),
					tsk_cpus_allowed(p)))
			continue;

		local_group = cpumask_test_cpu(this_cpu,
					       sched_group_cpus(group));

		/* Tally up the load of all CPUs in the group */
		avg_load = 0;

		for_each_cpu(i, sched_group_cpus(group)) {
			/* Bias balancing toward cpus of our domain */
			if (local_group)
				load = source_load(i, load_idx);
			else
				load = target_load(i, load_idx);

			avg_load += load;
		}

		/* Adjust by relative CPU power of the group */
		avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power;

		if (local_group) {
			this_load = avg_load;
		} else if (avg_load < min_load) {
			min_load = avg_load;
			idlest = group;
		}
	} while (group = group->next, group != sd->groups);

	if (!idlest || 100*this_load < imbalance*min_load)
		return NULL;
	return idlest;
}

/*
 * find_idlest_cpu - find the idlest cpu among the cpus in group.
 */
static int
find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
{
	unsigned long load, min_load = ULONG_MAX;
	int idlest = -1;
	int i;

	/* Traverse only the allowed CPUs */
	for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
		load = weighted_cpuload(i);

		if (load < min_load || (load == min_load && i == this_cpu)) {
			min_load = load;
			idlest = i;
		}
	}

	return idlest;
}

/*
 * Try and locate an idle CPU in the sched_domain.
 */
static int select_idle_sibling(struct task_struct *p, int target)
{
	int cpu = smp_processor_id();
	int prev_cpu = task_cpu(p);
	struct sched_domain *sd;
	struct sched_group *sg;
	int i;

	/*
	 * If the task is going to be woken up on this cpu and if it is
	 * already idle, then it is the right target.
	 */
	if (target == cpu && idle_cpu(cpu))
		return cpu;

	/*
	 * If the task is going to be woken up on the cpu where it previously
	 * ran and if it is currently idle, then it is the right target.
	 */
	if (target == prev_cpu && idle_cpu(prev_cpu))
		return prev_cpu;

	/*
	 * Otherwise, iterate the domains and find an eligible idle cpu.
	 */
	sd = rcu_dereference(per_cpu(sd_llc, target));
	for_each_lower_domain(sd) {
		sg = sd->groups;
		do {
			if (!cpumask_intersects(sched_group_cpus(sg),
						tsk_cpus_allowed(p)))
				goto next;

			for_each_cpu(i, sched_group_cpus(sg)) {
				if (!idle_cpu(i))
					goto next;
			}

			target = cpumask_first_and(sched_group_cpus(sg),
					tsk_cpus_allowed(p));
			goto done;
next:
			sg = sg->next;
		} while (sg != sd->groups);
	}
done:
	return target;
}

/*
 * sched_balance_self: balance the current task (running on cpu) in domains
 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
 * SD_BALANCE_EXEC.
 *
 * Balance, ie. select the least loaded group.
 *
 * Returns the target CPU number, or the same CPU if no balancing is needed.
 *
 * preempt must be disabled.
 */
static int
select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
{
	struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
	int cpu = smp_processor_id();
	int prev_cpu = task_cpu(p);
	int new_cpu = cpu;
	int want_affine = 0;
	int sync = wake_flags & WF_SYNC;

	if (p->nr_cpus_allowed == 1)
		return prev_cpu;

	if (sd_flag & SD_BALANCE_WAKE) {
		if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
			want_affine = 1;
		new_cpu = prev_cpu;
	}

	rcu_read_lock();
	for_each_domain(cpu, tmp) {
		if (!(tmp->flags & SD_LOAD_BALANCE))
			continue;

		/*
		 * If both cpu and prev_cpu are part of this domain,
		 * cpu is a valid SD_WAKE_AFFINE target.
		 */
		if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
		    cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
			affine_sd = tmp;
			break;
		}

		if (tmp->flags & sd_flag)
			sd = tmp;
	}

	if (affine_sd) {
		if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
			prev_cpu = cpu;

		new_cpu = select_idle_sibling(p, prev_cpu);
		goto unlock;
	}

	while (sd) {
		int load_idx = sd->forkexec_idx;
		struct sched_group *group;
		int weight;

		if (!(sd->flags & sd_flag)) {
			sd = sd->child;
			continue;
		}

		if (sd_flag & SD_BALANCE_WAKE)
			load_idx = sd->wake_idx;

		group = find_idlest_group(sd, p, cpu, load_idx);
		if (!group) {
			sd = sd->child;
			continue;
		}

		new_cpu = find_idlest_cpu(group, p, cpu);
		if (new_cpu == -1 || new_cpu == cpu) {
			/* Now try balancing at a lower domain level of cpu */
			sd = sd->child;
			continue;
		}

		/* Now try balancing at a lower domain level of new_cpu */
		cpu = new_cpu;
		weight = sd->span_weight;
		sd = NULL;
		for_each_domain(cpu, tmp) {
			if (weight <= tmp->span_weight)
				break;
			if (tmp->flags & sd_flag)
				sd = tmp;
		}
		/* while loop will break here if sd == NULL */
	}
unlock:
	rcu_read_unlock();

	return new_cpu;
}

/*
 * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
 * removed when useful for applications beyond shares distribution (e.g.
 * load-balance).
 */
#ifdef CONFIG_FAIR_GROUP_SCHED
/*
 * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
 * cfs_rq_of(p) references at time of call are still valid and identify the
 * previous cpu. However, the caller only guarantees p->pi_lock is held; no
 * other assumptions, including the state of rq->lock, should be made.
 */
static void
migrate_task_rq_fair(struct task_struct *p, int next_cpu)
{
	struct sched_entity *se = &p->se;
	struct cfs_rq *cfs_rq = cfs_rq_of(se);

	/*
	 * Load tracking: accumulate removed load so that it can be processed
	 * when we next update owning cfs_rq under rq->lock. Tasks contribute
	 * to blocked load iff they have a positive decay-count. It can never
	 * be negative here since on-rq tasks have decay-count == 0.
	 */
	if (se->avg.decay_count) {
		se->avg.decay_count = -__synchronize_entity_decay(se);
		atomic64_add(se->avg.load_avg_contrib, &cfs_rq->removed_load);
	}
}
#endif
#endif /* CONFIG_SMP */

static unsigned long
wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
{
	unsigned long gran = sysctl_sched_wakeup_granularity;

	/*
	 * Since it's curr that is running now, convert the gran from
	 * real-time to virtual-time in its units.
	 *
	 * By using 'se' instead of 'curr' we penalize light tasks, so
	 * they get preempted easier. That is, if 'se' < 'curr' then
	 * the resulting gran will be larger, therefore penalizing the
	 * lighter, if otoh 'se' > 'curr' then the resulting gran will
	 * be smaller, again penalizing the lighter task.
	 *
	 * This is especially important for buddies when the leftmost
	 * task is higher priority than the buddy.
	 */
	return calc_delta_fair(gran, se);
}

/*
 * Should 'se' preempt 'curr'.
 *
 *             |s1
 *        |s2
 *   |s3
 *         g
 *      |<--->|c
 *
 *  w(c, s1) = -1
 *  w(c, s2) =  0
 *  w(c, s3) =  1
 *
 */
static int
wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
{
	s64 gran, vdiff = curr->vruntime - se->vruntime;

	if (vdiff <= 0)
		return -1;

	gran = wakeup_gran(curr, se);
	if (vdiff > gran)
		return 1;

	return 0;
}
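
/*
 * Worked example (values assumed): curr->vruntime = 105ms and se->vruntime =
 * 100ms give vdiff = 5ms. With sysctl_sched_wakeup_granularity around 1ms
 * and a nice-0 'se', calc_delta_fair() leaves gran at ~1ms, vdiff > gran,
 * and we return 1: 'se' is far enough to the left to preempt. Had 'se' been
 * very light, calc_delta_fair() would inflate gran (possibly beyond 5ms)
 * and the wakeup would not preempt (return 0).
 */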

static void set_last_buddy(struct sched_entity *se)
{
	if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
		return;

	for_each_sched_entity(se)
		cfs_rq_of(se)->last = se;
}

static void set_next_buddy(struct sched_entity *se)
{
	if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
		return;

	for_each_sched_entity(se)
		cfs_rq_of(se)->next = se;
}

static void set_skip_buddy(struct sched_entity *se)
{
	for_each_sched_entity(se)
		cfs_rq_of(se)->skip = se;
}

/*
 * Preempt the current task with a newly woken task if needed:
 */
static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
{
	struct task_struct *curr = rq->curr;
	struct sched_entity *se = &curr->se, *pse = &p->se;
	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
	int scale = cfs_rq->nr_running >= sched_nr_latency;
	int next_buddy_marked = 0;

	if (unlikely(se == pse))
		return;

	/*
	 * This is possible from callers such as move_task(), in which we
	 * unconditionally check_preempt_curr() after an enqueue (which may
	 * have led to a throttle). This both saves work and prevents false
	 * next-buddy nomination below.
	 */
	if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
		return;

	if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
		set_next_buddy(pse);
		next_buddy_marked = 1;
	}

	/*
	 * We can come here with TIF_NEED_RESCHED already set from new task
	 * wake up path.
	 *
	 * Note: this also catches the edge-case of curr being in a throttled
	 * group (e.g. via set_curr_task), since update_curr() (in the
	 * enqueue of curr) will have resulted in resched being set. This
	 * prevents us from potentially nominating it as a false LAST_BUDDY
	 * below.
	 */
	if (test_tsk_need_resched(curr))
		return;

	/* Idle tasks are by definition preempted by non-idle tasks. */
	if (unlikely(curr->policy == SCHED_IDLE) &&
	    likely(p->policy != SCHED_IDLE))
		goto preempt;

	/*
	 * Batch and idle tasks do not preempt non-idle tasks (their preemption
	 * is driven by the tick):
	 */
	if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
		return;

	find_matching_se(&se, &pse);
	update_curr(cfs_rq_of(se));
	BUG_ON(!pse);
	if (wakeup_preempt_entity(se, pse) == 1) {
		/*
		 * Bias pick_next to pick the sched entity that is
		 * triggering this preemption.
		 */
		if (!next_buddy_marked)
			set_next_buddy(pse);
		goto preempt;
	}

	return;

preempt:
	resched_task(curr);
	/*
	 * Only set the backward buddy when the current task is still
	 * on the rq. This can happen when a wakeup gets interleaved
	 * with schedule on the ->pre_schedule() or idle_balance()
	 * point, either of which can drop the rq lock.
	 *
	 * Also, during early boot the idle thread is in the fair class,
	 * for obvious reasons it's a bad idea to schedule back to it.
	 */
	if (unlikely(!se->on_rq || curr == rq->idle))
		return;

	if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
		set_last_buddy(se);
}

static struct task_struct *pick_next_task_fair(struct rq *rq)
{
	struct task_struct *p;
	struct cfs_rq *cfs_rq = &rq->cfs;
	struct sched_entity *se;

	if (!cfs_rq->nr_running)
		return NULL;

	do {
		se = pick_next_entity(cfs_rq);
		set_next_entity(cfs_rq, se);
		cfs_rq = group_cfs_rq(se);
	} while (cfs_rq);

	p = task_of(se);
	if (hrtick_enabled(rq))
		hrtick_start_fair(rq, p);

	return p;
}

/*
 * Account for a descheduled task:
 */
static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
{
	struct sched_entity *se = &prev->se;
	struct cfs_rq *cfs_rq;

	for_each_sched_entity(se) {
		cfs_rq = cfs_rq_of(se);
		put_prev_entity(cfs_rq, se);
	}
}

/*
 * sched_yield() is very simple
 *
 * The magic of dealing with the ->skip buddy is in pick_next_entity.
 */
static void yield_task_fair(struct rq *rq)
{
	struct task_struct *curr = rq->curr;
	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
	struct sched_entity *se = &curr->se;

	/*
	 * Are we the only task in the tree?
	 */
	if (unlikely(rq->nr_running == 1))
		return;

	clear_buddies(cfs_rq, se);

	if (curr->policy != SCHED_BATCH) {
		update_rq_clock(rq);
		/*
		 * Update run-time statistics of the 'current'.
		 */
		update_curr(cfs_rq);
		/*
		 * Tell update_rq_clock() that we've just updated,
		 * so we don't do microscopic update in schedule()
		 * and double the fastpath cost.
		 */
		rq->skip_clock_update = 1;
	}

	set_skip_buddy(se);
}

static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
{
	struct sched_entity *se = &p->se;

	/* throttled hierarchies are not runnable */
	if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
		return false;

	/* Tell the scheduler that we'd really like pse to run next. */
	set_next_buddy(se);

	yield_task_fair(rq);

	return true;
}

#ifdef CONFIG_SMP
/**************************************************
 * Fair scheduling class load-balancing methods.
 *
 * BASICS
 *
 * The purpose of load-balancing is to achieve the same basic fairness the
 * per-cpu scheduler provides, namely provide a proportional amount of compute
 * time to each task. This is expressed in the following equation:
 *
 *   W_i,n/P_i == W_j,n/P_j for all i,j				(1)
 *
 * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight
 * W_i,0 is defined as:
 *
 *   W_i,0 = \Sum_j w_i,j					(2)
 *
 * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight
 * is derived from the nice value as per prio_to_weight[].
 *
 * The weight average is an exponential decay average of the instantaneous
 * weight:
 *
 *   W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0		(3)
 *
 * P_i is the cpu power (or compute capacity) of cpu i, typically it is the
 * fraction of 'recent' time available for SCHED_OTHER task execution. But it
 * can also include other factors [XXX].
 *
 * To achieve this balance we define a measure of imbalance which follows
 * directly from (1):
 *
 *   imb_i,j = max{ avg(W/P), W_i/P_i } - min{ avg(W/P), W_j/P_j }	(4)
 *
 * We then move tasks around to minimize the imbalance. In the continuous
 * function space it is obvious this converges, in the discrete case we get
 * a few fun cases generally called infeasible weight scenarios.
 *
 * [XXX expand on:
 *     - infeasible weights;
 *     - local vs global optima in the discrete case. ]
 *
 *
 * SCHED DOMAINS
 *
 * In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
 * for all i,j solution, we create a tree of cpus that follows the hardware
 * topology where each level pairs two lower groups (or better). This results
 * in O(log n) layers. Furthermore we reduce the number of cpus going up the
 * tree to only the first of the previous level and we decrease the frequency
 * of load-balance at each level inv. proportional to the number of cpus in
 * the groups.
 *
 * This yields:
 *
 *   log_2 n      1      n
 *   \Sum      { ---  * ---  * 2^i } = O(n)			(5)
 *   i = 0       2^i    2^i
 *                               `- size of each group
 *                 |      `- number of cpus doing load-balance
 *                 `- freq
 *   `- sum over all levels
 *
 * Coupled with a limit on how many tasks we can migrate every balance pass,
 * this makes (5) the runtime complexity of the balancer.
 *
 * An important property here is that each CPU is still (indirectly) connected
 * to every other cpu in at most O(log n) steps:
 *
 * The adjacency matrix of the resulting graph is given by:
 *
 *            log_2 n
 *   A_i,j = \Union    (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1)	(6)
 *            k = 0
 *
 * And you'll find that:
 *
 *   A^(log_2 n)_i,j != 0  for all i,j				(7)
 *
 * Showing there's indeed a path between every cpu in at most O(log n) steps.
 * The task movement gives a factor of O(m), giving a convergence complexity
 * of:
 *
 *   O(nm log n),  n := nr_cpus, m := nr_tasks			(8)
 *
 *
 * WORK CONSERVING
 *
 * In order to avoid CPUs going idle while there's still work to do, new idle
 * balancing is more aggressive and has the newly idle cpu iterate up the domain
 * tree itself instead of relying on other CPUs to bring it work.
 *
 * This adds some complexity to both (5) and (8) but it reduces the total idle
 * time.
 *
 * [XXX more?]
 *
 *
 * CGROUPS
 *
 * Cgroups make a horror show out of (2), instead of a simple sum we get:
 *
 *                                s_k,i
 *   W_i,0 = \Sum_j \Prod_k w_k * -----				(9)
 *                                 S_k
 *
 * Where
 *
 *   s_k,i = \Sum_j w_i,j,k  and  S_k = \Sum_i s_k,i		(10)
 *
 * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i.
 *
 * The big problem is S_k, it's a global sum needed to compute a local (W_i)
 * property.
 *
 * [XXX write more on how we solve this.. _after_ merging pjt's patches that
 *      rewrite all of this once again.]
 */
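
/*
 * Numeric sketch (not built) of the imbalance measure (4) for two cpus with
 * made-up weight averages and powers: W = { 3072, 1024 }, P = { 1024, 1024 }.
 */
#if 0
static long imbalance_demo(void)
{
	long w0 = 3072, w1 = 1024;	/* weight averages W_i */
	long p0 = 1024, p1 = 1024;	/* cpu powers P_i */
	long r0 = w0 / p0;		/* W_0/P_0 = 3 */
	long r1 = w1 / p1;		/* W_1/P_1 = 1 */
	long avg = (r0 + r1) / 2;	/* avg(W/P) = 2 */

	/* imb_0,1 = max{avg, r0} - min{avg, r1} = 3 - 1 = 2 */
	return max(avg, r0) - min(avg, r1);
}
#endif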

static unsigned long __read_mostly max_load_balance_interval = HZ/10;

#define LBF_ALL_PINNED	0x01
#define LBF_NEED_BREAK	0x02
#define LBF_SOME_PINNED 0x04

struct lb_env {
	struct sched_domain	*sd;

	struct rq		*src_rq;
	int			src_cpu;

	int			dst_cpu;
	struct rq		*dst_rq;

	struct cpumask		*dst_grpmask;
	int			new_dst_cpu;
	enum cpu_idle_type	idle;
	long			imbalance;
	/* The set of CPUs under consideration for load-balancing */
	struct cpumask		*cpus;

	unsigned int		flags;

	unsigned int		loop;
	unsigned int		loop_break;
	unsigned int		loop_max;
};

/*
 * move_task - move a task from one runqueue to another runqueue.
 * Both runqueues must be locked.
 */
static void move_task(struct task_struct *p, struct lb_env *env)
{
	deactivate_task(env->src_rq, p, 0);
	set_task_cpu(p, env->dst_cpu);
	activate_task(env->dst_rq, p, 0);
	check_preempt_curr(env->dst_rq, p, 0);
}

/*
 * Is this task likely cache-hot:
 */
static int
task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
{
	s64 delta;

	if (p->sched_class != &fair_sched_class)
		return 0;

	if (unlikely(p->policy == SCHED_IDLE))
		return 0;

	/*
	 * Buddy candidates are cache hot:
	 */
	if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
			(&p->se == cfs_rq_of(&p->se)->next ||
			 &p->se == cfs_rq_of(&p->se)->last))
		return 1;

	if (sysctl_sched_migration_cost == -1)
		return 1;
	if (sysctl_sched_migration_cost == 0)
		return 0;

	delta = now - p->se.exec_start;

	return delta < (s64)sysctl_sched_migration_cost;
}
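
/*
 * Illustrative numbers (assumed): with the default
 * sysctl_sched_migration_cost of 500000ns, a task whose exec_start was
 * 200us ago is still considered cache-hot (delta = 200000 < 500000) and is
 * skipped by regular balancing, while one that last ran 2ms ago is
 * considered cold and may be migrated.
 */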

/*
 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
 */
static
int can_migrate_task(struct task_struct *p, struct lb_env *env)
{
	int tsk_cache_hot = 0;
	/*
	 * We do not migrate tasks that are:
	 * 1) running (obviously), or
	 * 2) cannot be migrated to this CPU due to cpus_allowed, or
	 * 3) are cache-hot on their current CPU.
	 */
	if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
		int new_dst_cpu;

		schedstat_inc(p, se.statistics.nr_failed_migrations_affine);

		/*
		 * Remember if this task can be migrated to any other cpu in
		 * our sched_group. We may want to revisit it if we couldn't
		 * meet load balance goals by pulling other tasks on src_cpu.
		 *
		 * Also avoid computing new_dst_cpu if we have already computed
		 * one in current iteration.
		 */
		if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED))
			return 0;

		new_dst_cpu = cpumask_first_and(env->dst_grpmask,
						tsk_cpus_allowed(p));
		if (new_dst_cpu < nr_cpu_ids) {
			env->flags |= LBF_SOME_PINNED;
			env->new_dst_cpu = new_dst_cpu;
		}
		return 0;
	}

	/* Record that we found at least one task that could run on dst_cpu */
	env->flags &= ~LBF_ALL_PINNED;

	if (task_running(env->src_rq, p)) {
		schedstat_inc(p, se.statistics.nr_failed_migrations_running);
		return 0;
	}

	/*
	 * Aggressive migration if:
	 * 1) task is cache cold, or
	 * 2) too many balance attempts have failed.
	 */

	tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd);
	if (!tsk_cache_hot ||
		env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
#ifdef CONFIG_SCHEDSTATS
		if (tsk_cache_hot) {
			schedstat_inc(env->sd, lb_hot_gained[env->idle]);
			schedstat_inc(p, se.statistics.nr_forced_migrations);
		}
#endif
		return 1;
	}

	if (tsk_cache_hot) {
		schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
		return 0;
	}
	return 1;
}

/*
 * move_one_task tries to move exactly one task from busiest to this_rq, as
 * part of active balancing operations within "domain".
 * Returns 1 if successful and 0 otherwise.
 *
 * Called with both runqueues locked.
 */
static int move_one_task(struct lb_env *env)
{
	struct task_struct *p, *n;

	list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
		if (throttled_lb_pair(task_group(p), env->src_rq->cpu, env->dst_cpu))
			continue;

		if (!can_migrate_task(p, env))
			continue;

		move_task(p, env);
		/*
		 * Right now, this is only the second place move_task()
		 * is called, so we can safely collect move_task()
		 * stats here rather than inside move_task().
		 */
		schedstat_inc(env->sd, lb_gained[env->idle]);
		return 1;
	}
	return 0;
}

static unsigned long task_h_load(struct task_struct *p);

static const unsigned int sched_nr_migrate_break = 32;

/*
 * move_tasks tries to move up to imbalance weighted load from busiest to
 * this_rq, as part of a balancing operation within domain "sd".
 * Returns 1 if successful and 0 otherwise.
 *
 * Called with both runqueues locked.
 */
static int move_tasks(struct lb_env *env)
{
	struct list_head *tasks = &env->src_rq->cfs_tasks;
	struct task_struct *p;
	unsigned long load;
	int pulled = 0;

	if (env->imbalance <= 0)
		return 0;

	while (!list_empty(tasks)) {
		p = list_first_entry(tasks, struct task_struct, se.group_node);

		env->loop++;
		/* We've more or less seen every task there is, call it quits */
		if (env->loop > env->loop_max)
			break;

		/* take a breather every nr_migrate tasks */
		if (env->loop > env->loop_break) {
			env->loop_break += sched_nr_migrate_break;
			env->flags |= LBF_NEED_BREAK;
			break;
		}

		if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
			goto next;

		load = task_h_load(p);

		if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
			goto next;

		if ((load / 2) > env->imbalance)
			goto next;

		if (!can_migrate_task(p, env))
			goto next;

		move_task(p, env);
		pulled++;
		env->imbalance -= load;

#ifdef CONFIG_PREEMPT
		/*
		 * NEWIDLE balancing is a source of latency, so preemptible
		 * kernels will stop after the first task is pulled to minimize
		 * the critical section.
		 */
		if (env->idle == CPU_NEWLY_IDLE)
			break;
#endif

		/*
		 * We only want to steal up to the prescribed amount of
		 * weighted load.
		 */
		if (env->imbalance <= 0)
			break;

		continue;
next:
		list_move_tail(&p->se.group_node, tasks);
	}

	/*
	 * Right now, this is one of only two places move_task() is called,
	 * so we can safely collect move_task() stats here rather than
	 * inside move_task().
	 */
	schedstat_add(env->sd, lb_gained[env->idle], pulled);

	return pulled;
}
4060
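/*
 * Illustrative walk-through of the loop above (hypothetical numbers, not
 * from the original source): with the default sched_nr_migrate_break of
 * 32 and a loop_max of, say, 96, the loop examines tasks 1..32, then
 * sets LBF_NEED_BREAK so the caller can drop and retake the runqueue
 * locks, resumes with tasks 33..64, and so on until env->imbalance is
 * satisfied or loop_max is reached.  The (load / 2) > env->imbalance
 * test skips any task whose weight would overshoot the remaining
 * imbalance by more than a factor of two: with 300 units of imbalance
 * left, a task contributing 700 units of weighted load is passed over.
 */
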
Peter Zijlstra230059de2009-12-17 17:47:12 +01004061#ifdef CONFIG_FAIR_GROUP_SCHED
Peter Zijlstra9e3081c2010-11-15 15:47:02 -08004062/*
4063 * update tg->load_weight by folding this cpu's load_avg
4064 */
Paul Turner48a16752012-10-04 13:18:31 +02004065static void __update_blocked_averages_cpu(struct task_group *tg, int cpu)
Peter Zijlstra9e3081c2010-11-15 15:47:02 -08004066{
Paul Turner48a16752012-10-04 13:18:31 +02004067 struct sched_entity *se = tg->se[cpu];
4068 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
Peter Zijlstra9e3081c2010-11-15 15:47:02 -08004069
Paul Turner48a16752012-10-04 13:18:31 +02004070 /* throttled entities do not contribute to load */
4071 if (throttled_hierarchy(cfs_rq))
4072 return;
Peter Zijlstra9e3081c2010-11-15 15:47:02 -08004073
Paul Turneraff3e492012-10-04 13:18:30 +02004074 update_cfs_rq_blocked_load(cfs_rq, 1);
Peter Zijlstra9e3081c2010-11-15 15:47:02 -08004075
Paul Turner82958362012-10-04 13:18:31 +02004076 if (se) {
4077 update_entity_load_avg(se, 1);
4078 /*
4079 * We pivot on our runnable average having decayed to zero for
4080 * list removal. This generally implies that all our children
4081 * have also been removed (modulo rounding error or bandwidth
4082 * control); however, such cases are rare and we can fix these
4083 * at enqueue.
4084 *
4085 * TODO: fix up out-of-order children on enqueue.
4086 */
4087 if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running)
4088 list_del_leaf_cfs_rq(cfs_rq);
4089 } else {
Paul Turner48a16752012-10-04 13:18:31 +02004090 struct rq *rq = rq_of(cfs_rq);
Paul Turner82958362012-10-04 13:18:31 +02004091 update_rq_runnable_avg(rq, rq->nr_running);
4092 }
Peter Zijlstra9e3081c2010-11-15 15:47:02 -08004093}
4094
Paul Turner48a16752012-10-04 13:18:31 +02004095static void update_blocked_averages(int cpu)
Peter Zijlstra9e3081c2010-11-15 15:47:02 -08004096{
Peter Zijlstra9e3081c2010-11-15 15:47:02 -08004097 struct rq *rq = cpu_rq(cpu);
Paul Turner48a16752012-10-04 13:18:31 +02004098 struct cfs_rq *cfs_rq;
4099 unsigned long flags;
Peter Zijlstra9e3081c2010-11-15 15:47:02 -08004100
Paul Turner48a16752012-10-04 13:18:31 +02004101 raw_spin_lock_irqsave(&rq->lock, flags);
4102 update_rq_clock(rq);
Peter Zijlstra9763b672011-07-13 13:09:25 +02004103 /*
4104 * Iterates the task_group tree in a bottom-up fashion; see
4105 * list_add_leaf_cfs_rq() for details.
4106 */
Paul Turner64660c82011-07-21 09:43:36 -07004107 for_each_leaf_cfs_rq(rq, cfs_rq) {
Paul Turner48a16752012-10-04 13:18:31 +02004108 /*
4109 * Note: We may want to consider periodically releasing
4110 * rq->lock around these updates so that creating many task
4111 * groups does not result in continually extending hold time.
4112 */
4113 __update_blocked_averages_cpu(cfs_rq->tg, rq->cpu);
Paul Turner64660c82011-07-21 09:43:36 -07004114 }
Paul Turner48a16752012-10-04 13:18:31 +02004115
4116 raw_spin_unlock_irqrestore(&rq->lock, flags);
Peter Zijlstra9e3081c2010-11-15 15:47:02 -08004117}
4118
Peter Zijlstra9763b672011-07-13 13:09:25 +02004119/*
4120 * Compute the cpu's hierarchical load factor for each task group.
4121 * This needs to be done in a top-down fashion because the load of a child
4122 * group is a fraction of its parents load.
4123 */
4124static int tg_load_down(struct task_group *tg, void *data)
4125{
4126 unsigned long load;
4127 long cpu = (long)data;
4128
4129 if (!tg->parent) {
4130 load = cpu_rq(cpu)->load.weight;
4131 } else {
4132 load = tg->parent->cfs_rq[cpu]->h_load;
4133 load *= tg->se[cpu]->load.weight;
4134 load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
4135 }
4136
4137 tg->cfs_rq[cpu]->h_load = load;
4138
4139 return 0;
4140}
4141
4142static void update_h_load(long cpu)
4143{
Peter Zijlstraa35b6462012-08-08 21:46:40 +02004144 struct rq *rq = cpu_rq(cpu);
4145 unsigned long now = jiffies;
4146
4147 if (rq->h_load_throttle == now)
4148 return;
4149
4150 rq->h_load_throttle = now;
4151
Peter Zijlstra367456c2012-02-20 21:49:09 +01004152 rcu_read_lock();
Peter Zijlstra9763b672011-07-13 13:09:25 +02004153 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
Peter Zijlstra367456c2012-02-20 21:49:09 +01004154 rcu_read_unlock();
Peter Zijlstra9763b672011-07-13 13:09:25 +02004155}
4156
Peter Zijlstra367456c2012-02-20 21:49:09 +01004157static unsigned long task_h_load(struct task_struct *p)
Peter Zijlstra230059de2009-12-17 17:47:12 +01004158{
Peter Zijlstra367456c2012-02-20 21:49:09 +01004159 struct cfs_rq *cfs_rq = task_cfs_rq(p);
4160 unsigned long load;
Peter Zijlstra230059de2009-12-17 17:47:12 +01004161
Peter Zijlstra367456c2012-02-20 21:49:09 +01004162 load = p->se.load.weight;
4163 load = div_u64(load * cfs_rq->h_load, cfs_rq->load.weight + 1);
Peter Zijlstra230059de2009-12-17 17:47:12 +01004164
Peter Zijlstra367456c2012-02-20 21:49:09 +01004165 return load;
Peter Zijlstra230059de2009-12-17 17:47:12 +01004166}
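
/*
 * Worked example of the hierarchy math above (hypothetical numbers): a
 * task group whose se weight is 1024 sits on a root cfs_rq of total
 * weight 2048, so tg_load_down() gives it
 * h_load = 2048 * 1024 / (2048 + 1) ~= 1023.  A nice-0 task (weight
 * 1024) inside that group, on a group cfs_rq of total weight 2048,
 * then gets task_h_load() ~= 1024 * 1023 / (2048 + 1) ~= 511, i.e.
 * roughly half of the group's share of the cpu, as expected.
 */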
4167#else
Paul Turner48a16752012-10-04 13:18:31 +02004168static inline void update_blocked_averages(int cpu)
Peter Zijlstra9e3081c2010-11-15 15:47:02 -08004169{
4170}
4171
Peter Zijlstra367456c2012-02-20 21:49:09 +01004172static inline void update_h_load(long cpu)
Peter Zijlstra230059de2009-12-17 17:47:12 +01004173{
Peter Zijlstra367456c2012-02-20 21:49:09 +01004174}
4175
4176static unsigned long task_h_load(struct task_struct *p)
4177{
4178 return p->se.load.weight;
Peter Zijlstra230059de2009-12-17 17:47:12 +01004179}
4180#endif
4181
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004182/********** Helpers for find_busiest_group ************************/
4183/*
4184 * sd_lb_stats - Structure to store the statistics of a sched_domain
4185 * during load balancing.
4186 */
4187struct sd_lb_stats {
4188 struct sched_group *busiest; /* Busiest group in this sd */
4189 struct sched_group *this; /* Local group in this sd */
4190 unsigned long total_load; /* Total load of all groups in sd */
4191 unsigned long total_pwr; /* Total power of all groups in sd */
4192 unsigned long avg_load; /* Average load across all groups in sd */
4193
4194 /** Statistics of this group */
4195 unsigned long this_load;
4196 unsigned long this_load_per_task;
4197 unsigned long this_nr_running;
Nikhil Raofab47622010-10-15 13:12:29 -07004198 unsigned long this_has_capacity;
Suresh Siddhaaae6d3d2010-09-17 15:02:32 -07004199 unsigned int this_idle_cpus;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004200
4201 /* Statistics of the busiest group */
Suresh Siddhaaae6d3d2010-09-17 15:02:32 -07004202 unsigned int busiest_idle_cpus;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004203 unsigned long max_load;
4204 unsigned long busiest_load_per_task;
4205 unsigned long busiest_nr_running;
Suresh Siddhadd5feea2010-02-23 16:13:52 -08004206 unsigned long busiest_group_capacity;
Nikhil Raofab47622010-10-15 13:12:29 -07004207 unsigned long busiest_has_capacity;
Suresh Siddhaaae6d3d2010-09-17 15:02:32 -07004208 unsigned int busiest_group_weight;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004209
4210 int group_imb; /* Is there an imbalance in this sd? */
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004211};
4212
4213/*
4214 * sg_lb_stats - stats of a sched_group required for load_balancing
4215 */
4216struct sg_lb_stats {
4217 unsigned long avg_load; /*Avg load across the CPUs of the group */
4218 unsigned long group_load; /* Total load over the CPUs of the group */
4219 unsigned long sum_nr_running; /* Nr tasks running in the group */
4220 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
4221 unsigned long group_capacity;
Suresh Siddhaaae6d3d2010-09-17 15:02:32 -07004222 unsigned long idle_cpus;
4223 unsigned long group_weight;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004224 int group_imb; /* Is there an imbalance in the group ? */
Nikhil Raofab47622010-10-15 13:12:29 -07004225 int group_has_capacity; /* Is there extra capacity in the group? */
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004226};
4227
4228/**
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004229 * get_sd_load_idx - Obtain the load index for a given sched domain.
4230 * @sd: The sched_domain whose load_idx is to be obtained.
4231 * @idle: The idle status of the CPU for whose sd the load_idx is obtained.
4232 */
4233static inline int get_sd_load_idx(struct sched_domain *sd,
4234 enum cpu_idle_type idle)
4235{
4236 int load_idx;
4237
4238 switch (idle) {
4239 case CPU_NOT_IDLE:
4240 load_idx = sd->busy_idx;
4241 break;
4242
4243 case CPU_NEWLY_IDLE:
4244 load_idx = sd->newidle_idx;
4245 break;
4246 default:
4247 load_idx = sd->idle_idx;
4248 break;
4249 }
4250
4251 return load_idx;
4252}
4253
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004254unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
4255{
Nikhil Rao1399fa72011-05-18 10:09:39 -07004256 return SCHED_POWER_SCALE;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004257}
4258
4259unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
4260{
4261 return default_scale_freq_power(sd, cpu);
4262}
4263
4264unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
4265{
Peter Zijlstra669c55e2010-04-16 14:59:29 +02004266 unsigned long weight = sd->span_weight;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004267 unsigned long smt_gain = sd->smt_gain;
4268
4269 smt_gain /= weight;
4270
4271 return smt_gain;
4272}
4273
4274unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
4275{
4276 return default_scale_smt_power(sd, cpu);
4277}
4278
4279unsigned long scale_rt_power(int cpu)
4280{
4281 struct rq *rq = cpu_rq(cpu);
Peter Zijlstrab654f7d2012-05-22 14:04:28 +02004282 u64 total, available, age_stamp, avg;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004283
Peter Zijlstrab654f7d2012-05-22 14:04:28 +02004284 /*
4285 * Since we're reading these variables without serialization make sure
4286 * we read them once before doing sanity checks on them.
4287 */
4288 age_stamp = ACCESS_ONCE(rq->age_stamp);
4289 avg = ACCESS_ONCE(rq->rt_avg);
Venkatesh Pallipadiaa483802010-10-04 17:03:22 -07004290
Peter Zijlstrab654f7d2012-05-22 14:04:28 +02004291 total = sched_avg_period() + (rq->clock - age_stamp);
4292
4293 if (unlikely(total < avg)) {
Venkatesh Pallipadiaa483802010-10-04 17:03:22 -07004294 /* Ensures that power won't end up being negative */
4295 available = 0;
4296 } else {
Peter Zijlstrab654f7d2012-05-22 14:04:28 +02004297 available = total - avg;
Venkatesh Pallipadiaa483802010-10-04 17:03:22 -07004298 }
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004299
Nikhil Rao1399fa72011-05-18 10:09:39 -07004300 if (unlikely((s64)total < SCHED_POWER_SCALE))
4301 total = SCHED_POWER_SCALE;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004302
Nikhil Rao1399fa72011-05-18 10:09:39 -07004303 total >>= SCHED_POWER_SHIFT;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004304
4305 return div_u64(available, total);
4306}
4307
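/*
 * Example with made-up numbers: if the averaging window (total) works
 * out to 500ms and rq->rt_avg says RT/irq time consumed 125ms of it,
 * then available = 375ms, and after total is reduced by
 * SCHED_POWER_SHIFT the function returns roughly
 * 375ms / (500ms / 1024) ~= 768, i.e. about 75% of SCHED_POWER_SCALE
 * is left over for CFS tasks.
 */
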
4308static void update_cpu_power(struct sched_domain *sd, int cpu)
4309{
Peter Zijlstra669c55e2010-04-16 14:59:29 +02004310 unsigned long weight = sd->span_weight;
Nikhil Rao1399fa72011-05-18 10:09:39 -07004311 unsigned long power = SCHED_POWER_SCALE;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004312 struct sched_group *sdg = sd->groups;
4313
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004314 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
4315 if (sched_feat(ARCH_POWER))
4316 power *= arch_scale_smt_power(sd, cpu);
4317 else
4318 power *= default_scale_smt_power(sd, cpu);
4319
Nikhil Rao1399fa72011-05-18 10:09:39 -07004320 power >>= SCHED_POWER_SHIFT;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004321 }
4322
Peter Zijlstra9c3f75c2011-07-14 13:00:06 +02004323 sdg->sgp->power_orig = power;
Srivatsa Vaddagiri9d5efe02010-06-08 14:57:02 +10004324
4325 if (sched_feat(ARCH_POWER))
4326 power *= arch_scale_freq_power(sd, cpu);
4327 else
4328 power *= default_scale_freq_power(sd, cpu);
4329
Nikhil Rao1399fa72011-05-18 10:09:39 -07004330 power >>= SCHED_POWER_SHIFT;
Srivatsa Vaddagiri9d5efe02010-06-08 14:57:02 +10004331
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004332 power *= scale_rt_power(cpu);
Nikhil Rao1399fa72011-05-18 10:09:39 -07004333 power >>= SCHED_POWER_SHIFT;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004334
4335 if (!power)
4336 power = 1;
4337
Peter Zijlstrae51fd5e2010-05-31 12:37:30 +02004338 cpu_rq(cpu)->cpu_power = power;
Peter Zijlstra9c3f75c2011-07-14 13:00:06 +02004339 sdg->sgp->power = power;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004340}
4341
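/*
 * Putting the pieces together (hypothetical SMT2 sibling): with a
 * default smt_gain of 1178 and span_weight == 2,
 * default_scale_smt_power() yields 1178 / 2 = 589, so
 * power = 1024 * 589 >> 10 = 589.  With frequency scaling at unity
 * that is unchanged, and if scale_rt_power() returns 768 (75% left
 * for CFS), the final cpu_power is 589 * 768 >> 10 ~= 441.
 */
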
Peter Zijlstra029632f2011-10-25 10:00:11 +02004342void update_group_power(struct sched_domain *sd, int cpu)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004343{
4344 struct sched_domain *child = sd->child;
4345 struct sched_group *group, *sdg = sd->groups;
4346 unsigned long power;
Vincent Guittot4ec44122011-12-12 20:21:08 +01004347 unsigned long interval;
4348
4349 interval = msecs_to_jiffies(sd->balance_interval);
4350 interval = clamp(interval, 1UL, max_load_balance_interval);
4351 sdg->sgp->next_update = jiffies + interval;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004352
4353 if (!child) {
4354 update_cpu_power(sd, cpu);
4355 return;
4356 }
4357
4358 power = 0;
4359
Peter Zijlstra74a5ce22012-05-23 18:00:43 +02004360 if (child->flags & SD_OVERLAP) {
4361 /*
4362 * SD_OVERLAP domains cannot assume that child groups
4363 * span the current group.
4364 */
4365
4366 for_each_cpu(cpu, sched_group_cpus(sdg))
4367 power += power_of(cpu);
4368 } else {
4369 /*
4370 * !SD_OVERLAP domains can assume that child groups
4371 * span the current group.
4372 */
4373
4374 group = child->groups;
4375 do {
4376 power += group->sgp->power;
4377 group = group->next;
4378 } while (group != child->groups);
4379 }
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004380
Peter Zijlstrac3decf02012-05-31 12:05:32 +02004381 sdg->sgp->power_orig = sdg->sgp->power = power;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004382}
4383
Srivatsa Vaddagiri9d5efe02010-06-08 14:57:02 +10004384/*
4385 * Try and fix up capacity for tiny siblings, this is needed when
4386 * things like SD_ASYM_PACKING need f_b_g to select another sibling
4387 * which on its own isn't powerful enough.
4388 *
4389 * See update_sd_pick_busiest() and check_asym_packing().
4390 */
4391static inline int
4392fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
4393{
4394 /*
Nikhil Rao1399fa72011-05-18 10:09:39 -07004395 * Only siblings can have significantly less than SCHED_POWER_SCALE
Srivatsa Vaddagiri9d5efe02010-06-08 14:57:02 +10004396 */
Peter Zijlstraa6c75f22011-04-07 14:09:52 +02004397 if (!(sd->flags & SD_SHARE_CPUPOWER))
Srivatsa Vaddagiri9d5efe02010-06-08 14:57:02 +10004398 return 0;
4399
4400 /*
4401 * If ~90% of the cpu_power is still there, we're good.
4402 */
Peter Zijlstra9c3f75c2011-07-14 13:00:06 +02004403 if (group->sgp->power * 32 > group->sgp->power_orig * 29)
Srivatsa Vaddagiri9d5efe02010-06-08 14:57:02 +10004404 return 1;
4405
4406 return 0;
4407}
4408
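/*
 * The 32/29 ratio above approximates 90% (hypothetical numbers): for
 * an SMT sibling with power_orig = 589, the group keeps a capacity of
 * 1 as long as its current power exceeds 589 * 29 / 32 ~= 533; below
 * that, RT/irq pressure is deemed to have eaten the sibling's
 * capacity.
 */
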
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004409/**
4410 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
Randy Dunlapcd968912012-06-08 13:18:33 -07004411 * @env: The load balancing environment.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004412 * @group: sched_group whose statistics are to be updated.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004413 * @load_idx: Load index of sched_domain of this_cpu for load calc.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004414 * @local_group: Does group contain this_cpu.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004415 * @balance: Should we balance.
4416 * @sgs: variable to hold the statistics for this group.
4417 */
Peter Zijlstrabd939f42012-05-02 14:20:37 +02004418static inline void update_sg_lb_stats(struct lb_env *env,
4419 struct sched_group *group, int load_idx,
Michael Wangb9403132012-07-12 16:10:13 +08004420 int local_group, int *balance, struct sg_lb_stats *sgs)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004421{
Peter Zijlstrae44bc5c2012-05-11 00:22:12 +02004422 unsigned long nr_running, max_nr_running, min_nr_running;
4423 unsigned long load, max_cpu_load, min_cpu_load;
Peter Zijlstra04f733b2012-05-11 00:12:02 +02004424 unsigned int balance_cpu = -1, first_idle_cpu = 0;
Suresh Siddhadd5feea2010-02-23 16:13:52 -08004425 unsigned long avg_load_per_task = 0;
Peter Zijlstrabd939f42012-05-02 14:20:37 +02004426 int i;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004427
Gautham R Shenoy871e35b2010-01-20 14:02:44 -06004428 if (local_group)
Peter Zijlstrac1174872012-05-31 14:47:33 +02004429 balance_cpu = group_balance_cpu(group);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004430
4431 /* Tally up the load of all CPUs in the group */
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004432 max_cpu_load = 0;
4433 min_cpu_load = ~0UL;
Nikhil Rao2582f0e2010-10-13 12:09:36 -07004434 max_nr_running = 0;
Peter Zijlstrae44bc5c2012-05-11 00:22:12 +02004435 min_nr_running = ~0UL;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004436
Michael Wangb9403132012-07-12 16:10:13 +08004437 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004438 struct rq *rq = cpu_rq(i);
4439
Peter Zijlstrae44bc5c2012-05-11 00:22:12 +02004440 nr_running = rq->nr_running;
4441
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004442 /* Bias balancing toward cpus of our domain */
4443 if (local_group) {
Peter Zijlstrac1174872012-05-31 14:47:33 +02004444 if (idle_cpu(i) && !first_idle_cpu &&
4445 cpumask_test_cpu(i, sched_group_mask(group))) {
Peter Zijlstra04f733b2012-05-11 00:12:02 +02004446 first_idle_cpu = 1;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004447 balance_cpu = i;
4448 }
Peter Zijlstra04f733b2012-05-11 00:12:02 +02004449
4450 load = target_load(i, load_idx);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004451 } else {
4452 load = source_load(i, load_idx);
Peter Zijlstrae44bc5c2012-05-11 00:22:12 +02004453 if (load > max_cpu_load)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004454 max_cpu_load = load;
4455 if (min_cpu_load > load)
4456 min_cpu_load = load;
Peter Zijlstrae44bc5c2012-05-11 00:22:12 +02004457
4458 if (nr_running > max_nr_running)
4459 max_nr_running = nr_running;
4460 if (min_nr_running > nr_running)
4461 min_nr_running = nr_running;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004462 }
4463
4464 sgs->group_load += load;
Peter Zijlstrae44bc5c2012-05-11 00:22:12 +02004465 sgs->sum_nr_running += nr_running;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004466 sgs->sum_weighted_load += weighted_cpuload(i);
Suresh Siddhaaae6d3d2010-09-17 15:02:32 -07004467 if (idle_cpu(i))
4468 sgs->idle_cpus++;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004469 }
4470
4471 /*
4472 * The first idle cpu or the first cpu (busiest) in this sched group
4473 * is eligible for doing load balancing at this and above
4474 * domains. In the newly idle case, we will allow all the cpus
4475 * to do the newly idle load balance.
4476 */
Vincent Guittot4ec44122011-12-12 20:21:08 +01004477 if (local_group) {
Peter Zijlstrabd939f42012-05-02 14:20:37 +02004478 if (env->idle != CPU_NEWLY_IDLE) {
Peter Zijlstra04f733b2012-05-11 00:12:02 +02004479 if (balance_cpu != env->dst_cpu) {
Vincent Guittot4ec44122011-12-12 20:21:08 +01004480 *balance = 0;
4481 return;
4482 }
Peter Zijlstrabd939f42012-05-02 14:20:37 +02004483 update_group_power(env->sd, env->dst_cpu);
Vincent Guittot4ec44122011-12-12 20:21:08 +01004484 } else if (time_after_eq(jiffies, group->sgp->next_update))
Peter Zijlstrabd939f42012-05-02 14:20:37 +02004485 update_group_power(env->sd, env->dst_cpu);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004486 }
4487
4488 /* Adjust by relative CPU power of the group */
Peter Zijlstra9c3f75c2011-07-14 13:00:06 +02004489 sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004490
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004491 /*
4492 * Consider the group unbalanced when the imbalance is larger
Peter Zijlstra866ab432011-02-21 18:56:47 +01004493 * than the average weight of a task.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004494 *
4495 * APZ: with cgroup the avg task weight can vary wildly and
4496 * might not be a suitable number - should we keep a
4497 * normalized nr_running number somewhere that negates
4498 * the hierarchy?
4499 */
Suresh Siddhadd5feea2010-02-23 16:13:52 -08004500 if (sgs->sum_nr_running)
4501 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004502
Peter Zijlstrae44bc5c2012-05-11 00:22:12 +02004503 if ((max_cpu_load - min_cpu_load) >= avg_load_per_task &&
4504 (max_nr_running - min_nr_running) > 1)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004505 sgs->group_imb = 1;
4506
Peter Zijlstra9c3f75c2011-07-14 13:00:06 +02004507 sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power,
Nikhil Rao1399fa72011-05-18 10:09:39 -07004508 SCHED_POWER_SCALE);
Srivatsa Vaddagiri9d5efe02010-06-08 14:57:02 +10004509 if (!sgs->group_capacity)
Peter Zijlstrabd939f42012-05-02 14:20:37 +02004510 sgs->group_capacity = fix_small_capacity(env->sd, group);
Suresh Siddhaaae6d3d2010-09-17 15:02:32 -07004511 sgs->group_weight = group->group_weight;
Nikhil Raofab47622010-10-15 13:12:29 -07004512
4513 if (sgs->group_capacity > sgs->sum_nr_running)
4514 sgs->group_has_capacity = 1;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004515}
4516
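/*
 * Example of the group_imb test above (invented numbers): a two-cpu
 * group where one cpu runs three nice-0 tasks (load 3072) and the
 * other is idle gives avg_load_per_task = 1024,
 * max_cpu_load - min_cpu_load = 3072 and
 * max_nr_running - min_nr_running = 3, so the group is flagged
 * imbalanced and becomes a preferred pull target.
 */
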
4517/**
Michael Neuling532cb4c2010-06-08 14:57:02 +10004518 * update_sd_pick_busiest - return 1 on busiest group
Randy Dunlapcd968912012-06-08 13:18:33 -07004519 * @env: The load balancing environment.
Michael Neuling532cb4c2010-06-08 14:57:02 +10004520 * @sds: sched_domain statistics
4521 * @sg: sched_group candidate to be checked for being the busiest
Michael Neulingb6b12292010-06-10 12:06:21 +10004522 * @sgs: sched_group statistics
Michael Neuling532cb4c2010-06-08 14:57:02 +10004523 *
4524 * Determine if @sg is a busier group than the previously selected
4525 * busiest group.
4526 */
Peter Zijlstrabd939f42012-05-02 14:20:37 +02004527static bool update_sd_pick_busiest(struct lb_env *env,
Michael Neuling532cb4c2010-06-08 14:57:02 +10004528 struct sd_lb_stats *sds,
4529 struct sched_group *sg,
Peter Zijlstrabd939f42012-05-02 14:20:37 +02004530 struct sg_lb_stats *sgs)
Michael Neuling532cb4c2010-06-08 14:57:02 +10004531{
4532 if (sgs->avg_load <= sds->max_load)
4533 return false;
4534
4535 if (sgs->sum_nr_running > sgs->group_capacity)
4536 return true;
4537
4538 if (sgs->group_imb)
4539 return true;
4540
4541 /*
4542 * ASYM_PACKING needs to move all the work to the lowest
4543 * numbered CPUs in the group; therefore mark all groups
4544 * higher than ourself as busy.
4545 */
Peter Zijlstrabd939f42012-05-02 14:20:37 +02004546 if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
4547 env->dst_cpu < group_first_cpu(sg)) {
Michael Neuling532cb4c2010-06-08 14:57:02 +10004548 if (!sds->busiest)
4549 return true;
4550
4551 if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
4552 return true;
4553 }
4554
4555 return false;
4556}
4557
4558/**
Hui Kang461819a2011-10-11 23:00:59 -04004559 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
Randy Dunlapcd968912012-06-08 13:18:33 -07004560 * @env: The load balancing environment.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004561 * @balance: Should we balance.
4562 * @sds: variable to hold the statistics for this sched_domain.
4563 */
Peter Zijlstrabd939f42012-05-02 14:20:37 +02004564static inline void update_sd_lb_stats(struct lb_env *env,
Michael Wangb9403132012-07-12 16:10:13 +08004565 int *balance, struct sd_lb_stats *sds)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004566{
Peter Zijlstrabd939f42012-05-02 14:20:37 +02004567 struct sched_domain *child = env->sd->child;
4568 struct sched_group *sg = env->sd->groups;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004569 struct sg_lb_stats sgs;
4570 int load_idx, prefer_sibling = 0;
4571
4572 if (child && child->flags & SD_PREFER_SIBLING)
4573 prefer_sibling = 1;
4574
Peter Zijlstrabd939f42012-05-02 14:20:37 +02004575 load_idx = get_sd_load_idx(env->sd, env->idle);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004576
4577 do {
4578 int local_group;
4579
Peter Zijlstrabd939f42012-05-02 14:20:37 +02004580 local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004581 memset(&sgs, 0, sizeof(sgs));
Michael Wangb9403132012-07-12 16:10:13 +08004582 update_sg_lb_stats(env, sg, load_idx, local_group, balance, &sgs);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004583
Peter Zijlstra8f190fb2009-12-24 14:18:21 +01004584 if (local_group && !(*balance))
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004585 return;
4586
4587 sds->total_load += sgs.group_load;
Peter Zijlstra9c3f75c2011-07-14 13:00:06 +02004588 sds->total_pwr += sg->sgp->power;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004589
4590 /*
4591 * In case the child domain prefers tasks go to siblings
Michael Neuling532cb4c2010-06-08 14:57:02 +10004592 * first, lower the sg capacity to one so that we'll try
Nikhil Rao75dd3212010-10-15 13:12:30 -07004593 * and move all the excess tasks away. We lower the capacity
4594 * of a group only if the local group has the capacity to fit
4595 * these excess tasks, i.e. nr_running < group_capacity. The
4596 * extra check prevents the case where you always pull from the
4597 * heaviest group when it is already under-utilized (possible
4598 * with a large weight task outweighs the tasks on the system).
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004599 */
Nikhil Rao75dd3212010-10-15 13:12:30 -07004600 if (prefer_sibling && !local_group && sds->this_has_capacity)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004601 sgs.group_capacity = min(sgs.group_capacity, 1UL);
4602
4603 if (local_group) {
4604 sds->this_load = sgs.avg_load;
Michael Neuling532cb4c2010-06-08 14:57:02 +10004605 sds->this = sg;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004606 sds->this_nr_running = sgs.sum_nr_running;
4607 sds->this_load_per_task = sgs.sum_weighted_load;
Nikhil Raofab47622010-10-15 13:12:29 -07004608 sds->this_has_capacity = sgs.group_has_capacity;
Suresh Siddhaaae6d3d2010-09-17 15:02:32 -07004609 sds->this_idle_cpus = sgs.idle_cpus;
Peter Zijlstrabd939f42012-05-02 14:20:37 +02004610 } else if (update_sd_pick_busiest(env, sds, sg, &sgs)) {
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004611 sds->max_load = sgs.avg_load;
Michael Neuling532cb4c2010-06-08 14:57:02 +10004612 sds->busiest = sg;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004613 sds->busiest_nr_running = sgs.sum_nr_running;
Suresh Siddhaaae6d3d2010-09-17 15:02:32 -07004614 sds->busiest_idle_cpus = sgs.idle_cpus;
Suresh Siddhadd5feea2010-02-23 16:13:52 -08004615 sds->busiest_group_capacity = sgs.group_capacity;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004616 sds->busiest_load_per_task = sgs.sum_weighted_load;
Nikhil Raofab47622010-10-15 13:12:29 -07004617 sds->busiest_has_capacity = sgs.group_has_capacity;
Suresh Siddhaaae6d3d2010-09-17 15:02:32 -07004618 sds->busiest_group_weight = sgs.group_weight;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004619 sds->group_imb = sgs.group_imb;
4620 }
4621
Michael Neuling532cb4c2010-06-08 14:57:02 +10004622 sg = sg->next;
Peter Zijlstrabd939f42012-05-02 14:20:37 +02004623 } while (sg != env->sd->groups);
Michael Neuling532cb4c2010-06-08 14:57:02 +10004624}
4625
Michael Neuling532cb4c2010-06-08 14:57:02 +10004626/**
4627 * check_asym_packing - Check to see if the group is packed into the
4628 * sched domain.
4629 *
4630 * This is primarily intended to be used at the sibling level. Some
4631 * cores like POWER7 prefer to use lower numbered SMT threads. In the
4632 * case of POWER7, it can move to lower SMT modes only when higher
4633 * threads are idle. When in lower SMT modes, the threads will
4634 * perform better since they share fewer core resources. Hence when we
4635 * have idle threads, we want them to be the higher ones.
4636 *
4637 * This packing function is run on idle threads. It checks to see if
4638 * the busiest CPU in this domain (core in the P7 case) has a higher
4639 * CPU number than the packing function is being run on. Here we are
4640 * assuming lower CPU number will be equivalent to lower a SMT thread
4641 * number.
4642 *
Michael Neulingb6b12292010-06-10 12:06:21 +10004643 * Returns 1 when packing is required and a task should be moved to
4644 * this CPU. The amount of the imbalance is returned in env->imbalance.
4645 *
Randy Dunlapcd968912012-06-08 13:18:33 -07004646 * @env: The load balancing environment.
Michael Neuling532cb4c2010-06-08 14:57:02 +10004647 * @sds: Statistics of the sched_domain which is to be packed
Michael Neuling532cb4c2010-06-08 14:57:02 +10004648 */
Peter Zijlstrabd939f42012-05-02 14:20:37 +02004649static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
Michael Neuling532cb4c2010-06-08 14:57:02 +10004650{
4651 int busiest_cpu;
4652
Peter Zijlstrabd939f42012-05-02 14:20:37 +02004653 if (!(env->sd->flags & SD_ASYM_PACKING))
Michael Neuling532cb4c2010-06-08 14:57:02 +10004654 return 0;
4655
4656 if (!sds->busiest)
4657 return 0;
4658
4659 busiest_cpu = group_first_cpu(sds->busiest);
Peter Zijlstrabd939f42012-05-02 14:20:37 +02004660 if (env->dst_cpu > busiest_cpu)
Michael Neuling532cb4c2010-06-08 14:57:02 +10004661 return 0;
4662
Peter Zijlstrabd939f42012-05-02 14:20:37 +02004663 env->imbalance = DIV_ROUND_CLOSEST(
4664 sds->max_load * sds->busiest->sgp->power, SCHED_POWER_SCALE);
4665
Michael Neuling532cb4c2010-06-08 14:57:02 +10004666 return 1;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004667}
4668
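/*
 * Sketch of the imbalance computed above (hypothetical values): if the
 * busiest (higher-numbered) sibling carries max_load = 512 at
 * sgp->power = 1024, env->imbalance becomes
 * DIV_ROUND_CLOSEST(512 * 1024, 1024) = 512, i.e. the whole load is
 * offered to the lower-numbered cpu running this function.
 */
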
4669/**
4670 * fix_small_imbalance - Calculate the minor imbalance that exists
4671 * amongst the groups of a sched_domain, during
4672 * load balancing.
Randy Dunlapcd968912012-06-08 13:18:33 -07004673 * @env: The load balancing environment.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004674 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004675 */
Peter Zijlstrabd939f42012-05-02 14:20:37 +02004676static inline
4677void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004678{
4679 unsigned long tmp, pwr_now = 0, pwr_move = 0;
4680 unsigned int imbn = 2;
Suresh Siddhadd5feea2010-02-23 16:13:52 -08004681 unsigned long scaled_busy_load_per_task;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004682
4683 if (sds->this_nr_running) {
4684 sds->this_load_per_task /= sds->this_nr_running;
4685 if (sds->busiest_load_per_task >
4686 sds->this_load_per_task)
4687 imbn = 1;
Peter Zijlstrabd939f42012-05-02 14:20:37 +02004688 } else {
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004689 sds->this_load_per_task =
Peter Zijlstrabd939f42012-05-02 14:20:37 +02004690 cpu_avg_load_per_task(env->dst_cpu);
4691 }
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004692
Suresh Siddhadd5feea2010-02-23 16:13:52 -08004693 scaled_busy_load_per_task = sds->busiest_load_per_task
Nikhil Rao1399fa72011-05-18 10:09:39 -07004694 * SCHED_POWER_SCALE;
Peter Zijlstra9c3f75c2011-07-14 13:00:06 +02004695 scaled_busy_load_per_task /= sds->busiest->sgp->power;
Suresh Siddhadd5feea2010-02-23 16:13:52 -08004696
4697 if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
4698 (scaled_busy_load_per_task * imbn)) {
Peter Zijlstrabd939f42012-05-02 14:20:37 +02004699 env->imbalance = sds->busiest_load_per_task;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004700 return;
4701 }
4702
4703 /*
4704 * OK, we don't have enough imbalance to justify moving tasks;
4705 * however we may be able to increase total CPU power used by
4706 * moving them.
4707 */
4708
Peter Zijlstra9c3f75c2011-07-14 13:00:06 +02004709 pwr_now += sds->busiest->sgp->power *
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004710 min(sds->busiest_load_per_task, sds->max_load);
Peter Zijlstra9c3f75c2011-07-14 13:00:06 +02004711 pwr_now += sds->this->sgp->power *
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004712 min(sds->this_load_per_task, sds->this_load);
Nikhil Rao1399fa72011-05-18 10:09:39 -07004713 pwr_now /= SCHED_POWER_SCALE;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004714
4715 /* Amount of load we'd subtract */
Nikhil Rao1399fa72011-05-18 10:09:39 -07004716 tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
Peter Zijlstra9c3f75c2011-07-14 13:00:06 +02004717 sds->busiest->sgp->power;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004718 if (sds->max_load > tmp)
Peter Zijlstra9c3f75c2011-07-14 13:00:06 +02004719 pwr_move += sds->busiest->sgp->power *
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004720 min(sds->busiest_load_per_task, sds->max_load - tmp);
4721
4722 /* Amount of load we'd add */
Peter Zijlstra9c3f75c2011-07-14 13:00:06 +02004723 if (sds->max_load * sds->busiest->sgp->power <
Nikhil Rao1399fa72011-05-18 10:09:39 -07004724 sds->busiest_load_per_task * SCHED_POWER_SCALE)
Peter Zijlstra9c3f75c2011-07-14 13:00:06 +02004725 tmp = (sds->max_load * sds->busiest->sgp->power) /
4726 sds->this->sgp->power;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004727 else
Nikhil Rao1399fa72011-05-18 10:09:39 -07004728 tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
Peter Zijlstra9c3f75c2011-07-14 13:00:06 +02004729 sds->this->sgp->power;
4730 pwr_move += sds->this->sgp->power *
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004731 min(sds->this_load_per_task, sds->this_load + tmp);
Nikhil Rao1399fa72011-05-18 10:09:39 -07004732 pwr_move /= SCHED_POWER_SCALE;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004733
4734 /* Move if we gain throughput */
4735 if (pwr_move > pwr_now)
Peter Zijlstrabd939f42012-05-02 14:20:37 +02004736 env->imbalance = sds->busiest_load_per_task;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004737}
4738
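/*
 * Example of the first branch above (made-up numbers): the destination
 * is idle (this_load = 0, this_load_per_task = 0, imbn = 2) and the
 * busiest group has two nice-0 tasks on one cpu at full power, so
 * busiest_load_per_task = scaled_busy_load_per_task = 1024 and
 * max_load = 2048.  Then 2048 - 0 + 1024 >= 1024 * 2 holds and
 * env->imbalance is set to 1024: move exactly one task.
 */
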
4739/**
4740 * calculate_imbalance - Calculate the amount of imbalance present within the
4741 * groups of a given sched_domain during load balance.
Peter Zijlstrabd939f42012-05-02 14:20:37 +02004742 * @env: load balance environment
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004743 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004744 */
Peter Zijlstrabd939f42012-05-02 14:20:37 +02004745static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004746{
Suresh Siddhadd5feea2010-02-23 16:13:52 -08004747 unsigned long max_pull, load_above_capacity = ~0UL;
4748
4749 sds->busiest_load_per_task /= sds->busiest_nr_running;
4750 if (sds->group_imb) {
4751 sds->busiest_load_per_task =
4752 min(sds->busiest_load_per_task, sds->avg_load);
4753 }
4754
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004755 /*
4756 * In the presence of smp nice balancing, certain scenarios can have
4757 * max load less than avg load (as we skip the groups at or below
4758 * their cpu_power while calculating max_load).
4759 */
4760 if (sds->max_load < sds->avg_load) {
Peter Zijlstrabd939f42012-05-02 14:20:37 +02004761 env->imbalance = 0;
4762 return fix_small_imbalance(env, sds);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004763 }
4764
Suresh Siddhadd5feea2010-02-23 16:13:52 -08004765 if (!sds->group_imb) {
4766 /*
4767 * Don't want to pull so many tasks that a group would go idle.
4768 */
4769 load_above_capacity = (sds->busiest_nr_running -
4770 sds->busiest_group_capacity);
4771
Nikhil Rao1399fa72011-05-18 10:09:39 -07004772 load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE);
Suresh Siddhadd5feea2010-02-23 16:13:52 -08004773
Peter Zijlstra9c3f75c2011-07-14 13:00:06 +02004774 load_above_capacity /= sds->busiest->sgp->power;
Suresh Siddhadd5feea2010-02-23 16:13:52 -08004775 }
4776
4777 /*
4778 * We're trying to get all the cpus to the average_load, so we don't
4779 * want to push ourselves above the average load, nor do we wish to
4780 * reduce the max loaded cpu below the average load. At the same time,
4781 * we also don't want to reduce the group load below the group capacity
4782 * (so that we can implement power-savings policies etc). Thus we look
4783 * for the minimum possible imbalance.
4784 * Be careful of negative numbers as they'll appear as very large values
4785 * with unsigned longs.
4786 */
4787 max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004788
4789 /* How much load to actually move to equalise the imbalance */
Peter Zijlstrabd939f42012-05-02 14:20:37 +02004790 env->imbalance = min(max_pull * sds->busiest->sgp->power,
Peter Zijlstra9c3f75c2011-07-14 13:00:06 +02004791 (sds->avg_load - sds->this_load) * sds->this->sgp->power)
Nikhil Rao1399fa72011-05-18 10:09:39 -07004792 / SCHED_POWER_SCALE;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004793
4794 /*
4795 * If *imbalance is less than the average load per runnable task, there is
Lucas De Marchi25985ed2011-03-30 22:57:33 -03004796 * no guarantee that any task will be moved, so give fix_small_imbalance()
4797 * a chance to bump its value and force at least one task to be
4798 * moved.
4799 */
Peter Zijlstrabd939f42012-05-02 14:20:37 +02004800 if (env->imbalance < sds->busiest_load_per_task)
4801 return fix_small_imbalance(env, sds);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004802
4803}
Nikhil Raofab47622010-10-15 13:12:29 -07004804
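/*
 * Example of the capacity clamp above (invented numbers): a busiest
 * group with 3 running tasks but group capacity 1 gets
 * load_above_capacity = 2 * SCHED_LOAD_SCALE * SCHED_POWER_SCALE /
 * sgp->power, i.e. 2048 at unit power, and max_pull is then capped by
 * both that and max_load - avg_load, so we never pull the group all
 * the way to idle.
 */
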
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004805/******* find_busiest_group() helpers end here *********************/
4806
4807/**
4808 * find_busiest_group - Returns the busiest group within the sched_domain
4809 * if there is an imbalance. If there isn't an imbalance, and
4810 * the user has opted for power-savings, it returns a group whose
4811 * CPUs can be put to idle by rebalancing those tasks elsewhere, if
4812 * such a group exists.
4813 *
4814 * Also calculates the amount of weighted load which should be moved
4815 * to restore balance.
4816 *
Randy Dunlapcd968912012-06-08 13:18:33 -07004817 * @env: The load balancing environment.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004818 * @balance: Pointer to a variable indicating if this_cpu
4819 * is the appropriate cpu to perform load balancing at this_level.
4820 *
4821 * Returns: - the busiest group if imbalance exists.
4822 * - If no imbalance and user has opted for power-savings balance,
4823 * return the least loaded group whose CPUs can be
4824 * put to idle by rebalancing its tasks onto our group.
4825 */
4826static struct sched_group *
Michael Wangb9403132012-07-12 16:10:13 +08004827find_busiest_group(struct lb_env *env, int *balance)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004828{
4829 struct sd_lb_stats sds;
4830
4831 memset(&sds, 0, sizeof(sds));
4832
4833 /*
4834 * Compute the various statistics relevant for load balancing at
4835 * this level.
4836 */
Michael Wangb9403132012-07-12 16:10:13 +08004837 update_sd_lb_stats(env, balance, &sds);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004838
Peter Zijlstracc57aa82011-02-21 18:55:32 +01004839 /*
4840 * this_cpu is not the appropriate cpu to perform load balancing at
4841 * this level.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004842 */
Peter Zijlstra8f190fb2009-12-24 14:18:21 +01004843 if (!(*balance))
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004844 goto ret;
4845
Peter Zijlstrabd939f42012-05-02 14:20:37 +02004846 if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
4847 check_asym_packing(env, &sds))
Michael Neuling532cb4c2010-06-08 14:57:02 +10004848 return sds.busiest;
4849
Peter Zijlstracc57aa82011-02-21 18:55:32 +01004850 /* There is no busy sibling group to pull tasks from */
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004851 if (!sds.busiest || sds.busiest_nr_running == 0)
4852 goto out_balanced;
4853
Nikhil Rao1399fa72011-05-18 10:09:39 -07004854 sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr;
Ken Chenb0432d82011-04-07 17:23:22 -07004855
Peter Zijlstra866ab432011-02-21 18:56:47 +01004856 /*
4857 * If the busiest group is imbalanced the below checks don't
4858 * work because they assume all things are equal, which typically
4859 * isn't true due to cpus_allowed constraints and the like.
4860 */
4861 if (sds.group_imb)
4862 goto force_balance;
4863
Peter Zijlstracc57aa82011-02-21 18:55:32 +01004864 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
Peter Zijlstrabd939f42012-05-02 14:20:37 +02004865 if (env->idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
Nikhil Raofab47622010-10-15 13:12:29 -07004866 !sds.busiest_has_capacity)
4867 goto force_balance;
4868
Peter Zijlstracc57aa82011-02-21 18:55:32 +01004869 /*
4870 * If the local group is more busy than the selected busiest group
4871 * don't try and pull any tasks.
4872 */
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004873 if (sds.this_load >= sds.max_load)
4874 goto out_balanced;
4875
Peter Zijlstracc57aa82011-02-21 18:55:32 +01004876 /*
4877 * Don't pull any tasks if this group is already above the domain
4878 * average load.
4879 */
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004880 if (sds.this_load >= sds.avg_load)
4881 goto out_balanced;
4882
Peter Zijlstrabd939f42012-05-02 14:20:37 +02004883 if (env->idle == CPU_IDLE) {
Suresh Siddhaaae6d3d2010-09-17 15:02:32 -07004884 /*
4885 * This cpu is idle. If the busiest group doesn't
4886 * have more tasks than the number of available cpus and
4887 * there is no imbalance between this and the busiest group
4888 * wrt idle cpus, it is balanced.
4889 */
Peter Zijlstrac186faf2011-02-21 18:52:53 +01004890 if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) &&
Suresh Siddhaaae6d3d2010-09-17 15:02:32 -07004891 sds.busiest_nr_running <= sds.busiest_group_weight)
4892 goto out_balanced;
Peter Zijlstrac186faf2011-02-21 18:52:53 +01004893 } else {
4894 /*
4895 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
4896 * imbalance_pct to be conservative.
4897 */
Peter Zijlstrabd939f42012-05-02 14:20:37 +02004898 if (100 * sds.max_load <= env->sd->imbalance_pct * sds.this_load)
Peter Zijlstrac186faf2011-02-21 18:52:53 +01004899 goto out_balanced;
Suresh Siddhaaae6d3d2010-09-17 15:02:32 -07004900 }
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004901
Nikhil Raofab47622010-10-15 13:12:29 -07004902force_balance:
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004903 /* Looks like there is an imbalance. Compute it */
Peter Zijlstrabd939f42012-05-02 14:20:37 +02004904 calculate_imbalance(env, &sds);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004905 return sds.busiest;
4906
4907out_balanced:
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004908ret:
Peter Zijlstrabd939f42012-05-02 14:20:37 +02004909 env->imbalance = 0;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004910 return NULL;
4911}
4912
4913/*
4914 * find_busiest_queue - find the busiest runqueue among the cpus in group.
4915 */
Peter Zijlstrabd939f42012-05-02 14:20:37 +02004916static struct rq *find_busiest_queue(struct lb_env *env,
Michael Wangb9403132012-07-12 16:10:13 +08004917 struct sched_group *group)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004918{
4919 struct rq *busiest = NULL, *rq;
4920 unsigned long max_load = 0;
4921 int i;
4922
4923 for_each_cpu(i, sched_group_cpus(group)) {
4924 unsigned long power = power_of(i);
Nikhil Rao1399fa72011-05-18 10:09:39 -07004925 unsigned long capacity = DIV_ROUND_CLOSEST(power,
4926 SCHED_POWER_SCALE);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004927 unsigned long wl;
4928
Srivatsa Vaddagiri9d5efe02010-06-08 14:57:02 +10004929 if (!capacity)
Peter Zijlstrabd939f42012-05-02 14:20:37 +02004930 capacity = fix_small_capacity(env->sd, group);
Srivatsa Vaddagiri9d5efe02010-06-08 14:57:02 +10004931
Michael Wangb9403132012-07-12 16:10:13 +08004932 if (!cpumask_test_cpu(i, env->cpus))
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004933 continue;
4934
4935 rq = cpu_rq(i);
Thomas Gleixner6e40f5b2010-02-16 16:48:56 +01004936 wl = weighted_cpuload(i);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004937
Thomas Gleixner6e40f5b2010-02-16 16:48:56 +01004938 /*
4939 * When comparing with imbalance, use weighted_cpuload()
4940 * which is not scaled with the cpu power.
4941 */
Peter Zijlstrabd939f42012-05-02 14:20:37 +02004942 if (capacity && rq->nr_running == 1 && wl > env->imbalance)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004943 continue;
4944
Thomas Gleixner6e40f5b2010-02-16 16:48:56 +01004945 /*
4946 * For the load comparisons with the other cpu's, consider
4947 * the weighted_cpuload() scaled with the cpu power, so that
4948 * the load can be moved away from the cpu that is potentially
4949 * running at a lower capacity.
4950 */
Nikhil Rao1399fa72011-05-18 10:09:39 -07004951 wl = (wl * SCHED_POWER_SCALE) / power;
Thomas Gleixner6e40f5b2010-02-16 16:48:56 +01004952
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004953 if (wl > max_load) {
4954 max_load = wl;
4955 busiest = rq;
4956 }
4957 }
4958
4959 return busiest;
4960}
4961
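/*
 * Why the power scaling above matters (hypothetical cpus): cpu A at
 * full power 1024 with weighted load 1024 scales to wl = 1024, while
 * cpu B at power 512 with weighted load 768 scales to
 * 768 * 1024 / 512 = 1536, so B is picked as busiest even though its
 * raw load is lower -- it is running closer to (or over) its capacity.
 */
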
4962/*
4963 * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
4964 * so long as it is large enough.
4965 */
4966#define MAX_PINNED_INTERVAL 512
4967
4968/* Working cpumask for load_balance and load_balance_newidle. */
Peter Zijlstra029632f2011-10-25 10:00:11 +02004969DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004970
Peter Zijlstrabd939f42012-05-02 14:20:37 +02004971static int need_active_balance(struct lb_env *env)
Peter Zijlstra1af3ed32009-12-23 15:10:31 +01004972{
Peter Zijlstrabd939f42012-05-02 14:20:37 +02004973 struct sched_domain *sd = env->sd;
4974
4975 if (env->idle == CPU_NEWLY_IDLE) {
Michael Neuling532cb4c2010-06-08 14:57:02 +10004976
4977 /*
4978 * ASYM_PACKING needs to force migrate tasks from busy but
4979 * higher numbered CPUs in order to pack all tasks in the
4980 * lowest numbered CPUs.
4981 */
Peter Zijlstrabd939f42012-05-02 14:20:37 +02004982 if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
Michael Neuling532cb4c2010-06-08 14:57:02 +10004983 return 1;
Peter Zijlstra1af3ed32009-12-23 15:10:31 +01004984 }
4985
4986 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
4987}
4988
Tejun Heo969c7922010-05-06 18:49:21 +02004989static int active_load_balance_cpu_stop(void *data);
4990
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004991/*
4992 * Check this_cpu to ensure it is balanced within domain. Attempt to move
4993 * tasks if there is an imbalance.
4994 */
4995static int load_balance(int this_cpu, struct rq *this_rq,
4996 struct sched_domain *sd, enum cpu_idle_type idle,
4997 int *balance)
4998{
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +05304999 int ld_moved, cur_ld_moved, active_balance = 0;
5000 int lb_iterations, max_lb_iterations;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005001 struct sched_group *group;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005002 struct rq *busiest;
5003 unsigned long flags;
5004 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
5005
Peter Zijlstra8e45cb52012-02-22 12:47:19 +01005006 struct lb_env env = {
5007 .sd = sd,
Peter Zijlstraddcdf6e2012-02-22 19:27:40 +01005008 .dst_cpu = this_cpu,
5009 .dst_rq = this_rq,
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +05305010 .dst_grpmask = sched_group_cpus(sd->groups),
Peter Zijlstra8e45cb52012-02-22 12:47:19 +01005011 .idle = idle,
Peter Zijlstraeb953082012-04-17 13:38:40 +02005012 .loop_break = sched_nr_migrate_break,
Michael Wangb9403132012-07-12 16:10:13 +08005013 .cpus = cpus,
Peter Zijlstra8e45cb52012-02-22 12:47:19 +01005014 };
5015
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005016 cpumask_copy(cpus, cpu_active_mask);
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +05305017 max_lb_iterations = cpumask_weight(env.dst_grpmask);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005018
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005019 schedstat_inc(sd, lb_count[idle]);
5020
5021redo:
Michael Wangb9403132012-07-12 16:10:13 +08005022 group = find_busiest_group(&env, balance);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005023
5024 if (*balance == 0)
5025 goto out_balanced;
5026
5027 if (!group) {
5028 schedstat_inc(sd, lb_nobusyg[idle]);
5029 goto out_balanced;
5030 }
5031
Michael Wangb9403132012-07-12 16:10:13 +08005032 busiest = find_busiest_queue(&env, group);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005033 if (!busiest) {
5034 schedstat_inc(sd, lb_nobusyq[idle]);
5035 goto out_balanced;
5036 }
5037
Michael Wang78feefc2012-08-06 16:41:59 +08005038 BUG_ON(busiest == env.dst_rq);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005039
Peter Zijlstrabd939f42012-05-02 14:20:37 +02005040 schedstat_add(sd, lb_imbalance[idle], env.imbalance);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005041
5042 ld_moved = 0;
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +05305043 lb_iterations = 1;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005044 if (busiest->nr_running > 1) {
5045 /*
5046 * Attempt to move tasks. If find_busiest_group has found
5047 * an imbalance but busiest->nr_running <= 1, the group is
5048 * still unbalanced. ld_moved simply stays zero, so it is
5049 * correctly treated as an imbalance.
5050 */
Peter Zijlstra8e45cb52012-02-22 12:47:19 +01005051 env.flags |= LBF_ALL_PINNED;
Peter Zijlstrac82513e2012-04-26 13:12:27 +02005052 env.src_cpu = busiest->cpu;
5053 env.src_rq = busiest;
5054 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
Peter Zijlstra8e45cb52012-02-22 12:47:19 +01005055
Peter Zijlstraa35b6462012-08-08 21:46:40 +02005056 update_h_load(env.src_cpu);
Peter Zijlstra5d6523e2012-03-10 00:07:36 +01005057more_balance:
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005058 local_irq_save(flags);
Michael Wang78feefc2012-08-06 16:41:59 +08005059 double_rq_lock(env.dst_rq, busiest);
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +05305060
5061 /*
5062 * cur_ld_moved - load moved in current iteration
5063 * ld_moved - cumulative load moved across iterations
5064 */
5065 cur_ld_moved = move_tasks(&env);
5066 ld_moved += cur_ld_moved;
Michael Wang78feefc2012-08-06 16:41:59 +08005067 double_rq_unlock(env.dst_rq, busiest);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005068 local_irq_restore(flags);
5069
Peter Zijlstra5d6523e2012-03-10 00:07:36 +01005070 if (env.flags & LBF_NEED_BREAK) {
5071 env.flags &= ~LBF_NEED_BREAK;
5072 goto more_balance;
5073 }
5074
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005075 /*
5076 * some other cpu did the load balance for us.
5077 */
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +05305078 if (cur_ld_moved && env.dst_cpu != smp_processor_id())
5079 resched_cpu(env.dst_cpu);
5080
5081 /*
5082 * Revisit (affine) tasks on src_cpu that couldn't be moved to
5083 * us and move them to an alternate dst_cpu in our sched_group
5084 * where they can run. The upper limit on how many times we
5085 * iterate on same src_cpu is dependent on number of cpus in our
5086 * sched_group.
5087 *
5088 * This changes load balance semantics a bit on who can move
5089 * load to a given_cpu. In addition to the given_cpu itself
5090 * (or an ilb_cpu acting on its behalf where given_cpu is
5091 * nohz-idle), we now have balance_cpu in a position to move
5092 * load to given_cpu. In rare situations, this may cause
5093 * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
5094 * _independently_ and at the _same_ time to move some load to
5095 * given_cpu) causing excess load to be moved to given_cpu.
5096 * This however should not happen so much in practice and
5097 * moreover subsequent load balance cycles should correct the
5098 * excess load moved.
5099 */
5100 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 &&
5101 lb_iterations++ < max_lb_iterations) {
5102
Michael Wang78feefc2012-08-06 16:41:59 +08005103 env.dst_rq = cpu_rq(env.new_dst_cpu);
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +05305104 env.dst_cpu = env.new_dst_cpu;
5105 env.flags &= ~LBF_SOME_PINNED;
5106 env.loop = 0;
5107 env.loop_break = sched_nr_migrate_break;
5108 /*
5109 * Go back to "more_balance" rather than "redo" since we
5110 * need to continue with same src_cpu.
5111 */
5112 goto more_balance;
5113 }
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005114
5115 /* All tasks on this runqueue were pinned by CPU affinity */
Peter Zijlstra8e45cb52012-02-22 12:47:19 +01005116 if (unlikely(env.flags & LBF_ALL_PINNED)) {
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005117 cpumask_clear_cpu(cpu_of(busiest), cpus);
Prashanth Nageshappabbf18b12012-06-19 17:52:07 +05305118 if (!cpumask_empty(cpus)) {
5119 env.loop = 0;
5120 env.loop_break = sched_nr_migrate_break;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005121 goto redo;
Prashanth Nageshappabbf18b12012-06-19 17:52:07 +05305122 }
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005123 goto out_balanced;
5124 }
5125 }
5126
5127 if (!ld_moved) {
5128 schedstat_inc(sd, lb_failed[idle]);
Venkatesh Pallipadi58b26c42010-09-10 18:19:17 -07005129 /*
5130 * Increment the failure counter only on periodic balance.
5131 * We do not want newidle balance, which can be very
5132 * frequent, pollute the failure counter causing
5133 * excessive cache_hot migrations and active balances.
5134 */
5135 if (idle != CPU_NEWLY_IDLE)
5136 sd->nr_balance_failed++;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005137
Peter Zijlstrabd939f42012-05-02 14:20:37 +02005138 if (need_active_balance(&env)) {
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005139 raw_spin_lock_irqsave(&busiest->lock, flags);
5140
Tejun Heo969c7922010-05-06 18:49:21 +02005141 /* don't kick the active_load_balance_cpu_stop
5142 * if the curr task on busiest cpu can't be
5143 * moved to this_cpu
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005144 */
5145 if (!cpumask_test_cpu(this_cpu,
Peter Zijlstrafa17b502011-06-16 12:23:22 +02005146 tsk_cpus_allowed(busiest->curr))) {
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005147 raw_spin_unlock_irqrestore(&busiest->lock,
5148 flags);
Peter Zijlstra8e45cb52012-02-22 12:47:19 +01005149 env.flags |= LBF_ALL_PINNED;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005150 goto out_one_pinned;
5151 }
5152
Tejun Heo969c7922010-05-06 18:49:21 +02005153 /*
5154 * ->active_balance synchronizes accesses to
5155 * ->active_balance_work. Once set, it's cleared
5156 * only after active load balance is finished.
5157 */
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005158 if (!busiest->active_balance) {
5159 busiest->active_balance = 1;
5160 busiest->push_cpu = this_cpu;
5161 active_balance = 1;
5162 }
5163 raw_spin_unlock_irqrestore(&busiest->lock, flags);
Tejun Heo969c7922010-05-06 18:49:21 +02005164
Peter Zijlstrabd939f42012-05-02 14:20:37 +02005165 if (active_balance) {
Tejun Heo969c7922010-05-06 18:49:21 +02005166 stop_one_cpu_nowait(cpu_of(busiest),
5167 active_load_balance_cpu_stop, busiest,
5168 &busiest->active_balance_work);
Peter Zijlstrabd939f42012-05-02 14:20:37 +02005169 }
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005170
5171 /*
5172 * We've kicked active balancing, reset the failure
5173 * counter.
5174 */
5175 sd->nr_balance_failed = sd->cache_nice_tries+1;
5176 }
5177 } else
5178 sd->nr_balance_failed = 0;
5179
5180 if (likely(!active_balance)) {
5181 /* We were unbalanced, so reset the balancing interval */
5182 sd->balance_interval = sd->min_interval;
5183 } else {
5184 /*
5185 * If we've begun active balancing, start to back off. This
5186 * case may not be covered by the all_pinned logic if there
5187 * is only 1 task on the busy runqueue (because we don't call
5188 * move_tasks).
5189 */
5190 if (sd->balance_interval < sd->max_interval)
5191 sd->balance_interval *= 2;
5192 }
5193
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005194 goto out;
5195
5196out_balanced:
5197 schedstat_inc(sd, lb_balanced[idle]);
5198
5199 sd->nr_balance_failed = 0;
5200
5201out_one_pinned:
5202 /* tune up the balancing interval */
Peter Zijlstra8e45cb52012-02-22 12:47:19 +01005203 if (((env.flags & LBF_ALL_PINNED) &&
Peter Zijlstra5b54b562011-09-22 15:23:13 +02005204 sd->balance_interval < MAX_PINNED_INTERVAL) ||
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005205 (sd->balance_interval < sd->max_interval))
5206 sd->balance_interval *= 2;
5207
Venkatesh Pallipadi46e49b32011-02-14 14:38:50 -08005208 ld_moved = 0;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005209out:
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005210 return ld_moved;
5211}
5212
/*
 * idle_balance is called by schedule() if this_cpu is about to become
 * idle. Attempts to pull tasks from other CPUs.
 */
void idle_balance(int this_cpu, struct rq *this_rq)
{
	struct sched_domain *sd;
	int pulled_task = 0;
	unsigned long next_balance = jiffies + HZ;

	this_rq->idle_stamp = this_rq->clock;

	if (this_rq->avg_idle < sysctl_sched_migration_cost)
		return;

	update_rq_runnable_avg(this_rq, 1);

	/*
	 * Drop the rq->lock, but keep IRQ/preempt disabled.
	 */
	raw_spin_unlock(&this_rq->lock);

	update_blocked_averages(this_cpu);
	rcu_read_lock();
	for_each_domain(this_cpu, sd) {
		unsigned long interval;
		int balance = 1;

		if (!(sd->flags & SD_LOAD_BALANCE))
			continue;

		if (sd->flags & SD_BALANCE_NEWIDLE) {
			/* If we've pulled tasks over, stop searching: */
			pulled_task = load_balance(this_cpu, this_rq,
						   sd, CPU_NEWLY_IDLE, &balance);
		}

		interval = msecs_to_jiffies(sd->balance_interval);
		if (time_after(next_balance, sd->last_balance + interval))
			next_balance = sd->last_balance + interval;
		if (pulled_task) {
			this_rq->idle_stamp = 0;
			break;
		}
	}
	rcu_read_unlock();

	raw_spin_lock(&this_rq->lock);

	if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
		/*
		 * We are going idle. next_balance may be set based on
		 * a busy processor. So reset next_balance.
		 */
		this_rq->next_balance = next_balance;
	}
}

/*
 * active_load_balance_cpu_stop is run by the cpu stopper. It pushes
 * running tasks off the busiest CPU onto idle CPUs. It requires at
 * least 1 task to be running on each physical CPU where possible, and
 * avoids physical / logical imbalances.
 */
static int active_load_balance_cpu_stop(void *data)
{
	struct rq *busiest_rq = data;
	int busiest_cpu = cpu_of(busiest_rq);
	int target_cpu = busiest_rq->push_cpu;
	struct rq *target_rq = cpu_rq(target_cpu);
	struct sched_domain *sd;

	raw_spin_lock_irq(&busiest_rq->lock);

	/* make sure the requested cpu hasn't gone down in the meantime */
	if (unlikely(busiest_cpu != smp_processor_id() ||
		     !busiest_rq->active_balance))
		goto out_unlock;

	/* Is there any task to move? */
	if (busiest_rq->nr_running <= 1)
		goto out_unlock;

	/*
	 * This condition is "impossible"; if it occurs
	 * we need to fix it. Originally reported by
	 * Bjorn Helgaas on a 128-cpu setup.
	 */
	BUG_ON(busiest_rq == target_rq);

	/* move a task from busiest_rq to target_rq */
	double_lock_balance(busiest_rq, target_rq);

	/* Search for an sd spanning us and the target CPU. */
	rcu_read_lock();
	for_each_domain(target_cpu, sd) {
		if ((sd->flags & SD_LOAD_BALANCE) &&
		    cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
			break;
	}

	if (likely(sd)) {
		struct lb_env env = {
			.sd		= sd,
			.dst_cpu	= target_cpu,
			.dst_rq		= target_rq,
			.src_cpu	= busiest_rq->cpu,
			.src_rq		= busiest_rq,
			.idle		= CPU_IDLE,
		};

		schedstat_inc(sd, alb_count);

		if (move_one_task(&env))
			schedstat_inc(sd, alb_pushed);
		else
			schedstat_inc(sd, alb_failed);
	}
	rcu_read_unlock();
	double_unlock_balance(busiest_rq, target_rq);
out_unlock:
	busiest_rq->active_balance = 0;
	raw_spin_unlock_irq(&busiest_rq->lock);
	return 0;
}

#ifdef CONFIG_NO_HZ
/*
 * idle load balancing details
 * - When one of the busy CPUs notices that idle rebalancing may be
 *   needed, it kicks the idle load balancer, which then does idle
 *   load balancing for all the idle CPUs.
 */
static struct {
	cpumask_var_t idle_cpus_mask;
	atomic_t nr_cpus;
	unsigned long next_balance;	/* in jiffy units */
} nohz ____cacheline_aligned;

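/*
 * Pick the idle load balancer: the first cpu in nohz.idle_cpus_mask
 * that is still idle, or nr_cpu_ids when no usable cpu is found.
 */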
static inline int find_new_ilb(int call_cpu)
{
	int ilb = cpumask_first(nohz.idle_cpus_mask);

	if (ilb < nr_cpu_ids && idle_cpu(ilb))
		return ilb;

	return nr_cpu_ids;
}

/*
 * Kick a CPU to do the nohz balancing, if it is time for it. We pick
 * the first idle CPU in nohz.idle_cpus_mask (if there is one) as the
 * kickee.
 */
static void nohz_balancer_kick(int cpu)
{
	int ilb_cpu;

	nohz.next_balance++;

	ilb_cpu = find_new_ilb(cpu);

	if (ilb_cpu >= nr_cpu_ids)
		return;

	if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
		return;
	/*
	 * Use smp_send_reschedule() instead of resched_cpu().
	 * This way we generate a sched IPI on the target cpu which
	 * is idle. And the softirq performing nohz idle load balance
	 * will be run before returning from the IPI.
	 */
	smp_send_reschedule(ilb_cpu);
	return;
}

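/*
 * Undo nohz_balance_enter_idle(): drop this cpu from the set of
 * tickless-idle cpus that the idle load balancer walks.
 */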
static inline void nohz_balance_exit_idle(int cpu)
{
	if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
		cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
		atomic_dec(&nohz.nr_cpus);
		clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
	}
}

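/*
 * On the first busy tick after idle, count this cpu as busy again in
 * every sched group it belongs to, walking up the domain hierarchy.
 */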
static inline void set_cpu_sd_state_busy(void)
{
	struct sched_domain *sd;
	int cpu = smp_processor_id();

	if (!test_bit(NOHZ_IDLE, nohz_flags(cpu)))
		return;
	clear_bit(NOHZ_IDLE, nohz_flags(cpu));

	rcu_read_lock();
	for_each_domain(cpu, sd)
		atomic_inc(&sd->groups->sgp->nr_busy_cpus);
	rcu_read_unlock();
}

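/*
 * The counterpart of set_cpu_sd_state_busy(): drop this cpu from the
 * nr_busy_cpus count of every sched group it belongs to.
 */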
void set_cpu_sd_state_idle(void)
{
	struct sched_domain *sd;
	int cpu = smp_processor_id();

	if (test_bit(NOHZ_IDLE, nohz_flags(cpu)))
		return;
	set_bit(NOHZ_IDLE, nohz_flags(cpu));

	rcu_read_lock();
	for_each_domain(cpu, sd)
		atomic_dec(&sd->groups->sgp->nr_busy_cpus);
	rcu_read_unlock();
}

/*
 * This routine records that the cpu is going idle with its tick stopped.
 * This info is used when performing idle load balancing in the future.
 */
void nohz_balance_enter_idle(int cpu)
{
	/*
	 * If this cpu is going down, then nothing needs to be done.
	 */
	if (!cpu_active(cpu))
		return;

	if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
		return;

	cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
	atomic_inc(&nohz.nr_cpus);
	set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
}

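/*
 * Hotplug notifier: a cpu that is going down must not linger in the
 * nohz idle-cpus bookkeeping.
 */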
static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb,
					unsigned long action, void *hcpu)
{
	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_DYING:
		nohz_balance_exit_idle(smp_processor_id());
		return NOTIFY_OK;
	default:
		return NOTIFY_DONE;
	}
}
#endif

static DEFINE_SPINLOCK(balancing);

/*
 * Scale the max load_balance interval with the number of CPUs in the system.
 * This trades load-balance latency on larger machines for less cross talk.
 */
void update_max_interval(void)
{
	max_load_balance_interval = HZ * num_online_cpus() / 10;
}

/*
 * rebalance_domains() checks each scheduling domain to see if it is due
 * to be balanced, and initiates a balancing operation if so.
 *
 * Balancing parameters are set up when the sched domains are built.
 */
static void rebalance_domains(int cpu, enum cpu_idle_type idle)
{
	int balance = 1;
	struct rq *rq = cpu_rq(cpu);
	unsigned long interval;
	struct sched_domain *sd;
	/* Earliest time when we have to do rebalance again */
	unsigned long next_balance = jiffies + 60*HZ;
	int update_next_balance = 0;
	int need_serialize;

	update_blocked_averages(cpu);

	rcu_read_lock();
	for_each_domain(cpu, sd) {
		if (!(sd->flags & SD_LOAD_BALANCE))
			continue;

		interval = sd->balance_interval;
		if (idle != CPU_IDLE)
			interval *= sd->busy_factor;

		/* scale ms to jiffies */
		interval = msecs_to_jiffies(interval);
		interval = clamp(interval, 1UL, max_load_balance_interval);

		need_serialize = sd->flags & SD_SERIALIZE;

		if (need_serialize) {
			if (!spin_trylock(&balancing))
				goto out;
		}

		if (time_after_eq(jiffies, sd->last_balance + interval)) {
			if (load_balance(cpu, rq, sd, idle, &balance)) {
				/*
				 * We've pulled tasks over, so we're no
				 * longer idle.
				 */
				idle = CPU_NOT_IDLE;
			}
			sd->last_balance = jiffies;
		}
		if (need_serialize)
			spin_unlock(&balancing);
out:
		if (time_after(next_balance, sd->last_balance + interval)) {
			next_balance = sd->last_balance + interval;
			update_next_balance = 1;
		}

		/*
		 * Stop the load balance at this level. There is another
		 * CPU in our sched group which is doing load balancing more
		 * actively.
		 */
		if (!balance)
			break;
	}
	rcu_read_unlock();

	/*
	 * next_balance will be updated only when there is a need.
	 * When the cpu is attached to a null domain, for example, it will
	 * not be updated.
	 */
	if (likely(update_next_balance))
		rq->next_balance = next_balance;
}

#ifdef CONFIG_NO_HZ
/*
 * In the CONFIG_NO_HZ case, the idle balance kickee will do the
 * rebalancing for all the cpus whose scheduler ticks are stopped.
 */
static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
{
	struct rq *this_rq = cpu_rq(this_cpu);
	struct rq *rq;
	int balance_cpu;

	if (idle != CPU_IDLE ||
	    !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
		goto end;

	for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
		if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
			continue;

		/*
		 * If this cpu gets work to do, stop the load balancing
		 * work being done for other cpus. Next load
		 * balancing owner will pick it up.
		 */
		if (need_resched())
			break;

		rq = cpu_rq(balance_cpu);

		raw_spin_lock_irq(&rq->lock);
		update_rq_clock(rq);
		update_idle_cpu_load(rq);
		raw_spin_unlock_irq(&rq->lock);

		rebalance_domains(balance_cpu, CPU_IDLE);

		if (time_after(this_rq->next_balance, rq->next_balance))
			this_rq->next_balance = rq->next_balance;
	}
	nohz.next_balance = this_rq->next_balance;
end:
	clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
}

/*
 * Current heuristic for kicking the idle load balancer in the presence
 * of an idle cpu in the system.
 * - This rq has more than one task.
 * - At any scheduler domain level, this cpu's scheduler group has multiple
 *   busy cpus exceeding the group's power.
 * - For SD_ASYM_PACKING, if the lower numbered cpus in the scheduler
 *   domain span are idle.
 */
static inline int nohz_kick_needed(struct rq *rq, int cpu)
{
	unsigned long now = jiffies;
	struct sched_domain *sd;

	if (unlikely(idle_cpu(cpu)))
		return 0;

	/*
	 * We may have recently been in ticked or tickless idle mode. At
	 * the first busy tick after returning from idle, we will update
	 * the busy stats.
	 */
	set_cpu_sd_state_busy();
	nohz_balance_exit_idle(cpu);

	/*
	 * None are in tickless mode and hence no need for NOHZ idle load
	 * balancing.
	 */
	if (likely(!atomic_read(&nohz.nr_cpus)))
		return 0;

	if (time_before(now, nohz.next_balance))
		return 0;

	if (rq->nr_running >= 2)
		goto need_kick;

	rcu_read_lock();
	for_each_domain(cpu, sd) {
		struct sched_group *sg = sd->groups;
		struct sched_group_power *sgp = sg->sgp;
		int nr_busy = atomic_read(&sgp->nr_busy_cpus);

		if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1)
			goto need_kick_unlock;

		if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight
		    && (cpumask_first_and(nohz.idle_cpus_mask,
					  sched_domain_span(sd)) < cpu))
			goto need_kick_unlock;

		if (!(sd->flags & (SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING)))
			break;
	}
	rcu_read_unlock();
	return 0;

need_kick_unlock:
	rcu_read_unlock();
need_kick:
	return 1;
}
#else
static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
#endif

/*
 * run_rebalance_domains is triggered when needed from the scheduler tick.
 * Also triggered for nohz idle balancing (with NOHZ_BALANCE_KICK set).
 */
static void run_rebalance_domains(struct softirq_action *h)
{
	int this_cpu = smp_processor_id();
	struct rq *this_rq = cpu_rq(this_cpu);
	enum cpu_idle_type idle = this_rq->idle_balance ?
						CPU_IDLE : CPU_NOT_IDLE;

	rebalance_domains(this_cpu, idle);

	/*
	 * If this cpu has a pending nohz_balance_kick, then do the
	 * balancing on behalf of the other idle cpus whose ticks are
	 * stopped.
	 */
	nohz_idle_balance(this_cpu, idle);
}

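/* A cpu attached to the NULL domain has no sched domains to balance. */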
static inline int on_null_domain(int cpu)
{
	return !rcu_dereference_sched(cpu_rq(cpu)->sd);
}

/*
 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
 */
void trigger_load_balance(struct rq *rq, int cpu)
{
	/* Don't need to rebalance while attached to NULL domain */
	if (time_after_eq(jiffies, rq->next_balance) &&
	    likely(!on_null_domain(cpu)))
		raise_softirq(SCHED_SOFTIRQ);
#ifdef CONFIG_NO_HZ
	if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
		nohz_balancer_kick(cpu);
#endif
}

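/*
 * A cpu entering or leaving the active set changes the number of online
 * cpus, so re-derive the scaled scheduler tunables.
 */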
static void rq_online_fair(struct rq *rq)
{
	update_sysctl();
}

static void rq_offline_fair(struct rq *rq)
{
	update_sysctl();

	/* Ensure any throttled groups are reachable by pick_next_task */
	unthrottle_offline_cfs_rqs(rq);
}

#endif /* CONFIG_SMP */

/*
 * scheduler tick hitting a task of our scheduling class:
 */
static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
{
	struct cfs_rq *cfs_rq;
	struct sched_entity *se = &curr->se;

	for_each_sched_entity(se) {
		cfs_rq = cfs_rq_of(se);
		entity_tick(cfs_rq, se, queued);
	}

	if (sched_feat_numa(NUMA))
		task_tick_numa(rq, curr);

	update_rq_runnable_avg(rq, 1);
}

/*
 * called on fork with the child task as argument from the parent's context
 *  - child not yet on the tasklist
 *  - preemption disabled
 */
static void task_fork_fair(struct task_struct *p)
{
	struct cfs_rq *cfs_rq;
	struct sched_entity *se = &p->se, *curr;
	int this_cpu = smp_processor_id();
	struct rq *rq = this_rq();
	unsigned long flags;

	raw_spin_lock_irqsave(&rq->lock, flags);

	update_rq_clock(rq);

	cfs_rq = task_cfs_rq(current);
	curr = cfs_rq->curr;

	if (unlikely(task_cpu(p) != this_cpu)) {
		rcu_read_lock();
		__set_task_cpu(p, this_cpu);
		rcu_read_unlock();
	}

	update_curr(cfs_rq);

	if (curr)
		se->vruntime = curr->vruntime;
	place_entity(cfs_rq, se, 1);

	if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
		/*
		 * Upon rescheduling, sched_class::put_prev_task() will place
		 * 'current' within the tree based on its new key value.
		 */
		swap(curr->vruntime, se->vruntime);
		resched_task(rq->curr);
	}

	se->vruntime -= cfs_rq->min_vruntime;

	raw_spin_unlock_irqrestore(&rq->lock, flags);
}

/*
 * Priority of the task has changed. Check to see if we preempt
 * the current task.
 */
static void
prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
{
	if (!p->se.on_rq)
		return;

	/*
	 * Reschedule if we are currently running on this runqueue and
	 * our priority decreased, or if we are not currently running on
	 * this runqueue and our priority is higher than the current task's.
	 */
	if (rq->curr == p) {
		if (p->prio > oldprio)
			resched_task(rq->curr);
	} else
		check_preempt_curr(rq, p, 0);
}

static void switched_from_fair(struct rq *rq, struct task_struct *p)
{
	struct sched_entity *se = &p->se;
	struct cfs_rq *cfs_rq = cfs_rq_of(se);

	/*
	 * Ensure the task's vruntime is normalized, so that when it is
	 * switched back to the fair class the enqueue_entity(.flags=0) will
	 * do the right thing.
	 *
	 * If it was on_rq, then the dequeue_entity(.flags=0) will already
	 * have normalized the vruntime; if it was !on_rq, then only when
	 * the task is sleeping will it still have non-normalized vruntime.
	 */
	if (!se->on_rq && p->state != TASK_RUNNING) {
		/*
		 * Fix up our vruntime so that the current sleep doesn't
		 * cause 'unlimited' sleep bonus.
		 */
		place_entity(cfs_rq, se, 0);
		se->vruntime -= cfs_rq->min_vruntime;
	}

#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
	/*
	 * Remove our load from contribution when we leave sched_fair
	 * and ensure we don't carry in an old decay_count if we
	 * switch back.
	 */
	if (p->se.avg.decay_count) {
		struct cfs_rq *cfs_rq = cfs_rq_of(&p->se);
		__synchronize_entity_decay(&p->se);
		subtract_blocked_load_contrib(cfs_rq,
				p->se.avg.load_avg_contrib);
	}
#endif
}

/*
 * We switched to the sched_fair class.
 */
static void switched_to_fair(struct rq *rq, struct task_struct *p)
{
	if (!p->se.on_rq)
		return;

	/*
	 * We were most likely switched from sched_rt, so
	 * kick off the schedule if running, otherwise just see
	 * if we can still preempt the current task.
	 */
	if (rq->curr == p)
		resched_task(rq->curr);
	else
		check_preempt_curr(rq, p, 0);
}

/*
 * Account for a task changing its policy or group.
 *
 * This routine is mostly called to set the cfs_rq->curr field when a task
 * migrates between groups/classes.
 */
static void set_curr_task_fair(struct rq *rq)
{
	struct sched_entity *se = &rq->curr->se;

	for_each_sched_entity(se) {
		struct cfs_rq *cfs_rq = cfs_rq_of(se);

		set_next_entity(cfs_rq, se);
		/* ensure bandwidth has been allocated on our new cfs_rq */
		account_cfs_rq_runtime(cfs_rq, 0);
	}
}

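/*
 * Initialize an empty cfs_rq. min_vruntime starts a little below the
 * u64 wrap point so that vruntime wrap-around handling gets exercised
 * early rather than after months of uptime.
 */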
void init_cfs_rq(struct cfs_rq *cfs_rq)
{
	cfs_rq->tasks_timeline = RB_ROOT;
	cfs_rq->min_vruntime = (u64)(-(1LL << 20));
#ifndef CONFIG_64BIT
	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
#endif
#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
	atomic64_set(&cfs_rq->decay_counter, 1);
	atomic64_set(&cfs_rq->removed_load, 0);
#endif
}

#ifdef CONFIG_FAIR_GROUP_SCHED
static void task_move_group_fair(struct task_struct *p, int on_rq)
{
	struct cfs_rq *cfs_rq;
	/*
	 * If the task was not on the rq at the time of this cgroup movement
	 * it must have been asleep; sleeping tasks keep their ->vruntime
	 * absolute on their old rq until wakeup (needed for the fair sleeper
	 * bonus in place_entity()).
	 *
	 * If it was on the rq, we've just 'preempted' it, which does convert
	 * ->vruntime to a relative base.
	 *
	 * Make sure both cases convert their relative position when migrating
	 * to another cgroup's rq. This does somewhat interfere with the
	 * fair sleeper stuff for the first placement, but who cares.
	 */
	/*
	 * When !on_rq, the task's vruntime has usually NOT been normalized.
	 * But there are some cases where it has already been normalized:
	 *
	 * - Moving a forked child which is waiting for being woken up by
	 *   wake_up_new_task().
	 * - Moving a task which has been woken up by try_to_wake_up() and
	 *   waiting for actually being woken up by sched_ttwu_pending().
	 *
	 * To prevent boost or penalty in the new cfs_rq caused by delta
	 * min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
	 */
	if (!on_rq && (!p->se.sum_exec_runtime || p->state == TASK_WAKING))
		on_rq = 1;

	if (!on_rq)
		p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
	set_task_rq(p, task_cpu(p));
	if (!on_rq) {
		cfs_rq = cfs_rq_of(&p->se);
		p->se.vruntime += cfs_rq->min_vruntime;
#ifdef CONFIG_SMP
		/*
		 * migrate_task_rq_fair() will have removed our previous
		 * contribution, but we must synchronize for ongoing future
		 * decay.
		 */
		p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
		cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib;
#endif
	}
}

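/* Tear down a task group's per-cpu cfs runqueues and group entities. */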
void free_fair_sched_group(struct task_group *tg)
{
	int i;

	destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));

	for_each_possible_cpu(i) {
		if (tg->cfs_rq)
			kfree(tg->cfs_rq[i]);
		if (tg->se)
			kfree(tg->se[i]);
	}

	kfree(tg->cfs_rq);
	kfree(tg->se);
}

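/*
 * Allocate a cfs_rq and a sched_entity for each possible cpu and link
 * them under the parent group. Returns 1 on success, 0 on allocation
 * failure.
 */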
int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
{
	struct cfs_rq *cfs_rq;
	struct sched_entity *se;
	int i;

	tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
	if (!tg->cfs_rq)
		goto err;
	tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
	if (!tg->se)
		goto err;

	tg->shares = NICE_0_LOAD;

	init_cfs_bandwidth(tg_cfs_bandwidth(tg));

	for_each_possible_cpu(i) {
		cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
				      GFP_KERNEL, cpu_to_node(i));
		if (!cfs_rq)
			goto err;

		se = kzalloc_node(sizeof(struct sched_entity),
				  GFP_KERNEL, cpu_to_node(i));
		if (!se)
			goto err_free_rq;

		init_cfs_rq(cfs_rq);
		init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
	}

	return 1;

err_free_rq:
	kfree(cfs_rq);
err:
	return 0;
}

void unregister_fair_sched_group(struct task_group *tg, int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	unsigned long flags;

	/*
	 * Only empty task groups can be destroyed; so we can speculatively
	 * check on_list without danger of it being re-added.
	 */
	if (!tg->cfs_rq[cpu]->on_list)
		return;

	raw_spin_lock_irqsave(&rq->lock, flags);
	list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
	raw_spin_unlock_irqrestore(&rq->lock, flags);
}

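/*
 * Wire one cpu's cfs_rq and group entity into the task group and the
 * enclosing runqueue hierarchy.
 */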
void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
			struct sched_entity *se, int cpu,
			struct sched_entity *parent)
{
	struct rq *rq = cpu_rq(cpu);

	cfs_rq->tg = tg;
	cfs_rq->rq = rq;
	init_cfs_rq_runtime(cfs_rq);

	tg->cfs_rq[cpu] = cfs_rq;
	tg->se[cpu] = se;

	/* se could be NULL for root_task_group */
	if (!se)
		return;

	if (!parent)
		se->cfs_rq = &rq->cfs;
	else
		se->cfs_rq = parent->my_q;

	se->my_q = cfs_rq;
	update_load_set(&se->load, 0);
	se->parent = parent;
}

static DEFINE_MUTEX(shares_mutex);

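/*
 * Update the group's weight and propagate the change through each cpu's
 * entity hierarchy; serialized by shares_mutex.
 */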
int sched_group_set_shares(struct task_group *tg, unsigned long shares)
{
	int i;
	unsigned long flags;

	/*
	 * We can't change the weight of the root cgroup.
	 */
	if (!tg->se[0])
		return -EINVAL;

	shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));

	mutex_lock(&shares_mutex);
	if (tg->shares == shares)
		goto done;

	tg->shares = shares;
	for_each_possible_cpu(i) {
		struct rq *rq = cpu_rq(i);
		struct sched_entity *se;

		se = tg->se[i];
		/* Propagate contribution to hierarchy */
		raw_spin_lock_irqsave(&rq->lock, flags);
		for_each_sched_entity(se)
			update_cfs_shares(group_cfs_rq(se));
		raw_spin_unlock_irqrestore(&rq->lock, flags);
	}

done:
	mutex_unlock(&shares_mutex);
	return 0;
}
#else /* CONFIG_FAIR_GROUP_SCHED */

void free_fair_sched_group(struct task_group *tg) { }

int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
{
	return 1;
}

void unregister_fair_sched_group(struct task_group *tg, int cpu) { }

#endif /* CONFIG_FAIR_GROUP_SCHED */

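/*
 * Used by sched_rr_get_interval() to report an approximation of this
 * task's current timeslice.
 */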
static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
{
	struct sched_entity *se = &task->se;
	unsigned int rr_interval = 0;

	/*
	 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
	 * idle runqueue:
	 */
	if (rq->cfs.load.weight)
		rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));

	return rr_interval;
}

/*
 * All the scheduling class methods:
 */
const struct sched_class fair_sched_class = {
	.next = &idle_sched_class,
	.enqueue_task = enqueue_task_fair,
	.dequeue_task = dequeue_task_fair,
	.yield_task = yield_task_fair,
	.yield_to_task = yield_to_task_fair,

	.check_preempt_curr = check_preempt_wakeup,

	.pick_next_task = pick_next_task_fair,
	.put_prev_task = put_prev_task_fair,

#ifdef CONFIG_SMP
	.select_task_rq = select_task_rq_fair,
#ifdef CONFIG_FAIR_GROUP_SCHED
	.migrate_task_rq = migrate_task_rq_fair,
#endif
	.rq_online = rq_online_fair,
	.rq_offline = rq_offline_fair,

	.task_waking = task_waking_fair,
#endif

	.set_curr_task = set_curr_task_fair,
	.task_tick = task_tick_fair,
	.task_fork = task_fork_fair,

	.prio_changed = prio_changed_fair,
	.switched_from = switched_from_fair,
	.switched_to = switched_to_fair,

	.get_rr_interval = get_rr_interval_fair,

#ifdef CONFIG_FAIR_GROUP_SCHED
	.task_move_group = task_move_group_fair,
#endif
};

#ifdef CONFIG_SCHED_DEBUG
void print_cfs_stats(struct seq_file *m, int cpu)
{
	struct cfs_rq *cfs_rq;

	rcu_read_lock();
	for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
		print_cfs_rq(m, cpu, cfs_rq);
	rcu_read_unlock();
}
#endif

__init void init_sched_fair_class(void)
{
#ifdef CONFIG_SMP
	open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);

#ifdef CONFIG_NO_HZ
	nohz.next_balance = jiffies;
	zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
	cpu_notifier(sched_ilb_notifier, 0);
#endif
#endif /* SMP */
}