/*
 * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
 *
 *  Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
 *
 *  Interactivity improvements by Mike Galbraith
 *  (C) 2007 Mike Galbraith <efault@gmx.de>
 *
 *  Various enhancements by Dmitry Adamushko.
 *  (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
 *
 *  Group scheduling enhancements by Srivatsa Vaddagiri
 *  Copyright IBM Corporation, 2007
 *  Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
 *
 *  Scaled math optimizations by Thomas Gleixner
 *  Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
 *
 *  Adaptive scheduling granularity, math enhancements by Peter Zijlstra
 *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
 */

#include <linux/latencytop.h>
#include <linux/sched.h>
#include <linux/cpumask.h>
#include <linux/slab.h>
#include <linux/profile.h>
#include <linux/interrupt.h>
#include <linux/mempolicy.h>
#include <linux/migrate.h>
#include <linux/task_work.h>

#include <trace/events/sched.h>

#include "sched.h"

/*
 * Targeted preemption latency for CPU-bound tasks:
 * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
 *
 * NOTE: this latency value is not the same as the concept of
 * 'timeslice length' - timeslices in CFS are of variable length
 * and have no persistent notion like in traditional, time-slice
 * based scheduling concepts.
 *
 * (to see the precise effective timeslice length of your workload,
 *  run vmstat and monitor the context-switches (cs) field)
 */
unsigned int sysctl_sched_latency = 6000000ULL;
unsigned int normalized_sysctl_sched_latency = 6000000ULL;

/*
 * The initial- and re-scaling of tunables is configurable
 * (default: SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)))
 *
 * Options are:
 * SCHED_TUNABLESCALING_NONE - unscaled, always *1
 * SCHED_TUNABLESCALING_LOG - scaled logarithmically, *(1+ilog(ncpus))
 * SCHED_TUNABLESCALING_LINEAR - scaled linearly, *ncpus
 */
enum sched_tunable_scaling sysctl_sched_tunable_scaling
	= SCHED_TUNABLESCALING_LOG;
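
/*
 * Worked example (illustration, not part of the original source): with
 * the default LOG scaling and 8 or more online CPUs, the factor is
 * 1 + ilog2(8) = 4, so the effective sysctl_sched_latency becomes
 * 4 * 6ms = 24ms and sysctl_sched_min_granularity 4 * 0.75ms = 3ms.
 */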

/*
 * Minimal preemption granularity for CPU-bound tasks:
 * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
 */
unsigned int sysctl_sched_min_granularity = 750000ULL;
unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;

/*
 * This value is kept at sysctl_sched_latency / sysctl_sched_min_granularity
 * (with the defaults above: 6ms / 0.75ms = 8).
 */
static unsigned int sched_nr_latency = 8;

/*
 * After fork, child runs first. If set to 0 (default) then
 * parent will (try to) run first.
 */
unsigned int sysctl_sched_child_runs_first __read_mostly;

/*
 * SCHED_OTHER wake-up granularity.
 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
 *
 * This option delays the preemption effects of decoupled workloads
 * and reduces their over-scheduling. Synchronous workloads will still
 * have immediate wakeup/sleep latencies.
 */
unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;

const_debug unsigned int sysctl_sched_migration_cost = 500000UL;

/*
 * The exponential sliding window over which load is averaged for shares
 * distribution.
 * (default: 10msec)
 */
unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;

#ifdef CONFIG_CFS_BANDWIDTH
/*
 * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
 * each time a cfs_rq requests quota.
 *
 * Note: in the case that the slice exceeds the runtime remaining (either due
 * to consumption or the quota being specified to be smaller than the slice)
 * we will always only issue the remaining available time.
 *
 * default: 5 msec, units: microseconds
 */
unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
#endif

/*
 * Increase the granularity value when there are more CPUs,
 * because with more CPUs the 'effective latency' as visible
 * to users decreases. But the relationship is not linear,
 * so pick a second-best guess by going with the log2 of the
 * number of CPUs.
 *
 * This idea comes from the SD scheduler of Con Kolivas:
 */
static int get_update_sysctl_factor(void)
{
	unsigned int cpus = min_t(int, num_online_cpus(), 8);
	unsigned int factor;

	switch (sysctl_sched_tunable_scaling) {
	case SCHED_TUNABLESCALING_NONE:
		factor = 1;
		break;
	case SCHED_TUNABLESCALING_LINEAR:
		factor = cpus;
		break;
	case SCHED_TUNABLESCALING_LOG:
	default:
		factor = 1 + ilog2(cpus);
		break;
	}

	return factor;
}
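
/*
 * Note (illustration, not part of the original source): cpus is clamped
 * to 8 in get_update_sysctl_factor(), so the LOG factor saturates at
 * 1 + ilog2(8) = 4 and the LINEAR factor at 8, regardless of how many
 * CPUs are actually online.
 */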

static void update_sysctl(void)
{
	unsigned int factor = get_update_sysctl_factor();

#define SET_SYSCTL(name) \
	(sysctl_##name = (factor) * normalized_sysctl_##name)
	SET_SYSCTL(sched_min_granularity);
	SET_SYSCTL(sched_latency);
	SET_SYSCTL(sched_wakeup_granularity);
#undef SET_SYSCTL
}

void sched_init_granularity(void)
{
	update_sysctl();
}

#if BITS_PER_LONG == 32
# define WMULT_CONST	(~0UL)
#else
# define WMULT_CONST	(1UL << 32)
#endif

#define WMULT_SHIFT	32

/*
 * Shift right and round:
 */
#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
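
/*
 * Worked example (illustration, not part of the original source):
 * SRR() adds half of the final unit before shifting, so it rounds to
 * nearest rather than truncating: SRR(7, 2) = (7 + 2) >> 2 = 2,
 * whereas a plain 7 >> 2 would yield 1.
 */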

/*
 * delta *= weight / lw
 */
static unsigned long
calc_delta_mine(unsigned long delta_exec, unsigned long weight,
		struct load_weight *lw)
{
	u64 tmp;

	/*
	 * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
	 * entities since MIN_SHARES = 2. Treat weight as 1 if less than
	 * 2^SCHED_LOAD_RESOLUTION.
	 */
	if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
		tmp = (u64)delta_exec * scale_load_down(weight);
	else
		tmp = (u64)delta_exec;

	if (!lw->inv_weight) {
		unsigned long w = scale_load_down(lw->weight);

		if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
			lw->inv_weight = 1;
		else if (unlikely(!w))
			lw->inv_weight = WMULT_CONST;
		else
			lw->inv_weight = WMULT_CONST / w;
	}

	/*
	 * Check whether we'd overflow the 64-bit multiplication:
	 */
	if (unlikely(tmp > WMULT_CONST))
		tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
			WMULT_SHIFT/2);
	else
		tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);

	return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
}
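
/*
 * Worked example (illustration, not part of the original source;
 * assumes scale_load_down() is a no-op, i.e. low load resolution):
 * for delta_exec = 1000000 (1ms), weight = 1024 (the nice-0 weight)
 * and lw->weight = 2048, inv_weight = 2^32 / 2048 and the result is
 * roughly (1000000 * 1024 * inv_weight) >> 32 = 500000, i.e. delta
 * is scaled by weight/lw->weight = 1/2 in fixed-point arithmetic.
 */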


const struct sched_class fair_sched_class;

/**************************************************************
 * CFS operations on generic schedulable entities:
 */

#ifdef CONFIG_FAIR_GROUP_SCHED

/* cpu runqueue to which this cfs_rq is attached */
static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
{
	return cfs_rq->rq;
}

/* An entity is a task if it doesn't "own" a runqueue */
#define entity_is_task(se)	(!se->my_q)

static inline struct task_struct *task_of(struct sched_entity *se)
{
#ifdef CONFIG_SCHED_DEBUG
	WARN_ON_ONCE(!entity_is_task(se));
#endif
	return container_of(se, struct task_struct, se);
}

/* Walk up scheduling entities hierarchy */
#define for_each_sched_entity(se) \
		for (; se; se = se->parent)

static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
{
	return p->se.cfs_rq;
}

/* runqueue on which this entity is (to be) queued */
static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
{
	return se->cfs_rq;
}

/* runqueue "owned" by this group */
static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
{
	return grp->my_q;
}

static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
				       int force_update);

static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
	if (!cfs_rq->on_list) {
		/*
		 * Ensure we either appear before our parent (if already
		 * enqueued) or force our parent to appear after us when it is
		 * enqueued.  The fact that we always enqueue bottom-up
		 * reduces this to two cases.
		 */
		if (cfs_rq->tg->parent &&
		    cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
			list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
				&rq_of(cfs_rq)->leaf_cfs_rq_list);
		} else {
			list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
				&rq_of(cfs_rq)->leaf_cfs_rq_list);
		}

		cfs_rq->on_list = 1;
		/* We should have no load, but we need to update last_decay. */
		update_cfs_rq_blocked_load(cfs_rq, 0);
	}
}

static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
	if (cfs_rq->on_list) {
		list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
		cfs_rq->on_list = 0;
	}
}

/* Iterate through all leaf cfs_rq's on a runqueue */
#define for_each_leaf_cfs_rq(rq, cfs_rq) \
	list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)

/* Do the two (enqueued) entities belong to the same group? */
static inline int
is_same_group(struct sched_entity *se, struct sched_entity *pse)
{
	if (se->cfs_rq == pse->cfs_rq)
		return 1;

	return 0;
}

static inline struct sched_entity *parent_entity(struct sched_entity *se)
{
	return se->parent;
}

/* return depth at which a sched entity is present in the hierarchy */
static inline int depth_se(struct sched_entity *se)
{
	int depth = 0;

	for_each_sched_entity(se)
		depth++;

	return depth;
}

static void
find_matching_se(struct sched_entity **se, struct sched_entity **pse)
{
	int se_depth, pse_depth;

	/*
	 * The preemption test can only be made between sibling entities
	 * that are in the same cfs_rq, i.e. that have a common parent.
	 * Walk up the hierarchy of both tasks until we find ancestors
	 * that are siblings under a common parent.
	 */

	/* First walk up until both entities are at same depth */
	se_depth = depth_se(*se);
	pse_depth = depth_se(*pse);

	while (se_depth > pse_depth) {
		se_depth--;
		*se = parent_entity(*se);
	}

	while (pse_depth > se_depth) {
		pse_depth--;
		*pse = parent_entity(*pse);
	}

	while (!is_same_group(*se, *pse)) {
		*se = parent_entity(*se);
		*pse = parent_entity(*pse);
	}
}

#else	/* !CONFIG_FAIR_GROUP_SCHED */

static inline struct task_struct *task_of(struct sched_entity *se)
{
	return container_of(se, struct task_struct, se);
}

static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
{
	return container_of(cfs_rq, struct rq, cfs);
}

#define entity_is_task(se)	1

#define for_each_sched_entity(se) \
		for (; se; se = NULL)

static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
{
	return &task_rq(p)->cfs;
}

static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
{
	struct task_struct *p = task_of(se);
	struct rq *rq = task_rq(p);

	return &rq->cfs;
}

/* runqueue "owned" by this group */
static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
{
	return NULL;
}

static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
}

static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
}

#define for_each_leaf_cfs_rq(rq, cfs_rq) \
		for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)

static inline int
is_same_group(struct sched_entity *se, struct sched_entity *pse)
{
	return 1;
}

static inline struct sched_entity *parent_entity(struct sched_entity *se)
{
	return NULL;
}

static inline void
find_matching_se(struct sched_entity **se, struct sched_entity **pse)
{
}

#endif	/* CONFIG_FAIR_GROUP_SCHED */

static __always_inline
void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec);

/**************************************************************
 * Scheduling class tree data structure manipulation methods:
 */

static inline u64 max_vruntime(u64 min_vruntime, u64 vruntime)
{
	s64 delta = (s64)(vruntime - min_vruntime);
	if (delta > 0)
		min_vruntime = vruntime;

	return min_vruntime;
}

static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
{
	s64 delta = (s64)(vruntime - min_vruntime);
	if (delta < 0)
		min_vruntime = vruntime;

	return min_vruntime;
}

static inline int entity_before(struct sched_entity *a,
				struct sched_entity *b)
{
	return (s64)(a->vruntime - b->vruntime) < 0;
}
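
/*
 * Note (illustration, not part of the original source): comparing the
 * signed difference rather than the raw u64 values keeps these
 * comparisons correct even after vruntime wraps around.
 */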

static void update_min_vruntime(struct cfs_rq *cfs_rq)
{
	u64 vruntime = cfs_rq->min_vruntime;

	if (cfs_rq->curr)
		vruntime = cfs_rq->curr->vruntime;

	if (cfs_rq->rb_leftmost) {
		struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
						   struct sched_entity,
						   run_node);

		if (!cfs_rq->curr)
			vruntime = se->vruntime;
		else
			vruntime = min_vruntime(vruntime, se->vruntime);
	}

	cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
#ifndef CONFIG_64BIT
	smp_wmb();
	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
#endif
}
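
/*
 * Note (illustration, not part of the original source): because
 * max_vruntime() is used for the final assignment above,
 * cfs_rq->min_vruntime only ever moves forward; the copy written
 * after the barrier lets 32-bit readers detect a torn 64-bit read
 * without taking the runqueue lock.
 */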

/*
 * Enqueue an entity into the rb-tree:
 */
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
	struct rb_node *parent = NULL;
	struct sched_entity *entry;
	int leftmost = 1;

	/*
	 * Find the right place in the rbtree:
	 */
	while (*link) {
		parent = *link;
		entry = rb_entry(parent, struct sched_entity, run_node);
		/*
		 * We don't care about collisions. Nodes with
		 * the same key stay together.
		 */
		if (entity_before(se, entry)) {
			link = &parent->rb_left;
		} else {
			link = &parent->rb_right;
			leftmost = 0;
		}
	}

	/*
	 * Maintain a cache of leftmost tree entries (it is frequently
	 * used):
	 */
	if (leftmost)
		cfs_rq->rb_leftmost = &se->run_node;

	rb_link_node(&se->run_node, parent, link);
	rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
}

static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	if (cfs_rq->rb_leftmost == &se->run_node) {
		struct rb_node *next_node;

		next_node = rb_next(&se->run_node);
		cfs_rq->rb_leftmost = next_node;
	}

	rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
}

struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
{
	struct rb_node *left = cfs_rq->rb_leftmost;

	if (!left)
		return NULL;

	return rb_entry(left, struct sched_entity, run_node);
}

static struct sched_entity *__pick_next_entity(struct sched_entity *se)
{
	struct rb_node *next = rb_next(&se->run_node);

	if (!next)
		return NULL;

	return rb_entry(next, struct sched_entity, run_node);
}

#ifdef CONFIG_SCHED_DEBUG
struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
{
	struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);

	if (!last)
		return NULL;

	return rb_entry(last, struct sched_entity, run_node);
}

/**************************************************************
 * Scheduling class statistics methods:
 */

int sched_proc_update_handler(struct ctl_table *table, int write,
		void __user *buffer, size_t *lenp,
		loff_t *ppos)
{
	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
	int factor = get_update_sysctl_factor();

	if (ret || !write)
		return ret;

	sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
					sysctl_sched_min_granularity);

#define WRT_SYSCTL(name) \
	(normalized_sysctl_##name = sysctl_##name / (factor))
	WRT_SYSCTL(sched_min_granularity);
	WRT_SYSCTL(sched_latency);
	WRT_SYSCTL(sched_wakeup_granularity);
#undef WRT_SYSCTL

	return 0;
}
#endif

/*
 * delta /= w
 */
static inline unsigned long
calc_delta_fair(unsigned long delta, struct sched_entity *se)
{
	if (unlikely(se->load.weight != NICE_0_LOAD))
		delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load);

	return delta;
}
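
/*
 * Note (illustration, not part of the original source): a nice-0 task
 * has load.weight == NICE_0_LOAD, so the branch above is skipped and
 * its vruntime advances exactly at wall-clock rate; heavier entities
 * have their delta shrunk, lighter ones have it stretched.
 */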

/*
 * The idea is to set a period in which each task runs once.
 *
 * When there are too many tasks (sched_nr_latency) we have to stretch
 * this period because otherwise the slices get too small.
 *
 * p = (nr <= nl) ? l : l*nr/nl
 */
static u64 __sched_period(unsigned long nr_running)
{
	u64 period = sysctl_sched_latency;
	unsigned long nr_latency = sched_nr_latency;

	if (unlikely(nr_running > nr_latency)) {
		period = sysctl_sched_min_granularity;
		period *= nr_running;
	}

	return period;
}
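
/*
 * Worked example (illustration, not part of the original source): with
 * the defaults sysctl_sched_latency = 6ms, sched_nr_latency = 8 and
 * sysctl_sched_min_granularity = 0.75ms, 10 runnable tasks exceed
 * nr_latency, so the period stretches to 10 * 0.75ms = 7.5ms; with 8
 * or fewer tasks it stays at 6ms.
 */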

/*
 * We calculate the wall-time slice from the period by taking a part
 * proportional to the weight.
 *
 * s = p*P[w/rw]
 */
static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);

	for_each_sched_entity(se) {
		struct load_weight *load;
		struct load_weight lw;

		cfs_rq = cfs_rq_of(se);
		load = &cfs_rq->load;

		if (unlikely(!se->on_rq)) {
			lw = cfs_rq->load;

			update_load_add(&lw, se->load.weight);
			load = &lw;
		}
		slice = calc_delta_mine(slice, se->load.weight, load);
	}
	return slice;
}
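
/*
 * Worked example (illustration, not part of the original source): two
 * runnable nice-0 tasks within a 6ms period each get a slice of
 * 6ms * 1024/2048 = 3ms; if one of them were twice as heavy, it would
 * get 6ms * 2/3 = 4ms and the other 2ms.
 */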

/*
 * We calculate the vruntime slice of a to-be-inserted task.
 *
 * vs = s/w
 */
static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	return calc_delta_fair(sched_slice(cfs_rq, se), se);
}

/*
 * Update the current task's runtime statistics. Skip current tasks that
 * are not in our scheduling class.
 */
static inline void
__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
	      unsigned long delta_exec)
{
	unsigned long delta_exec_weighted;

	schedstat_set(curr->statistics.exec_max,
		      max((u64)delta_exec, curr->statistics.exec_max));

	curr->sum_exec_runtime += delta_exec;
	schedstat_add(cfs_rq, exec_clock, delta_exec);
	delta_exec_weighted = calc_delta_fair(delta_exec, curr);

	curr->vruntime += delta_exec_weighted;
	update_min_vruntime(cfs_rq);
}

static void update_curr(struct cfs_rq *cfs_rq)
{
	struct sched_entity *curr = cfs_rq->curr;
	u64 now = rq_of(cfs_rq)->clock_task;
	unsigned long delta_exec;

	if (unlikely(!curr))
		return;

	/*
	 * Get the amount of time the current task was running
	 * since the last time we changed load (this cannot
	 * overflow on 32 bits):
	 */
	delta_exec = (unsigned long)(now - curr->exec_start);
	if (!delta_exec)
		return;

	__update_curr(cfs_rq, curr, delta_exec);
	curr->exec_start = now;

	if (entity_is_task(curr)) {
		struct task_struct *curtask = task_of(curr);

		trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
		cpuacct_charge(curtask, delta_exec);
		account_group_exec_runtime(curtask, delta_exec);
	}

	account_cfs_rq_runtime(cfs_rq, delta_exec);
}

static inline void
update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	schedstat_set(se->statistics.wait_start, rq_of(cfs_rq)->clock);
}

/*
 * Task is being enqueued - update stats:
 */
static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	/*
	 * Are we enqueueing a waiting task? (for current tasks
	 * a dequeue/enqueue event is a NOP)
	 */
	if (se != cfs_rq->curr)
		update_stats_wait_start(cfs_rq, se);
}

static void
update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
			rq_of(cfs_rq)->clock - se->statistics.wait_start));
	schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
	schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
			rq_of(cfs_rq)->clock - se->statistics.wait_start);
#ifdef CONFIG_SCHEDSTATS
	if (entity_is_task(se)) {
		trace_sched_stat_wait(task_of(se),
			rq_of(cfs_rq)->clock - se->statistics.wait_start);
	}
#endif
	schedstat_set(se->statistics.wait_start, 0);
}

static inline void
update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	/*
	 * Mark the end of the wait period if dequeueing a
	 * waiting task:
	 */
	if (se != cfs_rq->curr)
		update_stats_wait_end(cfs_rq, se);
}

/*
 * We are picking a new current task - update its stats:
 */
static inline void
update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	/*
	 * We are starting a new run period:
	 */
	se->exec_start = rq_of(cfs_rq)->clock_task;
}

/**************************************************
 * Scheduling class queueing methods:
 */

#ifdef CONFIG_NUMA_BALANCING
/*
 * numa task sample period in ms
 */
unsigned int sysctl_numa_balancing_scan_period_min = 100;
unsigned int sysctl_numa_balancing_scan_period_max = 100*50;
unsigned int sysctl_numa_balancing_scan_period_reset = 100*600;

/* Portion of address space to scan in MB */
unsigned int sysctl_numa_balancing_scan_size = 256;

/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
unsigned int sysctl_numa_balancing_scan_delay = 1000;
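
/*
 * Worked example (illustration, not part of the original source): with
 * the defaults above, a task waits scan_delay = 1000ms before its
 * first scan, then scans 256MB windows at least every scan_period_min
 * = 100ms, backing off towards scan_period_max = 5000ms while pages
 * stay properly placed, with a full reset every 60000ms.
 */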

static void task_numa_placement(struct task_struct *p)
{
	int seq;

	if (!p->mm)	/* for example, ksmd faulting in a user's mm */
		return;
	seq = ACCESS_ONCE(p->mm->numa_scan_seq);
	if (p->numa_scan_seq == seq)
		return;
	p->numa_scan_seq = seq;

	/* FIXME: Scheduling placement policy hints go here */
}

/*
 * Got a PROT_NONE fault for a page on @node.
 */
void task_numa_fault(int node, int pages, bool migrated)
{
	struct task_struct *p = current;

	if (!sched_feat_numa(NUMA))
		return;

	/* FIXME: Allocate task-specific structure for placement policy here */

	/*
	 * If pages are properly placed (did not migrate) then scan slower.
	 * This is reset periodically in case of phase changes.
	 */
	if (!migrated)
		p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max,
			p->numa_scan_period + jiffies_to_msecs(10));

	task_numa_placement(p);
}

static void reset_ptenuma_scan(struct task_struct *p)
{
	ACCESS_ONCE(p->mm->numa_scan_seq)++;
	p->mm->numa_scan_offset = 0;
}

/*
 * The expensive part of numa migration is done from task_work context.
 * Triggered from task_tick_numa().
 */
void task_numa_work(struct callback_head *work)
{
	unsigned long migrate, next_scan, now = jiffies;
	struct task_struct *p = current;
	struct mm_struct *mm = p->mm;
	struct vm_area_struct *vma;
	unsigned long start, end;
	long pages;

	WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));

	work->next = work; /* protect against double add */
	/*
	 * Who cares about NUMA placement when they're dying.
	 *
	 * NOTE: make sure not to dereference p->mm before this check,
	 * exit_task_work() happens _after_ exit_mm() so we could be called
	 * without p->mm even though we still had it when we enqueued this
	 * work.
	 */
	if (p->flags & PF_EXITING)
		return;

	/*
	 * We do not care about task placement until a task runs on a node
	 * other than the first one used by the address space. This is
	 * largely because migrations are driven by what CPU the task
	 * is running on. If it's never scheduled on another node, it'll
	 * not migrate, so why bother trapping the fault.
	 */
	if (mm->first_nid == NUMA_PTE_SCAN_INIT)
		mm->first_nid = numa_node_id();
	if (mm->first_nid != NUMA_PTE_SCAN_ACTIVE) {
		/* Are we running on a new node yet? */
		if (numa_node_id() == mm->first_nid &&
		    !sched_feat_numa(NUMA_FORCE))
			return;

		mm->first_nid = NUMA_PTE_SCAN_ACTIVE;
	}

	/*
	 * Reset the scan period if enough time has gone by. The objective is
	 * that scanning will be reduced if pages are properly placed. As
	 * tasks can enter different phases, this needs to be re-examined.
	 * Lacking proper tracking of reference behaviour, this blunt hammer
	 * is used.
	 */
	migrate = mm->numa_next_reset;
	if (time_after(now, migrate)) {
		p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
		next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset);
		xchg(&mm->numa_next_reset, next_scan);
	}

	/*
	 * Enforce maximal scan/migration frequency.
	 */
	migrate = mm->numa_next_scan;
	if (time_before(now, migrate))
		return;

	if (p->numa_scan_period == 0)
		p->numa_scan_period = sysctl_numa_balancing_scan_period_min;

	next_scan = now + msecs_to_jiffies(p->numa_scan_period);
	if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
		return;

	/*
	 * Do not set pte_numa if the current running node is rate-limited.
	 * This loses statistics on the fault but if we are unwilling to
	 * migrate to this node, it is less likely we can do useful work.
	 */
	if (migrate_ratelimited(numa_node_id()))
		return;

	start = mm->numa_scan_offset;
	pages = sysctl_numa_balancing_scan_size;
	pages <<= 20 - PAGE_SHIFT; /* MB in pages */
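	/*
	 * Note (illustration, not part of the original source): with the
	 * default scan_size of 256MB and 4KB pages, this is
	 * 256 << (20 - 12) = 65536 pages per scan pass.
	 */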
	if (!pages)
		return;

	down_read(&mm->mmap_sem);
	vma = find_vma(mm, start);
	if (!vma) {
		reset_ptenuma_scan(p);
		start = 0;
		vma = mm->mmap;
	}
	for (; vma; vma = vma->vm_next) {
		if (!vma_migratable(vma))
			continue;

		/* Skip small VMAs. They are not likely to be of relevance */
		if (vma->vm_end - vma->vm_start < HPAGE_SIZE)
			continue;

		do {
			start = max(start, vma->vm_start);
			end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
			end = min(end, vma->vm_end);
			pages -= change_prot_numa(vma, start, end);

			start = end;
			if (pages <= 0)
				goto out;
		} while (end != vma->vm_end);
	}

out:
	/*
	 * It is possible to reach the end of the VMA list, but the last few
	 * VMAs are not guaranteed to be vma_migratable. If they are not, we
	 * would find the !migratable VMA on the next scan but not reset the
	 * scanner to the start, so check it now.
	 */
	if (vma)
		mm->numa_scan_offset = start;
	else
		reset_ptenuma_scan(p);
	up_read(&mm->mmap_sem);
}
|  | 963 |  | 
|  | 964 | /* | 
|  | 965 | * Drive the periodic memory faults.. | 
|  | 966 | */ | 
|  | 967 | void task_tick_numa(struct rq *rq, struct task_struct *curr) | 
|  | 968 | { | 
|  | 969 | struct callback_head *work = &curr->numa_work; | 
|  | 970 | u64 period, now; | 
|  | 971 |  | 
|  | 972 | /* | 
|  | 973 | * We don't care about NUMA placement if we don't have memory. | 
|  | 974 | */ | 
|  | 975 | if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work) | 
|  | 976 | return; | 
|  | 977 |  | 
|  | 978 | /* | 
|  | 979 | * Using runtime rather than walltime has the dual advantage that | 
|  | 980 | * we (mostly) drive the selection from busy threads and that the | 
|  | 981 | * task needs to have done some actual work before we bother with | 
|  | 982 | * NUMA placement. | 
|  | 983 | */ | 
|  | 984 | now = curr->se.sum_exec_runtime; | 
|  | 985 | period = (u64)curr->numa_scan_period * NSEC_PER_MSEC; | 
|  | 986 |  | 
|  | 987 | if (now - curr->node_stamp > period) { | 
| Peter Zijlstra | 4b96a29 | 2012-10-25 14:16:47 +0200 | [diff] [blame] | 988 | if (!curr->node_stamp) | 
|  | 989 | curr->numa_scan_period = sysctl_numa_balancing_scan_period_min; | 
| Peter Zijlstra | cbee9f8 | 2012-10-25 14:16:43 +0200 | [diff] [blame] | 990 | curr->node_stamp = now; | 
|  | 991 |  | 
|  | 992 | if (!time_before(jiffies, curr->mm->numa_next_scan)) { | 
|  | 993 | init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */ | 
|  | 994 | task_work_add(curr, work, true); | 
|  | 995 | } | 
|  | 996 | } | 
|  | 997 | } | 
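/*
 * Editor's illustration (standalone sketch, not kernel code): the scan
 * trigger above compares accumulated task runtime, not walltime, against
 * the per-task scan period. Values are made up; field names mirror the
 * kernel's.
 */
#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_MSEC   1000000ULL

int main(void)
{
        uint64_t sum_exec_runtime = 250 * NSEC_PER_MSEC; /* CPU time consumed */
        uint64_t node_stamp = 100 * NSEC_PER_MSEC;       /* runtime at last scan */
        unsigned int numa_scan_period = 100;             /* in ms */

        uint64_t period = (uint64_t)numa_scan_period * NSEC_PER_MSEC;

        /* 150ms of actual work since the last scan > 100ms period: scan */
        if (sum_exec_runtime - node_stamp > period)
                printf("queue task_numa_work\n");
        return 0;
}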
|  | 998 | #else | 
|  | 999 | static void task_tick_numa(struct rq *rq, struct task_struct *curr) | 
|  | 1000 | { | 
|  | 1001 | } | 
|  | 1002 | #endif /* CONFIG_NUMA_BALANCING */ | 
|  | 1003 |  | 
| Dmitry Adamushko | 30cfdcf | 2007-10-15 17:00:07 +0200 | [diff] [blame] | 1004 | static void | 
|  | 1005 | account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) | 
|  | 1006 | { | 
|  | 1007 | update_load_add(&cfs_rq->load, se->load.weight); | 
| Peter Zijlstra | c09595f | 2008-06-27 13:41:14 +0200 | [diff] [blame] | 1008 | if (!parent_entity(se)) | 
| Peter Zijlstra | 029632f | 2011-10-25 10:00:11 +0200 | [diff] [blame] | 1009 | update_load_add(&rq_of(cfs_rq)->load, se->load.weight); | 
| Peter Zijlstra | 367456c | 2012-02-20 21:49:09 +0100 | [diff] [blame] | 1010 | #ifdef CONFIG_SMP | 
|  | 1011 | if (entity_is_task(se)) | 
| Peter Zijlstra | eb95308 | 2012-04-17 13:38:40 +0200 | [diff] [blame] | 1012 | list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks); | 
| Peter Zijlstra | 367456c | 2012-02-20 21:49:09 +0100 | [diff] [blame] | 1013 | #endif | 
| Dmitry Adamushko | 30cfdcf | 2007-10-15 17:00:07 +0200 | [diff] [blame] | 1014 | cfs_rq->nr_running++; | 
| Dmitry Adamushko | 30cfdcf | 2007-10-15 17:00:07 +0200 | [diff] [blame] | 1015 | } | 
|  | 1016 |  | 
|  | 1017 | static void | 
|  | 1018 | account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) | 
|  | 1019 | { | 
|  | 1020 | update_load_sub(&cfs_rq->load, se->load.weight); | 
| Peter Zijlstra | c09595f | 2008-06-27 13:41:14 +0200 | [diff] [blame] | 1021 | if (!parent_entity(se)) | 
| Peter Zijlstra | 029632f | 2011-10-25 10:00:11 +0200 | [diff] [blame] | 1022 | update_load_sub(&rq_of(cfs_rq)->load, se->load.weight); | 
| Peter Zijlstra | 367456c | 2012-02-20 21:49:09 +0100 | [diff] [blame] | 1023 | if (entity_is_task(se)) | 
| Bharata B Rao | b87f172 | 2008-09-25 09:53:54 +0530 | [diff] [blame] | 1024 | list_del_init(&se->group_node); | 
| Dmitry Adamushko | 30cfdcf | 2007-10-15 17:00:07 +0200 | [diff] [blame] | 1025 | cfs_rq->nr_running--; | 
| Dmitry Adamushko | 30cfdcf | 2007-10-15 17:00:07 +0200 | [diff] [blame] | 1026 | } | 
|  | 1027 |  | 
| Yong Zhang | 3ff6dca | 2011-01-24 15:33:52 +0800 | [diff] [blame] | 1028 | #ifdef CONFIG_FAIR_GROUP_SCHED | 
|  | 1029 | # ifdef CONFIG_SMP | 
| Peter Zijlstra | cf5f0ac | 2011-10-13 16:52:28 +0200 | [diff] [blame] | 1030 | static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq) | 
|  | 1031 | { | 
|  | 1032 | long tg_weight; | 
|  | 1033 |  | 
|  | 1034 | /* | 
|  | 1035 | * Use this CPU's actual weight instead of the last load_contribution | 
|  | 1036 | * to gain a more accurate current total weight. See | 
|  | 1037 | * __update_cfs_rq_tg_load_contrib(). | 
|  | 1038 | */ | 
| Paul Turner | 8295836 | 2012-10-04 13:18:31 +0200 | [diff] [blame] | 1039 | tg_weight = atomic64_read(&tg->load_avg); | 
|  | 1040 | tg_weight -= cfs_rq->tg_load_contrib; | 
| Peter Zijlstra | cf5f0ac | 2011-10-13 16:52:28 +0200 | [diff] [blame] | 1041 | tg_weight += cfs_rq->load.weight; | 
|  | 1042 |  | 
|  | 1043 | return tg_weight; | 
|  | 1044 | } | 
|  | 1045 |  | 
| Paul Turner | 6d5ab29 | 2011-01-21 20:45:01 -0800 | [diff] [blame] | 1046 | static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) | 
| Yong Zhang | 3ff6dca | 2011-01-24 15:33:52 +0800 | [diff] [blame] | 1047 | { | 
| Peter Zijlstra | cf5f0ac | 2011-10-13 16:52:28 +0200 | [diff] [blame] | 1048 | long tg_weight, load, shares; | 
| Yong Zhang | 3ff6dca | 2011-01-24 15:33:52 +0800 | [diff] [blame] | 1049 |  | 
| Peter Zijlstra | cf5f0ac | 2011-10-13 16:52:28 +0200 | [diff] [blame] | 1050 | tg_weight = calc_tg_weight(tg, cfs_rq); | 
| Paul Turner | 6d5ab29 | 2011-01-21 20:45:01 -0800 | [diff] [blame] | 1051 | load = cfs_rq->load.weight; | 
| Yong Zhang | 3ff6dca | 2011-01-24 15:33:52 +0800 | [diff] [blame] | 1052 |  | 
| Yong Zhang | 3ff6dca | 2011-01-24 15:33:52 +0800 | [diff] [blame] | 1053 | shares = (tg->shares * load); | 
| Peter Zijlstra | cf5f0ac | 2011-10-13 16:52:28 +0200 | [diff] [blame] | 1054 | if (tg_weight) | 
|  | 1055 | shares /= tg_weight; | 
| Yong Zhang | 3ff6dca | 2011-01-24 15:33:52 +0800 | [diff] [blame] | 1056 |  | 
|  | 1057 | if (shares < MIN_SHARES) | 
|  | 1058 | shares = MIN_SHARES; | 
|  | 1059 | if (shares > tg->shares) | 
|  | 1060 | shares = tg->shares; | 
|  | 1061 |  | 
|  | 1062 | return shares; | 
|  | 1063 | } | 
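/*
 * Editor's illustration (standalone sketch): the shares arithmetic above
 * with concrete numbers. MIN_SHARES = 2 is an assumption matching this
 * era's unscaled load resolution.
 */
#include <stdio.h>

#define MIN_SHARES      2L

static long shares_for(long tg_shares, long cpu_load, long tg_weight)
{
        long shares = tg_shares * cpu_load;

        if (tg_weight)
                shares /= tg_weight;
        if (shares < MIN_SHARES)
                shares = MIN_SHARES;
        if (shares > tg_shares)
                shares = tg_shares;
        return shares;
}

int main(void)
{
        /* this CPU holds 1/3 of the group's weight -> 1/3 of its shares */
        printf("%ld\n", shares_for(1024, 1024, 3072));  /* prints 341 */
        return 0;
}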
| Yong Zhang | 3ff6dca | 2011-01-24 15:33:52 +0800 | [diff] [blame] | 1064 | # else /* CONFIG_SMP */ | 
| Paul Turner | 6d5ab29 | 2011-01-21 20:45:01 -0800 | [diff] [blame] | 1065 | static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) | 
| Yong Zhang | 3ff6dca | 2011-01-24 15:33:52 +0800 | [diff] [blame] | 1066 | { | 
|  | 1067 | return tg->shares; | 
|  | 1068 | } | 
| Yong Zhang | 3ff6dca | 2011-01-24 15:33:52 +0800 | [diff] [blame] | 1069 | # endif /* CONFIG_SMP */ | 
| Peter Zijlstra | 2069dd7 | 2010-11-15 15:47:00 -0800 | [diff] [blame] | 1070 | static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, | 
|  | 1071 | unsigned long weight) | 
|  | 1072 | { | 
| Paul Turner | 19e5eeb | 2010-12-15 19:10:18 -0800 | [diff] [blame] | 1073 | if (se->on_rq) { | 
|  | 1074 | /* commit outstanding execution time */ | 
|  | 1075 | if (cfs_rq->curr == se) | 
|  | 1076 | update_curr(cfs_rq); | 
| Peter Zijlstra | 2069dd7 | 2010-11-15 15:47:00 -0800 | [diff] [blame] | 1077 | account_entity_dequeue(cfs_rq, se); | 
| Paul Turner | 19e5eeb | 2010-12-15 19:10:18 -0800 | [diff] [blame] | 1078 | } | 
| Peter Zijlstra | 2069dd7 | 2010-11-15 15:47:00 -0800 | [diff] [blame] | 1079 |  | 
|  | 1080 | update_load_set(&se->load, weight); | 
|  | 1081 |  | 
|  | 1082 | if (se->on_rq) | 
|  | 1083 | account_entity_enqueue(cfs_rq, se); | 
|  | 1084 | } | 
|  | 1085 |  | 
| Paul Turner | 8295836 | 2012-10-04 13:18:31 +0200 | [diff] [blame] | 1086 | static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); | 
|  | 1087 |  | 
| Paul Turner | 6d5ab29 | 2011-01-21 20:45:01 -0800 | [diff] [blame] | 1088 | static void update_cfs_shares(struct cfs_rq *cfs_rq) | 
| Peter Zijlstra | 2069dd7 | 2010-11-15 15:47:00 -0800 | [diff] [blame] | 1089 | { | 
|  | 1090 | struct task_group *tg; | 
|  | 1091 | struct sched_entity *se; | 
| Yong Zhang | 3ff6dca | 2011-01-24 15:33:52 +0800 | [diff] [blame] | 1092 | long shares; | 
| Peter Zijlstra | 2069dd7 | 2010-11-15 15:47:00 -0800 | [diff] [blame] | 1093 |  | 
| Peter Zijlstra | 2069dd7 | 2010-11-15 15:47:00 -0800 | [diff] [blame] | 1094 | tg = cfs_rq->tg; | 
|  | 1095 | se = tg->se[cpu_of(rq_of(cfs_rq))]; | 
| Paul Turner | 64660c8 | 2011-07-21 09:43:36 -0700 | [diff] [blame] | 1096 | if (!se || throttled_hierarchy(cfs_rq)) | 
| Peter Zijlstra | 2069dd7 | 2010-11-15 15:47:00 -0800 | [diff] [blame] | 1097 | return; | 
| Yong Zhang | 3ff6dca | 2011-01-24 15:33:52 +0800 | [diff] [blame] | 1098 | #ifndef CONFIG_SMP | 
|  | 1099 | if (likely(se->load.weight == tg->shares)) | 
|  | 1100 | return; | 
|  | 1101 | #endif | 
| Paul Turner | 6d5ab29 | 2011-01-21 20:45:01 -0800 | [diff] [blame] | 1102 | shares = calc_cfs_shares(cfs_rq, tg); | 
| Peter Zijlstra | 2069dd7 | 2010-11-15 15:47:00 -0800 | [diff] [blame] | 1103 |  | 
|  | 1104 | reweight_entity(cfs_rq_of(se), se, shares); | 
|  | 1105 | } | 
|  | 1106 | #else /* CONFIG_FAIR_GROUP_SCHED */ | 
| Paul Turner | 6d5ab29 | 2011-01-21 20:45:01 -0800 | [diff] [blame] | 1107 | static inline void update_cfs_shares(struct cfs_rq *cfs_rq) | 
| Peter Zijlstra | 2069dd7 | 2010-11-15 15:47:00 -0800 | [diff] [blame] | 1108 | { | 
|  | 1109 | } | 
|  | 1110 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 
|  | 1111 |  | 
| Paul Turner | f4e26b1 | 2012-10-04 13:18:32 +0200 | [diff] [blame] | 1112 | /* Only depends on SMP, FAIR_GROUP_SCHED may be removed when useful in lb */ | 
|  | 1113 | #if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED) | 
| Paul Turner | 9d85f21 | 2012-10-04 13:18:29 +0200 | [diff] [blame] | 1114 | /* | 
| Paul Turner | 5b51f2f | 2012-10-04 13:18:32 +0200 | [diff] [blame] | 1115 | * We choose a half-life close to 1 scheduling period. | 
|  | 1116 | * Note: The tables below are dependent on this value. | 
|  | 1117 | */ | 
|  | 1118 | #define LOAD_AVG_PERIOD 32 | 
|  | 1119 | #define LOAD_AVG_MAX 47742 /* maximum possible load avg */ | 
|  | 1120 | #define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_AVG_MAX */ | 
|  | 1121 |  | 
|  | 1122 | /* Precomputed fixed inverse multiplies for multiplication by y^n */ | 
|  | 1123 | static const u32 runnable_avg_yN_inv[] = { | 
|  | 1124 | 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6, | 
|  | 1125 | 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85, | 
|  | 1126 | 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581, | 
|  | 1127 | 0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9, | 
|  | 1128 | 0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80, | 
|  | 1129 | 0x85aac367, 0x82cd8698, | 
|  | 1130 | }; | 
|  | 1131 |  | 
|  | 1132 | /* | 
|  | 1133 | * Precomputed \Sum y^k { 1<=k<=n }.  These are floor(true_value) to prevent | 
|  | 1134 | * over-estimates when re-combining. | 
|  | 1135 | */ | 
|  | 1136 | static const u32 runnable_avg_yN_sum[] = { | 
|  | 1137 | 0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103, | 
|  | 1138 | 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082, | 
|  | 1139 | 17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371, | 
|  | 1140 | }; | 
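/*
 * Editor's illustration (standalone sketch): regenerating the two tables
 * above with y = 0.5^(1/32). inv[n] is roughly floor(2^32 * y^n), saturated
 * at 0xffffffff for n = 0, and sum[n] accumulates floor(1024 * y^k) for
 * k = 1..n. Floating-point rounding may differ from the shipped constants
 * in the last digit or two; the kernel only ships the integer results.
 */
#include <math.h>
#include <stdio.h>

int main(void)
{
        double y = pow(0.5, 1.0 / 32.0);        /* y^32 = 0.5 */
        double sum = 0.0;
        int n;

        for (n = 0; n < 32; n++) {
                double e = floor(4294967296.0 * pow(y, n));     /* 2^32 * y^n */
                if (e > 4294967295.0)
                        e = 4294967295.0;       /* inv[0] saturates at ~1.0 */
                printf("inv[%2d] = 0x%08x\n", n, (unsigned int)e);
        }

        for (n = 1; n <= 32; n++) {
                sum += floor(1024.0 * pow(y, n));
                printf("sum[%2d] = %.0f\n", n, sum);    /* sum[32] = 23371 */
        }
        return 0;
}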
|  | 1141 |  | 
|  | 1142 | /* | 
| Paul Turner | 9d85f21 | 2012-10-04 13:18:29 +0200 | [diff] [blame] | 1143 | * Approximate: | 
|  | 1144 | *   val * y^n,    where y^32 ~= 0.5 (~1 scheduling period) | 
|  | 1145 | */ | 
|  | 1146 | static __always_inline u64 decay_load(u64 val, u64 n) | 
|  | 1147 | { | 
| Paul Turner | 5b51f2f | 2012-10-04 13:18:32 +0200 | [diff] [blame] | 1148 | unsigned int local_n; | 
|  | 1149 |  | 
|  | 1150 | if (!n) | 
|  | 1151 | return val; | 
|  | 1152 | else if (unlikely(n > LOAD_AVG_PERIOD * 63)) | 
|  | 1153 | return 0; | 
|  | 1154 |  | 
|  | 1155 | /* after bounds checking we can collapse to 32-bit */ | 
|  | 1156 | local_n = n; | 
|  | 1157 |  | 
|  | 1158 | /* | 
|  | 1159 | * As y^PERIOD = 1/2, we can combine | 
|  | 1160 | *    y^n = 1/2^(n/PERIOD) * y^(n%PERIOD) | 
|  | 1161 | * with a look-up table which covers y^n (n < PERIOD) | 
|  | 1162 | * | 
|  | 1163 | * to achieve a constant-time decay_load(). | 
|  | 1164 | */ | 
|  | 1165 | if (unlikely(local_n >= LOAD_AVG_PERIOD)) { | 
|  | 1166 | val >>= local_n / LOAD_AVG_PERIOD; | 
|  | 1167 | local_n %= LOAD_AVG_PERIOD; | 
| Paul Turner | 9d85f21 | 2012-10-04 13:18:29 +0200 | [diff] [blame] | 1168 | } | 
|  | 1169 |  | 
| Paul Turner | 5b51f2f | 2012-10-04 13:18:32 +0200 | [diff] [blame] | 1170 | val *= runnable_avg_yN_inv[local_n]; | 
|  | 1171 | /* We don't use SRR here since we always want to round down. */ | 
|  | 1172 | return val >> 32; | 
|  | 1173 | } | 
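/*
 * Editor's illustration (standalone sketch): the shift-plus-inverse-multiply
 * trick above against an exact floating-point reference. For n = 40, the
 * whole half-life is handled by val >>= 1 and the remaining y^8 by a table
 * entry, re-derived here in floating point rather than copied, so the last
 * digit may differ slightly from the kernel's constant.
 */
#include <math.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        double y = pow(0.5, 1.0 / 32.0);
        uint64_t val = 47742;                   /* LOAD_AVG_MAX */
        unsigned int n = 40;

        /* ~runnable_avg_yN_inv[n % 32], derived instead of hard-coded */
        uint32_t inv = (uint32_t)(4294967296.0 * pow(y, n % 32));

        val >>= n / 32;                         /* whole half-lives */
        val = (val * inv) >> 32;                /* fractional y^(n%32) */

        /* prints approximately: fixed-point 20072, exact 20073 */
        printf("fixed-point %llu, exact %.0f\n",
               (unsigned long long)val, 47742.0 * pow(y, 40));
        return 0;
}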
|  | 1174 |  | 
|  | 1175 | /* | 
|  | 1176 | * For updates fully spanning n periods, the contribution to runnable | 
|  | 1177 | * average will be: \Sum 1024*y^n | 
|  | 1178 | * | 
|  | 1179 | * We can compute this reasonably efficiently by combining: | 
|  | 1180 | *   y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n < PERIOD} | 
|  | 1181 | */ | 
|  | 1182 | static u32 __compute_runnable_contrib(u64 n) | 
|  | 1183 | { | 
|  | 1184 | u32 contrib = 0; | 
|  | 1185 |  | 
|  | 1186 | if (likely(n <= LOAD_AVG_PERIOD)) | 
|  | 1187 | return runnable_avg_yN_sum[n]; | 
|  | 1188 | else if (unlikely(n >= LOAD_AVG_MAX_N)) | 
|  | 1189 | return LOAD_AVG_MAX; | 
|  | 1190 |  | 
|  | 1191 | /* Compute \Sum y^n combining precomputed values for y^i, \Sum y^j */ | 
|  | 1192 | do { | 
|  | 1193 | contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */ | 
|  | 1194 | contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD]; | 
|  | 1195 |  | 
|  | 1196 | n -= LOAD_AVG_PERIOD; | 
|  | 1197 | } while (n > LOAD_AVG_PERIOD); | 
|  | 1198 |  | 
|  | 1199 | contrib = decay_load(contrib, n); | 
|  | 1200 | return contrib + runnable_avg_yN_sum[n]; | 
| Paul Turner | 9d85f21 | 2012-10-04 13:18:29 +0200 | [diff] [blame] | 1201 | } | 
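/*
 * Editor's illustration (standalone sketch): unrolling the recurrence above
 * for n = 64 periods, with decay_load(contrib, 32) approximated by a plain
 * halving. SUM_32 is runnable_avg_yN_sum[32] from the table; the integer
 * result lands just under the exact geometric sum because of the floors.
 */
#include <math.h>
#include <stdio.h>

#define SUM_32  23371U

int main(void)
{
        unsigned int contrib = 0;
        double y = pow(0.5, 1.0 / 32.0);
        double exact = 1024.0 * y * (1.0 - pow(y, 64)) / (1.0 - y);

        contrib = contrib / 2 + SUM_32; /* loop body: periods 33..64 */
        contrib = contrib / 2;          /* ~decay_load(contrib, 32) */
        contrib += SUM_32;              /* + sum for periods 1..32 */

        /* prints roughly: table-driven 35056, exact 35073 */
        printf("table-driven %u, exact %.0f\n", contrib, exact);
        return 0;
}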
|  | 1202 |  | 
|  | 1203 | /* | 
|  | 1204 | * We can represent the historical contribution to runnable average as the | 
|  | 1205 | * coefficients of a geometric series.  To do this we sub-divide our runnable | 
|  | 1206 | * history into segments of approximately 1ms (1024us); label the segment that | 
|  | 1207 | * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g. | 
|  | 1208 | * | 
|  | 1209 | * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ... | 
|  | 1210 | *      p0            p1           p2 | 
|  | 1211 | *     (now)       (~1ms ago)  (~2ms ago) | 
|  | 1212 | * | 
|  | 1213 | * Let u_i denote the fraction of p_i that the entity was runnable. | 
|  | 1214 | * | 
|  | 1215 | * We then designate the fractions u_i as our coefficients, yielding the | 
|  | 1216 | * following representation of historical load: | 
|  | 1217 | *   u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ... | 
|  | 1218 | * | 
|  | 1219 | * We choose y based on the width of a reasonable scheduling period, fixing: | 
|  | 1220 | *   y^32 = 0.5 | 
|  | 1221 | * | 
|  | 1222 | * This means that the contribution to load ~32ms ago (u_32) will be weighted | 
|  | 1223 | * approximately half as much as the contribution to load within the last ms | 
|  | 1224 | * (u_0). | 
|  | 1225 | * | 
|  | 1226 | * When a period "rolls over" and we have new u_0`, multiplying the previous | 
|  | 1227 | * sum again by y is sufficient to update: | 
|  | 1228 | *   load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... ) | 
|  | 1229 | *            = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}] | 
|  | 1230 | */ | 
|  | 1231 | static __always_inline int __update_entity_runnable_avg(u64 now, | 
|  | 1232 | struct sched_avg *sa, | 
|  | 1233 | int runnable) | 
|  | 1234 | { | 
| Paul Turner | 5b51f2f | 2012-10-04 13:18:32 +0200 | [diff] [blame] | 1235 | u64 delta, periods; | 
|  | 1236 | u32 runnable_contrib; | 
| Paul Turner | 9d85f21 | 2012-10-04 13:18:29 +0200 | [diff] [blame] | 1237 | int delta_w, decayed = 0; | 
|  | 1238 |  | 
|  | 1239 | delta = now - sa->last_runnable_update; | 
|  | 1240 | /* | 
|  | 1241 | * This should only happen when time goes backwards, which it | 
|  | 1242 | * unfortunately does during sched clock init when we swap over to TSC. | 
|  | 1243 | */ | 
|  | 1244 | if ((s64)delta < 0) { | 
|  | 1245 | sa->last_runnable_update = now; | 
|  | 1246 | return 0; | 
|  | 1247 | } | 
|  | 1248 |  | 
|  | 1249 | /* | 
|  | 1250 | * Use 1024ns as the unit of measurement since it's a reasonable | 
|  | 1251 | * approximation of 1us and fast to compute. | 
|  | 1252 | */ | 
|  | 1253 | delta >>= 10; | 
|  | 1254 | if (!delta) | 
|  | 1255 | return 0; | 
|  | 1256 | sa->last_runnable_update = now; | 
|  | 1257 |  | 
|  | 1258 | /* delta_w is the amount already accumulated against our next period */ | 
|  | 1259 | delta_w = sa->runnable_avg_period % 1024; | 
|  | 1260 | if (delta + delta_w >= 1024) { | 
|  | 1261 | /* period roll-over */ | 
|  | 1262 | decayed = 1; | 
|  | 1263 |  | 
|  | 1264 | /* | 
|  | 1265 | * Now that we know we're crossing a period boundary, figure | 
|  | 1266 | * out how much from delta we need to complete the current | 
|  | 1267 | * period and accrue it. | 
|  | 1268 | */ | 
|  | 1269 | delta_w = 1024 - delta_w; | 
| Paul Turner | 5b51f2f | 2012-10-04 13:18:32 +0200 | [diff] [blame] | 1270 | if (runnable) | 
|  | 1271 | sa->runnable_avg_sum += delta_w; | 
|  | 1272 | sa->runnable_avg_period += delta_w; | 
| Paul Turner | 9d85f21 | 2012-10-04 13:18:29 +0200 | [diff] [blame] | 1273 |  | 
| Paul Turner | 5b51f2f | 2012-10-04 13:18:32 +0200 | [diff] [blame] | 1274 | delta -= delta_w; | 
| Paul Turner | 9d85f21 | 2012-10-04 13:18:29 +0200 | [diff] [blame] | 1275 |  | 
| Paul Turner | 5b51f2f | 2012-10-04 13:18:32 +0200 | [diff] [blame] | 1276 | /* Figure out how many additional periods this update spans */ | 
|  | 1277 | periods = delta / 1024; | 
|  | 1278 | delta %= 1024; | 
|  | 1279 |  | 
|  | 1280 | sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum, | 
|  | 1281 | periods + 1); | 
|  | 1282 | sa->runnable_avg_period = decay_load(sa->runnable_avg_period, | 
|  | 1283 | periods + 1); | 
|  | 1284 |  | 
|  | 1285 | /* Efficiently calculate \sum (1..n_period) 1024*y^i */ | 
|  | 1286 | runnable_contrib = __compute_runnable_contrib(periods); | 
|  | 1287 | if (runnable) | 
|  | 1288 | sa->runnable_avg_sum += runnable_contrib; | 
|  | 1289 | sa->runnable_avg_period += runnable_contrib; | 
| Paul Turner | 9d85f21 | 2012-10-04 13:18:29 +0200 | [diff] [blame] | 1290 | } | 
|  | 1291 |  | 
|  | 1292 | /* Remainder of delta accrued against u_0` */ | 
|  | 1293 | if (runnable) | 
|  | 1294 | sa->runnable_avg_sum += delta; | 
|  | 1295 | sa->runnable_avg_period += delta; | 
|  | 1296 |  | 
|  | 1297 | return decayed; | 
|  | 1298 | } | 
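/*
 * Editor's illustration (standalone sketch): how the average above ramps up
 * for a task that becomes and stays runnable. The sum approaches its
 * analytic ceiling 1024/(1-y) with a 32ms half-life; the kernel's integer
 * version saturates slightly lower, at LOAD_AVG_MAX = 47742, due to floors.
 */
#include <math.h>
#include <stdio.h>

int main(void)
{
        double y = pow(0.5, 1.0 / 32.0);
        double max = 1024.0 / (1.0 - y);        /* analytic asymptote */
        double sum = 0.0;

        for (int ms = 1; ms <= 128; ms++) {
                sum = sum * y + 1024.0; /* runnable through the whole period */
                if (ms % 32 == 0)       /* 50%, 75%, 87.5%, 93.75% */
                        printf("after %3d ms: %5.0f (%.2f%% of max)\n",
                               ms, sum, 100.0 * sum / max);
        }
        return 0;
}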
|  | 1299 |  | 
| Paul Turner | 9ee474f | 2012-10-04 13:18:30 +0200 | [diff] [blame] | 1300 | /* Synchronize an entity's decay with its parenting cfs_rq. */ | 
| Paul Turner | aff3e49 | 2012-10-04 13:18:30 +0200 | [diff] [blame] | 1301 | static inline u64 __synchronize_entity_decay(struct sched_entity *se) | 
| Paul Turner | 9ee474f | 2012-10-04 13:18:30 +0200 | [diff] [blame] | 1302 | { | 
|  | 1303 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 
|  | 1304 | u64 decays = atomic64_read(&cfs_rq->decay_counter); | 
|  | 1305 |  | 
|  | 1306 | decays -= se->avg.decay_count; | 
|  | 1307 | if (!decays) | 
| Paul Turner | aff3e49 | 2012-10-04 13:18:30 +0200 | [diff] [blame] | 1308 | return 0; | 
| Paul Turner | 9ee474f | 2012-10-04 13:18:30 +0200 | [diff] [blame] | 1309 |  | 
|  | 1310 | se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays); | 
|  | 1311 | se->avg.decay_count = 0; | 
| Paul Turner | aff3e49 | 2012-10-04 13:18:30 +0200 | [diff] [blame] | 1312 |  | 
|  | 1313 | return decays; | 
| Paul Turner | 9ee474f | 2012-10-04 13:18:30 +0200 | [diff] [blame] | 1314 | } | 
|  | 1315 |  | 
| Paul Turner | c566e8e | 2012-10-04 13:18:30 +0200 | [diff] [blame] | 1316 | #ifdef CONFIG_FAIR_GROUP_SCHED | 
|  | 1317 | static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, | 
|  | 1318 | int force_update) | 
|  | 1319 | { | 
|  | 1320 | struct task_group *tg = cfs_rq->tg; | 
|  | 1321 | s64 tg_contrib; | 
|  | 1322 |  | 
|  | 1323 | tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg; | 
|  | 1324 | tg_contrib -= cfs_rq->tg_load_contrib; | 
|  | 1325 |  | 
|  | 1326 | if (force_update || abs64(tg_contrib) > cfs_rq->tg_load_contrib / 8) { | 
|  | 1327 | atomic64_add(tg_contrib, &tg->load_avg); | 
|  | 1328 | cfs_rq->tg_load_contrib += tg_contrib; | 
|  | 1329 | } | 
|  | 1330 | } | 
| Paul Turner | 8165e14 | 2012-10-04 13:18:31 +0200 | [diff] [blame] | 1331 |  | 
| Paul Turner | bb17f65 | 2012-10-04 13:18:31 +0200 | [diff] [blame] | 1332 | /* | 
|  | 1333 | * Aggregate cfs_rq runnable averages into an equivalent task_group | 
|  | 1334 | * representation for computing load contributions. | 
|  | 1335 | */ | 
|  | 1336 | static inline void __update_tg_runnable_avg(struct sched_avg *sa, | 
|  | 1337 | struct cfs_rq *cfs_rq) | 
|  | 1338 | { | 
|  | 1339 | struct task_group *tg = cfs_rq->tg; | 
|  | 1340 | long contrib; | 
|  | 1341 |  | 
|  | 1342 | /* The fraction of a cpu used by this cfs_rq */ | 
|  | 1343 | contrib = div_u64(sa->runnable_avg_sum << NICE_0_SHIFT, | 
|  | 1344 | sa->runnable_avg_period + 1); | 
|  | 1345 | contrib -= cfs_rq->tg_runnable_contrib; | 
|  | 1346 |  | 
|  | 1347 | if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) { | 
|  | 1348 | atomic_add(contrib, &tg->runnable_avg); | 
|  | 1349 | cfs_rq->tg_runnable_contrib += contrib; | 
|  | 1350 | } | 
|  | 1351 | } | 
|  | 1352 |  | 
| Paul Turner | 8165e14 | 2012-10-04 13:18:31 +0200 | [diff] [blame] | 1353 | static inline void __update_group_entity_contrib(struct sched_entity *se) | 
|  | 1354 | { | 
|  | 1355 | struct cfs_rq *cfs_rq = group_cfs_rq(se); | 
|  | 1356 | struct task_group *tg = cfs_rq->tg; | 
| Paul Turner | bb17f65 | 2012-10-04 13:18:31 +0200 | [diff] [blame] | 1357 | int runnable_avg; | 
|  | 1358 |  | 
| Paul Turner | 8165e14 | 2012-10-04 13:18:31 +0200 | [diff] [blame] | 1359 | u64 contrib; | 
|  | 1360 |  | 
|  | 1361 | contrib = cfs_rq->tg_load_contrib * tg->shares; | 
|  | 1362 | se->avg.load_avg_contrib = div64_u64(contrib, | 
|  | 1363 | atomic64_read(&tg->load_avg) + 1); | 
| Paul Turner | bb17f65 | 2012-10-04 13:18:31 +0200 | [diff] [blame] | 1364 |  | 
|  | 1365 | /* | 
|  | 1366 | * For group entities we need to compute a correction term in the case | 
|  | 1367 | * that they are consuming <1 cpu so that we would contribute the same | 
|  | 1368 | * load as a task of equal weight. | 
|  | 1369 | * | 
|  | 1370 | * Explicitly co-ordinating this measurement would be expensive, but | 
|  | 1371 | * fortunately the sum of each cpu's contribution forms a usable | 
|  | 1372 | * lower-bound on the true value. | 
|  | 1373 | * | 
|  | 1374 | * Consider the aggregate of 2 contributions.  Either they are disjoint | 
|  | 1375 | * (and the sum represents the true value) or they overlap and we are | 
|  | 1376 | * understating by the aggregate of their overlap. | 
|  | 1377 | * | 
|  | 1378 | * Extending this to N cpus, for a given overlap, the maximum amount we | 
|  | 1379 | * understate is then n_i(n_i+1)/2 * w_i where n_i is the number of | 
|  | 1380 | * cpus that overlap for this interval and w_i is the interval width. | 
|  | 1381 | * | 
|  | 1382 | * On a small machine, the first term is well-bounded, which bounds the | 
|  | 1383 | * total error since w_i is a subset of the period.  Whereas on a | 
|  | 1384 | * larger machine, while this first term can be larger, a w_i of | 
|  | 1385 | * consequential size is guaranteed to see n_i*w_i quickly converge to | 
|  | 1386 | * our upper bound of 1 cpu. | 
|  | 1387 | */ | 
|  | 1388 | runnable_avg = atomic_read(&tg->runnable_avg); | 
|  | 1389 | if (runnable_avg < NICE_0_LOAD) { | 
|  | 1390 | se->avg.load_avg_contrib *= runnable_avg; | 
|  | 1391 | se->avg.load_avg_contrib >>= NICE_0_SHIFT; | 
|  | 1392 | } | 
| Paul Turner | 8165e14 | 2012-10-04 13:18:31 +0200 | [diff] [blame] | 1393 | } | 
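/*
 * Editor's illustration (standalone sketch): the <1 cpu correction above.
 * NICE_0_LOAD = 1024 and NICE_0_SHIFT = 10 are assumptions matching this
 * era's unscaled load resolution.
 */
#include <stdio.h>

#define NICE_0_LOAD     1024
#define NICE_0_SHIFT    10

int main(void)
{
        unsigned long contrib = 2048;   /* share-apportioned load */
        int runnable_avg = 256;         /* group runnable ~25% of the time */

        if (runnable_avg < NICE_0_LOAD) {
                contrib *= runnable_avg;
                contrib >>= NICE_0_SHIFT;
        }
        /* a group consuming a quarter of a cpu contributes a quarter: 512 */
        printf("corrected contrib = %lu\n", contrib);
        return 0;
}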
| Paul Turner | c566e8e | 2012-10-04 13:18:30 +0200 | [diff] [blame] | 1394 | #else | 
|  | 1395 | static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, | 
|  | 1396 | int force_update) {} | 
| Paul Turner | bb17f65 | 2012-10-04 13:18:31 +0200 | [diff] [blame] | 1397 | static inline void __update_tg_runnable_avg(struct sched_avg *sa, | 
|  | 1398 | struct cfs_rq *cfs_rq) {} | 
| Paul Turner | 8165e14 | 2012-10-04 13:18:31 +0200 | [diff] [blame] | 1399 | static inline void __update_group_entity_contrib(struct sched_entity *se) {} | 
| Paul Turner | c566e8e | 2012-10-04 13:18:30 +0200 | [diff] [blame] | 1400 | #endif | 
|  | 1401 |  | 
| Paul Turner | 8165e14 | 2012-10-04 13:18:31 +0200 | [diff] [blame] | 1402 | static inline void __update_task_entity_contrib(struct sched_entity *se) | 
|  | 1403 | { | 
|  | 1404 | u32 contrib; | 
|  | 1405 |  | 
|  | 1406 | /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */ | 
|  | 1407 | contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight); | 
|  | 1408 | contrib /= (se->avg.runnable_avg_period + 1); | 
|  | 1409 | se->avg.load_avg_contrib = scale_load(contrib); | 
|  | 1410 | } | 
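/*
 * Editor's illustration (standalone sketch): the task contribution math
 * above for a nice-0 task (weight 1024, an assumption for this era) that
 * has been runnable half the time over a fully built-up history. Note the
 * product stays well inside 32 bits, as the comment above requires.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint32_t runnable_avg_sum = 23871;      /* ~half of LOAD_AVG_MAX */
        uint32_t runnable_avg_period = 47742;   /* saturated history */
        uint32_t weight = 1024;                 /* nice-0 */
        uint32_t contrib;

        contrib = runnable_avg_sum * weight / (runnable_avg_period + 1);
        printf("load_avg_contrib = %u\n", contrib);     /* prints 511 */
        return 0;
}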
|  | 1411 |  | 
| Paul Turner | 2dac754 | 2012-10-04 13:18:30 +0200 | [diff] [blame] | 1412 | /* Compute the current contribution to load_avg by se, return any delta */ | 
|  | 1413 | static long __update_entity_load_avg_contrib(struct sched_entity *se) | 
|  | 1414 | { | 
|  | 1415 | long old_contrib = se->avg.load_avg_contrib; | 
|  | 1416 |  | 
| Paul Turner | 8165e14 | 2012-10-04 13:18:31 +0200 | [diff] [blame] | 1417 | if (entity_is_task(se)) { | 
|  | 1418 | __update_task_entity_contrib(se); | 
|  | 1419 | } else { | 
| Paul Turner | bb17f65 | 2012-10-04 13:18:31 +0200 | [diff] [blame] | 1420 | __update_tg_runnable_avg(&se->avg, group_cfs_rq(se)); | 
| Paul Turner | 8165e14 | 2012-10-04 13:18:31 +0200 | [diff] [blame] | 1421 | __update_group_entity_contrib(se); | 
|  | 1422 | } | 
| Paul Turner | 2dac754 | 2012-10-04 13:18:30 +0200 | [diff] [blame] | 1423 |  | 
|  | 1424 | return se->avg.load_avg_contrib - old_contrib; | 
|  | 1425 | } | 
|  | 1426 |  | 
| Paul Turner | 9ee474f | 2012-10-04 13:18:30 +0200 | [diff] [blame] | 1427 | static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq, | 
|  | 1428 | long load_contrib) | 
|  | 1429 | { | 
|  | 1430 | if (likely(load_contrib < cfs_rq->blocked_load_avg)) | 
|  | 1431 | cfs_rq->blocked_load_avg -= load_contrib; | 
|  | 1432 | else | 
|  | 1433 | cfs_rq->blocked_load_avg = 0; | 
|  | 1434 | } | 
|  | 1435 |  | 
| Paul Turner | f1b1728 | 2012-10-04 13:18:31 +0200 | [diff] [blame] | 1436 | static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); | 
|  | 1437 |  | 
| Paul Turner | 9d85f21 | 2012-10-04 13:18:29 +0200 | [diff] [blame] | 1438 | /* Update a sched_entity's runnable average */ | 
| Paul Turner | 9ee474f | 2012-10-04 13:18:30 +0200 | [diff] [blame] | 1439 | static inline void update_entity_load_avg(struct sched_entity *se, | 
|  | 1440 | int update_cfs_rq) | 
| Paul Turner | 9d85f21 | 2012-10-04 13:18:29 +0200 | [diff] [blame] | 1441 | { | 
| Paul Turner | 2dac754 | 2012-10-04 13:18:30 +0200 | [diff] [blame] | 1442 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 
|  | 1443 | long contrib_delta; | 
| Paul Turner | f1b1728 | 2012-10-04 13:18:31 +0200 | [diff] [blame] | 1444 | u64 now; | 
| Paul Turner | 2dac754 | 2012-10-04 13:18:30 +0200 | [diff] [blame] | 1445 |  | 
| Paul Turner | f1b1728 | 2012-10-04 13:18:31 +0200 | [diff] [blame] | 1446 | /* | 
|  | 1447 | * For a group entity we need to use the cfs_rq_clock_task() of its | 
|  | 1448 | * owned cfs_rq, in case it is the parent of a throttled hierarchy. | 
|  | 1449 | */ | 
|  | 1450 | if (entity_is_task(se)) | 
|  | 1451 | now = cfs_rq_clock_task(cfs_rq); | 
|  | 1452 | else | 
|  | 1453 | now = cfs_rq_clock_task(group_cfs_rq(se)); | 
|  | 1454 |  | 
|  | 1455 | if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq)) | 
| Paul Turner | 2dac754 | 2012-10-04 13:18:30 +0200 | [diff] [blame] | 1456 | return; | 
|  | 1457 |  | 
|  | 1458 | contrib_delta = __update_entity_load_avg_contrib(se); | 
| Paul Turner | 9ee474f | 2012-10-04 13:18:30 +0200 | [diff] [blame] | 1459 |  | 
|  | 1460 | if (!update_cfs_rq) | 
|  | 1461 | return; | 
|  | 1462 |  | 
| Paul Turner | 2dac754 | 2012-10-04 13:18:30 +0200 | [diff] [blame] | 1463 | if (se->on_rq) | 
|  | 1464 | cfs_rq->runnable_load_avg += contrib_delta; | 
| Paul Turner | 9ee474f | 2012-10-04 13:18:30 +0200 | [diff] [blame] | 1465 | else | 
|  | 1466 | subtract_blocked_load_contrib(cfs_rq, -contrib_delta); | 
|  | 1467 | } | 
|  | 1468 |  | 
|  | 1469 | /* | 
|  | 1470 | * Decay the load contributed by all blocked children and account this so that | 
|  | 1471 | * their contribution may be appropriately discounted when they wake up. | 
|  | 1472 | */ | 
| Paul Turner | aff3e49 | 2012-10-04 13:18:30 +0200 | [diff] [blame] | 1473 | static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update) | 
| Paul Turner | 9ee474f | 2012-10-04 13:18:30 +0200 | [diff] [blame] | 1474 | { | 
| Paul Turner | f1b1728 | 2012-10-04 13:18:31 +0200 | [diff] [blame] | 1475 | u64 now = cfs_rq_clock_task(cfs_rq) >> 20; | 
| Paul Turner | 9ee474f | 2012-10-04 13:18:30 +0200 | [diff] [blame] | 1476 | u64 decays; | 
|  | 1477 |  | 
|  | 1478 | decays = now - cfs_rq->last_decay; | 
| Paul Turner | aff3e49 | 2012-10-04 13:18:30 +0200 | [diff] [blame] | 1479 | if (!decays && !force_update) | 
| Paul Turner | 9ee474f | 2012-10-04 13:18:30 +0200 | [diff] [blame] | 1480 | return; | 
|  | 1481 |  | 
| Paul Turner | aff3e49 | 2012-10-04 13:18:30 +0200 | [diff] [blame] | 1482 | if (atomic64_read(&cfs_rq->removed_load)) { | 
|  | 1483 | u64 removed_load = atomic64_xchg(&cfs_rq->removed_load, 0); | 
|  | 1484 | subtract_blocked_load_contrib(cfs_rq, removed_load); | 
|  | 1485 | } | 
| Paul Turner | 9ee474f | 2012-10-04 13:18:30 +0200 | [diff] [blame] | 1486 |  | 
| Paul Turner | aff3e49 | 2012-10-04 13:18:30 +0200 | [diff] [blame] | 1487 | if (decays) { | 
|  | 1488 | cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg, | 
|  | 1489 | decays); | 
|  | 1490 | atomic64_add(decays, &cfs_rq->decay_counter); | 
|  | 1491 | cfs_rq->last_decay = now; | 
|  | 1492 | } | 
| Paul Turner | c566e8e | 2012-10-04 13:18:30 +0200 | [diff] [blame] | 1493 |  | 
|  | 1494 | __update_cfs_rq_tg_load_contrib(cfs_rq, force_update); | 
| Paul Turner | 9d85f21 | 2012-10-04 13:18:29 +0200 | [diff] [blame] | 1495 | } | 
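/*
 * Editor's illustration (standalone sketch): blocked load decays by y per
 * ~1ms "decay" (the clock is shifted down by 20 bits above), so it halves
 * every 32 decays, exactly like the per-entity averages.
 */
#include <math.h>
#include <stdio.h>

int main(void)
{
        double blocked_load_avg = 4096.0;       /* left behind by sleepers */
        double y = pow(0.5, 1.0 / 32.0);

        for (int decays = 32; decays <= 96; decays += 32) {
                blocked_load_avg *= pow(y, 32.0);       /* one half-life */
                /* prints 2048, 1024, 512 */
                printf("after %2d decays: %.0f\n", decays, blocked_load_avg);
        }
        return 0;
}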
| Ben Segall | 18bf280 | 2012-10-04 12:51:20 +0200 | [diff] [blame] | 1496 |  | 
|  | 1497 | static inline void update_rq_runnable_avg(struct rq *rq, int runnable) | 
|  | 1498 | { | 
|  | 1499 | __update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable); | 
| Paul Turner | bb17f65 | 2012-10-04 13:18:31 +0200 | [diff] [blame] | 1500 | __update_tg_runnable_avg(&rq->avg, &rq->cfs); | 
| Ben Segall | 18bf280 | 2012-10-04 12:51:20 +0200 | [diff] [blame] | 1501 | } | 
| Paul Turner | 2dac754 | 2012-10-04 13:18:30 +0200 | [diff] [blame] | 1502 |  | 
|  | 1503 | /* Add the load generated by se into cfs_rq's child load-average */ | 
|  | 1504 | static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, | 
| Paul Turner | 9ee474f | 2012-10-04 13:18:30 +0200 | [diff] [blame] | 1505 | struct sched_entity *se, | 
|  | 1506 | int wakeup) | 
| Paul Turner | 2dac754 | 2012-10-04 13:18:30 +0200 | [diff] [blame] | 1507 | { | 
| Paul Turner | aff3e49 | 2012-10-04 13:18:30 +0200 | [diff] [blame] | 1508 | /* | 
|  | 1509 | * We track migrations using entity decay_count <= 0; on a wake-up | 
|  | 1510 | * migration we use a negative decay count to track the remote decays | 
|  | 1511 | * accumulated while sleeping. | 
|  | 1512 | */ | 
|  | 1513 | if (unlikely(se->avg.decay_count <= 0)) { | 
| Paul Turner | 9ee474f | 2012-10-04 13:18:30 +0200 | [diff] [blame] | 1514 | se->avg.last_runnable_update = rq_of(cfs_rq)->clock_task; | 
| Paul Turner | aff3e49 | 2012-10-04 13:18:30 +0200 | [diff] [blame] | 1515 | if (se->avg.decay_count) { | 
|  | 1516 | /* | 
|  | 1517 | * In a wake-up migration we have to approximate the | 
|  | 1518 | * time sleeping.  This is because we can't synchronize | 
|  | 1519 | * clock_task between the two cpus, and it is not | 
|  | 1520 | * guaranteed to be read-safe.  Instead, we can | 
|  | 1521 | * approximate this using our carried decays, which are | 
|  | 1522 | * explicitly atomically readable. | 
|  | 1523 | */ | 
|  | 1524 | se->avg.last_runnable_update -= (-se->avg.decay_count) | 
|  | 1525 | << 20; | 
|  | 1526 | update_entity_load_avg(se, 0); | 
|  | 1527 | /* Indicate that we're now synchronized and on-rq */ | 
|  | 1528 | se->avg.decay_count = 0; | 
|  | 1529 | } | 
| Paul Turner | 9ee474f | 2012-10-04 13:18:30 +0200 | [diff] [blame] | 1530 | wakeup = 0; | 
|  | 1531 | } else { | 
|  | 1532 | __synchronize_entity_decay(se); | 
|  | 1533 | } | 
|  | 1534 |  | 
| Paul Turner | aff3e49 | 2012-10-04 13:18:30 +0200 | [diff] [blame] | 1535 | /* migrated tasks did not contribute to our blocked load */ | 
|  | 1536 | if (wakeup) { | 
| Paul Turner | 9ee474f | 2012-10-04 13:18:30 +0200 | [diff] [blame] | 1537 | subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib); | 
| Paul Turner | aff3e49 | 2012-10-04 13:18:30 +0200 | [diff] [blame] | 1538 | update_entity_load_avg(se, 0); | 
|  | 1539 | } | 
| Paul Turner | 9ee474f | 2012-10-04 13:18:30 +0200 | [diff] [blame] | 1540 |  | 
| Paul Turner | 2dac754 | 2012-10-04 13:18:30 +0200 | [diff] [blame] | 1541 | cfs_rq->runnable_load_avg += se->avg.load_avg_contrib; | 
| Paul Turner | aff3e49 | 2012-10-04 13:18:30 +0200 | [diff] [blame] | 1542 | /* we force update consideration on load-balancer moves */ | 
|  | 1543 | update_cfs_rq_blocked_load(cfs_rq, !wakeup); | 
| Paul Turner | 2dac754 | 2012-10-04 13:18:30 +0200 | [diff] [blame] | 1544 | } | 
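/*
 * Editor's illustration (standalone sketch): the "<< 20" rewind above.
 * Each carried decay stands for one ~1ms period (2^20 ns ~= 1.05ms), so a
 * negative decay_count converts straight back into approximate sleep time
 * in clock_task nanoseconds.
 */
#include <stdio.h>

int main(void)
{
        long long decay_count = -48;    /* periods decayed while asleep */
        unsigned long long sleep_ns = (unsigned long long)(-decay_count) << 20;

        /* prints: approx sleep 50331648 ns (48 periods) */
        printf("approx sleep %llu ns (%llu periods)\n",
               sleep_ns, sleep_ns >> 20);
        return 0;
}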
|  | 1545 |  | 
| Paul Turner | 9ee474f | 2012-10-04 13:18:30 +0200 | [diff] [blame] | 1546 | /* | 
|  | 1547 | * Remove se's load from this cfs_rq child load-average; if the entity is | 
|  | 1548 | * transitioning to a blocked state we track its projected decay using | 
|  | 1549 | * blocked_load_avg. | 
|  | 1550 | */ | 
| Paul Turner | 2dac754 | 2012-10-04 13:18:30 +0200 | [diff] [blame] | 1551 | static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, | 
| Paul Turner | 9ee474f | 2012-10-04 13:18:30 +0200 | [diff] [blame] | 1552 | struct sched_entity *se, | 
|  | 1553 | int sleep) | 
| Paul Turner | 2dac754 | 2012-10-04 13:18:30 +0200 | [diff] [blame] | 1554 | { | 
| Paul Turner | 9ee474f | 2012-10-04 13:18:30 +0200 | [diff] [blame] | 1555 | update_entity_load_avg(se, 1); | 
| Paul Turner | aff3e49 | 2012-10-04 13:18:30 +0200 | [diff] [blame] | 1556 | /* we force update consideration on load-balancer moves */ | 
|  | 1557 | update_cfs_rq_blocked_load(cfs_rq, !sleep); | 
| Paul Turner | 9ee474f | 2012-10-04 13:18:30 +0200 | [diff] [blame] | 1558 |  | 
| Paul Turner | 2dac754 | 2012-10-04 13:18:30 +0200 | [diff] [blame] | 1559 | cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib; | 
| Paul Turner | 9ee474f | 2012-10-04 13:18:30 +0200 | [diff] [blame] | 1560 | if (sleep) { | 
|  | 1561 | cfs_rq->blocked_load_avg += se->avg.load_avg_contrib; | 
|  | 1562 | se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); | 
|  | 1563 | } /* migrations, e.g. sleep=0 leave decay_count == 0 */ | 
| Paul Turner | 2dac754 | 2012-10-04 13:18:30 +0200 | [diff] [blame] | 1564 | } | 
| Paul Turner | 9d85f21 | 2012-10-04 13:18:29 +0200 | [diff] [blame] | 1565 | #else | 
| Paul Turner | 9ee474f | 2012-10-04 13:18:30 +0200 | [diff] [blame] | 1566 | static inline void update_entity_load_avg(struct sched_entity *se, | 
|  | 1567 | int update_cfs_rq) {} | 
| Ben Segall | 18bf280 | 2012-10-04 12:51:20 +0200 | [diff] [blame] | 1568 | static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {} | 
| Paul Turner | 2dac754 | 2012-10-04 13:18:30 +0200 | [diff] [blame] | 1569 | static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, | 
| Paul Turner | 9ee474f | 2012-10-04 13:18:30 +0200 | [diff] [blame] | 1570 | struct sched_entity *se, | 
|  | 1571 | int wakeup) {} | 
| Paul Turner | 2dac754 | 2012-10-04 13:18:30 +0200 | [diff] [blame] | 1572 | static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, | 
| Paul Turner | 9ee474f | 2012-10-04 13:18:30 +0200 | [diff] [blame] | 1573 | struct sched_entity *se, | 
|  | 1574 | int sleep) {} | 
| Paul Turner | aff3e49 | 2012-10-04 13:18:30 +0200 | [diff] [blame] | 1575 | static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, | 
|  | 1576 | int force_update) {} | 
| Paul Turner | 9d85f21 | 2012-10-04 13:18:29 +0200 | [diff] [blame] | 1577 | #endif | 
|  | 1578 |  | 
| Ingo Molnar | 2396af6 | 2007-08-09 11:16:48 +0200 | [diff] [blame] | 1579 | static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) | 
| Ingo Molnar | bf0f6f2 | 2007-07-09 18:51:58 +0200 | [diff] [blame] | 1580 | { | 
| Ingo Molnar | bf0f6f2 | 2007-07-09 18:51:58 +0200 | [diff] [blame] | 1581 | #ifdef CONFIG_SCHEDSTATS | 
| Peter Zijlstra | e414314 | 2009-07-23 20:13:26 +0200 | [diff] [blame] | 1582 | struct task_struct *tsk = NULL; | 
|  | 1583 |  | 
|  | 1584 | if (entity_is_task(se)) | 
|  | 1585 | tsk = task_of(se); | 
|  | 1586 |  | 
| Lucas De Marchi | 41acab8 | 2010-03-10 23:37:45 -0300 | [diff] [blame] | 1587 | if (se->statistics.sleep_start) { | 
|  | 1588 | u64 delta = rq_of(cfs_rq)->clock - se->statistics.sleep_start; | 
| Ingo Molnar | bf0f6f2 | 2007-07-09 18:51:58 +0200 | [diff] [blame] | 1589 |  | 
|  | 1590 | if ((s64)delta < 0) | 
|  | 1591 | delta = 0; | 
|  | 1592 |  | 
| Lucas De Marchi | 41acab8 | 2010-03-10 23:37:45 -0300 | [diff] [blame] | 1593 | if (unlikely(delta > se->statistics.sleep_max)) | 
|  | 1594 | se->statistics.sleep_max = delta; | 
| Ingo Molnar | bf0f6f2 | 2007-07-09 18:51:58 +0200 | [diff] [blame] | 1595 |  | 
| Peter Zijlstra | 8c79a04 | 2012-01-30 14:51:37 +0100 | [diff] [blame] | 1596 | se->statistics.sleep_start = 0; | 
| Lucas De Marchi | 41acab8 | 2010-03-10 23:37:45 -0300 | [diff] [blame] | 1597 | se->statistics.sum_sleep_runtime += delta; | 
| Arjan van de Ven | 9745512 | 2008-01-25 21:08:34 +0100 | [diff] [blame] | 1598 |  | 
| Peter Zijlstra | 768d0c2 | 2009-07-23 20:13:26 +0200 | [diff] [blame] | 1599 | if (tsk) { | 
| Peter Zijlstra | e414314 | 2009-07-23 20:13:26 +0200 | [diff] [blame] | 1600 | account_scheduler_latency(tsk, delta >> 10, 1); | 
| Peter Zijlstra | 768d0c2 | 2009-07-23 20:13:26 +0200 | [diff] [blame] | 1601 | trace_sched_stat_sleep(tsk, delta); | 
|  | 1602 | } | 
| Ingo Molnar | bf0f6f2 | 2007-07-09 18:51:58 +0200 | [diff] [blame] | 1603 | } | 
| Lucas De Marchi | 41acab8 | 2010-03-10 23:37:45 -0300 | [diff] [blame] | 1604 | if (se->statistics.block_start) { | 
|  | 1605 | u64 delta = rq_of(cfs_rq)->clock - se->statistics.block_start; | 
| Ingo Molnar | bf0f6f2 | 2007-07-09 18:51:58 +0200 | [diff] [blame] | 1606 |  | 
|  | 1607 | if ((s64)delta < 0) | 
|  | 1608 | delta = 0; | 
|  | 1609 |  | 
| Lucas De Marchi | 41acab8 | 2010-03-10 23:37:45 -0300 | [diff] [blame] | 1610 | if (unlikely(delta > se->statistics.block_max)) | 
|  | 1611 | se->statistics.block_max = delta; | 
| Ingo Molnar | bf0f6f2 | 2007-07-09 18:51:58 +0200 | [diff] [blame] | 1612 |  | 
| Peter Zijlstra | 8c79a04 | 2012-01-30 14:51:37 +0100 | [diff] [blame] | 1613 | se->statistics.block_start = 0; | 
| Lucas De Marchi | 41acab8 | 2010-03-10 23:37:45 -0300 | [diff] [blame] | 1614 | se->statistics.sum_sleep_runtime += delta; | 
| Ingo Molnar | 30084fb | 2007-10-02 14:13:08 +0200 | [diff] [blame] | 1615 |  | 
| Peter Zijlstra | e414314 | 2009-07-23 20:13:26 +0200 | [diff] [blame] | 1616 | if (tsk) { | 
| Arjan van de Ven | 8f0dfc3 | 2009-07-20 11:26:58 -0700 | [diff] [blame] | 1617 | if (tsk->in_iowait) { | 
| Lucas De Marchi | 41acab8 | 2010-03-10 23:37:45 -0300 | [diff] [blame] | 1618 | se->statistics.iowait_sum += delta; | 
|  | 1619 | se->statistics.iowait_count++; | 
| Peter Zijlstra | 768d0c2 | 2009-07-23 20:13:26 +0200 | [diff] [blame] | 1620 | trace_sched_stat_iowait(tsk, delta); | 
| Arjan van de Ven | 8f0dfc3 | 2009-07-20 11:26:58 -0700 | [diff] [blame] | 1621 | } | 
|  | 1622 |  | 
| Andrew Vagin | b781a60 | 2011-11-28 12:03:35 +0300 | [diff] [blame] | 1623 | trace_sched_stat_blocked(tsk, delta); | 
|  | 1624 |  | 
| Peter Zijlstra | e414314 | 2009-07-23 20:13:26 +0200 | [diff] [blame] | 1625 | /* | 
|  | 1626 | * Blocking time is in units of nanosecs, so shift by | 
|  | 1627 | * 20 to get a milliseconds-range estimation of the | 
|  | 1628 | * amount of time that the task spent sleeping: | 
|  | 1629 | */ | 
|  | 1630 | if (unlikely(prof_on == SLEEP_PROFILING)) { | 
|  | 1631 | profile_hits(SLEEP_PROFILING, | 
|  | 1632 | (void *)get_wchan(tsk), | 
|  | 1633 | delta >> 20); | 
|  | 1634 | } | 
|  | 1635 | account_scheduler_latency(tsk, delta >> 10, 0); | 
| Ingo Molnar | 30084fb | 2007-10-02 14:13:08 +0200 | [diff] [blame] | 1636 | } | 
| Ingo Molnar | bf0f6f2 | 2007-07-09 18:51:58 +0200 | [diff] [blame] | 1637 | } | 
|  | 1638 | #endif | 
|  | 1639 | } | 
|  | 1640 |  | 
| Peter Zijlstra | ddc9729 | 2007-10-15 17:00:10 +0200 | [diff] [blame] | 1641 | static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) | 
|  | 1642 | { | 
|  | 1643 | #ifdef CONFIG_SCHED_DEBUG | 
|  | 1644 | s64 d = se->vruntime - cfs_rq->min_vruntime; | 
|  | 1645 |  | 
|  | 1646 | if (d < 0) | 
|  | 1647 | d = -d; | 
|  | 1648 |  | 
|  | 1649 | if (d > 3*sysctl_sched_latency) | 
|  | 1650 | schedstat_inc(cfs_rq, nr_spread_over); | 
|  | 1651 | #endif | 
|  | 1652 | } | 
|  | 1653 |  | 
| Ingo Molnar | bf0f6f2 | 2007-07-09 18:51:58 +0200 | [diff] [blame] | 1654 | static void | 
| Peter Zijlstra | aeb73b0 | 2007-10-15 17:00:05 +0200 | [diff] [blame] | 1655 | place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) | 
|  | 1656 | { | 
| Peter Zijlstra | 1af5f73 | 2008-10-24 11:06:13 +0200 | [diff] [blame] | 1657 | u64 vruntime = cfs_rq->min_vruntime; | 
| Peter Zijlstra | 94dfb5e | 2007-10-15 17:00:05 +0200 | [diff] [blame] | 1658 |  | 
| Peter Zijlstra | 2cb8600 | 2007-11-09 22:39:37 +0100 | [diff] [blame] | 1659 | /* | 
|  | 1660 | * The 'current' period is already promised to the current tasks; | 
|  | 1661 | * however, the extra weight of the new task will slow them down a | 
|  | 1662 | * little, so place the new task such that it fits in the slot that | 
|  | 1663 | * stays open at the end. | 
|  | 1664 | */ | 
| Peter Zijlstra | 94dfb5e | 2007-10-15 17:00:05 +0200 | [diff] [blame] | 1665 | if (initial && sched_feat(START_DEBIT)) | 
| Peter Zijlstra | f9c0b09 | 2008-10-17 19:27:04 +0200 | [diff] [blame] | 1666 | vruntime += sched_vslice(cfs_rq, se); | 
| Peter Zijlstra | aeb73b0 | 2007-10-15 17:00:05 +0200 | [diff] [blame] | 1667 |  | 
| Mike Galbraith | a2e7a7e | 2009-09-18 09:19:25 +0200 | [diff] [blame] | 1668 | /* sleeps up to a single latency don't count. */ | 
| Mike Galbraith | 5ca9880 | 2010-03-11 17:17:17 +0100 | [diff] [blame] | 1669 | if (!initial) { | 
| Mike Galbraith | a2e7a7e | 2009-09-18 09:19:25 +0200 | [diff] [blame] | 1670 | unsigned long thresh = sysctl_sched_latency; | 
| Peter Zijlstra | a7be37a | 2008-06-27 13:41:11 +0200 | [diff] [blame] | 1671 |  | 
| Mike Galbraith | a2e7a7e | 2009-09-18 09:19:25 +0200 | [diff] [blame] | 1672 | /* | 
| Mike Galbraith | a2e7a7e | 2009-09-18 09:19:25 +0200 | [diff] [blame] | 1673 | * Halve the sleep time's effect, to allow | 
|  | 1674 | * for a gentler impact from sleepers: | 
|  | 1675 | */ | 
|  | 1676 | if (sched_feat(GENTLE_FAIR_SLEEPERS)) | 
|  | 1677 | thresh >>= 1; | 
| Ingo Molnar | 51e0304 | 2009-09-16 08:54:45 +0200 | [diff] [blame] | 1678 |  | 
| Mike Galbraith | a2e7a7e | 2009-09-18 09:19:25 +0200 | [diff] [blame] | 1679 | vruntime -= thresh; | 
| Peter Zijlstra | aeb73b0 | 2007-10-15 17:00:05 +0200 | [diff] [blame] | 1680 | } | 
|  | 1681 |  | 
| Mike Galbraith | b5d9d73 | 2009-09-08 11:12:28 +0200 | [diff] [blame] | 1682 | /* ensure we never gain time by being placed backwards. */ | 
| Viresh Kumar | 16c8f1c | 2012-11-08 13:33:46 +0530 | [diff] [blame] | 1683 | se->vruntime = max_vruntime(se->vruntime, vruntime); | 
| Peter Zijlstra | aeb73b0 | 2007-10-15 17:00:05 +0200 | [diff] [blame] | 1684 | } | 
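/*
 * Editor's illustration (standalone sketch): wakeup placement with a 6ms
 * sysctl_sched_latency (an assumed default) and GENTLE_FAIR_SLEEPERS. The
 * final max() keeps a briefly-sleeping task from gaining extra credit.
 */
#include <stdio.h>

int main(void)
{
        unsigned long long min_vruntime = 100000000ULL; /* 100ms, in ns */
        unsigned long long se_vruntime = 98000000ULL;   /* vruntime at sleep */
        unsigned long thresh = 6000000UL;               /* sched_latency */
        unsigned long long vruntime;

        thresh >>= 1;                           /* GENTLE_FAIR_SLEEPERS */
        vruntime = min_vruntime - thresh;       /* nominal 3ms of credit */

        /* max_vruntime(): never gain time by being placed backwards */
        if (vruntime < se_vruntime)
                vruntime = se_vruntime;
        /* prints: placed at 98000000, effective credit 2000000 ns */
        printf("placed at %llu, effective credit %llu ns\n",
               vruntime, min_vruntime - vruntime);
        return 0;
}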
|  | 1685 |  | 
| Paul Turner | d3d9dc3 | 2011-07-21 09:43:39 -0700 | [diff] [blame] | 1686 | static void check_enqueue_throttle(struct cfs_rq *cfs_rq); | 
|  | 1687 |  | 
| Peter Zijlstra | aeb73b0 | 2007-10-15 17:00:05 +0200 | [diff] [blame] | 1688 | static void | 
| Peter Zijlstra | 88ec22d | 2009-12-16 18:04:41 +0100 | [diff] [blame] | 1689 | enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | 
| Ingo Molnar | bf0f6f2 | 2007-07-09 18:51:58 +0200 | [diff] [blame] | 1690 | { | 
|  | 1691 | /* | 
| Peter Zijlstra | 88ec22d | 2009-12-16 18:04:41 +0100 | [diff] [blame] | 1692 | * Update the normalized vruntime before updating min_vruntime | 
|  | 1693 | * by calling update_curr(). | 
|  | 1694 | */ | 
| Peter Zijlstra | 371fd7e | 2010-03-24 16:38:48 +0100 | [diff] [blame] | 1695 | if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING)) | 
| Peter Zijlstra | 88ec22d | 2009-12-16 18:04:41 +0100 | [diff] [blame] | 1696 | se->vruntime += cfs_rq->min_vruntime; | 
|  | 1697 |  | 
|  | 1698 | /* | 
| Dmitry Adamushko | a2a2d68 | 2007-10-15 17:00:13 +0200 | [diff] [blame] | 1699 | * Update run-time statistics of the 'current'. | 
| Ingo Molnar | bf0f6f2 | 2007-07-09 18:51:58 +0200 | [diff] [blame] | 1700 | */ | 
| Ingo Molnar | b7cc089 | 2007-08-09 11:16:47 +0200 | [diff] [blame] | 1701 | update_curr(cfs_rq); | 
| Paul Turner | f269ae0 | 2012-10-04 13:18:31 +0200 | [diff] [blame] | 1702 | enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP); | 
| Linus Torvalds | 17bc14b | 2012-12-14 07:20:43 -0800 | [diff] [blame] | 1703 | account_entity_enqueue(cfs_rq, se); | 
|  | 1704 | update_cfs_shares(cfs_rq); | 
| Ingo Molnar | bf0f6f2 | 2007-07-09 18:51:58 +0200 | [diff] [blame] | 1705 |  | 
| Peter Zijlstra | 88ec22d | 2009-12-16 18:04:41 +0100 | [diff] [blame] | 1706 | if (flags & ENQUEUE_WAKEUP) { | 
| Peter Zijlstra | aeb73b0 | 2007-10-15 17:00:05 +0200 | [diff] [blame] | 1707 | place_entity(cfs_rq, se, 0); | 
| Ingo Molnar | 2396af6 | 2007-08-09 11:16:48 +0200 | [diff] [blame] | 1708 | enqueue_sleeper(cfs_rq, se); | 
| Ingo Molnar | e9acbff | 2007-10-15 17:00:04 +0200 | [diff] [blame] | 1709 | } | 
| Ingo Molnar | bf0f6f2 | 2007-07-09 18:51:58 +0200 | [diff] [blame] | 1710 |  | 
| Ingo Molnar | d2417e5 | 2007-08-09 11:16:47 +0200 | [diff] [blame] | 1711 | update_stats_enqueue(cfs_rq, se); | 
| Peter Zijlstra | ddc9729 | 2007-10-15 17:00:10 +0200 | [diff] [blame] | 1712 | check_spread(cfs_rq, se); | 
| Srivatsa Vaddagiri | 83b699e | 2007-10-15 17:00:08 +0200 | [diff] [blame] | 1713 | if (se != cfs_rq->curr) | 
|  | 1714 | __enqueue_entity(cfs_rq, se); | 
| Peter Zijlstra | 2069dd7 | 2010-11-15 15:47:00 -0800 | [diff] [blame] | 1715 | se->on_rq = 1; | 
| Peter Zijlstra | 3d4b47b | 2010-11-15 15:47:01 -0800 | [diff] [blame] | 1716 |  | 
| Paul Turner | d3d9dc3 | 2011-07-21 09:43:39 -0700 | [diff] [blame] | 1717 | if (cfs_rq->nr_running == 1) { | 
| Peter Zijlstra | 3d4b47b | 2010-11-15 15:47:01 -0800 | [diff] [blame] | 1718 | list_add_leaf_cfs_rq(cfs_rq); | 
| Paul Turner | d3d9dc3 | 2011-07-21 09:43:39 -0700 | [diff] [blame] | 1719 | check_enqueue_throttle(cfs_rq); | 
|  | 1720 | } | 
| Ingo Molnar | bf0f6f2 | 2007-07-09 18:51:58 +0200 | [diff] [blame] | 1721 | } | 
|  | 1722 |  | 
| Rik van Riel | 2c13c919 | 2011-02-01 09:48:37 -0500 | [diff] [blame] | 1723 | static void __clear_buddies_last(struct sched_entity *se) | 
| Peter Zijlstra | 2002c69 | 2008-11-11 11:52:33 +0100 | [diff] [blame] | 1724 | { | 
| Rik van Riel | 2c13c919 | 2011-02-01 09:48:37 -0500 | [diff] [blame] | 1725 | for_each_sched_entity(se) { | 
|  | 1726 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 
|  | 1727 | if (cfs_rq->last == se) | 
|  | 1728 | cfs_rq->last = NULL; | 
|  | 1729 | else | 
|  | 1730 | break; | 
|  | 1731 | } | 
|  | 1732 | } | 
| Peter Zijlstra | 2002c69 | 2008-11-11 11:52:33 +0100 | [diff] [blame] | 1733 |  | 
| Rik van Riel | 2c13c919 | 2011-02-01 09:48:37 -0500 | [diff] [blame] | 1734 | static void __clear_buddies_next(struct sched_entity *se) | 
|  | 1735 | { | 
|  | 1736 | for_each_sched_entity(se) { | 
|  | 1737 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 
|  | 1738 | if (cfs_rq->next == se) | 
|  | 1739 | cfs_rq->next = NULL; | 
|  | 1740 | else | 
|  | 1741 | break; | 
|  | 1742 | } | 
| Peter Zijlstra | 2002c69 | 2008-11-11 11:52:33 +0100 | [diff] [blame] | 1743 | } | 
|  | 1744 |  | 
| Rik van Riel | ac53db5 | 2011-02-01 09:51:03 -0500 | [diff] [blame] | 1745 | static void __clear_buddies_skip(struct sched_entity *se) | 
|  | 1746 | { | 
|  | 1747 | for_each_sched_entity(se) { | 
|  | 1748 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 
|  | 1749 | if (cfs_rq->skip == se) | 
|  | 1750 | cfs_rq->skip = NULL; | 
|  | 1751 | else | 
|  | 1752 | break; | 
|  | 1753 | } | 
|  | 1754 | } | 
|  | 1755 |  | 
| Peter Zijlstra | a571bbe | 2009-01-28 14:51:40 +0100 | [diff] [blame] | 1756 | static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) | 
|  | 1757 | { | 
| Rik van Riel | 2c13c919 | 2011-02-01 09:48:37 -0500 | [diff] [blame] | 1758 | if (cfs_rq->last == se) | 
|  | 1759 | __clear_buddies_last(se); | 
|  | 1760 |  | 
|  | 1761 | if (cfs_rq->next == se) | 
|  | 1762 | __clear_buddies_next(se); | 
| Rik van Riel | ac53db5 | 2011-02-01 09:51:03 -0500 | [diff] [blame] | 1763 |  | 
|  | 1764 | if (cfs_rq->skip == se) | 
|  | 1765 | __clear_buddies_skip(se); | 
| Peter Zijlstra | a571bbe | 2009-01-28 14:51:40 +0100 | [diff] [blame] | 1766 | } | 
|  | 1767 |  | 
| Peter Zijlstra | 6c16a6d | 2012-03-21 13:07:16 -0700 | [diff] [blame] | 1768 | static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq); | 
| Paul Turner | d8b4986 | 2011-07-21 09:43:41 -0700 | [diff] [blame] | 1769 |  | 
| Ingo Molnar | bf0f6f2 | 2007-07-09 18:51:58 +0200 | [diff] [blame] | 1770 | static void | 
| Peter Zijlstra | 371fd7e | 2010-03-24 16:38:48 +0100 | [diff] [blame] | 1771 | dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | 
| Ingo Molnar | bf0f6f2 | 2007-07-09 18:51:58 +0200 | [diff] [blame] | 1772 | { | 
| Dmitry Adamushko | a2a2d68 | 2007-10-15 17:00:13 +0200 | [diff] [blame] | 1773 | /* | 
|  | 1774 | * Update run-time statistics of the 'current'. | 
|  | 1775 | */ | 
|  | 1776 | update_curr(cfs_rq); | 
| Linus Torvalds | 17bc14b | 2012-12-14 07:20:43 -0800 | [diff] [blame] | 1777 | dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP); | 
| Dmitry Adamushko | a2a2d68 | 2007-10-15 17:00:13 +0200 | [diff] [blame] | 1778 |  | 
| Ingo Molnar | 19b6a2e | 2007-08-09 11:16:48 +0200 | [diff] [blame] | 1779 | update_stats_dequeue(cfs_rq, se); | 
| Peter Zijlstra | 371fd7e | 2010-03-24 16:38:48 +0100 | [diff] [blame] | 1780 | if (flags & DEQUEUE_SLEEP) { | 
| Peter Zijlstra | 67e9fb2 | 2007-10-15 17:00:10 +0200 | [diff] [blame] | 1781 | #ifdef CONFIG_SCHEDSTATS | 
| Ingo Molnar | bf0f6f2 | 2007-07-09 18:51:58 +0200 | [diff] [blame] | 1782 | if (entity_is_task(se)) { | 
|  | 1783 | struct task_struct *tsk = task_of(se); | 
|  | 1784 |  | 
|  | 1785 | if (tsk->state & TASK_INTERRUPTIBLE) | 
| Lucas De Marchi | 41acab8 | 2010-03-10 23:37:45 -0300 | [diff] [blame] | 1786 | se->statistics.sleep_start = rq_of(cfs_rq)->clock; | 
| Ingo Molnar | bf0f6f2 | 2007-07-09 18:51:58 +0200 | [diff] [blame] | 1787 | if (tsk->state & TASK_UNINTERRUPTIBLE) | 
| Lucas De Marchi | 41acab8 | 2010-03-10 23:37:45 -0300 | [diff] [blame] | 1788 | se->statistics.block_start = rq_of(cfs_rq)->clock; | 
| Ingo Molnar | bf0f6f2 | 2007-07-09 18:51:58 +0200 | [diff] [blame] | 1789 | } | 
| Dmitry Adamushko | db36cc7 | 2007-10-15 17:00:06 +0200 | [diff] [blame] | 1790 | #endif | 
| Peter Zijlstra | 67e9fb2 | 2007-10-15 17:00:10 +0200 | [diff] [blame] | 1791 | } | 
|  | 1792 |  | 
| Peter Zijlstra | 2002c69 | 2008-11-11 11:52:33 +0100 | [diff] [blame] | 1793 | clear_buddies(cfs_rq, se); | 
| Peter Zijlstra | 4793241 | 2008-11-04 21:25:09 +0100 | [diff] [blame] | 1794 |  | 
| Srivatsa Vaddagiri | 83b699e | 2007-10-15 17:00:08 +0200 | [diff] [blame] | 1795 | if (se != cfs_rq->curr) | 
| Dmitry Adamushko | 30cfdcf | 2007-10-15 17:00:07 +0200 | [diff] [blame] | 1796 | __dequeue_entity(cfs_rq, se); | 
| Linus Torvalds | 17bc14b | 2012-12-14 07:20:43 -0800 | [diff] [blame] | 1797 | se->on_rq = 0; | 
| Dmitry Adamushko | 30cfdcf | 2007-10-15 17:00:07 +0200 | [diff] [blame] | 1798 | account_entity_dequeue(cfs_rq, se); | 
| Peter Zijlstra | 88ec22d | 2009-12-16 18:04:41 +0100 | [diff] [blame] | 1799 |  | 
|  | 1800 | /* | 
|  | 1801 | * Normalize the entity after updating the min_vruntime because the | 
|  | 1802 | * update can refer to the ->curr item and we need to reflect this | 
|  | 1803 | * movement in our normalized position. | 
|  | 1804 | */ | 
| Peter Zijlstra | 371fd7e | 2010-03-24 16:38:48 +0100 | [diff] [blame] | 1805 | if (!(flags & DEQUEUE_SLEEP)) | 
| Peter Zijlstra | 88ec22d | 2009-12-16 18:04:41 +0100 | [diff] [blame] | 1806 | se->vruntime -= cfs_rq->min_vruntime; | 
| Peter Zijlstra | 1e87623 | 2011-05-17 16:21:10 -0700 | [diff] [blame] | 1807 |  | 
| Paul Turner | d8b4986 | 2011-07-21 09:43:41 -0700 | [diff] [blame] | 1808 | /* return excess runtime on last dequeue */ | 
|  | 1809 | return_cfs_rq_runtime(cfs_rq); | 
|  | 1810 |  | 
| Peter Zijlstra | 1e87623 | 2011-05-17 16:21:10 -0700 | [diff] [blame] | 1811 | update_min_vruntime(cfs_rq); | 
| Linus Torvalds | 17bc14b | 2012-12-14 07:20:43 -0800 | [diff] [blame] | 1812 | update_cfs_shares(cfs_rq); | 
| Ingo Molnar | bf0f6f2 | 2007-07-09 18:51:58 +0200 | [diff] [blame] | 1813 | } | 
|  | 1814 |  | 
|  | 1815 | /* | 
|  | 1816 | * Preempt the current task with a newly woken task if needed: | 
|  | 1817 | */ | 
| Peter Zijlstra | 7c92e54 | 2007-09-05 14:32:49 +0200 | [diff] [blame] | 1818 | static void | 
| Ingo Molnar | 2e09bf5 | 2007-10-15 17:00:05 +0200 | [diff] [blame] | 1819 | check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) | 
| Ingo Molnar | bf0f6f2 | 2007-07-09 18:51:58 +0200 | [diff] [blame] | 1820 | { | 
| Peter Zijlstra | 1169783 | 2007-09-05 14:32:49 +0200 | [diff] [blame] | 1821 | unsigned long ideal_runtime, delta_exec; | 
| Wang Xingchao | f4cfb33 | 2011-09-16 13:35:52 -0400 | [diff] [blame] | 1822 | struct sched_entity *se; | 
|  | 1823 | s64 delta; | 
| Peter Zijlstra | 1169783 | 2007-09-05 14:32:49 +0200 | [diff] [blame] | 1824 |  | 
| Peter Zijlstra | 6d0f0eb | 2007-10-15 17:00:05 +0200 | [diff] [blame] | 1825 | ideal_runtime = sched_slice(cfs_rq, curr); | 
| Peter Zijlstra | 1169783 | 2007-09-05 14:32:49 +0200 | [diff] [blame] | 1826 | delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; | 
| Mike Galbraith | a9f3e2b | 2009-01-28 14:51:39 +0100 | [diff] [blame] | 1827 | if (delta_exec > ideal_runtime) { | 
| Ingo Molnar | bf0f6f2 | 2007-07-09 18:51:58 +0200 | [diff] [blame] | 1828 | resched_task(rq_of(cfs_rq)->curr); | 
| Mike Galbraith | a9f3e2b | 2009-01-28 14:51:39 +0100 | [diff] [blame] | 1829 | /* | 
|  | 1830 | * The current task ran long enough; ensure it doesn't get | 
|  | 1831 | * re-elected due to buddy favours. | 
|  | 1832 | */ | 
|  | 1833 | clear_buddies(cfs_rq, curr); | 
| Mike Galbraith | f685cea | 2009-10-23 23:09:22 +0200 | [diff] [blame] | 1834 | return; | 
|  | 1835 | } | 
|  | 1836 |  | 
|  | 1837 | /* | 
|  | 1838 | * Ensure that a task that missed wakeup preemption by a | 
|  | 1839 | * narrow margin doesn't have to wait for a full slice. | 
|  | 1840 | * This also mitigates buddy induced latencies under load. | 
|  | 1841 | */ | 
| Mike Galbraith | f685cea | 2009-10-23 23:09:22 +0200 | [diff] [blame] | 1842 | if (delta_exec < sysctl_sched_min_granularity) | 
|  | 1843 | return; | 
|  | 1844 |  | 
| Wang Xingchao | f4cfb33 | 2011-09-16 13:35:52 -0400 | [diff] [blame] | 1845 | se = __pick_first_entity(cfs_rq); | 
|  | 1846 | delta = curr->vruntime - se->vruntime; | 
| Mike Galbraith | f685cea | 2009-10-23 23:09:22 +0200 | [diff] [blame] | 1847 |  | 
| Wang Xingchao | f4cfb33 | 2011-09-16 13:35:52 -0400 | [diff] [blame] | 1848 | if (delta < 0) | 
|  | 1849 | return; | 
| Mike Galbraith | d7d8294 | 2011-01-05 05:41:17 +0100 | [diff] [blame] | 1850 |  | 
| Wang Xingchao | f4cfb33 | 2011-09-16 13:35:52 -0400 | [diff] [blame] | 1851 | if (delta > ideal_runtime) | 
|  | 1852 | resched_task(rq_of(cfs_rq)->curr); | 
| Ingo Molnar | bf0f6f2 | 2007-07-09 18:51:58 +0200 | [diff] [blame] | 1853 | } | 
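/*
 * Editor's illustration (standalone sketch): the three checks above as a
 * single decision function, fed with made-up nanosecond values. ideal_runtime
 * plays a double role, bounding both wall-clock runtime and the vruntime gap
 * to the leftmost task.
 */
#include <stdio.h>

static const char *tick_decision(long long delta_exec, long long ideal,
                                 long long min_gran, long long vdiff)
{
        if (delta_exec > ideal)
                return "resched: slice used up";
        if (delta_exec < min_gran)
                return "run on: below min granularity";
        if (vdiff > ideal)
                return "resched: too far ahead of leftmost task";
        return "run on";
}

int main(void)
{
        /* ran 3ms of a 4ms slice, 0.75ms granularity, 5ms vruntime lead */
        printf("%s\n", tick_decision(3000000, 4000000, 750000, 5000000));
        return 0;
}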
|  | 1854 |  | 
| Srivatsa Vaddagiri | 83b699e | 2007-10-15 17:00:08 +0200 | [diff] [blame] | 1855 | static void | 
| Ingo Molnar | 8494f41 | 2007-08-09 11:16:48 +0200 | [diff] [blame] | 1856 | set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) | 
| Ingo Molnar | bf0f6f2 | 2007-07-09 18:51:58 +0200 | [diff] [blame] | 1857 | { | 
| Srivatsa Vaddagiri | 83b699e | 2007-10-15 17:00:08 +0200 | [diff] [blame] | 1858 | /* 'current' is not kept within the tree. */ | 
|  | 1859 | if (se->on_rq) { | 
|  | 1860 | /* | 
|  | 1861 | * Any task has to be enqueued before it gets to execute on | 
|  | 1862 | * a CPU. So account for the time it spent waiting on the | 
|  | 1863 | * runqueue. | 
|  | 1864 | */ | 
|  | 1865 | update_stats_wait_end(cfs_rq, se); | 
|  | 1866 | __dequeue_entity(cfs_rq, se); | 
|  | 1867 | } | 
|  | 1868 |  | 
| Ingo Molnar | 79303e9 | 2007-08-09 11:16:47 +0200 | [diff] [blame] | 1869 | update_stats_curr_start(cfs_rq, se); | 
| Ingo Molnar | 429d43b | 2007-10-15 17:00:03 +0200 | [diff] [blame] | 1870 | cfs_rq->curr = se; | 
| Ingo Molnar | eba1ed4 | 2007-10-15 17:00:02 +0200 | [diff] [blame] | 1871 | #ifdef CONFIG_SCHEDSTATS | 
|  | 1872 | /* | 
|  | 1873 | * Track our maximum slice length, if the CPU's load is at | 
|  | 1874 | * least twice that of our own weight (i.e. don't track it | 
|  | 1875 | * when there are only lesser-weight tasks around): | 
|  | 1876 | */ | 
| Dmitry Adamushko | 495eca4 | 2007-10-15 17:00:06 +0200 | [diff] [blame] | 1877 | if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { | 
| Lucas De Marchi | 41acab8 | 2010-03-10 23:37:45 -0300 | [diff] [blame] | 1878 | se->statistics.slice_max = max(se->statistics.slice_max, | 
| Ingo Molnar | eba1ed4 | 2007-10-15 17:00:02 +0200 | [diff] [blame] | 1879 | se->sum_exec_runtime - se->prev_sum_exec_runtime); | 
|  | 1880 | } | 
|  | 1881 | #endif | 
| Peter Zijlstra | 4a55b45 | 2007-09-05 14:32:49 +0200 | [diff] [blame] | 1882 | se->prev_sum_exec_runtime = se->sum_exec_runtime; | 
| Ingo Molnar | bf0f6f2 | 2007-07-09 18:51:58 +0200 | [diff] [blame] | 1883 | } | 
|  | 1884 |  | 
| Peter Zijlstra | 3f3a490 | 2008-10-24 11:06:16 +0200 | [diff] [blame] | 1885 | static int | 
|  | 1886 | wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); | 
|  | 1887 |  | 
| Rik van Riel | ac53db5 | 2011-02-01 09:51:03 -0500 | [diff] [blame] | 1888 | /* | 
|  | 1889 | * Pick the next process, keeping these things in mind, in this order: | 
|  | 1890 | * 1) keep things fair between processes/task groups | 
|  | 1891 | * 2) pick the "next" process, since someone really wants that to run | 
|  | 1892 | * 3) pick the "last" process, for cache locality | 
|  | 1893 | * 4) do not run the "skip" process, if something else is available | 
|  | 1894 | */ | 
| Peter Zijlstra | f4b6755 | 2008-11-04 21:25:07 +0100 | [diff] [blame] | 1895 | static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) | 
| Peter Zijlstra | aa2ac25 | 2008-03-14 21:12:12 +0100 | [diff] [blame] | 1896 | { | 
| Rik van Riel | ac53db5 | 2011-02-01 09:51:03 -0500 | [diff] [blame] | 1897 | struct sched_entity *se = __pick_first_entity(cfs_rq); | 
| Mike Galbraith | f685cea | 2009-10-23 23:09:22 +0200 | [diff] [blame] | 1898 | struct sched_entity *left = se; | 
| Peter Zijlstra | f4b6755 | 2008-11-04 21:25:07 +0100 | [diff] [blame] | 1899 |  | 
| Rik van Riel | ac53db5 | 2011-02-01 09:51:03 -0500 | [diff] [blame] | 1900 | /* | 
|  | 1901 | * Avoid running the skip buddy, if running something else can | 
|  | 1902 | * be done without getting too unfair. | 
|  | 1903 | */ | 
|  | 1904 | if (cfs_rq->skip == se) { | 
|  | 1905 | struct sched_entity *second = __pick_next_entity(se); | 
|  | 1906 | if (second && wakeup_preempt_entity(second, left) < 1) | 
|  | 1907 | se = second; | 
|  | 1908 | } | 
| Peter Zijlstra | aa2ac25 | 2008-03-14 21:12:12 +0100 | [diff] [blame] | 1909 |  | 
| Mike Galbraith | f685cea | 2009-10-23 23:09:22 +0200 | [diff] [blame] | 1910 | /* | 
|  | 1911 | * Prefer last buddy, try to return the CPU to a preempted task. | 
|  | 1912 | */ | 
|  | 1913 | if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) | 
|  | 1914 | se = cfs_rq->last; | 
|  | 1915 |  | 
| Rik van Riel | ac53db5 | 2011-02-01 09:51:03 -0500 | [diff] [blame] | 1916 | /* | 
|  | 1917 | * Someone really wants this to run. If it's not unfair, run it. | 
|  | 1918 | */ | 
|  | 1919 | if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) | 
|  | 1920 | se = cfs_rq->next; | 
|  | 1921 |  | 
| Mike Galbraith | f685cea | 2009-10-23 23:09:22 +0200 | [diff] [blame] | 1922 | clear_buddies(cfs_rq, se); | 
| Peter Zijlstra | 4793241 | 2008-11-04 21:25:09 +0100 | [diff] [blame] | 1923 |  | 
|  | 1924 | return se; | 
| Peter Zijlstra | aa2ac25 | 2008-03-14 21:12:12 +0100 | [diff] [blame] | 1925 | } | 
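The four rules in the comment above are applied in reverse order, so that the later "last" and "next" checks can override the skip-avoidance pick, and every buddy must pass the same fairness gate against the leftmost entity. A compilable sketch of that override order, where fair_enough() is a hypothetical stand-in for wakeup_preempt_entity() returning < 1:

```c
#include <stdio.h>

struct ent { long vruntime; };

/* Hypothetical fairness gate: accept a buddy only if it is not
 * too far right of the leftmost entity. */
static int fair_enough(const struct ent *buddy, const struct ent *left, long gran)
{
    return buddy->vruntime - left->vruntime < gran;
}

static const struct ent *pick(const struct ent *leftmost, const struct ent *second,
                              const struct ent *skip, const struct ent *last,
                              const struct ent *next, long gran)
{
    const struct ent *se = leftmost;

    if (skip == se && second && fair_enough(second, leftmost, gran))
        se = second;                    /* 4) avoid the skip buddy      */
    if (last && fair_enough(last, leftmost, gran))
        se = last;                      /* 3) cache locality            */
    if (next && fair_enough(next, leftmost, gran))
        se = next;                      /* 2) "next" overrides "last"   */
    return se;                          /* 1) fairness bounds all picks */
}

int main(void)
{
    struct ent a = { 100 }, b = { 150 }, c = { 120 };

    /* leftmost a is also the skip buddy; b is "last", c is "next" */
    printf("picked vruntime: %ld\n",
           pick(&a, &b, &a, &b, &c, 100)->vruntime); /* 120: next wins */
    return 0;
}
```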
|  | 1926 |  | 
| Paul Turner | d3d9dc3 | 2011-07-21 09:43:39 -0700 | [diff] [blame] | 1927 | static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq); | 
|  | 1928 |  | 
| Ingo Molnar | ab6cde2 | 2007-08-09 11:16:48 +0200 | [diff] [blame] | 1929 | static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) | 
| Ingo Molnar | bf0f6f2 | 2007-07-09 18:51:58 +0200 | [diff] [blame] | 1930 | { | 
|  | 1931 | /* | 
|  | 1932 | * If still on the runqueue then deactivate_task() | 
|  | 1933 | * was not called and update_curr() has to be done: | 
|  | 1934 | */ | 
|  | 1935 | if (prev->on_rq) | 
| Ingo Molnar | b7cc089 | 2007-08-09 11:16:47 +0200 | [diff] [blame] | 1936 | update_curr(cfs_rq); | 
| Ingo Molnar | bf0f6f2 | 2007-07-09 18:51:58 +0200 | [diff] [blame] | 1937 |  | 
| Paul Turner | d3d9dc3 | 2011-07-21 09:43:39 -0700 | [diff] [blame] | 1938 | /* throttle cfs_rqs exceeding runtime */ | 
|  | 1939 | check_cfs_rq_runtime(cfs_rq); | 
|  | 1940 |  | 
| Peter Zijlstra | ddc9729 | 2007-10-15 17:00:10 +0200 | [diff] [blame] | 1941 | check_spread(cfs_rq, prev); | 
| Dmitry Adamushko | 30cfdcf | 2007-10-15 17:00:07 +0200 | [diff] [blame] | 1942 | if (prev->on_rq) { | 
| Ingo Molnar | 5870db5 | 2007-08-09 11:16:47 +0200 | [diff] [blame] | 1943 | update_stats_wait_start(cfs_rq, prev); | 
| Dmitry Adamushko | 30cfdcf | 2007-10-15 17:00:07 +0200 | [diff] [blame] | 1944 | /* Put 'current' back into the tree. */ | 
|  | 1945 | __enqueue_entity(cfs_rq, prev); | 
| Paul Turner | 9d85f21 | 2012-10-04 13:18:29 +0200 | [diff] [blame] | 1946 | /* in !on_rq case, update occurred at dequeue */ | 
| Paul Turner | 9ee474f | 2012-10-04 13:18:30 +0200 | [diff] [blame] | 1947 | update_entity_load_avg(prev, 1); | 
| Dmitry Adamushko | 30cfdcf | 2007-10-15 17:00:07 +0200 | [diff] [blame] | 1948 | } | 
| Ingo Molnar | 429d43b | 2007-10-15 17:00:03 +0200 | [diff] [blame] | 1949 | cfs_rq->curr = NULL; | 
| Ingo Molnar | bf0f6f2 | 2007-07-09 18:51:58 +0200 | [diff] [blame] | 1950 | } | 
|  | 1951 |  | 
| Peter Zijlstra | 8f4d37e | 2008-01-25 21:08:29 +0100 | [diff] [blame] | 1952 | static void | 
|  | 1953 | entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) | 
| Ingo Molnar | bf0f6f2 | 2007-07-09 18:51:58 +0200 | [diff] [blame] | 1954 | { | 
| Ingo Molnar | bf0f6f2 | 2007-07-09 18:51:58 +0200 | [diff] [blame] | 1955 | /* | 
| Dmitry Adamushko | 30cfdcf | 2007-10-15 17:00:07 +0200 | [diff] [blame] | 1956 | * Update run-time statistics of the 'current'. | 
| Ingo Molnar | bf0f6f2 | 2007-07-09 18:51:58 +0200 | [diff] [blame] | 1957 | */ | 
| Dmitry Adamushko | 30cfdcf | 2007-10-15 17:00:07 +0200 | [diff] [blame] | 1958 | update_curr(cfs_rq); | 
| Ingo Molnar | bf0f6f2 | 2007-07-09 18:51:58 +0200 | [diff] [blame] | 1959 |  | 
| Paul Turner | 43365bd | 2010-12-15 19:10:17 -0800 | [diff] [blame] | 1960 | /* | 
| Paul Turner | 9d85f21 | 2012-10-04 13:18:29 +0200 | [diff] [blame] | 1961 | * Ensure that runnable average is periodically updated. | 
|  | 1962 | */ | 
| Paul Turner | 9ee474f | 2012-10-04 13:18:30 +0200 | [diff] [blame] | 1963 | update_entity_load_avg(curr, 1); | 
| Paul Turner | aff3e49 | 2012-10-04 13:18:30 +0200 | [diff] [blame] | 1964 | update_cfs_rq_blocked_load(cfs_rq, 1); | 
| Paul Turner | 9d85f21 | 2012-10-04 13:18:29 +0200 | [diff] [blame] | 1965 |  | 
| Peter Zijlstra | 8f4d37e | 2008-01-25 21:08:29 +0100 | [diff] [blame] | 1966 | #ifdef CONFIG_SCHED_HRTICK | 
|  | 1967 | /* | 
|  | 1968 | * queued ticks are scheduled to match the slice, so don't bother | 
|  | 1969 | * validating it and just reschedule. | 
|  | 1970 | */ | 
| Harvey Harrison | 983ed7a | 2008-04-24 18:17:55 -0700 | [diff] [blame] | 1971 | if (queued) { | 
|  | 1972 | resched_task(rq_of(cfs_rq)->curr); | 
|  | 1973 | return; | 
|  | 1974 | } | 
| Peter Zijlstra | 8f4d37e | 2008-01-25 21:08:29 +0100 | [diff] [blame] | 1975 | /* | 
|  | 1976 | * don't let the period tick interfere with the hrtick preemption | 
|  | 1977 | */ | 
|  | 1978 | if (!sched_feat(DOUBLE_TICK) && | 
|  | 1979 | hrtimer_active(&rq_of(cfs_rq)->hrtick_timer)) | 
|  | 1980 | return; | 
|  | 1981 | #endif | 
|  | 1982 |  | 
| Yong Zhang | 2c2efae | 2011-07-29 16:20:33 +0800 | [diff] [blame] | 1983 | if (cfs_rq->nr_running > 1) | 
| Ingo Molnar | 2e09bf5 | 2007-10-15 17:00:05 +0200 | [diff] [blame] | 1984 | check_preempt_tick(cfs_rq, curr); | 
| Ingo Molnar | bf0f6f2 | 2007-07-09 18:51:58 +0200 | [diff] [blame] | 1985 | } | 
|  | 1986 |  | 
| Paul Turner | ab84d31 | 2011-07-21 09:43:28 -0700 | [diff] [blame] | 1987 |  | 
|  | 1988 | /************************************************** | 
|  | 1989 | * CFS bandwidth control machinery | 
|  | 1990 | */ | 
|  | 1991 |  | 
|  | 1992 | #ifdef CONFIG_CFS_BANDWIDTH | 
| Peter Zijlstra | 029632f | 2011-10-25 10:00:11 +0200 | [diff] [blame] | 1993 |  | 
|  | 1994 | #ifdef HAVE_JUMP_LABEL | 
| Ingo Molnar | c5905af | 2012-02-24 08:31:31 +0100 | [diff] [blame] | 1995 | static struct static_key __cfs_bandwidth_used; | 
| Peter Zijlstra | 029632f | 2011-10-25 10:00:11 +0200 | [diff] [blame] | 1996 |  | 
|  | 1997 | static inline bool cfs_bandwidth_used(void) | 
|  | 1998 | { | 
| Ingo Molnar | c5905af | 2012-02-24 08:31:31 +0100 | [diff] [blame] | 1999 | return static_key_false(&__cfs_bandwidth_used); | 
| Peter Zijlstra | 029632f | 2011-10-25 10:00:11 +0200 | [diff] [blame] | 2000 | } | 
|  | 2001 |  | 
|  | 2002 | void account_cfs_bandwidth_used(int enabled, int was_enabled) | 
|  | 2003 | { | 
|  | 2004 | /* only need to count groups transitioning between enabled/!enabled */ | 
|  | 2005 | if (enabled && !was_enabled) | 
| Ingo Molnar | c5905af | 2012-02-24 08:31:31 +0100 | [diff] [blame] | 2006 | static_key_slow_inc(&__cfs_bandwidth_used); | 
| Peter Zijlstra | 029632f | 2011-10-25 10:00:11 +0200 | [diff] [blame] | 2007 | else if (!enabled && was_enabled) | 
| Ingo Molnar | c5905af | 2012-02-24 08:31:31 +0100 | [diff] [blame] | 2008 | static_key_slow_dec(&__cfs_bandwidth_used); | 
| Peter Zijlstra | 029632f | 2011-10-25 10:00:11 +0200 | [diff] [blame] | 2009 | } | 
|  | 2010 | #else /* HAVE_JUMP_LABEL */ | 
|  | 2011 | static bool cfs_bandwidth_used(void) | 
|  | 2012 | { | 
|  | 2013 | return true; | 
|  | 2014 | } | 
|  | 2015 |  | 
|  | 2016 | void account_cfs_bandwidth_used(int enabled, int was_enabled) {} | 
|  | 2017 | #endif /* HAVE_JUMP_LABEL */ | 
|  | 2018 |  | 
| Paul Turner | ab84d31 | 2011-07-21 09:43:28 -0700 | [diff] [blame] | 2019 | /* | 
|  | 2020 | * default period for cfs group bandwidth. | 
|  | 2021 | * default: 0.1s, units: nanoseconds | 
|  | 2022 | */ | 
|  | 2023 | static inline u64 default_cfs_period(void) | 
|  | 2024 | { | 
|  | 2025 | return 100000000ULL; | 
|  | 2026 | } | 
| Paul Turner | ec12cb7 | 2011-07-21 09:43:30 -0700 | [diff] [blame] | 2027 |  | 
|  | 2028 | static inline u64 sched_cfs_bandwidth_slice(void) | 
|  | 2029 | { | 
|  | 2030 | return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC; | 
|  | 2031 | } | 
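Concretely: with the 100 ms default period above, a group given a hypothetical 50 ms quota is capped at half a CPU, and its per-cpu runqueues pull runtime from the global pool one bandwidth slice at a time (the slice sysctl is assumed to be 5 ms here). A quick sketch of the arithmetic:

```c
#include <stdio.h>

#define NSEC_PER_MSEC 1000000ULL

int main(void)
{
    unsigned long long period = 100 * NSEC_PER_MSEC; /* default_cfs_period()     */
    unsigned long long quota  =  50 * NSEC_PER_MSEC; /* hypothetical cgroup quota */
    unsigned long long slice  =   5 * NSEC_PER_MSEC; /* assumed slice sysctl      */

    /* Each cfs_rq pulls at most one slice at a time from the global
     * pool, so a busy group refills at most quota/slice times a period. */
    printf("slice grabs per period: %llu\n", quota / slice); /* 10  */
    printf("cpu ceiling: %.0f%%\n", 100.0 * quota / period); /* 50% */
    return 0;
}
```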
|  | 2032 |  | 
| Paul Turner | a9cf55b | 2011-07-21 09:43:32 -0700 | [diff] [blame] | 2033 | /* | 
|  | 2034 | * Replenish runtime according to assigned quota and update expiration time. | 
|  | 2035 | * We use sched_clock_cpu directly instead of rq->clock to avoid adding | 
|  | 2036 | * additional synchronization around rq->lock. | 
|  | 2037 | * | 
|  | 2038 | * requires cfs_b->lock | 
|  | 2039 | */ | 
| Peter Zijlstra | 029632f | 2011-10-25 10:00:11 +0200 | [diff] [blame] | 2040 | void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b) | 
| Paul Turner | a9cf55b | 2011-07-21 09:43:32 -0700 | [diff] [blame] | 2041 | { | 
|  | 2042 | u64 now; | 
|  | 2043 |  | 
|  | 2044 | if (cfs_b->quota == RUNTIME_INF) | 
|  | 2045 | return; | 
|  | 2046 |  | 
|  | 2047 | now = sched_clock_cpu(smp_processor_id()); | 
|  | 2048 | cfs_b->runtime = cfs_b->quota; | 
|  | 2049 | cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period); | 
|  | 2050 | } | 
|  | 2051 |  | 
| Peter Zijlstra | 029632f | 2011-10-25 10:00:11 +0200 | [diff] [blame] | 2052 | static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) | 
|  | 2053 | { | 
|  | 2054 | return &tg->cfs_bandwidth; | 
|  | 2055 | } | 
|  | 2056 |  | 
| Paul Turner | f1b1728 | 2012-10-04 13:18:31 +0200 | [diff] [blame] | 2057 | /* rq->clock_task normalized against any time this cfs_rq has spent throttled */ | 
|  | 2058 | static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) | 
|  | 2059 | { | 
|  | 2060 | if (unlikely(cfs_rq->throttle_count)) | 
|  | 2061 | return cfs_rq->throttled_clock_task; | 
|  | 2062 |  | 
|  | 2063 | return rq_of(cfs_rq)->clock_task - cfs_rq->throttled_clock_task_time; | 
|  | 2064 | } | 
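The normalization is a plain subtraction: time accumulated while throttled is folded out of clock_task, so averages keyed to this clock do not decay across a throttled span. A worked example with hypothetical values:

```c
#include <stdio.h>

int main(void)
{
    /* Hypothetical timeline in ns: the group ran 10 ms of task time
     * and sat throttled for 4 ms; clock_task kept advancing, and the
     * throttled span accumulated in throttled_clock_task_time. */
    unsigned long long clock_task           = 14000000ULL;
    unsigned long long throttled_clock_time =  4000000ULL;

    printf("cfs_rq task clock: %llu ns\n",
           clock_task - throttled_clock_time); /* 10000000: throttled time excluded */
    return 0;
}
```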
|  | 2065 |  | 
| Paul Turner | 85dac90 | 2011-07-21 09:43:33 -0700 | [diff] [blame] | 2066 | /* returns 0 on failure to allocate runtime */ | 
|  | 2067 | static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) | 
| Paul Turner | ec12cb7 | 2011-07-21 09:43:30 -0700 | [diff] [blame] | 2068 | { | 
|  | 2069 | struct task_group *tg = cfs_rq->tg; | 
|  | 2070 | struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); | 
| Paul Turner | a9cf55b | 2011-07-21 09:43:32 -0700 | [diff] [blame] | 2071 | u64 amount = 0, min_amount, expires; | 
| Paul Turner | ec12cb7 | 2011-07-21 09:43:30 -0700 | [diff] [blame] | 2072 |  | 
|  | 2073 | /* note: this is a positive sum as runtime_remaining <= 0 */ | 
|  | 2074 | min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining; | 
|  | 2075 |  | 
|  | 2076 | raw_spin_lock(&cfs_b->lock); | 
|  | 2077 | if (cfs_b->quota == RUNTIME_INF) | 
|  | 2078 | amount = min_amount; | 
| Paul Turner | 58088ad | 2011-07-21 09:43:31 -0700 | [diff] [blame] | 2079 | else { | 
| Paul Turner | a9cf55b | 2011-07-21 09:43:32 -0700 | [diff] [blame] | 2080 | /* | 
|  | 2081 | * If the bandwidth pool has become inactive, then at least one | 
|  | 2082 | * period must have elapsed since the last consumption. | 
|  | 2083 | * Refresh the global state and ensure bandwidth timer becomes | 
|  | 2084 | * active. | 
|  | 2085 | */ | 
|  | 2086 | if (!cfs_b->timer_active) { | 
|  | 2087 | __refill_cfs_bandwidth_runtime(cfs_b); | 
| Paul Turner | 58088ad | 2011-07-21 09:43:31 -0700 | [diff] [blame] | 2088 | __start_cfs_bandwidth(cfs_b); | 
| Paul Turner | a9cf55b | 2011-07-21 09:43:32 -0700 | [diff] [blame] | 2089 | } | 
| Paul Turner | 58088ad | 2011-07-21 09:43:31 -0700 | [diff] [blame] | 2090 |  | 
|  | 2091 | if (cfs_b->runtime > 0) { | 
|  | 2092 | amount = min(cfs_b->runtime, min_amount); | 
|  | 2093 | cfs_b->runtime -= amount; | 
|  | 2094 | cfs_b->idle = 0; | 
|  | 2095 | } | 
| Paul Turner | ec12cb7 | 2011-07-21 09:43:30 -0700 | [diff] [blame] | 2096 | } | 
| Paul Turner | a9cf55b | 2011-07-21 09:43:32 -0700 | [diff] [blame] | 2097 | expires = cfs_b->runtime_expires; | 
| Paul Turner | ec12cb7 | 2011-07-21 09:43:30 -0700 | [diff] [blame] | 2098 | raw_spin_unlock(&cfs_b->lock); | 
|  | 2099 |  | 
|  | 2100 | cfs_rq->runtime_remaining += amount; | 
| Paul Turner | a9cf55b | 2011-07-21 09:43:32 -0700 | [diff] [blame] | 2101 | /* | 
|  | 2102 | * we may have advanced our local expiration to account for allowed | 
|  | 2103 | * spread between our sched_clock and the one on which runtime was | 
|  | 2104 | * issued. | 
|  | 2105 | */ | 
|  | 2106 | if ((s64)(expires - cfs_rq->runtime_expires) > 0) | 
|  | 2107 | cfs_rq->runtime_expires = expires; | 
| Paul Turner | 85dac90 | 2011-07-21 09:43:33 -0700 | [diff] [blame] | 2108 |  | 
|  | 2109 | return cfs_rq->runtime_remaining > 0; | 
| Paul Turner | a9cf55b | 2011-07-21 09:43:32 -0700 | [diff] [blame] | 2110 | } | 
|  | 2111 |  | 
|  | 2112 | /* | 
|  | 2113 | * Note: This depends on the synchronization provided by sched_clock and the | 
|  | 2114 | * fact that rq->clock snapshots this value. | 
|  | 2115 | */ | 
|  | 2116 | static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq) | 
|  | 2117 | { | 
|  | 2118 | struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); | 
|  | 2119 | struct rq *rq = rq_of(cfs_rq); | 
|  | 2120 |  | 
|  | 2121 | /* if the deadline is ahead of our clock, nothing to do */ | 
|  | 2122 | if (likely((s64)(rq->clock - cfs_rq->runtime_expires) < 0)) | 
|  | 2123 | return; | 
|  | 2124 |  | 
|  | 2125 | if (cfs_rq->runtime_remaining < 0) | 
|  | 2126 | return; | 
|  | 2127 |  | 
|  | 2128 | /* | 
|  | 2129 | * If the local deadline has passed we have to consider the | 
|  | 2130 | * possibility that our sched_clock is 'fast' and the global deadline | 
|  | 2131 | * has not truly expired. | 
|  | 2132 | * | 
|  | 2133 | * Fortunately we can determine whether this is the case by checking | 
|  | 2134 | * whether the global deadline has advanced. | 
|  | 2135 | */ | 
|  | 2136 |  | 
|  | 2137 | if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) { | 
|  | 2138 | /* extend local deadline, drift is bounded above by 2 ticks */ | 
|  | 2139 | cfs_rq->runtime_expires += TICK_NSEC; | 
|  | 2140 | } else { | 
|  | 2141 | /* global deadline is ahead, expiration has passed */ | 
|  | 2142 | cfs_rq->runtime_remaining = 0; | 
|  | 2143 | } | 
| Paul Turner | ec12cb7 | 2011-07-21 09:43:30 -0700 | [diff] [blame] | 2144 | } | 
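The `(s64)(a - b) < 0` test used above is the standard wrap-safe way to order two free-running 64-bit clocks: the unsigned subtraction is reinterpreted as signed, so the comparison stays correct even across a counter wrap. A small demonstration:

```c
#include <stdio.h>
#include <stdint.h>

/* Wrap-safe "a is before b", the same idiom as the
 * (s64)(rq->clock - runtime_expires) test above. */
static int before(uint64_t a, uint64_t b)
{
    return (int64_t)(a - b) < 0;
}

int main(void)
{
    uint64_t expires = UINT64_MAX - 5;    /* deadline just before wrap  */
    uint64_t now = 10;                    /* clock has wrapped past it  */

    printf("%d\n", before(now, expires));     /* 0: deadline has passed */
    printf("%d\n", before(expires - 3, expires)); /* 1: still before it */
    return 0;
}
```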
|  | 2145 |  | 
|  | 2146 | static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, | 
|  | 2147 | unsigned long delta_exec) | 
|  | 2148 | { | 
| Paul Turner | a9cf55b | 2011-07-21 09:43:32 -0700 | [diff] [blame] | 2149 | /* dock delta_exec before expiring quota (as it could span periods) */ | 
| Paul Turner | ec12cb7 | 2011-07-21 09:43:30 -0700 | [diff] [blame] | 2150 | cfs_rq->runtime_remaining -= delta_exec; | 
| Paul Turner | a9cf55b | 2011-07-21 09:43:32 -0700 | [diff] [blame] | 2151 | expire_cfs_rq_runtime(cfs_rq); | 
|  | 2152 |  | 
|  | 2153 | if (likely(cfs_rq->runtime_remaining > 0)) | 
| Paul Turner | ec12cb7 | 2011-07-21 09:43:30 -0700 | [diff] [blame] | 2154 | return; | 
|  | 2155 |  | 
| Paul Turner | 85dac90 | 2011-07-21 09:43:33 -0700 | [diff] [blame] | 2156 | /* | 
|  | 2157 | * if we're unable to extend our runtime we resched so that the active | 
|  | 2158 | * hierarchy can be throttled | 
|  | 2159 | */ | 
|  | 2160 | if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr)) | 
|  | 2161 | resched_task(rq_of(cfs_rq)->curr); | 
| Paul Turner | ec12cb7 | 2011-07-21 09:43:30 -0700 | [diff] [blame] | 2162 | } | 
|  | 2163 |  | 
| Peter Zijlstra | 6c16a6d | 2012-03-21 13:07:16 -0700 | [diff] [blame] | 2164 | static __always_inline | 
|  | 2165 | void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) | 
| Paul Turner | ec12cb7 | 2011-07-21 09:43:30 -0700 | [diff] [blame] | 2166 | { | 
| Paul Turner | 56f570e | 2011-11-07 20:26:33 -0800 | [diff] [blame] | 2167 | if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled) | 
| Paul Turner | ec12cb7 | 2011-07-21 09:43:30 -0700 | [diff] [blame] | 2168 | return; | 
|  | 2169 |  | 
|  | 2170 | __account_cfs_rq_runtime(cfs_rq, delta_exec); | 
|  | 2171 | } | 
|  | 2172 |  | 
| Paul Turner | 85dac90 | 2011-07-21 09:43:33 -0700 | [diff] [blame] | 2173 | static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) | 
|  | 2174 | { | 
| Paul Turner | 56f570e | 2011-11-07 20:26:33 -0800 | [diff] [blame] | 2175 | return cfs_bandwidth_used() && cfs_rq->throttled; | 
| Paul Turner | 85dac90 | 2011-07-21 09:43:33 -0700 | [diff] [blame] | 2176 | } | 
|  | 2177 |  | 
| Paul Turner | 64660c8 | 2011-07-21 09:43:36 -0700 | [diff] [blame] | 2178 | /* check whether cfs_rq, or any parent, is throttled */ | 
|  | 2179 | static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) | 
|  | 2180 | { | 
| Paul Turner | 56f570e | 2011-11-07 20:26:33 -0800 | [diff] [blame] | 2181 | return cfs_bandwidth_used() && cfs_rq->throttle_count; | 
| Paul Turner | 64660c8 | 2011-07-21 09:43:36 -0700 | [diff] [blame] | 2182 | } | 
|  | 2183 |  | 
|  | 2184 | /* | 
|  | 2185 | * Ensure that neither of the group entities corresponding to src_cpu or | 
|  | 2186 | * dest_cpu are members of a throttled hierarchy when performing group | 
|  | 2187 | * load-balance operations. | 
|  | 2188 | */ | 
|  | 2189 | static inline int throttled_lb_pair(struct task_group *tg, | 
|  | 2190 | int src_cpu, int dest_cpu) | 
|  | 2191 | { | 
|  | 2192 | struct cfs_rq *src_cfs_rq, *dest_cfs_rq; | 
|  | 2193 |  | 
|  | 2194 | src_cfs_rq = tg->cfs_rq[src_cpu]; | 
|  | 2195 | dest_cfs_rq = tg->cfs_rq[dest_cpu]; | 
|  | 2196 |  | 
|  | 2197 | return throttled_hierarchy(src_cfs_rq) || | 
|  | 2198 | throttled_hierarchy(dest_cfs_rq); | 
|  | 2199 | } | 
|  | 2200 |  | 
|  | 2201 | /* updated child weight may affect parent so we have to do this bottom up */ | 
|  | 2202 | static int tg_unthrottle_up(struct task_group *tg, void *data) | 
|  | 2203 | { | 
|  | 2204 | struct rq *rq = data; | 
|  | 2205 | struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; | 
|  | 2206 |  | 
|  | 2207 | cfs_rq->throttle_count--; | 
|  | 2208 | #ifdef CONFIG_SMP | 
|  | 2209 | if (!cfs_rq->throttle_count) { | 
| Paul Turner | f1b1728 | 2012-10-04 13:18:31 +0200 | [diff] [blame] | 2210 | /* adjust cfs_rq_clock_task() */ | 
|  | 2211 | cfs_rq->throttled_clock_task_time += rq->clock_task - | 
|  | 2212 | cfs_rq->throttled_clock_task; | 
| Paul Turner | 64660c8 | 2011-07-21 09:43:36 -0700 | [diff] [blame] | 2213 | } | 
|  | 2214 | #endif | 
|  | 2215 |  | 
|  | 2216 | return 0; | 
|  | 2217 | } | 
|  | 2218 |  | 
|  | 2219 | static int tg_throttle_down(struct task_group *tg, void *data) | 
|  | 2220 | { | 
|  | 2221 | struct rq *rq = data; | 
|  | 2222 | struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; | 
|  | 2223 |  | 
| Paul Turner | 8295836 | 2012-10-04 13:18:31 +0200 | [diff] [blame] | 2224 | /* group is entering throttled state, stop time */ | 
|  | 2225 | if (!cfs_rq->throttle_count) | 
| Paul Turner | f1b1728 | 2012-10-04 13:18:31 +0200 | [diff] [blame] | 2226 | cfs_rq->throttled_clock_task = rq->clock_task; | 
| Paul Turner | 64660c8 | 2011-07-21 09:43:36 -0700 | [diff] [blame] | 2227 | cfs_rq->throttle_count++; | 
|  | 2228 |  | 
|  | 2229 | return 0; | 
|  | 2230 | } | 
|  | 2231 |  | 
| Paul Turner | d3d9dc3 | 2011-07-21 09:43:39 -0700 | [diff] [blame] | 2232 | static void throttle_cfs_rq(struct cfs_rq *cfs_rq) | 
| Paul Turner | 85dac90 | 2011-07-21 09:43:33 -0700 | [diff] [blame] | 2233 | { | 
|  | 2234 | struct rq *rq = rq_of(cfs_rq); | 
|  | 2235 | struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); | 
|  | 2236 | struct sched_entity *se; | 
|  | 2237 | long task_delta, dequeue = 1; | 
|  | 2238 |  | 
|  | 2239 | se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; | 
|  | 2240 |  | 
| Paul Turner | f1b1728 | 2012-10-04 13:18:31 +0200 | [diff] [blame] | 2241 | /* freeze hierarchy runnable averages while throttled */ | 
| Paul Turner | 64660c8 | 2011-07-21 09:43:36 -0700 | [diff] [blame] | 2242 | rcu_read_lock(); | 
|  | 2243 | walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq); | 
|  | 2244 | rcu_read_unlock(); | 
| Paul Turner | 85dac90 | 2011-07-21 09:43:33 -0700 | [diff] [blame] | 2245 |  | 
|  | 2246 | task_delta = cfs_rq->h_nr_running; | 
|  | 2247 | for_each_sched_entity(se) { | 
|  | 2248 | struct cfs_rq *qcfs_rq = cfs_rq_of(se); | 
|  | 2249 | /* throttled entity or throttle-on-deactivate */ | 
|  | 2250 | if (!se->on_rq) | 
|  | 2251 | break; | 
|  | 2252 |  | 
|  | 2253 | if (dequeue) | 
|  | 2254 | dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); | 
|  | 2255 | qcfs_rq->h_nr_running -= task_delta; | 
|  | 2256 |  | 
|  | 2257 | if (qcfs_rq->load.weight) | 
|  | 2258 | dequeue = 0; | 
|  | 2259 | } | 
|  | 2260 |  | 
|  | 2261 | if (!se) | 
|  | 2262 | rq->nr_running -= task_delta; | 
|  | 2263 |  | 
|  | 2264 | cfs_rq->throttled = 1; | 
| Paul Turner | f1b1728 | 2012-10-04 13:18:31 +0200 | [diff] [blame] | 2265 | cfs_rq->throttled_clock = rq->clock; | 
| Paul Turner | 85dac90 | 2011-07-21 09:43:33 -0700 | [diff] [blame] | 2266 | raw_spin_lock(&cfs_b->lock); | 
|  | 2267 | list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); | 
|  | 2268 | raw_spin_unlock(&cfs_b->lock); | 
|  | 2269 | } | 
|  | 2270 |  | 
| Peter Zijlstra | 029632f | 2011-10-25 10:00:11 +0200 | [diff] [blame] | 2271 | void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) | 
| Paul Turner | 671fd9d | 2011-07-21 09:43:34 -0700 | [diff] [blame] | 2272 | { | 
|  | 2273 | struct rq *rq = rq_of(cfs_rq); | 
|  | 2274 | struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); | 
|  | 2275 | struct sched_entity *se; | 
|  | 2276 | int enqueue = 1; | 
|  | 2277 | long task_delta; | 
|  | 2278 |  | 
|  | 2279 | se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; | 
|  | 2280 |  | 
|  | 2281 | cfs_rq->throttled = 0; | 
|  | 2282 | raw_spin_lock(&cfs_b->lock); | 
| Paul Turner | f1b1728 | 2012-10-04 13:18:31 +0200 | [diff] [blame] | 2283 | cfs_b->throttled_time += rq->clock - cfs_rq->throttled_clock; | 
| Paul Turner | 671fd9d | 2011-07-21 09:43:34 -0700 | [diff] [blame] | 2284 | list_del_rcu(&cfs_rq->throttled_list); | 
|  | 2285 | raw_spin_unlock(&cfs_b->lock); | 
|  | 2286 |  | 
| Paul Turner | 64660c8 | 2011-07-21 09:43:36 -0700 | [diff] [blame] | 2287 | update_rq_clock(rq); | 
|  | 2288 | /* update hierarchical throttle state */ | 
|  | 2289 | walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq); | 
|  | 2290 |  | 
| Paul Turner | 671fd9d | 2011-07-21 09:43:34 -0700 | [diff] [blame] | 2291 | if (!cfs_rq->load.weight) | 
|  | 2292 | return; | 
|  | 2293 |  | 
|  | 2294 | task_delta = cfs_rq->h_nr_running; | 
|  | 2295 | for_each_sched_entity(se) { | 
|  | 2296 | if (se->on_rq) | 
|  | 2297 | enqueue = 0; | 
|  | 2298 |  | 
|  | 2299 | cfs_rq = cfs_rq_of(se); | 
|  | 2300 | if (enqueue) | 
|  | 2301 | enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP); | 
|  | 2302 | cfs_rq->h_nr_running += task_delta; | 
|  | 2303 |  | 
|  | 2304 | if (cfs_rq_throttled(cfs_rq)) | 
|  | 2305 | break; | 
|  | 2306 | } | 
|  | 2307 |  | 
|  | 2308 | if (!se) | 
|  | 2309 | rq->nr_running += task_delta; | 
|  | 2310 |  | 
|  | 2311 | /* determine whether we need to wake up potentially idle cpu */ | 
|  | 2312 | if (rq->curr == rq->idle && rq->cfs.nr_running) | 
|  | 2313 | resched_task(rq->curr); | 
|  | 2314 | } | 
|  | 2315 |  | 
|  | 2316 | static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, | 
|  | 2317 | u64 remaining, u64 expires) | 
|  | 2318 | { | 
|  | 2319 | struct cfs_rq *cfs_rq; | 
|  | 2320 | u64 runtime = remaining; | 
|  | 2321 |  | 
|  | 2322 | rcu_read_lock(); | 
|  | 2323 | list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq, | 
|  | 2324 | throttled_list) { | 
|  | 2325 | struct rq *rq = rq_of(cfs_rq); | 
|  | 2326 |  | 
|  | 2327 | raw_spin_lock(&rq->lock); | 
|  | 2328 | if (!cfs_rq_throttled(cfs_rq)) | 
|  | 2329 | goto next; | 
|  | 2330 |  | 
|  | 2331 | runtime = -cfs_rq->runtime_remaining + 1; | 
|  | 2332 | if (runtime > remaining) | 
|  | 2333 | runtime = remaining; | 
|  | 2334 | remaining -= runtime; | 
|  | 2335 |  | 
|  | 2336 | cfs_rq->runtime_remaining += runtime; | 
|  | 2337 | cfs_rq->runtime_expires = expires; | 
|  | 2338 |  | 
|  | 2339 | /* we check whether we're throttled above */ | 
|  | 2340 | if (cfs_rq->runtime_remaining > 0) | 
|  | 2341 | unthrottle_cfs_rq(cfs_rq); | 
|  | 2342 |  | 
|  | 2343 | next: | 
|  | 2344 | raw_spin_unlock(&rq->lock); | 
|  | 2345 |  | 
|  | 2346 | if (!remaining) | 
|  | 2347 | break; | 
|  | 2348 | } | 
|  | 2349 | rcu_read_unlock(); | 
|  | 2350 |  | 
|  | 2351 | return remaining; | 
|  | 2352 | } | 
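Note that each throttled cfs_rq is raised to exactly one nanosecond above zero (`-runtime_remaining + 1`), so the pool is stretched across as many runqueues as possible and every unthrottled rq must immediately negotiate a fresh slice. A sketch with hypothetical deficits:

```c
#include <stdio.h>

int main(void)
{
    /* Hypothetical per-rq deficits (runtime_remaining <= 0 while
     * throttled) and a freshly refilled pool of 4 ms, all in ns. */
    long long deficits[] = { -2000000, -500000, -3000000 };
    long long remaining = 4000000;

    for (int i = 0; i < 3 && remaining > 0; i++) {
        /* top each rq up to +1 ns, clamped to what's left in the pool */
        long long give = -deficits[i] + 1;

        if (give > remaining)
            give = remaining;
        remaining -= give;
        deficits[i] += give;
        printf("rq%d: +%lld -> %lld, pool %lld\n",
               i, give, deficits[i], remaining);
    }
    return 0;
}
```

Only the runqueues left above zero are unthrottled; the last one here still ends negative and stays parked until the next refill.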
|  | 2353 |  | 
| Paul Turner | 58088ad | 2011-07-21 09:43:31 -0700 | [diff] [blame] | 2354 | /* | 
|  | 2355 | * Responsible for refilling a task_group's bandwidth and unthrottling its | 
|  | 2356 | * cfs_rqs as appropriate. If there has been no activity within the last | 
|  | 2357 | * period the timer is deactivated until scheduling resumes; cfs_b->idle is | 
|  | 2358 | * used to track this state. | 
|  | 2359 | */ | 
|  | 2360 | static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) | 
|  | 2361 | { | 
| Paul Turner | 671fd9d | 2011-07-21 09:43:34 -0700 | [diff] [blame] | 2362 | u64 runtime, runtime_expires; | 
|  | 2363 | int idle = 1, throttled; | 
| Paul Turner | 58088ad | 2011-07-21 09:43:31 -0700 | [diff] [blame] | 2364 |  | 
|  | 2365 | raw_spin_lock(&cfs_b->lock); | 
|  | 2366 | /* no need to continue the timer with no bandwidth constraint */ | 
|  | 2367 | if (cfs_b->quota == RUNTIME_INF) | 
|  | 2368 | goto out_unlock; | 
|  | 2369 |  | 
| Paul Turner | 671fd9d | 2011-07-21 09:43:34 -0700 | [diff] [blame] | 2370 | throttled = !list_empty(&cfs_b->throttled_cfs_rq); | 
|  | 2371 | /* idle depends on !throttled (for the case of a large deficit) */ | 
|  | 2372 | idle = cfs_b->idle && !throttled; | 
| Nikhil Rao | e8da1b1 | 2011-07-21 09:43:40 -0700 | [diff] [blame] | 2373 | cfs_b->nr_periods += overrun; | 
| Paul Turner | 671fd9d | 2011-07-21 09:43:34 -0700 | [diff] [blame] | 2374 |  | 
| Paul Turner | a9cf55b | 2011-07-21 09:43:32 -0700 | [diff] [blame] | 2375 | /* if we're going inactive then everything else can be deferred */ | 
|  | 2376 | if (idle) | 
|  | 2377 | goto out_unlock; | 
|  | 2378 |  | 
|  | 2379 | __refill_cfs_bandwidth_runtime(cfs_b); | 
|  | 2380 |  | 
| Paul Turner | 671fd9d | 2011-07-21 09:43:34 -0700 | [diff] [blame] | 2381 | if (!throttled) { | 
|  | 2382 | /* mark as potentially idle for the upcoming period */ | 
|  | 2383 | cfs_b->idle = 1; | 
|  | 2384 | goto out_unlock; | 
|  | 2385 | } | 
| Paul Turner | 58088ad | 2011-07-21 09:43:31 -0700 | [diff] [blame] | 2386 |  | 
| Nikhil Rao | e8da1b1 | 2011-07-21 09:43:40 -0700 | [diff] [blame] | 2387 | /* account preceding periods in which throttling occurred */ | 
|  | 2388 | cfs_b->nr_throttled += overrun; | 
|  | 2389 |  | 
| Paul Turner | 671fd9d | 2011-07-21 09:43:34 -0700 | [diff] [blame] | 2390 | /* | 
|  | 2391 | * There are throttled entities so we must first use the new bandwidth | 
|  | 2392 | * to unthrottle them before making it generally available.  This | 
|  | 2393 | * ensures that all existing debts will be paid before a new cfs_rq is | 
|  | 2394 | * allowed to run. | 
|  | 2395 | */ | 
|  | 2396 | runtime = cfs_b->runtime; | 
|  | 2397 | runtime_expires = cfs_b->runtime_expires; | 
|  | 2398 | cfs_b->runtime = 0; | 
|  | 2399 |  | 
|  | 2400 | /* | 
|  | 2401 | * This check is repeated as we are holding onto the new bandwidth | 
|  | 2402 | * while we unthrottle.  This can potentially race with an unthrottled | 
|  | 2403 | * group trying to acquire new bandwidth from the global pool. | 
|  | 2404 | */ | 
|  | 2405 | while (throttled && runtime > 0) { | 
|  | 2406 | raw_spin_unlock(&cfs_b->lock); | 
|  | 2407 | /* we can't nest cfs_b->lock while distributing bandwidth */ | 
|  | 2408 | runtime = distribute_cfs_runtime(cfs_b, runtime, | 
|  | 2409 | runtime_expires); | 
|  | 2410 | raw_spin_lock(&cfs_b->lock); | 
|  | 2411 |  | 
|  | 2412 | throttled = !list_empty(&cfs_b->throttled_cfs_rq); | 
|  | 2413 | } | 
|  | 2414 |  | 
|  | 2415 | /* return (any) remaining runtime */ | 
|  | 2416 | cfs_b->runtime = runtime; | 
|  | 2417 | /* | 
|  | 2418 | * While we are ensured activity in the period following an | 
|  | 2419 | * unthrottle, this also covers the case in which the new bandwidth is | 
|  | 2420 | * insufficient to cover the existing bandwidth deficit.  (Forcing the | 
|  | 2421 | * timer to remain active while there are any throttled entities.) | 
|  | 2422 | */ | 
|  | 2423 | cfs_b->idle = 0; | 
| Paul Turner | 58088ad | 2011-07-21 09:43:31 -0700 | [diff] [blame] | 2424 | out_unlock: | 
|  | 2425 | if (idle) | 
|  | 2426 | cfs_b->timer_active = 0; | 
|  | 2427 | raw_spin_unlock(&cfs_b->lock); | 
|  | 2428 |  | 
|  | 2429 | return idle; | 
|  | 2430 | } | 
| Paul Turner | d3d9dc3 | 2011-07-21 09:43:39 -0700 | [diff] [blame] | 2431 |  | 
| Paul Turner | d8b4986 | 2011-07-21 09:43:41 -0700 | [diff] [blame] | 2432 | /* a cfs_rq won't donate quota below this amount */ | 
|  | 2433 | static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC; | 
|  | 2434 | /* minimum remaining period time to redistribute slack quota */ | 
|  | 2435 | static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC; | 
|  | 2436 | /* how long we wait to gather additional slack before distributing */ | 
|  | 2437 | static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC; | 
|  | 2438 |  | 
|  | 2439 | /* are we near the end of the current quota period? */ | 
|  | 2440 | static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire) | 
|  | 2441 | { | 
|  | 2442 | struct hrtimer *refresh_timer = &cfs_b->period_timer; | 
|  | 2443 | u64 remaining; | 
|  | 2444 |  | 
|  | 2445 | /* if the call-back is running, a quota refresh is already occurring */ | 
|  | 2446 | if (hrtimer_callback_running(refresh_timer)) | 
|  | 2447 | return 1; | 
|  | 2448 |  | 
|  | 2449 | /* is a quota refresh about to occur? */ | 
|  | 2450 | remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer)); | 
|  | 2451 | if (remaining < min_expire) | 
|  | 2452 | return 1; | 
|  | 2453 |  | 
|  | 2454 | return 0; | 
|  | 2455 | } | 
|  | 2456 |  | 
|  | 2457 | static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b) | 
|  | 2458 | { | 
|  | 2459 | u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration; | 
|  | 2460 |  | 
|  | 2461 | /* if there's a quota refresh soon don't bother with slack */ | 
|  | 2462 | if (runtime_refresh_within(cfs_b, min_left)) | 
|  | 2463 | return; | 
|  | 2464 |  | 
|  | 2465 | start_bandwidth_timer(&cfs_b->slack_timer, | 
|  | 2466 | ns_to_ktime(cfs_bandwidth_slack_period)); | 
|  | 2467 | } | 
|  | 2468 |  | 
|  | 2469 | /* we know any runtime found here is valid as update_curr() precedes return */ | 
|  | 2470 | static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq) | 
|  | 2471 | { | 
|  | 2472 | struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); | 
|  | 2473 | s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime; | 
|  | 2474 |  | 
|  | 2475 | if (slack_runtime <= 0) | 
|  | 2476 | return; | 
|  | 2477 |  | 
|  | 2478 | raw_spin_lock(&cfs_b->lock); | 
|  | 2479 | if (cfs_b->quota != RUNTIME_INF && | 
|  | 2480 | cfs_rq->runtime_expires == cfs_b->runtime_expires) { | 
|  | 2481 | cfs_b->runtime += slack_runtime; | 
|  | 2482 |  | 
|  | 2483 | /* we are under rq->lock, defer unthrottling using a timer */ | 
|  | 2484 | if (cfs_b->runtime > sched_cfs_bandwidth_slice() && | 
|  | 2485 | !list_empty(&cfs_b->throttled_cfs_rq)) | 
|  | 2486 | start_cfs_slack_bandwidth(cfs_b); | 
|  | 2487 | } | 
|  | 2488 | raw_spin_unlock(&cfs_b->lock); | 
|  | 2489 |  | 
|  | 2490 | /* even if it's not valid for return we don't want to try again */ | 
|  | 2491 | cfs_rq->runtime_remaining -= slack_runtime; | 
|  | 2492 | } | 
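The slack return keeps min_cfs_rq_runtime (1 ms) locally and hands the remainder back to the pool, so a task that re-wakes promptly does not have to take cfs_b->lock again for its first tick's worth of runtime. The arithmetic, with a hypothetical leftover:

```c
#include <stdio.h>

#define NSEC_PER_MSEC 1000000LL

int main(void)
{
    long long runtime_remaining = 3 * NSEC_PER_MSEC + 250000; /* hypothetical leftover */
    long long min_keep = 1 * NSEC_PER_MSEC;                   /* min_cfs_rq_runtime    */
    long long slack = runtime_remaining - min_keep;

    if (slack > 0)
        printf("return %lld ns to the pool, keep %lld ns locally\n",
               slack, min_keep);
    return 0;
}
```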
|  | 2493 |  | 
|  | 2494 | static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) | 
|  | 2495 | { | 
| Paul Turner | 56f570e | 2011-11-07 20:26:33 -0800 | [diff] [blame] | 2496 | if (!cfs_bandwidth_used()) | 
|  | 2497 | return; | 
|  | 2498 |  | 
| Paul Turner | fccfdc6 | 2011-11-07 20:26:34 -0800 | [diff] [blame] | 2499 | if (!cfs_rq->runtime_enabled || cfs_rq->nr_running) | 
| Paul Turner | d8b4986 | 2011-07-21 09:43:41 -0700 | [diff] [blame] | 2500 | return; | 
|  | 2501 |  | 
|  | 2502 | __return_cfs_rq_runtime(cfs_rq); | 
|  | 2503 | } | 
|  | 2504 |  | 
|  | 2505 | /* | 
|  | 2506 | * This is done with a timer (instead of inline with bandwidth return) since | 
|  | 2507 | * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs. | 
|  | 2508 | */ | 
|  | 2509 | static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) | 
|  | 2510 | { | 
|  | 2511 | u64 runtime = 0, slice = sched_cfs_bandwidth_slice(); | 
|  | 2512 | u64 expires; | 
|  | 2513 |  | 
|  | 2514 | /* confirm we're still not at a refresh boundary */ | 
|  | 2515 | if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) | 
|  | 2516 | return; | 
|  | 2517 |  | 
|  | 2518 | raw_spin_lock(&cfs_b->lock); | 
|  | 2519 | if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) { | 
|  | 2520 | runtime = cfs_b->runtime; | 
|  | 2521 | cfs_b->runtime = 0; | 
|  | 2522 | } | 
|  | 2523 | expires = cfs_b->runtime_expires; | 
|  | 2524 | raw_spin_unlock(&cfs_b->lock); | 
|  | 2525 |  | 
|  | 2526 | if (!runtime) | 
|  | 2527 | return; | 
|  | 2528 |  | 
|  | 2529 | runtime = distribute_cfs_runtime(cfs_b, runtime, expires); | 
|  | 2530 |  | 
|  | 2531 | raw_spin_lock(&cfs_b->lock); | 
|  | 2532 | if (expires == cfs_b->runtime_expires) | 
|  | 2533 | cfs_b->runtime = runtime; | 
|  | 2534 | raw_spin_unlock(&cfs_b->lock); | 
|  | 2535 | } | 
|  | 2536 |  | 
| Paul Turner | d3d9dc3 | 2011-07-21 09:43:39 -0700 | [diff] [blame] | 2537 | /* | 
|  | 2538 | * When a group wakes up we want to make sure that its quota is not already | 
|  | 2539 | * expired/exceeded; otherwise it may be allowed to steal additional ticks of | 
|  | 2540 | * runtime as update_curr() throttling cannot trigger until it's on-rq. | 
|  | 2541 | */ | 
|  | 2542 | static void check_enqueue_throttle(struct cfs_rq *cfs_rq) | 
|  | 2543 | { | 
| Paul Turner | 56f570e | 2011-11-07 20:26:33 -0800 | [diff] [blame] | 2544 | if (!cfs_bandwidth_used()) | 
|  | 2545 | return; | 
|  | 2546 |  | 
| Paul Turner | d3d9dc3 | 2011-07-21 09:43:39 -0700 | [diff] [blame] | 2547 | /* an active group must be handled by the update_curr()->put() path */ | 
|  | 2548 | if (!cfs_rq->runtime_enabled || cfs_rq->curr) | 
|  | 2549 | return; | 
|  | 2550 |  | 
|  | 2551 | /* ensure the group is not already throttled */ | 
|  | 2552 | if (cfs_rq_throttled(cfs_rq)) | 
|  | 2553 | return; | 
|  | 2554 |  | 
|  | 2555 | /* update runtime allocation */ | 
|  | 2556 | account_cfs_rq_runtime(cfs_rq, 0); | 
|  | 2557 | if (cfs_rq->runtime_remaining <= 0) | 
|  | 2558 | throttle_cfs_rq(cfs_rq); | 
|  | 2559 | } | 
|  | 2560 |  | 
|  | 2561 | /* conditionally throttle active cfs_rq's from put_prev_entity() */ | 
|  | 2562 | static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) | 
|  | 2563 | { | 
| Paul Turner | 56f570e | 2011-11-07 20:26:33 -0800 | [diff] [blame] | 2564 | if (!cfs_bandwidth_used()) | 
|  | 2565 | return; | 
|  | 2566 |  | 
| Paul Turner | d3d9dc3 | 2011-07-21 09:43:39 -0700 | [diff] [blame] | 2567 | if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0)) | 
|  | 2568 | return; | 
|  | 2569 |  | 
|  | 2570 | /* | 
|  | 2571 | * it's possible for a throttled entity to be forced into a running | 
|  | 2572 | * state (e.g. set_curr_task); in this case we're finished. | 
|  | 2573 | */ | 
|  | 2574 | if (cfs_rq_throttled(cfs_rq)) | 
|  | 2575 | return; | 
|  | 2576 |  | 
|  | 2577 | throttle_cfs_rq(cfs_rq); | 
|  | 2578 | } | 
| Peter Zijlstra | 029632f | 2011-10-25 10:00:11 +0200 | [diff] [blame] | 2579 |  | 
|  | 2580 | static inline u64 default_cfs_period(void); | 
|  | 2581 | static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun); | 
|  | 2582 | static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b); | 
|  | 2583 |  | 
|  | 2584 | static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) | 
|  | 2585 | { | 
|  | 2586 | struct cfs_bandwidth *cfs_b = | 
|  | 2587 | container_of(timer, struct cfs_bandwidth, slack_timer); | 
|  | 2588 | do_sched_cfs_slack_timer(cfs_b); | 
|  | 2589 |  | 
|  | 2590 | return HRTIMER_NORESTART; | 
|  | 2591 | } | 
|  | 2592 |  | 
|  | 2593 | static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) | 
|  | 2594 | { | 
|  | 2595 | struct cfs_bandwidth *cfs_b = | 
|  | 2596 | container_of(timer, struct cfs_bandwidth, period_timer); | 
|  | 2597 | ktime_t now; | 
|  | 2598 | int overrun; | 
|  | 2599 | int idle = 0; | 
|  | 2600 |  | 
|  | 2601 | for (;;) { | 
|  | 2602 | now = hrtimer_cb_get_time(timer); | 
|  | 2603 | overrun = hrtimer_forward(timer, now, cfs_b->period); | 
|  | 2604 |  | 
|  | 2605 | if (!overrun) | 
|  | 2606 | break; | 
|  | 2607 |  | 
|  | 2608 | idle = do_sched_cfs_period_timer(cfs_b, overrun); | 
|  | 2609 | } | 
|  | 2610 |  | 
|  | 2611 | return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; | 
|  | 2612 | } | 
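The loop above is the canonical hrtimer pattern: forward the timer by whole periods until its expiry lies in the future, and feed the number of skipped periods (`overrun`) to the handler so that missed refills are accounted in one call. A simplified userspace stand-in for the forwarding step (the real hrtimer_forward() avoids looping for large overruns; this is only an illustration):

```c
#include <stdio.h>

/* Push 'expires' forward by whole periods until it is in the future,
 * returning how many periods were skipped. */
static unsigned long forward(unsigned long long *expires,
                             unsigned long long now,
                             unsigned long long period)
{
    unsigned long overrun = 0;

    while (*expires <= now) {
        *expires += period;
        overrun++;
    }
    return overrun;
}

int main(void)
{
    unsigned long long expires = 100, now = 350, period = 100;

    /* three whole periods elapsed; next expiry lands at 400 */
    printf("overrun=%lu next=%llu\n", forward(&expires, now, period), expires);
    return 0;
}
```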
|  | 2613 |  | 
|  | 2614 | void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) | 
|  | 2615 | { | 
|  | 2616 | raw_spin_lock_init(&cfs_b->lock); | 
|  | 2617 | cfs_b->runtime = 0; | 
|  | 2618 | cfs_b->quota = RUNTIME_INF; | 
|  | 2619 | cfs_b->period = ns_to_ktime(default_cfs_period()); | 
|  | 2620 |  | 
|  | 2621 | INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq); | 
|  | 2622 | hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | 
|  | 2623 | cfs_b->period_timer.function = sched_cfs_period_timer; | 
|  | 2624 | hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | 
|  | 2625 | cfs_b->slack_timer.function = sched_cfs_slack_timer; | 
|  | 2626 | } | 
|  | 2627 |  | 
|  | 2628 | static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) | 
|  | 2629 | { | 
|  | 2630 | cfs_rq->runtime_enabled = 0; | 
|  | 2631 | INIT_LIST_HEAD(&cfs_rq->throttled_list); | 
|  | 2632 | } | 
|  | 2633 |  | 
|  | 2634 | /* requires cfs_b->lock, may release to reprogram timer */ | 
|  | 2635 | void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) | 
|  | 2636 | { | 
|  | 2637 | /* | 
|  | 2638 | * The timer may be active because we're trying to set a new bandwidth | 
|  | 2639 | * period or because we're racing with the tear-down path | 
|  | 2640 | * (timer_active==0 becomes visible before the hrtimer call-back | 
|  | 2641 | * terminates).  In either case we ensure that it's re-programmed. | 
|  | 2642 | */ | 
|  | 2643 | while (unlikely(hrtimer_active(&cfs_b->period_timer))) { | 
|  | 2644 | raw_spin_unlock(&cfs_b->lock); | 
|  | 2645 | /* ensure cfs_b->lock is available while we wait */ | 
|  | 2646 | hrtimer_cancel(&cfs_b->period_timer); | 
|  | 2647 |  | 
|  | 2648 | raw_spin_lock(&cfs_b->lock); | 
|  | 2649 | /* if someone else restarted the timer then we're done */ | 
|  | 2650 | if (cfs_b->timer_active) | 
|  | 2651 | return; | 
|  | 2652 | } | 
|  | 2653 |  | 
|  | 2654 | cfs_b->timer_active = 1; | 
|  | 2655 | start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period); | 
|  | 2656 | } | 
|  | 2657 |  | 
|  | 2658 | static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) | 
|  | 2659 | { | 
|  | 2660 | hrtimer_cancel(&cfs_b->period_timer); | 
|  | 2661 | hrtimer_cancel(&cfs_b->slack_timer); | 
|  | 2662 | } | 
|  | 2663 |  | 
| Arnd Bergmann | 38dc334 | 2013-01-25 14:14:22 +0000 | [diff] [blame] | 2664 | static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) | 
| Peter Zijlstra | 029632f | 2011-10-25 10:00:11 +0200 | [diff] [blame] | 2665 | { | 
|  | 2666 | struct cfs_rq *cfs_rq; | 
|  | 2667 |  | 
|  | 2668 | for_each_leaf_cfs_rq(rq, cfs_rq) { | 
|  | 2669 | struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); | 
|  | 2670 |  | 
|  | 2671 | if (!cfs_rq->runtime_enabled) | 
|  | 2672 | continue; | 
|  | 2673 |  | 
|  | 2674 | /* | 
|  | 2675 | * clock_task is not advancing so we just need to make sure | 
|  | 2676 | * there's some valid quota amount | 
|  | 2677 | */ | 
|  | 2678 | cfs_rq->runtime_remaining = cfs_b->quota; | 
|  | 2679 | if (cfs_rq_throttled(cfs_rq)) | 
|  | 2680 | unthrottle_cfs_rq(cfs_rq); | 
|  | 2681 | } | 
|  | 2682 | } | 
|  | 2683 |  | 
|  | 2684 | #else /* CONFIG_CFS_BANDWIDTH */ | 
| Paul Turner | f1b1728 | 2012-10-04 13:18:31 +0200 | [diff] [blame] | 2685 | static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) | 
|  | 2686 | { | 
|  | 2687 | return rq_of(cfs_rq)->clock_task; | 
|  | 2688 | } | 
|  | 2689 |  | 
|  | 2690 | static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, | 
|  | 2691 | unsigned long delta_exec) {} | 
| Paul Turner | d3d9dc3 | 2011-07-21 09:43:39 -0700 | [diff] [blame] | 2692 | static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} | 
|  | 2693 | static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} | 
| Peter Zijlstra | 6c16a6d | 2012-03-21 13:07:16 -0700 | [diff] [blame] | 2694 | static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} | 
| Paul Turner | 85dac90 | 2011-07-21 09:43:33 -0700 | [diff] [blame] | 2695 |  | 
|  | 2696 | static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) | 
|  | 2697 | { | 
|  | 2698 | return 0; | 
|  | 2699 | } | 
| Paul Turner | 64660c8 | 2011-07-21 09:43:36 -0700 | [diff] [blame] | 2700 |  | 
|  | 2701 | static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) | 
|  | 2702 | { | 
|  | 2703 | return 0; | 
|  | 2704 | } | 
|  | 2705 |  | 
|  | 2706 | static inline int throttled_lb_pair(struct task_group *tg, | 
|  | 2707 | int src_cpu, int dest_cpu) | 
|  | 2708 | { | 
|  | 2709 | return 0; | 
|  | 2710 | } | 
| Peter Zijlstra | 029632f | 2011-10-25 10:00:11 +0200 | [diff] [blame] | 2711 |  | 
|  | 2712 | void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} | 
|  | 2713 |  | 
|  | 2714 | #ifdef CONFIG_FAIR_GROUP_SCHED | 
|  | 2715 | static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} | 
| Paul Turner | ab84d31 | 2011-07-21 09:43:28 -0700 | [diff] [blame] | 2716 | #endif | 
|  | 2717 |  | 
| Peter Zijlstra | 029632f | 2011-10-25 10:00:11 +0200 | [diff] [blame] | 2718 | static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) | 
|  | 2719 | { | 
|  | 2720 | return NULL; | 
|  | 2721 | } | 
|  | 2722 | static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} | 
| Peter Boonstoppel | a4c96ae | 2012-08-09 15:34:47 -0700 | [diff] [blame] | 2723 | static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {} | 
| Peter Zijlstra | 029632f | 2011-10-25 10:00:11 +0200 | [diff] [blame] | 2724 |  | 
|  | 2725 | #endif /* CONFIG_CFS_BANDWIDTH */ | 
|  | 2726 |  | 
| Ingo Molnar | bf0f6f2 | 2007-07-09 18:51:58 +0200 | [diff] [blame] | 2727 | /************************************************** | 
|  | 2728 | * CFS operations on tasks: | 
|  | 2729 | */ | 
|  | 2730 |  | 
| Peter Zijlstra | 8f4d37e | 2008-01-25 21:08:29 +0100 | [diff] [blame] | 2731 | #ifdef CONFIG_SCHED_HRTICK | 
|  | 2732 | static void hrtick_start_fair(struct rq *rq, struct task_struct *p) | 
|  | 2733 | { | 
| Peter Zijlstra | 8f4d37e | 2008-01-25 21:08:29 +0100 | [diff] [blame] | 2734 | struct sched_entity *se = &p->se; | 
|  | 2735 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 
|  | 2736 |  | 
|  | 2737 | WARN_ON(task_rq(p) != rq); | 
|  | 2738 |  | 
| Mike Galbraith | b39e66e | 2011-11-22 15:20:07 +0100 | [diff] [blame] | 2739 | if (cfs_rq->nr_running > 1) { | 
| Peter Zijlstra | 8f4d37e | 2008-01-25 21:08:29 +0100 | [diff] [blame] | 2740 | u64 slice = sched_slice(cfs_rq, se); | 
|  | 2741 | u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; | 
|  | 2742 | s64 delta = slice - ran; | 
|  | 2743 |  | 
|  | 2744 | if (delta < 0) { | 
|  | 2745 | if (rq->curr == p) | 
|  | 2746 | resched_task(p); | 
|  | 2747 | return; | 
|  | 2748 | } | 
|  | 2749 |  | 
|  | 2750 | /* | 
|  | 2751 | * Don't schedule slices shorter than 10000ns; that just | 
|  | 2752 | * doesn't make sense. Rely on vruntime for fairness. | 
|  | 2753 | */ | 
| Peter Zijlstra | 3165651 | 2008-07-18 18:01:23 +0200 | [diff] [blame] | 2754 | if (rq->curr != p) | 
| Peter Zijlstra | 157124c | 2008-07-28 11:53:11 +0200 | [diff] [blame] | 2755 | delta = max_t(s64, 10000LL, delta); | 
| Peter Zijlstra | 8f4d37e | 2008-01-25 21:08:29 +0100 | [diff] [blame] | 2756 |  | 
| Peter Zijlstra | 3165651 | 2008-07-18 18:01:23 +0200 | [diff] [blame] | 2757 | hrtick_start(rq, delta); | 
| Peter Zijlstra | 8f4d37e | 2008-01-25 21:08:29 +0100 | [diff] [blame] | 2758 | } | 
|  | 2759 | } | 
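The hrtick is armed for the remainder of the slice: delta = slice - ran, clamped below at 10 µs, with a negative delta meaning the slice is already spent. A simplified sketch of that arithmetic (the kernel only applies the clamp when arming for a task other than the current one):

```c
#include <stdio.h>

int main(void)
{
    /* Hypothetical: a 6 ms slice, of which the task has run 2.5 ms
     * since it was last picked. */
    long long slice = 6000000, ran = 2500000;
    long long delta = slice - ran;

    if (delta < 0)
        printf("slice exhausted: resched immediately\n");
    else
        printf("arm hrtick in %lld ns\n", delta > 10000 ? delta : 10000);
    return 0;
}
```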
| Peter Zijlstra | a4c2f00 | 2008-10-17 19:27:03 +0200 | [diff] [blame] | 2760 |  | 
|  | 2761 | /* | 
|  | 2762 | * called from enqueue/dequeue and updates the hrtick when the | 
|  | 2763 | * current task is from our class and nr_running is low enough | 
|  | 2764 | * to matter. | 
|  | 2765 | */ | 
|  | 2766 | static void hrtick_update(struct rq *rq) | 
|  | 2767 | { | 
|  | 2768 | struct task_struct *curr = rq->curr; | 
|  | 2769 |  | 
| Mike Galbraith | b39e66e | 2011-11-22 15:20:07 +0100 | [diff] [blame] | 2770 | if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class) | 
| Peter Zijlstra | a4c2f00 | 2008-10-17 19:27:03 +0200 | [diff] [blame] | 2771 | return; | 
|  | 2772 |  | 
|  | 2773 | if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency) | 
|  | 2774 | hrtick_start_fair(rq, curr); | 
|  | 2775 | } | 
| Dhaval Giani | 55e12e5 | 2008-06-24 23:39:43 +0530 | [diff] [blame] | 2776 | #else /* !CONFIG_SCHED_HRTICK */ | 
| Peter Zijlstra | 8f4d37e | 2008-01-25 21:08:29 +0100 | [diff] [blame] | 2777 | static inline void | 
|  | 2778 | hrtick_start_fair(struct rq *rq, struct task_struct *p) | 
|  | 2779 | { | 
|  | 2780 | } | 
| Peter Zijlstra | a4c2f00 | 2008-10-17 19:27:03 +0200 | [diff] [blame] | 2781 |  | 
|  | 2782 | static inline void hrtick_update(struct rq *rq) | 
|  | 2783 | { | 
|  | 2784 | } | 
| Peter Zijlstra | 8f4d37e | 2008-01-25 21:08:29 +0100 | [diff] [blame] | 2785 | #endif | 
|  | 2786 |  | 
| Ingo Molnar | bf0f6f2 | 2007-07-09 18:51:58 +0200 | [diff] [blame] | 2787 | /* | 
|  | 2788 | * The enqueue_task method is called before nr_running is | 
|  | 2789 | * increased. Here we update the fair scheduling stats and | 
|  | 2790 | * then put the task into the rbtree: | 
|  | 2791 | */ | 
| Thomas Gleixner | ea87bb7 | 2010-01-20 20:58:57 +0000 | [diff] [blame] | 2792 | static void | 
| Peter Zijlstra | 371fd7e | 2010-03-24 16:38:48 +0100 | [diff] [blame] | 2793 | enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) | 
| Ingo Molnar | bf0f6f2 | 2007-07-09 18:51:58 +0200 | [diff] [blame] | 2794 | { | 
|  | 2795 | struct cfs_rq *cfs_rq; | 
| Peter Zijlstra | 62fb185 | 2008-02-25 17:34:02 +0100 | [diff] [blame] | 2796 | struct sched_entity *se = &p->se; | 
| Ingo Molnar | bf0f6f2 | 2007-07-09 18:51:58 +0200 | [diff] [blame] | 2797 |  | 
|  | 2798 | for_each_sched_entity(se) { | 
| Peter Zijlstra | 62fb185 | 2008-02-25 17:34:02 +0100 | [diff] [blame] | 2799 | if (se->on_rq) | 
| Ingo Molnar | bf0f6f2 | 2007-07-09 18:51:58 +0200 | [diff] [blame] | 2800 | break; | 
|  | 2801 | cfs_rq = cfs_rq_of(se); | 
| Peter Zijlstra | 88ec22d | 2009-12-16 18:04:41 +0100 | [diff] [blame] | 2802 | enqueue_entity(cfs_rq, se, flags); | 
| Paul Turner | 85dac90 | 2011-07-21 09:43:33 -0700 | [diff] [blame] | 2803 |  | 
|  | 2804 | /* | 
|  | 2805 | * end evaluation on encountering a throttled cfs_rq | 
|  | 2806 | * | 
|  | 2807 | * note: in the case of encountering a throttled cfs_rq we will | 
|  | 2808 | * post the final h_nr_running increment below. | 
|  | 2809 | */ | 
|  | 2810 | if (cfs_rq_throttled(cfs_rq)) | 
|  | 2811 | break; | 
| Paul Turner | 953bfcd | 2011-07-21 09:43:27 -0700 | [diff] [blame] | 2812 | cfs_rq->h_nr_running++; | 
| Paul Turner | 85dac90 | 2011-07-21 09:43:33 -0700 | [diff] [blame] | 2813 |  | 
| Peter Zijlstra | 88ec22d | 2009-12-16 18:04:41 +0100 | [diff] [blame] | 2814 | flags = ENQUEUE_WAKEUP; | 
| Ingo Molnar | bf0f6f2 | 2007-07-09 18:51:58 +0200 | [diff] [blame] | 2815 | } | 
| Peter Zijlstra | 8f4d37e | 2008-01-25 21:08:29 +0100 | [diff] [blame] | 2816 |  | 
| Peter Zijlstra | 2069dd7 | 2010-11-15 15:47:00 -0800 | [diff] [blame] | 2817 | for_each_sched_entity(se) { | 
| Lin Ming | 0f31714 | 2011-07-22 09:14:31 +0800 | [diff] [blame] | 2818 | cfs_rq = cfs_rq_of(se); | 
| Paul Turner | 953bfcd | 2011-07-21 09:43:27 -0700 | [diff] [blame] | 2819 | cfs_rq->h_nr_running++; | 
| Peter Zijlstra | 2069dd7 | 2010-11-15 15:47:00 -0800 | [diff] [blame] | 2820 |  | 
| Paul Turner | 85dac90 | 2011-07-21 09:43:33 -0700 | [diff] [blame] | 2821 | if (cfs_rq_throttled(cfs_rq)) | 
|  | 2822 | break; | 
|  | 2823 |  | 
| Linus Torvalds | 17bc14b | 2012-12-14 07:20:43 -0800 | [diff] [blame] | 2824 | update_cfs_shares(cfs_rq); | 
| Paul Turner | 9ee474f | 2012-10-04 13:18:30 +0200 | [diff] [blame] | 2825 | update_entity_load_avg(se, 1); | 
| Peter Zijlstra | 2069dd7 | 2010-11-15 15:47:00 -0800 | [diff] [blame] | 2826 | } | 
|  | 2827 |  | 
| Ben Segall | 18bf280 | 2012-10-04 12:51:20 +0200 | [diff] [blame] | 2828 | if (!se) { | 
|  | 2829 | update_rq_runnable_avg(rq, rq->nr_running); | 
| Paul Turner | 85dac90 | 2011-07-21 09:43:33 -0700 | [diff] [blame] | 2830 | inc_nr_running(rq); | 
| Ben Segall | 18bf280 | 2012-10-04 12:51:20 +0200 | [diff] [blame] | 2831 | } | 
| Peter Zijlstra | a4c2f00 | 2008-10-17 19:27:03 +0200 | [diff] [blame] | 2832 | hrtick_update(rq); | 
| Ingo Molnar | bf0f6f2 | 2007-07-09 18:51:58 +0200 | [diff] [blame] | 2833 | } | 
|  | 2834 |  | 
| Venkatesh Pallipadi | 2f36825 | 2011-04-14 10:30:53 -0700 | [diff] [blame] | 2835 | static void set_next_buddy(struct sched_entity *se); | 
|  | 2836 |  | 
| Ingo Molnar | bf0f6f2 | 2007-07-09 18:51:58 +0200 | [diff] [blame] | 2837 | /* | 
|  | 2838 | * The dequeue_task method is called before nr_running is | 
|  | 2839 | * decreased. We remove the task from the rbtree and | 
|  | 2840 | * update the fair scheduling stats: | 
|  | 2841 | */ | 
| Peter Zijlstra | 371fd7e | 2010-03-24 16:38:48 +0100 | [diff] [blame] | 2842 | static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) | 
| Ingo Molnar | bf0f6f2 | 2007-07-09 18:51:58 +0200 | [diff] [blame] | 2843 | { | 
|  | 2844 | struct cfs_rq *cfs_rq; | 
| Peter Zijlstra | 62fb185 | 2008-02-25 17:34:02 +0100 | [diff] [blame] | 2845 | struct sched_entity *se = &p->se; | 
| Venkatesh Pallipadi | 2f36825 | 2011-04-14 10:30:53 -0700 | [diff] [blame] | 2846 | int task_sleep = flags & DEQUEUE_SLEEP; | 
| Ingo Molnar | bf0f6f2 | 2007-07-09 18:51:58 +0200 | [diff] [blame] | 2847 |  | 
|  | 2848 | for_each_sched_entity(se) { | 
|  | 2849 | cfs_rq = cfs_rq_of(se); | 
| Peter Zijlstra | 371fd7e | 2010-03-24 16:38:48 +0100 | [diff] [blame] | 2850 | dequeue_entity(cfs_rq, se, flags); | 
| Paul Turner | 85dac90 | 2011-07-21 09:43:33 -0700 | [diff] [blame] | 2851 |  | 
|  | 2852 | /* | 
|  | 2853 | * end evaluation on encountering a throttled cfs_rq | 
|  | 2854 | * | 
|  | 2855 | * note: in the case of encountering a throttled cfs_rq we will | 
|  | 2856 | * post the final h_nr_running decrement below. | 
|  | 2857 | */ | 
|  | 2858 | if (cfs_rq_throttled(cfs_rq)) | 
|  | 2859 | break; | 
| Paul Turner | 953bfcd | 2011-07-21 09:43:27 -0700 | [diff] [blame] | 2860 | cfs_rq->h_nr_running--; | 
| Peter Zijlstra | 2069dd7 | 2010-11-15 15:47:00 -0800 | [diff] [blame] | 2861 |  | 
| Ingo Molnar | bf0f6f2 | 2007-07-09 18:51:58 +0200 | [diff] [blame] | 2862 | /* Don't dequeue parent if it has other entities besides us */ | 
| Venkatesh Pallipadi | 2f36825 | 2011-04-14 10:30:53 -0700 | [diff] [blame] | 2863 | if (cfs_rq->load.weight) { | 
|  | 2864 | /* | 
|  | 2865 | * Bias pick_next to pick a task from this cfs_rq, as | 
|  | 2866 | * p is sleeping when it is within its sched_slice. | 
|  | 2867 | */ | 
|  | 2868 | if (task_sleep && parent_entity(se)) | 
|  | 2869 | set_next_buddy(parent_entity(se)); | 
| Paul Turner | 9598c82 | 2011-07-06 22:30:37 -0700 | [diff] [blame] | 2870 |  | 
|  | 2871 | /* avoid re-evaluating load for this entity */ | 
|  | 2872 | se = parent_entity(se); | 
| Ingo Molnar | bf0f6f2 | 2007-07-09 18:51:58 +0200 | [diff] [blame] | 2873 | break; | 
| Venkatesh Pallipadi | 2f36825 | 2011-04-14 10:30:53 -0700 | [diff] [blame] | 2874 | } | 
| Peter Zijlstra | 371fd7e | 2010-03-24 16:38:48 +0100 | [diff] [blame] | 2875 | flags |= DEQUEUE_SLEEP; | 
| Ingo Molnar | bf0f6f2 | 2007-07-09 18:51:58 +0200 | [diff] [blame] | 2876 | } | 
| Peter Zijlstra | 8f4d37e | 2008-01-25 21:08:29 +0100 | [diff] [blame] | 2877 |  | 
| Peter Zijlstra | 2069dd7 | 2010-11-15 15:47:00 -0800 | [diff] [blame] | 2878 | for_each_sched_entity(se) { | 
| Lin Ming | 0f31714 | 2011-07-22 09:14:31 +0800 | [diff] [blame] | 2879 | cfs_rq = cfs_rq_of(se); | 
| Paul Turner | 953bfcd | 2011-07-21 09:43:27 -0700 | [diff] [blame] | 2880 | cfs_rq->h_nr_running--; | 
| Peter Zijlstra | 2069dd7 | 2010-11-15 15:47:00 -0800 | [diff] [blame] | 2881 |  | 
| Paul Turner | 85dac90 | 2011-07-21 09:43:33 -0700 | [diff] [blame] | 2882 | if (cfs_rq_throttled(cfs_rq)) | 
|  | 2883 | break; | 
|  | 2884 |  | 
| Linus Torvalds | 17bc14b | 2012-12-14 07:20:43 -0800 | [diff] [blame] | 2885 | update_cfs_shares(cfs_rq); | 
| Paul Turner | 9ee474f | 2012-10-04 13:18:30 +0200 | [diff] [blame] | 2886 | update_entity_load_avg(se, 1); | 
| Peter Zijlstra | 2069dd7 | 2010-11-15 15:47:00 -0800 | [diff] [blame] | 2887 | } | 
|  | 2888 |  | 
| Ben Segall | 18bf280 | 2012-10-04 12:51:20 +0200 | [diff] [blame] | 2889 | if (!se) { | 
| Paul Turner | 85dac90 | 2011-07-21 09:43:33 -0700 | [diff] [blame] | 2890 | dec_nr_running(rq); | 
| Ben Segall | 18bf280 | 2012-10-04 12:51:20 +0200 | [diff] [blame] | 2891 | update_rq_runnable_avg(rq, 1); | 
|  | 2892 | } | 
| Peter Zijlstra | a4c2f00 | 2008-10-17 19:27:03 +0200 | [diff] [blame] | 2893 | hrtick_update(rq); | 
| Ingo Molnar | bf0f6f2 | 2007-07-09 18:51:58 +0200 | [diff] [blame] | 2894 | } | 
|  | 2895 |  | 
| Gregory Haskins | e7693a3 | 2008-01-25 21:08:09 +0100 | [diff] [blame] | 2896 | #ifdef CONFIG_SMP | 
| Peter Zijlstra | 029632f | 2011-10-25 10:00:11 +0200 | [diff] [blame] | 2897 | /* Used instead of source_load when we know the type == 0 */ | 
|  | 2898 | static unsigned long weighted_cpuload(const int cpu) | 
|  | 2899 | { | 
|  | 2900 | return cpu_rq(cpu)->load.weight; | 
|  | 2901 | } | 
|  | 2902 |  | 
|  | 2903 | /* | 
|  | 2904 | * Return a low guess at the load of a migration-source cpu weighted | 
|  | 2905 | * according to the scheduling class and "nice" value. | 
|  | 2906 | * | 
|  | 2907 | * We want to under-estimate the load of migration sources, to | 
|  | 2908 | * balance conservatively. | 
|  | 2909 | */ | 
|  | 2910 | static unsigned long source_load(int cpu, int type) | 
|  | 2911 | { | 
|  | 2912 | struct rq *rq = cpu_rq(cpu); | 
|  | 2913 | unsigned long total = weighted_cpuload(cpu); | 
|  | 2914 |  | 
|  | 2915 | if (type == 0 || !sched_feat(LB_BIAS)) | 
|  | 2916 | return total; | 
|  | 2917 |  | 
|  | 2918 | return min(rq->cpu_load[type-1], total); | 
|  | 2919 | } | 
|  | 2920 |  | 
|  | 2921 | /* | 
|  | 2922 | * Return a high guess at the load of a migration-target cpu weighted | 
|  | 2923 | * according to the scheduling class and "nice" value. | 
|  | 2924 | */ | 
|  | 2925 | static unsigned long target_load(int cpu, int type) | 
|  | 2926 | { | 
|  | 2927 | struct rq *rq = cpu_rq(cpu); | 
|  | 2928 | unsigned long total = weighted_cpuload(cpu); | 
|  | 2929 |  | 
|  | 2930 | if (type == 0 || !sched_feat(LB_BIAS)) | 
|  | 2931 | return total; | 
|  | 2932 |  | 
|  | 2933 | return max(rq->cpu_load[type-1], total); | 
|  | 2934 | } | 
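/*
 * A minimal standalone sketch (not kernel code) of the biasing above:
 * source_load() under-estimates with min(), target_load() over-estimates
 * with max(), so a migration only looks attractive when it helps under
 * both pessimistic views.  'hist_load' stands in for rq->cpu_load[type-1]
 * and 'inst_load' for weighted_cpuload().
 */
static unsigned long sketch_source_load(unsigned long hist_load,
					unsigned long inst_load)
{
	/* low guess: the smaller of decayed history and instantaneous load */
	return hist_load < inst_load ? hist_load : inst_load;
}

static unsigned long sketch_target_load(unsigned long hist_load,
					unsigned long inst_load)
{
	/* high guess: the larger of the two views */
	return hist_load > inst_load ? hist_load : inst_load;
}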
|  | 2935 |  | 
|  | 2936 | static unsigned long power_of(int cpu) | 
|  | 2937 | { | 
|  | 2938 | return cpu_rq(cpu)->cpu_power; | 
|  | 2939 | } | 
|  | 2940 |  | 
|  | 2941 | static unsigned long cpu_avg_load_per_task(int cpu) | 
|  | 2942 | { | 
|  | 2943 | struct rq *rq = cpu_rq(cpu); | 
|  | 2944 | unsigned long nr_running = ACCESS_ONCE(rq->nr_running); | 
|  | 2945 |  | 
|  | 2946 | if (nr_running) | 
|  | 2947 | return rq->load.weight / nr_running; | 
|  | 2948 |  | 
|  | 2949 | return 0; | 
|  | 2950 | } | 
|  | 2951 |  | 
| Ingo Molnar | 098fb9d | 2008-03-16 20:36:10 +0100 | [diff] [blame] | 2952 |  | 
| Peter Zijlstra | 74f8e4b | 2011-04-05 17:23:47 +0200 | [diff] [blame] | 2953 | static void task_waking_fair(struct task_struct *p) | 
| Peter Zijlstra | 88ec22d | 2009-12-16 18:04:41 +0100 | [diff] [blame] | 2954 | { | 
|  | 2955 | struct sched_entity *se = &p->se; | 
|  | 2956 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 
| Peter Zijlstra | 3fe1698 | 2011-04-05 17:23:48 +0200 | [diff] [blame] | 2957 | u64 min_vruntime; | 
| Peter Zijlstra | 88ec22d | 2009-12-16 18:04:41 +0100 | [diff] [blame] | 2958 |  | 
| Peter Zijlstra | 3fe1698 | 2011-04-05 17:23:48 +0200 | [diff] [blame] | 2959 | #ifndef CONFIG_64BIT | 
|  | 2960 | u64 min_vruntime_copy; | 
| Peter Zijlstra | 74f8e4b | 2011-04-05 17:23:47 +0200 | [diff] [blame] | 2961 |  | 
| Peter Zijlstra | 3fe1698 | 2011-04-05 17:23:48 +0200 | [diff] [blame] | 2962 | do { | 
|  | 2963 | min_vruntime_copy = cfs_rq->min_vruntime_copy; | 
|  | 2964 | smp_rmb(); | 
|  | 2965 | min_vruntime = cfs_rq->min_vruntime; | 
|  | 2966 | } while (min_vruntime != min_vruntime_copy); | 
|  | 2967 | #else | 
|  | 2968 | min_vruntime = cfs_rq->min_vruntime; | 
|  | 2969 | #endif | 
|  | 2970 |  | 
|  | 2971 | se->vruntime -= min_vruntime; | 
| Peter Zijlstra | 88ec22d | 2009-12-16 18:04:41 +0100 | [diff] [blame] | 2972 | } | 
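/*
 * Sketch of the min_vruntime/min_vruntime_copy protocol above, as a
 * standalone C11 analogue; the kernel uses it to read a 64-bit value
 * consistently on 32-bit targets where plain u64 loads can tear.  The
 * writer stores the value, a write barrier, then the copy; the reader
 * loads the copy, a read barrier, then the value, retrying until both
 * agree.  Names and types here are hypothetical stand-ins.
 */
#include <stdatomic.h>
#include <stdint.h>

struct seq_u64 {
	uint64_t val;
	uint64_t copy;
};

static void seq_u64_write(struct seq_u64 *s, uint64_t v)
{
	s->val = v;
	atomic_thread_fence(memory_order_release);	/* smp_wmb() analogue */
	s->copy = v;
}

static uint64_t seq_u64_read(struct seq_u64 *s)
{
	uint64_t copy, val;

	do {
		copy = s->copy;
		atomic_thread_fence(memory_order_acquire); /* smp_rmb() analogue */
		val = s->val;
	} while (val != copy);

	return val;
}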
|  | 2973 |  | 
| Peter Zijlstra | bb3469a | 2008-06-27 13:41:27 +0200 | [diff] [blame] | 2974 | #ifdef CONFIG_FAIR_GROUP_SCHED | 
| Peter Zijlstra | f5bfb7d | 2008-06-27 13:41:39 +0200 | [diff] [blame] | 2975 | /* | 
|  | 2976 | * effective_load() calculates the load change as seen from the root_task_group | 
|  | 2977 | * | 
|  | 2978 | * Adding load to a group doesn't make a group heavier, but can cause movement | 
|  | 2979 | * of group shares between cpus. Assuming the shares were perfectly aligned one | 
|  | 2980 | * can calculate the shift in shares. | 
| Peter Zijlstra | cf5f0ac | 2011-10-13 16:52:28 +0200 | [diff] [blame] | 2981 | * | 
|  | 2982 | * Calculate the effective load difference if @wl is added (subtracted) to @tg | 
|  | 2983 | * on this @cpu and results in a total addition (subtraction) of @wg to the | 
|  | 2984 | * total group weight. | 
|  | 2985 | * | 
|  | 2986 | * Given a runqueue weight distribution (rw_i) we can compute a shares | 
|  | 2987 | * distribution (s_i) using: | 
|  | 2988 | * | 
|  | 2989 | *   s_i = rw_i / \Sum rw_j						(1) | 
|  | 2990 | * | 
|  | 2991 | * Suppose we have 4 CPUs and our @tg is a direct child of the root group and | 
|  | 2992 | * has 7 equal weight tasks, distributed as below (rw_i), with the resulting | 
|  | 2993 | * shares distribution (s_i): | 
|  | 2994 | * | 
|  | 2995 | *   rw_i = {   2,   4,   1,   0 } | 
|  | 2996 | *   s_i  = { 2/7, 4/7, 1/7,   0 } | 
|  | 2997 | * | 
|  | 2998 | * As per wake_affine() we're interested in the load of two CPUs (the CPU the | 
|  | 2999 | * task used to run on and the CPU the waker is running on), so we need to | 
|  | 3000 | * compute the effect of waking a task on either CPU and, in case of a sync | 
|  | 3001 | * wakeup, compute the effect of the current task going to sleep. | 
|  | 3002 | * | 
|  | 3003 | * So for a change of @wl to the local @cpu with an overall group weight change | 
|  | 3004 | * of @wg we can compute the new shares distribution (s'_i) using: | 
|  | 3005 | * | 
|  | 3006 | *   s'_i = (rw_i + @wl) / (@wg + \Sum rw_j)				(2) | 
|  | 3007 | * | 
|  | 3008 | * Suppose we're interested in CPUs 0 and 1, and want to compute the load | 
|  | 3009 | * differences in waking a task to CPU 0. The additional task changes the | 
|  | 3010 | * weight and shares distributions like: | 
|  | 3011 | * | 
|  | 3012 | *   rw'_i = {   3,   4,   1,   0 } | 
|  | 3013 | *   s'_i  = { 3/8, 4/8, 1/8,   0 } | 
|  | 3014 | * | 
|  | 3015 | * We can then compute the difference in effective weight by using: | 
|  | 3016 | * | 
|  | 3017 | *   dw_i = S * (s'_i - s_i)						(3) | 
|  | 3018 | * | 
|  | 3019 | * Where 'S' is the group weight as seen by its parent. | 
|  | 3020 | * | 
|  | 3021 | * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7) | 
|  | 3022 | * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 - | 
|  | 3023 | * 4/7) times the weight of the group. | 
| Peter Zijlstra | f5bfb7d | 2008-06-27 13:41:39 +0200 | [diff] [blame] | 3024 | */ | 
| Peter Zijlstra | 2069dd7 | 2010-11-15 15:47:00 -0800 | [diff] [blame] | 3025 | static long effective_load(struct task_group *tg, int cpu, long wl, long wg) | 
| Peter Zijlstra | bb3469a | 2008-06-27 13:41:27 +0200 | [diff] [blame] | 3026 | { | 
| Peter Zijlstra | 4be9daa | 2008-06-27 13:41:30 +0200 | [diff] [blame] | 3027 | struct sched_entity *se = tg->se[cpu]; | 
| Peter Zijlstra | f1d239f | 2008-06-27 13:41:38 +0200 | [diff] [blame] | 3028 |  | 
| Peter Zijlstra | cf5f0ac | 2011-10-13 16:52:28 +0200 | [diff] [blame] | 3029 | if (!tg->parent)	/* the trivial, non-cgroup case */ | 
| Peter Zijlstra | f1d239f | 2008-06-27 13:41:38 +0200 | [diff] [blame] | 3030 | return wl; | 
|  | 3031 |  | 
| Peter Zijlstra | 4be9daa | 2008-06-27 13:41:30 +0200 | [diff] [blame] | 3032 | for_each_sched_entity(se) { | 
| Peter Zijlstra | cf5f0ac | 2011-10-13 16:52:28 +0200 | [diff] [blame] | 3033 | long w, W; | 
| Peter Zijlstra | bb3469a | 2008-06-27 13:41:27 +0200 | [diff] [blame] | 3034 |  | 
| Paul Turner | 977dda7 | 2011-01-14 17:57:50 -0800 | [diff] [blame] | 3035 | tg = se->my_q->tg; | 
| Peter Zijlstra | 4be9daa | 2008-06-27 13:41:30 +0200 | [diff] [blame] | 3036 |  | 
| Peter Zijlstra | cf5f0ac | 2011-10-13 16:52:28 +0200 | [diff] [blame] | 3037 | /* | 
|  | 3038 | * W = @wg + \Sum rw_j | 
|  | 3039 | */ | 
|  | 3040 | W = wg + calc_tg_weight(tg, se->my_q); | 
| Peter Zijlstra | 4be9daa | 2008-06-27 13:41:30 +0200 | [diff] [blame] | 3041 |  | 
| Peter Zijlstra | cf5f0ac | 2011-10-13 16:52:28 +0200 | [diff] [blame] | 3042 | /* | 
|  | 3043 | * w = rw_i + @wl | 
|  | 3044 | */ | 
|  | 3045 | w = se->my_q->load.weight + wl; | 
| Peter Zijlstra | 940959e | 2008-09-23 15:33:42 +0200 | [diff] [blame] | 3046 |  | 
| Peter Zijlstra | cf5f0ac | 2011-10-13 16:52:28 +0200 | [diff] [blame] | 3047 | /* | 
|  | 3048 | * wl = S * s'_i; see (2) | 
|  | 3049 | */ | 
|  | 3050 | if (W > 0 && w < W) | 
|  | 3051 | wl = (w * tg->shares) / W; | 
| Paul Turner | 977dda7 | 2011-01-14 17:57:50 -0800 | [diff] [blame] | 3052 | else | 
|  | 3053 | wl = tg->shares; | 
| Peter Zijlstra | 940959e | 2008-09-23 15:33:42 +0200 | [diff] [blame] | 3054 |  | 
| Peter Zijlstra | cf5f0ac | 2011-10-13 16:52:28 +0200 | [diff] [blame] | 3055 | /* | 
|  | 3056 | * Per the above, wl is the new se->load.weight value; since | 
|  | 3057 | * those are clipped to [MIN_SHARES, ...) do so now. See | 
|  | 3058 | * calc_cfs_shares(). | 
|  | 3059 | */ | 
| Paul Turner | 977dda7 | 2011-01-14 17:57:50 -0800 | [diff] [blame] | 3060 | if (wl < MIN_SHARES) | 
|  | 3061 | wl = MIN_SHARES; | 
| Peter Zijlstra | cf5f0ac | 2011-10-13 16:52:28 +0200 | [diff] [blame] | 3062 |  | 
|  | 3063 | /* | 
|  | 3064 | * wl = dw_i = S * (s'_i - s_i); see (3) | 
|  | 3065 | */ | 
| Paul Turner | 977dda7 | 2011-01-14 17:57:50 -0800 | [diff] [blame] | 3066 | wl -= se->load.weight; | 
| Peter Zijlstra | cf5f0ac | 2011-10-13 16:52:28 +0200 | [diff] [blame] | 3067 |  | 
|  | 3068 | /* | 
|  | 3069 | * Recursively apply this logic to all parent groups to compute | 
|  | 3070 | * the final effective load change on the root group. Since | 
|  | 3071 | * only the @tg group gets extra weight, all parent groups can | 
|  | 3072 | * only redistribute existing shares. @wl is the shift in shares | 
|  | 3073 | * resulting from this level per the above. | 
|  | 3074 | */ | 
| Peter Zijlstra | 4be9daa | 2008-06-27 13:41:30 +0200 | [diff] [blame] | 3075 | wg = 0; | 
| Peter Zijlstra | 4be9daa | 2008-06-27 13:41:30 +0200 | [diff] [blame] | 3076 | } | 
|  | 3077 |  | 
|  | 3078 | return wl; | 
| Peter Zijlstra | bb3469a | 2008-06-27 13:41:27 +0200 | [diff] [blame] | 3079 | } | 
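/*
 * Standalone sketch (not kernel code) of one level of the arithmetic
 * above, using the 4-cpu example from the comment with a hypothetical
 * group weight S = tg->shares = 1024: waking one task of weight 1 on
 * CPU 0 moves its share from 2/7 to 3/8 of S.
 */
#include <stdio.h>

static long sketch_level(long shares, long rw_i, long sum_rw, long wl, long wg)
{
	long W = wg + sum_rw;			/* @wg + \Sum rw_j, eq. (2) */
	long w = rw_i + wl;			/* rw_i + @wl               */
	long s_new = (W > 0 && w < W) ? w * shares / W : shares;
	long s_old = rw_i * shares / sum_rw;	/* old share, eq. (1)       */

	return s_new - s_old;			/* dw_i = S * (s'_i - s_i)  */
}

int main(void)
{
	/* rw_i = {2, 4, 1, 0}, \Sum rw_j = 7; add wl = wg = 1 on CPU 0 */
	printf("dw_0 = %ld\n", sketch_level(1024, 2, 7, 1, 1));
	return 0;	/* prints 92, i.e. roughly 5/56 of S = 1024 */
}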
|  | 3080 | #else | 
| Peter Zijlstra | 4be9daa | 2008-06-27 13:41:30 +0200 | [diff] [blame] | 3081 |  | 
| Peter Zijlstra | 8337826 | 2008-06-27 13:41:37 +0200 | [diff] [blame] | 3082 | static inline unsigned long effective_load(struct task_group *tg, int cpu, | 
|  | 3083 | unsigned long wl, unsigned long wg) | 
| Peter Zijlstra | 4be9daa | 2008-06-27 13:41:30 +0200 | [diff] [blame] | 3084 | { | 
| Peter Zijlstra | 8337826 | 2008-06-27 13:41:37 +0200 | [diff] [blame] | 3085 | return wl; | 
| Peter Zijlstra | bb3469a | 2008-06-27 13:41:27 +0200 | [diff] [blame] | 3086 | } | 
| Peter Zijlstra | 4be9daa | 2008-06-27 13:41:30 +0200 | [diff] [blame] | 3087 |  | 
| Peter Zijlstra | bb3469a | 2008-06-27 13:41:27 +0200 | [diff] [blame] | 3088 | #endif | 
|  | 3089 |  | 
| Peter Zijlstra | c88d591 | 2009-09-10 13:50:02 +0200 | [diff] [blame] | 3090 | static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) | 
| Ingo Molnar | 098fb9d | 2008-03-16 20:36:10 +0100 | [diff] [blame] | 3091 | { | 
| Paul Turner | e37b6a7 | 2011-01-21 20:44:59 -0800 | [diff] [blame] | 3092 | s64 this_load, load; | 
| Peter Zijlstra | c88d591 | 2009-09-10 13:50:02 +0200 | [diff] [blame] | 3093 | int idx, this_cpu, prev_cpu; | 
| Ingo Molnar | 098fb9d | 2008-03-16 20:36:10 +0100 | [diff] [blame] | 3094 | unsigned long tl_per_task; | 
| Peter Zijlstra | c88d591 | 2009-09-10 13:50:02 +0200 | [diff] [blame] | 3095 | struct task_group *tg; | 
| Peter Zijlstra | 8337826 | 2008-06-27 13:41:37 +0200 | [diff] [blame] | 3096 | unsigned long weight; | 
| Mike Galbraith | b3137bc | 2008-05-29 11:11:41 +0200 | [diff] [blame] | 3097 | int balanced; | 
| Ingo Molnar | 098fb9d | 2008-03-16 20:36:10 +0100 | [diff] [blame] | 3098 |  | 
| Peter Zijlstra | c88d591 | 2009-09-10 13:50:02 +0200 | [diff] [blame] | 3099 | idx	  = sd->wake_idx; | 
|  | 3100 | this_cpu  = smp_processor_id(); | 
|  | 3101 | prev_cpu  = task_cpu(p); | 
|  | 3102 | load	  = source_load(prev_cpu, idx); | 
|  | 3103 | this_load = target_load(this_cpu, idx); | 
| Ingo Molnar | 098fb9d | 2008-03-16 20:36:10 +0100 | [diff] [blame] | 3104 |  | 
|  | 3105 | /* | 
| Ingo Molnar | 098fb9d | 2008-03-16 20:36:10 +0100 | [diff] [blame] | 3106 | * If sync wakeup then subtract the (maximum possible) | 
|  | 3107 | * effect of the currently running task from the load | 
|  | 3108 | * of the current CPU: | 
|  | 3109 | */ | 
| Peter Zijlstra | 8337826 | 2008-06-27 13:41:37 +0200 | [diff] [blame] | 3110 | if (sync) { | 
|  | 3111 | tg = task_group(current); | 
|  | 3112 | weight = current->se.load.weight; | 
| Ingo Molnar | 098fb9d | 2008-03-16 20:36:10 +0100 | [diff] [blame] | 3113 |  | 
| Peter Zijlstra | c88d591 | 2009-09-10 13:50:02 +0200 | [diff] [blame] | 3114 | this_load += effective_load(tg, this_cpu, -weight, -weight); | 
| Peter Zijlstra | 8337826 | 2008-06-27 13:41:37 +0200 | [diff] [blame] | 3115 | load += effective_load(tg, prev_cpu, 0, -weight); | 
|  | 3116 | } | 
|  | 3117 |  | 
|  | 3118 | tg = task_group(p); | 
|  | 3119 | weight = p->se.load.weight; | 
|  | 3120 |  | 
| Peter Zijlstra | 71a29aa | 2009-09-07 18:28:05 +0200 | [diff] [blame] | 3121 | /* | 
|  | 3122 | * In low-load situations, where prev_cpu is idle and this_cpu is idle | 
| Peter Zijlstra | c88d591 | 2009-09-10 13:50:02 +0200 | [diff] [blame] | 3123 | * due to the sync case above having dropped this_load to 0, we'll | 
|  | 3124 | * always have an imbalance, but there's really nothing you can do | 
|  | 3125 | * about that, so that's good too. | 
| Peter Zijlstra | 71a29aa | 2009-09-07 18:28:05 +0200 | [diff] [blame] | 3126 | * | 
|  | 3127 | * Otherwise check if either cpu is near enough in load to allow this | 
|  | 3128 | * task to be woken on this_cpu. | 
|  | 3129 | */ | 
| Paul Turner | e37b6a7 | 2011-01-21 20:44:59 -0800 | [diff] [blame] | 3130 | if (this_load > 0) { | 
|  | 3131 | s64 this_eff_load, prev_eff_load; | 
| Peter Zijlstra | e51fd5e | 2010-05-31 12:37:30 +0200 | [diff] [blame] | 3132 |  | 
|  | 3133 | this_eff_load = 100; | 
|  | 3134 | this_eff_load *= power_of(prev_cpu); | 
|  | 3135 | this_eff_load *= this_load + | 
|  | 3136 | effective_load(tg, this_cpu, weight, weight); | 
|  | 3137 |  | 
|  | 3138 | prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2; | 
|  | 3139 | prev_eff_load *= power_of(this_cpu); | 
|  | 3140 | prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight); | 
|  | 3141 |  | 
|  | 3142 | balanced = this_eff_load <= prev_eff_load; | 
|  | 3143 | } else | 
|  | 3144 | balanced = true; | 
| Mike Galbraith | b3137bc | 2008-05-29 11:11:41 +0200 | [diff] [blame] | 3145 |  | 
|  | 3146 | /* | 
|  | 3147 | * If the currently running task will sleep within | 
|  | 3148 | * a reasonable amount of time then attract this newly | 
|  | 3149 | * woken task: | 
|  | 3150 | */ | 
| Peter Zijlstra | 2fb7635 | 2008-10-08 09:16:04 +0200 | [diff] [blame] | 3151 | if (sync && balanced) | 
|  | 3152 | return 1; | 
| Mike Galbraith | b3137bc | 2008-05-29 11:11:41 +0200 | [diff] [blame] | 3153 |  | 
| Lucas De Marchi | 41acab8 | 2010-03-10 23:37:45 -0300 | [diff] [blame] | 3154 | schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts); | 
| Mike Galbraith | b3137bc | 2008-05-29 11:11:41 +0200 | [diff] [blame] | 3155 | tl_per_task = cpu_avg_load_per_task(this_cpu); | 
|  | 3156 |  | 
| Peter Zijlstra | c88d591 | 2009-09-10 13:50:02 +0200 | [diff] [blame] | 3157 | if (balanced || | 
|  | 3158 | (this_load <= load && | 
|  | 3159 | this_load + target_load(prev_cpu, idx) <= tl_per_task)) { | 
| Ingo Molnar | 098fb9d | 2008-03-16 20:36:10 +0100 | [diff] [blame] | 3160 | /* | 
|  | 3161 | * This domain has SD_WAKE_AFFINE and | 
|  | 3162 | * p is cache cold in this domain, and | 
|  | 3163 | * there is no bad imbalance. | 
|  | 3164 | */ | 
| Peter Zijlstra | c88d591 | 2009-09-10 13:50:02 +0200 | [diff] [blame] | 3165 | schedstat_inc(sd, ttwu_move_affine); | 
| Lucas De Marchi | 41acab8 | 2010-03-10 23:37:45 -0300 | [diff] [blame] | 3166 | schedstat_inc(p, se.statistics.nr_wakeups_affine); | 
| Ingo Molnar | 098fb9d | 2008-03-16 20:36:10 +0100 | [diff] [blame] | 3167 |  | 
|  | 3168 | return 1; | 
|  | 3169 | } | 
|  | 3170 | return 0; | 
|  | 3171 | } | 
|  | 3172 |  | 
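/*
 * Sketch of the balance test in wake_affine() above, with the group
 * effective_load() terms omitted: cross-multiplying by each side's cpu
 * power avoids divisions, and prev_cpu gets half the imbalance_pct as a
 * handicap before the waking cpu is allowed to pull the task.
 */
static int sketch_wake_affine_balanced(long long this_load, long long prev_load,
				       long long this_power, long long prev_power,
				       int imbalance_pct)
{
	long long this_eff_load = 100 * prev_power * this_load;
	long long prev_eff_load = (100 + (imbalance_pct - 100) / 2) *
				  this_power * prev_load;

	return this_eff_load <= prev_eff_load;	/* 1 => wake on this cpu */
}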
| Peter Zijlstra | aaee120 | 2009-09-10 13:36:25 +0200 | [diff] [blame] | 3173 | /* | 
|  | 3174 | * find_idlest_group finds and returns the least busy CPU group within the | 
|  | 3175 | * domain. | 
|  | 3176 | */ | 
|  | 3177 | static struct sched_group * | 
| Peter Zijlstra | 78e7ed5 | 2009-09-03 13:16:51 +0200 | [diff] [blame] | 3178 | find_idlest_group(struct sched_domain *sd, struct task_struct *p, | 
| Peter Zijlstra | 5158f4e | 2009-09-16 13:46:59 +0200 | [diff] [blame] | 3179 | int this_cpu, int load_idx) | 
| Gregory Haskins | e7693a3 | 2008-01-25 21:08:09 +0100 | [diff] [blame] | 3180 | { | 
| Andi Kleen | b3bd3de | 2010-08-10 14:17:51 -0700 | [diff] [blame] | 3181 | struct sched_group *idlest = NULL, *group = sd->groups; | 
| Peter Zijlstra | aaee120 | 2009-09-10 13:36:25 +0200 | [diff] [blame] | 3182 | unsigned long min_load = ULONG_MAX, this_load = 0; | 
| Peter Zijlstra | aaee120 | 2009-09-10 13:36:25 +0200 | [diff] [blame] | 3183 | int imbalance = 100 + (sd->imbalance_pct-100)/2; | 
| Gregory Haskins | e7693a3 | 2008-01-25 21:08:09 +0100 | [diff] [blame] | 3184 |  | 
| Peter Zijlstra | aaee120 | 2009-09-10 13:36:25 +0200 | [diff] [blame] | 3185 | do { | 
|  | 3186 | unsigned long load, avg_load; | 
|  | 3187 | int local_group; | 
|  | 3188 | int i; | 
| Gregory Haskins | e7693a3 | 2008-01-25 21:08:09 +0100 | [diff] [blame] | 3189 |  | 
| Peter Zijlstra | aaee120 | 2009-09-10 13:36:25 +0200 | [diff] [blame] | 3190 | /* Skip over this group if it has no CPUs allowed */ | 
|  | 3191 | if (!cpumask_intersects(sched_group_cpus(group), | 
| Peter Zijlstra | fa17b50 | 2011-06-16 12:23:22 +0200 | [diff] [blame] | 3192 | tsk_cpus_allowed(p))) | 
| Peter Zijlstra | aaee120 | 2009-09-10 13:36:25 +0200 | [diff] [blame] | 3193 | continue; | 
|  | 3194 |  | 
|  | 3195 | local_group = cpumask_test_cpu(this_cpu, | 
|  | 3196 | sched_group_cpus(group)); | 
|  | 3197 |  | 
|  | 3198 | /* Tally up the load of all CPUs in the group */ | 
|  | 3199 | avg_load = 0; | 
|  | 3200 |  | 
|  | 3201 | for_each_cpu(i, sched_group_cpus(group)) { | 
|  | 3202 | /* Bias balancing toward cpus of our domain */ | 
|  | 3203 | if (local_group) | 
|  | 3204 | load = source_load(i, load_idx); | 
|  | 3205 | else | 
|  | 3206 | load = target_load(i, load_idx); | 
|  | 3207 |  | 
|  | 3208 | avg_load += load; | 
|  | 3209 | } | 
|  | 3210 |  | 
|  | 3211 | /* Adjust by relative CPU power of the group */ | 
| Peter Zijlstra | 9c3f75c | 2011-07-14 13:00:06 +0200 | [diff] [blame] | 3212 | avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power; | 
| Peter Zijlstra | aaee120 | 2009-09-10 13:36:25 +0200 | [diff] [blame] | 3213 |  | 
|  | 3214 | if (local_group) { | 
|  | 3215 | this_load = avg_load; | 
| Peter Zijlstra | aaee120 | 2009-09-10 13:36:25 +0200 | [diff] [blame] | 3216 | } else if (avg_load < min_load) { | 
|  | 3217 | min_load = avg_load; | 
|  | 3218 | idlest = group; | 
|  | 3219 | } | 
|  | 3220 | } while (group = group->next, group != sd->groups); | 
|  | 3221 |  | 
|  | 3222 | if (!idlest || 100*this_load < imbalance*min_load) | 
|  | 3223 | return NULL; | 
|  | 3224 | return idlest; | 
|  | 3225 | } | 
|  | 3226 |  | 
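/*
 * Sketch of the two scalings in find_idlest_group() above: group load is
 * normalized by group power so differently sized groups compare fairly,
 * and the local group is kept unless a remote group beats it by more
 * than half the domain's imbalance_pct.  SCHED_POWER_SCALE is assumed to
 * be 1024 here.
 */
static unsigned long sketch_group_avg_load(unsigned long sum_load,
					   unsigned long group_power)
{
	return sum_load * 1024 / group_power;	/* power-normalized load */
}

static int sketch_use_remote_group(unsigned long this_load,
				   unsigned long min_load,
				   int imbalance_pct)
{
	unsigned long imbalance = 100 + (imbalance_pct - 100) / 2;

	/* mirrors: return NULL when 100*this_load < imbalance*min_load */
	return 100 * this_load >= imbalance * min_load;
}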
|  | 3227 | /* | 
|  | 3228 | * find_idlest_cpu - find the idlest cpu among the cpus in group. | 
|  | 3229 | */ | 
|  | 3230 | static int | 
|  | 3231 | find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) | 
|  | 3232 | { | 
|  | 3233 | unsigned long load, min_load = ULONG_MAX; | 
|  | 3234 | int idlest = -1; | 
|  | 3235 | int i; | 
|  | 3236 |  | 
|  | 3237 | /* Traverse only the allowed CPUs */ | 
| Peter Zijlstra | fa17b50 | 2011-06-16 12:23:22 +0200 | [diff] [blame] | 3238 | for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) { | 
| Peter Zijlstra | aaee120 | 2009-09-10 13:36:25 +0200 | [diff] [blame] | 3239 | load = weighted_cpuload(i); | 
|  | 3240 |  | 
|  | 3241 | if (load < min_load || (load == min_load && i == this_cpu)) { | 
|  | 3242 | min_load = load; | 
|  | 3243 | idlest = i; | 
| Gregory Haskins | e7693a3 | 2008-01-25 21:08:09 +0100 | [diff] [blame] | 3244 | } | 
|  | 3245 | } | 
|  | 3246 |  | 
| Peter Zijlstra | aaee120 | 2009-09-10 13:36:25 +0200 | [diff] [blame] | 3247 | return idlest; | 
|  | 3248 | } | 
| Gregory Haskins | e7693a3 | 2008-01-25 21:08:09 +0100 | [diff] [blame] | 3249 |  | 
| Peter Zijlstra | aaee120 | 2009-09-10 13:36:25 +0200 | [diff] [blame] | 3250 | /* | 
| Peter Zijlstra | a50bde5 | 2009-11-12 15:55:28 +0100 | [diff] [blame] | 3251 | * Try and locate an idle CPU in the sched_domain. | 
|  | 3252 | */ | 
| Suresh Siddha | 99bd5e2 | 2010-03-31 16:47:45 -0700 | [diff] [blame] | 3253 | static int select_idle_sibling(struct task_struct *p, int target) | 
| Peter Zijlstra | a50bde5 | 2009-11-12 15:55:28 +0100 | [diff] [blame] | 3254 | { | 
| Suresh Siddha | 99bd5e2 | 2010-03-31 16:47:45 -0700 | [diff] [blame] | 3255 | struct sched_domain *sd; | 
| Linus Torvalds | 37407ea | 2012-09-16 12:29:43 -0700 | [diff] [blame] | 3256 | struct sched_group *sg; | 
| Mike Galbraith | e0a79f5 | 2013-01-28 12:19:25 +0100 | [diff] [blame] | 3257 | int i = task_cpu(p); | 
|  | 3258 |  | 
|  | 3259 | if (idle_cpu(target)) | 
|  | 3260 | return target; | 
| Peter Zijlstra | a50bde5 | 2009-11-12 15:55:28 +0100 | [diff] [blame] | 3261 |  | 
|  | 3262 | /* | 
| Mike Galbraith | e0a79f5 | 2013-01-28 12:19:25 +0100 | [diff] [blame] | 3263 | * If the previous cpu is cache affine and idle, don't be stupid. | 
| Peter Zijlstra | a50bde5 | 2009-11-12 15:55:28 +0100 | [diff] [blame] | 3264 | */ | 
| Mike Galbraith | e0a79f5 | 2013-01-28 12:19:25 +0100 | [diff] [blame] | 3265 | if (i != target && cpus_share_cache(i, target) && idle_cpu(i)) | 
|  | 3266 | return i; | 
| Peter Zijlstra | a50bde5 | 2009-11-12 15:55:28 +0100 | [diff] [blame] | 3267 |  | 
|  | 3268 | /* | 
| Linus Torvalds | 37407ea | 2012-09-16 12:29:43 -0700 | [diff] [blame] | 3269 | * Otherwise, iterate the domains and find an eligible idle cpu. | 
| Peter Zijlstra | a50bde5 | 2009-11-12 15:55:28 +0100 | [diff] [blame] | 3270 | */ | 
| Peter Zijlstra | 518cd62 | 2011-12-07 15:07:31 +0100 | [diff] [blame] | 3271 | sd = rcu_dereference(per_cpu(sd_llc, target)); | 
| Suresh Siddha | 77e8136 | 2011-11-17 11:08:23 -0800 | [diff] [blame] | 3272 | for_each_lower_domain(sd) { | 
| Linus Torvalds | 37407ea | 2012-09-16 12:29:43 -0700 | [diff] [blame] | 3273 | sg = sd->groups; | 
|  | 3274 | do { | 
|  | 3275 | if (!cpumask_intersects(sched_group_cpus(sg), | 
|  | 3276 | tsk_cpus_allowed(p))) | 
|  | 3277 | goto next; | 
| Mike Galbraith | 970e178 | 2012-06-12 05:18:32 +0200 | [diff] [blame] | 3278 |  | 
| Linus Torvalds | 37407ea | 2012-09-16 12:29:43 -0700 | [diff] [blame] | 3279 | for_each_cpu(i, sched_group_cpus(sg)) { | 
| Mike Galbraith | e0a79f5 | 2013-01-28 12:19:25 +0100 | [diff] [blame] | 3280 | if (i == target || !idle_cpu(i)) | 
| Linus Torvalds | 37407ea | 2012-09-16 12:29:43 -0700 | [diff] [blame] | 3281 | goto next; | 
|  | 3282 | } | 
|  | 3283 |  | 
|  | 3284 | target = cpumask_first_and(sched_group_cpus(sg), | 
|  | 3285 | tsk_cpus_allowed(p)); | 
|  | 3286 | goto done; | 
|  | 3287 | next: | 
|  | 3288 | sg = sg->next; | 
|  | 3289 | } while (sg != sd->groups); | 
|  | 3290 | } | 
|  | 3291 | done: | 
| Peter Zijlstra | a50bde5 | 2009-11-12 15:55:28 +0100 | [diff] [blame] | 3292 | return target; | 
|  | 3293 | } | 
|  | 3294 |  | 
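/*
 * Sketch of the scan in select_idle_sibling() above: within the LLC
 * domain, a group is accepted only if every cpu in it is idle, so the
 * woken task lands on a completely idle core instead of the busy sibling
 * of a running cpu.  The flat arrays are hypothetical stand-ins for
 * sched_group cpumasks and idle_cpu().
 */
static int sketch_find_idle_group(const int *cpus, int ngroups, int grp_size,
				  const int *cpu_is_idle)
{
	for (int g = 0; g < ngroups; g++) {
		int all_idle = 1;

		for (int c = 0; c < grp_size; c++) {
			if (!cpu_is_idle[cpus[g * grp_size + c]])
				all_idle = 0;
		}
		if (all_idle)
			return cpus[g * grp_size];	/* first cpu of group */
	}
	return -1;	/* fall back to the original target */
}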
|  | 3295 | /* | 
| Peter Zijlstra | aaee120 | 2009-09-10 13:36:25 +0200 | [diff] [blame] | 3296 | * select_task_rq_fair: balance the current task (running on cpu) in domains | 
|  | 3297 | * that have the relevant sd_flag set. In practice, this is SD_BALANCE_FORK and | 
|  | 3298 | * SD_BALANCE_EXEC. | 
|  | 3299 | * | 
|  | 3300 | * Balance, ie. select the least loaded group. | 
|  | 3301 | * | 
|  | 3302 | * Returns the target CPU number, or the same CPU if no balancing is needed. | 
|  | 3303 | * | 
|  | 3304 | * preempt must be disabled. | 
|  | 3305 | */ | 
| Peter Zijlstra | 0017d73 | 2010-03-24 18:34:10 +0100 | [diff] [blame] | 3306 | static int | 
| Peter Zijlstra | 7608dec | 2011-04-05 17:23:46 +0200 | [diff] [blame] | 3307 | select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) | 
| Peter Zijlstra | aaee120 | 2009-09-10 13:36:25 +0200 | [diff] [blame] | 3308 | { | 
| Peter Zijlstra | 29cd8ba | 2009-09-17 09:01:14 +0200 | [diff] [blame] | 3309 | struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; | 
| Peter Zijlstra | c88d591 | 2009-09-10 13:50:02 +0200 | [diff] [blame] | 3310 | int cpu = smp_processor_id(); | 
|  | 3311 | int prev_cpu = task_cpu(p); | 
|  | 3312 | int new_cpu = cpu; | 
| Suresh Siddha | 99bd5e2 | 2010-03-31 16:47:45 -0700 | [diff] [blame] | 3313 | int want_affine = 0; | 
| Peter Zijlstra | 5158f4e | 2009-09-16 13:46:59 +0200 | [diff] [blame] | 3314 | int sync = wake_flags & WF_SYNC; | 
| Gregory Haskins | e7693a3 | 2008-01-25 21:08:09 +0100 | [diff] [blame] | 3315 |  | 
| Peter Zijlstra | 29baa74 | 2012-04-23 12:11:21 +0200 | [diff] [blame] | 3316 | if (p->nr_cpus_allowed == 1) | 
| Mike Galbraith | 76854c7 | 2011-11-22 15:18:24 +0100 | [diff] [blame] | 3317 | return prev_cpu; | 
|  | 3318 |  | 
| Peter Zijlstra | 0763a66 | 2009-09-14 19:37:39 +0200 | [diff] [blame] | 3319 | if (sd_flag & SD_BALANCE_WAKE) { | 
| Peter Zijlstra | fa17b50 | 2011-06-16 12:23:22 +0200 | [diff] [blame] | 3320 | if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) | 
| Peter Zijlstra | c88d591 | 2009-09-10 13:50:02 +0200 | [diff] [blame] | 3321 | want_affine = 1; | 
|  | 3322 | new_cpu = prev_cpu; | 
|  | 3323 | } | 
| Gregory Haskins | e7693a3 | 2008-01-25 21:08:09 +0100 | [diff] [blame] | 3324 |  | 
| Peter Zijlstra | dce840a | 2011-04-07 14:09:50 +0200 | [diff] [blame] | 3325 | rcu_read_lock(); | 
| Peter Zijlstra | aaee120 | 2009-09-10 13:36:25 +0200 | [diff] [blame] | 3326 | for_each_domain(cpu, tmp) { | 
| Peter Zijlstra | e4f4288 | 2009-12-16 18:04:34 +0100 | [diff] [blame] | 3327 | if (!(tmp->flags & SD_LOAD_BALANCE)) | 
|  | 3328 | continue; | 
|  | 3329 |  | 
| Peter Zijlstra | aaee120 | 2009-09-10 13:36:25 +0200 | [diff] [blame] | 3330 | /* | 
| Suresh Siddha | 99bd5e2 | 2010-03-31 16:47:45 -0700 | [diff] [blame] | 3331 | * If both cpu and prev_cpu are part of this domain, | 
|  | 3332 | * cpu is a valid SD_WAKE_AFFINE target. | 
| Peter Zijlstra | fe3bcfe | 2009-11-12 15:55:29 +0100 | [diff] [blame] | 3333 | */ | 
| Suresh Siddha | 99bd5e2 | 2010-03-31 16:47:45 -0700 | [diff] [blame] | 3334 | if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && | 
|  | 3335 | cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) { | 
|  | 3336 | affine_sd = tmp; | 
| Alex Shi | f03542a | 2012-07-26 08:55:34 +0800 | [diff] [blame] | 3337 | break; | 
| Peter Zijlstra | c88d591 | 2009-09-10 13:50:02 +0200 | [diff] [blame] | 3338 | } | 
|  | 3339 |  | 
| Alex Shi | f03542a | 2012-07-26 08:55:34 +0800 | [diff] [blame] | 3340 | if (tmp->flags & sd_flag) | 
| Peter Zijlstra | 29cd8ba | 2009-09-17 09:01:14 +0200 | [diff] [blame] | 3341 | sd = tmp; | 
| Peter Zijlstra | c88d591 | 2009-09-10 13:50:02 +0200 | [diff] [blame] | 3342 | } | 
| Peter Zijlstra | aaee120 | 2009-09-10 13:36:25 +0200 | [diff] [blame] | 3343 |  | 
| Mike Galbraith | 8b911ac | 2010-03-11 17:17:16 +0100 | [diff] [blame] | 3344 | if (affine_sd) { | 
| Alex Shi | f03542a | 2012-07-26 08:55:34 +0800 | [diff] [blame] | 3345 | if (cpu != prev_cpu && wake_affine(affine_sd, p, sync)) | 
| Peter Zijlstra | dce840a | 2011-04-07 14:09:50 +0200 | [diff] [blame] | 3346 | prev_cpu = cpu; | 
|  | 3347 |  | 
|  | 3348 | new_cpu = select_idle_sibling(p, prev_cpu); | 
|  | 3349 | goto unlock; | 
| Mike Galbraith | 8b911ac | 2010-03-11 17:17:16 +0100 | [diff] [blame] | 3350 | } | 
| Peter Zijlstra | 3b64089 | 2009-09-16 13:44:33 +0200 | [diff] [blame] | 3351 |  | 
| Peter Zijlstra | aaee120 | 2009-09-10 13:36:25 +0200 | [diff] [blame] | 3352 | while (sd) { | 
| Peter Zijlstra | 5158f4e | 2009-09-16 13:46:59 +0200 | [diff] [blame] | 3353 | int load_idx = sd->forkexec_idx; | 
| Peter Zijlstra | aaee120 | 2009-09-10 13:36:25 +0200 | [diff] [blame] | 3354 | struct sched_group *group; | 
| Peter Zijlstra | c88d591 | 2009-09-10 13:50:02 +0200 | [diff] [blame] | 3355 | int weight; | 
| Peter Zijlstra | aaee120 | 2009-09-10 13:36:25 +0200 | [diff] [blame] | 3356 |  | 
| Peter Zijlstra | 0763a66 | 2009-09-14 19:37:39 +0200 | [diff] [blame] | 3357 | if (!(sd->flags & sd_flag)) { | 
| Peter Zijlstra | aaee120 | 2009-09-10 13:36:25 +0200 | [diff] [blame] | 3358 | sd = sd->child; | 
|  | 3359 | continue; | 
|  | 3360 | } | 
|  | 3361 |  | 
| Peter Zijlstra | 5158f4e | 2009-09-16 13:46:59 +0200 | [diff] [blame] | 3362 | if (sd_flag & SD_BALANCE_WAKE) | 
|  | 3363 | load_idx = sd->wake_idx; | 
|  | 3364 |  | 
|  | 3365 | group = find_idlest_group(sd, p, cpu, load_idx); | 
| Peter Zijlstra | aaee120 | 2009-09-10 13:36:25 +0200 | [diff] [blame] | 3366 | if (!group) { | 
|  | 3367 | sd = sd->child; | 
|  | 3368 | continue; | 
|  | 3369 | } | 
|  | 3370 |  | 
| Peter Zijlstra | d7c33c4 | 2009-09-11 12:45:38 +0200 | [diff] [blame] | 3371 | new_cpu = find_idlest_cpu(group, p, cpu); | 
| Peter Zijlstra | aaee120 | 2009-09-10 13:36:25 +0200 | [diff] [blame] | 3372 | if (new_cpu == -1 || new_cpu == cpu) { | 
|  | 3373 | /* Now try balancing at a lower domain level of cpu */ | 
|  | 3374 | sd = sd->child; | 
|  | 3375 | continue; | 
|  | 3376 | } | 
|  | 3377 |  | 
|  | 3378 | /* Now try balancing at a lower domain level of new_cpu */ | 
|  | 3379 | cpu = new_cpu; | 
| Peter Zijlstra | 669c55e | 2010-04-16 14:59:29 +0200 | [diff] [blame] | 3380 | weight = sd->span_weight; | 
| Peter Zijlstra | aaee120 | 2009-09-10 13:36:25 +0200 | [diff] [blame] | 3381 | sd = NULL; | 
|  | 3382 | for_each_domain(cpu, tmp) { | 
| Peter Zijlstra | 669c55e | 2010-04-16 14:59:29 +0200 | [diff] [blame] | 3383 | if (weight <= tmp->span_weight) | 
| Peter Zijlstra | aaee120 | 2009-09-10 13:36:25 +0200 | [diff] [blame] | 3384 | break; | 
| Peter Zijlstra | 0763a66 | 2009-09-14 19:37:39 +0200 | [diff] [blame] | 3385 | if (tmp->flags & sd_flag) | 
| Peter Zijlstra | aaee120 | 2009-09-10 13:36:25 +0200 | [diff] [blame] | 3386 | sd = tmp; | 
|  | 3387 | } | 
|  | 3388 | /* while loop will break here if sd == NULL */ | 
| Gregory Haskins | e7693a3 | 2008-01-25 21:08:09 +0100 | [diff] [blame] | 3389 | } | 
| Peter Zijlstra | dce840a | 2011-04-07 14:09:50 +0200 | [diff] [blame] | 3390 | unlock: | 
|  | 3391 | rcu_read_unlock(); | 
| Gregory Haskins | e7693a3 | 2008-01-25 21:08:09 +0100 | [diff] [blame] | 3392 |  | 
| Peter Zijlstra | c88d591 | 2009-09-10 13:50:02 +0200 | [diff] [blame] | 3393 | return new_cpu; | 
| Gregory Haskins | e7693a3 | 2008-01-25 21:08:09 +0100 | [diff] [blame] | 3394 | } | 
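/*
 * Sketch of the descent in select_task_rq_fair() above: starting from
 * the widest domain with the balance flag set, pick the idlest group and
 * cpu, then continue one level lower around the chosen cpu until no
 * domain qualifies.  find_idlest() is a hypothetical per-level helper
 * standing in for find_idlest_group()/find_idlest_cpu().
 */
static int sketch_balance_descend(int cpu, int top_level,
				  int (*find_idlest)(int cpu, int level))
{
	for (int level = top_level; level >= 0; level--) {
		int new_cpu = find_idlest(cpu, level);

		if (new_cpu >= 0 && new_cpu != cpu)
			cpu = new_cpu;	/* re-center the search on new_cpu */
	}
	return cpu;
}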
| Paul Turner | 0a74bef | 2012-10-04 13:18:30 +0200 | [diff] [blame] | 3395 |  | 
|  | 3396 | /* | 
| Paul Turner | f4e26b1 | 2012-10-04 13:18:32 +0200 | [diff] [blame] | 3397 | * Load-tracking depends only on SMP; the FAIR_GROUP_SCHED dependency below may | 
|  | 3398 | * be removed once load-tracking is useful beyond shares distribution (e.g. | 
|  | 3399 | * load-balance). | 
|  | 3400 | */ | 
|  | 3401 | #ifdef CONFIG_FAIR_GROUP_SCHED | 
|  | 3402 | /* | 
| Paul Turner | 0a74bef | 2012-10-04 13:18:30 +0200 | [diff] [blame] | 3403 | * Called immediately before a task is migrated to a new cpu; task_cpu(p) and | 
|  | 3404 | * cfs_rq_of(p) references at time of call are still valid and identify the | 
|  | 3405 | * previous cpu.  However, the caller only guarantees p->pi_lock is held; no | 
|  | 3406 | * other assumptions, including the state of rq->lock, should be made. | 
|  | 3407 | */ | 
|  | 3408 | static void | 
|  | 3409 | migrate_task_rq_fair(struct task_struct *p, int next_cpu) | 
|  | 3410 | { | 
| Paul Turner | aff3e49 | 2012-10-04 13:18:30 +0200 | [diff] [blame] | 3411 | struct sched_entity *se = &p->se; | 
|  | 3412 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 
|  | 3413 |  | 
|  | 3414 | /* | 
|  | 3415 | * Load tracking: accumulate removed load so that it can be processed | 
|  | 3416 | * when we next update owning cfs_rq under rq->lock.  Tasks contribute | 
|  | 3417 | * to blocked load iff they have a positive decay-count.  It can never | 
|  | 3418 | * be negative here since on-rq tasks have decay-count == 0. | 
|  | 3419 | */ | 
|  | 3420 | if (se->avg.decay_count) { | 
|  | 3421 | se->avg.decay_count = -__synchronize_entity_decay(se); | 
|  | 3422 | atomic64_add(se->avg.load_avg_contrib, &cfs_rq->removed_load); | 
|  | 3423 | } | 
| Paul Turner | 0a74bef | 2012-10-04 13:18:30 +0200 | [diff] [blame] | 3424 | } | 
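/*
 * Sketch of the removed-load handoff above: the migrating side adds the
 * task's decayed contribution to a lockless accumulator (atomic64_add),
 * and the next holder of the old rq->lock drains it into blocked load.
 * C11 atomics and the field name stand in for the kernel primitives.
 */
#include <stdatomic.h>

struct sketch_cfs_rq {
	_Atomic long long removed_load;
};

static void sketch_migrate_out(struct sketch_cfs_rq *old_cfs, long long contrib)
{
	/* producer: may run without holding the old runqueue's lock */
	atomic_fetch_add(&old_cfs->removed_load, contrib);
}

static long long sketch_drain_removed(struct sketch_cfs_rq *cfs)
{
	/* consumer under rq->lock: take the whole batch and reset it */
	return atomic_exchange(&cfs->removed_load, 0);
}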
| Paul Turner | f4e26b1 | 2012-10-04 13:18:32 +0200 | [diff] [blame] | 3425 | #endif | 
| Gregory Haskins | e7693a3 | 2008-01-25 21:08:09 +0100 | [diff] [blame] | 3426 | #endif /* CONFIG_SMP */ | 
|  | 3427 |  | 
| Peter Zijlstra | e52fb7c | 2009-01-14 12:39:19 +0100 | [diff] [blame] | 3428 | static unsigned long | 
|  | 3429 | wakeup_gran(struct sched_entity *curr, struct sched_entity *se) | 
| Peter Zijlstra | 0bbd333 | 2008-04-19 19:44:57 +0200 | [diff] [blame] | 3430 | { | 
|  | 3431 | unsigned long gran = sysctl_sched_wakeup_granularity; | 
|  | 3432 |  | 
|  | 3433 | /* | 
| Peter Zijlstra | e52fb7c | 2009-01-14 12:39:19 +0100 | [diff] [blame] | 3434 | * Since it is curr that is running now, convert the gran from real-time | 
|  | 3435 | * to virtual-time in its units. | 
| Mike Galbraith | 13814d4 | 2010-03-11 17:17:04 +0100 | [diff] [blame] | 3436 | * | 
|  | 3437 | * By using 'se' instead of 'curr' we penalize light tasks, so | 
|  | 3438 | * they get preempted easier. That is, if 'se' < 'curr' then | 
|  | 3439 | * the resulting gran will be larger, therefore penalizing the | 
|  | 3440 | * lighter, if otoh 'se' > 'curr' then the resulting gran will | 
|  | 3441 | * be smaller, again penalizing the lighter task. | 
|  | 3442 | * | 
|  | 3443 | * This is especially important for buddies when the leftmost | 
|  | 3444 | * task is higher priority than the buddy. | 
| Peter Zijlstra | 0bbd333 | 2008-04-19 19:44:57 +0200 | [diff] [blame] | 3445 | */ | 
| Shaohua Li | f4ad9bd | 2011-04-08 12:53:09 +0800 | [diff] [blame] | 3446 | return calc_delta_fair(gran, se); | 
| Peter Zijlstra | 0bbd333 | 2008-04-19 19:44:57 +0200 | [diff] [blame] | 3447 | } | 
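/*
 * Sketch of the calc_delta_fair() scaling used above: the wall-clock
 * granularity is multiplied by NICE_0_LOAD (assumed 1024 here) and
 * divided by the entity's weight, so a heavy 'se' sees a small virtual
 * granularity (preempts sooner) and a light 'se' a large one (preempts
 * later).
 */
static unsigned long sketch_calc_delta_fair(unsigned long gran_ns,
					    unsigned long se_weight)
{
	return gran_ns * 1024 / se_weight;	/* real-time -> vruntime units */
}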
|  | 3448 |  | 
|  | 3449 | /* | 
| Peter Zijlstra | 464b752 | 2008-10-24 11:06:15 +0200 | [diff] [blame] | 3450 | * Should 'se' preempt 'curr'. | 
|  | 3451 | * | 
|  | 3452 | *             |s1 | 
|  | 3453 | *        |s2 | 
|  | 3454 | *   |s3 | 
|  | 3455 | *         g | 
|  | 3456 | *      |<--->|c | 
|  | 3457 | * | 
|  | 3458 | *  w(c, s1) = -1 | 
|  | 3459 | *  w(c, s2) =  0 | 
|  | 3460 | *  w(c, s3) =  1 | 
|  | 3461 | * | 
|  | 3462 | */ | 
|  | 3463 | static int | 
|  | 3464 | wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) | 
|  | 3465 | { | 
|  | 3466 | s64 gran, vdiff = curr->vruntime - se->vruntime; | 
|  | 3467 |  | 
|  | 3468 | if (vdiff <= 0) | 
|  | 3469 | return -1; | 
|  | 3470 |  | 
| Peter Zijlstra | e52fb7c | 2009-01-14 12:39:19 +0100 | [diff] [blame] | 3471 | gran = wakeup_gran(curr, se); | 
| Peter Zijlstra | 464b752 | 2008-10-24 11:06:15 +0200 | [diff] [blame] | 3472 | if (vdiff > gran) | 
|  | 3473 | return 1; | 
|  | 3474 |  | 
|  | 3475 | return 0; | 
|  | 3476 | } | 
|  | 3477 |  | 
| Peter Zijlstra | 0247909 | 2008-11-04 21:25:10 +0100 | [diff] [blame] | 3478 | static void set_last_buddy(struct sched_entity *se) | 
|  | 3479 | { | 
| Venkatesh Pallipadi | 69c80f3 | 2011-04-13 18:21:09 -0700 | [diff] [blame] | 3480 | if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE)) | 
|  | 3481 | return; | 
|  | 3482 |  | 
|  | 3483 | for_each_sched_entity(se) | 
|  | 3484 | cfs_rq_of(se)->last = se; | 
| Peter Zijlstra | 0247909 | 2008-11-04 21:25:10 +0100 | [diff] [blame] | 3485 | } | 
|  | 3486 |  | 
|  | 3487 | static void set_next_buddy(struct sched_entity *se) | 
|  | 3488 | { | 
| Venkatesh Pallipadi | 69c80f3 | 2011-04-13 18:21:09 -0700 | [diff] [blame] | 3489 | if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE)) | 
|  | 3490 | return; | 
|  | 3491 |  | 
|  | 3492 | for_each_sched_entity(se) | 
|  | 3493 | cfs_rq_of(se)->next = se; | 
| Peter Zijlstra | 0247909 | 2008-11-04 21:25:10 +0100 | [diff] [blame] | 3494 | } | 
|  | 3495 |  | 
| Rik van Riel | ac53db5 | 2011-02-01 09:51:03 -0500 | [diff] [blame] | 3496 | static void set_skip_buddy(struct sched_entity *se) | 
|  | 3497 | { | 
| Venkatesh Pallipadi | 69c80f3 | 2011-04-13 18:21:09 -0700 | [diff] [blame] | 3498 | for_each_sched_entity(se) | 
|  | 3499 | cfs_rq_of(se)->skip = se; | 
| Rik van Riel | ac53db5 | 2011-02-01 09:51:03 -0500 | [diff] [blame] | 3500 | } | 
|  | 3501 |  | 
| Peter Zijlstra | 464b752 | 2008-10-24 11:06:15 +0200 | [diff] [blame] | 3502 | /* | 
| Ingo Molnar | bf0f6f2 | 2007-07-09 18:51:58 +0200 | [diff] [blame] | 3503 | * Preempt the current task with a newly woken task if needed: | 
|  | 3504 | */ | 
| Peter Zijlstra | 5a9b86f | 2009-09-16 13:47:58 +0200 | [diff] [blame] | 3505 | static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) | 
| Ingo Molnar | bf0f6f2 | 2007-07-09 18:51:58 +0200 | [diff] [blame] | 3506 | { | 
|  | 3507 | struct task_struct *curr = rq->curr; | 
| Srivatsa Vaddagiri | 8651a86 | 2007-10-15 17:00:12 +0200 | [diff] [blame] | 3508 | struct sched_entity *se = &curr->se, *pse = &p->se; | 
| Mike Galbraith | 03e89e4 | 2008-12-16 08:45:30 +0100 | [diff] [blame] | 3509 | struct cfs_rq *cfs_rq = task_cfs_rq(curr); | 
| Mike Galbraith | f685cea | 2009-10-23 23:09:22 +0200 | [diff] [blame] | 3510 | int scale = cfs_rq->nr_running >= sched_nr_latency; | 
| Venkatesh Pallipadi | 2f36825 | 2011-04-14 10:30:53 -0700 | [diff] [blame] | 3511 | int next_buddy_marked = 0; | 
| Mike Galbraith | 03e89e4 | 2008-12-16 08:45:30 +0100 | [diff] [blame] | 3512 |  | 
| Ingo Molnar | 4ae7d5c | 2008-03-19 01:42:00 +0100 | [diff] [blame] | 3513 | if (unlikely(se == pse)) | 
|  | 3514 | return; | 
|  | 3515 |  | 
| Paul Turner | 5238cdd | 2011-07-21 09:43:37 -0700 | [diff] [blame] | 3516 | /* | 
| Peter Zijlstra | ddcdf6e | 2012-02-22 19:27:40 +0100 | [diff] [blame] | 3517 | * This is possible from callers such as move_task(), in which we | 
| Paul Turner | 5238cdd | 2011-07-21 09:43:37 -0700 | [diff] [blame] | 3518 | * unconditionally check_preempt_curr() after an enqueue (which may have | 
|  | 3519 | * led to a throttle).  This both saves work and prevents false | 
|  | 3520 | * next-buddy nomination below. | 
|  | 3521 | */ | 
|  | 3522 | if (unlikely(throttled_hierarchy(cfs_rq_of(pse)))) | 
|  | 3523 | return; | 
|  | 3524 |  | 
| Venkatesh Pallipadi | 2f36825 | 2011-04-14 10:30:53 -0700 | [diff] [blame] | 3525 | if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) { | 
| Mike Galbraith | 3cb63d5 | 2009-09-11 12:01:17 +0200 | [diff] [blame] | 3526 | set_next_buddy(pse); | 
| Venkatesh Pallipadi | 2f36825 | 2011-04-14 10:30:53 -0700 | [diff] [blame] | 3527 | next_buddy_marked = 1; | 
|  | 3528 | } | 
| Peter Zijlstra | 57fdc26 | 2008-09-23 15:33:45 +0200 | [diff] [blame] | 3529 |  | 
| Bharata B Rao | aec0a51 | 2008-08-28 14:42:49 +0530 | [diff] [blame] | 3530 | /* | 
|  | 3531 | * We can come here with TIF_NEED_RESCHED already set from new task | 
|  | 3532 | * wake up path. | 
| Paul Turner | 5238cdd | 2011-07-21 09:43:37 -0700 | [diff] [blame] | 3533 | * | 
|  | 3534 | * Note: this also catches the edge-case of curr being in a throttled | 
|  | 3535 | * group (e.g. via set_curr_task), since update_curr() (in the | 
|  | 3536 | * enqueue of curr) will have resulted in resched being set.  This | 
|  | 3537 | * prevents us from potentially nominating it as a false LAST_BUDDY | 
|  | 3538 | * below. | 
| Bharata B Rao | aec0a51 | 2008-08-28 14:42:49 +0530 | [diff] [blame] | 3539 | */ | 
|  | 3540 | if (test_tsk_need_resched(curr)) | 
|  | 3541 | return; | 
|  | 3542 |  | 
| Darren Hart | a2f5c9a | 2011-02-22 13:04:33 -0800 | [diff] [blame] | 3543 | /* Idle tasks are by definition preempted by non-idle tasks. */ | 
|  | 3544 | if (unlikely(curr->policy == SCHED_IDLE) && | 
|  | 3545 | likely(p->policy != SCHED_IDLE)) | 
|  | 3546 | goto preempt; | 
|  | 3547 |  | 
| Ingo Molnar | 91c234b | 2007-10-15 17:00:18 +0200 | [diff] [blame] | 3548 | /* | 
| Darren Hart | a2f5c9a | 2011-02-22 13:04:33 -0800 | [diff] [blame] | 3549 | * Batch and idle tasks do not preempt non-idle tasks (their preemption | 
|  | 3550 | * is driven by the tick): | 
| Ingo Molnar | 91c234b | 2007-10-15 17:00:18 +0200 | [diff] [blame] | 3551 | */ | 
| Ingo Molnar | 8ed92e5 | 2012-10-14 14:28:50 +0200 | [diff] [blame] | 3552 | if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION)) | 
| Ingo Molnar | 91c234b | 2007-10-15 17:00:18 +0200 | [diff] [blame] | 3553 | return; | 
| Ingo Molnar | bf0f6f2 | 2007-07-09 18:51:58 +0200 | [diff] [blame] | 3554 |  | 
| Peter Zijlstra | 3a7e73a | 2009-11-28 18:51:02 +0100 | [diff] [blame] | 3555 | find_matching_se(&se, &pse); | 
| Paul Turner | 9bbd737 | 2011-07-05 19:07:21 -0700 | [diff] [blame] | 3556 | update_curr(cfs_rq_of(se)); | 
| Peter Zijlstra | 3a7e73a | 2009-11-28 18:51:02 +0100 | [diff] [blame] | 3557 | BUG_ON(!pse); | 
| Venkatesh Pallipadi | 2f36825 | 2011-04-14 10:30:53 -0700 | [diff] [blame] | 3558 | if (wakeup_preempt_entity(se, pse) == 1) { | 
|  | 3559 | /* | 
|  | 3560 | * Bias pick_next to pick the sched entity that is | 
|  | 3561 | * triggering this preemption. | 
|  | 3562 | */ | 
|  | 3563 | if (!next_buddy_marked) | 
|  | 3564 | set_next_buddy(pse); | 
| Peter Zijlstra | 3a7e73a | 2009-11-28 18:51:02 +0100 | [diff] [blame] | 3565 | goto preempt; | 
| Venkatesh Pallipadi | 2f36825 | 2011-04-14 10:30:53 -0700 | [diff] [blame] | 3566 | } | 
| Jupyung Lee | a65ac74 | 2009-11-17 18:51:40 +0900 | [diff] [blame] | 3567 |  | 
| Peter Zijlstra | 3a7e73a | 2009-11-28 18:51:02 +0100 | [diff] [blame] | 3568 | return; | 
|  | 3569 |  | 
|  | 3570 | preempt: | 
|  | 3571 | resched_task(curr); | 
|  | 3572 | /* | 
|  | 3573 | * Only set the backward buddy when the current task is still | 
|  | 3574 | * on the rq. This can happen when a wakeup gets interleaved | 
|  | 3575 | * with schedule on the ->pre_schedule() or idle_balance() | 
|  | 3576 | * point, either of which can drop the rq lock. | 
|  | 3577 | * | 
|  | 3578 | * Also, during early boot the idle thread is in the fair class, | 
|  | 3579 | * for obvious reasons it's a bad idea to schedule back to it. | 
|  | 3580 | */ | 
|  | 3581 | if (unlikely(!se->on_rq || curr == rq->idle)) | 
|  | 3582 | return; | 
|  | 3583 |  | 
|  | 3584 | if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se)) | 
|  | 3585 | set_last_buddy(se); | 
| Ingo Molnar | bf0f6f2 | 2007-07-09 18:51:58 +0200 | [diff] [blame] | 3586 | } | 
|  | 3587 |  | 
| Ingo Molnar | fb8d472 | 2007-08-09 11:16:48 +0200 | [diff] [blame] | 3588 | static struct task_struct *pick_next_task_fair(struct rq *rq) | 
| Ingo Molnar | bf0f6f2 | 2007-07-09 18:51:58 +0200 | [diff] [blame] | 3589 | { | 
| Peter Zijlstra | 8f4d37e | 2008-01-25 21:08:29 +0100 | [diff] [blame] | 3590 | struct task_struct *p; | 
| Ingo Molnar | bf0f6f2 | 2007-07-09 18:51:58 +0200 | [diff] [blame] | 3591 | struct cfs_rq *cfs_rq = &rq->cfs; | 
|  | 3592 | struct sched_entity *se; | 
|  | 3593 |  | 
| Tim Blechmann | 36ace27 | 2009-11-24 11:55:45 +0100 | [diff] [blame] | 3594 | if (!cfs_rq->nr_running) | 
| Ingo Molnar | bf0f6f2 | 2007-07-09 18:51:58 +0200 | [diff] [blame] | 3595 | return NULL; | 
|  | 3596 |  | 
|  | 3597 | do { | 
| Ingo Molnar | 9948f4b | 2007-08-09 11:16:48 +0200 | [diff] [blame] | 3598 | se = pick_next_entity(cfs_rq); | 
| Peter Zijlstra | f4b6755 | 2008-11-04 21:25:07 +0100 | [diff] [blame] | 3599 | set_next_entity(cfs_rq, se); | 
| Ingo Molnar | bf0f6f2 | 2007-07-09 18:51:58 +0200 | [diff] [blame] | 3600 | cfs_rq = group_cfs_rq(se); | 
|  | 3601 | } while (cfs_rq); | 
|  | 3602 |  | 
| Peter Zijlstra | 8f4d37e | 2008-01-25 21:08:29 +0100 | [diff] [blame] | 3603 | p = task_of(se); | 
| Mike Galbraith | b39e66e | 2011-11-22 15:20:07 +0100 | [diff] [blame] | 3604 | if (hrtick_enabled(rq)) | 
|  | 3605 | hrtick_start_fair(rq, p); | 
| Peter Zijlstra | 8f4d37e | 2008-01-25 21:08:29 +0100 | [diff] [blame] | 3606 |  | 
|  | 3607 | return p; | 
| Ingo Molnar | bf0f6f2 | 2007-07-09 18:51:58 +0200 | [diff] [blame] | 3608 | } | 
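/*
 * Sketch of the pick loop above: descend from the root cfs_rq, choosing
 * the best entity at each level; a group entity owns a child cfs_rq
 * (group_cfs_rq() != NULL) while a task entity terminates the walk.  The
 * types and the pick callback are hypothetical stand-ins.
 */
#include <stddef.h>

struct sketch_cfs;

struct sketch_se {
	struct sketch_cfs *my_q;	/* NULL for a task entity */
};

struct sketch_cfs {
	struct sketch_se *(*pick_next)(struct sketch_cfs *cfs);
};

static struct sketch_se *sketch_pick_task(struct sketch_cfs *cfs)
{
	struct sketch_se *se = NULL;

	while (cfs) {
		se = cfs->pick_next(cfs);	/* pick_next_entity() analogue */
		cfs = se->my_q;			/* descend into the group, if any */
	}
	return se;				/* always a task entity */
}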
|  | 3609 |  | 
|  | 3610 | /* | 
|  | 3611 | * Account for a descheduled task: | 
|  | 3612 | */ | 
| Ingo Molnar | 31ee529 | 2007-08-09 11:16:49 +0200 | [diff] [blame] | 3613 | static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) | 
| Ingo Molnar | bf0f6f2 | 2007-07-09 18:51:58 +0200 | [diff] [blame] | 3614 | { | 
|  | 3615 | struct sched_entity *se = &prev->se; | 
|  | 3616 | struct cfs_rq *cfs_rq; | 
|  | 3617 |  | 
|  | 3618 | for_each_sched_entity(se) { | 
|  | 3619 | cfs_rq = cfs_rq_of(se); | 
| Ingo Molnar | ab6cde2 | 2007-08-09 11:16:48 +0200 | [diff] [blame] | 3620 | put_prev_entity(cfs_rq, se); | 
| Ingo Molnar | bf0f6f2 | 2007-07-09 18:51:58 +0200 | [diff] [blame] | 3621 | } | 
|  | 3622 | } | 
|  | 3623 |  | 
| Rik van Riel | ac53db5 | 2011-02-01 09:51:03 -0500 | [diff] [blame] | 3624 | /* | 
|  | 3625 | * sched_yield() is very simple | 
|  | 3626 | * | 
|  | 3627 | * The magic of dealing with the ->skip buddy is in pick_next_entity. | 
|  | 3628 | */ | 
|  | 3629 | static void yield_task_fair(struct rq *rq) | 
|  | 3630 | { | 
|  | 3631 | struct task_struct *curr = rq->curr; | 
|  | 3632 | struct cfs_rq *cfs_rq = task_cfs_rq(curr); | 
|  | 3633 | struct sched_entity *se = &curr->se; | 
|  | 3634 |  | 
|  | 3635 | /* | 
|  | 3636 | * Are we the only task in the tree? | 
|  | 3637 | */ | 
|  | 3638 | if (unlikely(rq->nr_running == 1)) | 
|  | 3639 | return; | 
|  | 3640 |  | 
|  | 3641 | clear_buddies(cfs_rq, se); | 
|  | 3642 |  | 
|  | 3643 | if (curr->policy != SCHED_BATCH) { | 
|  | 3644 | update_rq_clock(rq); | 
|  | 3645 | /* | 
|  | 3646 | * Update run-time statistics of the 'current'. | 
|  | 3647 | */ | 
|  | 3648 | update_curr(cfs_rq); | 
| Mike Galbraith | 916671c | 2011-11-22 15:21:26 +0100 | [diff] [blame] | 3649 | /* | 
|  | 3650 | * Tell update_rq_clock() that we've just updated, | 
|  | 3651 | * so we don't do microscopic update in schedule() | 
|  | 3652 | * and double the fastpath cost. | 
|  | 3653 | */ | 
|  | 3654 | rq->skip_clock_update = 1; | 
| Rik van Riel | ac53db5 | 2011-02-01 09:51:03 -0500 | [diff] [blame] | 3655 | } | 
|  | 3656 |  | 
|  | 3657 | set_skip_buddy(se); | 
|  | 3658 | } | 
|  | 3659 |  | 
| Mike Galbraith | d95f412 | 2011-02-01 09:50:51 -0500 | [diff] [blame] | 3660 | static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt) | 
|  | 3661 | { | 
|  | 3662 | struct sched_entity *se = &p->se; | 
|  | 3663 |  | 
| Paul Turner | 5238cdd | 2011-07-21 09:43:37 -0700 | [diff] [blame] | 3664 | /* throttled hierarchies are not runnable */ | 
|  | 3665 | if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se))) | 
| Mike Galbraith | d95f412 | 2011-02-01 09:50:51 -0500 | [diff] [blame] | 3666 | return false; | 
|  | 3667 |  | 
|  | 3668 | /* Tell the scheduler that we'd really like pse to run next. */ | 
|  | 3669 | set_next_buddy(se); | 
|  | 3670 |  | 
| Mike Galbraith | d95f412 | 2011-02-01 09:50:51 -0500 | [diff] [blame] | 3671 | yield_task_fair(rq); | 
|  | 3672 |  | 
|  | 3673 | return true; | 
|  | 3674 | } | 
|  | 3675 |  | 
| Peter Williams | 681f3e6 | 2007-10-24 18:23:51 +0200 | [diff] [blame] | 3676 | #ifdef CONFIG_SMP | 
| Ingo Molnar | bf0f6f2 | 2007-07-09 18:51:58 +0200 | [diff] [blame] | 3677 | /************************************************** | 
| Peter Zijlstra | e9c84cb | 2012-07-03 13:53:26 +0200 | [diff] [blame] | 3678 | * Fair scheduling class load-balancing methods. | 
|  | 3679 | * | 
|  | 3680 | * BASICS | 
|  | 3681 | * | 
|  | 3682 | * The purpose of load-balancing is to achieve the same basic fairness the | 
|  | 3683 | * per-cpu scheduler provides, namely provide a proportional amount of compute | 
|  | 3684 | * time to each task. This is expressed in the following equation: | 
|  | 3685 | * | 
|  | 3686 | *   W_i,n/P_i == W_j,n/P_j for all i,j                               (1) | 
|  | 3687 | * | 
|  | 3688 | * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight | 
|  | 3689 | * W_i,0 is defined as: | 
|  | 3690 | * | 
|  | 3691 | *   W_i,0 = \Sum_j w_i,j                                             (2) | 
|  | 3692 | * | 
|  | 3693 | * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight | 
|  | 3694 | * is derived from the nice value as per prio_to_weight[]. | 
|  | 3695 | * | 
|  | 3696 | * The weight average is an exponential decay average of the instantaneous | 
|  | 3697 | * weight: | 
|  | 3698 | * | 
|  | 3699 | *   W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0               (3) | 
|  | 3700 | * | 
|  | 3701 | * P_i is the cpu power (or compute capacity) of cpu i, typically it is the | 
|  | 3702 | * fraction of 'recent' time available for SCHED_OTHER task execution. But it | 
|  | 3703 | * can also include other factors [XXX]. | 
|  | 3704 | * | 
|  | 3705 | * To achieve this balance we define a measure of imbalance which follows | 
|  | 3706 | * directly from (1): | 
|  | 3707 | * | 
|  | 3708 | *   imb_i,j = max{ avg(W/P), W_i/P_i } - min{ avg(W/P), W_j/P_j }    (4) | 
|  | 3709 | * | 
|  | 3710 | * We then move tasks around to minimize the imbalance. In the continuous | 
|  | 3711 | * function space it is obvious this converges, in the discrete case we get | 
|  | 3712 | * a few fun cases generally called infeasible weight scenarios. | 
|  | 3713 | * | 
|  | 3714 | * [XXX expand on: | 
|  | 3715 | *     - infeasible weights; | 
|  | 3716 | *     - local vs global optima in the discrete case. ] | 
|  | 3717 | * | 
|  | 3718 | * | 
|  | 3719 | * SCHED DOMAINS | 
|  | 3720 | * | 
|  | 3721 | * In order to solve the imbalance equation (4), and avoid the obvious O(n^2) | 
|  | 3722 | * for all i,j solution, we create a tree of cpus that follows the hardware | 
|  | 3723 | * topology where each level pairs two lower groups (or better). This results | 
|  | 3724 | * in O(log n) layers. Furthermore we reduce the number of cpus going up the | 
|  | 3725 | * tree to only the first of the previous level and we decrease the frequency | 
|  | 3726 | * of load-balance at each level inv. proportional to the number of cpus in | 
|  | 3727 | * the groups. | 
|  | 3728 | * | 
|  | 3729 | * This yields: | 
|  | 3730 | * | 
|  | 3731 | *     log_2 n     1     n | 
|  | 3732 | *   \Sum       { --- * --- * 2^i } = O(n)                            (5) | 
|  | 3733 | *     i = 0      2^i   2^i | 
|  | 3734 | *                               `- size of each group | 
|  | 3735 | *         |         |     `- number of cpus doing load-balance | 
|  | 3736 | *         |         `- freq | 
|  | 3737 | *         `- sum over all levels | 
|  | 3738 | * | 
|  | 3739 | * Coupled with a limit on how many tasks we can migrate every balance pass, | 
|  | 3740 | * this makes (5) the runtime complexity of the balancer. | 
|  | 3741 | * | 
|  | 3742 | * An important property here is that each CPU is still (indirectly) connected | 
|  | 3743 | * to every other cpu in at most O(log n) steps: | 
|  | 3744 | * | 
|  | 3745 | * The adjacency matrix of the resulting graph is given by: | 
|  | 3746 | * | 
|  | 3747 | *             log_2 n | 
|  | 3748 | *   A_i,j = \Union     (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1)  (6) | 
|  | 3749 | *             k = 0 | 
|  | 3750 | * | 
|  | 3751 | * And you'll find that: | 
|  | 3752 | * | 
|  | 3753 | *   A^(log_2 n)_i,j != 0  for all i,j                                (7) | 
|  | 3754 | * | 
|  | 3755 | * Showing there's indeed a path between every cpu in at most O(log n) steps. | 
|  | 3756 | * The task movement gives a factor of O(m), giving a convergence complexity | 
|  | 3757 | * of: | 
|  | 3758 | * | 
|  | 3759 | *   O(nm log n),  n := nr_cpus, m := nr_tasks                        (8) | 
|  | 3760 | * | 
|  | 3761 | * | 
|  | 3762 | * WORK CONSERVING | 
|  | 3763 | * | 
|  | 3764 | * In order to avoid CPUs going idle while there's still work to do, new idle | 
|  | 3765 | * balancing is more aggressive and has the newly idle cpu iterate up the domain | 
|  | 3766 | * tree itself instead of relying on other CPUs to bring it work. | 
|  | 3767 | * | 
|  | 3768 | * This adds some complexity to both (5) and (8) but it reduces the total idle | 
|  | 3769 | * time. | 
|  | 3770 | * | 
|  | 3771 | * [XXX more?] | 
|  | 3772 | * | 
|  | 3773 | * | 
|  | 3774 | * CGROUPS | 
|  | 3775 | * | 
|  | 3776 | * Cgroups make a horror show out of (2), instead of a simple sum we get: | 
|  | 3777 | * | 
|  | 3778 | *                                s_k,i | 
|  | 3779 | *   W_i,0 = \Sum_j \Prod_k w_k * -----                               (9) | 
|  | 3780 | *                                 S_k | 
|  | 3781 | * | 
|  | 3782 | * Where | 
|  | 3783 | * | 
|  | 3784 | *   s_k,i = \Sum_j w_i,j,k  and  S_k = \Sum_i s_k,i                 (10) | 
|  | 3785 | * | 
|  | 3786 | * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i. | 
|  | 3787 | * | 
|  | 3788 | * The big problem is S_k, it's a global sum needed to compute a local (W_i) | 
|  | 3789 | * property. | 
|  | 3790 | * | 
|  | 3791 | * [XXX write more on how we solve this.. _after_ merging pjt's patches that | 
|  | 3792 | *      rewrite all of this once again.] | 
|  | 3793 | */ | 
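/*
 * Filling in the sum behind (5): at level i the balance runs at
 * frequency 1/2^i, on n/2^i cpus, over groups of 2^i cpus, so each term
 * is (1/2^i) * (n/2^i) * 2^i = n/2^i.  Summing the geometric series:
 *
 *   \Sum_{i=0}^{log_2 n} n/2^i  <  2n  =  O(n)
 *
 * which is the step the comment above leaves implicit.
 */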
| Ingo Molnar | bf0f6f2 | 2007-07-09 18:51:58 +0200 | [diff] [blame] | 3794 |  | 
| Hiroshi Shimamoto | ed387b7 | 2012-01-31 11:40:32 +0900 | [diff] [blame] | 3795 | static unsigned long __read_mostly max_load_balance_interval = HZ/10; | 
|  | 3796 |  | 
| Peter Zijlstra | ddcdf6e | 2012-02-22 19:27:40 +0100 | [diff] [blame] | 3797 | #define LBF_ALL_PINNED	0x01 | 
| Peter Zijlstra | 367456c | 2012-02-20 21:49:09 +0100 | [diff] [blame] | 3798 | #define LBF_NEED_BREAK	0x02 | 
| Srivatsa Vaddagiri | 88b8dac | 2012-06-19 17:43:15 +0530 | [diff] [blame] | 3799 | #define LBF_SOME_PINNED 0x04 | 
| Peter Zijlstra | ddcdf6e | 2012-02-22 19:27:40 +0100 | [diff] [blame] | 3800 |  | 
|  | 3801 | struct lb_env { | 
|  | 3802 | struct sched_domain	*sd; | 
|  | 3803 |  | 
| Peter Zijlstra | ddcdf6e | 2012-02-22 19:27:40 +0100 | [diff] [blame] | 3804 | struct rq		*src_rq; | 
| Prashanth Nageshappa | 85c1e7d | 2012-06-19 17:47:34 +0530 | [diff] [blame] | 3805 | int			src_cpu; | 
| Peter Zijlstra | ddcdf6e | 2012-02-22 19:27:40 +0100 | [diff] [blame] | 3806 |  | 
|  | 3807 | int			dst_cpu; | 
|  | 3808 | struct rq		*dst_rq; | 
|  | 3809 |  | 
| Srivatsa Vaddagiri | 88b8dac | 2012-06-19 17:43:15 +0530 | [diff] [blame] | 3810 | struct cpumask		*dst_grpmask; | 
|  | 3811 | int			new_dst_cpu; | 
| Peter Zijlstra | ddcdf6e | 2012-02-22 19:27:40 +0100 | [diff] [blame] | 3812 | enum cpu_idle_type	idle; | 
| Peter Zijlstra | bd939f4 | 2012-05-02 14:20:37 +0200 | [diff] [blame] | 3813 | long			imbalance; | 
| Michael Wang | b940313 | 2012-07-12 16:10:13 +0800 | [diff] [blame] | 3814 | /* The set of CPUs under consideration for load-balancing */ | 
|  | 3815 | struct cpumask		*cpus; | 
|  | 3816 |  | 
| Peter Zijlstra | ddcdf6e | 2012-02-22 19:27:40 +0100 | [diff] [blame] | 3817 | unsigned int		flags; | 
| Peter Zijlstra | 367456c | 2012-02-20 21:49:09 +0100 | [diff] [blame] | 3818 |  | 
|  | 3819 | unsigned int		loop; | 
|  | 3820 | unsigned int		loop_break; | 
|  | 3821 | unsigned int		loop_max; | 
| Peter Zijlstra | ddcdf6e | 2012-02-22 19:27:40 +0100 | [diff] [blame] | 3822 | }; | 
|  | 3823 |  | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 3824 | /* | 
| Peter Zijlstra | ddcdf6e | 2012-02-22 19:27:40 +0100 | [diff] [blame] | 3825 | * move_task - move a task from one runqueue to another runqueue. | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 3826 | * Both runqueues must be locked. | 
|  | 3827 | */ | 
| Peter Zijlstra | ddcdf6e | 2012-02-22 19:27:40 +0100 | [diff] [blame] | 3828 | static void move_task(struct task_struct *p, struct lb_env *env) | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 3829 | { | 
| Peter Zijlstra | ddcdf6e | 2012-02-22 19:27:40 +0100 | [diff] [blame] | 3830 | deactivate_task(env->src_rq, p, 0); | 
|  | 3831 | set_task_cpu(p, env->dst_cpu); | 
|  | 3832 | activate_task(env->dst_rq, p, 0); | 
|  | 3833 | check_preempt_curr(env->dst_rq, p, 0); | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 3834 | } | 
|  | 3835 |  | 
|  | 3836 | /* | 
| Peter Zijlstra | 029632f | 2011-10-25 10:00:11 +0200 | [diff] [blame] | 3837 | * Is this task likely cache-hot: | 
|  | 3838 | */ | 
|  | 3839 | static int | 
|  | 3840 | task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) | 
|  | 3841 | { | 
|  | 3842 | s64 delta; | 
|  | 3843 |  | 
|  | 3844 | if (p->sched_class != &fair_sched_class) | 
|  | 3845 | return 0; | 
|  | 3846 |  | 
|  | 3847 | if (unlikely(p->policy == SCHED_IDLE)) | 
|  | 3848 | return 0; | 
|  | 3849 |  | 
|  | 3850 | /* | 
|  | 3851 | * Buddy candidates are cache hot: | 
|  | 3852 | */ | 
|  | 3853 | if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running && | 
|  | 3854 | (&p->se == cfs_rq_of(&p->se)->next || | 
|  | 3855 | &p->se == cfs_rq_of(&p->se)->last)) | 
|  | 3856 | return 1; | 
|  | 3857 |  | 
|  | 3858 | if (sysctl_sched_migration_cost == -1) | 
|  | 3859 | return 1; | 
|  | 3860 | if (sysctl_sched_migration_cost == 0) | 
|  | 3861 | return 0; | 
|  | 3862 |  | 
|  | 3863 | delta = now - p->se.exec_start; | 
|  | 3864 |  | 
|  | 3865 | return delta < (s64)sysctl_sched_migration_cost; | 
|  | 3866 | } | 
|  | 3867 |  | 
|  | 3868 | /* | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 3869 | * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? | 
|  | 3870 | */ | 
|  | 3871 | static | 
| Peter Zijlstra | 8e45cb5 | 2012-02-22 12:47:19 +0100 | [diff] [blame] | 3872 | int can_migrate_task(struct task_struct *p, struct lb_env *env) | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 3873 | { | 
|  | 3874 | int tsk_cache_hot = 0; | 
|  | 3875 | /* | 
|  | 3876 | * We do not migrate tasks that: | 
|  | 3877 | * 1) are running (obviously), or | 
|  | 3878 | * 2) cannot be migrated to this CPU due to cpus_allowed, or | 
|  | 3879 | * 3) are cache-hot on their current CPU. | 
|  | 3880 | */ | 
| Peter Zijlstra | ddcdf6e | 2012-02-22 19:27:40 +0100 | [diff] [blame] | 3881 | if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) { | 
| Srivatsa Vaddagiri | 88b8dac | 2012-06-19 17:43:15 +0530 | [diff] [blame] | 3882 | int new_dst_cpu; | 
|  | 3883 |  | 
| Lucas De Marchi | 41acab8 | 2010-03-10 23:37:45 -0300 | [diff] [blame] | 3884 | schedstat_inc(p, se.statistics.nr_failed_migrations_affine); | 
| Srivatsa Vaddagiri | 88b8dac | 2012-06-19 17:43:15 +0530 | [diff] [blame] | 3885 |  | 
|  | 3886 | /* | 
|  | 3887 | * Remember if this task can be migrated to any other cpu in | 
|  | 3888 | * our sched_group. We may want to revisit it if we couldn't | 
|  | 3889 | * meet load balance goals by pulling other tasks on src_cpu. | 
|  | 3890 | * | 
|  | 3891 | * Also avoid computing new_dst_cpu if we have already computed | 
|  | 3892 | * one in current iteration. | 
|  | 3893 | */ | 
|  | 3894 | if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED)) | 
|  | 3895 | return 0; | 
|  | 3896 |  | 
|  | 3897 | new_dst_cpu = cpumask_first_and(env->dst_grpmask, | 
|  | 3898 | tsk_cpus_allowed(p)); | 
|  | 3899 | if (new_dst_cpu < nr_cpu_ids) { | 
|  | 3900 | env->flags |= LBF_SOME_PINNED; | 
|  | 3901 | env->new_dst_cpu = new_dst_cpu; | 
|  | 3902 | } | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 3903 | return 0; | 
|  | 3904 | } | 
| Srivatsa Vaddagiri | 88b8dac | 2012-06-19 17:43:15 +0530 | [diff] [blame] | 3905 |  | 
|  | 3906 | /* Record that we found at least one task that could run on dst_cpu */ | 
| Peter Zijlstra | 8e45cb5 | 2012-02-22 12:47:19 +0100 | [diff] [blame] | 3907 | env->flags &= ~LBF_ALL_PINNED; | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 3908 |  | 
| Peter Zijlstra | ddcdf6e | 2012-02-22 19:27:40 +0100 | [diff] [blame] | 3909 | if (task_running(env->src_rq, p)) { | 
| Lucas De Marchi | 41acab8 | 2010-03-10 23:37:45 -0300 | [diff] [blame] | 3910 | schedstat_inc(p, se.statistics.nr_failed_migrations_running); | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 3911 | return 0; | 
|  | 3912 | } | 
|  | 3913 |  | 
|  | 3914 | /* | 
|  | 3915 | * Aggressive migration if: | 
|  | 3916 | * 1) task is cache cold, or | 
|  | 3917 | * 2) too many balance attempts have failed. | 
|  | 3918 | */ | 
|  | 3919 |  | 
| Peter Zijlstra | ddcdf6e | 2012-02-22 19:27:40 +0100 | [diff] [blame] | 3920 | tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd); | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 3921 | if (!tsk_cache_hot || | 
| Peter Zijlstra | 8e45cb5 | 2012-02-22 12:47:19 +0100 | [diff] [blame] | 3922 | env->sd->nr_balance_failed > env->sd->cache_nice_tries) { | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 3923 | #ifdef CONFIG_SCHEDSTATS | 
|  | 3924 | if (tsk_cache_hot) { | 
| Peter Zijlstra | 8e45cb5 | 2012-02-22 12:47:19 +0100 | [diff] [blame] | 3925 | schedstat_inc(env->sd, lb_hot_gained[env->idle]); | 
| Lucas De Marchi | 41acab8 | 2010-03-10 23:37:45 -0300 | [diff] [blame] | 3926 | schedstat_inc(p, se.statistics.nr_forced_migrations); | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 3927 | } | 
|  | 3928 | #endif | 
|  | 3929 | return 1; | 
|  | 3930 | } | 
|  | 3931 |  | 
|  | 3932 | if (tsk_cache_hot) { | 
| Lucas De Marchi | 41acab8 | 2010-03-10 23:37:45 -0300 | [diff] [blame] | 3933 | schedstat_inc(p, se.statistics.nr_failed_migrations_hot); | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 3934 | return 0; | 
|  | 3935 | } | 
|  | 3936 | return 1; | 
|  | 3937 | } | 
|  | 3938 |  | 
| Peter Zijlstra | 897c395 | 2009-12-17 17:45:42 +0100 | [diff] [blame] | 3939 | /* | 
|  | 3940 | * move_one_task tries to move exactly one task from busiest to this_rq, as | 
|  | 3941 | * part of active balancing operations within "domain". | 
|  | 3942 | * Returns 1 if successful and 0 otherwise. | 
|  | 3943 | * | 
|  | 3944 | * Called with both runqueues locked. | 
|  | 3945 | */ | 
| Peter Zijlstra | 8e45cb5 | 2012-02-22 12:47:19 +0100 | [diff] [blame] | 3946 | static int move_one_task(struct lb_env *env) | 
| Peter Zijlstra | 897c395 | 2009-12-17 17:45:42 +0100 | [diff] [blame] | 3947 | { | 
|  | 3948 | struct task_struct *p, *n; | 
| Peter Zijlstra | 897c395 | 2009-12-17 17:45:42 +0100 | [diff] [blame] | 3949 |  | 
| Peter Zijlstra | 367456c | 2012-02-20 21:49:09 +0100 | [diff] [blame] | 3950 | list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) { | 
|  | 3951 | if (throttled_lb_pair(task_group(p), env->src_rq->cpu, env->dst_cpu)) | 
|  | 3952 | continue; | 
| Peter Zijlstra | 897c395 | 2009-12-17 17:45:42 +0100 | [diff] [blame] | 3953 |  | 
| Peter Zijlstra | 367456c | 2012-02-20 21:49:09 +0100 | [diff] [blame] | 3954 | if (!can_migrate_task(p, env)) | 
|  | 3955 | continue; | 
| Peter Zijlstra | 897c395 | 2009-12-17 17:45:42 +0100 | [diff] [blame] | 3956 |  | 
| Peter Zijlstra | 367456c | 2012-02-20 21:49:09 +0100 | [diff] [blame] | 3957 | move_task(p, env); | 
|  | 3958 | /* | 
|  | 3959 | * Right now, this is only the second place move_task() | 
|  | 3960 | * is called, so we can safely collect move_task() | 
|  | 3961 | * stats here rather than inside move_task(). | 
|  | 3962 | */ | 
|  | 3963 | schedstat_inc(env->sd, lb_gained[env->idle]); | 
|  | 3964 | return 1; | 
| Peter Zijlstra | 897c395 | 2009-12-17 17:45:42 +0100 | [diff] [blame] | 3965 | } | 
| Peter Zijlstra | 897c395 | 2009-12-17 17:45:42 +0100 | [diff] [blame] | 3966 | return 0; | 
|  | 3967 | } | 
|  | 3968 |  | 
| Peter Zijlstra | 367456c | 2012-02-20 21:49:09 +0100 | [diff] [blame] | 3969 | static unsigned long task_h_load(struct task_struct *p); | 
|  | 3970 |  | 
| Peter Zijlstra | eb95308 | 2012-04-17 13:38:40 +0200 | [diff] [blame] | 3971 | static const unsigned int sched_nr_migrate_break = 32; | 
|  | 3972 |  | 
| Peter Zijlstra | 5d6523e | 2012-03-10 00:07:36 +0100 | [diff] [blame] | 3973 | /* | 
| Peter Zijlstra | bd939f4 | 2012-05-02 14:20:37 +0200 | [diff] [blame] | 3974 | * move_tasks tries to move up to imbalance weighted load from busiest to | 
| Peter Zijlstra | 5d6523e | 2012-03-10 00:07:36 +0100 | [diff] [blame] | 3975 | * this_rq, as part of a balancing operation within domain "sd". | 
|  | 3976 | * Returns 1 if successful and 0 otherwise. | 
|  | 3977 | * | 
|  | 3978 | * Called with both runqueues locked. | 
|  | 3979 | */ | 
|  | 3980 | static int move_tasks(struct lb_env *env) | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 3981 | { | 
| Peter Zijlstra | 5d6523e | 2012-03-10 00:07:36 +0100 | [diff] [blame] | 3982 | struct list_head *tasks = &env->src_rq->cfs_tasks; | 
|  | 3983 | struct task_struct *p; | 
| Peter Zijlstra | 367456c | 2012-02-20 21:49:09 +0100 | [diff] [blame] | 3984 | unsigned long load; | 
|  | 3985 | int pulled = 0; | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 3986 |  | 
| Peter Zijlstra | bd939f4 | 2012-05-02 14:20:37 +0200 | [diff] [blame] | 3987 | if (env->imbalance <= 0) | 
| Peter Zijlstra | 5d6523e | 2012-03-10 00:07:36 +0100 | [diff] [blame] | 3988 | return 0; | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 3989 |  | 
| Peter Zijlstra | 5d6523e | 2012-03-10 00:07:36 +0100 | [diff] [blame] | 3990 | while (!list_empty(tasks)) { | 
|  | 3991 | p = list_first_entry(tasks, struct task_struct, se.group_node); | 
|  | 3992 |  | 
| Peter Zijlstra | 367456c | 2012-02-20 21:49:09 +0100 | [diff] [blame] | 3993 | env->loop++; | 
|  | 3994 | /* We've more or less seen every task there is, call it quits */ | 
| Peter Zijlstra | 5d6523e | 2012-03-10 00:07:36 +0100 | [diff] [blame] | 3995 | if (env->loop > env->loop_max) | 
| Peter Zijlstra | 367456c | 2012-02-20 21:49:09 +0100 | [diff] [blame] | 3996 | break; | 
| Peter Zijlstra | 5d6523e | 2012-03-10 00:07:36 +0100 | [diff] [blame] | 3997 |  | 
|  | 3998 | /* take a breather every nr_migrate tasks */ | 
| Peter Zijlstra | 367456c | 2012-02-20 21:49:09 +0100 | [diff] [blame] | 3999 | if (env->loop > env->loop_break) { | 
| Peter Zijlstra | eb95308 | 2012-04-17 13:38:40 +0200 | [diff] [blame] | 4000 | env->loop_break += sched_nr_migrate_break; | 
| Peter Zijlstra | 8e45cb5 | 2012-02-22 12:47:19 +0100 | [diff] [blame] | 4001 | env->flags |= LBF_NEED_BREAK; | 
| Peter Zijlstra | ee00e66 | 2009-12-17 17:25:20 +0100 | [diff] [blame] | 4002 | break; | 
| Peter Zijlstra | a195f00 | 2011-09-22 15:30:18 +0200 | [diff] [blame] | 4003 | } | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4004 |  | 
| Peter Zijlstra | 5d6523e | 2012-03-10 00:07:36 +0100 | [diff] [blame] | 4005 | if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) | 
| Peter Zijlstra | 367456c | 2012-02-20 21:49:09 +0100 | [diff] [blame] | 4006 | goto next; | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4007 |  | 
| Peter Zijlstra | 367456c | 2012-02-20 21:49:09 +0100 | [diff] [blame] | 4008 | load = task_h_load(p); | 
| Peter Zijlstra | 5d6523e | 2012-03-10 00:07:36 +0100 | [diff] [blame] | 4009 |  | 
| Peter Zijlstra | eb95308 | 2012-04-17 13:38:40 +0200 | [diff] [blame] | 4010 | if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed) | 
| Peter Zijlstra | 367456c | 2012-02-20 21:49:09 +0100 | [diff] [blame] | 4011 | goto next; | 
|  | 4012 |  | 
| Peter Zijlstra | bd939f4 | 2012-05-02 14:20:37 +0200 | [diff] [blame] | 4013 | if ((load / 2) > env->imbalance) | 
| Peter Zijlstra | 367456c | 2012-02-20 21:49:09 +0100 | [diff] [blame] | 4014 | goto next; | 
|  | 4015 |  | 
|  | 4016 | if (!can_migrate_task(p, env)) | 
|  | 4017 | goto next; | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4018 |  | 
| Peter Zijlstra | ddcdf6e | 2012-02-22 19:27:40 +0100 | [diff] [blame] | 4019 | move_task(p, env); | 
| Peter Zijlstra | ee00e66 | 2009-12-17 17:25:20 +0100 | [diff] [blame] | 4020 | pulled++; | 
| Peter Zijlstra | bd939f4 | 2012-05-02 14:20:37 +0200 | [diff] [blame] | 4021 | env->imbalance -= load; | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4022 |  | 
|  | 4023 | #ifdef CONFIG_PREEMPT | 
| Peter Zijlstra | ee00e66 | 2009-12-17 17:25:20 +0100 | [diff] [blame] | 4024 | /* | 
|  | 4025 | * NEWIDLE balancing is a source of latency, so preemptible | 
|  | 4026 | * kernels will stop after the first task is pulled to minimize | 
|  | 4027 | * the critical section. | 
|  | 4028 | */ | 
| Peter Zijlstra | 5d6523e | 2012-03-10 00:07:36 +0100 | [diff] [blame] | 4029 | if (env->idle == CPU_NEWLY_IDLE) | 
| Peter Zijlstra | ee00e66 | 2009-12-17 17:25:20 +0100 | [diff] [blame] | 4030 | break; | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4031 | #endif | 
|  | 4032 |  | 
| Peter Zijlstra | ee00e66 | 2009-12-17 17:25:20 +0100 | [diff] [blame] | 4033 | /* | 
|  | 4034 | * We only want to steal up to the prescribed amount of | 
|  | 4035 | * weighted load. | 
|  | 4036 | */ | 
| Peter Zijlstra | bd939f4 | 2012-05-02 14:20:37 +0200 | [diff] [blame] | 4037 | if (env->imbalance <= 0) | 
| Peter Zijlstra | ee00e66 | 2009-12-17 17:25:20 +0100 | [diff] [blame] | 4038 | break; | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4039 |  | 
| Peter Zijlstra | 367456c | 2012-02-20 21:49:09 +0100 | [diff] [blame] | 4040 | continue; | 
|  | 4041 | next: | 
| Peter Zijlstra | 5d6523e | 2012-03-10 00:07:36 +0100 | [diff] [blame] | 4042 | list_move_tail(&p->se.group_node, tasks); | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4043 | } | 
| Peter Zijlstra | 5d6523e | 2012-03-10 00:07:36 +0100 | [diff] [blame] | 4044 |  | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4045 | /* | 
| Peter Zijlstra | ddcdf6e | 2012-02-22 19:27:40 +0100 | [diff] [blame] | 4046 | * Right now, this is one of only two places move_task() is called, | 
|  | 4047 | * so we can safely collect move_task() stats here rather than | 
|  | 4048 | * inside move_task(). | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4049 | */ | 
| Peter Zijlstra | 8e45cb5 | 2012-02-22 12:47:19 +0100 | [diff] [blame] | 4050 | schedstat_add(env->sd, lb_gained[env->idle], pulled); | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4051 |  | 
| Peter Zijlstra | 5d6523e | 2012-03-10 00:07:36 +0100 | [diff] [blame] | 4052 | return pulled; | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4053 | } | 
|  | 4054 |  | 
| Peter Zijlstra | 230059de | 2009-12-17 17:47:12 +0100 | [diff] [blame] | 4055 | #ifdef CONFIG_FAIR_GROUP_SCHED | 
| Peter Zijlstra | 9e3081c | 2010-11-15 15:47:02 -0800 | [diff] [blame] | 4056 | /* | 
|  | 4057 | * update tg->load_weight by folding this cpu's load_avg | 
|  | 4058 | */ | 
| Paul Turner | 48a1675 | 2012-10-04 13:18:31 +0200 | [diff] [blame] | 4059 | static void __update_blocked_averages_cpu(struct task_group *tg, int cpu) | 
| Peter Zijlstra | 9e3081c | 2010-11-15 15:47:02 -0800 | [diff] [blame] | 4060 | { | 
| Paul Turner | 48a1675 | 2012-10-04 13:18:31 +0200 | [diff] [blame] | 4061 | struct sched_entity *se = tg->se[cpu]; | 
|  | 4062 | struct cfs_rq *cfs_rq = tg->cfs_rq[cpu]; | 
| Peter Zijlstra | 9e3081c | 2010-11-15 15:47:02 -0800 | [diff] [blame] | 4063 |  | 
| Paul Turner | 48a1675 | 2012-10-04 13:18:31 +0200 | [diff] [blame] | 4064 | /* throttled entities do not contribute to load */ | 
|  | 4065 | if (throttled_hierarchy(cfs_rq)) | 
|  | 4066 | return; | 
| Peter Zijlstra | 9e3081c | 2010-11-15 15:47:02 -0800 | [diff] [blame] | 4067 |  | 
| Paul Turner | aff3e49 | 2012-10-04 13:18:30 +0200 | [diff] [blame] | 4068 | update_cfs_rq_blocked_load(cfs_rq, 1); | 
| Peter Zijlstra | 9e3081c | 2010-11-15 15:47:02 -0800 | [diff] [blame] | 4069 |  | 
| Paul Turner | 8295836 | 2012-10-04 13:18:31 +0200 | [diff] [blame] | 4070 | if (se) { | 
|  | 4071 | update_entity_load_avg(se, 1); | 
|  | 4072 | /* | 
|  | 4073 | * We pivot on our runnable average having decayed to zero for | 
|  | 4074 | * list removal.  This generally implies that all our children | 
|  | 4075 | * have also been removed (modulo rounding error or bandwidth | 
|  | 4076 | * control); however, such cases are rare and we can fix these | 
|  | 4077 | * at enqueue. | 
|  | 4078 | * | 
|  | 4079 | * TODO: fix up out-of-order children on enqueue. | 
|  | 4080 | */ | 
|  | 4081 | if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running) | 
|  | 4082 | list_del_leaf_cfs_rq(cfs_rq); | 
|  | 4083 | } else { | 
| Paul Turner | 48a1675 | 2012-10-04 13:18:31 +0200 | [diff] [blame] | 4084 | struct rq *rq = rq_of(cfs_rq); | 
| Paul Turner | 8295836 | 2012-10-04 13:18:31 +0200 | [diff] [blame] | 4085 | update_rq_runnable_avg(rq, rq->nr_running); | 
|  | 4086 | } | 
| Peter Zijlstra | 9e3081c | 2010-11-15 15:47:02 -0800 | [diff] [blame] | 4087 | } | 
|  | 4088 |  | 
| Paul Turner | 48a1675 | 2012-10-04 13:18:31 +0200 | [diff] [blame] | 4089 | static void update_blocked_averages(int cpu) | 
| Peter Zijlstra | 9e3081c | 2010-11-15 15:47:02 -0800 | [diff] [blame] | 4090 | { | 
| Peter Zijlstra | 9e3081c | 2010-11-15 15:47:02 -0800 | [diff] [blame] | 4091 | struct rq *rq = cpu_rq(cpu); | 
| Paul Turner | 48a1675 | 2012-10-04 13:18:31 +0200 | [diff] [blame] | 4092 | struct cfs_rq *cfs_rq; | 
|  | 4093 | unsigned long flags; | 
| Peter Zijlstra | 9e3081c | 2010-11-15 15:47:02 -0800 | [diff] [blame] | 4094 |  | 
| Paul Turner | 48a1675 | 2012-10-04 13:18:31 +0200 | [diff] [blame] | 4095 | raw_spin_lock_irqsave(&rq->lock, flags); | 
|  | 4096 | update_rq_clock(rq); | 
| Peter Zijlstra | 9763b67 | 2011-07-13 13:09:25 +0200 | [diff] [blame] | 4097 | /* | 
|  | 4098 | * Iterates the task_group tree in a bottom up fashion, see | 
|  | 4099 | * list_add_leaf_cfs_rq() for details. | 
|  | 4100 | */ | 
| Paul Turner | 64660c8 | 2011-07-21 09:43:36 -0700 | [diff] [blame] | 4101 | for_each_leaf_cfs_rq(rq, cfs_rq) { | 
| Paul Turner | 48a1675 | 2012-10-04 13:18:31 +0200 | [diff] [blame] | 4102 | /* | 
|  | 4103 | * Note: We may want to consider periodically releasing | 
|  | 4104 | * rq->lock around these updates so that creating many task | 
|  | 4105 | * groups does not result in continually extending hold time. | 
|  | 4106 | */ | 
|  | 4107 | __update_blocked_averages_cpu(cfs_rq->tg, rq->cpu); | 
| Paul Turner | 64660c8 | 2011-07-21 09:43:36 -0700 | [diff] [blame] | 4108 | } | 
| Paul Turner | 48a1675 | 2012-10-04 13:18:31 +0200 | [diff] [blame] | 4109 |  | 
|  | 4110 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 
| Peter Zijlstra | 9e3081c | 2010-11-15 15:47:02 -0800 | [diff] [blame] | 4111 | } | 
|  | 4112 |  | 
| Peter Zijlstra | 9763b67 | 2011-07-13 13:09:25 +0200 | [diff] [blame] | 4113 | /* | 
|  | 4114 | * Compute the cpu's hierarchical load factor for each task group. | 
|  | 4115 | * This needs to be done in a top-down fashion because the load of a child | 
|  | 4116 | * group is a fraction of its parent's load. | 
|  | 4117 | */ | 
|  | 4118 | static int tg_load_down(struct task_group *tg, void *data) | 
|  | 4119 | { | 
|  | 4120 | unsigned long load; | 
|  | 4121 | long cpu = (long)data; | 
|  | 4122 |  | 
|  | 4123 | if (!tg->parent) { | 
|  | 4124 | load = cpu_rq(cpu)->load.weight; | 
|  | 4125 | } else { | 
|  | 4126 | load = tg->parent->cfs_rq[cpu]->h_load; | 
|  | 4127 | load *= tg->se[cpu]->load.weight; | 
|  | 4128 | load /= tg->parent->cfs_rq[cpu]->load.weight + 1; | 
|  | 4129 | } | 
|  | 4130 |  | 
|  | 4131 | tg->cfs_rq[cpu]->h_load = load; | 
|  | 4132 |  | 
|  | 4133 | return 0; | 
|  | 4134 | } | 
|  | 4135 |  | 
|  | 4136 | static void update_h_load(long cpu) | 
|  | 4137 | { | 
| Peter Zijlstra | a35b646 | 2012-08-08 21:46:40 +0200 | [diff] [blame] | 4138 | struct rq *rq = cpu_rq(cpu); | 
|  | 4139 | unsigned long now = jiffies; | 
|  | 4140 |  | 
|  | 4141 | if (rq->h_load_throttle == now) | 
|  | 4142 | return; | 
|  | 4143 |  | 
|  | 4144 | rq->h_load_throttle = now; | 
|  | 4145 |  | 
| Peter Zijlstra | 367456c | 2012-02-20 21:49:09 +0100 | [diff] [blame] | 4146 | rcu_read_lock(); | 
| Peter Zijlstra | 9763b67 | 2011-07-13 13:09:25 +0200 | [diff] [blame] | 4147 | walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); | 
| Peter Zijlstra | 367456c | 2012-02-20 21:49:09 +0100 | [diff] [blame] | 4148 | rcu_read_unlock(); | 
| Peter Zijlstra | 9763b67 | 2011-07-13 13:09:25 +0200 | [diff] [blame] | 4149 | } | 
|  | 4150 |  | 
| Peter Zijlstra | 367456c | 2012-02-20 21:49:09 +0100 | [diff] [blame] | 4151 | static unsigned long task_h_load(struct task_struct *p) | 
| Peter Zijlstra | 230059de | 2009-12-17 17:47:12 +0100 | [diff] [blame] | 4152 | { | 
| Peter Zijlstra | 367456c | 2012-02-20 21:49:09 +0100 | [diff] [blame] | 4153 | struct cfs_rq *cfs_rq = task_cfs_rq(p); | 
|  | 4154 | unsigned long load; | 
| Peter Zijlstra | 230059de | 2009-12-17 17:47:12 +0100 | [diff] [blame] | 4155 |  | 
| Peter Zijlstra | 367456c | 2012-02-20 21:49:09 +0100 | [diff] [blame] | 4156 | load = p->se.load.weight; | 
|  | 4157 | load = div_u64(load * cfs_rq->h_load, cfs_rq->load.weight + 1); | 
| Peter Zijlstra | 230059de | 2009-12-17 17:47:12 +0100 | [diff] [blame] | 4158 |  | 
| Peter Zijlstra | 367456c | 2012-02-20 21:49:09 +0100 | [diff] [blame] | 4159 | return load; | 
| Peter Zijlstra | 230059de | 2009-12-17 17:47:12 +0100 | [diff] [blame] | 4160 | } | 
|  | 4161 | #else | 
| Paul Turner | 48a1675 | 2012-10-04 13:18:31 +0200 | [diff] [blame] | 4162 | static inline void update_blocked_averages(int cpu) | 
| Peter Zijlstra | 9e3081c | 2010-11-15 15:47:02 -0800 | [diff] [blame] | 4163 | { | 
|  | 4164 | } | 
|  | 4165 |  | 
| Peter Zijlstra | 367456c | 2012-02-20 21:49:09 +0100 | [diff] [blame] | 4166 | static inline void update_h_load(long cpu) | 
| Peter Zijlstra | 230059de | 2009-12-17 17:47:12 +0100 | [diff] [blame] | 4167 | { | 
| Peter Zijlstra | 367456c | 2012-02-20 21:49:09 +0100 | [diff] [blame] | 4168 | } | 
|  | 4169 |  | 
|  | 4170 | static unsigned long task_h_load(struct task_struct *p) | 
|  | 4171 | { | 
|  | 4172 | return p->se.load.weight; | 
| Peter Zijlstra | 230059de | 2009-12-17 17:47:12 +0100 | [diff] [blame] | 4173 | } | 
|  | 4174 | #endif | 
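A self-contained worked example of the hierarchical load arithmetic above, with made-up weights (a root cfs_rq of total weight 3072 holding a group entity of weight 1024, whose own cfs_rq holds two 1024-weight tasks); this mirrors the divisions in tg_load_down() and task_h_load() but is illustration, not kernel code:

#include <stdio.h>

int main(void)
{
	unsigned long rq_weight = 3072;		/* root cfs_rq load.weight */
	unsigned long se_weight = 1024;		/* the group se's weight in the root */
	unsigned long grp_weight = 2048;	/* the group's own cfs_rq weight */
	unsigned long p_weight = 1024;		/* one task inside the group */

	/* tg_load_down(): child h_load is a fraction of the parent's load */
	unsigned long h_load = rq_weight * se_weight / (rq_weight + 1);

	/* task_h_load(): the task's share of its cfs_rq's h_load */
	unsigned long t_load = p_weight * h_load / (grp_weight + 1);

	/* prints 1023 and 511: each task carries half the group's weight */
	printf("h_load=%lu task_h_load=%lu\n", h_load, t_load);
	return 0;
}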
|  | 4175 |  | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4176 | /********** Helpers for find_busiest_group ************************/ | 
|  | 4177 | /* | 
|  | 4178 | * sd_lb_stats - Structure to store the statistics of a sched_domain | 
|  | 4179 | * 		during load balancing. | 
|  | 4180 | */ | 
|  | 4181 | struct sd_lb_stats { | 
|  | 4182 | struct sched_group *busiest; /* Busiest group in this sd */ | 
|  | 4183 | struct sched_group *this;  /* Local group in this sd */ | 
|  | 4184 | unsigned long total_load;  /* Total load of all groups in sd */ | 
|  | 4185 | unsigned long total_pwr;   /* Total power of all groups in sd */ | 
|  | 4186 | unsigned long avg_load;	   /* Average load across all groups in sd */ | 
|  | 4187 |  | 
|  | 4188 | /** Statistics of this group */ | 
|  | 4189 | unsigned long this_load; | 
|  | 4190 | unsigned long this_load_per_task; | 
|  | 4191 | unsigned long this_nr_running; | 
| Nikhil Rao | fab4762 | 2010-10-15 13:12:29 -0700 | [diff] [blame] | 4192 | unsigned long this_has_capacity; | 
| Suresh Siddha | aae6d3d | 2010-09-17 15:02:32 -0700 | [diff] [blame] | 4193 | unsigned int  this_idle_cpus; | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4194 |  | 
|  | 4195 | /* Statistics of the busiest group */ | 
| Suresh Siddha | aae6d3d | 2010-09-17 15:02:32 -0700 | [diff] [blame] | 4196 | unsigned int  busiest_idle_cpus; | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4197 | unsigned long max_load; | 
|  | 4198 | unsigned long busiest_load_per_task; | 
|  | 4199 | unsigned long busiest_nr_running; | 
| Suresh Siddha | dd5feea | 2010-02-23 16:13:52 -0800 | [diff] [blame] | 4200 | unsigned long busiest_group_capacity; | 
| Nikhil Rao | fab4762 | 2010-10-15 13:12:29 -0700 | [diff] [blame] | 4201 | unsigned long busiest_has_capacity; | 
| Suresh Siddha | aae6d3d | 2010-09-17 15:02:32 -0700 | [diff] [blame] | 4202 | unsigned int  busiest_group_weight; | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4203 |  | 
|  | 4204 | int group_imb; /* Is there imbalance in this sd */ | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4205 | }; | 
|  | 4206 |  | 
|  | 4207 | /* | 
|  | 4208 | * sg_lb_stats - stats of a sched_group required for load_balancing | 
|  | 4209 | */ | 
|  | 4210 | struct sg_lb_stats { | 
|  | 4211 | unsigned long avg_load; /* Avg load across the CPUs of the group */ | 
|  | 4212 | unsigned long group_load; /* Total load over the CPUs of the group */ | 
|  | 4213 | unsigned long sum_nr_running; /* Nr tasks running in the group */ | 
|  | 4214 | unsigned long sum_weighted_load; /* Weighted load of group's tasks */ | 
|  | 4215 | unsigned long group_capacity; | 
| Suresh Siddha | aae6d3d | 2010-09-17 15:02:32 -0700 | [diff] [blame] | 4216 | unsigned long idle_cpus; | 
|  | 4217 | unsigned long group_weight; | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4218 | int group_imb; /* Is there an imbalance in the group? */ | 
| Nikhil Rao | fab4762 | 2010-10-15 13:12:29 -0700 | [diff] [blame] | 4219 | int group_has_capacity; /* Is there extra capacity in the group? */ | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4220 | }; | 
|  | 4221 |  | 
|  | 4222 | /** | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4223 | * get_sd_load_idx - Obtain the load index for a given sched domain. | 
|  | 4224 | * @sd: The sched_domain whose load_idx is to be obtained. | 
|  | 4225 | * @idle: The idle status of the CPU for whose sd load_idx is obtained. | 
|  | 4226 | */ | 
|  | 4227 | static inline int get_sd_load_idx(struct sched_domain *sd, | 
|  | 4228 | enum cpu_idle_type idle) | 
|  | 4229 | { | 
|  | 4230 | int load_idx; | 
|  | 4231 |  | 
|  | 4232 | switch (idle) { | 
|  | 4233 | case CPU_NOT_IDLE: | 
|  | 4234 | load_idx = sd->busy_idx; | 
|  | 4235 | break; | 
|  | 4236 |  | 
|  | 4237 | case CPU_NEWLY_IDLE: | 
|  | 4238 | load_idx = sd->newidle_idx; | 
|  | 4239 | break; | 
|  | 4240 | default: | 
|  | 4241 | load_idx = sd->idle_idx; | 
|  | 4242 | break; | 
|  | 4243 | } | 
|  | 4244 |  | 
|  | 4245 | return load_idx; | 
|  | 4246 | } | 
|  | 4247 |  | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4248 | unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) | 
|  | 4249 | { | 
| Nikhil Rao | 1399fa7 | 2011-05-18 10:09:39 -0700 | [diff] [blame] | 4250 | return SCHED_POWER_SCALE; | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4251 | } | 
|  | 4252 |  | 
|  | 4253 | unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu) | 
|  | 4254 | { | 
|  | 4255 | return default_scale_freq_power(sd, cpu); | 
|  | 4256 | } | 
|  | 4257 |  | 
|  | 4258 | unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) | 
|  | 4259 | { | 
| Peter Zijlstra | 669c55e | 2010-04-16 14:59:29 +0200 | [diff] [blame] | 4260 | unsigned long weight = sd->span_weight; | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4261 | unsigned long smt_gain = sd->smt_gain; | 
|  | 4262 |  | 
|  | 4263 | smt_gain /= weight; | 
|  | 4264 |  | 
|  | 4265 | return smt_gain; | 
|  | 4266 | } | 
|  | 4267 |  | 
|  | 4268 | unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) | 
|  | 4269 | { | 
|  | 4270 | return default_scale_smt_power(sd, cpu); | 
|  | 4271 | } | 
|  | 4272 |  | 
|  | 4273 | unsigned long scale_rt_power(int cpu) | 
|  | 4274 | { | 
|  | 4275 | struct rq *rq = cpu_rq(cpu); | 
| Peter Zijlstra | b654f7d | 2012-05-22 14:04:28 +0200 | [diff] [blame] | 4276 | u64 total, available, age_stamp, avg; | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4277 |  | 
| Peter Zijlstra | b654f7d | 2012-05-22 14:04:28 +0200 | [diff] [blame] | 4278 | /* | 
|  | 4279 | * Since we're reading these variables without serialization make sure | 
|  | 4280 | * we read them once before doing sanity checks on them. | 
|  | 4281 | */ | 
|  | 4282 | age_stamp = ACCESS_ONCE(rq->age_stamp); | 
|  | 4283 | avg = ACCESS_ONCE(rq->rt_avg); | 
| Venkatesh Pallipadi | aa48380 | 2010-10-04 17:03:22 -0700 | [diff] [blame] | 4284 |  | 
| Peter Zijlstra | b654f7d | 2012-05-22 14:04:28 +0200 | [diff] [blame] | 4285 | total = sched_avg_period() + (rq->clock - age_stamp); | 
|  | 4286 |  | 
|  | 4287 | if (unlikely(total < avg)) { | 
| Venkatesh Pallipadi | aa48380 | 2010-10-04 17:03:22 -0700 | [diff] [blame] | 4288 | /* Ensures that power won't end up being negative */ | 
|  | 4289 | available = 0; | 
|  | 4290 | } else { | 
| Peter Zijlstra | b654f7d | 2012-05-22 14:04:28 +0200 | [diff] [blame] | 4291 | available = total - avg; | 
| Venkatesh Pallipadi | aa48380 | 2010-10-04 17:03:22 -0700 | [diff] [blame] | 4292 | } | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4293 |  | 
| Nikhil Rao | 1399fa7 | 2011-05-18 10:09:39 -0700 | [diff] [blame] | 4294 | if (unlikely((s64)total < SCHED_POWER_SCALE)) | 
|  | 4295 | total = SCHED_POWER_SCALE; | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4296 |  | 
| Nikhil Rao | 1399fa7 | 2011-05-18 10:09:39 -0700 | [diff] [blame] | 4297 | total >>= SCHED_POWER_SHIFT; | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4298 |  | 
|  | 4299 | return div_u64(available, total); | 
|  | 4300 | } | 
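The arithmetic above can be checked with a tiny standalone program; the figures are invented (a 500ms window in nanoseconds since age_stamp, of which RT and irq activity consumed 125ms), and the real code additionally uses ACCESS_ONCE(), div_u64() and the underflow/minimum guards shown above:

#include <stdio.h>

#define SCHED_POWER_SHIFT	10
#define SCHED_POWER_SCALE	(1UL << SCHED_POWER_SHIFT)

int main(void)
{
	unsigned long long total = 500000000ULL;	/* period + (clock - age_stamp) */
	unsigned long long avg = 125000000ULL;		/* rq->rt_avg */
	unsigned long long available = total - avg;	/* time left for CFS */

	total >>= SCHED_POWER_SHIFT;	/* keep the result in power units */

	/* prints 768, i.e. 75% of SCHED_POWER_SCALE (1024) remains */
	printf("scale_rt_power = %llu\n", available / total);
	return 0;
}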
|  | 4301 |  | 
|  | 4302 | static void update_cpu_power(struct sched_domain *sd, int cpu) | 
|  | 4303 | { | 
| Peter Zijlstra | 669c55e | 2010-04-16 14:59:29 +0200 | [diff] [blame] | 4304 | unsigned long weight = sd->span_weight; | 
| Nikhil Rao | 1399fa7 | 2011-05-18 10:09:39 -0700 | [diff] [blame] | 4305 | unsigned long power = SCHED_POWER_SCALE; | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4306 | struct sched_group *sdg = sd->groups; | 
|  | 4307 |  | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4308 | if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { | 
|  | 4309 | if (sched_feat(ARCH_POWER)) | 
|  | 4310 | power *= arch_scale_smt_power(sd, cpu); | 
|  | 4311 | else | 
|  | 4312 | power *= default_scale_smt_power(sd, cpu); | 
|  | 4313 |  | 
| Nikhil Rao | 1399fa7 | 2011-05-18 10:09:39 -0700 | [diff] [blame] | 4314 | power >>= SCHED_POWER_SHIFT; | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4315 | } | 
|  | 4316 |  | 
| Peter Zijlstra | 9c3f75c | 2011-07-14 13:00:06 +0200 | [diff] [blame] | 4317 | sdg->sgp->power_orig = power; | 
| Srivatsa Vaddagiri | 9d5efe0 | 2010-06-08 14:57:02 +1000 | [diff] [blame] | 4318 |  | 
|  | 4319 | if (sched_feat(ARCH_POWER)) | 
|  | 4320 | power *= arch_scale_freq_power(sd, cpu); | 
|  | 4321 | else | 
|  | 4322 | power *= default_scale_freq_power(sd, cpu); | 
|  | 4323 |  | 
| Nikhil Rao | 1399fa7 | 2011-05-18 10:09:39 -0700 | [diff] [blame] | 4324 | power >>= SCHED_POWER_SHIFT; | 
| Srivatsa Vaddagiri | 9d5efe0 | 2010-06-08 14:57:02 +1000 | [diff] [blame] | 4325 |  | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4326 | power *= scale_rt_power(cpu); | 
| Nikhil Rao | 1399fa7 | 2011-05-18 10:09:39 -0700 | [diff] [blame] | 4327 | power >>= SCHED_POWER_SHIFT; | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4328 |  | 
|  | 4329 | if (!power) | 
|  | 4330 | power = 1; | 
|  | 4331 |  | 
| Peter Zijlstra | e51fd5e | 2010-05-31 12:37:30 +0200 | [diff] [blame] | 4332 | cpu_rq(cpu)->cpu_power = power; | 
| Peter Zijlstra | 9c3f75c | 2011-07-14 13:00:06 +0200 | [diff] [blame] | 4333 | sdg->sgp->power = power; | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4334 | } | 
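Putting the scaling chain together, a hypothetical walk-through with invented numbers: one thread of a 2-way SMT core (sd->smt_gain assumed to be 1178, so each sibling starts near 589), at full frequency (factor 1024), with scale_rt_power() returning 768 as in the example above; again a sketch, not kernel code:

#include <stdio.h>

#define SCHED_POWER_SHIFT	10
#define SCHED_POWER_SCALE	(1UL << SCHED_POWER_SHIFT)

int main(void)
{
	unsigned long power = SCHED_POWER_SCALE;

	power *= 1178 / 2;		/* smt_gain / weight, per sibling */
	power >>= SCHED_POWER_SHIFT;	/* 589: this becomes power_orig */

	power *= SCHED_POWER_SCALE;	/* full frequency, factor 1024 */
	power >>= SCHED_POWER_SHIFT;	/* still 589 */

	power *= 768;			/* scale_rt_power() from above */
	power >>= SCHED_POWER_SHIFT;	/* 441: capacity left for CFS */

	printf("cpu_power = %lu\n", power ? power : 1);
	return 0;
}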
|  | 4335 |  | 
| Peter Zijlstra | 029632f | 2011-10-25 10:00:11 +0200 | [diff] [blame] | 4336 | void update_group_power(struct sched_domain *sd, int cpu) | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4337 | { | 
|  | 4338 | struct sched_domain *child = sd->child; | 
|  | 4339 | struct sched_group *group, *sdg = sd->groups; | 
|  | 4340 | unsigned long power; | 
| Vincent Guittot | 4ec4412 | 2011-12-12 20:21:08 +0100 | [diff] [blame] | 4341 | unsigned long interval; | 
|  | 4342 |  | 
|  | 4343 | interval = msecs_to_jiffies(sd->balance_interval); | 
|  | 4344 | interval = clamp(interval, 1UL, max_load_balance_interval); | 
|  | 4345 | sdg->sgp->next_update = jiffies + interval; | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4346 |  | 
|  | 4347 | if (!child) { | 
|  | 4348 | update_cpu_power(sd, cpu); | 
|  | 4349 | return; | 
|  | 4350 | } | 
|  | 4351 |  | 
|  | 4352 | power = 0; | 
|  | 4353 |  | 
| Peter Zijlstra | 74a5ce2 | 2012-05-23 18:00:43 +0200 | [diff] [blame] | 4354 | if (child->flags & SD_OVERLAP) { | 
|  | 4355 | /* | 
|  | 4356 | * SD_OVERLAP domains cannot assume that child groups | 
|  | 4357 | * span the current group. | 
|  | 4358 | */ | 
|  | 4359 |  | 
|  | 4360 | for_each_cpu(cpu, sched_group_cpus(sdg)) | 
|  | 4361 | power += power_of(cpu); | 
|  | 4362 | } else  { | 
|  | 4363 | /* | 
|  | 4364 | * !SD_OVERLAP domains can assume that child groups | 
|  | 4365 | * span the current group. | 
|  | 4366 | */ | 
|  | 4367 |  | 
|  | 4368 | group = child->groups; | 
|  | 4369 | do { | 
|  | 4370 | power += group->sgp->power; | 
|  | 4371 | group = group->next; | 
|  | 4372 | } while (group != child->groups); | 
|  | 4373 | } | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4374 |  | 
| Peter Zijlstra | c3decf0 | 2012-05-31 12:05:32 +0200 | [diff] [blame] | 4375 | sdg->sgp->power_orig = sdg->sgp->power = power; | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4376 | } | 
|  | 4377 |  | 
| Srivatsa Vaddagiri | 9d5efe0 | 2010-06-08 14:57:02 +1000 | [diff] [blame] | 4378 | /* | 
|  | 4379 | * Try and fix up capacity for tiny siblings, this is needed when | 
|  | 4380 | * things like SD_ASYM_PACKING need f_b_g to select another sibling | 
|  | 4381 | * which on its own isn't powerful enough. | 
|  | 4382 | * | 
|  | 4383 | * See update_sd_pick_busiest() and check_asym_packing(). | 
|  | 4384 | */ | 
|  | 4385 | static inline int | 
|  | 4386 | fix_small_capacity(struct sched_domain *sd, struct sched_group *group) | 
|  | 4387 | { | 
|  | 4388 | /* | 
| Nikhil Rao | 1399fa7 | 2011-05-18 10:09:39 -0700 | [diff] [blame] | 4389 | * Only siblings can have significantly less than SCHED_POWER_SCALE | 
| Srivatsa Vaddagiri | 9d5efe0 | 2010-06-08 14:57:02 +1000 | [diff] [blame] | 4390 | */ | 
| Peter Zijlstra | a6c75f2 | 2011-04-07 14:09:52 +0200 | [diff] [blame] | 4391 | if (!(sd->flags & SD_SHARE_CPUPOWER)) | 
| Srivatsa Vaddagiri | 9d5efe0 | 2010-06-08 14:57:02 +1000 | [diff] [blame] | 4392 | return 0; | 
|  | 4393 |  | 
|  | 4394 | /* | 
|  | 4395 | * If ~90% of the cpu_power is still there, we're good. | 
|  | 4396 | */ | 
| Peter Zijlstra | 9c3f75c | 2011-07-14 13:00:06 +0200 | [diff] [blame] | 4397 | if (group->sgp->power * 32 > group->sgp->power_orig * 29) | 
| Srivatsa Vaddagiri | 9d5efe0 | 2010-06-08 14:57:02 +1000 | [diff] [blame] | 4398 | return 1; | 
|  | 4399 |  | 
|  | 4400 | return 0; | 
|  | 4401 | } | 
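The ~90% comparison is cheap to sanity-check in isolation (29/32 is roughly 90.6%); with the invented sibling figures from the example above, power_orig = 589 and a current power of 540 still passes:

#include <stdio.h>

int main(void)
{
	unsigned long power_orig = 589, power = 540;

	/* 540 * 32 = 17280 > 589 * 29 = 17081, so capacity is fixed to 1 */
	printf("%d\n", power * 32 > power_orig * 29);
	return 0;
}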
|  | 4402 |  | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4403 | /** | 
|  | 4404 | * update_sg_lb_stats - Update sched_group's statistics for load balancing. | 
| Randy Dunlap | cd96891 | 2012-06-08 13:18:33 -0700 | [diff] [blame] | 4405 | * @env: The load balancing environment. | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4406 | * @group: sched_group whose statistics are to be updated. | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4407 | * @load_idx: Load index of sched_domain of this_cpu for load calc. | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4408 | * @local_group: Does group contain this_cpu. | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4409 | * @balance: Should we balance. | 
|  | 4410 | * @sgs: variable to hold the statistics for this group. | 
|  | 4411 | */ | 
| Peter Zijlstra | bd939f4 | 2012-05-02 14:20:37 +0200 | [diff] [blame] | 4412 | static inline void update_sg_lb_stats(struct lb_env *env, | 
|  | 4413 | struct sched_group *group, int load_idx, | 
| Michael Wang | b940313 | 2012-07-12 16:10:13 +0800 | [diff] [blame] | 4414 | int local_group, int *balance, struct sg_lb_stats *sgs) | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4415 | { | 
| Peter Zijlstra | e44bc5c | 2012-05-11 00:22:12 +0200 | [diff] [blame] | 4416 | unsigned long nr_running, max_nr_running, min_nr_running; | 
|  | 4417 | unsigned long load, max_cpu_load, min_cpu_load; | 
| Peter Zijlstra | 04f733b | 2012-05-11 00:12:02 +0200 | [diff] [blame] | 4418 | unsigned int balance_cpu = -1, first_idle_cpu = 0; | 
| Suresh Siddha | dd5feea | 2010-02-23 16:13:52 -0800 | [diff] [blame] | 4419 | unsigned long avg_load_per_task = 0; | 
| Peter Zijlstra | bd939f4 | 2012-05-02 14:20:37 +0200 | [diff] [blame] | 4420 | int i; | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4421 |  | 
| Gautham R Shenoy | 871e35b | 2010-01-20 14:02:44 -0600 | [diff] [blame] | 4422 | if (local_group) | 
| Peter Zijlstra | c117487 | 2012-05-31 14:47:33 +0200 | [diff] [blame] | 4423 | balance_cpu = group_balance_cpu(group); | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4424 |  | 
|  | 4425 | /* Tally up the load of all CPUs in the group */ | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4426 | max_cpu_load = 0; | 
|  | 4427 | min_cpu_load = ~0UL; | 
| Nikhil Rao | 2582f0e | 2010-10-13 12:09:36 -0700 | [diff] [blame] | 4428 | max_nr_running = 0; | 
| Peter Zijlstra | e44bc5c | 2012-05-11 00:22:12 +0200 | [diff] [blame] | 4429 | min_nr_running = ~0UL; | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4430 |  | 
| Michael Wang | b940313 | 2012-07-12 16:10:13 +0800 | [diff] [blame] | 4431 | for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4432 | struct rq *rq = cpu_rq(i); | 
|  | 4433 |  | 
| Peter Zijlstra | e44bc5c | 2012-05-11 00:22:12 +0200 | [diff] [blame] | 4434 | nr_running = rq->nr_running; | 
|  | 4435 |  | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4436 | /* Bias balancing toward cpus of our domain */ | 
|  | 4437 | if (local_group) { | 
| Peter Zijlstra | c117487 | 2012-05-31 14:47:33 +0200 | [diff] [blame] | 4438 | if (idle_cpu(i) && !first_idle_cpu && | 
|  | 4439 | cpumask_test_cpu(i, sched_group_mask(group))) { | 
| Peter Zijlstra | 04f733b | 2012-05-11 00:12:02 +0200 | [diff] [blame] | 4440 | first_idle_cpu = 1; | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4441 | balance_cpu = i; | 
|  | 4442 | } | 
| Peter Zijlstra | 04f733b | 2012-05-11 00:12:02 +0200 | [diff] [blame] | 4443 |  | 
|  | 4444 | load = target_load(i, load_idx); | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4445 | } else { | 
|  | 4446 | load = source_load(i, load_idx); | 
| Peter Zijlstra | e44bc5c | 2012-05-11 00:22:12 +0200 | [diff] [blame] | 4447 | if (load > max_cpu_load) | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4448 | max_cpu_load = load; | 
|  | 4449 | if (min_cpu_load > load) | 
|  | 4450 | min_cpu_load = load; | 
| Peter Zijlstra | e44bc5c | 2012-05-11 00:22:12 +0200 | [diff] [blame] | 4451 |  | 
|  | 4452 | if (nr_running > max_nr_running) | 
|  | 4453 | max_nr_running = nr_running; | 
|  | 4454 | if (min_nr_running > nr_running) | 
|  | 4455 | min_nr_running = nr_running; | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4456 | } | 
|  | 4457 |  | 
|  | 4458 | sgs->group_load += load; | 
| Peter Zijlstra | e44bc5c | 2012-05-11 00:22:12 +0200 | [diff] [blame] | 4459 | sgs->sum_nr_running += nr_running; | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4460 | sgs->sum_weighted_load += weighted_cpuload(i); | 
| Suresh Siddha | aae6d3d | 2010-09-17 15:02:32 -0700 | [diff] [blame] | 4461 | if (idle_cpu(i)) | 
|  | 4462 | sgs->idle_cpus++; | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4463 | } | 
|  | 4464 |  | 
|  | 4465 | /* | 
|  | 4466 | * First idle cpu or the first cpu (busiest) in this sched group | 
|  | 4467 | * is eligible for doing load balancing at this and above | 
|  | 4468 | * domains. In the newly idle case, we will allow all the cpus | 
|  | 4469 | * to do the newly idle load balance. | 
|  | 4470 | */ | 
| Vincent Guittot | 4ec4412 | 2011-12-12 20:21:08 +0100 | [diff] [blame] | 4471 | if (local_group) { | 
| Peter Zijlstra | bd939f4 | 2012-05-02 14:20:37 +0200 | [diff] [blame] | 4472 | if (env->idle != CPU_NEWLY_IDLE) { | 
| Peter Zijlstra | 04f733b | 2012-05-11 00:12:02 +0200 | [diff] [blame] | 4473 | if (balance_cpu != env->dst_cpu) { | 
| Vincent Guittot | 4ec4412 | 2011-12-12 20:21:08 +0100 | [diff] [blame] | 4474 | *balance = 0; | 
|  | 4475 | return; | 
|  | 4476 | } | 
| Peter Zijlstra | bd939f4 | 2012-05-02 14:20:37 +0200 | [diff] [blame] | 4477 | update_group_power(env->sd, env->dst_cpu); | 
| Vincent Guittot | 4ec4412 | 2011-12-12 20:21:08 +0100 | [diff] [blame] | 4478 | } else if (time_after_eq(jiffies, group->sgp->next_update)) | 
| Peter Zijlstra | bd939f4 | 2012-05-02 14:20:37 +0200 | [diff] [blame] | 4479 | update_group_power(env->sd, env->dst_cpu); | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4480 | } | 
|  | 4481 |  | 
|  | 4482 | /* Adjust by relative CPU power of the group */ | 
| Peter Zijlstra | 9c3f75c | 2011-07-14 13:00:06 +0200 | [diff] [blame] | 4483 | sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power; | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4484 |  | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4485 | /* | 
|  | 4486 | * Consider the group unbalanced when the imbalance is larger | 
| Peter Zijlstra | 866ab43 | 2011-02-21 18:56:47 +0100 | [diff] [blame] | 4487 | * than the average weight of a task. | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4488 | * | 
|  | 4489 | * APZ: with cgroup the avg task weight can vary wildly and | 
|  | 4490 | *      might not be a suitable number - should we keep a | 
|  | 4491 | *      normalized nr_running number somewhere that negates | 
|  | 4492 | *      the hierarchy? | 
|  | 4493 | */ | 
| Suresh Siddha | dd5feea | 2010-02-23 16:13:52 -0800 | [diff] [blame] | 4494 | if (sgs->sum_nr_running) | 
|  | 4495 | avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4496 |  | 
| Peter Zijlstra | e44bc5c | 2012-05-11 00:22:12 +0200 | [diff] [blame] | 4497 | if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && | 
|  | 4498 | (max_nr_running - min_nr_running) > 1) | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4499 | sgs->group_imb = 1; | 
|  | 4500 |  | 
| Peter Zijlstra | 9c3f75c | 2011-07-14 13:00:06 +0200 | [diff] [blame] | 4501 | sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power, | 
| Nikhil Rao | 1399fa7 | 2011-05-18 10:09:39 -0700 | [diff] [blame] | 4502 | SCHED_POWER_SCALE); | 
| Srivatsa Vaddagiri | 9d5efe0 | 2010-06-08 14:57:02 +1000 | [diff] [blame] | 4503 | if (!sgs->group_capacity) | 
| Peter Zijlstra | bd939f4 | 2012-05-02 14:20:37 +0200 | [diff] [blame] | 4504 | sgs->group_capacity = fix_small_capacity(env->sd, group); | 
| Suresh Siddha | aae6d3d | 2010-09-17 15:02:32 -0700 | [diff] [blame] | 4505 | sgs->group_weight = group->group_weight; | 
| Nikhil Rao | fab4762 | 2010-10-15 13:12:29 -0700 | [diff] [blame] | 4506 |  | 
|  | 4507 | if (sgs->group_capacity > sgs->sum_nr_running) | 
|  | 4508 | sgs->group_has_capacity = 1; | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4509 | } | 
|  | 4510 |  | 
|  | 4511 | /** | 
| Michael Neuling | 532cb4c | 2010-06-08 14:57:02 +1000 | [diff] [blame] | 4512 | * update_sd_pick_busiest - return 1 on busiest group | 
| Randy Dunlap | cd96891 | 2012-06-08 13:18:33 -0700 | [diff] [blame] | 4513 | * @env: The load balancing environment. | 
| Michael Neuling | 532cb4c | 2010-06-08 14:57:02 +1000 | [diff] [blame] | 4514 | * @sds: sched_domain statistics | 
|  | 4515 | * @sg: sched_group candidate to be checked for being the busiest | 
| Michael Neuling | b6b1229 | 2010-06-10 12:06:21 +1000 | [diff] [blame] | 4516 | * @sgs: sched_group statistics | 
| Michael Neuling | 532cb4c | 2010-06-08 14:57:02 +1000 | [diff] [blame] | 4517 | * | 
|  | 4518 | * Determine if @sg is a busier group than the previously selected | 
|  | 4519 | * busiest group. | 
|  | 4520 | */ | 
| Peter Zijlstra | bd939f4 | 2012-05-02 14:20:37 +0200 | [diff] [blame] | 4521 | static bool update_sd_pick_busiest(struct lb_env *env, | 
| Michael Neuling | 532cb4c | 2010-06-08 14:57:02 +1000 | [diff] [blame] | 4522 | struct sd_lb_stats *sds, | 
|  | 4523 | struct sched_group *sg, | 
| Peter Zijlstra | bd939f4 | 2012-05-02 14:20:37 +0200 | [diff] [blame] | 4524 | struct sg_lb_stats *sgs) | 
| Michael Neuling | 532cb4c | 2010-06-08 14:57:02 +1000 | [diff] [blame] | 4525 | { | 
|  | 4526 | if (sgs->avg_load <= sds->max_load) | 
|  | 4527 | return false; | 
|  | 4528 |  | 
|  | 4529 | if (sgs->sum_nr_running > sgs->group_capacity) | 
|  | 4530 | return true; | 
|  | 4531 |  | 
|  | 4532 | if (sgs->group_imb) | 
|  | 4533 | return true; | 
|  | 4534 |  | 
|  | 4535 | /* | 
|  | 4536 | * ASYM_PACKING needs to move all the work to the lowest | 
|  | 4537 | * numbered CPUs in the group, therefore mark all groups | 
|  | 4538 | * higher than ourselves as busy. | 
|  | 4539 | */ | 
| Peter Zijlstra | bd939f4 | 2012-05-02 14:20:37 +0200 | [diff] [blame] | 4540 | if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running && | 
|  | 4541 | env->dst_cpu < group_first_cpu(sg)) { | 
| Michael Neuling | 532cb4c | 2010-06-08 14:57:02 +1000 | [diff] [blame] | 4542 | if (!sds->busiest) | 
|  | 4543 | return true; | 
|  | 4544 |  | 
|  | 4545 | if (group_first_cpu(sds->busiest) > group_first_cpu(sg)) | 
|  | 4546 | return true; | 
|  | 4547 | } | 
|  | 4548 |  | 
|  | 4549 | return false; | 
|  | 4550 | } | 
|  | 4551 |  | 
|  | 4552 | /** | 
| Hui Kang | 461819a | 2011-10-11 23:00:59 -0400 | [diff] [blame] | 4553 | * update_sd_lb_stats - Update sched_domain's statistics for load balancing. | 
| Randy Dunlap | cd96891 | 2012-06-08 13:18:33 -0700 | [diff] [blame] | 4554 | * @env: The load balancing environment. | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4555 | * @balance: Should we balance. | 
|  | 4556 | * @sds: variable to hold the statistics for this sched_domain. | 
|  | 4557 | */ | 
| Peter Zijlstra | bd939f4 | 2012-05-02 14:20:37 +0200 | [diff] [blame] | 4558 | static inline void update_sd_lb_stats(struct lb_env *env, | 
| Michael Wang | b940313 | 2012-07-12 16:10:13 +0800 | [diff] [blame] | 4559 | int *balance, struct sd_lb_stats *sds) | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4560 | { | 
| Peter Zijlstra | bd939f4 | 2012-05-02 14:20:37 +0200 | [diff] [blame] | 4561 | struct sched_domain *child = env->sd->child; | 
|  | 4562 | struct sched_group *sg = env->sd->groups; | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4563 | struct sg_lb_stats sgs; | 
|  | 4564 | int load_idx, prefer_sibling = 0; | 
|  | 4565 |  | 
|  | 4566 | if (child && child->flags & SD_PREFER_SIBLING) | 
|  | 4567 | prefer_sibling = 1; | 
|  | 4568 |  | 
| Peter Zijlstra | bd939f4 | 2012-05-02 14:20:37 +0200 | [diff] [blame] | 4569 | load_idx = get_sd_load_idx(env->sd, env->idle); | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4570 |  | 
|  | 4571 | do { | 
|  | 4572 | int local_group; | 
|  | 4573 |  | 
| Peter Zijlstra | bd939f4 | 2012-05-02 14:20:37 +0200 | [diff] [blame] | 4574 | local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg)); | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4575 | memset(&sgs, 0, sizeof(sgs)); | 
| Michael Wang | b940313 | 2012-07-12 16:10:13 +0800 | [diff] [blame] | 4576 | update_sg_lb_stats(env, sg, load_idx, local_group, balance, &sgs); | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4577 |  | 
| Peter Zijlstra | 8f190fb | 2009-12-24 14:18:21 +0100 | [diff] [blame] | 4578 | if (local_group && !(*balance)) | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4579 | return; | 
|  | 4580 |  | 
|  | 4581 | sds->total_load += sgs.group_load; | 
| Peter Zijlstra | 9c3f75c | 2011-07-14 13:00:06 +0200 | [diff] [blame] | 4582 | sds->total_pwr += sg->sgp->power; | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4583 |  | 
|  | 4584 | /* | 
|  | 4585 | * In case the child domain prefers tasks go to siblings | 
| Michael Neuling | 532cb4c | 2010-06-08 14:57:02 +1000 | [diff] [blame] | 4586 | * first, lower the sg capacity to one so that we'll try | 
| Nikhil Rao | 75dd321 | 2010-10-15 13:12:30 -0700 | [diff] [blame] | 4587 | * and move all the excess tasks away. We lower the capacity | 
|  | 4588 | * of a group only if the local group has the capacity to fit | 
|  | 4589 | * these excess tasks, i.e. nr_running < group_capacity. The | 
|  | 4590 | * extra check prevents the case where you always pull from the | 
|  | 4591 | * heaviest group when it is already under-utilized (possible | 
|  | 4592 | * when a large-weight task outweighs the other tasks on the system). | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4593 | */ | 
| Nikhil Rao | 75dd321 | 2010-10-15 13:12:30 -0700 | [diff] [blame] | 4594 | if (prefer_sibling && !local_group && sds->this_has_capacity) | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4595 | sgs.group_capacity = min(sgs.group_capacity, 1UL); | 
|  | 4596 |  | 
|  | 4597 | if (local_group) { | 
|  | 4598 | sds->this_load = sgs.avg_load; | 
| Michael Neuling | 532cb4c | 2010-06-08 14:57:02 +1000 | [diff] [blame] | 4599 | sds->this = sg; | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4600 | sds->this_nr_running = sgs.sum_nr_running; | 
|  | 4601 | sds->this_load_per_task = sgs.sum_weighted_load; | 
| Nikhil Rao | fab4762 | 2010-10-15 13:12:29 -0700 | [diff] [blame] | 4602 | sds->this_has_capacity = sgs.group_has_capacity; | 
| Suresh Siddha | aae6d3d | 2010-09-17 15:02:32 -0700 | [diff] [blame] | 4603 | sds->this_idle_cpus = sgs.idle_cpus; | 
| Peter Zijlstra | bd939f4 | 2012-05-02 14:20:37 +0200 | [diff] [blame] | 4604 | } else if (update_sd_pick_busiest(env, sds, sg, &sgs)) { | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4605 | sds->max_load = sgs.avg_load; | 
| Michael Neuling | 532cb4c | 2010-06-08 14:57:02 +1000 | [diff] [blame] | 4606 | sds->busiest = sg; | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4607 | sds->busiest_nr_running = sgs.sum_nr_running; | 
| Suresh Siddha | aae6d3d | 2010-09-17 15:02:32 -0700 | [diff] [blame] | 4608 | sds->busiest_idle_cpus = sgs.idle_cpus; | 
| Suresh Siddha | dd5feea | 2010-02-23 16:13:52 -0800 | [diff] [blame] | 4609 | sds->busiest_group_capacity = sgs.group_capacity; | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4610 | sds->busiest_load_per_task = sgs.sum_weighted_load; | 
| Nikhil Rao | fab4762 | 2010-10-15 13:12:29 -0700 | [diff] [blame] | 4611 | sds->busiest_has_capacity = sgs.group_has_capacity; | 
| Suresh Siddha | aae6d3d | 2010-09-17 15:02:32 -0700 | [diff] [blame] | 4612 | sds->busiest_group_weight = sgs.group_weight; | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4613 | sds->group_imb = sgs.group_imb; | 
|  | 4614 | } | 
|  | 4615 |  | 
| Michael Neuling | 532cb4c | 2010-06-08 14:57:02 +1000 | [diff] [blame] | 4616 | sg = sg->next; | 
| Peter Zijlstra | bd939f4 | 2012-05-02 14:20:37 +0200 | [diff] [blame] | 4617 | } while (sg != env->sd->groups); | 
| Michael Neuling | 532cb4c | 2010-06-08 14:57:02 +1000 | [diff] [blame] | 4618 | } | 
|  | 4619 |  | 
| Michael Neuling | 532cb4c | 2010-06-08 14:57:02 +1000 | [diff] [blame] | 4620 | /** | 
|  | 4621 | * check_asym_packing - Check to see if the group is packed into the | 
|  | 4622 | *			sched domain. | 
|  | 4623 | * | 
|  | 4624 | * This is primarily intended to be used at the sibling level.  Some | 
|  | 4625 | * cores like POWER7 prefer to use lower numbered SMT threads.  In the | 
|  | 4626 | * case of POWER7, it can move to lower SMT modes only when higher | 
|  | 4627 | * threads are idle.  When in lower SMT modes, the threads will | 
|  | 4628 | * perform better since they share less core resources.  Hence when we | 
|  | 4629 | * have idle threads, we want them to be the higher ones. | 
|  | 4630 | * | 
|  | 4631 | * This packing function is run on idle threads.  It checks to see if | 
|  | 4632 | * the busiest CPU in this domain (core in the P7 case) has a higher | 
|  | 4633 | * CPU number than the packing function is being run on.  Here we are | 
|  | 4634 | * assuming a lower CPU number is equivalent to a lower SMT thread | 
|  | 4635 | * number. | 
|  | 4636 | * | 
| Michael Neuling | b6b1229 | 2010-06-10 12:06:21 +1000 | [diff] [blame] | 4637 | * Returns 1 when packing is required and a task should be moved to | 
|  | 4638 | * this CPU.  The amount of the imbalance is returned in env->imbalance. | 
|  | 4639 | * | 
| Randy Dunlap | cd96891 | 2012-06-08 13:18:33 -0700 | [diff] [blame] | 4640 | * @env: The load balancing environment. | 
| Michael Neuling | 532cb4c | 2010-06-08 14:57:02 +1000 | [diff] [blame] | 4641 | * @sds: Statistics of the sched_domain which is to be packed | 
| Michael Neuling | 532cb4c | 2010-06-08 14:57:02 +1000 | [diff] [blame] | 4642 | */ | 
| Peter Zijlstra | bd939f4 | 2012-05-02 14:20:37 +0200 | [diff] [blame] | 4643 | static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) | 
| Michael Neuling | 532cb4c | 2010-06-08 14:57:02 +1000 | [diff] [blame] | 4644 | { | 
|  | 4645 | int busiest_cpu; | 
|  | 4646 |  | 
| Peter Zijlstra | bd939f4 | 2012-05-02 14:20:37 +0200 | [diff] [blame] | 4647 | if (!(env->sd->flags & SD_ASYM_PACKING)) | 
| Michael Neuling | 532cb4c | 2010-06-08 14:57:02 +1000 | [diff] [blame] | 4648 | return 0; | 
|  | 4649 |  | 
|  | 4650 | if (!sds->busiest) | 
|  | 4651 | return 0; | 
|  | 4652 |  | 
|  | 4653 | busiest_cpu = group_first_cpu(sds->busiest); | 
| Peter Zijlstra | bd939f4 | 2012-05-02 14:20:37 +0200 | [diff] [blame] | 4654 | if (env->dst_cpu > busiest_cpu) | 
| Michael Neuling | 532cb4c | 2010-06-08 14:57:02 +1000 | [diff] [blame] | 4655 | return 0; | 
|  | 4656 |  | 
| Peter Zijlstra | bd939f4 | 2012-05-02 14:20:37 +0200 | [diff] [blame] | 4657 | env->imbalance = DIV_ROUND_CLOSEST( | 
|  | 4658 | sds->max_load * sds->busiest->sgp->power, SCHED_POWER_SCALE); | 
|  | 4659 |  | 
| Michael Neuling | 532cb4c | 2010-06-08 14:57:02 +1000 | [diff] [blame] | 4660 | return 1; | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4661 | } | 
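As a hedged numeric illustration of the imbalance computed above (values invented: the busiest, higher-numbered group carries avg_load 1536 on a group power of 589):

#include <stdio.h>

#define SCHED_POWER_SCALE	1024UL
#define DIV_ROUND_CLOSEST(x, d)	(((x) + ((d) / 2)) / (d))

int main(void)
{
	unsigned long max_load = 1536, busiest_power = 589;

	/* prints 884: the weighted load to pull toward the lower cpu */
	printf("env->imbalance = %lu\n",
	       DIV_ROUND_CLOSEST(max_load * busiest_power, SCHED_POWER_SCALE));
	return 0;
}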
|  | 4662 |  | 
|  | 4663 | /** | 
|  | 4664 | * fix_small_imbalance - Calculate the minor imbalance that exists | 
|  | 4665 | *			amongst the groups of a sched_domain, during | 
|  | 4666 | *			load balancing. | 
| Randy Dunlap | cd96891 | 2012-06-08 13:18:33 -0700 | [diff] [blame] | 4667 | * @env: The load balancing environment. | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4668 | * @sds: Statistics of the sched_domain whose imbalance is to be calculated. | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4669 | */ | 
| Peter Zijlstra | bd939f4 | 2012-05-02 14:20:37 +0200 | [diff] [blame] | 4670 | static inline | 
|  | 4671 | void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4672 | { | 
|  | 4673 | unsigned long tmp, pwr_now = 0, pwr_move = 0; | 
|  | 4674 | unsigned int imbn = 2; | 
| Suresh Siddha | dd5feea | 2010-02-23 16:13:52 -0800 | [diff] [blame] | 4675 | unsigned long scaled_busy_load_per_task; | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4676 |  | 
|  | 4677 | if (sds->this_nr_running) { | 
|  | 4678 | sds->this_load_per_task /= sds->this_nr_running; | 
|  | 4679 | if (sds->busiest_load_per_task > | 
|  | 4680 | sds->this_load_per_task) | 
|  | 4681 | imbn = 1; | 
| Peter Zijlstra | bd939f4 | 2012-05-02 14:20:37 +0200 | [diff] [blame] | 4682 | } else { | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4683 | sds->this_load_per_task = | 
| Peter Zijlstra | bd939f4 | 2012-05-02 14:20:37 +0200 | [diff] [blame] | 4684 | cpu_avg_load_per_task(env->dst_cpu); | 
|  | 4685 | } | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4686 |  | 
| Suresh Siddha | dd5feea | 2010-02-23 16:13:52 -0800 | [diff] [blame] | 4687 | scaled_busy_load_per_task = sds->busiest_load_per_task | 
| Nikhil Rao | 1399fa7 | 2011-05-18 10:09:39 -0700 | [diff] [blame] | 4688 | * SCHED_POWER_SCALE; | 
| Peter Zijlstra | 9c3f75c | 2011-07-14 13:00:06 +0200 | [diff] [blame] | 4689 | scaled_busy_load_per_task /= sds->busiest->sgp->power; | 
| Suresh Siddha | dd5feea | 2010-02-23 16:13:52 -0800 | [diff] [blame] | 4690 |  | 
|  | 4691 | if (sds->max_load - sds->this_load + scaled_busy_load_per_task >= | 
|  | 4692 | (scaled_busy_load_per_task * imbn)) { | 
| Peter Zijlstra | bd939f4 | 2012-05-02 14:20:37 +0200 | [diff] [blame] | 4693 | env->imbalance = sds->busiest_load_per_task; | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4694 | return; | 
|  | 4695 | } | 
|  | 4696 |  | 
|  | 4697 | /* | 
|  | 4698 | * OK, we don't have enough imbalance to justify moving tasks, | 
|  | 4699 | * however we may be able to increase total CPU power used by | 
|  | 4700 | * moving them. | 
|  | 4701 | */ | 
|  | 4702 |  | 
| Peter Zijlstra | 9c3f75c | 2011-07-14 13:00:06 +0200 | [diff] [blame] | 4703 | pwr_now += sds->busiest->sgp->power * | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4704 | min(sds->busiest_load_per_task, sds->max_load); | 
| Peter Zijlstra | 9c3f75c | 2011-07-14 13:00:06 +0200 | [diff] [blame] | 4705 | pwr_now += sds->this->sgp->power * | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4706 | min(sds->this_load_per_task, sds->this_load); | 
| Nikhil Rao | 1399fa7 | 2011-05-18 10:09:39 -0700 | [diff] [blame] | 4707 | pwr_now /= SCHED_POWER_SCALE; | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4708 |  | 
|  | 4709 | /* Amount of load we'd subtract */ | 
| Nikhil Rao | 1399fa7 | 2011-05-18 10:09:39 -0700 | [diff] [blame] | 4710 | tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) / | 
| Peter Zijlstra | 9c3f75c | 2011-07-14 13:00:06 +0200 | [diff] [blame] | 4711 | sds->busiest->sgp->power; | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4712 | if (sds->max_load > tmp) | 
| Peter Zijlstra | 9c3f75c | 2011-07-14 13:00:06 +0200 | [diff] [blame] | 4713 | pwr_move += sds->busiest->sgp->power * | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4714 | min(sds->busiest_load_per_task, sds->max_load - tmp); | 
|  | 4715 |  | 
|  | 4716 | /* Amount of load we'd add */ | 
| Peter Zijlstra | 9c3f75c | 2011-07-14 13:00:06 +0200 | [diff] [blame] | 4717 | if (sds->max_load * sds->busiest->sgp->power < | 
| Nikhil Rao | 1399fa7 | 2011-05-18 10:09:39 -0700 | [diff] [blame] | 4718 | sds->busiest_load_per_task * SCHED_POWER_SCALE) | 
| Peter Zijlstra | 9c3f75c | 2011-07-14 13:00:06 +0200 | [diff] [blame] | 4719 | tmp = (sds->max_load * sds->busiest->sgp->power) / | 
|  | 4720 | sds->this->sgp->power; | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4721 | else | 
| Nikhil Rao | 1399fa7 | 2011-05-18 10:09:39 -0700 | [diff] [blame] | 4722 | tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) / | 
| Peter Zijlstra | 9c3f75c | 2011-07-14 13:00:06 +0200 | [diff] [blame] | 4723 | sds->this->sgp->power; | 
|  | 4724 | pwr_move += sds->this->sgp->power * | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4725 | min(sds->this_load_per_task, sds->this_load + tmp); | 
| Nikhil Rao | 1399fa7 | 2011-05-18 10:09:39 -0700 | [diff] [blame] | 4726 | pwr_move /= SCHED_POWER_SCALE; | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4727 |  | 
|  | 4728 | /* Move if we gain throughput */ | 
|  | 4729 | if (pwr_move > pwr_now) | 
| Peter Zijlstra | bd939f4 | 2012-05-02 14:20:37 +0200 | [diff] [blame] | 4730 | env->imbalance = sds->busiest_load_per_task; | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4731 | } | 
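To make the pwr_now/pwr_move throughput heuristic concrete, here is a hedged standalone replay of its arithmetic with invented numbers. For brevity it takes only the branch of the "load we'd add" computation where a whole task's worth of load fits on the destination; nothing here is kernel API.

#include <stdio.h>

#define SCHED_POWER_SCALE 1024UL

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

int main(void)
{
	/* Hypothetical two-group snapshot: the busiest group runs two
	 * tasks of weight 512 on one cpu, the local group is idle.
	 * Both groups have one cpu of nominal power. */
	unsigned long busiest_power = SCHED_POWER_SCALE;
	unsigned long this_power = SCHED_POWER_SCALE;
	unsigned long max_load = 1024, this_load = 0;
	unsigned long busiest_lpt = 512, this_lpt = 512; /* load/task */
	unsigned long pwr_now = 0, pwr_move = 0, tmp;

	/* Throughput if nothing moves. */
	pwr_now += busiest_power * min_ul(busiest_lpt, max_load);
	pwr_now += this_power * min_ul(this_lpt, this_load);
	pwr_now /= SCHED_POWER_SCALE;

	/* Throughput if one task of weight busiest_lpt moves over
	 * (taking the branch where a full task's load can move). */
	tmp = (busiest_lpt * SCHED_POWER_SCALE) / busiest_power;
	if (max_load > tmp)
		pwr_move += busiest_power *
			    min_ul(busiest_lpt, max_load - tmp);
	tmp = (busiest_lpt * SCHED_POWER_SCALE) / this_power;
	pwr_move += this_power * min_ul(this_lpt, this_load + tmp);
	pwr_move /= SCHED_POWER_SCALE;

	/* 512 -> 1024: moving one task doubles usable throughput. */
	printf("pwr_now=%lu pwr_move=%lu -> %s\n", pwr_now, pwr_move,
	       pwr_move > pwr_now ? "move" : "stay");
	return 0;
}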
|  | 4732 |  | 
|  | 4733 | /** | 
|  | 4734 | * calculate_imbalance - Calculate the amount of imbalance present within the | 
|  | 4735 | *			 groups of a given sched_domain during load balance. | 
| Peter Zijlstra | bd939f4 | 2012-05-02 14:20:37 +0200 | [diff] [blame] | 4736 | * @env: load balance environment | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4737 | * @sds: statistics of the sched_domain whose imbalance is to be calculated. | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4738 | */ | 
| Peter Zijlstra | bd939f4 | 2012-05-02 14:20:37 +0200 | [diff] [blame] | 4739 | static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds) | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4740 | { | 
| Suresh Siddha | dd5feea | 2010-02-23 16:13:52 -0800 | [diff] [blame] | 4741 | unsigned long max_pull, load_above_capacity = ~0UL; | 
|  | 4742 |  | 
|  | 4743 | sds->busiest_load_per_task /= sds->busiest_nr_running; | 
|  | 4744 | if (sds->group_imb) { | 
|  | 4745 | sds->busiest_load_per_task = | 
|  | 4746 | min(sds->busiest_load_per_task, sds->avg_load); | 
|  | 4747 | } | 
|  | 4748 |  | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4749 | /* | 
|  | 4750 | * In the presence of smp nice balancing, certain scenarios can have | 
|  | 4751 | * max load less than avg load (as we skip the groups at or below | 
|  | 4752 | * its cpu_power while calculating max_load). | 
|  | 4753 | */ | 
|  | 4754 | if (sds->max_load < sds->avg_load) { | 
| Peter Zijlstra | bd939f4 | 2012-05-02 14:20:37 +0200 | [diff] [blame] | 4755 | env->imbalance = 0; | 
|  | 4756 | return fix_small_imbalance(env, sds); | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4757 | } | 
|  | 4758 |  | 
| Suresh Siddha | dd5feea | 2010-02-23 16:13:52 -0800 | [diff] [blame] | 4759 | if (!sds->group_imb) { | 
|  | 4760 | /* | 
|  | 4761 | * Don't want to pull so many tasks that a group would go idle. | 
|  | 4762 | */ | 
|  | 4763 | load_above_capacity = (sds->busiest_nr_running - | 
|  | 4764 | sds->busiest_group_capacity); | 
|  | 4765 |  | 
| Nikhil Rao | 1399fa7 | 2011-05-18 10:09:39 -0700 | [diff] [blame] | 4766 | load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE); | 
| Suresh Siddha | dd5feea | 2010-02-23 16:13:52 -0800 | [diff] [blame] | 4767 |  | 
| Peter Zijlstra | 9c3f75c | 2011-07-14 13:00:06 +0200 | [diff] [blame] | 4768 | load_above_capacity /= sds->busiest->sgp->power; | 
| Suresh Siddha | dd5feea | 2010-02-23 16:13:52 -0800 | [diff] [blame] | 4769 | } | 
|  | 4770 |  | 
|  | 4771 | /* | 
|  | 4772 | * We're trying to get all the cpus to the average_load, so we don't | 
|  | 4773 | * want to push ourselves above the average load, nor do we wish to | 
|  | 4774 | * reduce the max loaded cpu below the average load. At the same time, | 
|  | 4775 | * we also don't want to reduce the group load below the group capacity | 
|  | 4776 | * (so that we can implement power-savings policies etc). Thus we look | 
|  | 4777 | * for the minimum possible imbalance. | 
|  | 4778 | * Be careful of negative numbers as they'll appear as very large values | 
|  | 4779 | * with unsigned longs. | 
|  | 4780 | */ | 
|  | 4781 | max_pull = min(sds->max_load - sds->avg_load, load_above_capacity); | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4782 |  | 
|  | 4783 | /* How much load to actually move to equalise the imbalance */ | 
| Peter Zijlstra | bd939f4 | 2012-05-02 14:20:37 +0200 | [diff] [blame] | 4784 | env->imbalance = min(max_pull * sds->busiest->sgp->power, | 
| Peter Zijlstra | 9c3f75c | 2011-07-14 13:00:06 +0200 | [diff] [blame] | 4785 | (sds->avg_load - sds->this_load) * sds->this->sgp->power) | 
| Nikhil Rao | 1399fa7 | 2011-05-18 10:09:39 -0700 | [diff] [blame] | 4786 | / SCHED_POWER_SCALE; | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4787 |  | 
|  | 4788 | /* | 
|  | 4789 | * if env->imbalance is less than the average load per runnable task | 
| Lucas De Marchi | 25985ed | 2011-03-30 22:57:33 -0300 | [diff] [blame] | 4790 | * there is no guarantee that any tasks will be moved, so we'll need | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4791 | * to think about bumping its value to force at least one task to be | 
|  | 4792 | * moved. | 
|  | 4793 | */ | 
| Peter Zijlstra | bd939f4 | 2012-05-02 14:20:37 +0200 | [diff] [blame] | 4794 | if (env->imbalance < sds->busiest_load_per_task) | 
|  | 4795 | return fix_small_imbalance(env, sds); | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4796 |  | 
|  | 4797 | } | 
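A worked instance of the clamping above, as a hedged userspace sketch: SCHED_LOAD_SCALE and SCHED_POWER_SCALE are assumed to be 1024, and all group statistics are invented for illustration.

#include <stdio.h>

#define SCHED_LOAD_SCALE  1024UL
#define SCHED_POWER_SCALE 1024UL

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

int main(void)
{
	/* Hypothetical domain snapshot (all values illustrative). */
	unsigned long busiest_power = SCHED_POWER_SCALE;
	unsigned long this_power = SCHED_POWER_SCALE;
	unsigned long avg_load = 768, max_load = 1280, this_load = 256;
	unsigned long nr_running = 3, capacity = 1;	/* busiest group */

	/* Tasks beyond capacity, expressed as scaled load. */
	unsigned long load_above_capacity =
		(nr_running - capacity) * SCHED_LOAD_SCALE *
		SCHED_POWER_SCALE / busiest_power;		/* 2048 */

	/* Pull no more than what brings busiest down to the average. */
	unsigned long max_pull =
		min_ul(max_load - avg_load, load_above_capacity); /* 512 */

	/* ...and no more than what brings us up to the average. */
	unsigned long imbalance =
		min_ul(max_pull * busiest_power,
		       (avg_load - this_load) * this_power)
		/ SCHED_POWER_SCALE;				/* 512 */

	printf("imbalance = %lu\n", imbalance);
	return 0;
}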
| Nikhil Rao | fab4762 | 2010-10-15 13:12:29 -0700 | [diff] [blame] | 4798 |  | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4799 | /******* find_busiest_group() helpers end here *********************/ | 
|  | 4800 |  | 
|  | 4801 | /** | 
|  | 4802 | * find_busiest_group - Returns the busiest group within the sched_domain | 
|  | 4803 | * if there is an imbalance. If there isn't an imbalance, and | 
|  | 4804 | * the user has opted for power-savings, it returns a group whose | 
|  | 4805 | * CPUs can be put to idle by rebalancing those tasks elsewhere, if | 
|  | 4806 | * such a group exists. | 
|  | 4807 | * | 
|  | 4808 | * Also calculates the amount of weighted load which should be moved | 
|  | 4809 | * to restore balance. | 
|  | 4810 | * | 
| Randy Dunlap | cd96891 | 2012-06-08 13:18:33 -0700 | [diff] [blame] | 4811 | * @env: The load balancing environment. | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4812 | * @balance: Pointer to a variable indicating if this_cpu | 
|  | 4813 | *	is the appropriate cpu to perform load balancing at this level. | 
|  | 4814 | * | 
|  | 4815 | * Returns:	- the busiest group if imbalance exists. | 
|  | 4816 | *		- If no imbalance and user has opted for power-savings balance, | 
|  | 4817 | *		   return the least loaded group whose CPUs can be | 
|  | 4818 | *		   put to idle by rebalancing its tasks onto our group. | 
|  | 4819 | */ | 
|  | 4820 | static struct sched_group * | 
| Michael Wang | b940313 | 2012-07-12 16:10:13 +0800 | [diff] [blame] | 4821 | find_busiest_group(struct lb_env *env, int *balance) | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4822 | { | 
|  | 4823 | struct sd_lb_stats sds; | 
|  | 4824 |  | 
|  | 4825 | memset(&sds, 0, sizeof(sds)); | 
|  | 4826 |  | 
|  | 4827 | /* | 
|  | 4828 | * Compute the various statistics relevant for load balancing at | 
|  | 4829 | * this level. | 
|  | 4830 | */ | 
| Michael Wang | b940313 | 2012-07-12 16:10:13 +0800 | [diff] [blame] | 4831 | update_sd_lb_stats(env, balance, &sds); | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4832 |  | 
| Peter Zijlstra | cc57aa8 | 2011-02-21 18:55:32 +0100 | [diff] [blame] | 4833 | /* | 
|  | 4834 | * this_cpu is not the appropriate cpu to perform load balancing at | 
|  | 4835 | * this level. | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4836 | */ | 
| Peter Zijlstra | 8f190fb | 2009-12-24 14:18:21 +0100 | [diff] [blame] | 4837 | if (!(*balance)) | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4838 | goto ret; | 
|  | 4839 |  | 
| Peter Zijlstra | bd939f4 | 2012-05-02 14:20:37 +0200 | [diff] [blame] | 4840 | if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) && | 
|  | 4841 | check_asym_packing(env, &sds)) | 
| Michael Neuling | 532cb4c | 2010-06-08 14:57:02 +1000 | [diff] [blame] | 4842 | return sds.busiest; | 
|  | 4843 |  | 
| Peter Zijlstra | cc57aa8 | 2011-02-21 18:55:32 +0100 | [diff] [blame] | 4844 | /* There is no busy sibling group to pull tasks from */ | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4845 | if (!sds.busiest || sds.busiest_nr_running == 0) | 
|  | 4846 | goto out_balanced; | 
|  | 4847 |  | 
| Nikhil Rao | 1399fa7 | 2011-05-18 10:09:39 -0700 | [diff] [blame] | 4848 | sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr; | 
| Ken Chen | b0432d8 | 2011-04-07 17:23:22 -0700 | [diff] [blame] | 4849 |  | 
| Peter Zijlstra | 866ab43 | 2011-02-21 18:56:47 +0100 | [diff] [blame] | 4850 | /* | 
|  | 4851 | * If the busiest group is imbalanced the checks below don't | 
|  | 4852 | * work because they assume all things are equal, which typically | 
|  | 4853 | * isn't true due to cpus_allowed constraints and the like. | 
|  | 4854 | */ | 
|  | 4855 | if (sds.group_imb) | 
|  | 4856 | goto force_balance; | 
|  | 4857 |  | 
| Peter Zijlstra | cc57aa8 | 2011-02-21 18:55:32 +0100 | [diff] [blame] | 4858 | /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ | 
| Peter Zijlstra | bd939f4 | 2012-05-02 14:20:37 +0200 | [diff] [blame] | 4859 | if (env->idle == CPU_NEWLY_IDLE && sds.this_has_capacity && | 
| Nikhil Rao | fab4762 | 2010-10-15 13:12:29 -0700 | [diff] [blame] | 4860 | !sds.busiest_has_capacity) | 
|  | 4861 | goto force_balance; | 
|  | 4862 |  | 
| Peter Zijlstra | cc57aa8 | 2011-02-21 18:55:32 +0100 | [diff] [blame] | 4863 | /* | 
|  | 4864 | * If the local group is more busy than the selected busiest group | 
|  | 4865 | * don't try and pull any tasks. | 
|  | 4866 | */ | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4867 | if (sds.this_load >= sds.max_load) | 
|  | 4868 | goto out_balanced; | 
|  | 4869 |  | 
| Peter Zijlstra | cc57aa8 | 2011-02-21 18:55:32 +0100 | [diff] [blame] | 4870 | /* | 
|  | 4871 | * Don't pull any tasks if this group is already above the domain | 
|  | 4872 | * average load. | 
|  | 4873 | */ | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4874 | if (sds.this_load >= sds.avg_load) | 
|  | 4875 | goto out_balanced; | 
|  | 4876 |  | 
| Peter Zijlstra | bd939f4 | 2012-05-02 14:20:37 +0200 | [diff] [blame] | 4877 | if (env->idle == CPU_IDLE) { | 
| Suresh Siddha | aae6d3d | 2010-09-17 15:02:32 -0700 | [diff] [blame] | 4878 | /* | 
|  | 4879 | * This cpu is idle. If the busiest group doesn't | 
|  | 4880 | * have more tasks than the number of available cpus and | 
|  | 4881 | * there is no imbalance between this and the busiest group | 
|  | 4882 | * wrt idle cpus, it is balanced. | 
|  | 4883 | */ | 
| Peter Zijlstra | c186faf | 2011-02-21 18:52:53 +0100 | [diff] [blame] | 4884 | if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) && | 
| Suresh Siddha | aae6d3d | 2010-09-17 15:02:32 -0700 | [diff] [blame] | 4885 | sds.busiest_nr_running <= sds.busiest_group_weight) | 
|  | 4886 | goto out_balanced; | 
| Peter Zijlstra | c186faf | 2011-02-21 18:52:53 +0100 | [diff] [blame] | 4887 | } else { | 
|  | 4888 | /* | 
|  | 4889 | * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use | 
|  | 4890 | * imbalance_pct to be conservative. | 
|  | 4891 | */ | 
| Peter Zijlstra | bd939f4 | 2012-05-02 14:20:37 +0200 | [diff] [blame] | 4892 | if (100 * sds.max_load <= env->sd->imbalance_pct * sds.this_load) | 
| Peter Zijlstra | c186faf | 2011-02-21 18:52:53 +0100 | [diff] [blame] | 4893 | goto out_balanced; | 
| Suresh Siddha | aae6d3d | 2010-09-17 15:02:32 -0700 | [diff] [blame] | 4894 | } | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4895 |  | 
| Nikhil Rao | fab4762 | 2010-10-15 13:12:29 -0700 | [diff] [blame] | 4896 | force_balance: | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4897 | /* Looks like there is an imbalance. Compute it */ | 
| Peter Zijlstra | bd939f4 | 2012-05-02 14:20:37 +0200 | [diff] [blame] | 4898 | calculate_imbalance(env, &sds); | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4899 | return sds.busiest; | 
|  | 4900 |  | 
|  | 4901 | out_balanced: | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4902 | ret: | 
| Peter Zijlstra | bd939f4 | 2012-05-02 14:20:37 +0200 | [diff] [blame] | 4903 | env->imbalance = 0; | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4904 | return NULL; | 
|  | 4905 | } | 
|  | 4906 |  | 
|  | 4907 | /* | 
|  | 4908 | * find_busiest_queue - find the busiest runqueue among the cpus in group. | 
|  | 4909 | */ | 
| Peter Zijlstra | bd939f4 | 2012-05-02 14:20:37 +0200 | [diff] [blame] | 4910 | static struct rq *find_busiest_queue(struct lb_env *env, | 
| Michael Wang | b940313 | 2012-07-12 16:10:13 +0800 | [diff] [blame] | 4911 | struct sched_group *group) | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4912 | { | 
|  | 4913 | struct rq *busiest = NULL, *rq; | 
|  | 4914 | unsigned long max_load = 0; | 
|  | 4915 | int i; | 
|  | 4916 |  | 
|  | 4917 | for_each_cpu(i, sched_group_cpus(group)) { | 
|  | 4918 | unsigned long power = power_of(i); | 
| Nikhil Rao | 1399fa7 | 2011-05-18 10:09:39 -0700 | [diff] [blame] | 4919 | unsigned long capacity = DIV_ROUND_CLOSEST(power, | 
|  | 4920 | SCHED_POWER_SCALE); | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4921 | unsigned long wl; | 
|  | 4922 |  | 
| Srivatsa Vaddagiri | 9d5efe0 | 2010-06-08 14:57:02 +1000 | [diff] [blame] | 4923 | if (!capacity) | 
| Peter Zijlstra | bd939f4 | 2012-05-02 14:20:37 +0200 | [diff] [blame] | 4924 | capacity = fix_small_capacity(env->sd, group); | 
| Srivatsa Vaddagiri | 9d5efe0 | 2010-06-08 14:57:02 +1000 | [diff] [blame] | 4925 |  | 
| Michael Wang | b940313 | 2012-07-12 16:10:13 +0800 | [diff] [blame] | 4926 | if (!cpumask_test_cpu(i, env->cpus)) | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4927 | continue; | 
|  | 4928 |  | 
|  | 4929 | rq = cpu_rq(i); | 
| Thomas Gleixner | 6e40f5b | 2010-02-16 16:48:56 +0100 | [diff] [blame] | 4930 | wl = weighted_cpuload(i); | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4931 |  | 
| Thomas Gleixner | 6e40f5b | 2010-02-16 16:48:56 +0100 | [diff] [blame] | 4932 | /* | 
|  | 4933 | * When comparing with imbalance, use weighted_cpuload() | 
|  | 4934 | * which is not scaled with the cpu power. | 
|  | 4935 | */ | 
| Peter Zijlstra | bd939f4 | 2012-05-02 14:20:37 +0200 | [diff] [blame] | 4936 | if (capacity && rq->nr_running == 1 && wl > env->imbalance) | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4937 | continue; | 
|  | 4938 |  | 
| Thomas Gleixner | 6e40f5b | 2010-02-16 16:48:56 +0100 | [diff] [blame] | 4939 | /* | 
|  | 4940 | * For the load comparisons with the other cpu's, consider | 
|  | 4941 | * the weighted_cpuload() scaled with the cpu power, so that | 
|  | 4942 | * the load can be moved away from the cpu that is potentially | 
|  | 4943 | * running at a lower capacity. | 
|  | 4944 | */ | 
| Nikhil Rao | 1399fa7 | 2011-05-18 10:09:39 -0700 | [diff] [blame] | 4945 | wl = (wl * SCHED_POWER_SCALE) / power; | 
| Thomas Gleixner | 6e40f5b | 2010-02-16 16:48:56 +0100 | [diff] [blame] | 4946 |  | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4947 | if (wl > max_load) { | 
|  | 4948 | max_load = wl; | 
|  | 4949 | busiest = rq; | 
|  | 4950 | } | 
|  | 4951 | } | 
|  | 4952 |  | 
|  | 4953 | return busiest; | 
|  | 4954 | } | 
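The power scaling used above is simple enough to demonstrate standalone. In this hedged sketch two cpus carry the same raw weighted load, but the one whose power is halved (hypothetically eaten by RT tasks or a smaller core) comes out busier after scaling, so it is the one load would be pulled from.

#include <stdio.h>

#define SCHED_POWER_SCALE 1024UL

/* Scale a raw weighted load by cpu power, as find_busiest_queue()
 * does, so a cpu running at reduced capacity looks busier. */
static unsigned long scaled_load(unsigned long wl, unsigned long power)
{
	return (wl * SCHED_POWER_SCALE) / power;
}

int main(void)
{
	printf("cpu0 (full power): %lu\n", scaled_load(1024, 1024)); /* 1024 */
	printf("cpu1 (half power): %lu\n", scaled_load(1024, 512));  /* 2048 */
	return 0;
}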
|  | 4955 |  | 
|  | 4956 | /* | 
|  | 4957 | * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but | 
|  | 4958 | * so long as it is large enough. | 
|  | 4959 | */ | 
|  | 4960 | #define MAX_PINNED_INTERVAL	512 | 
|  | 4961 |  | 
|  | 4962 | /* Working cpumask for load_balance and load_balance_newidle. */ | 
| Peter Zijlstra | 029632f | 2011-10-25 10:00:11 +0200 | [diff] [blame] | 4963 | DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4964 |  | 
| Peter Zijlstra | bd939f4 | 2012-05-02 14:20:37 +0200 | [diff] [blame] | 4965 | static int need_active_balance(struct lb_env *env) | 
| Peter Zijlstra | 1af3ed3 | 2009-12-23 15:10:31 +0100 | [diff] [blame] | 4966 | { | 
| Peter Zijlstra | bd939f4 | 2012-05-02 14:20:37 +0200 | [diff] [blame] | 4967 | struct sched_domain *sd = env->sd; | 
|  | 4968 |  | 
|  | 4969 | if (env->idle == CPU_NEWLY_IDLE) { | 
| Michael Neuling | 532cb4c | 2010-06-08 14:57:02 +1000 | [diff] [blame] | 4970 |  | 
|  | 4971 | /* | 
|  | 4972 | * ASYM_PACKING needs to force migrate tasks from busy but | 
|  | 4973 | * higher numbered CPUs in order to pack all tasks in the | 
|  | 4974 | * lowest numbered CPUs. | 
|  | 4975 | */ | 
| Peter Zijlstra | bd939f4 | 2012-05-02 14:20:37 +0200 | [diff] [blame] | 4976 | if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu) | 
| Michael Neuling | 532cb4c | 2010-06-08 14:57:02 +1000 | [diff] [blame] | 4977 | return 1; | 
| Peter Zijlstra | 1af3ed3 | 2009-12-23 15:10:31 +0100 | [diff] [blame] | 4978 | } | 
|  | 4979 |  | 
|  | 4980 | return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); | 
|  | 4981 | } | 
|  | 4982 |  | 
| Tejun Heo | 969c792 | 2010-05-06 18:49:21 +0200 | [diff] [blame] | 4983 | static int active_load_balance_cpu_stop(void *data); | 
|  | 4984 |  | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4985 | /* | 
|  | 4986 | * Check this_cpu to ensure it is balanced within domain. Attempt to move | 
|  | 4987 | * tasks if there is an imbalance. | 
|  | 4988 | */ | 
|  | 4989 | static int load_balance(int this_cpu, struct rq *this_rq, | 
|  | 4990 | struct sched_domain *sd, enum cpu_idle_type idle, | 
|  | 4991 | int *balance) | 
|  | 4992 | { | 
| Srivatsa Vaddagiri | 88b8dac | 2012-06-19 17:43:15 +0530 | [diff] [blame] | 4993 | int ld_moved, cur_ld_moved, active_balance = 0; | 
|  | 4994 | int lb_iterations, max_lb_iterations; | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4995 | struct sched_group *group; | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 4996 | struct rq *busiest; | 
|  | 4997 | unsigned long flags; | 
|  | 4998 | struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); | 
|  | 4999 |  | 
| Peter Zijlstra | 8e45cb5 | 2012-02-22 12:47:19 +0100 | [diff] [blame] | 5000 | struct lb_env env = { | 
|  | 5001 | .sd		= sd, | 
| Peter Zijlstra | ddcdf6e | 2012-02-22 19:27:40 +0100 | [diff] [blame] | 5002 | .dst_cpu	= this_cpu, | 
|  | 5003 | .dst_rq		= this_rq, | 
| Srivatsa Vaddagiri | 88b8dac | 2012-06-19 17:43:15 +0530 | [diff] [blame] | 5004 | .dst_grpmask    = sched_group_cpus(sd->groups), | 
| Peter Zijlstra | 8e45cb5 | 2012-02-22 12:47:19 +0100 | [diff] [blame] | 5005 | .idle		= idle, | 
| Peter Zijlstra | eb95308 | 2012-04-17 13:38:40 +0200 | [diff] [blame] | 5006 | .loop_break	= sched_nr_migrate_break, | 
| Michael Wang | b940313 | 2012-07-12 16:10:13 +0800 | [diff] [blame] | 5007 | .cpus		= cpus, | 
| Peter Zijlstra | 8e45cb5 | 2012-02-22 12:47:19 +0100 | [diff] [blame] | 5008 | }; | 
|  | 5009 |  | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 5010 | cpumask_copy(cpus, cpu_active_mask); | 
| Srivatsa Vaddagiri | 88b8dac | 2012-06-19 17:43:15 +0530 | [diff] [blame] | 5011 | max_lb_iterations = cpumask_weight(env.dst_grpmask); | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 5012 |  | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 5013 | schedstat_inc(sd, lb_count[idle]); | 
|  | 5014 |  | 
|  | 5015 | redo: | 
| Michael Wang | b940313 | 2012-07-12 16:10:13 +0800 | [diff] [blame] | 5016 | group = find_busiest_group(&env, balance); | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 5017 |  | 
|  | 5018 | if (*balance == 0) | 
|  | 5019 | goto out_balanced; | 
|  | 5020 |  | 
|  | 5021 | if (!group) { | 
|  | 5022 | schedstat_inc(sd, lb_nobusyg[idle]); | 
|  | 5023 | goto out_balanced; | 
|  | 5024 | } | 
|  | 5025 |  | 
| Michael Wang | b940313 | 2012-07-12 16:10:13 +0800 | [diff] [blame] | 5026 | busiest = find_busiest_queue(&env, group); | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 5027 | if (!busiest) { | 
|  | 5028 | schedstat_inc(sd, lb_nobusyq[idle]); | 
|  | 5029 | goto out_balanced; | 
|  | 5030 | } | 
|  | 5031 |  | 
| Michael Wang | 78feefc | 2012-08-06 16:41:59 +0800 | [diff] [blame] | 5032 | BUG_ON(busiest == env.dst_rq); | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 5033 |  | 
| Peter Zijlstra | bd939f4 | 2012-05-02 14:20:37 +0200 | [diff] [blame] | 5034 | schedstat_add(sd, lb_imbalance[idle], env.imbalance); | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 5035 |  | 
|  | 5036 | ld_moved = 0; | 
| Srivatsa Vaddagiri | 88b8dac | 2012-06-19 17:43:15 +0530 | [diff] [blame] | 5037 | lb_iterations = 1; | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 5038 | if (busiest->nr_running > 1) { | 
|  | 5039 | /* | 
|  | 5040 | * Attempt to move tasks. If find_busiest_group has found | 
|  | 5041 | * an imbalance but busiest->nr_running <= 1, the group is | 
|  | 5042 | * still unbalanced. ld_moved simply stays zero, so it is | 
|  | 5043 | * correctly treated as an imbalance. | 
|  | 5044 | */ | 
| Peter Zijlstra | 8e45cb5 | 2012-02-22 12:47:19 +0100 | [diff] [blame] | 5045 | env.flags |= LBF_ALL_PINNED; | 
| Peter Zijlstra | c82513e | 2012-04-26 13:12:27 +0200 | [diff] [blame] | 5046 | env.src_cpu   = busiest->cpu; | 
|  | 5047 | env.src_rq    = busiest; | 
|  | 5048 | env.loop_max  = min(sysctl_sched_nr_migrate, busiest->nr_running); | 
| Peter Zijlstra | 8e45cb5 | 2012-02-22 12:47:19 +0100 | [diff] [blame] | 5049 |  | 
| Peter Zijlstra | a35b646 | 2012-08-08 21:46:40 +0200 | [diff] [blame] | 5050 | update_h_load(env.src_cpu); | 
| Peter Zijlstra | 5d6523e | 2012-03-10 00:07:36 +0100 | [diff] [blame] | 5051 | more_balance: | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 5052 | local_irq_save(flags); | 
| Michael Wang | 78feefc | 2012-08-06 16:41:59 +0800 | [diff] [blame] | 5053 | double_rq_lock(env.dst_rq, busiest); | 
| Srivatsa Vaddagiri | 88b8dac | 2012-06-19 17:43:15 +0530 | [diff] [blame] | 5054 |  | 
|  | 5055 | /* | 
|  | 5056 | * cur_ld_moved - load moved in current iteration | 
|  | 5057 | * ld_moved     - cumulative load moved across iterations | 
|  | 5058 | */ | 
|  | 5059 | cur_ld_moved = move_tasks(&env); | 
|  | 5060 | ld_moved += cur_ld_moved; | 
| Michael Wang | 78feefc | 2012-08-06 16:41:59 +0800 | [diff] [blame] | 5061 | double_rq_unlock(env.dst_rq, busiest); | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 5062 | local_irq_restore(flags); | 
|  | 5063 |  | 
| Peter Zijlstra | 5d6523e | 2012-03-10 00:07:36 +0100 | [diff] [blame] | 5064 | if (env.flags & LBF_NEED_BREAK) { | 
|  | 5065 | env.flags &= ~LBF_NEED_BREAK; | 
|  | 5066 | goto more_balance; | 
|  | 5067 | } | 
|  | 5068 |  | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 5069 | /* | 
|  | 5070 | * some other cpu did the load balance for us. | 
|  | 5071 | */ | 
| Srivatsa Vaddagiri | 88b8dac | 2012-06-19 17:43:15 +0530 | [diff] [blame] | 5072 | if (cur_ld_moved && env.dst_cpu != smp_processor_id()) | 
|  | 5073 | resched_cpu(env.dst_cpu); | 
|  | 5074 |  | 
|  | 5075 | /* | 
|  | 5076 | * Revisit (affine) tasks on src_cpu that couldn't be moved to | 
|  | 5077 | * us and move them to an alternate dst_cpu in our sched_group | 
|  | 5078 | * where they can run. The upper limit on how many times we | 
|  | 5079 | * iterate on the same src_cpu depends on the number of cpus in our | 
|  | 5080 | * sched_group. | 
|  | 5081 | * | 
|  | 5082 | * This changes load balance semantics a bit on who can move | 
|  | 5083 | * load to a given_cpu. In addition to the given_cpu itself | 
|  | 5084 | * (or an ilb_cpu acting on its behalf where given_cpu is | 
|  | 5085 | * nohz-idle), we now have balance_cpu in a position to move | 
|  | 5086 | * load to given_cpu. In rare situations, this may cause | 
|  | 5087 | * conflicts (balance_cpu and given_cpu/ilb_cpu deciding | 
|  | 5088 | * _independently_ and at the _same_ time to move some load to | 
|  | 5089 | * given_cpu), causing excess load to be moved to given_cpu. | 
|  | 5090 | * This, however, should not happen often in practice, and | 
|  | 5091 | * moreover subsequent load balance cycles should correct the | 
|  | 5092 | * excess load moved. | 
|  | 5093 | */ | 
|  | 5094 | if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 && | 
|  | 5095 | lb_iterations++ < max_lb_iterations) { | 
|  | 5096 |  | 
| Michael Wang | 78feefc | 2012-08-06 16:41:59 +0800 | [diff] [blame] | 5097 | env.dst_rq	 = cpu_rq(env.new_dst_cpu); | 
| Srivatsa Vaddagiri | 88b8dac | 2012-06-19 17:43:15 +0530 | [diff] [blame] | 5098 | env.dst_cpu	 = env.new_dst_cpu; | 
|  | 5099 | env.flags	&= ~LBF_SOME_PINNED; | 
|  | 5100 | env.loop	 = 0; | 
|  | 5101 | env.loop_break	 = sched_nr_migrate_break; | 
|  | 5102 | /* | 
|  | 5103 | * Go back to "more_balance" rather than "redo" since we | 
|  | 5104 | * need to continue with same src_cpu. | 
|  | 5105 | */ | 
|  | 5106 | goto more_balance; | 
|  | 5107 | } | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 5108 |  | 
|  | 5109 | /* All tasks on this runqueue were pinned by CPU affinity */ | 
| Peter Zijlstra | 8e45cb5 | 2012-02-22 12:47:19 +0100 | [diff] [blame] | 5110 | if (unlikely(env.flags & LBF_ALL_PINNED)) { | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 5111 | cpumask_clear_cpu(cpu_of(busiest), cpus); | 
| Prashanth Nageshappa | bbf18b1 | 2012-06-19 17:52:07 +0530 | [diff] [blame] | 5112 | if (!cpumask_empty(cpus)) { | 
|  | 5113 | env.loop = 0; | 
|  | 5114 | env.loop_break = sched_nr_migrate_break; | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 5115 | goto redo; | 
| Prashanth Nageshappa | bbf18b1 | 2012-06-19 17:52:07 +0530 | [diff] [blame] | 5116 | } | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 5117 | goto out_balanced; | 
|  | 5118 | } | 
|  | 5119 | } | 
|  | 5120 |  | 
|  | 5121 | if (!ld_moved) { | 
|  | 5122 | schedstat_inc(sd, lb_failed[idle]); | 
| Venkatesh Pallipadi | 58b26c4 | 2010-09-10 18:19:17 -0700 | [diff] [blame] | 5123 | /* | 
|  | 5124 | * Increment the failure counter only on periodic balance. | 
|  | 5125 | * We do not want newidle balance, which can be very | 
|  | 5126 | * frequent, to pollute the failure counter, causing | 
|  | 5127 | * excessive cache_hot migrations and active balances. | 
|  | 5128 | */ | 
|  | 5129 | if (idle != CPU_NEWLY_IDLE) | 
|  | 5130 | sd->nr_balance_failed++; | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 5131 |  | 
| Peter Zijlstra | bd939f4 | 2012-05-02 14:20:37 +0200 | [diff] [blame] | 5132 | if (need_active_balance(&env)) { | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 5133 | raw_spin_lock_irqsave(&busiest->lock, flags); | 
|  | 5134 |  | 
| Tejun Heo | 969c792 | 2010-05-06 18:49:21 +0200 | [diff] [blame] | 5135 | /* don't kick the active_load_balance_cpu_stop | 
|  | 5136 | * if the curr task on the busiest cpu can't be | 
|  | 5137 | * moved to this_cpu | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 5138 | */ | 
|  | 5139 | if (!cpumask_test_cpu(this_cpu, | 
| Peter Zijlstra | fa17b50 | 2011-06-16 12:23:22 +0200 | [diff] [blame] | 5140 | tsk_cpus_allowed(busiest->curr))) { | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 5141 | raw_spin_unlock_irqrestore(&busiest->lock, | 
|  | 5142 | flags); | 
| Peter Zijlstra | 8e45cb5 | 2012-02-22 12:47:19 +0100 | [diff] [blame] | 5143 | env.flags |= LBF_ALL_PINNED; | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 5144 | goto out_one_pinned; | 
|  | 5145 | } | 
|  | 5146 |  | 
| Tejun Heo | 969c792 | 2010-05-06 18:49:21 +0200 | [diff] [blame] | 5147 | /* | 
|  | 5148 | * ->active_balance synchronizes accesses to | 
|  | 5149 | * ->active_balance_work.  Once set, it's cleared | 
|  | 5150 | * only after active load balance is finished. | 
|  | 5151 | */ | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 5152 | if (!busiest->active_balance) { | 
|  | 5153 | busiest->active_balance = 1; | 
|  | 5154 | busiest->push_cpu = this_cpu; | 
|  | 5155 | active_balance = 1; | 
|  | 5156 | } | 
|  | 5157 | raw_spin_unlock_irqrestore(&busiest->lock, flags); | 
| Tejun Heo | 969c792 | 2010-05-06 18:49:21 +0200 | [diff] [blame] | 5158 |  | 
| Peter Zijlstra | bd939f4 | 2012-05-02 14:20:37 +0200 | [diff] [blame] | 5159 | if (active_balance) { | 
| Tejun Heo | 969c792 | 2010-05-06 18:49:21 +0200 | [diff] [blame] | 5160 | stop_one_cpu_nowait(cpu_of(busiest), | 
|  | 5161 | active_load_balance_cpu_stop, busiest, | 
|  | 5162 | &busiest->active_balance_work); | 
| Peter Zijlstra | bd939f4 | 2012-05-02 14:20:37 +0200 | [diff] [blame] | 5163 | } | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 5164 |  | 
|  | 5165 | /* | 
|  | 5166 | * We've kicked active balancing, reset the failure | 
|  | 5167 | * counter. | 
|  | 5168 | */ | 
|  | 5169 | sd->nr_balance_failed = sd->cache_nice_tries+1; | 
|  | 5170 | } | 
|  | 5171 | } else | 
|  | 5172 | sd->nr_balance_failed = 0; | 
|  | 5173 |  | 
|  | 5174 | if (likely(!active_balance)) { | 
|  | 5175 | /* We were unbalanced, so reset the balancing interval */ | 
|  | 5176 | sd->balance_interval = sd->min_interval; | 
|  | 5177 | } else { | 
|  | 5178 | /* | 
|  | 5179 | * If we've begun active balancing, start to back off. This | 
|  | 5180 | * case may not be covered by the all_pinned logic if there | 
|  | 5181 | * is only 1 task on the busy runqueue (because we don't call | 
|  | 5182 | * move_tasks). | 
|  | 5183 | */ | 
|  | 5184 | if (sd->balance_interval < sd->max_interval) | 
|  | 5185 | sd->balance_interval *= 2; | 
|  | 5186 | } | 
|  | 5187 |  | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 5188 | goto out; | 
|  | 5189 |  | 
|  | 5190 | out_balanced: | 
|  | 5191 | schedstat_inc(sd, lb_balanced[idle]); | 
|  | 5192 |  | 
|  | 5193 | sd->nr_balance_failed = 0; | 
|  | 5194 |  | 
|  | 5195 | out_one_pinned: | 
|  | 5196 | /* tune up the balancing interval */ | 
| Peter Zijlstra | 8e45cb5 | 2012-02-22 12:47:19 +0100 | [diff] [blame] | 5197 | if (((env.flags & LBF_ALL_PINNED) && | 
| Peter Zijlstra | 5b54b56 | 2011-09-22 15:23:13 +0200 | [diff] [blame] | 5198 | sd->balance_interval < MAX_PINNED_INTERVAL) || | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 5199 | (sd->balance_interval < sd->max_interval)) | 
|  | 5200 | sd->balance_interval *= 2; | 
|  | 5201 |  | 
| Venkatesh Pallipadi | 46e49b3 | 2011-02-14 14:38:50 -0800 | [diff] [blame] | 5202 | ld_moved = 0; | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 5203 | out: | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 5204 | return ld_moved; | 
|  | 5205 | } | 
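The interval tuning at the tail of load_balance() amounts to exponential backoff with two caps: reset on a successful balance, double otherwise, with a higher ceiling allowed while everything is pinned. Below is a simplified, hypothetical replay; the real min/max intervals live in struct sched_domain and the active-balance interplay is ignored.

#include <stdio.h>

/* Hypothetical per-domain intervals, in ms; the real values depend
 * on the topology level. */
#define MIN_INTERVAL		8
#define MAX_INTERVAL		64
#define MAX_PINNED_INTERVAL	512

/* Simplified replay of the interval tuning in load_balance():
 * reset on success, exponential backoff otherwise. */
static unsigned long tune_interval(unsigned long interval, int moved,
				   int all_pinned)
{
	if (moved)
		return MIN_INTERVAL;
	if ((all_pinned && interval < MAX_PINNED_INTERVAL) ||
	    interval < MAX_INTERVAL)
		interval *= 2;
	return interval;
}

int main(void)
{
	unsigned long interval = MIN_INTERVAL;
	int i;

	/* Repeated failed, fully-pinned attempts double the interval
	 * until it caps at MAX_PINNED_INTERVAL. */
	for (i = 0; i < 8; i++) {
		interval = tune_interval(interval, 0, 1);
		printf("attempt %d: next balance in %lums\n", i, interval);
	}
	return 0;
}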
|  | 5206 |  | 
|  | 5207 | /* | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 5208 | * idle_balance is called by schedule() if this_cpu is about to become | 
|  | 5209 | * idle. It attempts to pull tasks from other CPUs. | 
|  | 5210 | */ | 
| Peter Zijlstra | 029632f | 2011-10-25 10:00:11 +0200 | [diff] [blame] | 5211 | void idle_balance(int this_cpu, struct rq *this_rq) | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 5212 | { | 
|  | 5213 | struct sched_domain *sd; | 
|  | 5214 | int pulled_task = 0; | 
|  | 5215 | unsigned long next_balance = jiffies + HZ; | 
|  | 5216 |  | 
|  | 5217 | this_rq->idle_stamp = this_rq->clock; | 
|  | 5218 |  | 
|  | 5219 | if (this_rq->avg_idle < sysctl_sched_migration_cost) | 
|  | 5220 | return; | 
|  | 5221 |  | 
| Ben Segall | 18bf280 | 2012-10-04 12:51:20 +0200 | [diff] [blame] | 5222 | update_rq_runnable_avg(this_rq, 1); | 
|  | 5223 |  | 
| Peter Zijlstra | f492e12 | 2009-12-23 15:29:42 +0100 | [diff] [blame] | 5224 | /* | 
|  | 5225 | * Drop the rq->lock, but keep IRQ/preempt disabled. | 
|  | 5226 | */ | 
|  | 5227 | raw_spin_unlock(&this_rq->lock); | 
|  | 5228 |  | 
| Paul Turner | 48a1675 | 2012-10-04 13:18:31 +0200 | [diff] [blame] | 5229 | update_blocked_averages(this_cpu); | 
| Peter Zijlstra | dce840a | 2011-04-07 14:09:50 +0200 | [diff] [blame] | 5230 | rcu_read_lock(); | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 5231 | for_each_domain(this_cpu, sd) { | 
|  | 5232 | unsigned long interval; | 
| Peter Zijlstra | f492e12 | 2009-12-23 15:29:42 +0100 | [diff] [blame] | 5233 | int balance = 1; | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 5234 |  | 
|  | 5235 | if (!(sd->flags & SD_LOAD_BALANCE)) | 
|  | 5236 | continue; | 
|  | 5237 |  | 
| Peter Zijlstra | f492e12 | 2009-12-23 15:29:42 +0100 | [diff] [blame] | 5238 | if (sd->flags & SD_BALANCE_NEWIDLE) { | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 5239 | /* If we've pulled tasks over stop searching: */ | 
| Peter Zijlstra | f492e12 | 2009-12-23 15:29:42 +0100 | [diff] [blame] | 5240 | pulled_task = load_balance(this_cpu, this_rq, | 
|  | 5241 | sd, CPU_NEWLY_IDLE, &balance); | 
|  | 5242 | } | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 5243 |  | 
|  | 5244 | interval = msecs_to_jiffies(sd->balance_interval); | 
|  | 5245 | if (time_after(next_balance, sd->last_balance + interval)) | 
|  | 5246 | next_balance = sd->last_balance + interval; | 
| Nikhil Rao | d5ad140 | 2010-11-17 11:42:04 -0800 | [diff] [blame] | 5247 | if (pulled_task) { | 
|  | 5248 | this_rq->idle_stamp = 0; | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 5249 | break; | 
| Nikhil Rao | d5ad140 | 2010-11-17 11:42:04 -0800 | [diff] [blame] | 5250 | } | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 5251 | } | 
| Peter Zijlstra | dce840a | 2011-04-07 14:09:50 +0200 | [diff] [blame] | 5252 | rcu_read_unlock(); | 
| Peter Zijlstra | f492e12 | 2009-12-23 15:29:42 +0100 | [diff] [blame] | 5253 |  | 
|  | 5254 | raw_spin_lock(&this_rq->lock); | 
|  | 5255 |  | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 5256 | if (pulled_task || time_after(jiffies, this_rq->next_balance)) { | 
|  | 5257 | /* | 
|  | 5258 | * We are going idle. next_balance may be set based on | 
|  | 5259 | * a busy processor. So reset next_balance. | 
|  | 5260 | */ | 
|  | 5261 | this_rq->next_balance = next_balance; | 
|  | 5262 | } | 
|  | 5263 | } | 
|  | 5264 |  | 
|  | 5265 | /* | 
| Tejun Heo | 969c792 | 2010-05-06 18:49:21 +0200 | [diff] [blame] | 5266 | * active_load_balance_cpu_stop is run by cpu stopper. It pushes | 
|  | 5267 | * running tasks off the busiest CPU onto idle CPUs. It requires at | 
|  | 5268 | * least 1 task to be running on each physical CPU where possible, and | 
|  | 5269 | * avoids physical / logical imbalances. | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 5270 | */ | 
| Tejun Heo | 969c792 | 2010-05-06 18:49:21 +0200 | [diff] [blame] | 5271 | static int active_load_balance_cpu_stop(void *data) | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 5272 | { | 
| Tejun Heo | 969c792 | 2010-05-06 18:49:21 +0200 | [diff] [blame] | 5273 | struct rq *busiest_rq = data; | 
|  | 5274 | int busiest_cpu = cpu_of(busiest_rq); | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 5275 | int target_cpu = busiest_rq->push_cpu; | 
| Tejun Heo | 969c792 | 2010-05-06 18:49:21 +0200 | [diff] [blame] | 5276 | struct rq *target_rq = cpu_rq(target_cpu); | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 5277 | struct sched_domain *sd; | 
| Tejun Heo | 969c792 | 2010-05-06 18:49:21 +0200 | [diff] [blame] | 5278 |  | 
|  | 5279 | raw_spin_lock_irq(&busiest_rq->lock); | 
|  | 5280 |  | 
|  | 5281 | /* make sure the requested cpu hasn't gone down in the meantime */ | 
|  | 5282 | if (unlikely(busiest_cpu != smp_processor_id() || | 
|  | 5283 | !busiest_rq->active_balance)) | 
|  | 5284 | goto out_unlock; | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 5285 |  | 
|  | 5286 | /* Is there any task to move? */ | 
|  | 5287 | if (busiest_rq->nr_running <= 1) | 
| Tejun Heo | 969c792 | 2010-05-06 18:49:21 +0200 | [diff] [blame] | 5288 | goto out_unlock; | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 5289 |  | 
|  | 5290 | /* | 
|  | 5291 | * This condition is "impossible", if it occurs | 
|  | 5292 | * we need to fix it. Originally reported by | 
|  | 5293 | * Bjorn Helgaas on a 128-cpu setup. | 
|  | 5294 | */ | 
|  | 5295 | BUG_ON(busiest_rq == target_rq); | 
|  | 5296 |  | 
|  | 5297 | /* move a task from busiest_rq to target_rq */ | 
|  | 5298 | double_lock_balance(busiest_rq, target_rq); | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 5299 |  | 
|  | 5300 | /* Search for an sd spanning us and the target CPU. */ | 
| Peter Zijlstra | dce840a | 2011-04-07 14:09:50 +0200 | [diff] [blame] | 5301 | rcu_read_lock(); | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 5302 | for_each_domain(target_cpu, sd) { | 
|  | 5303 | if ((sd->flags & SD_LOAD_BALANCE) && | 
|  | 5304 | cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) | 
|  | 5305 | break; | 
|  | 5306 | } | 
|  | 5307 |  | 
|  | 5308 | if (likely(sd)) { | 
| Peter Zijlstra | 8e45cb5 | 2012-02-22 12:47:19 +0100 | [diff] [blame] | 5309 | struct lb_env env = { | 
|  | 5310 | .sd		= sd, | 
| Peter Zijlstra | ddcdf6e | 2012-02-22 19:27:40 +0100 | [diff] [blame] | 5311 | .dst_cpu	= target_cpu, | 
|  | 5312 | .dst_rq		= target_rq, | 
|  | 5313 | .src_cpu	= busiest_rq->cpu, | 
|  | 5314 | .src_rq		= busiest_rq, | 
| Peter Zijlstra | 8e45cb5 | 2012-02-22 12:47:19 +0100 | [diff] [blame] | 5315 | .idle		= CPU_IDLE, | 
|  | 5316 | }; | 
|  | 5317 |  | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 5318 | schedstat_inc(sd, alb_count); | 
|  | 5319 |  | 
| Peter Zijlstra | 8e45cb5 | 2012-02-22 12:47:19 +0100 | [diff] [blame] | 5320 | if (move_one_task(&env)) | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 5321 | schedstat_inc(sd, alb_pushed); | 
|  | 5322 | else | 
|  | 5323 | schedstat_inc(sd, alb_failed); | 
|  | 5324 | } | 
| Peter Zijlstra | dce840a | 2011-04-07 14:09:50 +0200 | [diff] [blame] | 5325 | rcu_read_unlock(); | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 5326 | double_unlock_balance(busiest_rq, target_rq); | 
| Tejun Heo | 969c792 | 2010-05-06 18:49:21 +0200 | [diff] [blame] | 5327 | out_unlock: | 
|  | 5328 | busiest_rq->active_balance = 0; | 
|  | 5329 | raw_spin_unlock_irq(&busiest_rq->lock); | 
|  | 5330 | return 0; | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 5331 | } | 
|  | 5332 |  | 
|  | 5333 | #ifdef CONFIG_NO_HZ | 
| Venkatesh Pallipadi | 83cd4fe | 2010-05-21 17:09:41 -0700 | [diff] [blame] | 5334 | /* | 
|  | 5335 | * idle load balancing details | 
| Venkatesh Pallipadi | 83cd4fe | 2010-05-21 17:09:41 -0700 | [diff] [blame] | 5336 | * - When one of the busy CPUs notices that idle rebalancing may be | 
|  | 5337 | *   needed, it kicks the idle load balancer, which then does idle | 
|  | 5338 | *   load balancing for all the idle CPUs. | 
|  | 5339 | */ | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 5340 | static struct { | 
| Venkatesh Pallipadi | 83cd4fe | 2010-05-21 17:09:41 -0700 | [diff] [blame] | 5341 | cpumask_var_t idle_cpus_mask; | 
| Suresh Siddha | 0b005cf | 2011-12-01 17:07:34 -0800 | [diff] [blame] | 5342 | atomic_t nr_cpus; | 
| Venkatesh Pallipadi | 83cd4fe | 2010-05-21 17:09:41 -0700 | [diff] [blame] | 5343 | unsigned long next_balance;     /* in jiffy units */ | 
|  | 5344 | } nohz ____cacheline_aligned; | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 5345 |  | 
| Peter Zijlstra | 8e7fbcb | 2012-01-09 11:28:35 +0100 | [diff] [blame] | 5346 | static inline int find_new_ilb(int call_cpu) | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 5347 | { | 
| Suresh Siddha | 0b005cf | 2011-12-01 17:07:34 -0800 | [diff] [blame] | 5348 | int ilb = cpumask_first(nohz.idle_cpus_mask); | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 5349 |  | 
| Suresh Siddha | 786d6dc | 2011-12-01 17:07:35 -0800 | [diff] [blame] | 5350 | if (ilb < nr_cpu_ids && idle_cpu(ilb)) | 
|  | 5351 | return ilb; | 
|  | 5352 |  | 
|  | 5353 | return nr_cpu_ids; | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 5354 | } | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 5355 |  | 
|  | 5356 | /* | 
| Venkatesh Pallipadi | 83cd4fe | 2010-05-21 17:09:41 -0700 | [diff] [blame] | 5357 | * Kick a CPU to do the nohz balancing, if it is time for it. We pick the | 
|  | 5358 | * nohz_load_balancer CPU (if there is one), otherwise fall back to any idle | 
|  | 5359 | * CPU (if there is one). | 
|  | 5360 | */ | 
|  | 5361 | static void nohz_balancer_kick(int cpu) | 
|  | 5362 | { | 
|  | 5363 | int ilb_cpu; | 
|  | 5364 |  | 
|  | 5365 | nohz.next_balance++; | 
|  | 5366 |  | 
| Suresh Siddha | 0b005cf | 2011-12-01 17:07:34 -0800 | [diff] [blame] | 5367 | ilb_cpu = find_new_ilb(cpu); | 
| Venkatesh Pallipadi | 83cd4fe | 2010-05-21 17:09:41 -0700 | [diff] [blame] | 5368 |  | 
| Suresh Siddha | 0b005cf | 2011-12-01 17:07:34 -0800 | [diff] [blame] | 5369 | if (ilb_cpu >= nr_cpu_ids) | 
|  | 5370 | return; | 
| Venkatesh Pallipadi | 83cd4fe | 2010-05-21 17:09:41 -0700 | [diff] [blame] | 5371 |  | 
| Suresh Siddha | cd490c5 | 2011-12-06 11:26:34 -0800 | [diff] [blame] | 5372 | if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu))) | 
| Suresh Siddha | 1c792db | 2011-12-01 17:07:32 -0800 | [diff] [blame] | 5373 | return; | 
|  | 5374 | /* | 
|  | 5375 | * Use smp_send_reschedule() instead of resched_cpu(). | 
|  | 5376 | * This way we generate a sched IPI on the target cpu which | 
|  | 5377 | * is idle. And the softirq performing nohz idle load balance | 
|  | 5378 | * will be run before returning from the IPI. | 
|  | 5379 | */ | 
|  | 5380 | smp_send_reschedule(ilb_cpu); | 
| Venkatesh Pallipadi | 83cd4fe | 2010-05-21 17:09:41 -0700 | [diff] [blame] | 5381 | return; | 
|  | 5382 | } | 
|  | 5383 |  | 
| Alex Shi | c1cc017 | 2012-09-10 15:10:58 +0800 | [diff] [blame] | 5384 | static inline void nohz_balance_exit_idle(int cpu) | 
| Suresh Siddha | 7132596 | 2012-01-19 18:28:57 -0800 | [diff] [blame] | 5385 | { | 
|  | 5386 | if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { | 
|  | 5387 | cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); | 
|  | 5388 | atomic_dec(&nohz.nr_cpus); | 
|  | 5389 | clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); | 
|  | 5390 | } | 
|  | 5391 | } | 
|  | 5392 |  | 
| Suresh Siddha | 69e1e81 | 2011-12-01 17:07:33 -0800 | [diff] [blame] | 5393 | static inline void set_cpu_sd_state_busy(void) | 
|  | 5394 | { | 
|  | 5395 | struct sched_domain *sd; | 
|  | 5396 | int cpu = smp_processor_id(); | 
|  | 5397 |  | 
|  | 5398 | if (!test_bit(NOHZ_IDLE, nohz_flags(cpu))) | 
|  | 5399 | return; | 
|  | 5400 | clear_bit(NOHZ_IDLE, nohz_flags(cpu)); | 
|  | 5401 |  | 
|  | 5402 | rcu_read_lock(); | 
|  | 5403 | for_each_domain(cpu, sd) | 
|  | 5404 | atomic_inc(&sd->groups->sgp->nr_busy_cpus); | 
|  | 5405 | rcu_read_unlock(); | 
|  | 5406 | } | 
|  | 5407 |  | 
|  | 5408 | void set_cpu_sd_state_idle(void) | 
|  | 5409 | { | 
|  | 5410 | struct sched_domain *sd; | 
|  | 5411 | int cpu = smp_processor_id(); | 
|  | 5412 |  | 
|  | 5413 | if (test_bit(NOHZ_IDLE, nohz_flags(cpu))) | 
|  | 5414 | return; | 
|  | 5415 | set_bit(NOHZ_IDLE, nohz_flags(cpu)); | 
|  | 5416 |  | 
|  | 5417 | rcu_read_lock(); | 
|  | 5418 | for_each_domain(cpu, sd) | 
|  | 5419 | atomic_dec(&sd->groups->sgp->nr_busy_cpus); | 
|  | 5420 | rcu_read_unlock(); | 
|  | 5421 | } | 
|  | 5422 |  | 
| Venkatesh Pallipadi | 83cd4fe | 2010-05-21 17:09:41 -0700 | [diff] [blame] | 5423 | /* | 
| Alex Shi | c1cc017 | 2012-09-10 15:10:58 +0800 | [diff] [blame] | 5424 | * This routine will record that the cpu is going idle with tick stopped. | 
| Suresh Siddha | 0b005cf | 2011-12-01 17:07:34 -0800 | [diff] [blame] | 5425 | * This info will be used in performing idle load balancing in the future. | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 5426 | */ | 
| Alex Shi | c1cc017 | 2012-09-10 15:10:58 +0800 | [diff] [blame] | 5427 | void nohz_balance_enter_idle(int cpu) | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 5428 | { | 
| Suresh Siddha | 7132596 | 2012-01-19 18:28:57 -0800 | [diff] [blame] | 5429 | /* | 
|  | 5430 | * If this cpu is going down, then nothing needs to be done. | 
|  | 5431 | */ | 
|  | 5432 | if (!cpu_active(cpu)) | 
|  | 5433 | return; | 
|  | 5434 |  | 
| Alex Shi | c1cc017 | 2012-09-10 15:10:58 +0800 | [diff] [blame] | 5435 | if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))) | 
|  | 5436 | return; | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 5437 |  | 
| Alex Shi | c1cc017 | 2012-09-10 15:10:58 +0800 | [diff] [blame] | 5438 | cpumask_set_cpu(cpu, nohz.idle_cpus_mask); | 
|  | 5439 | atomic_inc(&nohz.nr_cpus); | 
|  | 5440 | set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 5441 | } | 
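The enter/exit pair above just keeps a cpumask and a counter in sync with a per-cpu "tick stopped" flag. A minimal userspace model follows; C11 atomics and a plain bitmask stand in for the kernel's atomic_t, cpumask and nohz_flags() bits, and the cpu_active() check is omitted.

#include <stdio.h>
#include <stdatomic.h>

#define NR_CPUS 8

static _Bool tick_stopped[NR_CPUS];	/* per-cpu NOHZ_TICK_STOPPED */
static unsigned long idle_mask;		/* models nohz.idle_cpus_mask */
static atomic_int nr_tickless;		/* models nohz.nr_cpus */

static void enter_idle(int cpu)
{
	if (tick_stopped[cpu])
		return;			/* already accounted */
	idle_mask |= 1UL << cpu;
	atomic_fetch_add(&nr_tickless, 1);
	tick_stopped[cpu] = 1;
}

static void exit_idle(int cpu)
{
	if (!tick_stopped[cpu])
		return;
	idle_mask &= ~(1UL << cpu);
	atomic_fetch_sub(&nr_tickless, 1);
	tick_stopped[cpu] = 0;
}

int main(void)
{
	enter_idle(2);
	enter_idle(5);
	enter_idle(5);			/* idempotent, like the kernel path */
	exit_idle(2);
	printf("tickless cpus: %d, mask: %#lx\n",
	       atomic_load(&nr_tickless), idle_mask);	/* 1, 0x20 */
	return 0;
}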
| Suresh Siddha | 7132596 | 2012-01-19 18:28:57 -0800 | [diff] [blame] | 5442 |  | 
|  | 5443 | static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb, | 
|  | 5444 | unsigned long action, void *hcpu) | 
|  | 5445 | { | 
|  | 5446 | switch (action & ~CPU_TASKS_FROZEN) { | 
|  | 5447 | case CPU_DYING: | 
| Alex Shi | c1cc017 | 2012-09-10 15:10:58 +0800 | [diff] [blame] | 5448 | nohz_balance_exit_idle(smp_processor_id()); | 
| Suresh Siddha | 7132596 | 2012-01-19 18:28:57 -0800 | [diff] [blame] | 5449 | return NOTIFY_OK; | 
|  | 5450 | default: | 
|  | 5451 | return NOTIFY_DONE; | 
|  | 5452 | } | 
|  | 5453 | } | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 5454 | #endif | 
|  | 5455 |  | 
|  | 5456 | static DEFINE_SPINLOCK(balancing); | 
|  | 5457 |  | 
| Peter Zijlstra | 49c022e | 2011-04-05 10:14:25 +0200 | [diff] [blame] | 5458 | /* | 
|  | 5459 | * Scale the max load_balance interval with the number of CPUs in the system. | 
|  | 5460 | * This trades load-balance latency on larger machines for less cross talk. | 
|  | 5461 | */ | 
| Peter Zijlstra | 029632f | 2011-10-25 10:00:11 +0200 | [diff] [blame] | 5462 | void update_max_interval(void) | 
| Peter Zijlstra | 49c022e | 2011-04-05 10:14:25 +0200 | [diff] [blame] | 5463 | { | 
|  | 5464 | max_load_balance_interval = HZ*num_online_cpus()/10; | 
|  | 5465 | } | 
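Concretely, this gives 100ms worth of jiffies per online cpu; a tiny sketch assuming a CONFIG_HZ of 1000 (the actual tick rate varies by config):

#include <stdio.h>

#define HZ 1000	/* assumed tick rate */

int main(void)
{
	int cpus;

	/* max_load_balance_interval = HZ * num_online_cpus() / 10,
	 * i.e. 100ms worth of jiffies per online cpu. */
	for (cpus = 1; cpus <= 64; cpus *= 4)
		printf("%2d cpus -> %d jiffies\n", cpus, HZ * cpus / 10);
	return 0;
}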
|  | 5466 |  | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 5467 | /* | 
|  | 5468 | * It checks each scheduling domain to see if it is due to be balanced, | 
|  | 5469 | * and initiates a balancing operation if so. | 
|  | 5470 | * | 
|  | 5471 | * Balancing parameters are set up in arch_init_sched_domains. | 
|  | 5472 | */ | 
|  | 5473 | static void rebalance_domains(int cpu, enum cpu_idle_type idle) | 
|  | 5474 | { | 
|  | 5475 | int balance = 1; | 
|  | 5476 | struct rq *rq = cpu_rq(cpu); | 
|  | 5477 | unsigned long interval; | 
| Peter Zijlstra | 04f733b | 2012-05-11 00:12:02 +0200 | [diff] [blame] | 5478 | struct sched_domain *sd; | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 5479 | /* Earliest time when we have to do rebalance again */ | 
|  | 5480 | unsigned long next_balance = jiffies + 60*HZ; | 
|  | 5481 | int update_next_balance = 0; | 
|  | 5482 | int need_serialize; | 
|  | 5483 |  | 
| Paul Turner | 48a1675 | 2012-10-04 13:18:31 +0200 | [diff] [blame] | 5484 | update_blocked_averages(cpu); | 
| Peter Zijlstra | 2069dd7 | 2010-11-15 15:47:00 -0800 | [diff] [blame] | 5485 |  | 
| Peter Zijlstra | dce840a | 2011-04-07 14:09:50 +0200 | [diff] [blame] | 5486 | rcu_read_lock(); | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 5487 | for_each_domain(cpu, sd) { | 
|  | 5488 | if (!(sd->flags & SD_LOAD_BALANCE)) | 
|  | 5489 | continue; | 
|  | 5490 |  | 
|  | 5491 | interval = sd->balance_interval; | 
|  | 5492 | if (idle != CPU_IDLE) | 
|  | 5493 | interval *= sd->busy_factor; | 
|  | 5494 |  | 
|  | 5495 | /* scale ms to jiffies */ | 
|  | 5496 | interval = msecs_to_jiffies(interval); | 
| Peter Zijlstra | 49c022e | 2011-04-05 10:14:25 +0200 | [diff] [blame] | 5497 | interval = clamp(interval, 1UL, max_load_balance_interval); | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 5498 |  | 
|  | 5499 | need_serialize = sd->flags & SD_SERIALIZE; | 
|  | 5500 |  | 
|  | 5501 | if (need_serialize) { | 
|  | 5502 | if (!spin_trylock(&balancing)) | 
|  | 5503 | goto out; | 
|  | 5504 | } | 
|  | 5505 |  | 
|  | 5506 | if (time_after_eq(jiffies, sd->last_balance + interval)) { | 
|  | 5507 | if (load_balance(cpu, rq, sd, idle, &balance)) { | 
|  | 5508 | /* | 
|  | 5509 | * We've pulled tasks over so we're no | 
| Peter Zijlstra | c186faf | 2011-02-21 18:52:53 +0100 | [diff] [blame] | 5510 | * longer idle. | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 5511 | */ | 
|  | 5512 | idle = CPU_NOT_IDLE; | 
|  | 5513 | } | 
|  | 5514 | sd->last_balance = jiffies; | 
|  | 5515 | } | 
|  | 5516 | if (need_serialize) | 
|  | 5517 | spin_unlock(&balancing); | 
|  | 5518 | out: | 
|  | 5519 | if (time_after(next_balance, sd->last_balance + interval)) { | 
|  | 5520 | next_balance = sd->last_balance + interval; | 
|  | 5521 | update_next_balance = 1; | 
|  | 5522 | } | 
|  | 5523 |  | 
|  | 5524 | /* | 
|  | 5525 | * Stop the load balance at this level. There is another | 
|  | 5526 | * CPU in our sched group which is doing load balancing more | 
|  | 5527 | * actively. | 
|  | 5528 | */ | 
|  | 5529 | if (!balance) | 
|  | 5530 | break; | 
|  | 5531 | } | 
| Peter Zijlstra | dce840a | 2011-04-07 14:09:50 +0200 | [diff] [blame] | 5532 | rcu_read_unlock(); | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 5533 |  | 
|  | 5534 | /* | 
|  | 5535 | * next_balance will be updated only when there is a need. | 
|  | 5536 | * When the cpu is attached to null domain for ex, it will not be | 
|  | 5537 | * updated. | 
|  | 5538 | */ | 
|  | 5539 | if (likely(update_next_balance)) | 
|  | 5540 | rq->next_balance = next_balance; | 
|  | 5541 | } | 
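A hedged replay of the per-domain interval computation above, with invented domain parameters (8ms base interval, busy_factor of 32), an assumed HZ of 1000, and msecs_to_jiffies() approximated as ms * HZ / 1000:

#include <stdio.h>

#define HZ 1000	/* assumed tick rate */

static unsigned long clamp_ul(unsigned long v, unsigned long lo,
			      unsigned long hi)
{
	return v < lo ? lo : (v > hi ? hi : v);
}

/* Simplified replay of the interval computation in
 * rebalance_domains(): a busy cpu stretches the interval by
 * busy_factor before converting to jiffies and clamping. */
static unsigned long effective_interval(unsigned long balance_ms,
					int cpu_idle,
					unsigned int busy_factor,
					unsigned long max_interval)
{
	unsigned long interval = balance_ms;

	if (!cpu_idle)
		interval *= busy_factor;
	interval = interval * HZ / 1000;	/* ~msecs_to_jiffies() */
	return clamp_ul(interval, 1UL, max_interval);
}

int main(void)
{
	/* 8ms base interval, busy_factor 32, 1600-jiffy global cap. */
	printf("idle: %lu jiffies\n", effective_interval(8, 1, 32, 1600));
	printf("busy: %lu jiffies\n", effective_interval(8, 0, 32, 1600));
	return 0;
}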
|  | 5542 |  | 
| Venkatesh Pallipadi | 83cd4fe | 2010-05-21 17:09:41 -0700 | [diff] [blame] | 5543 | #ifdef CONFIG_NO_HZ | 
|  | 5544 | /* | 
|  | 5545 | * In CONFIG_NO_HZ case, the idle balance kickee will do the | 
|  | 5546 | * rebalancing for all the cpus for whom scheduler ticks are stopped. | 
|  | 5547 | */ | 
|  | 5548 | static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) | 
|  | 5549 | { | 
|  | 5550 | struct rq *this_rq = cpu_rq(this_cpu); | 
|  | 5551 | struct rq *rq; | 
|  | 5552 | int balance_cpu; | 
|  | 5553 |  | 
| Suresh Siddha | 1c792db | 2011-12-01 17:07:32 -0800 | [diff] [blame] | 5554 | if (idle != CPU_IDLE || | 
|  | 5555 | !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu))) | 
|  | 5556 | goto end; | 
| Venkatesh Pallipadi | 83cd4fe | 2010-05-21 17:09:41 -0700 | [diff] [blame] | 5557 |  | 
|  | 5558 | for_each_cpu(balance_cpu, nohz.idle_cpus_mask) { | 
| Suresh Siddha | 8a6d42d | 2011-12-06 11:19:37 -0800 | [diff] [blame] | 5559 | if (balance_cpu == this_cpu || !idle_cpu(balance_cpu)) | 
| Venkatesh Pallipadi | 83cd4fe | 2010-05-21 17:09:41 -0700 | [diff] [blame] | 5560 | continue; | 
|  | 5561 |  | 
|  | 5562 | /* | 
|  | 5563 | * If this cpu gets work to do, stop the load balancing | 
|  | 5564 | * work being done for other cpus. The next load | 
|  | 5565 | * balancing owner will pick it up. | 
|  | 5566 | */ | 
| Suresh Siddha | 1c792db | 2011-12-01 17:07:32 -0800 | [diff] [blame] | 5567 | if (need_resched()) | 
| Venkatesh Pallipadi | 83cd4fe | 2010-05-21 17:09:41 -0700 | [diff] [blame] | 5568 | break; | 
| Venkatesh Pallipadi | 83cd4fe | 2010-05-21 17:09:41 -0700 | [diff] [blame] | 5569 |  | 
| Vincent Guittot | 5ed4f1d | 2012-09-13 06:11:26 +0200 | [diff] [blame] | 5570 | rq = cpu_rq(balance_cpu); | 
|  | 5571 |  | 
|  | 5572 | raw_spin_lock_irq(&rq->lock); | 
|  | 5573 | update_rq_clock(rq); | 
|  | 5574 | update_idle_cpu_load(rq); | 
|  | 5575 | raw_spin_unlock_irq(&rq->lock); | 
| Venkatesh Pallipadi | 83cd4fe | 2010-05-21 17:09:41 -0700 | [diff] [blame] | 5576 |  | 
|  | 5577 | rebalance_domains(balance_cpu, CPU_IDLE); | 
|  | 5578 |  | 
| Venkatesh Pallipadi | 83cd4fe | 2010-05-21 17:09:41 -0700 | [diff] [blame] | 5579 | if (time_after(this_rq->next_balance, rq->next_balance)) | 
|  | 5580 | this_rq->next_balance = rq->next_balance; | 
|  | 5581 | } | 
|  | 5582 | nohz.next_balance = this_rq->next_balance; | 
| Suresh Siddha | 1c792db | 2011-12-01 17:07:32 -0800 | [diff] [blame] | 5583 | end: | 
|  | 5584 | clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)); | 
| Venkatesh Pallipadi | 83cd4fe | 2010-05-21 17:09:41 -0700 | [diff] [blame] | 5585 | } | 
|  | 5586 |  | 
|  | 5587 | /* | 
| Suresh Siddha | 0b005cf | 2011-12-01 17:07:34 -0800 | [diff] [blame] | 5588 | * Current heuristic for kicking the idle load balancer in the presence | 
|  | 5589 | * of an idle cpu in the system. Kick when: | 
|  | 5590 | *   - This rq has more than one task. | 
|  | 5591 | *   - At any scheduler domain level, this cpu's scheduler group has multiple | 
|  | 5592 | *     busy cpus exceeding the group's power. | 
|  | 5593 | *   - For SD_ASYM_PACKING, the lower numbered cpus in the scheduler | 
|  | 5594 | *     domain span are idle. | 
| Venkatesh Pallipadi | 83cd4fe | 2010-05-21 17:09:41 -0700 | [diff] [blame] | 5595 | */ | 
|  | 5596 | static inline int nohz_kick_needed(struct rq *rq, int cpu) | 
|  | 5597 | { | 
|  | 5598 | unsigned long now = jiffies; | 
| Suresh Siddha | 0b005cf | 2011-12-01 17:07:34 -0800 | [diff] [blame] | 5599 | struct sched_domain *sd; | 
| Venkatesh Pallipadi | 83cd4fe | 2010-05-21 17:09:41 -0700 | [diff] [blame] | 5600 |  | 
| Suresh Siddha | 1c792db | 2011-12-01 17:07:32 -0800 | [diff] [blame] | 5601 | if (unlikely(idle_cpu(cpu))) | 
| Venkatesh Pallipadi | 83cd4fe | 2010-05-21 17:09:41 -0700 | [diff] [blame] | 5602 | return 0; | 
|  | 5603 |  | 
| Suresh Siddha | 1c792db | 2011-12-01 17:07:32 -0800 | [diff] [blame] | 5604 | /* | 
|  | 5605 | * We may have recently been in ticked or tickless idle mode. At the first | 
|  | 5606 | * busy tick after returning from idle, we will update the busy stats. | 
|  | 5607 | */ | 
| Suresh Siddha | 69e1e81 | 2011-12-01 17:07:33 -0800 | [diff] [blame] | 5608 | set_cpu_sd_state_busy(); | 
| Alex Shi | c1cc017 | 2012-09-10 15:10:58 +0800 | [diff] [blame] | 5609 | nohz_balance_exit_idle(cpu); | 
| Suresh Siddha | 0b005cf | 2011-12-01 17:07:34 -0800 | [diff] [blame] | 5610 |  | 
|  | 5611 | /* | 
|  | 5612 | * If no cpus are in tickless mode, there is no need for NOHZ idle | 
|  | 5613 | * load balancing. | 
|  | 5614 | */ | 
|  | 5615 | if (likely(!atomic_read(&nohz.nr_cpus))) | 
|  | 5616 | return 0; | 
| Suresh Siddha | 1c792db | 2011-12-01 17:07:32 -0800 | [diff] [blame] | 5617 |  | 
|  | 5618 | if (time_before(now, nohz.next_balance)) | 
| Venkatesh Pallipadi | 83cd4fe | 2010-05-21 17:09:41 -0700 | [diff] [blame] | 5619 | return 0; | 
|  | 5620 |  | 
| Suresh Siddha | 0b005cf | 2011-12-01 17:07:34 -0800 | [diff] [blame] | 5621 | if (rq->nr_running >= 2) | 
|  | 5622 | goto need_kick; | 
| Venkatesh Pallipadi | 83cd4fe | 2010-05-21 17:09:41 -0700 | [diff] [blame] | 5623 |  | 
| Peter Zijlstra | 067491b | 2011-12-07 14:32:08 +0100 | [diff] [blame] | 5624 | rcu_read_lock(); | 
| Suresh Siddha | 0b005cf | 2011-12-01 17:07:34 -0800 | [diff] [blame] | 5625 | for_each_domain(cpu, sd) { | 
|  | 5626 | struct sched_group *sg = sd->groups; | 
|  | 5627 | struct sched_group_power *sgp = sg->sgp; | 
|  | 5628 | int nr_busy = atomic_read(&sgp->nr_busy_cpus); | 
| Venkatesh Pallipadi | 83cd4fe | 2010-05-21 17:09:41 -0700 | [diff] [blame] | 5629 |  | 
| Suresh Siddha | 0b005cf | 2011-12-01 17:07:34 -0800 | [diff] [blame] | 5630 | if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1) | 
| Peter Zijlstra | 067491b | 2011-12-07 14:32:08 +0100 | [diff] [blame] | 5631 | goto need_kick_unlock; | 
| Suresh Siddha | 0b005cf | 2011-12-01 17:07:34 -0800 | [diff] [blame] | 5632 |  | 
|  | 5633 | if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight | 
|  | 5634 | && (cpumask_first_and(nohz.idle_cpus_mask, | 
|  | 5635 | sched_domain_span(sd)) < cpu)) | 
| Peter Zijlstra | 067491b | 2011-12-07 14:32:08 +0100 | [diff] [blame] | 5636 | goto need_kick_unlock; | 
| Suresh Siddha | 0b005cf | 2011-12-01 17:07:34 -0800 | [diff] [blame] | 5637 |  | 
|  | 5638 | if (!(sd->flags & (SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING))) | 
|  | 5639 | break; | 
| Venkatesh Pallipadi | 83cd4fe | 2010-05-21 17:09:41 -0700 | [diff] [blame] | 5640 | } | 
| Peter Zijlstra | 067491b | 2011-12-07 14:32:08 +0100 | [diff] [blame] | 5641 | rcu_read_unlock(); | 
| Venkatesh Pallipadi | 83cd4fe | 2010-05-21 17:09:41 -0700 | [diff] [blame] | 5642 | return 0; | 
| Peter Zijlstra | 067491b | 2011-12-07 14:32:08 +0100 | [diff] [blame] | 5643 |  | 
|  | 5644 | need_kick_unlock: | 
|  | 5645 | rcu_read_unlock(); | 
| Suresh Siddha | 0b005cf | 2011-12-01 17:07:34 -0800 | [diff] [blame] | 5646 | need_kick: | 
|  | 5647 | return 1; | 
| Venkatesh Pallipadi | 83cd4fe | 2010-05-21 17:09:41 -0700 | [diff] [blame] | 5648 | } | 
|  | 5649 | #else | 
|  | 5650 | static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { } | 
|  | 5651 | #endif | 
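/*
 * Editorial sketch (standalone userspace C): the SD_ASYM_PACKING test in
 * nohz_kick_needed() asks "does a lower-numbered idle cpu exist inside
 * this domain?" via cpumask_first_and(). With a word-sized mask that is
 * just find-first-set of the AND; first_and() below is a hypothetical
 * stand-in returning -1 where the kernel would return nr_cpu_ids.
 */
#include <stdio.h>

static int first_and(unsigned long a, unsigned long b)
{
	unsigned long m = a & b;
	int bit;

	for (bit = 0; bit < (int)(8 * sizeof(m)); bit++)
		if (m & (1UL << bit))
			return bit;
	return -1;
}

int main(void)
{
	unsigned long idle_cpus = 0x5;	/* cpus 0 and 2 are idle */
	unsigned long domain    = 0xf;	/* domain spans cpus 0-3 */
	int cpu = 3, lowest = first_and(idle_cpus, domain);

	if (lowest >= 0 && lowest < cpu)
		printf("kick: idle cpu %d is lower-numbered than %d\n",
		       lowest, cpu);
	return 0;
}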
|  | 5652 |  | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 5653 | /* | 
|  | 5654 | * run_rebalance_domains is triggered when needed from the scheduler tick. | 
| Venkatesh Pallipadi | 83cd4fe | 2010-05-21 17:09:41 -0700 | [diff] [blame] | 5655 | * Also triggered for nohz idle balancing (with NOHZ_BALANCE_KICK set). | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 5656 | */ | 
|  | 5657 | static void run_rebalance_domains(struct softirq_action *h) | 
|  | 5658 | { | 
|  | 5659 | int this_cpu = smp_processor_id(); | 
|  | 5660 | struct rq *this_rq = cpu_rq(this_cpu); | 
| Suresh Siddha | 6eb57e0 | 2011-10-03 15:09:01 -0700 | [diff] [blame] | 5661 | enum cpu_idle_type idle = this_rq->idle_balance ? | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 5662 | CPU_IDLE : CPU_NOT_IDLE; | 
|  | 5663 |  | 
|  | 5664 | rebalance_domains(this_cpu, idle); | 
|  | 5665 |  | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 5666 | /* | 
| Venkatesh Pallipadi | 83cd4fe | 2010-05-21 17:09:41 -0700 | [diff] [blame] | 5667 | * If this cpu has a pending nohz_balance_kick, then do the | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 5668 | * balancing on behalf of the other idle cpus whose ticks are | 
|  | 5669 | * stopped. | 
|  | 5670 | */ | 
| Venkatesh Pallipadi | 83cd4fe | 2010-05-21 17:09:41 -0700 | [diff] [blame] | 5671 | nohz_idle_balance(this_cpu, idle); | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 5672 | } | 
|  | 5673 |  | 
|  | 5674 | static inline int on_null_domain(int cpu) | 
|  | 5675 | { | 
| Paul E. McKenney | 90a6501 | 2010-02-28 08:32:18 -0800 | [diff] [blame] | 5676 | return !rcu_dereference_sched(cpu_rq(cpu)->sd); | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 5677 | } | 
|  | 5678 |  | 
|  | 5679 | /* | 
|  | 5680 | * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 5681 | */ | 
| Peter Zijlstra | 029632f | 2011-10-25 10:00:11 +0200 | [diff] [blame] | 5682 | void trigger_load_balance(struct rq *rq, int cpu) | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 5683 | { | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 5684 | /* Don't need to rebalance while attached to NULL domain */ | 
|  | 5685 | if (time_after_eq(jiffies, rq->next_balance) && | 
|  | 5686 | likely(!on_null_domain(cpu))) | 
|  | 5687 | raise_softirq(SCHED_SOFTIRQ); | 
| Venkatesh Pallipadi | 83cd4fe | 2010-05-21 17:09:41 -0700 | [diff] [blame] | 5688 | #ifdef CONFIG_NO_HZ | 
| Suresh Siddha | 1c792db | 2011-12-01 17:07:32 -0800 | [diff] [blame] | 5689 | if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu))) | 
| Venkatesh Pallipadi | 83cd4fe | 2010-05-21 17:09:41 -0700 | [diff] [blame] | 5690 | nohz_balancer_kick(cpu); | 
|  | 5691 | #endif | 
| Peter Zijlstra | 1e3c88b | 2009-12-17 17:00:43 +0100 | [diff] [blame] | 5692 | } | 
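/*
 * Editorial sketch (standalone userspace C): trigger_load_balance() runs
 * from hard tick context and only raises SCHED_SOFTIRQ; the expensive
 * domain walk happens later in run_rebalance_domains(), registered via
 * open_softirq() in init_sched_fair_class() below. A minimal rendering
 * of that "mark pending now, do the work later" split:
 */
#include <stdio.h>

static int softirq_pending;	/* stands in for the SCHED_SOFTIRQ bit */

static void tick(unsigned long jiffies, unsigned long next_balance)
{
	if ((long)(jiffies - next_balance) >= 0)	/* time_after_eq() */
		softirq_pending = 1;			/* raise_softirq() */
}

static void softirq_exit(void)
{
	if (softirq_pending) {
		softirq_pending = 0;
		printf("rebalance_domains() would run here\n");
	}
}

int main(void)
{
	tick(100, 90);		/* balance is due: pend the softirq */
	softirq_exit();		/* handler runs outside tick context */
	return 0;
}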
|  | 5693 |  | 
| Christian Ehrhardt | 0bcdcf2 | 2009-11-30 12:16:46 +0100 | [diff] [blame] | 5694 | static void rq_online_fair(struct rq *rq) | 
|  | 5695 | { | 
|  | 5696 | update_sysctl(); | 
|  | 5697 | } | 
|  | 5698 |  | 
|  | 5699 | static void rq_offline_fair(struct rq *rq) | 
|  | 5700 | { | 
|  | 5701 | update_sysctl(); | 
| Peter Boonstoppel | a4c96ae | 2012-08-09 15:34:47 -0700 | [diff] [blame] | 5702 |  | 
|  | 5703 | /* Ensure any throttled groups are reachable by pick_next_task */ | 
|  | 5704 | unthrottle_offline_cfs_rqs(rq); | 
| Christian Ehrhardt | 0bcdcf2 | 2009-11-30 12:16:46 +0100 | [diff] [blame] | 5705 | } | 
|  | 5706 |  | 
| Dhaval Giani | 55e12e5 | 2008-06-24 23:39:43 +0530 | [diff] [blame] | 5707 | #endif /* CONFIG_SMP */ | 
| Peter Williams | e1d1484 | 2007-10-24 18:23:51 +0200 | [diff] [blame] | 5708 |  | 
| Ingo Molnar | bf0f6f2 | 2007-07-09 18:51:58 +0200 | [diff] [blame] | 5709 | /* | 
|  | 5710 | * scheduler tick hitting a task of our scheduling class: | 
|  | 5711 | */ | 
| Peter Zijlstra | 8f4d37e | 2008-01-25 21:08:29 +0100 | [diff] [blame] | 5712 | static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) | 
| Ingo Molnar | bf0f6f2 | 2007-07-09 18:51:58 +0200 | [diff] [blame] | 5713 | { | 
|  | 5714 | struct cfs_rq *cfs_rq; | 
|  | 5715 | struct sched_entity *se = &curr->se; | 
|  | 5716 |  | 
|  | 5717 | for_each_sched_entity(se) { | 
|  | 5718 | cfs_rq = cfs_rq_of(se); | 
| Peter Zijlstra | 8f4d37e | 2008-01-25 21:08:29 +0100 | [diff] [blame] | 5719 | entity_tick(cfs_rq, se, queued); | 
| Ingo Molnar | bf0f6f2 | 2007-07-09 18:51:58 +0200 | [diff] [blame] | 5720 | } | 
| Ben Segall | 18bf280 | 2012-10-04 12:51:20 +0200 | [diff] [blame] | 5721 |  | 
| Peter Zijlstra | cbee9f8 | 2012-10-25 14:16:43 +0200 | [diff] [blame] | 5722 | if (sched_feat_numa(NUMA)) | 
|  | 5723 | task_tick_numa(rq, curr); | 
| Linus Torvalds | 3d59eeb | 2012-12-16 14:33:25 -0800 | [diff] [blame] | 5724 |  | 
| Ben Segall | 18bf280 | 2012-10-04 12:51:20 +0200 | [diff] [blame] | 5725 | update_rq_runnable_avg(rq, 1); | 
| Ingo Molnar | bf0f6f2 | 2007-07-09 18:51:58 +0200 | [diff] [blame] | 5726 | } | 
|  | 5727 |  | 
|  | 5728 | /* | 
| Peter Zijlstra | cd29fe6 | 2009-11-27 17:32:46 +0100 | [diff] [blame] | 5729 | * called on fork with the child task as argument from the parent's context | 
|  | 5730 | *  - child not yet on the tasklist | 
|  | 5731 | *  - preemption disabled | 
| Ingo Molnar | bf0f6f2 | 2007-07-09 18:51:58 +0200 | [diff] [blame] | 5732 | */ | 
| Peter Zijlstra | cd29fe6 | 2009-11-27 17:32:46 +0100 | [diff] [blame] | 5733 | static void task_fork_fair(struct task_struct *p) | 
| Ingo Molnar | bf0f6f2 | 2007-07-09 18:51:58 +0200 | [diff] [blame] | 5734 | { | 
| Daisuke Nishimura | 4fc420c | 2011-12-15 14:36:55 +0900 | [diff] [blame] | 5735 | struct cfs_rq *cfs_rq; | 
|  | 5736 | struct sched_entity *se = &p->se, *curr; | 
| Ingo Molnar | 00bf7bf | 2007-10-15 17:00:14 +0200 | [diff] [blame] | 5737 | int this_cpu = smp_processor_id(); | 
| Peter Zijlstra | cd29fe6 | 2009-11-27 17:32:46 +0100 | [diff] [blame] | 5738 | struct rq *rq = this_rq(); | 
|  | 5739 | unsigned long flags; | 
| Ingo Molnar | bf0f6f2 | 2007-07-09 18:51:58 +0200 | [diff] [blame] | 5740 |  | 
| Thomas Gleixner | 05fa785 | 2009-11-17 14:28:38 +0100 | [diff] [blame] | 5741 | raw_spin_lock_irqsave(&rq->lock, flags); | 
| Peter Zijlstra | cd29fe6 | 2009-11-27 17:32:46 +0100 | [diff] [blame] | 5742 |  | 
| Peter Zijlstra | 861d034 | 2010-08-19 13:31:43 +0200 | [diff] [blame] | 5743 | update_rq_clock(rq); | 
|  | 5744 |  | 
| Daisuke Nishimura | 4fc420c | 2011-12-15 14:36:55 +0900 | [diff] [blame] | 5745 | cfs_rq = task_cfs_rq(current); | 
|  | 5746 | curr = cfs_rq->curr; | 
|  | 5747 |  | 
| Paul E. McKenney | b0a0f66 | 2010-10-06 17:32:51 -0700 | [diff] [blame] | 5748 | if (unlikely(task_cpu(p) != this_cpu)) { | 
|  | 5749 | rcu_read_lock(); | 
| Peter Zijlstra | cd29fe6 | 2009-11-27 17:32:46 +0100 | [diff] [blame] | 5750 | __set_task_cpu(p, this_cpu); | 
| Paul E. McKenney | b0a0f66 | 2010-10-06 17:32:51 -0700 | [diff] [blame] | 5751 | rcu_read_unlock(); | 
|  | 5752 | } | 
| Ingo Molnar | bf0f6f2 | 2007-07-09 18:51:58 +0200 | [diff] [blame] | 5753 |  | 
| Ting Yang | 7109c44 | 2007-08-28 12:53:24 +0200 | [diff] [blame] | 5754 | update_curr(cfs_rq); | 
| Peter Zijlstra | cd29fe6 | 2009-11-27 17:32:46 +0100 | [diff] [blame] | 5755 |  | 
| Mike Galbraith | b5d9d73 | 2009-09-08 11:12:28 +0200 | [diff] [blame] | 5756 | if (curr) | 
|  | 5757 | se->vruntime = curr->vruntime; | 
| Peter Zijlstra | aeb73b0 | 2007-10-15 17:00:05 +0200 | [diff] [blame] | 5758 | place_entity(cfs_rq, se, 1); | 
| Peter Zijlstra | 4d78e7b | 2007-10-15 17:00:04 +0200 | [diff] [blame] | 5759 |  | 
| Peter Zijlstra | cd29fe6 | 2009-11-27 17:32:46 +0100 | [diff] [blame] | 5760 | if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) { | 
| Dmitry Adamushko | 87fefa3 | 2007-10-15 17:00:08 +0200 | [diff] [blame] | 5761 | /* | 
| Ingo Molnar | edcb60a | 2007-10-15 17:00:08 +0200 | [diff] [blame] | 5762 | * Upon rescheduling, sched_class::put_prev_task() will place | 
|  | 5763 | * 'current' within the tree based on its new key value. | 
|  | 5764 | */ | 
| Peter Zijlstra | 4d78e7b | 2007-10-15 17:00:04 +0200 | [diff] [blame] | 5765 | swap(curr->vruntime, se->vruntime); | 
| Bharata B Rao | aec0a51 | 2008-08-28 14:42:49 +0530 | [diff] [blame] | 5766 | resched_task(rq->curr); | 
| Peter Zijlstra | 4d78e7b | 2007-10-15 17:00:04 +0200 | [diff] [blame] | 5767 | } | 
|  | 5768 |  | 
| Peter Zijlstra | 88ec22d | 2009-12-16 18:04:41 +0100 | [diff] [blame] | 5769 | se->vruntime -= cfs_rq->min_vruntime; | 
|  | 5770 |  | 
| Thomas Gleixner | 05fa785 | 2009-11-17 14:28:38 +0100 | [diff] [blame] | 5771 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 
| Ingo Molnar | bf0f6f2 | 2007-07-09 18:51:58 +0200 | [diff] [blame] | 5772 | } | 
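/*
 * Editorial sketch (standalone userspace C): with
 * sysctl_sched_child_runs_first set, task_fork_fair() swaps parent and
 * child vruntime only when the child would otherwise sort after the
 * parent, so the child runs first without either task gaining runtime.
 */
#include <stdio.h>
#include <inttypes.h>

int main(void)
{
	uint64_t curr = 1000, se = 1024;	/* child placed after parent */

	if ((int64_t)(curr - se) < 0) {		/* entity_before(curr, se) */
		uint64_t t = curr;		/* swap(curr->vruntime, se->vruntime) */
		curr = se;
		se = t;
	}
	printf("parent=%" PRIu64 " child=%" PRIu64 " -> child runs first\n",
	       curr, se);
	return 0;
}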
|  | 5773 |  | 
| Steven Rostedt | cb46984 | 2008-01-25 21:08:22 +0100 | [diff] [blame] | 5774 | /* | 
|  | 5775 | * Priority of the task has changed. Check to see if we preempt | 
|  | 5776 | * the current task. | 
|  | 5777 | */ | 
| Peter Zijlstra | da7a735 | 2011-01-17 17:03:27 +0100 | [diff] [blame] | 5778 | static void | 
|  | 5779 | prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) | 
| Steven Rostedt | cb46984 | 2008-01-25 21:08:22 +0100 | [diff] [blame] | 5780 | { | 
| Peter Zijlstra | da7a735 | 2011-01-17 17:03:27 +0100 | [diff] [blame] | 5781 | if (!p->se.on_rq) | 
|  | 5782 | return; | 
|  | 5783 |  | 
| Steven Rostedt | cb46984 | 2008-01-25 21:08:22 +0100 | [diff] [blame] | 5784 | /* | 
|  | 5785 | * Reschedule if we are currently running on this runqueue and | 
|  | 5786 | * our priority decreased, or if we are not currently running on | 
|  | 5787 | * this runqueue and our priority is higher than the current task's. | 
|  | 5788 | */ | 
| Peter Zijlstra | da7a735 | 2011-01-17 17:03:27 +0100 | [diff] [blame] | 5789 | if (rq->curr == p) { | 
| Steven Rostedt | cb46984 | 2008-01-25 21:08:22 +0100 | [diff] [blame] | 5790 | if (p->prio > oldprio) | 
|  | 5791 | resched_task(rq->curr); | 
|  | 5792 | } else | 
| Peter Zijlstra | 15afe09 | 2008-09-20 23:38:02 +0200 | [diff] [blame] | 5793 | check_preempt_curr(rq, p, 0); | 
| Steven Rostedt | cb46984 | 2008-01-25 21:08:22 +0100 | [diff] [blame] | 5794 | } | 
|  | 5795 |  | 
| Peter Zijlstra | da7a735 | 2011-01-17 17:03:27 +0100 | [diff] [blame] | 5796 | static void switched_from_fair(struct rq *rq, struct task_struct *p) | 
|  | 5797 | { | 
|  | 5798 | struct sched_entity *se = &p->se; | 
|  | 5799 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 
|  | 5800 |  | 
|  | 5801 | /* | 
|  | 5802 | * Ensure the task's vruntime is normalized, so that when it is | 
|  | 5803 | * switched back to the fair class the enqueue_entity(.flags=0) will | 
|  | 5804 | * do the right thing. | 
|  | 5805 | * | 
|  | 5806 | * If it was on_rq, then the dequeue_entity(.flags=0) will already | 
|  | 5807 | * have normalized the vruntime, if it was !on_rq, then only when | 
|  | 5808 | * the task is sleeping will it still have non-normalized vruntime. | 
|  | 5809 | */ | 
|  | 5810 | if (!se->on_rq && p->state != TASK_RUNNING) { | 
|  | 5811 | /* | 
|  | 5812 | * Fix up our vruntime so that the current sleep doesn't | 
|  | 5813 | * cause 'unlimited' sleep bonus. | 
|  | 5814 | */ | 
|  | 5815 | place_entity(cfs_rq, se, 0); | 
|  | 5816 | se->vruntime -= cfs_rq->min_vruntime; | 
|  | 5817 | } | 
| Paul Turner | 9ee474f | 2012-10-04 13:18:30 +0200 | [diff] [blame] | 5818 |  | 
|  | 5819 | #if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) | 
|  | 5820 | /* | 
|  | 5821 | * Remove our load from contribution when we leave sched_fair | 
|  | 5822 | * and ensure we don't carry in an old decay_count if we | 
|  | 5823 | * switch back. | 
|  | 5824 | */ | 
|  | 5825 | if (p->se.avg.decay_count) { | 
|  | 5826 | struct cfs_rq *cfs_rq = cfs_rq_of(&p->se); | 
|  | 5827 | __synchronize_entity_decay(&p->se); | 
|  | 5828 | subtract_blocked_load_contrib(cfs_rq, | 
|  | 5829 | p->se.avg.load_avg_contrib); | 
|  | 5830 | } | 
|  | 5831 | #endif | 
| Peter Zijlstra | da7a735 | 2011-01-17 17:03:27 +0100 | [diff] [blame] | 5832 | } | 
|  | 5833 |  | 
| Steven Rostedt | cb46984 | 2008-01-25 21:08:22 +0100 | [diff] [blame] | 5834 | /* | 
|  | 5835 | * We switched to the sched_fair class. | 
|  | 5836 | */ | 
| Peter Zijlstra | da7a735 | 2011-01-17 17:03:27 +0100 | [diff] [blame] | 5837 | static void switched_to_fair(struct rq *rq, struct task_struct *p) | 
| Steven Rostedt | cb46984 | 2008-01-25 21:08:22 +0100 | [diff] [blame] | 5838 | { | 
| Peter Zijlstra | da7a735 | 2011-01-17 17:03:27 +0100 | [diff] [blame] | 5839 | if (!p->se.on_rq) | 
|  | 5840 | return; | 
|  | 5841 |  | 
| Steven Rostedt | cb46984 | 2008-01-25 21:08:22 +0100 | [diff] [blame] | 5842 | /* | 
|  | 5843 | * We were most likely switched from sched_rt, so | 
|  | 5844 | * kick off a reschedule if running, otherwise just see | 
|  | 5845 | * if we can still preempt the current task. | 
|  | 5846 | */ | 
| Peter Zijlstra | da7a735 | 2011-01-17 17:03:27 +0100 | [diff] [blame] | 5847 | if (rq->curr == p) | 
| Steven Rostedt | cb46984 | 2008-01-25 21:08:22 +0100 | [diff] [blame] | 5848 | resched_task(rq->curr); | 
|  | 5849 | else | 
| Peter Zijlstra | 15afe09 | 2008-09-20 23:38:02 +0200 | [diff] [blame] | 5850 | check_preempt_curr(rq, p, 0); | 
| Steven Rostedt | cb46984 | 2008-01-25 21:08:22 +0100 | [diff] [blame] | 5851 | } | 
|  | 5852 |  | 
| Srivatsa Vaddagiri | 83b699e | 2007-10-15 17:00:08 +0200 | [diff] [blame] | 5853 | /* | 
|  | 5854 | * Account for a task changing its policy or group. | 
|  | 5855 | * This routine is mostly called to set cfs_rq->curr field when a task | 
|  | 5856 | * migrates between groups/classes. | 
|  | 5857 | */ | 
|  | 5858 | static void set_curr_task_fair(struct rq *rq) | 
|  | 5859 | { | 
|  | 5860 | struct sched_entity *se = &rq->curr->se; | 
|  | 5861 |  | 
| Paul Turner | ec12cb7 | 2011-07-21 09:43:30 -0700 | [diff] [blame] | 5862 | for_each_sched_entity(se) { | 
|  | 5863 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 
|  | 5864 |  | 
|  | 5865 | set_next_entity(cfs_rq, se); | 
|  | 5866 | /* ensure bandwidth has been allocated on our new cfs_rq */ | 
|  | 5867 | account_cfs_rq_runtime(cfs_rq, 0); | 
|  | 5868 | } | 
| Srivatsa Vaddagiri | 83b699e | 2007-10-15 17:00:08 +0200 | [diff] [blame] | 5869 | } | 
|  | 5870 |  | 
| Peter Zijlstra | 029632f | 2011-10-25 10:00:11 +0200 | [diff] [blame] | 5871 | void init_cfs_rq(struct cfs_rq *cfs_rq) | 
|  | 5872 | { | 
|  | 5873 | cfs_rq->tasks_timeline = RB_ROOT; | 
| Peter Zijlstra | 029632f | 2011-10-25 10:00:11 +0200 | [diff] [blame] | 5874 | cfs_rq->min_vruntime = (u64)(-(1LL << 20)); | 
|  | 5875 | #ifndef CONFIG_64BIT | 
|  | 5876 | cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; | 
|  | 5877 | #endif | 
| Paul Turner | 9ee474f | 2012-10-04 13:18:30 +0200 | [diff] [blame] | 5878 | #if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) | 
|  | 5879 | atomic64_set(&cfs_rq->decay_counter, 1); | 
| Paul Turner | aff3e49 | 2012-10-04 13:18:30 +0200 | [diff] [blame] | 5880 | atomic64_set(&cfs_rq->removed_load, 0); | 
| Paul Turner | 9ee474f | 2012-10-04 13:18:30 +0200 | [diff] [blame] | 5881 | #endif | 
| Peter Zijlstra | 029632f | 2011-10-25 10:00:11 +0200 | [diff] [blame] | 5882 | } | 
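/*
 * Editorial sketch (standalone userspace C): min_vruntime starts at
 * -(1<<20) above so that the u64 vruntime clock wraps soon after boot,
 * flushing out comparisons that are not wraparound-safe. CFS orders
 * vruntimes through a signed 64-bit delta, which survives the wrap:
 */
#include <stdio.h>
#include <stdint.h>

static int vruntime_before(uint64_t a, uint64_t b)
{
	return (int64_t)(a - b) < 0;	/* signed delta, wrap-safe */
}

int main(void)
{
	uint64_t min_vruntime = (uint64_t)(-(1LL << 20));	/* near U64_MAX */
	uint64_t later = min_vruntime + (2ULL << 20);		/* past the wrap */

	/* 'later' is numerically tiny now, yet still ordered after. */
	printf("wrapped=%d ordered=%d\n",
	       later < min_vruntime, vruntime_before(min_vruntime, later));
	return 0;
}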
|  | 5883 |  | 
| Peter Zijlstra | 810b381 | 2008-02-29 15:21:01 -0500 | [diff] [blame] | 5884 | #ifdef CONFIG_FAIR_GROUP_SCHED | 
| Peter Zijlstra | b2b5ce0 | 2010-10-15 15:24:15 +0200 | [diff] [blame] | 5885 | static void task_move_group_fair(struct task_struct *p, int on_rq) | 
| Peter Zijlstra | 810b381 | 2008-02-29 15:21:01 -0500 | [diff] [blame] | 5886 | { | 
| Paul Turner | aff3e49 | 2012-10-04 13:18:30 +0200 | [diff] [blame] | 5887 | struct cfs_rq *cfs_rq; | 
| Peter Zijlstra | b2b5ce0 | 2010-10-15 15:24:15 +0200 | [diff] [blame] | 5888 | /* | 
|  | 5889 | * If the task was not on the rq at the time of this cgroup movement | 
|  | 5890 | * it must have been asleep; sleeping tasks keep their ->vruntime | 
|  | 5891 | * absolute on their old rq until wakeup (needed for the fair sleeper | 
|  | 5892 | * bonus in place_entity()). | 
|  | 5893 | * | 
|  | 5894 | * If it was on the rq, we've just 'preempted' it, which does convert | 
|  | 5895 | * ->vruntime to a relative base. | 
|  | 5896 | * | 
|  | 5897 | * Make sure both cases convert their relative position when migrating | 
|  | 5898 | * to another cgroup's rq. This does somewhat interfere with the | 
|  | 5899 | * fair sleeper stuff for the first placement, but who cares. | 
|  | 5900 | */ | 
| Daisuke Nishimura | 7ceff01 | 2011-12-15 14:36:07 +0900 | [diff] [blame] | 5901 | /* | 
|  | 5902 | * When !on_rq, vruntime of the task has usually NOT been normalized. | 
|  | 5903 | * But there are some cases where it has already been normalized: | 
|  | 5904 | * | 
|  | 5905 | * - Moving a forked child which is waiting to be woken up by | 
|  | 5906 | *   wake_up_new_task(). | 
| Daisuke Nishimura | 62af378 | 2011-12-15 14:37:41 +0900 | [diff] [blame] | 5907 | * - Moving a task which has been woken up by try_to_wake_up() and is | 
|  | 5908 | *   waiting to actually be woken up by sched_ttwu_pending(). | 
| Daisuke Nishimura | 7ceff01 | 2011-12-15 14:36:07 +0900 | [diff] [blame] | 5909 | * | 
|  | 5910 | * To prevent a boost or penalty in the new cfs_rq caused by the | 
|  | 5911 | * min_vruntime delta between the two cfs_rqs, we skip the vruntime adjustment. | 
|  | 5912 | */ | 
| Daisuke Nishimura | 62af378 | 2011-12-15 14:37:41 +0900 | [diff] [blame] | 5913 | if (!on_rq && (!p->se.sum_exec_runtime || p->state == TASK_WAKING)) | 
| Daisuke Nishimura | 7ceff01 | 2011-12-15 14:36:07 +0900 | [diff] [blame] | 5914 | on_rq = 1; | 
|  | 5915 |  | 
| Peter Zijlstra | 88ec22d | 2009-12-16 18:04:41 +0100 | [diff] [blame] | 5916 | if (!on_rq) | 
| Peter Zijlstra | b2b5ce0 | 2010-10-15 15:24:15 +0200 | [diff] [blame] | 5917 | p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime; | 
|  | 5918 | set_task_rq(p, task_cpu(p)); | 
| Paul Turner | aff3e49 | 2012-10-04 13:18:30 +0200 | [diff] [blame] | 5919 | if (!on_rq) { | 
|  | 5920 | cfs_rq = cfs_rq_of(&p->se); | 
|  | 5921 | p->se.vruntime += cfs_rq->min_vruntime; | 
|  | 5922 | #ifdef CONFIG_SMP | 
|  | 5923 | /* | 
|  | 5924 | * migrate_task_rq_fair() will have removed our previous | 
|  | 5925 | * contribution, but we must synchronize for ongoing future | 
|  | 5926 | * decay. | 
|  | 5927 | */ | 
|  | 5928 | p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter); | 
|  | 5929 | cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib; | 
|  | 5930 | #endif | 
|  | 5931 | } | 
| Peter Zijlstra | 810b381 | 2008-02-29 15:21:01 -0500 | [diff] [blame] | 5932 | } | 
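/*
 * Editorial sketch: the re-basing above is plain arithmetic. Subtracting
 * the old cfs_rq's min_vruntime and adding the new one's preserves the
 * task's relative lead, whatever the two queues' absolute clocks are:
 */
#include <stdio.h>
#include <inttypes.h>

int main(void)
{
	uint64_t vruntime = 5000, old_min = 4000, new_min = 90000;

	vruntime -= old_min;	/* 1000ns lead over the old queue */
	vruntime += new_min;	/* the same 1000ns lead on the new queue */
	printf("new vruntime = %" PRIu64 "\n", vruntime);	/* 91000 */
	return 0;
}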
| Peter Zijlstra | 029632f | 2011-10-25 10:00:11 +0200 | [diff] [blame] | 5933 |  | 
|  | 5934 | void free_fair_sched_group(struct task_group *tg) | 
|  | 5935 | { | 
|  | 5936 | int i; | 
|  | 5937 |  | 
|  | 5938 | destroy_cfs_bandwidth(tg_cfs_bandwidth(tg)); | 
|  | 5939 |  | 
|  | 5940 | for_each_possible_cpu(i) { | 
|  | 5941 | if (tg->cfs_rq) | 
|  | 5942 | kfree(tg->cfs_rq[i]); | 
|  | 5943 | if (tg->se) | 
|  | 5944 | kfree(tg->se[i]); | 
|  | 5945 | } | 
|  | 5946 |  | 
|  | 5947 | kfree(tg->cfs_rq); | 
|  | 5948 | kfree(tg->se); | 
|  | 5949 | } | 
|  | 5950 |  | 
|  | 5951 | int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | 
|  | 5952 | { | 
|  | 5953 | struct cfs_rq *cfs_rq; | 
|  | 5954 | struct sched_entity *se; | 
|  | 5955 | int i; | 
|  | 5956 |  | 
|  | 5957 | tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); | 
|  | 5958 | if (!tg->cfs_rq) | 
|  | 5959 | goto err; | 
|  | 5960 | tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL); | 
|  | 5961 | if (!tg->se) | 
|  | 5962 | goto err; | 
|  | 5963 |  | 
|  | 5964 | tg->shares = NICE_0_LOAD; | 
|  | 5965 |  | 
|  | 5966 | init_cfs_bandwidth(tg_cfs_bandwidth(tg)); | 
|  | 5967 |  | 
|  | 5968 | for_each_possible_cpu(i) { | 
|  | 5969 | cfs_rq = kzalloc_node(sizeof(struct cfs_rq), | 
|  | 5970 | GFP_KERNEL, cpu_to_node(i)); | 
|  | 5971 | if (!cfs_rq) | 
|  | 5972 | goto err; | 
|  | 5973 |  | 
|  | 5974 | se = kzalloc_node(sizeof(struct sched_entity), | 
|  | 5975 | GFP_KERNEL, cpu_to_node(i)); | 
|  | 5976 | if (!se) | 
|  | 5977 | goto err_free_rq; | 
|  | 5978 |  | 
|  | 5979 | init_cfs_rq(cfs_rq); | 
|  | 5980 | init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); | 
|  | 5981 | } | 
|  | 5982 |  | 
|  | 5983 | return 1; | 
|  | 5984 |  | 
|  | 5985 | err_free_rq: | 
|  | 5986 | kfree(cfs_rq); | 
|  | 5987 | err: | 
|  | 5988 | return 0; | 
|  | 5989 | } | 
|  | 5990 |  | 
|  | 5991 | void unregister_fair_sched_group(struct task_group *tg, int cpu) | 
|  | 5992 | { | 
|  | 5993 | struct rq *rq = cpu_rq(cpu); | 
|  | 5994 | unsigned long flags; | 
|  | 5995 |  | 
|  | 5996 | /* | 
|  | 5997 | * Only empty task groups can be destroyed, so we can speculatively | 
|  | 5998 | * check on_list without danger of it being re-added. | 
|  | 5999 | */ | 
|  | 6000 | if (!tg->cfs_rq[cpu]->on_list) | 
|  | 6001 | return; | 
|  | 6002 |  | 
|  | 6003 | raw_spin_lock_irqsave(&rq->lock, flags); | 
|  | 6004 | list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); | 
|  | 6005 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 
|  | 6006 | } | 
|  | 6007 |  | 
|  | 6008 | void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, | 
|  | 6009 | struct sched_entity *se, int cpu, | 
|  | 6010 | struct sched_entity *parent) | 
|  | 6011 | { | 
|  | 6012 | struct rq *rq = cpu_rq(cpu); | 
|  | 6013 |  | 
|  | 6014 | cfs_rq->tg = tg; | 
|  | 6015 | cfs_rq->rq = rq; | 
| Peter Zijlstra | 029632f | 2011-10-25 10:00:11 +0200 | [diff] [blame] | 6016 | init_cfs_rq_runtime(cfs_rq); | 
|  | 6017 |  | 
|  | 6018 | tg->cfs_rq[cpu] = cfs_rq; | 
|  | 6019 | tg->se[cpu] = se; | 
|  | 6020 |  | 
|  | 6021 | /* se could be NULL for root_task_group */ | 
|  | 6022 | if (!se) | 
|  | 6023 | return; | 
|  | 6024 |  | 
|  | 6025 | if (!parent) | 
|  | 6026 | se->cfs_rq = &rq->cfs; | 
|  | 6027 | else | 
|  | 6028 | se->cfs_rq = parent->my_q; | 
|  | 6029 |  | 
|  | 6030 | se->my_q = cfs_rq; | 
|  | 6031 | update_load_set(&se->load, 0); | 
|  | 6032 | se->parent = parent; | 
|  | 6033 | } | 
|  | 6034 |  | 
|  | 6035 | static DEFINE_MUTEX(shares_mutex); | 
|  | 6036 |  | 
|  | 6037 | int sched_group_set_shares(struct task_group *tg, unsigned long shares) | 
|  | 6038 | { | 
|  | 6039 | int i; | 
|  | 6040 | unsigned long flags; | 
|  | 6041 |  | 
|  | 6042 | /* | 
|  | 6043 | * We can't change the weight of the root cgroup. | 
|  | 6044 | */ | 
|  | 6045 | if (!tg->se[0]) | 
|  | 6046 | return -EINVAL; | 
|  | 6047 |  | 
|  | 6048 | shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES)); | 
|  | 6049 |  | 
|  | 6050 | mutex_lock(&shares_mutex); | 
|  | 6051 | if (tg->shares == shares) | 
|  | 6052 | goto done; | 
|  | 6053 |  | 
|  | 6054 | tg->shares = shares; | 
|  | 6055 | for_each_possible_cpu(i) { | 
|  | 6056 | struct rq *rq = cpu_rq(i); | 
|  | 6057 | struct sched_entity *se; | 
|  | 6058 |  | 
|  | 6059 | se = tg->se[i]; | 
|  | 6060 | /* Propagate contribution to hierarchy */ | 
|  | 6061 | raw_spin_lock_irqsave(&rq->lock, flags); | 
| Linus Torvalds | 17bc14b | 2012-12-14 07:20:43 -0800 | [diff] [blame] | 6062 | for_each_sched_entity(se) | 
| Peter Zijlstra | 029632f | 2011-10-25 10:00:11 +0200 | [diff] [blame] | 6063 | update_cfs_shares(group_cfs_rq(se)); | 
|  | 6064 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 
|  | 6065 | } | 
|  | 6066 |  | 
|  | 6067 | done: | 
|  | 6068 | mutex_unlock(&shares_mutex); | 
|  | 6069 | return 0; | 
|  | 6070 | } | 
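/*
 * Editorial sketch (standalone userspace C): the clamp in
 * sched_group_set_shares() keeps every group schedulable. Assuming the
 * common 32-bit load resolution where scale_load() is a no-op, with
 * MIN_SHARES=2 and MAX_SHARES=1<<18; the "SKETCH_"-prefixed names are
 * hypothetical stand-ins:
 */
#include <stdio.h>

#define SKETCH_MIN_SHARES	2UL
#define SKETCH_MAX_SHARES	(1UL << 18)

static unsigned long sketch_clamp_shares(unsigned long shares)
{
	if (shares < SKETCH_MIN_SHARES)
		return SKETCH_MIN_SHARES;
	if (shares > SKETCH_MAX_SHARES)
		return SKETCH_MAX_SHARES;
	return shares;
}

int main(void)
{
	/* "echo 0 > cpu.shares" still leaves the group a minimal weight */
	printf("%lu %lu %lu\n", sketch_clamp_shares(0),
	       sketch_clamp_shares(1024), sketch_clamp_shares(1UL << 20));
	return 0;
}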
|  | 6071 | #else /* CONFIG_FAIR_GROUP_SCHED */ | 
|  | 6072 |  | 
|  | 6073 | void free_fair_sched_group(struct task_group *tg) { } | 
|  | 6074 |  | 
|  | 6075 | int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | 
|  | 6076 | { | 
|  | 6077 | return 1; | 
|  | 6078 | } | 
|  | 6079 |  | 
|  | 6080 | void unregister_fair_sched_group(struct task_group *tg, int cpu) { } | 
|  | 6081 |  | 
|  | 6082 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 
|  | 6083 |  | 
| Peter Zijlstra | 810b381 | 2008-02-29 15:21:01 -0500 | [diff] [blame] | 6084 |  | 
| H Hartley Sweeten | 6d686f4 | 2010-01-13 20:21:52 -0700 | [diff] [blame] | 6085 | static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task) | 
| Peter Williams | 0d721ce | 2009-09-21 01:31:53 +0000 | [diff] [blame] | 6086 | { | 
|  | 6087 | struct sched_entity *se = &task->se; | 
| Peter Williams | 0d721ce | 2009-09-21 01:31:53 +0000 | [diff] [blame] | 6088 | unsigned int rr_interval = 0; | 
|  | 6089 |  | 
|  | 6090 | /* | 
|  | 6091 | * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise | 
|  | 6092 | * idle runqueue: | 
|  | 6093 | */ | 
| Peter Williams | 0d721ce | 2009-09-21 01:31:53 +0000 | [diff] [blame] | 6094 | if (rq->cfs.load.weight) | 
| Zhu Yanhai | a59f4e0 | 2013-01-08 12:56:52 +0800 | [diff] [blame] | 6095 | rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se)); | 
| Peter Williams | 0d721ce | 2009-09-21 01:31:53 +0000 | [diff] [blame] | 6096 |  | 
|  | 6097 | return rr_interval; | 
|  | 6098 | } | 
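/*
 * Editorial sketch: this hook backs the sched_rr_get_interval(2) system
 * call, so for SCHED_OTHER tasks userspace sees the current CFS slice
 * rather than a fixed round-robin quantum. Standalone usage:
 */
#include <stdio.h>
#include <sched.h>
#include <time.h>

int main(void)
{
	struct timespec ts;

	if (sched_rr_get_interval(0, &ts) == 0)	/* 0 = calling process */
		printf("slice: %ld.%09ld s\n", (long)ts.tv_sec, ts.tv_nsec);
	return 0;
}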
|  | 6099 |  | 
| Ingo Molnar | bf0f6f2 | 2007-07-09 18:51:58 +0200 | [diff] [blame] | 6100 | /* | 
|  | 6101 | * All the scheduling class methods: | 
|  | 6102 | */ | 
| Peter Zijlstra | 029632f | 2011-10-25 10:00:11 +0200 | [diff] [blame] | 6103 | const struct sched_class fair_sched_class = { | 
| Ingo Molnar | 5522d5d | 2007-10-15 17:00:12 +0200 | [diff] [blame] | 6104 | .next			= &idle_sched_class, | 
| Ingo Molnar | bf0f6f2 | 2007-07-09 18:51:58 +0200 | [diff] [blame] | 6105 | .enqueue_task		= enqueue_task_fair, | 
|  | 6106 | .dequeue_task		= dequeue_task_fair, | 
|  | 6107 | .yield_task		= yield_task_fair, | 
| Mike Galbraith | d95f412 | 2011-02-01 09:50:51 -0500 | [diff] [blame] | 6108 | .yield_to_task		= yield_to_task_fair, | 
| Ingo Molnar | bf0f6f2 | 2007-07-09 18:51:58 +0200 | [diff] [blame] | 6109 |  | 
| Ingo Molnar | 2e09bf5 | 2007-10-15 17:00:05 +0200 | [diff] [blame] | 6110 | .check_preempt_curr	= check_preempt_wakeup, | 
| Ingo Molnar | bf0f6f2 | 2007-07-09 18:51:58 +0200 | [diff] [blame] | 6111 |  | 
|  | 6112 | .pick_next_task		= pick_next_task_fair, | 
|  | 6113 | .put_prev_task		= put_prev_task_fair, | 
|  | 6114 |  | 
| Peter Williams | 681f3e6 | 2007-10-24 18:23:51 +0200 | [diff] [blame] | 6115 | #ifdef CONFIG_SMP | 
| Li Zefan | 4ce72a2 | 2008-10-22 15:25:26 +0800 | [diff] [blame] | 6116 | .select_task_rq		= select_task_rq_fair, | 
| Paul Turner | f4e26b1 | 2012-10-04 13:18:32 +0200 | [diff] [blame] | 6117 | #ifdef CONFIG_FAIR_GROUP_SCHED | 
| Paul Turner | 0a74bef | 2012-10-04 13:18:30 +0200 | [diff] [blame] | 6118 | .migrate_task_rq	= migrate_task_rq_fair, | 
| Paul Turner | f4e26b1 | 2012-10-04 13:18:32 +0200 | [diff] [blame] | 6119 | #endif | 
| Christian Ehrhardt | 0bcdcf2 | 2009-11-30 12:16:46 +0100 | [diff] [blame] | 6120 | .rq_online		= rq_online_fair, | 
|  | 6121 | .rq_offline		= rq_offline_fair, | 
| Peter Zijlstra | 88ec22d | 2009-12-16 18:04:41 +0100 | [diff] [blame] | 6122 |  | 
|  | 6123 | .task_waking		= task_waking_fair, | 
| Peter Williams | 681f3e6 | 2007-10-24 18:23:51 +0200 | [diff] [blame] | 6124 | #endif | 
| Ingo Molnar | bf0f6f2 | 2007-07-09 18:51:58 +0200 | [diff] [blame] | 6125 |  | 
| Srivatsa Vaddagiri | 83b699e | 2007-10-15 17:00:08 +0200 | [diff] [blame] | 6126 | .set_curr_task          = set_curr_task_fair, | 
| Ingo Molnar | bf0f6f2 | 2007-07-09 18:51:58 +0200 | [diff] [blame] | 6127 | .task_tick		= task_tick_fair, | 
| Peter Zijlstra | cd29fe6 | 2009-11-27 17:32:46 +0100 | [diff] [blame] | 6128 | .task_fork		= task_fork_fair, | 
| Steven Rostedt | cb46984 | 2008-01-25 21:08:22 +0100 | [diff] [blame] | 6129 |  | 
|  | 6130 | .prio_changed		= prio_changed_fair, | 
| Peter Zijlstra | da7a735 | 2011-01-17 17:03:27 +0100 | [diff] [blame] | 6131 | .switched_from		= switched_from_fair, | 
| Steven Rostedt | cb46984 | 2008-01-25 21:08:22 +0100 | [diff] [blame] | 6132 | .switched_to		= switched_to_fair, | 
| Peter Zijlstra | 810b381 | 2008-02-29 15:21:01 -0500 | [diff] [blame] | 6133 |  | 
| Peter Williams | 0d721ce | 2009-09-21 01:31:53 +0000 | [diff] [blame] | 6134 | .get_rr_interval	= get_rr_interval_fair, | 
|  | 6135 |  | 
| Peter Zijlstra | 810b381 | 2008-02-29 15:21:01 -0500 | [diff] [blame] | 6136 | #ifdef CONFIG_FAIR_GROUP_SCHED | 
| Peter Zijlstra | b2b5ce0 | 2010-10-15 15:24:15 +0200 | [diff] [blame] | 6137 | .task_move_group	= task_move_group_fair, | 
| Peter Zijlstra | 810b381 | 2008-02-29 15:21:01 -0500 | [diff] [blame] | 6138 | #endif | 
| Ingo Molnar | bf0f6f2 | 2007-07-09 18:51:58 +0200 | [diff] [blame] | 6139 | }; | 
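/*
 * Editorial sketch (standalone userspace C): the core scheduler treats
 * each policy as a method table and walks the tables in priority order
 * via ->next (stop, rt, fair, idle in this era), taking the first class
 * with a runnable task. The struct and pick() methods below are
 * hypothetical renderings of that dispatch pattern.
 */
#include <stdio.h>
#include <stddef.h>

struct klass {
	const char *name;
	const struct klass *next;
	int (*pick)(void);		/* returns a "task", or -1 for none */
};

static int pick_none(void) { return -1; }
static int pick_fair(void) { return 42; }

static const struct klass idle_k = { "idle", NULL,    pick_none };
static const struct klass fair_k = { "fair", &idle_k, pick_fair };
static const struct klass rt_k   = { "rt",   &fair_k, pick_none };

int main(void)
{
	const struct klass *k;

	for (k = &rt_k; k; k = k->next) {	/* like for_each_class() */
		int task = k->pick();
		if (task >= 0) {
			printf("picked %d from %s\n", task, k->name);
			break;
		}
	}
	return 0;
}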
|  | 6140 |  | 
|  | 6141 | #ifdef CONFIG_SCHED_DEBUG | 
| Peter Zijlstra | 029632f | 2011-10-25 10:00:11 +0200 | [diff] [blame] | 6142 | void print_cfs_stats(struct seq_file *m, int cpu) | 
| Ingo Molnar | bf0f6f2 | 2007-07-09 18:51:58 +0200 | [diff] [blame] | 6143 | { | 
| Ingo Molnar | bf0f6f2 | 2007-07-09 18:51:58 +0200 | [diff] [blame] | 6144 | struct cfs_rq *cfs_rq; | 
|  | 6145 |  | 
| Peter Zijlstra | 5973e5b | 2008-01-25 21:08:34 +0100 | [diff] [blame] | 6146 | rcu_read_lock(); | 
| Ingo Molnar | c3b64f1 | 2007-08-09 11:16:51 +0200 | [diff] [blame] | 6147 | for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) | 
| Ingo Molnar | 5cef9ec | 2007-08-09 11:16:47 +0200 | [diff] [blame] | 6148 | print_cfs_rq(m, cpu, cfs_rq); | 
| Peter Zijlstra | 5973e5b | 2008-01-25 21:08:34 +0100 | [diff] [blame] | 6149 | rcu_read_unlock(); | 
| Ingo Molnar | bf0f6f2 | 2007-07-09 18:51:58 +0200 | [diff] [blame] | 6150 | } | 
|  | 6151 | #endif | 
| Peter Zijlstra | 029632f | 2011-10-25 10:00:11 +0200 | [diff] [blame] | 6152 |  | 
|  | 6153 | __init void init_sched_fair_class(void) | 
|  | 6154 | { | 
|  | 6155 | #ifdef CONFIG_SMP | 
|  | 6156 | open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); | 
|  | 6157 |  | 
|  | 6158 | #ifdef CONFIG_NO_HZ | 
| Diwakar Tundlam | 554ceca | 2012-03-07 14:44:26 -0800 | [diff] [blame] | 6159 | nohz.next_balance = jiffies; | 
| Peter Zijlstra | 029632f | 2011-10-25 10:00:11 +0200 | [diff] [blame] | 6160 | zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); | 
| Suresh Siddha | 7132596 | 2012-01-19 18:28:57 -0800 | [diff] [blame] | 6161 | cpu_notifier(sched_ilb_notifier, 0); | 
| Peter Zijlstra | 029632f | 2011-10-25 10:00:11 +0200 | [diff] [blame] | 6162 | #endif | 
|  | 6163 | #endif /* SMP */ | 
|  | 6164 |  | 
|  | 6165 | } |