Blame - kernel/sched/fair.c - android_kernel_oneplus_msm8996

blob: 0bb3e0aa110b21ea664a9c0a0f6224185c34fcd5 [file] [log] [blame]

Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1	/*
				2	* Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
				3	*
				4	* Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
				5	*
				6	* Interactivity improvements by Mike Galbraith
				7	* (C) 2007 Mike Galbraith <efault@gmx.de>
				8	*
				9	* Various enhancements by Dmitry Adamushko.
				10	* (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
				11	*
				12	* Group scheduling enhancements by Srivatsa Vaddagiri
				13	* Copyright IBM Corporation, 2007
				14	* Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
				15	*
				16	* Scaled math optimizations by Thomas Gleixner
				17	* Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
Peter Zijlstra	2180508	2007-08-25 18:41:53 +0200	[diff] [blame]	18	*
				19	* Adaptive scheduling granularity, math enhancements by Peter Zijlstra
				20	* Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	21	*/
				22
Arjan van de Ven	9745512	2008-01-25 21:08:34 +0100	[diff] [blame]	23	#include <linux/latencytop.h>
Christian Ehrhardt	1983a92	2009-11-30 12:16:47 +0100	[diff] [blame]	24	#include <linux/sched.h>
Sisir Koppaka	3436ae1	2011-03-26 18:22:55 +0530	[diff] [blame]	25	#include <linux/cpumask.h>
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	26	#include <linux/slab.h>
				27	#include <linux/profile.h>
				28	#include <linux/interrupt.h>
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	29	#include <linux/mempolicy.h>
Mel Gorman	e14808b	2012-11-19 10:59:15 +0000	[diff] [blame]	30	#include <linux/migrate.h>
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	31	#include <linux/task_work.h>
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	32
				33	#include <trace/events/sched.h>
				34
				35	#include "sched.h"
Arjan van de Ven	9745512	2008-01-25 21:08:34 +0100	[diff] [blame]	36
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	37	/*
Peter Zijlstra	2180508	2007-08-25 18:41:53 +0200	[diff] [blame]	38	* Targeted preemption latency for CPU-bound tasks:
Takuya Yoshikawa	864616e	2010-10-14 16:09:13 +0900	[diff] [blame]	39	* (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	40	*
Peter Zijlstra	2180508	2007-08-25 18:41:53 +0200	[diff] [blame]	41	* NOTE: this latency value is not the same as the concept of
Ingo Molnar	d274a4c	2007-10-15 17:00:14 +0200	[diff] [blame]	42	* 'timeslice length' - timeslices in CFS are of variable length
				43	* and have no persistent notion like in traditional, time-slice
				44	* based scheduling concepts.
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	45	*
Ingo Molnar	d274a4c	2007-10-15 17:00:14 +0200	[diff] [blame]	46	* (to see the precise effective timeslice length of your workload,
				47	* run vmstat and monitor the context-switches (cs) field)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	48	*/
Mike Galbraith	2140692	2010-03-11 17:17:15 +0100	[diff] [blame]	49	unsigned int sysctl_sched_latency = 6000000ULL;
				50	unsigned int normalized_sysctl_sched_latency = 6000000ULL;
Ingo Molnar	2bd8e6d	2007-10-15 17:00:02 +0200	[diff] [blame]	51
				52	/*
Christian Ehrhardt	1983a92	2009-11-30 12:16:47 +0100	[diff] [blame]	53	* The initial- and re-scaling of tunables is configurable
				54	* (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
				55	*
				56	* Options are:
				57	* SCHED_TUNABLESCALING_NONE - unscaled, always *1
				58	* SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)
				59	* SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
				60	*/
				61	enum sched_tunable_scaling sysctl_sched_tunable_scaling
				62	= SCHED_TUNABLESCALING_LOG;
				63
				64	/*
Peter Zijlstra	b2be5e9	2007-11-09 22:39:37 +0100	[diff] [blame]	65	* Minimal preemption granularity for CPU-bound tasks:
Takuya Yoshikawa	864616e	2010-10-14 16:09:13 +0900	[diff] [blame]	66	* (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
Peter Zijlstra	b2be5e9	2007-11-09 22:39:37 +0100	[diff] [blame]	67	*/
Ingo Molnar	0bf377b	2010-09-12 08:14:52 +0200	[diff] [blame]	68	unsigned int sysctl_sched_min_granularity = 750000ULL;
				69	unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
Peter Zijlstra	b2be5e9	2007-11-09 22:39:37 +0100	[diff] [blame]	70
				71	/*
				72	* is kept at sysctl_sched_latency / sysctl_sched_min_granularity
				73	*/
Ingo Molnar	0bf377b	2010-09-12 08:14:52 +0200	[diff] [blame]	74	static unsigned int sched_nr_latency = 8;
Peter Zijlstra	b2be5e9	2007-11-09 22:39:37 +0100	[diff] [blame]	75
				76	/*
Mike Galbraith	2bba22c	2009-09-09 15:41:37 +0200	[diff] [blame]	77	* After fork, child runs first. If set to 0 (default) then
Ingo Molnar	2bd8e6d	2007-10-15 17:00:02 +0200	[diff] [blame]	78	* parent will (try to) run first.
				79	*/
Mike Galbraith	2bba22c	2009-09-09 15:41:37 +0200	[diff] [blame]	80	unsigned int sysctl_sched_child_runs_first __read_mostly;
Peter Zijlstra	2180508	2007-08-25 18:41:53 +0200	[diff] [blame]	81
				82	/*
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	83	* SCHED_OTHER wake-up granularity.
Mike Galbraith	172e082	2009-09-09 15:41:37 +0200	[diff] [blame]	84	* (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	85	*
				86	* This option delays the preemption effects of decoupled workloads
				87	* and reduces their over-scheduling. Synchronous workloads will still
				88	* have immediate wakeup/sleep latencies.
				89	*/
Mike Galbraith	172e082	2009-09-09 15:41:37 +0200	[diff] [blame]	90	unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
Christian Ehrhardt	0bcdcf2	2009-11-30 12:16:46 +0100	[diff] [blame]	91	unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	92
Ingo Molnar	da84d96	2007-10-15 17:00:18 +0200	[diff] [blame]	93	const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
				94
Paul Turner	a7a4f8a	2010-11-15 15:47:06 -0800	[diff] [blame]	95	/*
				96	* The exponential sliding window over which load is averaged for shares
				97	* distribution.
				98	* (default: 10msec)
				99	*/
				100	unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
				101
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	102	#ifdef CONFIG_CFS_BANDWIDTH
				103	/*
				104	* Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
				105	* each time a cfs_rq requests quota.
				106	*
				107	* Note: in the case that the slice exceeds the runtime remaining (either due
				108	* to consumption or the quota being specified to be smaller than the slice)
				109	* we will always only issue the remaining available time.
				110	*
				111	* default: 5 msec, units: microseconds
				112	*/
				113	unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
				114	#endif
				115
Paul Gortmaker	8527632	2013-04-19 15:10:50 -0400	[diff] [blame]	116	static inline void update_load_add(struct load_weight *lw, unsigned long inc)
				117	{
				118	lw->weight += inc;
				119	lw->inv_weight = 0;
				120	}
				121
				122	static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
				123	{
				124	lw->weight -= dec;
				125	lw->inv_weight = 0;
				126	}
				127
				128	static inline void update_load_set(struct load_weight *lw, unsigned long w)
				129	{
				130	lw->weight = w;
				131	lw->inv_weight = 0;
				132	}
				133
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	134	/*
				135	* Increase the granularity value when there are more CPUs,
				136	* because with more CPUs the 'effective latency' as visible
				137	* to users decreases. But the relationship is not linear,
				138	* so pick a second-best guess by going with the log2 of the
				139	* number of CPUs.
				140	*
				141	* This idea comes from the SD scheduler of Con Kolivas:
				142	*/
				143	static int get_update_sysctl_factor(void)
				144	{
				145	unsigned int cpus = min_t(int, num_online_cpus(), 8);
				146	unsigned int factor;
				147
				148	switch (sysctl_sched_tunable_scaling) {
				149	case SCHED_TUNABLESCALING_NONE:
				150	factor = 1;
				151	break;
				152	case SCHED_TUNABLESCALING_LINEAR:
				153	factor = cpus;
				154	break;
				155	case SCHED_TUNABLESCALING_LOG:
				156	default:
				157	factor = 1 + ilog2(cpus);
				158	break;
				159	}
				160
				161	return factor;
				162	}
				163
				164	static void update_sysctl(void)
				165	{
				166	unsigned int factor = get_update_sysctl_factor();
				167
				168	#define SET_SYSCTL(name) \
				169	(sysctl_##name = (factor) * normalized_sysctl_##name)
				170	SET_SYSCTL(sched_min_granularity);
				171	SET_SYSCTL(sched_latency);
				172	SET_SYSCTL(sched_wakeup_granularity);
				173	#undef SET_SYSCTL
				174	}
				175
				176	void sched_init_granularity(void)
				177	{
				178	update_sysctl();
				179	}
				180
				181	#if BITS_PER_LONG == 32
				182	# define WMULT_CONST (~0UL)
				183	#else
				184	# define WMULT_CONST (1UL << 32)
				185	#endif
				186
				187	#define WMULT_SHIFT 32
				188
				189	/*
				190	* Shift right and round:
				191	*/
				192	#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
				193
				194	/*
				195	* delta *= weight / lw
				196	*/
				197	static unsigned long
				198	calc_delta_mine(unsigned long delta_exec, unsigned long weight,
				199	struct load_weight *lw)
				200	{
				201	u64 tmp;
				202
				203	/*
				204	* weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
				205	* entities since MIN_SHARES = 2. Treat weight as 1 if less than
				206	* 2^SCHED_LOAD_RESOLUTION.
				207	*/
				208	if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
				209	tmp = (u64)delta_exec * scale_load_down(weight);
				210	else
				211	tmp = (u64)delta_exec;
				212
				213	if (!lw->inv_weight) {
				214	unsigned long w = scale_load_down(lw->weight);
				215
				216	if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
				217	lw->inv_weight = 1;
				218	else if (unlikely(!w))
				219	lw->inv_weight = WMULT_CONST;
				220	else
				221	lw->inv_weight = WMULT_CONST / w;
				222	}
				223
				224	/*
				225	* Check whether we'd overflow the 64-bit multiplication:
				226	*/
				227	if (unlikely(tmp > WMULT_CONST))
				228	tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
				229	WMULT_SHIFT/2);
				230	else
				231	tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
				232
				233	return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
				234	}
				235
				236
				237	const struct sched_class fair_sched_class;
Peter Zijlstra	a4c2f00	2008-10-17 19:27:03 +0200	[diff] [blame]	238
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	239	/**************************************************************
				240	* CFS operations on generic schedulable entities:
				241	*/
				242
				243	#ifdef CONFIG_FAIR_GROUP_SCHED
				244
				245	/* cpu runqueue to which this cfs_rq is attached */
				246	static inline struct rq rq_of(struct cfs_rq cfs_rq)
				247	{
				248	return cfs_rq->rq;
				249	}
				250
/* An entity is a task if it doesn't "own" a runqueue */
#define entity_is_task(se)	(!(se)->my_q)
				253
Peter Zijlstra	8f48894	2009-07-24 12:25:30 +0200	[diff] [blame]	254	static inline struct task_struct task_of(struct sched_entity se)
				255	{
				256	#ifdef CONFIG_SCHED_DEBUG
				257	WARN_ON_ONCE(!entity_is_task(se));
				258	#endif
				259	return container_of(se, struct task_struct, se);
				260	}
				261
/*
 * Walk up the scheduling entity hierarchy. @se must be a modifiable lvalue;
 * it is advanced to its parent on each iteration and is NULL when the loop
 * runs to completion.
 */
#define for_each_sched_entity(se) \
		for (; (se); (se) = (se)->parent)
				265
				266	static inline struct cfs_rq task_cfs_rq(struct task_struct p)
				267	{
				268	return p->se.cfs_rq;
				269	}
				270
				271	/* runqueue on which this entity is (to be) queued */
				272	static inline struct cfs_rq cfs_rq_of(struct sched_entity se)
				273	{
				274	return se->cfs_rq;
				275	}
				276
				277	/* runqueue "owned" by this group */
				278	static inline struct cfs_rq group_cfs_rq(struct sched_entity grp)
				279	{
				280	return grp->my_q;
				281	}
				282
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	283	static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
				284	int force_update);
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	285
Peter Zijlstra	3d4b47b	2010-11-15 15:47:01 -0800	[diff] [blame]	286	static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
				287	{
				288	if (!cfs_rq->on_list) {
Paul Turner	67e8625	2010-11-15 15:47:05 -0800	[diff] [blame]	289	/*
				290	* Ensure we either appear before our parent (if already
				291	* enqueued) or force our parent to appear after us when it is
				292	* enqueued. The fact that we always enqueue bottom-up
				293	* reduces this to two cases.
				294	*/
				295	if (cfs_rq->tg->parent &&
				296	cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
				297	list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
Peter Zijlstra	3d4b47b	2010-11-15 15:47:01 -0800	[diff] [blame]	298	&rq_of(cfs_rq)->leaf_cfs_rq_list);
Paul Turner	67e8625	2010-11-15 15:47:05 -0800	[diff] [blame]	299	} else {
				300	list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
				301	&rq_of(cfs_rq)->leaf_cfs_rq_list);
				302	}
Peter Zijlstra	3d4b47b	2010-11-15 15:47:01 -0800	[diff] [blame]	303
				304	cfs_rq->on_list = 1;
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	305	/* We should have no load, but we need to update last_decay. */
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	306	update_cfs_rq_blocked_load(cfs_rq, 0);
Peter Zijlstra	3d4b47b	2010-11-15 15:47:01 -0800	[diff] [blame]	307	}
				308	}
				309
				310	static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
				311	{
				312	if (cfs_rq->on_list) {
				313	list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
				314	cfs_rq->on_list = 0;
				315	}
				316	}
				317
/* Iterate through all leaf cfs_rq's on a runqueue */
#define for_each_leaf_cfs_rq(rq, cfs_rq) \
	list_for_each_entry_rcu(cfs_rq, &(rq)->leaf_cfs_rq_list, leaf_cfs_rq_list)
				321
				322	/* Do the two (enqueued) entities belong to the same group ? */
				323	static inline int
				324	is_same_group(struct sched_entity se, struct sched_entity pse)
				325	{
				326	if (se->cfs_rq == pse->cfs_rq)
				327	return 1;
				328
				329	return 0;
				330	}
				331
				332	static inline struct sched_entity parent_entity(struct sched_entity se)
				333	{
				334	return se->parent;
				335	}
				336
Peter Zijlstra	464b752	2008-10-24 11:06:15 +0200	[diff] [blame]	337	/* return depth at which a sched entity is present in the hierarchy */
				338	static inline int depth_se(struct sched_entity *se)
				339	{
				340	int depth = 0;
				341
				342	for_each_sched_entity(se)
				343	depth++;
				344
				345	return depth;
				346	}
				347
				348	static void
				349	find_matching_se(struct sched_entity se, struct sched_entity pse)
				350	{
				351	int se_depth, pse_depth;
				352
				353	/*
				354	* preemption test can be made between sibling entities who are in the
				355	* same cfs_rq i.e who have a common parent. Walk up the hierarchy of
				356	* both tasks until we find their ancestors who are siblings of common
				357	* parent.
				358	*/
				359
				360	/* First walk up until both entities are at same depth */
				361	se_depth = depth_se(*se);
				362	pse_depth = depth_se(*pse);
				363
				364	while (se_depth > pse_depth) {
				365	se_depth--;
				366	se = parent_entity(se);
				367	}
				368
				369	while (pse_depth > se_depth) {
				370	pse_depth--;
				371	pse = parent_entity(pse);
				372	}
				373
				374	while (!is_same_group(se, pse)) {
				375	se = parent_entity(se);
				376	pse = parent_entity(pse);
				377	}
				378	}
				379
Peter Zijlstra	8f48894	2009-07-24 12:25:30 +0200	[diff] [blame]	380	#else /* !CONFIG_FAIR_GROUP_SCHED */
				381
				382	static inline struct task_struct task_of(struct sched_entity se)
				383	{
				384	return container_of(se, struct task_struct, se);
				385	}
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	386
				387	static inline struct rq rq_of(struct cfs_rq cfs_rq)
				388	{
				389	return container_of(cfs_rq, struct rq, cfs);
				390	}
				391
/* Without group scheduling every entity is treated as a task. */
#define entity_is_task(se)	1
				393
/*
 * Without group scheduling there is no hierarchy: the body runs at most once
 * and @se (which must be a modifiable lvalue) is set to NULL afterwards.
 */
#define for_each_sched_entity(se) \
		for (; (se); (se) = NULL)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	396
Peter Zijlstra	b758149	2008-04-19 19:45:00 +0200	[diff] [blame]	397	static inline struct cfs_rq task_cfs_rq(struct task_struct p)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	398	{
Peter Zijlstra	b758149	2008-04-19 19:45:00 +0200	[diff] [blame]	399	return &task_rq(p)->cfs;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	400	}
				401
Peter Zijlstra	b758149	2008-04-19 19:45:00 +0200	[diff] [blame]	402	static inline struct cfs_rq cfs_rq_of(struct sched_entity se)
				403	{
				404	struct task_struct *p = task_of(se);
				405	struct rq *rq = task_rq(p);
				406
				407	return &rq->cfs;
				408	}
				409
				410	/* runqueue "owned" by this group */
				411	static inline struct cfs_rq group_cfs_rq(struct sched_entity grp)
				412	{
				413	return NULL;
				414	}
				415
/* No-op stub used when CONFIG_FAIR_GROUP_SCHED is not set. */
static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
}
				419
				420	static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
				421	{
				422	}
				423
/* Visit the single cfs_rq embedded in @rq exactly once. */
#define for_each_leaf_cfs_rq(rq, cfs_rq) \
		for (cfs_rq = &(rq)->cfs; cfs_rq; cfs_rq = NULL)
				426
				427	static inline int
				428	is_same_group(struct sched_entity se, struct sched_entity pse)
				429	{
				430	return 1;
				431	}
				432
				433	static inline struct sched_entity parent_entity(struct sched_entity se)
				434	{
				435	return NULL;
				436	}
				437
/* No-op stub: *@se and *@pse are left untouched. */
static inline void
find_matching_se(struct sched_entity **se, struct sched_entity **pse)
{
}
				442
Peter Zijlstra	b758149	2008-04-19 19:45:00 +0200	[diff] [blame]	443	#endif /* CONFIG_FAIR_GROUP_SCHED */
				444
Peter Zijlstra	6c16a6d	2012-03-21 13:07:16 -0700	[diff] [blame]	445	static __always_inline
				446	void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	447
				448	/**************************************************************
				449	* Scheduling class tree data structure manipulation methods:
				450	*/
				451
Andrei Epure	1bf0823	2013-03-12 21:12:24 +0200	[diff] [blame]	452	static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
Peter Zijlstra	02e0431	2007-10-15 17:00:07 +0200	[diff] [blame]	453	{
Andrei Epure	1bf0823	2013-03-12 21:12:24 +0200	[diff] [blame]	454	s64 delta = (s64)(vruntime - max_vruntime);
Peter Zijlstra	368059a	2007-10-15 17:00:11 +0200	[diff] [blame]	455	if (delta > 0)
Andrei Epure	1bf0823	2013-03-12 21:12:24 +0200	[diff] [blame]	456	max_vruntime = vruntime;
Peter Zijlstra	02e0431	2007-10-15 17:00:07 +0200	[diff] [blame]	457
Andrei Epure	1bf0823	2013-03-12 21:12:24 +0200	[diff] [blame]	458	return max_vruntime;
Peter Zijlstra	02e0431	2007-10-15 17:00:07 +0200	[diff] [blame]	459	}
				460
Ingo Molnar	0702e3e	2007-10-15 17:00:14 +0200	[diff] [blame]	461	static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
Peter Zijlstra	b0ffd24	2007-10-15 17:00:12 +0200	[diff] [blame]	462	{
				463	s64 delta = (s64)(vruntime - min_vruntime);
				464	if (delta < 0)
				465	min_vruntime = vruntime;
				466
				467	return min_vruntime;
				468	}
				469
Fabio Checconi	54fdc58	2009-07-16 12:32:27 +0200	[diff] [blame]	470	static inline int entity_before(struct sched_entity *a,
				471	struct sched_entity *b)
				472	{
				473	return (s64)(a->vruntime - b->vruntime) < 0;
				474	}
				475
Peter Zijlstra	1af5f73	2008-10-24 11:06:13 +0200	[diff] [blame]	476	static void update_min_vruntime(struct cfs_rq *cfs_rq)
				477	{
				478	u64 vruntime = cfs_rq->min_vruntime;
				479
				480	if (cfs_rq->curr)
				481	vruntime = cfs_rq->curr->vruntime;
				482
				483	if (cfs_rq->rb_leftmost) {
				484	struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
				485	struct sched_entity,
				486	run_node);
				487
Peter Zijlstra	e17036d	2009-01-15 14:53:39 +0100	[diff] [blame]	488	if (!cfs_rq->curr)
Peter Zijlstra	1af5f73	2008-10-24 11:06:13 +0200	[diff] [blame]	489	vruntime = se->vruntime;
				490	else
				491	vruntime = min_vruntime(vruntime, se->vruntime);
				492	}
				493
Andrei Epure	1bf0823	2013-03-12 21:12:24 +0200	[diff] [blame]	494	/* ensure we never gain time by being placed backwards. */
Peter Zijlstra	1af5f73	2008-10-24 11:06:13 +0200	[diff] [blame]	495	cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
Peter Zijlstra	3fe1698	2011-04-05 17:23:48 +0200	[diff] [blame]	496	#ifndef CONFIG_64BIT
				497	smp_wmb();
				498	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
				499	#endif
Peter Zijlstra	1af5f73	2008-10-24 11:06:13 +0200	[diff] [blame]	500	}
				501
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	502	/*
				503	* Enqueue an entity into the rb-tree:
				504	*/
Ingo Molnar	0702e3e	2007-10-15 17:00:14 +0200	[diff] [blame]	505	static void __enqueue_entity(struct cfs_rq cfs_rq, struct sched_entity se)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	506	{
				507	struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
				508	struct rb_node *parent = NULL;
				509	struct sched_entity *entry;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	510	int leftmost = 1;
				511
				512	/*
				513	* Find the right place in the rbtree:
				514	*/
				515	while (*link) {
				516	parent = *link;
				517	entry = rb_entry(parent, struct sched_entity, run_node);
				518	/*
				519	* We dont care about collisions. Nodes with
				520	* the same key stay together.
				521	*/
Stephan Baerwolf	2bd2d6f	2011-07-20 14:46:59 +0200	[diff] [blame]	522	if (entity_before(se, entry)) {
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	523	link = &parent->rb_left;
				524	} else {
				525	link = &parent->rb_right;
				526	leftmost = 0;
				527	}
				528	}
				529
				530	/*
				531	* Maintain a cache of leftmost tree entries (it is frequently
				532	* used):
				533	*/
Peter Zijlstra	1af5f73	2008-10-24 11:06:13 +0200	[diff] [blame]	534	if (leftmost)
Ingo Molnar	57cb499	2007-10-15 17:00:11 +0200	[diff] [blame]	535	cfs_rq->rb_leftmost = &se->run_node;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	536
				537	rb_link_node(&se->run_node, parent, link);
				538	rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	539	}
				540
Ingo Molnar	0702e3e	2007-10-15 17:00:14 +0200	[diff] [blame]	541	static void __dequeue_entity(struct cfs_rq cfs_rq, struct sched_entity se)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	542	{
Peter Zijlstra	3fe6974	2008-03-14 20:55:51 +0100	[diff] [blame]	543	if (cfs_rq->rb_leftmost == &se->run_node) {
				544	struct rb_node *next_node;
Peter Zijlstra	3fe6974	2008-03-14 20:55:51 +0100	[diff] [blame]	545
				546	next_node = rb_next(&se->run_node);
				547	cfs_rq->rb_leftmost = next_node;
Peter Zijlstra	3fe6974	2008-03-14 20:55:51 +0100	[diff] [blame]	548	}
Ingo Molnar	e9acbff	2007-10-15 17:00:04 +0200	[diff] [blame]	549
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	550	rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	551	}
				552
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	553	struct sched_entity __pick_first_entity(struct cfs_rq cfs_rq)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	554	{
Peter Zijlstra	f4b6755	2008-11-04 21:25:07 +0100	[diff] [blame]	555	struct rb_node *left = cfs_rq->rb_leftmost;
				556
				557	if (!left)
				558	return NULL;
				559
				560	return rb_entry(left, struct sched_entity, run_node);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	561	}
				562
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	563	static struct sched_entity __pick_next_entity(struct sched_entity se)
				564	{
				565	struct rb_node *next = rb_next(&se->run_node);
				566
				567	if (!next)
				568	return NULL;
				569
				570	return rb_entry(next, struct sched_entity, run_node);
				571	}
				572
				573	#ifdef CONFIG_SCHED_DEBUG
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	574	struct sched_entity __pick_last_entity(struct cfs_rq cfs_rq)
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	575	{
Ingo Molnar	7eee3e6	2008-02-22 10:32:21 +0100	[diff] [blame]	576	struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	577
Balbir Singh	70eee74	2008-02-22 13:25:53 +0530	[diff] [blame]	578	if (!last)
				579	return NULL;
Ingo Molnar	7eee3e6	2008-02-22 10:32:21 +0100	[diff] [blame]	580
				581	return rb_entry(last, struct sched_entity, run_node);
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	582	}
				583
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	584	/**************************************************************
				585	* Scheduling class statistics methods:
				586	*/
				587
Christian Ehrhardt	acb4a84	2009-11-30 12:16:48 +0100	[diff] [blame]	588	int sched_proc_update_handler(struct ctl_table *table, int write,
Alexey Dobriyan	8d65af7	2009-09-23 15:57:19 -0700	[diff] [blame]	589	void __user buffer, size_t lenp,
Peter Zijlstra	b2be5e9	2007-11-09 22:39:37 +0100	[diff] [blame]	590	loff_t *ppos)
				591	{
Alexey Dobriyan	8d65af7	2009-09-23 15:57:19 -0700	[diff] [blame]	592	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
Christian Ehrhardt	acb4a84	2009-11-30 12:16:48 +0100	[diff] [blame]	593	int factor = get_update_sysctl_factor();
Peter Zijlstra	b2be5e9	2007-11-09 22:39:37 +0100	[diff] [blame]	594
				595	if (ret \|\| !write)
				596	return ret;
				597
				598	sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
				599	sysctl_sched_min_granularity);
				600
Christian Ehrhardt	acb4a84	2009-11-30 12:16:48 +0100	[diff] [blame]	601	#define WRT_SYSCTL(name) \
				602	(normalized_sysctl_##name = sysctl_##name / (factor))
				603	WRT_SYSCTL(sched_min_granularity);
				604	WRT_SYSCTL(sched_latency);
				605	WRT_SYSCTL(sched_wakeup_granularity);
Christian Ehrhardt	acb4a84	2009-11-30 12:16:48 +0100	[diff] [blame]	606	#undef WRT_SYSCTL
				607
Peter Zijlstra	b2be5e9	2007-11-09 22:39:37 +0100	[diff] [blame]	608	return 0;
				609	}
				610	#endif
Ingo Molnar	647e7ca	2007-10-15 17:00:13 +0200	[diff] [blame]	611
				612	/*
Peter Zijlstra	f9c0b09	2008-10-17 19:27:04 +0200	[diff] [blame]	613	* delta /= w
Peter Zijlstra	a7be37a	2008-06-27 13:41:11 +0200	[diff] [blame]	614	*/
				615	static inline unsigned long
				616	calc_delta_fair(unsigned long delta, struct sched_entity *se)
				617	{
Peter Zijlstra	f9c0b09	2008-10-17 19:27:04 +0200	[diff] [blame]	618	if (unlikely(se->load.weight != NICE_0_LOAD))
				619	delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load);
Peter Zijlstra	a7be37a	2008-06-27 13:41:11 +0200	[diff] [blame]	620
				621	return delta;
				622	}
				623
				624	/*
Ingo Molnar	647e7ca	2007-10-15 17:00:13 +0200	[diff] [blame]	625	* The idea is to set a period in which each task runs once.
				626	*
Borislav Petkov	532b185	2012-08-08 16:16:04 +0200	[diff] [blame]	627	* When there are too many tasks (sched_nr_latency) we have to stretch
Ingo Molnar	647e7ca	2007-10-15 17:00:13 +0200	[diff] [blame]	628	* this period because otherwise the slices get too small.
				629	*
				630	* p = (nr <= nl) ? l : l*nr/nl
				631	*/
Peter Zijlstra	4d78e7b	2007-10-15 17:00:04 +0200	[diff] [blame]	632	static u64 __sched_period(unsigned long nr_running)
				633	{
				634	u64 period = sysctl_sched_latency;
Peter Zijlstra	b2be5e9	2007-11-09 22:39:37 +0100	[diff] [blame]	635	unsigned long nr_latency = sched_nr_latency;
Peter Zijlstra	4d78e7b	2007-10-15 17:00:04 +0200	[diff] [blame]	636
				637	if (unlikely(nr_running > nr_latency)) {
Peter Zijlstra	4bf0b77	2008-01-25 21:08:21 +0100	[diff] [blame]	638	period = sysctl_sched_min_granularity;
Peter Zijlstra	4d78e7b	2007-10-15 17:00:04 +0200	[diff] [blame]	639	period *= nr_running;
Peter Zijlstra	4d78e7b	2007-10-15 17:00:04 +0200	[diff] [blame]	640	}
				641
				642	return period;
				643	}
				644
Ingo Molnar	647e7ca	2007-10-15 17:00:13 +0200	[diff] [blame]	645	/*
				646	* We calculate the wall-time slice from the period by taking a part
				647	* proportional to the weight.
				648	*
Peter Zijlstra	f9c0b09	2008-10-17 19:27:04 +0200	[diff] [blame]	649	* s = p*P[w/rw]
Ingo Molnar	647e7ca	2007-10-15 17:00:13 +0200	[diff] [blame]	650	*/
Peter Zijlstra	6d0f0eb	2007-10-15 17:00:05 +0200	[diff] [blame]	651	static u64 sched_slice(struct cfs_rq cfs_rq, struct sched_entity se)
Peter Zijlstra	2180508	2007-08-25 18:41:53 +0200	[diff] [blame]	652	{
Mike Galbraith	0a58244	2009-01-02 12:16:42 +0100	[diff] [blame]	653	u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
Peter Zijlstra	f9c0b09	2008-10-17 19:27:04 +0200	[diff] [blame]	654
Mike Galbraith	0a58244	2009-01-02 12:16:42 +0100	[diff] [blame]	655	for_each_sched_entity(se) {
Lin Ming	6272d68	2009-01-15 17:17:15 +0100	[diff] [blame]	656	struct load_weight *load;
Christian Engelmayer	3104bf0	2009-06-16 10:35:12 +0200	[diff] [blame]	657	struct load_weight lw;
Lin Ming	6272d68	2009-01-15 17:17:15 +0100	[diff] [blame]	658
				659	cfs_rq = cfs_rq_of(se);
				660	load = &cfs_rq->load;
Peter Zijlstra	f9c0b09	2008-10-17 19:27:04 +0200	[diff] [blame]	661
Mike Galbraith	0a58244	2009-01-02 12:16:42 +0100	[diff] [blame]	662	if (unlikely(!se->on_rq)) {
Christian Engelmayer	3104bf0	2009-06-16 10:35:12 +0200	[diff] [blame]	663	lw = cfs_rq->load;
Mike Galbraith	0a58244	2009-01-02 12:16:42 +0100	[diff] [blame]	664
				665	update_load_add(&lw, se->load.weight);
				666	load = &lw;
				667	}
				668	slice = calc_delta_mine(slice, se->load.weight, load);
				669	}
				670	return slice;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	671	}
				672
Ingo Molnar	647e7ca	2007-10-15 17:00:13 +0200	[diff] [blame]	673	/*
Andrei Epure	660cc00	2013-03-11 12:03:20 +0200	[diff] [blame]	674	* We calculate the vruntime slice of a to-be-inserted task.
Ingo Molnar	647e7ca	2007-10-15 17:00:13 +0200	[diff] [blame]	675	*
Peter Zijlstra	f9c0b09	2008-10-17 19:27:04 +0200	[diff] [blame]	676	* vs = s/w
Ingo Molnar	647e7ca	2007-10-15 17:00:13 +0200	[diff] [blame]	677	*/
Peter Zijlstra	f9c0b09	2008-10-17 19:27:04 +0200	[diff] [blame]	678	static u64 sched_vslice(struct cfs_rq cfs_rq, struct sched_entity se)
Ingo Molnar	647e7ca	2007-10-15 17:00:13 +0200	[diff] [blame]	679	{
Peter Zijlstra	f9c0b09	2008-10-17 19:27:04 +0200	[diff] [blame]	680	return calc_delta_fair(sched_slice(cfs_rq, se), se);
Peter Zijlstra	a7be37a	2008-06-27 13:41:11 +0200	[diff] [blame]	681	}
				682
#ifdef CONFIG_SMP
static inline void __update_task_entity_contrib(struct sched_entity *se);

/*
 * Seed the runnable average of a new task so that it starts out with a
 * non-zero load: the runnable sum and the runnable period are both set to
 * the task's scheduling slice shifted right by 10, after which the load
 * contribution is recomputed.
 */
void init_task_runnable_average(struct task_struct *p)
{
	struct sched_entity *se = &p->se;
	u32 seed;

	se->avg.decay_count = 0;

	seed = sched_slice(task_cfs_rq(p), se) >> 10;
	se->avg.runnable_avg_period = seed;
	se->avg.runnable_avg_sum = seed;

	__update_task_entity_contrib(se);
}
#else
/* Nothing to seed when CONFIG_SMP is not set. */
void init_task_runnable_average(struct task_struct *p)
{
}
#endif
				702
Peter Zijlstra	a7be37a	2008-06-27 13:41:11 +0200	[diff] [blame]	703	/*
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	704	* Update the current task's runtime statistics. Skip current tasks that
				705	* are not in our scheduling class.
				706	*/
				707	static inline void
Ingo Molnar	8ebc91d	2007-10-15 17:00:03 +0200	[diff] [blame]	708	__update_curr(struct cfs_rq cfs_rq, struct sched_entity curr,
				709	unsigned long delta_exec)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	710	{
Ingo Molnar	bbdba7c	2007-10-15 17:00:06 +0200	[diff] [blame]	711	unsigned long delta_exec_weighted;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	712
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	713	schedstat_set(curr->statistics.exec_max,
				714	max((u64)delta_exec, curr->statistics.exec_max));
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	715
				716	curr->sum_exec_runtime += delta_exec;
Ingo Molnar	7a62eab	2007-10-15 17:00:06 +0200	[diff] [blame]	717	schedstat_add(cfs_rq, exec_clock, delta_exec);
Peter Zijlstra	a7be37a	2008-06-27 13:41:11 +0200	[diff] [blame]	718	delta_exec_weighted = calc_delta_fair(delta_exec, curr);
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	719
Ingo Molnar	e9acbff	2007-10-15 17:00:04 +0200	[diff] [blame]	720	curr->vruntime += delta_exec_weighted;
Peter Zijlstra	1af5f73	2008-10-24 11:06:13 +0200	[diff] [blame]	721	update_min_vruntime(cfs_rq);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	722	}
				723
Ingo Molnar	b7cc089	2007-08-09 11:16:47 +0200	[diff] [blame]	724	static void update_curr(struct cfs_rq *cfs_rq)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	725	{
Ingo Molnar	429d43b	2007-10-15 17:00:03 +0200	[diff] [blame]	726	struct sched_entity *curr = cfs_rq->curr;
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	727	u64 now = rq_clock_task(rq_of(cfs_rq));
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	728	unsigned long delta_exec;
				729
				730	if (unlikely(!curr))
				731	return;
				732
				733	/*
				734	* Get the amount of time the current task was running
				735	* since the last time we changed load (this cannot
				736	* overflow on 32 bits):
				737	*/
Ingo Molnar	8ebc91d	2007-10-15 17:00:03 +0200	[diff] [blame]	738	delta_exec = (unsigned long)(now - curr->exec_start);
Peter Zijlstra	34f28ec	2008-12-16 08:45:31 +0100	[diff] [blame]	739	if (!delta_exec)
				740	return;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	741
Ingo Molnar	8ebc91d	2007-10-15 17:00:03 +0200	[diff] [blame]	742	__update_curr(cfs_rq, curr, delta_exec);
				743	curr->exec_start = now;
Srivatsa Vaddagiri	d842de8	2007-12-02 20:04:49 +0100	[diff] [blame]	744
				745	if (entity_is_task(curr)) {
				746	struct task_struct *curtask = task_of(curr);
				747
Ingo Molnar	f977bb4	2009-09-13 18:15:54 +0200	[diff] [blame]	748	trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
Srivatsa Vaddagiri	d842de8	2007-12-02 20:04:49 +0100	[diff] [blame]	749	cpuacct_charge(curtask, delta_exec);
Frank Mayhar	f06febc	2008-09-12 09:54:39 -0700	[diff] [blame]	750	account_group_exec_runtime(curtask, delta_exec);
Srivatsa Vaddagiri	d842de8	2007-12-02 20:04:49 +0100	[diff] [blame]	751	}
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	752
				753	account_cfs_rq_runtime(cfs_rq, delta_exec);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	754	}
				755
				756	static inline void
Ingo Molnar	5870db5	2007-08-09 11:16:47 +0200	[diff] [blame]	757	update_stats_wait_start(struct cfs_rq cfs_rq, struct sched_entity se)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	758	{
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	759	schedstat_set(se->statistics.wait_start, rq_clock(rq_of(cfs_rq)));
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	760	}
				761
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	762	/*
				763	* Task is being enqueued - update stats:
				764	*/
Ingo Molnar	d2417e5	2007-08-09 11:16:47 +0200	[diff] [blame]	765	static void update_stats_enqueue(struct cfs_rq cfs_rq, struct sched_entity se)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	766	{
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	767	/*
				768	* Are we enqueueing a waiting task? (for current tasks
				769	* a dequeue/enqueue event is a NOP)
				770	*/
Ingo Molnar	429d43b	2007-10-15 17:00:03 +0200	[diff] [blame]	771	if (se != cfs_rq->curr)
Ingo Molnar	5870db5	2007-08-09 11:16:47 +0200	[diff] [blame]	772	update_stats_wait_start(cfs_rq, se);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	773	}
				774
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	775	static void
Ingo Molnar	9ef0a96	2007-08-09 11:16:47 +0200	[diff] [blame]	776	update_stats_wait_end(struct cfs_rq cfs_rq, struct sched_entity se)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	777	{
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	778	schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	779	rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start));
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	780	schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
				781	schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	782	rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
Peter Zijlstra	768d0c2	2009-07-23 20:13:26 +0200	[diff] [blame]	783	#ifdef CONFIG_SCHEDSTATS
				784	if (entity_is_task(se)) {
				785	trace_sched_stat_wait(task_of(se),
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	786	rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
Peter Zijlstra	768d0c2	2009-07-23 20:13:26 +0200	[diff] [blame]	787	}
				788	#endif
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	789	schedstat_set(se->statistics.wait_start, 0);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	790	}
				791
				792	static inline void
Ingo Molnar	19b6a2e	2007-08-09 11:16:48 +0200	[diff] [blame]	793	update_stats_dequeue(struct cfs_rq cfs_rq, struct sched_entity se)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	794	{
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	795	/*
				796	* Mark the end of the wait period if dequeueing a
				797	* waiting task:
				798	*/
Ingo Molnar	429d43b	2007-10-15 17:00:03 +0200	[diff] [blame]	799	if (se != cfs_rq->curr)
Ingo Molnar	9ef0a96	2007-08-09 11:16:47 +0200	[diff] [blame]	800	update_stats_wait_end(cfs_rq, se);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	801	}
				802
				803	/*
				804	* We are picking a new current task - update its stats:
				805	*/
				806	static inline void
Ingo Molnar	79303e9	2007-08-09 11:16:47 +0200	[diff] [blame]	807	update_stats_curr_start(struct cfs_rq cfs_rq, struct sched_entity se)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	808	{
				809	/*
				810	* We are starting a new run period:
				811	*/
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	812	se->exec_start = rq_clock_task(rq_of(cfs_rq));
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	813	}
				814
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	815	/**************************************************
				816	* Scheduling class queueing methods:
				817	*/
				818
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	819	#ifdef CONFIG_NUMA_BALANCING
				820	/*
Mel Gorman	598f0ec	2013-10-07 11:28:55 +0100	[diff] [blame]	821	* Approximate time to scan a full NUMA task in ms. The task scan period is
				822	* calculated based on the tasks virtual memory size and
				823	* numa_balancing_scan_size.
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	824	*/
Mel Gorman	598f0ec	2013-10-07 11:28:55 +0100	[diff] [blame]	825	unsigned int sysctl_numa_balancing_scan_period_min = 1000;
				826	unsigned int sysctl_numa_balancing_scan_period_max = 60000;
				827	unsigned int sysctl_numa_balancing_scan_period_reset = 60000;
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	828
				829	/* Portion of address space to scan in MB */
				830	unsigned int sysctl_numa_balancing_scan_size = 256;
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	831
Peter Zijlstra	4b96a29	2012-10-25 14:16:47 +0200	[diff] [blame]	832	/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
				833	unsigned int sysctl_numa_balancing_scan_delay = 1000;
				834
Mel Gorman	598f0ec	2013-10-07 11:28:55 +0100	[diff] [blame]	835	static unsigned int task_nr_scan_windows(struct task_struct *p)
				836	{
				837	unsigned long rss = 0;
				838	unsigned long nr_scan_pages;
				839
				840	/*
				841	* Calculations based on RSS as non-present and empty pages are skipped
				842	* by the PTE scanner and NUMA hinting faults should be trapped based
				843	* on resident pages
				844	*/
				845	nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
				846	rss = get_mm_rss(p->mm);
				847	if (!rss)
				848	rss = nr_scan_pages;
				849
				850	rss = round_up(rss, nr_scan_pages);
				851	return rss / nr_scan_pages;
				852	}
				853
				854	/* For sanitys sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
				855	#define MAX_SCAN_WINDOW 2560
				856
				857	static unsigned int task_scan_min(struct task_struct *p)
				858	{
				859	unsigned int scan, floor;
				860	unsigned int windows = 1;
				861
				862	if (sysctl_numa_balancing_scan_size < MAX_SCAN_WINDOW)
				863	windows = MAX_SCAN_WINDOW / sysctl_numa_balancing_scan_size;
				864	floor = 1000 / windows;
				865
				866	scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
				867	return max_t(unsigned int, floor, scan);
				868	}
				869
				870	static unsigned int task_scan_max(struct task_struct *p)
				871	{
				872	unsigned int smin = task_scan_min(p);
				873	unsigned int smax;
				874
				875	/* Watch for min being lower than max due to floor calculations */
				876	smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
				877	return max(smin, smax);
				878	}
				879
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	880	static void task_numa_placement(struct task_struct *p)
				881	{
Hugh Dickins	2832bc1	2012-12-19 17:42:16 -0800	[diff] [blame]	882	int seq;
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	883
Hugh Dickins	2832bc1	2012-12-19 17:42:16 -0800	[diff] [blame]	884	if (!p->mm) /* for example, ksmd faulting in a user's mm */
				885	return;
				886	seq = ACCESS_ONCE(p->mm->numa_scan_seq);
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	887	if (p->numa_scan_seq == seq)
				888	return;
				889	p->numa_scan_seq = seq;
Mel Gorman	598f0ec	2013-10-07 11:28:55 +0100	[diff] [blame]	890	p->numa_scan_period_max = task_scan_max(p);
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	891
				892	/* FIXME: Scheduling placement policy hints go here */
				893	}
				894
				895	/*
				896	* Got a PROT_NONE fault for a page on @node.
				897	*/
Mel Gorman	b8593bf	2012-11-21 01:18:23 +0000	[diff] [blame]	898	void task_numa_fault(int node, int pages, bool migrated)
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	899	{
				900	struct task_struct *p = current;
				901
Dave Kleikamp	10e84b9	2013-07-31 13:53:35 -0700	[diff] [blame]	902	if (!numabalancing_enabled)
Mel Gorman	1a687c2	2012-11-22 11:16:36 +0000	[diff] [blame]	903	return;
				904
Mel Gorman	f809ca9	2013-10-07 11:28:57 +0100	[diff] [blame^]	905	/* Allocate buffer to track faults on a per-node basis */
				906	if (unlikely(!p->numa_faults)) {
				907	int size = sizeof(p->numa_faults) nr_node_ids;
				908
				909	p->numa_faults = kzalloc(size, GFP_KERNEL\|__GFP_NOWARN);
				910	if (!p->numa_faults)
				911	return;
				912	}
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	913
Mel Gorman	fb003b8	2012-11-15 09:01:14 +0000	[diff] [blame]	914	/*
Mel Gorman	b8593bf	2012-11-21 01:18:23 +0000	[diff] [blame]	915	* If pages are properly placed (did not migrate) then scan slower.
				916	* This is reset periodically in case of phase changes
Mel Gorman	fb003b8	2012-11-15 09:01:14 +0000	[diff] [blame]	917	*/
Mel Gorman	598f0ec	2013-10-07 11:28:55 +0100	[diff] [blame]	918	if (!migrated) {
				919	/* Initialise if necessary */
				920	if (!p->numa_scan_period_max)
				921	p->numa_scan_period_max = task_scan_max(p);
				922
				923	p->numa_scan_period = min(p->numa_scan_period_max,
				924	p->numa_scan_period + 10);
				925	}
Mel Gorman	fb003b8	2012-11-15 09:01:14 +0000	[diff] [blame]	926
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	927	task_numa_placement(p);
Mel Gorman	f809ca9	2013-10-07 11:28:57 +0100	[diff] [blame^]	928
				929	p->numa_faults[node] += pages;
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	930	}
				931
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	932	static void reset_ptenuma_scan(struct task_struct *p)
				933	{
				934	ACCESS_ONCE(p->mm->numa_scan_seq)++;
				935	p->mm->numa_scan_offset = 0;
				936	}
				937
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	938	/*
				939	* The expensive part of numa migration is done from task_work context.
				940	* Triggered from task_tick_numa().
				941	*/
				942	void task_numa_work(struct callback_head *work)
				943	{
				944	unsigned long migrate, next_scan, now = jiffies;
				945	struct task_struct *p = current;
				946	struct mm_struct *mm = p->mm;
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	947	struct vm_area_struct *vma;
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	948	unsigned long start, end;
Mel Gorman	598f0ec	2013-10-07 11:28:55 +0100	[diff] [blame]	949	unsigned long nr_pte_updates = 0;
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	950	long pages;
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	951
				952	WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
				953
				954	work->next = work; /* protect against double add */
				955	/*
				956	* Who cares about NUMA placement when they're dying.
				957	*
				958	* NOTE: make sure not to dereference p->mm before this check,
				959	* exit_task_work() happens _after_ exit_mm() so we could be called
				960	* without p->mm even though we still had it when we enqueued this
				961	* work.
				962	*/
				963	if (p->flags & PF_EXITING)
				964	return;
				965
Mel Gorman	7e8d16b	2013-10-07 11:28:54 +0100	[diff] [blame]	966	if (!mm->numa_next_reset \|\| !mm->numa_next_scan) {
				967	mm->numa_next_scan = now +
				968	msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
				969	mm->numa_next_reset = now +
				970	msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset);
				971	}
				972
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	973	/*
Mel Gorman	b8593bf	2012-11-21 01:18:23 +0000	[diff] [blame]	974	* Reset the scan period if enough time has gone by. Objective is that
				975	* scanning will be reduced if pages are properly placed. As tasks
				976	* can enter different phases this needs to be re-examined. Lacking
				977	* proper tracking of reference behaviour, this blunt hammer is used.
				978	*/
				979	migrate = mm->numa_next_reset;
				980	if (time_after(now, migrate)) {
Mel Gorman	598f0ec	2013-10-07 11:28:55 +0100	[diff] [blame]	981	p->numa_scan_period = task_scan_min(p);
Mel Gorman	b8593bf	2012-11-21 01:18:23 +0000	[diff] [blame]	982	next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset);
				983	xchg(&mm->numa_next_reset, next_scan);
				984	}
				985
				986	/*
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	987	* Enforce maximal scan/migration frequency..
				988	*/
				989	migrate = mm->numa_next_scan;
				990	if (time_before(now, migrate))
				991	return;
				992
Mel Gorman	598f0ec	2013-10-07 11:28:55 +0100	[diff] [blame]	993	if (p->numa_scan_period == 0) {
				994	p->numa_scan_period_max = task_scan_max(p);
				995	p->numa_scan_period = task_scan_min(p);
				996	}
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	997
Mel Gorman	fb003b8	2012-11-15 09:01:14 +0000	[diff] [blame]	998	next_scan = now + msecs_to_jiffies(p->numa_scan_period);
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	999	if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
				1000	return;
				1001
Mel Gorman	e14808b	2012-11-19 10:59:15 +0000	[diff] [blame]	1002	/*
Peter Zijlstra	19a78d1	2013-10-07 11:28:51 +0100	[diff] [blame]	1003	* Delay this task enough that another task of this mm will likely win
				1004	* the next time around.
				1005	*/
				1006	p->node_stamp += 2 * TICK_NSEC;
				1007
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	1008	start = mm->numa_scan_offset;
				1009	pages = sysctl_numa_balancing_scan_size;
				1010	pages <<= 20 - PAGE_SHIFT; /* MB in pages */
				1011	if (!pages)
				1012	return;
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	1013
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	1014	down_read(&mm->mmap_sem);
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	1015	vma = find_vma(mm, start);
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	1016	if (!vma) {
				1017	reset_ptenuma_scan(p);
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	1018	start = 0;
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	1019	vma = mm->mmap;
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	1020	}
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	1021	for (; vma; vma = vma->vm_next) {
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	1022	if (!vma_migratable(vma))
				1023	continue;
				1024
				1025	/* Skip small VMAs. They are not likely to be of relevance */
Mel Gorman	221392c	2012-12-17 14:05:53 +0000	[diff] [blame]	1026	if (vma->vm_end - vma->vm_start < HPAGE_SIZE)
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	1027	continue;
				1028
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	1029	do {
				1030	start = max(start, vma->vm_start);
				1031	end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
				1032	end = min(end, vma->vm_end);
Mel Gorman	598f0ec	2013-10-07 11:28:55 +0100	[diff] [blame]	1033	nr_pte_updates += change_prot_numa(vma, start, end);
				1034
				1035	/*
				1036	* Scan sysctl_numa_balancing_scan_size but ensure that
				1037	* at least one PTE is updated so that unused virtual
				1038	* address space is quickly skipped.
				1039	*/
				1040	if (nr_pte_updates)
				1041	pages -= (end - start) >> PAGE_SHIFT;
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	1042
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	1043	start = end;
				1044	if (pages <= 0)
				1045	goto out;
				1046	} while (end != vma->vm_end);
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	1047	}
				1048
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	1049	out:
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	1050	/*
Mel Gorman	f307cd1	2013-10-07 11:28:56 +0100	[diff] [blame]	1051	* If the whole process was scanned without updates then no NUMA
				1052	* hinting faults are being recorded and scan rate should be lower.
				1053	*/
				1054	if (mm->numa_scan_offset == 0 && !nr_pte_updates) {
				1055	p->numa_scan_period = min(p->numa_scan_period_max,
				1056	p->numa_scan_period << 1);
				1057
				1058	next_scan = now + msecs_to_jiffies(p->numa_scan_period);
				1059	mm->numa_next_scan = next_scan;
				1060	}
				1061
				1062	/*
Peter Zijlstra	c69307d	2013-10-07 11:28:41 +0100	[diff] [blame]	1063	* It is possible to reach the end of the VMA list but the last few
				1064	* VMAs are not guaranteed to the vma_migratable. If they are not, we
				1065	* would find the !migratable VMA on the next scan but not reset the
				1066	* scanner to the start so check it now.
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	1067	*/
				1068	if (vma)
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	1069	mm->numa_scan_offset = start;
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	1070	else
				1071	reset_ptenuma_scan(p);
				1072	up_read(&mm->mmap_sem);
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	1073	}
				1074
				1075	/*
				1076	* Drive the periodic memory faults..
				1077	*/
				1078	void task_tick_numa(struct rq rq, struct task_struct curr)
				1079	{
				1080	struct callback_head *work = &curr->numa_work;
				1081	u64 period, now;
				1082
				1083	/*
				1084	* We don't care about NUMA placement if we don't have memory.
				1085	*/
				1086	if (!curr->mm \|\| (curr->flags & PF_EXITING) \|\| work->next != work)
				1087	return;
				1088
				1089	/*
				1090	* Using runtime rather than walltime has the dual advantage that
				1091	* we (mostly) drive the selection from busy threads and that the
				1092	* task needs to have done some actual work before we bother with
				1093	* NUMA placement.
				1094	*/
				1095	now = curr->se.sum_exec_runtime;
				1096	period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
				1097
				1098	if (now - curr->node_stamp > period) {
Peter Zijlstra	4b96a29	2012-10-25 14:16:47 +0200	[diff] [blame]	1099	if (!curr->node_stamp)
Mel Gorman	598f0ec	2013-10-07 11:28:55 +0100	[diff] [blame]	1100	curr->numa_scan_period = task_scan_min(curr);
Peter Zijlstra	19a78d1	2013-10-07 11:28:51 +0100	[diff] [blame]	1101	curr->node_stamp += period;
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	1102
				1103	if (!time_before(jiffies, curr->mm->numa_next_scan)) {
				1104	init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
				1105	task_work_add(curr, work, true);
				1106	}
				1107	}
				1108	}
				1109	#else
				1110	static void task_tick_numa(struct rq rq, struct task_struct curr)
				1111	{
				1112	}
				1113	#endif /* CONFIG_NUMA_BALANCING */
				1114
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	1115	static void
				1116	account_entity_enqueue(struct cfs_rq cfs_rq, struct sched_entity se)
				1117	{
				1118	update_load_add(&cfs_rq->load, se->load.weight);
Peter Zijlstra	c09595f	2008-06-27 13:41:14 +0200	[diff] [blame]	1119	if (!parent_entity(se))
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	1120	update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	1121	#ifdef CONFIG_SMP
				1122	if (entity_is_task(se))
Peter Zijlstra	eb95308	2012-04-17 13:38:40 +0200	[diff] [blame]	1123	list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks);
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	1124	#endif
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	1125	cfs_rq->nr_running++;
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	1126	}
				1127
				1128	static void
				1129	account_entity_dequeue(struct cfs_rq cfs_rq, struct sched_entity se)
				1130	{
				1131	update_load_sub(&cfs_rq->load, se->load.weight);
Peter Zijlstra	c09595f	2008-06-27 13:41:14 +0200	[diff] [blame]	1132	if (!parent_entity(se))
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	1133	update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	1134	if (entity_is_task(se))
Bharata B Rao	b87f172	2008-09-25 09:53:54 +0530	[diff] [blame]	1135	list_del_init(&se->group_node);
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	1136	cfs_rq->nr_running--;
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	1137	}
				1138
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	1139	#ifdef CONFIG_FAIR_GROUP_SCHED
				1140	# ifdef CONFIG_SMP
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	1141	static inline long calc_tg_weight(struct task_group tg, struct cfs_rq cfs_rq)
				1142	{
				1143	long tg_weight;
				1144
				1145	/*
				1146	* Use this CPU's actual weight instead of the last load_contribution
				1147	* to gain a more accurate current total weight. See
				1148	* update_cfs_rq_load_contribution().
				1149	*/
Alex Shi	bf5b986	2013-06-20 10:18:54 +0800	[diff] [blame]	1150	tg_weight = atomic_long_read(&tg->load_avg);
Paul Turner	8295836	2012-10-04 13:18:31 +0200	[diff] [blame]	1151	tg_weight -= cfs_rq->tg_load_contrib;
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	1152	tg_weight += cfs_rq->load.weight;
				1153
				1154	return tg_weight;
				1155	}
				1156
Paul Turner	6d5ab29	2011-01-21 20:45:01 -0800	[diff] [blame]	1157	static long calc_cfs_shares(struct cfs_rq cfs_rq, struct task_group tg)
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	1158	{
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	1159	long tg_weight, load, shares;
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	1160
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	1161	tg_weight = calc_tg_weight(tg, cfs_rq);
Paul Turner	6d5ab29	2011-01-21 20:45:01 -0800	[diff] [blame]	1162	load = cfs_rq->load.weight;
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	1163
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	1164	shares = (tg->shares * load);
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	1165	if (tg_weight)
				1166	shares /= tg_weight;
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	1167
				1168	if (shares < MIN_SHARES)
				1169	shares = MIN_SHARES;
				1170	if (shares > tg->shares)
				1171	shares = tg->shares;
				1172
				1173	return shares;
				1174	}
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	1175	# else /* CONFIG_SMP */
Paul Turner	6d5ab29	2011-01-21 20:45:01 -0800	[diff] [blame]	1176	static inline long calc_cfs_shares(struct cfs_rq cfs_rq, struct task_group tg)
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	1177	{
				1178	return tg->shares;
				1179	}
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	1180	# endif /* CONFIG_SMP */
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	1181	static void reweight_entity(struct cfs_rq cfs_rq, struct sched_entity se,
				1182	unsigned long weight)
				1183	{
Paul Turner	19e5eeb	2010-12-15 19:10:18 -0800	[diff] [blame]	1184	if (se->on_rq) {
				1185	/* commit outstanding execution time */
				1186	if (cfs_rq->curr == se)
				1187	update_curr(cfs_rq);
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	1188	account_entity_dequeue(cfs_rq, se);
Paul Turner	19e5eeb	2010-12-15 19:10:18 -0800	[diff] [blame]	1189	}
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	1190
				1191	update_load_set(&se->load, weight);
				1192
				1193	if (se->on_rq)
				1194	account_entity_enqueue(cfs_rq, se);
				1195	}
				1196
Paul Turner	8295836	2012-10-04 13:18:31 +0200	[diff] [blame]	1197	static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
				1198
Paul Turner	6d5ab29	2011-01-21 20:45:01 -0800	[diff] [blame]	1199	static void update_cfs_shares(struct cfs_rq *cfs_rq)
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	1200	{
				1201	struct task_group *tg;
				1202	struct sched_entity *se;
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	1203	long shares;
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	1204
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	1205	tg = cfs_rq->tg;
				1206	se = tg->se[cpu_of(rq_of(cfs_rq))];
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	1207	if (!se \|\| throttled_hierarchy(cfs_rq))
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	1208	return;
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	1209	#ifndef CONFIG_SMP
				1210	if (likely(se->load.weight == tg->shares))
				1211	return;
				1212	#endif
Paul Turner	6d5ab29	2011-01-21 20:45:01 -0800	[diff] [blame]	1213	shares = calc_cfs_shares(cfs_rq, tg);
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	1214
				1215	reweight_entity(cfs_rq_of(se), se, shares);
				1216	}
				1217	#else /* CONFIG_FAIR_GROUP_SCHED */
/* No group shares to update when CONFIG_FAIR_GROUP_SCHED is not set. */
static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
{
}
				1221	#endif /* CONFIG_FAIR_GROUP_SCHED */
				1222
Alex Shi	141965c	2013-06-26 13:05:39 +0800	[diff] [blame]	1223	#ifdef CONFIG_SMP
/*
 * We choose a half-life close to 1 scheduling period.
 * Note: The tables below are dependent on this value.
 */
#define LOAD_AVG_PERIOD 32
/* maximum possible load avg */
#define LOAD_AVG_MAX 47742
/* number of full periods to produce LOAD_AVG_MAX */
#define LOAD_AVG_MAX_N 345
				1231
				1232	/* Precomputed fixed inverse multiplies for multiplication by y^n */
				1233	static const u32 runnable_avg_yN_inv[] = {
				1234	0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
				1235	0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
				1236	0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
				1237	0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
				1238	0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
				1239	0x85aac367, 0x82cd8698,
				1240	};
				1241
				1242	/*
				1243	* Precomputed \Sum y^k { 1<=k<=n }. These are floor(true_value) to prevent
				1244	* over-estimates when re-combining.
				1245	*/
				1246	static const u32 runnable_avg_yN_sum[] = {
				1247	0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
				1248	9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
				1249	17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
				1250	};
				1251
				1252	/*
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	1253	* Approximate:
				1254	* val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
				1255	*/
				1256	static __always_inline u64 decay_load(u64 val, u64 n)
				1257	{
Paul Turner	5b51f2f	2012-10-04 13:18:32 +0200	[diff] [blame]	1258	unsigned int local_n;
				1259
				1260	if (!n)
				1261	return val;
				1262	else if (unlikely(n > LOAD_AVG_PERIOD * 63))
				1263	return 0;
				1264
				1265	/* after bounds checking we can collapse to 32-bit */
				1266	local_n = n;
				1267
				1268	/*
				1269	* As y^PERIOD = 1/2, we can combine
				1270	* y^n = 1/2^(n/PERIOD) * k^(n%PERIOD)
				1271	* With a look-up table which covers k^n (n<PERIOD)
				1272	*
				1273	* To achieve constant time decay_load.
				1274	*/
				1275	if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
				1276	val >>= local_n / LOAD_AVG_PERIOD;
				1277	local_n %= LOAD_AVG_PERIOD;
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	1278	}
				1279
Paul Turner	5b51f2f	2012-10-04 13:18:32 +0200	[diff] [blame]	1280	val *= runnable_avg_yN_inv[local_n];
				1281	/* We don't use SRR here since we always want to round down. */
				1282	return val >> 32;
				1283	}
				1284
				1285	/*
				1286	* For updates fully spanning n periods, the contribution to runnable
				1287	* average will be: \Sum 1024*y^n
				1288	*
				1289	* We can compute this reasonably efficiently by combining:
				1290	* y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD}
				1291	*/
				1292	static u32 __compute_runnable_contrib(u64 n)
				1293	{
				1294	u32 contrib = 0;
				1295
				1296	if (likely(n <= LOAD_AVG_PERIOD))
				1297	return runnable_avg_yN_sum[n];
				1298	else if (unlikely(n >= LOAD_AVG_MAX_N))
				1299	return LOAD_AVG_MAX;
				1300
				1301	/* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */
				1302	do {
				1303	contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */
				1304	contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD];
				1305
				1306	n -= LOAD_AVG_PERIOD;
				1307	} while (n > LOAD_AVG_PERIOD);
				1308
				1309	contrib = decay_load(contrib, n);
				1310	return contrib + runnable_avg_yN_sum[n];
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	1311	}
				1312
				1313	/*
				1314	* We can represent the historical contribution to runnable average as the
				1315	* coefficients of a geometric series. To do this we sub-divide our runnable
				1316	* history into segments of approximately 1ms (1024us); label the segment that
				1317	* occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
				1318	*
				1319	* [<- 1024us ->\|<- 1024us ->\|<- 1024us ->\| ...
				1320	* p0 p1 p2
				1321	* (now) (~1ms ago) (~2ms ago)
				1322	*
				1323	* Let u_i denote the fraction of p_i that the entity was runnable.
				1324	*
				1325	* We then designate the fractions u_i as our co-efficients, yielding the
				1326	* following representation of historical load:
				1327	* u_0 + u_1y + u_2y^2 + u_3*y^3 + ...
				1328	*
				1329	* We choose y based on the with of a reasonably scheduling period, fixing:
				1330	* y^32 = 0.5
				1331	*
				1332	* This means that the contribution to load ~32ms ago (u_32) will be weighted
				1333	* approximately half as much as the contribution to load within the last ms
				1334	* (u_0).
				1335	*
				1336	* When a period "rolls over" and we have new u_0`, multiplying the previous
				1337	* sum again by y is sufficient to update:
				1338	* load_avg = u_0` + y(u_0 + u_1y + u_2*y^2 + ... )
				1339	* = u_0 + u_1y + u_2y^2 + ... [re-labeling u_i --> u_{i+1}]
				1340	*/
				1341	static __always_inline int __update_entity_runnable_avg(u64 now,
				1342	struct sched_avg *sa,
				1343	int runnable)
				1344	{
Paul Turner	5b51f2f	2012-10-04 13:18:32 +0200	[diff] [blame]	1345	u64 delta, periods;
				1346	u32 runnable_contrib;
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	1347	int delta_w, decayed = 0;
				1348
				1349	delta = now - sa->last_runnable_update;
				1350	/*
				1351	* This should only happen when time goes backwards, which it
				1352	* unfortunately does during sched clock init when we swap over to TSC.
				1353	*/
				1354	if ((s64)delta < 0) {
				1355	sa->last_runnable_update = now;
				1356	return 0;
				1357	}
				1358
				1359	/*
				1360	* Use 1024ns as the unit of measurement since it's a reasonable
				1361	* approximation of 1us and fast to compute.
				1362	*/
				1363	delta >>= 10;
				1364	if (!delta)
				1365	return 0;
				1366	sa->last_runnable_update = now;
				1367
				1368	/* delta_w is the amount already accumulated against our next period */
				1369	delta_w = sa->runnable_avg_period % 1024;
				1370	if (delta + delta_w >= 1024) {
				1371	/* period roll-over */
				1372	decayed = 1;
				1373
				1374	/*
				1375	* Now that we know we're crossing a period boundary, figure
				1376	* out how much from delta we need to complete the current
				1377	* period and accrue it.
				1378	*/
				1379	delta_w = 1024 - delta_w;
Paul Turner	5b51f2f	2012-10-04 13:18:32 +0200	[diff] [blame]	1380	if (runnable)
				1381	sa->runnable_avg_sum += delta_w;
				1382	sa->runnable_avg_period += delta_w;
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	1383
Paul Turner	5b51f2f	2012-10-04 13:18:32 +0200	[diff] [blame]	1384	delta -= delta_w;
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	1385
Paul Turner	5b51f2f	2012-10-04 13:18:32 +0200	[diff] [blame]	1386	/* Figure out how many additional periods this update spans */
				1387	periods = delta / 1024;
				1388	delta %= 1024;
				1389
				1390	sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum,
				1391	periods + 1);
				1392	sa->runnable_avg_period = decay_load(sa->runnable_avg_period,
				1393	periods + 1);
				1394
				1395	/* Efficiently calculate \sum (1..n_period) 1024y^i /
				1396	runnable_contrib = __compute_runnable_contrib(periods);
				1397	if (runnable)
				1398	sa->runnable_avg_sum += runnable_contrib;
				1399	sa->runnable_avg_period += runnable_contrib;
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	1400	}
				1401
				1402	/* Remainder of delta accrued against u_0` */
				1403	if (runnable)
				1404	sa->runnable_avg_sum += delta;
				1405	sa->runnable_avg_period += delta;
				1406
				1407	return decayed;
				1408	}
				1409
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1410	/* Synchronize an entity's decay with its parenting cfs_rq.*/
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	1411	static inline u64 __synchronize_entity_decay(struct sched_entity *se)
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1412	{
				1413	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				1414	u64 decays = atomic64_read(&cfs_rq->decay_counter);
				1415
				1416	decays -= se->avg.decay_count;
				1417	if (!decays)
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	1418	return 0;
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1419
				1420	se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
				1421	se->avg.decay_count = 0;
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	1422
				1423	return decays;
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1424	}
				1425
Paul Turner	c566e8e	2012-10-04 13:18:30 +0200	[diff] [blame]	1426	#ifdef CONFIG_FAIR_GROUP_SCHED
				1427	static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
				1428	int force_update)
				1429	{
				1430	struct task_group *tg = cfs_rq->tg;
Alex Shi	bf5b986	2013-06-20 10:18:54 +0800	[diff] [blame]	1431	long tg_contrib;
Paul Turner	c566e8e	2012-10-04 13:18:30 +0200	[diff] [blame]	1432
				1433	tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;
				1434	tg_contrib -= cfs_rq->tg_load_contrib;
				1435
Alex Shi	bf5b986	2013-06-20 10:18:54 +0800	[diff] [blame]	1436	if (force_update \|\| abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
				1437	atomic_long_add(tg_contrib, &tg->load_avg);
Paul Turner	c566e8e	2012-10-04 13:18:30 +0200	[diff] [blame]	1438	cfs_rq->tg_load_contrib += tg_contrib;
				1439	}
				1440	}
Paul Turner	8165e14	2012-10-04 13:18:31 +0200	[diff] [blame]	1441
Paul Turner	bb17f65	2012-10-04 13:18:31 +0200	[diff] [blame]	1442	/*
				1443	* Aggregate cfs_rq runnable averages into an equivalent task_group
				1444	* representation for computing load contributions.
				1445	*/
				1446	static inline void __update_tg_runnable_avg(struct sched_avg *sa,
				1447	struct cfs_rq *cfs_rq)
				1448	{
				1449	struct task_group *tg = cfs_rq->tg;
				1450	long contrib;
				1451
				1452	/* The fraction of a cpu used by this cfs_rq */
				1453	contrib = div_u64(sa->runnable_avg_sum << NICE_0_SHIFT,
				1454	sa->runnable_avg_period + 1);
				1455	contrib -= cfs_rq->tg_runnable_contrib;
				1456
				1457	if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) {
				1458	atomic_add(contrib, &tg->runnable_avg);
				1459	cfs_rq->tg_runnable_contrib += contrib;
				1460	}
				1461	}
				1462
Paul Turner	8165e14	2012-10-04 13:18:31 +0200	[diff] [blame]	1463	static inline void __update_group_entity_contrib(struct sched_entity *se)
				1464	{
				1465	struct cfs_rq *cfs_rq = group_cfs_rq(se);
				1466	struct task_group *tg = cfs_rq->tg;
Paul Turner	bb17f65	2012-10-04 13:18:31 +0200	[diff] [blame]	1467	int runnable_avg;
				1468
Paul Turner	8165e14	2012-10-04 13:18:31 +0200	[diff] [blame]	1469	u64 contrib;
				1470
				1471	contrib = cfs_rq->tg_load_contrib * tg->shares;
Alex Shi	bf5b986	2013-06-20 10:18:54 +0800	[diff] [blame]	1472	se->avg.load_avg_contrib = div_u64(contrib,
				1473	atomic_long_read(&tg->load_avg) + 1);
Paul Turner	bb17f65	2012-10-04 13:18:31 +0200	[diff] [blame]	1474
				1475	/*
				1476	* For group entities we need to compute a correction term in the case
				1477	* that they are consuming <1 cpu so that we would contribute the same
				1478	* load as a task of equal weight.
				1479	*
				1480	* Explicitly co-ordinating this measurement would be expensive, but
				1481	* fortunately the sum of each cpus contribution forms a usable
				1482	* lower-bound on the true value.
				1483	*
				1484	* Consider the aggregate of 2 contributions. Either they are disjoint
				1485	* (and the sum represents true value) or they are disjoint and we are
				1486	* understating by the aggregate of their overlap.
				1487	*
				1488	* Extending this to N cpus, for a given overlap, the maximum amount we
				1489	* understand is then n_i(n_i+1)/2 * w_i where n_i is the number of
				1490	* cpus that overlap for this interval and w_i is the interval width.
				1491	*
				1492	* On a small machine; the first term is well-bounded which bounds the
				1493	* total error since w_i is a subset of the period. Whereas on a
				1494	* larger machine, while this first term can be larger, if w_i is the
				1495	* of consequential size guaranteed to see n_i*w_i quickly converge to
				1496	* our upper bound of 1-cpu.
				1497	*/
				1498	runnable_avg = atomic_read(&tg->runnable_avg);
				1499	if (runnable_avg < NICE_0_LOAD) {
				1500	se->avg.load_avg_contrib *= runnable_avg;
				1501	se->avg.load_avg_contrib >>= NICE_0_SHIFT;
				1502	}
Paul Turner	8165e14	2012-10-04 13:18:31 +0200	[diff] [blame]	1503	}
Paul Turner	c566e8e	2012-10-04 13:18:30 +0200	[diff] [blame]	1504	#else
				1505	static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
				1506	int force_update) {}
Paul Turner	bb17f65	2012-10-04 13:18:31 +0200	[diff] [blame]	1507	static inline void __update_tg_runnable_avg(struct sched_avg *sa,
				1508	struct cfs_rq *cfs_rq) {}
Paul Turner	8165e14	2012-10-04 13:18:31 +0200	[diff] [blame]	1509	static inline void __update_group_entity_contrib(struct sched_entity *se) {}
Paul Turner	c566e8e	2012-10-04 13:18:30 +0200	[diff] [blame]	1510	#endif
				1511
Paul Turner	8165e14	2012-10-04 13:18:31 +0200	[diff] [blame]	1512	static inline void __update_task_entity_contrib(struct sched_entity *se)
				1513	{
				1514	u32 contrib;
				1515
				1516	/* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
				1517	contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
				1518	contrib /= (se->avg.runnable_avg_period + 1);
				1519	se->avg.load_avg_contrib = scale_load(contrib);
				1520	}
				1521
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	1522	/* Compute the current contribution to load_avg by se, return any delta */
				1523	static long __update_entity_load_avg_contrib(struct sched_entity *se)
				1524	{
				1525	long old_contrib = se->avg.load_avg_contrib;
				1526
Paul Turner	8165e14	2012-10-04 13:18:31 +0200	[diff] [blame]	1527	if (entity_is_task(se)) {
				1528	__update_task_entity_contrib(se);
				1529	} else {
Paul Turner	bb17f65	2012-10-04 13:18:31 +0200	[diff] [blame]	1530	__update_tg_runnable_avg(&se->avg, group_cfs_rq(se));
Paul Turner	8165e14	2012-10-04 13:18:31 +0200	[diff] [blame]	1531	__update_group_entity_contrib(se);
				1532	}
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	1533
				1534	return se->avg.load_avg_contrib - old_contrib;
				1535	}
				1536
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1537	static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq,
				1538	long load_contrib)
				1539	{
				1540	if (likely(load_contrib < cfs_rq->blocked_load_avg))
				1541	cfs_rq->blocked_load_avg -= load_contrib;
				1542	else
				1543	cfs_rq->blocked_load_avg = 0;
				1544	}
				1545
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	1546	static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
				1547
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	1548	/* Update a sched_entity's runnable average */
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1549	static inline void update_entity_load_avg(struct sched_entity *se,
				1550	int update_cfs_rq)
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	1551	{
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	1552	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				1553	long contrib_delta;
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	1554	u64 now;
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	1555
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	1556	/*
				1557	* For a group entity we need to use their owned cfs_rq_clock_task() in
				1558	* case they are the parent of a throttled hierarchy.
				1559	*/
				1560	if (entity_is_task(se))
				1561	now = cfs_rq_clock_task(cfs_rq);
				1562	else
				1563	now = cfs_rq_clock_task(group_cfs_rq(se));
				1564
				1565	if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq))
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	1566	return;
				1567
				1568	contrib_delta = __update_entity_load_avg_contrib(se);
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1569
				1570	if (!update_cfs_rq)
				1571	return;
				1572
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	1573	if (se->on_rq)
				1574	cfs_rq->runnable_load_avg += contrib_delta;
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1575	else
				1576	subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
				1577	}
				1578
				1579	/*
				1580	* Decay the load contributed by all blocked children and account this so that
				1581	* their contribution may appropriately discounted when they wake up.
				1582	*/
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	1583	static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1584	{
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	1585	u64 now = cfs_rq_clock_task(cfs_rq) >> 20;
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1586	u64 decays;
				1587
				1588	decays = now - cfs_rq->last_decay;
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	1589	if (!decays && !force_update)
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1590	return;
				1591
Alex Shi	2509940	2013-06-20 10:18:55 +0800	[diff] [blame]	1592	if (atomic_long_read(&cfs_rq->removed_load)) {
				1593	unsigned long removed_load;
				1594	removed_load = atomic_long_xchg(&cfs_rq->removed_load, 0);
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	1595	subtract_blocked_load_contrib(cfs_rq, removed_load);
				1596	}
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1597
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	1598	if (decays) {
				1599	cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg,
				1600	decays);
				1601	atomic64_add(decays, &cfs_rq->decay_counter);
				1602	cfs_rq->last_decay = now;
				1603	}
Paul Turner	c566e8e	2012-10-04 13:18:30 +0200	[diff] [blame]	1604
				1605	__update_cfs_rq_tg_load_contrib(cfs_rq, force_update);
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	1606	}
Ben Segall	18bf280	2012-10-04 12:51:20 +0200	[diff] [blame]	1607
				1608	static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
				1609	{
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	1610	__update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable);
Paul Turner	bb17f65	2012-10-04 13:18:31 +0200	[diff] [blame]	1611	__update_tg_runnable_avg(&rq->avg, &rq->cfs);
Ben Segall	18bf280	2012-10-04 12:51:20 +0200	[diff] [blame]	1612	}
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	1613
				1614	/* Add the load generated by se into cfs_rq's child load-average */
				1615	static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1616	struct sched_entity *se,
				1617	int wakeup)
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	1618	{
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	1619	/*
				1620	* We track migrations using entity decay_count <= 0, on a wake-up
				1621	* migration we use a negative decay count to track the remote decays
				1622	* accumulated while sleeping.
Alex Shi	a75cdaa	2013-06-20 10:18:47 +0800	[diff] [blame]	1623	*
				1624	* Newly forked tasks are enqueued with se->avg.decay_count == 0, they
				1625	* are seen by enqueue_entity_load_avg() as a migration with an already
				1626	* constructed load_avg_contrib.
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	1627	*/
				1628	if (unlikely(se->avg.decay_count <= 0)) {
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	1629	se->avg.last_runnable_update = rq_clock_task(rq_of(cfs_rq));
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	1630	if (se->avg.decay_count) {
				1631	/*
				1632	* In a wake-up migration we have to approximate the
				1633	* time sleeping. This is because we can't synchronize
				1634	* clock_task between the two cpus, and it is not
				1635	* guaranteed to be read-safe. Instead, we can
				1636	* approximate this using our carried decays, which are
				1637	* explicitly atomically readable.
				1638	*/
				1639	se->avg.last_runnable_update -= (-se->avg.decay_count)
				1640	<< 20;
				1641	update_entity_load_avg(se, 0);
				1642	/* Indicate that we're now synchronized and on-rq */
				1643	se->avg.decay_count = 0;
				1644	}
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1645	wakeup = 0;
				1646	} else {
Alex Shi	282cf49	2013-06-20 10:18:48 +0800	[diff] [blame]	1647	/*
				1648	* Task re-woke on same cpu (or else migrate_task_rq_fair()
				1649	* would have made count negative); we must be careful to avoid
				1650	* double-accounting blocked time after synchronizing decays.
				1651	*/
				1652	se->avg.last_runnable_update += __synchronize_entity_decay(se)
				1653	<< 20;
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1654	}
				1655
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	1656	/* migrated tasks did not contribute to our blocked load */
				1657	if (wakeup) {
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1658	subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	1659	update_entity_load_avg(se, 0);
				1660	}
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1661
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	1662	cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	1663	/* we force update consideration on load-balancer moves */
				1664	update_cfs_rq_blocked_load(cfs_rq, !wakeup);
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	1665	}
				1666
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1667	/*
				1668	* Remove se's load from this cfs_rq child load-average, if the entity is
				1669	* transitioning to a blocked state we track its projected decay using
				1670	* blocked_load_avg.
				1671	*/
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	1672	static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1673	struct sched_entity *se,
				1674	int sleep)
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	1675	{
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1676	update_entity_load_avg(se, 1);
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	1677	/* we force update consideration on load-balancer moves */
				1678	update_cfs_rq_blocked_load(cfs_rq, !sleep);
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1679
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	1680	cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1681	if (sleep) {
				1682	cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
				1683	se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
				1684	} /* migrations, e.g. sleep=0 leave decay_count == 0 */
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	1685	}
Vincent Guittot	642dbc3	2013-04-18 18:34:26 +0200	[diff] [blame]	1686
				1687	/*
				1688	* Update the rq's load with the elapsed running time before entering
				1689	* idle. if the last scheduled task is not a CFS task, idle_enter will
				1690	* be the only way to update the runnable statistic.
				1691	*/
				1692	void idle_enter_fair(struct rq *this_rq)
				1693	{
				1694	update_rq_runnable_avg(this_rq, 1);
				1695	}
				1696
				1697	/*
				1698	* Update the rq's load with the elapsed idle time before a task is
				1699	* scheduled. if the newly scheduled task is not a CFS task, idle_exit will
				1700	* be the only way to update the runnable statistic.
				1701	*/
				1702	void idle_exit_fair(struct rq *this_rq)
				1703	{
				1704	update_rq_runnable_avg(this_rq, 0);
				1705	}
				1706
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	1707	#else
/* No-op versions used when the load-tracking code above is compiled out. */
static inline void
update_entity_load_avg(struct sched_entity *se, int update_cfs_rq)
{
}

static inline void
update_rq_runnable_avg(struct rq *rq, int runnable)
{
}

static inline void
enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se,
			int wakeup)
{
}

static inline void
dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se,
			int sleep)
{
}

static inline void
update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
{
}
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	1719	#endif
				1720
/*
 * Account the time an entity spent sleeping and/or blocked, now that it is
 * being enqueued again.  Compiles to nothing without CONFIG_SCHEDSTATS.
 *
 * Both parameters are pointers; the body dereferences them with '->'.
 */
static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
#ifdef CONFIG_SCHEDSTATS
	struct task_struct *tsk = NULL;

	/* Per-task accounting only applies when se represents a task. */
	if (entity_is_task(se))
		tsk = task_of(se);

	if (se->statistics.sleep_start) {
		u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start;

		/* Clamp a negative interval to zero. */
		if ((s64)delta < 0)
			delta = 0;

		if (unlikely(delta > se->statistics.sleep_max))
			se->statistics.sleep_max = delta;

		se->statistics.sleep_start = 0;
		se->statistics.sum_sleep_runtime += delta;

		if (tsk) {
			account_scheduler_latency(tsk, delta >> 10, 1);
			trace_sched_stat_sleep(tsk, delta);
		}
	}
	if (se->statistics.block_start) {
		u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start;

		if ((s64)delta < 0)
			delta = 0;

		if (unlikely(delta > se->statistics.block_max))
			se->statistics.block_max = delta;

		se->statistics.block_start = 0;
		se->statistics.sum_sleep_runtime += delta;

		if (tsk) {
			if (tsk->in_iowait) {
				se->statistics.iowait_sum += delta;
				se->statistics.iowait_count++;
				trace_sched_stat_iowait(tsk, delta);
			}

			trace_sched_stat_blocked(tsk, delta);

			/*
			 * Blocking time is in units of nanosecs, so shift by
			 * 20 to get a milliseconds-range estimation of the
			 * amount of time that the task spent sleeping:
			 */
			if (unlikely(prof_on == SLEEP_PROFILING)) {
				profile_hits(SLEEP_PROFILING,
					     (void *)get_wchan(tsk),
					     delta >> 20);
			}
			account_scheduler_latency(tsk, delta >> 10, 0);
		}
	}
#endif
}
				1782
/*
 * Debug aid: count the cases where an entity's vruntime lies more than
 * 3 * sysctl_sched_latency away from the cfs_rq's min_vruntime, in either
 * direction.  Compiles to nothing without CONFIG_SCHED_DEBUG.
 *
 * Both parameters are pointers; the body dereferences them with '->'.
 */
static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
#ifdef CONFIG_SCHED_DEBUG
	s64 d = se->vruntime - cfs_rq->min_vruntime;

	/* Only the magnitude of the distance matters. */
	if (d < 0)
		d = -d;

	if (d > 3*sysctl_sched_latency)
		schedstat_inc(cfs_rq, nr_spread_over);
#endif
}
				1795
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1796	static void
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	1797	place_entity(struct cfs_rq cfs_rq, struct sched_entity se, int initial)
				1798	{
Peter Zijlstra	1af5f73	2008-10-24 11:06:13 +0200	[diff] [blame]	1799	u64 vruntime = cfs_rq->min_vruntime;
Peter Zijlstra	94dfb5e	2007-10-15 17:00:05 +0200	[diff] [blame]	1800
Peter Zijlstra	2cb8600	2007-11-09 22:39:37 +0100	[diff] [blame]	1801	/*
				1802	* The 'current' period is already promised to the current tasks,
				1803	* however the extra weight of the new task will slow them down a
				1804	* little, place the new task so that it fits in the slot that
				1805	* stays open at the end.
				1806	*/
Peter Zijlstra	94dfb5e	2007-10-15 17:00:05 +0200	[diff] [blame]	1807	if (initial && sched_feat(START_DEBIT))
Peter Zijlstra	f9c0b09	2008-10-17 19:27:04 +0200	[diff] [blame]	1808	vruntime += sched_vslice(cfs_rq, se);
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	1809
Mike Galbraith	a2e7a7e	2009-09-18 09:19:25 +0200	[diff] [blame]	1810	/* sleeps up to a single latency don't count. */
Mike Galbraith	5ca9880	2010-03-11 17:17:17 +0100	[diff] [blame]	1811	if (!initial) {
Mike Galbraith	a2e7a7e	2009-09-18 09:19:25 +0200	[diff] [blame]	1812	unsigned long thresh = sysctl_sched_latency;
Peter Zijlstra	a7be37a	2008-06-27 13:41:11 +0200	[diff] [blame]	1813
Mike Galbraith	a2e7a7e	2009-09-18 09:19:25 +0200	[diff] [blame]	1814	/*
Mike Galbraith	a2e7a7e	2009-09-18 09:19:25 +0200	[diff] [blame]	1815	* Halve their sleep time's effect, to allow
				1816	* for a gentler effect of sleepers:
				1817	*/
				1818	if (sched_feat(GENTLE_FAIR_SLEEPERS))
				1819	thresh >>= 1;
Ingo Molnar	51e0304	2009-09-16 08:54:45 +0200	[diff] [blame]	1820
Mike Galbraith	a2e7a7e	2009-09-18 09:19:25 +0200	[diff] [blame]	1821	vruntime -= thresh;
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	1822	}
				1823
Mike Galbraith	b5d9d73	2009-09-08 11:12:28 +0200	[diff] [blame]	1824	/* ensure we never gain time by being placed backwards. */
Viresh Kumar	16c8f1c	2012-11-08 13:33:46 +0530	[diff] [blame]	1825	se->vruntime = max_vruntime(se->vruntime, vruntime);
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	1826	}
				1827
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	1828	static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
				1829
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	1830	static void
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	1831	enqueue_entity(struct cfs_rq cfs_rq, struct sched_entity se, int flags)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1832	{
				1833	/*
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	1834	* Update the normalized vruntime before updating min_vruntime
Kamalesh Babulal	0fc576d	2013-06-27 11:24:18 +0530	[diff] [blame]	1835	* through calling update_curr().
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	1836	*/
Peter Zijlstra	371fd7e	2010-03-24 16:38:48 +0100	[diff] [blame]	1837	if (!(flags & ENQUEUE_WAKEUP) \|\| (flags & ENQUEUE_WAKING))
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	1838	se->vruntime += cfs_rq->min_vruntime;
				1839
				1840	/*
Dmitry Adamushko	a2a2d68	2007-10-15 17:00:13 +0200	[diff] [blame]	1841	* Update run-time statistics of the 'current'.
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1842	*/
Ingo Molnar	b7cc089	2007-08-09 11:16:47 +0200	[diff] [blame]	1843	update_curr(cfs_rq);
Paul Turner	f269ae0	2012-10-04 13:18:31 +0200	[diff] [blame]	1844	enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
Linus Torvalds	17bc14b	2012-12-14 07:20:43 -0800	[diff] [blame]	1845	account_entity_enqueue(cfs_rq, se);
				1846	update_cfs_shares(cfs_rq);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1847
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	1848	if (flags & ENQUEUE_WAKEUP) {
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	1849	place_entity(cfs_rq, se, 0);
Ingo Molnar	2396af6	2007-08-09 11:16:48 +0200	[diff] [blame]	1850	enqueue_sleeper(cfs_rq, se);
Ingo Molnar	e9acbff	2007-10-15 17:00:04 +0200	[diff] [blame]	1851	}
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1852
Ingo Molnar	d2417e5	2007-08-09 11:16:47 +0200	[diff] [blame]	1853	update_stats_enqueue(cfs_rq, se);
Peter Zijlstra	ddc9729	2007-10-15 17:00:10 +0200	[diff] [blame]	1854	check_spread(cfs_rq, se);
Srivatsa Vaddagiri	83b699e	2007-10-15 17:00:08 +0200	[diff] [blame]	1855	if (se != cfs_rq->curr)
				1856	__enqueue_entity(cfs_rq, se);
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	1857	se->on_rq = 1;
Peter Zijlstra	3d4b47b	2010-11-15 15:47:01 -0800	[diff] [blame]	1858
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	1859	if (cfs_rq->nr_running == 1) {
Peter Zijlstra	3d4b47b	2010-11-15 15:47:01 -0800	[diff] [blame]	1860	list_add_leaf_cfs_rq(cfs_rq);
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	1861	check_enqueue_throttle(cfs_rq);
				1862	}
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1863	}
				1864
Rik van Riel	2c13c919	2011-02-01 09:48:37 -0500	[diff] [blame]	1865	static void __clear_buddies_last(struct sched_entity *se)
Peter Zijlstra	2002c69	2008-11-11 11:52:33 +0100	[diff] [blame]	1866	{
Rik van Riel	2c13c919	2011-02-01 09:48:37 -0500	[diff] [blame]	1867	for_each_sched_entity(se) {
				1868	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				1869	if (cfs_rq->last == se)
				1870	cfs_rq->last = NULL;
				1871	else
				1872	break;
				1873	}
				1874	}
Peter Zijlstra	2002c69	2008-11-11 11:52:33 +0100	[diff] [blame]	1875
Rik van Riel	2c13c919	2011-02-01 09:48:37 -0500	[diff] [blame]	1876	static void __clear_buddies_next(struct sched_entity *se)
				1877	{
				1878	for_each_sched_entity(se) {
				1879	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				1880	if (cfs_rq->next == se)
				1881	cfs_rq->next = NULL;
				1882	else
				1883	break;
				1884	}
Peter Zijlstra	2002c69	2008-11-11 11:52:33 +0100	[diff] [blame]	1885	}
				1886
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	1887	static void __clear_buddies_skip(struct sched_entity *se)
				1888	{
				1889	for_each_sched_entity(se) {
				1890	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				1891	if (cfs_rq->skip == se)
				1892	cfs_rq->skip = NULL;
				1893	else
				1894	break;
				1895	}
				1896	}
				1897
Peter Zijlstra	a571bbe	2009-01-28 14:51:40 +0100	[diff] [blame]	1898	static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
				1899	{
Rik van Riel	2c13c919	2011-02-01 09:48:37 -0500	[diff] [blame]	1900	if (cfs_rq->last == se)
				1901	__clear_buddies_last(se);
				1902
				1903	if (cfs_rq->next == se)
				1904	__clear_buddies_next(se);
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	1905
				1906	if (cfs_rq->skip == se)
				1907	__clear_buddies_skip(se);
Peter Zijlstra	a571bbe	2009-01-28 14:51:40 +0100	[diff] [blame]	1908	}
				1909
Peter Zijlstra	6c16a6d	2012-03-21 13:07:16 -0700	[diff] [blame]	1910	static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
Paul Turner	d8b4986	2011-07-21 09:43:41 -0700	[diff] [blame]	1911
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1912	static void
Peter Zijlstra	371fd7e	2010-03-24 16:38:48 +0100	[diff] [blame]	1913	dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1914	{
Dmitry Adamushko	a2a2d68	2007-10-15 17:00:13 +0200	[diff] [blame]	1915	/*
				1916	* Update run-time statistics of the 'current'.
				1917	*/
				1918	update_curr(cfs_rq);
Linus Torvalds	17bc14b	2012-12-14 07:20:43 -0800	[diff] [blame]	1919	dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP);
Dmitry Adamushko	a2a2d68	2007-10-15 17:00:13 +0200	[diff] [blame]	1920
Ingo Molnar	19b6a2e	2007-08-09 11:16:48 +0200	[diff] [blame]	1921	update_stats_dequeue(cfs_rq, se);
Peter Zijlstra	371fd7e	2010-03-24 16:38:48 +0100	[diff] [blame]	1922	if (flags & DEQUEUE_SLEEP) {
Peter Zijlstra	67e9fb2	2007-10-15 17:00:10 +0200	[diff] [blame]	1923	#ifdef CONFIG_SCHEDSTATS
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1924	if (entity_is_task(se)) {
				1925	struct task_struct *tsk = task_of(se);
				1926
				1927	if (tsk->state & TASK_INTERRUPTIBLE)
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	1928	se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1929	if (tsk->state & TASK_UNINTERRUPTIBLE)
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	1930	se->statistics.block_start = rq_clock(rq_of(cfs_rq));
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1931	}
Dmitry Adamushko	db36cc7	2007-10-15 17:00:06 +0200	[diff] [blame]	1932	#endif
Peter Zijlstra	67e9fb2	2007-10-15 17:00:10 +0200	[diff] [blame]	1933	}
				1934
Peter Zijlstra	2002c69	2008-11-11 11:52:33 +0100	[diff] [blame]	1935	clear_buddies(cfs_rq, se);
Peter Zijlstra	4793241	2008-11-04 21:25:09 +0100	[diff] [blame]	1936
Srivatsa Vaddagiri	83b699e	2007-10-15 17:00:08 +0200	[diff] [blame]	1937	if (se != cfs_rq->curr)
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	1938	__dequeue_entity(cfs_rq, se);
Linus Torvalds	17bc14b	2012-12-14 07:20:43 -0800	[diff] [blame]	1939	se->on_rq = 0;
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	1940	account_entity_dequeue(cfs_rq, se);
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	1941
				1942	/*
				1943	* Normalize the entity after updating the min_vruntime because the
				1944	* update can refer to the ->curr item and we need to reflect this
				1945	* movement in our normalized position.
				1946	*/
Peter Zijlstra	371fd7e	2010-03-24 16:38:48 +0100	[diff] [blame]	1947	if (!(flags & DEQUEUE_SLEEP))
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	1948	se->vruntime -= cfs_rq->min_vruntime;
Peter Zijlstra	1e87623	2011-05-17 16:21:10 -0700	[diff] [blame]	1949
Paul Turner	d8b4986	2011-07-21 09:43:41 -0700	[diff] [blame]	1950	/* return excess runtime on last dequeue */
				1951	return_cfs_rq_runtime(cfs_rq);
				1952
Peter Zijlstra	1e87623	2011-05-17 16:21:10 -0700	[diff] [blame]	1953	update_min_vruntime(cfs_rq);
Linus Torvalds	17bc14b	2012-12-14 07:20:43 -0800	[diff] [blame]	1954	update_cfs_shares(cfs_rq);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1955	}
				1956
				1957	/*
				1958	* Preempt the current task with a newly woken task if needed:
				1959	*/
Peter Zijlstra	7c92e54	2007-09-05 14:32:49 +0200	[diff] [blame]	1960	static void
Ingo Molnar	2e09bf5	2007-10-15 17:00:05 +0200	[diff] [blame]	1961	check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1962	{
Peter Zijlstra	1169783	2007-09-05 14:32:49 +0200	[diff] [blame]	1963	unsigned long ideal_runtime, delta_exec;
Wang Xingchao	f4cfb33	2011-09-16 13:35:52 -0400	[diff] [blame]	1964	struct sched_entity *se;
				1965	s64 delta;
Peter Zijlstra	1169783	2007-09-05 14:32:49 +0200	[diff] [blame]	1966
Peter Zijlstra	6d0f0eb	2007-10-15 17:00:05 +0200	[diff] [blame]	1967	ideal_runtime = sched_slice(cfs_rq, curr);
Peter Zijlstra	1169783	2007-09-05 14:32:49 +0200	[diff] [blame]	1968	delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
Mike Galbraith	a9f3e2b	2009-01-28 14:51:39 +0100	[diff] [blame]	1969	if (delta_exec > ideal_runtime) {
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1970	resched_task(rq_of(cfs_rq)->curr);
Mike Galbraith	a9f3e2b	2009-01-28 14:51:39 +0100	[diff] [blame]	1971	/*
				1972	* The current task ran long enough, ensure it doesn't get
				1973	* re-elected due to buddy favours.
				1974	*/
				1975	clear_buddies(cfs_rq, curr);
Mike Galbraith	f685cea	2009-10-23 23:09:22 +0200	[diff] [blame]	1976	return;
				1977	}
				1978
				1979	/*
				1980	* Ensure that a task that missed wakeup preemption by a
				1981	* narrow margin doesn't have to wait for a full slice.
				1982	* This also mitigates buddy induced latencies under load.
				1983	*/
Mike Galbraith	f685cea	2009-10-23 23:09:22 +0200	[diff] [blame]	1984	if (delta_exec < sysctl_sched_min_granularity)
				1985	return;
				1986
Wang Xingchao	f4cfb33	2011-09-16 13:35:52 -0400	[diff] [blame]	1987	se = __pick_first_entity(cfs_rq);
				1988	delta = curr->vruntime - se->vruntime;
Mike Galbraith	f685cea	2009-10-23 23:09:22 +0200	[diff] [blame]	1989
Wang Xingchao	f4cfb33	2011-09-16 13:35:52 -0400	[diff] [blame]	1990	if (delta < 0)
				1991	return;
Mike Galbraith	d7d8294	2011-01-05 05:41:17 +0100	[diff] [blame]	1992
Wang Xingchao	f4cfb33	2011-09-16 13:35:52 -0400	[diff] [blame]	1993	if (delta > ideal_runtime)
				1994	resched_task(rq_of(cfs_rq)->curr);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1995	}
				1996
Srivatsa Vaddagiri	83b699e	2007-10-15 17:00:08 +0200	[diff] [blame]	1997	static void
Ingo Molnar	8494f41	2007-08-09 11:16:48 +0200	[diff] [blame]	1998	set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1999	{
Srivatsa Vaddagiri	83b699e	2007-10-15 17:00:08 +0200	[diff] [blame]	2000	/* 'current' is not kept within the tree. */
				2001	if (se->on_rq) {
				2002	/*
				2003	* Any task has to be enqueued before it gets to execute on
				2004	* a CPU. So account for the time it spent waiting on the
				2005	* runqueue.
				2006	*/
				2007	update_stats_wait_end(cfs_rq, se);
				2008	__dequeue_entity(cfs_rq, se);
				2009	}
				2010
Ingo Molnar	79303e9	2007-08-09 11:16:47 +0200	[diff] [blame]	2011	update_stats_curr_start(cfs_rq, se);
Ingo Molnar	429d43b	2007-10-15 17:00:03 +0200	[diff] [blame]	2012	cfs_rq->curr = se;
Ingo Molnar	eba1ed4	2007-10-15 17:00:02 +0200	[diff] [blame]	2013	#ifdef CONFIG_SCHEDSTATS
				2014	/*
				2015	* Track our maximum slice length, if the CPU's load is at
				2016	* least twice that of our own weight (i.e. don't track it
				2017	* when there are only lesser-weight tasks around):
				2018	*/
Dmitry Adamushko	495eca4	2007-10-15 17:00:06 +0200	[diff] [blame]	2019	if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	2020	se->statistics.slice_max = max(se->statistics.slice_max,
Ingo Molnar	eba1ed4	2007-10-15 17:00:02 +0200	[diff] [blame]	2021	se->sum_exec_runtime - se->prev_sum_exec_runtime);
				2022	}
				2023	#endif
Peter Zijlstra	4a55b45	2007-09-05 14:32:49 +0200	[diff] [blame]	2024	se->prev_sum_exec_runtime = se->sum_exec_runtime;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2025	}
				2026
Peter Zijlstra	3f3a490	2008-10-24 11:06:16 +0200	[diff] [blame]	2027	static int
				2028	wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
				2029
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	2030	/*
				2031	* Pick the next process, keeping these things in mind, in this order:
				2032	* 1) keep things fair between processes/task groups
				2033	* 2) pick the "next" process, since someone really wants that to run
				2034	* 3) pick the "last" process, for cache locality
				2035	* 4) do not run the "skip" process, if something else is available
				2036	*/
Peter Zijlstra	f4b6755	2008-11-04 21:25:07 +0100	[diff] [blame]	2037	static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
Peter Zijlstra	aa2ac25	2008-03-14 21:12:12 +0100	[diff] [blame]	2038	{
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	2039	struct sched_entity *se = __pick_first_entity(cfs_rq);
Mike Galbraith	f685cea	2009-10-23 23:09:22 +0200	[diff] [blame]	2040	struct sched_entity *left = se;
Peter Zijlstra	f4b6755	2008-11-04 21:25:07 +0100	[diff] [blame]	2041
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	2042	/*
				2043	* Avoid running the skip buddy, if running something else can
				2044	* be done without getting too unfair.
				2045	*/
				2046	if (cfs_rq->skip == se) {
				2047	struct sched_entity *second = __pick_next_entity(se);
				2048	if (second && wakeup_preempt_entity(second, left) < 1)
				2049	se = second;
				2050	}
Peter Zijlstra	aa2ac25	2008-03-14 21:12:12 +0100	[diff] [blame]	2051
Mike Galbraith	f685cea	2009-10-23 23:09:22 +0200	[diff] [blame]	2052	/*
				2053	* Prefer last buddy, try to return the CPU to a preempted task.
				2054	*/
				2055	if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
				2056	se = cfs_rq->last;
				2057
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	2058	/*
				2059	* Someone really wants this to run. If it's not unfair, run it.
				2060	*/
				2061	if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
				2062	se = cfs_rq->next;
				2063
Mike Galbraith	f685cea	2009-10-23 23:09:22 +0200	[diff] [blame]	2064	clear_buddies(cfs_rq, se);
Peter Zijlstra	4793241	2008-11-04 21:25:09 +0100	[diff] [blame]	2065
				2066	return se;
Peter Zijlstra	aa2ac25	2008-03-14 21:12:12 +0100	[diff] [blame]	2067	}
				2068
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	2069	static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
				2070
Ingo Molnar	ab6cde2	2007-08-09 11:16:48 +0200	[diff] [blame]	2071	static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2072	{
				2073	/*
				2074	* If still on the runqueue then deactivate_task()
				2075	* was not called and update_curr() has to be done:
				2076	*/
				2077	if (prev->on_rq)
Ingo Molnar	b7cc089	2007-08-09 11:16:47 +0200	[diff] [blame]	2078	update_curr(cfs_rq);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2079
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	2080	/* throttle cfs_rqs exceeding runtime */
				2081	check_cfs_rq_runtime(cfs_rq);
				2082
Peter Zijlstra	ddc9729	2007-10-15 17:00:10 +0200	[diff] [blame]	2083	check_spread(cfs_rq, prev);
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	2084	if (prev->on_rq) {
Ingo Molnar	5870db5	2007-08-09 11:16:47 +0200	[diff] [blame]	2085	update_stats_wait_start(cfs_rq, prev);
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	2086	/* Put 'current' back into the tree. */
				2087	__enqueue_entity(cfs_rq, prev);
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	2088	/* in !on_rq case, update occurred at dequeue */
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	2089	update_entity_load_avg(prev, 1);
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	2090	}
Ingo Molnar	429d43b	2007-10-15 17:00:03 +0200	[diff] [blame]	2091	cfs_rq->curr = NULL;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2092	}
				2093
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	2094	static void
				2095	entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2096	{
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2097	/*
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	2098	* Update run-time statistics of the 'current'.
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2099	*/
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	2100	update_curr(cfs_rq);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2101
Paul Turner	43365bd	2010-12-15 19:10:17 -0800	[diff] [blame]	2102	/*
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	2103	* Ensure that runnable average is periodically updated.
				2104	*/
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	2105	update_entity_load_avg(curr, 1);
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	2106	update_cfs_rq_blocked_load(cfs_rq, 1);
Peter Zijlstra	bf0bd94	2013-07-26 23:48:42 +0200	[diff] [blame]	2107	update_cfs_shares(cfs_rq);
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	2108
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	2109	#ifdef CONFIG_SCHED_HRTICK
				2110	/*
				2111	* queued ticks are scheduled to match the slice, so don't bother
				2112	* validating it and just reschedule.
				2113	*/
Harvey Harrison	983ed7a	2008-04-24 18:17:55 -0700	[diff] [blame]	2114	if (queued) {
				2115	resched_task(rq_of(cfs_rq)->curr);
				2116	return;
				2117	}
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	2118	/*
				2119	* don't let the period tick interfere with the hrtick preemption
				2120	*/
				2121	if (!sched_feat(DOUBLE_TICK) &&
				2122	hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
				2123	return;
				2124	#endif
				2125
Yong Zhang	2c2efae	2011-07-29 16:20:33 +0800	[diff] [blame]	2126	if (cfs_rq->nr_running > 1)
Ingo Molnar	2e09bf5	2007-10-15 17:00:05 +0200	[diff] [blame]	2127	check_preempt_tick(cfs_rq, curr);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2128	}
				2129
Paul Turner	ab84d31	2011-07-21 09:43:28 -0700	[diff] [blame]	2130
				2131	/**************************************************
				2132	* CFS bandwidth control machinery
				2133	*/
				2134
				2135	#ifdef CONFIG_CFS_BANDWIDTH
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	2136
				2137	#ifdef HAVE_JUMP_LABEL
Ingo Molnar	c5905af	2012-02-24 08:31:31 +0100	[diff] [blame]	2138	static struct static_key __cfs_bandwidth_used;
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	2139
				2140	static inline bool cfs_bandwidth_used(void)
				2141	{
Ingo Molnar	c5905af	2012-02-24 08:31:31 +0100	[diff] [blame]	2142	return static_key_false(&__cfs_bandwidth_used);
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	2143	}
				2144
				2145	void account_cfs_bandwidth_used(int enabled, int was_enabled)
				2146	{
				2147	/* only need to count groups transitioning between enabled/!enabled */
				2148	if (enabled && !was_enabled)
Ingo Molnar	c5905af	2012-02-24 08:31:31 +0100	[diff] [blame]	2149	static_key_slow_inc(&__cfs_bandwidth_used);
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	2150	else if (!enabled && was_enabled)
Ingo Molnar	c5905af	2012-02-24 08:31:31 +0100	[diff] [blame]	2151	static_key_slow_dec(&__cfs_bandwidth_used);
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	2152	}
				2153	#else /* HAVE_JUMP_LABEL */
				2154	static bool cfs_bandwidth_used(void)
				2155	{
				2156	return true;
				2157	}
				2158
				2159	void account_cfs_bandwidth_used(int enabled, int was_enabled) {}
				2160	#endif /* HAVE_JUMP_LABEL */
				2161
Paul Turner	ab84d31	2011-07-21 09:43:28 -0700	[diff] [blame]	2162	/*
				2163	* default period for cfs group bandwidth.
				2164	* default: 0.1s, units: nanoseconds
				2165	*/
				2166	static inline u64 default_cfs_period(void)
				2167	{
				2168	return 100000000ULL;
				2169	}
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	2170
				2171	static inline u64 sched_cfs_bandwidth_slice(void)
				2172	{
				2173	return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
				2174	}
				2175
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	2176	/*
				2177	* Replenish runtime according to assigned quota and update expiration time.
				2178	* We use sched_clock_cpu directly instead of rq->clock to avoid adding
				2179	* additional synchronization around rq->lock.
				2180	*
				2181	* requires cfs_b->lock
				2182	*/
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	2183	void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	2184	{
				2185	u64 now;
				2186
				2187	if (cfs_b->quota == RUNTIME_INF)
				2188	return;
				2189
				2190	now = sched_clock_cpu(smp_processor_id());
				2191	cfs_b->runtime = cfs_b->quota;
				2192	cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
				2193	}
				2194
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	2195	static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
				2196	{
				2197	return &tg->cfs_bandwidth;
				2198	}
				2199
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	2200	/* rq->task_clock normalized against any time this cfs_rq has spent throttled */
				2201	static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
				2202	{
				2203	if (unlikely(cfs_rq->throttle_count))
				2204	return cfs_rq->throttled_clock_task;
				2205
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	2206	return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	2207	}
				2208
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	2209	/* returns 0 on failure to allocate runtime */
				2210	static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	2211	{
				2212	struct task_group *tg = cfs_rq->tg;
				2213	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	2214	u64 amount = 0, min_amount, expires;
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	2215
				2216	/* note: this is a positive sum as runtime_remaining <= 0 */
				2217	min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
				2218
				2219	raw_spin_lock(&cfs_b->lock);
				2220	if (cfs_b->quota == RUNTIME_INF)
				2221	amount = min_amount;
Paul Turner	58088ad	2011-07-21 09:43:31 -0700	[diff] [blame]	2222	else {
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	2223	/*
				2224	* If the bandwidth pool has become inactive, then at least one
				2225	* period must have elapsed since the last consumption.
				2226	* Refresh the global state and ensure bandwidth timer becomes
				2227	* active.
				2228	*/
				2229	if (!cfs_b->timer_active) {
				2230	__refill_cfs_bandwidth_runtime(cfs_b);
Paul Turner	58088ad	2011-07-21 09:43:31 -0700	[diff] [blame]	2231	__start_cfs_bandwidth(cfs_b);
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	2232	}
Paul Turner	58088ad	2011-07-21 09:43:31 -0700	[diff] [blame]	2233
				2234	if (cfs_b->runtime > 0) {
				2235	amount = min(cfs_b->runtime, min_amount);
				2236	cfs_b->runtime -= amount;
				2237	cfs_b->idle = 0;
				2238	}
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	2239	}
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	2240	expires = cfs_b->runtime_expires;
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	2241	raw_spin_unlock(&cfs_b->lock);
				2242
				2243	cfs_rq->runtime_remaining += amount;
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	2244	/*
				2245	* we may have advanced our local expiration to account for allowed
				2246	* spread between our sched_clock and the one on which runtime was
				2247	* issued.
				2248	*/
				2249	if ((s64)(expires - cfs_rq->runtime_expires) > 0)
				2250	cfs_rq->runtime_expires = expires;
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	2251
				2252	return cfs_rq->runtime_remaining > 0;
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	2253	}
				2254
				2255	/*
				2256	* Note: This depends on the synchronization provided by sched_clock and the
				2257	* fact that rq->clock snapshots this value.
				2258	*/
				2259	static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
				2260	{
				2261	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	2262
				2263	/* if the deadline is ahead of our clock, nothing to do */
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	2264	if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0))
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	2265	return;
				2266
				2267	if (cfs_rq->runtime_remaining < 0)
				2268	return;
				2269
				2270	/*
				2271	* If the local deadline has passed we have to consider the
				2272	* possibility that our sched_clock is 'fast' and the global deadline
				2273	* has not truly expired.
				2274	*
				2275	* Fortunately we can determine whether this is the case by checking
				2276	* whether the global deadline has advanced.
				2277	*/
				2278
				2279	if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) {
				2280	/* extend local deadline, drift is bounded above by 2 ticks */
				2281	cfs_rq->runtime_expires += TICK_NSEC;
				2282	} else {
				2283	/* global deadline is ahead, expiration has passed */
				2284	cfs_rq->runtime_remaining = 0;
				2285	}
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	2286	}
				2287
				2288	static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
				2289	unsigned long delta_exec)
				2290	{
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	2291	/* dock delta_exec before expiring quota (as it could span periods) */
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	2292	cfs_rq->runtime_remaining -= delta_exec;
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	2293	expire_cfs_rq_runtime(cfs_rq);
				2294
				2295	if (likely(cfs_rq->runtime_remaining > 0))
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	2296	return;
				2297
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	2298	/*
				2299	* if we're unable to extend our runtime we resched so that the active
				2300	* hierarchy can be throttled
				2301	*/
				2302	if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
				2303	resched_task(rq_of(cfs_rq)->curr);
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	2304	}
				2305
Peter Zijlstra	6c16a6d	2012-03-21 13:07:16 -0700	[diff] [blame]	2306	static __always_inline
				2307	void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec)
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	2308	{
Paul Turner	56f570e	2011-11-07 20:26:33 -0800	[diff] [blame]	2309	if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	2310	return;
				2311
				2312	__account_cfs_rq_runtime(cfs_rq, delta_exec);
				2313	}
				2314
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	2315	static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
				2316	{
Paul Turner	56f570e	2011-11-07 20:26:33 -0800	[diff] [blame]	2317	return cfs_bandwidth_used() && cfs_rq->throttled;
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	2318	}
				2319
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	2320	/* check whether cfs_rq, or any parent, is throttled */
				2321	static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
				2322	{
Paul Turner	56f570e	2011-11-07 20:26:33 -0800	[diff] [blame]	2323	return cfs_bandwidth_used() && cfs_rq->throttle_count;
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	2324	}
				2325
				2326	/*
				2327	* Ensure that neither of the group entities corresponding to src_cpu or
				2328	* dest_cpu are members of a throttled hierarchy when performing group
				2329	* load-balance operations.
				2330	*/
				2331	static inline int throttled_lb_pair(struct task_group *tg,
				2332	int src_cpu, int dest_cpu)
				2333	{
				2334	struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
				2335
				2336	src_cfs_rq = tg->cfs_rq[src_cpu];
				2337	dest_cfs_rq = tg->cfs_rq[dest_cpu];
				2338
				2339	return throttled_hierarchy(src_cfs_rq) ||
				2340	throttled_hierarchy(dest_cfs_rq);
				2341	}
				2342
				2343	/* updated child weight may affect parent so we have to do this bottom up */
				2344	static int tg_unthrottle_up(struct task_group *tg, void *data)
				2345	{
				2346	struct rq *rq = data;
				2347	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
				2348
				2349	cfs_rq->throttle_count--;
				2350	#ifdef CONFIG_SMP
				2351	if (!cfs_rq->throttle_count) {
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	2352	/* adjust cfs_rq_clock_task() */
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	2353	cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	2354	cfs_rq->throttled_clock_task;
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	2355	}
				2356	#endif
				2357
				2358	return 0;
				2359	}
				2360
				2361	static int tg_throttle_down(struct task_group *tg, void *data)
				2362	{
				2363	struct rq *rq = data;
				2364	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
				2365
Paul Turner	8295836	2012-10-04 13:18:31 +0200	[diff] [blame]	2366	/* group is entering throttled state, stop time */
				2367	if (!cfs_rq->throttle_count)
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	2368	cfs_rq->throttled_clock_task = rq_clock_task(rq);
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	2369	cfs_rq->throttle_count++;
				2370
				2371	return 0;
				2372	}
				2373
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	2374	static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	2375	{
				2376	struct rq *rq = rq_of(cfs_rq);
				2377	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
				2378	struct sched_entity *se;
				2379	long task_delta, dequeue = 1;
				2380
				2381	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
				2382
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	2383	/* freeze hierarchy runnable averages while throttled */
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	2384	rcu_read_lock();
				2385	walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
				2386	rcu_read_unlock();
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	2387
				2388	task_delta = cfs_rq->h_nr_running;
				2389	for_each_sched_entity(se) {
				2390	struct cfs_rq *qcfs_rq = cfs_rq_of(se);
				2391	/* throttled entity or throttle-on-deactivate */
				2392	if (!se->on_rq)
				2393	break;
				2394
				2395	if (dequeue)
				2396	dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
				2397	qcfs_rq->h_nr_running -= task_delta;
				2398
				2399	if (qcfs_rq->load.weight)
				2400	dequeue = 0;
				2401	}
				2402
				2403	if (!se)
				2404	rq->nr_running -= task_delta;
				2405
				2406	cfs_rq->throttled = 1;
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	2407	cfs_rq->throttled_clock = rq_clock(rq);
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	2408	raw_spin_lock(&cfs_b->lock);
				2409	list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
				2410	raw_spin_unlock(&cfs_b->lock);
				2411	}
				2412
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	2413	void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	2414	{
				2415	struct rq *rq = rq_of(cfs_rq);
				2416	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
				2417	struct sched_entity *se;
				2418	int enqueue = 1;
				2419	long task_delta;
				2420
Michael Wang	22b958d	2013-06-04 14:23:39 +0800	[diff] [blame]	2421	se = cfs_rq->tg->se[cpu_of(rq)];
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	2422
				2423	cfs_rq->throttled = 0;
Frederic Weisbecker	1a55af2	2013-04-12 01:51:01 +0200	[diff] [blame]	2424
				2425	update_rq_clock(rq);
				2426
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	2427	raw_spin_lock(&cfs_b->lock);
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	2428	cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	2429	list_del_rcu(&cfs_rq->throttled_list);
				2430	raw_spin_unlock(&cfs_b->lock);
				2431
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	2432	/* update hierarchical throttle state */
				2433	walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
				2434
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	2435	if (!cfs_rq->load.weight)
				2436	return;
				2437
				2438	task_delta = cfs_rq->h_nr_running;
				2439	for_each_sched_entity(se) {
				2440	if (se->on_rq)
				2441	enqueue = 0;
				2442
				2443	cfs_rq = cfs_rq_of(se);
				2444	if (enqueue)
				2445	enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
				2446	cfs_rq->h_nr_running += task_delta;
				2447
				2448	if (cfs_rq_throttled(cfs_rq))
				2449	break;
				2450	}
				2451
				2452	if (!se)
				2453	rq->nr_running += task_delta;
				2454
				2455	/* determine whether we need to wake up potentially idle cpu */
				2456	if (rq->curr == rq->idle && rq->cfs.nr_running)
				2457	resched_task(rq->curr);
				2458	}
				2459
				2460	static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
				2461	u64 remaining, u64 expires)
				2462	{
				2463	struct cfs_rq *cfs_rq;
				2464	u64 runtime = remaining;
				2465
				2466	rcu_read_lock();
				2467	list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
				2468	throttled_list) {
				2469	struct rq *rq = rq_of(cfs_rq);
				2470
				2471	raw_spin_lock(&rq->lock);
				2472	if (!cfs_rq_throttled(cfs_rq))
				2473	goto next;
				2474
				2475	runtime = -cfs_rq->runtime_remaining + 1;
				2476	if (runtime > remaining)
				2477	runtime = remaining;
				2478	remaining -= runtime;
				2479
				2480	cfs_rq->runtime_remaining += runtime;
				2481	cfs_rq->runtime_expires = expires;
				2482
				2483	/* we check whether we're throttled above */
				2484	if (cfs_rq->runtime_remaining > 0)
				2485	unthrottle_cfs_rq(cfs_rq);
				2486
				2487	next:
				2488	raw_spin_unlock(&rq->lock);
				2489
				2490	if (!remaining)
				2491	break;
				2492	}
				2493	rcu_read_unlock();
				2494
				2495	return remaining;
				2496	}
				2497
Paul Turner	58088ad	2011-07-21 09:43:31 -0700	[diff] [blame]	2498	/*
				2499	* Responsible for refilling a task_group's bandwidth and unthrottling its
				2500	* cfs_rqs as appropriate. If there has been no activity within the last
				2501	* period the timer is deactivated until scheduling resumes; cfs_b->idle is
				2502	* used to track this state.
				2503	*/
				2504	static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
				2505	{
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	2506	u64 runtime, runtime_expires;
				2507	int idle = 1, throttled;
Paul Turner	58088ad	2011-07-21 09:43:31 -0700	[diff] [blame]	2508
				2509	raw_spin_lock(&cfs_b->lock);
				2510	/* no need to continue the timer with no bandwidth constraint */
				2511	if (cfs_b->quota == RUNTIME_INF)
				2512	goto out_unlock;
				2513
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	2514	throttled = !list_empty(&cfs_b->throttled_cfs_rq);
				2515	/* idle depends on !throttled (for the case of a large deficit) */
				2516	idle = cfs_b->idle && !throttled;
Nikhil Rao	e8da1b1	2011-07-21 09:43:40 -0700	[diff] [blame]	2517	cfs_b->nr_periods += overrun;
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	2518
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	2519	/* if we're going inactive then everything else can be deferred */
				2520	if (idle)
				2521	goto out_unlock;
				2522
				2523	__refill_cfs_bandwidth_runtime(cfs_b);
				2524
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	2525	if (!throttled) {
				2526	/* mark as potentially idle for the upcoming period */
				2527	cfs_b->idle = 1;
				2528	goto out_unlock;
				2529	}
Paul Turner	58088ad	2011-07-21 09:43:31 -0700	[diff] [blame]	2530
Nikhil Rao	e8da1b1	2011-07-21 09:43:40 -0700	[diff] [blame]	2531	/* account preceding periods in which throttling occurred */
				2532	cfs_b->nr_throttled += overrun;
				2533
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	2534	/*
				2535	* There are throttled entities so we must first use the new bandwidth
				2536	* to unthrottle them before making it generally available. This
				2537	* ensures that all existing debts will be paid before a new cfs_rq is
				2538	* allowed to run.
				2539	*/
				2540	runtime = cfs_b->runtime;
				2541	runtime_expires = cfs_b->runtime_expires;
				2542	cfs_b->runtime = 0;
				2543
				2544	/*
				2545	* This check is repeated as we are holding onto the new bandwidth
				2546	* while we unthrottle. This can potentially race with an unthrottled
				2547	* group trying to acquire new bandwidth from the global pool.
				2548	*/
				2549	while (throttled && runtime > 0) {
				2550	raw_spin_unlock(&cfs_b->lock);
				2551	/* we can't nest cfs_b->lock while distributing bandwidth */
				2552	runtime = distribute_cfs_runtime(cfs_b, runtime,
				2553	runtime_expires);
				2554	raw_spin_lock(&cfs_b->lock);
				2555
				2556	throttled = !list_empty(&cfs_b->throttled_cfs_rq);
				2557	}
				2558
				2559	/* return (any) remaining runtime */
				2560	cfs_b->runtime = runtime;
				2561	/*
				2562	* While we are ensured activity in the period following an
				2563	* unthrottle, this also covers the case in which the new bandwidth is
				2564	* insufficient to cover the existing bandwidth deficit. (Forcing the
				2565	* timer to remain active while there are any throttled entities.)
				2566	*/
				2567	cfs_b->idle = 0;
Paul Turner	58088ad	2011-07-21 09:43:31 -0700	[diff] [blame]	2568	out_unlock:
				2569	if (idle)
				2570	cfs_b->timer_active = 0;
				2571	raw_spin_unlock(&cfs_b->lock);
				2572
				2573	return idle;
				2574	}
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	2575
Paul Turner	d8b4986	2011-07-21 09:43:41 -0700	[diff] [blame]	2576	/* a cfs_rq won't donate quota below this amount */
				2577	static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
				2578	/* minimum remaining period time to redistribute slack quota */
				2579	static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
				2580	/* how long we wait to gather additional slack before distributing */
				2581	static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
				2582
				2583	/* are we near the end of the current quota period? */
				2584	static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
				2585	{
				2586	struct hrtimer *refresh_timer = &cfs_b->period_timer;
				2587	u64 remaining;
				2588
				2589	/* if the call-back is running a quota refresh is already occurring */
				2590	if (hrtimer_callback_running(refresh_timer))
				2591	return 1;
				2592
				2593	/* is a quota refresh about to occur? */
				2594	remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
				2595	if (remaining < min_expire)
				2596	return 1;
				2597
				2598	return 0;
				2599	}
				2600
				2601	static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
				2602	{
				2603	u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
				2604
				2605	/* if there's a quota refresh soon don't bother with slack */
				2606	if (runtime_refresh_within(cfs_b, min_left))
				2607	return;
				2608
				2609	start_bandwidth_timer(&cfs_b->slack_timer,
				2610	ns_to_ktime(cfs_bandwidth_slack_period));
				2611	}
				2612
				2613	/* we know any runtime found here is valid as update_curr() precedes return */
				2614	static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
				2615	{
				2616	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
				2617	s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
				2618
				2619	if (slack_runtime <= 0)
				2620	return;
				2621
				2622	raw_spin_lock(&cfs_b->lock);
				2623	if (cfs_b->quota != RUNTIME_INF &&
				2624	cfs_rq->runtime_expires == cfs_b->runtime_expires) {
				2625	cfs_b->runtime += slack_runtime;
				2626
				2627	/* we are under rq->lock, defer unthrottling using a timer */
				2628	if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
				2629	!list_empty(&cfs_b->throttled_cfs_rq))
				2630	start_cfs_slack_bandwidth(cfs_b);
				2631	}
				2632	raw_spin_unlock(&cfs_b->lock);
				2633
				2634	/* even if it's not valid for return we don't want to try again */
				2635	cfs_rq->runtime_remaining -= slack_runtime;
				2636	}
				2637
				2638	static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
				2639	{
Paul Turner	56f570e	2011-11-07 20:26:33 -0800	[diff] [blame]	2640	if (!cfs_bandwidth_used())
				2641	return;
				2642
Paul Turner	fccfdc6	2011-11-07 20:26:34 -0800	[diff] [blame]	2643	if (!cfs_rq->runtime_enabled \|\| cfs_rq->nr_running)
Paul Turner	d8b4986	2011-07-21 09:43:41 -0700	[diff] [blame]	2644	return;
				2645
				2646	__return_cfs_rq_runtime(cfs_rq);
				2647	}
				2648
				2649	/*
				2650	* This is done with a timer (instead of inline with bandwidth return) since
				2651	* it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
				2652	*/
				2653	static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
				2654	{
				2655	u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
				2656	u64 expires;
				2657
				2658	/* confirm we're still not at a refresh boundary */
				2659	if (runtime_refresh_within(cfs_b, min_bandwidth_expiration))
				2660	return;
				2661
				2662	raw_spin_lock(&cfs_b->lock);
				2663	if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
				2664	runtime = cfs_b->runtime;
				2665	cfs_b->runtime = 0;
				2666	}
				2667	expires = cfs_b->runtime_expires;
				2668	raw_spin_unlock(&cfs_b->lock);
				2669
				2670	if (!runtime)
				2671	return;
				2672
				2673	runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
				2674
				2675	raw_spin_lock(&cfs_b->lock);
				2676	if (expires == cfs_b->runtime_expires)
				2677	cfs_b->runtime = runtime;
				2678	raw_spin_unlock(&cfs_b->lock);
				2679	}
				2680
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	2681	/*
				2682	* When a group wakes up we want to make sure that its quota is not already
				2683	* expired/exceeded, otherwise it may be allowed to steal additional ticks of
				2684	* runtime as update_curr() throttling can not not trigger until it's on-rq.
				2685	*/
				2686	static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
				2687	{
Paul Turner	56f570e	2011-11-07 20:26:33 -0800	[diff] [blame]	2688	if (!cfs_bandwidth_used())
				2689	return;
				2690
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	2691	/* an active group must be handled by the update_curr()->put() path */
				2692	if (!cfs_rq->runtime_enabled \|\| cfs_rq->curr)
				2693	return;
				2694
				2695	/* ensure the group is not already throttled */
				2696	if (cfs_rq_throttled(cfs_rq))
				2697	return;
				2698
				2699	/* update runtime allocation */
				2700	account_cfs_rq_runtime(cfs_rq, 0);
				2701	if (cfs_rq->runtime_remaining <= 0)
				2702	throttle_cfs_rq(cfs_rq);
				2703	}
				2704
				2705	/* conditionally throttle active cfs_rq's from put_prev_entity() */
				2706	static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
				2707	{
Paul Turner	56f570e	2011-11-07 20:26:33 -0800	[diff] [blame]	2708	if (!cfs_bandwidth_used())
				2709	return;
				2710
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	2711	if (likely(!cfs_rq->runtime_enabled \|\| cfs_rq->runtime_remaining > 0))
				2712	return;
				2713
				2714	/*
				2715	* it's possible for a throttled entity to be forced into a running
				2716	* state (e.g. set_curr_task), in this case we're finished.
				2717	*/
				2718	if (cfs_rq_throttled(cfs_rq))
				2719	return;
				2720
				2721	throttle_cfs_rq(cfs_rq);
				2722	}
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	2723
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	2724	static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
				2725	{
				2726	struct cfs_bandwidth *cfs_b =
				2727	container_of(timer, struct cfs_bandwidth, slack_timer);
				2728	do_sched_cfs_slack_timer(cfs_b);
				2729
				2730	return HRTIMER_NORESTART;
				2731	}
				2732
				2733	static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
				2734	{
				2735	struct cfs_bandwidth *cfs_b =
				2736	container_of(timer, struct cfs_bandwidth, period_timer);
				2737	ktime_t now;
				2738	int overrun;
				2739	int idle = 0;
				2740
				2741	for (;;) {
				2742	now = hrtimer_cb_get_time(timer);
				2743	overrun = hrtimer_forward(timer, now, cfs_b->period);
				2744
				2745	if (!overrun)
				2746	break;
				2747
				2748	idle = do_sched_cfs_period_timer(cfs_b, overrun);
				2749	}
				2750
				2751	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
				2752	}
				2753
				2754	void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
				2755	{
				2756	raw_spin_lock_init(&cfs_b->lock);
				2757	cfs_b->runtime = 0;
				2758	cfs_b->quota = RUNTIME_INF;
				2759	cfs_b->period = ns_to_ktime(default_cfs_period());
				2760
				2761	INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
				2762	hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
				2763	cfs_b->period_timer.function = sched_cfs_period_timer;
				2764	hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
				2765	cfs_b->slack_timer.function = sched_cfs_slack_timer;
				2766	}
				2767
				2768	static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
				2769	{
				2770	cfs_rq->runtime_enabled = 0;
				2771	INIT_LIST_HEAD(&cfs_rq->throttled_list);
				2772	}
				2773
				2774	/* requires cfs_b->lock, may release to reprogram timer */
				2775	void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
				2776	{
				2777	/*
				2778	* The timer may be active because we're trying to set a new bandwidth
				2779	* period or because we're racing with the tear-down path
				2780	* (timer_active==0 becomes visible before the hrtimer call-back
				2781	* terminates). In either case we ensure that it's re-programmed
				2782	*/
				2783	while (unlikely(hrtimer_active(&cfs_b->period_timer))) {
				2784	raw_spin_unlock(&cfs_b->lock);
				2785	/* ensure cfs_b->lock is available while we wait */
				2786	hrtimer_cancel(&cfs_b->period_timer);
				2787
				2788	raw_spin_lock(&cfs_b->lock);
				2789	/* if someone else restarted the timer then we're done */
				2790	if (cfs_b->timer_active)
				2791	return;
				2792	}
				2793
				2794	cfs_b->timer_active = 1;
				2795	start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
				2796	}
				2797
				2798	static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
				2799	{
				2800	hrtimer_cancel(&cfs_b->period_timer);
				2801	hrtimer_cancel(&cfs_b->slack_timer);
				2802	}
				2803
Arnd Bergmann	38dc334	2013-01-25 14:14:22 +0000	[diff] [blame]	2804	static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	2805	{
				2806	struct cfs_rq *cfs_rq;
				2807
				2808	for_each_leaf_cfs_rq(rq, cfs_rq) {
				2809	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
				2810
				2811	if (!cfs_rq->runtime_enabled)
				2812	continue;
				2813
				2814	/*
				2815	* clock_task is not advancing so we just need to make sure
				2816	* there's some valid quota amount
				2817	*/
				2818	cfs_rq->runtime_remaining = cfs_b->quota;
				2819	if (cfs_rq_throttled(cfs_rq))
				2820	unthrottle_cfs_rq(cfs_rq);
				2821	}
				2822	}
				2823
				2824	#else /* CONFIG_CFS_BANDWIDTH */
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	2825	static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
				2826	{
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	2827	return rq_clock_task(rq_of(cfs_rq));
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	2828	}
				2829
				2830	static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
				2831	unsigned long delta_exec) {}
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	2832	static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
				2833	static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
Peter Zijlstra	6c16a6d	2012-03-21 13:07:16 -0700	[diff] [blame]	2834	static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	2835
				2836	static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
				2837	{
				2838	return 0;
				2839	}
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	2840
				2841	static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
				2842	{
				2843	return 0;
				2844	}
				2845
				2846	static inline int throttled_lb_pair(struct task_group *tg,
				2847	int src_cpu, int dest_cpu)
				2848	{
				2849	return 0;
				2850	}
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	2851
				2852	void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
				2853
				2854	#ifdef CONFIG_FAIR_GROUP_SCHED
				2855	static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
Paul Turner	ab84d31	2011-07-21 09:43:28 -0700	[diff] [blame]	2856	#endif
				2857
/*
 * !CONFIG_CFS_BANDWIDTH stub: there is no bandwidth pool to hand out, so
 * every task group maps to NULL.
 */
static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
{
	return NULL;
}
				2862	static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
Peter Boonstoppel	a4c96ae	2012-08-09 15:34:47 -0700	[diff] [blame]	2863	static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	2864
				2865	#endif /* CONFIG_CFS_BANDWIDTH */
				2866
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2867	/**************************************************
				2868	* CFS operations on tasks:
				2869	*/
				2870
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	2871	#ifdef CONFIG_SCHED_HRTICK
				2872	static void hrtick_start_fair(struct rq rq, struct task_struct p)
				2873	{
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	2874	struct sched_entity *se = &p->se;
				2875	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				2876
				2877	WARN_ON(task_rq(p) != rq);
				2878
Mike Galbraith	b39e66e	2011-11-22 15:20:07 +0100	[diff] [blame]	2879	if (cfs_rq->nr_running > 1) {
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	2880	u64 slice = sched_slice(cfs_rq, se);
				2881	u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
				2882	s64 delta = slice - ran;
				2883
				2884	if (delta < 0) {
				2885	if (rq->curr == p)
				2886	resched_task(p);
				2887	return;
				2888	}
				2889
				2890	/*
				2891	* Don't schedule slices shorter than 10000ns, that just
				2892	* doesn't make sense. Rely on vruntime for fairness.
				2893	*/
Peter Zijlstra	3165651	2008-07-18 18:01:23 +0200	[diff] [blame]	2894	if (rq->curr != p)
Peter Zijlstra	157124c	2008-07-28 11:53:11 +0200	[diff] [blame]	2895	delta = max_t(s64, 10000LL, delta);
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	2896
Peter Zijlstra	3165651	2008-07-18 18:01:23 +0200	[diff] [blame]	2897	hrtick_start(rq, delta);
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	2898	}
				2899	}
Peter Zijlstra	a4c2f00	2008-10-17 19:27:03 +0200	[diff] [blame]	2900
				2901	/*
				2902	* called from enqueue/dequeue and updates the hrtick when the
				2903	* current task is from our class and nr_running is low enough
				2904	* to matter.
				2905	*/
				2906	static void hrtick_update(struct rq *rq)
				2907	{
				2908	struct task_struct *curr = rq->curr;
				2909
Mike Galbraith	b39e66e	2011-11-22 15:20:07 +0100	[diff] [blame]	2910	if (!hrtick_enabled(rq) \|\| curr->sched_class != &fair_sched_class)
Peter Zijlstra	a4c2f00	2008-10-17 19:27:03 +0200	[diff] [blame]	2911	return;
				2912
				2913	if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
				2914	hrtick_start_fair(rq, curr);
				2915	}
Dhaval Giani	55e12e5	2008-06-24 23:39:43 +0530	[diff] [blame]	2916	#else /* !CONFIG_SCHED_HRTICK */
/* !CONFIG_SCHED_HRTICK stub: no high-resolution tick to program. */
static inline void
hrtick_start_fair(struct rq *rq, struct task_struct *p)
{
}
Peter Zijlstra	a4c2f00	2008-10-17 19:27:03 +0200	[diff] [blame]	2921
				2922	static inline void hrtick_update(struct rq *rq)
				2923	{
				2924	}
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	2925	#endif
				2926
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2927	/*
				2928	* The enqueue_task method is called before nr_running is
				2929	* increased. Here we update the fair scheduling stats and
				2930	* then put the task into the rbtree:
				2931	*/
Thomas Gleixner	ea87bb7	2010-01-20 20:58:57 +0000	[diff] [blame]	2932	static void
Peter Zijlstra	371fd7e	2010-03-24 16:38:48 +0100	[diff] [blame]	2933	enqueue_task_fair(struct rq rq, struct task_struct p, int flags)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2934	{
				2935	struct cfs_rq *cfs_rq;
Peter Zijlstra	62fb185	2008-02-25 17:34:02 +0100	[diff] [blame]	2936	struct sched_entity *se = &p->se;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2937
				2938	for_each_sched_entity(se) {
Peter Zijlstra	62fb185	2008-02-25 17:34:02 +0100	[diff] [blame]	2939	if (se->on_rq)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2940	break;
				2941	cfs_rq = cfs_rq_of(se);
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	2942	enqueue_entity(cfs_rq, se, flags);
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	2943
				2944	/*
				2945	* end evaluation on encountering a throttled cfs_rq
				2946	*
				2947	* note: in the case of encountering a throttled cfs_rq we will
				2948	* post the final h_nr_running increment below.
				2949	*/
				2950	if (cfs_rq_throttled(cfs_rq))
				2951	break;
Paul Turner	953bfcd	2011-07-21 09:43:27 -0700	[diff] [blame]	2952	cfs_rq->h_nr_running++;
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	2953
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	2954	flags = ENQUEUE_WAKEUP;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2955	}
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	2956
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	2957	for_each_sched_entity(se) {
Lin Ming	0f31714	2011-07-22 09:14:31 +0800	[diff] [blame]	2958	cfs_rq = cfs_rq_of(se);
Paul Turner	953bfcd	2011-07-21 09:43:27 -0700	[diff] [blame]	2959	cfs_rq->h_nr_running++;
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	2960
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	2961	if (cfs_rq_throttled(cfs_rq))
				2962	break;
				2963
Linus Torvalds	17bc14b	2012-12-14 07:20:43 -0800	[diff] [blame]	2964	update_cfs_shares(cfs_rq);
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	2965	update_entity_load_avg(se, 1);
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	2966	}
				2967
Ben Segall	18bf280	2012-10-04 12:51:20 +0200	[diff] [blame]	2968	if (!se) {
				2969	update_rq_runnable_avg(rq, rq->nr_running);
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	2970	inc_nr_running(rq);
Ben Segall	18bf280	2012-10-04 12:51:20 +0200	[diff] [blame]	2971	}
Peter Zijlstra	a4c2f00	2008-10-17 19:27:03 +0200	[diff] [blame]	2972	hrtick_update(rq);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2973	}
				2974
Venkatesh Pallipadi	2f36825	2011-04-14 10:30:53 -0700	[diff] [blame]	2975	static void set_next_buddy(struct sched_entity *se);
				2976
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2977	/*
				2978	* The dequeue_task method is called before nr_running is
				2979	* decreased. We remove the task from the rbtree and
				2980	* update the fair scheduling stats:
				2981	*/
Peter Zijlstra	371fd7e	2010-03-24 16:38:48 +0100	[diff] [blame]	2982	static void dequeue_task_fair(struct rq rq, struct task_struct p, int flags)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2983	{
				2984	struct cfs_rq *cfs_rq;
Peter Zijlstra	62fb185	2008-02-25 17:34:02 +0100	[diff] [blame]	2985	struct sched_entity *se = &p->se;
Venkatesh Pallipadi	2f36825	2011-04-14 10:30:53 -0700	[diff] [blame]	2986	int task_sleep = flags & DEQUEUE_SLEEP;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2987
				2988	for_each_sched_entity(se) {
				2989	cfs_rq = cfs_rq_of(se);
Peter Zijlstra	371fd7e	2010-03-24 16:38:48 +0100	[diff] [blame]	2990	dequeue_entity(cfs_rq, se, flags);
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	2991
				2992	/*
				2993	* end evaluation on encountering a throttled cfs_rq
				2994	*
				2995	* note: in the case of encountering a throttled cfs_rq we will
				2996	* post the final h_nr_running decrement below.
				2997	*/
				2998	if (cfs_rq_throttled(cfs_rq))
				2999	break;
Paul Turner	953bfcd	2011-07-21 09:43:27 -0700	[diff] [blame]	3000	cfs_rq->h_nr_running--;
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	3001
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3002	/* Don't dequeue parent if it has other entities besides us */
Venkatesh Pallipadi	2f36825	2011-04-14 10:30:53 -0700	[diff] [blame]	3003	if (cfs_rq->load.weight) {
				3004	/*
				3005	* Bias pick_next to pick a task from this cfs_rq, as
				3006	* p is sleeping when it is within its sched_slice.
				3007	*/
				3008	if (task_sleep && parent_entity(se))
				3009	set_next_buddy(parent_entity(se));
Paul Turner	9598c82	2011-07-06 22:30:37 -0700	[diff] [blame]	3010
				3011	/* avoid re-evaluating load for this entity */
				3012	se = parent_entity(se);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3013	break;
Venkatesh Pallipadi	2f36825	2011-04-14 10:30:53 -0700	[diff] [blame]	3014	}
Peter Zijlstra	371fd7e	2010-03-24 16:38:48 +0100	[diff] [blame]	3015	flags \|= DEQUEUE_SLEEP;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3016	}
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	3017
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	3018	for_each_sched_entity(se) {
Lin Ming	0f31714	2011-07-22 09:14:31 +0800	[diff] [blame]	3019	cfs_rq = cfs_rq_of(se);
Paul Turner	953bfcd	2011-07-21 09:43:27 -0700	[diff] [blame]	3020	cfs_rq->h_nr_running--;
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	3021
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	3022	if (cfs_rq_throttled(cfs_rq))
				3023	break;
				3024
Linus Torvalds	17bc14b	2012-12-14 07:20:43 -0800	[diff] [blame]	3025	update_cfs_shares(cfs_rq);
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	3026	update_entity_load_avg(se, 1);
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	3027	}
				3028
Ben Segall	18bf280	2012-10-04 12:51:20 +0200	[diff] [blame]	3029	if (!se) {
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	3030	dec_nr_running(rq);
Ben Segall	18bf280	2012-10-04 12:51:20 +0200	[diff] [blame]	3031	update_rq_runnable_avg(rq, 1);
				3032	}
Peter Zijlstra	a4c2f00	2008-10-17 19:27:03 +0200	[diff] [blame]	3033	hrtick_update(rq);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3034	}
				3035
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	3036	#ifdef CONFIG_SMP
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	3037	/* Used instead of source_load when we know the type == 0 */
				3038	static unsigned long weighted_cpuload(const int cpu)
				3039	{
Alex Shi	b92486c	2013-06-20 10:18:50 +0800	[diff] [blame]	3040	return cpu_rq(cpu)->cfs.runnable_load_avg;
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	3041	}
				3042
				3043	/*
				3044	* Return a low guess at the load of a migration-source cpu weighted
				3045	* according to the scheduling class and "nice" value.
				3046	*
				3047	* We want to under-estimate the load of migration sources, to
				3048	* balance conservatively.
				3049	*/
				3050	static unsigned long source_load(int cpu, int type)
				3051	{
				3052	struct rq *rq = cpu_rq(cpu);
				3053	unsigned long total = weighted_cpuload(cpu);
				3054
				3055	if (type == 0 \|\| !sched_feat(LB_BIAS))
				3056	return total;
				3057
				3058	return min(rq->cpu_load[type-1], total);
				3059	}
				3060
				3061	/*
				3062	* Return a high guess at the load of a migration-target cpu weighted
				3063	* according to the scheduling class and "nice" value.
				3064	*/
				3065	static unsigned long target_load(int cpu, int type)
				3066	{
				3067	struct rq *rq = cpu_rq(cpu);
				3068	unsigned long total = weighted_cpuload(cpu);
				3069
				3070	if (type == 0 \|\| !sched_feat(LB_BIAS))
				3071	return total;
				3072
				3073	return max(rq->cpu_load[type-1], total);
				3074	}
				3075
				3076	static unsigned long power_of(int cpu)
				3077	{
				3078	return cpu_rq(cpu)->cpu_power;
				3079	}
				3080
				3081	static unsigned long cpu_avg_load_per_task(int cpu)
				3082	{
				3083	struct rq *rq = cpu_rq(cpu);
				3084	unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
Alex Shi	b92486c	2013-06-20 10:18:50 +0800	[diff] [blame]	3085	unsigned long load_avg = rq->cfs.runnable_load_avg;
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	3086
				3087	if (nr_running)
Alex Shi	b92486c	2013-06-20 10:18:50 +0800	[diff] [blame]	3088	return load_avg / nr_running;
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	3089
				3090	return 0;
				3091	}
				3092
Michael Wang	6247041	2013-07-04 12:55:51 +0800	[diff] [blame]	3093	static void record_wakee(struct task_struct *p)
				3094	{
				3095	/*
				3096	* Rough decay (wiping) for cost saving, don't worry
				3097	* about the boundary, really active task won't care
				3098	* about the loss.
				3099	*/
				3100	if (jiffies > current->wakee_flip_decay_ts + HZ) {
				3101	current->wakee_flips = 0;
				3102	current->wakee_flip_decay_ts = jiffies;
				3103	}
				3104
				3105	if (current->last_wakee != p) {
				3106	current->last_wakee = p;
				3107	current->wakee_flips++;
				3108	}
				3109	}
Ingo Molnar	098fb9d	2008-03-16 20:36:10 +0100	[diff] [blame]	3110
Peter Zijlstra	74f8e4b	2011-04-05 17:23:47 +0200	[diff] [blame]	3111	static void task_waking_fair(struct task_struct *p)
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	3112	{
				3113	struct sched_entity *se = &p->se;
				3114	struct cfs_rq *cfs_rq = cfs_rq_of(se);
Peter Zijlstra	3fe1698	2011-04-05 17:23:48 +0200	[diff] [blame]	3115	u64 min_vruntime;
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	3116
Peter Zijlstra	3fe1698	2011-04-05 17:23:48 +0200	[diff] [blame]	3117	#ifndef CONFIG_64BIT
				3118	u64 min_vruntime_copy;
Peter Zijlstra	74f8e4b	2011-04-05 17:23:47 +0200	[diff] [blame]	3119
Peter Zijlstra	3fe1698	2011-04-05 17:23:48 +0200	[diff] [blame]	3120	do {
				3121	min_vruntime_copy = cfs_rq->min_vruntime_copy;
				3122	smp_rmb();
				3123	min_vruntime = cfs_rq->min_vruntime;
				3124	} while (min_vruntime != min_vruntime_copy);
				3125	#else
				3126	min_vruntime = cfs_rq->min_vruntime;
				3127	#endif
				3128
				3129	se->vruntime -= min_vruntime;
Michael Wang	6247041	2013-07-04 12:55:51 +0800	[diff] [blame]	3130	record_wakee(p);
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	3131	}
				3132
Peter Zijlstra	bb3469a	2008-06-27 13:41:27 +0200	[diff] [blame]	3133	#ifdef CONFIG_FAIR_GROUP_SCHED
Peter Zijlstra	f5bfb7d	2008-06-27 13:41:39 +0200	[diff] [blame]	3134	/*
				3135	* effective_load() calculates the load change as seen from the root_task_group
				3136	*
				3137	* Adding load to a group doesn't make a group heavier, but can cause movement
				3138	* of group shares between cpus. Assuming the shares were perfectly aligned one
				3139	* can calculate the shift in shares.
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	3140	*
				3141	* Calculate the effective load difference if @wl is added (subtracted) to @tg
				3142	* on this @cpu and results in a total addition (subtraction) of @wg to the
				3143	* total group weight.
				3144	*
				3145	* Given a runqueue weight distribution (rw_i) we can compute a shares
				3146	* distribution (s_i) using:
				3147	*
				3148	* s_i = rw_i / \Sum rw_j (1)
				3149	*
				3150	* Suppose we have 4 CPUs and our @tg is a direct child of the root group and
				3151	* has 7 equal weight tasks, distributed as below (rw_i), with the resulting
				3152	* shares distribution (s_i):
				3153	*
				3154	* rw_i = { 2, 4, 1, 0 }
				3155	* s_i = { 2/7, 4/7, 1/7, 0 }
				3156	*
				3157	* As per wake_affine() we're interested in the load of two CPUs (the CPU the
				3158	* task used to run on and the CPU the waker is running on), we need to
				3159	* compute the effect of waking a task on either CPU and, in case of a sync
				3160	* wakeup, compute the effect of the current task going to sleep.
				3161	*
				3162	* So for a change of @wl to the local @cpu with an overall group weight change
				3163	* of @wg we can compute the new shares distribution (s'_i) using:
				3164	*
				3165	* s'_i = (rw_i + @wl) / (@wg + \Sum rw_j) (2)
				3166	*
				3167	* Suppose we're interested in CPUs 0 and 1, and want to compute the load
				3168	* differences in waking a task to CPU 0. The additional task changes the
				3169	* weight and shares distributions like:
				3170	*
				3171	* rw'_i = { 3, 4, 1, 0 }
				3172	* s'_i = { 3/8, 4/8, 1/8, 0 }
				3173	*
				3174	* We can then compute the difference in effective weight by using:
				3175	*
				3176	* dw_i = S * (s'_i - s_i) (3)
				3177	*
				3178	* Where 'S' is the group weight as seen by its parent.
				3179	*
				3180	* Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
				3181	* times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
				3182	* 4/7) times the weight of the group.
Peter Zijlstra	f5bfb7d	2008-06-27 13:41:39 +0200	[diff] [blame]	3183	*/
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	3184	static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
Peter Zijlstra	bb3469a	2008-06-27 13:41:27 +0200	[diff] [blame]	3185	{
Peter Zijlstra	4be9daa	2008-06-27 13:41:30 +0200	[diff] [blame]	3186	struct sched_entity *se = tg->se[cpu];
Peter Zijlstra	f1d239f	2008-06-27 13:41:38 +0200	[diff] [blame]	3187
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	3188	if (!tg->parent) /* the trivial, non-cgroup case */
Peter Zijlstra	f1d239f	2008-06-27 13:41:38 +0200	[diff] [blame]	3189	return wl;
				3190
Peter Zijlstra	4be9daa	2008-06-27 13:41:30 +0200	[diff] [blame]	3191	for_each_sched_entity(se) {
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	3192	long w, W;
Peter Zijlstra	bb3469a	2008-06-27 13:41:27 +0200	[diff] [blame]	3193
Paul Turner	977dda7	2011-01-14 17:57:50 -0800	[diff] [blame]	3194	tg = se->my_q->tg;
Peter Zijlstra	4be9daa	2008-06-27 13:41:30 +0200	[diff] [blame]	3195
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	3196	/*
				3197	* W = @wg + \Sum rw_j
				3198	*/
				3199	W = wg + calc_tg_weight(tg, se->my_q);
Peter Zijlstra	4be9daa	2008-06-27 13:41:30 +0200	[diff] [blame]	3200
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	3201	/*
				3202	* w = rw_i + @wl
				3203	*/
				3204	w = se->my_q->load.weight + wl;
Peter Zijlstra	940959e	2008-09-23 15:33:42 +0200	[diff] [blame]	3205
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	3206	/*
				3207	* wl = S * s'_i; see (2)
				3208	*/
				3209	if (W > 0 && w < W)
				3210	wl = (w * tg->shares) / W;
Paul Turner	977dda7	2011-01-14 17:57:50 -0800	[diff] [blame]	3211	else
				3212	wl = tg->shares;
Peter Zijlstra	940959e	2008-09-23 15:33:42 +0200	[diff] [blame]	3213
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	3214	/*
				3215	* Per the above, wl is the new se->load.weight value; since
				3216	* those are clipped to [MIN_SHARES, ...) do so now. See
				3217	* calc_cfs_shares().
				3218	*/
Paul Turner	977dda7	2011-01-14 17:57:50 -0800	[diff] [blame]	3219	if (wl < MIN_SHARES)
				3220	wl = MIN_SHARES;
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	3221
				3222	/*
				3223	* wl = dw_i = S * (s'_i - s_i); see (3)
				3224	*/
Paul Turner	977dda7	2011-01-14 17:57:50 -0800	[diff] [blame]	3225	wl -= se->load.weight;
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	3226
				3227	/*
				3228	* Recursively apply this logic to all parent groups to compute
				3229	* the final effective load change on the root group. Since
				3230	* only the @tg group gets extra weight, all parent groups can
				3231	* only redistribute existing shares. @wl is the shift in shares
				3232	* resulting from this level per the above.
				3233	*/
Peter Zijlstra	4be9daa	2008-06-27 13:41:30 +0200	[diff] [blame]	3234	wg = 0;
Peter Zijlstra	4be9daa	2008-06-27 13:41:30 +0200	[diff] [blame]	3235	}
				3236
				3237	return wl;
Peter Zijlstra	bb3469a	2008-06-27 13:41:27 +0200	[diff] [blame]	3238	}
				3239	#else
Peter Zijlstra	4be9daa	2008-06-27 13:41:30 +0200	[diff] [blame]	3240
Peter Zijlstra	8337826	2008-06-27 13:41:37 +0200	[diff] [blame]	3241	static inline unsigned long effective_load(struct task_group *tg, int cpu,
				3242	unsigned long wl, unsigned long wg)
Peter Zijlstra	4be9daa	2008-06-27 13:41:30 +0200	[diff] [blame]	3243	{
Peter Zijlstra	8337826	2008-06-27 13:41:37 +0200	[diff] [blame]	3244	return wl;
Peter Zijlstra	bb3469a	2008-06-27 13:41:27 +0200	[diff] [blame]	3245	}
Peter Zijlstra	4be9daa	2008-06-27 13:41:30 +0200	[diff] [blame]	3246
Peter Zijlstra	bb3469a	2008-06-27 13:41:27 +0200	[diff] [blame]	3247	#endif
				3248
Michael Wang	6247041	2013-07-04 12:55:51 +0800	[diff] [blame]	3249	static int wake_wide(struct task_struct *p)
				3250	{
Peter Zijlstra	7d9ffa8	2013-07-04 12:56:46 +0800	[diff] [blame]	3251	int factor = this_cpu_read(sd_llc_size);
Michael Wang	6247041	2013-07-04 12:55:51 +0800	[diff] [blame]	3252
				3253	/*
				3254	* Yeah, it's the switching-frequency, could means many wakee or
				3255	* rapidly switch, use factor here will just help to automatically
				3256	* adjust the loose-degree, so bigger node will lead to more pull.
				3257	*/
				3258	if (p->wakee_flips > factor) {
				3259	/*
				3260	* wakee is somewhat hot, it needs certain amount of cpu
				3261	* resource, so if waker is far more hot, prefer to leave
				3262	* it alone.
				3263	*/
				3264	if (current->wakee_flips > (factor * p->wakee_flips))
				3265	return 1;
				3266	}
				3267
				3268	return 0;
				3269	}
				3270
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	3271	static int wake_affine(struct sched_domain sd, struct task_struct p, int sync)
Ingo Molnar	098fb9d	2008-03-16 20:36:10 +0100	[diff] [blame]	3272	{
Paul Turner	e37b6a7	2011-01-21 20:44:59 -0800	[diff] [blame]	3273	s64 this_load, load;
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	3274	int idx, this_cpu, prev_cpu;
Ingo Molnar	098fb9d	2008-03-16 20:36:10 +0100	[diff] [blame]	3275	unsigned long tl_per_task;
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	3276	struct task_group *tg;
Peter Zijlstra	8337826	2008-06-27 13:41:37 +0200	[diff] [blame]	3277	unsigned long weight;
Mike Galbraith	b3137bc	2008-05-29 11:11:41 +0200	[diff] [blame]	3278	int balanced;
Ingo Molnar	098fb9d	2008-03-16 20:36:10 +0100	[diff] [blame]	3279
Michael Wang	6247041	2013-07-04 12:55:51 +0800	[diff] [blame]	3280	/*
				3281	* If we wake multiple tasks be careful to not bounce
				3282	* ourselves around too much.
				3283	*/
				3284	if (wake_wide(p))
				3285	return 0;
				3286
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	3287	idx = sd->wake_idx;
				3288	this_cpu = smp_processor_id();
				3289	prev_cpu = task_cpu(p);
				3290	load = source_load(prev_cpu, idx);
				3291	this_load = target_load(this_cpu, idx);
Ingo Molnar	098fb9d	2008-03-16 20:36:10 +0100	[diff] [blame]	3292
				3293	/*
Ingo Molnar	098fb9d	2008-03-16 20:36:10 +0100	[diff] [blame]	3294	* If sync wakeup then subtract the (maximum possible)
				3295	* effect of the currently running task from the load
				3296	* of the current CPU:
				3297	*/
Peter Zijlstra	8337826	2008-06-27 13:41:37 +0200	[diff] [blame]	3298	if (sync) {
				3299	tg = task_group(current);
				3300	weight = current->se.load.weight;
Ingo Molnar	098fb9d	2008-03-16 20:36:10 +0100	[diff] [blame]	3301
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	3302	this_load += effective_load(tg, this_cpu, -weight, -weight);
Peter Zijlstra	8337826	2008-06-27 13:41:37 +0200	[diff] [blame]	3303	load += effective_load(tg, prev_cpu, 0, -weight);
				3304	}
				3305
				3306	tg = task_group(p);
				3307	weight = p->se.load.weight;
				3308
Peter Zijlstra	71a29aa	2009-09-07 18:28:05 +0200	[diff] [blame]	3309	/*
				3310	* In low-load situations, where prev_cpu is idle and this_cpu is idle
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	3311	* due to the sync cause above having dropped this_load to 0, we'll
				3312	* always have an imbalance, but there's really nothing you can do
				3313	* about that, so that's good too.
Peter Zijlstra	71a29aa	2009-09-07 18:28:05 +0200	[diff] [blame]	3314	*
				3315	* Otherwise check if either cpus are near enough in load to allow this
				3316	* task to be woken on this_cpu.
				3317	*/
Paul Turner	e37b6a7	2011-01-21 20:44:59 -0800	[diff] [blame]	3318	if (this_load > 0) {
				3319	s64 this_eff_load, prev_eff_load;
Peter Zijlstra	e51fd5e	2010-05-31 12:37:30 +0200	[diff] [blame]	3320
				3321	this_eff_load = 100;
				3322	this_eff_load *= power_of(prev_cpu);
				3323	this_eff_load *= this_load +
				3324	effective_load(tg, this_cpu, weight, weight);
				3325
				3326	prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
				3327	prev_eff_load *= power_of(this_cpu);
				3328	prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
				3329
				3330	balanced = this_eff_load <= prev_eff_load;
				3331	} else
				3332	balanced = true;
Mike Galbraith	b3137bc	2008-05-29 11:11:41 +0200	[diff] [blame]	3333
				3334	/*
				3335	* If the currently running task will sleep within
				3336	* a reasonable amount of time then attract this newly
				3337	* woken task:
				3338	*/
Peter Zijlstra	2fb7635	2008-10-08 09:16:04 +0200	[diff] [blame]	3339	if (sync && balanced)
				3340	return 1;
Mike Galbraith	b3137bc	2008-05-29 11:11:41 +0200	[diff] [blame]	3341
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	3342	schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
Mike Galbraith	b3137bc	2008-05-29 11:11:41 +0200	[diff] [blame]	3343	tl_per_task = cpu_avg_load_per_task(this_cpu);
				3344
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	3345	if (balanced \|\|
				3346	(this_load <= load &&
				3347	this_load + target_load(prev_cpu, idx) <= tl_per_task)) {
Ingo Molnar	098fb9d	2008-03-16 20:36:10 +0100	[diff] [blame]	3348	/*
				3349	* This domain has SD_WAKE_AFFINE and
				3350	* p is cache cold in this domain, and
				3351	* there is no bad imbalance.
				3352	*/
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	3353	schedstat_inc(sd, ttwu_move_affine);
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	3354	schedstat_inc(p, se.statistics.nr_wakeups_affine);
Ingo Molnar	098fb9d	2008-03-16 20:36:10 +0100	[diff] [blame]	3355
				3356	return 1;
				3357	}
				3358	return 0;
				3359	}
				3360
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3361	/*
				3362	* find_idlest_group finds and returns the least busy CPU group within the
				3363	* domain.
				3364	*/
				3365	static struct sched_group *
Peter Zijlstra	78e7ed5	2009-09-03 13:16:51 +0200	[diff] [blame]	3366	find_idlest_group(struct sched_domain sd, struct task_struct p,
Peter Zijlstra	5158f4e	2009-09-16 13:46:59 +0200	[diff] [blame]	3367	int this_cpu, int load_idx)
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	3368	{
Andi Kleen	b3bd3de	2010-08-10 14:17:51 -0700	[diff] [blame]	3369	struct sched_group idlest = NULL, group = sd->groups;
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3370	unsigned long min_load = ULONG_MAX, this_load = 0;
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3371	int imbalance = 100 + (sd->imbalance_pct-100)/2;
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	3372
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3373	do {
				3374	unsigned long load, avg_load;
				3375	int local_group;
				3376	int i;
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	3377
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3378	/* Skip over this group if it has no CPUs allowed */
				3379	if (!cpumask_intersects(sched_group_cpus(group),
Peter Zijlstra	fa17b50	2011-06-16 12:23:22 +0200	[diff] [blame]	3380	tsk_cpus_allowed(p)))
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3381	continue;
				3382
				3383	local_group = cpumask_test_cpu(this_cpu,
				3384	sched_group_cpus(group));
				3385
				3386	/* Tally up the load of all CPUs in the group */
				3387	avg_load = 0;
				3388
				3389	for_each_cpu(i, sched_group_cpus(group)) {
				3390	/* Bias balancing toward cpus of our domain */
				3391	if (local_group)
				3392	load = source_load(i, load_idx);
				3393	else
				3394	load = target_load(i, load_idx);
				3395
				3396	avg_load += load;
				3397	}
				3398
				3399	/* Adjust by relative CPU power of the group */
Peter Zijlstra	9c3f75c	2011-07-14 13:00:06 +0200	[diff] [blame]	3400	avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power;
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3401
				3402	if (local_group) {
				3403	this_load = avg_load;
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3404	} else if (avg_load < min_load) {
				3405	min_load = avg_load;
				3406	idlest = group;
				3407	}
				3408	} while (group = group->next, group != sd->groups);
				3409
				3410	if (!idlest \|\| 100this_load < imbalancemin_load)
				3411	return NULL;
				3412	return idlest;
				3413	}
				3414
				3415	/*
				3416	* find_idlest_cpu - find the idlest cpu among the cpus in group.
				3417	*/
				3418	static int
				3419	find_idlest_cpu(struct sched_group group, struct task_struct p, int this_cpu)
				3420	{
				3421	unsigned long load, min_load = ULONG_MAX;
				3422	int idlest = -1;
				3423	int i;
				3424
				3425	/* Traverse only the allowed CPUs */
Peter Zijlstra	fa17b50	2011-06-16 12:23:22 +0200	[diff] [blame]	3426	for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3427	load = weighted_cpuload(i);
				3428
				3429	if (load < min_load \|\| (load == min_load && i == this_cpu)) {
				3430	min_load = load;
				3431	idlest = i;
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	3432	}
				3433	}
				3434
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3435	return idlest;
				3436	}
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	3437
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3438	/*
Peter Zijlstra	a50bde5	2009-11-12 15:55:28 +0100	[diff] [blame]	3439	* Try and locate an idle CPU in the sched_domain.
				3440	*/
Suresh Siddha	99bd5e2	2010-03-31 16:47:45 -0700	[diff] [blame]	3441	static int select_idle_sibling(struct task_struct *p, int target)
Peter Zijlstra	a50bde5	2009-11-12 15:55:28 +0100	[diff] [blame]	3442	{
Suresh Siddha	99bd5e2	2010-03-31 16:47:45 -0700	[diff] [blame]	3443	struct sched_domain *sd;
Linus Torvalds	37407ea	2012-09-16 12:29:43 -0700	[diff] [blame]	3444	struct sched_group *sg;
Mike Galbraith	e0a79f5	2013-01-28 12:19:25 +0100	[diff] [blame]	3445	int i = task_cpu(p);
				3446
				3447	if (idle_cpu(target))
				3448	return target;
Peter Zijlstra	a50bde5	2009-11-12 15:55:28 +0100	[diff] [blame]	3449
				3450	/*
Mike Galbraith	e0a79f5	2013-01-28 12:19:25 +0100	[diff] [blame]	3451	* If the prevous cpu is cache affine and idle, don't be stupid.
Peter Zijlstra	a50bde5	2009-11-12 15:55:28 +0100	[diff] [blame]	3452	*/
Mike Galbraith	e0a79f5	2013-01-28 12:19:25 +0100	[diff] [blame]	3453	if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
				3454	return i;
Peter Zijlstra	a50bde5	2009-11-12 15:55:28 +0100	[diff] [blame]	3455
				3456	/*
Linus Torvalds	37407ea	2012-09-16 12:29:43 -0700	[diff] [blame]	3457	* Otherwise, iterate the domains and find an elegible idle cpu.
Peter Zijlstra	a50bde5	2009-11-12 15:55:28 +0100	[diff] [blame]	3458	*/
Peter Zijlstra	518cd62	2011-12-07 15:07:31 +0100	[diff] [blame]	3459	sd = rcu_dereference(per_cpu(sd_llc, target));
Suresh Siddha	77e8136	2011-11-17 11:08:23 -0800	[diff] [blame]	3460	for_each_lower_domain(sd) {
Linus Torvalds	37407ea	2012-09-16 12:29:43 -0700	[diff] [blame]	3461	sg = sd->groups;
				3462	do {
				3463	if (!cpumask_intersects(sched_group_cpus(sg),
				3464	tsk_cpus_allowed(p)))
				3465	goto next;
Mike Galbraith	970e178	2012-06-12 05:18:32 +0200	[diff] [blame]	3466
Linus Torvalds	37407ea	2012-09-16 12:29:43 -0700	[diff] [blame]	3467	for_each_cpu(i, sched_group_cpus(sg)) {
Mike Galbraith	e0a79f5	2013-01-28 12:19:25 +0100	[diff] [blame]	3468	if (i == target \|\| !idle_cpu(i))
Linus Torvalds	37407ea	2012-09-16 12:29:43 -0700	[diff] [blame]	3469	goto next;
				3470	}
				3471
				3472	target = cpumask_first_and(sched_group_cpus(sg),
				3473	tsk_cpus_allowed(p));
				3474	goto done;
				3475	next:
				3476	sg = sg->next;
				3477	} while (sg != sd->groups);
				3478	}
				3479	done:
Peter Zijlstra	a50bde5	2009-11-12 15:55:28 +0100	[diff] [blame]	3480	return target;
				3481	}
				3482
				3483	/*
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3484	* sched_balance_self: balance the current task (running on cpu) in domains
				3485	* that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
				3486	* SD_BALANCE_EXEC.
				3487	*
				3488	* Balance, ie. select the least loaded group.
				3489	*
				3490	* Returns the target CPU number, or the same CPU if no balancing is needed.
				3491	*
				3492	* preempt must be disabled.
				3493	*/
Peter Zijlstra	0017d73	2010-03-24 18:34:10 +0100	[diff] [blame]	3494	static int
Peter Zijlstra	7608dec	2011-04-05 17:23:46 +0200	[diff] [blame]	3495	select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3496	{
Peter Zijlstra	29cd8ba	2009-09-17 09:01:14 +0200	[diff] [blame]	3497	struct sched_domain tmp, affine_sd = NULL, *sd = NULL;
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	3498	int cpu = smp_processor_id();
				3499	int prev_cpu = task_cpu(p);
				3500	int new_cpu = cpu;
Suresh Siddha	99bd5e2	2010-03-31 16:47:45 -0700	[diff] [blame]	3501	int want_affine = 0;
Peter Zijlstra	5158f4e	2009-09-16 13:46:59 +0200	[diff] [blame]	3502	int sync = wake_flags & WF_SYNC;
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	3503
Peter Zijlstra	29baa74	2012-04-23 12:11:21 +0200	[diff] [blame]	3504	if (p->nr_cpus_allowed == 1)
Mike Galbraith	76854c7	2011-11-22 15:18:24 +0100	[diff] [blame]	3505	return prev_cpu;
				3506
Peter Zijlstra	0763a66	2009-09-14 19:37:39 +0200	[diff] [blame]	3507	if (sd_flag & SD_BALANCE_WAKE) {
Peter Zijlstra	fa17b50	2011-06-16 12:23:22 +0200	[diff] [blame]	3508	if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	3509	want_affine = 1;
				3510	new_cpu = prev_cpu;
				3511	}
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	3512
Peter Zijlstra	dce840a	2011-04-07 14:09:50 +0200	[diff] [blame]	3513	rcu_read_lock();
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3514	for_each_domain(cpu, tmp) {
Peter Zijlstra	e4f4288	2009-12-16 18:04:34 +0100	[diff] [blame]	3515	if (!(tmp->flags & SD_LOAD_BALANCE))
				3516	continue;
				3517
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3518	/*
Suresh Siddha	99bd5e2	2010-03-31 16:47:45 -0700	[diff] [blame]	3519	* If both cpu and prev_cpu are part of this domain,
				3520	* cpu is a valid SD_WAKE_AFFINE target.
Peter Zijlstra	fe3bcfe	2009-11-12 15:55:29 +0100	[diff] [blame]	3521	*/
Suresh Siddha	99bd5e2	2010-03-31 16:47:45 -0700	[diff] [blame]	3522	if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
				3523	cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
				3524	affine_sd = tmp;
Alex Shi	f03542a	2012-07-26 08:55:34 +0800	[diff] [blame]	3525	break;
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	3526	}
				3527
Alex Shi	f03542a	2012-07-26 08:55:34 +0800	[diff] [blame]	3528	if (tmp->flags & sd_flag)
Peter Zijlstra	29cd8ba	2009-09-17 09:01:14 +0200	[diff] [blame]	3529	sd = tmp;
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	3530	}
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3531
Mike Galbraith	8b911ac	2010-03-11 17:17:16 +0100	[diff] [blame]	3532	if (affine_sd) {
Alex Shi	f03542a	2012-07-26 08:55:34 +0800	[diff] [blame]	3533	if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
Peter Zijlstra	dce840a	2011-04-07 14:09:50 +0200	[diff] [blame]	3534	prev_cpu = cpu;
				3535
				3536	new_cpu = select_idle_sibling(p, prev_cpu);
				3537	goto unlock;
Mike Galbraith	8b911ac	2010-03-11 17:17:16 +0100	[diff] [blame]	3538	}
Peter Zijlstra	3b64089	2009-09-16 13:44:33 +0200	[diff] [blame]	3539
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3540	while (sd) {
Peter Zijlstra	5158f4e	2009-09-16 13:46:59 +0200	[diff] [blame]	3541	int load_idx = sd->forkexec_idx;
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3542	struct sched_group *group;
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	3543	int weight;
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3544
Peter Zijlstra	0763a66	2009-09-14 19:37:39 +0200	[diff] [blame]	3545	if (!(sd->flags & sd_flag)) {
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3546	sd = sd->child;
				3547	continue;
				3548	}
				3549
Peter Zijlstra	5158f4e	2009-09-16 13:46:59 +0200	[diff] [blame]	3550	if (sd_flag & SD_BALANCE_WAKE)
				3551	load_idx = sd->wake_idx;
				3552
				3553	group = find_idlest_group(sd, p, cpu, load_idx);
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3554	if (!group) {
				3555	sd = sd->child;
				3556	continue;
				3557	}
				3558
Peter Zijlstra	d7c33c4	2009-09-11 12:45:38 +0200	[diff] [blame]	3559	new_cpu = find_idlest_cpu(group, p, cpu);
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3560	if (new_cpu == -1 \|\| new_cpu == cpu) {
				3561	/* Now try balancing at a lower domain level of cpu */
				3562	sd = sd->child;
				3563	continue;
				3564	}
				3565
				3566	/* Now try balancing at a lower domain level of new_cpu */
				3567	cpu = new_cpu;
Peter Zijlstra	669c55e	2010-04-16 14:59:29 +0200	[diff] [blame]	3568	weight = sd->span_weight;
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3569	sd = NULL;
				3570	for_each_domain(cpu, tmp) {
Peter Zijlstra	669c55e	2010-04-16 14:59:29 +0200	[diff] [blame]	3571	if (weight <= tmp->span_weight)
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3572	break;
Peter Zijlstra	0763a66	2009-09-14 19:37:39 +0200	[diff] [blame]	3573	if (tmp->flags & sd_flag)
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3574	sd = tmp;
				3575	}
				3576	/* while loop will break here if sd == NULL */
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	3577	}
Peter Zijlstra	dce840a	2011-04-07 14:09:50 +0200	[diff] [blame]	3578	unlock:
				3579	rcu_read_unlock();
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	3580
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	3581	return new_cpu;
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	3582	}
Paul Turner	0a74bef	2012-10-04 13:18:30 +0200	[diff] [blame]	3583
				3584	/*
				3585	* Called immediately before a task is migrated to a new cpu; task_cpu(p) and
				3586	* cfs_rq_of(p) references at time of call are still valid and identify the
				3587	* previous cpu. However, the caller only guarantees p->pi_lock is held; no
				3588	* other assumptions, including the state of rq->lock, should be made.
				3589	*/
				3590	static void
				3591	migrate_task_rq_fair(struct task_struct *p, int next_cpu)
				3592	{
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	3593	struct sched_entity *se = &p->se;
				3594	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				3595
				3596	/*
				3597	* Load tracking: accumulate removed load so that it can be processed
				3598	* when we next update owning cfs_rq under rq->lock. Tasks contribute
				3599	* to blocked load iff they have a positive decay-count. It can never
				3600	* be negative here since on-rq tasks have decay-count == 0.
				3601	*/
				3602	if (se->avg.decay_count) {
				3603	se->avg.decay_count = -__synchronize_entity_decay(se);
Alex Shi	2509940	2013-06-20 10:18:55 +0800	[diff] [blame]	3604	atomic_long_add(se->avg.load_avg_contrib,
				3605	&cfs_rq->removed_load);
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	3606	}
Paul Turner	0a74bef	2012-10-04 13:18:30 +0200	[diff] [blame]	3607	}
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	3608	#endif /* CONFIG_SMP */
				3609
Peter Zijlstra	e52fb7c	2009-01-14 12:39:19 +0100	[diff] [blame]	3610	static unsigned long
				3611	wakeup_gran(struct sched_entity curr, struct sched_entity se)
Peter Zijlstra	0bbd333	2008-04-19 19:44:57 +0200	[diff] [blame]	3612	{
				3613	unsigned long gran = sysctl_sched_wakeup_granularity;
				3614
				3615	/*
Peter Zijlstra	e52fb7c	2009-01-14 12:39:19 +0100	[diff] [blame]	3616	* Since its curr running now, convert the gran from real-time
				3617	* to virtual-time in his units.
Mike Galbraith	13814d4	2010-03-11 17:17:04 +0100	[diff] [blame]	3618	*
				3619	* By using 'se' instead of 'curr' we penalize light tasks, so
				3620	* they get preempted easier. That is, if 'se' < 'curr' then
				3621	* the resulting gran will be larger, therefore penalizing the
				3622	* lighter, if otoh 'se' > 'curr' then the resulting gran will
				3623	* be smaller, again penalizing the lighter task.
				3624	*
				3625	* This is especially important for buddies when the leftmost
				3626	* task is higher priority than the buddy.
Peter Zijlstra	0bbd333	2008-04-19 19:44:57 +0200	[diff] [blame]	3627	*/
Shaohua Li	f4ad9bd	2011-04-08 12:53:09 +0800	[diff] [blame]	3628	return calc_delta_fair(gran, se);
Peter Zijlstra	0bbd333	2008-04-19 19:44:57 +0200	[diff] [blame]	3629	}
				3630
				3631	/*
Peter Zijlstra	464b752	2008-10-24 11:06:15 +0200	[diff] [blame]	3632	* Should 'se' preempt 'curr'.
				3633	*
				3634	* \|s1
				3635	* \|s2
				3636	* \|s3
				3637	* g
				3638	* \|<--->\|c
				3639	*
				3640	* w(c, s1) = -1
				3641	* w(c, s2) = 0
				3642	* w(c, s3) = 1
				3643	*
				3644	*/
				3645	static int
				3646	wakeup_preempt_entity(struct sched_entity curr, struct sched_entity se)
				3647	{
				3648	s64 gran, vdiff = curr->vruntime - se->vruntime;
				3649
				3650	if (vdiff <= 0)
				3651	return -1;
				3652
Peter Zijlstra	e52fb7c	2009-01-14 12:39:19 +0100	[diff] [blame]	3653	gran = wakeup_gran(curr, se);
Peter Zijlstra	464b752	2008-10-24 11:06:15 +0200	[diff] [blame]	3654	if (vdiff > gran)
				3655	return 1;
				3656
				3657	return 0;
				3658	}
				3659
Peter Zijlstra	0247909	2008-11-04 21:25:10 +0100	[diff] [blame]	3660	static void set_last_buddy(struct sched_entity *se)
				3661	{
Venkatesh Pallipadi	69c80f3	2011-04-13 18:21:09 -0700	[diff] [blame]	3662	if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
				3663	return;
				3664
				3665	for_each_sched_entity(se)
				3666	cfs_rq_of(se)->last = se;
Peter Zijlstra	0247909	2008-11-04 21:25:10 +0100	[diff] [blame]	3667	}
				3668
				3669	static void set_next_buddy(struct sched_entity *se)
				3670	{
Venkatesh Pallipadi	69c80f3	2011-04-13 18:21:09 -0700	[diff] [blame]	3671	if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
				3672	return;
				3673
				3674	for_each_sched_entity(se)
				3675	cfs_rq_of(se)->next = se;
Peter Zijlstra	0247909	2008-11-04 21:25:10 +0100	[diff] [blame]	3676	}
				3677
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	3678	static void set_skip_buddy(struct sched_entity *se)
				3679	{
Venkatesh Pallipadi	69c80f3	2011-04-13 18:21:09 -0700	[diff] [blame]	3680	for_each_sched_entity(se)
				3681	cfs_rq_of(se)->skip = se;
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	3682	}
				3683
Peter Zijlstra	464b752	2008-10-24 11:06:15 +0200	[diff] [blame]	3684	/*
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3685	* Preempt the current task with a newly woken task if needed:
				3686	*/
Peter Zijlstra	5a9b86f	2009-09-16 13:47:58 +0200	[diff] [blame]	3687	static void check_preempt_wakeup(struct rq rq, struct task_struct p, int wake_flags)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3688	{
				3689	struct task_struct *curr = rq->curr;
Srivatsa Vaddagiri	8651a86	2007-10-15 17:00:12 +0200	[diff] [blame]	3690	struct sched_entity se = &curr->se, pse = &p->se;
Mike Galbraith	03e89e4	2008-12-16 08:45:30 +0100	[diff] [blame]	3691	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
Mike Galbraith	f685cea	2009-10-23 23:09:22 +0200	[diff] [blame]	3692	int scale = cfs_rq->nr_running >= sched_nr_latency;
Venkatesh Pallipadi	2f36825	2011-04-14 10:30:53 -0700	[diff] [blame]	3693	int next_buddy_marked = 0;
Mike Galbraith	03e89e4	2008-12-16 08:45:30 +0100	[diff] [blame]	3694
Ingo Molnar	4ae7d5c	2008-03-19 01:42:00 +0100	[diff] [blame]	3695	if (unlikely(se == pse))
				3696	return;
				3697
Paul Turner	5238cdd	2011-07-21 09:43:37 -0700	[diff] [blame]	3698	/*
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	3699	* This is possible from callers such as move_task(), in which we
Paul Turner	5238cdd	2011-07-21 09:43:37 -0700	[diff] [blame]	3700	* unconditionally check_prempt_curr() after an enqueue (which may have
				3701	* lead to a throttle). This both saves work and prevents false
				3702	* next-buddy nomination below.
				3703	*/
				3704	if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
				3705	return;
				3706
Venkatesh Pallipadi	2f36825	2011-04-14 10:30:53 -0700	[diff] [blame]	3707	if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
Mike Galbraith	3cb63d5	2009-09-11 12:01:17 +0200	[diff] [blame]	3708	set_next_buddy(pse);
Venkatesh Pallipadi	2f36825	2011-04-14 10:30:53 -0700	[diff] [blame]	3709	next_buddy_marked = 1;
				3710	}
Peter Zijlstra	57fdc26	2008-09-23 15:33:45 +0200	[diff] [blame]	3711
Bharata B Rao	aec0a51	2008-08-28 14:42:49 +0530	[diff] [blame]	3712	/*
				3713	* We can come here with TIF_NEED_RESCHED already set from new task
				3714	* wake up path.
Paul Turner	5238cdd	2011-07-21 09:43:37 -0700	[diff] [blame]	3715	*
				3716	* Note: this also catches the edge-case of curr being in a throttled
				3717	* group (e.g. via set_curr_task), since update_curr() (in the
				3718	* enqueue of curr) will have resulted in resched being set. This
				3719	* prevents us from potentially nominating it as a false LAST_BUDDY
				3720	* below.
Bharata B Rao	aec0a51	2008-08-28 14:42:49 +0530	[diff] [blame]	3721	*/
				3722	if (test_tsk_need_resched(curr))
				3723	return;
				3724
Darren Hart	a2f5c9a	2011-02-22 13:04:33 -0800	[diff] [blame]	3725	/* Idle tasks are by definition preempted by non-idle tasks. */
				3726	if (unlikely(curr->policy == SCHED_IDLE) &&
				3727	likely(p->policy != SCHED_IDLE))
				3728	goto preempt;
				3729
Ingo Molnar	91c234b	2007-10-15 17:00:18 +0200	[diff] [blame]	3730	/*
Darren Hart	a2f5c9a	2011-02-22 13:04:33 -0800	[diff] [blame]	3731	* Batch and idle tasks do not preempt non-idle tasks (their preemption
				3732	* is driven by the tick):
Ingo Molnar	91c234b	2007-10-15 17:00:18 +0200	[diff] [blame]	3733	*/
Ingo Molnar	8ed92e5	2012-10-14 14:28:50 +0200	[diff] [blame]	3734	if (unlikely(p->policy != SCHED_NORMAL) \|\| !sched_feat(WAKEUP_PREEMPTION))
Ingo Molnar	91c234b	2007-10-15 17:00:18 +0200	[diff] [blame]	3735	return;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3736
Peter Zijlstra	3a7e73a	2009-11-28 18:51:02 +0100	[diff] [blame]	3737	find_matching_se(&se, &pse);
Paul Turner	9bbd737	2011-07-05 19:07:21 -0700	[diff] [blame]	3738	update_curr(cfs_rq_of(se));
Peter Zijlstra	3a7e73a	2009-11-28 18:51:02 +0100	[diff] [blame]	3739	BUG_ON(!pse);
Venkatesh Pallipadi	2f36825	2011-04-14 10:30:53 -0700	[diff] [blame]	3740	if (wakeup_preempt_entity(se, pse) == 1) {
				3741	/*
				3742	* Bias pick_next to pick the sched entity that is
				3743	* triggering this preemption.
				3744	*/
				3745	if (!next_buddy_marked)
				3746	set_next_buddy(pse);
Peter Zijlstra	3a7e73a	2009-11-28 18:51:02 +0100	[diff] [blame]	3747	goto preempt;
Venkatesh Pallipadi	2f36825	2011-04-14 10:30:53 -0700	[diff] [blame]	3748	}
Jupyung Lee	a65ac74	2009-11-17 18:51:40 +0900	[diff] [blame]	3749
Peter Zijlstra	3a7e73a	2009-11-28 18:51:02 +0100	[diff] [blame]	3750	return;
				3751
				3752	preempt:
				3753	resched_task(curr);
				3754	/*
				3755	* Only set the backward buddy when the current task is still
				3756	* on the rq. This can happen when a wakeup gets interleaved
				3757	* with schedule on the ->pre_schedule() or idle_balance()
				3758	* point, either of which can * drop the rq lock.
				3759	*
				3760	* Also, during early boot the idle thread is in the fair class,
				3761	* for obvious reasons its a bad idea to schedule back to it.
				3762	*/
				3763	if (unlikely(!se->on_rq \|\| curr == rq->idle))
				3764	return;
				3765
				3766	if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
				3767	set_last_buddy(se);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3768	}
				3769
Ingo Molnar	fb8d472	2007-08-09 11:16:48 +0200	[diff] [blame]	3770	static struct task_struct pick_next_task_fair(struct rq rq)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3771	{
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	3772	struct task_struct *p;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3773	struct cfs_rq *cfs_rq = &rq->cfs;
				3774	struct sched_entity *se;
				3775
Tim Blechmann	36ace27	2009-11-24 11:55:45 +0100	[diff] [blame]	3776	if (!cfs_rq->nr_running)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3777	return NULL;
				3778
				3779	do {
Ingo Molnar	9948f4b	2007-08-09 11:16:48 +0200	[diff] [blame]	3780	se = pick_next_entity(cfs_rq);
Peter Zijlstra	f4b6755	2008-11-04 21:25:07 +0100	[diff] [blame]	3781	set_next_entity(cfs_rq, se);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3782	cfs_rq = group_cfs_rq(se);
				3783	} while (cfs_rq);
				3784
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	3785	p = task_of(se);
Mike Galbraith	b39e66e	2011-11-22 15:20:07 +0100	[diff] [blame]	3786	if (hrtick_enabled(rq))
				3787	hrtick_start_fair(rq, p);
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	3788
				3789	return p;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3790	}
				3791
				3792	/*
				3793	* Account for a descheduled task:
				3794	*/
Ingo Molnar	31ee529	2007-08-09 11:16:49 +0200	[diff] [blame]	3795	static void put_prev_task_fair(struct rq rq, struct task_struct prev)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3796	{
				3797	struct sched_entity *se = &prev->se;
				3798	struct cfs_rq *cfs_rq;
				3799
				3800	for_each_sched_entity(se) {
				3801	cfs_rq = cfs_rq_of(se);
Ingo Molnar	ab6cde2	2007-08-09 11:16:48 +0200	[diff] [blame]	3802	put_prev_entity(cfs_rq, se);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3803	}
				3804	}
				3805
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	3806	/*
				3807	* sched_yield() is very simple
				3808	*
				3809	* The magic of dealing with the ->skip buddy is in pick_next_entity.
				3810	*/
				3811	static void yield_task_fair(struct rq *rq)
				3812	{
				3813	struct task_struct *curr = rq->curr;
				3814	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
				3815	struct sched_entity *se = &curr->se;
				3816
				3817	/*
				3818	* Are we the only task in the tree?
				3819	*/
				3820	if (unlikely(rq->nr_running == 1))
				3821	return;
				3822
				3823	clear_buddies(cfs_rq, se);
				3824
				3825	if (curr->policy != SCHED_BATCH) {
				3826	update_rq_clock(rq);
				3827	/*
				3828	* Update run-time statistics of the 'current'.
				3829	*/
				3830	update_curr(cfs_rq);
Mike Galbraith	916671c	2011-11-22 15:21:26 +0100	[diff] [blame]	3831	/*
				3832	* Tell update_rq_clock() that we've just updated,
				3833	* so we don't do microscopic update in schedule()
				3834	* and double the fastpath cost.
				3835	*/
				3836	rq->skip_clock_update = 1;
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	3837	}
				3838
				3839	set_skip_buddy(se);
				3840	}
				3841
Mike Galbraith	d95f412	2011-02-01 09:50:51 -0500	[diff] [blame]	3842	static bool yield_to_task_fair(struct rq rq, struct task_struct p, bool preempt)
				3843	{
				3844	struct sched_entity *se = &p->se;
				3845
Paul Turner	5238cdd	2011-07-21 09:43:37 -0700	[diff] [blame]	3846	/* throttled hierarchies are not runnable */
				3847	if (!se->on_rq \|\| throttled_hierarchy(cfs_rq_of(se)))
Mike Galbraith	d95f412	2011-02-01 09:50:51 -0500	[diff] [blame]	3848	return false;
				3849
				3850	/* Tell the scheduler that we'd really like pse to run next. */
				3851	set_next_buddy(se);
				3852
Mike Galbraith	d95f412	2011-02-01 09:50:51 -0500	[diff] [blame]	3853	yield_task_fair(rq);
				3854
				3855	return true;
				3856	}
				3857
Peter Williams	681f3e6	2007-10-24 18:23:51 +0200	[diff] [blame]	3858	#ifdef CONFIG_SMP
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3859	/**************************************************
Peter Zijlstra	e9c84cb	2012-07-03 13:53:26 +0200	[diff] [blame]	3860	* Fair scheduling class load-balancing methods.
				3861	*
				3862	* BASICS
				3863	*
				3864	* The purpose of load-balancing is to achieve the same basic fairness the
				3865	* per-cpu scheduler provides, namely provide a proportional amount of compute
				3866	* time to each task. This is expressed in the following equation:
				3867	*
				3868	* W_i,n/P_i == W_j,n/P_j for all i,j (1)
				3869	*
				3870	* Where W_i,n is the n-th weight average for cpu i. The instantaneous weight
				3871	* W_i,0 is defined as:
				3872	*
				3873	* W_i,0 = \Sum_j w_i,j (2)
				3874	*
				3875	* Where w_i,j is the weight of the j-th runnable task on cpu i. This weight
				3876	* is derived from the nice value as per prio_to_weight[].
				3877	*
				3878	* The weight average is an exponential decay average of the instantaneous
				3879	* weight:
				3880	*
				3881	* W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3)
				3882	*
				3883	* P_i is the cpu power (or compute capacity) of cpu i, typically it is the
				3884	* fraction of 'recent' time available for SCHED_OTHER task execution. But it
				3885	* can also include other factors [XXX].
				3886	*
				3887	* To achieve this balance we define a measure of imbalance which follows
				3888	* directly from (1):
				3889	*
				3890	* imb_i,j = max{ avg(W/P), W_i/P_i } - min{ avg(W/P), W_j/P_j } (4)
				3891	*
				3892	* We then move tasks around to minimize the imbalance. In the continuous
				3893	* function space it is obvious this converges, in the discrete case we get
				3894	* a few fun cases generally called infeasible weight scenarios.
				3895	*
				3896	* [XXX expand on:
				3897	* - infeasible weights;
				3898	* - local vs global optima in the discrete case. ]
				3899	*
				3900	*
				3901	* SCHED DOMAINS
				3902	*
				3903	* In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
				3904	* for all i,j solution, we create a tree of cpus that follows the hardware
				3905	* topology where each level pairs two lower groups (or better). This results
				3906	* in O(log n) layers. Furthermore we reduce the number of cpus going up the
				3907	* tree to only the first of the previous level and we decrease the frequency
				3908	* of load-balance at each level inv. proportional to the number of cpus in
				3909	* the groups.
				3910	*
				3911	* This yields:
				3912	*
				3913	*     log_2 n     1     n
				3914	*   \Sum       { --- * --- * 2^i } = O(n)                            (5)
				3915	*     i = 0      2^i   2^i
				3916	*                             `- size of each group
				3917	*    |            |     `- number of cpus doing load-balance
				3918	*    |            `- freq
				3919	*    `- sum over all levels
				3920	*
				3921	* Coupled with a limit on how many tasks we can migrate every balance pass,
				3922	* this makes (5) the runtime complexity of the balancer.
				3923	*
				3924	* An important property here is that each CPU is still (indirectly) connected
				3925	* to every other cpu in at most O(log n) steps:
				3926	*
				3927	* The adjacency matrix of the resulting graph is given by:
				3928	*
				3929	*           log_2 n
				3930	*   A_i,j = \Union     (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1)  (6)
				3931	*           k = 0
				3932	*
				3933	* And you'll find that:
				3934	*
				3935	* A^(log_2 n)_i,j != 0 for all i,j (7)
				3936	*
				3937	* Showing there's indeed a path between every cpu in at most O(log n) steps.
				3938	* The task movement gives a factor of O(m), giving a convergence complexity
				3939	* of:
				3940	*
				3941	* O(nm log n), n := nr_cpus, m := nr_tasks (8)
				3942	*
				3943	*
				3944	* WORK CONSERVING
				3945	*
				3946	* In order to avoid CPUs going idle while there's still work to do, new idle
				3947	* balancing is more aggressive and has the newly idle cpu iterate up the domain
				3948	* tree itself instead of relying on other CPUs to bring it work.
				3949	*
				3950	* This adds some complexity to both (5) and (8) but it reduces the total idle
				3951	* time.
				3952	*
				3953	* [XXX more?]
				3954	*
				3955	*
				3956	* CGROUPS
				3957	*
				3958	* Cgroups make a horror show out of (2), instead of a simple sum we get:
				3959	*
				3960	*                                s_k,i
				3961	*   W_i,0 = \Sum_j \Prod_k w_k * -----                               (9)
				3962	*                                 S_k
				3963	*
				3964	* Where
				3965	*
				3966	* s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10)
				3967	*
				3968	* w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i.
				3969	*
				3970	* The big problem is S_k, it's a global sum needed to compute a local (W_i)
				3971	* property.
				3972	*
				3973	* [XXX write more on how we solve this.. _after_ merging pjt's patches that
				3974	* rewrite all of this once again.]
				3975	*/
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3976
Hiroshi Shimamoto	ed387b7	2012-01-31 11:40:32 +0900	[diff] [blame]	3977	static unsigned long __read_mostly max_load_balance_interval = HZ/10;
				3978
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	3979	#define LBF_ALL_PINNED 0x01
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	3980	#define LBF_NEED_BREAK 0x02
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	3981	#define LBF_DST_PINNED 0x04
				3982	#define LBF_SOME_PINNED 0x08
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	3983
				3984	struct lb_env {
				3985	struct sched_domain *sd;
				3986
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	3987	struct rq *src_rq;
Prashanth Nageshappa	85c1e7d	2012-06-19 17:47:34 +0530	[diff] [blame]	3988	int src_cpu;
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	3989
				3990	int dst_cpu;
				3991	struct rq *dst_rq;
				3992
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	3993	struct cpumask *dst_grpmask;
				3994	int new_dst_cpu;
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	3995	enum cpu_idle_type idle;
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	3996	long imbalance;
Michael Wang	b940313	2012-07-12 16:10:13 +0800	[diff] [blame]	3997	/* The set of CPUs under consideration for load-balancing */
				3998	struct cpumask *cpus;
				3999
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	4000	unsigned int flags;
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4001
				4002	unsigned int loop;
				4003	unsigned int loop_break;
				4004	unsigned int loop_max;
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	4005	};
				4006
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4007	/*
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	4008	* move_task - move a task from one runqueue to another runqueue.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4009	* Both runqueues must be locked.
				4010	*/
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	4011	static void move_task(struct task_struct p, struct lb_env env)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4012	{
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	4013	deactivate_task(env->src_rq, p, 0);
				4014	set_task_cpu(p, env->dst_cpu);
				4015	activate_task(env->dst_rq, p, 0);
				4016	check_preempt_curr(env->dst_rq, p, 0);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4017	}
				4018
				4019	/*
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	4020	* Is this task likely cache-hot:
				4021	*/
				4022	static int
				4023	task_hot(struct task_struct p, u64 now, struct sched_domain sd)
				4024	{
				4025	s64 delta;
				4026
				4027	if (p->sched_class != &fair_sched_class)
				4028	return 0;
				4029
				4030	if (unlikely(p->policy == SCHED_IDLE))
				4031	return 0;
				4032
				4033	/*
				4034	* Buddy candidates are cache hot:
				4035	*/
				4036	if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
				4037	(&p->se == cfs_rq_of(&p->se)->next \|\|
				4038	&p->se == cfs_rq_of(&p->se)->last))
				4039	return 1;
				4040
				4041	if (sysctl_sched_migration_cost == -1)
				4042	return 1;
				4043	if (sysctl_sched_migration_cost == 0)
				4044	return 0;
				4045
				4046	delta = now - p->se.exec_start;
				4047
				4048	return delta < (s64)sysctl_sched_migration_cost;
				4049	}
				4050
				4051	/*
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4052	* can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
				4053	*/
				4054	static
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	4055	int can_migrate_task(struct task_struct p, struct lb_env env)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4056	{
				4057	int tsk_cache_hot = 0;
				4058	/*
				4059	* We do not migrate tasks that are:
Joonsoo Kim	d319808	2013-04-23 17:27:40 +0900	[diff] [blame]	4060	* 1) throttled_lb_pair, or
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4061	* 2) cannot be migrated to this CPU due to cpus_allowed, or
Joonsoo Kim	d319808	2013-04-23 17:27:40 +0900	[diff] [blame]	4062	* 3) running (obviously), or
				4063	* 4) are cache-hot on their current CPU.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4064	*/
Joonsoo Kim	d319808	2013-04-23 17:27:40 +0900	[diff] [blame]	4065	if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
				4066	return 0;
				4067
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	4068	if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
Joonsoo Kim	e02e60c	2013-04-23 17:27:42 +0900	[diff] [blame]	4069	int cpu;
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	4070
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	4071	schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	4072
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	4073	env->flags \|= LBF_SOME_PINNED;
				4074
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	4075	/*
				4076	* Remember if this task can be migrated to any other cpu in
				4077	* our sched_group. We may want to revisit it if we couldn't
				4078	* meet load balance goals by pulling other tasks on src_cpu.
				4079	*
				4080	* Also avoid computing new_dst_cpu if we have already computed
				4081	* one in current iteration.
				4082	*/
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	4083	if (!env->dst_grpmask \|\| (env->flags & LBF_DST_PINNED))
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	4084	return 0;
				4085
Joonsoo Kim	e02e60c	2013-04-23 17:27:42 +0900	[diff] [blame]	4086	/* Prevent to re-select dst_cpu via env's cpus */
				4087	for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
				4088	if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	4089	env->flags \|= LBF_DST_PINNED;
Joonsoo Kim	e02e60c	2013-04-23 17:27:42 +0900	[diff] [blame]	4090	env->new_dst_cpu = cpu;
				4091	break;
				4092	}
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	4093	}
Joonsoo Kim	e02e60c	2013-04-23 17:27:42 +0900	[diff] [blame]	4094
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4095	return 0;
				4096	}
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	4097
				4098	/* Record that we found atleast one task that could run on dst_cpu */
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	4099	env->flags &= ~LBF_ALL_PINNED;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4100
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	4101	if (task_running(env->src_rq, p)) {
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	4102	schedstat_inc(p, se.statistics.nr_failed_migrations_running);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4103	return 0;
				4104	}
				4105
				4106	/*
				4107	* Aggressive migration if:
				4108	* 1) task is cache cold, or
				4109	* 2) too many balance attempts have failed.
				4110	*/
				4111
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	4112	tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4113	if (!tsk_cache_hot \|\|
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	4114	env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
Zhang Hang	4e2dcb7	2013-04-10 14:04:55 +0800	[diff] [blame]	4115
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4116	if (tsk_cache_hot) {
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	4117	schedstat_inc(env->sd, lb_hot_gained[env->idle]);
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	4118	schedstat_inc(p, se.statistics.nr_forced_migrations);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4119	}
Zhang Hang	4e2dcb7	2013-04-10 14:04:55 +0800	[diff] [blame]	4120
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4121	return 1;
				4122	}
				4123
Zhang Hang	4e2dcb7	2013-04-10 14:04:55 +0800	[diff] [blame]	4124	schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
				4125	return 0;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4126	}
				4127
Peter Zijlstra	897c395	2009-12-17 17:45:42 +0100	[diff] [blame]	4128	/*
				4129	* move_one_task tries to move exactly one task from busiest to this_rq, as
				4130	* part of active balancing operations within "domain".
				4131	* Returns 1 if successful and 0 otherwise.
				4132	*
				4133	* Called with both runqueues locked.
				4134	*/
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	4135	static int move_one_task(struct lb_env *env)
Peter Zijlstra	897c395	2009-12-17 17:45:42 +0100	[diff] [blame]	4136	{
				4137	struct task_struct p, n;
Peter Zijlstra	897c395	2009-12-17 17:45:42 +0100	[diff] [blame]	4138
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4139	list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4140	if (!can_migrate_task(p, env))
				4141	continue;
Peter Zijlstra	897c395	2009-12-17 17:45:42 +0100	[diff] [blame]	4142
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4143	move_task(p, env);
				4144	/*
				4145	* Right now, this is only the second place move_task()
				4146	* is called, so we can safely collect move_task()
				4147	* stats here rather than inside move_task().
				4148	*/
				4149	schedstat_inc(env->sd, lb_gained[env->idle]);
				4150	return 1;
Peter Zijlstra	897c395	2009-12-17 17:45:42 +0100	[diff] [blame]	4151	}
Peter Zijlstra	897c395	2009-12-17 17:45:42 +0100	[diff] [blame]	4152	return 0;
				4153	}
				4154
/* Defined further down; declared here because move_tasks() needs it first. */
static unsigned long task_h_load(struct task_struct *p);

/*
 * Amount by which move_tasks() advances env->loop_break each time it takes
 * a breather.
 */
static const unsigned int sched_nr_migrate_break = 32;
				4158
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	4159	/*
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4160	* move_tasks tries to move up to imbalance weighted load from busiest to
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	4161	* this_rq, as part of a balancing operation within domain "sd".
				4162	* Returns 1 if successful and 0 otherwise.
				4163	*
				4164	* Called with both runqueues locked.
				4165	*/
				4166	static int move_tasks(struct lb_env *env)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4167	{
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	4168	struct list_head *tasks = &env->src_rq->cfs_tasks;
				4169	struct task_struct *p;
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4170	unsigned long load;
				4171	int pulled = 0;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4172
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4173	if (env->imbalance <= 0)
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	4174	return 0;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4175
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	4176	while (!list_empty(tasks)) {
				4177	p = list_first_entry(tasks, struct task_struct, se.group_node);
				4178
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4179	env->loop++;
				4180	/* We've more or less seen every task there is, call it quits */
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	4181	if (env->loop > env->loop_max)
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4182	break;
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	4183
				4184	/* take a breather every nr_migrate tasks */
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4185	if (env->loop > env->loop_break) {
Peter Zijlstra	eb95308	2012-04-17 13:38:40 +0200	[diff] [blame]	4186	env->loop_break += sched_nr_migrate_break;
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	4187	env->flags \|= LBF_NEED_BREAK;
Peter Zijlstra	ee00e66	2009-12-17 17:25:20 +0100	[diff] [blame]	4188	break;
Peter Zijlstra	a195f00	2011-09-22 15:30:18 +0200	[diff] [blame]	4189	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4190
Joonsoo Kim	d319808	2013-04-23 17:27:40 +0900	[diff] [blame]	4191	if (!can_migrate_task(p, env))
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4192	goto next;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4193
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4194	load = task_h_load(p);
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	4195
Peter Zijlstra	eb95308	2012-04-17 13:38:40 +0200	[diff] [blame]	4196	if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4197	goto next;
				4198
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4199	if ((load / 2) > env->imbalance)
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4200	goto next;
				4201
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	4202	move_task(p, env);
Peter Zijlstra	ee00e66	2009-12-17 17:25:20 +0100	[diff] [blame]	4203	pulled++;
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4204	env->imbalance -= load;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4205
				4206	#ifdef CONFIG_PREEMPT
Peter Zijlstra	ee00e66	2009-12-17 17:25:20 +0100	[diff] [blame]	4207	/*
				4208	* NEWIDLE balancing is a source of latency, so preemptible
				4209	* kernels will stop after the first task is pulled to minimize
				4210	* the critical section.
				4211	*/
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	4212	if (env->idle == CPU_NEWLY_IDLE)
Peter Zijlstra	ee00e66	2009-12-17 17:25:20 +0100	[diff] [blame]	4213	break;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4214	#endif
				4215
Peter Zijlstra	ee00e66	2009-12-17 17:25:20 +0100	[diff] [blame]	4216	/*
				4217	* We only want to steal up to the prescribed amount of
				4218	* weighted load.
				4219	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4220	if (env->imbalance <= 0)
Peter Zijlstra	ee00e66	2009-12-17 17:25:20 +0100	[diff] [blame]	4221	break;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4222
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4223	continue;
				4224	next:
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	4225	list_move_tail(&p->se.group_node, tasks);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4226	}
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	4227
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4228	/*
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	4229	* Right now, this is one of only two places move_task() is called,
				4230	* so we can safely collect move_task() stats here rather than
				4231	* inside move_task().
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4232	*/
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	4233	schedstat_add(env->sd, lb_gained[env->idle], pulled);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4234
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	4235	return pulled;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4236	}
				4237
Peter Zijlstra	230059de	2009-12-17 17:47:12 +0100	[diff] [blame]	4238	#ifdef CONFIG_FAIR_GROUP_SCHED
Peter Zijlstra	9e3081c	2010-11-15 15:47:02 -0800	[diff] [blame]	4239	/*
				4240	* update tg->load_weight by folding this cpu's load_avg
				4241	*/
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	4242	static void __update_blocked_averages_cpu(struct task_group *tg, int cpu)
Peter Zijlstra	9e3081c	2010-11-15 15:47:02 -0800	[diff] [blame]	4243	{
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	4244	struct sched_entity *se = tg->se[cpu];
				4245	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
Peter Zijlstra	9e3081c	2010-11-15 15:47:02 -0800	[diff] [blame]	4246
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	4247	/* throttled entities do not contribute to load */
				4248	if (throttled_hierarchy(cfs_rq))
				4249	return;
Peter Zijlstra	9e3081c	2010-11-15 15:47:02 -0800	[diff] [blame]	4250
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	4251	update_cfs_rq_blocked_load(cfs_rq, 1);
Peter Zijlstra	9e3081c	2010-11-15 15:47:02 -0800	[diff] [blame]	4252
Paul Turner	8295836	2012-10-04 13:18:31 +0200	[diff] [blame]	4253	if (se) {
				4254	update_entity_load_avg(se, 1);
				4255	/*
				4256	* We pivot on our runnable average having decayed to zero for
				4257	* list removal. This generally implies that all our children
				4258	* have also been removed (modulo rounding error or bandwidth
				4259	* control); however, such cases are rare and we can fix these
				4260	* at enqueue.
				4261	*
				4262	* TODO: fix up out-of-order children on enqueue.
				4263	*/
				4264	if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running)
				4265	list_del_leaf_cfs_rq(cfs_rq);
				4266	} else {
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	4267	struct rq *rq = rq_of(cfs_rq);
Paul Turner	8295836	2012-10-04 13:18:31 +0200	[diff] [blame]	4268	update_rq_runnable_avg(rq, rq->nr_running);
				4269	}
Peter Zijlstra	9e3081c	2010-11-15 15:47:02 -0800	[diff] [blame]	4270	}
				4271
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	4272	static void update_blocked_averages(int cpu)
Peter Zijlstra	9e3081c	2010-11-15 15:47:02 -0800	[diff] [blame]	4273	{
Peter Zijlstra	9e3081c	2010-11-15 15:47:02 -0800	[diff] [blame]	4274	struct rq *rq = cpu_rq(cpu);
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	4275	struct cfs_rq *cfs_rq;
				4276	unsigned long flags;
Peter Zijlstra	9e3081c	2010-11-15 15:47:02 -0800	[diff] [blame]	4277
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	4278	raw_spin_lock_irqsave(&rq->lock, flags);
				4279	update_rq_clock(rq);
Peter Zijlstra	9763b67	2011-07-13 13:09:25 +0200	[diff] [blame]	4280	/*
				4281	* Iterates the task_group tree in a bottom up fashion, see
				4282	* list_add_leaf_cfs_rq() for details.
				4283	*/
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	4284	for_each_leaf_cfs_rq(rq, cfs_rq) {
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	4285	/*
				4286	* Note: We may want to consider periodically releasing
				4287	* rq->lock about these updates so that creating many task
				4288	* groups does not result in continually extending hold time.
				4289	*/
				4290	__update_blocked_averages_cpu(cfs_rq->tg, rq->cpu);
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	4291	}
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	4292
				4293	raw_spin_unlock_irqrestore(&rq->lock, flags);
Peter Zijlstra	9e3081c	2010-11-15 15:47:02 -0800	[diff] [blame]	4294	}
				4295
Peter Zijlstra	9763b67	2011-07-13 13:09:25 +0200	[diff] [blame]	4296	/*
Vladimir Davydov	6852079	2013-07-15 17:49:19 +0400	[diff] [blame]	4297	* Compute the hierarchical load factor for cfs_rq and all its ascendants.
Peter Zijlstra	9763b67	2011-07-13 13:09:25 +0200	[diff] [blame]	4298	* This needs to be done in a top-down fashion because the load of a child
				4299	* group is a fraction of its parents load.
				4300	*/
Vladimir Davydov	6852079	2013-07-15 17:49:19 +0400	[diff] [blame]	4301	static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
Peter Zijlstra	9763b67	2011-07-13 13:09:25 +0200	[diff] [blame]	4302	{
Vladimir Davydov	6852079	2013-07-15 17:49:19 +0400	[diff] [blame]	4303	struct rq *rq = rq_of(cfs_rq);
				4304	struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
Peter Zijlstra	a35b646	2012-08-08 21:46:40 +0200	[diff] [blame]	4305	unsigned long now = jiffies;
Vladimir Davydov	6852079	2013-07-15 17:49:19 +0400	[diff] [blame]	4306	unsigned long load;
Peter Zijlstra	a35b646	2012-08-08 21:46:40 +0200	[diff] [blame]	4307
Vladimir Davydov	6852079	2013-07-15 17:49:19 +0400	[diff] [blame]	4308	if (cfs_rq->last_h_load_update == now)
Peter Zijlstra	a35b646	2012-08-08 21:46:40 +0200	[diff] [blame]	4309	return;
				4310
Vladimir Davydov	6852079	2013-07-15 17:49:19 +0400	[diff] [blame]	4311	cfs_rq->h_load_next = NULL;
				4312	for_each_sched_entity(se) {
				4313	cfs_rq = cfs_rq_of(se);
				4314	cfs_rq->h_load_next = se;
				4315	if (cfs_rq->last_h_load_update == now)
				4316	break;
				4317	}
Peter Zijlstra	a35b646	2012-08-08 21:46:40 +0200	[diff] [blame]	4318
Vladimir Davydov	6852079	2013-07-15 17:49:19 +0400	[diff] [blame]	4319	if (!se) {
Vladimir Davydov	7e3115e	2013-09-14 19:39:46 +0400	[diff] [blame]	4320	cfs_rq->h_load = cfs_rq->runnable_load_avg;
Vladimir Davydov	6852079	2013-07-15 17:49:19 +0400	[diff] [blame]	4321	cfs_rq->last_h_load_update = now;
				4322	}
				4323
				4324	while ((se = cfs_rq->h_load_next) != NULL) {
				4325	load = cfs_rq->h_load;
				4326	load = div64_ul(load * se->avg.load_avg_contrib,
				4327	cfs_rq->runnable_load_avg + 1);
				4328	cfs_rq = group_cfs_rq(se);
				4329	cfs_rq->h_load = load;
				4330	cfs_rq->last_h_load_update = now;
				4331	}
Peter Zijlstra	9763b67	2011-07-13 13:09:25 +0200	[diff] [blame]	4332	}
				4333
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4334	static unsigned long task_h_load(struct task_struct *p)
Peter Zijlstra	230059de	2009-12-17 17:47:12 +0100	[diff] [blame]	4335	{
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4336	struct cfs_rq *cfs_rq = task_cfs_rq(p);
Peter Zijlstra	230059de	2009-12-17 17:47:12 +0100	[diff] [blame]	4337
Vladimir Davydov	6852079	2013-07-15 17:49:19 +0400	[diff] [blame]	4338	update_cfs_rq_h_load(cfs_rq);
Alex Shi	a003a25	2013-06-20 10:18:51 +0800	[diff] [blame]	4339	return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load,
				4340	cfs_rq->runnable_load_avg + 1);
Peter Zijlstra	230059de	2009-12-17 17:47:12 +0100	[diff] [blame]	4341	}
				4342	#else
/* Without CONFIG_FAIR_GROUP_SCHED this is a no-op. */
static inline void update_blocked_averages(int cpu) { }
				4346
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4347	static unsigned long task_h_load(struct task_struct *p)
				4348	{
Alex Shi	a003a25	2013-06-20 10:18:51 +0800	[diff] [blame]	4349	return p->se.avg.load_avg_contrib;
Peter Zijlstra	230059de	2009-12-17 17:47:12 +0100	[diff] [blame]	4350	}
				4351	#endif
				4352
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4353	/******** Helpers for find_busiest_group **********************/
				4354	/*
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4355	* sg_lb_stats - stats of a sched_group required for load_balancing
				4356	*/
				4357	struct sg_lb_stats {
				4358	unsigned long avg_load; /Avg load across the CPUs of the group /
				4359	unsigned long group_load; /* Total load over the CPUs of the group */
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4360	unsigned long sum_weighted_load; /* Weighted load of group's tasks */
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	4361	unsigned long load_per_task;
Peter Zijlstra	3ae11c9	2013-08-15 20:37:48 +0200	[diff] [blame]	4362	unsigned long group_power;
Peter Zijlstra	147c5fc	2013-08-19 15:22:57 +0200	[diff] [blame]	4363	unsigned int sum_nr_running; /* Nr tasks running in the group */
				4364	unsigned int group_capacity;
				4365	unsigned int idle_cpus;
				4366	unsigned int group_weight;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4367	int group_imb; /* Is there an imbalance in the group ? */
Nikhil Rao	fab4762	2010-10-15 13:12:29 -0700	[diff] [blame]	4368	int group_has_capacity; /* Is there extra capacity in the group? */
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4369	};
				4370
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	4371	/*
				4372	* sd_lb_stats - Structure to store the statistics of a sched_domain
				4373	* during load balancing.
				4374	*/
				4375	struct sd_lb_stats {
				4376	struct sched_group busiest; / Busiest group in this sd */
				4377	struct sched_group local; / Local group in this sd */
				4378	unsigned long total_load; /* Total load of all groups in sd */
				4379	unsigned long total_pwr; /* Total power of all groups in sd */
				4380	unsigned long avg_load; /* Average load across all groups in sd */
				4381
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	4382	struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
Peter Zijlstra	147c5fc	2013-08-19 15:22:57 +0200	[diff] [blame]	4383	struct sg_lb_stats local_stat; /* Statistics of the local group */
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	4384	};
				4385
Peter Zijlstra	147c5fc	2013-08-19 15:22:57 +0200	[diff] [blame]	4386	static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
				4387	{
				4388	/*
				4389	* Skimp on the clearing to avoid duplicate work. We can avoid clearing
				4390	* local_stat because update_sg_lb_stats() does a full clear/assignment.
				4391	* We must however clear busiest_stat::avg_load because
				4392	* update_sd_pick_busiest() reads this before assignment.
				4393	*/
				4394	*sds = (struct sd_lb_stats){
				4395	.busiest = NULL,
				4396	.local = NULL,
				4397	.total_load = 0UL,
				4398	.total_pwr = 0UL,
				4399	.busiest_stat = {
				4400	.avg_load = 0UL,
				4401	},
				4402	};
				4403	}
				4404
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4405	/**
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4406	* get_sd_load_idx - Obtain the load index for a given sched domain.
				4407	* @sd: The sched_domain whose load_idx is to be obtained.
				4408	* @idle: The Idle status of the CPU for whose sd load_icx is obtained.
Yacine Belkadi	e69f618	2013-07-12 20:45:47 +0200	[diff] [blame]	4409	*
				4410	* Return: The load index.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4411	*/
				4412	static inline int get_sd_load_idx(struct sched_domain *sd,
				4413	enum cpu_idle_type idle)
				4414	{
				4415	int load_idx;
				4416
				4417	switch (idle) {
				4418	case CPU_NOT_IDLE:
				4419	load_idx = sd->busy_idx;
				4420	break;
				4421
				4422	case CPU_NEWLY_IDLE:
				4423	load_idx = sd->newidle_idx;
				4424	break;
				4425	default:
				4426	load_idx = sd->idle_idx;
				4427	break;
				4428	}
				4429
				4430	return load_idx;
				4431	}
				4432
Li Zefan	15f803c	2013-03-05 16:07:11 +0800	[diff] [blame]	4433	static unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4434	{
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	4435	return SCHED_POWER_SCALE;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4436	}
				4437
				4438	unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
				4439	{
				4440	return default_scale_freq_power(sd, cpu);
				4441	}
				4442
Li Zefan	15f803c	2013-03-05 16:07:11 +0800	[diff] [blame]	4443	static unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4444	{
Peter Zijlstra	669c55e	2010-04-16 14:59:29 +0200	[diff] [blame]	4445	unsigned long weight = sd->span_weight;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4446	unsigned long smt_gain = sd->smt_gain;
				4447
				4448	smt_gain /= weight;
				4449
				4450	return smt_gain;
				4451	}
				4452
				4453	unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
				4454	{
				4455	return default_scale_smt_power(sd, cpu);
				4456	}
				4457
Li Zefan	15f803c	2013-03-05 16:07:11 +0800	[diff] [blame]	4458	static unsigned long scale_rt_power(int cpu)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4459	{
				4460	struct rq *rq = cpu_rq(cpu);
Peter Zijlstra	b654f7d	2012-05-22 14:04:28 +0200	[diff] [blame]	4461	u64 total, available, age_stamp, avg;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4462
Peter Zijlstra	b654f7d	2012-05-22 14:04:28 +0200	[diff] [blame]	4463	/*
				4464	* Since we're reading these variables without serialization make sure
				4465	* we read them once before doing sanity checks on them.
				4466	*/
				4467	age_stamp = ACCESS_ONCE(rq->age_stamp);
				4468	avg = ACCESS_ONCE(rq->rt_avg);
Venkatesh Pallipadi	aa48380	2010-10-04 17:03:22 -0700	[diff] [blame]	4469
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	4470	total = sched_avg_period() + (rq_clock(rq) - age_stamp);
Peter Zijlstra	b654f7d	2012-05-22 14:04:28 +0200	[diff] [blame]	4471
				4472	if (unlikely(total < avg)) {
Venkatesh Pallipadi	aa48380	2010-10-04 17:03:22 -0700	[diff] [blame]	4473	/* Ensures that power won't end up being negative */
				4474	available = 0;
				4475	} else {
Peter Zijlstra	b654f7d	2012-05-22 14:04:28 +0200	[diff] [blame]	4476	available = total - avg;
Venkatesh Pallipadi	aa48380	2010-10-04 17:03:22 -0700	[diff] [blame]	4477	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4478
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	4479	if (unlikely((s64)total < SCHED_POWER_SCALE))
				4480	total = SCHED_POWER_SCALE;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4481
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	4482	total >>= SCHED_POWER_SHIFT;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4483
				4484	return div_u64(available, total);
				4485	}
				4486
				4487	static void update_cpu_power(struct sched_domain *sd, int cpu)
				4488	{
Peter Zijlstra	669c55e	2010-04-16 14:59:29 +0200	[diff] [blame]	4489	unsigned long weight = sd->span_weight;
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	4490	unsigned long power = SCHED_POWER_SCALE;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4491	struct sched_group *sdg = sd->groups;
				4492
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4493	if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
				4494	if (sched_feat(ARCH_POWER))
				4495	power *= arch_scale_smt_power(sd, cpu);
				4496	else
				4497	power *= default_scale_smt_power(sd, cpu);
				4498
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	4499	power >>= SCHED_POWER_SHIFT;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4500	}
				4501
Peter Zijlstra	9c3f75c	2011-07-14 13:00:06 +0200	[diff] [blame]	4502	sdg->sgp->power_orig = power;
Srivatsa Vaddagiri	9d5efe0	2010-06-08 14:57:02 +1000	[diff] [blame]	4503
				4504	if (sched_feat(ARCH_POWER))
				4505	power *= arch_scale_freq_power(sd, cpu);
				4506	else
				4507	power *= default_scale_freq_power(sd, cpu);
				4508
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	4509	power >>= SCHED_POWER_SHIFT;
Srivatsa Vaddagiri	9d5efe0	2010-06-08 14:57:02 +1000	[diff] [blame]	4510
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4511	power *= scale_rt_power(cpu);
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	4512	power >>= SCHED_POWER_SHIFT;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4513
				4514	if (!power)
				4515	power = 1;
				4516
Peter Zijlstra	e51fd5e	2010-05-31 12:37:30 +0200	[diff] [blame]	4517	cpu_rq(cpu)->cpu_power = power;
Peter Zijlstra	9c3f75c	2011-07-14 13:00:06 +0200	[diff] [blame]	4518	sdg->sgp->power = power;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4519	}
				4520
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	4521	void update_group_power(struct sched_domain *sd, int cpu)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4522	{
				4523	struct sched_domain *child = sd->child;
				4524	struct sched_group group, sdg = sd->groups;
Peter Zijlstra	863bffc	2013-08-28 11:44:39 +0200	[diff] [blame]	4525	unsigned long power, power_orig;
Vincent Guittot	4ec4412	2011-12-12 20:21:08 +0100	[diff] [blame]	4526	unsigned long interval;
				4527
				4528	interval = msecs_to_jiffies(sd->balance_interval);
				4529	interval = clamp(interval, 1UL, max_load_balance_interval);
				4530	sdg->sgp->next_update = jiffies + interval;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4531
				4532	if (!child) {
				4533	update_cpu_power(sd, cpu);
				4534	return;
				4535	}
				4536
Peter Zijlstra	863bffc	2013-08-28 11:44:39 +0200	[diff] [blame]	4537	power_orig = power = 0;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4538
Peter Zijlstra	74a5ce2	2012-05-23 18:00:43 +0200	[diff] [blame]	4539	if (child->flags & SD_OVERLAP) {
				4540	/*
				4541	* SD_OVERLAP domains cannot assume that child groups
				4542	* span the current group.
				4543	*/
				4544
Peter Zijlstra	863bffc	2013-08-28 11:44:39 +0200	[diff] [blame]	4545	for_each_cpu(cpu, sched_group_cpus(sdg)) {
				4546	struct sched_group *sg = cpu_rq(cpu)->sd->groups;
				4547
				4548	power_orig += sg->sgp->power_orig;
				4549	power += sg->sgp->power;
				4550	}
Peter Zijlstra	74a5ce2	2012-05-23 18:00:43 +0200	[diff] [blame]	4551	} else {
				4552	/*
				4553	* !SD_OVERLAP domains can assume that child groups
				4554	* span the current group.
				4555	*/
				4556
				4557	group = child->groups;
				4558	do {
Peter Zijlstra	863bffc	2013-08-28 11:44:39 +0200	[diff] [blame]	4559	power_orig += group->sgp->power_orig;
Peter Zijlstra	74a5ce2	2012-05-23 18:00:43 +0200	[diff] [blame]	4560	power += group->sgp->power;
				4561	group = group->next;
				4562	} while (group != child->groups);
				4563	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4564
Peter Zijlstra	863bffc	2013-08-28 11:44:39 +0200	[diff] [blame]	4565	sdg->sgp->power_orig = power_orig;
				4566	sdg->sgp->power = power;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4567	}
				4568
Srivatsa Vaddagiri	9d5efe0	2010-06-08 14:57:02 +1000	[diff] [blame]	4569	/*
				4570	* Try and fix up capacity for tiny siblings, this is needed when
				4571	* things like SD_ASYM_PACKING need f_b_g to select another sibling
				4572	* which on its own isn't powerful enough.
				4573	*
				4574	* See update_sd_pick_busiest() and check_asym_packing().
				4575	*/
				4576	static inline int
				4577	fix_small_capacity(struct sched_domain sd, struct sched_group group)
				4578	{
				4579	/*
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	4580	* Only siblings can have significantly less than SCHED_POWER_SCALE
Srivatsa Vaddagiri	9d5efe0	2010-06-08 14:57:02 +1000	[diff] [blame]	4581	*/
Peter Zijlstra	a6c75f2	2011-04-07 14:09:52 +0200	[diff] [blame]	4582	if (!(sd->flags & SD_SHARE_CPUPOWER))
Srivatsa Vaddagiri	9d5efe0	2010-06-08 14:57:02 +1000	[diff] [blame]	4583	return 0;
				4584
				4585	/*
				4586	* If ~90% of the cpu_power is still there, we're good.
				4587	*/
Peter Zijlstra	9c3f75c	2011-07-14 13:00:06 +0200	[diff] [blame]	4588	if (group->sgp->power * 32 > group->sgp->power_orig * 29)
Srivatsa Vaddagiri	9d5efe0	2010-06-08 14:57:02 +1000	[diff] [blame]	4589	return 1;
				4590
				4591	return 0;
				4592	}
				4593
Peter Zijlstra	30ce5da	2013-08-15 20:29:29 +0200	[diff] [blame]	4594	/*
				4595	* Group imbalance indicates (and tries to solve) the problem where balancing
				4596	* groups is inadequate due to tsk_cpus_allowed() constraints.
				4597	*
				4598	* Imagine a situation of two groups of 4 cpus each and 4 tasks each with a
				4599	* cpumask covering 1 cpu of the first group and 3 cpus of the second group.
				4600	* Something like:
				4601	*
				4602	* { 0 1 2 3 } { 4 5 6 7 }
				4603	* * * * *
				4604	*
				4605	* If we were to balance group-wise we'd place two tasks in the first group and
				4606	* two tasks in the second group. Clearly this is undesired as it will overload
				4607	* cpu 3 and leave one of the cpus in the second group unused.
				4608	*
				4609	* The current solution to this issue is detecting the skew in the first group
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	4610	* by noticing the lower domain failed to reach balance and had difficulty
				4611	* moving tasks due to affinity constraints.
Peter Zijlstra	30ce5da	2013-08-15 20:29:29 +0200	[diff] [blame]	4612	*
				4613	* When this is so detected; this group becomes a candidate for busiest; see
				4614	* update_sd_pick_busiest(). And calculcate_imbalance() and
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	4615	* find_busiest_group() avoid some of the usual balance conditions to allow it
Peter Zijlstra	30ce5da	2013-08-15 20:29:29 +0200	[diff] [blame]	4616	* to create an effective group imbalance.
				4617	*
				4618	* This is a somewhat tricky proposition since the next run might not find the
				4619	* group imbalance and decide the groups need to be balanced again. A most
				4620	* subtle and fragile situation.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4621	*/
Peter Zijlstra	30ce5da	2013-08-15 20:29:29 +0200	[diff] [blame]	4622
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	4623	static inline int sg_imbalanced(struct sched_group *group)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4624	{
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	4625	return group->sgp->imbalance;
Peter Zijlstra	30ce5da	2013-08-15 20:29:29 +0200	[diff] [blame]	4626	}
				4627
Peter Zijlstra	b37d931	2013-08-28 11:50:34 +0200	[diff] [blame]	4628	/*
				4629	* Compute the group capacity.
				4630	*
Peter Zijlstra	c61037e	2013-08-28 12:40:38 +0200	[diff] [blame]	4631	* Avoid the issue where N*frac(smt_power) >= 1 creates 'phantom' cores by
				4632	* first dividing out the smt factor and computing the actual number of cores
				4633	* and limit power unit capacity with that.
Peter Zijlstra	b37d931	2013-08-28 11:50:34 +0200	[diff] [blame]	4634	*/
				4635	static inline int sg_capacity(struct lb_env env, struct sched_group group)
				4636	{
Peter Zijlstra	c61037e	2013-08-28 12:40:38 +0200	[diff] [blame]	4637	unsigned int capacity, smt, cpus;
				4638	unsigned int power, power_orig;
Peter Zijlstra	b37d931	2013-08-28 11:50:34 +0200	[diff] [blame]	4639
Peter Zijlstra	c61037e	2013-08-28 12:40:38 +0200	[diff] [blame]	4640	power = group->sgp->power;
				4641	power_orig = group->sgp->power_orig;
				4642	cpus = group->group_weight;
Peter Zijlstra	b37d931	2013-08-28 11:50:34 +0200	[diff] [blame]	4643
Peter Zijlstra	c61037e	2013-08-28 12:40:38 +0200	[diff] [blame]	4644	/* smt := ceil(cpus / power), assumes: 1 < smt_power < 2 */
				4645	smt = DIV_ROUND_UP(SCHED_POWER_SCALE * cpus, power_orig);
				4646	capacity = cpus / smt; /* cores */
				4647
				4648	capacity = min_t(unsigned, capacity, DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE));
Peter Zijlstra	b37d931	2013-08-28 11:50:34 +0200	[diff] [blame]	4649	if (!capacity)
				4650	capacity = fix_small_capacity(env->sd, group);
				4651
				4652	return capacity;
				4653	}
				4654
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4655	/**
				4656	* update_sg_lb_stats - Update sched_group's statistics for load balancing.
				4657	* @env: The load balancing environment.
				4658	* @group: sched_group whose statistics are to be updated.
				4659	* @load_idx: Load index of sched_domain of this_cpu for load calc.
				4660	* @local_group: Does group contain this_cpu.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4661	* @sgs: variable to hold the statistics for this group.
				4662	*/
				4663	static inline void update_sg_lb_stats(struct lb_env *env,
				4664	struct sched_group *group, int load_idx,
Joonsoo Kim	23f0d20	2013-08-06 17:36:42 +0900	[diff] [blame]	4665	int local_group, struct sg_lb_stats *sgs)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4666	{
Peter Zijlstra	30ce5da	2013-08-15 20:29:29 +0200	[diff] [blame]	4667	unsigned long nr_running;
				4668	unsigned long load;
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4669	int i;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4670
Peter Zijlstra	b72ff13	2013-08-28 10:32:32 +0200	[diff] [blame]	4671	memset(sgs, 0, sizeof(*sgs));
				4672
Michael Wang	b940313	2012-07-12 16:10:13 +0800	[diff] [blame]	4673	for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4674	struct rq *rq = cpu_rq(i);
				4675
Peter Zijlstra	e44bc5c	2012-05-11 00:22:12 +0200	[diff] [blame]	4676	nr_running = rq->nr_running;
				4677
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4678	/* Bias balancing toward cpus of our domain */
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	4679	if (local_group)
Peter Zijlstra	04f733b	2012-05-11 00:12:02 +0200	[diff] [blame]	4680	load = target_load(i, load_idx);
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	4681	else
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4682	load = source_load(i, load_idx);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4683
				4684	sgs->group_load += load;
Peter Zijlstra	e44bc5c	2012-05-11 00:22:12 +0200	[diff] [blame]	4685	sgs->sum_nr_running += nr_running;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4686	sgs->sum_weighted_load += weighted_cpuload(i);
Suresh Siddha	aae6d3d	2010-09-17 15:02:32 -0700	[diff] [blame]	4687	if (idle_cpu(i))
				4688	sgs->idle_cpus++;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4689	}
				4690
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4691	/* Adjust by relative CPU power of the group */
Peter Zijlstra	3ae11c9	2013-08-15 20:37:48 +0200	[diff] [blame]	4692	sgs->group_power = group->sgp->power;
				4693	sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4694
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	4695	if (sgs->sum_nr_running)
Peter Zijlstra	38d0f77	2013-08-15 19:47:56 +0200	[diff] [blame]	4696	sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4697
Suresh Siddha	aae6d3d	2010-09-17 15:02:32 -0700	[diff] [blame]	4698	sgs->group_weight = group->group_weight;
Nikhil Rao	fab4762	2010-10-15 13:12:29 -0700	[diff] [blame]	4699
Peter Zijlstra	b37d931	2013-08-28 11:50:34 +0200	[diff] [blame]	4700	sgs->group_imb = sg_imbalanced(group);
				4701	sgs->group_capacity = sg_capacity(env, group);
				4702
Nikhil Rao	fab4762	2010-10-15 13:12:29 -0700	[diff] [blame]	4703	if (sgs->group_capacity > sgs->sum_nr_running)
				4704	sgs->group_has_capacity = 1;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4705	}
				4706
				4707	/**
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4708	* update_sd_pick_busiest - return 1 on busiest group
Randy Dunlap	cd96891	2012-06-08 13:18:33 -0700	[diff] [blame]	4709	* @env: The load balancing environment.
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4710	* @sds: sched_domain statistics
				4711	* @sg: sched_group candidate to be checked for being the busiest
Michael Neuling	b6b1229	2010-06-10 12:06:21 +1000	[diff] [blame]	4712	* @sgs: sched_group statistics
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4713	*
				4714	* Determine if @sg is a busier group than the previously selected
				4715	* busiest group.
Yacine Belkadi	e69f618	2013-07-12 20:45:47 +0200	[diff] [blame]	4716	*
				4717	* Return: %true if @sg is a busier group than the previously selected
				4718	* busiest group. %false otherwise.
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4719	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4720	static bool update_sd_pick_busiest(struct lb_env *env,
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4721	struct sd_lb_stats *sds,
				4722	struct sched_group *sg,
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4723	struct sg_lb_stats *sgs)
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4724	{
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	4725	if (sgs->avg_load <= sds->busiest_stat.avg_load)
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4726	return false;
				4727
				4728	if (sgs->sum_nr_running > sgs->group_capacity)
				4729	return true;
				4730
				4731	if (sgs->group_imb)
				4732	return true;
				4733
				4734	/*
				4735	* ASYM_PACKING needs to move all the work to the lowest
				4736	* numbered CPUs in the group, therefore mark all groups
				4737	* higher than ourself as busy.
				4738	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4739	if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
				4740	env->dst_cpu < group_first_cpu(sg)) {
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4741	if (!sds->busiest)
				4742	return true;
				4743
				4744	if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
				4745	return true;
				4746	}
				4747
				4748	return false;
				4749	}
				4750
				4751	/**
Hui Kang	461819a	2011-10-11 23:00:59 -0400	[diff] [blame]	4752	* update_sd_lb_stats - Update sched_domain's statistics for load balancing.
Randy Dunlap	cd96891	2012-06-08 13:18:33 -0700	[diff] [blame]	4753	* @env: The load balancing environment.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4754	* @balance: Should we balance.
				4755	* @sds: variable to hold the statistics for this sched_domain.
				4756	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4757	static inline void update_sd_lb_stats(struct lb_env *env,
Joonsoo Kim	23f0d20	2013-08-06 17:36:42 +0900	[diff] [blame]	4758	struct sd_lb_stats *sds)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4759	{
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4760	struct sched_domain *child = env->sd->child;
				4761	struct sched_group *sg = env->sd->groups;
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	4762	struct sg_lb_stats tmp_sgs;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4763	int load_idx, prefer_sibling = 0;
				4764
				4765	if (child && child->flags & SD_PREFER_SIBLING)
				4766	prefer_sibling = 1;
				4767
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4768	load_idx = get_sd_load_idx(env->sd, env->idle);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4769
				4770	do {
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	4771	struct sg_lb_stats *sgs = &tmp_sgs;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4772	int local_group;
				4773
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4774	local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	4775	if (local_group) {
				4776	sds->local = sg;
				4777	sgs = &sds->local_stat;
Peter Zijlstra	b72ff13	2013-08-28 10:32:32 +0200	[diff] [blame]	4778
				4779	if (env->idle != CPU_NEWLY_IDLE \|\|
				4780	time_after_eq(jiffies, sg->sgp->next_update))
				4781	update_group_power(env->sd, env->dst_cpu);
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	4782	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4783
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	4784	update_sg_lb_stats(env, sg, load_idx, local_group, sgs);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4785
Peter Zijlstra	b72ff13	2013-08-28 10:32:32 +0200	[diff] [blame]	4786	if (local_group)
				4787	goto next_group;
				4788
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4789	/*
				4790	* In case the child domain prefers tasks go to siblings
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4791	* first, lower the sg capacity to one so that we'll try
Nikhil Rao	75dd321	2010-10-15 13:12:30 -0700	[diff] [blame]	4792	* and move all the excess tasks away. We lower the capacity
				4793	* of a group only if the local group has the capacity to fit
				4794	* these excess tasks, i.e. nr_running < group_capacity. The
				4795	* extra check prevents the case where you always pull from the
				4796	* heaviest group when it is already under-utilized (possible
				4797	* with a large weight task outweighs the tasks on the system).
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4798	*/
Peter Zijlstra	b72ff13	2013-08-28 10:32:32 +0200	[diff] [blame]	4799	if (prefer_sibling && sds->local &&
				4800	sds->local_stat.group_has_capacity)
Peter Zijlstra	147c5fc	2013-08-19 15:22:57 +0200	[diff] [blame]	4801	sgs->group_capacity = min(sgs->group_capacity, 1U);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4802
Peter Zijlstra	b72ff13	2013-08-28 10:32:32 +0200	[diff] [blame]	4803	if (update_sd_pick_busiest(env, sds, sg, sgs)) {
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4804	sds->busiest = sg;
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	4805	sds->busiest_stat = *sgs;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4806	}
				4807
Peter Zijlstra	b72ff13	2013-08-28 10:32:32 +0200	[diff] [blame]	4808	next_group:
				4809	/* Now, start updating sd_lb_stats */
				4810	sds->total_load += sgs->group_load;
				4811	sds->total_pwr += sgs->group_power;
				4812
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4813	sg = sg->next;
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4814	} while (sg != env->sd->groups);
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4815	}
				4816
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4817	/**
				4818	* check_asym_packing - Check to see if the group is packed into the
				4819	* sched doman.
				4820	*
				4821	* This is primarily intended to used at the sibling level. Some
				4822	* cores like POWER7 prefer to use lower numbered SMT threads. In the
				4823	* case of POWER7, it can move to lower SMT modes only when higher
				4824	* threads are idle. When in lower SMT modes, the threads will
				4825	* perform better since they share less core resources. Hence when we
				4826	* have idle threads, we want them to be the higher ones.
				4827	*
				4828	* This packing function is run on idle threads. It checks to see if
				4829	* the busiest CPU in this domain (core in the P7 case) has a higher
				4830	* CPU number than the packing function is being run on. Here we are
				4831	* assuming lower CPU number will be equivalent to lower a SMT thread
				4832	* number.
				4833	*
Yacine Belkadi	e69f618	2013-07-12 20:45:47 +0200	[diff] [blame]	4834	* Return: 1 when packing is required and a task should be moved to
Michael Neuling	b6b1229	2010-06-10 12:06:21 +1000	[diff] [blame]	4835	* this CPU. The amount of the imbalance is returned in *imbalance.
				4836	*
Randy Dunlap	cd96891	2012-06-08 13:18:33 -0700	[diff] [blame]	4837	* @env: The load balancing environment.
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4838	* @sds: Statistics of the sched_domain which is to be packed
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4839	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4840	static int check_asym_packing(struct lb_env env, struct sd_lb_stats sds)
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4841	{
				4842	int busiest_cpu;
				4843
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4844	if (!(env->sd->flags & SD_ASYM_PACKING))
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4845	return 0;
				4846
				4847	if (!sds->busiest)
				4848	return 0;
				4849
				4850	busiest_cpu = group_first_cpu(sds->busiest);
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4851	if (env->dst_cpu > busiest_cpu)
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4852	return 0;
				4853
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4854	env->imbalance = DIV_ROUND_CLOSEST(
Peter Zijlstra	3ae11c9	2013-08-15 20:37:48 +0200	[diff] [blame]	4855	sds->busiest_stat.avg_load * sds->busiest_stat.group_power,
				4856	SCHED_POWER_SCALE);
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4857
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4858	return 1;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4859	}
				4860
				4861	/**
				4862	* fix_small_imbalance - Calculate the minor imbalance that exists
				4863	* amongst the groups of a sched_domain, during
				4864	* load balancing.
Randy Dunlap	cd96891	2012-06-08 13:18:33 -0700	[diff] [blame]	4865	* @env: The load balancing environment.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4866	* @sds: Statistics of the sched_domain whose imbalance is to be calculated.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4867	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4868	static inline
				4869	void fix_small_imbalance(struct lb_env env, struct sd_lb_stats sds)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4870	{
				4871	unsigned long tmp, pwr_now = 0, pwr_move = 0;
				4872	unsigned int imbn = 2;
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	4873	unsigned long scaled_busy_load_per_task;
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	4874	struct sg_lb_stats local, busiest;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4875
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	4876	local = &sds->local_stat;
				4877	busiest = &sds->busiest_stat;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4878
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	4879	if (!local->sum_nr_running)
				4880	local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);
				4881	else if (busiest->load_per_task > local->load_per_task)
				4882	imbn = 1;
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	4883
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	4884	scaled_busy_load_per_task =
				4885	(busiest->load_per_task * SCHED_POWER_SCALE) /
Peter Zijlstra	3ae11c9	2013-08-15 20:37:48 +0200	[diff] [blame]	4886	busiest->group_power;
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	4887
Vladimir Davydov	3029ede	2013-09-15 17:49:14 +0400	[diff] [blame]	4888	if (busiest->avg_load + scaled_busy_load_per_task >=
				4889	local->avg_load + (scaled_busy_load_per_task * imbn)) {
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	4890	env->imbalance = busiest->load_per_task;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4891	return;
				4892	}
				4893
				4894	/*
				4895	* OK, we don't have enough imbalance to justify moving tasks,
				4896	* however we may be able to increase total CPU power used by
				4897	* moving them.
				4898	*/
				4899
Peter Zijlstra	3ae11c9	2013-08-15 20:37:48 +0200	[diff] [blame]	4900	pwr_now += busiest->group_power *
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	4901	min(busiest->load_per_task, busiest->avg_load);
Peter Zijlstra	3ae11c9	2013-08-15 20:37:48 +0200	[diff] [blame]	4902	pwr_now += local->group_power *
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	4903	min(local->load_per_task, local->avg_load);
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	4904	pwr_now /= SCHED_POWER_SCALE;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4905
				4906	/* Amount of load we'd subtract */
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	4907	tmp = (busiest->load_per_task * SCHED_POWER_SCALE) /
Peter Zijlstra	3ae11c9	2013-08-15 20:37:48 +0200	[diff] [blame]	4908	busiest->group_power;
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	4909	if (busiest->avg_load > tmp) {
Peter Zijlstra	3ae11c9	2013-08-15 20:37:48 +0200	[diff] [blame]	4910	pwr_move += busiest->group_power *
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	4911	min(busiest->load_per_task,
				4912	busiest->avg_load - tmp);
				4913	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4914
				4915	/* Amount of load we'd add */
Peter Zijlstra	3ae11c9	2013-08-15 20:37:48 +0200	[diff] [blame]	4916	if (busiest->avg_load * busiest->group_power <
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	4917	busiest->load_per_task * SCHED_POWER_SCALE) {
Peter Zijlstra	3ae11c9	2013-08-15 20:37:48 +0200	[diff] [blame]	4918	tmp = (busiest->avg_load * busiest->group_power) /
				4919	local->group_power;
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	4920	} else {
				4921	tmp = (busiest->load_per_task * SCHED_POWER_SCALE) /
Peter Zijlstra	3ae11c9	2013-08-15 20:37:48 +0200	[diff] [blame]	4922	local->group_power;
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	4923	}
Peter Zijlstra	3ae11c9	2013-08-15 20:37:48 +0200	[diff] [blame]	4924	pwr_move += local->group_power *
				4925	min(local->load_per_task, local->avg_load + tmp);
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	4926	pwr_move /= SCHED_POWER_SCALE;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4927
				4928	/* Move if we gain throughput */
				4929	if (pwr_move > pwr_now)
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	4930	env->imbalance = busiest->load_per_task;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4931	}
				4932
				4933	/**
				4934	* calculate_imbalance - Calculate the amount of imbalance present within the
				4935	* groups of a given sched_domain during load balance.
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4936	* @env: load balance environment
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4937	* @sds: statistics of the sched_domain whose imbalance is to be calculated.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4938	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4939	static inline void calculate_imbalance(struct lb_env env, struct sd_lb_stats sds)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4940	{
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	4941	unsigned long max_pull, load_above_capacity = ~0UL;
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	4942	struct sg_lb_stats local, busiest;
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	4943
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	4944	local = &sds->local_stat;
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	4945	busiest = &sds->busiest_stat;
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	4946
				4947	if (busiest->group_imb) {
Peter Zijlstra	30ce5da	2013-08-15 20:29:29 +0200	[diff] [blame]	4948	/*
				4949	* In the group_imb case we cannot rely on group-wide averages
				4950	* to ensure cpu-load equilibrium, look at wider averages. XXX
				4951	*/
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	4952	busiest->load_per_task =
				4953	min(busiest->load_per_task, sds->avg_load);
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	4954	}
				4955
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4956	/*
				4957	* In the presence of smp nice balancing, certain scenarios can have
				4958	* max load less than avg load(as we skip the groups at or below
				4959	* its cpu_power, while calculating max_load..)
				4960	*/
Vladimir Davydov	b188555	2013-09-15 17:49:13 +0400	[diff] [blame]	4961	if (busiest->avg_load <= sds->avg_load \|\|
				4962	local->avg_load >= sds->avg_load) {
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4963	env->imbalance = 0;
				4964	return fix_small_imbalance(env, sds);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4965	}
				4966
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	4967	if (!busiest->group_imb) {
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	4968	/*
				4969	* Don't want to pull so many tasks that a group would go idle.
Peter Zijlstra	30ce5da	2013-08-15 20:29:29 +0200	[diff] [blame]	4970	* Except of course for the group_imb case, since then we might
				4971	* have to drop below capacity to reach cpu-load equilibrium.
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	4972	*/
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	4973	load_above_capacity =
				4974	(busiest->sum_nr_running - busiest->group_capacity);
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	4975
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	4976	load_above_capacity = (SCHED_LOAD_SCALE SCHED_POWER_SCALE);
Peter Zijlstra	3ae11c9	2013-08-15 20:37:48 +0200	[diff] [blame]	4977	load_above_capacity /= busiest->group_power;
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	4978	}
				4979
				4980	/*
				4981	* We're trying to get all the cpus to the average_load, so we don't
				4982	* want to push ourselves above the average load, nor do we wish to
				4983	* reduce the max loaded cpu below the average load. At the same time,
				4984	* we also don't want to reduce the group load below the group capacity
				4985	* (so that we can implement power-savings policies etc). Thus we look
				4986	* for the minimum possible imbalance.
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	4987	*/
Peter Zijlstra	30ce5da	2013-08-15 20:29:29 +0200	[diff] [blame]	4988	max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4989
				4990	/* How much load to actually move to equalise the imbalance */
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	4991	env->imbalance = min(
Peter Zijlstra	3ae11c9	2013-08-15 20:37:48 +0200	[diff] [blame]	4992	max_pull * busiest->group_power,
				4993	(sds->avg_load - local->avg_load) * local->group_power
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	4994	) / SCHED_POWER_SCALE;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4995
				4996	/*
				4997	* if *imbalance is less than the average load per runnable task
Lucas De Marchi	25985ed	2011-03-30 22:57:33 -0300	[diff] [blame]	4998	* there is no guarantee that any tasks will be moved so we'll have
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4999	* a think about bumping its value to force at least one task to be
				5000	* moved
				5001	*/
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	5002	if (env->imbalance < busiest->load_per_task)
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	5003	return fix_small_imbalance(env, sds);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5004	}
Nikhil Rao	fab4762	2010-10-15 13:12:29 -0700	[diff] [blame]	5005
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5006	/***** find_busiest_group() helpers end here *******************/
				5007
				5008	/**
				5009	* find_busiest_group - Returns the busiest group within the sched_domain
				5010	* if there is an imbalance. If there isn't an imbalance, and
				5011	* the user has opted for power-savings, it returns a group whose
				5012	* CPUs can be put to idle by rebalancing those tasks elsewhere, if
				5013	* such a group exists.
				5014	*
				5015	* Also calculates the amount of weighted load which should be moved
				5016	* to restore balance.
				5017	*
Randy Dunlap	cd96891	2012-06-08 13:18:33 -0700	[diff] [blame]	5018	* @env: The load balancing environment.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5019	*
Yacine Belkadi	e69f618	2013-07-12 20:45:47 +0200	[diff] [blame]	5020	* Return: - The busiest group if imbalance exists.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5021	* - If no imbalance and user has opted for power-savings balance,
				5022	* return the least loaded group whose CPUs can be
				5023	* put to idle by rebalancing its tasks onto our group.
				5024	*/
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	5025	static struct sched_group find_busiest_group(struct lb_env env)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5026	{
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	5027	struct sg_lb_stats local, busiest;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5028	struct sd_lb_stats sds;
				5029
Peter Zijlstra	147c5fc	2013-08-19 15:22:57 +0200	[diff] [blame]	5030	init_sd_lb_stats(&sds);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5031
				5032	/*
				5033	* Compute the various statistics relavent for load balancing at
				5034	* this level.
				5035	*/
Joonsoo Kim	23f0d20	2013-08-06 17:36:42 +0900	[diff] [blame]	5036	update_sd_lb_stats(env, &sds);
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	5037	local = &sds.local_stat;
				5038	busiest = &sds.busiest_stat;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5039
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	5040	if ((env->idle == CPU_IDLE \|\| env->idle == CPU_NEWLY_IDLE) &&
				5041	check_asym_packing(env, &sds))
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	5042	return sds.busiest;
				5043
Peter Zijlstra	cc57aa8	2011-02-21 18:55:32 +0100	[diff] [blame]	5044	/* There is no busy sibling group to pull tasks from */
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	5045	if (!sds.busiest \|\| busiest->sum_nr_running == 0)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5046	goto out_balanced;
				5047
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	5048	sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr;
Ken Chen	b0432d8	2011-04-07 17:23:22 -0700	[diff] [blame]	5049
Peter Zijlstra	866ab43	2011-02-21 18:56:47 +0100	[diff] [blame]	5050	/*
				5051	* If the busiest group is imbalanced the below checks don't
Peter Zijlstra	30ce5da	2013-08-15 20:29:29 +0200	[diff] [blame]	5052	* work because they assume all things are equal, which typically
Peter Zijlstra	866ab43	2011-02-21 18:56:47 +0100	[diff] [blame]	5053	* isn't true due to cpus_allowed constraints and the like.
				5054	*/
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	5055	if (busiest->group_imb)
Peter Zijlstra	866ab43	2011-02-21 18:56:47 +0100	[diff] [blame]	5056	goto force_balance;
				5057
Peter Zijlstra	cc57aa8	2011-02-21 18:55:32 +0100	[diff] [blame]	5058	/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	5059	if (env->idle == CPU_NEWLY_IDLE && local->group_has_capacity &&
				5060	!busiest->group_has_capacity)
Nikhil Rao	fab4762	2010-10-15 13:12:29 -0700	[diff] [blame]	5061	goto force_balance;
				5062
Peter Zijlstra	cc57aa8	2011-02-21 18:55:32 +0100	[diff] [blame]	5063	/*
				5064	* If the local group is more busy than the selected busiest group
				5065	* don't try and pull any tasks.
				5066	*/
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	5067	if (local->avg_load >= busiest->avg_load)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5068	goto out_balanced;
				5069
Peter Zijlstra	cc57aa8	2011-02-21 18:55:32 +0100	[diff] [blame]	5070	/*
				5071	* Don't pull any tasks if this group is already above the domain
				5072	* average load.
				5073	*/
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	5074	if (local->avg_load >= sds.avg_load)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5075	goto out_balanced;
				5076
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	5077	if (env->idle == CPU_IDLE) {
Suresh Siddha	aae6d3d	2010-09-17 15:02:32 -0700	[diff] [blame]	5078	/*
				5079	* This cpu is idle. If the busiest group load doesn't
				5080	* have more tasks than the number of available cpu's and
				5081	* there is no imbalance between this and busiest group
				5082	* wrt to idle cpu's, it is balanced.
				5083	*/
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	5084	if ((local->idle_cpus < busiest->idle_cpus) &&
				5085	busiest->sum_nr_running <= busiest->group_weight)
Suresh Siddha	aae6d3d	2010-09-17 15:02:32 -0700	[diff] [blame]	5086	goto out_balanced;
Peter Zijlstra	c186faf	2011-02-21 18:52:53 +0100	[diff] [blame]	5087	} else {
				5088	/*
				5089	* In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
				5090	* imbalance_pct to be conservative.
				5091	*/
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	5092	if (100 * busiest->avg_load <=
				5093	env->sd->imbalance_pct * local->avg_load)
Peter Zijlstra	c186faf	2011-02-21 18:52:53 +0100	[diff] [blame]	5094	goto out_balanced;
Suresh Siddha	aae6d3d	2010-09-17 15:02:32 -0700	[diff] [blame]	5095	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5096
Nikhil Rao	fab4762	2010-10-15 13:12:29 -0700	[diff] [blame]	5097	force_balance:
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5098	/* Looks like there is an imbalance. Compute it */
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	5099	calculate_imbalance(env, &sds);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5100	return sds.busiest;
				5101
				5102	out_balanced:
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	5103	env->imbalance = 0;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5104	return NULL;
				5105	}
				5106
				5107	/*
				5108	* find_busiest_queue - find the busiest runqueue among the cpus in group.
				5109	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	5110	static struct rq find_busiest_queue(struct lb_env env,
Michael Wang	b940313	2012-07-12 16:10:13 +0800	[diff] [blame]	5111	struct sched_group *group)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5112	{
				5113	struct rq busiest = NULL, rq;
Joonsoo Kim	95a79b8	2013-08-06 17:36:41 +0900	[diff] [blame]	5114	unsigned long busiest_load = 0, busiest_power = 1;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5115	int i;
				5116
Peter Zijlstra	6906a40	2013-08-19 15:20:21 +0200	[diff] [blame]	5117	for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5118	unsigned long power = power_of(i);
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	5119	unsigned long capacity = DIV_ROUND_CLOSEST(power,
				5120	SCHED_POWER_SCALE);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5121	unsigned long wl;
				5122
Srivatsa Vaddagiri	9d5efe0	2010-06-08 14:57:02 +1000	[diff] [blame]	5123	if (!capacity)
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	5124	capacity = fix_small_capacity(env->sd, group);
Srivatsa Vaddagiri	9d5efe0	2010-06-08 14:57:02 +1000	[diff] [blame]	5125
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5126	rq = cpu_rq(i);
Thomas Gleixner	6e40f5b	2010-02-16 16:48:56 +0100	[diff] [blame]	5127	wl = weighted_cpuload(i);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5128
Thomas Gleixner	6e40f5b	2010-02-16 16:48:56 +0100	[diff] [blame]	5129	/*
				5130	* When comparing with imbalance, use weighted_cpuload()
				5131	* which is not scaled with the cpu power.
				5132	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	5133	if (capacity && rq->nr_running == 1 && wl > env->imbalance)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5134	continue;
				5135
Thomas Gleixner	6e40f5b	2010-02-16 16:48:56 +0100	[diff] [blame]	5136	/*
				5137	* For the load comparisons with the other cpu's, consider
				5138	* the weighted_cpuload() scaled with the cpu power, so that
				5139	* the load can be moved away from the cpu that is potentially
				5140	* running at a lower capacity.
Joonsoo Kim	95a79b8	2013-08-06 17:36:41 +0900	[diff] [blame]	5141	*
				5142	* Thus we're looking for max(wl_i / power_i), crosswise
				5143	* multiplication to rid ourselves of the division works out
				5144	* to: wl_i * power_j > wl_j * power_i; where j is our
				5145	* previous maximum.
Thomas Gleixner	6e40f5b	2010-02-16 16:48:56 +0100	[diff] [blame]	5146	*/
Joonsoo Kim	95a79b8	2013-08-06 17:36:41 +0900	[diff] [blame]	5147	if (wl * busiest_power > busiest_load * power) {
				5148	busiest_load = wl;
				5149	busiest_power = power;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5150	busiest = rq;
				5151	}
				5152	}
				5153
				5154	return busiest;
				5155	}
				5156
				5157	/*
				5158	* Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
				5159	* so long as it is large enough.
				5160	*/
				5161	#define MAX_PINNED_INTERVAL 512
				5162
				5163	/* Working cpumask for load_balance and load_balance_newidle. */
Joonsoo Kim	e6252c3	2013-04-23 17:27:41 +0900	[diff] [blame]	5164	DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5165
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	5166	static int need_active_balance(struct lb_env *env)
Peter Zijlstra	1af3ed3	2009-12-23 15:10:31 +0100	[diff] [blame]	5167	{
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	5168	struct sched_domain *sd = env->sd;
				5169
				5170	if (env->idle == CPU_NEWLY_IDLE) {
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	5171
				5172	/*
				5173	* ASYM_PACKING needs to force migrate tasks from busy but
				5174	* higher numbered CPUs in order to pack all tasks in the
				5175	* lowest numbered CPUs.
				5176	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	5177	if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	5178	return 1;
Peter Zijlstra	1af3ed3	2009-12-23 15:10:31 +0100	[diff] [blame]	5179	}
				5180
				5181	return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
				5182	}
				5183
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	5184	static int active_load_balance_cpu_stop(void *data);
				5185
Joonsoo Kim	23f0d20	2013-08-06 17:36:42 +0900	[diff] [blame]	5186	static int should_we_balance(struct lb_env *env)
				5187	{
				5188	struct sched_group *sg = env->sd->groups;
				5189	struct cpumask sg_cpus, sg_mask;
				5190	int cpu, balance_cpu = -1;
				5191
				5192	/*
				5193	* In the newly idle case, we will allow all the cpu's
				5194	* to do the newly idle load balance.
				5195	*/
				5196	if (env->idle == CPU_NEWLY_IDLE)
				5197	return 1;
				5198
				5199	sg_cpus = sched_group_cpus(sg);
				5200	sg_mask = sched_group_mask(sg);
				5201	/* Try to find first idle cpu */
				5202	for_each_cpu_and(cpu, sg_cpus, env->cpus) {
				5203	if (!cpumask_test_cpu(cpu, sg_mask) \|\| !idle_cpu(cpu))
				5204	continue;
				5205
				5206	balance_cpu = cpu;
				5207	break;
				5208	}
				5209
				5210	if (balance_cpu == -1)
				5211	balance_cpu = group_balance_cpu(sg);
				5212
				5213	/*
				5214	* First idle cpu or the first cpu(busiest) in this sched group
				5215	* is eligible for doing load balancing at this and above domains.
				5216	*/
Joonsoo Kim	b0cff9d	2013-09-10 15:54:49 +0900	[diff] [blame]	5217	return balance_cpu == env->dst_cpu;
Joonsoo Kim	23f0d20	2013-08-06 17:36:42 +0900	[diff] [blame]	5218	}
				5219
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5220	/*
				5221	* Check this_cpu to ensure it is balanced within domain. Attempt to move
				5222	* tasks if there is an imbalance.
				5223	*/
				5224	static int load_balance(int this_cpu, struct rq *this_rq,
				5225	struct sched_domain *sd, enum cpu_idle_type idle,
Joonsoo Kim	23f0d20	2013-08-06 17:36:42 +0900	[diff] [blame]	5226	int *continue_balancing)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5227	{
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	5228	int ld_moved, cur_ld_moved, active_balance = 0;
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	5229	struct sched_domain *sd_parent = sd->parent;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5230	struct sched_group *group;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5231	struct rq *busiest;
				5232	unsigned long flags;
Joonsoo Kim	e6252c3	2013-04-23 17:27:41 +0900	[diff] [blame]	5233	struct cpumask *cpus = __get_cpu_var(load_balance_mask);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5234
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	5235	struct lb_env env = {
				5236	.sd = sd,
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	5237	.dst_cpu = this_cpu,
				5238	.dst_rq = this_rq,
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	5239	.dst_grpmask = sched_group_cpus(sd->groups),
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	5240	.idle = idle,
Peter Zijlstra	eb95308	2012-04-17 13:38:40 +0200	[diff] [blame]	5241	.loop_break = sched_nr_migrate_break,
Michael Wang	b940313	2012-07-12 16:10:13 +0800	[diff] [blame]	5242	.cpus = cpus,
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	5243	};
				5244
Joonsoo Kim	cfc0311	2013-04-23 17:27:39 +0900	[diff] [blame]	5245	/*
				5246	* For NEWLY_IDLE load_balancing, we don't need to consider
				5247	* other cpus in our group
				5248	*/
Joonsoo Kim	e02e60c	2013-04-23 17:27:42 +0900	[diff] [blame]	5249	if (idle == CPU_NEWLY_IDLE)
Joonsoo Kim	cfc0311	2013-04-23 17:27:39 +0900	[diff] [blame]	5250	env.dst_grpmask = NULL;
Joonsoo Kim	cfc0311	2013-04-23 17:27:39 +0900	[diff] [blame]	5251
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5252	cpumask_copy(cpus, cpu_active_mask);
				5253
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5254	schedstat_inc(sd, lb_count[idle]);
				5255
				5256	redo:
Joonsoo Kim	23f0d20	2013-08-06 17:36:42 +0900	[diff] [blame]	5257	if (!should_we_balance(&env)) {
				5258	*continue_balancing = 0;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5259	goto out_balanced;
Joonsoo Kim	23f0d20	2013-08-06 17:36:42 +0900	[diff] [blame]	5260	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5261
Joonsoo Kim	23f0d20	2013-08-06 17:36:42 +0900	[diff] [blame]	5262	group = find_busiest_group(&env);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5263	if (!group) {
				5264	schedstat_inc(sd, lb_nobusyg[idle]);
				5265	goto out_balanced;
				5266	}
				5267
Michael Wang	b940313	2012-07-12 16:10:13 +0800	[diff] [blame]	5268	busiest = find_busiest_queue(&env, group);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5269	if (!busiest) {
				5270	schedstat_inc(sd, lb_nobusyq[idle]);
				5271	goto out_balanced;
				5272	}
				5273
Michael Wang	78feefc	2012-08-06 16:41:59 +0800	[diff] [blame]	5274	BUG_ON(busiest == env.dst_rq);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5275
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	5276	schedstat_add(sd, lb_imbalance[idle], env.imbalance);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5277
				5278	ld_moved = 0;
				5279	if (busiest->nr_running > 1) {
				5280	/*
				5281	* Attempt to move tasks. If find_busiest_group has found
				5282	* an imbalance but busiest->nr_running <= 1, the group is
				5283	* still unbalanced. ld_moved simply stays zero, so it is
				5284	* correctly treated as an imbalance.
				5285	*/
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	5286	env.flags \|= LBF_ALL_PINNED;
Peter Zijlstra	c82513e	2012-04-26 13:12:27 +0200	[diff] [blame]	5287	env.src_cpu = busiest->cpu;
				5288	env.src_rq = busiest;
				5289	env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	5290
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	5291	more_balance:
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5292	local_irq_save(flags);
Michael Wang	78feefc	2012-08-06 16:41:59 +0800	[diff] [blame]	5293	double_rq_lock(env.dst_rq, busiest);
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	5294
				5295	/*
				5296	* cur_ld_moved - load moved in current iteration
				5297	* ld_moved - cumulative load moved across iterations
				5298	*/
				5299	cur_ld_moved = move_tasks(&env);
				5300	ld_moved += cur_ld_moved;
Michael Wang	78feefc	2012-08-06 16:41:59 +0800	[diff] [blame]	5301	double_rq_unlock(env.dst_rq, busiest);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5302	local_irq_restore(flags);
				5303
				5304	/*
				5305	* some other cpu did the load balance for us.
				5306	*/
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	5307	if (cur_ld_moved && env.dst_cpu != smp_processor_id())
				5308	resched_cpu(env.dst_cpu);
				5309
Joonsoo Kim	f1cd085	2013-04-23 17:27:37 +0900	[diff] [blame]	5310	if (env.flags & LBF_NEED_BREAK) {
				5311	env.flags &= ~LBF_NEED_BREAK;
				5312	goto more_balance;
				5313	}
				5314
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	5315	/*
				5316	* Revisit (affine) tasks on src_cpu that couldn't be moved to
				5317	* us and move them to an alternate dst_cpu in our sched_group
				5318	* where they can run. The upper limit on how many times we
				5319	* iterate on same src_cpu is dependent on number of cpus in our
				5320	* sched_group.
				5321	*
				5322	* This changes load balance semantics a bit on who can move
				5323	* load to a given_cpu. In addition to the given_cpu itself
				5324	* (or a ilb_cpu acting on its behalf where given_cpu is
				5325	* nohz-idle), we now have balance_cpu in a position to move
				5326	* load to given_cpu. In rare situations, this may cause
				5327	* conflicts (balance_cpu and given_cpu/ilb_cpu deciding
				5328	* _independently_ and at _same_ time to move some load to
				5329	* given_cpu) causing exceess load to be moved to given_cpu.
				5330	* This however should not happen so much in practice and
				5331	* moreover subsequent load balance cycles should correct the
				5332	* excess load moved.
				5333	*/
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	5334	if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	5335
Vladimir Davydov	7aff2e3	2013-09-15 21:30:13 +0400	[diff] [blame]	5336	/* Prevent to re-select dst_cpu via env's cpus */
				5337	cpumask_clear_cpu(env.dst_cpu, env.cpus);
				5338
Michael Wang	78feefc	2012-08-06 16:41:59 +0800	[diff] [blame]	5339	env.dst_rq = cpu_rq(env.new_dst_cpu);
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	5340	env.dst_cpu = env.new_dst_cpu;
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	5341	env.flags &= ~LBF_DST_PINNED;
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	5342	env.loop = 0;
				5343	env.loop_break = sched_nr_migrate_break;
Joonsoo Kim	e02e60c	2013-04-23 17:27:42 +0900	[diff] [blame]	5344
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	5345	/*
				5346	* Go back to "more_balance" rather than "redo" since we
				5347	* need to continue with same src_cpu.
				5348	*/
				5349	goto more_balance;
				5350	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5351
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	5352	/*
				5353	* We failed to reach balance because of affinity.
				5354	*/
				5355	if (sd_parent) {
				5356	int *group_imbalance = &sd_parent->groups->sgp->imbalance;
				5357
				5358	if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) {
				5359	*group_imbalance = 1;
				5360	} else if (*group_imbalance)
				5361	*group_imbalance = 0;
				5362	}
				5363
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5364	/* All tasks on this runqueue were pinned by CPU affinity */
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	5365	if (unlikely(env.flags & LBF_ALL_PINNED)) {
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5366	cpumask_clear_cpu(cpu_of(busiest), cpus);
Prashanth Nageshappa	bbf18b1	2012-06-19 17:52:07 +0530	[diff] [blame]	5367	if (!cpumask_empty(cpus)) {
				5368	env.loop = 0;
				5369	env.loop_break = sched_nr_migrate_break;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5370	goto redo;
Prashanth Nageshappa	bbf18b1	2012-06-19 17:52:07 +0530	[diff] [blame]	5371	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5372	goto out_balanced;
				5373	}
				5374	}
				5375
				5376	if (!ld_moved) {
				5377	schedstat_inc(sd, lb_failed[idle]);
Venkatesh Pallipadi	58b26c4	2010-09-10 18:19:17 -0700	[diff] [blame]	5378	/*
				5379	* Increment the failure counter only on periodic balance.
				5380	* We do not want newidle balance, which can be very
				5381	* frequent, pollute the failure counter causing
				5382	* excessive cache_hot migrations and active balances.
				5383	*/
				5384	if (idle != CPU_NEWLY_IDLE)
				5385	sd->nr_balance_failed++;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5386
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	5387	if (need_active_balance(&env)) {
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5388	raw_spin_lock_irqsave(&busiest->lock, flags);
				5389
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	5390	/* don't kick the active_load_balance_cpu_stop,
				5391	* if the curr task on busiest cpu can't be
				5392	* moved to this_cpu
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5393	*/
				5394	if (!cpumask_test_cpu(this_cpu,
Peter Zijlstra	fa17b50	2011-06-16 12:23:22 +0200	[diff] [blame]	5395	tsk_cpus_allowed(busiest->curr))) {
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5396	raw_spin_unlock_irqrestore(&busiest->lock,
				5397	flags);
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	5398	env.flags \|= LBF_ALL_PINNED;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5399	goto out_one_pinned;
				5400	}
				5401
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	5402	/*
				5403	* ->active_balance synchronizes accesses to
				5404	* ->active_balance_work. Once set, it's cleared
				5405	* only after active load balance is finished.
				5406	*/
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5407	if (!busiest->active_balance) {
				5408	busiest->active_balance = 1;
				5409	busiest->push_cpu = this_cpu;
				5410	active_balance = 1;
				5411	}
				5412	raw_spin_unlock_irqrestore(&busiest->lock, flags);
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	5413
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	5414	if (active_balance) {
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	5415	stop_one_cpu_nowait(cpu_of(busiest),
				5416	active_load_balance_cpu_stop, busiest,
				5417	&busiest->active_balance_work);
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	5418	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5419
				5420	/*
				5421	* We've kicked active balancing, reset the failure
				5422	* counter.
				5423	*/
				5424	sd->nr_balance_failed = sd->cache_nice_tries+1;
				5425	}
				5426	} else
				5427	sd->nr_balance_failed = 0;
				5428
				5429	if (likely(!active_balance)) {
				5430	/* We were unbalanced, so reset the balancing interval */
				5431	sd->balance_interval = sd->min_interval;
				5432	} else {
				5433	/*
				5434	* If we've begun active balancing, start to back off. This
				5435	* case may not be covered by the all_pinned logic if there
				5436	* is only 1 task on the busy runqueue (because we don't call
				5437	* move_tasks).
				5438	*/
				5439	if (sd->balance_interval < sd->max_interval)
				5440	sd->balance_interval *= 2;
				5441	}
				5442
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5443	goto out;
				5444
				5445	out_balanced:
				5446	schedstat_inc(sd, lb_balanced[idle]);
				5447
				5448	sd->nr_balance_failed = 0;
				5449
				5450	out_one_pinned:
				5451	/* tune up the balancing interval */
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	5452	if (((env.flags & LBF_ALL_PINNED) &&
Peter Zijlstra	5b54b56	2011-09-22 15:23:13 +0200	[diff] [blame]	5453	sd->balance_interval < MAX_PINNED_INTERVAL) \|\|
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5454	(sd->balance_interval < sd->max_interval))
				5455	sd->balance_interval *= 2;
				5456
Venkatesh Pallipadi	46e49b3	2011-02-14 14:38:50 -0800	[diff] [blame]	5457	ld_moved = 0;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5458	out:
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5459	return ld_moved;
				5460	}
				5461
				5462	/*
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5463	* idle_balance is called by schedule() if this_cpu is about to become
				5464	* idle. Attempts to pull tasks from other CPUs.
				5465	*/
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	5466	void idle_balance(int this_cpu, struct rq *this_rq)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5467	{
				5468	struct sched_domain *sd;
				5469	int pulled_task = 0;
				5470	unsigned long next_balance = jiffies + HZ;
Jason Low	9bd721c	2013-09-13 11:26:52 -0700	[diff] [blame]	5471	u64 curr_cost = 0;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5472
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	5473	this_rq->idle_stamp = rq_clock(this_rq);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5474
				5475	if (this_rq->avg_idle < sysctl_sched_migration_cost)
				5476	return;
				5477
Peter Zijlstra	f492e12	2009-12-23 15:29:42 +0100	[diff] [blame]	5478	/*
				5479	* Drop the rq->lock, but keep IRQ/preempt disabled.
				5480	*/
				5481	raw_spin_unlock(&this_rq->lock);
				5482
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	5483	update_blocked_averages(this_cpu);
Peter Zijlstra	dce840a	2011-04-07 14:09:50 +0200	[diff] [blame]	5484	rcu_read_lock();
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5485	for_each_domain(this_cpu, sd) {
				5486	unsigned long interval;
Joonsoo Kim	23f0d20	2013-08-06 17:36:42 +0900	[diff] [blame]	5487	int continue_balancing = 1;
Jason Low	9bd721c	2013-09-13 11:26:52 -0700	[diff] [blame]	5488	u64 t0, domain_cost;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5489
				5490	if (!(sd->flags & SD_LOAD_BALANCE))
				5491	continue;
				5492
Jason Low	9bd721c	2013-09-13 11:26:52 -0700	[diff] [blame]	5493	if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost)
				5494	break;
				5495
Peter Zijlstra	f492e12	2009-12-23 15:29:42 +0100	[diff] [blame]	5496	if (sd->flags & SD_BALANCE_NEWIDLE) {
Jason Low	9bd721c	2013-09-13 11:26:52 -0700	[diff] [blame]	5497	t0 = sched_clock_cpu(this_cpu);
				5498
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5499	/* If we've pulled tasks over stop searching: */
Peter Zijlstra	f492e12	2009-12-23 15:29:42 +0100	[diff] [blame]	5500	pulled_task = load_balance(this_cpu, this_rq,
Joonsoo Kim	23f0d20	2013-08-06 17:36:42 +0900	[diff] [blame]	5501	sd, CPU_NEWLY_IDLE,
				5502	&continue_balancing);
Jason Low	9bd721c	2013-09-13 11:26:52 -0700	[diff] [blame]	5503
				5504	domain_cost = sched_clock_cpu(this_cpu) - t0;
				5505	if (domain_cost > sd->max_newidle_lb_cost)
				5506	sd->max_newidle_lb_cost = domain_cost;
				5507
				5508	curr_cost += domain_cost;
Peter Zijlstra	f492e12	2009-12-23 15:29:42 +0100	[diff] [blame]	5509	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5510
				5511	interval = msecs_to_jiffies(sd->balance_interval);
				5512	if (time_after(next_balance, sd->last_balance + interval))
				5513	next_balance = sd->last_balance + interval;
Nikhil Rao	d5ad140	2010-11-17 11:42:04 -0800	[diff] [blame]	5514	if (pulled_task) {
				5515	this_rq->idle_stamp = 0;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5516	break;
Nikhil Rao	d5ad140	2010-11-17 11:42:04 -0800	[diff] [blame]	5517	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5518	}
Peter Zijlstra	dce840a	2011-04-07 14:09:50 +0200	[diff] [blame]	5519	rcu_read_unlock();
Peter Zijlstra	f492e12	2009-12-23 15:29:42 +0100	[diff] [blame]	5520
				5521	raw_spin_lock(&this_rq->lock);
				5522
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5523	if (pulled_task \|\| time_after(jiffies, this_rq->next_balance)) {
				5524	/*
				5525	* We are going idle. next_balance may be set based on
				5526	* a busy processor. So reset next_balance.
				5527	*/
				5528	this_rq->next_balance = next_balance;
				5529	}
Jason Low	9bd721c	2013-09-13 11:26:52 -0700	[diff] [blame]	5530
				5531	if (curr_cost > this_rq->max_idle_balance_cost)
				5532	this_rq->max_idle_balance_cost = curr_cost;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5533	}
				5534
				5535	/*
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	5536	* active_load_balance_cpu_stop is run by cpu stopper. It pushes
				5537	* running tasks off the busiest CPU onto idle CPUs. It requires at
				5538	* least 1 task to be running on each physical CPU where possible, and
				5539	* avoids physical / logical imbalances.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5540	*/
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	5541	static int active_load_balance_cpu_stop(void *data)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5542	{
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	5543	struct rq *busiest_rq = data;
				5544	int busiest_cpu = cpu_of(busiest_rq);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5545	int target_cpu = busiest_rq->push_cpu;
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	5546	struct rq *target_rq = cpu_rq(target_cpu);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5547	struct sched_domain *sd;
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	5548
				5549	raw_spin_lock_irq(&busiest_rq->lock);
				5550
				5551	/* make sure the requested cpu hasn't gone down in the meantime */
				5552	if (unlikely(busiest_cpu != smp_processor_id() \|\|
				5553	!busiest_rq->active_balance))
				5554	goto out_unlock;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5555
				5556	/* Is there any task to move? */
				5557	if (busiest_rq->nr_running <= 1)
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	5558	goto out_unlock;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5559
				5560	/*
				5561	* This condition is "impossible", if it occurs
				5562	* we need to fix it. Originally reported by
				5563	* Bjorn Helgaas on a 128-cpu setup.
				5564	*/
				5565	BUG_ON(busiest_rq == target_rq);
				5566
				5567	/* move a task from busiest_rq to target_rq */
				5568	double_lock_balance(busiest_rq, target_rq);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5569
				5570	/* Search for an sd spanning us and the target CPU. */
Peter Zijlstra	dce840a	2011-04-07 14:09:50 +0200	[diff] [blame]	5571	rcu_read_lock();
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5572	for_each_domain(target_cpu, sd) {
				5573	if ((sd->flags & SD_LOAD_BALANCE) &&
				5574	cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
				5575	break;
				5576	}
				5577
				5578	if (likely(sd)) {
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	5579	struct lb_env env = {
				5580	.sd = sd,
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	5581	.dst_cpu = target_cpu,
				5582	.dst_rq = target_rq,
				5583	.src_cpu = busiest_rq->cpu,
				5584	.src_rq = busiest_rq,
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	5585	.idle = CPU_IDLE,
				5586	};
				5587
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5588	schedstat_inc(sd, alb_count);
				5589
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	5590	if (move_one_task(&env))
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5591	schedstat_inc(sd, alb_pushed);
				5592	else
				5593	schedstat_inc(sd, alb_failed);
				5594	}
Peter Zijlstra	dce840a	2011-04-07 14:09:50 +0200	[diff] [blame]	5595	rcu_read_unlock();
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5596	double_unlock_balance(busiest_rq, target_rq);
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	5597	out_unlock:
				5598	busiest_rq->active_balance = 0;
				5599	raw_spin_unlock_irq(&busiest_rq->lock);
				5600	return 0;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5601	}
				5602
Frederic Weisbecker	3451d02	2011-08-10 23:21:01 +0200	[diff] [blame]	5603	#ifdef CONFIG_NO_HZ_COMMON
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5604	/*
				5605	* idle load balancing details
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5606	* - When one of the busy CPUs notice that there may be an idle rebalancing
				5607	* needed, they will kick the idle load balancer, which then does idle
				5608	* load balancing for all the idle CPUs.
				5609	*/
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5610	static struct {
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5611	cpumask_var_t idle_cpus_mask;
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	5612	atomic_t nr_cpus;
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5613	unsigned long next_balance; /* in jiffy units */
				5614	} nohz ____cacheline_aligned;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5615
Peter Zijlstra	8e7fbcb	2012-01-09 11:28:35 +0100	[diff] [blame]	5616	static inline int find_new_ilb(int call_cpu)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5617	{
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	5618	int ilb = cpumask_first(nohz.idle_cpus_mask);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5619
Suresh Siddha	786d6dc	2011-12-01 17:07:35 -0800	[diff] [blame]	5620	if (ilb < nr_cpu_ids && idle_cpu(ilb))
				5621	return ilb;
				5622
				5623	return nr_cpu_ids;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5624	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5625
				5626	/*
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5627	* Kick a CPU to do the nohz balancing, if it is time for it. We pick the
				5628	* nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
				5629	* CPU (if there is one).
				5630	*/
				5631	static void nohz_balancer_kick(int cpu)
				5632	{
				5633	int ilb_cpu;
				5634
				5635	nohz.next_balance++;
				5636
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	5637	ilb_cpu = find_new_ilb(cpu);
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5638
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	5639	if (ilb_cpu >= nr_cpu_ids)
				5640	return;
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5641
Suresh Siddha	cd490c5	2011-12-06 11:26:34 -0800	[diff] [blame]	5642	if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
Suresh Siddha	1c792db	2011-12-01 17:07:32 -0800	[diff] [blame]	5643	return;
				5644	/*
				5645	* Use smp_send_reschedule() instead of resched_cpu().
				5646	* This way we generate a sched IPI on the target cpu which
				5647	* is idle. And the softirq performing nohz idle load balance
				5648	* will be run before returning from the IPI.
				5649	*/
				5650	smp_send_reschedule(ilb_cpu);
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5651	return;
				5652	}
				5653
Alex Shi	c1cc017	2012-09-10 15:10:58 +0800	[diff] [blame]	5654	static inline void nohz_balance_exit_idle(int cpu)
Suresh Siddha	7132596	2012-01-19 18:28:57 -0800	[diff] [blame]	5655	{
				5656	if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
				5657	cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
				5658	atomic_dec(&nohz.nr_cpus);
				5659	clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
				5660	}
				5661	}
				5662
Suresh Siddha	69e1e81	2011-12-01 17:07:33 -0800	[diff] [blame]	5663	static inline void set_cpu_sd_state_busy(void)
				5664	{
				5665	struct sched_domain *sd;
Suresh Siddha	69e1e81	2011-12-01 17:07:33 -0800	[diff] [blame]	5666
Suresh Siddha	69e1e81	2011-12-01 17:07:33 -0800	[diff] [blame]	5667	rcu_read_lock();
Nathan Zimmer	424c93f	2013-05-09 11:24:03 -0500	[diff] [blame]	5668	sd = rcu_dereference_check_sched_domain(this_rq()->sd);
Vincent Guittot	25f55d9	2013-04-23 16:59:02 +0200	[diff] [blame]	5669
				5670	if (!sd \|\| !sd->nohz_idle)
				5671	goto unlock;
				5672	sd->nohz_idle = 0;
				5673
				5674	for (; sd; sd = sd->parent)
Suresh Siddha	69e1e81	2011-12-01 17:07:33 -0800	[diff] [blame]	5675	atomic_inc(&sd->groups->sgp->nr_busy_cpus);
Vincent Guittot	25f55d9	2013-04-23 16:59:02 +0200	[diff] [blame]	5676	unlock:
Suresh Siddha	69e1e81	2011-12-01 17:07:33 -0800	[diff] [blame]	5677	rcu_read_unlock();
				5678	}
				5679
				5680	void set_cpu_sd_state_idle(void)
				5681	{
				5682	struct sched_domain *sd;
Suresh Siddha	69e1e81	2011-12-01 17:07:33 -0800	[diff] [blame]	5683
Suresh Siddha	69e1e81	2011-12-01 17:07:33 -0800	[diff] [blame]	5684	rcu_read_lock();
Nathan Zimmer	424c93f	2013-05-09 11:24:03 -0500	[diff] [blame]	5685	sd = rcu_dereference_check_sched_domain(this_rq()->sd);
Vincent Guittot	25f55d9	2013-04-23 16:59:02 +0200	[diff] [blame]	5686
				5687	if (!sd \|\| sd->nohz_idle)
				5688	goto unlock;
				5689	sd->nohz_idle = 1;
				5690
				5691	for (; sd; sd = sd->parent)
Suresh Siddha	69e1e81	2011-12-01 17:07:33 -0800	[diff] [blame]	5692	atomic_dec(&sd->groups->sgp->nr_busy_cpus);
Vincent Guittot	25f55d9	2013-04-23 16:59:02 +0200	[diff] [blame]	5693	unlock:
Suresh Siddha	69e1e81	2011-12-01 17:07:33 -0800	[diff] [blame]	5694	rcu_read_unlock();
				5695	}
				5696
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5697	/*
Alex Shi	c1cc017	2012-09-10 15:10:58 +0800	[diff] [blame]	5698	* This routine will record that the cpu is going idle with tick stopped.
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	5699	* This info will be used in performing idle load balancing in the future.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5700	*/
Alex Shi	c1cc017	2012-09-10 15:10:58 +0800	[diff] [blame]	5701	void nohz_balance_enter_idle(int cpu)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5702	{
Suresh Siddha	7132596	2012-01-19 18:28:57 -0800	[diff] [blame]	5703	/*
				5704	* If this cpu is going down, then nothing needs to be done.
				5705	*/
				5706	if (!cpu_active(cpu))
				5707	return;
				5708
Alex Shi	c1cc017	2012-09-10 15:10:58 +0800	[diff] [blame]	5709	if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
				5710	return;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5711
Alex Shi	c1cc017	2012-09-10 15:10:58 +0800	[diff] [blame]	5712	cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
				5713	atomic_inc(&nohz.nr_cpus);
				5714	set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5715	}
Suresh Siddha	7132596	2012-01-19 18:28:57 -0800	[diff] [blame]	5716
Paul Gortmaker	0db0628	2013-06-19 14:53:51 -0400	[diff] [blame]	5717	static int sched_ilb_notifier(struct notifier_block *nfb,
Suresh Siddha	7132596	2012-01-19 18:28:57 -0800	[diff] [blame]	5718	unsigned long action, void *hcpu)
				5719	{
				5720	switch (action & ~CPU_TASKS_FROZEN) {
				5721	case CPU_DYING:
Alex Shi	c1cc017	2012-09-10 15:10:58 +0800	[diff] [blame]	5722	nohz_balance_exit_idle(smp_processor_id());
Suresh Siddha	7132596	2012-01-19 18:28:57 -0800	[diff] [blame]	5723	return NOTIFY_OK;
				5724	default:
				5725	return NOTIFY_DONE;
				5726	}
				5727	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5728	#endif
				5729
				5730	static DEFINE_SPINLOCK(balancing);
				5731
Peter Zijlstra	49c022e	2011-04-05 10:14:25 +0200	[diff] [blame]	5732	/*
				5733	* Scale the max load_balance interval with the number of CPUs in the system.
				5734	* This trades load-balance latency on larger machines for less cross talk.
				5735	*/
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	5736	void update_max_interval(void)
Peter Zijlstra	49c022e	2011-04-05 10:14:25 +0200	[diff] [blame]	5737	{
				5738	max_load_balance_interval = HZ*num_online_cpus()/10;
				5739	}
				5740
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5741	/*
				5742	* It checks each scheduling domain to see if it is due to be balanced,
				5743	* and initiates a balancing operation if so.
				5744	*
Libin	b9b0853	2013-04-01 19:14:01 +0800	[diff] [blame]	5745	* Balancing parameters are set up in init_sched_domains.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5746	*/
				5747	static void rebalance_domains(int cpu, enum cpu_idle_type idle)
				5748	{
Joonsoo Kim	23f0d20	2013-08-06 17:36:42 +0900	[diff] [blame]	5749	int continue_balancing = 1;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5750	struct rq *rq = cpu_rq(cpu);
				5751	unsigned long interval;
Peter Zijlstra	04f733b	2012-05-11 00:12:02 +0200	[diff] [blame]	5752	struct sched_domain *sd;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5753	/* Earliest time when we have to do rebalance again */
				5754	unsigned long next_balance = jiffies + 60*HZ;
				5755	int update_next_balance = 0;
Jason Low	f48627e	2013-09-13 11:26:53 -0700	[diff] [blame]	5756	int need_serialize, need_decay = 0;
				5757	u64 max_cost = 0;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5758
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	5759	update_blocked_averages(cpu);
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	5760
Peter Zijlstra	dce840a	2011-04-07 14:09:50 +0200	[diff] [blame]	5761	rcu_read_lock();
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5762	for_each_domain(cpu, sd) {
Jason Low	f48627e	2013-09-13 11:26:53 -0700	[diff] [blame]	5763	/*
				5764	* Decay the newidle max times here because this is a regular
				5765	* visit to all the domains. Decay ~1% per second.
				5766	*/
				5767	if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
				5768	sd->max_newidle_lb_cost =
				5769	(sd->max_newidle_lb_cost * 253) / 256;
				5770	sd->next_decay_max_lb_cost = jiffies + HZ;
				5771	need_decay = 1;
				5772	}
				5773	max_cost += sd->max_newidle_lb_cost;
				5774
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5775	if (!(sd->flags & SD_LOAD_BALANCE))
				5776	continue;
				5777
Jason Low	f48627e	2013-09-13 11:26:53 -0700	[diff] [blame]	5778	/*
				5779	* Stop the load balance at this level. There is another
				5780	* CPU in our sched group which is doing load balancing more
				5781	* actively.
				5782	*/
				5783	if (!continue_balancing) {
				5784	if (need_decay)
				5785	continue;
				5786	break;
				5787	}
				5788
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5789	interval = sd->balance_interval;
				5790	if (idle != CPU_IDLE)
				5791	interval *= sd->busy_factor;
				5792
				5793	/* scale ms to jiffies */
				5794	interval = msecs_to_jiffies(interval);
Peter Zijlstra	49c022e	2011-04-05 10:14:25 +0200	[diff] [blame]	5795	interval = clamp(interval, 1UL, max_load_balance_interval);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5796
				5797	need_serialize = sd->flags & SD_SERIALIZE;
				5798
				5799	if (need_serialize) {
				5800	if (!spin_trylock(&balancing))
				5801	goto out;
				5802	}
				5803
				5804	if (time_after_eq(jiffies, sd->last_balance + interval)) {
Joonsoo Kim	23f0d20	2013-08-06 17:36:42 +0900	[diff] [blame]	5805	if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5806	/*
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	5807	* The LBF_DST_PINNED logic could have changed
Joonsoo Kim	de5eb2d	2013-04-23 17:27:38 +0900	[diff] [blame]	5808	* env->dst_cpu, so we can't know our idle
				5809	* state even if we migrated tasks. Update it.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5810	*/
Joonsoo Kim	de5eb2d	2013-04-23 17:27:38 +0900	[diff] [blame]	5811	idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5812	}
				5813	sd->last_balance = jiffies;
				5814	}
				5815	if (need_serialize)
				5816	spin_unlock(&balancing);
				5817	out:
				5818	if (time_after(next_balance, sd->last_balance + interval)) {
				5819	next_balance = sd->last_balance + interval;
				5820	update_next_balance = 1;
				5821	}
Jason Low	f48627e	2013-09-13 11:26:53 -0700	[diff] [blame]	5822	}
				5823	if (need_decay) {
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5824	/*
Jason Low	f48627e	2013-09-13 11:26:53 -0700	[diff] [blame]	5825	* Ensure the rq-wide value also decays but keep it at a
				5826	* reasonable floor to avoid funnies with rq->avg_idle.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5827	*/
Jason Low	f48627e	2013-09-13 11:26:53 -0700	[diff] [blame]	5828	rq->max_idle_balance_cost =
				5829	max((u64)sysctl_sched_migration_cost, max_cost);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5830	}
Peter Zijlstra	dce840a	2011-04-07 14:09:50 +0200	[diff] [blame]	5831	rcu_read_unlock();
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5832
				5833	/*
				5834	* next_balance will be updated only when there is a need.
				5835	* When the cpu is attached to null domain for ex, it will not be
				5836	* updated.
				5837	*/
				5838	if (likely(update_next_balance))
				5839	rq->next_balance = next_balance;
				5840	}
				5841
Frederic Weisbecker	3451d02	2011-08-10 23:21:01 +0200	[diff] [blame]	5842	#ifdef CONFIG_NO_HZ_COMMON
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5843	/*
Frederic Weisbecker	3451d02	2011-08-10 23:21:01 +0200	[diff] [blame]	5844	* In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5845	* rebalancing for all the cpus for whom scheduler ticks are stopped.
				5846	*/
				5847	static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
				5848	{
				5849	struct rq *this_rq = cpu_rq(this_cpu);
				5850	struct rq *rq;
				5851	int balance_cpu;
				5852
Suresh Siddha	1c792db	2011-12-01 17:07:32 -0800	[diff] [blame]	5853	if (idle != CPU_IDLE \|\|
				5854	!test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
				5855	goto end;
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5856
				5857	for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
Suresh Siddha	8a6d42d	2011-12-06 11:19:37 -0800	[diff] [blame]	5858	if (balance_cpu == this_cpu \|\| !idle_cpu(balance_cpu))
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5859	continue;
				5860
				5861	/*
				5862	* If this cpu gets work to do, stop the load balancing
				5863	* work being done for other cpus. Next load
				5864	* balancing owner will pick it up.
				5865	*/
Suresh Siddha	1c792db	2011-12-01 17:07:32 -0800	[diff] [blame]	5866	if (need_resched())
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5867	break;
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5868
Vincent Guittot	5ed4f1d	2012-09-13 06:11:26 +0200	[diff] [blame]	5869	rq = cpu_rq(balance_cpu);
				5870
				5871	raw_spin_lock_irq(&rq->lock);
				5872	update_rq_clock(rq);
				5873	update_idle_cpu_load(rq);
				5874	raw_spin_unlock_irq(&rq->lock);
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5875
				5876	rebalance_domains(balance_cpu, CPU_IDLE);
				5877
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5878	if (time_after(this_rq->next_balance, rq->next_balance))
				5879	this_rq->next_balance = rq->next_balance;
				5880	}
				5881	nohz.next_balance = this_rq->next_balance;
Suresh Siddha	1c792db	2011-12-01 17:07:32 -0800	[diff] [blame]	5882	end:
				5883	clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5884	}
				5885
				5886	/*
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	5887	* Current heuristic for kicking the idle load balancer in the presence
				5888	* of an idle cpu is the system.
				5889	* - This rq has more than one task.
				5890	* - At any scheduler domain level, this cpu's scheduler group has multiple
				5891	* busy cpu's exceeding the group's power.
				5892	* - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
				5893	* domain span are idle.
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5894	*/
				5895	static inline int nohz_kick_needed(struct rq *rq, int cpu)
				5896	{
				5897	unsigned long now = jiffies;
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	5898	struct sched_domain *sd;
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5899
Suresh Siddha	1c792db	2011-12-01 17:07:32 -0800	[diff] [blame]	5900	if (unlikely(idle_cpu(cpu)))
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5901	return 0;
				5902
Suresh Siddha	1c792db	2011-12-01 17:07:32 -0800	[diff] [blame]	5903	/*
				5904	* We may be recently in ticked or tickless idle mode. At the first
				5905	* busy tick after returning from idle, we will update the busy stats.
				5906	*/
Suresh Siddha	69e1e81	2011-12-01 17:07:33 -0800	[diff] [blame]	5907	set_cpu_sd_state_busy();
Alex Shi	c1cc017	2012-09-10 15:10:58 +0800	[diff] [blame]	5908	nohz_balance_exit_idle(cpu);
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	5909
				5910	/*
				5911	* None are in tickless mode and hence no need for NOHZ idle load
				5912	* balancing.
				5913	*/
				5914	if (likely(!atomic_read(&nohz.nr_cpus)))
				5915	return 0;
Suresh Siddha	1c792db	2011-12-01 17:07:32 -0800	[diff] [blame]	5916
				5917	if (time_before(now, nohz.next_balance))
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5918	return 0;
				5919
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	5920	if (rq->nr_running >= 2)
				5921	goto need_kick;
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5922
Peter Zijlstra	067491b	2011-12-07 14:32:08 +0100	[diff] [blame]	5923	rcu_read_lock();
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	5924	for_each_domain(cpu, sd) {
				5925	struct sched_group *sg = sd->groups;
				5926	struct sched_group_power *sgp = sg->sgp;
				5927	int nr_busy = atomic_read(&sgp->nr_busy_cpus);
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5928
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	5929	if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1)
Peter Zijlstra	067491b	2011-12-07 14:32:08 +0100	[diff] [blame]	5930	goto need_kick_unlock;
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	5931
				5932	if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight
				5933	&& (cpumask_first_and(nohz.idle_cpus_mask,
				5934	sched_domain_span(sd)) < cpu))
Peter Zijlstra	067491b	2011-12-07 14:32:08 +0100	[diff] [blame]	5935	goto need_kick_unlock;
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	5936
				5937	if (!(sd->flags & (SD_SHARE_PKG_RESOURCES \| SD_ASYM_PACKING)))
				5938	break;
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5939	}
Peter Zijlstra	067491b	2011-12-07 14:32:08 +0100	[diff] [blame]	5940	rcu_read_unlock();
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5941	return 0;
Peter Zijlstra	067491b	2011-12-07 14:32:08 +0100	[diff] [blame]	5942
				5943	need_kick_unlock:
				5944	rcu_read_unlock();
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	5945	need_kick:
				5946	return 1;
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5947	}
				5948	#else
				5949	static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
				5950	#endif
				5951
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5952	/*
				5953	* run_rebalance_domains is triggered when needed from the scheduler tick.
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5954	* Also triggered for nohz idle balancing (with nohz_balancing_kick set).
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5955	*/
				5956	static void run_rebalance_domains(struct softirq_action *h)
				5957	{
				5958	int this_cpu = smp_processor_id();
				5959	struct rq *this_rq = cpu_rq(this_cpu);
Suresh Siddha	6eb57e0	2011-10-03 15:09:01 -0700	[diff] [blame]	5960	enum cpu_idle_type idle = this_rq->idle_balance ?
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5961	CPU_IDLE : CPU_NOT_IDLE;
				5962
				5963	rebalance_domains(this_cpu, idle);
				5964
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5965	/*
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5966	* If this cpu has a pending nohz_balance_kick, then do the
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5967	* balancing on behalf of the other idle cpus whose ticks are
				5968	* stopped.
				5969	*/
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5970	nohz_idle_balance(this_cpu, idle);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5971	}
				5972
				5973	static inline int on_null_domain(int cpu)
				5974	{
Paul E. McKenney	90a6501	2010-02-28 08:32:18 -0800	[diff] [blame]	5975	return !rcu_dereference_sched(cpu_rq(cpu)->sd);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5976	}
				5977
				5978	/*
				5979	* Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5980	*/
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	5981	void trigger_load_balance(struct rq *rq, int cpu)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5982	{
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5983	/* Don't need to rebalance while attached to NULL domain */
				5984	if (time_after_eq(jiffies, rq->next_balance) &&
				5985	likely(!on_null_domain(cpu)))
				5986	raise_softirq(SCHED_SOFTIRQ);
Frederic Weisbecker	3451d02	2011-08-10 23:21:01 +0200	[diff] [blame]	5987	#ifdef CONFIG_NO_HZ_COMMON
Suresh Siddha	1c792db	2011-12-01 17:07:32 -0800	[diff] [blame]	5988	if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5989	nohz_balancer_kick(cpu);
				5990	#endif
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5991	}
				5992
Christian Ehrhardt	0bcdcf2	2009-11-30 12:16:46 +0100	[diff] [blame]	5993	static void rq_online_fair(struct rq *rq)
				5994	{
				5995	update_sysctl();
				5996	}
				5997
				5998	static void rq_offline_fair(struct rq *rq)
				5999	{
				6000	update_sysctl();
Peter Boonstoppel	a4c96ae	2012-08-09 15:34:47 -0700	[diff] [blame]	6001
				6002	/* Ensure any throttled groups are reachable by pick_next_task */
				6003	unthrottle_offline_cfs_rqs(rq);
Christian Ehrhardt	0bcdcf2	2009-11-30 12:16:46 +0100	[diff] [blame]	6004	}
				6005
Dhaval Giani	55e12e5	2008-06-24 23:39:43 +0530	[diff] [blame]	6006	#endif /* CONFIG_SMP */
Peter Williams	e1d1484	2007-10-24 18:23:51 +0200	[diff] [blame]	6007
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6008	/*
				6009	* scheduler tick hitting a task of our scheduling class:
				6010	*/
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	6011	static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6012	{
				6013	struct cfs_rq *cfs_rq;
				6014	struct sched_entity *se = &curr->se;
				6015
				6016	for_each_sched_entity(se) {
				6017	cfs_rq = cfs_rq_of(se);
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	6018	entity_tick(cfs_rq, se, queued);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6019	}
Ben Segall	18bf280	2012-10-04 12:51:20 +0200	[diff] [blame]	6020
Dave Kleikamp	10e84b9	2013-07-31 13:53:35 -0700	[diff] [blame]	6021	if (numabalancing_enabled)
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	6022	task_tick_numa(rq, curr);
Linus Torvalds	3d59eeb	2012-12-16 14:33:25 -0800	[diff] [blame]	6023
Ben Segall	18bf280	2012-10-04 12:51:20 +0200	[diff] [blame]	6024	update_rq_runnable_avg(rq, 1);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6025	}
				6026
				6027	/*
Peter Zijlstra	cd29fe6	2009-11-27 17:32:46 +0100	[diff] [blame]	6028	* called on fork with the child task as argument from the parent's context
				6029	* - child not yet on the tasklist
				6030	* - preemption disabled
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6031	*/
Peter Zijlstra	cd29fe6	2009-11-27 17:32:46 +0100	[diff] [blame]	6032	static void task_fork_fair(struct task_struct *p)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6033	{
Daisuke Nishimura	4fc420c	2011-12-15 14:36:55 +0900	[diff] [blame]	6034	struct cfs_rq *cfs_rq;
				6035	struct sched_entity *se = &p->se, *curr;
Ingo Molnar	00bf7bf	2007-10-15 17:00:14 +0200	[diff] [blame]	6036	int this_cpu = smp_processor_id();
Peter Zijlstra	cd29fe6	2009-11-27 17:32:46 +0100	[diff] [blame]	6037	struct rq *rq = this_rq();
				6038	unsigned long flags;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6039
Thomas Gleixner	05fa785	2009-11-17 14:28:38 +0100	[diff] [blame]	6040	raw_spin_lock_irqsave(&rq->lock, flags);
Peter Zijlstra	cd29fe6	2009-11-27 17:32:46 +0100	[diff] [blame]	6041
Peter Zijlstra	861d034	2010-08-19 13:31:43 +0200	[diff] [blame]	6042	update_rq_clock(rq);
				6043
Daisuke Nishimura	4fc420c	2011-12-15 14:36:55 +0900	[diff] [blame]	6044	cfs_rq = task_cfs_rq(current);
				6045	curr = cfs_rq->curr;
				6046
Daisuke Nishimura	6c9a27f	2013-09-10 18:16:36 +0900	[diff] [blame]	6047	/*
				6048	* Not only the cpu but also the task_group of the parent might have
				6049	* been changed after parent->se.parent,cfs_rq were copied to
				6050	* child->se.parent,cfs_rq. So call __set_task_cpu() to make those
				6051	* of child point to valid ones.
				6052	*/
				6053	rcu_read_lock();
				6054	__set_task_cpu(p, this_cpu);
				6055	rcu_read_unlock();
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6056
Ting Yang	7109c44	2007-08-28 12:53:24 +0200	[diff] [blame]	6057	update_curr(cfs_rq);
Peter Zijlstra	cd29fe6	2009-11-27 17:32:46 +0100	[diff] [blame]	6058
Mike Galbraith	b5d9d73	2009-09-08 11:12:28 +0200	[diff] [blame]	6059	if (curr)
				6060	se->vruntime = curr->vruntime;
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	6061	place_entity(cfs_rq, se, 1);
Peter Zijlstra	4d78e7b	2007-10-15 17:00:04 +0200	[diff] [blame]	6062
Peter Zijlstra	cd29fe6	2009-11-27 17:32:46 +0100	[diff] [blame]	6063	if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
Dmitry Adamushko	87fefa3	2007-10-15 17:00:08 +0200	[diff] [blame]	6064	/*
Ingo Molnar	edcb60a	2007-10-15 17:00:08 +0200	[diff] [blame]	6065	* Upon rescheduling, sched_class::put_prev_task() will place
				6066	* 'current' within the tree based on its new key value.
				6067	*/
Peter Zijlstra	4d78e7b	2007-10-15 17:00:04 +0200	[diff] [blame]	6068	swap(curr->vruntime, se->vruntime);
Bharata B Rao	aec0a51	2008-08-28 14:42:49 +0530	[diff] [blame]	6069	resched_task(rq->curr);
Peter Zijlstra	4d78e7b	2007-10-15 17:00:04 +0200	[diff] [blame]	6070	}
				6071
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	6072	se->vruntime -= cfs_rq->min_vruntime;
				6073
Thomas Gleixner	05fa785	2009-11-17 14:28:38 +0100	[diff] [blame]	6074	raw_spin_unlock_irqrestore(&rq->lock, flags);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6075	}
				6076
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	6077	/*
				6078	* Priority of the task has changed. Check to see if we preempt
				6079	* the current task.
				6080	*/
Peter Zijlstra	da7a735	2011-01-17 17:03:27 +0100	[diff] [blame]	6081	static void
				6082	prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	6083	{
Peter Zijlstra	da7a735	2011-01-17 17:03:27 +0100	[diff] [blame]	6084	if (!p->se.on_rq)
				6085	return;
				6086
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	6087	/*
				6088	* Reschedule if we are currently running on this runqueue and
				6089	* our priority decreased, or if we are not currently running on
				6090	* this runqueue and our priority is higher than the current's
				6091	*/
Peter Zijlstra	da7a735	2011-01-17 17:03:27 +0100	[diff] [blame]	6092	if (rq->curr == p) {
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	6093	if (p->prio > oldprio)
				6094	resched_task(rq->curr);
				6095	} else
Peter Zijlstra	15afe09	2008-09-20 23:38:02 +0200	[diff] [blame]	6096	check_preempt_curr(rq, p, 0);
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	6097	}
				6098
Peter Zijlstra	da7a735	2011-01-17 17:03:27 +0100	[diff] [blame]	6099	static void switched_from_fair(struct rq *rq, struct task_struct *p)
				6100	{
				6101	struct sched_entity *se = &p->se;
				6102	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				6103
				6104	/*
				6105	* Ensure the task's vruntime is normalized, so that when its
				6106	* switched back to the fair class the enqueue_entity(.flags=0) will
				6107	* do the right thing.
				6108	*
				6109	* If it was on_rq, then the dequeue_entity(.flags=0) will already
				6110	* have normalized the vruntime, if it was !on_rq, then only when
				6111	* the task is sleeping will it still have non-normalized vruntime.
				6112	*/
				6113	if (!se->on_rq && p->state != TASK_RUNNING) {
				6114	/*
				6115	* Fix up our vruntime so that the current sleep doesn't
				6116	* cause 'unlimited' sleep bonus.
				6117	*/
				6118	place_entity(cfs_rq, se, 0);
				6119	se->vruntime -= cfs_rq->min_vruntime;
				6120	}
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	6121
Alex Shi	141965c	2013-06-26 13:05:39 +0800	[diff] [blame]	6122	#ifdef CONFIG_SMP
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	6123	/*
				6124	* Remove our load from contribution when we leave sched_fair
				6125	* and ensure we don't carry in an old decay_count if we
				6126	* switch back.
				6127	*/
Kirill Tkhai	87e3c8a	2013-07-21 04:32:07 +0400	[diff] [blame]	6128	if (se->avg.decay_count) {
				6129	__synchronize_entity_decay(se);
				6130	subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	6131	}
				6132	#endif
Peter Zijlstra	da7a735	2011-01-17 17:03:27 +0100	[diff] [blame]	6133	}
				6134
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	6135	/*
				6136	* We switched to the sched_fair class.
				6137	*/
Peter Zijlstra	da7a735	2011-01-17 17:03:27 +0100	[diff] [blame]	6138	static void switched_to_fair(struct rq *rq, struct task_struct *p)
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	6139	{
Peter Zijlstra	da7a735	2011-01-17 17:03:27 +0100	[diff] [blame]	6140	if (!p->se.on_rq)
				6141	return;
				6142
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	6143	/*
				6144	* We were most likely switched from sched_rt, so
				6145	* kick off the schedule if running, otherwise just see
				6146	* if we can still preempt the current task.
				6147	*/
Peter Zijlstra	da7a735	2011-01-17 17:03:27 +0100	[diff] [blame]	6148	if (rq->curr == p)
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	6149	resched_task(rq->curr);
				6150	else
Peter Zijlstra	15afe09	2008-09-20 23:38:02 +0200	[diff] [blame]	6151	check_preempt_curr(rq, p, 0);
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	6152	}
				6153
Srivatsa Vaddagiri	83b699e	2007-10-15 17:00:08 +0200	[diff] [blame]	6154	/* Account for a task changing its policy or group.
				6155	*
				6156	* This routine is mostly called to set cfs_rq->curr field when a task
				6157	* migrates between groups/classes.
				6158	*/
				6159	static void set_curr_task_fair(struct rq *rq)
				6160	{
				6161	struct sched_entity *se = &rq->curr->se;
				6162
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	6163	for_each_sched_entity(se) {
				6164	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				6165
				6166	set_next_entity(cfs_rq, se);
				6167	/* ensure bandwidth has been allocated on our new cfs_rq */
				6168	account_cfs_rq_runtime(cfs_rq, 0);
				6169	}
Srivatsa Vaddagiri	83b699e	2007-10-15 17:00:08 +0200	[diff] [blame]	6170	}
				6171
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	6172	void init_cfs_rq(struct cfs_rq *cfs_rq)
				6173	{
				6174	cfs_rq->tasks_timeline = RB_ROOT;
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	6175	cfs_rq->min_vruntime = (u64)(-(1LL << 20));
				6176	#ifndef CONFIG_64BIT
				6177	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
				6178	#endif
Alex Shi	141965c	2013-06-26 13:05:39 +0800	[diff] [blame]	6179	#ifdef CONFIG_SMP
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	6180	atomic64_set(&cfs_rq->decay_counter, 1);
Alex Shi	2509940	2013-06-20 10:18:55 +0800	[diff] [blame]	6181	atomic_long_set(&cfs_rq->removed_load, 0);
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	6182	#endif
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	6183	}
				6184
Peter Zijlstra	810b381	2008-02-29 15:21:01 -0500	[diff] [blame]	6185	#ifdef CONFIG_FAIR_GROUP_SCHED
Peter Zijlstra	b2b5ce0	2010-10-15 15:24:15 +0200	[diff] [blame]	6186	static void task_move_group_fair(struct task_struct *p, int on_rq)
Peter Zijlstra	810b381	2008-02-29 15:21:01 -0500	[diff] [blame]	6187	{
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	6188	struct cfs_rq *cfs_rq;
Peter Zijlstra	b2b5ce0	2010-10-15 15:24:15 +0200	[diff] [blame]	6189	/*
				6190	* If the task was not on the rq at the time of this cgroup movement
				6191	* it must have been asleep, sleeping tasks keep their ->vruntime
				6192	* absolute on their old rq until wakeup (needed for the fair sleeper
				6193	* bonus in place_entity()).
				6194	*
				6195	* If it was on the rq, we've just 'preempted' it, which does convert
				6196	* ->vruntime to a relative base.
				6197	*
				6198	* Make sure both cases convert their relative position when migrating
				6199	* to another cgroup's rq. This does somewhat interfere with the
				6200	* fair sleeper stuff for the first placement, but who cares.
				6201	*/
Daisuke Nishimura	7ceff01	2011-12-15 14:36:07 +0900	[diff] [blame]	6202	/*
				6203	* When !on_rq, vruntime of the task has usually NOT been normalized.
				6204	* But there are some cases where it has already been normalized:
				6205	*
				6206	* - Moving a forked child which is waiting for being woken up by
				6207	* wake_up_new_task().
Daisuke Nishimura	62af378	2011-12-15 14:37:41 +0900	[diff] [blame]	6208	* - Moving a task which has been woken up by try_to_wake_up() and
				6209	* waiting for actually being woken up by sched_ttwu_pending().
Daisuke Nishimura	7ceff01	2011-12-15 14:36:07 +0900	[diff] [blame]	6210	*
				6211	* To prevent boost or penalty in the new cfs_rq caused by delta
				6212	* min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
				6213	*/
Daisuke Nishimura	62af378	2011-12-15 14:37:41 +0900	[diff] [blame]	6214	if (!on_rq && (!p->se.sum_exec_runtime || p->state == TASK_WAKING))
Daisuke Nishimura	7ceff01	2011-12-15 14:36:07 +0900	[diff] [blame]	6215	on_rq = 1;
				6216
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	6217	if (!on_rq)
Peter Zijlstra	b2b5ce0	2010-10-15 15:24:15 +0200	[diff] [blame]	6218	p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
				6219	set_task_rq(p, task_cpu(p));
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	6220	if (!on_rq) {
				6221	cfs_rq = cfs_rq_of(&p->se);
				6222	p->se.vruntime += cfs_rq->min_vruntime;
				6223	#ifdef CONFIG_SMP
				6224	/*
				6225	* migrate_task_rq_fair() will have removed our previous
				6226	* contribution, but we must synchronize for ongoing future
				6227	* decay.
				6228	*/
				6229	p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
				6230	cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib;
				6231	#endif
				6232	}
Peter Zijlstra	810b381	2008-02-29 15:21:01 -0500	[diff] [blame]	6233	}
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	6234
				6235	void free_fair_sched_group(struct task_group *tg)
				6236	{
				6237	int i;
				6238
				6239	destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
				6240
				6241	for_each_possible_cpu(i) {
				6242	if (tg->cfs_rq)
				6243	kfree(tg->cfs_rq[i]);
				6244	if (tg->se)
				6245	kfree(tg->se[i]);
				6246	}
				6247
				6248	kfree(tg->cfs_rq);
				6249	kfree(tg->se);
				6250	}
				6251
				6252	int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
				6253	{
				6254	struct cfs_rq *cfs_rq;
				6255	struct sched_entity *se;
				6256	int i;
				6257
				6258	tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
				6259	if (!tg->cfs_rq)
				6260	goto err;
				6261	tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
				6262	if (!tg->se)
				6263	goto err;
				6264
				6265	tg->shares = NICE_0_LOAD;
				6266
				6267	init_cfs_bandwidth(tg_cfs_bandwidth(tg));
				6268
				6269	for_each_possible_cpu(i) {
				6270	cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
				6271	GFP_KERNEL, cpu_to_node(i));
				6272	if (!cfs_rq)
				6273	goto err;
				6274
				6275	se = kzalloc_node(sizeof(struct sched_entity),
				6276	GFP_KERNEL, cpu_to_node(i));
				6277	if (!se)
				6278	goto err_free_rq;
				6279
				6280	init_cfs_rq(cfs_rq);
				6281	init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
				6282	}
				6283
				6284	return 1;
				6285
				6286	err_free_rq:
				6287	kfree(cfs_rq);
				6288	err:
				6289	return 0;
				6290	}
				6291
				6292	void unregister_fair_sched_group(struct task_group *tg, int cpu)
				6293	{
				6294	struct rq *rq = cpu_rq(cpu);
				6295	unsigned long flags;
				6296
				6297	/*
				6298	* Only empty task groups can be destroyed; so we can speculatively
				6299	* check on_list without danger of it being re-added.
				6300	*/
				6301	if (!tg->cfs_rq[cpu]->on_list)
				6302	return;
				6303
				6304	raw_spin_lock_irqsave(&rq->lock, flags);
				6305	list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
				6306	raw_spin_unlock_irqrestore(&rq->lock, flags);
				6307	}
				6308
				6309	void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
				6310	struct sched_entity *se, int cpu,
				6311	struct sched_entity *parent)
				6312	{
				6313	struct rq *rq = cpu_rq(cpu);
				6314
				6315	cfs_rq->tg = tg;
				6316	cfs_rq->rq = rq;
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	6317	init_cfs_rq_runtime(cfs_rq);
				6318
				6319	tg->cfs_rq[cpu] = cfs_rq;
				6320	tg->se[cpu] = se;
				6321
				6322	/* se could be NULL for root_task_group */
				6323	if (!se)
				6324	return;
				6325
				6326	if (!parent)
				6327	se->cfs_rq = &rq->cfs;
				6328	else
				6329	se->cfs_rq = parent->my_q;
				6330
				6331	se->my_q = cfs_rq;
				6332	update_load_set(&se->load, 0);
				6333	se->parent = parent;
				6334	}
				6335
				6336	static DEFINE_MUTEX(shares_mutex);
				6337
				6338	int sched_group_set_shares(struct task_group *tg, unsigned long shares)
				6339	{
				6340	int i;
				6341	unsigned long flags;
				6342
				6343	/*
				6344	* We can't change the weight of the root cgroup.
				6345	*/
				6346	if (!tg->se[0])
				6347	return -EINVAL;
				6348
				6349	shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
				6350
				6351	mutex_lock(&shares_mutex);
				6352	if (tg->shares == shares)
				6353	goto done;
				6354
				6355	tg->shares = shares;
				6356	for_each_possible_cpu(i) {
				6357	struct rq *rq = cpu_rq(i);
				6358	struct sched_entity *se;
				6359
				6360	se = tg->se[i];
				6361	/* Propagate contribution to hierarchy */
				6362	raw_spin_lock_irqsave(&rq->lock, flags);
Frederic Weisbecker	71b1da4	2013-04-12 01:50:59 +0200	[diff] [blame]	6363
				6364	/* Possible calls to update_curr() need rq clock */
				6365	update_rq_clock(rq);
Linus Torvalds	17bc14b	2012-12-14 07:20:43 -0800	[diff] [blame]	6366	for_each_sched_entity(se)
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	6367	update_cfs_shares(group_cfs_rq(se));
				6368	raw_spin_unlock_irqrestore(&rq->lock, flags);
				6369	}
				6370
				6371	done:
				6372	mutex_unlock(&shares_mutex);
				6373	return 0;
				6374	}
				6375	#else /* CONFIG_FAIR_GROUP_SCHED */
				6376
				6377	void free_fair_sched_group(struct task_group *tg) { }
				6378
				6379	int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
				6380	{
				6381	return 1;
				6382	}
				6383
				6384	void unregister_fair_sched_group(struct task_group *tg, int cpu) { }
				6385
				6386	#endif /* CONFIG_FAIR_GROUP_SCHED */
				6387
Peter Zijlstra	810b381	2008-02-29 15:21:01 -0500	[diff] [blame]	6388
H Hartley Sweeten	6d686f4	2010-01-13 20:21:52 -0700	[diff] [blame]	6389	static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
Peter Williams	0d721ce	2009-09-21 01:31:53 +0000	[diff] [blame]	6390	{
				6391	struct sched_entity *se = &task->se;
Peter Williams	0d721ce	2009-09-21 01:31:53 +0000	[diff] [blame]	6392	unsigned int rr_interval = 0;
				6393
				6394	/*
				6395	* Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
				6396	* idle runqueue:
				6397	*/
Peter Williams	0d721ce	2009-09-21 01:31:53 +0000	[diff] [blame]	6398	if (rq->cfs.load.weight)
Zhu Yanhai	a59f4e0	2013-01-08 12:56:52 +0800	[diff] [blame]	6399	rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
Peter Williams	0d721ce	2009-09-21 01:31:53 +0000	[diff] [blame]	6400
				6401	return rr_interval;
				6402	}
				6403
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6404	/*
				6405	* All the scheduling class methods:
				6406	*/
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	6407	const struct sched_class fair_sched_class = {
Ingo Molnar	5522d5d	2007-10-15 17:00:12 +0200	[diff] [blame]	6408	.next = &idle_sched_class,
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6409	.enqueue_task = enqueue_task_fair,
				6410	.dequeue_task = dequeue_task_fair,
				6411	.yield_task = yield_task_fair,
Mike Galbraith	d95f412	2011-02-01 09:50:51 -0500	[diff] [blame]	6412	.yield_to_task = yield_to_task_fair,
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6413
Ingo Molnar	2e09bf5	2007-10-15 17:00:05 +0200	[diff] [blame]	6414	.check_preempt_curr = check_preempt_wakeup,
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6415
				6416	.pick_next_task = pick_next_task_fair,
				6417	.put_prev_task = put_prev_task_fair,
				6418
Peter Williams	681f3e6	2007-10-24 18:23:51 +0200	[diff] [blame]	6419	#ifdef CONFIG_SMP
Li Zefan	4ce72a2	2008-10-22 15:25:26 +0800	[diff] [blame]	6420	.select_task_rq = select_task_rq_fair,
Paul Turner	0a74bef	2012-10-04 13:18:30 +0200	[diff] [blame]	6421	.migrate_task_rq = migrate_task_rq_fair,
Alex Shi	141965c	2013-06-26 13:05:39 +0800	[diff] [blame]	6422
Christian Ehrhardt	0bcdcf2	2009-11-30 12:16:46 +0100	[diff] [blame]	6423	.rq_online = rq_online_fair,
				6424	.rq_offline = rq_offline_fair,
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	6425
				6426	.task_waking = task_waking_fair,
Peter Williams	681f3e6	2007-10-24 18:23:51 +0200	[diff] [blame]	6427	#endif
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6428
Srivatsa Vaddagiri	83b699e	2007-10-15 17:00:08 +0200	[diff] [blame]	6429	.set_curr_task = set_curr_task_fair,
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6430	.task_tick = task_tick_fair,
Peter Zijlstra	cd29fe6	2009-11-27 17:32:46 +0100	[diff] [blame]	6431	.task_fork = task_fork_fair,
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	6432
				6433	.prio_changed = prio_changed_fair,
Peter Zijlstra	da7a735	2011-01-17 17:03:27 +0100	[diff] [blame]	6434	.switched_from = switched_from_fair,
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	6435	.switched_to = switched_to_fair,
Peter Zijlstra	810b381	2008-02-29 15:21:01 -0500	[diff] [blame]	6436
Peter Williams	0d721ce	2009-09-21 01:31:53 +0000	[diff] [blame]	6437	.get_rr_interval = get_rr_interval_fair,
				6438
Peter Zijlstra	810b381	2008-02-29 15:21:01 -0500	[diff] [blame]	6439	#ifdef CONFIG_FAIR_GROUP_SCHED
Peter Zijlstra	b2b5ce0	2010-10-15 15:24:15 +0200	[diff] [blame]	6440	.task_move_group = task_move_group_fair,
Peter Zijlstra	810b381	2008-02-29 15:21:01 -0500	[diff] [blame]	6441	#endif
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6442	};
				6443
				6444	#ifdef CONFIG_SCHED_DEBUG
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	6445	void print_cfs_stats(struct seq_file *m, int cpu)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6446	{
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6447	struct cfs_rq *cfs_rq;
				6448
Peter Zijlstra	5973e5b	2008-01-25 21:08:34 +0100	[diff] [blame]	6449	rcu_read_lock();
Ingo Molnar	c3b64f1	2007-08-09 11:16:51 +0200	[diff] [blame]	6450	for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
Ingo Molnar	5cef9ec	2007-08-09 11:16:47 +0200	[diff] [blame]	6451	print_cfs_rq(m, cpu, cfs_rq);
Peter Zijlstra	5973e5b	2008-01-25 21:08:34 +0100	[diff] [blame]	6452	rcu_read_unlock();
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6453	}
				6454	#endif
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	6455
				6456	__init void init_sched_fair_class(void)
				6457	{
				6458	#ifdef CONFIG_SMP
				6459	open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
				6460
Frederic Weisbecker	3451d02	2011-08-10 23:21:01 +0200	[diff] [blame]	6461	#ifdef CONFIG_NO_HZ_COMMON
Diwakar Tundlam	554ceca	2012-03-07 14:44:26 -0800	[diff] [blame]	6462	nohz.next_balance = jiffies;
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	6463	zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
Suresh Siddha	7132596	2012-01-19 18:28:57 -0800	[diff] [blame]	6464	cpu_notifier(sched_ilb_notifier, 0);
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	6465	#endif
				6466	#endif /* SMP */
				6467
				6468	}