blob: 0e3caf742ae3b30f1b43e0e37c7cd940a01051d0
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * kernel/sched.c
3 *
4 * Kernel scheduler and related syscalls
5 *
6 * Copyright (C) 1991-2002 Linus Torvalds
7 *
8 * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and
9 * make semaphores SMP safe
10 * 1998-11-19 Implemented schedule_timeout() and related stuff
11 * by Andrea Arcangeli
12 * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar:
13 * hybrid priority-list and round-robin design with
14 * an array-switch method of distributing timeslices
15 * and per-CPU runqueues. Cleanups and useful suggestions
16 * by Davide Libenzi, preemptible kernel bits by Robert Love.
17 * 2003-09-03 Interactivity tuning by Con Kolivas.
18 * 2004-04-02 Scheduler domains code by Nick Piggin
19 */
20
21#include <linux/mm.h>
22#include <linux/module.h>
23#include <linux/nmi.h>
24#include <linux/init.h>
Ingo Molnardff06c12007-07-09 18:52:00 +020025#include <linux/uaccess.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070026#include <linux/highmem.h>
27#include <linux/smp_lock.h>
28#include <asm/mmu_context.h>
29#include <linux/interrupt.h>
Randy.Dunlapc59ede72006-01-11 12:17:46 -080030#include <linux/capability.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070031#include <linux/completion.h>
32#include <linux/kernel_stat.h>
Ingo Molnar9a11b49a2006-07-03 00:24:33 -070033#include <linux/debug_locks.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070034#include <linux/security.h>
35#include <linux/notifier.h>
36#include <linux/profile.h>
Nigel Cunningham7dfb7102006-12-06 20:34:23 -080037#include <linux/freezer.h>
akpm@osdl.org198e2f12006-01-12 01:05:30 -080038#include <linux/vmalloc.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070039#include <linux/blkdev.h>
40#include <linux/delay.h>
41#include <linux/smp.h>
42#include <linux/threads.h>
43#include <linux/timer.h>
44#include <linux/rcupdate.h>
45#include <linux/cpu.h>
46#include <linux/cpuset.h>
47#include <linux/percpu.h>
48#include <linux/kthread.h>
49#include <linux/seq_file.h>
50#include <linux/syscalls.h>
51#include <linux/times.h>
Jay Lan8f0ab512006-09-30 23:28:59 -070052#include <linux/tsacct_kern.h>
bibo maoc6fd91f2006-03-26 01:38:20 -080053#include <linux/kprobes.h>
Shailabh Nagar0ff92242006-07-14 00:24:37 -070054#include <linux/delayacct.h>
Eric Dumazet5517d862007-05-08 00:32:57 -070055#include <linux/reciprocal_div.h>
Ingo Molnardff06c12007-07-09 18:52:00 +020056#include <linux/unistd.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070057
Eric Dumazet5517d862007-05-08 00:32:57 -070058#include <asm/tlb.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070059
60/*
Alexey Dobriyanb035b6d2007-02-10 01:45:10 -080061 * Scheduler clock - returns current time in nanosec units.
 62 * This is the default implementation.
63 * Architectures and sub-architectures can override this.
64 */
65unsigned long long __attribute__((weak)) sched_clock(void)
66{
67 return (unsigned long long)jiffies * (1000000000 / HZ);
68}
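/*
 * Worked example (illustrative): with HZ == 1000 the fallback sched_clock()
 * above advances in steps of 1000000000 / 1000 = 1,000,000 ns per jiffy;
 * with HZ == 250 each step is 4,000,000 ns. The default clock therefore has
 * only jiffy resolution - architectures with a fast cycle counter are
 * expected to override it with something finer.
 */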
69
70/*
Linus Torvalds1da177e2005-04-16 15:20:36 -070071 * Convert user-nice values [ -20 ... 0 ... 19 ]
72 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
73 * and back.
74 */
75#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
76#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
77#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
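/*
 * Worked example (assuming the usual MAX_RT_PRIO == 100, MAX_PRIO == 140):
 * NICE_TO_PRIO(-20) == 100, NICE_TO_PRIO(0) == 120, NICE_TO_PRIO(19) == 139,
 * so the nice range maps onto static priorities [100..139], directly above
 * the RT priorities [0..99]; PRIO_TO_NICE() is the exact inverse mapping.
 */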
78
79/*
80 * 'User priority' is the nice value converted to something we
 81 * can work with better when scaling various scheduler parameters;
 82 * it's a [ 0 ... 39 ] range.
83 */
84#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
85#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
86#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
87
88/*
89 * Some helpers for converting nanosecond timing to jiffy resolution
90 */
91#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ))
92#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ))
93
Ingo Molnar6aa645e2007-07-09 18:51:58 +020094#define NICE_0_LOAD SCHED_LOAD_SCALE
95#define NICE_0_SHIFT SCHED_LOAD_SHIFT
96
Linus Torvalds1da177e2005-04-16 15:20:36 -070097/*
98 * These are the 'tuning knobs' of the scheduler:
99 *
100 * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger),
101 * default timeslice is 100 msecs, maximum timeslice is 800 msecs.
102 * Timeslices get refilled after they expire.
103 */
104#define MIN_TIMESLICE max(5 * HZ / 1000, 1)
105#define DEF_TIMESLICE (100 * HZ / 1000)
Peter Williams2dd73a42006-06-27 02:54:34 -0700106
Eric Dumazet5517d862007-05-08 00:32:57 -0700107#ifdef CONFIG_SMP
108/*
109 * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
110 * Since cpu_power is a 'constant', we can use a reciprocal divide.
111 */
112static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
113{
114 return reciprocal_divide(load, sg->reciprocal_cpu_power);
115}
116
117/*
118 * Each time a sched group cpu_power is changed,
119 * we must compute its reciprocal value
120 */
121static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
122{
123 sg->__cpu_power += val;
124 sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
125}
126#endif
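/*
 * Sketch of the trick above (assuming the lib/reciprocal_div.c helpers):
 * reciprocal_value(x) precomputes roughly 2^32 / x, and
 * reciprocal_divide(a, R) evaluates (u32)(((u64)a * R) >> 32), so the
 * per-call division becomes one multiply plus one shift. E.g. with
 * __cpu_power == 1024 the cached reciprocal is ~4194304, and a load of
 * 2048 scales back to (2048 * 4194304) >> 32 == 2.
 */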
127
Ingo Molnar634fa8c2007-07-09 18:52:00 +0200128#define SCALE_PRIO(x, prio) \
129 max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
Borislav Petkov91fcdd42006-10-19 23:28:29 -0700130
Ingo Molnar634fa8c2007-07-09 18:52:00 +0200131/*
132 * static_prio_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
133 * to time slice values: [800ms ... 100ms ... 5ms]
134 */
135static unsigned int static_prio_timeslice(int static_prio)
Peter Williams2dd73a42006-06-27 02:54:34 -0700136{
Ingo Molnar634fa8c2007-07-09 18:52:00 +0200137 if (static_prio == NICE_TO_PRIO(19))
138 return 1;
139
140 if (static_prio < NICE_TO_PRIO(0))
141 return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
142 else
143 return SCALE_PRIO(DEF_TIMESLICE, static_prio);
Peter Williams2dd73a42006-06-27 02:54:34 -0700144}
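/*
 * Worked example (with the usual MAX_PRIO == 140 and MAX_USER_PRIO == 40,
 * expressing DEF_TIMESLICE as 100ms for clarity):
 * nice -20 (prio 100): SCALE_PRIO(4 * 100ms, 100) = 400ms * 40 / 20 = 800ms
 * nice 0 (prio 120): SCALE_PRIO(100ms, 120) = 100ms * 20 / 20 = 100ms
 * nice +19 (prio 139): special-cased above to return a single jiffy
 * so the slice shrinks roughly linearly with static priority, the
 * negative-nice half of the range using a 4x larger base slice.
 */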
145
Ingo Molnare05606d2007-07-09 18:51:59 +0200146static inline int rt_policy(int policy)
147{
148 if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR))
149 return 1;
150 return 0;
151}
152
153static inline int task_has_rt_policy(struct task_struct *p)
154{
155 return rt_policy(p->policy);
156}
157
Linus Torvalds1da177e2005-04-16 15:20:36 -0700158/*
Ingo Molnar6aa645e2007-07-09 18:51:58 +0200159 * This is the priority-queue data structure of the RT scheduling class:
Linus Torvalds1da177e2005-04-16 15:20:36 -0700160 */
Ingo Molnar6aa645e2007-07-09 18:51:58 +0200161struct rt_prio_array {
162 DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
163 struct list_head queue[MAX_RT_PRIO];
164};
Linus Torvalds1da177e2005-04-16 15:20:36 -0700165
Ingo Molnar6aa645e2007-07-09 18:51:58 +0200166struct load_stat {
167 struct load_weight load;
168 u64 load_update_start, load_update_last;
169 unsigned long delta_fair, delta_exec, delta_stat;
170};
171
172/* CFS-related fields in a runqueue */
173struct cfs_rq {
174 struct load_weight load;
175 unsigned long nr_running;
176
177 s64 fair_clock;
178 u64 exec_clock;
179 s64 wait_runtime;
180 u64 sleeper_bonus;
181 unsigned long wait_runtime_overruns, wait_runtime_underruns;
182
183 struct rb_root tasks_timeline;
184 struct rb_node *rb_leftmost;
185 struct rb_node *rb_load_balance_curr;
186#ifdef CONFIG_FAIR_GROUP_SCHED
 187 /* 'curr' points to the currently running entity on this cfs_rq.
 188 * It is set to NULL otherwise (i.e. when none are currently running).
189 */
190 struct sched_entity *curr;
191 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
192
193 /* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
194 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
195 * (like users, containers etc.)
196 *
197 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
198 * list is used during load balance.
199 */
200 struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? */
201#endif
202};
203
204/* Real-Time classes' related field in a runqueue: */
205struct rt_rq {
206 struct rt_prio_array active;
207 int rt_load_balance_idx;
208 struct list_head *rt_load_balance_head, *rt_load_balance_curr;
209};
210
211/*
Linus Torvalds1da177e2005-04-16 15:20:36 -0700212 * This is the main, per-CPU runqueue data structure.
213 *
214 * Locking rule: those places that want to lock multiple runqueues
 215 * (such as the load balancing or the thread migration code) must
 216 * acquire the locks in ascending &runqueue order.
217 */
Ingo Molnar70b97a72006-07-03 00:25:42 -0700218struct rq {
Ingo Molnar6aa645e2007-07-09 18:51:58 +0200219 spinlock_t lock; /* runqueue lock */
Linus Torvalds1da177e2005-04-16 15:20:36 -0700220
221 /*
222 * nr_running and cpu_load should be in the same cacheline because
223 * remote CPUs use both these fields when doing load calculation.
224 */
225 unsigned long nr_running;
Ingo Molnar6aa645e2007-07-09 18:51:58 +0200226 #define CPU_LOAD_IDX_MAX 5
227 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
Siddha, Suresh Bbdecea32007-05-08 00:32:48 -0700228 unsigned char idle_at_tick;
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -0700229#ifdef CONFIG_NO_HZ
230 unsigned char in_nohz_recently;
231#endif
Ingo Molnar6aa645e2007-07-09 18:51:58 +0200232 struct load_stat ls; /* capture load from *all* tasks on this cpu */
233 unsigned long nr_load_updates;
234 u64 nr_switches;
235
236 struct cfs_rq cfs;
237#ifdef CONFIG_FAIR_GROUP_SCHED
238 struct list_head leaf_cfs_rq_list; /* list of leaf cfs_rq on this cpu */
Linus Torvalds1da177e2005-04-16 15:20:36 -0700239#endif
Ingo Molnar6aa645e2007-07-09 18:51:58 +0200240 struct rt_rq rt;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700241
242 /*
243 * This is part of a global counter where only the total sum
244 * over all CPUs matters. A task can increase this counter on
245 * one CPU and if it got migrated afterwards it may decrease
246 * it on another CPU. Always updated under the runqueue lock:
247 */
248 unsigned long nr_uninterruptible;
249
Ingo Molnar36c8b582006-07-03 00:25:41 -0700250 struct task_struct *curr, *idle;
Christoph Lameterc9819f42006-12-10 02:20:25 -0800251 unsigned long next_balance;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700252 struct mm_struct *prev_mm;
Ingo Molnar6aa645e2007-07-09 18:51:58 +0200253
Ingo Molnar6aa645e2007-07-09 18:51:58 +0200254 u64 clock, prev_clock_raw;
255 s64 clock_max_delta;
256
257 unsigned int clock_warps, clock_overflows;
258 unsigned int clock_unstable_events;
259
260 struct sched_class *load_balance_class;
261
Linus Torvalds1da177e2005-04-16 15:20:36 -0700262 atomic_t nr_iowait;
263
264#ifdef CONFIG_SMP
265 struct sched_domain *sd;
266
267 /* For active balancing */
268 int active_balance;
269 int push_cpu;
Christoph Lameter0a2966b2006-09-25 23:30:51 -0700270 int cpu; /* cpu of this runqueue */
Linus Torvalds1da177e2005-04-16 15:20:36 -0700271
Ingo Molnar36c8b582006-07-03 00:25:41 -0700272 struct task_struct *migration_thread;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700273 struct list_head migration_queue;
274#endif
275
276#ifdef CONFIG_SCHEDSTATS
277 /* latency stats */
278 struct sched_info rq_sched_info;
279
280 /* sys_sched_yield() stats */
281 unsigned long yld_exp_empty;
282 unsigned long yld_act_empty;
283 unsigned long yld_both_empty;
284 unsigned long yld_cnt;
285
286 /* schedule() stats */
287 unsigned long sched_switch;
288 unsigned long sched_cnt;
289 unsigned long sched_goidle;
290
291 /* try_to_wake_up() stats */
292 unsigned long ttwu_cnt;
293 unsigned long ttwu_local;
294#endif
Ingo Molnarfcb99372006-07-03 00:25:10 -0700295 struct lock_class_key rq_lock_key;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700296};
297
Siddha, Suresh Bc3396622007-05-08 00:33:09 -0700298static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp;
Gautham R Shenoy5be93612007-05-09 02:34:04 -0700299static DEFINE_MUTEX(sched_hotcpu_mutex);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700300
Ingo Molnardd41f592007-07-09 18:51:59 +0200301static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
302{
303 rq->curr->sched_class->check_preempt_curr(rq, p);
304}
305
Christoph Lameter0a2966b2006-09-25 23:30:51 -0700306static inline int cpu_of(struct rq *rq)
307{
308#ifdef CONFIG_SMP
309 return rq->cpu;
310#else
311 return 0;
312#endif
313}
314
Nick Piggin674311d2005-06-25 14:57:27 -0700315/*
Ingo Molnar20d315d2007-07-09 18:51:58 +0200316 * Per-runqueue clock, as fine-grained as the platform can give us:
317 */
318static unsigned long long __rq_clock(struct rq *rq)
319{
320 u64 prev_raw = rq->prev_clock_raw;
321 u64 now = sched_clock();
322 s64 delta = now - prev_raw;
323 u64 clock = rq->clock;
324
325 /*
326 * Protect against sched_clock() occasionally going backwards:
327 */
328 if (unlikely(delta < 0)) {
329 clock++;
330 rq->clock_warps++;
331 } else {
332 /*
333 * Catch too large forward jumps too:
334 */
335 if (unlikely(delta > 2*TICK_NSEC)) {
336 clock++;
337 rq->clock_overflows++;
338 } else {
339 if (unlikely(delta > rq->clock_max_delta))
340 rq->clock_max_delta = delta;
341 clock += delta;
342 }
343 }
344
345 rq->prev_clock_raw = now;
346 rq->clock = clock;
347
348 return clock;
349}
350
351static inline unsigned long long rq_clock(struct rq *rq)
352{
353 int this_cpu = smp_processor_id();
354
355 if (this_cpu == cpu_of(rq))
356 return __rq_clock(rq);
357
358 return rq->clock;
359}
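/*
 * Illustrative behaviour of the clamping above: whenever sched_clock()
 * appears to go backwards, rq->clock still advances by 1ns and clock_warps
 * is incremented; a forward jump of more than two ticks is likewise
 * replaced by a 1ns step and counted in clock_overflows. Only deltas in
 * [0, 2*TICK_NSEC] are passed through unmodified, with the largest one
 * remembered in clock_max_delta.
 */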
360
361/*
Nick Piggin674311d2005-06-25 14:57:27 -0700362 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -0700363 * See detach_destroy_domains: synchronize_sched for details.
Nick Piggin674311d2005-06-25 14:57:27 -0700364 *
365 * The domain tree of any CPU may only be accessed from within
366 * preempt-disabled sections.
367 */
Ingo Molnar48f24c42006-07-03 00:25:40 -0700368#define for_each_domain(cpu, __sd) \
369 for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700370
371#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
372#define this_rq() (&__get_cpu_var(runqueues))
373#define task_rq(p) cpu_rq(task_cpu(p))
374#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
375
Ingo Molnar138a8ae2007-07-09 18:51:58 +0200376#ifdef CONFIG_FAIR_GROUP_SCHED
377/* Change a task's ->cfs_rq if it moves across CPUs */
378static inline void set_task_cfs_rq(struct task_struct *p)
379{
380 p->se.cfs_rq = &task_rq(p)->cfs;
381}
382#else
383static inline void set_task_cfs_rq(struct task_struct *p)
384{
385}
386#endif
387
Linus Torvalds1da177e2005-04-16 15:20:36 -0700388#ifndef prepare_arch_switch
Nick Piggin4866cde2005-06-25 14:57:23 -0700389# define prepare_arch_switch(next) do { } while (0)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700390#endif
Nick Piggin4866cde2005-06-25 14:57:23 -0700391#ifndef finish_arch_switch
392# define finish_arch_switch(prev) do { } while (0)
393#endif
394
395#ifndef __ARCH_WANT_UNLOCKED_CTXSW
Ingo Molnar70b97a72006-07-03 00:25:42 -0700396static inline int task_running(struct rq *rq, struct task_struct *p)
Nick Piggin4866cde2005-06-25 14:57:23 -0700397{
398 return rq->curr == p;
399}
400
Ingo Molnar70b97a72006-07-03 00:25:42 -0700401static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
Nick Piggin4866cde2005-06-25 14:57:23 -0700402{
403}
404
Ingo Molnar70b97a72006-07-03 00:25:42 -0700405static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
Nick Piggin4866cde2005-06-25 14:57:23 -0700406{
Ingo Molnarda04c032005-09-13 11:17:59 +0200407#ifdef CONFIG_DEBUG_SPINLOCK
408 /* this is a valid case when another task releases the spinlock */
409 rq->lock.owner = current;
410#endif
Ingo Molnar8a25d5d2006-07-03 00:24:54 -0700411 /*
412 * If we are tracking spinlock dependencies then we have to
413 * fix up the runqueue lock - which gets 'carried over' from
414 * prev into current:
415 */
416 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
417
Nick Piggin4866cde2005-06-25 14:57:23 -0700418 spin_unlock_irq(&rq->lock);
419}
420
421#else /* __ARCH_WANT_UNLOCKED_CTXSW */
Ingo Molnar70b97a72006-07-03 00:25:42 -0700422static inline int task_running(struct rq *rq, struct task_struct *p)
Nick Piggin4866cde2005-06-25 14:57:23 -0700423{
424#ifdef CONFIG_SMP
425 return p->oncpu;
426#else
427 return rq->curr == p;
428#endif
429}
430
Ingo Molnar70b97a72006-07-03 00:25:42 -0700431static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
Nick Piggin4866cde2005-06-25 14:57:23 -0700432{
433#ifdef CONFIG_SMP
434 /*
435 * We can optimise this out completely for !SMP, because the
436 * SMP rebalancing from interrupt is the only thing that cares
437 * here.
438 */
439 next->oncpu = 1;
440#endif
441#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
442 spin_unlock_irq(&rq->lock);
443#else
444 spin_unlock(&rq->lock);
445#endif
446}
447
Ingo Molnar70b97a72006-07-03 00:25:42 -0700448static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
Nick Piggin4866cde2005-06-25 14:57:23 -0700449{
450#ifdef CONFIG_SMP
451 /*
452 * After ->oncpu is cleared, the task can be moved to a different CPU.
453 * We must ensure this doesn't happen until the switch is completely
454 * finished.
455 */
456 smp_wmb();
457 prev->oncpu = 0;
458#endif
459#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
460 local_irq_enable();
461#endif
462}
463#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
Linus Torvalds1da177e2005-04-16 15:20:36 -0700464
465/*
Ingo Molnarb29739f2006-06-27 02:54:51 -0700466 * __task_rq_lock - lock the runqueue a given task resides on.
 467 * Must be called with interrupts disabled.
468 */
Ingo Molnar70b97a72006-07-03 00:25:42 -0700469static inline struct rq *__task_rq_lock(struct task_struct *p)
Ingo Molnarb29739f2006-06-27 02:54:51 -0700470 __acquires(rq->lock)
471{
Ingo Molnar70b97a72006-07-03 00:25:42 -0700472 struct rq *rq;
Ingo Molnarb29739f2006-06-27 02:54:51 -0700473
474repeat_lock_task:
475 rq = task_rq(p);
476 spin_lock(&rq->lock);
477 if (unlikely(rq != task_rq(p))) {
478 spin_unlock(&rq->lock);
479 goto repeat_lock_task;
480 }
481 return rq;
482}
483
484/*
Linus Torvalds1da177e2005-04-16 15:20:36 -0700485 * task_rq_lock - lock the runqueue a given task resides on and disable
486 * interrupts. Note the ordering: we can safely lookup the task_rq without
487 * explicitly disabling preemption.
488 */
Ingo Molnar70b97a72006-07-03 00:25:42 -0700489static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700490 __acquires(rq->lock)
491{
Ingo Molnar70b97a72006-07-03 00:25:42 -0700492 struct rq *rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700493
494repeat_lock_task:
495 local_irq_save(*flags);
496 rq = task_rq(p);
497 spin_lock(&rq->lock);
498 if (unlikely(rq != task_rq(p))) {
499 spin_unlock_irqrestore(&rq->lock, *flags);
500 goto repeat_lock_task;
501 }
502 return rq;
503}
504
Ingo Molnar70b97a72006-07-03 00:25:42 -0700505static inline void __task_rq_unlock(struct rq *rq)
Ingo Molnarb29739f2006-06-27 02:54:51 -0700506 __releases(rq->lock)
507{
508 spin_unlock(&rq->lock);
509}
510
Ingo Molnar70b97a72006-07-03 00:25:42 -0700511static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700512 __releases(rq->lock)
513{
514 spin_unlock_irqrestore(&rq->lock, *flags);
515}
516
Linus Torvalds1da177e2005-04-16 15:20:36 -0700517/*
Robert P. J. Daycc2a73b2006-12-10 02:20:00 -0800518 * this_rq_lock - lock this runqueue and disable interrupts.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700519 */
Ingo Molnar70b97a72006-07-03 00:25:42 -0700520static inline struct rq *this_rq_lock(void)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700521 __acquires(rq->lock)
522{
Ingo Molnar70b97a72006-07-03 00:25:42 -0700523 struct rq *rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700524
525 local_irq_disable();
526 rq = this_rq();
527 spin_lock(&rq->lock);
528
529 return rq;
530}
531
Ingo Molnarc24d20d2007-07-09 18:51:59 +0200532/*
Ingo Molnar1b9f19c2007-07-09 18:51:59 +0200533 * CPU frequency is/was unstable - start anew by setting prev_clock_raw:
534 */
535void sched_clock_unstable_event(void)
536{
537 unsigned long flags;
538 struct rq *rq;
539
540 rq = task_rq_lock(current, &flags);
541 rq->prev_clock_raw = sched_clock();
542 rq->clock_unstable_events++;
543 task_rq_unlock(rq, &flags);
544}
545
546/*
Ingo Molnarc24d20d2007-07-09 18:51:59 +0200547 * resched_task - mark a task 'to be rescheduled now'.
548 *
549 * On UP this means the setting of the need_resched flag, on SMP it
550 * might also involve a cross-CPU call to trigger the scheduler on
551 * the target CPU.
552 */
553#ifdef CONFIG_SMP
554
555#ifndef tsk_is_polling
556#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
557#endif
558
559static void resched_task(struct task_struct *p)
560{
561 int cpu;
562
563 assert_spin_locked(&task_rq(p)->lock);
564
565 if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
566 return;
567
568 set_tsk_thread_flag(p, TIF_NEED_RESCHED);
569
570 cpu = task_cpu(p);
571 if (cpu == smp_processor_id())
572 return;
573
574 /* NEED_RESCHED must be visible before we test polling */
575 smp_mb();
576 if (!tsk_is_polling(p))
577 smp_send_reschedule(cpu);
578}
579
580static void resched_cpu(int cpu)
581{
582 struct rq *rq = cpu_rq(cpu);
583 unsigned long flags;
584
585 if (!spin_trylock_irqsave(&rq->lock, flags))
586 return;
587 resched_task(cpu_curr(cpu));
588 spin_unlock_irqrestore(&rq->lock, flags);
589}
590#else
591static inline void resched_task(struct task_struct *p)
592{
593 assert_spin_locked(&task_rq(p)->lock);
594 set_tsk_need_resched(p);
595}
596#endif
597
Ingo Molnar45bf76d2007-07-09 18:51:59 +0200598static u64 div64_likely32(u64 dividend, unsigned long divisor)
 599{
 600#if BITS_PER_LONG == 32
 601 if (likely(dividend <= 0xffffffffULL))
 602 return (u32)dividend / divisor;
 603 do_div(dividend, divisor);
 604
 605 return dividend;
 606#else
 607 return dividend / divisor;
608#endif
609}
610
611#if BITS_PER_LONG == 32
612# define WMULT_CONST (~0UL)
613#else
614# define WMULT_CONST (1UL << 32)
615#endif
616
617#define WMULT_SHIFT 32
618
619static inline unsigned long
620calc_delta_mine(unsigned long delta_exec, unsigned long weight,
621 struct load_weight *lw)
622{
623 u64 tmp;
624
625 if (unlikely(!lw->inv_weight))
626 lw->inv_weight = WMULT_CONST / lw->weight;
627
628 tmp = (u64)delta_exec * weight;
629 /*
630 * Check whether we'd overflow the 64-bit multiplication:
631 */
632 if (unlikely(tmp > WMULT_CONST)) {
633 tmp = ((tmp >> WMULT_SHIFT/2) * lw->inv_weight)
634 >> (WMULT_SHIFT/2);
635 } else {
636 tmp = (tmp * lw->inv_weight) >> WMULT_SHIFT;
637 }
638
639 return (unsigned long)min(tmp, (u64)sysctl_sched_runtime_limit);
640}
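/*
 * Worked example of the fixed-point math above (WMULT_SHIFT == 32, and
 * NICE_0_LOAD == 1024 with the usual SCHED_LOAD_SHIFT of 10): for a queue
 * load of lw->weight == 2048, lw->inv_weight ~= 2^32 / 2048 == 2097152;
 * scaling delta_exec == 1000000ns at weight == NICE_0_LOAD then yields
 * (1000000 * 1024 * 2097152) >> 32 ~= 500000, i.e.
 * delta_exec * weight / lw->weight, capped at sysctl_sched_runtime_limit.
 */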
641
642static inline unsigned long
643calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
644{
645 return calc_delta_mine(delta_exec, NICE_0_LOAD, lw);
646}
647
648static void update_load_add(struct load_weight *lw, unsigned long inc)
649{
650 lw->weight += inc;
651 lw->inv_weight = 0;
652}
653
654static void update_load_sub(struct load_weight *lw, unsigned long dec)
655{
656 lw->weight -= dec;
657 lw->inv_weight = 0;
658}
659
660static void __update_curr_load(struct rq *rq, struct load_stat *ls)
661{
662 if (rq->curr != rq->idle && ls->load.weight) {
663 ls->delta_exec += ls->delta_stat;
664 ls->delta_fair += calc_delta_fair(ls->delta_stat, &ls->load);
665 ls->delta_stat = 0;
666 }
667}
668
669/*
670 * Update delta_exec, delta_fair fields for rq.
671 *
672 * delta_fair clock advances at a rate inversely proportional to
673 * total load (rq->ls.load.weight) on the runqueue, while
674 * delta_exec advances at the same rate as wall-clock (provided
675 * cpu is not idle).
676 *
 677 * delta_exec / delta_fair is a measure of the (smoothed) load on this
 678 * runqueue over any given interval. This (smoothed) load is used
679 * during load balance.
680 *
681 * This function is called /before/ updating rq->ls.load
682 * and when switching tasks.
683 */
684static void update_curr_load(struct rq *rq, u64 now)
685{
686 struct load_stat *ls = &rq->ls;
687 u64 start;
688
689 start = ls->load_update_start;
690 ls->load_update_start = now;
691 ls->delta_stat += now - start;
692 /*
693 * Stagger updates to ls->delta_fair. Very frequent updates
694 * can be expensive.
695 */
696 if (ls->delta_stat >= sysctl_sched_stat_granularity)
697 __update_curr_load(rq, ls);
698}
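/*
 * Example of how delta_exec/delta_fair acts as a load measure: while the
 * runqueue load weight equals 2*NICE_0_LOAD, calc_delta_fair() above scales
 * each delta_stat down by a factor of two, so over that interval delta_fair
 * advances at half the rate of delta_exec and the ratio
 * delta_exec / delta_fair ~= 2 - i.e. "two nice-0 tasks worth" of load.
 */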
699
Linus Torvalds1da177e2005-04-16 15:20:36 -0700700/*
Peter Williams2dd73a42006-06-27 02:54:34 -0700701 * To aid in avoiding the subversion of "niceness" due to uneven distribution
 702 * of tasks with abnormal "nice" values across CPUs, the contribution that
703 * each task makes to its run queue's load is weighted according to its
704 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
705 * scaled version of the new time slice allocation that they receive on time
706 * slice expiry etc.
707 */
708
709/*
710 * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE
711 * If static_prio_timeslice() is ever changed to break this assumption then
712 * this code will need modification
713 */
714#define TIME_SLICE_NICE_ZERO DEF_TIMESLICE
Ingo Molnardd41f592007-07-09 18:51:59 +0200715#define load_weight(lp) \
Peter Williams2dd73a42006-06-27 02:54:34 -0700716 (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)
717#define PRIO_TO_LOAD_WEIGHT(prio) \
Ingo Molnardd41f592007-07-09 18:51:59 +0200718 load_weight(static_prio_timeslice(prio))
Peter Williams2dd73a42006-06-27 02:54:34 -0700719#define RTPRIO_TO_LOAD_WEIGHT(rp) \
Ingo Molnardd41f592007-07-09 18:51:59 +0200720 (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + load_weight(rp))
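/*
 * Worked example (assuming SCHED_LOAD_SCALE == 1024): since
 * static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE ==
 * TIME_SLICE_NICE_ZERO, PRIO_TO_LOAD_WEIGHT(NICE_TO_PRIO(0)) evaluates to
 * exactly SCHED_LOAD_SCALE, i.e. a nice-0 task contributes one "unit" of
 * load under this weighting scheme.
 */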
721
722#define WEIGHT_IDLEPRIO 2
723#define WMULT_IDLEPRIO (1 << 31)
724
725/*
726 * Nice levels are multiplicative, with a gentle 10% change for every
727 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
728 * nice 1, it will get ~10% less CPU time than another CPU-bound task
729 * that remained on nice 0.
730 *
731 * The "10% effect" is relative and cumulative: from _any_ nice level,
732 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
733 * it's +10% CPU usage.
734 */
735static const int prio_to_weight[40] = {
736/* -20 */ 88818, 71054, 56843, 45475, 36380, 29104, 23283, 18626, 14901, 11921,
737/* -10 */ 9537, 7629, 6103, 4883, 3906, 3125, 2500, 2000, 1600, 1280,
738/* 0 */ NICE_0_LOAD /* 1024 */,
739/* 1 */ 819, 655, 524, 419, 336, 268, 215, 172, 137,
740/* 10 */ 110, 87, 70, 56, 45, 36, 29, 23, 18, 15,
741};
742
743static const u32 prio_to_wmult[40] = {
744 48356, 60446, 75558, 94446, 118058, 147573,
745 184467, 230589, 288233, 360285, 450347,
746 562979, 703746, 879575, 1099582, 1374389,
747 717986, 2147483, 2684354, 3355443, 4194304,
748 244160, 6557201, 8196502, 10250518, 12782640,
749 16025997, 19976592, 24970740, 31350126, 39045157,
750 49367440, 61356675, 76695844, 95443717, 119304647,
751 148102320, 186737708, 238609294, 286331153,
752};
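/*
 * Relationship between the two tables above (illustrative):
 * prio_to_weight[] steps by a factor of ~1.25 per nice level, so two
 * CPU-bound tasks at nice 0 and nice 1 split the CPU roughly
 * 1024/(1024+819) ~= 55% vs 45% - the "10% effect". prio_to_wmult[] caches
 * the corresponding 2^32/weight values (e.g. 2^32/1024 == 4194304) so that
 * calc_delta_mine() can avoid a 64-bit division in the hot path.
 */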
Peter Williams2dd73a42006-06-27 02:54:34 -0700753
Ingo Molnar36c8b582006-07-03 00:25:41 -0700754static inline void
Ingo Molnardd41f592007-07-09 18:51:59 +0200755inc_load(struct rq *rq, const struct task_struct *p, u64 now)
Peter Williams2dd73a42006-06-27 02:54:34 -0700756{
Ingo Molnardd41f592007-07-09 18:51:59 +0200757 update_curr_load(rq, now);
758 update_load_add(&rq->ls.load, p->se.load.weight);
Peter Williams2dd73a42006-06-27 02:54:34 -0700759}
760
Ingo Molnar36c8b582006-07-03 00:25:41 -0700761static inline void
Ingo Molnardd41f592007-07-09 18:51:59 +0200762dec_load(struct rq *rq, const struct task_struct *p, u64 now)
Peter Williams2dd73a42006-06-27 02:54:34 -0700763{
Ingo Molnardd41f592007-07-09 18:51:59 +0200764 update_curr_load(rq, now);
765 update_load_sub(&rq->ls.load, p->se.load.weight);
Peter Williams2dd73a42006-06-27 02:54:34 -0700766}
767
Ingo Molnardd41f592007-07-09 18:51:59 +0200768static inline void inc_nr_running(struct task_struct *p, struct rq *rq, u64 now)
Peter Williams2dd73a42006-06-27 02:54:34 -0700769{
770 rq->nr_running++;
Ingo Molnardd41f592007-07-09 18:51:59 +0200771 inc_load(rq, p, now);
Peter Williams2dd73a42006-06-27 02:54:34 -0700772}
773
Ingo Molnardd41f592007-07-09 18:51:59 +0200774static inline void dec_nr_running(struct task_struct *p, struct rq *rq, u64 now)
Peter Williams2dd73a42006-06-27 02:54:34 -0700775{
776 rq->nr_running--;
Ingo Molnardd41f592007-07-09 18:51:59 +0200777 dec_load(rq, p, now);
Peter Williams2dd73a42006-06-27 02:54:34 -0700778}
779
Ingo Molnardd41f592007-07-09 18:51:59 +0200780static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
781
782/*
783 * runqueue iterator, to support SMP load-balancing between different
784 * scheduling classes, without having to expose their internal data
785 * structures to the load-balancing proper:
786 */
787struct rq_iterator {
788 void *arg;
789 struct task_struct *(*start)(void *);
790 struct task_struct *(*next)(void *);
791};
792
793static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
794 unsigned long max_nr_move, unsigned long max_load_move,
795 struct sched_domain *sd, enum cpu_idle_type idle,
796 int *all_pinned, unsigned long *load_moved,
797 int this_best_prio, int best_prio, int best_prio_seen,
798 struct rq_iterator *iterator);
799
800#include "sched_stats.h"
801#include "sched_rt.c"
802#include "sched_fair.c"
803#include "sched_idletask.c"
804#ifdef CONFIG_SCHED_DEBUG
805# include "sched_debug.c"
806#endif
807
808#define sched_class_highest (&rt_sched_class)
809
Ingo Molnar45bf76d2007-07-09 18:51:59 +0200810static void set_load_weight(struct task_struct *p)
811{
Ingo Molnardd41f592007-07-09 18:51:59 +0200812 task_rq(p)->cfs.wait_runtime -= p->se.wait_runtime;
813 p->se.wait_runtime = 0;
814
Ingo Molnar45bf76d2007-07-09 18:51:59 +0200815 if (task_has_rt_policy(p)) {
Ingo Molnardd41f592007-07-09 18:51:59 +0200816 p->se.load.weight = prio_to_weight[0] * 2;
817 p->se.load.inv_weight = prio_to_wmult[0] >> 1;
818 return;
819 }
820
821 /*
822 * SCHED_IDLE tasks get minimal weight:
823 */
824 if (p->policy == SCHED_IDLE) {
825 p->se.load.weight = WEIGHT_IDLEPRIO;
826 p->se.load.inv_weight = WMULT_IDLEPRIO;
827 return;
828 }
829
830 p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO];
831 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
Ingo Molnar45bf76d2007-07-09 18:51:59 +0200832}
833
Ingo Molnardd41f592007-07-09 18:51:59 +0200834static void
835enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, u64 now)
Ingo Molnar71f8bd42007-07-09 18:51:59 +0200836{
837 sched_info_queued(p);
Ingo Molnardd41f592007-07-09 18:51:59 +0200838 p->sched_class->enqueue_task(rq, p, wakeup, now);
839 p->se.on_rq = 1;
840}
841
842static void
843dequeue_task(struct rq *rq, struct task_struct *p, int sleep, u64 now)
844{
845 p->sched_class->dequeue_task(rq, p, sleep, now);
846 p->se.on_rq = 0;
Ingo Molnar71f8bd42007-07-09 18:51:59 +0200847}
848
849/*
Ingo Molnardd41f592007-07-09 18:51:59 +0200850 * __normal_prio - return the priority that is based on the static prio
Ingo Molnar71f8bd42007-07-09 18:51:59 +0200851 */
Ingo Molnar14531182007-07-09 18:51:59 +0200852static inline int __normal_prio(struct task_struct *p)
853{
Ingo Molnardd41f592007-07-09 18:51:59 +0200854 return p->static_prio;
Ingo Molnar14531182007-07-09 18:51:59 +0200855}
856
857/*
Ingo Molnarb29739f2006-06-27 02:54:51 -0700858 * Calculate the expected normal priority: i.e. priority
859 * without taking RT-inheritance into account. Might be
860 * boosted by interactivity modifiers. Changes upon fork,
861 * setprio syscalls, and whenever the interactivity
862 * estimator recalculates.
863 */
Ingo Molnar36c8b582006-07-03 00:25:41 -0700864static inline int normal_prio(struct task_struct *p)
Ingo Molnarb29739f2006-06-27 02:54:51 -0700865{
866 int prio;
867
Ingo Molnare05606d2007-07-09 18:51:59 +0200868 if (task_has_rt_policy(p))
Ingo Molnarb29739f2006-06-27 02:54:51 -0700869 prio = MAX_RT_PRIO-1 - p->rt_priority;
870 else
871 prio = __normal_prio(p);
872 return prio;
873}
874
875/*
876 * Calculate the current priority, i.e. the priority
877 * taken into account by the scheduler. This value might
878 * be boosted by RT tasks, or might be boosted by
879 * interactivity modifiers. Will be RT if the task got
880 * RT-boosted. If not then it returns p->normal_prio.
881 */
Ingo Molnar36c8b582006-07-03 00:25:41 -0700882static int effective_prio(struct task_struct *p)
Ingo Molnarb29739f2006-06-27 02:54:51 -0700883{
884 p->normal_prio = normal_prio(p);
885 /*
886 * If we are RT tasks or we were boosted to RT priority,
887 * keep the priority unchanged. Otherwise, update priority
888 * to the normal priority:
889 */
890 if (!rt_prio(p->prio))
891 return p->normal_prio;
892 return p->prio;
893}
894
895/*
Ingo Molnardd41f592007-07-09 18:51:59 +0200896 * activate_task - move a task to the runqueue.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700897 */
Ingo Molnardd41f592007-07-09 18:51:59 +0200898static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700899{
Ingo Molnardd41f592007-07-09 18:51:59 +0200900 u64 now = rq_clock(rq);
Con Kolivasd425b272006-03-31 02:31:29 -0800901
Ingo Molnardd41f592007-07-09 18:51:59 +0200902 if (p->state == TASK_UNINTERRUPTIBLE)
903 rq->nr_uninterruptible--;
904
905 enqueue_task(rq, p, wakeup, now);
906 inc_nr_running(p, rq, now);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700907}
908
909/*
Ingo Molnardd41f592007-07-09 18:51:59 +0200910 * activate_idle_task - move idle task to the _front_ of runqueue.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700911 */
Ingo Molnardd41f592007-07-09 18:51:59 +0200912static inline void activate_idle_task(struct task_struct *p, struct rq *rq)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700913{
Ingo Molnardd41f592007-07-09 18:51:59 +0200914 u64 now = rq_clock(rq);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700915
Ingo Molnardd41f592007-07-09 18:51:59 +0200916 if (p->state == TASK_UNINTERRUPTIBLE)
917 rq->nr_uninterruptible--;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700918
Ingo Molnardd41f592007-07-09 18:51:59 +0200919 enqueue_task(rq, p, 0, now);
920 inc_nr_running(p, rq, now);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700921}
922
923/*
924 * deactivate_task - remove a task from the runqueue.
925 */
Ingo Molnardd41f592007-07-09 18:51:59 +0200926static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700927{
Ingo Molnardd41f592007-07-09 18:51:59 +0200928 u64 now = rq_clock(rq);
929
930 if (p->state == TASK_UNINTERRUPTIBLE)
931 rq->nr_uninterruptible++;
932
933 dequeue_task(rq, p, sleep, now);
934 dec_nr_running(p, rq, now);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700935}
936
Linus Torvalds1da177e2005-04-16 15:20:36 -0700937/**
938 * task_curr - is this task currently executing on a CPU?
939 * @p: the task in question.
940 */
Ingo Molnar36c8b582006-07-03 00:25:41 -0700941inline int task_curr(const struct task_struct *p)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700942{
943 return cpu_curr(task_cpu(p)) == p;
944}
945
Peter Williams2dd73a42006-06-27 02:54:34 -0700946/* Used instead of source_load when we know the type == 0 */
947unsigned long weighted_cpuload(const int cpu)
948{
Ingo Molnardd41f592007-07-09 18:51:59 +0200949 return cpu_rq(cpu)->ls.load.weight;
950}
951
952static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
953{
954#ifdef CONFIG_SMP
955 task_thread_info(p)->cpu = cpu;
956 set_task_cfs_rq(p);
957#endif
Peter Williams2dd73a42006-06-27 02:54:34 -0700958}
959
Linus Torvalds1da177e2005-04-16 15:20:36 -0700960#ifdef CONFIG_SMP
Ingo Molnarc65cc872007-07-09 18:51:58 +0200961
Ingo Molnardd41f592007-07-09 18:51:59 +0200962void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
Ingo Molnarc65cc872007-07-09 18:51:58 +0200963{
Ingo Molnardd41f592007-07-09 18:51:59 +0200964 int old_cpu = task_cpu(p);
965 struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);
966 u64 clock_offset, fair_clock_offset;
967
968 clock_offset = old_rq->clock - new_rq->clock;
969 fair_clock_offset = old_rq->cfs.fair_clock -
970 new_rq->cfs.fair_clock;
971 if (p->se.wait_start)
972 p->se.wait_start -= clock_offset;
973 if (p->se.wait_start_fair)
974 p->se.wait_start_fair -= fair_clock_offset;
975 if (p->se.sleep_start)
976 p->se.sleep_start -= clock_offset;
977 if (p->se.block_start)
978 p->se.block_start -= clock_offset;
979 if (p->se.sleep_start_fair)
980 p->se.sleep_start_fair -= fair_clock_offset;
981
982 __set_task_cpu(p, new_cpu);
Ingo Molnarc65cc872007-07-09 18:51:58 +0200983}
984
Ingo Molnar70b97a72006-07-03 00:25:42 -0700985struct migration_req {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700986 struct list_head list;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700987
Ingo Molnar36c8b582006-07-03 00:25:41 -0700988 struct task_struct *task;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700989 int dest_cpu;
990
Linus Torvalds1da177e2005-04-16 15:20:36 -0700991 struct completion done;
Ingo Molnar70b97a72006-07-03 00:25:42 -0700992};
Linus Torvalds1da177e2005-04-16 15:20:36 -0700993
994/*
995 * The task's runqueue lock must be held.
996 * Returns true if you have to wait for migration thread.
997 */
Ingo Molnar36c8b582006-07-03 00:25:41 -0700998static int
Ingo Molnar70b97a72006-07-03 00:25:42 -0700999migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001000{
Ingo Molnar70b97a72006-07-03 00:25:42 -07001001 struct rq *rq = task_rq(p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001002
1003 /*
1004 * If the task is not on a runqueue (and not running), then
1005 * it is sufficient to simply update the task's cpu field.
1006 */
Ingo Molnardd41f592007-07-09 18:51:59 +02001007 if (!p->se.on_rq && !task_running(rq, p)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001008 set_task_cpu(p, dest_cpu);
1009 return 0;
1010 }
1011
1012 init_completion(&req->done);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001013 req->task = p;
1014 req->dest_cpu = dest_cpu;
1015 list_add(&req->list, &rq->migration_queue);
Ingo Molnar48f24c42006-07-03 00:25:40 -07001016
Linus Torvalds1da177e2005-04-16 15:20:36 -07001017 return 1;
1018}
1019
1020/*
1021 * wait_task_inactive - wait for a thread to unschedule.
1022 *
1023 * The caller must ensure that the task *will* unschedule sometime soon,
1024 * else this function might spin for a *long* time. This function can't
1025 * be called with interrupts off, or it may introduce deadlock with
1026 * smp_call_function() if an IPI is sent by the same process we are
1027 * waiting to become inactive.
1028 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07001029void wait_task_inactive(struct task_struct *p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001030{
1031 unsigned long flags;
Ingo Molnardd41f592007-07-09 18:51:59 +02001032 int running, on_rq;
Ingo Molnar70b97a72006-07-03 00:25:42 -07001033 struct rq *rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001034
1035repeat:
Linus Torvaldsfa490cf2007-06-18 09:34:40 -07001036 /*
1037 * We do the initial early heuristics without holding
1038 * any task-queue locks at all. We'll only try to get
1039 * the runqueue lock when things look like they will
1040 * work out!
1041 */
1042 rq = task_rq(p);
1043
1044 /*
1045 * If the task is actively running on another CPU
1046 * still, just relax and busy-wait without holding
1047 * any locks.
1048 *
1049 * NOTE! Since we don't hold any locks, it's not
1050 * even sure that "rq" stays as the right runqueue!
1051 * But we don't care, since "task_running()" will
1052 * return false if the runqueue has changed and p
1053 * is actually now running somewhere else!
1054 */
1055 while (task_running(rq, p))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001056 cpu_relax();
Linus Torvaldsfa490cf2007-06-18 09:34:40 -07001057
1058 /*
1059 * Ok, time to look more closely! We need the rq
1060 * lock now, to be *sure*. If we're wrong, we'll
1061 * just go back and repeat.
1062 */
1063 rq = task_rq_lock(p, &flags);
1064 running = task_running(rq, p);
Ingo Molnardd41f592007-07-09 18:51:59 +02001065 on_rq = p->se.on_rq;
Linus Torvaldsfa490cf2007-06-18 09:34:40 -07001066 task_rq_unlock(rq, &flags);
1067
1068 /*
1069 * Was it really running after all now that we
1070 * checked with the proper locks actually held?
1071 *
1072 * Oops. Go back and try again..
1073 */
1074 if (unlikely(running)) {
1075 cpu_relax();
Linus Torvalds1da177e2005-04-16 15:20:36 -07001076 goto repeat;
1077 }
Linus Torvaldsfa490cf2007-06-18 09:34:40 -07001078
1079 /*
1080 * It's not enough that it's not actively running,
1081 * it must be off the runqueue _entirely_, and not
1082 * preempted!
1083 *
 1084 * So if it was still runnable (but just not actively
1085 * running right now), it's preempted, and we should
1086 * yield - it could be a while.
1087 */
Ingo Molnardd41f592007-07-09 18:51:59 +02001088 if (unlikely(on_rq)) {
Linus Torvaldsfa490cf2007-06-18 09:34:40 -07001089 yield();
1090 goto repeat;
1091 }
1092
1093 /*
1094 * Ahh, all good. It wasn't running, and it wasn't
1095 * runnable, which means that it will never become
1096 * running in the future either. We're all done!
1097 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07001098}
1099
1100/***
1101 * kick_process - kick a running thread to enter/exit the kernel
1102 * @p: the to-be-kicked thread
1103 *
1104 * Cause a process which is running on another CPU to enter
1105 * kernel-mode, without any delay. (to get signals handled.)
1106 *
 1107 * NOTE: this function doesn't have to take the runqueue lock,
1108 * because all it wants to ensure is that the remote task enters
1109 * the kernel. If the IPI races and the task has been migrated
1110 * to another CPU then no harm is done and the purpose has been
1111 * achieved as well.
1112 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07001113void kick_process(struct task_struct *p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001114{
1115 int cpu;
1116
1117 preempt_disable();
1118 cpu = task_cpu(p);
1119 if ((cpu != smp_processor_id()) && task_curr(p))
1120 smp_send_reschedule(cpu);
1121 preempt_enable();
1122}
1123
1124/*
Peter Williams2dd73a42006-06-27 02:54:34 -07001125 * Return a low guess at the load of a migration-source cpu weighted
1126 * according to the scheduling class and "nice" value.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001127 *
1128 * We want to under-estimate the load of migration sources, to
1129 * balance conservatively.
1130 */
Con Kolivasb9104722005-11-08 21:38:55 -08001131static inline unsigned long source_load(int cpu, int type)
1132{
Ingo Molnar70b97a72006-07-03 00:25:42 -07001133 struct rq *rq = cpu_rq(cpu);
Ingo Molnardd41f592007-07-09 18:51:59 +02001134 unsigned long total = weighted_cpuload(cpu);
Nick Piggina2000572006-02-10 01:51:02 -08001135
Peter Williams2dd73a42006-06-27 02:54:34 -07001136 if (type == 0)
Ingo Molnardd41f592007-07-09 18:51:59 +02001137 return total;
Peter Williams2dd73a42006-06-27 02:54:34 -07001138
Ingo Molnardd41f592007-07-09 18:51:59 +02001139 return min(rq->cpu_load[type-1], total);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001140}
1141
1142/*
Peter Williams2dd73a42006-06-27 02:54:34 -07001143 * Return a high guess at the load of a migration-target cpu weighted
1144 * according to the scheduling class and "nice" value.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001145 */
Con Kolivasb9104722005-11-08 21:38:55 -08001146static inline unsigned long target_load(int cpu, int type)
1147{
Ingo Molnar70b97a72006-07-03 00:25:42 -07001148 struct rq *rq = cpu_rq(cpu);
Ingo Molnardd41f592007-07-09 18:51:59 +02001149 unsigned long total = weighted_cpuload(cpu);
Nick Piggina2000572006-02-10 01:51:02 -08001150
Peter Williams2dd73a42006-06-27 02:54:34 -07001151 if (type == 0)
Ingo Molnardd41f592007-07-09 18:51:59 +02001152 return total;
Peter Williams2dd73a42006-06-27 02:54:34 -07001153
Ingo Molnardd41f592007-07-09 18:51:59 +02001154 return max(rq->cpu_load[type-1], total);
Peter Williams2dd73a42006-06-27 02:54:34 -07001155}
1156
1157/*
1158 * Return the average load per task on the cpu's run queue
1159 */
1160static inline unsigned long cpu_avg_load_per_task(int cpu)
1161{
Ingo Molnar70b97a72006-07-03 00:25:42 -07001162 struct rq *rq = cpu_rq(cpu);
Ingo Molnardd41f592007-07-09 18:51:59 +02001163 unsigned long total = weighted_cpuload(cpu);
Peter Williams2dd73a42006-06-27 02:54:34 -07001164 unsigned long n = rq->nr_running;
1165
Ingo Molnardd41f592007-07-09 18:51:59 +02001166 return n ? total / n : SCHED_LOAD_SCALE;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001167}
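/*
 * Example of the source/target asymmetry above: if a cpu's decayed
 * cpu_load[idx-1] is 2048 but its instantaneous weighted load is only 1024,
 * source_load() reports 1024 (the low guess, so we are reluctant to pull
 * from it) while target_load() reports 2048 (the high guess, so we are
 * reluctant to push more work onto it) - both errors bias toward leaving
 * tasks where they are.
 */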
1168
Nick Piggin147cbb42005-06-25 14:57:19 -07001169/*
1170 * find_idlest_group finds and returns the least busy CPU group within the
1171 * domain.
1172 */
1173static struct sched_group *
1174find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
1175{
1176 struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
1177 unsigned long min_load = ULONG_MAX, this_load = 0;
1178 int load_idx = sd->forkexec_idx;
1179 int imbalance = 100 + (sd->imbalance_pct-100)/2;
1180
1181 do {
1182 unsigned long load, avg_load;
1183 int local_group;
1184 int i;
1185
M.Baris Demirayda5a5522005-09-10 00:26:09 -07001186 /* Skip over this group if it has no CPUs allowed */
1187 if (!cpus_intersects(group->cpumask, p->cpus_allowed))
1188 goto nextgroup;
1189
Nick Piggin147cbb42005-06-25 14:57:19 -07001190 local_group = cpu_isset(this_cpu, group->cpumask);
Nick Piggin147cbb42005-06-25 14:57:19 -07001191
1192 /* Tally up the load of all CPUs in the group */
1193 avg_load = 0;
1194
1195 for_each_cpu_mask(i, group->cpumask) {
1196 /* Bias balancing toward cpus of our domain */
1197 if (local_group)
1198 load = source_load(i, load_idx);
1199 else
1200 load = target_load(i, load_idx);
1201
1202 avg_load += load;
1203 }
1204
1205 /* Adjust by relative CPU power of the group */
Eric Dumazet5517d862007-05-08 00:32:57 -07001206 avg_load = sg_div_cpu_power(group,
1207 avg_load * SCHED_LOAD_SCALE);
Nick Piggin147cbb42005-06-25 14:57:19 -07001208
1209 if (local_group) {
1210 this_load = avg_load;
1211 this = group;
1212 } else if (avg_load < min_load) {
1213 min_load = avg_load;
1214 idlest = group;
1215 }
M.Baris Demirayda5a5522005-09-10 00:26:09 -07001216nextgroup:
Nick Piggin147cbb42005-06-25 14:57:19 -07001217 group = group->next;
1218 } while (group != sd->groups);
1219
1220 if (!idlest || 100*this_load < imbalance*min_load)
1221 return NULL;
1222 return idlest;
1223}
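/*
 * Worked example of the final imbalance check above: with a typical
 * imbalance_pct of 125 the local "imbalance" value is 100 + 25/2 = 112, so
 * a remote group is only returned when 100*this_load >= 112*min_load,
 * i.e. when the idlest group carries at least ~11% less load than the
 * local one - small differences are ignored to avoid needless migrations.
 */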
1224
1225/*
Satoru Takeuchi0feaece2006-10-03 01:14:10 -07001226 * find_idlest_cpu - find the idlest cpu among the cpus in group.
Nick Piggin147cbb42005-06-25 14:57:19 -07001227 */
Ingo Molnar95cdf3b2005-09-10 00:26:11 -07001228static int
1229find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
Nick Piggin147cbb42005-06-25 14:57:19 -07001230{
M.Baris Demirayda5a5522005-09-10 00:26:09 -07001231 cpumask_t tmp;
Nick Piggin147cbb42005-06-25 14:57:19 -07001232 unsigned long load, min_load = ULONG_MAX;
1233 int idlest = -1;
1234 int i;
1235
M.Baris Demirayda5a5522005-09-10 00:26:09 -07001236 /* Traverse only the allowed CPUs */
1237 cpus_and(tmp, group->cpumask, p->cpus_allowed);
1238
1239 for_each_cpu_mask(i, tmp) {
Peter Williams2dd73a42006-06-27 02:54:34 -07001240 load = weighted_cpuload(i);
Nick Piggin147cbb42005-06-25 14:57:19 -07001241
1242 if (load < min_load || (load == min_load && i == this_cpu)) {
1243 min_load = load;
1244 idlest = i;
1245 }
1246 }
1247
1248 return idlest;
1249}
1250
Nick Piggin476d1392005-06-25 14:57:29 -07001251/*
1252 * sched_balance_self: balance the current task (running on cpu) in domains
1253 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
1254 * SD_BALANCE_EXEC.
1255 *
1256 * Balance, ie. select the least loaded group.
1257 *
1258 * Returns the target CPU number, or the same CPU if no balancing is needed.
1259 *
1260 * preempt must be disabled.
1261 */
1262static int sched_balance_self(int cpu, int flag)
1263{
1264 struct task_struct *t = current;
1265 struct sched_domain *tmp, *sd = NULL;
Nick Piggin147cbb42005-06-25 14:57:19 -07001266
Chen, Kenneth Wc96d1452006-06-27 02:54:28 -07001267 for_each_domain(cpu, tmp) {
Ingo Molnar9761eea2007-07-09 18:52:00 +02001268 /*
1269 * If power savings logic is enabled for a domain, stop there.
1270 */
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07001271 if (tmp->flags & SD_POWERSAVINGS_BALANCE)
1272 break;
Nick Piggin476d1392005-06-25 14:57:29 -07001273 if (tmp->flags & flag)
1274 sd = tmp;
Chen, Kenneth Wc96d1452006-06-27 02:54:28 -07001275 }
Nick Piggin476d1392005-06-25 14:57:29 -07001276
1277 while (sd) {
1278 cpumask_t span;
1279 struct sched_group *group;
Siddha, Suresh B1a848872006-10-03 01:14:08 -07001280 int new_cpu, weight;
1281
1282 if (!(sd->flags & flag)) {
1283 sd = sd->child;
1284 continue;
1285 }
Nick Piggin476d1392005-06-25 14:57:29 -07001286
1287 span = sd->span;
1288 group = find_idlest_group(sd, t, cpu);
Siddha, Suresh B1a848872006-10-03 01:14:08 -07001289 if (!group) {
1290 sd = sd->child;
1291 continue;
1292 }
Nick Piggin476d1392005-06-25 14:57:29 -07001293
M.Baris Demirayda5a5522005-09-10 00:26:09 -07001294 new_cpu = find_idlest_cpu(group, t, cpu);
Siddha, Suresh B1a848872006-10-03 01:14:08 -07001295 if (new_cpu == -1 || new_cpu == cpu) {
1296 /* Now try balancing at a lower domain level of cpu */
1297 sd = sd->child;
1298 continue;
1299 }
Nick Piggin476d1392005-06-25 14:57:29 -07001300
Siddha, Suresh B1a848872006-10-03 01:14:08 -07001301 /* Now try balancing at a lower domain level of new_cpu */
Nick Piggin476d1392005-06-25 14:57:29 -07001302 cpu = new_cpu;
Nick Piggin476d1392005-06-25 14:57:29 -07001303 sd = NULL;
1304 weight = cpus_weight(span);
1305 for_each_domain(cpu, tmp) {
1306 if (weight <= cpus_weight(tmp->span))
1307 break;
1308 if (tmp->flags & flag)
1309 sd = tmp;
1310 }
1311 /* while loop will break here if sd == NULL */
1312 }
1313
1314 return cpu;
1315}
1316
1317#endif /* CONFIG_SMP */
Linus Torvalds1da177e2005-04-16 15:20:36 -07001318
1319/*
1320 * wake_idle() will wake a task on an idle cpu if task->cpu is
1321 * not idle and an idle cpu is available. The span of cpus to
1322 * search starts with cpus closest then further out as needed,
1323 * so we always favor a closer, idle cpu.
1324 *
1325 * Returns the CPU we should wake onto.
1326 */
1327#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
Ingo Molnar36c8b582006-07-03 00:25:41 -07001328static int wake_idle(int cpu, struct task_struct *p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001329{
1330 cpumask_t tmp;
1331 struct sched_domain *sd;
1332 int i;
1333
Siddha, Suresh B49531982007-05-08 00:33:01 -07001334 /*
1335 * If it is idle, then it is the best cpu to run this task.
1336 *
1337 * This cpu is also the best, if it has more than one task already.
 1338 * Siblings must also be busy (in most cases) as they didn't already
 1339 * pick up the extra load from this cpu and hence we need not check
 1340 * sibling runqueue info. This will avoid the checks and cache miss
 1341 * penalties associated with that.
1342 */
1343 if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001344 return cpu;
1345
1346 for_each_domain(cpu, sd) {
1347 if (sd->flags & SD_WAKE_IDLE) {
Nick Piggine0f364f2005-06-25 14:57:06 -07001348 cpus_and(tmp, sd->span, p->cpus_allowed);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001349 for_each_cpu_mask(i, tmp) {
1350 if (idle_cpu(i))
1351 return i;
1352 }
Ingo Molnar9761eea2007-07-09 18:52:00 +02001353 } else {
Nick Piggine0f364f2005-06-25 14:57:06 -07001354 break;
Ingo Molnar9761eea2007-07-09 18:52:00 +02001355 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001356 }
1357 return cpu;
1358}
1359#else
Ingo Molnar36c8b582006-07-03 00:25:41 -07001360static inline int wake_idle(int cpu, struct task_struct *p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001361{
1362 return cpu;
1363}
1364#endif
1365
1366/***
1367 * try_to_wake_up - wake up a thread
1368 * @p: the to-be-woken-up thread
1369 * @state: the mask of task states that can be woken
1370 * @sync: do a synchronous wakeup?
1371 *
1372 * Put it on the run-queue if it's not already there. The "current"
1373 * thread is always on the run-queue (except when the actual
1374 * re-schedule is in progress), and as such you're allowed to do
1375 * the simpler "current->state = TASK_RUNNING" to mark yourself
1376 * runnable without the overhead of this.
1377 *
1378 * returns failure only if the task is already active.
1379 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07001380static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001381{
1382 int cpu, this_cpu, success = 0;
1383 unsigned long flags;
1384 long old_state;
Ingo Molnar70b97a72006-07-03 00:25:42 -07001385 struct rq *rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001386#ifdef CONFIG_SMP
Nick Piggin78979862005-06-25 14:57:13 -07001387 struct sched_domain *sd, *this_sd = NULL;
Ingo Molnar70b97a72006-07-03 00:25:42 -07001388 unsigned long load, this_load;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001389 int new_cpu;
1390#endif
1391
1392 rq = task_rq_lock(p, &flags);
1393 old_state = p->state;
1394 if (!(old_state & state))
1395 goto out;
1396
Ingo Molnardd41f592007-07-09 18:51:59 +02001397 if (p->se.on_rq)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001398 goto out_running;
1399
1400 cpu = task_cpu(p);
1401 this_cpu = smp_processor_id();
1402
1403#ifdef CONFIG_SMP
1404 if (unlikely(task_running(rq, p)))
1405 goto out_activate;
1406
Nick Piggin78979862005-06-25 14:57:13 -07001407 new_cpu = cpu;
1408
Linus Torvalds1da177e2005-04-16 15:20:36 -07001409 schedstat_inc(rq, ttwu_cnt);
1410 if (cpu == this_cpu) {
1411 schedstat_inc(rq, ttwu_local);
Nick Piggin78979862005-06-25 14:57:13 -07001412 goto out_set_cpu;
1413 }
1414
1415 for_each_domain(this_cpu, sd) {
1416 if (cpu_isset(cpu, sd->span)) {
1417 schedstat_inc(sd, ttwu_wake_remote);
1418 this_sd = sd;
1419 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001420 }
1421 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001422
Nick Piggin78979862005-06-25 14:57:13 -07001423 if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001424 goto out_set_cpu;
1425
Linus Torvalds1da177e2005-04-16 15:20:36 -07001426 /*
Nick Piggin78979862005-06-25 14:57:13 -07001427 * Check for affine wakeup and passive balancing possibilities.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001428 */
Nick Piggin78979862005-06-25 14:57:13 -07001429 if (this_sd) {
1430 int idx = this_sd->wake_idx;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001431 unsigned int imbalance;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001432
Nick Piggina3f21bc2005-06-25 14:57:15 -07001433 imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
1434
Nick Piggin78979862005-06-25 14:57:13 -07001435 load = source_load(cpu, idx);
1436 this_load = target_load(this_cpu, idx);
1437
Nick Piggin78979862005-06-25 14:57:13 -07001438 new_cpu = this_cpu; /* Wake to this CPU if we can */
1439
Nick Piggina3f21bc2005-06-25 14:57:15 -07001440 if (this_sd->flags & SD_WAKE_AFFINE) {
1441 unsigned long tl = this_load;
Miguel Ojeda Sandonis33859f72006-12-10 02:20:38 -08001442 unsigned long tl_per_task;
1443
1444 tl_per_task = cpu_avg_load_per_task(this_cpu);
Peter Williams2dd73a42006-06-27 02:54:34 -07001445
Linus Torvalds1da177e2005-04-16 15:20:36 -07001446 /*
Nick Piggina3f21bc2005-06-25 14:57:15 -07001447 * If sync wakeup then subtract the (maximum possible)
1448 * effect of the currently running task from the load
1449 * of the current CPU:
Linus Torvalds1da177e2005-04-16 15:20:36 -07001450 */
Nick Piggina3f21bc2005-06-25 14:57:15 -07001451 if (sync)
Ingo Molnardd41f592007-07-09 18:51:59 +02001452 tl -= current->se.load.weight;
Nick Piggina3f21bc2005-06-25 14:57:15 -07001453
1454 if ((tl <= load &&
Peter Williams2dd73a42006-06-27 02:54:34 -07001455 tl + target_load(cpu, idx) <= tl_per_task) ||
Ingo Molnardd41f592007-07-09 18:51:59 +02001456 100*(tl + p->se.load.weight) <= imbalance*load) {
Nick Piggina3f21bc2005-06-25 14:57:15 -07001457 /*
1458 * This domain has SD_WAKE_AFFINE and
1459 * p is cache cold in this domain, and
1460 * there is no bad imbalance.
1461 */
1462 schedstat_inc(this_sd, ttwu_move_affine);
1463 goto out_set_cpu;
1464 }
1465 }
1466
1467 /*
1468 * Start passive balancing when half the imbalance_pct
1469 * limit is reached.
1470 */
1471 if (this_sd->flags & SD_WAKE_BALANCE) {
1472 if (imbalance*this_load <= 100*load) {
1473 schedstat_inc(this_sd, ttwu_move_balance);
1474 goto out_set_cpu;
1475 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001476 }
1477 }
1478
1479 new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
1480out_set_cpu:
1481 new_cpu = wake_idle(new_cpu, p);
1482 if (new_cpu != cpu) {
1483 set_task_cpu(p, new_cpu);
1484 task_rq_unlock(rq, &flags);
1485 /* might preempt at this point */
1486 rq = task_rq_lock(p, &flags);
1487 old_state = p->state;
1488 if (!(old_state & state))
1489 goto out;
Ingo Molnardd41f592007-07-09 18:51:59 +02001490 if (p->se.on_rq)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001491 goto out_running;
1492
1493 this_cpu = smp_processor_id();
1494 cpu = task_cpu(p);
1495 }
1496
1497out_activate:
1498#endif /* CONFIG_SMP */
Ingo Molnardd41f592007-07-09 18:51:59 +02001499 activate_task(rq, p, 1);
Ingo Molnard79fc0f2005-09-10 00:26:12 -07001500 /*
Linus Torvalds1da177e2005-04-16 15:20:36 -07001501 * Sync wakeups (i.e. those types of wakeups where the waker
1502 * has indicated that it will leave the CPU in short order)
1503 * don't trigger a preemption if the woken-up task will run on
1504 * this cpu. (In this case the 'I will reschedule' promise of
1505 * the waker guarantees that the freshly woken-up task is going
1506 * to be considered on this CPU.)
1507 */
Ingo Molnardd41f592007-07-09 18:51:59 +02001508 if (!sync || cpu != this_cpu)
1509 check_preempt_curr(rq, p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001510 success = 1;
1511
1512out_running:
1513 p->state = TASK_RUNNING;
1514out:
1515 task_rq_unlock(rq, &flags);
1516
1517 return success;
1518}
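/*
 * A rough worked example of the affine-wakeup test above (illustrative
 * numbers only; assumes a nice-0 task weight of 1024 and an
 * imbalance_pct of 125, i.e. imbalance == 112):
 *
 *	load (task's previous cpu)	= 2048
 *	tl (this cpu, sync-adjusted)	= 1024
 *
 *	100 * (tl + p->se.load.weight)	= 100 * 2048 = 204800
 *	imbalance * load		= 112 * 2048 = 229376
 *
 * 204800 <= 229376, so the wakeup is treated as affine and new_cpu is
 * set to this_cpu (still subject to the wake_idle() pass at out_set_cpu).
 */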
1519
Ingo Molnar36c8b582006-07-03 00:25:41 -07001520int fastcall wake_up_process(struct task_struct *p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001521{
1522 return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED |
1523 TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
1524}
Linus Torvalds1da177e2005-04-16 15:20:36 -07001525EXPORT_SYMBOL(wake_up_process);
1526
Ingo Molnar36c8b582006-07-03 00:25:41 -07001527int fastcall wake_up_state(struct task_struct *p, unsigned int state)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001528{
1529 return try_to_wake_up(p, state, 0);
1530}
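/*
 * Usage sketch (hypothetical caller, not from this file): the classic
 * pattern is to create a kthread and then kick it once there is work:
 *
 *	struct task_struct *worker = kthread_create(worker_fn, NULL, "worker");
 *
 *	// ... later, after queueing some work for it:
 *	wake_up_process(worker);	// returns 1 if the task was woken,
 *					// 0 if it was already runnable
 */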
1531
Linus Torvalds1da177e2005-04-16 15:20:36 -07001532/*
1533 * Perform scheduler related setup for a newly forked process p.
1534 * p is forked by current.
Ingo Molnardd41f592007-07-09 18:51:59 +02001535 *
1536 * __sched_fork() is basic setup used by init_idle() too:
Linus Torvalds1da177e2005-04-16 15:20:36 -07001537 */
Ingo Molnardd41f592007-07-09 18:51:59 +02001538static void __sched_fork(struct task_struct *p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001539{
Ingo Molnardd41f592007-07-09 18:51:59 +02001540 p->se.wait_start_fair = 0;
1541 p->se.wait_start = 0;
1542 p->se.exec_start = 0;
1543 p->se.sum_exec_runtime = 0;
1544 p->se.delta_exec = 0;
1545 p->se.delta_fair_run = 0;
1546 p->se.delta_fair_sleep = 0;
1547 p->se.wait_runtime = 0;
1548 p->se.sum_wait_runtime = 0;
1549 p->se.sum_sleep_runtime = 0;
1550 p->se.sleep_start = 0;
1551 p->se.sleep_start_fair = 0;
1552 p->se.block_start = 0;
1553 p->se.sleep_max = 0;
1554 p->se.block_max = 0;
1555 p->se.exec_max = 0;
1556 p->se.wait_max = 0;
1557 p->se.wait_runtime_overruns = 0;
1558 p->se.wait_runtime_underruns = 0;
Nick Piggin476d1392005-06-25 14:57:29 -07001559
Ingo Molnardd41f592007-07-09 18:51:59 +02001560 INIT_LIST_HEAD(&p->run_list);
1561 p->se.on_rq = 0;
Nick Piggin476d1392005-06-25 14:57:29 -07001562
Linus Torvalds1da177e2005-04-16 15:20:36 -07001563 /*
1564 * We mark the process as running here, but have not actually
1565 * inserted it onto the runqueue yet. This guarantees that
1566 * nobody will actually run it, and a signal or other external
1567 * event cannot wake it up and insert it on the runqueue either.
1568 */
1569 p->state = TASK_RUNNING;
Ingo Molnardd41f592007-07-09 18:51:59 +02001570}
1571
1572/*
1573 * fork()/clone()-time setup:
1574 */
1575void sched_fork(struct task_struct *p, int clone_flags)
1576{
1577 int cpu = get_cpu();
1578
1579 __sched_fork(p);
1580
1581#ifdef CONFIG_SMP
1582 cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
1583#endif
1584 __set_task_cpu(p, cpu);
Ingo Molnarb29739f2006-06-27 02:54:51 -07001585
1586 /*
1587 * Make sure we do not leak PI boosting priority to the child:
1588 */
1589 p->prio = current->normal_prio;
1590
Chandra Seetharaman52f17b62006-07-14 00:24:38 -07001591#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
Ingo Molnardd41f592007-07-09 18:51:59 +02001592 if (likely(sched_info_on()))
Chandra Seetharaman52f17b62006-07-14 00:24:38 -07001593 memset(&p->sched_info, 0, sizeof(p->sched_info));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001594#endif
Chen, Kenneth Wd6077cb2006-02-14 13:53:10 -08001595#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
Nick Piggin4866cde2005-06-25 14:57:23 -07001596 p->oncpu = 0;
1597#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07001598#ifdef CONFIG_PREEMPT
Nick Piggin4866cde2005-06-25 14:57:23 -07001599 /* Want to start with kernel preemption disabled. */
Al Viroa1261f52005-11-13 16:06:55 -08001600 task_thread_info(p)->preempt_count = 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001601#endif
Nick Piggin476d1392005-06-25 14:57:29 -07001602 put_cpu();
Linus Torvalds1da177e2005-04-16 15:20:36 -07001603}
1604
1605/*
Ingo Molnardd41f592007-07-09 18:51:59 +02001606 * After fork, the child runs first by default. If set to 0, the
1607 * parent will (try to) run first.
1608 */
1609unsigned int __read_mostly sysctl_sched_child_runs_first = 1;
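/*
 * This knob is normally exported through the sysctl table (presumably as
 * /proc/sys/kernel/sched_child_runs_first, though the exact wiring lives
 * outside this file), so writing 0 to it lets the parent keep running
 * across fork() instead of yielding to the child.
 */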
1610
1611/*
Linus Torvalds1da177e2005-04-16 15:20:36 -07001612 * wake_up_new_task - wake up a newly created task for the first time.
1613 *
1614 * This function will do some initial scheduler statistics housekeeping
1615 * that must be done for every newly created context, then puts the task
1616 * on the runqueue and wakes it.
1617 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07001618void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001619{
1620 unsigned long flags;
Ingo Molnardd41f592007-07-09 18:51:59 +02001621 struct rq *rq;
1622 int this_cpu;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001623
1624 rq = task_rq_lock(p, &flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001625 BUG_ON(p->state != TASK_RUNNING);
Ingo Molnardd41f592007-07-09 18:51:59 +02001626 this_cpu = smp_processor_id(); /* parent's CPU */
Linus Torvalds1da177e2005-04-16 15:20:36 -07001627
1628 p->prio = effective_prio(p);
1629
Ingo Molnardd41f592007-07-09 18:51:59 +02001630 if (!sysctl_sched_child_runs_first || (clone_flags & CLONE_VM) ||
1631 task_cpu(p) != this_cpu || !current->se.on_rq) {
1632 activate_task(rq, p, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001633 } else {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001634 /*
Ingo Molnardd41f592007-07-09 18:51:59 +02001635 * Let the scheduling class do new task startup
1636 * management (if any):
Linus Torvalds1da177e2005-04-16 15:20:36 -07001637 */
Ingo Molnardd41f592007-07-09 18:51:59 +02001638 p->sched_class->task_new(rq, p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001639 }
Ingo Molnardd41f592007-07-09 18:51:59 +02001640 check_preempt_curr(rq, p);
1641 task_rq_unlock(rq, &flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001642}
1643
Linus Torvalds1da177e2005-04-16 15:20:36 -07001644/**
Nick Piggin4866cde2005-06-25 14:57:23 -07001645 * prepare_task_switch - prepare to switch tasks
1646 * @rq: the runqueue preparing to switch
1647 * @next: the task we are going to switch to.
1648 *
1649 * This is called with the rq lock held and interrupts off. It must
1650 * be paired with a subsequent finish_task_switch after the context
1651 * switch.
1652 *
1653 * prepare_task_switch sets up locking and calls architecture specific
1654 * hooks.
1655 */
Ingo Molnar70b97a72006-07-03 00:25:42 -07001656static inline void prepare_task_switch(struct rq *rq, struct task_struct *next)
Nick Piggin4866cde2005-06-25 14:57:23 -07001657{
1658 prepare_lock_switch(rq, next);
1659 prepare_arch_switch(next);
1660}
1661
1662/**
Linus Torvalds1da177e2005-04-16 15:20:36 -07001663 * finish_task_switch - clean up after a task-switch
Jeff Garzik344baba2005-09-07 01:15:17 -04001664 * @rq: runqueue associated with task-switch
Linus Torvalds1da177e2005-04-16 15:20:36 -07001665 * @prev: the thread we just switched away from.
1666 *
Nick Piggin4866cde2005-06-25 14:57:23 -07001667 * finish_task_switch must be called after the context switch, paired
1668 * with a prepare_task_switch call before the context switch.
1669 * finish_task_switch will reconcile locking set up by prepare_task_switch,
1670 * and do any other architecture-specific cleanup actions.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001671 *
1672 * Note that we may have delayed dropping an mm in context_switch(). If
1673 * so, we finish that here outside of the runqueue lock. (Doing it
1674 * with the lock held can cause deadlocks; see schedule() for
1675 * details.)
1676 */
Ingo Molnar70b97a72006-07-03 00:25:42 -07001677static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001678 __releases(rq->lock)
1679{
Linus Torvalds1da177e2005-04-16 15:20:36 -07001680 struct mm_struct *mm = rq->prev_mm;
Oleg Nesterov55a101f2006-09-29 02:01:10 -07001681 long prev_state;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001682
1683 rq->prev_mm = NULL;
1684
1685 /*
1686 * A task struct has one reference for its use as "current".
Oleg Nesterovc394cc92006-09-29 02:01:11 -07001687 * If a task dies, then it sets TASK_DEAD in tsk->state and calls
Oleg Nesterov55a101f2006-09-29 02:01:10 -07001688 * schedule one last time. The schedule call will never return, and
1689 * the scheduled task must drop that reference.
Oleg Nesterovc394cc92006-09-29 02:01:11 -07001690 * The test for TASK_DEAD must occur while the runqueue locks are
Linus Torvalds1da177e2005-04-16 15:20:36 -07001691 * still held, otherwise prev could be scheduled on another cpu, die
1692 * there before we look at prev->state, and then the reference would
1693 * be dropped twice.
1694 * Manfred Spraul <manfred@colorfullife.com>
1695 */
Oleg Nesterov55a101f2006-09-29 02:01:10 -07001696 prev_state = prev->state;
Nick Piggin4866cde2005-06-25 14:57:23 -07001697 finish_arch_switch(prev);
1698 finish_lock_switch(rq, prev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001699 if (mm)
1700 mmdrop(mm);
Oleg Nesterovc394cc92006-09-29 02:01:11 -07001701 if (unlikely(prev_state == TASK_DEAD)) {
bibo maoc6fd91f2006-03-26 01:38:20 -08001702 /*
1703 * Remove function-return probe instances associated with this
1704 * task and put them back on the free list.
Ingo Molnar9761eea2007-07-09 18:52:00 +02001705 */
bibo maoc6fd91f2006-03-26 01:38:20 -08001706 kprobe_flush_task(prev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001707 put_task_struct(prev);
bibo maoc6fd91f2006-03-26 01:38:20 -08001708 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001709}
1710
1711/**
1712 * schedule_tail - first thing a freshly forked thread must call.
1713 * @prev: the thread we just switched away from.
1714 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07001715asmlinkage void schedule_tail(struct task_struct *prev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001716 __releases(rq->lock)
1717{
Ingo Molnar70b97a72006-07-03 00:25:42 -07001718 struct rq *rq = this_rq();
1719
Nick Piggin4866cde2005-06-25 14:57:23 -07001720 finish_task_switch(rq, prev);
1721#ifdef __ARCH_WANT_UNLOCKED_CTXSW
1722 /* In this case, finish_task_switch does not reenable preemption */
1723 preempt_enable();
1724#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07001725 if (current->set_child_tid)
1726 put_user(current->pid, current->set_child_tid);
1727}
1728
1729/*
1730 * context_switch - switch to the new MM and the new
1731 * thread's register state.
1732 */
Ingo Molnardd41f592007-07-09 18:51:59 +02001733static inline void
Ingo Molnar70b97a72006-07-03 00:25:42 -07001734context_switch(struct rq *rq, struct task_struct *prev,
Ingo Molnar36c8b582006-07-03 00:25:41 -07001735 struct task_struct *next)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001736{
Ingo Molnardd41f592007-07-09 18:51:59 +02001737 struct mm_struct *mm, *oldmm;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001738
Ingo Molnardd41f592007-07-09 18:51:59 +02001739 prepare_task_switch(rq, next);
1740 mm = next->mm;
1741 oldmm = prev->active_mm;
Zachary Amsden9226d122007-02-13 13:26:21 +01001742 /*
1743 * For paravirt, this is coupled with an exit in switch_to to
1744 * combine the page table reload and the switch backend into
1745 * one hypercall.
1746 */
1747 arch_enter_lazy_cpu_mode();
1748
Ingo Molnardd41f592007-07-09 18:51:59 +02001749 if (unlikely(!mm)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001750 next->active_mm = oldmm;
1751 atomic_inc(&oldmm->mm_count);
1752 enter_lazy_tlb(oldmm, next);
1753 } else
1754 switch_mm(oldmm, mm, next);
1755
Ingo Molnardd41f592007-07-09 18:51:59 +02001756 if (unlikely(!prev->mm)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001757 prev->active_mm = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001758 rq->prev_mm = oldmm;
1759 }
Ingo Molnar3a5f5e42006-07-14 00:24:27 -07001760 /*
1761 * Since the runqueue lock will be released by the next
1762 * task (which is an invalid locking op but in the case
1763 * of the scheduler it's an obvious special-case), we
1764 * do an early lockdep release here:
1765 */
1766#ifndef __ARCH_WANT_UNLOCKED_CTXSW
Ingo Molnar8a25d5d2006-07-03 00:24:54 -07001767 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
Ingo Molnar3a5f5e42006-07-14 00:24:27 -07001768#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07001769
1770 /* Here we just switch the register state and the stack. */
1771 switch_to(prev, next, prev);
1772
Ingo Molnardd41f592007-07-09 18:51:59 +02001773 barrier();
1774 /*
1775 * this_rq must be evaluated again because prev may have moved
1776 * CPUs since it called schedule(), thus the 'rq' on its stack
1777 * frame will be invalid.
1778 */
1779 finish_task_switch(this_rq(), prev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001780}
1781
1782/*
1783 * nr_running, nr_uninterruptible and nr_context_switches:
1784 *
1785 * externally visible scheduler statistics: current number of runnable
1786 * threads, current number of uninterruptible-sleeping threads, total
1787 * number of context switches performed since bootup.
1788 */
1789unsigned long nr_running(void)
1790{
1791 unsigned long i, sum = 0;
1792
1793 for_each_online_cpu(i)
1794 sum += cpu_rq(i)->nr_running;
1795
1796 return sum;
1797}
1798
1799unsigned long nr_uninterruptible(void)
1800{
1801 unsigned long i, sum = 0;
1802
KAMEZAWA Hiroyuki0a945022006-03-28 01:56:37 -08001803 for_each_possible_cpu(i)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001804 sum += cpu_rq(i)->nr_uninterruptible;
1805
1806 /*
1807 * Since we read the counters lockless, it might be slightly
1808 * inaccurate. Do not allow it to go below zero though:
1809 */
1810 if (unlikely((long)sum < 0))
1811 sum = 0;
1812
1813 return sum;
1814}
1815
1816unsigned long long nr_context_switches(void)
1817{
Steven Rostedtcc94abf2006-06-27 02:54:31 -07001818 int i;
1819 unsigned long long sum = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001820
KAMEZAWA Hiroyuki0a945022006-03-28 01:56:37 -08001821 for_each_possible_cpu(i)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001822 sum += cpu_rq(i)->nr_switches;
1823
1824 return sum;
1825}
1826
1827unsigned long nr_iowait(void)
1828{
1829 unsigned long i, sum = 0;
1830
KAMEZAWA Hiroyuki0a945022006-03-28 01:56:37 -08001831 for_each_possible_cpu(i)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001832 sum += atomic_read(&cpu_rq(i)->nr_iowait);
1833
1834 return sum;
1835}
1836
Jack Steinerdb1b1fe2006-03-31 02:31:21 -08001837unsigned long nr_active(void)
1838{
1839 unsigned long i, running = 0, uninterruptible = 0;
1840
1841 for_each_online_cpu(i) {
1842 running += cpu_rq(i)->nr_running;
1843 uninterruptible += cpu_rq(i)->nr_uninterruptible;
1844 }
1845
1846 if (unlikely((long)uninterruptible < 0))
1847 uninterruptible = 0;
1848
1849 return running + uninterruptible;
1850}
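/*
 * nr_active() is the quantity the load-average code samples: the familiar
 * /proc/loadavg numbers are, roughly, exponentially decayed averages of
 * this value (the sampling itself is done in the timer code, outside this
 * file).
 */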
1851
Linus Torvalds1da177e2005-04-16 15:20:36 -07001852/*
Ingo Molnardd41f592007-07-09 18:51:59 +02001853 * Update rq->cpu_load[] statistics. This function is usually called every
1854 * scheduler tick (TICK_NSEC).
Ingo Molnar48f24c42006-07-03 00:25:40 -07001855 */
Ingo Molnardd41f592007-07-09 18:51:59 +02001856static void update_cpu_load(struct rq *this_rq)
Ingo Molnar48f24c42006-07-03 00:25:40 -07001857{
Ingo Molnardd41f592007-07-09 18:51:59 +02001858 u64 fair_delta64, exec_delta64, idle_delta64, sample_interval64, tmp64;
1859 unsigned long total_load = this_rq->ls.load.weight;
1860 unsigned long this_load = total_load;
1861 struct load_stat *ls = &this_rq->ls;
1862 u64 now = __rq_clock(this_rq);
1863 int i, scale;
1864
1865 this_rq->nr_load_updates++;
1866 if (unlikely(!(sysctl_sched_features & SCHED_FEAT_PRECISE_CPU_LOAD)))
1867 goto do_avg;
1868
1869 /* Update delta_fair/delta_exec fields first */
1870 update_curr_load(this_rq, now);
1871
1872 fair_delta64 = ls->delta_fair + 1;
1873 ls->delta_fair = 0;
1874
1875 exec_delta64 = ls->delta_exec + 1;
1876 ls->delta_exec = 0;
1877
1878 sample_interval64 = now - ls->load_update_last;
1879 ls->load_update_last = now;
1880
1881 if ((s64)sample_interval64 < (s64)TICK_NSEC)
1882 sample_interval64 = TICK_NSEC;
1883
1884 if (exec_delta64 > sample_interval64)
1885 exec_delta64 = sample_interval64;
1886
1887 idle_delta64 = sample_interval64 - exec_delta64;
1888
1889 tmp64 = div64_64(SCHED_LOAD_SCALE * exec_delta64, fair_delta64);
1890 tmp64 = div64_64(tmp64 * exec_delta64, sample_interval64);
1891
1892 this_load = (unsigned long)tmp64;
1893
1894do_avg:
1895
1896 /* Update our load: */
1897 for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
1898 unsigned long old_load, new_load;
1899
1900 /* scale is effectively 1 << i now, and >> i divides by scale */
1901
1902 old_load = this_rq->cpu_load[i];
1903 new_load = this_load;
1904
1905 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
1906 }
Ingo Molnar48f24c42006-07-03 00:25:40 -07001907}
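/*
 * A worked example of the decay above (illustrative numbers): with i == 2
 * (so scale == 4), an old cpu_load[2] of 2048 and a current this_load of
 * 1024:
 *
 *	cpu_load[2] = (2048 * 3 + 1024) >> 2 = 7168 >> 2 = 1792
 *
 * Higher indices therefore react more slowly to load changes, while
 * cpu_load[0] (scale == 1) always tracks this_load directly.
 */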
1908
Ingo Molnardd41f592007-07-09 18:51:59 +02001909#ifdef CONFIG_SMP
1910
Ingo Molnar48f24c42006-07-03 00:25:40 -07001911/*
Linus Torvalds1da177e2005-04-16 15:20:36 -07001912 * double_rq_lock - safely lock two runqueues
1913 *
1914 * Note this does not disable interrupts like task_rq_lock,
1915 * you need to do so manually before calling.
1916 */
Ingo Molnar70b97a72006-07-03 00:25:42 -07001917static void double_rq_lock(struct rq *rq1, struct rq *rq2)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001918 __acquires(rq1->lock)
1919 __acquires(rq2->lock)
1920{
Kirill Korotaev054b9102006-12-10 02:20:11 -08001921 BUG_ON(!irqs_disabled());
Linus Torvalds1da177e2005-04-16 15:20:36 -07001922 if (rq1 == rq2) {
1923 spin_lock(&rq1->lock);
1924 __acquire(rq2->lock); /* Fake it out ;) */
1925 } else {
Chen, Kenneth Wc96d1452006-06-27 02:54:28 -07001926 if (rq1 < rq2) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001927 spin_lock(&rq1->lock);
1928 spin_lock(&rq2->lock);
1929 } else {
1930 spin_lock(&rq2->lock);
1931 spin_lock(&rq1->lock);
1932 }
1933 }
1934}
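/*
 * The address ordering above is what prevents an AB-BA deadlock: if one
 * cpu calls double_rq_lock(rq_a, rq_b) while another calls
 * double_rq_lock(rq_b, rq_a), both take the lower-addressed runqueue lock
 * first, so neither can end up spinning on a lock the other holds.
 */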
1935
1936/*
1937 * double_rq_unlock - safely unlock two runqueues
1938 *
1939 * Note this does not restore interrupts like task_rq_unlock,
1940 * you need to do so manually after calling.
1941 */
Ingo Molnar70b97a72006-07-03 00:25:42 -07001942static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001943 __releases(rq1->lock)
1944 __releases(rq2->lock)
1945{
1946 spin_unlock(&rq1->lock);
1947 if (rq1 != rq2)
1948 spin_unlock(&rq2->lock);
1949 else
1950 __release(rq2->lock);
1951}
1952
1953/*
1954 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
1955 */
Ingo Molnar70b97a72006-07-03 00:25:42 -07001956static void double_lock_balance(struct rq *this_rq, struct rq *busiest)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001957 __releases(this_rq->lock)
1958 __acquires(busiest->lock)
1959 __acquires(this_rq->lock)
1960{
Kirill Korotaev054b9102006-12-10 02:20:11 -08001961 if (unlikely(!irqs_disabled())) {
1962 /* printk() doesn't work well under rq->lock */
1963 spin_unlock(&this_rq->lock);
1964 BUG_ON(1);
1965 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001966 if (unlikely(!spin_trylock(&busiest->lock))) {
Chen, Kenneth Wc96d1452006-06-27 02:54:28 -07001967 if (busiest < this_rq) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001968 spin_unlock(&this_rq->lock);
1969 spin_lock(&busiest->lock);
1970 spin_lock(&this_rq->lock);
1971 } else
1972 spin_lock(&busiest->lock);
1973 }
1974}
1975
1976/*
Linus Torvalds1da177e2005-04-16 15:20:36 -07001977 * If dest_cpu is allowed for this process, migrate the task to it.
1978 * This is accomplished by forcing the cpu_allowed mask to only
1979 * allow dest_cpu, which will force the task onto dest_cpu. Then
1980 * the cpu_allowed mask is restored.
1981 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07001982static void sched_migrate_task(struct task_struct *p, int dest_cpu)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001983{
Ingo Molnar70b97a72006-07-03 00:25:42 -07001984 struct migration_req req;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001985 unsigned long flags;
Ingo Molnar70b97a72006-07-03 00:25:42 -07001986 struct rq *rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001987
1988 rq = task_rq_lock(p, &flags);
1989 if (!cpu_isset(dest_cpu, p->cpus_allowed)
1990 || unlikely(cpu_is_offline(dest_cpu)))
1991 goto out;
1992
1993 /* force the process onto the specified CPU */
1994 if (migrate_task(p, dest_cpu, &req)) {
1995 /* Need to wait for migration thread (might exit: take ref). */
1996 struct task_struct *mt = rq->migration_thread;
Ingo Molnar36c8b582006-07-03 00:25:41 -07001997
Linus Torvalds1da177e2005-04-16 15:20:36 -07001998 get_task_struct(mt);
1999 task_rq_unlock(rq, &flags);
2000 wake_up_process(mt);
2001 put_task_struct(mt);
2002 wait_for_completion(&req.done);
Ingo Molnar36c8b582006-07-03 00:25:41 -07002003
Linus Torvalds1da177e2005-04-16 15:20:36 -07002004 return;
2005 }
2006out:
2007 task_rq_unlock(rq, &flags);
2008}
2009
2010/*
Nick Piggin476d1392005-06-25 14:57:29 -07002011 * sched_exec - execve() is a valuable balancing opportunity, because at
2012 * this point the task has the smallest effective memory and cache footprint.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002013 */
2014void sched_exec(void)
2015{
Linus Torvalds1da177e2005-04-16 15:20:36 -07002016 int new_cpu, this_cpu = get_cpu();
Nick Piggin476d1392005-06-25 14:57:29 -07002017 new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002018 put_cpu();
Nick Piggin476d1392005-06-25 14:57:29 -07002019 if (new_cpu != this_cpu)
2020 sched_migrate_task(current, new_cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002021}
2022
2023/*
2024 * pull_task - move a task from a remote runqueue to the local runqueue.
2025 * Both runqueues must be locked.
2026 */
Ingo Molnardd41f592007-07-09 18:51:59 +02002027static void pull_task(struct rq *src_rq, struct task_struct *p,
2028 struct rq *this_rq, int this_cpu)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002029{
Ingo Molnardd41f592007-07-09 18:51:59 +02002030 deactivate_task(src_rq, p, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002031 set_task_cpu(p, this_cpu);
Ingo Molnardd41f592007-07-09 18:51:59 +02002032 activate_task(this_rq, p, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002033 /*
2034 * Note that idle threads have a prio of MAX_PRIO, so this test
2035 * is always true for them.
2036 */
Ingo Molnardd41f592007-07-09 18:51:59 +02002037 check_preempt_curr(this_rq, p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002038}
2039
2040/*
2041 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
2042 */
Arjan van de Ven858119e2006-01-14 13:20:43 -08002043static
Ingo Molnar70b97a72006-07-03 00:25:42 -07002044int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002045 struct sched_domain *sd, enum cpu_idle_type idle,
Ingo Molnar95cdf3b2005-09-10 00:26:11 -07002046 int *all_pinned)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002047{
2048 /*
2049 * We do not migrate tasks that are:
2050 * 1) running (obviously), or
2051 * 2) not allowed to migrate to this CPU due to cpus_allowed, or
2052 * 3) cache-hot on their current CPU.
2053 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07002054 if (!cpu_isset(this_cpu, p->cpus_allowed))
2055 return 0;
Nick Piggin81026792005-06-25 14:57:07 -07002056 *all_pinned = 0;
2057
2058 if (task_running(rq, p))
2059 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002060
2061 /*
Ingo Molnardd41f592007-07-09 18:51:59 +02002062 * Aggressive migration if too many balance attempts have failed:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002063 */
Ingo Molnardd41f592007-07-09 18:51:59 +02002064 if (sd->nr_balance_failed > sd->cache_nice_tries)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002065 return 1;
2066
Linus Torvalds1da177e2005-04-16 15:20:36 -07002067 return 1;
2068}
2069
Ingo Molnardd41f592007-07-09 18:51:59 +02002070static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2071 unsigned long max_nr_move, unsigned long max_load_move,
2072 struct sched_domain *sd, enum cpu_idle_type idle,
2073 int *all_pinned, unsigned long *load_moved,
2074 int this_best_prio, int best_prio, int best_prio_seen,
2075 struct rq_iterator *iterator)
2076{
2077 int pulled = 0, pinned = 0, skip_for_load;
2078 struct task_struct *p;
2079 long rem_load_move = max_load_move;
2080
2081 if (max_nr_move == 0 || max_load_move == 0)
2082 goto out;
2083
2084 pinned = 1;
2085
2086 /*
2087 * Start the load-balancing iterator:
2088 */
2089 p = iterator->start(iterator->arg);
2090next:
2091 if (!p)
2092 goto out;
2093 /*
2094 * To help distribute high-priority tasks across CPUs, we don't
2095 * skip a task if it will be the highest-priority task (i.e. smallest
2096 * prio value) on its new queue, regardless of its load weight.
2097 */
2098 skip_for_load = (p->se.load.weight >> 1) > rem_load_move +
2099 SCHED_LOAD_SCALE_FUZZ;
2100 if (skip_for_load && p->prio < this_best_prio)
2101 skip_for_load = !best_prio_seen && p->prio == best_prio;
2102 if (skip_for_load ||
2103 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
2104
2105 best_prio_seen |= p->prio == best_prio;
2106 p = iterator->next(iterator->arg);
2107 goto next;
2108 }
2109
2110 pull_task(busiest, p, this_rq, this_cpu);
2111 pulled++;
2112 rem_load_move -= p->se.load.weight;
2113
2114 /*
2115 * We only want to steal up to the prescribed number of tasks
2116 * and the prescribed amount of weighted load.
2117 */
2118 if (pulled < max_nr_move && rem_load_move > 0) {
2119 if (p->prio < this_best_prio)
2120 this_best_prio = p->prio;
2121 p = iterator->next(iterator->arg);
2122 goto next;
2123 }
2124out:
2125 /*
2126 * Right now, this is the only place pull_task() is called,
2127 * so we can safely collect pull_task() stats here rather than
2128 * inside pull_task().
2129 */
2130 schedstat_add(sd, lb_gained[idle], pulled);
2131
2132 if (all_pinned)
2133 *all_pinned = pinned;
2134 *load_moved = max_load_move - rem_load_move;
2135 return pulled;
2136}
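/*
 * Worked example of the skip_for_load test above (illustrative numbers,
 * assuming SCHED_LOAD_SCALE_FUZZ == 1024): with rem_load_move == 512, a
 * task of weight 4096 gives
 *
 *	(4096 >> 1) == 2048  >  512 + 1024 == 1536
 *
 * so it is normally skipped as too heavy for the remaining imbalance,
 * unless it would become the highest-priority task on the destination
 * runqueue.
 */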
Ingo Molnar48f24c42006-07-03 00:25:40 -07002137
Linus Torvalds1da177e2005-04-16 15:20:36 -07002138/*
Peter Williams2dd73a42006-06-27 02:54:34 -07002139 * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
2140 * load from busiest to this_rq, as part of a balancing operation within
2141 * "domain". Returns the number of tasks moved.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002142 *
2143 * Called with both runqueues locked.
2144 */
Ingo Molnar70b97a72006-07-03 00:25:42 -07002145static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
Peter Williams2dd73a42006-06-27 02:54:34 -07002146 unsigned long max_nr_move, unsigned long max_load_move,
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002147 struct sched_domain *sd, enum cpu_idle_type idle,
Peter Williams2dd73a42006-06-27 02:54:34 -07002148 int *all_pinned)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002149{
Ingo Molnardd41f592007-07-09 18:51:59 +02002150 struct sched_class *class = sched_class_highest;
2151 unsigned long load_moved, total_nr_moved = 0, nr_moved;
2152 long rem_load_move = max_load_move;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002153
Ingo Molnardd41f592007-07-09 18:51:59 +02002154 do {
2155 nr_moved = class->load_balance(this_rq, this_cpu, busiest,
2156 max_nr_move, (unsigned long)rem_load_move,
2157 sd, idle, all_pinned, &load_moved);
2158 total_nr_moved += nr_moved;
2159 max_nr_move -= nr_moved;
2160 rem_load_move -= load_moved;
2161 class = class->next;
2162 } while (class && max_nr_move && rem_load_move > 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002163
Ingo Molnardd41f592007-07-09 18:51:59 +02002164 return total_nr_moved;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002165}
2166
2167/*
2168 * find_busiest_group finds and returns the busiest CPU group within the
Ingo Molnar48f24c42006-07-03 00:25:40 -07002169 * domain. It calculates and returns the amount of weighted load which
2170 * should be moved to restore balance via the imbalance parameter.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002171 */
2172static struct sched_group *
2173find_busiest_group(struct sched_domain *sd, int this_cpu,
Ingo Molnardd41f592007-07-09 18:51:59 +02002174 unsigned long *imbalance, enum cpu_idle_type idle,
2175 int *sd_idle, cpumask_t *cpus, int *balance)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002176{
2177 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
2178 unsigned long max_load, avg_load, total_load, this_load, total_pwr;
Siddha, Suresh B0c117f12005-09-10 00:26:21 -07002179 unsigned long max_pull;
Peter Williams2dd73a42006-06-27 02:54:34 -07002180 unsigned long busiest_load_per_task, busiest_nr_running;
2181 unsigned long this_load_per_task, this_nr_running;
Nick Piggin78979862005-06-25 14:57:13 -07002182 int load_idx;
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002183#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2184 int power_savings_balance = 1;
2185 unsigned long leader_nr_running = 0, min_load_per_task = 0;
2186 unsigned long min_nr_running = ULONG_MAX;
2187 struct sched_group *group_min = NULL, *group_leader = NULL;
2188#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07002189
2190 max_load = this_load = total_load = total_pwr = 0;
Peter Williams2dd73a42006-06-27 02:54:34 -07002191 busiest_load_per_task = busiest_nr_running = 0;
2192 this_load_per_task = this_nr_running = 0;
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002193 if (idle == CPU_NOT_IDLE)
Nick Piggin78979862005-06-25 14:57:13 -07002194 load_idx = sd->busy_idx;
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002195 else if (idle == CPU_NEWLY_IDLE)
Nick Piggin78979862005-06-25 14:57:13 -07002196 load_idx = sd->newidle_idx;
2197 else
2198 load_idx = sd->idle_idx;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002199
2200 do {
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002201 unsigned long load, group_capacity;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002202 int local_group;
2203 int i;
Siddha, Suresh B783609c2006-12-10 02:20:33 -08002204 unsigned int balance_cpu = -1, first_idle_cpu = 0;
Peter Williams2dd73a42006-06-27 02:54:34 -07002205 unsigned long sum_nr_running, sum_weighted_load;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002206
2207 local_group = cpu_isset(this_cpu, group->cpumask);
2208
Siddha, Suresh B783609c2006-12-10 02:20:33 -08002209 if (local_group)
2210 balance_cpu = first_cpu(group->cpumask);
2211
Linus Torvalds1da177e2005-04-16 15:20:36 -07002212 /* Tally up the load of all CPUs in the group */
Peter Williams2dd73a42006-06-27 02:54:34 -07002213 sum_weighted_load = sum_nr_running = avg_load = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002214
2215 for_each_cpu_mask(i, group->cpumask) {
Christoph Lameter0a2966b2006-09-25 23:30:51 -07002216 struct rq *rq;
2217
2218 if (!cpu_isset(i, *cpus))
2219 continue;
2220
2221 rq = cpu_rq(i);
Peter Williams2dd73a42006-06-27 02:54:34 -07002222
Nick Piggin5969fe02005-09-10 00:26:19 -07002223 if (*sd_idle && !idle_cpu(i))
2224 *sd_idle = 0;
2225
Linus Torvalds1da177e2005-04-16 15:20:36 -07002226 /* Bias balancing toward cpus of our domain */
Siddha, Suresh B783609c2006-12-10 02:20:33 -08002227 if (local_group) {
2228 if (idle_cpu(i) && !first_idle_cpu) {
2229 first_idle_cpu = 1;
2230 balance_cpu = i;
2231 }
2232
Nick Piggina2000572006-02-10 01:51:02 -08002233 load = target_load(i, load_idx);
Siddha, Suresh B783609c2006-12-10 02:20:33 -08002234 } else
Nick Piggina2000572006-02-10 01:51:02 -08002235 load = source_load(i, load_idx);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002236
2237 avg_load += load;
Peter Williams2dd73a42006-06-27 02:54:34 -07002238 sum_nr_running += rq->nr_running;
Ingo Molnardd41f592007-07-09 18:51:59 +02002239 sum_weighted_load += weighted_cpuload(i);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002240 }
2241
Siddha, Suresh B783609c2006-12-10 02:20:33 -08002242 /*
2243 * The first idle cpu in this sched group (or, if none is idle, its
2244 * first cpu) is eligible for doing load balancing at this and
2245 * higher domains.
2246 */
2247 if (local_group && balance_cpu != this_cpu && balance) {
2248 *balance = 0;
2249 goto ret;
2250 }
2251
Linus Torvalds1da177e2005-04-16 15:20:36 -07002252 total_load += avg_load;
Eric Dumazet5517d862007-05-08 00:32:57 -07002253 total_pwr += group->__cpu_power;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002254
2255 /* Adjust by relative CPU power of the group */
Eric Dumazet5517d862007-05-08 00:32:57 -07002256 avg_load = sg_div_cpu_power(group,
2257 avg_load * SCHED_LOAD_SCALE);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002258
Eric Dumazet5517d862007-05-08 00:32:57 -07002259 group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002260
Linus Torvalds1da177e2005-04-16 15:20:36 -07002261 if (local_group) {
2262 this_load = avg_load;
2263 this = group;
Peter Williams2dd73a42006-06-27 02:54:34 -07002264 this_nr_running = sum_nr_running;
2265 this_load_per_task = sum_weighted_load;
2266 } else if (avg_load > max_load &&
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002267 sum_nr_running > group_capacity) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002268 max_load = avg_load;
2269 busiest = group;
Peter Williams2dd73a42006-06-27 02:54:34 -07002270 busiest_nr_running = sum_nr_running;
2271 busiest_load_per_task = sum_weighted_load;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002272 }
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002273
2274#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2275 /*
2276 * Busy processors will not participate in power savings
2277 * balance.
2278 */
Ingo Molnardd41f592007-07-09 18:51:59 +02002279 if (idle == CPU_NOT_IDLE ||
2280 !(sd->flags & SD_POWERSAVINGS_BALANCE))
2281 goto group_next;
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002282
2283 /*
2284 * If the local group is idle or completely loaded,
2285 * there is no need to do power savings balance at this domain.
2286 */
2287 if (local_group && (this_nr_running >= group_capacity ||
2288 !this_nr_running))
2289 power_savings_balance = 0;
2290
Ingo Molnardd41f592007-07-09 18:51:59 +02002291 /*
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002292 * If a group is already running at full capacity or idle,
2293 * don't include that group in power savings calculations
Ingo Molnardd41f592007-07-09 18:51:59 +02002294 */
2295 if (!power_savings_balance || sum_nr_running >= group_capacity
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002296 || !sum_nr_running)
Ingo Molnardd41f592007-07-09 18:51:59 +02002297 goto group_next;
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002298
Ingo Molnardd41f592007-07-09 18:51:59 +02002299 /*
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002300 * Calculate the group which has the least non-idle load.
Ingo Molnardd41f592007-07-09 18:51:59 +02002301 * This is the group from which we need to pick up the load
2302 * to save power.
2303 */
2304 if ((sum_nr_running < min_nr_running) ||
2305 (sum_nr_running == min_nr_running &&
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002306 first_cpu(group->cpumask) <
2307 first_cpu(group_min->cpumask))) {
Ingo Molnardd41f592007-07-09 18:51:59 +02002308 group_min = group;
2309 min_nr_running = sum_nr_running;
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002310 min_load_per_task = sum_weighted_load /
2311 sum_nr_running;
Ingo Molnardd41f592007-07-09 18:51:59 +02002312 }
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002313
Ingo Molnardd41f592007-07-09 18:51:59 +02002314 /*
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002315 * Calculate the group which is close to its
Ingo Molnardd41f592007-07-09 18:51:59 +02002316 * capacity but still has some space to pick up some load
2317 * from another group and save more power.
2318 */
2319 if (sum_nr_running <= group_capacity - 1) {
2320 if (sum_nr_running > leader_nr_running ||
2321 (sum_nr_running == leader_nr_running &&
2322 first_cpu(group->cpumask) >
2323 first_cpu(group_leader->cpumask))) {
2324 group_leader = group;
2325 leader_nr_running = sum_nr_running;
2326 }
Ingo Molnar48f24c42006-07-03 00:25:40 -07002327 }
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002328group_next:
2329#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07002330 group = group->next;
2331 } while (group != sd->groups);
2332
Peter Williams2dd73a42006-06-27 02:54:34 -07002333 if (!busiest || this_load >= max_load || busiest_nr_running == 0)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002334 goto out_balanced;
2335
2336 avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
2337
2338 if (this_load >= avg_load ||
2339 100*max_load <= sd->imbalance_pct*this_load)
2340 goto out_balanced;
2341
Peter Williams2dd73a42006-06-27 02:54:34 -07002342 busiest_load_per_task /= busiest_nr_running;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002343 /*
2344 * We're trying to get all the cpus to the average_load, so we don't
2345 * want to push ourselves above the average load, nor do we wish to
2346 * reduce the max loaded cpu below the average load, as either of these
2347 * actions would just result in more rebalancing later, and ping-pong
2348 * tasks around. Thus we look for the minimum possible imbalance.
2349 * Negative imbalances (*we* are more loaded than anyone else) will
2350 * be counted as no imbalance for these purposes -- we can't fix that
2351 * by pulling tasks to us. Be careful of negative numbers as they'll
2352 * appear as very large values with unsigned longs.
2353 */
Peter Williams2dd73a42006-06-27 02:54:34 -07002354 if (max_load <= busiest_load_per_task)
2355 goto out_balanced;
2356
2357 /*
2358 * In the presence of smp nice balancing, certain scenarios can have
2359 * max load less than avg load (as we skip the groups at or below
2360 * their cpu_power while calculating max_load).
2361 */
2362 if (max_load < avg_load) {
2363 *imbalance = 0;
2364 goto small_imbalance;
2365 }
Siddha, Suresh B0c117f12005-09-10 00:26:21 -07002366
2367 /* Don't want to pull so many tasks that a group would go idle */
Peter Williams2dd73a42006-06-27 02:54:34 -07002368 max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
Siddha, Suresh B0c117f12005-09-10 00:26:21 -07002369
Linus Torvalds1da177e2005-04-16 15:20:36 -07002370 /* How much load to actually move to equalise the imbalance */
Eric Dumazet5517d862007-05-08 00:32:57 -07002371 *imbalance = min(max_pull * busiest->__cpu_power,
2372 (avg_load - this_load) * this->__cpu_power)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002373 / SCHED_LOAD_SCALE;
2374
Peter Williams2dd73a42006-06-27 02:54:34 -07002375 /*
2376 * If *imbalance is less than the average load per runnable task,
2377 * there is no guarantee that any tasks will be moved, so we consider
2378 * bumping its value to force at least one task to be
2379 * moved.
2380 */
Ingo Molnardd41f592007-07-09 18:51:59 +02002381 if (*imbalance + SCHED_LOAD_SCALE_FUZZ < busiest_load_per_task/2) {
Ingo Molnar48f24c42006-07-03 00:25:40 -07002382 unsigned long tmp, pwr_now, pwr_move;
Peter Williams2dd73a42006-06-27 02:54:34 -07002383 unsigned int imbn;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002384
Peter Williams2dd73a42006-06-27 02:54:34 -07002385small_imbalance:
2386 pwr_move = pwr_now = 0;
2387 imbn = 2;
2388 if (this_nr_running) {
2389 this_load_per_task /= this_nr_running;
2390 if (busiest_load_per_task > this_load_per_task)
2391 imbn = 1;
2392 } else
2393 this_load_per_task = SCHED_LOAD_SCALE;
2394
Ingo Molnardd41f592007-07-09 18:51:59 +02002395 if (max_load - this_load + SCHED_LOAD_SCALE_FUZZ >=
2396 busiest_load_per_task * imbn) {
Peter Williams2dd73a42006-06-27 02:54:34 -07002397 *imbalance = busiest_load_per_task;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002398 return busiest;
2399 }
2400
2401 /*
2402 * OK, we don't have enough imbalance to justify moving tasks,
2403 * however we may be able to increase total CPU power used by
2404 * moving them.
2405 */
2406
Eric Dumazet5517d862007-05-08 00:32:57 -07002407 pwr_now += busiest->__cpu_power *
2408 min(busiest_load_per_task, max_load);
2409 pwr_now += this->__cpu_power *
2410 min(this_load_per_task, this_load);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002411 pwr_now /= SCHED_LOAD_SCALE;
2412
2413 /* Amount of load we'd subtract */
Eric Dumazet5517d862007-05-08 00:32:57 -07002414 tmp = sg_div_cpu_power(busiest,
2415 busiest_load_per_task * SCHED_LOAD_SCALE);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002416 if (max_load > tmp)
Eric Dumazet5517d862007-05-08 00:32:57 -07002417 pwr_move += busiest->__cpu_power *
Peter Williams2dd73a42006-06-27 02:54:34 -07002418 min(busiest_load_per_task, max_load - tmp);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002419
2420 /* Amount of load we'd add */
Eric Dumazet5517d862007-05-08 00:32:57 -07002421 if (max_load * busiest->__cpu_power <
Miguel Ojeda Sandonis33859f72006-12-10 02:20:38 -08002422 busiest_load_per_task * SCHED_LOAD_SCALE)
Eric Dumazet5517d862007-05-08 00:32:57 -07002423 tmp = sg_div_cpu_power(this,
2424 max_load * busiest->__cpu_power);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002425 else
Eric Dumazet5517d862007-05-08 00:32:57 -07002426 tmp = sg_div_cpu_power(this,
2427 busiest_load_per_task * SCHED_LOAD_SCALE);
2428 pwr_move += this->__cpu_power *
2429 min(this_load_per_task, this_load + tmp);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002430 pwr_move /= SCHED_LOAD_SCALE;
2431
2432 /* Move if we gain throughput */
2433 if (pwr_move <= pwr_now)
2434 goto out_balanced;
2435
Peter Williams2dd73a42006-06-27 02:54:34 -07002436 *imbalance = busiest_load_per_task;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002437 }
2438
Linus Torvalds1da177e2005-04-16 15:20:36 -07002439 return busiest;
2440
2441out_balanced:
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002442#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002443 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002444 goto ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002445
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002446 if (this == group_leader && group_leader != group_min) {
2447 *imbalance = min_load_per_task;
2448 return group_min;
2449 }
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002450#endif
Siddha, Suresh B783609c2006-12-10 02:20:33 -08002451ret:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002452 *imbalance = 0;
2453 return NULL;
2454}
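/*
 * A worked example of the imbalance computation above (illustrative
 * numbers; two single-cpu groups, each with __cpu_power == SCHED_LOAD_SCALE
 * == 1024): with max_load == 3072 (three nice-0 tasks), this_load == 1024
 * and busiest_load_per_task == 1024, we get avg_load == 2048 and
 * max_pull == min(3072 - 2048, 3072 - 1024) == 1024, so
 *
 *	*imbalance = min(1024 * 1024, (2048 - 1024) * 1024) / 1024 = 1024
 *
 * i.e. roughly one nice-0 task's worth of weighted load should be moved.
 */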
2455
2456/*
2457 * find_busiest_queue - find the busiest runqueue among the cpus in group.
2458 */
Ingo Molnar70b97a72006-07-03 00:25:42 -07002459static struct rq *
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002460find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
Christoph Lameter0a2966b2006-09-25 23:30:51 -07002461 unsigned long imbalance, cpumask_t *cpus)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002462{
Ingo Molnar70b97a72006-07-03 00:25:42 -07002463 struct rq *busiest = NULL, *rq;
Peter Williams2dd73a42006-06-27 02:54:34 -07002464 unsigned long max_load = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002465 int i;
2466
2467 for_each_cpu_mask(i, group->cpumask) {
Ingo Molnardd41f592007-07-09 18:51:59 +02002468 unsigned long wl;
Christoph Lameter0a2966b2006-09-25 23:30:51 -07002469
2470 if (!cpu_isset(i, *cpus))
2471 continue;
2472
Ingo Molnar48f24c42006-07-03 00:25:40 -07002473 rq = cpu_rq(i);
Ingo Molnardd41f592007-07-09 18:51:59 +02002474 wl = weighted_cpuload(i);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002475
Ingo Molnardd41f592007-07-09 18:51:59 +02002476 if (rq->nr_running == 1 && wl > imbalance)
Peter Williams2dd73a42006-06-27 02:54:34 -07002477 continue;
2478
Ingo Molnardd41f592007-07-09 18:51:59 +02002479 if (wl > max_load) {
2480 max_load = wl;
Ingo Molnar48f24c42006-07-03 00:25:40 -07002481 busiest = rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002482 }
2483 }
2484
2485 return busiest;
2486}
2487
2488/*
Nick Piggin77391d72005-06-25 14:57:30 -07002489 * Max backoff if we encounter pinned tasks. The value is fairly arbitrary,
2490 * as long as it is large enough.
2491 */
2492#define MAX_PINNED_INTERVAL 512
2493
Ingo Molnar48f24c42006-07-03 00:25:40 -07002494static inline unsigned long minus_1_or_zero(unsigned long n)
2495{
2496 return n > 0 ? n - 1 : 0;
2497}
2498
Nick Piggin77391d72005-06-25 14:57:30 -07002499/*
Linus Torvalds1da177e2005-04-16 15:20:36 -07002500 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2501 * tasks if there is an imbalance.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002502 */
Ingo Molnar70b97a72006-07-03 00:25:42 -07002503static int load_balance(int this_cpu, struct rq *this_rq,
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002504 struct sched_domain *sd, enum cpu_idle_type idle,
Siddha, Suresh B783609c2006-12-10 02:20:33 -08002505 int *balance)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002506{
Ingo Molnar48f24c42006-07-03 00:25:40 -07002507 int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002508 struct sched_group *group;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002509 unsigned long imbalance;
Ingo Molnar70b97a72006-07-03 00:25:42 -07002510 struct rq *busiest;
Christoph Lameter0a2966b2006-09-25 23:30:51 -07002511 cpumask_t cpus = CPU_MASK_ALL;
Christoph Lameterfe2eea32006-12-10 02:20:21 -08002512 unsigned long flags;
Nick Piggin5969fe02005-09-10 00:26:19 -07002513
Siddha, Suresh B89c47102006-10-03 01:14:09 -07002514 /*
2515 * When the power savings policy is enabled for the parent domain, an idle
2516 * sibling can pick up load irrespective of busy siblings. In this case,
Ingo Molnardd41f592007-07-09 18:51:59 +02002517 * let the state of the idle sibling percolate up as CPU_IDLE, instead of
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002518 * portraying it as CPU_NOT_IDLE.
Siddha, Suresh B89c47102006-10-03 01:14:09 -07002519 */
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002520 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
Siddha, Suresh B89c47102006-10-03 01:14:09 -07002521 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
Nick Piggin5969fe02005-09-10 00:26:19 -07002522 sd_idle = 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002523
Linus Torvalds1da177e2005-04-16 15:20:36 -07002524 schedstat_inc(sd, lb_cnt[idle]);
2525
Christoph Lameter0a2966b2006-09-25 23:30:51 -07002526redo:
2527 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
Siddha, Suresh B783609c2006-12-10 02:20:33 -08002528 &cpus, balance);
2529
Chen, Kenneth W06066712006-12-10 02:20:35 -08002530 if (*balance == 0)
Siddha, Suresh B783609c2006-12-10 02:20:33 -08002531 goto out_balanced;
Siddha, Suresh B783609c2006-12-10 02:20:33 -08002532
Linus Torvalds1da177e2005-04-16 15:20:36 -07002533 if (!group) {
2534 schedstat_inc(sd, lb_nobusyg[idle]);
2535 goto out_balanced;
2536 }
2537
Christoph Lameter0a2966b2006-09-25 23:30:51 -07002538 busiest = find_busiest_queue(group, idle, imbalance, &cpus);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002539 if (!busiest) {
2540 schedstat_inc(sd, lb_nobusyq[idle]);
2541 goto out_balanced;
2542 }
2543
Nick Piggindb935db2005-06-25 14:57:11 -07002544 BUG_ON(busiest == this_rq);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002545
2546 schedstat_add(sd, lb_imbalance[idle], imbalance);
2547
2548 nr_moved = 0;
2549 if (busiest->nr_running > 1) {
2550 /*
2551 * Attempt to move tasks. If find_busiest_group has found
2552 * an imbalance but busiest->nr_running <= 1, the group is
2553 * still unbalanced. nr_moved simply stays zero, so it is
2554 * correctly treated as an imbalance.
2555 */
Christoph Lameterfe2eea32006-12-10 02:20:21 -08002556 local_irq_save(flags);
Nick Piggine17224b2005-09-10 00:26:18 -07002557 double_rq_lock(this_rq, busiest);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002558 nr_moved = move_tasks(this_rq, this_cpu, busiest,
Ingo Molnar48f24c42006-07-03 00:25:40 -07002559 minus_1_or_zero(busiest->nr_running),
2560 imbalance, sd, idle, &all_pinned);
Nick Piggine17224b2005-09-10 00:26:18 -07002561 double_rq_unlock(this_rq, busiest);
Christoph Lameterfe2eea32006-12-10 02:20:21 -08002562 local_irq_restore(flags);
Nick Piggin81026792005-06-25 14:57:07 -07002563
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07002564 /*
2565 * some other cpu did the load balance for us.
2566 */
2567 if (nr_moved && this_cpu != smp_processor_id())
2568 resched_cpu(this_cpu);
2569
Nick Piggin81026792005-06-25 14:57:07 -07002570 /* All tasks on this runqueue were pinned by CPU affinity */
Christoph Lameter0a2966b2006-09-25 23:30:51 -07002571 if (unlikely(all_pinned)) {
2572 cpu_clear(cpu_of(busiest), cpus);
2573 if (!cpus_empty(cpus))
2574 goto redo;
Nick Piggin81026792005-06-25 14:57:07 -07002575 goto out_balanced;
Christoph Lameter0a2966b2006-09-25 23:30:51 -07002576 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002577 }
Nick Piggin81026792005-06-25 14:57:07 -07002578
Linus Torvalds1da177e2005-04-16 15:20:36 -07002579 if (!nr_moved) {
2580 schedstat_inc(sd, lb_failed[idle]);
2581 sd->nr_balance_failed++;
2582
2583 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002584
Christoph Lameterfe2eea32006-12-10 02:20:21 -08002585 spin_lock_irqsave(&busiest->lock, flags);
Siddha, Suresh Bfa3b6dd2005-09-10 00:26:21 -07002586
2587 /* don't kick the migration_thread if the curr
2588 * task on the busiest cpu can't be moved to this_cpu
2589 */
2590 if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
Christoph Lameterfe2eea32006-12-10 02:20:21 -08002591 spin_unlock_irqrestore(&busiest->lock, flags);
Siddha, Suresh Bfa3b6dd2005-09-10 00:26:21 -07002592 all_pinned = 1;
2593 goto out_one_pinned;
2594 }
2595
Linus Torvalds1da177e2005-04-16 15:20:36 -07002596 if (!busiest->active_balance) {
2597 busiest->active_balance = 1;
2598 busiest->push_cpu = this_cpu;
Nick Piggin81026792005-06-25 14:57:07 -07002599 active_balance = 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002600 }
Christoph Lameterfe2eea32006-12-10 02:20:21 -08002601 spin_unlock_irqrestore(&busiest->lock, flags);
Nick Piggin81026792005-06-25 14:57:07 -07002602 if (active_balance)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002603 wake_up_process(busiest->migration_thread);
2604
2605 /*
2606 * We've kicked active balancing, reset the failure
2607 * counter.
2608 */
Nick Piggin39507452005-06-25 14:57:09 -07002609 sd->nr_balance_failed = sd->cache_nice_tries+1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002610 }
Nick Piggin81026792005-06-25 14:57:07 -07002611 } else
Linus Torvalds1da177e2005-04-16 15:20:36 -07002612 sd->nr_balance_failed = 0;
2613
Nick Piggin81026792005-06-25 14:57:07 -07002614 if (likely(!active_balance)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002615 /* We were unbalanced, so reset the balancing interval */
2616 sd->balance_interval = sd->min_interval;
Nick Piggin81026792005-06-25 14:57:07 -07002617 } else {
2618 /*
2619 * If we've begun active balancing, start to back off. This
2620 * case may not be covered by the all_pinned logic if there
2621 * is only 1 task on the busy runqueue (because we don't call
2622 * move_tasks).
2623 */
2624 if (sd->balance_interval < sd->max_interval)
2625 sd->balance_interval *= 2;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002626 }
2627
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002628 if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
Siddha, Suresh B89c47102006-10-03 01:14:09 -07002629 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
Nick Piggin5969fe02005-09-10 00:26:19 -07002630 return -1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002631 return nr_moved;
2632
2633out_balanced:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002634 schedstat_inc(sd, lb_balanced[idle]);
2635
Nick Piggin16cfb1c2005-06-25 14:57:08 -07002636 sd->nr_balance_failed = 0;
Siddha, Suresh Bfa3b6dd2005-09-10 00:26:21 -07002637
2638out_one_pinned:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002639 /* tune up the balancing interval */
Nick Piggin77391d72005-06-25 14:57:30 -07002640 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
2641 (sd->balance_interval < sd->max_interval))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002642 sd->balance_interval *= 2;
2643
Ingo Molnar48f24c42006-07-03 00:25:40 -07002644 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
Siddha, Suresh B89c47102006-10-03 01:14:09 -07002645 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
Nick Piggin5969fe02005-09-10 00:26:19 -07002646 return -1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002647 return 0;
2648}
2649
2650/*
2651 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2652 * tasks if there is an imbalance.
2653 *
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002654 * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE).
Linus Torvalds1da177e2005-04-16 15:20:36 -07002655 * this_rq is locked.
2656 */
Ingo Molnar48f24c42006-07-03 00:25:40 -07002657static int
Ingo Molnar70b97a72006-07-03 00:25:42 -07002658load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002659{
2660 struct sched_group *group;
Ingo Molnar70b97a72006-07-03 00:25:42 -07002661 struct rq *busiest = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002662 unsigned long imbalance;
2663 int nr_moved = 0;
Nick Piggin5969fe02005-09-10 00:26:19 -07002664 int sd_idle = 0;
Christoph Lameter0a2966b2006-09-25 23:30:51 -07002665 cpumask_t cpus = CPU_MASK_ALL;
Nick Piggin5969fe02005-09-10 00:26:19 -07002666
Siddha, Suresh B89c47102006-10-03 01:14:09 -07002667 /*
2668 * When the power savings policy is enabled for the parent domain, an idle
2669 * sibling can pick up load irrespective of busy siblings. In this case,
2670 * let the state of the idle sibling percolate up as CPU_IDLE, instead of
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002671 * portraying it as CPU_NOT_IDLE.
Siddha, Suresh B89c47102006-10-03 01:14:09 -07002672 */
2673 if (sd->flags & SD_SHARE_CPUPOWER &&
2674 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
Nick Piggin5969fe02005-09-10 00:26:19 -07002675 sd_idle = 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002676
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002677 schedstat_inc(sd, lb_cnt[CPU_NEWLY_IDLE]);
Christoph Lameter0a2966b2006-09-25 23:30:51 -07002678redo:
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002679 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
Siddha, Suresh B783609c2006-12-10 02:20:33 -08002680 &sd_idle, &cpus, NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002681 if (!group) {
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002682 schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
Nick Piggin16cfb1c2005-06-25 14:57:08 -07002683 goto out_balanced;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002684 }
2685
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002686 busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance,
Christoph Lameter0a2966b2006-09-25 23:30:51 -07002687 &cpus);
Nick Piggindb935db2005-06-25 14:57:11 -07002688 if (!busiest) {
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002689 schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
Nick Piggin16cfb1c2005-06-25 14:57:08 -07002690 goto out_balanced;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002691 }
2692
Nick Piggindb935db2005-06-25 14:57:11 -07002693 BUG_ON(busiest == this_rq);
2694
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002695 schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);
Nick Piggind6d5cfa2005-09-10 00:26:16 -07002696
2697 nr_moved = 0;
2698 if (busiest->nr_running > 1) {
2699 /* Attempt to move tasks */
2700 double_lock_balance(this_rq, busiest);
2701 nr_moved = move_tasks(this_rq, this_cpu, busiest,
Peter Williams2dd73a42006-06-27 02:54:34 -07002702 minus_1_or_zero(busiest->nr_running),
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002703 imbalance, sd, CPU_NEWLY_IDLE, NULL);
Nick Piggind6d5cfa2005-09-10 00:26:16 -07002704 spin_unlock(&busiest->lock);
Christoph Lameter0a2966b2006-09-25 23:30:51 -07002705
2706 if (!nr_moved) {
2707 cpu_clear(cpu_of(busiest), cpus);
2708 if (!cpus_empty(cpus))
2709 goto redo;
2710 }
Nick Piggind6d5cfa2005-09-10 00:26:16 -07002711 }
2712
Nick Piggin5969fe02005-09-10 00:26:19 -07002713 if (!nr_moved) {
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002714 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
Siddha, Suresh B89c47102006-10-03 01:14:09 -07002715 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2716 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
Nick Piggin5969fe02005-09-10 00:26:19 -07002717 return -1;
2718 } else
Nick Piggin16cfb1c2005-06-25 14:57:08 -07002719 sd->nr_balance_failed = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002720
Linus Torvalds1da177e2005-04-16 15:20:36 -07002721 return nr_moved;
Nick Piggin16cfb1c2005-06-25 14:57:08 -07002722
2723out_balanced:
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002724 schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
Ingo Molnar48f24c42006-07-03 00:25:40 -07002725 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
Siddha, Suresh B89c47102006-10-03 01:14:09 -07002726 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
Nick Piggin5969fe02005-09-10 00:26:19 -07002727 return -1;
Nick Piggin16cfb1c2005-06-25 14:57:08 -07002728 sd->nr_balance_failed = 0;
Ingo Molnar48f24c42006-07-03 00:25:40 -07002729
Nick Piggin16cfb1c2005-06-25 14:57:08 -07002730 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002731}
2732
2733/*
2734 * idle_balance is called by schedule() if this_cpu is about to become
2735 * idle. Attempts to pull tasks from other CPUs.
2736 */
Ingo Molnar70b97a72006-07-03 00:25:42 -07002737static void idle_balance(int this_cpu, struct rq *this_rq)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002738{
2739 struct sched_domain *sd;
Ingo Molnardd41f592007-07-09 18:51:59 +02002740 int pulled_task = -1;
2741 unsigned long next_balance = jiffies + HZ;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002742
2743 for_each_domain(this_cpu, sd) {
Christoph Lameter92c4ca52007-06-23 17:16:33 -07002744 unsigned long interval;
2745
2746 if (!(sd->flags & SD_LOAD_BALANCE))
2747 continue;
2748
2749 if (sd->flags & SD_BALANCE_NEWIDLE)
Ingo Molnar48f24c42006-07-03 00:25:40 -07002750 /* If we've pulled tasks over stop searching: */
Christoph Lameter1bd77f22006-12-10 02:20:27 -08002751 pulled_task = load_balance_newidle(this_cpu,
Christoph Lameter92c4ca52007-06-23 17:16:33 -07002752 this_rq, sd);
2753
2754 interval = msecs_to_jiffies(sd->balance_interval);
2755 if (time_after(next_balance, sd->last_balance + interval))
2756 next_balance = sd->last_balance + interval;
2757 if (pulled_task)
2758 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002759 }
Ingo Molnardd41f592007-07-09 18:51:59 +02002760 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
Christoph Lameter1bd77f22006-12-10 02:20:27 -08002761 /*
2762 * We are going idle. next_balance may be set based on
2763 * a busy processor. So reset next_balance.
2764 */
2765 this_rq->next_balance = next_balance;
Ingo Molnardd41f592007-07-09 18:51:59 +02002766 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002767}
2768
2769/*
2770 * active_load_balance is run by migration threads. It pushes running tasks
2771 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
2772 * running on each physical CPU where possible, and avoids physical /
2773 * logical imbalances.
2774 *
2775 * Called with busiest_rq locked.
2776 */
Ingo Molnar70b97a72006-07-03 00:25:42 -07002777static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002778{
Nick Piggin39507452005-06-25 14:57:09 -07002779 int target_cpu = busiest_rq->push_cpu;
Ingo Molnar70b97a72006-07-03 00:25:42 -07002780 struct sched_domain *sd;
2781 struct rq *target_rq;
Nick Piggin39507452005-06-25 14:57:09 -07002782
Ingo Molnar48f24c42006-07-03 00:25:40 -07002783 /* Is there any task to move? */
Nick Piggin39507452005-06-25 14:57:09 -07002784 if (busiest_rq->nr_running <= 1)
Nick Piggin39507452005-06-25 14:57:09 -07002785 return;
2786
2787 target_rq = cpu_rq(target_cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002788
2789 /*
Nick Piggin39507452005-06-25 14:57:09 -07002790 * This condition is "impossible", if it occurs
2791 * we need to fix it. Originally reported by
2792 * Bjorn Helgaas on a 128-cpu setup.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002793 */
Nick Piggin39507452005-06-25 14:57:09 -07002794 BUG_ON(busiest_rq == target_rq);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002795
Nick Piggin39507452005-06-25 14:57:09 -07002796 /* move a task from busiest_rq to target_rq */
2797 double_lock_balance(busiest_rq, target_rq);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002798
Nick Piggin39507452005-06-25 14:57:09 -07002799 /* Search for an sd spanning us and the target CPU. */
Chen, Kenneth Wc96d1452006-06-27 02:54:28 -07002800 for_each_domain(target_cpu, sd) {
Nick Piggin39507452005-06-25 14:57:09 -07002801 if ((sd->flags & SD_LOAD_BALANCE) &&
Ingo Molnar48f24c42006-07-03 00:25:40 -07002802 cpu_isset(busiest_cpu, sd->span))
Nick Piggin39507452005-06-25 14:57:09 -07002803 break;
Chen, Kenneth Wc96d1452006-06-27 02:54:28 -07002804 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002805
Ingo Molnar48f24c42006-07-03 00:25:40 -07002806 if (likely(sd)) {
2807 schedstat_inc(sd, alb_cnt);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002808
Ingo Molnar48f24c42006-07-03 00:25:40 -07002809 if (move_tasks(target_rq, target_cpu, busiest_rq, 1,
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002810 RTPRIO_TO_LOAD_WEIGHT(100), sd, CPU_IDLE,
Ingo Molnar48f24c42006-07-03 00:25:40 -07002811 NULL))
2812 schedstat_inc(sd, alb_pushed);
2813 else
2814 schedstat_inc(sd, alb_failed);
2815 }
Nick Piggin39507452005-06-25 14:57:09 -07002816 spin_unlock(&target_rq->lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002817}
2818
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07002819#ifdef CONFIG_NO_HZ
2820static struct {
2821 atomic_t load_balancer;
2822 cpumask_t cpu_mask;
2823} nohz ____cacheline_aligned = {
2824 .load_balancer = ATOMIC_INIT(-1),
2825 .cpu_mask = CPU_MASK_NONE,
2826};
2827
Christoph Lameter7835b982006-12-10 02:20:22 -08002828/*
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07002829 * This routine tries to nominate the ilb (idle load balancing)
 2830 * owner among the cpus whose ticks are stopped. The ilb owner does the
 2831 * idle load balancing on behalf of all those cpus. If all the cpus in the
 2832 * system go into this tickless mode, then there will be no ilb owner (as
 2833 * there is no need for one) and all the cpus will sleep till the next
 2834 * wakeup event arrives.
Christoph Lameter7835b982006-12-10 02:20:22 -08002835 *
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07002836 * For the ilb owner, the tick is not stopped, and that tick is used
 2837 * for idle load balancing. The ilb owner still remains part of
 2838 * nohz.cpu_mask.
 2839 *
 2840 * While stopping the tick, this cpu becomes the ilb owner if there
 2841 * is no other owner, and it stays the owner until it becomes busy or
 2842 * until all cpus in the system stop their ticks, at which point
 2843 * there is no need for an ilb owner.
 2844 *
 2845 * When the ilb owner becomes busy, it nominates another owner during
 2846 * the next busy scheduler_tick().
2847 */
2848int select_nohz_load_balancer(int stop_tick)
2849{
2850 int cpu = smp_processor_id();
2851
2852 if (stop_tick) {
2853 cpu_set(cpu, nohz.cpu_mask);
2854 cpu_rq(cpu)->in_nohz_recently = 1;
2855
2856 /*
2857 * If we are going offline and still the leader, give up!
2858 */
2859 if (cpu_is_offline(cpu) &&
2860 atomic_read(&nohz.load_balancer) == cpu) {
2861 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
2862 BUG();
2863 return 0;
2864 }
2865
2866 /* time for ilb owner also to sleep */
2867 if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
2868 if (atomic_read(&nohz.load_balancer) == cpu)
2869 atomic_set(&nohz.load_balancer, -1);
2870 return 0;
2871 }
2872
2873 if (atomic_read(&nohz.load_balancer) == -1) {
2874 /* make me the ilb owner */
2875 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
2876 return 1;
2877 } else if (atomic_read(&nohz.load_balancer) == cpu)
2878 return 1;
2879 } else {
2880 if (!cpu_isset(cpu, nohz.cpu_mask))
2881 return 0;
2882
2883 cpu_clear(cpu, nohz.cpu_mask);
2884
2885 if (atomic_read(&nohz.load_balancer) == cpu)
2886 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
2887 BUG();
2888 }
2889 return 0;
2890}
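/*
 * A minimal usage sketch: the expected call pattern from the code that
 * stops and restarts the tick.  The caller shown here is hypothetical
 * (keep_the_tick()/stop_the_tick() stand in for the caller's own tick
 * management); only the return-value contract of
 * select_nohz_load_balancer() above is real.
 *
 *	if (select_nohz_load_balancer(1))
 *		keep_the_tick();	(this cpu is now the ilb owner)
 *	else
 *		stop_the_tick();	(safe to go fully tickless)
 *
 *	...and on becoming busy again:
 *
 *	select_nohz_load_balancer(0);	(leaves nohz.cpu_mask, drops ownership)
 */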
2891#endif
2892
2893static DEFINE_SPINLOCK(balancing);
2894
2895/*
Christoph Lameter7835b982006-12-10 02:20:22 -08002896 * rebalance_domains() checks each scheduling domain to see if it is due
 2897 * to be balanced, and initiates a balancing operation if so.
2898 *
2899 * Balancing parameters are set up in arch_init_sched_domains.
2900 */
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002901static inline void rebalance_domains(int cpu, enum cpu_idle_type idle)
Christoph Lameter7835b982006-12-10 02:20:22 -08002902{
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07002903 int balance = 1;
2904 struct rq *rq = cpu_rq(cpu);
Christoph Lameter7835b982006-12-10 02:20:22 -08002905 unsigned long interval;
2906 struct sched_domain *sd;
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07002907 /* Earliest time when we have to do rebalance again */
Christoph Lameterc9819f42006-12-10 02:20:25 -08002908 unsigned long next_balance = jiffies + 60*HZ;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002909
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07002910 for_each_domain(cpu, sd) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002911 if (!(sd->flags & SD_LOAD_BALANCE))
2912 continue;
2913
2914 interval = sd->balance_interval;
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002915 if (idle != CPU_IDLE)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002916 interval *= sd->busy_factor;
2917
2918 /* scale ms to jiffies */
2919 interval = msecs_to_jiffies(interval);
2920 if (unlikely(!interval))
2921 interval = 1;
Ingo Molnardd41f592007-07-09 18:51:59 +02002922 if (interval > HZ*NR_CPUS/10)
2923 interval = HZ*NR_CPUS/10;
2924
Linus Torvalds1da177e2005-04-16 15:20:36 -07002925
Christoph Lameter08c183f2006-12-10 02:20:29 -08002926 if (sd->flags & SD_SERIALIZE) {
2927 if (!spin_trylock(&balancing))
2928 goto out;
2929 }
2930
Christoph Lameterc9819f42006-12-10 02:20:25 -08002931 if (time_after_eq(jiffies, sd->last_balance + interval)) {
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07002932 if (load_balance(cpu, rq, sd, idle, &balance)) {
Siddha, Suresh Bfa3b6dd2005-09-10 00:26:21 -07002933 /*
2934 * We've pulled tasks over so either we're no
Nick Piggin5969fe02005-09-10 00:26:19 -07002935 * longer idle, or one of our SMT siblings is
2936 * not idle.
2937 */
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002938 idle = CPU_NOT_IDLE;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002939 }
Christoph Lameter1bd77f22006-12-10 02:20:27 -08002940 sd->last_balance = jiffies;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002941 }
Christoph Lameter08c183f2006-12-10 02:20:29 -08002942 if (sd->flags & SD_SERIALIZE)
2943 spin_unlock(&balancing);
2944out:
Christoph Lameterc9819f42006-12-10 02:20:25 -08002945 if (time_after(next_balance, sd->last_balance + interval))
2946 next_balance = sd->last_balance + interval;
Siddha, Suresh B783609c2006-12-10 02:20:33 -08002947
2948 /*
2949 * Stop the load balance at this level. There is another
2950 * CPU in our sched group which is doing load balancing more
2951 * actively.
2952 */
2953 if (!balance)
2954 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002955 }
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07002956 rq->next_balance = next_balance;
2957}
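/*
 * Worked example of the interval arithmetic above (HZ == 250 and
 * NR_CPUS == 8 are assumed values, purely for illustration):
 *
 *	sd->balance_interval = 64 ms, sd->busy_factor = 32, cpu not idle
 *	  => interval = 64 * 32 = 2048 ms
 *	  => msecs_to_jiffies(2048) = 512 jiffies
 *	  => clamped to HZ*NR_CPUS/10 = 250*8/10 = 200 jiffies (~800 ms)
 *
 * An idle cpu skips the busy_factor scaling and rebalances this domain
 * every 64 ms instead.
 */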
2958
2959/*
2960 * run_rebalance_domains is triggered when needed from the scheduler tick.
 2961 * In the CONFIG_NO_HZ case, the idle load balance owner will do the
 2962 * rebalancing for all the cpus for which scheduler ticks are stopped.
2963 */
2964static void run_rebalance_domains(struct softirq_action *h)
2965{
Ingo Molnardd41f592007-07-09 18:51:59 +02002966 int this_cpu = smp_processor_id();
2967 struct rq *this_rq = cpu_rq(this_cpu);
2968 enum cpu_idle_type idle = this_rq->idle_at_tick ?
2969 CPU_IDLE : CPU_NOT_IDLE;
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07002970
Ingo Molnardd41f592007-07-09 18:51:59 +02002971 rebalance_domains(this_cpu, idle);
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07002972
2973#ifdef CONFIG_NO_HZ
2974 /*
2975 * If this cpu is the owner for idle load balancing, then do the
2976 * balancing on behalf of the other idle cpus whose ticks are
2977 * stopped.
2978 */
Ingo Molnardd41f592007-07-09 18:51:59 +02002979 if (this_rq->idle_at_tick &&
2980 atomic_read(&nohz.load_balancer) == this_cpu) {
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07002981 cpumask_t cpus = nohz.cpu_mask;
2982 struct rq *rq;
2983 int balance_cpu;
2984
Ingo Molnardd41f592007-07-09 18:51:59 +02002985 cpu_clear(this_cpu, cpus);
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07002986 for_each_cpu_mask(balance_cpu, cpus) {
2987 /*
2988 * If this cpu gets work to do, stop the load balancing
2989 * work being done for other cpus. Next load
2990 * balancing owner will pick it up.
2991 */
2992 if (need_resched())
2993 break;
2994
Ingo Molnardd41f592007-07-09 18:51:59 +02002995 rebalance_domains(balance_cpu, CPU_IDLE);
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07002996
2997 rq = cpu_rq(balance_cpu);
Ingo Molnardd41f592007-07-09 18:51:59 +02002998 if (time_after(this_rq->next_balance, rq->next_balance))
2999 this_rq->next_balance = rq->next_balance;
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07003000 }
3001 }
3002#endif
3003}
3004
3005/*
3006 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
3007 *
3008 * In case of CONFIG_NO_HZ, this is the place where we nominate a new
3009 * idle load balancing owner or decide to stop the periodic load balancing,
3010 * if the whole system is idle.
3011 */
Ingo Molnardd41f592007-07-09 18:51:59 +02003012static inline void trigger_load_balance(struct rq *rq, int cpu)
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07003013{
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07003014#ifdef CONFIG_NO_HZ
3015 /*
3016 * If we were in the nohz mode recently and busy at the current
 3017 * scheduler tick, then check if we need to nominate a new idle
3018 * load balancer.
3019 */
3020 if (rq->in_nohz_recently && !rq->idle_at_tick) {
3021 rq->in_nohz_recently = 0;
3022
3023 if (atomic_read(&nohz.load_balancer) == cpu) {
3024 cpu_clear(cpu, nohz.cpu_mask);
3025 atomic_set(&nohz.load_balancer, -1);
3026 }
3027
3028 if (atomic_read(&nohz.load_balancer) == -1) {
3029 /*
3030 * simple selection for now: Nominate the
3031 * first cpu in the nohz list to be the next
3032 * ilb owner.
3033 *
3034 * TBD: Traverse the sched domains and nominate
3035 * the nearest cpu in the nohz.cpu_mask.
3036 */
3037 int ilb = first_cpu(nohz.cpu_mask);
3038
3039 if (ilb != NR_CPUS)
3040 resched_cpu(ilb);
3041 }
3042 }
3043
3044 /*
3045 * If this cpu is idle and doing idle load balancing for all the
3046 * cpus with ticks stopped, is it time for that to stop?
3047 */
3048 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
3049 cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
3050 resched_cpu(cpu);
3051 return;
3052 }
3053
3054 /*
3055 * If this cpu is idle and the idle load balancing is done by
 3056 * someone else, then there is no need to raise the SCHED_SOFTIRQ.
3057 */
3058 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
3059 cpu_isset(cpu, nohz.cpu_mask))
3060 return;
3061#endif
3062 if (time_after_eq(jiffies, rq->next_balance))
3063 raise_softirq(SCHED_SOFTIRQ);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003064}
Ingo Molnardd41f592007-07-09 18:51:59 +02003065
3066#else /* CONFIG_SMP */
3067
Linus Torvalds1da177e2005-04-16 15:20:36 -07003068/*
3069 * on UP we do not need to balance between CPUs:
3070 */
Ingo Molnar70b97a72006-07-03 00:25:42 -07003071static inline void idle_balance(int cpu, struct rq *rq)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003072{
3073}
Ingo Molnardd41f592007-07-09 18:51:59 +02003074
3075/* Avoid "used but not defined" warning on UP */
3076static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3077 unsigned long max_nr_move, unsigned long max_load_move,
3078 struct sched_domain *sd, enum cpu_idle_type idle,
3079 int *all_pinned, unsigned long *load_moved,
3080 int this_best_prio, int best_prio, int best_prio_seen,
3081 struct rq_iterator *iterator)
3082{
3083 *load_moved = 0;
3084
3085 return 0;
3086}
3087
Linus Torvalds1da177e2005-04-16 15:20:36 -07003088#endif
3089
Linus Torvalds1da177e2005-04-16 15:20:36 -07003090DEFINE_PER_CPU(struct kernel_stat, kstat);
3091
3092EXPORT_PER_CPU_SYMBOL(kstat);
3093
3094/*
Ingo Molnar41b86e92007-07-09 18:51:58 +02003095 * Return p->sum_exec_runtime plus any more ns on the sched_clock
3096 * that have not yet been banked in case the task is currently running.
Linus Torvalds1da177e2005-04-16 15:20:36 -07003097 */
Ingo Molnar41b86e92007-07-09 18:51:58 +02003098unsigned long long task_sched_runtime(struct task_struct *p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003099{
Linus Torvalds1da177e2005-04-16 15:20:36 -07003100 unsigned long flags;
Ingo Molnar41b86e92007-07-09 18:51:58 +02003101 u64 ns, delta_exec;
3102 struct rq *rq;
Ingo Molnar48f24c42006-07-03 00:25:40 -07003103
Ingo Molnar41b86e92007-07-09 18:51:58 +02003104 rq = task_rq_lock(p, &flags);
3105 ns = p->se.sum_exec_runtime;
3106 if (rq->curr == p) {
3107 delta_exec = rq_clock(rq) - p->se.exec_start;
3108 if ((s64)delta_exec > 0)
3109 ns += delta_exec;
3110 }
3111 task_rq_unlock(rq, &flags);
Ingo Molnar48f24c42006-07-03 00:25:40 -07003112
Linus Torvalds1da177e2005-04-16 15:20:36 -07003113 return ns;
3114}
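/*
 * This value ends up backing the posix cpu clocks (CPUCLOCK_SCHED), so a
 * minimal user-space sketch that observes it, assuming the usual
 * CLOCK_THREAD_CPUTIME_ID mapping, is:
 *
 *	#include <stdio.h>
 *	#include <time.h>
 *
 *	int main(void)
 *	{
 *		struct timespec ts;
 *
 *		clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);
 *		printf("thread ran for %ld.%09ld s\n",
 *			(long)ts.tv_sec, ts.tv_nsec);
 *		return 0;
 *	}
 */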
3115
3116/*
Linus Torvalds1da177e2005-04-16 15:20:36 -07003117 * Account user cpu time to a process.
3118 * @p: the process that the cpu time gets accounted to
3120 * @cputime: the cpu time spent in user space since the last update
3121 */
3122void account_user_time(struct task_struct *p, cputime_t cputime)
3123{
3124 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3125 cputime64_t tmp;
3126
3127 p->utime = cputime_add(p->utime, cputime);
3128
3129 /* Add user time to cpustat. */
3130 tmp = cputime_to_cputime64(cputime);
3131 if (TASK_NICE(p) > 0)
3132 cpustat->nice = cputime64_add(cpustat->nice, tmp);
3133 else
3134 cpustat->user = cputime64_add(cpustat->user, tmp);
3135}
3136
3137/*
3138 * Account system cpu time to a process.
3139 * @p: the process that the cpu time gets accounted to
3140 * @hardirq_offset: the offset to subtract from hardirq_count()
3141 * @cputime: the cpu time spent in kernel space since the last update
3142 */
3143void account_system_time(struct task_struct *p, int hardirq_offset,
3144 cputime_t cputime)
3145{
3146 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
Ingo Molnar70b97a72006-07-03 00:25:42 -07003147 struct rq *rq = this_rq();
Linus Torvalds1da177e2005-04-16 15:20:36 -07003148 cputime64_t tmp;
3149
3150 p->stime = cputime_add(p->stime, cputime);
3151
3152 /* Add system time to cpustat. */
3153 tmp = cputime_to_cputime64(cputime);
3154 if (hardirq_count() - hardirq_offset)
3155 cpustat->irq = cputime64_add(cpustat->irq, tmp);
3156 else if (softirq_count())
3157 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
3158 else if (p != rq->idle)
3159 cpustat->system = cputime64_add(cpustat->system, tmp);
3160 else if (atomic_read(&rq->nr_iowait) > 0)
3161 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
3162 else
3163 cpustat->idle = cputime64_add(cpustat->idle, tmp);
3164 /* Account for system time used */
3165 acct_update_integrals(p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003166}
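/*
 * Classification example: with the tick's own hardirq level subtracted via
 * @hardirq_offset, a tick that interrupts
 *
 *	- another hardirq handler	is charged to cpustat->irq,
 *	- a softirq handler		is charged to cpustat->softirq,
 *	- a normal task in kernel mode	is charged to cpustat->system,
 *	- the idle task			is charged to cpustat->iowait if some
 *					task on this runqueue is in
 *					io_schedule(), else to cpustat->idle.
 */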
3167
3168/*
3169 * Account for involuntary wait time.
3170 * @p: the process from which the cpu time has been stolen
3171 * @steal: the cpu time spent in involuntary wait
3172 */
3173void account_steal_time(struct task_struct *p, cputime_t steal)
3174{
3175 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3176 cputime64_t tmp = cputime_to_cputime64(steal);
Ingo Molnar70b97a72006-07-03 00:25:42 -07003177 struct rq *rq = this_rq();
Linus Torvalds1da177e2005-04-16 15:20:36 -07003178
3179 if (p == rq->idle) {
3180 p->stime = cputime_add(p->stime, steal);
3181 if (atomic_read(&rq->nr_iowait) > 0)
3182 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
3183 else
3184 cpustat->idle = cputime64_add(cpustat->idle, tmp);
3185 } else
3186 cpustat->steal = cputime64_add(cpustat->steal, tmp);
3187}
3188
Christoph Lameter7835b982006-12-10 02:20:22 -08003189/*
3190 * This function gets called by the timer code, with HZ frequency.
3191 * We call it with interrupts disabled.
3192 *
3193 * It also gets called by the fork code, when changing the parent's
3194 * timeslices.
3195 */
3196void scheduler_tick(void)
3197{
Christoph Lameter7835b982006-12-10 02:20:22 -08003198 int cpu = smp_processor_id();
3199 struct rq *rq = cpu_rq(cpu);
Ingo Molnardd41f592007-07-09 18:51:59 +02003200 struct task_struct *curr = rq->curr;
Christoph Lameter7835b982006-12-10 02:20:22 -08003201
Ingo Molnardd41f592007-07-09 18:51:59 +02003202 spin_lock(&rq->lock);
3203 if (curr != rq->idle) /* FIXME: needed? */
3204 curr->sched_class->task_tick(rq, curr);
3205 update_cpu_load(rq);
3206 spin_unlock(&rq->lock);
3207
Christoph Lametere418e1c2006-12-10 02:20:23 -08003208#ifdef CONFIG_SMP
Ingo Molnardd41f592007-07-09 18:51:59 +02003209 rq->idle_at_tick = idle_cpu(cpu);
3210 trigger_load_balance(rq, cpu);
Christoph Lametere418e1c2006-12-10 02:20:23 -08003211#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07003212}
3213
Linus Torvalds1da177e2005-04-16 15:20:36 -07003214#if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT)
3215
3216void fastcall add_preempt_count(int val)
3217{
3218 /*
3219 * Underflow?
3220 */
Ingo Molnar9a11b49a2006-07-03 00:24:33 -07003221 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
3222 return;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003223 preempt_count() += val;
3224 /*
3225 * Spinlock count overflowing soon?
3226 */
Miguel Ojeda Sandonis33859f72006-12-10 02:20:38 -08003227 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
3228 PREEMPT_MASK - 10);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003229}
3230EXPORT_SYMBOL(add_preempt_count);
3231
3232void fastcall sub_preempt_count(int val)
3233{
3234 /*
3235 * Underflow?
3236 */
Ingo Molnar9a11b49a2006-07-03 00:24:33 -07003237 if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
3238 return;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003239 /*
3240 * Is the spinlock portion underflowing?
3241 */
Ingo Molnar9a11b49a2006-07-03 00:24:33 -07003242 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
3243 !(preempt_count() & PREEMPT_MASK)))
3244 return;
3245
Linus Torvalds1da177e2005-04-16 15:20:36 -07003246 preempt_count() -= val;
3247}
3248EXPORT_SYMBOL(sub_preempt_count);
3249
3250#endif
3251
3252/*
Ingo Molnardd41f592007-07-09 18:51:59 +02003253 * Print scheduling while atomic bug:
Linus Torvalds1da177e2005-04-16 15:20:36 -07003254 */
Ingo Molnardd41f592007-07-09 18:51:59 +02003255static noinline void __schedule_bug(struct task_struct *prev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003256{
Ingo Molnardd41f592007-07-09 18:51:59 +02003257 printk(KERN_ERR "BUG: scheduling while atomic: %s/0x%08x/%d\n",
3258 prev->comm, preempt_count(), prev->pid);
3259 debug_show_held_locks(prev);
3260 if (irqs_disabled())
3261 print_irqtrace_events(prev);
3262 dump_stack();
3263}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003264
Ingo Molnardd41f592007-07-09 18:51:59 +02003265/*
3266 * Various schedule()-time debugging checks and statistics:
3267 */
3268static inline void schedule_debug(struct task_struct *prev)
3269{
Linus Torvalds1da177e2005-04-16 15:20:36 -07003270 /*
3271 * Test if we are atomic. Since do_exit() needs to call into
3272 * schedule() atomically, we ignore that path for now.
3273 * Otherwise, whine if we are scheduling when we should not be.
3274 */
Ingo Molnardd41f592007-07-09 18:51:59 +02003275 if (unlikely(in_atomic_preempt_off()) && unlikely(!prev->exit_state))
3276 __schedule_bug(prev);
3277
Linus Torvalds1da177e2005-04-16 15:20:36 -07003278 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
3279
Ingo Molnardd41f592007-07-09 18:51:59 +02003280 schedstat_inc(this_rq(), sched_cnt);
3281}
3282
3283/*
3284 * Pick up the highest-prio task:
3285 */
3286static inline struct task_struct *
3287pick_next_task(struct rq *rq, struct task_struct *prev, u64 now)
3288{
3289 struct sched_class *class;
3290 struct task_struct *p;
3291
3292 /*
3293 * Optimization: we know that if all tasks are in
3294 * the fair class we can call that function directly:
3295 */
3296 if (likely(rq->nr_running == rq->cfs.nr_running)) {
3297 p = fair_sched_class.pick_next_task(rq, now);
3298 if (likely(p))
3299 return p;
3300 }
3301
3302 class = sched_class_highest;
3303 for ( ; ; ) {
3304 p = class->pick_next_task(rq, now);
3305 if (p)
3306 return p;
3307 /*
3308 * Will never be NULL as the idle class always
3309 * returns a non-NULL p:
3310 */
3311 class = class->next;
3312 }
3313}
3314
3315/*
3316 * schedule() is the main scheduler function.
3317 */
3318asmlinkage void __sched schedule(void)
3319{
3320 struct task_struct *prev, *next;
3321 long *switch_count;
3322 struct rq *rq;
3323 u64 now;
3324 int cpu;
3325
Linus Torvalds1da177e2005-04-16 15:20:36 -07003326need_resched:
3327 preempt_disable();
Ingo Molnardd41f592007-07-09 18:51:59 +02003328 cpu = smp_processor_id();
3329 rq = cpu_rq(cpu);
3330 rcu_qsctr_inc(cpu);
3331 prev = rq->curr;
3332 switch_count = &prev->nivcsw;
3333
Linus Torvalds1da177e2005-04-16 15:20:36 -07003334 release_kernel_lock(prev);
3335need_resched_nonpreemptible:
Linus Torvalds1da177e2005-04-16 15:20:36 -07003336
Ingo Molnardd41f592007-07-09 18:51:59 +02003337 schedule_debug(prev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003338
3339 spin_lock_irq(&rq->lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003340 clear_tsk_need_resched(prev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003341
Ingo Molnardd41f592007-07-09 18:51:59 +02003342 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
3343 if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
3344 unlikely(signal_pending(prev)))) {
3345 prev->state = TASK_RUNNING;
3346 } else {
3347 deactivate_task(rq, prev, 1);
3348 }
3349 switch_count = &prev->nvcsw;
3350 }
3351
3352 if (unlikely(!rq->nr_running))
3353 idle_balance(cpu, rq);
3354
3355 now = __rq_clock(rq);
3356 prev->sched_class->put_prev_task(rq, prev, now);
3357 next = pick_next_task(rq, prev, now);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003358
3359 sched_info_switch(prev, next);
Ingo Molnardd41f592007-07-09 18:51:59 +02003360
Linus Torvalds1da177e2005-04-16 15:20:36 -07003361 if (likely(prev != next)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003362 rq->nr_switches++;
3363 rq->curr = next;
3364 ++*switch_count;
3365
Ingo Molnardd41f592007-07-09 18:51:59 +02003366 context_switch(rq, prev, next); /* unlocks the rq */
Linus Torvalds1da177e2005-04-16 15:20:36 -07003367 } else
3368 spin_unlock_irq(&rq->lock);
3369
Ingo Molnardd41f592007-07-09 18:51:59 +02003370 if (unlikely(reacquire_kernel_lock(current) < 0)) {
3371 cpu = smp_processor_id();
3372 rq = cpu_rq(cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003373 goto need_resched_nonpreemptible;
Ingo Molnardd41f592007-07-09 18:51:59 +02003374 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003375 preempt_enable_no_resched();
3376 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3377 goto need_resched;
3378}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003379EXPORT_SYMBOL(schedule);
3380
3381#ifdef CONFIG_PREEMPT
3382/*
Andreas Mohr2ed6e342006-07-10 04:43:52 -07003383 * This is the entry point to schedule() from in-kernel preemption
Linus Torvalds1da177e2005-04-16 15:20:36 -07003384 * off of preempt_enable. Kernel preemption off of a return from
 3385 * interrupt is handled separately, by preempt_schedule_irq() below.
3386 */
3387asmlinkage void __sched preempt_schedule(void)
3388{
3389 struct thread_info *ti = current_thread_info();
3390#ifdef CONFIG_PREEMPT_BKL
3391 struct task_struct *task = current;
3392 int saved_lock_depth;
3393#endif
3394 /*
3395 * If there is a non-zero preempt_count or interrupts are disabled,
3396 * we do not want to preempt the current task. Just return..
3397 */
Nick Pigginbeed33a2006-10-11 01:21:52 -07003398 if (likely(ti->preempt_count || irqs_disabled()))
Linus Torvalds1da177e2005-04-16 15:20:36 -07003399 return;
3400
3401need_resched:
3402 add_preempt_count(PREEMPT_ACTIVE);
3403 /*
3404 * We keep the big kernel semaphore locked, but we
 3405 * clear ->lock_depth so that schedule() doesn't
3406 * auto-release the semaphore:
3407 */
3408#ifdef CONFIG_PREEMPT_BKL
3409 saved_lock_depth = task->lock_depth;
3410 task->lock_depth = -1;
3411#endif
3412 schedule();
3413#ifdef CONFIG_PREEMPT_BKL
3414 task->lock_depth = saved_lock_depth;
3415#endif
3416 sub_preempt_count(PREEMPT_ACTIVE);
3417
3418 /* we could miss a preemption opportunity between schedule and now */
3419 barrier();
3420 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3421 goto need_resched;
3422}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003423EXPORT_SYMBOL(preempt_schedule);
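/*
 * How preempt_schedule() is normally reached: roughly what
 * include/linux/preempt.h expands preempt_enable() into (see that header
 * for the authoritative definition):
 *
 *	#define preempt_check_resched() \
 *	do { \
 *		if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) \
 *			preempt_schedule(); \
 *	} while (0)
 *
 * so the final preempt_enable() on a cpu with a pending reschedule is what
 * lands here.
 */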
3424
3425/*
Andreas Mohr2ed6e342006-07-10 04:43:52 -07003426 * This is the entry point to schedule() from kernel preemption
Linus Torvalds1da177e2005-04-16 15:20:36 -07003427 * off of irq context.
 3428 * Note that this is called and returns with irqs disabled. This
 3429 * protects us against recursive calls from irq context.
3430 */
3431asmlinkage void __sched preempt_schedule_irq(void)
3432{
3433 struct thread_info *ti = current_thread_info();
3434#ifdef CONFIG_PREEMPT_BKL
3435 struct task_struct *task = current;
3436 int saved_lock_depth;
3437#endif
Andreas Mohr2ed6e342006-07-10 04:43:52 -07003438 /* Catch callers which need to be fixed */
Linus Torvalds1da177e2005-04-16 15:20:36 -07003439 BUG_ON(ti->preempt_count || !irqs_disabled());
3440
3441need_resched:
3442 add_preempt_count(PREEMPT_ACTIVE);
3443 /*
3444 * We keep the big kernel semaphore locked, but we
 3445 * clear ->lock_depth so that schedule() doesn't
3446 * auto-release the semaphore:
3447 */
3448#ifdef CONFIG_PREEMPT_BKL
3449 saved_lock_depth = task->lock_depth;
3450 task->lock_depth = -1;
3451#endif
3452 local_irq_enable();
3453 schedule();
3454 local_irq_disable();
3455#ifdef CONFIG_PREEMPT_BKL
3456 task->lock_depth = saved_lock_depth;
3457#endif
3458 sub_preempt_count(PREEMPT_ACTIVE);
3459
3460 /* we could miss a preemption opportunity between schedule and now */
3461 barrier();
3462 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3463 goto need_resched;
3464}
3465
3466#endif /* CONFIG_PREEMPT */
3467
Ingo Molnar95cdf3b2005-09-10 00:26:11 -07003468int default_wake_function(wait_queue_t *curr, unsigned mode, int sync,
3469 void *key)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003470{
Ingo Molnar48f24c42006-07-03 00:25:40 -07003471 return try_to_wake_up(curr->private, mode, sync);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003472}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003473EXPORT_SYMBOL(default_wake_function);
3474
3475/*
3476 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
3477 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
3478 * number) then we wake all the non-exclusive tasks and one exclusive task.
3479 *
3480 * There are circumstances in which we can try to wake a task which has already
3481 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
3482 * zero in this (rare) case, and we handle it by continuing to scan the queue.
3483 */
3484static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
3485 int nr_exclusive, int sync, void *key)
3486{
3487 struct list_head *tmp, *next;
3488
3489 list_for_each_safe(tmp, next, &q->task_list) {
Ingo Molnar48f24c42006-07-03 00:25:40 -07003490 wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list);
3491 unsigned flags = curr->flags;
3492
Linus Torvalds1da177e2005-04-16 15:20:36 -07003493 if (curr->func(curr, mode, sync, key) &&
Ingo Molnar48f24c42006-07-03 00:25:40 -07003494 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003495 break;
3496 }
3497}
3498
3499/**
3500 * __wake_up - wake up threads blocked on a waitqueue.
3501 * @q: the waitqueue
3502 * @mode: which threads
3503 * @nr_exclusive: how many wake-one or wake-many threads to wake up
Martin Waitz67be2dd2005-05-01 08:59:26 -07003504 * @key: is directly passed to the wakeup function
Linus Torvalds1da177e2005-04-16 15:20:36 -07003505 */
3506void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode,
Ingo Molnar95cdf3b2005-09-10 00:26:11 -07003507 int nr_exclusive, void *key)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003508{
3509 unsigned long flags;
3510
3511 spin_lock_irqsave(&q->lock, flags);
3512 __wake_up_common(q, mode, nr_exclusive, 0, key);
3513 spin_unlock_irqrestore(&q->lock, flags);
3514}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003515EXPORT_SYMBOL(__wake_up);
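/*
 * A minimal usage sketch ('my_wq' and 'my_cond' are made-up names):
 *
 *	static DECLARE_WAIT_QUEUE_HEAD(my_wq);
 *	static int my_cond;
 *
 *	sleeper:
 *		if (wait_event_interruptible(my_wq, my_cond != 0))
 *			return -ERESTARTSYS;
 *
 *	waker:
 *		my_cond = 1;
 *		wake_up(&my_wq);	which resolves to __wake_up() above
 *
 * wake_up() is the nr_exclusive == 1 form: every non-exclusive waiter plus
 * at most one WQ_FLAG_EXCLUSIVE waiter is woken.
 */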
3516
3517/*
3518 * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
3519 */
3520void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
3521{
3522 __wake_up_common(q, mode, 1, 0, NULL);
3523}
3524
3525/**
Martin Waitz67be2dd2005-05-01 08:59:26 -07003526 * __wake_up_sync - wake up threads blocked on a waitqueue.
Linus Torvalds1da177e2005-04-16 15:20:36 -07003527 * @q: the waitqueue
3528 * @mode: which threads
3529 * @nr_exclusive: how many wake-one or wake-many threads to wake up
3530 *
 3531 * The sync wakeup differs in that the waker knows that it will schedule
3532 * away soon, so while the target thread will be woken up, it will not
3533 * be migrated to another CPU - ie. the two threads are 'synchronized'
3534 * with each other. This can prevent needless bouncing between CPUs.
3535 *
3536 * On UP it can prevent extra preemption.
3537 */
Ingo Molnar95cdf3b2005-09-10 00:26:11 -07003538void fastcall
3539__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003540{
3541 unsigned long flags;
3542 int sync = 1;
3543
3544 if (unlikely(!q))
3545 return;
3546
3547 if (unlikely(!nr_exclusive))
3548 sync = 0;
3549
3550 spin_lock_irqsave(&q->lock, flags);
3551 __wake_up_common(q, mode, nr_exclusive, sync, NULL);
3552 spin_unlock_irqrestore(&q->lock, flags);
3553}
3554EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
3555
3556void fastcall complete(struct completion *x)
3557{
3558 unsigned long flags;
3559
3560 spin_lock_irqsave(&x->wait.lock, flags);
3561 x->done++;
3562 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
3563 1, 0, NULL);
3564 spin_unlock_irqrestore(&x->wait.lock, flags);
3565}
3566EXPORT_SYMBOL(complete);
3567
3568void fastcall complete_all(struct completion *x)
3569{
3570 unsigned long flags;
3571
3572 spin_lock_irqsave(&x->wait.lock, flags);
3573 x->done += UINT_MAX/2;
3574 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
3575 0, 0, NULL);
3576 spin_unlock_irqrestore(&x->wait.lock, flags);
3577}
3578EXPORT_SYMBOL(complete_all);
3579
3580void fastcall __sched wait_for_completion(struct completion *x)
3581{
3582 might_sleep();
Ingo Molnar48f24c42006-07-03 00:25:40 -07003583
Linus Torvalds1da177e2005-04-16 15:20:36 -07003584 spin_lock_irq(&x->wait.lock);
3585 if (!x->done) {
3586 DECLARE_WAITQUEUE(wait, current);
3587
3588 wait.flags |= WQ_FLAG_EXCLUSIVE;
3589 __add_wait_queue_tail(&x->wait, &wait);
3590 do {
3591 __set_current_state(TASK_UNINTERRUPTIBLE);
3592 spin_unlock_irq(&x->wait.lock);
3593 schedule();
3594 spin_lock_irq(&x->wait.lock);
3595 } while (!x->done);
3596 __remove_wait_queue(&x->wait, &wait);
3597 }
3598 x->done--;
3599 spin_unlock_irq(&x->wait.lock);
3600}
3601EXPORT_SYMBOL(wait_for_completion);
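/*
 * A minimal usage sketch ('my_done' is a made-up name):
 *
 *	static DECLARE_COMPLETION(my_done);
 *
 *	waiting side:
 *		wait_for_completion(&my_done);	sleeps TASK_UNINTERRUPTIBLE
 *
 *	completing side (another thread or an interrupt handler):
 *		complete(&my_done);		wakes exactly one waiter
 *
 * Because ->done is a counter, a complete() issued before the matching
 * wait_for_completion() is not lost.
 */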
3602
3603unsigned long fastcall __sched
3604wait_for_completion_timeout(struct completion *x, unsigned long timeout)
3605{
3606 might_sleep();
3607
3608 spin_lock_irq(&x->wait.lock);
3609 if (!x->done) {
3610 DECLARE_WAITQUEUE(wait, current);
3611
3612 wait.flags |= WQ_FLAG_EXCLUSIVE;
3613 __add_wait_queue_tail(&x->wait, &wait);
3614 do {
3615 __set_current_state(TASK_UNINTERRUPTIBLE);
3616 spin_unlock_irq(&x->wait.lock);
3617 timeout = schedule_timeout(timeout);
3618 spin_lock_irq(&x->wait.lock);
3619 if (!timeout) {
3620 __remove_wait_queue(&x->wait, &wait);
3621 goto out;
3622 }
3623 } while (!x->done);
3624 __remove_wait_queue(&x->wait, &wait);
3625 }
3626 x->done--;
3627out:
3628 spin_unlock_irq(&x->wait.lock);
3629 return timeout;
3630}
3631EXPORT_SYMBOL(wait_for_completion_timeout);
3632
3633int fastcall __sched wait_for_completion_interruptible(struct completion *x)
3634{
3635 int ret = 0;
3636
3637 might_sleep();
3638
3639 spin_lock_irq(&x->wait.lock);
3640 if (!x->done) {
3641 DECLARE_WAITQUEUE(wait, current);
3642
3643 wait.flags |= WQ_FLAG_EXCLUSIVE;
3644 __add_wait_queue_tail(&x->wait, &wait);
3645 do {
3646 if (signal_pending(current)) {
3647 ret = -ERESTARTSYS;
3648 __remove_wait_queue(&x->wait, &wait);
3649 goto out;
3650 }
3651 __set_current_state(TASK_INTERRUPTIBLE);
3652 spin_unlock_irq(&x->wait.lock);
3653 schedule();
3654 spin_lock_irq(&x->wait.lock);
3655 } while (!x->done);
3656 __remove_wait_queue(&x->wait, &wait);
3657 }
3658 x->done--;
3659out:
3660 spin_unlock_irq(&x->wait.lock);
3661
3662 return ret;
3663}
3664EXPORT_SYMBOL(wait_for_completion_interruptible);
3665
3666unsigned long fastcall __sched
3667wait_for_completion_interruptible_timeout(struct completion *x,
3668 unsigned long timeout)
3669{
3670 might_sleep();
3671
3672 spin_lock_irq(&x->wait.lock);
3673 if (!x->done) {
3674 DECLARE_WAITQUEUE(wait, current);
3675
3676 wait.flags |= WQ_FLAG_EXCLUSIVE;
3677 __add_wait_queue_tail(&x->wait, &wait);
3678 do {
3679 if (signal_pending(current)) {
3680 timeout = -ERESTARTSYS;
3681 __remove_wait_queue(&x->wait, &wait);
3682 goto out;
3683 }
3684 __set_current_state(TASK_INTERRUPTIBLE);
3685 spin_unlock_irq(&x->wait.lock);
3686 timeout = schedule_timeout(timeout);
3687 spin_lock_irq(&x->wait.lock);
3688 if (!timeout) {
3689 __remove_wait_queue(&x->wait, &wait);
3690 goto out;
3691 }
3692 } while (!x->done);
3693 __remove_wait_queue(&x->wait, &wait);
3694 }
3695 x->done--;
3696out:
3697 spin_unlock_irq(&x->wait.lock);
3698 return timeout;
3699}
3700EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
3701
Ingo Molnar0fec1712007-07-09 18:52:01 +02003702static inline void
3703sleep_on_head(wait_queue_head_t *q, wait_queue_t *wait, unsigned long *flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003704{
Ingo Molnar0fec1712007-07-09 18:52:01 +02003705 spin_lock_irqsave(&q->lock, *flags);
3706 __add_wait_queue(q, wait);
3707 spin_unlock(&q->lock);
3708}
3709
3710static inline void
3711sleep_on_tail(wait_queue_head_t *q, wait_queue_t *wait, unsigned long *flags)
3712{
3713 spin_lock_irq(&q->lock);
3714 __remove_wait_queue(q, wait);
3715 spin_unlock_irqrestore(&q->lock, *flags);
3716}
3717
3718void __sched interruptible_sleep_on(wait_queue_head_t *q)
3719{
3720 unsigned long flags;
3721 wait_queue_t wait;
3722
3723 init_waitqueue_entry(&wait, current);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003724
3725 current->state = TASK_INTERRUPTIBLE;
3726
Ingo Molnar0fec1712007-07-09 18:52:01 +02003727 sleep_on_head(q, &wait, &flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003728 schedule();
Ingo Molnar0fec1712007-07-09 18:52:01 +02003729 sleep_on_tail(q, &wait, &flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003730}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003731EXPORT_SYMBOL(interruptible_sleep_on);
3732
Ingo Molnar0fec1712007-07-09 18:52:01 +02003733long __sched
Ingo Molnar95cdf3b2005-09-10 00:26:11 -07003734interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003735{
Ingo Molnar0fec1712007-07-09 18:52:01 +02003736 unsigned long flags;
3737 wait_queue_t wait;
3738
3739 init_waitqueue_entry(&wait, current);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003740
3741 current->state = TASK_INTERRUPTIBLE;
3742
Ingo Molnar0fec1712007-07-09 18:52:01 +02003743 sleep_on_head(q, &wait, &flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003744 timeout = schedule_timeout(timeout);
Ingo Molnar0fec1712007-07-09 18:52:01 +02003745 sleep_on_tail(q, &wait, &flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003746
3747 return timeout;
3748}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003749EXPORT_SYMBOL(interruptible_sleep_on_timeout);
3750
Ingo Molnar0fec1712007-07-09 18:52:01 +02003751void __sched sleep_on(wait_queue_head_t *q)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003752{
Ingo Molnar0fec1712007-07-09 18:52:01 +02003753 unsigned long flags;
3754 wait_queue_t wait;
3755
3756 init_waitqueue_entry(&wait, current);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003757
3758 current->state = TASK_UNINTERRUPTIBLE;
3759
Ingo Molnar0fec1712007-07-09 18:52:01 +02003760 sleep_on_head(q, &wait, &flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003761 schedule();
Ingo Molnar0fec1712007-07-09 18:52:01 +02003762 sleep_on_tail(q, &wait, &flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003763}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003764EXPORT_SYMBOL(sleep_on);
3765
Ingo Molnar0fec1712007-07-09 18:52:01 +02003766long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003767{
Ingo Molnar0fec1712007-07-09 18:52:01 +02003768 unsigned long flags;
3769 wait_queue_t wait;
3770
3771 init_waitqueue_entry(&wait, current);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003772
3773 current->state = TASK_UNINTERRUPTIBLE;
3774
Ingo Molnar0fec1712007-07-09 18:52:01 +02003775 sleep_on_head(q, &wait, &flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003776 timeout = schedule_timeout(timeout);
Ingo Molnar0fec1712007-07-09 18:52:01 +02003777 sleep_on_tail(q, &wait, &flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003778
3779 return timeout;
3780}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003781EXPORT_SYMBOL(sleep_on_timeout);
3782
Ingo Molnarb29739f2006-06-27 02:54:51 -07003783#ifdef CONFIG_RT_MUTEXES
3784
3785/*
3786 * rt_mutex_setprio - set the current priority of a task
3787 * @p: task
3788 * @prio: prio value (kernel-internal form)
3789 *
3790 * This function changes the 'effective' priority of a task. It does
3791 * not touch ->normal_prio like __setscheduler().
3792 *
3793 * Used by the rt_mutex code to implement priority inheritance logic.
3794 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07003795void rt_mutex_setprio(struct task_struct *p, int prio)
Ingo Molnarb29739f2006-06-27 02:54:51 -07003796{
3797 unsigned long flags;
Ingo Molnardd41f592007-07-09 18:51:59 +02003798 int oldprio, on_rq;
Ingo Molnar70b97a72006-07-03 00:25:42 -07003799 struct rq *rq;
Ingo Molnardd41f592007-07-09 18:51:59 +02003800 u64 now;
Ingo Molnarb29739f2006-06-27 02:54:51 -07003801
3802 BUG_ON(prio < 0 || prio > MAX_PRIO);
3803
3804 rq = task_rq_lock(p, &flags);
Ingo Molnardd41f592007-07-09 18:51:59 +02003805 now = rq_clock(rq);
Ingo Molnarb29739f2006-06-27 02:54:51 -07003806
Andrew Mortond5f9f942007-05-08 20:27:06 -07003807 oldprio = p->prio;
Ingo Molnardd41f592007-07-09 18:51:59 +02003808 on_rq = p->se.on_rq;
3809 if (on_rq)
3810 dequeue_task(rq, p, 0, now);
3811
3812 if (rt_prio(prio))
3813 p->sched_class = &rt_sched_class;
3814 else
3815 p->sched_class = &fair_sched_class;
3816
Ingo Molnarb29739f2006-06-27 02:54:51 -07003817 p->prio = prio;
3818
Ingo Molnardd41f592007-07-09 18:51:59 +02003819 if (on_rq) {
3820 enqueue_task(rq, p, 0, now);
Ingo Molnarb29739f2006-06-27 02:54:51 -07003821 /*
3822 * Reschedule if we are currently running on this runqueue and
Andrew Mortond5f9f942007-05-08 20:27:06 -07003823 * our priority decreased, or if we are not currently running on
3824 * this runqueue and our priority is higher than the current's
Ingo Molnarb29739f2006-06-27 02:54:51 -07003825 */
Andrew Mortond5f9f942007-05-08 20:27:06 -07003826 if (task_running(rq, p)) {
3827 if (p->prio > oldprio)
3828 resched_task(rq->curr);
Ingo Molnardd41f592007-07-09 18:51:59 +02003829 } else {
3830 check_preempt_curr(rq, p);
3831 }
Ingo Molnarb29739f2006-06-27 02:54:51 -07003832 }
3833 task_rq_unlock(rq, &flags);
3834}
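/*
 * Worked example with illustrative values: a nice-0 task A (prio 120)
 * holds an rt_mutex, and an RT task B with rt_priority 50
 * (prio = MAX_RT_PRIO-1 - 50 = 49) blocks on it.  The rt_mutex code then
 * calls rt_mutex_setprio(A, 49): A is requeued under the RT sched class
 * with effective prio 49 until it drops the lock, at which point another
 * rt_mutex_setprio() call restores its normal_prio.
 */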
3835
3836#endif
3837
Ingo Molnar36c8b582006-07-03 00:25:41 -07003838void set_user_nice(struct task_struct *p, long nice)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003839{
Ingo Molnardd41f592007-07-09 18:51:59 +02003840 int old_prio, delta, on_rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003841 unsigned long flags;
Ingo Molnar70b97a72006-07-03 00:25:42 -07003842 struct rq *rq;
Ingo Molnardd41f592007-07-09 18:51:59 +02003843 u64 now;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003844
3845 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
3846 return;
3847 /*
3848 * We have to be careful, if called from sys_setpriority(),
3849 * the task might be in the middle of scheduling on another CPU.
3850 */
3851 rq = task_rq_lock(p, &flags);
Ingo Molnardd41f592007-07-09 18:51:59 +02003852 now = rq_clock(rq);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003853 /*
3854 * The RT priorities are set via sched_setscheduler(), but we still
3855 * allow the 'normal' nice value to be set - but as expected
 3856 * it won't have any effect on scheduling while the task is
Ingo Molnardd41f592007-07-09 18:51:59 +02003857 * SCHED_FIFO/SCHED_RR:
Linus Torvalds1da177e2005-04-16 15:20:36 -07003858 */
Ingo Molnare05606d2007-07-09 18:51:59 +02003859 if (task_has_rt_policy(p)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003860 p->static_prio = NICE_TO_PRIO(nice);
3861 goto out_unlock;
3862 }
Ingo Molnardd41f592007-07-09 18:51:59 +02003863 on_rq = p->se.on_rq;
3864 if (on_rq) {
3865 dequeue_task(rq, p, 0, now);
3866 dec_load(rq, p, now);
Peter Williams2dd73a42006-06-27 02:54:34 -07003867 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003868
Linus Torvalds1da177e2005-04-16 15:20:36 -07003869 p->static_prio = NICE_TO_PRIO(nice);
Peter Williams2dd73a42006-06-27 02:54:34 -07003870 set_load_weight(p);
Ingo Molnarb29739f2006-06-27 02:54:51 -07003871 old_prio = p->prio;
3872 p->prio = effective_prio(p);
3873 delta = p->prio - old_prio;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003874
Ingo Molnardd41f592007-07-09 18:51:59 +02003875 if (on_rq) {
3876 enqueue_task(rq, p, 0, now);
3877 inc_load(rq, p, now);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003878 /*
Andrew Mortond5f9f942007-05-08 20:27:06 -07003879 * If the task increased its priority or is running and
3880 * lowered its priority, then reschedule its CPU:
Linus Torvalds1da177e2005-04-16 15:20:36 -07003881 */
Andrew Mortond5f9f942007-05-08 20:27:06 -07003882 if (delta < 0 || (delta > 0 && task_running(rq, p)))
Linus Torvalds1da177e2005-04-16 15:20:36 -07003883 resched_task(rq->curr);
3884 }
3885out_unlock:
3886 task_rq_unlock(rq, &flags);
3887}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003888EXPORT_SYMBOL(set_user_nice);
3889
Matt Mackalle43379f2005-05-01 08:59:00 -07003890/*
3891 * can_nice - check if a task can reduce its nice value
3892 * @p: task
3893 * @nice: nice value
3894 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07003895int can_nice(const struct task_struct *p, const int nice)
Matt Mackalle43379f2005-05-01 08:59:00 -07003896{
Matt Mackall024f4742005-08-18 11:24:19 -07003897 /* convert nice value [19,-20] to rlimit style value [1,40] */
3898 int nice_rlim = 20 - nice;
Ingo Molnar48f24c42006-07-03 00:25:40 -07003899
Matt Mackalle43379f2005-05-01 08:59:00 -07003900 return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur ||
3901 capable(CAP_SYS_NICE));
3902}
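/*
 * Worked example: for nice = -5 the rlimit-style value is 20 - (-5) = 25,
 * so an unprivileged task may lower its nice value to -5 only if
 * RLIMIT_NICE is at least 25 (or it has CAP_SYS_NICE).
 */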
3903
Linus Torvalds1da177e2005-04-16 15:20:36 -07003904#ifdef __ARCH_WANT_SYS_NICE
3905
3906/*
3907 * sys_nice - change the priority of the current process.
3908 * @increment: priority increment
3909 *
3910 * sys_setpriority is a more generic, but much slower function that
3911 * does similar things.
3912 */
3913asmlinkage long sys_nice(int increment)
3914{
Ingo Molnar48f24c42006-07-03 00:25:40 -07003915 long nice, retval;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003916
3917 /*
3918 * Setpriority might change our priority at the same moment.
3919 * We don't have to worry. Conceptually one call occurs first
3920 * and we have a single winner.
3921 */
Matt Mackalle43379f2005-05-01 08:59:00 -07003922 if (increment < -40)
3923 increment = -40;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003924 if (increment > 40)
3925 increment = 40;
3926
3927 nice = PRIO_TO_NICE(current->static_prio) + increment;
3928 if (nice < -20)
3929 nice = -20;
3930 if (nice > 19)
3931 nice = 19;
3932
Matt Mackalle43379f2005-05-01 08:59:00 -07003933 if (increment < 0 && !can_nice(current, nice))
3934 return -EPERM;
3935
Linus Torvalds1da177e2005-04-16 15:20:36 -07003936 retval = security_task_setnice(current, nice);
3937 if (retval)
3938 return retval;
3939
3940 set_user_nice(current, nice);
3941 return 0;
3942}
3943
3944#endif
3945
3946/**
3947 * task_prio - return the priority value of a given task.
3948 * @p: the task in question.
3949 *
3950 * This is the priority value as seen by users in /proc.
3951 * RT tasks are offset by -200. Normal tasks are centered
3952 * around 0, value goes from -16 to +15.
3953 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07003954int task_prio(const struct task_struct *p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003955{
3956 return p->prio - MAX_RT_PRIO;
3957}
3958
3959/**
3960 * task_nice - return the nice value of a given task.
3961 * @p: the task in question.
3962 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07003963int task_nice(const struct task_struct *p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003964{
3965 return TASK_NICE(p);
3966}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003967EXPORT_SYMBOL_GPL(task_nice);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003968
3969/**
3970 * idle_cpu - is a given cpu idle currently?
3971 * @cpu: the processor in question.
3972 */
3973int idle_cpu(int cpu)
3974{
3975 return cpu_curr(cpu) == cpu_rq(cpu)->idle;
3976}
3977
Linus Torvalds1da177e2005-04-16 15:20:36 -07003978/**
3979 * idle_task - return the idle task for a given cpu.
3980 * @cpu: the processor in question.
3981 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07003982struct task_struct *idle_task(int cpu)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003983{
3984 return cpu_rq(cpu)->idle;
3985}
3986
3987/**
3988 * find_process_by_pid - find a process with a matching PID value.
3989 * @pid: the pid in question.
3990 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07003991static inline struct task_struct *find_process_by_pid(pid_t pid)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003992{
3993 return pid ? find_task_by_pid(pid) : current;
3994}
3995
3996/* Actually do priority change: must hold rq lock. */
Ingo Molnardd41f592007-07-09 18:51:59 +02003997static void
3998__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003999{
Ingo Molnardd41f592007-07-09 18:51:59 +02004000 BUG_ON(p->se.on_rq);
Ingo Molnar48f24c42006-07-03 00:25:40 -07004001
Linus Torvalds1da177e2005-04-16 15:20:36 -07004002 p->policy = policy;
Ingo Molnardd41f592007-07-09 18:51:59 +02004003 switch (p->policy) {
4004 case SCHED_NORMAL:
4005 case SCHED_BATCH:
4006 case SCHED_IDLE:
4007 p->sched_class = &fair_sched_class;
4008 break;
4009 case SCHED_FIFO:
4010 case SCHED_RR:
4011 p->sched_class = &rt_sched_class;
4012 break;
4013 }
4014
Linus Torvalds1da177e2005-04-16 15:20:36 -07004015 p->rt_priority = prio;
Ingo Molnarb29739f2006-06-27 02:54:51 -07004016 p->normal_prio = normal_prio(p);
4017 /* we are holding p->pi_lock already */
4018 p->prio = rt_mutex_getprio(p);
Peter Williams2dd73a42006-06-27 02:54:34 -07004019 set_load_weight(p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004020}
4021
4022/**
Robert P. J. Day72fd4a32007-02-10 01:45:59 -08004023 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
Linus Torvalds1da177e2005-04-16 15:20:36 -07004024 * @p: the task in question.
4025 * @policy: new policy.
4026 * @param: structure containing the new RT priority.
Oleg Nesterov5fe1d752006-09-29 02:00:48 -07004027 *
Robert P. J. Day72fd4a32007-02-10 01:45:59 -08004028 * NOTE that the task may be already dead.
Linus Torvalds1da177e2005-04-16 15:20:36 -07004029 */
Ingo Molnar95cdf3b2005-09-10 00:26:11 -07004030int sched_setscheduler(struct task_struct *p, int policy,
4031 struct sched_param *param)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004032{
Ingo Molnardd41f592007-07-09 18:51:59 +02004033 int retval, oldprio, oldpolicy = -1, on_rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004034 unsigned long flags;
Ingo Molnar70b97a72006-07-03 00:25:42 -07004035 struct rq *rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004036
Steven Rostedt66e53932006-06-27 02:54:44 -07004037 /* may grab non-irq protected spin_locks */
4038 BUG_ON(in_interrupt());
Linus Torvalds1da177e2005-04-16 15:20:36 -07004039recheck:
4040 /* double check policy once rq lock held */
4041 if (policy < 0)
4042 policy = oldpolicy = p->policy;
4043 else if (policy != SCHED_FIFO && policy != SCHED_RR &&
Ingo Molnardd41f592007-07-09 18:51:59 +02004044 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
4045 policy != SCHED_IDLE)
Ingo Molnarb0a94992006-01-14 13:20:41 -08004046 return -EINVAL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004047 /*
4048 * Valid priorities for SCHED_FIFO and SCHED_RR are
Ingo Molnardd41f592007-07-09 18:51:59 +02004049 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
4050 * SCHED_BATCH and SCHED_IDLE is 0.
Linus Torvalds1da177e2005-04-16 15:20:36 -07004051 */
4052 if (param->sched_priority < 0 ||
Ingo Molnar95cdf3b2005-09-10 00:26:11 -07004053 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
Steven Rostedtd46523e2005-07-25 16:28:39 -04004054 (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
Linus Torvalds1da177e2005-04-16 15:20:36 -07004055 return -EINVAL;
Ingo Molnare05606d2007-07-09 18:51:59 +02004056 if (rt_policy(policy) != (param->sched_priority != 0))
Linus Torvalds1da177e2005-04-16 15:20:36 -07004057 return -EINVAL;
4058
Olivier Croquette37e4ab32005-06-25 14:57:32 -07004059 /*
4060 * Allow unprivileged RT tasks to decrease priority:
4061 */
4062 if (!capable(CAP_SYS_NICE)) {
Ingo Molnare05606d2007-07-09 18:51:59 +02004063 if (rt_policy(policy)) {
Oleg Nesterov8dc3e902006-09-29 02:00:50 -07004064 unsigned long rlim_rtprio;
Oleg Nesterov5fe1d752006-09-29 02:00:48 -07004065
Oleg Nesterov8dc3e902006-09-29 02:00:50 -07004066 if (!lock_task_sighand(p, &flags))
4067 return -ESRCH;
4068 rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur;
4069 unlock_task_sighand(p, &flags);
Oleg Nesterov5fe1d752006-09-29 02:00:48 -07004070
Oleg Nesterov8dc3e902006-09-29 02:00:50 -07004071 /* can't set/change the rt policy */
4072 if (policy != p->policy && !rlim_rtprio)
4073 return -EPERM;
4074
4075 /* can't increase priority */
4076 if (param->sched_priority > p->rt_priority &&
4077 param->sched_priority > rlim_rtprio)
4078 return -EPERM;
4079 }
Ingo Molnardd41f592007-07-09 18:51:59 +02004080 /*
 4081 * Like positive nice levels, don't allow tasks to
4082 * move out of SCHED_IDLE either:
4083 */
4084 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE)
4085 return -EPERM;
Oleg Nesterov8dc3e902006-09-29 02:00:50 -07004086
Olivier Croquette37e4ab32005-06-25 14:57:32 -07004087 /* can't change other user's priorities */
4088 if ((current->euid != p->euid) &&
4089 (current->euid != p->uid))
4090 return -EPERM;
4091 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07004092
4093 retval = security_task_setscheduler(p, policy, param);
4094 if (retval)
4095 return retval;
4096 /*
Ingo Molnarb29739f2006-06-27 02:54:51 -07004097 * make sure no PI-waiters arrive (or leave) while we are
4098 * changing the priority of the task:
4099 */
4100 spin_lock_irqsave(&p->pi_lock, flags);
4101 /*
Linus Torvalds1da177e2005-04-16 15:20:36 -07004102 * To be able to change p->policy safely, the apropriate
4103 * runqueue lock must be held.
4104 */
Ingo Molnarb29739f2006-06-27 02:54:51 -07004105 rq = __task_rq_lock(p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004106 /* recheck policy now with rq lock held */
4107 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
4108 policy = oldpolicy = -1;
Ingo Molnarb29739f2006-06-27 02:54:51 -07004109 __task_rq_unlock(rq);
4110 spin_unlock_irqrestore(&p->pi_lock, flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004111 goto recheck;
4112 }
Ingo Molnardd41f592007-07-09 18:51:59 +02004113 on_rq = p->se.on_rq;
4114 if (on_rq)
4115 deactivate_task(rq, p, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004116 oldprio = p->prio;
Ingo Molnardd41f592007-07-09 18:51:59 +02004117 __setscheduler(rq, p, policy, param->sched_priority);
4118 if (on_rq) {
4119 activate_task(rq, p, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004120 /*
4121 * Reschedule if we are currently running on this runqueue and
Andrew Mortond5f9f942007-05-08 20:27:06 -07004122 * our priority decreased, or if we are not currently running on
4123 * this runqueue and our priority is higher than the current's
Linus Torvalds1da177e2005-04-16 15:20:36 -07004124 */
Andrew Mortond5f9f942007-05-08 20:27:06 -07004125 if (task_running(rq, p)) {
4126 if (p->prio > oldprio)
4127 resched_task(rq->curr);
Ingo Molnardd41f592007-07-09 18:51:59 +02004128 } else {
4129 check_preempt_curr(rq, p);
4130 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07004131 }
Ingo Molnarb29739f2006-06-27 02:54:51 -07004132 __task_rq_unlock(rq);
4133 spin_unlock_irqrestore(&p->pi_lock, flags);
4134
Thomas Gleixner95e02ca2006-06-27 02:55:02 -07004135 rt_mutex_adjust_pi(p);
4136
Linus Torvalds1da177e2005-04-16 15:20:36 -07004137 return 0;
4138}
4139EXPORT_SYMBOL_GPL(sched_setscheduler);
4140
Ingo Molnar95cdf3b2005-09-10 00:26:11 -07004141static int
4142do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004143{
Linus Torvalds1da177e2005-04-16 15:20:36 -07004144 struct sched_param lparam;
4145 struct task_struct *p;
Ingo Molnar36c8b582006-07-03 00:25:41 -07004146 int retval;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004147
4148 if (!param || pid < 0)
4149 return -EINVAL;
4150 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
4151 return -EFAULT;
Oleg Nesterov5fe1d752006-09-29 02:00:48 -07004152
4153 rcu_read_lock();
4154 retval = -ESRCH;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004155 p = find_process_by_pid(pid);
Oleg Nesterov5fe1d752006-09-29 02:00:48 -07004156 if (p != NULL)
4157 retval = sched_setscheduler(p, policy, &lparam);
4158 rcu_read_unlock();
Ingo Molnar36c8b582006-07-03 00:25:41 -07004159
Linus Torvalds1da177e2005-04-16 15:20:36 -07004160 return retval;
4161}
4162
4163/**
4164 * sys_sched_setscheduler - set/change the scheduler policy and RT priority
4165 * @pid: the pid in question.
4166 * @policy: new policy.
4167 * @param: structure containing the new RT priority.
4168 */
4169asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
4170 struct sched_param __user *param)
4171{
Jason Baronc21761f2006-01-18 17:43:03 -08004172 /* negative values for policy are not valid */
4173 if (policy < 0)
4174 return -EINVAL;
4175
Linus Torvalds1da177e2005-04-16 15:20:36 -07004176 return do_sched_setscheduler(pid, policy, param);
4177}
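
/*
 * Illustrative userspace sketch (not part of this file): one way a program
 * might reach this syscall through the glibc sched_setscheduler() wrapper
 * to make itself SCHED_FIFO. The priority value 10 and the use of pid 0
 * ("the calling process") are arbitrary choices for the example;
 * CAP_SYS_NICE (or a suitable RLIMIT_RTPRIO) is required.
 */
#include <sched.h>
#include <stdio.h>

int main(void)
{
        struct sched_param sp = { .sched_priority = 10 };

        if (sched_setscheduler(0, SCHED_FIFO, &sp) != 0) {
                perror("sched_setscheduler");
                return 1;
        }
        printf("now SCHED_FIFO at priority %d\n", sp.sched_priority);
        return 0;
}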
4178
4179/**
4180 * sys_sched_setparam - set/change the RT priority of a thread
4181 * @pid: the pid in question.
4182 * @param: structure containing the new RT priority.
4183 */
4184asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
4185{
4186 return do_sched_setscheduler(pid, -1, param);
4187}
4188
4189/**
4190 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
4191 * @pid: the pid in question.
4192 */
4193asmlinkage long sys_sched_getscheduler(pid_t pid)
4194{
Ingo Molnar36c8b582006-07-03 00:25:41 -07004195 struct task_struct *p;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004196 int retval = -EINVAL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004197
4198 if (pid < 0)
4199 goto out_nounlock;
4200
4201 retval = -ESRCH;
4202 read_lock(&tasklist_lock);
4203 p = find_process_by_pid(pid);
4204 if (p) {
4205 retval = security_task_getscheduler(p);
4206 if (!retval)
4207 retval = p->policy;
4208 }
4209 read_unlock(&tasklist_lock);
4210
4211out_nounlock:
4212 return retval;
4213}
4214
4215/**
 4216 * sys_sched_getparam - get the RT priority of a thread
4217 * @pid: the pid in question.
4218 * @param: structure containing the RT priority.
4219 */
4220asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param)
4221{
4222 struct sched_param lp;
Ingo Molnar36c8b582006-07-03 00:25:41 -07004223 struct task_struct *p;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004224 int retval = -EINVAL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004225
4226 if (!param || pid < 0)
4227 goto out_nounlock;
4228
4229 read_lock(&tasklist_lock);
4230 p = find_process_by_pid(pid);
4231 retval = -ESRCH;
4232 if (!p)
4233 goto out_unlock;
4234
4235 retval = security_task_getscheduler(p);
4236 if (retval)
4237 goto out_unlock;
4238
4239 lp.sched_priority = p->rt_priority;
4240 read_unlock(&tasklist_lock);
4241
4242 /*
 4243	 * This one might sleep; we cannot do it with a spinlock held ...
4244 */
4245 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
4246
4247out_nounlock:
4248 return retval;
4249
4250out_unlock:
4251 read_unlock(&tasklist_lock);
4252 return retval;
4253}
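
/*
 * Illustrative userspace sketch (not part of this file): reading back the
 * scheduling policy and RT priority of the calling process through the
 * glibc wrappers for the two syscalls above.
 */
#include <sched.h>
#include <stdio.h>

int main(void)
{
        struct sched_param sp;
        int policy = sched_getscheduler(0);

        if (policy < 0 || sched_getparam(0, &sp) != 0) {
                perror("sched_getscheduler/sched_getparam");
                return 1;
        }
        printf("policy=%d rt_priority=%d\n", policy, sp.sched_priority);
        return 0;
}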
4254
4255long sched_setaffinity(pid_t pid, cpumask_t new_mask)
4256{
Linus Torvalds1da177e2005-04-16 15:20:36 -07004257 cpumask_t cpus_allowed;
Ingo Molnar36c8b582006-07-03 00:25:41 -07004258 struct task_struct *p;
4259 int retval;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004260
Gautham R Shenoy5be93612007-05-09 02:34:04 -07004261 mutex_lock(&sched_hotcpu_mutex);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004262 read_lock(&tasklist_lock);
4263
4264 p = find_process_by_pid(pid);
4265 if (!p) {
4266 read_unlock(&tasklist_lock);
Gautham R Shenoy5be93612007-05-09 02:34:04 -07004267 mutex_unlock(&sched_hotcpu_mutex);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004268 return -ESRCH;
4269 }
4270
4271 /*
4272 * It is not safe to call set_cpus_allowed with the
4273 * tasklist_lock held. We will bump the task_struct's
4274 * usage count and then drop tasklist_lock.
4275 */
4276 get_task_struct(p);
4277 read_unlock(&tasklist_lock);
4278
4279 retval = -EPERM;
4280 if ((current->euid != p->euid) && (current->euid != p->uid) &&
4281 !capable(CAP_SYS_NICE))
4282 goto out_unlock;
4283
David Quigleye7834f82006-06-23 02:03:59 -07004284 retval = security_task_setscheduler(p, 0, NULL);
4285 if (retval)
4286 goto out_unlock;
4287
Linus Torvalds1da177e2005-04-16 15:20:36 -07004288 cpus_allowed = cpuset_cpus_allowed(p);
4289 cpus_and(new_mask, new_mask, cpus_allowed);
4290 retval = set_cpus_allowed(p, new_mask);
4291
4292out_unlock:
4293 put_task_struct(p);
Gautham R Shenoy5be93612007-05-09 02:34:04 -07004294 mutex_unlock(&sched_hotcpu_mutex);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004295 return retval;
4296}
4297
4298static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
4299 cpumask_t *new_mask)
4300{
4301 if (len < sizeof(cpumask_t)) {
4302 memset(new_mask, 0, sizeof(cpumask_t));
4303 } else if (len > sizeof(cpumask_t)) {
4304 len = sizeof(cpumask_t);
4305 }
4306 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
4307}
4308
4309/**
4310 * sys_sched_setaffinity - set the cpu affinity of a process
4311 * @pid: pid of the process
4312 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4313 * @user_mask_ptr: user-space pointer to the new cpu mask
4314 */
4315asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
4316 unsigned long __user *user_mask_ptr)
4317{
4318 cpumask_t new_mask;
4319 int retval;
4320
4321 retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask);
4322 if (retval)
4323 return retval;
4324
4325 return sched_setaffinity(pid, new_mask);
4326}
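
/*
 * Illustrative userspace sketch (not part of this file): pinning the
 * calling process to CPU 0 via the glibc sched_setaffinity() wrapper
 * (which needs _GNU_SOURCE). The choice of CPU 0 is arbitrary.
 */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
        cpu_set_t mask;

        CPU_ZERO(&mask);
        CPU_SET(0, &mask);

        if (sched_setaffinity(0, sizeof(mask), &mask) != 0) {
                perror("sched_setaffinity");
                return 1;
        }
        return 0;
}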
4327
4328/*
 4329 * Represents all CPUs present in the system.
 4330 * In systems capable of hotplug, this map can grow dynamically
 4331 * as new CPUs are detected in the system via any platform-specific
 4332 * method, e.g. ACPI.
4333 */
4334
Andi Kleen4cef0c62006-01-11 22:44:57 +01004335cpumask_t cpu_present_map __read_mostly;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004336EXPORT_SYMBOL(cpu_present_map);
4337
4338#ifndef CONFIG_SMP
Andi Kleen4cef0c62006-01-11 22:44:57 +01004339cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
Greg Bankse16b38f2006-10-02 02:17:40 -07004340EXPORT_SYMBOL(cpu_online_map);
4341
Andi Kleen4cef0c62006-01-11 22:44:57 +01004342cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
Greg Bankse16b38f2006-10-02 02:17:40 -07004343EXPORT_SYMBOL(cpu_possible_map);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004344#endif
4345
4346long sched_getaffinity(pid_t pid, cpumask_t *mask)
4347{
Ingo Molnar36c8b582006-07-03 00:25:41 -07004348 struct task_struct *p;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004349 int retval;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004350
Gautham R Shenoy5be93612007-05-09 02:34:04 -07004351 mutex_lock(&sched_hotcpu_mutex);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004352 read_lock(&tasklist_lock);
4353
4354 retval = -ESRCH;
4355 p = find_process_by_pid(pid);
4356 if (!p)
4357 goto out_unlock;
4358
David Quigleye7834f82006-06-23 02:03:59 -07004359 retval = security_task_getscheduler(p);
4360 if (retval)
4361 goto out_unlock;
4362
Jack Steiner2f7016d2006-02-01 03:05:18 -08004363 cpus_and(*mask, p->cpus_allowed, cpu_online_map);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004364
4365out_unlock:
4366 read_unlock(&tasklist_lock);
Gautham R Shenoy5be93612007-05-09 02:34:04 -07004367 mutex_unlock(&sched_hotcpu_mutex);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004368 if (retval)
4369 return retval;
4370
4371 return 0;
4372}
4373
4374/**
4375 * sys_sched_getaffinity - get the cpu affinity of a process
4376 * @pid: pid of the process
4377 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4378 * @user_mask_ptr: user-space pointer to hold the current cpu mask
4379 */
4380asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
4381 unsigned long __user *user_mask_ptr)
4382{
4383 int ret;
4384 cpumask_t mask;
4385
4386 if (len < sizeof(cpumask_t))
4387 return -EINVAL;
4388
4389 ret = sched_getaffinity(pid, &mask);
4390 if (ret < 0)
4391 return ret;
4392
4393 if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t)))
4394 return -EFAULT;
4395
4396 return sizeof(cpumask_t);
4397}
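
/*
 * Illustrative userspace sketch (not part of this file): reading the
 * affinity mask of the calling process back with the glibc
 * sched_getaffinity() wrapper and listing the allowed CPUs.
 */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
        cpu_set_t mask;
        int cpu;

        if (sched_getaffinity(0, sizeof(mask), &mask) != 0) {
                perror("sched_getaffinity");
                return 1;
        }
        for (cpu = 0; cpu < CPU_SETSIZE; cpu++)
                if (CPU_ISSET(cpu, &mask))
                        printf("allowed on cpu %d\n", cpu);
        return 0;
}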
4398
4399/**
4400 * sys_sched_yield - yield the current processor to other threads.
4401 *
Ingo Molnardd41f592007-07-09 18:51:59 +02004402 * This function yields the current CPU to other tasks. If there are no
4403 * other threads running on this CPU then this function will return.
Linus Torvalds1da177e2005-04-16 15:20:36 -07004404 */
4405asmlinkage long sys_sched_yield(void)
4406{
Ingo Molnar70b97a72006-07-03 00:25:42 -07004407 struct rq *rq = this_rq_lock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07004408
4409 schedstat_inc(rq, yld_cnt);
Ingo Molnardd41f592007-07-09 18:51:59 +02004410 if (unlikely(rq->nr_running == 1))
Linus Torvalds1da177e2005-04-16 15:20:36 -07004411 schedstat_inc(rq, yld_act_empty);
Ingo Molnardd41f592007-07-09 18:51:59 +02004412 else
4413 current->sched_class->yield_task(rq, current);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004414
4415 /*
4416 * Since we are going to call schedule() anyway, there's
4417 * no need to preempt or enable interrupts:
4418 */
4419 __release(rq->lock);
Ingo Molnar8a25d5d2006-07-03 00:24:54 -07004420 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004421 _raw_spin_unlock(&rq->lock);
4422 preempt_enable_no_resched();
4423
4424 schedule();
4425
4426 return 0;
4427}
4428
Andrew Mortone7b38402006-06-30 01:56:00 -07004429static void __cond_resched(void)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004430{
Ingo Molnar8e0a43d2006-06-23 02:05:23 -07004431#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
4432 __might_sleep(__FILE__, __LINE__);
4433#endif
Ingo Molnar5bbcfd92005-07-07 17:57:04 -07004434 /*
4435 * The BKS might be reacquired before we have dropped
4436 * PREEMPT_ACTIVE, which could trigger a second
4437 * cond_resched() call.
4438 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07004439 do {
4440 add_preempt_count(PREEMPT_ACTIVE);
4441 schedule();
4442 sub_preempt_count(PREEMPT_ACTIVE);
4443 } while (need_resched());
4444}
4445
4446int __sched cond_resched(void)
4447{
Ingo Molnar94142322006-12-29 16:48:13 -08004448 if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) &&
4449 system_state == SYSTEM_RUNNING) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07004450 __cond_resched();
4451 return 1;
4452 }
4453 return 0;
4454}
Linus Torvalds1da177e2005-04-16 15:20:36 -07004455EXPORT_SYMBOL(cond_resched);
4456
4457/*
4458 * cond_resched_lock() - if a reschedule is pending, drop the given lock,
4459 * call schedule, and on return reacquire the lock.
4460 *
4461 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
4462 * operations here to prevent schedule() from being called twice (once via
4463 * spin_unlock(), once by hand).
4464 */
Ingo Molnar95cdf3b2005-09-10 00:26:11 -07004465int cond_resched_lock(spinlock_t *lock)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004466{
Jan Kara6df3cec2005-06-13 15:52:32 -07004467 int ret = 0;
4468
Linus Torvalds1da177e2005-04-16 15:20:36 -07004469 if (need_lockbreak(lock)) {
4470 spin_unlock(lock);
4471 cpu_relax();
Jan Kara6df3cec2005-06-13 15:52:32 -07004472 ret = 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004473 spin_lock(lock);
4474 }
Ingo Molnar94142322006-12-29 16:48:13 -08004475 if (need_resched() && system_state == SYSTEM_RUNNING) {
Ingo Molnar8a25d5d2006-07-03 00:24:54 -07004476 spin_release(&lock->dep_map, 1, _THIS_IP_);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004477 _raw_spin_unlock(lock);
4478 preempt_enable_no_resched();
4479 __cond_resched();
Jan Kara6df3cec2005-06-13 15:52:32 -07004480 ret = 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004481 spin_lock(lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004482 }
Jan Kara6df3cec2005-06-13 15:52:32 -07004483 return ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004484}
Linus Torvalds1da177e2005-04-16 15:20:36 -07004485EXPORT_SYMBOL(cond_resched_lock);
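
/*
 * Hypothetical in-kernel sketch (not code from this file) of the usage
 * pattern described above: a long walk under a spinlock that periodically
 * lets other tasks (and lock waiters) in via cond_resched_lock(). All
 * 'example_*' names are made up for illustration only.
 */
static DEFINE_SPINLOCK(example_lock);
static LIST_HEAD(example_list);

struct example_item {
        struct list_head node;
};

static void example_walk(void)
{
        struct example_item *item;

        spin_lock(&example_lock);
        list_for_each_entry(item, &example_list, node) {
                /* ... process 'item' here ... */

                /*
                 * If a reschedule (or lock break) is due, this drops
                 * example_lock, schedules, and retakes the lock. A nonzero
                 * return means the list may have changed under us, so bail
                 * out of the now-stale walk.
                 */
                if (cond_resched_lock(&example_lock))
                        break;
        }
        spin_unlock(&example_lock);
}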
4486
4487int __sched cond_resched_softirq(void)
4488{
4489 BUG_ON(!in_softirq());
4490
Ingo Molnar94142322006-12-29 16:48:13 -08004491 if (need_resched() && system_state == SYSTEM_RUNNING) {
Thomas Gleixner98d82562007-05-23 13:58:18 -07004492 local_bh_enable();
Linus Torvalds1da177e2005-04-16 15:20:36 -07004493 __cond_resched();
4494 local_bh_disable();
4495 return 1;
4496 }
4497 return 0;
4498}
Linus Torvalds1da177e2005-04-16 15:20:36 -07004499EXPORT_SYMBOL(cond_resched_softirq);
4500
Linus Torvalds1da177e2005-04-16 15:20:36 -07004501/**
4502 * yield - yield the current processor to other threads.
4503 *
Robert P. J. Day72fd4a32007-02-10 01:45:59 -08004504 * This is a shortcut for kernel-space yielding - it marks the
Linus Torvalds1da177e2005-04-16 15:20:36 -07004505 * thread runnable and calls sys_sched_yield().
4506 */
4507void __sched yield(void)
4508{
4509 set_current_state(TASK_RUNNING);
4510 sys_sched_yield();
4511}
Linus Torvalds1da177e2005-04-16 15:20:36 -07004512EXPORT_SYMBOL(yield);
4513
4514/*
4515 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
4516 * that process accounting knows that this is a task in IO wait state.
4517 *
4518 * But don't do that if it is a deliberate, throttling IO wait (this task
4519 * has set its backing_dev_info: the queue against which it should throttle)
4520 */
4521void __sched io_schedule(void)
4522{
Ingo Molnar70b97a72006-07-03 00:25:42 -07004523 struct rq *rq = &__raw_get_cpu_var(runqueues);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004524
Shailabh Nagar0ff92242006-07-14 00:24:37 -07004525 delayacct_blkio_start();
Linus Torvalds1da177e2005-04-16 15:20:36 -07004526 atomic_inc(&rq->nr_iowait);
4527 schedule();
4528 atomic_dec(&rq->nr_iowait);
Shailabh Nagar0ff92242006-07-14 00:24:37 -07004529 delayacct_blkio_end();
Linus Torvalds1da177e2005-04-16 15:20:36 -07004530}
Linus Torvalds1da177e2005-04-16 15:20:36 -07004531EXPORT_SYMBOL(io_schedule);
4532
4533long __sched io_schedule_timeout(long timeout)
4534{
Ingo Molnar70b97a72006-07-03 00:25:42 -07004535 struct rq *rq = &__raw_get_cpu_var(runqueues);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004536 long ret;
4537
Shailabh Nagar0ff92242006-07-14 00:24:37 -07004538 delayacct_blkio_start();
Linus Torvalds1da177e2005-04-16 15:20:36 -07004539 atomic_inc(&rq->nr_iowait);
4540 ret = schedule_timeout(timeout);
4541 atomic_dec(&rq->nr_iowait);
Shailabh Nagar0ff92242006-07-14 00:24:37 -07004542 delayacct_blkio_end();
Linus Torvalds1da177e2005-04-16 15:20:36 -07004543 return ret;
4544}
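
/*
 * Hypothetical in-kernel sketch (not code from this file) of the usual
 * io_schedule() pattern: sleep on a waitqueue while an I/O resource is
 * busy, so the sleep is accounted as iowait rather than a plain sleep.
 * The 'example_*' names and the completion flag are made up.
 */
static DECLARE_WAIT_QUEUE_HEAD(example_io_wq);
static int example_io_done;

static void example_wait_for_io(void)
{
        DEFINE_WAIT(wait);

        for (;;) {
                prepare_to_wait(&example_io_wq, &wait, TASK_UNINTERRUPTIBLE);
                if (example_io_done)
                        break;
                /* accounted in rq->nr_iowait, unlike a bare schedule() */
                io_schedule();
        }
        finish_wait(&example_io_wq, &wait);
}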
4545
4546/**
4547 * sys_sched_get_priority_max - return maximum RT priority.
4548 * @policy: scheduling class.
4549 *
4550 * this syscall returns the maximum rt_priority that can be used
4551 * by a given scheduling class.
4552 */
4553asmlinkage long sys_sched_get_priority_max(int policy)
4554{
4555 int ret = -EINVAL;
4556
4557 switch (policy) {
4558 case SCHED_FIFO:
4559 case SCHED_RR:
4560 ret = MAX_USER_RT_PRIO-1;
4561 break;
4562 case SCHED_NORMAL:
Ingo Molnarb0a94992006-01-14 13:20:41 -08004563 case SCHED_BATCH:
Ingo Molnardd41f592007-07-09 18:51:59 +02004564 case SCHED_IDLE:
Linus Torvalds1da177e2005-04-16 15:20:36 -07004565 ret = 0;
4566 break;
4567 }
4568 return ret;
4569}
4570
4571/**
4572 * sys_sched_get_priority_min - return minimum RT priority.
4573 * @policy: scheduling class.
4574 *
4575 * this syscall returns the minimum rt_priority that can be used
4576 * by a given scheduling class.
4577 */
4578asmlinkage long sys_sched_get_priority_min(int policy)
4579{
4580 int ret = -EINVAL;
4581
4582 switch (policy) {
4583 case SCHED_FIFO:
4584 case SCHED_RR:
4585 ret = 1;
4586 break;
4587 case SCHED_NORMAL:
Ingo Molnarb0a94992006-01-14 13:20:41 -08004588 case SCHED_BATCH:
Ingo Molnardd41f592007-07-09 18:51:59 +02004589 case SCHED_IDLE:
Linus Torvalds1da177e2005-04-16 15:20:36 -07004590 ret = 0;
4591 }
4592 return ret;
4593}
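
/*
 * Illustrative userspace sketch (not part of this file): querying the
 * valid static priority range for SCHED_FIFO before choosing a priority
 * to pass to sched_setscheduler().
 */
#include <sched.h>
#include <stdio.h>

int main(void)
{
        int min = sched_get_priority_min(SCHED_FIFO);
        int max = sched_get_priority_max(SCHED_FIFO);

        if (min < 0 || max < 0) {
                perror("sched_get_priority_min/max");
                return 1;
        }
        printf("SCHED_FIFO priority range: %d..%d\n", min, max);
        return 0;
}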
4594
4595/**
4596 * sys_sched_rr_get_interval - return the default timeslice of a process.
4597 * @pid: pid of the process.
4598 * @interval: userspace pointer to the timeslice value.
4599 *
4600 * this syscall writes the default timeslice value of a given process
4601 * into the user-space timespec buffer. A value of '0' means infinity.
4602 */
4603asmlinkage
4604long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
4605{
Ingo Molnar36c8b582006-07-03 00:25:41 -07004606 struct task_struct *p;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004607 int retval = -EINVAL;
4608 struct timespec t;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004609
4610 if (pid < 0)
4611 goto out_nounlock;
4612
4613 retval = -ESRCH;
4614 read_lock(&tasklist_lock);
4615 p = find_process_by_pid(pid);
4616 if (!p)
4617 goto out_unlock;
4618
4619 retval = security_task_getscheduler(p);
4620 if (retval)
4621 goto out_unlock;
4622
Peter Williamsb78709c2006-06-26 16:58:00 +10004623 jiffies_to_timespec(p->policy == SCHED_FIFO ?
Ingo Molnardd41f592007-07-09 18:51:59 +02004624 0 : static_prio_timeslice(p->static_prio), &t);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004625 read_unlock(&tasklist_lock);
4626 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
4627out_nounlock:
4628 return retval;
4629out_unlock:
4630 read_unlock(&tasklist_lock);
4631 return retval;
4632}
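
/*
 * Illustrative userspace sketch (not part of this file): reading the
 * timeslice of the calling process via the glibc wrapper. As documented
 * above, an all-zero timespec means an infinite timeslice.
 */
#include <sched.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
        struct timespec ts;

        if (sched_rr_get_interval(0, &ts) != 0) {
                perror("sched_rr_get_interval");
                return 1;
        }
        printf("timeslice: %ld.%09ld s\n", (long)ts.tv_sec, ts.tv_nsec);
        return 0;
}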
4633
Andreas Mohr2ed6e342006-07-10 04:43:52 -07004634static const char stat_nam[] = "RSDTtZX";
Ingo Molnar36c8b582006-07-03 00:25:41 -07004635
4636static void show_task(struct task_struct *p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004637{
Linus Torvalds1da177e2005-04-16 15:20:36 -07004638 unsigned long free = 0;
Ingo Molnar36c8b582006-07-03 00:25:41 -07004639 unsigned state;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004640
Linus Torvalds1da177e2005-04-16 15:20:36 -07004641 state = p->state ? __ffs(p->state) + 1 : 0;
Andreas Mohr2ed6e342006-07-10 04:43:52 -07004642 printk("%-13.13s %c", p->comm,
4643 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
Linus Torvalds1da177e2005-04-16 15:20:36 -07004644#if (BITS_PER_LONG == 32)
4645 if (state == TASK_RUNNING)
4646 printk(" running ");
4647 else
4648 printk(" %08lX ", thread_saved_pc(p));
4649#else
4650 if (state == TASK_RUNNING)
4651 printk(" running task ");
4652 else
4653 printk(" %016lx ", thread_saved_pc(p));
4654#endif
4655#ifdef CONFIG_DEBUG_STACK_USAGE
4656 {
Al Viro10ebffd2005-11-13 16:06:56 -08004657 unsigned long *n = end_of_stack(p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004658 while (!*n)
4659 n++;
Al Viro10ebffd2005-11-13 16:06:56 -08004660 free = (unsigned long)n - (unsigned long)end_of_stack(p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004661 }
4662#endif
Ingo Molnar35f6f752007-04-06 21:18:06 +02004663 printk("%5lu %5d %6d", free, p->pid, p->parent->pid);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004664 if (!p->mm)
4665 printk(" (L-TLB)\n");
4666 else
4667 printk(" (NOTLB)\n");
4668
4669 if (state != TASK_RUNNING)
4670 show_stack(p, NULL);
4671}
4672
Ingo Molnare59e2ae2006-12-06 20:35:59 -08004673void show_state_filter(unsigned long state_filter)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004674{
Ingo Molnar36c8b582006-07-03 00:25:41 -07004675 struct task_struct *g, *p;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004676
4677#if (BITS_PER_LONG == 32)
4678 printk("\n"
Chris Caputo301827a2006-12-06 20:39:11 -08004679 " free sibling\n");
4680 printk(" task PC stack pid father child younger older\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -07004681#else
4682 printk("\n"
Chris Caputo301827a2006-12-06 20:39:11 -08004683 " free sibling\n");
4684 printk(" task PC stack pid father child younger older\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -07004685#endif
4686 read_lock(&tasklist_lock);
4687 do_each_thread(g, p) {
4688 /*
 4689		 * reset the NMI-timeout, listing all tasks on a slow
 4690		 * console might take a lot of time:
4691 */
4692 touch_nmi_watchdog();
Ingo Molnar39bc89f2007-04-25 20:50:03 -07004693 if (!state_filter || (p->state & state_filter))
Ingo Molnare59e2ae2006-12-06 20:35:59 -08004694 show_task(p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004695 } while_each_thread(g, p);
4696
Jeremy Fitzhardinge04c91672007-05-08 00:28:05 -07004697 touch_all_softlockup_watchdogs();
4698
Ingo Molnardd41f592007-07-09 18:51:59 +02004699#ifdef CONFIG_SCHED_DEBUG
4700 sysrq_sched_debug_show();
4701#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07004702 read_unlock(&tasklist_lock);
Ingo Molnare59e2ae2006-12-06 20:35:59 -08004703 /*
4704 * Only show locks if all tasks are dumped:
4705 */
4706 if (state_filter == -1)
4707 debug_show_all_locks();
Linus Torvalds1da177e2005-04-16 15:20:36 -07004708}
4709
Ingo Molnar1df21052007-07-09 18:51:58 +02004710void __cpuinit init_idle_bootup_task(struct task_struct *idle)
4711{
Ingo Molnardd41f592007-07-09 18:51:59 +02004712 idle->sched_class = &idle_sched_class;
Ingo Molnar1df21052007-07-09 18:51:58 +02004713}
4714
Ingo Molnarf340c0d2005-06-28 16:40:42 +02004715/**
4716 * init_idle - set up an idle thread for a given CPU
4717 * @idle: task in question
4718 * @cpu: cpu the idle task belongs to
4719 *
4720 * NOTE: this function does not set the idle thread's NEED_RESCHED
4721 * flag, to make booting more robust.
4722 */
Nick Piggin5c1e1762006-10-03 01:14:04 -07004723void __cpuinit init_idle(struct task_struct *idle, int cpu)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004724{
Ingo Molnar70b97a72006-07-03 00:25:42 -07004725 struct rq *rq = cpu_rq(cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004726 unsigned long flags;
4727
Ingo Molnardd41f592007-07-09 18:51:59 +02004728 __sched_fork(idle);
4729 idle->se.exec_start = sched_clock();
4730
Ingo Molnarb29739f2006-06-27 02:54:51 -07004731 idle->prio = idle->normal_prio = MAX_PRIO;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004732 idle->cpus_allowed = cpumask_of_cpu(cpu);
Ingo Molnardd41f592007-07-09 18:51:59 +02004733 __set_task_cpu(idle, cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004734
4735 spin_lock_irqsave(&rq->lock, flags);
4736 rq->curr = rq->idle = idle;
Nick Piggin4866cde2005-06-25 14:57:23 -07004737#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
4738 idle->oncpu = 1;
4739#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07004740 spin_unlock_irqrestore(&rq->lock, flags);
4741
4742 /* Set the preempt count _outside_ the spinlocks! */
4743#if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL)
Al Viroa1261f52005-11-13 16:06:55 -08004744 task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004745#else
Al Viroa1261f52005-11-13 16:06:55 -08004746 task_thread_info(idle)->preempt_count = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004747#endif
Ingo Molnardd41f592007-07-09 18:51:59 +02004748 /*
4749 * The idle tasks have their own, simple scheduling class:
4750 */
4751 idle->sched_class = &idle_sched_class;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004752}
4753
4754/*
4755 * In a system that switches off the HZ timer nohz_cpu_mask
4756 * indicates which cpus entered this state. This is used
 4757 * in the rcu update to wait only for active cpus. For systems
 4758 * which do not switch off the HZ timer, nohz_cpu_mask should
4759 * always be CPU_MASK_NONE.
4760 */
4761cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
4762
Ingo Molnardd41f592007-07-09 18:51:59 +02004763/*
4764 * Increase the granularity value when there are more CPUs,
4765 * because with more CPUs the 'effective latency' as visible
4766 * to users decreases. But the relationship is not linear,
4767 * so pick a second-best guess by going with the log2 of the
4768 * number of CPUs.
4769 *
4770 * This idea comes from the SD scheduler of Con Kolivas:
4771 */
4772static inline void sched_init_granularity(void)
4773{
4774 unsigned int factor = 1 + ilog2(num_online_cpus());
4775 const unsigned long gran_limit = 10000000;
4776
4777 sysctl_sched_granularity *= factor;
4778 if (sysctl_sched_granularity > gran_limit)
4779 sysctl_sched_granularity = gran_limit;
4780
4781 sysctl_sched_runtime_limit = sysctl_sched_granularity * 4;
4782 sysctl_sched_wakeup_granularity = sysctl_sched_granularity / 2;
4783}
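
/*
 * Worked example of the scaling above (illustrative numbers only): with
 * 16 CPUs online, factor = 1 + ilog2(16) = 5, so the base granularity is
 * multiplied by 5 and then clamped to gran_limit (10000000, i.e. 10 ms
 * if the value is in nanoseconds). The runtime limit then ends up at 4x
 * the resulting granularity and the wakeup granularity at half of it.
 */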
4784
Linus Torvalds1da177e2005-04-16 15:20:36 -07004785#ifdef CONFIG_SMP
4786/*
4787 * This is how migration works:
4788 *
Ingo Molnar70b97a72006-07-03 00:25:42 -07004789 * 1) we queue a struct migration_req structure in the source CPU's
Linus Torvalds1da177e2005-04-16 15:20:36 -07004790 * runqueue and wake up that CPU's migration thread.
4791 * 2) we down() the locked semaphore => thread blocks.
4792 * 3) migration thread wakes up (implicitly it forces the migrated
4793 * thread off the CPU)
4794 * 4) it gets the migration request and checks whether the migrated
4795 * task is still in the wrong runqueue.
4796 * 5) if it's in the wrong runqueue then the migration thread removes
4797 * it and puts it into the right queue.
 4798 * 6) the migration thread completes the request.
4799 * 7) we wake up and the migration is done.
4800 */
4801
4802/*
4803 * Change a given task's CPU affinity. Migrate the thread to a
4804 * proper CPU and schedule it away if the CPU it's executing on
4805 * is removed from the allowed bitmask.
4806 *
4807 * NOTE: the caller must have a valid reference to the task, the
4808 * task must not exit() & deallocate itself prematurely. The
4809 * call is not atomic; no spinlocks may be held.
4810 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07004811int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004812{
Ingo Molnar70b97a72006-07-03 00:25:42 -07004813 struct migration_req req;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004814 unsigned long flags;
Ingo Molnar70b97a72006-07-03 00:25:42 -07004815 struct rq *rq;
Ingo Molnar48f24c42006-07-03 00:25:40 -07004816 int ret = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004817
4818 rq = task_rq_lock(p, &flags);
4819 if (!cpus_intersects(new_mask, cpu_online_map)) {
4820 ret = -EINVAL;
4821 goto out;
4822 }
4823
4824 p->cpus_allowed = new_mask;
4825 /* Can the task run on the task's current CPU? If so, we're done */
4826 if (cpu_isset(task_cpu(p), new_mask))
4827 goto out;
4828
4829 if (migrate_task(p, any_online_cpu(new_mask), &req)) {
4830 /* Need help from migration thread: drop lock and wait. */
4831 task_rq_unlock(rq, &flags);
4832 wake_up_process(rq->migration_thread);
4833 wait_for_completion(&req.done);
4834 tlb_migrate_finish(p->mm);
4835 return 0;
4836 }
4837out:
4838 task_rq_unlock(rq, &flags);
Ingo Molnar48f24c42006-07-03 00:25:40 -07004839
Linus Torvalds1da177e2005-04-16 15:20:36 -07004840 return ret;
4841}
Linus Torvalds1da177e2005-04-16 15:20:36 -07004842EXPORT_SYMBOL_GPL(set_cpus_allowed);
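
/*
 * Hypothetical in-kernel sketch (not code from this file): a kernel
 * thread restricting itself to one CPU with set_cpus_allowed(), the way
 * per-CPU helper threads commonly do. 'example_percpu_thread' and its
 * 'data' encoding of the CPU number are made up for illustration.
 */
static int example_percpu_thread(void *data)
{
        int cpu = (long)data;

        /* migrate ourselves to 'cpu' and refuse to run anywhere else */
        if (set_cpus_allowed(current, cpumask_of_cpu(cpu)) < 0)
                return -EINVAL;

        while (!kthread_should_stop())
                schedule_timeout_interruptible(HZ);

        return 0;
}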
4843
4844/*
4845 * Move (not current) task off this cpu, onto dest cpu. We're doing
4846 * this because either it can't run here any more (set_cpus_allowed()
4847 * away from this CPU, or CPU going down), or because we're
4848 * attempting to rebalance this task on exec (sched_exec).
4849 *
4850 * So we race with normal scheduler movements, but that's OK, as long
4851 * as the task is no longer on this CPU.
Kirill Korotaevefc30812006-06-27 02:54:32 -07004852 *
4853 * Returns non-zero if task was successfully migrated.
Linus Torvalds1da177e2005-04-16 15:20:36 -07004854 */
Kirill Korotaevefc30812006-06-27 02:54:32 -07004855static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004856{
Ingo Molnar70b97a72006-07-03 00:25:42 -07004857 struct rq *rq_dest, *rq_src;
Ingo Molnardd41f592007-07-09 18:51:59 +02004858 int ret = 0, on_rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004859
4860 if (unlikely(cpu_is_offline(dest_cpu)))
Kirill Korotaevefc30812006-06-27 02:54:32 -07004861 return ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004862
4863 rq_src = cpu_rq(src_cpu);
4864 rq_dest = cpu_rq(dest_cpu);
4865
4866 double_rq_lock(rq_src, rq_dest);
4867 /* Already moved. */
4868 if (task_cpu(p) != src_cpu)
4869 goto out;
4870 /* Affinity changed (again). */
4871 if (!cpu_isset(dest_cpu, p->cpus_allowed))
4872 goto out;
4873
Ingo Molnardd41f592007-07-09 18:51:59 +02004874 on_rq = p->se.on_rq;
4875 if (on_rq)
4876 deactivate_task(rq_src, p, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004877 set_task_cpu(p, dest_cpu);
Ingo Molnardd41f592007-07-09 18:51:59 +02004878 if (on_rq) {
4879 activate_task(rq_dest, p, 0);
4880 check_preempt_curr(rq_dest, p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004881 }
Kirill Korotaevefc30812006-06-27 02:54:32 -07004882 ret = 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004883out:
4884 double_rq_unlock(rq_src, rq_dest);
Kirill Korotaevefc30812006-06-27 02:54:32 -07004885 return ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004886}
4887
4888/*
4889 * migration_thread - this is a highprio system thread that performs
 4890 * thread migration by bumping a thread off its CPU and then 'pushing' it onto
4891 * another runqueue.
4892 */
Ingo Molnar95cdf3b2005-09-10 00:26:11 -07004893static int migration_thread(void *data)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004894{
Linus Torvalds1da177e2005-04-16 15:20:36 -07004895 int cpu = (long)data;
Ingo Molnar70b97a72006-07-03 00:25:42 -07004896 struct rq *rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004897
4898 rq = cpu_rq(cpu);
4899 BUG_ON(rq->migration_thread != current);
4900
4901 set_current_state(TASK_INTERRUPTIBLE);
4902 while (!kthread_should_stop()) {
Ingo Molnar70b97a72006-07-03 00:25:42 -07004903 struct migration_req *req;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004904 struct list_head *head;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004905
Christoph Lameter3e1d1d22005-06-24 23:13:50 -07004906 try_to_freeze();
Linus Torvalds1da177e2005-04-16 15:20:36 -07004907
4908 spin_lock_irq(&rq->lock);
4909
4910 if (cpu_is_offline(cpu)) {
4911 spin_unlock_irq(&rq->lock);
4912 goto wait_to_die;
4913 }
4914
4915 if (rq->active_balance) {
4916 active_load_balance(rq, cpu);
4917 rq->active_balance = 0;
4918 }
4919
4920 head = &rq->migration_queue;
4921
4922 if (list_empty(head)) {
4923 spin_unlock_irq(&rq->lock);
4924 schedule();
4925 set_current_state(TASK_INTERRUPTIBLE);
4926 continue;
4927 }
Ingo Molnar70b97a72006-07-03 00:25:42 -07004928 req = list_entry(head->next, struct migration_req, list);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004929 list_del_init(head->next);
4930
Nick Piggin674311d2005-06-25 14:57:27 -07004931 spin_unlock(&rq->lock);
4932 __migrate_task(req->task, cpu, req->dest_cpu);
4933 local_irq_enable();
Linus Torvalds1da177e2005-04-16 15:20:36 -07004934
4935 complete(&req->done);
4936 }
4937 __set_current_state(TASK_RUNNING);
4938 return 0;
4939
4940wait_to_die:
4941 /* Wait for kthread_stop */
4942 set_current_state(TASK_INTERRUPTIBLE);
4943 while (!kthread_should_stop()) {
4944 schedule();
4945 set_current_state(TASK_INTERRUPTIBLE);
4946 }
4947 __set_current_state(TASK_RUNNING);
4948 return 0;
4949}
4950
4951#ifdef CONFIG_HOTPLUG_CPU
Kirill Korotaev054b9102006-12-10 02:20:11 -08004952/*
 4953 * Figure out where a task on a dead CPU should go; use force if necessary.
4954 * NOTE: interrupts should be disabled by the caller
4955 */
Ingo Molnar48f24c42006-07-03 00:25:40 -07004956static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004957{
Kirill Korotaevefc30812006-06-27 02:54:32 -07004958 unsigned long flags;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004959 cpumask_t mask;
Ingo Molnar70b97a72006-07-03 00:25:42 -07004960 struct rq *rq;
4961 int dest_cpu;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004962
Kirill Korotaevefc30812006-06-27 02:54:32 -07004963restart:
Linus Torvalds1da177e2005-04-16 15:20:36 -07004964 /* On same node? */
4965 mask = node_to_cpumask(cpu_to_node(dead_cpu));
Ingo Molnar48f24c42006-07-03 00:25:40 -07004966 cpus_and(mask, mask, p->cpus_allowed);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004967 dest_cpu = any_online_cpu(mask);
4968
4969 /* On any allowed CPU? */
4970 if (dest_cpu == NR_CPUS)
Ingo Molnar48f24c42006-07-03 00:25:40 -07004971 dest_cpu = any_online_cpu(p->cpus_allowed);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004972
4973 /* No more Mr. Nice Guy. */
4974 if (dest_cpu == NR_CPUS) {
Ingo Molnar48f24c42006-07-03 00:25:40 -07004975 rq = task_rq_lock(p, &flags);
4976 cpus_setall(p->cpus_allowed);
4977 dest_cpu = any_online_cpu(p->cpus_allowed);
Kirill Korotaevefc30812006-06-27 02:54:32 -07004978 task_rq_unlock(rq, &flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004979
4980 /*
4981 * Don't tell them about moving exiting tasks or
4982 * kernel threads (both mm NULL), since they never
4983 * leave kernel.
4984 */
Ingo Molnar48f24c42006-07-03 00:25:40 -07004985 if (p->mm && printk_ratelimit())
Linus Torvalds1da177e2005-04-16 15:20:36 -07004986 printk(KERN_INFO "process %d (%s) no "
4987 "longer affine to cpu%d\n",
Ingo Molnar48f24c42006-07-03 00:25:40 -07004988 p->pid, p->comm, dead_cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004989 }
Ingo Molnar48f24c42006-07-03 00:25:40 -07004990 if (!__migrate_task(p, dead_cpu, dest_cpu))
Kirill Korotaevefc30812006-06-27 02:54:32 -07004991 goto restart;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004992}
4993
4994/*
4995 * While a dead CPU has no uninterruptible tasks queued at this point,
4996 * it might still have a nonzero ->nr_uninterruptible counter, because
 4997 * for performance reasons the counter is not strictly tracking tasks to
4998 * their home CPUs. So we just add the counter to another CPU's counter,
4999 * to keep the global sum constant after CPU-down:
5000 */
Ingo Molnar70b97a72006-07-03 00:25:42 -07005001static void migrate_nr_uninterruptible(struct rq *rq_src)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005002{
Ingo Molnar70b97a72006-07-03 00:25:42 -07005003 struct rq *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL));
Linus Torvalds1da177e2005-04-16 15:20:36 -07005004 unsigned long flags;
5005
5006 local_irq_save(flags);
5007 double_rq_lock(rq_src, rq_dest);
5008 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
5009 rq_src->nr_uninterruptible = 0;
5010 double_rq_unlock(rq_src, rq_dest);
5011 local_irq_restore(flags);
5012}
5013
5014/* Run through task list and migrate tasks from the dead cpu. */
5015static void migrate_live_tasks(int src_cpu)
5016{
Ingo Molnar48f24c42006-07-03 00:25:40 -07005017 struct task_struct *p, *t;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005018
5019 write_lock_irq(&tasklist_lock);
5020
Ingo Molnar48f24c42006-07-03 00:25:40 -07005021 do_each_thread(t, p) {
5022 if (p == current)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005023 continue;
5024
Ingo Molnar48f24c42006-07-03 00:25:40 -07005025 if (task_cpu(p) == src_cpu)
5026 move_task_off_dead_cpu(src_cpu, p);
5027 } while_each_thread(t, p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005028
5029 write_unlock_irq(&tasklist_lock);
5030}
5031
Ingo Molnardd41f592007-07-09 18:51:59 +02005032/*
5033 * Schedules idle task to be the next runnable task on current CPU.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005034 * It does so by boosting its priority to the highest possible and adding it to
Ingo Molnar48f24c42006-07-03 00:25:40 -07005035 * the _front_ of the runqueue. Used by CPU offline code.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005036 */
5037void sched_idle_next(void)
5038{
Ingo Molnar48f24c42006-07-03 00:25:40 -07005039 int this_cpu = smp_processor_id();
Ingo Molnar70b97a72006-07-03 00:25:42 -07005040 struct rq *rq = cpu_rq(this_cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005041 struct task_struct *p = rq->idle;
5042 unsigned long flags;
5043
5044 /* cpu has to be offline */
Ingo Molnar48f24c42006-07-03 00:25:40 -07005045 BUG_ON(cpu_online(this_cpu));
Linus Torvalds1da177e2005-04-16 15:20:36 -07005046
Ingo Molnar48f24c42006-07-03 00:25:40 -07005047 /*
 5048	 * Strictly not necessary since the rest of the CPUs are stopped by now
5049 * and interrupts disabled on the current cpu.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005050 */
5051 spin_lock_irqsave(&rq->lock, flags);
5052
Ingo Molnardd41f592007-07-09 18:51:59 +02005053 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
Ingo Molnar48f24c42006-07-03 00:25:40 -07005054
5055 /* Add idle task to the _front_ of its priority queue: */
Ingo Molnardd41f592007-07-09 18:51:59 +02005056 activate_idle_task(p, rq);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005057
5058 spin_unlock_irqrestore(&rq->lock, flags);
5059}
5060
Ingo Molnar48f24c42006-07-03 00:25:40 -07005061/*
5062 * Ensures that the idle task is using init_mm right before its cpu goes
Linus Torvalds1da177e2005-04-16 15:20:36 -07005063 * offline.
5064 */
5065void idle_task_exit(void)
5066{
5067 struct mm_struct *mm = current->active_mm;
5068
5069 BUG_ON(cpu_online(smp_processor_id()));
5070
5071 if (mm != &init_mm)
5072 switch_mm(mm, &init_mm, current);
5073 mmdrop(mm);
5074}
5075
Kirill Korotaev054b9102006-12-10 02:20:11 -08005076/* called under rq->lock with disabled interrupts */
Ingo Molnar36c8b582006-07-03 00:25:41 -07005077static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005078{
Ingo Molnar70b97a72006-07-03 00:25:42 -07005079 struct rq *rq = cpu_rq(dead_cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005080
5081 /* Must be exiting, otherwise would be on tasklist. */
Ingo Molnar48f24c42006-07-03 00:25:40 -07005082 BUG_ON(p->exit_state != EXIT_ZOMBIE && p->exit_state != EXIT_DEAD);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005083
5084 /* Cannot have done final schedule yet: would have vanished. */
Oleg Nesterovc394cc92006-09-29 02:01:11 -07005085 BUG_ON(p->state == TASK_DEAD);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005086
Ingo Molnar48f24c42006-07-03 00:25:40 -07005087 get_task_struct(p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005088
5089 /*
5090 * Drop lock around migration; if someone else moves it,
5091 * that's OK. No task can be added to this CPU, so iteration is
5092 * fine.
Kirill Korotaev054b9102006-12-10 02:20:11 -08005093 * NOTE: interrupts should be left disabled --dev@
Linus Torvalds1da177e2005-04-16 15:20:36 -07005094 */
Kirill Korotaev054b9102006-12-10 02:20:11 -08005095 spin_unlock(&rq->lock);
Ingo Molnar48f24c42006-07-03 00:25:40 -07005096 move_task_off_dead_cpu(dead_cpu, p);
Kirill Korotaev054b9102006-12-10 02:20:11 -08005097 spin_lock(&rq->lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005098
Ingo Molnar48f24c42006-07-03 00:25:40 -07005099 put_task_struct(p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005100}
5101
5102/* release_task() removes task from tasklist, so we won't find dead tasks. */
5103static void migrate_dead_tasks(unsigned int dead_cpu)
5104{
Ingo Molnar70b97a72006-07-03 00:25:42 -07005105 struct rq *rq = cpu_rq(dead_cpu);
Ingo Molnardd41f592007-07-09 18:51:59 +02005106 struct task_struct *next;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005107
Ingo Molnardd41f592007-07-09 18:51:59 +02005108 for ( ; ; ) {
5109 if (!rq->nr_running)
5110 break;
5111 next = pick_next_task(rq, rq->curr, rq_clock(rq));
5112 if (!next)
5113 break;
5114 migrate_dead(dead_cpu, next);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005115 }
5116}
5117#endif /* CONFIG_HOTPLUG_CPU */
5118
5119/*
5120 * migration_call - callback that gets triggered when a CPU is added.
5121 * Here we can start up the necessary migration thread for the new CPU.
5122 */
Ingo Molnar48f24c42006-07-03 00:25:40 -07005123static int __cpuinit
5124migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005125{
Linus Torvalds1da177e2005-04-16 15:20:36 -07005126 struct task_struct *p;
Ingo Molnar48f24c42006-07-03 00:25:40 -07005127 int cpu = (long)hcpu;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005128 unsigned long flags;
Ingo Molnar70b97a72006-07-03 00:25:42 -07005129 struct rq *rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005130
5131 switch (action) {
Gautham R Shenoy5be93612007-05-09 02:34:04 -07005132 case CPU_LOCK_ACQUIRE:
5133 mutex_lock(&sched_hotcpu_mutex);
5134 break;
5135
Linus Torvalds1da177e2005-04-16 15:20:36 -07005136 case CPU_UP_PREPARE:
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -07005137 case CPU_UP_PREPARE_FROZEN:
Ingo Molnardd41f592007-07-09 18:51:59 +02005138 p = kthread_create(migration_thread, hcpu, "migration/%d", cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005139 if (IS_ERR(p))
5140 return NOTIFY_BAD;
5141 p->flags |= PF_NOFREEZE;
5142 kthread_bind(p, cpu);
5143 /* Must be high prio: stop_machine expects to yield to it. */
5144 rq = task_rq_lock(p, &flags);
Ingo Molnardd41f592007-07-09 18:51:59 +02005145 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005146 task_rq_unlock(rq, &flags);
5147 cpu_rq(cpu)->migration_thread = p;
5148 break;
Ingo Molnar48f24c42006-07-03 00:25:40 -07005149
Linus Torvalds1da177e2005-04-16 15:20:36 -07005150 case CPU_ONLINE:
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -07005151 case CPU_ONLINE_FROZEN:
Linus Torvalds1da177e2005-04-16 15:20:36 -07005152 /* Strictly unnecessary, as the first user will wake it. */
5153 wake_up_process(cpu_rq(cpu)->migration_thread);
5154 break;
Ingo Molnar48f24c42006-07-03 00:25:40 -07005155
Linus Torvalds1da177e2005-04-16 15:20:36 -07005156#ifdef CONFIG_HOTPLUG_CPU
5157 case CPU_UP_CANCELED:
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -07005158 case CPU_UP_CANCELED_FROZEN:
Heiko Carstensfc75cdf2006-06-25 05:49:10 -07005159 if (!cpu_rq(cpu)->migration_thread)
5160 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005161 /* Unbind it from offline cpu so it can run. Fall thru. */
Heiko Carstensa4c4af72005-11-07 00:58:38 -08005162 kthread_bind(cpu_rq(cpu)->migration_thread,
5163 any_online_cpu(cpu_online_map));
Linus Torvalds1da177e2005-04-16 15:20:36 -07005164 kthread_stop(cpu_rq(cpu)->migration_thread);
5165 cpu_rq(cpu)->migration_thread = NULL;
5166 break;
Ingo Molnar48f24c42006-07-03 00:25:40 -07005167
Linus Torvalds1da177e2005-04-16 15:20:36 -07005168 case CPU_DEAD:
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -07005169 case CPU_DEAD_FROZEN:
Linus Torvalds1da177e2005-04-16 15:20:36 -07005170 migrate_live_tasks(cpu);
5171 rq = cpu_rq(cpu);
5172 kthread_stop(rq->migration_thread);
5173 rq->migration_thread = NULL;
5174 /* Idle task back to normal (off runqueue, low prio) */
5175 rq = task_rq_lock(rq->idle, &flags);
Ingo Molnardd41f592007-07-09 18:51:59 +02005176 deactivate_task(rq, rq->idle, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005177 rq->idle->static_prio = MAX_PRIO;
Ingo Molnardd41f592007-07-09 18:51:59 +02005178 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
5179 rq->idle->sched_class = &idle_sched_class;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005180 migrate_dead_tasks(cpu);
5181 task_rq_unlock(rq, &flags);
5182 migrate_nr_uninterruptible(rq);
5183 BUG_ON(rq->nr_running != 0);
5184
5185 /* No need to migrate the tasks: it was best-effort if
Gautham R Shenoy5be93612007-05-09 02:34:04 -07005186 * they didn't take sched_hotcpu_mutex. Just wake up
Linus Torvalds1da177e2005-04-16 15:20:36 -07005187 * the requestors. */
5188 spin_lock_irq(&rq->lock);
5189 while (!list_empty(&rq->migration_queue)) {
Ingo Molnar70b97a72006-07-03 00:25:42 -07005190 struct migration_req *req;
5191
Linus Torvalds1da177e2005-04-16 15:20:36 -07005192 req = list_entry(rq->migration_queue.next,
Ingo Molnar70b97a72006-07-03 00:25:42 -07005193 struct migration_req, list);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005194 list_del_init(&req->list);
5195 complete(&req->done);
5196 }
5197 spin_unlock_irq(&rq->lock);
5198 break;
5199#endif
Gautham R Shenoy5be93612007-05-09 02:34:04 -07005200 case CPU_LOCK_RELEASE:
5201 mutex_unlock(&sched_hotcpu_mutex);
5202 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005203 }
5204 return NOTIFY_OK;
5205}
5206
5207/* Register at highest priority so that task migration (migrate_all_tasks)
5208 * happens before everything else.
5209 */
Chandra Seetharaman26c21432006-06-27 02:54:10 -07005210static struct notifier_block __cpuinitdata migration_notifier = {
Linus Torvalds1da177e2005-04-16 15:20:36 -07005211 .notifier_call = migration_call,
5212 .priority = 10
5213};
5214
5215int __init migration_init(void)
5216{
5217 void *cpu = (void *)(long)smp_processor_id();
Akinobu Mita07dccf32006-09-29 02:00:22 -07005218 int err;
Ingo Molnar48f24c42006-07-03 00:25:40 -07005219
5220 /* Start one for the boot CPU: */
Akinobu Mita07dccf32006-09-29 02:00:22 -07005221 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
5222 BUG_ON(err == NOTIFY_BAD);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005223 migration_call(&migration_notifier, CPU_ONLINE, cpu);
5224 register_cpu_notifier(&migration_notifier);
Ingo Molnar48f24c42006-07-03 00:25:40 -07005225
Linus Torvalds1da177e2005-04-16 15:20:36 -07005226 return 0;
5227}
5228#endif
5229
5230#ifdef CONFIG_SMP
Christoph Lameter476f3532007-05-06 14:48:58 -07005231
5232/* Number of possible processor ids */
5233int nr_cpu_ids __read_mostly = NR_CPUS;
5234EXPORT_SYMBOL(nr_cpu_ids);
5235
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07005236#undef SCHED_DOMAIN_DEBUG
Linus Torvalds1da177e2005-04-16 15:20:36 -07005237#ifdef SCHED_DOMAIN_DEBUG
5238static void sched_domain_debug(struct sched_domain *sd, int cpu)
5239{
5240 int level = 0;
5241
Nick Piggin41c7ce92005-06-25 14:57:24 -07005242 if (!sd) {
5243 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
5244 return;
5245 }
5246
Linus Torvalds1da177e2005-04-16 15:20:36 -07005247 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
5248
5249 do {
5250 int i;
5251 char str[NR_CPUS];
5252 struct sched_group *group = sd->groups;
5253 cpumask_t groupmask;
5254
5255 cpumask_scnprintf(str, NR_CPUS, sd->span);
5256 cpus_clear(groupmask);
5257
5258 printk(KERN_DEBUG);
5259 for (i = 0; i < level + 1; i++)
5260 printk(" ");
5261 printk("domain %d: ", level);
5262
5263 if (!(sd->flags & SD_LOAD_BALANCE)) {
5264 printk("does not load-balance\n");
5265 if (sd->parent)
Miguel Ojeda Sandonis33859f72006-12-10 02:20:38 -08005266 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
5267 " has parent");
Linus Torvalds1da177e2005-04-16 15:20:36 -07005268 break;
5269 }
5270
5271 printk("span %s\n", str);
5272
5273 if (!cpu_isset(cpu, sd->span))
Miguel Ojeda Sandonis33859f72006-12-10 02:20:38 -08005274 printk(KERN_ERR "ERROR: domain->span does not contain "
5275 "CPU%d\n", cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005276 if (!cpu_isset(cpu, group->cpumask))
Miguel Ojeda Sandonis33859f72006-12-10 02:20:38 -08005277 printk(KERN_ERR "ERROR: domain->groups does not contain"
5278 " CPU%d\n", cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005279
5280 printk(KERN_DEBUG);
5281 for (i = 0; i < level + 2; i++)
5282 printk(" ");
5283 printk("groups:");
5284 do {
5285 if (!group) {
5286 printk("\n");
5287 printk(KERN_ERR "ERROR: group is NULL\n");
5288 break;
5289 }
5290
Eric Dumazet5517d862007-05-08 00:32:57 -07005291 if (!group->__cpu_power) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07005292 printk("\n");
Miguel Ojeda Sandonis33859f72006-12-10 02:20:38 -08005293 printk(KERN_ERR "ERROR: domain->cpu_power not "
5294 "set\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -07005295 }
5296
5297 if (!cpus_weight(group->cpumask)) {
5298 printk("\n");
5299 printk(KERN_ERR "ERROR: empty group\n");
5300 }
5301
5302 if (cpus_intersects(groupmask, group->cpumask)) {
5303 printk("\n");
5304 printk(KERN_ERR "ERROR: repeated CPUs\n");
5305 }
5306
5307 cpus_or(groupmask, groupmask, group->cpumask);
5308
5309 cpumask_scnprintf(str, NR_CPUS, group->cpumask);
5310 printk(" %s", str);
5311
5312 group = group->next;
5313 } while (group != sd->groups);
5314 printk("\n");
5315
5316 if (!cpus_equal(sd->span, groupmask))
Miguel Ojeda Sandonis33859f72006-12-10 02:20:38 -08005317 printk(KERN_ERR "ERROR: groups don't span "
5318 "domain->span\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -07005319
5320 level++;
5321 sd = sd->parent;
Miguel Ojeda Sandonis33859f72006-12-10 02:20:38 -08005322 if (!sd)
5323 continue;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005324
Miguel Ojeda Sandonis33859f72006-12-10 02:20:38 -08005325 if (!cpus_subset(groupmask, sd->span))
5326 printk(KERN_ERR "ERROR: parent span is not a superset "
5327 "of domain->span\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -07005328
5329 } while (sd);
5330}
5331#else
Ingo Molnar48f24c42006-07-03 00:25:40 -07005332# define sched_domain_debug(sd, cpu) do { } while (0)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005333#endif
5334
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07005335static int sd_degenerate(struct sched_domain *sd)
Suresh Siddha245af2c2005-06-25 14:57:25 -07005336{
5337 if (cpus_weight(sd->span) == 1)
5338 return 1;
5339
5340 /* Following flags need at least 2 groups */
5341 if (sd->flags & (SD_LOAD_BALANCE |
5342 SD_BALANCE_NEWIDLE |
5343 SD_BALANCE_FORK |
Siddha, Suresh B89c47102006-10-03 01:14:09 -07005344 SD_BALANCE_EXEC |
5345 SD_SHARE_CPUPOWER |
5346 SD_SHARE_PKG_RESOURCES)) {
Suresh Siddha245af2c2005-06-25 14:57:25 -07005347 if (sd->groups != sd->groups->next)
5348 return 0;
5349 }
5350
5351 /* Following flags don't use groups */
5352 if (sd->flags & (SD_WAKE_IDLE |
5353 SD_WAKE_AFFINE |
5354 SD_WAKE_BALANCE))
5355 return 0;
5356
5357 return 1;
5358}
5359
Ingo Molnar48f24c42006-07-03 00:25:40 -07005360static int
5361sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
Suresh Siddha245af2c2005-06-25 14:57:25 -07005362{
5363 unsigned long cflags = sd->flags, pflags = parent->flags;
5364
5365 if (sd_degenerate(parent))
5366 return 1;
5367
5368 if (!cpus_equal(sd->span, parent->span))
5369 return 0;
5370
5371 /* Does parent contain flags not in child? */
5372 /* WAKE_BALANCE is a subset of WAKE_AFFINE */
5373 if (cflags & SD_WAKE_AFFINE)
5374 pflags &= ~SD_WAKE_BALANCE;
5375 /* Flags needing groups don't count if only 1 group in parent */
5376 if (parent->groups == parent->groups->next) {
5377 pflags &= ~(SD_LOAD_BALANCE |
5378 SD_BALANCE_NEWIDLE |
5379 SD_BALANCE_FORK |
Siddha, Suresh B89c47102006-10-03 01:14:09 -07005380 SD_BALANCE_EXEC |
5381 SD_SHARE_CPUPOWER |
5382 SD_SHARE_PKG_RESOURCES);
Suresh Siddha245af2c2005-06-25 14:57:25 -07005383 }
5384 if (~cflags & pflags)
5385 return 0;
5386
5387 return 1;
5388}
5389
Linus Torvalds1da177e2005-04-16 15:20:36 -07005390/*
5391 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
5392 * hold the hotplug lock.
5393 */
John Hawkes9c1cfda2005-09-06 15:18:14 -07005394static void cpu_attach_domain(struct sched_domain *sd, int cpu)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005395{
Ingo Molnar70b97a72006-07-03 00:25:42 -07005396 struct rq *rq = cpu_rq(cpu);
Suresh Siddha245af2c2005-06-25 14:57:25 -07005397 struct sched_domain *tmp;
5398
5399 /* Remove the sched domains which do not contribute to scheduling. */
5400 for (tmp = sd; tmp; tmp = tmp->parent) {
5401 struct sched_domain *parent = tmp->parent;
5402 if (!parent)
5403 break;
Siddha, Suresh B1a848872006-10-03 01:14:08 -07005404 if (sd_parent_degenerate(tmp, parent)) {
Suresh Siddha245af2c2005-06-25 14:57:25 -07005405 tmp->parent = parent->parent;
Siddha, Suresh B1a848872006-10-03 01:14:08 -07005406 if (parent->parent)
5407 parent->parent->child = tmp;
5408 }
Suresh Siddha245af2c2005-06-25 14:57:25 -07005409 }
5410
Siddha, Suresh B1a848872006-10-03 01:14:08 -07005411 if (sd && sd_degenerate(sd)) {
Suresh Siddha245af2c2005-06-25 14:57:25 -07005412 sd = sd->parent;
Siddha, Suresh B1a848872006-10-03 01:14:08 -07005413 if (sd)
5414 sd->child = NULL;
5415 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07005416
5417 sched_domain_debug(sd, cpu);
5418
Nick Piggin674311d2005-06-25 14:57:27 -07005419 rcu_assign_pointer(rq->sd, sd);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005420}
5421
5422/* cpus with isolated domains */
Tim Chen67af63a2006-12-22 01:07:50 -08005423static cpumask_t cpu_isolated_map = CPU_MASK_NONE;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005424
5425/* Setup the mask of cpus configured for isolated domains */
5426static int __init isolated_cpu_setup(char *str)
5427{
5428 int ints[NR_CPUS], i;
5429
5430 str = get_options(str, ARRAY_SIZE(ints), ints);
5431 cpus_clear(cpu_isolated_map);
5432 for (i = 1; i <= ints[0]; i++)
5433 if (ints[i] < NR_CPUS)
5434 cpu_set(ints[i], cpu_isolated_map);
5435 return 1;
5436}
5437
5438__setup ("isolcpus=", isolated_cpu_setup);
5439
5440/*
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005441 * init_sched_build_groups takes the cpumask we wish to span, and a pointer
 5442 * to a function which identifies what group (along with sched group) a CPU
 5443 * belongs to. The return value of group_fn must be >= 0 and < NR_CPUS
5444 * (due to the fact that we keep track of groups covered with a cpumask_t).
Linus Torvalds1da177e2005-04-16 15:20:36 -07005445 *
5446 * init_sched_build_groups will build a circular linked list of the groups
5447 * covered by the given span, and will set each group's ->cpumask correctly,
5448 * and ->cpu_power to 0.
5449 */
Siddha, Suresh Ba6160582006-10-03 01:14:06 -07005450static void
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005451init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map,
5452 int (*group_fn)(int cpu, const cpumask_t *cpu_map,
5453 struct sched_group **sg))
Linus Torvalds1da177e2005-04-16 15:20:36 -07005454{
5455 struct sched_group *first = NULL, *last = NULL;
5456 cpumask_t covered = CPU_MASK_NONE;
5457 int i;
5458
5459 for_each_cpu_mask(i, span) {
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005460 struct sched_group *sg;
5461 int group = group_fn(i, cpu_map, &sg);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005462 int j;
5463
5464 if (cpu_isset(i, covered))
5465 continue;
5466
5467 sg->cpumask = CPU_MASK_NONE;
Eric Dumazet5517d862007-05-08 00:32:57 -07005468 sg->__cpu_power = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005469
5470 for_each_cpu_mask(j, span) {
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005471 if (group_fn(j, cpu_map, NULL) != group)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005472 continue;
5473
5474 cpu_set(j, covered);
5475 cpu_set(j, sg->cpumask);
5476 }
5477 if (!first)
5478 first = sg;
5479 if (last)
5480 last->next = sg;
5481 last = sg;
5482 }
5483 last->next = first;
5484}
5485
John Hawkes9c1cfda2005-09-06 15:18:14 -07005486#define SD_NODES_PER_DOMAIN 16
Linus Torvalds1da177e2005-04-16 15:20:36 -07005487
John Hawkes9c1cfda2005-09-06 15:18:14 -07005488#ifdef CONFIG_NUMA
akpm@osdl.org198e2f12006-01-12 01:05:30 -08005489
John Hawkes9c1cfda2005-09-06 15:18:14 -07005490/**
5491 * find_next_best_node - find the next node to include in a sched_domain
5492 * @node: node whose sched_domain we're building
5493 * @used_nodes: nodes already in the sched_domain
5494 *
5495 * Find the next node to include in a given scheduling domain. Simply
5496 * finds the closest node not already in the @used_nodes map.
5497 *
5498 * Should use nodemask_t.
5499 */
5500static int find_next_best_node(int node, unsigned long *used_nodes)
5501{
5502 int i, n, val, min_val, best_node = 0;
5503
5504 min_val = INT_MAX;
5505
5506 for (i = 0; i < MAX_NUMNODES; i++) {
5507 /* Start at @node */
5508 n = (node + i) % MAX_NUMNODES;
5509
5510 if (!nr_cpus_node(n))
5511 continue;
5512
5513 /* Skip already used nodes */
5514 if (test_bit(n, used_nodes))
5515 continue;
5516
5517 /* Simple min distance search */
5518 val = node_distance(node, n);
5519
5520 if (val < min_val) {
5521 min_val = val;
5522 best_node = n;
5523 }
5524 }
5525
5526 set_bit(best_node, used_nodes);
5527 return best_node;
5528}
5529
5530/**
5531 * sched_domain_node_span - get a cpumask for a node's sched_domain
5532 * @node: node whose cpumask we're constructing
5533 * @size: number of nodes to include in this span
5534 *
5535 * Given a node, construct a good cpumask for its sched_domain to span. It
5536 * should be one that prevents unnecessary balancing, but also spreads tasks
5537 * out optimally.
5538 */
5539static cpumask_t sched_domain_node_span(int node)
5540{
John Hawkes9c1cfda2005-09-06 15:18:14 -07005541 DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
Ingo Molnar48f24c42006-07-03 00:25:40 -07005542 cpumask_t span, nodemask;
5543 int i;
John Hawkes9c1cfda2005-09-06 15:18:14 -07005544
5545 cpus_clear(span);
5546 bitmap_zero(used_nodes, MAX_NUMNODES);
5547
5548 nodemask = node_to_cpumask(node);
5549 cpus_or(span, span, nodemask);
5550 set_bit(node, used_nodes);
5551
5552 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
5553 int next_node = find_next_best_node(node, used_nodes);
Ingo Molnar48f24c42006-07-03 00:25:40 -07005554
John Hawkes9c1cfda2005-09-06 15:18:14 -07005555 nodemask = node_to_cpumask(next_node);
5556 cpus_or(span, span, nodemask);
5557 }
5558
5559 return span;
5560}
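
/*
 * With SD_NODES_PER_DOMAIN == 16 the returned span covers the CPUs of the
 * node itself plus up to 15 of its nearest neighbours, as measured by
 * node_distance().
 */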
5561#endif
5562
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07005563int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
Ingo Molnar48f24c42006-07-03 00:25:40 -07005564
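/*
 * The cpu_to_*_group() helpers below map a CPU to its balancing group at a
 * given domain level: each returns the group number (typically the first
 * CPU of the group's span within *cpu_map) and, when @sg is non-NULL, also
 * points it at the corresponding per-cpu sched_group.
 */
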
John Hawkes9c1cfda2005-09-06 15:18:14 -07005565/*
Ingo Molnar48f24c42006-07-03 00:25:40 -07005566 * SMT sched-domains:
John Hawkes9c1cfda2005-09-06 15:18:14 -07005567 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07005568#ifdef CONFIG_SCHED_SMT
5569static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005570static DEFINE_PER_CPU(struct sched_group, sched_group_cpus);
Ingo Molnar48f24c42006-07-03 00:25:40 -07005571
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005572static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map,
5573 struct sched_group **sg)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005574{
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005575 if (sg)
5576 *sg = &per_cpu(sched_group_cpus, cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005577 return cpu;
5578}
5579#endif
5580
Ingo Molnar48f24c42006-07-03 00:25:40 -07005581/*
5582 * multi-core sched-domains:
5583 */
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08005584#ifdef CONFIG_SCHED_MC
5585static DEFINE_PER_CPU(struct sched_domain, core_domains);
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005586static DEFINE_PER_CPU(struct sched_group, sched_group_core);
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08005587#endif
5588
5589#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005590static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
5591 struct sched_group **sg)
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08005592{
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005593 int group;
Siddha, Suresh Ba6160582006-10-03 01:14:06 -07005594 cpumask_t mask = cpu_sibling_map[cpu];
5595 cpus_and(mask, mask, *cpu_map);
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005596 group = first_cpu(mask);
5597 if (sg)
5598 *sg = &per_cpu(sched_group_core, group);
5599 return group;
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08005600}
5601#elif defined(CONFIG_SCHED_MC)
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005602static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
5603 struct sched_group **sg)
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08005604{
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005605 if (sg)
5606 *sg = &per_cpu(sched_group_core, cpu);
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08005607 return cpu;
5608}
5609#endif
5610
Linus Torvalds1da177e2005-04-16 15:20:36 -07005611static DEFINE_PER_CPU(struct sched_domain, phys_domains);
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005612static DEFINE_PER_CPU(struct sched_group, sched_group_phys);
Ingo Molnar48f24c42006-07-03 00:25:40 -07005613
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005614static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map,
5615 struct sched_group **sg)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005616{
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005617 int group;
Ingo Molnar48f24c42006-07-03 00:25:40 -07005618#ifdef CONFIG_SCHED_MC
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08005619 cpumask_t mask = cpu_coregroup_map(cpu);
Siddha, Suresh Ba6160582006-10-03 01:14:06 -07005620 cpus_and(mask, mask, *cpu_map);
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005621 group = first_cpu(mask);
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08005622#elif defined(CONFIG_SCHED_SMT)
Siddha, Suresh Ba6160582006-10-03 01:14:06 -07005623 cpumask_t mask = cpu_sibling_map[cpu];
5624 cpus_and(mask, mask, *cpu_map);
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005625 group = first_cpu(mask);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005626#else
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005627 group = cpu;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005628#endif
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005629 if (sg)
5630 *sg = &per_cpu(sched_group_phys, group);
5631 return group;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005632}
5633
5634#ifdef CONFIG_NUMA
John Hawkes9c1cfda2005-09-06 15:18:14 -07005635/*
5636 * The init_sched_build_groups can't handle what we want to do with node
5637 * groups, so roll our own. Now each node has its own list of groups which
5638 * gets dynamically allocated.
5639 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07005640static DEFINE_PER_CPU(struct sched_domain, node_domains);
John Hawkesd1b55132005-09-06 15:18:14 -07005641static struct sched_group **sched_group_nodes_bycpu[NR_CPUS];
John Hawkes9c1cfda2005-09-06 15:18:14 -07005642
5643static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005644static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes);
John Hawkes9c1cfda2005-09-06 15:18:14 -07005645
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005646static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map,
5647 struct sched_group **sg)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005648{
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005649 cpumask_t nodemask = node_to_cpumask(cpu_to_node(cpu));
5650 int group;
5651
5652 cpus_and(nodemask, nodemask, *cpu_map);
5653 group = first_cpu(nodemask);
5654
5655 if (sg)
5656 *sg = &per_cpu(sched_group_allnodes, group);
5657 return group;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005658}
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005659
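/*
 * Propagate cpu_power up to the NUMA level: walk the circular list headed
 * by @group_head and, once per physical package, add that package's
 * group power into the node-level group that contains it.
 */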
Siddha, Suresh B08069032006-03-27 01:15:23 -08005660static void init_numa_sched_groups_power(struct sched_group *group_head)
5661{
5662 struct sched_group *sg = group_head;
5663 int j;
5664
5665 if (!sg)
5666 return;
5667next_sg:
5668 for_each_cpu_mask(j, sg->cpumask) {
5669 struct sched_domain *sd;
5670
5671 sd = &per_cpu(phys_domains, j);
5672 if (j != first_cpu(sd->groups->cpumask)) {
5673 /*
5674 * Only add "power" once for each
5675 * physical package.
5676 */
5677 continue;
5678 }
5679
Eric Dumazet5517d862007-05-08 00:32:57 -07005680 sg_inc_cpu_power(sg, sd->groups->__cpu_power);
Siddha, Suresh B08069032006-03-27 01:15:23 -08005681 }
5682 sg = sg->next;
5683 if (sg != group_head)
5684 goto next_sg;
5685}
Linus Torvalds1da177e2005-04-16 15:20:36 -07005686#endif
5687
Siddha, Suresh Ba6160582006-10-03 01:14:06 -07005688#ifdef CONFIG_NUMA
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07005689/* Free memory allocated for various sched_group structures */
5690static void free_sched_groups(const cpumask_t *cpu_map)
5691{
Siddha, Suresh Ba6160582006-10-03 01:14:06 -07005692 int cpu, i;
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07005693
5694 for_each_cpu_mask(cpu, *cpu_map) {
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07005695 struct sched_group **sched_group_nodes
5696 = sched_group_nodes_bycpu[cpu];
5697
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07005698 if (!sched_group_nodes)
5699 continue;
5700
5701 for (i = 0; i < MAX_NUMNODES; i++) {
5702 cpumask_t nodemask = node_to_cpumask(i);
5703 struct sched_group *oldsg, *sg = sched_group_nodes[i];
5704
5705 cpus_and(nodemask, nodemask, *cpu_map);
5706 if (cpus_empty(nodemask))
5707 continue;
5708
5709 if (sg == NULL)
5710 continue;
5711 sg = sg->next;
5712next_sg:
5713 oldsg = sg;
5714 sg = sg->next;
5715 kfree(oldsg);
5716 if (oldsg != sched_group_nodes[i])
5717 goto next_sg;
5718 }
5719 kfree(sched_group_nodes);
5720 sched_group_nodes_bycpu[cpu] = NULL;
5721 }
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07005722}
Siddha, Suresh Ba6160582006-10-03 01:14:06 -07005723#else
5724static void free_sched_groups(const cpumask_t *cpu_map)
5725{
5726}
5727#endif
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07005728
Linus Torvalds1da177e2005-04-16 15:20:36 -07005729/*
Siddha, Suresh B89c47102006-10-03 01:14:09 -07005730 * Initialize sched groups cpu_power.
5731 *
5732 * cpu_power indicates the capacity of sched group, which is used while
5733 * distributing the load between different sched groups in a sched domain.
5734 * Typically cpu_power for all the groups in a sched domain will be same unless
5735 * there are asymmetries in the topology. If there are asymmetries, group
5736 * having more cpu_power will pickup more load compared to the group having
5737 * less cpu_power.
5738 *
5739 * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents
5740 * the maximum number of tasks a group can handle in the presence of other idle
5741 * or lightly loaded groups in the same sched domain.
5742 */
5743static void init_sched_groups_power(int cpu, struct sched_domain *sd)
5744{
5745 struct sched_domain *child;
5746 struct sched_group *group;
5747
5748 WARN_ON(!sd || !sd->groups);
5749
5750 if (cpu != first_cpu(sd->groups->cpumask))
5751 return;
5752
5753 child = sd->child;
5754
Eric Dumazet5517d862007-05-08 00:32:57 -07005755 sd->groups->__cpu_power = 0;
5756
Siddha, Suresh B89c47102006-10-03 01:14:09 -07005757 /*
5758 * For the performance policy, if the groups in the child domain share
5759 * resources (for example cores sharing portions of the cache hierarchy,
5760 * or SMT siblings), then set this domain's group cpu_power such that each
5761 * group can handle only one task when there are other idle groups in the
5762 * same sched domain.
5763 */
5764 if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) &&
5765 (child->flags &
5766 (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) {
Eric Dumazet5517d862007-05-08 00:32:57 -07005767 sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE);
Siddha, Suresh B89c47102006-10-03 01:14:09 -07005768 return;
5769 }
5770
Siddha, Suresh B89c47102006-10-03 01:14:09 -07005771 /*
5772 * Add the cpu_power of each child group to this group's cpu_power.
5773 */
5774 group = child->groups;
5775 do {
Eric Dumazet5517d862007-05-08 00:32:57 -07005776 sg_inc_cpu_power(sd->groups, group->__cpu_power);
Siddha, Suresh B89c47102006-10-03 01:14:09 -07005777 group = group->next;
5778 } while (group != child->groups);
5779}
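
/*
 * Example of the rule above: with the default performance policy, a package
 * whose child domain sets SD_SHARE_CPUPOWER (SMT siblings) ends up with a
 * group power of exactly SCHED_LOAD_SCALE rather than the sum over its
 * siblings, so the balancer prefers other idle packages before stacking
 * two tasks onto one package's sibling threads.
 */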
5780
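/*
 * The per-cpu domain hierarchy built below runs, from lowest to highest
 * level: SMT siblings (CONFIG_SCHED_SMT) -> multi-core (CONFIG_SCHED_MC) ->
 * physical CPUs within a node -> NUMA node -> all nodes (both CONFIG_NUMA
 * only, the all-nodes level only on sufficiently large machines), with the
 * groups at each level linked circularly.
 */
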
5781/*
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07005782 * Build sched domains for a given set of cpus and attach the sched domains
5783 * to the individual cpus
Linus Torvalds1da177e2005-04-16 15:20:36 -07005784 */
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07005785static int build_sched_domains(const cpumask_t *cpu_map)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005786{
5787 int i;
John Hawkesd1b55132005-09-06 15:18:14 -07005788#ifdef CONFIG_NUMA
5789 struct sched_group **sched_group_nodes = NULL;
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005790 int sd_allnodes = 0;
John Hawkesd1b55132005-09-06 15:18:14 -07005791
5792 /*
5793 * Allocate the per-node list of sched groups
5794 */
Ingo Molnardd41f592007-07-09 18:51:59 +02005795 sched_group_nodes = kzalloc(sizeof(struct sched_group *)*MAX_NUMNODES,
Srivatsa Vaddagirid3a5aa92006-06-27 02:54:39 -07005796 GFP_KERNEL);
John Hawkesd1b55132005-09-06 15:18:14 -07005797 if (!sched_group_nodes) {
5798 printk(KERN_WARNING "Can not alloc sched group node list\n");
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07005799 return -ENOMEM;
John Hawkesd1b55132005-09-06 15:18:14 -07005800 }
5801 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
5802#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07005803
5804 /*
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07005805 * Set up domains for cpus specified by the cpu_map.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005806 */
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07005807 for_each_cpu_mask(i, *cpu_map) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07005808 struct sched_domain *sd = NULL, *p;
5809 cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
5810
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07005811 cpus_and(nodemask, nodemask, *cpu_map);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005812
5813#ifdef CONFIG_NUMA
Ingo Molnardd41f592007-07-09 18:51:59 +02005814 if (cpus_weight(*cpu_map) >
5815 SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
John Hawkes9c1cfda2005-09-06 15:18:14 -07005816 sd = &per_cpu(allnodes_domains, i);
5817 *sd = SD_ALLNODES_INIT;
5818 sd->span = *cpu_map;
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005819 cpu_to_allnodes_group(i, cpu_map, &sd->groups);
John Hawkes9c1cfda2005-09-06 15:18:14 -07005820 p = sd;
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005821 sd_allnodes = 1;
John Hawkes9c1cfda2005-09-06 15:18:14 -07005822 } else
5823 p = NULL;
5824
Linus Torvalds1da177e2005-04-16 15:20:36 -07005825 sd = &per_cpu(node_domains, i);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005826 *sd = SD_NODE_INIT;
John Hawkes9c1cfda2005-09-06 15:18:14 -07005827 sd->span = sched_domain_node_span(cpu_to_node(i));
5828 sd->parent = p;
Siddha, Suresh B1a848872006-10-03 01:14:08 -07005829 if (p)
5830 p->child = sd;
John Hawkes9c1cfda2005-09-06 15:18:14 -07005831 cpus_and(sd->span, sd->span, *cpu_map);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005832#endif
5833
5834 p = sd;
5835 sd = &per_cpu(phys_domains, i);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005836 *sd = SD_CPU_INIT;
5837 sd->span = nodemask;
5838 sd->parent = p;
Siddha, Suresh B1a848872006-10-03 01:14:08 -07005839 if (p)
5840 p->child = sd;
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005841 cpu_to_phys_group(i, cpu_map, &sd->groups);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005842
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08005843#ifdef CONFIG_SCHED_MC
5844 p = sd;
5845 sd = &per_cpu(core_domains, i);
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08005846 *sd = SD_MC_INIT;
5847 sd->span = cpu_coregroup_map(i);
5848 cpus_and(sd->span, sd->span, *cpu_map);
5849 sd->parent = p;
Siddha, Suresh B1a848872006-10-03 01:14:08 -07005850 p->child = sd;
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005851 cpu_to_core_group(i, cpu_map, &sd->groups);
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08005852#endif
5853
Linus Torvalds1da177e2005-04-16 15:20:36 -07005854#ifdef CONFIG_SCHED_SMT
5855 p = sd;
5856 sd = &per_cpu(cpu_domains, i);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005857 *sd = SD_SIBLING_INIT;
5858 sd->span = cpu_sibling_map[i];
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07005859 cpus_and(sd->span, sd->span, *cpu_map);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005860 sd->parent = p;
Siddha, Suresh B1a848872006-10-03 01:14:08 -07005861 p->child = sd;
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005862 cpu_to_cpu_group(i, cpu_map, &sd->groups);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005863#endif
5864 }
5865
5866#ifdef CONFIG_SCHED_SMT
5867 /* Set up CPU (sibling) groups */
John Hawkes9c1cfda2005-09-06 15:18:14 -07005868 for_each_cpu_mask(i, *cpu_map) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07005869 cpumask_t this_sibling_map = cpu_sibling_map[i];
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07005870 cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005871 if (i != first_cpu(this_sibling_map))
5872 continue;
5873
Ingo Molnardd41f592007-07-09 18:51:59 +02005874 init_sched_build_groups(this_sibling_map, cpu_map,
5875 &cpu_to_cpu_group);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005876 }
5877#endif
5878
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08005879#ifdef CONFIG_SCHED_MC
5880 /* Set up multi-core groups */
5881 for_each_cpu_mask(i, *cpu_map) {
5882 cpumask_t this_core_map = cpu_coregroup_map(i);
5883 cpus_and(this_core_map, this_core_map, *cpu_map);
5884 if (i != first_cpu(this_core_map))
5885 continue;
Ingo Molnardd41f592007-07-09 18:51:59 +02005886 init_sched_build_groups(this_core_map, cpu_map,
5887 &cpu_to_core_group);
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08005888 }
5889#endif
5890
Linus Torvalds1da177e2005-04-16 15:20:36 -07005891 /* Set up physical groups */
5892 for (i = 0; i < MAX_NUMNODES; i++) {
5893 cpumask_t nodemask = node_to_cpumask(i);
5894
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07005895 cpus_and(nodemask, nodemask, *cpu_map);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005896 if (cpus_empty(nodemask))
5897 continue;
5898
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005899 init_sched_build_groups(nodemask, cpu_map, &cpu_to_phys_group);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005900 }
5901
5902#ifdef CONFIG_NUMA
5903 /* Set up node groups */
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005904 if (sd_allnodes)
Ingo Molnardd41f592007-07-09 18:51:59 +02005905 init_sched_build_groups(*cpu_map, cpu_map,
5906 &cpu_to_allnodes_group);
John Hawkes9c1cfda2005-09-06 15:18:14 -07005907
5908 for (i = 0; i < MAX_NUMNODES; i++) {
5909 /* Set up node groups */
5910 struct sched_group *sg, *prev;
5911 cpumask_t nodemask = node_to_cpumask(i);
5912 cpumask_t domainspan;
5913 cpumask_t covered = CPU_MASK_NONE;
5914 int j;
5915
5916 cpus_and(nodemask, nodemask, *cpu_map);
John Hawkesd1b55132005-09-06 15:18:14 -07005917 if (cpus_empty(nodemask)) {
5918 sched_group_nodes[i] = NULL;
John Hawkes9c1cfda2005-09-06 15:18:14 -07005919 continue;
John Hawkesd1b55132005-09-06 15:18:14 -07005920 }
John Hawkes9c1cfda2005-09-06 15:18:14 -07005921
5922 domainspan = sched_domain_node_span(i);
5923 cpus_and(domainspan, domainspan, *cpu_map);
5924
Srivatsa Vaddagiri15f0b672006-06-27 02:54:40 -07005925 sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i);
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07005926 if (!sg) {
5927 printk(KERN_WARNING "Can not alloc domain group for "
5928 "node %d\n", i);
5929 goto error;
5930 }
John Hawkes9c1cfda2005-09-06 15:18:14 -07005931 sched_group_nodes[i] = sg;
5932 for_each_cpu_mask(j, nodemask) {
5933 struct sched_domain *sd;
Ingo Molnar9761eea2007-07-09 18:52:00 +02005934
John Hawkes9c1cfda2005-09-06 15:18:14 -07005935 sd = &per_cpu(node_domains, j);
5936 sd->groups = sg;
John Hawkes9c1cfda2005-09-06 15:18:14 -07005937 }
Eric Dumazet5517d862007-05-08 00:32:57 -07005938 sg->__cpu_power = 0;
John Hawkes9c1cfda2005-09-06 15:18:14 -07005939 sg->cpumask = nodemask;
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07005940 sg->next = sg;
John Hawkes9c1cfda2005-09-06 15:18:14 -07005941 cpus_or(covered, covered, nodemask);
5942 prev = sg;
5943
5944 for (j = 0; j < MAX_NUMNODES; j++) {
5945 cpumask_t tmp, notcovered;
5946 int n = (i + j) % MAX_NUMNODES;
5947
5948 cpus_complement(notcovered, covered);
5949 cpus_and(tmp, notcovered, *cpu_map);
5950 cpus_and(tmp, tmp, domainspan);
5951 if (cpus_empty(tmp))
5952 break;
5953
5954 nodemask = node_to_cpumask(n);
5955 cpus_and(tmp, tmp, nodemask);
5956 if (cpus_empty(tmp))
5957 continue;
5958
Srivatsa Vaddagiri15f0b672006-06-27 02:54:40 -07005959 sg = kmalloc_node(sizeof(struct sched_group),
5960 GFP_KERNEL, i);
John Hawkes9c1cfda2005-09-06 15:18:14 -07005961 if (!sg) {
5962 printk(KERN_WARNING
5963 "Can not alloc domain group for node %d\n", j);
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07005964 goto error;
John Hawkes9c1cfda2005-09-06 15:18:14 -07005965 }
Eric Dumazet5517d862007-05-08 00:32:57 -07005966 sg->__cpu_power = 0;
John Hawkes9c1cfda2005-09-06 15:18:14 -07005967 sg->cpumask = tmp;
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07005968 sg->next = prev->next;
John Hawkes9c1cfda2005-09-06 15:18:14 -07005969 cpus_or(covered, covered, tmp);
5970 prev->next = sg;
5971 prev = sg;
5972 }
John Hawkes9c1cfda2005-09-06 15:18:14 -07005973 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07005974#endif
5975
5976 /* Calculate CPU power for physical packages and nodes */
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07005977#ifdef CONFIG_SCHED_SMT
5978 for_each_cpu_mask(i, *cpu_map) {
Ingo Molnardd41f592007-07-09 18:51:59 +02005979 struct sched_domain *sd = &per_cpu(cpu_domains, i);
5980
Siddha, Suresh B89c47102006-10-03 01:14:09 -07005981 init_sched_groups_power(i, sd);
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07005982 }
5983#endif
5984#ifdef CONFIG_SCHED_MC
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07005985 for_each_cpu_mask(i, *cpu_map) {
Ingo Molnardd41f592007-07-09 18:51:59 +02005986 struct sched_domain *sd = &per_cpu(core_domains, i);
5987
Siddha, Suresh B89c47102006-10-03 01:14:09 -07005988 init_sched_groups_power(i, sd);
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07005989 }
5990#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07005991
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07005992 for_each_cpu_mask(i, *cpu_map) {
Ingo Molnardd41f592007-07-09 18:51:59 +02005993 struct sched_domain *sd = &per_cpu(phys_domains, i);
5994
Siddha, Suresh B89c47102006-10-03 01:14:09 -07005995 init_sched_groups_power(i, sd);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005996 }
5997
John Hawkes9c1cfda2005-09-06 15:18:14 -07005998#ifdef CONFIG_NUMA
Siddha, Suresh B08069032006-03-27 01:15:23 -08005999 for (i = 0; i < MAX_NUMNODES; i++)
6000 init_numa_sched_groups_power(sched_group_nodes[i]);
John Hawkes9c1cfda2005-09-06 15:18:14 -07006001
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08006002 if (sd_allnodes) {
6003 struct sched_group *sg;
Siddha, Suresh Bf712c0c2006-07-30 03:02:59 -07006004
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08006005 cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg);
Siddha, Suresh Bf712c0c2006-07-30 03:02:59 -07006006 init_numa_sched_groups_power(sg);
6007 }
John Hawkes9c1cfda2005-09-06 15:18:14 -07006008#endif
6009
Linus Torvalds1da177e2005-04-16 15:20:36 -07006010 /* Attach the domains */
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07006011 for_each_cpu_mask(i, *cpu_map) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07006012 struct sched_domain *sd;
6013#ifdef CONFIG_SCHED_SMT
6014 sd = &per_cpu(cpu_domains, i);
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08006015#elif defined(CONFIG_SCHED_MC)
6016 sd = &per_cpu(core_domains, i);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006017#else
6018 sd = &per_cpu(phys_domains, i);
6019#endif
6020 cpu_attach_domain(sd, i);
6021 }
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07006022
6023 return 0;
6024
Siddha, Suresh Ba6160582006-10-03 01:14:06 -07006025#ifdef CONFIG_NUMA
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07006026error:
6027 free_sched_groups(cpu_map);
6028 return -ENOMEM;
Siddha, Suresh Ba6160582006-10-03 01:14:06 -07006029#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07006030}
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07006031/*
6032 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
6033 */
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07006034static int arch_init_sched_domains(const cpumask_t *cpu_map)
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07006035{
6036 cpumask_t cpu_default_map;
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07006037 int err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006038
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07006039 /*
6040 * Set up the mask of cpus without special-case scheduling requirements.
6041 * For now this just excludes isolated cpus, but could be used to
6042 * exclude other special cases in the future.
6043 */
6044 cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map);
6045
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07006046 err = build_sched_domains(&cpu_default_map);
6047
6048 return err;
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07006049}
6050
6051static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
Linus Torvalds1da177e2005-04-16 15:20:36 -07006052{
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07006053 free_sched_groups(cpu_map);
John Hawkes9c1cfda2005-09-06 15:18:14 -07006054}
Linus Torvalds1da177e2005-04-16 15:20:36 -07006055
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07006056/*
6057 * Detach sched domains from the group of cpus specified in cpu_map.
6058 * These cpus will now be attached to the NULL domain.
6059 */
Arjan van de Ven858119e2006-01-14 13:20:43 -08006060static void detach_destroy_domains(const cpumask_t *cpu_map)
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07006061{
6062 int i;
6063
6064 for_each_cpu_mask(i, *cpu_map)
6065 cpu_attach_domain(NULL, i);
6066 synchronize_sched();
6067 arch_destroy_sched_domains(cpu_map);
6068}
6069
6070/*
6071 * Partition sched domains as specified by the cpumasks below.
6072 * This attaches all cpus from the cpumasks to the NULL domain,
6073 * waits for an RCU quiescent period, recalculates sched
6074 * domain information and then attaches them back to the
6075 * correct sched domains.
6076 * Call with the hotplug lock held.
6077 */
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07006078int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07006079{
6080 cpumask_t change_map;
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07006081 int err = 0;
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07006082
6083 cpus_and(*partition1, *partition1, cpu_online_map);
6084 cpus_and(*partition2, *partition2, cpu_online_map);
6085 cpus_or(change_map, *partition1, *partition2);
6086
6087 /* Detach sched domains from all of the affected cpus */
6088 detach_destroy_domains(&change_map);
6089 if (!cpus_empty(*partition1))
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07006090 err = build_sched_domains(partition1);
6091 if (!err && !cpus_empty(*partition2))
6092 err = build_sched_domains(partition2);
6093
6094 return err;
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07006095}
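
/*
 * Typical use (illustrative): when an exclusive-cpuset configuration
 * changes, the cpuset code can split cpu_online_map into two partitions
 * and call partition_sched_domains() so that each partition is load
 * balanced independently of the other.
 */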
6096
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07006097#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
6098int arch_reinit_sched_domains(void)
6099{
6100 int err;
6101
Gautham R Shenoy5be93612007-05-09 02:34:04 -07006102 mutex_lock(&sched_hotcpu_mutex);
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07006103 detach_destroy_domains(&cpu_online_map);
6104 err = arch_init_sched_domains(&cpu_online_map);
Gautham R Shenoy5be93612007-05-09 02:34:04 -07006105 mutex_unlock(&sched_hotcpu_mutex);
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07006106
6107 return err;
6108}
6109
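/*
 * Shared sysfs store helper for the MC/SMT power-savings attributes:
 * accepts only '0' or '1', updates the selected policy flag and rebuilds
 * the sched domains so that the new policy takes effect.
 */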
6110static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
6111{
6112 int ret;
6113
6114 if (buf[0] != '0' && buf[0] != '1')
6115 return -EINVAL;
6116
6117 if (smt)
6118 sched_smt_power_savings = (buf[0] == '1');
6119 else
6120 sched_mc_power_savings = (buf[0] == '1');
6121
6122 ret = arch_reinit_sched_domains();
6123
6124 return ret ? ret : count;
6125}
6126
6127int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
6128{
6129 int err = 0;
Ingo Molnar48f24c42006-07-03 00:25:40 -07006130
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07006131#ifdef CONFIG_SCHED_SMT
6132 if (smt_capable())
6133 err = sysfs_create_file(&cls->kset.kobj,
6134 &attr_sched_smt_power_savings.attr);
6135#endif
6136#ifdef CONFIG_SCHED_MC
6137 if (!err && mc_capable())
6138 err = sysfs_create_file(&cls->kset.kobj,
6139 &attr_sched_mc_power_savings.attr);
6140#endif
6141 return err;
6142}
6143#endif
6144
6145#ifdef CONFIG_SCHED_MC
6146static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page)
6147{
6148 return sprintf(page, "%u\n", sched_mc_power_savings);
6149}
Ingo Molnar48f24c42006-07-03 00:25:40 -07006150static ssize_t sched_mc_power_savings_store(struct sys_device *dev,
6151 const char *buf, size_t count)
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07006152{
6153 return sched_power_savings_store(buf, count, 0);
6154}
6155SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show,
6156 sched_mc_power_savings_store);
6157#endif
6158
6159#ifdef CONFIG_SCHED_SMT
6160static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page)
6161{
6162 return sprintf(page, "%u\n", sched_smt_power_savings);
6163}
Ingo Molnar48f24c42006-07-03 00:25:40 -07006164static ssize_t sched_smt_power_savings_store(struct sys_device *dev,
6165 const char *buf, size_t count)
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07006166{
6167 return sched_power_savings_store(buf, count, 1);
6168}
6169SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show,
6170 sched_smt_power_savings_store);
6171#endif
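
/*
 * The attributes above are typically exposed as
 * /sys/devices/system/cpu/sched_mc_power_savings and
 * /sys/devices/system/cpu/sched_smt_power_savings; writing '1' selects the
 * power-savings balancing policy, writing '0' restores pure performance
 * balancing.
 */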
6172
Linus Torvalds1da177e2005-04-16 15:20:36 -07006173/*
6174 * Force a reinitialization of the sched domains hierarchy. The domains
6175 * and groups cannot be updated in place without racing with the balancing
Nick Piggin41c7ce92005-06-25 14:57:24 -07006176 * code, so we temporarily attach all running cpus to the NULL domain
Linus Torvalds1da177e2005-04-16 15:20:36 -07006177 * which will prevent rebalancing while the sched domains are recalculated.
6178 */
6179static int update_sched_domains(struct notifier_block *nfb,
6180 unsigned long action, void *hcpu)
6181{
Linus Torvalds1da177e2005-04-16 15:20:36 -07006182 switch (action) {
6183 case CPU_UP_PREPARE:
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -07006184 case CPU_UP_PREPARE_FROZEN:
Linus Torvalds1da177e2005-04-16 15:20:36 -07006185 case CPU_DOWN_PREPARE:
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -07006186 case CPU_DOWN_PREPARE_FROZEN:
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07006187 detach_destroy_domains(&cpu_online_map);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006188 return NOTIFY_OK;
6189
6190 case CPU_UP_CANCELED:
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -07006191 case CPU_UP_CANCELED_FROZEN:
Linus Torvalds1da177e2005-04-16 15:20:36 -07006192 case CPU_DOWN_FAILED:
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -07006193 case CPU_DOWN_FAILED_FROZEN:
Linus Torvalds1da177e2005-04-16 15:20:36 -07006194 case CPU_ONLINE:
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -07006195 case CPU_ONLINE_FROZEN:
Linus Torvalds1da177e2005-04-16 15:20:36 -07006196 case CPU_DEAD:
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -07006197 case CPU_DEAD_FROZEN:
Linus Torvalds1da177e2005-04-16 15:20:36 -07006198 /*
6199 * Fall through and re-initialise the domains.
6200 */
6201 break;
6202 default:
6203 return NOTIFY_DONE;
6204 }
6205
6206 /* The hotplug lock is already held by cpu_up/cpu_down */
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07006207 arch_init_sched_domains(&cpu_online_map);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006208
6209 return NOTIFY_OK;
6210}
Linus Torvalds1da177e2005-04-16 15:20:36 -07006211
6212void __init sched_init_smp(void)
6213{
Nick Piggin5c1e1762006-10-03 01:14:04 -07006214 cpumask_t non_isolated_cpus;
6215
Gautham R Shenoy5be93612007-05-09 02:34:04 -07006216 mutex_lock(&sched_hotcpu_mutex);
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07006217 arch_init_sched_domains(&cpu_online_map);
Nathan Lynche5e56732007-01-10 23:15:28 -08006218 cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
Nick Piggin5c1e1762006-10-03 01:14:04 -07006219 if (cpus_empty(non_isolated_cpus))
6220 cpu_set(smp_processor_id(), non_isolated_cpus);
Gautham R Shenoy5be93612007-05-09 02:34:04 -07006221 mutex_unlock(&sched_hotcpu_mutex);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006222 /* XXX: Theoretical race here - CPU may be hotplugged now */
6223 hotcpu_notifier(update_sched_domains, 0);
Nick Piggin5c1e1762006-10-03 01:14:04 -07006224
6225 /* Move init over to a non-isolated CPU */
6226 if (set_cpus_allowed(current, non_isolated_cpus) < 0)
6227 BUG();
Ingo Molnardd41f592007-07-09 18:51:59 +02006228 sched_init_granularity();
Linus Torvalds1da177e2005-04-16 15:20:36 -07006229}
6230#else
6231void __init sched_init_smp(void)
6232{
Ingo Molnardd41f592007-07-09 18:51:59 +02006233 sched_init_granularity();
Linus Torvalds1da177e2005-04-16 15:20:36 -07006234}
6235#endif /* CONFIG_SMP */
6236
6237int in_sched_functions(unsigned long addr)
6238{
6239 /* Linker adds these: start and end of __sched functions */
6240 extern char __sched_text_start[], __sched_text_end[];
Ingo Molnar48f24c42006-07-03 00:25:40 -07006241
Linus Torvalds1da177e2005-04-16 15:20:36 -07006242 return in_lock_functions(addr) ||
6243 (addr >= (unsigned long)__sched_text_start
6244 && addr < (unsigned long)__sched_text_end);
6245}
6246
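/*
 * Minimal per-runqueue CFS state: an empty red-black tree of runnable
 * entities and the fair clock started at 1; with CONFIG_FAIR_GROUP_SCHED
 * the cfs_rq also records which runqueue it belongs to.
 */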
Ingo Molnardd41f592007-07-09 18:51:59 +02006247static inline void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
6248{
6249 cfs_rq->tasks_timeline = RB_ROOT;
6250 cfs_rq->fair_clock = 1;
6251#ifdef CONFIG_FAIR_GROUP_SCHED
6252 cfs_rq->rq = rq;
6253#endif
6254}
6255
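/*
 * Boot-time scheduler setup: link the scheduling classes, initialise every
 * possible CPU's runqueue (CFS tree, RT priority arrays, load tracking) and
 * turn the current (boot) task into this CPU's idle thread.
 */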
Linus Torvalds1da177e2005-04-16 15:20:36 -07006256void __init sched_init(void)
6257{
Ingo Molnardd41f592007-07-09 18:51:59 +02006258 u64 now = sched_clock();
Christoph Lameter476f3532007-05-06 14:48:58 -07006259 int highest_cpu = 0;
Ingo Molnardd41f592007-07-09 18:51:59 +02006260 int i, j;
6261
6262 /*
6263 * Link up the scheduling class hierarchy:
6264 */
6265 rt_sched_class.next = &fair_sched_class;
6266 fair_sched_class.next = &idle_sched_class;
6267 idle_sched_class.next = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006268
KAMEZAWA Hiroyuki0a945022006-03-28 01:56:37 -08006269 for_each_possible_cpu(i) {
Ingo Molnardd41f592007-07-09 18:51:59 +02006270 struct rt_prio_array *array;
Ingo Molnar70b97a72006-07-03 00:25:42 -07006271 struct rq *rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006272
6273 rq = cpu_rq(i);
6274 spin_lock_init(&rq->lock);
Ingo Molnarfcb99372006-07-03 00:25:10 -07006275 lockdep_set_class(&rq->lock, &rq->rq_lock_key);
Nick Piggin78979862005-06-25 14:57:13 -07006276 rq->nr_running = 0;
Ingo Molnardd41f592007-07-09 18:51:59 +02006277 rq->clock = 1;
6278 init_cfs_rq(&rq->cfs, rq);
6279#ifdef CONFIG_FAIR_GROUP_SCHED
6280 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
6281 list_add(&rq->cfs.leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
6282#endif
6283 rq->ls.load_update_last = now;
6284 rq->ls.load_update_start = now;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006285
Ingo Molnardd41f592007-07-09 18:51:59 +02006286 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
6287 rq->cpu_load[j] = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006288#ifdef CONFIG_SMP
Nick Piggin41c7ce92005-06-25 14:57:24 -07006289 rq->sd = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006290 rq->active_balance = 0;
Ingo Molnardd41f592007-07-09 18:51:59 +02006291 rq->next_balance = jiffies;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006292 rq->push_cpu = 0;
Christoph Lameter0a2966b2006-09-25 23:30:51 -07006293 rq->cpu = i;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006294 rq->migration_thread = NULL;
6295 INIT_LIST_HEAD(&rq->migration_queue);
6296#endif
6297 atomic_set(&rq->nr_iowait, 0);
6298
Ingo Molnardd41f592007-07-09 18:51:59 +02006299 array = &rq->rt.active;
6300 for (j = 0; j < MAX_RT_PRIO; j++) {
6301 INIT_LIST_HEAD(array->queue + j);
6302 __clear_bit(j, array->bitmap);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006303 }
Christoph Lameter476f3532007-05-06 14:48:58 -07006304 highest_cpu = i;
Ingo Molnardd41f592007-07-09 18:51:59 +02006305 /* delimiter for bitsearch: */
6306 __set_bit(MAX_RT_PRIO, array->bitmap);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006307 }
6308
Peter Williams2dd73a42006-06-27 02:54:34 -07006309 set_load_weight(&init_task);
Heiko Carstensb50f60c2006-07-30 03:03:52 -07006310
Christoph Lameterc9819f42006-12-10 02:20:25 -08006311#ifdef CONFIG_SMP
Christoph Lameter476f3532007-05-06 14:48:58 -07006312 nr_cpu_ids = highest_cpu + 1;
Christoph Lameterc9819f42006-12-10 02:20:25 -08006313 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL);
6314#endif
6315
Heiko Carstensb50f60c2006-07-30 03:03:52 -07006316#ifdef CONFIG_RT_MUTEXES
6317 plist_head_init(&init_task.pi_waiters, &init_task.pi_lock);
6318#endif
6319
Linus Torvalds1da177e2005-04-16 15:20:36 -07006320 /*
6321 * The boot idle thread does lazy MMU switching as well:
6322 */
6323 atomic_inc(&init_mm.mm_count);
6324 enter_lazy_tlb(&init_mm, current);
6325
6326 /*
6327 * Make us the idle thread. Technically, schedule() should not be
6328 * called from this thread; however, somewhere below it might be,
6329 * and because we are the idle thread, we just pick up running again
6330 * when this runqueue becomes "idle".
6331 */
6332 init_idle(current, smp_processor_id());
Ingo Molnardd41f592007-07-09 18:51:59 +02006333 /*
6334 * During early bootup we pretend to be a normal task:
6335 */
6336 current->sched_class = &fair_sched_class;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006337}
6338
6339#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
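/*
 * Warn, at most once per second, when a function that may sleep is called
 * from atomic context or with interrupts disabled while the system is
 * running normally; the report includes the offending file:line, the held
 * locks and a stack trace.
 */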
6340void __might_sleep(char *file, int line)
6341{
Ingo Molnar48f24c42006-07-03 00:25:40 -07006342#ifdef in_atomic
Linus Torvalds1da177e2005-04-16 15:20:36 -07006343 static unsigned long prev_jiffy; /* ratelimiting */
6344
6345 if ((in_atomic() || irqs_disabled()) &&
6346 system_state == SYSTEM_RUNNING && !oops_in_progress) {
6347 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
6348 return;
6349 prev_jiffy = jiffies;
Ingo Molnar91368d72006-03-23 03:00:54 -08006350 printk(KERN_ERR "BUG: sleeping function called from invalid"
Linus Torvalds1da177e2005-04-16 15:20:36 -07006351 " context at %s:%d\n", file, line);
6352 printk("in_atomic():%d, irqs_disabled():%d\n",
6353 in_atomic(), irqs_disabled());
Peter Zijlstraa4c410f2006-12-06 20:37:21 -08006354 debug_show_held_locks(current);
Ingo Molnar3117df02006-12-13 00:34:43 -08006355 if (irqs_disabled())
6356 print_irqtrace_events(current);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006357 dump_stack();
6358 }
6359#endif
6360}
6361EXPORT_SYMBOL(__might_sleep);
6362#endif
6363
6364#ifdef CONFIG_MAGIC_SYSRQ
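/*
 * Magic-SysRq helper: reset every task's CFS statistics and push all
 * real-time tasks (except the per-cpu migration threads) back to
 * SCHED_NORMAL; user tasks with a negative nice value are reset to nice 0.
 */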
6365void normalize_rt_tasks(void)
6366{
Ingo Molnara0f98a12007-06-17 18:37:45 +02006367 struct task_struct *g, *p;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006368 unsigned long flags;
Ingo Molnar70b97a72006-07-03 00:25:42 -07006369 struct rq *rq;
Ingo Molnardd41f592007-07-09 18:51:59 +02006370 int on_rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006371
6372 read_lock_irq(&tasklist_lock);
Ingo Molnara0f98a12007-06-17 18:37:45 +02006373 do_each_thread(g, p) {
Ingo Molnardd41f592007-07-09 18:51:59 +02006374 p->se.fair_key = 0;
6375 p->se.wait_runtime = 0;
6376 p->se.wait_start_fair = 0;
6377 p->se.wait_start = 0;
6378 p->se.exec_start = 0;
6379 p->se.sleep_start = 0;
6380 p->se.sleep_start_fair = 0;
6381 p->se.block_start = 0;
6382 task_rq(p)->cfs.fair_clock = 0;
6383 task_rq(p)->clock = 0;
6384
6385 if (!rt_task(p)) {
6386 /*
6387 * Renice negative nice level userspace
6388 * tasks back to 0:
6389 */
6390 if (TASK_NICE(p) < 0 && p->mm)
6391 set_user_nice(p, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006392 continue;
Ingo Molnardd41f592007-07-09 18:51:59 +02006393 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07006394
Ingo Molnarb29739f2006-06-27 02:54:51 -07006395 spin_lock_irqsave(&p->pi_lock, flags);
6396 rq = __task_rq_lock(p);
Ingo Molnardd41f592007-07-09 18:51:59 +02006397#ifdef CONFIG_SMP
6398 /*
6399 * Do not touch the migration thread:
6400 */
6401 if (p == rq->migration_thread)
6402 goto out_unlock;
6403#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07006404
Ingo Molnardd41f592007-07-09 18:51:59 +02006405 on_rq = p->se.on_rq;
6406 if (on_rq)
6407 deactivate_task(task_rq(p), p, 0);
6408 __setscheduler(rq, p, SCHED_NORMAL, 0);
6409 if (on_rq) {
6410 activate_task(task_rq(p), p, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006411 resched_task(rq->curr);
6412 }
Ingo Molnardd41f592007-07-09 18:51:59 +02006413#ifdef CONFIG_SMP
6414 out_unlock:
6415#endif
Ingo Molnarb29739f2006-06-27 02:54:51 -07006416 __task_rq_unlock(rq);
6417 spin_unlock_irqrestore(&p->pi_lock, flags);
Ingo Molnara0f98a12007-06-17 18:37:45 +02006418 } while_each_thread(g, p);
6419
Linus Torvalds1da177e2005-04-16 15:20:36 -07006420 read_unlock_irq(&tasklist_lock);
6421}
6422
6423#endif /* CONFIG_MAGIC_SYSRQ */
Linus Torvalds1df5c102005-09-12 07:59:21 -07006424
6425#ifdef CONFIG_IA64
6426/*
6427 * These functions are only useful for the IA64 MCA handling.
6428 *
6429 * They can only be called when the whole system has been
6430 * stopped - every CPU needs to be quiescent, and no scheduling
6431 * activity can take place. Using them for anything else would
6432 * be a serious bug, and as a result, they aren't even visible
6433 * under any other configuration.
6434 */
6435
6436/**
6437 * curr_task - return the current task for a given cpu.
6438 * @cpu: the processor in question.
6439 *
6440 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
6441 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07006442struct task_struct *curr_task(int cpu)
Linus Torvalds1df5c102005-09-12 07:59:21 -07006443{
6444 return cpu_curr(cpu);
6445}
6446
6447/**
6448 * set_curr_task - set the current task for a given cpu.
6449 * @cpu: the processor in question.
6450 * @p: the task pointer to set.
6451 *
6452 * Description: This function must only be used when non-maskable interrupts
6453 * are serviced on a separate stack. It allows the architecture to switch the
6454 * notion of the current task on a cpu in a non-blocking manner. This function
6455 * must be called with all CPUs synchronized and interrupts disabled; the
6456 * caller must save the original value of the current task (see
6457 * curr_task() above) and restore that value before reenabling interrupts and
6458 * re-starting the system.
6459 *
6460 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
6461 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07006462void set_curr_task(int cpu, struct task_struct *p)
Linus Torvalds1df5c102005-09-12 07:59:21 -07006463{
6464 cpu_curr(cpu) = p;
6465}
6466
6467#endif